diff --git a/.gitignore b/.gitignore
index 900e5a53cbcf3bbb5e00389cca004c49f8600a66..bdcb067fc26d2a18ed88034ab616c08095794e17 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,12 +4,11 @@ node_modules
 /.bazelrc
 /.tf_configure.bazelrc
 /bazel-*
-/third_party/py/numpy/numpy_include
-/tools/bazel.rc
+/bazel_pip
+/third_party/eigen3/mkl_include
+/third_party/mkl/*
 /tools/python_bin_path.sh
 /tools/git/gen
-/util/python/python_include
-/util/python/python_lib
 /pip_test
 /_python_build
 *.pyc
diff --git a/.mention-bot b/.mention-bot
deleted file mode 100644
index 9e4858977f5da2992ccc4053dfbbda3f5f86ee90..0000000000000000000000000000000000000000
--- a/.mention-bot
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "maxReviewers": 2,
-  "numFilesToCheck": 10,
-  "userBlacklist": ["tensorflower-gardener"],
-  "requiredOrgs": ["tensorflow"],
-  "skipAlreadyAssignedPR": true,
-  "skipAlreadyMentionedPR": true,
-  "skipTitle": "Branch",
-  "delayed": true,
-  "delayedUntil": "10m"
-}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5ae5c0fbbcd5b8da7e3f3f98e01f455e0c82e588..c78b6b1a150c98fa379a87f935e77b5803837f11 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,3 +27,140 @@ contributions, often because we probably won't get to them right now. If you
 decide to start on an issue, leave a comment so that other people know that
 you're working on it. If you want to help out, but not alone, use the issue
 comment thread to coordinate.
+
+### Contribution guidelines and standards
+
+Before sending your pull request for
+[review](https://github.com/tensorflow/tensorflow/pulls),
+make sure your changes are consistent with the guidelines and follow the
+TensorFlow coding style.
+
+#### General guidelines and philosophy for contribution
+
+* Include unit tests when you contribute new features, as they help to
+  a) prove that your code works correctly, b) guard against future breaking
+  changes to lower the maintenance cost.
+* Bug fixes also generally require unit tests, because the presence of bugs
+  usually indicates insufficient test coverage.
+* Keep API compatibility in mind when you change code in core TensorFlow,
+  e.g., code in [tensorflow/core](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core) and  [tensorflow/python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python).
+  TensorFlow has reached version 1 and hence cannot make
+  non-backward-compatible API changes without a major release. Reviewers of your
+  pull request will comment on any API compatibility issues.
+* When you contribute a new feature to TensorFlow, the maintenance burden is (by
+  default) transferred to the TensorFlow team. This means that benefit of
+  contribution must be compared against the cost of maintaining the feature.
+* Full new features (e.g., a new op implementing a cutting-edge algorithm)
+  typically will live in
+  [tensorflow/contrib](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib)
+  to get some airtime before decision is made regarding whether they are to be
+  migrated to the core.
+
+#### License
+
+Include a license at the top of new files.
+
+* [C/C++ license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op.cc#L1)
+* [Python license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/nn.py#L1)
+* [Java license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/java/src/main/java/org/tensorflow/Graph.java#L1)
+* [Go license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/go/operation.go#L1)
+* [Bash license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/ci_build/ci_sanity.sh#L2)
+* [HTML license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/dist/index.html#L2)
+* [JavaScript/TypeScript license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/components/tf_backend/backend.ts#L1)
+
+Bazel BUILD files also need to include a license section, e.g.,
+[BUILD example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/BUILD#L61).
+
+#### C++ coding style
+
+Changes to TensorFlow C++ code should conform to
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do:
+
+```bash
+apt-get install -y clang-tidy
+```
+
+You can check a C/C++ file by doing:
+
+
+```bash
+clang-format <my_cc_file> --style=google > /tmp/my_cc_file.cc
+diff <my_cc_file> /tmp/my_cc_file.cc
+```
+
+#### Python coding style
+
+Changes to TensorFlow Python code should conform to
+[Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
+
+Use `pylint` to check your Python changes. To install `pylint` and
+retrieve TensorFlow's custom style definition:
+
+```bash
+pip install pylint
+wget -O /tmp/pylintrc https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/tools/ci_build/pylintrc
+```
+
+To check a file with `pylint`:
+
+```bash
+pylint --rcfile=/tmp/pylintrc myfile.py
+```
+
+#### Coding style for other languages
+
+* [Google Java Style Guide](https://google.github.io/styleguide/javaguide.html)
+* [Google JavaScript Style Guide](https://google.github.io/styleguide/jsguide.html)
+* [Google Shell Style Guide](https://google.github.io/styleguide/shell.xml)
+
+#### Running sanity check
+
+If you have Docker installed on your system, you can perform a sanity check on
+your changes by running the command:
+
+```bash
+tensorflow/tools/ci_build/ci_build.sh CPU tensorflow/tools/ci_build/ci_sanity.sh
+```
+
+This will catch most license, Python coding style and BUILD file issues that
+may exist in your changes.
+
+#### Running unit tests
+
+There are two ways to run TensorFlow unit tests.
+
+1. Using tools and libraries installed directly on your system.
+
+   Refer to the
+   [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel) and
+   [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
+   for the required packages. Alternatively, use the said
+   [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
+   `tensorflow/tensorflow:nightly-devel` and `tensorflow/tensorflow:nightly-devel-gpu`
+   for development to avoid installing the packages directly on your system.
+
+   Once you have the packages installed, you can run a specific unit test in
+   bazel by doing as follows:
+
+   If the tests are to be run on GPU, add CUDA paths to LD_LIBRARY_PATH and add
+   the `cuda` option flag
+
+   ```bash
+   export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
+
+   export flags="--config=opt --config=cuda -k"
+   ```
+
+   For example, to run all tests under tensorflow/python, do:
+
+   ```bash
+   bazel test ${flags} //tensorflow/python/...
+   ```
+
+2. Using Docker and TensorFlow's CI scripts.
+
+   See
+   [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
+
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index af76188c2f4d2e1908f541918c8b680627a90cf9..6f4c048ce83fb47a611b5dfe08e0fde0779994c0 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -1,36 +1,36 @@
-NOTE: Only file GitHub issues for bugs and feature requests.  All other topics will be closed.
+Please go to Stack Overflow for help and support:
 
-For general support from the community, see [StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow).
-To make bugs and feature requests more easy to find and organize, we close issues that are deemed
-out of scope for GitHub Issues and point people to StackOverflow.
+http://stackoverflow.com/questions/tagged/tensorflow
 
-For bugs or installation issues, please provide the following information.
-The more information you provide, the more easily we will be able to offer
-help and advice.
+If you open a GitHub issue, here is our policy:
 
-### What related GitHub issues or StackOverflow threads have you found by searching the web for your problem?
+1. It must be a bug or a feature request.
+2. The form below must be filled out.
 
-### Environment info
-Operating System:
+**Here's why we have that policy**: TensorFlow developers respond to issues. We want to focus on work that benefits the whole community, e.g., fixing bugs and adding features. Support only helps individuals. GitHub also notifies thousands of people when issues are filed. We want them to see you communicating an interesting problem, rather than being redirected to Stack Overflow.
 
-Installed version of CUDA and cuDNN: 
-(please attach the output of `ls -l /path/to/cuda/lib/libcud*`):
+------------------------
 
-If installed from binary pip package, provide:
+### System information
+- **Have I written custom code (as opposed to using a stock example script provided in TensorFlow)**:
+- **OS Platform and Distribution (e.g., Linux Ubuntu 16.04)**:
+- **TensorFlow installed from (source or binary)**:
+- **TensorFlow version (use command below)**:
+- **Bazel version (if compiling from source)**:
+- **CUDA/cuDNN version**:
+- **GPU model and memory**:
+- **Exact command to reproduce**:
 
-1. A link to the pip package you installed:
-2. The output from `python -c "import tensorflow; print(tensorflow.__version__)"`.
+You can collect some of this information using our environment capture script:
 
-If installed from source, provide 
+https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh
 
-1. The commit hash (`git rev-parse HEAD`)
-2. The output of `bazel version`
+You can obtain the TensorFlow version with
 
-### If possible, provide a minimal reproducible example (We usually don't have time to read hundreds of lines of your code)
+python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"
 
+### Describe the problem
+Describe the problem clearly here. Be sure to convey here why it's a bug in TensorFlow or a feature request.
 
-### What other attempted solutions have you tried?
-
-
-### Logs or other output that would be helpful
-(If logs are large, please upload as attachment or provide link).
+### Source code / logs
+Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached. Try to provide a reproducible test case that is the bare minimum necessary to generate the problem.
diff --git a/README.md b/README.md
index d9f05a67e0391fb5817cf1bd1ac492e3b3cce71d..2878dab2601351dabbfbcadfbe6a4ae94864ce56 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ guidelines](CONTRIBUTING.md).**
 
 **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
 tracking requests and bugs, but please see
-[Community](tensorflow/docs_src/about/index.md#community) for general questions
+[Community](https://www.tensorflow.org/community/) for general questions
 and discussion.**
 
 ## Installation
@@ -34,12 +34,12 @@ and discussion.**
 
 People who are a little more adventurous can also try our nightly binaries:
 
-* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
-* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
-* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
-* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0rc0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
-* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.1.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
-* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.1.0rc0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
+* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
+* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
+* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.1.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
+* Mac GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-mac/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.1.0-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-mac-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-mac/))
+* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.1.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=cpu,OS=windows/))
+* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.1.0-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/DEVICE=gpu,OS=windows/))
 * Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
 ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
 
@@ -52,7 +52,7 @@ $ python
 >>> hello = tf.constant('Hello, TensorFlow!')
 >>> sess = tf.Session()
 >>> sess.run(hello)
-Hello, TensorFlow!
+'Hello, TensorFlow!'
 >>> a = tf.constant(10)
 >>> b = tf.constant(32)
 >>> sess.run(a+b)
@@ -62,7 +62,7 @@ Hello, TensorFlow!
 
 ## For more information
 
-* [TensorFlow website](http://tensorflow.org)
+* [TensorFlow website](https://tensorflow.org)
 * [TensorFlow whitepaper](http://download.tensorflow.org/paper/whitepaper2015.pdf)
 * [TensorFlow Model Zoo](https://github.com/tensorflow/models)
 * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
diff --git a/RELEASE.md b/RELEASE.md
index 156cc2e3af507ffa416a1a96b2d37caa4d87c2e5..02bdbd429772a79d2f8f9af6012b6ac3916c822f 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,35 @@
+# Changes since the last release
+
+## Major Features and Improvements
+* Added `tf.layers.conv3d_transpose` layer for spatio temporal deconvolution.
+* Added `tf.Session.make_callable()`, which provides a lower overhead means of running a similar step multiple times.
+* Added ibverbs-based RDMA support to contrib (courtesy @junshi15 from Yahoo).
+* `RNNCell` objects now subclass `tf.layers._Layer`.  The strictness described
+  in the TensorFlow 1.1 release is gone:  The first time an RNNCell is used,
+  it caches its scope.  All future uses of the RNNCell will reuse variables from
+  that same scope.  This is a breaking change from the behavior of RNNCells
+  in TensorFlow versions <= 1.0.1.  TensorFlow 1.1 had checks in place to
+  ensure old code works correctly with the new semantics; this version
+  allows more flexible uses of RNNCell but can lead to subtle errors if
+  using code meant for TensorFlow <= 1.0.1.  For example, writing:
+  `MultiRNNCell([lstm] * 5)` will now build a 5-layer LSTM stack where each
+  layer shares the **same** parameters.  To get 5 layers each with their own
+  parameters, write: `MultiRNNCell([LSTMCell(...) for _ in range(5)])`.
+  If at all unsure, first test your code with TF 1.1; ensure it raises no
+  errors, and then upgrade to TF 1.2.
+
+## Bug Fixes and Other Changes
+* In python, `Operation.get_attr` on type attributes returns the Python DType
+  version of the type to match expected get_attr documentation rather than the
+  protobuf enum.
+* tensorflow/contrib/rnn undergoes RNN cell variable renaming for
+  consistency with Keras layers. Specifically, the previous variable names
+  "weights" and "biases" are changed to "kernel" and "bias", respectively.
+  This may cause backward incompatibility with regard to your old
+  checkpoints containing such RNN cells, in which case you can use the
+  [checkpoint_convert script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py)
+  to convert the variable names in your old checkpoints.
+
 # Release 1.1.0
 
 ## Major Features and Improvements
@@ -15,6 +47,8 @@
   * Ability to inspect Python source file against TF ops and tensors (command `print_source` / `ps`)
   * New navigation bar in Curses-based UI
   * NodeStepper (command `invoke_stepper`) now uses intermediate tensor dumps. It also uses `TensorHandles` as direct feeds during successive `cont` calls for improved performance and reduced memory consumption.
+* Initial release of installation guides for Java, C, and Go.
+* Added Text Dashboard to TensorBoard.
 
 ## Deprecations
 
@@ -68,38 +102,41 @@
 * Multiple tfdbg bug fixes:
   * Fixed Windows compatibility issues.
   * Command history now persists across runs.
+  * Bug fix in graph validation related to `tf.while_loops`.
+* Java Maven fixes for bugs with Windows installation.
+* Backport fixes and improvements from external keras.
+* Keras config file handling fix.
 
 ## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
 
 A. Besir Kurtulmus, Adal Chiriliuc, @akash, Alec-Desouza, Alex Rothberg, Alex
-Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton
+Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton 
 Loss, @Aravind, @Arie, Ashutosh Das, AuréLien Geron, Bairen Yi, @bakunyo, Ben
-Visser, Brady Zhou, Calpa Liu, Changming Sun, Chi Zeng, Chih Cheng Liang,
-Christopher Berner, Clark Zinzow, @Conchylicultor, Courtial Florian, Dan Ellis,
-Dan J, Dan Jarvis, Daniel Ylitalo, Darren Garvey, David Norman, David Truong,
-@DavidNorman, Dimitar Pavlov, Dmitry Persiyanov, @Eddie, @elirex, Erfan
-Noury, Eron Wright, Evgeny Mazovetskiy, Fabrizio (Misto) Milo, @fanlu, Fisher
-Coder, Franck Dernoncourt, Gagan Goel, Gao, Xiang, @Gautam, Gefu Tang,
-@guilherme, @guschmue, Hannah Provenza, Hans Pabst, @hartb, Hsiao Yi, Huazuo
-Gao, Igor ChorążEwicz, Ivan Smirnov, Jakub Kolodziejczyk, Jason Gavris, Jason
-Morton, Jay Young, Jayaram Bobba, Jeremy Sawruk, Jiaming Liu, Jihun Choi,
-@jiqiu, Joan Thibault, John C F, Jojy G Varghese, Jon Malmaud, Julian Berman,
-Julian Niedermeier, Junpeng Lao, Kai Sasaki, @Kankroc, Karl Lessard, Kyle
-Bostelmann, @Lezcano, Li Yi, Luo Yun, @lurker, Mahmoud-Abuzaina, Mandeep Singh,
-Marek Kolodziej, Mark Szepieniec, Martial Hue, Medhat Omr, Memo Akten, Michael
-Gharbi, MichaëL Defferrard, Milan Straka, @MircoT, @mlucool, Muammar Ibn Faisal,
-Nayana Thorat, @nghiattran, Nicholas Connor, Nikolaas Steenbergen, Niraj Patel,
-Niranjan Hasabnis, @Panmari, Pavel Bulanov, Philip Pries Henningsen, Philipp
-Jund, @polonez, Prayag Verma, Rahul Kavi, Raphael Gontijo Lopes, @rasbt, Raven
-Iqqe, Reid Pryzant, Richard Shin, Rizwan Asif, Russell Kaplan, Ryo Asakura,
-RüDiger Busche, Saisai Shao, Sam Abrahams, @sanosay, Sean Papay, @seaotterman,
-@selay01, Shaurya Sharma, Sriram Narayanamoorthy, Stefano Probst, @taknevski,
-@tbonza, @teldridge11, Yuan (Terry) Tang, Tim Anglade, Tomas Reimers, Tomer Gafner,
-Valentin Iovene, Vamsi Sripathi, Viktor Malyi, Vit Stepanovs, Vivek Rane, Vlad
-Firoiu, @wangg12, @will, Xiaoyu Tao, Yaroslav Bulatov, Yuan (Terry) Tang,
-@Yufeng, Yuming Wang, Yuxin Wu, Zafar Takhirov, Ziming Dong
+Visser, Brady Zhou, Calpa Liu, Changming Sun, Chih Cheng Liang, Christopher
+Berner, Clark Zinzow, @Conchylicultor, Dan Ellis, Dan J, Dan Jarvis, Daniel
+Ylitalo, Darren Garvey, David Norman, David Truong, @DavidNorman, Dimitar
+Pavlov, Dmitry Persiyanov, @Eddie, @elirex, Erfan Noury, Eron Wright, Evgeny
+Mazovetskiy, Fabrizio (Misto) Milo, @fanlu, Fisher Coder, Florian Courtial,
+Franck Dernoncourt, Gagan Goel, Gao, Xiang, @Gautam, Gefu Tang, @guilherme,
+@guschmue, Hannah Provenza, Hans Pabst, @hartb, Hsiao Yi, Huazuo Gao, Igor
+ChorążEwicz, Ivan Smirnov, Jakub Kolodziejczyk, Jason Gavris, Jason Morton, Jay
+Young, Jayaram Bobba, Jeremy Sawruk, Jiaming Liu, Jihun Choi, @jiqiu, Joan Thibault,
+John C F, Jojy George Varghese, Jon Malmaud, Julian Berman, Julian Niedermeier,
+Junpeng Lao, Kai Sasaki, @Kankroc, Karl Lessard, Kyle Bostelmann, @Lezcano, Li
+Yi, Luo Yun, @lurker, Mahmoud-Abuzaina, Mandeep Singh, Marek Kolodziej, Mark
+Szepieniec, Martial Hue, Medhat Omr, Memo Akten, Michael Gharbi, MichaëL Defferrard,
+Milan Straka, @MircoT, @mlucool, Muammar Ibn Faisal, Nayana Thorat, @nghiattran,
+Nicholas Connor, Nikolaas Steenbergen, Niraj Patel, Niranjan Hasabnis, @Panmari,
+Pavel Bulanov, Philip Pries Henningsen, Philipp Jund, @polonez, Prayag Verma, Rahul
+Kavi, Raphael Gontijo Lopes, @rasbt, Raven Iqqe, Reid Pryzant, Richard Shin, Rizwan
+Asif, Russell Kaplan, Ryo Asakura, RüDiger Busche, Saisai Shao, Sam Abrahams, @sanosay,
+Sean Papay, @seaotterman, @selay01, Shaurya Sharma, Sriram Narayanamoorthy, Stefano
+Probst, @taknevski, @tbonza, @teldridge11, Tim Anglade, Tomas Reimers, Tomer Gafner,
+Valentin Iovene, Vamsi Sripathi, Viktor Malyi, Vit Stepanovs, Vivek Rane, Vlad Firoiu,
+@wangg12, @will, Xiaoyu Tao, Yaroslav Bulatov, Yi Liu, Yuan (Terry) Tang, @Yufeng,
+Yuming Wang, Yuxin Wu, Zafar Takhirov, Ziming Dong
 
 We are also grateful to all who filed issues or helped resolve them, asked and
 answered questions, and were part of inspiring discussions.
diff --git a/WORKSPACE b/WORKSPACE
index cab8389a55ccfeddb9dc077c9b999edbe775f25d..edf655f6a7b0ab2781cf2d349732a102aedff112 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -2,11 +2,11 @@ workspace(name = "org_tensorflow")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "60fc6977908f999b23ca65698c2bb70213403824a84f7904310b6000d78be9ce",
-    strip_prefix = "rules_closure-5ca1dab6df9ad02050f7ba4e816407f88690cf7d",
+    sha256 = "4be8a887f6f38f883236e77bb25c2da10d506f2bf1a8e5d785c0f35574c74ca4",
+    strip_prefix = "rules_closure-aac19edc557aec9b603cd7ffe359401264ceff0d",
     urls = [
-        "http://bazel-mirror.storage.googleapis.com/github.com/bazelbuild/rules_closure/archive/5ca1dab6df9ad02050f7ba4e816407f88690cf7d.tar.gz",  # 2017-02-03
-        "https://github.com/bazelbuild/rules_closure/archive/5ca1dab6df9ad02050f7ba4e816407f88690cf7d.tar.gz",
+        "http://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/aac19edc557aec9b603cd7ffe359401264ceff0d.tar.gz",  # 2017-05-10
+        "https://github.com/bazelbuild/rules_closure/archive/aac19edc557aec9b603cd7ffe359401264ceff0d.tar.gz",
     ],
 )
 
@@ -20,7 +20,7 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
 #android_sdk_repository(
 #    name = "androidsdk",
 #    api_level = 23,
-#    # Ensure that you have the build_tools_version below installed in the 
+#    # Ensure that you have the build_tools_version below installed in the
 #    # SDK manager as it updates periodically.
 #    build_tools_version = "25.0.2",
 #    # Replace with path to Android SDK on your system
@@ -31,7 +31,7 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
 #android_ndk_repository(
 #    name="androidndk",
 #    path="<PATH_TO_NDK>",
-#    # This needs to be 14 or higher to compile TensorFlow. 
+#    # This needs to be 14 or higher to compile TensorFlow.
 #    # Note that the NDK version is not the API level.
 #    api_level=14)
 
@@ -39,485 +39,31 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
 tf_workspace()
 
 new_http_archive(
-  name = "inception5h",
-  build_file = "models.BUILD",
-  url = "https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip",
-  sha256 = "d13569f6a98159de37e92e9c8ec4dae8f674fbf475f69fe6199b514f756d4364"
-)
-
-new_http_archive(
-  name = "mobile_multibox",
-  build_file = "models.BUILD",
-  url = "https://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip",
-  sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96"
-)
-
-new_http_archive(
-  name = "stylize",
-  build_file = "models.BUILD",
-  url = "https://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip",
-  sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa"
-)
-
-# TENSORBOARD_BOWER_AUTOGENERATED_BELOW_THIS_LINE_DO_NOT_EDIT
-
-new_http_archive(
-  name = "d3",
-  build_file = "bower.BUILD",
-  url = "https://github.com/mbostock-bower/d3-bower/archive/v3.5.15.tar.gz",
-  strip_prefix = "d3-bower-3.5.15",
-)
-
-new_http_archive(
-  name = "dagre",
-  build_file = "bower.BUILD",
-  url = "https://github.com/cpettitt/dagre/archive/v0.7.4.tar.gz",
-  strip_prefix = "dagre-0.7.4",
-)
-
-new_http_archive(
-  name = "es6_promise",
-  build_file = "bower.BUILD",
-  url = "https://github.com/components/es6-promise/archive/v2.1.0.tar.gz",
-  strip_prefix = "es6-promise-2.1.0",
-)
-
-new_http_archive(
-  name = "font_roboto",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/font-roboto/archive/v1.0.1.tar.gz",
-  strip_prefix = "font-roboto-1.0.1",
-)
-
-new_http_archive(
-  name = "graphlib",
-  build_file = "bower.BUILD",
-  url = "https://github.com/cpettitt/graphlib/archive/v1.0.7.tar.gz",
-  strip_prefix = "graphlib-1.0.7",
-)
-
-new_http_archive(
-  name = "iron_a11y_announcer",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
-  strip_prefix = "iron-a11y-announcer-1.0.5",
-)
-
-new_http_archive(
-  name = "iron_a11y_keys_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
-  strip_prefix = "iron-a11y-keys-behavior-1.1.8",
-)
-
-new_http_archive(
-  name = "iron_ajax",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-ajax/archive/v1.2.0.tar.gz",
-  strip_prefix = "iron-ajax-1.2.0",
-)
-
-new_http_archive(
-  name = "iron_autogrow_textarea",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
-  strip_prefix = "iron-autogrow-textarea-1.0.12",
-)
-
-new_http_archive(
-  name = "iron_behaviors",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-behaviors/archive/v1.0.17.tar.gz",
-  strip_prefix = "iron-behaviors-1.0.17",
-)
-
-new_http_archive(
-  name = "iron_checked_element_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
-  strip_prefix = "iron-checked-element-behavior-1.0.4",
-)
-
-new_http_archive(
-  name = "iron_collapse",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-collapse/archive/v1.0.8.tar.gz",
-  strip_prefix = "iron-collapse-1.0.8",
-)
-
-new_http_archive(
-  name = "iron_dropdown",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-dropdown/archive/v1.4.0.tar.gz",
-  strip_prefix = "iron-dropdown-1.4.0",
-)
-
-new_http_archive(
-  name = "iron_fit_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-fit-behavior/archive/v1.2.5.tar.gz",
-  strip_prefix = "iron-fit-behavior-1.2.5",
-)
-
-new_http_archive(
-  name = "iron_flex_layout",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-flex-layout/archive/v1.3.0.tar.gz",
-  strip_prefix = "iron-flex-layout-1.3.0",
-)
-
-new_http_archive(
-  name = "iron_form_element_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
-  strip_prefix = "iron-form-element-behavior-1.0.6",
-)
-
-new_http_archive(
-  name = "iron_icon",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-icon/archive/v1.0.11.tar.gz",
-  strip_prefix = "iron-icon-1.0.11",
-)
-
-new_http_archive(
-  name = "iron_icons",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-icons/archive/v1.1.3.tar.gz",
-  strip_prefix = "iron-icons-1.1.3",
-)
-
-new_http_archive(
-  name = "iron_iconset_svg",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-iconset-svg/archive/v1.1.0.tar.gz",
-  strip_prefix = "iron-iconset-svg-1.1.0",
-)
-
-new_http_archive(
-  name = "iron_input",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-input/archive/1.0.10.tar.gz",
-  strip_prefix = "iron-input-1.0.10",
-)
-
-new_http_archive(
-  name = "iron_list",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-list/archive/v1.3.9.tar.gz",
-  strip_prefix = "iron-list-1.3.9",
-)
-
-new_http_archive(
-  name = "iron_menu_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-menu-behavior/archive/v1.1.10.tar.gz",
-  strip_prefix = "iron-menu-behavior-1.1.10",
-)
-
-new_http_archive(
-  name = "iron_meta",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-meta/archive/v1.1.1.tar.gz",
-  strip_prefix = "iron-meta-1.1.1",
-)
-
-new_http_archive(
-  name = "iron_overlay_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
-  strip_prefix = "iron-overlay-behavior-1.10.1",
-)
-
-new_http_archive(
-  name = "iron_range_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-range-behavior/archive/v1.0.4.tar.gz",
-  strip_prefix = "iron-range-behavior-1.0.4",
-)
-
-new_http_archive(
-  name = "iron_resizable_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
-  strip_prefix = "iron-resizable-behavior-1.0.3",
-)
-
-new_http_archive(
-  name = "iron_scroll_target_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
-  strip_prefix = "iron-scroll-target-behavior-1.0.3",
-)
-
-new_http_archive(
-  name = "iron_selector",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-selector/archive/v1.5.2.tar.gz",
-  strip_prefix = "iron-selector-1.5.2",
-)
-
-new_http_archive(
-  name = "iron_validatable_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
-  strip_prefix = "iron-validatable-behavior-1.1.1",
-)
-
-new_http_archive(
-  name = "lodash",
-  build_file = "bower.BUILD",
-  url = "https://github.com/lodash/lodash/archive/3.8.0.tar.gz",
-  strip_prefix = "lodash-3.8.0",
-)
-
-new_http_archive(
-  name = "neon_animation",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/neon-animation/archive/v1.2.2.tar.gz",
-  strip_prefix = "neon-animation-1.2.2",
-)
-
-http_file(
-  name = "numericjs_numeric_min_js",
-  url = "https://cdnjs.cloudflare.com/ajax/libs/numeric/1.2.6/numeric.min.js",
-)
-
-new_http_archive(
-  name = "paper_behaviors",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-behaviors/archive/v1.0.12.tar.gz",
-  strip_prefix = "paper-behaviors-1.0.12",
-)
-
-new_http_archive(
-  name = "paper_button",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-button/archive/v1.0.11.tar.gz",
-  strip_prefix = "paper-button-1.0.11",
-)
-
-new_http_archive(
-  name = "paper_checkbox",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-checkbox/archive/v1.4.0.tar.gz",
-  strip_prefix = "paper-checkbox-1.4.0",
-)
-
-new_http_archive(
-  name = "paper_dialog",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-dialog/archive/v1.0.4.tar.gz",
-  strip_prefix = "paper-dialog-1.0.4",
-)
-
-new_http_archive(
-  name = "paper_dialog_behavior",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
-  strip_prefix = "paper-dialog-behavior-1.2.5",
-)
-
-new_http_archive(
-  name = "paper_dialog_scrollable",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
-  strip_prefix = "paper-dialog-scrollable-1.1.5",
-)
-
-new_http_archive(
-  name = "paper_dropdown_menu",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
-  strip_prefix = "paper-dropdown-menu-1.4.0",
-)
-
-new_http_archive(
-  name = "paper_header_panel",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-header-panel/archive/v1.1.4.tar.gz",
-  strip_prefix = "paper-header-panel-1.1.4",
-)
-
-new_http_archive(
-  name = "paper_icon_button",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-icon-button/archive/v1.1.3.tar.gz",
-  strip_prefix = "paper-icon-button-1.1.3",
-)
-
-new_http_archive(
-  name = "paper_input",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-input/archive/v1.1.18.tar.gz",
-  strip_prefix = "paper-input-1.1.18",
-)
-
-new_http_archive(
-  name = "paper_item",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-item/archive/v1.1.4.tar.gz",
-  strip_prefix = "paper-item-1.1.4",
-)
-
-new_http_archive(
-  name = "paper_listbox",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-listbox/archive/v1.1.2.tar.gz",
-  strip_prefix = "paper-listbox-1.1.2",
-)
-
-new_http_archive(
-  name = "paper_material",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-material/archive/v1.0.6.tar.gz",
-  strip_prefix = "paper-material-1.0.6",
-)
-
-new_http_archive(
-  name = "paper_menu",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-menu/archive/v1.2.2.tar.gz",
-  strip_prefix = "paper-menu-1.2.2",
-)
-
-new_http_archive(
-  name = "paper_menu_button",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-menu-button/archive/v1.5.1.tar.gz",
-  strip_prefix = "paper-menu-button-1.5.1",
-)
-
-new_http_archive(
-  name = "paper_progress",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-progress/archive/v1.0.9.tar.gz",
-  strip_prefix = "paper-progress-1.0.9",
-)
-
-new_http_archive(
-  name = "paper_radio_button",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-radio-button/archive/v1.1.2.tar.gz",
-  strip_prefix = "paper-radio-button-1.1.2",
-)
-
-new_http_archive(
-  name = "paper_radio_group",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-radio-group/archive/v1.0.9.tar.gz",
-  strip_prefix = "paper-radio-group-1.0.9",
-)
-
-new_http_archive(
-  name = "paper_ripple",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-ripple/archive/v1.0.5.tar.gz",
-  strip_prefix = "paper-ripple-1.0.5",
-)
-
-new_http_archive(
-  name = "paper_slider",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-slider/archive/v1.0.10.tar.gz",
-  strip_prefix = "paper-slider-1.0.10",
-)
-
-new_http_archive(
-  name = "paper_spinner",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-spinner/archive/v1.1.1.tar.gz",
-  strip_prefix = "paper-spinner-1.1.1",
-)
-
-new_http_archive(
-  name = "paper_styles",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-styles/archive/v1.1.4.tar.gz",
-  strip_prefix = "paper-styles-1.1.4",
-)
-
-new_http_archive(
-  name = "paper_tabs",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-tabs/archive/v1.7.0.tar.gz",
-  strip_prefix = "paper-tabs-1.7.0",
-)
-
-new_http_archive(
-  name = "paper_toast",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-toast/archive/v1.3.0.tar.gz",
-  strip_prefix = "paper-toast-1.3.0",
-)
-
-new_http_archive(
-  name = "paper_toggle_button",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-toggle-button/archive/v1.2.0.tar.gz",
-  strip_prefix = "paper-toggle-button-1.2.0",
-)
-
-new_http_archive(
-  name = "paper_toolbar",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-toolbar/archive/v1.1.4.tar.gz",
-  strip_prefix = "paper-toolbar-1.1.4",
-)
-
-new_http_archive(
-  name = "paper_tooltip",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerelements/paper-tooltip/archive/v1.1.2.tar.gz",
-  strip_prefix = "paper-tooltip-1.1.2",
-)
-
-new_http_archive(
-  name = "plottable",
-  build_file = "bower.BUILD",
-  url = "https://github.com/palantir/plottable/archive/v1.16.1.tar.gz",
-  strip_prefix = "plottable-1.16.1",
-)
-
-new_http_archive(
-  name = "polymer",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymer/polymer/archive/v1.7.0.tar.gz",
-  strip_prefix = "polymer-1.7.0",
-)
-
-new_http_archive(
-  name = "promise_polyfill",
-  build_file = "bower.BUILD",
-  url = "https://github.com/polymerlabs/promise-polyfill/archive/v1.0.0.tar.gz",
-  strip_prefix = "promise-polyfill-1.0.0",
-)
-
-http_file(
-  name = "three_js_three_min_js",
-  url = "https://raw.githubusercontent.com/mrdoob/three.js/r77/build/three.min.js",
-)
-
-http_file(
-  name = "three_js_orbitcontrols_js",
-  url = "https://raw.githubusercontent.com/mrdoob/three.js/r77/examples/js/controls/OrbitControls.js",
+    name = "inception5h",
+    build_file = "models.BUILD",
+    sha256 = "d13569f6a98159de37e92e9c8ec4dae8f674fbf475f69fe6199b514f756d4364",
+    urls = [
+        "http://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip",
+        "http://download.tensorflow.org/models/inception5h.zip",
+    ],
 )
 
 new_http_archive(
-  name = "web_animations_js",
-  build_file = "bower.BUILD",
-  url = "https://github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
-  strip_prefix = "web-animations-js-2.2.1",
+    name = "mobile_multibox",
+    build_file = "models.BUILD",
+    sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96",
+    urls = [
+        "http://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip",
+        "http://download.tensorflow.org/models/mobile_multibox_v1a.zip",
+    ],
 )
 
 new_http_archive(
-  name = "webcomponentsjs",
-  build_file = "bower.BUILD",
-  url = "https://github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
-  strip_prefix = "webcomponentsjs-0.7.22",
-)
-
-http_file(
-  name = "weblas_weblas_js",
-  url = "https://raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
+    name = "stylize",
+    build_file = "models.BUILD",
+    sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa",
+    urls = [
+        "http://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip",
+        "http://download.tensorflow.org/models/stylize_v1.zip",
+    ],
 )
diff --git a/bower.BUILD b/bower.BUILD
deleted file mode 100644
index eabd1d6450728aab37ebeca6366009d74c6984b6..0000000000000000000000000000000000000000
--- a/bower.BUILD
+++ /dev/null
@@ -1,645 +0,0 @@
-# AUTOGENERATED FILE by tensorboard_bower_dependency_sync.py
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
-    name = "d3",
-    srcs = [
-        "d3.js",
-        "d3.min.js",
-        "package.js",
-    ],
-)
-
-filegroup(
-    name = "dagre",
-    srcs = [
-        "dist/dagre.core.js",
-        "dist/dagre.core.min.js",
-    ],
-)
-
-filegroup(
-    name = "es6_promise",
-    srcs = [
-        "promise.js",
-        "promise.min.js",
-    ],
-)
-
-filegroup(
-    name = "font_roboto",
-    srcs = ["roboto.html"],
-)
-
-filegroup(
-    name = "graphlib",
-    srcs = [
-        "dist/graphlib.core.js",
-        "dist/graphlib.core.min.js",
-    ],
-)
-
-filegroup(
-    name = "iron_a11y_announcer",
-    srcs = [
-        "index.html",
-        "iron-a11y-announcer.html",
-    ],
-)
-
-filegroup(
-    name = "iron_a11y_keys_behavior",
-    srcs = [
-        "index.html",
-        "iron-a11y-keys-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "iron_ajax",
-    srcs = [
-        "index.html",
-        "iron-ajax.html",
-        "iron-request.html",
-    ],
-)
-
-filegroup(
-    name = "iron_autogrow_textarea",
-    srcs = [
-        "index.html",
-        "iron-autogrow-textarea.html",
-    ],
-)
-
-filegroup(
-    name = "iron_behaviors",
-    srcs = [
-        "index.html",
-        "iron-button-state.html",
-        "iron-control-state.html",
-    ],
-)
-
-filegroup(
-    name = "iron_checked_element_behavior",
-    srcs = [
-        "index.html",
-        "iron-checked-element-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "iron_collapse",
-    srcs = [
-        "index.html",
-        "iron-collapse.html",
-    ],
-)
-
-filegroup(
-    name = "iron_dropdown",
-    srcs = [
-        "index.html",
-        "iron-dropdown.html",
-        "iron-dropdown-scroll-manager.html",
-    ],
-)
-
-filegroup(
-    name = "iron_fit_behavior",
-    srcs = [
-        "index.html",
-        "iron-fit-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "iron_flex_layout",
-    srcs = [
-        "classes/iron-flex-layout.html",
-        "classes/iron-shadow-flex-layout.html",
-        "index.html",
-        "iron-flex-layout.html",
-        "iron-flex-layout-classes.html",
-    ],
-)
-
-filegroup(
-    name = "iron_form_element_behavior",
-    srcs = [
-        "index.html",
-        "iron-form-element-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "iron_icon",
-    srcs = [
-        "index.html",
-        "iron-icon.html",
-    ],
-)
-
-filegroup(
-    name = "iron_icons",
-    srcs = [
-        "av-icons.html",
-        "communication-icons.html",
-        "device-icons.html",
-        "editor-icons.html",
-        "hardware-icons.html",
-        "image-icons.html",
-        "index.html",
-        "iron-icons.html",
-        "maps-icons.html",
-        "notification-icons.html",
-        "places-icons.html",
-        "social-icons.html",
-    ],
-)
-
-filegroup(
-    name = "iron_iconset_svg",
-    srcs = [
-        "index.html",
-        "iron-iconset-svg.html",
-    ],
-)
-
-filegroup(
-    name = "iron_input",
-    srcs = [
-        "index.html",
-        "iron-input.html",
-    ],
-)
-
-filegroup(
-    name = "iron_list",
-    srcs = [
-        "index.html",
-        "iron-list.html",
-        "test/smoke/avg-worst-case.html",
-        "test/smoke/dummy-data.html",
-        "test/smoke/index.html",
-        "test/smoke/physical-count.html",
-    ],
-)
-
-filegroup(
-    name = "iron_menu_behavior",
-    srcs = [
-        "index.html",
-        "iron-menu-behavior.html",
-        "iron-menubar-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "iron_meta",
-    srcs = [
-        "index.html",
-        "iron-meta.html",
-    ],
-)
-
-filegroup(
-    name = "iron_overlay_behavior",
-    srcs = [
-        "index.html",
-        "iron-focusables-helper.html",
-        "iron-overlay-backdrop.html",
-        "iron-overlay-behavior.html",
-        "iron-overlay-manager.html",
-    ],
-)
-
-filegroup(
-    name = "iron_range_behavior",
-    srcs = [
-        "index.html",
-        "iron-range-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "iron_resizable_behavior",
-    srcs = [
-        "demo/src/x-app.html",
-        "index.html",
-        "iron-resizable-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "iron_scroll_target_behavior",
-    srcs = [
-        "index.html",
-        "iron-scroll-target-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "iron_selector",
-    srcs = [
-        "index.html",
-        "iron-multi-selectable.html",
-        "iron-selectable.html",
-        "iron-selection.html",
-        "iron-selector.html",
-    ],
-)
-
-filegroup(
-    name = "iron_validatable_behavior",
-    srcs = [
-        "index.html",
-        "iron-validatable-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "lodash",
-    srcs = [
-        "lodash.js",
-        "lodash.min.js",
-    ],
-)
-
-filegroup(
-    name = "neon_animation",
-    srcs = [
-        "animations/cascaded-animation.html",
-        "animations/fade-in-animation.html",
-        "animations/fade-out-animation.html",
-        "animations/hero-animation.html",
-        "animations/opaque-animation.html",
-        "animations/reverse-ripple-animation.html",
-        "animations/ripple-animation.html",
-        "animations/scale-down-animation.html",
-        "animations/scale-up-animation.html",
-        "animations/slide-down-animation.html",
-        "animations/slide-from-bottom-animation.html",
-        "animations/slide-from-left-animation.html",
-        "animations/slide-from-right-animation.html",
-        "animations/slide-from-top-animation.html",
-        "animations/slide-left-animation.html",
-        "animations/slide-right-animation.html",
-        "animations/slide-up-animation.html",
-        "animations/transform-animation.html",
-        "demo/card/index.html",
-        "demo/card/x-card.html",
-        "demo/card/x-cards-list.html",
-        "demo/declarative/index.html",
-        "demo/doc/index.html",
-        "demo/doc/my-animatable.html",
-        "demo/doc/my-dialog.html",
-        "demo/dropdown/animated-dropdown.html",
-        "demo/dropdown/index.html",
-        "demo/grid/animated-grid.html",
-        "demo/grid/fullsize-page-with-card.html",
-        "demo/grid/index.html",
-        "demo/list/full-view.html",
-        "demo/list/index.html",
-        "demo/list/list-demo.html",
-        "demo/list/list-view.html",
-        "demo/load/animated-grid.html",
-        "demo/load/full-page.html",
-        "demo/load/index.html",
-        "demo/reprojection/animated-grid.html",
-        "demo/reprojection/fullsize-page-with-card.html",
-        "demo/reprojection/index.html",
-        "demo/reprojection/reprojected-pages.html",
-        "demo/tiles/circles-page.html",
-        "demo/tiles/index.html",
-        "demo/tiles/squares-page.html",
-        "index.html",
-        "neon-animatable.html",
-        "neon-animatable-behavior.html",
-        "neon-animated-pages.html",
-        "neon-animation.html",
-        "neon-animation-behavior.html",
-        "neon-animation-runner-behavior.html",
-        "neon-animations.html",
-        "neon-shared-element-animatable-behavior.html",
-        "neon-shared-element-animation-behavior.html",
-        "web-animations.html",
-    ],
-)
-
-filegroup(
-    name = "paper_behaviors",
-    srcs = [
-        "index.html",
-        "paper-button-behavior.html",
-        "paper-checked-element-behavior.html",
-        "paper-inky-focus-behavior.html",
-        "paper-ripple-behavior.html",
-    ],
-)
-
-filegroup(
-    name = "paper_button",
-    srcs = [
-        "index.html",
-        "paper-button.html",
-    ],
-)
-
-filegroup(
-    name = "paper_checkbox",
-    srcs = [
-        "index.html",
-        "paper-checkbox.html",
-    ],
-)
-
-filegroup(
-    name = "paper_dialog",
-    srcs = [
-        "index.html",
-        "paper-dialog.html",
-    ],
-)
-
-filegroup(
-    name = "paper_dialog_behavior",
-    srcs = [
-        "index.html",
-        "paper-dialog-behavior.html",
-        "paper-dialog-common.css",
-        "paper-dialog-shared-styles.html",
-    ],
-)
-
-filegroup(
-    name = "paper_dialog_scrollable",
-    srcs = [
-        "index.html",
-        "paper-dialog-scrollable.html",
-    ],
-)
-
-filegroup(
-    name = "paper_dropdown_menu",
-    srcs = [
-        "index.html",
-        "paper-dropdown-menu.html",
-        "paper-dropdown-menu-icons.html",
-        "paper-dropdown-menu-light.html",
-        "paper-dropdown-menu-shared-styles.html",
-    ],
-)
-
-filegroup(
-    name = "paper_header_panel",
-    srcs = [
-        "index.html",
-        "paper-header-panel.html",
-    ],
-)
-
-filegroup(
-    name = "paper_icon_button",
-    srcs = [
-        "index.html",
-        "paper-icon-button.html",
-        "paper-icon-button-light.html",
-    ],
-)
-
-filegroup(
-    name = "paper_input",
-    srcs = [
-        "all-imports.html",
-        "index.html",
-        "paper-input.html",
-        "paper-input-addon-behavior.html",
-        "paper-input-behavior.html",
-        "paper-input-char-counter.html",
-        "paper-input-container.html",
-        "paper-input-error.html",
-        "paper-textarea.html",
-    ],
-)
-
-filegroup(
-    name = "paper_item",
-    srcs = [
-        "all-imports.html",
-        "index.html",
-        "paper-icon-item.html",
-        "paper-item.html",
-        "paper-item-behavior.html",
-        "paper-item-body.html",
-        "paper-item-shared-styles.html",
-    ],
-)
-
-filegroup(
-    name = "paper_listbox",
-    srcs = [
-        "index.html",
-        "paper-listbox.html",
-    ],
-)
-
-filegroup(
-    name = "paper_material",
-    srcs = [
-        "index.html",
-        "paper-material.html",
-        "paper-material-shared-styles.html",
-    ],
-)
-
-filegroup(
-    name = "paper_menu",
-    srcs = [
-        "index.html",
-        "paper-menu.html",
-        "paper-menu-shared-styles.html",
-        "paper-submenu.html",
-    ],
-)
-
-filegroup(
-    name = "paper_menu_button",
-    srcs = [
-        "index.html",
-        "paper-menu-button.html",
-        "paper-menu-button-animations.html",
-    ],
-)
-
-filegroup(
-    name = "paper_progress",
-    srcs = [
-        "index.html",
-        "paper-progress.html",
-    ],
-)
-
-filegroup(
-    name = "paper_radio_button",
-    srcs = [
-        "index.html",
-        "paper-radio-button.html",
-    ],
-)
-
-filegroup(
-    name = "paper_radio_group",
-    srcs = [
-        "index.html",
-        "paper-radio-group.html",
-    ],
-)
-
-filegroup(
-    name = "paper_ripple",
-    srcs = [
-        "index.html",
-        "paper-ripple.html",
-    ],
-)
-
-filegroup(
-    name = "paper_slider",
-    srcs = [
-        "index.html",
-        "paper-slider.html",
-    ],
-)
-
-filegroup(
-    name = "paper_spinner",
-    srcs = [
-        "index.html",
-        "paper-spinner.html",
-        "paper-spinner-behavior.html",
-        "paper-spinner-lite.html",
-        "paper-spinner-styles.html",
-    ],
-)
-
-filegroup(
-    name = "paper_styles",
-    srcs = [
-        "classes/global.html",
-        "classes/shadow.html",
-        "classes/shadow-layout.html",
-        "classes/typography.html",
-        "color.html",
-        "default-theme.html",
-        "demo.css",
-        "demo-pages.html",
-        "index.html",
-        "paper-styles.html",
-        "paper-styles-classes.html",
-        "shadow.html",
-        "typography.html",
-    ],
-)
-
-filegroup(
-    name = "paper_tabs",
-    srcs = [
-        "index.html",
-        "paper-tab.html",
-        "paper-tabs.html",
-        "paper-tabs-icons.html",
-    ],
-)
-
-filegroup(
-    name = "paper_toast",
-    srcs = [
-        "index.html",
-        "paper-toast.html",
-    ],
-)
-
-filegroup(
-    name = "paper_toggle_button",
-    srcs = [
-        "index.html",
-        "paper-toggle-button.html",
-    ],
-)
-
-filegroup(
-    name = "paper_toolbar",
-    srcs = [
-        "index.html",
-        "paper-toolbar.html",
-    ],
-)
-
-filegroup(
-    name = "paper_tooltip",
-    srcs = [
-        "index.html",
-        "paper-tooltip.html",
-    ],
-)
-
-filegroup(
-    name = "plottable",
-    srcs = [
-        "plottable.css",
-        "plottable.js",
-        "plottable.min.js",
-    ],
-)
-
-filegroup(
-    name = "polymer",
-    srcs = [
-        "polymer.html",
-        "polymer-micro.html",
-        "polymer-mini.html",
-    ],
-)
-
-filegroup(
-    name = "promise_polyfill",
-    srcs = [
-        "Gruntfile.js",
-        "Promise.js",
-        "Promise.min.js",
-        "Promise-Statics.js",
-        "promise-polyfill.html",
-        "promise-polyfill-lite.html",
-    ],
-)
-
-filegroup(
-    name = "web_animations_js",
-    srcs = [
-        "web-animations.html",
-        "web-animations.min.js",
-        "web-animations-next.min.js",
-        "web-animations-next-lite.min.js",
-    ],
-)
-
-filegroup(
-    name = "webcomponentsjs",
-    srcs = [
-        "CustomElements.js",
-        "CustomElements.min.js",
-        "HTMLImports.js",
-        "HTMLImports.min.js",
-        "MutationObserver.js",
-        "MutationObserver.min.js",
-        "ShadowDOM.js",
-        "ShadowDOM.min.js",
-        "webcomponents.js",
-        "webcomponents.min.js",
-        "webcomponents-lite.js",
-        "webcomponents-lite.min.js",
-    ],
-)
diff --git a/configure b/configure
index 6360641be2ca99c8c8cbe58c95fc2fd59f917744..e455893ffc8539b0c9175b6acc242427b0930ce4 100755
--- a/configure
+++ b/configure
@@ -28,19 +28,12 @@ function is_macos() {
 
 function is_windows() {
   # On windows, the shell script is actually running in msys
-  if [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]]; then
-    true
-  else
-    false
-  fi
+  [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]]
 }
 
-function sed_hyphen_i() {
-  if is_macos; then
-    sed -i '' "$@"
-  else
-    sed -i "$@"
-  fi
+function sed_in_place() {
+  sed -e $1 $2 > "$2.bak"
+  mv "$2.bak" $2
 }
 
 function write_to_bazelrc() {
@@ -51,12 +44,133 @@ function write_action_env_to_bazelrc() {
   write_to_bazelrc "build --action_env $1=\"$2\""
 }
 
+function python_path {
+  "$PYTHON_BIN_PATH" - <<END
+from __future__ import print_function
+import site
+import os
+
+try:
+  input = raw_input
+except NameError:
+  pass
+
+python_paths = []
+if os.getenv('PYTHONPATH') is not None:
+  python_paths = os.getenv('PYTHONPATH').split(':')
+try:
+  library_paths = site.getsitepackages()
+except AttributeError:
+ from distutils.sysconfig import get_python_lib
+ library_paths = [get_python_lib()]
+all_paths = set(python_paths + library_paths)
+
+paths = []
+for path in all_paths:
+  if os.path.isdir(path):
+    paths.append(path)
+
+print(",".join(paths))
+END
+}
+
+function setup_python {
+  ## Set up python-related environment settings:
+  while true; do
+    fromuser=""
+    if [ -z "$PYTHON_BIN_PATH" ]; then
+      default_python_bin_path=$(which python || which python3 || true)
+      read -p "Please specify the location of python. [Default is $default_python_bin_path]: " PYTHON_BIN_PATH
+      fromuser="1"
+      if [ -z "$PYTHON_BIN_PATH" ]; then
+        PYTHON_BIN_PATH=$default_python_bin_path
+      fi
+    fi
+    if [ -e "$PYTHON_BIN_PATH" ]; then
+      break
+    fi
+    echo "Invalid python path. ${PYTHON_BIN_PATH} cannot be found" 1>&2
+    if [ -z "$fromuser" ]; then
+      exit 1
+    fi
+    PYTHON_BIN_PATH=""
+    # Retry
+  done
+
+  if [ -z "$PYTHON_LIB_PATH" ]; then
+    # Split python_path into an array of paths, this allows path containing spaces
+    IFS=','
+    python_lib_path=($(python_path))
+    unset IFS
+
+    if [ 1 = "$USE_DEFAULT_PYTHON_LIB_PATH" ]; then
+      PYTHON_LIB_PATH=${python_lib_path[0]}
+      echo "Using python library path: $PYTHON_LIB_PATH"
+
+    else
+      echo "Found possible Python library paths:"
+      for x in "${python_lib_path[@]}"; do
+        echo "  $x"
+      done
+      set -- "${python_lib_path[@]}"
+      echo "Please input the desired Python library path to use.  Default is ["$1"]"
+      read b || true
+      if [ "$b" == "" ]; then
+        PYTHON_LIB_PATH=${python_lib_path[0]}
+        echo "Using python library path: $PYTHON_LIB_PATH"
+      else
+        PYTHON_LIB_PATH="$b"
+      fi
+    fi
+  fi
+
+  if [ ! -x "$PYTHON_BIN_PATH" ]  || [ -d "$PYTHON_BIN_PATH" ]; then
+    echo "PYTHON_BIN_PATH is not executable.  Is it the python binary?"
+    exit 1
+  fi
+
+  local python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);' | head -c1)
+  if [ -z "$python_major_version" ]; then
+    echo -e "\n\nERROR: Problem getting python version.  Is $PYTHON_BIN_PATH the correct python binary?"
+    exit 1
+  fi
+
+  # Convert python path to Windows style before writing into bazel.rc
+  if is_windows; then
+    PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")"
+  fi
+
+  # Set-up env variables used by python_configure.bzl
+  write_action_env_to_bazelrc "PYTHON_BIN_PATH" "$PYTHON_BIN_PATH"
+  write_action_env_to_bazelrc "PYTHON_LIB_PATH" "$PYTHON_LIB_PATH"
+  write_to_bazelrc "build --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
+  write_to_bazelrc "build --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
+  write_to_bazelrc "build --force_python=py$python_major_version"
+  write_to_bazelrc "build --host_force_python=py$python_major_version"
+  write_to_bazelrc "build --python${python_major_version}_path=\"$PYTHON_BIN_PATH\""
+  write_to_bazelrc "test --force_python=py$python_major_version"
+  write_to_bazelrc "test --host_force_python=py$python_major_version"
+  write_to_bazelrc "test --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
+  write_to_bazelrc "test --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
+  write_to_bazelrc "run --define PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\""
+  write_to_bazelrc "run --define PYTHON_LIB_PATH=\"$PYTHON_LIB_PATH\""
+
+  # Write tools/python_bin_path.sh
+  echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh
+}
+
 # This file contains customized config settings.
 rm -f .tf_configure.bazelrc
 touch .tf_configure.bazelrc
-touch .bazelrc
-sed_hyphen_i "/tf_configure/d" .bazelrc
-echo "import .tf_configure.bazelrc" >> .bazelrc
+if [[ ! -e .bazelrc ]]; then
+  if [[ -e "${HOME}/.bazelrc" ]]; then
+    echo "import ${HOME}/.bazelrc" >.bazelrc
+  else
+    touch .bazelrc
+  fi
+fi
+sed_in_place "/tf_configure/d" .bazelrc
+echo "import %workspace%/.tf_configure.bazelrc" >> .bazelrc
 
 # Delete any leftover BUILD files from the Makefile build, which would interfere
 # with Bazel parsing.
@@ -65,58 +179,63 @@ if [ -d "${MAKEFILE_DOWNLOAD_DIR}" ]; then
   find ${MAKEFILE_DOWNLOAD_DIR} -type f -name '*BUILD' -delete
 fi
 
-## Set up python-related environment settings
-while true; do
+setup_python
+
+## Set up MKL related environment settings
+while [ "$TF_NEED_MKL" == "" ]; do
   fromuser=""
-  if [ -z "$PYTHON_BIN_PATH" ]; then
-    default_python_bin_path=$(which python || which python3  || true)
-    read -p "Please specify the location of python. [Default is $default_python_bin_path]: " PYTHON_BIN_PATH
-    fromuser="1"
-    if [ -z "$PYTHON_BIN_PATH" ]; then
-      PYTHON_BIN_PATH=$default_python_bin_path
-    fi
-  fi
-  if [ -e "$PYTHON_BIN_PATH" ]; then
-    break
-  fi
-  echo "Invalid python path. ${PYTHON_BIN_PATH} cannot be found" 1>&2
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  PYTHON_BIN_PATH=""
-  # Retry
+  read -p "Do you wish to build TensorFlow with MKL support? [y/N] " INPUT
+  fromuser="1"
+  case $INPUT in
+    [Yy]* ) echo "MKL support will be enabled for TensorFlow"; TF_NEED_MKL=1;;
+    [Nn]* ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
+    "" ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
+    * ) echo "Invalid selection: " $INPUT;;
+  esac
 done
 
-## Set up MKL related environment settings
-if false; then # Disable building with MKL for now
-  while [ "$TF_NEED_MKL" == "" ]; do
+OSNAME=`uname -s`
+
+if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL
+  while [ "$TF_DOWNLOAD_MKL" == "" ]; do
     fromuser=""
-    read -p "Do you wish to build TensorFlow with MKL support? [y/N] " INPUT
+    read -p "Do you wish to download MKL LIB from the web? [Y/n] " INPUT
     fromuser="1"
     case $INPUT in
-      [Yy]* ) echo "MKL support will be enabled for TensorFlow"; TF_NEED_MKL=1;;
-      [Nn]* ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
-      "" ) echo "No MKL support will be enabled for TensorFlow"; TF_NEED_MKL=0;;
-      * ) echo "Invalid selection: " $INPUT;;
+      [Yy]* ) TF_DOWNLOAD_MKL=1;;
+      [Nn]* ) TF_DOWNLOAD_MKL=0;;
+      "" )    TF_DOWNLOAD_MKL=1;;
+      * )     echo "Invalid selection: " $INPUT; exit 1;;
     esac
   done
 
-  OSNAME=`uname -s`
-
-  if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL
+  if [[ "$TF_DOWNLOAD_MKL" == "1" ]]; then
     DST=`dirname $0`
-    ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170209.tgz
-    GITHUB_RELEASE_TAG=v0.5
+    ARCHIVE_BASENAME=mklml_lnx_2018.0.20170425.tgz
+    GITHUB_RELEASE_TAG=v0.7
     MKLURL="https://github.com/01org/mkl-dnn/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME"
-    if ! [ -e "$DST/third_party/mkl/$ARCHIVE_BASENAME" ]; then
-      wget --no-check-certificate -P $DST/third_party/mkl/ $MKLURL
+    if ! [ -e "${DST}/third_party/mkl/${ARCHIVE_BASENAME}" ]; then
+      curl -fSsL -o "${DST}/third_party/mkl/${ARCHIVE_BASENAME}" "${MKLURL}"
     fi
     tar -xzf $DST/third_party/mkl/$ARCHIVE_BASENAME -C $DST/third_party/mkl/
     extracted_dir_name="${ARCHIVE_BASENAME%.*}"
     MKL_INSTALL_PATH=$DST/third_party/mkl/$extracted_dir_name
     MKL_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${MKL_INSTALL_PATH}')))"`
 
-    if [ "$OSNAME" == "Linux" ]; then
+  else
+    default_mkl_path=/opt/intel/mklml
+    fromuser=""
+    read -p "Please specify the location where MKL is installed. [Default is $default_mkl_path]: " MKL_INSTALL_PATH
+    fromuser="1"
+    if [ -z "$MKL_INSTALL_PATH" ]; then
+      MKL_INSTALL_PATH=$default_mkl_path
+    fi
+    # Result returned from "read" will be used unexpanded. That make "~" unusable.
+    # Going through one more level of expansion to handle that.
+    MKL_INSTALL_PATH=`${PYTHON_BIN_PATH} -c "import os; print(os.path.realpath(os.path.expanduser('${MKL_INSTALL_PATH}')))"`
+  fi
+
+  if [ "$OSNAME" == "Linux" ]; then
       # Full MKL configuration
       MKL_RT_LIB_PATH="lib/intel64/libmkl_rt.so" #${TF_MKL_EXT}#TODO version?
       MKL_RT_OMP_LIB_PATH="../compiler/lib/intel64/libiomp5.so" #TODO VERSION?
@@ -124,24 +243,29 @@ if false; then # Disable building with MKL for now
       # MKL-ML configuration
       MKL_ML_LIB_PATH="lib/libmklml_intel.so" #${TF_MKL_EXT}#TODO version?
       MKL_ML_OMP_LIB_PATH="lib/libiomp5.so" #TODO VERSION?
-    elif [ "$OSNAME" == "Darwin" ]; then
+  elif [ "$OSNAME" == "Darwin" ]; then
       echo "Darwin is unsupported yet";
       exit 1
-    fi
+  fi
 
-    if [ -e "$MKL_INSTALL_PATH/${MKL_ML_LIB_PATH}" ]; then
+  if [ -e "$MKL_INSTALL_PATH/${MKL_ML_LIB_PATH}" ]; then
       ln -sf $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} third_party/mkl/
       ln -sf $MKL_INSTALL_PATH/${MKL_ML_OMP_LIB_PATH} third_party/mkl/
       ln -sf $MKL_INSTALL_PATH/include third_party/mkl/
       ln -sf $MKL_INSTALL_PATH/include third_party/eigen3/mkl_include
-    else
-      echo "ERROR: $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} does not exist";
-      exit 1
-    fi
-
-    if [ -z "$fromuser" ]; then
+      loc=$(locate -e libdl.so.2 | sed -n 1p)
+      ln -sf $loc third_party/mkl/libdl.so.2
+  elif [ -e "$MKL_INSTALL_PATH/${MKL_RT_LIB_PATH}" ]; then
+      ln -sf $MKL_INSTALL_PATH/${MKL_RT_LIB_PATH} third_party/mkl/
+      ln -sf $MKL_INSTALL_PATH/${MKL_RT_OMP_LIB_PATH} third_party/mkl/
+      ln -sf $MKL_INSTALL_PATH/include third_party/mkl/
+      ln -sf $MKL_INSTALL_PATH/include third_party/eigen3/mkl_include
+      loc=$(locate -e libdl.so.2 | sed -n 1p)
+      ln -sf $loc third_party/mkl/libdl.so.2
+  else
+      echo "ERROR: $MKL_INSTALL_PATH/${MKL_ML_LIB_PATH} nor $MKL_INSTALL_PATH/${MKL_RT_LIB_PATH} exists";
       exit 1
-    fi
+  fi
 
 cat > third_party/mkl/mkl.config <<EOF
 # MKL_INSTALL_PATH refers to the location of MKL root folder. The MKL header and library
@@ -149,9 +273,8 @@ cat > third_party/mkl/mkl.config <<EOF
 MKL_INSTALL_PATH=$MKL_INSTALL_PATH
 EOF
 
-  fi # TF_NEED_MKL
-  ################## MKL
-fi # Disable building with MKL for now
+fi # TF_NEED_MKL
+## End MKL setup
 
 ## Set up architecture-dependent optimization flags.
 if [ -z "$CC_OPT_FLAGS" ]; then
@@ -241,14 +364,28 @@ if [[ "$TF_ENABLE_XLA" == "1" ]]; then
   write_to_bazelrc 'build --define with_xla_support=true'
 fi
 
+# Verbs configuration
+while [ "$TF_NEED_VERBS" == "" ]; do
+  read -p "Do you wish to build TensorFlow with "\
+"VERBS support? [y/N] " INPUT
+  case $INPUT in
+    [Yy]* ) echo "VERBS support will be enabled for "\
+"TensorFlow"; TF_NEED_VERBS=1;;
+    [Nn]* ) echo "No VERBS support will be enabled for "\
+"TensorFlow"; TF_NEED_VERBS=0;;
+    "" ) echo "No VERBS support will be enabled for "\
+"TensorFlow"; TF_NEED_VERBS=0;;
+    * ) echo "Invalid selection: " $INPUT;;
+  esac
+done
 
-# Invoke python_config and set up symlinks to python includes
-./util/python/python_config.sh --setup "$PYTHON_BIN_PATH"
+if [[ "$TF_NEED_VERBS" == "1" ]]; then
+  write_to_bazelrc 'build --define with_verbs_support=true'
+fi
 
 # Append CC optimization flags to bazel.rc
-echo >> tools/bazel.rc
 for opt in $CC_OPT_FLAGS; do
-  echo "build:opt --cxxopt=$opt --copt=$opt" >> tools/bazel.rc
+  write_to_bazelrc "build:opt --cxxopt=$opt --copt=$opt"
 done
 
 # Run the gen_git_source to create links where bazel can track dependencies for
@@ -284,6 +421,7 @@ export TF_NEED_CUDA
 write_action_env_to_bazelrc "TF_NEED_CUDA" "$TF_NEED_CUDA"
 
 export TF_NEED_OPENCL
+write_action_env_to_bazelrc "TF_NEED_OPENCL" "$TF_NEED_OPENCL"
 
 if [ "$TF_NEED_CUDA" == "1" ]; then
 while [[ "$TF_CUDA_CLANG" == "" ]]; do
@@ -299,31 +437,6 @@ done
 export TF_CUDA_CLANG
 write_action_env_to_bazelrc "TF_CUDA_CLANG" "$TF_CUDA_CLANG"
 
-# Set up which gcc nvcc should use as the host compiler
-# No need to set this on Windows
-while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
-  fromuser=""
-  if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
-    default_gcc_host_compiler_path=$(which gcc || true)
-    read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH
-    fromuser="1"
-    if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
-      GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path"
-    fi
-  fi
-  if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
-    export GCC_HOST_COMPILER_PATH
-    write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH"
-    break
-  fi
-  echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
-  if [ -z "$fromuser" ]; then
-    exit 1
-  fi
-  GCC_HOST_COMPILER_PATH=""
-  # Retry
-done
-
 # Set up which clang we should use as the cuda / host compiler.
 while [[ "$TF_CUDA_CLANG" == "1" ]] && true; do
   fromuser=""
@@ -364,6 +477,11 @@ while true; do
       else
         default_cuda_path="$(cygpath -m "$CUDA_PATH")"
       fi
+    elif is_linux; then
+      # If the default doesn't exist, try an alternative default.
+      if [ ! -d $default_cuda_path ] && [ -d /opt/cuda ]; then
+        default_cuda_path=/opt/cuda
+      fi
     fi
     read -p "Please specify the location where CUDA $TF_CUDA_VERSION toolkit is installed. Refer to README.md for more details. [Default is $default_cuda_path]: " CUDA_TOOLKIT_PATH
     fromuser="1"
@@ -403,6 +521,35 @@ while true; do
   CUDA_TOOLKIT_PATH=""
 done
 
+# Set up which gcc nvcc should use as the host compiler
+# No need to set this on Windows
+while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
+  fromuser=""
+  if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
+    default_gcc_host_compiler_path=$(which gcc || true)
+    cuda_bin_symlink="$CUDA_TOOLKIT_PATH/bin/gcc"
+    if [ -L "$cuda_bin_symlink" ]; then
+      default_gcc_host_compiler_path=$(readlink $cuda_bin_symlink)
+    fi
+    read -p "Please specify which gcc should be used by nvcc as the host compiler. [Default is $default_gcc_host_compiler_path]: " GCC_HOST_COMPILER_PATH
+    fromuser="1"
+    if [ -z "$GCC_HOST_COMPILER_PATH" ]; then
+      GCC_HOST_COMPILER_PATH="$default_gcc_host_compiler_path"
+    fi
+  fi
+  if [ -e "$GCC_HOST_COMPILER_PATH" ]; then
+    export GCC_HOST_COMPILER_PATH
+    write_action_env_to_bazelrc "GCC_HOST_COMPILER_PATH" "$GCC_HOST_COMPILER_PATH"
+    break
+  fi
+  echo "Invalid gcc path. ${GCC_HOST_COMPILER_PATH} cannot be found" 1>&2
+  if [ -z "$fromuser" ]; then
+    exit 1
+  fi
+  GCC_HOST_COMPILER_PATH=""
+  # Retry
+done
+
 # Find out where the cuDNN library is installed
 while true; do
   # Configure the cuDNN version to use.
@@ -418,7 +565,7 @@ while true; do
     if [ -z "$CUDNN_INSTALL_PATH" ]; then
       CUDNN_INSTALL_PATH=$default_cudnn_path
     fi
-    # Result returned from "read" will be used unexpanded. That make "~" unuseable.
+    # Result returned from "read" will be used unexpanded. That make "~" unusable.
     # Going through one more level of expansion to handle that.
     CUDNN_INSTALL_PATH=`"${PYTHON_BIN_PATH}" -c "import os; print(os.path.realpath(os.path.expanduser('${CUDNN_INSTALL_PATH}')))"`
   fi
@@ -547,6 +694,7 @@ while true; do
   fi
   if [ -e "$HOST_CXX_COMPILER" ]; then
     export HOST_CXX_COMPILER
+    write_action_env_to_bazelrc "HOST_CXX_COMPILER" "$HOST_CXX_COMPILER"
     break
   fi
   echo "Invalid C++ compiler path. ${HOST_CXX_COMPILER} cannot be found" 1>&2
@@ -570,6 +718,7 @@ while true; do
   fi
   if [ -e "$HOST_C_COMPILER" ]; then
     export HOST_C_COMPILER
+    write_action_env_to_bazelrc "HOST_C_COMPILER" "$HOST_C_COMPILER"
     break
   fi
   echo "Invalid C compiler path. ${HOST_C_COMPILER} cannot be found" 1>&2
@@ -600,6 +749,7 @@ while true; do
 
   if [ -e "${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH}" ]; then
     export COMPUTECPP_TOOLKIT_PATH
+    write_action_env_to_bazelrc "COMPUTECPP_TOOLKIT_PATH" "$COMPUTECPP_TOOLKIT_PATH"
     break
   fi
   echo "Invalid SYCL $TF_OPENCL_VERSION library path. ${COMPUTECPP_TOOLKIT_PATH}/${SYCL_RT_LIB_PATH} cannot be found"
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 6a70c0e4057ed35bcdb10157e4147de35546b6a9..54da5bf3fee8b03c1b0ed34890c193575e324617 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -14,9 +14,7 @@ exports_files([
 # Config setting for determining if we are building for Android.
 config_setting(
     name = "android",
-    values = {
-        "crosstool_top": "//external:android/crosstool",
-    },
+    values = {"crosstool_top": "//external:android/crosstool"},
     visibility = ["//visibility:public"],
 )
 
@@ -76,9 +74,7 @@ config_setting(
 
 config_setting(
     name = "ios",
-    values = {
-        "crosstool_top": "//tools/osx/crosstool:crosstool",
-    },
+    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
     visibility = ["//visibility:public"],
 )
 
@@ -88,6 +84,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "linux_ppc64le",
+    values = {"cpu": "ppc"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "debug",
     values = {
@@ -112,7 +114,7 @@ config_setting(
 
 # TODO(jhseu): Enable on other platforms other than Linux.
 config_setting(
-    name = "with_jemalloc",
+    name = "with_jemalloc_linux_x86_64",
     values = {
         "cpu": "k8",
         "define": "with_jemalloc=true",
@@ -120,6 +122,15 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_jemalloc_linux_ppc64le",
+    values = {
+        "cpu": "ppc",
+        "define": "with_jemalloc=true",
+    },
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "with_gcp_support",
     values = {"define": "with_gcp_support=true"},
@@ -138,6 +149,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "with_verbs_support",
+    values = {"define": "with_verbs_support=true"},
+    visibility = ["//visibility:public"],
+)
+
 package_group(
     name = "internal",
     packages = ["//tensorflow/..."],
@@ -185,7 +202,6 @@ filegroup(
         "//tensorflow/compiler/xla/client:all_files",
         "//tensorflow/compiler/xla/client/lib:all_files",
         "//tensorflow/compiler/xla/legacy_flags:all_files",
-        "//tensorflow/compiler/xla/port:all_files",
         "//tensorflow/compiler/xla/service:all_files",
         "//tensorflow/compiler/xla/service/cpu:all_files",
         "//tensorflow/compiler/xla/service/gpu:all_files",
@@ -196,18 +212,24 @@ filegroup(
         "//tensorflow/contrib:all_files",
         "//tensorflow/contrib/android:all_files",
         "//tensorflow/contrib/batching:all_files",
+        "//tensorflow/contrib/batching/kernels:all_files",
         "//tensorflow/contrib/batching/test_util:all_files",
         "//tensorflow/contrib/batching/util:all_files",
         "//tensorflow/contrib/bayesflow:all_files",
         "//tensorflow/contrib/boosted_trees:all_files",
         "//tensorflow/contrib/boosted_trees/lib:all_files",
         "//tensorflow/contrib/boosted_trees/proto:all_files",
+        "//tensorflow/contrib/boosted_trees/resources:all_files",
         "//tensorflow/contrib/cloud:all_files",
         "//tensorflow/contrib/cloud/kernels:all_files",
         "//tensorflow/contrib/compiler:all_files",
         "//tensorflow/contrib/copy_graph:all_files",
         "//tensorflow/contrib/crf:all_files",
         "//tensorflow/contrib/cudnn_rnn:all_files",
+        "//tensorflow/contrib/data:all_files",
+        "//tensorflow/contrib/data/python/framework:all_files",
+        "//tensorflow/contrib/data/python/kernel_tests:all_files",
+        "//tensorflow/contrib/data/python/ops:all_files",
         "//tensorflow/contrib/distributions:all_files",
         "//tensorflow/contrib/factorization:all_files",
         "//tensorflow/contrib/factorization/kernels:all_files",
@@ -217,6 +239,7 @@ filegroup(
         "//tensorflow/contrib/graph_editor:all_files",
         "//tensorflow/contrib/grid_rnn:all_files",
         "//tensorflow/contrib/hooks:all_files",
+        "//tensorflow/contrib/hvx/hvx_ops_support_checker:all_files",
         "//tensorflow/contrib/image:all_files",
         "//tensorflow/contrib/imperative:all_files",
         "//tensorflow/contrib/input_pipeline:all_files",
@@ -239,16 +262,20 @@ filegroup(
         "//tensorflow/contrib/opt:all_files",
         "//tensorflow/contrib/rnn:all_files",
         "//tensorflow/contrib/saved_model:all_files",
+        "//tensorflow/contrib/saved_model/cc/saved_model:all_files",
         "//tensorflow/contrib/seq2seq:all_files",
         "//tensorflow/contrib/session_bundle:all_files",
         "//tensorflow/contrib/session_bundle/example:all_files",
+        "//tensorflow/contrib/signal:all_files",
         "//tensorflow/contrib/slim:all_files",
         "//tensorflow/contrib/slim/python/slim/data:all_files",
         "//tensorflow/contrib/slim/python/slim/nets:all_files",
         "//tensorflow/contrib/solvers:all_files",
         "//tensorflow/contrib/sparsemax:all_files",
         "//tensorflow/contrib/specs:all_files",
+        "//tensorflow/contrib/staging:all_files",
         "//tensorflow/contrib/stat_summarizer:all_files",
+        "//tensorflow/contrib/stateless:all_files",
         "//tensorflow/contrib/tensor_forest:all_files",
         "//tensorflow/contrib/tensor_forest/hybrid:all_files",
         "//tensorflow/contrib/tensorboard:all_files",
@@ -256,6 +283,8 @@ filegroup(
         "//tensorflow/contrib/tfprof/python/tools/tfprof:all_files",
         "//tensorflow/contrib/training:all_files",
         "//tensorflow/contrib/util:all_files",
+        "//tensorflow/contrib/verbs:all_files",
+        "//tensorflow/contrib/xla_tf_graph:all_files",
         "//tensorflow/core:all_files",
         "//tensorflow/core/debug:all_files",
         "//tensorflow/core/distributed_runtime:all_files",
@@ -265,8 +294,10 @@ filegroup(
         "//tensorflow/core/grappler/costs:all_files",
         "//tensorflow/core/grappler/inputs:all_files",
         "//tensorflow/core/grappler/optimizers:all_files",
+        "//tensorflow/core/grappler/utils:all_files",
         "//tensorflow/core/kernels:all_files",
         "//tensorflow/core/kernels/hexagon:all_files",
+        "//tensorflow/core/kernels/neon:all_files",
         "//tensorflow/core/ops/compat:all_files",
         "//tensorflow/core/platform/cloud:all_files",
         "//tensorflow/core/platform/default/build_config:all_files",
@@ -274,6 +305,7 @@ filegroup(
         "//tensorflow/core/util/ctc:all_files",
         "//tensorflow/core/util/tensor_bundle:all_files",
         "//tensorflow/examples/android:all_files",
+        "//tensorflow/examples/benchmark:all_files",
         "//tensorflow/examples/how_tos/reading_data:all_files",
         "//tensorflow/examples/image_retraining:all_files",
         "//tensorflow/examples/label_image:all_files",
@@ -282,6 +314,7 @@ filegroup(
         "//tensorflow/examples/tutorials/estimators:all_files",
         "//tensorflow/examples/tutorials/mnist:all_files",
         "//tensorflow/examples/tutorials/word2vec:all_files",
+        "//tensorflow/examples/wav_to_spectrogram:all_files",
         "//tensorflow/go:all_files",
         "//tensorflow/java:all_files",
         "//tensorflow/java/src/main/java/org/tensorflow/examples:all_files",
@@ -289,27 +322,67 @@ filegroup(
         "//tensorflow/python:all_files",
         "//tensorflow/python/debug:all_files",
         "//tensorflow/python/estimator:all_files",
+        "//tensorflow/python/feature_column:all_files",
         "//tensorflow/python/kernel_tests:all_files",
+        "//tensorflow/python/kernel_tests/distributions:all_files",
+        "//tensorflow/python/ops/distributions:all_files",
         "//tensorflow/python/saved_model:all_files",
         "//tensorflow/python/tools:all_files",
         "//tensorflow/tensorboard:all_files",
-        "//tensorflow/tensorboard/app:all_files",
         "//tensorflow/tensorboard/backend:all_files",
         "//tensorflow/tensorboard/backend/event_processing:all_files",
         "//tensorflow/tensorboard/components:all_files",
-        "//tensorflow/tensorboard/components/tf_text_dashboard:all_files",
-        "//tensorflow/tensorboard/components/vz_data_summary:all_files",
-        "//tensorflow/tensorboard/components/vz_line_chart:all_files",
-        "//tensorflow/tensorboard/components/vz_line_chart/demo:all_files",
-        "//tensorflow/tensorboard/components/vz_projector:all_files",
-        "//tensorflow/tensorboard/components/vz_sorting:all_files",
-        "//tensorflow/tensorboard/components/vz_sorting/test:all_files",
-        "//tensorflow/tensorboard/lib:all_files",
+        "//tensorflow/tensorboard/components/tf_audio_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_app_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_app_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_info_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_info_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_image_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_option_selector_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo:all_files",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/tf_tensorboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/tf_text_dashboard_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_distribution_chart_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_heatmap_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_line_chart_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_projector_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_projector_d3v4/test:all_files",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:all_files",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4/test:all_files",
+        "//tensorflow/tensorboard/demo:all_files",
+        "//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:all_files",
         "//tensorflow/tensorboard/plugins:all_files",
-        "//tensorflow/tensorboard/plugins/debugger:all_files",
         "//tensorflow/tensorboard/plugins/projector:all_files",
         "//tensorflow/tensorboard/plugins/text:all_files",
         "//tensorflow/tensorboard/scripts:all_files",
+        "//tensorflow/tools/api/golden:all_files",
+        "//tensorflow/tools/api/lib:all_files",
+        "//tensorflow/tools/api/tests:all_files",
         "//tensorflow/tools/common:all_files",
         "//tensorflow/tools/compatibility:all_files",
         "//tensorflow/tools/dist_test/server:all_files",
@@ -344,14 +417,34 @@ filegroup(
     ),
 )
 
+filegroup(
+    name = "docs_src",
+    data = glob(["docs_src/**/*.md"]),
+)
+
 # -------------------------------------------
 # New rules should be added above this target.
 # -------------------------------------------
 cc_binary(
     name = "libtensorflow.so",
+    linkopts = select({
+        "//tensorflow:darwin": [
+            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
+            "//tensorflow/c:exported_symbols.lds",
+        ],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "-z defs",
+            "-s",
+            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
+            "//tensorflow/c:version_script.lds",
+        ],
+    }),
     linkshared = 1,
     deps = [
         "//tensorflow/c:c_api",
+        "//tensorflow/c:exported_symbols.lds",
+        "//tensorflow/c:version_script.lds",
         "//tensorflow/core:tensorflow",
     ],
 )
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 0bca6f8fb8051925908db5e86f30d97d534e60f4..083634bd7964b0c12e10a1f3c71be5eab597a6c4 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -24,19 +24,9 @@ from __future__ import print_function
 from tensorflow.python import *
 # pylint: enable=wildcard-import
 
-# Lazily import the `tf.contrib` module. This avoids loading all of the
-# dependencies of `tf.contrib` at `import tensorflow` time.
-class _LazyContribLoader(object):
-
-  def __getattr__(self, item):
-    global contrib
-    # Replace the lazy loader with the imported module itself.
-    import importlib  # pylint: disable=g-import-not-at-top
-    contrib = importlib.import_module('tensorflow.contrib')
-    return getattr(contrib, item)
-
-
-contrib = _LazyContribLoader()
+from tensorflow.python.util.lazy_loader import LazyLoader
+contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
+del LazyLoader
 
 del absolute_import
 del division
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 0019dfeeb13f5e591d44dd37d73a93ce64a92d95..3ab4e8efcdb5b05cf8922edd302e7cbf3a3597f1 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -26,6 +26,22 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
+tf_cuda_library(
+    name = "c_api_internal",
+    srcs = ["c_api.h"],
+    hdrs = ["c_api_internal.h"],
+    deps = select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:lib",
+        ],
+    }),
+)
+
 tf_cuda_library(
     name = "c_api",
     srcs = ["c_api.cc"],
@@ -34,10 +50,16 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = select({
         "//tensorflow:android": [
+            ":c_api_internal",
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
+            ":c_api_internal",
             "//tensorflow/cc/saved_model:loader",
+            "//tensorflow/cc:gradients",
+            "//tensorflow/cc:ops",
+            "//tensorflow/cc:grad_ops",
+            "//tensorflow/cc:scope_internal",
             "//tensorflow/core:core_cpu",
             "//tensorflow/core:framework",
             "//tensorflow/core:lib",
@@ -45,6 +67,14 @@ tf_cuda_library(
     }),
 )
 
+exports_files(
+    [
+        "version_script.lds",
+        "exported_symbols.lds",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 tf_cuda_library(
     name = "tf_status_helper",
     srcs = ["tf_status_helper.cc"],
@@ -89,21 +119,22 @@ tf_cc_test(
     # linkstatic = tf_kernel_tests_linkstatic(),
     deps = [
         ":c_api",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:grad_ops",
         "//tensorflow/cc/saved_model:signature_constants",
         "//tensorflow/cc/saved_model:tag_constants",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
         "//tensorflow/core/kernels:array",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:math",
-        "//third_party/eigen3",
     ],
 )
 
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index d4bcc01b6b89329ad8149e2e98ac2df5d1c15882..f4775783f9f88c941445b62603c92cae00d34715 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -21,8 +21,12 @@ limitations under the License.
 #include <vector>
 
 #ifndef __ANDROID__
+#include "tensorflow/cc/framework/gradients.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope_internal.h"
 #include "tensorflow/cc/saved_model/loader.h"
 #endif
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -93,9 +97,6 @@ size_t TF_DataTypeSize(TF_DataType dt) {
 }
 
 // --------------------------------------------------------------------------
-struct TF_Status {
-  Status status;
-};
 
 TF_Status* TF_NewStatus() { return new TF_Status; }
 
@@ -179,12 +180,6 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
 
 }  // namespace
 
-struct TF_Tensor {
-  TF_DataType dtype;
-  TensorShape shape;
-  TensorBuffer* buffer;
-};
-
 TF_Tensor* TF_AllocateTensor(TF_DataType dtype, const int64_t* dims,
                              int num_dims, size_t len) {
   void* data = allocate_tensor("TF_AllocateTensor", len);
@@ -220,6 +215,18 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
   return new TF_Tensor{dtype, TensorShape(dimvec), buf};
 }
 
+TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor) {
+  // It is safe to move the Tensor if and only if we own the unique reference to
+  // it. In that case, we might as well not delete and reallocate, but a future
+  // implementation might need to do so.
+  if (tensor->buffer->RefCountIsOne() &&
+      tensor->buffer->root_buffer()->RefCountIsOne() &&
+      tensor->buffer->OwnsMemory()) {
+    return tensor;
+  }
+  return nullptr;
+}
+
 void TF_DeleteTensor(TF_Tensor* t) {
   t->buffer->Unref();
   delete t;
@@ -277,9 +284,6 @@ size_t TF_StringEncodedSize(size_t len) {
 }
 
 // --------------------------------------------------------------------------
-struct TF_SessionOptions {
-  SessionOptions options;
-};
 TF_SessionOptions* TF_NewSessionOptions() { return new TF_SessionOptions; }
 void TF_DeleteSessionOptions(TF_SessionOptions* opt) { delete opt; }
 
@@ -320,9 +324,6 @@ void TF_DeleteBuffer(TF_Buffer* buffer) {
 TF_Buffer TF_GetBuffer(TF_Buffer* buffer) { return *buffer; }
 
 // --------------------------------------------------------------------------
-struct TF_DeprecatedSession {
-  Session* session;
-};
 
 TF_DeprecatedSession* TF_NewDeprecatedSession(const TF_SessionOptions* opt,
                                               TF_Status* status) {
@@ -654,6 +655,7 @@ void TF_PRunSetup(TF_DeprecatedSession* s,
     memcpy(buf, new_handle.c_str(), new_handle.size() + 1);
     *handle = buf;
   } else {
+    *handle = nullptr;
     status->status = result;
   }
 }
@@ -685,11 +687,6 @@ void TF_PRun(TF_DeprecatedSession* s, const char* handle,
                 c_outputs, target_oper_names, nullptr, status);
 }
 
-struct TF_Library {
-  void* lib_handle;
-  TF_Buffer op_list;
-};
-
 TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) {
   TF_Library* lib_handle = new TF_Library;
   status->status = tensorflow::LoadLibrary(
@@ -726,66 +723,6 @@ TF_Buffer* TF_GetAllOpList() {
 // --------------------------------------------------------------------------
 // New Graph and Session API
 
-// Structures -----------------------------------------------------------------
-
-extern "C" {
-
-struct TF_Graph {
-  TF_Graph()
-      : graph(OpRegistry::Global()),
-        refiner(graph.versions().producer(), graph.op_registry()),
-        num_sessions(0),
-        delete_requested(false),
-        parent(nullptr),
-        parent_inputs(nullptr) {}
-  mutex mu;
-  Graph graph GUARDED_BY(mu);
-
-  // Runs shape inference.
-  tensorflow::ShapeRefiner refiner GUARDED_BY(mu);
-
-  // Maps from name of an operation to the Node* in 'graph'.
-  std::unordered_map<tensorflow::string, Node*> name_map GUARDED_BY(mu);
-
-  // TF_Graph may only / must be deleted when
-  //   num_sessions == 0 && delete_requested == true
-
-  // num_sessions incremented by TF_NewSession, and decremented by
-  // TF_DeleteSession.
-  int num_sessions GUARDED_BY(mu);
-  bool delete_requested GUARDED_BY(mu);  // set true by TF_DeleteGraph
-
-  // Used to link graphs contained in TF_WhileParams to the parent graph that
-  // will eventually contain the full while loop.
-  TF_Graph* parent;
-  TF_Output* parent_inputs;
-};
-
-struct TF_OperationDescription {
-  TF_OperationDescription(TF_Graph* g, const char* op_type,
-                          const char* node_name)
-      : node_builder(node_name, op_type, g->graph.op_registry()), graph(g) {}
-
-  NodeBuilder node_builder;
-  TF_Graph* graph;
-  std::vector<tensorflow::string> colocation_constraints;
-};
-
-struct TF_Operation {
-  Node node;
-};
-
-struct TF_Session {
-  TF_Session(Session* s, TF_Graph* g)
-      : session(s), graph(g), last_num_graph_nodes(0) {}
-  Session* session;
-  TF_Graph* graph;
-  mutex mu;
-  int last_num_graph_nodes;
-};
-
-}  // end extern "C"
-
 // Helper functions -----------------------------------------------------------
 
 namespace {
@@ -801,8 +738,7 @@ tensorflow::string OutputName(const TF_Output& output) {
 const tensorflow::AttrValue* GetAttrValue(TF_Operation* oper,
                                           const char* attr_name,
                                           TF_Status* status) {
-  const tensorflow::AttrValue* attr =
-      tensorflow::AttrSlice(oper->node.def()).Find(attr_name);
+  const tensorflow::AttrValue* attr = oper->node.attrs().Find(attr_name);
   if (attr == nullptr) {
     status->status =
         InvalidArgument("Operation has no attr named '", attr_name, "'.");
@@ -1164,14 +1100,14 @@ static TF_Operation* TF_FinishOperationLocked(TF_OperationDescription* desc,
 
     if (status->status.ok()) {
       // Run shape inference function for newly added node.
-      //
-      // TODO(b/28152992): Enable returning the result of this
-      // code-path once we have converted all python shape functions
-      // to call their C++ versions.
-      desc->graph->refiner.AddNode(ret).IgnoreError();
-
+      status->status = desc->graph->refiner.AddNode(ret);
+    }
+    if (status->status.ok()) {
       // Add the node to the name-to-node mapping.
       desc->graph->name_map[ret->name()] = ret;
+    } else if (ret != nullptr) {
+      desc->graph->graph.RemoveNode(ret);
+      ret = nullptr;
     }
   }
 
@@ -1198,7 +1134,7 @@ const char* TF_OperationOpType(TF_Operation* oper) {
 }
 
 const char* TF_OperationDevice(TF_Operation* oper) {
-  return oper->node.def().device().c_str();
+  return oper->node.requested_device().c_str();
 }
 
 int TF_OperationNumOutputs(TF_Operation* oper) {
@@ -1213,8 +1149,8 @@ TF_DataType TF_OperationOutputType(TF_Output oper_out) {
 int TF_OperationOutputListLength(TF_Operation* oper, const char* arg_name,
                                  TF_Status* status) {
   NameRangeMap name_ranges;
-  status->status = NameRangesForNode(oper->node.def(), oper->node.op_def(),
-                                     nullptr, &name_ranges);
+  status->status =
+      NameRangesForNode(oper->node, oper->node.op_def(), nullptr, &name_ranges);
   if (!status->status.ok()) return -1;
   auto iter = name_ranges.find(arg_name);
   if (iter == name_ranges.end()) {
@@ -1235,8 +1171,8 @@ TF_DataType TF_OperationInputType(TF_Input oper_in) {
 int TF_OperationInputListLength(TF_Operation* oper, const char* arg_name,
                                 TF_Status* status) {
   NameRangeMap name_ranges;
-  status->status = NameRangesForNode(oper->node.def(), oper->node.op_def(),
-                                     &name_ranges, nullptr);
+  status->status =
+      NameRangesForNode(oper->node, oper->node.op_def(), &name_ranges, nullptr);
   if (!status->status.ok()) return -1;
   auto iter = name_ranges.find(arg_name);
   if (iter == name_ranges.end()) {
@@ -1474,26 +1410,27 @@ void TF_OperationGetAttrStringList(TF_Operation* oper, const char* attr_name,
   }
 }
 
-#define DEFINE_GETATTR(func, c_type, cpp_type, list_field)                     \
-  void func(TF_Operation* oper, const char* attr_name, c_type* value,          \
-            TF_Status* status) {                                               \
-    cpp_type v;                                                                \
-    status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &v); \
-    *value = static_cast<c_type>(v);                                           \
-  }                                                                            \
-  void func##List(TF_Operation* oper, const char* attr_name, c_type* values,   \
-                  int max_values, TF_Status* status) {                         \
-    const auto* attr = GetAttrValue(oper, attr_name, status);                  \
-    if (!status->status.ok()) return;                                          \
-    if (attr->value_case() != tensorflow::AttrValue::kList) {                  \
-      status->status =                                                         \
-          InvalidArgument("Value for '", attr_name, "' is not a list.");       \
-      return;                                                                  \
-    }                                                                          \
-    const auto len = std::min(max_values, attr->list().list_field##_size());   \
-    for (int i = 0; i < len; ++i) {                                            \
-      values[i] = static_cast<c_type>(attr->list().list_field(i));             \
-    }                                                                          \
+#define DEFINE_GETATTR(func, c_type, cpp_type, list_field)                   \
+  void func(TF_Operation* oper, const char* attr_name, c_type* value,        \
+            TF_Status* status) {                                             \
+    cpp_type v;                                                              \
+    status->status =                                                         \
+        tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &v);          \
+    *value = static_cast<c_type>(v);                                         \
+  }                                                                          \
+  void func##List(TF_Operation* oper, const char* attr_name, c_type* values, \
+                  int max_values, TF_Status* status) {                       \
+    const auto* attr = GetAttrValue(oper, attr_name, status);                \
+    if (!status->status.ok()) return;                                        \
+    if (attr->value_case() != tensorflow::AttrValue::kList) {                \
+      status->status =                                                       \
+          InvalidArgument("Value for '", attr_name, "' is not a list.");     \
+      return;                                                                \
+    }                                                                        \
+    const auto len = std::min(max_values, attr->list().list_field##_size()); \
+    for (int i = 0; i < len; ++i) {                                          \
+      values[i] = static_cast<c_type>(attr->list().list_field(i));           \
+    }                                                                        \
   }
 DEFINE_GETATTR(TF_OperationGetAttrInt, int64_t, tensorflow::int64, i);
 DEFINE_GETATTR(TF_OperationGetAttrFloat, float, float, f);
@@ -1504,7 +1441,8 @@ DEFINE_GETATTR(TF_OperationGetAttrType, TF_DataType, DataType, type);
 void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name,
                               int64_t* value, int num_dims, TF_Status* status) {
   PartialTensorShape shape;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &shape);
+  status->status =
+      tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shape);
   if (!status->status.ok()) return;
   auto len = std::min(shape.dims(), num_dims);
   for (int i = 0; i < len; ++i) {
@@ -1518,7 +1456,7 @@ void TF_OperationGetAttrShapeList(TF_Operation* oper, const char* attr_name,
                                   int storage_size, TF_Status* status) {
   std::vector<PartialTensorShape> shapes;
   status->status =
-      tensorflow::GetNodeAttr(oper->node.def(), attr_name, &shapes);
+      tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &shapes);
   if (!status->status.ok()) return;
   auto len = std::min(static_cast<int>(shapes.size()), max_values);
   int64_t* p = storage;
@@ -1585,7 +1523,7 @@ void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name,
                                TF_Tensor** value, TF_Status* status) {
   *value = nullptr;
   Tensor t;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &t);
+  status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &t);
   if (!status->status.ok()) return;
   *value = new TF_Tensor{static_cast<TF_DataType>(t.dtype()), t.shape(),
                          tensorflow::TensorCApi::Buffer(t)};
@@ -1596,7 +1534,7 @@ void TF_OperationGetAttrTensorList(TF_Operation* oper, const char* attr_name,
                                    TF_Tensor** values, int max_values,
                                    TF_Status* status) {
   std::vector<Tensor> ts;
-  status->status = tensorflow::GetNodeAttr(oper->node.def(), attr_name, &ts);
+  status->status = tensorflow::GetNodeAttr(oper->node.attrs(), attr_name, &ts);
   if (!status->status.ok()) return;
   const auto len = std::min(max_values, static_cast<int>(ts.size()));
   for (int i = 0; i < len; ++i) {
@@ -1675,10 +1613,6 @@ void TF_GraphToGraphDef(TF_Graph* graph, TF_Buffer* output_graph_def,
   status->status = MessageToBuffer(def, output_graph_def);
 }
 
-struct TF_ImportGraphDefOptions {
-  tensorflow::ImportGraphDefOptions opts;
-};
-
 TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions() {
   return new TF_ImportGraphDefOptions;
 }
@@ -2101,6 +2035,75 @@ void TF_FinishWhile(const TF_WhileParams* params, TF_Status* status,
 
 void TF_AbortWhile(const TF_WhileParams* params) { FreeWhileResources(params); }
 
+#ifndef __ANDROID__
+namespace {
+
+void OutputsFromTFOutputs(TF_Output* tf_outputs, int n, TF_Status* status,
+                          std::vector<tensorflow::Output>* outputs) {
+  outputs->resize(n);
+  for (int i = 0; i < n; i++) {
+    const TF_Output& tf_output = tf_outputs[i];
+    (*outputs)[i] = tensorflow::Output(&tf_output.oper->node, tf_output.index);
+  }
+}
+
+void TFOutputsFromOutputs(const std::vector<tensorflow::Output>& outputs,
+                          TF_Output* tf_outputs) {
+  for (int i = 0; i < outputs.size(); i++) {
+    tf_outputs[i].oper = ToOperation(outputs[i].node());
+    tf_outputs[i].index = outputs[i].index();
+  }
+}
+
+}  // namespace
+#endif  // __ANDROID__
+
+void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx,
+                     TF_Output* dx, TF_Status* status, TF_Output* dy) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Adding gradients is not supported in Android. File a bug at "
+      "https://github.com/tensorflow/tensorflow/issues if this feature is "
+      "important to you");
+#else
+  std::vector<tensorflow::Output> y_arg;
+  std::vector<tensorflow::Output> x_arg;
+  std::vector<tensorflow::Output> dy_arg;
+  OutputsFromTFOutputs(y, ny, status, &y_arg);
+  OutputsFromTFOutputs(x, nx, status, &x_arg);
+
+  {
+    // We need to hold on to the lock while we have a scope that uses TF_Graph.
+    mutex_lock graph_lock(g->mu);
+
+    const int max_node_id_before = g->graph.num_node_ids();
+
+    tensorflow::Scope scope =
+        NewInternalScope(&g->graph, &status->status, &g->refiner);
+
+    if (dx != nullptr) {
+      std::vector<tensorflow::Output> dx_arg;
+      OutputsFromTFOutputs(dx, ny, status, &dx_arg);
+      status->status =
+          AddSymbolicGradients(scope, y_arg, x_arg, dx_arg, &dy_arg);
+    } else {
+      status->status = AddSymbolicGradients(scope, y_arg, x_arg, &dy_arg);
+    }
+
+    // Update g->name_map with the name_map from the scope, which will contain
+    // the new gradient ops.
+    for (int i = max_node_id_before; i < g->graph.num_node_ids(); ++i) {
+      Node* n = g->graph.FindNodeId(i);
+      if (n == nullptr) continue;
+      g->name_map[n->name()] = n;
+    }
+  }
+
+  // Unpack the results from grad_outputs_arg.
+  TFOutputsFromOutputs(dy_arg, dy);
+#endif  // __ANDROID__
+}
+
 // TF_Session functions ----------------------------------------------
 
 TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index f837b68d76c34ba836720df820daaae5bc29c93c..ec9b01b388d1138644e28e3206e32726347b3d5e 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -64,6 +64,25 @@ limitations under the License.
 //   and the API just provides high level controls over the number of
 //   devices of each type.
 
+// Macro to control visibility of exported symbols in the shared library (.so,
+// .dylib, .dll).
+// This duplicates the TF_EXPORT macro definition in
+// tensorflow/core/platform/macros.h in order to keep this .h file independent
+// of any other includes.$a
+#ifdef SWIG
+#define TF_CAPI_EXPORT
+#else
+#if defined(COMPILER_MSVC)
+#ifdef TF_COMPILE_LIBRARY
+#define TF_CAPI_EXPORT __declspec(dllexport)
+#else
+#define TF_CAPI_EXPORT __declspec(dllimport)
+#endif  // TF_COMPILE_LIBRARY
+#else
+#define TF_CAPI_EXPORT __attribute__((visibility("default")))
+#endif  // COMPILER_MSVC
+#endif  // SWIG
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -71,12 +90,12 @@ extern "C" {
 // --------------------------------------------------------------------------
 // TF_Version returns a string describing version information of the
 // TensorFlow library. TensorFlow using semantic versioning.
-extern const char* TF_Version();
+TF_CAPI_EXPORT extern const char* TF_Version();
 
 // --------------------------------------------------------------------------
 // TF_DataType holds the type for a scalar value.  E.g., one slot in a tensor.
 // The enum values here are identical to corresponding values in types.proto.
-typedef enum {
+typedef enum TF_DataType {
   TF_FLOAT = 1,
   TF_DOUBLE = 2,
   TF_INT32 = 3,  // Int32 tensors are always in 'host' memory.
@@ -103,12 +122,12 @@ typedef enum {
 // TF_DataTypeSize returns the sizeof() for the underlying type corresponding
 // to the given TF_DataType enum value. Returns 0 for variable length types
 // (eg. TF_STRING) or on failure.
-extern size_t TF_DataTypeSize(TF_DataType dt);
+TF_CAPI_EXPORT extern size_t TF_DataTypeSize(TF_DataType dt);
 
 // --------------------------------------------------------------------------
 // TF_Code holds an error code.  The enum values here are identical to
 // corresponding values in error_codes.proto.
-typedef enum {
+typedef enum TF_Code {
   TF_OK = 0,
   TF_CANCELLED = 1,
   TF_UNKNOWN = 2,
@@ -134,23 +153,24 @@ typedef enum {
 typedef struct TF_Status TF_Status;
 
 // Return a new status object.
-extern TF_Status* TF_NewStatus();
+TF_CAPI_EXPORT extern TF_Status* TF_NewStatus();
 
 // Delete a previously created status object.
-extern void TF_DeleteStatus(TF_Status*);
+TF_CAPI_EXPORT extern void TF_DeleteStatus(TF_Status*);
 
 // Record <code, msg> in *s.  Any previous information is lost.
 // A common use is to clear a status: TF_SetStatus(s, TF_OK, "");
-extern void TF_SetStatus(TF_Status* s, TF_Code code, const char* msg);
+TF_CAPI_EXPORT extern void TF_SetStatus(TF_Status* s, TF_Code code,
+                                        const char* msg);
 
 // Return the code record in *s.
-extern TF_Code TF_GetCode(const TF_Status* s);
+TF_CAPI_EXPORT extern TF_Code TF_GetCode(const TF_Status* s);
 
 // Return a pointer to the (null-terminated) error message in *s.  The
 // return value points to memory that is only usable until the next
 // mutation to *s.  Always returns an empty string if TF_GetCode(s) is
 // TF_OK.
-extern const char* TF_Message(const TF_Status* s);
+TF_CAPI_EXPORT extern const char* TF_Message(const TF_Status* s);
 
 // --------------------------------------------------------------------------
 // TF_Buffer holds a pointer to a block of data and its associated length.
@@ -168,14 +188,15 @@ typedef struct TF_Buffer {
 
 // Makes a copy of the input and sets an appropriate deallocator.  Useful for
 // passing in read-only, input protobufs.
-extern TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len);
+TF_CAPI_EXPORT extern TF_Buffer* TF_NewBufferFromString(const void* proto,
+                                                        size_t proto_len);
 
 // Useful for passing *out* a protobuf.
-extern TF_Buffer* TF_NewBuffer();
+TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer();
 
-extern void TF_DeleteBuffer(TF_Buffer*);
+TF_CAPI_EXPORT extern void TF_DeleteBuffer(TF_Buffer*);
 
-extern TF_Buffer TF_GetBuffer(TF_Buffer* buffer);
+TF_CAPI_EXPORT extern TF_Buffer TF_GetBuffer(TF_Buffer* buffer);
 
 // --------------------------------------------------------------------------
 // TF_Tensor holds a multi-dimensional array of elements of a single data type.
@@ -202,11 +223,10 @@ typedef struct TF_Tensor TF_Tensor;
 //      (*deallocator)(data, len, deallocator_arg)
 // Clients must provide a custom deallocator function so they can pass in
 // memory managed by something like numpy.
-extern TF_Tensor* TF_NewTensor(TF_DataType, const int64_t* dims, int num_dims,
-                               void* data, size_t len,
-                               void (*deallocator)(void* data, size_t len,
-                                                   void* arg),
-                               void* deallocator_arg);
+TF_CAPI_EXPORT extern TF_Tensor* TF_NewTensor(
+    TF_DataType, const int64_t* dims, int num_dims, void* data, size_t len,
+    void (*deallocator)(void* data, size_t len, void* arg),
+    void* deallocator_arg);
 
 // Allocate and return a new Tensor.
 //
@@ -217,27 +237,32 @@ extern TF_Tensor* TF_NewTensor(TF_DataType, const int64_t* dims, int num_dims,
 //
 // The caller must set the Tensor values by writing them to the pointer returned
 // by TF_TensorData with length TF_TensorByteSize.
-extern TF_Tensor* TF_AllocateTensor(TF_DataType, const int64_t* dims,
-                                    int num_dims, size_t len);
+TF_CAPI_EXPORT extern TF_Tensor* TF_AllocateTensor(TF_DataType,
+                                                   const int64_t* dims,
+                                                   int num_dims, size_t len);
+
+// Deletes `tensor` and returns a new TF_Tensor with the same content if
+// possible. Returns nullptr and leaves `tensor` untouched if not.
+TF_CAPI_EXPORT extern TF_Tensor* TF_TensorMaybeMove(TF_Tensor* tensor);
 
 // Destroy a tensor.
-extern void TF_DeleteTensor(TF_Tensor*);
+TF_CAPI_EXPORT extern void TF_DeleteTensor(TF_Tensor*);
 
 // Return the type of a tensor element.
-extern TF_DataType TF_TensorType(const TF_Tensor*);
+TF_CAPI_EXPORT extern TF_DataType TF_TensorType(const TF_Tensor*);
 
 // Return the number of dimensions that the tensor has.
-extern int TF_NumDims(const TF_Tensor*);
+TF_CAPI_EXPORT extern int TF_NumDims(const TF_Tensor*);
 
 // Return the length of the tensor in the "dim_index" dimension.
 // REQUIRES: 0 <= dim_index < TF_NumDims(tensor)
-extern int64_t TF_Dim(const TF_Tensor* tensor, int dim_index);
+TF_CAPI_EXPORT extern int64_t TF_Dim(const TF_Tensor* tensor, int dim_index);
 
 // Return the size of the underlying data in bytes.
-extern size_t TF_TensorByteSize(const TF_Tensor*);
+TF_CAPI_EXPORT extern size_t TF_TensorByteSize(const TF_Tensor*);
 
 // Return a pointer to the underlying data buffer.
-extern void* TF_TensorData(const TF_Tensor*);
+TF_CAPI_EXPORT extern void* TF_TensorData(const TF_Tensor*);
 
 // --------------------------------------------------------------------------
 // Encode the string `src` (`src_len` bytes long) into `dst` in the format
@@ -247,8 +272,9 @@ extern void* TF_TensorData(const TF_Tensor*);
 //
 // On success returns the size in bytes of the encoded string.
 // Returns an error into `status` otherwise.
-extern size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
-                              size_t dst_len, TF_Status* status);
+TF_CAPI_EXPORT extern size_t TF_StringEncode(const char* src, size_t src_len,
+                                             char* dst, size_t dst_len,
+                                             TF_Status* status);
 
 // Decode a string encoded using TF_StringEncode.
 //
@@ -258,19 +284,20 @@ extern size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
 // `*dst` and `*dst_len` are undefined and an error is set in `status`.
 //
 // Does not read memory more than `src_len` bytes beyond `src`.
-extern size_t TF_StringDecode(const char* src, size_t src_len, const char** dst,
-                              size_t* dst_len, TF_Status* status);
+TF_CAPI_EXPORT extern size_t TF_StringDecode(const char* src, size_t src_len,
+                                             const char** dst, size_t* dst_len,
+                                             TF_Status* status);
 
 // Return the size in bytes required to encode a string `len` bytes long into a
 // TF_STRING tensor.
-extern size_t TF_StringEncodedSize(size_t len);
+TF_CAPI_EXPORT extern size_t TF_StringEncodedSize(size_t len);
 
 // --------------------------------------------------------------------------
 // TF_SessionOptions holds options that can be passed during session creation.
 typedef struct TF_SessionOptions TF_SessionOptions;
 
 // Return a new options object.
-extern TF_SessionOptions* TF_NewSessionOptions();
+TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions();
 
 // Set the target in TF_SessionOptions.options.
 // target can be empty, a single entry, or a comma separated list of entries.
@@ -278,17 +305,19 @@ extern TF_SessionOptions* TF_NewSessionOptions();
 // "local"
 // ip:port
 // host:port
-extern void TF_SetTarget(TF_SessionOptions* options, const char* target);
+TF_CAPI_EXPORT extern void TF_SetTarget(TF_SessionOptions* options,
+                                        const char* target);
 
 // Set the config in TF_SessionOptions.options.
 // config should be a serialized tensorflow.ConfigProto proto.
 // If config was not parsed successfully as a ConfigProto, record the
 // error information in *status.
-extern void TF_SetConfig(TF_SessionOptions* options, const void* proto,
-                         size_t proto_len, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_SetConfig(TF_SessionOptions* options,
+                                        const void* proto, size_t proto_len,
+                                        TF_Status* status);
 
 // Destroy an options object.
-extern void TF_DeleteSessionOptions(TF_SessionOptions*);
+TF_CAPI_EXPORT extern void TF_DeleteSessionOptions(TF_SessionOptions*);
 
 // TODO(jeff,sanjay):
 // - export functions to set Config fields
@@ -301,11 +330,11 @@ extern void TF_DeleteSessionOptions(TF_SessionOptions*);
 typedef struct TF_Graph TF_Graph;
 
 // Return a new graph object.
-extern TF_Graph* TF_NewGraph();
+TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph();
 
 // Destroy an options object.  Graph will be deleted once no more
 // TFSession's are referencing it.
-extern void TF_DeleteGraph(TF_Graph*);
+TF_CAPI_EXPORT extern void TF_DeleteGraph(TF_Graph*);
 
 // Operation being built. The underlying graph must outlive this.
 typedef struct TF_OperationDescription TF_OperationDescription;
@@ -343,9 +372,11 @@ typedef struct TF_Output {
 //   * `output` is not in `graph`.
 //   * An invalid shape is being set (e.g., the shape being set
 //     is incompatible with the existing shape).
-extern void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
-                                   const int64_t* dims, const int num_dims,
-                                   TF_Status* status);
+TF_CAPI_EXPORT extern void TF_GraphSetTensorShape(TF_Graph* graph,
+                                                  TF_Output output,
+                                                  const int64_t* dims,
+                                                  const int num_dims,
+                                                  TF_Status* status);
 
 // Returns the number of dimensions of the Tensor referenced by `output`
 // in `graph`.
@@ -354,8 +385,9 @@ extern void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
 //
 // Returns an error into `status` if:
 //   * `output` is not in `graph`.
-extern int TF_GraphGetTensorNumDims(TF_Graph* graph, TF_Output output,
-                                    TF_Status* status);
+TF_CAPI_EXPORT extern int TF_GraphGetTensorNumDims(TF_Graph* graph,
+                                                   TF_Output output,
+                                                   TF_Status* status);
 
 // Returns the shape of the Tensor referenced by `output` in `graph`
 // into `dims`. `dims` must be an array large enough to hold `num_dims`
@@ -369,20 +401,21 @@ extern int TF_GraphGetTensorNumDims(TF_Graph* graph, TF_Output output,
 // Returns an error into `status` if:
 //   * `output` is not in `graph`.
 //   * `num_dims` does not match the actual number of dimensions.
-extern void TF_GraphGetTensorShape(TF_Graph* graph, TF_Output output,
-                                   int64_t* dims, int num_dims,
-                                   TF_Status* status);
+TF_CAPI_EXPORT extern void TF_GraphGetTensorShape(TF_Graph* graph,
+                                                  TF_Output output,
+                                                  int64_t* dims, int num_dims,
+                                                  TF_Status* status);
 
 // Operation will only be added to *graph when TF_FinishOperation() is
 // called (assuming TF_FinishOperation() does not return an error).
 // *graph must not be deleted until after TF_FinishOperation() is
 // called.
-extern TF_OperationDescription* TF_NewOperation(TF_Graph* graph,
-                                                const char* op_type,
-                                                const char* oper_name);
+TF_CAPI_EXPORT extern TF_OperationDescription* TF_NewOperation(
+    TF_Graph* graph, const char* op_type, const char* oper_name);
 
 // Specify the device for `desc`.  Defaults to empty, meaning unconstrained.
-extern void TF_SetDevice(TF_OperationDescription* desc, const char* device);
+TF_CAPI_EXPORT extern void TF_SetDevice(TF_OperationDescription* desc,
+                                        const char* device);
 
 // The calls to TF_AddInput and TF_AddInputList must match (in number,
 // order, and type) the op declaration.  For example, the "Concat" op
@@ -405,101 +438,115 @@ extern void TF_SetDevice(TF_OperationDescription* desc, const char* device);
 //   TF_AddInputList(desc, values_inputs, 5);
 
 // For inputs that take a single tensor.
-extern void TF_AddInput(TF_OperationDescription* desc, TF_Output input);
+TF_CAPI_EXPORT extern void TF_AddInput(TF_OperationDescription* desc,
+                                       TF_Output input);
 
 // For inputs that take a list of tensors.
 // inputs must point to TF_Output[num_inputs].
-extern void TF_AddInputList(TF_OperationDescription* desc,
-                            const TF_Output* inputs, int num_inputs);
+TF_CAPI_EXPORT extern void TF_AddInputList(TF_OperationDescription* desc,
+                                           const TF_Output* inputs,
+                                           int num_inputs);
 
 // Call once per control input to `desc`.
-extern void TF_AddControlInput(TF_OperationDescription* desc,
-                               TF_Operation* input);
+TF_CAPI_EXPORT extern void TF_AddControlInput(TF_OperationDescription* desc,
+                                              TF_Operation* input);
 
 // Request that `desc` be co-located on the device where `op`
 // is placed.
 //
 // Use of this is discouraged since the implementation of device placement is
 // subject to change. Primarily intended for internal libraries
-extern void TF_ColocateWith(TF_OperationDescription* desc, TF_Operation* op);
+TF_CAPI_EXPORT extern void TF_ColocateWith(TF_OperationDescription* desc,
+                                           TF_Operation* op);
 
 // Call some TF_SetAttr*() function for every attr that is not
 // inferred from an input and doesn't have a default value you wish to
 // keep.
 
 // `value` must point to a string of length `length` bytes.
-extern void TF_SetAttrString(TF_OperationDescription* desc,
-                             const char* attr_name, const void* value,
-                             size_t length);
+TF_CAPI_EXPORT extern void TF_SetAttrString(TF_OperationDescription* desc,
+                                            const char* attr_name,
+                                            const void* value, size_t length);
 // `values` and `lengths` each must have lengths `num_values`.
 // `values[i]` must point to a string of length `lengths[i]` bytes.
-extern void TF_SetAttrStringList(TF_OperationDescription* desc,
-                                 const char* attr_name,
-                                 const void* const* values,
-                                 const size_t* lengths, int num_values);
-extern void TF_SetAttrInt(TF_OperationDescription* desc, const char* attr_name,
-                          int64_t value);
-extern void TF_SetAttrIntList(TF_OperationDescription* desc,
-                              const char* attr_name, const int64_t* values,
-                              int num_values);
-extern void TF_SetAttrFloat(TF_OperationDescription* desc,
-                            const char* attr_name, float value);
-extern void TF_SetAttrFloatList(TF_OperationDescription* desc,
-                                const char* attr_name, const float* values,
-                                int num_values);
-extern void TF_SetAttrBool(TF_OperationDescription* desc, const char* attr_name,
-                           unsigned char value);
-extern void TF_SetAttrBoolList(TF_OperationDescription* desc,
-                               const char* attr_name,
-                               const unsigned char* values, int num_values);
-extern void TF_SetAttrType(TF_OperationDescription* desc, const char* attr_name,
-                           TF_DataType value);
-extern void TF_SetAttrTypeList(TF_OperationDescription* desc,
-                               const char* attr_name, const TF_DataType* values,
-                               int num_values);
+TF_CAPI_EXPORT extern void TF_SetAttrStringList(TF_OperationDescription* desc,
+                                                const char* attr_name,
+                                                const void* const* values,
+                                                const size_t* lengths,
+                                                int num_values);
+TF_CAPI_EXPORT extern void TF_SetAttrInt(TF_OperationDescription* desc,
+                                         const char* attr_name, int64_t value);
+TF_CAPI_EXPORT extern void TF_SetAttrIntList(TF_OperationDescription* desc,
+                                             const char* attr_name,
+                                             const int64_t* values,
+                                             int num_values);
+TF_CAPI_EXPORT extern void TF_SetAttrFloat(TF_OperationDescription* desc,
+                                           const char* attr_name, float value);
+TF_CAPI_EXPORT extern void TF_SetAttrFloatList(TF_OperationDescription* desc,
+                                               const char* attr_name,
+                                               const float* values,
+                                               int num_values);
+TF_CAPI_EXPORT extern void TF_SetAttrBool(TF_OperationDescription* desc,
+                                          const char* attr_name,
+                                          unsigned char value);
+TF_CAPI_EXPORT extern void TF_SetAttrBoolList(TF_OperationDescription* desc,
+                                              const char* attr_name,
+                                              const unsigned char* values,
+                                              int num_values);
+TF_CAPI_EXPORT extern void TF_SetAttrType(TF_OperationDescription* desc,
+                                          const char* attr_name,
+                                          TF_DataType value);
+TF_CAPI_EXPORT extern void TF_SetAttrTypeList(TF_OperationDescription* desc,
+                                              const char* attr_name,
+                                              const TF_DataType* values,
+                                              int num_values);
 
 // Set `num_dims` to -1 to represent "unknown rank".  Otherwise,
 // `dims` points to an array of length `num_dims`.  `dims[i]` must be
 // >= -1, with -1 meaning "unknown dimension".
-extern void TF_SetAttrShape(TF_OperationDescription* desc,
-                            const char* attr_name, const int64_t* dims,
-                            int num_dims);
+TF_CAPI_EXPORT extern void TF_SetAttrShape(TF_OperationDescription* desc,
+                                           const char* attr_name,
+                                           const int64_t* dims, int num_dims);
 // `dims` and `num_dims` must point to arrays of length `num_shapes`.
 // Set `num_dims[i]` to -1 to represent "unknown rank".  Otherwise,
 // `dims[i]` points to an array of length `num_dims[i]`.  `dims[i][j]`
 // must be >= -1, with -1 meaning "unknown dimension".
-extern void TF_SetAttrShapeList(TF_OperationDescription* desc,
-                                const char* attr_name,
-                                const int64_t* const* dims, const int* num_dims,
-                                int num_shapes);
+TF_CAPI_EXPORT extern void TF_SetAttrShapeList(TF_OperationDescription* desc,
+                                               const char* attr_name,
+                                               const int64_t* const* dims,
+                                               const int* num_dims,
+                                               int num_shapes);
 // `proto` must point to an array of `proto_len` bytes representing a
 // binary-serialized TensorShapeProto.
-extern void TF_SetAttrTensorShapeProto(TF_OperationDescription* desc,
-                                       const char* attr_name, const void* proto,
-                                       size_t proto_len, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_SetAttrTensorShapeProto(
+    TF_OperationDescription* desc, const char* attr_name, const void* proto,
+    size_t proto_len, TF_Status* status);
 // `protos` and `proto_lens` must point to arrays of length `num_shapes`.
 // `protos[i]` must point to an array of `proto_lens[i]` bytes
 // representing a binary-serialized TensorShapeProto.
-extern void TF_SetAttrTensorShapeProtoList(TF_OperationDescription* desc,
-                                           const char* attr_name,
-                                           const void* const* protos,
-                                           const size_t* proto_lens,
-                                           int num_shapes, TF_Status* status);
-
-extern void TF_SetAttrTensor(TF_OperationDescription* desc,
-                             const char* attr_name, TF_Tensor* value,
-                             TF_Status* status);
-extern void TF_SetAttrTensorList(TF_OperationDescription* desc,
-                                 const char* attr_name,
-                                 TF_Tensor* const* values, int num_values,
-                                 TF_Status* status);
+TF_CAPI_EXPORT extern void TF_SetAttrTensorShapeProtoList(
+    TF_OperationDescription* desc, const char* attr_name,
+    const void* const* protos, const size_t* proto_lens, int num_shapes,
+    TF_Status* status);
+
+TF_CAPI_EXPORT extern void TF_SetAttrTensor(TF_OperationDescription* desc,
+                                            const char* attr_name,
+                                            TF_Tensor* value,
+                                            TF_Status* status);
+TF_CAPI_EXPORT extern void TF_SetAttrTensorList(TF_OperationDescription* desc,
+                                                const char* attr_name,
+                                                TF_Tensor* const* values,
+                                                int num_values,
+                                                TF_Status* status);
 
 // `proto` should point to a sequence of bytes of length `proto_len`
 // representing a binary serialization of an AttrValue protocol
 // buffer.
-extern void TF_SetAttrValueProto(TF_OperationDescription* desc,
-                                 const char* attr_name, const void* proto,
-                                 size_t proto_len, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_SetAttrValueProto(TF_OperationDescription* desc,
+                                                const char* attr_name,
+                                                const void* proto,
+                                                size_t proto_len,
+                                                TF_Status* status);
 
 // If this function succeeds:
 //   * *status is set to an OK value,
@@ -511,37 +558,38 @@ extern void TF_SetAttrValueProto(TF_OperationDescription* desc,
 //   * the graph is not modified,
 //   * a null value is returned.
 // In either case, it deletes `desc`.
-extern TF_Operation* TF_FinishOperation(TF_OperationDescription* desc,
-                                        TF_Status* status);
+TF_CAPI_EXPORT extern TF_Operation* TF_FinishOperation(
+    TF_OperationDescription* desc, TF_Status* status);
 
 // TF_Operation functions.  Operations are immutable once created, so
 // these are all query functions.
 
-extern const char* TF_OperationName(TF_Operation* oper);
-extern const char* TF_OperationOpType(TF_Operation* oper);
-extern const char* TF_OperationDevice(TF_Operation* oper);
+TF_CAPI_EXPORT extern const char* TF_OperationName(TF_Operation* oper);
+TF_CAPI_EXPORT extern const char* TF_OperationOpType(TF_Operation* oper);
+TF_CAPI_EXPORT extern const char* TF_OperationDevice(TF_Operation* oper);
 
-extern int TF_OperationNumOutputs(TF_Operation* oper);
-extern TF_DataType TF_OperationOutputType(TF_Output oper_out);
-extern int TF_OperationOutputListLength(TF_Operation* oper,
-                                        const char* arg_name,
-                                        TF_Status* status);
+TF_CAPI_EXPORT extern int TF_OperationNumOutputs(TF_Operation* oper);
+TF_CAPI_EXPORT extern TF_DataType TF_OperationOutputType(TF_Output oper_out);
+TF_CAPI_EXPORT extern int TF_OperationOutputListLength(TF_Operation* oper,
+                                                       const char* arg_name,
+                                                       TF_Status* status);
 
-extern int TF_OperationNumInputs(TF_Operation* oper);
-extern TF_DataType TF_OperationInputType(TF_Input oper_in);
-extern int TF_OperationInputListLength(TF_Operation* oper, const char* arg_name,
-                                       TF_Status* status);
+TF_CAPI_EXPORT extern int TF_OperationNumInputs(TF_Operation* oper);
+TF_CAPI_EXPORT extern TF_DataType TF_OperationInputType(TF_Input oper_in);
+TF_CAPI_EXPORT extern int TF_OperationInputListLength(TF_Operation* oper,
+                                                      const char* arg_name,
+                                                      TF_Status* status);
 
 // In this code:
 //   TF_Output producer = TF_OperationInput(consumer);
 // There is an edge from producer.oper's output (given by
 // producer.index) to consumer.oper's input (given by consumer.index).
-extern TF_Output TF_OperationInput(TF_Input oper_in);
+TF_CAPI_EXPORT extern TF_Output TF_OperationInput(TF_Input oper_in);
 
 // Get the number of current consumers of a specific output of an
 // operation.  Note that this number can change when new operations
 // are added to the graph.
-extern int TF_OperationOutputNumConsumers(TF_Output oper_out);
+TF_CAPI_EXPORT extern int TF_OperationOutputNumConsumers(TF_Output oper_out);
 
 // Get list of all current consumers of a specific output of an
 // operation.  `consumers` must point to an array of length at least
@@ -550,24 +598,24 @@ extern int TF_OperationOutputNumConsumers(TF_Output oper_out);
 // modification of the graph can increase the number of consumers of
 // an operation.  Returns the number of output consumers (should match
 // TF_OperationOutputNumConsumers(oper_out)).
-extern int TF_OperationOutputConsumers(TF_Output oper_out, TF_Input* consumers,
-                                       int max_consumers);
+TF_CAPI_EXPORT extern int TF_OperationOutputConsumers(TF_Output oper_out,
+                                                      TF_Input* consumers,
+                                                      int max_consumers);
 
 // Get the number of control inputs to an operation.
-extern int TF_OperationNumControlInputs(TF_Operation* oper);
+TF_CAPI_EXPORT extern int TF_OperationNumControlInputs(TF_Operation* oper);
 
 // Get list of all control inputs to an operation.  `control_inputs` must
 // point to an array of length `max_control_inputs` (ideally set to
 // TF_OperationNumControlInputs(oper)).  Returns the number of control
 // inputs (should match TF_OperationNumControlInputs(oper)).
-extern int TF_OperationGetControlInputs(TF_Operation* oper,
-                                        TF_Operation** control_inputs,
-                                        int max_control_inputs);
+TF_CAPI_EXPORT extern int TF_OperationGetControlInputs(
+    TF_Operation* oper, TF_Operation** control_inputs, int max_control_inputs);
 
 // Get the number of operations that have `*oper` as a control input.
 // Note that this number can change when new operations are added to
 // the graph.
-extern int TF_OperationNumControlOutputs(TF_Operation* oper);
+TF_CAPI_EXPORT extern int TF_OperationNumControlOutputs(TF_Operation* oper);
 
 // Get the list of operations that have `*oper` as a control input.
 // `control_outputs` must point to an array of length at least
@@ -576,12 +624,12 @@ extern int TF_OperationNumControlOutputs(TF_Operation* oper);
 // modification of the graph can increase the number of control
 // outputs.  Returns the number of control outputs (should match
 // TF_OperationNumControlOutputs(oper)).
-extern int TF_OperationGetControlOutputs(TF_Operation* oper,
-                                         TF_Operation** control_outputs,
-                                         int max_control_outputs);
+TF_CAPI_EXPORT extern int TF_OperationGetControlOutputs(
+    TF_Operation* oper, TF_Operation** control_outputs,
+    int max_control_outputs);
 
 // TF_AttrType describes the type of the value of an attribute on an operation.
-typedef enum {
+typedef enum TF_AttrType {
   TF_ATTR_STRING = 0,
   TF_ATTR_INT = 1,
   TF_ATTR_FLOAT = 2,
@@ -625,17 +673,18 @@ typedef struct TF_AttrMetadata {
 } TF_AttrMetadata;
 
 // Returns metadata about the value of the attribute `attr_name` of `oper`.
-extern TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper,
-                                                   const char* attr_name,
-                                                   TF_Status* status);
+TF_CAPI_EXPORT extern TF_AttrMetadata TF_OperationGetAttrMetadata(
+    TF_Operation* oper, const char* attr_name, TF_Status* status);
 
 // Fills in `value` with the value of the attribute `attr_name`.  `value` must
 // point to an array of length at least `max_length` (ideally set to
 // TF_AttrMetadata.total_size from TF_OperationGetAttrMetadata(oper,
 // attr_name)).
-extern void TF_OperationGetAttrString(TF_Operation* oper, const char* attr_name,
-                                      void* value, size_t max_length,
-                                      TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrString(TF_Operation* oper,
+                                                     const char* attr_name,
+                                                     void* value,
+                                                     size_t max_length,
+                                                     TF_Status* status);
 
 // Get the list of strings in the value of the attribute `attr_name`.  Fills in
 // `values` and `lengths`, each of which must point to an array of length at
@@ -648,64 +697,78 @@ extern void TF_OperationGetAttrString(TF_Operation* oper, const char* attr_name,
 // attr_name).
 //
 // Fails if storage_size is too small to hold the requested number of strings.
-extern void TF_OperationGetAttrStringList(TF_Operation* oper,
-                                          const char* attr_name, void** values,
-                                          size_t* lengths, int max_values,
-                                          void* storage, size_t storage_size,
-                                          TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrStringList(
+    TF_Operation* oper, const char* attr_name, void** values, size_t* lengths,
+    int max_values, void* storage, size_t storage_size, TF_Status* status);
 
-extern void TF_OperationGetAttrInt(TF_Operation* oper, const char* attr_name,
-                                   int64_t* value, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrInt(TF_Operation* oper,
+                                                  const char* attr_name,
+                                                  int64_t* value,
+                                                  TF_Status* status);
 
 // Fills in `values` with the value of the attribute `attr_name` of `oper`.
 // `values` must point to an array of length at least `max_values` (ideally set
 // TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper,
 // attr_name)).
-extern void TF_OperationGetAttrIntList(TF_Operation* oper,
-                                       const char* attr_name, int64_t* values,
-                                       int max_values, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrIntList(TF_Operation* oper,
+                                                      const char* attr_name,
+                                                      int64_t* values,
+                                                      int max_values,
+                                                      TF_Status* status);
 
-extern void TF_OperationGetAttrFloat(TF_Operation* oper, const char* attr_name,
-                                     float* value, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrFloat(TF_Operation* oper,
+                                                    const char* attr_name,
+                                                    float* value,
+                                                    TF_Status* status);
 
 // Fills in `values` with the value of the attribute `attr_name` of `oper`.
 // `values` must point to an array of length at least `max_values` (ideally set
 // to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper,
 // attr_name)).
-extern void TF_OperationGetAttrFloatList(TF_Operation* oper,
-                                         const char* attr_name, float* values,
-                                         int max_values, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrFloatList(TF_Operation* oper,
+                                                        const char* attr_name,
+                                                        float* values,
+                                                        int max_values,
+                                                        TF_Status* status);
 
-extern void TF_OperationGetAttrBool(TF_Operation* oper, const char* attr_name,
-                                    unsigned char* value, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrBool(TF_Operation* oper,
+                                                   const char* attr_name,
+                                                   unsigned char* value,
+                                                   TF_Status* status);
 
 // Fills in `values` with the value of the attribute `attr_name` of `oper`.
 // `values` must point to an array of length at least `max_values` (ideally set
 // to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper,
 // attr_name)).
-extern void TF_OperationGetAttrBoolList(TF_Operation* oper,
-                                        const char* attr_name,
-                                        unsigned char* values, int max_values,
-                                        TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrBoolList(TF_Operation* oper,
+                                                       const char* attr_name,
+                                                       unsigned char* values,
+                                                       int max_values,
+                                                       TF_Status* status);
 
-extern void TF_OperationGetAttrType(TF_Operation* oper, const char* attr_name,
-                                    TF_DataType* value, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrType(TF_Operation* oper,
+                                                   const char* attr_name,
+                                                   TF_DataType* value,
+                                                   TF_Status* status);
 
 // Fills in `values` with the value of the attribute `attr_name` of `oper`.
 // `values` must point to an array of length at least `max_values` (ideally set
 // to TF_AttrMetadata.list_size from TF_OperationGetAttrMetadata(oper,
 // attr_name)).
-extern void TF_OperationGetAttrTypeList(TF_Operation* oper,
-                                        const char* attr_name,
-                                        TF_DataType* values, int max_values,
-                                        TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrTypeList(TF_Operation* oper,
+                                                       const char* attr_name,
+                                                       TF_DataType* values,
+                                                       int max_values,
+                                                       TF_Status* status);
 
 // Fills in `value` with the value of the attribute `attr_name` of `oper`.
 // `values` must point to an array of length at least `num_dims` (ideally set to
 // TF_Attr_Meta.size from TF_OperationGetAttrMetadata(oper, attr_name)).
-extern void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name,
-                                     int64_t* value, int num_dims,
-                                     TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrShape(TF_Operation* oper,
+                                                    const char* attr_name,
+                                                    int64_t* value,
+                                                    int num_dims,
+                                                    TF_Status* status);
 
 // Fills in `dims` with the list of shapes in the attribute `attr_name` of
 // `oper` and `num_dims` with the corresponding number of dimensions. On return,
@@ -720,35 +783,32 @@ extern void TF_OperationGetAttrShape(TF_Operation* oper, const char* attr_name,
 // attr_name).
 //
 // Fails if storage_size is insufficient to hold the requested shapes.
-extern void TF_OperationGetAttrShapeList(TF_Operation* oper,
-                                         const char* attr_name, int64_t** dims,
-                                         int* num_dims, int num_shapes,
-                                         int64_t* storage, int storage_size,
-                                         TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrShapeList(
+    TF_Operation* oper, const char* attr_name, int64_t** dims, int* num_dims,
+    int num_shapes, int64_t* storage, int storage_size, TF_Status* status);
 
 // Sets `value` to the binary-serialized TensorShapeProto of the value of
 // `attr_name` attribute of `oper`'.
-extern void TF_OperationGetAttrTensorShapeProto(TF_Operation* oper,
-                                                const char* attr_name,
-                                                TF_Buffer* value,
-                                                TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorShapeProto(
+    TF_Operation* oper, const char* attr_name, TF_Buffer* value,
+    TF_Status* status);
 
 // Fills in `values` with binary-serialized TensorShapeProto values of the
 // attribute `attr_name` of `oper`. `values` must point to an array of length at
 // least `num_values` (ideally set to TF_AttrMetadata.list_size from
 // TF_OperationGetAttrMetadata(oper, attr_name)).
-extern void TF_OperationGetAttrTensorShapeProtoList(TF_Operation* oper,
-                                                    const char* attr_name,
-                                                    TF_Buffer** values,
-                                                    int max_values,
-                                                    TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorShapeProtoList(
+    TF_Operation* oper, const char* attr_name, TF_Buffer** values,
+    int max_values, TF_Status* status);
 
 // Gets the TF_Tensor valued attribute of `attr_name` of `oper`.
 //
 // Allocates a new TF_Tensor which the caller is expected to take
 // ownership of (and can deallocate using TF_DeleteTensor).
-extern void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name,
-                                      TF_Tensor** value, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrTensor(TF_Operation* oper,
+                                                     const char* attr_name,
+                                                     TF_Tensor** value,
+                                                     TF_Status* status);
 
 // Fills in `values` with the TF_Tensor values of the attribute `attr_name` of
 // `oper`. `values` must point to an array of TF_Tensor* of length at least
@@ -757,22 +817,22 @@ extern void TF_OperationGetAttrTensor(TF_Operation* oper, const char* attr_name,
 //
 // The caller takes ownership of all the non-null TF_Tensor* entries in `values`
 // (which can be deleted using TF_DeleteTensor(values[i])).
-extern void TF_OperationGetAttrTensorList(TF_Operation* oper,
-                                          const char* attr_name,
-                                          TF_Tensor** values, int max_values,
-                                          TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrTensorList(TF_Operation* oper,
+                                                         const char* attr_name,
+                                                         TF_Tensor** values,
+                                                         int max_values,
+                                                         TF_Status* status);
 
 // Sets `output_attr_value` to the binary-serialized AttrValue proto
 // representation of the value of the `attr_name` attr of `oper`.
-extern void TF_OperationGetAttrValueProto(TF_Operation* oper,
-                                          const char* attr_name,
-                                          TF_Buffer* output_attr_value,
-                                          TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationGetAttrValueProto(
+    TF_Operation* oper, const char* attr_name, TF_Buffer* output_attr_value,
+    TF_Status* status);
 
 // Returns the operation in the graph with `oper_name`. Returns nullptr if
 // no operation found.
-extern TF_Operation* TF_GraphOperationByName(TF_Graph* graph,
-                                             const char* oper_name);
+TF_CAPI_EXPORT extern TF_Operation* TF_GraphOperationByName(
+    TF_Graph* graph, const char* oper_name);
 
 // Iterate through the operations of a graph.  To use:
 // size_t pos = 0;
@@ -780,7 +840,8 @@ extern TF_Operation* TF_GraphOperationByName(TF_Graph* graph,
 // while ((oper = TF_GraphNextOperation(graph, &pos)) != nullptr) {
 //   DoSomethingWithOperation(oper);
 // }
-extern TF_Operation* TF_GraphNextOperation(TF_Graph* graph, size_t* pos);
+TF_CAPI_EXPORT extern TF_Operation* TF_GraphNextOperation(TF_Graph* graph,
+                                                          size_t* pos);
 
 // Write out a serialized representation of `graph` (as a GraphDef protocol
 // message) to `output_graph_def` (allocated by TF_NewBuffer()).
@@ -788,25 +849,27 @@ extern TF_Operation* TF_GraphNextOperation(TF_Graph* graph, size_t* pos);
 // is called.
 //
 // May fail on very large graphs in the future.
-extern void TF_GraphToGraphDef(TF_Graph* graph, TF_Buffer* output_graph_def,
-                               TF_Status* status);
+TF_CAPI_EXPORT extern void TF_GraphToGraphDef(TF_Graph* graph,
+                                              TF_Buffer* output_graph_def,
+                                              TF_Status* status);
 
 // TF_ImportGraphDefOptions holds options that can be passed to
 // TF_GraphImportGraphDef.
 typedef struct TF_ImportGraphDefOptions TF_ImportGraphDefOptions;
 
-extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions();
-extern void TF_DeleteImportGraphDefOptions(TF_ImportGraphDefOptions* opts);
+TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions();
+TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions(
+    TF_ImportGraphDefOptions* opts);
 
 // Set the prefix to be prepended to the names of nodes in `graph_def` that will
 // be imported into `graph`.
-extern void TF_ImportGraphDefOptionsSetPrefix(TF_ImportGraphDefOptions* opts,
-                                              const char* prefix);
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetPrefix(
+    TF_ImportGraphDefOptions* opts, const char* prefix);
 
 // Set any imported nodes with input `src_name:src_index` to have that input
 // replaced with `dst`. `src_name` refers to a node in the graph to be imported,
 // `dst` references a node already existing in the graph being imported into.
-extern void TF_ImportGraphDefOptionsAddInputMapping(
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddInputMapping(
     TF_ImportGraphDefOptions* opts, const char* src_name, int src_index,
     TF_Output dst);
 
@@ -814,23 +877,23 @@ extern void TF_ImportGraphDefOptionsAddInputMapping(
 // replaced with `dst`. `src_name` refers to a node in the graph to be imported,
 // `dst` references an operation already existing in the graph being imported
 // into.
-extern void TF_GraphImportGraphDefOptionsRemapControlDependency(
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsRemapControlDependency(
     TF_ImportGraphDefOptions* opts, const char* src_name, TF_Operation* dst);
 
 // Cause the imported graph to have a control dependency on `oper`. `oper`
 // should exist in the graph being imported into.
-extern void TF_ImportGraphDefOptionsAddControlDependency(
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddControlDependency(
     TF_ImportGraphDefOptions* opts, TF_Operation* oper);
 
 // Add an output in `graph_def` to be returned via the `return_outputs` output
 // parameter of TF_GraphImportGraphDef(). If the output is remapped via an input
 // mapping, the corresponding existing tensor in `graph` will be returned.
-extern void TF_ImportGraphDefOptionsAddReturnOutput(
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsAddReturnOutput(
     TF_ImportGraphDefOptions* opts, const char* oper_name, int index);
 
 // Returns the number of return outputs added via
 // TF_ImportGraphDefOptionsAddReturnOutput().
-extern int TF_ImportGraphDefOptionsNumReturnOutputs(
+TF_CAPI_EXPORT extern int TF_ImportGraphDefOptionsNumReturnOutputs(
     const TF_ImportGraphDefOptions* opts);
 
 // Import the graph serialized in `graph_def` into `graph`.
@@ -839,22 +902,22 @@ extern int TF_ImportGraphDefOptionsNumReturnOutputs(
 // result of TF_ImportGraphDefOptionsNumReturnOutputs()).  If
 // `num_return_outputs` is non-zero, `return_outputs` must be of length
 // `num_return_outputs`. Otherwise it can be null.
-extern void TF_GraphImportGraphDefWithReturnOutputs(
+TF_CAPI_EXPORT extern void TF_GraphImportGraphDefWithReturnOutputs(
     TF_Graph* graph, const TF_Buffer* graph_def,
     const TF_ImportGraphDefOptions* options, TF_Output* return_outputs,
     int num_return_outputs, TF_Status* status);
 
 // Import the graph serialized in `graph_def` into `graph`.
 // Convenience function for when no return outputs have been added.
-extern void TF_GraphImportGraphDef(TF_Graph* graph, const TF_Buffer* graph_def,
-                                   const TF_ImportGraphDefOptions* options,
-                                   TF_Status* status);
+TF_CAPI_EXPORT extern void TF_GraphImportGraphDef(
+    TF_Graph* graph, const TF_Buffer* graph_def,
+    const TF_ImportGraphDefOptions* options, TF_Status* status);
 
 // Note: The following function may fail on very large protos in the future.
 
-extern void TF_OperationToNodeDef(TF_Operation* oper,
-                                  TF_Buffer* output_node_def,
-                                  TF_Status* status);
+TF_CAPI_EXPORT extern void TF_OperationToNodeDef(TF_Operation* oper,
+                                                 TF_Buffer* output_node_def,
+                                                 TF_Status* status);
 
 typedef struct TF_WhileParams {
   // The number of inputs to the while loop, i.e. the number of loop variables.
@@ -894,7 +957,7 @@ typedef struct TF_WhileParams {
 // TF_FinishWhile() or TF_AbortWhile().
 //
 // Missing functionality (TODO):
-// - Gradients (not yet implmented for any ops)
+// - Gradients
 // - Reference-type inputs
 // - Directly referencing external tensors from the cond/body graphs (this is
 //   possible in the Python API)
@@ -917,7 +980,22 @@ void TF_FinishWhile(const TF_WhileParams* params, TF_Status* status,
 // called after a successful TF_NewWhile() call.
 void TF_AbortWhile(const TF_WhileParams* params);
 
-// TODO(andydavis): Function to add gradients to a graph.
+// Adds operations to compute the partial derivatives of sum of `y`s w.r.t `x`s,
+// i.e., d(y_1 + y_2 + ...)/dx_1, d(y_1 + y_2 + ...)/dx_2...
+// `dx` are used as initial gradients (which represent the symbolic partial
+// derivatives of some loss function `L` w.r.t. `y`).
+// `dx` must be nullptr or have size `ny`.
+// If `dx` is nullptr, the implementation will use dx of `OnesLike` for all
+// shapes in `y`.
+// The partial derivatives are returned in `dy`. `dy` should be allocated to
+// size `nx`.
+//
+// WARNING: This function does not yet support all the gradients that python
+// supports. See
+// https://www.tensorflow.org/code/tensorflow/cc/gradients/README.md
+// for instructions on how to add C++ more gradients.
+void TF_AddGradients(TF_Graph* g, TF_Output* y, int ny, TF_Output* x, int nx,
+                     TF_Output* dx, TF_Status* status, TF_Output* dy);
 
 // TODO(josh11b): Register OpDef, available to all operations added
 // to this graph.
@@ -936,8 +1014,9 @@ typedef struct TF_Session TF_Session;
 // *graph must be a valid graph (not deleted or nullptr).  This function will
 // prevent the graph from being deleted until TF_DeleteSession() is called.
 // Does not take ownership of opts.
-extern TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opts,
-                                 TF_Status* status);
+TF_CAPI_EXPORT extern TF_Session* TF_NewSession(TF_Graph* graph,
+                                                const TF_SessionOptions* opts,
+                                                TF_Status* status);
 
 // This function creates a new TF_Session (which is created on success) using
 // `session_options`, and then initializes state (restoring tensors and other
@@ -962,7 +1041,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
 //
 // Contacts any other processes associated with the session, if applicable.
 // May not be called after TF_DeleteSession().
-extern void TF_CloseSession(TF_Session*, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_CloseSession(TF_Session*, TF_Status* status);
 
 // Destroy a session object.
 //
@@ -970,7 +1049,7 @@ extern void TF_CloseSession(TF_Session*, TF_Status* status);
 // local resources associated with the session.  The session may not be used
 // during or after this call (and the session drops its reference to the
 // corresponding graph).
-extern void TF_DeleteSession(TF_Session*, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_DeleteSession(TF_Session*, TF_Status* status);
 
 // Run the graph associated with the session starting with the supplied inputs
 // (inputs[0,ninputs-1] with corresponding values in input_values[0,ninputs-1]).
@@ -996,21 +1075,20 @@ extern void TF_DeleteSession(TF_Session*, TF_Status* status);
 // to the caller, which must eventually call TF_DeleteTensor on them.
 //
 // On failure, output_values[] contains NULLs.
-extern void TF_SessionRun(TF_Session* session,
-                          // RunOptions
-                          const TF_Buffer* run_options,
-                          // Input tensors
-                          const TF_Output* inputs,
-                          TF_Tensor* const* input_values, int ninputs,
-                          // Output tensors
-                          const TF_Output* outputs, TF_Tensor** output_values,
-                          int noutputs,
-                          // Target operations
-                          const TF_Operation* const* target_opers, int ntargets,
-                          // RunMetadata
-                          TF_Buffer* run_metadata,
-                          // Output status
-                          TF_Status*);
+TF_CAPI_EXPORT extern void TF_SessionRun(
+    TF_Session* session,
+    // RunOptions
+    const TF_Buffer* run_options,
+    // Input tensors
+    const TF_Output* inputs, TF_Tensor* const* input_values, int ninputs,
+    // Output tensors
+    const TF_Output* outputs, TF_Tensor** output_values, int noutputs,
+    // Target operations
+    const TF_Operation* const* target_opers, int ntargets,
+    // RunMetadata
+    TF_Buffer* run_metadata,
+    // Output status
+    TF_Status*);
 
 // Set up the graph with the intended feeds (inputs) and fetches (outputs) for a
 // sequence of partial run calls.
@@ -1022,38 +1100,36 @@ extern void TF_SessionRun(TF_Session* session,
 // On failure, out_status contains a tensorflow::Status with an error
 // message.
 // NOTE: This is EXPERIMENTAL and subject to change.
-extern void TF_SessionPRunSetup(TF_Session*,
-                                // Input names
-                                const TF_Output* inputs, int ninputs,
-                                // Output names
-                                const TF_Output* outputs, int noutputs,
-                                // Target operations
-                                const TF_Operation* const* target_opers,
-                                int ntargets,
-                                // Output handle
-                                const char** handle,
-                                // Output status
-                                TF_Status*);
+TF_CAPI_EXPORT extern void TF_SessionPRunSetup(
+    TF_Session*,
+    // Input names
+    const TF_Output* inputs, int ninputs,
+    // Output names
+    const TF_Output* outputs, int noutputs,
+    // Target operations
+    const TF_Operation* const* target_opers, int ntargets,
+    // Output handle
+    const char** handle,
+    // Output status
+    TF_Status*);
 
 // Continue to run the graph with additional feeds and fetches. The
 // execution state is uniquely identified by the handle.
 // NOTE: This is EXPERIMENTAL and subject to change.
-extern void TF_SessionPRun(TF_Session*, const char* handle,
-                           // Input tensors
-                           const TF_Output* inputs,
-                           TF_Tensor* const* input_values, int ninputs,
-                           // Output tensors
-                           const TF_Output* outputs, TF_Tensor** output_values,
-                           int noutputs,
-                           // Target operations
-                           const TF_Operation* const* target_opers,
-                           int ntargets,
-                           // Output status
-                           TF_Status*);
+TF_CAPI_EXPORT extern void TF_SessionPRun(
+    TF_Session*, const char* handle,
+    // Input tensors
+    const TF_Output* inputs, TF_Tensor* const* input_values, int ninputs,
+    // Output tensors
+    const TF_Output* outputs, TF_Tensor** output_values, int noutputs,
+    // Target operations
+    const TF_Operation* const* target_opers, int ntargets,
+    // Output status
+    TF_Status*);
 
 // Deletes a handle allocated by TF_SessionPRunSetup.
 // Once called, no more calls to TF_SessionPRun should be made.
-extern void TF_DeletePRunHandle(const char* handle);
+TF_CAPI_EXPORT extern void TF_DeletePRunHandle(const char* handle);
 
 // --------------------------------------------------------------------------
 // The deprecated session API.  Please switch to the above instead of
@@ -1062,39 +1138,47 @@ extern void TF_DeletePRunHandle(const char* handle);
 
 typedef struct TF_DeprecatedSession TF_DeprecatedSession;
 
-extern TF_DeprecatedSession* TF_NewDeprecatedSession(const TF_SessionOptions*,
+TF_CAPI_EXPORT extern TF_DeprecatedSession* TF_NewDeprecatedSession(
+    const TF_SessionOptions*, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_CloseDeprecatedSession(TF_DeprecatedSession*,
                                                      TF_Status* status);
-extern void TF_CloseDeprecatedSession(TF_DeprecatedSession*, TF_Status* status);
-extern void TF_DeleteDeprecatedSession(TF_DeprecatedSession*,
-                                       TF_Status* status);
-extern void TF_Reset(const TF_SessionOptions* opt, const char** containers,
-                     int ncontainers, TF_Status* status);
+TF_CAPI_EXPORT extern void TF_DeleteDeprecatedSession(TF_DeprecatedSession*,
+                                                      TF_Status* status);
+TF_CAPI_EXPORT extern void TF_Reset(const TF_SessionOptions* opt,
+                                    const char** containers, int ncontainers,
+                                    TF_Status* status);
 // Treat the bytes proto[0,proto_len-1] as a serialized GraphDef and
 // add the nodes in that GraphDef to the graph for the session.
 //
 // Prefer use of TF_Session and TF_GraphImportGraphDef over this.
-extern void TF_ExtendGraph(TF_DeprecatedSession*, const void* proto,
-                           size_t proto_len, TF_Status*);
+TF_CAPI_EXPORT extern void TF_ExtendGraph(TF_DeprecatedSession*,
+                                          const void* proto, size_t proto_len,
+                                          TF_Status*);
 
 // See TF_SessionRun() above.
-extern void TF_Run(TF_DeprecatedSession*, const TF_Buffer* run_options,
-                   const char** input_names, TF_Tensor** inputs, int ninputs,
-                   const char** output_names, TF_Tensor** outputs, int noutputs,
-                   const char** target_oper_names, int ntargets,
-                   TF_Buffer* run_metadata, TF_Status*);
+TF_CAPI_EXPORT extern void TF_Run(TF_DeprecatedSession*,
+                                  const TF_Buffer* run_options,
+                                  const char** input_names, TF_Tensor** inputs,
+                                  int ninputs, const char** output_names,
+                                  TF_Tensor** outputs, int noutputs,
+                                  const char** target_oper_names, int ntargets,
+                                  TF_Buffer* run_metadata, TF_Status*);
 
 // See TF_SessionPRunSetup() above.
-extern void TF_PRunSetup(TF_DeprecatedSession*, const char** input_names,
-                         int ninputs, const char** output_names, int noutputs,
-                         const char** target_oper_names, int ntargets,
-                         const char** handle, TF_Status*);
+TF_CAPI_EXPORT extern void TF_PRunSetup(TF_DeprecatedSession*,
+                                        const char** input_names, int ninputs,
+                                        const char** output_names, int noutputs,
+                                        const char** target_oper_names,
+                                        int ntargets, const char** handle,
+                                        TF_Status*);
 
 // See TF_SessionPRun above.
-extern void TF_PRun(TF_DeprecatedSession*, const char* handle,
-                    const char** input_names, TF_Tensor** inputs, int ninputs,
-                    const char** output_names, TF_Tensor** outputs,
-                    int noutputs, const char** target_oper_names, int ntargets,
-                    TF_Status*);
+TF_CAPI_EXPORT extern void TF_PRun(TF_DeprecatedSession*, const char* handle,
+                                   const char** input_names, TF_Tensor** inputs,
+                                   int ninputs, const char** output_names,
+                                   TF_Tensor** outputs, int noutputs,
+                                   const char** target_oper_names, int ntargets,
+                                   TF_Status*);
 
 // --------------------------------------------------------------------------
 // Load plugins containing custom ops and kernels
@@ -1113,19 +1197,19 @@ typedef struct TF_Library TF_Library;
 // The caller owns the library handle.
 //
 // On failure, place an error status in status and return NULL.
-extern TF_Library* TF_LoadLibrary(const char* library_filename,
-                                  TF_Status* status);
+TF_CAPI_EXPORT extern TF_Library* TF_LoadLibrary(const char* library_filename,
+                                                 TF_Status* status);
 
 // Get the OpList of OpDefs defined in the library pointed by lib_handle.
 //
 // Returns a TF_Buffer. The memory pointed to by the result is owned by
 // lib_handle. The data in the buffer will be the serialized OpList proto for
 // ops defined in the library.
-extern TF_Buffer TF_GetOpList(TF_Library* lib_handle);
+TF_CAPI_EXPORT extern TF_Buffer TF_GetOpList(TF_Library* lib_handle);
 
 // Frees the memory associated with the library handle.
 // Does NOT unload the library.
-extern void TF_DeleteLibraryHandle(TF_Library* lib_handle);
+TF_CAPI_EXPORT extern void TF_DeleteLibraryHandle(TF_Library* lib_handle);
 
 // Get the OpList of all OpDefs defined in this address space.
 // Returns a TF_Buffer, ownership of which is transferred to the caller
@@ -1133,7 +1217,7 @@ extern void TF_DeleteLibraryHandle(TF_Library* lib_handle);
 //
 // The data in the buffer will be the serialized OpList proto for ops registered
 // in this address space.
-extern TF_Buffer* TF_GetAllOpList();
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList();
 
 #ifdef __cplusplus
 } /* end extern "C" */
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5320d20dadb0f466b8b29b8ba5eda1693e0faba
--- /dev/null
+++ b/tensorflow/c/c_api_internal.h
@@ -0,0 +1,116 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_api.h"
+
+#include <vector>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
+
+
+// Internal structures used by the C API. These are likely to change and should
+// not be depended on.
+
+struct TF_Status {
+  tensorflow::Status status;
+};
+
+struct TF_Tensor {
+  TF_DataType dtype;
+  tensorflow::TensorShape shape;
+  tensorflow::TensorBuffer* buffer;
+};
+
+struct TF_SessionOptions {
+  tensorflow::SessionOptions options;
+};
+
+struct TF_DeprecatedSession {
+  tensorflow::Session* session;
+};
+
+struct TF_Library {
+  void* lib_handle;
+  TF_Buffer op_list;
+};
+
+struct TF_Graph {
+  TF_Graph()
+      : graph(tensorflow::OpRegistry::Global()),
+        refiner(graph.versions().producer(), graph.op_registry()),
+        num_sessions(0),
+        delete_requested(false),
+        parent(nullptr),
+        parent_inputs(nullptr) {}
+  tensorflow::mutex mu;
+  tensorflow::Graph graph GUARDED_BY(mu);
+
+  // Runs shape inference.
+  tensorflow::ShapeRefiner refiner GUARDED_BY(mu);
+
+  // Maps from name of an operation to the Node* in 'graph'.
+  std::unordered_map<tensorflow::string, tensorflow::Node*> name_map
+      GUARDED_BY(mu);
+
+  // TF_Graph may only / must be deleted when
+  //   num_sessions == 0 && delete_requested == true
+
+  // num_sessions incremented by TF_NewSession, and decremented by
+  // TF_DeleteSession.
+  int num_sessions GUARDED_BY(mu);
+  bool delete_requested GUARDED_BY(mu);  // set true by TF_DeleteGraph
+
+  // Used to link graphs contained in TF_WhileParams to the parent graph that
+  // will eventually contain the full while loop.
+  TF_Graph* parent;
+  TF_Output* parent_inputs;
+};
+
+struct TF_OperationDescription {
+  TF_OperationDescription(TF_Graph* g, const char* op_type,
+                          const char* node_name)
+      : node_builder(node_name, op_type, g->graph.op_registry()), graph(g) {}
+
+  tensorflow::NodeBuilder node_builder;
+  TF_Graph* graph;
+  std::vector<tensorflow::string> colocation_constraints;
+};
+
+struct TF_Operation {
+  tensorflow::Node node;
+};
+
+struct TF_Session {
+  TF_Session(tensorflow::Session* s, TF_Graph* g)
+      : session(s), graph(g), last_num_graph_nodes(0) {}
+  tensorflow::Session* session;
+  TF_Graph* graph;
+  tensorflow::mutex mu;
+  int last_num_graph_nodes;
+};
+
+struct TF_ImportGraphDefOptions {
+  tensorflow::ImportGraphDefOptions opts;
+};
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 5673f657d3c5b77618c481da614573b9e4a63aba..cdb7406c86e8b10d24c303615d13089272bcab5d 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/saved_model/tag_constants.h"
 #include "tensorflow/core/example/example.pb.h"
 #include "tensorflow/core/example/feature.pb.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -38,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tensorflow/core/util/equal_graph_def.h"
 
 using tensorflow::int32;
 using tensorflow::string;
@@ -105,6 +107,22 @@ TEST(CAPI, AllocateTensor) {
   TF_DeleteTensor(t);
 }
 
+TEST(CAPI, MaybeMove) {
+  const int num_bytes = 6 * sizeof(float);
+  float* values =
+      reinterpret_cast<float*>(tensorflow::cpu_allocator()->AllocateRaw(
+          EIGEN_MAX_ALIGN_BYTES, num_bytes));
+  int64_t dims[] = {2, 3};
+  bool deallocator_called = false;
+  TF_Tensor* t = TF_NewTensor(TF_FLOAT, dims, 2, values, num_bytes,
+                              &Deallocator, &deallocator_called);
+
+  TF_Tensor* o = TF_TensorMaybeMove(t);
+  ASSERT_TRUE(o == nullptr);  // It is unsafe to move memory TF might not own.
+  TF_DeleteTensor(t);
+  EXPECT_TRUE(deallocator_called);
+}
+
 TEST(CAPI, LibraryLoadFunctions) {
   // Load the library.
   TF_Status* status = TF_NewStatus();
@@ -261,6 +279,19 @@ static void Int32Deallocator(void* data, size_t, void* arg) {
   delete[] static_cast<int32*>(data);
 }
 
+// Create a tensor with values of type TF_INT8 provided by `values`.
+static TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims,
+                             const char* values) {
+  int64_t num_values = 1;
+  for (int i = 0; i < num_dims; ++i) {
+    num_values *= dims[i];
+  }
+  TF_Tensor* t =
+      TF_AllocateTensor(TF_INT8, dims, num_dims, sizeof(char) * num_values);
+  memcpy(TF_TensorData(t), values, sizeof(char) * num_values);
+  return t;
+}
+
 static TF_Tensor* Int32Tensor(int32 v) {
   const int num_bytes = sizeof(int32);
   int32* values = new int32[1];
@@ -276,16 +307,21 @@ TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s,
   return TF_FinishOperation(desc, s);
 }
 
-TF_Operation* ScalarConst(int32 v, TF_Graph* graph, TF_Status* s,
-                          const char* name = "scalar") {
-  unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor);
+TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s,
+                    const char* name = "const") {
   TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name);
-  TF_SetAttrTensor(desc, "value", tensor.get(), s);
+  TF_SetAttrTensor(desc, "value", t, s);
   if (TF_GetCode(s) != TF_OK) return nullptr;
-  TF_SetAttrType(desc, "dtype", TF_INT32);
+  TF_SetAttrType(desc, "dtype", TF_TensorType(t));
   return TF_FinishOperation(desc, s);
 }
 
+TF_Operation* ScalarConst(int32 v, TF_Graph* graph, TF_Status* s,
+                          const char* name = "scalar") {
+  unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor);
+  return Const(tensor.get(), graph, s, name);
+}
+
 TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
                   TF_Status* s, const char* name = "add") {
   TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
@@ -805,6 +841,33 @@ TEST(CAPI, ImportGraphDef) {
   EXPECT_EQ(feed, control_inputs[0]);
   EXPECT_EQ(feed2, control_inputs[1]);
 
+  // Export to a graph def so we can import a graph with control dependencies
+  TF_DeleteBuffer(graph_def);
+  graph_def = TF_NewBuffer();
+  TF_GraphToGraphDef(graph, graph_def, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  // Import again, with remapped control dependency, into the same graph
+  TF_DeleteImportGraphDefOptions(opts);
+  opts = TF_NewImportGraphDefOptions();
+  TF_ImportGraphDefOptionsSetPrefix(opts, "imported4");
+  TF_ImportGraphDefOptionsRemapControlDependency(opts, "imported/feed", feed);
+  TF_GraphImportGraphDef(graph, graph_def, opts, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+
+  TF_Operation* scalar4 =
+      TF_GraphOperationByName(graph, "imported4/imported3/scalar");
+  TF_Operation* feed4 =
+      TF_GraphOperationByName(graph, "imported4/imported2/feed");
+
+  // Check that imported `imported3/scalar` has remapped control dep from
+  // original graph and imported control dep
+  num_control_inputs = TF_OperationGetControlInputs(
+      scalar4, control_inputs, TF_OperationNumControlInputs(scalar4));
+  ASSERT_EQ(2, num_control_inputs);
+  EXPECT_EQ(feed, control_inputs[0]);
+  EXPECT_EQ(feed4, control_inputs[1]);
+
   TF_DeleteImportGraphDefOptions(opts);
   TF_DeleteBuffer(graph_def);
 
@@ -1049,6 +1112,35 @@ TEST(CAPI, SessionPRun) {
   TF_DeleteStatus(s);
 }
 
+TEST(CAPI, ShapeInferenceError) {
+  // TF_FinishOperation should fail if the shape of the added operation cannot
+  // be inferred.
+  TF_Status* status = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+
+  // Create this failure by trying to add two nodes with incompatible shapes
+  // (A tensor with shape [2] and a tensor with shape [3] cannot be added).
+  const char data[] = {1, 2, 3};
+  const int64_t vec2_dims[] = {2};
+  unique_tensor_ptr vec2_tensor(
+      Int8Tensor(vec2_dims, TF_ARRAYSIZE(vec2_dims), data), TF_DeleteTensor);
+  TF_Operation* vec2 = Const(vec2_tensor.get(), graph, status, "vec2");
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  const int64_t vec3_dims[] = {3};
+  unique_tensor_ptr vec3_tensor(
+      Int8Tensor(vec3_dims, TF_ARRAYSIZE(vec3_dims), data), TF_DeleteTensor);
+  TF_Operation* vec3 = Const(vec3_tensor.get(), graph, status, "vec3");
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Operation* add = Add(vec2, vec3, graph, status);
+  ASSERT_NE(TF_OK, TF_GetCode(status));
+  ASSERT_TRUE(add == nullptr);
+
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(status);
+}
+
 TEST(CAPI, ColocateWith) {
   TF_Status* s = TF_NewStatus();
   TF_Graph* graph = TF_NewGraph();
@@ -1482,16 +1574,280 @@ TEST_F(CApiWhileLoopTest, BadTypes) {
   TF_AbortWhile(params_.get());
 }
 
-// Create a tensor with values of type TF_INT8 provided by `values`.
-TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) {
-  int64_t num_values = 1;
-  for (int i = 0; i < num_dims; ++i) {
-    num_values *= dims[i];
+REGISTER_OP("TestOpWithNoGradient")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {float, double}")
+    .Doc(R"doc(
+Test op with no grad registered.
+
+x: input
+y: output
+)doc")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape);
+
+class CApiGradientsTest : public ::testing::Test {
+ protected:
+  CApiGradientsTest()
+      : s_(TF_NewStatus()),
+        graph_(TF_NewGraph()),
+        expected_graph_(TF_NewGraph()) {}
+
+  ~CApiGradientsTest() override {
+    TF_DeleteGraph(graph_);
+    TF_DeleteGraph(expected_graph_);
+    TF_DeleteStatus(s_);
   }
-  TF_Tensor* t =
-      TF_AllocateTensor(TF_INT8, dims, num_dims, sizeof(char) * num_values);
-  memcpy(TF_TensorData(t), values, sizeof(char) * num_values);
-  return t;
+
+  void TestGradientsSuccess(bool grad_inputs_provided) {
+    TF_Output inputs[2];
+    TF_Output outputs[1];
+    TF_Output grad_outputs[2];
+    TF_Output expected_grad_outputs[2];
+
+    BuildSuccessGraph(inputs, outputs);
+    BuildExpectedGraph(grad_inputs_provided, expected_grad_outputs);
+
+    AddGradients(grad_inputs_provided, inputs, 2, outputs, 1, grad_outputs);
+
+    EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+    // Compare that the graphs match.
+    GraphDef expected_gdef;
+    GraphDef gdef;
+    EXPECT_TRUE(GetGraphDef(expected_graph_, &expected_gdef));
+    EXPECT_TRUE(GetGraphDef(graph_, &gdef));
+    TF_EXPECT_GRAPH_EQ(expected_gdef, gdef);
+
+    // Compare that the output of the gradients of both graphs match.
+    RunGraphsAndCompareOutputs(grad_outputs, expected_grad_outputs);
+  }
+
+  void TestGradientsError(bool grad_inputs_provided) {
+    TF_Output inputs[1];
+    TF_Output outputs[1];
+    TF_Output grad_outputs[1];
+
+    BuildErrorGraph(inputs, outputs);
+
+    AddGradients(grad_inputs_provided, inputs, 1, outputs, 1, grad_outputs);
+
+    string expected_msg =
+        "No gradient defined for op: TestOpWithNoGradient. Please see "
+        "https://www.tensorflow.org/code/"
+        "tensorflow/cc/gradients/README.md"
+        " for instructions on how to add C++ gradients.";
+    EXPECT_EQ(expected_msg, TF_Message(s_));
+  }
+
+  // Run the graph and ensure that the gradient values are as expected.
+  void RunGraphsAndCompareOutputs(TF_Output* grad_outputs,
+                                  TF_Output* expected_grad_outputs) {
+    std::unique_ptr<CSession> csession(new CSession(graph_, s_));
+    std::unique_ptr<CSession> expected_csession(
+        new CSession(expected_graph_, s_));
+
+    std::vector<TF_Output> grad_outputs_vec;
+    grad_outputs_vec.assign(grad_outputs, grad_outputs + 2);
+    csession->SetOutputs(grad_outputs_vec);
+    csession->Run(s_);
+    ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+    TF_Tensor* out0 = csession->output_tensor(0);
+    TF_Tensor* out1 = csession->output_tensor(1);
+
+    std::vector<TF_Output> expected_grad_outputs_vec;
+    expected_grad_outputs_vec.assign(expected_grad_outputs,
+                                     expected_grad_outputs + 2);
+    expected_csession->SetOutputs(expected_grad_outputs_vec);
+    expected_csession->Run(s_);
+    ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+    TF_Tensor* expected_out0 = expected_csession->output_tensor(0);
+    TF_Tensor* expected_out1 = expected_csession->output_tensor(1);
+
+    CompareTensors(out0, expected_out0);
+    CompareTensors(out1, expected_out1);
+  }
+
+  void CompareTensors(TF_Tensor* a, TF_Tensor* b) {
+    float* a_data = static_cast<float*>(TF_TensorData(a));
+    float* b_data = static_cast<float*>(TF_TensorData(b));
+    EXPECT_EQ(*a_data, *b_data);
+  }
+
+  void AddGradients(bool grad_inputs_provided, TF_Output* inputs, int ninputs,
+                    TF_Output* outputs, int noutputs, TF_Output* grad_outputs) {
+    if (grad_inputs_provided) {
+      TF_Output grad_inputs[1];
+      const float grad_inputs_val[] = {1.0, 1.0, 1.0, 1.0};
+      TF_Operation* grad_inputs_op =
+          FloatConst2x2(graph_, s_, grad_inputs_val, "GradInputs");
+      grad_inputs[0] = TF_Output{grad_inputs_op, 0};
+      TF_AddGradients(graph_, outputs, noutputs, inputs, ninputs, grad_inputs,
+                      s_, grad_outputs);
+    } else {
+      TF_AddGradients(graph_, outputs, noutputs, inputs, ninputs, nullptr, s_,
+                      grad_outputs);
+    }
+  }
+
+  void BuildErrorGraph(TF_Output* inputs, TF_Output* outputs) {
+    const float const0_val[] = {1.0, 2.0, 3.0, 4.0};
+    TF_Operation* const0 = FloatConst2x2(graph_, s_, const0_val, "Const_0");
+    TF_Operation* nograd = NoGradientOp(graph_, s_, const0, "NoGrad");
+    inputs[0] = TF_Output{const0, 0};
+    outputs[0] = TF_Output{nograd, 0};
+    EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  }
+
+  void BuildSuccessGraph(TF_Output* inputs, TF_Output* outputs) {
+    // Construct the following graph:
+    //            |
+    //           z|
+    //            |
+    //          MatMul
+    //         /       \
+    //        ^         ^
+    //        |         |
+    //       x|        y|
+    //        |         |
+    //        |         |
+    //      Const_0    Const_1
+    //
+    const float const0_val[] = {1.0, 2.0, 3.0, 4.0};
+    const float const1_val[] = {1.0, 0.0, 0.0, 1.0};
+    TF_Operation* const0 = FloatConst2x2(graph_, s_, const0_val, "Const_0");
+    TF_Operation* const1 = FloatConst2x2(graph_, s_, const1_val, "Const_1");
+    TF_Operation* matmul = MatMul(graph_, s_, const0, const1, "MatMul");
+    inputs[0] = TF_Output{const0, 0};
+    inputs[1] = TF_Output{const1, 0};
+    outputs[0] = TF_Output{matmul, 0};
+    EXPECT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+  }
+
+  void BuildExpectedGraph(bool grad_inputs_provided,
+                          TF_Output* expected_grad_outputs) {
+    // The expected graph looks like this if grad_inputs_provided.
+    // If grad_inputs_provided is false, Const_0 will be a OnesLike op.
+    //      ^             ^
+    //    dy|           dx|        // MatMul Gradient Graph
+    //      |             |
+    //   MatMul_2      MatMul_1
+    //   ^   ^          ^    ^
+    //   |   |----------|    |
+    //   |        ^          |
+    //   |      dz|          |
+    //   |        |          |
+    //   |     Const_3       |
+    //   |                   |
+    //   |        ^          |
+    //   |       z|          |     // MatMul Forward Graph
+    //   |        |          |
+    //   |      MatMul       |
+    //   |     /       \     |
+    //   |    ^         ^    |
+    //   |    |         |    |
+    //   |---x|        y|----|
+    //        |         |
+    //        |         |
+    //      Const_0   Const_1
+    //
+    const float const0_val[] = {1.0, 2.0, 3.0, 4.0};
+    const float const1_val[] = {1.0, 0.0, 0.0, 1.0};
+    TF_Operation* const0 =
+        FloatConst2x2(expected_graph_, s_, const0_val, "Const_0");
+    TF_Operation* const1 =
+        FloatConst2x2(expected_graph_, s_, const1_val, "Const_1");
+    TF_Operation* matmul =
+        MatMul(expected_graph_, s_, const0, const1, "MatMul");
+
+    TF_Operation* const3;
+    if (grad_inputs_provided) {
+      const float const3_val[] = {1.0, 1.0, 1.0, 1.0};
+      const3 = FloatConst2x2(expected_graph_, s_, const3_val, "GradInputs");
+    } else {
+      const3 = OnesLike(expected_graph_, s_, matmul, "OnesLike");
+    }
+
+    TF_Operation* matmul1 =
+        MatMul(expected_graph_, s_, const3, const1, "MatMul_1", false, true);
+    TF_Operation* matmul2 =
+        MatMul(expected_graph_, s_, const0, const3, "MatMul_2", true, false);
+    expected_grad_outputs[0] = {matmul1, 0};
+    expected_grad_outputs[1] = {matmul2, 0};
+  }
+
+  TF_Tensor* FloatTensor2x2(const float* values) {
+    const int64_t dims[2] = {2, 2};
+    TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, dims, 2, sizeof(float) * 4);
+    memcpy(TF_TensorData(t), values, sizeof(float) * 4);
+    return t;
+  }
+
+  TF_Operation* FloatConst2x2(TF_Graph* graph, TF_Status* s,
+                              const float* values, const char* name) {
+    unique_tensor_ptr tensor(FloatTensor2x2(values), TF_DeleteTensor);
+    TF_OperationDescription* desc = TF_NewOperation(graph, "Const", name);
+    TF_SetAttrTensor(desc, "value", tensor.get(), s);
+    if (TF_GetCode(s) != TF_OK) return nullptr;
+    TF_SetAttrType(desc, "dtype", TF_FLOAT);
+    TF_Operation* op = TF_FinishOperation(desc, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    return op;
+  }
+
+  TF_Operation* MatMul(TF_Graph* graph, TF_Status* s, TF_Operation* l,
+                       TF_Operation* r, const char* name,
+                       bool transpose_a = false, bool transpose_b = false) {
+    TF_OperationDescription* desc = TF_NewOperation(graph, "MatMul", name);
+    if (transpose_a) {
+      TF_SetAttrBool(desc, "transpose_a", 1);
+    }
+    if (transpose_b) {
+      TF_SetAttrBool(desc, "transpose_b", 1);
+    }
+    TF_AddInput(desc, {l, 0});
+    TF_AddInput(desc, {r, 0});
+    TF_Operation* op = TF_FinishOperation(desc, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    return op;
+  }
+
+  TF_Operation* OnesLike(TF_Graph* graph, TF_Status* s, TF_Operation* in,
+                         const char* name) {
+    TF_OperationDescription* desc = TF_NewOperation(graph, "OnesLike", name);
+    TF_AddInput(desc, {in, 0});
+    TF_Operation* op = TF_FinishOperation(desc, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    return op;
+  }
+
+  TF_Operation* NoGradientOp(TF_Graph* graph, TF_Status* s, TF_Operation* in,
+                             const char* name) {
+    TF_OperationDescription* desc =
+        TF_NewOperation(graph, "TestOpWithNoGradient", name);
+    TF_AddInput(desc, {in, 0});
+    TF_Operation* op = TF_FinishOperation(desc, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+    return op;
+  }
+
+  TF_Status* s_;
+  TF_Graph* graph_;
+  TF_Graph* expected_graph_;
+};
+
+TEST_F(CApiGradientsTest, Gradients_GradInputs) { TestGradientsSuccess(true); }
+
+TEST_F(CApiGradientsTest, Gradients_NoGradInputs) {
+  TestGradientsSuccess(false);
+}
+
+TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_GradInputs) {
+  TestGradientsError(true);
+}
+
+TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) {
+  TestGradientsError(false);
 }
 
 void StringVectorToArrays(const std::vector<string>& v,
@@ -1509,9 +1865,13 @@ void StringVectorToArrays(const std::vector<string>& v,
 // Registers two ops, each with a single attribute called 'v'.
 // The attribute in one op will have a type 'type', the other
 // will have list(type).
-#define ATTR_TEST_REGISTER_OP(type)                            \
-  REGISTER_OP("CApiAttributesTestOp" #type).Attr("v: " #type); \
-  REGISTER_OP("CApiAttributesTestOpList" #type).Attr("v: list(" #type ")")
+#define ATTR_TEST_REGISTER_OP(type)                           \
+  REGISTER_OP("CApiAttributesTestOp" #type)                   \
+      .Attr("v: " #type)                                      \
+      .SetShapeFn(tensorflow::shape_inference::UnknownShape); \
+  REGISTER_OP("CApiAttributesTestOpList" #type)               \
+      .Attr("v: list(" #type ")")                             \
+      .SetShapeFn(tensorflow::shape_inference::UnknownShape)
 ATTR_TEST_REGISTER_OP(string);
 ATTR_TEST_REGISTER_OP(int);
 ATTR_TEST_REGISTER_OP(float);
diff --git a/tensorflow/c/exported_symbols.lds b/tensorflow/c/exported_symbols.lds
new file mode 100644
index 0000000000000000000000000000000000000000..a14bdaa48be55641a652795e2677b16e86918c11
--- /dev/null
+++ b/tensorflow/c/exported_symbols.lds
@@ -0,0 +1 @@
+_TF_*
diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..73d427d9b2280123f9d54cdd7e4f9a76a7dddad1
--- /dev/null
+++ b/tensorflow/c/generate-pc.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+TF_PREFIX='/usr/local'
+
+usage() {
+    echo "Usage: $0 OPTIONS"
+    echo -e "-p, --prefix\tset installation prefix (default: /usr/local)"
+    echo -e "-v, --version\tset TensorFlow version"
+    echo -e "-h, --help\tdisplay this message"
+}
+
+[ $# == 0 ] && usage && exit 0
+
+# read the options
+ARGS=`getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@"`
+eval set -- "$ARGS"
+
+# extract options and their arguments into variables.
+while true ; do
+    case "$1" in
+        -h|--help) usage ; exit ;;
+        -p|--prefix)
+            case "$2" in
+                "") shift 2 ;;
+                *) TF_PREFIX=$2 ; shift 2 ;;
+            esac ;;
+        -v|--version)
+            case "$2" in
+                "") shift 2 ;;
+                *) TF_VERSION=$2 ; shift 2 ;;
+            esac ;;
+        --) shift ; break ;;
+        *) echo "Internal error! Try '$0 --help' for more information." ; exit 1 ;;
+    esac
+done
+
+[ -z $TF_VERSION ] && echo "Specify a version using -v or --version" && exit 1
+
+echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX"
+
+cat << EOF > tensorflow.pc
+prefix=${TF_PREFIX}
+exec_prefix=\${prefix}
+libdir=\${exec_prefix}/lib
+includedir=\${prefix}/include
+
+Name: TensorFlow
+Version: ${TF_VERSION}
+Description: Library for computation using data flow graphs for scalable machine learning
+Requires:
+Libs: -L\${libdir} -ltensorflow
+Cflags: -I\${includedir}
+EOF
diff --git a/tensorflow/c/version_script.lds b/tensorflow/c/version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..455bd7362bb36d30af421a17f0e2f8e9ba66e02b
--- /dev/null
+++ b/tensorflow/c/version_script.lds
@@ -0,0 +1,9 @@
+VERS_1.0 {
+  # Export symbols in c_api.h.
+  global:
+    TF_*;
+
+  # Hide everything else.
+  local:
+    *;
+};
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index aaebdded9a5b3232bec824b0768a536e36349204..8d4260a0b9ca38593a912398e8460d826fb31ccf 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -91,6 +91,7 @@ cc_library(
     deps = [
         ":array_grad",
         ":math_grad",
+        ":nn_grad",
     ],
 )
 
@@ -122,7 +123,10 @@ cc_library_with_android_deps(
 
 cc_library_with_android_deps(
     name = "scope",
-    srcs = ["framework/scope.cc"],
+    srcs = [
+        "framework/scope.cc",
+        "framework/scope_internal.h",
+    ],
     hdrs = ["framework/scope.h"],
     android_deps = ["//tensorflow/core:android_tensorflow_lib"],
     common_deps = [
@@ -136,6 +140,15 @@ cc_library_with_android_deps(
     ],
 )
 
+cc_library_with_android_deps(
+    name = "scope_internal",
+    hdrs = ["framework/scope_internal.h"],
+    common_deps = [
+        ":scope",
+    ],
+    deps = [],
+)
+
 tf_cc_test(
     name = "framework_scope_test",
     srcs = ["framework/scope_test.cc"],
@@ -376,6 +389,16 @@ tf_gen_op_wrappers_cc(
     visibility = ["//tensorflow:internal"],
 )
 
+tf_gen_op_wrappers_cc(
+    name = "functional_ops",
+    include_internal_ops = 1,
+    op_lib_names = [
+        "functional_ops",
+    ],
+    pkg = "//tensorflow/core",
+    visibility = ["//tensorflow:internal"],
+)
+
 tf_gen_op_wrappers_cc(
     name = "resource_variable_ops",
     include_internal_ops = 1,
diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc
index 2732f3f5010d7522a1cf8631183e9b4df7ac86d8..2879445441d0a80c1320a30976412b416feaecc9 100644
--- a/tensorflow/cc/client/client_session.cc
+++ b/tensorflow/cc/client/client_session.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/cc/client/client_session.h"
 
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/platform/env.h"
@@ -31,7 +32,7 @@ class ClientSession::Impl {
   friend class ClientSession;
 
   Impl(Session* session, std::shared_ptr<Graph> graph)
-      : session_(session), graph_(graph) {}
+      : session_(session), graph_(std::move(graph)) {}
 
   static SessionOptions MakeDefaultSessionOptions(const string& target);
   Status MaybeExtendGraph() const;
diff --git a/tensorflow/cc/client/client_session_test.cc b/tensorflow/cc/client/client_session_test.cc
index 9c0f00f2b128c7d06bc7c0ca8579d4ca8e530fe8..dfbac9788e16e9c7c65abcd1ea213b51d5d5d060 100644
--- a/tensorflow/cc/client/client_session_test.cc
+++ b/tensorflow/cc/client/client_session_test.cc
@@ -49,7 +49,7 @@ TEST(ClientSessionTest, Feed) {
 
 TEST(ClientSessionTest, Extend) {
   Scope root = Scope::NewRootScope();
-  auto a = Placeholder(root, DT_INT32);
+  auto a = Placeholder(root, DT_INT32, Placeholder::Shape({2}));
   auto c = Add(root, a, {2, 2});
   ClientSession session(root);
   std::vector<Tensor> outputs;
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index 22cd7fb0d438db9d9f7f29f5386c7a9722afe43d..71aa986f918de68822d457422f6c7a73d6253819 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -126,7 +126,11 @@ string PrintString(const string& str) {
   return strings::StrCat("\"", str_util::CEscape(str), "\"");
 }
 
-string PrintTensorShape(const TensorShape& shape) {
+string PrintTensorShape(const TensorShapeProto& shape_proto) {
+  PartialTensorShape shape(shape_proto);
+  if (shape.IsIdenticalTo(PartialTensorShape())) {
+    return "::tensorflow::PartialTensorShape() /* unknown */";
+  }
   string ret = "{";
   for (int d = 0; d < shape.dims(); ++d) {
     if (d > 0) strings::StrAppend(&ret, ", ");
@@ -188,7 +192,13 @@ string PrintTensor(const TensorProto& tensor_proto) {
   }
 }
 
-string PrintAttrValue(string op, const AttrValue& attr_value) {
+string PrintTensorProto(const TensorProto& proto) {
+  return strings::StrCat("Input::Initializer(", "{", PrintTensor(proto), "}, ",
+                         PrintTensorShape(proto.tensor_shape()),
+                         ").AsTensorProto()");
+}
+
+string PrintAttrValue(const string& op, const AttrValue& attr_value) {
   switch (attr_value.value_case()) {
     case AttrValue::kS:
       return PrintString(attr_value.s());
@@ -203,12 +213,9 @@ string PrintAttrValue(string op, const AttrValue& attr_value) {
     case AttrValue::kType:
       return EnumName_DataType(attr_value.type());
     case AttrValue::kShape:
-      return PrintTensorShape(TensorShape(attr_value.shape()));
+      return PrintTensorShape(attr_value.shape());
     case AttrValue::kTensor:
-      return strings::StrCat(
-          "Input::Initializer(", "{", PrintTensor(attr_value.tensor()), "}, ",
-          PrintTensorShape(TensorShape(attr_value.tensor().tensor_shape())),
-          ").AsTensorProto()");
+      return PrintTensorProto(attr_value.tensor());
     case AttrValue::kList: {
       string ret = "{";
       if (attr_value.list().s_size() > 0) {
@@ -241,8 +248,14 @@ string PrintAttrValue(string op, const AttrValue& attr_value) {
       } else if (attr_value.list().shape_size() > 0) {
         for (int i = 0; i < attr_value.list().shape_size(); ++i) {
           if (i > 0) strings::StrAppend(&ret, ", ");
-          strings::StrAppend(
-              &ret, PrintTensorShape(TensorShape(attr_value.list().shape(i))));
+          strings::StrAppend(&ret,
+                             PrintTensorShape(attr_value.list().shape(i)));
+        }
+      } else if (attr_value.list().tensor_size() > 0) {
+        for (int i = 0; i < attr_value.list().tensor_size(); ++i) {
+          if (i > 0) strings::StrAppend(&ret, ", ");
+          strings::StrAppend(&ret,
+                             PrintTensorProto(attr_value.list().tensor(i)));
         }
       }
       strings::StrAppend(&ret, "}");
@@ -292,8 +305,8 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
           {"list(bool)", {"gtl::ArraySlice<bool>", true}},
           {"type", {"DataType", false}},
           {"list(type)", {"DataTypeSlice", true}},
-          {"shape", {"TensorShape", false}},
-          {"list(shape)", {"gtl::ArraySlice<TensorShape>", true}},
+          {"shape", {"PartialTensorShape", false}},
+          {"list(shape)", {"gtl::ArraySlice<PartialTensorShape>", true}},
           {"tensor", {"TensorProto", true}},
           {"list(tensor)", {"gtl::ArraySlice<TensorProto>", true}},
           {"func", {"NameAttrList", true}},
@@ -717,7 +730,7 @@ void OpInfo::GetOutput(string* out) const {
     // One output, no need for NameRangeMap
     if (is_list_output[0]) {
       strings::StrAppend(out,
-                         "  for (int64 i = 0; i < ret->num_outputs(); ++i)\n");
+                         "  for (int32 i = 0; i < ret->num_outputs(); ++i)\n");
       strings::StrAppend(out, "    this->", output_names[0],
                          ".push_back(Output(ret, i));\n");
     } else {
@@ -727,11 +740,10 @@ void OpInfo::GetOutput(string* out) const {
     return;
   }
   strings::StrAppend(out, "  ::tensorflow::NameRangeMap _outputs_range;\n");
-  strings::StrAppend(
-      out,
-      "  ::tensorflow::Status _status_ = "
-      "::tensorflow::NameRangesForNode(ret->def(), ret->op_def(), "
-      "nullptr, &_outputs_range);\n");
+  strings::StrAppend(out,
+                     "  ::tensorflow::Status _status_ = "
+                     "::tensorflow::NameRangesForNode(*ret, ret->op_def(), "
+                     "nullptr, &_outputs_range);\n");
   strings::StrAppend(out, "  if (!_status_.ok()) {\n", "    ", scope_str,
                      ".UpdateStatus(_status_);\n", "    return;\n");
   strings::StrAppend(out, "  }\n\n");
@@ -740,7 +752,7 @@ void OpInfo::GetOutput(string* out) const {
     const string arg_range = strings::StrCat(
         "_outputs_range[\"", graph_op_def.output_arg(i).name(), "\"]");
     if (is_list_output[i]) {
-      strings::StrAppend(out, "  for (int64 i = ", arg_range, ".first; i < ",
+      strings::StrAppend(out, "  for (int32 i = ", arg_range, ".first; i < ",
                          arg_range, ".second; ++i)\n");
       strings::StrAppend(out, "    this->", output_names[i],
                          ".push_back(Output(ret, i));\n");
diff --git a/tensorflow/cc/framework/cc_ops_test.cc b/tensorflow/cc/framework/cc_ops_test.cc
index 6dc0d84c16d5b534341575b384997cc398c80bec..5da23036eaadbef270ba839357dc4613bf3bf490 100644
--- a/tensorflow/cc/framework/cc_ops_test.cc
+++ b/tensorflow/cc/framework/cc_ops_test.cc
@@ -32,10 +32,11 @@ Output Linear(const Scope& scope, Input x, Input w, Input b) {
   return BiasAdd(cop_scopes.last, m, b);
 }
 
-void GetColocationConstraints(Output tensor, std::vector<string>* constraints) {
+void GetColocationConstraints(const Output& tensor,
+                              std::vector<string>* constraints) {
   constraints->clear();
-  TF_EXPECT_OK(
-      GetNodeAttr(tensor.op().node()->def(), kColocationAttrName, constraints));
+  TF_EXPECT_OK(GetNodeAttr(tensor.op().node()->attrs(), kColocationAttrName,
+                           constraints));
 }
 
 }  // namespace
@@ -158,11 +159,11 @@ TEST(CCOpTest, KernelLabel) {
   Scope root = Scope::NewRootScope();
   auto add = Add(root.WithKernelLabel("AddWithKernelLabel"), 1.0f, 2.0f);
   TF_EXPECT_OK(root.status());
-  const auto& attrs = add.z.op().node()->def().attr();
-  ASSERT_TRUE(attrs.find("_kernel") != attrs.end());
-  auto kernel_attr = attrs.find("_kernel")->second;
-  TF_EXPECT_OK(AttrValueHasType(kernel_attr, "string"));
-  EXPECT_EQ(kernel_attr.s(), "AddWithKernelLabel");
+  AttrSlice attrs = add.z.op().node()->attrs();
+  const auto* kernel_attr = attrs.Find("_kernel");
+  ASSERT_TRUE(kernel_attr);
+  TF_EXPECT_OK(AttrValueHasType(*kernel_attr, "string"));
+  EXPECT_EQ(kernel_attr->s(), "AddWithKernelLabel");
 }
 
 TEST(CCOpTest, ColocateWith) {
@@ -189,8 +190,7 @@ TEST(CCOpTest, ColocateWith) {
 
   Scope with_colocate = root.ColocateWith(c3).ColocateWith(c4);
   auto c6 = Const(with_colocate.WithOpName("c6").ClearColocation(), 7);
-  const auto& attrs = c6.op().node()->def().attr();
-  EXPECT_TRUE(attrs.find("_class") == attrs.end());
+  EXPECT_FALSE(c6.op().node()->attrs().Find("_class"));
 }
 
 TEST(CCOpTest, TemplatedConst) {
diff --git a/tensorflow/cc/framework/grad_op_registry.cc b/tensorflow/cc/framework/grad_op_registry.cc
index 0d6a377b507161c4420a6076b9ee71e799e0223b..254705736e7711e58aa87054f36c8a19eebd4f0d 100644
--- a/tensorflow/cc/framework/grad_op_registry.cc
+++ b/tensorflow/cc/framework/grad_op_registry.cc
@@ -32,7 +32,13 @@ bool GradOpRegistry::Register(const string& op, GradFunc func) {
 Status GradOpRegistry::Lookup(const string& op, GradFunc* func) const {
   auto iter = registry_.find(op);
   if (iter == registry_.end()) {
-    return errors::NotFound("No gradient defined for op: ", op);
+    const string error_msg =
+        "No gradient defined for op: " + op +
+        ". Please see "
+        "https://www.tensorflow.org/code/"
+        "tensorflow/cc/gradients/README.md"
+        " for instructions on how to add C++ gradients.";
+    return errors::NotFound(error_msg);
   }
   *func = iter->second;
   return Status::OK();
diff --git a/tensorflow/cc/framework/gradient_checker.cc b/tensorflow/cc/framework/gradient_checker.cc
index 849a8eed6f23fb8dd1290d1bfa9db9c47d5d9f9d..8f20ff1457b219da3f11d9ffdafdd470875b25b0 100644
--- a/tensorflow/cc/framework/gradient_checker.cc
+++ b/tensorflow/cc/framework/gradient_checker.cc
@@ -40,8 +40,8 @@ Status ComputeTheoreticalJacobianTranspose(
     const std::vector<Tensor>& x_datas, const OutputList& ys,
     const std::vector<TensorShape>& y_shapes,
     std::vector<Tensor>& jacobian_ts) {
-  int y_num = y_shapes.size();
-  int x_num = x_shapes.size();
+  size_t y_num = y_shapes.size();
+  size_t x_num = x_shapes.size();
   // Call AddSymbolicGradients to get 'dxs' (we will feed 'dys').
   OutputList dys;
   for (const auto& y_shape : y_shapes) {
@@ -130,8 +130,8 @@ Status ComputeNumericJacobianTranspose(const Scope& scope, const OutputList& xs,
                                        const T delta,
                                        std::vector<Tensor>& x_datas,
                                        std::vector<Tensor>& jacobian_ts) {
-  int y_num = y_shapes.size();
-  int x_num = x_shapes.size();
+  size_t y_num = y_shapes.size();
+  size_t x_num = x_shapes.size();
 
   ClientSession session(scope);
   for (int x_idx = 0; x_idx < x_num; x_idx++) {
@@ -176,8 +176,8 @@ void InitJacobians(const OutputList& xs,
                    const std::vector<TensorShape>& x_shapes,
                    const std::vector<TensorShape>& y_shapes,
                    std::vector<Tensor>& jacobians) {
-  int y_num = y_shapes.size();
-  int x_num = x_shapes.size();
+  size_t y_num = y_shapes.size();
+  size_t x_num = x_shapes.size();
 
   jacobians.resize(y_num * x_num);
   for (int x_idx = 0; x_idx < x_num; x_idx++) {
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index 2c60f947a55479e27937b98de91d80b559d32576..8c00a6f70497df2c70f266a747197e50c98375bb 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -210,8 +210,8 @@ Status SymbolicGradientBuilder::Initialize() {
 
   {
     // Initialize backprop with `grad_inputs_`.
-    const int num_dy = grad_inputs_.size();
-    for (int i = 0; i < num_dy; ++i) {
+    const size_t num_dy = grad_inputs_.size();
+    for (size_t i = 0; i < num_dy; ++i) {
       TF_RETURN_IF_ERROR(BackpropAlongEdge(grad_inputs_[i], outputs_[i]));
     }
   }
@@ -308,7 +308,7 @@ Status SymbolicGradientBuilder::AddGradients() {
       continue;
     }
 
-    const int num_no_grad = no_grad_dy_indices.size();
+    const size_t num_no_grad = no_grad_dy_indices.size();
     if (IsPrimitiveOpWithNoGrad(n->type_string()) || num_no_grad == num_y) {
       // No grad defined for this op, or all outputs returned 'NoGradient':
       // Backprop 'NoGradient' along the in edges.
@@ -367,6 +367,19 @@ Status AddSymbolicGradients(const Scope& scope,
   return builder.AddGradients();
 }
 
+Status AddSymbolicGradients(const Scope& scope,
+                            const std::vector<Output>& outputs,
+                            const std::vector<Output>& inputs,
+                            std::vector<Output>* grad_outputs) {
+  std::vector<Output> grad_inputs;
+  grad_inputs.reserve(outputs.size());
+  for (const Output& output : outputs) {
+    grad_inputs.emplace_back(ops::OnesLike(scope, output));
+  }
+  return AddSymbolicGradients(scope, outputs, inputs, grad_inputs,
+                              grad_outputs);
+}
+
 Output NoGradient() { return SymbolicGradientBuilder::NoGradient(); }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/cc/framework/gradients.h b/tensorflow/cc/framework/gradients.h
index d076bc43b4fbb1c8911b52c5ab258b7e9837113b..717f6f0636d3dd1a546ef7477b100bbfc86ba13d 100644
--- a/tensorflow/cc/framework/gradients.h
+++ b/tensorflow/cc/framework/gradients.h
@@ -27,16 +27,19 @@ namespace tensorflow {
 /// derivatives of some loss function 'L' w.r.t 'outputs'), adds gradient nodes
 /// to the graph associated with 'scope', which compute (and return in
 /// 'grad_outputs') the symbolic partial derivatives of 'L' w.r.t 'inputs'.
-///
-
-// TODO(andydavis) Add overload of this function with no 'grad_inputs' arg.
-// Implementation will fill in 'OnesLike' for all shapes in 'outputs'.
 Status AddSymbolicGradients(const Scope& scope,
                             const std::vector<Output>& outputs,
                             const std::vector<Output>& inputs,
                             const std::vector<Output>& grad_inputs,
                             std::vector<Output>* grad_outputs);
 
+// Same as above, but uses 'OnesLike' for all shapes in
+// 'outputs' as grad_inputs.
+Status AddSymbolicGradients(const Scope& scope,
+                            const std::vector<Output>& outputs,
+                            const std::vector<Output>& inputs,
+                            std::vector<Output>* grad_outputs);
+
 /// Returns a sentinel Output that represents 'no gradient' (i.e. no gradient
 /// flows along some graph edge during backpropagation).
 /// Can be returned in 'grad_outputs' by an invocation of 'AddSymbolicGradients'
diff --git a/tensorflow/cc/framework/gradients_test.cc b/tensorflow/cc/framework/gradients_test.cc
index 6c2c2fcd1e2c5941dadebfbc78fb5bae9122e7c3..6a249825812b4d39b55f7170a35436b6ae88c020 100644
--- a/tensorflow/cc/framework/gradients_test.cc
+++ b/tensorflow/cc/framework/gradients_test.cc
@@ -40,7 +40,7 @@ class GradientsTest : public ::testing::Test {
     TF_ASSERT_OK(scope_test_.ToGraphDef(&gdef_test));
     GraphDef gdef_exp;
     TF_ASSERT_OK(scope_expected_.ToGraphDef(&gdef_exp));
-    TF_EXPECT_GRAPH_EQ(gdef_test, gdef_exp);
+    TF_EXPECT_GRAPH_EQ(gdef_exp, gdef_test);
   }
 
   Scope scope_expected_;
@@ -98,6 +98,32 @@ TEST_F(GradientsTest, OneMatMul) {
   CompareTestAndExpectedGraphs();
 }
 
+TEST_F(GradientsTest, OneMatMul_InferGradInputs) {
+  for (const bool expected : {false, true}) {
+    const Scope& scope = expected ? scope_expected_ : scope_test_;
+    // Construct forward graph.
+    auto x = Const(scope, {{1.0, 2.0}, {3.0, 4.0}});
+    auto y = Const(scope, {{1.0, 0.0}, {0.0, 1.0}});
+    auto z = MatMul(scope, x, y);
+    TF_ASSERT_OK(scope.status());
+    CHECK_NOTNULL(z.node());
+
+    if (expected) {
+      // Construct backward graph.
+      // The gradients function adds a OnesLike to create a dz of ones with the
+      // shape of z.
+      auto dz = OnesLike(scope, z);
+      auto dx = MatMul(scope, dz, y, MatMul::TransposeB(true));
+      auto dy = MatMul(scope, x, dz, MatMul::TransposeA(true));
+    } else {
+      // Call AddSymbolicGradients.
+      std::vector<Output> grad_outputs;
+      TF_ASSERT_OK(AddSymbolicGradients(scope, {z}, {x, y}, &grad_outputs));
+    }
+  }
+  CompareTestAndExpectedGraphs();
+}
+
 TEST_F(GradientsTest, TwoMatMuls_Chained) {
   for (const bool expected : {false, true}) {
     const Scope& scope = expected ? scope_expected_ : scope_test_;
@@ -234,7 +260,7 @@ TEST_F(GradientsTest, StackUnstack_StopBackprop) {
 }
 
 TEST_F(GradientsTest, DependentGradOutputs) {
-  // Tests that dependant gradients (in this case the gradients w.r.t to the
+  // Tests that dependent gradients (in this case the gradients w.r.t to the
   // output and one input of MatMul) are computed properly.
 
   // Create two chained MatMul ops.
diff --git a/tensorflow/cc/framework/ops.cc b/tensorflow/cc/framework/ops.cc
index 50df891a4c434ad58e962d7a31599df08cedaeb7..920a8e7955631ba0d33d2d36506703e107420a69 100644
--- a/tensorflow/cc/framework/ops.cc
+++ b/tensorflow/cc/framework/ops.cc
@@ -20,7 +20,7 @@ namespace tensorflow {
 
 Operation::Operation(Node* n) : inputs_(GetInputs(n)), node_(n) {}
 
-Output Operation::input(int i) const {
+Output Operation::input(int32 i) const {
   CHECK_NOTNULL(node_);
   CHECK_GE(i, 0);
   CHECK_LT(i, node_->num_inputs());
@@ -37,14 +37,14 @@ Output Operation::input(int i) const {
   return Output(inputs_[i].first, inputs_[i].second);
 }
 
-Output Operation::output(int i) const {
+Output Operation::output(int32 i) const {
   CHECK_NOTNULL(node_);
   CHECK_GE(i, 0);
   CHECK_LT(i, node_->num_outputs());
   return Output(node_, i);
 }
 
-uint64 Operation::hash(int64 index) const {
+uint64 Operation::hash(int32 index) const {
   return ::tensorflow::Hash64(reinterpret_cast<const char*>(&node_),
                               sizeof(Node*), index);
 }
diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h
index 889d5db31dd06fd25b7a72e209a8d7f37b8429ca..8d4154220c4b18f9286094b10c1b1e96eb4e31e7 100644
--- a/tensorflow/cc/framework/ops.h
+++ b/tensorflow/cc/framework/ops.h
@@ -39,22 +39,22 @@ class Operation {
   Operation() : node_(nullptr) {}
   explicit Operation(Node* n);
 
-  int num_inputs() const { return node_->num_inputs(); }
-  DataType input_type(int o) const { return node_->input_type(o); }
-  Output input(int i) const;
+  int32 num_inputs() const { return node_->num_inputs(); }
+  DataType input_type(int32 o) const { return node_->input_type(o); }
+  Output input(int32 i) const;
 
-  int num_outputs() const { return node_->num_outputs(); }
-  DataType output_type(int o) const { return node_->output_type(o); }
-  Output output(int i) const;
+  int32 num_outputs() const { return node_->num_outputs(); }
+  DataType output_type(int32 o) const { return node_->output_type(o); }
+  Output output(int32 i) const;
 
   Node* node() const { return node_; }
 
-  uint64 hash(int64 index) const;
+  uint64 hash(int32 index) const;
 
   bool operator==(const Operation& other) const { return node_ == other.node_; }
 
  private:
-  typedef std::vector<std::pair<Node*, int64>> Inputs;
+  typedef std::vector<std::pair<Node*, int32>> Inputs;
   static Inputs GetInputs(Node* node);
 
   Inputs inputs_;
@@ -66,12 +66,12 @@ class Output {
  public:
   Output() = default;
   explicit Output(Node* n) : op_(n) {}
-  Output(Node* n, int64 index) : op_(n), index_(index) {}
-  Output(const Operation& op, int64 index) : op_(op), index_(index) {}
+  Output(Node* n, int32 index) : op_(n), index_(index) {}
+  Output(const Operation& op, int32 index) : op_(op), index_(index) {}
 
   Operation op() const { return op_; }
   Node* node() const { return op().node(); }
-  int64 index() const { return index_; }
+  int32 index() const { return index_; }
   DataType type() const { return op_.output_type(index_); }
   string name() const { return strings::StrCat(node()->name(), ":", index()); }
   bool operator==(const Output& other) const {
@@ -82,14 +82,14 @@ class Output {
 
  private:
   Operation op_ = Operation(nullptr);
-  int64 index_ = 0;
+  int32 index_ = 0;
 };
 
 /// Hash class that can be used for e.g. storing Outputs in an unordered_map
 struct OutputHash {
   std::size_t operator()(const Output& output) const {
     return Hash64Combine(std::hash<Node*>()(output.node()),
-                         std::hash<int64>()(output.index()));
+                         std::hash<int32>()(output.index()));
   }
 };
 
@@ -230,12 +230,12 @@ class Input {
 
   /// Constructor specifying a node name, index and datatype. This should only
   /// be used for specifying a backward edge, needed by control flow.
-  Input(const string& name, int i, DataType dt)
+  Input(const string& name, int32 i, DataType dt)
       : node_name_(name), index_(i), data_type_(dt) {}
 
   Node* node() const { return output_.node(); }
   string node_name() const { return node_name_; }
-  int index() const { return node_name_.empty() ? output_.index() : index_; }
+  int32 index() const { return node_name_.empty() ? output_.index() : index_; }
   DataType data_type() const { return data_type_; }
   Status status() const { return status_; }
   const Tensor& tensor() const { return tensor_; }
@@ -245,7 +245,7 @@ class Input {
   Output output_ = Output(Operation(nullptr), 0);
   Tensor tensor_;
   const string node_name_ = "";
-  int index_ = 0;
+  int32 index_ = 0;
   DataType data_type_ = DT_INVALID;
 };
 
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 571c6e1e579f630db473ffc1312d1a1f3162f475..32c0822de69da7989ceaa4028539db928b6fcea3 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
-#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/framework/scope_internal.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -25,6 +25,20 @@ limitations under the License.
 namespace tensorflow {
 
 class Scope::Impl {
+ public:
+  // A NameMap is used to keep track of suffixes for names used in a scope. A
+  // name that has not been used so far in a scope will get no suffix. Later
+  // uses of the same name will get suffixes _1, _2, _3, etc. Multiple scopes
+  // can share the same NameMap. For instance, a new scope created using
+  // WithControlDependencies() should would share the same NameMap with the
+  // parent.
+  typedef std::unordered_map<string, int> NameMap;
+
+  Impl(const std::shared_ptr<Graph>& graph,
+       const std::shared_ptr<Status>& status,
+       const std::shared_ptr<NameMap>& name_map,
+       const std::shared_ptr<ShapeRefiner>& refiner);
+
  private:
   friend class Scope;
 
@@ -40,14 +54,6 @@ class Scope::Impl {
     enum class Colocate;
   };
 
-  // A NameMap is used to keep track of suffixes for names used in a scope. A
-  // name that has not been used so far in a scope will get no suffix. Later
-  // uses of the same name will get suffixes _1, _2, _3, etc. Multiple scopes
-  // can share the same NameMap. For instance, a new scope created using
-  // WithControlDependencies() should would share the same NameMap with the
-  // parent.
-  typedef std::unordered_map<string, int> NameMap;
-
   Impl(Graph* graph, Status* status, NameMap* name_map, ShapeRefiner* refiner);
   Impl(const Scope& other, Tags::ScopeName, const string& name,
        bool copy_names);
@@ -116,6 +122,17 @@ Scope::Impl::Impl(Graph* graph, Status* status, NameMap* name_map,
       scope_used_(nullptr),
       colocation_constraints_() {}
 
+Scope::Impl::Impl(const std::shared_ptr<Graph>& graph,
+                  const std::shared_ptr<Status>& status,
+                  const std::shared_ptr<NameMap>& name_map,
+                  const std::shared_ptr<ShapeRefiner>& refiner)
+    : graph_(graph),
+      status_(status),
+      name_map_(name_map),
+      refiner_(refiner),
+      scope_used_(nullptr),
+      colocation_constraints_() {}
+
 Scope Scope::NewRootScope() {
   Graph* graph = new Graph(OpRegistry::Global());
   ShapeRefiner* refiner =
@@ -254,9 +271,9 @@ Scope::Impl::Impl(const Scope& other, Tags::Colocate,
 std::unordered_set<string> Scope::Impl::GetColocationConstraints(
     const Operation& colocate_with_op) const {
   std::unordered_set<string> current_constraints(colocation_constraints_);
-  const NodeDef& node_def = colocate_with_op.node()->def();
+  const AttrSlice attrs = colocate_with_op.node()->attrs();
   std::vector<string> node_constraints;
-  if (GetNodeAttr(node_def, kColocationAttrName, &node_constraints).ok()) {
+  if (GetNodeAttr(attrs, kColocationAttrName, &node_constraints).ok()) {
     for (const string& entry : node_constraints) {
       StringPiece s(entry);
       if (s.Consume(kColocationGroupPrefix)) {
@@ -277,7 +294,7 @@ std::shared_ptr<Graph> Scope::graph_as_shared_ptr() const {
   return impl()->graph_;
 }
 
-Status Scope::status() const { return *impl()->status_; };
+Status Scope::status() const { return *impl()->status_; }
 
 const std::vector<Operation>& Scope::control_deps() const {
   return impl()->control_deps_;
@@ -464,4 +481,26 @@ CompositeOpScopes Scope::GetCompositeOpScopes(
   }
 }
 
+class InternalScope {
+ public:
+  // NewScope doesn't take ownership of the inputs.
+  static Scope NewScope(Graph* graph, Status* status, ShapeRefiner* refiner) {
+    Scope::Impl::NameMap* name_map = new Scope::Impl::NameMap;
+    for (const Node* node : graph->nodes()) {
+      (*name_map)[node->name()] = 0;
+    }
+    // We provide null destructors for these shared ptrs (except for name_map)
+    // since the caller owns them and doesn't want the scope to destroy them.
+    return Scope(new Scope::Impl(
+        std::shared_ptr<Graph>(graph, [](Graph*) {}),
+        std::shared_ptr<Status>(status, [](Status*) {}),
+        std::shared_ptr<Scope::Impl::NameMap>(name_map),
+        std::shared_ptr<ShapeRefiner>(refiner, [](ShapeRefiner*) {})));
+  }
+};
+
+Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner) {
+  return InternalScope::NewScope(graph, status, refiner);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h
index ce70da709630bd402be9c75b3f6a5d638cd4a588..ec3543772d8febfb35488311886f1a4e9586a53e 100644
--- a/tensorflow/cc/framework/scope.h
+++ b/tensorflow/cc/framework/scope.h
@@ -204,6 +204,7 @@ class Scope {
   const std::vector<Operation>& control_deps() const;
 
  private:
+  friend class InternalScope;
   class Impl;
   std::unique_ptr<Impl> impl_;
   Impl* impl() { return impl_.get(); }
diff --git a/tensorflow/cc/framework/scope_internal.h b/tensorflow/cc/framework/scope_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2a911877f0b036080876b348b6a82f2a45df13a
--- /dev/null
+++ b/tensorflow/cc/framework/scope_internal.h
@@ -0,0 +1,33 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+#define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
+
+#include "tensorflow/cc/framework/scope.h"
+
+namespace tensorflow {
+
+class ShapeRefiner;
+
+// NewInternalScope returns a new scope which doesn't take ownership of
+// graph, status, name_map, and refiner.
+// This is intended to enable the C API (which are used by other language
+// bindings) to create a Scope and access C++ functionality (i.e. gradients).
+Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_SCOPE_INTERNAL_H_
diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc
index 26abd2438e652f29a1d25caf689ab0606a12b00a..37f07e71a0dff9144f193679bbcfcf581c1538cf 100644
--- a/tensorflow/cc/gradients/array_grad.cc
+++ b/tensorflow/cc/gradients/array_grad.cc
@@ -43,9 +43,9 @@ Status PackGrad(const Scope& scope, const Operation& op,
                 const std::vector<Output>& grad_inputs,
                 std::vector<Output>* grad_outputs) {
   int N;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "N", &N));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "N", &N));
   int axis;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "axis", &axis));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "axis", &axis));
 
   grad_outputs->reserve(N);
   auto grad_op = Unstack(scope, grad_inputs[0], N, Unstack::Axis(axis));
@@ -60,7 +60,7 @@ Status UnpackGrad(const Scope& scope, const Operation& op,
                   const std::vector<Output>& grad_inputs,
                   std::vector<Output>* grad_outputs) {
   int axis;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "axis", &axis));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "axis", &axis));
   grad_outputs->push_back(Stack(scope, grad_inputs, Stack::Axis(axis)));
   return scope.status();
 }
@@ -162,7 +162,7 @@ Status CheckNumericsGrad(const Scope& scope, const Operation& op,
                          const std::vector<Output>& grad_inputs,
                          std::vector<Output>* grad_outputs) {
   string message;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "message", &message));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "message", &message));
   string err_msg = strings::StrCat(
       "Not a number (NaN) or infinity (Inf) values detected in gradient. ",
       message);
@@ -215,9 +215,9 @@ Status ReverseSequenceGrad(const Scope& scope, const Operation& op,
                            std::vector<Output>* grad_outputs) {
   auto seq_lengths = op.input(1);
   int batch_dim;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "batch_dim", &batch_dim));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "batch_dim", &batch_dim));
   int seq_dim;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "seq_dim", &seq_dim));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "seq_dim", &seq_dim));
   grad_outputs->push_back(
       ReverseSequence(scope, grad_inputs[0], seq_lengths, seq_dim,
                       ReverseSequence::BatchDim(batch_dim)));
@@ -267,7 +267,8 @@ Status SpaceToBatchGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
   int block_size;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "block_size", &block_size));
   grad_outputs->push_back(
       BatchToSpace(scope, grad_inputs[0], op.input(1), block_size));
   grad_outputs->push_back(NoGradient());
@@ -290,7 +291,8 @@ Status BatchToSpaceGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
   int block_size;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "block_size", &block_size));
   grad_outputs->push_back(
       SpaceToBatch(scope, grad_inputs[0], op.input(1), block_size));
   grad_outputs->push_back(NoGradient());
@@ -313,7 +315,8 @@ Status SpaceToDepthGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
   int block_size;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "block_size", &block_size));
   grad_outputs->push_back(DepthToSpace(scope, grad_inputs[0], block_size));
   return scope.status();
 }
@@ -323,7 +326,8 @@ Status DepthToSpaceGrad(const Scope& scope, const Operation& op,
                         const std::vector<Output>& grad_inputs,
                         std::vector<Output>* grad_outputs) {
   int block_size;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "block_size", &block_size));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "block_size", &block_size));
   grad_outputs->push_back(SpaceToDepth(scope, grad_inputs[0], block_size));
   return scope.status();
 }
@@ -333,7 +337,7 @@ Status MirrorPadGrad(const Scope& scope, const Operation& op,
                      const std::vector<Output>& grad_inputs,
                      std::vector<Output>* grad_outputs) {
   string mode;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "mode", &mode));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode));
   grad_outputs->push_back(tensorflow::ops::internal::MirrorPadGrad(
       scope, grad_inputs[0], op.input(1), mode));
   grad_outputs->push_back(NoGradient());
@@ -346,7 +350,7 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op,
                          const std::vector<Output>& grad_inputs,
                          std::vector<Output>* grad_outputs) {
   string mode;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->def(), "mode", &mode));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "mode", &mode));
   grad_outputs->push_back(MirrorPad(scope, grad_inputs[0], op.input(1), mode));
   grad_outputs->push_back(NoGradient());
   return scope.status();
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index aff0653139538820a705371ee9446a3d38ca69b5..8c1a01f518f9ad3a4571c2f36c01d4eae712e813 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -21,6 +21,17 @@ namespace tensorflow {
 namespace ops {
 namespace {
 
+// Conjugate helper function returns the conjugate of an Output if it
+// is complex valued.
+Output ConjugateHelper(const Scope& scope, const Output& out) {
+  DataType dtype = out.type();
+  if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) {
+    return Conj(scope, out);
+  } else {
+    return out;
+  }
+}
+
 // TODO(andydavis) Add control dependencies to gradient functions (as needed).
 
 Status AbsGrad(const Scope& scope, const Operation& op,
@@ -44,9 +55,11 @@ REGISTER_GRADIENT_OP("Neg", NegGrad);
 Status InvGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // dx = dy * (-1 * (y * y))
+  // dy/dx = -1/x^2 = -y^2
+  auto dydx = Neg(scope, Square(scope, op.output(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Neg(scope, Square(scope, op.output(0)))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Inv", InvGrad);
@@ -55,10 +68,12 @@ REGISTER_GRADIENT_OP("Reciprocal", InvGrad);
 Status SquareGrad(const Scope& scope, const Operation& op,
                   const std::vector<Output>& grad_inputs,
                   std::vector<Output>* grad_outputs) {
-  // dx = dy * (2 * x)
+  // dy/dx = (2 * x)
   auto two = Cast(scope, Const(scope, 2), op.input(0).type());
+  auto dydx = Mul(scope, two, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Mul(scope, two, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Square", SquareGrad);
@@ -68,11 +83,12 @@ Status SqrtGrad(const Scope& scope, const Operation& op,
                 std::vector<Output>* grad_outputs) {
   // y = sqrt(x)
   // dy/dx =  0.5 * (1 / sqrt(x)) = 0.5 * (1 / y)
-  // dx = dy * (0.5 * (1 / y))
   auto y_inv = Reciprocal(scope, op.output(0));
   auto half = Cast(scope, Const(scope, 0.5), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Mul(scope, half, y_inv));
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, half, y_inv);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sqrt", SqrtGrad);
@@ -82,14 +98,14 @@ Status RsqrtGrad(const Scope& scope, const Operation& op,
                  std::vector<Output>* grad_outputs) {
   // y = 1/x^1/2 = x^-1/2
   // dy/dx = -1/2 * x^-3/2 = -1/2 * x^-1/2 * x^-1 = -1/2 * y * x^-1
-  // dx = dy * (-1/2 * y * x^-1)
   auto x_inv = Reciprocal(scope, op.input(0));
   auto y = op.output(0);
   auto neghalf = Cast(scope, Const(scope, -0.5), op.input(0).type());
   auto a = Mul(scope, neghalf, x_inv);
-  auto b = Mul(scope, a, y);
-  auto dx = Mul(scope, grad_inputs[0], b);
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, a, y);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Rsqrt", RsqrtGrad);
@@ -97,10 +113,11 @@ REGISTER_GRADIENT_OP("Rsqrt", RsqrtGrad);
 Status ExpGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // y = exp(x)
-  // dy/dx = exp(x)
-  // dx = dy * y
-  grad_outputs->push_back(Mul(scope, grad_inputs[0], op.output(0)));
+  // dy/dx = exp(x) = y
+  // grad(x) = grad(y) * conj(dy/dx)
+  //         = grad(y) * conj(y)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, op.output(0))));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Exp", ExpGrad);
@@ -108,10 +125,12 @@ REGISTER_GRADIENT_OP("Exp", ExpGrad);
 Status Expm1Grad(const Scope& scope, const Operation& op,
                  const std::vector<Output>& grad_inputs,
                  std::vector<Output>* grad_outputs) {
-  // f(x) = expm1(x)
-  // df/dx = exp(x)
-  // dx = dy * exp(x)
-  grad_outputs->push_back(Mul(scope, grad_inputs[0], Exp(scope, op.input(0))));
+  // y = expm1(x)
+  // dy/dx = exp(x)
+  auto dydx = Exp(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Expm1", Expm1Grad);
@@ -119,11 +138,12 @@ REGISTER_GRADIENT_OP("Expm1", Expm1Grad);
 Status LogGrad(const Scope& scope, const Operation& op,
                const std::vector<Output>& grad_inputs,
                std::vector<Output>* grad_outputs) {
-  // f(x) = log(x) = y
-  // df/dx = 1 / x
-  // dx = dy * (1 / x)
+  // y = log(x)
+  // dy/dx = 1 / x
+  auto dydx = Reciprocal(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Mul(scope, grad_inputs[0], Reciprocal(scope, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Log", LogGrad);
@@ -131,12 +151,13 @@ REGISTER_GRADIENT_OP("Log", LogGrad);
 Status Log1pGrad(const Scope& scope, const Operation& op,
                  const std::vector<Output>& grad_inputs,
                  std::vector<Output>* grad_outputs) {
-  // f(x) = log1p(x) = y
-  // df/dx = 1 / (1 + x)
-  // dx = dy * (1 / (1 + x))
+  // y = log1p(x)
+  // dy/dx = 1 / (1 + x)
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
+  auto dydx = Reciprocal(scope, Add(scope, one, op.input(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
   grad_outputs->push_back(
-      Div(scope, grad_inputs[0], Add(scope, one, op.input(0))));
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Log1p", Log1pGrad);
@@ -146,11 +167,12 @@ Status TanhGrad(const Scope& scope, const Operation& op,
                 std::vector<Output>* grad_outputs) {
   // y = tanh(x)
   // dy/dx = 1 - (tanh(x))^2 = 1 - y^2
-  // dx = dy * (1 - y^2)
   auto y2 = Square(scope, op.output(0));
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Sub(scope, one, y2));
-  grad_outputs->push_back(dx);
+  auto dydx = Sub(scope, one, y2);
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Tanh", TanhGrad);
@@ -160,11 +182,13 @@ Status SigmoidGrad(const Scope& scope, const Operation& op,
                    std::vector<Output>* grad_outputs) {
   // y = 1 / (1 + exp(-x))
   // dy/dx = y * (1 - y)
-  // dx = dy * y * (1 - y)
   auto y = op.output(0);
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
-  auto dx = Mul(scope, grad_inputs[0], Mul(scope, y, Sub(scope, one, y)));
-  grad_outputs->push_back(dx);
+  auto dydx = Mul(scope, y, Sub(scope, one, y));
+  // dx = dy * y * (1 - y)
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sigmoid", SigmoidGrad);
@@ -185,9 +209,10 @@ Status SinGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = sin(x)
   // dy/dx = cos(x)
-  // dx = dy * cos(x)
-  auto dx = Mul(scope, grad_inputs[0], Cos(scope, op.input(0)));
-  grad_outputs->push_back(dx);
+  auto dydx = Cos(scope, op.input(0));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Sin", SinGrad);
@@ -197,9 +222,10 @@ Status CosGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = cos(x)
   // dy/dx = -sin(x)
-  // dx = dy * -sin(x)
-  auto dx = Mul(scope, grad_inputs[0], Neg(scope, Sin(scope, op.input(0))));
-  grad_outputs->push_back(dx);
+  auto dydx = Neg(scope, Sin(scope, op.input(0)));
+  // grad(x) = grad(y) * conj(dy/dx)
+  grad_outputs->push_back(
+      Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx)));
   return scope.status();
 }
 REGISTER_GRADIENT_OP("Cos", CosGrad);
@@ -208,12 +234,12 @@ Status AsinGrad(const Scope& scope, const Operation& op,
                 const std::vector<Output>& grad_inputs,
                 std::vector<Output>* grad_outputs) {
   // y = asin(x)
-  // dy/dx = 1 / (1 - x * x)^1/2
-  // dx = dy * (1 / (1 - x * x)^1/2)
+  // dy/dx = 1 / sqrt(1 - x^2)
   auto x2 = Square(scope, op.input(0));
   auto one = Cast(scope, Const(scope, 1.0), op.input(0).type());
   auto dydx = Reciprocal(scope, Sqrt(scope, Sub(scope, one, x2)));
-  auto dx = Mul(scope, grad_inputs[0], dydx);
+  // grad(x) = grad(y) * conj(dy/dx)
+  auto dx = Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -239,9 +265,9 @@ Status TanGrad(const Scope& scope, const Operation& op,
                std::vector<Output>* grad_outputs) {
   // y = tan(x)
   // dy/dx = sec(x)^2 = 1 / cos(x)^2
-  // dx = dy * (1 / cos(x)^2)
   auto dydx = Square(scope, Reciprocal(scope, Cos(scope, op.input(0))));
-  auto dx = Mul(scope, grad_inputs[0], dydx);
+  // grad(x) = grad(y) * conj(dy/dx)
+  auto dx = Mul(scope, grad_inputs[0], ConjugateHelper(scope, dydx));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -324,7 +350,7 @@ Status MatMulGradCommon(const Scope& scope, const Operation& op,
                         const string& attr_adj_x, const string& attr_adj_y,
                         std::vector<Output>* grad_outputs) {
   DataType dtype;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), "T", &dtype));
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->attrs(), "T", &dtype));
   if (dtype == DT_COMPLEX64 || dtype == DT_COMPLEX128) {
     return errors::Unimplemented(
         "MatMul gradient for complex data type is not supported yet.");
@@ -332,8 +358,10 @@ Status MatMulGradCommon(const Scope& scope, const Operation& op,
 
   bool ta;
   bool tb;
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_x, &ta));
-  TF_RETURN_IF_ERROR(GetNodeAttr(op.output(0).node()->def(), attr_adj_y, &tb));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.output(0).node()->attrs(), attr_adj_x, &ta));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.output(0).node()->attrs(), attr_adj_y, &tb));
 
   if (!ta && !tb) {
     return MatMulGradHelper(scope, is_batch, grad_inputs[0], false, op.input(1),
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index d7278929d4651f17d25670934b15e6da33d6a960..de6baa176936bcda7d0899c3795e1fbd37627058 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -56,23 +56,25 @@ class CWiseUnaryGradTest : public ::testing::Test {
     ATAN
   };
 
-  void TestCWiseGrad(UnaryOpType op_type, std::function<float(int)> x_fn,
-                     std::function<float(float)> dy_fn,
-                     std::function<float(float, float)> dx_fn) {
-    Tensor x(DT_FLOAT, {2, 3, 2});
-    auto x_flat = x.flat<float>();
+  template <typename T>
+  void TestCWiseGrad(UnaryOpType op_type, const std::function<T(int)>& x_fn,
+                     const std::function<T(const T&)>& dy_fn,
+                     const std::function<T(const T&, const T&)>& dx_fn) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    Tensor x(dtype, {2, 3, 2});
+    auto x_flat = x.flat<T>();
     for (int i = 0; i < x_flat.size(); ++i) {
       x_flat(i) = x_fn(i);
     }
 
-    Tensor dy(DT_FLOAT, {2, 3, 2});
-    auto dy_flat = dy.flat<float>();
+    Tensor dy(dtype, {2, 3, 2});
+    auto dy_flat = dy.flat<T>();
     for (int i = 0; i < dy_flat.size(); ++i) {
       dy_flat(i) = dy_fn(x_flat(i));
     }
 
-    Tensor dx(DT_FLOAT, {2, 3, 2});
-    auto dx_flat = dx.flat<float>();
+    Tensor dx(dtype, {2, 3, 2});
+    auto dx_flat = dx.flat<T>();
     for (int i = 0; i < dx_flat.size(); ++i) {
       dx_flat(i) = dx_fn(x_flat(i), dy_flat(i));
     }
@@ -146,7 +148,19 @@ class CWiseUnaryGradTest : public ::testing::Test {
     test::ExpectClose(output, dx);
   }
 
-  float RV(std::vector<float> v) { return v[random::New64() % v.size()]; }
+  float RV(const std::vector<float>& v) {
+    return v[random::New64() % v.size()];
+  }
+
+  complex64 CRV(const std::vector<complex64>& v) {
+    return v[random::New64() % v.size()];
+  }
+
+  complex64 conjugate(const complex64& val) {
+    return complex64(val.real(), -val.imag());
+  }
+
+  const complex64 one_{1.0, 0};
 
   Scope scope_;
 };
@@ -155,14 +169,14 @@ TEST_F(CWiseUnaryGradTest, Abs) {
   auto x_fn = [this](const int i) { return RV({-1, 0, 1}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return x * dy; };
-  TestCWiseGrad(ABS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ABS, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Neg) {
   auto x_fn = [this](const int i) { return RV({-1, 0, 1}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return -dy; };
-  TestCWiseGrad(NEG, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(NEG, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Reciprocal) {
@@ -171,14 +185,36 @@ TEST_F(CWiseUnaryGradTest, Reciprocal) {
   auto dx_fn = [this](const float x, const float dy) {
     return -(1 / (x * x)) * dy;
   };
-  TestCWiseGrad(INV, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(INV, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Reciprocal_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64 x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64 x, const complex64 dy) {
+    return -conjugate(one_ / (x * x)) * dy;
+  };
+  TestCWiseGrad<complex64>(INV, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Square) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return RV({0, -7, 7, -8, 8, -9, 9}); };
   auto dx_fn = [this](const float x, const float dy) { return 2 * x * dy; };
-  TestCWiseGrad(SQUARE, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SQUARE, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Square_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(2, 0) * x) * dy;
+  };
+  TestCWiseGrad<complex64>(SQUARE, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sqrt) {
@@ -187,7 +223,18 @@ TEST_F(CWiseUnaryGradTest, Sqrt) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * 0.5 * (1.0 / std::sqrt(x));
   };
-  TestCWiseGrad(SQRT, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SQRT, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sqrt_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(0.5, 0) / std::sqrt(x)) * dy;
+  };
+  TestCWiseGrad<complex64>(SQRT, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Rsqrt) {
@@ -196,7 +243,18 @@ TEST_F(CWiseUnaryGradTest, Rsqrt) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * -0.5 * (1 / std::sqrt(x)) * (1 / x);
   };
-  TestCWiseGrad(RSQRT, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(RSQRT, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Rsqrt_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return conjugate(complex64(-0.5, 0) / std::sqrt(x) / x) * dy;
+  };
+  TestCWiseGrad<complex64>(RSQRT, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Exp) {
@@ -205,7 +263,18 @@ TEST_F(CWiseUnaryGradTest, Exp) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::exp(x);
   };
-  TestCWiseGrad(EXP, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(EXP, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Exp_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::exp(x));
+  };
+  TestCWiseGrad<complex64>(EXP, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Expm1) {
@@ -214,14 +283,36 @@ TEST_F(CWiseUnaryGradTest, Expm1) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::exp(x);
   };
-  TestCWiseGrad(EXPM1, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(EXPM1, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Expm1_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::exp(x));
+  };
+  TestCWiseGrad<complex64>(EXPM1, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Log) {
   auto x_fn = [this](const int i) { return RV({-1, 1, -2, 2, -3, 3, -4, 4}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return dy * (1.0 / x); };
-  TestCWiseGrad(LOG, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(LOG, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Log_Complex) {
+  auto x_fn = [this](const int i) { return CRV({{-1, 0}, {1, 0}, {2, -1}}); };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(one_ / x);
+  };
+  TestCWiseGrad<complex64>(LOG, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Log1p) {
@@ -230,7 +321,20 @@ TEST_F(CWiseUnaryGradTest, Log1p) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (1.0 / (1.0 + x));
   };
-  TestCWiseGrad(LOG1P, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(LOG1P, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Log1p_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{0, 0}, {1e-6, 0}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / (one_ + conjugate(x));
+  };
+  TestCWiseGrad<complex64>(LOG1P, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Tanh) {
@@ -240,7 +344,21 @@ TEST_F(CWiseUnaryGradTest, Tanh) {
     const float y = std::tanh(x);
     return dy * (1.0 - y * y);
   };
-  TestCWiseGrad(TANH, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(TANH, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Tanh_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    const complex64 y = std::tanh(x);
+    return dy * conjugate((one_ - y * y));
+  };
+  TestCWiseGrad<complex64>(TANH, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sigmoid) {
@@ -250,14 +368,28 @@ TEST_F(CWiseUnaryGradTest, Sigmoid) {
     const float y = 1.0 / (1.0 + std::exp(-x));
     return dy * y * (1.0 - y);
   };
-  TestCWiseGrad(SIGMOID, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIGMOID, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sigmoid_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 0}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    const complex64 y = one_ / (one_ + std::exp(-x));
+    return dy * conjugate(y * (one_ - y));
+  };
+  TestCWiseGrad<complex64>(SIGMOID, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sign) {
   auto x_fn = [this](const int i) { return RV({0, -1, 1, -2, 2, -3, 3}); };
   auto dy_fn = [this](const float x) { return x + RV({-2, 2, -3, 3, -4, 4}); };
   auto dx_fn = [this](const float x, const float dy) { return 0.0; };
-  TestCWiseGrad(SIGN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIGN, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Sin) {
@@ -266,7 +398,20 @@ TEST_F(CWiseUnaryGradTest, Sin) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * std::cos(x);
   };
-  TestCWiseGrad(SIN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(SIN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Sin_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(std::cos(x));
+  };
+  TestCWiseGrad<complex64>(SIN, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Cos) {
@@ -275,7 +420,20 @@ TEST_F(CWiseUnaryGradTest, Cos) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * -1.0 * std::sin(x);
   };
-  TestCWiseGrad(COS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(COS, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Cos_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy * conjugate(-std::sin(x));
+  };
+  TestCWiseGrad<complex64>(COS, x_fn, dy_fn, dx_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Asin) {
@@ -284,7 +442,24 @@ TEST_F(CWiseUnaryGradTest, Asin) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (1.0 / std::sqrt(1.0 - x * x));
   };
-  TestCWiseGrad(ASIN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ASIN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Asin_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / conjugate(std::sqrt(one_ - x * x));
+  };
+  // TODO(kbsriram)
+  // Enable test when the asin kernel supports complex numbers
+  if (false) {
+    TestCWiseGrad<complex64>(ASIN, x_fn, dy_fn, dx_fn);
+  }
 }
 
 TEST_F(CWiseUnaryGradTest, Acos) {
@@ -293,7 +468,24 @@ TEST_F(CWiseUnaryGradTest, Acos) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (-1.0 / std::sqrt(1.0 - x * x));
   };
-  TestCWiseGrad(ACOS, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ACOS, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Acos_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / -conjugate(std::sqrt(one_ - x * x));
+  };
+  // TODO(kbsriram)
+  // Add test when the acos kernel supports complex numbers
+  if (false) {
+    TestCWiseGrad<complex64>(ACOS, x_fn, dy_fn, dx_fn);
+  }
 }
 
 TEST_F(CWiseUnaryGradTest, Tan) {
@@ -303,7 +495,25 @@ TEST_F(CWiseUnaryGradTest, Tan) {
     const float cosx = std::cos(x);
     return dy * (1 / (cosx * cosx));
   };
-  TestCWiseGrad(TAN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(TAN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Tan_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    const complex64 cosx = std::cos(x);
+    return dy / conjugate(cosx * cosx);
+  };
+  // TODO(kbsriram)
+  // Enable when tan kernel supports complex inputs
+  if (false) {
+    TestCWiseGrad<complex64>(TAN, x_fn, dy_fn, dx_fn);
+  }
 }
 
 TEST_F(CWiseUnaryGradTest, Atan) {
@@ -312,7 +522,24 @@ TEST_F(CWiseUnaryGradTest, Atan) {
   auto dx_fn = [this](const float x, const float dy) {
     return dy * (1 / (1 + x * x));
   };
-  TestCWiseGrad(ATAN, x_fn, dy_fn, dx_fn);
+  TestCWiseGrad<float>(ATAN, x_fn, dy_fn, dx_fn);
+}
+
+TEST_F(CWiseUnaryGradTest, Atan_Complex) {
+  auto x_fn = [this](const int i) {
+    return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
+  };
+  auto dy_fn = [this](const complex64& x) {
+    return x + CRV({{-2, 2}, {-3, 3}, {1, -4}});
+  };
+  auto dx_fn = [this](const complex64& x, const complex64& dy) {
+    return dy / (one_ + x * x);
+  };
+  // TODO(kbsriram)
+  // Add test when the atan kernel supports complex numbers
+  if (false) {
+    TestCWiseGrad<complex64>(ATAN, x_fn, dy_fn, dx_fn);
+  }
 }
 
 class CWiseUnaryComplexGradTest : public ::testing::Test {
diff --git a/tensorflow/cc/ops/const_op_test.cc b/tensorflow/cc/ops/const_op_test.cc
index 5a4770f879ff9a1422a63a88bd2b67ba201a0567..3184edeb3307cafcbfbc41c6477fd092ab613b46 100644
--- a/tensorflow/cc/ops/const_op_test.cc
+++ b/tensorflow/cc/ops/const_op_test.cc
@@ -28,9 +28,9 @@ void ExpectNodeEqual(const Node* n, gtl::ArraySlice<T> values,
                      TensorShape shape) {
   EXPECT_TRUE(n->IsConstant());
   Tensor tensor;
-  TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor));
+  TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor));
   DataType dtype;
-  TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype));
+  TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype));
   EXPECT_EQ(tensor.dtype(), dtype);
   test::ExpectTensorEqual<T>(tensor, test::AsTensor(values, shape));
 }
@@ -39,9 +39,9 @@ void ExpectTypeAndShape(const Node* n, DataType expected_dtype,
                         TensorShape expected_shape) {
   EXPECT_TRUE(n->IsConstant());
   Tensor tensor;
-  TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor));
+  TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor));
   DataType dtype;
-  TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype));
+  TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype));
   EXPECT_EQ(dtype, expected_dtype);
   EXPECT_EQ(expected_shape, TensorShape(tensor.shape()));
 }
diff --git a/tensorflow/cc/ops/op_gen_overrides.pbtxt b/tensorflow/cc/ops/op_gen_overrides.pbtxt
index cd94ddf4a1b67d3b98da7769db95bbda294e76db..1dffb10c03379571907e921c1add98d1f11625c3 100644
--- a/tensorflow/cc/ops/op_gen_overrides.pbtxt
+++ b/tensorflow/cc/ops/op_gen_overrides.pbtxt
@@ -22,7 +22,7 @@ op { name: "Where" input_rename: { from: "input" to: "condition" } }
 op { name: "ThreadUnsafeUnigramCandidateSampler", skip: true }
 
 # control_flow_ops
-# TODO(josh11b): Hide Switch and Merge once we write and migrate users to
+# TODO(joshl): Hide Switch and Merge once we write and migrate users to
 # a Cond() API.
 #op { name: "Switch" hide: true }
 #op { name: "Merge" hide: true }
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index b144bfc33e46c3db192cfb1e3ef8a0633e9fa519..908aa01a3470b67233c61d150ea955c1c13a8cd3 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -36,7 +36,7 @@ auto* load_attempt_count = monitoring::Counter<2>::New(
     "status");
 auto* load_latency = monitoring::Counter<1>::New(
     "/tensorflow/cc/saved_model/load_latency",
-    "Latency in microseconds for SavedModels that were succesfully loaded.",
+    "Latency in microseconds for SavedModels that were successfully loaded.",
     "model_path");
 constexpr char kLoadAttemptFail[] = "fail";
 constexpr char kLoadAttemptSuccess[] = "success";
diff --git a/tensorflow/cc/training/coordinator.cc b/tensorflow/cc/training/coordinator.cc
index 4618c932c310eefe775ccf9d8c38fbe1eea702ca..fe45931f7f802bf483d39ea02ee280b38b8d894c 100644
--- a/tensorflow/cc/training/coordinator.cc
+++ b/tensorflow/cc/training/coordinator.cc
@@ -116,17 +116,13 @@ void Coordinator::WaitForStop() {
 }
 
 Status Coordinator::ExportCostGraph(CostGraphDef* cost_graph) const {
-  RunMetadata tmp_metadata;
-  {
-    mutex_lock l(runners_lock_);
-    for (auto& t : runners_) {
-      Status s = t->ExportRunMetadata(&tmp_metadata);
-      if (!s.ok()) {
-        return s;
-      }
+  mutex_lock l(runners_lock_);
+  for (auto& t : runners_) {
+    Status s = t->ExportCostGraph(cost_graph);
+    if (!s.ok()) {
+      return s;
     }
   }
-  cost_graph->MergeFrom(tmp_metadata.cost_graph());
   return Status::OK();
 }
 
diff --git a/tensorflow/cc/training/coordinator.h b/tensorflow/cc/training/coordinator.h
index 632418c5ca5f523defe781a780ca0987202f59e4..0e01b19cd98bc797b7bb25da55c05d96f3eb93c7 100644
--- a/tensorflow/cc/training/coordinator.h
+++ b/tensorflow/cc/training/coordinator.h
@@ -36,8 +36,8 @@ class RunnerInterface {
  public:
   virtual ~RunnerInterface() {}
   virtual Status Join() = 0;
-  virtual Status ExportRunMetadata(RunMetadata* metadata) const {
-    return Status(error::INVALID_ARGUMENT, "No RunMetadata to export.");
+  virtual Status ExportCostGraph(CostGraphDef* cost_graph) const {
+    return Status(error::INVALID_ARGUMENT, "No cost model to export.");
   }
   /// Returns true iff the runner is running, i.e. if it is trying to populate
   /// its queue.
diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc
index 6b615916813519d7eaa94e69e846dcbfb87623bc..5aaaa116cf00dac6c1de3056c6121913a23acd77 100644
--- a/tensorflow/cc/training/queue_runner.cc
+++ b/tensorflow/cc/training/queue_runner.cc
@@ -49,7 +49,12 @@ Status QueueRunner::Init(const QueueRunnerDef& queue_runner_def) {
   enqueue_op_names_.insert(enqueue_op_names_.end(),
                            queue_runner_def.enqueue_op_name().begin(),
                            queue_runner_def.enqueue_op_name().end());
-  runs_ = enqueue_op_names_.size();
+  size_t op_names_size = enqueue_op_names_.size();
+  if (op_names_size > kint32max) {
+    return Status(error::INVALID_ARGUMENT,
+                  "Enqueue ops to run cannot exceed kint32max");
+  }
+  runs_ = static_cast<int>(op_names_size);
   if (runs_ == 0) {
     return Status(error::INVALID_ARGUMENT, "Empty enqueue ops to run.");
   }
@@ -82,9 +87,9 @@ QueueRunner::~QueueRunner() {
 
 Status QueueRunner::Start(Session* sess) { return Start(sess, 0); }
 
-Status QueueRunner::StartAndCollectRunMetadata(Session* sess,
-                                               const RunOptions* run_options) {
-  SetRunArgumentsAndRunMetadata(run_options);
+Status QueueRunner::StartAndCollectCostGraph(Session* sess,
+                                             const RunOptions* run_options) {
+  SetRunArgumentsAndCostGraph(run_options);
   return Start(sess, 0);
 }
 
@@ -115,10 +120,9 @@ Status QueueRunner::Start(Session* sess, int wait_for) {
   return Status::OK();
 }
 
-Status QueueRunner::StartAndCollectRunMetadata(Session* session,
-                                               int wait_for_ms,
-                                               const RunOptions* run_options) {
-  SetRunArgumentsAndRunMetadata(run_options);
+Status QueueRunner::StartAndCollectCostGraph(Session* session, int wait_for_ms,
+                                             const RunOptions* run_options) {
+  SetRunArgumentsAndCostGraph(run_options);
   return Start(session, wait_for_ms);
 }
 
@@ -127,7 +131,7 @@ void QueueRunner::Stop(Session* sess) {
     coord_->WaitForStop();
   }
   if (!cancel_op_name_.empty()) {
-    UpdateStatus(RealRun(sess, cancel_op_name_));
+    UpdateStatus(RealRun(sess, cancel_op_name_, false));
   }
   stopped_ = true;
 }
@@ -162,7 +166,7 @@ void QueueRunner::Run(Session* sess, const string& enqueue_op) {
     if (coord_ && coord_->ShouldStop()) {
       break;
     }
-    status = RealRun(sess, enqueue_op);
+    status = RealRun(sess, enqueue_op, true);
     if (first_iteration) {
       if (!status.ok()) {
         mutex_lock l(mu_);
@@ -183,9 +187,11 @@ void QueueRunner::Run(Session* sess, const string& enqueue_op) {
   // will be run anway in this case.
   if (IsQueueClosed(status) && (!coord_ || !coord_->ShouldStop())) {
     if (last_run && !close_op_name_.empty()) {
-      UpdateStatus(RealRun(sess, close_op_name_));
+      UpdateStatus(RealRun(sess, close_op_name_, false));
     }
   } else if (!status.ok()) {
+    LOG(ERROR) << "Queue runner thread got a failure status: "
+               << status.ToString();
     UpdateStatus(status);
     if (coord_) {
       coord_->RequestStop().IgnoreError();
@@ -198,34 +204,35 @@ Status QueueRunner::GetStatus() {
   return status_;
 }
 
-Status QueueRunner::ExportRunMetadata(RunMetadata* metadata) const {
-  if (!rm_mu_) {
+Status QueueRunner::ExportCostGraph(CostGraphDef* cost_graph) const {
+  if (!cg_mu_) {
     return Status(error::FAILED_PRECONDITION,
-                  "This QueueRunner doesn't collect and store RunMetadata.");
+                  "This QueueRunner doesn't collect a cost graph.");
   }
-  mutex_lock l(*rm_mu_);
-  metadata->MergeFrom(*run_metadata_);
+  mutex_lock l(*cg_mu_);
+  cost_graph->MergeFrom(*cost_graph_);
   return Status::OK();
 }
 
-void QueueRunner::SetRunArgumentsAndRunMetadata(const RunOptions* run_options) {
-  rm_mu_.reset(new mutex());
+void QueueRunner::SetRunArgumentsAndCostGraph(const RunOptions* run_options) {
+  cg_mu_.reset(new mutex());
   {
-    mutex_lock l(*rm_mu_);
-    run_metadata_.reset(new RunMetadata());
+    mutex_lock l(*cg_mu_);
+    cost_graph_.reset(new CostGraphDef());
   }
   if (run_options) {
     run_options_ = *run_options;
   }
 }
 
-Status QueueRunner::RealRun(Session* sess, const string& op) {
+Status QueueRunner::RealRun(Session* sess, const string& op,
+                            bool update_costs) {
   Status s;
-  if (rm_mu_) {
+  if (update_costs && cg_mu_) {
     RunMetadata metadata;
     s = sess->Run(run_options_, {}, {}, {op}, nullptr, &metadata);
-    mutex_lock l(*rm_mu_);
-    run_metadata_->MergeFrom(metadata);
+    mutex_lock l(*cg_mu_);
+    cost_graph_->Swap(metadata.mutable_cost_graph());
   } else {
     s = sess->Run({}, {}, {op}, nullptr);
   }
diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h
index c69f28793a95990901961e835e004b019b98dbdc..71ed44c9c6064a4e0e4a61a8e2e649e7a8a235ec 100644
--- a/tensorflow/cc/training/queue_runner.h
+++ b/tensorflow/cc/training/queue_runner.h
@@ -60,15 +60,15 @@ class QueueRunner : public RunnerInterface {
   Status Start(Session* sess);
 
   /// Starts the queue runner with the given session and sets the run arguments
-  /// for sess->Run. It also collects and stores the run metedata.
-  Status StartAndCollectRunMetadata(Session* sess,
-                                    const RunOptions* run_options = nullptr);
+  /// for sess->Run. It also collects and stores the cost model.
+  Status StartAndCollectCostGraph(Session* sess,
+                                  const RunOptions* run_options = nullptr);
 
   /// Starts the queue runner with the given session, and wait for up to the
   /// specified time (in milliseconds) for the queues to start to fill up.
   Status Start(Session* sess, int wait_for_ms);
-  Status StartAndCollectRunMetadata(Session* session, int wait_for_ms,
-                                    const RunOptions* run_options = nullptr);
+  Status StartAndCollectCostGraph(Session* session, int wait_for_ms,
+                                  const RunOptions* run_options = nullptr);
 
   /// Requests to stop and runs the cancel op. It would be called in a separate
   /// thread when coordinator is set. If there is no coordinator it should be
@@ -82,11 +82,11 @@ class QueueRunner : public RunnerInterface {
   /// Returns the latest status.
   Status GetStatus();
 
-  // Returns the stored run metadata.
-  Status ExportRunMetadata(RunMetadata* metadata) const override;
+  // Returns the stored cost model.
+  Status ExportCostGraph(CostGraphDef* cost_graph) const override;
 
  private:
-  QueueRunner() : coord_(nullptr), stopped_(false), rm_mu_(nullptr) {}
+  QueueRunner() : coord_(nullptr), stopped_(false), cg_mu_(nullptr) {}
 
   // Initializes the instance with the QueueRunnerDef proto.
   Status Init(const QueueRunnerDef& queue_runner_def);
@@ -105,9 +105,9 @@ class QueueRunner : public RunnerInterface {
 
   bool IsRunning() const override { return !stopped_; }
 
-  void SetRunArgumentsAndRunMetadata(const RunOptions* run_options);
+  void SetRunArgumentsAndCostGraph(const RunOptions* run_options);
 
-  Status RealRun(Session* sess, const string& op);
+  Status RealRun(Session* sess, const string& op, bool update_costs);
 
   string queue_name_;
   std::vector<string> enqueue_op_names_;
@@ -130,8 +130,8 @@ class QueueRunner : public RunnerInterface {
   mutex cb_mu_;
   std::vector<std::function<void(Status)>> callbacks_;
 
-  mutable std::unique_ptr<mutex> rm_mu_;
-  std::unique_ptr<RunMetadata> run_metadata_ GUARDED_BY(rm_mu_);
+  mutable std::unique_ptr<mutex> cg_mu_;
+  std::unique_ptr<CostGraphDef> cost_graph_ GUARDED_BY(cg_mu_);
   RunOptions run_options_;
 };
 
diff --git a/tensorflow/cc/training/queue_runner_test.cc b/tensorflow/cc/training/queue_runner_test.cc
index c37a69a7f76b6d83634d0b01e2038c4e6b4fa22e..da2fc03b6c07ef3dec26434eaae8e3f70c07c5f1 100644
--- a/tensorflow/cc/training/queue_runner_test.cc
+++ b/tensorflow/cc/training/queue_runner_test.cc
@@ -44,6 +44,7 @@ using ops::FIFOQueue;
 using ops::QueueClose;
 using ops::QueueDequeue;
 using ops::QueueEnqueue;
+using ops::RandomNormal;
 using ops::Square;
 using ops::Variable;
 
@@ -84,7 +85,7 @@ QueueRunnerDef BuildQueueRunnerDef(
     const std::string& close_op, const std::string& cancel_op,
     const std::vector<Code>& queue_closed_error_codes) {
   QueueRunnerDef queue_runner_def;
-  *queue_runner_def.mutable_queue_name() = kQueueName;
+  *queue_runner_def.mutable_queue_name() = queue_name;
   for (const std::string& enqueue_op : enqueue_ops) {
     *queue_runner_def.mutable_enqueue_op_name()->Add() = enqueue_op;
   }
@@ -345,37 +346,54 @@ TEST(QueueRunnerTest, CallbackCalledOnError) {
 }
 
 TEST(QueueRunnerTest, RunMetaDataTest) {
+  Scope root = Scope::NewRootScope();
+  auto q0 = FIFOQueue(root.WithOpName(kQueueName), {DataType::DT_FLOAT});
+  Output rnd = RandomNormal(root.WithOpName("rnd"), {1, 1}, DataType::DT_FLOAT);
+  Output square = Square(root.WithOpName(kSquareOpName), rnd);
+  auto enqueue0 = QueueEnqueue(root.WithOpName(kEnqueueOp0), q0, {square});
+  auto close0 = QueueClose(root.WithOpName(kCloseOp0), q0);
+  auto cancel0 = QueueClose(root.WithOpName(kCancelOp0), q0,
+                            QueueClose::CancelPendingEnqueues(true));
+  auto dequeue0 =
+      QueueDequeue(root.WithOpName(kDequeueOp0), q0, {DataType::DT_FLOAT});
+
+  GraphDef graph_def;
+  TF_EXPECT_OK(root.ToGraphDef(&graph_def));
+  for (auto& node : *graph_def.mutable_node()) {
+    node.set_device("/cpu:0");
+  }
   SessionOptions sess_options;
   sess_options.config.mutable_graph_options()->set_build_cost_model(1);
   std::unique_ptr<Session> session(NewSession(sess_options));
 
-  GraphDef graph_def = BuildSimpleGraph();
   TF_CHECK_OK(session->Create(graph_def));
-  TF_CHECK_OK(session->Run({}, {}, {kAssignOpName}, nullptr));
 
-  RunOptions run_options;
-  run_options.set_trace_level(RunOptions::HARDWARE_TRACE);
-
-  QueueRunnerDef queue_runner_def = BuildQueueRunnerDef(
-      kQueueName, {kCountUpToOpName}, kSquareOpName, "", {});
+  QueueRunnerDef queue_runner_def =
+      BuildQueueRunnerDef(kQueueName, {kEnqueueOp0}, kCloseOp0, kCancelOp0, {});
   std::unique_ptr<QueueRunner> qr;
   TF_EXPECT_OK(QueueRunner::New(queue_runner_def, &qr));
-  TF_CHECK_OK(qr->StartAndCollectRunMetadata(session.get(), &run_options));
+  RunOptions run_options;
+  TF_CHECK_OK(qr->StartAndCollectCostGraph(session.get(), &run_options));
 
-  TF_EXPECT_OK(qr->Join());
-  RunMetadata run_metadata;
-  TF_CHECK_OK(qr->ExportRunMetadata(&run_metadata));
+  // Make sure there was at least one element enqueued in q0: this prevents a
+  // race condition where we close the queue before it was populated.
+  std::vector<Tensor> dq0;
+  TF_EXPECT_OK(session->Run({}, {kDequeueOp0}, {}, &dq0));
+  // Second call to run dequeue op is to make sure the cost graph has been
+  // stored.
+  TF_EXPECT_OK(session->Run({}, {kDequeueOp0}, {}, &dq0));
+
+  CostGraphDef cost_graph;
+  TF_CHECK_OK(qr->ExportCostGraph(&cost_graph));
+  EXPECT_TRUE(cost_graph.node_size() > 0);
 
-  EXPECT_TRUE(run_metadata.has_cost_graph());
+  qr->Stop(session.get());
 }
 
 TEST(QueueRunnerTest, NoRunMetaDataTest) {
   GraphDef graph_def = BuildSimpleGraph();
   auto session = BuildSessionAndInitVariable(graph_def);
 
-  RunOptions run_options;
-  run_options.set_trace_level(RunOptions::HARDWARE_TRACE);
-
   QueueRunnerDef queue_runner_def = BuildQueueRunnerDef(
       kQueueName, {kCountUpToOpName}, kSquareOpName, "", {});
   std::unique_ptr<QueueRunner> qr;
@@ -383,8 +401,8 @@ TEST(QueueRunnerTest, NoRunMetaDataTest) {
   TF_CHECK_OK(qr->Start(session.get()));
 
   TF_EXPECT_OK(qr->Join());
-  RunMetadata run_metadata;
-  EXPECT_EQ(qr->ExportRunMetadata(&run_metadata).code(),
+  CostGraphDef cost_graph;
+  EXPECT_EQ(qr->ExportCostGraph(&cost_graph).code(),
             error::FAILED_PRECONDITION);
 }
 
diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc
index f2ecd2eddc28da94ac1c2404c02324e7782831c3..49d3cca3a4e2cc1aa16af2ac251b16b7a45753b1 100644
--- a/tensorflow/cc/tutorials/example_trainer.cc
+++ b/tensorflow/cc/tutorials/example_trainer.cc
@@ -227,7 +227,7 @@ int main(int argc, char* argv[]) {
     argv[dst++] = f;
   }
   argv[dst++] = nullptr;
-  argc = unknown_flags.size() + 1;
+  argc = static_cast<int>(unknown_flags.size() + 1);
   tensorflow::port::InitMain(argv[0], &argc, &argv);
   tensorflow::example::ConcurrentSessions(opts);
 }
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index c52a56b6428fb8a8415ed53477ba3e81c57b0ded..c12005a4cab903c15a4f95efa0fdc3b8b2563942 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -73,7 +73,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/core:core_cpu",
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index 042a72745a78c4a11b22c85e3a094d78c4ab2ed5..bbdb342a623f5d4435e437fbb94e282b685751c9 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -152,8 +152,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape,
 string RewriteWithName(const string& name, string code,
                        const std::vector<std::pair<string, string>>& rewrites) {
   str_util::ReplaceAllPairs(&code, rewrites);
-  str_util::ReplaceAll(&code, "{{NAME}}", name);
-  return code;
+  return str_util::StringReplace(code, "{{NAME}}", name, /*replace_all=*/true);
 }
 
 // Generate methods for args (inputs).
@@ -366,7 +365,7 @@ Status GenerateHeader(const HeaderOpts& opts, const Config& config,
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace Eigen { class ThreadPoolDevice; }
+namespace Eigen { struct ThreadPoolDevice; }
 
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void {{ENTRY}}(
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 46d7c03006a1344df17fc99c8b837f31ee86feb9..01963c6df4682ec8c23a93201d7fbbab63558060 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -15,7 +15,7 @@
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace Eigen { class ThreadPoolDevice; }
+namespace Eigen { struct ThreadPoolDevice; }
 
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void entry_point(
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 1284155c07b1a253d42e7641354626eb153f0c35..0c7b97b01f43ea255ed4b7773ab5268396e7c306 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -203,14 +203,14 @@ Status RewriteAndPruneGraph(Graph* graph, const Config& config,
   for (const Node* n : graph->nodes()) {
     if (n->type_string() == kArgOp) {
       string feed_id;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFeedIdAttr, &feed_id));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFeedIdAttr, &feed_id));
       if (missing_feeds.erase(feed_id) == 0) {
         return errors::Aborted(kArgOp,
                                " node found with unknown feed id: ", feed_id);
       }
     } else if (n->type_string() == kRetvalOp) {
       string fetch_id;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), kFetchIdAttr, &fetch_id));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), kFetchIdAttr, &fetch_id));
       if (missing_fetches.erase(fetch_id) == 0) {
         return errors::Aborted(kRetvalOp,
                                " node found with unknown fetch id: ", fetch_id);
@@ -234,7 +234,7 @@ Status CollectArgNodes(const Graph& graph, std::vector<Node*>* arg_nodes) {
   for (Node* n : graph.nodes()) {
     if (n->type_string() == kArgOp) {
       int index;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       auto insert_result = indexed_arg_nodes.insert({index, n});
       if (!insert_result.second) {
         const Node* dup = insert_result.first->second;
@@ -264,9 +264,9 @@ Status CreateXlaArgs(const Graph& graph,
   for (const Node* node : arg_nodes) {
     XlaCompiler::Argument arg;
     arg.kind = XlaCompiler::Argument::kParameter;
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "T", &arg.type));
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), kShapeAttr, &arg.shape));
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), kDebugNameAttr, &arg.name));
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type));
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape));
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name));
     xla_args->push_back(arg);
   }
   return Status::OK();
@@ -274,8 +274,8 @@ Status CreateXlaArgs(const Graph& graph,
 
 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
-Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
-                         const FunctionLibraryDefinition* flib_def,
+Status ConvertGraphToXla(xla::CompileOnlyClient* client,
+                         std::unique_ptr<Graph> graph,
                          xla::Computation* computation, bool* has_context_arg) {
   // Create a device and context to convert the graph into an XLA computation.
   XlaOpRegistry::RegisterCompilationKernels();
@@ -289,18 +289,19 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
   // Compile the graph into an XLA computation.
   XlaCompiler::Options compiler_options;
   compiler_options.client = client;
-  compiler_options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
+  DeviceType device_type(DEVICE_CPU_XLA_JIT);
+  compiler_options.device_type = &device_type;
+  compiler_options.flib_def = &graph->flib_def();
+  compiler_options.graph_def_version = graph->versions().producer();
   compiler_options.allow_cpu_custom_calls = true;
   XlaCompiler compiler(compiler_options);
 
-  std::unique_ptr<FunctionLibraryRuntime> flib_run(NewFunctionLibraryRuntime(
-      compiler.device_mgr(), Env::Default(), compiler.device(),
-      graph->versions().producer(), flib_def, OptimizerOptions()));
   XlaCompiler::CompilationResult result;
-  TF_RETURN_IF_ERROR(compiler.CompileGraph("tfcompile", std::move(graph),
-                                           flib_run.get(), xla_args, &result));
+  TF_RETURN_IF_ERROR(compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                           "tfcompile", std::move(graph),
+                                           xla_args, &result));
   *has_context_arg = result.requires_runtime_context;
-  *computation = std::move(result.computation);
+  *computation = std::move(*result.computation);
 
   int num_const_results = 0;
   for (int i = 0; i < result.outputs.size(); ++i) {
@@ -334,7 +335,8 @@ Status ConvertGraphToXla(xla::LocalClient* client, std::unique_ptr<Graph> graph,
 }
 
 // Compiles the XLA computation into executable code.
-Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
+Status CompileXla(xla::CompileOnlyClient* client,
+                  const xla::Computation& computation,
                   const xla::cpu::CpuAotCompilationOptions& aot_opts,
                   CompileResult* compile_result) {
   // Retrieves arg and result layouts from the computation.
@@ -351,7 +353,7 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
   for (int i = 0; i < pshape->parameters_size(); ++i) {
     arg_layouts.push_back(pshape->mutable_parameters(i));
   }
-  xla::LocalClient::AheadOfTimeComputationInstance instance;
+  xla::CompileOnlyClient::AotComputationInstance instance;
   instance.computation = &computation;
   instance.argument_layouts = std::move(arg_layouts);
   instance.result_layout = &pshape->result();
@@ -366,17 +368,17 @@ Status CompileXla(xla::LocalClient* client, const xla::Computation& computation,
           std::move(aot_or.ValueOrDie().back()));
   compile_result->entry_point = aot_opts.entry_point_name();
   compile_result->pointer_size =
-      xla::LocalClient::PointerSizeForTriple(aot_opts.triple());
+      xla::CompileOnlyClient::PointerSizeForTriple(aot_opts.triple());
   return Status::OK();
 }
 
 }  // namespace
 
 Status InitGraph(const GraphDef& graph_def, const Config& config,
-                 const MainFlags& flags, const FunctionLibraryDefinition* flib,
-                 std::unique_ptr<Graph>* graph) {
+                 const MainFlags& flags, std::unique_ptr<Graph>* graph) {
   TF_RETURN_IF_ERROR(ValidateConfig(config));
-  std::unique_ptr<Graph> g(new Graph(flib));
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), graph_def.library());
+  std::unique_ptr<Graph> g(new Graph(flib_def));
   GraphDef copy_def(graph_def);
   TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef(&copy_def, *g->op_registry(),
                                                0 /*node_offset*/));
@@ -388,7 +390,6 @@ Status InitGraph(const GraphDef& graph_def, const Config& config,
 }
 
 Status CompileGraph(std::unique_ptr<Graph> graph, const MainFlags& flags,
-                    const FunctionLibraryDefinition* flib,
                     CompileResult* compile_result) {
   // Converts the graph into an XLA computation, and compiles the
   // computation.
@@ -396,11 +397,11 @@ Status CompileGraph(std::unique_ptr<Graph> graph, const MainFlags& flags,
   namespace gpu = perftools::gputools;
   gpu::Platform* cpu_platform =
       gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie();
-  xla::LocalClient* client =
-      xla::ClientLibrary::GetOrCreateLocalClient(cpu_platform).ValueOrDie();
+  xla::CompileOnlyClient* client =
+      xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
+          .ValueOrDie();
   xla::Computation computation;
-  TF_RETURN_IF_ERROR(ConvertGraphToXla(client, std::move(graph), flib,
-                                       &computation,
+  TF_RETURN_IF_ERROR(ConvertGraphToXla(client, std::move(graph), &computation,
                                        &compile_result->has_context_arg));
   if (!flags.debug_dir.empty()) {
     TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::SessionModule> module,
diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h
index 8e9c64820baf0cb672cead59954098f10a9c9a32..e929272b2e4760e39cddba7e585cb12a7d2d7e98 100644
--- a/tensorflow/compiler/aot/compile.h
+++ b/tensorflow/compiler/aot/compile.h
@@ -56,8 +56,7 @@ extern const char* const kDebugNameAttr;
 // compute the outputs.  If dump_graphs is true, graph rewrites will be dumped
 // for debugging.
 Status InitGraph(const GraphDef& graph_def, const Config& config,
-                 const MainFlags& flags, const FunctionLibraryDefinition* flib,
-                 std::unique_ptr<Graph>* graph);
+                 const MainFlags& flags, std::unique_ptr<Graph>* graph);
 
 // CompileResult describes the output of CompileGraph, where the object file
 // data and meta-information is available in aot.
@@ -83,7 +82,6 @@ struct CompileResult {
 //
 // The XLA compilation options are specified in the flags.
 Status CompileGraph(std::unique_ptr<Graph> graph, const MainFlags& flags,
-                    const FunctionLibraryDefinition* flib,
                     CompileResult* result);
 
 }  // namespace tfcompile
diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/aot/runtime.cc
index 208de5498dbee6773683ac1aa2b33400a8a21f35..5772776666129ed55a479c8917e69df3f3ce2fc0 100644
--- a/tensorflow/compiler/aot/runtime.cc
+++ b/tensorflow/compiler/aot/runtime.cc
@@ -31,6 +31,8 @@ namespace {
 inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN)
   return memalign(minimum_alignment, size);
+#elif defined(COMPILER_MSVC)
+  return _aligned_malloc(size, minimum_alignment);
 #else  // !__ANDROID__ && !OS_ANDROID && !OS_CYGWIN
   void* ptr = nullptr;
   // posix_memalign requires that the requested alignment be at least
@@ -45,7 +47,13 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) {
 #endif
 }
 
-inline void aligned_free(void* aligned_memory) { free(aligned_memory); }
+inline void aligned_free(void* aligned_memory) {
+#if defined(COMPILER_MSVC)
+  _aligned_free(aligned_memory);
+#else
+  free(aligned_memory);
+#endif
+}
 
 size_t align_to(size_t n, size_t align) {
   return (((n - 1) / align) + 1) * align;
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index ecb071a416c330065b286c41467c302df40714db..59d13e5393445330ba5f1c5a54b73de6b3b4c0d8 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -51,6 +51,7 @@ genrule(
         "test_graph_tfgather.pb",
         "test_graph_tfmatmul.pb",
         "test_graph_tfmatmulandadd.pb",
+        "test_graph_tffunction.pb",
     ],
     cmd = "$(location :make_test_graphs) --out_dir $(@D)",
     tags = ["manual"],
@@ -114,6 +115,15 @@ tf_library(
     tags = ["manual"],
 )
 
+tf_library(
+    name = "test_graph_tffunction",
+    testonly = 1,
+    config = "test_graph_tffunction.config.pbtxt",
+    cpp_class = "FunctionComp",
+    graph = "test_graph_tffunction.pb",
+    tags = ["manual"],
+)
+
 cc_test(
     name = "tfcompile_test",
     srcs = ["tfcompile_test.cc"],
@@ -122,6 +132,7 @@ cc_test(
         ":test_graph_tfadd",
         ":test_graph_tfadd_with_ckpt",
         ":test_graph_tfadd_with_ckpt_saver",
+        ":test_graph_tffunction",
         ":test_graph_tfgather",
         ":test_graph_tfmatmul",
         ":test_graph_tfmatmulandadd",
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 9279c45f3738e6b667bff5928849491f9d97dada..98c13958d3729bc6c7f554630e236892be130a4a 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -25,6 +25,7 @@ from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -95,6 +96,17 @@ def tfmatmulandadd(_):
   math_ops.add(x, y, name='x_y_sum')
 
 
+def tffunction(_):
+
+  @function.Defun(dtypes.int32, dtypes.int32)
+  def test_func(a, b):
+    return a + b
+
+  x = constant_op.constant([1], name='x_const')
+  y = constant_op.constant([2], name='y_const')
+  test_func(x, y, name='func_call')  # pylint: disable=unexpected-keyword-arg
+
+
 def write_graph(build_graph, out_dir):
   """Build a graph using build_graph and write it out."""
   g = ops.Graph()
@@ -112,6 +124,7 @@ def main(_):
   write_graph(tfgather, FLAGS.out_dir)
   write_graph(tfmatmul, FLAGS.out_dir)
   write_graph(tfmatmulandadd, FLAGS.out_dir)
+  write_graph(tffunction, FLAGS.out_dir)
 
 
 if __name__ == '__main__':
@@ -121,7 +134,6 @@ if __name__ == '__main__':
       '--out_dir',
       type=str,
       default='',
-      help='Output directory for graphs, checkpoints and savers.'
-  )
+      help='Output directory for graphs, checkpoints and savers.')
   FLAGS, unparsed = parser.parse_known_args()
   app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/compiler/aot/tests/test_graph_tffunction.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tffunction.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb9c1cacb7ffe1ad60d985a2e5a1846707191fe7
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tffunction.config.pbtxt
@@ -0,0 +1,16 @@
+# Text form of tensorflow.tfcompile.Config proto.
+feed {
+  id { node_name: "x_const" }
+  shape {
+    dim { size: 1 }
+  }
+}
+feed {
+  id { node_name: "y_const" }
+  shape {
+    dim { size: 1 }
+  }
+}
+fetch {
+  id { node_name: "func_call" }
+}
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index f57d2859dfa4979fe0b04efea734817462af3bbf..76343b9752199fc4d26e4988452cd3c055bb5d96 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_saver.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tffunction.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfgather.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd.h"
@@ -376,6 +377,21 @@ TEST(TFCompileTest, MatMulAndAdd1) {
   }
 }
 
+TEST(TFCompileTest, Function) {
+  // The function is equivalent to an addition
+  FunctionComp add_fn;
+  EXPECT_EQ(add_fn.arg0_data(), add_fn.args()[0]);
+  EXPECT_EQ(add_fn.arg1_data(), add_fn.args()[1]);
+
+  add_fn.arg0() = 1;
+  add_fn.arg1() = 2;
+  EXPECT_TRUE(add_fn.Run());
+  EXPECT_EQ(add_fn.error_msg(), "");
+  EXPECT_EQ(add_fn.result0(), 3);
+  EXPECT_EQ(add_fn.result0_data()[0], 3);
+  EXPECT_EQ(add_fn.result0_data(), add_fn.results()[0]);
+}
+
 }  // namespace
 }  // namespace tfcompile
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 64e5bfd602cb2898dcbe57bfa0949c954f17acc1..7d61bee8caf7edcbbc1fa3cc1c79d7b5af2c942c 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -282,5 +282,6 @@ def target_llvm_triple():
       "//tensorflow:android_arm": "armv7-none-android",
       "//tensorflow:android_arm64": "aarch64-none-android",
       "//tensorflow:android_x86": "i686-none-android",
+      "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
       "//conditions:default": "x86_64-pc-linux",
   })
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index 85ef9560bbf1a7130dd6b140d552d96c2a0e21d6..4b7e22076937808334726d9f67c086696eab1b73 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -52,7 +52,8 @@ const char kUsageHeader[] =
     "header file that gives access to the functionality in the object file.\n"
     "A typical invocation looks like this:\n"
     "\n"
-    "   $ tfcompile --graph=mygraph.pb --config=myfile.pbtxt\n"
+    "   $ tfcompile --graph=mygraph.pb --config=myfile.pbtxt "
+    "--cpp_class=\"mynamespace::MyComputation\"\n"
     "\n";
 
 Status ReadProtoFile(const string& kind, const string& fname,
@@ -73,6 +74,9 @@ void ParseTensorId(const string& name, TensorId* id) {
 Status Main(const MainFlags& flags) {
   // Process config.
   Config config;
+  if (flags.config.empty()) {
+    return errors::InvalidArgument("Must specify --config");
+  }
   TF_RETURN_IF_ERROR(ReadProtoFile("config", flags.config, &config));
   TF_RETURN_IF_ERROR(ValidateConfig(config));
   if (flags.dump_fetch_nodes) {
@@ -85,15 +89,16 @@ Status Main(const MainFlags& flags) {
   }
 
   // Read and initialize the graph.
+  if (flags.graph.empty()) {
+    return errors::InvalidArgument("Must specify --graph");
+  }
   GraphDef graph_def;
   TF_RETURN_IF_ERROR(ReadProtoFile("graph", flags.graph, &graph_def));
   std::unique_ptr<Graph> graph;
-  FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library());
-  TF_RETURN_IF_ERROR(InitGraph(graph_def, config, flags, &flib, &graph));
+  TF_RETURN_IF_ERROR(InitGraph(graph_def, config, flags, &graph));
 
   CompileResult compile_result;
-  TF_RETURN_IF_ERROR(
-      CompileGraph(std::move(graph), flags, &flib, &compile_result));
+  TF_RETURN_IF_ERROR(CompileGraph(std::move(graph), flags, &compile_result));
 
   // Write output files.
   Env* env = Env::Default();
@@ -101,6 +106,9 @@ Status Main(const MainFlags& flags) {
   TF_RETURN_IF_ERROR(WriteStringToFile(env, flags.out_object,
                                        StringPiece(obj.data(), obj.size())));
   HeaderOpts header_opts;
+  if (flags.cpp_class.empty()) {
+    return errors::InvalidArgument("Must specify --cpp_class");
+  }
   TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &header_opts.class_name,
                                    &header_opts.namespaces));
   string header;
@@ -131,12 +139,16 @@ int main(int argc, char** argv) {
   QCHECK(parsed_flags_ok) << "\n" << usage;
 
   tensorflow::port::InitMain(usage.c_str(), &argc, &argv);
-  QCHECK(argc == 1 && !flags.config.empty() &&
-         (flags.dump_fetch_nodes ||
-          (!flags.graph.empty() && !flags.entry_point.empty())))
-      << "\n"
-      << usage;
-
-  TF_QCHECK_OK(tensorflow::tfcompile::Main(flags));
+  QCHECK(argc == 1) << "\nERROR: This command does not take any arguments "
+                       "other than flags\n\n"
+                    << usage;
+  tensorflow::Status status = tensorflow::tfcompile::Main(flags);
+  if (status.code() == tensorflow::error::INVALID_ARGUMENT) {
+    std::cerr << "INVALID ARGUMENTS: " << status.error_message() << "\n\n"
+              << usage;
+    return 1;
+  } else {
+    TF_QCHECK_OK(status);
+  }
   return 0;
 }
diff --git a/tensorflow/compiler/aot/tfcompile_util_test.cc b/tensorflow/compiler/aot/tfcompile_util_test.cc
index 108ab1eab7bf3b087e8049c5b24d652d871789c8..c321d3ff4c779fbd2e9c67dfc1eb24c734a9103f 100644
--- a/tensorflow/compiler/aot/tfcompile_util_test.cc
+++ b/tensorflow/compiler/aot/tfcompile_util_test.cc
@@ -24,7 +24,7 @@ namespace tensorflow {
 namespace tfcompile {
 namespace {
 
-void ExpectErrorContains(Status status, StringPiece str) {
+void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
   EXPECT_TRUE(StringPiece(status.error_message()).contains(str))
       << "expected error: " << status.error_message() << " to contain: " << str;
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index c16fe56122fca8cf8a88d6098b2374285f33e9f2..04f15a6a0b44cdbc54dea3d2963047bbcff1be77 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -18,7 +18,23 @@ package(
     default_visibility = [":internal"],
 )
 
+load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+
+# This target can be used by XLA device plugins to prevent circular
+# dependencies, and provides access to all of the required headers
+# for building a device library.
+cc_header_only_library(
+    name = "xla_jit_headers_lib",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xla_cpu_device",
+        ":xla_cpu_jit",
+        ":xla_gpu_device",
+        ":xla_gpu_jit",
+    ],
+)
 
 # Target that bundles up the XLA CPU and GPU JIT devices.
 cc_library(
@@ -29,6 +45,7 @@ cc_library(
         ":xla_cpu_jit",
         ":xla_gpu_device",
         ":xla_gpu_jit",
+        "//tensorflow/compiler/plugin",
     ],
     alwayslink = 1,
 )
@@ -48,12 +65,12 @@ cc_library(
 cc_library(
     name = "xla_gpu_jit",
     visibility = [":friends"],
-    deps = [
+    deps = if_cuda([
         ":jit_compilation_passes",
         "//tensorflow/compiler/jit/kernels:xla_local_launch_op",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:gpu_plugin",
-    ],
+    ]),
     alwayslink = 1,
 )
 
@@ -125,7 +142,6 @@ cc_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
@@ -133,9 +149,9 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:tensorflow_opensource",
-        "//tensorflow/core/kernels:assign_op",
         "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:identity_op",
@@ -176,22 +192,33 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "graph_to_functiondef",
+    srcs = ["graph_to_functiondef.cc"],
+    hdrs = ["graph_to_functiondef.h"],
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "compilation_passes",
     srcs = [
         "build_xla_launch_ops_pass.cc",
         "encapsulate_subgraphs_pass.cc",
-        "graph_to_functiondef.cc",
         "mark_for_compilation_pass.cc",
     ],
     hdrs = [
         "build_xla_launch_ops_pass.h",
         "encapsulate_subgraphs_pass.h",
-        "graph_to_functiondef.h",
         "mark_for_compilation_pass.h",
     ],
     deps = [
         ":common",
+        ":graph_to_functiondef",
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/compiler/jit/kernels:parallel_check_op",
         "//tensorflow/compiler/jit/kernels:xla_local_launch_op",
@@ -222,6 +249,7 @@ cc_test(
     deps = [
         ":common",
         ":compilation_passes",
+        ":graph_to_functiondef",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
diff --git a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
index abb68f73d7e3870f733c350be0dc99ab21a6b083..48eed7fce07f0855934600890e157b2752d38838 100644
--- a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc
@@ -66,9 +66,9 @@ static Status ReplaceNodeWithXlaLaunch(Graph* graph, Node* node) {
 
   int num_constant_args, num_resource_args;
   TF_RETURN_IF_ERROR(
-      GetNodeAttr(node->def(), kXlaNumConstantArgsAttr, &num_constant_args));
+      GetNodeAttr(node->attrs(), kXlaNumConstantArgsAttr, &num_constant_args));
   TF_RETURN_IF_ERROR(
-      GetNodeAttr(node->def(), kXlaNumResourceArgsAttr, &num_resource_args));
+      GetNodeAttr(node->attrs(), kXlaNumResourceArgsAttr, &num_resource_args));
 
   if (num_constant_args < 0 || num_resource_args < 0 ||
       num_constant_args + num_resource_args > node->num_inputs()) {
@@ -88,7 +88,7 @@ static Status ReplaceNodeWithXlaLaunch(Graph* graph, Node* node) {
   Node* launch_node;
   TF_RETURN_IF_ERROR(BuildLaunchNode(
       graph->NewName(node->name()), node->type_string(), node->def().attr(),
-      node->def().device(), const_dtypes, num_resource_args, arg_dtypes,
+      node->requested_device(), const_dtypes, num_resource_args, arg_dtypes,
       node->output_types(), graph, &launch_node));
   launch_node->set_assigned_device_name(node->assigned_device_name());
 
@@ -173,7 +173,8 @@ Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& ndef,
   FunctionLibraryRuntime::Handle handle;
   // If ndef is not instantiable, e.g., the function does not exist,
   // simply bail out.
-  TF_RETURN_IF_ERROR(flr->Instantiate(ndef.op(), ndef.attr(), &handle));
+  TF_RETURN_IF_ERROR(
+      flr->Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
   const FunctionBody* fbody = flr->GetFunctionBody(handle);
   CHECK(fbody);  // Can't be nullptr since we just instantiated it.
   std::vector<bool> const_args(fbody->arg_types.size());
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 1d2793d3c55f4436a07e4f632887561202d0498e..88ec45f8d86643aa4f7c643ac5bee333fb2ec559 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -88,9 +88,12 @@ class Encapsulator {
 
   // Build a FunctionDef for each subgraph, and add it 'library'. The values of
   // the 'group_attribute' annotations become the function names.
+  // If 'reuse_existing_functions' is set, use an existing function with the
+  // same name, if any.
   // If 'rewrite_subgraph_fn' is set, it is applied to each subgraph before
   // function conversion.
   Status BuildFunctionDefs(const RewriteSubgraphFn& rewrite_subgraph_fn,
+                           bool reuse_existing_functions,
                            FunctionLibraryDefinition* library);
 
   // Write a copy of the input graph to 'graph_out', where the subgraphs are
@@ -162,7 +165,7 @@ static const char* const kRetValOp = "_Retval";
 // none.
 string Encapsulator::GetFunctionNameAttr(Node const* node) const {
   string attr;
-  if (!GetNodeAttr(node->def(), group_attribute_, &attr).ok()) {
+  if (!GetNodeAttr(node->attrs(), group_attribute_, &attr).ok()) {
     attr.clear();
   }
   return attr;
@@ -192,7 +195,7 @@ Status Encapsulator::SplitIntoSubgraphs() {
 
     // Check the device matches any existing device.
     string device = node->assigned_device_name().empty()
-                        ? node->def().device()
+                        ? node->requested_device()
                         : node->assigned_device_name();
 
     if (subgraph.device.empty()) {
@@ -236,9 +239,16 @@ Status Encapsulator::SplitIntoSubgraphs() {
         // Create a new _Retval node
         DataType dtype = edge->src()->output_type(edge->src_output());
 
+        if (IsRefType(dtype)) {
+          return errors::InvalidArgument(
+              "Ref Tensors (e.g., Variables) are not supported: tensor ",
+              edge->src()->name(), ":", edge->src_output());
+        }
+
         NodeDef ret_def;
         ret_def.set_op(kRetValOp);
-        ret_def.set_name(src_subgraph.graph->NewName("output"));
+        ret_def.set_name(strings::StrCat(edge->src()->name(), "_",
+                                         edge->src_output(), "_retval"));
         AddNodeAttr("T", dtype, &ret_def);
         AddNodeAttr("index", ret_index, &ret_def);
         Node* ret = src_subgraph.graph->AddNode(ret_def, &s);
@@ -263,8 +273,16 @@ Status Encapsulator::SplitIntoSubgraphs() {
         // This is the first time we have seen this tensor. Create an _Arg node.
         DataType dtype = edge->dst()->input_type(edge->dst_input());
 
+        if (IsRefType(dtype)) {
+          return errors::InvalidArgument(
+              "Ref Tensors (e.g., Variables) are not supported: tensor ",
+              edge->src()->name(), ":", edge->src_output());
+        }
+
         NodeDef arg_def;
-        NodeDefBuilder builder(dst_subgraph.graph->NewName("input"), kArgOp);
+        NodeDefBuilder builder(strings::StrCat(edge->src()->name(), "_",
+                                               edge->src_output(), "_arg"),
+                               kArgOp);
         builder.Attr("T", dtype);
         builder.Attr("index", arg_index);
         s = builder.Finalize(&arg_def);
@@ -291,11 +309,11 @@ Status Encapsulator::SplitIntoSubgraphs() {
 }
 
 Status Encapsulator::BuildFunctionDefs(
-    const RewriteSubgraphFn& rewrite_subgraph_fn,
+    const RewriteSubgraphFn& rewrite_subgraph_fn, bool reuse_existing_functions,
     FunctionLibraryDefinition* library) {
   // For each subgraph, build a FunctionDef.
   for (auto& subgraph_entry : subgraphs_) {
-    const string& name = subgraph_entry.first;
+    string name = subgraph_entry.first;
     Subgraph& subgraph = subgraph_entry.second;
 
     subgraph.call_node_def.set_op(name);
@@ -332,6 +350,8 @@ Status Encapsulator::BuildFunctionDefs(
       for (auto& result : subgraph.results) {
         result.second = output_permutation[result.second];
       }
+
+      name = subgraph.call_node_def.op();
     }
 
     FunctionDef fdef;
@@ -346,7 +366,9 @@ Status Encapsulator::BuildFunctionDefs(
           strings::StrCat("encapsulate_fdef_", name), fdef);
     }
 
-    TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+    if (!reuse_existing_functions || library->Find(name) == nullptr) {
+      TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+    }
   }
   return Status::OK();
 }
@@ -545,14 +567,16 @@ Status Encapsulator::BuildOutputGraph(bool parallel_checking,
 Status EncapsulateSubgraphsInFunctions(
     string group_attribute, const Graph& graph_in,
     const RewriteSubgraphFn& rewrite_subgraph_fn, bool parallel_checking,
-    std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library) {
+    bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out,
+    FunctionLibraryDefinition* library) {
   Status s;
 
   Encapsulator encapsulator(std::move(group_attribute), &graph_in);
   s = encapsulator.SplitIntoSubgraphs();
   if (!s.ok()) return s;
 
-  s = encapsulator.BuildFunctionDefs(rewrite_subgraph_fn, library);
+  s = encapsulator.BuildFunctionDefs(rewrite_subgraph_fn,
+                                     reuse_existing_functions, library);
   if (!s.ok()) return s;
 
   std::unique_ptr<Graph> out(new Graph(library));
@@ -569,7 +593,7 @@ static Status GetArgTypes(const Graph& graph, DataTypeVector* types) {
   for (Node* n : graph.nodes()) {
     if (n->type_string() == kArgOp) {
       int index;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       if (index < 0 || index >= types->size()) {
         return errors::InvalidArgument("Invalid argument number");
       }
@@ -586,7 +610,7 @@ static Status RenumberArguments(Graph* graph,
   for (Node* n : graph->nodes()) {
     if (n->type_string() == kArgOp) {
       int index;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
       if (index < 0 || index >= permutation.size()) {
         return errors::InvalidArgument("Invalid argument number");
       }
@@ -674,7 +698,8 @@ Status EncapsulateSubgraphsPass::Run(
 
   TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions(
       kXlaClusterAttr, **options.graph, rewrite_subgraph,
-      flags->tf_xla_parallel_checking, &graph_out, library));
+      flags->tf_xla_parallel_checking, /*reuse_existing_functions=*/false,
+      &graph_out, library));
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile("after_encapsulate_subgraphs", *graph_out,
@@ -688,7 +713,7 @@ Status EncapsulateSubgraphsPass::Run(
 bool IsXlaCompiledKernel(const Node& node) {
   bool is_compiled = false;
   bool has_compilation_attr =
-      GetNodeAttr(node.def(), kXlaCompiledKernelAttr, &is_compiled).ok() &&
+      GetNodeAttr(node.attrs(), kXlaCompiledKernelAttr, &is_compiled).ok() &&
       is_compiled;
   return has_compilation_attr ? is_compiled : false;
 }
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
index 3ca7dfbf6a0ec29d9517139ffb952298d503cabc..b0987f76c91ed48df52fab303ea6052ebd8fd336 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h
@@ -34,6 +34,8 @@ namespace tensorflow {
 // 'input_permutation' and 'output_permutation' are initialized to the identity
 // permutation. 'nodedef' is the NodeDef for the call to the function under
 // construction, provided to allow additional attributes to be set.
+// The rewrite may also change the NodeDef's operator name, and that
+// name will be used as the name of the generated function.
 typedef std::function<Status(
     std::unique_ptr<Graph>* graph, std::vector<int>* input_permutation,
     std::vector<int>* output_permutation, NodeDef* node_def)>
@@ -53,6 +55,9 @@ typedef std::function<Status(
 // output graph, together with a "ParallelCheck" operator, that verifies that
 // the original and encapsulated subgraphs produce similar results.
 //
+// If 'reuse_existing_functions' is set, use an existing function with the
+// same name, if any.
+//
 // TODO(phawkins): currently, some information in control edges
 // is not preserved. Suppose you have A and B in the main
 // graph, C and D in a subgraph. B and C have control deps from A, D has control
@@ -61,7 +66,8 @@ typedef std::function<Status(
 Status EncapsulateSubgraphsInFunctions(
     string group_attribute, const Graph& graph_in,
     const RewriteSubgraphFn& rewrite_subgraph_fn, bool parallel_checking,
-    std::unique_ptr<Graph>* graph_out, FunctionLibraryDefinition* library);
+    bool reuse_existing_functions, std::unique_ptr<Graph>* graph_out,
+    FunctionLibraryDefinition* library);
 
 // The attribute that marks function calls produced by the encapsulate
 // subgraphs pass and that should in turn be compiled via _XlaLaunch operators.
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index faab7bd3d25d2491cf74faeb3b06acf4c2d6a054..a8869c8e2a7c164f97917cdae312289efb8b2663 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -76,7 +76,7 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
 #define TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(expected, actual)         \
   do {                                                            \
     string diff;                                                  \
-    EXPECT_TRUE(EqualFunctionDefLibrary(actual, expected, &diff)) \
+    EXPECT_TRUE(EqualFunctionDefLibrary(expected, actual, &diff)) \
         << diff << "\nActual: " << actual.DebugString();          \
   } while (false)
 
@@ -109,7 +109,7 @@ Node* Binary(ops::NodeOut a, ops::NodeOut b,
   return ops::BinaryOp("BinaryTest", a, b, opts);
 }
 
-Node* AddNLike(std::vector<ops::NodeOut> inputs,
+Node* AddNLike(const std::vector<ops::NodeOut>& inputs,
                const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp("AddN"), "AddNLikeTest",
@@ -144,8 +144,9 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) {
 
   std::unique_ptr<Graph> graph_out;
   s = EncapsulateSubgraphsInFunctions("_encapsulate", *graph,
-                                      /* rewrite_subgraph_fn= */ {},
-                                      /* parallel_checking= */ false,
+                                      /*rewrite_subgraph_fn=*/{},
+                                      /*parallel_checking=*/false,
+                                      /*reuse_existing_functions=*/false,
                                       &graph_out, lib_def.get());
   if (!s.ok()) return s;
 
@@ -205,12 +206,12 @@ TEST(EncapsulateSubgraphsTest, OneFunction) {
 
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"input__0:float", "input__1:float"}, {"output__2:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"c_0_retval:float"}, {},
       {
-          {{"C"}, "UnaryTest", {"input__0"}},
-          {{"c"}, "BinaryTest", {"input__1", "C:o:0"}, {}, {"C"}},
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
+          {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}},
       },
-      {{"output__2", "c:o:0"}});
+      {{"c_0_retval", "c:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -261,17 +262,17 @@ TEST(EncapsulateSubgraphsTest, TwoFunctions) {
 
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"input__0:float"}, {"output__1:float"}, {},
+      "F1", {"a_0_arg:float"}, {"c_0_retval:float"}, {},
       {
-          {{"C"}, "UnaryTest", {"input__0"}},
+          {{"C"}, "UnaryTest", {"a_0_arg"}},
       },
-      {{"output__1", "C:o:0"}});
+      {{"c_0_retval", "C:o:0"}});
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F2", {"input__0:float", "input__1:float"}, {"output__2:float"}, {},
+      "F2", {"b_0_arg:float", "c_0_arg:float"}, {"d_0_retval:float"}, {},
       {
-          {{"D"}, "BinaryTest", {"input__0", "input__1"}},
+          {{"D"}, "BinaryTest", {"b_0_arg", "c_0_arg"}},
       },
-      {{"output__2", "D:o:0"}});
+      {{"d_0_retval", "D:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -340,7 +341,8 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) {
   std::unique_ptr<Graph> graph;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
       "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{},
-      /*parallel_checking=*/false, &graph, &library));
+      /*parallel_checking=*/false, /*reuse_existing_functions=*/false, &graph,
+      &library));
 
   std::vector<string> expected_nodes = {"cluster1", "cluster2", "mul", "x"};
   EXPECT_EQ(expected_nodes, GraphNodes(*graph));
@@ -371,7 +373,8 @@ TEST(EncapsulateSubgraphsTest, ParallelChecking) {
   std::unique_ptr<Graph> graph;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
       "_cluster", graph_before_encapsulation, /*rewrite_subgraph_fn=*/{},
-      /*parallel_checking=*/true, &graph, &library));
+      /*parallel_checking=*/true, /*reuse_existing_functions=*/false, &graph,
+      &library));
 
   std::vector<string> expected_nodes = {
       "add1", "add2", "cluster1", "cluster1_parallel_check/_0",
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.cc b/tensorflow/compiler/jit/graph_to_functiondef.cc
index ce943471fb07fe02f18596247ccfddb94bd35158..83c23385008d56859b81abee7d292276036a45ee 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.cc
+++ b/tensorflow/compiler/jit/graph_to_functiondef.cc
@@ -126,8 +126,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
     if (node->type_string() == kArgOp) {
       int index;
       DataType type;
-      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "T", &type));
-      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &type));
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "index", &index));
       while (fdef->signature().input_arg_size() <= index) {
         fdef->mutable_signature()->add_input_arg();
       }
@@ -143,8 +143,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
     if (node->type_string() == kRetValOp) {
       int index;
       DataType type;
-      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "T", &type));
-      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "index", &index));
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &type));
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "index", &index));
       while (fdef->signature().output_arg_size() <= index) {
         fdef->mutable_signature()->add_output_arg();
       }
@@ -161,9 +161,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
     }
 
     NodeDef* node_def = fdef->add_node_def();
-    node_def->CopyFrom(node->def());
+    *node_def = node->def();
     node_def->set_name(node_names.Uniquify(node->name()));
-    node_def->clear_device();
 
     // Reset input names based on graph rather than the NodeDef.
     node_def->clear_input();
@@ -204,8 +203,8 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
 
     // Populate tensor_renaming.
     NameRangeMap output_ranges;
-    TF_RETURN_IF_ERROR(NameRangesForNode(node->def(), node->op_def(), nullptr,
-                                         &output_ranges));
+    TF_RETURN_IF_ERROR(
+        NameRangesForNode(*node, node->op_def(), nullptr, &output_ranges));
     for (const auto& output : output_ranges) {
       for (int i = output.second.first; i < output.second.second; ++i) {
         const string tensor_name = strings::StrCat(
diff --git a/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
index c741ccfb31efa8794ae745e2e52e3c91b20cfcfc..29c5ff724299ec84d31268c4227259ec02d10742 100644
--- a/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_device_launch_op.cc
@@ -34,7 +34,7 @@ namespace tensorflow {
 
 namespace {
 
-Status BuildCompilationCache(ResourceMgr* rm, XlaCompilationCache** compiler) {
+Status BuildCompilationCache(ResourceMgr* rm, XlaCompilationCache** cache) {
   XlaDevice::Metadata* metadata;
   Status s = rm->Lookup<XlaDevice::Metadata>(rm->default_container(),
                                              "xla_metadata", &metadata);
@@ -42,12 +42,8 @@ Status BuildCompilationCache(ResourceMgr* rm, XlaCompilationCache** compiler) {
     return s;
   }
   core::ScopedUnref metadata_ref(metadata);
-  XlaCompiler::Options options;
-  options.device_type = metadata->jit_device_type();
-  options.client = metadata->client();
-  options.allow_cpu_custom_calls = false;
-  options.local_executable_has_hybrid_result = false;
-  *compiler = new XlaCompilationCache(options);
+  *cache =
+      new XlaCompilationCache(metadata->client(), metadata->jit_device_type());
   return Status::OK();
 }
 
@@ -59,7 +55,7 @@ XlaDeviceLaunchOp::XlaDeviceLaunchOp(OpKernelConstruction* ctx)
   OP_REQUIRES_OK(ctx, ctx->GetAttr("function", &func));
   function_ = *func;
   VLOG(1) << "XlaDeviceLaunch created function="
-          << Canonicalize(function_.name(), function_.attr());
+          << Canonicalize(function_.name(), AttrSlice(&function_.attr()));
   DataTypeVector constant_types;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("Tconstants", &constant_types));
   num_constant_args_ = constant_types.size();
@@ -85,29 +81,37 @@ std::vector<OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
 
 void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "XlaDeviceLaunch::Compute "
-          << Canonicalize(function_.name(), function_.attr());
+          << Canonicalize(function_.name(), AttrSlice(&function_.attr()));
   // We store information about the JIT-compiled XLA computation
   // in the ResourceMgr.
   ResourceMgr* rm = ctx->resource_manager();
   OP_REQUIRES(ctx, rm, errors::Internal("No resource manager."));
 
-  XlaCompilationCache* compiler;
+  XlaCompilationCache* cache;
   OP_REQUIRES_OK(ctx, rm->LookupOrCreate<XlaCompilationCache>(
-                          rm->default_container(), "xla_compiler", &compiler,
-                          [rm](XlaCompilationCache** compiler) {
-                            return BuildCompilationCache(rm, compiler);
+                          rm->default_container(), "xla_compiler", &cache,
+                          [rm](XlaCompilationCache** cache) {
+                            return BuildCompilationCache(rm, cache);
                           }));
   // Holds the reference to the JIT during evaluation. (We could probably
   // free it sooner because the ResourceMgr will retain a reference, but
   // this is more obviously correct.)
-  core::ScopedUnref compiler_ref(compiler);
+  core::ScopedUnref cache_ref(cache);
 
   std::vector<OptionalTensor> variables =
       SnapshotResourceVariables(ctx, num_resource_args_);
 
+  XlaCompiler::Options options;
+  options.client = cache->client();
+  options.device_type = &cache->device_type();
+  options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+  options.graph_def_version = ctx->function_library()->graph_def_version();
+  options.allow_cpu_custom_calls = false;
+  options.local_executable_has_hybrid_result = false;
+
   const XlaCompiler::CompilationResult* kernel;
-  OP_REQUIRES_OK(ctx, compiler->Compile(function_, num_constant_args_,
-                                        variables, ctx, &kernel, nullptr));
+  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_,
+                                     variables, ctx, &kernel, nullptr));
 
   VLOG(1) << "XLA compilation complete...";
 
@@ -117,7 +121,7 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
   // Runs the computation, if any. There might not be a computation if all
   // outputs were compile-time constants.
   std::vector<std::unique_ptr<xla::GlobalData>> outputs;
-  if (!kernel->computation.IsNull()) {
+  if (!kernel->computation->IsNull()) {
     auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
 
     // Builds the inputs to the computation.
@@ -148,8 +152,8 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
     Env* env = Env::Default();
     auto start_time = env->NowMicros();
     VLOG(1) << "Executing XLA Computation...";
-    auto result = compiler->client()->Execute(kernel->computation, arg_ptrs,
-                                              &execution_options, &profile);
+    auto result = cache->client()->Execute(*kernel->computation, arg_ptrs,
+                                           &execution_options, &profile);
     auto elapsed = env->NowMicros() - start_time;
     OP_REQUIRES(ctx, result.ok(), result.status());
 
@@ -158,7 +162,7 @@ void XlaDeviceLaunchOp::Compute(OpKernelContext* ctx) {
 
     if (xla::ShapeUtil::IsTuple(kernel->xla_output_shape)) {
       auto outputs_or_error =
-          compiler->client()->DeconstructTuple(*result.ValueOrDie());
+          cache->client()->DeconstructTuple(*result.ValueOrDie());
       OP_REQUIRES(ctx, outputs_or_error.ok(), outputs_or_error.status());
       outputs = outputs_or_error.ConsumeValueOrDie();
     } else {
diff --git a/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc
index 8b43c7c1564a340b70e8cfa271a3ef50379b46bc..40acc0d81d08230b373823e333cd5e3e407b9c4f 100644
--- a/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_local_launch_op.cc
@@ -148,24 +148,28 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
   OP_REQUIRES(ctx, num_resource_args == 0,
               errors::Unimplemented(
                   "XlaLocalLaunchOp does not support resource variables"));
-}
-
-Status XlaLocalLaunchOp::BuildCompilationCache(XlaCompilationCache** compiler) {
-  gpu::Platform::Id platform_id;
   if (device_type_ == DeviceType(DEVICE_CPU)) {
-    platform_id = gpu::host::kHostPlatformId;
+    platform_id_ = gpu::host::kHostPlatformId;
   } else if (device_type_ == DeviceType(DEVICE_GPU)) {
-    platform_id = gpu::cuda::kCudaPlatformId;
+    platform_id_ = gpu::cuda::kCudaPlatformId;
   } else {
-    return errors::InvalidArgument("Unknown device type for local _XlaLaunch");
+    ctx->SetStatus(
+        errors::InvalidArgument("Unknown device type for local _XlaLaunch"));
+    return;
   }
+}
 
-  auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id);
+Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx,
+                                               XlaCompilationCache** cache) {
+  auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id_);
   if (!platform.ok()) {
     return StreamExecutorUtil::ConvertStatus(platform.status());
   }
-  auto client =
-      xla::ClientLibrary::GetOrCreateLocalClient(platform.ValueOrDie());
+  xla::LocalClientOptions client_options;
+  client_options.set_platform(platform.ValueOrDie());
+  client_options.set_intra_op_parallelism_threads(
+      ctx->device()->tensorflow_cpu_worker_threads()->num_threads);
+  auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options);
   if (!client.ok()) {
     return client.status();
   }
@@ -175,18 +179,14 @@ Status XlaLocalLaunchOp::BuildCompilationCache(XlaCompilationCache** compiler) {
     return errors::InvalidArgument("No JIT device registered for ",
                                    device_type_.type());
   }
-  XlaCompiler::Options options;
-  options.device_type = DeviceType(registration->compilation_device_name);
-  options.client = client.ValueOrDie();
-  options.allow_cpu_custom_calls = (platform_id == gpu::host::kHostPlatformId);
-  options.local_executable_has_hybrid_result = true;
-  *compiler = new XlaCompilationCache(options);
+  *cache = new XlaCompilationCache(
+      client.ValueOrDie(), DeviceType(registration->compilation_device_name));
   return Status::OK();
 }
 
 void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "XlaLocalLaunchOp::Compute "
-          << Canonicalize(function_.name(), function_.attr());
+          << Canonicalize(function_.name(), AttrSlice(&function_.attr()));
   // We store information about the JIT-compiled XLA computation
   // in the ResourceMgr.
   ResourceMgr* rm = ctx->resource_manager();
@@ -195,23 +195,31 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   gpu::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
 
-  XlaCompilationCache* compiler;
+  XlaCompilationCache* cache;
   OP_REQUIRES_OK(ctx, rm->LookupOrCreate<XlaCompilationCache>(
-                          rm->default_container(), "xla_compiler", &compiler,
-                          [this](XlaCompilationCache** compiler) {
-                            return BuildCompilationCache(compiler);
+                          rm->default_container(), "xla_cache", &cache,
+                          [this, ctx](XlaCompilationCache** cache) {
+                            return BuildCompilationCache(ctx, cache);
                           }));
   // Hold the reference to the JIT during evaluation. (We could probably
   // free it sooner because the ResourceMgr will retain a reference, but
   // this is more obviously correct.)
-  core::ScopedUnref compiler_ref(compiler);
+  core::ScopedUnref cache_ref(cache);
+
+  xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
-  xla::LocalClient* client = static_cast<xla::LocalClient*>(compiler->client());
+  XlaCompiler::Options options;
+  options.client = client;
+  options.device_type = &cache->device_type();
+  options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+  options.graph_def_version = ctx->function_library()->graph_def_version();
+  options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
+  options.local_executable_has_hybrid_result = true;
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
-  OP_REQUIRES_OK(ctx, compiler->Compile(function_, num_constant_args_, {}, ctx,
-                                        &kernel, &executable));
+  OP_REQUIRES_OK(ctx, cache->Compile(options, function_, num_constant_args_, {},
+                                     ctx, &kernel, &executable));
 
   VLOG(1) << "Executing XLA Computation...";
 
@@ -221,7 +229,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   std::unique_ptr<xla::ShapedBuffer> output;
   bool output_is_tuple;
-  if (!kernel->computation.IsNull()) {
+  if (!kernel->computation->IsNull()) {
     // Build xla::ShapedBuffers that point directly to the Tensor buffers.
     std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers;
     arg_buffers.reserve(kernel->xla_input_shapes.size() + 1);
@@ -260,8 +268,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
     xla::ExecutableRunOptions run_options;
     run_options.set_stream(stream);
     run_options.set_allocator(&xla_allocator);
-    run_options.set_inter_op_thread_pool(
-        ctx->device()->tensorflow_cpu_worker_threads()->workers);
     run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
     Env* env = Env::Default();
     auto start_time = env->NowMicros();
diff --git a/tensorflow/compiler/jit/kernels/xla_local_launch_op.h b/tensorflow/compiler/jit/kernels/xla_local_launch_op.h
index 8023206762951a4dafba900dd291f2ee9bdbbdf3..5e4d3336a91001fac1d222709f64300e777247c7 100644
--- a/tensorflow/compiler/jit/kernels/xla_local_launch_op.h
+++ b/tensorflow/compiler/jit/kernels/xla_local_launch_op.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/stream_executor_util.h"
 
 namespace tensorflow {
 
@@ -43,11 +44,15 @@ class XlaLocalLaunchOp : public OpKernel {
 
  private:
   // Builds a XlaCompilationCache class suitable for the current device.
-  Status BuildCompilationCache(XlaCompilationCache** compiler);
+  Status BuildCompilationCache(OpKernelContext* ctx,
+                               XlaCompilationCache** compiler);
 
   DeviceType device_type_;
   NameAttrList function_;
   int num_constant_args_;
+
+  perftools::gputools::Platform::Id platform_id_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp);
 };
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 22dbf7ec99fe93fb7fe8c524a3dc84ac1a97f015..73c4e80551485189d1e43fd93eed39083bd6b6b7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -50,22 +50,24 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
 }
 
 // Make sure we don't recurse infinitely on recursive functions.
-const int kMaxRecursionDepth = 5;
+const int kMaxRecursionDepth = 10;
 
-bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
-                      int depth, FunctionLibraryRuntime* lib_runtime);
+bool IsCompilableCall(const NodeDef& call_def,
+                      const DeviceType& jit_device_type, int depth,
+                      FunctionLibraryRuntime* lib_runtime);
 
-// Tests whether 'while_def' is a completely compilable loop.
+// Tests whether 'while_node' is a completely compilable loop.
 // Every operator in the condition and body functions must be compilable for a
 // while loop to be compilable.
-bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type,
-                       int depth, FunctionLibraryRuntime* lib_runtime) {
-  VLOG(2) << "Loop marking: " << while_def.op();
+bool IsCompilableWhile(const Node& while_node,
+                       const DeviceType& jit_device_type, int depth,
+                       FunctionLibraryRuntime* lib_runtime) {
+  VLOG(2) << "Loop marking: " << while_node.type_string();
 
   const NameAttrList* name_attr;
   NodeDef call;
   Status status;
-  status = GetNodeAttr(while_def, "cond", &name_attr);
+  status = GetNodeAttr(while_node.attrs(), "cond", &name_attr);
   if (!status.ok()) {
     VLOG(2) << "Missing 'cond' attribute on While node.";
     return false;
@@ -78,7 +80,7 @@ bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type,
     VLOG(2) << "Can't compile loop condition: " << cond_func;
     return false;
   }
-  status = GetNodeAttr(while_def, "body", &name_attr);
+  status = GetNodeAttr(while_node.attrs(), "body", &name_attr);
   if (!status.ok()) {
     VLOG(2) << "Missing 'body' attribute on While node.";
     return false;
@@ -98,8 +100,9 @@ bool IsCompilableWhile(const NodeDef& while_def, DeviceType jit_device_type,
 // Tests whether 'call_def' is a call to a completely compilable function.
 // Every operator in the function must be compilable for a function to be
 // compilable.
-bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
-                      int depth, FunctionLibraryRuntime* lib_runtime) {
+bool IsCompilableCall(const NodeDef& call_def,
+                      const DeviceType& jit_device_type, int depth,
+                      FunctionLibraryRuntime* lib_runtime) {
   VLOG(2) << "Function marking: " << call_def.op();
 
   if (depth > kMaxRecursionDepth) {
@@ -109,7 +112,7 @@ bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
 
   FunctionLibraryRuntime::Handle handle;
   Status status =
-      lib_runtime->Instantiate(call_def.op(), call_def.attr(), &handle);
+      lib_runtime->Instantiate(call_def.op(), AttrSlice(call_def), &handle);
   if (!status.ok()) {
     VLOG(2) << "Could not instantiate " << call_def.op() << ": " << status;
     return false;
@@ -131,11 +134,11 @@ bool IsCompilableCall(const NodeDef& call_def, DeviceType jit_device_type,
 
   for (Node* node : fbody->graph->nodes()) {
     if (node->IsSource() || node->IsSink()) continue;
-    if (node->def().op() == "_Arg" || node->def().op() == "_Retval") continue;
-    if (node->def().op() == "While") {
+    if (node->type_string() == "_Arg" || node->type_string() == "_Retval")
+      continue;
+    if (node->type_string() == "While") {
       // Handle functional While loop (not in open source build).
-      return IsCompilableWhile(node->def(), jit_device_type, depth + 1,
-                               lib_runtime);
+      return IsCompilableWhile(*node, jit_device_type, depth + 1, lib_runtime);
     }
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, depth + 1,
@@ -189,17 +192,16 @@ Status FindCompilationCandidates(
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime.get())) {
       VLOG(2) << "Compilation rejected node: unsupported op " << node->name()
-              << ": " << node->def().op();
+              << ": " << node->type_string();
       continue;
     }
     if (!registration->compile_resource_ops && HasResourceArgument(*node)) {
       VLOG(2) << "Compilation rejected node: resource argument " << node->name()
-              << ": " << node->def().op();
+              << ": " << node->type_string();
       continue;
     }
-    if (node->def().op() == "While" &&
-        !IsCompilableWhile(node->def(), jit_device_type, 0,
-                           lib_runtime.get())) {
+    if (node->type_string() == "While" &&
+        !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime.get())) {
       continue;
     }
     candidates->insert(node);
@@ -316,10 +318,10 @@ Status MarkForCompilationPass::Run(
 
     // If there is a _XlaCompile annotation, use its value.
     bool compile = false;
-    Status status = GetNodeAttr(node->def(), kXlaCompileAttr, &compile);
+    Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile);
     if (status.ok()) return compile;
 
-    status = fld->GetAttr(node->def(), kXlaCompileAttr, &compile);
+    status = fld->GetAttr(*node, kXlaCompileAttr, &compile);
     if (status.ok()) return compile;
 
     // Otherwise use the value of global_jit_level.
@@ -482,8 +484,8 @@ Status MarkForCompilationPass::RunImpl(
       // all nodes marked with _XlaCompile=true to also have a
       // _XlaScope property set (and raise an error otherwise); but
       // for now we don't do this.
-      if (GetNodeAttr(node_from->def(), kXlaScopeAttr, &from_scope).ok() &&
-          GetNodeAttr(node_to->def(), kXlaScopeAttr, &to_scope).ok() &&
+      if (GetNodeAttr(node_from->attrs(), kXlaScopeAttr, &from_scope).ok() &&
+          GetNodeAttr(node_to->attrs(), kXlaScopeAttr, &to_scope).ok() &&
           from_scope != to_scope) {
         continue;
       }
@@ -538,10 +540,9 @@ Status MarkForCompilationPass::RunImpl(
     // Compile if the user marked this node _XlaCompile=true
     bool compile_attr = false;
     bool marked_for_compilation = false;
-    if (GetNodeAttr(n->def(), kXlaCompileAttr, &compile_attr).ok()) {
+    if (GetNodeAttr(n->attrs(), kXlaCompileAttr, &compile_attr).ok()) {
       marked_for_compilation = compile_attr;
-    } else if (options.flib_def
-                   ->GetAttr(n->def(), kXlaCompileAttr, &compile_attr)
+    } else if (options.flib_def->GetAttr(*n, kXlaCompileAttr, &compile_attr)
                    .ok()) {
       marked_for_compilation = compile_attr;
     }
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 91e4a2b41c7026b6ca028ed6a7e61588d57e9e50..9f30e12e0e30fef6b4bcd0ea3c091842b008c29a 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -57,7 +57,7 @@ std::unordered_map<string, string> GetClusters(const Graph& graph) {
   std::unordered_map<string, string> ids;
   for (Node* node : graph.nodes()) {
     string cluster;
-    if (GetNodeAttr(node->def(), kXlaClusterAttr, &cluster).ok()) {
+    if (GetNodeAttr(node->attrs(), kXlaClusterAttr, &cluster).ok()) {
       CHECK(!cluster.empty());
       ids[node->name()] = cluster;
     }
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 41abea02eb2d17423744dfb719ee9a3f6b8f1198..63ca77f9a912acce2078f3da43d64f2e10049380 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -37,9 +37,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-XlaCompilationCache::XlaCompilationCache(const XlaCompiler::Options& options)
-    : compiler_(options) {}
-
+XlaCompilationCache::XlaCompilationCache(xla::Client* client,
+                                         DeviceType device_type)
+    : client_(client), device_type_(std::move(device_type)) {}
 XlaCompilationCache::~XlaCompilationCache() = default;
 
 string XlaCompilationCache::DebugString() {
@@ -95,7 +95,7 @@ Status XlaCompilationCache::BuildSignature(
     const NameAttrList& function, int num_constant_args,
     const std::vector<OptionalTensor>& variable_args, OpKernelContext* ctx,
     Signature* signature) {
-  signature->name = Canonicalize(function.name(), function.attr());
+  signature->name = Canonicalize(function.name(), AttrSlice(&function.attr()));
   signature->arg_values.resize(num_constant_args);
 
   signature->arg_types.reserve(ctx->num_inputs() - num_constant_args);
@@ -205,8 +205,9 @@ Status BuildArguments(int num_constant_args,
 }  // namespace
 
 Status XlaCompilationCache::Compile(
-    const NameAttrList& function, int num_constant_args,
-    const std::vector<OptionalTensor>& variable_args, OpKernelContext* ctx,
+    const XlaCompiler::Options& options, const NameAttrList& function,
+    int num_constant_args, const std::vector<OptionalTensor>& variable_args,
+    OpKernelContext* ctx,
     const XlaCompiler::CompilationResult** compilation_result,
     xla::LocalExecutable** executable) {
   VLOG(1) << "XlaCompilationCache::Compile " << DebugString();
@@ -263,21 +264,18 @@ Status XlaCompilationCache::Compile(
     TF_RETURN_IF_ERROR(
         BuildArguments(num_constant_args, variable_args, ctx, &args));
 
-    std::unique_ptr<FunctionLibraryRuntime> flr(NewFunctionLibraryRuntime(
-        compiler_.device_mgr(), ctx->env(), compiler_.device(),
-        TF_GRAPH_DEF_VERSION,
-        ctx->function_library()->GetFunctionLibraryDefinition(),
-        OptimizerOptions(), nullptr /* custom_kernel_creator */));
-
+    XlaCompiler compiler(options);
     entry->compiled = true;
-    entry->compilation_status = compiler_.CompileFunction(
-        flr.get(), function, args, &entry->compilation_result);
+    entry->compilation_status =
+        compiler.CompileFunction(XlaCompiler::CompileOptions(), function, args,
+                                 &entry->compilation_result);
   }
   *compilation_result = &entry->compilation_result;
   if (entry->compilation_status.ok() && executable) {
     if (entry->executable == nullptr &&
-        !entry->compilation_result.computation.IsNull()) {
-      entry->compilation_status = compiler_.BuildExecutable(
+        !entry->compilation_result.computation->IsNull()) {
+      XlaCompiler compiler(options);
+      entry->compilation_status = compiler.BuildExecutable(
           entry->compilation_result, &entry->executable);
     }
     *executable = entry->executable.get();
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index ff67e48d1a9a9f16881c2e141b23ce8c479aef50..4ffcb68a3220b2354a3542e4c2a4d3e000969e0b 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -46,7 +46,7 @@ struct OptionalTensor {
 // bound.
 class XlaCompilationCache : public ResourceBase {
  public:
-  explicit XlaCompilationCache(const XlaCompiler::Options& options);
+  XlaCompilationCache(xla::Client* client, DeviceType device_type);
   ~XlaCompilationCache() override;
 
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
@@ -61,19 +61,21 @@ class XlaCompilationCache : public ResourceBase {
   // xla::LocalExecutable and sets `executable to point to it. The resulting
   // executable pointer may be null if the computation has no non-constant
   // outputs.
-  Status Compile(const NameAttrList& function, int num_constant_args,
+  Status Compile(const XlaCompiler::Options& options,
+                 const NameAttrList& function, int num_constant_args,
                  const std::vector<OptionalTensor>& variable_args,
                  OpKernelContext* ctx,
                  const XlaCompiler::CompilationResult** compilation_result,
                  xla::LocalExecutable** executable);
 
-  xla::Client* client() const { return compiler_.client(); }
+  xla::Client* client() const { return client_; }
+  const DeviceType& device_type() const { return device_type_; }
 
   string DebugString() override;
 
  private:
-  XlaCompiler compiler_;
-  std::unique_ptr<FunctionLibraryRuntime> function_library_runtime_;
+  xla::Client* const client_;
+  const DeviceType device_type_;
 
   // Describes the types, shapes and any compile-time constant arguments
   // to a kernel. Key that uniquely identifies a compilation output.
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 3c6793b89420ed61259070f7bf637d6f4aa097d0..5e336c5287bd9e2067e93cd8db8a5a1b62b62bd2 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -108,12 +109,23 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
 
 string XlaDevice::Metadata::DebugString() { return "XLA device metadata"; }
 
+/* static */ Status XlaDevice::GetMetadata(OpKernelContext* ctx,
+                                           Metadata** metadata) {
+  ResourceMgr* rm = ctx->resource_manager();
+  if (rm == nullptr) {
+    return errors::Internal("No resource manager.");
+  }
+  TF_RETURN_IF_ERROR(
+      rm->Lookup<Metadata>(rm->default_container(), "xla_metadata", metadata));
+  return Status::OK();
+}
+
 XlaDevice::XlaDevice(const SessionOptions& options,
                      const DeviceAttributes& attrs, int device_ordinal,
                      const DeviceType& jit_device_name,
                      perftools::gputools::Platform* platform,
                      Allocator* xla_allocator)
-    : LocalDevice(options, attrs, xla_allocator),
+    : LocalDevice(options, attrs),
       device_ordinal_(device_ordinal),
       jit_device_name_(jit_device_name),
       xla_allocator_(xla_allocator),
@@ -163,6 +175,10 @@ Status XlaDevice::FillContextMap(const Graph* graph,
 void XlaDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
   VLOG(1) << "XlaDevice::Compute " << op_kernel->name() << ":"
           << op_kernel->type_string();
+  // When TraceMe profiling is off (which is the default), the
+  // following TraceMe constructor is simply a conditional test of
+  // false value. Measurements show that its overhead is negligible.
+  port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string());
   op_kernel->Compute(context);
 }
 
@@ -170,6 +186,7 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                              AsyncOpKernel::DoneCallback done) {
   VLOG(1) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":"
           << op_kernel->type_string();
+  port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string());
   op_kernel->ComputeAsync(context, done);
 }
 
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 3de14f306168937bb0483e0c442984a02e2b1442..0badb390c6b7785b36f58c786e1d32a8d10d7c29 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -67,6 +67,10 @@ class XlaDevice : public LocalDevice {
     perftools::gputools::Platform* platform_;  // Not owned.
   };
 
+  // Sets `*metadata` to the XlaDevice Metadata in the resource manager of
+  // `ctx`.
+  static Status GetMetadata(OpKernelContext* ctx, Metadata** metadata);
+
   // Factory function. 'platform_name' is the name of the XLA platform.
   // 'device_name' is the name of the Tensorflow device to create.
   // 'jit_device_name' is the name of the corresponding JIT device.
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index a52239df252b2b556987fa9701f43047765c60de..8699006ebc5aacafd46046a7c3f093356f687280 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -63,30 +63,10 @@ class XlaDeviceDummyOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE),                \
                           PlaceholderOp);                                      \
                                                                                \
-  REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE),               \
-                          ControlTriggerOp);                                   \
-  REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE), EnterOp);              \
-  REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE), ExitOp);                \
-  REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE),                \
-                          NextIterationOp);                                    \
-  REGISTER_KERNEL_BUILDER(Name("Switch").Device(DEVICE).HostMemory("pred"),    \
-                          SwitchOp);                                           \
-  REGISTER_KERNEL_BUILDER(                                                     \
-      Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp);        \
-  REGISTER_KERNEL_BUILDER(Name("LoopCond")                                     \
-                              .Device(DEVICE)                                  \
-                              .HostMemory("input")                             \
-                              .HostMemory("output"),                           \
-                          IdentityOp);                                         \
-                                                                               \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("VarHandleOp").Device(DEVICE).HostMemory("resource"),               \
       ResourceHandleOp<Var>);
 
-// TODO(b/32507444): the registrations for the control flow operators are
-// temporary and exist primarily to work around a bug in the graph partitioning
-// code.
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_XLA_DEVICE_OPS_H_
diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..056f2228ca64b083bf05a8728dc25213e99e4cd8
--- /dev/null
+++ b/tensorflow/compiler/plugin/BUILD
@@ -0,0 +1,34 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Configuration file for an XLA plugin.
+- please don't check in changes to this file
+- to prevent changes appearing in git status, use:
+  git update-index --assume-unchanged tensorflow/compiler/plugin/BUILD
+
+To add additional devices to the XLA subsystem, add targets to the
+dependency list in the 'plugin' target. For instance:
+
+    deps = ["//tensorflow/compiler/plugin/example:plugin_lib"],
+"""
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "plugin",
+    deps = [],
+)
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 03e255e6b842668a491d254953926500ce3a50ec..19f7ff835456855a2b2ab7d5856f1d3e6f7f9733 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -65,6 +65,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "adam_test",
+    size = "small",
+    srcs = ["adam_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "binary_ops_test",
     size = "small",
@@ -156,6 +170,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "slice_ops_test",
+    size = "small",
+    srcs = ["slice_ops_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "function_test",
     size = "small",
@@ -305,6 +332,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "spacetobatch_op_test",
+    size = "medium",
+    srcs = ["spacetobatch_op_test.py"],
+    shard_count = 3,
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "ternary_ops_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py
index 0a2c9e26c6fbd827d5ab669dea5419f9fa50025b..a5c5885b4284aee167ae4cb18f7e42820c6d251d 100644
--- a/tensorflow/compiler/tests/adagrad_test.py
+++ b/tensorflow/compiler/tests/adagrad_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functional tests for aggregate operations."""
+"""Tests for Adagrad."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3215dc36e5b2d517aa951db1b0d41188185ef93a
--- /dev/null
+++ b/tensorflow/compiler/tests/adam_test.py
@@ -0,0 +1,176 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class AdamOptimizerTest(XLATestCase):
+
+  def testBasic(self):
+    for dtype in self.float_types:
+      with self.test_session(), self.test_scope():
+        variable_scope.get_variable_scope().set_use_resource(True)
+
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = array_ops.placeholder(dtype)
+        grads1 = array_ops.placeholder(dtype)
+        opt = adam.AdamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTensorLearningRate(self):
+    for dtype in self.float_types:
+      with self.test_session(), self.test_scope():
+        variable_scope.get_variable_scope().set_use_resource(True)
+
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = array_ops.placeholder(dtype)
+        grads1 = array_ops.placeholder(dtype)
+        opt = adam.AdamOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in self.float_types:
+      with self.test_session(), self.test_scope():
+        variable_scope.get_variable_scope().set_use_resource(True)
+
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = array_ops.placeholder(dtype)
+        grads1 = array_ops.placeholder(dtype)
+        opt = adam.AdamOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
+          else:
+            update2.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 9efdaee7ab66f7cfc84bc1c30a9ba700e268abe2..7221a0a3c745f939b88cae0f66af2421922dcd68 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -107,6 +107,12 @@ class BinaryOpsTest(XLATestCase):
           np.array([5, 6, 7, 8], dtype=dtype),
           expected=np.array([-75, -48, -21, 0], dtype=dtype))
 
+      self._testBinary(
+          gen_nn_ops._elu_grad,
+          np.array([1, 2, 3, 4, 5, 6], dtype=dtype),
+          np.array([-.6, -.4, -.2, 0, .2, .4], dtype=dtype),
+          expected=np.array([0.4, 1.2, 2.4, 4, 5, 6], dtype=dtype))
+
       self._testBinary(
           gen_nn_ops._relu_grad,
           np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
diff --git a/tensorflow/compiler/tests/nary_ops_test.py b/tensorflow/compiler/tests/nary_ops_test.py
index e89c411d01f8eb27f39bf65f3d3d21ec817c3ddf..2660e1d5728caf88e2b9ae73b3e3fde2aee71ed8 100644
--- a/tensorflow/compiler/tests/nary_ops_test.py
+++ b/tensorflow/compiler/tests/nary_ops_test.py
@@ -116,13 +116,14 @@ class NAryOpsTest(XLATestCase):
                     np.array([1, 1], dtype=np.int32)],
                    expected=np.array([[], []], dtype=np.float32))
 
-    if (np.int64 in self.int_types):
-      self._testNAry(lambda x: array_ops.strided_slice(*x),
-                     [np.array([[], [], []], dtype=np.float32),
-                      np.array([1, 0], dtype=np.int64),
-                      np.array([3, 0], dtype=np.int64),
-                      np.array([1, 1], dtype=np.int64)],
-                     expected=np.array([[], []], dtype=np.float32))
+    if np.int64 in self.int_types:
+      self._testNAry(
+          lambda x: array_ops.strided_slice(*x), [
+              np.array([[], [], []], dtype=np.float32), np.array(
+                  [1, 0], dtype=np.int64), np.array([3, 0], dtype=np.int64),
+              np.array([1, 1], dtype=np.int64)
+          ],
+          expected=np.array([[], []], dtype=np.float32))
 
     self._testNAry(lambda x: array_ops.strided_slice(*x),
                    [np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index c3e8ff724c178c5e635fedae0c3295cf598b2b00..2a71543f3febe3cb692fdcd563772c3bd2d3724a 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -94,7 +94,7 @@ class OpTestBuilder {
   explicit OpTestBuilder(const string& op_name);
 
   // Adds an input 'tensor'.
-  OpTestBuilder& Input(Tensor tensor);
+  OpTestBuilder& Input(const Tensor& tensor);
 
   // Sets an attribute.
   template <class T>
@@ -111,8 +111,8 @@ class OpTestBuilder {
   // sets it to the NodeDef of the operator under test. Fills 'inputs' and
   // 'outputs' with the names of the input placeholder nodes and the output
   // identity nodes, respectively.
-  Status BuildGraph(string name_prefix, string device, bool use_jit,
-                    GraphDef* graphdef, NodeDef** test_node_def,
+  Status BuildGraph(const string& name_prefix, const string& device,
+                    bool use_jit, GraphDef* graphdef, NodeDef** test_node_def,
                     std::vector<string>* inputs,
                     std::vector<string>* outputs) const;
 
@@ -127,7 +127,7 @@ OpTestBuilder::OpTestBuilder(const string& op_name) {
   node_def_.set_op(op_name);
 }
 
-OpTestBuilder& OpTestBuilder::Input(Tensor tensor) {
+OpTestBuilder& OpTestBuilder::Input(const Tensor& tensor) {
   VLOG(1) << "Adding input: " << tensor.DebugString();
   inputs_.push_back(tensor);
   return *this;
@@ -146,9 +146,9 @@ OpTestBuilder& OpTestBuilder::Attr(StringPiece attr_name,
   return *this;
 }
 
-Status OpTestBuilder::BuildGraph(string name_prefix, string device,
-                                 bool use_jit, GraphDef* graphdef,
-                                 NodeDef** test_node_def,
+Status OpTestBuilder::BuildGraph(const string& name_prefix,
+                                 const string& device, bool use_jit,
+                                 GraphDef* graphdef, NodeDef** test_node_def,
                                  std::vector<string>* inputs,
                                  std::vector<string>* outputs) const {
   OpRegistryInterface* op_registry = OpRegistry::Global();
@@ -209,7 +209,7 @@ class OpTest : public ::testing::Test {
 
   // Runs 'fn' up to --tf_xla_test_repetitions times, or until a failure occurs;
   // whichever happens first.
-  void Repeatedly(std::function<void(void)> fn);
+  void Repeatedly(const std::function<void(void)>& fn);
 
   // Select a random element from 'candidates'.
   template <typename T>
@@ -218,12 +218,11 @@ class OpTest : public ::testing::Test {
   static constexpr int kDefaultMaxRank = 5;
   static constexpr int64 kDefaultMaxDimensionSize = 20LL;
 
-  // Returns a random dimension size.
+  // Returns a random dimension size, in the range [min, max).
   int64 RandomDim(int64 min = 0, int64 max = kDefaultMaxDimensionSize);
 
   // Returns a random shape. The tensor has rank in the range [min_rank,
-  // max_rank).
-  // Each dimension has size [0, kDefaultMaxDimensionSize].
+  // max_rank). Each dimension has size [min_size, max_size).
   std::vector<int64> RandomDims(int min_rank = 0,
                                 int max_rank = kDefaultMaxRank,
                                 int64 min_size = 0,
@@ -316,7 +315,7 @@ OpTest::OpTest() {
   TF_CHECK_OK(session_->Create(def));
 }
 
-void OpTest::Repeatedly(std::function<void(void)> fn) {
+void OpTest::Repeatedly(const std::function<void(void)>& fn) {
   int const max_repetitions = tf_xla_test_repetitions;
   for (int i = 0; !HasFailure() && i < max_repetitions; ++i) {
     fn();
@@ -668,6 +667,9 @@ void OpTest::ExpectTfAndXlaOutputsAreClose(const OpTestBuilder& builder,
     VLOG(1) << "Expected graph failed with status: " << s << ". Skipping test";
     return;
   }
+  for (const Tensor& expected : expected_outputs) {
+    VLOG(1) << "Expected: " << expected.DebugString();
+  }
 
   VLOG(1) << "Running test graph";
   TF_ASSERT_OK(session_->Run(test_feeds, test_fetches, {}, &test_outputs));
@@ -877,6 +879,79 @@ TEST_F(OpTest, BatchMatMul) {
   });
 }
 
+TEST_F(OpTest, BatchToSpace) {
+  Repeatedly([this]() {
+    const int num_block_dims = 2;
+    std::vector<int64> block_dims =
+        RandomDims(num_block_dims, num_block_dims, 0, 5);
+    int64 block_size = RandomDim(0, 4);
+
+    std::vector<int64> input_dims(1 + num_block_dims + 1);
+    input_dims[0] = RandomDim();
+    for (int i = 0; i < num_block_dims; ++i) {
+      input_dims[0] *= block_size;
+      input_dims[1 + i] = block_dims[i];
+    }
+    input_dims[1 + num_block_dims] = RandomDim();
+
+    std::vector<int64> crop_vals;
+    std::uniform_int_distribution<int> distribution(0, 4);
+    for (int i = 0; i < num_block_dims; ++i) {
+      // Chooses crop values; does not always choose legal values.
+      crop_vals.push_back(distribution(generator()));
+      crop_vals.push_back(distribution(generator()));
+    }
+    Tensor crops;
+    CHECK(crops.CopyFrom(AsIntTensor(DT_INT32, crop_vals),
+                         TensorShape({num_block_dims, 2})));
+
+    ExpectTfAndXlaOutputsAreClose(OpTestBuilder("BatchToSpace")
+                                      .Input(RandomTensor(DT_FLOAT, input_dims))
+                                      .Input(crops)
+                                      .Attr("T", DT_FLOAT)
+                                      .Attr("block_size", block_size));
+  });
+}
+
+TEST_F(OpTest, BatchToSpaceND) {
+  Repeatedly([this]() {
+    std::vector<int64> block_dims = RandomDims(1, 3, 0, 5);
+    int num_block_dims = block_dims.size();
+    std::vector<int64> remaining_dims = RandomDims(0, 3);
+    std::vector<int64> block_multipliers =
+        RandomDims(block_dims.size(), block_dims.size(), 0, 4);
+
+    std::vector<int64> input_dims(1 + num_block_dims + remaining_dims.size());
+    input_dims[0] = RandomDim();
+    for (int i = 0; i < num_block_dims; ++i) {
+      input_dims[0] *= block_dims[i];
+    }
+    std::copy(block_multipliers.begin(), block_multipliers.end(),
+              input_dims.begin() + 1);
+    std::copy(remaining_dims.begin(), remaining_dims.end(),
+              input_dims.begin() + 1 + num_block_dims);
+
+    std::vector<int64> crop_vals;
+    std::uniform_int_distribution<int> distribution(0, 3);
+    for (int i = 0; i < num_block_dims; ++i) {
+      // Chooses crop values; does not always choose legal values.
+      crop_vals.push_back(distribution(generator()));
+      crop_vals.push_back(distribution(generator()));
+    }
+    Tensor crops;
+    CHECK(crops.CopyFrom(AsIntTensor(DT_INT32, crop_vals),
+                         TensorShape({num_block_dims, 2})));
+
+    ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("BatchToSpaceND")
+            .Input(RandomTensor(DT_FLOAT, input_dims))
+            .Input(test::AsTensor<int32>(
+                std::vector<int32>(block_dims.begin(), block_dims.end())))
+            .Input(crops)
+            .Attr("T", DT_FLOAT));
+  });
+}
+
 TEST_F(OpTest, BiasAdd) {
   Repeatedly([this]() {
     auto x = RandomTensor(DT_FLOAT, RandomDims(2, kDefaultMaxRank));
@@ -1214,6 +1289,23 @@ TEST_F(OpTest, DynamicStitch) {
   });
 }
 
+TEST_F(OpTest, Elu) {
+  Repeatedly([this]() {
+    ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("Elu").Input(RandomTensor(DT_FLOAT)).Attr("T", DT_FLOAT));
+  });
+}
+
+TEST_F(OpTest, EluGrad) {
+  Repeatedly([this]() {
+    auto dims = RandomDims();
+    ExpectTfAndXlaOutputsAreClose(OpTestBuilder("EluGrad")
+                                      .Input(RandomTensor(DT_FLOAT, dims))
+                                      .Input(RandomTensor(DT_FLOAT, dims))
+                                      .Attr("T", DT_FLOAT));
+  });
+}
+
 TEST_F(OpTest, Equal) {
   Repeatedly([this]() {
     DataType type = Choose<DataType>({DT_INT32, DT_FLOAT});
@@ -2019,6 +2111,87 @@ TEST_F(OpTest, SoftplusGrad) {
   });
 }
 
+TEST_F(OpTest, SpaceToBatch) {
+  Repeatedly([this]() {
+    std::vector<int64> block_dims = RandomDims(4, 4, 0, 5);
+    const int num_block_dims = 2;
+    int64 block_size = RandomDim(0, 4);
+
+    std::vector<int64> input_dims(1 + num_block_dims + 1);
+    input_dims[0] = RandomDim();
+    for (int i = 0; i < num_block_dims; ++i) {
+      input_dims[1 + i] = block_dims[i] * block_size;
+    }
+    input_dims[1 + num_block_dims] = RandomDim();
+
+    std::vector<int64> padding_vals;
+    std::uniform_int_distribution<int> distribution(0, 7);
+    for (int i = 0; i < num_block_dims; ++i) {
+      int64 pad_before;
+      int64 pad_after;
+      do {
+        pad_before = distribution(generator());
+        pad_after = distribution(generator());
+      } while (pad_before + pad_after > input_dims[1 + i]);
+      input_dims[1 + i] -= pad_before + pad_after;
+      padding_vals.push_back(pad_before);
+      padding_vals.push_back(pad_after);
+    }
+    Tensor paddings;
+    CHECK(paddings.CopyFrom(AsIntTensor(DT_INT32, padding_vals),
+                            TensorShape({num_block_dims, 2})));
+
+    ExpectTfAndXlaOutputsAreClose(OpTestBuilder("SpaceToBatch")
+                                      .Input(RandomTensor(DT_FLOAT, input_dims))
+                                      .Input(paddings)
+                                      .Attr("T", DT_FLOAT)
+                                      .Attr("block_size", block_size));
+  });
+}
+
+TEST_F(OpTest, SpaceToBatchND) {
+  Repeatedly([this]() {
+    std::vector<int64> block_dims = RandomDims(1, 3, 0, 5);
+    int num_block_dims = block_dims.size();
+    std::vector<int64> remaining_dims = RandomDims(0, 3);
+    std::vector<int64> block_multipliers =
+        RandomDims(block_dims.size(), block_dims.size(), 0, 4);
+
+    std::vector<int64> input_dims(1 + num_block_dims + remaining_dims.size());
+    input_dims[0] = RandomDim();
+    for (int i = 0; i < num_block_dims; ++i) {
+      input_dims[1 + i] = block_dims[i] * block_multipliers[i];
+    }
+    std::copy(remaining_dims.begin(), remaining_dims.end(),
+              input_dims.begin() + 1 + num_block_dims);
+
+    std::vector<int64> padding_vals;
+    std::uniform_int_distribution<int> distribution(0, 7);
+    for (int i = 0; i < num_block_dims; ++i) {
+      int64 pad_before;
+      int64 pad_after;
+      do {
+        pad_before = distribution(generator());
+        pad_after = distribution(generator());
+      } while (pad_before + pad_after > input_dims[1 + i]);
+      input_dims[1 + i] -= pad_before + pad_after;
+      padding_vals.push_back(pad_before);
+      padding_vals.push_back(pad_after);
+    }
+    Tensor paddings;
+    CHECK(paddings.CopyFrom(AsIntTensor(DT_INT32, padding_vals),
+                            TensorShape({num_block_dims, 2})));
+
+    ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("SpaceToBatchND")
+            .Input(RandomTensor(DT_FLOAT, input_dims))
+            .Input(test::AsTensor<int32>(
+                std::vector<int32>(block_dims.begin(), block_dims.end())))
+            .Input(paddings)
+            .Attr("T", DT_FLOAT));
+  });
+}
+
 TEST_F(OpTest, SparseMatMul) {
   Repeatedly([this]() {
     int64 x = RandomDim();
@@ -2339,6 +2512,14 @@ TEST_F(OpTest, ZerosLike) {
   });
 }
 
+TEST_F(OpTest, OnesLike) {
+  Repeatedly([this]() {
+    DataType type = Choose<DataType>({DT_INT32, DT_FLOAT});
+    ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("OnesLike").Input(RandomTensor(type)).Attr("T", type));
+  });
+}
+
 }  // anonymous namespace
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tests/slice_ops_test.py b/tensorflow/compiler/tests/slice_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ddf2ee0dcb2b5f514ff9820c07f7cc10609ff66
--- /dev/null
+++ b/tensorflow/compiler/tests/slice_ops_test.py
@@ -0,0 +1,145 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for slicing."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+
+class SliceTest(XLATestCase):
+
+  def test1D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.slice(i, [2], [4])
+        params = {
+            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([2, 3, 4, 5], result)
+
+  def test3D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 3, 10])
+        with self.test_scope():
+          o = array_ops.slice(i, [1, 2, 2], [1, 1, 4])
+        params = {
+            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0]],
+                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+                 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7]],
+                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+                 [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9]]]
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([[[6, 5, 4, 3]]], result)
+
+
+
+class StridedSliceTest(XLATestCase):
+
+  def test1D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [2], [6], [2])
+        params = {
+            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([2, 4], result)
+
+  def test1DNegtiveStride(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [6], [2], [-2])
+        params = {
+            i: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([6, 4], result)
+
+  def test3D(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 3, 10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [0, 2, 2], [2, 3, 6], [1, 1, 2])
+        params = {
+            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0]],
+                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+                 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7]],
+                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+                 [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9]]]
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([[[1, 9]], [[6, 4]]], result)
+
+  def test3DNegativeStride(self):
+    for dtype in self.numeric_types:
+      with self.test_session():
+        i = array_ops.placeholder(dtype, shape=[3, 4, 10])
+        with self.test_scope():
+          o = array_ops.strided_slice(i, [2, 2, 6], [0, 0, 2], [-1, -1, -2])
+        params = {
+            i: [[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                 [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
+                 [5, 3, 1, 7, 9, 2, 4, 6, 8, 0],
+                 [4, 5, 2, 4, 3, 7, 6, 8, 9, 4]],
+                [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
+                 [4, 3, 4, 5, 7, 6, 5, 3, 4, 5],
+                 [8, 7, 6, 5, 4, 3, 2, 1, 8, 7],
+                 [7, 1, 7, 1, 8, 1, 8, 1, 3, 1]],
+                [[7, 5, 7, 5, 7, 5, 7, 5, 7, 5],
+                 [1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+                 [9, 8, 7, 9, 8, 7, 9, 8, 7, 9],
+                 [9, 9, 5, 5, 6, 6, 3, 3, 6, 6]]]
+        }
+        result = o.eval(feed_dict=params)
+
+        self.assertAllEqual([[[9, 8],
+                              [1, 1]],
+                             [[2, 4],
+                              [5, 7]]], result)
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c3b86c84b2b92089da0dfc0070a4a7b8a03c81a
--- /dev/null
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -0,0 +1,266 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for SpaceToBatch and BatchToSpace ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.platform import test
+
+
+def space_to_batch_direct(input_array, block_shape, paddings):
+  """Direct Python implementation of space-to-batch conversion.
+
+  This is used for tests only.
+
+  Args:
+    input_array: N-D array
+    block_shape: 1-D array of shape [num_block_dims].
+    paddings: 2-D array of shape [num_block_dims, 2].
+
+  Returns:
+    Converted tensor.
+  """
+  input_array = np.array(input_array)
+  block_shape = np.array(block_shape)
+  num_block_dims = len(block_shape)
+  paddings = np.array(paddings).reshape((len(block_shape), 2))
+
+  padded = np.pad(input_array,
+                  pad_width=([[0, 0]] + list(paddings) + [[0, 0]] *
+                             (input_array.ndim - 1 - num_block_dims)),
+                  mode="constant")
+  reshaped_padded_shape = [input_array.shape[0]]
+  output_shape = [input_array.shape[0] * np.prod(block_shape)]
+  for block_dim, block_shape_value in enumerate(block_shape):
+    reduced_size = padded.shape[block_dim + 1] // block_shape_value
+    reshaped_padded_shape.append(reduced_size)
+    output_shape.append(reduced_size)
+    reshaped_padded_shape.append(block_shape_value)
+  reshaped_padded_shape.extend(input_array.shape[num_block_dims + 1:])
+  output_shape.extend(input_array.shape[num_block_dims + 1:])
+
+  reshaped_padded = padded.reshape(reshaped_padded_shape)
+  permuted_reshaped_padded = np.transpose(reshaped_padded, (
+      list(np.arange(num_block_dims) * 2 + 2) + [0] +
+      list(np.arange(num_block_dims) * 2 + 1) + list(
+          np.arange(input_array.ndim - num_block_dims - 1) + 1 + num_block_dims
+          * 2)))
+  return permuted_reshaped_padded.reshape(output_shape)
+
+
+class SpaceToBatchTest(XLATestCase):
+  """Tests input-output pairs for the SpaceToBatch and BatchToSpace ops."""
+
+  def _testPad(self, inputs, paddings, block_size, outputs):
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self.float_types:
+        # outputs = space_to_batch(inputs)
+        placeholder = array_ops.placeholder(dtype)
+        x_tf = gen_array_ops._space_to_batch(
+            placeholder, paddings, block_size=block_size)
+        self.assertAllEqual(sess.run(x_tf, {placeholder: inputs}), outputs)
+        # inputs = batch_to_space(outputs)
+        x_tf = gen_array_ops._batch_to_space(
+            placeholder, paddings, block_size=block_size)
+        self.assertAllEqual(sess.run(x_tf, {placeholder: outputs}), inputs)
+
+  def _testOne(self, inputs, block_size, outputs):
+    paddings = np.zeros((2, 2), dtype=np.int32)
+    self._testPad(inputs, paddings, block_size, outputs)
+
+  # [1, 2, 2, 1] <-> [4, 1, 1, 1]
+  def testSmallInput2x2(self):
+    x_np = [[[[1], [2]], [[3], [4]]]]
+    block_size = 2
+    x_out = [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+    self._testOne(x_np, block_size, x_out)
+
+  # [1, 2, 2, 1] <-> [1, 3, 3, 1] (padding) <-> [9, 1, 1, 1]
+  def testSmallInput2x2Pad1x0(self):
+    x_np = [[[[1], [2]], [[3], [4]]]]
+    paddings = np.array([[1, 0], [1, 0]], dtype=np.int32)
+    block_size = 3
+    x_out = [[[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[1]]], [[[2]]], [[[0]]],
+             [[[3]]], [[[4]]]]
+    self._testPad(x_np, paddings, block_size, x_out)
+
+  # Test with depth larger than 1.
+  # [1, 2, 2, 3] <-> [4, 1, 1, 3]
+  def testDepthInput2x2(self):
+    x_np = [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]]
+    block_size = 2
+    x_out = [[[[1, 2, 3]]], [[[4, 5, 6]]], [[[7, 8, 9]]], [[[10, 11, 12]]]]
+    self._testOne(x_np, block_size, x_out)
+
+  # Test for larger input dimensions.
+  # [1, 4, 4, 1] <-> [4, 2, 2, 1]
+  def testLargerInput2x2(self):
+    x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]],
+             [[9], [10], [11], [12]], [[13], [14], [15], [16]]]]
+    block_size = 2
+    x_out = [[[[1], [3]], [[9], [11]]], [[[2], [4]], [[10], [12]]],
+             [[[5], [7]], [[13], [15]]], [[[6], [8]], [[14], [16]]]]
+    self._testOne(x_np, block_size, x_out)
+
+  # Test with batch larger than 1.
+  # [2, 2, 4, 1] <-> [8, 1, 2, 1]
+  def testBatchInput2x2(self):
+    x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]]],
+            [[[9], [10], [11], [12]], [[13], [14], [15], [16]]]]
+    block_size = 2
+    x_out = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+             [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+    self._testOne(x_np, block_size, x_out)
+
+  # Tests for larger input spatial dimensions AND batch larger than 1, to ensure
+  # that elements are correctly laid out spatially and properly interleaved
+  # along the batch dimension.
+  # [2, 4, 4, 1] <-> [8, 2, 2, 1]
+  def testLargerInputBatch2x2(self):
+    x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]],
+             [[9], [10], [11], [12]], [[13], [14], [15], [16]]],
+            [[[17], [18], [19], [20]], [[21], [22], [23], [24]],
+             [[25], [26], [27], [28]], [[29], [30], [31], [32]]]]
+    x_out = [[[[1], [3]], [[9], [11]]], [[[17], [19]], [[25], [27]]],
+             [[[2], [4]], [[10], [12]]], [[[18], [20]], [[26], [28]]],
+             [[[5], [7]], [[13], [15]]], [[[21], [23]], [[29], [31]]],
+             [[[6], [8]], [[14], [16]]], [[[22], [24]], [[30], [32]]]]
+    block_size = 2
+    self._testOne(x_np, block_size, x_out)
+
+
+class SpaceToBatchNDTest(XLATestCase):
+  """Tests input-output pairs for the SpaceToBatchND and BatchToSpaceND ops."""
+
+  def _testPad(self, inputs, block_shape, paddings, outputs):
+    block_shape = np.array(block_shape)
+    paddings = np.array(paddings).reshape((len(block_shape), 2))
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self.float_types:
+        placeholder = array_ops.placeholder(dtype)
+        # outputs = space_to_batch(inputs)
+        x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings)
+        self.assertAllEqual(sess.run(x_tf, {placeholder: inputs}), outputs)
+        # inputs = batch_to_space(outputs)
+        placeholder = array_ops.placeholder(dtype)
+        x_tf = array_ops.batch_to_space_nd(placeholder, block_shape, paddings)
+        self.assertAllEqual(sess.run(x_tf, {placeholder: outputs}), inputs)
+
+  def _testDirect(self, input_shape, block_shape, paddings):
+    inputs = np.arange(np.prod(input_shape), dtype=np.float32)
+    inputs = inputs.reshape(input_shape)
+    self._testPad(inputs, block_shape, paddings,
+                  space_to_batch_direct(inputs, block_shape, paddings))
+
+  def testZeroBlockDimsZeroRemainingDims(self):
+    self._testPad(
+        inputs=[1, 2],
+        block_shape=[],
+        paddings=[],
+        outputs=[1, 2],)
+
+  def testZeroBlockDimsOneRemainingDim(self):
+    self._testPad(
+        inputs=[[1, 2], [3, 4]],
+        block_shape=[],
+        paddings=[],
+        outputs=[[1, 2], [3, 4]])
+
+    # Same thing, but with a no-op block dim.
+    self._testPad(
+        inputs=[[1, 2], [3, 4]],
+        block_shape=[1],
+        paddings=[[0, 0]],
+        outputs=[[1, 2], [3, 4]])
+
+  def testZeroBlockDimsTwoRemainingDims(self):
+    self._testPad(
+        inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+        block_shape=[],
+        paddings=[],
+        outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+
+    # Same thing, but with a no-op block dim.
+    self._testPad(
+        inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+        block_shape=[1],
+        paddings=[[0, 0]],
+        outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+
+    # Same thing, but with two no-op block dims.
+    self._testPad(
+        inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+        block_shape=[1, 1],
+        paddings=[[0, 0], [0, 0]],
+        outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+
+  def testOneBlockDimZeroRemainingDims(self):
+    self._testPad(
+        inputs=[[1, 2, 3], [4, 5, 6]],
+        block_shape=[2],
+        paddings=[1, 0],
+        outputs=[[0, 2], [0, 5], [1, 3], [4, 6]])
+
+  def testOneBlockDimOneRemainingDim(self):
+    self._testPad(
+        inputs=[[[1, 11], [2, 21], [3, 31]], [[4, 41], [5, 51], [6, 61]]],
+        block_shape=[2],
+        paddings=[1, 0],
+        outputs=[[[0, 0], [2, 21]], [[0, 0], [5, 51]], [[1, 11], [3, 31]],
+                 [[4, 41], [6, 61]]])
+
+  def testDirect(self):
+    # Test with zero-size remaining dimension.
+    self._testDirect(
+        input_shape=[3, 1, 2, 0], block_shape=[3], paddings=[[0, 2]])
+
+    # Test with zero-size blocked dimension.
+    self._testDirect(
+        input_shape=[3, 0, 2, 5], block_shape=[3], paddings=[[0, 0]])
+
+    # Test with padding up from zero size.
+    self._testDirect(
+        input_shape=[3, 0, 2, 5], block_shape=[3], paddings=[[1, 2]])
+
+    self._testDirect(
+        input_shape=[3, 3, 4, 5, 2],
+        block_shape=[3, 4, 2],
+        paddings=[[1, 2], [0, 0], [3, 0]])
+
+    self._testDirect(
+        input_shape=[3, 3, 4, 5, 2],
+        block_shape=[3, 4, 2, 2],
+        paddings=[[1, 2], [0, 0], [3, 0], [0, 0]])
+
+    self._testDirect(
+        input_shape=[3, 2, 2, 3, 4, 5, 2, 5],
+        block_shape=[1, 1, 3, 4, 2, 2],
+        paddings=[[0, 0], [0, 0], [1, 2], [0, 0], [3, 0], [0, 0]])
+
+    self._testDirect(
+        input_shape=[3, 2, 2, 3, 4, 5, 2, 5],
+        block_shape=[1, 1, 3, 4, 2, 2, 1],
+        paddings=[[0, 0], [0, 0], [1, 2], [0, 0], [3, 0], [0, 0], [0, 0]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index c96826fd0a64b2d8fb02da22cfdc72edbb674317..51d8786ce3d7148e6863be7e1557a8bb23153d63 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -182,6 +182,11 @@ class UnaryOpsTest(XLATestCase):
                [0.7310586, 0.880797, 0.95257413, 0.98201376]],
               dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          math_ops.sigmoid,
+          np.array([-300, -150, 0, 150, 300], dtype=dtype),
+          expected=np.array([0, 0, 0.5, 1, 1], dtype=dtype))
+
       self._assertOpOutputMatchesExpected(
           math_ops.sqrt,
           np.array([[4, 9]], dtype=dtype),
@@ -209,6 +214,11 @@ class UnaryOpsTest(XLATestCase):
                [-3.4401896, -2.4401896, -1.4401897, -0.44018969]],
               dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          nn_ops.elu,
+          np.array([[-1, 0, 1]], dtype=dtype),
+          expected=np.array([[-0.63212056, 0, 1]], dtype=dtype))
+
       self._assertOpOutputMatchesExpected(
           nn_ops.relu,
           np.array([[-1, 1]], dtype=dtype),
@@ -257,6 +267,11 @@ class UnaryOpsTest(XLATestCase):
           np.array([[4, 3], [2, 1]], dtype=dtype),
           expected=np.array([[0, 0], [0, 0]], dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          array_ops.ones_like,
+          np.array([[4, 3], [2, 1]], dtype=dtype),
+          expected=np.array([[1, 1], [1, 1]], dtype=dtype))
+
   def testLogicalOps(self):
     self._assertOpOutputMatchesExpected(
         math_ops.logical_not,
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index dcb9e2db2f8ca7ef6e89cb9c6493d15dcaacd46e..70dacd9de4b95dfb77986dfaf177c16b758406f1 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for XLA JIT compiler."""
+"""Tests for reading and writing variables."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,11 +21,14 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -36,6 +39,21 @@ from tensorflow.python.training.gradient_descent import GradientDescentOptimizer
 class VariableOpsTest(XLATestCase):
   """Test cases for resource variable operators."""
 
+  def testOneWriteOneOutput(self):
+    # Regression test for a bug where computations with one non-constant
+    # output and one variable update were mishandled.
+    for dtype in self.numeric_types:
+      init = np.array([[1, 2], [3, 4]], dtype=dtype)
+      with self.test_session() as sess, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        sess.run(variables.variables_initializer([v]))
+        p = array_ops.placeholder(dtype)
+        x = v.assign_add(p)
+        with ops.control_dependencies([x]):
+          y = v.read_value()
+        self.assertAllClose(np.array([[2, 3], [4, 5]], dtype=dtype),
+                            sess.run(y, {p: 1}))
+
   def testReadWrite(self):
     """Tests initialization, reading, and writing a resource variable."""
     with self.test_session() as session:
@@ -98,5 +116,68 @@ class VariableOpsTest(XLATestCase):
       self.assertAllClose(np.array([1.9, 2.9], dtype=np.float32), vb, rtol=1e-4)
 
 
+class StridedSliceAssignChecker(object):
+  """Compares the results of a slice assignment using Tensorflow and numpy."""
+
+  def __init__(self, test, x, dtype):
+    self.dtype = dtype
+    self.test = test
+    self.x_np = np.array(x).astype(dtype)
+
+  def __setitem__(self, index, value):
+    value = np.array(value).astype(self.dtype)
+
+    with self.test.test_session() as sess, self.test.test_scope():
+      x = constant_op.constant(self.x_np, dtype=self.dtype)
+      var = resource_variable_ops.ResourceVariable(x)
+      sess.run(variables.variables_initializer([var]))
+      val = sess.run(var[index].assign(value))
+      # val_copy is used to check that tf.assign works equivalently to the
+      # assign method above.
+      val_copy = sess.run(state_ops.assign(var[index], value))
+      valnp = np.copy(self.x_np)
+      valnp[index] = np.array(value)
+      self.test.assertAllEqual(val, valnp)
+      self.test.assertAllEqual(val_copy, valnp)
+
+
+class SliceAssignTest(XLATestCase):
+
+  def testSliceAssign(self):
+    for dtype in self.numeric_types:
+      checker = StridedSliceAssignChecker(self, [[1, 2, 3], [4, 5, 6]],
+                                          dtype=dtype)
+      # No-op assignment
+      checker[:] = [[10, 20, 30], [40, 50, 60]]
+      # Checks trivial (1,1) shape tensor
+      checker[1:2, 1:2] = [[66]]
+      # shrink shape changes
+      checker[1:2, 1] = [66]
+      checker[1, 1:2] = [66]
+      checker[1, 1] = 66
+      # newaxis shape changes
+      checker[:, None, :] = [[[10, 20, 30]], [[40, 50, 50]]]
+      # shrink and newaxis
+      checker[None, None, 0, 0:1] = [[[99]]]
+      # Non unit strides
+      checker[::1, 1::-1] = [[3, 33], [4, 44]]
+      # degenerate interval
+      checker[8:10, 0] = []
+      checker[8:10, 8:10] = [[]]
+
+      # Assign vector to scalar (rank-0) using newaxis
+      checker2 = StridedSliceAssignChecker(self, 222, dtype=dtype)
+      checker2[()] = 6  # no indices
+      checker2[...] = 6  # ellipsis
+      checker2[None] = [6]  # new axis
+
+  def testUninitialized(self):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "uninitialized variable"):
+      with self.test_session() as sess, self.test_scope():
+        v = resource_variable_ops.ResourceVariable([1, 2])
+        sess.run(v[:].assign([1, 2]))
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py
index 1388a892ba5a1d07c05eedf277085099923ae901..f5c228f8305d740b994dadc34c93b4e0ae32d785 100644
--- a/tensorflow/compiler/tests/xla_device_test.py
+++ b/tensorflow/compiler/tests/xla_device_test.py
@@ -18,15 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -48,34 +43,6 @@ class XlaDeviceTest(test.TestCase):
       result = sess.run(w, {x: [1.5, 0.5]})
     self.assertAllClose(result, [12., 2.], rtol=1e-3)
 
-  def testLoops(self):
-    """Tests that loops work on XLA devices."""
-
-    with session_lib.Session() as session:
-      x = array_ops.placeholder(dtypes.float32)
-      with ops.device("device:XLA_CPU:0"):
-        c = lambda i, _: math_ops.less(i, 5)
-        b = lambda i, x: (i + 1, x * 2.0 + 1.0)
-        _, y = control_flow_ops.while_loop(c, b, (constant_op.constant(0), x))
-
-      result = session.run(y, {x: np.float32(2)})
-      self.assertAllClose(result, np.float32(95), rtol=1e-3)
-
-  def testCond(self):
-    """Tests that tf.cond works on XLA devices."""
-
-    with session_lib.Session() as session:
-      x = array_ops.placeholder(dtypes.float32)
-      y = array_ops.placeholder(dtypes.float32)
-      c = array_ops.placeholder(dtypes.bool)
-      with ops.device("device:XLA_CPU:0"):
-        z = x + 1.0
-        w = control_flow_ops.cond(c, lambda: z, lambda: y)
-        t = math_ops.add(z, w)
-
-      result = session.run(t, {x: np.float32(2), y: np.float32(4), c: True})
-      self.assertAllClose(result, np.float32(6), rtol=1e-3)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 7a18c1e3750afa276d6721ffea9a4d481cb37136..12537b9765469da6d906d556ff69685149e2cc32 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -45,6 +45,7 @@ cc_library(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation",
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index 53aa749a0a90bf3fad06ed4bc57c4327c5d24dcc..c4cbaebb258fc19552227a51de616429e2e6221b 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -35,6 +35,9 @@ Status BackwardsConstAnalysis(const Graph& g,
       {"Any", "reduction_indices"},
       {"ArgMax", "dimension"},
       {"AvgPoolGrad", "orig_input_shape"},
+      {"BatchToSpace", "crops"},
+      {"BatchToSpaceND", "block_shape"},
+      {"BatchToSpaceND", "crops"},
       {"BroadcastGradientArgs", "s0"},
       {"BroadcastGradientArgs", "s1"},
       {"Concat", "concat_dim"},
@@ -65,10 +68,16 @@ Status BackwardsConstAnalysis(const Graph& g,
       {"Range", "limit"},
       {"Range", "delta"},
       {"Reshape", "shape"},
+      {"ResourceStridedSliceAssign", "begin"},
+      {"ResourceStridedSliceAssign", "end"},
+      {"ResourceStridedSliceAssign", "strides"},
       {"Reverse", "dims"},
       {"ReverseV2", "axis"},
       {"Slice", "begin"},
       {"Slice", "size"},
+      {"SpaceToBatch", "paddings"},
+      {"SpaceToBatchND", "block_shape"},
+      {"SpaceToBatchND", "paddings"},
       {"Split", "split_dim"},
       {"SplitV", "split_dim"},
       {"SplitV", "size_splits"},
@@ -102,7 +111,7 @@ Status BackwardsConstAnalysis(const Graph& g,
     if (must_be_const.find(node) != must_be_const.end()) {
       if (node->type_string() == "_Arg") {
         int index;
-        status = GetNodeAttr(node->def(), "index", &index);
+        status = GetNodeAttr(node->attrs(), "index", &index);
         if (!status.ok()) return;
         compile_time_const_args->at(index) = true;
         return;
@@ -118,8 +127,8 @@ Status BackwardsConstAnalysis(const Graph& g,
     if (range.first == range.second) return;
 
     NameRangeMap input_name_ranges;
-    status = NameRangesForNode(node->def(), node->op_def(), &input_name_ranges,
-                               nullptr);
+    status =
+        NameRangesForNode(*node, node->op_def(), &input_name_ranges, nullptr);
     if (!status.ok()) return;
 
     for (auto it = range.first; it != range.second; ++it) {
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 2ee80a41e820b5ecc92816c84b6de9625f319b19..81b065689da4d8314c6ae9480d73745830fc31f5 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -14,18 +14,21 @@ tf_kernel_library(
     name = "xla_ops",
     srcs = [
         "aggregate_ops.cc",
+        "arg_op.cc",
         "batch_matmul_op.cc",
+        "batchtospace_op.cc",
         "bcast_ops.cc",
         "bias_ops.cc",
         "binary_ops.cc",
         "cast_op.cc",
         "concat_op.cc",
+        "const_op.cc",
         "conv_ops.cc",
         "cwise_ops.cc",
-        "declaration_op.cc",
         "depthwise_conv_ops.cc",
         "diag_op.cc",
         "dynamic_stitch_op.cc",
+        "elu_op.cc",
         "fill_op.cc",
         "function_ops.cc",
         "identity_op.cc",
@@ -49,6 +52,7 @@ tf_kernel_library(
         "shape_op.cc",
         "slice_op.cc",
         "softmax_op.cc",
+        "spacetobatch_op.cc",
         "split_op.cc",
         "strided_slice_op.cc",
         "tile_ops.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/declaration_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
similarity index 56%
rename from tensorflow/compiler/tf2xla/kernels/declaration_op.cc
rename to tensorflow/compiler/tf2xla/kernels/arg_op.cc
index be2ce038016e852e48c312e26bf959ca5b9215af..d6897d6e3313414a5fd781f8a71ce143d5db2614 100644
--- a/tensorflow/compiler/tf2xla/kernels/declaration_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -23,58 +23,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-// This OpKernel implements the Constant Op for XLA JIT
-// devices. It extracts the constant Tensor from the Proto at kernel
-// construction time, and then every time the Constant Op is executed
-// an expression containing the constant is compiled.
-class ConstantDeclarationOp : public XlaOpKernel {
- public:
-  explicit ConstantDeclarationOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx), tensor_(ctx->output_type(0)) {
-    const TensorProto* proto = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
-    // MakeTensorFromProto uses the cpu_allocator, so tensor_ is a
-    // "real" tensor backed by CPU memory, holding the value of the
-    // constant.
-    OP_REQUIRES_OK(ctx, MakeTensorFromProto(*proto, &tensor_));
-    OP_REQUIRES(
-        ctx, ctx->output_type(0) == tensor_.dtype(),
-        errors::InvalidArgument(
-            "Type mismatch between value (", DataTypeString(tensor_.dtype()),
-            ") and dtype (", DataTypeString(ctx->output_type(0)), ")"));
-  }
-
-  void Compile(XlaOpKernelContext* ctx) override {
-    ctx->SetConstantOutput(0, tensor_);
-  }
-
- private:
-  // Extract the value of the constant from the Proto during Op kernel
-  // construction. The constant must be stored in a Tensor allocated
-  // using the cpu_allocator so that it is backed by real memory. The
-  // OpKernelConstruction's default allocator is the JITAllocator
-  // which only allocates enough space for metadata for each Tensor.
-  static Status MakeTensorFromProto(const TensorProto& tensor_proto,
-                                    Tensor* tensor) {
-    Tensor parsed(tensor_proto.dtype());
-    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
-      return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                     tensor_proto.DebugString());
-    }
-    *tensor = parsed;
-    return Status::OK();
-  }
-
-  // This is a "real" tensor backed by CPU memory, containing the
-  // constant values.
-  Tensor tensor_;
-  TF_DISALLOW_COPY_AND_ASSIGN(ConstantDeclarationOp);
-};
-
-// XLA_* devices also register a "real" Identity operator so we suppress the
-// dummy operator using CompilationOnly().
-REGISTER_XLA_OP(Name("Const").CompilationOnly(), ConstantDeclarationOp);
-
 // This OpKernel implements the _Arg Op for XLA JIT devices. It
 // associates its output with one of the arguments to a
 // subcomputation.
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4bd47ee50090722801329466cc88d34cd2449b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -0,0 +1,186 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+void BatchToSpace(XlaOpKernelContext* ctx,
+                  const xla::ComputationDataHandle& input, DataType input_dtype,
+                  const TensorShape& input_tensor_shape,
+                  gtl::ArraySlice<int64> block_shape,
+                  const xla::Literal& crops) {
+  const int input_rank = input_tensor_shape.dims();
+  const gtl::InlinedVector<int64, 4> input_shape =
+      input_tensor_shape.dim_sizes();
+  const int block_rank = block_shape.size();
+
+  OP_REQUIRES(
+      ctx, input_rank >= 1 + block_rank,
+      errors::InvalidArgument("input rank should be >= ", 1 + block_rank,
+                              " instead of ", input_rank));
+  gtl::ArraySlice<int64> remainder_shape(input_shape);
+  remainder_shape.remove_prefix(1 + block_rank);
+
+  OP_REQUIRES(
+      ctx,
+      xla::ShapeUtil::Rank(crops.shape()) == 2 &&
+          block_rank == xla::ShapeUtil::GetDimension(crops.shape(), 0) &&
+          2 == xla::ShapeUtil::GetDimension(crops.shape(), 1),
+      errors::InvalidArgument("crops should have shape [", block_rank,
+                              ", 2] instead of ",
+                              xla::ShapeUtil::HumanString(crops.shape())));
+
+  xla::ComputationBuilder* b = ctx->builder();
+  const int64 batch_size = input_shape[0];
+
+  // Compute the product of the block_shape values.
+  int64 block_num_elems = 1;
+  for (int i = 0; i < block_rank; ++i) {
+    block_num_elems *= block_shape[i];
+  }
+  OP_REQUIRES(ctx, block_num_elems > 0,
+              errors::InvalidArgument(
+                  "The product of the block dimensions must be positive"));
+
+  // 1. Reshape `input` to `reshaped` of shape:
+  //      [block_shape[0], ..., block_shape[M-1],
+  //       batch / prod(block_shape),
+  //       input_shape[1], ..., input_shape[N-1]]
+
+  OP_REQUIRES(
+      ctx, batch_size % block_num_elems == 0,
+      errors::InvalidArgument("Input batch dimension (", batch_size,
+                              ") is not divisible by product of block sizes (",
+                              block_num_elems, ")"));
+  std::vector<int64> reshaped_shape(input_rank + block_rank);
+  std::copy(block_shape.begin(), block_shape.end(), reshaped_shape.begin());
+  reshaped_shape[block_rank] = batch_size / block_num_elems;
+  std::copy(input_shape.begin() + 1, input_shape.end(),
+            reshaped_shape.begin() + block_rank + 1);
+  xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape);
+
+  // 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+  //      [batch / prod(block_shape),
+  //
+  //       input_shape[1], block_shape[0],
+  //       ...,
+  //       input_shape[M], block_shape[M-1],
+  //
+  //       input_shape[M+1], ..., input_shape[N-1]]
+  std::vector<int64> permutation(reshaped_shape.size());
+  permutation[0] = block_rank;
+  for (int i = 0; i < block_rank; ++i) {
+    permutation[1 + 2 * i] = block_rank + 1 + i;
+    permutation[1 + 2 * i + 1] = i;
+  }
+  std::iota(permutation.begin() + 1 + block_rank * 2, permutation.end(),
+            1 + block_rank * 2);
+  xla::ComputationDataHandle permuted = b->Transpose(reshaped, permutation);
+
+  // 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+  //      [batch / prod(block_shape),
+  //
+  //       input_shape[1] * block_shape[0],
+  //       ...,
+  //       input_shape[M] * block_shape[M-1],
+  //
+  //       input_shape[M+1],
+  //       ...,
+  //       input_shape[N-1]]
+  std::vector<int64> reshaped_permuted_shape(input_rank);
+  reshaped_permuted_shape[0] = batch_size / block_num_elems;
+  for (int i = 0; i < block_rank; ++i) {
+    reshaped_permuted_shape[1 + i] = block_shape[i] * input_shape[1 + i];
+  }
+  std::copy(remainder_shape.begin(), remainder_shape.end(),
+            reshaped_permuted_shape.begin() + 1 + block_rank);
+
+  xla::ComputationDataHandle reshaped_permuted =
+      b->Reshape(permuted, reshaped_permuted_shape);
+
+  // 4. Crop the start and end of dimensions `[1, ..., M]` of
+  //    `reshaped_permuted` according to `crops` to produce the output of shape:
+  //      [batch / prod(block_shape),
+  //
+  //       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+  //       ...,
+  //       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+  //
+  //       input_shape[M+1], ..., input_shape[N-1]]
+  std::vector<int64> start_indices(input_rank, 0);
+  std::vector<int64> end_indices = reshaped_permuted_shape;
+  for (int i = 0; i < block_rank; ++i) {
+    int64 crop_start = xla::LiteralUtil::Get<int64>(crops, {i, 0});
+    int64 crop_end = xla::LiteralUtil::Get<int64>(crops, {i, 1});
+    OP_REQUIRES(ctx, crop_start >= 0 && crop_end >= 0,
+                errors::InvalidArgument("Crops must be non-negative"));
+    start_indices[1 + i] = crop_start;
+    end_indices[1 + i] -= crop_end;
+    OP_REQUIRES(
+        ctx, start_indices[1 + i] <= end_indices[1 + i],
+        errors::InvalidArgument(
+            "Cropped size must be non-negative: start: ", crop_start,
+            " end: ", crop_end, " size ", reshaped_permuted_shape[1 + i]));
+  }
+  xla::ComputationDataHandle output =
+      b->Slice(reshaped_permuted, start_indices, end_indices);
+  ctx->SetOutput(0, output);
+}
+
+class BatchToSpaceNDOp : public XlaOpKernel {
+ public:
+  explicit BatchToSpaceNDOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    std::vector<int64> block_shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &block_shape));
+
+    xla::Literal crops;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsInt64Literal(2, &crops));
+
+    BatchToSpace(ctx, ctx->Input(0), input_type(0), ctx->InputShape(0),
+                 block_shape, crops);
+  }
+};
+REGISTER_XLA_OP(Name("BatchToSpaceND"), BatchToSpaceNDOp);
+
+class BatchToSpaceOp : public XlaOpKernel {
+ public:
+  explicit BatchToSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
+    OP_REQUIRES(
+        ctx, block_size_ > 1,
+        errors::InvalidArgument("Block size should be > 1: ", block_size_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::Literal crops;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsInt64Literal(1, &crops));
+
+    BatchToSpace(ctx, ctx->Input(0), input_type(0), ctx->InputShape(0),
+                 {block_size_, block_size_}, crops);
+  }
+
+ private:
+  int block_size_;
+};
+REGISTER_XLA_OP(Name("BatchToSpace"), BatchToSpaceOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ad676e7a2bb3d3f28ecb98164323cbf1e32f61a9
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -0,0 +1,111 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+
+namespace tensorflow {
+namespace {
+
+class ConstOp : public XlaOpKernel {
+ public:
+  explicit ConstOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    const TensorProto* proto = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
+    proto_ = *proto;
+    OP_REQUIRES(
+        ctx, ctx->output_type(0) == proto_.dtype(),
+        errors::InvalidArgument("Type mismatch between value (",
+                                DataTypeString(proto_.dtype()), ") and dtype (",
+                                DataTypeString(ctx->output_type(0)), ")"));
+    OP_REQUIRES_OK(ctx, TensorShape::IsValidShape(proto_.tensor_shape()));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape shape(proto_.tensor_shape());
+
+    xla::ComputationBuilder* b = ctx->builder();
+
+    // To avoid blowups for large constants filled with the same value,
+    // recognize that case and emit a scalar broadcast instead.
+    if (shape.num_elements() > 1) {
+      switch (proto_.dtype()) {
+        case DT_BOOL:
+          if (proto_.bool_val_size() == 1) {
+            ctx->SetOutput(0,
+                           b->Broadcast(b->ConstantR0<bool>(proto_.bool_val(0)),
+                                        shape.dim_sizes()));
+            return;
+          }
+          break;
+        case DT_FLOAT:
+          if (proto_.float_val_size() == 1) {
+            ctx->SetOutput(
+                0, b->Broadcast(b->ConstantR0<float>(proto_.float_val(0)),
+                                shape.dim_sizes()));
+            return;
+          }
+          break;
+        case DT_DOUBLE:
+          if (proto_.double_val_size() == 1) {
+            ctx->SetOutput(
+                0, b->Broadcast(b->ConstantR0<double>(proto_.double_val(0)),
+                                shape.dim_sizes()));
+            return;
+          }
+          break;
+        case DT_INT32:
+          if (proto_.int_val_size() == 1) {
+            ctx->SetOutput(0,
+                           b->Broadcast(b->ConstantR0<int32>(proto_.int_val(0)),
+                                        shape.dim_sizes()));
+            return;
+          }
+          break;
+        case DT_INT64:
+          if (proto_.int64_val_size() == 1) {
+            ctx->SetOutput(
+                0, b->Broadcast(b->ConstantR0<int64>(proto_.int64_val(0)),
+                                shape.dim_sizes()));
+            return;
+          }
+          break;
+        default:
+          break;
+      }
+    }
+
+    // General case
+    Tensor tensor(proto_.dtype());
+    OP_REQUIRES(ctx, tensor.FromProto(cpu_allocator(), proto_),
+                errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                        proto_.DebugString()));
+    ctx->SetConstantOutput(0, tensor);
+  }
+
+ private:
+  TensorProto proto_;
+  TF_DISALLOW_COPY_AND_ASSIGN(ConstOp);
+};
+
+// XLA_* devices also register a "real" Const operator so we suppress the
+// dummy operator using CompilationOnly().
+REGISTER_XLA_OP(Name("Const").CompilationOnly(), ConstOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62a5e1bd421a75fb0a8fa6eacd58e4aaa2f02236
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Native XLA implementations of XLA Elu Ops
+
+#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/no_op.h"
+
+namespace tensorflow {
+namespace {
+
+class EluOp : public XlaOpKernel {
+ public:
+  explicit EluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  // Computes the max of the scalar input x and 0.
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+    const auto zero = XlaHelpers::Zero(b, input_type(0));
+    const auto one = XlaHelpers::One(b, input_type(0));
+    const auto pred = b->Gt(ctx->Input(0), zero);
+    const auto expm1 = b->Sub(b->Exp(ctx->Input(0)), one);
+    ctx->SetOutput(0, b->Select(pred, ctx->Input(0), expm1));
+  }
+};
+
+class EluGradOp : public XlaOpKernel {
+ public:
+  explicit EluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  // Return the lhs (incoming gradient) if the rhs (input feature) > 0,
+  // otherwise return lhs * (1 + rhs).
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+    const auto zero = XlaHelpers::Zero(b, input_type(0));
+    const auto one = XlaHelpers::One(b, input_type(0));
+    const auto grad = ctx->Input(0);
+    const auto activation = ctx->Input(1);
+    const auto exp_grad = b->Mul(grad, b->Add(activation, one));
+    const auto pred = b->Gt(activation, zero);
+    ctx->SetOutput(0, b->Select(pred, grad, exp_grad));
+  }
+};
+
+REGISTER_XLA_OP(Name("Elu"), EluOp);
+REGISTER_XLA_OP(Name("EluGrad"), EluGradOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/function_ops.cc b/tensorflow/compiler/tf2xla/kernels/function_ops.cc
index d718f98545f66cb79a77d758a3fb7ee486d87b4b..8dacb6627bde516c92cb07b747207adbe85ada5b 100644
--- a/tensorflow/compiler/tf2xla/kernels/function_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/function_ops.cc
@@ -68,7 +68,8 @@ class SymbolicGradientOp : public AsyncOpKernel {
                       done);
 
     OP_REQUIRES_OK_ASYNC(
-        ctx, lib->Instantiate(kGradientOp, def().attr(), &handle_), done);
+        ctx, lib->Instantiate(kGradientOp, AttrSlice(&def().attr()), &handle_),
+        done);
 
     FunctionLibraryRuntime::Options opts;
     opts.step_id = ctx->step_id();
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
index eff23bd77d23afc882c67f8168270d1cb4413977..691a0b972d5c09ad632d706d72a1b60988730986 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
@@ -63,7 +64,6 @@ EIGEN_STRONG_INLINE void gather_float_int32_xla_impl(float* out, void** data) {
 
 // Implements gather on CPU. This is called by an XLA custom call, set up by
 // gather_op.cc.
-extern "C" void __attribute__((visibility("default")))
-gather_float_int32_xla_impl(float* out, void** data) {
+extern "C" void TF_EXPORT gather_float_int32_xla_impl(float* out, void** data) {
   tensorflow::gather_float_int32_xla_impl(out, data);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
index ae31f6f2006959c03941a1eb04b31aecf52424b0..3dff6e2737bf1af7f5d646928e740fa895692a03 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
@@ -63,7 +64,6 @@ EIGEN_STRONG_INLINE void gather_float_int64_xla_impl(float* out, void** data) {
 
 // Implements gather on CPU. This is called by an XLA custom call, set up by
 // gather_op.cc.
-extern "C" void __attribute__((visibility("default")))
-gather_float_int64_xla_impl(float* out, void** data) {
+extern "C" void TF_EXPORT gather_float_int64_xla_impl(float* out, void** data) {
   tensorflow::gather_float_int64_xla_impl(out, data);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
index 0033a949a372684caadce70bf46a996a942e9ec4..afbd64ca5038378d48744d6d773e0dfb1376e1f9 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -43,7 +44,6 @@ EIGEN_STRONG_INLINE void argmax_float_1d_xla_impl(void* out, void** data) {
 
 // Implements argmax on CPU. This is called by an XLA custom call, set up by
 // index_ops.cc.
-extern "C" void __attribute__((visibility("default")))
-argmax_float_1d_xla_impl(void* out, void** data) {
+extern "C" void TF_EXPORT argmax_float_1d_xla_impl(void* out, void** data) {
   tensorflow::argmax_float_1d_xla_impl(out, data);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
index be8ad2317c9ba6a39f839c4a535440fb94365aa9..841ff2f4df79fdd790ee3aace9e38aaeb01a3080 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_2d.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -45,7 +46,6 @@ EIGEN_STRONG_INLINE void argmax_float_2d_xla_impl(void* out, void** data) {
 
 // Implements argmax on CPU. This is called by an XLA custom call, set up by
 // index_ops.cc.
-extern "C" void __attribute__((visibility("default")))
-argmax_float_2d_xla_impl(void* out, void** data) {
+extern "C" void TF_EXPORT argmax_float_2d_xla_impl(void* out, void** data) {
   tensorflow::argmax_float_2d_xla_impl(out, data);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 74e3297dc3340d9e98e149065a738c3d2e73cf45..24a99f253d6dc8bb699fff587c363b12c227e821 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -241,5 +241,19 @@ class ZerosLikeOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("ZerosLike"), ZerosLikeOp);
 
+class OnesLikeOp : public XlaOpKernel {
+ public:
+  explicit OnesLikeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape(0);
+
+    auto one = XlaHelpers::One(ctx->builder(), input_type(0));
+    ctx->SetOutput(0, ctx->builder()->Broadcast(one, input_shape.dim_sizes()));
+  }
+};
+
+REGISTER_XLA_OP(Name("OnesLike"), OnesLikeOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f15b354cb26d390352d866a8e827970f7c8b0c7f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
@@ -0,0 +1,190 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+void SpaceToBatch(XlaOpKernelContext* ctx,
+                  const xla::ComputationDataHandle& input, DataType input_dtype,
+                  const TensorShape& input_tensor_shape,
+                  gtl::ArraySlice<int64> block_shape,
+                  const xla::Literal& paddings) {
+  const int input_rank = input_tensor_shape.dims();
+  const gtl::InlinedVector<int64, 4> input_shape =
+      input_tensor_shape.dim_sizes();
+  const int block_rank = block_shape.size();
+
+  OP_REQUIRES(
+      ctx, input_rank >= 1 + block_rank,
+      errors::InvalidArgument("input rank should be >= ", 1 + block_rank,
+                              " instead of ", input_rank));
+  gtl::ArraySlice<int64> remainder_shape(input_shape);
+  remainder_shape.remove_prefix(1 + block_rank);
+
+  OP_REQUIRES(
+      ctx,
+      xla::ShapeUtil::Rank(paddings.shape()) == 2 &&
+          block_rank == xla::ShapeUtil::GetDimension(paddings.shape(), 0) &&
+          2 == xla::ShapeUtil::GetDimension(paddings.shape(), 1),
+      errors::InvalidArgument("paddings should have shape [", block_rank,
+                              ", 2] instead of ",
+                              xla::ShapeUtil::HumanString(paddings.shape())));
+
+  xla::ComputationBuilder* b = ctx->builder();
+
+  // 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+  //  input according to `paddings` to produce `padded` of shape `padded_shape`.
+  xla::PaddingConfig padding_config;
+  std::vector<int64> padded_shape(input_shape.begin(), input_shape.end());
+  int64 block_num_elems = 1LL;
+  padding_config.add_dimensions();  // Don't pad the batch dimension.
+  for (int i = 0; i < block_rank; ++i) {
+    auto* dim = padding_config.add_dimensions();
+    int64 pad_start = xla::LiteralUtil::Get<int64>(paddings, {i, 0});
+    int64 pad_end = xla::LiteralUtil::Get<int64>(paddings, {i, 1});
+    OP_REQUIRES(ctx, pad_start >= 0 && pad_end >= 0,
+                errors::InvalidArgument("Paddings must be non-negative"));
+    dim->set_edge_padding_low(pad_start);
+    dim->set_edge_padding_high(pad_end);
+    padded_shape[1 + i] += pad_start + pad_end;
+    block_num_elems *= block_shape[i];
+  }
+  // Don't pad the remainder dimensions.
+  for (int i = 0; i < remainder_shape.size(); ++i) {
+    padding_config.add_dimensions();
+  }
+  OP_REQUIRES(ctx, block_num_elems > 0,
+              errors::InvalidArgument(
+                  "The product of the block dimensions must be positive"));
+
+  xla::ComputationDataHandle padded =
+      b->Pad(input, XlaHelpers::Zero(b, input_dtype), padding_config);
+
+  // 2. Reshape `padded` to `reshaped_padded` of shape:
+  //
+  //      [batch] +
+  //      [padded_shape[1] / block_shape[0],
+  //        block_shape[0],
+  //       ...,
+  //       padded_shape[M] / block_shape[M-1],
+  //       block_shape[M-1]] +
+  //      remaining_shape
+  const int64 batch_size = input_shape[0];
+  std::vector<int64> reshaped_padded_shape(input_rank + block_rank);
+  reshaped_padded_shape[0] = batch_size;
+  for (int i = 0; i < block_rank; ++i) {
+    OP_REQUIRES(ctx, padded_shape[1 + i] % block_shape[i] == 0,
+                errors::InvalidArgument("padded_shape[", 1 + i,
+                                        "]=", padded_shape[1 + i],
+                                        " is not divisible by block_shape[", i,
+                                        "]=", block_shape[i]));
+
+    reshaped_padded_shape[1 + i * 2] = padded_shape[1 + i] / block_shape[i];
+    reshaped_padded_shape[1 + i * 2 + 1] = block_shape[i];
+  }
+  std::copy(remainder_shape.begin(), remainder_shape.end(),
+            reshaped_padded_shape.begin() + 1 + 2 * block_rank);
+
+  xla::ComputationDataHandle reshaped_padded =
+      b->Reshape(padded, reshaped_padded_shape);
+
+  // 3. Permute dimensions of `reshaped_padded` to produce
+  //    `permuted_reshaped_padded` of shape:
+  //
+  //      block_shape +
+  //      [batch] +
+  //      [padded_shape[1] / block_shape[0],
+  //       ...,
+  //       padded_shape[M] / block_shape[M-1]] +
+  //      remaining_shape
+  std::vector<int64> permutation(reshaped_padded_shape.size());
+  for (int i = 0; i < block_rank; ++i) {
+    permutation[i] = 1 + 2 * i + 1;
+    permutation[block_rank + 1 + i] = 1 + 2 * i;
+  }
+  permutation[block_rank] = 0;
+  std::iota(permutation.begin() + 1 + block_rank * 2, permutation.end(),
+            1 + block_rank * 2);
+  xla::ComputationDataHandle permuted_reshaped_padded =
+      b->Transpose(reshaped_padded, permutation);
+
+  // 4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the
+  //    batch dimension, producing an output tensor of shape:
+  //
+  //      [batch * prod(block_shape)] +
+  //      [padded_shape[1] / block_shape[0],
+  //       ...,
+  //       padded_shape[M] / block_shape[M-1]] +
+  //      remaining_shape
+  // Determine the length of the prefix of block dims that can be combined
+  // into the batch dimension due to having no padding and block_shape=1.
+  std::vector<int64> output_shape(input_rank);
+  output_shape[0] = batch_size * block_num_elems;
+  for (int i = 0; i < block_rank; ++i) {
+    output_shape[1 + i] = padded_shape[1 + i] / block_shape[i];
+  }
+  std::copy(remainder_shape.begin(), remainder_shape.end(),
+            output_shape.begin() + 1 + block_rank);
+
+  xla::ComputationDataHandle output =
+      b->Reshape(permuted_reshaped_padded, output_shape);
+  ctx->SetOutput(0, output);
+}
+
+class SpaceToBatchNDOp : public XlaOpKernel {
+ public:
+  explicit SpaceToBatchNDOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    std::vector<int64> block_shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &block_shape));
+
+    xla::Literal paddings;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsInt64Literal(2, &paddings));
+
+    SpaceToBatch(ctx, ctx->Input(0), input_type(0), ctx->InputShape(0),
+                 block_shape, paddings);
+  }
+};
+REGISTER_XLA_OP(Name("SpaceToBatchND"), SpaceToBatchNDOp);
+
+class SpaceToBatchOp : public XlaOpKernel {
+ public:
+  explicit SpaceToBatchOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
+    OP_REQUIRES(
+        ctx, block_size_ > 1,
+        errors::InvalidArgument("Block size should be > 1: ", block_size_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::Literal paddings;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsInt64Literal(1, &paddings));
+
+    SpaceToBatch(ctx, ctx->Input(0), input_type(0), ctx->InputShape(0),
+                 {block_size_, block_size_}, paddings);
+  }
+
+ private:
+  int block_size_;
+};
+REGISTER_XLA_OP(Name("SpaceToBatch"), SpaceToBatchOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 03e02299e33a4e2bf62e757b2092db35288b0bea..a6cac62ca4bcb7e2d1c722862208f673d0a2c86f 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -77,11 +77,9 @@ class StridedSliceOp : public XlaOpKernel {
 
     gtl::InlinedVector<int64, 4> dimensions_to_reverse;
     gtl::InlinedVector<int64, 4> slice_begin, slice_end;
+    bool simple_strides = true;
     for (int i = 0; i < begin.size(); ++i) {
-      // TODO(phawkins): implement strides != 1 when b/30878775 is fixed.
-      OP_REQUIRES(
-          ctx, strides[i] == 1 || strides[i] == -1,
-          errors::Unimplemented("Strides != 1 or -1 are not yet implemented"));
+      simple_strides &= (std::abs(strides[i]) == 1);
       if (strides[i] > 0) {
         slice_begin.push_back(begin[i]);
         slice_end.push_back(end[i]);
@@ -99,6 +97,35 @@ class StridedSliceOp : public XlaOpKernel {
       slice = ctx->builder()->Rev(slice, dimensions_to_reverse);
     }
 
+    // If at least one of the strides is > 1 (or < -1) then use Slice
+    // to pull out each of the strided slices, and Concat to put them
+    // together again.
+    if (!simple_strides) {
+      // Re-adjust the begin and end now that the periphery has been
+      // sliced away.
+      for (int d = 0; d < strides.size(); ++d) {
+        slice_end[d] -= slice_begin[d];
+        slice_begin[d] = 0;
+      }
+
+      for (int d = 0; d < strides.size(); ++d) {
+        int64 stride = std::abs(strides[d]);
+        if (stride > 1) {
+          std::vector<xla::ComputationDataHandle> to_concat;
+          int64 end = slice_end[d];
+          for (int64 i = 0; i < end; i += stride) {
+            slice_begin[d] = i;
+            slice_end[d] = i + 1;
+            to_concat.push_back(
+                ctx->builder()->Slice(slice, slice_begin, slice_end));
+          }
+          slice = ctx->builder()->ConcatInDim(to_concat, d);
+          slice_begin[d] = 0;
+          slice_end[d] = to_concat.size();
+        }
+      }
+    }
+
     slice = ctx->builder()->Reshape(slice, final_shape.dim_sizes());
     ctx->SetOutput(0, slice);
   }
@@ -219,5 +246,118 @@ class StridedSliceGradOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("StridedSliceGrad"), StridedSliceGradOp);
 
+class StridedSliceAssignOp : public XlaOpKernel {
+ public:
+  explicit StridedSliceAssignOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("begin_mask", &begin_mask_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("end_mask", &end_mask_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("ellipsis_mask", &ellipsis_mask_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("new_axis_mask", &new_axis_mask_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shrink_axis_mask", &shrink_axis_mask_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Index", &index_type_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape final_shape;
+    gtl::InlinedVector<int64, 4> begin;
+    gtl::InlinedVector<int64, 4> end;
+    gtl::InlinedVector<int64, 4> strides;
+
+    xla::Literal begin_literal, end_literal, strides_literal;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &begin_literal));
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &end_literal));
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput(3, &strides_literal));
+
+    Tensor begin_tensor, end_tensor, strides_tensor;
+    OP_REQUIRES_OK(
+        ctx, LiteralToHostTensor(begin_literal, index_type_, &begin_tensor));
+    OP_REQUIRES_OK(ctx,
+                   LiteralToHostTensor(end_literal, index_type_, &end_tensor));
+    OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_,
+                                            &strides_tensor));
+
+    DataType lhs_type;
+    TensorShape lhs_shape;
+    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &lhs_type, &lhs_shape));
+
+    const TensorShape rhs_shape = ctx->InputShape(4);
+
+    TensorShape dummy_processing_shape;
+    ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
+    ShapeReadWriteFromTensorShape wrapped_dummy_processing_shape(
+        &dummy_processing_shape);
+    bool dummy = false;
+    OP_REQUIRES_OK(
+        ctx, ValidateStridedSliceOp(
+                 &begin_tensor, &end_tensor, strides_tensor,
+                 ShapeReadWriteFromTensorShape(&lhs_shape), begin_mask_,
+                 end_mask_, ellipsis_mask_, new_axis_mask_, shrink_axis_mask_,
+                 &wrapped_dummy_processing_shape, &wrapped_final_shape, &dummy,
+                 &dummy, &dummy, &begin, &end, &strides));
+
+    if (final_shape.num_elements() == 0 && rhs_shape.num_elements() == 0) {
+      // DynamicUpdateSlice does not allow 0-element updates. We should probably
+      // check that rhs_shape can be broadcast to final_shape, but that is
+      // probably better handled when implementing broadcasting more generally.
+      return;
+    }
+
+    // TODO(aselle): This check is too strong, we only should need
+    // input_shape to be broadcastable to final_shape
+    OP_REQUIRES(ctx, final_shape == rhs_shape,
+                errors::Unimplemented(
+                    "sliced l-value shape ", final_shape.DebugString(),
+                    " does not match r-value shape ", rhs_shape.DebugString(),
+                    ". Automatic broadcasting not yet implemented."));
+
+    xla::ComputationDataHandle lhs;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &lhs));
+
+    xla::ComputationDataHandle rhs = ctx->Input(4);
+
+    gtl::InlinedVector<int64, 4> dimensions_to_reverse;
+    gtl::InlinedVector<int64, 4> slice_begin, slice_dims;
+    for (int i = 0; i < begin.size(); ++i) {
+      // TODO(phawkins): implement strides != 1
+      OP_REQUIRES(
+          ctx, strides[i] == 1 || strides[i] == -1,
+          errors::Unimplemented("Strides != 1 or -1 are not yet implemented"));
+      if (strides[i] > 0) {
+        slice_begin.push_back(begin[i]);
+        slice_dims.push_back(end[i] - begin[i]);
+      } else {
+        // Negative stride: swap begin and end, add 1 because the interval
+        // is semi-open, and mark the dimension to be reversed.
+        slice_begin.push_back(end[i] + 1);
+        slice_dims.push_back(begin[i] - end[i]);
+        dimensions_to_reverse.push_back(i);
+      }
+    }
+
+    if (!dimensions_to_reverse.empty()) {
+      rhs = ctx->builder()->Rev(rhs, dimensions_to_reverse);
+    }
+    rhs = ctx->builder()->Reshape(rhs, slice_dims);
+
+    if (lhs_shape.dims() == 0) {
+      // TODO(b/38323843): DynamicUpdateSlice crashes on rank 0 inputs. Fix
+      // and remove this workaround.
+      lhs = rhs;
+    } else {
+      lhs = ctx->builder()->DynamicUpdateSlice(
+          lhs, rhs, ctx->builder()->ConstantR1<int64>(slice_begin));
+    }
+
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, lhs_type, lhs));
+  }
+
+ private:
+  int32 begin_mask_, end_mask_;
+  int32 ellipsis_mask_, new_axis_mask_, shrink_axis_mask_;
+  DataType index_type_;
+};
+
+REGISTER_XLA_OP(Name("ResourceStridedSliceAssign"), StridedSliceAssignOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index f1d81f871423b220c6859c1dedf79b1c36a43e65..ddd81cb490cd76065735a5b7e78d04fd76c05f82 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -165,6 +165,106 @@ class ResourceApplyAdagrad : public XlaOpKernel {
 };
 REGISTER_XLA_OP(Name("ResourceApplyAdagrad"), ResourceApplyAdagrad);
 
+class ResourceApplyAdam : public XlaOpKernel {
+ public:
+  explicit ResourceApplyAdam(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    DataType var_type, m_type, v_type;
+    TensorShape var_shape, m_shape, v_shape;
+    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(0, &var_type, &var_shape));
+    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(1, &m_type, &m_shape));
+    OP_REQUIRES_OK(ctx, ctx->GetVariableTypeAndShape(2, &v_type, &v_shape));
+
+    OP_REQUIRES(
+        ctx, dtype_ == var_type && dtype_ == m_type && dtype_ == v_type,
+        errors::InvalidArgument(
+            "Types of variable arguments to ResourceApplyRMSProp must match: ",
+            DataTypeString(dtype_), " vs. ", DataTypeString(var_type), " vs. ",
+            DataTypeString(m_type), " vs. ", DataTypeString(v_type)));
+
+    TensorShape beta1_power_shape = ctx->InputShape(3);
+    TensorShape beta2_power_shape = ctx->InputShape(4);
+    TensorShape lr_shape = ctx->InputShape(5);
+    TensorShape beta1_shape = ctx->InputShape(6);
+    TensorShape beta2_shape = ctx->InputShape(7);
+    TensorShape epsilon_shape = ctx->InputShape(8);
+    TensorShape grad_shape = ctx->InputShape(9);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power_shape),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power_shape),
+                errors::InvalidArgument("beta2_power is not a scalar: ",
+                                        beta2_power_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_shape),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_shape),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon_shape),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon_shape.DebugString()));
+
+    OP_REQUIRES(ctx, var_shape.IsSameSize(m_shape),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var_shape.DebugString(), " ",
+                                        m_shape.DebugString()));
+    OP_REQUIRES(ctx, var_shape.IsSameSize(v_shape),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var_shape.DebugString(), " ",
+                                        v_shape.DebugString()));
+    OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape),
+                errors::InvalidArgument(
+                    "var and grad do not have the same shape",
+                    var_shape.DebugString(), " ", grad_shape.DebugString()));
+
+    xla::ComputationDataHandle var, m, v;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, &m));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, &v));
+    xla::ComputationDataHandle beta1_power = ctx->Input(3);
+    xla::ComputationDataHandle beta2_power = ctx->Input(4);
+    xla::ComputationDataHandle lr = ctx->Input(5);
+    xla::ComputationDataHandle beta1 = ctx->Input(6);
+    xla::ComputationDataHandle beta2 = ctx->Input(7);
+    xla::ComputationDataHandle epsilon = ctx->Input(8);
+    xla::ComputationDataHandle grad = ctx->Input(9);
+
+    // alpha <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+    // m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+    // v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+    // variable <- variable - alpha * m_t / (sqrt(v_t) + epsilon)
+
+    xla::ComputationBuilder* b = ctx->builder();
+    xla::ComputationDataHandle half = XlaHelpers::FloatLiteral(b, dtype_, 0.5);
+    xla::ComputationDataHandle one = XlaHelpers::FloatLiteral(b, dtype_, 1.0);
+    xla::ComputationDataHandle two = XlaHelpers::FloatLiteral(b, dtype_, 2.0);
+
+    xla::ComputationDataHandle alpha =
+        b->Div(b->Mul(lr, b->Pow(b->Sub(one, beta2_power), half)),
+               b->Sub(one, beta1_power));
+    m = b->Add(m, b->Mul(b->Sub(grad, m), b->Sub(one, beta1)));
+    v = b->Add(v, b->Mul(b->Sub(b->Pow(grad, two), v), b->Sub(one, beta2)));
+    var =
+        b->Sub(var, b->Div(b->Mul(m, alpha), b->Add(b->Pow(v, half), epsilon)));
+
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, m));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(2, dtype_, v));
+  }
+
+ private:
+  DataType dtype_;
+};
+REGISTER_XLA_OP(Name("ResourceApplyAdam"), ResourceApplyAdam);
+
 class ResourceApplyRMSProp : public XlaOpKernel {
  public:
   explicit ResourceApplyRMSProp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
diff --git a/tensorflow/compiler/tf2xla/str_util.cc b/tensorflow/compiler/tf2xla/str_util.cc
index ce25d631271b54a36078cd0d3ac4d318d58db9fa..2b0834fe7b6c4d2199267dbe0ec1f7c2785aa9c7 100644
--- a/tensorflow/compiler/tf2xla/str_util.cc
+++ b/tensorflow/compiler/tf2xla/str_util.cc
@@ -22,7 +22,7 @@ limitations under the License.
 namespace tensorflow {
 namespace str_util {
 
-void ReplaceAll(string* text, StringPiece from, StringPiece to) {
+static void ReplaceAll(string* text, StringPiece from, StringPiece to) {
   size_t pos = 0;
   while ((pos = text->find(from.data(), pos, from.size())) != string::npos) {
     text->replace(pos, from.size(), to.data(), to.size());
diff --git a/tensorflow/compiler/tf2xla/str_util.h b/tensorflow/compiler/tf2xla/str_util.h
index 4920b1a4d4875192d6f06988b810ad388bc6293b..51f25009d7003db0d72296619a469ecbbbb1808d 100644
--- a/tensorflow/compiler/tf2xla/str_util.h
+++ b/tensorflow/compiler/tf2xla/str_util.h
@@ -29,10 +29,6 @@ limitations under the License.
 namespace tensorflow {
 namespace str_util {
 
-// Replace all non-overlapping occurrences of from with to in-place in text.  If
-// from is empty, it matches at the beginning of the text and after every byte.
-void ReplaceAll(string* text, StringPiece from, StringPiece to);
-
 // Replace all non-overlapping occurrences of the given (from,to) pairs in-place
 // in text.  If from is empty, it matches at the beginning of the text and after
 // every byte.  Each (from,to) replacement pair is processed in the order it is
diff --git a/tensorflow/compiler/tf2xla/str_util_test.cc b/tensorflow/compiler/tf2xla/str_util_test.cc
index f992007a34532157f86c90c717a5e24c3923f22d..8817f6902a8e58e796ca5240a9a24d7506d38793 100644
--- a/tensorflow/compiler/tf2xla/str_util_test.cc
+++ b/tensorflow/compiler/tf2xla/str_util_test.cc
@@ -25,36 +25,6 @@ limitations under the License.
 namespace tensorflow {
 namespace str_util {
 
-class ReplaceAllTest : public ::testing::Test {
- protected:
-  void ExpectReplaceAll(string text, StringPiece from, StringPiece to,
-                        StringPiece want) {
-    ReplaceAll(&text, from, to);
-    EXPECT_EQ(text, want);
-  }
-};
-
-TEST_F(ReplaceAllTest, Simple) {
-  ExpectReplaceAll("", "", "", "");
-  ExpectReplaceAll("", "", "X", "X");
-  ExpectReplaceAll("", "", "XYZ", "XYZ");
-  ExpectReplaceAll("banana", "", "", "banana");
-  ExpectReplaceAll("banana", "", "_", "_b_a_n_a_n_a_");
-  ExpectReplaceAll("banana", "", "__", "__b__a__n__a__n__a__");
-  ExpectReplaceAll("banana", "a", "a", "banana");
-  ExpectReplaceAll("banana", "a", "", "bnn");
-  ExpectReplaceAll("banana", "a", "X", "bXnXnX");
-  ExpectReplaceAll("banana", "a", "XX", "bXXnXXnXX");
-  ExpectReplaceAll("banana", "an", "an", "banana");
-  ExpectReplaceAll("banana", "an", "", "ba");
-  ExpectReplaceAll("banana", "an", "X", "bXXa");
-  ExpectReplaceAll("banana", "an", "XY", "bXYXYa");
-  ExpectReplaceAll("banana", "an", "XYZ", "bXYZXYZa");
-  ExpectReplaceAll("foo {{bar}} baz {{bar}}", "{{bar}}", "X", "foo X baz X");
-  ExpectReplaceAll("foo {{bar}} baz {{bar}}", "{{bar}}", "ABCDEFGHIJKLMNOP",
-                   "foo ABCDEFGHIJKLMNOP baz ABCDEFGHIJKLMNOP");
-}
-
 class ReplaceAllPairsTest : public ::testing::Test {
  protected:
   void ExpectReplaceAllPairs(
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index d86e741b69e08652bac2dd7b5295c8ab2d94433a..362a1018955f9b6adbdea5ba718b81e9a2389957 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -76,8 +76,7 @@ XlaCompilationDevice::XlaCompilationDevice(const SessionOptions& options,
           options,
           Device::BuildDeviceAttributes(
               "", type, Bytes(256 << 20), DeviceLocality(),
-              strings::StrCat("device: XLA compilation device ", type.type())),
-          cpu_allocator()),
+              strings::StrCat("device: XLA compilation device ", type.type()))),
       allocator_(new XlaCompilationAllocator()) {}
 
 XlaCompilationDevice::~XlaCompilationDevice() {}
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index ba975d617dcd52de74830b3e69446c752fce1fcb..f8a9c5e9bc6f9ce778594209c9f974328cdb4b8f 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -57,11 +57,38 @@ Status CheckSignature(const DataTypeVector& types,
 
 }  // namespace
 
+bool XlaCompiler::Argument::operator==(
+    const XlaCompiler::Argument& other) const {
+  if (std::tie(kind, type, shape, name) !=
+      std::tie(other.kind, other.type, other.shape, other.name)) {
+    return false;
+  }
+  if (constant_value.shape() != other.constant_value.shape()) {
+    return false;
+  }
+  return constant_value.tensor_data() == other.constant_value.tensor_data();
+}
+
 XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(std::move(options)),
+      initialization_status_(Status::OK()),
       next_step_id_(1),
-      device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)),
-      device_mgr_({device_}) {}
+      device_(
+          new XlaCompilationDevice(SessionOptions(), *options_.device_type)),
+      device_mgr_({device_}) {
+  // We no longer need the device_type.
+  options_.device_type = nullptr;
+
+  if (options_.populate_resource_manager) {
+    initialization_status_ =
+        (*options_.populate_resource_manager)(device_->resource_manager());
+  }
+
+  flib_runtime_.reset(NewFunctionLibraryRuntime(
+      &device_mgr_, Env::Default(), device_, options.graph_def_version,
+      options.flib_def, OptimizerOptions(),
+      nullptr /* custom_kernel_creator */));
+}
 
 XlaCompiler::~XlaCompiler() = default;
 
@@ -70,37 +97,35 @@ int64 XlaCompiler::NextStepId() {
   return next_step_id_++;
 }
 
-// Prunes any nodes from a function that are not dependencies of the _Retval
-// nodes. Used to prune stateful ops from within a function body, such as
-// variable initializers, that should not be executed unless requested.
-static void PruneUnreachableNodes(Graph* graph) {
-  std::unordered_set<const Node*> nodes;
-  for (Node* node : graph->nodes()) {
-    if (node->type_string() == "_Retval" ||
-        StringPiece(node->type_string()).ends_with("Send")) {
-      nodes.insert(node);
-    }
-  }
-  PruneForReverseReachability(graph, nodes);
+uint64 XlaCompiler::SignatureHash::operator()(
+    const std::pair<string, std::vector<Argument>>& signature) const {
+  return std::hash<string>()(signature.first);
 }
 
 Status XlaCompiler::CompileFunction(
-    FunctionLibraryRuntime* flr, const NameAttrList& function,
+    const XlaCompiler::CompileOptions& options, const NameAttrList& function,
     const std::vector<XlaCompiler::Argument>& args,
     XlaCompiler::CompilationResult* result) {
-  const string function_id = Canonicalize(function.name(), function.attr());
+  const string function_id =
+      Canonicalize(function.name(), AttrSlice(&function.attr()));
   VLOG(1) << "XlaCompiler::CompileFunction " << function_id;
 
+  auto it = cache_.find({function_id, args});
+  if (it != cache_.end()) {
+    *result = it->second;
+    return Status::OK();
+  }
+
   FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(
-      flr->Instantiate(function.name(), function.attr(), &handle));
+  TF_RETURN_IF_ERROR(flib_runtime_->Instantiate(
+      function.name(), AttrSlice(&function.attr()), &handle));
 
-  const FunctionBody* fbody = flr->GetFunctionBody(handle);
+  const FunctionBody* fbody = flib_runtime_->GetFunctionBody(handle);
   CHECK(fbody);
 
   TF_RETURN_IF_ERROR(CheckSignature(fbody->arg_types, args));
 
-  std::unique_ptr<Graph> graph(new Graph(flr->GetFunctionLibraryDefinition()));
+  std::unique_ptr<Graph> graph(new Graph(options_.flib_def));
   CopyGraph(*fbody->graph, graph.get());
 
   if (VLOG_IS_ON(1)) {
@@ -109,11 +134,13 @@ Status XlaCompiler::CompileFunction(
   }
 
   // Optimize the graph before running the compiler.
-  // TODO(pbar): The constant folder currently does not simplify int32
-  // operations for devices other than CPU.
   OptimizerOptions opts;
+  opts.set_do_common_subexpression_elimination(true);
+  opts.set_do_function_inlining(true);
+  opts.set_do_constant_folding(true);
   GraphOptimizer optimizer(opts);
-  OptimizeGraph(flr, &graph);
+  optimizer.Optimize(flib_runtime_.get(), flib_runtime_->env(),
+                     /*device=*/nullptr, &graph);
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile(
@@ -123,9 +150,10 @@ Status XlaCompiler::CompileFunction(
 
   VLOG(1) << "====================================================";
   TF_RETURN_IF_ERROR(
-      CompileGraph(function_id, std::move(graph), flr, args, result));
+      CompileGraph(options, function_id, std::move(graph), args, result));
   VLOG(1) << "====================================================";
 
+  cache_[{function_id, args}] = *result;
   return Status::OK();
 }
 
@@ -152,7 +180,7 @@ Status XlaCompiler::BuildExecutable(
   build_options.set_has_hybrid_result(
       options_.local_executable_has_hybrid_result);
 
-  auto compile_result = local_client->Compile(result.computation,
+  auto compile_result = local_client->Compile(*result.computation,
                                               argument_layouts, build_options);
   if (!compile_result.ok()) {
     return compile_result.status();
@@ -372,44 +400,45 @@ Status BuildComputation(
 
 }  // namespace
 
-Status XlaCompiler::CompileGraph(string const& name,
+Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
+                                 string const& name,
                                  std::unique_ptr<Graph> graph,
-                                 FunctionLibraryRuntime* flib,
                                  const std::vector<XlaCompiler::Argument>& args,
                                  CompilationResult* result) {
   VLOG(1) << "Executing graph symbolically to populate ComputationBuilder.";
 
+  // Report the error here if initialization failed.
+  TF_RETURN_IF_ERROR(initialization_status_);
+
   xla::ComputationBuilder builder(client(), name);
   XlaContext* context =
       new XlaContext(this, &builder, options_.allow_cpu_custom_calls,
                      options_.resolve_compile_time_constants);
   core::ScopedUnref context_unref(context);
 
-  result->tuple_arg = options_.use_tuple_arg;
+  result->tuple_arg = options.use_tuple_arg;
 
   std::vector<XlaContext::Argument> context_args;
-  TF_RETURN_IF_ERROR(BuildArguments(args, options_.use_tuple_arg, &builder,
+  TF_RETURN_IF_ERROR(BuildArguments(args, options.use_tuple_arg, &builder,
                                     &context_args, &result->input_mapping,
                                     &result->xla_input_shapes));
   context->set_args(std::move(context_args));
 
-  if (options_.prune_unreachable_nodes) {
-    PruneUnreachableNodes(graph.get());
-  }
-
-  TF_RETURN_IF_ERROR(
-      ExecuteGraph(context, std::move(graph), device_, flib, NextStepId()));
+  TF_RETURN_IF_ERROR(ExecuteGraph(context, std::move(graph), device_,
+                                  flib_runtime_.get(), NextStepId()));
 
   int num_nonconst_outputs;
+  result->computation = std::make_shared<xla::Computation>();
   TF_RETURN_IF_ERROR(BuildComputation(
       context->retvals(), context->variables(), context->has_side_effects(),
-      options_.return_updated_values_for_all_variables, &builder,
-      &result->computation, &num_nonconst_outputs, &result->variable_updates));
+      options.return_updated_values_for_all_variables, &builder,
+      result->computation.get(), &num_nonconst_outputs,
+      &result->variable_updates));
 
   result->requires_runtime_context = context->has_context_parameter();
 
   // Tuple arguments and runtime context parameters are incompatible.
-  CHECK(!(options_.use_tuple_arg && result->requires_runtime_context));
+  CHECK(!(options.use_tuple_arg && result->requires_runtime_context));
 
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
@@ -425,19 +454,21 @@ Status XlaCompiler::CompileGraph(string const& name,
     }
   }
 
-  if (result->computation.IsNull()) {
+  if (result->computation->IsNull()) {
     return Status::OK();
   }
 
   // Compute the output shapes, if there is a computation with non-constant
   // outputs.
-  auto computation_shape = client()->GetComputationShape(result->computation);
+  auto computation_shape = client()->GetComputationShape(*result->computation);
   if (!computation_shape.ok()) {
     return computation_shape.status();
   }
 
   result->xla_output_shape.Swap(
       computation_shape.ValueOrDie()->mutable_result());
+  VLOG(2) << "XLA output shape: "
+          << xla::ShapeUtil::HumanString(result->xla_output_shape);
 
   auto num_computation_outputs =
       (xla::ShapeUtil::IsTuple(result->xla_output_shape))
@@ -463,10 +494,10 @@ Status XlaCompiler::CompileGraph(string const& name,
        i < context->retvals().size(); ++i) {
     const XlaContext::HandleOrConstant& retval = context->retvals()[i];
     if (!retval.is_constant) {
-      CHECK_LT(computation_output, num_nonconst_outputs);
+      CHECK_LT(computation_output, num_computation_outputs);
       OutputDescription& output = result->outputs[i];
       output.is_constant = false;
-      if (num_nonconst_outputs > 1) {
+      if (num_computation_outputs > 1) {
         output.shape =
             XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(
                 result->xla_output_shape, computation_output));
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 3ed920521b229c1ddac9ffffe924066624f3de5c..15f723ad782376b99ae7d72a5f15129e7880e9b1 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/notification.h"
@@ -112,6 +113,8 @@ class XlaCompiler {
 
     // The name of this argument, used for debugging.
     string name;
+
+    bool operator==(const Argument& other) const;
   };
 
   struct OutputDescription {
@@ -172,15 +175,22 @@ class XlaCompiler {
 
     // The XLA computation built from the tensorflow subgraph. May be null
     // if the output consists solely of compile-time constants.
-    xla::Computation computation;
+    std::shared_ptr<xla::Computation> computation;
   };
 
   struct Options {
-    // Name of the compilation device to use.
-    DeviceType device_type = DeviceType("");
+    // Name of the compilation device to use. Needs to be live only during
+    // XlaCompiler's constructor.
+    const DeviceType* device_type = nullptr;
 
     xla::Client* client = nullptr;
 
+    // Function library in which to find function definitions. Must be non-null.
+    const FunctionLibraryDefinition* flib_def = nullptr;
+
+    // The graph def version to be compiled.
+    int graph_def_version = TF_GRAPH_DEF_VERSION;
+
     // If 'allow_cpu_custom_calls' is true, kernels may make use of CustomCall()
     // for CPU; additionally, an optional XlaLocalRuntimeContext* may be passed
     // to the computation.
@@ -198,6 +208,19 @@ class XlaCompiler {
     // computation.
     bool resolve_compile_time_constants = true;
 
+    // If not nullptr, populate_resource_manager is called with the
+    // compilation device's resource manager when the compilation
+    // device is created, and can be used to create metadata objects
+    // that can be accessed by XLA op kernels.
+    std::function<Status(ResourceMgr*)>* populate_resource_manager = nullptr;
+  };
+
+  explicit XlaCompiler(Options options);
+  ~XlaCompiler();
+
+  // Options pertaining to an individual call to CompileGraph() or
+  // CompileFunction().
+  struct CompileOptions {
     // If `use_tuple_arg` is true, a single tuple parameter will be used for all
     // arguments; if false, each argument gets its own parameter.
     bool use_tuple_arg = false;
@@ -208,17 +231,8 @@ class XlaCompiler {
     // modified by the computation. Used when compiling loop bodies to ensure
     // the input and output signatures match.
     bool return_updated_values_for_all_variables = false;
-
-    // If 'prune_unreachable_nodes' is true, then nodes that are not
-    // dependencies of graph's _Retval nodes will be pruned before compilation.
-    // This is useful to prune stateful operators that should not be executed
-    // from a function body.
-    bool prune_unreachable_nodes = false;
   };
 
-  explicit XlaCompiler(Options options);
-  ~XlaCompiler();
-
   // Compiles a Tensorflow function `fn_name_attrs` into an XLA computation.
   // `args` describes the arguments to the function, each of which must either
   // be a runtime-parameter to the XLA computation, a compile-time constant, or
@@ -229,7 +243,7 @@ class XlaCompiler {
   // arguments are returned as host memory tensors in the output list and are
   // not included in the XLA computation's outputs. The XLA computation is
   // null if there are no data-dependent outputs and no side effects.
-  Status CompileFunction(FunctionLibraryRuntime* flr,
+  Status CompileFunction(const CompileOptions& options,
                          const NameAttrList& fn_name_attrs,
                          const std::vector<Argument>& args,
                          CompilationResult* result);
@@ -237,8 +251,8 @@ class XlaCompiler {
   // Compiles a tensorflow::Graph into an xla::Computation.
   // Similar to CompileFunction, but takes a Graph as input rather than a
   // function.
-  Status CompileGraph(string const& name, std::unique_ptr<Graph> graph,
-                      FunctionLibraryRuntime* flr,
+  Status CompileGraph(const CompileOptions& options, string const& name,
+                      std::unique_ptr<Graph> graph,
                       const std::vector<Argument>& args,
                       CompilationResult* result);
 
@@ -247,9 +261,11 @@ class XlaCompiler {
   Status BuildExecutable(const CompilationResult& result,
                          std::unique_ptr<xla::LocalExecutable>* executable);
 
+  const Options& options() const { return options_; }
   xla::Client* client() const { return options_.client; }
   XlaCompilationDevice* device() const { return device_; }
   const DeviceMgr* device_mgr() const { return &device_mgr_; }
+  FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_.get(); }
 
   // Retrieves the channel handle associated with `key`. Allocates
   // a new channel handle if none exists.
@@ -260,6 +276,9 @@ class XlaCompiler {
  private:
   Options options_;
 
+  // Status set to non-OK in the constructor if initialization fails.
+  Status initialization_status_;
+
   // Returns the next step sequence number.
   int64 NextStepId();
 
@@ -271,6 +290,17 @@ class XlaCompiler {
   XlaCompilationDevice* device_;  // Owned by device_mgr_
   DeviceMgr device_mgr_;
 
+  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
+
+  struct SignatureHash {
+    uint64 operator()(
+        const std::pair<string, std::vector<Argument>>& signature) const;
+  };
+
+  std::unordered_map<std::pair<string, std::vector<Argument>>,
+                     CompilationResult, SignatureHash>
+      cache_;
+
   std::unordered_map<string, xla::ChannelHandle> channels_ GUARDED_BY(mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler);
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index aa809f85a150cbff1b4504fced467c21e0314f6f..58d74057d101cdef89fca24ec6c0858291d825fa 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -17,12 +17,14 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -33,8 +35,69 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Helper class to test the ability to pass resources through to XLA
+// compiled kernels.
+class DummyResourceForTest : public ResourceBase {
+ public:
+  string DebugString() override { return "dummy"; }
+  void Increment() { ++value_; }
+  int Get() { return value_; }
+
+ private:
+  int value_ = 0;
+};
+
+class DummyReadResourceOp : public XlaOpKernel {
+ public:
+  explicit DummyReadResourceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  void Compile(XlaOpKernelContext* ctx) override {
+    ResourceMgr* rm = ctx->op_kernel_context()->resource_manager();
+    OP_REQUIRES(ctx, rm, errors::Internal("No resource manager."));
+    DummyResourceForTest* dummy;
+    OP_REQUIRES_OK(ctx, rm->Lookup<DummyResourceForTest>(
+                            rm->default_container(), "dummy", &dummy));
+    dummy->Increment();
+    dummy->Unref();
+
+    ctx->SetOutput(0, ctx->Input(0));
+  }
+};
+
+class DummyReadResourceCC {
+ public:
+  DummyReadResourceCC(const Scope& scope, const Input& value) {
+    if (!scope.ok()) return;
+    auto _value = ops::AsNodeOut(scope, value);
+    if (!scope.ok()) return;
+    Node* ret;
+    const auto unique_name = scope.GetUniqueNameForOp("DummyReadResource");
+    auto builder = NodeBuilder(unique_name, "DummyReadResource").Input(_value);
+    scope.UpdateBuilder(&builder);
+    scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
+    if (!scope.ok()) return;
+    this->output_ = Output(ret, 0);
+  }
+  Node* node() const { return output_.node(); }
+
+  Output output_;
+};
+
+REGISTER_OP("DummyReadResource")
+    .Input("input: int32")
+    .Output("output: int32")
+    .Doc(R"doc(
+A dummy Op.
+
+input: dummy input.
+output: dummy output.
+)doc");
+
+REGISTER_XLA_OP(Name("DummyReadResource"), DummyReadResourceOp);
+
 class XlaCompilerTest : public ::testing::Test {
  protected:
+  XlaCompilerTest() : cpu_device_type_(DEVICE_CPU_XLA_JIT) {}
+
   void SetUp() override {
     client_ = xla::ClientLibrary::LocalClientOrDie();
 
@@ -46,19 +109,13 @@ class XlaCompilerTest : public ::testing::Test {
 
   XlaCompiler::Options DefaultOptions() {
     XlaCompiler::Options options;
-    options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
+    options.device_type = &cpu_device_type_;
     options.client = client_;
+    options.flib_def = flib_def_.get();
     return options;
   }
 
-  std::unique_ptr<FunctionLibraryRuntime> BuildFunctionLibraryRuntime(
-      const XlaCompiler& compiler) {
-    return std::unique_ptr<FunctionLibraryRuntime>(NewFunctionLibraryRuntime(
-        compiler.device_mgr(), /*env=*/nullptr, compiler.device(),
-        TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(),
-        /*custom_kernel_creator=*/nullptr));
-  }
-
+  DeviceType cpu_device_type_;
   xla::Client* client_;
   std::unique_ptr<FunctionLibraryDefinition> flib_def_;
 };
@@ -66,15 +123,15 @@ class XlaCompilerTest : public ::testing::Test {
 // Tests compilation of an empty graph.
 TEST_F(XlaCompilerTest, EmptyReturnValues) {
   XlaCompiler compiler(DefaultOptions());
-  auto flr = BuildFunctionLibraryRuntime(compiler);
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(compiler.CompileGraph("add", std::move(graph), flr.get(),
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph),
                                      /*args=*/{}, &result));
 
   // No computation should be generated.
-  EXPECT_EQ(0, result.computation.handle().handle());
+  EXPECT_EQ(0, result.computation->handle().handle());
 }
 
 // Tests compilation and execution of a graph that adds two tensors.
@@ -99,11 +156,10 @@ TEST_F(XlaCompilerTest, Simple) {
 
   // Compiles the graph.
   XlaCompiler compiler(DefaultOptions());
-  auto flr = BuildFunctionLibraryRuntime(compiler);
 
   XlaCompiler::CompilationResult result;
-  TF_ASSERT_OK(
-      compiler.CompileGraph("add", std::move(graph), flr.get(), args, &result));
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph), args, &result));
 
   // Tests that the generated computation works.
   std::unique_ptr<xla::Literal> param0_literal =
@@ -117,7 +173,7 @@ TEST_F(XlaCompilerTest, Simple) {
 
   std::unique_ptr<xla::GlobalData> actual =
       client_
-          ->Execute(result.computation, {param0_data.get(), param1_data.get()})
+          ->Execute(*result.computation, {param0_data.get(), param1_data.get()})
           .ConsumeValueOrDie();
   std::unique_ptr<xla::Literal> actual_literal =
       client_->Transfer(*actual).ConsumeValueOrDie();
@@ -152,14 +208,14 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
     XlaCompiler::Options options = DefaultOptions();
     options.resolve_compile_time_constants = true;
     XlaCompiler compiler(options);
-    auto flr = BuildFunctionLibraryRuntime(compiler);
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
 
     XlaCompiler::CompilationResult result;
-    TF_ASSERT_OK(compiler.CompileGraph("constants", std::move(graph_copy),
-                                       flr.get(), args, &result));
+    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                       "constants", std::move(graph_copy), args,
+                                       &result));
 
     ASSERT_EQ(2, result.outputs.size());
     EXPECT_TRUE(result.outputs[0].is_constant);
@@ -174,7 +230,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
     std::unique_ptr<xla::GlobalData> actual =
-        client_->Execute(result.computation, {param0_data.get()})
+        client_->Execute(*result.computation, {param0_data.get()})
             .ConsumeValueOrDie();
     std::unique_ptr<xla::Literal> actual_literal =
         client_->Transfer(*actual).ConsumeValueOrDie();
@@ -189,14 +245,14 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
     XlaCompiler::Options options = DefaultOptions();
     options.resolve_compile_time_constants = false;
     XlaCompiler compiler(options);
-    auto flr = BuildFunctionLibraryRuntime(compiler);
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
 
     XlaCompiler::CompilationResult result;
-    TF_ASSERT_OK(compiler.CompileGraph("constants", std::move(graph_copy),
-                                       flr.get(), args, &result));
+    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(),
+                                       "constants", std::move(graph_copy), args,
+                                       &result));
 
     ASSERT_EQ(2, result.outputs.size());
     EXPECT_FALSE(result.outputs[0].is_constant);
@@ -209,7 +265,7 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
         client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
     std::unique_ptr<xla::GlobalData> actual =
-        client_->Execute(result.computation, {param0_data.get()})
+        client_->Execute(*result.computation, {param0_data.get()})
             .ConsumeValueOrDie();
     std::unique_ptr<xla::Literal> actual_literal =
         client_->Transfer(*actual).ConsumeValueOrDie();
@@ -224,5 +280,44 @@ TEST_F(XlaCompilerTest, ConstantOutputs) {
   }
 }
 
+// Tests compilation and execution of a graph that adds two tensors.
+TEST_F(XlaCompilerTest, ResourceManager) {
+  // Builds a graph that calls the dummy resource Op.
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = DummyReadResourceCC(scope.WithOpName("B"), a);
+  auto c = ops::_Retval(scope.WithOpName("C"), b.output_, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the argument.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2});
+
+  DummyResourceForTest* resource = new DummyResourceForTest();
+
+  // Compiles the graph.
+  auto options = DefaultOptions();
+  std::function<Status(ResourceMgr*)> populate_function =
+      [resource](ResourceMgr* rm) {
+        resource->Ref();
+        return rm->Create(rm->default_container(), "dummy", resource);
+      };
+  options.populate_resource_manager = &populate_function;
+  XlaCompiler compiler(options);
+
+  EXPECT_EQ(0, resource->Get());
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "dummy",
+                                     std::move(graph), args, &result));
+
+  EXPECT_EQ(1, resource->Get());
+
+  resource->Unref();
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 57d946509b65a6d5ebf013857cf52297559431ea..3592680303c95e310b8da85294ed961a5350e09c 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
@@ -183,9 +184,14 @@ const xla::Computation* XlaContext::GetOrCreateSigmoid(const DataType type) {
     xla::PrimitiveType xla_type;
     TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
-    auto one = b.ConstantLiteral(xla::LiteralUtil::One(xla_type));
-    auto minus_one = b.Neg(one);
-    b.Div(one, b.Add(b.Exp(b.Mul(x, minus_one)), one));
+    // Clamp the inputs to the range [-18, 18] since anything outside
+    // this range is 0.0f or 1.0f in single-precision. We must clamp the range
+    // of x to avoid incorrect outputs due to fast-math optimizations for large
+    // negative x.
+    x = b.Clamp(XlaHelpers::IntegerLiteral(&b, type, -18), x,
+                XlaHelpers::IntegerLiteral(&b, type, 18));
+    auto one = XlaHelpers::One(&b, type);
+    b.Div(one, b.Add(b.Exp(b.Neg(x)), one));
     return b.Build().ConsumeValueOrDie();
   });
 }
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 10d8b67bbd2d0e897e3ca55e584f575448a3a4fd..f060f8f2f178b2bc56caf7a3df9df32c8a407473 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
@@ -89,7 +90,9 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
     case xla::U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case xla::F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      literal =
+          *xla::LiteralUtil::CreateR0<xla::half>(static_cast<xla::half>(value));
+      break;
     case xla::TUPLE:
       LOG(FATAL) << "tuple element type is not integral";
     case xla::OPAQUE:
@@ -107,6 +110,9 @@ xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b,
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
   switch (type) {
+    case xla::F16:
+      return b->ConstantR0<xla::half>(static_cast<xla::half>(value));
+      break;
     case xla::F32:
       return b->ConstantR0<float>(static_cast<float>(value));
       break;
diff --git a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
index cd773d64ed4154aa2a05ac2d15e9358614239b1f..dca420d6ee3fec45f88ac3b450ab0cb4fb83d38a 100644
--- a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
+++ b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
@@ -23,7 +23,7 @@ limitations under the License.
 // actually used.  E.g. some ahead-of-time compiled computations don't need a
 // thread pool.
 namespace Eigen {
-class ThreadPoolDevice;
+struct ThreadPoolDevice;
 }
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index dc5a342bcdd2cc3e47e873c4e495730eb4d0fcde..4de69ee43c355621c429bcd1ba3f4d623e9b0d78 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -186,6 +186,31 @@ Status XlaOpKernelContext::ConstantInputAsIntVector(int index,
   return LiteralToInt64Vector(literal, out);
 }
 
+Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index,
+                                                       xla::Literal* out) {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInput(index, &literal));
+  switch (literal.shape().element_type()) {
+    case xla::S32:
+      out->Clear();
+      *out->mutable_shape() = literal.shape();
+      out->mutable_shape()->set_element_type(xla::S64);
+      for (int32 x : literal.s32s()) {
+        out->add_s64s(x);
+      }
+      return Status::OK();
+
+    case xla::S64:
+      out->Swap(&literal);
+      return Status::OK();
+
+    default:
+      return errors::InvalidArgument(
+          "Invalid argument to ConstantInputAsInt64Literal: ",
+          xla::ShapeUtil::HumanString(literal.shape()));
+  }
+}
+
 // TODO(phawkins): validate that the dimensions form a valid shape, fail
 // gracefully if they do not.
 Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape) {
@@ -332,6 +357,7 @@ void XlaOpKernelContext::SetVariableOutput(int index, int variable_id) {
 
 Status XlaOpKernelContext::AssignVariable(
     int index, DataType type, const xla::ComputationDataHandle& handle) {
+  TF_RET_CHECK(handle.handle() != 0);
   SetOpHasSideEffects();
 
   const XlaExpression* expression =
@@ -354,6 +380,10 @@ void XlaOpKernelContext::SetOpHasSideEffects() {
   XlaContext::Get(context_).AddSideEffects();
 }
 
+XlaCompiler* XlaOpKernelContext::compiler() const {
+  return XlaContext::Get(context_).compiler();
+}
+
 void XlaOpKernelContext::CtxFailure(Status s) { context_->CtxFailure(s); }
 void XlaOpKernelContext::CtxFailureWithWarning(Status s) {
   context_->CtxFailureWithWarning(s);
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index d214879e3cc9a86e6499d0afa68f572b6c6a3a15..0a8a9284186e5b72a8a376ad159eb7b2482699c5 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_
 
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
@@ -109,6 +110,9 @@ class XlaOpKernelContext {
   // Converts a constant 1D int32 or int64 tensor into a vector of int64s.
   Status ConstantInputAsIntVector(int index, std::vector<int64>* out);
 
+  // Converts a constant int32 or int64 Tensor into an xla int64 Literal.
+  Status ConstantInputAsInt64Literal(int index, xla::Literal* out);
+
   // Converts a constant 1D int32 or int64 tensor into a TensorShape.
   Status ConstantInputAsShape(int index, TensorShape* shape);
 
@@ -182,6 +186,10 @@ class XlaOpKernelContext {
   // Returns the underlying OpKernelContext. Use rarely.
   OpKernelContext* op_kernel_context() const { return context_; }
 
+  // Returns the XlaCompiler that is performing the compilation. Used for, e.g.,
+  // While to compile nested computations.
+  XlaCompiler* compiler() const;
+
   // TODO(phawkins): find a better home for these helpers.
 
   // Get an XLA lambda to compute Max. This is cached in the
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 5b895bfdf60d2206316af1b023e5ed91e7eec424..13fdfc3b0c82e3d0018c72eebaaf7fa313111648 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -167,6 +167,8 @@ void XlaOpRegistry::RegisterCompilationKernels() {
           !backend.second.op_filter(kdef.get())) {
         continue;
       }
+      VLOG(2) << "XLA op registration: device: " << backend.first
+              << " op: " << op.first;
       registry.kernel_registrars_.emplace_back(
           new kernel_factory::OpKernelRegistrar(
               new KernelDef(*kdef), "XlaJitOp", op.second->factory));
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index e73a29ddee1cd5b02453524618d5f3623b331cf8..6b424d23092b138e9f4d32062d575f17ab4791cb 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -6,6 +6,7 @@ package_group(
     name = "friends",
     packages = [
         "//tensorflow/compiler/...",
+        "//tensorflow/contrib/xla_tf_graph/...",
     ],
 )
 
@@ -16,6 +17,7 @@ package_group(
     ],
 )
 
+load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 
 # Filegroup used to collect source files for dependency checking.
@@ -43,11 +45,42 @@ xla_proto_library(
     ],
 )
 
+# This is a headers target that extra XLA devices can use to prevent
+# circular dependencies.  Devices that are compiled as separate shared
+# objects can also use it to prevent linking of library code.
+cc_header_only_library(
+    name = "xla_headers_lib",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:stream_executor_headers_lib",
+    ],
+)
+
+cc_library(
+    name = "test",
+    testonly = 1,
+    hdrs = ["test.h"],
+    visibility = [":friends"],
+    deps = [
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "types",
     hdrs = ["types.h"],
     visibility = [":friends"],
-    deps = ["//tensorflow/core:lib"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
 )
 
 cc_library(
@@ -80,6 +113,7 @@ cc_test(
     deps = [
         ":status_macros",
         ":statusor",
+        ":test",
         ":test_helpers",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -115,6 +149,7 @@ cc_test(
     srcs = ["statusor_test.cc"],
     deps = [
         ":statusor",
+        ":test",
         ":types",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -157,9 +192,9 @@ cc_test(
     name = "util_test",
     srcs = ["util_test.cc"],
     deps = [
+        ":test",
         ":types",
         ":util",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -198,10 +233,11 @@ cc_test(
     srcs = ["shape_util_test.cc"],
     deps = [
         ":shape_util",
+        ":test",
         ":test_helpers",
         ":types",
         ":util",
-        "//tensorflow/core:test",
+        ":xla_data_proto",
         "//tensorflow/core:test_main",
     ],
 )
@@ -211,6 +247,7 @@ cc_test(
     srcs = ["layout_util_test.cc"],
     deps = [
         ":shape_util",
+        ":test",
         ":test_helpers",
         "//tensorflow/compiler/xla/legacy_flags:layout_util_flags",
         "//tensorflow/core:test",
@@ -223,9 +260,9 @@ cc_test(
     srcs = ["index_util_test.cc"],
     deps = [
         ":shape_util",
+        ":test",
         ":test_helpers",
         ":xla_data_proto",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -240,6 +277,7 @@ cc_library(
         ":array3d",
         ":array4d",
         ":shape_util",
+        ":status_macros",
         ":types",
         ":util",
         ":xla_data_proto",
@@ -255,7 +293,7 @@ cc_test(
         ":array4d",
         ":literal_util",
         ":shape_util",
-        ":test_helpers",
+        ":test",
         ":types",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -270,7 +308,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":util",
-        ":xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -303,7 +340,7 @@ cc_test(
     srcs = ["array2d_test.cc"],
     deps = [
         ":array2d",
-        "//tensorflow/core:test",
+        ":test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -323,8 +360,8 @@ cc_test(
     srcs = ["array3d_test.cc"],
     deps = [
         ":array3d",
+        ":test",
         ":types",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -345,8 +382,8 @@ cc_test(
     srcs = ["array4d_test.cc"],
     deps = [
         ":array4d",
+        ":test",
         "//tensorflow/core:lib",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -378,7 +415,6 @@ cc_library(
 cc_library(
     name = "test_helpers",
     testonly = 1,
-    srcs = ["test_helpers.cc"],
     hdrs = ["test_helpers.h"],
     visibility = [":internal"],
     deps = [
@@ -414,11 +450,11 @@ cc_test(
     deps = [
         ":literal_util",
         ":shape_util",
+        ":test",
         ":text_literal_reader",
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -443,6 +479,7 @@ cc_test(
     srcs = ["text_literal_writer_test.cc"],
     deps = [
         ":literal_util",
+        ":test",
         ":test_helpers",
         ":text_literal_writer",
         ":types",
@@ -471,8 +508,8 @@ cc_test(
     deps = [
         ":shape_tree",
         ":shape_util",
+        ":test",
         ":xla_data_proto",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -530,11 +567,11 @@ cc_test(
         ":array4d",
         ":literal_util",
         ":reference_util",
+        ":test",
         ":util",
         ":xla_data_proto",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h
index f885821210eb68dfb599303830c814c309e0a24d..593084a0c111690d9e239ed5837f6f0c6c713048 100644
--- a/tensorflow/compiler/xla/array2d.h
+++ b/tensorflow/compiler/xla/array2d.h
@@ -45,11 +45,15 @@ class Array2D {
 
   // Creates an array of dimensions n1 x n2, uninitialized values.
   Array2D(const int64 n1, const int64 n2)
-      : n1_(n1), n2_(n2), values_(n1 * n2) {}
+      : n1_(n1), n2_(n2), values_(new T[n1 * n2]()) {
+    Fill(T());
+  }
 
   // Creates an array of dimensions n1 x n2, initialized to value.
   Array2D(const int64 n1, const int64 n2, const T value)
-      : n1_(n1), n2_(n2), values_(n1 * n2, value) {}
+      : n1_(n1), n2_(n2), values_(new T[n1 * n2]()) {
+    Fill(value);
+  }
 
   // Creates an array from the given nested initializer list. The outer
   // initializer list is the first dimension; the inner is the second dimension.
@@ -65,16 +69,30 @@ class Array2D {
     }
   }
 
-  T& operator()(const int64 n1, const int64 n2) {
-    CHECK_LT(n1, n1_);
-    CHECK_LT(n2, n2_);
-    return values_[n1 * n2_ + n2];
+  Array2D(const Array2D<T>& other) : Array2D(other.n1(), other.n2()) {
+    std::copy(&other.values_[0], &other.values_[0] + num_elements(),
+              &values_[0]);
+  }
+
+  Array2D<T>& operator=(const Array2D<T>& other) {
+    n1_ = other.n1();
+    n2_ = other.n2();
+    values_.reset(new T[num_elements()]);
+    std::copy(&other.values_[0], &other.values_[0] + num_elements(),
+              &values_[0]);
+    return *this;
+  }
+
+  T& operator()(const int64 i1, const int64 i2) {
+    CHECK_LT(i1, n1_);
+    CHECK_LT(i2, n2_);
+    return values_[i1 * n2_ + i2];
   }
 
-  const T& operator()(const int64 n1, const int64 n2) const {
-    CHECK_LT(n1, n1_);
-    CHECK_LT(n2, n2_);
-    return values_[n1 * n2_ + n2];
+  const T& operator()(const int64 i1, const int64 i2) const {
+    CHECK_LT(i1, n1_);
+    CHECK_LT(i2, n2_);
+    return values_[i1 * n2_ + i2];
   }
 
   // Access to the array's dimensions. height() and width() provide the
@@ -84,15 +102,15 @@ class Array2D {
   int64 n2() const { return n2_; }
   int64 height() const { return n1_; }
   int64 width() const { return n2_; }
-  int64 num_elements() const { return values_.size(); }
+  int64 num_elements() const { return n1_ * n2_; }
 
   // Low-level accessor for stuff like memcmp, handle with care. Returns pointer
   // to the underlying storage of the array (similarly to std::vector::data()).
-  T* data() const { return const_cast<Array2D*>(this)->values_.data(); }
+  T* data() const { return const_cast<Array2D*>(this)->values_.get(); }
 
   // Fills the array with the given value.
   void Fill(const T& value) {
-    std::fill(values_.begin(), values_.end(), value);
+    std::fill(&values_[0], &values_[0] + num_elements(), value);
   }
 
   // Applies f to all cells in this array, in row-major order.
@@ -124,8 +142,8 @@ class Array2D {
     std::mt19937 g(seed);
     std::normal_distribution<double> distribution(mean,
                                                   static_cast<double>(value));
-    for (auto& v : values_) {
-      v = static_cast<T>(distribution(g));
+    for (int64 i = 0; i < num_elements(); ++i) {
+      values_[i] = static_cast<T>(distribution(g));
     }
   }
 
@@ -150,7 +168,7 @@ class Array2D {
  private:
   int64 n1_;
   int64 n2_;
-  std::vector<T> values_;
+  std::unique_ptr<T[]> values_;
 };
 
 // Returns a linspace-populated Array2D in the range [from, to] (inclusive)
diff --git a/tensorflow/compiler/xla/array2d_test.cc b/tensorflow/compiler/xla/array2d_test.cc
index ac107b1c0d426c676629762dbc8191c74e2e1c7e..795d50ca5b56a60c34279a33e65aa635a65fa5ec 100644
--- a/tensorflow/compiler/xla/array2d_test.cc
+++ b/tensorflow/compiler/xla/array2d_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <initializer_list>
 
-#include "tensorflow/core/platform/test.h"
+#include "tensorflow/compiler/xla/test.h"
 
 namespace xla {
 namespace {
@@ -84,6 +84,17 @@ TEST(Array2dTest, IndexingReadWrite) {
   EXPECT_EQ(arr(1, 2), 61);
 }
 
+TEST(Array2dTest, IndexingReadWriteBool) {
+  Array2D<bool> arr = {{false, true, false}, {true, true, false}};
+
+  EXPECT_EQ(arr(1, 1), true);
+  EXPECT_EQ(arr(1, 2), false);
+  arr(1, 1) = false;
+  arr(1, 2) = true;
+  EXPECT_EQ(arr(1, 1), false);
+  EXPECT_EQ(arr(1, 2), true);
+}
+
 TEST(Array2dTest, Fill) {
   Array2D<int> fullof7(2, 3, 7);
   for (int64 n1 = 0; n1 < fullof7.n1(); ++n1) {
diff --git a/tensorflow/compiler/xla/array3d.h b/tensorflow/compiler/xla/array3d.h
index 654af8f03074f30dd1561db412ad36f43a33aab9..124ccd1975b3a9ab047e9bbbfb38921fe7386fe4 100644
--- a/tensorflow/compiler/xla/array3d.h
+++ b/tensorflow/compiler/xla/array3d.h
@@ -20,9 +20,9 @@ limitations under the License.
 #include <functional>
 #include <initializer_list>
 #include <iterator>
+#include <memory>
 #include <numeric>
 #include <random>
-#include <vector>
 
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
@@ -39,11 +39,15 @@ class Array3D {
  public:
   // Creates an array of dimensions n1 x n2 x n3, uninitialized values.
   Array3D(const int64 n1, const int64 n2, const int64 n3)
-      : n1_(n1), n2_(n2), n3_(n3), values_(n1 * n2 * n3) {}
+      : n1_(n1), n2_(n2), n3_(n3), values_(new T[n1 * n2 * n3]) {
+    Fill(T());
+  }
 
   // Creates an array of dimensions n1 x n2 x n3, initialized to value.
   Array3D(const int64 n1, const int64 n2, const int64 n3, const T value)
-      : n1_(n1), n2_(n2), n3_(n3), values_(n1 * n2 * n3, value) {}
+      : n1_(n1), n2_(n2), n3_(n3), values_(new T[n1 * n2 * n3]) {
+    Fill(value);
+  }
 
   // Creates an array from the given nested initializer list. The outer
   // initializer list is the first dimension, and so on.
@@ -69,34 +73,50 @@ class Array3D {
     }
   }
 
-  T& operator()(const int64 n1, const int64 n2, const int64 n3) {
-    CHECK_LT(n1, n1_);
-    CHECK_LT(n2, n2_);
-    CHECK_LT(n3, n3_);
-    return values_[n1 * n2_ * n3_ + n2 * n3_ + n3];
+  Array3D(const Array3D<T>& other)
+      : Array3D(other.n1(), other.n2(), other.n3()) {
+    std::copy(&other.values_[0], &other.values_[0] + num_elements(),
+              &values_[0]);
+  }
+
+  Array3D<T>& operator=(const Array3D<T>& other) {
+    n1_ = other.n1();
+    n2_ = other.n2();
+    n3_ = other.n3();
+    values_.reset(new T[num_elements()]);
+    std::copy(&other.values_[0], &other.values_[0] + num_elements(),
+              &values_[0]);
+    return *this;
+  }
+
+  T& operator()(const int64 i1, const int64 i2, const int64 i3) {
+    CHECK_LT(i1, n1_);
+    CHECK_LT(i2, n2_);
+    CHECK_LT(i3, n3_);
+    return values_[i1 * n2_ * n3_ + i2 * n3_ + i3];
   }
 
-  const T& operator()(const int64 n1, const int64 n2, const int64 n3) const {
-    CHECK_LT(n1, n1_);
-    CHECK_LT(n2, n2_);
-    CHECK_LT(n3, n3_);
-    return values_[n1 * n2_ * n3_ + n2 * n3_ + n3];
+  const T& operator()(const int64 i1, const int64 i2, const int64 i3) const {
+    CHECK_LT(i1, n1_);
+    CHECK_LT(i2, n2_);
+    CHECK_LT(i3, n3_);
+    return values_[i1 * n2_ * n3_ + i2 * n3_ + i3];
   }
 
   // Access to the array's dimensions.
   int64 n1() const { return n1_; }
   int64 n2() const { return n2_; }
   int64 n3() const { return n3_; }
-  int64 num_elements() const { return values_.size(); }
+  int64 num_elements() const { return n1_ * n2_ * n3_; }
 
   // Fills the array with the given value.
   void Fill(const T& value) {
-    std::fill(values_.begin(), values_.end(), value);
+    std::fill(&values_[0], &values_[0] + num_elements(), value);
   }
 
   // Fills the array with sequentially increasing values.
   void FillIota(const T& value) {
-    std::iota(values_.begin(), values_.end(), value);
+    std::iota(&values_[0], &values_[0] + num_elements(), value);
   }
 
   // Fills the array with random normal values with a mean of 0 and standard
@@ -106,8 +126,8 @@ class Array3D {
     std::mt19937 g(seed);
     std::normal_distribution<double> distribution(mean,
                                                   static_cast<double>(value));
-    for (auto& v : values_) {
-      v = static_cast<T>(distribution(g));
+    for (int64 i = 0; i < num_elements(); ++i) {
+      values_[i] = static_cast<T>(distribution(g));
     }
   }
 
@@ -115,7 +135,7 @@ class Array3D {
   int64 n1_;
   int64 n2_;
   int64 n3_;
-  std::vector<T> values_;
+  std::unique_ptr<T[]> values_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/array3d_test.cc b/tensorflow/compiler/xla/array3d_test.cc
index fa4435dfc48edcd5b88230e7d2de21e29e269b7e..6b5f4b343b2113652758bbd5ce0fc803239c1266 100644
--- a/tensorflow/compiler/xla/array3d_test.cc
+++ b/tensorflow/compiler/xla/array3d_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include <initializer_list>
 
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/array4d.h b/tensorflow/compiler/xla/array4d.h
index 199ad2baaeb7999349fd6bb201a476706bb12ce7..56b638d9782a6c9db5206c070d69c5b2b367313f 100644
--- a/tensorflow/compiler/xla/array4d.h
+++ b/tensorflow/compiler/xla/array4d.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <functional>
 #include <initializer_list>
 #include <iterator>
+#include <memory>
 #include <numeric>
 #include <random>
 #include <string>
@@ -60,15 +61,15 @@ class Array4D {
         depth_(depth),
         height_(height),
         width_(width),
-        values_(planes * depth * height * width) {}
+        values_(new T[planes * depth * height * width]) {
+    Fill(T());
+  }
 
-  // Creates a 4D array, initalized to value.
+  // Creates a 4D array, initialized to value.
   Array4D(int64 planes, int64 depth, int64 height, int64 width, T value)
-      : planes_(planes),
-        depth_(depth),
-        height_(height),
-        width_(width),
-        values_(planes * depth * height * width, value) {}
+      : Array4D(planes, depth, height, width) {
+    Fill(value);
+  }
 
   // Creates a 4D array, filled with values.
   //
@@ -111,6 +112,23 @@ class Array4D {
     }
   }
 
+  Array4D(const Array4D<T>& other)
+      : Array4D(other.planes(), other.depth(), other.height(), other.width()) {
+    std::copy(&other.values_[0], &other.values_[0] + num_elements(),
+              &values_[0]);
+  }
+
+  Array4D<T>& operator=(const Array4D<T>& other) {
+    planes_ = other.planes();
+    depth_ = other.depth();
+    height_ = other.height();
+    width_ = other.width();
+    values_.reset(new T[num_elements()]);
+    std::copy(&other.values_[0], &other.values_[0] + num_elements(),
+              &values_[0]);
+    return *this;
+  }
+
   T& operator()(int64 plane, int64 depth, int64 height, int64 width) {
     CHECK_LT(plane, planes_);
     CHECK_LT(depth, depth_);
@@ -135,24 +153,24 @@ class Array4D {
   int64 n3() const { return height_; }
   int64 n2() const { return depth_; }
   int64 n1() const { return planes_; }
-  int64 num_elements() const { return values_.size(); }
+  int64 num_elements() const { return width_ * height_ * depth_ * planes_; }
 
   // Sets all the values in the array to values.
   template <typename Container = std::initializer_list<T>>
   void SetValues(const Container& container) {
     CHECK_EQ(std::distance(std::begin(container), std::end(container)),
              num_elements());
-    values_.assign(std::begin(container), std::end(container));
+    std::copy(std::begin(container), std::end(container), &values_[0]);
   }
 
   // Fills the array with the given value.
   void Fill(const T& value) {
-    std::fill(values_.begin(), values_.end(), value);
+    std::fill(&values_[0], &values_[0] + num_elements(), value);
   }
 
   // Fills the array with iota.
   void FillIota(const T& value) {
-    std::iota(values_.begin(), values_.end(), value);
+    std::iota(&values_[0], &values_[0] + num_elements(), value);
   }
 
   // Fills the array with random variable with a deviation of value and a mean
@@ -162,8 +180,8 @@ class Array4D {
     std::mt19937 g(seed);
     std::normal_distribution<double> distribution(mean,
                                                   static_cast<double>(value));
-    for (auto& v : values_) {
-      v = static_cast<T>(distribution(g));
+    for (int64 i = 0; i < num_elements(); ++i) {
+      values_[i] = static_cast<T>(distribution(g));
     }
   }
 
@@ -268,7 +286,7 @@ class Array4D {
   int64 depth_;
   int64 height_;
   int64 width_;
-  std::vector<T> values_;
+  std::unique_ptr<T[]> values_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/array4d_test.cc b/tensorflow/compiler/xla/array4d_test.cc
index 72ada467e515eff98a2e5845dc6a3714a770650e..3bc8148c911df0aeade364e4ac2e2ee828bacb53 100644
--- a/tensorflow/compiler/xla/array4d_test.cc
+++ b/tensorflow/compiler/xla/array4d_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include <initializer_list>
 #include <numeric>
 
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 3e9dfe2a922c913c528d586413c11e2da8cbdc39..2d96128e259da316a41e83bea221ae201ad88a13 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -99,6 +99,26 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "compile_only_client",
+    srcs = ["compile_only_client.cc"],
+    hdrs = ["compile_only_client.h"],
+    deps = [
+        ":client",
+        ":computation",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:compile_only_service",
+        "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "@llvm//:support",
+    ],
+)
+
 # This target is used to instantiate the XLA service in-process and create
 # a client for it.
 cc_library(
@@ -106,12 +126,14 @@ cc_library(
     srcs = ["client_library.cc"],
     hdrs = ["client_library.h"],
     deps = [
+        ":compile_only_client",
         ":local_client",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:backend",
+        "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index 93437023bc8956e449f828f5bf6dea7a6bff8610..8238261e1c90cadeda9005e437d684d3770bd67b 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -43,6 +43,16 @@ int LocalClientOptions::number_of_replicas() const {
   return number_of_replicas_;
 }
 
+LocalClientOptions& LocalClientOptions::set_intra_op_parallelism_threads(
+    int num_threads) {
+  intra_op_parallelism_threads_ = num_threads;
+  return *this;
+}
+
+int LocalClientOptions::intra_op_parallelism_threads() const {
+  return intra_op_parallelism_threads_;
+}
+
 /* static */ ClientLibrary& ClientLibrary::Singleton() {
   static ClientLibrary* c = new ClientLibrary;
   return *c;
@@ -69,22 +79,24 @@ ClientLibrary::~ClientLibrary() = default;
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
 
-  auto it = client_library.instances_.find(platform->id());
-  if (it != client_library.instances_.end()) {
+  auto it = client_library.local_instances_.find(platform->id());
+  if (it != client_library.local_instances_.end()) {
     return it->second->client.get();
   }
 
   ServiceOptions service_options;
   service_options.set_platform(platform);
   service_options.set_number_of_replicas(replica_count);
+  service_options.set_intra_op_parallelism_threads(
+      options.intra_op_parallelism_threads());
 
-  std::unique_ptr<LocalInstance> instance = MakeUnique<LocalInstance>();
+  auto instance = MakeUnique<LocalInstance>();
   TF_ASSIGN_OR_RETURN(instance->service,
                       LocalService::NewService(service_options));
   instance->client = MakeUnique<LocalClient>(instance->service.get());
   LocalClient* cl = instance->client.get();
 
-  client_library.instances_.insert(
+  client_library.local_instances_.insert(
       std::make_pair(platform->id(), std::move(instance)));
   return cl;
 }
@@ -99,9 +111,35 @@ ClientLibrary::~ClientLibrary() = default;
     perftools::gputools::Platform* platform) {
   ClientLibrary& client_library = Singleton();
   tensorflow::mutex_lock lock(client_library.service_mutex_);
-  auto it = client_library.instances_.find(platform->id());
-  CHECK(it != client_library.instances_.end());
+  auto it = client_library.local_instances_.find(platform->id());
+  CHECK(it != client_library.local_instances_.end());
   return it->second->service.get();
 }
 
+/* static */ StatusOr<CompileOnlyClient*>
+ClientLibrary::GetOrCreateCompileOnlyClient(
+    perftools::gputools::Platform* platform) {
+  ClientLibrary& client_library = Singleton();
+  tensorflow::mutex_lock lock(client_library.service_mutex_);
+
+  if (platform == nullptr) {
+    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
+  }
+
+  auto it = client_library.compile_only_instances_.find(platform->id());
+  if (it != client_library.compile_only_instances_.end()) {
+    return it->second->client.get();
+  }
+
+  auto instance = MakeUnique<CompileOnlyInstance>();
+  TF_ASSIGN_OR_RETURN(instance->service,
+                      CompileOnlyService::NewService(platform));
+  instance->client = MakeUnique<CompileOnlyClient>(instance->service.get());
+  CompileOnlyClient* cl = instance->client.get();
+
+  client_library.compile_only_instances_.insert(
+      std::make_pair(platform->id(), std::move(instance)));
+  return cl;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index 2bc319f9333368635690add017ad3d89947e2551..3ddd235d0efeeb78f49eafbf670d7c74a88960dd 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -26,7 +26,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -51,9 +53,14 @@ class LocalClientOptions {
   LocalClientOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
 
+  // Sets the thread pool size for parallel execution of an individual operator.
+  LocalClientOptions& set_intra_op_parallelism_threads(int num_threads);
+  int intra_op_parallelism_threads() const;
+
  private:
   perftools::gputools::Platform* platform_ = nullptr;
   int number_of_replicas_ = -1;
+  int intra_op_parallelism_threads_ = -1;
 };
 
 class ClientLibrary {
@@ -76,6 +83,13 @@ class ClientLibrary {
   // access user computations from client.
   static LocalService* GetXlaService(perftools::gputools::Platform* platform);
 
+  // Singleton constructor-or-accessor for compile-only clients. Arguments:
+  //
+  //   platform : The platform the underlying XLA service should target. If
+  //     null then default platform is used.
+  static StatusOr<CompileOnlyClient*> GetOrCreateCompileOnlyClient(
+      perftools::gputools::Platform* platform = nullptr);
+
  private:
   // Returns the singleton instance of ClientLibrary.
   static ClientLibrary& Singleton();
@@ -90,10 +104,21 @@ class ClientLibrary {
     std::unique_ptr<LocalClient> client;
   };
 
+  struct CompileOnlyInstance {
+    // Service that is wrapped by the singleton client object.
+    std::unique_ptr<CompileOnlyService> service;
+    // Singleton client object.
+    std::unique_ptr<CompileOnlyClient> client;
+  };
+
   tensorflow::mutex service_mutex_;  // Guards the singleton creation state.
   std::unordered_map<perftools::gputools::Platform::Id,
                      std::unique_ptr<LocalInstance>>
-      instances_ GUARDED_BY(service_mutex_);
+      local_instances_ GUARDED_BY(service_mutex_);
+
+  std::unordered_map<perftools::gputools::Platform::Id,
+                     std::unique_ptr<CompileOnlyInstance>>
+      compile_only_instances_ GUARDED_BY(service_mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary);
 };
diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ff6f0b300f9e2cc776e60bb27a3952356657780
--- /dev/null
+++ b/tensorflow/compiler/xla/client/compile_only_client.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/compile_only_client.h"
+
+#include "external/llvm/include/llvm/ADT/Triple.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyClient::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<CompileOnlyService::AotComputationInstance> service_instances;
+  service_instances.reserve(computations.size());
+  for (const AotComputationInstance& instance : computations) {
+    service_instances.push_back({});
+    CompileOnlyService::AotComputationInstance& service_instance =
+        service_instances.back();
+    TF_RET_CHECK(instance.computation != nullptr);
+    service_instance.computation = instance.computation->handle();
+    service_instance.argument_layouts = instance.argument_layouts;
+    service_instance.result_layout = instance.result_layout;
+  }
+  return compiler_service_->CompileAheadOfTime(service_instances, options);
+}
+
+int64 CompileOnlyClient::PointerSizeForTriple(
+    tensorflow::StringPiece target_triple) {
+  llvm::Triple triple(
+      llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
+  if (triple.isArch64Bit()) {
+    return 8;
+  } else if (triple.isArch32Bit()) {
+    return 4;
+  } else {
+    CHECK(triple.isArch16Bit());
+    return 2;
+  }
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..5900048711384e0240a3cd502260eb388eb40f51
--- /dev/null
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
+
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// An XLA Client specialization for doing ahead-of-time compilation.  This does
+// not require (or attempt to instantiate) an execution-capable backend for the
+// relevant platform.
+class CompileOnlyClient : public Client {
+ public:
+  explicit CompileOnlyClient(CompileOnlyService* service)
+      : Client(service), compiler_service_(service) {}
+
+  CompileOnlyClient(const CompileOnlyClient&) = delete;
+  void operator=(const CompileOnlyClient&) = delete;
+
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AotComputationInstance {
+    const Computation* computation;
+    // Inform the compiler of the expected layout for arguments.
+    std::vector<const Shape*> argument_layouts;
+    // Specifies the expected result layout.
+    const Shape* result_layout;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution.  This is
+  // intended for use in static compilation. The |options| parameter describes
+  // the target for which the compiler should emit code.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+      const AotCompilationOptions& options);
+
+  // Returns the size of a pointer in bytes for a given triple.
+  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
+
+ private:
+  CompileOnlyService* compiler_service_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 88efd87d1cc3efd16750d6dfedb18159114b2cb6..22a70681468f16b12793274bf5ce72613534df42 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -1229,8 +1229,7 @@ StatusOr<bool> ComputationBuilder::IsConstant(
   VLOG(2) << "done with request";
 
   if (!s.ok()) {
-    NoteError(s);
-    return first_error_;
+    return s;
   }
   return response.is_constant();
 }
@@ -1255,8 +1254,7 @@ StatusOr<std::unique_ptr<GlobalData>> ComputationBuilder::ComputeConstant(
   VLOG(2) << "done with request";
 
   if (!s.ok()) {
-    NoteError(s);
-    return first_error_;
+    return s;
   }
 
   TF_RET_CHECK(response.output().handle() != 0);
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 87ceb43d1fe6650e1d160f3099b883ea208d8aac..6af69eeec12dec0ea1303826859d4655cf92932e 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -668,6 +668,14 @@ class ComputationBuilder {
   // then Build() should be used instead.
   Computation BuildAndNoteError();
 
+  // Returns the first error that was encountered while building the
+  // computation. When an error is encountered, by default we return a vacuous
+  // ComputationDataHandle and inform the user of the error that occurred while
+  // building the computation when they make a final call to Build().
+  //
+  // See also set_die_immediately_on_error().
+  Status first_error() const { return first_error_; }
+
  private:
   using PopulateLiteral = std::function<void(Literal*)>;
 
diff --git a/tensorflow/compiler/xla/client/global_data.h b/tensorflow/compiler/xla/client/global_data.h
index eb11d91034ba524f093ff80fa7cd0473e04eac2c..b7929357d06032b55c04bf0391f7fa703ee15f17 100644
--- a/tensorflow/compiler/xla/client/global_data.h
+++ b/tensorflow/compiler/xla/client/global_data.h
@@ -23,13 +23,15 @@ limitations under the License.
 
 namespace xla {
 
-// Wraps a GlobalDataHandle with a lifetime.
+// A GlobalData object represents a globally-accessible allocation of
+// data in the associated XLA service.
 class GlobalData {
  public:
   // Gives ownership of the global data handle to this object.
   GlobalData(ServiceInterface* parent, GlobalDataHandle handle);
 
-  // Unregisters the wrapped handle.
+  // Unregisters the wrapped handle, which causes the service to
+  // deallocate the associated data.
   ~GlobalData();
 
   const GlobalDataHandle& handle() const { return handle_; }
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index bfd14bc1c010353e3e473f10dd6c030cb0438648..02cf57e7632a2064e646d4dc441e3ec119053564 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -176,17 +176,24 @@ StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::Run(
   TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));
 
   ExecutableRunOptions actual_options = options;
-  Backend::StreamPtr stream;
   if (options.stream() == nullptr) {
     TF_ASSIGN_OR_RETURN(
-        stream, BorrowStreamForDevice(options.device_ordinal(), backend_));
+        Backend::StreamPtr stream,
+        BorrowStreamForDevice(options.device_ordinal(), backend_));
     actual_options.set_stream(stream.get());
   }
   if (options.allocator() == nullptr) {
     actual_options.set_allocator(backend_->memory_allocator());
   }
-  ServiceExecutableRunOptions service_options(actual_options,
-                                              backend_->StreamBorrower());
+
+  // For local client execution on CPU backends:
+  // *) The thread pool used for eigen CPU ops is from
+  //    ExecutableRunOptions.eigen_intra_op_thread_pool.
+  // *) The thread pool used for XLA CPU ops is from
+  //    backend_->eigen_intra_op_thread_pool().
+  ServiceExecutableRunOptions service_options(
+      actual_options, backend_->StreamBorrower(),
+      backend_->eigen_intra_op_thread_pool());
 
   if (executable_->dumping()) {
     return ExecuteAndDump(&service_options, arguments);
@@ -253,46 +260,6 @@ StatusOr<std::unique_ptr<GlobalData>> LocalClient::AllocateBufferOnDevice(
   return std::unique_ptr<GlobalData>(new GlobalData(local_service_, handle));
 }
 
-tensorflow::Status LocalClient::ResolveArguments(
-    const tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* argument_ptrs) {
-  return local_service_->ResolveArguments(arguments, device_ordinal,
-                                          argument_ptrs);
-}
-
-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-LocalClient::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-        computations,
-    const AotCompilationOptions& options) {
-  std::vector<LocalService::AheadOfTimeComputationInstance> service_instances;
-  service_instances.reserve(computations.size());
-  for (const AheadOfTimeComputationInstance& instance : computations) {
-    service_instances.push_back({});
-    LocalService::AheadOfTimeComputationInstance& service_instance =
-        service_instances.back();
-    TF_RET_CHECK(instance.computation != nullptr);
-    service_instance.computation = instance.computation->handle();
-    service_instance.argument_layouts = instance.argument_layouts;
-    service_instance.result_layout = instance.result_layout;
-  }
-  return local_service_->CompileAheadOfTime(service_instances, options);
-}
-
-int64 LocalClient::PointerSizeForTriple(tensorflow::StringPiece target_triple) {
-  llvm::Triple triple(
-      llvm::Triple::normalize(llvm_ir::AsStringRef(target_triple)));
-  if (triple.isArch64Bit()) {
-    return 8;
-  } else if (triple.isArch32Bit()) {
-    return 4;
-  } else {
-    CHECK(triple.isArch16Bit());
-    return 2;
-  }
-}
-
 se::Platform* LocalClient::platform() const {
   return local_service_->backend().platform();
 }
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 2c467efcea119b66ad08e0636eca0f1acec3a3b8..c903cd271125b44677f7bb191f100f6604f40bbc 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -56,7 +56,7 @@ class ExecutableBuildOptions {
 
   // If set, this specifies the layout of the result of the computation. If not
   // set, the service will chose the layout of the result. A Shape is used to
-  // store the layout to accomodate tuple result shapes. A value of nullptr
+  // store the layout to accommodate tuple result shapes. A value of nullptr
   // indicates the option has not been set.
   ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
   const Shape* result_layout() const;
@@ -148,7 +148,7 @@ class LocalExecutable {
   const ExecutableBuildOptions& build_options_;
 };
 
-// An XLA service client object for use when the client and service run in
+// An XLA Client specialization for use when the client and service run in
 // the same process.
 class LocalClient : public Client {
  public:
@@ -158,14 +158,6 @@ class LocalClient : public Client {
   LocalClient(const LocalClient&) = delete;
   void operator=(const LocalClient&) = delete;
 
-  // For an array of arguments held on the local service, validate
-  // that each is placed on the specified device_ordinal, and return
-  // the DeviceMemoryBase corresponding to each argument.
-  tensorflow::Status ResolveArguments(
-      const tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* argument_ptrs);
-
   // Return a handle to a buffer large enough to hold shape, allocated
   // on device_ordinal on the local service. If
   // allocate_space_for_deep_copy, the buffer is large enough to hold
@@ -182,30 +174,6 @@ class LocalClient : public Client {
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
       const ExecutableBuildOptions& options);
 
-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AheadOfTimeComputationInstance {
-    const Computation* computation;
-    // Inform the compiler of the expected layout for arguments.
-    std::vector<const Shape*> argument_layouts;
-    // Specifies the expected result layout.
-    const Shape* result_layout;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution.  This is
-  // intended for use in static compilation. The |options| parameter describes
-  // the target for which the compiler should emit code.
-  //
-  // TODO(b/31222190): This doesn't really belong in LocalClient. Move it to its
-  // own library.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-          computations,
-      const AotCompilationOptions& options);
-
-  // Returns the size of a pointer in bytes for a given triple.
-  static int64 PointerSizeForTriple(tensorflow::StringPiece triple);
-
   // Returns the platform that the underlying service targets.
   perftools::gputools::Platform* platform() const;
 
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index e3248d8e908b60c7e6f7224d25b963601c92f24a..76c0168f370ff1f0749759705b7ecff359a80341 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -118,17 +118,36 @@ namespace xla {
   return multi_index;
 }
 
-/* static */ bool IndexUtil::BumpIndices(const Shape& shape,
-                                         std::vector<int64>* indices) {
-  for (int64 dimno = indices->size() - 1; dimno >= 0; --dimno) {
+/* static */ bool IndexUtil::BumpIndices(
+    const Shape& shape, tensorflow::gtl::MutableArraySlice<int64> indices) {
+  for (int64 dimno = indices.size() - 1; dimno >= 0; --dimno) {
     int64 limit = shape.dimensions(dimno);
-    if ((*indices)[dimno] + 1 < limit) {
-      (*indices)[dimno]++;
-      std::fill(indices->begin() + dimno + 1, indices->end(), 0);
+    if (indices[dimno] + 1 < limit) {
+      indices[dimno]++;
+      std::fill(indices.begin() + dimno + 1, indices.end(), 0);
       return true;
     }
   }
   return false;
 }
 
+/* static */ int64 IndexUtil::GetDimensionStride(const Shape& shape,
+                                                 int64 dimension) {
+  const Layout& layout = shape.layout();
+  int64 pdim_size = layout.padded_dimensions_size();
+  int64 stride = 1;
+  DCHECK(pdim_size == 0 || pdim_size == shape.dimensions_size());
+  for (auto dim : layout.minor_to_major()) {
+    if (dim == dimension) {
+      break;
+    }
+    if (pdim_size == 0) {
+      stride *= shape.dimensions(dim);
+    } else {
+      stride *= layout.padded_dimensions(dim);
+    }
+  }
+  return stride;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h
index 2d8753c3fe8fc05bdcdeaa18360ac5fe4a5e587b..c9838966a5b67397eb5fc4afe3ab9d98e82eb2b1 100644
--- a/tensorflow/compiler/xla/index_util.h
+++ b/tensorflow/compiler/xla/index_util.h
@@ -58,7 +58,16 @@ class IndexUtil {
   //
   // Returns true iff the indices were successfully bumped; false if we've hit
   // the limit where it can no longer be bumped in-bounds.
-  static bool BumpIndices(const Shape& shape, std::vector<int64>* indices);
+  static bool BumpIndices(const Shape& shape,
+                          tensorflow::gtl::MutableArraySlice<int64> indices);
+
+  // Calculates the stride size (in number of elements, not byte size) of a
+  // given logical shape dimension (from 0 to rank-1). If available, padded
+  // dimensions are used.
+  // Example:
+  //  GetDimensionStride(F32[5,8,10,4]{3,2,1,0}, 1) ==
+  //    sizeof(dimension(3)) * sizeof(dimension(2)) == 4 * 10
+  static int64 GetDimensionStride(const Shape& shape, int64 dimension);
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(IndexUtil);
diff --git a/tensorflow/compiler/xla/index_util_test.cc b/tensorflow/compiler/xla/index_util_test.cc
index 85259b33f0beea4b508c0d5c1f3a6294dda76813..7c4efdee484d9530a69b31cbe3a0d69a8a3cffa7 100644
--- a/tensorflow/compiler/xla/index_util_test.cc
+++ b/tensorflow/compiler/xla/index_util_test.cc
@@ -18,9 +18,8 @@ limitations under the License.
 #include <initializer_list>
 
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
@@ -144,14 +143,11 @@ TEST(IndexUtilTest, BumpIndices2x2) {
   auto shape = ShapeUtil::MakeShape(S32, {2, 2});
   std::vector<int64> indices = {0, 0};
   EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices));
-  EXPECT_MATCH(indices,
-               testing::VectorMatcher<int64>(std::vector<int64>{0, 1}));
+  EXPECT_THAT(indices, ::testing::ElementsAre(0, 1));
   EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices));
-  EXPECT_MATCH(indices,
-               testing::VectorMatcher<int64>(std::vector<int64>{1, 0}));
+  EXPECT_THAT(indices, ::testing::ElementsAre(1, 0));
   EXPECT_TRUE(IndexUtil::BumpIndices(shape, &indices));
-  EXPECT_MATCH(indices,
-               testing::VectorMatcher<int64>(std::vector<int64>{1, 1}));
+  EXPECT_THAT(indices, ::testing::ElementsAre(1, 1));
   EXPECT_FALSE(IndexUtil::BumpIndices(shape, &indices));
 }
 
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index 531a6e03dad4759416f56465a6c582a06e440a5a..d3fcccff654fbbafa0b3c6a3d900123691f059fb 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -14,11 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-
 #include "tensorflow/compiler/xla/legacy_flags/layout_util_flags.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
@@ -114,8 +113,8 @@ TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleDifferentRank) {
   Shape dst = MakeShapeWithLayout(F32, {2, 3}, {1, 0});
   auto status = LayoutUtil::CopyLayoutBetweenShapes(src, &dst);
   EXPECT_FALSE(status.ok());
-  EXPECT_MATCH(status.error_message(),
-               testing::ContainsRegex("cannot copy layout from shape"));
+  EXPECT_THAT(status.error_message(),
+              ::testing::ContainsRegex("cannot copy layout from shape"));
 }
 
 TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleTuple) {
@@ -133,8 +132,8 @@ TEST_F(LayoutUtilTest, CopyLayoutNotCompatibleTuple) {
 
   auto status = LayoutUtil::CopyLayoutBetweenShapes(src, &dst);
   EXPECT_FALSE(status.ok());
-  EXPECT_MATCH(status.error_message(),
-               testing::ContainsRegex("cannot copy layout from shape"));
+  EXPECT_THAT(status.error_message(),
+              ::testing::ContainsRegex("cannot copy layout from shape"));
 }
 
 TEST_F(LayoutUtilTest, CopyLayoutBogusLayout) {
@@ -145,9 +144,10 @@ TEST_F(LayoutUtilTest, CopyLayoutBogusLayout) {
 
   auto status = LayoutUtil::CopyLayoutBetweenShapes(src, &dst);
   EXPECT_FALSE(status.ok());
-  EXPECT_MATCH(status.error_message(),
-               testing::ContainsRegex("layout minor_to_major field contains .* "
-                                      "elements, but shape is rank"));
+  EXPECT_THAT(
+      status.error_message(),
+      ::testing::ContainsRegex("layout minor_to_major field contains .* "
+                               "elements, but shape is rank"));
 }
 
 TEST_F(LayoutUtilTest, ClearLayoutTuple) {
diff --git a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc b/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc
index e79d3635095a0aacf20b37e586d2c9ac799cbe07..7d3ad60aea44bedcd5dccce91f1c4d24576f02b0 100644
--- a/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/gpu_compiler_flags.cc
@@ -38,7 +38,6 @@ static void AllocateFlags() {
   flags = new GpuCompilerFlags;
   flags->xla_gpu_embed_ir = false;
   flags->xla_cuda_data_dir = "./cuda_sdk_lib";
-  flags->xla_ptxas_path = "/usr/local/cuda/bin/ptxas";
   flag_list = new std::vector<tensorflow::Flag>({
       tensorflow::Flag(
           "xla_gpu_embed_ir", &flags->xla_gpu_embed_ir,
diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc
index 8822f6f6107d3d9ff121c04e5904a7367c604be7..ba43a5919522ff783f450481c629d64613e1f8ab 100644
--- a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.cc
@@ -36,10 +36,14 @@ static std::once_flag flags_init;
 static void AllocateFlags() {
   flags = new HloGraphDumperFlags;
   flags->xla_hlo_dump_graph_path = "/tmp/";
+  flags->xla_hlo_dump_as_graphdef = false;
   flag_list = new std::vector<tensorflow::Flag>({
       tensorflow::Flag("xla_hlo_dump_graph_path",
                        &flags->xla_hlo_dump_graph_path,
                        "Path to write dumped HLO graphs to"),
+      tensorflow::Flag("xla_hlo_dump_as_graphdef",
+                       &flags->xla_hlo_dump_as_graphdef,
+                       "Dumps HLO graphs as tensorflow GraphDefs"),
   });
   ParseFlagsFromEnv(*flag_list);
 }
diff --git a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h
index b6dfced87cae90c67bd46975a8e36eaef10b19e7..d0b4d092ff1003bc1df90c3d878feacf71a5aa21 100644
--- a/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h
+++ b/tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h
@@ -34,6 +34,9 @@ void AppendHloGraphDumperFlags(std::vector<tensorflow::Flag>* flag_list);
 // The values of flags associated with XLA's hlo_graph_dumper module.
 typedef struct {
   string xla_hlo_dump_graph_path;  // Path to write dumped HLO graphs to
+  // If set, dumps HLO graphs as tensorflow GraphDef; otherwise, dumps HLO
+  // graphs as DOT graph.
+  bool xla_hlo_dump_as_graphdef;
 } HloGraphDumperFlags;
 
 // Return a pointer to the HloGraphDumperFlags struct;
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 7091c324d14552d8b7603c3872d0ffc59771d8f7..ec4012a7036e19ec0c75e958b29511b2c5aa4713 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -16,12 +16,15 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 
 #include <algorithm>
+#include <cstring>
+#include <functional>
 #include <limits>
 #include <numeric>
 #include <vector>
 
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -33,6 +36,137 @@ limitations under the License.
 
 namespace xla {
 
+LiteralUtil::StrideConfig::StrideConfig(
+    const Shape& source_shape, const Shape& dest_shape,
+    tensorflow::gtl::ArraySlice<int64> dimensions)
+    : dimensions(dimensions),
+      base(dimensions.size(), 0),
+      step(dimensions.size(), 1) {
+  if (!dimensions.empty()) {
+    // Selects the shape with the highest minor dimension as the one upon
+    // where to run the tight stride loop.
+    if (source_shape.layout().minor_to_major()[0] >=
+        dest_shape.layout().minor_to_major()[0]) {
+      minor_dimension = source_shape.layout().minor_to_major()[0];
+      dest_stride = IndexUtil::GetDimensionStride(dest_shape, minor_dimension);
+    } else {
+      minor_dimension = dest_shape.layout().minor_to_major()[0];
+      source_stride =
+          IndexUtil::GetDimensionStride(source_shape, minor_dimension);
+    }
+    minor_loop_size = dimensions[minor_dimension];
+    step[minor_dimension] = minor_loop_size;
+  }
+}
+
+/* static */ std::unique_ptr<Literal> LiteralUtil::CreateFromShape(
+    const Shape& shape) {
+  auto literal = MakeUnique<Literal>();
+  *literal->mutable_shape() = shape;
+  Reserve(ShapeUtil::ElementsIn(literal->shape()), literal.get());
+  return literal;
+}
+
+/* static */ std::unique_ptr<Literal> LiteralUtil::CreateFromDimensions(
+    PrimitiveType primitive_type,
+    tensorflow::gtl::ArraySlice<int64> dimensions) {
+  return CreateFromShape(ShapeUtil::MakeShape(primitive_type, dimensions));
+}
+
+template <typename T>
+/* static */ Status LiteralUtil::CopyRange(
+    const Literal& src_literal, tensorflow::gtl::ArraySlice<int64> src_base,
+    Literal* dest_literal, tensorflow::gtl::ArraySlice<int64> dest_base,
+    tensorflow::gtl::ArraySlice<int64> copy_size) {
+  const Shape& src_shape = src_literal.shape();
+  const Shape& dest_shape = dest_literal->shape();
+  tensorflow::gtl::ArraySlice<T> src_data = GetArraySlice<T>(src_literal);
+  tensorflow::gtl::MutableArraySlice<T> dest_data =
+      GetMutableArraySlice<T>(dest_literal);
+
+  TF_RET_CHECK(ShapeUtil::Rank(src_shape) == src_base.size());
+  TF_RET_CHECK(ShapeUtil::Rank(dest_shape) == dest_base.size());
+  if (ShapeUtil::Rank(src_shape) == 0 || ShapeUtil::Rank(dest_shape) == 0) {
+    // If any of the two shapes are scalars, we can just call the StridedCopy()
+    // directly, and we know we will be copying only one value.
+    TF_RET_CHECK(copy_size.empty());
+    StridedCopy(dest_data, LinearIndex(*dest_literal, dest_base), 0, src_data,
+                LinearIndex(src_literal, src_base), 0, 1);
+  } else if (!ShapeUtil::HasZeroElements(dest_shape)) {
+    TF_RET_CHECK(!ShapeUtil::HasZeroElements(src_shape));
+    TF_RET_CHECK(src_base.size() == dest_base.size());
+    TF_RET_CHECK(src_base.size() == copy_size.size());
+
+    // Scan the source from minor, stepping in copy size blocks, then within
+    // the index enumaration functor, do a strided copy advancing source index
+    // by one (walking through the minor dimension), and destination index by
+    // proper stride size at the matching dimension.
+    DimensionVector src_indexes(src_base.size(), 0);
+    DimensionVector dest_indexes(dest_base.size(), 0);
+    StrideConfig stride_config(src_shape, dest_shape, copy_size);
+
+    auto copy_proc = [&](const std::vector<int64>& indexes) {
+      // Map from multi-dimensional index, to source index.
+      std::transform(indexes.begin(), indexes.end(), src_base.begin(),
+                     src_indexes.begin(), std::plus<int64>());
+      // Map from multi-dimensional index, to destination index.
+      std::transform(indexes.begin(), indexes.end(), dest_base.begin(),
+                     dest_indexes.begin(), std::plus<int64>());
+
+      int64 src_index = LinearIndex(src_literal, src_indexes);
+      int64 dest_index = LinearIndex(*dest_literal, dest_indexes);
+
+      StridedCopy(dest_data, dest_index, stride_config.dest_stride, src_data,
+                  src_index, stride_config.source_stride,
+                  stride_config.minor_loop_size);
+      return true;
+    };
+
+    ShapeUtil::ForEachIndex(src_shape, stride_config.base,
+                            stride_config.dimensions, stride_config.step,
+                            copy_proc);
+  }
+  return Status::OK();
+}
+
+/* static */ Status LiteralUtil::Copy(
+    const Literal& src_literal, tensorflow::gtl::ArraySlice<int64> src_base,
+    Literal* dest_literal, tensorflow::gtl::ArraySlice<int64> dest_base,
+    tensorflow::gtl::ArraySlice<int64> copy_size) {
+  TF_RET_CHECK(
+      ShapeUtil::SameElementType(src_literal.shape(), dest_literal->shape()));
+  switch (src_literal.shape().element_type()) {
+    case U32:
+      return CopyRange<uint32>(src_literal, src_base, dest_literal, dest_base,
+                               copy_size);
+    case U64:
+      return CopyRange<uint64>(src_literal, src_base, dest_literal, dest_base,
+                               copy_size);
+    case S32:
+      return CopyRange<int32>(src_literal, src_base, dest_literal, dest_base,
+                              copy_size);
+    case S64:
+      return CopyRange<int64>(src_literal, src_base, dest_literal, dest_base,
+                              copy_size);
+    case F16:
+      return CopyRange<half>(src_literal, src_base, dest_literal, dest_base,
+                             copy_size);
+    case F32:
+      return CopyRange<float>(src_literal, src_base, dest_literal, dest_base,
+                              copy_size);
+    case F64:
+      return CopyRange<double>(src_literal, src_base, dest_literal, dest_base,
+                               copy_size);
+    case PRED:
+      return CopyRange<bool>(src_literal, src_base, dest_literal, dest_base,
+                             copy_size);
+    default:
+      break;
+  }
+  return Unimplemented("Unhandled primitive type %d",
+                       src_literal.shape().element_type());
+}
+
 /* static */ Literal LiteralUtil::Zero(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case U8:
@@ -47,6 +181,8 @@ namespace xla {
       return *LiteralUtil::CreateR0<int32>(0);
     case S64:
       return *LiteralUtil::CreateR0<int64>(0);
+    case F16:
+      return *LiteralUtil::CreateR0<half>(static_cast<half>(0.0f));
     case F32:
       return *LiteralUtil::CreateR0<float>(0);
     case F64:
@@ -56,8 +192,6 @@ namespace xla {
     case S16:
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
-    case F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 0";
     case OPAQUE:
@@ -91,7 +225,7 @@ namespace xla {
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      return *LiteralUtil::CreateR0<half>(static_cast<half>(1.0f));
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 1";
     case OPAQUE:
@@ -127,7 +261,8 @@ namespace xla {
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      return *LiteralUtil::CreateR0<half>(
+          static_cast<half>(-std::numeric_limits<float>::infinity()));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no minimum value";
     case OPAQUE:
@@ -163,7 +298,8 @@ namespace xla {
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
     case F16:
-      LOG(FATAL) << "f16 literals not yet implemented";
+      return *LiteralUtil::CreateR0<half>(
+          static_cast<half>(std::numeric_limits<float>::infinity()));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no maximum value";
     case OPAQUE:
@@ -197,37 +333,16 @@ namespace xla {
 
 /* static */ std::unique_ptr<Literal> LiteralUtil::Relayout(
     const Literal& original, const Layout& layout) {
-  // Note: if this were a performance bottleneck, we avoid cloning and just make
-  // an uninitialized array instead, since all values are clobbered below.
   std::unique_ptr<Literal> result = CloneToUnique(original);
   *result->mutable_shape()->mutable_layout() = layout;
-  const PrimitiveType primitive_type = original.shape().element_type();
-  switch (primitive_type) {
-    case F32:
-      LiteralUtil::EachCell<float>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, float value) {
-            LiteralUtil::Set<float>(result.get(), indices, value);
-          });
-      return result;
-    case S32:
-      LiteralUtil::EachCell<int32>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, int32 value) {
-            LiteralUtil::Set<int32>(result.get(), indices, value);
-          });
-      return result;
-    case U32:
-      LiteralUtil::EachCell<uint32>(
-          original,
-          [&](tensorflow::gtl::ArraySlice<int64> indices, uint32 value) {
-            LiteralUtil::Set<uint32>(result.get(), indices, value);
-          });
-      return result;
-    default:
-      LOG(FATAL) << "not yet implemented: "
-                 << PrimitiveType_Name(primitive_type);
-  }
+
+  const Shape& shape = original.shape();
+  DimensionVector base(ShapeUtil::Rank(shape), 0);
+  DimensionVector copy_size(shape.dimensions().begin(),
+                            shape.dimensions().end());
+
+  TF_CHECK_OK(Copy(original, base, result.get(), base, copy_size));
+  return result;
 }
 
 /* static */ StatusOr<std::unique_ptr<Literal>> LiteralUtil::Reshape(
@@ -235,25 +350,19 @@ namespace xla {
   if (ShapeUtil::IsTuple(input.shape())) {
     return InvalidArgument("Reshape does not support tuples.");
   }
-
+  std::unique_ptr<Literal> output;
   if (!LayoutUtil::IsMonotonicWithDim0Major(input.shape().layout())) {
-    return Unimplemented(
-        "Input shape must have a monotonic layout where dimension 0 is major, "
-        "was: %s",
-        LayoutUtil::HumanString(input.shape().layout()).c_str());
+    std::vector<int64> minor_to_major(ShapeUtil::Rank(input.shape()));
+    std::iota(minor_to_major.rbegin(), minor_to_major.rend(),
+              static_cast<int64>(0));
+    output = Relayout(input, LayoutUtil::MakeLayout(minor_to_major));
+  } else {
+    output = CloneToUnique(input);
   }
-  std::vector<int64> layout(dimensions.size());
-  std::iota(layout.rbegin(), layout.rend(), 0);
-
   // Because the layout is monotonic, we can simply reuse the same sequence of
   // values without changing their order.
-  std::unique_ptr<Literal> output = CloneToUnique(input);
-  output->clear_shape();
-  output->mutable_shape()->set_element_type(input.shape().element_type());
-  for (int64 dimension : dimensions) {
-    output->mutable_shape()->add_dimensions(dimension);
-  }
-  *output->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout(layout);
+  *output->mutable_shape() =
+      ShapeUtil::MakeShape(input.shape().element_type(), dimensions);
 
   int64 elements_before = ShapeUtil::ElementsIn(input.shape());
   int64 elements_after = ShapeUtil::ElementsIn(output->shape());
@@ -267,73 +376,42 @@ namespace xla {
   return std::move(output);
 }
 
-namespace {
-
-template <class T>
-void TransposeLiteralInternal(const Literal& original,
-                              tensorflow::gtl::ArraySlice<int64> permutation,
-                              Literal* result) {
-  std::vector<int64> new_indices(ShapeUtil::Rank(original.shape()));
-  LiteralUtil::EachCell<T>(
-      original, [&](tensorflow::gtl::ArraySlice<int64> indices, T value) {
-        for (int64 i = 0; i < indices.size(); ++i) {
-          new_indices[i] = indices[permutation[i]];
-        }
-        LiteralUtil::Set<T>(result, new_indices, value);
-      });
-}
-}  // namespace
-
 /* static */ std::unique_ptr<Literal> LiteralUtil::Transpose(
     const Literal& original, tensorflow::gtl::ArraySlice<int64> permutation) {
   CHECK(!ShapeUtil::IsTuple(original.shape()))
-      << "tuple is not supported for transpose";
-  std::vector<int64> dimension_numbers(ShapeUtil::Rank(original.shape()));
-  std::iota(dimension_numbers.begin(), dimension_numbers.end(), 0);
-  CHECK(std::is_permutation(permutation.begin(), permutation.end(),
-                            dimension_numbers.begin()))
-      << "given permutation is not a permutation of dimension numbers";
-  std::vector<int64> new_dimension_sizes;
-  for (const int64 dim : permutation) {
-    new_dimension_sizes.push_back(original.shape().dimensions(dim));
-  }
-  const auto result_shape = ShapeUtil::MakeShape(
-      original.shape().element_type(), new_dimension_sizes);
-  std::unique_ptr<Literal> result = CloneToUnique(original);
-  *result->mutable_shape() = result_shape;
-  const PrimitiveType primitive_type = original.shape().element_type();
-  switch (primitive_type) {
-    case F32:
-      TransposeLiteralInternal<float>(original, permutation, result.get());
-      return result;
-    case F64:
-      TransposeLiteralInternal<double>(original, permutation, result.get());
-      return result;
-    case PRED:
-      TransposeLiteralInternal<bool>(original, permutation, result.get());
-      return result;
-    case S8:
-      TransposeLiteralInternal<int8>(original, permutation, result.get());
-      return result;
-    case U8:
-      TransposeLiteralInternal<uint8>(original, permutation, result.get());
-      return result;
-    case S32:
-      TransposeLiteralInternal<int32>(original, permutation, result.get());
-      return result;
-    case U32:
-      TransposeLiteralInternal<uint32>(original, permutation, result.get());
-      return result;
-    case S64:
-      TransposeLiteralInternal<int64>(original, permutation, result.get());
-      return result;
-    case U64:
-      TransposeLiteralInternal<uint64>(original, permutation, result.get());
-      return result;
-    default:
-      LOG(FATAL) << "not yet implemented: "
-                 << PrimitiveType_Name(primitive_type);
+      << "Tuple is not supported for transpose";
+  CHECK(IsPermutation(permutation, ShapeUtil::Rank(original.shape())))
+      << "Given permutation is not a permutation of dimension numbers";
+  // To transpose the array, we just permute the dimensions and layout, and
+  // do a straight memory copy of the raw data set.
+  // This is considerably faster than iterating over every array element using
+  // the EachCell<>() and Set<>() APIs.
+  std::vector<int64> inverse_permutation = InversePermutation(permutation);
+  Shape shape =
+      ShapeUtil::PermuteDimensions(inverse_permutation, original.shape());
+  // Replace the layout with one affine to the original shape, such that a
+  // transpose operation can be performed by leaving the flat values
+  // representation intact.
+  // For example, consider the shape F32[11,8]{1,0} under a {1,0} permutation.
+  // The shape with affine layout resulting from that operation will be
+  // F32[8,11]{0,1}, since it leave the original most minor (the 8 sized), the
+  // most minor.
+  // Essentially, given MinMaj(Di) the position of the Di dimension within the
+  // minor to major vector, and given T(Di) the index that the original Di
+  // dimension has within the transposed array, a layout is affine if
+  // MinMaj(Di) == TMinMaj(T(Di)), with TMinMaj() being the minor to major
+  // vector of the affine layout.
+  Layout* layout = shape.mutable_layout();
+  layout->clear_minor_to_major();
+  for (auto index : original.shape().layout().minor_to_major()) {
+    layout->add_minor_to_major(inverse_permutation[index]);
   }
+  std::unique_ptr<Literal> new_literal = CreateFromShape(shape);
+  DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()),
+            ShapeUtil::ByteSizeOf(original.shape()));
+  std::memcpy(MutableInternalData(new_literal.get()), InternalData(original),
+              ShapeUtil::ByteSizeOf(original.shape()));
+  return new_literal;
 }
 
 /* static */ std::unique_ptr<Literal> LiteralUtil::Slice(
@@ -342,7 +420,7 @@ void TransposeLiteralInternal(const Literal& original,
   CHECK(!ShapeUtil::IsTuple(literal.shape()))
       << "tuple is not supported for reshape";
 
-  std::vector<int64> result_dimensions;
+  DimensionVector result_dimensions;
   for (int64 dnum = 0; dnum < ShapeUtil::Rank(literal.shape()); ++dnum) {
     CHECK_GE(start_indices[dnum], 0);
     CHECK_LE(limit_indices[dnum], literal.shape().dimensions(dnum));
@@ -358,7 +436,7 @@ void TransposeLiteralInternal(const Literal& original,
   *result_literal->mutable_shape() = result_shape;
   Reserve(ShapeUtil::ElementsIn(result_shape), result_literal.get());
 
-  std::vector<int64> new_indices(ShapeUtil::Rank(result_shape));
+  DimensionVector new_indices(ShapeUtil::Rank(result_shape));
   switch (result_shape.element_type()) {
     case F32:
       LiteralUtil::EachCell<float>(
@@ -425,6 +503,8 @@ void TransposeLiteralInternal(const Literal& original,
       return tensorflow::strings::StrCat(Get<float>(literal, multi_index));
     case F64:
       return tensorflow::strings::StrCat(Get<double>(literal, multi_index));
+    case F16:
+      return tensorflow::strings::StrCat(Get<half>(literal, multi_index));
     default:
       return tensorflow::strings::StrCat(
           "[", PrimitiveType_Name(literal.shape().element_type()), "]");
@@ -579,6 +659,8 @@ void TransposeLiteralInternal(const Literal& original,
       return reinterpret_cast<const void*>(literal.f32s().data());
     case F64:
       return reinterpret_cast<const void*>(literal.f64s().data());
+    case F16:
+      return reinterpret_cast<const void*>(literal.f16s().data());
     default:
       LOG(FATAL) << "primitive type not supported in literals: "
                  << PrimitiveType_Name(literal.shape().element_type());
@@ -593,38 +675,34 @@ void TransposeLiteralInternal(const Literal& original,
   CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
   switch (literal->shape().element_type()) {
     case PRED:
-      GetMutableRepeatedField<bool>(literal)->Resize(num_elements, false);
+      Resize<bool>(num_elements, false, literal);
+      break;
+    case S8:
+      Resize<int8>(num_elements, 0, literal);
       break;
     case U8:
-      // u8s is an optional "bytes", rather than a repeated field. Therefore its
-      // access methods are somewhat different from the others.
-      literal->mutable_u8s()->resize(num_elements, 0);
+      Resize<uint8>(num_elements, 0, literal);
       break;
     case S32:
-      GetMutableRepeatedField<int32>(literal)->Resize(num_elements,
-                                                      /*value=*/0);
+      Resize<int32>(num_elements, 0, literal);
       break;
     case S64:
-      GetMutableRepeatedField<tensorflow::protobuf_int64>(literal)->Resize(
-          num_elements,
-          /*value=*/0);
+      Resize<int64>(num_elements, 0, literal);
       break;
     case U32:
-      GetMutableRepeatedField<uint32>(literal)->Resize(num_elements,
-                                                       /*value=*/0);
+      Resize<uint32>(num_elements, 0, literal);
       break;
     case U64:
-      GetMutableRepeatedField<tensorflow::protobuf_uint64>(literal)->Resize(
-          num_elements,
-          /*value=*/0);
+      Resize<uint64>(num_elements, 0, literal);
       break;
     case F32:
-      GetMutableRepeatedField<float>(literal)->Resize(num_elements,
-                                                      /*value=*/0.0f);
+      Resize<float>(num_elements, 0, literal);
       break;
     case F64:
-      GetMutableRepeatedField<double>(literal)->Resize(num_elements,
-                                                       /*value=*/0.0);
+      Resize<double>(num_elements, 0, literal);
+      break;
+    case F16:
+      Resize<half>(num_elements, static_cast<half>(0.0f), literal);
       break;
     default:
       LOG(FATAL) << "primitive type not supported in literals: "
@@ -662,6 +740,9 @@ void TransposeLiteralInternal(const Literal& original,
     case F64:
       actual = literal.f64s_size();
       break;
+    case F16:
+      actual = literal.f16s().size() / sizeof(half);
+      break;
     default:
       return tensorflow::errors::Unimplemented(
           "unhandled element type for literal validation: " +
@@ -680,50 +761,16 @@ void TransposeLiteralInternal(const Literal& original,
 
 /* static */ void LiteralUtil::EachCellAsString(
     const Literal& literal,
-    std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                       const string& value)>
-        per_cell) {
-  if (ShapeUtil::Rank(literal.shape()) == 1) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      per_cell({i0}, GetAsString(literal, {i0}));
-    }
-    return;
-  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 2) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        per_cell({i0, i1}, GetAsString(literal, {i0, i1}));
-      }
-    }
+    const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
+                             const string& value)>& per_cell) {
+  if (ShapeUtil::HasZeroElements(literal.shape())) {
     return;
   }
-
-  if (ShapeUtil::Rank(literal.shape()) == 3) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
-          per_cell({i0, i1, i2}, GetAsString(literal, {i0, i1, i2}));
-        }
-      }
-    }
-    return;
-  }
-
-  if (ShapeUtil::Rank(literal.shape()) == 4) {
-    for (int64 i0 = 0; i0 < literal.shape().dimensions(0); ++i0) {
-      for (int64 i1 = 0; i1 < literal.shape().dimensions(1); ++i1) {
-        for (int64 i2 = 0; i2 < literal.shape().dimensions(2); ++i2) {
-          for (int64 i3 = 0; i3 < literal.shape().dimensions(3); ++i3) {
-            per_cell({i0, i1, i2, i3}, GetAsString(literal, {i0, i1, i2, i3}));
-          }
-        }
-      }
-    }
-    return;
-  }
-
-  LOG(FATAL) << "unhandled rank: " << ShapeUtil::Rank(literal.shape());
+  std::vector<int64> indices = IndexUtil::LinearIndexToMultidimensionalIndex(
+      literal.shape(), /*linear_index=*/0);
+  do {
+    per_cell(indices, GetAsString(literal, indices));
+  } while (IndexUtil::BumpIndices(literal.shape(), &indices));
 }
 
 namespace {
@@ -786,6 +833,8 @@ bool EqualElements(const Literal& literal1, const Literal& literal2,
         return EqualElements<float>(literal1, literal2, 0, &multi_index);
       case F64:
         return EqualElements<double>(literal1, literal2, 0, &multi_index);
+      case F16:
+        return EqualElements<half>(literal1, literal2, 0, &multi_index);
       default:
         LOG(FATAL) << "Unimplemented: LiteralUtil::Equal for type "
                    << PrimitiveType_Name(literal1.shape().element_type());
@@ -794,96 +843,175 @@ bool EqualElements(const Literal& literal1, const Literal& literal2,
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<bool> LiteralUtil::GetArraySlice<bool>(
-    const Literal& literal) {
-  CHECK(literal.shape().element_type() == PRED);
-  return literal.preds();
+/* static */ tensorflow::gtl::MutableArraySlice<bool>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_preds();
+  return tensorflow::gtl::MutableArraySlice<bool>(values->mutable_data(),
+                                                  values->size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<bool>*
-LiteralUtil::GetMutableRepeatedField<bool>(Literal* literal) {
-  CHECK(literal->shape().element_type() == PRED);
-  return literal->mutable_preds();
+/* static */ tensorflow::gtl::MutableArraySlice<int8>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  // C++11 standard, basic_string 21.4.1.5, values should be stored
+  // contiguously. From C++17 a mutable data() member will be provided.
+  auto values = literal->mutable_u8s();
+  return tensorflow::gtl::MutableArraySlice<int8>(
+      reinterpret_cast<int8*>(&(*values)[0]), values->size());
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<uint32>
-LiteralUtil::GetArraySlice<uint32>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == U32);
-  return literal.u32s();
+/* static */ tensorflow::gtl::MutableArraySlice<uint8>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  // C++11 standard, basic_string 21.4.1.5, values should be stored
+  // contiguously. From C++17 a mutable data() member will be provided.
+  auto values = literal->mutable_u8s();
+  return tensorflow::gtl::MutableArraySlice<uint8>(
+      reinterpret_cast<uint8*>(&(*values)[0]), values->size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<uint32>*
-LiteralUtil::GetMutableRepeatedField<uint32>(Literal* literal) {
-  CHECK(literal->shape().element_type() == U32);
-  return literal->mutable_u32s();
+/* static */ tensorflow::gtl::MutableArraySlice<int32>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_s32s();
+  return tensorflow::gtl::MutableArraySlice<int32>(values->mutable_data(),
+                                                   values->size());
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<uint64>
-LiteralUtil::GetArraySlice<uint64>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == U64);
-  return AsUInt64Slice(literal.u64s());
+/* static */ tensorflow::gtl::MutableArraySlice<uint32>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_u32s();
+  return tensorflow::gtl::MutableArraySlice<uint32>(values->mutable_data(),
+                                                    values->size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>*
-LiteralUtil::GetMutableRepeatedField<tensorflow::protobuf_uint64>(
-    Literal* literal) {
-  CHECK(literal->shape().element_type() == U64);
-  return literal->mutable_u64s();
+/* static */ tensorflow::gtl::MutableArraySlice<int64>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  static_assert(sizeof(int64) == sizeof(tensorflow::protobuf_int64) &&
+                    alignof(int64) == alignof(tensorflow::protobuf_int64),
+                "The int64 and tensorflow::protobuf_int64 types are not "
+                "compatible");
+  auto values = literal->mutable_s64s();
+  // Because of the fact that tensorflow::protobuf_int64 is defined as int64_t
+  // while tensorflow::int64 is defined as long long, a reinterpret_cast<> is
+  // necessary from the raw data pointer returned by the mutable_data() API.
+  return tensorflow::gtl::MutableArraySlice<int64>(
+      reinterpret_cast<int64*>(values->mutable_data()), values->size());
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<int32>
-LiteralUtil::GetArraySlice<int32>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == S32);
-  return literal.s32s();
+/* static */ tensorflow::gtl::MutableArraySlice<uint64>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  static_assert(sizeof(uint64) == sizeof(tensorflow::protobuf_uint64) &&
+                    alignof(uint64) == alignof(tensorflow::protobuf_uint64),
+                "The uint64 and tensorflow::protobuf_uint64 types are not "
+                "compatible");
+  auto values = literal->mutable_u64s();
+  // Because of the fact that tensorflow::protobuf_uint64 is defined as uint64_t
+  // while tensorflow::uint64 is defined as unsigned long long, a
+  // reinterpret_cast<> is necessary from the raw data pointer returned by the
+  // mutable_data() API.
+  return tensorflow::gtl::MutableArraySlice<uint64>(
+      reinterpret_cast<uint64*>(values->mutable_data()), values->size());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<int32>*
-LiteralUtil::GetMutableRepeatedField<int32>(Literal* literal) {
-  CHECK(literal->shape().element_type() == S32);
-  return literal->mutable_s32s();
+/* static */ tensorflow::gtl::MutableArraySlice<float>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_f32s();
+  return tensorflow::gtl::MutableArraySlice<float>(values->mutable_data(),
+                                                   values->size());
 }
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<int64>
-LiteralUtil::GetArraySlice<int64>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == S64);
-  return AsInt64Slice(literal.s64s());
+/* static */ tensorflow::gtl::MutableArraySlice<double>
+LiteralUtil::GetMutableArraySlice(Literal* literal) {
+  auto values = literal->mutable_f64s();
+  return tensorflow::gtl::MutableArraySlice<double>(values->mutable_data(),
+                                                    values->size());
+}
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<half>
+LiteralUtil::GetMutableArraySlice<half>(Literal* literal) {
+  // C++11 standard, basic_string 21.4.1.5, values should be stored
+  // contiguously. From C++17 a mutable data() member will be provided.
+  // TODO - there is an endianess problem here. fix it, or wait for uint16
+  //        support in protobuf
+  auto values = literal->mutable_f16s();
+  return tensorflow::gtl::MutableArraySlice<half>(
+      reinterpret_cast<half*>(&(*values)[0]), values->size() / sizeof(half));
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<bool> LiteralUtil::GetArraySlice<bool>(
+    const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), PRED);
+  return literal.preds();
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<uint8>
+LiteralUtil::GetArraySlice<uint8>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), U8);
+  return tensorflow::gtl::ArraySlice<uint8>(
+      reinterpret_cast<const uint8*>(literal.u8s().data()),
+      literal.u8s().size());
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<int8> LiteralUtil::GetArraySlice<int8>(
+    const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), S8);
+  return tensorflow::gtl::ArraySlice<int8>(
+      reinterpret_cast<const int8*>(literal.u8s().data()),
+      literal.u8s().size());
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<uint32>
+LiteralUtil::GetArraySlice<uint32>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), U32);
+  return literal.u32s();
+}
+
+template <>
+/* static */ tensorflow::gtl::ArraySlice<uint64>
+LiteralUtil::GetArraySlice<uint64>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), U64);
+  return AsUInt64Slice(literal.u64s());
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-LiteralUtil::GetMutableRepeatedField<tensorflow::protobuf_int64>(
-    Literal* literal) {
-  CHECK(literal->shape().element_type() == S64);
-  return literal->mutable_s64s();
+/* static */ tensorflow::gtl::ArraySlice<int32>
+LiteralUtil::GetArraySlice<int32>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), S32);
+  return literal.s32s();
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<float>*
-LiteralUtil::GetMutableRepeatedField<float>(Literal* literal) {
-  CHECK(literal->shape().element_type() == F32);
-  return literal->mutable_f32s();
+/* static */ tensorflow::gtl::ArraySlice<int64>
+LiteralUtil::GetArraySlice<int64>(const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), S64);
+  return AsInt64Slice(literal.s64s());
 }
 
 template <>
 /* static */ tensorflow::gtl::ArraySlice<double>
 LiteralUtil::GetArraySlice<double>(const Literal& literal) {
-  CHECK(literal.shape().element_type() == F64);
+  CHECK_EQ(literal.shape().element_type(), F64);
   return literal.f64s();
 }
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<double>*
-LiteralUtil::GetMutableRepeatedField<double>(Literal* literal) {
-  CHECK(literal->shape().element_type() == F64);
-  return literal->mutable_f64s();
+/* static */ tensorflow::gtl::ArraySlice<half> LiteralUtil::GetArraySlice<half>(
+    const Literal& literal) {
+  CHECK_EQ(literal.shape().element_type(), F16);
+  return tensorflow::gtl::ArraySlice<half>(
+      reinterpret_cast<const half*>(literal.f16s().data()),
+      literal.f16s().size() / sizeof(half));
 }
 
 template <typename NativeT>
@@ -925,6 +1053,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
       return AllElementsEqualValue<float>(literal, value);
     case F64:
       return AllElementsEqualValue<double>(literal, value);
+    case F16:
+      return AllElementsEqualValue<half>(literal, static_cast<half>(value));
     case PRED:
       if (value == 0) {
         return AllElementsEqualValue<bool>(literal, false);
@@ -944,6 +1074,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
       return AllElementsEqualValue<float>(literal, value);
     case F64:
       return AllElementsEqualValue<double>(literal, value);
+    case F16:
+      return AllElementsEqualValue<half>(literal, static_cast<half>(value));
     default:
       return false;
   }
@@ -968,6 +1100,8 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
       return Get<float>(literal, indices) == 0.0f;
     case F64:
       return Get<double>(literal, indices) == 0.0;
+    case F16:
+      return Get<half>(literal, indices) == static_cast<half>(0.0f);
     case PRED:
       return Get<bool>(literal, indices) == false;
     default:
@@ -976,51 +1110,77 @@ static bool AllElementsEqualValue(const Literal& literal, NativeT value) {
 }
 
 template <>
-/* static */ void LiteralUtil::PopulateWithValue(
-    int64 value, tensorflow::gtl::ArraySlice<int64> dimensions,
-    Literal* literal) {
-  *literal->mutable_shape() = ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<int64>(), dimensions);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-      repeated_field =
-          GetMutableRepeatedField<tensorflow::protobuf_int64>(literal);
-  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) {
-    repeated_field->Add(value);
-  }
+/* static */ void LiteralUtil::Resize<bool>(int64 num_elements, bool value,
+                                            Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_preds()->Resize(num_elements, value);
 }
 
 template <>
-/* static */ void LiteralUtil::PopulateWithValue(
-    uint64 value, tensorflow::gtl::ArraySlice<int64> dimensions,
-    Literal* literal) {
-  *literal->mutable_shape() = ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<uint64>(), dimensions);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>*
-      repeated_field =
-          GetMutableRepeatedField<tensorflow::protobuf_uint64>(literal);
-  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) {
-    repeated_field->Add(value);
-  }
+/* static */ void LiteralUtil::Resize<int8>(int64 num_elements, int8 value,
+                                            Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_u8s()->resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<uint8>(int64 num_elements, uint8 value,
+                                             Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_u8s()->resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<int32>(int64 num_elements, int32 value,
+                                             Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_s32s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<uint32>(int64 num_elements, uint32 value,
+                                              Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_u32s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<int64>(int64 num_elements, int64 value,
+                                             Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_s64s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<uint64>(int64 num_elements, uint64 value,
+                                              Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_u64s()->Resize(num_elements, value);
 }
 
 template <>
-/* static */ void LiteralUtil::Resize(int64 num_elements, int64 value,
-                                      Literal* literal) {
+/* static */ void LiteralUtil::Resize<float>(int64 num_elements, float value,
+                                             Literal* literal) {
   CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-      repeated_field =
-          GetMutableRepeatedField<tensorflow::protobuf_int64>(literal);
-  repeated_field->Resize(num_elements, value);
+  literal->mutable_f32s()->Resize(num_elements, value);
 }
 
 template <>
-/* static */ void LiteralUtil::Resize(int64 num_elements, uint64 value,
-                                      Literal* literal) {
+/* static */ void LiteralUtil::Resize<double>(int64 num_elements, double value,
+                                              Literal* literal) {
   CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>*
-      repeated_field =
-          GetMutableRepeatedField<tensorflow::protobuf_uint64>(literal);
-  repeated_field->Resize(num_elements, value);
+  literal->mutable_f64s()->Resize(num_elements, value);
+}
+
+template <>
+/* static */ void LiteralUtil::Resize<half>(int64 num_elements, half value,
+                                            Literal* literal) {
+  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
+  literal->mutable_f16s()->resize(num_elements * sizeof(half));
+  auto data = GetMutableArraySlice<half>(literal);
+  for (int i = 0; i < num_elements; i++) {
+    data[i] = value;
+  }
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index 21bb2e46cf2ebcd72bcce393a1e5526f41757544..8e06f35b33d132ba92ce6309db916940362e5a7b 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -100,9 +101,34 @@ class LiteralUtil {
           values,
       const Layout& layout);
 
+  // Create a new Literal object with the shape specified as parameter.
+  // The content of the literal values is the default value of the primitive
+  // type of literal itself (0 for numeric types, and false for predicates).
+  static std::unique_ptr<Literal> CreateFromShape(const Shape& shape);
+
+  // Create a new Literal object with its values havings the primitive_type
+  // type, and with dimensions defined by the dimensions parameter.
+  // The content of the literal values is the default value of the primitive
+  // type of literal itself (0 for numeric types, and false for predicates).
+  static std::unique_ptr<Literal> CreateFromDimensions(
+      PrimitiveType primitive_type,
+      tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  // Copies the values from src_literal, starting at src_base shape indexes,
+  // to dest_literal, starting at dest_base, where the copy size in each
+  // dimension is specified by copy_size.
+  // The src_literal and dest_literal must have the same primitive type,
+  // src_base+copy_size must fit the source literal dimensions, as well as
+  // dest_base+copy_size must fit the destination literal dimensions.
+  static Status Copy(const Literal& src_literal,
+                     tensorflow::gtl::ArraySlice<int64> src_base,
+                     Literal* dest_literal,
+                     tensorflow::gtl::ArraySlice<int64> dest_base,
+                     tensorflow::gtl::ArraySlice<int64> copy_size);
+
   // Creates a new value that has the equivalent value as literal, but conforms
   // to new_layout; e.g. a literal matrix that was in {0, 1} minor-to-major
-  // dimension layout can be re-layed-out as {1, 0} minor-to-major dimension
+  // dimension layout can be re-laid-out as {1, 0} minor-to-major dimension
   // layout and the value in the cell at any given logical index (i0, i1) will
   // be the same.
   //
@@ -213,6 +239,11 @@ class LiteralUtil {
   // Clones literal into an owned unique_ptr version.
   static std::unique_ptr<Literal> CloneToUnique(const Literal& literal);
 
+  // Returns the linear index of the given index within the literal's
+  // element_type repeated field.
+  static int64 LinearIndex(const Literal& literal,
+                           tensorflow::gtl::ArraySlice<int64> multi_index);
+
   // Gets or sets an element in the literal at the given index. The index is
   // CHECKed against the dimension sizes.
   template <typename NativeT>
@@ -223,6 +254,12 @@ class LiteralUtil {
                   tensorflow::gtl::ArraySlice<int64> multi_index,
                   NativeT value);
 
+  // Retrieves the mutable array slice interface which can be used to manipulate
+  // pre-allocated literal values.
+  template <typename NativeT>
+  static tensorflow::gtl::MutableArraySlice<NativeT> GetMutableArraySlice(
+      Literal* literal);
+
   // Returns the element value at index (0, ..., 0), however many zeroes are
   // required for that index.
   template <typename NativeT>
@@ -257,9 +294,8 @@ class LiteralUtil {
   // like representation in a protobuf).
   static void EachCellAsString(
       const Literal& literal,
-      std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
-                         const string& value)>
-          per_cell);
+      const std::function<void(tensorflow::gtl::ArraySlice<int64> indices,
+                               const string& value)>& per_cell);
   template <typename NativeT>
   static void EachCell(
       const Literal& literal,
@@ -315,6 +351,14 @@ class LiteralUtil {
                                               const Layout& layout,
                                               Literal* literal);
 
+  // Populates literal values by calling the generator function for every cell
+  // in the literal object.
+  template <typename NativeT>
+  static Status Populate(
+      Literal* literal,
+      const std::function<NativeT(tensorflow::gtl::ArraySlice<int64> indexes)>&
+          generator);
+
   // Creates a Literal of the given dimensions with all elements set to the
   // given value.
   template <typename NativeT>
@@ -383,70 +427,73 @@ class LiteralUtil {
     static_assert(!std::is_same<NativeT, NativeT>::value,
                   "Cannot map native type to primitive type.");
   }
-  template <typename NativeT>
-  static tensorflow::protobuf::RepeatedField<NativeT>* GetMutableRepeatedField(
-      Literal* literal) {
-    // Make the expression depend on the template parameter NativeT so
-    // that this compile-time error only apperas if this function is
-    // instantiated with some concrete type that is not specialized
-    // below.
-    static_assert(!std::is_same<NativeT, NativeT>::value,
-                  "Cannot map native type to primitive type.");
-  }
 
-  // Returns the linear index of the given index within the literal's
-  // element_type repeated field.
-  static int64 LinearIndex(const Literal& literal,
-                           tensorflow::gtl::ArraySlice<int64> multi_index);
+  // Internal template helper for the Copy() API, matching its arguments one by
+  // one.
+  template <typename T>
+  static Status CopyRange(const Literal& src_literal,
+                          tensorflow::gtl::ArraySlice<int64> src_base,
+                          Literal* dest_literal,
+                          tensorflow::gtl::ArraySlice<int64> dest_base,
+                          tensorflow::gtl::ArraySlice<int64> copy_size);
+
+  // Utility structure which is used to create the optimal configuration for
+  // a ShapeUtil::ForEachIndex() scan across two literals.
+  struct StrideConfig {
+    StrideConfig(const Shape& source_shape, const Shape& dest_shape,
+                 tensorflow::gtl::ArraySlice<int64> dimensions);
+
+    // The dimensions of the stride operation. Essentially every dimension
+    // will be iterated from base[i] to base[i]+dimensions[i], in step[i]
+    // steps.
+    tensorflow::gtl::ArraySlice<int64> dimensions;
+    DimensionVector base;
+    DimensionVector step;
+    int64 minor_dimension = 0;
+    // The size of the strides for source and destination. One of the two
+    // (the one looping through its most minor dimension) will be 1, while
+    // the other will be the stride size at the dimension matching the other
+    // shape most minor dimension being scanned.
+    int64 dest_stride = 1;
+    int64 source_stride = 1;
+    // The size of the inner loop on the most minor dimension.
+    int64 minor_loop_size = 1;
+  };
 
   TF_DISALLOW_COPY_AND_ASSIGN(LiteralUtil);
 };
 
 // Declarations of template specializations for GetArraySlice and
-// GetMutableRepeatedField. The specializations map native type to XLA primitive
+// GetMutableArraySlice. The specializations map native type to XLA primitive
 // type.
 template <>
 /* static */ tensorflow::gtl::ArraySlice<bool> LiteralUtil::GetArraySlice<bool>(
     const Literal& literal);
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<bool>*
-LiteralUtil::GetMutableRepeatedField<bool>(Literal* literal);
+/* static */ tensorflow::gtl::ArraySlice<uint8>
+LiteralUtil::GetArraySlice<uint8>(const Literal& literal);
 
 template <>
-/* static */ tensorflow::gtl::ArraySlice<uint32>
-LiteralUtil::GetArraySlice<uint32>(const Literal& literal);
+/* static */ tensorflow::gtl::ArraySlice<int8> LiteralUtil::GetArraySlice<int8>(
+    const Literal& literal);
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<uint32>*
-LiteralUtil::GetMutableRepeatedField<uint32>(Literal* literal);
+/* static */ tensorflow::gtl::ArraySlice<uint32>
+LiteralUtil::GetArraySlice<uint32>(const Literal& literal);
 
 template <>
 /* static */ tensorflow::gtl::ArraySlice<uint64>
 LiteralUtil::GetArraySlice<uint64>(const Literal& literal);
 
-template <>
-/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>*
-LiteralUtil::GetMutableRepeatedField<tensorflow::protobuf_uint64>(
-    Literal* literal);
-
 template <>
 /* static */ tensorflow::gtl::ArraySlice<int32>
 LiteralUtil::GetArraySlice<int32>(const Literal& literal);
 
-template <>
-/* static */ tensorflow::protobuf::RepeatedField<int32>*
-LiteralUtil::GetMutableRepeatedField<int32>(Literal* literal);
-
 template <>
 /* static */ tensorflow::gtl::ArraySlice<int64>
 LiteralUtil::GetArraySlice<int64>(const Literal& literal);
 
-template <>
-/* static */ tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-LiteralUtil::GetMutableRepeatedField<tensorflow::protobuf_int64>(
-    Literal* literal);
-
 template <>
 /* static */ inline tensorflow::gtl::ArraySlice<float>
 LiteralUtil::GetArraySlice<float>(const Literal& literal) {
@@ -454,22 +501,98 @@ LiteralUtil::GetArraySlice<float>(const Literal& literal) {
   return literal.f32s();
 }
 
-template <>
-/* static */ tensorflow::protobuf::RepeatedField<float>*
-LiteralUtil::GetMutableRepeatedField<float>(Literal* literal);
-
 template <>
 /* static */ tensorflow::gtl::ArraySlice<double>
 LiteralUtil::GetArraySlice<double>(const Literal& literal);
 
 template <>
-/* static */ tensorflow::protobuf::RepeatedField<double>*
-LiteralUtil::GetMutableRepeatedField<double>(Literal* literal);
+/* static */ tensorflow::gtl::ArraySlice<half> LiteralUtil::GetArraySlice<half>(
+    const Literal& literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<bool>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<int8>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<uint8>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<int32>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<uint32>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<int64>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<uint64>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<float>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<double>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ tensorflow::gtl::MutableArraySlice<half>
+LiteralUtil::GetMutableArraySlice(Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<bool>(int64 num_elements, bool value,
+                                            Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int8>(int64 num_elements, int8 value,
+                                            Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint8>(int64 num_elements, uint8 value,
+                                             Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int32>(int64 num_elements, int32 value,
+                                             Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint32>(int64 num_elements, uint32 value,
+                                              Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<int64>(int64 num_elements, int64 value,
+                                             Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<uint64>(int64 num_elements, uint64 value,
+                                              Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<float>(int64 num_elements, float value,
+                                             Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<double>(int64 num_elements, double value,
+                                              Literal* literal);
+
+template <>
+/* static */ void LiteralUtil::Resize<half>(int64 num_elements, half value,
+                                            Literal* literal);
 
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::CreateR0(NativeT value) {
   auto literal = MakeUnique<Literal>();
-  PopulateR0(value, literal.get());
+  PopulateR0<NativeT>(value, literal.get());
   return literal;
 }
 
@@ -695,12 +818,20 @@ template <>
   return literal.u8s()[linear_index];
 }
 
+template <>
+/* static */ inline half LiteralUtil::Get<half>(
+    const Literal& literal, tensorflow::gtl::ArraySlice<int64> multi_index) {
+  CHECK(literal.shape().element_type() == F16);
+  int64 linear_index = LinearIndex(literal, multi_index);
+  return GetArraySlice<half>(literal)[linear_index];
+}
+
 template <typename NativeT>
 /* static */ void LiteralUtil::Set(
     Literal* literal, tensorflow::gtl::ArraySlice<int64> multi_index,
     NativeT value) {
   int64 linear_index = LinearIndex(*literal, multi_index);
-  GetMutableRepeatedField<NativeT>(literal)->Set(linear_index, value);
+  GetMutableArraySlice<NativeT>(literal).at(linear_index) = value;
 }
 
 template <>
@@ -760,44 +891,11 @@ template <typename NativeT>
 }
 
 template <typename NativeT>
-/* static */ void LiteralUtil::PopulateR0(NativeT value, Literal* literal) {
+/* static */ inline void LiteralUtil::PopulateR0(NativeT value,
+                                                 Literal* literal) {
   *literal->mutable_shape() = ShapeUtil::MakeShape(
       primitive_util::NativeToPrimitiveType<NativeT>(), {});
-  tensorflow::protobuf::RepeatedField<NativeT>* repeated_field =
-      GetMutableRepeatedField<NativeT>(literal);
-  repeated_field->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<uint8>(uint8 value,
-                                                        Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<uint8>(), {});
-  literal->mutable_u8s()->push_back(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<int8>(int8 value,
-                                                       Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<int8>(), {});
-  literal->mutable_u8s()->push_back(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<uint64>(uint64 value,
-                                                         Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<uint64>(), {});
-  literal->mutable_u64s()->Add(value);
-}
-
-template <>
-/* static */ inline void LiteralUtil::PopulateR0<int64>(int64 value,
-                                                        Literal* literal) {
-  *literal->mutable_shape() =
-      ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType<int64>(), {});
-  literal->mutable_s64s()->Add(value);
+  Resize<NativeT>(1, value, literal);
 }
 
 template <typename NativeT>
@@ -944,65 +1042,72 @@ template <typename NativeT>
                                   literal);
 }
 
+template <typename NativeT>
+/* static */ Status LiteralUtil::Populate(
+    Literal* literal,
+    const std::function<NativeT(tensorflow::gtl::ArraySlice<int64> indexes)>&
+        generator) {
+  const Shape& shape = literal->shape();
+  int64 rank = ShapeUtil::Rank(shape);
+  TF_RET_CHECK(shape.element_type() ==
+               primitive_util::NativeToPrimitiveType<NativeT>());
+  tensorflow::gtl::MutableArraySlice<NativeT> data =
+      GetMutableArraySlice<NativeT>(literal);
+  if (rank > 0) {
+    StrideConfig stride_config(shape, shape, AsInt64Slice(shape.dimensions()));
+    DimensionVector minor_scan_indexes(rank, 0);
+    int64 minor_dimension_size =
+        ShapeUtil::GetDimension(shape, stride_config.minor_dimension);
+
+    auto init_function = [&](const std::vector<int64>& indexes) {
+      int64 index = LinearIndex(*literal, indexes);
+      std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin());
+      for (int64 i = 0; i < minor_dimension_size; ++i) {
+        minor_scan_indexes[stride_config.minor_dimension] = i;
+        data.at(index + i) = generator(minor_scan_indexes);
+      }
+      return true;
+    };
+    ShapeUtil::ForEachIndex(shape, stride_config.base, stride_config.dimensions,
+                            stride_config.step, init_function);
+  } else {
+    data.at(0) = generator({});
+  }
+  return Status::OK();
+}
+
 template <typename NativeT>
 /* static */ void LiteralUtil::PopulateWithValue(
     NativeT value, tensorflow::gtl::ArraySlice<int64> dimensions,
     Literal* literal) {
   *literal->mutable_shape() = ShapeUtil::MakeShape(
       primitive_util::NativeToPrimitiveType<NativeT>(), dimensions);
-  tensorflow::protobuf::RepeatedField<NativeT>* repeated_field =
-      GetMutableRepeatedField<NativeT>(literal);
-  for (int64 i = 0; i < ShapeUtil::ElementsIn(literal->shape()); ++i) {
-    repeated_field->Add(value);
-  }
+  Resize<NativeT>(ShapeUtil::ElementsIn(literal->shape()), value, literal);
 }
 
-template <>
-/* static */ void LiteralUtil::PopulateWithValue(
-    int64 value, tensorflow::gtl::ArraySlice<int64> dimensions,
-    Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::PopulateWithValue(
-    uint64 value, tensorflow::gtl::ArraySlice<int64> dimensions,
-    Literal* literal);
-
 template <typename NativeSrcT, typename NativeDestT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::Convert(
     const Literal& literal) {
+  const Shape& shape = literal.shape();
   auto result_literal = MakeUnique<Literal>();
-  Shape result_shape = literal.shape();
-  result_shape.set_element_type(
+  Shape* result_shape = result_literal->mutable_shape();
+  *result_shape = shape;
+  result_shape->set_element_type(
       primitive_util::NativeToPrimitiveType<NativeDestT>());
-  *result_literal->mutable_shape() = result_shape;
-  LiteralUtil::Reserve(ShapeUtil::ElementsIn(result_shape),
+  LiteralUtil::Reserve(ShapeUtil::ElementsIn(*result_shape),
                        result_literal.get());
-  LiteralUtil::EachCell<NativeSrcT>(
-      literal,
-      [&](tensorflow::gtl::ArraySlice<int64> indices, NativeSrcT value) {
-        LiteralUtil::Set<NativeDestT>(result_literal.get(), indices,
-                                      static_cast<NativeDestT>(value));
-      });
+  tensorflow::gtl::ArraySlice<NativeSrcT> src_data =
+      GetArraySlice<NativeSrcT>(literal);
+  tensorflow::gtl::MutableArraySlice<NativeDestT> dest_data =
+      GetMutableArraySlice<NativeDestT>(result_literal.get());
+  int64 num_elements = ShapeUtil::ElementsIn(shape);
+
+  for (int64 i = 0; i < num_elements; ++i) {
+    dest_data[i] = static_cast<NativeDestT>(src_data[i]);
+  }
   return result_literal;
 }
 
-template <typename NativeT>
-/* static */ void LiteralUtil::Resize(int64 num_elements, NativeT value,
-                                      Literal* literal) {
-  CHECK_EQ(ShapeUtil::ElementsIn(literal->shape()), num_elements);
-  tensorflow::protobuf::RepeatedField<NativeT>* repeated_field =
-      GetMutableRepeatedField<NativeT>(literal);
-  repeated_field->Resize(num_elements, value);
-}
-
-template <>
-/* static */ void LiteralUtil::Resize(int64 num_elements, int64 value,
-                                      Literal* literal);
-
-template <>
-/* static */ void LiteralUtil::Resize(int64 num_elements, uint64 value,
-                                      Literal* literal);
-
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal>
 LiteralUtil::CreateFullWithMonotonicDim0MajorLayout(
@@ -1022,10 +1127,7 @@ LiteralUtil::CreateFullWithMonotonicDim0MajorLayout(
 template <typename NativeT>
 /* static */ std::unique_ptr<Literal> LiteralUtil::Replicate(
     const Literal& input, int64 times) {
-  // Ranks greater than 8 are very rare, so use InlinedVector<int64, 8> to store
-  // the bounds and indices.
-  static constexpr int kInlineRank = 8;
-  tensorflow::gtl::InlinedVector<int64, kInlineRank> bounds = {times};
+  DimensionVector bounds = {times};
   bounds.reserve(input.shape().dimensions_size() + 1);
   for (int64 bound : input.shape().dimensions()) {
     bounds.push_back(bound);
@@ -1039,8 +1141,7 @@ template <typename NativeT>
   }
   Reserve(elements, literal.get());
 
-  tensorflow::gtl::InlinedVector<int64, kInlineRank> output_indices(
-      bounds.size(), 0);
+  DimensionVector output_indices(bounds.size(), 0);
   tensorflow::gtl::ArraySlice<int64> input_indices = output_indices;
   input_indices.remove_prefix(1);
 
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index e53763376bfe58b7c5a811987161cac966d14222..9a09822174d9c93c8195af193f34017268bbc503 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -21,14 +21,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
 class LiteralUtilTest : public ::testing::Test {
  protected:
   LiteralUtilTest() {
@@ -101,6 +105,9 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) {
 
   auto f32_lit = LiteralUtil::CreateR0<float>(3.14f);
   ASSERT_EQ("3.14", LiteralUtil::ToString(*f32_lit));
+
+  auto f16_lit = LiteralUtil::CreateR0<half>(static_cast<half>(0.5f));
+  ASSERT_EQ("0.5", LiteralUtil::ToString(*f16_lit));
 }
 
 TEST_F(LiteralUtilTest, LiteralVectorToString) {
@@ -159,9 +166,7 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) {
   // clang-format on
 
   auto literal = LiteralUtil::CreateR3FromArray3D(array_3d);
-  EXPECT_MATCH(testing::PBToVec<tensorflow::protobuf_int64>(
-                   literal->shape().dimensions()),
-               testing::VectorMatcher<tensorflow::protobuf_int64>({2, 3, 2}));
+  EXPECT_THAT(literal->shape().dimensions(), ElementsAre(2, 3, 2));
   string result = LiteralUtil::ToString(*literal);
   const string expected = R"(f32[2,3,2] {
 { { 1, 2 },
@@ -182,9 +187,7 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
     {2001, 2002},
   }, /*projection_p=*/1, /*projection_z=*/2);
   // clang-format on
-  EXPECT_MATCH(
-      testing::PBToVec(literal->shape().dimensions()),
-      testing::VectorMatcher<tensorflow::protobuf_int64>({1, 2, 3, 2}));
+  EXPECT_THAT(literal->shape().dimensions(), ElementsAre(1, 2, 3, 2));
   string result = LiteralUtil::ToString(*literal);
   const string expected = R"(f32[1,2,3,2] {
   {  // i0=0
@@ -204,10 +207,8 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
 }
 
 TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) {
-  EXPECT_MATCH(
-      testing::PBToVec<tensorflow::protobuf_int64>(
-          literal_r4_2x2x3x3_dim0major_->shape().dimensions()),
-      testing::VectorMatcher<tensorflow::protobuf_int64>({2, 2, 3, 3}));
+  EXPECT_THAT(literal_r4_2x2x3x3_dim0major_->shape().dimensions(),
+              ElementsAre(2, 2, 3, 3));
   string result = LiteralUtil::ToString(*literal_r4_2x2x3x3_dim0major_);
   const string expected = R"(f32[2,2,3,3] {
   {  // i0=0
@@ -375,6 +376,15 @@ TEST_F(LiteralUtilTest, IsAll) {
   EXPECT_FALSE(
       LiteralUtil::IsAll(*LiteralUtil::CreateR2<uint64>({{9, 8}, {8, 8}}), 8));
 
+  half h8(8.0f);
+  half h9(9.0f);
+  EXPECT_TRUE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h8}, {h8}}), 8));
+  EXPECT_FALSE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h8}, {h9}}), 8));
+  EXPECT_FALSE(
+      LiteralUtil::IsAll(*LiteralUtil::CreateR2<half>({{h9}, {h8}}), 8));
+
   auto uint64_max = std::numeric_limits<uint64>::max();
   EXPECT_FALSE(LiteralUtil::IsAll(
       *LiteralUtil::CreateR2<uint64>(
@@ -471,6 +481,26 @@ TEST_F(LiteralUtilTest, ReshapeR4) {
   EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
 }
 
+TEST_F(LiteralUtilTest, ReshapeR4Dim0Minor) {
+  // clang-format off
+  // F32[1x3x2x4]
+  auto original = LiteralUtil::CreateR4WithLayout<float>({{
+     {{10, 11, 12, 13}, {14, 15, 16, 17}},
+     {{18, 19, 20, 21}, {22, 23, 24, 25}},
+     {{26, 27, 28, 29}, {30, 31, 32, 33}},
+  }}, layout_r4_dim0minor_);
+  // F32[1x3x4x2]
+  auto expected = LiteralUtil::CreateR3WithLayout<float>({
+    {{10, 11}, {12, 13}, {14, 15}, {16, 17}},
+    {{18, 19}, {20, 21}, {22, 23}, {24, 25}},
+    {{26, 27}, {28, 29}, {30, 31}, {32, 33}},
+  }, layout_r3_dim0major_);
+  // clang-format on
+  auto reshape = LiteralUtil::Reshape(*original, {3, 4, 2}).ConsumeValueOrDie();
+
+  EXPECT_TRUE(LiteralUtil::Equal(*expected, *reshape));
+}
+
 TEST_F(LiteralUtilTest, TransposeR0) {
   auto original = LiteralUtil::CreateR0<float>(1.7f);
   auto reshape = LiteralUtil::Transpose(*original, /*permutation=*/{});
@@ -516,27 +546,23 @@ TEST_F(LiteralUtilTest, TestR2LinearLayout) {
   auto mat_dim0minor = LiteralUtil::CreateR2WithLayout<int>(
       {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0minor_);
   EXPECT_EQ(mat_dim0minor->s32s_size(), 6);
-  EXPECT_MATCH(testing::PBToVec<int32>(mat_dim0minor->s32s()),
-               testing::VectorMatcher<int32>({1, 4, 2, 5, 3, 6}));
+  EXPECT_THAT(mat_dim0minor->s32s(), ElementsAre(1, 4, 2, 5, 3, 6));
 
   // Test expected memory layout when using Relayout to row major.
   auto relaid_mat_to_dim0major =
       LiteralUtil::Relayout(*mat_dim0minor, layout_r2_dim0major_);
-  EXPECT_MATCH(testing::PBToVec<int32>(relaid_mat_to_dim0major->s32s()),
-               testing::VectorMatcher<int32>({1, 2, 3, 4, 5, 6}));
+  EXPECT_THAT(relaid_mat_to_dim0major->s32s(), ElementsAre(1, 2, 3, 4, 5, 6));
 
   // Test expected memory layout of R2 created with dim0-major (row-major).
   auto mat_dim0major = LiteralUtil::CreateR2WithLayout<int>(
       {{1, 2, 3}, {4, 5, 6}}, layout_r2_dim0major_);
   EXPECT_EQ(mat_dim0major->s32s_size(), 6);
-  EXPECT_MATCH(testing::PBToVec<int32>(mat_dim0major->s32s()),
-               testing::VectorMatcher<int32>({1, 2, 3, 4, 5, 6}));
+  EXPECT_THAT(mat_dim0major->s32s(), ElementsAre(1, 2, 3, 4, 5, 6));
 
   // Test expected memory layout when using Relayout to column major.
   auto relaid_mat_to_dim0minor =
       LiteralUtil::Relayout(*mat_dim0major, layout_r2_dim0minor_);
-  EXPECT_MATCH(testing::PBToVec<int32>(relaid_mat_to_dim0minor->s32s()),
-               testing::VectorMatcher<int32>({1, 4, 2, 5, 3, 6}));
+  EXPECT_THAT(relaid_mat_to_dim0minor->s32s(), ElementsAre(1, 4, 2, 5, 3, 6));
 }
 
 TEST_F(LiteralUtilTest, TestR3LinearLayout) {
@@ -558,28 +584,28 @@ TEST_F(LiteralUtilTest, TestR3LinearLayout) {
 
   EXPECT_EQ(lit_dim0minor->s32s_size(), 12);
   std::vector<int> expected_dim0minor{1, 7, 4, 10, 2, 8, 5, 11, 3, 9, 6, 12};
-  EXPECT_MATCH(testing::PBToVec<int32>(lit_dim0minor->s32s()),
-               testing::VectorMatcher<int32>(expected_dim0minor));
+  EXPECT_THAT(lit_dim0minor->s32s(),
+              testing::ElementsAreArray(expected_dim0minor));
 
   // Test expected memory layout when using Relayout to row major.
   auto relaid_lit_to_dim0major =
       LiteralUtil::Relayout(*lit_dim0minor, layout_r3_dim0major_);
   std::vector<int> expected_dim0major{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
-  EXPECT_MATCH(testing::PBToVec<int32>(relaid_lit_to_dim0major->s32s()),
-               testing::VectorMatcher<int32>(expected_dim0major));
+  EXPECT_THAT(relaid_lit_to_dim0major->s32s(),
+              testing::ElementsAreArray(expected_dim0major));
 
   // Test expected memory layout of R3 created with dim0-major (row-major).
   auto lit_dim0major = LiteralUtil::CreateR3FromArray3DWithLayout<int>(
       arr3d, layout_r3_dim0major_);
   EXPECT_EQ(lit_dim0major->s32s_size(), 12);
-  EXPECT_MATCH(testing::PBToVec<int32>(lit_dim0major->s32s()),
-               testing::VectorMatcher<int32>(expected_dim0major));
+  EXPECT_THAT(lit_dim0major->s32s(),
+              testing::ElementsAreArray(expected_dim0major));
 
   // Test expected memory layout when using Relayout to column major.
   auto relaid_lit_to_dim0minor =
       LiteralUtil::Relayout(*lit_dim0major, layout_r3_dim0minor_);
-  EXPECT_MATCH(testing::PBToVec<int32>(relaid_lit_to_dim0minor->s32s()),
-               testing::VectorMatcher<int32>(expected_dim0minor));
+  EXPECT_THAT(relaid_lit_to_dim0minor->s32s(),
+              testing::ElementsAreArray(expected_dim0minor));
 }
 
 TEST_F(LiteralUtilTest, SliceR0S32) {
@@ -645,6 +671,30 @@ TEST_F(LiteralUtilTest, PopulateWithValueR2U64) {
   EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
 }
 
+TEST_F(LiteralUtilTest, PopulateWithValueR0F16) {
+  Literal output;
+  half h(0.25f);
+  LiteralUtil::PopulateWithValue<half>(h, {}, &output);
+  auto expected = LiteralUtil::CreateR0<half>(h);
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
+TEST_F(LiteralUtilTest, PopulateWithValueR1F16) {
+  Literal output;
+  half h(0.5f);
+  LiteralUtil::PopulateWithValue<half>(h, {3}, &output);
+  auto expected = LiteralUtil::CreateR1<half>({h, h, h});
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
+TEST_F(LiteralUtilTest, PopulateWithValueR2F16) {
+  Literal output;
+  half h(2.0f);
+  LiteralUtil::PopulateWithValue<half>(h, {2, 2}, &output);
+  auto expected = LiteralUtil::CreateR2<half>({{h, h}, {h, h}});
+  EXPECT_TRUE(LiteralUtil::Equal(output, *expected));
+}
+
 TEST_F(LiteralUtilTest, ReplicateR2U32) {
   auto input = LiteralUtil::CreateR2<uint32>(
       {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}});
@@ -656,5 +706,156 @@ TEST_F(LiteralUtilTest, ReplicateR2U32) {
   EXPECT_TRUE(LiteralUtil::Equal(*output, *expected));
 }
 
+TEST_F(LiteralUtilTest, Copy) {
+  const int64 dimensions[] = {17, 15, 34, 21};
+  const int64 layouts[][4] = {
+      {3, 2, 1, 0}, {0, 2, 1, 3}, {0, 1, 2, 3}, {2, 0, 3, 1}, {1, 3, 0, 2}};
+  for (const auto& layout : layouts) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), dimensions, layout);
+    auto blank = LiteralUtil::CreateFromShape(shape);
+    auto source = LiteralUtil::CreateFromShape(shape);
+    const int64 zero_base[] = {0, 0, 0, 0};
+    const int64 step[] = {1, 1, 1, 1};
+    uint32 seqnr = 0;
+    auto init_proc = [&](const std::vector<int64>& indexes) {
+      LiteralUtil::Set(source.get(), indexes, ++seqnr);
+      return true;
+    };
+
+    ShapeUtil::ForEachIndex(source->shape(), zero_base, dimensions, step,
+                            init_proc);
+
+    const int64 src_base[] = {3, 1, 5, 7};
+    const int64 dest_base[] = {6, 4, 12, 2};
+    const int64 copy_size[] = {7, 8, 11, 9};
+
+    TF_EXPECT_OK(LiteralUtil::Copy(*source, src_base, blank.get(), dest_base,
+                                   copy_size));
+    std::vector<int64> source_indexes(TF_ARRAYSIZE(dimensions), 0);
+    std::vector<int64> blank_indexes(TF_ARRAYSIZE(dimensions), 0);
+    bool matched = true;
+    auto check_proc = [&](const std::vector<int64>& indexes) {
+      std::copy(indexes.begin(), indexes.end(), source_indexes.begin());
+      std::transform(source_indexes.begin(), source_indexes.end(), src_base,
+                     source_indexes.begin(), std::plus<int64>());
+      std::copy(indexes.begin(), indexes.end(), blank_indexes.begin());
+      std::transform(blank_indexes.begin(), blank_indexes.end(), dest_base,
+                     blank_indexes.begin(), std::plus<int64>());
+      auto bval = LiteralUtil::Get<uint32>(*blank, blank_indexes);
+      matched = (bval != 0 &&
+                 bval == LiteralUtil::Get<uint32>(*source, source_indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(source->shape(), zero_base, copy_size, step,
+                            check_proc);
+    EXPECT_TRUE(matched);
+  }
+}
+
+TEST_F(LiteralUtilTest, CopyScalars) {
+  auto zero = LiteralUtil::CreateR0<uint32>(0);
+  auto nine = LiteralUtil::CreateR0<uint32>(9);
+  TF_EXPECT_OK(LiteralUtil::Copy(*nine, {}, zero.get(), {}, {}));
+  EXPECT_TRUE(LiteralUtil::Equal(*zero, *nine));
+
+  auto vect = LiteralUtil::CreateR1<uint32>({3, 4, 9, 12, 5, 17, 21});
+  TF_EXPECT_OK(LiteralUtil::Copy(*vect, {5}, zero.get(), {}, {}));
+  EXPECT_EQ(LiteralUtil::Get<uint32>(*zero, {}), 17);
+  TF_EXPECT_OK(LiteralUtil::Copy(*zero, {}, vect.get(), {4}, {}));
+  EXPECT_EQ(LiteralUtil::Get<uint32>(*vect, {4}), 17);
+}
+
+TEST_F(LiteralUtilTest, F16) {
+  // Verify that the internal data views are consistent and that they
+  // are in little endian format
+  // TODO - modify if we make the data format machine endianess dependent
+  auto m1 = LiteralUtil::CreateFromShape(ShapeUtil::MakeShape(F16, {2, 2}));
+  Literal* l1 = m1.get();
+  const char* d1 = (const char*)LiteralUtil::InternalData(*l1);
+  EXPECT_EQ(d1[0], 0);
+  EXPECT_EQ(d1[1], 0);
+  EXPECT_EQ(d1[2], 0);
+  EXPECT_EQ(d1[3], 0);
+  EXPECT_EQ(d1[4], 0);
+  EXPECT_EQ(d1[5], 0);
+  EXPECT_EQ(d1[6], 0);
+  EXPECT_EQ(d1[7], 0);
+  EXPECT_EQ(LiteralUtil::InternalData(*l1),
+            LiteralUtil::MutableInternalData(l1));
+
+  half h1(1.0f);
+  half h2(2.0f);
+  auto m2 = LiteralUtil::CreateR2<half>({{h1, h2}, {h2, h1}});
+  Literal* l2 = m2.get();
+  const char* d2 = (const char*)LiteralUtil::InternalData(*l2);
+  EXPECT_EQ(d2[0], 0);
+  EXPECT_EQ(d2[1], 0x3C);
+  EXPECT_EQ(d2[2], 0);
+  EXPECT_EQ(d2[3], 0x40);
+  EXPECT_EQ(d2[4], 0);
+  EXPECT_EQ(d2[5], 0x40);
+  EXPECT_EQ(d2[6], 0);
+  EXPECT_EQ(d2[7], 0x3C);
+  EXPECT_EQ(LiteralUtil::InternalData(*l2),
+            LiteralUtil::MutableInternalData(l2));
+}
+
+TEST_F(LiteralUtilTest, Populate) {
+  struct PopulateData {
+    std::vector<int64> dimensions;
+    std::vector<int64> layout;
+  } populate_data[] = {
+      {{}, {}},
+      {{16}, {0}},
+      {{4, 16}, {1, 0}},
+      {{21, 12}, {0, 1}},
+      {{6, 11, 17}, {2, 0, 1}},
+      {{6, 11, 5, 17}, {3, 2, 0, 1}},
+  };
+  for (const auto& data : populate_data) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
+        data.layout);
+    auto literal = LiteralUtil::CreateFromShape(shape);
+    auto generator = [&](tensorflow::gtl::ArraySlice<int64> indexes) -> uint32 {
+      // Offsets from linear index just to avoid R0 literals to be initialized
+      // with zero.
+      return LiteralUtil::LinearIndex(*literal, indexes) + 17;
+    };
+    TF_EXPECT_OK(LiteralUtil::Populate<uint32>(literal.get(), generator));
+
+    std::vector<int64> zero_base(data.dimensions.size(), 0);
+    std::vector<int64> step(data.dimensions.size(), 1);
+    bool matched = true;
+    auto check_function = [&](const std::vector<int64>& indexes) {
+      auto value = LiteralUtil::Get<uint32>(*literal, indexes);
+      matched = matched && (value == generator(indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
+                            check_function);
+    EXPECT_TRUE(matched);
+  }
+}
+
+TEST_F(LiteralUtilTest, ConvertR4) {
+  // clang-format off
+  auto original = LiteralUtil::CreateR4WithLayout<int8>({{
+     {{10, 11, 12, 13}, {14, 15, 16, 17}},
+     {{18, 19, 20, 21}, {22, 23, 24, 25}},
+     {{26, 27, 28, 29}, {30, 31, 32, 33}},
+  }}, layout_r4_dim0major_);
+  auto expected = LiteralUtil::CreateR4WithLayout<uint32>({{
+     {{10, 11, 12, 13}, {14, 15, 16, 17}},
+     {{18, 19, 20, 21}, {22, 23, 24, 25}},
+     {{26, 27, 28, 29}, {30, 31, 32, 33}},
+  }}, layout_r4_dim0major_);
+  // clang-format on
+  auto converted = LiteralUtil::Convert<int8, uint32>(*original);
+
+  EXPECT_TRUE(LiteralUtil::Equal(*expected, *converted));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc
index cd7c42f6e17e15b5e1c6ebfa1f24a40a9003a63e..0d4ddc239243b79d47b6a1672b65abe9b23e7b52 100644
--- a/tensorflow/compiler/xla/metric_table_report.cc
+++ b/tensorflow/compiler/xla/metric_table_report.cc
@@ -38,7 +38,8 @@ void MetricTableReport::SetEntryName(string entry_name) {
 
 void MetricTableReport::SetShowAllEntries() {
   max_entries_to_show_ = std::numeric_limits<int64>::max();
-  max_metric_proportion_to_show = 1.1;  // more than 100%
+  max_entries_per_category_to_show_ = std::numeric_limits<int64>::max();
+  max_metric_proportion_to_show_ = 1.1;  // more than 100%
 }
 
 void MetricTableReport::SetShowCategoryTable() { show_category_table_ = true; }
@@ -141,7 +142,7 @@ void MetricTableReport::AppendCategoryTable() {
   int64 categories_shown = 0;
   for (const auto& category : categories) {
     if (categories_shown >= max_entries_to_show_ ||
-        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show) {
+        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show_) {
       break;
     }
     ++categories_shown;
@@ -156,15 +157,14 @@ void MetricTableReport::AppendCategoryTable() {
                                    entry_name_, ")");
     AppendTableRow(text, category.metric_sum, metric_sum);
 
-    // Show the top few entries in the category.
-    const int64 kMaxToShow = 5;
+    // Show the top entries in the category.
     const char* const kIndentPrefix = "                              * ";
-    int64 entries_to_show =
-        std::min<int64>(kMaxToShow, category.entries.size());
-    if (category.entries.size() == kMaxToShow + 1) {
+    int64 entries_to_show = std::min<int64>(max_entries_per_category_to_show_,
+                                            category.entries.size());
+    if (category.entries.size() == entries_to_show + 1) {
       // May as well show the last entry on the line that would otherwise say
       // that there is a single entry not shown.
-      entries_to_show = category.entries.size();
+      ++entries_to_show;
     }
     for (int64 i = 0; i < entries_to_show; ++i) {
       AppendLine(kIndentPrefix, MetricPercent(category.entries[i]->metric), " ",
@@ -193,7 +193,7 @@ void MetricTableReport::AppendEntryTable() {
   int64 entries_shown = 0;
   for (const auto& entry : entries_) {
     if (entries_shown >= max_entries_to_show_ ||
-        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show) {
+        metric_sum / expected_metric_sum_ > max_metric_proportion_to_show_) {
       break;
     }
     ++entries_shown;
diff --git a/tensorflow/compiler/xla/metric_table_report.h b/tensorflow/compiler/xla/metric_table_report.h
index e967627bff4446a695bfae514faac4b1acca4968..818fb1d3fe0b8bbe1a8eba363ff6445e2f3df9d2 100644
--- a/tensorflow/compiler/xla/metric_table_report.h
+++ b/tensorflow/compiler/xla/metric_table_report.h
@@ -103,6 +103,7 @@ class MetricTableReport {
  private:
   static constexpr double kDefaultMaxMetricProportionToShow = 0.99;
   static constexpr int64 kDefaultMaxEntriesToShow = 100;
+  static constexpr int64 kDefaultMaxEntriesPerCategoryToShow = 5;
 
   // Append all parameters to the report.
   template <typename... Args>
@@ -162,7 +163,8 @@ class MetricTableReport {
 
   // These members control how many categories and entries to show in tables.
   int64 max_entries_to_show_ = kDefaultMaxEntriesToShow;
-  double max_metric_proportion_to_show = kDefaultMaxMetricProportionToShow;
+  int64 max_entries_per_category_to_show_ = kDefaultMaxEntriesPerCategoryToShow;
+  double max_metric_proportion_to_show_ = kDefaultMaxMetricProportionToShow;
 
   // The report that is being created.
   string report_;
diff --git a/tensorflow/compiler/xla/port/BUILD b/tensorflow/compiler/xla/port/BUILD
deleted file mode 100644
index 6fc5f1185c9d56075f18928e4b2c8e3819cf9ddd..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/port/BUILD
+++ /dev/null
@@ -1,33 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-# Filegroup used to collect source files for dependency checking.
-filegroup(
-    name = "c_srcs",
-    data = glob([
-        "**/*.cc",
-        "**/*.h",
-    ]),
-    visibility = ["//tensorflow/compiler/xla:internal"],
-)
-
-cc_library(
-    name = "initialize",
-    hdrs = ["initialize.h"],
-    visibility = [
-        "//tensorflow/compiler/xla:__subpackages__",
-    ],
-)
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index e3909ae8e9736351d3ee91332572b5db62727289..e4e37177a2d74e6da20300f1439942a146ad8d49 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -78,6 +78,11 @@ PrimitiveType NativeToPrimitiveType<double>() {
   return F64;
 }
 
+template <>
+PrimitiveType NativeToPrimitiveType<half>() {
+  return F16;
+}
+
 bool IsFloatingPointType(PrimitiveType type) {
   return type == F16 || type == F32 || type == F64;
 }
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index 78f0ee6f592d9b9ec2ed85f23297634c5e2e4d41..162a11c7d2966346979b98c804917203f82c806c 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -75,6 +75,8 @@ template <>
 PrimitiveType NativeToPrimitiveType<float>();
 template <>
 PrimitiveType NativeToPrimitiveType<double>();
+template <>
+PrimitiveType NativeToPrimitiveType<half>();
 
 bool IsFloatingPointType(PrimitiveType type);
 
@@ -150,6 +152,10 @@ template <>
 struct PrimitiveTypeToNative<F64> {
   using type = double;
 };
+template <>
+struct PrimitiveTypeToNative<F16> {
+  using type = half;
+};
 
 }  // namespace primitive_util
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 86c9c3b1ac38d755effad733590f78aafa9571db..4194d5fc6be0ad552e9fe6dd14b51fa0a67f2eca 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -180,14 +180,28 @@ ReferenceUtil::ReduceWindow4DGeneric(
     const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
   std::vector<int64> dim_lengths{operand.n1(), operand.n2(), operand.n3(),
                                  operand.n4()};
-  auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
+  return ReduceWindow4DGeneric(
+      operand, init, reduce_func, window, stride,
+      xla::MakePadding(dim_lengths, window, stride, padding));
+}
+
+/* static */ std::unique_ptr<Array4D<float>>
+ReferenceUtil::ReduceWindow4DGeneric(
+    const Array4D<float>& operand, float init,
+    const std::function<float(float, float)>& reduce_func,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride,
+    const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding) {
+  std::vector<int64> dim_lengths{operand.n1(), operand.n2(), operand.n3(),
+                                 operand.n4()};
 
   std::vector<int64> window_counts(window.size(), 0);
   std::vector<int64> pad_low(window.size(), 0);
   for (int64 i = 0; i < window.size(); ++i) {
+    int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second;
     window_counts[i] =
-        WindowCount(dim_lengths[i], window[i], stride[i], padding);
-    pad_low[i] = padding_both[i].first;
+        window_util::StridedBound(padded_width, window[i], stride[i]);
+    pad_low[i] = padding[i].first;
   }
   auto result = MakeUnique<Array4D<float>>(window_counts[0], window_counts[1],
                                            window_counts[2], window_counts[3]);
@@ -649,4 +663,39 @@ ReferenceUtil::ReduceToRowArray2D(
   return result;
 }
 
+/* static */ Array4D<float> ReferenceUtil::PadArray4D(
+    const Array4D<float>& operand, const PaddingConfig& padding,
+    const float pad) {
+  CHECK_EQ(padding.dimensions_size(), 4);
+
+  const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
+                                           operand.n3(), operand.n4()};
+  std::vector<int64> pad_low(4);
+  std::vector<int64> pad_high(4);
+  std::vector<int64> output_bounds(4);
+  for (int64 i = 0; i < 4; ++i) {
+    pad_low[i] = padding.dimensions(i).edge_padding_low();
+    pad_high[i] = padding.dimensions(i).edge_padding_high();
+    CHECK_EQ(padding.dimensions(i).interior_padding(), 0) << "not implemented";
+
+    output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i];
+  }
+
+  Array4D<float> result(output_bounds[0], output_bounds[1], output_bounds[2],
+                        output_bounds[3]);
+  result.Each([&](tensorflow::gtl::ArraySlice<int64> indices, float* value) {
+    for (int i = 0; i < 4; ++i) {
+      bool in_low_padding = indices[i] < pad_low[i];
+      bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
+      if (in_low_padding || in_high_padding) {
+        *value = pad;
+        return;
+      }
+    }
+    *value = operand(indices[0] - pad_low[0], indices[1] - pad_low[1],
+                     indices[2] - pad_low[2], indices[3] - pad_low[3]);
+  });
+  return result;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 9e0f247203866d544595a877fabd33af148cc307..f58f0bdc9f51dff62c10dda4aba7aac03e689ce7 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -162,6 +162,12 @@ class ReferenceUtil {
       const std::function<float(float, float)>& reduce_func,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
+  static std::unique_ptr<Array4D<float>> ReduceWindow4DGeneric(
+      const Array4D<float>& operand, float init,
+      const std::function<float(float, float)>& reduce_func,
+      const tensorflow::gtl::ArraySlice<int64>& window,
+      const tensorflow::gtl::ArraySlice<int64>& stride,
+      const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding);
 
   // Performs select and scatter with Greater Than or equal as the select, plus
   // as the scatter, and Same Padding.
@@ -395,7 +401,51 @@ class ReferenceUtil {
       const Array2D<float>& operand, const PaddingConfig& padding,
       const float pad);
 
+  // Returns the result of a 4D pad on an input array.
+  static Array4D<float> PadArray4D(const Array4D<float>& operand,
+                                   const PaddingConfig& padding,
+                                   const float pad);
+
+  // ApplyElementwise2D(f, x, y, ...) returns the Array2D formed by running
+  // f(x[i], y[i], ...) for each array element in the Array2Ds x, y, ....
+  //
+  // The given arrays must have the same size and element type, and the return
+  // type of f must be implicitly convertible to the arrays' element type.
+  //
+  // Example usage:
+  //
+  //   Array2D<float> x, y, z = ...;
+  //   std::unique_ptr<Array2D> result = ReferenceUtil::ApplyElementwise2D(
+  //     [](float a, float b, float c) { return a * b + c; }, x, y, z);
+  //
+  template <typename F, typename T1, typename... Ts>
+  static std::unique_ptr<Array2D<T1>> ApplyElementwise2D(
+      F&& f, const Array2D<T1>& array1, const Array2D<Ts>&... arrays) {
+    AssertSameSize2D(array1, arrays...);
+    auto result = MakeUnique<Array2D<T1>>(array1.n1(), array1.n2());
+    for (int64 i = 0; i < array1.n1(); ++i) {
+      for (int64 j = 0; j < array1.n2(); ++j) {
+        (*result)(i, j) = f(array1(i, j), arrays(i, j)...);
+      }
+    }
+    return result;
+  }
+
  private:
+  template <typename T1, typename T2, typename... Ts>
+  static void AssertSameSize2D(const Array2D<T1>& array1,
+                               const Array2D<T2>& array2,
+                               const Array2D<Ts>&... arrays) {
+    static_assert(std::is_same<T1, T2>::value, "Args must be same type.");
+    CHECK_EQ(array1.n1(), array2.n1());
+    CHECK_EQ(array1.n2(), array2.n2());
+    AssertSameSize2D(array2, arrays...);
+  }
+
+  // Recursive base case for AssertSameSize2D.
+  template <typename Array1>
+  static void AssertSameSize2D(const Array1& array1) {}
+
   TF_DISALLOW_COPY_AND_ASSIGN(ReferenceUtil);
 };
 
diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc
index c53351ca93e81f70920291019798f16f0f1c6a57..f839ac019df07c5c5e07eed856ea55463bb3efae 100644
--- a/tensorflow/compiler/xla/reference_util_test.cc
+++ b/tensorflow/compiler/xla/reference_util_test.cc
@@ -23,9 +23,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/padding.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
@@ -52,9 +52,9 @@ class ReferenceUtilTest : public ::testing::Test {
 
 TEST_F(ReferenceUtilTest, TransposeArray2D) {
   auto result = ReferenceUtil::TransposeArray2D(*matrix_);
-  auto result_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{1.f, 4.f}, {2.f, 5.f}, {3.f, 6.f}},
-                                       *result_literal, ErrorSpec(0.0001));
+                                       *actual_literal, ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, MatmulArray2D) {
@@ -62,32 +62,32 @@ TEST_F(ReferenceUtilTest, MatmulArray2D) {
       {7.f, 8.f}, {9.f, 10.f}, {11.f, 12.f},
   });
   auto result = ReferenceUtil::MatmulArray2D(*matrix_, rhs);
-  auto result_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{58.f, 64.f}, {139.f, 154.f}},
-                                       *result_literal, ErrorSpec(0.0001));
+                                       *actual_literal, ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, ReduceToColArray2D) {
   auto add = [](float lhs, float rhs) { return lhs + rhs; };
   auto result = ReferenceUtil::ReduceToColArray2D(*matrix_, 0.0f, add);
-  auto result_literal = LiteralUtil::CreateR1<float>(*result);
-  LiteralTestUtil::ExpectR1Near<float>({6.f, 15.f}, *result_literal,
+  auto actual_literal = LiteralUtil::CreateR1<float>(*result);
+  LiteralTestUtil::ExpectR1Near<float>({6.f, 15.f}, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, ReduceToRowArray2D) {
   auto add = [](float lhs, float rhs) { return lhs + rhs; };
   auto result = ReferenceUtil::ReduceToRowArray2D(*matrix_, 0.0f, add);
-  auto result_literal = LiteralUtil::CreateR1<float>(*result);
-  LiteralTestUtil::ExpectR1Near<float>({5.f, 7.f, 9.f}, *result_literal,
+  auto actual_literal = LiteralUtil::CreateR1<float>(*result);
+  LiteralTestUtil::ExpectR1Near<float>({5.f, 7.f, 9.f}, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, MapArray2D) {
   auto identity = [](float value) { return log(exp(value)); };
   auto result = ReferenceUtil::MapArray2D(*matrix_, identity);
-  auto result_literal = LiteralUtil::CreateR2FromArray2D(*result);
-  LiteralTestUtil::ExpectR2NearArray2D(*matrix_, *result_literal,
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  LiteralTestUtil::ExpectR2NearArray2D(*matrix_, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
@@ -96,9 +96,9 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray2D) {
     return value + row + col;
   };
   auto result = ReferenceUtil::MapWithIndexArray2D(*matrix_, add_index);
-  auto result_literal = LiteralUtil::CreateR2FromArray2D(*result);
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*result);
   LiteralTestUtil::ExpectR2Near<float>({{1.f, 3.f, 5.f}, {5.f, 7.f, 9.f}},
-                                       *result_literal, ErrorSpec(0.0001));
+                                       *actual_literal, ErrorSpec(0.0001));
 }
 
 TEST_F(ReferenceUtilTest, MapArray4D) {
@@ -107,11 +107,11 @@ TEST_F(ReferenceUtilTest, MapArray4D) {
   input->FillWithMultiples(1.0f);
   auto multiply_by_two = [](float value) { return 2 * value; };
   auto result = ReferenceUtil::MapArray4D(*input, multiply_by_two);
-  auto result_literal = LiteralUtil::CreateR4FromArray4D(*result);
+  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result);
 
   Array4D<float> expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5);
   expected.FillWithMultiples(2.0f);
-  LiteralTestUtil::ExpectR4NearArray4D(expected, *result_literal,
+  LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
@@ -124,11 +124,11 @@ TEST_F(ReferenceUtilTest, MapWithIndexArray4D) {
     return value - (3 * 4 * 5 * plane + 4 * 5 * depth + 5 * height + width);
   };
   auto result = ReferenceUtil::MapWithIndexArray4D(*input, subtract_index);
-  auto result_literal = LiteralUtil::CreateR4FromArray4D(*result);
+  auto actual_literal = LiteralUtil::CreateR4FromArray4D(*result);
 
   Array4D<float> expected(/*planes=*/2, /*depth=*/3, /*height=*/4, /*width=*/5);
   expected.Fill(0.0f);
-  LiteralTestUtil::ExpectR4NearArray4D(expected, *result_literal,
+  LiteralTestUtil::ExpectR4NearArray4D(expected, *actual_literal,
                                        ErrorSpec(0.0001));
 }
 
@@ -302,5 +302,17 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithValidPadding) {
                                               ErrorSpec(0.0001));
 }
 
+TEST_F(ReferenceUtilTest, ApplyElementwise2D) {
+  Array2D<float> a({{1, 2}, {3, 4}});
+  Array2D<float> b({{10, 20}, {30, 40}});
+  Array2D<float> c({{100, 200}, {300, 400}});
+
+  auto actual = ReferenceUtil::ApplyElementwise2D(
+      [](float x, float y, float z) { return 100 * x + 10 * y + z; }, a, b, c);
+  auto actual_literal = LiteralUtil::CreateR2FromArray2D(*actual);
+  LiteralTestUtil::ExpectR2Near({{300.f, 600.f}, {900.f, 1200.f}},
+                                *actual_literal, ErrorSpec(0.0001));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index b9118fab2549689d045a1caf826b9d3937019e1c..3c53cf4dd3c5d663cf703cce3d479cb3a2cea2eb 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -52,6 +52,7 @@ cc_test(
     deps = [
         ":shape_inference",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -64,8 +65,42 @@ cc_test(
     srcs = ["hlo_opcode_test.cc"],
     deps = [
         ":hlo",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "hlo_evaluator",
+    srcs = ["hlo_evaluator.cc"],
+    hdrs = ["hlo_evaluator.h"],
+    deps = [
+        ":hlo",
+        ":hlo_query",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_test(
+    name = "hlo_evaluator_test",
+    srcs = ["hlo_evaluator_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_evaluator",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
     ],
 )
@@ -88,6 +123,7 @@ cc_library(
         "hlo_opcode.h",
     ],
     deps = [
+        ":hlo_module_config",
         ":name_uniquer",
         ":versioned_computation_handle",
         "//tensorflow/compiler/xla:literal_util",
@@ -105,6 +141,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "hlo_matchers",
+    testonly = 1,
+    srcs = ["hlo_matchers.cc"],
+    hdrs = ["hlo_matchers.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_test(
+    name = "hlo_matchers_test",
+    srcs = ["hlo_matchers_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:shape_util",
+    ],
+)
+
 cc_library(
     name = "versioned_computation_handle",
     srcs = ["versioned_computation_handle.cc"],
@@ -122,7 +179,9 @@ cc_test(
     deps = [
         ":hlo",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -137,7 +196,6 @@ cc_library(
     deps = [
         ":hlo",
         "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
     ],
@@ -151,6 +209,42 @@ cc_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "flatten_call_graph",
+    srcs = ["flatten_call_graph.cc"],
+    hdrs = ["flatten_call_graph.h"],
+    deps = [
+        ":call_graph",
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_test(
+    name = "flatten_call_graph_test",
+    srcs = ["flatten_call_graph_test.cc"],
+    deps = [
+        ":call_graph",
+        ":flatten_call_graph",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -185,10 +279,12 @@ cc_test(
     name = "user_computation_test",
     srcs = ["user_computation_test.cc"],
     deps = [
+        ":hlo_matchers",
         ":user_computation",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
@@ -311,6 +407,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "compile_only_service",
+    srcs = ["compile_only_service.cc"],
+    hdrs = ["compile_only_service.h"],
+    deps = [
+        ":backend",
+        ":compiler",
+        ":computation_layout",
+        ":computation_tracker",
+        ":platform_util",
+        ":service",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
 cc_library(
     name = "cpu_plugin",
     deps = [
@@ -451,6 +568,7 @@ cc_library(
     hdrs = ["computation_tracker.h"],
     deps = [
         ":hlo",
+        ":hlo_module_config",
         ":session_proto",
         ":user_computation",
         ":versioned_computation_handle",
@@ -504,7 +622,6 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
     ],
 )
 
@@ -515,9 +632,6 @@ cc_test(
         ":hlo",
         ":liveness_util",
         ":tuple_points_to_analysis",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test_main",
     ],
@@ -532,6 +646,7 @@ cc_library(
         "buffer_liveness.h",
     ],
     deps = [
+        ":call_graph",
         ":hlo",
         ":hlo_ordering",
         ":liveness_util",
@@ -572,8 +687,8 @@ cc_library(
     ],
     deps = [
         ":buffer_liveness",
-        ":heap_simulator",
         ":hlo",
+        ":hlo_ordering",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -593,11 +708,17 @@ cc_test(
     srcs = ["buffer_assignment_test.cc"],
     deps = [
         ":buffer_assignment",
+        ":call_graph",
         ":computation_tracker",
+        ":copy_insertion",
         ":cpu_plugin",
+        ":flatten_call_graph",
         ":hlo",
+        ":hlo_ordering",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -607,56 +728,38 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "heap_simulator",
-    srcs = [
-        "heap_simulator.cc",
-    ],
-    hdrs = [
-        "heap_simulator.h",
-    ],
-    deps = [
-        ":hlo",
-        ":liveness_util",
-        ":logical_buffer",
-        ":tuple_points_to_analysis",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_test(
     name = "heap_simulator_test",
     srcs = ["heap_simulator_test.cc"],
     deps = [
-        ":heap_simulator",
         ":hlo",
+        ":hlo_ordering",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
     ],
 )
 
+# The hlo_ordering library contains both hlo_ordering and heap_simulator because
+# they are mutually dependent.
 cc_library(
     name = "hlo_ordering",
     srcs = [
+        "heap_simulator.cc",
         "hlo_ordering.cc",
     ],
     hdrs = [
+        "heap_simulator.h",
         "hlo_ordering.h",
     ],
     deps = [
-        ":heap_simulator",
+        ":call_graph",
         ":hlo",
+        ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -710,6 +813,7 @@ cc_test(
     name = "instruction_fusion_test",
     srcs = ["instruction_fusion_test.cc"],
     deps = [
+        ":hlo_matchers",
         ":instruction_fusion",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test_main",
@@ -743,10 +847,11 @@ cc_test(
         ":algebraic_simplifier",
         ":cpu_plugin",
         ":hlo",
+        ":hlo_matchers",
         ":hlo_pass",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -764,7 +869,9 @@ cc_library(
         ":hlo_pass",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -773,9 +880,11 @@ cc_test(
     srcs = ["reshape_mover_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_matchers",
         ":reshape_mover",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -806,10 +915,11 @@ cc_test(
     deps = [
         ":cpu_plugin",
         ":hlo",
+        ":hlo_matchers",
         ":inliner",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -941,8 +1051,10 @@ cc_test(
     deps = [
         ":cpu_plugin",
         ":hlo",
+        ":hlo_matchers",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test_main",
@@ -972,7 +1084,7 @@ cc_test(
         ":hlo",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1024,10 +1136,12 @@ cc_test(
     srcs = ["tuple_points_to_analysis_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_matchers",
         ":instruction_fusion",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1089,6 +1203,7 @@ cc_library(
         ":buffer_liveness",
         ":hlo",
         ":hlo_pass",
+        ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:status_macros",
@@ -1103,13 +1218,14 @@ cc_test(
     name = "copy_insertion_test",
     srcs = ["copy_insertion_test.cc"],
     deps = [
-        ":buffer_liveness",
         ":copy_insertion",
         ":cpu_plugin",
         ":hlo",
+        ":hlo_matchers",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1137,16 +1253,7 @@ cc_library(
     name = "hlo_verifier",
     srcs = ["hlo_verifier.cc"],
     hdrs = ["hlo_verifier.h"],
-    deps = [
-        ":hlo",
-        ":hlo_pass",
-        "//tensorflow/compiler/xla:status",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
-    ],
+    deps = [":hlo_pass"],
 )
 
 cc_library(
@@ -1156,10 +1263,12 @@ cc_library(
     deps = [
         ":buffer_liveness",
         ":call_graph",
+        ":flatten_call_graph",
         ":hlo",
         ":hlo_cost_analysis",
         ":hlo_dce",
         ":hlo_ordering",
+        ":liveness_util",
         ":logical_buffer",
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:shape_util",
@@ -1177,6 +1286,7 @@ cc_test(
     deps = [
         ":cpu_plugin",
         ":hlo",
+        ":hlo_matchers",
         ":hlo_ordering",
         ":hlo_rematerialization",
         "//tensorflow/compiler/xla:shape_util",
@@ -1203,6 +1313,7 @@ cc_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
+        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
@@ -1215,10 +1326,12 @@ cc_test(
         ":computation_layout",
         ":cpu_plugin",
         ":hlo",
+        ":hlo_matchers",
         ":layout_assignment",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1274,7 +1387,6 @@ cc_library(
         ":hlo_pass",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
@@ -1288,6 +1400,7 @@ cc_test(
         ":cpu_plugin",
         ":hlo",
         ":hlo_cse",
+        ":hlo_matchers",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
@@ -1310,13 +1423,34 @@ cc_library(
         ":hlo_pass",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
 
+cc_test(
+    name = "hlo_constant_folding_test",
+    srcs = ["hlo_constant_folding_test.cc"],
+    deps = [
+        ":cpu_plugin",
+        ":hlo",
+        ":hlo_constant_folding",
+        ":hlo_matchers",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = ["device_memory_allocator.cc"],
@@ -1403,6 +1537,33 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_tfgraph_builder",
+    srcs = ["hlo_tfgraph_builder.cc"],
+    hdrs = ["hlo_tfgraph_builder.h"],
+    visibility = ["//tensorflow/compiler/xla/tools:__pkg__"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_test(
+    name = "hlo_tfgraph_builder_test",
+    srcs = ["hlo_tfgraph_builder_test.cc"],
+    deps = [
+        ":hlo_tfgraph_builder",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "hlo_graph_dumper",
     srcs = [
@@ -1412,6 +1573,7 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_execution_profile",
+        ":hlo_tfgraph_builder",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
@@ -1429,7 +1591,9 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/core:lib",
     ],
@@ -1440,11 +1604,15 @@ cc_test(
     srcs = ["transpose_folding_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_matchers",
+        ":shape_inference",
         ":transpose_folding",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 4c058484b9fbdeeabd5c240cc85b7439181896df..6e6da38f9e33bd8bd7723a3a96608a2eacf2a5a2 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -51,6 +51,16 @@ bool IsLiteralWithValue(const HloInstruction* operand, int8 value) {
          LiteralUtil::IsAll(operand->literal(), value);
 }
 
+bool IsAll(const HloInstruction* op, int8 value) {
+  if (IsLiteralWithValue(op, value)) {
+    return true;
+  }
+  if (op->opcode() == HloOpcode::kBroadcast && IsAll(op->operand(0), value)) {
+    return true;
+  }
+  return false;
+}
+
 // Returns whether the given transpose produces a result which is bit-wise
 // identical to its operand and thus may be replaced with a bitcast.
 bool TransposeIsBitcast(const HloInstruction* transpose) {
@@ -112,6 +122,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleBroadcast(HloInstruction* broadcast) override;
 
+  Status HandleConcatenate(
+      HloInstruction* concatenate,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
+
   Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override;
 
   Status HandleConvert(HloInstruction* convert,
@@ -146,9 +160,19 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
                       tensorflow::gtl::ArraySlice<int64> dimensions,
                       HloComputation* function) override;
 
+  Status HandleReduceWindow(HloInstruction* reduce_window,
+                            HloInstruction* operand, const Window& window,
+                            HloComputation* function) override;
+
   Status HandleReverse(HloInstruction* reverse,
                        HloInstruction* operand) override;
   Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override;
+  Status HandleDynamicSlice(HloInstruction* slice, HloInstruction* operand,
+                            HloInstruction* start_indices) override;
+  Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice,
+                                  HloInstruction* operand,
+                                  HloInstruction* update,
+                                  HloInstruction* start_indices) override;
 
   Status HandleTranspose(HloInstruction* transpose) override;
 
@@ -210,6 +234,29 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   StatusOr<bool> TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
       HloInstruction* reshape_or_broadcast);
 
+  // Replaces the existing HLO instruction old_instruction, with
+  // new_instruction, and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceWithNewInstruction(
+      HloInstruction* old_instruction,
+      std::unique_ptr<HloInstruction> new_instruction) {
+    TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+        old_instruction, std::move(new_instruction)));
+    changed_ = true;
+    return Status::OK();
+  }
+
+  // Replaces the existing HLO instruction old_instruction, with
+  // new_instruction, and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceInstruction(HloInstruction* old_instruction,
+                            HloInstruction* new_instruction) {
+    TF_RETURN_IF_ERROR(
+        computation_->ReplaceInstruction(old_instruction, new_instruction));
+    changed_ = true;
+    return Status::OK();
+  }
+
   // Current HloComputation instance the AlgebraicSimplifierVisitor is
   // traversing.
   HloComputation* computation_;
@@ -258,8 +305,7 @@ void AlgebraicSimplifierVisitor::ReplaceWithBitcast(
   auto bitcast = computation_->AddInstruction(
       HloInstruction::CreateUnary(instruction->shape(), HloOpcode::kBitcast,
                                   instruction->mutable_operand(0)));
-  TF_CHECK_OK(computation_->ReplaceInstruction(instruction, bitcast));
-  changed_ = true;
+  TF_CHECK_OK(ReplaceInstruction(instruction, bitcast));
 }
 
 bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape(
@@ -267,9 +313,7 @@ bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape(
   if (!SameShape(old_instruction, new_instruction)) {
     return false;
   }
-  TF_CHECK_OK(
-      computation_->ReplaceInstruction(old_instruction, new_instruction));
-  changed_ = true;
+  TF_CHECK_OK(ReplaceInstruction(old_instruction, new_instruction));
   return true;
 }
 
@@ -278,12 +322,12 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add,
                                              HloInstruction* rhs) {
   // A + 0 => A
   VLOG(10) << "trying transform [A + 0 => A]: " << add->ToString();
-  if (IsLiteralWithValue(rhs, 0) && ReplaceInstructionIfSameShape(add, lhs)) {
+  if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(add, lhs)) {
     return Status::OK();
   }
   // 0 + A => A
   VLOG(10) << "trying transform [0 + A => A]: " << add->ToString();
-  if (IsLiteralWithValue(lhs, 0) && ReplaceInstructionIfSameShape(add, rhs)) {
+  if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(add, rhs)) {
     return Status::OK();
   }
 
@@ -297,12 +341,45 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy,
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleConcatenate(
+    HloInstruction* concatenate,
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+  if (operands.size() == 1) {
+    // Unary concatenates are useless.
+    ReplaceInstructionIfSameShape(concatenate, operands[0]);
+    return Status::OK();
+  }
+  // Filter out and remove empty operands.
+  std::vector<HloInstruction*> nonempty_operands;
+  for (HloInstruction* operand : operands) {
+    if (!ShapeUtil::HasZeroElements(operand->shape())) {
+      nonempty_operands.push_back(operand);
+    }
+  }
+  if (nonempty_operands.size() < operands.size()) {
+    HloInstruction* replacement;
+    if (nonempty_operands.empty()) {
+      replacement = operands[0];
+    } else if (nonempty_operands.size() == 1) {
+      replacement = nonempty_operands[0];
+    } else {
+      replacement =
+          computation_->AddInstruction(concatenate->CloneWithNewOperands(
+              concatenate->shape(), nonempty_operands));
+    }
+    VLOG(10) << "trying to replace " << concatenate->ToString() << " with "
+             << replacement->ToString();
+    ReplaceInstructionIfSameShape(concatenate, replacement);
+  }
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleSubtract(HloInstruction* sub,
                                                   HloInstruction* lhs,
                                                   HloInstruction* rhs) {
   // A - 0 => A
   VLOG(10) << "trying transform [A - 0 => A]: " << sub->ToString();
-  if (IsLiteralWithValue(rhs, 0) && ReplaceInstructionIfSameShape(sub, lhs)) {
+  if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(sub, lhs)) {
     return Status::OK();
   }
 
@@ -314,8 +391,7 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
                                                 HloInstruction* rhs) {
   // A/1 => A
   VLOG(10) << "trying transform [A/1 => A]: " << divide->ToString();
-  if (IsLiteralWithValue(rhs, 1) &&
-      ReplaceInstructionIfSameShape(divide, lhs)) {
+  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(divide, lhs)) {
     return Status::OK();
   }
 
@@ -326,8 +402,7 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide,
         computation_->AddInstruction(HloInstruction::CreateBinary(
             divide->shape(), HloOpcode::kSubtract, lhs->mutable_operand(0),
             rhs->mutable_operand(0)));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         divide, HloInstruction::CreateUnary(divide->shape(), HloOpcode::kExp,
                                             subtract));
   }
@@ -354,8 +429,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
       ShapeUtil::HasZeroElements(rhs->shape())) {
     auto zero = computation_->AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f)));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
   }
 
@@ -364,8 +438,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
     auto new_dot = computation_->AddInstruction(HloInstruction::CreateBinary(
         ShapeUtil::PermuteDimensions({1, 0}, dot->shape()), HloOpcode::kDot,
         rhs->mutable_operand(0), lhs->mutable_operand(0)));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
   }
 
@@ -373,8 +446,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
   //
   // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
   if (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(0) == 1) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
                                           lhs, rhs));
   }
@@ -398,8 +470,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
     auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
         ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
         {0}, add_reduce_computation));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateReshape(dot->shape(), reduce));
   }
 
@@ -438,8 +509,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
                                {rhs->shape().dimensions(1)}),
           multiply, zero, {0}, add_reduce_computation));
     }
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateReshape(dot->shape(), reduce));
   }
 
@@ -465,8 +535,7 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot,
         ShapeUtil::MakeShape(dot->shape().element_type(),
                              {lhs->shape().dimensions(0)}),
         multiply, zero, {1}, add_reduce_computation));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateReshape(dot->shape(), reduce));
   }
   return Status::OK();
@@ -477,14 +546,12 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply,
                                                   HloInstruction* rhs) {
   // A*1 => A
   VLOG(10) << "trying transform [A*1 => A]: " << multiply->ToString();
-  if (IsLiteralWithValue(rhs, 1) &&
-      ReplaceInstructionIfSameShape(multiply, lhs)) {
+  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(multiply, lhs)) {
     return Status::OK();
   }
   // 1*A => A
   VLOG(10) << "trying transform [1*A => A]: " << multiply->ToString();
-  if (IsLiteralWithValue(lhs, 1) &&
-      ReplaceInstructionIfSameShape(multiply, rhs)) {
+  if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(multiply, rhs)) {
     return Status::OK();
   }
   return Status::OK();
@@ -605,8 +672,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
           ShapeUtil::ElementsIn(operand->shape())) {
     VLOG(10) << "transform broadcast(X) -> reshape(X) where "
                 "n(broadcast(X)) == n(X)";
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         broadcast, HloInstruction::CreateReshape(broadcast->shape(), operand));
   }
 
@@ -618,8 +684,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
           ShapeUtil::ElementsIn(operand->shape())) {
     VLOG(10) << "transform broadcast(X) -> transpose(X) where "
                 "n(broadcast(X)) == n(X)";
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         broadcast, HloInstruction::CreateTranspose(broadcast->shape(), operand,
                                                    broadcast->dimensions()));
   }
@@ -639,8 +704,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
       for (auto inserted_index : inserted_indices) {
         dims.erase(dims.begin() + inserted_index);
       }
-      changed_ = true;
-      return computation_->ReplaceWithNewInstruction(
+      return ReplaceWithNewInstruction(
           broadcast,
           HloInstruction::CreateBroadcast(broadcast->shape(),
                                           operand->mutable_operand(0), dims));
@@ -683,65 +747,6 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
   return Status::OK();
 }
 
-template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
-static std::unique_ptr<HloInstruction> ConvertIfTypesMatch(
-    const Literal& src_literal) {
-  CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
-
-  return HloInstruction::CreateConstant(
-      LiteralUtil::Convert<typename primitive_util::PrimitiveTypeToNative<
-                               primitive_src_type>::type,
-                           typename primitive_util::PrimitiveTypeToNative<
-                               primitive_dest_type>::type>(src_literal));
-}
-
-template <PrimitiveType primitive_src_type>
-static std::unique_ptr<HloInstruction> ConvertIfDestTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
-  switch (primitive_dest_type) {
-#define CONVERT_IF_TYPES_MATCH(type) \
-  case (type):                       \
-    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal);
-    CONVERT_IF_TYPES_MATCH(PRED)
-    CONVERT_IF_TYPES_MATCH(S8)
-    CONVERT_IF_TYPES_MATCH(S32)
-    CONVERT_IF_TYPES_MATCH(S64)
-    CONVERT_IF_TYPES_MATCH(U8)
-    CONVERT_IF_TYPES_MATCH(U32)
-    CONVERT_IF_TYPES_MATCH(U64)
-    CONVERT_IF_TYPES_MATCH(F32)
-    CONVERT_IF_TYPES_MATCH(F64)
-#undef CONVERT_IF_TYPES_MATCH
-    // Other types are not yet supported.
-    default:
-      LOG(FATAL) << "Unimplemented: ConvertIfDestTypeMatches for type "
-                 << PrimitiveType_Name(src_literal.shape().element_type());
-  }
-}
-
-static std::unique_ptr<HloInstruction> ConvertIfSrcTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
-  switch (src_literal.shape().element_type()) {
-#define CONVERT_IF_DEST_TYPE_MATCHES(type) \
-  case (type):                             \
-    return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type);
-    CONVERT_IF_DEST_TYPE_MATCHES(PRED)
-    CONVERT_IF_DEST_TYPE_MATCHES(S8)
-    CONVERT_IF_DEST_TYPE_MATCHES(S32)
-    CONVERT_IF_DEST_TYPE_MATCHES(S64)
-    CONVERT_IF_DEST_TYPE_MATCHES(U8)
-    CONVERT_IF_DEST_TYPE_MATCHES(U32)
-    CONVERT_IF_DEST_TYPE_MATCHES(U64)
-    CONVERT_IF_DEST_TYPE_MATCHES(F32)
-    CONVERT_IF_DEST_TYPE_MATCHES(F64)
-#undef CONVERT_IF_DEST_TYPE_MATCHES
-    // Other types are not yet supported.
-    default:
-      LOG(FATAL) << "Unimplemented: ConvertIfSrcTypeMatches for type "
-                 << PrimitiveType_Name(src_literal.shape().element_type());
-  }
-}
-
 // A conversion to the same element type as the operand is a nop and can be
 // removed.  A conversion of a constant can be simplified by making a new
 // constant.
@@ -750,16 +755,7 @@ Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert,
   PrimitiveType src_type = operand->shape().element_type();
   PrimitiveType dest_type = convert->shape().element_type();
   if (src_type == dest_type) {
-    changed_ = true;
-    return computation_->ReplaceInstruction(convert, operand);
-  }
-  if (operand->opcode() == HloOpcode::kConstant) {
-    const Literal& src_literal = operand->literal();
-    std::unique_ptr<HloInstruction> new_constant =
-        ConvertIfSrcTypeMatches(src_literal, dest_type);
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(convert,
-                                                   std::move(new_constant));
+    return ReplaceInstruction(convert, operand);
   }
   return Status::OK();
 }
@@ -845,8 +841,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
 
     std::unique_ptr<HloInstruction> slice = HloInstruction::CreateSlice(
         pad->shape(), nonzero_pad, start_indices, end_indices);
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(pad, std::move(slice));
+    return ReplaceWithNewInstruction(pad, std::move(slice));
   }
 
   return Status::OK();
@@ -856,7 +851,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power,
                                                HloInstruction* lhs,
                                                HloInstruction* rhs) {
   VLOG(10) << "trying transform [pow(A, 0) => 1]: " << power->ToString();
-  if (IsLiteralWithValue(rhs, 0)) {
+  if (IsAll(rhs, 0)) {
     auto one = HloInstruction::CreateConstant(LiteralUtil::CloneToUnique(
         LiteralUtil::One(power->shape().element_type())));
     std::unique_ptr<HloInstruction> ones;
@@ -866,30 +861,27 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power,
       ones = HloInstruction::CreateBroadcast(
           power->shape(), computation_->AddInstruction(std::move(one)), {});
     }
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(power, std::move(ones));
+    return ReplaceWithNewInstruction(power, std::move(ones));
   }
 
   VLOG(10) << "trying transform [pow(A, 1) => A]: " << power->ToString();
-  if (IsLiteralWithValue(rhs, 1) && ReplaceInstructionIfSameShape(power, lhs)) {
+  if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(power, lhs)) {
     return Status::OK();
   }
 
   VLOG(10) << "trying transform [pow(A, 2) => A*A]: " << power->ToString();
-  if (IsLiteralWithValue(rhs, 2)) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+  if (IsAll(rhs, 2)) {
+    return ReplaceWithNewInstruction(
         power, HloInstruction::CreateBinary(power->shape(),
                                             HloOpcode::kMultiply, lhs, lhs));
   }
 
   VLOG(10) << "trying transform [pow(A, -1) => 1/A]: " << power->ToString();
-  if (IsLiteralWithValue(rhs, -1)) {
+  if (IsAll(rhs, -1)) {
     auto* one = computation_->AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CloneToUnique(
             LiteralUtil::One(rhs->shape().element_type()))));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kDivide,
                                             one, lhs));
   }
@@ -967,17 +959,24 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
 Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   auto operand = reshape->mutable_operand(0);
 
+  // Reshape directly to empty constant if the shape contains zero-element
+  // dimension.
+  if (ShapeUtil::HasZeroElements(reshape->shape())) {
+    auto empty_constant = HloInstruction::CreateConstant(
+        LiteralUtil::CreateFromShape(reshape->shape()));
+
+    return ReplaceWithNewInstruction(reshape, std::move(empty_constant));
+  }
+
   // Delete no-op reshapes, i.e. where shape = operand shape.
   if (SameShape(reshape, operand)) {
     VLOG(10) << "deleting no-op reshape";
-    changed_ = true;
-    return computation_->ReplaceInstruction(reshape, operand);
+    return ReplaceInstruction(reshape, operand);
   }
 
   // Merge reshapes.
   if (HloOpcode::kReshape == operand->opcode()) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reshape, HloInstruction::CreateReshape(reshape->shape(),
                                                operand->mutable_operand(0)));
   }
@@ -986,8 +985,7 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
     auto opt_dims = ReshapeLeavesDimensionsUnmodified(
         reshape, reshape->operand(0)->dimensions());
     if (opt_dims.first) {
-      changed_ = true;
-      return computation_->ReplaceWithNewInstruction(
+      return ReplaceWithNewInstruction(
           reshape,
           HloInstruction::CreateBroadcast(
               reshape->shape(), reshape->mutable_operand(0)->mutable_operand(0),
@@ -1023,8 +1021,7 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse,
   };
   if (std::all_of(reverse->dimensions().begin(), reverse->dimensions().end(),
                   dim_is_one)) {
-    changed_ = true;
-    return computation_->ReplaceInstruction(reverse, operand);
+    return ReplaceInstruction(reverse, operand);
   }
   return Status::OK();
 }
@@ -1038,12 +1035,31 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice,
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleDynamicSlice(
+    HloInstruction* dynamic_slice, HloInstruction* operand,
+    HloInstruction* start_indices) {
+  if (ShapeUtil::IsScalar(dynamic_slice->shape())) {
+    return ReplaceInstruction(dynamic_slice, operand);
+  }
+  return Status::OK();
+}
+
+Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
+    HloInstruction* dynamic_update_slice, HloInstruction* operand,
+    HloInstruction* update, HloInstruction* start_indices) {
+  // DynamicUpdateSlice on a scalar just passes through the update argument.
+  if (ShapeUtil::IsScalar(dynamic_update_slice->shape())) {
+    return ReplaceInstruction(dynamic_update_slice, update);
+  }
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function) {
   if (ShapeUtil::HasZeroElements(arg->shape()) ||
       ShapeUtil::HasZeroElements(reduce->shape())) {
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reduce,
         HloInstruction::CreateBroadcast(reduce->shape(), init_value, {}));
     return Status::OK();
@@ -1056,7 +1072,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(
     for (auto dim : dimensions) {
       new_reduce_dimensions.push_back(transpose_dimensions[dim]);
     }
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateReduce(
                     reduce->shape(), arg->mutable_operand(0), init_value,
                     new_reduce_dimensions, function));
@@ -1100,7 +1116,7 @@ Status AlgebraicSimplifierVisitor::HandleReduce(
           new_reduce_dimensions.push_back(i);
         }
       }
-      return computation_->ReplaceWithNewInstruction(
+      return ReplaceWithNewInstruction(
           reduce, HloInstruction::CreateReduce(
                       reduce->shape(), arg->mutable_operand(0), init_value,
                       new_reduce_dimensions, function));
@@ -1111,27 +1127,84 @@ Status AlgebraicSimplifierVisitor::HandleReduce(
       ShapeUtil::HasZeroElements(arg->shape())) {
     auto reshape = computation_->AddInstruction(
         HloInstruction::CreateReshape(reduce->shape(), arg));
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateMap(reduce->shape(),
                                           {reshape, init_value}, function));
   }
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleReduceWindow(
+    HloInstruction* reduce_window, HloInstruction* operand,
+    const Window& window, HloComputation* function) {
+  VLOG(10) << "Considering folding Pad: " << operand->ToString()
+           << "\ninto reduce-window: " << reduce_window->ToString();
+
+  // This optimization folds a pad op into reduce_window.
+  if (operand->opcode() != HloOpcode::kPad) {
+    VLOG(10) << "Not folding pad into reduce-window as there is no pad.";
+    return Status::OK();
+  }
+
+  // Do not fold interior padding into ReduceWindow since the backends do not
+  // support it.
+  const PaddingConfig& pad_config = operand->padding_config();
+  if (HasInteriorPadding(pad_config)) {
+    VLOG(10) << "Not folding pad into reduce-window due to interior padding.";
+    return Status::OK();
+  }
+
+  // If reduce_window already has padding, the pad value of the pad op and the
+  // init value of reduce_window must match to allow folding the pad.
+  const HloInstruction* pad_value = operand->operand(1);
+  const HloInstruction* reduce_init_value = reduce_window->operand(1);
+  if (pad_value != reduce_init_value) {
+    // The pad value is usually a constant, so we handle that case and do not
+    // try to get more fancy about proving equivalence in cases beyond that.
+    if (pad_value->opcode() != HloOpcode::kConstant ||
+        reduce_init_value->opcode() != HloOpcode::kConstant ||
+        !LiteralUtil::Equal(pad_value->literal(),
+                            reduce_init_value->literal())) {
+      VLOG(10)
+          << "Not folding pad into reduce-window due to different pad values.";
+      return Status::OK();
+    }
+  }
+
+  // Carry out the folding of the pad into reduce_window.
+  VLOG(10) << "Folding pad into reduce-window.";
+  Window new_window = window;
+  const int64 rank = ShapeUtil::Rank(reduce_window->shape());
+  TF_RET_CHECK(pad_config.dimensions_size() == rank);
+  TF_RET_CHECK(window.dimensions_size() == rank);
+  for (int64 i = 0; i < rank; ++i) {
+    const auto& pad_dim = pad_config.dimensions(i);
+    auto& window_dim = *new_window.mutable_dimensions(i);
+    window_dim.set_padding_low(window_dim.padding_low() +
+                               pad_dim.edge_padding_low());
+    window_dim.set_padding_high(window_dim.padding_high() +
+                                pad_dim.edge_padding_high());
+  }
+  return ReplaceWithNewInstruction(
+      reduce_window, HloInstruction::CreateReduceWindow(
+                         /*shape=*/reduce_window->shape(),
+                         /*operand=*/operand->mutable_operand(0),
+                         /*init_value=*/reduce_window->mutable_operand(1),
+                         /*window=*/new_window,
+                         /*reduce_computation=*/function));
+}
+
 Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
   auto operand = transpose->mutable_operand(0);
 
   if (std::is_sorted(transpose->dimensions().begin(),
                      transpose->dimensions().end())) {
     VLOG(10) << "deleting no-op transpose";
-    changed_ = true;
-    return computation_->ReplaceInstruction(transpose, operand);
+    return ReplaceInstruction(transpose, operand);
   }
 
   if (HloOpcode::kTranspose == operand->opcode()) {
-    changed_ = true;
-    return computation_->ReplaceWithNewInstruction(
+    return ReplaceWithNewInstruction(
         transpose, HloInstruction::CreateTranspose(
                        transpose->shape(), operand->mutable_operand(0),
                        ComposePermutations(operand->dimensions(),
@@ -1258,9 +1331,7 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
   auto new_rhs = add_bitcast(new_filter_shape, rhs);
   auto dot = computation_->AddInstruction(HloInstruction::CreateBinary(
       dot_output_shape, HloOpcode::kDot, new_lhs, new_rhs));
-  changed_ = true;
-  return computation_->ReplaceInstruction(convolution,
-                                          add_bitcast(convolution_shape, dot));
+  return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot));
 }
 
 bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape(
@@ -1274,8 +1345,7 @@ bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape(
 
   auto clamp = HloInstruction::CreateTernary(root->shape(), HloOpcode::kClamp,
                                              max_operand, operand, min_operand);
-  TF_CHECK_OK(computation_->ReplaceWithNewInstruction(root, std::move(clamp)));
-  changed_ = true;
+  TF_CHECK_OK(ReplaceWithNewInstruction(root, std::move(clamp)));
   return true;
 }
 
@@ -1348,13 +1418,20 @@ Status AlgebraicSimplifierVisitor::HandleMinimum(HloInstruction* minimum,
 StatusOr<bool> AlgebraicSimplifier::Run(HloModule* module) {
   XLA_VLOG_LINES(2,
                  "AlgebraicSimplifier::Run(), before:\n" + module->ToString());
-  bool changed =
-      std::any_of(module->computations().begin(), module->computations().end(),
-                  [=](const std::unique_ptr<HloComputation>& computation) {
-                    return AlgebraicSimplifierVisitor::Run(
-                        computation.get(), is_layout_sensitive_,
-                        valid_bitcast_callback_, enable_dot_simplification_);
-                  });
+  bool changed = false;
+  // Make a copy of the computations because we may add computations to the
+  // module, invalidating iteration.
+  std::vector<HloComputation*> computations;
+  for (auto& comp : module->computations()) {
+    computations.push_back(comp.get());
+  }
+  for (auto& comp : computations) {
+    if (AlgebraicSimplifierVisitor::Run(comp, is_layout_sensitive_,
+                                        valid_bitcast_callback_,
+                                        enable_dot_simplification_)) {
+      changed = true;
+    }
+  }
   XLA_VLOG_LINES(2,
                  "AlgebraicSimplifier::Run(), after:\n" + module->ToString());
   return changed;
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 27a1c0fec8855810cd016b36b1706a17c0204d63..87d8a7165ccfad587474a0c89e9387597e341d8f 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -23,21 +23,25 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
 AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() {
   return [](const Shape&, const Shape&) { return true; };
 }
+
 AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() {
   return [](const Shape&, const Shape&) { return false; };
 }
@@ -66,6 +70,52 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   EXPECT_EQ(root, param0);
 }
 
+TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2f32, "param0"));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+  HloInstruction* bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(r2f32, zero, {0, 1}));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
+TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2f32, "param0"));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({0, 0, 0})));
+  HloInstruction* bcast =
+      builder.AddInstruction(HloInstruction::CreateBroadcast(r2f32, zero, {1}));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
 // Test that A - 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, SubZero) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
@@ -157,9 +207,7 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_EQ(root, add);
-  EXPECT_EQ(root->operand(0), param1);
-  EXPECT_EQ(root->operand(1), param2);
+  EXPECT_THAT(root, op::Add(param1, param2));
 }
 
 // Test that exp(A)/exp(B) is simplified to exp(A-B)
@@ -179,17 +227,16 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kDivide);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Divide(op::Exp(param0), op::Exp(param1)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kExp);
-  EXPECT_EQ(root->operand_count(), 1);
-  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kSubtract);
-  EXPECT_EQ(root->operand(0)->operand(0), param0);
-  EXPECT_EQ(root->operand(0)->operand(1), param1);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Exp(op::Subtract(param0, param1)));
 }
 
 // Test that ln(exp(A)) is simplified to A
@@ -205,14 +252,14 @@ TEST_F(AlgebraicSimplifierTest, LnExp) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kLog);
+
+  EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kParameter);
-  EXPECT_EQ(root, param0);
+
+  EXPECT_EQ(computation->root_instruction(), param0);
 }
 
 // Test that ln(exp(A)/exp(B)) is simplified to A-B
@@ -234,15 +281,15 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kLog);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Log(op::Divide(op::Exp(param0), op::Exp(param1))));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
-  EXPECT_EQ(root->operand(0), param0);
-  EXPECT_EQ(root->operand(1), param1);
+
+  EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1));
 }
 
 // Test that pow(A, 0) where A is a scalar is simplified to the scalar
@@ -259,11 +306,15 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
   HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kConstant);
+  EXPECT_THAT(root, op::Constant());
   EXPECT_EQ(LiteralUtil::GetFirstElement<float>(root->literal()), 1);
 }
 
@@ -280,11 +331,15 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
   HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
+  EXPECT_THAT(root, op::Broadcast());
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), r1f32))
       << ShapeUtil::HumanString(root->shape());
   EXPECT_EQ(root->dimensions().size(), 0);
@@ -306,12 +361,14 @@ TEST_F(AlgebraicSimplifierTest, Pow1) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Power(param0, one));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kParameter);
-  EXPECT_EQ(root, param0);
+
+  EXPECT_EQ(computation->root_instruction(), param0);
 }
 
 // Test that pow(A, 2) is simplified to A*A.
@@ -327,13 +384,14 @@ TEST_F(AlgebraicSimplifierTest, Pow2) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Power(param0, two));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kMultiply);
-  EXPECT_EQ(root->operand(0), param0);
-  EXPECT_EQ(root->operand(1), param0);
+
+  EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0));
 }
 
 // Test that pow(A, -1) is simplified to 1/A.
@@ -349,15 +407,17 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
   HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kDivide);
-  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kConstant);
+  EXPECT_THAT(root, op::Divide(op::Constant(), param0));
   EXPECT_EQ(LiteralUtil::GetFirstElement<float>(root->operand(0)->literal()),
             1);
-  EXPECT_EQ(root->operand(1), param0);
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
@@ -376,12 +436,15 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
   auto computation = builder.Build();
   auto module = MakeUnique<HloModule>(TestName());
   module->AddEntryComputation(std::move(computation));
-  HloInstruction* root = module->entry_computation()->root_instruction();
+
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Reshape(op::Broadcast(op::Reshape(op))));
+
   HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
                                              non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  root = module->entry_computation()->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kParameter);
+
+  EXPECT_THAT(module->entry_computation()->root_instruction(), op);
 }
 
 // Test that convert(A, $TYPE) is simplified to A if A is of type $TYPE.
@@ -395,103 +458,117 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(HloOpcode::kConvert, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kConstant, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), input);
 }
 
-TEST_F(AlgebraicSimplifierTest, ConvertF32ToS64) {
+// Test that copies are removed.
+TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32, "param0"));
   builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input));
+      HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(HloOpcode::kConvert, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kConstant, computation->root_instruction()->opcode());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<int64>(
-                computation->root_instruction()->literal()),
-            42);
+  EXPECT_THAT(computation->root_instruction(), param0);
 }
 
-TEST_F(AlgebraicSimplifierTest, ConvertS64ToF32) {
+// Test that unary concatenates are removed.
+TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
   HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(42)));
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
   builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
+      HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(HloOpcode::kConvert, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kConstant, computation->root_instruction()->opcode());
-  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(
-                computation->root_instruction()->literal()),
-            42.0f);
+  EXPECT_THAT(computation->root_instruction(), param0);
 }
 
-TEST_F(AlgebraicSimplifierTest, ConvertF32ArrayToS64Array) {
+// Test that empty operands of concatenates are removed.
+TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
+  const int kParamLength = 100;
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
   HloComputation::Builder builder(TestName());
-  HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant(
-      LiteralUtil::CreateR1<float>({42.0f, 19.0f})));
-  builder.AddInstruction(
-      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input));
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r1f32, "param1"));
+  HloInstruction* empty_literal = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({})));
+  HloInstruction* empty_slice =
+      builder.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {0}), param1, {42}, {42}));
+  Shape result_shape = ShapeUtil::MakeShape(F32, {3 * kParamLength});
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(HloOpcode::kConvert, computation->root_instruction()->opcode());
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Concatenate(empty_literal, param0, param0, empty_slice, param1));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kConstant, computation->root_instruction()->opcode());
-  EXPECT_EQ(
-      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {0}),
-      42);
-  EXPECT_EQ(
-      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {1}),
-      19);
+  EXPECT_THAT(computation->root_instruction(),
+              op::Concatenate(param0, param0, param1));
 }
 
-// Test that copies are removed.
-TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
-  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+// Test a concatenate with only empty operands is removed.
+TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
+  const int kParamLength = 100;
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32, "param0"));
-  HloInstruction* copy = builder.AddInstruction(
-      HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
+      HloInstruction::CreateParameter(0, r1f32, "param0"));
+  HloInstruction* empty_literal = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({})));
+  HloInstruction* empty_slice =
+      builder.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {0}), param0, {42}, {42}));
+  Shape result_shape = ShapeUtil::MakeShape(F32, {0});
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      result_shape, {empty_literal, empty_slice}, 0));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(copy, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Concatenate(empty_literal, empty_slice));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(param0, computation->root_instruction());
+  EXPECT_EQ(computation->root_instruction(), empty_literal);
 }
 
 // Test that a simplification which changes layouts is not performed if layout
@@ -511,14 +588,14 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
 
-  EXPECT_EQ(copy, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   // Copy has not been removed.
-  EXPECT_EQ(copy, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 }
 
 // Test that a simplification which preserves layouts is performed if layout
@@ -538,14 +615,14 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
 
-  EXPECT_EQ(copy, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Copy has been removed.
-  EXPECT_EQ(param0, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), param0);
 }
 
 // Test that a reshape which could be replaced with a bitcast is not if
@@ -566,14 +643,14 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(reshape, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   // Reshape is not replaced with a bitcast.
-  EXPECT_EQ(reshape, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 }
 
 // Test transforming reshapes to bitcasts under various conditions.
@@ -612,22 +689,18 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(transformable_reshape, computation->root_instruction()->operand(0));
-  EXPECT_EQ(dimensions_wrong_reshape,
-            computation->root_instruction()->operand(1));
-  EXPECT_EQ(layout_wrong_reshape, computation->root_instruction()->operand(2));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Tuple(transformable_reshape, dimensions_wrong_reshape,
+                        layout_wrong_reshape));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
   simplifier.Run(module.get()).ValueOrDie();
 
   // Verify that only the first reshape is replaced.
-  EXPECT_NE(transformable_reshape, computation->root_instruction()->operand(0));
-  EXPECT_EQ(HloOpcode::kBitcast,
-            computation->root_instruction()->operand(0)->opcode());
-  EXPECT_EQ(dimensions_wrong_reshape,
-            computation->root_instruction()->operand(1));
-  EXPECT_EQ(layout_wrong_reshape, computation->root_instruction()->operand(2));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
@@ -645,14 +718,16 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
                                    HloOpcode::kMaximum, movable_reshape, zero));
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kMaximum);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Maximum(op::Reshape(param), zero));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  bitcasting_callback());
+
   simplifier.Run(module.get()).ValueOrDie();
-  EXPECT_EQ(HloOpcode::kReshape, computation->root_instruction()->opcode());
-  EXPECT_EQ(HloOpcode::kMaximum,
-            computation->root_instruction()->operand(0)->opcode());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Maximum(param, zero)));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
@@ -672,13 +747,14 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_EQ(HloOpcode::kBitcast, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
@@ -698,13 +774,14 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
                                  bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_EQ(HloOpcode::kBitcast, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
@@ -717,23 +794,20 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
       builder.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {2, 1, 2}), param0));
 
-  HloInstruction* reshape2 =
       builder.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(F32, {1, 2, 1, 1, 2, 1}), reshape1));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(reshape2, computation->root_instruction());
-  EXPECT_EQ(reshape1, computation->root_instruction()->operand(0));
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Reshape(param0)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kReshape, computation->root_instruction()->opcode());
-  EXPECT_EQ(HloOpcode::kParameter,
-            computation->root_instruction()->operand(0)->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
@@ -746,25 +820,21 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {3, 4, 2}), param0, {1, 2, 0}));
 
-  HloInstruction* transpose2 =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {4, 3, 2}), transpose1, {1, 0, 2}));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(transpose2, computation->root_instruction());
-  EXPECT_EQ(transpose1, computation->root_instruction()->operand(0));
+  EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kTranspose, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Transpose(param0));
   EXPECT_EQ(std::vector<int64>({2, 1, 0}),
             computation->root_instruction()->dimensions());
-  EXPECT_EQ(HloOpcode::kParameter,
-            computation->root_instruction()->operand(0)->opcode());
 }
 
 // Test merging reshape and broadcast.
@@ -780,13 +850,14 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(),
+              op::Broadcast(op::Reshape(param0)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kBroadcast, computation->root_instruction()->opcode());
-  EXPECT_EQ(HloOpcode::kParameter,
-            computation->root_instruction()->operand(0)->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
 
 // Test merging broadcast and reshape.
@@ -802,13 +873,14 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param0)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(HloOpcode::kBroadcast, computation->root_instruction()->opcode());
-  EXPECT_EQ(HloOpcode::kParameter,
-            computation->root_instruction()->operand(0)->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
@@ -821,11 +893,17 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), broadcast));
 
   auto module = MakeUnique<HloModule>(TestName());
-  module->AddEntryComputation(builder.Build());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
@@ -840,12 +918,16 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
   auto module = MakeUnique<HloModule>(TestName());
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  EXPECT_EQ(HloOpcode::kBroadcast, computation->root_instruction()->opcode());
-  EXPECT_MATCH(computation->root_instruction()->dimensions(),
-               testing::VectorMatcher<int64>({3}));
+
+  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
+  EXPECT_THAT(computation->root_instruction()->dimensions(),
+              ::testing::ElementsAre(3));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
@@ -860,15 +942,18 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
   auto module = MakeUnique<HloModule>(TestName());
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  EXPECT_EQ(HloOpcode::kBroadcast, computation->root_instruction()->opcode());
+
+  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
   const std::vector<int64> broadcast_dims =
       computation->root_instruction()->dimensions();
   EXPECT_EQ(1, broadcast_dims.size());
-  EXPECT_TRUE(broadcast_dims[0] == 1 || broadcast_dims[0] == 2 ||
-              broadcast_dims[3] == 3);
+  EXPECT_THAT(broadcast_dims[0], ::testing::AnyOf(1, 2, 3));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
@@ -881,11 +966,17 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
       ShapeUtil::MakeShape(F32, {6, 8}), broadcast));
 
   auto module = MakeUnique<HloModule>(TestName());
-  module->AddEntryComputation(builder.Build());
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Broadcast(param)));
 }
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
@@ -908,10 +999,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_EQ(1, computation->instruction_count());
+
+  EXPECT_THAT(computation->root_instruction(), param);
 }
 
 TEST_F(AlgebraicSimplifierTest, NegativePadding) {
@@ -951,18 +1045,14 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
     return false;
   };
 
-  EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(computation->root_instruction(), pad);
+  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
   EXPECT_TRUE(has_negative_padding(pad));
 
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
 
-  EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kSlice);
-  const HloInstruction* root_operand =
-      computation->root_instruction()->operand(0);
-  EXPECT_EQ(root_operand->opcode(), HloOpcode::kPad);
-  EXPECT_FALSE(has_negative_padding(root_operand));
+  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
+  EXPECT_FALSE(
+      has_negative_padding(computation->root_instruction()->operand(0)));
 }
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
@@ -976,10 +1066,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Reshape(param));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_EQ(1, computation->instruction_count());
+
+  EXPECT_THAT(computation->root_instruction(), param);
 }
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
@@ -996,10 +1089,13 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
   HloModule module(TestName());
   HloComputation* computation = module.AddEntryComputation(builder.Build());
 
+  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_EQ(1, computation->instruction_count());
+
+  EXPECT_THAT(computation->root_instruction(), param);
 }
 
 TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
@@ -1235,21 +1331,21 @@ TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
   HloInstruction* min = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMinimum, param0, min_value));
-  HloInstruction* max = builder.AddInstruction(
+  builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMaximum, min, max_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root, max);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Maximum(op::Minimum(param0, min_value), max_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  ASSERT_EQ(root->opcode(), HloOpcode::kClamp);
-  EXPECT_EQ(root->operand(0), max_value);
-  EXPECT_EQ(root->operand(1), param0);
-  EXPECT_EQ(root->operand(2), min_value);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Clamp(max_value, param0, min_value));
 }
 
 // Test that min(max(A, x), y) is transformed to clamp(x, A, y) for scalar
@@ -1265,21 +1361,21 @@ TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMaximum, param0, max_value));
-  HloInstruction* min = builder.AddInstruction(
+  builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Maximum(param0, max_value), min_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kClamp);
-  EXPECT_EQ(root->operand(0), max_value);
-  EXPECT_EQ(root->operand(1), param0);
-  EXPECT_EQ(root->operand(2), min_value);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Clamp(max_value, param0, min_value));
 }
 
 // Test that min(max(A, x), y) is transformed to clamp(x, A, y) for
@@ -1296,21 +1392,21 @@ TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
       r1f32, HloOpcode::kMaximum, param0, max_value));
-  HloInstruction* min = builder.AddInstruction(
+  builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kMinimum, max, min_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Maximum(param0, max_value), min_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kClamp);
-  EXPECT_EQ(root->operand(0), max_value);
-  EXPECT_EQ(root->operand(1), param0);
-  EXPECT_EQ(root->operand(2), min_value);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Clamp(max_value, param0, min_value));
 }
 
 // Test that min(max(A, non-constant1), non-constant2) is not canonicalized to
@@ -1326,17 +1422,21 @@ TEST_F(AlgebraicSimplifierTest, MinMaxNotToClamp) {
       HloInstruction::CreateParameter(2, r0f32, "param2"));
   HloInstruction* max = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMaximum, param0, max_value));
-  HloInstruction* min = builder.AddInstruction(
+  builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Maximum(param0, max_value), min_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Maximum(param0, max_value), min_value));
 }
 
 // Test that min(f(max(A, constant1)), constant2) is not transformed to
@@ -1354,18 +1454,23 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) {
       r0f32, HloOpcode::kMaximum, param0, max_value));
   HloInstruction* fmax = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, max, max_value));
-  HloInstruction* min = builder.AddInstruction(HloInstruction::CreateBinary(
+  builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kMinimum, fmax, min_value));
 
   HloModule module(TestName());
   auto computation = module.AddEntryComputation(builder.Build());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
+                          min_value));
+
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
   EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
-  root = computation->root_instruction();
-  EXPECT_EQ(root, min);
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
+                          min_value));
 }
 
 // Test that slice(broadcast(/*scalar value*/)) simplifies to a single
@@ -1402,8 +1507,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
 
   root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  EXPECT_EQ(scalar_param, root->operand(0));
+  EXPECT_THAT(root, op::Broadcast(scalar_param));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape));
 }
 
@@ -1440,11 +1544,90 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
 
   root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  EXPECT_EQ(forty_two, root->operand(0));
+  EXPECT_THAT(root, op::Broadcast(forty_two));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
 }
 
+// Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
+TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Create operand to the pad.
+  HloInstruction* operand =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 2, 3, 4}), "p0"));
+
+  // Create the pad.
+  PaddingConfig padding = MakeNoPaddingConfig(4);
+  padding.mutable_dimensions(1)->set_edge_padding_low(1);
+  padding.mutable_dimensions(3)->set_edge_padding_high(2);
+
+  HloInstruction* pad_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0f)));
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(F32, {1, 3, 3, 5}), operand, pad_value, padding));
+
+  // Create add computation.
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  // Create the reduce-window.
+  Window window;
+  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+    auto* dim = window.add_dimensions();
+    dim->set_size(1);
+    dim->set_padding_low(10);
+    dim->set_padding_high(100);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+  const Shape reduce_window_shape =
+      ShapeUtil::MakeShape(F32, {111, 113, 113, 115});
+  HloInstruction* reduce_init_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(5.0f)));
+  HloInstruction* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          reduce_window_shape, pad, reduce_init_value, window,
+          add_computation));
+
+  // Build the computation and run the simplifier.
+  auto computation = module.AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root, reduce_window);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+
+  // Running simplification again should not result in any further changes.
+  ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
+
+  // Verify the result
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::ReduceWindow(operand, op::Constant()));
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
+      << ShapeUtil::HumanString(root->shape()) << " vs "
+      << ShapeUtil::HumanString(reduce_window_shape);
+  EXPECT_EQ(root->window().dimensions(0).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(1).padding_low(), 11);
+  EXPECT_EQ(root->window().dimensions(2).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(3).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(0).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(1).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(2).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
+}
+
 TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   HloComputation::Builder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {448, 2048, 1, 1});
@@ -1461,10 +1644,39 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kParameter);
   EXPECT_EQ(a, root);
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
 }
 
+TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
+  // Dots add computations to the parent module. Test that, when the HloModule's
+  // computations are updated, then iterator invalidation doesn't occur
+  // when running on subsequent computations.
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {1});
+  HloComputation::Builder builder(TestName() + ".Dot");
+  HloInstruction* x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r1f32, "x"));
+  HloInstruction* y =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r1f32, HloOpcode::kDot, x, y));
+  std::unique_ptr<HloComputation> dot_computation(builder.Build());
+
+  HloComputation::Builder call_builder(TestName() + ".Call");
+  HloInstruction* zero = call_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({0.0f})));
+  HloInstruction* one = call_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({1.0f})));
+  builder.AddInstruction(
+      HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get()));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  module->AddEmbeddedComputation(std::move(dot_computation));
+  module->AddEntryComputation(call_builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index e59fad4e05252ebd54b3a7cecbdf990127a5264c..83759a7a0c62222b81b82b8a0f8e0396a8f17eff 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -64,8 +64,9 @@ GlobalDataHandle AllocationTracker::RegisterInternal(
     auto& allocation = FindOrDie(handle_to_allocation_, handle);
     int ref_count = allocation->ref_count();
     CHECK_GT(ref_count, 0);
-    VLOG(2) << "ref_count: " << ref_count << " -> " << ref_count + 1;
-    allocation->increment_ref_count();
+    VLOG(2) << "ref_count: " << ref_count << " -> " <<
+            (ref_count + initial_ref_count);
+    allocation->increment_ref_count(initial_ref_count);
   } else {
     handle = next_handle_++;
     VLOG(2) << "ref_count: " << initial_ref_count;
@@ -125,9 +126,7 @@ tensorflow::Status AllocationTracker::DeallocateShape(
     handle_map.erase(device_memory->opaque());
   }
 
-  // TODO(b/36256956) Ideally tuple elements could always be distinct buffers.
-  if (ShapeUtil::IsTuple(shape) &&
-      backend->transfer_manager()->TupleElementsAreDistinctBuffers()) {
+  if (ShapeUtil::IsTuple(shape)) {
     // Traverse into tuple recursively deallocating buffers.
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                         backend->stream_executor(device_ordinal));
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index e00768001620275d702c2f96a89d981526ea81a7..ebbf35b6fe87bc7322ccb99cfe8f8eed56de06b3 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -63,10 +63,10 @@ class Allocation {
     CHECK_GE(ref_count_, 0);
     return ref_count_;
   }
-  void increment_ref_count() {
+  void increment_ref_count(int inc) {
     CHECK_GT(ref_count_, 0);
-    CHECK_LT(ref_count_, INT_MAX);
-    ++ref_count_;
+    CHECK_LE(ref_count_, INT_MAX - inc);
+    ref_count_ += inc;
   }
   void decrement_ref_count() {
     CHECK_GT(ref_count_, 0);
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 5c05417c6dcb887b5352d1270c24a4eae62149e3..1913617fecf757a529bbdc803b4227a560c6e1cf 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -41,13 +41,39 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+BackendOptions& BackendOptions::set_platform(
+    perftools::gputools::Platform* platform) {
+  platform_ = platform;
+  return *this;
+}
+
+perftools::gputools::Platform* BackendOptions::platform() const {
+  return platform_;
+}
+
+BackendOptions& BackendOptions::set_number_of_replicas(int number_of_replicas) {
+  number_of_replicas_ = number_of_replicas;
+  return *this;
+}
+
+int BackendOptions::number_of_replicas() const { return number_of_replicas_; }
+
+BackendOptions& BackendOptions::set_intra_op_parallelism_threads(
+    int num_threads) {
+  intra_op_parallelism_threads_ = num_threads;
+  return *this;
+}
+
+int BackendOptions::intra_op_parallelism_threads() const {
+  return intra_op_parallelism_threads_;
+}
+
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
 struct Backend::EigenThreadPoolWrapper {
-  explicit EigenThreadPoolWrapper()
-      : pool(new tensorflow::thread::ThreadPool(
-            tensorflow::Env::Default(), "XLAEigen",
-            tensorflow::port::NumSchedulableCPUs())),
+  explicit EigenThreadPoolWrapper(const int num_threads)
+      : pool(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(),
+                                                "XLAEigen", num_threads)),
         wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
         device(new Eigen::ThreadPoolDevice(wrapper.get(),
                                            wrapper->NumThreads())) {}
@@ -58,18 +84,21 @@ struct Backend::EigenThreadPoolWrapper {
 };
 
 /* static */ StatusOr<std::unique_ptr<Backend>> Backend::CreateBackend(
-    perftools::gputools::Platform* platform, int64 replica_count) {
+    const BackendOptions& options) {
+  int64 replica_count = options.number_of_replicas();
   if (replica_count == -1) {
     legacy_flags::BackendFlags* flags = legacy_flags::GetBackendFlags();
     replica_count = flags->xla_replicas;
   }
+  perftools::gputools::Platform* platform = options.platform();
   TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
   TF_ASSIGN_OR_RETURN(auto stream_executors,
                       PlatformUtil::GetStreamExecutors(platform));
   TF_ASSIGN_OR_RETURN(auto transfer_manager,
                       TransferManager::GetForPlatform(platform));
-  std::unique_ptr<Backend> backend(new Backend(
-      replica_count, platform, compiler, stream_executors, transfer_manager));
+  std::unique_ptr<Backend> backend(
+      new Backend(replica_count, platform, compiler, stream_executors,
+                  transfer_manager, options.intra_op_parallelism_threads()));
   TF_RETURN_IF_ERROR(backend->PoolStreams(kInitialStreamsToPool,
                                           backend->default_stream_executor()));
   return std::move(backend);
@@ -79,7 +108,9 @@ struct Backend::EigenThreadPoolWrapper {
 Backend::CreateDefaultBackend() {
   TF_ASSIGN_OR_RETURN(se::Platform * platform,
                       PlatformUtil::GetDefaultPlatform());
-  return CreateBackend(platform);
+  BackendOptions backend_options;
+  backend_options.set_platform(platform);
+  return CreateBackend(backend_options);
 }
 
 tensorflow::Status Backend::PoolStreams(int n, se::StreamExecutor* executor) {
@@ -114,7 +145,7 @@ Backend::Backend(
     int64 replica_count, perftools::gputools::Platform* platform,
     Compiler* compiler,
     tensorflow::gtl::ArraySlice<se::StreamExecutor*> stream_executors,
-    TransferManager* transfer_manager)
+    TransferManager* transfer_manager, int intra_op_parallelism_threads)
     : platform_(platform),
       compiler_(compiler),
       transfer_manager_(transfer_manager),
@@ -144,7 +175,11 @@ Backend::Backend(
     inter_op_thread_pool_.reset(new tensorflow::thread::ThreadPool(
         tensorflow::Env::Default(), "xla_inter_op",
         tensorflow::port::NumSchedulableCPUs()));
-    intra_op_thread_pool_wrapper_.reset(new EigenThreadPoolWrapper());
+    const int num_threads = intra_op_parallelism_threads > 0
+                                ? intra_op_parallelism_threads
+                                : tensorflow::port::NumSchedulableCPUs();
+    intra_op_thread_pool_wrapper_.reset(
+        new EigenThreadPoolWrapper(num_threads));
   }
 }
 
@@ -190,10 +225,17 @@ tensorflow::thread::ThreadPool* Backend::inter_op_thread_pool() const {
 
 const Eigen::ThreadPoolDevice* Backend::eigen_intra_op_thread_pool_device()
     const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) return nullptr;
+  if (intra_op_thread_pool_wrapper_ == nullptr) {
+    return nullptr;
+  }
   return intra_op_thread_pool_wrapper_->device.get();
 }
 
+tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
+  if (intra_op_thread_pool_wrapper_ == nullptr) return nullptr;
+  return intra_op_thread_pool_wrapper_->pool.get();
+}
+
 StatusOr<perftools::gputools::StreamExecutor*> Backend::stream_executor(
     int device_ordinal) const {
   if (device_ordinal < 0 ||
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index 9f6829b7d937cec6a67d4016a40506de5df8572d..1068bac2779e9a3dc6c23c0b9fbcc5403fcc2815 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -39,6 +39,31 @@ struct ThreadPoolDevice;
 
 namespace xla {
 
+// Options to configure the backend when it is created.
+class BackendOptions {
+ public:
+  // Set the platform backing the backend, or nullptr for the default platform.
+  BackendOptions& set_platform(perftools::gputools::Platform* platform);
+  perftools::gputools::Platform* platform() const;
+
+  // Set the number of replicas to use when compiling replicated
+  // programs. The default is -1 meaning that the value is read from
+  // the xla_replicas flag.
+  BackendOptions& set_number_of_replicas(int number_of_replicas);
+  int number_of_replicas() const;
+
+  // Sets the thread pool size for parallel execution of an individual operator.
+  // The default value of -1 will result in initializing the thread pool with
+  // the number of threads equal to the number of cores in the system.
+  BackendOptions& set_intra_op_parallelism_threads(int num_threads);
+  int intra_op_parallelism_threads() const;
+
+ private:
+  perftools::gputools::Platform* platform_ = nullptr;
+  int number_of_replicas_ = -1;
+  int intra_op_parallelism_threads_ = -1;
+};
+
 // Class which encapsulates an XLA backend. It includes everything necessary
 // to compile and execute computations on a particular platform.
 //
@@ -53,9 +78,9 @@ class Backend {
   static constexpr int kInitialStreamsToPool = 8;
 
   // Creates a new backend for the given platform with the given number of
-  // replicas. A value of -1 means to use the flag value.
+  // replicas.
   static StatusOr<std::unique_ptr<Backend>> CreateBackend(
-      perftools::gputools::Platform* platform, int64 replica_count = -1);
+      const BackendOptions& options);
 
   // Creates a backend for the default platform. The default platform is defined
   // in PlatformUtil.
@@ -150,6 +175,7 @@ class Backend {
   // For the host platform, returns the configured eigen threadpool device to be
   // used for scheduling work. For other platforms, returns NULL.
   const Eigen::ThreadPoolDevice* eigen_intra_op_thread_pool_device() const;
+  tensorflow::thread::ThreadPool* eigen_intra_op_thread_pool() const;
 
   // Resets the devices associated with this backend.
   Status ResetDevices();
@@ -160,7 +186,7 @@ class Backend {
           Compiler* compiler,
           tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
               stream_executors,
-          TransferManager* transfer_manager);
+          TransferManager* transfer_manager, int intra_op_parallelism_threads);
   Backend(const Backend&) = delete;
   Backend& operator=(const Backend&) = delete;
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index e2b550fc022610c72aa312281727c9c2aea66388..ccb84b026e8782bdf76006a484ac5077a616fb5f 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -41,6 +41,8 @@ limitations under the License.
 
 namespace xla {
 
+using ::tensorflow::gtl::FlatMap;
+using ::tensorflow::gtl::FlatSet;
 using ::tensorflow::strings::Appendf;
 using ::tensorflow::strings::HumanReadableNumBytes;
 
@@ -394,8 +396,8 @@ Status GatherComputationsByAllocationType(
 
   // Sets for quickly checking membership. Computations are returned in vectors
   // for stable iteration.
-  tensorflow::gtl::FlatSet<HloComputation*> thread_local_set;
-  tensorflow::gtl::FlatSet<HloComputation*> global_set;
+  FlatSet<HloComputation*> thread_local_set;
+  FlatSet<HloComputation*> global_set;
 
   while (!worklist.empty()) {
     auto worklist_front = worklist.front();
@@ -487,21 +489,10 @@ Status GatherComputationsByAllocationType(
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
     const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
     LogicalBuffer::SizeFunction buffer_size, int64 alignment,
-    bool colocate_related_buffers,
-    const std::vector<const HloInstruction*>* hlos_to_allocate) {
+    bool allow_input_output_aliasing) {
   BufferAssigner assigner(std::move(buffer_size), alignment,
-                          colocate_related_buffers);
-  return assigner.CreateAssignment(module, std::move(hlo_ordering),
-                                   hlos_to_allocate);
-}
-
-/* static */
-StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
-    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-    LogicalBuffer::SizeFunction buffer_size, int64 alignment) {
-  return BufferAssigner::Run(module, std::move(hlo_ordering),
-                             std::move(buffer_size), alignment,
-                             /*colocate_related_buffers=*/true);
+                          allow_input_output_aliasing);
+  return assigner.CreateAssignment(module, std::move(hlo_ordering));
 }
 
 bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
@@ -535,6 +526,28 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
               << " may interfere with " << buffer;
       return false;
     }
+    // Copy instruction don't share a buffer with their input operand.
+    if (buffer.instruction()->IsUserOf(assigned_buffer.instruction()) &&
+        buffer.instruction()->opcode() == HloOpcode::kCopy) {
+      VLOG(4) << "Can't assign: assignee " << assigned_buffer
+              << " is used at copy instruction " << buffer;
+      return false;
+    }
+  }
+
+  if (allow_input_output_aliasing_ && allocation->maybe_live_out()) {
+    HloComputation* entry_computation =
+        assignment->module_->entry_computation();
+    for (auto param : entry_computation->parameter_instructions()) {
+      for (auto& param_buffer :
+           assignment->points_to_analysis().GetBuffersDefinedByInstruction(
+               param)) {
+        if (assignment->liveness().MayInterfere(*param_buffer, buffer)) {
+          VLOG(4) << "Can't assign: Parameter interference with result";
+          return false;
+        }
+      }
+    }
   }
 
   // If the buffer is live out of the computation then it should only be
@@ -554,31 +567,28 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
 
 Status BufferAssigner::AssignBuffersForComputation(
     const HloComputation* computation, bool is_thread_local,
-    const tensorflow::gtl::FlatSet<const HloInstruction*>* hlos_to_allocate,
-    const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
-    const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
-        colocated_allocations,
+    const FlatSet<const LogicalBuffer*>& colocated_buffers,
+    const FlatSet<BufferAllocation::Index>& colocated_allocations,
+    FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>*
+        buffers_to_assign_sequentially,
     BufferAssignment* assignment) {
   // Buffers are sorted and assigned to BufferAllocations in decreasing order of
   // size.
   std::vector<const LogicalBuffer*> sorted_buffers;
   for (auto& instruction : computation->instructions()) {
-    if (hlos_to_allocate == nullptr ||
-        hlos_to_allocate->count(instruction.get()) > 0) {
-      // Add all buffers which this instruction defines. Instruction which don't
-      // define buffers (eg, bitcast which just forwards a pointer) don't need
-      // any allocations.
-      for (const LogicalBuffer* buffer :
-           assignment->points_to_analysis().GetBuffersDefinedByInstruction(
-               instruction.get())) {
-        sorted_buffers.push_back(buffer);
-      }
+    // Add all buffers which this instruction defines. Instruction which don't
+    // define buffers (eg, bitcast which just forwards a pointer) don't need
+    // any allocations.
+    for (const LogicalBuffer* buffer :
+         assignment->points_to_analysis().GetBuffersDefinedByInstruction(
+             instruction.get())) {
+      sorted_buffers.push_back(buffer);
     }
   }
 
   // Generate a post order sort of instructions for sorting of the
   // LogicalBuffers.
-  tensorflow::gtl::FlatMap<const HloInstruction*, int> post_order_position;
+  FlatMap<const HloInstruction*, int> post_order_position;
   int position = 0;
   for (auto* instruction : computation->MakeInstructionPostOrder()) {
     post_order_position.emplace(instruction, position);
@@ -588,9 +598,16 @@ Status BufferAssigner::AssignBuffersForComputation(
   // If there is a sequential instruction ordering, we'll delay assignment of
   // temp buffers until after the main assignment loop.
   const BufferLiveness& liveness = assignment->liveness();
-  const std::vector<const HloInstruction*>* sequential_order =
-      liveness.hlo_ordering().SequentialOrder(*computation);
-  tensorflow::gtl::FlatSet<const LogicalBuffer*> unassigned_temp_buffers;
+  const bool has_sequential_order =
+      liveness.hlo_ordering().SequentialOrder(*computation) != nullptr;
+  if (has_sequential_order && buffers_to_assign_sequentially != nullptr) {
+    // Every sequential computation must get an entry in the
+    // buffers_to_assign_sequentially map, even if we end up with an empty set
+    // of buffers. This ensures we can correctly determine whether to run
+    // whole-module heap simulation.
+    buffers_to_assign_sequentially->emplace(computation,
+                                            FlatSet<const LogicalBuffer*>());
+  }
 
   // Sort the LogicalBuffers first by size. We assign the larger LogicalBuffers
   // first for simplicity. This means any previously created BufferAllocation is
@@ -609,7 +626,7 @@ Status BufferAssigner::AssignBuffersForComputation(
   // important reuse case where an elementwise instruction reuses one of its
   // operand's buffer. This improves locality.
   std::sort(sorted_buffers.begin(), sorted_buffers.end(),
-            [this, sequential_order, &liveness, &post_order_position](
+            [this, has_sequential_order, &liveness, &post_order_position](
                 const LogicalBuffer* a, const LogicalBuffer* b) {
               // Primary sort is by decreasing buffer size.
               const int64 a_size = buffer_size_(*a);
@@ -619,7 +636,7 @@ Status BufferAssigner::AssignBuffersForComputation(
               }
               // Otherwise live out buffers come before others, if the
               // instructions are sequentially ordered.
-              if (sequential_order != nullptr) {
+              if (has_sequential_order) {
                 const bool a_live_out = liveness.MaybeLiveOut(*a);
                 const bool b_live_out = liveness.MaybeLiveOut(*b);
                 if (a_live_out != b_live_out) {
@@ -756,7 +773,7 @@ Status BufferAssigner::AssignBuffersForComputation(
       }
     }
 
-    if (!assignment->HasAllocation(*buffer) && sequential_order != nullptr &&
+    if (!assignment->HasAllocation(*buffer) && has_sequential_order &&
         !liveness.MaybeLiveOut(*buffer)) {
       // There is a sequential instruction ordering, so we delay assignment of
       // temp buffers until after the loop. We do this right before we decide to
@@ -768,7 +785,7 @@ Status BufferAssigner::AssignBuffersForComputation(
       // for the definition of temp buffers.
       CHECK(!is_entry_parameter) << *buffer;
       CHECK(!is_thread_local) << *buffer;
-      unassigned_temp_buffers.insert(buffer);
+      (*buffers_to_assign_sequentially)[computation].insert(buffer);
       VLOG(3) << "Delaying assignment of temp buffer: " << *buffer;
       continue;
     }
@@ -782,27 +799,68 @@ Status BufferAssigner::AssignBuffersForComputation(
     }
   }
 
-  if (!unassigned_temp_buffers.empty()) {
-    TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
-        *sequential_order, unassigned_temp_buffers, *computation, assignment));
-  }
   return Status::OK();
 }
 
 Status BufferAssigner::AssignBuffersWithSequentialOrdering(
-    const std::vector<const HloInstruction*>& sequence,
-    const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers_to_assign,
-    const HloComputation& computation, BufferAssignment* assignment) {
+    const FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>&
+        buffers_to_assign_sequentially,
+    bool run_whole_module_heap_simulation, BufferAssignment* assignment) {
   // Run the sequence of instructions through the heap simulator.  The heuristic
   // that seems to give the best results is lazy-best-fit, with all runs of
   // alloc / free calls sorted in decreasing size order.
-  TF_ASSIGN_OR_RETURN(
-      HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
-                             MakeUnique<LazyBestFitHeap>(alignment_)),
-                         sequence, computation,
-                         assignment->points_to_analysis(), buffer_size_,
-                         &buffers_to_assign));
+  const HloOrdering& hlo_ordering = assignment->liveness().hlo_ordering();
+  if (run_whole_module_heap_simulation) {
+    // Run the heap simulation over the whole module. This reduces memory usage,
+    // since buffers for kCall and kWhile sub-computations are only live for the
+    // duration of their calling instructions.
+    VLOG(1) << "Running whole-module heap simulation";
+    SequentialHloOrdering::HloModuleSequence module_sequence;
+    FlatSet<const LogicalBuffer*> all_buffers_to_assign;
+    for (const auto& pair : buffers_to_assign_sequentially) {
+      const HloComputation* computation = pair.first;
+      const FlatSet<const LogicalBuffer*>& buffers_to_assign = pair.second;
+      const std::vector<const HloInstruction*>* instruction_sequence =
+          hlo_ordering.SequentialOrder(*computation);
+      CHECK(instruction_sequence != nullptr) << computation->name();
+      module_sequence[computation] = *instruction_sequence;
+      all_buffers_to_assign.insert(buffers_to_assign.begin(),
+                                   buffers_to_assign.end());
+    }
+    TF_ASSIGN_OR_RETURN(
+        const HeapSimulator::Result result,
+        HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
+                               MakeUnique<LazyBestFitHeap>(alignment_)),
+                           assignment->module(), module_sequence,
+                           assignment->points_to_analysis(), buffer_size_,
+                           &all_buffers_to_assign));
+    AssignBuffersFromHeapSimulator(result, assignment);
+  } else {
+    // Run the heap-simulation on a per-computation basis. Buffers for
+    // sub-computations are assigned disjoint BufferAllocations, assuming the
+    // worst-case that they may all be live concurrently.
+    VLOG(1) << "Running per-computation heap simulation";
+    for (const auto& pair : buffers_to_assign_sequentially) {
+      const HloComputation* computation = pair.first;
+      const FlatSet<const LogicalBuffer*>& buffers_to_assign = pair.second;
+      const std::vector<const HloInstruction*>* instruction_sequence =
+          hlo_ordering.SequentialOrder(*computation);
+      CHECK(instruction_sequence != nullptr) << computation->name();
+      TF_ASSIGN_OR_RETURN(
+          const HeapSimulator::Result result,
+          HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
+                                 MakeUnique<LazyBestFitHeap>(alignment_)),
+                             *computation, *instruction_sequence,
+                             assignment->points_to_analysis(), buffer_size_,
+                             &buffers_to_assign));
+      AssignBuffersFromHeapSimulator(result, assignment);
+    }
+  }
+  return Status::OK();
+}
+
+void BufferAssigner::AssignBuffersFromHeapSimulator(
+    const HeapSimulator::Result& result, BufferAssignment* assignment) {
   if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) {
     assignment->stats_.preallocated_temp_fragmentation_bytes =
         result.fragmentation_size;
@@ -811,8 +869,6 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         result.fragmentation_size;
   }
 
-  // Use the results of the heap simulator to create one allocation per
-  // computation, with LogicalBuffers packed to specific offsets.
   BufferAllocation* allocation = assignment->NewEmptyAllocation(
       result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true);
   for (const auto& buffer_chunk : result.chunk_map) {
@@ -820,7 +876,6 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
     const HeapSimulator::Chunk& chunk = buffer_chunk.second;
     assignment->AddAssignment(allocation, buffer, chunk.offset, chunk.size);
   }
-  return Status::OK();
 }
 
 // Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining
@@ -881,40 +936,152 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   }
 }
 
+// Conceptually the same as AddSetToColocatedBufferSets, but specific to the
+// colocated buffers for while instructions. 'colocated_set' contains the
+// buffers for a single while instruction that must be colocated. The idea here
+// is to apply a memory-saving heuristic for separate while instructions whose
+// buffers are disjoint in liveness, by using the colocation mechanism to force
+// buffer sharing. This often reduces memory for multi-layer RNNs.
+//
+// TODO(b/32491382): We should be able to remove this heuristic after we
+// implement module-level liveness analysis, which would let us directly detect
+// buffer sharing opportunities between the while instruction buffer and the
+// buffers from the predicate and body computation, as well as sharing across
+// different while instructions.
+void BufferAssigner::AddWhileSetToColocatedBufferSets(
+    const std::vector<const LogicalBuffer*>& colocated_set,
+    const LogicalBuffer* while_init_buffer, const HloInstruction* while_hlo,
+    const HloComputation& computation, const BufferLiveness& buffer_liveness,
+    std::vector<ColocatedBufferSet>* colocated_buffer_sets) {
+  CHECK(!colocated_set.empty());
+  const TuplePointsToAnalysis& points_to_analysis =
+      buffer_liveness.points_to_analysis();
+
+  // Parallel while loops cannot safely share colocated buffer sets.
+  if (buffer_liveness.hlo_ordering().SequentialOrder(computation) == nullptr) {
+    AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
+    return;
+  }
+
+  // Scan 'colocated_buffer_sets' in reverse order for locality; colocated sets
+  // are added in postorder over computations and instructions.
+  const int64 init_buffer_size = buffer_size_(*while_init_buffer);
+  for (int i = colocated_buffer_sets->size() - 1; i >= 0; --i) {
+    const ColocatedBufferSet& predecessor_set = (*colocated_buffer_sets)[i];
+
+    // Skip predecessor sets not associated with while loops.
+    if (std::all_of(predecessor_set.begin(), predecessor_set.end(),
+                    [](const LogicalBuffer* buffer) {
+                      return buffer->instruction()->opcode() !=
+                             HloOpcode::kWhile;
+                    })) {
+      continue;
+    }
+
+    // Skip predecessor sets already associated with 'while_hlo'.
+    if (std::any_of(predecessor_set.begin(), predecessor_set.end(),
+                    [&while_hlo](const LogicalBuffer* buffer) {
+                      return buffer->instruction() == while_hlo;
+                    })) {
+      continue;
+    }
+
+    // Build vector of predecessor while result and init buffers, which are
+    // checked for liveness interference below. We must check both the result
+    // and init buffers because they're aliased together, but
+    // TuplePointsToAnalysis is unaware of this aliasing.
+    std::vector<const LogicalBuffer*> predecessor_while_buffers;
+    for (const LogicalBuffer* buffer : predecessor_set) {
+      const HloInstruction* instruction = buffer->instruction();
+      if (instruction->opcode() == HloOpcode::kWhile &&
+          buffer_size_(*buffer) == init_buffer_size &&
+          instruction->parent() == &computation) {
+        predecessor_while_buffers.push_back(buffer);
+        // Add the init buffer at the same index, which must also exist in the
+        // predecessor set, and must be unambiguous.
+        const PointsToSet& init_points_to =
+            points_to_analysis.GetPointsToSet(instruction->operand(0));
+        const std::vector<const LogicalBuffer*>& init_buffers =
+            init_points_to.element(buffer->index());
+        CHECK_EQ(init_buffers.size(), 1);
+        CHECK_GT(predecessor_set.count(init_buffers[0]), 0);
+        predecessor_while_buffers.push_back(init_buffers[0]);
+      }
+    }
+    if (predecessor_while_buffers.empty()) {
+      continue;
+    }
+
+    // Skip predecessor set if the live range of any predecessor buffers
+    // overlaps with 'while_init_buffer'. Note that tuple element buffer
+    // forwarding can cause the same buffer to appear on both sides of the
+    // interference comparison below.
+    if (std::any_of(
+            predecessor_while_buffers.begin(), predecessor_while_buffers.end(),
+            [while_init_buffer, &buffer_liveness](const LogicalBuffer* buffer) {
+              return while_init_buffer->id() != buffer->id() &&
+                     buffer_liveness.MayInterfere(*while_init_buffer, *buffer);
+            })) {
+      continue;
+    }
+
+    // All our checks have passed; merge 'predecessor_set' with 'colocated_set',
+    // and add the merged set to 'colocated_buffer_sets'. This forces the
+    // colocation of buffers across different while instructions.
+    FlatSet<const LogicalBuffer*> unique;
+    unique.insert(predecessor_set.begin(), predecessor_set.end());
+    unique.insert(colocated_set.begin(), colocated_set.end());
+    std::vector<const LogicalBuffer*> merged_set(unique.begin(), unique.end());
+    AddSetToColocatedBufferSets(merged_set, colocated_buffer_sets);
+    return;
+  }
+
+  // Failed to merge into predecessor set; add 'colocated_set' as-is.
+  AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
+}
+
 namespace {
+
 // Checks that points-to set of 'instruction' is unambiguous and distinct
 // (ensured by CopyInsertion), then adds the buffer from the points-to set at
 // 'index' to 'colocated_set'.
-void AddBufferToColocatedSet(const HloInstruction* instruction,
-                             const ShapeIndex& index,
-                             const TuplePointsToAnalysis& points_to_analysis,
-                             std::vector<const LogicalBuffer*>* colocated_set) {
+const LogicalBuffer* AddBufferToColocatedSet(
+    const HloInstruction* instruction, const ShapeIndex& index,
+    const TuplePointsToAnalysis& points_to_analysis,
+    std::vector<const LogicalBuffer*>* colocated_set) {
   // CopyInsertion ensures root points-to set is unambiguous and distinct.
   const auto& points_to = points_to_analysis.GetPointsToSet(instruction);
   CHECK(!points_to.IsAmbiguous());
   CHECK(points_to.IsDistinct());
   colocated_set->push_back(points_to.element(index)[0]);
+  return colocated_set->back();
 }
+
 }  // namespace
 
 // Builds sets of buffers in 'colocated_buffer_sets' which should be colocated
 // in the same allocation (currently just supports kWhile and kCall).
 void BufferAssigner::BuildColocatedBufferSets(
-    const HloModule* module, const TuplePointsToAnalysis& points_to_analysis,
+    const HloModule* module, const BufferLiveness& buffer_liveness,
     std::vector<ColocatedBufferSet>* colocated_buffer_sets) {
-  for (auto& computation : module->computations()) {
-    for (auto& instruction : computation->instructions()) {
+  const TuplePointsToAnalysis& points_to_analysis =
+      buffer_liveness.points_to_analysis();
+  for (const HloComputation* computation : module->MakeComputationPostOrder()) {
+    for (const HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
       const HloOpcode opcode = instruction->opcode();
       if (opcode == HloOpcode::kWhile) {
-        HloInstruction* while_hlo = instruction.get();
+        const HloInstruction* while_hlo = instruction;
         TF_CHECK_OK(ShapeUtil::ForEachSubshape(
             while_hlo->shape(),
-            [this, while_hlo, &points_to_analysis, colocated_buffer_sets](
-                const Shape& /*subshape*/, const ShapeIndex& index) {
+            [this, while_hlo, &points_to_analysis, &buffer_liveness,
+             computation, colocated_buffer_sets](const Shape& /*subshape*/,
+                                                 const ShapeIndex& index) {
               std::vector<const LogicalBuffer*> colocated_set;
               // Add while.init.
-              AddBufferToColocatedSet(while_hlo->operand(0), index,
-                                      points_to_analysis, &colocated_set);
+              auto* init_buffer =
+                  AddBufferToColocatedSet(while_hlo->operand(0), index,
+                                          points_to_analysis, &colocated_set);
               // Add while.result.
               AddBufferToColocatedSet(while_hlo, index, points_to_analysis,
                                       &colocated_set);
@@ -930,12 +1097,15 @@ void BufferAssigner::BuildColocatedBufferSets(
               AddBufferToColocatedSet(
                   while_hlo->while_body()->root_instruction(), index,
                   points_to_analysis, &colocated_set);
-              AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
+              AddWhileSetToColocatedBufferSets(
+                  colocated_set, init_buffer, while_hlo, *computation,
+                  buffer_liveness, colocated_buffer_sets);
               return Status::OK();
             }));
       } else if (opcode == HloOpcode::kCall) {
-        HloInstruction* call_hlo = instruction.get();
-        HloInstruction* root_hlo = call_hlo->to_apply()->root_instruction();
+        const HloInstruction* call_hlo = instruction;
+        const HloInstruction* root_hlo =
+            call_hlo->to_apply()->root_instruction();
         TF_CHECK_OK(ShapeUtil::ForEachSubshape(
             call_hlo->shape(),
             [this, call_hlo, root_hlo, &points_to_analysis,
@@ -961,8 +1131,8 @@ void BufferAssigner::BuildColocatedBufferSets(
 void BufferAssigner::AssignColocatedBufferSets(
     const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
     BufferAssignment* assignment,
-    tensorflow::gtl::FlatSet<const LogicalBuffer*>* colocated_buffers,
-    tensorflow::gtl::FlatSet<BufferAllocation::Index>* colocated_allocations) {
+    FlatSet<const LogicalBuffer*>* colocated_buffers,
+    FlatSet<BufferAllocation::Index>* colocated_allocations) {
   for (const ColocatedBufferSet& colocated_buffer_set : colocated_buffer_sets) {
     BufferAllocation* allocation = nullptr;
     for (const LogicalBuffer* buffer : colocated_buffer_set) {
@@ -980,40 +1150,33 @@ void BufferAssigner::AssignColocatedBufferSets(
                                   buffer_size_(*buffer));
       }
       colocated_buffers->insert(buffer);
+
+      // Each entry parameter must reside in its own BufferAllocation. As a
+      // result, it doesn't make sense for entry parameters to appear in a
+      // colocated buffer set, since the only correct scenario would be a
+      // degenerate colocated set that only contains the entry parameter.
+      const HloInstruction* instruction = buffer->instruction();
+      const HloComputation* computation = instruction->parent();
+      const bool is_entry_parameter =
+          instruction->opcode() == HloOpcode::kParameter &&
+          computation == computation->parent()->entry_computation();
+      CHECK(!is_entry_parameter)
+          << "allocation: " << *allocation << " instruction: " << *buffer << " "
+          << instruction->ToString();
     }
   }
 }
 
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
-    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-    const std::vector<const HloInstruction*>* hlos_to_allocate) {
+    const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<BufferLiveness> liveness,
                       BufferLiveness::Run(module, std::move(hlo_ordering)));
 
-  std::vector<const HloComputation*> thread_local_computations;
-  std::vector<const HloComputation*> global_computations;
   VLOG(1) << "Assigning buffers to module " << module->name();
-  if (hlos_to_allocate != nullptr) {
-    VLOG(3) << "LogicalBuffer assignment restricted to hlos: ";
-    for (auto hlo : *hlos_to_allocate) {
-      VLOG(3) << "  " << hlo->parent()->name() << "::" << hlo->name();
-    }
-  }
-  XLA_VLOG_LINES(3, module->ToString());
+  XLA_VLOG_LINES(2, module->ToString());
   XLA_VLOG_LINES(3, liveness->ToString());
   XLA_VLOG_LINES(3, liveness->points_to_analysis().ToString());
 
-  TF_RETURN_IF_ERROR(GatherComputationsByAllocationType(
-      module, &thread_local_computations, &global_computations));
-
-  // Set of HLO's to allocate if hlos_to_allocate is given. Passed as a set to
-  // AssignBuffersForComputation for fast membership testing.
-  std::unique_ptr<tensorflow::gtl::FlatSet<const HloInstruction*>> hlo_set;
-  if (hlos_to_allocate != nullptr) {
-    hlo_set = MakeUnique<tensorflow::gtl::FlatSet<const HloInstruction*>>(
-        hlos_to_allocate->begin(), hlos_to_allocate->end());
-  }
-
   // Can't use MakeUnique because BufferAssignment constructor is private.
   std::unique_ptr<BufferAssignment> assignment(
       new BufferAssignment(module, std::move(liveness), alignment_));
@@ -1022,26 +1185,46 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   // Once b/32491382 enables module-level liveness analysis, we may be able
   // to assign colocated buffers (or at least reuse their allocation for
   // buffers outside of the set) in AssignBuffersForComputation.
-  tensorflow::gtl::FlatSet<const LogicalBuffer*> colocated_buffers;
-  tensorflow::gtl::FlatSet<BufferAllocation::Index> colocated_allocations;
-  if (colocate_related_buffers_) {
-    std::vector<ColocatedBufferSet> colocated_buffer_sets;
-    BuildColocatedBufferSets(module, assignment->points_to_analysis(),
-                             &colocated_buffer_sets);
-    AssignColocatedBufferSets(colocated_buffer_sets, assignment.get(),
-                              &colocated_buffers, &colocated_allocations);
-  }
+  FlatSet<const LogicalBuffer*> colocated_buffers;
+  FlatSet<BufferAllocation::Index> colocated_allocations;
+  std::vector<ColocatedBufferSet> colocated_buffer_sets;
+  BuildColocatedBufferSets(module, assignment->liveness(),
+                           &colocated_buffer_sets);
+  AssignColocatedBufferSets(colocated_buffer_sets, assignment.get(),
+                            &colocated_buffers, &colocated_allocations);
+
+  std::vector<const HloComputation*> thread_local_computations;
+  std::vector<const HloComputation*> global_computations;
+  TF_RETURN_IF_ERROR(GatherComputationsByAllocationType(
+      module, &thread_local_computations, &global_computations));
 
+  // First assign buffers for global computatations. Temporary buffers for
+  // sequential computations are collected in 'buffers_to_assign_sequentially'.
+  FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>
+      buffers_to_assign_sequentially;
   for (auto* computation : global_computations) {
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, /*is_thread_local=*/false, hlo_set.get(),
-        colocated_buffers, colocated_allocations, assignment.get()));
+        computation, /*is_thread_local=*/false, colocated_buffers,
+        colocated_allocations, &buffers_to_assign_sequentially,
+        assignment.get()));
   }
+  // Assign buffers with sequential ordering, if any. If all global computations
+  // are sequential, we can run heap simuation on the whole module, which
+  // reduces memory usage.
+  const bool run_whole_module_heap_simulation =
+      buffers_to_assign_sequentially.size() == global_computations.size();
+  TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
+      buffers_to_assign_sequentially, run_whole_module_heap_simulation,
+      assignment.get()));
+
+  // Now assign buffers for thread-local computations. All LogicalBuffers get
+  // their own BufferAllocation.
   for (auto* computation : thread_local_computations) {
     TF_RET_CHECK(computation != module->entry_computation());
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, /*is_thread_local=*/true, hlo_set.get(), colocated_buffers,
-        colocated_allocations, assignment.get()));
+        computation, /*is_thread_local=*/true, colocated_buffers,
+        colocated_allocations, /*buffers_to_assign_sequentially=*/nullptr,
+        assignment.get()));
   }
 
   // Mark all buffers which may be live out of the entry computation as
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index b82acb19b3488884bdc8d2d5c4a1524ac165676a..4b8b2cb9c4b5c27ad48e00e7f73635ffd6207882 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
+#include "tensorflow/compiler/xla/service/heap_simulator.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -308,6 +309,9 @@ class BufferAssignment {
     return liveness_->points_to_analysis();
   }
 
+  // Returns the BufferLiveness object used to construct this assignment.
+  const BufferLiveness& liveness() const { return *liveness_; }
+
   string ToString() const;
 
   // Statistics for the assignment.  Values initialized to -1 are not always
@@ -354,8 +358,8 @@ class BufferAssignment {
   void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer,
                      int64 offset, int64 size);
 
-  // Returns the BufferLiveness object used to construct this assignment.
-  const BufferLiveness& liveness() { return *liveness_; }
+  // Returns the HloModule used to construct this assignment.
+  const HloModule& module() { return *module_; }
 
   // Convenience function which returns the PointsToSet for the given
   // instruction. Extracted from the liveness object.
@@ -396,58 +400,55 @@ class BufferAssigner {
   // Build and return a BufferAssignment for the given module. The given
   // HloOrdering is used to determine buffer liveness. buffer_size is a function
   // which returns the size of a LogicalBuffer. Alignment is the the minimum
-  // alignment of any buffer. If hlos_to_allocate is not null then only
-  // instructions in this vector are considered for buffer assignment. If
-  // hlos_to_allocate is null then all instructions are considered. If
-  // 'colocate_related_buffers' is true, related LogicalBuffers will be
-  // colocated in the same allocation (i.e buffers for while result will share
-  // an allocation with buffers related to that same while instruction: init
-  // operand, condition/body parameter and body result).
+  // alignment of any buffer. allow_input_output_aliasing specifies whether
+  // input buffer are allowed to be reused as outbut buffers by the client code.
   static StatusOr<std::unique_ptr<BufferAssignment>> Run(
       const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
       LogicalBuffer::SizeFunction buffer_size, int64 alignment,
-      bool colocate_related_buffers,
-      const std::vector<const HloInstruction*>* hlos_to_allocate = nullptr);
-
-  // Overload of Run which uses ShapeUtil::ByteSizeOf to determine buffer size
-  // and assigns buffers to all HLO instructions in the module.
-  static StatusOr<std::unique_ptr<BufferAssignment>> Run(
-      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-      LogicalBuffer::SizeFunction buffer_size, int64 alignment);
+      bool allow_input_output_aliasing = false);
 
  private:
-  explicit BufferAssigner(LogicalBuffer::SizeFunction buffer_size,
-                          int64 alignment, bool colocate_related_buffers)
+  BufferAssigner(LogicalBuffer::SizeFunction buffer_size, int64 alignment,
+                 bool allow_input_output_aliasing)
       : buffer_size_(std::move(buffer_size)),
         alignment_(alignment),
-        colocate_related_buffers_(colocate_related_buffers) {}
+        allow_input_output_aliasing_(allow_input_output_aliasing) {}
   virtual ~BufferAssigner() = default;
 
   // Create a buffer assignment.
   StatusOr<std::unique_ptr<BufferAssignment>> CreateAssignment(
-      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
-      const std::vector<const HloInstruction*>* hlos_to_allocate = nullptr);
+      const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering);
 
   // Assigns buffers to the instructions in the given computation. "assignment"
   // is modified to reflect the new buffer assignments. If is_thread_local is
   // true, then all assigned buffers have the is_thread_local flag set to
-  // true. If hlos_to_allocate is not null it indicates which HLOs to include in
-  // buffer assignment. If null, all instructions in the computation are
-  // included.
+  // true.
   Status AssignBuffersForComputation(
       const HloComputation* computation, bool is_thread_local,
-      const tensorflow::gtl::FlatSet<const HloInstruction*>* hlos_to_allocate,
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
       const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
           colocated_allocations,
+      tensorflow::gtl::FlatMap<const HloComputation*,
+                               tensorflow::gtl::FlatSet<const LogicalBuffer*>>*
+          buffers_to_assign_sequentially,
       BufferAssignment* assignment);
 
-  // Assigns 'buffers_to_assign' assuming the HLO instructions will be executed
-  // in the given 'sequential_order'.
+  // Assigns 'buffers_to_assign_sequentially' using heap simulation, assuming
+  // the HLO instructions will be executed in the sequential order given by
+  // assignment->liveness().hlo_ordering().SequentialOrder. If
+  // 'run_whole_module_heap_simulation' is true, the heap simulation will be run
+  // assuming all global computations are sequentially ordered.
   Status AssignBuffersWithSequentialOrdering(
-      const std::vector<const HloInstruction*>& sequential_order,
-      const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers_to_assign,
-      const HloComputation& computation, BufferAssignment* assignment);
+      const tensorflow::gtl::FlatMap<
+          const HloComputation*,
+          tensorflow::gtl::FlatSet<const LogicalBuffer*>>&
+          buffers_to_assign_sequentially,
+      bool run_whole_module_heap_simulation, BufferAssignment* assignment);
+
+  // Uses the results of the heap simulator to create a single allocation, with
+  // LogicalBuffers packed to specific offsets.
+  void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result,
+                                      BufferAssignment* assignment);
 
   // Tries to assign the given instruction to the given buffer. Returns if the
   // assignment was successful.
@@ -465,7 +466,7 @@ class BufferAssigner {
   // ColocatedBufferSet aggregates a set of related LogicalBuffers from 'module'
   // which should be colocated in the same buffer allocation.
   void BuildColocatedBufferSets(
-      const HloModule* module, const TuplePointsToAnalysis& points_to_analysis,
+      const HloModule* module, const BufferLiveness& buffer_liveness,
       std::vector<ColocatedBufferSet>* colocated_buffer_sets);
 
   // For each buffer set in 'colocated_buffer_sets', assigns all buffers in the
@@ -482,7 +483,13 @@ class BufferAssigner {
       const std::vector<const LogicalBuffer*>& colocated_set,
       std::vector<ColocatedBufferSet>* colocated_buffer_sets);
 
-  const HloModule* module_;
+  // Conceptually the same as AddSetToColocatedBufferSets, but specific to the
+  // colocated buffers for while instructions.
+  void AddWhileSetToColocatedBufferSets(
+      const std::vector<const LogicalBuffer*>& colocated_set,
+      const LogicalBuffer* while_init_buffer, const HloInstruction* while_hlo,
+      const HloComputation& computation, const BufferLiveness& buffer_liveness,
+      std::vector<ColocatedBufferSet>* colocated_buffer_sets);
 
   // Function which returns the buffer size for a given logical buffer (shape).
   LogicalBuffer::SizeFunction buffer_size_;
@@ -490,8 +497,9 @@ class BufferAssigner {
   // Minimum alignment of any buffer.
   int64 alignment_;
 
-  // Indicates whether related buffers should share the same buffer allocation.
-  const bool colocate_related_buffers_;
+  // If true, buffer assignments assumes that input parameter buffers and output
+  // buffers can be shared if their sizes match.
+  bool allow_input_output_aliasing_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner);
 };
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index bb7342d5081af32c9882311af8dddf08c115becc..ac1d769010c55ee4430554abe3205391bee5ebf1 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -22,12 +22,18 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -208,30 +214,6 @@ class BufferAssignmentTest : public HloTestBase {
     return total_size;
   }
 
-  // Returns true if the buffers assigned to instructions in "a" are distinct
-  // from the buffers assigned to those in "b" (ie, intersection is empty).
-  bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
-                       const std::vector<const HloInstruction*>& b,
-                       const BufferAssignment& assignment) {
-    std::set<BufferAllocation::Slice> a_slices;
-    for (const HloInstruction* instruction : a) {
-      if (assignment.HasTopLevelAllocation(instruction)) {
-        a_slices.insert(
-            assignment.GetUniqueTopLevelSlice(instruction).ConsumeValueOrDie());
-      }
-    }
-
-    for (const HloInstruction* instruction : b) {
-      if (assignment.HasTopLevelAllocation(instruction)) {
-        if (a_slices.count(assignment.GetUniqueTopLevelSlice(instruction)
-                               .ConsumeValueOrDie())) {
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
   // Computation tracker for nested computations.
   ComputationTracker computation_tracker_;
 
@@ -246,6 +228,30 @@ class BufferAssignmentTest : public HloTestBase {
   Shape t_s32_f32v10_ = ShapeUtil::MakeTupleShape({s32_, f32vec10_});
 };
 
+// Returns true if the buffers assigned to instructions in "a" are distinct
+// from the buffers assigned to those in "b" (ie, intersection is empty).
+static bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
+                            const std::vector<const HloInstruction*>& b,
+                            const BufferAssignment& assignment) {
+  std::set<BufferAllocation::Slice> a_slices;
+  for (const HloInstruction* instruction : a) {
+    if (assignment.HasTopLevelAllocation(instruction)) {
+      a_slices.insert(
+          assignment.GetUniqueTopLevelSlice(instruction).ConsumeValueOrDie());
+    }
+  }
+
+  for (const HloInstruction* instruction : b) {
+    if (assignment.HasTopLevelAllocation(instruction)) {
+      if (a_slices.count(assignment.GetUniqueTopLevelSlice(instruction)
+                             .ConsumeValueOrDie())) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 // Tests a computation consisting of a single scalar constant node.
 TEST_F(BufferAssignmentTest, ScalarConstant) {
   auto builder = HloComputation::Builder(TestName());
@@ -850,8 +856,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) {
   EXPECT_FALSE(map_root_alloc.maybe_live_out());
   EXPECT_TRUE(map_root_alloc.is_thread_local());
 
-  // Allocations for the call computation should not be thread-local and not
-  // live-out.
+  // Allocations for the call computation should not be thread-local.
   auto& call_param_alloc = GetTopLevelAllocation(*assignment, call_param);
   EXPECT_FALSE(call_param_alloc.is_entry_computation_parameter());
   EXPECT_FALSE(call_param_alloc.maybe_live_out());
@@ -859,7 +864,6 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) {
 
   auto& call_root_alloc = GetTopLevelAllocation(*assignment, call_root);
   EXPECT_FALSE(call_root_alloc.is_entry_computation_parameter());
-  EXPECT_FALSE(call_root_alloc.maybe_live_out());
   EXPECT_FALSE(call_root_alloc.is_thread_local());
 
   // Entry computation allocations can be marked liveout and
@@ -1144,12 +1148,12 @@ TEST_F(BufferAssignmentTest, AmbiguousBufferAsOutput) {
   // should include the slices of both of the elements in the parameters.
   auto element_slices = assignment->GetAllSlices(select, /*index=*/{0});
   EXPECT_EQ(2, element_slices.size());
-  EXPECT_MATCH(testing::SetToVec<BufferAllocation::Slice>(element_slices),
-               testing::UnorderedMatcher<BufferAllocation::Slice>(
-                   assignment->GetUniqueSlice(tuple_param0, /*index=*/{0})
-                       .ConsumeValueOrDie(),
-                   assignment->GetUniqueSlice(tuple_param1, /*index=*/{0})
-                       .ConsumeValueOrDie()));
+  EXPECT_THAT(element_slices,
+              ::testing::UnorderedElementsAre(
+                  assignment->GetUniqueSlice(tuple_param0, /*index=*/{0})
+                      .ConsumeValueOrDie(),
+                  assignment->GetUniqueSlice(tuple_param1, /*index=*/{0})
+                      .ConsumeValueOrDie()));
 }
 
 // TODO(b/34669761): Remove this test when buffers are allowed to share
@@ -1245,6 +1249,257 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
   }
 }
 
-}  // namespace
+class WhileBufferAssignmentTest : public HloTestBase {
+ protected:
+  std::unique_ptr<HloComputation> BuildWhileConditionComputation(
+      const string& name) {
+    auto builder = HloComputation::Builder(name);
+    builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+    auto zero = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(0)));
+    auto ten = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>(10)));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, zero, ten));
+    return builder.Build();
+  }
+
+  std::unique_ptr<HloComputation> BuildWhileBodyComputation(
+      const string& name) {
+    auto builder = HloComputation::Builder(name);
+    auto loop_state = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+    auto input = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 0));
+    auto weights = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1));
+    auto output = builder.AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kMultiply, input, weights));
+    builder.AddInstruction(
+        HloInstruction::CreateTuple({input, weights, output}));
+    return builder.Build();
+  }
+
+  std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
+                                                        int64 alignment = 1) {
+    auto sequence =
+        CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie();
+    return BufferAssigner::Run(
+               module, MakeUnique<SequentialHloOrdering>(module, sequence),
+               ByteSizeOf, alignment)
+        .ConsumeValueOrDie();
+  }
+
+  static int64 ByteSizeOf(const LogicalBuffer& buffer) {
+    return ShapeUtil::ByteSizeOf(buffer.shape(), sizeof(void*));
+  }
+
+  Shape data_shape_ = ShapeUtil::MakeShape(F32, {4});
+  Shape loop_state_shape_ =
+      ShapeUtil::MakeTupleShape({data_shape_, data_shape_, data_shape_});
+};
 
+static void RunCopyInsertion(HloModule* module) {
+  CopyInsertion copy_insertion;
+  EXPECT_IS_OK(copy_insertion.Run(module).status());
+}
+
+TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
+  auto module = MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder("entry");
+
+  auto input0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape_, "input0"));
+  auto weights0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
+  auto weights1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, data_shape_, "weights1"));
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+  auto output0 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+  auto output1 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+
+  auto cond0 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body0 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple0 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output0}));
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, tuple0));
+
+  auto cond1 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body1 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+  auto input1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while0, 2));
+  auto tuple1 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input1, weights1, output1}));
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1));
+
+  module->AddEntryComputation(builder.Build());
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
+
+  // While instruction 'while0' has no predecessor while instructions with
+  // which to share allocations.
+
+  // While instruction 'while1' can share allocations with the following
+  // buffers:
+  // *) while0[2], while1[0]
+  // *) while0[1], while1[1]
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {2}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {0}).ConsumeValueOrDie());
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {1}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {1}).ConsumeValueOrDie());
+}
+
+TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
+  auto module = MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder("entry");
+
+  auto input0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape_, "input0"));
+  auto weights0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+  auto output0 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+  auto output1 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+
+  auto cond0 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body0 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple0 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output0}));
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, tuple0));
+
+  auto cond1 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body1 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple1 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output1}));
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1));
+
+  module->AddEntryComputation(builder.Build());
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
+
+  // while0 and while1 buffers should be completely aligned.
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {0}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {0}).ConsumeValueOrDie());
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {1}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {1}).ConsumeValueOrDie());
+  EXPECT_EQ(assignment->GetUniqueSlice(while0, {2}).ConsumeValueOrDie(),
+            assignment->GetUniqueSlice(while1, {2}).ConsumeValueOrDie());
+}
+
+TEST_F(BufferAssignmentTest, TwoCalls) {
+  auto module = MakeUnique<HloModule>(TestName());
+  Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {});
+  HloComputation* sub_computation;
+  {
+    auto builder = HloComputation::Builder(TestName() + "_sub_comp");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, r0f32, "param"));
+    auto constant1 = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+    auto add = builder.AddInstruction(
+        HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param, constant1));
+    sub_computation = module->AddEmbeddedComputation(builder.Build(add));
+  }
+  auto builder = HloComputation::Builder(TestName());
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
+  auto constant3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(3.0)));
+  auto call1 = builder.AddInstruction(
+      HloInstruction::CreateCall(r0f32, {constant2}, sub_computation));
+  auto call2 = builder.AddInstruction(
+      HloInstruction::CreateCall(r0f32, {constant3}, sub_computation));
+  auto add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, call1, constant2));
+  auto add2 = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, call2, add1));
+  module->AddEntryComputation(builder.Build(add2));
+
+  {
+    FlattenCallGraph flatten;
+    TF_ASSIGN_OR_ASSERT_OK(bool result, flatten.Run(module.get()));
+    EXPECT_TRUE(result);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  }
+
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
+
+  EXPECT_TRUE(BuffersDistinct({call1}, {call2}, *assignment));
+}
+
+// Test buffer assignment for while nodes with multiple uses.
+// TODO(b/37245345): Fix buffer assignment for this case.
+TEST_F(WhileBufferAssignmentTest, DISABLED_TwoWhiles) {
+  auto module = MakeUnique<HloModule>(TestName());
+  auto builder = HloComputation::Builder(TestName());
+
+  auto input0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape_, "input0"));
+  auto weights0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
+
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+  auto output0 = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
+
+  auto cond0 =
+      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
+  auto body0 =
+      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
+
+  auto tuple0 = builder.AddInstruction(
+      HloInstruction::CreateTuple({input0, weights0, output0}));
+  auto while0 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, tuple0));
+  auto while1 = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, while0));
+
+  auto get0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while0, 2));
+  auto get1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while1, 2));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, get0, get1));
+  module->AddEntryComputation(builder.Build());
+
+  RunCopyInsertion(module.get());
+
+  {
+    FlattenCallGraph flatten;
+    TF_ASSIGN_OR_ASSERT_OK(bool result, flatten.Run(module.get()));
+    EXPECT_TRUE(result);
+  }
+
+  auto assignment = RunBufferAssignment(module.get());
+
+  EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
+}
+
+}  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc
index 736f227aa423120ecb4a5e82824defac2d345b2e..d69a84cd0e3ffffad32b89afc726a31b175e47c5 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness.cc
@@ -45,9 +45,7 @@ StatusOr<std::unique_ptr<BufferLiveness>> BufferLiveness::Run(
 }
 
 tensorflow::Status BufferLiveness::Analyze() {
-  TF_ASSIGN_OR_RETURN(points_to_analysis_,
-                      TuplePointsToAnalysis::Run(
-                          module_, /*include_loop_fusion_instructions=*/true));
+  TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_));
   for (auto& computation : module_->computations()) {
     // Gather all instructions whose buffers might alias other instructions into
     // the set aliased_buffers_.  This includes those contained as a tuple
@@ -117,11 +115,7 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
 
   // If 'b' is a user of 'a' then the buffers interfere unless 'a.instruction'
   // and 'b.instruction' emit the same shape/layout, and 'b.instruction' meets
-  // one of following qualifications:
-  // *) Is element-wise.
-  // *) Is a loop fusion instruction (with DynamicUpdateSlice fused root) where
-  //    the singleton use of 'a' at 'a.index' is the fused root at operand 0.
-  // *) Use of 'operand' is DynamicUpdateSlice at operand index 0.
+  // the qualifications specified in CanShareOperandBufferWithUser.
   for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) {
     if (b.instruction()->IsUserOf(alias.instruction()) &&
         !CanShareOperandBufferWithUser(alias.instruction(), alias.index(),
@@ -133,10 +127,30 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
   return true;
 }
 
+namespace {
+bool IsEntryParameter(const HloInstruction* instruction) {
+  const HloComputation* computation = instruction->parent();
+  return instruction->opcode() == HloOpcode::kParameter &&
+         computation == computation->parent()->entry_computation();
+}
+}  // namespace
+
 bool BufferLiveness::MayInterfere(const LogicalBuffer& a,
                                   const LogicalBuffer& b) const {
-  return (!live_range_strictly_before(a, b) &&
-          !live_range_strictly_before(b, a));
+  // Entry parameters live at the entry of the execution, thus always interfere
+  // with all other instructions executing before them in the ordering.
+  const HloInstruction* a_instruction = a.instruction();
+  const HloInstruction* b_instruction = b.instruction();
+  if (IsEntryParameter(a_instruction) &&
+      hlo_ordering_->ExecutesBefore(b_instruction, a_instruction)) {
+    return true;
+  }
+  if (IsEntryParameter(b_instruction) &&
+      hlo_ordering_->ExecutesBefore(a_instruction, b_instruction)) {
+    return true;
+  }
+  // Buffers without disjoint liveness may interfere.
+  return !live_range_strictly_before(a, b) && !live_range_strictly_before(b, a);
 }
 
 bool BufferLiveness::MaybeLiveOut(const LogicalBuffer& buffer) const {
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index e71b98298b344b5689785bfa67a8bea54e0248e3..bee9a351f5df00aea6178fab4fd0e222ff9e9a99 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -92,6 +92,12 @@ class BufferLivenessTest : public HloTestBase {
         GetBuffer(liveness, instruction, /*index=*/{}));
   }
 
+  std::unique_ptr<HloComputation> BuildDummyComputation() {
+    auto builder = HloComputation::Builder(TestName() + "_dummy");
+    builder.AddInstruction(HloInstruction::CreateParameter(0, vec_, "param"));
+    return builder.Build();
+  }
+
   const Shape vec_ = ShapeUtil::MakeShape(xla::F32, {42});
 };
 
@@ -118,12 +124,17 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
                           MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
-  // No buffers should interfere.
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, log));
+
+  // No buffers should interfere.
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, log));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, negate));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, log));
-  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, log));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, log, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, log, exp));
 
   // Buffers should interfere with itself.
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, exp, exp));
@@ -135,18 +146,69 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
   EXPECT_TRUE(InstructionMaybeLiveOut(*liveness, log));
 }
 
+TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
+  // Two entry params, which interfere with each other.
+  //
+  // param0 --> negate ---------------\
+  //                   param1 --> exp --> add
+  auto builder = HloComputation::Builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vec_, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, vec_, "param1"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec_, HloOpcode::kNegate, param0));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec_, HloOpcode::kExp, param1));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  HloComputation* entry = module->AddEntryComputation(builder.Build());
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  sequence.insert({entry, {param0, negate, param1, exp, add}});
+  auto liveness = BufferLiveness::Run(
+                      module.get(),
+                      MakeUnique<SequentialHloOrdering>(module.get(), sequence))
+                      .ConsumeValueOrDie();
+
+  // Entry parameters interfere as if they are defined simultaneously at
+  // the very beginning.
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, param0, param1));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param0, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param0, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param0, add));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, param1, param0));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, param1, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param1, exp));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param1, add));
+
+  // Negate and exp still interfere.
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, exp, negate));
+
+  // But {negate, add} and {exp, add} don't interfere.
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, exp));
+}
+
 TEST_F(BufferLivenessTest, NonElementwiseOperand) {
-  // A chain of operations with one elementwise and one non-elementwise. The
+  // A chain of operations with two elementwise and one non-elementwise. The
   // elementwise op should not interfere with its operand, while the
-  // non-elementwise op should interfere.
+  // non-elementwise op should interfere. Entry params always interfere.
   //
-  // param --> negate -> reverse
+  // param --> exp -> negate -> reverse
   //
   auto builder = HloComputation::Builder(TestName());
   auto param =
       builder.AddInstruction(HloInstruction::CreateParameter(0, vec_, "param"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec_, HloOpcode::kExp, param));
   auto negate = builder.AddInstruction(
-      HloInstruction::CreateUnary(vec_, HloOpcode::kNegate, param));
+      HloInstruction::CreateUnary(vec_, HloOpcode::kNegate, exp));
   auto reverse =
       builder.AddInstruction(HloInstruction::CreateReverse(vec_, negate, {0}));
 
@@ -158,10 +220,14 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) {
                           MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
-  // No buffers should interfere.
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, reverse));
+
+  // Negate is elementwise, so doesn't interfere with its operand.
+  // Reverse is non-elementwise, so does interfere with its operand.
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, negate));
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, reverse));
-  EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
 }
 
 TEST_F(BufferLivenessTest, OverlappedBuffers) {
@@ -190,8 +256,15 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) {
 
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, exp));
-  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, add));
+
+  // Negate and exp interfere with each other, but not with add.
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, exp, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, exp));
 }
 
 TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
@@ -204,8 +277,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   // Sequential order:
   //  param, negate, exp, add
   //
-  // Liveness is identical to the DependencyHloOrdering except that 'param' and
-  // exp no longer interfere.
+  // Liveness is identical to the DependencyHloOrdering.
   auto builder = HloComputation::Builder(TestName());
   auto param =
       builder.AddInstruction(HloInstruction::CreateParameter(0, vec_, "param"));
@@ -229,8 +301,15 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
 
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
-  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, add));
+
+  // Negate and exp interfere with each other, but not with add.
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, negate, exp));
+  EXPECT_TRUE(InstructionsMayInterfere(*liveness, exp, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, negate, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, negate));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, exp, add));
+  EXPECT_FALSE(InstructionsMayInterfere(*liveness, add, exp));
 }
 
 TEST_F(BufferLivenessTest, TupleLiveOut) {
@@ -392,7 +471,8 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
   auto module = MakeUnique<HloModule>(TestName());
-  module->AddEntryComputation(builder.Build());
+  module->AddEntryComputation(BuildDummyComputation());
+  module->AddEmbeddedComputation(builder.Build());
 
   auto liveness =
       BufferLiveness::Run(module.get(),
@@ -452,7 +532,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
   auto module = MakeUnique<HloModule>(TestName());
-  module->AddEntryComputation(builder.Build());
+  module->AddEntryComputation(BuildDummyComputation());
+  module->AddEmbeddedComputation(builder.Build());
 
   auto liveness =
       BufferLiveness::Run(module.get(),
@@ -524,7 +605,8 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
     auto module = MakeUnique<HloModule>(TestName());
-    auto* computation = module->AddEntryComputation(builder.Build());
+    module->AddEntryComputation(BuildDummyComputation());
+    auto* computation = module->AddEmbeddedComputation(builder.Build());
     // Create fusion instruction based on number of tuple element 1 users.
     if (update_uses_tuple_element1) {
       computation->CreateFusionInstruction(
@@ -546,7 +628,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
         BufferLiveness::Run(module.get(),
                             MakeUnique<DependencyHloOrdering>(module.get()))
             .ConsumeValueOrDie();
-    // Return whether or not buffers interfernce is detected between
+    // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
     return TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1});
   }
@@ -651,13 +733,14 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
     auto module = MakeUnique<HloModule>(TestName());
-    module->AddEntryComputation(builder.Build());
+    module->AddEntryComputation(BuildDummyComputation());
+    module->AddEmbeddedComputation(builder.Build());
     // Run BufferLiveness on 'module'.
     auto liveness =
         BufferLiveness::Run(module.get(),
                             MakeUnique<DependencyHloOrdering>(module.get()))
             .ConsumeValueOrDie();
-    // Return whether or not buffers interfernce is detected between
+    // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
     return TupleElementsMayInterfere(*liveness, tuple_param0, tuple_root, {1});
   }
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index ab3eae2495ec55e8667db86b025f980157517ccc..fa7b2a309525dd80d655e10474c5d49f9da14ea8 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -51,6 +51,22 @@ std::ostream& operator<<(std::ostream& out, const CallContext& context) {
   return out;
 }
 
+CallContext GetInstructionCallContext(const HloInstruction* instruction) {
+  switch (instruction->opcode()) {
+    case HloOpcode::kCall:
+    case HloOpcode::kWhile:
+      return CallContext::kSequential;
+    case HloOpcode::kMap:
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow:
+    case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kFusion:
+      return CallContext::kParallel;
+    default:
+      return CallContext::kNone;
+  }
+}
+
 string CallSite::ToString() const {
   return StrCat(instruction()->name(), " calls in context ",
                 CallContextToString(context()), ": ",
@@ -82,32 +98,12 @@ void CallGraphNode::AddCallerCallSite(const CallSite& caller_callsite) {
   }
 }
 
-namespace {
-
-CallContext GetInstructionCallContext(const HloInstruction* instruction) {
-  switch (instruction->opcode()) {
-    case HloOpcode::kCall:
-    case HloOpcode::kWhile:
-      return CallContext::kSequential;
-    case HloOpcode::kMap:
-    case HloOpcode::kReduce:
-    case HloOpcode::kReduceWindow:
-    case HloOpcode::kSelectAndScatter:
-    case HloOpcode::kFusion:
-      return CallContext::kParallel;
-    default:
-      return CallContext::kNone;
-  }
-}
-
-}  // namespace
-
-Status CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
-  TF_RET_CHECK(instruction->parent() == computation());
+void CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
+  CHECK_EQ(instruction->parent(), computation());
   const CallContext context = GetInstructionCallContext(instruction);
   if (!instruction->called_computations().empty()) {
-    TF_RET_CHECK(context == CallContext::kSequential ||
-                 context == CallContext::kParallel);
+    CHECK(context == CallContext::kSequential ||
+          context == CallContext::kParallel);
     callsite_instructions_.insert({instruction, callsites_.size()});
     callsites_.push_back(
         CallSite(instruction, instruction->called_computations(), context));
@@ -120,22 +116,21 @@ Status CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
       }
     }
   }
-  return Status::OK();
 }
 
 CallGraph::CallGraph(const HloModule* module) : module_(module) {}
 
-StatusOr<const CallGraphNode*> CallGraph::GetNode(
+const CallGraphNode& CallGraph::GetNode(
     const HloComputation* computation) const {
   auto it = node_indices_.find(computation);
-  TF_RET_CHECK(it != node_indices_.end());
-  return &nodes_[it->second];
+  CHECK(it != node_indices_.end());
+  return nodes_[it->second];
 }
 
-StatusOr<CallGraphNode*> CallGraph::GetNode(const HloComputation* computation) {
+CallGraphNode& CallGraph::GetNode(const HloComputation* computation) {
   auto it = node_indices_.find(computation);
-  TF_RET_CHECK(it != node_indices_.end());
-  return &nodes_[it->second];
+  CHECK(it != node_indices_.end());
+  return nodes_[it->second];
 }
 
 namespace {
@@ -158,17 +153,17 @@ CallContext UnionContexts(CallContext a, CallContext b) {
 
 }  // namespace
 
-Status CallGraph::SetCallContexts() {
+void CallGraph::SetCallContexts() {
   std::queue<CallGraphNode*> worklist;
 
   // Initialize worklist with all roots of the call graph (computations without
   // callers).
   for (const std::unique_ptr<HloComputation>& computation :
        module_->computations()) {
-    TF_ASSIGN_OR_RETURN(CallGraphNode * node, GetNode(computation.get()));
-    if (node->callers().empty()) {
-      node->set_context(CallContext::kSequential);
-      worklist.push(node);
+    CallGraphNode& node = GetNode(computation.get());
+    if (node.callers().empty()) {
+      node.set_context(CallContext::kSequential);
+      worklist.push(&node);
     }
   }
 
@@ -178,7 +173,7 @@ Status CallGraph::SetCallContexts() {
 
     for (const CallSite& callsite : node->callsites()) {
       for (const HloComputation* callee : callsite.called_computations()) {
-        TF_ASSIGN_OR_RETURN(CallGraphNode * callee_node, GetNode(callee));
+        CallGraphNode& callee_node = GetNode(callee);
 
         // Update context of callee computation based on the callsite and its
         // current context.
@@ -186,16 +181,16 @@ Status CallGraph::SetCallContexts() {
         if (callsite.context() == CallContext::kParallel) {
           context_to_add = CallContext::kParallel;
         } else {
-          TF_RET_CHECK(callsite.context() == CallContext::kSequential);
+          CHECK_EQ(callsite.context(), CallContext::kSequential);
           context_to_add = node->context();
         }
         CallContext new_context =
-            UnionContexts(context_to_add, callee_node->context());
+            UnionContexts(context_to_add, callee_node.context());
 
-        if (new_context != callee_node->context()) {
+        if (new_context != callee_node.context()) {
           // Context of computation has been changed so add node to worklist.
-          callee_node->set_context(new_context);
-          worklist.push(callee_node);
+          callee_node.set_context(new_context);
+          worklist.push(&callee_node);
         }
       }
     }
@@ -204,14 +199,12 @@ Status CallGraph::SetCallContexts() {
   // No node should have a kNone calling context.
   for (const std::unique_ptr<HloComputation>& computation :
        module_->computations()) {
-    TF_ASSIGN_OR_RETURN(CallGraphNode * node, GetNode(computation.get()));
-    TF_RET_CHECK(node->context() != CallContext::kNone);
+    CHECK_NE(GetNode(computation.get()).context(), CallContext::kNone);
   }
-  return Status::OK();
 }
 
 /* static */
-StatusOr<std::unique_ptr<CallGraph>> CallGraph::Build(const HloModule* module) {
+std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
   // Constructor for CallGraph is private so MakeUnique can't be used.
   auto call_graph = WrapUnique<CallGraph>(new CallGraph(module));
 
@@ -223,56 +216,51 @@ StatusOr<std::unique_ptr<CallGraph>> CallGraph::Build(const HloModule* module) {
        module->computations()) {
     auto it_added = call_graph->node_indices_.insert(
         {computation.get(), call_graph->nodes_.size()});
-    // All computation should be unique, so the computation should not already
+    // All computations should be unique, so the computation should not already
     // exist in the map.
-    TF_RET_CHECK(it_added.second);
+    CHECK(it_added.second);
     call_graph->nodes_.emplace_back(computation.get());
 
     // Add all callsites in this computation.
     for (const std::unique_ptr<HloInstruction>& instruction :
          computation->instructions()) {
-      TF_RETURN_IF_ERROR(call_graph->nodes_.back().AddCallSiteForInstruction(
-          instruction.get()));
+      call_graph->nodes_.back().AddCallSiteForInstruction(instruction.get());
     }
   }
 
   // Add caller callsites to each node.
   for (const std::unique_ptr<HloComputation>& computation :
        module->computations()) {
-    TF_ASSIGN_OR_RETURN(CallGraphNode * caller_node,
-                        call_graph->GetNode(computation.get()));
-    for (const CallSite& callsite : caller_node->callsites()) {
+    for (const CallSite& callsite :
+         call_graph->GetNode(computation.get()).callsites()) {
       for (auto* callee : callsite.called_computations()) {
         // Add caller callsites.
-        TF_ASSIGN_OR_RETURN(CallGraphNode * callee_node,
-                            call_graph->GetNode(callee));
-        callee_node->AddCallerCallSite(callsite);
+        call_graph->GetNode(callee).AddCallerCallSite(callsite);
       }
     }
   }
 
-  TF_RETURN_IF_ERROR(call_graph->SetCallContexts());
-
+  call_graph->SetCallContexts();
   XLA_VLOG_LINES(1, call_graph->ToString());
 
-  return std::move(call_graph);
+  return call_graph;
 }
 
 Status CallGraph::VisitNodesInternal(
-    const VisitorFunction& visitor_func, const CallGraphNode* node,
+    const VisitorFunction& visitor_func, const CallGraphNode& node,
     tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const {
-  auto pair = visited->insert(node);
+  auto pair = visited->insert(&node);
   if (!pair.second) {
     // Node was not inserted. Node has already been visited.
     return Status::OK();
   }
 
-  for (const HloComputation* computation : node->callees()) {
-    TF_ASSIGN_OR_RETURN(const CallGraphNode* callee_node, GetNode(computation));
-    TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, callee_node, visited));
+  for (const HloComputation* computation : node.callees()) {
+    TF_RETURN_IF_ERROR(
+        VisitNodesInternal(visitor_func, GetNode(computation), visited));
   }
 
-  return visitor_func(*node);
+  return visitor_func(node);
 }
 
 Status CallGraph::VisitNodes(const VisitorFunction& visitor_func,
@@ -282,14 +270,13 @@ Status CallGraph::VisitNodes(const VisitorFunction& visitor_func,
     // Traverse from all roots in the call graph.
     for (const CallGraphNode& node : nodes()) {
       if (node.callers().empty()) {
-        TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, &node, &visited));
+        TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, node, &visited));
       }
     }
   } else {
     // Traverse only from the entry computation.
-    TF_ASSIGN_OR_RETURN(const CallGraphNode* entry_node,
-                        GetNode(module_->entry_computation()));
-    TF_RETURN_IF_ERROR(VisitNodesInternal(visitor_func, entry_node, &visited));
+    TF_RETURN_IF_ERROR(VisitNodesInternal(
+        visitor_func, GetNode(module_->entry_computation()), &visited));
   }
 
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index e2fed044c88008d0a7e43f0166d397627ed72267..7f9990f06d4fee4c52fa516fc2f6031f5dab2bb9 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 
@@ -54,6 +53,8 @@ enum class CallContext {
 string CallContextToString(CallContext context);
 std::ostream& operator<<(std::ostream& out, const CallContext& context);
 
+CallContext GetInstructionCallContext(const HloInstruction* instruction);
+
 // Represents an HLO instruction which calls one or more computations.
 class CallSite {
  public:
@@ -136,7 +137,7 @@ class CallGraphNode {
   // If instruction calls any computations adds a call site for this instruction
   // to the call graph node. If the instruction calls no computations then no
   // call site is added.
-  Status AddCallSiteForInstruction(HloInstruction* instruction);
+  void AddCallSiteForInstruction(HloInstruction* instruction);
 
   // Computation represented by this call graph node.
   HloComputation* computation_;
@@ -172,12 +173,11 @@ class CallGraph {
   using VisitorFunction = std::function<Status(const CallGraphNode&)>;
 
   // Builds and returns a call graph for the given HLO module.
-  static StatusOr<std::unique_ptr<CallGraph>> Build(const HloModule* module);
+  static std::unique_ptr<CallGraph> Build(const HloModule* module);
 
   // Returns the node associated with the given computation.
-  StatusOr<const CallGraphNode*> GetNode(
-      const HloComputation* computation) const;
-  StatusOr<CallGraphNode*> GetNode(const HloComputation* computation);
+  const CallGraphNode& GetNode(const HloComputation* computation) const;
+  CallGraphNode& GetNode(const HloComputation* computation);
 
   // Returns the vector of all nodes in the call graph.
   const std::vector<CallGraphNode>& nodes() const { return nodes_; }
@@ -195,14 +195,14 @@ class CallGraph {
   CallGraph(const HloModule* module);
 
   // Sets the call contexts for every node in the graph.
-  Status SetCallContexts();
+  void SetCallContexts();
 
   // Helper method for VisitNodes(). Traverses the call graph from 'node' in DFS
   // post order (callee before caller) calling visitor_func on each node. Adds
   // nodes to 'visited' as each node is visited. Skips nodes already in
   // 'visited'.
   Status VisitNodesInternal(
-      const VisitorFunction& visitor_func, const CallGraphNode* node,
+      const VisitorFunction& visitor_func, const CallGraphNode& node,
       tensorflow::gtl::FlatSet<const CallGraphNode*>* visited) const;
 
   // The HLO module represented by this call graph.
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index 65900fd4f86cd07d5d956da0df429d30fcdf7561..ab0ea47d024d871be88bfcab957810deb1ecac99 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -28,7 +29,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using testing::UnorderedMatcher;
+using ::testing::UnorderedElementsAre;
 
 class CallGraphTest : public HloTestBase {
  protected:
@@ -60,14 +61,15 @@ class CallGraphTest : public HloTestBase {
   // Build and return a computation which takes a scalar and calls (kCall) the
   // given computation with value 'callsites' number of times.
   std::unique_ptr<HloComputation> MakeCallingComputation(
-      HloComputation* map_computation, int64 callsites) {
-    HloComputation::Builder builder(TestName() + ".CallingComputation");
+      HloComputation* callee_computation, int64 callsites,
+      const string& suffix = ".CallingComputation") {
+    HloComputation::Builder builder(TestName() + suffix);
     HloInstruction* param0 = builder.AddInstruction(
         HloInstruction::CreateParameter(0, kScalarShape, "param0"));
     HloInstruction* last_value = param0;
     for (int64 i = 0; i < callsites; ++i) {
       last_value = builder.AddInstruction(HloInstruction::CreateCall(
-          kScalarShape, {last_value}, map_computation));
+          kScalarShape, {last_value}, callee_computation));
     }
     return builder.Build();
   }
@@ -93,17 +95,15 @@ TEST_F(CallGraphTest, SingletonComputation) {
   HloModule module(TestName());
   HloComputation* computation =
       module.AddEntryComputation(MakeScalarComputation());
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(1, call_graph->nodes().size());
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* node,
-                         call_graph->GetNode(computation));
-  EXPECT_EQ(computation, node->computation());
-  EXPECT_TRUE(node->callsites().empty());
-  EXPECT_TRUE(node->callees().empty());
-  EXPECT_TRUE(node->caller_callsites().empty());
-  EXPECT_TRUE(node->callers().empty());
-  EXPECT_EQ(CallContext::kSequential, node->context());
+  const CallGraphNode& node = call_graph->GetNode(computation);
+  EXPECT_EQ(computation, node.computation());
+  EXPECT_TRUE(node.callsites().empty());
+  EXPECT_TRUE(node.callees().empty());
+  EXPECT_TRUE(node.caller_callsites().empty());
+  EXPECT_TRUE(node.callers().empty());
+  EXPECT_EQ(CallContext::kSequential, node.context());
 }
 
 TEST_F(CallGraphTest, UnreachableComputation) {
@@ -115,19 +115,17 @@ TEST_F(CallGraphTest, UnreachableComputation) {
   HloComputation* unreachable_computation =
       module.AddEmbeddedComputation(MakeScalarComputation());
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(2, call_graph->nodes().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  EXPECT_EQ(entry_computation, entry_node->computation());
-  EXPECT_EQ(CallContext::kSequential, entry_node->context());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* unreachable_node,
-                         call_graph->GetNode(unreachable_computation));
-  EXPECT_EQ(unreachable_computation, unreachable_node->computation());
-  EXPECT_EQ(CallContext::kSequential, unreachable_node->context());
+  const CallGraphNode& unreachable_node =
+      call_graph->GetNode(unreachable_computation);
+  EXPECT_EQ(unreachable_computation, unreachable_node.computation());
+  EXPECT_EQ(CallContext::kSequential, unreachable_node.context());
 }
 
 TEST_F(CallGraphTest, ParallelComputation) {
@@ -136,30 +134,27 @@ TEST_F(CallGraphTest, ParallelComputation) {
   HloModule module(TestName());
   HloComputation* map_computation =
       module.AddEmbeddedComputation(MakeScalarComputation());
-  HloComputation* entry_computation = module.AddEmbeddedComputation(
+  HloComputation* entry_computation = module.AddEntryComputation(
       MakeMappingComputation(map_computation, /*callsites=*/5));
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(2, call_graph->nodes().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  EXPECT_EQ(entry_computation, entry_node->computation());
-  EXPECT_EQ(CallContext::kSequential, entry_node->context());
-  EXPECT_EQ(5, entry_node->callsites().size());
-  EXPECT_EQ(1, entry_node->callees().size());
-  EXPECT_TRUE(entry_node->caller_callsites().empty());
-  EXPECT_TRUE(entry_node->callers().empty());
-
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* map_node,
-                         call_graph->GetNode(map_computation));
-  EXPECT_EQ(map_computation, map_node->computation());
-  EXPECT_EQ(CallContext::kParallel, map_node->context());
-  EXPECT_TRUE(map_node->callsites().empty());
-  EXPECT_TRUE(map_node->callees().empty());
-  EXPECT_EQ(5, map_node->caller_callsites().size());
-  EXPECT_EQ(1, map_node->callers().size());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+  EXPECT_EQ(5, entry_node.callsites().size());
+  EXPECT_EQ(1, entry_node.callees().size());
+  EXPECT_TRUE(entry_node.caller_callsites().empty());
+  EXPECT_TRUE(entry_node.callers().empty());
+
+  const CallGraphNode& map_node = call_graph->GetNode(map_computation);
+  EXPECT_EQ(map_computation, map_node.computation());
+  EXPECT_EQ(CallContext::kParallel, map_node.context());
+  EXPECT_TRUE(map_node.callsites().empty());
+  EXPECT_TRUE(map_node.callees().empty());
+  EXPECT_EQ(5, map_node.caller_callsites().size());
+  EXPECT_EQ(1, map_node.callers().size());
 }
 
 TEST_F(CallGraphTest, SequentialComputations) {
@@ -168,30 +163,27 @@ TEST_F(CallGraphTest, SequentialComputations) {
   HloModule module(TestName());
   HloComputation* called_computation =
       module.AddEmbeddedComputation(MakeScalarComputation());
-  HloComputation* entry_computation = module.AddEmbeddedComputation(
+  HloComputation* entry_computation = module.AddEntryComputation(
       MakeCallingComputation(called_computation, /*callsites=*/3));
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(2, call_graph->nodes().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  EXPECT_EQ(entry_computation, entry_node->computation());
-  EXPECT_EQ(CallContext::kSequential, entry_node->context());
-  EXPECT_EQ(3, entry_node->callsites().size());
-  EXPECT_EQ(1, entry_node->callees().size());
-  EXPECT_TRUE(entry_node->caller_callsites().empty());
-  EXPECT_TRUE(entry_node->callers().empty());
-
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* called_node,
-                         call_graph->GetNode(called_computation));
-  EXPECT_EQ(called_computation, called_node->computation());
-  EXPECT_EQ(CallContext::kSequential, called_node->context());
-  EXPECT_TRUE(called_node->callsites().empty());
-  EXPECT_TRUE(called_node->callees().empty());
-  EXPECT_EQ(3, called_node->caller_callsites().size());
-  EXPECT_EQ(1, called_node->callers().size());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+  EXPECT_EQ(3, entry_node.callsites().size());
+  EXPECT_EQ(1, entry_node.callees().size());
+  EXPECT_TRUE(entry_node.caller_callsites().empty());
+  EXPECT_TRUE(entry_node.callers().empty());
+
+  const CallGraphNode& called_node = call_graph->GetNode(called_computation);
+  EXPECT_EQ(called_computation, called_node.computation());
+  EXPECT_EQ(CallContext::kSequential, called_node.context());
+  EXPECT_TRUE(called_node.callsites().empty());
+  EXPECT_TRUE(called_node.callees().empty());
+  EXPECT_EQ(3, called_node.caller_callsites().size());
+  EXPECT_EQ(1, called_node.callers().size());
 }
 
 TEST_F(CallGraphTest, ContextBothComputations) {
@@ -209,34 +201,31 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   HloInstruction* map = builder.AddInstruction(
       HloInstruction::CreateMap(kScalarShape, {call}, subcomputation));
   HloComputation* entry_computation =
-      module.AddEmbeddedComputation(builder.Build());
+      module.AddEntryComputation(builder.Build());
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(2, call_graph->nodes().size());
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  EXPECT_EQ(entry_computation, entry_node->computation());
-  EXPECT_EQ(2, entry_node->callsites().size());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(2, entry_node.callsites().size());
 
-  const CallSite& call_callsite = entry_node->callsites()[0];
+  const CallSite& call_callsite = entry_node.callsites()[0];
   EXPECT_EQ(call, call_callsite.instruction());
-  EXPECT_MATCH(call_callsite.called_computations(),
-               UnorderedMatcher<HloComputation*>(subcomputation));
+  EXPECT_THAT(call_callsite.called_computations(),
+              UnorderedElementsAre(subcomputation));
   EXPECT_EQ(CallContext::kSequential, call_callsite.context());
-  EXPECT_EQ(entry_node->GetCallSite(call), &call_callsite);
+  EXPECT_EQ(entry_node.GetCallSite(call), &call_callsite);
 
-  const CallSite& map_callsite = entry_node->callsites()[1];
+  const CallSite& map_callsite = entry_node.callsites()[1];
   EXPECT_EQ(map, map_callsite.instruction());
-  EXPECT_MATCH(map_callsite.called_computations(),
-               UnorderedMatcher(subcomputation));
+  EXPECT_THAT(map_callsite.called_computations(),
+              UnorderedElementsAre(subcomputation));
   EXPECT_EQ(CallContext::kParallel, map_callsite.context());
-  EXPECT_EQ(entry_node->GetCallSite(map), &map_callsite);
+  EXPECT_EQ(entry_node.GetCallSite(map), &map_callsite);
 
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* sub_node,
-                         call_graph->GetNode(subcomputation));
-  EXPECT_EQ(CallContext::kBoth, sub_node->context());
+  const CallGraphNode& sub_node = call_graph->GetNode(subcomputation);
+  EXPECT_EQ(CallContext::kBoth, sub_node.context());
 }
 
 TEST_F(CallGraphTest, ComplexGraph) {
@@ -282,27 +271,24 @@ TEST_F(CallGraphTest, ComplexGraph) {
     entry_computation = module.AddEntryComputation(builder.Build());
   }
 
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
   EXPECT_EQ(5, call_graph->nodes().size());
 
   // Entry computation has one while instruction calling two computations
   // (cond_computation and a_computation).
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* entry_node,
-                         call_graph->GetNode(entry_computation));
-  ASSERT_EQ(1, entry_node->callsites().size());
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  ASSERT_EQ(1, entry_node.callsites().size());
   const std::vector<HloComputation*>& called_computations =
-      entry_node->callsites()[0].called_computations();
-  EXPECT_MATCH(called_computations,
-               UnorderedMatcher(cond_computation, a_computation));
-  EXPECT_EQ(CallContext::kSequential, entry_node->context());
-
-  TF_ASSIGN_OR_ASSERT_OK(const CallGraphNode* c_node,
-                         call_graph->GetNode(c_computation));
-  EXPECT_TRUE(c_node->callsites().empty());
-  EXPECT_MATCH(c_node->callers(),
-               UnorderedMatcher(a_computation, b_computation));
-  EXPECT_EQ(CallContext::kBoth, c_node->context());
+      entry_node.callsites()[0].called_computations();
+  EXPECT_THAT(called_computations,
+              UnorderedElementsAre(cond_computation, a_computation));
+  EXPECT_EQ(CallContext::kSequential, entry_node.context());
+
+  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
+  EXPECT_TRUE(c_node.callsites().empty());
+  EXPECT_THAT(c_node.callers(),
+              UnorderedElementsAre(a_computation, b_computation));
+  EXPECT_EQ(CallContext::kBoth, c_node.context());
 
   // Visit the graph and verify nodes were visited in callee-before-caller
   // order.
@@ -335,15 +321,14 @@ TEST_F(CallGraphTest, VisitSingletonComputation) {
   HloModule module(TestName());
   HloComputation* computation =
       module.AddEntryComputation(MakeScalarComputation());
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
 
   std::vector<HloComputation*> visited;
   TF_ASSERT_OK(call_graph->VisitNodes([&visited](const CallGraphNode& node) {
     visited.push_back(node.computation());
     return Status::OK();
   }));
-  EXPECT_MATCH(visited, UnorderedMatcher(computation));
+  EXPECT_THAT(visited, UnorderedElementsAre(computation));
 }
 
 TEST_F(CallGraphTest, VisitUnreachableComputation) {
@@ -353,8 +338,7 @@ TEST_F(CallGraphTest, VisitUnreachableComputation) {
       module.AddEntryComputation(MakeScalarComputation());
   HloComputation* unreachable_computation =
       module.AddEmbeddedComputation(MakeScalarComputation());
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
 
   // Test visitation of only reachable nodes.
   {
@@ -379,8 +363,8 @@ TEST_F(CallGraphTest, VisitUnreachableComputation) {
         },
         /*visit_unreachable_nodes=*/true));
     EXPECT_EQ(visited.size(), 2);
-    EXPECT_MATCH(visited,
-                 UnorderedMatcher(entry_computation, unreachable_computation));
+    EXPECT_THAT(visited, UnorderedElementsAre(entry_computation,
+                                              unreachable_computation));
   }
 }
 
@@ -388,15 +372,15 @@ TEST_F(CallGraphTest, VisitWithError) {
   // Test that the call graph visitor properly propagates errors.
   HloModule module(TestName());
   module.AddEntryComputation(MakeScalarComputation());
-  TF_ASSIGN_OR_ASSERT_OK(std::unique_ptr<CallGraph> call_graph,
-                         CallGraph::Build(&module));
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
 
   Status status = call_graph->VisitNodes(
       [](const CallGraphNode&) { return InternalError("Visitation failed"); });
 
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.code(), tensorflow::error::INTERNAL);
-  ASSERT_MATCH(status.error_message(), testing::HasSubstr("Visitation failed"));
+  ASSERT_THAT(status.error_message(),
+              ::testing::HasSubstr("Visitation failed"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
new file mode 100644
index 0000000000000000000000000000000000000000..86f7d6478244dec390b355f2c97a85d85d82a79c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -0,0 +1,128 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/compile_only_service.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/computation_layout.h"
+#include "tensorflow/compiler/xla/service/computation_tracker.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace se = ::perftools::gputools;
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
+CompileOnlyService::NewService(perftools::gputools::Platform* platform) {
+  ServiceOptions default_options;
+  default_options.set_platform(platform);
+  return NewService(default_options);
+}
+
+/* static */ StatusOr<std::unique_ptr<CompileOnlyService>>
+CompileOnlyService::NewService(const ServiceOptions& options) {
+  perftools::gputools::Platform* platform = options.platform();
+  if (platform == nullptr) {
+    TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
+  }
+
+  TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
+                      CreateComputeConstantBackend());
+  std::unique_ptr<CompileOnlyService> service(
+      new CompileOnlyService(compiler, std::move(compute_constant_backend)));
+  return std::move(service);
+}
+
+CompileOnlyService::CompileOnlyService(
+    Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend)
+    : Service(/*backend=*/nullptr, std::move(compute_constant_backend)),
+      compiler_(compiler) {
+  runs_in_client_process_ = true;
+}
+
+StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+CompileOnlyService::CompileAheadOfTime(
+    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+    const AotCompilationOptions& options) {
+  std::vector<std::unique_ptr<HloModule>> hlo_modules;
+  for (const AotComputationInstance& instance : computations) {
+    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
+                        computation_tracker_.Resolve(instance.computation));
+    VersionedComputationHandle versioned_handle =
+        user_computation->GetVersionedHandle();
+
+    // Dump computation proto state if flag is set.
+    legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
+    const string& directory_path = flags->xla_dump_computations_to;
+    if (!directory_path.empty()) {
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<SessionModule> session_module,
+          computation_tracker_.SnapshotComputation(versioned_handle.handle));
+      string filename = tensorflow::strings::StrCat(
+          "computation_", versioned_handle.handle.handle(), "__",
+          session_module->entry().name(), "__version_",
+          versioned_handle.version);
+      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
+                                                     *session_module));
+    }
+
+    TF_ASSIGN_OR_RETURN(
+        std::shared_ptr<const ProgramShape> program_shape,
+        user_computation->ComputeProgramShape(versioned_handle.version));
+
+    HloModuleConfig hlo_module_config(*program_shape);
+    auto* computation_layout =
+        hlo_module_config.mutable_entry_computation_layout();
+    if (flags->xla_hlo_profile) {
+      hlo_module_config.enable_hlo_profiling(true);
+    }
+    for (int i = 0; i < instance.argument_layouts.size(); ++i) {
+      const Shape& argument_layout = *instance.argument_layouts[i];
+      if (ShapeUtil::IsTuple(argument_layout)) {
+        return Unimplemented("tuple arguments not supported yet");
+      }
+      TF_RETURN_IF_ERROR(
+          computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
+              argument_layout));
+    }
+    TF_RETURN_IF_ERROR(
+        computation_layout->mutable_result_layout()->CopyLayoutFromShape(
+            *instance.result_layout));
+
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
+                        computation_tracker_.BuildHloModule(
+                            versioned_handle, &hlo_module_config,
+                            /*include_unreachable_instructions=*/true));
+    hlo_modules.push_back(std::move(hlo_module));
+  }
+
+  return compiler_->CompileAheadOfTime(std::move(hlo_modules),
+                                       MakeHloDumper(), options);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dae49e3e1acf144847d44af4507880d8bf2efc4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -0,0 +1,125 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
+
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/service.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// An XLA Service specialization for ahead-of-time compilation.  This only
+// instantiates a Compiler object for the relevant platform; it does not
+// instantiate or require an execution backend.
+class CompileOnlyService : public Service {
+ public:
+  // Factory for creating a CompileOnlyService. The parameter platform is the
+  // platform that the service should target. If platform is null then the
+  // default platform is used.
+  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
+      perftools::gputools::Platform* platform);
+  static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
+      const ServiceOptions& options);
+
+  // A description of a computation to compile using CompileAheadOfTime.
+  struct AotComputationInstance {
+    ComputationHandle computation;
+    std::vector<const Shape*> argument_layouts;
+    const Shape* result_layout = nullptr;
+  };
+
+  // Compiles a list of computations for ahead-of-time execution.  This is
+  // intended for use in static compilation.  See
+  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
+  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  CompileAheadOfTime(
+      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
+      const AotCompilationOptions& Options);
+
+  // Override Service methods that require or imply the existence of an
+  // execute backend.  Note that this does not include TransferToClient and
+  // TransferToClientInProcess, as computing contants produces global data
+  // that we may wish to transfer.
+  tensorflow::Status Execute(const ExecuteRequest* arg,
+                             ExecuteResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                     ExecuteParallelResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status GetDeviceHandles(
+      const GetDeviceHandlesRequest* arg,
+      GetDeviceHandlesResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support devices.");
+  }
+  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                                  ExecuteAsyncResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status WaitForExecution(
+      const WaitForExecutionRequest* arg,
+      WaitForExecutionResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support execution.");
+  }
+  tensorflow::Status TransferToServer(
+      const TransferToServerRequest* arg,
+      TransferToServerResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferToInfeed(
+      const TransferToInfeedRequest* arg,
+      TransferToInfeedResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status TransferToServerInProcess(
+      const TransferToServerInProcessRequest* arg,
+      TransferToServerInProcessResponse* result) override {
+    return Unimplemented(
+        "CompileOnlyService does not support device data transfers.");
+  }
+  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
+                                 ResetDeviceResponse* result) override {
+    return Unimplemented("CompileOnlyService does not support devices.");
+  }
+
+ private:
+  explicit CompileOnlyService(
+      Compiler* compiler, std::unique_ptr<Backend> compute_constant_backend);
+  CompileOnlyService(const CompileOnlyService&) = delete;
+  void operator=(const CompileOnlyService&) = delete;
+
+  // The compiler for the target platform.  This is included in place of
+  // the Service::execute_backend_'s compiler, since execute_backend_ is a
+  // nullptr in CompileOnlyService.
+  Compiler* compiler_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_ONLY_SERVICE_H_
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 6f43c9b8040e9b21e7c0fcf86e2dc5b8ff8c6475..1876417c03a03ec80a05dac3d0936ef6db60055c 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -112,26 +112,22 @@ class Compiler {
   //
   // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> module,
-      std::unique_ptr<HloModuleConfig> module_config, HloDumper dump_hlo,
+      std::unique_ptr<HloModule> module, HloDumper dump_hlo,
       perftools::gputools::StreamExecutor* executor) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> hlo_module,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_config,
-      HloDumper dump_hlo,
+      std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlo,
       std::vector<perftools::gputools::StreamExecutor*> stream_exec) = 0;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
   // use in static compilation.
   virtual StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      std::vector<std::unique_ptr<HloModule>> module,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_config,
-      HloDumper dump_hlo, const AotCompilationOptions& options) = 0;
+  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
+                     HloDumper dump_hlo,
+                     const AotCompilationOptions& options) = 0;
 
   /////
   // The Compiler class also serves as a point to register compiler objects
diff --git a/tensorflow/compiler/xla/service/computation_tracker.cc b/tensorflow/compiler/xla/service/computation_tracker.cc
index f78806bce82f7f524ba2bf80fbf602ad49e103c7..7e59f03773132b05590fd71d2e2e918d52fe5d98 100644
--- a/tensorflow/compiler/xla/service/computation_tracker.cc
+++ b/tensorflow/compiler/xla/service/computation_tracker.cc
@@ -169,6 +169,7 @@ void ComputationTracker::ComputeComputationPostOrder(
 
 StatusOr<std::unique_ptr<HloModule>> ComputationTracker::BuildHloModule(
     const VersionedComputationHandle& entry_handle,
+    const HloModuleConfig* config,
     bool include_unreachable_instructions) const {
   tensorflow::mutex_lock lock(computation_mutex_);
 
@@ -208,7 +209,12 @@ StatusOr<std::unique_ptr<HloModule>> ComputationTracker::BuildHloModule(
 
   string module_name =
       tensorflow::strings::StrCat(entry_computation->name(), "_module");
-  auto module = MakeUnique<HloModule>(module_name, entry_handle);
+  std::unique_ptr<HloModule> module;
+  if (config == nullptr) {
+    module = MakeUnique<HloModule>(module_name, entry_handle);
+  } else {
+    module = MakeUnique<HloModule>(module_name, entry_handle, *config);
+  }
   for (auto versioned_handle : post_order) {
     UserComputation* computation =
         ResolveInternal(versioned_handle.handle).ValueOrDie();
diff --git a/tensorflow/compiler/xla/service/computation_tracker.h b/tensorflow/compiler/xla/service/computation_tracker.h
index 1922908747c6ef3b74c5b87d3c3924e5ffb38fc5..c7ca357398a9351ed8647fdef256b2af255eab0f 100644
--- a/tensorflow/compiler/xla/service/computation_tracker.h
+++ b/tensorflow/compiler/xla/service/computation_tracker.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service/user_computation.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
@@ -72,12 +73,15 @@ class ComputationTracker {
   // Builds an HLO module using the specified computation as the entry. The
   // module will include the entry computation as well as all computations which
   // are called directly or indirectly from the entry computation via operations
-  // like "map". If include_unreachable_instructions is true, then instructions
+  // like "map". config is the HLO module configuration to use for the
+  // constructed module; pass nullptr for "no configuration".
+  // If include_unreachable_instructions is true, then instructions
   // which are not reachable from the root are lowered into HloInstructions
   // including unreachable parameters. This ensures the entry HloComputation has
   // the same program shape (ProgramShape) as the entry UserComputation.
   StatusOr<std::unique_ptr<HloModule>> BuildHloModule(
       const VersionedComputationHandle& entry_handle,
+      const HloModuleConfig* config,
       bool include_unreachable_instructions = true) const;
 
   string ToString() const;
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 7dae49acad388e6d18a8cb1e4ea70244616978bb..3a1a9fe8709e33c7cfe56f4d8648ee2151e3bdd0 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -16,19 +16,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 
 #include <memory>
-#include <set>
-#include <string>
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -36,6 +37,9 @@ namespace xla {
 
 namespace {
 
+using tensorflow::gtl::FlatMap;
+using tensorflow::gtl::FlatSet;
+
 // InstructionCopier encapsulates indices at which to copy 'instruction'.
 // All 'instruction' users in 'copy_users' are updated to use the copy.
 //
@@ -52,7 +56,7 @@ namespace {
 //
 //      Example two-element tuple with one element that needs a copy:
 //
-//                    Tuple  // instruction
+//             original-instruction
 //                   /    \
 //                GTE(0)  GTE(1)
 //                  |       |
@@ -60,23 +64,54 @@ namespace {
 //                   \     /
 //                    Tuple  // copied-instruction
 //
+//      As an optimization, if the original instruction is itself a Tuple
+//      instruction, we elide the unnecessary extra GTE and Tuple instructions,
+//      and just insert the copy into a new Tuple instruction, with control
+//      dependencies to ensure the copy occurs after any possible interference.
 class InstructionCopier {
  public:
-  InstructionCopier(const bool init_value, HloInstruction* instruction,
-                    const std::vector<HloInstruction*>& copy_users);
+  InstructionCopier(HloInstruction* instruction,
+                    const std::vector<HloInstruction*>& copy_users)
+      : instruction_(instruction),
+        copy_users_(copy_users),
+        indices_to_copy_(instruction->shape()),
+        control_predecessors_(instruction->shape()) {}
+
+  // Sets indices that are read-only, and thus do not need to be copied.
+  void SetReadOnlyIndices(const ShapeTree<bool>& read_only_indices) {
+    read_only_indices_ = read_only_indices;
+  }
+
+  // Sets copy overrides, which are copy instructions to use at each index. This
+  // is used to share a single copy of read-only entry parameters and constants
+  // between multiple While loops.
+  void SetCopyOverrides(const ShapeTree<HloInstruction*>& copy_overrides) {
+    copy_overrides_ = copy_overrides;
+  }
 
   // Returns true if all recorded indices are false (returns true otherwise).
   bool HasAllIndicesFalse() const;
 
   // Records instruction buffer indices which point-to a Parameter or Constant.
-  tensorflow::Status RecordIndicesWhichPointToParamOrConstant(
+  Status RecordIndicesWhichPointToParamOrConstant(
       const TuplePointsToAnalysis& points_to_analysis);
 
   // Records instruction buffer indices to copy which are necessary to ensure:
   // *) PointsToSet of 'instruction_' is unambiguous and distinct.
   // *) No liveness interference between 'instruction_' and 'other_instruction'.
-  tensorflow::Status RecordIndicesToCopyForColocatingBuffers(
-      BufferLiveness* liveness, HloInstruction* other_instruction);
+  //
+  // If 'read_only_indices_out' is non-null, read-only indices are set to true.
+  Status RecordIndicesToCopyForColocatingBuffers(
+      const BufferLiveness& liveness, const HloInstruction* other_instruction,
+      ShapeTree<bool>* read_only_indices_out);
+
+  // Records control predecessors to add for inserted copy instructions.
+  // 'parameter' must have the same shape as the instruction that will be
+  // copied, and must define all buffers in the shape. Control predecessors are
+  // only recorded for indices that have already been marked for copying.
+  Status RecordControlPredecessors(
+      const TuplePointsToAnalysis& points_to_analysis,
+      HloInstruction* parameter);
 
   // Inserts copies of 'instruction' buffers at indices in 'indices_to_copy',
   // and replaces all uses for instructions in 'copy_users_' with copy.
@@ -88,15 +123,29 @@ class InstructionCopier {
   const std::vector<HloInstruction*>& copy_users() const { return copy_users_; }
 
  private:
+  // Does the given index represent a read-only buffer?
+  bool IsReadOnlyIndex(const ShapeIndex& index) const {
+    return !ShapeUtil::IsNil(read_only_indices_.shape()) &&
+           read_only_indices_.element(index);
+  }
+
+  // Returns the copy override at the given index, or nullptr.
+  HloInstruction* GetCopyOverride(const ShapeIndex& index) const {
+    return ShapeUtil::IsNil(copy_overrides_.shape())
+               ? nullptr
+               : copy_overrides_.element(index);
+  }
+
   // Records instruction buffer indices which have ambiguous or non-distinct
   // points-to sets.
-  tensorflow::Status RecordAmbiguousOrNonDistinctIndices(
+  Status RecordAmbiguousOrNonDistinctIndices(
       const TuplePointsToAnalysis& points_to_analysis);
 
-  // Records instruction buffer indices which have interferring live ranges
+  // Records instruction buffer indices which have interfering live ranges
   // with 'other_instruction' buffers at same index.
-  tensorflow::Status RecordIndicesWhichInterfereWithOtherInstruction(
-      BufferLiveness* liveness, HloInstruction* other_instruction);
+  Status RecordIndicesWhichInterfereWithOtherInstruction(
+      const BufferLiveness& liveness, const HloInstruction* other_instruction,
+      ShapeTree<bool>* read_only_indices_out);
 
   // Recursively inserts copies of 'instruction' tuple elements at indices
   // specified in 'indices_to_copy', and returns the copy of 'instruction'.
@@ -107,28 +156,25 @@ class InstructionCopier {
   }
 
   HloInstruction* instruction_;
-  std::vector<HloInstruction*> copy_users_;
+  const std::vector<HloInstruction*> copy_users_;
   ShapeTree<bool> indices_to_copy_;
+  ShapeTree<std::vector<HloInstruction*>> control_predecessors_;
+  ShapeTree<bool> read_only_indices_;
+  ShapeTree<HloInstruction*> copy_overrides_;
 };
 
-InstructionCopier::InstructionCopier(
-    const bool init_value, HloInstruction* instruction,
-    const std::vector<HloInstruction*>& copy_users)
-    : instruction_(instruction),
-      copy_users_(copy_users),
-      indices_to_copy_(instruction->shape(), init_value) {}
-
 bool InstructionCopier::HasAllIndicesFalse() const {
   bool all_indices_false = true;
-  TF_CHECK_OK(indices_to_copy_.ForEachElement([&all_indices_false](
-      const ShapeIndex& /*index*/, bool /*is_leaf*/, const bool& data) {
-    if (data) all_indices_false = false;
-    return tensorflow::Status::OK();
-  }));
+  TF_CHECK_OK(indices_to_copy_.ForEachElement(
+      [&all_indices_false](const ShapeIndex& /*index*/, bool /*is_leaf*/,
+                           bool data) {
+        if (data) all_indices_false = false;
+        return tensorflow::Status::OK();
+      }));
   return all_indices_false;
 }
 
-tensorflow::Status InstructionCopier::RecordIndicesWhichPointToParamOrConstant(
+Status InstructionCopier::RecordIndicesWhichPointToParamOrConstant(
     const TuplePointsToAnalysis& points_to_analysis) {
   const PointsToSet& points_to =
       points_to_analysis.GetPointsToSet(instruction_);
@@ -141,41 +187,47 @@ tensorflow::Status InstructionCopier::RecordIndicesWhichPointToParamOrConstant(
 
   // Multiple buffers within a parameter/constant may be live out, so collect
   // a set of indices at which to copy first.
-  TF_RETURN_IF_ERROR(points_to.ForEachElement([this](
-      const ShapeIndex& index, bool /*is_leaf*/,
-      const std::vector<const LogicalBuffer*>& buffers) {
-    for (auto buffer : buffers) {
-      // pointee is the HloInstruction producing the buffer which may be
-      // liveout.
-      HloInstruction* pointee = buffer->instruction();
-      if (pointee->opcode() == HloOpcode::kParameter ||
-          pointee->opcode() == HloOpcode::kConstant) {
-        VLOG(2) << "Parameter or constant buffer " << buffer->ToString()
-                << " index: " << tensorflow::str_util::Join(index, ",")
-                << " may be live out of computation: " << pointee->ToString();
-        RecordIndex(index);
-      }
-    }
-    return tensorflow::Status::OK();
-  }));
-  return tensorflow::Status::OK();
+  TF_RETURN_IF_ERROR(points_to.ForEachElement(
+      [this](const ShapeIndex& index, bool /*is_leaf*/,
+             const std::vector<const LogicalBuffer*>& buffers) {
+        if (IsReadOnlyIndex(index)) {
+          return Status::OK();
+        }
+        for (const LogicalBuffer* buffer : buffers) {
+          // pointee is the HloInstruction producing the buffer which may be
+          // liveout.
+          HloInstruction* pointee = buffer->instruction();
+          if (pointee->opcode() == HloOpcode::kParameter ||
+              pointee->opcode() == HloOpcode::kConstant) {
+            VLOG(2) << "Parameter or constant buffer " << buffer->ToString()
+                    << " index: " << tensorflow::str_util::Join(index, ",")
+                    << " may be live out of computation: "
+                    << pointee->ToString();
+            RecordIndex(index);
+            break;
+          }
+        }
+        return Status::OK();
+      }));
+  return Status::OK();
 }
 
-tensorflow::Status InstructionCopier::RecordIndicesToCopyForColocatingBuffers(
-    BufferLiveness* liveness, HloInstruction* other_instruction) {
+Status InstructionCopier::RecordIndicesToCopyForColocatingBuffers(
+    const BufferLiveness& liveness, const HloInstruction* other_instruction,
+    ShapeTree<bool>* read_only_indices_out) {
   TF_RETURN_IF_ERROR(
-      RecordAmbiguousOrNonDistinctIndices(liveness->points_to_analysis()));
+      RecordAmbiguousOrNonDistinctIndices(liveness.points_to_analysis()));
   TF_RETURN_IF_ERROR(RecordIndicesWhichInterfereWithOtherInstruction(
-      liveness, other_instruction));
-  return tensorflow::Status::OK();
+      liveness, other_instruction, read_only_indices_out));
+  return Status::OK();
 }
 
-tensorflow::Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices(
+Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices(
     const TuplePointsToAnalysis& points_to_analysis) {
   const PointsToSet& points_to =
       points_to_analysis.GetPointsToSet(instruction_);
   // Mapping from LogicalBuffer to index (used to detect non-distinct indices).
-  std::unordered_map<const LogicalBuffer*, std::vector<ShapeIndex>>
+  FlatMap<const LogicalBuffer*, std::vector<ShapeIndex>>
       buffer_to_source_indices;
   TF_RETURN_IF_ERROR(points_to.ForEachElement([this, &buffer_to_source_indices](
       const ShapeIndex& index, bool /*is_leaf*/,
@@ -191,22 +243,18 @@ tensorflow::Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices(
       }
     }
     // For each 'buffer': record a mapping from 'buffer' to 'index'.
-    for (auto& buffer : buffers) {
-      auto it = buffer_to_source_indices.find(buffer);
-      if (it == buffer_to_source_indices.end()) {
-        buffer_to_source_indices.insert({buffer, std::vector<ShapeIndex>()});
-      }
+    for (const LogicalBuffer* buffer : buffers) {
       buffer_to_source_indices[buffer].push_back(index);
     }
-    return tensorflow::Status::OK();
+    return Status::OK();
   }));
 
   // Record all non-distinct indices detected in 'buffer_to_source_indices'.
-  for (auto& buff_to_src : buffer_to_source_indices) {
+  for (const auto& buff_to_src : buffer_to_source_indices) {
     if (buff_to_src.second.size() == 1) {
       continue;
     }
-    for (auto& src_index : buff_to_src.second) {
+    for (const ShapeIndex& src_index : buff_to_src.second) {
       // Record non-distinct points-to set at 'src_index'.
       if (!indices_to_copy_.element(src_index)) {
         VLOG(2) << "Adding copy of buffer for instruction: "
@@ -217,23 +265,26 @@ tensorflow::Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices(
       }
     }
   }
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
-tensorflow::Status
-InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction(
-    BufferLiveness* liveness, HloInstruction* other_instruction) {
+Status InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction(
+    const BufferLiveness& liveness, const HloInstruction* other_instruction,
+    ShapeTree<bool>* read_only_indices_out) {
   // Record all buffer indices for 'instruction_', which interfere with
   // 'other_instruction' at the same index.
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshape(
       instruction_->shape(),
-      [this, &liveness, &other_instruction](const Shape& /*subshape*/,
-                                            const ShapeIndex& index) {
+      [this, &liveness, other_instruction, read_only_indices_out](
+          const Shape& /*subshape*/, const ShapeIndex& index) {
+        if (IsReadOnlyIndex(index)) {
+          return Status::OK();
+        }
         if (indices_to_copy_.element(index)) {
           // Return if previous pass already set index.
-          return tensorflow::Status::OK();
+          return Status::OK();
         }
-        auto& points_to_analysis = liveness->points_to_analysis();
+        const auto& points_to_analysis = liveness.points_to_analysis();
         // Lookup buffers for 'instruction_' and 'other_instruction'.
         const std::vector<const LogicalBuffer*> instruction_buffers =
             points_to_analysis.GetPointsToSet(instruction_).element(index);
@@ -252,20 +303,24 @@ InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction(
         // then that buffer is not updated on the path between the two
         // instructions. Therefore, any other (possibly interference-causing)
         // users of that buffer from 'other_instruction' will see the same data,
-        // irrespecive of whether we insert a copy of this buffer at
+        // irrespective of whether we insert a copy of this buffer at
         // 'instruction_' or not.
         if (other_instruction_buffers.size() == 1 &&
             other_instruction_buffers[0]->id() == instruction_buffer->id()) {
-          return tensorflow::Status::OK();
+          if (read_only_indices_out != nullptr) {
+            *read_only_indices_out->mutable_element(index) = true;
+          }
+          return Status::OK();
         }
-        // We cant say anything about the ambiguity of 'other_instruction' at
+        // We can't say anything about the ambiguity of 'other_instruction' at
         // this point, so we need to check interference between the single
         // buffer in the points-to set of 'instruction_' and all buffers in
         // 'other_instruction_buffers'.
-        for (auto& other_buffer : other_instruction_buffers) {
-          if (liveness->MayInterfere(*instruction_buffer, *other_buffer)) {
+        for (const LogicalBuffer* other_buffer : other_instruction_buffers) {
+          if (liveness.MayInterfere(*instruction_buffer, *other_buffer)) {
             VLOG(2) << "Adding copy of buffer for instruction: "
                     << instruction_->name()
+                    << " instruction_buffer: " << instruction_buffer->ToString()
                     << " at index: " << tensorflow::str_util::Join(index, ",")
                     << " because of interference with buffer: "
                     << other_buffer->ToString();
@@ -273,40 +328,89 @@ InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction(
             break;
           }
         }
-        return tensorflow::Status::OK();
+        return Status::OK();
       }));
-  return tensorflow::Status::OK();
+  return Status::OK();
+}
+
+// This is called when 'instruction_' is a while body root, and 'parameter' is
+// the while body parameter. We record all users of all aliases of 'parameter'
+// as control predecessors, so that when we add a copy of 'instruction_', we can
+// mark the control dependencies. This is necessary because points-to and
+// liveness analysis doesn't know about the aliasing between the while body root
+// and param. Without these control dependencies, the copy might get scheduled
+// to run at a point that interferes with users of the buffer.
+Status InstructionCopier::RecordControlPredecessors(
+    const TuplePointsToAnalysis& points_to_analysis,
+    HloInstruction* parameter) {
+  return indices_to_copy_.ForEachElement(
+      [this, &points_to_analysis, parameter](const ShapeIndex& index,
+                                             bool /*is_leaf*/, bool will_copy) {
+        if (will_copy) {
+          TF_ASSIGN_OR_RETURN(
+              const LogicalBuffer* buffer,
+              points_to_analysis.GetBufferDefinedAt(parameter, index));
+          for (const BufferAlias& alias :
+               points_to_analysis.GetBufferAliases(*buffer)) {
+            for (HloInstruction* user : alias.instruction()->users()) {
+              if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
+                                          user, points_to_analysis)) {
+                continue;
+              }
+
+              if (user != instruction_) {
+                control_predecessors_.mutable_element(index)->push_back(user);
+              }
+            }
+          }
+        }
+        return Status::OK();
+      });
 }
 
 // Recursively inserts copies of 'instruction' tuple element buffers at
 // indices in 'indices_to_copy_', expanding tuples as needed.
-// TODO(b/31159897) Remove superfluous Tuple->GTE->Tuple expressions.
 HloInstruction* InstructionCopier::CopyTuple(HloInstruction* instruction,
                                              ShapeIndex* index) {
-  std::vector<HloInstruction*> element_copies;
   const int64 num_tuple_elements =
       ShapeUtil::TupleElementCount(instruction->shape());
+  std::vector<HloInstruction*> elem_copies(num_tuple_elements);
   for (int64 i = 0; i < num_tuple_elements; ++i) {
-    HloInstruction* gte = instruction->parent()->AddInstruction(
-        HloInstruction::CreateGetTupleElement(
-            ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction, i));
-    HloInstruction* element_copy;
+    HloInstruction* elem;
+    if (instruction->opcode() == HloOpcode::kTuple) {
+      // If the instruction is already a Tuple instruction, we know that the
+      // element buffers are aliased, so we can just grab the operand directly.
+      elem = instruction->mutable_operand(i);
+    } else {
+      // Otherwise we need to add a GTE to unpack the element out of the tuple.
+      elem = instruction->parent()->AddInstruction(
+          HloInstruction::CreateGetTupleElement(
+              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
+              i));
+    }
     index->push_back(i);
-    if (ShapeUtil::IsTuple(gte->shape())) {
-      element_copy = CopyTuple(gte, index);
+    if (ShapeUtil::IsTuple(elem->shape())) {
+      elem_copies[i] = CopyTuple(elem, index);
+    } else if (!indices_to_copy_.element(*index)) {
+      elem_copies[i] = elem;
+    } else if (HloInstruction* copy_override = GetCopyOverride(*index)) {
+      elem_copies[i] = copy_override;
     } else {
-      if (indices_to_copy_.element(*index)) {
-        element_copy = gte->parent()->AddInstruction(
-            HloInstruction::CreateUnary(gte->shape(), HloOpcode::kCopy, gte));
-      } else {
-        element_copy = gte;
+      HloInstruction* elem_copy = elem->parent()->AddInstruction(
+          HloInstruction::CreateUnary(elem->shape(), HloOpcode::kCopy, elem));
+      for (HloInstruction* control_predecessor :
+           control_predecessors_.element(*index)) {
+        VLOG(2) << "Adding control dependency from "
+                << control_predecessor->ToString() << " to "
+                << elem_copy->ToString();
+        TF_CHECK_OK(control_predecessor->AddControlDependencyTo(elem_copy));
       }
+      elem_copies[i] = elem_copy;
     }
     index->pop_back();
-    element_copies.push_back(element_copy);
   }
   return instruction->parent()->AddInstruction(
-      HloInstruction::CreateTuple(element_copies));
+      HloInstruction::CreateTuple(elem_copies));
 }
 
 // Inserts copies of 'instruction_' buffers at indices in 'indices_to_copy_'.
@@ -327,8 +431,85 @@ HloInstruction* InstructionCopier::Copy() {
   return copy;
 }
 
+// The 'read_only_indices' are initialized based on points-to analysis on the
+// while body corresponding to 'while_hlo'. If the init buffer corresponding to
+// a read-only index aliases with an entry parameter (or constant), it cannot be
+// considered read-only, and must be copied. This is necessary because some
+// backends don't support entry-parameter (or constant) aliasing with regular
+// instructions. This function performs this fix-up of 'read_only_indices'.
+//
+// Returns a ShapeTree of copy_overrides, which implements an optimization to
+// allow multiple while loops that share the same read-only entry parameters to
+// share a single copy.
+StatusOr<ShapeTree<HloInstruction*>>
+RevertReadOnlyIndicesForEntryParamsAndConstants(
+    const HloInstruction* while_hlo,
+    const TuplePointsToAnalysis& points_to_analysis,
+    ShapeTree<bool>* read_only_indices,
+    FlatMap<const HloInstruction*, HloInstruction*>* shared_copies) {
+  const HloInstruction* init_hlo = while_hlo->operand(0);
+  const PointsToSet& points_to = points_to_analysis.GetPointsToSet(init_hlo);
+  ShapeTree<HloInstruction*> copy_overrides(init_hlo->shape());
+  TF_RETURN_IF_ERROR(points_to.ForEachElement(
+      [init_hlo, read_only_indices, shared_copies, &copy_overrides](
+          const ShapeIndex& index, bool /*is_leaf*/,
+          const std::vector<const LogicalBuffer*>& buffers) {
+        // Look for read-only entry parameters.
+        if (!read_only_indices->element(index)) {
+          return Status::OK();
+        }
+        for (const LogicalBuffer* buffer : buffers) {
+          HloInstruction* pointee = buffer->instruction();
+          const HloComputation* computation = pointee->parent();
+          const bool is_entry_parameter =
+              pointee->opcode() == HloOpcode::kParameter &&
+              computation == computation->parent()->entry_computation();
+          const bool is_constant = pointee->opcode() == HloOpcode::kConstant;
+          if (!is_entry_parameter && !is_constant) {
+            continue;
+          }
+          // We have found an entry parameter or constant that is read-only in
+          // the while body. These buffers are managed by the caller, and cannot
+          // be aliased with non-parameter buffers. Revert this read-only index,
+          // to allow it to be copied.
+          *read_only_indices->mutable_element(index) = false;
+
+          // Optimization to allow multiple while loops that share the same
+          // read-only entry parameters (or constants) to share a single copy.
+          // Only unambiguous array-shaped buffers are allowed, to reduce code
+          // complexity. The shape of the entry parameter must be identical to
+          // the shape of the init_hlo at this index, to ensure there were no
+          // intervening bitcast or GTE instructions, which are also hard to
+          // handle.
+          const Shape& pointee_shape = pointee->shape();
+          const Shape& init_shape =
+              ShapeUtil::GetSubshape(init_hlo->shape(), index);
+          if (buffers.size() == 1 && ShapeUtil::IsArray(pointee_shape) &&
+              ShapeUtil::Equal(pointee_shape, init_shape)) {
+            HloInstruction** copy = &(*shared_copies)[pointee];
+            if (*copy == nullptr) {
+              *copy =
+                  pointee->parent()->AddInstruction(HloInstruction::CreateUnary(
+                      pointee_shape, HloOpcode::kCopy, pointee));
+            }
+            // Add the copy as an override.
+            *copy_overrides.mutable_element(index) = *copy;
+          }
+
+          // We've already reverted the read-only index and handled the
+          // single-copy optimization above, so there's nothing more to do.
+          break;
+        }
+        return Status::OK();
+      }));
+  return copy_overrides;
+}
+
 }  // anonymous namespace
 
+// NOTE: This is only called by gpu::CopyInsertion. It's not called here in the
+// base class, since the regular CopyInsertion logic above selectively copies
+// tuple elements, while this method assumes all buffers need to be deep copied.
 StatusOr<HloInstruction*> CopyInsertion::FindOrInsertCopy(HloInstruction* hlo) {
   auto copy_it = inserted_copies_.find(hlo);
   if (copy_it == inserted_copies_.end()) {
@@ -347,85 +528,96 @@ StatusOr<bool> CopyInsertion::Run(HloModule* module) {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<BufferLiveness> liveness,
       BufferLiveness::Run(module, MakeUnique<DependencyHloOrdering>(module)));
-  auto& points_to_analysis = liveness->points_to_analysis();
+  const auto& points_to_analysis = liveness->points_to_analysis();
   XLA_VLOG_LINES(2, points_to_analysis.ToString());
   XLA_VLOG_LINES(2, module->ToString());
 
-  // Gather references to all while body computations in 'module'.
-  std::unordered_set<const HloComputation*> while_body_computations;
-  // Gather references to all while instructions in 'module' by computation.
-  std::unordered_map<const HloComputation*, std::vector<HloInstruction*>>
-      while_instructions;
+  // Gather all while body computations and while instructions.
+  FlatSet<const HloComputation*> while_body_computations;
+  std::vector<HloInstruction*> while_instructions;
   for (auto& computation : module->computations()) {
     for (auto& instruction : computation->instructions()) {
-      if (instruction->opcode() != HloOpcode::kWhile) {
-        continue;
-      }
-      while_body_computations.insert(instruction->while_body());
-      auto it = while_instructions.find(computation.get());
-      if (it == while_instructions.end()) {
-        while_instructions.insert(
-            {computation.get(), std::vector<HloInstruction*>()});
+      if (instruction->opcode() == HloOpcode::kWhile) {
+        while_body_computations.insert(instruction->while_body());
+        while_instructions.push_back(instruction.get());
       }
-      while_instructions[computation.get()].emplace_back(instruction.get());
     }
   }
 
+  // Collect instruction buffer indices to copy in 'instructions_to_copy'.
+  std::vector<InstructionCopier> instructions_to_copy;
+
+  // Add copies of computation root instructions, if needed.
+  FlatMap<const HloComputation*, ShapeTree<bool>> while_body_read_only_indices;
   for (auto& computation : module->computations()) {
     VLOG(2) << "computation " << computation->name();
-
-    // Collect instruction buffer indices to copy in 'instructions_to_copy'.
-    std::vector<InstructionCopier> instructions_to_copy;
-
-    // Add copies of while 'init' operand instructions (if needed).
-    // TODO(b/33301720) Remove redundant while instruction copies.
-    auto it = while_instructions.find(computation.get());
-    if (it != while_instructions.end()) {
-      for (auto& while_hlo : it->second) {
-        // Create InstructionCopier for init operand of while instruction.
-        HloInstruction* init_hlo = while_hlo->mutable_operand(0);
-        instructions_to_copy.push_back(
-            InstructionCopier(/*init_value=*/false, init_hlo, {while_hlo}));
-        InstructionCopier& init_copier = instructions_to_copy.back();
-        // Record 'init' buffer indices which point-to a Constant or Parameter.
-        TF_RETURN_IF_ERROR(init_copier.RecordIndicesWhichPointToParamOrConstant(
-            liveness->points_to_analysis()));
-        // Record indices necessary to colocate while and init operand buffers.
-        TF_RETURN_IF_ERROR(init_copier.RecordIndicesToCopyForColocatingBuffers(
-            liveness.get(), while_hlo));
-      }
-    }
-
-    // Create InstructionCopier for computation root instruction.
-    instructions_to_copy.push_back(InstructionCopier(
-        /*init_value=*/false, computation->root_instruction(), {}));
-    InstructionCopier& root_copier = instructions_to_copy.back();
-
+    InstructionCopier root_copier(computation->root_instruction(),
+                                  /*copy_users=*/{});
     if (while_body_computations.count(computation.get()) > 0) {
-      // Record root indices to copy for while body sub-computations.
-      // We do not need to call RecordIndicesWhichPointToParamOrConstant for
-      // the while root instruction here, because any neccessary copies needed
-      // to avoid constant or parameters in the output are handled by while.init
-      // operand copy insertion above (which will share an allocation).
+      // Record root indices to copy for while body sub-computations. We do not
+      // need to call RecordIndicesWhichPointToParamOrConstant for the while
+      // body root instruction here, because any necessary copies needed to
+      // avoid constants or parameters in the output are handled by while.init
+      // operand copy insertion below (which will share an allocation).
+      HloInstruction* while_body_param = computation->parameter_instruction(0);
+      ShapeTree<bool> read_only_indices(while_body_param->shape());
       TF_RETURN_IF_ERROR(root_copier.RecordIndicesToCopyForColocatingBuffers(
-          liveness.get(), computation->parameter_instruction(0)));
-    } else if (copy_param_and_const_) {
+          *liveness, while_body_param, &read_only_indices));
+      while_body_read_only_indices[computation.get()] = read_only_indices;
+
+      // Mark control predecessors, based on the body param, for any copies
+      // we'll be inserting. This ensures the copy doesn't run too early.
+      TF_RETURN_IF_ERROR(root_copier.RecordControlPredecessors(
+          points_to_analysis, while_body_param));
+    } else {
       // Record root indices to copy for general computations.
       TF_RETURN_IF_ERROR(root_copier.RecordIndicesWhichPointToParamOrConstant(
-          liveness->points_to_analysis()));
+          points_to_analysis));
     }
+    instructions_to_copy.push_back(root_copier);
+  }
 
-    for (auto& to_copy : instructions_to_copy) {
-      if (to_copy.HasAllIndicesFalse()) {
-        continue;
-      }
-      changed = true;
+  // Add copies of while 'init' operand instructions, if needed. 'shared_copies'
+  // is used to ensure that multiple while loops can share a single copy of the
+  // same entry parameter or constant, if all loops use it read-only.
+  //
+  // TODO(b/33301720) Remove redundant while instruction copies.
+  FlatMap<const HloInstruction*, HloInstruction*> shared_copies;
+  for (HloInstruction* while_hlo : while_instructions) {
+    // Fix read_only_indices to account for entry parameters and constants. Also
+    // initialize copy_overrides, which ensures a single copy for each read-only
+    // entry parameter or constant that is used in multiple while loops.
+    ShapeTree<bool>* read_only_indices =
+        &while_body_read_only_indices[while_hlo->while_body()];
+    TF_ASSIGN_OR_RETURN(
+        const ShapeTree<HloInstruction*> copy_overrides,
+        RevertReadOnlyIndicesForEntryParamsAndConstants(
+            while_hlo, points_to_analysis, read_only_indices, &shared_copies));
+    // Create InstructionCopier for init operand of while instruction.
+    HloInstruction* init_hlo = while_hlo->mutable_operand(0);
+    InstructionCopier init_copier(init_hlo, {while_hlo});
+    init_copier.SetReadOnlyIndices(*read_only_indices);
+    init_copier.SetCopyOverrides(copy_overrides);
+    // Record 'init' buffer indices which point-to a Constant or Parameter.
+    TF_RETURN_IF_ERROR(init_copier.RecordIndicesWhichPointToParamOrConstant(
+        points_to_analysis));
+    // Record indices necessary to colocate while and init operand buffers.
+    TF_RETURN_IF_ERROR(init_copier.RecordIndicesToCopyForColocatingBuffers(
+        *liveness, while_hlo, /*read_only_indices_out=*/nullptr));
+    instructions_to_copy.push_back(init_copier);
+  }
 
-      // Copy instruction at recorded buffer indices.
-      HloInstruction* copy = to_copy.Copy();
-      if (to_copy.instruction() == computation->root_instruction()) {
-        computation->set_root_instruction(copy);
-      }
+  for (InstructionCopier& to_copy : instructions_to_copy) {
+    if (to_copy.HasAllIndicesFalse()) {
+      continue;
+    }
+    changed = true;
+
+    // Copy instruction at recorded buffer indices.
+    HloComputation* computation = to_copy.instruction()->parent();
+    HloInstruction* copy = to_copy.Copy();
+    if (to_copy.instruction() == computation->root_instruction()) {
+      computation->set_root_instruction(copy);
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index ce91ac0de56f3fc1101c38cee838c0b0593214ad..28bb62e40c7674960dbb1bb63dc8967b06956028 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 
@@ -32,9 +33,6 @@ namespace xla {
 // different lifetimes than computation results.
 class CopyInsertion : public HloPassInterface {
  public:
-  explicit CopyInsertion(bool copy_param_and_const = true)
-      : copy_param_and_const_(copy_param_and_const) {}
-  ~CopyInsertion() override {}
   tensorflow::StringPiece name() const override { return "copy-insertion"; }
 
   // Run the pass on the given module. Returns whether the module was changed
@@ -46,13 +44,9 @@ class CopyInsertion : public HloPassInterface {
   // duplicate copies.
   StatusOr<HloInstruction*> FindOrInsertCopy(HloInstruction* hlo);
 
-  // Determines whether to insert copies if the root instruction is, or
-  // points-to, any constant or parameter instruction.
-  const bool copy_param_and_const_;
-
   // A map containing all copies inserted during the copy insertion pass. The
   // key is the copied instruction and the value is the copy.
-  std::unordered_map<HloInstruction*, HloInstruction*> inserted_copies_;
+  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 4c26b2de124b0b42f6de1ebdf82d4584f2904cab..661f682e38a3cefd09f36eb0e42084d35491e196 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -20,18 +20,23 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-#include "tensorflow/compiler/xla/test_helpers.h"
+namespace op = xla::testing::opcode_matchers;
 
 namespace xla {
 namespace {
 
+using ::testing::UnorderedElementsAre;
+
 class CopyInsertionTest : public HloTestBase {
  protected:
   void InsertCopies(HloModule* module) {
@@ -51,43 +56,6 @@ class CopyInsertionTest : public HloTestBase {
       EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kParameter);
     }
   }
-
-  // OperandTree is a test helper class that simplifies the expression of
-  // an expected tree of operands (starting at some root instruction) in a
-  // unit test.
-  // Each HLO instruction is represented as a node in the OperandTree.
-  struct OperandTree {
-    // The expected opcode for this OperandTree node.
-    HloOpcode opcode;
-    // The set of operands expected for this OperandTree node.
-    std::vector<OperandTree> operands;
-    // If non-null, a pointer to the expected HloInstruction at this node.
-    const HloInstruction* instruction = nullptr;
-
-    // Returns a mutable reference to operand 'i' of this node.
-    OperandTree& op(int i) {
-      if (i >= operands.size()) {
-        operands.resize(i + 1);
-      }
-      return operands[i];
-    }
-
-    // Check that 'instruction' and its operands match expected values recorded
-    // in OperandTree.
-    void Check(const HloInstruction* instruction) {
-      EXPECT_EQ(opcode, instruction->opcode());
-      if (instruction != nullptr) {
-        EXPECT_EQ(instruction, instruction);
-      }
-      if (operands.empty()) {
-        return;
-      }
-      EXPECT_EQ(operands.size(), instruction->operand_count());
-      for (int i = 0; i < instruction->operand_count(); ++i) {
-        operands[i].Check(instruction->operand(i));
-      }
-    }
-  };
 };
 
 TEST_F(CopyInsertionTest, SingleParameter) {
@@ -97,25 +65,16 @@ TEST_F(CopyInsertionTest, SingleParameter) {
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({x}));
 
-  ExpectEqUnordered(x->users(), {tuple});
+  EXPECT_THAT(x->users(), UnorderedElementsAre(tuple));
 
   HloModule module(TestName());
   module.AddEntryComputation(builder.Build());
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
-
-  // Check path from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
 
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(old_root->operand(0))));
 }
 
 TEST_F(CopyInsertionTest, SingleConstant) {
@@ -125,25 +84,16 @@ TEST_F(CopyInsertionTest, SingleConstant) {
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({constant}));
 
-  ExpectEqUnordered(constant->users(), {tuple});
+  EXPECT_THAT(constant->users(), UnorderedElementsAre(tuple));
 
   HloModule module(TestName());
   module.AddEntryComputation(builder.Build());
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
 
-  // Check path from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(old_root->operand(0))));
 }
 
 TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
@@ -172,30 +122,10 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
 
-  // "constant2" and parameter "x" are pointed to by the tuple and should be
-  // copied.
-
-  // Check all paths from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).op(0).instruction = old_root;
-
-  op_tree.op(2).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(2).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(2).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(old_root->operand(0)),
+                        op::Copy(old_root->operand(1)), old_root->operand(2)));
 }
 
 TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
@@ -219,32 +149,19 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
   builder.AddInstruction(HloInstruction::CreateTernary(
       tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));
 
-  ExpectEqUnordered(constant1->users(), {tuple1});
-  ExpectEqUnordered(constant2->users(), {tuple1, tuple2});
-  ExpectEqUnordered(constant3->users(), {tuple2});
+  EXPECT_THAT(constant1->users(), UnorderedElementsAre(tuple1));
+  EXPECT_THAT(constant2->users(), UnorderedElementsAre(tuple1, tuple2));
+  EXPECT_THAT(constant3->users(), UnorderedElementsAre(tuple2));
 
   HloModule module(TestName());
   module.AddEntryComputation(builder.Build());
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
-
-  // Check all paths from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
 
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kSelect;
-  op_tree.op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kSelect;
-  op_tree.op(1).op(0).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(op::GetTupleElement(old_root)),
+                        op::Copy(op::GetTupleElement(old_root))));
 }
 
 TEST_F(CopyInsertionTest, BitcastParameter) {
@@ -259,19 +176,13 @@ TEST_F(CopyInsertionTest, BitcastParameter) {
   HloModule module(TestName());
   module.AddEntryComputation(builder.Build());
 
-  ExpectEqUnordered(x->users(), {bitcast});
+  EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
 
-  // Check path from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kCopy;
-  op_tree.op(0).opcode = HloOpcode::kBitcast;
-  op_tree.op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Copy(old_root));
 }
 
 TEST_F(CopyInsertionTest, BitcastConstant) {
@@ -287,19 +198,13 @@ TEST_F(CopyInsertionTest, BitcastConstant) {
   HloModule module(TestName());
   module.AddEntryComputation(builder.Build());
 
-  ExpectEqUnordered(constant->users(), {bitcast});
+  EXPECT_THAT(constant->users(), UnorderedElementsAre(bitcast));
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
-
-  // Check path from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kCopy;
-  op_tree.op(0).opcode = HloOpcode::kBitcast;
-  op_tree.op(0).instruction = old_root;
 
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Copy(old_root));
 }
 
 TEST_F(CopyInsertionTest, BitcastTupleElementParameter) {
@@ -314,21 +219,13 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) {
   HloModule module(TestName());
   module.AddEntryComputation(builder.Build());
 
-  ExpectEqUnordered(x->users(), {bitcast});
+  EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
-
-  // Check path from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_root;
 
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(old_root->operand(0))));
 }
 
 TEST_F(CopyInsertionTest, NestedTupleParameter) {
@@ -339,10 +236,11 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) {
 
   // Param shape is: ((F32[], S32[1,2,3]), F32[42])
   builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeTupleShape(
-             {ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {}),
-                                         ShapeUtil::MakeShape(S32, {1, 2, 3})}),
-              ShapeUtil::MakeShape(F32, {42})}),
+      0,
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {}),
+                                      ShapeUtil::MakeShape(S32, {1, 2, 3})}),
+           ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
   HloModule module(TestName());
@@ -356,30 +254,13 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) {
   HloInstruction* new_root = module.entry_computation()->root_instruction();
   EXPECT_NE(old_root, new_root);
 
-  // Check all paths from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).op(0).op(0).opcode = HloOpcode::kParameter;
-  op_tree.op(0).op(0).op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(1).op(0).op(0).op(0).opcode = HloOpcode::kParameter;
-  op_tree.op(0).op(1).op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kParameter;
-  op_tree.op(1).op(0).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(
+      new_root,
+      op::Tuple(
+          op::Tuple(
+              op::Copy(op::GetTupleElement(op::GetTupleElement(old_root))),
+              op::Copy(op::GetTupleElement(op::GetTupleElement(old_root)))),
+          op::Copy(op::GetTupleElement(old_root))));
 }
 
 TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
@@ -389,10 +270,11 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
 
   // Param shape is: ((F32[], S32[1,2,3]), F32[42])
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
-      0, ShapeUtil::MakeTupleShape(
-             {ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {}),
-                                         ShapeUtil::MakeShape(S32, {1, 2, 3})}),
-              ShapeUtil::MakeShape(F32, {42})}),
+      0,
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {}),
+                                      ShapeUtil::MakeShape(S32, {1, 2, 3})}),
+           ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
   // The return value of the computation is the zero-th elemnt of the nested
@@ -407,23 +289,10 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
-
-  // Check all paths from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
 
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(op::GetTupleElement(old_root)),
+                        op::Copy(op::GetTupleElement(old_root))));
 }
 
 TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
@@ -456,15 +325,9 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
 
   HloInstruction* old_root = module.entry_computation()->root_instruction();
   InsertCopies(&module);
-  HloInstruction* new_root = module.entry_computation()->root_instruction();
 
-  // Check path from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kCopy;
-  op_tree.op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(module.entry_computation()->root_instruction(),
+              op::Copy(old_root));
 }
 
 class WhileCopyInsertionTest : public CopyInsertionTest {
@@ -528,7 +391,6 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
   }
 
   // Builds a While body computation with read-only tuple element 0.
-  // both input tuple elements.
   // EX:
   // Body({in0, in1})
   //   out0 = in0
@@ -563,11 +425,14 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
   //   out0 = Add(in0, 1)
   //   out1 = Add(in1, {1, 1, 1, 1, 1, 1, 1, 1})
   //   Tuple(out0, out1)
-  std::unique_ptr<HloComputation> BuildIndependentBodyComputation() {
+  std::unique_ptr<HloComputation> BuildIndependentBodyComputation(
+      bool nested = false) {
     auto builder = HloComputation::Builder(TestName() + ".Body");
     // Create param instruction to access loop state.
+    const Shape& loop_state_shape =
+        nested ? nested_loop_state_shape_ : loop_state_shape_;
     auto loop_state = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+        HloInstruction::CreateParameter(0, loop_state_shape, "loop_state"));
     // Update the induction variable GTE(0).
     auto induction_variable =
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
@@ -578,16 +443,30 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
         induction_variable->shape(), HloOpcode::kAdd, induction_variable, inc));
     // Update data GTE(1).
-    auto data = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1));
+    HloInstruction* data = nullptr;
+    if (nested) {
+      data = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+          nested_tuple_shape_, loop_state, 1));
+      data = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(data_shape_, data, 0));
+    } else {
+      data = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(data_shape_, loop_state, 1));
+    }
     auto update = builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>(
             {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
-    // add0 = Add(in1, {1, 1, 1, 1, 1, 1, 1, 1})
+    // add1 = Add(in1, {1, 1, 1, 1, 1, 1, 1, 1})
     auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, data, update));
     // Create output Tuple.
-    builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
+    if (nested) {
+      auto nested_tuple =
+          builder.AddInstruction(HloInstruction::CreateTuple({add1, add1}));
+      builder.AddInstruction(HloInstruction::CreateTuple({add0, nested_tuple}));
+    } else {
+      builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
+    }
     return builder.Build();
   }
 
@@ -640,8 +519,9 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
 
   // Builds a While instruction using 'condition' and 'body' sub-computations.
   // Init operand is initialized to zeros of appropriate shape.
-  void BuildWhileInstruction(HloComputation* condition, HloComputation* body,
-                             bool nested = false) {
+  HloInstruction* BuildWhileInstruction(HloComputation* condition,
+                                        HloComputation* body,
+                                        bool nested = false) {
     auto builder = HloComputation::Builder(TestName() + ".While");
     auto induction_var_init = builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
@@ -655,17 +535,18 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
           HloInstruction::CreateTuple({data_init, data_init}));
       auto loop_state_init = builder.AddInstruction(
           HloInstruction::CreateTuple({induction_var_init, inner_init}));
-      builder.AddInstruction(HloInstruction::CreateWhile(
+      auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile(
           loop_state_shape_, condition, body, loop_state_init));
       module_.AddEntryComputation(builder.Build());
-      return;
+      return while_hlo;
     }
 
     auto loop_state_init = builder.AddInstruction(
         HloInstruction::CreateTuple({induction_var_init, data_init}));
-    builder.AddInstruction(HloInstruction::CreateWhile(
+    auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile(
         loop_state_shape_, condition, body, loop_state_init));
     module_.AddEntryComputation(builder.Build());
+    return while_hlo;
   }
 
   HloInstruction* BuildWhileInstruction_InitPointsToConstant() {
@@ -743,12 +624,14 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
   HloInstruction* BuildWhileInstructionWithCustomInit(
       const Shape& loop_state_shape, HloInstruction* data_init,
       HloComputation::Builder* builder) {
+    const bool nested =
+        ShapeUtil::Equal(loop_state_shape, nested_loop_state_shape_);
     auto induction_var_init = builder->AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
     auto condition =
-        module_.AddEmbeddedComputation(BuildConditionComputation());
+        module_.AddEmbeddedComputation(BuildConditionComputation(nested));
     auto body =
-        module_.AddEmbeddedComputation(BuildIndependentBodyComputation());
+        module_.AddEmbeddedComputation(BuildIndependentBodyComputation(nested));
     auto loop_state_init = builder->AddInstruction(
         HloInstruction::CreateTuple({induction_var_init, data_init}));
     auto while_hlo = builder->AddInstruction(HloInstruction::CreateWhile(
@@ -781,14 +664,20 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
 TEST_F(WhileCopyInsertionTest, IndependentTupleElements) {
   auto condition = module_.AddEmbeddedComputation(BuildConditionComputation());
   auto body = module_.AddEmbeddedComputation(BuildIndependentBodyComputation());
-  BuildWhileInstruction(condition, body);
+  auto while_hlo = BuildWhileInstruction(condition, body);
 
+  const HloInstruction* old_init = while_hlo->operand(0);
   HloInstruction* old_root = body->root_instruction();
   InsertCopies(&module_);
   HloInstruction* new_root = body->root_instruction();
+  const HloInstruction* new_init = while_hlo->operand(0);
 
   // No copies should be inserted so root should not be updated.
-  CHECK_EQ(old_root, new_root);
+  EXPECT_EQ(old_root, new_root);
+
+  // Both init indices need copies.
+  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
+                                  op::Copy(old_init->operand(1))));
 }
 
 // Tests while body computation with dependent tuple elements:
@@ -798,39 +687,25 @@ TEST_F(WhileCopyInsertionTest, IndependentTupleElements) {
 //     out1 = Add(BCast(in0), in1)
 //     Tuple(out0, out1)
 //
-// CopyInsertion pass should generate:
+// CopyInsertion pass should convert the root instruction to:
 //
-//                    Tuple  // old root
-//                   /    \
-//                GTE(0)  GTE(1)
-//                  |       |
-//                 Copy     |
-//                   \     /
-//                    Tuple  // new root
+//     Tuple(Copy(out0), out1)
 //
 TEST_F(WhileCopyInsertionTest, DependentTupleElements) {
   auto condition = module_.AddEmbeddedComputation(BuildConditionComputation());
   auto body = module_.AddEmbeddedComputation(BuildDependentBodyComputation());
-  BuildWhileInstruction(condition, body);
+  auto while_hlo = BuildWhileInstruction(condition, body);
 
+  const HloInstruction* old_init = while_hlo->operand(0);
   HloInstruction* old_root = body->root_instruction();
   InsertCopies(&module_);
   HloInstruction* new_root = body->root_instruction();
+  const HloInstruction* new_init = while_hlo->operand(0);
 
-  // Check all paths from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.op(1).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(new_root,
+              op::Tuple(op::Copy(old_root->operand(0)), old_root->operand(1)));
+  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
+                                  op::Copy(old_init->operand(1))));
 }
 
 // Tests while body computation with read-only tuple element 0:
@@ -846,20 +721,110 @@ TEST_F(WhileCopyInsertionTest, DependentTupleElements) {
 //                         \      /
 //                           TUPLE (root)
 //
-// CopyInsertion pass should not generate any copies.
-//
+// CopyInsertion pass should not generate any copies for the while body.
 TEST_F(WhileCopyInsertionTest, DependentTupleElements_OneReadOnly) {
   auto condition = module_.AddEmbeddedComputation(BuildConditionComputation());
   auto body = module_.AddEmbeddedComputation(
       BuildDependentBodyOneReadOnlyComputation());
-  BuildWhileInstruction(condition, body);
+  auto while_hlo = BuildWhileInstruction(condition, body);
 
+  const HloInstruction* old_init = while_hlo->operand(0);
   HloInstruction* old_root = body->root_instruction();
   InsertCopies(&module_);
   HloInstruction* new_root = body->root_instruction();
+  const HloInstruction* new_init = while_hlo->operand(0);
 
-  // No copies should be inserted so root should not be updated.
-  CHECK_EQ(old_root, new_root);
+  // No copies should be inserted in the body, so root should not be updated.
+  EXPECT_EQ(old_root, new_root);
+
+  // Both indices need copies, even though Index 0 is read-only, since both are
+  // constants, which must be copied.
+  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
+                                  op::Copy(old_init->operand(1))));
+}
+
+// Same as above, but with two while loops, sharing entry parameters.
+TEST_F(WhileCopyInsertionTest,
+       DependentTupleElements_OneReadOnly_TwoLoops_EntryParams) {
+  auto condition1 = module_.AddEmbeddedComputation(BuildConditionComputation());
+  auto condition2 = module_.AddEmbeddedComputation(BuildConditionComputation());
+  auto body1 = module_.AddEmbeddedComputation(
+      BuildDependentBodyOneReadOnlyComputation());
+  auto body2 = module_.AddEmbeddedComputation(
+      BuildDependentBodyOneReadOnlyComputation());
+
+  auto builder = HloComputation::Builder(TestName() + ".While");
+  auto iter_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, induction_variable_shape_, "iter"));
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "data"));
+  auto loop_init = builder.AddInstruction(
+      HloInstruction::CreateTuple({iter_param, data_param}));
+
+  auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_state_shape_, condition1, body1, loop_init));
+  auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_state_shape_, condition2, body2, loop_init));
+  module_.AddEntryComputation(builder.Build());
+
+  InsertCopies(&module_);
+
+  // Both while loops share a single copy of iter_param, since index 0 is
+  // read-only in the body.
+  EXPECT_EQ(while_hlo1->operand(0)->operand(0),
+            while_hlo2->operand(0)->operand(0));
+  EXPECT_THAT(while_hlo1->operand(0)->operand(0), op::Copy(iter_param));
+
+  // Each while loop gets its own copy of data_param, since index 1 is not
+  // read-only in the body.
+  EXPECT_NE(while_hlo1->operand(0)->operand(1),
+            while_hlo2->operand(0)->operand(1));
+  EXPECT_THAT(while_hlo1->operand(0)->operand(1), op::Copy(data_param));
+  EXPECT_THAT(while_hlo2->operand(0)->operand(1), op::Copy(data_param));
+}
+
+// Same as above, but with two while loops, sharing non-parameters.
+TEST_F(WhileCopyInsertionTest,
+       DependentTupleElements_OneReadOnly_TwoLoops_NonParams) {
+  auto condition1 = module_.AddEmbeddedComputation(BuildConditionComputation());
+  auto condition2 = module_.AddEmbeddedComputation(BuildConditionComputation());
+  auto body1 = module_.AddEmbeddedComputation(
+      BuildDependentBodyOneReadOnlyComputation());
+  auto body2 = module_.AddEmbeddedComputation(
+      BuildDependentBodyOneReadOnlyComputation());
+
+  auto builder = HloComputation::Builder(TestName() + ".While");
+  auto iter_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, induction_variable_shape_, "iter"));
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, data_shape_, "data"));
+  // Add dummy ops to ensure loop_init elements aren't entry parameters.
+  auto iter_value = builder.AddInstruction(HloInstruction::CreateUnary(
+      iter_param->shape(), HloOpcode::kExp, iter_param));
+  auto data_value = builder.AddInstruction(HloInstruction::CreateUnary(
+      data_param->shape(), HloOpcode::kExp, data_param));
+  auto loop_init = builder.AddInstruction(
+      HloInstruction::CreateTuple({iter_value, data_value}));
+
+  auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_state_shape_, condition1, body1, loop_init));
+  auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_state_shape_, condition2, body2, loop_init));
+  module_.AddEntryComputation(builder.Build());
+
+  InsertCopies(&module_);
+
+  // No copies of iter_value are necessary, since index 0 is read-only in both
+  // while bodies.
+  EXPECT_EQ(while_hlo1->operand(0)->operand(0), iter_value);
+  EXPECT_EQ(while_hlo2->operand(0)->operand(0), iter_value);
+
+  // Each while loop gets its own copy of data_value, since index 1 is not
+  // read-only in the body.
+  EXPECT_NE(while_hlo1->operand(0)->operand(1),
+            while_hlo2->operand(0)->operand(1));
+  EXPECT_THAT(while_hlo1->operand(0)->operand(1), op::Copy(data_value));
+  EXPECT_THAT(while_hlo2->operand(0)->operand(1), op::Copy(data_value));
 }
 
 // Tests while body computation with nested tuple elements:
@@ -872,7 +837,8 @@ TEST_F(WhileCopyInsertionTest, DependentTupleElements_OneReadOnly) {
 //          Add                           Reverse
 //           |                              |
 //
-// CopyInsertion pass should generate:
+// CopyInsertion pass will conceptually generate the following, but with the
+// actual GTE and Tuple instructions optimized away:
 //
 //                    Tuple  // old root
 //                   /     \
@@ -898,104 +864,41 @@ TEST_F(WhileCopyInsertionTest, NestedTupleElements) {
 
   HloInstruction* old_root = body->root_instruction();
   InsertCopies(&module_);
-  HloInstruction* new_root = body->root_instruction();
-
-  // Check all paths from 'new_root' to 'old_root'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).instruction = old_root;
-
-  op_tree.op(1).opcode = HloOpcode::kTuple;
-
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).op(0).op(0).instruction = old_root;
 
-  op_tree.op(1).op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(1).op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(1).op(0).op(0).op(0).instruction = old_root;
-
-  op_tree.Check(new_root);
+  EXPECT_THAT(body->root_instruction(),
+              op::Tuple(old_root->operand(0),
+                        op::Tuple(old_root->operand(1)->operand(0),
+                                  op::Copy(old_root->operand(1)->operand(1)))));
 }
 
 // Tests while init instruction which points-to a constant.
 //
 //     init = Tuple(Constant(S32, {}), Constant(F32, {8}))
 //
-// CopyInsertion pass should generate:
-//
-//                    Tuple  // old init
-//                   /    \
-//                GTE(0)  GTE(1)
-//                  |       |
-//                 Copy    Copy
-//                   \     /
-//                    Tuple  // new init
+// CopyInsertion pass should add copies for both constants.
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToConstant) {
   auto while_hlo = BuildWhileInstruction_InitPointsToConstant();
   auto old_init = while_hlo->operand(0);
   InsertCopies(&module_);
-  auto new_init = while_hlo->operand(0);
-
-  // Check all paths from 'new_init' to 'old_init'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
 
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_init;
-
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).op(0).instruction = old_init;
-
-  op_tree.Check(new_init);
+  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
+                                               op::Copy(old_init->operand(1))));
 }
 
 // Tests while init instruction which points-to a parameter.
 //
 //     init = Tuple(Constant(S32, {}), Parameter(F32, {8}))
 //
-// CopyInsertion pass should generate:
-//
-//                    Tuple  // old init
-//                   /    \
-//                GTE(0)  GTE(1)
-//                  |       |
-//                 Copy    Copy
-//                   \     /
-//                    Tuple  // new init
+// CopyInsertion pass should add copies for both the constant and parameter.
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToParameter) {
   auto while_hlo = BuildWhileInstruction_InitPointsToParameter();
   auto old_init = while_hlo->operand(0);
   InsertCopies(&module_);
-  auto new_init = while_hlo->operand(0);
-
-  // Check all paths from 'new_init' to 'old_init'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_init;
 
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).op(0).instruction = old_init;
-
-  op_tree.Check(new_init);
+  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
+                                               op::Copy(old_init->operand(1))));
 }
 
 // Tests while init instruction which has an ambiguous points-to set.
@@ -1003,7 +906,8 @@ TEST_F(WhileCopyInsertionTest, InitPointsToParameter) {
 //     select = Select(pred, tuple1, tuple2)
 //     init = Tuple(Constant(S32, {}), Parameter(F32, {8}))
 //
-// CopyInsertion pass should generate:
+// CopyInsertion pass will conceptually generate the following, but with some of
+// the actual GTE and Tuple instructions optimized away:
 //
 //                    Tuple  // old init
 //                   /     \
@@ -1025,39 +929,21 @@ TEST_F(WhileCopyInsertionTest, InitPointsToAmbiguous) {
   auto while_hlo = BuildWhileInstruction_InitPointsToAmbiguous();
   auto old_init = while_hlo->operand(0);
   InsertCopies(&module_);
-  auto new_init = while_hlo->operand(0);
-
-  // Check all paths from 'new_init' to 'old_init'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_init;
 
-  op_tree.op(1).opcode = HloOpcode::kTuple;
-
-  op_tree.op(1).op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).op(0).op(0).op(0).instruction = old_init;
-
-  op_tree.op(1).op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(1).op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(1).op(0).op(0).op(0).instruction = old_init;
-
-  op_tree.Check(new_init);
+  EXPECT_THAT(
+      while_hlo->operand(0),
+      op::Tuple(
+          op::Copy(old_init->operand(0)),
+          op::Tuple(op::Copy(op::GetTupleElement(old_init->operand(1))),
+                    op::Copy(op::GetTupleElement(old_init->operand(1))))));
 }
 
 // Tests while init instruction which has a non-distinct points-to set.
 //
 //     init = Tuple(Constant(S32, {}), Tuple({vec_one, vec_one}))
 //
-// CopyInsertion pass should generate:
+// CopyInsertion pass will conceptually generate the following, but with some of
+// the actual GTE and Tuple instructions optimized away:
 //
 //                    Tuple  // old init
 //                   /     \
@@ -1079,71 +965,28 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinct) {
   auto while_hlo = BuildWhileInstruction_InitPointsToNonDistinct();
   auto old_init = while_hlo->operand(0);
   InsertCopies(&module_);
-  auto new_init = while_hlo->operand(0);
-
-  // Check all paths from 'new_init' to 'old_init'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_init;
-
-  op_tree.op(1).opcode = HloOpcode::kTuple;
-
-  op_tree.op(1).op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).op(0).op(0).op(0).instruction = old_init;
 
-  op_tree.op(1).op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(1).op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(1).op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(1).op(0).op(0).op(0).instruction = old_init;
-
-  op_tree.Check(new_init);
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(old_init->operand(0)),
+                        op::Tuple(op::Copy(old_init->operand(1)->operand(0)),
+                                  op::Copy(old_init->operand(1)->operand(0)))));
 }
 
-// Tests while init instruction buffer which interfers with while result buffer.
+// Tests while init instruction buffer which interferes with while result buffer.
 //
 //     init_data = Broadcast(...)
 //     add_unrelated = Add(init_data) // takes a reference to cause interference
 //     init = Tuple(Constant(S32, {}), init_data))
 //
-// CopyInsertion pass should generate:
-//
-//                    Tuple  // old init
-//                   /    \
-//                GTE(0)  GTE(1)
-//                  |       |
-//                 Copy    Copy
-//                   \     /
-//                    Tuple  // new init
+// CopyInsertion pass should copy both operands.
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToInterfering) {
   auto while_hlo = BuildWhileInstruction_InitPointsToInterfering();
   auto old_init = while_hlo->operand(0);
   InsertCopies(&module_);
-  auto new_init = while_hlo->operand(0);
-
-  // Check all paths from 'new_init' to 'old_init'.
-  OperandTree op_tree;
-  op_tree.opcode = HloOpcode::kTuple;
-
-  op_tree.op(0).opcode = HloOpcode::kCopy;
-  op_tree.op(0).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(0).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(0).op(0).op(0).instruction = old_init;
-
-  op_tree.op(1).opcode = HloOpcode::kCopy;
-  op_tree.op(1).op(0).opcode = HloOpcode::kGetTupleElement;
-  op_tree.op(1).op(0).op(0).opcode = HloOpcode::kTuple;
-  op_tree.op(1).op(0).op(0).instruction = old_init;
 
-  op_tree.Check(new_init);
+  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
+                                               op::Copy(old_init->operand(1))));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index e9963528111994c4918861eaa52ab915fe34fd93..affb5f99066d8278c583c469d97e78646d52f3c6 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -53,13 +53,13 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/port:initialize",
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:copy_insertion",
         "//tensorflow/compiler/xla/service:executable",
+        "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
@@ -97,6 +97,7 @@ cc_library(
     name = "simple_orc_jit",
     srcs = ["simple_orc_jit.cc"],
     hdrs = ["simple_orc_jit.h"],
+    linkopts = ["-ldl"],
     deps = [
         ":compiler_functor",
         ":cpu_runtime",
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index d18141af83e4653e18d3b0118d0892f41db5b69b..b42702dbe1abe3db838159bda2665743e416a2d5 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -28,6 +29,8 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
+using ::testing::ElementsAre;
+
 class ConvCanonicalizationTest : public HloTestBase {
  public:
   ConvCanonicalizationTest() {
@@ -96,17 +99,14 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
 
   // The input is in CNHW order. input_reshape should produce
   // NHWC for the convolution to hit the Eigen fast path.
-  EXPECT_TRUE(ContainersEqual(input_reshape->dimensions(),
-                              std::vector<int64>({1, 2, 3, 0})));
+  EXPECT_THAT(input_reshape->dimensions(), ElementsAre(1, 2, 3, 0));
   // The kernel is in OIHW order. kernel_reshape should produce
   // HWIO for the convolution to hit the Eigen fast path.
-  EXPECT_TRUE(ContainersEqual(kernel_reshape->dimensions(),
-                              std::vector<int64>({2, 3, 1, 0})));
+  EXPECT_THAT(kernel_reshape->dimensions(), ElementsAre(2, 3, 1, 0));
   // The output of the canonical convolution is in NHWC order (the same as
   // input_reshape's order). output_reshape should restore that order to the
   // order of the computation root (CNHW).
-  EXPECT_TRUE(ContainersEqual(output_reshape->dimensions(),
-                              std::vector<int64>({3, 0, 1, 2})));
+  EXPECT_THAT(output_reshape->dimensions(), ElementsAre(3, 0, 1, 2));
 }
 
 TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index c5433d4b89d7ccab0f04e9ab2787ce150417b669..97458f0fcc344f42f6d7244b1f812e29666437e1 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -39,7 +39,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/port/initialize.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
@@ -58,6 +57,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
@@ -192,16 +192,16 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
   }
   // It is important to recurse for "while" or else we risk overly coarse
   // profiling information.
-  Status HandleWhile(HloInstruction* xla_while, HloInstruction* /*init*/,
-                     HloComputation* condition, HloComputation* body) override {
+  Status HandleWhile(HloInstruction* xla_while) override {
     TF_RETURN_IF_ERROR(DefaultAction(xla_while));
 
     CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_);
-    TF_RETURN_IF_ERROR(
-        condition->root_instruction()->Accept(&candidates_for_condition));
+    TF_RETURN_IF_ERROR(xla_while->while_condition()->root_instruction()->Accept(
+        &candidates_for_condition));
 
     CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_);
-    TF_RETURN_IF_ERROR(body->root_instruction()->Accept(&candidates_for_body));
+    TF_RETURN_IF_ERROR(xla_while->while_body()->root_instruction()->Accept(
+        &candidates_for_body));
 
     return Status::OK();
   }
@@ -210,9 +210,7 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
 };
 }  // namespace
 
-Status CpuCompiler::RunHloPasses(HloModule* hlo_module,
-                                 HloModuleConfig* module_config,
-                                 HloDumper dump_hlo) {
+Status CpuCompiler::RunHloPasses(HloModule* module, HloDumper dump_hlo) {
   // Optimization pipeline.
   HloPassPipeline pipeline("CPU", dump_hlo);
   pipeline.AddInvariantChecker<HloVerifier>();
@@ -232,12 +230,18 @@ Status CpuCompiler::RunHloPasses(HloModule* hlo_module,
     pass.AddPass<ReshapeMover>();
     pass.AddPass<HloConstantFolding>();
   }
-  pipeline.AddPass<TransposeFolding>(PotentiallyImplementedAsEigenDot);
-  pipeline.AddPass<HloSubcomputationUnification>();
+  pipeline.AddPass<TransposeFolding>(
+      [](const HloInstruction& dot,
+         const TransposeFolding::OperandIndices& candidate_operands) {
+        return PotentiallyImplementedAsEigenDot(dot)
+                   ? candidate_operands
+                   : TransposeFolding::OperandIndices{};
+      },
+      TransposeFolding::NeverFoldTranspose);
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
   pipeline.AddPass<CpuInstructionFusion>();
   pipeline.AddPass<CpuLayoutAssignment>(
-      module_config->mutable_entry_computation_layout());
+      module->mutable_config()->mutable_entry_computation_layout());
   // The LayoutAssignment pass may leave behind kCopy instructions which are
   // duplicate or NOPs, so remove them with algebraic simplification and CSE.
   pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
@@ -250,10 +254,13 @@ Status CpuCompiler::RunHloPasses(HloModule* hlo_module,
   if (flags->xla_cpu_parallel) {
     pipeline.AddPass<ParallelizationPreparation>();
   }
-  // Copy insertion should be performed immediately before IR emission to
-  // avoid inserting unnecessary copies (later pass adds an instruction which
-  // materializes the value) or missing a necessary copy (later pass removes
-  // an instruction which materializes a value).
+  // Copy insertion should be performed immediately before IR emission to avoid
+  // inserting unnecessary copies (later pass adds an instruction which
+  // materializes the value) or missing a necessary copy (later pass removes an
+  // instruction which materializes a value). DCE must be run immediately before
+  // (and sometime after) copy insertion, to avoid dead code from interfering
+  // with the rewrites.
+  pipeline.AddPass<HloDCE>();
   pipeline.AddPass<CopyInsertion>();
   if (flags->xla_cpu_parallel) {
     // Re-run the outlining, in case any copies were inserted into the entry
@@ -261,7 +268,8 @@ Status CpuCompiler::RunHloPasses(HloModule* hlo_module,
     pipeline.AddPass<ParallelizationPreparation>();
   }
   pipeline.AddPass<HloDCE>();
-  return pipeline.Run(hlo_module).status();
+  pipeline.AddPass<FlattenCallGraph>();
+  return pipeline.Run(module).status();
 }
 
 namespace {
@@ -295,8 +303,7 @@ llvm::CodeGenOpt::Level CodeGenOptLevel() {
 }  // namespace
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
-    std::unique_ptr<HloModule> hlo_module,
-    std::unique_ptr<HloModuleConfig> module_config, HloDumper dump_hlo,
+    std::unique_ptr<HloModule> module, HloDumper dump_hlo,
     se::StreamExecutor* stream_exec) {
   TF_RET_CHECK(stream_exec != nullptr);
 
@@ -304,17 +311,16 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
   auto llvm_context = MakeUnique<llvm::LLVMContext>();
   auto llvm_module =
       MakeUnique<llvm::Module>("__compute_module", *llvm_context);
-  auto jit = MakeUnique<SimpleOrcJIT>(CompilerTargetOptions(*module_config),
+  auto jit = MakeUnique<SimpleOrcJIT>(CompilerTargetOptions(module->config()),
                                       CodeGenOptLevel());
   llvm_module->setDataLayout(jit->data_layout());
   llvm_module->setTargetTriple(jit->target_triple().getTriple());
 
-  TF_RETURN_IF_ERROR(
-      RunHloPasses(hlo_module.get(), module_config.get(), dump_hlo));
+  TF_RETURN_IF_ERROR(RunHloPasses(module.get(), dump_hlo));
 
-  HloComputation* computation = hlo_module->entry_computation();
+  HloComputation* computation = module->entry_computation();
   std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
-  if (module_config->hlo_profiling_enabled()) {
+  if (module->config().hlo_profiling_enabled()) {
     TF_ASSIGN_OR_RETURN(
         hlo_to_profile_idx,
         CollectProfileCandidates::GetCandidatesForComputation(computation));
@@ -331,8 +337,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // uses data dependencies for determining order.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(hlo_module.get(),
-                            MakeUnique<DependencyHloOrdering>(hlo_module.get()),
+        BufferAssigner::Run(module.get(),
+                            MakeUnique<DependencyHloOrdering>(module.get()),
                             [this](const LogicalBuffer& buffer) {
                               return ShapeSizeBytes(buffer.shape());
                             },
@@ -363,13 +369,13 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       // The parallel preparation should have ensured that the top-level
       // computation consists solely of Call instructions.
       TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall)
-          << hlo_module->ToString();
+          << module->ToString();
       HloComputation* to_apply = instruction->to_apply();
       parallel_computations.emplace(to_apply, instruction);
     }
 
-    IrEmitter ir_emitter(*hlo_module, *module_config, *assignment,
-                         llvm_module.get(), &hlo_to_profile_idx);
+    IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
+                         &hlo_to_profile_idx);
     std::unique_ptr<std::map<HloInstruction*, string>> function_names(
         new std::map<HloInstruction*, string>());
     for (auto embedded_computation :
@@ -403,9 +409,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // JIT compile the LLVM IR module to in-memory machine code.
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new ParallelCpuExecutable(
-        std::move(jit), std::move(assignment), std::move(hlo_module),
-        std::move(module_config), std::move(function_names),
-        std::move(hlo_to_profile_idx), std::move(aligned_constants)));
+        std::move(jit), std::move(assignment), std::move(module),
+        std::move(function_names), std::move(hlo_to_profile_idx),
+        std::move(aligned_constants)));
 
     if (flags->xla_cpu_embed_ir) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -417,7 +423,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // and reduced memory usage (as compared to using DependencyHloOrdering).
     TF_ASSIGN_OR_RETURN(
         SequentialHloOrdering::HloModuleSequence module_sequence,
-        CreateMemoryMinimizingSequence(*hlo_module,
+        CreateMemoryMinimizingSequence(*module,
                                        [this](const LogicalBuffer& buffer) {
                                          return ShapeSizeBytes(buffer.shape());
                                        }));
@@ -426,20 +432,20 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // temporary buffers are required to run the computation.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(hlo_module.get(),
-                            MakeUnique<SequentialHloOrdering>(hlo_module.get(),
-                                                              module_sequence),
-                            [this](const LogicalBuffer& buffer) {
-                              return ShapeSizeBytes(buffer.shape());
-                            },
-                            kMemoryAlignment));
+        BufferAssigner::Run(
+            module.get(),
+            MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
+            [this](const LogicalBuffer& buffer) {
+              return ShapeSizeBytes(buffer.shape());
+            },
+            kMemoryAlignment));
 
     // Each computation is a single function.  Emit all embedded computations
     // before the entry computation. The order of computations returned from
     // GetEmbeddedComputations guarantees that a called computation occurs
     // before a caller computation.
-    IrEmitter ir_emitter(*hlo_module, *module_config, *assignment,
-                         llvm_module.get(), &hlo_to_profile_idx);
+    IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
+                         &hlo_to_profile_idx);
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
       TF_RETURN_IF_ERROR(
@@ -466,10 +472,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
 
     // JIT compile the LLVM IR module to in-memory machine code.
     jit->AddModule(std::move(llvm_module));
-    cpu_executable.reset(
-        new CpuExecutable(std::move(jit), std::move(assignment),
-                          std::move(hlo_module), std::move(module_config),
-                          function_name, std::move(hlo_to_profile_idx)));
+    cpu_executable.reset(new CpuExecutable(
+        std::move(jit), std::move(assignment), std::move(module), function_name,
+        std::move(hlo_to_profile_idx)));
 
     if (flags->xla_cpu_embed_ir) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -481,27 +486,24 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> CpuCompiler::Compile(
-    std::vector<std::unique_ptr<HloModule>> hlo_modules,
-    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    HloDumper dump_hlos, std::vector<se::StreamExecutor*> stream_execs) {
+    std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlos,
+    std::vector<se::StreamExecutor*> stream_execs) {
   return Unimplemented(
       "Compilation of multiple HLO modules is not yet supported on CPU.");
 }
 
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-CpuCompiler::CompileAheadOfTime(
-    std::vector<std::unique_ptr<HloModule>> hlo_modules,
-    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    HloDumper dump_hlo, const AotCompilationOptions& aot_options) {
-  TF_RET_CHECK(hlo_modules.size() == module_configs.size());
-  TF_RET_CHECK(!hlo_modules.empty());
+CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
+                                HloDumper dump_hlo,
+                                const AotCompilationOptions& aot_options) {
+  TF_RET_CHECK(!modules.empty());
 
   // We can pass just one llvm::TargetOptions when we compile the LLVM module,
   // so we bail if the configs have conflicting flags. At the moment, the only
   // flag that needs to be consistent is fast-math.
-  bool fast_math_disabled = module_configs[0]->fast_math_disabled();
-  for (const auto& module_config : module_configs) {
-    if (module_config->fast_math_disabled() != fast_math_disabled) {
+  bool fast_math_disabled = modules[0]->config().fast_math_disabled();
+  for (const auto& module : modules) {
+    if (module->config().fast_math_disabled() != fast_math_disabled) {
       return InvalidArgument(
           "All HLO module configs must have the same value for "
           "fast_math_disabled.");
@@ -559,7 +561,7 @@ CpuCompiler::CompileAheadOfTime(
   std::unique_ptr<llvm::TargetMachine> target_machine =
       WrapUnique(target->createTargetMachine(
           triple.getTriple(), cpu_name, features,
-          CompilerTargetOptions(*module_configs[0]), reloc_model,
+          CompilerTargetOptions(modules[0]->config()), reloc_model,
           llvm::CodeModel::Default, opt_level));
 
   // Compile must be thread-safe so create a new LLVM context for the module.
@@ -575,15 +577,14 @@ CpuCompiler::CompileAheadOfTime(
   }
 
   std::vector<std::unique_ptr<AotCompilationResult>> results;
-  for (size_t i = 0; i < hlo_modules.size(); ++i) {
-    HloModule* hlo_module = hlo_modules[i].get();
-    HloModuleConfig* module_config = module_configs[i].get();
+  for (size_t i = 0; i < modules.size(); ++i) {
+    HloModule* module = modules[i].get();
 
-    TF_RETURN_IF_ERROR(RunHloPasses(hlo_module, module_config, dump_hlo));
+    TF_RETURN_IF_ERROR(RunHloPasses(module, dump_hlo));
 
     TF_ASSIGN_OR_RETURN(
         SequentialHloOrdering::HloModuleSequence module_sequence,
-        CreateMemoryMinimizingSequence(*hlo_module,
+        CreateMemoryMinimizingSequence(*module,
                                        [this](const LogicalBuffer& buffer) {
                                          return ShapeSizeBytes(buffer.shape());
                                        }));
@@ -593,16 +594,15 @@ CpuCompiler::CompileAheadOfTime(
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
         BufferAssigner::Run(
-            hlo_module,
-            MakeUnique<SequentialHloOrdering>(hlo_module, module_sequence),
+            module, MakeUnique<SequentialHloOrdering>(module, module_sequence),
             [this](const LogicalBuffer& buffer) {
               return ShapeSizeBytes(buffer.shape());
             },
             kMemoryAlignment));
 
-    IrEmitter ir_emitter(*hlo_module, *module_config, *assignment, &llvm_module,
+    IrEmitter ir_emitter(*module, *assignment, &llvm_module,
                          /*hlo_to_profile_idx=*/nullptr);
-    HloComputation* computation = hlo_module->entry_computation();
+    HloComputation* computation = module->entry_computation();
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
       TF_RETURN_IF_ERROR(
@@ -672,8 +672,10 @@ int64 CpuCompiler::ShapeSizeBytes(const Shape& shape) const {
 }  // namespace cpu
 }  // namespace xla
 
-REGISTER_MODULE_INITIALIZER(cpu_compiler, {
+static bool InitModule() {
   xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() {
     return xla::MakeUnique<xla::cpu::CpuCompiler>();
   });
-});
+  return true;
+}
+static bool module_initialized = InitModule();
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index a32aa84ea51123f76551ad617cc914a53d4ca4d1..cadafa83320e17e6baddfc64dcaa8a988de6360d 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/macros.h"
@@ -113,21 +112,17 @@ class CpuCompiler : public Compiler {
   ~CpuCompiler() override {}
 
   StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> hlo_module,
-      std::unique_ptr<HloModuleConfig> module_config, HloDumper dump_hlo,
+      std::unique_ptr<HloModule> module, HloDumper dump_hlo,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> hlo_module,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_config,
-      HloDumper dump_hlo,
+      std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlo,
       std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      std::vector<std::unique_ptr<HloModule>> module,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_config,
-      HloDumper dump_hlo, const AotCompilationOptions& options) override;
+  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
+                     HloDumper dump_hlo,
+                     const AotCompilationOptions& options) override;
 
   perftools::gputools::Platform::Id PlatformId() const override;
 
@@ -139,8 +134,7 @@ class CpuCompiler : public Compiler {
 
   // Runs the HLO passes which are necessary for both optimizations and
   // correctness.
-  Status RunHloPasses(HloModule* hlo_module, HloModuleConfig* module_config,
-                      HloDumper dump_hlo);
+  Status RunHloPasses(HloModule* hlo_module, HloDumper dump_hlo);
 
   TF_DISALLOW_COPY_AND_ASSIGN(CpuCompiler);
 };
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 88283e6010ea784e2a977a80adbe6315782f7fdc..a4fcea7aec83fc64fa40fc28d4713a651290641c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
@@ -53,11 +52,9 @@ namespace cpu {
 CpuExecutable::CpuExecutable(
     std::unique_ptr<SimpleOrcJIT> jit,
     std::unique_ptr<BufferAssignment> assignment,
-    std::unique_ptr<HloModule> hlo_module,
-    std::unique_ptr<HloModuleConfig> module_config,
-    const string& entry_function_name,
+    std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
     std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx)
-    : Executable(std::move(hlo_module), std::move(module_config)),
+    : Executable(std::move(hlo_module)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)),
       hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index b04b4e8dd1fd23839a4684f72622e32eca9c3730..0cc0965ae1df6ab64a2f146e02b6e19b43ca81a5 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -52,7 +51,6 @@ class CpuExecutable : public Executable {
       std::unique_ptr<SimpleOrcJIT> jit,
       std::unique_ptr<BufferAssignment> assignment,
       std::unique_ptr<HloModule> hlo_module,
-      std::unique_ptr<HloModuleConfig> module_config,
       const string& entry_function_name,
       std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx);
   ~CpuExecutable() override {}
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index 240da35ef190eb7080947ab7d1da91d8d2dd8973..dc002846e9e6b07c767ddc8af939657c4c51bf23 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -24,6 +24,11 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
+  // Output fusion is not currently supported on CPUs.
+  if (producer->opcode() == HloOpcode::kFusion) {
+    return false;
+  }
+
   // Condition for consumer: must be elementwise or a fusion op
   // (which necessarily only contains elementwise operations)
   if (!(consumer->opcode() == HloOpcode::kFusion ||
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 8e06f0520edfb05c7ec606dcb8e85c5ef997c2c0..253de20f25127bf0ac23d5969e0f16c143396e47 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 
-#include <sched.h>
 #include <functional>
 
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 51c6dc4426f8c40d60ba933ce0a31f8fb9d927c1..2d81ba7882747f77ca93adf71a37172f5f2bff24 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -63,8 +63,8 @@ using llvm_ir::SetToFirstInsertPoint;
 namespace cpu {
 
 IrEmitter::IrEmitter(
-    const HloModule& hlo_module, const HloModuleConfig& hlo_module_config,
-    const BufferAssignment& assignment, llvm::Module* llvm_module,
+    const HloModule& hlo_module, const BufferAssignment& assignment,
+    llvm::Module* llvm_module,
     const std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx)
     : assignment_(assignment),
       module_(llvm_module),
@@ -72,8 +72,8 @@ IrEmitter::IrEmitter(
       ir_builder_(llvm_module->getContext()),
       hlo_to_profile_idx_(hlo_to_profile_idx),
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
-      hlo_module_config_(hlo_module_config) {
-  ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(hlo_module_config));
+      hlo_module_config_(hlo_module.config()) {
+  ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(hlo_module_config_));
 }
 
 StatusOr<llvm::Function*> IrEmitter::EmitComputation(
@@ -201,7 +201,8 @@ void IrEmitter::InitializeIrFunction(const string& function_name,
     if (&argument == retval) {
       continue;
     }
-    compute_function_->setDoesNotAlias(argument.getArgNo() + 1);
+    compute_function_->addAttribute(argument.getArgNo() + 1,
+                                    llvm::Attribute::NoAlias);
   }
 
   ir_builder_.SetInsertPoint(llvm::BasicBlock::Create(
@@ -1136,6 +1137,41 @@ Status IrEmitter::HandleSend(HloInstruction* send) {
   return Unimplemented("Send is not implemented on CPU. See b/33942983.");
 }
 
+Status IrEmitter::HandleSlice(HloInstruction* slice, HloInstruction* operand) {
+  if (ShapeUtil::IsScalar(slice->shape())) {
+    TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                        EmitTargetAddressForOp(slice));
+    emitted_value_[slice] = target_address;
+    return EmitMemcpy(*operand, *slice);
+  }
+  return DefaultAction(slice);
+}
+
+Status IrEmitter::HandleDynamicSlice(HloInstruction* dynamic_slice,
+                                     HloInstruction* operand,
+                                     HloInstruction* /*start_indices*/) {
+  if (ShapeUtil::IsScalar(dynamic_slice->shape())) {
+    TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                        EmitTargetAddressForOp(dynamic_slice));
+    emitted_value_[dynamic_slice] = target_address;
+    return EmitMemcpy(*operand, *dynamic_slice);
+  }
+  return DefaultAction(dynamic_slice);
+}
+
+Status IrEmitter::HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice,
+                                           HloInstruction* /*operand*/,
+                                           HloInstruction* update,
+                                           HloInstruction* /*start_indices*/) {
+  if (ShapeUtil::IsScalar(dynamic_update_slice->shape())) {
+    TF_ASSIGN_OR_RETURN(llvm::Value * target_address,
+                        EmitTargetAddressForOp(dynamic_update_slice));
+    emitted_value_[dynamic_update_slice] = target_address;
+    return EmitMemcpy(*update, *dynamic_update_slice);
+  }
+  return DefaultAction(dynamic_update_slice);
+}
+
 Status IrEmitter::HandleRecv(HloInstruction* recv) {
   // TODO(b/33942983): Support Send/Recv on CPU.
   return Unimplemented("Recv is not implemented on CPU. See b/33942983.");
@@ -1265,13 +1301,12 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
   }
 }
 
-Status IrEmitter::HandleCall(
-    HloInstruction* call, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    HloComputation* computation) {
+Status IrEmitter::HandleCall(HloInstruction* call) {
+  HloComputation* computation = call->to_apply();
   llvm::Function* call_ir_function = FindOrDie(emitted_functions_, computation);
 
   std::vector<llvm::Value*> parameter_addresses;
-  for (HloInstruction* operand : operands) {
+  for (const HloInstruction* operand : call->operands()) {
     parameter_addresses.push_back(GetEmittedValueFor(operand));
   }
 
@@ -1322,9 +1357,9 @@ Status IrEmitter::HandleCustomCall(
   return Status::OK();
 }
 
-Status IrEmitter::HandleWhile(HloInstruction* xla_while, HloInstruction* init,
-                              HloComputation* condition, HloComputation* body) {
+Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   // Precondition: Condition computation must return a scalar bool.
+  HloComputation* condition = xla_while->while_condition();
   TF_RET_CHECK(ShapeUtil::IsScalar(condition->root_instruction()->shape()) &&
                condition->root_instruction()->shape().element_type() == PRED)
       << "While condition computation must return bool";
@@ -1361,12 +1396,14 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while, HloInstruction* init,
       }));
 
   // Set emitted value to that of 'init' with which it shares an allocation.
+  const HloInstruction* init = xla_while->operand(0);
   emitted_value_[xla_while] = GetEmittedValueFor(init);
 
   // The called computation should have been emitted previously.
   llvm::Function* condition_ir_function =
       FindOrDie(emitted_functions_, condition);
-  llvm::Function* body_ir_function = FindOrDie(emitted_functions_, body);
+  llvm::Function* body_ir_function =
+      FindOrDie(emitted_functions_, xla_while->while_body());
 
   // Generating:
   //   while (Condition(while_result)) {
@@ -1710,8 +1747,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetAddressForOp(
       llvm::AttrBuilder attr_builder;
       attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape));
       attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape));
-      retval->addAttr(llvm::AttributeList::get(
-          retval->getContext(), retval->getArgNo() + 1, attr_builder));
+      retval->addAttrs(attr_builder);
     }
     return ir_builder_.CreateBitCast(retval,
                                      IrShapeType(target_shape)->getPointerTo());
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 66bae457e3741332f23abc7d54b8d775aa193ca9..b564b359b07a6ca52193bd0c5934f8563a00346c 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -60,8 +60,8 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // llvm_module: the LLVM module to emit IR into.
   // hlo_to_profile_idx: the mapping from HLO to its index in the profiling
   //                     array.
-  IrEmitter(const HloModule& hlo_module, const HloModuleConfig& module_config,
-            const BufferAssignment& assignment, llvm::Module* llvm_module,
+  IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment,
+            llvm::Module* llvm_module,
             const std::unordered_map<const HloInstruction*, size_t>*
                 hlo_to_profile_idx);
   ~IrEmitter() override;
@@ -114,6 +114,15 @@ class IrEmitter : public DfsHloVisitorWithDefault {
                             HloComputation* function) override;
   Status HandleSelectAndScatter(HloInstruction* instruction) override;
   Status HandleSend(HloInstruction* send) override;
+  Status HandleSlice(HloInstruction* slice,
+                     HloInstruction* /*operand*/) override;
+  Status HandleDynamicSlice(HloInstruction* dynamic_slice,
+                            HloInstruction* /*operand*/,
+                            HloInstruction* /*start_indices*/) override;
+  Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice,
+                                  HloInstruction* /*operand*/,
+                                  HloInstruction* /*update*/,
+                                  HloInstruction* /*start_indices*/) override;
   Status HandleRecv(HloInstruction* recv) override;
   Status HandlePad(HloInstruction* pad) override;
   Status HandleTuple(
@@ -125,14 +134,11 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       HloComputation* function,
       tensorflow::gtl::ArraySlice<HloInstruction*> static_operands) override;
   Status HandleFusion(HloInstruction* fusion) override;
-  Status HandleCall(HloInstruction* call,
-                    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-                    HloComputation* computation) override;
+  Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call,
                           tensorflow::gtl::ArraySlice<HloInstruction*> operands,
                           tensorflow::StringPiece custom_call_target) override;
-  Status HandleWhile(HloInstruction* xla_while, HloInstruction* init,
-                     HloComputation* condition, HloComputation* body) override;
+  Status HandleWhile(HloInstruction* xla_while) override;
   Status FinishVisit(HloInstruction* root) override;
 
   Status Preprocess(HloInstruction* hlo) override;
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index 7a4723e8d75588d8ccb711892b4082024695e444..5f7b2c663f7a6a554afda17702160e70ce4e04a0 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -58,12 +57,11 @@ ParallelCpuExecutable::ParallelCpuExecutable(
     std::unique_ptr<SimpleOrcJIT> jit,
     std::unique_ptr<BufferAssignment> assignment,
     std::unique_ptr<HloModule> hlo_module,
-    std::unique_ptr<HloModuleConfig> module_config,
     std::unique_ptr<std::map<HloInstruction*, string>> function_names,
     std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
     std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
         aligned_constants)
-    : Executable(std::move(hlo_module), std::move(module_config)),
+    : Executable(std::move(hlo_module)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)),
       functions_names_(std::move(function_names)),
@@ -146,7 +144,7 @@ Status ParallelCpuExecutable::AllocateBuffers(
 }
 
 Status ParallelCpuExecutable::ExecuteComputeFunctions(
-    const ExecutableRunOptions* run_options,
+    const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
@@ -160,7 +158,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
 }
 
 Status ParallelCpuExecutable::ExecuteComputeFunctions(
-    const ExecutableRunOptions* run_options,
+    const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
@@ -214,7 +212,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
 
   void** temps_array = buffer_pointers.data();
   uint64* profile_counters_array = profile_counters.data();
-  auto* thread_pool = CHECK_NOTNULL(run_options->inter_op_thread_pool());
+  auto* thread_pool = CHECK_NOTNULL(run_options->xla_intra_op_thread_pool());
   tensorflow::mutex completion_queue_lock;
   tensorflow::condition_variable completion_queue_cv;
   std::deque<HloInstruction*> completion_queue;
@@ -251,11 +249,12 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
                      });
       auto function = FindOrDie(functions, instruction);
       // The thread pool entry takes ownership of |operand_buffers|.
+      const auto* exec_run_options = &run_options->run_options();
       thread_pool->Schedule([instruction, &completion_queue,
                              &completion_queue_lock, &completion_queue_cv,
-                             result_buffer, run_options, operand_buffers,
+                             result_buffer, exec_run_options, operand_buffers,
                              temps_array, profile_counters_array, function] {
-        function(result_buffer, run_options, operand_buffers, temps_array,
+        function(result_buffer, exec_run_options, operand_buffers, temps_array,
                  profile_counters_array);
         delete[] operand_buffers;
         // Push the completed HLO instruction on the queue, the main thread
@@ -345,9 +344,8 @@ ParallelCpuExecutable::ExecuteOnStream(
   const BufferAllocation::Index result_index = result_slice.index();
   VLOG(3) << "result index: " << result_index;
 
-  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(&run_options->run_options(),
-                                             arguments, device_allocations,
-                                             hlo_execution_profile));
+  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(
+      run_options, arguments, device_allocations, hlo_execution_profile));
 
   // Mark the buffers that are actually live (used in the output) when the
   // computation finishes executing.
@@ -400,8 +398,8 @@ StatusOr<std::unique_ptr<ShapedBuffer>> ParallelCpuExecutable::ExecuteOnStream(
   TF_RETURN_IF_ERROR(AllocateBuffers(
       memory_allocator, stream->parent()->device_ordinal(), &buffers));
 
-  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(
-      &run_options->run_options(), arguments, buffers, hlo_execution_profile));
+  TF_RETURN_IF_ERROR(ExecuteComputeFunctions(run_options, arguments, buffers,
+                                             hlo_execution_profile));
 
   // Copy DeviceMemoryBase values which contain the array(s) of the result into
   // the respective location in ShapedBuffer which is returned to the caller.
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index 7223de9f0798365138cdb26ca9dce07cd0e474e3..a3278c9510e9661f53ecbc729aa500b3636d3f6d 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -52,7 +51,6 @@ class ParallelCpuExecutable : public Executable {
       std::unique_ptr<SimpleOrcJIT> jit,
       std::unique_ptr<BufferAssignment> assignment,
       std::unique_ptr<HloModule> hlo_module,
-      std::unique_ptr<HloModuleConfig> module_config,
       std::unique_ptr<std::map<HloInstruction*, string>> instruction_functions,
       std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
       std::unordered_map<const HloInstruction*,
@@ -96,14 +94,14 @@ class ParallelCpuExecutable : public Executable {
   // Calls the generated functions in 'function_names_', performing the
   // computation with the given arguments using the supplied buffers.
   Status ExecuteComputeFunctions(
-      const ExecutableRunOptions* run_options,
+      const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           arguments,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           buffers,
       HloExecutionProfile* hlo_execution_profile);
   Status ExecuteComputeFunctions(
-      const ExecutableRunOptions* run_options,
+      const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           buffers,
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
index 677080a8623224cdd65e35b3116ae57b7b3b3ca2..ee772f5c3967b6671f3d89c8ee3034e78501018b 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
@@ -54,7 +54,7 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
   int lhs_contract_dim = transpose_lhs ? 0 : 1;
   int rhs_contract_dim = transpose_rhs ? 1 : 0;
   const Eigen::array<DimPair, 1> dims(
-      DimPair(lhs_contract_dim, rhs_contract_dim));
+      {DimPair(lhs_contract_dim, rhs_contract_dim)});
 
   // Matrix multiply is a special case of the "contract" operation where
   // the contraction is performed along dimension 1 of the lhs and dimension
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
index 384a978873de89526f43556296aaa51c46ac1d3f..6f1c97a2334e08a5ea62b9b7837aa83fa3cde631 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
@@ -48,7 +48,7 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
   int lhs_contract_dim = transpose_lhs ? 0 : 1;
   int rhs_contract_dim = transpose_rhs ? 1 : 0;
   const Eigen::array<DimPair, 1> dims(
-      DimPair(lhs_contract_dim, rhs_contract_dim));
+      {DimPair(lhs_contract_dim, rhs_contract_dim)});
 
   // Matrix multiply is a special case of the "contract" operation where
   // the contraction is performed along dimension 1 of the lhs and dimension
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 8beb565ab3e220f9b9eebac836c8de8c1fc2e8ee..7c74912a7ab9c388c9911fe8194f268623f0abd1 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -112,13 +112,25 @@ llvm::SmallVector<std::string, 0> DetectMachineAttributes() {
   if (llvm::sys::getHostCPUFeatures(host_features)) {
     for (auto &feature : host_features) {
       if (feature.second) {
-        result.push_back(feature.first());
+        llvm::StringRef feature_name = feature.first();
+        // Skip avx512 for now, it isn't quite ready in LLVM.
+        if (feature_name.startswith("avx512")) {
+          continue;
+        }
+        result.push_back(feature_name);
       }
     }
   }
   return result;
 }
 
+llvm::StringRef GetHostCpuName() {
+  auto cpu_name = llvm::sys::getHostCPUName();
+  // Skip avx512 for now, it isn't quite ready in LLVM.
+  cpu_name.consume_back("-avx512");
+  return cpu_name;
+}
+
 CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() {
   CompilerFunctor::VectorIntrinsics intrinsics;
   intrinsics.sse_intrinsics = (&runtime::ExpV4F32 != nullptr);
@@ -136,13 +148,16 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions &target_options,
                             .setOptLevel(opt_level)
                             .selectTarget(
                                 /*TargetTriple=*/llvm::Triple(), /*MArch=*/"",
-                                /*MCPU=*/llvm::sys::getHostCPUName(),
+                                /*MCPU=*/GetHostCpuName(),
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
       compile_layer_(object_layer_,
                      CompilerFunctor(target_machine_.get(), &disassembler_,
-                                     opt_level, GetAvailableIntrinsics())) {}
+                                     opt_level, GetAvailableIntrinsics())) {
+  VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str()
+          << " features: " << target_machine_->getTargetFeatureString().str();
+}
 
 SimpleOrcJIT::ModuleHandleT SimpleOrcJIT::AddModule(
     std::unique_ptr<llvm::Module> module) {
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 351efa82dd21dd9f618ed38cdb54bd2e26fcd5d5..49e9874cda2dd4cc5087b2467442d44bc0245734 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -189,19 +189,16 @@ class DfsHloVisitor {
   virtual Status HandleTranspose(HloInstruction* transpose) = 0;
   virtual Status HandleParameter(HloInstruction* parameter) = 0;
   virtual Status HandleFusion(HloInstruction* fusion) = 0;
-  virtual Status HandleCall(
-      HloInstruction* call,
-      tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      HloComputation* computation) = 0;
+  virtual Status HandleCall(HloInstruction* call) = 0;
   virtual Status HandleCustomCall(
       HloInstruction* custom_call,
       tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       tensorflow::StringPiece custom_call_target) = 0;
   virtual Status HandleSlice(HloInstruction* slice,
                              HloInstruction* operand) = 0;
-  virtual Status HandleDynamicSlice(
-      HloInstruction* slice,
-      tensorflow::gtl::ArraySlice<HloInstruction*> operands) = 0;
+  virtual Status HandleDynamicSlice(HloInstruction* dynamic_slice,
+                                    HloInstruction* operand,
+                                    HloInstruction* start_indices) = 0;
   virtual Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice,
                                           HloInstruction* operand,
                                           HloInstruction* update,
@@ -219,9 +216,7 @@ class DfsHloVisitor {
                                     const Window& window,
                                     HloComputation* function) = 0;
   virtual Status HandleSelectAndScatter(HloInstruction* instruction) = 0;
-  virtual Status HandleWhile(HloInstruction* xla_while, HloInstruction* init,
-                             HloComputation* condition,
-                             HloComputation* body) = 0;
+  virtual Status HandleWhile(HloInstruction* xla_while) = 0;
 
   virtual Status HandlePad(HloInstruction* pad) = 0;
 
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 18cfaf83e1cd558928c9fc65452524567f3cbb49..c27710fbdb2cb01776137370a61541c7e44c66c7 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -121,9 +121,7 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor {
   Status HandleFusion(HloInstruction* fusion) override {
     return DefaultAction(fusion);
   }
-  Status HandleCall(HloInstruction* call,
-                    tensorflow::gtl::ArraySlice<HloInstruction*> /*operands*/,
-                    HloComputation* /*computation*/) override {
+  Status HandleCall(HloInstruction* call) override {
     return DefaultAction(call);
   }
   Status HandleCustomCall(
@@ -136,10 +134,10 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor {
                      HloInstruction* /*operand*/) override {
     return DefaultAction(slice);
   }
-  Status HandleDynamicSlice(
-      HloInstruction* slice,
-      tensorflow::gtl::ArraySlice<HloInstruction*> /*operands*/) override {
-    return DefaultAction(slice);
+  Status HandleDynamicSlice(HloInstruction* dynamic_slice,
+                            HloInstruction* /*operand*/,
+                            HloInstruction* /*start_indices*/) override {
+    return DefaultAction(dynamic_slice);
   }
   Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice,
                                   HloInstruction* /*operand*/,
@@ -188,9 +186,7 @@ class DfsHloVisitorWithDefault : public DfsHloVisitor {
   Status HandleTranspose(HloInstruction* transpose) override {
     return DefaultAction(transpose);
   }
-  Status HandleWhile(HloInstruction* xla_while, HloInstruction* /*init*/,
-                     HloComputation* /*condition*/,
-                     HloComputation* /*body*/) override {
+  Status HandleWhile(HloInstruction* xla_while) override {
     return DefaultAction(xla_while);
   }
   Status HandleSend(HloInstruction* send) override {
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index a04815dad94484a6f01ebd27d3ec73f547086722..bea1da4044669f5e910af09ba1b65416a69367b5 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -240,14 +240,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
       return ir_builder_->CreateFDiv(lhs_value, rhs_value);
     case HloOpcode::kRemainder:
       return ir_builder_->CreateFRem(lhs_value, rhs_value);
-
-    // The 'O' prefix on the LLVM ops means "ordered" compare where comparisons
-    // with NAN always return false.
+    // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered
+    // comparisons always return false when one of the operands is NaN, whereas
+    // unordered comparisons return true.
+    //
+    // We use ordered comparisons for everything except kNe, where we use an
+    // unordered comparison.  This makes x != y equivalent to !(x == y), and
+    // matches C++'s semantics.
     case HloOpcode::kEq:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, lhs_value,
                                      rhs_value, ir_builder_);
     case HloOpcode::kNe:
-      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_ONE, lhs_value,
+      return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, lhs_value,
                                      rhs_value, ir_builder_);
     case HloOpcode::kLt:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLT, lhs_value,
@@ -739,11 +743,11 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
           const HloInstruction* operand = hlo->operand(operand_idx);
           auto true_block = llvm_ir::CreateBasicBlock(
               exit_block, tensorflow::strings::StrCat(
-                              "concat_index_from_operand", operand_idx),
+                      "concat_index_from_operand", operand_idx),
               ir_builder_);
           auto false_block = llvm_ir::CreateBasicBlock(
               exit_block, tensorflow::strings::StrCat(
-                              "concat_index_not_from_operand", operand_idx),
+                      "concat_index_not_from_operand", operand_idx),
               ir_builder_);
           auto concat_dim_size =
               llvm::ConstantInt::get(source_index[concat_dim]->getType(),
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index eb36aba33a7694c43985b5e5636e7e0fa2ad4794..5a65f829fcd1e854c266b2d958a8f3d6408b87d4 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -41,10 +40,8 @@ namespace xla {
 // interface that is used for launching compiled programs across platforms.
 class Executable {
  public:
-  explicit Executable(std::unique_ptr<HloModule> hlo_module,
-                      std::unique_ptr<HloModuleConfig> module_config)
-      : hlo_module_(std::move(hlo_module)),
-        module_config_(std::move(module_config)) {}
+  explicit Executable(std::unique_ptr<HloModule> hlo_module)
+      : hlo_module_(std::move(hlo_module)) {}
   virtual ~Executable() {}
 
   // Enqueues the compilation result on the provided stream, passing the given
@@ -98,15 +95,17 @@ class Executable {
   // enabled. If not, the caller should not expect an hlo_execution_profile
   // passed to ExecuteOnStream above to be populated during execution.
   bool hlo_profiling_enabled() const {
-    return module_config_->hlo_profiling_enabled();
+    return hlo_module_->config().hlo_profiling_enabled();
   }
 
   const HloModule& module() const { return *hlo_module_; }
 
-  const HloModuleConfig& module_config() const { return *module_config_; }
+  const HloModuleConfig& module_config() const { return hlo_module_->config(); }
 
   // Returns whether this executable has an associated HloModuleConfig.
-  bool has_module_config() const { return module_config_ != nullptr; }
+  bool has_module_config() const {
+    return hlo_module_ != nullptr && hlo_module_->has_config();
+  }
 
   // Returns the versioned computation handle of the computation computed by
   // this executable.
@@ -117,7 +116,7 @@ class Executable {
   // The shape (including layout) that results from this execution. This is the
   // shape of the DeviceMemoryBase result value in ExecuteOnStream above.
   const Shape& result_shape() const {
-    return module_config_->entry_computation_layout().result_shape();
+    return hlo_module_->config().entry_computation_layout().result_shape();
   }
 
   // Dumping helpers.
@@ -143,10 +142,6 @@ class Executable {
   // around.
   std::unique_ptr<HloModule> hlo_module_;
 
-  // The configuration used to build this executable (parameter layouts, result
-  // layout, profiling enabled, etc).
-  std::unique_ptr<HloModuleConfig> module_config_;
-
   // SessionModule this was compiled from. Null if not dumping executions.
   std::unique_ptr<SessionModule> session_module_;
 
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..297a4f7599f9c127386b2f53f7ffb987befc456e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
+
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+// Helper to replace the called computation at a while- or call-instruction.
+void ReplaceCalledComputation(HloInstruction* instruction,
+                              HloComputation* computation,
+                              HloComputation* new_computation) {
+  switch (instruction->opcode()) {
+    case HloOpcode::kWhile: {
+      if (computation == instruction->while_condition()) {
+        instruction->set_while_condition(new_computation);
+      } else {
+        CHECK_EQ(computation, instruction->while_body());
+        instruction->set_while_body(new_computation);
+      }
+      break;
+    }
+    case HloOpcode::kCall: {
+      CHECK_EQ(instruction->to_apply(), computation);
+      instruction->set_to_apply(new_computation);
+      break;
+    }
+    default:
+      LOG(FATAL) << "unexpected opcode: "
+                 << HloOpcodeString(instruction->opcode());
+  }
+}
+
+// Flatten a single call graph node. Expects to visit nodes in postorder.
+Status FlattenNode(const CallGraphNode& node) {
+  HloComputation* computation = node.computation();
+  HloModule* module = computation->parent();
+  // Clone callee for all call-sites except the first one.
+  for (int i = 0; i < node.caller_callsites().size(); ++i) {
+    CallSite call_site = node.caller_callsites()[i];
+    // Only consider sequential call contexts.
+    if (call_site.context() == CallContext::kParallel) {
+      continue;
+    }
+    CHECK_EQ(call_site.context(), CallContext::kSequential);
+
+    // Skip first element if this computation is only called from a sequential
+    // context.
+    if (node.context() != CallContext::kBoth && i == 0) {
+      continue;
+    }
+
+    // Clone computation for the remaining sequential context call sites.
+    HloComputation* clone =
+        module->AddEmbeddedComputation(computation->Clone());
+    ReplaceCalledComputation(call_site.instruction(), computation, clone);
+    // Clone the sub-tree of all computations called from this node.
+    std::vector<HloComputation*> worklist;
+    worklist.push_back(clone);
+    while (!worklist.empty()) {
+      auto current = worklist.back();
+      worklist.pop_back();
+      for (auto& instruction : current->instructions()) {
+        if (GetInstructionCallContext(instruction.get()) !=
+            CallContext::kSequential) {
+          continue;
+        }
+        for (auto callee : instruction->called_computations()) {
+          HloComputation* callee_clone =
+              module->AddEmbeddedComputation(callee->Clone());
+          ReplaceCalledComputation(instruction.get(), callee, callee_clone);
+          worklist.push_back(callee_clone);
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+StatusOr<bool> FlattenCallGraph::Run(HloModule* module) {
+  XLA_VLOG_LINES(3, "Before flatten call graph:\n" + module->ToString());
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  TF_RETURN_IF_ERROR(call_graph->VisitNodes(FlattenNode));
+
+  XLA_VLOG_LINES(3, "After flatten call graph:\n" + module->ToString());
+  return true;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.h b/tensorflow/compiler/xla/service/flatten_call_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3efab3614912e4b0c2c8aa3b80277c326382ed0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.h
@@ -0,0 +1,40 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Flatten the call graph for an HLO module into a tree.
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_FLATTEN_CALL_GRAPH_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_FLATTEN_CALL_GRAPH_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Flattening associates each call site with a unique computation (for
+// sequential calling contexts) This simplifies buffer assignment and
+// points-to analysis (see b/36865746 for details).
+class FlattenCallGraph : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override { return "flatten-call-graph"; }
+
+  // Duplicates computations called from multiple call- or while-nodes to
+  // flatten the call graph.
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_FLATTEN_CALL_GRAPH_H_
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e03a96fb3f03710cd3062a79aa4955311cf19c1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -0,0 +1,227 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class FlattenCallGraphTest : public HloTestBase {
+ protected:
+  // Build and return a trivial computation taking and returning a scalar.
+  std::unique_ptr<HloComputation> MakeScalarComputation() {
+    HloComputation::Builder builder(TestName() + ".ScalarComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateUnary(kScalarShape, HloOpcode::kNegate, param0));
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and maps (kMap) the
+  // given computation to the value 'callsites' number of times.
+  std::unique_ptr<HloComputation> MakeMappingComputation(
+      HloComputation* map_computation, int64 callsites) {
+    HloComputation::Builder builder(TestName() + ".MappingComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* last_value = param0;
+    for (int64 i = 0; i < callsites; ++i) {
+      last_value = builder.AddInstruction(HloInstruction::CreateMap(
+          kScalarShape, {last_value}, map_computation));
+    }
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and calls (kCall) the
+  // given computation with value 'callsites' number of times.
+  std::unique_ptr<HloComputation> MakeCallingComputation(
+      HloComputation* callee_computation, int64 callsites,
+      const string& suffix = ".CallingComputation") {
+    HloComputation::Builder builder(TestName() + suffix);
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* last_value = param0;
+    for (int64 i = 0; i < callsites; ++i) {
+      last_value = builder.AddInstruction(HloInstruction::CreateCall(
+          kScalarShape, {last_value}, callee_computation));
+    }
+    return builder.Build();
+  }
+
+  // Build and return a computation which takes a scalar and returns a PRED
+  // value.
+  std::unique_ptr<HloComputation> MakeConditionComputation() {
+    HloComputation::Builder builder(TestName() + ".ConditionComputation");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* zero = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, param0, zero));
+    return builder.Build();
+  }
+
+  StatusOr<bool> RunFlattenCallGraph(HloModule* module) {
+    FlattenCallGraph flatten;
+    TF_ASSIGN_OR_RETURN(bool result, flatten.Run(module));
+    return result;
+  }
+
+  const Shape kScalarShape = ShapeUtil::MakeShape(F32, {});
+};
+
+TEST_F(FlattenCallGraphTest, ComplexGraph) {
+  // Test a call graph of a module with several computation called in various
+  // contexts. The call graph looks like:
+  //
+  //      entry
+  //      /  |
+  //     a   |
+  //   / | \ |
+  //  b  |  cond
+  //   \ |
+  //    c
+  //
+  // Calls are made via kCall, kWhile, and kMap instructions.
+  HloModule module(TestName());
+  HloComputation* cond_computation =
+      module.AddEmbeddedComputation(MakeConditionComputation());
+  HloComputation* c_computation =
+      module.AddEmbeddedComputation(MakeScalarComputation());
+  HloComputation* b_computation = module.AddEmbeddedComputation(
+      MakeMappingComputation(c_computation, /*callsites=*/1));
+
+  HloComputation* a_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".a");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    HloInstruction* call = builder.AddInstruction(
+        HloInstruction::CreateCall(kScalarShape, {param0}, c_computation));
+    builder.AddInstruction(HloInstruction::CreateWhile(
+        kScalarShape, cond_computation, b_computation, call));
+    a_computation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  HloComputation* entry_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".entry");
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, kScalarShape, "param0"));
+    builder.AddInstruction(HloInstruction::CreateWhile(
+        kScalarShape, cond_computation, a_computation, param0));
+    entry_computation = module.AddEntryComputation(builder.Build());
+  }
+
+  {
+    TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(&module));
+    EXPECT_TRUE(result);
+    std::unique_ptr<CallGraph> flat_call_graph = CallGraph::Build(&module);
+    const CallGraphNode& c_node = flat_call_graph->GetNode(c_computation);
+    EXPECT_EQ(1, c_node.caller_callsites().size());
+  }
+}
+
+// Test corner case of a computation used as a body and a loop condition.
+TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
+  HloModule module(TestName());
+  HloComputation* cond_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".cond");
+    HloInstruction* param0 =
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            0, ShapeUtil::MakeShape(PRED, {}), "param0"));
+    HloInstruction* false_constant = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
+                                     HloOpcode::kEq, param0, false_constant));
+    cond_computation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  HloComputation* entry_computation;
+  {
+    HloComputation::Builder builder(TestName() + ".entry");
+    HloInstruction* false_constant = builder.AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+    builder.AddInstruction(HloInstruction::CreateWhile(
+        ShapeUtil::MakeShape(PRED, {}), cond_computation, cond_computation,
+        false_constant));
+    entry_computation = module.AddEntryComputation(builder.Build());
+  }
+
+  {
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
+    const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
+    EXPECT_EQ(2, cond_node.caller_callsites().size());
+  }
+
+  {
+    TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(&module));
+    EXPECT_TRUE(result);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
+    const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
+    EXPECT_EQ(1, cond_node.caller_callsites().size());
+  }
+}
+
+// Test flattening of a nested calling computations.
+//
+//   Entry
+//    / \
+//    \ /
+//     B
+//    / \
+//    \ /
+//     C
+//
+TEST_F(FlattenCallGraphTest, FlattenCalls) {
+  HloModule module(TestName());
+  HloComputation* c_computation =
+      module.AddEmbeddedComputation(MakeScalarComputation());
+
+  HloComputation* b_computation = module.AddEmbeddedComputation(
+      MakeCallingComputation(c_computation, /*callsites=*/2, ".B"));
+
+  module.AddEntryComputation(
+      MakeCallingComputation(b_computation, /*callsites=*/2, ".Entry"));
+
+  TF_ASSIGN_OR_ASSERT_OK(bool result, RunFlattenCallGraph(&module));
+  EXPECT_TRUE(result);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(&module);
+  EXPECT_EQ(7, module.computations().size());
+
+  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
+  EXPECT_EQ(1, c_node.caller_callsites().size());
+
+  const CallGraphNode& b_node = call_graph->GetNode(b_computation);
+  EXPECT_EQ(1, b_node.caller_callsites().size());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 9de6d65a27bfcb6747d59eac75f8b13debba0ebd..d26f415fd4bdfec597c70b760942cc406a0d6cfa 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -264,6 +264,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+        "//tensorflow/core/platform/default/build_config:cudnn_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",
     ],
 )
@@ -425,6 +427,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:executable",
+        "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
@@ -529,14 +532,10 @@ cc_test(
     deps = [
         ":instruction_fusion",
         ":while_transformer",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/service:copy_insertion",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index f6b7fe1e8ef10e4e66018d887707e587ecfa3465..94acf5a35945a33048038bfae67d46c38a07ef8d 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -125,7 +125,7 @@ tensorflow::Status ConvolutionThunk::ExecuteOnStream(
   CHECK_LE(num_dimensions, 3);
   // cuDNN does not support 1D convolutions. We therefore express 1D
   // convolutions as 2D convolutions where the first spatial dimension is 1.
-  // This matches the behaviour of TF (see definition of conv1d in
+  // This matches the behavior of TF (see definition of conv1d in
   // tensorflow/python/ops/nn_ops.py).
   const int effective_num_dimensions = std::max(2, num_dimensions);
 
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 1667ab36792c91cbbf3c6396a673bedff2208045..e57eb0bdee64948290d5eaf15965afcdc8bea0ad 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -113,7 +113,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitMathCall(
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
     tensorflow::gtl::ArraySlice<PrimitiveType> input_types,
     PrimitiveType output_type) const {
-  // Binary math functions tranform are of type [T] -> T.
+  // Binary math functions transform are of type [T] -> T.
   for (PrimitiveType input_type : input_types) {
     if (output_type != input_type) {
       return Unimplemented("Input type ≠ output type: %s ≠ %s",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index f692f28bd9858ab809732389fcc2908b8fa66a42..b616d958b96c41e9b9021bf375d51a32ef73ceb9 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
+#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
@@ -133,8 +134,13 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module,
       pass.AddPass<HloConstantFolding>();
     }
     pipeline.AddPass<ConvolutionFolding>();
-    pipeline.AddPass<TransposeFolding>(ImplementedAsGemm);
-    pipeline.AddPass<HloSubcomputationUnification>();
+    pipeline.AddPass<TransposeFolding>(
+        [](const HloInstruction& dot,
+           const TransposeFolding::OperandIndices& candidate_operands) {
+          return ImplementedAsGemm(dot) ? candidate_operands
+                                        : TransposeFolding::OperandIndices{};
+        },
+        TransposeFolding::NeverFoldTranspose);
     pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
     pipeline.AddPass<HloDCE>();
     TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
@@ -172,16 +178,20 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   // Copy insertion should be performed immediately before IR emission to avoid
   // inserting unnecessary copies (later pass adds an instruction which
   // materializes the value) or missing a necessary copy (later pass removes an
-  // instruction which materializes a value).
+  // instruction which materializes a value). DCE must be run immediately before
+  // (and sometime after) copy insertion, to avoid dead code from interfering
+  // with the rewrites.
+  pipeline.AddPass<HloDCE>();
   pipeline.AddPass<GpuCopyInsertion>();
   pipeline.AddPass<HloDCE>();
+  pipeline.AddPass<FlattenCallGraph>();
   return pipeline.Run(hlo_module).status();
 }
 
 // Invokes the ptxas tool on the given PTX string, and dumps its output.
 void DumpPtxasInfo(const string& ptx) {
-  legacy_flags::GpuCompilerFlags* flags = legacy_flags::GetGpuCompilerFlags();
-  const string ptxas_path = flags->xla_ptxas_path;
+  const string ptxas_path =
+      tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
   // Do not log PTX stats if ptxas is not found at the given path.
   if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
     LOG(WARNING)
@@ -222,15 +232,14 @@ GpuCompiler::GpuCompiler()
       pointer_size_(llvm::DataLayout(kDataLayout).getPointerSize()) {}
 
 StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
-    std::unique_ptr<HloModule> hlo_module,
-    std::unique_ptr<HloModuleConfig> module_config, HloDumper dump_hlo,
+    std::unique_ptr<HloModule> module, HloDumper dump_hlo,
     se::StreamExecutor* stream_exec) {
   TF_RET_CHECK(stream_exec != nullptr);
 
-  TF_RETURN_IF_ERROR(OptimizeHloModule(hlo_module.get(), dump_hlo,
+  TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(), dump_hlo,
                                        stream_exec->GetDeviceDescription()));
-  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(dump_hlo, hlo_module.get(),
-                                                   module_config.get()));
+  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(dump_hlo, module.get(),
+                                                   module->mutable_config()));
 
   llvm::LLVMContext llvm_context;
   std::string buffer;
@@ -243,7 +252,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   };
   llvm_context.setDiagnosticHandler(DiagnosticHandler, &printer);
 
-  llvm::Module llvm_module(hlo_module->name().c_str(), llvm_context);
+  llvm::Module llvm_module(module->name().c_str(), llvm_context);
   // Set the target triple and the data layout.
   llvm_module.setTargetTriple(kTargetTriple);
   llvm_module.setDataLayout(kDataLayout);
@@ -251,29 +260,28 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   // Determine the HLO schedule, which is an ordering of HLO instructions.  This
   // is used by buffer assignment to enable buffer reuse, and the same ordering
   // must also be used to determine the thunk launch schedule.
-  std::unique_ptr<StreamAssignment> stream_assignment =
-      AssignStreams(*hlo_module);
+  std::unique_ptr<StreamAssignment> stream_assignment = AssignStreams(*module);
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloSchedule> hlo_schedule,
-      HloSchedule::Build(*hlo_module, *stream_assignment, pointer_size_));
+      HloSchedule::Build(*module, *stream_assignment, pointer_size_));
 
   // Run buffer analysis on the HLO graph. This analysis figures out which
   // temporary buffers are required to run the computation.
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<BufferAssignment> buffer_assignment,
-      BufferAssigner::Run(hlo_module.get(), hlo_schedule->ConsumeHloOrdering(),
+      BufferAssigner::Run(module.get(), hlo_schedule->ConsumeHloOrdering(),
                           [this](const LogicalBuffer& buffer) {
                             return ShapeSizeBytes(buffer.shape());
                           },
                           kMemoryAlignment));
 
-  IrEmitterContext ir_emitter_context(hlo_module.get(), buffer_assignment.get(),
+  IrEmitterContext ir_emitter_context(module.get(), buffer_assignment.get(),
                                       &stream_exec->GetDeviceDescription(),
                                       &llvm_module);
 
-  HloComputation* entry_computation = hlo_module->entry_computation();
-  IrEmitterUnnested ir_emitter(*module_config, entry_computation,
-                               module_config->has_hybrid_result(),
+  HloComputation* entry_computation = module->entry_computation();
+  IrEmitterUnnested ir_emitter(module->config(), entry_computation,
+                               module->config().has_hybrid_result(),
                                &ir_emitter_context);
   TF_RETURN_IF_ERROR(
       entry_computation->root_instruction()->Accept(&ir_emitter));
@@ -302,7 +310,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
     cc_minor = 0;
   }
   TF_ASSIGN_OR_RETURN(*ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor},
-                                         *module_config, libdevice_dir_));
+                                         module->config(), libdevice_dir_));
 
   VLOG(2) << "LLVM module after optimizations:";
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
@@ -319,8 +327,8 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
   auto* gpu_executable =
-      new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(hlo_module),
-                        std::move(module_config), std::move(buffer_assignment));
+      new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(module),
+                        std::move(buffer_assignment));
   if (flags->xla_gpu_embed_ir) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
@@ -329,9 +337,8 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> GpuCompiler::Compile(
-    std::vector<std::unique_ptr<HloModule>> hlo_modules,
-    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    HloDumper dump_hlos, std::vector<se::StreamExecutor*> stream_execs) {
+    std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlos,
+    std::vector<se::StreamExecutor*> stream_execs) {
   return Unimplemented(
       "Compilation of multiple HLO modules is not yet supported on GPU.");
 }
@@ -339,7 +346,6 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> GpuCompiler::Compile(
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 GpuCompiler::CompileAheadOfTime(
     std::vector<std::unique_ptr<HloModule>> module,
-    std::vector<std::unique_ptr<HloModuleConfig>> module_config,
     HloDumper dump_hlo, const AotCompilationOptions& options) {
   return Unimplemented("not yet implemented: GpuCompiler::CompileAheadOfTime");
 }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index 22f492b42294838bf323b70f492d83fa9c7b4ce2..921d683f03066a57bbadeacb6e33c91cadb3c095 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -42,20 +42,16 @@ class GpuCompiler : public Compiler {
   ~GpuCompiler() override {}
 
   StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> hlo_module,
-      std::unique_ptr<HloModuleConfig> module_config, HloDumper dump_hlo,
+      std::unique_ptr<HloModule> module, HloDumper dump_hlo,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> hlo_module,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_config,
-      HloDumper dump_hlo,
+      std::vector<std::unique_ptr<HloModule>> modules, HloDumper dump_hlo,
       std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(
       std::vector<std::unique_ptr<HloModule>> module,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_config,
       HloDumper dump_hlo, AotCompilationOptions const& options) override;
 
   perftools::gputools::Platform::Id PlatformId() const override;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 32f0368b4bc523d3d81147a8cbbde745387c21d4..69bcd53e05d5de013be2af1cfdba934cea34af6b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -110,9 +110,8 @@ class HloExecutionProfiler {
 GpuExecutable::GpuExecutable(tensorflow::StringPiece ptx,
                              std::unique_ptr<ThunkSchedule> thunk_schedule,
                              std::unique_ptr<HloModule> hlo_module,
-                             std::unique_ptr<HloModuleConfig> module_config,
                              std::unique_ptr<BufferAssignment> assignment)
-    : Executable(std::move(hlo_module), std::move(module_config)),
+    : Executable(std::move(hlo_module)),
       ptx_(ptx),
       thunk_schedule_(std::move(thunk_schedule)),
       assignment_(std::move(assignment)) {}
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index e308de79ba582d3497e7f217285ae4b1ed0be1a7..ad178b7249e4a265ca88a45985142b08b1023417 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -51,7 +50,6 @@ class GpuExecutable : public Executable {
   GpuExecutable(tensorflow::StringPiece ptx,
                 std::unique_ptr<ThunkSchedule> thunk_schedule,
                 std::unique_ptr<HloModule> hlo_module,
-                std::unique_ptr<HloModuleConfig> module_config,
                 std::unique_ptr<BufferAssignment> assignment);
 
   // This should be called after set_ir_module_string.
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 34a44ad40548272a0c2a87efadfa1ab2aca7b979..a36dcbbd2faf3258ec2790f51bb2aec3ce834a6c 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -46,6 +46,11 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
+  // Output fusion is not currently supported on GPUs.
+  if (producer->opcode() == HloOpcode::kFusion) {
+    return false;
+  }
+
   // RNG operations are not currently parallel-friendly on GPU.
   if (producer->opcode() == HloOpcode::kRng) {
     return false;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index e8378a7f447cebf8d491e98595188d2391333c58..c6e8a2f78b5a398d9e9d5a684ac4d42520ec20c8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -59,6 +59,11 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
 }  // namespace
 
 bool ImplementedAsGemm(const HloInstruction& hlo) {
+  // We can only do this if the HLO is unnested.
+  if (hlo.parent() != hlo.GetModule()->entry_computation()) {
+    return false;
+  }
+
   // For certain types of Dot, we can call pre-canned BLAS gemm.
   if (hlo.opcode() == HloOpcode::kDot) {
     const Shape& lhs_shape = hlo.operand(0)->shape();
@@ -85,6 +90,11 @@ bool ImplementedAsGemm(const HloInstruction& hlo) {
 }
 
 bool ImplementedAsDnnConvolution(const HloInstruction& hlo) {
+  // We can only do this if the HLO is unnested.
+  if (hlo.parent() != hlo.GetModule()->entry_computation()) {
+    return false;
+  }
+
   // Forward convolution.
   if (hlo.opcode() == HloOpcode::kConvolution) {
     const ConvolutionDimensionNumbers& dnums =
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index 4d3e9b10b2e69b083d74cf7b56edc5b781991b55..e8c68a6ef72ede8f2f3dd2279a8e43468ce8f35d 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -25,16 +25,7 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-const int64 kWarpSize = 32;
-
-// Precondition: "hlo" is an operand of a Dot instruction.
-//
-// Returns whether "hlo" is foldable to its user.
-bool IsOperandFoldableToDot(const HloInstruction& hlo);
-
-// Returns true if GpuCompiler can fold any operands of "dot" into "dot" for
-// better performance.
-bool CanFoldOperandsIntoDot(const HloInstruction& dot);
+constexpr int64 kWarpSize = 32;
 
 // Returns true if `hlo` will be implemented as a call to BLAS gemm.
 bool ImplementedAsGemm(const HloInstruction& hlo);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 5f3ce85f857a96ca0cca6b0bea4bf1e86b971827..36619a845413b19ec2d559252409dae1b96b76e4 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -399,7 +399,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot,
   llvm::Type* accum_type = target_array.GetElementLlvmType();
   llvm::Value* accum_address = llvm_ir::EmitAllocaAtFunctionEntry(
       accum_type,       // The pointee type of the alloca instruction.
-      "accum_address",  // The name of the alloca instuction.
+      "accum_address",  // The name of the alloca instruction.
       &ir_builder_);
 
   // Initialize the accumulator in the preheader to zero.
@@ -549,14 +549,12 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
   return EmitTargetElementLoop(*fusion, fused_emitter.GetRootGenerator());
 }
 
-Status IrEmitter::HandleCall(
-    HloInstruction* call, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    HloComputation* computation) {
+Status IrEmitter::HandleCall(HloInstruction* call) {
   std::vector<llvm::Value*> operand_addresses;
-  for (HloInstruction* operand : operands) {
+  for (HloInstruction* operand : call->operands()) {
     operand_addresses.push_back(GetBasePointer(*operand));
   }
-  return EmitCallToNestedComputation(*computation, operand_addresses,
+  return EmitCallToNestedComputation(*call->to_apply(), operand_addresses,
                                      GetBasePointer(*call));
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 1aefee2739978ec05f4094f79acaece39e221bea..513bead62d8db38e550bc550fe2212b6e5dc4baf 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -101,9 +101,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
                       HloInstruction* on_true,
                       HloInstruction* on_false) override;
   Status HandleFusion(HloInstruction* fusion) override;
-  Status HandleCall(HloInstruction* call,
-                    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-                    HloComputation* computation) override;
+  Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call,
                           tensorflow::gtl::ArraySlice<HloInstruction*> operands,
                           tensorflow::StringPiece custom_call_target) override;
@@ -249,8 +247,7 @@ class IrEmitterUnnested : public IrEmitter {
   Status HandleTuple(
       HloInstruction* tuple,
       tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
-  Status HandleWhile(HloInstruction* xla_while, HloInstruction* init,
-                     HloComputation* condition, HloComputation* body) override;
+  Status HandleWhile(HloInstruction* xla_while) override;
   Status HandleRng(HloInstruction* random,
                    RandomDistribution distribution) override;
   Status HandleSelect(HloInstruction* select, HloInstruction* pred,
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 9b7aa7c860b14e03c238bd7037f0df832eacfef3..e52e55a1a8199019e2c149a777a4e948f830ce0e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -196,7 +196,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
           ir_emitter_context_->buffer_assignment().GetTempAllocation()) {
     kernel->addDereferenceableAttr(temp_buffer_arg_no + 1, allocation->size());
   }
-  kernel->setDoesNotAlias(temp_buffer_arg_no + 1);
+  kernel->addAttribute(temp_buffer_arg_no + 1, llvm::Attribute::NoAlias);
 
   // Add the declaration of this kernel to llvm.nvvm.annotations so that NVPTX
   // treats it as a CUDA kernel.
@@ -1540,10 +1540,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
       .EmitLoop();
 }
 
-Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while,
-                                      HloInstruction* init,
-                                      HloComputation* condition,
-                                      HloComputation* body) {
+Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) {
+  HloComputation* condition = xla_while->while_condition();
   TF_RET_CHECK(ShapeUtil::IsScalar(condition->root_instruction()->shape()) &&
                condition->root_instruction()->shape().element_type() == PRED)
       << "While condition computation must return bool";
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 485216837dc727bfe8565ff22678dd2fa470bc40..4f34cb77b0390e21350ad146695dd5be67fdabbf 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -396,7 +396,7 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
 
   // The LLVM IR verifier performs sanity checking on the IR. This helps
   // discover problems and report them in a meaningful manner, rather than let
-  // later passes report obscure assertions becasue of unfulfilled invariants.
+  // later passes report obscure assertions because of unfulfilled invariants.
   module_passes.add(llvm::createVerifierPass());
 
   // Create the function-level pass manager. It needs data layout information
@@ -405,9 +405,9 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
 
   AddOptimizationPasses(flags->opt_level, /*size_level=*/0,
                         target_machine.get(), &module_passes, &function_passes);
-  // Loop unrolling exposes more opportunites for SROA. Therefore, we run SROA
+  // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
   // again after the standard optimization passes [http://b/13329423].
-  // TODO(jingyue): SROA may further expose more optimization opportunites, such
+  // TODO(jingyue): SROA may further expose more optimization opportunities, such
   // as more precise alias analysis and more function inlining (SROA may change
   // the inlining cost of a function). For now, running SROA already emits good
   // enough code for the evaluated benchmarks. We may want to run more
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc
index c10346bbc235d8949525eb2008bac5312395381d..72f6cfd2d60712bb74af3dca2041ed1413004d23 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.cc
@@ -28,7 +28,8 @@ limitations under the License.
 namespace {
 
 static void DieWithSMDiagnosticError(llvm::SMDiagnostic* diagnostic) {
-  LOG(FATAL) << diagnostic->getLineNo() << ":" << diagnostic->getColumnNo()
+  LOG(FATAL) << diagnostic->getFilename().str() << ":"
+             << diagnostic->getLineNo() << ":" << diagnostic->getColumnNo()
              << ": " << diagnostic->getMessage().str();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 8ac4c5996632587fe4518df5560a1a74d9e8caa6..8f7fce884acc93fd39510ad0826b819a6d9731a7 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -33,7 +33,7 @@ namespace gpu {
 enum class PartitionStrategy {
   // Optimized for latency by allowing maximum number of registers per thread.
   kLatency,
-  // Optimized for throughtput. This may limit registers per thread and cause
+  // Optimized for throughput. This may limit registers per thread and cause
   // longer latency.
   kThroughput
 };
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index ec75e1358142764d80152a6d8abbc6d5b72acb9a..61a9e7e9e1bfca3b73e427ef6bbb956aee51c2e7 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -37,7 +37,7 @@ namespace {
 // patterns to match.
 //
 // Each ExprTree node is comprised of an HloOpcode, and a set of operands (each
-// of type ExprTree). Operands can be added by specifing the index and HloOpcode
+// of type ExprTree). Operands can be added by specifying the index and HloOpcode
 // of the operand.
 //
 // For example, the following computation:
@@ -122,10 +122,12 @@ class ExprTree {
   Status Match(const HloInstruction* instruction,
                TaggedInstructionMap* tagged_instructions) const {
     if (opcode_ != instruction->opcode()) {
-      return InvalidArgument("Unexpected opcode: %s",
-                             HloOpcodeString(instruction->opcode()).c_str());
+      return InvalidArgument("got opcode %s, want %s",
+                             HloOpcodeString(instruction->opcode()).c_str(),
+                             HloOpcodeString(opcode_).c_str());
     }
 
+    VLOG(2) << "Matched " << HloOpcodeString(opcode_) << ": " << tag_;
     if (!tag_.empty()) {
       tagged_instructions->insert({tag_, instruction});
     }
@@ -166,7 +168,7 @@ class MatcherBase {
   virtual ~MatcherBase() {}
 
   // Attempts to match each ExprTree in 'expr_trees_'.
-  // Returns OK on the first succesful match, error status otherwise.
+  // Returns OK on the first successful match, error status otherwise.
   virtual tensorflow::Status Run() {
     Status status;
     for (const ExprTree& expr_tree : expr_trees_) {
@@ -275,6 +277,7 @@ class WhileConditionComputationMatcher : public MatcherBase {
   }
 
   Status MatchExprTree(const ExprTree& expr_tree) override {
+    VLOG(2) << "MATCHING while condition";
     ExprTree::TaggedInstructionMap tagged_instructions;
     TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(),
                                        &tagged_instructions));
@@ -344,10 +347,6 @@ class WhileInitOperandMatcher : public MatcherBase {
   //
   //             Const
   //               |
-  //             Tuple1
-  //               |
-  //             GTE0
-  //               |
   //             Copy
   //               |
   //             Tuple0
@@ -355,15 +354,15 @@ class WhileInitOperandMatcher : public MatcherBase {
   //             While
   //
   ExprTree BuildInitExprTree() {
-    ExprTree gte0(HloOpcode::kGetTupleElement, "gte",
-                  ExprTree(HloOpcode::kTuple, tuple_index_,
-                           ExprTree(HloOpcode::kConstant, "loop_start")));
-    return ExprTree(HloOpcode::kWhile, "while",
-                    ExprTree(HloOpcode::kTuple, tuple_index_,
-                             ExprTree(HloOpcode::kCopy, gte0)));
+    return ExprTree(
+        HloOpcode::kWhile, "while",
+        ExprTree(HloOpcode::kTuple, tuple_index_,
+                 ExprTree(HloOpcode::kCopy,
+                          ExprTree(HloOpcode::kConstant, "loop_start"))));
   }
 
   Status MatchExprTree(const ExprTree& expr_tree) override {
+    VLOG(2) << "MATCHING while init";
     ExprTree::TaggedInstructionMap tagged_instructions;
     TF_RETURN_IF_ERROR(expr_tree.Match(while_hlo_, &tagged_instructions));
 
@@ -375,14 +374,6 @@ class WhileInitOperandMatcher : public MatcherBase {
                              while_hlo->name().c_str());
     }
 
-    // Get tagged GTE instruction and check 'tuple_index_'.
-    TF_ASSIGN_OR_RETURN(const HloInstruction* gte,
-                        GetTaggedInstruction("gte", tagged_instructions));
-    if (gte->tuple_index() != tuple_index_) {
-      return InvalidArgument("Unexpected tuple index instruction : %s",
-                             gte->name().c_str());
-    }
-
     // Get tagged Constant instruction and parse 'loop_start_'.
     TF_ASSIGN_OR_RETURN(
         const HloInstruction* const_hlo,
@@ -427,10 +418,6 @@ class WhileBodyComputationMatcher : public MatcherBase {
   //                     \  /              \  /
   //                    Fusion -----------> Add
   //                      |
-  //                     Tuple1
-  //                      |
-  //                     GTE0
-  //                      |
   //                     Copy
   //                      |
   //                     Tuple0
@@ -450,15 +437,13 @@ class WhileBodyComputationMatcher : public MatcherBase {
     fusion.SetFusedRoot(fused_root);
 
     // Build top-level computation.
-    ExprTree tuple0(
-        HloOpcode::kTuple, tuple_index_,
-        ExprTree(HloOpcode::kCopy,
-                 ExprTree(HloOpcode::kGetTupleElement, "gte",
-                          ExprTree(HloOpcode::kTuple, tuple_index_, fusion))));
+    ExprTree tuple0(HloOpcode::kTuple, tuple_index_,
+                    ExprTree(HloOpcode::kCopy, fusion));
     return tuple0;
   }
 
   Status MatchExprTree(const ExprTree& expr_tree) override {
+    VLOG(2) << "MATCHING while body";
     ExprTree::TaggedInstructionMap tagged_instructions;
     TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(),
                                        &tagged_instructions));
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index ddf9676e378c5445418d30ae767d19ef2fb74be8..a315b9ad11a4a15d4c4d624320283d4467e9bf41 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -17,12 +17,16 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace xla {
 namespace {
 
+using ::testing::Eq;
+using ::testing::HasSubstr;
+
 class WhileTransformerTest : public HloTestBase {
  protected:
   WhileTransformerTest()
@@ -135,12 +139,10 @@ TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) {
   RunCopyInsertionPass();
   // Run WhileTransformer.
   auto result = gpu::CanTransformWhileToFor(while_hlo);
-  EXPECT_TRUE(result.ok());
+  ASSERT_TRUE(result.ok());
   // Check results.
-  auto tuple = result.ConsumeValueOrDie();
-  EXPECT_EQ(0, std::get<0>(tuple));
-  EXPECT_EQ(10, std::get<1>(tuple));
-  EXPECT_EQ(1, std::get<2>(tuple));
+  EXPECT_THAT(result.ConsumeValueOrDie(),
+              Eq(std::tuple<int64, int64, int64>(0, 10, 1)));
 }
 
 TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) {
@@ -154,12 +156,10 @@ TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) {
   RunCopyInsertionPass();
   // Run WhileTransformer.
   auto result = gpu::CanTransformWhileToFor(while_hlo);
-  EXPECT_TRUE(result.ok());
+  ASSERT_TRUE(result.ok());
   // Check results.
-  auto tuple = result.ConsumeValueOrDie();
-  EXPECT_EQ(0, std::get<0>(tuple));
-  EXPECT_EQ(10, std::get<1>(tuple));
-  EXPECT_EQ(1, std::get<2>(tuple));
+  EXPECT_THAT(result.ConsumeValueOrDie(),
+              Eq(std::tuple<int64, int64, int64>(0, 10, 1)));
 }
 
 TEST_F(WhileTransformerTest, InvalidLoopLimit) {
@@ -173,10 +173,9 @@ TEST_F(WhileTransformerTest, InvalidLoopLimit) {
   RunCopyInsertionPass();
   // Run WhileTransformer.
   auto result = gpu::CanTransformWhileToFor(while_hlo);
-  EXPECT_FALSE(result.ok());
-  EXPECT_MATCH(
-      result.status().error_message(),
-      testing::ContainsRegex("Loop start must be less than loop limit."));
+  ASSERT_FALSE(result.ok());
+  EXPECT_THAT(result.status().error_message(),
+              HasSubstr("Loop start must be less than loop limit."));
 }
 
 TEST_F(WhileTransformerTest, InvalidLoopIncrement) {
@@ -190,10 +189,9 @@ TEST_F(WhileTransformerTest, InvalidLoopIncrement) {
   RunCopyInsertionPass();
   // Run WhileTransformer.
   auto result = gpu::CanTransformWhileToFor(while_hlo);
-  EXPECT_FALSE(result.ok());
-  EXPECT_MATCH(
-      result.status().error_message(),
-      testing::ContainsRegex("Loop increment must greater than zero."));
+  ASSERT_FALSE(result.ok());
+  EXPECT_THAT(result.status().error_message(),
+              HasSubstr("Loop increment must greater than zero."));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 46c0d8edead1eaba518fd1040b7dd7d0d6c79159..645c68e0438f875e9c4c560b875a18a71618e61c 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -53,12 +53,44 @@ std::vector<const LogicalBuffer*> UniqueOperandSourceBuffers(
 
 /*static*/
 StatusOr<HeapSimulator::Result> HeapSimulator::Run(
-    std::unique_ptr<HeapAlgorithm> algorithm,
+    std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
+    const SequentialHloOrdering::HloModuleSequence& module_sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_fn,
+    const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
+  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign);
+  const HloComputation* entry_computation = module.entry_computation();
+  const std::vector<const HloInstruction*>& instruction_sequence =
+      FindOrDie(module_sequence, entry_computation);
+  TF_RETURN_IF_ERROR(heap.RunComputation(*entry_computation,
+                                         instruction_sequence,
+                                         points_to_analysis, &module_sequence));
+  return heap.Finish();
+}
+
+/*static*/
+StatusOr<HeapSimulator::Result> HeapSimulator::Run(
+    std::unique_ptr<HeapAlgorithm> algorithm, const HloComputation& computation,
     const std::vector<const HloInstruction*>& instruction_sequence,
-    const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_fn,
     const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
+  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign);
+  TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence,
+                                         points_to_analysis,
+                                         /*module_sequence=*/nullptr));
+  return heap.Finish();
+}
+
+// Runs a heap simulation for the given 'computation', assuming the given
+// 'instruction_sequence'. If 'module_sequence' is non-null, it is used to find
+// kCall and kWhile sub-computations, and the heap simulation for those
+// sub-computations will be run recursively.
+Status HeapSimulator::RunComputation(
+    const HloComputation& computation,
+    const std::vector<const HloInstruction*>& instruction_sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const SequentialHloOrdering::HloModuleSequence* module_sequence) {
   // The goal here is to minimize memory usage, assuming the given sequential
   // ordering of instructions.  The strategy is to walk through the instruction
   // sequence, calling Alloc and Free on the underlying heap algorithm.  The
@@ -67,25 +99,29 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
   // 'live_buffers' tracks the liveness of each buffer that we assign, by
   // associating it with a set of HloInstructions that need to be visited.  When
   // the set becomes empty, the buffer is no longer used, and can be freed.
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign);
   FlatMap<const LogicalBuffer*, FlatSet<const HloInstruction*>> live_buffers;
 
+  const HloInstruction* root = computation.root_instruction();
+  FlatSet<const LogicalBuffer*> output_source_buffers =
+      points_to_analysis.GetPointsToSet(root).CreateFlattenedSet();
+
   for (const HloInstruction* instruction : instruction_sequence) {
     const std::vector<const LogicalBuffer*>& buffers_defined_by_instruction =
         points_to_analysis.GetBuffersDefinedByInstruction(instruction);
 
-    const HloInstruction* root = computation.root_instruction();
-    FlatSet<const LogicalBuffer*> output_source_buffers =
-        points_to_analysis.GetPointsToSet(root).CreateFlattenedSet();
-
     // Initialize live_buffers for each buffer that we're going to assign.  The
     // set of instructions that need to be visited contains all users of all
     // aliases.  The alias itself is not necessary; if it has users, the users
     // are necessarily scheduled after the alias.  And if it has no users, it is
     // either a dead value or an output, both of which are handled below.
+    //
+    // We ignore control dependencies here. The reasoning is that the control
+    // dependencies have already been accounted for in the ordering of the given
+    // 'instruction_sequence', and should not otherwise artificially extend the
+    // lifetime of buffers that aren't already connected by a data dependency.
     std::vector<const LogicalBuffer*> dead_buffers_to_free;
     for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
-      if (heap.IgnoreBuffer(buffer)) {
+      if (IgnoreBuffer(buffer)) {
         continue;
       }
       for (const BufferAlias& alias :
@@ -122,7 +158,7 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     std::vector<const LogicalBuffer*> operand_buffers_to_free;
     for (const LogicalBuffer* operand_buffer :
          UniqueOperandSourceBuffers(instruction, points_to_analysis)) {
-      if (heap.IgnoreBuffer(operand_buffer)) {
+      if (IgnoreBuffer(operand_buffer)) {
         continue;
       }
       live_buffers[operand_buffer].erase(instruction);
@@ -137,10 +173,10 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     // happen before dead or operand buffers are freed; the instruction reads
     // the operand buffers to produce its output.
     //
-    // INVARIANT: Either heap.Alloc or heap.ShareBuffer will be called for each
-    // buffer that we should assign.
+    // INVARIANT: Either Alloc or ShareBuffer will be called for each buffer
+    // that we should assign.
     for (const LogicalBuffer* buffer : buffers_defined_by_instruction) {
-      if (heap.IgnoreBuffer(buffer)) {
+      if (IgnoreBuffer(buffer)) {
         continue;
       }
 
@@ -151,27 +187,54 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
       bool shared = false;
       for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) {
         if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
+            buffer->instruction()->opcode() != HloOpcode::kCopy &&
             CanShareOperandBufferWithUser(
                 operand_buffer->instruction(), operand_buffer->index(),
                 buffer->instruction(), buffer->index(), points_to_analysis)) {
-          heap.ShareBuffer(buffer, operand_buffer);
+          ShareBuffer(buffer, operand_buffer);
           shared = true;
           break;
         }
       }
 
       if (!shared) {
-        heap.Alloc(buffer);
+        Alloc(buffer);
       }
     }
 
+    // If the whole module is sequential, we can save memory by running the
+    // heap-simulation for sub-computations inline. E.g. the buffers for the
+    // condition and body of a kWhile instruction are only live for the duration
+    // of the instruction itself.
+    //
+    // The order that the sub-computations are simulated does not affect
+    // correctness; since the whole module is sequential, we know that the
+    // sub-computations will never be run concurrently.
+    if (module_sequence != nullptr) {
+      if (instruction->opcode() == HloOpcode::kCall ||
+          instruction->opcode() == HloOpcode::kWhile) {
+        for (const HloComputation* called_computation :
+             instruction->called_computations()) {
+          const std::vector<const HloInstruction*>& called_sequence =
+              FindOrDie(*module_sequence, called_computation);
+          TF_RETURN_IF_ERROR(RunComputation(*called_computation,
+                                            called_sequence, points_to_analysis,
+                                            module_sequence));
+        }
+      }
+
+      // Other sub-computations (e.g. Map, Reduce, ...) are skipped; they are
+      // assigned "thread-local" allocations, meaning their buffers are not
+      // allocated up-front at the beginning of the computation.
+    }
+
     // Free buffers that are no longer live.  This is the earliest point that we
     // can de-allocate; right after the last use of the buffer.
     for (const LogicalBuffer* buffer : dead_buffers_to_free) {
-      heap.Free(buffer);
+      Free(buffer);
     }
     for (const LogicalBuffer* buffer : operand_buffers_to_free) {
-      heap.Free(buffer);
+      Free(buffer);
     }
   }
 
@@ -182,10 +245,10 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     const FlatSet<const HloInstruction*>& pending = buffer_pending.second;
     CHECK_EQ(pending.size(), 1) << *buffer;
     CHECK(*pending.begin() == nullptr) << *buffer;
-    heap.Free(buffer);
+    Free(buffer);
   }
 
-  return heap.Finish();
+  return Status::OK();
 }
 
 HeapSimulator::HeapSimulator(
@@ -304,6 +367,11 @@ HeapSimulator::Result HeapSimulator::Finish() {
         result.chunk_map.emplace(buffer, chunk);
       }
     }
+    // If we were told to assign specific buffers, make sure we've assigned
+    // exactly that many buffers.
+    if (buffers_to_assign_ != nullptr) {
+      CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size());
+    }
   }
 
   // Fragmentation is the difference between the actual and ideal sizes.
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index 0ce2906767898bcace45e296d76f958c50a2b3a7..3d98046261902b41a17a8ab0f9a349634a1e4545 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -63,17 +64,32 @@ class HeapSimulator {
   };
 
   // Run the heap simulation with the given algorithm, assuming the given
-  // sequential ordering of instructions.  The 'instruction_sequence' must
-  // contain a topologically-consistent total ordering of all instructions in
-  // the computation.  The result is invalid if instructions are not run in
-  // exactly this sequence.
+  // module_sequence, which must contain a topologically-consistent total
+  // ordering of all instructions within each computation. The result is invalid
+  // if instructions are not run in exactly this sequence.
+  //
+  // Running heap simulation on the whole module tends to save memory, compared
+  // to running on a per-computation basis, since we can re-use buffer space for
+  // called sub-computations.
   //
   // If 'buffers_to_assign' is provided, only those buffers are assigned
   // offsets, otherwise all buffers defined by the instructions are assigned.
+  static StatusOr<Result> Run(
+      std::unique_ptr<HeapAlgorithm> algorithm, const HloModule& module,
+      const SequentialHloOrdering::HloModuleSequence& module_sequence,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const LogicalBuffer::SizeFunction& size_fn,
+      const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
+          nullptr);
+
+  // Same as above, but runs on a single computation. The 'instruction_sequence'
+  // must contain a topologically-consistent total ordering of all instructions
+  // in the computation. The result is invalid if instructions are not run in
+  // exactly this sequence.
   static StatusOr<Result> Run(
       std::unique_ptr<HeapAlgorithm> algorithm,
-      const std::vector<const HloInstruction*>& instruction_sequence,
       const HloComputation& computation,
+      const std::vector<const HloInstruction*>& instruction_sequence,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_fn,
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign =
@@ -86,6 +102,12 @@ class HeapSimulator {
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>* buffers_to_assign);
   ~HeapSimulator();
 
+  Status RunComputation(
+      const HloComputation& computation,
+      const std::vector<const HloInstruction*>& instruction_sequence,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const SequentialHloOrdering::HloModuleSequence* module_sequence);
+
   bool IgnoreBuffer(const LogicalBuffer* buffer) const;
   void Alloc(const LogicalBuffer* buffer);
   void Free(const LogicalBuffer* buffer);
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 874bd5f1060c179d5547510c351909069aa935b8..0a6900f73304f7a7b1209807fd3a1e8220484e03 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -19,13 +19,16 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace xla {
 namespace {
@@ -69,6 +72,7 @@ class HeapCallRecorder : public HeapAlgorithm {
 // sequence against an expected sequence.
 class HeapSimulatorTracker {
  public:
+  // Constructor for testing a single entry computation.
   HeapSimulatorTracker(
       const string& name, std::unique_ptr<HloComputation> computation,
       const std::vector<const HloInstruction*>& instruction_sequence) {
@@ -83,12 +87,48 @@ class HeapSimulatorTracker {
     auto zero_size = [](const LogicalBuffer& buffer) { return 0; };
     auto algorithm = MakeUnique<DecreasingSizeRunsHeap>(
         MakeUnique<HeapCallRecorder>(&actual_calls_));
-    result_ = HeapSimulator::Run(std::move(algorithm), instruction_sequence,
-                                 *module_->entry_computation(),
-                                 *points_to_analysis_, zero_size)
+    result_ = HeapSimulator::Run(
+                  std::move(algorithm), *module_->entry_computation(),
+                  instruction_sequence, *points_to_analysis_, zero_size)
                   .ConsumeValueOrDie();
   }
 
+  explicit HeapSimulatorTracker(const string& name) {
+    module_ = MakeUnique<HloModule>(name);
+  }
+
+  // Similar to the single entry computation constructor above, but runs the
+  // simulation over the entire module.
+  void RunWholeModule(
+      const std::vector<const HloInstruction*>& full_module_sequence) {
+    points_to_analysis_ =
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
+
+    // Construct the module sequence grouped by computation.
+    SequentialHloOrdering::HloModuleSequence module_sequence;
+    tensorflow::gtl::FlatMap<const HloInstruction*, int> reverse_position;
+    for (int i = 0; i < full_module_sequence.size(); ++i) {
+      const HloInstruction* instruction = full_module_sequence[i];
+      module_sequence[instruction->parent()].push_back(instruction);
+      reverse_position[instruction] = full_module_sequence.size() - i;
+    }
+
+    // Hack the size_fn so that it returns a decreasing value as we step through
+    // the sequence. This lets us ensure the Alloc calls are in the sequence
+    // order. The Free calls are sorted by LogicalBuffer.id, which is at least
+    // deterministic.
+    auto size_fn = [&reverse_position](const LogicalBuffer& buffer) {
+      return reverse_position[buffer.instruction()];
+    };
+    auto algorithm = MakeUnique<DecreasingSizeRunsHeap>(
+        MakeUnique<HeapCallRecorder>(&actual_calls_));
+    result_ = HeapSimulator::Run(std::move(algorithm), *module_,
+                                 module_sequence, *points_to_analysis_, size_fn)
+                  .ConsumeValueOrDie();
+  }
+
+  HloModule* module() { return module_.get(); }
+
   // Returns the buffer defined at the given instruction and index.
   const LogicalBuffer* BufferAt(const HloInstruction* instruction,
                                 const ShapeIndex& index) const {
@@ -358,6 +398,86 @@ TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) {
   });
 }
 
+TEST_F(HeapSimulatorTest, WholeModule) {
+  HeapSimulatorTracker tracker(TestName());
+
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  auto cond_builder = HloComputation::Builder("WhileCond");
+  HloInstruction* cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "cond_param"));
+  HloInstruction* cond_iter = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0));
+  HloInstruction* cond_data = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
+  HloInstruction* cond_lt = cond_builder.AddInstruction(
+      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
+                                   HloOpcode::kLt, cond_iter, cond_data));
+  HloComputation* cond_computation =
+      tracker.module()->AddEmbeddedComputation(cond_builder.Build());
+
+  auto body_builder = HloComputation::Builder("WhileBody");
+  HloInstruction* body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "body_param"));
+  HloComputation* body_computation =
+      tracker.module()->AddEmbeddedComputation(body_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile(
+      tuple_shape, cond_computation, body_computation, param));
+  tracker.module()->AddEntryComputation(builder.Build());
+
+  tracker.RunWholeModule(
+      {param, while_op, body_param, cond_param, cond_iter, cond_data, cond_lt});
+  tracker.ExpectCallSequence({
+      // The entry computation param and while_op are allocated first.
+      {kAlloc, tracker.BufferAt(param, {})},
+      {kAlloc, tracker.BufferAt(param, {0})},
+      {kAlloc, tracker.BufferAt(param, {1})},
+      {kAlloc, tracker.BufferAt(while_op, {})},
+      {kAlloc, tracker.BufferAt(while_op, {0})},
+      {kAlloc, tracker.BufferAt(while_op, {1})},
+
+      // Now the while body param is allocated and freed.
+      {kAlloc, tracker.BufferAt(body_param, {})},
+      {kAlloc, tracker.BufferAt(body_param, {0})},
+      {kAlloc, tracker.BufferAt(body_param, {1})},
+      {kFree, tracker.BufferAt(body_param, {})},
+      {kFree, tracker.BufferAt(body_param, {0})},
+      {kFree, tracker.BufferAt(body_param, {1})},
+
+      // Now the while cond param is allocated. The GTE instructions just alias
+      // the param elements, so the param tuple can immediately be freed.
+      {kAlloc, tracker.BufferAt(cond_param, {})},
+      {kAlloc, tracker.BufferAt(cond_param, {0})},
+      {kAlloc, tracker.BufferAt(cond_param, {1})},
+      {kFree, tracker.BufferAt(cond_param, {})},
+
+      // Now the final cond less-than buffer is allocated.
+      {kAlloc, tracker.BufferAt(cond_lt, {})},
+
+      // The order of the remaining Free calls is based on the LogicalBuffer.id,
+      // which is deterministic, but not obvious.
+      {kFree, tracker.BufferAt(param, {})},
+      {kFree, tracker.BufferAt(param, {0})},
+      {kFree, tracker.BufferAt(param, {1})},
+
+      {kFree, tracker.BufferAt(while_op, {})},
+      {kFree, tracker.BufferAt(while_op, {0})},
+      {kFree, tracker.BufferAt(while_op, {1})},
+
+      {kFree, tracker.BufferAt(cond_param, {0})},
+      {kFree, tracker.BufferAt(cond_param, {1})},
+      {kFree, tracker.BufferAt(cond_lt, {})},
+
+      {kFinish, nullptr},
+  });
+}
+
 // Base class for heap algorithm tests.
 class HeapAlgorithmTestBase : public ::testing::Test {
  protected:
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 35f8dcb7ca614f5660850c9022049eea908f323c..2584ad39ae1c58c187d00985919a39dd184c9c63 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -35,10 +35,14 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
+using ::tensorflow::strings::StrCat;
+
 std::unique_ptr<HloComputation> HloComputation::Builder::Build(
     HloInstruction* root_instruction) {
   int parameter_count = 0;
@@ -52,16 +56,17 @@ std::unique_ptr<HloComputation> HloComputation::Builder::Build(
       root_instruction ? root_instruction : last_added_instruction_;
   CHECK_NE(nullptr, root);
 
-  return WrapUnique(
-      new HloComputation(name_, parameter_count, &instructions_, root));
+  return WrapUnique(new HloComputation(name_, parameter_count, &instructions_,
+                                       root, is_fusion_computation_));
 }
 
 HloComputation::HloComputation(
     const string& name, int parameter_count,
     std::vector<std::unique_ptr<HloInstruction>>* instructions,
-    HloInstruction* root_instruction)
+    HloInstruction* root_instruction, bool is_fusion_computation)
     : name_(name),
       root_instruction_(root_instruction),
+      is_fusion_computation_(is_fusion_computation),
       instruction_name_uniquer_(/*separator=*/".") {
   param_instructions_.resize(parameter_count, nullptr);
   bool root_found = false;
@@ -90,8 +95,7 @@ HloInstruction* HloComputation::AddInstruction(
 HloInstruction* HloComputation::AddInstructionInternal(
     std::unique_ptr<HloInstruction> instruction) {
   // Generate a unique name for the instruction.
-  instruction->set_name(
-      instruction_name_uniquer_.GetUniqueName(instruction->name()));
+  instruction->UniquifyName(&instruction_name_uniquer_);
   Reparent(instruction.get());
   HloInstruction* pinst = instruction.get();
   instruction_iterators_[pinst] =
@@ -99,19 +103,77 @@ HloInstruction* HloComputation::AddInstructionInternal(
   return pinst;
 }
 
-void HloComputation::Reparent(HloInstruction* instruction) {
+HloInstruction* HloComputation::AddParameter(
+    std::unique_ptr<HloInstruction> instruction) {
+  CHECK(instruction->opcode() == HloOpcode::kParameter);
+  CHECK(is_fusion_computation_);
+  CHECK(root_instruction_->fusion_instruction() != nullptr);
+  instruction->SetParentFusion(root_instruction_->fusion_instruction());
+  CHECK(root_instruction_->fusion_instruction()->operand_count() ==
+        param_instructions_.size());
   instruction->set_parent(this);
-  if (instruction->opcode() == HloOpcode::kFusion) {
-    for (auto& i : instruction->fused_instructions()) {
-      Reparent(i.get());
+  param_instructions_.push_back(instruction.get());
+  AddInstructionInternal(std::move(instruction));
+  return instructions_.back().get();
+}
+
+Status HloComputation::RemoveParameter(int64 param_no) {
+  CHECK_GE(param_no, 0);
+  CHECK_LT(param_no, param_instructions_.size());
+  CHECK(is_fusion_computation_);
+  CHECK(root_instruction_->fusion_instruction() != nullptr);
+  HloInstruction* param_instruction = param_instructions_[param_no];
+  auto param_instruction_iterator = param_instructions_.begin() + param_no;
+  param_instructions_.erase(param_instruction_iterator);
+  // Throw removed fused parameter instruction away.
+  TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction));
+
+  while (param_no < param_instructions_.size()) {
+    param_instruction = param_instructions_[param_no];
+    string param_name = param_instruction->parameter_name();
+    // Fusion parameters are named foo.param_1, bar.param_2, etc. We are
+    // renumbering the parameters so replace the final number in the name with
+    // the updated value.
+    const string param_underscore = ".param_";
+    size_t index = param_name.rfind(param_underscore);
+    if (index == string::npos) {
+      string after_param = name().substr(index + param_underscore.size());
+      int64 numeric_suffix;
+      if (tensorflow::strings::safe_strto64(after_param, &numeric_suffix)) {
+        param_name =
+            StrCat(param_name.substr(0, index), param_underscore, param_no);
+      }
     }
+
+    HloInstruction* new_instr =
+        AddInstructionInternal(HloInstruction::CreateParameter(
+            param_no, param_instruction->shape(), param_name));
+    TF_RETURN_IF_ERROR(param_instruction->ReplaceAllUsesWith(new_instr));
+    new_instr->SetParentFusion(root_instruction_->fusion_instruction());
+    param_instructions_[param_no] = new_instr;
+    TF_RETURN_IF_ERROR(RemoveInstruction(param_instruction));
+    param_no++;
   }
+
+  return Status::OK();
 }
 
-/* static */ bool HloComputation::IsRemovable(const HloOpcode& opcode) {
-  return !(opcode == HloOpcode::kParameter || opcode == HloOpcode::kRecv ||
-           opcode == HloOpcode::kSend || opcode == HloOpcode::kTrace ||
-           opcode == HloOpcode::kOutfeed);
+void HloComputation::Reparent(HloInstruction* instruction) {
+  instruction->set_parent(this);
+}
+
+bool HloComputation::IsRemovable(const HloInstruction* instruction) {
+  // If the instruction has control predecessors or successors then we cannot
+  // remove the instruction without violating ordering constraints (added, for
+  // example, to avert interference due to buffer aliasing).
+  if (!instruction->control_predecessors().empty() ||
+      !instruction->control_successors().empty()) {
+    return false;
+  }
+  const HloOpcode opcode = instruction->opcode();
+  return !((opcode == HloOpcode::kParameter && !is_fusion_computation_) ||
+           opcode == HloOpcode::kRecv || opcode == HloOpcode::kSend ||
+           opcode == HloOpcode::kTrace || opcode == HloOpcode::kOutfeed);
 }
 
 Status HloComputation::RemoveInstructionAndUnusedOperands(
@@ -119,7 +181,7 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
   TF_RET_CHECK(root_instruction() != instruction);
 
   TF_RET_CHECK(instruction->user_count() == 0);
-  TF_RET_CHECK(HloComputation::IsRemovable(instruction->opcode()));
+  TF_RET_CHECK(IsRemovable(instruction));
   std::unordered_set<HloInstruction*> removed;
   std::queue<HloInstruction*> worklist;
   worklist.push(instruction);
@@ -128,8 +190,7 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
     worklist.pop();
 
     if (removed.count(item) != 0 || item->user_count() != 0 ||
-        item == root_instruction() ||
-        !HloComputation::IsRemovable(item->opcode())) {
+        item == root_instruction() || !IsRemovable(item)) {
       continue;
     }
     for (int i = 0; i < item->operand_count(); ++i) {
@@ -145,7 +206,7 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
 Status HloComputation::RemoveInstruction(HloInstruction* instruction) {
   VLOG(2) << "Removing instruction " << instruction->name()
           << " from computation " << name();
-  TF_RET_CHECK(IsRemovable(instruction->opcode()));
+  TF_RET_CHECK(IsRemovable(instruction));
   TF_RET_CHECK(root_instruction() != instruction)
       << "cannot remove root instruction " << instruction->name();
   TF_RET_CHECK(instruction->user_count() == 0)
@@ -295,21 +356,27 @@ std::list<HloComputation*> HloComputation::MakeEmbeddedComputationsList()
   return post_order;
 }
 
-string HloComputation::ToString() const {
+string HloComputation::ToString(int nested_level) const {
   std::ostringstream s;
+  for (int i = 0; i < nested_level; i++) {
+    s << "    ";
+  }
   s << name() << " " << ShapeUtil::HumanString(ComputeProgramShape())
     << " { \n";
   for (const HloInstruction* instruction : MakeInstructionPostOrder()) {
+    for (int i = 0; i < nested_level; i++) {
+      s << "    ";
+    }
     s << "  " << instruction->ToString() << "\n";
     if (instruction->opcode() == HloOpcode::kFusion) {
-      tensorflow::gtl::FlatSet<HloInstruction*> added_instructions;
-      auto fused_instructions = InstructionPostOrderer::GetOrder(
-          instruction->fused_expression_root(), &added_instructions);
-      for (const auto& fused_instruction : fused_instructions) {
-        s << "    " << fused_instruction->ToString() << "\n";
-      }
+      s << instruction->fused_instructions_computation()->ToString(
+               nested_level + 1)
+        << "\n";
     }
   }
+  for (int i = 0; i < nested_level; i++) {
+    s << "    ";
+  }
   s << "}";
   return s.str();
 }
@@ -583,4 +650,44 @@ Status HloComputation::Accept(
   return this->Accept(&visitor);
 }
 
+std::unique_ptr<HloComputation> HloComputation::Clone(const string& suffix) {
+  VLOG(1) << "Cloning " << name() << " --> " << suffix << "\n";
+  auto postorder = MakeInstructionPostOrder();
+  std::unordered_map<HloInstruction*, HloInstruction*> clone_map;
+  std::vector<std::unique_ptr<HloInstruction>> instructions;
+  std::unique_ptr<HloInstruction> new_instr = nullptr;
+  for (auto instr : postorder) {
+    std::vector<HloInstruction*> new_operands;
+    for (auto operand : instr->operands()) {
+      HloInstruction* new_operand = FindOrDie(clone_map, operand);
+      CHECK(new_operand != nullptr);
+      new_operands.push_back(new_operand);
+    }
+
+    new_instr = instr->CloneWithNewOperands(instr->shape(), new_operands);
+    InsertOrDie(&clone_map, instr, new_instr.get());
+    instructions.push_back(std::move(new_instr));
+  }
+  Builder builder(name() + suffix);
+  for (auto& instr : instructions) {
+    builder.AddInstruction(std::move(instr));
+  }
+  auto result = builder.Build(
+      /*root_instruction=*/FindOrDie(clone_map, root_instruction()));
+
+  // Clone control dependencies.
+  for (auto instr : postorder) {
+    HloInstruction* new_instr = FindOrDie(clone_map, instr);
+    for (auto successor : instr->control_successors()) {
+      TF_CHECK_OK(
+          new_instr->AddControlDependencyTo(FindOrDie(clone_map, successor)));
+    }
+  }
+  return result;
+}
+
+void HloComputation::UniquifyName(NameUniquer* name_uniquer) {
+  name_ = name_uniquer->GetUniqueName(name_);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index ef3cba6fa08da81d35a3e8b06c8028cba0de8111..62e00a24fbb523e1e30f08141f9e026407a2015d 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -54,8 +54,10 @@ class HloComputation {
   // Builder class for HloComputation.
   class Builder {
    public:
-    explicit Builder(const string& name)
-        : name_(name), last_added_instruction_(nullptr) {}
+    explicit Builder(const string& name, bool is_fusion_computation = false)
+        : name_(name),
+          last_added_instruction_(nullptr),
+          is_fusion_computation_(is_fusion_computation) {}
 
     // Build and return an HloComputation. The parameter root_instruction
     // specifies the already-added instruction to use as the root. If
@@ -74,6 +76,7 @@ class HloComputation {
    private:
     const string name_;
     HloInstruction* last_added_instruction_;
+    bool is_fusion_computation_;
     std::vector<std::unique_ptr<HloInstruction>> instructions_;
   };
 
@@ -81,6 +84,16 @@ class HloComputation {
   // the instruction.
   HloInstruction* AddInstruction(std::unique_ptr<HloInstruction> instruction);
 
+  // Remove the param_no'th parameter from the computation.
+  // Note this is only applicatable to the computation for the fusion
+  // instruction.
+  Status RemoveParameter(int64 param_no);
+
+  // Add new parameter instruction to the computation.
+  // This should be a new parameter. Instruction will be appended to parameters
+  // and inserted to the instruction list.
+  HloInstruction* AddParameter(std::unique_ptr<HloInstruction> instruction);
+
   // Remove an instruction from the computation. The instruction must have no
   // users. Instruction is deallocated with this call.
   Status RemoveInstruction(HloInstruction* instruction);
@@ -121,8 +134,12 @@ class HloComputation {
 
   const string& name() const { return name_; }
 
+  // Use the given NameUniquer to select a unique name for the computation based
+  // on the computation's existing name.
+  void UniquifyName(NameUniquer* name_uniquer);
+
   // Return a string representation of the computation.
-  string ToString() const;
+  string ToString(int nested_level = 0) const;
 
   const std::list<std::unique_ptr<HloInstruction>>& instructions() const {
     return instructions_;
@@ -219,17 +236,24 @@ class HloComputation {
   // Same as Accept() above, but the visitor is given as a function.
   Status Accept(const FunctionVisitor::VisitorFunction& visitor_func) const;
 
-  // Returns true if instructions of the given opcode can be removed from the
+  // Returns a deep copy of this computation including all instructions.
+  std::unique_ptr<HloComputation> Clone(const string& suffix = "clone");
+
+  // Returns true if the given instruction can be removed from the
   // computation. Instructions such as parameters and send/receive instructions
   // cannot be removed without violating invariants of the HLO computation or
-  // module.
-  static bool IsRemovable(const HloOpcode& opcode);
+  // module with the exception of fusion computation.  A parameter instruction
+  // is removable for a fusion computation.
+  bool IsRemovable(const HloInstruction* instruction);
+
+  // Returns if this computation is a fusion computation.
+  bool IsFusionComputation() const { return is_fusion_computation_; }
 
  private:
   explicit HloComputation(
       const string& name, int parameter_count,
       std::vector<std::unique_ptr<HloInstruction>>* instructions,
-      HloInstruction* root_instruction);
+      HloInstruction* root_instruction, bool is_fusion_computation = false);
 
   // Internal helper for adding instructions.
   HloInstruction* AddInstructionInternal(
@@ -237,10 +261,6 @@ class HloComputation {
 
   // Helper for setting the parent of instructions that are added to this
   // computation.
-  //
-  // Because we clone HLO instructions without knowing what computation they're
-  // destined to be added to, this is required to appropriate set the parent on
-  // fused instruction sequences.
   void Reparent(HloInstruction* instruction);
 
   // Fuses HLOs in instructions_to_fuse into fusion_instruction.
@@ -257,9 +277,12 @@ class HloComputation {
   // Internal helper to collect unreachable roots.
   std::vector<HloInstruction*> CollectUnreachableRoots() const;
 
-  const string name_;
+  string name_;
   HloInstruction* root_instruction_;
 
+  // A tag shows if this is a fusion computation.
+  bool is_fusion_computation_;
+
   // Module containing this computation.
   HloModule* parent_ = nullptr;
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 12a568339627bea412dbbf478474df0f7e8190a6..3812653fe3f02f176e556e4bfb3abc6056c0cd01 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -20,15 +20,22 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 
 namespace {
 
+using ::testing::ElementsAre;
+using ::testing::UnorderedElementsAre;
+
 class HloComputationTest : public HloTestBase {
  protected:
   HloComputationTest() {}
@@ -67,8 +74,8 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) {
   auto negate_computation = CreateNegateComputation();
   auto map_computation = CreateMapComputation(negate_computation.get());
   EXPECT_TRUE(negate_computation->MakeEmbeddedComputationsList().empty());
-  EXPECT_EQ(map_computation->MakeEmbeddedComputationsList().front(),
-            negate_computation.get());
+  EXPECT_THAT(map_computation->MakeEmbeddedComputationsList(),
+              ElementsAre(negate_computation.get()));
 }
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) {
@@ -93,10 +100,10 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) {
   // GetEmbeddedComputations returns a post order of the embedded computations,
   // so the negate computation must come first.
   EXPECT_EQ(negate_computation.get(), *embedded_computations.begin());
-  EXPECT_MATCH(testing::ListToVec<HloComputation*>(embedded_computations),
-               testing::UnorderedMatcher<HloComputation*>(
-                   negate_computation.get(), map1_computation.get(),
-                   map2_computation.get()));
+  EXPECT_THAT(
+      embedded_computations,
+      UnorderedElementsAre(negate_computation.get(), map1_computation.get(),
+                           map2_computation.get()));
 }
 
 TEST_F(HloComputationTest, PostOrderSingleton) {
@@ -106,7 +113,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   auto computation = builder.Build();
 
-  EXPECT_EQ(computation->MakeInstructionPostOrder().front(), constant);
+  EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant));
 }
 
 TEST_F(HloComputationTest, PostOrderSimple) {
@@ -121,10 +128,8 @@ TEST_F(HloComputationTest, PostOrderSimple) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1));
   auto computation = builder.Build();
 
-  EXPECT_MATCH(
-      testing::ListToVec<HloInstruction*>(
-          computation->MakeInstructionPostOrder()),
-      testing::OrderedMatcher<HloInstruction*>(constant, negate1, negate2));
+  EXPECT_THAT(computation->MakeInstructionPostOrder(),
+              ElementsAre(constant, negate1, negate2));
 }
 
 TEST_F(HloComputationTest, PostOrderTrace) {
@@ -141,10 +146,8 @@ TEST_F(HloComputationTest, PostOrderTrace) {
   auto computation = builder.Build();
 
   // Trace instructions should be at the end of the sort.
-  EXPECT_MATCH(testing::ListToVec<HloInstruction*>(
-                   computation->MakeInstructionPostOrder()),
-               testing::OrderedMatcher<HloInstruction*>(constant, negate1,
-                                                        negate2, trace));
+  EXPECT_THAT(computation->MakeInstructionPostOrder(),
+              ElementsAre(constant, negate1, negate2, trace));
 }
 
 TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) {
@@ -161,10 +164,8 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   auto computation = builder.Build();
 
-  EXPECT_MATCH(testing::ListToVec<HloInstruction*>(
-                   computation->MakeInstructionPostOrder()),
-               testing::UnorderedMatcher<HloInstruction*>(
-                   constant1, constant2, constant3, constant4));
+  EXPECT_THAT(computation->MakeInstructionPostOrder(),
+              UnorderedElementsAre(constant1, constant2, constant3, constant4));
 }
 
 TEST_F(HloComputationTest, PostOrderWithMultipleRoots) {
@@ -187,9 +188,8 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) {
 
   auto post_order = computation->MakeInstructionPostOrder();
   EXPECT_EQ(6, post_order.size());
-  EXPECT_MATCH(testing::ListToVec<HloInstruction*>(post_order),
-               testing::UnorderedMatcher<HloInstruction*>(
-                   constant1, constant2, constant3, add1, add2, add3));
+  EXPECT_THAT(post_order, UnorderedElementsAre(constant1, constant2, constant3,
+                                               add1, add2, add3));
 }
 
 TEST_F(HloComputationTest, VisitWithMultipleRoots) {
@@ -253,8 +253,7 @@ TEST_F(HloComputationTest, DeepCopyArray) {
 
   auto copy = computation->DeepCopyInstruction(constant).ValueOrDie();
 
-  EXPECT_EQ(HloOpcode::kCopy, copy->opcode());
-  EXPECT_EQ(constant, copy->operand(0));
+  EXPECT_THAT(copy, op::Copy(constant));
 }
 
 TEST_F(HloComputationTest, DeepCopyTuple) {
@@ -271,18 +270,10 @@ TEST_F(HloComputationTest, DeepCopyTuple) {
 
   auto tuple_copy = computation->DeepCopyInstruction(tuple).ValueOrDie();
 
-  EXPECT_EQ(HloOpcode::kTuple, tuple_copy->opcode());
-  EXPECT_EQ(HloOpcode::kCopy, tuple_copy->operand(0)->opcode());
-  const HloInstruction* gte0 = tuple_copy->operand(0)->operand(0);
-  EXPECT_EQ(HloOpcode::kGetTupleElement, gte0->opcode());
-  EXPECT_EQ(0, gte0->tuple_index());
-  EXPECT_EQ(tuple, gte0->operand(0));
-
-  EXPECT_EQ(HloOpcode::kCopy, tuple_copy->operand(1)->opcode());
-  const HloInstruction* gte1 = tuple_copy->operand(1)->operand(0);
-  EXPECT_EQ(HloOpcode::kGetTupleElement, gte1->opcode());
-  EXPECT_EQ(1, gte1->tuple_index());
-  EXPECT_EQ(tuple, gte1->operand(0));
+  EXPECT_THAT(tuple_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
+                                    op::Copy(op::GetTupleElement(tuple))));
+  EXPECT_EQ(0, tuple_copy->operand(0)->operand(0)->tuple_index());
+  EXPECT_EQ(1, tuple_copy->operand(1)->operand(0)->tuple_index());
 }
 
 TEST_F(HloComputationTest, CycleDetection) {
@@ -302,8 +293,8 @@ TEST_F(HloComputationTest, CycleDetection) {
   const auto visitor = [](HloInstruction* instruction) { return Status::OK(); };
   auto visit_status = computation->Accept(visitor);
   ASSERT_FALSE(visit_status.ok());
-  ASSERT_MATCH(visit_status.error_message(),
-               testing::ContainsRegex("cycle is detecte"));
+  ASSERT_THAT(visit_status.error_message(),
+              ::testing::ContainsRegex("cycle is detecte"));
 }
 
 TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) {
@@ -322,14 +313,45 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) {
   auto computation = builder.Build();
 
   EXPECT_EQ(4, computation->instruction_count());
+  EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
   EXPECT_EQ(negate, computation->root_instruction());
 
   ASSERT_IS_OK(computation->RemoveInstructionAndUnusedOperands(dead_add));
 
   EXPECT_EQ(2, computation->instruction_count());
+  EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
   EXPECT_EQ(negate, computation->root_instruction());
 }
 
+TEST_F(HloComputationTest, CloneWithControlDependency) {
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0f)));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      r0f32_, HloOpcode::kAdd, constant1, constant2));
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "param0"));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
+  auto computation = builder.Build(/*root_instruction=*/add);
+
+  TF_CHECK_OK(negate->AddControlDependencyTo(add));
+
+  auto clone = computation->Clone();
+
+  auto cloned_add = clone->root_instruction();
+  EXPECT_EQ(cloned_add->opcode(), HloOpcode::kAdd);
+
+  auto predecessors = cloned_add->control_predecessors();
+  EXPECT_EQ(1, predecessors.size());
+  EXPECT_EQ(HloOpcode::kNegate, predecessors[0]->opcode());
+  auto successors = predecessors[0]->control_successors();
+  EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add));
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 9a5345dc13d6db42553e9c343f7c81cd0e6c9d0e..cb0a99d773c57ba9a2fedc2842fe17cd5fe3571e 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -15,16 +15,14 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 
-#include <list>
-#include <map>
 #include <memory>
-#include <set>
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -34,52 +32,222 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
+namespace {
+
+template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
+static std::unique_ptr<Literal> ConvertIfTypesMatch(
+    const Literal& src_literal) {
+  CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
+  return LiteralUtil::Convert<
+      typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type,
+      typename primitive_util::PrimitiveTypeToNative<
+          primitive_dest_type>::type>(src_literal);
+}
+
+template <PrimitiveType primitive_src_type>
+static std::unique_ptr<Literal> ConvertIfDestTypeMatches(
+    const Literal& src_literal, PrimitiveType primitive_dest_type) {
+  switch (primitive_dest_type) {
+#define CONVERT_IF_TYPES_MATCH(type) \
+  case (type):                       \
+    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal);
+    CONVERT_IF_TYPES_MATCH(PRED)
+    CONVERT_IF_TYPES_MATCH(S8)
+    CONVERT_IF_TYPES_MATCH(S32)
+    CONVERT_IF_TYPES_MATCH(S64)
+    CONVERT_IF_TYPES_MATCH(U8)
+    CONVERT_IF_TYPES_MATCH(U32)
+    CONVERT_IF_TYPES_MATCH(U64)
+    CONVERT_IF_TYPES_MATCH(F32)
+    CONVERT_IF_TYPES_MATCH(F64)
+#undef CONVERT_IF_TYPES_MATCH
+    // Other types are not yet supported.
+    default:
+      LOG(FATAL) << "Unimplemented: ConvertIfDestTypeMatches for type "
+                 << PrimitiveType_Name(src_literal.shape().element_type());
+  }
+}
+
+static std::unique_ptr<Literal> ConvertIfSrcTypeMatches(
+    const Literal& src_literal, PrimitiveType primitive_dest_type) {
+  switch (src_literal.shape().element_type()) {
+#define CONVERT_IF_DEST_TYPE_MATCHES(type) \
+  case (type):                             \
+    return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type);
+    CONVERT_IF_DEST_TYPE_MATCHES(PRED)
+    CONVERT_IF_DEST_TYPE_MATCHES(S8)
+    CONVERT_IF_DEST_TYPE_MATCHES(S32)
+    CONVERT_IF_DEST_TYPE_MATCHES(S64)
+    CONVERT_IF_DEST_TYPE_MATCHES(U8)
+    CONVERT_IF_DEST_TYPE_MATCHES(U32)
+    CONVERT_IF_DEST_TYPE_MATCHES(U64)
+    CONVERT_IF_DEST_TYPE_MATCHES(F32)
+    CONVERT_IF_DEST_TYPE_MATCHES(F64)
+#undef CONVERT_IF_DEST_TYPE_MATCHES
+    // Other types are not yet supported.
+    default:
+      LOG(FATAL) << "Unimplemented: ConvertIfSrcTypeMatches for type "
+                 << PrimitiveType_Name(src_literal.shape().element_type());
+  }
+}
+
+}  // namespace
+
+// ConstantFolderVisitor traverses the HLO computation and reduces certain
+// constant graph sections, to literals.
+class ConstantFolderVisitor : public DfsHloVisitorWithDefault {
+ public:
+  // Default visitor action is to do nothing and return OK.
+  Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
+    return Status::OK();
+  }
+
+  Status HandleConcatenate(
+      HloInstruction* concatenate,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
+
+  Status HandleConvert(HloInstruction* convert,
+                       HloInstruction* operand) override;
+
+  Status HandleReshape(HloInstruction* reshape) override;
+
+  Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override;
+
+  Status HandleTranspose(HloInstruction* transpose) override;
+
+  // Returns whether a constant folding operation has occurred.
+  const bool changed() const { return changed_; }
+
+  // Runs the visitor on a computation and returns whether any changes were
+  // performed.
+  static StatusOr<bool> Run(HloComputation* computation);
+
+ private:
+  ConstantFolderVisitor() = default;
+
+  // Replaces the existing HLO instruction old_instruction, with a literal,
+  // and marks the optimizer status as changed.
+  // Returns the Status representing the result of the replace operation.
+  Status ReplaceWithConstant(HloInstruction* old_instruction,
+                             std::unique_ptr<Literal> literal) {
+    TF_RETURN_IF_ERROR(old_instruction->parent()->ReplaceWithNewInstruction(
+        old_instruction, HloInstruction::CreateConstant(std::move(literal))));
+    changed_ = true;
+    return Status::OK();
+  }
+
+  // Whether any constant folding operations have occurred.
+  bool changed_ = false;
+};
+
+StatusOr<bool> ConstantFolderVisitor::Run(HloComputation* computation) {
+  ConstantFolderVisitor visitor;
+  TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  return visitor.changed();
+}
 
 StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
+  XLA_VLOG_LINES(2,
+                 "HloConstantFolding::Run(), before:\n" + module->ToString());
   bool changed = false;
-  for (auto& computation : module->computations()) {
-    for (auto instruction : computation->MakeInstructionPostOrder()) {
-      // Skip dead code.
-      if (instruction->user_count() == 0 &&
-          computation->root_instruction() != instruction) {
-        continue;
-      }
-      // Depending on the opcode, choose how to handle constant operands.
-      //
-      // TODO(b/35975797): Fold constant computations for more than reshapes and
-      // transposes.
-      switch (instruction->opcode()) {
-        case HloOpcode::kReshape: {
-          if (instruction->operand(0)->opcode() == HloOpcode::kConstant) {
-            TF_ASSIGN_OR_RETURN(
-                auto reshaped_literal,
-                LiteralUtil::Reshape(
-                    instruction->operand(0)->literal(),
-                    AsInt64Slice(instruction->shape().dimensions())));
-            TF_CHECK_OK(computation->ReplaceWithNewInstruction(
-                instruction,
-                HloInstruction::CreateConstant(std::move(reshaped_literal))));
-            changed = true;
-          }
-          break;
-        }
-        case HloOpcode::kTranspose: {
-          if (instruction->operand(0)->opcode() == HloOpcode::kConstant) {
-            auto transposed_literal = LiteralUtil::Transpose(
-                instruction->operand(0)->literal(), instruction->dimensions());
-            TF_CHECK_OK(computation->ReplaceWithNewInstruction(
-                instruction,
-                HloInstruction::CreateConstant(std::move(transposed_literal))));
-            changed = true;
-          }
-          break;
-        }
-        default:
-          break;
+  for (auto& comp : module->computations()) {
+    TF_ASSIGN_OR_RETURN(bool result, ConstantFolderVisitor::Run(comp.get()));
+    changed = changed || result;
+  }
+  XLA_VLOG_LINES(2, "HloConstantFolding::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+Status ConstantFolderVisitor::HandleReshape(HloInstruction* reshape) {
+  if (reshape->operand(0)->opcode() == HloOpcode::kConstant) {
+    TF_ASSIGN_OR_RETURN(
+        auto reshaped_literal,
+        LiteralUtil::Reshape(reshape->operand(0)->literal(),
+                             AsInt64Slice(reshape->shape().dimensions())));
+    return ReplaceWithConstant(reshape, std::move(reshaped_literal));
+  }
+  return Status::OK();
+}
+
+Status ConstantFolderVisitor::HandleTranspose(HloInstruction* transpose) {
+  if (transpose->operand(0)->opcode() == HloOpcode::kConstant) {
+    auto transposed_literal = LiteralUtil::Transpose(
+        transpose->operand(0)->literal(), transpose->dimensions());
+    return ReplaceWithConstant(transpose, std::move(transposed_literal));
+  }
+  return Status::OK();
+}
+
+Status ConstantFolderVisitor::HandleConcatenate(
+    HloInstruction* concatenate,
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+  if (operands[0]->opcode() == HloOpcode::kConstant) {
+    // If all the operands of a concatenate are constant, fold them into a
+    // single constant tensor.
+    // The result concatenate dimension is going to be the sum of all the
+    // concatenate dimensions of the arrays taking part of the operation.
+    int64 concat_dim = concatenate->dimensions()[0];
+    const Shape& reference_shape = operands[0]->shape();
+    CHECK(!ShapeUtil::IsTuple(reference_shape));
+    int64 rank = ShapeUtil::Rank(reference_shape);
+    std::vector<int64> concat_dimensions(reference_shape.dimensions().begin(),
+                                         reference_shape.dimensions().end());
+    if (concat_dim < 0) {
+      concat_dim += rank;
+    }
+    for (int64 i = 1; i < operands.size(); ++i) {
+      const Shape& operand_shape = operands[i]->shape();
+      CHECK(!ShapeUtil::IsTuple(operand_shape));
+      if (operands[i]->opcode() != HloOpcode::kConstant) {
+        return Status::OK();
       }
+      // Accumulate the concat dimension from all tensors taking part to the
+      // operation.
+      concat_dimensions[concat_dim] +=
+          ShapeUtil::GetDimension(operand_shape, concat_dim);
+    }
+
+    auto literal = LiteralUtil::CreateFromDimensions(
+        reference_shape.element_type(), concat_dimensions);
+    std::vector<int64> source_indices(rank, 0);
+    std::vector<int64> dest_indices(concat_dimensions.size(), 0);
+    for (auto operand : operands) {
+      const Shape& operand_shape = operand->shape();
+      TF_RETURN_IF_ERROR(LiteralUtil::Copy(
+          operand->literal(), source_indices, literal.get(), dest_indices,
+          AsInt64Slice(operand_shape.dimensions())));
+      dest_indices[concat_dim] +=
+          ShapeUtil::GetDimension(operand_shape, concat_dim);
     }
+    return ReplaceWithConstant(concatenate, std::move(literal));
   }
-  return changed;
+  return Status::OK();
+}
+
+Status ConstantFolderVisitor::HandleSlice(HloInstruction* slice,
+                                          HloInstruction* operand) {
+  if (operand->opcode() == HloOpcode::kConstant) {
+    const Shape& shape = slice->shape();
+    auto literal = LiteralUtil::CreateFromDimensions(
+        shape.element_type(), AsInt64Slice(shape.dimensions()));
+    std::vector<int64> dest_indices(slice->slice_starts().size(), 0);
+    TF_RETURN_IF_ERROR(LiteralUtil::Copy(
+        operand->literal(), slice->slice_starts(), literal.get(), dest_indices,
+        AsInt64Slice(shape.dimensions())));
+    TF_RETURN_IF_ERROR(ReplaceWithConstant(slice, std::move(literal)));
+  }
+  return Status::OK();
+}
+
+Status ConstantFolderVisitor::HandleConvert(HloInstruction* convert,
+                                            HloInstruction* operand) {
+  if (operand->opcode() == HloOpcode::kConstant) {
+    const Literal& src_literal = operand->literal();
+    std::unique_ptr<Literal> new_constant =
+        ConvertIfSrcTypeMatches(src_literal, convert->shape().element_type());
+    return ReplaceWithConstant(convert, std::move(new_constant));
+  }
+  return Status::OK();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.h b/tensorflow/compiler/xla/service/hlo_constant_folding.h
index 514bb8164c1e1fa10a36ceeeac63dc946de2ab5a..331480bd029727fa15476cb9ced2e7b7afd170f3 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.h
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.h
@@ -21,16 +21,14 @@ limitations under the License.
 
 namespace xla {
 
-// A pass which performs constant folding in order to avoid unecessary
+// A pass which performs constant folding in order to avoid unnecessary
 // computation on constants.
 class HloConstantFolding : public HloPassInterface {
  public:
-  explicit HloConstantFolding() {}
-  ~HloConstantFolding() override {}
   tensorflow::StringPiece name() const override { return "constant_folding"; }
 
-  // Run ConstantFolding on the given module. Returns whether the module was
-  // changed (common subexpressions were found and eliminated).
+  // Run constant folding operations on the given module. Returns whether the
+  // module was changed (constant expressions folded).
   StatusOr<bool> Run(HloModule* module) override;
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a56225da156dfc0a44b6a4b99191a3c7e706561f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+using HloConstantFoldingTest = HloTestBase;
+
+TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* input = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+  builder.AddInstruction(
+      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_EQ(LiteralUtil::GetFirstElement<int64>(
+                computation->root_instruction()->literal()),
+            42);
+}
+
+TEST_F(HloConstantFoldingTest, ConvertS64ToF32) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* input = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(42)));
+  builder.AddInstruction(
+      HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_EQ(LiteralUtil::GetFirstElement<float>(
+                computation->root_instruction()->literal()),
+            42.0f);
+}
+
+TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR1<float>({42.0f, 19.0f})));
+  builder.AddInstruction(
+      HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_EQ(
+      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {0}),
+      42);
+  EXPECT_EQ(
+      LiteralUtil::Get<int64>(computation->root_instruction()->literal(), {1}),
+      19);
+}
+
+TEST_F(HloConstantFoldingTest, Concatenate) {
+  const struct TestConfig {
+    int concat_dimension;
+    tensorflow::gtl::ArraySlice<int64> dimensions;
+    tensorflow::gtl::ArraySlice<int64> concat_sizes;
+  } test_configs[] = {
+      {1, {11, 0, 7, 5, 9}, {2, 5, 7, 11}},
+      {3, {1, 4, 17, 0, 8}, {1, 3, 9, 12}},
+  };
+
+  for (auto& test_config : test_configs) {
+    HloComputation::Builder builder(TestName());
+    std::vector<int64> dimensions(test_config.dimensions.begin(),
+                                  test_config.dimensions.end());
+    int64 concat_size = 0;
+    std::vector<HloInstruction*> operands;
+    for (auto csize : test_config.concat_sizes) {
+      dimensions[test_config.concat_dimension] = csize;
+      concat_size += csize;
+      auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions);
+      HloInstruction* insn = builder.AddInstruction(
+          HloInstruction::CreateConstant(std::move(literal)));
+      operands.push_back(insn);
+    }
+    dimensions[test_config.concat_dimension] = concat_size;
+    Shape shape = ShapeUtil::MakeShape(F32, dimensions);
+    builder.AddInstruction(HloInstruction::CreateConcatenate(
+        shape, operands, test_config.concat_dimension));
+    auto module = MakeUnique<HloModule>(TestName());
+    auto computation = module->AddEntryComputation(builder.Build());
+
+    HloConstantFolding const_folder;
+    TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+    EXPECT_TRUE(result);
+
+    HloInstruction* root = computation->root_instruction();
+    EXPECT_THAT(root, op::Constant());
+    EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
+  }
+}
+
+TEST_F(HloConstantFoldingTest, Slice) {
+  HloComputation::Builder builder(TestName());
+  const int64 dimensions[] = {11, 8, 7, 5, 9};
+  const int64 slice_start[] = {4, 2, 3, 1, 5};
+  const int64 slice_limits[] = {10, 8, 6, 5, 9};
+  TF_ASSIGN_OR_ASSERT_OK(auto literal,
+                         LiteralTestUtil::CreateRandomLiteral<F32>(
+                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
+  HloInstruction* literal_instruction = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+  Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4});
+  builder.AddInstruction(HloInstruction::CreateSlice(
+      shape, literal_instruction, slice_start, slice_limits));
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Constant());
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
+}
+
+TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
+  HloComputation::Builder builder(TestName());
+  const int64 dimensions[] = {11, 8, 7, 5, 9};
+  TF_ASSIGN_OR_ASSERT_OK(auto literal,
+                         LiteralTestUtil::CreateRandomLiteral<F32>(
+                             ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0));
+  auto literal_clone = LiteralUtil::CloneToUnique(*literal);
+  HloInstruction* literal_instruction = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+  Shape shape = ShapeUtil::MakeShape(F32, {8, 7, 11, 9, 5});
+  const int64 permutation[] = {1, 2, 0, 4, 3};
+  builder.AddInstruction(
+      HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  HloConstantFolding const_folder;
+  TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get()));
+  EXPECT_TRUE(result);
+
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Constant());
+  EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape));
+
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
+  bool matched = true;
+  LiteralUtil::EachCell<NativeT>(
+      root->literal(),
+      [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT value) {
+        std::vector<int64> rindexes = Permute(permutation, indices);
+        matched = matched && (value == LiteralUtil::Get<NativeT>(*literal_clone,
+                                                                 rindexes));
+      });
+  EXPECT_TRUE(matched);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 8fe1897e75cd0b5f013877b718735d117a5ee06b..38cc74b0f1e640d4e72188416258d9b262053152 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -136,9 +136,9 @@ Status HloCostAnalysis::HandleSlice(HloInstruction* slice,
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleDynamicSlice(
-    HloInstruction* slice,
-    tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+Status HloCostAnalysis::HandleDynamicSlice(HloInstruction* dynamic_slice,
+                                           HloInstruction* operand,
+                                           HloInstruction* start_indices) {
   return Status::OK();
 }
 
@@ -357,7 +357,9 @@ Status HloCostAnalysis::HandleRng(HloInstruction* random,
 Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) {
   // Compute the cost of the fused expression.
   HloInstruction* fused_expression_root = fusion->fused_expression_root();
-  HloCostAnalysis visitor(shape_size_);
+  // Don't compute sizes inside of fused ops. We don't use the size here and the
+  // operations inside might not have a layout.
+  HloCostAnalysis visitor([](const Shape&) { return 0; });
   TF_RETURN_IF_ERROR(fused_expression_root->Accept(&visitor));
 
   // Attribute the cost of the fused expression to the fusion node.
@@ -366,11 +368,9 @@ Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleCall(
-    HloInstruction* call, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-    HloComputation* computation) {
+Status HloCostAnalysis::HandleCall(HloInstruction* call) {
   HloCostAnalysis computation_visitor(shape_size_);
-  TF_RETURN_IF_ERROR(computation->Accept(&computation_visitor));
+  TF_RETURN_IF_ERROR(call->to_apply()->Accept(&computation_visitor));
 
   current_flop_count_ = computation_visitor.flop_count();
   current_transcendental_count_ = computation_visitor.transcendental_count();
@@ -394,18 +394,15 @@ Status HloCostAnalysis::HandleSort(HloInstruction* sort,
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleWhile(HloInstruction* xla_while,
-                                    HloInstruction* init,
-                                    HloComputation* condition,
-                                    HloComputation* body) {
+Status HloCostAnalysis::HandleWhile(HloInstruction* xla_while) {
   // Since the number of iterations of the while node is not statically
   // determined, we cannot precisely compute the cost of a while node. For now
   // compute the cost of a single iteration.
   // TODO(b/26346211): Improve the cost analysis for while node.
   HloCostAnalysis body_visitor(shape_size_);
-  TF_RETURN_IF_ERROR(body->Accept(&body_visitor));
+  TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&body_visitor));
   HloCostAnalysis condition_visitor(shape_size_);
-  TF_RETURN_IF_ERROR(condition->Accept(&condition_visitor));
+  TF_RETURN_IF_ERROR(xla_while->while_condition()->Accept(&condition_visitor));
 
   current_flop_count_ =
       body_visitor.flop_count() + condition_visitor.flop_count();
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index e6f059f53379df51c9f0b99e0e01f34f1aebb52a..b2c40f75ca4e833f1f5529977564b0e3a7ca25b1 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -84,16 +84,14 @@ class HloCostAnalysis : public DfsHloVisitor {
                       tensorflow::gtl::ArraySlice<int64> dimensions,
                       HloComputation* function_handle) override;
   Status HandleFusion(HloInstruction* fusion) override;
-  Status HandleCall(HloInstruction* call,
-                    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-                    HloComputation* computation) override;
+  Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call,
                           tensorflow::gtl::ArraySlice<HloInstruction*> operands,
                           tensorflow::StringPiece custom_call_target) override;
   Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override;
-  Status HandleDynamicSlice(
-      HloInstruction* slice,
-      tensorflow::gtl::ArraySlice<HloInstruction*> operands) override;
+  Status HandleDynamicSlice(HloInstruction* dynamic_slice,
+                            HloInstruction* operand,
+                            HloInstruction* start_indices) override;
   Status HandleDynamicUpdateSlice(HloInstruction* dynamic_update_slice,
                                   HloInstruction* operand,
                                   HloInstruction* update,
@@ -115,8 +113,7 @@ class HloCostAnalysis : public DfsHloVisitor {
   Status HandlePad(HloInstruction* pad) override;
   Status HandleReshape(HloInstruction* reshape) override;
   Status HandleTranspose(HloInstruction* transpose) override;
-  Status HandleWhile(HloInstruction* xla_while, HloInstruction* init,
-                     HloComputation* condition, HloComputation* body) override;
+  Status HandleWhile(HloInstruction* xla_while) override;
   Status FinishVisit(HloInstruction* root) override;
 
   Status Preprocess(HloInstruction* hlo) override;
@@ -136,7 +133,7 @@ class HloCostAnalysis : public DfsHloVisitor {
   int64 bytes_accessed() const { return bytes_accessed_; }
 
  private:
-  // An FMA counts as two floating point operations in these analyses.
+  // An FMA counts as two floating point operations in these analyzes.
   static constexpr int64 kFmaFlops = 2;
 
   // Utility function to handle all element-wise operations.
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 9f1c91d41c6bbe8f4cd61120ab0e260097214187..f71ffeb887a6a066a1516b941ca5bf237efc2890 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -126,8 +126,10 @@ class HloCostAnalysisTest : public ::testing::Test {
     auto user_computation = user_computation_status.ConsumeValueOrDie();
     VersionedComputationHandle versioned_handle =
         user_computation->GetVersionedHandle();
-    return std::move(
-        computation_tracker_.BuildHloModule(versioned_handle).ValueOrDie());
+    return std::move(computation_tracker_
+                         .BuildHloModule(versioned_handle,
+                                         /*config=*/nullptr)
+                         .ValueOrDie());
   }
 
   Client* client_;
@@ -375,6 +377,33 @@ TEST_F(FusionCostAnalysis, LoopFusion) {
   EXPECT_EQ(fusion_analysis.transcendental_count(), 4);
 }
 
+TEST_F(FusionCostAnalysis, NoLayout) {
+  Shape shape_with_layout = ShapeUtil::MakeShape(F32, {2, 3, 4, 5});
+  // Instructions within a fused op may have no layout.
+  Shape shape_without_layout = shape_with_layout;
+  shape_without_layout.clear_layout();
+
+  auto c1 = HloInstruction::CreateConstant(
+      LiteralUtil::CreateR4FromArray4D(Array4D<float>(2, 3, 4, 5)));
+  auto c2 =
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<float>({1, 2, 3}));
+
+  auto broadcast =
+      HloInstruction::CreateBroadcast(shape_without_layout, c2.get(), {1});
+  auto add = HloInstruction::CreateBinary(shape_with_layout, HloOpcode::kAdd,
+                                          c1.get(), broadcast.get());
+
+  auto fusion = HloInstruction::CreateFusion(
+      shape_with_layout, HloInstruction::FusionKind::kLoop, add.get());
+  fusion->FuseInstruction(broadcast.get());
+
+  HloCostAnalysis fusion_analysis(ShapeSize);
+  ASSERT_IS_OK(fusion->Accept(&fusion_analysis));
+
+  EXPECT_EQ(fusion_analysis.flop_count(), 120);
+  EXPECT_EQ(fusion_analysis.transcendental_count(), 0);
+}
+
 TEST_F(HloCostAnalysisTest, TupleCost) {
   HloCostAnalysis analysis(ShapeSize);
   {
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index ec8161f55fd56c95bb088a0c539255aed2fe6993..9444382b5270b0f76fa33b598297d24572e5b2c9 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -36,6 +37,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/types.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
@@ -88,13 +91,15 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_NE(add->operand(0), add->operand(1));
+  EXPECT_THAT(add, op::Add(constant1, constant2));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_EQ(add->operand(0), add->operand(1));
+  auto first_operand = add->operand(0);
+  EXPECT_THAT(first_operand, ::testing::AnyOf(constant1, constant2));
+  EXPECT_THAT(add, op::Add(first_operand, first_operand));
 
   auto result = ExecuteAndTransfer(std::move(module), {});
   auto expected = LiteralUtil::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
@@ -118,15 +123,13 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(constant1, add->operand(0));
-  EXPECT_EQ(constant2, add->operand(1));
+  EXPECT_THAT(add, op::Add(constant1, constant2));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
   EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(constant1, add->operand(0));
-  EXPECT_EQ(constant2, add->operand(1));
+  EXPECT_THAT(add, op::Add(constant1, constant2));
 
   auto result = ExecuteAndTransfer(std::move(module), {});
   auto expected = LiteralUtil::CreateR2<float>({{2.0, 4.0}, {6.0, 8.0}});
@@ -185,16 +188,18 @@ TEST_F(HloCseTest, NonscalarConstants) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
+  EXPECT_THAT(tuple,
+              op::Tuple(common_constant1, common_constant2, uncommon_constant));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
-
-  EXPECT_EQ(tuple->operand(0), tuple->operand(1));
-  EXPECT_EQ(uncommon_constant, tuple->operand(2));
-  EXPECT_TRUE(tuple->operand(0) == common_constant1 ||
-              tuple->operand(0) == common_constant2);
+  auto first_operand = tuple->operand(0);
+  EXPECT_THAT(first_operand,
+              ::testing::AnyOf(common_constant1, common_constant2));
+  EXPECT_THAT(tuple,
+              op::Tuple(first_operand, first_operand, uncommon_constant));
 }
 
 TEST_F(HloCseTest, IdenticalInstructions) {
@@ -215,16 +220,15 @@ TEST_F(HloCseTest, IdenticalInstructions) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(5, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
-  EXPECT_NE(tuple->operand(1), tuple->operand(2));
-  EXPECT_NE(tuple->operand(0), tuple->operand(2));
+  EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(tuple->operand(0), tuple->operand(1));
-  EXPECT_EQ(tuple->operand(1), tuple->operand(2));
+  auto first_operand = tuple->operand(0);
+  EXPECT_THAT(first_operand, ::testing::AnyOf(exp1, exp2, exp3));
+  EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand, first_operand));
 }
 
 TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
@@ -249,13 +253,13 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
+  EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
   EXPECT_FALSE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
+  EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 }
 
 TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
@@ -280,13 +284,15 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
+  EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
-  EXPECT_EQ(tuple->operand(0), tuple->operand(1));
+  auto first_operand = tuple->operand(0);
+  EXPECT_THAT(first_operand, ::testing::AnyOf(exp1, exp2));
+  EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand));
 }
 
 TEST_F(HloCseTest, IdenticalExpressions) {
@@ -328,14 +334,15 @@ TEST_F(HloCseTest, IdenticalExpressions) {
   auto computation = module.AddEntryComputation(builder.Build());
 
   EXPECT_EQ(8, computation->instruction_count());
-  EXPECT_NE(tuple->operand(0), tuple->operand(1));
+  EXPECT_THAT(tuple, op::Tuple(op::Add(negate1, exp1), op::Add(negate2, exp2)));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(&module).ValueOrDie());
 
   EXPECT_EQ(5, computation->instruction_count());
-  EXPECT_EQ(tuple->operand(0), tuple->operand(1));
-  EXPECT_EQ(HloOpcode::kAdd, tuple->operand(0)->opcode());
+  auto operand = tuple->operand(0);
+  EXPECT_THAT(tuple, op::Tuple(operand, operand));
+  EXPECT_THAT(operand, op::Add(op::Negate(), op::Exp()));
 }
 
 TEST_F(HloCseTest, DoNotCombineRng) {
@@ -351,12 +358,16 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   auto rng2 = builder.AddInstruction(HloInstruction::CreateRng(
       ShapeUtil::MakeShape(F32, {}), RandomDistribution::RNG_UNIFORM,
       {constant1, constant2}));
+
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, rng1, rng2));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
 
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(rng1, rng2));
+
   uint32 count_before = computation->instruction_count();
 
   HloCSE cse(/*is_layout_sensitive=*/false);
@@ -364,11 +375,8 @@ TEST_F(HloCseTest, DoNotCombineRng) {
 
   uint32 count_after = computation->instruction_count();
   EXPECT_EQ(count_before, count_after);
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kRng);
-  EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kRng);
-  EXPECT_NE(root->operand(0), root->operand(1));
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(rng1, rng2));
 }
 
 // TODO(b/28245743): Handle impure functions correctly in CSE.
@@ -412,16 +420,17 @@ TEST_F(HloCseTest, DISABLED_DoNotCombineCallsToImpureFunctions) {
   }
 
   EXPECT_EQ(4, computation->instruction_count());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Map(), op::Map()));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(4, computation->instruction_count());
-  HloInstruction* root = computation->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kMap);
-  EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kMap);
-  EXPECT_NE(root->operand(0), root->operand(1));
+  root = computation->root_instruction();
+  auto operand = root->operand(0)->operand(0);
+  EXPECT_THAT(operand, op::Map());
+  EXPECT_THAT(root, op::Add(operand, operand));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index fdfbbf8baf65884fcb1eed846e6ce3eda07bc45d..3755b9e4c005c5e50b149d8dc8c51363eb111868 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -52,7 +52,7 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
     for (auto& instruction : computation->instructions()) {
       if (instruction->user_count() == 0 &&
           live_instructions.count(instruction.get()) == 0 &&
-          HloComputation::IsRemovable(instruction->opcode())) {
+          computation->IsRemovable(instruction.get())) {
         dead_roots.push_back(instruction.get());
       }
     }
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index dcd9e00c56c76046e6c1de75558637b7e941e57e..4191eaaad06da5baf01cd74e6a52d6aacf396cd6 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -93,5 +94,65 @@ TEST_F(HloDceTest, DeadParameters) {
   EXPECT_EQ(0, dead_param1->user_count());
 }
 
+TEST_F(HloDceTest, ControlDependencies) {
+  // Verify that instructions with control dependencies are not removed.
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(123.0f)));
+
+  // Create two dead instructions: a negate and an add.
+  auto dead_negate = builder.AddInstruction(HloInstruction::CreateUnary(
+      constant1->shape(), HloOpcode::kNegate, constant1));
+  auto dead_add = builder.AddInstruction(HloInstruction::CreateBinary(
+      constant1->shape(), HloOpcode::kAdd, constant1, constant2));
+
+  // Create the same two instructions again, but these will have a control
+  // dependency added.
+  auto dead_negate_with_control_dep =
+      builder.AddInstruction(HloInstruction::CreateUnary(
+          constant1->shape(), HloOpcode::kNegate, constant1));
+  auto dead_add_with_control_dep =
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          constant1->shape(), HloOpcode::kAdd, constant1, constant2));
+
+  // Create a root so the previously added instruction is dead.
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      constant1->shape(), HloOpcode::kAdd, constant1, constant2));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  // Add a control dependency between two instructions.
+  TF_ASSERT_OK(dead_negate_with_control_dep->AddControlDependencyTo(
+      dead_add_with_control_dep));
+
+  // Returns whether the given instruction exists in the test computation.
+  auto has_instruction = [computation](const HloInstruction* instruction) {
+    for (auto& inst : computation->instructions()) {
+      if (inst.get() == instruction) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  EXPECT_EQ(7, computation->instruction_count());
+  EXPECT_TRUE(has_instruction(dead_negate));
+  EXPECT_TRUE(has_instruction(dead_add));
+  EXPECT_TRUE(has_instruction(dead_negate_with_control_dep));
+  EXPECT_TRUE(has_instruction(dead_add_with_control_dep));
+
+  HloDCE dce;
+  EXPECT_TRUE(dce.Run(module.get()).ValueOrDie());
+
+  EXPECT_EQ(5, computation->instruction_count());
+  EXPECT_FALSE(has_instruction(dead_negate));
+  EXPECT_FALSE(has_instruction(dead_add));
+  EXPECT_TRUE(has_instruction(dead_negate_with_control_dep));
+  EXPECT_TRUE(has_instruction(dead_add_with_control_dep));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0447d69aa2229e2cb391aac8b2afa8fde6145c1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -0,0 +1,557 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <functional>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/index_util.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/bitmap.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+template <typename ReturnT>
+class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit TypedVisitor(HloEvaluator* p) : parent_(p) {}
+
+  Status DefaultAction(HloInstruction* hlo_instruction) override {
+    return Unimplemented("unhandled HLO ops for HloEvaluator: %s.",
+                         HloOpcodeString(hlo_instruction->opcode()).c_str());
+  };
+
+  // TODO(b/35950897): many of the stl functions used in the handlers are not
+  // overloaded for every XLA primitive types.
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
+                nullptr>
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                          return elem_operand;
+                        }));
+    return Status::OK();
+  };
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_signed<NativeT>::value>::type* = nullptr>
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs],
+                        ElementWiseUnaryOp(abs, [](NativeT elem_operand) {
+                          return std::abs(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleAbs(HloInstruction* abs, HloInstruction* operand) override {
+    return HandleAbs<ReturnT>(abs, operand);
+  };
+
+  Status HandleCeil(HloInstruction* ceil, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil],
+                        ElementWiseUnaryOp(ceil, [](ReturnT elem_operand) {
+                          return std::ceil(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleCopy(HloInstruction* copy, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[copy],
+                        ElementWiseUnaryOp(copy, [](ReturnT elem_operand) {
+                          return elem_operand;
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleExp(HloInstruction* exp, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
+                        ElementWiseUnaryOp(exp, [](ReturnT elem_operand) {
+                          return std::exp(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleFloor(HloInstruction* floor, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[floor],
+                        ElementWiseUnaryOp(floor, [](ReturnT elem_operand) {
+                          return std::floor(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleIsFinite(HloInstruction* is_finite,
+                        HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[is_finite],
+                        ElementWiseUnaryOp(is_finite, [](ReturnT elem_operand) {
+                          return std::isfinite(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleLog(HloInstruction* log, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[log],
+                        ElementWiseUnaryOp(log, [](ReturnT elem_operand) {
+                          return std::log(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalNot(HloInstruction* logical_not,
+                          HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_not],
+        ElementWiseUnaryOp(logical_not,
+                           [](ReturnT elem_operand) { return !elem_operand; }));
+    return Status::OK();
+  };
+
+  Status HandleNegate(HloInstruction* negate,
+                      HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[negate],
+                        ElementWiseUnaryOp(negate, [](ReturnT elem_operand) {
+                          return -elem_operand;
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleSign(HloInstruction* sign, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
+                        ElementWiseUnaryOp(sign, [](ReturnT elem_operand) {
+                          return (ReturnT(0) < elem_operand) -
+                                 (elem_operand < ReturnT(0));
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleTanh(HloInstruction* tanh, HloInstruction* operand) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh],
+                        ElementWiseUnaryOp(tanh, [](ReturnT elem_operand) {
+                          return std::tanh(elem_operand);
+                        }));
+    return Status::OK();
+  };
+
+  Status HandleMultiply(HloInstruction* multiply, HloInstruction* lhs,
+                        HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[multiply],
+        ElementWiseBinaryOp(multiply, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem * rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleSubtract(HloInstruction* subtract, HloInstruction* lhs,
+                        HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[subtract],
+        ElementWiseBinaryOp(subtract, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem - rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleAdd(HloInstruction* add, HloInstruction* lhs,
+                   HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[add],
+        ElementWiseBinaryOp(add, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem + rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleDivide(HloInstruction* divide, HloInstruction* lhs,
+                      HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[divide],
+        ElementWiseBinaryOp(divide, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return lhs_elem / rhs_elem;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleCompare(HloInstruction* compare, HloOpcode opcode,
+                       HloInstruction* lhs, HloInstruction* rhs) override {
+    std::function<bool(ReturnT, ReturnT)> compare_op;
+    switch (opcode) {
+      case HloOpcode::kEq:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el == rhs_el;
+        };
+        break;
+      case HloOpcode::kNe:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el != rhs_el;
+        };
+        break;
+      case HloOpcode::kGe:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el >= rhs_el;
+        };
+        break;
+      case HloOpcode::kGt:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el > rhs_el;
+        };
+        break;
+      case HloOpcode::kLe:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el <= rhs_el;
+        };
+        break;
+      case HloOpcode::kLt:
+        compare_op = [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el < rhs_el;
+        };
+        break;
+      default:
+        LOG(FATAL) << "unhandled HLO opcode for conversion to Comparison: "
+                   << HloOpcodeString(opcode);
+    }
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!(ShapeUtil::SameDimensions(compare->shape(), rhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
+      return Unimplemented(
+          "Compare operation with mismatched dimensions, likely due to "
+          "broadcasting is unsupported.");
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    auto result = LiteralUtil::CreateFromShape(compare->shape());
+    std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
+    do {
+      LiteralUtil::Set<bool>(
+          result.get(), multi_index,
+          compare_op(LiteralUtil::Get<ReturnT>(lhs_literal, multi_index),
+                     LiteralUtil::Get<ReturnT>(rhs_literal, multi_index)));
+    } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+
+    parent_->evaluated_[compare] = std::move(result);
+
+    return Status::OK();
+  };
+
+  Status HandleMaximum(HloInstruction* maximum, HloInstruction* lhs,
+                       HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[maximum],
+        ElementWiseBinaryOp(maximum, [](ReturnT lhs, ReturnT rhs) {
+          return std::max(lhs, rhs);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleMinimum(HloInstruction* minimum, HloInstruction* lhs,
+                       HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[minimum],
+        ElementWiseBinaryOp(minimum, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::min(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandlePower(HloInstruction* power, HloInstruction* lhs,
+                     HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[power],
+        ElementWiseBinaryOp(power, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::pow(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleRemainder(HloInstruction* remainder, HloInstruction* lhs,
+                         HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[remainder],
+        ElementWiseBinaryOp(remainder, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return std::remainder(lhs_el, rhs_el);
+        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalAnd(HloInstruction* logical_and, HloInstruction* lhs,
+                          HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_and],
+        ElementWiseBinaryOp(logical_and, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el && rhs_el;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleLogicalOr(HloInstruction* logical_or, HloInstruction* lhs,
+                         HloInstruction* rhs) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[logical_or],
+        ElementWiseBinaryOp(logical_or, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el || rhs_el;
+        }));
+    return Status::OK();
+  };
+
+  Status HandleClamp(HloInstruction* clamp, HloInstruction* min,
+                     HloInstruction* arg, HloInstruction* max) override {
+    std::function<ReturnT(ReturnT, ReturnT, ReturnT)> clamp_op =
+        [](ReturnT low, ReturnT high, ReturnT value) {
+          return std::max(low, std::min(value, high));
+        };
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[clamp],
+                        ElementWiseTernaryOp(clamp, std::move(clamp_op)));
+    return Status::OK();
+  };
+
+  Status HandleSelect(HloInstruction* select, HloInstruction* pred,
+                      HloInstruction* on_true,
+                      HloInstruction* on_false) override {
+    CHECK(!ShapeUtil::IsTuple(select->shape()));
+    std::function<ReturnT(bool, ReturnT, ReturnT)> select_op =
+        [](bool pred, ReturnT on_true, ReturnT on_false) {
+          if (pred) {
+            return on_true;
+          }
+          return on_false;
+        };
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[select],
+                        ElementWiseTernaryOp(select, std::move(select_op)));
+    return Status::OK();
+  };
+
+  Status Preprocess(HloInstruction* hlo) override {
+    VLOG(2) << hlo->ToString();
+    return Status::OK();
+  };
+
+ private:
+  StatusOr<std::unique_ptr<Literal>> ElementWiseUnaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(ReturnT)>& unary_op) {
+    const auto shape = instruction->shape();
+    const auto* operand = instruction->operand(0);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!ShapeUtil::SameDimensions(shape, operand->shape())) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(operand->shape()).c_str());
+    }
+
+    const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
+
+    auto result = LiteralUtil::CreateFromShape(shape);
+
+    std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
+    do {
+      LiteralUtil::Set<ReturnT>(
+          result.get(), multi_index,
+          unary_op(LiteralUtil::Get<ReturnT>(operand_literal, multi_index)));
+    } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+
+    return std::move(result);
+  };
+
+  StatusOr<std::unique_ptr<Literal>> ElementWiseBinaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(ReturnT, ReturnT)>& binary_op) {
+    const auto shape = instruction->shape();
+    const auto* lhs = instruction->operand(0);
+    const auto* rhs = instruction->operand(1);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s vs %s: ",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(lhs->shape()).c_str(),
+          ShapeUtil::HumanString(rhs->shape()).c_str());
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+
+    auto result = LiteralUtil::CreateFromShape(shape);
+    std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
+    do {
+      LiteralUtil::Set<ReturnT>(
+          result.get(), multi_index,
+          binary_op(LiteralUtil::Get<ReturnT>(lhs_literal, multi_index),
+                    LiteralUtil::Get<ReturnT>(rhs_literal, multi_index)));
+    } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+
+    return std::move(result);
+  };
+
+  template <typename LhsType, typename RhsType, typename EhsType>
+  StatusOr<std::unique_ptr<Literal>> ElementWiseTernaryOp(
+      HloInstruction* instruction,
+      const std::function<ReturnT(LhsType, RhsType, EhsType)>& ternary_op) {
+    const auto shape = instruction->shape();
+    const auto* lhs = instruction->operand(0);
+    const auto* rhs = instruction->operand(1);
+    const auto* ehs = instruction->operand(2);
+
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
+    // removed.
+    if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
+          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
+          ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
+      return Unimplemented(
+          "Implicit broadcasting is currently unsupported in HLO evaluator "
+          "Shape Mismatch: %s vs %s vs %s vs %s: ",
+          ShapeUtil::HumanString(shape).c_str(),
+          ShapeUtil::HumanString(lhs->shape()).c_str(),
+          ShapeUtil::HumanString(rhs->shape()).c_str(),
+          ShapeUtil::HumanString(ehs->shape()).c_str());
+    }
+
+    const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+    const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+    const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs);
+
+    auto result = LiteralUtil::CreateFromShape(shape);
+    std::vector<int64> multi_index(ShapeUtil::Rank(result->shape()), 0);
+    do {
+      LiteralUtil::Set<ReturnT>(
+          result.get(), multi_index,
+          ternary_op(LiteralUtil::Get<LhsType>(lhs_literal, multi_index),
+                     LiteralUtil::Get<RhsType>(rhs_literal, multi_index),
+                     LiteralUtil::Get<EhsType>(ehs_literal, multi_index)));
+    } while (IndexUtil::BumpIndices(result->shape(), &multi_index));
+
+    return std::move(result);
+  };
+
+  HloEvaluator* parent_;
+};
+
+HloEvaluator::HloEvaluator() {
+  typed_visitors_[PRED] = MakeUnique<TypedVisitor<bool>>(this);
+  typed_visitors_[U8] = MakeUnique<TypedVisitor<uint8>>(this);
+  typed_visitors_[U16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: U16.");
+  });
+  typed_visitors_[U32] = MakeUnique<TypedVisitor<uint32>>(this);
+  typed_visitors_[U64] = MakeUnique<TypedVisitor<uint64>>(this);
+  typed_visitors_[S8] = MakeUnique<TypedVisitor<int8>>(this);
+  typed_visitors_[S16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: S16.");
+  });
+  typed_visitors_[S32] = MakeUnique<TypedVisitor<int32>>(this);
+  typed_visitors_[S64] = MakeUnique<TypedVisitor<int64>>(this);
+  typed_visitors_[F16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("unhandled primitive type: F16.");
+  });
+  typed_visitors_[F32] = MakeUnique<TypedVisitor<float>>(this);
+  typed_visitors_[F64] = MakeUnique<TypedVisitor<double>>(this);
+}
+
+StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
+    HloComputation* computation,
+    tensorflow::gtl::ArraySlice<const Literal*> args) {
+  arg_literals_ = args;
+  evaluated_.clear();
+
+  TF_RETURN_IF_ERROR(computation->Accept(this));
+  return std::move(FindOrDie(evaluated_, computation->root_instruction()));
+}
+
+StatusOr<std::unique_ptr<Literal>> HloEvaluator::Evaluate(
+    HloInstruction* instruction,
+    tensorflow::gtl::ArraySlice<const Literal*> operands) {
+  DCHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
+  Shape shape = instruction->shape();
+  TF_CHECK_OK(ShapeUtil::ValidateShape(shape));
+
+  arg_literals_ = operands;
+  evaluated_.clear();
+
+  // Evaluate operands of Parameter type against the input literals which
+  // caches the evaluated literal results.
+  for (const auto operand : instruction->operands()) {
+    if (operand->opcode() == HloOpcode::kParameter) {
+      const Literal* input_literal = arg_literals_[operand->parameter_number()];
+      VLOG(2) << "Parameter operand evaluated to: "
+              << LiteralUtil::ToString(*input_literal);
+      TF_RET_CHECK(ShapeUtil::Equal(operand->shape(), input_literal->shape()));
+
+      evaluated_[operand] = MakeUnique<Literal>(*input_literal);
+    } else if (operand->opcode() == HloOpcode::kConstant) {
+      evaluated_[operand] = MakeUnique<Literal>(operand->literal());
+    }
+  }
+
+  TF_RETURN_IF_ERROR(instruction->Visit(this));
+  return std::move(FindOrDie(evaluated_, instruction));
+}
+
+Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
+  VLOG(2) << "HandleParameter: " << parameter->ToString();
+  const Literal* input_literal = arg_literals_[parameter->parameter_number()];
+  VLOG(2) << "Parameter evaluated to: "
+          << LiteralUtil::ToString(*input_literal);
+  DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()));
+
+  evaluated_[parameter] = MakeUnique<Literal>(*input_literal);
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleConstant(HloInstruction* constant,
+                                    const Literal& literal) {
+  VLOG(2) << "HandleConstant: " << constant->ToString();
+  DCHECK(ShapeUtil::Equal(constant->shape(), literal.shape()));
+
+  evaluated_[constant] = MakeUnique<Literal>(literal);
+  return Status::OK();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
new file mode 100644
index 0000000000000000000000000000000000000000..040fd3d73c8e5887f4b5d2952a088687b099c560
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
+
+#include <memory>
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+// Responsible for evaluating HLO and obtain literal as the evaluation results.
+//
+// This class is not thread-safe.
+class HloEvaluator : public DfsHloVisitorWithDefault {
+ public:
+  HloEvaluator();
+  // Evaluates a HLO computation and an array of pointers to literals.
+  // Return the evaluated result as literal if successful.
+  // Precondition: argument literals are corresponds to the input computation's
+  // parameters in their post-ordering. For e.g., consider the following graph:
+  //
+  //                *
+  //            /       \
+  //            +     Parameter1
+  //        /      \
+  //       /        \
+  //    Parameter0  Constant
+  //
+  // The input literals array will have its first literal map to Parameter0 and
+  // the second map to Parameter1.
+  StatusOr<std::unique_ptr<Literal>> Evaluate(
+      HloComputation* computation,
+      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+
+  // Evaluates a single HLO instruction and an array of pointers to literals.
+  // Return the evaluated result as literal if successful.
+  // Precondition:
+  // 1. argument literals are corresponds to the input instruction's
+  // parameters in their post-orderring.
+  // 2. the instruction's operands must be of either Parameter or Constant type.
+  // TODO(b/35950897): implement more ops other than element-wise ops.
+  StatusOr<std::unique_ptr<Literal>> Evaluate(
+      HloInstruction* instruction,
+      tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
+
+ protected:
+  // Templated DfsHloVisitor. Typically ReturnT here indicates the resulting
+  // literal type of each evaluated Handle* method of a TypedVisitor. One
+  // exception to this is HandleCompare, where the resulting literal type is
+  // always boolean.
+  // Note the forward declaration here is necessary to enable TypedVisitor to
+  // access parent members.
+  template <typename ReturnT>
+  class TypedVisitor;
+
+  // Wraps around instruction handling to infer types before dispatching to
+  // the corresponding typed Visitor.
+  Status DefaultAction(HloInstruction* hlo) override {
+    return hlo->Visit(typed_visitors_.at(hlo->shape().element_type()).get());
+  }
+
+  Status HandleParameter(HloInstruction* parameter) override;
+
+  Status HandleConstant(HloInstruction* constant,
+                        const Literal& literal) override;
+
+ private:
+  // Returns the already-evaluated literal result for the instruction.
+  // Crash with log if the given instruction has not been evaluated previously.
+  const Literal& GetEvaluatedLiteralFor(const HloInstruction* hlo) {
+    auto it = evaluated_.find(hlo);
+    CHECK(it != evaluated_.end())
+        << "could not find evaluated value for: " << hlo->ToString();
+    return *(it->second);
+  }
+
+  // Map from a primitive type to its associated (templated) DfsHloVisitor.
+  // Note: the hash function here is only needed because current gcc std::hash
+  // does not specialize for enum types. This should however be fixed in the
+  // future: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60970#c5
+  tensorflow::gtl::FlatMap<PrimitiveType, std::unique_ptr<DfsHloVisitor>,
+                           std::hash<int>>
+      typed_visitors_;
+
+  // Tracks the HLO instruction and its evaluated literal result.
+  // TODO(b/35950897): have better memory management here to free instructions
+  // that are no longer a parent for any other subsequent instruction in
+  // post-orderring.
+  tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<Literal>>
+      evaluated_;
+
+  // Stores input literals, assuming they are in post-order. Literals are not
+  // owned by this class, and they must outlive the lifetime of the instance of
+  // this class.
+  tensorflow::gtl::ArraySlice<const Literal*> arg_literals_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
+};
+
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..443e5ad4f4290ff10b867887ac5ed359a0c8f73a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -0,0 +1,191 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class HloEvaluatorTest : public ::testing::Test {
+ protected:
+  HloEvaluatorTest() { evaluator_ = MakeUnique<HloEvaluator>(); }
+
+  std::unique_ptr<HloEvaluator> evaluator_;
+};
+
+// Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
+// with 3 operands.
+TEST_F(HloEvaluatorTest, DoesClamp) {
+  auto low = LiteralUtil::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
+  auto high = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
+  auto value = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
+
+  Shape shape = low->shape();
+  auto c1 = HloInstruction::CreateConstant(std::move(low));
+  auto c2 = HloInstruction::CreateConstant(std::move(high));
+  auto c3 = HloInstruction::CreateConstant(std::move(value));
+  auto instruction = HloInstruction::CreateTernary(
+      shape, HloOpcode::kClamp, c1.get(), c2.get(), c3.get());
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<float>({{0, 4}, {2, 4}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
+// Verifies that HloEvaluator evaluates a HLO instruction that performs select
+// with 3 operands.
+TEST_F(HloEvaluatorTest, DoesSelect) {
+  auto pred = LiteralUtil::CreateR2<bool>({{true, false}, {false, true}});
+  auto on_true = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
+  auto on_false = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
+
+  Shape shape = on_true->shape();
+  auto c1 = HloInstruction::CreateConstant(std::move(pred));
+  auto c2 = HloInstruction::CreateConstant(std::move(on_true));
+  auto c3 = HloInstruction::CreateConstant(std::move(on_false));
+  auto instruction = HloInstruction::CreateTernary(
+      shape, HloOpcode::kSelect, c1.get(), c2.get(), c3.get());
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<float>({{2, 5}, {0, 4}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise addition with 2 operands.
+TEST_F(HloEvaluatorTest, DoesAdd) {
+  auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
+
+  Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
+  auto c1 = HloInstruction::CreateConstant(std::move(lhs));
+  auto c2 = HloInstruction::CreateConstant(std::move(rhs));
+  auto instruction =
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, c1.get(), c2.get());
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-96, 8}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise divide with 2 operands.
+TEST_F(HloEvaluatorTest, DoesDivide) {
+  auto lhs_s64 = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs_s64 = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
+
+  Shape shape_s64 = ShapeUtil::MakeShape(S64, {2, 2});
+  auto c1_s64 = HloInstruction::CreateConstant(std::move(lhs_s64));
+  auto c2_s64 = HloInstruction::CreateConstant(std::move(rhs_s64));
+  auto instruction = HloInstruction::CreateBinary(shape_s64, HloOpcode::kDivide,
+                                                  c1_s64.get(), c2_s64.get());
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {-25, 1}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+
+  auto lhs_f64 = LiteralUtil::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
+  auto rhs_f64 = LiteralUtil::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
+
+  Shape shape_f64 = ShapeUtil::MakeShape(F64, {2, 2});
+  auto c1_f64 = HloInstruction::CreateConstant(std::move(lhs_f64));
+  auto c2_f64 = HloInstruction::CreateConstant(std::move(rhs_f64));
+  instruction = HloInstruction::CreateBinary(shape_f64, HloOpcode::kDivide,
+                                             c1_f64.get(), c2_f64.get());
+
+  result = evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
+
+  expected =
+      LiteralUtil::CreateR2<double>({{0.45454545454545453, 0}, {-25, 1}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise abs op with 1 operand.
+TEST_F(HloEvaluatorTest, DoesAbs) {
+  auto operand = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
+  Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
+  auto c1 = HloInstruction::CreateConstant(std::move(operand));
+  auto instruction =
+      HloInstruction::CreateUnary(shape, HloOpcode::kAbs, c1.get());
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<int64>({{1, 20}, {100, 4}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
+// Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
+// constant operands.
+TEST_F(HloEvaluatorTest, DoesTraveseInstructions) {
+  HloComputation::Builder builder(
+      ::testing::UnitTest::GetInstance()->current_test_info()->name());
+
+  auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto rhs2 = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
+  std::vector<const Literal*> args = {lhs.get(), rhs.get(), rhs2.get()};
+
+  Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
+
+  auto param_lhs = HloInstruction::CreateParameter(0, shape, "lhs");
+  auto param_rhs = HloInstruction::CreateParameter(1, shape, "rhs");
+  auto lhs_instruction = HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd, param_lhs.get(), param_rhs.get());
+
+  auto param_rhs2 = HloInstruction::CreateParameter(2, shape, "rhs2");
+  auto root_instruction = HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd, lhs_instruction.get(), param_rhs2.get());
+
+  builder.AddInstruction(std::move(root_instruction));
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(builder.Build().get(), args).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<int64>({{4, -16}, {-196, 12}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index 447892c8dec9ea0549a35c9ea2b20303c52b9aa2..9e25f1aceb1595b89aee601b294792e9e801c6f3 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -70,6 +70,7 @@ string HloExecutionProfile::ToString(
   string result;
   const int64 total_cycles = total_cycles_executed(computation);
   double clock_rate_ghz = device_description.clock_rate_ghz();
+  CHECK_GE(clock_rate_ghz, 1e-9);
 
   const auto cycles_to_microseconds = [&](double cycles) {
     return cycles / clock_rate_ghz / 1000.0;
@@ -80,14 +81,19 @@ string HloExecutionProfile::ToString(
     double nsecs = cycles / clock_rate_ghz;
     string bytes_per_sec;
     string bytes_per_cycle;
-    if (bytes_accessed >= 0) {
+    if (cycles <= 0 || bytes_accessed < 0) {
+      bytes_per_sec = "<unknown>";
+      bytes_per_cycle = "<unknown>";
+    } else {
       bytes_per_sec = tensorflow::strings::HumanReadableNumBytes(
           bytes_accessed / (nsecs / 1e9));
       bytes_per_cycle =
           tensorflow::strings::HumanReadableNumBytes(bytes_accessed / cycles);
-    } else {
-      bytes_per_sec = "<unknown>";
-      bytes_per_cycle = "<unknown>";
+    }
+
+    double cycles_percent = 0;
+    if (total_cycles > 0) {
+      cycles_percent = cycles / static_cast<double>(total_cycles) * 100;
     }
 
     tensorflow::strings::StrAppend(
@@ -97,8 +103,7 @@ string HloExecutionProfile::ToString(
             ":: "
             "%12s/cycle :: "
             "%s",
-            cycles, cycles / static_cast<double>(total_cycles) * 100,
-            cycles_to_microseconds(cycles),
+            cycles, cycles_percent, cycles_to_microseconds(cycles),
             flops <= 0 ? "<none>" : HumanReadableNumFlops(flops, nsecs).c_str(),
             bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str()));
   };
@@ -114,26 +119,30 @@ string HloExecutionProfile::ToString(
   for (const auto& item : items) {
     const HloInstruction* hlo = item.first;
     tensorflow::strings::StrAppend(&result, "\n\t");
-    int64 flops = hlo == nullptr ? -1 : cost_analysis.flop_count(*hlo);
-    int64 bytes_accessed =
-        hlo == nullptr ? -1 : cost_analysis.bytes_accessed(*hlo);
-    string display = hlo == nullptr ? "<none>" : hlo->ToString();
+    const int64 flops = (hlo == nullptr) ? -1 : cost_analysis.flop_count(*hlo);
+    const int64 bytes_accessed =
+        (hlo == nullptr) ? -1 : cost_analysis.bytes_accessed(*hlo);
+    const string display = (hlo == nullptr) ? "<none>" : hlo->ToString();
     append_item(item.second, flops, bytes_accessed, display);
   }
 
-  MetricTableReport table;
-  table.SetMetricName("microseconds");
-  table.SetEntryName("ops");
-  table.SetShowCategoryTable();
-  for (const auto& item : items) {
-    MetricTableReport::Entry entry;
-    entry.text = item.first->ToString();
-    entry.short_text = item.first->ToString(/*compact_operands=*/true);
-    entry.category_text = item.first->ToCategory();
-    entry.metric = cycles_to_microseconds(item.second);
-    table.AddEntry(std::move(entry));
+  if (total_cycles <= 0) {
+    result += "****** 0 total cycles ******\n";
+  } else {
+    MetricTableReport table;
+    table.SetMetricName("microseconds");
+    table.SetEntryName("ops");
+    table.SetShowCategoryTable();
+    for (const auto& item : items) {
+      MetricTableReport::Entry entry;
+      entry.text = item.first->ToString();
+      entry.short_text = item.first->ToString(/*compact_operands=*/true);
+      entry.category_text = item.first->ToCategory();
+      entry.metric = cycles_to_microseconds(item.second);
+      table.AddEntry(std::move(entry));
+    }
+    result += table.MakeReport(cycles_to_microseconds(total_cycles));
   }
-  result += table.MakeReport(cycles_to_microseconds(total_cycles));
 
   return result;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 0af4c99d0a51ab6e4d3048abae1b9c3fb6dca5e6..eb2e5dfb37f33fd138e20ee930a2242cb1db89ea 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -47,6 +48,73 @@ namespace xla {
 namespace hlo_graph_dumper {
 namespace {
 
+// Node color schemes, used by NodeColorAttributes.
+enum ColorScheme {
+  kBlue,
+  kBrown,
+  kDarkBlue,
+  kDarkGreen,
+  kDarkRed,
+  kGray,
+  kGreen,
+  kOrange,
+  kPurple,
+  kRed,
+  kWhite,
+  kYellow,
+};
+
+// Given a ColorScheme, returns an attribute string for a node of that color.
+// Sets the node's fill, stroke, and text colors.
+//
+// Colors are from https://material.io/color.
+string NodeColorAttributes(ColorScheme color) {
+  using std::make_tuple;
+
+  const char *fill_color, *stroke_color, *font_color;
+  std::tie(fill_color, stroke_color, font_color) =
+      [color]() -> std::tuple<const char*, const char*, const char*> {
+    switch (color) {
+      case kBlue:
+        return make_tuple("#bbdefb", "#8aacc8", "black");
+      case kBrown:
+        return make_tuple("#bcaaa4", "#8c7b75", "black");
+      case kDarkBlue:
+        return make_tuple("#1565c0", "#003c8f", "white");
+      case kDarkGreen:
+        return make_tuple("#2e7d32", "#005005", "white");
+      case kDarkRed:
+        return make_tuple("#b71c1c", "#7f0000", "white");
+      case kGray:
+        return make_tuple("#cfd8dc", "#9ea7aa", "black");
+      case kGreen:
+        return make_tuple("#c8e6c9", "#97b498", "black");
+      case kOrange:
+        return make_tuple("#ffe0b2", "#cbae82", "black");
+      case kPurple:
+        return make_tuple("#e1bee7", "#af8eb5", "black");
+      case kRed:
+        return make_tuple("#ffcdd2", "#cb9ca1", "black");
+      case kWhite:
+        return make_tuple("white", "black", "black");
+      case kYellow:
+        return make_tuple("#fff9c4", "#cbc693", "black");
+    }
+  }();
+
+  return Printf(
+      "style=filled, fontcolor=\"%s\", color=\"%s\", fillcolor=\"%s\"",
+      font_color, stroke_color, fill_color);
+}
+
+// Replaces <> with &lt;&gt;, so that this string is safe(er) for use in a
+// graphviz HTML-like string.
+string HtmlLikeStringSanitize(tensorflow::StringPiece s) {
+  return tensorflow::str_util::StringReplace(
+      tensorflow::str_util::StringReplace(s, "<", "&lt;", /*replace_all=*/true),
+      ">", "&gt;", /*replace_all=*/true);
+}
+
 // Returns the dot graph identifier for the given instruction.
 string InstructionId(const HloInstruction* instruction) {
   return Printf("%lld", reinterpret_cast<uint64>(instruction));
@@ -101,30 +169,36 @@ string InstructionSequenceGraph(
       param_ports.push_back(
           Printf("<%s> %s", InstructionId(param).c_str(), label.c_str()));
     }
-    StrAppend(&graph_body, param_node_name,
-              " [shape=record,style=filled,fillcolor=\"lightblue1\",",
-              "label=\"{parameters | {", Join(param_ports, "|"), "}}\"];\n");
+    // (If we wanted the word "parameters" to be bold like the other op names,
+    // we'd have to make this into an HTML-like table.  It is possible but
+    // complicated; see http://www.graphviz.org/doc/info/shapes.html#html.)
+    StrAppend(&graph_body, param_node_name, " [shape=record ",
+              NodeColorAttributes(kOrange), "label=\"{parameters | {",
+              Join(param_ports, "|"), "}}\"];\n");
   }
 
   for (auto& instruction : instructions) {
-    string color = "peachpuff";
-    string shape = "ellipse";
-    string name = instruction->ExtendedOpcodeStr();
+    ColorScheme color = kYellow;
+    string shape = "box";
+    string name =
+        StrCat("<b>", HtmlLikeStringSanitize(instruction->ExtendedOpcodeStr()),
+               "</b> ", HtmlLikeStringSanitize(instruction->name()));
     if (HloOpcode::kConvolution == instruction->opcode()) {
-      name += ":\\n" + instruction->ConvolutionDimensionNumbersToString() +
-              "\\n" + window_util::ToString(instruction->window());
+      StrAppend(
+          &name, "<br/>",
+          HtmlLikeStringSanitize(
+              instruction->ConvolutionDimensionNumbersToString()),
+          "<br/>",
+          HtmlLikeStringSanitize(window_util::ToString(instruction->window())));
     }
 
-    name += "\\n" + instruction->name();
-    if (!instruction->metadata().op_type().empty()) {
-      StrAppend(&name, "\\n", instruction->metadata().op_type());
-    }
     if (!instruction->metadata().op_name().empty()) {
-      StrAppend(&name, "\\n", instruction->metadata().op_name());
+      StrAppend(&name, "<br/>",
+                HtmlLikeStringSanitize(instruction->metadata().op_name()));
     }
     if (!instruction->metadata().source_file().empty() &&
         instruction->metadata().source_line() != 0) {
-      StrAppend(&name, "\\n", instruction->metadata().source_file(), ":",
+      StrAppend(&name, "<br/>", instruction->metadata().source_file(), ":",
                 instruction->metadata().source_line());
     }
 
@@ -139,11 +213,8 @@ string InstructionSequenceGraph(
       case HloOpcode::kAdd:
       case HloOpcode::kCeil:
       case HloOpcode::kClamp:
-      case HloOpcode::kConcatenate:
       case HloOpcode::kConvert:
       case HloOpcode::kDivide:
-      case HloOpcode::kDynamicSlice:
-      case HloOpcode::kDynamicUpdateSlice:
       case HloOpcode::kEq:
       case HloOpcode::kExp:
       case HloOpcode::kFloor:
@@ -162,64 +233,49 @@ string InstructionSequenceGraph(
       case HloOpcode::kMultiply:
       case HloOpcode::kNe:
       case HloOpcode::kNegate:
-      case HloOpcode::kPad:
       case HloOpcode::kPower:
       case HloOpcode::kRemainder:
-      case HloOpcode::kReshape:
-      case HloOpcode::kReverse:
       case HloOpcode::kSelect:
       case HloOpcode::kSign:
       case HloOpcode::kSlice:
       case HloOpcode::kSort:
       case HloOpcode::kSubtract:
       case HloOpcode::kTanh:
-      case HloOpcode::kTuple:
-      case HloOpcode::kUpdate:
-        break;
-
-      case HloOpcode::kBroadcast:
-      case HloOpcode::kTranspose:
-        StrAppend(&name, "\\n", "dims={", Join(instruction->dimensions(), ","),
-                  "}");
-        break;
-      case HloOpcode::kGetTupleElement:
-        StrAppend(&name, "\\nindex=", instruction->tuple_index());
         break;
       case HloOpcode::kRng:
-        StrAppend(&name, "\\n",
+        StrAppend(&name, "<br/>",
                   RandomDistribution_Name(instruction->random_distribution()));
         break;
-      case HloOpcode::kConstant:
-        shape = "boxed";
-        color = "palegreen";
-        if (ShapeUtil::IsScalar(instruction->shape())) {
-          StrAppend(&name, "\\n", "value=", LiteralUtil::GetAsString(
-                                                instruction->literal(), {}));
-        }
+      case HloOpcode::kBroadcast:
+      case HloOpcode::kTranspose:
+        StrAppend(&name, "<br/>", "dims={",
+                  Join(instruction->dimensions(), ","), "}");
         break;
       case HloOpcode::kBitcast:
-      case HloOpcode::kCopy:
-        color = "white";
-        break;
-      case HloOpcode::kCall:
-        color = "tomato";
-        break;
-      case HloOpcode::kCustomCall:
-        color = "tomato4";
-        StrAppend(&name, "\\n",
-                  "custom_call_target=", instruction->custom_call_target());
+      case HloOpcode::kTuple:
+      case HloOpcode::kTrace:
+        color = kWhite;
         break;
-      case HloOpcode::kDot:
-        color = "slateblue";
+      case HloOpcode::kGetTupleElement:
+        color = kWhite;
+        StrAppend(&name, "<br/>index=", instruction->tuple_index());
         break;
-      case HloOpcode::kSend:
-        color = "purple";
+      case HloOpcode::kConcatenate:
+      case HloOpcode::kCopy:
+      case HloOpcode::kDynamicSlice:
+      case HloOpcode::kDynamicUpdateSlice:
+      case HloOpcode::kPad:
+      case HloOpcode::kReshape:
+      case HloOpcode::kReverse:
+      case HloOpcode::kUpdate:
+        color = kGreen;
         break;
-      case HloOpcode::kRecv:
-        color = "orange";
+      case HloOpcode::kConstant:
+        color = kBlue;
         break;
-      case HloOpcode::kMap:
-        color = "palevioletred";
+      case HloOpcode::kConvolution:
+      case HloOpcode::kDot:
+        color = kDarkBlue;
         break;
       case HloOpcode::kParameter:
         // A single record node is created for all the parameter nodes with a
@@ -228,38 +284,54 @@ string InstructionSequenceGraph(
         continue;
       case HloOpcode::kReduce:
         StrAppend(&name, " dims=", Join(instruction->dimensions(), ","));
-        color = "lightsalmon";
+        color = kPurple;
         break;
       case HloOpcode::kSelectAndScatter:
       case HloOpcode::kReduceWindow:
-        color = "lightsalmon";
-        break;
-      case HloOpcode::kTrace:
-        color = "white";
+        color = kPurple;
         break;
       case HloOpcode::kWhile:
-        color = "forestgreen";
+        shape = "ellipse";
+        color = kDarkGreen;
         break;
+      case HloOpcode::kMap:
       case HloOpcode::kFusion:
-        color = "gray";
-        break;
-      case HloOpcode::kConvolution:
-        color = "red";
-        break;
-      case HloOpcode::kCrossReplicaSum:
-        color = "turquoise";
+        color = kGray;
         break;
+      case HloOpcode::kSend:
+      case HloOpcode::kRecv:
       case HloOpcode::kInfeed:
       case HloOpcode::kOutfeed:
-        color = "blue";
+      case HloOpcode::kCrossReplicaSum:
+        color = kBrown;
+        break;
+      case HloOpcode::kCall:
+        color = kDarkGreen;
+        break;
+      case HloOpcode::kCustomCall:
+        color = kDarkGreen;
+        StrAppend(&name, "<br/>",
+                  "custom_call_target=", instruction->custom_call_target());
         break;
     }
 
     // Create instruction node with appropriate label, shape, and color.
+    // label is interpreted as an HTML-like string, so newlines must be
+    // delimited with <br/>, rather than \n.
     string label =
-        StrCat(name, "\\n", ShapeUtil::HumanString(instruction->shape()));
+        StrCat(name, "<br/>", ShapeUtil::HumanString(instruction->shape()));
+
+    if (instruction->opcode() == HloOpcode::kConstant &&
+        ShapeUtil::IsEffectiveScalar(instruction->shape())) {
+      auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex(
+          instruction->shape(), /*linear_index=*/0);
+      StrAppend(&label, " = {",
+                LiteralUtil::GetAsString(instruction->literal(), elem_idx),
+                "}");
+    }
+
     if (show_addresses) {
-      Appendf(&label, "\\n[%p]", instruction.get());
+      Appendf(&label, "<br/>[%p]", instruction.get());
     }
     if (show_layouts && LayoutUtil::HasLayout(instruction->shape())) {
       string layout_string;
@@ -271,7 +343,7 @@ string InstructionSequenceGraph(
         layout_string =
             Join(instruction->shape().layout().minor_to_major(), ",");
       }
-      StrAppend(&label, "\\nlayout={", layout_string, "}");
+      StrAppend(&label, "<br/>layout={", layout_string, "}");
     }
     if (hlo_execution_profile != nullptr) {
       auto hlo_cycles_executed =
@@ -279,16 +351,16 @@ string InstructionSequenceGraph(
       auto total_cycles_executed =
           hlo_execution_profile->total_cycles_executed(*instruction->parent());
       if (hlo_cycles_executed > 0 && total_cycles_executed > 0) {
-        Appendf(&label, "\\n%% of cycles executed=%.2f",
+        Appendf(&label, "<br/>%% of cycles executed=%.2f",
                 (static_cast<double>(hlo_cycles_executed) /
                  static_cast<double>(total_cycles_executed)) *
                     100);
       }
     }
-    Appendf(&graph_body,
-            "%s [label=\"%s\", shape=%s, style=filled, fillcolor=%s];\n",
+
+    Appendf(&graph_body, "%s [label=<%s>, shape=%s, %s];\n",
             InstructionId(instruction.get()).c_str(), label.c_str(),
-            shape.c_str(), color.c_str());
+            shape.c_str(), NodeColorAttributes(color).c_str());
 
     // Create edges from the instruction's operands to the instruction.
     int64 operand_number = 0;
@@ -318,7 +390,7 @@ string InstructionSequenceGraph(
           StrCat("cluster_", InstructionId(instruction.get()));
       StrAppend(&graph_body, "subgraph ", cluster_name, " {\n");
       StrAppend(&graph_body,
-                "label=\"fused expression\";\nstyle=filled;\n"
+                "label=<<b>fused expression</b>>;\nstyle=\"rounded,filled\";\n"
                 "color=lightgrey;\n");
       StrAppend(&graph_body, InstructionSequenceGraph(
                                  instruction->fused_instructions(),
@@ -348,19 +420,39 @@ string InstructionSequenceGraph(
   return graph_body;
 }
 
+// DOT graphs accept a stylesheet as a URL.  So naturally, an inline stylesheet
+// is a data URI!
+//
+// We don't perform any escaping on this string, so be careful not to use double
+// quotes inside.
+static const char* dot_stylesheet = R"(
+data:text/css,
+@import url(https://fonts.googleapis.com/css?family=Roboto:400,700);
+svg text {
+  font-family: 'Roboto';
+  font-size: 12px;
+}
+)";
+
 string ComputationToDotGraph(const HloComputation& computation,
                              const string& label, bool show_addresses,
                              bool show_layouts,
                              const HloExecutionProfile* hlo_execution_profile) {
-  string graph_label = StrCat(label, "\\n", computation.name());
+  string graph_label = StrCat(label, "<br/>", computation.name());
   if (hlo_execution_profile != nullptr) {
     auto cycles = hlo_execution_profile->total_cycles_executed(computation);
-    Appendf(&graph_label, "\\ntotal cycles = %lld (%s)", cycles,
+    Appendf(&graph_label, "<br/>total cycles = %lld (%s)", cycles,
             tensorflow::strings::HumanReadableNum(cycles).c_str());
   }
-  string graph =
-      Printf("digraph G {\nrankdir=TB;\ncompound=true;\nlabel=\"%s\"\n",
-             graph_label.c_str());
+  string graph = Printf(
+      R"(digraph G {
+rankdir=TB;
+compound=true;
+label=<<b>%s</b>>;
+labelloc=t;
+stylesheet="%s"
+)",
+      graph_label.c_str(), dot_stylesheet);
 
   // Emit embedded computations as subgraph clusters.
   std::vector<string> intercomputation_edges;
@@ -368,7 +460,9 @@ string ComputationToDotGraph(const HloComputation& computation,
     string graph_body = InstructionSequenceGraph(
         embedded->instructions(), show_addresses, show_layouts,
         &intercomputation_edges, hlo_execution_profile);
-    Appendf(&graph, "subgraph cluster_%s {\nlabel=\"%s\";\n%s}\n",
+    Appendf(&graph,
+            "subgraph cluster_%s "
+            "{\nstyle=rounded;label=<<b>%s</b>>;labelloc=t;\n%s}\n",
             ComputationId(embedded).c_str(), embedded->name().c_str(),
             graph_body.c_str());
   }
@@ -414,14 +508,24 @@ namespace {
 
 class FileGraphRenderer : public GraphRendererInterface {
  public:
-  string RenderGraph(const string& graph) override {
+  string RenderGraph(const string& graph, GraphKind graph_kind) override {
     static std::atomic<int> output_num(0);
     legacy_flags::HloGraphDumperFlags* flags =
         legacy_flags::GetHloGraphDumperFlags();
-    string path = StrCat(flags->xla_hlo_dump_graph_path, "hlo_graph_",
-                         output_num++, ".XXXXXX.dot");
+    string file_extension;
+    switch (graph_kind) {
+      case DOT_GRAPH:
+        file_extension = ".dot";
+        break;
+      case TF_GRAPHDEF:
+        file_extension = ".pbtxt";
+        break;
+    }
+    string path =
+        JoinPath(flags->xla_hlo_dump_graph_path,
+                 StrCat("hlo_graph_", output_num++, ".XXXXXX", file_extension));
     auto status = Status::OK();
-    int fd = mkstemps(&path[0], 4);
+    int fd = mkstemps(&path[0], file_extension.length());
     if (fd < 0) {
       status =
           Status(tensorflow::error::Code::UNKNOWN,
@@ -446,10 +550,26 @@ XLA_REGISTER_GRAPH_RENDERER(FileGraphRenderer, 0);
 string DumpGraph(const HloComputation& computation, const string& label,
                  bool show_addresses, bool show_layouts,
                  const HloExecutionProfile* hlo_execution_profile) {
-  string graph = ComputationToDotGraph(computation, label, show_addresses,
-                                       show_layouts, hlo_execution_profile);
-
-  string graph_url = GetGraphRenderer()->RenderGraph(graph);
+  string graph;
+  string graph_url;
+  legacy_flags::HloGraphDumperFlags* flags =
+      legacy_flags::GetHloGraphDumperFlags();
+  if (flags->xla_hlo_dump_as_graphdef) {
+    HloTfGraphBuilder builder;
+    TF_CHECK_OK(builder.AddComputation(computation));
+    CHECK(tensorflow::protobuf::TextFormat::PrintToString(builder.GetGraphDef(),
+                                                          &graph));
+    // TODO(b/37198616): Use the default registered renderers when all
+    // renderers support rendering GraphDefs. Always dump GraphDefs to files
+    // for now.
+    graph_url = FileGraphRenderer().RenderGraph(
+        graph, GraphRendererInterface::TF_GRAPHDEF);
+  } else {
+    graph = ComputationToDotGraph(computation, label, show_addresses,
+                                  show_layouts, hlo_execution_profile);
+    graph_url = GetGraphRenderer()->RenderGraph(
+        graph, GraphRendererInterface::DOT_GRAPH);
+  }
   LOG(INFO) << "computation " << computation.name() << " [" << label
             << "]: " << graph_url;
   return graph_url;
@@ -467,5 +587,4 @@ void DumpText(const HloModule& module, const string& label,
 }
 
 }  // namespace hlo_graph_dumper
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 5f841da1f35c40042fde54dbc03eb7682a8d31cb..8ed50c38473a6f6dd36603e155285e855ff0c5be 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -25,8 +25,25 @@ limitations under the License.
 namespace xla {
 namespace hlo_graph_dumper {
 
-// Dumps a graph of the computation to the GraphViz server and returns
-// a description of the rendered graph (e.g., a URL).
+// Abstract interface for classes that render HLO graphs (e.g. DOT graph,
+// tensorflow GraphDef).
+class GraphRendererInterface {
+ public:
+  enum GraphKind {
+    DOT_GRAPH,
+    TF_GRAPHDEF,
+  };
+
+  virtual ~GraphRendererInterface() = default;
+
+  // Renders a DOT graph, returning a description of the rendered output
+  // (e.g., a URL)
+  virtual string RenderGraph(const string& graph, GraphKind graph_kind) = 0;
+};
+
+// Dumps a graph of the computation and returns a description of the rendered
+// graph (e.g., a URL) based on the renderer. The "best" renderer in the
+// registry is used.
 string DumpGraph(const HloComputation& computation, const string& label,
                  bool show_addresses, bool show_layouts,
                  const HloExecutionProfile* hlo_execution_profile = nullptr);
@@ -40,16 +57,6 @@ string DumpGraph(const HloComputation& computation, const string& label,
 void DumpText(const HloModule& module, const string& label,
               const string& directory_path, bool do_prefix = true);
 
-// Abstract interface for classes that render DOT graphs.
-class GraphRendererInterface {
- public:
-  virtual ~GraphRendererInterface() = default;
-
-  // Renders a DOT graph, returning a description of the rendered output
-  // (e.g., a URL)
-  virtual string RenderGraph(const string& graph) = 0;
-};
-
 // Graph renderers may be added using a registration mechanism, e.g.:
 // XLA_REGISTER_GRAPH_RENDERER(AGraphRendererClass, 100)
 // The renderer with the highest numeric priority value is used.
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 905647c2ed9f30dca31dccd07b6bcf99479ae2aa..bfb2129e13cd22cabb466ca383ce7b6ead90e96f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <deque>
+#include <ostream>
 #include <set>
 #include <unordered_set>
 #include <utility>
@@ -27,6 +28,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -41,9 +44,10 @@ limitations under the License.
 
 namespace xla {
 
-using ::tensorflow::strings::StrAppend;
 using ::tensorflow::str_util::Join;
 using ::tensorflow::strings::Printf;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateParameter(
     int64 parameter_number, const Shape& shape, const string& name) {
@@ -209,10 +213,10 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kConvolution, shape));
   if (window_util::HasBaseDilation(window)) {
-    instruction->set_name(instruction->name() + "-base-dilated");
+    instruction->name_ = instruction->name() + "-base-dilated";
   }
   if (window_util::HasWindowDilation(window)) {
-    instruction->set_name(instruction->name() + "-window-dilated");
+    instruction->name_ = instruction->name() + "-window-dilated";
   }
   instruction->AppendOperand(lhs);
   instruction->AppendOperand(rhs);
@@ -406,7 +410,9 @@ HloInstruction::CreateSelectAndScatter(
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReshape(
     const Shape& shape, HloInstruction* operand) {
   CHECK_EQ(ShapeUtil::ElementsIn(shape),
-           ShapeUtil::ElementsIn(operand->shape()));
+           ShapeUtil::ElementsIn(operand->shape()))
+      << "shape: " << ShapeUtil::HumanString(shape)
+      << " operand: " << ShapeUtil::HumanString(operand->shape());
   auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReshape, shape));
   instruction->AppendOperand(operand);
   return instruction;
@@ -432,6 +438,7 @@ HloInstruction::CreateSelectAndScatter(
   auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape));
   instruction->fusion_kind_ = fusion_kind;
   instruction->set_parent(fused_root->parent());
+  instruction->set_metadata(fused_root->metadata());
   instruction->CloneAndFuseInternal(fused_root);
   instruction->CheckFusionInstruction();
   return instruction;
@@ -497,14 +504,20 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
   CHECK_EQ(opcode_, HloOpcode::kFusion);
   CHECK(instruction_to_fuse->IsFusable());
 
-  bool new_fusion_instruction = fused_instructions_.empty();
-  fused_instructions_.emplace_back(instruction_to_fuse->Clone());
-  HloInstruction* clone = fused_instructions_.back().get();
-  clone->parent_fusion_instruction_ = this;
-
-  if (new_fusion_instruction) {
-    fused_root_ = clone;
+  HloInstruction* clone = nullptr;
+  if (fused_instructions_computation_ == nullptr) {
+    // New fusion instruction.
+    auto builder = HloComputation::Builder("fused_computation", true);
+    builder.AddInstruction(instruction_to_fuse->Clone(/*suffix=*/""));
+    fused_instructions_computation_ = builder.Build();
+    clone = fused_expression_root();
+    clone->parent_fusion_instruction_ = this;
   } else {
+    CHECK(fused_instructions_computation_ != nullptr &&
+          fused_instructions_computation_->IsFusionComputation());
+    clone = fused_instructions_computation_->AddInstruction(
+        instruction_to_fuse->Clone(/*suffix=*/""));
+    clone->parent_fusion_instruction_ = this;
     // instruction_to_fuse is necessarily an operand of the fusion instruction.
     // After fusion this will no longer be the case. Remove the operand from the
     // operand list and remove its corresponding fused parameter
@@ -512,6 +525,8 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
     // consistent with their index in the fused_parameter_ vector.
     CHECK(std::find(operands_.begin(), operands_.end(), instruction_to_fuse) !=
           operands_.end());
+    const std::vector<HloInstruction*>& fused_parameters_ =
+        fused_instructions_computation_->parameter_instructions();
     for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) {
       if (instruction_to_fuse == operands_[operand_num]) {
         // replace the fused parameter instruction's uses with the clone.
@@ -520,22 +535,9 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
 
         // Remove the corresponding fused parameter and operand from their
         // respective vectors.
-        fused_parameters_.erase(fused_parameters_.begin() + operand_num);
+        TF_CHECK_OK(
+            fused_instructions_computation_->RemoveParameter(operand_num));
         operands_.erase(operands_.begin() + operand_num);
-
-        // Renumber fused parameter numbers to match the vector index.
-        while (operand_num < fused_parameters_.size()) {
-          fused_parameters_[operand_num]->parameter_number_ = operand_num;
-          operand_num++;
-        }
-        // Throw removed fused parameter instruction away.
-        auto inst_it =
-            std::find_if(fused_instructions_.begin(), fused_instructions_.end(),
-                         [=](const std::unique_ptr<HloInstruction>& inst) {
-                           return inst.get() == fused_parameter;
-                         });
-        CHECK(inst_it != fused_instructions_.end());
-        fused_instructions_.erase(inst_it);
         break;
       }
     }
@@ -544,6 +546,10 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
     instruction_to_fuse->RemoveUser(this);
   }
 
+  // Reread the parameters in the computation.
+  const std::vector<HloInstruction*>& fused_parameters_ =
+      fused_instructions_computation_->parameter_instructions();
+
   // Add each operand of the clone as an operand of the fusion instruction. A
   // complication is that some clone operands may already be operands of the
   // fusion instruction.
@@ -566,16 +572,18 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
       // instruction. Add it as an operand and add a corresponding fused
       // parameter instruction.
       int64 param_no = fused_parameters_.size();
+      // Name the parameter after the instruction it represents in the outer
+      // (non-fusion) computation. Strip the leading "%" from the operand name
+      // to avoid a double %%.
+      string param_name =
+          StrCat(operand->name().substr(1), ".param_", param_no);
       std::unique_ptr<HloInstruction> param_instruction =
-          CreateParameter(param_no, operand->shape(), "fusion_param");
+          CreateParameter(param_no, operand->shape(), param_name);
 
-      param_instruction->set_parent(parent());
       param_instruction->parent_fusion_instruction_ = this;
-      fused_parameters_.push_back(param_instruction.get());
-      fused_instructions_.push_back(std::move(param_instruction));
+      fused_param = fused_instructions_computation_->AddParameter(
+          std::move(param_instruction));
       AppendOperand(operand);
-
-      fused_param = fused_instructions_.back().get();
     }
     TF_CHECK_OK(clone->ReplaceOperandWith(operand_num, fused_param));
   }
@@ -598,18 +606,25 @@ RandomDistribution HloInstruction::random_distribution() const {
 
 void HloInstruction::CheckFusionInstruction() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
+  CHECK(fused_instructions_computation_ != nullptr &&
+        fused_instructions_computation_->IsFusionComputation());
 
+  const std::list<std::unique_ptr<HloInstruction>>& fused_instructions_ =
+      fused_instructions_computation_->instructions();
   // All instructions owned by this fusion instruction must be fused, and the
   // parent fusion instruction of the fused instructions must be 'this'.
   for (auto& instruction : fused_instructions_) {
     CHECK(instruction->IsFused());
     CHECK_EQ(this, instruction->fusion_instruction());
-    CHECK_EQ(parent(), instruction->parent()) << instruction->ToString();
+    CHECK_EQ(fused_instructions_computation_.get(), instruction->parent())
+        << instruction->ToString();
   }
 
   // Fused root instruction and fused parameters must all be owned by the fusion
   // instruction.
   bool root_owned = false;
+  const std::vector<HloInstruction*>& fused_parameters_ = fused_parameters();
+  const HloInstruction* fused_root_ = fused_expression_root();
   std::vector<bool> parameter_owned(fused_parameters_.size(), false);
   for (auto& instruction : fused_instructions_) {
     if (fused_root_ == instruction.get()) {
@@ -702,7 +717,8 @@ void HloInstruction::CheckFusionInstruction() const {
 }
 
 std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
-    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands) {
   // Explicitly call the factory for the instruction type. This is more robust
   // in the face of code changes than copying fields explicitly. This also
   // properly sets the user fields of the operands.
@@ -721,8 +737,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kSign:
     case HloOpcode::kSort:
     case HloOpcode::kTanh:
-      CHECK_EQ(operands.size(), 1);
-      return CreateUnary(shape, opcode_, operands[0]);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateUnary(shape, opcode_, new_operands[0]);
     // Binary ops.
     case HloOpcode::kAdd:
     case HloOpcode::kDivide:
@@ -741,93 +757,92 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kRemainder:
     case HloOpcode::kLogicalAnd:
     case HloOpcode::kLogicalOr:
-      CHECK_EQ(operands.size(), 2);
-      return CreateBinary(shape, opcode_, operands[0], operands[1]);
+      CHECK_EQ(new_operands.size(), 2);
+      return CreateBinary(shape, opcode_, new_operands[0], new_operands[1]);
     // Ternary ops.
     case HloOpcode::kClamp:
     case HloOpcode::kSelect:
-      CHECK_EQ(operands.size(), 3);
-      return CreateTernary(shape, opcode_, operands[0], operands[1],
-                           operands[2]);
+      CHECK_EQ(new_operands.size(), 3);
+      return CreateTernary(shape, opcode_, new_operands[0], new_operands[1],
+                           new_operands[2]);
     // Other supported ops.
     case HloOpcode::kBroadcast:
-      CHECK_EQ(operands.size(), 1);
-      return CreateBroadcast(shape, operands[0], dimensions_);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateBroadcast(shape, new_operands[0], dimensions_);
     case HloOpcode::kCall:
-      return CreateCall(shape, operands, to_apply());
+      return CreateCall(shape, new_operands, to_apply());
     case HloOpcode::kCustomCall:
-      return CreateCustomCall(shape, operands, custom_call_target_);
+      return CreateCustomCall(shape, new_operands, custom_call_target_);
     case HloOpcode::kConcatenate:
-      return CreateConcatenate(shape, operands, dimensions(0));
+      return CreateConcatenate(shape, new_operands, dimensions(0));
     case HloOpcode::kConvert:
-      CHECK_EQ(operands.size(), 1);
-      return CreateConvert(shape, operands[0]);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateConvert(shape, new_operands[0]);
     case HloOpcode::kConvolution:
-      CHECK_EQ(operands.size(), 2);
-      return CreateConvolve(shape, operands[0], operands[1], *window_,
+      CHECK_EQ(new_operands.size(), 2);
+      return CreateConvolve(shape, new_operands[0], new_operands[1], *window_,
                             *convolution_dimension_numbers_);
     case HloOpcode::kCrossReplicaSum:
-      CHECK_EQ(operands.size(), 1);
-      return CreateCrossReplicaSum(shape, operands[0]);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateCrossReplicaSum(shape, new_operands[0]);
     case HloOpcode::kGetTupleElement:
-      CHECK_EQ(operands.size(), 1);
-      return CreateGetTupleElement(shape, operands[0], tuple_index());
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateGetTupleElement(shape, new_operands[0], tuple_index());
     case HloOpcode::kMap:
-      return CreateMap(shape, operands, to_apply());
+      return CreateMap(shape, new_operands, to_apply());
     case HloOpcode::kPad:
-      CHECK_EQ(operands.size(), 2);
-      return CreatePad(shape, operands[0], operands[1], *padding_config_);
+      CHECK_EQ(new_operands.size(), 2);
+      return CreatePad(shape, new_operands[0], new_operands[1],
+                       *padding_config_);
     case HloOpcode::kReduce:
-      CHECK_EQ(operands.size(), 2);
-      return CreateReduce(shape, operands[0], operands[1], dimensions_,
+      CHECK_EQ(new_operands.size(), 2);
+      return CreateReduce(shape, new_operands[0], new_operands[1], dimensions_,
                           to_apply());
     case HloOpcode::kReduceWindow:
-      CHECK_EQ(operands.size(), 2);
-      return CreateReduceWindow(shape, operands[0], operands[1], *window_,
-                                to_apply());
+      CHECK_EQ(new_operands.size(), 2);
+      return CreateReduceWindow(shape, new_operands[0], new_operands[1],
+                                *window_, to_apply());
     case HloOpcode::kSelectAndScatter:
-      CHECK_EQ(operands.size(), 3);
-      return CreateSelectAndScatter(shape, operands[0], select(), *window_,
-                                    operands[1], operands[2], scatter());
-    case HloOpcode::kRecv:
-      CHECK_EQ(operands.size(), 0);
-      return CreateRecv(shape, channel_id_);
+      CHECK_EQ(new_operands.size(), 3);
+      return CreateSelectAndScatter(shape, new_operands[0], select(), *window_,
+                                    new_operands[1], new_operands[2],
+                                    scatter());
     case HloOpcode::kReverse:
-      CHECK_EQ(operands.size(), 1);
-      return CreateReverse(shape, operands[0], dimensions_);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateReverse(shape, new_operands[0], dimensions_);
     case HloOpcode::kRng:
-      return CreateRng(shape, distribution_, operands);
+      return CreateRng(shape, distribution_, new_operands);
     case HloOpcode::kReshape:
-      CHECK_EQ(operands.size(), 1);
-      return CreateReshape(shape, operands[0]);
-    case HloOpcode::kSend:
-      CHECK_EQ(operands.size(), 1);
-      return CreateSend(operands[0], channel_id_);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateReshape(shape, new_operands[0]);
     case HloOpcode::kSlice:
-      CHECK_EQ(operands.size(), 1);
-      return CreateSlice(shape, operands[0], slice_starts_, slice_limits_);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_);
     case HloOpcode::kDynamicSlice:
-      return CreateDynamicSlice(shape, operands[0], operands[1],
+      return CreateDynamicSlice(shape, new_operands[0], new_operands[1],
                                 dynamic_slice_sizes_);
     case HloOpcode::kDynamicUpdateSlice:
-      CHECK_EQ(operands.size(), 3);
-      return CreateDynamicUpdateSlice(shape, operands[0], operands[1],
-                                      operands[2]);
+      CHECK_EQ(new_operands.size(), 3);
+      return CreateDynamicUpdateSlice(shape, new_operands[0], new_operands[1],
+                                      new_operands[2]);
     case HloOpcode::kTranspose:
-      CHECK_EQ(operands.size(), 1);
-      return CreateTranspose(shape, operands[0], dimensions_);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateTranspose(shape, new_operands[0], dimensions_);
     case HloOpcode::kTuple:
-      return CreateTuple(operands_);
+      return CreateTuple(new_operands);
     case HloOpcode::kWhile:
-      CHECK_EQ(operands.size(), 1);
-      return CreateWhile(shape, while_condition(), while_body(), operands[0]);
+      CHECK_EQ(new_operands.size(), 1);
+      return CreateWhile(shape, while_condition(), while_body(),
+                         new_operands[0]);
     case HloOpcode::kConstant:
       return CreateConstant(LiteralUtil::CloneToUnique(*literal_));
     case HloOpcode::kFusion:
-      return CloneFusionWithNewOperands(shape, operands);
+      return CloneFusionWithNewOperands(shape, new_operands);
     case HloOpcode::kParameter:
       return CreateParameter(parameter_number_, shape, parameter_name_);
     // Unsupported ops for cloning.
+    case HloOpcode::kRecv:
+    case HloOpcode::kSend:
     case HloOpcode::kUpdate:
     case HloOpcode::kIndex:
     case HloOpcode::kInfeed:
@@ -837,11 +852,46 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
   }
 }
 
+HloInstruction::~HloInstruction() {}
+
 std::unique_ptr<HloInstruction> HloInstruction::Clone(const string& suffix) {
   std::unique_ptr<HloInstruction> clone =
       CloneWithNewOperands(shape_, operands_);
-  clone->name_ = name() + "." + suffix;
+  if (suffix.empty()) {
+    clone->name_ = name();
+  } else {
+    // If an instruction is cloned multiple times avoid names like
+    // foo.suffix.suffix.suffix. Instead of repeating the suffix add a numeric
+    // suffix. Specifically, the clone of foo.suffix is named foo.suffix2, the
+    // clone of foo.suffix2 is named foo.suffix3 and so on.
+    const string dot_suffix = "." + suffix;
+    size_t index = name().rfind(dot_suffix);
+    if (index == string::npos) {
+      // Existing name does not include ".suffix".
+      clone->name_ = name() + dot_suffix;
+    } else {
+      // Existing name includes ".suffix". Determine if substring after
+      // ".suffix" is numeric and should be replaced with an incremented number.
+      string after_suffix = name().substr(index + dot_suffix.size());
+      if (after_suffix.empty()) {
+        // Existing name ends in ".suffix". New name should end in ".suffix2".
+        clone->name_ = name() + "2";
+      } else {
+        // If names ends with .suffix[0-9]+ then replace with a suffix with the
+        // numeric value incremented.
+        int64 numeric_suffix;
+        if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) {
+          clone->name_ =
+              StrCat(name().substr(0, index), dot_suffix, numeric_suffix + 1);
+        } else {
+          // Substring after ".suffix" is non-numeric.
+          clone->name_ = name() + dot_suffix;
+        }
+      }
+    }
+  }
   clone->set_parent(parent());
+  clone->set_metadata(metadata_);
   return clone;
 }
 
@@ -849,6 +899,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
   CHECK(parent() != nullptr);
+  CHECK(fused_instructions_computation_ != nullptr &&
+        fused_instructions_computation_->IsFusionComputation());
 
   auto new_instruction =
       WrapUnique(new HloInstruction(HloOpcode::kFusion, shape));
@@ -862,6 +914,11 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
   // Create the list of fused parameters by mapping through the cloned,
   // fused instructions.
   std::vector<HloInstruction*> new_fused_parameters;
+  const std::vector<HloInstruction*>& fused_parameters_ =
+      fused_instructions_computation_->parameter_instructions();
+  const std::list<std::unique_ptr<HloInstruction>>& fused_instructions_ =
+      fused_instructions_computation_->instructions();
+
   for (HloInstruction* old_fused_parameter : fused_parameters_) {
     new_fused_instructions.push_back(old_fused_parameter->Clone());
     HloInstruction* new_fusion_parameter = new_fused_instructions.back().get();
@@ -892,13 +949,19 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
     new_fused_instruction->parent_fusion_instruction_ = new_instruction.get();
     InsertOrDie(&old_to_new, old_fused_instruction, new_fused_instruction);
   }
+  new_instruction->fusion_kind_ = fusion_kind_;
+  auto computation_builder = HloComputation::Builder(
+      fused_instructions_computation_->name() + ".clone", true);
   // We iterated the fusion instructions in reverse post order which means
   // that we must reverse our new list of fusion instructions.
-  std::reverse(new_fused_instructions.begin(), new_fused_instructions.end());
-  new_instruction->fusion_kind_ = fusion_kind_;
-  new_instruction->fused_instructions_ = std::move(new_fused_instructions);
-  new_instruction->fused_parameters_ = std::move(new_fused_parameters);
-  new_instruction->fused_root_ = FindOrDie(old_to_new, fused_root_);
+  for (auto new_fused_instruction_iter = new_fused_instructions.rbegin();
+       new_fused_instruction_iter != new_fused_instructions.rend();
+       ++new_fused_instruction_iter) {
+    computation_builder.AddInstruction(std::move(*new_fused_instruction_iter));
+  }
+  auto fused_root_ = fused_expression_root();
+  new_instruction->fused_instructions_computation_ =
+      computation_builder.Build(FindOrDie(old_to_new, fused_root_));
   new_instruction->set_parent(parent());
   new_instruction->CheckFusionInstruction();
   return new_instruction;
@@ -1020,7 +1083,7 @@ bool HloInstruction::Identical(
   // general, there is no need to check shape because shape is inferred from the
   // shape of the operands.
   if (opcode() != other.opcode() ||
-      !ContainersEqual(operands(), other.operands(), eq_operands)) {
+      !ContainersEqual(operands(), other.operands(), std::move(eq_operands))) {
     return false;
   }
 
@@ -1355,8 +1418,7 @@ string HloInstruction::SignatureString() const {
       Join(operands_, ", ", [](string* out, HloInstruction* operand) {
         StrAppend(out, ShapeUtil::HumanString(operand->shape()));
       });
-  return tensorflow::strings::StrCat("(", operands, ") -> ",
-                                     ShapeUtil::HumanString(shape()));
+  return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape()));
 }
 
 string HloInstruction::ExtendedOpcodeStr() const {
@@ -1368,7 +1430,8 @@ string HloInstruction::ExtendedOpcodeStr() const {
   return opc_name;
 }
 
-string HloInstruction::ToString(bool compact_operands) const {
+string HloInstruction::ToString(bool compact_operands,
+                                bool include_metadata) const {
   string operands;
   if (opcode() == HloOpcode::kConstant) {
     // For constants, show the actual value in place of an empty operand list.
@@ -1390,6 +1453,8 @@ string HloInstruction::ToString(bool compact_operands) const {
       // Do not show large constants.
       operands = "{...}";
     }
+  } else if (opcode() == HloOpcode::kParameter) {
+    operands = Printf("%lld", parameter_number_);
   } else {
     tensorflow::gtl::ArraySlice<HloInstruction*> slice(operands_);
     const int64 kMaxOperandsToShowIfCompact = 4;
@@ -1420,8 +1485,8 @@ string HloInstruction::ToString(bool compact_operands) const {
   if (!slice_starts_.empty() && !slice_limits_.empty()) {
     std::vector<string> bounds;
     for (int i = 0; i < slice_starts_.size(); ++i) {
-      bounds.push_back(tensorflow::strings::StrCat("[", slice_starts_[i], ":",
-                                                   slice_limits_[i], "]"));
+      bounds.push_back(
+          StrCat("[", slice_starts_[i], ":", slice_limits_[i], "]"));
     }
     StrAppend(&extra, ", slice={", Join(bounds, ", "), "}");
   }
@@ -1447,10 +1512,12 @@ string HloInstruction::ToString(bool compact_operands) const {
   if (opcode() == HloOpcode::kGetTupleElement) {
     StrAppend(&extra, ", index=", tuple_index());
   }
-  if (!metadata_.op_type().empty() || !metadata_.op_name().empty() ||
-      !metadata_.source_file().empty()) {
+  if (include_metadata &&
+      (!metadata_.op_type().empty() || !metadata_.op_name().empty() ||
+       !metadata_.source_file().empty())) {
     StrAppend(&extra, " # metadata=", metadata_.ShortDebugString());
   }
+
   return Printf("%s = %s %s(%s)%s", name().c_str(),
                 ShapeUtil::HumanStringWithLayout(shape()).c_str(),
                 ExtendedOpcodeStr().c_str(), operands.c_str(), extra.c_str());
@@ -1503,7 +1570,9 @@ string HloInstruction::ToCategory() const {
           return "non-elementwise fusion";
         }
       case FusionKind::kInput:
-        return "reduce fusion";
+        return "input fusion";
+      case FusionKind::kOutput:
+        return "output fusion";
       case FusionKind::kTransposeDot:
         return "dot fusion";
       case FusionKind::kConvBackwardFilter:
@@ -1521,11 +1590,10 @@ string HloInstruction::ToCategory() const {
 
 string HloInstruction::FullyQualifiedName() const {
   if (IsFused()) {
-    return tensorflow::strings::StrCat(fusion_instruction()->parent()->name(),
-                                       "::", fusion_instruction()->name(),
-                                       "::", name_);
+    return StrCat(fusion_instruction()->parent()->name(),
+                  "::", fusion_instruction()->name(), "::", name_);
   }
-  return tensorflow::strings::StrCat(parent_->name(), "::", name_);
+  return StrCat(parent_->name(), "::", name_);
 }
 
 HloInstruction* HloInstruction::tracing() const { return trace_instruction_; }
@@ -1552,7 +1620,6 @@ bool HloInstruction::IsFusable() const {
 
   // Some kinds of instructions don't make sense to fuse.
   switch (opcode_) {
-    case HloOpcode::kFusion:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kParameter:
@@ -1569,6 +1636,11 @@ bool HloInstruction::IsFusable() const {
   }
 }
 
+HloComputation* HloInstruction::fused_instructions_computation() const {
+  CHECK_EQ(opcode_, HloOpcode::kFusion);
+  return fused_instructions_computation_.get();
+}
+
 HloInstruction* HloInstruction::fusion_instruction() const {
   CHECK(IsFused());
   return parent_fusion_instruction_;
@@ -1576,25 +1648,32 @@ HloInstruction* HloInstruction::fusion_instruction() const {
 
 HloInstruction* HloInstruction::fused_expression_root() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  return fused_root_;
+  CHECK(fused_instructions_computation_ != nullptr &&
+        fused_instructions_computation_->IsFusionComputation());
+  return fused_instructions_computation_->root_instruction();
 }
 
 HloInstruction* HloInstruction::fused_parameter(int64 parameter_number) const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK_GE(parameter_number, 0);
-  CHECK_LT(parameter_number, fused_parameters_.size());
-  return fused_parameters_[parameter_number];
+  CHECK(fused_instructions_computation_ != nullptr &&
+        fused_instructions_computation_->IsFusionComputation());
+  return fused_instructions_computation_->parameter_instruction(
+      parameter_number);
 }
 
 const std::vector<HloInstruction*>& HloInstruction::fused_parameters() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  return fused_parameters_;
+  CHECK(fused_instructions_computation_ != nullptr &&
+        fused_instructions_computation_->IsFusionComputation());
+  return fused_instructions_computation_->parameter_instructions();
 }
 
 const std::list<std::unique_ptr<HloInstruction>>&
 HloInstruction::fused_instructions() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  return fused_instructions_;
+  CHECK(fused_instructions_computation_ != nullptr &&
+        fused_instructions_computation_->IsFusionComputation());
+  return fused_instructions_computation_->instructions();
 }
 
 HloInstruction::HloInstruction(HloOpcode opcode, const Shape& shape)
@@ -1703,7 +1782,7 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
     case HloOpcode::kSlice:
       return visitor->HandleSlice(this, operands_[0]);
     case HloOpcode::kDynamicSlice:
-      return visitor->HandleDynamicSlice(this, operands_);
+      return visitor->HandleDynamicSlice(this, operands_[0], operands_[1]);
     case HloOpcode::kDynamicUpdateSlice:
       return visitor->HandleDynamicUpdateSlice(this, operands_[0], operands_[1],
                                                operands_[2]);
@@ -1716,12 +1795,11 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
     case HloOpcode::kRng:
       return visitor->HandleRng(this, distribution_);
     case HloOpcode::kWhile:
-      return visitor->HandleWhile(this, operands_[0], while_condition(),
-                                  while_body());
+      return visitor->HandleWhile(this);
     case HloOpcode::kFusion:
       return visitor->HandleFusion(this);
     case HloOpcode::kCall:
-      return visitor->HandleCall(this, operands_, to_apply());
+      return visitor->HandleCall(this);
     case HloOpcode::kCustomCall:
       return visitor->HandleCustomCall(this, operands_, custom_call_target_);
     case HloOpcode::kSend:
@@ -1740,7 +1818,8 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
 }
 
 Status HloInstruction::AcceptInternal(DfsHloVisitor* visitor,
-                                      const CompareFunction* operand_order) {
+                                      const CompareFunction* operand_order,
+                                      bool ignore_control_predecessors) {
   // Do not visit this HLO node again if it is already visited.
   if (visitor->DidVisit(*this)) {
     VLOG(3) << "Not visiting HLO " << name() << " as it was already visited.";
@@ -1755,34 +1834,41 @@ Status HloInstruction::AcceptInternal(DfsHloVisitor* visitor,
   }
   visitor->SetVisiting(*this);
 
-  // Sort operands and control predecessors, if an ordering was provided.  Note
-  // that 'temp_sorted_operands' must live at this scope, since 'operands' will
-  // point to it if the operands are sorted.  The point of the 'operands'
-  // pointer is to avoid copying the operands in the common case where the
-  // operands are not sorted.
+  // Sort operands, if an ordering was provided. 'temp_sorted_operands' must
+  // live at this scope, since 'operands' will point to it if the operands are
+  // sorted.  The purpose of the 'operands' pointer is to avoid copying the
+  // operands in the common case where the operands are not sorted.
   std::vector<HloInstruction*>* operands = &operands_;
   std::vector<HloInstruction*> temp_sorted_operands;
-  std::vector<HloInstruction*> predecessors(control_predecessors_.begin(),
-                                            control_predecessors_.end());
   if (operand_order != nullptr) {
     temp_sorted_operands = operands_;
     std::sort(temp_sorted_operands.begin(), temp_sorted_operands.end(),
               *operand_order);
-    std::sort(predecessors.begin(), predecessors.end(), *operand_order);
     operands = &temp_sorted_operands;
   }
-
-  for (auto operand : *operands) {
+  for (HloInstruction* operand : *operands) {
     VLOG(3) << "Going to visit HLO " << operand->name() << " as operand of HLO "
             << name();
-    TF_RETURN_IF_ERROR(operand->AcceptInternal(visitor, operand_order));
-  }
-
-  for (auto control_predecessor : predecessors) {
-    VLOG(3) << "Going to visit HLO " << control_predecessor->name()
-            << " as a control predecessor of HLO " << name();
-    TF_RETURN_IF_ERROR(
-        control_predecessor->AcceptInternal(visitor, operand_order));
+    TF_RETURN_IF_ERROR(operand->AcceptInternal(visitor, operand_order,
+                                               ignore_control_predecessors));
+  }
+
+  if (!ignore_control_predecessors) {
+    // This uses the same pointer/vector sorting to avoid extra copies as above.
+    std::vector<HloInstruction*>* predecessors = &control_predecessors_;
+    std::vector<HloInstruction*> temp_sorted_predecessors;
+    if (operand_order != nullptr) {
+      temp_sorted_predecessors = control_predecessors_;
+      std::sort(temp_sorted_predecessors.begin(),
+                temp_sorted_predecessors.end(), *operand_order);
+      predecessors = &temp_sorted_predecessors;
+    }
+    for (HloInstruction* control_predecessor : *predecessors) {
+      VLOG(3) << "Going to visit HLO " << control_predecessor->name()
+              << " as a control predecessor of HLO " << name();
+      TF_RETURN_IF_ERROR(control_predecessor->AcceptInternal(
+          visitor, operand_order, ignore_control_predecessors));
+    }
   }
 
   TF_RETURN_IF_ERROR(visitor->Preprocess(this));
@@ -1792,9 +1878,11 @@ Status HloInstruction::AcceptInternal(DfsHloVisitor* visitor,
   return visitor->Postprocess(this);
 }
 
-Status HloInstruction::Accept(DfsHloVisitor* visitor, bool call_finish_visit) {
+Status HloInstruction::Accept(DfsHloVisitor* visitor, bool call_finish_visit,
+                              bool ignore_control_predecessors) {
   VLOG(2) << "HloInstruction::Accept(" << name() << ")";
-  TF_RETURN_IF_ERROR(AcceptInternal(visitor, nullptr));
+  TF_RETURN_IF_ERROR(
+      AcceptInternal(visitor, nullptr, ignore_control_predecessors));
   if (call_finish_visit) {
     TF_RETURN_IF_ERROR(visitor->FinishVisit(this));
   }
@@ -1805,7 +1893,8 @@ Status HloInstruction::AcceptWithOperandOrder(
     DfsHloVisitor* visitor, const CompareFunction& operand_order,
     bool call_finish_visit) {
   VLOG(2) << "HloInstruction::AcceptWithOperandOrder(" << name() << ")";
-  TF_RETURN_IF_ERROR(AcceptInternal(visitor, &operand_order));
+  TF_RETURN_IF_ERROR(AcceptInternal(visitor, &operand_order,
+                                    /*ignore_control_predecessors=*/false));
   if (call_finish_visit) {
     TF_RETURN_IF_ERROR(visitor->FinishVisit(this));
   }
@@ -2076,7 +2165,7 @@ HloInstruction::UseKind HloInstruction::OperandElementUse(int64 i) const {
             }
             return cache[&hlo];
           };
-      return reuses_parameter_elements(*fused_root_);
+      return reuses_parameter_elements(*fused_expression_root());
     }
     default:
       return IsElementwise() ? UseKind::kUse : UseKind::kReuse;
@@ -2098,6 +2187,8 @@ string ToString(HloInstruction::FusionKind kind) {
       return "kLoop";
     case HloInstruction::FusionKind::kInput:
       return "kInput";
+    case HloInstruction::FusionKind::kOutput:
+      return "kOutput";
     case HloInstruction::FusionKind::kTransposeDot:
       return "kTransposeDot";
     case HloInstruction::FusionKind::kConvBackwardFilter:
@@ -2107,6 +2198,10 @@ string ToString(HloInstruction::FusionKind kind) {
   }
 }
 
+std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) {
+  return os << ToString(kind);
+}
+
 string HloInstruction::ConvolutionDimensionNumbersToString() const {
   string result;
   if (convolution_dimension_numbers_ == nullptr) {
@@ -2133,15 +2228,14 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
   lhs_dims[dnums.batch_dimension()] = 'b';
   lhs_dims[dnums.feature_dimension()] = 'f';
   for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) {
-    lhs_dims[dnums.spatial_dimensions(i)] = tensorflow::strings::StrCat(i);
+    lhs_dims[dnums.spatial_dimensions(i)] = StrCat(i);
   }
 
   std::vector<string> rhs_dims(2 + dnums.kernel_spatial_dimensions().size());
   rhs_dims[dnums.kernel_input_feature_dimension()] = "i";
   rhs_dims[dnums.kernel_output_feature_dimension()] = "o";
   for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) {
-    rhs_dims[dnums.kernel_spatial_dimensions(i)] =
-        tensorflow::strings::StrCat(i);
+    rhs_dims[dnums.kernel_spatial_dimensions(i)] = StrCat(i);
   }
 
   result += "dim_labels=";
@@ -2164,4 +2258,15 @@ bool HloInstruction::CouldBeBitcast() const {
   }
 }
 
+HloModule* HloInstruction::GetModule() const {
+  if (parent_) {
+    return parent_->parent();
+  }
+  return nullptr;
+}
+
+void HloInstruction::UniquifyName(NameUniquer* name_uniquer) {
+  name_ = name_uniquer->GetUniqueName(name_);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 6557ca9116312c4bc31b9f0ba734edd11106d1e7..d300d99adec5201b70b0fe4eb65ef5b84362b018 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -22,6 +22,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTION_H_
 
 #include <functional>
+#include <iosfwd>
 #include <list>
 #include <memory>
 #include <string>
@@ -33,6 +34,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -45,18 +47,23 @@ limitations under the License.
 namespace xla {
 
 class HloComputation;
+class HloModule;
 
 // HLO instructions are the IR used by the high-level compiler.
 class HloInstruction {
  public:
   enum class FusionKind {
     kLoop,                // Fused into a loop.
-    kInput,               // Fused into a reduction kernel.
+    kInput,               // Op's input is fused into the op itself.
+    kOutput,              // Op's output is fused into the op itself.
+                          // REQUIRES: At least one operand buffer must be able
+                          // to alias the output buffer.
     kTransposeDot,        // Fused into a dot with transposed operands.
     kConvBackwardFilter,  // Fused into a backward filter convolution.
     kConvBackwardInput,   // Fused into a backward input convolution.
   };
 
+  ~HloInstruction();
   // Creates a parameter-retrieving instruction.
   static std::unique_ptr<HloInstruction> CreateParameter(int64 parameter_number,
                                                          const Shape& shape,
@@ -371,8 +378,12 @@ class HloInstruction {
 
   // Performs a postorder DFS visit using this node as the root. If
   // call_finish_visit is true, then DfsHloVisitor::FinishVisit is called when
-  // complete.
-  Status Accept(DfsHloVisitor* visitor, bool call_finish_visit = true);
+  // complete. If ignore_control_predecessors is true, instructions only
+  // reachable via control dependencies will not be visited, and the postorder
+  // will not take control dependencies into account. It is as if the control
+  // dependencies didn't exist in the graph at all.
+  Status Accept(DfsHloVisitor* visitor, bool call_finish_visit = true,
+                bool ignore_control_predecessors = false);
 
   // Same as Accept() above, but the order of operand and control predecessor
   // visitation is determined by the given operand order; if compare(A, B) ==
@@ -418,6 +429,11 @@ class HloInstruction {
     return parameter_name_;
   }
 
+  void set_parameter_name(const string& str) {
+    CHECK_EQ(HloOpcode::kParameter, opcode_);
+    parameter_name_ = str;
+  }
+
   // Returns the dimension sizes or numbers associated with this instruction.
   //
   // Precondition: opcode() is one of: concatenate, reduce, broadcast, reshape,
@@ -476,7 +492,10 @@ class HloInstruction {
   string SignatureString() const;
 
   // Returns a debugging string that represents this instruction.
-  string ToString(bool compact_operands = false) const;
+  string ToString(bool compact_operands = false,
+                  bool include_metadata = true) const;
+
+  string ToStringNoMetadata() const { return ToString(false, false); }
 
   // As ToString, but returns a shorter string.
   string ToShortString() const;
@@ -485,7 +504,9 @@ class HloInstruction {
   // or "elementwise".
   string ToCategory() const;
 
-  // Returns the string concatenation of parent name and this instructions name.
+  // Returns the string concatenation of parent name and this instructions
+  // name. This name is guaranteed to be unique among all instructions in the
+  // HloModule.
   string FullyQualifiedName() const;
 
   // Returns a logging instruction, if the output of this instruction is logged.
@@ -534,6 +555,11 @@ class HloInstruction {
   // Precondition: opcode() == HloOpcode::kFusion
   HloInstruction* fused_expression_root() const;
 
+  // Returns the computation for this fused instruction.
+  //
+  // Precondition: opcode() == HloOpcode::kFusion
+  HloComputation* fused_instructions_computation() const;
+
   // Returns the vector of fused instructions inside this fusion
   // instruction. The order is a reverse postorder of the fused expression (root
   // is first in the order).
@@ -704,8 +730,9 @@ class HloInstruction {
   // this instruction.
   const string& name() const { return name_; }
 
-  // Sets the string identifier for this instruction.
-  void set_name(const string& name) { name_ = name; }
+  // Use the given NameUniquer to select a unique name for the instruction based
+  // on the instruction's existing name.
+  void UniquifyName(NameUniquer* name_uniquer);
 
   // Sets the debug metadata for this instruction.
   void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; }
@@ -718,10 +745,21 @@ class HloInstruction {
   const HloComputation* parent() const { return parent_; }
   HloComputation* parent() { return parent_; }
 
+  // Returns the module for this instruction.
+  HloModule* GetModule() const;
+
   // Returns whether we could assign input and output layouts to this
   // instruction to make it a bitcast.
   bool CouldBeBitcast() const;
 
+  // Sets the parent fusion instruction for this instruction.
+  //
+  // Precondition: opcode() == HloOpcode::kFusion
+  void SetParentFusion(HloInstruction* fusion_instruction) {
+    CHECK_EQ(HloOpcode::kFusion, fusion_instruction->opcode());
+    parent_fusion_instruction_ = fusion_instruction;
+  }
+
  private:
   enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse };
 
@@ -758,7 +796,8 @@ class HloInstruction {
   // Inner DFS traversal function -- this function being called (rather than
   // Accept above) allows us to distinguish the root of the traversal.
   Status AcceptInternal(DfsHloVisitor* visitor,
-                        const CompareFunction* operand_order);
+                        const CompareFunction* operand_order,
+                        bool ignore_control_predecessors);
 
   // CHECKs various invariants of a fusion instruction.
   void CheckFusionInstruction() const;
@@ -807,22 +846,14 @@ class HloInstruction {
   // padding of this pad instruction. Only set for pad instructions.
   std::unique_ptr<PaddingConfig> padding_config_;
 
-  // The set of instruction fused into this fusion instruction. Only set for
-  // fusion instructions.
-  std::list<std::unique_ptr<HloInstruction>> fused_instructions_;
+  // The computation that stores of instructions fused into this fusion
+  // instruction. Only set for fusion instructions.
+  std::unique_ptr<HloComputation> fused_instructions_computation_;
 
   // If this instruction is fused into a fusion instruction, this field points
   // to the fusion instruction.
   HloInstruction* parent_fusion_instruction_ = nullptr;
 
-  // The vector of parameter instructions inside this fusion instruction.  The
-  // index of the vector is the parameter_number of the parameter instruction.
-  // This vector is non-empty only for fusion instructions.
-  std::vector<HloInstruction*> fused_parameters_;
-
-  // The root of the expression fused into this fusion instruction.
-  HloInstruction* fused_root_ = nullptr;
-
   // The type of the fusion. Used by kFusion only.
   FusionKind fusion_kind_;
 
@@ -898,6 +929,8 @@ class HloInstruction {
 
 string ToString(HloInstruction::FusionKind kind);
 
+std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTION_H_
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 8eabaa1c474aa068c423099919d3382f04c7591c..a226ab0d0c43e6df6216e4b0f58ed4270cb03d40 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -21,9 +21,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -31,6 +33,9 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
+using ::testing::UnorderedElementsAre;
+
 class HloInstructionTest : public HloTestBase {
  protected:
   HloInstructionTest() {}
@@ -148,9 +153,9 @@ TEST_F(HloInstructionTest, UserWithTwoOperands) {
   auto add = HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo.get(),
                                           bar.get());
 
-  ExpectEqOrdered(add->operands(), {foo.get(), bar.get()});
-  ExpectEqUnordered(foo->users(), {add.get()});
-  ExpectEqUnordered(bar->users(), {add.get()});
+  EXPECT_THAT(add->operands(), UnorderedElementsAre(foo.get(), bar.get()));
+  EXPECT_THAT(foo->users(), UnorderedElementsAre(add.get()));
+  EXPECT_THAT(bar->users(), UnorderedElementsAre(add.get()));
 
   OpAndUserCollectingVisitor visitor;
   ASSERT_IS_OK(add->Accept(&visitor));
@@ -383,12 +388,13 @@ TEST_F(HloInstructionTest, ReplaceUseInBinaryOps) {
   EXPECT_EQ(1, foo->user_count());
   EXPECT_EQ(2, bar->user_count());
 
-  ExpectEqUnordered(foo->users(), {add_foobar.get()});
-  ExpectEqOrdered(add_foobar->operands(), {foo.get(), bar.get()});
+  EXPECT_THAT(foo->users(), UnorderedElementsAre(add_foobar.get()));
+  EXPECT_THAT(add_foobar->operands(), ElementsAre(foo.get(), bar.get()));
 
-  ExpectEqUnordered(bar->users(), {add_foobar.get(), add_foofoo.get()});
-  ExpectEqOrdered(add_foobar->operands(), {foo.get(), bar.get()});
-  ExpectEqOrdered(add_foofoo->operands(), {bar.get(), bar.get()});
+  EXPECT_THAT(bar->users(),
+              UnorderedElementsAre(add_foobar.get(), add_foofoo.get()));
+  EXPECT_THAT(add_foobar->operands(), ElementsAre(foo.get(), bar.get()));
+  EXPECT_THAT(add_foofoo->operands(), ElementsAre(bar.get(), bar.get()));
 }
 
 TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) {
@@ -404,16 +410,17 @@ TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) {
                                                  foo.get(), bar.get());
 
   EXPECT_EQ(2, foo->user_count());
-  ExpectEqUnordered(foo->users(), {tuple.get(), add_foobar.get()});
+  EXPECT_THAT(foo->users(),
+              UnorderedElementsAre(tuple.get(), add_foobar.get()));
 
   // Replace the use of foo in tuple with bar.
   ASSERT_IS_OK(foo->ReplaceUseWith(tuple.get(), bar.get()));
 
-  ExpectEqUnordered(foo->users(), {add_foobar.get()});
+  EXPECT_THAT(foo->users(), UnorderedElementsAre(add_foobar.get()));
 
   // Both uses of foo in tuple should have been replaced with bar.
-  ExpectEqOrdered(tuple->operands(),
-                  {bar.get(), bar.get(), baz.get(), bar.get()});
+  EXPECT_THAT(tuple->operands(),
+              ElementsAre(bar.get(), bar.get(), baz.get(), bar.get()));
 }
 
 TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) {
@@ -426,7 +433,7 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) {
   auto log = HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo.get());
 
   EXPECT_EQ(2, foo->user_count());
-  ExpectEqUnordered(foo->users(), {exp.get(), log.get()});
+  EXPECT_THAT(foo->users(), UnorderedElementsAre(exp.get(), log.get()));
   EXPECT_EQ(0, bar->user_count());
 
   // Replace the use of foo in exp with bar.
@@ -434,8 +441,8 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) {
 
   // The use of foo in log should not have been affected.
   EXPECT_EQ(1, foo->user_count());
-  ExpectEqUnordered(foo->users(), {log.get()});
-  ExpectEqOrdered(log->operands(), {foo.get()});
+  EXPECT_THAT(foo->users(), UnorderedElementsAre(log.get()));
+  EXPECT_THAT(log->operands(), ElementsAre(foo.get()));
 
   // Bar should now be used in exp.
   EXPECT_EQ(1, bar->user_count());
@@ -466,7 +473,8 @@ TEST_F(HloInstructionTest, ReplaceAllUsesWithInBinaryOps) {
   EXPECT_EQ(0, foo->user_count());
   EXPECT_EQ(2, bar->user_count());
 
-  ExpectEqUnordered(bar->users(), {add_foobar.get(), add_foofoo.get()});
+  EXPECT_THAT(bar->users(),
+              UnorderedElementsAre(add_foobar.get(), add_foofoo.get()));
 }
 
 TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) {
@@ -490,7 +498,8 @@ TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) {
   EXPECT_EQ(0, foo->user_count());
   EXPECT_EQ(3, bar->user_count());
 
-  ExpectEqUnordered(bar->users(), {add_foobar.get(), exp.get(), tuple.get()});
+  EXPECT_THAT(bar->users(),
+              UnorderedElementsAre(add_foobar.get(), exp.get(), tuple.get()));
 }
 
 // Simple visitor that collects and post-processes each node in the graph.
@@ -558,8 +567,8 @@ TEST_F(HloInstructionTest, SingletonFusionOp) {
   auto fusion = HloInstruction::CreateFusion(
       r0f32_, HloInstruction::FusionKind::kLoop, exp.get());
 
-  ExpectEqOrdered(fusion->operands(), {constant.get()});
-  ExpectEqUnordered(constant->users(), {fusion.get(), exp.get()});
+  EXPECT_THAT(fusion->operands(), ElementsAre(constant.get()));
+  EXPECT_THAT(constant->users(), UnorderedElementsAre(fusion.get(), exp.get()));
 }
 
 TEST_F(HloInstructionTest, BinaryFusionOp) {
@@ -574,9 +583,12 @@ TEST_F(HloInstructionTest, BinaryFusionOp) {
   auto fusion = HloInstruction::CreateFusion(
       r0f32_, HloInstruction::FusionKind::kLoop, add.get());
 
-  ExpectEqOrdered(fusion->operands(), {constant1.get(), constant2.get()});
-  ExpectEqUnordered(constant1->users(), {fusion.get(), add.get()});
-  ExpectEqUnordered(constant2->users(), {fusion.get(), add.get()});
+  EXPECT_THAT(fusion->operands(),
+              ElementsAre(constant1.get(), constant2.get()));
+  EXPECT_THAT(constant1->users(),
+              UnorderedElementsAre(fusion.get(), add.get()));
+  EXPECT_THAT(constant2->users(),
+              UnorderedElementsAre(fusion.get(), add.get()));
 }
 
 TEST_F(HloInstructionTest, ChainFusionOp) {
@@ -593,8 +605,28 @@ TEST_F(HloInstructionTest, ChainFusionOp) {
   fusion->FuseInstruction(exp2.get());
   fusion->FuseInstruction(exp1.get());
 
-  ExpectEqOrdered(fusion->operands(), {constant.get()});
-  ExpectEqUnordered(constant->users(), {fusion.get(), exp1.get()});
+  EXPECT_THAT(fusion->operands(), ElementsAre(constant.get()));
+  EXPECT_THAT(constant->users(),
+              UnorderedElementsAre(fusion.get(), exp1.get()));
+}
+
+TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) {
+  // Create a chain of fused unary ops.
+  auto constant =
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f));
+  auto exp1 =
+      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant.get());
+  auto exp2 = HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp1.get());
+  OpMetadata metadata;
+  metadata.set_op_name("tf_op");
+  exp1->set_metadata(metadata);
+  exp2->set_metadata(metadata);
+
+  auto fusion = HloInstruction::CreateFusion(
+      r0f32_, HloInstruction::FusionKind::kLoop, exp2.get());
+  auto* fused = fusion->FuseInstruction(exp1.get());
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(metadata, fusion->metadata()));
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(metadata, fused->metadata()));
 }
 
 TEST_F(HloInstructionTest, FusionOpWithCalledComputations) {
@@ -626,15 +658,15 @@ TEST_F(HloInstructionTest, FusionOpWithCalledComputations) {
   auto fusion = HloInstruction::CreateFusion(
       scalar_shape, HloInstruction::FusionKind::kLoop, map_3_y.get());
 
-  ASSERT_EQ(fusion->called_computations().size(), 1);
-  EXPECT_EQ(fusion->called_computations()[0], computation_y.get());
+  EXPECT_THAT(fusion->called_computations(), ElementsAre(computation_y.get()));
 
   fusion->FuseInstruction(map_2_x.get());
-  ASSERT_EQ(fusion->called_computations().size(), 2);
-  EXPECT_EQ(fusion->called_computations()[1], computation_x.get());
+  EXPECT_THAT(fusion->called_computations(),
+              ElementsAre(computation_y.get(), computation_x.get()));
 
   fusion->FuseInstruction(map_1_x.get());
-  ASSERT_EQ(fusion->called_computations().size(), 2);
+  EXPECT_THAT(fusion->called_computations(),
+              ElementsAre(computation_y.get(), computation_x.get()));
 }
 
 TEST_F(HloInstructionTest, ComplexFusionOp) {
@@ -675,8 +707,9 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
 
   // Operands in the fusion instruction's operands() vector should be in the
   // order in which their users were added fused.
-  ExpectEqOrdered(fusion->operands(), {c1.get(), c3.get(), c2.get()});
-  ExpectEqUnordered(c1->users(), {add.get(), tuple.get(), fusion.get()});
+  EXPECT_THAT(fusion->operands(), ElementsAre(c1.get(), c3.get(), c2.get()));
+  EXPECT_THAT(c1->users(),
+              UnorderedElementsAre(add.get(), tuple.get(), fusion.get()));
 }
 
 // Convenience function for comparing two HloInstructions inside of
@@ -929,5 +962,44 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
                                root2->operand(1)->operand(0)->shape()));
 }
 
+TEST_F(HloInstructionTest, CloneSuffixNames) {
+  // Test that the suffix string added to cloned instructions is not
+  // duplicated. Rather a numeric incrementing value should be appended. That
+  // is, we want "foo.clone2", not "foo.clone.clone".
+
+  // Test cloning the same instruction multiple times.
+  auto foo =
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "foo");
+  EXPECT_EQ(foo->Clone()->name(), "%foo.clone");
+  EXPECT_EQ(foo->Clone()->Clone()->name(), "%foo.clone2");
+  EXPECT_EQ(foo->Clone()->Clone()->Clone()->name(), "%foo.clone3");
+
+  // Test custom suffixes.
+  EXPECT_EQ(foo->Clone("bar")->name(), "%foo.bar");
+  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->name(), "%foo.bar2");
+  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->Clone()->name(),
+            "%foo.bar2.clone");
+
+  // Test instruction name with a dot.
+  auto foo_baz = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "foo.baz");
+  EXPECT_EQ(foo_baz->Clone()->name(), "%foo.baz.clone");
+
+  // Test incrementing a large number after the suffix.
+  auto foo_clone234 = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "foo.clone234");
+  EXPECT_EQ(foo_clone234->Clone()->name(), "%foo.clone235");
+
+  // Test a non-numeric string after the cloning suffix.
+  auto foo_clonexyz = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "foo.clonexyz");
+  EXPECT_EQ(foo_clonexyz->Clone()->name(), "%foo.clonexyz.clone");
+
+  // Test a name with multiple appearances of the suffix.
+  auto foo_clone_clone3 = HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "foo.clone.clone3");
+  EXPECT_EQ(foo_clone_clone3->Clone()->name(), "%foo.clone.clone4");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e022c4836d87866925ab7e56c2250d87d0f5dfec
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/test.h"
+
+namespace xla {
+namespace testing {
+
+bool HloMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  // These cases are self-explanatory from the printed value.
+  if (!instruction || instruction->opcode() != opcode_) {
+    return false;
+  }
+  // Special case: no operand matchers means don't verify.
+  if (operands_.empty()) {
+    return true;
+  }
+  const auto& operands = instruction->operands();
+  if (operands.size() != operands_.size()) {
+    *listener << "has too "
+              << (operands.size() > operands_.size() ? "many" : "few")
+              << " operands (got " << operands.size() << ", want "
+              << operands_.size() << ")";
+    return false;
+  }
+  for (int index = 0; index < operands.size(); index++) {
+    ::testing::StringMatchResultListener inner_listener;
+    if (!operands_[index].MatchAndExplain(operands[index], &inner_listener)) {
+      if (listener->IsInterested()) {
+        *listener << "\noperand " << index << ":\n\t"
+                  << operands[index]->ToString()
+                  << "\ndoesn't match expected:\n\t";
+        operands_[index].DescribeTo(listener->stream());
+        string explanation = inner_listener.str();
+        if (!explanation.empty()) {
+          *listener << ", " << explanation;
+        }
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+void HloMatcher::DescribeTo(::std::ostream* os) const {
+  *os << opcode_;
+  if (!operands_.empty()) {
+    *os << "(";
+    for (int i = 0; i < operands_.size(); i++) {
+      if (i > 0) {
+        *os << ", ";
+      }
+      operands_[i].DescribeTo(os);
+    }
+    *os << ")";
+  }
+}
+
+}  // namespace testing
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
new file mode 100644
index 0000000000000000000000000000000000000000..141251011cc0b4205b6069ff90415492ead9f7a9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -0,0 +1,142 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/test.h"
+
+namespace xla {
+namespace testing {
+
+class HloMatcher : public ::testing::MatcherInterface<const HloInstruction*> {
+ public:
+  HloMatcher(HloOpcode opcode,
+             std::vector<::testing::Matcher<const HloInstruction*>> operands)
+      : opcode_(opcode), operands_(operands) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+
+  void DescribeTo(::std::ostream* os) const override;
+
+ private:
+  HloOpcode opcode_;
+  std::vector<::testing::Matcher<const HloInstruction*>> operands_;
+};
+
+// HloInstruction* matchers for opcode and operands. Example:
+//   namespace op = xla::opcode_matchers;
+//   EXPECT_THAT(instruction,
+//               op::Add(op::Reshape(), op::Add(op::Reshape(), _)));
+namespace opcode_matchers {
+#define HLO_MATCHER(opcode)                                                \
+  template <typename... M>                                                 \
+  ::testing::Matcher<const ::xla::HloInstruction*> opcode(M... operands) { \
+    return ::testing::MakeMatcher(new ::xla::testing::HloMatcher(          \
+        ::xla::HloOpcode::k##opcode, {operands...}));                      \
+  }
+HLO_MATCHER(Abs);
+HLO_MATCHER(Add);
+HLO_MATCHER(Bitcast);
+HLO_MATCHER(Broadcast);
+HLO_MATCHER(Call);
+HLO_MATCHER(Ceil);
+HLO_MATCHER(Clamp);
+HLO_MATCHER(Concatenate);
+HLO_MATCHER(Constant);
+HLO_MATCHER(Convert);
+HLO_MATCHER(Convolution);
+HLO_MATCHER(Copy);
+HLO_MATCHER(CrossReplicaSum);
+HLO_MATCHER(CustomCall);
+HLO_MATCHER(Divide);
+HLO_MATCHER(Dot);
+HLO_MATCHER(DynamicSlice);
+HLO_MATCHER(DynamicUpdateSlice);
+HLO_MATCHER(Eq);
+HLO_MATCHER(Exp);
+HLO_MATCHER(Floor);
+HLO_MATCHER(Fusion);
+HLO_MATCHER(Ge);
+HLO_MATCHER(GetTupleElement);
+HLO_MATCHER(Gt);
+HLO_MATCHER(Index);
+HLO_MATCHER(Infeed);
+HLO_MATCHER(IsFinite);
+HLO_MATCHER(Le);
+HLO_MATCHER(Log);
+HLO_MATCHER(LogicalAnd);
+HLO_MATCHER(LogicalNot);
+HLO_MATCHER(LogicalOr);
+HLO_MATCHER(Lt);
+HLO_MATCHER(Map);
+HLO_MATCHER(Maximum);
+HLO_MATCHER(Minimum);
+HLO_MATCHER(Multiply);
+HLO_MATCHER(Ne);
+HLO_MATCHER(Negate);
+HLO_MATCHER(Outfeed);
+HLO_MATCHER(Pad);
+HLO_MATCHER(Parameter);
+HLO_MATCHER(Power);
+HLO_MATCHER(Recv);
+HLO_MATCHER(Reduce);
+HLO_MATCHER(ReduceWindow);
+HLO_MATCHER(Remainder);
+HLO_MATCHER(Reshape);
+HLO_MATCHER(Reverse);
+HLO_MATCHER(Rng);
+HLO_MATCHER(Select);
+HLO_MATCHER(SelectAndScatter);
+HLO_MATCHER(Send);
+HLO_MATCHER(Sign);
+HLO_MATCHER(Slice);
+HLO_MATCHER(Sort);
+HLO_MATCHER(Subtract);
+HLO_MATCHER(Tanh);
+HLO_MATCHER(Trace);
+HLO_MATCHER(Transpose);
+HLO_MATCHER(Tuple);
+HLO_MATCHER(Update);
+HLO_MATCHER(While);
+#undef HLO_MATCHER
+}  // namespace opcode_matchers
+
+// Helper to convert smart to raw pointers for matching.
+template <typename Container>
+std::vector<const HloInstruction*> Pointers(const Container& container) {
+  std::vector<const HloInstruction*> result;
+  result.reserve(container.size());
+  for (const auto& entry : container) result.push_back(entry.get());
+  return result;
+}
+
+}  // namespace testing
+
+// Tell GMock to print HloInstruction* by value, so error messages are nice.
+// Has to be in the same namespace as 'HloInstruction'.
+void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
+  *os << (inst ? inst->ToString() : "nullptr");
+}
+
+void PrintTo(HloInstruction* inst, ::std::ostream* os) {
+  PrintTo(const_cast<const HloInstruction*>(inst), os);
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1465d1cacdc971a04c620bc48bed33239a67a955
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace op = xla::testing::opcode_matchers;
+using ::testing::_;
+using ::testing::Eq;
+
+namespace xla {
+namespace {
+
+template <typename M, typename T>
+string Explain(const T& t, const M& m) {
+  ::testing::StringMatchResultListener listener;
+  EXPECT_THAT(t, ::testing::Not(m));  // For the error message.
+  EXPECT_FALSE(m.MatchAndExplain(t, &listener));
+  return listener.str();
+}
+
+TEST(HloMatchersTest, Test) {
+  auto shape = ShapeUtil::MakeShape(F32, {1});
+  auto param = HloInstruction::CreateParameter(0, shape, "param");
+  auto mul = HloInstruction::CreateBinary(shape, HloOpcode::kMultiply,
+                                          param.get(), param.get());
+  auto add = HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param.get(),
+                                          mul.get());
+
+  EXPECT_THAT(add.get(), op::Add());
+  EXPECT_THAT(add.get(), op::Add(op::Parameter(), op::Multiply()));
+  EXPECT_THAT(add.get(),
+              op::Add(op::Parameter(), op::Multiply(_, op::Parameter())));
+
+  // Negative matches: check the explanation string.
+  EXPECT_THAT(Explain(add.get(), op::Parameter()), Eq(""));
+  EXPECT_THAT(Explain(add.get(), op::Add(op::Parameter())),
+              Eq("has too many operands (got 2, want 1)"));
+  EXPECT_THAT(
+      Explain(add.get(), op::Add(op::Parameter(), op::Parameter())),
+      Eq("\noperand 1:\n\t"
+         "%multiply = f32[1]{0} multiply(f32[1]{0} %param, f32[1]{0} %param)\n"
+         "doesn't match expected:\n\t"
+         "parameter"));
+  EXPECT_THAT(
+      Explain(add.get(),
+              op::Add(op::Parameter(), op::Multiply(op::Add(), op::Add()))),
+      Eq("\noperand 1:\n\t"
+         "%multiply = f32[1]{0} multiply(f32[1]{0} %param, f32[1]{0} %param)\n"
+         "doesn't match expected:\n\t"
+         "multiply(add, add), \n"
+         "operand 0:\n\t"
+         "%param = f32[1]{0} parameter(0)\n"
+         "doesn't match expected:\n\t"
+         "add"));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 36064e93fe8a750d05183d78738e92768506a835..cff9a6658d73dde6fbf0c754eb6df7cc7e9d6d16 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -31,20 +32,53 @@ limitations under the License.
 
 namespace xla {
 
-HloComputation* HloModule::AddEntryComputation(
+HloModule::HloModule(const string& name,
+                     const VersionedComputationHandle& entry_computation_handle)
+    : name_(name),
+      config_(nullptr),
+      entry_computation_(nullptr),
+      has_entry_computation_handle_(true),
+      entry_computation_handle_(entry_computation_handle),
+      computation_name_uniquer_(/*separator=*/".") {}
+
+HloModule::HloModule(const string& name,
+                     const VersionedComputationHandle& entry_computation_handle,
+                     const HloModuleConfig& config)
+    : name_(name),
+      config_(MakeUnique<HloModuleConfig>(config)),
+      entry_computation_(nullptr),
+      has_entry_computation_handle_(true),
+      entry_computation_handle_(entry_computation_handle),
+      computation_name_uniquer_(/*separator=*/".") {}
+
+HloModule::HloModule(const string& name)
+    : name_(name),
+      config_(nullptr),
+      entry_computation_(nullptr),
+      computation_name_uniquer_(/*separator=*/".") {}
+
+void HloModule::set_config(const HloModuleConfig& config) {
+  config_ = MakeUnique<HloModuleConfig>(config);
+}
+
+HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation) {
-  CHECK_EQ(nullptr, entry_computation_);
-  entry_computation_ = computation.get();
+  computation->UniquifyName(&computation_name_uniquer_);
   computation->set_parent(this);
   computations_.push_back(std::move(computation));
   return computations_.back().get();
 }
 
+HloComputation* HloModule::AddEntryComputation(
+    std::unique_ptr<HloComputation> computation) {
+  CHECK_EQ(nullptr, entry_computation_);
+  entry_computation_ = computation.get();
+  return AddComputationInternal(std::move(computation));
+}
+
 HloComputation* HloModule::AddEmbeddedComputation(
     std::unique_ptr<HloComputation> computation) {
-  computation->set_parent(this);
-  computations_.push_back(std::move(computation));
-  return computations_.back().get();
+  return AddComputationInternal(std::move(computation));
 }
 
 void HloModule::ReplaceComputations(
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index d598750da657ab3d72c6c8689b6642ea5d7e602c..3efb9c72bb16249fbac5d7b84908305d003b31b4 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -25,6 +25,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -41,19 +43,18 @@ namespace xla {
 // computations are owned by the module.
 class HloModule {
  public:
-  explicit HloModule(const string& name,
-                     const VersionedComputationHandle& entry_computation_handle)
-      : name_(name),
-        entry_computation_(nullptr),
-        has_entry_computation_handle_(true),
-        entry_computation_handle_(entry_computation_handle) {}
+  HloModule(const string& name,
+            const VersionedComputationHandle& entry_computation_handle);
+
+  HloModule(const string& name,
+            const VersionedComputationHandle& entry_computation_handle,
+            const HloModuleConfig& config);
 
   // Constructor without a versioned computation handle. This constructor should
   // only be used for HloModules used outside of the XLA service (eg
   // tests). The versioned handle is used by the service in the compilation
   // cache.
-  explicit HloModule(const string& name)
-      : name_(name), entry_computation_(nullptr) {}
+  explicit HloModule(const string& name);
 
   // Adds an entry computation to the module. A module can only have one entry
   // computation. Returns a pointer to the newly added computation.
@@ -95,6 +96,14 @@ class HloModule {
   // computation B, then A will appear after B in the sort.
   std::list<HloComputation*> MakeComputationPostOrder() const;
 
+  bool has_config() const { return config_ != nullptr; }
+
+  void set_config(const HloModuleConfig& config);
+
+  const HloModuleConfig& config() const { return *config_; }
+
+  HloModuleConfig* mutable_config() { return config_.get(); }
+
   string ToString() const;
 
   // Outlines the given expression from the given computation.
@@ -110,8 +119,17 @@ class HloModule {
   // Returns a randomly generated uint64.
   uint64 RandomNew64() const;
 
+  // Returns the unique name for a computation in this module.
+  string GetUniqueCompuationName(const string& prefix) {
+    return computation_name_uniquer_.GetUniqueName(prefix);
+  }
+
  private:
+  HloComputation* AddComputationInternal(
+      std::unique_ptr<HloComputation> computation);
+
   const string name_;
+  std::unique_ptr<HloModuleConfig> config_;
   HloComputation* entry_computation_;
   std::vector<std::unique_ptr<HloComputation>> computations_;
 
@@ -125,6 +143,9 @@ class HloModule {
   // Versioned handle of the entry computation of the module.
   bool has_entry_computation_handle_ = false;
   VersionedComputationHandle entry_computation_handle_;
+
+  // Unique name generator for computation names, which are unique per module.
+  NameUniquer computation_name_uniquer_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 0f4252522d3c021ee5d95d1713167b2fb0fb1d69..1175be4f5082401483767ba02b83a8cec68605dd 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace xla {
@@ -61,7 +61,8 @@ TEST_F(HloModuleTest, OneComputationPostOrder) {
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(CreateConstantComputation());
 
-  EXPECT_EQ(module->MakeComputationPostOrder().front(), computation);
+  EXPECT_THAT(module->MakeComputationPostOrder(),
+              ::testing::ElementsAre(computation));
 }
 
 TEST_F(HloModuleTest, TwoComputationsPostOrder) {
@@ -71,9 +72,13 @@ TEST_F(HloModuleTest, TwoComputationsPostOrder) {
   auto computation2 =
       module->AddEmbeddedComputation(CreateConstantComputation());
 
-  EXPECT_MATCH(
-      testing::ListToVec<HloComputation*>(module->MakeComputationPostOrder()),
-      testing::UnorderedMatcher<HloComputation*>(computation1, computation2));
+  EXPECT_THAT(module->MakeComputationPostOrder(),
+              ::testing::UnorderedElementsAre(computation1, computation2));
+
+  // We specified the same name for both computations, but the HloModule should
+  // have made the names unique.
+  EXPECT_EQ(computation1->name(), "Constant");
+  EXPECT_EQ(computation2->name(), "Constant.1");
 }
 
 TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
@@ -89,9 +94,9 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
       CreateCallComputation({computation2, computation3}));
 
   auto post_order = module->MakeComputationPostOrder();
-  EXPECT_MATCH(testing::ListToVec<HloComputation*>(post_order),
-               testing::UnorderedMatcher<HloComputation*>(
-                   computation1, computation2, computation3, computation4));
+  EXPECT_THAT(post_order,
+              ::testing::UnorderedElementsAre(computation1, computation2,
+                                              computation3, computation4));
   EXPECT_EQ(post_order.back(), computation4);
   EXPECT_EQ(post_order.front(), computation1);
 }
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc
index 616b239a9310bc13e14c861184b7efebe7da6b2f..ceb0cdaa3169bb57e4ebb61ac1b2ea41f1ef7995 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode.cc
@@ -165,4 +165,17 @@ bool HloOpcodeIsComparison(HloOpcode opcode) {
   }
 }
 
+bool HloOpcodeIsVariadic(HloOpcode opcode) {
+  switch (opcode) {
+    case HloOpcode::kCall:
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kFusion:
+    case HloOpcode::kMap:
+    case HloOpcode::kTuple:
+      return true;
+    default:
+      return false;
+  }
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 978ed5e79b90c3c12f31b4d4e3d3314849fed75c..e2cdbfdfa7a4b5509dccf9a83ffbd799f9ab1374 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -104,6 +104,9 @@ inline std::ostream& operator<<(std::ostream& os, HloOpcode opcode) {
 // Returns true iff the given opcode is a comparison operation.
 bool HloOpcodeIsComparison(HloOpcode opcode);
 
+// Returns true iff the given opcode has variadic operands.
+bool HloOpcodeIsVariadic(HloOpcode opcode);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_OPCODE_H_
diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
index 0b64c16fdc6639a0288b4a69698a600b09ba32f7..892c89f9df209f2e39005a4901feae6699ce4d0b 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index b3168ed40ece3ea65c6b26b96250f2ea77969953..d1ef8cb6918d02287912b76b213ed2acd7940d76 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -34,15 +34,95 @@ limitations under the License.
 
 namespace xla {
 
-PredecessorHloOrdering::PredecessorHloOrdering(const HloModule* module)
-    : module_(module) {}
+namespace {
+
+// Returns the nearest call graph ancestors of instructions 'a' and 'b' for
+// which the ancestors are in the same computation. An instruction is an call
+// graph ancestor of 'a' if the instruction calls the computation containing 'a'
+// either directly or transitively. Degeneratively an instruction is an ancestor
+// of itself. nullptr is returned if there is no common ancestor or if the
+// caller chain of 'a' or 'b' diverges (has multiple callers) before the nearest
+// common ancestor.
+//
+// Example:
+//
+// Entry computation:
+//   %x = Call(A, {Constant(42.0)})
+//   %y = Call(B, {%x})
+//
+// Computation A:
+//   %a = Negate(Param())
+//
+// Computation B:
+//   %b = Exp(Param());
+//
+// If called with %a and %b, this function would return (%x, %y). %x is an
+// ancestor of %a, and %y is an ancestor of %b, and %x and %y are in the same
+// computation.
+std::pair<const HloInstruction*, const HloInstruction*>
+GetNearestCallGraphAncestorsInSameComputation(const HloInstruction* a,
+                                              const HloInstruction* b,
+                                              const CallGraph& call_graph) {
+  // Lambda which returns the next instruction in the callee->caller chain in
+  // the call graph. This is the unique instruction which calls the computation
+  // containing 'instruction'. If more than one instruction calls the
+  // computation containing 'instruction' or no instructions call the
+  // computation then nullptr is returned.
+  auto next_caller =
+      [&call_graph](
+          const HloInstruction* instruction) -> const HloInstruction* {
+    const CallGraphNode& node = call_graph.GetNode(instruction->parent());
+    if (node.caller_callsites().size() != 1) {
+      return nullptr;
+    }
+    return node.caller_callsites()[0].instruction();
+  };
+
+  // Iterate through the callee->caller chains and find the earliest common
+  // element.
+  for (const HloInstruction* a_ancestor = a; a_ancestor != nullptr;
+       a_ancestor = next_caller(a_ancestor)) {
+    for (const HloInstruction* b_ancestor = b; b_ancestor != nullptr;
+         b_ancestor = next_caller(b_ancestor)) {
+      if (a_ancestor->parent() == b_ancestor->parent()) {
+        return {a_ancestor, b_ancestor};
+      }
+    }
+  }
+  return {nullptr, nullptr};
+}
+
+}  // namespace
 
-bool PredecessorHloOrdering::ExecutesBefore(const HloInstruction* a,
-                                            const HloInstruction* b) const {
-  // Instructions in different computations are unordered.
-  if (a->parent() != b->parent()) {
+bool HloOrdering::ExecutesBefore(const HloInstruction* a,
+                                 const HloInstruction* b) const {
+  // 'a' and 'b' may be in different computations. In this case, find the
+  // callgraph ancestor instructions which call (potentially transitively) the
+  // computations containing 'a' and 'b' and use these ancestor instructions to
+  // compare order.
+  const HloInstruction* a_ancestor;
+  const HloInstruction* b_ancestor;
+  std::tie(a_ancestor, b_ancestor) =
+      GetNearestCallGraphAncestorsInSameComputation(a, b, *call_graph_);
+
+  if (a_ancestor == nullptr) {
+    // Ancestors in a common computation could not be found so consider the
+    // instructions 'a' and 'b' to be unordered.
     return false;
   }
+  // a_ancestor and b_ancestor must be either both null or both non-null.
+  CHECK_NE(b_ancestor, nullptr);
+  CHECK_EQ(a_ancestor->parent(), b_ancestor->parent());
+  return ExecutesBeforeInSameComputation(a_ancestor, b_ancestor);
+}
+
+PredecessorHloOrdering::PredecessorHloOrdering(const HloModule* module)
+    : HloOrdering(module) {}
+
+bool PredecessorHloOrdering::ExecutesBeforeInSameComputation(
+    const HloInstruction* a, const HloInstruction* b) const {
+  CHECK_EQ(a->parent(), b->parent());
+
   // 'a' executes before 'b' if 'a' is in the strict predecessor set of 'b'.
   return strict_predecessors_.at(b->parent())->IsReachable(b, a);
 }
@@ -86,7 +166,7 @@ string DependencyHloOrdering::ToString() const {
 
 SequentialHloOrdering::SequentialHloOrdering(
     const HloModule* module, const HloModuleSequence& module_sequence)
-    : module_(module), module_sequence_(module_sequence) {
+    : HloOrdering(module), module_sequence_(module_sequence) {
   // Create a map from instruction to its order position.
   for (auto computation_order : module_sequence_) {
     const std::vector<const HloInstruction*>& order = computation_order.second;
@@ -97,12 +177,9 @@ SequentialHloOrdering::SequentialHloOrdering(
   }
 }
 
-bool SequentialHloOrdering::ExecutesBefore(const HloInstruction* a,
-                                           const HloInstruction* b) const {
-  // Instructions in different computations are unordered.
-  if (a->parent() != b->parent()) {
-    return false;
-  }
+bool SequentialHloOrdering::ExecutesBeforeInSameComputation(
+    const HloInstruction* a, const HloInstruction* b) const {
+  CHECK_EQ(a->parent(), b->parent());
   // If either instruction is not in the order, then 'a' and 'b' are unordered.
   if (order_position_.count(a) == 0 || order_position_.count(b) == 0) {
     return false;
@@ -144,23 +221,6 @@ string SequentialHloOrdering::ToString() const {
   return tensorflow::str_util::Join(pieces, "\n");
 }
 
-namespace {
-StatusOr<int64> MinimumMemoryForSequence(
-    const HloComputation& computation,
-    const std::vector<const HloInstruction*>& sequence,
-    const TuplePointsToAnalysis& points_to_analysis,
-    const LogicalBuffer::SizeFunction& size_function) {
-  // The absolute minimum memory required for a given sequence of instructions
-  // is determined by the sequence of Alloc and Free calls on a simulated heap,
-  // ignoring fragmentation.
-  TF_ASSIGN_OR_RETURN(
-      HeapSimulator::Result result,
-      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), sequence,
-                         computation, points_to_analysis, size_function));
-  return result.heap_size;
-}
-}  // namespace
-
 StatusOr<int64> MinimumMemoryForSequence(
     const SequentialHloOrdering::HloModuleSequence& module_sequence,
     const LogicalBuffer::SizeFunction& size_function) {
@@ -172,17 +232,16 @@ StatusOr<int64> MinimumMemoryForSequence(
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
                       TuplePointsToAnalysis::Run(module));
 
-  int64 total_memory = 0;
-  for (const auto& pair : module_sequence) {
-    const HloComputation* computation = pair.first;
-    const std::vector<const HloInstruction*>& sequence = pair.second;
-    TF_ASSIGN_OR_RETURN(
-        const int64 memory,
-        MinimumMemoryForSequence(*computation, sequence, *points_to_analysis,
-                                 size_function));
-    total_memory += memory;
-  }
-  return total_memory;
+  // The absolute minimum memory required for a given sequence of instructions
+  // is determined by the sequence of Alloc and Free calls on a simulated heap,
+  // ignoring fragmentation. We run the heap simulation on the whole module,
+  // rather than summing each computation, since it gives us a better lower
+  // bound, by minimizing the liveness of sub-computations.
+  TF_ASSIGN_OR_RETURN(
+      HeapSimulator::Result result,
+      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), *module,
+                         module_sequence, *points_to_analysis, size_function));
+  return result.heap_size;
 }
 
 namespace {
@@ -284,7 +343,7 @@ class ListScheduler {
     return freed_bytes;
   }
 
-  // Construct the scheduling priority of the given instruciton.
+  // Construct the scheduling priority of the given instruction.
   Priority GetPriority(const HloInstruction* instruction) {
     return {BytesFreedIfScheduled(instruction), instruction->user_count()};
   }
@@ -439,6 +498,18 @@ StatusOr<std::vector<const HloInstruction*>> RunDFSMemoryScheduler(
   return sequence;
 }
 
+StatusOr<int64> MinimumMemoryForComputation(
+    const HloComputation& computation,
+    const std::vector<const HloInstruction*>& sequence,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const LogicalBuffer::SizeFunction& size_function) {
+  TF_ASSIGN_OR_RETURN(
+      HeapSimulator::Result result,
+      HeapSimulator::Run(MakeUnique<NoFragmentationStatsHeap>(), computation,
+                         sequence, points_to_analysis, size_function));
+  return result.heap_size;
+}
+
 StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
     const HloComputation& computation,
     const TuplePointsToAnalysis& points_to_analysis,
@@ -446,13 +517,17 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
   // We try both a list-scheduler based ordering and a DFS based ordering, and
   // choose whichever returns a lower min-memory, not accounting for
   // fragmentation.
+  //
+  // Note that this is just a heuristic. One obvious inaccuracy is that the
+  // memory required for sub-computations might be different when considered
+  // within the caller's context. But it's good enough for now.
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> list_sequence,
       ListScheduler::Run(computation, points_to_analysis, size_function));
   TF_ASSIGN_OR_RETURN(
       const int64 list_memory,
-      MinimumMemoryForSequence(computation, list_sequence, points_to_analysis,
-                               size_function));
+      MinimumMemoryForComputation(computation, list_sequence,
+                                  points_to_analysis, size_function));
   VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes";
 
   TF_ASSIGN_OR_RETURN(
@@ -460,8 +535,8 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
       RunDFSMemoryScheduler(computation, points_to_analysis, size_function));
   TF_ASSIGN_OR_RETURN(
       const int64 dfs_memory,
-      MinimumMemoryForSequence(computation, dfs_sequence, points_to_analysis,
-                               size_function));
+      MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
+                                  size_function));
   VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes";
 
   if (list_memory <= dfs_memory) {
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h
index e964c4c51ae14f89d1f1b0450990cfc50c8a74be..d2db18be0009b1ca62b538d3975e1a0a105c5e83 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.h
+++ b/tensorflow/compiler/xla/service/hlo_ordering.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/service/call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
@@ -36,13 +37,13 @@ namespace xla {
 // buffers.
 class HloOrdering {
  public:
-  HloOrdering() = default;
+  HloOrdering(const HloModule* module)
+      : module_(module), call_graph_(CallGraph::Build(module)) {}
   virtual ~HloOrdering() = default;
 
   // Returns true if instruction 'a' executes before instruction 'b'. This is
   // not reflexive, that is, an instruction does not execute before itself.
-  virtual bool ExecutesBefore(const HloInstruction* a,
-                              const HloInstruction* b) const = 0;
+  bool ExecutesBefore(const HloInstruction* a, const HloInstruction* b) const;
 
   // Returns the sequential instruction order for the given computation, or
   // nullptr if the computation does not have a sequential ordering.
@@ -50,6 +51,21 @@ class HloOrdering {
       const HloComputation& computation) const = 0;
 
   virtual string ToString() const = 0;
+
+ protected:
+  // Returns true if instruction 'a' executes before instruction 'b'.
+  // Precondition: 'a' and 'b' are in the same computation.
+  //
+  // Derived classes should implement this method for determining order of
+  // instructions in the same comptuation. ExecutesBefore() analyzes the
+  // callgraph and uses this method to determine ordering of instructions in
+  // different computations.
+  virtual bool ExecutesBeforeInSameComputation(
+      const HloInstruction* a, const HloInstruction* b) const = 0;
+
+  const HloModule* module_;
+
+  std::unique_ptr<CallGraph> call_graph_;
 };
 
 // Base class for partial orderings implemented by a map of strict predecessors
@@ -58,11 +74,6 @@ class PredecessorHloOrdering : public HloOrdering {
  public:
   ~PredecessorHloOrdering() override = default;
 
-  // Returns true if instruction 'a' executes before instruction 'b'.
-  // Instructions in different computations are not ordered.
-  bool ExecutesBefore(const HloInstruction* a,
-                      const HloInstruction* b) const override;
-
   // Returns nullptr indicating the computation does not have a sequential
   // ordering.
   const std::vector<const HloInstruction*>* SequentialOrder(
@@ -74,11 +85,12 @@ class PredecessorHloOrdering : public HloOrdering {
   explicit PredecessorHloOrdering(const HloModule* module);
   string ToStringHelper(const string& name) const;
 
-  const HloModule* module_;
+  bool ExecutesBeforeInSameComputation(const HloInstruction* a,
+                                       const HloInstruction* b) const override;
 
-  // For each each computation in the module, this is the set of the
-  // instruction's strict predecessors. An instruction is not an element of its
-  // own strict predecessor set.
+  // For each computation in the module, this is the set of the instruction's
+  // strict predecessors. An instruction is not an element of its own strict
+  // predecessor set.
   //
   // Subclasses should fill this in to define the desired ordering.
   tensorflow::gtl::FlatMap<const HloComputation*,
@@ -150,12 +162,6 @@ class SequentialHloOrdering : public HloOrdering {
                         const HloModuleSequence& module_sequence);
   ~SequentialHloOrdering() override = default;
 
-  // Instruction 'a' executes before 'b' if 'a' appears before 'b' in the
-  // instruction sequence for the computation. Instructions in different
-  // computations are unordered.
-  bool ExecutesBefore(const HloInstruction* a,
-                      const HloInstruction* b) const override;
-
   // Returns the sequential instruction order for the given computation.
   const std::vector<const HloInstruction*>* SequentialOrder(
       const HloComputation& computation) const override;
@@ -163,7 +169,9 @@ class SequentialHloOrdering : public HloOrdering {
   string ToString() const override;
 
  protected:
-  const HloModule* module_;
+  bool ExecutesBeforeInSameComputation(const HloInstruction* a,
+                                       const HloInstruction* b) const override;
+
   const HloModuleSequence module_sequence_;
 
   // The position of every instruction in the HLO module in its respective
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 425bee601a8d6357e21d3d00f8ccf5d69af03862..c387fbb89b196c340852db057754f85e3e5435f3 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -78,6 +78,142 @@ TEST_F(HloOrderingTest, LastUseScheduledFirst) {
   EXPECT_TRUE(ordering.ExecutesBefore(add, negate));
 }
 
+TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
+  // Tests the ordering of instructions in different computations using the
+  // following HLO code:
+  //
+  // Entry computation:
+  //   %x = Call(A, {})
+  //   %y = Call(B, {%x})
+  //
+  // Computation A:
+  //   %a = Call(C, {})
+  //
+  // Computation B:
+  //   %b = Call(C, {})
+  //
+  // Computation C:
+  //   %c = Constant(42.0f)
+  //
+  // This results in a diamond-shaped callgraph.
+  HloModule module(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+
+  auto builder_c = HloComputation::Builder("C");
+  HloInstruction* c = builder_c.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
+  HloComputation* computation_c =
+      module.AddEmbeddedComputation(builder_c.Build());
+
+  auto builder_b = HloComputation::Builder("B");
+  builder_b.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param"));
+  HloInstruction* b = builder_b.AddInstruction(
+      HloInstruction::CreateCall(scalar_shape, {}, computation_c));
+  HloComputation* computation_b =
+      module.AddEmbeddedComputation(builder_b.Build());
+
+  auto builder_a = HloComputation::Builder("A");
+  HloInstruction* a = builder_a.AddInstruction(
+      HloInstruction::CreateCall(scalar_shape, {}, computation_c));
+  HloComputation* computation_a =
+      module.AddEmbeddedComputation(builder_a.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* x = builder.AddInstruction(
+      HloInstruction::CreateCall(scalar_shape, {}, computation_a));
+  HloInstruction* y = builder.AddInstruction(
+      HloInstruction::CreateCall(scalar_shape, {x}, computation_b));
+  module.AddEntryComputation(builder.Build());
+
+  DependencyHloOrdering ordering(&module);
+  EXPECT_TRUE(ordering.ExecutesBefore(x, y));
+  EXPECT_FALSE(ordering.ExecutesBefore(y, x));
+
+  EXPECT_TRUE(ordering.ExecutesBefore(a, b));
+  EXPECT_FALSE(ordering.ExecutesBefore(b, a));
+
+  EXPECT_FALSE(ordering.ExecutesBefore(a, x));
+  EXPECT_TRUE(ordering.ExecutesBefore(a, y));
+  EXPECT_FALSE(ordering.ExecutesBefore(x, a));
+  EXPECT_FALSE(ordering.ExecutesBefore(y, a));
+
+  EXPECT_FALSE(ordering.ExecutesBefore(b, x));
+  EXPECT_FALSE(ordering.ExecutesBefore(b, y));
+  EXPECT_TRUE(ordering.ExecutesBefore(x, b));
+  EXPECT_FALSE(ordering.ExecutesBefore(y, b));
+
+  // Instruction 'c' is called from multiple callsites and should be unordered
+  // relative to all other instructions in the module.
+  EXPECT_FALSE(ordering.ExecutesBefore(c, a));
+  EXPECT_FALSE(ordering.ExecutesBefore(c, b));
+  EXPECT_FALSE(ordering.ExecutesBefore(c, x));
+  EXPECT_FALSE(ordering.ExecutesBefore(c, y));
+  EXPECT_FALSE(ordering.ExecutesBefore(a, c));
+  EXPECT_FALSE(ordering.ExecutesBefore(b, c));
+  EXPECT_FALSE(ordering.ExecutesBefore(x, c));
+  EXPECT_FALSE(ordering.ExecutesBefore(y, c));
+}
+
+class MinimumMemoryForSequenceTest : public HloTestBase {};
+
+TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
+  HloModule module(TestName());
+  const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  auto cond_builder = HloComputation::Builder("WhileCond");
+  // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element)
+  HloInstruction* cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "cond_param"));
+  HloInstruction* cond_iter = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0));
+  HloInstruction* cond_data = cond_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1));
+  // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte)
+  HloInstruction* cond_lt = cond_builder.AddInstruction(
+      HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}),
+                                   HloOpcode::kLt, cond_iter, cond_data));
+  HloComputation* cond_computation =
+      module.AddEmbeddedComputation(cond_builder.Build());
+
+  auto body_builder = HloComputation::Builder("WhileBody");
+  // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element)
+  HloInstruction* body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "body_param"));
+  HloComputation* body_computation =
+      module.AddEmbeddedComputation(body_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  // Entry params: 8 bytes (4 bytes per param), TOTAL=8
+  HloInstruction* iter = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "param_iter"));
+  HloInstruction* data = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "param_data"));
+  // Tuple: 16 bytes (8 bytes per pointer), TOTAL=24
+  HloInstruction* tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({iter, data}));
+  // While: 8 bytes (4 bytes per element), TOTAL=32
+  // Both cond and body use a max of 24 bytes, TOTAL=56
+  HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile(
+      tuple_shape, cond_computation, body_computation, tuple));
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build());
+
+  auto size_fn = [](const LogicalBuffer& buffer) {
+    return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  };
+
+  SequentialHloOrdering::HloModuleSequence module_sequence;
+  module_sequence[cond_computation] = {cond_param, cond_iter, cond_data,
+                                       cond_lt};
+  module_sequence[body_computation] = {body_param};
+  module_sequence[entry_computation] = {iter, data, tuple, while_op};
+  EXPECT_EQ(56,
+            MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie());
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 6e3c983071245c548914bd9eecd0d1e86bc64d99..78aebe9c36dfb5f63099f5e2df7bffe8529b08de 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -40,11 +40,19 @@ void DumpModule(const Compiler::HloDumper& dumper_, const HloModule& module,
 }  // namespace
 
 StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
+  run_called_ = true;
+
+  VLOG(1) << "Running HLO pass pipeline " << name();
+
   legacy_flags::HloPassPipelineFlags* flags =
       legacy_flags::GetHloPassPipelineFlags();
   std::vector<string> tmp =
       tensorflow::str_util::Split(flags->xla_disable_hlo_passes, ',');
   tensorflow::gtl::FlatSet<string> disabled_passes(tmp.begin(), tmp.end());
+  if (!disabled_passes.empty()) {
+    VLOG(1) << "Passes disabled by --xla_disable_hlo_passes: "
+            << tensorflow::str_util::Join(disabled_passes, ", ");
+  }
 
   auto run_invariant_checkers = [this, module]() -> Status {
     for (auto& invariant_checker : invariant_checkers_) {
@@ -60,9 +68,13 @@ StatusOr<bool> HloPassPipeline::Run(HloModule* module) {
   for (auto& pass : passes_) {
     if (!disabled_passes.empty() &&
         disabled_passes.count(pass->name().ToString()) > 0) {
+      VLOG(1) << "  Skipping HLO pass " << pass->name()
+              << ", disabled by --xla_disable_hlo_passes";
       continue;
     }
 
+    VLOG(1) << "  HLO pass " << pass->name();
+
     // Emit label containing: "after foo-pass, before bar-pass".
     message.clear();
     StrAppend(&message, prefix, ", before ", pass->name());
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
index a8c2d518730b9fab8febaae35797ea4a315ab9b1..682c4b952df6aae8cb933c222772dbd823070ecc 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
@@ -47,6 +47,7 @@ class HloPassPipeline : public HloPassInterface {
   // Returns a reference to the added pass.
   template <typename T, typename... Args>
   T& AddPass(Args&&... args) {
+    CHECK(!run_called_) << "AddPass cannot be called after Run";
     auto pass = new T(std::forward<Args>(args)...);
     passes_.push_back(std::unique_ptr<T>(pass));
     return *pass;
@@ -57,6 +58,7 @@ class HloPassPipeline : public HloPassInterface {
   // (it is required to always return "false" from its Run() method).
   template <typename T, typename... Args>
   T& AddInvariantChecker(Args&&... args) {
+    CHECK(!run_called_) << "AddInvariantChecker cannot be called after Run";
     auto pass = new T(std::forward<Args>(args)...);
     invariant_checkers_.push_back(std::unique_ptr<T>(pass));
     return *pass;
@@ -70,6 +72,7 @@ class HloPassPipeline : public HloPassInterface {
   Compiler::HloDumper dumper_;
   std::vector<std::unique_ptr<HloPassInterface>> passes_;
   std::vector<std::unique_ptr<HloPassInterface>> invariant_checkers_;
+  bool run_called_ = false;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HloPassPipeline);
 };
diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc
index 1556d1772f934ea02506aff27396034814d61698..a153d73dbd838663c0d7e0d72ad54668f243f2c2 100644
--- a/tensorflow/compiler/xla/service/hlo_query.cc
+++ b/tensorflow/compiler/xla/service/hlo_query.cc
@@ -32,6 +32,16 @@ bool IsConstantR0F32(HloInstruction* instruction, float* out) {
   return false;
 }
 
+bool AllOperandsAreParametersOrConstants(const HloInstruction& instruction) {
+  for (const auto& operand : instruction.operands()) {
+    if (operand->opcode() != HloOpcode::kParameter &&
+        operand->opcode() != HloOpcode::kConstant) {
+      return false;
+    }
+  }
+  return true;
+}
+
 bool AllOperandsAreParameters(const HloInstruction& instruction) {
   for (const auto& operand : instruction.operands()) {
     if (operand->opcode() != HloOpcode::kParameter) {
@@ -41,6 +51,15 @@ bool AllOperandsAreParameters(const HloInstruction& instruction) {
   return true;
 }
 
+bool AllOperandsAreConstants(const HloInstruction& instruction) {
+  for (const auto& operand : instruction.operands()) {
+    if (operand->opcode() != HloOpcode::kConstant) {
+      return false;
+    }
+  }
+  return true;
+}
+
 HloInstruction* GetMatchingOperand(
     std::function<bool(const HloInstruction*)> matcher,
     HloInstruction* instruction) {
diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h
index 864f892e92047e6f39b2949854190522b2f4a906..c79347bbf9d6146943b7b787f713369cb37fadee 100644
--- a/tensorflow/compiler/xla/service/hlo_query.h
+++ b/tensorflow/compiler/xla/service/hlo_query.h
@@ -28,9 +28,16 @@ namespace hlo_query {
 // Precondition: out != nullptr
 bool IsConstantR0F32(HloInstruction* instruction, float* out);
 
+// Returns whether all of an instruction's operands are of the types constants
+// and parameters.
+bool AllOperandsAreParametersOrConstants(const HloInstruction& instruction);
+
 // Returns whether all of an instruction's operands are parameters.
 bool AllOperandsAreParameters(const HloInstruction& instruction);
 
+// Returns whether all of an instruction's operands are constants.
+bool AllOperandsAreConstants(const HloInstruction& instruction);
+
 // Returns whether the instruction is a scalar constant.
 bool IsScalarConstant(const HloInstruction* instruction);
 
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 52a0181029ddb7eb373bb9e9f91e2899c3140c71..5d4fd7c2deae7e1b03f49f123e2aff174ab34667 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -22,14 +22,15 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -45,63 +46,58 @@ namespace xla {
 
 namespace {
 
-// Returns a vector of the operands of 'instruction' with repeated elements
-// removed.
-std::vector<HloInstruction*> UniqueOperands(const HloInstruction* instruction) {
-  std::vector<HloInstruction*> unique_operands;
-  for (HloInstruction* operand : instruction->operands()) {
-    if (std::find(unique_operands.begin(), unique_operands.end(), operand) ==
-        unique_operands.end()) {
-      unique_operands.push_back(operand);
-    }
-  }
-  return unique_operands;
-}
-
 // Returns true if the given instruction is rematerializable.
 bool IsRematerializable(const HloInstruction* instruction) {
+  // Conservatively, don't rematerialize instruction with control
+  // dependencies. For one, control dependencies are added to prevent
+  // interference of aliased buffers (say, in while bodies) and
+  // rematerialization is ignorant of liveness and may break the intended
+  // ordering.
+  if (!instruction->control_predecessors().empty() ||
+      !instruction->control_successors().empty()) {
+    return false;
+  }
+
   // Don't rematerialize instructions with side effects, those with a cost that
   // might not be captured by HloCostAnalysis, or instructions which cannot be
   // cloned safely.
   switch (instruction->opcode()) {
     case HloOpcode::kCall:
+    case HloOpcode::kConstant:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kCustomCall:
     case HloOpcode::kOutfeed:
     case HloOpcode::kInfeed:
+    case HloOpcode::kParameter:
     case HloOpcode::kRecv:
     case HloOpcode::kSend:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
       return false;
     default:
-      break;
-  }
-
-  // Skip tuple shapes because we do not currently account for buffer aliasing
-  // properly which results in improperly accounting of rematerialization cost
-  // for these shapes.
-  if (ShapeUtil::IsTuple(instruction->shape())) {
-    return false;
-  }
-  for (auto* operand : instruction->operands()) {
-    if (ShapeUtil::IsTuple(operand->shape())) {
-      return false;
-    }
+      return true;
   }
-
-  return true;
 }
 
-// Class which maintains an ordered list of instructions with fast insertion and
-// removal of arbitrary elements.
+// Class which maintains an ordered list of instructions with fast insertion
+// before arbitrary elements.
 class InstructionList {
  public:
   explicit InstructionList(const std::vector<const HloInstruction*> order) {
+    int64 position = 0;
     for (const HloInstruction* inst : order) {
       instructions_.push_back(const_cast<HloInstruction*>(inst));
       instruction_iterators_.insert({const_cast<HloInstruction*>(inst),
                                      std::next(instructions_.end(), -1)});
+      // Initially position numbers are uniquely assigned in order. Later as
+      // instructions are added with InsertBefore* methods, some instructions
+      // may have duplicate position numbers, but the values will be guaranteed
+      // to be monotonically increasing through the list, and so is still useful
+      // for quickly(-ish) determining the order of arbitrary instructions in
+      // the list.
+      position_number_[inst] = position;
+      first_at_position_[position] = inst;
+      position++;
     }
   }
 
@@ -110,22 +106,63 @@ class InstructionList {
     return instructions_;
   }
 
-  // Insert instruction 'to_insert' before instruction 'before' in the list.
-  Status InsertBefore(HloInstruction* to_insert, HloInstruction* before) {
+  // Insert instruction 'to_insert' immediately before instruction 'before' in
+  // the list.
+  void InsertBefore(HloInstruction* to_insert, HloInstruction* before) {
+    VLOG(3) << "InsertBefore: " << to_insert->name() << " before "
+            << before->name();
     auto it = instruction_iterators_.find(before);
-    TF_RET_CHECK(it != instruction_iterators_.end());
+    CHECK(it != instruction_iterators_.end());
     instruction_iterators_.insert(
         {to_insert, instructions_.insert(it->second, to_insert)});
-    return Status::OK();
+    // Assign the same position number to the newly added instruction as
+    // 'before'. This guarantees monotonicity of the position numbers, but not
+    // uniqueness.
+    int64 pos = position_number_.at(before);
+    position_number_[to_insert] = pos;
+    if (first_at_position_.at(pos) == before) {
+      first_at_position_[pos] = to_insert;
+    }
   }
 
-  // Removes instruction from the list.
-  Status Remove(HloInstruction* instruction) {
-    auto it = instruction_iterators_.find(instruction);
-    TF_RET_CHECK(it != instruction_iterators_.end());
-    instructions_.erase(it->second);
-    instruction_iterators_.erase(it);
-    return Status::OK();
+  // Insert instruction 'to_insert' immediately before the earliest instruction
+  // in 'before_instructions'.
+  void InsertBeforeInstructions(
+      HloInstruction* to_insert,
+      tensorflow::gtl::ArraySlice<HloInstruction*> before_instructions) {
+    VLOG(3) << "InsertBeforeInstructions: " << to_insert->name() << " before {"
+            << tensorflow::str_util::Join(
+                   before_instructions, ", ",
+                   [](string* out, HloInstruction* inst) {
+                     tensorflow::strings::StrAppend(out, inst->name());
+                   })
+            << "}";
+
+    // Find the minimal position number of any instruction in
+    // 'before_instructions'.
+    CHECK(!before_instructions.empty());
+    int64 min_position_number = std::numeric_limits<int64>::max();
+    for (const HloInstruction* instruction : before_instructions) {
+      min_position_number =
+          std::min(min_position_number, position_number_.at(instruction));
+    }
+
+    // Because more than one instruction in 'before_instructions' may have a
+    // position number of 'min_position_number', find the first such instruction
+    // with position number 'min_position_number'.
+    for (auto it = instruction_iterators_.at(
+             first_at_position_.at(min_position_number));
+         it != instructions_.end() &&
+         position_number_.at(*it) == min_position_number;
+         ++it) {
+      if (std::find(before_instructions.begin(), before_instructions.end(),
+                    *it) != before_instructions.end()) {
+        return InsertBefore(to_insert, *it);
+      }
+    }
+    LOG(FATAL) << "Expected to find instruction in before_instructions with "
+                  "position number "
+               << min_position_number;
   }
 
  private:
@@ -136,283 +173,630 @@ class InstructionList {
   tensorflow::gtl::FlatMap<const HloInstruction*,
                            std::list<HloInstruction*>::iterator>
       instruction_iterators_;
+
+  // A number assigned to each instruction which increases monotonically through
+  // 'instructions_'. Used to facilitate fast insertion of an instruction before
+  // the earliest instruction in a set of instructions
+  // (InsertBeforeInstructions) by enabling fast-ish ordering queries between
+  // instructions. If position_number_[a] < position_number_[b] then 'a' comes
+  // before 'b' in the list. If the position numbers are the same then nothing
+  // can be said about their order without examining the list.
+  //
+  // On object construction this value is precisely the instruction's ordinal
+  // position in the list. Instructions inserted via InsertBefore receive
+  // duplicate values. However, monotonicity is preserved.
+  tensorflow::gtl::FlatMap<const HloInstruction*, int64> position_number_;
+
+  // The first instruction in the list assigned a particular position number.
+  tensorflow::gtl::FlatMap<int64, const HloInstruction*> first_at_position_;
 };
 
+// Return the HloInstructions which use the given LogicalBuffer. Sets
+// has_indirect_users to whether any of the uses is indirect. A use is indirect
+// if the instruction defining logical_buffer is not an operand of the use. This
+// can happen via buffer aliasing (eg, tuples).
+std::vector<const HloInstruction*> GetUsers(
+    const LogicalBuffer* logical_buffer,
+    const TuplePointsToAnalysis& points_to_analysis, bool* has_indirect_users) {
+  std::vector<const HloInstruction*> users;
+  // To identify uses iterate through all HloInstruction users of the
+  // BufferAliases of the logical buffer.
+  *has_indirect_users = false;
+  for (const BufferAlias& buffer_alias :
+       points_to_analysis.GetBufferAliases(*logical_buffer)) {
+    for (const HloInstruction* user : buffer_alias.instruction()->users()) {
+      if (DoesNotUseOperandBuffer(buffer_alias.instruction(),
+                                  buffer_alias.index(), user,
+                                  points_to_analysis)) {
+        // The alias may be an operand of 'user', but the LogicalBuffer cannot
+        // possibly be used by the instruction so ignore 'user'. This is the
+        // case, for example, for the tuple element buffers in a GetTupleElement
+        // instruction (the GTE instruction only uses the pointer vector).
+        continue;
+      }
+      if (buffer_alias.instruction() != logical_buffer->instruction()) {
+        *has_indirect_users = true;
+      }
+      // A buffer may be used by the instruction via more than one alias. For
+      // example, a buffer which appears in more than one element of a tuple.
+      if (std::find(users.begin(), users.end(), user) == users.end()) {
+        users.push_back(user);
+      }
+    }
+  }
+  return users;
+}
+
 // Class for tracking memory usage of a computation as the instructions are
-// placed sequentially. Memory usage is the sum of live values at the current
-// point in the instruction sequence.
+// placed sequentially. Memory usage is the sum of the sizes of live values
+// (LogicalBuffers) at the current point in the instruction sequence.
 class MemoryUsageTracker {
  public:
   MemoryUsageTracker(
       const HloComputation* computation,
-      const HloRematerialization::ShapeSizeFunction& size_function)
-      : computation_(computation), size_function_(size_function) {
-    for (const std::unique_ptr<HloInstruction>& instruction :
-         computation->instructions()) {
-      // Initially only live-in values occupy memory.
-      if (IsLiveIn(instruction.get())) {
-        memory_usage_ += TotalSizeBytes(instruction->shape());
-      }
+      const HloRematerialization::ShapeSizeFunction& size_function,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const InstructionList& instruction_list);
+
+  // Starts the placement of the given instruction. This adds the sizes of the
+  // LogicalBuffers defined by the instruction to the current memory
+  // usage. Placement is broken into two steps (BeginInstruction and
+  // EndInstruction) to accurately model memory usage. At BeginInstruction the
+  // memory for the output value(s) of the current instruction is allocated. At
+  // EndInstruction memory for dead operand(s) is freed.
+  Status BeginInstruction(const HloInstruction* instruction);
+
+  // Finishes the placement of the current instruction. This frees any dead
+  // operands or dead result of the instruction. This must be called after
+  // each call to BeginInstruction.
+  Status EndInstruction();
+
+  // Returns the number of bytes that the current memory usage will be reduced
+  // if the given instruction is rematerialized.
+  int64 MemoryReducedIfRematerialized(const HloInstruction* instruction) const;
+
+  // Adjusts memory usage to account for the rematerialization of
+  // original_instruction for all remaining unplaced uses. The rematerialization
+  // is remat_instruction. This method should be called after the HLO graph has
+  // been transformed (rematerialization instruction created and connected to
+  // uses).
+  Status AddRematerializedInstruction(HloInstruction* original_instruction,
+                                      HloInstruction* remat_instruction);
+
+  // Returns whether the given instruction has been placed (BeginInstruction
+  // has been called with 'instruction' as the argument).
+  bool IsPlaced(const HloInstruction* instruction) const {
+    return ContainsKey(placed_instructions_, instruction);
+  }
+
+  // Returns the current memory usage. This is the sum of sizes of all live
+  // values.
+  int64 memory_usage() const { return memory_usage_; }
+
+  // Returns the current instruction being placed.
+  const HloInstruction* in_progress_instruction() const {
+    return in_progress_instruction_;
+  }
+
+  // Check invariants of the data structure. This is expensive to call.
+  bool Check() const;
+
+  string ToString() const;
+
+ private:
+  // Type holding a unique identifier for each Buffer object.
+  using BufferId = int64;
+
+  // A Buffer represents a single LogicalBuffer in the computation including
+  // various metadata useful for tracking liveness of the value. A LogicalBuffer
+  // is not used directly because the HLO graph is transformed and
+  // TuplePointsToAnalysis which owns all LogicalBuffers cannot be updated after
+  // HLO graph transformations.
+  struct Buffer {
+    // The unique id of this Buffer. This value is equal to the buffer's index
+    // in the vector buffers_.
+    const BufferId id;
+
+    // The instruction which defines this buffer.
+    const HloInstruction* defining_instruction;
+
+    // The materialized size of the buffer in bytes.
+    const int64 size;
+
+    // Whether this buffer is live-out of the computation.
+    bool live_out;
+
+    // Whether this buffer has indirect uses. Ie, an instruction which is not a
+    // user of defining_instruction uses this buffer. This can occur due to
+    // buffer aliasing (eg, tuples).
+    bool has_indirect_uses;
+
+    // The instructions which use this buffer.
+    std::vector<const HloInstruction*> users;
+
+    // The number of users (HloInstructions) of this buffer which have not yet
+    // been placed in the sequence.
+    int64 unfinished_user_count;
+
+    string ToString() const {
+      return tensorflow::strings::StrCat("Buffer ", id, " (defined by ",
+                                         defining_instruction->name(),
+                                         ", size ", size, " bytes)");
     }
+  };
+
+  // Creates a Buffer representing the given logical buffer. The buffer is added
+  // to buffers_ and a reference is returned.
+  Buffer& CreateBufferFromLogicalBuffer(
+      const LogicalBuffer* logical_buffer,
+      const TuplePointsToAnalysis& points_to_analysis,
+      const HloRematerialization::ShapeSizeFunction& size_function,
+      bool live_out) {
+    bool has_indirect_uses = false;
+    std::vector<const HloInstruction*> users =
+        GetUsers(logical_buffer, points_to_analysis, &has_indirect_uses);
+    return NewBuffer(logical_buffer->instruction(),
+                     size_function(logical_buffer->shape()), std::move(users),
+                     live_out, has_indirect_uses);
   }
 
-  // Starts the placement of the given instruction. This adds the output size of
-  // the instruction to the current memory usage. Placement is broken into two
-  // steps (BeginInstruction and EndInstruction) to accurately model memory
-  // usage. At BeginInstruction the memory for the output value of the current
-  // instruction is allocated. At EndInstruction memory for dead operands is
-  // freed.
-  Status BeginInstruction(const HloInstruction* instruction) {
-    VLOG(3) << "BeginInstruction " << instruction->name();
-    TF_RET_CHECK(in_progress_instruction_ == nullptr);
-    in_progress_instruction_ = instruction;
-
-    // Add instruction to remaining_uses_.
-    TF_RET_CHECK(!ContainsKey(remaining_uses_, instruction));
-    std::vector<HloInstruction*>& instruction_uses =
-        remaining_uses_[instruction];
-    instruction_uses.insert(instruction_uses.begin(),
-                            instruction->users().begin(),
-                            instruction->users().end());
-
-    if (!IsLiveIn(instruction)) {
-      // Instruction was not previously live so add output size to memory usage.
-      memory_usage_ += TotalSizeBytes(instruction->shape());
+  // Create a new buffer representing a rematerialization of given buffer for
+  // the given uses.
+  Buffer& RematerializeBuffer(
+      const Buffer& original_buffer, const HloInstruction* remat_instruction,
+      std::vector<const HloInstruction*>&& rematerialized_uses) {
+    CHECK(IsPlaced(original_buffer.defining_instruction));
+    CHECK(!original_buffer.has_indirect_uses);
+    CHECK(!original_buffer.live_out);
+    for (const HloInstruction* use : rematerialized_uses) {
+      CHECK(!IsPlaced(use));
     }
+    return NewBuffer(remat_instruction, original_buffer.size,
+                     std::move(rematerialized_uses), /*live_out=*/false,
+                     /*has_indirect_uses=*/false);
+  }
+
+  // Return number of bytes allocated for the buffer with the given id. Buffers
+  // allocated by the calling computation (eg, parameter and output buffers) are
+  // considered to have zero bytes because the memory is accounted for in a
+  // different computation.
+  int64 AllocatedSize(BufferId buffer_id) const {
+    const Buffer& buffer = buffers_.at(buffer_id);
+    HloOpcode def_opcode = buffer.defining_instruction->opcode();
+    if (buffer.live_out || def_opcode == HloOpcode::kParameter) {
+      return 0;
+    } else {
+      return buffer.size;
+    }
+  }
 
-    VLOG(3) << "  memory usage = " << memory_usage_;
-    VLOG(10) << ToString();
-    return Status::OK();
+  // Returns true if BeginInstruction and EndInstruction has been called for the
+  // given instruction.
+  bool IsFinished(const HloInstruction* instruction) const {
+    return IsPlaced(instruction) && instruction != in_progress_instruction_;
   }
 
-  // Finishes the placement of the current instruction. This frees any dead
-  // operands or dead result of the instruction. This must be called after each
-  // call to BeginInstruction.
-  Status EndInstruction() {
-    TF_RET_CHECK(in_progress_instruction_ != nullptr);
-    VLOG(3) << "EndInstruction " << in_progress_instruction_->name();
-
-    for (HloInstruction* operand : UniqueOperands(in_progress_instruction_)) {
-      TF_RET_CHECK(ContainsKey(remaining_uses_, operand));
-      std::vector<HloInstruction*>& uses = remaining_uses_.at(operand);
-      auto it = std::find(uses.begin(), uses.end(), in_progress_instruction_);
-      TF_RET_CHECK(it != uses.end());
-      uses.erase(it);
-
-      if (uses.empty()) {
-        // Operand is dead.
-        int64 operand_size = TotalSizeBytes(operand->shape());
-        if (!IsLiveOut(operand)) {
-          VLOG(4) << operand->name() << " ("
-                  << HumanReadableNumBytes(operand_size) << ") is dead";
-          memory_usage_ -= operand_size;
-          TF_RET_CHECK(memory_usage_ >= 0);
+  // Returns whether the given buffer is being used by the in-progress
+  // instruction.
+  bool IsInUse(BufferId buffer_id) const {
+    if (in_progress_instruction_ == nullptr) {
+      return false;
+    }
+    const std::vector<BufferId>& in_progress_uses =
+        buffers_used_by_instruction_.at(in_progress_instruction_);
+    return std::find(in_progress_uses.begin(), in_progress_uses.end(),
+                     buffer_id) != in_progress_uses.end();
+  }
+
+  // Returns whether the given instruction is live at the current program
+  // point.
+  bool IsCurrentlyLive(BufferId buffer_id) const {
+    const Buffer& buffer = buffers_[buffer_id];
+    return (IsPlaced(buffer.defining_instruction) &&
+            buffer.unfinished_user_count > 0);
+  }
+
+  // Create a new buffer, add it to buffers_, and return a reference.
+  Buffer& NewBuffer(const HloInstruction* defining_instruction, int64 size,
+                    std::vector<const HloInstruction*>&& users, bool live_out,
+                    bool has_indirect_uses) {
+    int buffer_id = buffers_.size();
+    buffers_.push_back(Buffer{buffer_id, defining_instruction, size, live_out,
+                              has_indirect_uses, users,
+                              static_cast<int64>(users.size())});
+    return buffers_.back();
+  }
+
+  const HloComputation* computation_;
+
+  // Instruction list containing the ordering of instructions in
+  // computation_. This is the order in which instructions are placed
+  // (BeginInstruction/EndInstruction calls).
+  const InstructionList& instruction_list_;
+
+  // Memory usage at the currently placed instruction.
+  int64 memory_usage_ = 0;
+
+  // The instruction currently being placed. This value is non-null only
+  // between the calling of BeginInstruction and EndInstruction.
+  const HloInstruction* in_progress_instruction_ = nullptr;
+
+  // The buffers defined by each instruction.
+  std::unordered_map<const HloInstruction*, std::vector<BufferId>>
+      buffers_defined_by_instruction_;
+
+  // The buffers used by each instruction.
+  std::unordered_map<const HloInstruction*, std::vector<BufferId>>
+      buffers_used_by_instruction_;
+
+  // The set of instructions which have been placed. That is, BeginInstruction
+  // has been called with the instruction as an argument.
+  tensorflow::gtl::FlatSet<const HloInstruction*> placed_instructions_;
+
+  // All buffers in the computation.
+  std::vector<Buffer> buffers_;
+};
+
+MemoryUsageTracker::MemoryUsageTracker(
+    const HloComputation* computation,
+    const HloRematerialization::ShapeSizeFunction& size_function,
+    const TuplePointsToAnalysis& points_to_analysis,
+    const InstructionList& instruction_list)
+    : computation_(computation), instruction_list_(instruction_list) {
+  // Iterate through all LogicalBuffers in the computation and gather the
+  // instructions which define them in buffers_defined_by_instruction_ and the
+  // instructions which use them in buffers_used_by_instruction_.
+  for (auto& instruction : computation_->instructions()) {
+    // Initialize empty vectors for defs and uses of each instruction.
+    buffers_used_by_instruction_[instruction.get()];
+    buffers_defined_by_instruction_[instruction.get()];
+  }
+
+  tensorflow::gtl::FlatSet<const LogicalBuffer*> live_out_set =
+      points_to_analysis.GetPointsToSet(computation_->root_instruction())
+          .CreateFlattenedSet();
+  tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferId>
+      logical_buffer_to_buffer_id;
+
+  for (const HloInstruction* instruction : instruction_list_.instructions()) {
+    for (const LogicalBuffer* logical_buffer :
+         points_to_analysis.GetBuffersDefinedByInstruction(instruction)) {
+      Buffer* buffer;
+      if (instruction->opcode() == HloOpcode::kWhile) {
+        // The while instruction defines no new buffers. Instead it reuses the
+        // buffers of its operand. Find the Buffer of its operand at the
+        // proper ShapeIndex.
+        const PointsToSet& operand_points_to =
+            points_to_analysis.GetPointsToSet(instruction->operand(0));
+        CHECK_EQ(operand_points_to.element(logical_buffer->index()).size(), 1);
+        const LogicalBuffer* source_logical_buffer =
+            operand_points_to.element(logical_buffer->index())[0];
+        buffer =
+            &buffers_.at(logical_buffer_to_buffer_id.at(source_logical_buffer));
+
+        // Mark buffer as has indirect use and live out.
+        buffer->has_indirect_uses = true;
+        buffer->live_out =
+            buffer->live_out || ContainsKey(live_out_set, logical_buffer);
+
+        // Add users of while to Buffer users.
+        bool unused;
+        for (const HloInstruction* user :
+             GetUsers(logical_buffer, points_to_analysis, &unused)) {
+          if (std::find(buffer->users.begin(), buffer->users.end(), user) ==
+              buffer->users.end()) {
+            buffer->users.push_back(user);
+            buffer->unfinished_user_count++;
+            buffers_used_by_instruction_.at(user).push_back(buffer->id);
+          }
+        }
+      } else {
+        buffer = &CreateBufferFromLogicalBuffer(
+            logical_buffer, points_to_analysis, size_function,
+            ContainsKey(live_out_set, logical_buffer));
+        buffers_defined_by_instruction_.at(instruction).push_back(buffer->id);
+        for (const HloInstruction* user : buffer->users) {
+          buffers_used_by_instruction_.at(user).push_back(buffer->id);
         }
       }
-    }
 
-    // Value is dead if the instruction has no uses and is not live out.
-    if (in_progress_instruction_->users().empty() &&
-        !IsLiveOut(in_progress_instruction_)) {
-      memory_usage_ -= TotalSizeBytes(in_progress_instruction_->shape());
-      TF_RET_CHECK(memory_usage_ >= 0);
+      logical_buffer_to_buffer_id[logical_buffer] = buffer->id;
     }
+  }
+  XLA_VLOG_LINES(10, ToString());
+  DCHECK(Check());
+}
+
+Status MemoryUsageTracker::BeginInstruction(const HloInstruction* instruction) {
+  VLOG(3) << "BeginInstruction " << instruction->name();
+  TF_RET_CHECK(in_progress_instruction_ == nullptr);
+  in_progress_instruction_ = instruction;
 
-    in_progress_instruction_ = nullptr;
+  placed_instructions_.insert(in_progress_instruction_);
 
-    VLOG(3) << "  memory usage = " << memory_usage_;
-    VLOG(10) << ToString();
-    return Status::OK();
+  // All buffers defined by this instruction need memory.
+  for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) {
+    VLOG(3) << "  Buffer " << buffers_.at(buffer_id).ToString()
+            << " is now live.";
+    memory_usage_ += AllocatedSize(buffer_id);
   }
 
-  // Adjusts memory usage to account for the rematerialization of
-  // original_instruction for the given use. The rematerialization is
-  // remat_instruction. This method should be called after the HLO graph has
-  // been transformed (rematerialization instruction created and connected to
-  // its use).
-  Status RematerializeInstructionForUse(HloInstruction* original_instruction,
-                                        HloInstruction* remat_instruction,
-                                        HloInstruction* use) {
-    VLOG(3) << "RematerializeInstructionForUse: original_instruction = "
-            << original_instruction->name()
-            << ", remat_instruction = " << remat_instruction->name()
-            << ", use = " << use->name();
-
-    TF_RET_CHECK(in_progress_instruction_ != nullptr);
-    TF_RET_CHECK(IsPlaced(original_instruction));
-    TF_RET_CHECK(!IsPlaced(remat_instruction));
-    TF_RET_CHECK(!IsPlaced(use));
-    TF_RET_CHECK(IsCurrentlyLive(original_instruction));
-
-    // Remove 'use' from remaining uses of original_instruction.
-    auto it = std::find(remaining_uses_[original_instruction].begin(),
-                        remaining_uses_[original_instruction].end(), use);
-    TF_RET_CHECK(it != remaining_uses_[original_instruction].end());
-    remaining_uses_[original_instruction].erase(it);
-
-    // If original_instruction is no longer live ('use' was its last use) then
-    // deduct original_instruction's memory usage.
-    if (!IsCurrentlyLive(original_instruction)) {
-      memory_usage_ -= TotalSizeBytes(original_instruction->shape());
-      TF_RET_CHECK(memory_usage_ >= 0);
+  // TODO(b/37686934): Elementwise instructions can share the buffer of a (dead)
+  // operand. Account for this potential reuse here.
+
+  VLOG(3) << "  memory usage = " << memory_usage_;
+  VLOG(10) << ToString();
+
+  DCHECK(Check());
+  return Status::OK();
+}
+
+Status MemoryUsageTracker::EndInstruction() {
+  TF_RET_CHECK(in_progress_instruction_ != nullptr);
+  VLOG(3) << "EndInstruction " << in_progress_instruction_->name();
+
+  for (BufferId buffer_id :
+       buffers_used_by_instruction_.at(in_progress_instruction_)) {
+    Buffer& buffer = buffers_.at(buffer_id);
+    buffer.unfinished_user_count--;
+    CHECK_GE(buffer.unfinished_user_count, 0)
+        << buffer.ToString() << " has negative unfinished use count.";
+    if (buffer.unfinished_user_count == 0) {
+      // Buffer is now dead.
+      VLOG(3) << "  " << buffer.ToString() << " is now dead.";
+      memory_usage_ -= AllocatedSize(buffer_id);
+      CHECK_GE(memory_usage_, 0);
     }
+  }
 
-    // Add the new remat_instruction to the remaining uses of its operands.
-    for (auto* operand : UniqueOperands(remat_instruction)) {
-      // Rematerialization may extend the lifetime of the operand so account for
-      // this in memory_usage_.
-      TF_RET_CHECK(IsPlaced(operand));
-      if (!IsCurrentlyLive(operand)) {
-        memory_usage_ += TotalSizeBytes(operand->shape());
-      }
-      remaining_uses_.at(operand).push_back(remat_instruction);
+  // If any buffer defined by this instruction has no uses, then memory can be
+  // reclaimed immediately.
+  for (BufferId buffer_id :
+       buffers_defined_by_instruction_.at(in_progress_instruction_)) {
+    const Buffer& buffer = buffers_.at(buffer_id);
+    if (buffer.unfinished_user_count == 0) {
+      VLOG(3) << "  " << buffer.ToString() << " is immediately dead.";
+      memory_usage_ -= AllocatedSize(buffer_id);
+      CHECK_GE(memory_usage_, 0);
     }
+  }
+
+  in_progress_instruction_ = nullptr;
+
+  VLOG(3) << "  memory usage = " << memory_usage_;
+  VLOG(10) << ToString();
+
+  DCHECK(Check());
 
-    VLOG(3) << "  memory usage = " << memory_usage_;
-    VLOG(10) << ToString();
-    return Status::OK();
+  return Status::OK();
+}
+
+int64 MemoryUsageTracker::MemoryReducedIfRematerialized(
+    const HloInstruction* instruction) const {
+  CHECK_NE(in_progress_instruction_, nullptr);
+  if (!IsPlaced(instruction) || instruction == in_progress_instruction_) {
+    return 0;
   }
 
-  // Returns the number of bytes that the current memory usage will be reduced
-  // if the given instruction is rematerialized.
-  int64 MemoryReducedIfRematerialized(const HloInstruction* instruction) const {
-    // To reduce memory consumption 'instruction' must be currently live and
-    // rematerialization must make 'instruction' not live.
-    if (IsLiveIn(instruction) || IsLiveOut(instruction) ||
-        !IsCurrentlyLive(instruction)) {
+  // TODO(b/37687140): Rematerialization can increase peak memory consumption at
+  // an earlier point in the program if rematerialization extends the live range
+  // of the operand of the instruction being rematerialized across the live
+  // range of the value of instruction being rematerialized. Don't rematerialize
+  // in this case (ie, return 0 here).
+
+  // Compute the amount of memory reduced (if any) by rematerializing
+  // 'instruction'. The LogicalBuffers defined by 'instruction' will no longer
+  // be live at this program point, so initially set memory_reduced to the
+  // size of its defined values.
+  int64 memory_reduced = 0;
+  for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) {
+    // Avoid rematerializing instructions with indirect uses as it is difficult
+    // to reason about liveness after rematerializing the instruction.
+    // TODO(b/37714814): Consider rematerialzing instructions with indirect
+    // uses.
+    if (buffers_.at(buffer_id).has_indirect_uses) {
       return 0;
     }
 
-    // If the in-progress instruction is a user of 'instruction' (or
-    // 'instruction' itself) then rematerializing 'instruction' cannot reduce
-    // memory usage because the value is required to be live at this program
-    // point.
-    if (in_progress_instruction_ == instruction ||
-        in_progress_instruction_->IsUserOf(instruction)) {
-      return 0;
+    if (IsCurrentlyLive(buffer_id) && !IsInUse(buffer_id)) {
+      memory_reduced += AllocatedSize(buffer_id);
     }
+  }
 
-    // Compute the amount of memory reduced (if any) by rematerializing
-    // 'instruction'. 'instruction' will no longer be live at this program
-    // point, so initially set memory_reduced to the size of its output value.
-    int64 memory_reduced = TotalSizeBytes(instruction->shape());
-
-    // Account for any operands whose live range must be extended across this
-    // program point.
-    for (const HloInstruction* operand : UniqueOperands(instruction)) {
-      if (!IsCurrentlyLive(operand)) {
-        // This operand of candidate is not live at this program
-        // point. Rematerializing 'instruction' will extend the operand's live
-        // range across this program point.
-        memory_reduced -= TotalSizeBytes(operand->shape());
-      }
+  // Account for any logical buffers whose live range must be extended across
+  // this program point.
+  for (BufferId buffer_id : buffers_used_by_instruction_.at(instruction)) {
+    if (!IsCurrentlyLive(buffer_id)) {
+      // This logical buffer is used by 'instruction' but is not live at this
+      // program point. Rematerializing 'instruction' will extend the buffer's
+      // live range across this program point.
+      memory_reduced -= AllocatedSize(buffer_id);
     }
-    return memory_reduced;
   }
 
-  // Returns the remaining unplaced uses of the given instruction.
-  const std::vector<HloInstruction*>& RemainingUses(
-      const HloInstruction* instruction) const {
-    return remaining_uses_.at(instruction);
+  return memory_reduced;
+}
+
+Status MemoryUsageTracker::AddRematerializedInstruction(
+    HloInstruction* original_instruction, HloInstruction* remat_instruction) {
+  VLOG(3) << "AddRematerializedInstruction: original_instruction = "
+          << original_instruction->name()
+          << ", remat_instruction = " << remat_instruction->name();
+
+  TF_RET_CHECK(in_progress_instruction_ != nullptr);
+  TF_RET_CHECK(IsPlaced(original_instruction));
+  TF_RET_CHECK(!IsPlaced(remat_instruction));
+  CHECK(!ContainsKey(buffers_defined_by_instruction_, remat_instruction));
+  CHECK(!ContainsKey(buffers_used_by_instruction_, remat_instruction));
+
+  // Construct the list of buffers used and defined by the rematerialization.
+  buffers_defined_by_instruction_[remat_instruction];
+  buffers_used_by_instruction_[remat_instruction] =
+      buffers_used_by_instruction_.at(original_instruction);
+
+  // Account for the additional buffer uses created by the new rematerialization
+  // instruction. Update memory usage if the rematerialization makes a dead
+  // buffer live again.
+  for (BufferId buffer_id :
+       buffers_used_by_instruction_.at(original_instruction)) {
+    Buffer& buffer = buffers_.at(buffer_id);
+    if (buffer.unfinished_user_count == 0) {
+      // Buffer used by this instruction was dead, now is alive.
+      memory_usage_ += AllocatedSize(buffer.id);
+    }
+
+    buffer.unfinished_user_count++;
+    buffer.users.push_back(remat_instruction);
   }
 
-  // Returns whether the given instruction has been placed (BeginInstruction has
-  // been called with 'instruction' as the argument).
-  bool IsPlaced(const HloInstruction* instruction) const {
-    return ContainsKey(remaining_uses_, instruction);
-  }
-
-  // Returns whether the given instruction is live at the current program point.
-  bool IsCurrentlyLive(const HloInstruction* instruction) const {
-    return (!IsPlaced(instruction) && IsLiveIn(instruction)) ||
-           (IsPlaced(instruction) &&
-            (!RemainingUses(instruction).empty() || IsLiveOut(instruction)));
-  }
-
-  string ToString() const {
-    string output = tensorflow::strings::StrCat("MemoryUsageTracker for ",
-                                                computation_->name(), "\n");
-    tensorflow::strings::StrAppend(&output, "memory usage = ", memory_usage(),
-                                   "\n");
-    tensorflow::strings::StrAppend(&output, "Live values:\n");
-    for (const auto& pair : remaining_uses_) {
-      const HloInstruction* instruction = pair.first;
-      const std::vector<HloInstruction*>& uses = pair.second;
-      tensorflow::strings::StrAppend(
-          &output, "  ", instruction->name(), "; remaining uses: ",
-          tensorflow::str_util::Join(uses, ", ",
-                                     [](string* out, HloInstruction* use) {
-                                       tensorflow::strings::StrAppend(
-                                           out, use->name());
-                                     }),
-          "\n");
+  // Create a new set of Buffers defined by the new rematerialization
+  // instruction. Update the internal data structures and memory use to account
+  // for them.
+  for (BufferId old_buffer_id :
+       buffers_defined_by_instruction_.at(original_instruction)) {
+    Buffer& old_buffer = buffers_.at(old_buffer_id);
+
+    std::vector<const HloInstruction*> placed_users;
+    std::vector<const HloInstruction*> unplaced_users;
+    for (const HloInstruction* user : old_buffer.users) {
+      if (IsPlaced(user)) {
+        CHECK(IsFinished(user));
+        placed_users.push_back(user);
+      } else {
+        unplaced_users.push_back(user);
+      }
+    }
+    old_buffer.users = std::move(placed_users);
+    old_buffer.unfinished_user_count = 0;
+
+    // Buffer is now dead.
+    memory_usage_ -= AllocatedSize(old_buffer.id);
+
+    Buffer& new_buffer = RematerializeBuffer(old_buffer, remat_instruction,
+                                             std::move(unplaced_users));
+
+    buffers_defined_by_instruction_.at(remat_instruction)
+        .push_back(new_buffer.id);
+    for (const HloInstruction* user : new_buffer.users) {
+      std::vector<BufferId>& buffers_used =
+          buffers_used_by_instruction_.at(user);
+      std::replace(buffers_used.begin(), buffers_used.end(), old_buffer_id,
+                   new_buffer.id);
     }
-    return output;
   }
 
-  // Returns the current memory usage. This is the sum of sizes of all live
-  // values.
-  int64 memory_usage() const { return memory_usage_; }
+  VLOG(3) << "  memory usage = " << memory_usage_;
+  XLA_VLOG_LINES(10, ToString());
 
-  // Returns the current instruction being placed.
-  const HloInstruction* in_progress_instruction() const {
-    return in_progress_instruction_;
-  }
+  DCHECK(Check());
 
- private:
-  // Returns the total size of the shape (including nested elements) in bytes.
-  int64 TotalSizeBytes(const Shape& shape) const {
-    int64 total_size = 0;
-    ShapeUtil::ForEachSubshape(
-        shape,
-        [this, &total_size](const Shape& subshape,
-                            const ShapeIndex& /*index*/) {
-          total_size += size_function_(subshape);
-          return Status::OK();
-        })
-        .IgnoreError();
-    return total_size;
-  }
-
-  // Returns true if the value of given instruction is live into the
-  // computation.
-  bool IsLiveIn(const HloInstruction* instruction) const {
-    return instruction->opcode() == HloOpcode::kConstant ||
-           instruction->opcode() == HloOpcode::kParameter;
-  }
-
-  // Returns true if the value of given instruction is live out of the
-  // computation.
-  bool IsLiveOut(const HloInstruction* instruction) const {
-    return instruction->opcode() == HloOpcode::kConstant ||
-           instruction->opcode() == HloOpcode::kParameter ||
-           instruction == instruction->parent()->root_instruction();
+  return Status::OK();
+}
+
+string MemoryUsageTracker::ToString() const {
+  string output = tensorflow::strings::StrCat("MemoryUsageTracker for ",
+                                              computation_->name(), "\n");
+  tensorflow::strings::StrAppend(
+      &output, "Memory usage: ", HumanReadableNumBytes(memory_usage()), " (",
+      memory_usage(), " bytes)");
+  for (const HloInstruction* instruction : instruction_list_.instructions()) {
+    string inprogress =
+        instruction == in_progress_instruction_ ? " in-progress" : "";
+    string placed = IsPlaced(instruction) ? " placed" : "";
+    tensorflow::strings::StrAppend(&output, "  ", instruction->name(),
+                                   inprogress, placed, "\n    Defines:\n");
+    for (BufferId buffer_id : buffers_defined_by_instruction_.at(instruction)) {
+      const Buffer& buffer = buffers_[buffer_id];
+      string live = IsCurrentlyLive(buffer_id) ? " live" : "";
+      tensorflow::strings::StrAppend(&output, "      ", buffer.ToString(), live,
+                                     ", ", buffer.unfinished_user_count,
+                                     " unfinished uses\n");
+    }
+    tensorflow::strings::StrAppend(&output, "    Uses:\n");
+    for (BufferId buffer_id : buffers_used_by_instruction_.at(instruction)) {
+      tensorflow::strings::StrAppend(&output, "      ",
+                                     buffers_[buffer_id].ToString(), "\n");
+    }
   }
+  return output;
+}
 
-  const HloComputation* computation_;
+bool MemoryUsageTracker::Check() const {
+  auto elements_are_unique = [](const std::vector<BufferId>& vec) {
+    return vec.size() == std::set<BufferId>(vec.begin(), vec.end()).size();
+  };
+
+  // Verify buffers_defined_by_instruction_.
+  for (auto& instruction : computation_->instructions()) {
+    const std::vector<BufferId>& defined_buffers =
+        buffers_defined_by_instruction_.at(instruction.get());
+    CHECK(elements_are_unique(defined_buffers))
+        << "Instruction " << instruction->name()
+        << " does not have unique defined buffers: "
+        << tensorflow::str_util::Join(
+               defined_buffers, ", ", [this](string* out, BufferId buffer_id) {
+                 tensorflow::strings::StrAppend(
+                     out, buffers_.at(buffer_id).ToString());
+               });
 
-  // Function which computes the size of the top-level buffer of a shape.
-  const HloRematerialization::ShapeSizeFunction size_function_;
+    for (const Buffer& buffer : buffers_) {
+      if (buffer.defining_instruction == instruction.get()) {
+        CHECK(std::find(defined_buffers.begin(), defined_buffers.end(),
+                        buffer.id) != defined_buffers.end())
+            << "Instruction " << instruction->name()
+            << " defined buffers is missing: " << buffer.ToString();
+      }
+    }
+  }
 
-  // Memory usage at the currently placed instruction.
-  int64 memory_usage_ = 0;
+  // Verify buffers_used_by_instruction_.
+  for (auto& instruction : computation_->instructions()) {
+    const std::vector<BufferId>& used_buffers =
+        buffers_used_by_instruction_.at(instruction.get());
+    CHECK(elements_are_unique(used_buffers))
+        << "Instruction " << instruction->name()
+        << " does not have unique used buffers: "
+        << tensorflow::str_util::Join(
+               used_buffers, ", ", [this](string* out, BufferId buffer_id) {
+                 tensorflow::strings::StrAppend(
+                     out, buffers_.at(buffer_id).ToString());
+               });
+  }
+  for (const Buffer& buffer : buffers_) {
+    int64 unfinished_uses = 0;
+    for (const HloInstruction* user : buffer.users) {
+      const std::vector<BufferId>& used_buffers =
+          buffers_used_by_instruction_.at(user);
+      CHECK(std::find(used_buffers.begin(), used_buffers.end(), buffer.id) !=
+            used_buffers.end())
+          << "Instruction " << user->name() << " used buffers is missing "
+          << buffer.ToString();
+      if (!IsFinished(user)) {
+        unfinished_uses++;
+      }
+    }
+    CHECK_EQ(buffer.unfinished_user_count, unfinished_uses)
+        << "Incorrect unplaced use count for " << buffer.ToString();
+  }
 
-  // The instruction currently being placed. This value is non-null only between
-  // the calling of BeginInstruction and EndInstruction.
-  const HloInstruction* in_progress_instruction_ = nullptr;
+  // Verify live set size against memory_usage_.
+  int64 live_size = 0;
+  for (const Buffer& buffer : buffers_) {
+    // The while instruction reuses its input buffers as output buffers so
+    // don't double count its buffers if it is currently executing.
+    if (IsCurrentlyLive(buffer.id) &&
+        !(buffer.defining_instruction == in_progress_instruction_ &&
+          in_progress_instruction_->opcode() == HloOpcode::kWhile)) {
+      live_size += AllocatedSize(buffer.id);
+    }
+  }
+  CHECK_EQ(live_size, memory_usage_);
 
-  // remaining_uses is a vector of uses of the HLO instruction's value which
-  // have not yet been visited by in the rematerialization loop. Use to track
-  // liveness of HLO instructions.
-  // TODO(b/35212854): Track values using logical buffers rather than HLO
-  // instructions. Using HLO instructions over-estimates memory usage because
-  // buffer aliasing is ignored.
-  tensorflow::gtl::FlatMap<const HloInstruction*, std::vector<HloInstruction*>>
-      remaining_uses_;
-};
+  return true;
+}
 
-// Computes and returns the cost of rematerializing the given instruction. Cost
-// per rematerialized instruction is defined as:
+// Computes and returns the cost of rematerializing the given instruction.
+// Cost per rematerialized instruction is defined as:
 //
 // (flop_count + transcendental_count + element_count) / memory_reduced
 //
@@ -424,33 +808,36 @@ class MemoryUsageTracker {
 //     instruction.
 //
 // This is a rough estimate of the extra execution time per byte saved by
-// rematerializing this instruction for its remaining uses. In general, we want
-// the most memory saving for the least latency penalty which is captured by
-// this heuristic.
+// rematerializing this instruction for its remaining uses. In general, we
+// want the most memory saving for the least latency penalty which is captured
+// by this heuristic.
 int64 RematerializationCost(const HloInstruction* instruction,
                             const MemoryUsageTracker& memory_tracker,
                             const HloCostAnalysis& cost_analysis,
                             int64 memory_reduced) {
-  const int64 bytes_accessed = cost_analysis.bytes_accessed(*instruction);
-  const int64 elements_accessed =
-      bytes_accessed /
-      ShapeUtil::ByteSizeOfPrimitiveType(instruction->shape().element_type());
-
-  // A duplicate of the rematerialized instruction will be created at each
-  // remaining use.
-  int64 duplication = memory_tracker.RemainingUses(instruction).size();
-  if (duplication == instruction->users().size()) {
-    // All remaining uses of instruction are after this point so we can remove
-    // the original instruciton after rematerialization.
-    duplication -= 1;
+  // If none of the users of 'instruction' have been placed in the sequence (as
+  // tracked by memory_tracker), then rematerialization of 'instruction' is a
+  // zero-cost move of 'instruction' in the sequence.
+  if (!std::any_of(instruction->users().begin(), instruction->users().end(),
+                   [&memory_tracker](const HloInstruction* inst) {
+                     return memory_tracker.IsPlaced(inst);
+                   })) {
+    return 0;
   }
+
   CHECK_GT(memory_reduced, 0);
+  const int64 bytes_accessed = cost_analysis.bytes_accessed(*instruction);
+  const int64 elements_accessed =
+      ShapeUtil::IsTuple(instruction->shape())
+          ? bytes_accessed
+          : bytes_accessed / ShapeUtil::ByteSizeOfPrimitiveType(
+                                 instruction->shape().element_type());
 
   // Multiply by 256 to improve precision of cost. Without this factor,
   // many instructions such as many elementwise instructions would have
   // zero cost because the bytes reduced can be several times greater than
   // the element count.
-  return 256 * duplication *
+  return 256 *
          (cost_analysis.flop_count(*instruction) +
           cost_analysis.transcendental_count(*instruction) +
           elements_accessed) /
@@ -466,7 +853,7 @@ HloInstruction* PickRematerializationCandidate(
     const MemoryUsageTracker& memory_tracker,
     const InstructionList& instruction_list,
     const HloCostAnalysis& cost_analysis,
-    const tensorflow::gtl::FlatSet<const HloInstruction*>& remat_instructions) {
+    const tensorflow::gtl::FlatSet<const HloInstruction*>& blacklist) {
   HloInstruction* best = nullptr;
   int64 best_cost = 0;
 
@@ -481,11 +868,11 @@ HloInstruction* PickRematerializationCandidate(
     }
     VLOG(5) << "considering rematerialization candidate " << candidate->name();
 
-    if (ContainsKey(remat_instructions, candidate)) {
-      // Skip instructions which are rematerialization clones to avoid infinite
-      // loops of rematerializing the same instruction(s) repeatedly.
+    if (ContainsKey(blacklist, candidate)) {
+      // Skip instructions on the blacklist to avoid infinite loops of
+      // rematerializing the same instruction(s) repeatedly.
       VLOG(5) << "candidate " << candidate->name()
-              << " not viable: is a rematerialized instruction";
+              << " is excluded from rematerialization";
       continue;
     }
 
@@ -524,7 +911,9 @@ HloInstruction* PickRematerializationCandidate(
 StatusOr<int64> HloRematerialization::ComputePeakMemory(
     const HloComputation* computation,
     const std::vector<const HloInstruction*>& order) const {
-  MemoryUsageTracker tracker(computation, size_function_);
+  InstructionList instruction_list(order);
+  MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_,
+                             instruction_list);
   int64 peak_memory = tracker.memory_usage();
   for (const HloInstruction* instruction : order) {
     TF_RETURN_IF_ERROR(tracker.BeginInstruction(instruction));
@@ -541,9 +930,8 @@ StatusOr<int64> HloRematerialization::ComputePeakMemory(
 
 StatusOr<int64> HloRematerialization::CalledComputationsMemoryUsage(
     const HloInstruction* instruction) const {
-  TF_ASSIGN_OR_RETURN(const CallGraphNode* node,
-                      call_graph_->GetNode(instruction->parent()));
-  const CallSite* callsite = node->GetCallSite(instruction);
+  const CallSite* callsite =
+      call_graph_->GetNode(instruction->parent()).GetCallSite(instruction);
   if (callsite == nullptr || callsite->context() == CallContext::kParallel) {
     return 0;
   }
@@ -563,15 +951,24 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
           << " with limit " << HumanReadableNumBytes(memory_limit_bytes);
   VLOG(1) << "peak memory usage is "
           << HumanReadableNumBytes(computation_peak_memory_.at(computation));
+  CHECK(!ContainsKey(rematerialized_computations_, computation));
 
   InstructionList instruction_list(sequence->at(computation));
-  MemoryUsageTracker memory_tracker(computation, size_function_);
+  MemoryUsageTracker memory_tracker(computation, size_function_,
+                                    *points_to_analysis_, instruction_list);
   bool changed = false;
 
-  // Set of instruction clones (not the originals) created during
-  // rematerialization. A record is kept to avoid rematerializing an instruction
-  // more than once to avoid looping infinitely during rematerialization.
-  tensorflow::gtl::FlatSet<const HloInstruction*> remat_instructions;
+  // To avoid an infinite loop rematerializing the same set of instructions ad
+  // infinitum, keep a blacklist of instructions which should not be
+  // rematerialized.
+  tensorflow::gtl::FlatSet<const HloInstruction*> blacklist;
+
+  // If the rematerialization makes the source instruction dead, then the
+  // rematerialization is added to 'remat_move_instructions' (the
+  // rematerialization is essentially a move). If the next rematerialization of
+  // the instruction is also a move then the rematerialization is added to the
+  // blacklist.
+  tensorflow::gtl::FlatSet<const HloInstruction*> remat_move_instructions;
 
   // The peak memory of the computation at any point in the instruction
   // sequence.
@@ -583,12 +980,12 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   // instructions which are dead.
   int64 net_instructions_added = 0;
 
-  TF_ASSIGN_OR_RETURN(const CallGraphNode* call_graph_node,
-                      call_graph_->GetNode(computation));
+  const CallGraphNode& call_graph_node = call_graph_->GetNode(computation);
 
   // Iterate through all instructions in the sequence. At each instruction
   // (program point) if memory_usage exceeds the specified limit then
   // rematerialize HLO instructions until memory_usage is reduced.
+  int64 instruction_index = 0;
   for (auto list_it = instruction_list.instructions().begin();
        list_it != instruction_list.instructions().end(); ++list_it) {
     HloInstruction* instruction = *list_it;
@@ -598,7 +995,9 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
 
     VLOG(2) << "Program point at " << instruction->name()
             << ", memory usage = " << memory_tracker.memory_usage()
-            << ", callee usage = " << callee_usage;
+            << ", callee usage = " << callee_usage << ", [" << instruction_index
+            << "/" << instruction_list.instructions().size() << "]";
+    instruction_index++;
 
     while (memory_tracker.memory_usage() + callee_usage > memory_limit_bytes) {
       VLOG(2) << "Over memory limit at instruction " << instruction->name()
@@ -608,7 +1007,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
               << ", limit is " << HumanReadableNumBytes(memory_limit_bytes);
 
       HloInstruction* best = PickRematerializationCandidate(
-          memory_tracker, instruction_list, cost_analysis_, remat_instructions);
+          memory_tracker, instruction_list, cost_analysis_, blacklist);
 
       if (best == nullptr) {
         VLOG(3) << "Unable to find rematerialization candidate at program "
@@ -619,44 +1018,42 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         break;
       }
 
-      VLOG(1) << "Rematerializing instruction " << best->name();
+      VLOG(1) << "Rematerializing instruction " << best->name() << " (saving "
+              << memory_tracker.MemoryReducedIfRematerialized(best) << ")";
       changed = true;
       remat_count++;
 
-      // Create a rematerialized copy of the candidate at each remaining use.
-      // Make a copy of remaining uses because RematerializeInstructionForUse
-      // modifies the remaining uses vector in memory_tracker.
-      // TODO(b/35213652): It may be profitable to share one rematerialized copy
-      // amongst more than one use.
-      std::vector<HloInstruction*> remaining_uses_copy =
-          memory_tracker.RemainingUses(best);
-      for (HloInstruction* use : remaining_uses_copy) {
-        // Create a new rematerialized instruction in the HLO graph.
-        HloInstruction* remat =
-            computation->AddInstruction(best->Clone(/*suffix=*/"remat"));
-
-        VLOG(3) << "Replacing use of " << best->name() << " in " << use->name()
-                << " with rematerialization " << remat->name();
-
-        TF_RETURN_IF_ERROR(best->ReplaceUseWith(use, remat));
+      HloInstruction* remat =
+          computation->AddInstruction(best->Clone(/*suffix=*/"remat"));
 
-        // Account for the rematerialization in the memory tracker.
-        TF_RETURN_IF_ERROR(
-            memory_tracker.RematerializeInstructionForUse(best, remat, use));
-
-        // Insert rematerialized instruction right before its use.
-        TF_RETURN_IF_ERROR(instruction_list.InsertBefore(remat, use));
-
-        // Add rematerialized instruction to remat_instructions so the
-        // rematerialized instruction is not rematerialized again.
-        remat_instructions.insert(remat);
-
-        net_instructions_added++;
+      // Replace each remaining use of 'best' with the rematerialization.
+      std::vector<HloInstruction*> best_users_copy = best->users();
+      for (HloInstruction* user : best_users_copy) {
+        if (!memory_tracker.IsPlaced(user)) {
+          VLOG(2) << "  Replacing use of " << best->name() << " in "
+                  << user->name() << " with " << remat->name();
+          TF_RETURN_IF_ERROR(best->ReplaceUseWith(user, remat));
+        }
       }
 
-      // Original instruction should no longer be live at this point. All
-      // of its remaining uses are fed by rematerialized instructions.
-      TF_RET_CHECK(!memory_tracker.IsCurrentlyLive(best));
+      // Account for the rematerialization in the memory tracker.
+      TF_RETURN_IF_ERROR(
+          memory_tracker.AddRematerializedInstruction(best, remat));
+
+      // Insert rematerialized instruction right before the earliest unplaced
+      // use of the instruction *and* the earliest unplaced last use of any
+      // operands of remat. Unplaced uses of the remat's operands are included
+      // because we don't want to extend the live range of remat's operands as
+      // this could increase memory usage.
+      std::vector<HloInstruction*> place_before = remat->users();
+      for (auto* operand : remat->operands()) {
+        for (auto* operand_user : operand->users()) {
+          if (!memory_tracker.IsPlaced(operand_user) && operand_user != remat) {
+            place_before.push_back(operand_user);
+          }
+        }
+      }
+      instruction_list.InsertBeforeInstructions(remat, place_before);
 
       // If the rematerialized instruction is dead then rematerialization is
       // essentially a move. Don't delete the instruction now because we don't
@@ -664,15 +1061,24 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
       // transformation because we keep maps with HloInstruction* values as
       // keys.
       if (best->users().empty()) {
-        VLOG(3) << best->name() << " is now dead";
-        net_instructions_added--;
+        VLOG(2) << best->name() << " is now dead";
+        if (ContainsKey(remat_move_instructions, best)) {
+          // Previously, 'best' was a rematerialization which killed the
+          // instruction it was a copying of. Now 'remat' is a rematerialization
+          // of 'best' and kills 'best'. Stop rematerializing this instruction
+          // to avoid an infinite loop.
+          blacklist.insert(remat);
+        }
+        remat_move_instructions.insert(remat);
+      } else {
+        net_instructions_added++;
       }
 
       VLOG(3) << "memory_usage after rematerialization = "
               << memory_tracker.memory_usage();
     }
 
-    const CallSite* callsite = call_graph_node->GetCallSite(instruction);
+    const CallSite* callsite = call_graph_node.GetCallSite(instruction);
     if (callsite != nullptr &&
         callsite->context() == CallContext::kSequential &&
         memory_tracker.memory_usage() + callee_usage > memory_limit_bytes) {
@@ -686,21 +1092,22 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
 
       // Recompute callee usage to account for any rematerialization performed
       // in the callee computations.
-      callee_usage = 0;
       for (HloComputation* called_computation :
            callsite->called_computations()) {
-        // Memory limit for the subcomputation is the memory limit less the
-        // amount of memory used at this point in the computation.
-        int64 subcomputation_memory_limit_bytes = std::max<int64>(
-            0, memory_limit_bytes - memory_tracker.memory_usage());
-        TF_ASSIGN_OR_RETURN(
-            bool subcomputation_changed,
-            RematerializeComputation(called_computation, sequence,
-                                     subcomputation_memory_limit_bytes));
-        changed |= subcomputation_changed;
-
-        callee_usage += computation_peak_memory_.at(called_computation);
+        if (!ContainsKey(rematerialized_computations_, called_computation)) {
+          // Memory limit for the subcomputation is the memory limit less the
+          // amount of memory used at this point in the computation.
+          int64 subcomputation_memory_limit_bytes = std::max<int64>(
+              0, memory_limit_bytes - memory_tracker.memory_usage());
+          TF_ASSIGN_OR_RETURN(
+              bool subcomputation_changed,
+              RematerializeComputation(called_computation, sequence,
+                                       subcomputation_memory_limit_bytes));
+          changed |= subcomputation_changed;
+        }
       }
+      TF_ASSIGN_OR_RETURN(callee_usage,
+                          CalledComputationsMemoryUsage(instruction));
     }
 
     peak_memory = std::max<int64>(peak_memory,
@@ -710,37 +1117,33 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
     TF_RETURN_IF_ERROR(memory_tracker.EndInstruction());
   }
 
-  if (peak_memory > memory_limit_bytes) {
-    LOG(WARNING) << "Can't reduce memory use of computation "
-                 << computation->name() << " below "
-                 << HumanReadableNumBytes(memory_limit_bytes)
-                 << " by rematerialization (only reduced to "
-                 << HumanReadableNumBytes(peak_memory) << ")";
-  }
-
-  // Verify that there are no more remaining uses.
+  // Verify some invariants on the memory tracker.
+  CHECK_EQ(memory_tracker.memory_usage(), 0);
   for (auto& instruction : computation->instructions()) {
-    auto& remaining_uses = memory_tracker.RemainingUses(instruction.get());
-    CHECK(remaining_uses.empty())
-        << instruction->name() << " has remaining uses: "
-        << tensorflow::str_util::Join(
-               remaining_uses, ", ", [](string* out, HloInstruction* inst) {
-                 tensorflow::strings::StrAppend(out, inst->name());
-               });
+    CHECK(memory_tracker.IsPlaced(instruction.get()));
   }
 
-  VLOG(1) << "Rematerialized " << remat_count << " instructions; "
-          << net_instructions_added << " net instructions added";
-  VLOG(1) << "peak memory usage now " << HumanReadableNumBytes(peak_memory);
+  VLOG(1) << "In computation " << computation->name() << " rematerialized "
+          << remat_count << " instructions; " << net_instructions_added
+          << " net instructions added";
+  VLOG(1) << "  peak memory usage now " << HumanReadableNumBytes(peak_memory)
+          << " (was "
+          << HumanReadableNumBytes(computation_peak_memory_.at(computation))
+          << ")";
 
   // Update peak memory used by computation.
-  computation_peak_memory_[computation] = peak_memory;
+  computation_peak_memory_.at(computation) = peak_memory;
 
   // Update order to include rematerialized instructions.
   sequence->at(computation)
       .assign(instruction_list.instructions().begin(),
               instruction_list.instructions().end());
 
+  rematerialized_computations_.insert(computation);
+
+  instructions_rematerialized_ += remat_count;
+  net_instructions_added_ += net_instructions_added;
+
   return changed;
 }
 
@@ -753,18 +1156,38 @@ StatusOr<bool> HloRematerialization::Run(
   VLOG(1) << "HloRematerialization() with memory limit of "
           << HumanReadableNumBytes(memory_limit_bytes);
 
-  XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
+  TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module));
+
+  // Adjust memory limit to account for the output of the entry
+  // computation. This is necessary because the per-computation accounting in
+  // MemoryUsageTracker do not include output as these are typically allocated
+  // by the caller.
+  int64 module_output_size = 0;
+  ShapeUtil::ForEachSubshape(
+      module->entry_computation()->root_instruction()->shape(),
+      [&module_output_size, this](const Shape& subshape,
+                                  const ShapeIndex& /*index*/) {
+        module_output_size += size_function_(subshape);
+        return Status::OK();
+      })
+      .IgnoreError();
 
+  const int64 adjusted_memory_limit_bytes =
+      memory_limit_bytes - module_output_size;
+  VLOG(1) << "Adjusted memory limit accounting for output ("
+          << HumanReadableNumBytes(module_output_size)
+          << "): " << HumanReadableNumBytes(adjusted_memory_limit_bytes);
+
+  XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
   // Create initial sequence of HLO instructions.
   TF_ASSIGN_OR_RETURN(*sequence,
                       CreateMemoryMinimizingSequence(
                           *module, [this](const LogicalBuffer& buffer) {
                             return size_function_(buffer.shape());
                           }));
-
   // Compute peak memory usage of all computations in the module called in a
   // sequential context.
-  TF_ASSIGN_OR_RETURN(call_graph_, CallGraph::Build(module));
+  call_graph_ = CallGraph::Build(module);
   TF_RETURN_IF_ERROR(call_graph_->VisitNodes(
       [this, sequence](const CallGraphNode& node) -> Status {
         if (node.context() == CallContext::kSequential) {
@@ -776,9 +1199,15 @@ StatusOr<bool> HloRematerialization::Run(
         return Status::OK();
       }));
 
+  // The peak memory usage of the module equals the peak memory use of the entry
+  // computation plus the output size of the computation. This is because the
+  // peak memory for a computation does not include the output as this is
+  // typically accounted for in the caller.
+  const int64 before_peak_memory =
+      computation_peak_memory_.at(module->entry_computation()) +
+      module_output_size;
   VLOG(1) << "Peak memory usage of module (before): "
-          << HumanReadableNumBytes(
-                 computation_peak_memory_[module->entry_computation()]);
+          << HumanReadableNumBytes(before_peak_memory);
 
   // Run cost analysis. Operation cost is used in the heuristic for selecting
   // instructions for rematerialization.
@@ -787,9 +1216,9 @@ StatusOr<bool> HloRematerialization::Run(
 
   // Subcomputations called by the entry computation will also be
   // rematerialized.
-  TF_ASSIGN_OR_RETURN(bool changed,
-                      RematerializeComputation(module->entry_computation(),
-                                               sequence, memory_limit_bytes));
+  TF_ASSIGN_OR_RETURN(bool changed, RematerializeComputation(
+                                        module->entry_computation(), sequence,
+                                        adjusted_memory_limit_bytes));
 
   // Rematerialization can introduce dead code. This occurs if all uses of an
   // instruction are replaced with rematerializations of the instruction.
@@ -824,19 +1253,38 @@ StatusOr<bool> HloRematerialization::Run(
                    computation->instruction_count());
     }
   }
-
-  VLOG(1) << "Peak memory usage of module (after): "
-          << HumanReadableNumBytes(
-                 computation_peak_memory_[module->entry_computation()]);
+  VLOG(1) << "Rematerialized " << instructions_rematerialized_
+          << " instructions in module " << module->name() << "; "
+          << net_instructions_added_ << " net instructions added";
+  const int64 current_peak_memory =
+      computation_peak_memory_.at(module->entry_computation()) +
+      module_output_size;
+  VLOG(1) << "Peak memory usage of module now "
+          << HumanReadableNumBytes(current_peak_memory) << " ("
+          << current_peak_memory << " bytes), was "
+          << HumanReadableNumBytes(before_peak_memory) << " ("
+          << before_peak_memory << " bytes)";
+  const int64 reduced_peak_memory = before_peak_memory - current_peak_memory;
+  VLOG(1) << "Reduced peak memory by "
+          << HumanReadableNumBytes(reduced_peak_memory) << " ("
+          << reduced_peak_memory << " bytes)";
 
   XLA_VLOG_LINES(3, "After HloRematerialization:\n" + module->ToString());
 
+  if (current_peak_memory > memory_limit_bytes) {
+    LOG(WARNING) << "Can't reduce memory use below "
+                 << HumanReadableNumBytes(memory_limit_bytes)
+                 << " by rematerialization (only reduced to "
+                 << HumanReadableNumBytes(current_peak_memory) << ")";
+  }
+
   return changed;
 }
 
 /* static */ StatusOr<bool> HloRematerialization::RematerializeAndSchedule(
-    const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
-    HloModule* hlo_module, SequentialHloOrdering::HloModuleSequence* sequence) {
+    const HloRematerialization::ShapeSizeFunction& size_function,
+    int64 memory_limit_bytes, HloModule* hlo_module,
+    SequentialHloOrdering::HloModuleSequence* sequence) {
   HloRematerialization remat(size_function);
   return remat.Run(hlo_module, sequence, memory_limit_bytes);
 }
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 86e1998b89454f75b1c10d0de2118fd1034c134d..1693f93183bc59c343e3c765cb4051566d4377ef 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -21,6 +21,7 @@
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 
 namespace xla {
 
@@ -108,6 +109,23 @@ class HloRematerialization {
   // occurs.
   tensorflow::gtl::FlatMap<const HloComputation*, int64>
       computation_peak_memory_;
+
+  std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
+
+  // Set of computations which have had rematerialization
+  // applied. Rematerialization is only applied once per computation.
+  tensorflow::gtl::FlatSet<const HloComputation*> rematerialized_computations_;
+
+  // Count of the total instructions rematerialized.
+  int64 instructions_rematerialized_ = 0;
+
+  // Count of the net instructions added to the HLO module by
+  // rematerialization. This can be different than instructions_rematerialized_
+  // because some rematerializations are effectively moves in the HLO
+  // schedule. In these cases, the rematerialization instruction replaces all
+  // uses of the original instruction and the original instruction is
+  // dead. Hence, no net instructions were added.
+  int64 net_instructions_added_ = 0;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 0a4f2776891cfc932b4fc0627daaa9b5408f420a..2a1d728bc84067e6ad7f1f622216ab39b2b474d3 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -30,12 +31,16 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class HloOrderingTest : public HloTestBase {
+namespace op = xla::testing::opcode_matchers;
+
+using ::testing::_;
+
+class HloRematerializationTest : public HloTestBase {
  protected:
   // Creates and returns a computation which can benefit from
   // rematerialization. The computation looks like:
   //
-  //   F32[1] %param = {...}
+  //   F32[] %param = {...}
   //   F32[1024] %bcast = broadcast(%param)
   //   F32[1024] %negate = negate(%bcast)
   //   F32[2048] %concat_1 = concat({%negate, %negate})
@@ -52,7 +57,7 @@ class HloOrderingTest : public HloTestBase {
       const string& suffix = "") {
     auto builder = HloComputation::Builder(TestName() + suffix);
     auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+        HloInstruction::CreateParameter(0, scalar_shape_, "param"));
     auto bcast = builder.AddInstruction(
         HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
     auto negate = builder.AddInstruction(
@@ -77,7 +82,7 @@ class HloOrderingTest : public HloTestBase {
   // Creates and returns a computation which includes a while and can benefit
   // from rematerialization. The computation looks like:
   //
-  //   F32[1] %param = {...}
+  //   F32[] %param = {...}
   //   F32[1024] %bcast = broadcast(%param)
   //   F32[1] %slice_1 = slice(%bcast, {0:1})
   //   F32[1] %while = while(%slice_1, while_body, while_cond)
@@ -93,7 +98,7 @@ class HloOrderingTest : public HloTestBase {
       const string& suffix = "") {
     auto builder = HloComputation::Builder(TestName() + suffix);
     auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+        HloInstruction::CreateParameter(0, scalar_shape_, "param"));
     auto bcast = builder.AddInstruction(
         HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
     auto slice_1 = builder.AddInstruction(
@@ -127,13 +132,14 @@ class HloOrderingTest : public HloTestBase {
   }
 
   // Various shapes used in the canned computations.
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(xla::F32, {});
   const Shape vec1_shape_ = ShapeUtil::MakeShape(xla::F32, {1});
   const Shape vec1024_shape_ = ShapeUtil::MakeShape(xla::F32, {1024});
 };
 
 // Test rematerialization of a single computation produced by
 // MakeRematerializableComputation.
-TEST_F(HloOrderingTest, SingleComputation) {
+TEST_F(HloRematerializationTest, SingleComputation) {
   HloModule module(TestName());
   HloComputation* computation =
       module.AddEntryComputation(MakeRematerializableComputation());
@@ -141,11 +147,9 @@ TEST_F(HloOrderingTest, SingleComputation) {
   // Find and save the original broadcast instruction which should be
   // rematerialized.
   const HloInstruction* slice = computation->root_instruction();
-  ASSERT_EQ(HloOpcode::kSlice, slice->opcode());
+  ASSERT_THAT(slice, op::Slice(op::Concatenate(op::Broadcast(_), _)));
   const HloInstruction* concat = slice->operand(0);
-  ASSERT_EQ(HloOpcode::kConcatenate, concat->opcode());
   const HloInstruction* bcast = concat->operand(0);
-  ASSERT_EQ(HloOpcode::kBroadcast, bcast->opcode());
 
   SequentialHloOrdering::HloModuleSequence sequence;
   // Computation requires 16KB without rematerialization, but uses only 12KB
@@ -161,8 +165,7 @@ TEST_F(HloOrderingTest, SingleComputation) {
 
   // The broadcast should have been rematerialized.
   const HloInstruction* remat_bcast = concat->operand(0);
-  EXPECT_EQ(HloOpcode::kBroadcast, remat_bcast->opcode());
-  EXPECT_NE(bcast, remat_bcast);
+  EXPECT_THAT(remat_bcast, op::Broadcast(::testing::Ne(bcast)));
 
   // The rematerialized broadcast should be immediate before the concat in the
   // sequence.
@@ -175,7 +178,7 @@ TEST_F(HloOrderingTest, SingleComputation) {
 // Test rematerialization of a single computation produced by
 // MakeRematerializableComputation but with a sufficiently high memory limit
 // such that no instructions are rematerialized.
-TEST_F(HloOrderingTest, SingleComputationNoRematerialization) {
+TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
   HloModule module(TestName());
   HloComputation* computation =
       module.AddEntryComputation(MakeRematerializableComputation());
@@ -199,7 +202,7 @@ TEST_F(HloOrderingTest, SingleComputationNoRematerialization) {
 // only one computation needs to have an instruction rematerialized. The entry
 // computation should be the one chosen because rematerialization in the while
 // will presumably be more expensive.
-TEST_F(HloOrderingTest, RematerializeAroundWhile) {
+TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
   HloModule module(TestName());
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
@@ -237,7 +240,7 @@ TEST_F(HloOrderingTest, RematerializeAroundWhile) {
 // Test rematerialization of a computation which calls another computation via a
 // while. Both the entry computation and while body computation should have
 // computations rematerialized.
-TEST_F(HloOrderingTest, RematerializeEntryAndWhileBody) {
+TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
   HloModule module(TestName());
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
@@ -271,7 +274,7 @@ TEST_F(HloOrderingTest, RematerializeEntryAndWhileBody) {
 
 // Test rematerialization of a doubly nested computation. All computations
 // should have an instruction rematerialized.
-TEST_F(HloOrderingTest, RematerializeNestedComputations) {
+TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   HloModule module(TestName());
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
@@ -311,6 +314,203 @@ TEST_F(HloOrderingTest, RematerializeNestedComputations) {
   EXPECT_EQ(inner_computation->instruction_count(), 8);
 }
 
+TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
+  // Test that a single instruction is rematerialized several times. Module:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] %bcast = broadcast(%param)
+  //   F32[1024] %add_1 = add(%bcast, bcast)
+  //   F32[1024] %call_1 = call(Subcomputation, {%add_1})
+  //   F32[1024] %add_2 = add(%bcast, call_1)
+  //   F32[1024] %call_2 = call(SubComputation, {%add_2})
+  //   F32[1024] %add_3 = add(%bcast, call_2)
+  //   F32[1024] %call_3 = call(Subcomputation, {%add_3})
+  //   F32[1024] %add_4 = add(%bcast, call_3)
+  //
+  // Subcomputation:
+  //   F32[1024] %param = {...}
+  //   F32[2048] %concat = concat({%param, %param})
+  //   F32[1024] %slice = slice(%concat)
+  //
+  // The value %bcast is live across each call of Subcomputation (which requires
+  // 8KB) though the value is not used in the calls. Rematerializing %bcast
+  // across these calls reduces peak memory use from ~20KB down to ~16KB.
+  HloModule module(TestName());
+
+  HloComputation* subcomputation = nullptr;
+  {
+    auto builder = HloComputation::Builder(TestName() + ".subcomputation");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, vec1024_shape_, "param"));
+    auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {2048}), {param, param},
+        /*dimension=*/0));
+    builder.AddInstruction(HloInstruction::CreateSlice(
+        vec1024_shape_, concat, /*start_indices=*/{0},
+        /*limit_indices=*/{1024}));
+    subcomputation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, bcast));
+  auto call_1 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_1}, subcomputation));
+  auto add_2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_1));
+  auto call_2 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_2}, subcomputation));
+  auto add_3 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_2));
+  auto call_3 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_3}, subcomputation));
+  auto add_4 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_3));
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build());
+
+  auto count_broadcasts = [](const HloComputation* computation) {
+    int64 bcast_count = 0;
+    for (auto& instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kBroadcast) {
+        bcast_count++;
+      }
+    }
+    return bcast_count;
+  };
+
+  // Before rematerialization there should be a single broadcast instruction in
+  // the graph.
+  EXPECT_EQ(count_broadcasts(entry_computation), 1);
+  EXPECT_EQ(entry_computation->instruction_count(), 9);
+
+  EXPECT_EQ(add_2->operand(0), bcast);
+  EXPECT_EQ(add_3->operand(0), bcast);
+  EXPECT_EQ(add_4->operand(0), bcast);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit some where between 24KB (initial peak memory including
+  // parameter and output) and 20KB (peak memory possible with
+  // rematerialization).
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed, HloRematerialization::RematerializeAndSchedule(
+                        ByteSizeOf,
+                        /*memory_limit_bytes=*/22 * 1024, &module, &sequence));
+  EXPECT_TRUE(changed);
+
+  // The broadcast should have been rematerialized 3 times.
+  EXPECT_EQ(count_broadcasts(entry_computation), 4);
+  EXPECT_EQ(entry_computation->instruction_count(), 12);
+
+  // The operands of add_2, add_3, and add_4 should all be rematerialized
+  // broadcasts.
+  EXPECT_NE(add_2->operand(0), bcast);
+  EXPECT_THAT(add_2->operand(0), op::Broadcast(param));
+  EXPECT_NE(add_3->operand(0), bcast);
+  EXPECT_THAT(add_3->operand(0), op::Broadcast(param));
+  EXPECT_NE(add_4->operand(0), bcast);
+  EXPECT_THAT(add_4->operand(0), op::Broadcast(param));
+}
+
+class IndirectUseTest : public HloRematerializationTest,
+                        public ::testing::WithParamInterface<bool> {};
+
+TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
+  // Test that an rematerializable instruction is not rematerialized if it has
+  // an indirect use. Test is parameterized on whether the value has an indirect
+  // use, and the instruction should be rematerialized iff the value has no
+  // indirect use. Module:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] %bcast = broadcast(%param)
+  //   F32[1024] %add_1 = add(%bcast, bcast)
+  //   F32[1024] %call = call(Subcomputation, {%add_1})
+  //   F32[1024] %add_2 = add(%bcast, call)
+  //   {F32[1024], F32[1024]} %tuple = tuple(%bcast, %add_2)
+  //   F32[1024] %gte = GetTupleElememt(%tuple, 0)
+  //   F32[1024] %negate = negate(%gte)
+  //
+  // Subcomputation:
+  //   F32[1024] %param = {...}
+  //   F32[2048] %concat = concat({%param, %param})
+  //   F32[1024] %slice = slice(%concat)
+  //
+  // The value %bcast is live across the call and rematerialization of %bcast
+  // across that point would reduce peak memory use by 4KB. However, %bcast is
+  // used indirectly in the %negate so rematerialization should not happen.
+  //
+  // This test is parameterized on whether the broadcast has an indirect use or
+  // not. The indirect use is controlled by the index of the GetTupleElement
+  // instruction. If the element is 0, then the %negate operand aliases %bcast
+  // (ie %bcast is used indirectly by %negate), otherwise the %negate operand
+  // aliases %add_2.
+  const bool indirectly_used = GetParam();
+  HloModule module(TestName());
+
+  HloComputation* subcomputation = nullptr;
+  {
+    auto builder = HloComputation::Builder(TestName() + ".subcomputation");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, vec1024_shape_, "param"));
+    auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
+        ShapeUtil::MakeShape(xla::F32, {2048}), {param, param},
+        /*dimension=*/0));
+    builder.AddInstruction(HloInstruction::CreateSlice(
+        vec1024_shape_, concat, /*start_indices=*/{0},
+        /*limit_indices=*/{1024}));
+    subcomputation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto bcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, bcast));
+  auto call_1 = builder.AddInstruction(
+      HloInstruction::CreateCall(vec1024_shape_, {add_1}, subcomputation));
+  auto add_2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, bcast, call_1));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({bcast, add_2}));
+  auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      vec1024_shape_, tuple, indirectly_used ? 0 : 1));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, gte));
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(entry_computation->instruction_count(), 8);
+
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit some where between 24KB (initial peak memory including
+  // parameter and output) and 20KB (peak memory possible with
+  // rematerialization).
+  TF_ASSIGN_OR_ASSERT_OK(
+      bool changed, HloRematerialization::RematerializeAndSchedule(
+                        ByteSizeOf,
+                        /*memory_limit_bytes=*/22 * 1024, &module, &sequence));
+  // Rematerialization should only occur if the rematerializable instruction has
+  // no indirect uses.
+  if (indirectly_used) {
+    EXPECT_FALSE(changed);
+    EXPECT_EQ(entry_computation->instruction_count(), 8);
+  } else {
+    EXPECT_TRUE(changed);
+    EXPECT_EQ(entry_computation->instruction_count(), 9);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(IndirectUseTestInstantiation, IndirectUseTest,
+                        ::testing::Values(true, false));
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b14eca5d1b36fbe8b863cb32d64c79fb56ce761
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+LIcensed under the Apache License, Version 2.0 (the "License");
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+using ::tensorflow::GraphDef;
+using ::tensorflow::NodeDef;
+using ::tensorflow::TensorShapeProto;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+using ::tensorflow::str_util::Join;
+
+namespace xla {
+namespace hlo_graph_dumper {
+namespace {
+
+string GetOpDefName(const HloInstruction* instruction) {
+  string name = StrCat("hlo-", HloOpcodeString(instruction->opcode()));
+  tensorflow::str_util::TitlecaseString(&name, "-");
+  name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
+
+  if (instruction->opcode() == HloOpcode::kFusion) {
+    string fusion_name = ToString(instruction->fusion_kind());
+    StrAppend(&name, tensorflow::StringPiece(fusion_name).substr(1));
+  }
+  return name;
+}
+
+TensorShapeProto GetTensorShape(const HloInstruction* instruction) {
+  TensorShapeProto tensor_shape;
+  const Shape& shape = instruction->shape();
+  for (auto dim : shape.dimensions()) {
+    tensor_shape.add_dim()->set_size(dim);
+  }
+  return tensor_shape;
+}
+
+}  // namespace
+
+void CleanNodeName(string* name) {
+  name->erase(std::remove(name->begin(), name->end(), '%'), name->end());
+  const string chars_to_replace = "<>[]";
+  auto pred = [&](char c) {
+    return std::find(chars_to_replace.begin(), chars_to_replace.end(), c) !=
+           chars_to_replace.end();
+  };
+  std::replace_if(name->begin(), name->end(), pred, '_');
+}
+
+Status HloTfGraphBuilder::AddComputation(const HloComputation& computation) {
+  VLOG(2) << "Adding computation " << computation.name();
+  for (auto embedded : computation.MakeEmbeddedComputationsList()) {
+    for (auto& instruction : embedded->instructions()) {
+      TF_RETURN_IF_ERROR(AddInstruction(instruction.get()));
+    }
+  }
+  for (auto& instruction : computation.instructions()) {
+    TF_RETURN_IF_ERROR(AddInstruction(instruction.get()));
+  }
+  return Status::OK();
+}
+
+const GraphDef& HloTfGraphBuilder::GetGraphDef() const { return graph_def_; }
+
+const string& HloTfGraphBuilder::GetNodeNameForInstruction(
+    const HloInstruction* instruction) {
+  if (ContainsKey(instruction_to_node_name_, instruction)) {
+    return instruction_to_node_name_[instruction];
+  }
+  string node_name;
+  // If an instruction is fused, put it in the subgraph of the fusion;
+  // otherwise, put it in the computation subgraph.
+  if (instruction->IsFused()) {
+    node_name = GetNodeNameForInstruction(instruction->fusion_instruction());
+  } else {
+    node_name = instruction->parent()->name();
+    if (!instruction->metadata().op_name().empty()) {
+      // Always make computations contain TF ops but not the other way around.
+      StrAppend(&node_name, "/", instruction->metadata().op_name());
+    }
+  }
+  string instruction_name = instruction->name();
+  if (instruction->opcode() == HloOpcode::kParameter) {
+    StrAppend(&instruction_name, ".", instruction->parameter_number());
+  }
+  StrAppend(&node_name, "/", instruction_name);
+  CleanNodeName(&node_name);
+  auto ret =
+      instruction_to_node_name_.insert(std::make_pair(instruction, node_name));
+  CHECK(ret.second);
+  return ret.first->second;
+}
+
+void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
+                                     NodeDef* node_def) const {
+  auto& attrs = *node_def->mutable_attr();
+
+  // Set the number of arguments for instructions that have variadic operands.
+  if (HloOpcodeIsVariadic(instruction->opcode())) {
+    tensorflow::AttrValue attr_value;
+    attr_value.set_i(instruction->operands().size());
+    attrs["arg_num"] = attr_value;
+  }
+
+  // Set the node type.
+  attrs["type"].set_s(
+      xla::PrimitiveType_Name(instruction->shape().element_type()));
+
+  // Set the framework op (e.g. Tensorflow op) that generated this XLA op.
+  attrs["tf_op_type"].set_s(instruction->metadata().op_type());
+  attrs["tf_op_name"].set_s(instruction->metadata().op_name());
+
+  // Set the shape of the output tensor. "_output_shapes" is a special attribute
+  // name used by Tensorboard for shapes of output tensors.
+  tensorflow::AttrValue shapes;
+  *shapes.mutable_list()->add_shape() = GetTensorShape(instruction);
+  attrs["_output_shapes"] = shapes;
+
+  // Set the layout.
+  if (LayoutUtil::HasLayout(instruction->shape())) {
+    string layout_string;
+    if (ShapeUtil::IsTuple(instruction->shape())) {
+      // For tuples, emit the full shape because the layout of a tuple is not
+      // represented in a single Layout field.
+      layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
+    } else {
+      layout_string = StrCat(
+          "{", Join(instruction->shape().layout().minor_to_major(), ","), "}");
+    }
+    attrs["layout"].set_s(layout_string);
+  }
+
+  // Set op-specific attributes.
+  switch (instruction->opcode()) {
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kReduce:
+    case HloOpcode::kReverse:
+    case HloOpcode::kTranspose:
+      for (auto dim : instruction->dimensions()) {
+        attrs["dims"].mutable_list()->add_i(dim);
+      }
+      break;
+    case HloOpcode::kGetTupleElement:
+      attrs["index"].set_i(instruction->tuple_index());
+      break;
+    case HloOpcode::kRng:
+      attrs["dist"].set_s(
+          RandomDistribution_Name(instruction->random_distribution()));
+      break;
+    case HloOpcode::kConstant:
+      if (ShapeUtil::IsScalar(instruction->shape())) {
+        attrs["value"].set_s(
+            LiteralUtil::GetAsString(instruction->literal(), {}));
+      }
+      break;
+    case HloOpcode::kCustomCall:
+      attrs["custom_call_target"].set_s(instruction->custom_call_target());
+      break;
+    default:
+      break;
+  }
+}
+
+Status HloTfGraphBuilder::AddInstruction(const HloInstruction* instruction) {
+  if (!visited_instructions_.insert(instruction).second) {
+    // Skip instructions that have already been added.
+    return Status::OK();
+  }
+
+  NodeDef* node_def = graph_def_.add_node();
+  node_def->set_name(GetNodeNameForInstruction(instruction));
+  node_def->set_op(GetOpDefName(instruction));
+  SetNodeAttrs(instruction, node_def);
+  if (instruction->opcode() == HloOpcode::kFusion) {
+    for (auto& fused_instruction : instruction->fused_instructions()) {
+      TF_RETURN_IF_ERROR(AddInstruction(fused_instruction.get()));
+    }
+  }
+  // Add all edges including control edges.
+  for (unsigned i = 0; i < instruction->operands().size(); ++i) {
+    *node_def->add_input() = GetNodeNameForInstruction(instruction->operand(i));
+  }
+  // Called computations are control dependencies.
+  for (const auto* called_computation : instruction->called_computations()) {
+    *node_def->add_input() = StrCat(
+        "^", GetNodeNameForInstruction(called_computation->root_instruction()));
+  }
+  return Status::OK();
+}
+
+}  // namespace hlo_graph_dumper
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2c578af912ac0b777d1bc72a198504735a6b845
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace xla {
+namespace hlo_graph_dumper {
+
+// This constructs a tensorflow graph for HLO computations.
+class HloTfGraphBuilder {
+ public:
+  // Adds a computation to the graph.
+  Status AddComputation(const HloComputation& computation);
+
+  const tensorflow::GraphDef& GetGraphDef() const;
+
+ private:
+  // Gets the node name of an instruction. The node name is hierarchical. For
+  // example, if an instruction is fused, it will be put in a subgraph of the
+  // fusion instruction.
+  const string& GetNodeNameForInstruction(const HloInstruction* instruction);
+
+  void SetNodeAttrs(const HloInstruction* instruction,
+                    tensorflow::NodeDef* node_def) const;
+
+  Status AddInstruction(const HloInstruction* instruction);
+
+  tensorflow::GraphDef graph_def_;
+  // This records instructions that have been visited.
+  std::unordered_set<const HloInstruction*> visited_instructions_;
+  // A cache that maps instruction to the node name.
+  std::unordered_map<const HloInstruction*, string> instruction_to_node_name_;
+};
+
+// Cleans the node name to make it a valid name in a tensorflow graph.
+void CleanNodeName(string* name);
+
+}  // namespace hlo_graph_dumper
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6041debc4ae0ccbaad99bec9a461b640aeffbccf
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
@@ -0,0 +1,184 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+
+namespace xla {
+namespace hlo_graph_dumper {
+namespace {
+
+using ::tensorflow::GraphDef;
+
+class HloTfGraphBuilderTest : public HloTestBase {
+ protected:
+  HloTfGraphBuilderTest() {}
+  HloTfGraphBuilder generator_;
+
+  // Create a computation which takes a scalar and returns its negation.
+  std::unique_ptr<HloComputation> CreateNegateComputation() {
+    auto builder = HloComputation::Builder("Negate");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, r0f32_, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
+    return builder.Build();
+  }
+
+  // Creates a computation which calls map with the given computation.
+  std::unique_ptr<HloComputation> CreateMapComputation(
+      HloComputation *map_computation) {
+    auto builder = HloComputation::Builder("Map");
+    auto param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, r0f32_, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateMap(r0f32_, {param}, map_computation));
+    return builder.Build();
+  }
+  Shape r0f32_ = ShapeUtil::MakeShape(PrimitiveType::F32, {});
+};
+
+static const tensorflow::AttrValue &GetNodeAttr(const tensorflow::NodeDef &node,
+                                                const string &attr_name) {
+  auto attr = node.attr().find(attr_name);
+  CHECK(attr != node.attr().end());
+  return attr->second;
+}
+
+TEST_F(HloTfGraphBuilderTest, CheckConcatenateDimsAndShapes) {
+  auto builder = HloComputation::Builder("Concatenate");
+  Shape shape = ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2});
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param0"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape, "param1"));
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), {param_1, param_2}, 1));
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 3);
+  const auto &node = graph_def.node(2);
+  EXPECT_EQ(node.name(), "Concatenate/concatenate");
+
+  // Check dimensions.
+  auto dims_value = GetNodeAttr(node, "dims");
+  EXPECT_EQ(dims_value.list().i_size(), 1);
+  EXPECT_EQ(dims_value.list().i(0), 1);
+
+  // Check shapes.
+  auto shape_value = GetNodeAttr(node, "_output_shapes");
+  EXPECT_EQ(shape_value.list().shape_size(), 1);
+  EXPECT_EQ(shape_value.list().shape(0).dim_size(), 2);
+  EXPECT_EQ(shape_value.list().shape(0).dim(0).size(), 2);
+  EXPECT_EQ(shape_value.list().shape(0).dim(1).size(), 4);
+}
+
+TEST_F(HloTfGraphBuilderTest, CheckScalarValue) {
+  auto builder = HloComputation::Builder("Const");
+  HloInstruction *instruction = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0(123)));
+  OpMetadata metadata;
+  metadata.set_op_name("x");
+  metadata.set_op_type("y");
+  instruction->set_metadata(metadata);
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 1);
+  const auto &node = graph_def.node(0);
+  EXPECT_EQ(GetNodeAttr(node, "value").s(), "123");
+  EXPECT_EQ(GetNodeAttr(node, "type").s(), "S32");
+  EXPECT_EQ(GetNodeAttr(node, "tf_op_name").s(), "x");
+  EXPECT_EQ(GetNodeAttr(node, "tf_op_type").s(), "y");
+}
+
+TEST_F(HloTfGraphBuilderTest, SimpleNegateComputation) {
+  auto negate_computation = CreateNegateComputation();
+  TF_CHECK_OK(generator_.AddComputation(*negate_computation));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 2);
+  EXPECT_EQ(graph_def.node(0).name(), "Negate/param0.0");
+  EXPECT_EQ(graph_def.node(0).op(), "HloParameter");
+  EXPECT_EQ(graph_def.node(1).name(), "Negate/negate");
+  EXPECT_EQ(graph_def.node(1).op(), "HloNegate");
+  EXPECT_EQ(graph_def.node(1).input_size(), 1);
+  EXPECT_EQ(graph_def.node(1).input(0), "Negate/param0.0");
+}
+
+TEST_F(HloTfGraphBuilderTest, GreaterThanOrEqualTo) {
+  auto builder = HloComputation::Builder("GE");
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "param0"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32_, "param1"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 3);
+  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
+  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
+  EXPECT_EQ(graph_def.node(2).input_size(), 2);
+  EXPECT_EQ(graph_def.node(2).name(), "GE/greater-than-or-equal-to");
+  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
+}
+
+TEST_F(HloTfGraphBuilderTest, IncorparateTfOpsStructure) {
+  auto builder = HloComputation::Builder("GE");
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "param0"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32_, "param1"));
+  auto ge = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
+  OpMetadata metadata;
+  metadata.set_op_name("x/y");
+  metadata.set_op_type("Y");
+  ge->set_metadata(metadata);
+  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
+  GraphDef graph_def = generator_.GetGraphDef();
+  EXPECT_EQ(graph_def.node_size(), 3);
+  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
+  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
+  EXPECT_EQ(graph_def.node(2).input_size(), 2);
+  EXPECT_EQ(graph_def.node(2).name(), "GE/x/y/greater-than-or-equal-to");
+  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
+}
+
+TEST_F(HloTfGraphBuilderTest, EmbeddedComputationsDiamond) {
+  // Create computations with a diamond-shaped callgraph.
+  auto negate_computation = CreateNegateComputation();
+  auto map1_computation = CreateMapComputation(negate_computation.get());
+  auto map2_computation = CreateMapComputation(negate_computation.get());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "param0"));
+  auto map1 = builder.AddInstruction(
+      HloInstruction::CreateMap(r0f32_, {param}, map1_computation.get()));
+  auto map2 = builder.AddInstruction(
+      HloInstruction::CreateMap(r0f32_, {param}, map2_computation.get()));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, map1, map2));
+  auto computation = builder.Build();
+  TF_CHECK_OK(generator_.AddComputation(*computation));
+  EXPECT_GT(generator_.GetGraphDef().node_size(), 0);
+}
+
+}  // namespace
+}  // namespace hlo_graph_dumper
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 035b570ed3419503ad2325c5fdb46118b5076187..de6081e57e7f27a07b314692c6935ecf3e3c54a9 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -23,7 +23,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
       TF_RET_CHECK(instruction->parent() == computation.get());
       if (instruction->opcode() == HloOpcode::kFusion) {
         for (const auto& fused : instruction->fused_instructions()) {
-          TF_RET_CHECK(fused->parent() == computation.get())
+          TF_RET_CHECK(fused->parent() ==
+                       instruction->fused_instructions_computation())
               << "Fused HLO was missing a parent: " << fused->ToString()
               << " parent: " << fused->parent()
               << " computation: " << computation.get();
diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/inliner_test.cc
index 0054edcf6ab3b5134abbc43a8b326d56919364bc..a8d4ecf2614809d73f7c31eeab29b9e765bdeb4c 100644
--- a/tensorflow/compiler/xla/service/inliner_test.cc
+++ b/tensorflow/compiler/xla/service/inliner_test.cc
@@ -22,13 +22,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
@@ -59,11 +62,11 @@ TEST_F(InlinerTest, MapMax) {
   auto hlo_module = MakeUnique<HloModule>("test_module");
   hlo_module->AddEmbeddedComputation(std::move(max_f32));
   hlo_module->AddEntryComputation(std::move(computation));
-  HloInstruction* root = hlo_module->entry_computation()->root_instruction();
+
   Inliner inliner;
   EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
-  root = hlo_module->entry_computation()->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kMaximum);
+  EXPECT_THAT(hlo_module->entry_computation()->root_instruction(),
+              op::Maximum(lhs, rhs));
 
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
@@ -97,7 +100,7 @@ TEST_F(InlinerTest, MapConstant) {
   Inliner inliner;
   EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   root = hlo_module->entry_computation()->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
+  EXPECT_THAT(root, op::Broadcast(op::Constant()));
 
   // Verify execution on CPU.
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index c162945bcae33f4e94b8fbd3a7e48bacce802925..5069215031bac496967cb446ee27dc3f44297df0 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -130,6 +130,33 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
         computation_->MakeInstructionPostOrder();
     std::vector<HloInstruction*> post_order(post_order_list.begin(),
                                             post_order_list.end());
+
+    std::set<HloInstruction*> all_consumers_fusable;
+    // Find which ops can be fused into all of their operands. We would rather
+    // not fuse an op into only some of its users, as that offers no benefit in
+    // terms of memory bandwidth, but forces us to keep more live values around.
+    for (auto* hlo : post_order) {
+      auto user_fusable_into_hlo = [this, &hlo](HloInstruction* consumer) {
+        if (!consumer->IsFusable()) {
+          return false;
+        }
+        for (int operand_number = 0;
+             operand_number < consumer->operands().size(); ++operand_number) {
+          if (consumer->operand(operand_number) == hlo) {
+            if (!ShouldFuse(consumer, operand_number)) {
+              return false;
+            }
+          }
+        }
+        return true;
+      };
+
+      if (std::all_of(hlo->users().begin(), hlo->users().end(),
+                      user_fusable_into_hlo)) {
+        all_consumers_fusable.insert(hlo);
+      }
+    }
+
     tensorflow::gtl::FlatMap<HloInstruction*, int> post_order_index;
     for (size_t i = 0; i < post_order.size(); ++i) {
       InsertOrDie(&post_order_index, post_order[i], i);
@@ -216,6 +243,12 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
 
       for (int64 i : sorted_operand_numbers) {
         HloInstruction* operand = instruction->mutable_operand(i);
+
+        if (FusionWouldDuplicate(*operand, *instruction) &&
+            (all_consumers_fusable.count(operand) == 0)) {
+          continue;
+        }
+
         if (operand->IsFusable() && ShouldFuse(instruction, i)) {
           HloInstruction* fusion_instruction = Fuse(operand, instruction);
 
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index a4c269f0ebd40b2a1ab46619fec24e76ffd73ff0..9a79e4c38249323b1384192dfed81647d00b77b8 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -15,8 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/instruction_fusion.h"
 
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 
 using InstructionFusionTest = HloTestBase;
@@ -60,7 +63,7 @@ TEST_F(InstructionFusionTest,
       InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
           .Run(module.get())
           .ValueOrDie());
-  EXPECT_EQ(HloOpcode::kFusion, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Fusion());
 }
 
 TEST_F(InstructionFusionTest,
@@ -80,7 +83,7 @@ TEST_F(InstructionFusionTest,
       InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
           .Run(module.get())
           .ValueOrDie());
-  EXPECT_EQ(HloOpcode::kFusion, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Fusion());
 }
 
 TEST_F(InstructionFusionTest,
@@ -100,7 +103,7 @@ TEST_F(InstructionFusionTest,
       InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
           .Run(module.get())
           .ValueOrDie());
-  EXPECT_EQ(HloOpcode::kFusion, computation->root_instruction()->opcode());
+  EXPECT_THAT(computation->root_instruction(), op::Fusion());
 }
 
 TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) {
@@ -151,4 +154,23 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) {
           .ValueOrDie());
 }
 
+TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) {
+  HloComputation::Builder builder(TestName());
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {16, 16}), "0"));
+  HloInstruction* unary1 = builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShape(S32, {}), HloOpcode::kFloor, param0));
+  builder.AddInstruction(HloInstruction::CreateSend(unary1, 0));
+  HloInstruction* unary2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShape(S32, {}), HloOpcode::kAbs, unary1));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+  EXPECT_EQ(unary2, computation->root_instruction());
+  EXPECT_FALSE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie());
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 5e7bd4a7ce8a1152973979d4a8fdb790a7fbd219..a8366ae794932464d11e9a44a8282c5b9a8a9013 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -60,8 +60,9 @@ std::ostream& operator<<(std::ostream& out,
 }
 
 BufferLayoutConstraint::BufferLayoutConstraint(const Layout& layout,
-                                               const LogicalBuffer& buffer)
-    : layout_(layout), buffer_(&buffer) {
+                                               const LogicalBuffer& buffer,
+                                               bool mandatory)
+    : LayoutConstraint(mandatory), layout_(layout), buffer_(&buffer) {
   CHECK(LayoutUtil::ValidateLayoutForShape(layout, buffer.shape()).ok());
 }
 
@@ -73,8 +74,9 @@ string BufferLayoutConstraint::ToString() const {
 
 OperandLayoutConstraint::OperandLayoutConstraint(
     const ShapeLayout& shape_layout, const HloInstruction* instruction,
-    int64 operand_no)
-    : shape_layout_(shape_layout),
+    int64 operand_no, bool mandatory)
+    : LayoutConstraint(mandatory),
+      shape_layout_(shape_layout),
       instruction_(instruction),
       operand_no_(operand_no) {
   CHECK(shape_layout_.LayoutIsSet());
@@ -124,7 +126,8 @@ bool LayoutConstraints::OperandBufferForwarded(
 }
 
 Status LayoutConstraints::SetBufferLayout(const Layout& layout,
-                                          const LogicalBuffer& buffer) {
+                                          const LogicalBuffer& buffer,
+                                          bool mandatory) {
   VLOG(3) << "SetBufferLayout : " << buffer << " : "
           << LayoutUtil::HumanString(layout);
 
@@ -139,26 +142,38 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout,
   TF_RETURN_IF_ERROR(
       LayoutUtil::ValidateLayoutForShape(layout, buffer.shape()));
 
-  const Layout* curr_layout = BufferLayout(buffer);
-  if (curr_layout != nullptr) {
-    if (!LayoutUtil::Equal(*curr_layout, layout)) {
+  const BufferLayoutConstraint* curr_constraint =
+      GetBufferLayoutConstraint(buffer);
+  if (curr_constraint != nullptr) {
+    if (LayoutUtil::Equal(curr_constraint->layout(), layout)) {
+      // New constraint matches existing constraint. Nothing to do.
+      return Status::OK();
+    }
+    if (curr_constraint->mandatory()) {
       return FailedPrecondition(
           "Buffer %s already has the layout constraint %s, cannot add "
           "incompatible constraint %s",
           buffer.ToString().c_str(),
-          LayoutUtil::HumanString(*curr_layout).c_str(),
+          LayoutUtil::HumanString(curr_constraint->layout()).c_str(),
           LayoutUtil::HumanString(layout).c_str());
     }
-    // New constraint matches existing constraint. Nothing to do.
-    return Status::OK();
   }
 
-  auto new_constraint_it = buffer_constraints_.insert(
-      {&buffer, BufferLayoutConstraint(layout, buffer)});
-  added_constraints_.push_back(&new_constraint_it.first->second);
+  auto iter = buffer_constraints_.find(&buffer);
+  bool overwrite = iter != buffer_constraints_.end();
+  if (!overwrite) {
+    iter = buffer_constraints_
+               .insert(std::make_pair(
+                   &buffer, BufferLayoutConstraint(layout, buffer, mandatory)))
+               .first;
+  } else {
+    iter->second = BufferLayoutConstraint(layout, buffer, /*mandatory=*/true);
+  }
+  added_constraints_.push_back(&iter->second);
 
   // Remove buffer from the set of unconstrained buffers.
-  TF_RET_CHECK(unconstrained_buffer_ids_.count(buffer.id()) == 1);
+  TF_RET_CHECK(unconstrained_buffer_ids_.count(buffer.id()) ==
+               static_cast<int>(!overwrite));
   unconstrained_buffer_ids_.erase(buffer.id());
 
   return Status::OK();
@@ -166,23 +181,27 @@ Status LayoutConstraints::SetBufferLayout(const Layout& layout,
 
 Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout,
                                            const HloInstruction* instruction,
-                                           int64 operand_no) {
+                                           int64 operand_no, bool mandatory) {
   VLOG(3) << "SetOperandLayout : " << instruction->name() << ", operand "
           << operand_no << " : "
           << ShapeUtil::HumanStringWithLayout(shape_with_layout);
 
-  const ShapeLayout* curr_shape_layout = OperandLayout(instruction, operand_no);
+  const OperandLayoutConstraint* curr_shape_layout =
+      GetOperandLayoutConstraint(instruction, operand_no);
   if (curr_shape_layout != nullptr) {
-    if (!curr_shape_layout->MatchesLayoutInShape(shape_with_layout)) {
+    if (curr_shape_layout->shape_layout().MatchesLayoutInShape(
+            shape_with_layout)) {
+      // New constraint matches existing constraint. Nothing to do.
+      return Status::OK();
+    }
+    if (curr_shape_layout->mandatory()) {
       return FailedPrecondition(
           "Operand %lld of instruction %s already has a layout constraint "
           "%s, cannot add incompatible constraint %s",
           operand_no, instruction->name().c_str(),
-          curr_shape_layout->ToString().c_str(),
+          curr_shape_layout->shape_layout().ToString().c_str(),
           ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str());
     }
-    // New constraint matches existing constraint. Nothing to do.
-    return Status::OK();
   }
 
   // If any buffers in the operand occur in the output of the instruction, then
@@ -196,22 +215,31 @@ Status LayoutConstraints::SetOperandLayout(const Shape& shape_with_layout,
   }
 
   auto key = std::make_pair(instruction, operand_no);
-  auto new_constraint_it = operand_constraints_.insert(
-      {key, OperandLayoutConstraint(ShapeLayout(shape_with_layout), instruction,
-                                    operand_no)});
-  added_constraints_.push_back(&new_constraint_it.first->second);
+  auto iter = operand_constraints_.find(key);
+  if (iter == operand_constraints_.end()) {
+    auto pair = std::make_pair(
+        key, OperandLayoutConstraint(ShapeLayout(shape_with_layout),
+                                     instruction, operand_no, mandatory));
+    iter = operand_constraints_.insert(pair).first;
+  } else {
+    iter->second =
+        OperandLayoutConstraint(ShapeLayout(shape_with_layout), instruction,
+                                operand_no, /*mandatory=*/true);
+  }
+  added_constraints_.push_back(&iter->second);
 
   return Status::OK();
 }
 
 Status LayoutConstraints::SetArrayOperandLayout(
-    const Layout& layout, const HloInstruction* instruction, int64 operand_no) {
+    const Layout& layout, const HloInstruction* instruction, int64 operand_no,
+    bool mandatory) {
   const HloInstruction* operand = instruction->operand(operand_no);
   TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
   Shape shape(operand->shape());
   *shape.mutable_layout() = layout;
   TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutInShape(shape));
-  return SetOperandLayout(shape, instruction, operand_no);
+  return SetOperandLayout(shape, instruction, operand_no, mandatory);
 }
 
 Status LayoutConstraints::SetResultLayout(const Shape& shape_with_layout) {
@@ -274,15 +302,29 @@ Status LayoutConstraints::SetInstructionLayout(
 
 const Layout* LayoutConstraints::BufferLayout(
     const LogicalBuffer& buffer) const {
+  if (const auto* constraint = GetBufferLayoutConstraint(buffer)) {
+    return &constraint->layout();
+  }
+  return nullptr;
+}
+const BufferLayoutConstraint* LayoutConstraints::GetBufferLayoutConstraint(
+    const LogicalBuffer& buffer) const {
   auto it = buffer_constraints_.find(&buffer);
-  return it == buffer_constraints_.end() ? nullptr : &it->second.layout();
+  return it == buffer_constraints_.end() ? nullptr : &it->second;
 }
 
 const ShapeLayout* LayoutConstraints::OperandLayout(
     const HloInstruction* instruction, int64 operand_no) const {
+  if (const auto* constraint =
+          GetOperandLayoutConstraint(instruction, operand_no)) {
+    return &constraint->shape_layout();
+  }
+  return nullptr;
+}
+const OperandLayoutConstraint* LayoutConstraints::GetOperandLayoutConstraint(
+    const HloInstruction* instruction, int64 operand_no) const {
   auto it = operand_constraints_.find(std::make_pair(instruction, operand_no));
-  return it == operand_constraints_.end() ? nullptr
-                                          : &it->second.shape_layout();
+  return it == operand_constraints_.end() ? nullptr : &it->second;
 }
 
 const ShapeLayout* LayoutConstraints::ResultLayout() const {
@@ -343,7 +385,8 @@ Status LayoutAssignment::AddMandatoryConstraints(
       // Constrain the input to the Outfeed instruction to be the expected
       // layout of the Outfeed.
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          instruction->outfeed_shape(), instruction.get(), 0));
+          instruction->outfeed_shape(), instruction.get(), 0,
+          /*mandatory=*/true));
     } else if (instruction->opcode() == HloOpcode::kParameter) {
       // Parameter layouts must match the respective layout in
       // ComputationLayout.
@@ -375,7 +418,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       for (int64 i = 0; i < instruction->operand_count(); ++i) {
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
             called_computation_layout.parameter_layout(i).shape(),
-            instruction.get(), i));
+            instruction.get(), i, /*mandatory=*/true));
       }
     } else if (instruction->opcode() == HloOpcode::kWhile) {
       // Layout of input and output of kWhile instruction must be equal and must
@@ -426,7 +469,8 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
           body_layout.result_shape(), instruction.get()));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-          body_layout.result_shape(), instruction.get(), 0));
+          body_layout.result_shape(), instruction.get(), 0,
+          /*mandatory=*/true));
     } else if (instruction->opcode() == HloOpcode::kCustomCall) {
       // Add constraints for kCustomCall instruction operands and instructions.
       // For now we only support row major layouts for all inputs and outputs.
@@ -450,7 +494,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
 
         Shape row_major_operand_shape(row_major_shape(operand_shape));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-            row_major_operand_shape, instruction.get(), i));
+            row_major_operand_shape, instruction.get(), i, /*mandatory=*/true));
       }
     }
   }
@@ -659,44 +703,6 @@ LayoutAssignment::LayoutAssignment(ComputationLayout* entry_computation_layout)
   }
 }
 
-namespace {
-
-// Given a pemutation of `{0, 1, ..., n}` `indices`, returns a permutation of
-// `{0, 1, ..., n - to_delete.size() + to_insert.size()}` by deleting the
-// indices `to_delete` wherever in `indices` they are, and inserting the indices
-// `to_insert` arbitrarily at the back.
-tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>
-DeleteAndInsertIndices(
-    std::vector<int64> to_delete, std::vector<int64> to_insert,
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64> indices) {
-  std::sort(to_delete.begin(), to_delete.end(), std::greater<int64>());
-  std::sort(to_insert.begin(), to_insert.end(), std::less<int64>());
-  for (auto index : to_delete) {
-    auto i = indices.begin();
-    while (i != indices.end()) {
-      if (*i == index) {
-        i = indices.erase(i);
-      } else {
-        if (*i > index) {
-          (*i)--;
-        }
-        ++i;
-      }
-    }
-  }
-  for (auto index : to_insert) {
-    for (auto i = indices.begin(); i != indices.end(); ++i) {
-      if (*i >= index) {
-        (*i)++;
-      }
-    }
-    indices.Add(index);
-  }
-  return indices;
-}
-
-}  // namespace
-
 std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     const Layout& output_layout, const HloInstruction* instruction,
     int64 operand_no) {
@@ -705,7 +711,8 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
   CHECK(ShapeUtil::IsArray(instruction->shape()) &&
         ShapeUtil::IsArray(operand->shape()));
 
-  if (instruction->IsElementwiseOnOperand(operand_no) &&
+  if ((instruction->IsElementwiseOnOperand(operand_no) ||
+       InstructionRequiresInputLayoutEqualToOutputLayout(instruction)) &&
       !ShapeUtil::IsScalar(operand->shape()) &&
       ShapeUtil::Rank(operand->shape()) ==
           ShapeUtil::Rank(instruction->shape())) {
@@ -719,21 +726,32 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
   }
 
   if (instruction->opcode() == HloOpcode::kReshape) {
-    // Pick the operand layout that makes the reshape a bitcast. If the reshape
-    // only inserts or deletes degenerate dimensions, we can easily compute the
-    // desired layout by accordingly inserting and deleting the elements in the
-    // minor-to-major list.
-    bool merely_inserts_or_deletes_1_sized_dims;
-    std::vector<int64> inserted_indices, deleted_indices;
-    std::tie(merely_inserts_or_deletes_1_sized_dims, deleted_indices,
-             inserted_indices) =
-        instruction->ReshapeMerelyInsertsOrDeletes1SizedDimensions();
-    if (merely_inserts_or_deletes_1_sized_dims) {
-      Layout operand_layout = LayoutUtil::MakeLayout(
-          AsInt64Slice(DeleteAndInsertIndices(inserted_indices, deleted_indices,
-                                              output_layout.minor_to_major())));
+    // Prefer the operand layout that makes the reshape an bitcast. If any
+    // dimension bound is 1 in the operand shape, there may be several such
+    // layouts. So if 'output_layout' is a MajorToMinor layout, try if the
+    // reshape is a bitcast when using the same layout. This may avoid copy
+    // operations.
+    const Shape& output_shape = instruction->shape();
+    Shape output_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
+        output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
+        AsInt64Slice(output_layout.minor_to_major()));
+    const Shape& operand_shape = operand->shape();
+    if (LayoutUtil::IsMonotonicWithDim0Major(output_layout)) {
+      Shape operand_shape_with_layout =
+          ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+              operand_shape.element_type(),
+              AsInt64Slice(operand_shape.dimensions()));
+      if (ShapeUtil::ReshapeIsBitcast(operand_shape_with_layout,
+                                      output_shape_with_layout)) {
+        return MakeUnique<Layout>(operand_shape_with_layout.layout());
+      }
+    }
+    auto aligned_operand_shape =
+        ShapeUtil::AlignLayouts(output_shape_with_layout, operand_shape);
+    if (aligned_operand_shape) {
+      auto operand_layout = aligned_operand_shape.value().layout();
       TF_CHECK_OK(
-          LayoutUtil::ValidateLayoutForShape(operand_layout, operand->shape()));
+          LayoutUtil::ValidateLayoutForShape(operand_layout, operand_shape));
       return MakeUnique<Layout>(operand_layout);
     }
   }
@@ -768,18 +786,32 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
   }
 
   if (user->opcode() == HloOpcode::kReshape) {
-    // Pick the user layout that makes the reshape a bitcast.
-    bool merely_inserts_or_deletes_1_sized_dims;
-    std::vector<int64> inserted_indices, deleted_indices;
-    std::tie(merely_inserts_or_deletes_1_sized_dims, deleted_indices,
-             inserted_indices) =
-        user->ReshapeMerelyInsertsOrDeletes1SizedDimensions();
-    if (merely_inserts_or_deletes_1_sized_dims) {
-      Layout user_layout = LayoutUtil::MakeLayout(AsInt64Slice(
-          DeleteAndInsertIndices(deleted_indices, inserted_indices,
-                                 operand_layout.minor_to_major())));
+    // Prefer the user layout that makes the reshape an bitcast. If any
+    // dimension bound is 1 in the user shape, there may be several such
+    // layouts. So if 'operand_layout' is a MajorToMinor layout, try if the
+    // reshape is a bitcast when using the same layout. This may avoid copy
+    // operations.
+    Shape operand_shape_with_layout = ShapeUtil::MakeShapeWithLayout(
+        operand->shape().element_type(),
+        AsInt64Slice(operand->shape().dimensions()),
+        AsInt64Slice(operand_layout.minor_to_major()));
+    const Shape& output_shape = user->shape();
+    if (LayoutUtil::IsMonotonicWithDim0Major(operand_layout)) {
+      Shape output_shape_with_layout =
+          ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
+              output_shape.element_type(),
+              AsInt64Slice(output_shape.dimensions()));
+      if (ShapeUtil::ReshapeIsBitcast(output_shape_with_layout,
+                                      operand_shape_with_layout)) {
+        return MakeUnique<Layout>(output_shape_with_layout.layout());
+      }
+    }
+    auto aligned_user_shape =
+        ShapeUtil::AlignLayouts(operand_shape_with_layout, output_shape);
+    if (aligned_user_shape) {
+      auto user_layout = aligned_user_shape.value().layout();
       TF_CHECK_OK(
-          LayoutUtil::ValidateLayoutForShape(user_layout, user->shape()));
+          LayoutUtil::ValidateLayoutForShape(user_layout, output_shape));
       return MakeUnique<Layout>(user_layout);
     }
   }
@@ -936,7 +968,8 @@ Status LayoutAssignment::PropagateOperandConstraint(
         operand_constraint.shape_layout().layout(), user,
         operand_constraint.operand_no());
     if (layout != nullptr) {
-      TF_RETURN_IF_ERROR(constraints->SetBufferLayout(*layout, *buffer));
+      TF_RETURN_IF_ERROR(
+          constraints->SetBufferLayout(*layout, *buffer, /*mandatory=*/false));
     }
   }
   return Status::OK();
@@ -966,11 +999,19 @@ Status LayoutAssignment::PropagateBufferConstraint(
                                                 instruction, operand_no);
         if (operand_layout != nullptr) {
           TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
-              *operand_layout, instruction, operand_no));
+              *operand_layout, instruction, operand_no, /*mandatory=*/true));
         }
       }
     }
   }
+  return PropagateBufferConstraintToUses(buffer_constraint, constraints);
+}
+
+Status LayoutAssignment::PropagateBufferConstraintToUses(
+    const BufferLayoutConstraint& buffer_constraint,
+    LayoutConstraints* constraints) {
+  const LogicalBuffer& buffer = buffer_constraint.buffer();
+  TF_RET_CHECK(buffer.IsArray());
 
   // Propagate the layout to all array uses of the logical buffer. This skips
   // uses of the buffer where the buffer is the element of a tuple.
@@ -983,7 +1024,7 @@ Status LayoutAssignment::PropagateBufferConstraint(
     if (constraints->OperandLayout(user, operand_no) == nullptr &&
         !constraints->OperandBufferForwarded(user, operand_no)) {
       TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
-          buffer_constraint.layout(), user, operand_no));
+          buffer_constraint.layout(), user, operand_no, /*mandatory=*/false));
     }
   }
 
@@ -1040,7 +1081,7 @@ StatusOr<Layout> InferArrayLayout(
                                   *first_buffer_layout)) {
       // The points-to set is ambiguous for this index and the different source
       // buffers have different layouts. This case is possible in valid XLA
-      // computations because we do not propagate BufferLayoutConstaints to all
+      // computations because we do not propagate BufferLayoutConstraints to all
       // LogicalBuffers which may alias the constrained LogicalBuffer at some
       // point in the computation.
       return FailedPrecondition(
@@ -1253,7 +1294,7 @@ Status LayoutAssignment::RunOnComputation(
   TF_ASSIGN_OR_RETURN(auto points_to_analysis,
                       TuplePointsToAnalysis::Run(computation->parent()));
 
-  // Construct LayoutConstaints with all layout constraints of the computation.
+  // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(*points_to_analysis, computation);
 
   // Add constraints required for correctness on all backends (eg, entry
@@ -1278,7 +1319,8 @@ Status LayoutAssignment::RunOnComputation(
     const LogicalBuffer& buffer = points_to_analysis->GetBuffer(
         *constraints.unconstrained_buffer_ids().begin());
     TF_RETURN_IF_ERROR(constraints.SetBufferLayout(
-        LayoutUtil::GetDefaultLayoutForShape(buffer.shape()), buffer));
+        LayoutUtil::GetDefaultLayoutForShape(buffer.shape()), buffer,
+        /*mandatory=*/false));
 
     TF_RETURN_IF_ERROR(PropagateConstraints(&constraints));
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 61dc7b120752d57cf09423f38546441de2fc8dd9..689e4510ed2e0c32a194b8488d09c4d7af522d2b 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -46,10 +46,16 @@ namespace xla {
 // gathered together in LayoutConstraints object.
 class LayoutConstraint {
  public:
-  LayoutConstraint() = default;
+  LayoutConstraint(bool mandatory) : mandatory_(mandatory) {}
   virtual ~LayoutConstraint() = default;
 
   virtual string ToString() const = 0;
+
+  // True if this constraint cannot be overwritten by a different constraint.
+  bool mandatory() const { return mandatory_; }
+
+ private:
+  bool mandatory_;
 };
 
 std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint);
@@ -58,7 +64,8 @@ std::ostream& operator<<(std::ostream& out, const LayoutConstraint& constraint);
 // array produced by a particular instruction.
 class BufferLayoutConstraint : public LayoutConstraint {
  public:
-  BufferLayoutConstraint(const Layout& layout, const LogicalBuffer& buffer);
+  BufferLayoutConstraint(const Layout& layout, const LogicalBuffer& buffer,
+                         bool mandatory);
 
   const LogicalBuffer& buffer() const { return *buffer_; }
   const Layout& layout() const { return layout_; }
@@ -66,7 +73,7 @@ class BufferLayoutConstraint : public LayoutConstraint {
   string ToString() const override;
 
  private:
-  const Layout layout_;
+  Layout layout_;
   const LogicalBuffer* buffer_;
 };
 
@@ -78,7 +85,8 @@ class BufferLayoutConstraint : public LayoutConstraint {
 class OperandLayoutConstraint : public LayoutConstraint {
  public:
   OperandLayoutConstraint(const ShapeLayout& shape_layout,
-                          const HloInstruction* instruction, int64 operand_no);
+                          const HloInstruction* instruction, int64 operand_no,
+                          bool mandatory);
 
   const ShapeLayout& shape_layout() const { return shape_layout_; }
   const HloInstruction* instruction() const { return instruction_; }
@@ -90,7 +98,7 @@ class OperandLayoutConstraint : public LayoutConstraint {
   string ToString() const override;
 
  private:
-  const ShapeLayout shape_layout_;
+  ShapeLayout shape_layout_;
   const HloInstruction* instruction_;
   int64 operand_no_;
 };
@@ -99,7 +107,7 @@ class OperandLayoutConstraint : public LayoutConstraint {
 class ResultLayoutConstraint : public LayoutConstraint {
  public:
   explicit ResultLayoutConstraint(const ShapeLayout& shape_layout)
-      : shape_layout_(shape_layout) {}
+      : LayoutConstraint(/*mandatory=*/true), shape_layout_(shape_layout) {}
 
   const ShapeLayout& shape_layout() const { return shape_layout_; }
   string ToString() const override;
@@ -124,8 +132,7 @@ class LayoutConstraints {
   // Return a vector containing the constraints which have been added to the
   // LayoutConstraints object since the construction of the object or since the
   // last time ConsumeAddedConstraints() has been called. This is used to
-  // identify
-  // newly added constraints when propagating layouts.
+  // identify newly added constraints when propagating layouts.
   std::vector<const LayoutConstraint*> ConsumeAddedConstraints() {
     std::vector<const LayoutConstraint*> ret_vec(std::move(added_constraints_));
     added_constraints_.clear();
@@ -137,23 +144,29 @@ class LayoutConstraints {
   // instruction, or the layout of the result of the computation, respectively,
   // if it has been constrained. Otherwise return nullptr.
   const Layout* BufferLayout(const LogicalBuffer& buffer) const;
+  const BufferLayoutConstraint* GetBufferLayoutConstraint(
+      const LogicalBuffer& buffer) const;
   const ShapeLayout* OperandLayout(const HloInstruction* instruction,
                                    int64 operand_no) const;
+  const OperandLayoutConstraint* GetOperandLayoutConstraint(
+      const HloInstruction* instruction, int64 operand_no) const;
   const ShapeLayout* ResultLayout() const;
 
   // Add a constraint on the layout of a LogicalBuffer, the layout of the
   // operand of the instruction, or the layout of the result of the computation,
   // respectively.
-  Status SetBufferLayout(const Layout& layout, const LogicalBuffer& buffer);
+  Status SetBufferLayout(const Layout& layout, const LogicalBuffer& buffer,
+                         bool mandatory = true);
   Status SetOperandLayout(const Shape& shape_with_layout,
-                          const HloInstruction* instruction, int64 operand_no);
+                          const HloInstruction* instruction, int64 operand_no,
+                          bool mandatory = true);
   Status SetResultLayout(const Shape& shape_with_layout);
 
   // Convenience wrapper around SetOperandLayout for setting the layout of a
   // operand using a Layout object. The operand must be array-shaped.
   Status SetArrayOperandLayout(const Layout& layout,
                                const HloInstruction* instruction,
-                               int64 operand_no);
+                               int64 operand_no, bool mandatory = true);
 
   // Convenience wrapper around SetBufferLayout. Sets the layouts of all buffers
   // created by the instruction to the layouts in the given shape. The
@@ -233,6 +246,18 @@ class LayoutAssignment : public HloPassInterface {
       const ResultLayoutConstraint& layout_constraint,
       LayoutConstraints* constraints);
 
+  // Propagates a buffer layout constraint into the operands that use it.
+  Status PropagateBufferConstraintToUses(
+      const BufferLayoutConstraint& layout_constraint,
+      LayoutConstraints* constraints);
+
+  // Propagates a layout constraint on the use of the result of the given
+  // instruction to the definitions of the LogicalBuffers which make up the
+  // result.
+  Status PropagateUseConstraintToDefs(const ShapeLayout& shape_layout,
+                                      const HloInstruction* instruction,
+                                      LayoutConstraints* constraints);
+
  private:
   // Adds constraints which must be satisfied for correctness on all
   // backends. Called once prior to propagating constraints.
@@ -248,6 +273,15 @@ class LayoutAssignment : public HloPassInterface {
     return Status::OK();
   }
 
+  // This method can be overridden to mark instructions as requiring the operands
+  // to have the same layout as the result, for performance or correctness. This
+  // will propagate constraints through the instruction from the result into the
+  // operands.
+  virtual bool InstructionRequiresInputLayoutEqualToOutputLayout(
+      const HloInstruction* instruction) {
+    return false;
+  }
+
   // Construct contraints and assign layouts to all instructions in the
   // computation satisfying the given ComputationLayout. Layouts constraints are
   // added, then propagated until all LogicalBuffers in the computation are
@@ -267,13 +301,6 @@ class LayoutAssignment : public HloPassInterface {
   // required for correctness.
   Status PropagateConstraints(LayoutConstraints* constraints);
 
-  // Propagates a layout constraint on the use of the result of the given
-  // instruction to the definitions of the LogicalBuffers which make up the
-  // result.
-  Status PropagateUseConstraintToDefs(const ShapeLayout& shape_layout,
-                                      const HloInstruction* instruction,
-                                      LayoutConstraints* constraints);
-
   // Chooses a layout of operand `operand_no` of `instruction` that minimizes
   // the cost of `instruction`. `output_layout` is the layout of `instruction`.
   // Returns null if it can't decide the best layout.
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 6361907b0e4ad8e21baec88b975f88fc65e42b38..bfb9e4ac2ee707233a82c9cd8dc5e3cc0e5ff8e7 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -26,10 +26,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
@@ -38,9 +40,13 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
+
 class LayoutAssignmentTest : public HloTestBase {
  protected:
   void AssignLayouts(HloModule* module,
@@ -304,18 +310,16 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
   EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::GetSubshape(result_shape, {1}),
                                root->operand(1)->shape()));
 
-  // Verify some of the structure of the HLO graph.
-  EXPECT_EQ(constant, root->operand(0)->operand(0));
-  EXPECT_EQ(HloOpcode::kCopy, root->operand(1)->operand(0)->opcode());
-  EXPECT_EQ(HloOpcode::kConstant,
-            root->operand(1)->operand(0)->operand(0)->opcode());
+  // Verify the structure of the HLO graph.
+  EXPECT_THAT(root,
+              op::Tuple(op::Tuple(constant), op::Tuple(op::Copy(constant))));
 }
 
 TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
   // param -> log -> reshape -> tanh
   auto builder = HloComputation::Builder(TestName());
   Shape ashape = ShapeUtil::MakeShape(F32, {1, 2, 3, 1});
-  Shape bshape = ShapeUtil::MakeShape(F32, {2, 1, 3});
+  Shape bshape = ShapeUtil::MakeShape(F32, {3, 1, 2});
   auto param = builder.AddInstruction(
       HloInstruction::CreateParameter(0, ashape, "param"));
   auto log = builder.AddInstruction(
@@ -330,8 +334,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
 
   Shape ashape_with_layout(ashape);
   Shape bshape_with_layout(bshape);
-  *ashape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2, 3});
-  *bshape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2});
+  *ashape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 2, 1, 3});
+  *bshape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({2, 1, 0});
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
   *computation_layout.mutable_parameter_layout(0) =
@@ -341,12 +345,12 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
 
   auto log_minor_to_major =
       AsInt64Slice(log->shape().layout().minor_to_major());
-  EXPECT_LT(PositionInContainer(log_minor_to_major, 1),
+  EXPECT_GT(PositionInContainer(log_minor_to_major, 1),
             PositionInContainer(log_minor_to_major, 2));
 
   auto reshape_minor_to_major =
       AsInt64Slice(reshape->shape().layout().minor_to_major());
-  EXPECT_LT(PositionInContainer(reshape_minor_to_major, 0),
+  EXPECT_GT(PositionInContainer(reshape_minor_to_major, 0),
             PositionInContainer(reshape_minor_to_major, 2));
 }
 
@@ -419,8 +423,8 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) {
       ShapeLayout(output_shape_with_layout);
   AssignLayouts(&module, &computation_layout);
 
-  EXPECT_TRUE(ContainersEqual(broadcast->shape().layout().minor_to_major(),
-                              tensorflow::gtl::ArraySlice<int64>{0, 1, 2}));
+  EXPECT_THAT(broadcast->shape().layout().minor_to_major(),
+              ElementsAre(0, 1, 2));
 }
 
 TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) {
@@ -472,15 +476,80 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) {
           {transpose_shape_with_layout, broadcast2_shape_with_layout}));
   AssignLayouts(&module, &computation_layout);
 
-  EXPECT_TRUE(ContainersEqual(broadcast->shape().layout().minor_to_major(),
-                              tensorflow::gtl::ArraySlice<int64>{0, 1}));
-  EXPECT_TRUE(ContainersEqual(transpose->shape().layout().minor_to_major(),
-                              tensorflow::gtl::ArraySlice<int64>{1, 0}));
-  EXPECT_TRUE(ContainersEqual(tanh->shape().layout().minor_to_major(),
-                              tensorflow::gtl::ArraySlice<int64>{0, 1}));
+  EXPECT_THAT(broadcast->shape().layout().minor_to_major(), ElementsAre(0, 1));
+  EXPECT_THAT(transpose->shape().layout().minor_to_major(), ElementsAre(1, 0));
+  EXPECT_THAT(tanh->shape().layout().minor_to_major(), ElementsAre(0, 1));
 }
 
-// Add test which fails due to copy tuple.
+class OperandsMustBeTheSameLayoutAssignment : public LayoutAssignment {
+ public:
+  explicit OperandsMustBeTheSameLayoutAssignment(
+      ComputationLayout* entry_computation_layout)
+      : LayoutAssignment(entry_computation_layout) {}
+
+ protected:
+  Status PropagateBufferConstraint(
+      const BufferLayoutConstraint& buffer_constraint,
+      LayoutConstraints* constraints) override {
+    const LogicalBuffer& buffer = buffer_constraint.buffer();
+    const HloInstruction* instruction = buffer.instruction();
+
+    // Force the operands' layout to the output layout.
+    for (int64 operand_no = 0; operand_no < instruction->operand_count();
+         ++operand_no) {
+      const HloInstruction* operand = instruction->operand(operand_no);
+      if (ShapeUtil::Rank(instruction->shape()) !=
+          ShapeUtil::Rank(operand->shape())) {
+        continue;
+      }
+      TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
+          buffer_constraint.layout(), instruction, operand_no,
+          /*mandatory=*/true));
+    }
+    return PropagateBufferConstraintToUses(buffer_constraint, constraints);
+  }
+};
+
+TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) {
+  // param0 -> concatenate -> reshape
+  // param1   -^
+  auto builder = HloComputation::Builder(TestName());
+  Shape ashape = ShapeUtil::MakeShape(F32, {50, 1});
+  Shape bshape = ShapeUtil::MakeShape(F32, {50, 2});
+  Shape cshape = ShapeUtil::MakeShape(F32, {100});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ashape, "param"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, ashape, "param"));
+  auto concatenate = builder.AddInstruction(
+      HloInstruction::CreateConcatenate(bshape, {param0, param1}, 1));
+  auto reshape = builder.AddInstruction(
+      HloInstruction::CreateReshape(cshape, concatenate));
+  HloModule module(TestName());
+  HloComputation* computation =
+      module.AddEntryComputation(builder.Build(reshape));
+
+  Shape param0_shape_with_layout(ashape);
+  Shape param1_shape_with_layout(ashape);
+  *param0_shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({0, 1});
+  *param1_shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout({1, 0});
+
+  ComputationLayout computation_layout(computation->ComputeProgramShape());
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(param0_shape_with_layout);
+  *computation_layout.mutable_parameter_layout(1) =
+      ShapeLayout(param1_shape_with_layout);
+  OperandsMustBeTheSameLayoutAssignment layout_assignment(&computation_layout);
+  EXPECT_IS_OK(layout_assignment.Run(&module).status());
+
+  EXPECT_EQ(HloOpcode::kCopy, concatenate->operand(0)->opcode());
+  EXPECT_THAT(concatenate->operand(0)->shape().layout().minor_to_major(),
+              ElementsAre(1, 0));
+  EXPECT_THAT(concatenate->operand(1)->shape().layout().minor_to_major(),
+              ElementsAre(1, 0));
+  EXPECT_THAT(concatenate->shape().layout().minor_to_major(),
+              ElementsAre(1, 0));
+}
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc
index caaf56a5516fcf9f21d8754feec04db23381809e..e1004256ff63bd7ef58fbba8e4144b7c1a71d32b 100644
--- a/tensorflow/compiler/xla/service/liveness_util.cc
+++ b/tensorflow/compiler/xla/service/liveness_util.cc
@@ -28,8 +28,9 @@ limitations under the License.
 
 namespace xla {
 
-bool DoesNotUseOperandBuffer(HloInstruction* operand, const ShapeIndex& index,
-                             HloInstruction* user,
+bool DoesNotUseOperandBuffer(const HloInstruction* operand,
+                             const ShapeIndex& index,
+                             const HloInstruction* user,
                              const TuplePointsToAnalysis& points_to_analysis) {
   CHECK(user->IsUserOf(operand))
       << "user: " << user->ToString() << " operand: " << operand->ToString();
@@ -98,15 +99,53 @@ std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
   return uses;
 }
 
+// Returns true if there is exactly one use of 'operand' at 'operand_index'
+// in 'fusion.fused_instructions', where the singleton use is the fused
+// root at operand index 'use_operand_index'. Returns false otherwise.
+//
+// REQUIRES: 'fusion' opcode is a kFusion instruction.
+bool HasUniqueFusedUseOfOperandAt(
+    HloInstruction* operand, const ShapeIndex& operand_index,
+    HloInstruction* fusion, const int64 use_operand_index,
+    const TuplePointsToAnalysis& points_to_analysis) {
+  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
+  // Check that 'operand' is unique in the operand list of 'fusion'.
+  if (fusion->OperandIndices(operand).size() > 1) {
+    return false;
+  }
+  // Find fusion parameter associated with 'operand'.
+  const auto& fused_params = fusion->fused_parameters();
+  auto fused_param_it = std::find_if(
+      fused_params.begin(), fused_params.end(),
+      [&](HloInstruction* fused_param) {
+        return fusion->operand(fused_param->parameter_number()) == operand;
+      });
+  if (fused_param_it == fused_params.end()) {
+    return false;
+  }
+  auto* fused_param = *fused_param_it;
+  // Get all uses of 'operand' at 'index' from 'fusion.fused_instructions'.
+  auto fused_param_uses = GetAllUsesOfInstructionAtIndex(
+      fused_param, operand_index, points_to_analysis);
+  // Return true iff there is exactly one use of 'operand' at 'index', and
+  // this singleton use is the fused root (at index in 'use_operand_indices').
+  return fused_param_uses.size() == 1 &&
+         fused_param_uses[0].first == fusion->fused_expression_root() &&
+         fused_param_uses[0].second == use_operand_index;
+}
+
 }  // namespace
 
 // User and operand can share buffers iff both instructions emit the same shape
-// and layout, and 'user' meets one of the following two qualifications:
-// *) Is element-wise.
+// and layout, and 'user' meets one of the following qualifications:
+// *) Is element-wise. Or...
 // *) Is a loop fusion instruction where the only use of 'operand' at 'index'
 //    in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root
-//    at operand 0.
-// *) Use of 'operand' is DynamicUpdateSlice at operand index 0.
+//    at operand 0. Or...
+// *) Is a kDot -> kAdd (or fused kTransposeDot -> kAdd) output fusion
+//    instruction where the only use of 'operand' at 'index' in the set
+//    'user.fused_instructions' is a kAdd fused root at operand 0 or 1. Or...
+// *) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index 0.
 bool CanShareOperandBufferWithUser(
     HloInstruction* operand, const ShapeIndex& operand_index,
     HloInstruction* user, const ShapeIndex& user_index,
@@ -120,31 +159,49 @@ bool CanShareOperandBufferWithUser(
   if (!ShapeUtil::Equal(operand_subshape, user_subshape)) {
     return false;
   }
-  // Check if 'user' is a loop fusion instruction with a kDynamicUpdateSlice
-  // fused root instruction.
-  if (user->opcode() == HloOpcode::kFusion &&
-      user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
-      user->fused_expression_root()->opcode() ==
-          HloOpcode::kDynamicUpdateSlice) {
-    for (auto& fused_param : user->fused_parameters()) {
-      // Find fusion parameter associated with 'operand'.
-      if (user->operand(fused_param->parameter_number()) != operand) {
-        continue;
-      }
-      // Get all uses of 'operand' at 'index' from 'user.fused_instructions'.
-      auto fused_param_uses = GetAllUsesOfInstructionAtIndex(
-          fused_param, operand_index, points_to_analysis);
-      // Return true iff there is exactly one use of 'operand' at 'index', and
-      // this singleton use is the fused root at operand index 0.
-      if (fused_param_uses.size() == 1 &&
-          fused_param_uses[0].first == user->fused_expression_root() &&
-          fused_param_uses[0].second == 0) {
-        return true;
+  if (user->opcode() == HloOpcode::kFusion) {
+    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
+        user->fused_expression_root()->opcode() ==
+            HloOpcode::kDynamicUpdateSlice) {
+      // Loop fusion with kDynamicUpdateSlice fused root.
+      //
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root at operand
+      // index 0.
+      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0,
+                                          points_to_analysis);
+    } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
+               user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
+      // Output fusion with kAdd fused root.
+
+      // Check if one operand of kAdd fused root is either kDot, or nested
+      // kFusion of kind kTransposeDot.
+      auto* add = user->fused_expression_root();
+      auto add_operand_it =
+          std::find_if(add->operands().begin(), add->operands().end(),
+                       [&](HloInstruction* operand) {
+                         return operand->opcode() == HloOpcode::kDot ||
+                                (operand->opcode() == HloOpcode::kFusion &&
+                                 operand->fusion_kind() ==
+                                     HloInstruction::FusionKind::kTransposeDot);
+                       });
+      if (add_operand_it == add->operands().end()) {
+        return false;
       }
-      break;
+      auto* matched_add_operand = *add_operand_it;
+      // Calculate operand index of 'add' operand which was not matched above.
+      const int64 other_add_operand_index =
+          matched_add_operand == add->operand(0) ? 1 : 0;
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root (at operand
+      // index 'other_add_operand_index').
+      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user,
+                                          other_add_operand_index,
+                                          points_to_analysis);
     }
-    return false;
-  } else if (user->opcode() == HloOpcode::kDynamicUpdateSlice) {
+  }
+  if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
+      user->opcode() == HloOpcode::kWhile) {
     // We eliminated other users in BufferLiveness::live_range_strictly_before,
     // so here we just need to check that the use is at operand index 0.
     std::vector<int64> operand_indices = user->OperandIndices(operand);
diff --git a/tensorflow/compiler/xla/service/liveness_util.h b/tensorflow/compiler/xla/service/liveness_util.h
index 410a7b1b519e117f21c01938cb8e4a5b1c358ad2..52de282ca6b444867c865f845ce794196c98b277 100644
--- a/tensorflow/compiler/xla/service/liveness_util.h
+++ b/tensorflow/compiler/xla/service/liveness_util.h
@@ -32,8 +32,9 @@ namespace xla {
 // 'operand'. Returns false otherwise.
 //
 // REQUIRES: 'operand' is an operand of 'user'.
-bool DoesNotUseOperandBuffer(HloInstruction* operand, const ShapeIndex& index,
-                             HloInstruction* user,
+bool DoesNotUseOperandBuffer(const HloInstruction* operand,
+                             const ShapeIndex& index,
+                             const HloInstruction* user,
                              const TuplePointsToAnalysis& points_to_analysis);
 
 // Returns true if 'user' (at 'user_index') can share a buffer with its operand
diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc
index 2ff71d6f3c8eff58b83783fc867d5874c6c700a3..ac670069b499eadd452f7faf3a56aa00d808d77f 100644
--- a/tensorflow/compiler/xla/service/liveness_util_test.cc
+++ b/tensorflow/compiler/xla/service/liveness_util_test.cc
@@ -34,9 +34,7 @@ class PointsToAnalysisTestBase : public HloTestBase {
   void RunAnalysis() {
     CHECK_NOTNULL(module_.get());
     points_to_analysis_ =
-        TuplePointsToAnalysis::Run(module_.get(),
-                                   /*include_loop_fusion_instructions=*/true)
-            .ConsumeValueOrDie();
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
   }
 
   void BuildModuleAndRunAnalysis(std::unique_ptr<HloComputation> computation) {
@@ -150,6 +148,25 @@ TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
                                              *points_to_analysis_));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest, CopyShares) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kExp, param));
+  auto copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCopy, exp));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_TRUE(
+      CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_));
+  EXPECT_TRUE(
+      CanShareOperandBufferWithUser(exp, {}, copy, {}, *points_to_analysis_));
+}
+
 TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -185,5 +202,167 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
                                             *points_to_analysis_));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  Shape update_shape = ShapeUtil::MakeShape(F32, {4});
+  Shape starts_shape = ShapeUtil::MakeShape(S32, {1});
+  auto data = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape, "data"));
+  auto update = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, update_shape, "update"));
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "starts"));
+  auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+      data_shape, data, update, starts));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  // The DynamicUpdateSlice instruction can share with the data operand, but not
+  // with update or starts.
+  EXPECT_TRUE(
+      CanShareOperandBufferWithUser(data, {}, dus, {}, *points_to_analysis_));
+  EXPECT_FALSE(
+      CanShareOperandBufferWithUser(update, {}, dus, {}, *points_to_analysis_));
+  EXPECT_FALSE(
+      CanShareOperandBufferWithUser(starts, {}, dus, {}, *points_to_analysis_));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b));
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto add_operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kAdd, dot, add_operand));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, dot}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused dot add should be able to share buffer with 'add_operand'.
+  EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
+                                            *points_to_analysis_));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto a = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{1.0, 0.0}, {0.0, 1.0}})));
+  auto b = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+  auto b_t = builder.AddInstruction(
+      HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
+
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b_t));
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto add_operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      data_shape, HloOpcode::kAdd, dot, add_operand));
+
+  BuildModule(builder.Build());
+
+  auto nested_fusion = computation_->CreateFusionInstruction(
+      {dot, b_t}, HloInstruction::FusionKind::kTransposeDot);
+
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, nested_fusion}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused transpose-dot-add should be share buffer with 'add_operand'.
+  EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {},
+                                            *points_to_analysis_));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto reverse = builder.AddInstruction(
+      HloInstruction::CreateReverse(data_shape, operand, {0, 1}));
+
+  auto two = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
+
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, two, reverse}, HloInstruction::FusionKind::kOutput);
+  RunAnalysis();
+
+  // Output fused operand->reverse->add cannot alias operand buffer 'operand'.
+  EXPECT_FALSE(CanShareOperandBufferWithUser(operand, {}, fusion, {},
+                                             *points_to_analysis_));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+
+  auto make_cond = [this, &data_shape]() {
+    auto builder = HloComputation::Builder(TestName() + ".Cond");
+    auto data = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, data_shape, "data"));
+    builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data));
+    return builder.Build();
+  };
+
+  auto make_body = [this, &data_shape]() {
+    auto builder = HloComputation::Builder(TestName() + ".Body");
+    auto data = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, data_shape, "data"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, data, data));
+    return builder.Build();
+  };
+
+  module_ = MakeUnique<HloModule>(TestName());
+  HloComputation* cond_computation =
+      module_->AddEmbeddedComputation(make_cond());
+  HloComputation* body_computation =
+      module_->AddEmbeddedComputation(make_body());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto data = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape, "data"));
+  auto whil = builder.AddInstruction(HloInstruction::CreateWhile(
+      data_shape, cond_computation, body_computation, data));
+  computation_ = module_->AddEntryComputation(builder.Build());
+
+  RunAnalysis();
+
+  // The While instruction can share with the data operand.
+  EXPECT_TRUE(
+      CanShareOperandBufferWithUser(data, {}, whil, {}, *points_to_analysis_));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 28488ca99912335a4ead43c9c7cd227f85f7db68..964b359bb094b43a1a8b126a217293567c5fc865 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -130,7 +130,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(
     llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name,
     llvm::IRBuilder<>* ir_builder, int alignment = 0);
 
-// Creates a basic block with the same context and funtion as for the
+// Creates a basic block with the same context and function as for the
 // builder. Inserts at the end of the function if insert_before is
 // null.
 llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before,
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 17d7b97b21bd3296711295e0779b0a273c9917e0..78d21233c765ec8f18a865f55b752d418ad126d6 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -60,9 +60,12 @@ namespace xla {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Backend> backend,
-      Backend::CreateBackend(platform, options.number_of_replicas()));
+  BackendOptions backend_options;
+  backend_options.set_platform(platform)
+      .set_number_of_replicas(options.number_of_replicas())
+      .set_intra_op_parallelism_threads(options.intra_op_parallelism_threads());
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> backend,
+                      Backend::CreateBackend(backend_options));
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
                       CreateComputeConstantBackend());
@@ -77,21 +80,6 @@ LocalService::LocalService(std::unique_ptr<Backend> execute_backend,
   runs_in_client_process_ = true;
 }
 
-tensorflow::Status LocalService::ResolveArguments(
-    const tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-    int device_ordinal,
-    std::vector<perftools::gputools::DeviceMemoryBase>* argument_ptrs) {
-  TF_ASSIGN_OR_RETURN(std::vector<const Allocation*> arg_allocations,
-                      ResolveAndValidateArguments(
-                          arguments, execute_backend_.get(), device_ordinal));
-  argument_ptrs->resize(arg_allocations.size());
-  for (int i = 0; i < arguments.size(); ++i) {
-    const Allocation& allocation = *arg_allocations[i];
-    (*argument_ptrs)[i] = allocation.device_memory();
-  }
-  return tensorflow::Status::OK();
-}
-
 namespace {
 // Returns the space required to allocate a shape. If
 // allocate_space_for_deep_copy the space includes all sub-buffers of
@@ -128,70 +116,6 @@ StatusOr<GlobalDataHandle> LocalService::AllocateBufferOnDevice(
                                   allocation_size));
 }
 
-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-LocalService::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-        computations,
-    const AotCompilationOptions& options) {
-  std::vector<std::unique_ptr<HloModule>> hlo_modules;
-  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
-  for (const AheadOfTimeComputationInstance& instance : computations) {
-    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                        computation_tracker_.Resolve(instance.computation));
-    VersionedComputationHandle versioned_handle =
-        user_computation->GetVersionedHandle();
-
-    // Dump computation proto state if flag is set.
-    legacy_flags::ServiceFlags* flags = legacy_flags::GetServiceFlags();
-    const string& directory_path = flags->xla_dump_computations_to;
-    if (!directory_path.empty()) {
-      TF_ASSIGN_OR_RETURN(
-          std::unique_ptr<SessionModule> session_module,
-          computation_tracker_.SnapshotComputation(versioned_handle.handle));
-      string filename = tensorflow::strings::StrCat(
-          "computation_", versioned_handle.handle.handle(), "__",
-          session_module->entry().name(), "__version_",
-          versioned_handle.version);
-      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
-                                                     *session_module));
-    }
-
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
-                        computation_tracker_.BuildHloModule(
-                            versioned_handle,
-                            /*include_unreachable_instructions=*/true));
-    hlo_modules.push_back(std::move(hlo_module));
-
-    TF_ASSIGN_OR_RETURN(
-        std::shared_ptr<const ProgramShape> program_shape,
-        user_computation->ComputeProgramShape(versioned_handle.version));
-
-    module_configs.push_back(MakeUnique<HloModuleConfig>(*program_shape));
-    HloModuleConfig* module_config = module_configs.back().get();
-    auto* computation_layout =
-        module_config->mutable_entry_computation_layout();
-    if (flags->xla_hlo_profile) {
-      module_config->enable_hlo_profiling(true);
-    }
-    for (int i = 0; i < instance.argument_layouts.size(); ++i) {
-      const Shape& argument_layout = *instance.argument_layouts[i];
-      if (ShapeUtil::IsTuple(argument_layout)) {
-        return Unimplemented("tuple arguments not supported yet");
-      }
-      TF_RETURN_IF_ERROR(
-          computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape(
-              argument_layout));
-    }
-    TF_RETURN_IF_ERROR(
-        computation_layout->mutable_result_layout()->CopyLayoutFromShape(
-            *instance.result_layout));
-  }
-
-  return execute_backend_->compiler()->CompileAheadOfTime(
-      std::move(hlo_modules), std::move(module_configs), MakeHloDumper(),
-      options);
-}
-
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index df27f0a7a60dca99caf09994f417f1bc45ec15de..767a3ab697febb283af448b25369445152381a5e 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -43,14 +43,6 @@ class LocalService : public Service {
   static StatusOr<std::unique_ptr<LocalService>> NewService(
       const ServiceOptions& options);
 
-  // For an array of arguments, validate that each is placed on the
-  // specified device_ordinal, and return the DeviceMemoryBase
-  // corresponding to each argument.
-  tensorflow::Status ResolveArguments(
-      const tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
-      int device_ordinal,
-      std::vector<perftools::gputools::DeviceMemoryBase>* argument_ptrs);
-
   // Return a handle to a buffer large enough to hold shape, allocated
   // on device_ordinal. If allocate_space_for_deep_copy, the buffer is
   // large enough to hold all sub-buffers of a tuple shape, otherwise
@@ -59,22 +51,6 @@ class LocalService : public Service {
       const Shape& shape, int device_ordinal,
       bool allocate_space_for_deep_copy);
 
-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AheadOfTimeComputationInstance {
-    ComputationHandle computation;
-    std::vector<const Shape*> argument_layouts;
-    const Shape* result_layout = nullptr;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution.  This is
-  // intended for use in static compilation.  See
-  // |LocalClient::CompileAheadOfTime| for additional details.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
-          computations,
-      const AotCompilationOptions& Options);
-
   // Builds an Executable with the given argument layouts and options. If
   // result_layout is non-null, then the executable is compiled to produce a
   // result of the given layout.
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index 3bff35544c83b09557e5623b10304348a41ec336..768977ba6bba2f9af55fcd467aa3d91488e4bf0f 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -13,17 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/reshape_mover.h"
-
-#include <algorithm>
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/util.h"
-
-namespace xla {
-
-namespace {
-
+// Implementation note:
+//
 // The general idea behind this pass is that we're converting from this:
 //   %param.A = OldShape
 //   %param.B = OldShape
@@ -44,6 +35,19 @@ namespace {
 // only implicit scalar broadcast is on Pred, not on A or B. Since reshapes or
 // transposes to a scalar should be cheap, we simply never move them.
 
+#include "tensorflow/compiler/xla/service/reshape_mover.h"
+
+#include <algorithm>
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
 // Finds the first non-scalar operand of an instruction that is a reshape or
 // transpose and returns the operand if it is found or nullptr if not found.
 HloInstruction* FirstNonScalarReshapeOperand(const HloInstruction* hlo) {
@@ -51,6 +55,9 @@ HloInstruction* FirstNonScalarReshapeOperand(const HloInstruction* hlo) {
     if (!ShapeUtil::IsScalar(operand->shape()) &&
         (operand->opcode() == HloOpcode::kReshape ||
          operand->opcode() == HloOpcode::kTranspose)) {
+      VLOG(5) << "Found first non-scalar reshape operand of "
+              << hlo->ToStringNoMetadata() << ":\n\t"
+              << operand->ToStringNoMetadata();
       return operand;
     }
   }
@@ -70,6 +77,9 @@ bool OperandCanTrivallyChangeShape(const HloInstruction* instruction,
   // A constant can trivially reshape the literal it holds.
   if (operand->opcode() == HloOpcode::kConstant &&
       ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
+    VLOG(5) << "Constant had same dimensions as instruction:\n\toperand: "
+            << operand->ToStringNoMetadata()
+            << "\n\tinstruction: " << instruction->ToStringNoMetadata();
     return true;
   }
 
@@ -116,135 +126,173 @@ bool IsElementwiseOfEquivalentReshapesOrTransposes(
   if (!first_reshape_operand) {
     return false;
   }
-  return (instruction->user_count() > 0 ||
-          instruction == instruction->parent()->root_instruction()) &&
-         instruction->IsElementwise() && !operands.empty() &&
-         // Check whether all operands:
-         //    1. are all reshapes or transposes that have the same input and
-         //    output shapes as all other reshaped or transposed operands.
-         //      or
-         //    2. can be any shape like kConstant, kRng, and scalars.
-         std::all_of(
-             operands.begin(), operands.end(),
-             [instruction,
-              first_reshape_operand](const HloInstruction* operand) {
-               return AreEquivalentReshapes(first_reshape_operand, operand) ||
-                      OperandCanTrivallyChangeShape(instruction, operand);
-             });
+  VLOG(3) << "** Checking whether instruction is an elementwise operation of "
+             "equivalent reshapes/transposes: "
+          << instruction->ToStringNoMetadata();
+  bool result =
+      (instruction->user_count() > 0 ||
+       instruction == instruction->parent()->root_instruction()) &&
+      instruction->IsElementwise() && !operands.empty() &&
+      // Check whether all operands:
+      //    0. Have the same dimensions as the output -- if not, it may be
+      //       implicitly broadcast, which can confound the movement's
+      //       correctness.
+      //    1. Are all reshapes or transposes that have the same input and
+      //       output shapes as all other reshaped or transposed operands.
+      //     or
+      //    2. Can be any shape like kConstant, kRng, and scalars.
+      std::all_of(
+          operands.begin(), operands.end(),
+          [instruction, first_reshape_operand](const HloInstruction* operand) {
+            if (!ShapeUtil::SameDimensions(operand->shape(),
+                                           instruction->shape())) {
+              VLOG(5) << "Operand shape differs from output shape; may be "
+                         "implicitly broadcast, so preventing "
+                         "movement\n\toperand: "
+                      << operand->ToStringNoMetadata() << "\n\tinstruction: "
+                      << instruction->ToStringNoMetadata();
+              return false;
+            }
+            if (AreEquivalentReshapes(first_reshape_operand, operand)) {
+              VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
+                      << first_reshape_operand->ToStringNoMetadata()
+                      << "\n\toperand: " << operand->ToStringNoMetadata();
+              return true;
+            }
+            if (OperandCanTrivallyChangeShape(instruction, operand)) {
+              VLOG(5) << "Operand can trivially change shape: "
+                      << operand->ToStringNoMetadata();
+              return true;
+            }
+            return false;
+          });
+  VLOG(3) << "ElementwiseOfEquivalentReshapesOrTransposes result for "
+          << instruction->ToStringNoMetadata() << ": " << result;
+  return result;
 }
 
 // Try to sink any reshape or transpose operands of `instruction` across it. We
 // do so if `instruction` is elementwise and all operands are equivalent
 // reshapes or transposes.
-bool TrySinkReshapeOrTranspose(HloComputation* computation,
-                               HloInstruction* instruction) {
-  if (IsElementwiseOfEquivalentReshapesOrTransposes(instruction)) {
-    std::vector<HloInstruction*> operands = instruction->operands();
-    HloInstruction* old_reshape = FirstNonScalarReshapeOperand(instruction);
-    CHECK(old_reshape != nullptr);
-    Shape new_elementwise_shape = old_reshape->operand(0)->shape();
-    for (size_t i = 0; i < operands.size(); ++i) {
-      // All scalar operands remain as-is, even if they're reshape or transpose,
-      // to simplify handling wrt special scalar broadcast rules for ops like
-      // Select. Scalar reshapes should be cheap anyways.
-      if (ShapeUtil::IsScalar(operands[i]->shape())) {
-        continue;
-      }
-      auto element_type = operands[i]->shape().element_type();
-      switch (operands[i]->opcode()) {
-        case HloOpcode::kConstant: {
-          if (old_reshape->opcode() == HloOpcode::kReshape) {
-            operands[i] = instruction->parent()->AddInstruction(
-                HloInstruction::CreateReshape(
-                    ShapeUtil::ChangeElementType(new_elementwise_shape,
-                                                 element_type),
-                    operands[i]));
-          } else {
-            CHECK_EQ(old_reshape->opcode(), HloOpcode::kTranspose);
-            std::vector<int64> inverse_permutation =
-                InversePermutation(old_reshape->dimensions());
-            operands[i] = instruction->parent()->AddInstruction(
-                HloInstruction::CreateTranspose(
-                    ShapeUtil::ChangeElementType(new_elementwise_shape,
-                                                 element_type),
-                    operands[i], inverse_permutation));
-          }
-          break;
-        }
-        case HloOpcode::kRng: {
-          CHECK_EQ(operands[i]->user_count(), 1);
+StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
+                                         HloInstruction* instruction) {
+  if (!IsElementwiseOfEquivalentReshapesOrTransposes(instruction)) {
+    return false;
+  }
+
+  std::vector<HloInstruction*> operands = instruction->operands();
+  HloInstruction* old_reshape = FirstNonScalarReshapeOperand(instruction);
+  TF_RET_CHECK(old_reshape != nullptr);
+  Shape new_elementwise_shape = old_reshape->operand(0)->shape();
+
+  VLOG(3) << "** Trying to sink reshape or transpose: "
+          << instruction->ToStringNoMetadata()
+          << "\n\told reshape: " << old_reshape->ToStringNoMetadata()
+          << "\n\tnew elementwise shape: "
+          << ShapeUtil::HumanString(new_elementwise_shape);
+  for (size_t i = 0; i < operands.size(); ++i) {
+    // All scalar operands remain as-is, even if they're reshape or transpose,
+    // to simplify handling wrt special scalar broadcast rules for ops like
+    // Select. Scalar reshapes should be cheap anyways.
+    if (ShapeUtil::IsScalar(operands[i]->shape())) {
+      continue;
+    }
+    PrimitiveType element_type = operands[i]->shape().element_type();
+    switch (operands[i]->opcode()) {
+      case HloOpcode::kConstant: {
+        if (old_reshape->opcode() == HloOpcode::kReshape) {
+          VLOG(3) << "Creating reshape for kConstant operand " << i << ": "
+                  << operands[i]->ToStringNoMetadata();
+          operands[i] = instruction->parent()->AddInstruction(
+              HloInstruction::CreateReshape(
+                  ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                               element_type),
+                  operands[i]));
+        } else {
+          TF_RET_CHECK(old_reshape->opcode() == HloOpcode::kTranspose);
+          std::vector<int64> inverse_permutation =
+              InversePermutation(old_reshape->dimensions());
           operands[i] = instruction->parent()->AddInstruction(
-              operands[i]->CloneWithNewOperands(
+              HloInstruction::CreateTranspose(
                   ShapeUtil::ChangeElementType(new_elementwise_shape,
                                                element_type),
-                  operands[i]->operands()));
-          break;
+                  operands[i], inverse_permutation));
         }
-        case HloOpcode::kReshape:
-        case HloOpcode::kTranspose:
-          operands[i] = operands[i]->mutable_operand(0);
-          break;
-        default:
-          LOG(FATAL) << "Unexpected opcode while trying to sink reshapes or "
-                        "transposes.";
+        break;
       }
-    }
-    if (HloOpcode::kFusion == instruction->opcode()) {
-      // Here we already know `instruction` is elementwise, and no operand is
-      // implicit broadcast as if it were the operands would not be equivalent
-      // reshapes, so all the fused instructions have the same dimensions.
-      for (const auto& fused_instruction : instruction->fused_instructions()) {
-        Shape* shape = fused_instruction->mutable_shape();
-        *shape->mutable_dimensions() = new_elementwise_shape.dimensions();
-        *shape->mutable_layout() = new_elementwise_shape.layout();
+      case HloOpcode::kRng: {
+        CHECK_EQ(operands[i]->user_count(), 1);
+        operands[i] = instruction->parent()->AddInstruction(
+            operands[i]->CloneWithNewOperands(
+                ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                             element_type),
+                operands[i]->operands()));
+        break;
       }
-    }
-    auto new_elementwise =
-        computation->AddInstruction(instruction->CloneWithNewOperands(
-            // `instruction` may change the element type, e.g., from
-            //   operands[0] -> reshape -> convert (`instruction`)
-            // to
-            //   operands[0] -> convert' -> reshape'
-            //
-            // In this case, convert' should have the same element type as
-            // `convert` and the same dimensions as operands[0].
-            ShapeUtil::ChangeElementType(new_elementwise_shape,
-                                         instruction->shape().element_type()),
-            operands));
-    std::unique_ptr<HloInstruction> new_reshape;
-    switch (old_reshape->opcode()) {
       case HloOpcode::kReshape:
-        new_reshape = HloInstruction::CreateReshape(instruction->shape(),
-                                                    new_elementwise);
-        break;
       case HloOpcode::kTranspose:
-        new_reshape = HloInstruction::CreateTranspose(
-            instruction->shape(), new_elementwise, old_reshape->dimensions());
+        operands[i] = operands[i]->mutable_operand(0);
         break;
       default:
-        LOG(FATAL) << "Bad opcode";
+        LOG(FATAL) << "Unexpected opcode while trying to sink reshapes or "
+                      "transposes.";
     }
-    TF_CHECK_OK(computation->ReplaceWithNewInstruction(instruction,
-                                                       std::move(new_reshape)));
-    return true;
   }
-  return false;
+  if (HloOpcode::kFusion == instruction->opcode()) {
+    // Here we already know `instruction` is elementwise, and no operand is
+    // implicit broadcast as if it were the operands would not be equivalent
+    // reshapes, so all the fused instructions have the same dimensions.
+    for (const auto& fused_instruction : instruction->fused_instructions()) {
+      Shape* shape = fused_instruction->mutable_shape();
+      *shape->mutable_dimensions() = new_elementwise_shape.dimensions();
+      *shape->mutable_layout() = new_elementwise_shape.layout();
+    }
+  }
+  HloInstruction* new_elementwise =
+      computation->AddInstruction(instruction->CloneWithNewOperands(
+          // `instruction` may change the element type, e.g., from
+          //   operands[0] -> reshape -> convert (`instruction`)
+          // to
+          //   operands[0] -> convert' -> reshape'
+          //
+          // In this case, convert' should have the same element type as
+          // `convert` and the same dimensions as operands[0].
+          ShapeUtil::ChangeElementType(new_elementwise_shape,
+                                       instruction->shape().element_type()),
+          operands));
+
+  std::unique_ptr<HloInstruction> new_reshape;
+  switch (old_reshape->opcode()) {
+    case HloOpcode::kReshape:
+      VLOG(3) << "Creating new reshape for new elementwise op: "
+              << new_elementwise->ToStringNoMetadata();
+      new_reshape =
+          HloInstruction::CreateReshape(instruction->shape(), new_elementwise);
+      break;
+    case HloOpcode::kTranspose:
+      new_reshape = HloInstruction::CreateTranspose(
+          instruction->shape(), new_elementwise, old_reshape->dimensions());
+      break;
+    default:
+      LOG(FATAL) << "Bad opcode";
+  }
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      instruction, std::move(new_reshape)));
+  return true;
 }
 
 }  // namespace
 
 StatusOr<bool> ReshapeMover::Run(HloModule* module) {
-  return std::any_of(
-      module->computations().begin(), module->computations().end(),
-      [](const std::unique_ptr<HloComputation>& computation) {
-        std::list<HloInstruction*> postorder =
-            computation->MakeInstructionPostOrder();
-        return std::any_of(postorder.begin(), postorder.end(),
-                           [&computation](HloInstruction* instruction) {
-                             return TrySinkReshapeOrTranspose(computation.get(),
-                                                              instruction);
-                           });
-      });
+  bool changed = false;
+  for (const auto& comp : module->computations()) {
+    for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
+      TF_ASSIGN_OR_RETURN(bool did_change,
+                          TrySinkReshapeOrTranspose(comp.get(), instruction));
+      changed |= did_change;
+    }
+  }
+  return changed;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index 1862e2e992ec7ca9fac7444e6b83018fd1f17372..5217e85d4fc12e2adc412644b8f11fd11a58039a 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -20,14 +20,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 using ReshapeMoverTest = HloTestBase;
@@ -43,14 +47,19 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) {
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0));
   auto reshape1 =
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1));
-  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+  builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  EXPECT_EQ(add, computation->root_instruction());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Reshape(param0), op::Reshape(param1)));
+
   EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie());
-  EXPECT_EQ(add, computation->root_instruction());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Reshape(param0), op::Reshape(param1)));
 }
 
 TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) {
@@ -64,14 +73,20 @@ TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) {
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0));
   auto reshape1 =
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1));
-  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+  builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  EXPECT_EQ(add, computation->root_instruction());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Reshape(param0), op::Reshape(param1)));
+
   EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie());
-  EXPECT_EQ(add, computation->root_instruction());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Add(op::Reshape(op::Parameter()), op::Reshape(op::Parameter())));
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
@@ -85,18 +100,20 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0));
   auto reshape1 =
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1));
-  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+  builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  EXPECT_EQ(add, computation->root_instruction());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Reshape(param0), op::Reshape(param1)));
   EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie());
 
-  auto new_root = computation->root_instruction();
-  EXPECT_NE(add, new_root);
-  EXPECT_EQ(HloOpcode::kReshape, new_root->opcode());
-  EXPECT_EQ(root_shape.DebugString(), new_root->shape().DebugString());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Add(param0, param1)));
+  EXPECT_EQ(root_shape.DebugString(),
+            computation->root_instruction()->shape().DebugString());
 }
 
 TEST_F(ReshapeMoverTest, ConstantAndReshapeMoved) {
@@ -108,18 +125,21 @@ TEST_F(ReshapeMoverTest, ConstantAndReshapeMoved) {
       LiteralUtil::CreateR2<float>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape0 =
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param0));
-  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+  builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, const1));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  EXPECT_EQ(add, computation->root_instruction());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Add(op::Reshape(param0), const1));
+
   EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie());
 
-  auto new_root = computation->root_instruction();
-  EXPECT_NE(add, new_root);
-  EXPECT_EQ(HloOpcode::kReshape, new_root->opcode());
-  EXPECT_EQ(root_shape.DebugString(), new_root->shape().DebugString());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Add(param0, op::Reshape(const1))));
+  EXPECT_EQ(root_shape.DebugString(),
+            computation->root_instruction()->shape().DebugString());
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
@@ -141,13 +161,16 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
   auto fusion = computation->AddInstruction(HloInstruction::CreateFusion(
       add->shape(), HloInstruction::FusionKind::kLoop, add));
   TF_CHECK_OK(computation->ReplaceInstruction(add, fusion));
-  EXPECT_EQ(fusion, computation->root_instruction());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Fusion(op::Reshape(param0), op::Reshape(param1)));
+
   EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie());
 
-  auto new_root = computation->root_instruction();
-  EXPECT_NE(fusion, new_root);
-  EXPECT_EQ(HloOpcode::kReshape, new_root->opcode());
-  EXPECT_EQ(root_shape.DebugString(), new_root->shape().DebugString());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Fusion(param0, param1)));
+  EXPECT_EQ(root_shape.DebugString(),
+            computation->root_instruction()->shape().DebugString());
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) {
@@ -166,18 +189,22 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) {
       builder.AddInstruction(HloInstruction::CreateReshape(root_shape, param1));
   auto reshape_pred =
       builder.AddInstruction(HloInstruction::CreateReshape(pred_shape, pred));
-  auto select = builder.AddInstruction(HloInstruction::CreateTernary(
+  builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, reshape_pred, reshape0, reshape1));
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  EXPECT_EQ(select, computation->root_instruction());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Select(op::Reshape(pred), op::Reshape(param0), op::Reshape(param1)));
+
   EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie());
 
-  auto new_root = computation->root_instruction();
-  EXPECT_NE(select, new_root);
-  EXPECT_EQ(HloOpcode::kReshape, new_root->opcode());
-  EXPECT_EQ(root_shape.DebugString(), new_root->shape().DebugString());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Reshape(op::Select(pred, param0, param1)));
+  EXPECT_EQ(root_shape.DebugString(),
+            computation->root_instruction()->shape().DebugString());
 }
 
 TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
@@ -197,10 +224,119 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
 
   auto module = MakeUnique<HloModule>(TestName());
   auto computation = module->AddEntryComputation(builder.Build());
-  EXPECT_EQ(select, computation->root_instruction());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Select(op::Reshape(pred), param0, param1));
+
   EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Select(op::Reshape(pred), param0, param1));
   EXPECT_EQ(select, computation->root_instruction());
 }
 
+// Tree looks like:
+//
+// param0 [1,128,1]
+//  |
+// reshape [128,1]          constant [128,1024]
+//   \                         /
+//     multiply w/implicit broadcast [128,1024]
+//
+// The reshape mover would like to sink the reshape below the multiply.
+//
+// Previously we would attempt to insert a reshape of the constant to [1,128,1]
+// (which is unsound, because it has a different number of elements) as
+// preparation for sinking the reshape.
+//
+// To eliminate the unsoundness, we outlaw reshape sinking when one of the
+// operands is implicitly broadcast in the elementwise consumer.
+//
+// TODO(b/37799338) However, it would be possible in this case to do a more
+// in-depth analysis to get reshape movement to occur:
+//
+// 1. Note that the broadcast dimension (logical dimension 1) in the operands
+//    would map back to logical dimension 2 in the param0 node.
+// 2. Match rank of the constant to the param0 node (by prepending a trivial 1
+//    dimension).
+// 3. Reshape to [128,1024] at the root.
+//
+// But this is not currently done.
+TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) {
+  HloComputation::Builder builder(TestName());
+  auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0"));
+  auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(F32, {128, 1}), param0));
+  Array2D<float> a(128, 1024);
+  auto literal = LiteralUtil::CreateR2FromArray2D<float>(a);
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+  auto multiply = builder.AddInstruction(HloInstruction::CreateBinary(
+      constant->shape(), HloOpcode::kMultiply, constant, reshape));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Constant(), op::Reshape(param0)));
+
+  EXPECT_FALSE(ReshapeMover().Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              op::Multiply(op::Constant(), op::Reshape(param0)));
+  EXPECT_EQ(multiply, computation->root_instruction());
+}
+
+// Tree looks like this:
+//
+// add1
+// |
+// +- reshape2 - param2
+// |
+// +- reshape3 - add0
+//               |
+//               + reshape0 - param0
+//               |
+//               + reshape1 - param1
+//
+// We expect reshape{0,1} AND reshape{2,3} to be lifted.
+TEST_F(ReshapeMoverTest, MultiplePasses) {
+  auto shape1 = ShapeUtil::MakeShape(F32, {1, 8, 1, 7});
+  auto shape2 = ShapeUtil::MakeShape(F32, {8, 7, 1});
+  auto shape3 = ShapeUtil::MakeShape(F32, {8, 7});
+  HloComputation::Builder builder(TestName());
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape1, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape1, "param1"));
+  auto param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, shape2, "param2"));
+  auto reshape0 =
+      builder.AddInstruction(HloInstruction::CreateReshape(shape2, param0));
+  auto reshape1 =
+      builder.AddInstruction(HloInstruction::CreateReshape(shape2, param1));
+  auto add0 = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape2, HloOpcode::kAdd, reshape0, reshape1));
+  auto reshape2 =
+      builder.AddInstruction(HloInstruction::CreateReshape(shape3, param2));
+  auto reshape3 =
+      builder.AddInstruction(HloInstruction::CreateReshape(shape3, add0));
+  builder.AddInstruction(HloInstruction::CreateBinary(shape3, HloOpcode::kAdd,
+                                                      reshape2, reshape3));
+
+  auto module = MakeUnique<HloModule>(TestName());
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Add(op::Reshape(param2),
+              op::Reshape(op::Add(op::Reshape(param0), op::Reshape(param1)))));
+
+  EXPECT_TRUE(ReshapeMover().Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Reshape(op::Add(param2, op::Reshape(op::Add(param0, param1)))));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 451bb8c7eadf3e2210788a722d8f75aa3050e30f..8b373ab09623a930a7a15aab0e39d37af0995250 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -112,6 +112,16 @@ ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) {
 
 int ServiceOptions::number_of_replicas() const { return number_of_replicas_; }
 
+ServiceOptions& ServiceOptions::set_intra_op_parallelism_threads(
+    int num_threads) {
+  intra_op_parallelism_threads_ = num_threads;
+  return *this;
+}
+
+int ServiceOptions::intra_op_parallelism_threads() const {
+  return intra_op_parallelism_threads_;
+}
+
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
     perftools::gputools::Platform* platform) {
   ServiceOptions default_options;
@@ -126,9 +136,10 @@ int ServiceOptions::number_of_replicas() const { return number_of_replicas_; }
   if (platform == nullptr) {
     TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform());
   }
-  TF_ASSIGN_OR_RETURN(
-      execute_backend,
-      Backend::CreateBackend(platform, options.number_of_replicas()));
+  BackendOptions backend_options;
+  backend_options.set_platform(platform);
+  backend_options.set_number_of_replicas(options.number_of_replicas());
+  TF_ASSIGN_OR_RETURN(execute_backend, Backend::CreateBackend(backend_options));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> compute_constant_backend,
                       CreateComputeConstantBackend());
   std::unique_ptr<Service> service(new Service(
@@ -142,7 +153,10 @@ Service::CreateComputeConstantBackend() {
                       PlatformUtil::GetSupportedPlatforms());
   for (auto* platform : platforms) {
     if (platform->id() == se::host::kHostPlatformId) {
-      return Backend::CreateBackend(platform, /*replica_count=*/1);
+      BackendOptions backend_options;
+      backend_options.set_platform(platform);
+      backend_options.set_number_of_replicas(1);
+      return Backend::CreateBackend(backend_options);
     }
   }
   return NotFound("CPU platform not found");
@@ -180,20 +194,24 @@ Service::Service(std::unique_ptr<Backend> execute_backend,
                  std::unique_ptr<Backend> compute_constant_backend)
     : execute_backend_(std::move(execute_backend)),
       compute_constant_backend_(std::move(compute_constant_backend)) {
-  LOG(INFO) << Printf(
-      "XLA service %p executing computations on platform %s. Devices:", this,
-      execute_backend_->platform()->Name().c_str());
-  for (int i = 0; i < execute_backend_->device_count(); ++i) {
-    if (execute_backend_->device_ordinal_supported(i)) {
-      se::StreamExecutor* executor =
-          execute_backend_->stream_executor(i).ValueOrDie();
-      const auto& description = executor->GetDeviceDescription();
-      LOG(INFO) << Printf("  StreamExecutor device (%d): %s, %s", i,
-                          description.name().c_str(),
-                          description.platform_version().c_str());
-    } else {
-      LOG(INFO) << Printf("  StreamExecutor device (%d) not supported", i);
+  if (execute_backend_) {
+    LOG(INFO) << Printf(
+        "XLA service %p executing computations on platform %s. Devices:", this,
+        execute_backend_->platform()->Name().c_str());
+    for (int i = 0; i < execute_backend_->device_count(); ++i) {
+      if (execute_backend_->device_ordinal_supported(i)) {
+        se::StreamExecutor* executor =
+            execute_backend_->stream_executor(i).ValueOrDie();
+        const auto& description = executor->GetDeviceDescription();
+        LOG(INFO) << Printf("  StreamExecutor device (%d): %s, %s", i,
+                            description.name().c_str(),
+                            description.platform_version().c_str());
+      } else {
+        LOG(INFO) << Printf("  StreamExecutor device (%d) not supported", i);
+      }
     }
+  } else {
+    VLOG(1) << "XLA compile-only service constructed";
   }
 }
 
@@ -286,7 +304,7 @@ StatusOr<std::vector<const Allocation*>> Service::ResolveAndValidateArguments(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-    const ExecutionOptions& execution_options) {
+    const ExecutionOptions& execution_options, Backend* backend) {
   auto module_config = MakeUnique<HloModuleConfig>(program_shape);
   auto* computation_layout = module_config->mutable_entry_computation_layout();
 
@@ -326,7 +344,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     module_config->enable_hlo_profiling(true);
   }
 
-  module_config->set_replica_count(execute_backend_->Replicas().size());
+  module_config->set_replica_count(backend->Replicas().size());
   module_config->set_fast_math_disabled(execution_options.disable_fast_math());
   module_config->set_seed(execution_options.seed());
 
@@ -367,20 +385,23 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     VLOG(1) << versioned_handle;
   }
 
+  CHECK_EQ(versioned_handles.size(), module_configs.size());
   std::vector<std::unique_ptr<HloModule>> modules;
-  for (const VersionedComputationHandle& versioned_handle : versioned_handles) {
+  for (int64 i = 0; i < versioned_handles.size(); ++i) {
+    const VersionedComputationHandle& versioned_handle = versioned_handles[i];
+    const HloModuleConfig& config = *module_configs[i];
     TF_ASSIGN_OR_RETURN(auto module,
                         computation_tracker_.BuildHloModule(
-                            versioned_handle,
+                            versioned_handle, &config,
                             /*include_unreachable_instructions=*/true));
     modules.push_back(std::move(module));
   }
 
   Compiler::HloDumper hlo_dumper = MakeHloDumper();
-  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
-                      backend->compiler()->Compile(
-                          std::move(modules), std::move(module_configs),
-                          hlo_dumper, std::move(executors)));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::unique_ptr<Executable>> executables,
+      backend->compiler()->Compile(std::move(modules), hlo_dumper,
+                                   std::move(executors)));
 
   if (!other_directory_path.empty()) {
     for (size_t i = 0; i < versioned_handles.size(); ++i) {
@@ -423,7 +444,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModule> module,
-      computation_tracker_.BuildHloModule(versioned_handle,
+      computation_tracker_.BuildHloModule(versioned_handle, module_config.get(),
                                           /*include_unreachable_instructions=*/
                                           !executable_for_compute_constant));
 
@@ -435,8 +456,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend->compiler()->Compile(std::move(module), std::move(module_config),
-                                   hlo_dumper, executor));
+      backend->compiler()->Compile(std::move(module), hlo_dumper, executor));
 
   if (!other_directory_path.empty()) {
     executable->set_session_module(std::move(session_module));
@@ -474,7 +494,7 @@ StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
       std::unique_ptr<Executable> executable_unique_ptr,
       BuildExecutable(versioned_handle, std::move(module_config),
                       /*executable_for_compute_constant=*/false, arguments,
-                      execute_backend_.get(), executor));
+                      backend, executor));
 
   if (profile != nullptr) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -569,21 +589,21 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     options.set_inter_op_thread_pool(backend->inter_op_thread_pool());
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
-    run_options.emplace_back(options, backend->StreamBorrower());
+    run_options.emplace_back(options, backend->StreamBorrower(),
+                             backend->inter_op_thread_pool());
   }
 
   perftools::gputools::DeviceMemoryBase result;
   if (backend->Replicas().size() == 1) {
     TF_ASSIGN_OR_RETURN(
-        result,
-        ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
-            executable, &run_options[0], profile, execute_backend_.get(),
-            [&arguments](Executable* executable,
-                         const ServiceExecutableRunOptions* run_options,
-                         HloExecutionProfile* hlo_execution_profile) {
-              return executable->ExecuteOnStream(run_options, arguments,
-                                                 hlo_execution_profile);
-            }));
+        result, ExecuteOnStreamWrapper<StatusOr<se::DeviceMemoryBase>>(
+                    executable, &run_options[0], profile, backend,
+                    [&arguments](Executable* executable,
+                                 const ServiceExecutableRunOptions* run_options,
+                                 HloExecutionProfile* hlo_execution_profile) {
+                      return executable->ExecuteOnStream(run_options, arguments,
+                                                         hlo_execution_profile);
+                    }));
   } else {
     std::vector<
         tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
@@ -666,7 +686,8 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // the program and the argument allocations.
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
                         CreateModuleConfig(*program_shape, arg_allocations,
-                                           request.execution_options()));
+                                           request.execution_options(),
+                                           execute_backend_.get()));
     VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
             << module_config->entry_computation_layout().ToString();
 
@@ -751,9 +772,10 @@ tensorflow::Status Service::Execute(const ExecuteRequest* arg,
       ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arg_allocations,
+                         arg->execution_options(), execute_backend_.get()));
 
   VLOG(3) << "Execute created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -818,9 +840,10 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
       ResolveAndValidateArguments(arg->arguments(), execute_backend_.get(),
                                   execute_backend_->default_device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, arg_allocations,
-                                         arg->execution_options()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(*program_shape, arg_allocations,
+                         arg->execution_options(), execute_backend_.get()));
 
   VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
@@ -1141,7 +1164,8 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
   }
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options));
+                      CreateModuleConfig(program_shape, {}, execution_options,
+                                         compute_constant_backend_.get()));
 
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<Executable> executable,
@@ -1202,7 +1226,8 @@ tensorflow::Status Service::GetComputationStats(
       user_computation->GetVersionedHandle();
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      computation_tracker_.BuildHloModule(versioned_handle));
+                      computation_tracker_.BuildHloModule(versioned_handle,
+                                                          /*config=*/nullptr));
 
   MakeHloDumper()(*module, "computation statistics subject");
 
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 9600f6989a40c9180d00ccabbeb29cb37a28900a..05a955137f8dfe7aa085058c5a6673ce8f2f77f1 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -63,9 +63,14 @@ class ServiceOptions {
   ServiceOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
 
+  // Sets the thread pool size for parallel execution of an individual operator.
+  ServiceOptions& set_intra_op_parallelism_threads(int num_threads);
+  int intra_op_parallelism_threads() const;
+
  private:
   perftools::gputools::Platform* platform_ = nullptr;
   int number_of_replicas_ = -1;
+  int intra_op_parallelism_threads_ = -1;
 };
 
 // The XLA service object, which is the same across all
@@ -265,11 +270,11 @@ class Service : public ServiceInterface {
       tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments,
       const Backend* backend, int device_ordinal);
 
-  // Create a Hlo module config foe the given program shape and arguments.
+  // Create a Hlo module config for the given program shape and arguments.
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Allocation*> arguments,
-      const ExecutionOptions& execution_options);
+      const ExecutionOptions& execution_options, Backend* backend);
 
   // Builds an Executable for the given parameters. If
   // executable_for_compute_constant is true, then the executable is intended to
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 0d4b214f5f3624971ae68e23f0f4fdba846f9178..017e5ef09ed2f52b862821e9408540d188a1edf5 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -30,10 +30,12 @@ class ServiceExecutableRunOptions {
   using StreamBorrower =
       std::function<StatusOr<Pool<perftools::gputools::Stream>::SmartPtr>(int)>;
 
-  explicit ServiceExecutableRunOptions(ExecutableRunOptions run_options,
-                                       StreamBorrower borrow_stream = nullptr)
+  explicit ServiceExecutableRunOptions(
+      ExecutableRunOptions run_options, StreamBorrower borrow_stream = nullptr,
+      tensorflow::thread::ThreadPool* xla_intra_op_thread_pool = nullptr)
       : run_options_(std::move(run_options)),
-        borrow_stream_(std::move(borrow_stream)) {}
+        borrow_stream_(std::move(borrow_stream)),
+        xla_intra_op_thread_pool_(xla_intra_op_thread_pool) {}
 
   // Returns reference or pointer to `ExecutableRunOptions` member.
   const ExecutableRunOptions& run_options() const { return run_options_; }
@@ -53,9 +55,15 @@ class ServiceExecutableRunOptions {
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
   }
 
+  // Returns reference to thread pool for execution of XLA ops on CPU backend.
+  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool() const {
+    return xla_intra_op_thread_pool_;
+  }
+
  private:
   ExecutableRunOptions run_options_;
   StreamBorrower borrow_stream_;
+  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index c05cf8c37d84f4120df344db939551e26a0355af..b2ef8ed486b5ab4643cb0e26fa6c18e1f3894a4b 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -244,8 +244,11 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     }
     if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) {
       return InvalidArgument(
-          "cannot concatenate arrays with different ranks: %lld vs %lld",
-          ShapeUtil::Rank(*arg_shape), ShapeUtil::Rank(*shape));
+          "Cannot concatenate arrays with different ranks: %lld (%s) vs %lld "
+          "(%s)",
+          ShapeUtil::Rank(*arg_shape),
+          ShapeUtil::HumanString(*arg_shape).c_str(), ShapeUtil::Rank(*shape),
+          ShapeUtil::HumanString(*shape).c_str());
     }
     if (arg_shape->element_type() != shape->element_type()) {
       return InvalidArgument(
@@ -309,6 +312,10 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     return InvalidArgument(
         "the rank of the operand and the padding configuration do not match.");
   }
+  if (operand_shape.element_type() != padding_value_shape.element_type()) {
+    return InvalidArgument(
+        "the element types of the operands to pad do not match");
+  }
   std::vector<int64> dimensions(ShapeUtil::Rank(operand_shape));
   for (int64 i = 0; i < operand_shape.dimensions_size(); ++i) {
     dimensions[i] = operand_shape.dimensions(i) +
@@ -338,7 +345,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
   // Check if both element types are the same.
   if (lhs.element_type() != rhs.element_type()) {
-    return fail("element types mismatch");
+    return fail("element types do not match");
   }
 
   if (ShapeUtil::Rank(lhs) < 1 || ShapeUtil::Rank(lhs) > 2 ||
@@ -633,26 +640,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   TF_DCHECK_OK(ShapeUtil::ValidateShape(ehs));
   switch (operation) {
     case TRIOP_CLAMP:
-      TF_RETURN_IF_ERROR(
-          ExpectNotTupleOrOpaque(lhs, "lhs of ternary operation"));
-      TF_RETURN_IF_ERROR(
-          ExpectNotTupleOrOpaque(rhs, "rhs of ternary operation"));
-      TF_RETURN_IF_ERROR(
-          ExpectNotTupleOrOpaque(ehs, "ehs of ternary operation"));
-      if (((ShapeUtil::Compatible(lhs, rhs) || ShapeUtil::Rank(lhs) == 0) &&
-           (ShapeUtil::Compatible(rhs, ehs) || ShapeUtil::Rank(ehs) == 0))) {
-        return rhs;
-      }
-      if (ShapeUtil::Rank(rhs) == 0) {
-        if (ShapeUtil::Compatible(lhs, ehs)) {
-          return lhs;
-        }
-        return ShapeUtil::Rank(ehs) == 0 ? lhs : ehs;
-      }
-      return Unimplemented("not yet implemented: %s, %s <clamp> %s",
-                           lhs.ShortDebugString().c_str(),
-                           ehs.ShortDebugString().c_str(),
-                           rhs.ShortDebugString().c_str());
+      return InferClampShape(lhs, rhs, ehs);
     case TRIOP_SELECT:
       return InferSelectShape(lhs, rhs, ehs);
     case TRIOP_UPDATE:
@@ -1332,6 +1320,41 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   return ShapeUtil::PermuteDimensions(InversePermutation(dimensions), operand);
 }
 
+// TODO(b/36794510): Make broadcast semantics more consistent, by supporting
+// "degenerate" cases, as with binary elementwise ops.
+/* static */ StatusOr<Shape> ShapeInference::InferClampShape(
+    const Shape& min, const Shape& operand, const Shape& max) {
+  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(min, "clamp min"));
+  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "clamp operand"));
+  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(max, "clamp max"));
+  if (!ShapeUtil::SameElementType(min, operand) ||
+      !ShapeUtil::SameElementType(max, operand)) {
+    return InvalidArgument("clamp op with different operand types: %s, %s, %s",
+                           ShapeUtil::HumanString(min).c_str(),
+                           ShapeUtil::HumanString(operand).c_str(),
+                           ShapeUtil::HumanString(max).c_str());
+  }
+  if (((ShapeUtil::Compatible(min, operand) || ShapeUtil::IsScalar(min)) &&
+       (ShapeUtil::Compatible(max, operand) || ShapeUtil::IsScalar(max)))) {
+    return operand;
+  }
+  if (ShapeUtil::IsScalar(operand)) {
+    if (ShapeUtil::Compatible(min, max)) {
+      return min;
+    } else if (ShapeUtil::IsScalar(min)) {
+      return max;
+    } else if (ShapeUtil::IsScalar(max)) {
+      return min;
+    }
+  }
+  return Unimplemented(
+      "not yet implemented: %s, %s <clamp> %s", min.ShortDebugString().c_str(),
+      max.ShortDebugString().c_str(), operand.ShortDebugString().c_str());
+}
+
+// TODO(b/36794510): Make broadcast semantics more consistent, by supporting
+// "degenerate" cases, as with binary elementwise ops, as well as scalar
+// broadcast from all operands, not just the predicate.
 /* static */ StatusOr<Shape> ShapeInference::InferSelectShape(
     const Shape& pred, const Shape& on_true, const Shape& on_false) {
   if (!ShapeUtil::Compatible(on_true, on_false)) {
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index ced2f4d0017e26b8f6d54b78f240dedecdbc79f3..c2223423e9223ba8ad995212415f219eea48e2a6 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -190,6 +190,10 @@ class ShapeInference {
       BinaryOperation operation, const Shape& lhs, const Shape& rhs,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
+  // Helper for inferring the shape of Clamp ops.
+  static StatusOr<Shape> InferClampShape(const Shape& min, const Shape& operand,
+                                         const Shape& max);
+
   // Helper for inferring the shape of Select ops.
   static StatusOr<Shape> InferSelectShape(const Shape& pred,
                                           const Shape& on_true,
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 5a1ae6b0024c65c9a451f1500146dc81408b8684..7cff042a48db436b3d165e8eaedc5a3f3c76b15e 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -20,12 +20,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
 namespace {
 
+using ::testing::ContainsRegex;
+using ::testing::HasSubstr;
+
 class ShapeInferenceTest : public ::testing::Test {
  protected:
   // Some handy scalar shapes.
@@ -128,23 +132,21 @@ TEST_F(ShapeInferenceTest, SelectBadShapes) {
   auto inferred_status_error1 = ShapeInference::InferTernaryOpShape(
       TernaryOperation::TRIOP_SELECT, pred_, matrix_64_48_, matrix_32_64_);
   ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_MATCH(
-      inferred_status_error1.status().error_message(),
-      testing::ContainsRegex("operands to select must be the same shape"));
+  ASSERT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("operands to select must be the same shape"));
 
   auto inferred_status_error2 = ShapeInference::InferTernaryOpShape(
       TernaryOperation::TRIOP_SELECT, s32_, matrix_64_48_, matrix_64_48_);
   ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_MATCH(inferred_status_error2.status().error_message(),
-               testing::ContainsRegex("pred operand must have PRED"));
+  ASSERT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("pred operand must have PRED"));
 
   auto inferred_status_error3 = ShapeInference::InferTernaryOpShape(
       TernaryOperation::TRIOP_SELECT, ShapeUtil::MakeShape(PRED, {64}),
       matrix_64_48_, matrix_64_48_);
   ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_MATCH(
-      inferred_status_error3.status().error_message(),
-      testing::ContainsRegex("with non-scalar predicate with dimensionality"));
+  ASSERT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("with non-scalar predicate with dimensionality"));
 
   // Tuples have a TUPLE element type and cannot be the pred of a select.
   auto inferred_status_error4 = ShapeInference::InferTernaryOpShape(
@@ -152,9 +154,101 @@ TEST_F(ShapeInferenceTest, SelectBadShapes) {
       ShapeUtil::MakeTupleShape({f32_, f32_}),
       ShapeUtil::MakeTupleShape({f32_, f32_}));
   ASSERT_FALSE(inferred_status_error4.ok());
-  ASSERT_MATCH(
-      inferred_status_error4.status().error_message(),
-      testing::ContainsRegex("pred operand must have PRED element type"));
+  ASSERT_THAT(inferred_status_error4.status().error_message(),
+              HasSubstr("pred operand must have PRED element type"));
+}
+
+TEST_F(ShapeInferenceTest, ClampAllMatrix) {
+  auto inferred_status = ShapeInference::InferTernaryOpShape(
+      TernaryOperation::TRIOP_CLAMP, matrix_64_48_, matrix_64_48_,
+      matrix_64_48_);
+  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+}
+
+TEST_F(ShapeInferenceTest, ClampAllScalar) {
+  auto inferred_status = ShapeInference::InferTernaryOpShape(
+      TernaryOperation::TRIOP_CLAMP, f32_, f32_, f32_);
+  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_TRUE(ShapeUtil::Equal(f32_, inferred_status.ValueOrDie()));
+}
+
+TEST_F(ShapeInferenceTest, ClampMinScalar) {
+  auto inferred_status = ShapeInference::InferTernaryOpShape(
+      TernaryOperation::TRIOP_CLAMP, f32_, matrix_64_48_, matrix_64_48_);
+  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+}
+
+TEST_F(ShapeInferenceTest, ClampMaxScalar) {
+  auto inferred_status = ShapeInference::InferTernaryOpShape(
+      TernaryOperation::TRIOP_CLAMP, matrix_64_48_, matrix_64_48_, f32_);
+  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+}
+
+TEST_F(ShapeInferenceTest, ClampOperandScalar) {
+  auto inferred_status = ShapeInference::InferTernaryOpShape(
+      TernaryOperation::TRIOP_CLAMP, matrix_64_48_, f32_, matrix_64_48_);
+  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+}
+
+TEST_F(ShapeInferenceTest, ClampMinMatrix) {
+  auto inferred_status = ShapeInference::InferTernaryOpShape(
+      TernaryOperation::TRIOP_CLAMP, matrix_64_48_, f32_, f32_);
+  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+}
+
+TEST_F(ShapeInferenceTest, ClampMaxMatrix) {
+  auto inferred_status = ShapeInference::InferTernaryOpShape(
+      TernaryOperation::TRIOP_CLAMP, f32_, f32_, matrix_64_48_);
+  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+}
+
+TEST_F(ShapeInferenceTest, ClampOperandMatrix) {
+  auto inferred_status = ShapeInference::InferTernaryOpShape(
+      TernaryOperation::TRIOP_CLAMP, f32_, matrix_64_48_, f32_);
+  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
+}
+
+TEST_F(ShapeInferenceTest, ClampBadShapes) {
+  // Type mismatch
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
+                   TernaryOperation::TRIOP_CLAMP, s32_, f32_, f32_)
+                   .ok());
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
+                   TernaryOperation::TRIOP_CLAMP, f32_, s32_, f32_)
+                   .ok());
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
+                   TernaryOperation::TRIOP_CLAMP, f32_, f32_, s32_)
+                   .ok());
+  // Dimension mismatch
+  ASSERT_FALSE(
+      ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP,
+                                          vector_64_, vector_32_, vector_32_)
+          .ok());
+  ASSERT_FALSE(
+      ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP,
+                                          vector_32_, vector_64_, vector_32_)
+          .ok());
+  ASSERT_FALSE(
+      ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP,
+                                          vector_32_, vector_32_, vector_64_)
+          .ok());
+  // Dimension mismatch, where one operand is a scalar
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
+                   TernaryOperation::TRIOP_CLAMP, vector_64_, vector_32_, f32_)
+                   .ok());
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
+                   TernaryOperation::TRIOP_CLAMP, vector_64_, f32_, vector_32_)
+                   .ok());
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
+                   TernaryOperation::TRIOP_CLAMP, f32_, vector_64_, vector_32_)
+                   .ok());
 }
 
 TEST_F(ShapeInferenceTest, VariadicOpTuplify) {
@@ -205,8 +299,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSourceShape) {
       operand_shape_, select_program_shape_, window_, source_shape_fail,
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_MATCH(inferred_status_fail.status().error_message(),
-               testing::ContainsRegex("source shape does not match"));
+  ASSERT_THAT(inferred_status_fail.status().error_message(),
+              HasSubstr("source shape does not match"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape1) {
@@ -216,9 +310,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape1) {
       operand_shape_, select_program_shape_fail, window_, source_shape_,
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_MATCH(
-      inferred_status_fail.status().error_message(),
-      testing::ContainsRegex("select function must take 2 parameters"));
+  ASSERT_THAT(inferred_status_fail.status().error_message(),
+              HasSubstr("select function must take 2 parameters"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape2) {
@@ -228,8 +321,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape2) {
       operand_shape_, select_program_shape_fail, window_, source_shape_,
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_MATCH(inferred_status_fail.status().error_message(),
-               testing::ContainsRegex("select function must have rank-0 PRED"));
+  ASSERT_THAT(inferred_status_fail.status().error_message(),
+              HasSubstr("select function must have rank-0 PRED"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape3) {
@@ -239,8 +332,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape3) {
       operand_shape_, select_program_shape_fail, window_, source_shape_,
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_MATCH(inferred_status_fail.status().error_message(),
-               testing::ContainsRegex("select function's first parameter"));
+  ASSERT_THAT(inferred_status_fail.status().error_message(),
+              HasSubstr("select function's first parameter"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape4) {
@@ -250,8 +343,8 @@ TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape4) {
       operand_shape_, select_program_shape_fail, window_, source_shape_,
       init_value_shape_, scatter_program_shape_);
   ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_MATCH(inferred_status_fail.status().error_message(),
-               testing::ContainsRegex("select function's second parameter"));
+  ASSERT_THAT(inferred_status_fail.status().error_message(),
+              HasSubstr("select function's second parameter"));
 }
 
 TEST_F(ShapeInferenceTest, Convolve) {
@@ -405,8 +498,8 @@ TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) {
   auto inferred_status =
       ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, window, dnums);
   ASSERT_FALSE(inferred_status.ok());
-  ASSERT_MATCH(inferred_status.status().error_message(),
-               testing::ContainsRegex("each dimension exactly once"));
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("each dimension exactly once"));
 }
 
 TEST_F(ShapeInferenceTest, MapThatChangesElementType) {
@@ -443,43 +536,42 @@ TEST_F(ShapeInferenceTest, Map) {
   auto no_args_error = ShapeInference::InferMapShape(
       {}, ShapeUtil::MakeProgramShape({f32_, f32_}, f32_));
   ASSERT_FALSE(no_args_error.ok());
-  ASSERT_MATCH(no_args_error.status().error_message(),
-               testing::ContainsRegex("expects at least one argument"));
+  ASSERT_THAT(no_args_error.status().error_message(),
+              HasSubstr("expects at least one argument"));
 
   auto args_diff_shapes_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_64_},
       ShapeUtil::MakeProgramShape({f32_, f32_}, f32_));
   ASSERT_FALSE(args_diff_shapes_error.ok());
-  ASSERT_MATCH(
-      args_diff_shapes_error.status().error_message(),
-      testing::ContainsRegex("requires all operands to have the same shape"));
+  ASSERT_THAT(args_diff_shapes_error.status().error_message(),
+              HasSubstr("requires all operands to have the same shape"));
 
   auto arity_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_32_}, ShapeUtil::MakeProgramShape({f32_}, f32_));
   ASSERT_FALSE(arity_error.ok());
-  ASSERT_MATCH(arity_error.status().error_message(),
-               testing::ContainsRegex("function arity must match"));
+  ASSERT_THAT(arity_error.status().error_message(),
+              HasSubstr("function arity must match"));
 
   auto output_shape_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_32_},
       ShapeUtil::MakeProgramShape({f32_, f32_}, vector_32_));
   ASSERT_FALSE(output_shape_error.ok());
-  ASSERT_MATCH(output_shape_error.status().error_message(),
-               testing::ContainsRegex("result has to be a scalar"));
+  ASSERT_THAT(output_shape_error.status().error_message(),
+              HasSubstr("result has to be a scalar"));
 
   auto param_shape_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_32_},
       ShapeUtil::MakeProgramShape({vector_32_, f32_}, f32_));
   ASSERT_FALSE(param_shape_error.ok());
-  ASSERT_MATCH(param_shape_error.status().error_message(),
-               testing::ContainsRegex("parameter has to be a scalar"));
+  ASSERT_THAT(param_shape_error.status().error_message(),
+              HasSubstr("parameter has to be a scalar"));
 
   auto param_element_type_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_32_},
       ShapeUtil::MakeProgramShape({f32_, s32_}, f32_));
   ASSERT_FALSE(param_element_type_error.ok());
-  ASSERT_MATCH(param_element_type_error.status().error_message(),
-               testing::ContainsRegex("parameter type has to match argument"));
+  ASSERT_THAT(param_element_type_error.status().error_message(),
+              HasSubstr("parameter type has to match argument"));
 
   Shape arg = ShapeUtil::MakeShape(F32, {20});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_}, f32_);
@@ -490,26 +582,26 @@ TEST_F(ShapeInferenceTest, Map) {
   auto inferred_status_error1 = ShapeInference::InferMapShape(
       {&arg}, ShapeUtil::MakeProgramShape({f32_, f32_}, f32_));
   ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_MATCH(inferred_status_error1.status().error_message(),
-               testing::ContainsRegex("arity must match number of arguments"));
+  ASSERT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("arity must match number of arguments"));
 
   auto inferred_status_error2 = ShapeInference::InferMapShape(
       {&arg}, ShapeUtil::MakeProgramShape({vector_32_}, f32_));
   ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_MATCH(inferred_status_error2.status().error_message(),
-               testing::ContainsRegex("has to be a scalar"));
+  ASSERT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("has to be a scalar"));
 
   auto inferred_status_error3 = ShapeInference::InferMapShape(
       {&arg}, ShapeUtil::MakeProgramShape({f32_}, vector_32_));
   ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_MATCH(inferred_status_error3.status().error_message(),
-               testing::ContainsRegex("has to be a scalar"));
+  ASSERT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("has to be a scalar"));
 
   auto inferred_status_error5 = ShapeInference::InferMapShape(
       {&arg}, ShapeUtil::MakeProgramShape({s32_}, s32_));
   ASSERT_FALSE(inferred_status_error5.ok());
-  ASSERT_MATCH(inferred_status_error5.status().error_message(),
-               testing::ContainsRegex("parameter type has to match argument"));
+  ASSERT_THAT(inferred_status_error5.status().error_message(),
+              HasSubstr("parameter type has to match argument"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ReduceVectorToScalar) {
@@ -563,8 +655,8 @@ TEST_F(ReduceShapeInferenceTest, ErrorOutOfBoundsDimension) {
       ShapeUtil::MakeShape(F32, {5, 3}), f32_, /*dimensions_to_reduce=*/{3, 4},
       to_apply);
   EXPECT_FALSE(inferred_status.ok());
-  EXPECT_MATCH(inferred_status.status().error_message(),
-               testing::ContainsRegex("out-of-bounds dimension"));
+  EXPECT_THAT(inferred_status.status().error_message(),
+              HasSubstr("out-of-bounds dimension"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) {
@@ -573,8 +665,8 @@ TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) {
       ShapeInference::InferReduceShape(ShapeUtil::MakeShape(F32, {5, 3}), f32_,
                                        /*dimensions_to_reduce=*/{0}, to_apply);
   EXPECT_FALSE(inferred_status.ok());
-  EXPECT_MATCH(inferred_status.status().error_message(),
-               testing::ContainsRegex("take 2 parameters"));
+  EXPECT_THAT(inferred_status.status().error_message(),
+              HasSubstr("take 2 parameters"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) {
@@ -583,8 +675,8 @@ TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) {
       ShapeInference::InferReduceShape(ShapeUtil::MakeShape(F32, {5, 3}), f32_,
                                        /*dimensions_to_reduce=*/{0}, to_apply);
   EXPECT_FALSE(inferred_status.ok());
-  EXPECT_MATCH(inferred_status.status().error_message(),
-               testing::ContainsRegex("first parameter shape differs"));
+  EXPECT_THAT(inferred_status.status().error_message(),
+              HasSubstr("first parameter shape differs"));
 }
 
 TEST_F(ShapeInferenceTest, InferSliceShapeRank2) {
@@ -726,8 +818,8 @@ TEST_F(ShapeInferenceTest, ScalarDotVector) {
   auto inferred_status =
       ShapeInference::InferBinaryOpShape(BINOP_DOT, f32_, vector_32_, {});
   ASSERT_FALSE(inferred_status.ok());
-  ASSERT_MATCH(inferred_status.status().error_message(),
-               testing::ContainsRegex("dot only supports rank"));
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("dot only supports rank"));
 }
 
 // 3D <dot> 2D: error
@@ -735,8 +827,8 @@ TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) {
   auto inferred_status = ShapeInference::InferBinaryOpShape(
       BINOP_DOT, ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, {});
   ASSERT_FALSE(inferred_status.ok());
-  ASSERT_MATCH(inferred_status.status().error_message(),
-               testing::ContainsRegex("dot only supports rank"));
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("dot only supports rank"));
 }
 
 // vector <dot> vector -> scalar
@@ -848,46 +940,43 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) {
   auto inferred_status_error1 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, vec8, {});
   ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_MATCH(inferred_status_error1.status().error_message(),
-               testing::ContainsRegex("automatic"));
+  ASSERT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("automatic"));
 
   // broadcast_dimension out of bounds for tensor's rank
   auto inferred_status_error2 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, vec8, {3});
   ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_MATCH(
-      inferred_status_error2.status().error_message(),
-      testing::ContainsRegex("broadcast dimension number .* too large"));
+  ASSERT_THAT(inferred_status_error2.status().error_message(),
+              ContainsRegex("broadcast dimension number .* too large"));
 
   // broadcast_dimension doesn't match corresponding dimension
   auto inferred_status_error3 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, vec8, {0});
   ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_MATCH(inferred_status_error3.status().error_message(),
-               testing::ContainsRegex("broadcast dimension 0 mismatch"));
+  ASSERT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("broadcast dimension 0 mismatch"));
 
   // broadcast_dimensions list too long
   auto inferred_status_error4 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, matrix8_4, {0, 1, 2});
   ASSERT_FALSE(inferred_status_error4.ok());
-  ASSERT_MATCH(
-      inferred_status_error4.status().error_message(),
-      testing::ContainsRegex("size of broadcast_dimensions has to match"));
+  ASSERT_THAT(inferred_status_error4.status().error_message(),
+              HasSubstr("size of broadcast_dimensions has to match"));
 
   // there's a dimension above the rank of the tensor
   auto inferred_status_error5 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, matrix8_4, {3, 0});
   ASSERT_FALSE(inferred_status_error5.ok());
-  ASSERT_MATCH(
-      inferred_status_error5.status().error_message(),
-      testing::ContainsRegex("broadcast dimension number .* too large"));
+  ASSERT_THAT(inferred_status_error5.status().error_message(),
+              ContainsRegex("broadcast dimension number .* too large"));
 
   // broadcasting dimensions don't match in this order
   auto inferred_status_error6 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor, matrix8_4, {2, 1});
   ASSERT_FALSE(inferred_status_error6.ok());
-  ASSERT_MATCH(inferred_status_error6.status().error_message(),
-               testing::ContainsRegex("broadcast dimension 0 mismatch"));
+  ASSERT_THAT(inferred_status_error6.status().error_message(),
+              HasSubstr("broadcast dimension 0 mismatch"));
 
   // The following two tests make sure that broadcasting dimensions are listed
   // in a proper (strictly increasing) order, even if the lower-rank array
@@ -895,14 +984,14 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) {
   auto inferred_status_error7 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {0, 0});
   ASSERT_FALSE(inferred_status_error7.ok());
-  ASSERT_MATCH(inferred_status_error7.status().error_message(),
-               testing::ContainsRegex("broadcast dimensions order is wrong"));
+  ASSERT_THAT(inferred_status_error7.status().error_message(),
+              HasSubstr("broadcast dimensions order is wrong"));
 
   auto inferred_status_error8 = ShapeInference::InferBinaryOpShape(
       BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {1, 0});
   ASSERT_FALSE(inferred_status_error8.ok());
-  ASSERT_MATCH(inferred_status_error8.status().error_message(),
-               testing::ContainsRegex("broadcast dimensions order is wrong"));
+  ASSERT_THAT(inferred_status_error8.status().error_message(),
+              HasSubstr("broadcast dimensions order is wrong"));
 }
 
 // Tests for the while instruction with proper shapes.
@@ -927,30 +1016,30 @@ TEST_F(ShapeInferenceTest, WhileWithBadShapes) {
   auto inferred_status_error1 =
       ShapeInference::InferWhileShape(bad_shape_1, body, result_shape);
   ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_MATCH(inferred_status_error1.status().error_message(),
-               testing::ContainsRegex("condition must take 1 arguments"));
+  ASSERT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("condition must take 1 arguments"));
 
   auto bad_shape_2 =
       ShapeUtil::MakeProgramShape({s32_, result_shape}, result_shape);
   auto inferred_status_error2 =
       ShapeInference::InferWhileShape(cond, bad_shape_2, result_shape);
   ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_MATCH(inferred_status_error2.status().error_message(),
-               testing::ContainsRegex("body must take 1 arguments"));
+  ASSERT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("body must take 1 arguments"));
 
   auto bad_shape_3 = ShapeUtil::MakeProgramShape({result_shape}, s32_);
   auto inferred_status_error3 =
       ShapeInference::InferWhileShape(bad_shape_3, body, result_shape);
   ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_MATCH(inferred_status_error3.status().error_message(),
-               testing::ContainsRegex("condition must return a boolean"));
+  ASSERT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("condition must return a boolean"));
 
   auto bad_shape_4 = ShapeUtil::MakeProgramShape({result_shape}, vector_32_);
   auto inferred_status_error4 =
       ShapeInference::InferWhileShape(cond, bad_shape_4, result_shape);
   ASSERT_FALSE(inferred_status_error4.ok());
-  ASSERT_MATCH(inferred_status_error4.status().error_message(),
-               testing::ContainsRegex("parameter of condition and body"));
+  ASSERT_THAT(inferred_status_error4.status().error_message(),
+              HasSubstr("parameter of condition and body"));
 }
 
 // Tests for the concatenate instruction with proper shapes.
@@ -980,49 +1069,44 @@ TEST_F(ShapeInferenceTest, ConcatenateWithBadShapes) {
   auto inferred_status_error1 =
       ShapeInference::InferConcatOpShape({}, /*dimension=*/0);
   ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_MATCH(
-      inferred_status_error1.status().error_message(),
-      testing::ContainsRegex("Concatenate expects at least one argument"));
+  ASSERT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("Concatenate expects at least one argument"));
 
   auto inferred_status_error2 =
       ShapeInference::InferConcatOpShape({&vector_32_}, /*dimension=*/-1);
   ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_MATCH(inferred_status_error2.status().error_message(),
-               testing::ContainsRegex(
-                   "dimension to concatenate along out of bounds: -1"));
+  ASSERT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("dimension to concatenate along out of bounds: -1"));
 
   auto inferred_status_error3 =
       ShapeInference::InferConcatOpShape({&vector_32_}, /*dimension=*/1);
   ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_MATCH(inferred_status_error3.status().error_message(),
-               testing::ContainsRegex(
-                   "dimension to concatenate along out of bounds: 1"));
+  ASSERT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("dimension to concatenate along out of bounds: 1"));
 
   Shape tuple = ShapeUtil::MakeTupleShape({vector_32_});
   auto inferred_status_error4 = ShapeInference::InferConcatOpShape(
       {&vector_32_, &tuple}, /*dimension=*/0);
   ASSERT_FALSE(inferred_status_error4.ok());
-  ASSERT_MATCH(
+  ASSERT_THAT(
       inferred_status_error4.status().error_message(),
-      testing::ContainsRegex(
-          "Expected non-tuple argument for operand of concatenation."));
+      HasSubstr("Expected non-tuple argument for operand of concatenation."));
 
   const Shape vector_s32 = ShapeUtil::MakeShape(S32, {32});
   auto inferred_status_error5 = ShapeInference::InferConcatOpShape(
       {&vector_32_, &vector_s32}, /*dimension=*/0);
   ASSERT_FALSE(inferred_status_error5.ok());
-  ASSERT_MATCH(inferred_status_error5.status().error_message(),
-               testing::ContainsRegex(
-                   "cannot concatenate arrays with different element types"));
+  ASSERT_THAT(
+      inferred_status_error5.status().error_message(),
+      HasSubstr("cannot concatenate arrays with different element types"));
 
   auto inferred_status_error6 = ShapeInference::InferConcatOpShape(
       {&matrix_32_48_, &matrix_32_64_}, /*dimension=*/0);
   ASSERT_FALSE(inferred_status_error6.ok());
-  ASSERT_MATCH(
-      inferred_status_error6.status().error_message(),
-      testing::ContainsRegex("cannot concatenate arrays that differ in "
-                             "dimensions other than the one being "
-                             "concatenated"));
+  ASSERT_THAT(inferred_status_error6.status().error_message(),
+              HasSubstr("cannot concatenate arrays that differ in "
+                        "dimensions other than the one being "
+                        "concatenated"));
 }
 
 TEST_F(ShapeInferenceTest, Pad) {
@@ -1063,27 +1147,27 @@ TEST_F(ShapeInferenceTest, ReverseInvalidDimension) {
   auto inferred_status_error0 =
       ShapeInference::InferReverseShape(input_shape, {0, 2});
   ASSERT_FALSE(inferred_status_error0.ok());
-  ASSERT_MATCH(inferred_status_error0.status().error_message(),
-               testing::ContainsRegex("out-of-bounds"));
+  ASSERT_THAT(inferred_status_error0.status().error_message(),
+              HasSubstr("out-of-bounds"));
 
   auto inferred_status_error1 =
       ShapeInference::InferReverseShape(input_shape, {0, -1});
   ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_MATCH(inferred_status_error1.status().error_message(),
-               testing::ContainsRegex("out-of-bounds"));
+  ASSERT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("out-of-bounds"));
 
   auto inferred_status_error2 =
       ShapeInference::InferReverseShape(input_shape, {0, 0});
   ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_MATCH(inferred_status_error2.status().error_message(),
-               testing::ContainsRegex("duplicated"));
+  ASSERT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("duplicated"));
 
   Shape tuple_shape = ShapeUtil::MakeTupleShape({input_shape, input_shape});
   auto inferred_status_error3 =
       ShapeInference::InferReverseShape(tuple_shape, {0});
   ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_MATCH(inferred_status_error3.status().error_message(),
-               testing::ContainsRegex("Expected non-tuple argument"));
+  ASSERT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("Expected non-tuple argument"));
 }
 
 TEST_F(ShapeInferenceTest, Call) {
@@ -1103,20 +1187,20 @@ TEST_F(ShapeInferenceTest, Call) {
   auto inferred_status_error0 = ShapeInference::InferCallShape(
       {}, ShapeUtil::MakeProgramShape({f32_}, f32_));
   EXPECT_FALSE(inferred_status_error0.ok());
-  EXPECT_MATCH(inferred_status_error0.status().error_message(),
-               testing::ContainsRegex("arity must match"));
+  EXPECT_THAT(inferred_status_error0.status().error_message(),
+              HasSubstr("arity must match"));
 
   auto inferred_status_error1 = ShapeInference::InferCallShape(
       {&f32_}, ShapeUtil::MakeProgramShape({}, f32_));
   EXPECT_FALSE(inferred_status_error1.ok());
-  EXPECT_MATCH(inferred_status_error1.status().error_message(),
-               testing::ContainsRegex("arity must match"));
+  EXPECT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("arity must match"));
 
   auto inferred_status_error2 = ShapeInference::InferCallShape(
       {&f32_}, ShapeUtil::MakeProgramShape({s32_}, f32_));
   EXPECT_FALSE(inferred_status_error2.ok());
-  EXPECT_MATCH(inferred_status_error2.status().error_message(),
-               testing::ContainsRegex("parameter must match argument"));
+  EXPECT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("parameter must match argument"));
 }
 
 TEST_F(ShapeInferenceTest, Transpose) {
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index b052bb814693c2e9364c94154ca223fe98526622..83e893a14a6d95e3741af57d34eadef4e5c088d9 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -99,13 +99,6 @@ class TransferManager {
   // region for a host-to-device transfer.
   virtual int64 GetByteSizeRequirement(const Shape& shape) = 0;
 
-  // Returns whether tuple elements are distinct buffers (in which case each of
-  // the elements of a tuple should be deallocated, in addition to the tuple's
-  // buffer itself).
-  //
-  // TODO(b/36256956) Ideally tuple elements could always be distinct buffers.
-  virtual bool TupleElementsAreDistinctBuffers() const { return true; }
-
   // Transfer a memory block of the given size from the device source into the
   // 'destination' buffer.
   //
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 07e0ce89f6ad2ba194832096de2399ab618422a4..a0c88c6bbc23972bb6a0f3729e51ee0eaee72bc7 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -21,7 +21,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -30,43 +32,55 @@ namespace xla {
 
 namespace {
 
-bool IsOperandFoldableToDot(const HloInstruction& hlo) {
-  return hlo.IsRank2Transpose() &&
-         hlo.user_count() == 1;  // The dot is its only user.
-}
-
-bool CanFoldOperandsIntoDot(
+TransposeFolding::OperandIndices CanFoldOperandsIntoDot(
     const HloInstruction& dot,
-    const TransposeFolding::IsTransposableGemmFn& is_transposable_gemm) {
+    const TransposeFolding::TransposableGemmOperandsFn&
+        transposable_gemm_operands) {
   if (HloOpcode::kDot != dot.opcode()) {
-    return false;
+    return {};
   }
 
-  if (!is_transposable_gemm(dot)) {
-    return false;
+  TransposeFolding::OperandIndices operand_set;
+  for (int64 i = 0; i < dot.operand_count(); ++i) {
+    auto& operand = *dot.operand(i);
+    if (operand.IsRank2Transpose() && operand.user_count() == 1) {
+      operand_set.push_back(i);
+    }
   }
 
-  const HloInstruction* lhs = dot.operand(0);
-  const HloInstruction* rhs = dot.operand(1);
-  bool lhs_foldable = IsOperandFoldableToDot(*lhs);
-  bool rhs_foldable = IsOperandFoldableToDot(*rhs);
-  if (!lhs_foldable && !rhs_foldable) {
-    return false;
+  return transposable_gemm_operands(dot, operand_set);
+}
+
+TransposeFolding::OperandIndices CanFoldOperandsIntoConvolution(
+    const HloInstruction& convolution,
+    const TransposeFolding::TransposableConvOperandsFn&
+        transposable_conv_operands) {
+  if (HloOpcode::kConvolution != convolution.opcode()) {
+    return {};
   }
-  return true;
+
+  // We only support folding the RHS.
+  const int64 kRhsOperandIndex = 1;
+  auto& operand = *convolution.operand(kRhsOperandIndex);
+  if (operand.opcode() == HloOpcode::kTranspose && operand.user_count() == 1) {
+    return transposable_conv_operands(convolution, {kRhsOperandIndex});
+  }
+
+  return {};
 }
 
+using InstructionOperandsPair =
+    std::pair<HloInstruction*, TransposeFolding::OperandIndices>;
+
 // Folds the operands of `dot` that are foldable transposes. `computation` is
-// the parent HLO computation of `dot`. `module` is the parent HloModule of
-// `computation`.
+// the parent HLO computation of `dot`.
 //
 // Returns whether the module is changed.
-bool FoldTransposeIntoDot(HloInstruction* dot, HloComputation* computation) {
+bool FoldTransposeIntoDot(InstructionOperandsPair pair) {
+  auto* dot = pair.first;
   std::vector<HloInstruction*> instructions_to_fuse(1, dot);
-  for (HloInstruction* operand : dot->operands()) {
-    if (IsOperandFoldableToDot(*operand)) {
-      instructions_to_fuse.push_back(operand);
-    }
+  for (const int64 operand_index : pair.second) {
+    instructions_to_fuse.push_back(dot->mutable_operand(operand_index));
   }
 
   // Early-exit if no operands are foldable.
@@ -74,33 +88,100 @@ bool FoldTransposeIntoDot(HloInstruction* dot, HloComputation* computation) {
     return false;
   }
 
-  computation->CreateFusionInstruction(
+  dot->parent()->CreateFusionInstruction(
       instructions_to_fuse, HloInstruction::FusionKind::kTransposeDot);
   return true;
 }
 
+// Folds the operands of `convolution` that are foldable transposes.
+// `computation` is the parent HLO computation of `convolution`.
+//
+// Returns whether the module is changed.
+bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
+  auto& convolution = *pair.first;
+
+  // We only support fusing the RHS transpose into convolution.
+  //
+  // ConvolutionDimensionNumbers doesn't make enough of a distinction between
+  // the output and the activations.
+  //
+  // TODO(b/37125184): Support transposing the LHS too.
+  if (pair.second.size() != 1 || pair.second.front() != 1) {
+    return false;
+  }
+
+  const ConvolutionDimensionNumbers& dnums =
+      convolution.convolution_dimension_numbers();
+  HloInstruction& transpose = *convolution.mutable_operand(1);
+  CHECK_EQ(transpose.opcode(), HloOpcode::kTranspose);
+  const auto& transpose_dimensions = transpose.dimensions();
+  HloInstruction& transpose_operand = *transpose.mutable_operand(0);
+
+  // Everything remains the same except for the kernel dimension numbers. We
+  // need to apply the transpose permutation to the original shape to figure out
+  // what the new logical dimensions are.
+  ConvolutionDimensionNumbers new_dnums = dnums;
+  new_dnums.set_kernel_input_feature_dimension(
+      transpose_dimensions[dnums.kernel_input_feature_dimension()]);
+  new_dnums.set_kernel_output_feature_dimension(
+      transpose_dimensions[dnums.kernel_output_feature_dimension()]);
+  for (auto& kernel_spatial_dimension :
+       *new_dnums.mutable_kernel_spatial_dimensions()) {
+    kernel_spatial_dimension = transpose_dimensions[kernel_spatial_dimension];
+  }
+
+  auto new_conv = HloInstruction::CreateConvolve(
+      convolution.shape(), convolution.mutable_operand(0), &transpose_operand,
+      convolution.window(), new_dnums);
+  TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction(
+      &convolution, std::move(new_conv)));
+
+  return true;
+}
+
 }  // namespace
 
-TransposeFolding::TransposeFolding(IsTransposableGemmFn is_transposable_gemm)
-    : is_transposable_gemm_(std::move(is_transposable_gemm)) {}
+TransposeFolding::TransposeFolding(
+    TransposableGemmOperandsFn transposable_gemm_operands,
+    TransposableConvOperandsFn transposable_conv_operands)
+    : transposable_gemm_operands_(std::move(transposable_gemm_operands)),
+      transposable_conv_operands_(std::move(transposable_conv_operands)) {}
 
 StatusOr<bool> TransposeFolding::Run(HloModule* module) {
   // Modifying the graph while traversing is dangerous, so we find all folding
   // opportunities before actually folding them.
-  HloComputation* entry_computation = module->entry_computation();
-
-  std::vector<HloInstruction*> foldable_dots;
-  auto visit_fn = [this, &foldable_dots](HloInstruction* instruction) {
-    if (CanFoldOperandsIntoDot(*instruction, is_transposable_gemm_)) {
-      foldable_dots.emplace_back(instruction);
+  std::vector<std::pair<HloInstruction*, OperandIndices>> foldable_dots;
+  std::vector<std::pair<HloInstruction*, OperandIndices>> foldable_convolutions;
+  auto visit_fn = [this, &foldable_dots,
+                   &foldable_convolutions](HloInstruction* instruction) {
+    {
+      OperandIndices operand_indices =
+          CanFoldOperandsIntoDot(*instruction, transposable_gemm_operands_);
+      if (!operand_indices.empty()) {
+        foldable_dots.emplace_back(instruction, operand_indices);
+      }
+    }
+    {
+      OperandIndices operand_indices = CanFoldOperandsIntoConvolution(
+          *instruction, transposable_conv_operands_);
+      if (!operand_indices.empty()) {
+        foldable_convolutions.emplace_back(
+            std::make_pair(instruction, operand_indices));
+      }
     }
     return tensorflow::Status::OK();
   };
-  TF_RETURN_IF_ERROR(entry_computation->root_instruction()->Accept(visit_fn));
+
+  for (auto& comp : module->computations()) {
+    TF_RETURN_IF_ERROR(comp->Accept(visit_fn));
+  }
 
   bool changed = false;
-  for (HloInstruction* dot : foldable_dots) {
-    changed |= FoldTransposeIntoDot(dot, entry_computation);
+  for (InstructionOperandsPair& pair : foldable_dots) {
+    changed |= FoldTransposeIntoDot(pair);
+  }
+  for (InstructionOperandsPair& pair : foldable_convolutions) {
+    changed |= FoldTransposeIntoConvolution(pair);
   }
   return changed;
 }
diff --git a/tensorflow/compiler/xla/service/transpose_folding.h b/tensorflow/compiler/xla/service/transpose_folding.h
index d857c04ed8d0c0d9d6c005db0f29ab0c5abd3bb2..71e8446452f072c22bb730cbda65a1743a95cd4c 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.h
+++ b/tensorflow/compiler/xla/service/transpose_folding.h
@@ -25,16 +25,37 @@ namespace xla {
 // operator is implemented by a GEMM kernel that can transpose its inputs.
 class TransposeFolding : public HloPassInterface {
  public:
-  // IsTransposableGemmFn should return true iff the instruction argument is
-  // implemented as a GEMM kernel that supports transposing its arguments.
-  typedef std::function<bool(const HloInstruction&)> IsTransposableGemmFn;
-  explicit TransposeFolding(IsTransposableGemmFn is_transposable_gemm);
+  using OperandIndices = std::vector<int64>;
+
+  // Returns the set of foldable operands for a given HLO and some candidate
+  // operands.
+  using FoldableOperands = std::function<OperandIndices(const HloInstruction&,
+                                                        const OperandIndices&)>;
+  using TransposableGemmOperandsFn = FoldableOperands;
+  using TransposableConvOperandsFn = FoldableOperands;
+
+  // Helper function to explicitly not fold transposes.
+  static OperandIndices NeverFoldTranspose(const HloInstruction&,
+                                           const OperandIndices&) {
+    return {};
+  }
+  // transposable_gemm_operands returns the set of operands it wants to fold if
+  // the instruction argument is implemented as a GEMM kernel that supports
+  // transposing its arguments.
+  //
+  // transposable_conv_operands returns the set of operands it wants to fold if
+  // the instruction argument is implemented as a convolution that supports
+  // transposing its arguments.
+  explicit TransposeFolding(
+      TransposableGemmOperandsFn transposable_gemm_operands,
+      TransposableConvOperandsFn transposable_conv_operands);
   tensorflow::StringPiece name() const override { return "transpose-folding"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  IsTransposableGemmFn is_transposable_gemm_;
+  TransposableGemmOperandsFn transposable_gemm_operands_;
+  TransposableConvOperandsFn transposable_conv_operands_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 09f932e29e61a24b178e7ced0d2643aa484bea02..c72d127ea86e4e9daf99dff4335c538c081f0605 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -16,16 +16,19 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 
 #include <memory>
-#include <set>
+#include <unordered_set>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
@@ -35,12 +38,20 @@ namespace xla {
 class TransposeFoldingTest : public ::testing::Test {
  protected:
   void FoldTranspose(HloModule* module) {
-    TransposeFolding transpose_folding(gpu::ImplementedAsGemm);
+    TransposeFolding transpose_folding(
+        [](const HloInstruction& dot,
+           const TransposeFolding::OperandIndices& candidate_operands) {
+          return candidate_operands;
+        },
+        [](const HloInstruction& convolution,
+           const TransposeFolding::OperandIndices& candidate_operands) {
+          return candidate_operands;
+        });
     EXPECT_IS_OK(transpose_folding.Run(module).status());
   }
 };
 
-TEST_F(TransposeFoldingTest, FoldTranspose) {
+TEST_F(TransposeFoldingTest, FoldDotTranspose) {
   auto builder = HloComputation::Builder("entry_computation");
   HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}),
@@ -61,7 +72,7 @@ TEST_F(TransposeFoldingTest, FoldTranspose) {
   FoldTranspose(&module);
 
   // Instructions after folding: x, y, and the fusion.
-  std::set<HloInstruction*> instruction_set;
+  std::unordered_set<HloInstruction*> instruction_set;
   for (auto& instruction : entry_computation->instructions()) {
     instruction_set.insert(instruction.get());
   }
@@ -77,7 +88,7 @@ TEST_F(TransposeFoldingTest, FoldTranspose) {
   EXPECT_EQ(4, fusion->fused_instructions().size());
 }
 
-TEST_F(TransposeFoldingTest, FoldTransposeConstant) {
+TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
   auto builder = HloComputation::Builder("entry_computation");
   // 2x1
   HloInstruction* const0 = builder.AddInstruction(
@@ -115,7 +126,7 @@ TEST_F(TransposeFoldingTest, FoldTransposeConstant) {
             entry_computation->root_instruction()->fused_instructions().size());
 }
 
-TEST_F(TransposeFoldingTest, FuseWithConstantOperands) {
+TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   auto builder = HloComputation::Builder("entry");
   // (1.0 + 2.0) * (2.0 - 3.0)
   HloInstruction* const1 = builder.AddInstruction(
@@ -139,11 +150,219 @@ TEST_F(TransposeFoldingTest, FuseWithConstantOperands) {
   EXPECT_EQ(call, entry_computation->root_instruction());
   HloComputation* callee_computation = call->to_apply();
   // The arguments to the call should be const1, const2, and const3.
-  EXPECT_MATCH(call->operands(), testing::UnorderedMatcher<HloInstruction*>(
-                                     const1, const2, const3));
+  EXPECT_THAT(call->operands(),
+              ::testing::UnorderedElementsAre(const1, const2, const3));
 
   // The callee should contain 3 parameters and 3 binary operators.
   EXPECT_EQ(6, callee_computation->instructions().size());
 }
 
+TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) {
+  auto builder = HloComputation::Builder("entry_computation");
+  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}),
+      /*name=*/"x"));
+  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3}),
+      /*name=*/"y"));
+  HloInstruction* transpose_y =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot,
+      /*lhs=*/x, /*rhs=*/transpose_y));
+
+  HloModule module("test_module");
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build(dot));
+
+  HloInstruction* call = module.OutlineExpressionFromComputation(
+      {transpose_y, dot}, "outlined", entry_computation);
+
+  FoldTranspose(&module);
+
+  // Instructions after folding: x, y, and the fusion.
+  std::unordered_set<HloInstruction*> instruction_set;
+  for (auto& instruction : entry_computation->instructions()) {
+    instruction_set.insert(instruction.get());
+  }
+  CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(call))
+      << "call is not in entry_computation.";
+  CHECK(instruction_set.empty())
+      << "entry_computation should contain exactly 3 instructions.";
+  HloInstruction* fusion =
+      call->called_computations().front()->root_instruction();
+  EXPECT_EQ(HloOpcode::kFusion, fusion->opcode());
+
+  // The fusion instruction should contain two parameters, one transpose and
+  // one dot.
+  EXPECT_EQ(4, fusion->fused_instructions().size());
+}
+
+// Test that a two dimension swap of the kernel gets folded into convolution.
+TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) {
+  auto builder = HloComputation::Builder("entry_computation");
+  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}),
+      /*name=*/"x"));
+  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {3, 2, 1, 1}),
+      /*name=*/"y"));
+  HloInstruction* transpose_y =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), y, {1, 0, 2, 3}));
+  auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers();
+  Window window;
+  for (int i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_base_dilation(1);
+    dim->set_window_dilation(1);
+    dim->set_stride(1);
+    dim->set_size(
+        transpose_y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
+  }
+  StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
+      x->shape(), transpose_y->shape(), window, dnums);
+  EXPECT_IS_OK(conv_shape);
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      conv_shape.ValueOrDie(), x, transpose_y, window, dnums));
+
+  HloModule module("test_module");
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build(conv));
+  FoldTranspose(&module);
+
+  // Instructions after folding: x, y, and the convolution.
+  std::unordered_set<HloInstruction*> instruction_set;
+  for (auto& instruction : entry_computation->instructions()) {
+    instruction_set.insert(instruction.get());
+  }
+  CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.size())
+      << "entry_computation should contain exactly 3 instructions.";
+  HloInstruction* new_conv = *instruction_set.begin();
+  EXPECT_EQ(HloOpcode::kConvolution, new_conv->opcode());
+  EXPECT_EQ(dnums.kernel_input_feature_dimension(),
+            new_conv->convolution_dimension_numbers()
+                .kernel_output_feature_dimension());
+  EXPECT_EQ(dnums.kernel_output_feature_dimension(),
+            new_conv->convolution_dimension_numbers()
+                .kernel_input_feature_dimension());
+}
+
+// Test that a complex transpose of the kernel gets folded into convolution.
+TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) {
+  auto builder = HloComputation::Builder("entry_computation");
+  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}),
+      /*name=*/"x"));
+  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {1, 2, 1, 3}),
+      /*name=*/"y"));
+  HloInstruction* transpose_y =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), y, {1, 3, 0, 2}));
+  auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers();
+  Window window;
+  for (int i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_base_dilation(1);
+    dim->set_window_dilation(1);
+    dim->set_stride(1);
+    dim->set_size(
+        transpose_y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
+  }
+  StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
+      x->shape(), transpose_y->shape(), window, dnums);
+  EXPECT_IS_OK(conv_shape);
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      conv_shape.ValueOrDie(), x, transpose_y, window, dnums));
+
+  HloModule module("test_module");
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build(conv));
+  FoldTranspose(&module);
+
+  // Instructions after folding: x, y, and the convolution.
+  std::unordered_set<HloInstruction*> instruction_set;
+  for (auto& instruction : entry_computation->instructions()) {
+    instruction_set.insert(instruction.get());
+  }
+  CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.size())
+      << "entry_computation should contain exactly 3 instructions.";
+  HloInstruction* new_conv = *instruction_set.begin();
+  EXPECT_EQ(HloOpcode::kConvolution, new_conv->opcode());
+  EXPECT_EQ(dnums.kernel_input_feature_dimension(),
+            new_conv->convolution_dimension_numbers()
+                .kernel_output_feature_dimension());
+  EXPECT_EQ(dnums.kernel_spatial_dimensions(1),
+            new_conv->convolution_dimension_numbers()
+                .kernel_input_feature_dimension());
+  EXPECT_EQ(
+      dnums.kernel_output_feature_dimension(),
+      new_conv->convolution_dimension_numbers().kernel_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.kernel_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().kernel_spatial_dimensions(1));
+}
+
+// Test that a transpose of the activations does not get folded into
+// convolution.
+TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
+  auto builder = HloComputation::Builder("entry_computation");
+  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {3, 2, 1, 1}),
+      /*name=*/"x"));
+  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}),
+      /*name=*/"y"));
+  HloInstruction* transpose_x =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), x, {1, 0, 2, 3}));
+  auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers();
+  Window window;
+  for (int i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_base_dilation(1);
+    dim->set_window_dilation(1);
+    dim->set_stride(1);
+    dim->set_size(y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
+  }
+  StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
+      transpose_x->shape(), y->shape(), window, dnums);
+  EXPECT_IS_OK(conv_shape);
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      conv_shape.ValueOrDie(), transpose_x, y, window, dnums));
+
+  HloModule module("test_module");
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build(conv));
+  FoldTranspose(&module);
+
+  // Instructions after folding: transpose_x, y, and the convolution.
+  std::unordered_set<HloInstruction*> instruction_set;
+  for (auto& instruction : entry_computation->instructions()) {
+    instruction_set.insert(instruction.get());
+  }
+  CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(transpose_x))
+      << "transpose_x is not in entry_computation.";
+  CHECK_EQ(1, instruction_set.erase(conv))
+      << "transpose_x is not in entry_computation.";
+  CHECK_EQ(0, instruction_set.size())
+      << "entry_computation should contain exactly 4 instructions.";
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 98c51b48f9022c5f2d1e23b59a6ce775f3a48e0b..554adaf0e32f7cb896e07a59d5235ff84a11bb92 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -131,10 +131,9 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index,
 }
 
 /* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>>
-TuplePointsToAnalysis::Run(const HloModule* module,
-                           const bool include_loop_fusion_instructions) {
+TuplePointsToAnalysis::Run(const HloModule* module) {
   std::unique_ptr<TuplePointsToAnalysis> analysis(
-      new TuplePointsToAnalysis(module, include_loop_fusion_instructions));
+      new TuplePointsToAnalysis(module));
   TF_RETURN_IF_ERROR(analysis->Analyze());
   return std::move(analysis);
 }
@@ -145,17 +144,14 @@ Status TuplePointsToAnalysis::Analyze() {
     TF_RETURN_IF_ERROR(computation->Accept(this));
     TF_RETURN_IF_ERROR(
         PopulateDefinedBuffersAndAliases(computation->instructions()));
-    if (include_loop_fusion_instructions_) {
-      // Run points-to analysis on loop fusion instructions in 'computation'.
-      for (auto& instruction : computation->instructions()) {
-        if (instruction->opcode() != HloOpcode::kFusion ||
-            instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) {
-          continue;
-        }
-        TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this));
-        TF_RETURN_IF_ERROR(PopulateDefinedBuffersAndAliases(
-            instruction->fused_instructions()));
+    // Run points-to analysis on fusion instructions in 'computation'.
+    for (auto& instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kFusion) {
+        continue;
       }
+      TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this));
+      TF_RETURN_IF_ERROR(
+          PopulateDefinedBuffersAndAliases(instruction->fused_instructions()));
     }
   }
 
@@ -482,9 +478,7 @@ string TuplePointsToAnalysis::ToString() const {
     for (const HloInstruction* instruction :
          computation->MakeInstructionPostOrder()) {
       InstructionToString(instruction, &output);
-      if (include_loop_fusion_instructions_ &&
-          instruction->opcode() == HloOpcode::kFusion &&
-          instruction->fusion_kind() == HloInstruction::FusionKind::kLoop) {
+      if (instruction->opcode() == HloOpcode::kFusion) {
         for (auto& fused : instruction->fused_instructions()) {
           InstructionToString(fused.get(), &output);
         }
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index a384529171a7371c848ca8949d22cb6717d83a78..85a71b56ce5e9fb1a3441c302e18bd1fa7b68864 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -148,12 +148,9 @@ std::ostream& operator<<(std::ostream& out, const BufferAlias& buffer_alias);
 // the potential sources of each buffer in each instruction's output.
 class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
  public:
-  // Runs points-to analysis on 'module'. If 'include_loop_fusion_instructions'
-  // is true, includes fused instructions from each loop fusion instruction
-  // in 'module' in the points-to analysis.
+  // Runs points-to analysis on 'module'.
   static StatusOr<std::unique_ptr<TuplePointsToAnalysis>> Run(
-      const HloModule* module,
-      const bool include_loop_fusion_instructions = false);
+      const HloModule* module);
 
   // Return the points-to set of an instruction. This describes the potential
   // sources of each buffer in the instruction's output.
@@ -218,10 +215,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   string ToString() const;
 
  private:
-  explicit TuplePointsToAnalysis(const HloModule* module,
-                                 const bool include_loop_fusion_instructions)
-      : module_(module),
-        include_loop_fusion_instructions_(include_loop_fusion_instructions) {}
+  explicit TuplePointsToAnalysis(const HloModule* module) : module_(module) {}
 
   // Perform the analysis. Should be called immediately after constructing the
   // object and before calling GetPointsToSet.
@@ -261,9 +255,6 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   // The module this analysis is performed on.
   const HloModule* module_;
 
-  // Whether to run points-to analysis on loop fusion instructions in 'module_'.
-  const bool include_loop_fusion_instructions_;
-
   // A map containing a PointsToSet for every HLO instruction.
   tensorflow::gtl::FlatMap<const HloInstruction*, std::unique_ptr<PointsToSet>>
       points_to_;
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index 4a4a6e64ffae265bc143cfd7adb9f7d53b2b0359..87e1b058b79c0dc327cc1ad63a8cffa97c190df4 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -19,18 +19,25 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/instruction_fusion.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
+using ::testing::UnorderedElementsAreArray;
+using ::testing::UnorderedElementsAre;
+
 class TuplePointsToAnalysisTest : public HloTestBase {
  protected:
   // Builds a module with the given entry computation and runs points to
@@ -45,11 +52,10 @@ class TuplePointsToAnalysisTest : public HloTestBase {
     module_->AddEntryComputation(std::move(computation));
   }
 
-  void RunAnalysis(const bool include_loop_fusion_instructions = false) {
+  void RunAnalysis() {
     CHECK_NOTNULL(module_.get());
-    points_to_analysis_ = TuplePointsToAnalysis::Run(
-                              module_.get(), include_loop_fusion_instructions)
-                              .ConsumeValueOrDie();
+    points_to_analysis_ =
+        TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
   }
 
   // Returns the LogicalBuffer defined at the given instruction and
@@ -70,7 +76,7 @@ class TuplePointsToAnalysisTest : public HloTestBase {
       const std::vector<const LogicalBuffer*>& points_to_set,
       tensorflow::gtl::ArraySlice<const LogicalBuffer*> buffers) {
     std::vector<const LogicalBuffer*> vec(buffers.begin(), buffers.end());
-    EXPECT_MATCH(points_to_set, testing::UnorderedElementsAre(vec));
+    EXPECT_THAT(points_to_set, UnorderedElementsAreArray(vec));
   }
 
   // Checks that the given points-to set contains exactly (unordered) the
@@ -107,20 +113,14 @@ class TuplePointsToAnalysisTest : public HloTestBase {
     for (auto& pair : expected) {
       expected_aliases.push_back(BufferAlias(*buffer, pair.first, pair.second));
     }
-    EXPECT_MATCH(points_to_analysis_->GetBufferAliases(*buffer),
-                 testing::UnorderedElementsAre(expected_aliases));
+    EXPECT_THAT(points_to_analysis_->GetBufferAliases(*buffer),
+                UnorderedElementsAreArray(expected_aliases));
   }
 
   std::unique_ptr<HloModule> module_;
   std::unique_ptr<TuplePointsToAnalysis> points_to_analysis_;
 };
 
-// Expect the given std::set<HloInstruction*> as A contains exactly the given
-// HloInstruction*s as __VA_ARGS__.
-#define EXPECT_ISET(A, ...)                           \
-  EXPECT_MATCH(testing::SetToVec<HloInstruction*>(A), \
-               testing::UnorderedMatcher<HloInstruction*>(__VA_ARGS__))
-
 TEST_F(TuplePointsToAnalysisTest, SimpleTuple) {
   auto builder = HloComputation::Builder(TestName());
   auto constant1 = builder.AddInstruction(
@@ -146,8 +146,8 @@ TEST_F(TuplePointsToAnalysisTest, SimpleTuple) {
 
   EXPECT_EQ(3, points_to_analysis_->GetPointsToSet(tuple).size());
   EXPECT_FALSE(points_to_analysis_->GetPointsToSet(tuple).IsAmbiguous());
-  EXPECT_ISET(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}),
-              tuple);
+  EXPECT_THAT(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}),
+              UnorderedElementsAre(tuple));
 
   ExpectHasTopLevelBuffers(
       points_to_analysis_->GetPointsToSet(tuple).CreateFlattenedSet(),
@@ -205,9 +205,9 @@ TEST_F(TuplePointsToAnalysisTest, NestedTuple) {
   ExpectHasTopLevelBuffers(
       points_to_analysis_->GetPointsToSet(inner_tuple).element({}),
       {inner_tuple});
-  EXPECT_ISET(
+  EXPECT_THAT(
       points_to_analysis_->GetPointsToSet(inner_tuple).tuple_sources({}),
-      inner_tuple);
+      UnorderedElementsAre(inner_tuple));
 
   EXPECT_EQ(5, points_to_analysis_->GetPointsToSet(tuple).size());
   EXPECT_FALSE(points_to_analysis_->GetPointsToSet(tuple).IsAmbiguous());
@@ -215,10 +215,10 @@ TEST_F(TuplePointsToAnalysisTest, NestedTuple) {
       points_to_analysis_->GetPointsToSet(tuple).CreateFlattenedSet(),
       {constant1, constant2, constant3, inner_tuple, tuple});
 
-  EXPECT_ISET(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}),
-              tuple);
-  EXPECT_ISET(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({0}),
-              inner_tuple);
+  EXPECT_THAT(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}),
+              UnorderedElementsAre(tuple));
+  EXPECT_THAT(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({0}),
+              UnorderedElementsAre(inner_tuple));
   EXPECT_TRUE(
       points_to_analysis_->GetPointsToSet(tuple).tuple_sources({1}).empty());
 
@@ -262,7 +262,8 @@ TEST_F(TuplePointsToAnalysisTest, GetTupleElement) {
                            {constant1, constant2, inner_tuple});
   ExpectHasTopLevelBuffers(points_to_set.element({}), {inner_tuple});
 
-  EXPECT_ISET(points_to_set.tuple_sources({}), inner_tuple);
+  EXPECT_THAT(points_to_set.tuple_sources({}),
+              UnorderedElementsAre(inner_tuple));
 }
 
 TEST_F(TuplePointsToAnalysisTest, DuplicatedElement) {
@@ -460,8 +461,10 @@ TEST_F(TuplePointsToAnalysisTest, NestedTupleSelect) {
   ExpectHasTopLevelBuffers(points_to_set.element({0, 1}), {constant2});
 
   // Verify tuple sources.
-  EXPECT_ISET(points_to_set.tuple_sources({}), tuple1, tuple2);
-  EXPECT_ISET(points_to_set.tuple_sources({0}), inner_tuple1, inner_tuple2);
+  EXPECT_THAT(points_to_set.tuple_sources({}),
+              UnorderedElementsAre(tuple1, tuple2));
+  EXPECT_THAT(points_to_set.tuple_sources({0}),
+              UnorderedElementsAre(inner_tuple1, inner_tuple2));
   EXPECT_EQ(0, points_to_set.tuple_sources({0, 0}).size());
   EXPECT_EQ(0, points_to_set.tuple_sources({0, 1}).size());
 }
@@ -489,8 +492,8 @@ TEST_F(TuplePointsToAnalysisTest, TupleWithBitcast) {
 
   EXPECT_EQ(3, points_to_analysis_->GetPointsToSet(tuple).size());
   EXPECT_FALSE(points_to_analysis_->GetPointsToSet(tuple).IsAmbiguous());
-  EXPECT_ISET(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}),
-              tuple);
+  EXPECT_THAT(points_to_analysis_->GetPointsToSet(tuple).tuple_sources({}),
+              UnorderedElementsAre(tuple));
 
   ExpectHasTopLevelBuffers(
       points_to_analysis_->GetPointsToSet(tuple).CreateFlattenedSet(),
@@ -603,9 +606,9 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
                     .ValueOrDie());
     // Get computation root instruction (should be a kFusion).
     auto* fusion = module_->entry_computation()->root_instruction();
-    EXPECT_EQ(HloOpcode::kFusion, fusion->opcode());
+    EXPECT_THAT(fusion, op::Fusion(tuple_param0));
     // Run points-to analysis (should include fused instructions from 'fusion').
-    RunAnalysis(/*include_loop_fusion_instructions=*/true);
+    RunAnalysis();
 
     // Check points-to set of fusion parameter associated with 'tuple_param0'.
     auto* fusion_param = GetFusionParameterForOperand(fusion, tuple_param0);
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index a77788e0b63b984328c0ea52ebbb94cb8583e6e3..e9fcc9fa6666bb2e3c24252e1c0f5e8d763a5d48 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -1510,6 +1510,7 @@ void ConstantVisitor(const SessionComputation& session_computation,
                       is_constant);
       // TODO(b/32495713): We aren't checking the condition and body
       // computations themselves.
+      *is_constant = false;
       break;
     }
 
@@ -1927,6 +1928,12 @@ HloInstruction* ComputationLowerer::Visit(
 
   const OperationRequest& request =
       session_computation_.requests().at(handle.handle());
+  auto add_instruction = [&](std::unique_ptr<HloInstruction> instruction) {
+    HloInstruction* hlo_instruction =
+        hlo_builder_.AddInstruction(std::move(instruction));
+    hlo_instruction->set_metadata(request.request().metadata());
+    return hlo_instruction;
+  };
   HloInstruction* hlo_instruction;
   switch (request.request().op_case()) {
     case OpRequest::kRngRequest: {
@@ -1935,7 +1942,7 @@ HloInstruction* ComputationLowerer::Visit(
       for (const ComputationDataHandle& param : rng_request.parameter()) {
         parameters.push_back(Visit(param, visited));
       }
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateRng(
+      hlo_instruction = add_instruction(HloInstruction::CreateRng(
           request.output_shape(), rng_request.distribution(), parameters));
       break;
     }
@@ -1943,9 +1950,8 @@ HloInstruction* ComputationLowerer::Visit(
     case OpRequest::kConstantRequest: {
       const ConstantRequest& constant_request =
           request.request().constant_request();
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateConstant(
-              LiteralUtil::CloneToUnique(constant_request.literal())));
+      hlo_instruction = add_instruction(HloInstruction::CreateConstant(
+          LiteralUtil::CloneToUnique(constant_request.literal())));
       break;
     }
 
@@ -1954,17 +1960,15 @@ HloInstruction* ComputationLowerer::Visit(
           request.request().get_tuple_element_request();
       HloInstruction* operand =
           Visit(get_tuple_element_request.operand(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateGetTupleElement(
-              request.output_shape(), operand,
-              get_tuple_element_request.index()));
+      hlo_instruction = add_instruction(HloInstruction::CreateGetTupleElement(
+          request.output_shape(), operand, get_tuple_element_request.index()));
       break;
     }
 
     case OpRequest::kSliceRequest: {
       const SliceRequest& slice_request = request.request().slice_request();
       HloInstruction* operand = Visit(slice_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateSlice(
+      hlo_instruction = add_instruction(HloInstruction::CreateSlice(
           request.output_shape(), operand,
           AsInt64Slice(slice_request.start_indices()),
           AsInt64Slice(slice_request.limit_indices())));
@@ -1978,10 +1982,9 @@ HloInstruction* ComputationLowerer::Visit(
       HloInstruction* start_indices =
           Visit(dynamic_slice_request.start_indices(), visited);
 
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateDynamicSlice(
-              request.output_shape(), operand, start_indices,
-              AsInt64Slice(dynamic_slice_request.slice_sizes())));
+      hlo_instruction = add_instruction(HloInstruction::CreateDynamicSlice(
+          request.output_shape(), operand, start_indices,
+          AsInt64Slice(dynamic_slice_request.slice_sizes())));
       break;
     }
 
@@ -1995,7 +1998,7 @@ HloInstruction* ComputationLowerer::Visit(
       HloInstruction* start_indices =
           Visit(dynamic_update_slice_request.start_indices(), visited);
       hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+          add_instruction(HloInstruction::CreateDynamicUpdateSlice(
               request.output_shape(), operand, update, start_indices));
       break;
     }
@@ -2009,9 +2012,8 @@ HloInstruction* ComputationLowerer::Visit(
         HloInstruction* operand = Visit(handle, visited);
         operands.push_back(operand);
       }
-      hlo_instruction = hlo_builder_.AddInstruction(
-          HloInstruction::CreateConcatenate(request.output_shape(), operands,
-                                            concatenate_request.dimension()));
+      hlo_instruction = add_instruction(HloInstruction::CreateConcatenate(
+          request.output_shape(), operands, concatenate_request.dimension()));
       break;
     }
 
@@ -2020,10 +2022,9 @@ HloInstruction* ComputationLowerer::Visit(
           request.request().convolve_request();
       HloInstruction* lhs = Visit(convolve_request.lhs(), visited);
       HloInstruction* rhs = Visit(convolve_request.rhs(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateConvolve(
-              request.output_shape(), lhs, rhs, convolve_request.window(),
-              convolve_request.dimension_numbers()));
+      hlo_instruction = add_instruction(HloInstruction::CreateConvolve(
+          request.output_shape(), lhs, rhs, convolve_request.window(),
+          convolve_request.dimension_numbers()));
       break;
     }
 
@@ -2032,17 +2033,15 @@ HloInstruction* ComputationLowerer::Visit(
           request.request().cross_replica_sum_request();
       HloInstruction* operand =
           Visit(cross_replica_sum_request.operand(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateCrossReplicaSum(
-              request.output_shape(), operand));
+      hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum(
+          request.output_shape(), operand));
       break;
     }
 
     case OpRequest::kInfeedRequest: {
       const InfeedRequest& infeed_request = request.request().infeed_request();
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateInfeed(
-              request.output_shape(), infeed_request.config()));
+      hlo_instruction = add_instruction(HloInstruction::CreateInfeed(
+          request.output_shape(), infeed_request.config()));
       break;
     }
 
@@ -2050,9 +2049,8 @@ HloInstruction* ComputationLowerer::Visit(
       const OutfeedRequest& outfeed_request =
           request.request().outfeed_request();
       HloInstruction* operand = Visit(outfeed_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(
-          HloInstruction::CreateOutfeed(outfeed_request.shape(), operand,
-                                        outfeed_request.outfeed_config()));
+      hlo_instruction = add_instruction(HloInstruction::CreateOutfeed(
+          outfeed_request.shape(), operand, outfeed_request.outfeed_config()));
       break;
     }
 
@@ -2068,7 +2066,7 @@ HloInstruction* ComputationLowerer::Visit(
           request.embedded_computation_versions(0);
       HloComputation* map_computation =
           ResolveComputation(map_request.to_apply(), map_version);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateMap(
+      hlo_instruction = add_instruction(HloInstruction::CreateMap(
           request.output_shape(), operands, map_computation));
       break;
     }
@@ -2082,10 +2080,9 @@ HloInstruction* ComputationLowerer::Visit(
           request.embedded_computation_versions(0);
       HloComputation* reduce_computation =
           ResolveComputation(reduce_request.to_apply(), reduce_version);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateReduce(
-              request.output_shape(), operand, init_value,
-              AsInt64Slice(reduce_request.dimensions()), reduce_computation));
+      hlo_instruction = add_instruction(HloInstruction::CreateReduce(
+          request.output_shape(), operand, init_value,
+          AsInt64Slice(reduce_request.dimensions()), reduce_computation));
       break;
     }
 
@@ -2100,10 +2097,9 @@ HloInstruction* ComputationLowerer::Visit(
           request.embedded_computation_versions(0);
       HloComputation* reduce_window_computation = ResolveComputation(
           reduce_window_request.to_apply(), reduce_window_version);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateReduceWindow(
-              request.output_shape(), operand, init_value,
-              reduce_window_request.window(), reduce_window_computation));
+      hlo_instruction = add_instruction(HloInstruction::CreateReduceWindow(
+          request.output_shape(), operand, init_value,
+          reduce_window_request.window(), reduce_window_computation));
       break;
     }
 
@@ -2125,11 +2121,10 @@ HloInstruction* ComputationLowerer::Visit(
           select_and_scatter_request.select(), select_version);
       HloComputation* scatter_computation = ResolveComputation(
           select_and_scatter_request.scatter(), scatter_version);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateSelectAndScatter(
-              request.output_shape(), operand, select_computation,
-              select_and_scatter_request.window(), source, init_value,
-              scatter_computation));
+      hlo_instruction = add_instruction(HloInstruction::CreateSelectAndScatter(
+          request.output_shape(), operand, select_computation,
+          select_and_scatter_request.window(), source, init_value,
+          scatter_computation));
       break;
     }
 
@@ -2150,9 +2145,8 @@ HloInstruction* ComputationLowerer::Visit(
                                        ShapeUtil::Rank(request.output_shape()) -
                                        ShapeUtil::Rank(operand->shape()));
       }
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast(
-              request.output_shape(), operand, broadcast_dimensions));
+      hlo_instruction = add_instruction(HloInstruction::CreateBroadcast(
+          request.output_shape(), operand, broadcast_dimensions));
       break;
     }
 
@@ -2164,14 +2158,13 @@ HloInstruction* ComputationLowerer::Visit(
       if (IsIdentityPermutation(AsInt64Slice(reshape_request.dimensions()))) {
         transposed = operand;
       } else {
-        transposed =
-            hlo_builder_.AddInstruction(HloInstruction::CreateTranspose(
-                ShapeUtil::PermuteDimensions(InversePermutation(AsInt64Slice(
-                                                 reshape_request.dimensions())),
-                                             operand->shape()),
-                operand, AsInt64Slice(reshape_request.dimensions())));
+        transposed = add_instruction(HloInstruction::CreateTranspose(
+            ShapeUtil::PermuteDimensions(
+                InversePermutation(AsInt64Slice(reshape_request.dimensions())),
+                operand->shape()),
+            operand, AsInt64Slice(reshape_request.dimensions())));
       }
-      hlo_instruction = hlo_builder_.AddInstruction(
+      hlo_instruction = add_instruction(
           HloInstruction::CreateReshape(request.output_shape(), transposed));
       break;
     }
@@ -2180,12 +2173,11 @@ HloInstruction* ComputationLowerer::Visit(
       const TransposeRequest& transpose_request =
           request.request().transpose_request();
       HloInstruction* operand = Visit(transpose_request.operand(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateTranspose(
-              ShapeUtil::PermuteDimensions(InversePermutation(AsInt64Slice(
-                                               transpose_request.dimensions())),
-                                           operand->shape()),
-              operand, AsInt64Slice(transpose_request.dimensions())));
+      hlo_instruction = add_instruction(HloInstruction::CreateTranspose(
+          ShapeUtil::PermuteDimensions(
+              InversePermutation(AsInt64Slice(transpose_request.dimensions())),
+              operand->shape()),
+          operand, AsInt64Slice(transpose_request.dimensions())));
       break;
     }
 
@@ -2193,10 +2185,9 @@ HloInstruction* ComputationLowerer::Visit(
       const ReverseRequest& reverse_request =
           request.request().reverse_request();
       HloInstruction* operand = Visit(reverse_request.operand(), visited);
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateReverse(
-              request.output_shape(), operand,
-              AsInt64Slice(reverse_request.dimensions())));
+      hlo_instruction = add_instruction(HloInstruction::CreateReverse(
+          request.output_shape(), operand,
+          AsInt64Slice(reverse_request.dimensions())));
       break;
     }
 
@@ -2205,7 +2196,7 @@ HloInstruction* ComputationLowerer::Visit(
       HloInstruction* operand = Visit(pad_request.operand(), visited);
       HloInstruction* padding_value =
           Visit(pad_request.padding_value(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreatePad(
+      hlo_instruction = add_instruction(HloInstruction::CreatePad(
           request.output_shape(), operand, padding_value,
           pad_request.padding_config()));
       break;
@@ -2213,7 +2204,7 @@ HloInstruction* ComputationLowerer::Visit(
 
     case OpRequest::kRecvRequest: {
       const RecvRequest& recv_request = request.request().recv_request();
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateRecv(
+      hlo_instruction = add_instruction(HloInstruction::CreateRecv(
           request.output_shape(), recv_request.channel_handle().handle()));
       break;
     }
@@ -2221,10 +2212,9 @@ HloInstruction* ComputationLowerer::Visit(
     case OpRequest::kParameterRequest: {
       const ParameterRequest& parameter_request =
           request.request().parameter_request();
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateParameter(
-              parameter_request.parameter(), request.output_shape(),
-              parameter_request.name()));
+      hlo_instruction = add_instruction(HloInstruction::CreateParameter(
+          parameter_request.parameter(), request.output_shape(),
+          parameter_request.name()));
       break;
     }
 
@@ -2232,7 +2222,7 @@ HloInstruction* ComputationLowerer::Visit(
       const ConvertRequest& convert_request =
           request.request().convert_request();
       HloInstruction* operand = Visit(convert_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(
+      hlo_instruction = add_instruction(
           HloInstruction::CreateConvert(request.output_shape(), operand));
       break;
     }
@@ -2249,7 +2239,7 @@ HloInstruction* ComputationLowerer::Visit(
       HloComputation* body =
           ResolveComputation(while_request.body(), body_version);
       HloInstruction* init = Visit(while_request.init(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateWhile(
+      hlo_instruction = add_instruction(HloInstruction::CreateWhile(
           request.output_shape(), condition, body, init));
       break;
     }
@@ -2261,9 +2251,8 @@ HloInstruction* ComputationLowerer::Visit(
       HloInstruction* rhs = Visit(ternary_op_request.rhs(), visited);
       HloInstruction* ehs = Visit(ternary_op_request.ehs(), visited);
       auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop());
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateTernary(
-              request.output_shape(), hlo_opcode, lhs, rhs, ehs));
+      hlo_instruction = add_instruction(HloInstruction::CreateTernary(
+          request.output_shape(), hlo_opcode, lhs, rhs, ehs));
       break;
     }
 
@@ -2278,9 +2267,8 @@ HloInstruction* ComputationLowerer::Visit(
       }
       auto hlo_opcode =
           VariadicOperationToHloOpcode(variadic_op_request.varop());
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateVariadic(
-              request.output_shape(), hlo_opcode, operands));
+      hlo_instruction = add_instruction(HloInstruction::CreateVariadic(
+          request.output_shape(), hlo_opcode, operands));
       break;
     }
 
@@ -2295,7 +2283,7 @@ HloInstruction* ComputationLowerer::Visit(
           request.embedded_computation_versions(0);
       HloComputation* call_computation =
           ResolveComputation(call_request.to_apply(), call_version);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateCall(
+      hlo_instruction = add_instruction(HloInstruction::CreateCall(
           request.output_shape(), operands, call_computation));
       break;
     }
@@ -2307,9 +2295,8 @@ HloInstruction* ComputationLowerer::Visit(
       for (const ComputationDataHandle& operand : cc_request.operands()) {
         operands.push_back(Visit(operand, visited));
       }
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateCustomCall(
-              cc_request.shape(), operands, cc_request.call_target_name()));
+      hlo_instruction = add_instruction(HloInstruction::CreateCustomCall(
+          cc_request.shape(), operands, cc_request.call_target_name()));
       break;
     }
 
@@ -2318,7 +2305,7 @@ HloInstruction* ComputationLowerer::Visit(
           request.request().unary_op_request();
       HloInstruction* operand = Visit(unary_op_request.operand(), visited);
       auto hlo_opcode = UnaryOperationToHloOpcode(unary_op_request.unop());
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateUnary(
+      hlo_instruction = add_instruction(HloInstruction::CreateUnary(
           request.output_shape(), hlo_opcode, operand));
       break;
     }
@@ -2346,23 +2333,22 @@ HloInstruction* ComputationLowerer::Visit(
         // identical to the HLO broadcast semantics so the broadcast_dimensions
         // field can just be passed to the instruction builder.
         HloInstruction* broadcasted_operand =
-            hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast(
+            add_instruction(HloInstruction::CreateBroadcast(
                 broadcast_shape, operand_to_broadcast,
                 AsInt64Slice(binary_op_request.broadcast_dimensions())));
 
         lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs;
         rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs;
       }
-      hlo_instruction =
-          hlo_builder_.AddInstruction(HloInstruction::CreateBinary(
-              request.output_shape(), hlo_opcode, lhs, rhs));
+      hlo_instruction = add_instruction(HloInstruction::CreateBinary(
+          request.output_shape(), hlo_opcode, lhs, rhs));
       break;
     }
 
     case OpRequest::kTraceRequest: {
       const TraceRequest& trace_request = request.request().trace_request();
       HloInstruction* operand = Visit(trace_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(
+      hlo_instruction = add_instruction(
           HloInstruction::CreateTrace(trace_request.tag(), operand));
       operand->set_tracing(hlo_instruction);
       break;
@@ -2371,7 +2357,7 @@ HloInstruction* ComputationLowerer::Visit(
     case OpRequest::kSendRequest: {
       const SendRequest& send_request = request.request().send_request();
       HloInstruction* operand = Visit(send_request.operand(), visited);
-      hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateSend(
+      hlo_instruction = add_instruction(HloInstruction::CreateSend(
           operand, send_request.channel_handle().handle()));
       break;
     }
@@ -2382,7 +2368,6 @@ HloInstruction* ComputationLowerer::Visit(
     default:
       LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
   }
-  hlo_instruction->set_metadata(request.request().metadata());
   (*visited)[handle.handle()] = hlo_instruction;
   return hlo_instruction;
 }
diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc
index e67254328ad6973ee63a83d45cd3c2618e39ff56..cf04cfde5003d70e26ce0a1543039c18c19282c9 100644
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ b/tensorflow/compiler/xla/service/user_computation_test.cc
@@ -17,12 +17,16 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace {
 
@@ -55,6 +59,9 @@ TEST_F(UserComputationTest, SimpleComputation) {
   param_request.set_name("param0");
   TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle param_handle,
                          computation.AddParameterInstruction(param_request));
+  OpMetadata metadata;
+  metadata.set_op_name("meta");
+  TF_ASSERT_OK(computation.SetOpMetadata(param_handle, metadata));
 
   OutfeedRequest outfeed_request;
   *outfeed_request.mutable_operand() = constant_handle;
@@ -89,8 +96,7 @@ TEST_F(UserComputationTest, SimpleComputation) {
     EXPECT_EQ(3, hlo_computation->instruction_count());
     // The root of the instruction should be the parameter instruction (not the
     // outfeed).
-    EXPECT_EQ(HloOpcode::kParameter,
-              hlo_computation->root_instruction()->opcode());
+    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
   }
 
   {
@@ -114,8 +120,7 @@ TEST_F(UserComputationTest, SimpleComputation) {
                            computation.BuildHloComputation(
                                version_at_param.version, hlo_resolver));
     EXPECT_EQ(2, hlo_computation->instruction_count());
-    EXPECT_EQ(HloOpcode::kParameter,
-              hlo_computation->root_instruction()->opcode());
+    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
   }
   {
     // Test the computation at the latest version, but lowered with
@@ -132,8 +137,9 @@ TEST_F(UserComputationTest, SimpleComputation) {
     EXPECT_EQ(1, hlo_computation->instruction_count());
     // The root of the instruction should be the parameter instruction (not the
     // outfeed).
-    EXPECT_EQ(HloOpcode::kParameter,
-              hlo_computation->root_instruction()->opcode());
+    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
+    EXPECT_EQ(hlo_computation->root_instruction()->metadata().op_name(),
+              "meta");
   }
 }
 
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 2159386152b34e4f9b59ca14faa756e37551d724..c8851d2ca512450b4022e0f70d55399323b2fa08 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -21,7 +21,10 @@ limitations under the License.
 
 namespace xla {
 
-// Defines the interface for an XLA service.
+// Defines the interface for an XLA service on the client side. This service
+// helps abstract around the actual implementation of a service - the service
+// can be local (running in the same process), or remote - in which case an RPC
+// stub is used as the implementation.
 class ServiceInterface {
  public:
   ServiceInterface() {}
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index 6963a68d10d527acebde65f30f9caf87608950cb..aa4341d18e1e6ef0dba5a4bcc057d9ef43d9bfb0 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -33,22 +33,65 @@ limitations under the License.
 
 namespace xla {
 
+namespace internal {
+
+// Internal representation of each node in a ShapeTree.
+template <typename T>
+struct ShapeTreeNode {
+  // Data corresponding to this node.
+  T data;
+
+  // Children of this node.
+  std::vector<std::unique_ptr<ShapeTreeNode>> children;
+
+  explicit ShapeTreeNode(const T& data) : data(data) {}
+
+  ShapeTreeNode(const ShapeTreeNode& other)
+      : data(other.data), children(other.children.size()) {
+    for (size_t i = 0; i < children.size(); ++i) {
+      children[i] = MakeUnique<ShapeTreeNode>(*other.children[i]);
+    }
+  }
+
+  ShapeTreeNode& operator=(const ShapeTreeNode& other) {
+    if (this != &other) {
+      data = other.data;
+      children.resize(other.children.size());
+      for (size_t i = 0; i < children.size(); ++i) {
+        children[i] = MakeUnique<ShapeTreeNode>(*other.children[i]);
+      }
+    }
+    return *this;
+  }
+};
+
+}  // namespace internal
+
 // A ShapeTree<T> is a recursive data structure which mirrors the structure of a
-// XLA shape and holds a value of type T for each array in the shape. For
-// array shapes, a ShapeTree trivially holds a single value of type T. For tuple
-// shapes which can be an arbitrary tree with arrays at the leaves, a ShapeTree
-// is an identically structured tree with data elements of type T at the leaves.
+// XLA shape and holds a value of type T for each subshape (i.e. tuple or array)
+// in the shape. For array shapes, a ShapeTree trivially holds a single value of
+// type T.
+//
+// For tuple shapes which can be an arbitrary tree with arrays at the leaves, a
+// ShapeTree is an identically structured tree with data elements of type T at
+// every node. I.e. the root is a tuple by definition, all interior nodes are
+// also tuples, and all leaves are arrays.
 //
 // Like the Shape data structure, this is a tree and tuple elements cannot be
-// duplicated. That is, every distinct element position in the Shape has a
-// unique T object.
+// duplicated. That is, every distinct ShapeIndex in the Shape has a unique T
+// object.
 template <typename T>
 class ShapeTree {
  public:
-  explicit ShapeTree(const Shape& shape);
+  // Default constructor creates a tree with a nil shape (i.e. an empty tuple).
+  ShapeTree() : ShapeTree(ShapeUtil::MakeNil()) {}
+  // Create ShapeTree with the given shape, and default T values for all nodes.
+  explicit ShapeTree(const Shape& shape) : ShapeTree(shape, T()) {}
+  // Create ShapeTree with the given shape, and init_value for all nodes.
   ShapeTree(const Shape& shape, const T& init_value);
-  ShapeTree(const ShapeTree<T>& other);
-  ShapeTree<T>& operator=(const ShapeTree<T>& other);
+
+  ShapeTree(const ShapeTree& other) = default;
+  ShapeTree& operator=(const ShapeTree& other) = default;
 
   // Returns the data element associated with the array in the shape at the
   // given index (see ShapeUtil::GetSubshape for how indexes are defined).
@@ -56,12 +99,12 @@ class ShapeTree {
   T* mutable_element(const ShapeIndex& index);
 
   // Return the shape represented with this ShapeTree.
-  const Shape& shape() const { return *shape_; }
+  const Shape& shape() const { return shape_; }
 
   // Returns true if the node at the given index is a leaf node (an array
   // shape).
   bool IsLeaf(const ShapeIndex& index) const {
-    return Lookup(index).elements_.empty();
+    return Lookup(index)->children.empty();
   }
 
   // Recursively traverses the shape and calls the given function at each
@@ -76,183 +119,125 @@ class ShapeTree {
   //
   // If any call to the given function returns a non-OK status, then traversal
   // is aborted and the status value is returned.
-  using VisitorFunction = std::function<tensorflow::Status(
+  using VisitorFunction = std::function<Status(
       const ShapeIndex& /*index*/, bool /*is_leaf*/, const T& /*data*/)>;
-  tensorflow::Status ForEachElement(VisitorFunction func) const;
+  Status ForEachElement(const VisitorFunction& func) const;
 
-  using MutableVisitorFunction = std::function<tensorflow::Status(
+  using MutableVisitorFunction = std::function<Status(
       const ShapeIndex& /*index*/, bool /*is_leaf*/, T* /*data*/)>;
-  tensorflow::Status ForEachMutableElement(MutableVisitorFunction func);
+  Status ForEachMutableElement(const MutableVisitorFunction& func);
 
  private:
-  // Private default constructor for non-root nodes of the tree.
-  ShapeTree() = default;
+  using Node = internal::ShapeTreeNode<T>;
+
+  // Initialize node->children based on 'shape'. All children are assigned the
+  // the given 'init_value'.
+  void InitChildren(const Shape& shape, const T& init_value, Node* node);
 
   // Helpers for traversing the shape via ForEachElement. The helpers
   // recursively traverse the subtree rooted at "index" (defined as in
   // ShapeUtil::GetSubshape).
-  static tensorflow::Status ForEachHelperMutable(ShapeIndex* index,
-                                                 ShapeTree<T>* shape_tree,
-                                                 MutableVisitorFunction func);
-  static tensorflow::Status ForEachHelper(ShapeIndex* index,
-                                          const ShapeTree<T>& shape_tree,
-                                          VisitorFunction func);
-
-  // Copy all the data elements (of type T) from "other" into "this". "this"
-  // must have the same tree structure as "other" prior to calling this method.
-  void CopyDataElements(const ShapeTree<T>& other);
-
-  // Recursive helper for constructing a subtree beneath "this" node.
-  void BuildTree(const Shape& shape);
+  static Status ForEachHelper(const VisitorFunction& func, const Node& node,
+                              ShapeIndex* index);
+  static Status ForEachMutableHelper(const MutableVisitorFunction& func,
+                                     Node* node, ShapeIndex* index);
 
   // Return the tree node at the given index.
-  ShapeTree<T>& Lookup(const ShapeIndex& index);
-  const ShapeTree<T>& Lookup(const ShapeIndex& index) const;
-
-  // The data corresponding to the array at this node.
-  T data_;
+  Node* Lookup(const ShapeIndex& index);
+  const Node* Lookup(const ShapeIndex& index) const;
 
-  // The XLA shape mirrored in this ShapeTree. Only the root of the
-  // ShapeTree has this member set.
-  std::unique_ptr<Shape> shape_;
+  // The root node, which contains all other nodes.
+  Node root_;
 
-  // The children of this node in the tree.
-  std::vector<std::unique_ptr<ShapeTree>> elements_;
+  // The XLA shape mirrored in this ShapeTree.
+  Shape shape_;
 };
 
 template <typename T>
-void ShapeTree<T>::BuildTree(const Shape& shape) {
+void ShapeTree<T>::InitChildren(const Shape& shape, const T& init_value,
+                                Node* node) {
   if (ShapeUtil::IsTuple(shape)) {
     for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-      elements_.emplace_back(new ShapeTree());
-      elements_.back()->BuildTree(shape.tuple_shapes(i));
+      node->children.emplace_back(new Node(init_value));
+      InitChildren(shape.tuple_shapes(i), init_value,
+                   node->children.back().get());
     }
   }
 }
 
-template <typename T>
-ShapeTree<T>::ShapeTree(const Shape& shape) : shape_(MakeUnique<Shape>(shape)) {
-  // The shape_ field is just used to hold the structure of the shape. It should
-  // not be relied upon to store layout information.
-  LayoutUtil::ClearLayout(shape_.get());
-  BuildTree(*shape_);
-}
-
 template <typename T>
 ShapeTree<T>::ShapeTree(const Shape& shape, const T& init_value)
-    : shape_(MakeUnique<Shape>(shape)) {
-  LayoutUtil::ClearLayout(shape_.get());
-  BuildTree(*shape_);
-  TF_CHECK_OK(ForEachMutableElement(
-      [&init_value](const ShapeIndex& /*index*/, bool /*is_leaf*/, bool* data) {
-        *data = init_value;
-        return tensorflow::Status::OK();
-      }));
-}
-
-template <typename T>
-ShapeTree<T>::ShapeTree(const ShapeTree& other)
-    : shape_(MakeUnique<Shape>(other.shape())) {
-  LayoutUtil::ClearLayout(shape_.get());
-  BuildTree(*shape_);
-  CopyDataElements(other);
-}
-
-template <typename T>
-ShapeTree<T>& ShapeTree<T>::operator=(const ShapeTree<T>& other) {
-  if (this == &other) {
-    return *this;
-  }
-  elements_.clear();
-  shape_ = MakeUnique<Shape>(other.shape());
-  LayoutUtil::ClearLayout(shape_.get());
-
-  BuildTree(*shape_);
-  CopyDataElements(other);
-  return *this;
-}
-
-template <typename T>
-void ShapeTree<T>::CopyDataElements(const ShapeTree<T>& other) {
-  CHECK(ShapeUtil::Compatible(shape(), other.shape()));
-  TF_CHECK_OK(ForEachMutableElement(
-      [&other](const ShapeIndex& index, bool /*is_leaf*/, T* data) {
-        *data = other.element(index);
-        return tensorflow::Status::OK();
-      }));
+    : root_(init_value), shape_(shape) {
+  // The shape_ field is just used to hold the structure of the shape.
+  // It should not be relied upon to store layout information.
+  LayoutUtil::ClearLayout(&shape_);
+  InitChildren(shape_, init_value, &root_);
 }
 
 template <typename T>
 const T& ShapeTree<T>::element(const ShapeIndex& index) const {
-  return Lookup(index).data_;
+  return Lookup(index)->data;
 }
 
 template <typename T>
 T* ShapeTree<T>::mutable_element(const ShapeIndex& index) {
-  return &Lookup(index).data_;
+  return &Lookup(index)->data;
 }
 
 template <typename T>
-ShapeTree<T>& ShapeTree<T>::Lookup(const ShapeIndex& index) {
-  ShapeTree<T>* node = this;
-  for (auto& i : index) {
+internal::ShapeTreeNode<T>* ShapeTree<T>::Lookup(const ShapeIndex& index) {
+  Node* node = &root_;
+  for (const int64 i : index) {
     CHECK_GE(i, 0);
-    CHECK_LT(i, node->elements_.size());
-    node = node->elements_[i].get();
+    CHECK_LT(i, node->children.size());
+    node = node->children[i].get();
   }
-  return *node;
+  return node;
 }
 
 template <typename T>
-const ShapeTree<T>& ShapeTree<T>::Lookup(const ShapeIndex& index) const {
-  return const_cast<ShapeTree<T>*>(this)->Lookup(index);
+const internal::ShapeTreeNode<T>* ShapeTree<T>::Lookup(
+    const ShapeIndex& index) const {
+  return const_cast<ShapeTree*>(this)->Lookup(index);
 }
 
 /* static */
 template <typename T>
-tensorflow::Status ShapeTree<T>::ForEachHelperMutable(
-    ShapeIndex* index, ShapeTree<T>* shape_tree,
-    ShapeTree<T>::MutableVisitorFunction func) {
-  TF_RETURN_IF_ERROR(
-      func(*index, shape_tree->elements_.empty(), &shape_tree->data_));
-  for (int i = 0; i < shape_tree->elements_.size(); ++i) {
+Status ShapeTree<T>::ForEachHelper(const VisitorFunction& func,
+                                   const Node& node, ShapeIndex* index) {
+  TF_RETURN_IF_ERROR(func(*index, node.children.empty(), node.data));
+  for (int64 i = 0; i < node.children.size(); ++i) {
     index->push_back(i);
-    TF_RETURN_IF_ERROR(
-        ForEachHelperMutable(index, shape_tree->elements_[i].get(), func));
+    TF_RETURN_IF_ERROR(ForEachHelper(func, *node.children[i], index));
     index->pop_back();
   }
-
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 /* static */
 template <typename T>
-tensorflow::Status ShapeTree<T>::ForEachHelper(
-    ShapeIndex* index, const ShapeTree<T>& shape_tree,
-    ShapeTree<T>::VisitorFunction func) {
-  TF_RETURN_IF_ERROR(
-      func(*index, shape_tree.elements_.empty(), shape_tree.data_));
-  for (int i = 0; i < shape_tree.elements_.size(); ++i) {
+Status ShapeTree<T>::ForEachMutableHelper(const MutableVisitorFunction& func,
+                                          Node* node, ShapeIndex* index) {
+  TF_RETURN_IF_ERROR(func(*index, node->children.empty(), &node->data));
+  for (int64 i = 0; i < node->children.size(); ++i) {
     index->push_back(i);
-    TF_RETURN_IF_ERROR(ForEachHelper(index, *shape_tree.elements_[i], func));
+    TF_RETURN_IF_ERROR(
+        ForEachMutableHelper(func, node->children[i].get(), index));
     index->pop_back();
   }
-
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 template <typename T>
-tensorflow::Status ShapeTree<T>::ForEachElement(
-    ShapeTree<T>::VisitorFunction func) const {
+Status ShapeTree<T>::ForEachElement(const VisitorFunction& func) const {
   ShapeIndex index;
-  return ForEachHelper(&index, *this, func);
+  return ForEachHelper(func, root_, &index);
 }
 
 template <typename T>
-tensorflow::Status ShapeTree<T>::ForEachMutableElement(
-    ShapeTree<T>::MutableVisitorFunction func) {
+Status ShapeTree<T>::ForEachMutableElement(const MutableVisitorFunction& func) {
   ShapeIndex index;
-  return ForEachHelperMutable(&index, this, func);
+  return ForEachMutableHelper(func, &root_, &index);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index d37f536b755d1feca57360edf950329197ba2dd4..efb6f422e008221c2f7d98e066c8aa6ae7bbf426 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_tree.h"
 
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
@@ -35,6 +35,9 @@ class ShapeTreeTest : public ::testing::Test {
               array_shape_})});
   }
 
+  void TestShapeConstructor(const Shape& shape, int expected_num_nodes);
+  void TestInitValueConstructor(const Shape& shape, int expected_num_nodes);
+
   // An array shape (non-tuple).
   Shape array_shape_;
 
@@ -45,6 +48,81 @@ class ShapeTreeTest : public ::testing::Test {
   Shape nested_tuple_shape_;
 };
 
+TEST_F(ShapeTreeTest, DefaultConstructor) {
+  ShapeTree<int> int_tree;
+  EXPECT_TRUE(ShapeUtil::IsNil(int_tree.shape()));
+
+  ShapeTree<bool> bool_tree;
+  EXPECT_TRUE(ShapeUtil::IsNil(bool_tree.shape()));
+}
+
+void ShapeTreeTest::TestShapeConstructor(const Shape& shape,
+                                         int expected_num_nodes) {
+  ShapeTree<int> int_tree(shape);
+  int num_nodes = 0;
+  TF_CHECK_OK(int_tree.ForEachElement(
+      [&num_nodes](const ShapeIndex& /*index*/, bool /*is_leaf*/, int data) {
+        EXPECT_EQ(0, data);
+        ++num_nodes;
+        return Status::OK();
+      }));
+  EXPECT_EQ(expected_num_nodes, num_nodes);
+
+  ShapeTree<bool> bool_tree(shape);
+  num_nodes = 0;
+  TF_CHECK_OK(bool_tree.ForEachElement(
+      [&num_nodes](const ShapeIndex& /*index*/, bool /*is_leaf*/, bool data) {
+        EXPECT_EQ(false, data);
+        ++num_nodes;
+        return Status::OK();
+      }));
+  EXPECT_EQ(expected_num_nodes, num_nodes);
+}
+
+TEST_F(ShapeTreeTest, ShapeConstructor) {
+  TestShapeConstructor(array_shape_, 1);
+  TestShapeConstructor(tuple_shape_, 4);
+  TestShapeConstructor(nested_tuple_shape_, 10);
+}
+
+void ShapeTreeTest::TestInitValueConstructor(const Shape& shape,
+                                             int expected_num_nodes) {
+  ShapeTree<int> tree(shape, 42);
+  int num_nodes = 0;
+  TF_CHECK_OK(tree.ForEachElement(
+      [&num_nodes](const ShapeIndex& /*index*/, bool /*is_leaf*/, int data) {
+        EXPECT_EQ(42, data);
+        ++num_nodes;
+        return Status::OK();
+      }));
+  EXPECT_EQ(expected_num_nodes, num_nodes);
+
+  num_nodes = 0;
+  TF_CHECK_OK(tree.ForEachMutableElement(
+      [&num_nodes](const ShapeIndex& /*index*/, bool /*is_leaf*/, int* data) {
+        EXPECT_EQ(42, *data);
+        *data = num_nodes;
+        ++num_nodes;
+        return Status::OK();
+      }));
+  EXPECT_EQ(expected_num_nodes, num_nodes);
+
+  num_nodes = 0;
+  TF_CHECK_OK(tree.ForEachElement(
+      [&num_nodes](const ShapeIndex& /*index*/, bool /*is_leaf*/, int data) {
+        EXPECT_EQ(num_nodes, data);
+        ++num_nodes;
+        return Status::OK();
+      }));
+  EXPECT_EQ(expected_num_nodes, num_nodes);
+}
+
+TEST_F(ShapeTreeTest, InitValueConstructor) {
+  TestInitValueConstructor(array_shape_, 1);
+  TestInitValueConstructor(tuple_shape_, 4);
+  TestInitValueConstructor(nested_tuple_shape_, 10);
+}
+
 TEST_F(ShapeTreeTest, ArrayShape) {
   ShapeTree<int> shape_tree{array_shape_};
   *shape_tree.mutable_element({}) = 42;
@@ -57,6 +135,15 @@ TEST_F(ShapeTreeTest, ArrayShape) {
   // Test the copy constructor.
   ShapeTree<int> copy{shape_tree};
   EXPECT_EQ(123, copy.element({}));
+
+  // Mutate the copy, and ensure the original doesn't change.
+  *copy.mutable_element({}) = 99;
+  EXPECT_EQ(99, copy.element({}));
+  EXPECT_EQ(123, shape_tree.element({}));
+
+  // Test the assignment operator.
+  copy = shape_tree;
+  EXPECT_EQ(123, copy.element({}));
 }
 
 TEST_F(ShapeTreeTest, TupleShape) {
@@ -77,7 +164,7 @@ TEST_F(ShapeTreeTest, TupleShape) {
   TF_CHECK_OK(shape_tree.ForEachElement(
       [&sum](const ShapeIndex& /*index*/, bool /*is_leaf*/, int data) {
         sum += data;
-        return tensorflow::Status::OK();
+        return Status::OK();
       }));
   EXPECT_EQ(66, sum);
 
@@ -92,12 +179,23 @@ TEST_F(ShapeTreeTest, TupleShape) {
   TF_CHECK_OK(shape_tree.ForEachMutableElement(
       [&sum](const ShapeIndex& /*index*/, bool /*is_leaf*/, int* data) {
         *data = 0;
-        return tensorflow::Status::OK();
+        return Status::OK();
       }));
   EXPECT_EQ(0, shape_tree.element({}));
   EXPECT_EQ(0, shape_tree.element({0}));
   EXPECT_EQ(0, shape_tree.element({1}));
   EXPECT_EQ(0, shape_tree.element({2}));
+  EXPECT_EQ(1, copy.element({}));
+  EXPECT_EQ(42, copy.element({0}));
+  EXPECT_EQ(123, copy.element({1}));
+  EXPECT_EQ(-100, copy.element({2}));
+
+  // Test the assignment operator.
+  copy = shape_tree;
+  EXPECT_EQ(0, copy.element({}));
+  EXPECT_EQ(0, copy.element({0}));
+  EXPECT_EQ(0, copy.element({1}));
+  EXPECT_EQ(0, copy.element({2}));
 }
 
 TEST_F(ShapeTreeTest, NestedTupleShape) {
@@ -116,6 +214,23 @@ TEST_F(ShapeTreeTest, NestedTupleShape) {
   EXPECT_EQ(42, copy.element({0}));
   EXPECT_EQ(123, copy.element({1, 1}));
   EXPECT_EQ(-100, copy.element({2, 0, 1}));
+
+  // Mutate the copy, and ensure the original doesn't change.
+  *copy.mutable_element({0}) = 1;
+  *copy.mutable_element({1, 1}) = 2;
+  *copy.mutable_element({2, 0, 1}) = 3;
+  EXPECT_EQ(1, copy.element({0}));
+  EXPECT_EQ(2, copy.element({1, 1}));
+  EXPECT_EQ(3, copy.element({2, 0, 1}));
+  EXPECT_EQ(42, shape_tree.element({0}));
+  EXPECT_EQ(123, shape_tree.element({1, 1}));
+  EXPECT_EQ(-100, shape_tree.element({2, 0, 1}));
+
+  // Test the assignment operator.
+  copy = shape_tree;
+  EXPECT_EQ(42, copy.element({0}));
+  EXPECT_EQ(123, copy.element({1, 1}));
+  EXPECT_EQ(-100, copy.element({2, 0, 1}));
 }
 
 TEST_F(ShapeTreeTest, InvalidIndexingTuple) {
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 57d91e4bfc1145faa25c9b5c57422c7653d4a163..ccc1dc63e78f8cb5aeaa5664a0d6917898db26b3 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <functional>
 #include <numeric>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/compiler/xla/index_util.h"
@@ -28,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -200,7 +202,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
 }
 
 /* static */ void ShapeUtil::AppendMajorDimension(int bound, Shape* shape) {
-  shape->mutable_layout()->add_minor_to_major(ShapeUtil::Rank(*shape));
+  shape->mutable_layout()->add_minor_to_major(Rank(*shape));
   shape->add_dimensions(bound);
   TF_DCHECK_OK(ValidateShape(*shape));
 }
@@ -293,7 +295,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
 
   std::vector<Shape> new_elements(tuple.tuple_shapes().begin() + start,
                                   tuple.tuple_shapes().begin() + limit);
-  return ShapeUtil::MakeTupleShape(new_elements);
+  return MakeTupleShape(new_elements);
 }
 
 /* static */ bool ShapeUtil::IsOpaque(const Shape& shape) {
@@ -307,7 +309,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
   if (shape.element_type() != element_type) {
     return false;
   }
-  if (shape.dimensions_size() != ShapeUtil::Rank(shape)) {
+  if (shape.dimensions_size() != Rank(shape)) {
     return false;
   }
   int64 i = 0;
@@ -321,7 +323,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
 }
 
 /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) {
-  CHECK_EQ(shape.dimensions_size(), ShapeUtil::Rank(shape));
+  CHECK_EQ(shape.dimensions_size(), Rank(shape));
   return std::accumulate<decltype(shape.dimensions().begin()), int64>(
       shape.dimensions().begin(), shape.dimensions().end(), 1LL,
       std::multiplies<int64>());
@@ -332,7 +334,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
 }
 
 /* static */ bool ShapeUtil::IsScalarF32(const Shape& shape) {
-  return shape.element_type() == F32 && ShapeUtil::Rank(shape) == 0;
+  return shape.element_type() == F32 && Rank(shape) == 0;
 }
 
 /* static */ string ShapeUtil::HumanString(const Shape& shape) {
@@ -430,13 +432,12 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
     }
     Shape result;
     if (layout_string.empty()) {
-      result = ShapeUtil::MakeShape(primitive_type, dimensions);
+      result = MakeShape(primitive_type, dimensions);
     } else {
       TF_ASSIGN_OR_RETURN(std::vector<int64> min2maj,
                           comma_list_to_int64s(layout_string));
       TF_RET_CHECK(dimensions.size() == min2maj.size());
-      result =
-          ShapeUtil::MakeShapeWithLayout(primitive_type, dimensions, min2maj);
+      result = MakeShapeWithLayout(primitive_type, dimensions, min2maj);
     }
     TF_DCHECK_OK(ValidateShape(result));
     return result;
@@ -466,7 +467,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
 /* static */ int64 ShapeUtil::GetDimensionNumber(const Shape& shape,
                                                  int64 dimension_number) {
   if (dimension_number < 0) {
-    dimension_number += ShapeUtil::Rank(shape);
+    dimension_number += Rank(shape);
   }
   CHECK_GE(dimension_number, 0);
   return dimension_number;
@@ -518,7 +519,7 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
   }
   int64 allocated_element_count;
   if (shape.layout().padded_dimensions_size() > 0) {
-    CHECK_EQ(ShapeUtil::Rank(shape), shape.layout().padded_dimensions_size());
+    CHECK_EQ(Rank(shape), shape.layout().padded_dimensions_size());
     allocated_element_count = 1;
     for (int64 dimension_size : shape.layout().padded_dimensions()) {
       allocated_element_count *= dimension_size;
@@ -534,9 +535,9 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
     const Shape& shape) {
   if (shape.element_type() == TUPLE) {
     // Tuple shape.
-    if (ShapeUtil::Rank(shape) != 0) {
+    if (Rank(shape) != 0) {
       return InvalidArgument("tuples must be rank-0; got rank %lld",
-                             ShapeUtil::Rank(shape));
+                             Rank(shape));
     }
     if (shape.dimensions_size() != 0) {
       return InvalidArgument("tuples must not have dimensions specified");
@@ -556,13 +557,13 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
     return InvalidArgument("shape has invalid element type: %s",
                            shape.ShortDebugString().c_str());
   }
-  if (ShapeUtil::Rank(shape) != shape.dimensions_size()) {
+  if (Rank(shape) != shape.dimensions_size()) {
     return InvalidArgument(
         "shape's rank is mismatched with dimension count; rank=%lld "
         "dimensions_size=%d",
-        ShapeUtil::Rank(shape), shape.dimensions_size());
+        Rank(shape), shape.dimensions_size());
   }
-  for (int64 i = 0; i < ShapeUtil::Rank(shape); ++i) {
+  for (int64 i = 0; i < Rank(shape); ++i) {
     int64 dimension = shape.dimensions(i);
     if (dimension < 0) {
       return InvalidArgument(
@@ -675,7 +676,7 @@ namespace {
 // Helper for ForEachSubshape which visits the subshapes of the given shape in
 // DFS pre-order starting with the index.
 Status ForEachSubshapeHelper(const Shape& shape,
-                             const ShapeUtil::VisitorFunction func,
+                             const ShapeUtil::VisitorFunction& func,
                              ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
   if (ShapeUtil::IsTuple(shape)) {
@@ -692,7 +693,7 @@ Status ForEachSubshapeHelper(const Shape& shape,
 // Helper for ForEachMutableSubshape which visits the subshapes of the given
 // shape in DFS pre-order starting with the index.
 Status ForEachMutableSubshapeHelper(
-    Shape* shape, const ShapeUtil::MutatingVisitorFunction func,
+    Shape* shape, const ShapeUtil::MutatingVisitorFunction& func,
     ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
   if (ShapeUtil::IsTuple(*shape)) {
@@ -709,13 +710,13 @@ Status ForEachMutableSubshapeHelper(
 }  // namespace
 
 /* static */ Status ShapeUtil::ForEachSubshape(const Shape& shape,
-                                               VisitorFunction func) {
+                                               const VisitorFunction& func) {
   ShapeIndex index;
   return ForEachSubshapeHelper(shape, func, &index);
 }
 
 /* static */ Status ShapeUtil::ForEachMutableSubshape(
-    Shape* shape, MutatingVisitorFunction func) {
+    Shape* shape, const MutatingVisitorFunction& func) {
   ShapeIndex index;
   return ForEachMutableSubshapeHelper(shape, func, &index);
 }
@@ -728,9 +729,17 @@ Status ForEachMutableSubshapeHelper(
     new_shape.add_dimensions(dim);
   }
   if (shape.has_layout()) {
-    new_shape.mutable_layout()->clear_minor_to_major();
+    Layout* new_layout = new_shape.mutable_layout();
+    new_layout->clear_minor_to_major();
     for (auto index : Permute(permutation, shape.layout().minor_to_major())) {
-      new_shape.mutable_layout()->add_minor_to_major(index);
+      new_layout->add_minor_to_major(index);
+    }
+    if (shape.layout().padded_dimensions_size() > 0) {
+      new_layout->clear_padded_dimensions();
+      for (auto dim :
+           Permute(permutation, shape.layout().padded_dimensions())) {
+        new_layout->add_padded_dimensions(dim);
+      }
     }
   }
   return new_shape;
@@ -783,8 +792,7 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
     auto unmodified_dim_pair =
         i < unmodified_dims.size()
             ? unmodified_dims[i]
-            : std::make_pair(ShapeUtil::Rank(shape_pre),
-                             ShapeUtil::Rank(shape_post));
+            : std::make_pair(Rank(shape_pre), Rank(shape_post));
     if (!check_modified_dims(prior_unmodified_dim_pair, unmodified_dim_pair)) {
       return nil;
     }
@@ -859,9 +867,8 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     return false;
   }
 
-  CHECK_EQ(ShapeUtil::ElementsIn(input_shape),
-           ShapeUtil::ElementsIn(output_shape));
-  if (ShapeUtil::ElementsIn(input_shape) == 0) {
+  CHECK_EQ(ElementsIn(input_shape), ElementsIn(output_shape));
+  if (ElementsIn(input_shape) == 0) {
     return true;
   }
 
@@ -975,21 +982,17 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     // as input_shape/output_shape and the dimension-0-major layout. These two
     // shapes are used for conversion between logical linear indices and
     // multi-dimensional indices.
-    Shape input_shape_dim0_major =
-        ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-            input_shape.element_type(), AsInt64Slice(input_shape.dimensions()));
-    Shape output_shape_dim0_major =
-        ShapeUtil::MakeShapeWithMonotonicDim0MajorLayout(
-            output_shape.element_type(),
-            AsInt64Slice(output_shape.dimensions()));
-
-    for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape);
-         ++input_dim) {
+    Shape input_shape_dim0_major = MakeShapeWithMonotonicDim0MajorLayout(
+        input_shape.element_type(), AsInt64Slice(input_shape.dimensions()));
+    Shape output_shape_dim0_major = MakeShapeWithMonotonicDim0MajorLayout(
+        output_shape.element_type(), AsInt64Slice(output_shape.dimensions()));
+
+    for (int64 input_dim = 0; input_dim < Rank(input_shape); ++input_dim) {
       if (input_shape.dimensions(input_dim) <= 1) {
         continue;
       }
 
-      std::vector<int64> input_unit_index(ShapeUtil::Rank(input_shape), 0);
+      std::vector<int64> input_unit_index(Rank(input_shape), 0);
       input_unit_index[input_dim] = 1;
       int64 logical_linear_index =
           IndexUtil::MultidimensionalIndexToLinearIndex(input_shape_dim0_major,
@@ -1013,6 +1016,140 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
          check_input_unit_indices(output_shape, input_shape);
 }
 
+/* static */ tensorflow::gtl::optional<Shape> ShapeUtil::AlignLayouts(
+    const Shape& input_shape, const Shape& output_shape) {
+  int64 input_rank = Rank(input_shape);
+  int64 output_rank = Rank(output_shape);
+
+  // First, calculate an alignment of the dimensions. A consecutive sequence of
+  // input dimensions and output dimensions belong to the same alignment part if
+  // the products of their dimension bounds are the same. In the easiest case,
+  // an alignment part consists of one input dimension and one output dimension
+  // which both have the same dimension bound. An alignment part specifies which
+  // dimensions need to be kept together in a physical layout if we want a
+  // reshape to be a bitcast. The order of the alignment parts is defined by the
+  // physical layout of the input shape, so when we construct the layout for the
+  // output shape we just process the alignment parts in this order, and then
+  // layout the dimensions belonging to each part in descending (major to minor)
+  // order.
+
+  // Stores the input and output dimension numbers where each alignment part
+  // starts.
+  std::vector<std::pair<int64, int64>> alignment;
+  alignment.push_back({0, 0});
+
+  // Stores a mapping from the input dimension to the alignment part it belongs
+  // to.
+  std::vector<int64> dimension_to_alignment_index(input_rank);
+  int64 input_dimension_product = 1, output_dimension_product = 1;
+  for (int64 i = 0, j = 0; i < input_rank || j < output_rank;) {
+    // Check if we have reached the end of an alignment part.
+    if (input_dimension_product == output_dimension_product &&
+        input_dimension_product > 1) {
+      alignment.push_back({i, j});
+      input_dimension_product = output_dimension_product = 1;
+    }
+    if (input_dimension_product < output_dimension_product ||
+        j == output_rank) {
+      if (i == input_rank) {
+        return tensorflow::gtl::nullopt;
+      }
+      dimension_to_alignment_index[i] = alignment.size() - 1;
+      input_dimension_product *= input_shape.dimensions(i);
+      ++i;
+    } else {
+      output_dimension_product *= output_shape.dimensions(j);
+      ++j;
+    }
+  }
+  if (input_dimension_product != output_dimension_product) {
+    return tensorflow::gtl::nullopt;
+  }
+  // We also need to store an end element so that we know where the last
+  // alignment part ends.
+  alignment.push_back({input_rank, output_rank});
+
+  // Now check if the physical layout can potentially be aligned to the output
+  // shape by changing the physical layout of the output shape. We need to check
+  // that all dimension numbers that belong to the same alignment part appear
+  // consecutively, and are in descending order. However we can ignore any
+  // trivial dimension bounds of 1, because they can be placed anywhere.
+  auto input_dimension_numbers = input_shape.layout().minor_to_major();
+  std::vector<int64> output_layout;
+  output_layout.reserve(output_rank);
+  for (int64 i = 0; i < input_rank;) {
+    int64 current_dimension_number = input_dimension_numbers[i];
+
+    // Skip trivial dimensions with a bound of 1.
+    if (input_shape.dimensions(current_dimension_number) == 1) {
+      ++i;
+      continue;
+    }
+
+    // Calculate the number of non-trivial dimension bounds in the input shape
+    // belonging to the current alignment part.
+    const int64 current_alignment_index =
+        dimension_to_alignment_index[current_dimension_number];
+    // Because of the special end element that we added, we can be sure that
+    // 'current_alignment_index' is < alignment.size() - 1.
+    CHECK_LT(current_alignment_index, alignment.size() - 1);
+    int64 num_non_trivial_dimensions_in_alignment_part = 0;
+    for (int64 j = alignment[current_alignment_index].first;
+         j < alignment[current_alignment_index + 1].first; ++j) {
+      if (input_shape.dimensions(j) != 1) {
+        ++num_non_trivial_dimensions_in_alignment_part;
+      }
+    }
+
+    // Check that the following 'num_non_trivial_dimensions_in_alignment_part'
+    // dimension numbers (ignoring dimension numbers with dimension bound 1) are
+    // in descending order and belong to the current alignment part.
+    for (int64 j = 0; j < num_non_trivial_dimensions_in_alignment_part;
+         ++i, ++j) {
+      if (i == input_rank) {
+        return tensorflow::gtl::nullopt;
+      }
+      // Skip trivial dimensions with a bound of 1.
+      if (input_shape.dimensions(input_dimension_numbers[i]) == 1) {
+        --j;
+        continue;
+      }
+      // If the current dimension number belongs to a different alignment part,
+      // or the dimension numbers are not in descending order, we can return
+      // early.
+      if (dimension_to_alignment_index[input_dimension_numbers[i]] !=
+              current_alignment_index ||
+          input_dimension_numbers[i] > current_dimension_number) {
+        return tensorflow::gtl::nullopt;
+      }
+      current_dimension_number = input_dimension_numbers[i];
+    }
+
+    // The output dimension numbers that belong to the current alignment part
+    // need to appear in the same descending order as in the input. Again, we
+    // can skip dimensions with a bound of 1.
+    for (int64 j = alignment[current_alignment_index + 1].second - 1;
+         j >= alignment[current_alignment_index].second; --j) {
+      if (output_shape.dimensions(j) != 1) {
+        output_layout.push_back(j);
+      }
+    }
+  }
+  // Now add all the dimensions with dimension bound 1 at the end of
+  // 'output_layout'.
+  for (int64 i = 0; i < output_rank; ++i) {
+    if (output_shape.dimensions(i) == 1) {
+      output_layout.push_back(i);
+    }
+  }
+  CHECK_EQ(output_layout.size(), output_rank);
+  Shape output_shape_with_layout = MakeShapeWithLayout(
+      output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
+      output_layout);
+  CHECK(ReshapeIsBitcast(input_shape, output_shape_with_layout));
+  return output_shape_with_layout;
+}
+
 /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete,
                                               Shape shape) {
   shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete);
@@ -1047,4 +1184,31 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   return shape;
 }
 
+/* static */ void ShapeUtil::ForEachIndex(
+    const Shape& shape, tensorflow::gtl::ArraySlice<int64> base,
+    tensorflow::gtl::ArraySlice<int64> count,
+    tensorflow::gtl::ArraySlice<int64> incr,
+    const IndexVisitorFunction& visitor_function) {
+  DCHECK_EQ(Rank(shape), base.size());
+  DCHECK_EQ(incr.size(), base.size());
+  DCHECK_EQ(count.size(), base.size());
+  const Layout& layout = shape.layout();
+  int64 rank = layout.minor_to_major_size();
+  // Allows handling R0 arrays, such that the visitor function will be called
+  // once with the proper empty indexes.
+  int64 n = -1;
+  std::vector<int64> indexes(base.begin(), base.end());
+  while (n < rank && visitor_function(indexes)) {
+    // Increments dimensions in minor to major order.
+    for (n = 0; n < rank; ++n) {
+      int64 dim = layout.minor_to_major(n);
+      indexes[dim] += incr[dim];
+      if (indexes[dim] < base[dim] + count[dim]) {
+        break;
+      }
+      indexes[dim] = base[dim];
+    }
+  }
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 68e138e6aca9d2cf157466eca1ea6960e3c448e8..aaf8e84cfecb89080d690c66acd4f8d50ee17d56 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -299,13 +300,14 @@ class ShapeUtil {
   // pre-order starting with the entire shape (index {}).
   using VisitorFunction = std::function<Status(const Shape& /*subshape*/,
                                                const ShapeIndex& /*index*/)>;
-  static Status ForEachSubshape(const Shape& shape, VisitorFunction func);
+  static Status ForEachSubshape(const Shape& shape,
+                                const VisitorFunction& func);
 
   // Mutating variant of ForEachSubshape.
   using MutatingVisitorFunction =
       std::function<Status(Shape* /*subshape*/, const ShapeIndex& /*index*/)>;
   static Status ForEachMutableSubshape(Shape* shape,
-                                       MutatingVisitorFunction func);
+                                       const MutatingVisitorFunction& func);
 
   // Removes all degenerate dimensions (size one) from the given shape. The
   // stripped minor_to_major preserves the relative ordering of non-degenerate
@@ -377,6 +379,15 @@ class ShapeUtil {
   static bool ReshapeIsBitcast(const Shape& input_shape,
                                const Shape& output_shape);
 
+  // Find a physical layout for 'output_shape' such that
+  // ShapeUtil::ReshapeIsBitcast(input_shape, output_shape_with_layout) returns
+  // true (where 'output_shape_with_layout' is 'output_shape' with the found
+  // layout). The layout of 'input_shape' is kept fixed. Returns
+  // 'output_shape_with_layout' if such a layout can be found, and an error
+  // otherwise.
+  static tensorflow::gtl::optional<Shape> AlignLayouts(
+      const Shape& input_shape, const Shape& output_shape);
+
   // Returns a shape with the given dimension deleted.
   // For example:
   // • `DeleteDimension(1, T[m, n, k]) = T[m, k]`
@@ -390,6 +401,19 @@ class ShapeUtil {
   static Shape FilterDimensions(const std::function<bool(int64)>& p,
                                 Shape shape);
 
+  // Iterates through all the shape indexes, in minor to major order, starting
+  // from the base indexes, incrementing by the incr steps, up to count
+  // (index[i] < base[i] + count[i]), and calls the visitor_function with the
+  // current index.
+  // The visitor_function visitor function should return true if it wants to
+  // continue, or false otherwise.
+  using IndexVisitorFunction = std::function<bool(const std::vector<int64>&)>;
+  static void ForEachIndex(const Shape& shape,
+                           tensorflow::gtl::ArraySlice<int64> base,
+                           tensorflow::gtl::ArraySlice<int64> count,
+                           tensorflow::gtl::ArraySlice<int64> incr,
+                           const IndexVisitorFunction& visitor_function);
+
  private:
   // Validates all of the non-layout properties of the shape -- this is a helper
   // used by both the layout-optional and layout-required public method.
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 9e6b243611b57d38339a8f6460c655255f60899d..73538b8b88ecf14c00854d3c31715af8189bc21d 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -16,14 +16,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 
 #include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/platform/test.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
+
 TEST(ShapeUtilTest, GetDimensionHelperCanNegativeIndex) {
   Shape matrix = ShapeUtil::MakeShape(F32, {2, 3});
   EXPECT_EQ(3, ShapeUtil::GetDimension(matrix, -1));
@@ -446,21 +449,21 @@ TEST(ShapeUtilTest, InsertedOrDeleted1SizedDimensions) {
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1x1_to_1x1x1) {
   // All output dimensions should be unmodified. One of the input dimensions is
   // modified because the input rank is larger by one.
-  EXPECT_EQ(3,
-            ShapeUtil::DimensionsUnmodifiedByReshape(
-                ShapeUtil::MakeShape(S32, {1, 1, 1, 1}),
-                ShapeUtil::MakeShape(S32, {1, 1, 1}))
-                .size());
+  EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape(
+                  ShapeUtil::MakeShape(S32, {1, 1, 1, 1}),
+                  ShapeUtil::MakeShape(S32, {1, 1, 1})),
+              ElementsAre(std::make_pair(0, 0), std::make_pair(1, 1),
+                          std::make_pair(2, 2)));
 }
 
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1_to_1x1x1x1) {
   // All input dimensions should be unmodified. One of the output dimensions is
   // modified because the output rank is larger by one.
-  EXPECT_EQ(3,
-            ShapeUtil::DimensionsUnmodifiedByReshape(
-                ShapeUtil::MakeShape(S32, {1, 1, 1}),
-                ShapeUtil::MakeShape(S32, {1, 1, 1, 1}))
-                .size());
+  EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape(
+                  ShapeUtil::MakeShape(S32, {1, 1, 1}),
+                  ShapeUtil::MakeShape(S32, {1, 1, 1, 1})),
+              ElementsAre(std::make_pair(0, 0), std::make_pair(1, 1),
+                          std::make_pair(2, 2)));
 }
 
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_4x1x3x5x6x7_to_2x6x1x5x1x42) {
@@ -468,11 +471,10 @@ TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_4x1x3x5x6x7_to_2x6x1x5x1x42) {
   // 4, 1, 3, 5, 6, 7
   //          |
   // 2, 6, 1, 5, 1, 42
-  EXPECT_TRUE(
-      ContainersEqual(ShapeUtil::DimensionsUnmodifiedByReshape(
-                          ShapeUtil::MakeShape(S32, {4, 1, 3, 5, 6, 7}),
-                          ShapeUtil::MakeShape(S32, {2, 6, 1, 5, 1, 42})),
-                      std::vector<std::pair<int64, int64>>({{3, 3}})));
+  EXPECT_THAT(ShapeUtil::DimensionsUnmodifiedByReshape(
+                  ShapeUtil::MakeShape(S32, {4, 1, 3, 5, 6, 7}),
+                  ShapeUtil::MakeShape(S32, {2, 6, 1, 5, 1, 42})),
+              ElementsAre(std::make_pair(3, 3)));
 }
 
 TEST(ShapeUtilTest, ReshapeIsBitcast_3x4_6x2) {
@@ -521,5 +523,58 @@ TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) {
       ShapeUtil::MakeShapeWithLayout(F32, {6, 2}, {0, 1})));
 }
 
+TEST(AlignmentTest, AlignLayoutsWithoutTrivialDimensions) {
+  Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11},
+                                               {3, 2, 1, 0, 4});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 7, 5, 11}));
+  EXPECT_TRUE(aligned_shape);
+  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
+              ElementsAre(4, 3, 2, 1, 0, 5));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+
+  aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {3, 2, 4, 35, 11}));
+  EXPECT_TRUE(aligned_shape);
+  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
+              ElementsAre(3, 2, 1, 0, 4));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+}
+
+TEST(AlignmentTest, AlignLayoutsWithTrivialDimensions) {
+  Shape input =
+      ShapeUtil::MakeShapeWithLayout(xla::F32, {1, 3, 8, 1, 5, 7, 1, 11, 1, 1},
+                                     {5, 0, 4, 2, 1, 3, 6, 7, 9, 8});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {1, 4, 1, 3, 2, 7, 5, 11, 1}));
+  EXPECT_TRUE(aligned_shape);
+  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
+              ElementsAre(6, 5, 4, 3, 1, 7, 0, 2, 8));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+}
+
+// A test case where the consecutive elements of the input shape belonging to
+// the same layout part are not in descending order.
+TEST(AlignmentTest, AlignLayoutsWithoutTrivialDimensionsWrongInputLayout) {
+  // Same physical layout as in AlignLayoutsWithoutTrivialDimensions, except
+  // that the first two dimension numbers are exchanged.
+  Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11},
+                                               {2, 3, 1, 0, 4});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 7, 5, 11}));
+  EXPECT_FALSE(aligned_shape);
+}
+
+// A test case where the physical layout of the input shape does not place all
+// dimensions that belong to the same alignment part consecutively.
+TEST(AlignmentTest,
+     AlignLayoutsWithoutTrivialDimensionsNonConsecutiveAlignmentPart) {
+  Shape input = ShapeUtil::MakeShapeWithLayout(xla::F32, {3, 8, 5, 7, 11},
+                                               {3, 2, 1, 0, 4});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {4, 3, 2, 5, 77}));
+  EXPECT_FALSE(aligned_shape);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/status_macros_test.cc b/tensorflow/compiler/xla/status_macros_test.cc
index 4e7b9161db5c7e01a4b80da49bdded025eaf298a..dead17cdfa1e9f19e0ecfbc071e74e159ae82b5f 100644
--- a/tensorflow/compiler/xla/status_macros_test.cc
+++ b/tensorflow/compiler/xla/status_macros_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 
@@ -40,15 +40,15 @@ Status RetCheckSuccess() {
 TEST(StatusMacros, RetCheckFailing) {
   Status status = RetCheckFail();
   EXPECT_EQ(status.code(), tensorflow::error::INTERNAL);
-  EXPECT_MATCH(status.error_message(),
-               xla::testing::ContainsRegex("RET_CHECK failure.*2 > 3"));
+  EXPECT_THAT(status.error_message(),
+              ::testing::ContainsRegex("RET_CHECK failure.*2 > 3"));
 }
 
 TEST(StatusMacros, RetCheckFailingWithExtraMessage) {
   Status status = RetCheckFailWithExtraMessage();
   EXPECT_EQ(status.code(), tensorflow::error::INTERNAL);
-  EXPECT_MATCH(status.error_message(),
-               xla::testing::ContainsRegex("RET_CHECK.*2 > 3 extra message"));
+  EXPECT_THAT(status.error_message(),
+              ::testing::ContainsRegex("RET_CHECK.*2 > 3 extra message"));
 }
 
 TEST(StatusMacros, RetCheckSucceeding) {
@@ -73,7 +73,7 @@ Status ReturnStatusError() { return (tensorflow::errors::Internal("foobar")); }
 
 using StatusReturningFunction = std::function<Status()>;
 
-StatusOr<int> CallStatusReturningFunction(StatusReturningFunction func) {
+StatusOr<int> CallStatusReturningFunction(const StatusReturningFunction& func) {
   TF_RETURN_IF_ERROR(func());
   return 42;
 }
diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc
index d98eb2793363ac855b43f88eb4201f34a3b7693b..d3bc3e9225fd65b9ded18e970ecb7c81588078fe 100644
--- a/tensorflow/compiler/xla/statusor_test.cc
+++ b/tensorflow/compiler/xla/statusor_test.cc
@@ -20,10 +20,10 @@ limitations under the License.
 #include <memory>
 #include <type_traits>
 
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/test.h b/tensorflow/compiler/xla/test.h
new file mode 100644
index 0000000000000000000000000000000000000000..87a8c5f3a528289d47c1729ae6719aae47037c36
--- /dev/null
+++ b/tensorflow/compiler/xla/test.h
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPLIER_XLA_TEST_H_
+#define TENSORFLOW_COMPLIER_XLA_TEST_H_
+
+// This header includes gmock.h and enables the use of gmock matchers in tests
+// in third_party/tensorflow/compiler/xla.
+//
+// Test including this header can use the macros EXPECT_THAT(...) and
+// ASSERT_THAT(...) in combination with gmock matchers.
+// Example:
+//  std::vector<int> vec = Foo();
+//  EXPECT_THAT(vec, ::testing::ElementsAre(1,2,3));
+//
+// For more details on gmock matchers see:
+// https://github.com/google/googletest/blob/master/googlemock/docs/CheatSheet.md#matchers
+//
+// The advantages of using gmock matchers instead of self defined matchers are
+// better error messages, more maintainable tests and more test coverage.
+//
+// Note that while the use of gmock matchers is allowed in the xla project, the
+// use of mocks is disallowed in the whole tensorflow project!
+
+#include "tensorflow/core/platform/platform.h"
+
+#if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID)
+#include "testing/base/public/gmock.h"
+#else
+#include <gmock/gmock-generated-matchers.h>
+#include <gmock/gmock-matchers.h>
+#endif
+
+#include "tensorflow/core/platform/test.h"
+
+#endif  // TENSORFLOW_COMPLIER_XLA_TEST_H_
diff --git a/tensorflow/compiler/xla/test_helpers.cc b/tensorflow/compiler/xla/test_helpers.cc
deleted file mode 100644
index 02abfdeab80ee34c79e8d54b825937d6fc4b4053..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/test_helpers.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/regexp.h"
-
-namespace xla {
-namespace testing {
-
-AssertionResult::AssertionResult(const AssertionResult& other)
-    : success_(other.success_),
-      message_(other.message_ != nullptr ? new std::string(*other.message_)
-                                         : static_cast<std::string*>(nullptr)) {
-}
-
-// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-AssertionResult AssertionResult::operator!() const {
-  AssertionResult negation(!success_);
-  if (message_ != nullptr) negation << *message_;
-  return negation;
-}
-
-AssertionResult& AssertionResult::operator=(const AssertionResult& ar) {
-  success_ = ar.success_;
-  message_.reset(ar.message_ != nullptr ? new std::string(*ar.message_)
-                                        : nullptr);
-  return *this;
-}
-
-AssertionResult AssertionFailure() { return AssertionResult(false); }
-
-AssertionResult AssertionSuccess() { return AssertionResult(true); }
-
-std::function<bool(tensorflow::StringPiece)> ContainsRegex(
-    const tensorflow::StringPiece regex) {
-  return [regex](const tensorflow::StringPiece to_test) {
-    if (RE2::PartialMatch(
-            tensorflow::RegexpStringPiece(to_test.data(), to_test.size()),
-            tensorflow::RegexpStringPiece(regex.data(), regex.size()))) {
-      return true;
-    } else {
-      LOG(ERROR) << "Expected to find " << regex << " in " << to_test;
-      return false;
-    }
-  };
-}
-
-std::function<bool(tensorflow::StringPiece)> HasSubstr(
-    const tensorflow::StringPiece part) {
-  return [part](const tensorflow::StringPiece whole) {
-    return whole.contains(part);
-  };
-}
-
-}  // namespace testing
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/test_helpers.h b/tensorflow/compiler/xla/test_helpers.h
index f923d9f36c878c1ae4e37f052a84e9c2a279b4ed..634cdb5aa29651b08090ff99f0a6cafb9facb645 100644
--- a/tensorflow/compiler/xla/test_helpers.h
+++ b/tensorflow/compiler/xla/test_helpers.h
@@ -39,286 +39,6 @@ class Literal;
 
 namespace testing {
 
-class AssertionResult {
- public:
-  explicit AssertionResult(bool success) : success_(success) {}
-
-  // Returns true iff the assertion succeeded.
-  operator bool() const { return success_; }  // NOLINT
-
-  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-  AssertionResult operator!() const;
-
-  // Returns the text streamed into this AssertionResult. Test assertions
-  // use it when they fail (i.e., the predicate's outcome doesn't match the
-  // assertion's expectation). When nothing has been streamed into the
-  // object, returns an empty string.
-  const char* message() const {
-    return message_ != nullptr ? message_->c_str() : "";
-  }
-
-  // Streams a custom failure message into this object.
-  template <typename T>
-  AssertionResult& operator<<(const T& value) {
-    AppendMessage(::testing::Message() << value);
-    return *this;
-  }
-
-  // Allows streaming basic output manipulators such as endl or flush into
-  // this object.
-  AssertionResult& operator<<(
-      std::ostream& (*basic_manipulator)(std::ostream& stream)) {
-    AppendMessage(::testing::Message() << basic_manipulator);
-    return *this;
-  }
-
-  // Copy operator.
-  AssertionResult(const AssertionResult& ar);
-
-  // Assignment operator.
-  AssertionResult& operator=(const AssertionResult&);
-
- private:
-  // Appends the contents of message to message_.
-  void AppendMessage(const ::testing::Message& a_message) {
-    if (message_ == nullptr) message_.reset(new std::string);
-    message_->append(a_message.GetString().c_str());
-  }
-
-  bool success_ = false;
-
-  // Stores the message describing the condition in case the
-  // expectation construct is not satisfied with the predicate's
-  // outcome.  Referenced via a pointer to avoid taking too much stack
-  // frame space with test assertions.
-  std::unique_ptr<std::string> message_;
-};
-
-AssertionResult AssertionFailure();
-
-AssertionResult AssertionSuccess();
-
-std::function<bool(tensorflow::StringPiece)> ContainsRegex(
-    const tensorflow::StringPiece regex);
-
-std::function<bool(tensorflow::StringPiece)> HasSubstr(
-    const tensorflow::StringPiece part);
-
-// Matcher for a vector of same-type values for which operator= is
-// defined.
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> VectorMatcher(
-    const std::vector<T>& expected) {
-  return [expected](const std::vector<T>& actual) -> AssertionResult {
-    int len = expected.size();
-    if (actual.size() != len) {
-      return AssertionFailure() << "Actual values len of " << actual.size()
-                                << " != expected.size " << len;
-    }
-    for (int i = 0; i < len; ++i) {
-      if (actual[i] != expected[i]) {
-        return AssertionFailure() << "Element " << i << " actual " << actual[i]
-                                  << " != " << expected[i];
-      }
-    }
-    return AssertionSuccess();
-  };
-}
-
-// Approximate matcher for a vector of floats or similar.
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)>
-ApproxVectorMatcher(const std::vector<T>& expected, float abs_diff,
-                    float rel_diff) {
-  return [abs_diff, rel_diff,
-          expected](const std::vector<T>& actual) -> AssertionResult {
-    int len = expected.size();
-    if (actual.size() != len) {
-      AssertionResult ar = AssertionFailure() << "Actual values len of "
-                                              << actual.size()
-                                              << " != expected.size " << len;
-      LOG(ERROR) << ar.message();
-      return ar;
-    }
-    for (int i = 0; i < len; ++i) {
-      T diff = actual[i] - expected[i];
-      if (diff < 0) {
-        diff *= -1;
-      }
-      if (diff > abs_diff) {
-        T rdiff = (expected[i] != 0 ? diff / expected[i] : 0.0 * expected[i]);
-        if (rdiff > rel_diff) {
-          AssertionResult ar = AssertionFailure()
-                               << "Element " << i << " actual " << actual[i]
-                               << " != " << expected[i]
-                               << "( abs_diff = " << diff
-                               << ", rel_diff = " << rdiff << ")";
-          LOG(ERROR) << ar.message();
-          return ar;
-        }
-      }
-    }
-    return AssertionSuccess();
-  };
-}
-
-// Matches a vector of same-type values against another, succeeding so
-// long as they have the same length and every value in 'actual'
-// matches one in 'expected.'  Does not verify an exhaustive
-// one-to-one mapping between the two.
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)>
-UnorderedElementsAre(const std::vector<T>& expected) {
-  return [expected](const std::vector<T>& actual) -> AssertionResult {
-    if (actual.size() != expected.size()) {
-      return AssertionFailure() << "sizes don't match";
-    }
-    for (auto a : actual) {
-      bool found = false;
-      for (auto e : expected) {
-        if (a == e) {
-          found = true;
-          break;
-        }
-      }
-      if (!found) {
-        return AssertionFailure() << "actual element " << a
-                                  << " not in expected";
-      }
-    }
-    return AssertionSuccess();
-  };
-}
-
-// Overloaded cover functions for UnorderedElementsAre, for the numbers
-// of values used in practice.
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher(
-    T a) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  return testing::UnorderedElementsAre<T>(expected);
-}
-
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher(
-    T a, T b) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  expected.push_back(b);
-  return testing::UnorderedElementsAre<T>(expected);
-}
-
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher(
-    T a, T b, T c) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  expected.push_back(b);
-  expected.push_back(c);
-  return testing::UnorderedElementsAre<T>(expected);
-}
-
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher(
-    T a, T b, T c, T d) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  expected.push_back(b);
-  expected.push_back(c);
-  expected.push_back(d);
-  return testing::UnorderedElementsAre<T>(expected);
-}
-
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher(
-    T a, T b, T c, T d, T e) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  expected.push_back(b);
-  expected.push_back(c);
-  expected.push_back(d);
-  expected.push_back(e);
-  return testing::UnorderedElementsAre<T>(expected);
-}
-
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> UnorderedMatcher(
-    T a, T b, T c, T d, T e, T f) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  expected.push_back(b);
-  expected.push_back(c);
-  expected.push_back(d);
-  expected.push_back(e);
-  expected.push_back(f);
-  return testing::UnorderedElementsAre<T>(expected);
-}
-
-// Overloaded cover functions for VectorMatcher for the numbers of
-// elements used in practice.
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> OrderedMatcher(
-    T a) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  return testing::VectorMatcher<T>(expected);
-}
-
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> OrderedMatcher(
-    T a, T b) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  expected.push_back(b);
-  return testing::VectorMatcher<T>(expected);
-}
-
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> OrderedMatcher(
-    T a, T b, T c) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  expected.push_back(b);
-  expected.push_back(c);
-  return testing::VectorMatcher<T>(expected);
-}
-
-template <typename T>
-std::function<AssertionResult(const std::vector<T>& actual)> OrderedMatcher(
-    T a, T b, T c, T d) {
-  std::vector<T> expected;
-  expected.push_back(a);
-  expected.push_back(b);
-  expected.push_back(c);
-  expected.push_back(d);
-  return testing::VectorMatcher<T>(expected);
-}
-
-// Convert a RepeatedField to a flat vector.
-template <typename T>
-std::vector<T> PBToVec(const tensorflow::protobuf::RepeatedField<T> rf) {
-  return std::vector<T>(rf.begin(), rf.end());
-}
-
-// Convert a List to a flat vector.
-template <typename T>
-std::vector<T> ListToVec(const std::list<T>& l) {
-  return std::vector<T>(l.begin(), l.end());
-}
-
-// Convert a Set to a flat vector.
-template <typename T>
-std::vector<T> SetToVec(const std::set<T>& c) {
-  return std::vector<T>(c.begin(), c.end());
-}
-
-// Convert an Array to a flat vector.
-template <typename T>
-std::vector<T> Array2DToVec(const Array2D<T>& a) {
-  return std::vector<T>(a.data(), a.data() + a.num_elements());
-}
-
 namespace internal_status {
 inline const ::tensorflow::Status& GetStatus(
     const ::tensorflow::Status& status) {
@@ -347,9 +67,4 @@ inline const ::tensorflow::Status& GetStatus(const StatusOr<T>& status) {
   ASSERT_EQ(tensorflow::Status::OK(), \
             xla::testing::internal_status::GetStatus(expression))
 
-// Macros that apply a Matcher to a Value, returning an
-// AssertionResult which gets digested by a standard gunit macro.
-#define EXPECT_MATCH(V, M) EXPECT_TRUE((M)((V)))
-#define ASSERT_MATCH(V, M) ASSERT_TRUE(M(V))
-
 #endif  // TENSORFLOW_COMPILER_XLA_TEST_HELPERS_H_
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index c7cbbdf4999970b0a09660ddadc31a068c752a55..e0c2b9ab09c28a7b7a31917b9250bdca8016d1e0 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -69,6 +69,7 @@ cc_library(
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -103,6 +104,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:transfer_manager",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
@@ -198,11 +200,13 @@ cc_library(
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/service:pool",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//third_party/eigen3",
     ],
 )
 
@@ -889,6 +893,7 @@ xla_test(
     name = "copy_test",
     srcs = ["copy_test.cc"],
     deps = [
+        ":client_library_test_base",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:util",
@@ -1204,12 +1209,12 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/legacy_flags:cpu_compiler_flags",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/legacy_flags:hlo_pass_pipeline_flags",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -1359,6 +1364,7 @@ cc_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/service:computation_tracker",
         "//tensorflow/compiler/xla/service:local_service",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test_main",
     ],
 )
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index d18511a6b4a98d42640ed22f6aa69c2e66465f8a..319cd2c6fd18e328435613de86fa2ad1d84f90aa 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -29,13 +29,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/casts.h"
-#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -242,6 +242,150 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementF32s) {
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
+TEST_F(ArrayElementwiseOpTest, DivS32s) {
+  // clang-format off
+  // Some interesting values to test.
+  std::vector<int32> vals = {
+    INT32_MIN, INT32_MIN + 1, INT32_MIN + 2, -0x40000000, -0x3fffffff,
+    -271181, -1309, -17, -10, -5, -3, -2, -1, 0, 1, 2, 3, 5, 10, 17, 26, 101,
+    7919, 0x40000000, INT32_MAX - 2, INT32_MAX - 1, INT32_MAX};
+  // clang-format on
+
+  std::vector<int32> dividends, divisors, quotients, remainders;
+  for (int32 divisor : vals) {
+    if (divisor != 0) {
+      for (int32 dividend : vals) {
+        // Avoid integer overflow.
+        if (dividend != INT32_MIN || divisor != -1) {
+          dividends.push_back(dividend);
+          divisors.push_back(divisor);
+          quotients.push_back(dividend / divisor);
+          remainders.push_back(dividend % divisor);
+        }
+      }
+    }
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    ComputationDataHandle divisor;
+    auto dividend_data =
+        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
+    auto divisor_data =
+        CreateR1Parameter<int32>(divisors, 1, "divisor", &builder, &divisor);
+    builder.Div(dividend, divisor);
+
+    ComputeAndCompareR1<int32>(&builder, quotients,
+                               {dividend_data.get(), divisor_data.get()});
+  }
+
+  // Test with a compile-time constant divisor.
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    auto dividend_data =
+        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
+    builder.Div(dividend, builder.ConstantR1<int32>(divisors));
+
+    ComputeAndCompareR1<int32>(&builder, quotients, {dividend_data.get()});
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    ComputationDataHandle divisor;
+    auto dividend_data =
+        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
+    auto divisor_data =
+        CreateR1Parameter<int32>(divisors, 1, "divisor", &builder, &divisor);
+    builder.Rem(dividend, divisor);
+
+    ComputeAndCompareR1<int32>(&builder, remainders,
+                               {dividend_data.get(), divisor_data.get()});
+  }
+
+  // Test with a compile-time constant divisor.
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    auto dividend_data =
+        CreateR1Parameter<int32>(dividends, 0, "dividend", &builder, &dividend);
+    builder.Rem(dividend, builder.ConstantR1<int32>(divisors));
+
+    ComputeAndCompareR1<int32>(&builder, remainders, {dividend_data.get()});
+  }
+}
+
+TEST_F(ArrayElementwiseOpTest, DivU32s) {
+  // clang-format off
+  // Some interesting values to test.
+  std::vector<uint32> vals = {
+    0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0xABCDEF12, 0xCAFEBEEF, 0x80000000,
+    0x80000001, UINT32_MAX - 2, UINT32_MAX - 1, UINT32_MAX};
+  // clang-format on
+
+  std::vector<uint32> dividends, divisors, quotients, remainders;
+  for (uint32 divisor : vals) {
+    if (divisor != 0) {
+      for (uint32 dividend : vals) {
+        dividends.push_back(dividend);
+        divisors.push_back(divisor);
+        quotients.push_back(dividend / divisor);
+        remainders.push_back(dividend % divisor);
+      }
+    }
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    ComputationDataHandle divisor;
+    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
+                                                   &builder, &dividend);
+    auto divisor_data =
+        CreateR1Parameter<uint32>(divisors, 1, "divisor", &builder, &divisor);
+    builder.Div(dividend, divisor);
+
+    ComputeAndCompareR1<uint32>(&builder, quotients,
+                                {dividend_data.get(), divisor_data.get()});
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
+                                                   &builder, &dividend);
+    builder.Div(dividend, builder.ConstantR1<uint32>(divisors));
+
+    ComputeAndCompareR1<uint32>(&builder, quotients, {dividend_data.get()});
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    ComputationDataHandle divisor;
+    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
+                                                   &builder, &dividend);
+    auto divisor_data =
+        CreateR1Parameter<uint32>(divisors, 1, "divisor", &builder, &divisor);
+    builder.Rem(dividend, divisor);
+
+    ComputeAndCompareR1<uint32>(&builder, remainders,
+                                {dividend_data.get(), divisor_data.get()});
+  }
+
+  {
+    ComputationBuilder builder(client_, TestName());
+    ComputationDataHandle dividend;
+    auto dividend_data = CreateR1Parameter<uint32>(dividends, 0, "dividend",
+                                                   &builder, &dividend);
+    builder.Rem(dividend, builder.ConstantR1<uint32>(divisors));
+
+    ComputeAndCompareR1<uint32>(&builder, remainders, {dividend_data.get()});
+  }
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>(
@@ -486,6 +630,18 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
+TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
+  // Disable fast-math because we're operating on NaNs.
+  SetFastMathDisabled(true);
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, NAN, 6.0f});
+  auto rhs = builder.ConstantR1<float>({10.0f, 25.5f, 1.0f, 10.0f, NAN});
+  auto compare = builder.Ne(lhs, rhs);
+
+  ComputeAndCompareR1<bool>(&builder, {true, false, true, true, true}, {});
+}
+
 TEST_F(ArrayElementwiseOpTest, CompareNeS32s) {
   const int32 min = std::numeric_limits<int32>::min();
   const int32 max = std::numeric_limits<int32>::max();
@@ -620,12 +776,14 @@ TEST_F(ArrayElementwiseOpTest, CompareLtU32s) {
 TEST_F(ArrayElementwiseOpTest, PowF32s) {
   SetFastMathDisabled(true);
   ComputationBuilder builder(client_, TestName());
-  auto lhs = builder.ConstantR1<float>({4.0f, 2.0f, 2.0f, NAN, 6.0f});
-  auto rhs = builder.ConstantR1<float>({2.0f, -2.0f, 3.0f, 10.0f, NAN});
+  auto lhs =
+      builder.ConstantR1<float>({4.0f, 2.0f, 2.0f, NAN, 6.0f, -2.0f, -2.0f});
+  auto rhs =
+      builder.ConstantR1<float>({2.0f, -2.0f, 3.0f, 10.0f, NAN, 3.0f, 4.0f});
   auto minimum = builder.Pow(lhs, rhs);
 
-  ComputeAndCompareR1<float>(&builder, {16.0f, 0.25f, 8.0f, NAN, NAN}, {},
-                             error_spec_);
+  ComputeAndCompareR1<float>(
+      &builder, {16.0f, 0.25f, 8.0f, NAN, NAN, -8.0f, 16.0f}, {}, error_spec_);
 }
 
 XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
@@ -1667,9 +1825,9 @@ TEST_F(ArrayElementwiseOpTest, CannotAddOpaques) {
   auto concatenated = builder.Add(x, x);
   StatusOr<Computation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
-  EXPECT_MATCH(computation_status.status().ToString(),
-               testing::ContainsRegex(
-                   "Expected non-opaque argument for lhs of binary operation"));
+  EXPECT_THAT(computation_status.status().ToString(),
+              ::testing::ContainsRegex(
+                  "Expected non-opaque argument for lhs of binary operation"));
 }
 
 // Regression test for b/31927799. "slice - y" is fused and requires implicit
diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
index c7b533b80f1901a32324a15a8f6584e628a4ad30..a67f18a44e10249bb4674624476c617d6f5c5ce5 100644
--- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
+++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
@@ -23,12 +23,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
@@ -45,8 +44,8 @@ TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) {
   StatusOr<Computation> computation = builder.Build();
   EXPECT_FALSE(computation.ok());
   LOG(INFO) << "status received: " << computation.status();
-  EXPECT_MATCH(computation.status().error_message(),
-               testing::HasSubstr("shape has invalid"));
+  EXPECT_THAT(computation.status().error_message(),
+              ::testing::HasSubstr("shape has invalid"));
 }
 
 TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) {
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index 63744afb4ea72006262aad74e9b8d75a09b107e6..901bed5f1488d6df19b6b0d3a1772d07fb60bf6d 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -24,16 +24,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
 
 using BroadcastSimpleTest = ClientLibraryTestBase;
+using ::testing::HasSubstr;
 
 XLA_TEST_F(BroadcastSimpleTest, ScalarNoOpBroadcast) {
   ComputationBuilder b(client_, TestName());
@@ -89,6 +89,33 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
   ComputeAndCompareR2<float>(&b, expected, {}, ErrorSpec(0.0001));
 }
 
+// Tests implicit broadcasting of PREDs.
+XLA_TEST_F(BroadcastSimpleTest, LogicalAnd2DTo3D_Pred) {
+  ComputationBuilder b(client_, TestName());
+
+  Array2D<bool> x_vals(2, 1);
+  x_vals(0, 0) = true;
+  x_vals(1, 0) = false;
+  Array3D<bool> y_vals(2, 2, 1);
+  y_vals(0, 0, 0) = false;
+  y_vals(0, 1, 0) = false;
+  y_vals(1, 0, 0) = true;
+  y_vals(1, 1, 0) = true;
+
+  ComputationDataHandle x, y;
+  auto x_data = CreateR2Parameter<bool>(x_vals, 0, "x", &b, &x);
+  auto y_data = CreateR3Parameter<bool>(y_vals, 1, "y", &b, &y);
+  b.LogicalAnd(x, y, /*broadcast_dimensions=*/{1, 2});
+
+  Array3D<bool> expected(2, 2, 1);
+  expected(0, 0, 0) = false;
+  expected(0, 1, 0) = false;
+  expected(1, 0, 0) = true;
+  expected(1, 1, 0) = false;
+
+  ComputeAndCompareR3<bool>(&b, expected, {x_data.get(), y_data.get()});
+}
+
 XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) {
   ComputationBuilder b(client_, TestName());
   b.Broadcast(b.ConstantR1<float>({}), {2});
@@ -127,6 +154,251 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) {
   ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
 }
 
+struct R3ImplicitBroadcastSpec {
+  std::array<int64, 3> output_bounds;
+  std::array<int64, 3> minor2major_layout;
+  std::array<int64, 3> input_bounds;
+  HloOpcode op;
+} kR3ImplicitBroadcastTestCases[] = {
+    {{{1, 1, 1}}, {{2, 1, 0}}, {{1, 1, 1}}, HloOpcode::kAdd},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 1, 5}}, HloOpcode::kMaximum},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 4, 1}}, HloOpcode::kMinimum},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 1, 1}}, HloOpcode::kMultiply},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 1, 1}}, HloOpcode::kAdd},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{1, 4, 5}}, HloOpcode::kAdd},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 4, 1}}, HloOpcode::kAdd},
+    {{{3, 4, 5}}, {{2, 1, 0}}, {{3, 1, 5}}, HloOpcode::kAdd},
+    {{{3, 199, 5}}, {{2, 1, 0}}, {{1, 199, 1}}, HloOpcode::kMinimum},
+    {{{3, 4, 199}}, {{2, 1, 0}}, {{1, 1, 199}}, HloOpcode::kAdd},
+};
+
+class BroadcastR3ImplicitTest
+    : public BroadcastSimpleTest,
+      public ::testing::WithParamInterface<R3ImplicitBroadcastSpec> {};
+
+XLA_TEST_P(BroadcastR3ImplicitTest, Doit) {
+  const R3ImplicitBroadcastSpec& spec = GetParam();
+  ComputationBuilder builder(client_, TestName());
+  const Shape r3_shape = ShapeUtil::MakeShapeWithLayout(
+      F32, spec.output_bounds, spec.minor2major_layout);
+  Array3D<float> r3_array(spec.output_bounds[0], spec.output_bounds[1],
+                          spec.output_bounds[2]);
+  r3_array.FillRandom(1.0, 2.5, 56789);
+  auto r3_input =
+      LiteralUtil::Relayout(*LiteralUtil::CreateR3FromArray3D(r3_array),
+                            LayoutUtil::MakeLayout(spec.minor2major_layout));
+  std::unique_ptr<GlobalData> r3_global_data =
+      client_->TransferToServer(*r3_input).ConsumeValueOrDie();
+
+  const Shape r3_implicit_shape = ShapeUtil::MakeShapeWithLayout(
+      F32, spec.input_bounds, spec.minor2major_layout);
+  Array3D<float> r3_implicit_array(spec.input_bounds[0], spec.input_bounds[1],
+                                   spec.input_bounds[2]);
+  r3_implicit_array.FillRandom(1.0, 0.2, 56789);
+  auto r3_implicit_input = LiteralUtil::Relayout(
+      *LiteralUtil::CreateR3FromArray3D(r3_implicit_array),
+      LayoutUtil::MakeLayout(spec.minor2major_layout));
+  std::unique_ptr<GlobalData> r3_implicit_global_data =
+      client_->TransferToServer(*r3_implicit_input).ConsumeValueOrDie();
+
+  auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input");
+  auto r3_parameter = builder.Parameter(1, r3_shape, "input");
+  ComputationDataHandle op;
+  switch (spec.op) {
+    case HloOpcode::kMinimum: {
+      auto tmp_op = builder.Min(r3_implicit_parameter, r3_parameter);
+      op.Swap(&tmp_op);
+      break;
+    }
+    case HloOpcode::kMaximum: {
+      auto tmp_op = builder.Max(r3_implicit_parameter, r3_parameter);
+      op.Swap(&tmp_op);
+      break;
+    }
+    case HloOpcode::kMultiply: {
+      auto tmp_op = builder.Mul(r3_implicit_parameter, r3_parameter);
+      op.Swap(&tmp_op);
+      break;
+    }
+    default: {
+      // Default to Add
+      auto tmp_op = builder.Add(r3_implicit_parameter, r3_parameter);
+      op.Swap(&tmp_op);
+    }
+  }
+
+  Array3D<float> expected_array(spec.output_bounds[0], spec.output_bounds[1],
+                                spec.output_bounds[2]);
+  auto Each = ([&](tensorflow::gtl::ArraySlice<int64> indices, float* value) {
+    float r3_implicit = r3_implicit_array(indices[0] % spec.input_bounds[0],
+                                          indices[1] % spec.input_bounds[1],
+                                          indices[2] % spec.input_bounds[2]);
+    float r3 = r3_array(indices[0], indices[1], indices[2]);
+    switch (spec.op) {
+      case HloOpcode::kMinimum: {
+        *value = std::min(r3_implicit, r3);
+        break;
+      }
+      case HloOpcode::kMaximum: {
+        *value = std::max(r3_implicit, r3);
+        break;
+      }
+      case HloOpcode::kMultiply: {
+        *value = r3_implicit * r3;
+        break;
+      }
+      default: {
+        // Default to Add
+        *value = r3_implicit + r3;
+        break;
+      }
+    }
+  });
+
+  int n1 = expected_array.n1();
+  int n2 = expected_array.n2();
+  int n3 = expected_array.n3();
+  for (int64 i = 0; i < n1; i++) {
+    for (int64 j = 0; j < n2; j++) {
+      for (int64 k = 0; k < n3; k++) {
+        Each({i, j, k}, &expected_array(i, j, k));
+      }
+    }
+  }
+  auto expected = LiteralUtil::CreateR3FromArray3D(expected_array);
+  ComputeAndCompareLiteral(
+      &builder, *expected,
+      {r3_implicit_global_data.get(), r3_global_data.get()},
+      ErrorSpec(1e-7, 1e-7));
+}
+
+INSTANTIATE_TEST_CASE_P(BroadcastR3ImplicitTestInstances,
+                        BroadcastR3ImplicitTest,
+                        ::testing::ValuesIn(kR3ImplicitBroadcastTestCases));
+
+// r1 and r3's dim0 matches, and r1's dim1 and dim2 have size 1:
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) {
+  ComputationBuilder b(client_, TestName());
+  ComputationDataHandle r1h;
+  ComputationDataHandle r3h;
+
+  Array3D<float> r1d = {{{1}}, {{2}}};
+  Array3D<float> r3d = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}};
+  auto r1 = CreateR3Parameter(r1d, 1, "r1", &b, &r1h);
+  auto r3 = CreateR3Parameter(r3d, 0, "r3", &b, &r3h);
+
+  b.Add(r3h, r1h);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 3}, {4, 5}}, {{7, 8}, {9, 10}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {r3.get(), r1.get()},
+                           ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 4}, {4, 6}}, {{6, 8}, {8, 10}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1}, {2}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 3}, {5, 6}}, {{6, 7}, {9, 10}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 =
+      b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 4}, {6, 8}}, {{6, 8}, {10, 12}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 =
+      b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1, 2}}, {{3, 4}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 4}, {4, 6}}, {{8, 10}, {10, 12}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1}, {2}}, {{3}, {4}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 3}, {5, 6}}, {{8, 9}, {11, 12}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR3<float>({{{1}}}));
+  auto r3 = b.ConstantLiteral(
+      *LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}));
+  b.Add(r3, r1);
+
+  auto expected =
+      LiteralUtil::CreateR3<float>({{{2, 3}, {4, 5}}, {{6, 7}, {8, 9}}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}}));
+  auto r2 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}}));
+  b.Add(r2, r1);
+
+  auto expected = LiteralUtil::CreateR2<float>({{2, 4}, {4, 6}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
+XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) {
+  ComputationBuilder b(client_, TestName());
+  auto r1 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1}, {2}}));
+  auto r2 = b.ConstantLiteral(*LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}}));
+  b.Add(r2, r1);
+
+  auto expected = LiteralUtil::CreateR2<float>({{2, 3}, {5, 6}});
+
+  ComputeAndCompareLiteral(&b, *expected, {}, ErrorSpec(0.0001));
+}
+
 XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) {
   ComputationBuilder b(client_, TestName());
   auto r1 = b.ConstantR1<float>({10, 20});
@@ -220,8 +492,8 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) {
 
   auto result_status = Execute(&b, {});
   EXPECT_FALSE(result_status.ok());
-  EXPECT_MATCH(result_status.status().error_message(),
-               testing::ContainsRegex("broadcast dimension 0 mismatch"));
+  EXPECT_THAT(result_status.status().error_message(),
+              HasSubstr("broadcast dimension 0 mismatch"));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
@@ -233,9 +505,8 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
 
   auto result_status = Execute(&b, {});
   EXPECT_FALSE(result_status.ok());
-  EXPECT_MATCH(
-      result_status.status().error_message(),
-      testing::ContainsRegex("binary op BINOP_ADD with incompatible shapes"));
+  EXPECT_THAT(result_status.status().error_message(),
+              HasSubstr("binary op BINOP_ADD with incompatible shapes"));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
@@ -247,9 +518,8 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
 
   auto result_status = Execute(&b, {});
   EXPECT_FALSE(result_status.ok());
-  EXPECT_MATCH(
-      result_status.status().error_message(),
-      testing::ContainsRegex("binary op BINOP_ADD with incompatible shapes"));
+  EXPECT_THAT(result_status.status().error_message(),
+              HasSubstr("binary op BINOP_ADD with incompatible shapes"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc
index 1796a732e543b7f040adf6055e349d72cfcfad6e..16d4282466c8a6db5e3e34bfa9deb86fd339c27b 100644
--- a/tensorflow/compiler/xla/tests/broadcast_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_test.cc
@@ -265,6 +265,37 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) {
       *LiteralUtil::CreateR4FromArray4D<float>(expected), *result, error_spec_);
 }
 
+TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
+  auto builder = HloComputation::Builder(TestName());
+  Array3D<float> input_vals(2, 3, 4);
+  input_vals.FillRandom(1.0);
+
+  Array4D<float> expected(2, 3, 4, 5);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        for (int m = 0; m < 5; ++m) {
+          expected(i, j, k, m) = input_vals(i, j, k);
+        }
+      }
+    }
+  }
+  auto input = builder.AddInstruction(HloInstruction::CreateConstant(
+      LiteralUtil::CreateR3FromArray3D<float>(input_vals)));
+
+  // Broadcast vector in dimensions 2 and 3.
+  builder.AddInstruction(HloInstruction::CreateBroadcast(
+      ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), input, {0, 1, 2}));
+
+  // Create HLO module, compile, and execute.
+  auto hlo_module = MakeUnique<HloModule>(TestName());
+  hlo_module->AddEntryComputation(builder.Build());
+  auto result = ExecuteAndTransfer(std::move(hlo_module), {});
+
+  LiteralTestUtil::ExpectNear(
+      *LiteralUtil::CreateR4FromArray4D<float>(expected), *result, error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc
index 0b5e6d512771cc6aebfd92af81bfdfa56d176088..9b96173aaa01199bdaf18d4b56d9f118432b2655 100644
--- a/tensorflow/compiler/xla/tests/call_test.cc
+++ b/tensorflow/compiler/xla/tests/call_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
index 675c9fccb007f5a0a16b50618e849d3740877403..1bb1a1d6b4e4ce79413642b542cec8dd64ecba86 100644
--- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
+++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
@@ -22,15 +22,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
 
+using ::testing::ContainsRegex;
+
 class CheckExecutionArityTest : public ClientLibraryTestBase {};
 
 TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) {
@@ -60,15 +62,15 @@ TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) {
   ASSERT_FALSE(result_one_arg.ok());
   ASSERT_EQ(result_one_arg.status().code(),
             tensorflow::error::INVALID_ARGUMENT);
-  ASSERT_MATCH(result_one_arg.status().error_message(),
-               testing::ContainsRegex("takes 2"));
+  ASSERT_THAT(result_one_arg.status().error_message(),
+              ContainsRegex("takes 2"));
 
   auto result_zero_args = client_->Execute(computation, {});
   ASSERT_FALSE(result_zero_args.ok());
   ASSERT_EQ(result_zero_args.status().code(),
             tensorflow::error::INVALID_ARGUMENT);
-  ASSERT_MATCH(result_zero_args.status().error_message(),
-               testing::ContainsRegex("takes 2"));
+  ASSERT_THAT(result_zero_args.status().error_message(),
+              ContainsRegex("takes 2"));
 }
 
 XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
@@ -99,22 +101,22 @@ XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) {
   status = client_->Execute(computation, {f32_4_data.get(), f32_4_data.get()});
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
-  ASSERT_MATCH(status.status().error_message(),
-               testing::ContainsRegex("expects parameter 0"));
+  ASSERT_THAT(status.status().error_message(),
+              ContainsRegex("expects parameter 0"));
 
   // Shape mismatch in parameter 1 (rank)
   status = client_->Execute(computation, {f32_data.get(), f32_data.get()});
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
-  ASSERT_MATCH(status.status().error_message(),
-               testing::ContainsRegex("expects parameter 1"));
+  ASSERT_THAT(status.status().error_message(),
+              ContainsRegex("expects parameter 1"));
 
   // Shape mismatch in parameter 1 (element type)
   status = client_->Execute(computation, {f32_data.get(), u8_4_data.get()});
   ASSERT_FALSE(status.ok());
   ASSERT_EQ(status.status().code(), tensorflow::error::INVALID_ARGUMENT);
-  ASSERT_MATCH(status.status().error_message(),
-               testing::ContainsRegex("expects parameter 1"));
+  ASSERT_THAT(status.status().error_message(),
+              ContainsRegex("expects parameter 1"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc
index e6f3225bb79fca99f189d1e7ae7913715c5c2246..d5acea32ef700dc802dd7900b7ec8d454112f3e8 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -43,12 +42,13 @@ void CodegenTestBase::CompileAndVerifyIr(std::unique_ptr<HloModule> hlo_module,
 
 std::unique_ptr<Executable> CodegenTestBase::CompileToExecutable(
     std::unique_ptr<HloModule> hlo_module) {
-  auto module_config = MakeUnique<HloModuleConfig>(
+  HloModuleConfig module_config(
       hlo_module->entry_computation()->ComputeProgramShape());
-  module_config->set_fast_math_disabled(fast_math_disabled_);
+  module_config.set_fast_math_disabled(fast_math_disabled_);
+  hlo_module->set_config(module_config);
   return backend_->compiler()
-      ->Compile(std::move(hlo_module), std::move(module_config),
-                test_hlo_dumper_, backend_->default_stream_executor())
+      ->Compile(std::move(hlo_module), test_hlo_dumper_,
+                backend_->default_stream_executor())
       .ConsumeValueOrDie();
 }
 
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 709ce5029c82d52fe7a577d1e4cf7ea6ec07cecb..1d998fe33ebf71a2b35f99a51038e874edacc046 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -17,43 +17,81 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
+#include "tensorflow/compiler/xla/legacy_flags/hlo_pass_pipeline_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace {
 
-class ComputeConstantTest : public ClientLibraryTestBase {
+// An enumerator for the client types that we want to iterate over in
+// the various tests.
+enum class ClientType { kLocal, kCompileOnly };
+ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly};
+
+class ComputeConstantTest : public ::testing::Test {
  public:
+  explicit ComputeConstantTest(
+      perftools::gputools::Platform* platform = nullptr,
+      tensorflow::gtl::ArraySlice<string> disabled_pass_names = {})
+      : platform_(platform) {
+    legacy_flags::HloPassPipelineFlags* flags =
+        legacy_flags::GetHloPassPipelineFlags();
+    flags->xla_disable_hlo_passes =
+        tensorflow::str_util::Join(disabled_pass_names, ",");
+  }
+
+  string TestName() const {
+    return ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  }
+
+  Client* ClientOrDie(::perftools::gputools::Platform* platform,
+                      ClientType client_type) {
+    if (client_type == ClientType::kLocal) {
+      StatusOr<Client*> result =
+          ClientLibrary::GetOrCreateLocalClient(platform);
+      TF_CHECK_OK(result.status())
+          << "could not create LocalClient for testing";
+      return result.ValueOrDie();
+    } else if (client_type == ClientType::kCompileOnly) {
+      StatusOr<Client*> result =
+          ClientLibrary::GetOrCreateCompileOnlyClient(platform);
+      TF_CHECK_OK(result.status())
+          << "could not create CompileOnlyClient for testing";
+      return result.ValueOrDie();
+    }
+    LOG(FATAL) << "invalid client_type value";
+  }
+
   StatusOr<std::unique_ptr<Literal>> ComputeConstantLiteral(
-      ComputationDataHandle operand, ComputationBuilder* builder,
-      Layout* output_layout = nullptr) {
+      Client* client, const ComputationDataHandle& operand,
+      ComputationBuilder* builder, Layout* output_layout = nullptr) {
     TF_ASSIGN_OR_RETURN(auto remote_computed,
                         builder->ComputeConstant(operand, output_layout));
-    TF_ASSIGN_OR_RETURN(auto computed, client_->Transfer(*remote_computed));
+    TF_ASSIGN_OR_RETURN(auto computed, client->Transfer(*remote_computed));
     return std::move(computed);
   }
 
   template <class Scalar>
-  StatusOr<Scalar> ComputeConstantScalar(ComputationDataHandle operand,
+  StatusOr<Scalar> ComputeConstantScalar(Client* client,
+                                         const ComputationDataHandle& operand,
                                          ComputationBuilder* builder) {
-    TF_ASSIGN_OR_RETURN(auto literal, ComputeConstantLiteral(operand, builder));
+    TF_ASSIGN_OR_RETURN(auto literal,
+                        ComputeConstantLiteral(client, operand, builder));
     return LiteralUtil::Get<Scalar>(*literal, {});
   }
 
@@ -64,168 +102,193 @@ class ComputeConstantTest : public ClientLibraryTestBase {
     return result.ok() ? result.ValueOrDie() : false;
   }
 
-  template <class Scalar>
-  void ExpectConstantComputedScalar(ComputationDataHandle operand,
-                                    Scalar expected,
-                                    ComputationBuilder* builder) {
-    Scalar computed = ComputeConstantScalar<Scalar>(operand, builder);
-    ASSERT_TRUE(computed.ok()) << computed.status();
-    std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR0(expected);
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
-  }
+  perftools::gputools::Platform* platform_;
 };
 
 TEST_F(ComputeConstantTest, ScalarInt32Literal) {
-  ComputationBuilder b(client_, TestName());
-  auto computation = b.ConstantR0<int32>(42);
-  EXPECT_TRUE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<int32>(computation, &b);
-  ASSERT_TRUE(value.ok()) << value.status();
-  EXPECT_EQ(value.ValueOrDie(), 42);
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation = b.ConstantR0<int32>(42);
+    EXPECT_TRUE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<int32>(client, computation, &b);
+    ASSERT_TRUE(value.ok()) << value.status();
+    EXPECT_EQ(value.ValueOrDie(), 42);
+  }
 }
 
 TEST_F(ComputeConstantTest, ScalarFloatAdd) {
-  ComputationBuilder b(client_, TestName());
-  auto computation =
-      b.Add(b.ConstantR0<float>(42.5f), b.ConstantR0<float>(1.5f));
-  EXPECT_TRUE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<float>(computation, &b);
-  ASSERT_TRUE(value.ok()) << value.status();
-  EXPECT_EQ(value.ValueOrDie(), 44.0f);
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation =
+        b.Add(b.ConstantR0<float>(42.5f), b.ConstantR0<float>(1.5f));
+    EXPECT_TRUE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<float>(client, computation, &b);
+    ASSERT_TRUE(value.ok()) << value.status();
+    EXPECT_EQ(value.ValueOrDie(), 44.0f);
+  }
 }
 
 TEST_F(ComputeConstantTest, ScalarRng) {
-  ComputationBuilder b(client_, TestName());
-  auto computation =
-      b.RngUniform(b.ConstantR0<float>(1.1f), b.ConstantR0<float>(2.1f),
-                   ShapeUtil::MakeShape(F32, {}));
-  EXPECT_FALSE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<float>(computation, &b);
-  ASSERT_FALSE(value.ok())
-      << "computing a RNG value should not be considered a constant";
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation =
+        b.RngUniform(b.ConstantR0<float>(1.1f), b.ConstantR0<float>(2.1f),
+                     ShapeUtil::MakeShape(F32, {}));
+    EXPECT_FALSE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<float>(client, computation, &b);
+    ASSERT_FALSE(value.ok())
+        << "computing a RNG value should not be considered a constant";
+  }
 }
 
 TEST_F(ComputeConstantTest, DirectParam) {
-  ComputationBuilder b(client_, TestName());
-  auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param");
-  EXPECT_FALSE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<float>(computation, &b);
-  EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                  .contains("depends on parameter"))
-      << value.status();
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param");
+    EXPECT_FALSE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<float>(client, computation, &b);
+    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
+                    .contains("depends on parameter"))
+        << value.status();
+  }
 }
 
 TEST_F(ComputeConstantTest, IndirectParam) {
-  ComputationBuilder b(client_, TestName());
-  auto computation =
-      b.Add(b.ConstantR0<float>(1.0f),
-            b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
-  EXPECT_FALSE(IsConstant(computation, &b));
-
-  auto value = ComputeConstantScalar<float>(computation, &b);
-  EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                  .contains("depends on parameter"))
-      << value.status();
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation =
+        b.Add(b.ConstantR0<float>(1.0f),
+              b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
+    EXPECT_FALSE(IsConstant(computation, &b));
+
+    auto value = ComputeConstantScalar<float>(client, computation, &b);
+    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
+                    .contains("depends on parameter"))
+        << value.status();
+  }
 }
 
 // Test computation of an expression interspersed with param nodes but
 // the expression does not depend on the param nodes.
 TEST_F(ComputeConstantTest, UnrelatedParam) {
-  ComputationBuilder b(client_, TestName());
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
 
-  auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0");
-  auto constant_4 = b.Add(b.ConstantR0<float>(2.5f), b.ConstantR0<float>(1.5f));
-  auto not_constant_a = b.Add(constant_4, param_a);
+    auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0");
+    auto constant_4 =
+        b.Add(b.ConstantR0<float>(2.5f), b.ConstantR0<float>(1.5f));
+    auto not_constant_a = b.Add(constant_4, param_a);
 
-  auto param_b = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "param1");
-  auto constant_9 = b.Mul(b.ConstantR0<float>(2.0f), b.ConstantR0<float>(4.5f));
-  auto not_constant_b = b.Add(param_b, constant_9);
+    auto param_b = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "param1");
+    auto constant_9 =
+        b.Mul(b.ConstantR0<float>(2.0f), b.ConstantR0<float>(4.5f));
+    auto not_constant_b = b.Add(param_b, constant_9);
 
-  auto constant_13 = b.Add(constant_4, constant_9);
-  b.Add(not_constant_b, b.Add(constant_13, not_constant_a));
+    auto constant_13 = b.Add(constant_4, constant_9);
+    b.Add(not_constant_b, b.Add(constant_13, not_constant_a));
 
-  EXPECT_TRUE(IsConstant(constant_13, &b));
+    EXPECT_TRUE(IsConstant(constant_13, &b));
 
-  auto value = ComputeConstantScalar<float>(constant_13, &b);
-  ASSERT_TRUE(value.ok()) << value.status();
-  EXPECT_EQ(value.ValueOrDie(), 13.0f);
+    auto value = ComputeConstantScalar<float>(client, constant_13, &b);
+    ASSERT_TRUE(value.ok()) << value.status();
+    EXPECT_EQ(value.ValueOrDie(), 13.0f);
+  }
 }
 
 TEST_F(ComputeConstantTest, NonScalarAdd) {
-  ComputationBuilder b(client_, TestName());
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
 
-  auto computation =
-      b.Add(b.ConstantR1<int32>({1, 2}), b.ConstantR1<int32>({3, 4}));
-  EXPECT_TRUE(IsConstant(computation, &b));
+    auto computation =
+        b.Add(b.ConstantR1<int32>({1, 2}), b.ConstantR1<int32>({3, 4}));
+    EXPECT_TRUE(IsConstant(computation, &b));
 
-  auto computed = ComputeConstantLiteral(computation, &b);
-  ASSERT_TRUE(computed.ok()) << computed.status();
-  std::unique_ptr<Literal> expected_literal =
-      LiteralUtil::CreateR1<int32>({4, 6});
-  LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    auto computed = ComputeConstantLiteral(client, computation, &b);
+    ASSERT_TRUE(computed.ok()) << computed.status();
+    std::unique_ptr<Literal> expected_literal =
+        LiteralUtil::CreateR1<int32>({4, 6});
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+  }
 }
 
 TEST_F(ComputeConstantTest, IntegerDivide) {
-  ComputationBuilder b(client_, TestName());
-  auto computation = b.Div(b.ConstantR0<int32>(15), b.ConstantR0<int32>(3));
-  EXPECT_TRUE(IsConstant(computation, &b));
-
-  auto computed = ComputeConstantLiteral(computation, &b);
-  ASSERT_TRUE(computed.ok()) << computed.status();
-  std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR0<int32>(5);
-  LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
-}
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto computation = b.Div(b.ConstantR0<int32>(15), b.ConstantR0<int32>(3));
+    EXPECT_TRUE(IsConstant(computation, &b));
 
-XLA_TEST_F(ComputeConstantTest, Layout) {
-  ComputationBuilder b(client_, TestName());
-
-  std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
-  for (const std::vector<int64>& layout : layouts) {
-    auto layout_proto = LayoutUtil::MakeLayout(layout);
-    auto computed =
-        ComputeConstantLiteral(b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
-                                     b.ConstantR2<int32>({{10, 20}, {30, 40}})),
-                               &b, &layout_proto);
+    auto computed = ComputeConstantLiteral(client, computation, &b);
     ASSERT_TRUE(computed.ok()) << computed.status();
-
-    std::unique_ptr<Literal> expected_literal =
-        test_utils::CreateR2LiteralWithLayout<int32>({{11, 22}, {33, 44}},
-                                                     layout);
-    LiteralTestUtil::AssertEqualShapesAndLayouts(
-        expected_literal->shape(), computed.ValueOrDie()->shape());
+    std::unique_ptr<Literal> expected_literal = LiteralUtil::CreateR0<int32>(5);
     LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
   }
 }
 
+XLA_TEST_F(ComputeConstantTest, Layout) {
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+
+    std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
+    for (const std::vector<int64>& layout : layouts) {
+      auto layout_proto = LayoutUtil::MakeLayout(layout);
+      auto computed = ComputeConstantLiteral(
+          client,
+          b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
+                b.ConstantR2<int32>({{10, 20}, {30, 40}})),
+          &b, &layout_proto);
+      ASSERT_TRUE(computed.ok()) << computed.status();
+
+      std::unique_ptr<Literal> expected_literal =
+          test_utils::CreateR2LiteralWithLayout<int32>({{11, 22}, {33, 44}},
+                                                       layout);
+      LiteralTestUtil::AssertEqualShapesAndLayouts(
+          expected_literal->shape(), computed.ValueOrDie()->shape());
+      LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    }
+  }
+}
+
 // This test is permanently disabled on CPU because it requires that the
 // backend used for execution is different than the backend used for
 // ComputeConstant which is always cpu.
 TEST_F(ComputeConstantTest, DISABLED_ON_CPU(ReuseComputedConstant)) {
   // Compute a trivial constant, then try to use the value in an Execute
   // call. This should fail because the constant resides on the CPU and the
-  // Execute call is executed on a different backend.
-  ComputationBuilder constant_b(client_, TestName());
+  // Execute call is executed on a different backend.  This test only makes
+  // sense with LocalClient, since CompileOnlyClient does not support
+  // execution.
+  Client* client = ClientOrDie(platform_, ClientType::kLocal);
+  ComputationBuilder constant_b(client, TestName());
   auto constant = constant_b.ConstantR0<int32>(42);
   auto handle = constant_b.ComputeConstant(constant).ConsumeValueOrDie();
-  auto literal = client_->Transfer(*handle).ConsumeValueOrDie();
+  auto literal = client->Transfer(*handle).ConsumeValueOrDie();
   LiteralTestUtil::ExpectR0Equal(42, *literal);
 
   // Build trivial computation which takes one parameter.
-  ComputationBuilder b(client_, TestName());
+  ComputationBuilder b(client, TestName());
   b.Neg(b.Parameter(0, ShapeUtil::MakeShape(S32, {}), "param0"));
   auto computation = b.Build().ConsumeValueOrDie();
 
   // Try to use value from ComputeConstant in Execute.
-  auto execute_status = client_->Execute(computation, {handle.get()});
+  auto execute_status = client->Execute(computation, {handle.get()});
   EXPECT_FALSE(execute_status.ok());
-  EXPECT_MATCH(
+  EXPECT_THAT(
       execute_status.status().error_message(),
-      testing::ContainsRegex("argument 0 is on device Host:0 but computation "
-                             "will be executed on device"));
+      ::testing::ContainsRegex("argument 0 is on device Host:0 but computation "
+                               "will be executed on device"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index 9a48b19b96aea829ded626ddb4ac64c0fa42b64c..63bfac441d3c1f7aa257a7f9fc81df98f47551d5 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -34,6 +35,7 @@ namespace xla {
 namespace {
 
 using ConcatTest = ClientLibraryTestBase;
+using ::testing::HasSubstr;
 
 // Concatenate expects at least one argument.
 XLA_TEST_F(ConcatTest, Concat_Nothing) {
@@ -41,9 +43,8 @@ XLA_TEST_F(ConcatTest, Concat_Nothing) {
   auto concatenated = builder.ConcatInDim({}, 0);
   StatusOr<Computation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
-  EXPECT_MATCH(
-      computation_status.status().ToString(),
-      testing::ContainsRegex("Concatenate expects at least one argument"));
+  EXPECT_THAT(computation_status.status().ToString(),
+              HasSubstr("Concatenate expects at least one argument"));
 }
 
 // Concatenate with one argument works.
@@ -56,6 +57,15 @@ XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) {
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
+XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({});
+  auto concatenated = builder.ConcatInDim({a}, 0);
+
+  std::vector<float> expected = {};
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
 // Show that we can't concatenate R0 with R0 because we can't name the dimension
 // to concatenate on.
 XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) {
@@ -65,9 +75,8 @@ XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) {
   auto concatenated = builder.ConcatInDim({a, b}, 0);
   StatusOr<Computation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
-  EXPECT_MATCH(computation_status.status().ToString(),
-               testing::ContainsRegex(
-                   "dimension to concatenate along out of bounds: 0"));
+  EXPECT_THAT(computation_status.status().ToString(),
+              HasSubstr("dimension to concatenate along out of bounds: 0"));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L0) {
@@ -404,10 +413,9 @@ XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
   auto concatenated = builder.ConcatInDim({x, y}, 0);
   StatusOr<Computation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
-  EXPECT_MATCH(
+  EXPECT_THAT(
       computation_status.status().ToString(),
-      testing::ContainsRegex(
-          "Expected non-opaque argument for operand of concatenation"));
+      HasSubstr("Expected non-opaque argument for operand of concatenation"));
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralBoxedPredicates) {
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index 9f38dc4b365672733ed773043f77bc4a3e8405ef..4aff6dc7d57f635fbb8a14c2bdeb5581e00119c9 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -25,12 +25,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -43,8 +42,8 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
   auto dimension_numbers_status =
       ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
-  ASSERT_MATCH(dimension_numbers_status.status().error_message(),
-               testing::ContainsRegex("input are not unique"));
+  ASSERT_THAT(dimension_numbers_status.status().error_message(),
+              ::testing::HasSubstr("input are not unique"));
 }
 
 // Tests the convolution operation with invalid weight dimension numbers.
@@ -52,8 +51,8 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
   auto dimension_numbers_status =
       ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 2, 3, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
-  ASSERT_MATCH(dimension_numbers_status.status().error_message(),
-               testing::ContainsRegex("weight are not unique"));
+  ASSERT_THAT(dimension_numbers_status.status().error_message(),
+              ::testing::HasSubstr("weight are not unique"));
 }
 
 XLA_TEST_F(ConvolutionDimensionNumbersTest,
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 29e29505333b64926cdd0b3e9fe7ef3407eaaec2..8ea97e67d640d97baa70cddf60f3336a8849552a 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -256,6 +257,22 @@ XLA_TEST_F(CopyOpTest, CopyConstantR4Layout0312_MultipleTilesPerLayer) {
   TestCopyConstantLayoutR4(2, 14, 5, 35, {0, 3, 1, 2});
 }
 
+using CopyOpClientTest = ClientLibraryTestBase;
+
+XLA_TEST_F(CopyOpClientTest, Copy0x0) {
+  Shape in_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {0, 1});
+  Shape out_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {1, 0});
+  auto empty = LiteralUtil::CreateFromShape(in_shape);
+
+  ComputationBuilder builder(client_, TestName());
+  auto param0 = builder.Parameter(0, in_shape, "input");
+  auto input_data = client_->TransferToServer(*empty).ConsumeValueOrDie();
+
+  auto actual = ExecuteAndTransfer(&builder, {input_data.get()}, &out_shape)
+                    .ConsumeValueOrDie();
+  LiteralTestUtil::ExpectEqual(*empty, *actual);
+}
+
 }  // namespace
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index dc54c9defec2394049c38781a8d02fc8bd05158a..f7dcf68c1b63a2efeb226965dd3a09963e876f2a 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -29,23 +29,22 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/test.h"
 
-extern "C" void __attribute__((visibility("default")))
-R0F32Add2(float* out, float** in) {
+
+extern "C" void TF_EXPORT R0F32Add2(float* out, float** in) {
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float*));
   *out = **in + 2.0f;
 }
 
-extern "C" void __attribute__((visibility("default")))
-R2F32ReduceSum(float* out, float** in) {
+extern "C" void TF_EXPORT R2F32ReduceSum(float* out, float** in) {
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float) * 4);
   float* array = in[0];
   *out = array[0] + array[1] + array[2] + array[3];
 }
 
-extern "C" void __attribute__((visibility("default")))
-Add1ToValues(float* out, float** in) {
+extern "C" void TF_EXPORT Add1ToValues(float* out, float** in) {
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(in, sizeof(float) * 4);
   float* array = in[0];
   out[0] = array[0] + 1;
diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc
index 528efd2942b0ebbba16faba2a0543a2694cd5c2a..cc3c4a2a5e115d7791e8574f4ead17f77dcd5e7c 100644
--- a/tensorflow/compiler/xla/tests/deallocation_test.cc
+++ b/tensorflow/compiler/xla/tests/deallocation_test.cc
@@ -21,15 +21,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
 
+using ::testing::HasSubstr;
+
 class DeallocationTest : public ClientLibraryTestBase {
  protected:
   // Build and execute the given computation then verify the results can be
@@ -50,7 +52,7 @@ TEST_F(DeallocationTest, DeallocateScalar) {
   builder.ConstantR0<float>(42.0);
   auto global_data = ExecuteAndCheckTransfer(&builder, {});
 
-  // A result can be transfered an arbitrary number of times.  Add an extra
+  // A result can be transferred an arbitrary number of times.  Add an extra
   // transfer here so we're not just testing that a second call to Transfer
   // fails.
   ASSERT_IS_OK(client_->Transfer(*global_data).status());
@@ -59,8 +61,8 @@ TEST_F(DeallocationTest, DeallocateScalar) {
 
   auto transfer_status = client_->Transfer(*global_data);
   ASSERT_FALSE(transfer_status.ok());
-  ASSERT_MATCH(transfer_status.status().error_message(),
-               testing::HasSubstr("was previously deallocated"));
+  ASSERT_THAT(transfer_status.status().error_message(),
+              HasSubstr("was previously deallocated"));
 }
 
 TEST_F(DeallocationTest, DeallocateVector) {
@@ -72,8 +74,8 @@ TEST_F(DeallocationTest, DeallocateVector) {
 
   auto transfer_status = client_->Transfer(*global_data);
   ASSERT_FALSE(transfer_status.ok());
-  ASSERT_MATCH(transfer_status.status().error_message(),
-               testing::HasSubstr("was previously deallocated"));
+  ASSERT_THAT(transfer_status.status().error_message(),
+              HasSubstr("was previously deallocated"));
 }
 
 TEST_F(DeallocationTest, DeallocateEmptyVector) {
@@ -85,8 +87,8 @@ TEST_F(DeallocationTest, DeallocateEmptyVector) {
 
   auto transfer_status = client_->Transfer(*global_data);
   ASSERT_FALSE(transfer_status.ok());
-  ASSERT_MATCH(transfer_status.status().error_message(),
-               testing::HasSubstr("was previously deallocated"));
+  ASSERT_THAT(transfer_status.status().error_message(),
+              HasSubstr("was previously deallocated"));
 }
 
 XLA_TEST_F(DeallocationTest, DeallocateTuple) {
@@ -99,8 +101,8 @@ XLA_TEST_F(DeallocationTest, DeallocateTuple) {
 
   auto transfer_status = client_->Transfer(*global_data);
   ASSERT_FALSE(transfer_status.ok());
-  ASSERT_MATCH(transfer_status.status().error_message(),
-               testing::HasSubstr("was previously deallocated"));
+  ASSERT_THAT(transfer_status.status().error_message(),
+              HasSubstr("was previously deallocated"));
 }
 
 XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) {
@@ -114,8 +116,8 @@ XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) {
 
   auto transfer_status = client_->Transfer(*global_data);
   ASSERT_FALSE(transfer_status.ok());
-  ASSERT_MATCH(transfer_status.status().error_message(),
-               testing::HasSubstr("was previously deallocated"));
+  ASSERT_THAT(transfer_status.status().error_message(),
+              HasSubstr("was previously deallocated"));
 }
 
 XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) {
@@ -130,8 +132,8 @@ XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) {
 
   auto transfer_status = client_->Transfer(*global_data);
   ASSERT_FALSE(transfer_status.ok());
-  ASSERT_MATCH(transfer_status.status().error_message(),
-               testing::HasSubstr("was previously deallocated"));
+  ASSERT_THAT(transfer_status.status().error_message(),
+              HasSubstr("was previously deallocated"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
index 57a7c61b141f3e8c5cf3ecc7e34043a79129c01b..60ce2b1b58c6a3b93d394b1fcd8066313ec30e9d 100644
--- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -34,6 +35,10 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using ::testing::ContainsRegex;
+using ::testing::ElementsAre;
+using ::testing::HasSubstr;
+
 class DeconstructTupleTest : public ClientLibraryTestBase {
  protected:
   // Build and execute the given computation then verify the results can be
@@ -63,9 +68,9 @@ TEST_F(DeconstructTupleTest, DeconstructTuple) {
   auto handles = result_status.ConsumeValueOrDie();
   std::vector<float> copy(4);
   ASSERT_IS_OK(client_->TransferInProcess(*handles[0], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(copy, ElementsAre(1.0, 2.0, 3.0, 4.0));
   ASSERT_IS_OK(client_->TransferInProcess(*handles[1], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({2.0, 4.0, 6.0, 8.0}));
+  EXPECT_THAT(copy, ElementsAre(2.0, 4.0, 6.0, 8.0));
 }
 
 TEST_F(DeconstructTupleTest, DeconstructTupleTwice) {
@@ -85,16 +90,16 @@ TEST_F(DeconstructTupleTest, DeconstructTupleTwice) {
   std::vector<float> copy(4);
 
   ASSERT_IS_OK(client_->TransferInProcess(*handles1[0], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(copy, ElementsAre(1.0, 2.0, 3.0, 4.0));
   ASSERT_IS_OK(client_->TransferInProcess(*handles1[1], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({2.0, 4.0, 6.0, 8.0}));
+  EXPECT_THAT(copy, ElementsAre(2.0, 4.0, 6.0, 8.0));
   handles1[0].reset();
   handles1[1].reset();
 
   ASSERT_IS_OK(client_->TransferInProcess(*handles2[0], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(copy, ElementsAre(1.0, 2.0, 3.0, 4.0));
   ASSERT_IS_OK(client_->TransferInProcess(*handles2[1], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({2.0, 4.0, 6.0, 8.0}));
+  EXPECT_THAT(copy, ElementsAre(2.0, 4.0, 6.0, 8.0));
 }
 
 XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) {
@@ -114,13 +119,13 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) {
 
   std::vector<float> copy(4);
   ASSERT_IS_OK(client_->TransferInProcess(*handles[0], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(copy, ElementsAre(1.0, 2.0, 3.0, 4.0));
   ASSERT_IS_OK(client_->TransferInProcess(*handles[1], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({2.0, 4.0, 6.0, 8.0}));
+  EXPECT_THAT(copy, ElementsAre(2.0, 4.0, 6.0, 8.0));
   ASSERT_IS_OK(client_->TransferInProcess(*handles[2], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({2.0, 4.0, 6.0, 8.0}));
+  EXPECT_THAT(copy, ElementsAre(2.0, 4.0, 6.0, 8.0));
   ASSERT_IS_OK(client_->TransferInProcess(*handles[3], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(copy, ElementsAre(1.0, 2.0, 3.0, 4.0));
 }
 
 TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) {
@@ -140,17 +145,17 @@ TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) {
 
   std::vector<float> copy(4);
   ASSERT_IS_OK(client_->TransferInProcess(*handles[0], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(copy, ElementsAre(1.0, 2.0, 3.0, 4.0));
   ASSERT_IS_OK(client_->TransferInProcess(*handles[1], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({2.0, 4.0, 6.0, 8.0}));
+  EXPECT_THAT(copy, ElementsAre(2.0, 4.0, 6.0, 8.0));
   ASSERT_IS_OK(client_->TransferInProcess(*handles[2], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(copy, ElementsAre(1.0, 2.0, 3.0, 4.0));
 
   /// Try deallocating one of the repeated elements, then copy
   handles[0].reset();
 
   ASSERT_IS_OK(client_->TransferInProcess(*handles[2], &copy[0]));
-  EXPECT_MATCH(copy, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(copy, ElementsAre(1.0, 2.0, 3.0, 4.0));
 }
 
 TEST_F(DeconstructTupleTest, DeconstructNonTuple) {
@@ -160,8 +165,8 @@ TEST_F(DeconstructTupleTest, DeconstructNonTuple) {
 
   auto result_status = client_->DeconstructTuple(*global_data);
   EXPECT_FALSE(result_status.ok());
-  EXPECT_MATCH(result_status.status().error_message(),
-               testing::ContainsRegex("global data handle .* is not a tuple"));
+  EXPECT_THAT(result_status.status().error_message(),
+              ContainsRegex("global data handle .* is not a tuple"));
 }
 
 XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) {
@@ -189,9 +194,8 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructNestedTuple) {
 
   auto result_status = client_->DeconstructTuple(*global_data);
   EXPECT_FALSE(result_status.ok());
-  EXPECT_MATCH(
-      result_status.status().error_message(),
-      testing::ContainsRegex("deconstructing nested tuples not yet supported"));
+  EXPECT_THAT(result_status.status().error_message(),
+              HasSubstr("deconstructing nested tuples not yet supported"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 1d1fb337075855372ae54ac3c7e9abf55a6c32f1..cdb4498f4ed1e4f7fb2ad7a29a1cec4e26b76ed3 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -108,7 +109,7 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   template <typename IndexT>
   void RunR1(const std::vector<float>& input_values,
              const std::vector<IndexT> slice_starts,
-             const std::vector<int64> slice_sizes,
+             const std::vector<int64>& slice_sizes,
              const std::vector<float>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
@@ -126,7 +127,7 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   template <typename IndexT>
   void RunR2(const Array2D<float>& input_values,
              const std::vector<IndexT> slice_starts,
-             const std::vector<int64> slice_sizes,
+             const std::vector<int64>& slice_sizes,
              const Array2D<float>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
@@ -144,7 +145,7 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   template <typename IndexT>
   void RunR3(const Array3D<float>& input_values,
              const std::vector<IndexT> slice_starts,
-             const std::vector<int64> slice_sizes,
+             const std::vector<int64>& slice_sizes,
              const Array3D<float>& expected_values) {
     ComputationBuilder builder(client_, TestName());
     // Initialize and transfer dynamic slice start indices parameter.
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 62878fed5549a6720a782d01c292ff143187e9a4..ca15f7395da79d7c5c05c03b6fafdca9e6953955 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -94,10 +94,10 @@ StatusOr<se::DeviceMemoryBase> HloTestBase::Execute(
           << LayoutUtil::HumanString(module_config->entry_computation_layout()
                                          .result_layout()
                                          .layout());
+  hlo_module->set_config(*module_config);
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend_->compiler()->Compile(std::move(hlo_module),
-                                    std::move(module_config), test_hlo_dumper_,
+      backend_->compiler()->Compile(std::move(hlo_module), test_hlo_dumper_,
                                     backend_->default_stream_executor()));
 
   se::Stream stream(backend_->default_stream_executor());
@@ -111,8 +111,9 @@ StatusOr<se::DeviceMemoryBase> HloTestBase::Execute(
       backend_->eigen_intra_op_thread_pool_device());
 
   HloExecutionProfile hlo_execution_profile;
-  ServiceExecutableRunOptions service_run_options(run_options,
-                                                  backend_->StreamBorrower());
+  ServiceExecutableRunOptions service_run_options(
+      run_options, backend_->StreamBorrower(),
+      backend_->inter_op_thread_pool());
   TF_ASSIGN_OR_RETURN(
       se::DeviceMemoryBase result,
       executable->ExecuteOnStream(&service_run_options, arguments,
@@ -123,9 +124,7 @@ StatusOr<se::DeviceMemoryBase> HloTestBase::Execute(
 
   *result_shape = executable->result_shape();
 
-  // TODO(b/36256956) Ideally tuple elements could always be distinct buffers.
-  if (ShapeUtil::IsTuple(*result_shape) &&
-      backend_->transfer_manager()->TupleElementsAreDistinctBuffers()) {
+  if (ShapeUtil::IsTuple(*result_shape)) {
     // We must record element buffers of tuples as well to avoid leaks.
     DCHECK(!ShapeUtil::IsNestedTuple(*result_shape));
     TF_ASSIGN_OR_RETURN(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 6119473d8158fe87b3611a3edc3490058556288a..d94602ffda2ea6cba8d734b0d814ae5d0dbbd28d 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -65,7 +65,7 @@ class HloTestBase : public ::testing::Test {
   perftools::gputools::DeviceMemoryBase TransferToDevice(
       const Literal& literal);
 
-  // Transfers the array refered to by the given handle from the device and
+  // Transfers the array referred to by the given handle from the device and
   // returns as a Literal.
   std::unique_ptr<Literal> TransferFromDevice(
       const Shape& shape, perftools::gputools::DeviceMemoryBase device_base);
@@ -84,28 +84,6 @@ class HloTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           arguments);
 
-  // Helpers for comparing ordered and unordered equality of HloInstruction
-  // containers.
-  void ExpectEqOrdered(
-      tensorflow::gtl::ArraySlice<const HloInstruction*> actual,
-      tensorflow::gtl::ArraySlice<const HloInstruction*> expected) {
-    std::vector<const HloInstruction*> expected_vec(expected.begin(),
-                                                    expected.end());
-    std::vector<const HloInstruction*> actual_vec(actual.begin(), actual.end());
-    EXPECT_TRUE(testing::VectorMatcher<const HloInstruction*>(expected_vec)(
-        actual_vec));
-  }
-
-  void ExpectEqUnordered(
-      tensorflow::gtl::ArraySlice<const HloInstruction*> actual,
-      tensorflow::gtl::ArraySlice<const HloInstruction*> expected) {
-    std::vector<const HloInstruction*> expected_vec(expected.begin(),
-                                                    expected.end());
-    std::vector<const HloInstruction*> actual_vec(actual.begin(), actual.end());
-    EXPECT_TRUE(testing::UnorderedElementsAre<const HloInstruction*>(
-        expected_vec)(actual_vec));
-  }
-
   string TestName() const;
 
   std::unique_ptr<Backend> backend_;
diff --git a/tensorflow/compiler/xla/tests/inprocess_service_test.cc b/tensorflow/compiler/xla/tests/inprocess_service_test.cc
index ea0be07872f31b8e3357d91a164ce8727a159f63..97adf7ad6c974a3e258ee42b04f6be2e04c04e9d 100644
--- a/tensorflow/compiler/xla/tests/inprocess_service_test.cc
+++ b/tensorflow/compiler/xla/tests/inprocess_service_test.cc
@@ -26,13 +26,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/cpu_compiler_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -74,7 +74,7 @@ XLA_TEST_F(InProcessServiceTest, TransferFromServer) {
 
   std::vector<int32> result(3, 0);
   ASSERT_IS_OK(client_->TransferInProcess(*handle, result.data()));
-  EXPECT_MATCH(result, testing::VectorMatcher<int32>({1, 42, 5}));
+  EXPECT_THAT(result, ::testing::ElementsAre(1, 42, 5));
 }
 
 XLA_TEST_F(InProcessServiceTest, TransferToServer) {
@@ -148,7 +148,7 @@ XLA_TEST_F(InProcessServiceTest, ExecuteRowMajor) {
   Shape shape;
   ASSERT_IS_OK(client_->TransferInProcess(*handle, result.data()));
 
-  EXPECT_MATCH(result, testing::VectorMatcher<float>({1.0, 2.0, 3.0, 4.0}));
+  EXPECT_THAT(result, ::testing::ElementsAre(1.0, 2.0, 3.0, 4.0));
 }
 
 XLA_TEST_F(InProcessServiceTest, ExecuteColumnMajor) {
@@ -159,7 +159,7 @@ XLA_TEST_F(InProcessServiceTest, ExecuteColumnMajor) {
   Shape shape;
   ASSERT_IS_OK(client_->TransferInProcess(*handle, result.data()));
 
-  EXPECT_MATCH(result, testing::VectorMatcher<float>({1.0, 3.0, 2.0, 4.0}));
+  EXPECT_THAT(result, ::testing::ElementsAre(1.0, 3.0, 2.0, 4.0));
 }
 
 XLA_TEST_F(InProcessServiceTest, ExecuteAndReuseDifferentLayouts) {
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index f7bbc0f38bb501e042542cf7f0a3d4fadb3a2a23..23453db57bc4a5db0d3a4f7c327e3313333d1ae2 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -76,11 +76,11 @@ string Hostname() {
 // between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
 // -- on miscompare, a nice error message is given in the AssertionFailure.
 template <typename FloatT, typename UnsignedT>
-testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
+::testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
   auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
   auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
   if (ulhs != urhs) {
-    return testing::AssertionFailure() << tensorflow::strings::Printf(
+    return ::testing::AssertionFailure() << tensorflow::strings::Printf(
                "floating values are not bitwise-equal; and equality testing "
                "was requested: %s=%g=%a vs %s=%g=%a",
                tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs))
@@ -90,33 +90,33 @@ testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
                    .c_str(),
                rhs, rhs);
   }
-  return testing::AssertionSuccess();
+  return ::testing::AssertionSuccess();
 }
 
 // Templated comparator that specializes for float equality comparison with the
 // bitwise helper above (this is the un-specialized fallback, to just use the
 // default gunit implementation).
 template <typename NativeT>
-testing::AssertionResult CompareEqual(NativeT lhs, NativeT rhs) {
+::testing::AssertionResult CompareEqual(NativeT lhs, NativeT rhs) {
   if (lhs == rhs) {
-    return testing::AssertionSuccess();
+    return ::testing::AssertionSuccess();
   }
   ::testing::Message msg;
   msg << "Expected equality of these values:";
   msg << "\n  " << lhs;
   msg << "\n  " << rhs;
 
-  return testing::AssertionFailure() << msg;
+  return ::testing::AssertionFailure() << msg;
 }
 
 // Specializations for floating types that do bitwise comparisons when equality
 // comparison is requested.
 template <>
-testing::AssertionResult CompareEqual<float>(float lhs, float rhs) {
+::testing::AssertionResult CompareEqual<float>(float lhs, float rhs) {
   return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs);
 }
 template <>
-testing::AssertionResult CompareEqual<double>(double lhs, double rhs) {
+::testing::AssertionResult CompareEqual<double>(double lhs, double rhs) {
   return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs);
 }
 
@@ -130,7 +130,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   if (dimension == expected.shape().dimensions_size()) {
     NativeT expected_value = LiteralUtil::Get<NativeT>(expected, multi_index);
     NativeT actual_value = LiteralUtil::Get<NativeT>(actual, multi_index);
-    testing::AssertionResult result =
+    ::testing::AssertionResult result =
         CompareEqual<NativeT>(expected_value, actual_value);
     return result;  // Defines implicit coersion to bool.
   }
@@ -159,7 +159,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   EXPECT_FALSE(Equal(expected, actual));
 }
 
-/* static */ testing::AssertionResult LiteralTestUtil::Equal(
+/* static */ ::testing::AssertionResult LiteralTestUtil::Equal(
     const Literal& expected, const Literal& actual) {
   VLOG(1) << "expected: " << LiteralUtil::ToString(expected);
   VLOG(1) << "actual:   " << LiteralUtil::ToString(actual);
@@ -207,9 +207,9 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
           << "Unsupported primitive type in LiteralTestUtil::ExpectEqual: "
           << PrimitiveType_Name(expected.shape().element_type());
   }
-  testing::AssertionResult result = testing::AssertionSuccess();
+  ::testing::AssertionResult result = ::testing::AssertionSuccess();
   if (!match) {
-    result = testing::AssertionFailure()
+    result = ::testing::AssertionFailure()
              << "expected: " << LiteralUtil::ToString(expected)
              << "\nactual:   " << LiteralUtil::ToString(actual);
     VLOG(1) << result.message();
@@ -314,7 +314,7 @@ class NearComparator {
 
  private:
   // EXPECTs that the two given scalar values are within the error bound. Keeps
-  // track of how many mismatches have occured to keep the size of the output
+  // track of how many mismatches have occurred to keep the size of the output
   // manageable.
   template <typename NativeT>
   bool ExpectValuesNear(NativeT expected, NativeT actual) {
@@ -421,12 +421,12 @@ class NearComparator {
 
 }  // namespace
 
-/* static */ testing::AssertionResult LiteralTestUtil::Near(
+/* static */ ::testing::AssertionResult LiteralTestUtil::Near(
     const Literal& expected, const Literal& actual, const ErrorSpec& error) {
   NearComparator comparator(error);
   return comparator.ExpectNear(expected, actual)
-             ? testing::AssertionSuccess()
-             : testing::AssertionFailure() << "values were not near";
+             ? ::testing::AssertionSuccess()
+             : ::testing::AssertionFailure() << "values were not near";
 }
 
 /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected,
@@ -435,14 +435,14 @@ class NearComparator {
   EXPECT_TRUE(Near(expected, actual, error));
 }
 
-/* static */ testing::AssertionResult LiteralTestUtil::NearTuple(
+/* static */ ::testing::AssertionResult LiteralTestUtil::NearTuple(
     const Literal& expected, const Literal& actual, const ErrorSpec& error) {
   VLOG(1) << "expected: " << LiteralUtil::ToString(expected);
   VLOG(1) << "actual:   " << LiteralUtil::ToString(actual);
 
   if (!ShapeUtil::IsTuple(expected.shape()) ||
       !ShapeUtil::IsTuple(actual.shape())) {
-    return testing::AssertionFailure()
+    return ::testing::AssertionFailure()
            << "tuples expected expected shape = "
            << expected.shape().ShortDebugString()
            << " actual shape = " << actual.shape().ShortDebugString();
@@ -469,7 +469,7 @@ class NearComparator {
     }
   }
 
-  return testing::AssertionSuccess();
+  return ::testing::AssertionSuccess();
 }
 
 /* static */ void LiteralTestUtil::ExpectNearTuple(const Literal& expected,
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 85656a53e4400f2b0522e20a7b46922016432103..4f98083033310baf6ec95de0d2331d1aff8f3f7d 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -18,12 +18,14 @@ limitations under the License.
 
 #include <initializer_list>
 #include <memory>
+#include <random>
 #include <string>
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -57,7 +59,7 @@ class LiteralTestUtil {
   // Asserts that the expected and actual literals are (bitwise) equal for all
   // elements in the literal. Also, asserts that the rank, dimensions sizes, and
   // primitive type are equal.
-  static testing::AssertionResult Equal(
+  static ::testing::AssertionResult Equal(
       const Literal& expected, const Literal& actual) TF_MUST_USE_RESULT;
 
   // Expects that expected and actual are Equal.
@@ -101,7 +103,7 @@ class LiteralTestUtil {
   // Asserts that the expected and actual literals are within the given error
   // bound for all elements. Also, asserts that the rank, dimensions sizes, and
   // bounds are equivalent. Only supported for floating point values.
-  static testing::AssertionResult Near(
+  static ::testing::AssertionResult Near(
       const Literal& expected, const Literal& actual,
       const ErrorSpec& error) TF_MUST_USE_RESULT;
 
@@ -147,7 +149,7 @@ class LiteralTestUtil {
   // tuples are within the given error bound. Tuples are matched recursively.
   // If the elements of the tuple are not floating-point types, the error spec
   // is ignored and exact equality is checked.
-  static testing::AssertionResult NearTuple(
+  static ::testing::AssertionResult NearTuple(
       const Literal& expected, const Literal& actual,
       const ErrorSpec& error) TF_MUST_USE_RESULT;
 
@@ -170,6 +172,36 @@ class LiteralTestUtil {
       tensorflow::gtl::ArraySlice<int64> minor_to_major,
       const Literal& literal);
 
+  // Creates a literal with the supplied shape, and uses the provided value
+  // generator to populate the literal's values.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape,
+      const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator);
+
+  // Creates a literal with the supplied shape, and initializes the literal
+  // values using a normal distribution with given mean and stddev standard
+  // deviation, and using the engine as entropy generator.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type, typename E,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape, E* engine, T mean, T stddev);
+
+  // Creates a literal with the supplied shape, and initializes the literal
+  // values using a normal distribution with given mean and stddev standard
+  // deviation.
+  // Returns the new literal object, or an error Status if failed.
+  template <
+      PrimitiveType type,
+      typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+  static StatusOr<std::unique_ptr<Literal>> CreateRandomLiteral(
+      const Shape& shape, T mean, T stddev);
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(LiteralTestUtil);
 };
@@ -269,6 +301,40 @@ template <typename NativeT>
   ExpectNear(*LiteralUtil::CreateR4FromArray4D(expected), actual, error);
 }
 
+template <PrimitiveType type, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(
+    const Shape& shape,
+    const std::function<T(tensorflow::gtl::ArraySlice<int64>)>& generator) {
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
+  TF_RET_CHECK(shape.element_type() == type);
+  std::unique_ptr<Literal> literal = LiteralUtil::CreateFromShape(shape);
+  TF_RETURN_IF_ERROR(LiteralUtil::Populate<NativeT>(
+      literal.get(), [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+        return generator(indexes);
+      }));
+  return std::move(literal);
+}
+
+template <PrimitiveType type, typename E, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(const Shape& shape, E* engine, T mean,
+                                     T stddev) {
+  using NativeT = typename primitive_util::PrimitiveTypeToNative<type>::type;
+  std::normal_distribution<NativeT> generator(mean, stddev);
+  return CreateRandomLiteral<type, NativeT>(
+      shape, [&](tensorflow::gtl::ArraySlice<int64> /*indexes*/) {
+        return generator(*engine);
+      });
+}
+
+template <PrimitiveType type, typename T>
+/* static */ StatusOr<std::unique_ptr<Literal>>
+LiteralTestUtil::CreateRandomLiteral(const Shape& shape, T mean, T stddev) {
+  std::minstd_rand0 engine;
+  return CreateRandomLiteral<type>(shape, &engine, mean, stddev);
+}
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TESTS_LITERAL_TEST_UTIL_H_
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index 7ea83a9e956ca8b5bb26ea6aaa844d2b63107328..52816dc72cc4d094054b2aea72f0cc63c7ff478d 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -42,7 +42,7 @@ xla::Computation Doubler(xla::Client* client) {
 int main(int argc, char** argv) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
-  auto client = xla::ClientLibrary::LocalClientOrDie();
+  auto client = xla::ClientLibrary::GetOrCreateCompileOnlyClient().ValueOrDie();
 
   xla::ComputationBuilder builder(client, "aot_test_helper");
   auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
@@ -74,7 +74,7 @@ int main(int argc, char** argv) {
   llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string));
 
   xla::Computation computation = builder.Build().ConsumeValueOrDie();
-  xla::LocalClient::AheadOfTimeComputationInstance instance{
+  xla::CompileOnlyClient::AotComputationInstance instance{
       &computation, /*argument_layouts=*/{&opaque_shape}, &r0f32};
 
   xla::cpu::CpuAotCompilationOptions options(
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 7fe4c9020f4c67ecc9888425cf0a2c358ad49e6d..7fcf687655a98d3ee972f8d3b784be655410a313 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -17,12 +17,19 @@ limitations under the License.
 
 #include <vector>
 
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -91,16 +98,34 @@ int64 TestAllocator::deallocation_count(int device_ordinal) const {
   return allocator_;
 }
 
+// Define this in .cc file to avoid having to include eigen or forward declare
+// these types in the header.
+struct LocalClientTestBase::EigenThreadPoolWrapper {
+  explicit EigenThreadPoolWrapper()
+      : pool(new tensorflow::thread::ThreadPool(
+            tensorflow::Env::Default(), "XLAEigenTest", /*num_threads=*/2)),
+        wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
+        device(new Eigen::ThreadPoolDevice(wrapper.get(),
+                                           wrapper->NumThreads())) {}
+
+  std::unique_ptr<tensorflow::thread::ThreadPool> pool;
+  std::unique_ptr<tensorflow::EigenThreadPoolWrapper> wrapper;
+  std::unique_ptr<Eigen::ThreadPoolDevice> device;
+};
+
 LocalClientTestBase::LocalClientTestBase(
     perftools::gputools::Platform* platform)
     : local_client_(
-          ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()) {
+          ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()),
+      thread_pool_wrapper_(new EigenThreadPoolWrapper()) {
   stream_executor_ = PlatformUtil::GetStreamExecutors(local_client_->platform())
                          .ValueOrDie()[local_client_->default_device_ordinal()];
   transfer_manager_ =
       TransferManager::GetForPlatform(local_client_->platform()).ValueOrDie();
 }
 
+LocalClientTestBase::~LocalClientTestBase() {}
+
 std::unique_ptr<ScopedShapedBuffer>
 LocalClientTestBase::LiteralToScopedShapedBuffer(const Literal& literal) {
   return LiteralToScopedShapedBuffer(literal,
@@ -190,8 +215,7 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const {
   ExecutableRunOptions run_options;
   run_options.set_inter_op_thread_pool(
       local_client_->backend().inter_op_thread_pool());
-  run_options.set_intra_op_thread_pool(
-      local_client_->backend().eigen_intra_op_thread_pool_device());
+  run_options.set_intra_op_thread_pool(thread_pool_wrapper_->device.get());
   run_options.set_allocator(GetOrCreateAllocator(local_client_->platform()));
   return run_options;
 }
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 4e7b05cea60887eec628ce9b4848321e721030e5..e3c3bb46cf26cc742b7abb39a3e457d823d829ec 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -74,8 +74,10 @@ class TestAllocator : public StreamExecutorMemoryAllocator {
 // A base class for tests which exercise the LocalClient interface.
 class LocalClientTestBase : public ::testing::Test {
  protected:
+  struct EigenThreadPoolWrapper;
   explicit LocalClientTestBase(
       perftools::gputools::Platform* platform = nullptr);
+  virtual ~LocalClientTestBase();
 
   static TestAllocator* GetOrCreateAllocator(
       perftools::gputools::Platform* platform);
@@ -142,6 +144,8 @@ class LocalClientTestBase : public ::testing::Test {
   TransferManager* transfer_manager_;
 
   LocalClient* local_client_;
+
+  std::unique_ptr<EigenThreadPoolWrapper> thread_pool_wrapper_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 2433c5653a6562b9672eeff81192dfc3152dffed..3cfa89e2e7d8d145932f0ceca0df349da3695f38 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -33,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
-#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -529,9 +529,9 @@ TEST_F(MapTest, MapOperantionWithBuildError) {
 
   StatusOr<Computation> computation_status = builder.Build();
   ASSERT_TRUE(!computation_status.ok());
-  EXPECT_MATCH(computation_status.status().ToString(),
-               testing::HasSubstr("error from: ErrorAdd: binary op with "
-                                  "different element types: f32[] and u16[]"));
+  EXPECT_THAT(computation_status.status().ToString(),
+              ::testing::HasSubstr("error from: ErrorAdd: binary op with "
+                                   "different element types: f32[] and u16[]"));
 }
 
 // MapTest disables inline and algsimp. MapTestWithFullOpt runs all
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 0cd0f97b0621d771ae039f0be6bd6c67161b49a4..a0f98fcfef3b73f8ffff67ef679041197f78f5ba 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -55,7 +56,7 @@ void PrngTest::UniformTest(T a, T b, tensorflow::gtl::ArraySlice<int64> dims) {
 
   SetSeed(42);
   auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{});
-  EXPECT_TRUE(ContainersEqual(dims, actual->shape().dimensions()));
+  EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   LiteralUtil::EachCell<T>(*actual,
                            [=](tensorflow::gtl::ArraySlice<int64>, T value) {
                              EXPECT_LE(a, value);
@@ -75,7 +76,7 @@ void PrngTest::BernoulliTest(float p, tensorflow::gtl::ArraySlice<int64> dims) {
       auto actual,
       client_->ExecuteAndTransfer(computation, /*arguments=*/{},
                                   &execution_options));
-  EXPECT_TRUE(ContainersEqual(dims, actual->shape().dimensions()));
+  EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions()));
   int32 sum = 0;
   LiteralUtil::EachCell<uint32>(
       *actual, [&sum](tensorflow::gtl::ArraySlice<int64>, uint32 value) {
@@ -193,7 +194,7 @@ XLA_TEST_F(PrngTest, MapUsingRng) {
   }
 }
 
-// This tests demonstrates the global seeding behaviour.
+// This tests demonstrates the global seeding behavior.
 // * If a seed is passed in via Execute (ExecuteAndTransfer) then the output is
 //   fixed (i.e., there is a single output for a given seed);
 // * If no seed is passed in then the output of every call can be different;
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 34fce21758b98c52831ac4ddb168d3e1538e9f1d..feb2b465fca6b1ffda190025568470e8daf297a3 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -61,7 +61,7 @@ namespace {
 class ReduceTest : public ClientLibraryTestBase {
  protected:
   ReduceTest() {
-    // Implementation note: layed out z >> y >> x by default.
+    // Implementation note: laid out z >> y >> x by default.
     // clang-format off
     literal_2d_ = LiteralUtil::CreateR2<float>({
       // x0   x1   x2
@@ -211,9 +211,9 @@ XLA_TEST_F(ReduceTest, ReduceR1_0_F32_To_R0) { RunR1ToR0Test(0); }
 XLA_TEST_F(ReduceTest, ReduceR1_1_F32_To_R0) { RunR1ToR0Test(1); }
 XLA_TEST_F(ReduceTest, ReduceR1_2_F32_To_R0) { RunR1ToR0Test(2); }
 XLA_TEST_F(ReduceTest, ReduceR1_16_F32_To_R0) { RunR1ToR0Test(16); }
-XLA_TEST_F(ReduceTest, ReduceR1_240_F32_To_R0) { RunR1ToR0Test(240); }
 XLA_TEST_F(ReduceTest, ReduceR1_128_F32_To_R0) { RunR1ToR0Test(128); }
 XLA_TEST_F(ReduceTest, ReduceR1_129_F32_To_R0) { RunR1ToR0Test(129); }
+XLA_TEST_F(ReduceTest, ReduceR1_240_F32_To_R0) { RunR1ToR0Test(240); }
 XLA_TEST_F(ReduceTest, ReduceR1_256_F32_To_R0) { RunR1ToR0Test(256); }
 XLA_TEST_F(ReduceTest, ReduceR1_1024_F32_To_R0) { RunR1ToR0Test(1024); }
 XLA_TEST_F(ReduceTest, ReduceR1_2048_F32_To_R0) { RunR1ToR0Test(2048); }
@@ -221,6 +221,9 @@ XLA_TEST_F(ReduceTest, ReduceR1_16K_F32_To_R0) { RunR1ToR0Test(16 * 1024); }
 XLA_TEST_F(ReduceTest, ReduceR1_16KP1_F32_To_R0) {
   RunR1ToR0Test(16 * 1024 + 1);
 }
+XLA_TEST_F(ReduceTest, ReduceR1_64K_F32_To_R0) { RunR1ToR0Test(64 * 1024); }
+XLA_TEST_F(ReduceTest, ReduceR1_1M_F32_To_R0) { RunR1ToR0Test(1024 * 1024); }
+XLA_TEST_F(ReduceTest, ReduceR1_16M_F32_To_R0) { RunR1ToR0Test(4096 * 4096); }
 
 XLA_TEST_F(ReduceTest, ReduceR2_0x0_To_R0) { RunR2ToR0Test(0, 0); }
 XLA_TEST_F(ReduceTest, ReduceR2_0x2_To_R0) { RunR2ToR0Test(0, 2); }
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 56501e43b5c5d965ea4305f2ca88909b253ed273..c3b768579a401706eff4a2a24d840da84080d26d 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -43,7 +43,7 @@ class ReduceWindowTest : public ClientLibraryTestBase {
  public:
   ReduceWindowTest() : builder_(client_, TestName()) {}
 
-  void ReduceWindowAdd(ComputationDataHandle input,
+  void ReduceWindowAdd(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -52,7 +52,7 @@ class ReduceWindowTest : public ClientLibraryTestBase {
                           window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMax(ComputationDataHandle input,
+  void ReduceWindowMax(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -61,7 +61,7 @@ class ReduceWindowTest : public ClientLibraryTestBase {
         CreateScalarMax(), window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMin(ComputationDataHandle input,
+  void ReduceWindowMin(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -182,6 +182,7 @@ TEST_F(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) {
 
   ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
 }
+
 // TODO(b/31809540): Implement minor dim reduction to reduce num of reshapes.
 TEST_F(ReduceWindowTest, ReduceR4AmongXYMinorSmall) {
   Array4D<float> input_array(2, 2, 4, 16);
@@ -368,6 +369,16 @@ TEST_F(ReduceWindowTest, Add2x2In2x2Disjoint) {
   ComputeAndCompareR2<float>(&builder_, expected, {}, ErrorSpec(0.0001));
 }
 
+TEST_F(ReduceWindowTest, Add1x2In2x2Same) {
+  Array2D<float> input_array({{1.0f, 2.0f}, {3.0f, 4.0f}});
+  auto input = builder_.ConstantR2FromArray2D<float>(input_array);
+  ReduceWindowAdd(input, {1, 2}, {1, 1}, Padding::kSame);
+  Array2D<float> expected({
+      {3.0f, 2.0f}, {7.0f, 4.0f},
+  });
+  ComputeAndCompareR2<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+}
+
 XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x2) {
   Array3D<float> input_array(2, 1, 2);
   input_array(0, 0, 0) = 1000;
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index 18e6e2d3f1d6aedb68f83b8058517398760c39ba..c5f20b9ca1db1812f52a4d6f568ff9093016a90b 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -31,13 +31,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -68,6 +67,22 @@ XLA_TEST_F(ReshapeTest, SingleElementArrayToScalar) {
   ComputeAndCompareR0<float>(&builder, 1.0f, {}, zero_error_spec_);
 }
 
+XLA_TEST_F(ReshapeTest, ScalarToSingleElementArray) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> param0_literal = LiteralUtil::CreateR0<float>(1.0f);
+  std::unique_ptr<GlobalData> param0_data =
+      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
+
+  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0");
+  a = builder.Neg(a);
+  auto reshape =
+      builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
+
+  ComputeAndCompareR1<float>(&builder, {-1.0f}, {param0_data.get()},
+                             zero_error_spec_);
+}
+
 XLA_TEST_F(ReshapeTest, Trivial0x3) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 3));
@@ -76,6 +91,24 @@ XLA_TEST_F(ReshapeTest, Trivial0x3) {
   ComputeAndCompareR1<float>(&builder, {}, {}, zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-05-15
+// with an incorrect result rank.
+XLA_TEST_F(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::unique_ptr<Literal> param0_literal =
+      LiteralUtil::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
+  std::unique_ptr<GlobalData> param0_data =
+      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
+
+  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0, 3}), "param0");
+  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
+
+  ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
+                             zero_error_spec_);
+}
+
 XLA_TEST_F(ReshapeTest, Trivial3x0) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
@@ -383,15 +416,15 @@ XLA_TEST_F(ReshapeTest, ToScalar) {
 XLA_TEST_F(ReshapeTest, BadDimensions) {
   ComputationBuilder b(client_, TestName());
   b.Reshape(b.ConstantR1<int32>({1}), {}, {});
-  EXPECT_MATCH(ExecuteToString(&b, {}),
-               testing::HasSubstr("dimensions not a permutation"));
+  EXPECT_THAT(ExecuteToString(&b, {}),
+              ::testing::HasSubstr("dimensions not a permutation"));
 }
 
 XLA_TEST_F(ReshapeTest, BadNewSizes) {
   ComputationBuilder b(client_, TestName());
   b.Reshape(b.ConstantR1<int32>({1, 2}), {1}, {});
-  EXPECT_MATCH(ExecuteToString(&b, {}),
-               testing::HasSubstr("mismatched element counts"));
+  EXPECT_THAT(ExecuteToString(&b, {}),
+              ::testing::HasSubstr("mismatched element counts"));
 }
 
 XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index 134eb91a1fedf8624363c273813fe2145f64aab7..ceee24c307ed0a71200ba6b17b17a90ab009cd2d 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -245,37 +246,183 @@ XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsF32) {
   ComputeAndCompareR0<float>(&builder, 2.5f, {}, error_spec_);
 }
 
-XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsS32) {
-  ComputationBuilder builder(client_, TestName());
-  builder.Div(builder.ConstantR0<int32>(-5), builder.ConstantR0<int32>(2));
+struct DivS32Params {
+  int32 dividend;
+  int32 divisor;
+  int32 quotient;
+  int32 remainder;
+};
 
-  ComputeAndCompareR0<int32>(&builder, -2, {});
-}
+void PrintTo(const DivS32Params& p, std::ostream* os) {
+  *os << "{" << p.dividend << ", " << p.divisor << ", " << p.quotient << ", "
+      << p.remainder << "}";
+}
+
+class DivS32Test : public ClientLibraryTestBase,
+                   public ::testing::WithParamInterface<DivS32Params> {};
+
+XLA_TEST_P(DivS32Test, DivideTwoScalarsS32) {
+  DivS32Params p = GetParam();
+  ComputationBuilder builder(client_, TestName());
+  builder.Div(builder.ConstantR0<int32>(p.dividend),
+              builder.ConstantR0<int32>(p.divisor));
+
+  ComputeAndCompareR0<int32>(&builder, p.quotient, {});
+}
+
+XLA_TEST_P(DivS32Test, RemainderTwoScalarsS32) {
+  DivS32Params p = GetParam();
+  ComputationBuilder builder(client_, TestName());
+  builder.Rem(builder.ConstantR0<int32>(p.dividend),
+              builder.ConstantR0<int32>(p.divisor));
+
+  ComputeAndCompareR0<int32>(&builder, p.remainder, {});
+}
+
+XLA_TEST_P(DivS32Test, DivideTwoScalarsNonConstS32) {
+  DivS32Params p = GetParam();
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle dividend;
+  ComputationDataHandle divisor;
+  auto dividendd =
+      CreateR0Parameter<int32>(p.dividend, 0, "dividend", &builder, &dividend);
+  auto divisord =
+      CreateR0Parameter<int32>(p.divisor, 1, "divisor", &builder, &divisor);
+  builder.Div(dividend, divisor);
+
+  ComputeAndCompareR0<int32>(&builder, p.quotient,
+                             {dividendd.get(), divisord.get()});
+}
+
+XLA_TEST_P(DivS32Test, RemainderTwoScalarsNonConstDivisorS32) {
+  DivS32Params p = GetParam();
+  ComputationBuilder builder(client_, TestName());
+  ComputationDataHandle dividend;
+  ComputationDataHandle divisor;
+  auto dividendd =
+      CreateR0Parameter<int32>(p.dividend, 0, "dividend", &builder, &dividend);
+  auto divisord =
+      CreateR0Parameter<int32>(p.divisor, 1, "divisor", &builder, &divisor);
+  builder.Rem(dividend, divisor);
+
+  ComputeAndCompareR0<int32>(&builder, p.remainder,
+                             {dividendd.get(), divisord.get()});
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DivS32Test_Instantiation, DivS32Test,
+    ::testing::Values(
+        // Positive divisors.
+        DivS32Params{5, 2, 2, 1},      //
+        DivS32Params{-5, 2, -2, -1},   //
+        DivS32Params{17, 3, 5, 2},     //
+        DivS32Params{-17, 3, -5, -2},  //
+        // Negative divisors.
+        DivS32Params{5, -2, -2, 1},    //
+        DivS32Params{-5, -2, 2, -1},   //
+        DivS32Params{17, -3, -5, 2},   //
+        DivS32Params{-17, -3, 5, -2},  //
+        // Large positive divisors.
+        DivS32Params{INT32_MIN, 7919, -271181, -1309},             //
+        DivS32Params{INT32_MIN, INT32_MAX, -1, -1},                //
+        DivS32Params{INT32_MIN + 1, INT32_MAX, -1, 0},             //
+        DivS32Params{INT32_MIN + 2, INT32_MAX, 0, INT32_MIN + 2},  //
+        DivS32Params{INT32_MIN, 0x40000000, -2, 0},                //
+        DivS32Params{INT32_MIN + 1, 0x40000000, -1, -0x3fffffff},  //
+        // Large negative divisors.
+        DivS32Params{INT32_MIN, INT32_MIN, 1, 0},                  //
+        DivS32Params{INT32_MIN, INT32_MIN + 1, 1, -1},             //
+        DivS32Params{INT32_MIN + 1, INT32_MIN, 0, INT32_MIN + 1},  //
+        DivS32Params{INT32_MAX, INT32_MIN, 0, INT32_MAX},          //
+        DivS32Params{INT32_MAX, INT32_MIN + 1, -1, 0},             //
+        DivS32Params{INT32_MIN, -0x40000000, 2, 0},                //
+        DivS32Params{INT32_MIN + 1, -0x40000000, 1, -0x3fffffff}));
+
+TEST_F(ScalarComputationsTest, DivU32s) {
+  // clang-format off
+  // Some interesting values to test.
+  std::vector<uint32> vals = {
+    0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0x80000000, UINT32_MAX - 1, UINT32_MAX};
+  // clang-format on
+
+  Computation div_computation;
+  {
+    ComputationBuilder builder(client_, TestName());
 
-TEST_F(ScalarComputationsTest, RemainderTwoScalarsNegativeResultS32) {
-  ComputationBuilder builder(client_, TestName());
-  builder.Rem(builder.ConstantR0<int32>(-5), builder.ConstantR0<int32>(2));
+    ComputationDataHandle dividend =
+        builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend");
+    ComputationDataHandle divisor =
+        builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor");
+    builder.Div(dividend, divisor);
+    TF_ASSIGN_OR_ASSERT_OK(div_computation, builder.Build());
+  }
 
-  ComputeAndCompareR0<int32>(&builder, -1, {});
+  for (uint32 divisor : vals) {
+    if (divisor != 0) {
+      for (uint32 dividend : vals) {
+        auto dividend_literal = LiteralUtil::CreateR0<uint32>(dividend);
+        auto divisor_literal = LiteralUtil::CreateR0<uint32>(divisor);
+        TF_ASSIGN_OR_ASSERT_OK(auto dividend_data,
+                               client_->TransferToServer(*dividend_literal));
+        TF_ASSIGN_OR_ASSERT_OK(auto divisor_data,
+                               client_->TransferToServer(*divisor_literal));
+        auto actual_literal =
+            client_
+                ->ExecuteAndTransfer(div_computation,
+                                     {dividend_data.get(), divisor_data.get()},
+                                     &execution_options_)
+                .ConsumeValueOrDie();
+        auto expected_literal =
+            LiteralUtil::CreateR0<uint32>(dividend / divisor);
+        LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+      }
+    }
+  }
 }
 
-TEST_F(ScalarComputationsTest, RemainderTwoScalarsIntMinS32) {
-  ComputationBuilder builder(client_, TestName());
-  builder.Rem(builder.ConstantR0<int32>(INT_MIN),
-              builder.ConstantR0<int32>(7919));
+TEST_F(ScalarComputationsTest, RemU32s) {
+  // clang-format off
+  // Some interesting values to test.
+  std::vector<uint32> vals = {
+    0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0x80000000, UINT32_MAX - 1, UINT32_MAX};
+  // clang-format on
 
-  ComputeAndCompareR0<int32>(&builder, -1309, {});
-}
+  Computation rem_computation;
+  {
+    ComputationBuilder builder(client_, TestName());
 
-TEST_F(ScalarComputationsTest, RemainderTwoScalarsIntMinVsIntMaxS32) {
-  ComputationBuilder builder(client_, TestName());
-  builder.Rem(builder.ConstantR0<int32>(INT_MIN),
-              builder.ConstantR0<int32>(INT_MAX));
+    ComputationDataHandle dividend =
+        builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend");
+    ComputationDataHandle divisor =
+        builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor");
+    builder.Rem(dividend, divisor);
+    TF_ASSIGN_OR_ASSERT_OK(rem_computation, builder.Build());
+  }
 
-  ComputeAndCompareR0<int32>(&builder, -1, {});
+  for (uint32 divisor : vals) {
+    if (divisor != 0) {
+      for (uint32 dividend : vals) {
+        auto dividend_literal = LiteralUtil::CreateR0<uint32>(dividend);
+        auto divisor_literal = LiteralUtil::CreateR0<uint32>(divisor);
+        TF_ASSIGN_OR_ASSERT_OK(auto dividend_data,
+                               client_->TransferToServer(*dividend_literal));
+        TF_ASSIGN_OR_ASSERT_OK(auto divisor_data,
+                               client_->TransferToServer(*divisor_literal));
+        auto actual_literal =
+            client_
+                ->ExecuteAndTransfer(rem_computation,
+                                     {dividend_data.get(), divisor_data.get()},
+                                     &execution_options_)
+                .ConsumeValueOrDie();
+        auto expected_literal =
+            LiteralUtil::CreateR0<uint32>(dividend % divisor);
+        LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal);
+      }
+    }
+  }
 }
 
-TEST_F(ScalarComputationsTest, RemainderTwoScalarsPositiveResultS32) {
+TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) {
   ComputationBuilder builder(client_, TestName());
   auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x");
   builder.Rem(x, builder.ConstantR0<int32>(80000));
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 4cff1990865bcf1214a403a6241accbf82f06d00..5a2333e3386acbca43e3311cb6a316e298af9612 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -247,6 +248,230 @@ TEST_F(WhileTest, WhileWithTupleResult) {
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
 }
 
+// Tests two while nodes when the result type T is a Tuple and the second
+// while node uses the result of the first while node which is used in two
+// nodes.
+// tuple<int32, vector<float>> w0(0, vector<float>(10, 0.0f));
+// w0 = while (get<0>(w0) < c1) {
+//        get<0>(w0) = get<0>(w0) + 1;
+//        get<1>(w0) = get<1>(w0) + vector<float>(10, 1.0f);
+//      }
+// tuple<int32, vector<float>> w1(get<0>(w0), get<1>(w0));
+// w1 = while (get<0>(w1) < c2) {
+//        get<0>(w1) = get<0>(w1) + 1;
+//        get<1>(w1) = get<1>(w1) + vector<float>(10, 1.0f);
+//      }
+// result = get<1>(w0) + get<1>(w1)
+TEST_F(WhileTest, TwoWhileWithTupleResult) {
+  std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
+                                       ShapeUtil::MakeShape(F32, {10})};
+  Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
+
+  // Create a computation for the condition.
+  // Repeat for 5 iterations.
+  Computation condition;
+  const int c1 = 5;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Lt(iteration, builder.ConstantR0<int32>(c1));
+    TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build());
+  }
+
+  Computation condition2;
+  const int c2 = 7;
+  {
+    ComputationBuilder builder(client_, "condition2");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Lt(iteration, builder.ConstantR0<int32>(c2));
+    TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build());
+  }
+
+  // Create a computation for the body.
+  // Add 1 to the iteration variable and add a constant vector of 1.0f to
+  // the weight variable, both of which are tuple elements.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    auto weights = builder.GetTupleElement(prev, 1);
+    auto input = builder.ConstantR1<float>(10, 1.f);
+    auto new_weights = builder.Add(weights, input);
+    auto result = builder.Tuple(
+        {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
+    TF_ASSIGN_OR_ASSERT_OK(body, builder.Build());
+  }
+
+  Computation body2;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    auto weights = builder.GetTupleElement(prev, 1);
+    auto input = builder.ConstantR1<float>(10, 1.f);
+    auto new_weights = builder.Add(weights, input);
+    auto result = builder.Tuple(
+        {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
+    TF_ASSIGN_OR_ASSERT_OK(body2, builder.Build());
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, "while");
+  auto init = builder.Tuple(
+      {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
+  auto while1 = builder.While(condition, body, init);
+
+  auto while2 = builder.While(condition2, body2, while1);
+
+  auto while_result1 = builder.GetTupleElement(while1, 1);
+  auto while_result2 = builder.GetTupleElement(while2, 1);
+  VLOG(2) << "while_result2 = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+  auto result = builder.Add(while_result1, while_result2);
+  VLOG(2) << "result = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(result).ConsumeValueOrDie());
+  const float sum = c1 + c2;
+  std::vector<float> expected(10, sum);
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
+// Test while nodes that share the while body computation.
+TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
+  std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
+                                       ShapeUtil::MakeShape(F32, {10})};
+  Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
+
+  // Create a computation for the condition.
+  // Repeat for 5 iterations.
+  Computation condition;
+  const int c1 = 5;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Lt(iteration, builder.ConstantR0<int32>(c1));
+    TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build());
+  }
+
+  Computation condition2;
+  const int c2 = 7;
+  {
+    ComputationBuilder builder(client_, "condition2");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Lt(iteration, builder.ConstantR0<int32>(c2));
+    TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build());
+  }
+
+  // Create a computation for the body.
+  // Add 1 to the iteration variable and add a constant vector of 1.0f to
+  // the weight variable, both of which are tuple elements.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    auto weights = builder.GetTupleElement(prev, 1);
+    auto input = builder.ConstantR1<float>(10, 1.f);
+    auto new_weights = builder.Add(weights, input);
+    auto result = builder.Tuple(
+        {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
+    TF_ASSIGN_OR_ASSERT_OK(body, builder.Build());
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, "while");
+  auto init = builder.Tuple(
+      {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
+  auto while1 = builder.While(condition, body, init);
+
+  auto while2 = builder.While(condition2, body, while1);
+
+  auto while_result1 = builder.GetTupleElement(while1, 1);
+  auto while_result2 = builder.GetTupleElement(while2, 1);
+  VLOG(2) << "while_result2 = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+  auto result = builder.Add(while_result1, while_result2);
+  VLOG(2) << "result = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(result).ConsumeValueOrDie());
+  const float sum = c1 + c2;
+  std::vector<float> expected(10, sum);
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
+// Test while nodes that share the while body computation.
+// TODO(b/37245345): Fails on GPU backend.
+TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
+  std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
+                                       ShapeUtil::MakeShape(F32, {10})};
+  Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
+
+  // Create a computation for the condition.
+  // Repeat for 5 iterations.
+  Computation condition;
+  const int c1 = 5;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Lt(iteration, builder.ConstantR0<int32>(c1));
+    TF_ASSIGN_OR_ASSERT_OK(condition, builder.Build());
+  }
+
+  Computation condition2;
+  const int c2 = 7;
+  {
+    ComputationBuilder builder(client_, "condition2");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Lt(iteration, builder.ConstantR0<int32>(c2));
+    TF_ASSIGN_OR_ASSERT_OK(condition2, builder.Build());
+  }
+
+  // Create a computation for the body.
+  // Add 1 to the iteration variable and add a constant vector of 1.0f to
+  // the weight variable, both of which are tuple elements.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    auto weights = builder.GetTupleElement(prev, 1);
+    auto input = builder.ConstantR1<float>(10, 1.f);
+    auto new_weights = builder.Add(weights, input);
+    auto result = builder.Tuple(
+        {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
+    TF_ASSIGN_OR_ASSERT_OK(body, builder.Build());
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, "while");
+  auto init = builder.Tuple(
+      {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
+  auto while1 = builder.While(condition, body, init);
+  auto while2 = builder.While(condition2, body, init);
+
+  auto while_result1 = builder.GetTupleElement(while1, 1);
+  auto while_result2 = builder.GetTupleElement(while2, 1);
+  VLOG(2) << "while_result2 = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+  auto result = builder.Add(while_result1, while_result2);
+  VLOG(2) << "result = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(result).ConsumeValueOrDie());
+  const float sum = c1 + c2;
+  std::vector<float> expected(10, sum);
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
 // WhileTest that uses DynamicUpdateSlice instruction in body computation.
 // Loop state tuple element 1 has as its single user operand(0) of
 // DynamicUpdateSlice, which will trigger in-place dynamic slice update on GPU.
@@ -315,7 +540,8 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 //   result += (1, U[0, 100], U[0, 100], U[0, 100], U[0, 100], U[0, 100]);
 // }
 //
-// This test misuses a vector to represent a pair:
+// This test misuses a vector WhileTest.WhileLoopsWithSharedBodyto represent a
+// pair:
 //   ((iteration, (random vector))).
 //
 // Note: this test currently only tests generating random values within a loop.
diff --git a/tensorflow/compiler/xla/text_literal_reader_test.cc b/tensorflow/compiler/xla/text_literal_reader_test.cc
index 94d0f2646b15930f78c44fbb3d2b49fd6033a545..a167d80f73b0273739e22d94be8d90ab00839dc9 100644
--- a/tensorflow/compiler/xla/text_literal_reader_test.cc
+++ b/tensorflow/compiler/xla/text_literal_reader_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/text_literal_writer_test.cc b/tensorflow/compiler/xla/text_literal_writer_test.cc
index 9dce4d13bb0e21d399795c5310e30b7ab64ea4ea..177ae4ea036af660b7a2be1d4082b30ca8fb9fac 100644
--- a/tensorflow/compiler/xla/text_literal_writer_test.cc
+++ b/tensorflow/compiler/xla/text_literal_writer_test.cc
@@ -19,12 +19,12 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 46eab7f02bb12ca39e5713e7b0f96bfa178e9102..535e5b605b4f68671c9b6a8af4a12732f88e744e 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -153,6 +153,7 @@ cc_binary(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service",
+        "//tensorflow/compiler/xla/service:computation_tracker",
         "//tensorflow/compiler/xla/service:session_proto",
         "//tensorflow/core:lib",
     ],
@@ -176,6 +177,24 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "dumped_computation_to_tf_graphdef",
+    srcs = ["dumped_computation_to_tf_graphdef.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/legacy_flags:hlo_graph_dumper_flags",
+        "//tensorflow/compiler/xla/service",
+        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
+        "//tensorflow/compiler/xla/service:session_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 8b96e13489774539b50022808975db56c5ddc6f7..dc5a86f34e5b975fd8ba565d54e5c2c0b70bf53e 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/computation_tracker.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,7 +35,7 @@ limitations under the License.
 namespace xla {
 namespace tools {
 
-void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
+void RealMain(tensorflow::gtl::ArraySlice<char*> args, bool compile) {
   LocalClient* client = ClientLibrary::LocalClientOrDie();
   LocalService* local_service =
       ClientLibrary::GetXlaService(client->platform());
@@ -50,23 +51,38 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
     }
     Computation computation = computation_status.ConsumeValueOrDie();
 
-    std::unique_ptr<ProgramShape> program_shape =
-        client->GetComputationShape(computation).ConsumeValueOrDie();
+    if (compile) {
+      std::unique_ptr<ProgramShape> program_shape =
+          client->GetComputationShape(computation).ConsumeValueOrDie();
 
-    std::vector<const Shape*> layouts;
-    for (int i = 0; i < program_shape->parameters_size(); ++i) {
-      layouts.push_back(&program_shape->parameters(i));
-    }
-    StatusOr<std::unique_ptr<Executable>> executable =
-        local_service->CompileExecutable(
-            computation.handle(), layouts, &program_shape->result(),
-            /*device_ordinal=*/0, /*has_hybrid_result=*/true);
+      std::vector<const Shape*> layouts;
+      for (int i = 0; i < program_shape->parameters_size(); ++i) {
+        layouts.push_back(&program_shape->parameters(i));
+      }
+      StatusOr<std::unique_ptr<Executable>> executable =
+          local_service->CompileExecutable(
+              computation.handle(), layouts, &program_shape->result(),
+              /*device_ordinal=*/0, /*has_hybrid_result=*/true);
+
+      const HloModule& module = executable.ValueOrDie()->module();
 
-    const HloModule& module = executable.ValueOrDie()->module();
+      fprintf(stdout, "HLO compiled for %s backend:\n%s\n",
+              local_service->backend().platform()->Name().c_str(),
+              module.ToString().c_str());
+    } else {
+      const ComputationTracker& tracker = local_service->computation_tracker();
+      UserComputation* user_computation =
+          tracker.Resolve(computation.handle()).ConsumeValueOrDie();
+      VersionedComputationHandle versioned_handle =
+          user_computation->GetVersionedHandle();
+      std::unique_ptr<HloModule> module =
+          tracker
+              .BuildHloModule(versioned_handle,
+                              /*config=*/nullptr)
+              .ConsumeValueOrDie();
 
-    fprintf(stdout, "HLO for %s backend:\n%s\n",
-            local_service->backend().platform()->Name().c_str(),
-            module.ToString().c_str());
+      fprintf(stdout, "%s\n", module->ToString().c_str());
+    }
   }
 }
 
@@ -74,10 +90,21 @@ void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
 }  // namespace xla
 
 int main(int argc, char** argv) {
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  bool compile = false;
+  std::vector<tensorflow::Flag> flag_list = {
+      {"compile", &compile,
+       "If true, compile the computation using the default client before "
+       "dumping the HLO. Otherwise dump the raw (uncompiled) HLO."},
+  };
+  const xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  QCHECK(parsed_flags_ok) << "\n" << usage;
+
+  tensorflow::port::InitMain(usage.c_str(), &argc, &argv);
+  QCHECK(argc > 1) << "\nERROR: must specify at least one module\n" << usage;
 
   tensorflow::gtl::ArraySlice<char*> args(argv, argc);
   args.pop_front();  // Pop off the binary name, argv[0]
-  xla::tools::RealMain(args);
+  xla::tools::RealMain(args, compile);
   return 0;
 }
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
new file mode 100644
index 0000000000000000000000000000000000000000..850267d3195785a96bf8d2c80fe64fdb8aae0a91
--- /dev/null
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Usage: dumped_computation_to_tf_graph some_binary_snapshot_proto*
+//
+// Dumps a tensorflow GraphDef in text format for a snapshot computation. The
+// dumped graph is an HLO computation with HLO instructions as nodes and can be
+// visualized on Tensorboard. Upload the dumped files on Tensorboard.
+//
+// some_binary_snapshot_proto is obtained by serializing the SessionModule from
+// ServiceInterface::SnapshotComputation to disk.
+
+#include <stdio.h>
+#include <memory>
+#include <string>
+
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/legacy_flags/hlo_graph_dumper_flags.h"
+#include "tensorflow/compiler/xla/service/service.h"
+#include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+
+using tensorflow::Env;
+
+namespace xla {
+namespace tools {
+
+void RealMain(tensorflow::gtl::ArraySlice<char*> args) {
+  Client* client = ClientLibrary::LocalClientOrDie();
+  for (char* arg : args) {
+    SessionModule module;
+    TF_CHECK_OK(
+        tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
+    Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie();
+    ComputationStats stats =
+        client->GetComputationStats(computation).ConsumeValueOrDie();
+    fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str());
+  }
+}
+
+}  // namespace tools
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  xla::legacy_flags::ServiceFlags* flags = xla::legacy_flags::GetServiceFlags();
+  flags->xla_generate_hlo_graph = ".*";
+
+  xla::legacy_flags::HloGraphDumperFlags* dumper_flags =
+      xla::legacy_flags::GetHloGraphDumperFlags();
+  dumper_flags->xla_hlo_dump_as_graphdef = true;
+
+  tensorflow::gtl::ArraySlice<char*> args(argv, argc);
+  args.pop_front();  // Pop off the binary name, argv[0]
+  xla::tools::RealMain(args);
+  return 0;
+}
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 8258031a2c5119d085a483a0826f7284897dcee3..ea8b4b7b989b72034f33920a7d8c1a75e15a7dd1 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -16,8 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_TYPES_H_
 #define TENSORFLOW_COMPILER_XLA_TYPES_H_
 
+#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/types.h"
 
+#include <Eigen/Core>
+
 namespace xla {
 
 using ::tensorflow::string;
@@ -32,6 +35,8 @@ using ::tensorflow::uint16;
 using ::tensorflow::uint32;
 using ::tensorflow::uint64;
 
+using ::Eigen::half;
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TYPES_H_
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index a711b5035d842cd26945b2dac1159392813d56ab..d467178cb528a93b2c1030fc72d054cc0edf95b6 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -33,7 +33,7 @@ namespace {
 // Adds a backtrace to the provided status iff the xla_status_add_backtrace flag
 // is set. This is useful for quickly tracing status errors observed coming out
 // of the service.
-Status MaybeAddBacktrace(Status prior) {
+Status MaybeAddBacktrace(const Status& prior) {
   DCHECK(!prior.ok());
   if (legacy_flags::GetUtilFlags()->xla_status_add_backtrace) {
     return Status{prior.code(),
@@ -153,16 +153,26 @@ string Reindent(tensorflow::StringPiece original,
       });
 }
 
+bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank) {
+  if (rank != permutation.size()) {
+    return false;
+  }
+  std::vector<int64> output(permutation.size(), -1);
+  for (auto index : permutation) {
+    CHECK_GE(index, 0);
+    CHECK_LT(index, rank);
+    output[index] = 0;
+  }
+  return std::find(output.begin(), output.end(), -1) == output.end();
+}
+
 std::vector<int64> InversePermutation(
     tensorflow::gtl::ArraySlice<int64> input_permutation) {
+  DCHECK(IsPermutation(input_permutation, input_permutation.size()));
   std::vector<int64> output_permutation(input_permutation.size(), -1);
   for (size_t i = 0; i < input_permutation.size(); ++i) {
     output_permutation[input_permutation[i]] = i;
   }
-  DCHECK_EQ(
-      0, std::count(output_permutation.begin(), output_permutation.end(), -1));
-  DCHECK(std::is_permutation(input_permutation.begin(), input_permutation.end(),
-                             output_permutation.begin()));
   return output_permutation;
 }
 
@@ -196,6 +206,15 @@ PaddingConfig MakeNoPaddingConfig(int64 rank) {
   return padding_config;
 }
 
+bool HasInteriorPadding(const PaddingConfig& config) {
+  for (const auto& dim : config.dimensions()) {
+    if (dim.interior_padding() != 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
 string HumanReadableNumFlops(double flops, double nanoseconds) {
   if (nanoseconds == 0) {
     return "NaN FLOP/s";
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 55a66a7499571b4979ff375a8199cb329a799ef7..42d5c1d15501fb912551a044414e6fa0c83283b8 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -38,6 +39,13 @@ limitations under the License.
 
 namespace xla {
 
+// Ranks greater than 8 are very rare, so use InlinedVector<int64, 8> to store
+// the bounds and indices. And for the rare cases of ranks greater than 8,
+// the InlinedVector will just behave like an std::vector<> and allocate the
+// memory to store its values.
+static constexpr int kInlineRank = 8;
+using DimensionVector = tensorflow::gtl::InlinedVector<int64, kInlineRank>;
+
 // RAII timer that logs with a given label the wall clock time duration in human
 // readable form. This differs from base's ElapsedTimer primarily in that it
 // spits out the human-readable duration form.
@@ -120,6 +128,14 @@ bool ContainersEqual(const Container1T& c1, const Container2T& c2) {
           std::equal(std::begin(c1), std::end(c1), std::begin(c2)));
 }
 
+template <typename Container1T,
+          typename ElementType = typename Container1T::value_type>
+bool ContainersEqual(const Container1T& c1,
+                     std::initializer_list<ElementType> il) {
+  tensorflow::gtl::ArraySlice<ElementType> c2{il};
+  return ContainersEqual(c1, c2);
+}
+
 // Compares two containers for equality. Returns true iff the two containers
 // have the same size and all their elements compare equal using the predicate
 // p. Like std::equal, but forces size equality.
@@ -130,6 +146,18 @@ bool ContainersEqual(const Container1T& c1, const Container2T& c2,
           std::equal(std::begin(c1), std::end(c1), std::begin(c2), p));
 }
 
+// Performs a copy of count values from src to dest, using different strides for
+// source and destination. The source starting index is src_base, while the
+// destination one is dest_base.
+template <typename D, typename S>
+void StridedCopy(tensorflow::gtl::MutableArraySlice<D> dest, int64 dest_base,
+                 int64 dest_stride, tensorflow::gtl::ArraySlice<S> src,
+                 int64 src_base, int64 src_stride, int64 count) {
+  for (; count > 0; --count, dest_base += dest_stride, src_base += src_stride) {
+    dest[dest_base] = static_cast<D>(src[src_base]);
+  }
+}
+
 // Adds some context information to the error message in a
 // Status.  This is useful as Statuses are
 // propagated upwards.
@@ -156,6 +184,9 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2);
 string Reindent(tensorflow::StringPiece original,
                 tensorflow::StringPiece indentation);
 
+// Checks whether permutation is a permutation of the [0, rank) integer range.
+bool IsPermutation(tensorflow::gtl::ArraySlice<int64> permutation, int64 rank);
+
 // Applies `permutation` on `input` and returns the permuted array.
 // For each i, output[permutation[i]] = input[i].
 //
@@ -166,12 +197,11 @@ template <template <typename...> class C, typename T>
 std::vector<T> Permute(tensorflow::gtl::ArraySlice<int64> permutation,
                        C<T> input_) {
   tensorflow::gtl::ArraySlice<T> input(input_);
-  CHECK_EQ(permutation.size(), input.size());
+  CHECK(IsPermutation(permutation, input.size()));
   std::vector<T> output(input.size());
   for (size_t i = 0; i < permutation.size(); ++i) {
     output[permutation[i]] = input[i];
   }
-  DCHECK(std::is_permutation(input.begin(), input.end(), output.begin()));
   return output;
 }
 
@@ -192,9 +222,53 @@ int64 PositionInContainer(const Container& container, int64 value) {
                        std::find(container.begin(), container.end(), value));
 }
 
+// Formats the container as a comma-separated string. StrAppend must support
+// appending the elements of the container. Prefix is prepended and suffix is
+// appended to the returned string.
+template <typename Container>
+string CommaSeparatedString(const Container& c, const char* prefix = "",
+                            const char* suffix = "") {
+  // Not using Join() since the implementation here is simple anyway and this
+  // avoids copying the string to append prefix.
+  string comma_separated = prefix;
+  const char* separator = "";
+  for (const auto& entry : c) {
+    tensorflow::strings::StrAppend(&comma_separated, separator, entry);
+    separator = ", ";
+  }
+  comma_separated += suffix;
+  return comma_separated;
+}
+
+// Overload needed to allow the container to be an initializer list. The default
+// type for T makes an empty initializer list work as well.
+template <typename T = int>
+string CommaSeparatedString(const std::initializer_list<T>& c,
+                            const char* prefix = "", const char* suffix = "") {
+  return CommaSeparatedString<std::initializer_list<T>>(c, prefix, suffix);
+}
+
+// Formats the container in the mathematical notation for a vector, e.g. (1, 3,
+// 7). StrAppend must support appending the elements of c.
+template <typename Container>
+string VectorString(const Container& c) {
+  return CommaSeparatedString(c, "(", ")");
+}
+
+// Overload needed to allow the container to be an initializer list. The default
+// type for T makes an empty initializer list work as well.
+template <typename T = int>
+string VectorString(const std::initializer_list<T>& c) {
+  return VectorString<std::initializer_list<T>>(c);
+}
+
 // Returns a PaddingConfig object that represents no padding for the given rank.
 PaddingConfig MakeNoPaddingConfig(int64 rank);
 
+// Returns true if the padding configuration has at least one dimension with
+// non-zero interior padding.
+bool HasInteriorPadding(const PaddingConfig& config);
+
 // Imports the templated FloorOfRatio math function from the TensorFlow
 // namespace, as it is very commonly used.
 template <typename T>
diff --git a/tensorflow/compiler/xla/util_test.cc b/tensorflow/compiler/xla/util_test.cc
index bf01ac0fec2899c193c841cb3f02d4cdbd858990..547b924180bf59091ebd552618bf6bd5be9cd6a7 100644
--- a/tensorflow/compiler/xla/util_test.cc
+++ b/tensorflow/compiler/xla/util_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include <list>
 
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
@@ -80,6 +80,26 @@ TEST(UtilTest, HumanReadableNumFlopsExample) {
   ASSERT_EQ("1.00GFLOP/s", HumanReadableNumFlops(1e9, 1e9));
 }
 
+TEST(UtilTest, CommaSeparatedString) {
+  EXPECT_EQ(CommaSeparatedString({}), "");
+  EXPECT_EQ(CommaSeparatedString({"hello world"}), "hello world");
+  EXPECT_EQ(CommaSeparatedString({1, 57, 2}, "foo", "bar"), "foo1, 57, 2bar");
+}
+
+TEST(UtilTest, VectorString) {
+  std::list<int64> empty_list;
+  EXPECT_EQ(VectorString(empty_list), "()");
+
+  std::vector<float> float_vector = {5.5};
+  EXPECT_EQ(VectorString(float_vector), "(5.5)");
+
+  std::set<const char*> string_set = {"a", "b"};
+  EXPECT_EQ(VectorString(string_set), "(a, b)");
+
+  EXPECT_EQ(VectorString({}), "()");
+  EXPECT_EQ(VectorString({1, 57, 2}), "(1, 57, 2)");
+}
+
 TEST(UtilTest, LogLines) {
   // Just make sure this code runs (not verifying the output).
   LogLines(tensorflow::INFO, "hello\n\nworld", __FILE__, __LINE__);
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 2bb09c069c747888148ad82ff23857ff49abaaa3..1239816c50e5fbef486842123ba0d0d8e0d8ab38 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -16,6 +16,7 @@ limitations under the License.
 syntax = "proto3";
 
 package xla;
+option cc_enable_arenas = true;
 
 // Primitive types are the individual values that can be held in rectangular
 // multidimensional arrays. A description of the rectangular multidimensional
@@ -285,6 +286,7 @@ message Literal {
   repeated float f32s = 8;
   repeated double f64s = 9;
   repeated Literal tuple_literals = 10;
+  bytes f16s = 11;  // Note: the F16s are encoded in little endian byte order
 }
 
 message WindowDimension {
@@ -407,7 +409,7 @@ message InfeedRequest {
   Shape shape = 2;
 
   // Additional infeed configuration for the backend.
-  string config = 3;
+  bytes config = 3;
 }
 
 message OutfeedRequest {
@@ -418,7 +420,7 @@ message OutfeedRequest {
   ComputationDataHandle operand = 2;
 
   // Backend-specific information for how to perform the outfeed.
-  string outfeed_config = 3;
+  bytes outfeed_config = 3;
 }
 
 message CallRequest {
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
old mode 100644
new mode 100755
index 595d8997388e822e7998a1465961d72451dc09fa..1ac7e536543d53985e15850291878f76b6608271
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -7,20 +7,20 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
-load("//tensorflow:tensorflow.bzl", "if_not_windows")
-
 py_library(
     name = "contrib_py",
     srcs = glob(["**/*.py"]),
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/batching:batch_py",
         "//tensorflow/contrib/bayesflow:bayesflow_py",
         "//tensorflow/contrib/cloud:cloud_py",
         "//tensorflow/contrib/compiler:compiler_py",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
         "//tensorflow/contrib/crf:crf_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
+        "//tensorflow/contrib/data",
         "//tensorflow/contrib/deprecated:deprecated_py",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/factorization:factorization_py",
@@ -30,6 +30,7 @@ py_library(
         "//tensorflow/contrib/grid_rnn:grid_rnn_py",
         "//tensorflow/contrib/hooks",
         "//tensorflow/contrib/image:image_py",
+        "//tensorflow/contrib/image:single_image_random_dot_stereograms_py",
         "//tensorflow/contrib/imperative",
         "//tensorflow/contrib/input_pipeline:input_pipeline_py",
         "//tensorflow/contrib/integrate:integrate_py",
@@ -46,6 +47,7 @@ py_library(
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/memory_stats:memory_stats_py",
         "//tensorflow/contrib/metrics:metrics_py",
+        "//tensorflow/contrib/nccl:nccl_py",
         "//tensorflow/contrib/ndlstm",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
@@ -58,26 +60,27 @@ py_library(
         "//tensorflow/contrib/solvers:solvers_py",
         "//tensorflow/contrib/sparsemax:sparsemax_py",
         "//tensorflow/contrib/specs",
+        "//tensorflow/contrib/staging",
         "//tensorflow/contrib/stat_summarizer:stat_summarizer_py",
+        "//tensorflow/contrib/stateless",
         "//tensorflow/contrib/tensor_forest:init_py",
         "//tensorflow/contrib/tensorboard",
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/contrib/tfprof",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/contrib/util:util_py",
-    ] + if_not_windows([
-        "//tensorflow/contrib/nccl:nccl_py",
-    ]),
+    ],
 )
 
 cc_library(
     name = "contrib_kernels",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/batching:batch_ops_kernels",
         "//tensorflow/contrib/factorization/kernels:all_kernels",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels",
-        "//tensorflow/contrib/layers:bucketization_op_kernel",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel",
+        "//tensorflow/contrib/nccl:nccl_kernels",
         "//tensorflow/contrib/tensor_forest:tensor_forest_kernels",
     ],
 )
@@ -86,11 +89,12 @@ cc_library(
     name = "contrib_ops_op_lib",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/batching:batch_ops_op_lib",
         "//tensorflow/contrib/factorization:all_ops",
         "//tensorflow/contrib/framework:all_ops",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
-        "//tensorflow/contrib/layers:bucketization_op_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
+        "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
     ],
 )
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index d4ddd1cf6a6aa58fa90995bf8ec3485ad3bcbcf6..3908c37fcd10487ad0a5e43e74460d5ec8e1ee4c 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -25,6 +25,7 @@ from tensorflow.contrib import compiler
 from tensorflow.contrib import copy_graph
 from tensorflow.contrib import crf
 from tensorflow.contrib import cudnn_rnn
+from tensorflow.contrib import data
 from tensorflow.contrib import deprecated
 from tensorflow.contrib import distributions
 from tensorflow.contrib import factorization
@@ -35,6 +36,7 @@ from tensorflow.contrib import image
 from tensorflow.contrib import input_pipeline
 from tensorflow.contrib import integrate
 from tensorflow.contrib import keras
+from tensorflow.contrib import kernel_methods
 from tensorflow.contrib import labeled_tensor
 from tensorflow.contrib import layers
 from tensorflow.contrib import learn
@@ -45,6 +47,7 @@ from tensorflow.contrib import lookup
 from tensorflow.contrib import losses
 from tensorflow.contrib import memory_stats
 from tensorflow.contrib import metrics
+from tensorflow.contrib import nccl
 from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import quantization
@@ -54,7 +57,9 @@ from tensorflow.contrib import seq2seq
 from tensorflow.contrib import slim
 from tensorflow.contrib import solvers
 from tensorflow.contrib import sparsemax
+from tensorflow.contrib import staging
 from tensorflow.contrib import stat_summarizer
+from tensorflow.contrib import stateless
 from tensorflow.contrib import tensor_forest
 from tensorflow.contrib import tensorboard
 from tensorflow.contrib import testing
@@ -64,6 +69,11 @@ from tensorflow.contrib import util
 from tensorflow.contrib.ndlstm import python as ndlstm
 from tensorflow.contrib.specs import python as specs
 
+from tensorflow.python.util.lazy_loader import LazyLoader
+ffmpeg = LazyLoader("ffmpeg", globals(),
+                    "tensorflow.contrib.ffmpeg")
+del LazyLoader
+
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/contrib/android/README.md b/tensorflow/contrib/android/README.md
index 26885828edc9f22fa01ced97c81d75b4e6f1f6e3..88960b48c934f84cb253adee0fcb16f4103bdd6c 100644
--- a/tensorflow/contrib/android/README.md
+++ b/tensorflow/contrib/android/README.md
@@ -14,6 +14,23 @@ For prebuilt libraries, see the
 [nightly Android build artifacts](https://ci.tensorflow.org/view/Nightly/job/nightly-android/)
 page for a recent build.
 
+The TensorFlow Inference Interface is also available as a
+[JCenter package](https://bintray.com/google/tensorflow/tensorflow-android) and
+can be included quite simply in your android project with a couple of lines in
+the project's `build.gradle` file:
+
+```
+allprojects {
+    repositories {
+        jcenter()
+    }
+}
+
+dependencies {
+    compile 'org.tensorflow:tensorflow-android:1.2.0-preview'
+}
+```
+
 To build the libraries yourself (if, for example, you want to support custom
 TensorFlow operators), pick your preferred approach below:
 
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index ff6854426c5cec2c09e99cf9da0c8e7da9843a1c..b441906cd4dacfbbd930e05c021c87577b07aaab 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -9,23 +9,20 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
+cc_library(
+    name = "batch_scheduler_hdrs",
+    hdrs = ["batch_scheduler.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+    ],
 )
 
 cc_library(
     name = "batch_scheduler",
     hdrs = ["batch_scheduler.h"],
     deps = [
+        ":batch_scheduler_hdrs",
         "//tensorflow/core:lib",
-        "//tensorflow/core:tensorflow",
     ],
 )
 
@@ -41,14 +38,26 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "shared_batch_scheduler_hdrs",
+    hdrs = ["shared_batch_scheduler.h"],
+    deps = [
+        ":batch_scheduler_hdrs",
+        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
 cc_library(
     name = "shared_batch_scheduler",
     hdrs = ["shared_batch_scheduler.h"],
     deps = [
         ":batch_scheduler",
+        ":shared_batch_scheduler_hdrs",
         "//tensorflow/contrib/batching/util:periodic_function",
         "//tensorflow/core:lib",
     ],
+    alwayslink = 1,
 )
 
 cc_test(
@@ -102,3 +111,95 @@ cc_test(
         "//tensorflow/core:test",
     ],
 )
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "py_test",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_kernel_library",
+)
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
+tf_custom_op_library(
+    name = "python/ops/_batch_ops.so",
+    srcs = [
+        "ops/batch_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/contrib/batching/kernels:batch_kernels",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["batch_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "batch_ops",
+    deps = [":batch_ops_op_lib"],
+)
+
+tf_kernel_library(
+    name = "batch_ops_kernels",
+    deps = [
+        "//tensorflow/contrib/batching/kernels:batch_kernels",
+        "//tensorflow/contrib/batching/util:periodic_function",
+        "//tensorflow/core/kernels:concat_lib",
+        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/kernels:split_lib",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_py_library(
+    name = "batch_py",
+    srcs = glob(["python/ops/*.py"]) + ["__init__.py"],
+    dso = [":python/ops/_batch_ops.so"],
+    kernels = [
+        ":batch_ops_kernels",
+        ":batch_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":batch_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_test(
+    name = "batch_ops_test",
+    size = "small",
+    srcs = ["python/ops/batch_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["nomac"],
+    deps = [
+        ":batch_py",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/batching/__init__.py b/tensorflow/contrib/batching/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..44fa5f42a73bfb1bf008f6f4eafd14913c88dcfa
--- /dev/null
+++ b/tensorflow/contrib/batching/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to batch.
+
+@@batch_function
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.batching.python.ops.batch_ops import batch_function
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler.h b/tensorflow/contrib/batching/basic_batch_scheduler.h
index cfbe765b5d1b86b9b51edd9de8e6c957a3e88211..9d3805fbaf39978159dd2f4a754e6d41a07acf6a 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler.h
+++ b/tensorflow/contrib/batching/basic_batch_scheduler.h
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler_benchmark.cc b/tensorflow/contrib/batching/basic_batch_scheduler_benchmark.cc
index 259c1d7879f3de5c186aedbe83b62e0e3f516c5c..ab6c81043359cd10d90668fcf88d61a5e0ea7ee0 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler_benchmark.cc
+++ b/tensorflow/contrib/batching/basic_batch_scheduler_benchmark.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler_test.cc b/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
index 29081dc48b6f28ecb175dfe29c56a904362ecc64..e020301795c7dadee2815c0e0d727e53e5fb9e6e 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/batch_scheduler.h b/tensorflow/contrib/batching/batch_scheduler.h
index c35b0773ae89fb7d87e502ba5cf9e1ed0bdd4801..7c41ad88180badd37398f5bae057dcd0006922c3 100644
--- a/tensorflow/contrib/batching/batch_scheduler.h
+++ b/tensorflow/contrib/batching/batch_scheduler.h
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/batch_scheduler_test.cc b/tensorflow/contrib/batching/batch_scheduler_test.cc
index 90841fce906bcb746cb4f40aa888a3c9ac209efb..f15d8cc8e57300dddc06dcffb24ec98920e193ef 100644
--- a/tensorflow/contrib/batching/batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/batch_scheduler_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/kernels/BUILD b/tensorflow/contrib/batching/kernels/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6e53dd9a5fc0201c5ed91d1eaf07f940e341fb5e
--- /dev/null
+++ b/tensorflow/contrib/batching/kernels/BUILD
@@ -0,0 +1,34 @@
+# Description:
+#   Contains kernels for the batching ops.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "batch_kernels",
+    srcs = ["batch_kernels.cc"],
+    deps = [
+        "//tensorflow/contrib/batching:shared_batch_scheduler_hdrs",
+        "//tensorflow/contrib/batching/util:periodic_function_dynamic",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:concat_lib_hdrs",
+        "//tensorflow/core/kernels:ops_util_hdrs",
+        "//tensorflow/core/kernels:split_lib_hdrs",
+    ],
+    alwayslink = 1,
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/batching/kernels/batch_kernels.cc b/tensorflow/contrib/batching/kernels/batch_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e0957298badf9842f9617f1db1ead24d26b26ba
--- /dev/null
+++ b/tensorflow/contrib/batching/kernels/batch_kernels.cc
@@ -0,0 +1,996 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/batching/shared_batch_scheduler.h"
+#include "tensorflow/contrib/batching/util/periodic_function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/split_lib.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+#ifdef TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SYCLDevice;
+#endif  // TENSORFLOW_USE_SYCL
+
+// Concatenates 'inputs' into a single tensor along the zeroth dimension.
+// Requires that all elements of 'inputs' have element type T. Writes to the
+// op's output at position 'output_index', using 'context' for the allocation to
+// ensure proper device placement.
+template <typename T>
+Status Concat(OpKernelContext* context, const gtl::ArraySlice<Tensor>& inputs,
+              int output_index) {
+  const int input_dims = inputs[0].dims();
+  const TensorShape& input_shape = inputs[0].shape();
+
+  // Note that we reduce the concat of k-dimensional tensors into a two
+  // dimensional concat. Assuming the dimensions of any input tensor are
+  // {y0, y1,...,ym-1}, we flatten it to {1, y}, where y = Prod_i(yi).
+  std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>> inputs_flat;
+  inputs_flat.reserve(inputs.size());
+  int64 output_dim0 = 0;
+  for (int i = 0; i < inputs.size(); ++i) {
+    const Tensor& input = inputs[i];
+    if (input.dims() != input_dims) {
+      return errors::InvalidArgument(
+          "Ranks of all input tensors should match: shape[0] = ",
+          input_shape.DebugString(), " vs. shape[", i,
+          "] = ", input.shape().DebugString());
+    }
+    for (int j = 1; j < input_dims; ++j) {
+      if (input.dim_size(j) != input_shape.dim_size(j)) {
+        return errors::InvalidArgument(
+            "Dimensions of inputs should match: shape[0] = ",
+            input_shape.DebugString(), " vs. shape[", i,
+            "] = ", input.shape().DebugString());
+      }
+    }
+    if (input.NumElements() > 0) {
+      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+          input.shaped<T, 2>({1, input.NumElements()})));
+    }
+    output_dim0 += input.dim_size(0);
+  }
+
+  TensorShape output_shape(input_shape);
+  output_shape.set_dim(0, output_dim0);
+  Tensor* output = nullptr;
+  TF_RETURN_IF_ERROR(
+      context->allocate_output(output_index, output_shape, &output));
+  if (output->NumElements() > 0) {
+    auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
+#if GOOGLE_CUDA
+    if (std::is_same<Device, GPUDevice>::value) {
+      ConcatGPU<T>(context, inputs_flat, output, &output_flat);
+      return Status::OK();
+    }
+#endif  // GOOGLE_CUDA
+    ConcatCPU<T>(context->device(), inputs_flat, &output_flat);
+  }
+
+  return Status::OK();
+}
+
+// The Split*() functions split 'input' with element type T into 'sizes.size()'
+// tensors along the zeroth dimension, with the ith split having zeroth-
+// dimension size 'sizes[i]'. They allocate the output tensors using 'context',
+// for proper device placement.
+
+// Handles special cases that are cheap. Sets 'done==true' iff it found an
+// applicable special case and wrote to the outputs. Otherwise acts as a no-op.
+template <typename T>
+Status SplitEasyCases(OpKernelContext* context, const Tensor& input,
+                      const gtl::ArraySlice<int64>& sizes,
+                      std::vector<Tensor>* outputs, bool* done) {
+  *done = false;
+
+  int64 total_size = 0;
+  for (const int64 size : sizes) {
+    total_size += size;
+  }
+  if (total_size > input.shape().dim_size(0)) {
+    return errors::InvalidArgument(
+        "Sum of split sizes must not exceed dim0-size of input tensor");
+  }
+
+  // Special case 0: trivial 1-way split.
+  if (sizes.size() == 1 && sizes.at(0) == input.shape().dim_size(0)) {
+    outputs->push_back(input);
+    *done = true;
+    return Status::OK();
+  }
+
+  // Special case 1: input is aligned.
+  if (IsInnerDimsSizeAligned<T>(input.shape())) {
+    int64 position = 0;
+    for (const int64 size : sizes) {
+      outputs->emplace_back(input.Slice(position, position + size));
+      position += size;
+    }
+    *done = true;
+    return Status::OK();
+  }
+
+  return Status::OK();
+}
+
+// Handles the general case, on CPU.
+template <typename T>
+Status SplitCPU(OpKernelContext* context, const Tensor& input,
+                const gtl::ArraySlice<int64>& sizes,
+                std::vector<Tensor>* outputs) {
+  int64 suffix_dim_size = 1;
+  for (int i = 1; i < input.shape().dims(); ++i) {
+    suffix_dim_size *= input.shape().dim_size(i);
+  }
+  auto input_reshaped =
+      input.shaped<T, 3>({1, input.shape().dim_size(0), suffix_dim_size});
+
+  int64 position = 0;
+  for (const int64 size : sizes) {
+    TensorShape output_shape = input.shape();
+    output_shape.set_dim(0, size);
+    Tensor output;
+    TF_RETURN_IF_ERROR(
+        context->allocate_temp(input.dtype(), output_shape, &output));
+    auto output_shaped = output.shaped<T, 3>({1, size, suffix_dim_size});
+
+    Eigen::DSizes<Eigen::DenseIndex, 3> slice_indices{0, position, 0};
+    Eigen::DSizes<Eigen::DenseIndex, 3> slice_sizes{1, size, suffix_dim_size};
+    functor::Split<CPUDevice, T>()(context->eigen_device<CPUDevice>(),
+                                   output_shaped, input_reshaped, slice_indices,
+                                   slice_sizes);
+
+    outputs->emplace_back(output);
+
+    position += size;
+  }
+
+  return Status::OK();
+}
+
+#if GOOGLE_CUDA
+
+// Handles the general case, on GPU.
+template <typename T>
+Status SplitGPU(OpKernelContext* context, const Tensor& input,
+                const gtl::ArraySlice<int64>& sizes,
+                std::vector<Tensor>* outputs) {
+  // TODO(olston, apassos): Implement this.
+  LOG(FATAL) << "Not yet implemented";  // Crash ok
+}
+
+#endif  // GOOGLE_CUDA
+
+// The outer function that dispatches to the various Split*() functions above.
+template <typename T>
+Status Split(OpKernelContext* context, const Tensor& input,
+             const gtl::ArraySlice<int64>& sizes,
+             std::vector<Tensor>* outputs) {
+  bool easy_cases_done;
+  TF_RETURN_IF_ERROR(
+      SplitEasyCases<T>(context, input, sizes, outputs, &easy_cases_done));
+  if (easy_cases_done) {
+    return Status::OK();
+  }
+
+#if GOOGLE_CUDA
+// TODO(olston, apassos): Handle non-CPU cases.
+// return SplitGPU<T>(context, input, sizes, outputs);
+#endif  // GOOGLE_CUDA
+  return SplitCPU<T>(context, input, sizes, outputs);
+}
+
+// A class encapsulating the state and logic for batching tensors.
+class BatchResource : public ResourceBase {
+ public:
+  static Status Create(int32 num_batch_threads, int32 max_batch_size,
+                       int32 batch_timeout_micros,
+                       const std::vector<int32>& allowed_batch_sizes,
+                       std::unique_ptr<BatchResource>* resource) {
+    std::unique_ptr<BatchResource> new_resource(new BatchResource);
+
+    Batcher::Options batcher_options;
+    batcher_options.num_batch_threads = num_batch_threads;
+    TF_RETURN_IF_ERROR(
+        Batcher::Create(batcher_options, &new_resource->batcher_));
+
+    new_resource->batcher_queue_options_.max_batch_size = max_batch_size;
+    new_resource->batcher_queue_options_.batch_timeout_micros =
+        batch_timeout_micros;
+
+    new_resource->allowed_batch_sizes_ = allowed_batch_sizes;
+
+    *resource = std::move(new_resource);
+    return Status::OK();
+  }
+
+  string DebugString() final { return "BatchResource"; }
+
+  // Ingests data from one invocation of the batch op. The data is enqueued to
+  // be combined with others into a batch, asynchronously.
+  Status RegisterInput(int64 guid, OpKernelContext* context,
+                       const string& batcher_queue_name,
+                       AsyncOpKernel::DoneCallback done_callback) {
+    std::unique_ptr<BatchTask> batch_components(new BatchTask);
+    batch_components->guid = guid;
+    OpInputList tensors;
+    TF_RETURN_IF_ERROR(context->input_list("in_tensors", &tensors));
+    for (int i = 0; i < tensors.size(); ++i) {
+      const Tensor& tensor = tensors[i];
+      if (tensor.shape().dims() == 0) {
+        return errors::InvalidArgument(
+            "Batching input tensors must have at least one dimension");
+      }
+      if (tensors.size() >= 2 &&
+          tensor.shape().dim_size(0) != tensors[0].shape().dim_size(0)) {
+        return errors::InvalidArgument(
+            "Batching input tensors supplied in a given op invocation must "
+            "have equal 0th-dimension size");
+      }
+      batch_components->inputs.push_back(tensor);
+    }
+    batch_components->context = context;
+    batch_components->done_callback = std::move(done_callback);
+
+    BatcherQueue* batcher_queue;
+    TF_RETURN_IF_ERROR(
+        LookupOrCreateBatcherQueue(batcher_queue_name, &batcher_queue));
+    return batcher_queue->Schedule(&batch_components);
+  }
+
+ private:
+  BatchResource() = default;
+
+  // One input to be batched. Corresponds to one invocation of the batch op.
+  struct BatchTask : public serving::BatchTask {
+    // A unique ID to identify this invocation of Batch.
+    int64 guid;
+
+    std::vector<Tensor> inputs;
+    OpKernelContext* context;
+    AsyncOpKernel::DoneCallback done_callback;
+
+    size_t size() const override { return inputs[0].shape().dim_size(0); }
+  };
+
+  using Batcher = serving::SharedBatchScheduler<BatchTask>;
+  using BatcherQueue = serving::BatchScheduler<BatchTask>;
+  using Batch = serving::Batch<BatchTask>;
+
+  // Validates that it's legal to combine the tasks in 'batch' into a batch.
+  // Assumes the batch is non-empty.
+  static Status ValidateBatch(const Batch& batch) {
+    for (int task_idx = 0; task_idx < batch.num_tasks(); ++task_idx) {
+      const BatchTask& task = batch.task(task_idx);
+
+      if (task.inputs.size() != batch.task(0).inputs.size()) {
+        return errors::InvalidArgument(
+            "Batching inputs must have equal number of edges");
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Returns the smallest entry in 'allowed_batch_sizes_' that is greater than
+  // or equal to 'batch_size'. If 'allowed_batch_sizes_' is empty, simply
+  // returns 'batch_size'.
+  int RoundToLowestAllowedBatchSize(int batch_size) const {
+    if (allowed_batch_sizes_.empty()) {
+      return batch_size;
+    }
+    for (int allowed_size : allowed_batch_sizes_) {
+      if (allowed_size >= batch_size) {
+        return allowed_size;
+      }
+    }
+    LOG(ERROR) << "Maximum batch size greater than largest allowed size; "
+                  "ignoring allowed sizes constraint";
+    return batch_size;
+  }
+
+  // Processes a batch of one or more BatchTask entries.
+  void ProcessBatch(std::unique_ptr<Batch> batch) const {
+    if (batch->empty()) {
+      return;
+    }
+    const int padded_batch_size = RoundToLowestAllowedBatchSize(batch->size());
+    const int padding_amount = padded_batch_size - batch->size();
+
+    OpKernelContext* last_task_context =
+        batch->task(batch->num_tasks() - 1).context;
+    AsyncOpKernel::DoneCallback last_task_callback =
+        batch->task(batch->num_tasks() - 1).done_callback;
+
+    OP_REQUIRES_OK_ASYNC(last_task_context, ValidateBatch(*batch),
+                         last_task_callback);
+
+    // All tasks should have the same number of input edges.
+    const int num_input_edges = batch->task(0).inputs.size();
+
+    // Process each input edge one at a time (the typical case has just one).
+    for (int i = 0; i < num_input_edges; ++i) {
+      // Emit batch->num_tasks() - 1 empty output tensors.
+      for (int task_idx = 0; task_idx < batch->num_tasks() - 1; ++task_idx) {
+        const BatchTask& task = batch->task(task_idx);
+        TensorShape output_shape(task.inputs.at(i).shape());
+        output_shape.set_dim(0, 0);
+        Tensor* output = nullptr;
+        OP_REQUIRES_OK_ASYNC(
+            task.context,
+            task.context->allocate_output(i, output_shape, &output),
+            task.done_callback);
+      }
+
+      // Concatenate the tasks ith input tensors into a big output tensor.
+      std::vector<Tensor> to_concatenate;
+      for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
+        to_concatenate.push_back(batch->task(task_idx).inputs.at(i));
+      }
+
+      // Add padding as needed. Use the first row of the first task's tensor as
+      // the data for padding.
+      if (padding_amount > 0) {
+        const Tensor& padding_source = batch->task(0).inputs.at(i);
+        Tensor padding;
+        if (padding_source.shape().dim_size(0) == 1) {
+          padding = padding_source;
+        } else {
+          const std::vector<int64> slice_sizes = {1};
+          const DataType type = padding_source.dtype();
+          Status slice_status;
+          std::vector<Tensor> slices;
+          switch (type) {
+#define CASE(type)                                                   \
+  case DataTypeToEnum<type>::value:                                  \
+    slice_status = SplitCPU<type>(last_task_context, padding_source, \
+                                  slice_sizes, &slices);             \
+    break;
+            TF_CALL_ALL_TYPES(CASE);
+#undef CASE
+            default:
+              slice_status =
+                  errors::InvalidArgument("Unsupported data type: ", type);
+              break;
+          }
+          OP_REQUIRES_OK_ASYNC(last_task_context, slice_status,
+                               last_task_callback);
+          padding = slices.at(0);
+        }
+        for (int i = 0; i < padding_amount; ++i) {
+          to_concatenate.push_back(padding);
+        }
+      }
+
+      const DataType type = to_concatenate[0].dtype();
+      Status concat_status;
+      switch (type) {
+#define CASE(type)                                                      \
+  case DataTypeToEnum<type>::value:                                     \
+    concat_status = Concat<type>(last_task_context, to_concatenate, i); \
+    break;
+        TF_CALL_ALL_TYPES(CASE);
+#undef CASE
+        default:
+          concat_status =
+              errors::InvalidArgument("Unsupported data type: ", type);
+          break;
+      }
+      OP_REQUIRES_OK_ASYNC(last_task_context, concat_status,
+                           last_task_callback);
+    }
+
+    // Emit batch->num_tasks() - 1 empty index tensors.
+    for (int task_idx = 0; task_idx < batch->num_tasks() - 1; ++task_idx) {
+      const BatchTask& task = batch->task(task_idx);
+      TensorShape index_shape({0, 3});
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK_ASYNC(
+          task.context,
+          task.context->allocate_output(num_input_edges, index_shape, &output),
+          task.done_callback);
+    }
+    // Emit all ID tensors.
+    for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
+      const BatchTask& task = batch->task(task_idx);
+      Tensor* id;
+      OP_REQUIRES_OK_ASYNC(task.context,
+                           task.context->allocate_output(num_input_edges + 1,
+                                                         TensorShape({}), &id),
+                           task.done_callback);
+      id->scalar<int64>()() = task.guid;
+    }
+    OP_REQUIRES_OK_ASYNC(
+        last_task_context,
+        EmitIndexTensor(last_task_context, *batch, num_input_edges),
+        last_task_callback);
+
+    // Signal done for each element of the batch. (At this point, the contexts
+    // are no longer guaranteed to remain live.)
+    for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
+      batch->mutable_task(task_idx)->done_callback();
+    }
+  }
+
+  // Emits an index tensor, which the Unbatch op will use to un-concatenate
+  // the tensor and attribute the pieces to the right batch keys. The index
+  // tensor contains, for each input: [batch_key, start_offset, end_offset]
+  // where start_offset and end_offset represent the range of entries in the
+  // concatenated tensors that belong to that input.
+  //
+  // Emits the result to the output at 'output_index' using 'context'.
+  static Status EmitIndexTensor(OpKernelContext* context, const Batch& batch,
+                                int output_index) {
+    const TensorShape index_shape({batch.num_tasks(), 3});
+    Tensor* index = nullptr;
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(output_index, index_shape, &index));
+    auto index_flat = index->shaped<int64, 2>({batch.num_tasks(), 3});
+    size_t offset = 0;
+    for (int task_idx = 0; task_idx < batch.num_tasks(); ++task_idx) {
+      const BatchTask& task = batch.task(task_idx);
+      index_flat(task_idx, 0) = task.guid;
+      index_flat(task_idx, 1) = offset;
+      index_flat(task_idx, 2) = offset + task.size();
+      offset += task.size();
+    }
+    return Status::OK();
+  }
+
+  // Looks up the batcher queue for 'queue_name'. If it did't previously exist,
+  // creates it.
+  Status LookupOrCreateBatcherQueue(const string& queue_name,
+                                    BatcherQueue** queue) {
+    mutex_lock l(batcher_queues_mu_);
+
+    auto it = batcher_queues_.find(queue_name);
+    if (it != batcher_queues_.end()) {
+      *queue = it->second.get();
+      return Status::OK();
+    }
+
+    std::unique_ptr<BatcherQueue> new_queue;
+    auto process_batch_callback = [this](std::unique_ptr<Batch> batch) {
+      ProcessBatch(std::move(batch));
+    };
+    TF_RETURN_IF_ERROR(batcher_->AddQueue(batcher_queue_options_,
+                                          process_batch_callback, &new_queue));
+    *queue = new_queue.get();
+    batcher_queues_[queue_name] = std::move(new_queue);
+    return Status::OK();
+  }
+
+  // A batch scheduler, and options for creating queues.
+  std::shared_ptr<Batcher> batcher_;
+  Batcher::QueueOptions batcher_queue_options_;
+
+  // A collection of batcher queues, keyed on queue name.
+  // TODO(olston): Garbage-collect unused queues (perhaps simply remove empty
+  // ones (with a time delay?); it's okay if they get recreated later).
+  mutable mutex batcher_queues_mu_;
+  std::map<string, std::unique_ptr<BatcherQueue>> batcher_queues_
+      GUARDED_BY(batcher_queues_mu_);
+
+  std::vector<int32> allowed_batch_sizes_;
+};
+
+class BatchKernel : public AsyncOpKernel {
+ public:
+  explicit BatchKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("container", &container_));
+    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &shared_name_));
+    // If shared_name is not supplied, use name instead (prevent collisions by
+    // default).
+    if (shared_name_.empty()) {
+      shared_name_ = name();
+    }
+    OP_REQUIRES_OK(c, c->GetAttr("batching_queue", &batcher_queue_));
+    OP_REQUIRES_OK(c, c->GetAttr("num_batch_threads", &num_batch_threads_));
+    OP_REQUIRES_OK(c, c->GetAttr("max_batch_size", &max_batch_size_));
+    OP_REQUIRES_OK(c,
+                   c->GetAttr("batch_timeout_micros", &batch_timeout_micros_));
+    OP_REQUIRES_OK(c, c->GetAttr("allowed_batch_sizes", &allowed_batch_sizes_));
+    OP_REQUIRES_OK(c, ValidateAllowedBatchSizes());
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
+    BatchResource* br;
+    std::function<Status(BatchResource * *r)> creator =
+        [this](BatchResource** r) {
+          std::unique_ptr<BatchResource> new_resource;
+          TF_RETURN_IF_ERROR(BatchResource::Create(
+              num_batch_threads_, max_batch_size_, batch_timeout_micros_,
+              allowed_batch_sizes_, &new_resource));
+          *r = new_resource.release();
+          return Status::OK();
+        };
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->resource_manager()->LookupOrCreate(
+                             container_, shared_name_, &br, creator),
+                         done);
+    const Status status =
+        br->RegisterInput(random::New64(), c, batcher_queue_, done);
+    br->Unref();
+    if (!status.ok()) {
+      OP_REQUIRES_OK_ASYNC(c, status, done);
+    }
+    // Assume br calls done, so nothing to do here.
+  }
+
+  // Validates 'allowed_batch_sizes_'. The entries must increase monotonically,
+  // and the last one must equal 'max_batch_size_'.
+  Status ValidateAllowedBatchSizes() const {
+    if (allowed_batch_sizes_.empty()) {
+      return Status::OK();
+    }
+    int32 last_size = 0;
+    for (int i = 0; i < allowed_batch_sizes_.size(); ++i) {
+      const int32 size = allowed_batch_sizes_.at(i);
+      if (i > 0 && size <= last_size) {
+        return errors::InvalidArgument(
+            "allowed_batch_sizes entries must be monotonically increasing");
+      }
+      if (i == allowed_batch_sizes_.size() - 1 && size != max_batch_size_) {
+        return errors::InvalidArgument(
+            "final entry in allowed_batch_sizes must equal max_batch_size");
+      }
+      last_size = size;
+    }
+    return Status::OK();
+  }
+
+ private:
+  string container_;
+  string shared_name_;
+  string batcher_queue_;
+  int32 num_batch_threads_;
+  int32 max_batch_size_;
+  int32 batch_timeout_micros_;
+  std::vector<int32> allowed_batch_sizes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("Batch").Device(DEVICE_CPU), BatchKernel);
+
+// A class encapsulating the state and logic for unbatching tensors.
+//
+// UnbatchResource keeps two data structures indexed by batch-key: one which has
+// the continuations for all concurrent kernels which are waiting for tensors
+// and another which has tensors which are waiting for their corresponding
+// kernels to run. Whenever a kernel runs, we either grab its tensor if it's
+// waiting already, or we insert it in the queue and then look at its tensor to
+// see if it can be used to dispatch any stored continuations.
+class UnbatchResource : public ResourceBase {
+ public:
+  explicit UnbatchResource(int32 timeout_micros)
+      : timeout_micros_(timeout_micros),
+        timeout_enforcer_(new serving::PeriodicFunction(
+            [this] { EnforceTimeout(); }, 1000 /* 1 ms */)) {}
+
+  ~UnbatchResource() override {
+    // Tear down 'timeout_enforcer_' first, since it accesses other state in
+    // this class.
+    timeout_enforcer_ = nullptr;
+  }
+
+  string DebugString() final { return "UnbatchResource"; }
+
+  Status Compute(OpKernelContext* context, AsyncOpKernel::DoneCallback done) {
+    const Tensor& data_t = context->input(0);
+    const Tensor& batch_index_t = context->input(1);
+
+    if (batch_index_t.shape().dim_size(0) > data_t.shape().dim_size(0)) {
+      return errors::InvalidArgument(
+          "Wrong shape for index tensor. Expected 0th dimension size to be no "
+          "greater than ",
+          data_t.shape().dim_size(0),
+          "; Got: ", batch_index_t.shape().dim_size(0), ".");
+    }
+    if (batch_index_t.shape().dim_size(1) != 3) {
+      return errors::InvalidArgument(
+          "Wrong shape for index tensor. Expected 1st dimension size to be 3 ; "
+          "Got: ",
+          batch_index_t.shape().dim_size(1), ".");
+    }
+
+    const int64 batch_key = context->input(2).scalar<int64>()();
+    const bool nonempty_input = batch_index_t.dim_size(0) > 0;
+
+    // If we have a non-empty tensor, slice it up.
+    // (It is important to do this outside of the critical section below.)
+    // The following variables are populated iff 'nonempty_input==true'.
+    std::vector<int64> sizes;
+    std::vector<int64> batch_keys;
+    std::vector<Tensor> split_inputs;
+    if (nonempty_input) {
+      auto batch_indices =
+          batch_index_t.shaped<int64, 2>({batch_index_t.dim_size(0), 3});
+      for (int i = 0; i < batch_index_t.dim_size(0); ++i) {
+        sizes.push_back(batch_indices(i, 2) - batch_indices(i, 1));
+        batch_keys.push_back(batch_indices(i, 0));
+      }
+
+      const DataType type = data_t.dtype();
+      switch (type) {
+#define CASE(type)                                                          \
+  case DataTypeToEnum<type>::value:                                         \
+    TF_RETURN_IF_ERROR(Split<type>(context, data_t, sizes, &split_inputs)); \
+    break;
+        TF_CALL_ALL_TYPES(CASE);
+#undef CASE
+        default:
+          return errors::InvalidArgument("Unsupported data type: ", type);
+      }
+    }
+
+    // Critical section.
+    std::vector<AsyncOpKernel::DoneCallback> done_callbacks_to_call;
+    Status status = [&]() -> Status {
+      mutex_lock ml(mu_);
+
+      // Check to see whether the tensor we want is already ready.
+      auto tensor_it = waiting_tensors_.find(batch_key);
+      if (tensor_it != waiting_tensors_.end()) {
+        context->set_output(0, tensor_it->second.tensor);
+        waiting_tensors_.erase(tensor_it);
+        done_callbacks_to_call.push_back(done);
+        return Status::OK();
+      }
+
+      const uint64 deadline_micros =
+          Env::Default()->NowMicros() + timeout_micros_;
+
+      // Add ourselves to the waitlist for tensors.
+      if (!waiting_callbacks_
+               .emplace(batch_key,
+                        WaitingCallback{deadline_micros, context, done})
+               .second) {
+        return errors::AlreadyExists(
+            "Multiple session runs with the same batch key.");
+      }
+
+      // If we have a non-empty tensor, finish the waitlisted runs,
+      // and store any remaining pieces.
+      if (nonempty_input) {
+        for (int i = 0; i < batch_keys.size(); ++i) {
+          auto runs_it = waiting_callbacks_.find(batch_keys[i]);
+          if (runs_it != waiting_callbacks_.end()) {
+            runs_it->second.context->set_output(0, split_inputs[i]);
+            done_callbacks_to_call.push_back(runs_it->second.done);
+            waiting_callbacks_.erase(runs_it);
+          } else {
+            // Note: the deadline here is in case we are arriving late and the
+            // kernel that should rendezvous with this tensor has already waited
+            // and timed out.
+            if (!waiting_tensors_
+                     .emplace(batch_keys[i],
+                              WaitingTensor{deadline_micros, split_inputs[i]})
+                     .second) {
+              return errors::AlreadyExists(
+                  "Multiple tensors returned for same batch key.");
+            }
+          }
+        }
+      }
+
+      return Status::OK();
+    }();
+
+    for (const AsyncOpKernel::DoneCallback& done_callback :
+         done_callbacks_to_call) {
+      done_callback();
+    }
+
+    return status;
+  }
+
+ private:
+  // Evicts waiting tensors and callbacks that have exceeded their deadline.
+  void EnforceTimeout() {
+    const uint64 now = Env::Default()->NowMicros();
+    std::vector<WaitingCallback> evicted_callbacks;
+
+    {
+      mutex_lock ml(mu_);
+
+      for (auto it = waiting_tensors_.begin(); it != waiting_tensors_.end();) {
+        const WaitingTensor& waiting_tensor = it->second;
+        if (waiting_tensor.deadline_micros < now) {
+          it = waiting_tensors_.erase(it);
+        } else {
+          ++it;
+        }
+      }
+
+      for (auto it = waiting_callbacks_.begin();
+           it != waiting_callbacks_.end();) {
+        const WaitingCallback& waiting_callback = it->second;
+        if (waiting_callback.deadline_micros < now) {
+          evicted_callbacks.push_back(waiting_callback);
+          it = waiting_callbacks_.erase(it);
+        } else {
+          ++it;
+        }
+      }
+    }
+
+    for (const WaitingCallback& evicted_callback : evicted_callbacks) {
+      evicted_callback.context->CtxFailureWithWarning(errors::DeadlineExceeded(
+          "Batched data did not arrive within timeout window."));
+      evicted_callback.done();
+    }
+  }
+
+  struct WaitingTensor {
+    uint64 deadline_micros;
+    Tensor tensor;
+  };
+
+  struct WaitingCallback {
+    uint64 deadline_micros;
+    OpKernelContext* context;
+    AsyncOpKernel::DoneCallback done;
+  };
+
+  const int32 timeout_micros_;
+
+  mutex mu_;
+
+  // Maps keyed by BatchKey of tensors waiting for callbacks and callbacks
+  // waiting for tensors.
+  std::unordered_map<int64, WaitingTensor> waiting_tensors_ GUARDED_BY(mu_);
+  std::unordered_map<int64, WaitingCallback> waiting_callbacks_ GUARDED_BY(mu_);
+
+  // A thread that evicts waiting tensors and callbacks that have exceeded their
+  // deadline.
+  std::unique_ptr<serving::PeriodicFunction> timeout_enforcer_;
+};
+
+class UnbatchKernel : public AsyncOpKernel {
+ public:
+  explicit UnbatchKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("container", &container_));
+    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &shared_name_));
+    // If shared_name is not supplied, use name instead (prevent collisions by
+    // default).
+    if (shared_name_.empty()) {
+      shared_name_ = name();
+    }
+    OP_REQUIRES_OK(c, c->GetAttr("timeout_micros", &timeout_micros_));
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
+    UnbatchResource* ubr;
+    std::function<Status(UnbatchResource * *r)> creator =
+        [this](UnbatchResource** r) {
+          *r = new UnbatchResource(timeout_micros_);
+          return Status::OK();
+        };
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->resource_manager()->LookupOrCreate(
+                             container_, shared_name_, &ubr, creator),
+                         done);
+    auto status = ubr->Compute(c, done);
+    ubr->Unref();
+    if (!status.ok()) {
+      OP_REQUIRES_OK_ASYNC(c, status, done);
+    }
+    // Assume ubr calls done, so nothing to do here.
+  }
+
+ private:
+  string container_;
+  string shared_name_;
+  int32 timeout_micros_;
+};
+REGISTER_KERNEL_BUILDER(Name("Unbatch").Device(DEVICE_CPU), UnbatchKernel);
+
+// A class encapsulating the state and logic for batching tensors
+// deterministically for the gradient of unbatch.
+class UnbatchGradResource : public ResourceBase {
+ public:
+  UnbatchGradResource() {}
+
+  string DebugString() final { return "UnbatchGradResource"; }
+
+  // Flushes the information for one batch, given its context and done
+  // callback. Clears all information about it from the available_tensors_.
+  Status OutputBatch(OpKernelContext* context,
+                     const AsyncOpKernel::DoneCallback& done)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    const Tensor& batch_index_t = context->input(1);
+    auto batch_index =
+        batch_index_t.shaped<int64, 2>({batch_index_t.dim_size(0), 3});
+    std::vector<Tensor> tensors;
+    for (int i = 0; i < batch_index_t.dim_size(0); ++i) {
+      auto available_it = available_tensors_.find(batch_index(i, 0));
+      if (available_it == available_tensors_.end()) {
+        return errors::Internal("bad bookkeeping of available tensors.");
+      }
+      tensors.push_back(available_it->second);
+      available_tensors_.erase(available_it);
+    }
+
+    const DataType type = tensors[0].dtype();
+    switch (type) {
+#define CASE(type)                                         \
+  case DataTypeToEnum<type>::value:                        \
+    TF_RETURN_IF_ERROR(Concat<type>(context, tensors, 0)); \
+    break;
+      TF_CALL_ALL_TYPES(CASE);
+#undef CASE
+      default:
+        return errors::InvalidArgument("Unsupported data type: ", type);
+    }
+    done();
+    return Status::OK();
+  }
+
+  // Ingests data from one invocation of the op.
+  Status Compute(OpKernelContext* context,
+                 const AsyncOpKernel::DoneCallback& done) {
+    const Tensor& data_t = context->input(0);
+    const Tensor& batch_index_t = context->input(1);
+    const Tensor& grad_t = context->input(2);
+
+    mutex_lock ml(mu_);
+
+    const int64 batch_key = context->input(3).scalar<int64>()();
+    // Mark our tensor as available.
+    if (!available_tensors_.emplace(batch_key, grad_t).second) {
+      return errors::InvalidArgument("Two runs with the same batch key.");
+    }
+
+    // Check whether we have a valid input tensor and, if so, create its
+    // dispatch logic.
+    if (data_t.NumElements() > 0) {
+      if (batch_index_t.NumElements() == 0) {
+        return errors::InvalidArgument(
+            "batch_index is empty while the tensor isn't.");
+      }
+      std::unordered_set<int64> missing_tensors;
+      const auto batch_index =
+          batch_index_t.shaped<int64, 2>({batch_index_t.dim_size(0), 3});
+      for (int i = 0; i < batch_index_t.dim_size(0); ++i) {
+        const int64 batch_key = batch_index(i, 0);
+        if (available_tensors_.find(batch_key) == available_tensors_.end()) {
+          missing_tensors.emplace(batch_key);
+        }
+      }
+      if (missing_tensors.empty()) {
+        return OutputBatch(context, done);
+      }
+      if (!available_batches_
+               .emplace(batch_key, Batch{missing_tensors, context, done})
+               .second) {
+        return errors::InvalidArgument(
+            "Batch key with valid batch used twice.");
+      }
+      for (const int64 i : missing_tensors) {
+        if (!desired_tensor_to_batch_map_.emplace(i, batch_key).second) {
+          return errors::InvalidArgument(
+              "Missing tensor wanted by more than one batch.");
+        }
+      }
+    } else {
+      // If we don't have a valid input tensor we can output an empty tensor and
+      // call our done closure.
+      TensorShape output_shape(grad_t.shape());
+      output_shape.set_dim(0, 0);
+      Tensor* output = nullptr;
+      TF_RETURN_IF_ERROR(context->allocate_output(0, output_shape, &output));
+      done();
+    }
+
+    // Search to see whether our tensor is desired by any existing batch.
+    auto desire_it = desired_tensor_to_batch_map_.find(batch_key);
+    if (desire_it != desired_tensor_to_batch_map_.end()) {
+      // Mark our tensor as no longer missing.
+      auto batch_it = available_batches_.find(desire_it->second);
+      desired_tensor_to_batch_map_.erase(desire_it);
+      if (batch_it == available_batches_.end()) {
+        return errors::InvalidArgument("Batch no longer exists.");
+      }
+      batch_it->second.missing_tensors.erase(batch_key);
+      // If all tensors are available we should concatenate them and dispatch
+      // the batch.
+      if (batch_it->second.missing_tensors.empty()) {
+        TF_RETURN_IF_ERROR(
+            OutputBatch(batch_it->second.context, batch_it->second.done));
+        available_batches_.erase(batch_it);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  mutex mu_;
+
+  // Represents a still-incomplete batch of tensors. When all tensors become
+  // available they will be concatenated in the right order and sent through the
+  // context.
+  struct Batch {
+    // Batch keys for tensors which are still missing from this batch. When this
+    // is empty the Tensors can be concatenated and forwarded.
+    std::unordered_set<int64> missing_tensors;
+
+    // Context and callback for the session responsible for finishing this
+    // batch.
+    OpKernelContext* context;
+    AsyncOpKernel::DoneCallback done;
+  };
+
+  // Map from batch key of the session which will output the batched gradients
+  // to still-incomplete batches.
+  std::unordered_map<int64, Batch> available_batches_;
+
+  // Map from batch key to tensors which are waiting for their batches to be
+  // available.
+  std::unordered_map<int64, Tensor> available_tensors_;
+
+  // Map from batch key of a tensor which is not yet available to the batch key
+  // of the batch to which it belongs.
+  std::unordered_map<int64, int64> desired_tensor_to_batch_map_;
+};
+
+class UnbatchGradKernel : public AsyncOpKernel {
+ public:
+  explicit UnbatchGradKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("container", &container_));
+    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &shared_name_));
+    // If shared_name is not supplied, use name instead (prevent collisions by
+    // default).
+    if (shared_name_.empty()) {
+      shared_name_ = name();
+    }
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
+    UnbatchGradResource* ubr;
+    std::function<Status(UnbatchGradResource * *r)> creator =
+        [this](UnbatchGradResource** r) {
+          *r = new UnbatchGradResource();
+          return Status::OK();
+        };
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->resource_manager()->LookupOrCreate(
+                             container_, shared_name_, &ubr, creator),
+                         done);
+    Status status = ubr->Compute(c, done);
+    ubr->Unref();
+    if (!status.ok()) {
+      OP_REQUIRES_OK_ASYNC(c, status, done);
+    }
+    // Assume ubr calls done, so nothing to do here.
+  }
+
+ private:
+  string container_;
+  string shared_name_;
+};
+REGISTER_KERNEL_BUILDER(Name("UnbatchGrad").Device(DEVICE_CPU),
+                        UnbatchGradKernel);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/batching/ops/batch_ops.cc b/tensorflow/contrib/batching/ops/batch_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85e0ccba4aa372bdc21fb194263569b8b787bb6c
--- /dev/null
+++ b/tensorflow/contrib/batching/ops/batch_ops.cc
@@ -0,0 +1,164 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("Batch")
+    .Input("in_tensors: T")
+    .Output("batched_tensors: T")
+    .Output("batch_index: int64")
+    .Output("id: int64")
+    .Attr("num_batch_threads: int")
+    .Attr("max_batch_size: int")
+    .Attr("batch_timeout_micros: int")
+    .Attr("allowed_batch_sizes: list(int) = []")
+    .Attr("grad_timeout_micros: int")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("batching_queue: string = ''")
+    .Attr("T: list(type)")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<shape_inference::ShapeHandle> in_shapes;
+      TF_RETURN_IF_ERROR(c->input("in_tensors", &in_shapes));
+      std::vector<shape_inference::ShapeHandle> out_shapes(in_shapes.size());
+      for (int i = 0; i < in_shapes.size(); ++i) {
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(in_shapes[i], 0, c->UnknownDim(), &out_shapes[i]));
+      }
+      TF_RETURN_IF_ERROR(c->set_output("batched_tensors", out_shapes));
+      TF_RETURN_IF_ERROR(c->set_output("id", {c->Scalar()}));
+      TF_RETURN_IF_ERROR(c->set_output(
+          "batch_index",
+          {c->MakeShape({shape_inference::DimensionOrConstant(c->UnknownDim()),
+                         shape_inference::DimensionOrConstant(3)})}));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Batches all input tensors nondeterministically.
+
+When many instances of this Op are being run concurrently with the same
+container/shared_name in the same device, some will output zero-shaped Tensors
+and others will output Tensors of size up to max_batch_size.
+
+All Tensors in in_tensors are batched together (so, for example, labels and
+features should be batched with a single instance of this operation.
+
+Each invocation of batch emits an `id` scalar which will be used to identify
+this particular invocation when doing unbatch or its gradient.
+
+Each op which emits a non-empty batch will also emit a non-empty batch_index
+Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+start, and length of elements of each set of Tensors present in batched_tensors.
+
+Batched tensors are concatenated along the first dimension, and all tensors in
+in_tensors must have the first dimension of the same size.
+
+in_tensors: The tensors to be batched.
+num_batch_threads: Number of scheduling threads for processing batches of work.
+ Determines the number of batches processed in parallel.
+max_batch_size: Batch sizes will never be bigger than this.
+batch_timeout_micros: Maximum number of microseconds to wait before outputting
+ an incomplete batch.
+allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+ nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+ batches up to one of those sizes. The entries must increase monotonically, and
+ the final entry must equal max_batch_size.
+grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+batch_index: If out_tensors is non-empty, has information to invert it.
+container: Controls the scope of sharing of this batch.
+id: always contains a scalar with a unique ID for this invocation of Batch.
+shared_name: Concurrently running instances of batch in the same device with the
+ same container and shared_name will batch their elements together. If left
+ empty, the op name will be used as the shared name.
+T: the types of tensors to be batched.
+)doc");
+
+REGISTER_OP("Unbatch")
+    .Input("batched_tensor: T")
+    .Input("batch_index: int64")
+    .Input("id: int64")
+    .Output("unbatched_tensor: T")
+    .Attr("timeout_micros: int")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle out_shape;
+      TF_RETURN_IF_ERROR(
+          c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &out_shape));
+      c->set_output(0, out_shape);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Reverses the operation of Batch for a single output Tensor.
+
+An instance of Unbatch either receives an empty batched_tensor, in which case it
+asynchronously waits until the values become available from a concurrently
+running instance of Unbatch with the same container and shared_name, or receives
+a non-empty batched_tensor in which case it finalizes all other concurrently
+running instances and outputs its own element from the batch.
+
+batched_tensor: The possibly transformed output of Batch. The size of the first
+ dimension should remain unchanged by the transformations for the operation to
+ work.
+batch_index: The matching batch_index obtained from Batch.
+id: The id scalar emitted by Batch.
+unbatched_tensor: The Tensor corresponding to this execution.
+timeout_micros: Maximum amount of time (in microseconds) to wait to receive the
+ batched input tensor associated with a given invocation of the op.
+container: Container to control resource sharing.
+shared_name: Instances of Unbatch with the same container and shared_name are
+ assumed to possibly belong to the same batch. If left empty, the op name will
+ be used as the shared name.
+)doc");
+
+REGISTER_OP("UnbatchGrad")
+    .Input("original_input: T")
+    .Input("batch_index: int64")
+    .Input("grad: T")
+    .Input("id: int64")
+    .Output("batched_grad: T")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShapeOfRank(c->Rank(c->input(2))));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Gradient of Unbatch.
+
+Acts like Batch but using the given batch_index index of batching things as they
+become available. This ensures that the gradients are propagated back in the
+same session which did the forward pass.
+
+original_input: The input to the Unbatch operation this is the gradient of.
+batch_index: The batch_index given to the Unbatch operation this is the gradient
+of.
+grad: The downstream gradient.
+id: The id scalar emitted by Batch.
+batched_grad: The return value, either an empty tensor or the batched gradient.
+container: Container to control resource sharing.
+shared_name: Instances of UnbatchGrad with the same container and shared_name
+ are assumed to possibly belong to the same batch. If left empty, the op name
+ will be used as the shared name.
+  )doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..867ee6dfbc8ecad5f0a057ec8b9ac7a3656a23a8
--- /dev/null
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -0,0 +1,138 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Operations for automatic batching and unbatching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.batching.ops import gen_batch_ops
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.contrib.batching.ops.gen_batch_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import resource_loader
+
+
+_batch_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_batch_ops.so"))
+
+
+@ops.RegisterGradient("Batch")
+def _BatchGrad(op, *out_grads):  # pylint: disable=invalid-name
+  """Gradient for batch op."""
+  gradients = []
+  for i in range(len(op.inputs)):
+    gradients.append(
+        gen_batch_ops.unbatch(
+            out_grads[i],
+            op.outputs[-2],
+            op.outputs[-1],
+            timeout_micros=op.get_attr("grad_timeout_micros"),
+            shared_name="batch_gradient_{}_{}".format(op.name, i)))
+  return gradients
+
+
+@ops.RegisterGradient("Unbatch")
+def _UnbatchGrad(op, grad):   # pylint: disable=invalid-name
+  return [
+      gen_batch_ops.unbatch_grad(
+          op.inputs[0],
+          op.inputs[1],
+          grad,
+          op.inputs[2],
+          shared_name="unbatch_gradient_{}".format(op.name)), None, None
+  ]
+
+
+def batch_function(num_batch_threads, max_batch_size, batch_timeout_micros,
+                   allowed_batch_sizes=None,
+                   grad_timeout_micros=60 * 1000 * 1000,
+                   unbatch_timeout_micros=60 * 1000 * 1000):
+  """Batches the computation done by the decorated function.
+
+  So, for example, in the following code
+
+  ```
+  @batch_function(1, 2, 3)
+  def layer(a):
+    return tf.matmul(a, a)
+
+  b = layer(w)
+  ```
+
+  if more than one session.run call is simultaneously trying to compute `b`
+  the values of `w` will be gathered, non-deterministically concatenated
+  along the first axis, and only one thread will run the computation. See the
+  documentation of the `Batch` op for more details.
+
+  Assumes that all arguments of the decorated function are Tensors which will
+  be batched along their first dimension.
+
+  SparseTensor is not supported. The return value of the decorated function
+  must be a Tensor or a list/tuple of Tensors.
+
+  Args:
+    num_batch_threads: Number of scheduling threads for processing batches
+     of work. Determines the number of batches processed in parallel.
+    max_batch_size: Batch sizes will never be bigger than this.
+    batch_timeout_micros: Maximum number of microseconds to wait before
+     outputting an incomplete batch.
+    allowed_batch_sizes: Optional list of allowed batch sizes. If left empty,
+     does nothing. Otherwise, supplies a list of batch sizes, causing the op
+     to pad batches up to one of those sizes. The entries must increase
+     monotonically, and the final entry must equal max_batch_size.
+    grad_timeout_micros: The timeout to use for the gradient. See the
+     documentation of the unbatch op for more details. Defaults to 60s.
+    unbatch_timeout_micros: The timeout to use for unbatching. See the
+     documentation of the unbatch op for more details. Defaults to 60s.
+
+  Returns:
+    The decorated function will return the unbatched computation output Tensors.
+  """
+  def decorator(f):  # pylint: disable=missing-docstring
+    def decorated(*args):
+      with ops.name_scope("batch") as name:
+        for a in args:
+          if not isinstance(a, ops.Tensor):
+            raise ValueError("All arguments to functions decorated with "
+                             "`batch_function`  are supposed to be Tensors; "
+                             "found %s" % repr(a))
+        batched_tensors, batch_index, id_t = gen_batch_ops.batch(
+            args,
+            num_batch_threads=num_batch_threads,
+            max_batch_size=max_batch_size,
+            batch_timeout_micros=batch_timeout_micros,
+            allowed_batch_sizes=allowed_batch_sizes,
+            grad_timeout_micros=grad_timeout_micros,
+            shared_name=name)
+        outputs = f(*batched_tensors)
+        if isinstance(outputs, ops.Tensor):
+          outputs_list = [outputs]
+        else:
+          outputs_list = outputs
+        with ops.name_scope("unbatch") as unbatch_name:
+          unbatched = [
+              gen_batch_ops.unbatch(t, batch_index, id_t,
+                                    timeout_micros=unbatch_timeout_micros,
+                                    shared_name=unbatch_name)
+              for t in outputs_list]
+        if isinstance(outputs, ops.Tensor):
+          return unbatched[0]
+        return unbatched
+    return decorated
+  return decorator
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fac7aff29f79fa18fa5f7e596db8afedabaa8993
--- /dev/null
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -0,0 +1,276 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for the currently experimental in-graph batch ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+import time
+
+from tensorflow.contrib.batching.python.ops import batch_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import script_ops
+from tensorflow.python.platform import test
+
+
+def delayed_plus1(x):
+  """Sleeps for 100ms then returns x+1."""
+  time.sleep(0.1)
+  return x + 1
+
+
+class BatchOpsTest(test.TestCase):
+  """Tests for batch_ops.{un,}batch."""
+
+  def testBasicBatch(self):
+    """Tests that a single batched tensor executes together and only once."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, index, _ = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=2,
+          batch_timeout_micros=36000000, grad_timeout_micros=0,
+          batching_queue="")
+      thread_results = []
+
+      def worker():
+        thread_results.extend(
+            sess.run([batched, index], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([batched, index], feed_dict={inp: [2]})
+      worker_thread.join()
+
+      # At this point either the thread or the main did the batch and the other
+      # should have empty results.
+      if list(thread_results[0][0]):
+        batch_t = thread_results[0][0]
+        index_t = thread_results[1]
+        empty_b = main_results[0][0]
+        empty_m = main_results[1]
+      else:
+        batch_t = main_results[0][0]
+        index_t = main_results[1]
+        empty_b = thread_results[0][0]
+        empty_m = thread_results[1]
+
+      # Check that both the inputs made it out exactly once.
+      self.assertAllEqual(sorted(batch_t), (1, 2))
+      # Check that we get 2 rows in the index tensor.
+      self.assertEqual(len(index_t), 2)
+      # Check that the other ones are empty.
+      self.assertEqual(len(empty_b), 0)
+      self.assertEqual(len(empty_m), 0)
+
+  def testBatchWithPadding(self):
+    """Test that batching with padding up to an allowed batch size works."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[2])
+      batched, index, _ = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=10,
+          batch_timeout_micros=100000,  # 100ms
+          allowed_batch_sizes=[5, 10],
+          grad_timeout_micros=0, batching_queue="")
+      thread_results = []
+
+      def worker():
+        thread_results.extend(
+            sess.run([batched, index], feed_dict={inp: [1, 3]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([batched, index], feed_dict={inp: [2, 4]})
+      worker_thread.join()
+
+      # At this point either the thread or the main did the batch and the other
+      # should have empty results.
+      if list(thread_results[0][0]):
+        batch_t = thread_results[0][0]
+      else:
+        batch_t = main_results[0][0]
+
+      # Check that the batch tensor incorporates the padding.
+      self.assertEqual(len(batch_t), 5)
+
+  def testMultipleBatch(self):
+    """Tests that multiple batched tensors execute together."""
+    with self.test_session() as sess:
+      inp0 = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      inp1 = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, _, _ = batch_ops.batch(
+          [inp0, inp1],
+          num_batch_threads=1,
+          max_batch_size=2,
+          batch_timeout_micros=36000000,
+          grad_timeout_micros=0,
+          batching_queue="")
+      thread_results = []
+
+      def worker():
+        thread_results.extend(
+            sess.run([batched], feed_dict={inp0: [1],
+                                           inp1: [2]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([batched], feed_dict={inp0: [2], inp1: [3]})
+      worker_thread.join()
+
+      # At this point either the thread or the main did the batch and the other
+      # should have empty results.
+      if list(thread_results[0][0]):
+        batch_t = thread_results[0]
+        empty_t = main_results[0]
+      else:
+        batch_t = main_results[0]
+        empty_t = thread_results[0]
+
+      # Assert that the tensors were batched together.
+      self.assertAllEqual(sorted(batch_t[0]), [1, 2])
+      self.assertAllEqual(sorted(batch_t[1]), [2, 3])
+      self.assertAllEqual(empty_t[0], [])
+      self.assertAllEqual(empty_t[1], [])
+
+  def testIllegalBatchDifferentDim0Sizes(self):
+    """Tests illegally feeding tensors with different dim0 sizes."""
+    with self.test_session() as sess:
+      inp0 = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      inp1 = array_ops.placeholder(dtype=dtypes.int32, shape=[2])
+      batched, index, _ = batch_ops.batch(
+          [inp0, inp1], num_batch_threads=1, max_batch_size=2,
+          batch_timeout_micros=0, grad_timeout_micros=0, batching_queue="")
+      with self.assertRaises(Exception) as raised:
+        _ = sess.run([batched, index], feed_dict={inp0: [0], inp1: [1, 2]})
+      self.assertGreater(
+          raised.exception.message.find("must have equal 0th-dimension size"),
+          0)
+
+  def testBasicUnbatch(self):
+    """Tests that batch and unbatch work together."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, index, id_t = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=10,
+          batch_timeout_micros=100000,  # 100ms
+          allowed_batch_sizes=[3, 10],
+          grad_timeout_micros=0, batching_queue="")
+      computation = batched[0] + 1
+      result = batch_ops.unbatch(computation, index, id_t,
+                                 timeout_micros=1000000, shared_name="unbatch")
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
+  def testBasicUnbatchDecorated(self):
+    """Tests that the batch_function decorator works."""
+    with self.test_session() as sess:
+      @batch_ops.batch_function(1, 10, 100000)
+      def computation(in_t):
+        return in_t + 1
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      result = computation(inp)
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
+  def testUnbatchTimeout(self):
+    """Tests that the unbatch timeout works."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, index, id_t = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=2,
+          batch_timeout_micros=36000000, grad_timeout_micros=0,
+          batching_queue="")
+      computation = batched[0] + 1
+      timeout_micros = 10
+      result = batch_ops.unbatch(computation, index, id_t, timeout_micros,
+                                 shared_name="shared_unbatch")
+      # Set up a parallel pipeline that delays the computation, but uses the
+      # same unbatch resource object as the non-delayed pipeline.
+      computation_delayed = script_ops.py_func(delayed_plus1,
+                                               [batched[0]],
+                                               dtypes.int32)
+      result_delayed = batch_ops.unbatch(computation_delayed,
+                                         index,
+                                         id_t,
+                                         timeout_micros,
+                                         shared_name="shared_unbatch")
+
+      thread_results = []
+      def worker():
+        # A first call using the non-delayed pipeline. The batcher will send an
+        # empty tensor along the non-delayed pipeline.
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      time.sleep(0.1)  # Ensure the thread's call starts first.
+      # A second call using the delayed pipeline.  The batcher will send the
+      # batched tensor along the delayed pipeline, thus delaying the arrival of
+      # the batched tensor at the unbatch op, relative to the empty tensor.
+      #
+      # TODO(olston, apassos): Avoid relying on the order in which the batch op
+      # emits the empty tensor versus the batched one.
+      _ = sess.run([result_delayed], feed_dict={inp: [2]})
+      worker_thread.join()
+      # The thread's call should hit the timeout, and thus get 0 results.
+      self.assertEqual(len(thread_results), 0)
+
+  def testUnbatchGrad(self):
+    """Tests that batch and unbatch are differentiable."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      batched, index, id_t = batch_ops.batch(
+          [inp], num_batch_threads=1, max_batch_size=2,
+          batch_timeout_micros=36000000, grad_timeout_micros=1000000,
+          batching_queue="")
+      computation = batched[0] * batched[0]
+      result = batch_ops.unbatch(computation, index, id_t,
+                                 timeout_micros=1000000, shared_name="unbatch")
+      grad = gradients_impl.gradients(result, inp)
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([grad], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([grad], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [4])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler.h b/tensorflow/contrib/batching/shared_batch_scheduler.h
index 2a49250041cdc260cad3553152be189a5527f7ae..41a3f99137ade2552432fee62ddce17d064148a4 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/shared_batch_scheduler.h
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc b/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
index a793ed3ccd6da5a580436d1fc3dfc3a1a304224d..809958c737e6b2b68422c162471b25dea51636f3 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/test_util/fake_clock_env.cc b/tensorflow/contrib/batching/test_util/fake_clock_env.cc
index 4a6fa338a13ed3b02de4d911f2c7b6a3c6a51f7e..166d6703bde1054a4a44842ecea382b5a1fb79e7 100644
--- a/tensorflow/contrib/batching/test_util/fake_clock_env.cc
+++ b/tensorflow/contrib/batching/test_util/fake_clock_env.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/test_util/fake_clock_env.h b/tensorflow/contrib/batching/test_util/fake_clock_env.h
index 78b66bce705d027927c926ddd37cbe55df10455e..35cafcb73c51feb4e9e15a61d1830c8ef6bc3e0f 100644
--- a/tensorflow/contrib/batching/test_util/fake_clock_env.h
+++ b/tensorflow/contrib/batching/test_util/fake_clock_env.h
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
index fe1660ffd1ced7bd74249c43fac61ea65949094d..e69d6ecd8fa1eddb557472c317fce206f7c490aa 100644
--- a/tensorflow/contrib/batching/util/BUILD
+++ b/tensorflow/contrib/batching/util/BUILD
@@ -22,11 +22,21 @@ filegroup(
 )
 
 cc_library(
-    name = "periodic_function",
+    name = "periodic_function_dynamic",
     srcs = ["periodic_function.cc"],
     hdrs = ["periodic_function.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "periodic_function",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":periodic_function_dynamic",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/contrib/batching/util/periodic_function.cc b/tensorflow/contrib/batching/util/periodic_function.cc
index e15084787827303edd23031353c6a4af5267a001..b7e4838da50c2daf70a5b2c7b7f630caa0be96fa 100644
--- a/tensorflow/contrib/batching/util/periodic_function.cc
+++ b/tensorflow/contrib/batching/util/periodic_function.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/util/periodic_function.h b/tensorflow/contrib/batching/util/periodic_function.h
index b2dde8df7a4adcf1c9b8d3e244df172d0af33db9..2c032d802fe5f23a267db28dc869a253f16afc34 100644
--- a/tensorflow/contrib/batching/util/periodic_function.h
+++ b/tensorflow/contrib/batching/util/periodic_function.h
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/batching/util/periodic_function_test.cc b/tensorflow/contrib/batching/util/periodic_function_test.cc
index ad0a0a17472b9d1f7b31a9bf087d707b6d0b54bf..15179611160e1962bbd28b03ddbaa2eec35eb8ea 100644
--- a/tensorflow/contrib/batching/util/periodic_function_test.cc
+++ b/tensorflow/contrib/batching/util/periodic_function_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
index d98d4e737c32dd1d172fc9fef92786b717924d60..6cdaa3187054daa278dc7342626b089f9655457b 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
@@ -95,7 +95,7 @@ class ElboRatioTest(test.TestCase):
           n=n_samples,
           form=entropy.ELBOForms.sample,
           seed=42)
-      actual_kl = distributions.kl(q, p)
+      actual_kl = distributions.kl_divergence(q, p)
 
       # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
       # pass.
@@ -123,7 +123,7 @@ class ElboRatioTest(test.TestCase):
           n=n_samples,
           form=entropy.ELBOForms.analytic_entropy,
           seed=42)
-      actual_kl = distributions.kl(q, p)
+      actual_kl = distributions.kl_divergence(q, p)
 
       # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
       # pass.
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
index 81e40dbe5ecfe0cff958c22b00889f3e059cce0b..6d0cff4678972719cb5c565bc409041e298beadb 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
@@ -19,16 +19,16 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.contrib import distributions as distributions_lib
+
 from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
 from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.platform import test
 
-distributions = distributions_lib
 sge = stochastic_gradient_estimators
 st = stochastic_tensor_impl
 
@@ -42,21 +42,18 @@ class StochasticTensorTest(test.TestCase):
       sigma2 = constant_op.constant([0.1, 0.2, 0.3])
 
       prior_default = st.StochasticTensor(
-          distributions.Normal(
-              loc=mu, scale=sigma))
+          normal.Normal(loc=mu, scale=sigma))
       self.assertTrue(isinstance(prior_default.value_type, st.SampleValue))
       prior_0 = st.StochasticTensor(
-          distributions.Normal(
-              loc=mu, scale=sigma),
+          normal.Normal(loc=mu, scale=sigma),
           dist_value_type=st.SampleValue())
       self.assertTrue(isinstance(prior_0.value_type, st.SampleValue))
 
       with st.value_type(st.SampleValue()):
-        prior = st.StochasticTensor(distributions.Normal(loc=mu, scale=sigma))
+        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
         self.assertTrue(isinstance(prior.value_type, st.SampleValue))
         likelihood = st.StochasticTensor(
-            distributions.Normal(
-                loc=prior, scale=sigma2))
+            normal.Normal(loc=prior, scale=sigma2))
         self.assertTrue(isinstance(likelihood.value_type, st.SampleValue))
 
       coll = ops.get_collection(st.STOCHASTIC_TENSOR_COLLECTION)
@@ -85,7 +82,7 @@ class StochasticTensorTest(test.TestCase):
       sigma = constant_op.constant([1.1, 1.2, 1.3])
 
       with st.value_type(st.MeanValue()):
-        prior = st.StochasticTensor(distributions.Normal(loc=mu, scale=sigma))
+        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
         self.assertTrue(isinstance(prior.value_type, st.MeanValue))
 
       prior_mean = prior.mean()
@@ -102,8 +99,7 @@ class StochasticTensorTest(test.TestCase):
 
       with st.value_type(st.SampleValue()):
         prior_single = st.StochasticTensor(
-            distributions.Normal(
-                loc=mu, scale=sigma))
+            normal.Normal(loc=mu, scale=sigma))
 
       prior_single_value = prior_single.value()
       self.assertEqual(prior_single_value.get_shape(), (2, 3))
@@ -113,8 +109,7 @@ class StochasticTensorTest(test.TestCase):
 
       with st.value_type(st.SampleValue(1)):
         prior_single = st.StochasticTensor(
-            distributions.Normal(
-                loc=mu, scale=sigma))
+            normal.Normal(loc=mu, scale=sigma))
         self.assertTrue(isinstance(prior_single.value_type, st.SampleValue))
 
       prior_single_value = prior_single.value()
@@ -125,8 +120,7 @@ class StochasticTensorTest(test.TestCase):
 
       with st.value_type(st.SampleValue(2)):
         prior_double = st.StochasticTensor(
-            distributions.Normal(
-                loc=mu, scale=sigma))
+            normal.Normal(loc=mu, scale=sigma))
 
       prior_double_value = prior_double.value()
       self.assertEqual(prior_double_value.get_shape(), (2, 2, 3))
@@ -139,10 +133,10 @@ class StochasticTensorTest(test.TestCase):
       mu = [0.0, -1.0, 1.0]
       sigma = constant_op.constant([1.1, 1.2, 1.3])
       with st.value_type(st.MeanValue()):
-        prior = st.StochasticTensor(distributions.Normal(loc=mu, scale=sigma))
+        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
         entropy = prior.entropy()
         deep_entropy = prior.distribution.entropy()
-        expected_deep_entropy = distributions.Normal(
+        expected_deep_entropy = normal.Normal(
             loc=mu, scale=sigma).entropy()
         entropies = sess.run([entropy, deep_entropy, expected_deep_entropy])
         self.assertAllEqual(entropies[2], entropies[0])
@@ -155,7 +149,7 @@ class StochasticTensorTest(test.TestCase):
 
       # With default
       with st.value_type(st.MeanValue(stop_gradient=True)):
-        dt = st.StochasticTensor(distributions.Normal(loc=mu, scale=sigma))
+        dt = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
       loss = dt.loss([constant_op.constant(2.0)])
       self.assertTrue(loss is not None)
       self.assertAllClose(
@@ -163,8 +157,7 @@ class StochasticTensorTest(test.TestCase):
 
       # With passed-in loss_fn.
       dt = st.StochasticTensor(
-          distributions.Normal(
-              loc=mu, scale=sigma),
+          normal.Normal(loc=mu, scale=sigma),
           dist_value_type=st.MeanValue(stop_gradient=True),
           loss_fn=sge.get_score_function_with_constant_baseline(
               baseline=constant_op.constant(8.0)))
@@ -199,8 +192,7 @@ class ObservedStochasticTensorTest(test.TestCase):
       sigma = constant_op.constant([1.1, 1.2, 1.3])
       obs = array_ops.zeros((2, 3))
       z = st.ObservedStochasticTensor(
-          distributions.Normal(
-              loc=mu, scale=sigma), value=obs)
+          normal.Normal(loc=mu, scale=sigma), value=obs)
       [obs_val, z_val] = sess.run([obs, z.value()])
       self.assertAllEqual(obs_val, z_val)
 
@@ -212,15 +204,13 @@ class ObservedStochasticTensorTest(test.TestCase):
     sigma = array_ops.placeholder(dtypes.float32)
     obs = array_ops.placeholder(dtypes.float32)
     z = st.ObservedStochasticTensor(
-        distributions.Normal(
-            loc=mu, scale=sigma), value=obs)
+        normal.Normal(loc=mu, scale=sigma), value=obs)
 
     mu2 = array_ops.placeholder(dtypes.float32, shape=[None])
     sigma2 = array_ops.placeholder(dtypes.float32, shape=[None])
     obs2 = array_ops.placeholder(dtypes.float32, shape=[None, None])
     z2 = st.ObservedStochasticTensor(
-        distributions.Normal(
-            loc=mu2, scale=sigma2), value=obs2)
+        normal.Normal(loc=mu2, scale=sigma2), value=obs2)
 
     coll = ops.get_collection(st.STOCHASTIC_TENSOR_COLLECTION)
     self.assertEqual(coll, [z, z2])
@@ -231,22 +221,18 @@ class ObservedStochasticTensorTest(test.TestCase):
     self.assertRaises(
         ValueError,
         st.ObservedStochasticTensor,
-        distributions.Normal(
-            loc=mu, scale=sigma),
+        normal.Normal(loc=mu, scale=sigma),
         value=array_ops.zeros((3,)))
     self.assertRaises(
         ValueError,
         st.ObservedStochasticTensor,
-        distributions.Normal(
-            loc=mu, scale=sigma),
+        normal.Normal(loc=mu, scale=sigma),
         value=array_ops.zeros((3, 1)))
     self.assertRaises(
         ValueError,
         st.ObservedStochasticTensor,
-        distributions.Normal(
-            loc=mu, scale=sigma),
-        value=array_ops.zeros(
-            (1, 2), dtype=dtypes.int32))
+        normal.Normal(loc=mu, scale=sigma),
+        value=array_ops.zeros((1, 2), dtype=dtypes.int32))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
index a46d755897e61ec77bd0af1d94c8504d200c49e3..fff6b74b2efed27abd7b25cbe0e8e8b3904767e1 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
@@ -22,12 +22,12 @@ from tensorflow.contrib import distributions as distributions_lib
 from tensorflow.contrib import layers
 from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
 from tensorflow.contrib.bayesflow.python.ops import variational_inference_impl
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import normal
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.platform import test
 
 st = stochastic_tensor
@@ -68,7 +68,7 @@ class VariationalInferenceTest(test.TestCase):
   def testDefaultVariationalAndPrior(self):
     _, prior, variational, _, log_likelihood = mini_vae()
     elbo = vi.elbo(log_likelihood)
-    expected_elbo = log_likelihood - kullback_leibler.kl(
+    expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
         variational.distribution, prior)
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
@@ -80,7 +80,7 @@ class VariationalInferenceTest(test.TestCase):
       prior = normal.Normal(loc=3., scale=2.)
       elbo = vi.elbo(
           log_likelihood, variational_with_prior={variational: prior})
-      expected_elbo = log_likelihood - kullback_leibler.kl(
+      expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
           variational.distribution, prior)
       sess.run(variables.global_variables_initializer())
       self.assertAllEqual(*sess.run([expected_elbo, elbo]))
@@ -121,7 +121,7 @@ class VariationalInferenceTest(test.TestCase):
 
     # No analytic KL available between prior and variational distributions.
     with self.assertRaisesRegexp(NotImplementedError, "No KL"):
-      distributions.kl(variational.distribution, prior)
+      distributions.kl_divergence(variational.distribution, prior)
 
     elbo = vi.elbo(
         variational_with_prior={variational: prior},
diff --git a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py b/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
index ef9fb730258ac52ae6b36554939f3490421ce0c5..f155de5032be8fc4477e0c71ca634a32c0d922d1 100644
--- a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
@@ -84,8 +84,9 @@ def elbo_ratio(log_p,
   KL[q || p] = E[ Log[q(Z)] - Log[p(Z)] ]
   ```
 
-  Note that if `p` is a `Distribution`, then `distributions.kl(q, p)` may be
-  defined and available as an exact result.
+  Note that if `p` is a `Distribution`, then
+  `distributions.kl_divergence(q, p)` may be defined and available as an
+  exact result.
 
   #### ELBO
 
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 55e0e6d57b32606012574f9df82c8ee67c46fabb..3590f940acfb05ee7a13f59837f6a5ca90c41cb5 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -177,7 +177,7 @@ def _logspace_mean(log_values):
       `Log[Mean[values]]`.
   """
   # center = Max[Log[values]],  with stop-gradient
-  # The center hopefully keep the exponentiated term small.  It is cancelled
+  # The center hopefully keep the exponentiated term small.  It is canceled
   # from the final result, so putting stop gradient on it will not change the
   # final result.  We put stop gradient on to eliminate unnecessary computation.
   center = array_ops.stop_gradient(_sample_max(log_values))
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py b/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
index b810ad3093e6b8fc19496dab37c15da280f2fe62..ce5fdd98c69ca6b3482bfafa8859accdf8a78749 100644
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
@@ -48,9 +48,9 @@ import threading
 import six
 
 from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators as sge
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import distribution
 
 STOCHASTIC_TENSOR_COLLECTION = "_stochastic_tensor_collection_"
 
diff --git a/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py b/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
index 6a8577234f2a47022cb2aa0fb2f44870c5c6f6db..8d932a7c340e21da012d4ab93883735b13e01175 100644
--- a/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
@@ -28,10 +28,10 @@ from __future__ import print_function
 
 from tensorflow.contrib.bayesflow.python.ops import stochastic_graph_impl as sg
 from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl as st
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import tf_logging as logging
 
 VI_PRIORS = "__vi_priors__"
@@ -259,7 +259,7 @@ def _elbo(form, log_likelihood, log_joint, variational_with_prior,
     kl = None
     if log_joint is None and form in {ELBOForms.default, ELBOForms.analytic_kl}:
       try:
-        kl = kullback_leibler.kl(q, p)
+        kl = kullback_leibler.kl_divergence(q, p)
         logging.info("Using analytic KL between q:%s, p:%s", q, p)
       except NotImplementedError as e:
         if form == ELBOForms.analytic_kl:
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index 714bd324c2ae43140e122dfc0c7b9e03aa55c78e..011c02d720f7ab01de72039d4b4603194c7aa9ee 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -160,3 +160,90 @@ cc_test(
         "//tensorflow/core:test_main",
     ],
 )
+
+cc_library(
+    name = "models",
+    srcs = ["models/multiple_additive_trees.cc"],
+    hdrs = ["models/multiple_additive_trees.h"],
+    deps = [
+        ":trees",
+        ":utils",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_test(
+    name = "multiple_additive_trees_test",
+    size = "small",
+    srcs = ["models/multiple_additive_trees_test.cc"],
+    deps = [
+        ":batch_features_testutil",
+        ":models",
+        ":random_tree_gen",
+        "//tensorflow/contrib/boosted_trees/resources:decision_tree_ensemble_resource",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "trees",
+    srcs = ["trees/decision_tree.cc"],
+    hdrs = ["trees/decision_tree.h"],
+    deps = [
+        ":utils",
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+)
+
+cc_test(
+    name = "trees_test",
+    size = "small",
+    srcs = ["trees/decision_tree_test.cc"],
+    deps = [
+        ":trees",
+        ":utils",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "batch_features_testutil",
+    testonly = 1,
+    srcs = ["testutil/batch_features_testutil.cc"],
+    hdrs = ["testutil/batch_features_testutil.h"],
+    deps = [
+        ":utils",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "random_tree_gen",
+    srcs = ["testutil/random_tree_gen.cc"],
+    hdrs = ["testutil/random_tree_gen.h"],
+    deps = [
+        "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_binary(
+    name = "random_tree_gen_main",
+    srcs = ["testutil/random_tree_gen_main.cc"],
+    deps = [
+        ":random_tree_gen",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+    ],
+)
diff --git a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc
new file mode 100644
index 0000000000000000000000000000000000000000..16bffd9beccfad352820c805e08bec71f3705f42
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc
@@ -0,0 +1,140 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h"
+#include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/batch_features.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/parallel_for.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace models {
+
+namespace {
+void CalculateTreesToKeep(
+    const boosted_trees::trees::DecisionTreeEnsembleConfig& config,
+    const std::vector<int32>& trees_to_drop, const int32 num_trees,
+    const bool only_finalized, std::vector<int32>* trees_to_keep) {
+  trees_to_keep->reserve(num_trees - trees_to_drop.size());
+
+  int32 index = 0;
+  // This assumes that trees_to_drop is a sorted list of tree ids.
+  for (int32 tree = 0; tree < num_trees; ++tree) {
+    if ((!trees_to_drop.empty() && index < trees_to_drop.size() &&
+         trees_to_drop[index] == tree) ||
+        (only_finalized && config.tree_metadata_size() > 0 &&
+         !config.tree_metadata(tree).is_finalized())) {
+      ++index;
+      continue;
+    }
+    trees_to_keep->push_back(tree);
+  }
+}
+
+void UpdatePredictions(
+    const int32 index_1, const int32 index_2, const float value,
+    tensorflow::TTypes<float>::Matrix* output_predictions,
+    tensorflow::TTypes<float>::Matrix* additional_output_predictions) {
+  (*output_predictions)(index_1, index_2) += value;
+
+  if (additional_output_predictions != nullptr) {
+    (*additional_output_predictions)(index_1, index_2) += value;
+  }
+}
+
+void UpdatePredictionsBasedOnTree(
+    const boosted_trees::trees::DecisionTreeEnsembleConfig& config,
+    const int32 tree_idx, const boosted_trees::utils::Example& example,
+    tensorflow::TTypes<float>::Matrix* output_predictions,
+    tensorflow::TTypes<float>::Matrix* additional_output_predictions) {
+  const boosted_trees::trees::DecisionTreeConfig& tree = config.trees(tree_idx);
+  const float tree_weight = config.tree_weights(tree_idx);
+  const int leaf_idx = trees::DecisionTree::Traverse(tree, 0, example);
+  QCHECK(leaf_idx >= 0) << "Invalid tree: " << tree.DebugString();
+  const auto& leaf_node = tree.nodes(leaf_idx);
+  QCHECK(leaf_node.has_leaf())
+      << "Invalid leaf node: " << leaf_node.DebugString();
+  if (leaf_node.leaf().has_sparse_vector()) {
+    const auto& leaf = leaf_node.leaf().sparse_vector();
+    QCHECK_EQ(leaf.index_size(), leaf.value_size());
+    for (size_t class_idx = 0; class_idx < leaf.index_size(); ++class_idx) {
+      const float value = tree_weight * leaf.value(class_idx);
+
+      UpdatePredictions(example.example_idx, leaf.index(class_idx), value,
+                        output_predictions, additional_output_predictions);
+    }
+  } else {
+    QCHECK(leaf_node.leaf().has_vector()) << "Unknown leaf type";
+    const auto& leaf = leaf_node.leaf().vector();
+    for (size_t i = 0; i < leaf.value_size(); ++i) {
+      const float value = tree_weight * leaf.value(i);
+      UpdatePredictions(example.example_idx, i, value, output_predictions,
+                        additional_output_predictions);
+    }
+  }
+}
+
+}  // namespace
+
+void MultipleAdditiveTrees::Predict(
+    const boosted_trees::trees::DecisionTreeEnsembleConfig& config,
+    const bool only_finalized_trees, const std::vector<int32>& trees_to_drop,
+    const boosted_trees::utils::BatchFeatures& features,
+    tensorflow::thread::ThreadPool* worker_threads,
+    tensorflow::TTypes<float>::Matrix output_predictions,
+    tensorflow::TTypes<float>::Matrix no_dropout_predictions) {
+  // Zero out predictions as the model is additive.
+  output_predictions.setZero();
+  no_dropout_predictions.setZero();
+
+  // Get batch size.
+  const int64 batch_size = features.batch_size();
+  if (batch_size <= 0) {
+    return;
+  }
+
+  // Prepare the list of trees to keep.
+  std::vector<int32> trees_to_keep;
+  CalculateTreesToKeep(config, trees_to_drop, config.trees_size(),
+                       only_finalized_trees, &trees_to_keep);
+
+  // Lambda for doing a block of work.
+  auto update_predictions = [&config, &features, &trees_to_keep, &trees_to_drop,
+                             &output_predictions,
+                             &no_dropout_predictions](int64 start, int64 end) {
+    auto examples_iterable = features.examples_iterable(start, end);
+    for (const auto& example : examples_iterable) {
+      for (const int32 tree_idx : trees_to_keep) {
+        UpdatePredictionsBasedOnTree(config, tree_idx, example,
+                                     &output_predictions,
+                                     &no_dropout_predictions);
+      }
+
+      // Now do predictions for dropped trees
+      for (const int32 tree_idx : trees_to_drop) {
+        UpdatePredictionsBasedOnTree(config, tree_idx, example,
+                                     &no_dropout_predictions, nullptr);
+      }
+    }
+  };
+
+  // TODO(salehay): parallelize this for low latency in serving path where
+  // batch size tends to be small but ensemble size tends to be large.
+  boosted_trees::utils::ParallelFor(batch_size, worker_threads->NumThreads(),
+                                    worker_threads, update_predictions);
+}
+
+}  // namespace models
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
new file mode 100644
index 0000000000000000000000000000000000000000..fedade2026137ce43ff6b1cecd21f1e6c1461960
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
@@ -0,0 +1,50 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
+
+#include <vector>
+
+#include "tensorflow/contrib/boosted_trees/lib/utils/batch_features.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"  // NOLINT
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace models {
+
+// Multiple additive trees prediction model.
+// This class does not hold state and is thread safe.
+class MultipleAdditiveTrees {
+ public:
+  // Predict runs tree ensemble on the given batch and updates
+  // output predictions accordingly. The method also returns predictions that
+  // we would get if no dropout was applied.
+  static void Predict(
+      const boosted_trees::trees::DecisionTreeEnsembleConfig& config,
+      const bool only_finalized_trees, const std::vector<int32>& trees_to_drop,
+      const boosted_trees::utils::BatchFeatures& features,
+      thread::ThreadPool* const thread_pool,
+      TTypes<float>::Matrix output_predictions,
+      TTypes<float>::Matrix no_dropout_predictions);
+};
+
+}  // namespace models
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_MODELS_MULTIPLE_ADDITIVE_TREES_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees_test.cc b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f0924b48f2a57c5ba8af1e564e344e8ffa1b676
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees_test.cc
@@ -0,0 +1,381 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h"
+
+#include "tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h"
+#include "tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h"
+#include "tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+using boosted_trees::trees::DecisionTreeEnsembleConfig;
+using test::AsTensor;
+
+namespace boosted_trees {
+namespace models {
+namespace {
+
+const int32 kNumThreadsMultiThreaded = 6;
+const int32 kNumThreadsSingleThreaded = 1;
+
+class MultipleAdditiveTreesTest : public ::testing::Test {
+ protected:
+  MultipleAdditiveTreesTest() : batch_features_(2) {
+    // Create a batch of two examples having one dense feature each.
+    // The shape of the dense matrix is therefore 2x1 as in one row per example
+    // and one column per feature per example.
+    auto dense_matrix = test::AsTensor<float>({7.0f, -2.0f}, {2, 1});
+    TF_EXPECT_OK(
+        batch_features_.Initialize({dense_matrix}, {}, {}, {}, {}, {}, {}));
+  }
+
+  boosted_trees::utils::BatchFeatures batch_features_;
+};
+
+TEST_F(MultipleAdditiveTreesTest, Empty) {
+  // Create empty tree ensemble.
+  DecisionTreeEnsembleConfig tree_ensemble_config;
+  auto output_tensor = AsTensor<float>({9.0f, 23.0f}, {2, 1});
+  auto output_matrix = output_tensor.matrix<float>();
+  auto no_dropout_output_matrix = output_tensor.matrix<float>();
+
+  // Predict for both instances.
+  tensorflow::thread::ThreadPool threads(tensorflow::Env::Default(), "test",
+                                         kNumThreadsSingleThreaded);
+  MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                 false,  // include non-finalized trees
+                                 {}, batch_features_, &threads, output_matrix,
+                                 no_dropout_output_matrix);
+  EXPECT_EQ(0, output_matrix(0, 0));
+  EXPECT_EQ(0, output_matrix(1, 0));
+
+  // There was no dropout
+  for (int i = 0; i < 2; ++i) {
+    EXPECT_EQ(output_matrix(i, 0), no_dropout_output_matrix(i, 0));
+  }
+}
+
+TEST_F(MultipleAdditiveTreesTest, SingleClass) {
+  // Add one bias and one stump to ensemble for a single class.
+  DecisionTreeEnsembleConfig tree_ensemble_config;
+  auto* tree1 = tree_ensemble_config.add_trees();
+  auto* bias_leaf = tree1->add_nodes()->mutable_leaf()->mutable_sparse_vector();
+  bias_leaf->add_index(0);
+  bias_leaf->add_value(-0.4f);
+  auto* tree2 = tree_ensemble_config.add_trees();
+  auto* dense_split = tree2->add_nodes()->mutable_dense_float_binary_split();
+  dense_split->set_feature_column(0);
+  dense_split->set_threshold(5.0f);
+  dense_split->set_left_id(1);
+  dense_split->set_right_id(2);
+  auto* leaf1 = tree2->add_nodes()->mutable_leaf()->mutable_sparse_vector();
+  leaf1->add_index(0);
+  leaf1->add_value(0.9f);
+  auto* leaf2 = tree2->add_nodes()->mutable_leaf()->mutable_sparse_vector();
+  leaf2->add_index(0);
+  leaf2->add_value(0.2f);
+
+  tree_ensemble_config.add_tree_weights(1.0);
+  tree_ensemble_config.add_tree_weights(1.0);
+
+  auto output_tensor = AsTensor<float>({0.0f, 0.0f}, {2, 1});
+  auto output_matrix = output_tensor.matrix<float>();
+
+  auto no_dropout_output_tensor = AsTensor<float>({0.0f, 0.0f}, {2, 1});
+  auto no_dropout_output_matrix = no_dropout_output_tensor.matrix<float>();
+
+  tensorflow::thread::ThreadPool threads(tensorflow::Env::Default(), "test",
+                                         kNumThreadsSingleThreaded);
+
+  // Normal case.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {}, batch_features_, &threads, output_matrix,
+                                   no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(-0.2f, output_matrix(0, 0));  // -0.4 (bias) + 0.2 (leaf 2).
+    EXPECT_FLOAT_EQ(0.5f, output_matrix(1, 0));   // -0.4 (bias) + 0.9 (leaf 1).
+
+    // No dropout predictions are the same.
+    for (int i = 0; i < 2; ++i) {
+      EXPECT_EQ(output_matrix(i, 0), no_dropout_output_matrix(i, 0));
+    }
+  }
+  // Weighted case
+  {
+    DecisionTreeEnsembleConfig weighted = tree_ensemble_config;
+    weighted.set_tree_weights(0, 6.0);
+    weighted.set_tree_weights(1, 3.2);
+    MultipleAdditiveTrees::Predict(weighted,
+                                   false,  // include non-finalized trees
+                                   {}, batch_features_, &threads, output_matrix,
+                                   no_dropout_output_matrix);
+    // -0.4 (bias) + 0.2 (leaf 2).
+    EXPECT_FLOAT_EQ(-0.4f * 6 + 0.2 * 3.2, output_matrix(0, 0));
+    // -0.4 (bias) + 0.9 (leaf 1).
+    EXPECT_FLOAT_EQ(-0.4f * 6 + 0.9 * 3.2, output_matrix(1, 0));
+
+    // No dropout predictions are the same.
+    for (int i = 0; i < 2; ++i) {
+      EXPECT_EQ(output_matrix(i, 0), no_dropout_output_matrix(i, 0));
+    }
+  }
+  // Drop first tree.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {0}, batch_features_, &threads,
+                                   output_matrix, no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(0.2f, output_matrix(0, 0));  // 0.2 (leaf 2).
+    EXPECT_FLOAT_EQ(0.9f, output_matrix(1, 0));  // 0.9 (leaf 1).
+
+    // No dropout predictions
+    EXPECT_FLOAT_EQ(
+        -0.2f, no_dropout_output_matrix(0, 0));  // -0.4 (bias) + 0.2 (leaf 2).
+    EXPECT_FLOAT_EQ(
+        0.5f, no_dropout_output_matrix(1, 0));  // -0.4 (bias) + 0.9 (leaf 1).
+  }
+  // Drop second tree.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {1}, batch_features_, &threads,
+                                   output_matrix, no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(-0.4f, output_matrix(0, 0));  // -0.4 (bias).
+    EXPECT_FLOAT_EQ(-0.4f, output_matrix(1, 0));  // -0.4 (bias).
+
+    // No dropout predictions
+    EXPECT_FLOAT_EQ(
+        -0.2f, no_dropout_output_matrix(0, 0));  // -0.4 (bias) + 0.2 (leaf 2).
+    EXPECT_FLOAT_EQ(
+        0.5f, no_dropout_output_matrix(1, 0));  // -0.4 (bias) + 0.9 (leaf 1).
+  }
+  // Drop all trees.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {0, 1}, batch_features_, &threads,
+                                   output_matrix, no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(0.0, output_matrix(0, 0));
+    EXPECT_FLOAT_EQ(0.0, output_matrix(1, 0));
+
+    // No dropout predictions
+    EXPECT_FLOAT_EQ(
+        -0.2f, no_dropout_output_matrix(0, 0));  // -0.4 (bias) + 0.2 (leaf 2).
+    EXPECT_FLOAT_EQ(
+        0.5f, no_dropout_output_matrix(1, 0));  // -0.4 (bias) + 0.9 (leaf 1).
+  }
+}
+
+TEST_F(MultipleAdditiveTreesTest, MultiClass) {
+  // Add one bias and one stump to ensemble for two classes.
+  DecisionTreeEnsembleConfig tree_ensemble_config;
+  auto* tree1 = tree_ensemble_config.add_trees();
+  auto* bias_leaf = tree1->add_nodes()->mutable_leaf()->mutable_sparse_vector();
+  bias_leaf->add_index(0);
+  bias_leaf->add_value(-0.4f);
+  bias_leaf->add_index(1);
+  bias_leaf->add_value(-0.7f);
+  auto* tree2 = tree_ensemble_config.add_trees();
+  auto* dense_split = tree2->add_nodes()->mutable_dense_float_binary_split();
+  dense_split->set_feature_column(0);
+  dense_split->set_threshold(5.0f);
+  dense_split->set_left_id(1);
+  dense_split->set_right_id(2);
+  auto* leaf1 = tree2->add_nodes()->mutable_leaf()->mutable_sparse_vector();
+  leaf1->add_index(0);
+  leaf1->add_value(0.9f);
+  auto* leaf2 = tree2->add_nodes()->mutable_leaf()->mutable_sparse_vector();
+  leaf2->add_index(1);
+  leaf2->add_value(0.2f);
+
+  tree_ensemble_config.add_tree_weights(1.0);
+  tree_ensemble_config.add_tree_weights(1.0);
+
+  // Predict for both instances.
+  tensorflow::thread::ThreadPool threads(tensorflow::Env::Default(), "test",
+                                         kNumThreadsSingleThreaded);
+  auto output_tensor = AsTensor<float>({0.0f, 0.0f, 0.0f, 0.0f}, {2, 2});
+  auto output_matrix = output_tensor.matrix<float>();
+
+  auto no_dropout_output_tensor =
+      AsTensor<float>({0.0f, 0.0f, 0.0f, 0.0f}, {2, 2});
+  auto no_dropout_output_matrix = no_dropout_output_tensor.matrix<float>();
+
+  // Normal case.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {}, batch_features_, &threads, output_matrix,
+                                   no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(-0.4f, output_matrix(0, 0));  // -0.4 (bias)
+    EXPECT_FLOAT_EQ(-0.5f, output_matrix(0, 1));  // -0.7 (bias) + 0.2 (leaf 2)
+    EXPECT_FLOAT_EQ(0.5f, output_matrix(1, 0));   // -0.4 (bias) + 0.9 (leaf 1)
+    EXPECT_FLOAT_EQ(-0.7f, output_matrix(1, 1));  // -0.7 (bias)
+
+    // No dropout predictions are the same.
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        EXPECT_EQ(output_matrix(i, j), no_dropout_output_matrix(i, j));
+      }
+    }
+  }
+  // Weighted case.
+  {
+    DecisionTreeEnsembleConfig weighted = tree_ensemble_config;
+    weighted.set_tree_weights(0, 6.0);
+    weighted.set_tree_weights(1, 3.2);
+    MultipleAdditiveTrees::Predict(weighted,
+                                   false,  // include non-finalized trees
+                                   {}, batch_features_, &threads, output_matrix,
+                                   no_dropout_output_matrix);
+    // bias
+    EXPECT_FLOAT_EQ(-0.4f * 6, output_matrix(0, 0));
+    // bias + leaf 2
+    EXPECT_FLOAT_EQ(-0.7f * 6 + 0.2f * 3.2, output_matrix(0, 1));
+    // bias + leaf 2
+    EXPECT_FLOAT_EQ(-0.4f * 6 + 0.9f * 3.2f, output_matrix(1, 0));
+    // bias
+    EXPECT_FLOAT_EQ(-0.7f * 6, output_matrix(1, 1));
+  }
+  // Dropout first tree.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {0}, batch_features_, &threads,
+                                   output_matrix, no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(0.0, output_matrix(0, 0));
+    EXPECT_FLOAT_EQ(0.2f, output_matrix(0, 1));  // 0.2 (leaf 2)
+    EXPECT_FLOAT_EQ(0.9f, output_matrix(1, 0));  // 0.9 (leaf 2)
+    EXPECT_FLOAT_EQ(0.0f, output_matrix(1, 1));
+
+    // No dropout predictions
+    EXPECT_FLOAT_EQ(-0.4f, no_dropout_output_matrix(0, 0));  // -0.4 (bias)
+    EXPECT_FLOAT_EQ(
+        -0.5f, no_dropout_output_matrix(0, 1));  // -0.7 (bias) + 0.2 (leaf 2)
+    EXPECT_FLOAT_EQ(
+        0.5f, no_dropout_output_matrix(1, 0));  // -0.4 (bias) + 0.9 (leaf 2)
+    EXPECT_FLOAT_EQ(-0.7f, no_dropout_output_matrix(1, 1));  // -0.7 (bias)
+  }
+  // Dropout second tree.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {1}, batch_features_, &threads,
+                                   output_matrix, no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(-0.4f, output_matrix(0, 0));  // -0.4 (bias)
+    EXPECT_FLOAT_EQ(-0.7f, output_matrix(0, 1));  // -0.7 (bias)
+    EXPECT_FLOAT_EQ(-0.4f, output_matrix(1, 0));  // -0.4 (bias)
+    EXPECT_FLOAT_EQ(-0.7f, output_matrix(1, 1));  // -0.7 (bias)
+
+    // No dropout predictions
+    EXPECT_FLOAT_EQ(-0.4f, no_dropout_output_matrix(0, 0));  // -0.4 (bias)
+    EXPECT_FLOAT_EQ(
+        -0.5f, no_dropout_output_matrix(0, 1));  // -0.7 (bias) + 0.2 (leaf 2)
+    EXPECT_FLOAT_EQ(
+        0.5f, no_dropout_output_matrix(1, 0));  // -0.4 (bias) + 0.9 (leaf 2)
+    EXPECT_FLOAT_EQ(-0.7f, no_dropout_output_matrix(1, 1));  // -0.7 (bias)
+  }
+  // Drop both trees.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {0, 1}, batch_features_, &threads,
+                                   output_matrix, no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(0.0f, output_matrix(0, 0));
+    EXPECT_FLOAT_EQ(0.0f, output_matrix(0, 1));
+    EXPECT_FLOAT_EQ(0.0f, output_matrix(1, 0));
+    EXPECT_FLOAT_EQ(0.0f, output_matrix(1, 1));
+
+    // No dropout predictions
+    EXPECT_FLOAT_EQ(-0.4f, no_dropout_output_matrix(0, 0));  // -0.4 (bias)
+    EXPECT_FLOAT_EQ(
+        -0.5f, no_dropout_output_matrix(0, 1));  // -0.7 (bias) + 0.2 (leaf 2)
+    EXPECT_FLOAT_EQ(
+        0.5f, no_dropout_output_matrix(1, 0));  // -0.4 (bias) + 0.9 (leaf 2)
+    EXPECT_FLOAT_EQ(-0.7f, no_dropout_output_matrix(1, 1));  // -0.7 (bias)
+  }
+}
+
+TEST_F(MultipleAdditiveTreesTest, DenseLeaves) {
+  DecisionTreeEnsembleConfig tree_ensemble_config;
+  auto* tree1 = tree_ensemble_config.add_trees();
+  auto* bias_leaf = tree1->add_nodes()->mutable_leaf()->mutable_vector();
+  bias_leaf->add_value(-0.4f);
+  bias_leaf->add_value(-0.7f);
+  bias_leaf->add_value(3.0f);
+  auto* tree2 = tree_ensemble_config.add_trees();
+  auto* dense_split = tree2->add_nodes()->mutable_dense_float_binary_split();
+  dense_split->set_feature_column(0);
+  dense_split->set_threshold(5.0f);
+  dense_split->set_left_id(1);
+  dense_split->set_right_id(2);
+  auto* leaf1 = tree2->add_nodes()->mutable_leaf()->mutable_vector();
+  leaf1->add_value(0.9f);
+  leaf1->add_value(0.8f);
+  leaf1->add_value(0.7f);
+  auto* leaf2 = tree2->add_nodes()->mutable_leaf()->mutable_vector();
+  leaf2->add_value(0.2f);
+  leaf2->add_value(0.3f);
+  leaf2->add_value(0.4f);
+
+  tree_ensemble_config.add_tree_weights(1.0);
+  tree_ensemble_config.add_tree_weights(1.0);
+
+  // Predict for both instances.
+  tensorflow::thread::ThreadPool threads(tensorflow::Env::Default(), "test",
+                                         kNumThreadsSingleThreaded);
+  auto output_tensor =
+      AsTensor<float>({0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {2, 3});
+  auto output_matrix = output_tensor.matrix<float>();
+
+  auto no_dropout_output_tensor =
+      AsTensor<float>({0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, {2, 3});
+  auto no_dropout_output_matrix = no_dropout_output_tensor.matrix<float>();
+
+  // Normal case.
+  {
+    MultipleAdditiveTrees::Predict(tree_ensemble_config,
+                                   false,  // include non-finalized trees
+                                   {}, batch_features_, &threads, output_matrix,
+                                   no_dropout_output_matrix);
+    EXPECT_FLOAT_EQ(-0.2f, output_matrix(0, 0));  // -0.4 (tree1) + 0.2 (leaf 2)
+    EXPECT_FLOAT_EQ(-0.4f, output_matrix(0, 1));  // -0.7 (tree1) + 0.3 (leaf 2)
+    EXPECT_FLOAT_EQ(3.4f, output_matrix(0, 2));   // 3.0 -(tree1) + 0.4 (leaf 2)
+    EXPECT_FLOAT_EQ(0.5f, output_matrix(1, 0));   // -0.4 (tree1) + 0.9 (leaf 1)
+    EXPECT_FLOAT_EQ(0.1f, output_matrix(1, 1));   // -0.7 (tree1) + 0.8 (leaf 1)
+    EXPECT_FLOAT_EQ(3.7f, output_matrix(1, 2));   // 3.0 (tree1) + 0.7 (leaf 1)
+
+    // No dropout predictions are the same.
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        EXPECT_EQ(output_matrix(i, j), no_dropout_output_matrix(i, j));
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace models
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
index 22ec5349f861949b0db65ea28598fa83c1073b72..5e316538cefed30b2867252c9ebc4754216db329 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
@@ -55,7 +55,7 @@ class WeightedQuantilesBuffer {
       : max_size_(std::min(block_size << 1, max_elements)) {
     QCHECK(max_size_ > 0) << "Invalid buffer specification: (" << block_size
                           << ", " << max_elements << ")";
-    map_.reserve(max_size_);
+    vec_.reserve(max_size_);
   }
 
   // Disallow copying as it's semantically non-sensical in the Squawd algorithm
@@ -77,42 +77,48 @@ class WeightedQuantilesBuffer {
       return;
     }
 
-    // Insert entry to map if not already present else
-    // accumulate the new weight.
-    auto result = map_.insert(BufferMapEntry(value, weight));
-    if (!result.second) {
-      result.first->second += weight;
-    }
+    // Push back the entry to the buffer.
+    vec_.push_back(BufferEntry(value, weight));
   }
 
-  // Returns a sorted vector view of the base buffer. Callers should
-  // minimize how often this is called, ideally only right after the buffer
-  // becomes full.
-  std::vector<BufferEntry> GenerateEntryList() const {
+  // Returns a sorted vector view of the base buffer and clears the buffer.
+  // Callers should minimize how often this is called, ideally only right after
+  // the buffer becomes full.
+  std::vector<BufferEntry> GenerateEntryList() {
     std::vector<BufferEntry> ret;
-    ret.reserve(map_.size());
-    std::transform(map_.begin(), map_.end(), std::back_inserter(ret),
-                   [](const BufferMapEntry& map_entry) {
-                     return BufferEntry(map_entry.first, map_entry.second);
-                   });
+    if (vec_.size() == 0) {
+      return ret;
+    }
+    ret.swap(vec_);
+    vec_.reserve(max_size_);
     std::sort(ret.begin(), ret.end());
+    size_t num_entries = 0;
+    for (size_t i = 1; i < ret.size(); ++i) {
+      if (ret[i].value != ret[i - 1].value) {
+        BufferEntry tmp = ret[i];
+        ++num_entries;
+        ret[num_entries] = tmp;
+      } else {
+        ret[num_entries].weight += ret[i].weight;
+      }
+    }
+    ret.resize(num_entries + 1);
     return ret;
   }
 
-  int64 Size() const { return map_.size(); }
-  bool IsFull() const { return map_.size() >= max_size_; }
-  void Clear() { map_.clear(); }
+  int64 Size() const { return vec_.size(); }
+  bool IsFull() const { return vec_.size() >= max_size_; }
+  void Clear() { vec_.clear(); }
 
  private:
-  using BufferMap = typename std::unordered_map<ValueType, WeightType>;
-  using BufferMapEntry = typename BufferMap::value_type;
+  using BufferVector = typename std::vector<BufferEntry>;
 
   // Comparison function.
   static constexpr decltype(CompareFn()) kCompFn = CompareFn();
 
   // Base buffer.
   size_t max_size_;
-  BufferMap map_;
+  BufferVector vec_;
 };
 
 template <typename ValueType, typename WeightType, typename CompareFn>
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer_test.cc b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer_test.cc
index 02696fb4f18bd4498bd99765a384983e2c466180..8e403186651e83dae74adfd209aa001654b09681 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer_test.cc
@@ -69,46 +69,31 @@ TEST_F(WeightedQuantilesBufferTest, PushEntryFull) {
   expected.emplace_back(2, 4);
   expected.emplace_back(5, 9);
 
-  // At this point, we have a compaction and duplicate entry 2 is merged.
-  EXPECT_FALSE(buffer.IsFull());
-  EXPECT_EQ(buffer.GenerateEntryList(), expected);
-
-  // Push another unique entry.
-  buffer.PushEntry(3, 2);
+  // At this point, we have pushed 4 entries and we expect the buffer to be
+  // full.
   EXPECT_TRUE(buffer.IsFull());
-
-  // Can't push any more entries before clearing.
-  EXPECT_DEATH(({ buffer.PushEntry(6, 6); }), "Buffer already full");
+  EXPECT_EQ(buffer.GenerateEntryList(), expected);
+  EXPECT_FALSE(buffer.IsFull());
 }
 
-TEST_F(WeightedQuantilesBufferTest, RandomizedPush) {
-  // buffer capacity is 6.
-  Buffer buffer(3, 100);
-  std::array<double, 5> elements = {{1.1, 2.3, 5.1, 8.0, 12.6}};
-  std::array<double, elements.size()> counts;
-  counts.fill(0.0);
-
-  random::PhiloxRandom philox(13);
-  random::SimplePhilox rand(&philox);
-
-  for (int iters = 10000; iters-- > 0; --iters) {
-    // Add entry.
-    int32 picked_idx = rand.Uniform(elements.size());
-    buffer.PushEntry(elements[picked_idx], 1.0);
-    ++counts[picked_idx];
-
-    // We can't fill buffer with a number of unique elements < capacity.
-    EXPECT_FALSE(buffer.IsFull());
-  }
+TEST_F(WeightedQuantilesBufferTest, PushEntryFullDeath) {
+  // buffer capacity is 4.
+  Buffer buffer(2, 100);
+  buffer.PushEntry(5, 9);
+  buffer.PushEntry(2, 3);
+  buffer.PushEntry(-1, 7);
+  buffer.PushEntry(2, 1);
 
-  // Ensure we didn't lose any information.
   std::vector<BufferEntry> expected;
-  for (int i = 0; i < elements.size(); ++i) {
-    if (counts[i] > 0) {
-      expected.emplace_back(elements[i], counts[i]);
-    }
-  }
-  EXPECT_EQ(buffer.GenerateEntryList(), expected);
+  expected.emplace_back(-1, 7);
+  expected.emplace_back(2, 4);
+  expected.emplace_back(5, 9);
+
+  // At this point, we have pushed 4 entries and we expect the buffer to be
+  // full.
+  EXPECT_TRUE(buffer.IsFull());
+  // Can't push any more entries before clearing.
+  EXPECT_DEATH(({ buffer.PushEntry(6, 6); }), "Buffer already full");
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index ad2358e4c436099560bb3d8ba19e987b489dfadd..fd577ad712f228fa8016a48942511a3263aae5da 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -17,6 +17,7 @@
 
 #include <memory>
 #include <vector>
+#include <cmath>
 
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h"
 #include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h"
@@ -91,12 +92,11 @@ class WeightedQuantilesStream {
     // and push weighted quantile summary up the level chain.
     if (buffer_.IsFull()) {
       PushBuffer(buffer_);
-      buffer_.Clear();
     }
   }
 
   // Pushes full buffer while maintaining approximation error invariants.
-  void PushBuffer(const Buffer& buffer) {
+  void PushBuffer(Buffer& buffer) {
     // Validate state.
     QCHECK(!finalized_) << "Finalize() already called.";
 
@@ -124,7 +124,6 @@ class WeightedQuantilesStream {
 
     // Flush any remaining buffer elements.
     PushBuffer(buffer_);
-    buffer_.Clear();
 
     // Create final merged summary.
     local_summary_.Clear();
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary_test.cc b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary_test.cc
index e6d10bf08b6ebbd2966627959f4c3aa5c1878034..8de154483e6dc7df8dd4402c1d596f93c9509c16 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary_test.cc
@@ -75,7 +75,7 @@ TEST_F(WeightedQuantilesSummaryTest, BuildFromBuffer) {
   Summary summary;
   summary.BuildFromBufferEntries(buffer1_->GenerateEntryList());
 
-  // We expect no approximation error because no compress operation occured.
+  // We expect no approximation error because no compress operation occurred.
   EXPECT_EQ(summary.ApproximationError(), 0);
 
   // Check first and last elements in the summary.
@@ -91,9 +91,10 @@ TEST_F(WeightedQuantilesSummaryTest, BuildFromBuffer) {
 }
 
 TEST_F(WeightedQuantilesSummaryTest, CompressSeparately) {
+  const auto entry_list = buffer1_->GenerateEntryList();
   for (int new_size = 9; new_size >= 2; --new_size) {
     Summary summary;
-    summary.BuildFromBufferEntries(buffer1_->GenerateEntryList());
+    summary.BuildFromBufferEntries(entry_list);
     summary.Compress(new_size);
 
     // Expect a max approximation error of 1 / n
@@ -161,10 +162,12 @@ TEST_F(WeightedQuantilesSummaryTest, CompressRandomized) {
 
 TEST_F(WeightedQuantilesSummaryTest, MergeSymmetry) {
   // Create two separate summaries and merge.
+  const auto list_1 = buffer1_->GenerateEntryList();
+  const auto list_2 = buffer2_->GenerateEntryList();
   Summary summary1;
-  summary1.BuildFromBufferEntries(buffer1_->GenerateEntryList());
+  summary1.BuildFromBufferEntries(list_1);
   Summary summary2;
-  summary2.BuildFromBufferEntries(buffer2_->GenerateEntryList());
+  summary2.BuildFromBufferEntries(list_2);
 
   // Merge summary 2 into 1 and verify.
   summary1.Merge(summary2);
@@ -178,7 +181,7 @@ TEST_F(WeightedQuantilesSummaryTest, MergeSymmetry) {
   EXPECT_EQ(summary1.Size(), 14);  // 14 unique values.
 
   // Merge summary 1 into 2 and verify same result.
-  summary1.BuildFromBufferEntries(buffer1_->GenerateEntryList());
+  summary1.BuildFromBufferEntries(list_1);
   summary2.Merge(summary1);
   EXPECT_EQ(summary2.ApproximationError(), 0.0);
   EXPECT_EQ(summary2.MinValue(),
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.cc b/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.cc
new file mode 100644
index 0000000000000000000000000000000000000000..39c2fbe9c998541d9c530c273e9b56c454cd9f1b
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.cc
@@ -0,0 +1,88 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h"
+
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace testutil {
+
+using tensorflow::Tensor;
+
+void RandomlyInitializeBatchFeatures(
+    tensorflow::random::SimplePhilox* rng, uint32 num_dense_float_features,
+    uint32 num_sparse_float_features, double sparsity_lo, double sparsity_hi,
+    boosted_trees::utils::BatchFeatures* batch_features) {
+  const int64 batch_size = static_cast<int64>(batch_features->batch_size());
+
+  // Populate dense features.
+  std::vector<tensorflow::Tensor> dense_float_features_list;
+  for (int i = 0; i < num_dense_float_features; ++i) {
+    std::vector<float> values;
+    for (int64 j = 0; j < batch_size; ++j) {
+      values.push_back(rng->RandFloat());
+    }
+    auto dense_tensor = Tensor(tensorflow::DT_FLOAT, {batch_size, 1});
+    tensorflow::test::FillValues<float>(&dense_tensor, values);
+    dense_float_features_list.push_back(dense_tensor);
+  }
+
+  // Populate sparse features.
+  std::vector<tensorflow::Tensor> sparse_float_feature_indices_list;
+  std::vector<tensorflow::Tensor> sparse_float_feature_values_list;
+  std::vector<tensorflow::Tensor> sparse_float_feature_shapes_list;
+  for (int i = 0; i < num_sparse_float_features; ++i) {
+    std::set<uint64> indices;
+    const double sparsity =
+        sparsity_lo + rng->RandDouble() * (sparsity_hi - sparsity_lo);
+    const double density = 1 - sparsity;
+    for (int64 k = 0; k < static_cast<int64>(density * batch_size) + 1; ++k) {
+      indices.insert(rng->Uniform64(batch_size));
+    }
+    const int64 sparse_values_size = indices.size();
+    std::vector<int64> indices_vector;
+    for (auto idx : indices) {
+      indices_vector.push_back(idx);
+      indices_vector.push_back(0);
+    }
+    auto indices_tensor = Tensor(tensorflow::DT_INT64, {sparse_values_size, 2});
+    tensorflow::test::FillValues<int64>(&indices_tensor, indices_vector);
+    sparse_float_feature_indices_list.push_back(indices_tensor);
+
+    std::vector<float> values;
+    for (int64 j = 0; j < sparse_values_size; ++j) {
+      values.push_back(rng->RandFloat());
+    }
+    auto values_tensor = Tensor(tensorflow::DT_FLOAT, {sparse_values_size});
+    tensorflow::test::FillValues<float>(&values_tensor, values);
+    sparse_float_feature_values_list.push_back(values_tensor);
+
+    auto shape_tensor = Tensor(tensorflow::DT_INT64, {2});
+    tensorflow::test::FillValues<int64>(&shape_tensor, {batch_size, 1});
+    sparse_float_feature_shapes_list.push_back(shape_tensor);
+  }
+
+  // TODO(salehay): Add categorical feature generation support.
+  TF_EXPECT_OK(batch_features->Initialize(
+      dense_float_features_list, sparse_float_feature_indices_list,
+      sparse_float_feature_values_list, sparse_float_feature_shapes_list, {},
+      {}, {}));
+}
+
+}  // namespace testutil
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h b/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h
new file mode 100644
index 0000000000000000000000000000000000000000..d95878ec87b9e903930d2016bb573eee2573f776
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/batch_features_testutil.h
@@ -0,0 +1,45 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
+
+#include "tensorflow/contrib/boosted_trees/lib/utils/batch_features.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace testutil {
+
+// This method calls Initialize on the given 'batch_features', which will be
+// populated with randomly generated feature values when the call returns.
+// 'tensors' returns a vector of all tensors used in the initialization,
+// because they must outlive 'batch_features'.
+//
+// All float features will be either missing or uniformly randomly chosen
+// from [0, 1). For sparse (float) features, a sparsity is uniformly randomly
+// chosen from ['sparsity_lo', 'sparsity_hi') per feature, and each instance
+// will have a probability of sparsity of missing that feature, in other words,
+// sparsity = 1 - density.
+void RandomlyInitializeBatchFeatures(
+    tensorflow::random::SimplePhilox* rng, uint32 num_dense_float_features,
+    uint32 num_sparse_float_features, double sparsity_lo, double sparsity_hi,
+    boosted_trees::utils::BatchFeatures* batch_features);
+
+}  // namespace testutil
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_BATCH_FEATURES_TESTUTIL_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cbe26ba918d384ad903fb854ca3e88e84d16a923
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.cc
@@ -0,0 +1,211 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h"
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace testutil {
+
+using tensorflow::boosted_trees::trees::DecisionTreeConfig;
+using tensorflow::boosted_trees::trees::TreeNode;
+using boosted_trees::trees::DenseFloatBinarySplit;
+
+namespace {
+
+// Append the given nodes to tree with transfer of pointer ownership.
+// nodes will not be usable upon return.
+template <typename T>
+void AppendNodes(DecisionTreeConfig* tree, T* nodes) {
+  std::reverse(nodes->pointer_begin(), nodes->pointer_end());
+  while (!nodes->empty()) {
+    tree->mutable_nodes()->AddAllocated(nodes->ReleaseLast());
+  }
+}
+
+DenseFloatBinarySplit* GetSplit(TreeNode* node) {
+  switch (node->node_case()) {
+    case TreeNode::kSparseFloatBinarySplitDefaultLeft:
+      return node->mutable_sparse_float_binary_split_default_left()
+          ->mutable_split();
+    case TreeNode::kSparseFloatBinarySplitDefaultRight:
+      return node->mutable_sparse_float_binary_split_default_right()
+          ->mutable_split();
+    case TreeNode::kDenseFloatBinarySplit:
+      return node->mutable_dense_float_binary_split();
+    default:
+      LOG(FATAL) << "Unknown node type encountered.";
+  }
+  return nullptr;
+}
+
+}  // namespace
+
+RandomTreeGen::RandomTreeGen(tensorflow::random::SimplePhilox* rng,
+                             int dense_feature_size, int sparse_feature_size)
+    : rng_(rng),
+      dense_feature_size_(dense_feature_size),
+      sparse_feature_size_(sparse_feature_size) {}
+
+namespace {
+void AddWeightAndMetadata(
+    boosted_trees::trees::DecisionTreeEnsembleConfig* ret) {
+  // Assign the weight of the tree to 1 and say that this weight was updated
+  // only once.
+  ret->add_tree_weights(1.0);
+  auto* meta = ret->add_tree_metadata();
+  meta->set_num_tree_weight_updates(1);
+}
+
+}  //  namespace
+
+boosted_trees::trees::DecisionTreeEnsembleConfig
+RandomTreeGen::GenerateEnsemble(int depth, int tree_count) {
+  boosted_trees::trees::DecisionTreeEnsembleConfig ret;
+  *(ret.add_trees()) = Generate(depth);
+  AddWeightAndMetadata(&ret);
+  for (int i = 1; i < tree_count; ++i) {
+    *(ret.add_trees()) = Generate(ret.trees(0));
+    AddWeightAndMetadata(&ret);
+  }
+  return ret;
+}
+
+DecisionTreeConfig RandomTreeGen::Generate(const DecisionTreeConfig& tree) {
+  DecisionTreeConfig ret = tree;
+  for (auto& node : *ret.mutable_nodes()) {
+    if (node.node_case() == TreeNode::kLeaf) {
+      node.mutable_leaf()->mutable_sparse_vector()->set_value(
+          0, rng_->RandFloat());
+      continue;
+    }
+    // Original node is a split. Re-generate it's type but retain the split node
+    // indices.
+    DenseFloatBinarySplit* split = GetSplit(&node);
+    const int left_id = split->left_id();
+    const int right_id = split->right_id();
+    GenerateSplit(&node, left_id, right_id);
+  }
+  return ret;
+}
+
+DecisionTreeConfig RandomTreeGen::Generate(int depth) {
+  DecisionTreeConfig ret;
+  // Add root,
+  TreeNode* node = ret.add_nodes();
+  GenerateSplit(node, 1, 2);
+  if (depth == 1) {
+    // Add left and right leaves.
+    TreeNode* left = ret.add_nodes();
+    left->mutable_leaf()->mutable_sparse_vector()->add_index(0);
+    left->mutable_leaf()->mutable_sparse_vector()->add_value(rng_->RandFloat());
+    TreeNode* right = ret.add_nodes();
+    right->mutable_leaf()->mutable_sparse_vector()->add_index(0);
+    right->mutable_leaf()->mutable_sparse_vector()->add_value(
+        rng_->RandFloat());
+    return ret;
+  } else {
+    DecisionTreeConfig left_branch = Generate(depth - 1);
+    DecisionTreeConfig right_branch = Generate(depth - 1);
+    Combine(&ret, &left_branch, &right_branch);
+    return ret;
+  }
+}
+
+void RandomTreeGen::Combine(DecisionTreeConfig* root,
+                            DecisionTreeConfig* left_branch,
+                            DecisionTreeConfig* right_branch) {
+  const int left_branch_size = left_branch->nodes_size();
+  CHECK_EQ(1, root->nodes_size());
+  // left_branch starts its index at 1. right_branch starts its index at
+  // (left_branch_size + 1).
+  auto* root_node = root->mutable_nodes(0);
+  DenseFloatBinarySplit* root_split = GetSplit(root_node);
+  root_split->set_left_id(1);
+  root_split->set_right_id(left_branch_size + 1);
+  // Shift left/right branch's indices internally so that everything is
+  // consistent.
+  ShiftNodeIndex(left_branch, 1);
+  ShiftNodeIndex(right_branch, left_branch_size + 1);
+
+  // Complexity O(branch node size). No proto copying though.
+  AppendNodes(root, left_branch->mutable_nodes());
+  AppendNodes(root, right_branch->mutable_nodes());
+}
+
+void RandomTreeGen::ShiftNodeIndex(DecisionTreeConfig* tree, int shift) {
+  for (TreeNode& node : *(tree->mutable_nodes())) {
+    DenseFloatBinarySplit* split = nullptr;
+    switch (node.node_case()) {
+      case TreeNode::kLeaf:
+        break;
+      case TreeNode::kSparseFloatBinarySplitDefaultLeft:
+        split = node.mutable_sparse_float_binary_split_default_left()
+                    ->mutable_split();
+        break;
+      case TreeNode::kSparseFloatBinarySplitDefaultRight:
+        split = node.mutable_sparse_float_binary_split_default_right()
+                    ->mutable_split();
+        break;
+      case TreeNode::kDenseFloatBinarySplit:
+        split = node.mutable_dense_float_binary_split();
+        break;
+      default:
+        LOG(FATAL) << "Unknown node type encountered.";
+    }
+    if (split) {
+      split->set_left_id(shift + split->left_id());
+      split->set_right_id(shift + split->right_id());
+    }
+  }
+}
+
+void RandomTreeGen::GenerateSplit(TreeNode* node, int left_id, int right_id) {
+  const double denseSplitProb =
+      sparse_feature_size_ == 0
+          ? 1.0
+          : static_cast<double>(dense_feature_size_) /
+                (dense_feature_size_ + sparse_feature_size_);
+  // Generate the tree such that it has equal probability of going left and
+  // right when the feature is missing.
+  static constexpr float kLeftProb = 0.5;
+
+  DenseFloatBinarySplit* split;
+  int feature_size;
+  if (rng_->RandFloat() < denseSplitProb) {
+    feature_size = dense_feature_size_;
+    split = node->mutable_dense_float_binary_split();
+  } else {
+    feature_size = sparse_feature_size_;
+    if (rng_->RandFloat() < kLeftProb) {
+      split = node->mutable_sparse_float_binary_split_default_left()
+                  ->mutable_split();
+    } else {
+      split = node->mutable_sparse_float_binary_split_default_right()
+                  ->mutable_split();
+    }
+  }
+  split->set_threshold(rng_->RandFloat());
+  split->set_feature_column(rng_->Uniform(feature_size));
+  split->set_left_id(left_id);
+  split->set_right_id(right_id);
+}
+
+}  // namespace testutil
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e12429ba778344edda623d149e017661f1e0222
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h
@@ -0,0 +1,75 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
+
+#include <memory>
+
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"  // NOLINT
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace testutil {
+
+// Randomly generate a balanced tree, for performance benchmarking purposes,
+// that assume all features are sparse float features, for now.
+class RandomTreeGen {
+ public:
+  RandomTreeGen(tensorflow::random::SimplePhilox* rng, int dense_feature_size,
+                int sparse_feature_size);
+
+  // Required: depth must be >= 1.
+  // If one wants to generate multiple trees with the same depth, see also the
+  // overload below.
+  boosted_trees::trees::DecisionTreeConfig Generate(int depth);
+
+  // Randomly generate a new tree with the same depth (and tree structure)
+  // as the given tree. This is faster.
+  boosted_trees::trees::DecisionTreeConfig Generate(
+      const boosted_trees::trees::DecisionTreeConfig& tree);
+
+  // Required: depth >= 1; tree_count >= 1.
+  boosted_trees::trees::DecisionTreeEnsembleConfig GenerateEnsemble(
+      int dept, int tree_count);
+
+ private:
+  tensorflow::random::SimplePhilox* rng_;
+  const int dense_feature_size_;
+  const int sparse_feature_size_;
+
+  // Put together a deeper tree by combining two trees.
+  void Combine(boosted_trees::trees::DecisionTreeConfig* root,
+               boosted_trees::trees::DecisionTreeConfig* left_branch,
+               boosted_trees::trees::DecisionTreeConfig* right_branch);
+
+  // For each node in the provided tree, shift its referenced left/right index
+  // by shift.
+  void ShiftNodeIndex(boosted_trees::trees::DecisionTreeConfig* tree,
+                      int shift);
+
+  // Generate a sparse split in the node.
+  void GenerateSplit(boosted_trees::trees::TreeNode* node, int left_id,
+                     int right_id);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RandomTreeGen);
+};
+
+}  // namespace testutil
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TESTUTIL_RANDOM_TREE_GEN_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen_main.cc b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ea81e8d9a40e8c20e6eb8844700b092a062cec5
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen_main.cc
@@ -0,0 +1,67 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+// Randomly generate a tree ensemble and write to file.
+
+#include "tensorflow/contrib/boosted_trees/lib/testutil/random_tree_gen.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+using tensorflow::Flag;
+using tensorflow::Flags;
+using tensorflow::int32;
+using tensorflow::string;
+
+int main(int argc, char* argv[]) {
+  int32 dense_feature_size = 100;
+  int32 sparse_feature_size = 100;
+  int32 depth = 8;
+  int32 tree_count = 10;
+  string filename = "/tmp/trees.pb";
+  std::vector<Flag> flag_list = {
+      Flag("dense_feature_size", &dense_feature_size, "dense feature size"),
+      Flag("sparse_feature_size", &sparse_feature_size, "sparse_feature_size"),
+      Flag("depth", &depth, "tree depth"),
+      Flag("tree_count", &tree_count, "tree count"),
+      Flag("filename", &filename, "Output filename."),
+  };
+  string usage = Flags::Usage(argv[0], flag_list);
+  const bool parse_result = Flags::Parse(&argc, argv, flag_list);
+  // We need to call this to set up global state for TensorFlow.
+  tensorflow::port::InitMain(usage.c_str(), &argc, &argv);
+  if (!parse_result) {
+    LOG(ERROR) << "\n" << usage;
+    return -1;
+  }
+
+  tensorflow::random::PhiloxRandom philox(1);
+  tensorflow::random::SimplePhilox rng(&philox);
+  tensorflow::boosted_trees::testutil::RandomTreeGen tree_gen(
+      &rng, dense_feature_size, sparse_feature_size);
+  const auto& trees = tree_gen.GenerateEnsemble(depth, tree_count);
+  tensorflow::Status status =
+      tensorflow::WriteBinaryProto(tensorflow::Env::Default(), filename, trees);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to write: " << filename << " : " << status;
+  } else {
+    LOG(INFO) << "Tree ensemble written to: " << filename;
+  }
+  return 0;
+}
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd9a747d51914dd16ad9a1fc2327d4282a9f06d5
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
@@ -0,0 +1,203 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace trees {
+
+constexpr int kInvalidLeaf = -1;
+int DecisionTree::Traverse(const DecisionTreeConfig& config,
+                           const int32 sub_root_id,
+                           const utils::Example& example) {
+  if (TF_PREDICT_FALSE(config.nodes_size() <= sub_root_id)) {
+    return kInvalidLeaf;
+  }
+
+  // Traverse tree starting at the provided sub-root.
+  int32 node_id = sub_root_id;
+  while (true) {
+    const auto& current_node = config.nodes(node_id);
+    switch (current_node.node_case()) {
+      case TreeNode::kLeaf: {
+        return node_id;
+      }
+      case TreeNode::kDenseFloatBinarySplit: {
+        const auto& split = current_node.dense_float_binary_split();
+        node_id = example.dense_float_features[split.feature_column()] <=
+                          split.threshold()
+                      ? split.left_id()
+                      : split.right_id();
+        break;
+      }
+      case TreeNode::kSparseFloatBinarySplitDefaultLeft: {
+        const auto& split =
+            current_node.sparse_float_binary_split_default_left().split();
+        auto sparse_feature =
+            example.sparse_float_features[split.feature_column()];
+        node_id = !sparse_feature.has_value() ||
+                          sparse_feature.get_value() <= split.threshold()
+                      ? split.left_id()
+                      : split.right_id();
+        break;
+      }
+      case TreeNode::kSparseFloatBinarySplitDefaultRight: {
+        const auto& split =
+            current_node.sparse_float_binary_split_default_right().split();
+        auto sparse_feature =
+            example.sparse_float_features[split.feature_column()];
+        node_id = sparse_feature.has_value() &&
+                          sparse_feature.get_value() <= split.threshold()
+                      ? split.left_id()
+                      : split.right_id();
+        break;
+      }
+      case TreeNode::kCategoricalIdBinarySplit: {
+        const auto& split = current_node.categorical_id_binary_split();
+        node_id = example.sparse_int_features[split.feature_column()].count(
+                      split.feature_id()) > 0
+                      ? split.left_id()
+                      : split.right_id();
+        break;
+      }
+      case TreeNode::kCategoricalIdSetMembershipBinarySplit: {
+        const auto& split =
+            current_node.categorical_id_set_membership_binary_split();
+        bool found = false;
+        for (const int64 feature_id :
+             example.sparse_int_features[split.feature_column()]) {
+          const auto iter =
+              std::lower_bound(split.feature_ids().begin(),
+                               split.feature_ids().end(), feature_id);
+          if (iter != split.feature_ids().end() && *iter == feature_id) {
+            node_id = split.left_id();
+            found = true;
+            break;
+          }
+        }
+        if (!found) {
+          node_id = split.right_id();
+        }
+        break;
+      }
+      case TreeNode::NODE_NOT_SET: {
+        QCHECK(false) << "Invalid node in tree: " << current_node.DebugString();
+        break;
+      }
+    }
+    DCHECK_NE(node_id, 0) << "Malformed tree, cycles found to root:"
+                          << current_node.DebugString();
+  }
+}
+
+void DecisionTree::LinkChildren(const std::vector<int32>& children,
+                                TreeNode* parent_node) {
+  // Decide how to link children depending on the parent node's type.
+  auto children_it = children.begin();
+  switch (parent_node->node_case()) {
+    case TreeNode::kLeaf: {
+      // Essentially no-op.
+      QCHECK(children.empty()) << "A leaf node cannot have children.";
+      break;
+    }
+    case TreeNode::kDenseFloatBinarySplit: {
+      QCHECK(children.size() == 2)
+          << "A binary split node must have exactly two children.";
+      auto* split = parent_node->mutable_dense_float_binary_split();
+      split->set_left_id(*children_it);
+      split->set_right_id(*++children_it);
+      break;
+    }
+    case TreeNode::kSparseFloatBinarySplitDefaultLeft: {
+      QCHECK(children.size() == 2)
+          << "A binary split node must have exactly two children.";
+      auto* split =
+          parent_node->mutable_sparse_float_binary_split_default_left()
+              ->mutable_split();
+      split->set_left_id(*children_it);
+      split->set_right_id(*++children_it);
+      break;
+    }
+    case TreeNode::kSparseFloatBinarySplitDefaultRight: {
+      QCHECK(children.size() == 2)
+          << "A binary split node must have exactly two children.";
+      auto* split =
+          parent_node->mutable_sparse_float_binary_split_default_right()
+              ->mutable_split();
+      split->set_left_id(*children_it);
+      split->set_right_id(*++children_it);
+      break;
+    }
+    case TreeNode::kCategoricalIdBinarySplit: {
+      QCHECK(children.size() == 2)
+          << "A binary split node must have exactly two children.";
+      auto* split = parent_node->mutable_categorical_id_binary_split();
+      split->set_left_id(*children_it);
+      split->set_right_id(*++children_it);
+      break;
+    }
+    case TreeNode::kCategoricalIdSetMembershipBinarySplit: {
+      QCHECK(children.size() == 2)
+          << "A binary split node must have exactly two children.";
+      auto* split =
+          parent_node->mutable_categorical_id_set_membership_binary_split();
+      split->set_left_id(*children_it);
+      split->set_right_id(*++children_it);
+      break;
+    }
+    case TreeNode::NODE_NOT_SET: {
+      QCHECK(false) << "A non-set node cannot have children.";
+      break;
+    }
+  }
+}
+
+std::vector<int32> DecisionTree::GetChildren(const TreeNode& node) {
+  // A node's children depend on its type.
+  switch (node.node_case()) {
+    case TreeNode::kLeaf: {
+      return {};
+    }
+    case TreeNode::kDenseFloatBinarySplit: {
+      const auto& split = node.dense_float_binary_split();
+      return {split.left_id(), split.right_id()};
+    }
+    case TreeNode::kSparseFloatBinarySplitDefaultLeft: {
+      const auto& split = node.sparse_float_binary_split_default_left().split();
+      return {split.left_id(), split.right_id()};
+    }
+    case TreeNode::kSparseFloatBinarySplitDefaultRight: {
+      const auto& split =
+          node.sparse_float_binary_split_default_right().split();
+      return {split.left_id(), split.right_id()};
+    }
+    case TreeNode::kCategoricalIdBinarySplit: {
+      const auto& split = node.categorical_id_binary_split();
+      return {split.left_id(), split.right_id()};
+    }
+    case TreeNode::kCategoricalIdSetMembershipBinarySplit: {
+      const auto& split = node.categorical_id_set_membership_binary_split();
+      return {split.left_id(), split.right_id()};
+    }
+    case TreeNode::NODE_NOT_SET: {
+      return {};
+    }
+  }
+}
+
+}  // namespace trees
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h
new file mode 100644
index 0000000000000000000000000000000000000000..604ff02744b25b136bd935bf85635731730effe8
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h
@@ -0,0 +1,49 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
+
+#include "tensorflow/contrib/boosted_trees/lib/utils/example.h"
+#include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"  // NOLINT
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace trees {
+
+// Decision tree class to encapsulate tree traversal and mutation logic.
+// This class does not hold state and is thread safe.
+class DecisionTree {
+ public:
+  // Traverse given an instance, a sub-root and its set of features
+  // and return the leaf index or -1 if the tree is empty or
+  // the sub-root is invalid.
+  static int Traverse(const DecisionTreeConfig& config, int32 sub_root_id,
+                      const utils::Example& example);
+
+  // Links the specified children to the parent, the children must
+  // already be added to the decision tree config so this method
+  // just ensures nodes are re-linked.
+  static void LinkChildren(const std::vector<int32>& children,
+                           TreeNode* parent_node);
+
+  // Retrieves node children indices if any.
+  static std::vector<int32> GetChildren(const TreeNode& node);
+};
+
+}  // namespace trees
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_TREES_DECISION_TREE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c55d09807eaf3a9c9db1cfbbfdfc66aec8f25155
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc
@@ -0,0 +1,346 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
+#include "tensorflow/contrib/boosted_trees/lib/utils/batch_features.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace trees {
+namespace {
+
+class DecisionTreeTest : public ::testing::Test {
+ protected:
+  DecisionTreeTest() : batch_features_(2) {
+    // Create a batch of two examples having one dense float, two sparse float
+    // and one sparse int features.
+    // The first example is missing the second sparse feature column and the
+    // second example is missing the first sparse feature column.
+    // This looks like the following:
+    // Instance | DenseF1 | SparseF1 | SparseF2 | SparseI1 |
+    // 0        |   7     |   -3     |          |    3     |
+    // 1        |  -2     |          |   4      |          |
+    auto dense_float_matrix = test::AsTensor<float>({7.0f, -2.0f}, {2, 1});
+    auto sparse_float_indices1 = test::AsTensor<int64>({0, 0}, {1, 2});
+    auto sparse_float_values1 = test::AsTensor<float>({-3.0f});
+    auto sparse_float_shape1 = test::AsTensor<int64>({2, 1});
+    auto sparse_float_indices2 = test::AsTensor<int64>({1, 0}, {1, 2});
+    auto sparse_float_values2 = test::AsTensor<float>({4.0f});
+    auto sparse_float_shape2 = test::AsTensor<int64>({2, 1});
+    auto sparse_int_indices1 = test::AsTensor<int64>({0, 0}, {1, 2});
+    auto sparse_int_values1 = test::AsTensor<int64>({3});
+    auto sparse_int_shape1 = test::AsTensor<int64>({2, 1});
+    TF_EXPECT_OK(batch_features_.Initialize(
+        {dense_float_matrix}, {sparse_float_indices1, sparse_float_indices2},
+        {sparse_float_values1, sparse_float_values2},
+        {sparse_float_shape1, sparse_float_shape2}, {sparse_int_indices1},
+        {sparse_int_values1}, {sparse_int_shape1}));
+  }
+
+  template <typename SplitType>
+  void TestLinkChildrenBinary(TreeNode* node, SplitType* split) {
+    // Verify children were linked.
+    DecisionTree::LinkChildren({3, 8}, node);
+    EXPECT_EQ(3, split->left_id());
+    EXPECT_EQ(8, split->right_id());
+
+    // Invalid cases.
+    EXPECT_DEATH(DecisionTree::LinkChildren({}, node),
+                 "A binary split node must have exactly two children.");
+    EXPECT_DEATH(DecisionTree::LinkChildren({3}, node),
+                 "A binary split node must have exactly two children.");
+    EXPECT_DEATH(DecisionTree::LinkChildren({1, 2, 3}, node),
+                 "A binary split node must have exactly two children.");
+  }
+
+  void TestGetChildren(const TreeNode& node,
+                       const std::vector<uint32>& expected_children) {
+    // Verify children were linked.
+    auto children = DecisionTree::GetChildren(node);
+    EXPECT_EQ(children.size(), expected_children.size());
+    for (size_t idx = 0; idx < children.size(); ++idx) {
+      EXPECT_EQ(children[idx], expected_children[idx]);
+    }
+  }
+
+  utils::BatchFeatures batch_features_;
+};
+
+TEST_F(DecisionTreeTest, TraverseEmpty) {
+  DecisionTreeConfig tree_config;
+  auto example = (*batch_features_.examples_iterable(0, 1).begin());
+  EXPECT_EQ(-1, DecisionTree::Traverse(tree_config, 0, example));
+}
+
+TEST_F(DecisionTreeTest, TraverseBias) {
+  DecisionTreeConfig tree_config;
+  tree_config.add_nodes()->mutable_leaf();
+  auto example = (*batch_features_.examples_iterable(0, 1).begin());
+  EXPECT_EQ(0, DecisionTree::Traverse(tree_config, 0, example));
+}
+
+TEST_F(DecisionTreeTest, TraverseInvalidSubRoot) {
+  DecisionTreeConfig tree_config;
+  tree_config.add_nodes()->mutable_leaf();
+  auto example = (*batch_features_.examples_iterable(0, 1).begin());
+  EXPECT_EQ(-1, DecisionTree::Traverse(tree_config, 10, example));
+}
+
+TEST_F(DecisionTreeTest, TraverseDenseBinarySplit) {
+  DecisionTreeConfig tree_config;
+  auto* split_node =
+      tree_config.add_nodes()->mutable_dense_float_binary_split();
+  split_node->set_feature_column(0);
+  split_node->set_threshold(0.0f);
+  split_node->set_left_id(1);
+  split_node->set_right_id(2);
+  tree_config.add_nodes()->mutable_leaf();
+  tree_config.add_nodes()->mutable_leaf();
+  auto example_iterable = batch_features_.examples_iterable(0, 2);
+
+  // Expect right child to be picked as !(7 <= 0);
+  auto example_it = example_iterable.begin();
+  EXPECT_EQ(2, DecisionTree::Traverse(tree_config, 0, *example_it));
+
+  // Expect left child to be picked as (-2 <= 0);
+  EXPECT_EQ(1, DecisionTree::Traverse(tree_config, 0, *++example_it));
+}
+
+TEST_F(DecisionTreeTest, TraverseSparseBinarySplit) {
+  // Test first sparse feature which is missing for the second example.
+  DecisionTreeConfig tree_config1;
+  auto* split_node1 = tree_config1.add_nodes()
+                          ->mutable_sparse_float_binary_split_default_left()
+                          ->mutable_split();
+  split_node1->set_feature_column(0);
+  split_node1->set_threshold(-20.0f);
+  split_node1->set_left_id(1);
+  split_node1->set_right_id(2);
+  tree_config1.add_nodes()->mutable_leaf();
+  tree_config1.add_nodes()->mutable_leaf();
+  auto example_iterable = batch_features_.examples_iterable(0, 2);
+
+  // Expect right child to be picked as !(-3 <= -20).
+  auto example_it = example_iterable.begin();
+  EXPECT_EQ(2, DecisionTree::Traverse(tree_config1, 0, *example_it));
+
+  // Expect left child to be picked as default direction.
+  EXPECT_EQ(1, DecisionTree::Traverse(tree_config1, 0, *++example_it));
+
+  // Test second sparse feature which is missing for the first example.
+  DecisionTreeConfig tree_config2;
+  auto* split_node2 = tree_config2.add_nodes()
+                          ->mutable_sparse_float_binary_split_default_right()
+                          ->mutable_split();
+  split_node2->set_feature_column(1);
+  split_node2->set_threshold(4.0f);
+  split_node2->set_left_id(1);
+  split_node2->set_right_id(2);
+  tree_config2.add_nodes()->mutable_leaf();
+  tree_config2.add_nodes()->mutable_leaf();
+
+  // Expect right child to be picked as default direction.
+  example_it = example_iterable.begin();
+  EXPECT_EQ(2, DecisionTree::Traverse(tree_config2, 0, *example_it));
+
+  // Expect left child to be picked as (4 <= 4).
+  EXPECT_EQ(1, DecisionTree::Traverse(tree_config2, 0, *++example_it));
+}
+
+TEST_F(DecisionTreeTest, TraverseCategoricalIdBinarySplit) {
+  DecisionTreeConfig tree_config;
+  auto* split_node =
+      tree_config.add_nodes()->mutable_categorical_id_binary_split();
+  split_node->set_feature_column(0);
+  split_node->set_feature_id(3);
+  split_node->set_left_id(1);
+  split_node->set_right_id(2);
+  tree_config.add_nodes()->mutable_leaf();
+  tree_config.add_nodes()->mutable_leaf();
+  auto example_iterable = batch_features_.examples_iterable(0, 2);
+
+  // Expect left child to be picked as 3 == 3;
+  auto example_it = example_iterable.begin();
+  EXPECT_EQ(1, DecisionTree::Traverse(tree_config, 0, *example_it));
+
+  // Expect right child to be picked as the feature is missing;
+  EXPECT_EQ(2, DecisionTree::Traverse(tree_config, 0, *++example_it));
+}
+
+TEST_F(DecisionTreeTest, TraverseCategoricalIdSetMembershipBinarySplit) {
+  DecisionTreeConfig tree_config;
+  auto* split_node = tree_config.add_nodes()
+                         ->mutable_categorical_id_set_membership_binary_split();
+  split_node->set_feature_column(0);
+  split_node->add_feature_ids(3);
+  split_node->set_left_id(1);
+  split_node->set_right_id(2);
+  tree_config.add_nodes()->mutable_leaf();
+  tree_config.add_nodes()->mutable_leaf();
+  auto example_iterable = batch_features_.examples_iterable(0, 2);
+
+  // Expect left child to be picked as 3 in {3};
+  auto example_it = example_iterable.begin();
+  EXPECT_EQ(1, DecisionTree::Traverse(tree_config, 0, *example_it));
+
+  // Expect right child to be picked as the feature is missing;
+  EXPECT_EQ(2, DecisionTree::Traverse(tree_config, 0, *++example_it));
+}
+
+TEST_F(DecisionTreeTest, TraverseHybridSplits) {
+  DecisionTreeConfig tree_config;
+  auto* split_node1 =
+      tree_config.add_nodes()->mutable_dense_float_binary_split();
+  split_node1->set_feature_column(0);
+  split_node1->set_threshold(9.0f);
+  split_node1->set_left_id(1);   // sparse split.
+  split_node1->set_right_id(2);  // leaf
+  auto* split_node2 = tree_config.add_nodes()
+                          ->mutable_sparse_float_binary_split_default_left()
+                          ->mutable_split();
+  tree_config.add_nodes()->mutable_leaf();
+  split_node2->set_feature_column(0);
+  split_node2->set_threshold(-20.0f);
+  split_node2->set_left_id(3);
+  split_node2->set_right_id(4);
+  auto* split_node3 =
+      tree_config.add_nodes()->mutable_categorical_id_binary_split();
+  split_node3->set_feature_column(0);
+  split_node3->set_feature_id(2);
+  split_node3->set_left_id(5);
+  split_node3->set_right_id(6);
+  tree_config.add_nodes()->mutable_leaf();
+  tree_config.add_nodes()->mutable_leaf();
+  tree_config.add_nodes()->mutable_leaf();
+  auto example_iterable = batch_features_.examples_iterable(0, 2);
+
+  // Expect will go left through the first dense split as (7.0f <= 9.0f),
+  // then will go right through the sparse split as !(-3 <= -20).
+  auto example_it = example_iterable.begin();
+  EXPECT_EQ(4, DecisionTree::Traverse(tree_config, 0, *example_it));
+
+  // Expect will go left through the first dense split as (-2.0f <= 9.0f),
+  // then will go left the default direction as the sparse feature is missing,
+  // then will go right as 2 != 3 on the categorical split.
+  EXPECT_EQ(6, DecisionTree::Traverse(tree_config, 0, *++example_it));
+}
+
+TEST_F(DecisionTreeTest, LinkChildrenLeaf) {
+  // Create leaf node.
+  TreeNode node;
+  node.mutable_leaf();
+
+  // No-op.
+  DecisionTree::LinkChildren({}, &node);
+
+  // Invalid case.
+  EXPECT_DEATH(DecisionTree::LinkChildren({1}, &node),
+               "A leaf node cannot have children.");
+}
+
+TEST_F(DecisionTreeTest, LinkChildrenDenseFloatBinarySplit) {
+  TreeNode node;
+  auto* split = node.mutable_dense_float_binary_split();
+  split->set_left_id(-1);
+  split->set_right_id(-1);
+  TestLinkChildrenBinary(&node, split);
+}
+
+TEST_F(DecisionTreeTest, LinkChildrenSparseFloatBinarySplitDefaultLeft) {
+  TreeNode node;
+  auto* split =
+      node.mutable_sparse_float_binary_split_default_left()->mutable_split();
+  split->set_left_id(-1);
+  split->set_right_id(-1);
+  TestLinkChildrenBinary(&node, split);
+}
+
+TEST_F(DecisionTreeTest, LinkChildrenSparseFloatBinarySplitDefaultRight) {
+  TreeNode node;
+  auto* split =
+      node.mutable_sparse_float_binary_split_default_right()->mutable_split();
+  split->set_left_id(-1);
+  split->set_right_id(-1);
+  TestLinkChildrenBinary(&node, split);
+}
+
+TEST_F(DecisionTreeTest, LinkChildrenCategoricalSingleIdBinarySplit) {
+  TreeNode node;
+  auto* split = node.mutable_categorical_id_binary_split();
+  split->set_left_id(-1);
+  split->set_right_id(-1);
+  TestLinkChildrenBinary(&node, split);
+}
+
+TEST_F(DecisionTreeTest, LinkChildrenNodeNotSet) {
+  // Create unset node.
+  TreeNode node;
+
+  // Invalid case.
+  EXPECT_DEATH(DecisionTree::LinkChildren({1}, &node),
+               "A non-set node cannot have children.");
+}
+
+TEST_F(DecisionTreeTest, GetChildrenLeaf) {
+  TreeNode node;
+  node.mutable_leaf();
+  TestGetChildren(node, {});
+}
+
+TEST_F(DecisionTreeTest, GetChildrenDenseFloatBinarySplit) {
+  TreeNode node;
+  auto* split = node.mutable_dense_float_binary_split();
+  split->set_left_id(23);
+  split->set_right_id(24);
+  TestGetChildren(node, {23, 24});
+}
+
+TEST_F(DecisionTreeTest, GetChildrenSparseFloatBinarySplitDefaultLeft) {
+  TreeNode node;
+  auto* split =
+      node.mutable_sparse_float_binary_split_default_left()->mutable_split();
+  split->set_left_id(12);
+  split->set_right_id(13);
+  TestGetChildren(node, {12, 13});
+}
+
+TEST_F(DecisionTreeTest, GetChildrenSparseFloatBinarySplitDefaultRight) {
+  TreeNode node;
+  auto* split =
+      node.mutable_sparse_float_binary_split_default_right()->mutable_split();
+  split->set_left_id(1);
+  split->set_right_id(2);
+  TestGetChildren(node, {1, 2});
+}
+
+TEST_F(DecisionTreeTest, GetChildrenCategoricalSingleIdBinarySplit) {
+  TreeNode node;
+  auto* split = node.mutable_categorical_id_binary_split();
+  split->set_left_id(7);
+  split->set_right_id(8);
+  TestGetChildren(node, {7, 8});
+}
+
+TEST_F(DecisionTreeTest, GetChildrenNodeNotSet) {
+  TreeNode node;
+  TestGetChildren(node, {});
+}
+
+}  // namespace
+}  // namespace trees
+}  // namespace boosted_trees
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
index 21df5d13ff45ff00c1aec481df9f4292bfff432b..bc0a93db8c39abf737d11682088233e2fd88e868 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
@@ -87,7 +87,7 @@ class IndicesRowIterator
     return tmp;
   }
 
-  difference_type operator-(const IndicesRowIterator& other) {
+  difference_type operator-(const IndicesRowIterator& other) const {
     return row_idx_ - other.row_idx_;
   }
 
diff --git a/tensorflow/contrib/boosted_trees/proto/BUILD b/tensorflow/contrib/boosted_trees/proto/BUILD
index 3b6b0339d2e815d294861abf3aad0577da337641..c99d8849bd59c42870a78e284e51626f081b858f 100644
--- a/tensorflow/contrib/boosted_trees/proto/BUILD
+++ b/tensorflow/contrib/boosted_trees/proto/BUILD
@@ -24,6 +24,15 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_proto_library(
+    name = "quantiles_proto",
+    srcs = [
+        "quantiles.proto",
+    ],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library(
     name = "tree_config_proto",
     srcs = ["tree_config.proto"],
diff --git a/tensorflow/contrib/boosted_trees/proto/quantiles.proto b/tensorflow/contrib/boosted_trees/proto/quantiles.proto
new file mode 100644
index 0000000000000000000000000000000000000000..7f872d2aa71393fc48e707313d4334384b4b3f77
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/proto/quantiles.proto
@@ -0,0 +1,32 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package boosted_trees;
+
+message QuantileConfig {
+  // Maximum eps error when computing quantile summaries.
+  double eps = 1;
+  // Number of quantiles to generate.
+  int64 num_quantiles = 2;
+}
+
+message QuantileEntry {
+  // Value for the entry.
+  float value = 1;
+  // Weight for the entry.
+  float weight = 2;
+  // We need the minimum and maximum rank possible for this entry.
+  // Rank is 0.0 for the absolute minimum and sum of the weights for the maximum
+  // value in the input.
+  float min_rank = 3;
+  float max_rank = 4;
+}
+
+message QuantileSummaryState {
+  repeated QuantileEntry entries = 1;
+}
+
+message QuantileStreamState {
+  repeated QuantileSummaryState summaries = 1;
+}
diff --git a/tensorflow/contrib/boosted_trees/proto/tree_config.proto b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
index 3daa613b5d2c0080a758bc925934570bb0fa2f38..a78996605378dd726927f12d9f59e629f84a9f0e 100644
--- a/tensorflow/contrib/boosted_trees/proto/tree_config.proto
+++ b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
@@ -13,6 +13,8 @@ message TreeNode {
     SparseFloatBinarySplitDefaultRight sparse_float_binary_split_default_right =
         4;
     CategoricalIdBinarySplit categorical_id_binary_split = 5;
+    CategoricalIdSetMembershipBinarySplit
+        categorical_id_set_membership_binary_split = 6;
   }
   TreeNodeMetadata node_metadata = 777;
 }
@@ -81,6 +83,20 @@ message CategoricalIdBinarySplit {
   int32 right_id = 4;
 }
 
+// Split rule for categorical features with a set of feature Ids.
+message CategoricalIdSetMembershipBinarySplit {
+  // Categorical feature column and Id describing
+  // the rule feature ∈ feature_ids.
+  int32 feature_column = 1;
+  // Sorted list of Ids in the set.
+  repeated int64 feature_ids = 2;
+
+  // Node children indexing into a contiguous
+  // vector of nodes starting from the root.
+  int32 left_id = 3;
+  int32 right_id = 4;
+}
+
 // DecisionTreeConfig describes a list of connected nodes.
 // Node 0 must be the root and can carry any payload including a leaf
 // in the case of representing the bias.
@@ -101,9 +117,21 @@ message DecisionTreeMetadata {
   bool is_finalized = 3;
 }
 
+message GrowingMetadata {
+  // Number of trees that we have attempted to build. After pruning, these
+  // trees might have been removed.
+  int64 num_trees_attempted = 1;
+  // Number of layers that we have attempted to build. After pruning, these
+  // layers might have been removed.
+  int64 num_layers_attempted = 2;
+}
+
 // DecisionTreeEnsembleConfig describes an ensemble of decision trees.
 message DecisionTreeEnsembleConfig {
   repeated DecisionTreeConfig trees = 1;
   repeated float tree_weights = 2;
   repeated DecisionTreeMetadata tree_metadata = 3;
+
+  // Metadata that is used during the training.
+  GrowingMetadata growing_metadata = 4;
 }
diff --git a/tensorflow/contrib/boosted_trees/resources/BUILD b/tensorflow/contrib/boosted_trees/resources/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5dfdf8f48967efd989f969076508d815969d7a04
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/resources/BUILD
@@ -0,0 +1,53 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+package(
+    default_visibility = [
+        "//tensorflow/contrib/boosted_trees:__subpackages__",
+        "//tensorflow/contrib/boosted_trees:friends",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_library(
+    name = "stamped_resource",
+    hdrs = ["stamped_resource.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_library(
+    name = "quantile_stream_resource",
+    hdrs = ["quantile_stream_resource.h"],
+    deps = [
+        ":stamped_resource",
+        "//tensorflow/contrib/boosted_trees/lib:weighted_quantiles",
+        "//tensorflow/contrib/boosted_trees/proto:quantiles_proto_cc",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_library(
+    name = "decision_tree_ensemble_resource",
+    hdrs = ["decision_tree_ensemble_resource.h"],
+    deps = [
+        ":stamped_resource",
+        "//tensorflow/contrib/boosted_trees/lib:trees",
+        "//tensorflow/core:framework_headers_lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..45c3bbadfc8d6300841cbc256c894e3bb14cb44e
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
@@ -0,0 +1,77 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
+
+#include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
+#include "tensorflow/contrib/boosted_trees/resources/stamped_resource.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+namespace models {
+
+// Keep a tree ensemble in memory for efficient evaluation and mutation.
+class DecisionTreeEnsembleResource : public StampedResource {
+ public:
+  // Constructor.
+  explicit DecisionTreeEnsembleResource()
+      : decision_tree_ensemble_(
+            protobuf::Arena::CreateMessage<
+                boosted_trees::trees::DecisionTreeEnsembleConfig>(&arena_)) {}
+
+  string DebugString() override {
+    return strings::StrCat("GTFlowDecisionTreeEnsemble[size=",
+                           decision_tree_ensemble_->trees_size(), "]");
+  }
+
+  const boosted_trees::trees::DecisionTreeEnsembleConfig&
+  decision_tree_ensemble() const {
+    return *decision_tree_ensemble_;
+  }
+
+  boosted_trees::trees::DecisionTreeEnsembleConfig*
+  mutable_decision_tree_ensemble() {
+    return decision_tree_ensemble_;
+  }
+
+  // Resets the resource and frees the protos in arena.
+  // Caller needs to hold the mutex lock while calling this.
+  void Reset() {
+    // Reset stamp.
+    set_stamp(-1);
+
+    // Clear tree ensemle.
+    arena_.Reset();
+    CHECK_EQ(0, arena_.SpaceAllocated());
+    decision_tree_ensemble_ = protobuf::Arena::CreateMessage<
+        boosted_trees::trees::DecisionTreeEnsembleConfig>(&arena_);
+  }
+
+  mutex* get_mutex() { return &mu_; }
+
+ private:
+  protobuf::Arena arena_;
+  mutex mu_;
+  boosted_trees::trees::DecisionTreeEnsembleConfig* decision_tree_ensemble_;
+};
+
+}  // namespace models
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_DECISION_TREE_ENSEMBLE_RESOURCE_H_
diff --git a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb29f79e578e8e52b67de631c527be35b7772b41
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
@@ -0,0 +1,104 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
+
+#include "tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h"
+#include "tensorflow/contrib/boosted_trees/proto/quantiles.pb.h"  // NOLINT
+#include "tensorflow/contrib/boosted_trees/resources/stamped_resource.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+
+using QuantileStream =
+    boosted_trees::quantiles::WeightedQuantilesStream<float, float>;
+
+// Resource for accumulating summaries for multiple columns.
+class QuantileStreamResource : public StampedResource {
+ public:
+  QuantileStreamResource(const float epsilon, const int32 num_quantiles,
+                         const int64 max_elements, int64 stamp_token)
+      : stream_(epsilon, max_elements),
+        are_buckets_ready_(false),
+        epsilon_(epsilon),
+        num_quantiles_(num_quantiles),
+        max_elements_(max_elements) {
+    set_stamp(stamp_token);
+  }
+
+  string DebugString() override { return "QuantileStreamResource"; }
+
+  tensorflow::mutex* mutex() { return &mu_; }
+
+  QuantileStream* stream(int64 stamp) {
+    CHECK(is_stamp_valid(stamp));
+    return &stream_;
+  }
+
+  const std::vector<float>& boundaries(int64 stamp) {
+    CHECK(is_stamp_valid(stamp));
+    return boundaries_;
+  }
+
+  void set_boundaries(int64 stamp, const std::vector<float>& boundaries) {
+    CHECK(is_stamp_valid(stamp));
+    are_buckets_ready_ = true;
+    boundaries_ = boundaries;
+  }
+
+  float epsilon() const { return epsilon_; }
+  int32 num_quantiles() const { return num_quantiles_; }
+
+  void Reset(int64 stamp) {
+    set_stamp(stamp);
+    stream_ = QuantileStream(epsilon_, max_elements_);
+  }
+
+  bool are_buckets_ready() const { return are_buckets_ready_; }
+  void set_buckets_ready(bool are_buckets_ready) {
+    are_buckets_ready_ = are_buckets_ready;
+  }
+
+ private:
+  ~QuantileStreamResource() override {}
+
+  // Mutex for the whole resource.
+  tensorflow::mutex mu_;
+
+  // Quantile stream.
+  QuantileStream stream_;
+
+  // Stores the boundaries from the previous iteration. Empty during the first
+  // iteration.
+  std::vector<float> boundaries_;
+
+  // Whether boundaries are created. Initially boundaries are empty until
+  // set_boundaries are called.
+  bool are_buckets_ready_;
+
+  const float epsilon_;
+  const int32 num_quantiles_;
+  // An upper-bound for the number of elements.
+  int64 max_elements_;
+  TF_DISALLOW_COPY_AND_ASSIGN(QuantileStreamResource);
+};
+
+}  // namespace boosted_trees
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_QUANTILE_STREAM_RESOURCE_H_
diff --git a/tensorflow/contrib/boosted_trees/resources/stamped_resource.h b/tensorflow/contrib/boosted_trees/resources/stamped_resource.h
new file mode 100644
index 0000000000000000000000000000000000000000..aabeeb98516eda6f7e8e7e296d6860fe5d8d5ec3
--- /dev/null
+++ b/tensorflow/contrib/boosted_trees/resources/stamped_resource.h
@@ -0,0 +1,42 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace boosted_trees {
+
+// A StampedResource is a resource that has a stamp token associated with it.
+// Before reading from or applying updates to the resource, the stamp should
+// be checked to verify that the update is not stale.
+class StampedResource : public ResourceBase {
+ public:
+  StampedResource() : stamp_(-1) {}
+
+  bool is_stamp_valid(int64 stamp) const { return stamp_ == stamp; }
+
+  int64 stamp() const { return stamp_; }
+  void set_stamp(int64 stamp) { stamp_ = stamp; }
+
+ private:
+  int64 stamp_;
+};
+
+}  // namespace boosted_trees
+}  // namespace tensorflow
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_RESOURCES_STAMPED_RESOURCE_H_
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc b/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
index 02a759eefd175964ea3e4aaa0fc09789ae820179..093000559b7f2d8a5ab500f7f54213ca841946fb 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_reader_ops.cc
@@ -46,7 +46,7 @@ Status GetTableAttrs(OpKernelConstruction* context, string* project_id,
 
 }  // namespace
 
-// Note that overriden methods with names ending in "Locked" are called by
+// Note that overridden methods with names ending in "Locked" are called by
 // ReaderBase while a mutex is held.
 // See comments for ReaderBase.
 class BigQueryReader : public ReaderBase {
diff --git a/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops_test.py b/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops_test.py
index 9acdb4b102baad161044141a744ed89d49b19b6b..493b3c6f1b5e7a7a7dc1dd4f48d2f54c1d284098 100644
--- a/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops_test.py
+++ b/tensorflow/contrib/cloud/python/ops/bigquery_reader_ops_test.py
@@ -46,7 +46,7 @@ _TABLE = "test-table"
 # The values for rows are generated such that some columns have null values. The
 # general formula here is:
 #   - The int64 column is present in every row.
-#   - The string column is only avaiable in even rows.
+#   - The string column is only available in even rows.
 #   - The float column is only available in every third row.
 _ROWS = [[0, "s_0", 0.1], [1, None, None], [2, "s_2", None], [3, None, 3.1],
          [4, "s_4", None], [5, None, None], [6, "s_6", 6.1], [7, None, None],
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index e27df6898e36b58c8d6cefa48565d2f7c5374fe4..bade45e96a3d89ca675d1cad4b4f3db40eda233b 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -29,6 +29,7 @@ option(tensorflow_BUILD_ALL_KERNELS "Build all OpKernels" ON)
 option(tensorflow_BUILD_CONTRIB_KERNELS "Build OpKernels from tensorflow/contrib/..." ON)
 option(tensorflow_BUILD_CC_TESTS "Build cc unit tests " OFF)
 option(tensorflow_BUILD_PYTHON_TESTS "Build python unit tests " OFF)
+option(tensorflow_BUILD_SHARED_LIB "Build TensorFlow as a shared library" OFF)
 option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for the native processor architecture (if available)" ON)
 option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
 
@@ -60,15 +61,18 @@ add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
   add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
   add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS)
-  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0)
+  add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH)
   add_definitions(-DTF_COMPILE_LIBRARY)
-  add_definitions(-DNDEBUG /O2)  # Equivalent of -c opt in Bazel.
   add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-)
   # Suppress warnings to reduce build log size.
   add_definitions(/wd4267 /wd4244 /wd4800 /wd4503 /wd4554 /wd4996 /wd4348 /wd4018)
   add_definitions(/wd4099 /wd4146 /wd4267 /wd4305 /wd4307)
   add_definitions(/wd4715 /wd4722 /wd4723 /wd4838 /wd4309 /wd4334)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
+  set(CMAKE_CXX_FLAGS_DEBUG "/D_DEBUG /MDd /Ob0")
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /D_ITERATOR_DEBUG_LEVEL=0")
+  set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /D_ITERATOR_DEBUG_LEVEL=0")
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /D_ITERATOR_DEBUG_LEVEL=0")
 endif()
 
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
@@ -108,6 +112,7 @@ include(eigen)
 include(gemmlowp)
 include(jsoncpp)
 include(farmhash)
+include(fft2d)
 include(highwayhash)
 include(protobuf)
 if (tensorflow_BUILD_CC_TESTS)
@@ -121,6 +126,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
     ${jpeg_STATIC_LIBRARIES}
     ${jsoncpp_STATIC_LIBRARIES}
     ${farmhash_STATIC_LIBRARIES}
+    ${fft2d_STATIC_LIBRARIES}
     ${highwayhash_STATIC_LIBRARIES}
     ${protobuf_STATIC_LIBRARIES}
 )
@@ -135,6 +141,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
     protobuf
     eigen
     gemmlowp
+    fft2d
 )
 
 include_directories(
@@ -195,7 +202,7 @@ if (tensorflow_ENABLE_GPU)
     # add cudnn
     include_directories(${CUDNN_HOME})
     set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
-      ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDNN_HOME}/lib/x64/cudnn.lib)
+      ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDNN_HOME}/lib/x64/cudnn.lib)
 
     # create cuda_config.h
     FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
@@ -216,6 +223,7 @@ if (tensorflow_ENABLE_GPU)
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_HOME}/include/cudnn.h
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
       DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
     )
     include_directories(${tensorflow_source_dir}/third_party/gpus)
@@ -224,6 +232,12 @@ if (tensorflow_ENABLE_GPU)
   endif()
 endif()
 
+# Find python executable
+include(FindPythonInterp)
+if(NOT ${PYTHONINTERP_FOUND})
+    message(FATAL_ERROR "CMake was unable to find a python interpreter.")
+endif()
+
 # Let's get to work!
 include(tf_core_framework.cmake)
 # NOTE: Disabled until issue #3996 is fixed.
@@ -241,16 +255,20 @@ include(tf_core_kernels.cmake)
 if(tensorflow_ENABLE_GRPC_SUPPORT)
   include(tf_core_distributed_runtime.cmake)
 endif()
+# We include tf_cc_ops first, because tf_c depends on tf_cc.
 include(tf_cc_ops.cmake)
+include(tf_c.cmake)
 if(tensorflow_BUILD_CC_EXAMPLE)
   include(tf_tutorials.cmake)
   include(tf_label_image_example.cmake)
 endif()
 include(tf_tools.cmake)
 if(tensorflow_BUILD_PYTHON_BINDINGS)
-  include(tensorboard)
   include(tf_python.cmake)
 endif()
-if (tensorflow_BUILD_CC_TESTS OR tensorflow_BUILD_PYTHON_TESTS)
+if(tensorflow_BUILD_SHARED_LIB)
+  include(tf_shared_lib.cmake)
+endif()
+if(tensorflow_BUILD_CC_TESTS OR tensorflow_BUILD_PYTHON_TESTS)
   include(tf_tests.cmake)
 endif()
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index af949f79fa1aab1f7d70b735960ff95ec524a7a2..664d0f4b6b09bcfccd17dccae662f6112d1d7d17 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -24,7 +24,7 @@ for instructions on how to install a pre-built TensorFlow package on Windows.
  - QuantizeAndDequantize
  - QuantizedAvgPool
  - QuantizedBatchNomWithGlobalNormalization
- - QuantizedBiasAdd 
+ - QuantizedBiasAdd
  - QuantizedConcat
  - QuantizedConv2D
  - QuantizedMatmul
@@ -68,10 +68,10 @@ bindings.
   - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.continuum.io/downloads)
   - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
   - [swigwin-3.0.10](http://www.swig.org/download.html)
-  - [NVidia CUDA Toolkit 8.0] (https://developer.nvidia.com/cuda-downloads)
-  - [NVidia CUDNN 5.1] (https://developer.nvidia.com/cudnn)
+  - [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads)
+  - [NVidia CUDNN 5.1](https://developer.nvidia.com/cudnn)
   - [CMake 3.6](https://cmake.org/files/v3.6/cmake-3.6.3-win64-x64.msi)
-  
+
 * Ubuntu 14.04
   - Makefile generator
   - Docker 1.9.1 (for automated testing)
@@ -87,7 +87,7 @@ bindings.
 
   - The following Python APIs are not currently implemented:
     * Loading custom op libraries via `tf.load_op_library()`. In order to use your
-      custom op, please put the source code under the tensorflow/core/user_ops 
+      custom op, please put the source code under the tensorflow/core/user_ops
       directory, and a shape function is required (not optional) for each op.
     * Path manipulation functions (such as `tf.gfile.ListDirectory()`) are not
       functional.
@@ -132,12 +132,12 @@ Step-by-step Windows build
      D:\temp> "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat"
      ```
 
-   * When building with GPU support after installing the CUDNN zip file from NVidia, append its 
+   * When building with GPU support after installing the CUDNN zip file from NVidia, append its
      bin directory to your PATH environment variable.
-     In case TensorFlow fails to find the CUDA dll's during initialization, check your PATH environment variable. 
+     In case TensorFlow fails to find the CUDA dll's during initialization, check your PATH environment variable.
      It should contain the directory of the CUDA dlls and the directory of the CUDNN dll.
      For example:
-     
+
      ```
      D:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin
      D:\local\cuda\bin
@@ -174,7 +174,7 @@ Step-by-step Windows build
    D:\...\build> cmake .. -A x64 -DCMAKE_BUILD_TYPE=Release ^
    More? -DSWIG_EXECUTABLE=C:/tools/swigwin-3.0.10/swig.exe ^
    More? -DPYTHON_EXECUTABLE=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/python.exe ^
-   More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib 
+   More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib
    ```
    To build with GPU support add "^" at the end of the last line above following with:
    ```
@@ -184,7 +184,7 @@ Step-by-step Windows build
    To enable SIMD instructions with MSVC, as AVX and SSE, define it as follows:
    ```
    More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
-   ```     
+   ```
 
    Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
    configuration that you choose when invoking `msbuild`. The known-good
@@ -222,11 +222,11 @@ Step-by-step Windows build
      SSL support (for making secure HTTP requests) in the TensorFlow runtime.
      This support is incomplete, and will be used for Google Cloud Storage
      support.
-     
+
    * `-Dtensorflow_ENABLE_GPU=(ON|OFF)`. Defaults to `OFF`. Include
      GPU support. If GPU is enabled you need to install the CUDA 8.0 Toolkit and CUDNN 5.1.
      CMake will expect the location of CUDNN in -DCUDNN_HOME=path_you_unziped_cudnn.
- 
+
    * `-Dtensorflow_BUILD_CC_TESTS=(ON|OFF)`. Defaults to `OFF`. This builds cc unit tests.
      There are many of them and building will take a few hours.
      After cmake, build and execute the tests with
@@ -234,7 +234,7 @@ Step-by-step Windows build
      MSBuild /p:Configuration=RelWithDebInfo ALL_BUILD.vcxproj
      ctest -C RelWithDebInfo
      ```
- 
+
    * `-Dtensorflow_BUILD_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This enables python kernel tests.
      After building the python wheel, you need to install the new wheel before running the tests.
      To execute the tests, use
diff --git a/tensorflow/contrib/cmake/external/boringssl.cmake b/tensorflow/contrib/cmake/external/boringssl.cmake
index b91c7bf38300ad65ffc16f34d780fe6e07638292..2ae591d3fae66d1d9360c0dd759a6bfa97d92925 100644
--- a/tensorflow/contrib/cmake/external/boringssl.cmake
+++ b/tensorflow/contrib/cmake/external/boringssl.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(boringssl_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/boringssl/src/boringssl/include)
diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake
index 7dd7dd4070044e0fa568d64e7ae8ed7bb28b7d5e..45a0096085cc2a6332c82e1ea284812acdd45152 100644
--- a/tensorflow/contrib/cmake/external/eigen.cmake
+++ b/tensorflow/contrib/cmake/external/eigen.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 #new_http_archive(
 #  name = "eigen_archive",
 #  urls = ["https://bitbucket.org/eigen/eigen/get/..."],
diff --git a/tensorflow/contrib/cmake/external/farmhash.cmake b/tensorflow/contrib/cmake/external/farmhash.cmake
index c256f5f303a0b9934dbee62dae5cff0d1f4783fd..41b0e8c92bafcd6627edd95b5a879c937bf333f1 100644
--- a/tensorflow/contrib/cmake/external/farmhash.cmake
+++ b/tensorflow/contrib/cmake/external/farmhash.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(farmhash_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/farmhash_archive ${CMAKE_CURRENT_BINARY_DIR}/external/farmhash_archive/util)
diff --git a/tensorflow/contrib/cmake/external/fft2d.cmake b/tensorflow/contrib/cmake/external/fft2d.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..85f77e9879fd3e9b9f270f5946fae66c769b466f
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/fft2d.cmake
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+include (ExternalProject)
+
+set(fft2d_URL http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz)
+set(fft2d_HASH SHA256=52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296)
+set(fft2d_BUILD ${CMAKE_CURRENT_BINARY_DIR}/fft2d/)
+set(fft2d_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/fft2d/src)
+
+if(WIN32)
+  set(fft2d_STATIC_LIBRARIES ${fft2d_BUILD}/src/lib/fft2d.lib)
+
+  ExternalProject_Add(fft2d
+      PREFIX fft2d
+      URL ${fft2d_URL}
+      URL_HASH ${fft2d_HASH}
+      DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      BUILD_IN_SOURCE 1
+      PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/fft2d/CMakeLists.txt ${fft2d_BUILD}/src/fft2d/CMakeLists.txt
+      INSTALL_DIR ${fft2d_INSTALL}
+      CMAKE_CACHE_ARGS
+          -DCMAKE_BUILD_TYPE:STRING=Release
+          -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+          -DCMAKE_INSTALL_PREFIX:STRING=${fft2d_INSTALL})
+else()
+  set(fft2d_STATIC_LIBRARIES ${fft2d_BUILD}/src/fft2d/libfft2d.a)
+
+  ExternalProject_Add(fft2d
+      PREFIX fft2d
+      URL ${fft2d_URL}
+      URL_HASH ${fft2d_HASH}
+      DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      BUILD_IN_SOURCE 1
+      PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/fft2d/CMakeLists.txt ${fft2d_BUILD}/src/fft2d/CMakeLists.txt
+      INSTALL_DIR $(fft2d_INSTALL)
+      INSTALL_COMMAND echo
+      BUILD_COMMAND $(MAKE))
+    
+endif()
diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index 30ee21a13b4328e8e87ac467a6dbc9e0bad8f8c8..eee61ffd57bf60e578d51c1b4246b73384288122 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(gemmlowp_URL http://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz)
diff --git a/tensorflow/contrib/cmake/external/gif.cmake b/tensorflow/contrib/cmake/external/gif.cmake
index aaa0787f8b4ef073dd051cca16f3adf53fa3a149..5011239c14d23afef78b62c119c451e736e7597c 100644
--- a/tensorflow/contrib/cmake/external/gif.cmake
+++ b/tensorflow/contrib/cmake/external/gif.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(gif_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/gif_archive/giflib-5.1.4/)
diff --git a/tensorflow/contrib/cmake/external/googletest.cmake b/tensorflow/contrib/cmake/external/googletest.cmake
index fe1a9367175e5172c6bb699c398fc72394c9ce4f..d09bb02890f25a0312e62c876c1729e57a059e82 100644
--- a/tensorflow/contrib/cmake/external/googletest.cmake
+++ b/tensorflow/contrib/cmake/external/googletest.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(googletest_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/include)
@@ -7,7 +21,7 @@ set(googletest_TAG ec44c6c1675c25b9827aacd08c02433cccde7780)
 
 if(WIN32)
   set(googletest_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/${CMAKE_BUILD_TYPE}/gtest.lib)
+      ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/$(Configuration)/gtest.lib)
 else()
   set(googletest_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/googletest/src/googletest/googletest/${CMAKE_BUILD_TYPE}/gtest.a)
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 7065e5d60dafc67402b3fa46ce9082028b8d462b..d7201680ceb9984598bc45df01d6195e4e1ca897 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
@@ -7,9 +21,9 @@ set(GRPC_TAG 3bc78cd0b5bd784a235c01612d634b1ec5f8fb97)
 
 if(WIN32)
   set(grpc_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/grpc++_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/grpc_unsecure.lib
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/${CMAKE_BUILD_TYPE}/gpr.lib)
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc++_unsecure.lib
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc_unsecure.lib
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/gpr.lib)
 else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
diff --git a/tensorflow/contrib/cmake/external/highwayhash.cmake b/tensorflow/contrib/cmake/external/highwayhash.cmake
index 972c97a55a45fc2739573dccbcfccc1db4bfe135..2c23bef8a331de356c93dbf9d0e91d8bb13bd6c8 100644
--- a/tensorflow/contrib/cmake/external/highwayhash.cmake
+++ b/tensorflow/contrib/cmake/external/highwayhash.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(highwayhash_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/highwayhash)
diff --git a/tensorflow/contrib/cmake/external/jemalloc.cmake b/tensorflow/contrib/cmake/external/jemalloc.cmake
index b0b212eeb6098705538b6133b43ab067be2294b8..e4737a1dd825409133cdfd8a54f20dac819c0d5b 100644
--- a/tensorflow/contrib/cmake/external/jemalloc.cmake
+++ b/tensorflow/contrib/cmake/external/jemalloc.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(jemalloc_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include)
diff --git a/tensorflow/contrib/cmake/external/jpeg.cmake b/tensorflow/contrib/cmake/external/jpeg.cmake
index fcfa9d3485b090cf046c95317f774c56e19d7b11..f2797d13b2470df653ddb44505e1f6f827f299e9 100644
--- a/tensorflow/contrib/cmake/external/jpeg.cmake
+++ b/tensorflow/contrib/cmake/external/jpeg.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(jpeg_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/jpeg_archive)
diff --git a/tensorflow/contrib/cmake/external/jsoncpp.cmake b/tensorflow/contrib/cmake/external/jsoncpp.cmake
index 43d6e0456cdc4a7480987567a5c14cc2b49a49ce..5127d7e8f79abdda4516eb9f006e243b7438bc65 100644
--- a/tensorflow/contrib/cmake/external/jsoncpp.cmake
+++ b/tensorflow/contrib/cmake/external/jsoncpp.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(jsoncpp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/jsoncpp/src/jsoncpp)
@@ -9,7 +23,7 @@ set(jsoncpp_LIBRARIES ${jsoncpp_BUILD}/obj/so/libjsoncpp.so)
 set(jsoncpp_INCLUDES ${jsoncpp_BUILD})
 
 if(WIN32)
-  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/${CMAKE_BUILD_TYPE}/jsoncpp.lib)
+  set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/$(Configuration)/jsoncpp.lib)
 else()
   set(jsoncpp_STATIC_LIBRARIES ${jsoncpp_BUILD}/libjsoncpp.a)
 endif()
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index bfad8e5a2646167f91eaa0e2b043da51980b05fa..2b2bd47d1c95ca886469c525191c27f22d416c29 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(png_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/png_archive)
@@ -7,7 +21,9 @@ set(png_BUILD ${CMAKE_BINARY_DIR}/png/src/png)
 set(png_INSTALL ${CMAKE_BINARY_DIR}/png/install)
 
 if(WIN32)
-  set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
+  set(png_STATIC_LIBRARIES 
+    debug ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_staticd.lib
+    optimized ${CMAKE_BINARY_DIR}/png/install/lib/libpng12_static.lib)
 else()
   set(png_STATIC_LIBRARIES ${CMAKE_BINARY_DIR}/png/install/lib/libpng12.a)
 endif()
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index 5ee6987175fd98a6d0403c6ccbba7ee4c52cc7db..d600d8c3c0d30ec517d0abc4bac94c588b5268d4 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
@@ -5,8 +19,10 @@ set(PROTOBUF_URL https://github.com/mrry/protobuf.git)  # Includes MSVC fix.
 set(PROTOBUF_TAG 1d2c7b6c7376f396c8c7dd9b6afd2d4f83f3cb05)
 
 if(WIN32)
-  set(protobuf_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/${CMAKE_BUILD_TYPE}/libprotobuf.lib)
-  set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/${CMAKE_BUILD_TYPE}/protoc.exe)
+  set(protobuf_STATIC_LIBRARIES 
+    debug ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobufd.lib
+    optimized ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/libprotobuf.lib)
+  set(PROTOBUF_PROTOC_EXECUTABLE ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/$(Configuration)/protoc.exe)
   set(PROTOBUF_ADDITIONAL_CMAKE_OPTIONS	-Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF -A x64)
 else()
   set(protobuf_STATIC_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/libprotobuf.a)
diff --git a/tensorflow/contrib/cmake/external/tensorboard.cmake b/tensorflow/contrib/cmake/external/tensorboard.cmake
deleted file mode 100644
index a249af070507928f221463d175b8d79cb55cd2f8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/cmake/external/tensorboard.cmake
+++ /dev/null
@@ -1,134 +0,0 @@
-include (ExternalProject)
-
-set(tensorboard_dependencies)
-add_custom_target(tensorboard_copy_dependencies)
-
-function(tb_new_http_archive)
-  cmake_parse_arguments(_TB "" "NAME;URL" "FILES" ${ARGN})
-  ExternalProject_Add(${_TB_NAME}
-    PREFIX ${_TB_NAME}
-    URL ${_TB_URL}
-    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}/${_TB_NAME}"
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND ""
-    INSTALL_COMMAND ""
-  )
-
-  set(src_dir "${CMAKE_CURRENT_BINARY_DIR}/${_TB_NAME}/src/${_TB_NAME}")
-  set(dst_dir "${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external/${_TB_NAME}")
-
-  foreach(src_file ${_TB_FILES})
-    add_custom_command(
-      TARGET tensorboard_copy_dependencies PRE_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src_dir}/${src_file} ${dst_dir}/${src_file}
-    )
-  endforeach()
-  
-  set(tensorboard_dependencies ${tensorboard_dependencies} ${_TB_NAME} PARENT_SCOPE)
-endfunction()
-
-function(tb_http_file)
-  cmake_parse_arguments(_TB "" "NAME;URL" "" ${ARGN})
-  get_filename_component(src_file ${_TB_URL} NAME)
-  file(DOWNLOAD ${_TB_URL} "${DOWNLOAD_LOCATION}/${_TB_NAME}/${src_file}")
-  
-  set(src_dir "${DOWNLOAD_LOCATION}/${_TB_NAME}")
-  set(dst_dir "${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external/${_TB_NAME}/file")
-  
-  add_custom_command(
-    TARGET tensorboard_copy_dependencies PRE_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_if_different ${src_dir}/${src_file} ${dst_dir}/${src_file}
-  )
-  
-  add_custom_target(${_TB_NAME} DEPENDS ${src_dir}/${src_file})
-  set(tensorboard_dependencies ${tensorboard_dependencies} ${_TB_NAME} PARENT_SCOPE)
-endfunction()
-
-# Parse TensorBoard dependency names and URLs from Bazel's WORKSPACE file.
-set(tb_dep_names)
-file(STRINGS ${PROJECT_SOURCE_DIR}/../../../WORKSPACE workspace_contents)
-foreach(line ${workspace_contents})
-  if(line MATCHES "# TENSORBOARD_BOWER_AUTOGENERATED_BELOW_THIS_LINE_DO_NOT_EDIT")
-    set(tb_deps_started 1)
-  endif()
-
-  if(NOT tb_deps_started)
-    continue()
-  endif()
-
-  if(line MATCHES "new_http_archive\\(")
-    set(tb_dep_is_archive 1)
-    continue()
-  elseif(line MATCHES "http_file\\(")
-    set(tb_dep_is_archive 0)
-    continue()
-  endif()
-
-  string(REGEX MATCH "name.*=.*\"(.*)\"" has_name ${line})
-  if(has_name)
-    set(tb_dep_name ${CMAKE_MATCH_1})
-    continue()
-  endif()
-
-  string(REGEX MATCH "url.*=.*\"(.*)\"" has_url ${line})
-  if(has_url)
-    list(APPEND tb_dep_names ${tb_dep_name})
-    set(${tb_dep_name}_is_archive ${tb_dep_is_archive})
-    set(${tb_dep_name}_url ${CMAKE_MATCH_1})
-  endif()
-endforeach()
-
-# Parse the files needed for each TensorBoard dependency from Bazel's bower.BUILD file.
-# Due to CMAKE quirkiness, cannot use file(strings) with files that contain '[' and ']'.
-file(READ ${PROJECT_SOURCE_DIR}/../../../bower.BUILD bower_build_contents)
-string(REPLACE "\[" "OB" bower_build_contents "${bower_build_contents}")
-string(REPLACE "\]" "CB" bower_build_contents "${bower_build_contents}")
-string(REPLACE ";" "\\\\;" bower_build_contents "${bower_build_contents}")
-string(REPLACE "\n" "E;" bower_build_contents "${bower_build_contents}")
-foreach(line ${bower_build_contents})
-  string(REGEX MATCH "name.*=.*\"(.*)\"" has_name ${line})
-  if(has_name)
-    set(tb_dep_name ${CMAKE_MATCH_1})
-    set(${tb_dep_name}_files)
-    continue()
-  endif()
-
-  string(REGEX MATCH "srcs.*=.*\"(.*)\"CB" has_single_line_src ${line})
-  if(has_single_line_src)
-    list(APPEND ${tb_dep_name}_files ${CMAKE_MATCH_1})
-    continue()
-  endif()
-
-  if(line MATCHES "srcs.*=.*OB")
-    set(inside_files_def 1)
-    continue()
-  elseif(line MATCHES "CB,")
-    set(inside_files_def 0)
-    continue()
-  endif()
-
-  if(inside_files_def)
-   string(REGEX MATCH "\"(.*)\"," has_file ${line})
-   if(has_file)
-     list(APPEND ${tb_dep_name}_files ${CMAKE_MATCH_1})
-   endif()
-  endif()
-endforeach()
-
-# Generate a target for each dependency.
-foreach(tb_dep_name ${tb_dep_names})
-  if (${tb_dep_name}_is_archive)
-    tb_new_http_archive(
-      NAME ${tb_dep_name}
-      URL ${${tb_dep_name}_url}
-      FILES ${${tb_dep_name}_files}
-    )
-  else()
-    tb_http_file(
-      NAME ${tb_dep_name}
-      URL ${${tb_dep_name}_url}
-    )
-  endif()
-endforeach()
-
-add_dependencies(tensorboard_copy_dependencies ${tensorboard_dependencies})
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
index eb7d4bc38b9ed34e1745946d66956e700d7a434f..c8af611e1eaefdf135551940a66985a4d50b26ed 100644
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 include (ExternalProject)
 
 set(zlib_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/zlib_archive)
@@ -8,7 +22,8 @@ set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d)
 
 if(WIN32)
   set(zlib_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
+      debug ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstaticd.lib
+      optimized ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/zlibstatic.lib)
 else()
   set(zlib_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/zlib/install/lib/libz.a)
diff --git a/tensorflow/contrib/cmake/patches/fft2d/CMakeLists.txt b/tensorflow/contrib/cmake/patches/fft2d/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b31ea3ed98092340c73042dfece44114fa0f4ca3
--- /dev/null
+++ b/tensorflow/contrib/cmake/patches/fft2d/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 2.8.3)
+
+project(fft2d)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(FFT2D_SRCS
+    "fftsg.c"
+)
+
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
+
+add_library(fft2d ${FFT2D_SRCS})
+
+install(TARGETS fft2d
+  LIBRARY DESTINATION lib COMPONENT RuntimeLibraries
+  ARCHIVE DESTINATION lib COMPONENT Development)
diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..069cdfa35261fdfa909b4a4f5761be9e1d9d185f
--- /dev/null
+++ b/tensorflow/contrib/cmake/tf_c.cmake
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+########################################################
+# tf_c_framework library
+########################################################
+set(tf_c_srcs
+    "${tensorflow_source_dir}/tensorflow/c/c_api.cc"
+    "${tensorflow_source_dir}/tensorflow/c/c_api.h"
+    "${tensorflow_source_dir}/tensorflow/c/checkpoint_reader.cc"
+    "${tensorflow_source_dir}/tensorflow/c/checkpoint_reader.h"
+    "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc"
+    "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.h"
+)
+
+add_library(tf_c OBJECT ${tf_c_srcs})
+add_dependencies(tf_c tf_cc_framework tf_core_lib tf_protos_cc)
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index 936196dd202dbb6e57f732cc3703dffab1931681..b53f428461d70685b0226f4d0cd0a3f63d8f47d8 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 ########################################################
 # tf_cc_framework library
 ########################################################
@@ -5,6 +19,7 @@ set(tf_cc_framework_srcs
     "${tensorflow_source_dir}/tensorflow/cc/framework/ops.h"
     "${tensorflow_source_dir}/tensorflow/cc/framework/ops.cc"
     "${tensorflow_source_dir}/tensorflow/cc/framework/scope.h"
+    "${tensorflow_source_dir}/tensorflow/cc/framework/scope_internal.h"
     "${tensorflow_source_dir}/tensorflow/cc/framework/scope.cc"
 )
 
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index 4410b361f39f082a93499a781fafa192cb5464a3..1c80ffcd7b129f6b9e934c205c1d73273f1eb9b8 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 ########################################################
 # tf_core_cpu library
 ########################################################
diff --git a/tensorflow/contrib/cmake/tf_core_direct_session.cmake b/tensorflow/contrib/cmake/tf_core_direct_session.cmake
index 712f04ddc1d31cab6fd03c48ec0fd2cf7f6c3398..de2fa8669577ddc0b2ae3170eab9d080bd43ff1f 100644
--- a/tensorflow/contrib/cmake/tf_core_direct_session.cmake
+++ b/tensorflow/contrib/cmake/tf_core_direct_session.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 ########################################################
 # tf_core_direct_session library
 ########################################################
diff --git a/tensorflow/contrib/cmake/tf_core_distributed_runtime.cmake b/tensorflow/contrib/cmake/tf_core_distributed_runtime.cmake
index ffa5710534056ad71b27c896979fc22764c3e0ff..2c1b6d1f6e5caa36eaff4c9157ccef01a77b41f0 100644
--- a/tensorflow/contrib/cmake/tf_core_distributed_runtime.cmake
+++ b/tensorflow/contrib/cmake/tf_core_distributed_runtime.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 ########################################################
 # tf_core_distributed_runtime library
 ########################################################
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 3787ac4c81d6c2a3bd456f7f7a477104fbc97613..a048194a1973188dfe3bba88b2dd8b65a7a55b55 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 ########################################################
 # RELATIVE_PROTOBUF_GENERATE_CPP function
 ########################################################
@@ -104,8 +118,10 @@ set(tf_proto_text_srcs
     "tensorflow/core/framework/types.proto"
     "tensorflow/core/framework/versions.proto"
     "tensorflow/core/lib/core/error_codes.proto"
+    "tensorflow/core/protobuf/cluster.proto"
     "tensorflow/core/protobuf/config.proto"
     "tensorflow/core/protobuf/debug.proto"
+    "tensorflow/core/protobuf/device_properties.proto"
     "tensorflow/core/protobuf/rewriter_config.proto"
     "tensorflow/core/protobuf/tensor_bundle.proto"
     "tensorflow/core/protobuf/saver.proto"
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 33384eed4809bc366f6e539a45b39931b0444325..c95cd068cd6372973d1b35fd91716901945dd21f 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 ########################################################
 # tf_core_kernels library
 ########################################################
@@ -33,15 +47,18 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc"
       #"${tensorflow_source_dir}/tensorflow/contrib/ffmpeg/decode_audio_op.cc"
       #"${tensorflow_source_dir}/tensorflow/contrib/ffmpeg/encode_audio_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/layers/kernels/bucketization_kernel.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/kernels/sparse_feature_cross_kernel.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/bucketization_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/ops/tensor_forest_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/best_splits_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/kernels/count_extremely_random_stats_op.cc"
@@ -84,6 +101,7 @@ file(GLOB_RECURSE tf_core_kernels_exclude_srcs
    "${tensorflow_source_dir}/tensorflow/core/kernels/*.cu.cc"
    "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/*"
    "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_execute*.cc"
+   "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_rewriter_transform*.cc"
 )
 list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
 
@@ -93,12 +111,17 @@ if(WIN32)
       "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
       "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
       "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
+      "${tensorflow_source_dir}/tensorflow/core/kernels/neon/*"
       # no in tensorflow.dll - comes from .so
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc"
+      # temporarily disable nccl (nccl itself needs to be ported to windows first)
+      "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
+      "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
   )
   list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs})
 endif(WIN32)
@@ -106,6 +129,7 @@ endif(WIN32)
 file(GLOB_RECURSE tf_core_gpu_kernels_srcs
    "${tensorflow_source_dir}/tensorflow/core/kernels/*.cu.cc"
    "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/*.cu.cc"
+   "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/*.cu.cc"
 )
 
 if(WIN32 AND tensorflow_ENABLE_GPU)
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 4e3000562956d404efd9b9003e4a0bb9999b584a..7131f14d992f316a4fd1da58f7e90e9274b8658b 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -1,13 +1,29 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 set(tf_op_lib_names
     "array_ops"
     "candidate_sampling_ops"
     "control_flow_ops"
     "ctc_ops"
     "data_flow_ops"
+    "dataset_ops"
     "functional_ops"
     "image_ops"
     "io_ops"
     "linalg_ops"
+    "lookup_ops"
     "logging_ops"
     "math_ops"
     "nn_ops"
@@ -18,11 +34,12 @@ set(tf_op_lib_names
     "resource_variable_ops"
     "script_ops"
     "sdca_ops"
-    "set_ops"  
+    "set_ops"
     "sendrecv_ops"
     "sparse_ops"
     "spectral_ops"
     "state_ops"
+    "stateless_random_ops"
     "string_ops"
     "training_ops"
 )
@@ -55,11 +72,12 @@ GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir
 GENERATE_CONTRIB_OP_LIBRARY(framework_variable "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(input_pipeline "${tensorflow_source_dir}/tensorflow/contrib/input_pipeline/ops/input_pipeline_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(image "${tensorflow_source_dir}/tensorflow/contrib/image/ops/image_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(layers_bucketization "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/bucketization_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(layers_sparse_feature_cross "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(memory_stats "${tensorflow_source_dir}/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(nccl "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(rnn_gru "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(rnn_lstm "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(seq2seq_beam_search "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(tensor_forest "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/ops/tensor_forest_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(tensor_forest_hybrid "${tensor_forest_hybrid_srcs}")
 GENERATE_CONTRIB_OP_LIBRARY(bigquery_reader "${tensorflow_source_dir}/tensorflow/contrib/cloud/ops/bigquery_reader_ops.cc")
@@ -95,7 +113,7 @@ file(GLOB_RECURSE tf_core_ops_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/user_ops/*.cu.cc"
 )
 
-list(REMOVE_ITEM tf_core_ops_srcs ${tf_core_ops_exclude_srcs}) 
+list(REMOVE_ITEM tf_core_ops_srcs ${tf_core_ops_exclude_srcs})
 
 add_library(tf_core_ops OBJECT ${tf_core_ops_srcs})
 
diff --git a/tensorflow/contrib/cmake/tf_label_image_example.cmake b/tensorflow/contrib/cmake/tf_label_image_example.cmake
index cfd213114889231e2fdd5296c582395a147b622e..0d3a4699ebb102257e8a4a816652c90ffff42d92 100644
--- a/tensorflow/contrib/cmake/tf_label_image_example.cmake
+++ b/tensorflow/contrib/cmake/tf_label_image_example.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 set(tf_label_image_example_srcs
     "${tensorflow_source_dir}/tensorflow/examples/label_image/main.cc"
 )
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 02038da7f85bc7b2c6a5a443d94b83e05fe5ec45..7789edf809869365fffd595995dd451ee98f99a1 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 # CMake rules for generating the TensorFlow Python bindings.
 #
 # Known limitations:
@@ -13,7 +27,6 @@
 
 # 1. Resolve the installed version of Python (for Python.h and python).
 # TODO(mrry): Parameterize the build script to enable Python 3 building.
-include(FindPythonInterp)
 if(NOT PYTHON_INCLUDE_DIR)
   set(PYTHON_NOT_FOUND false)
   exec_program("${PYTHON_EXECUTABLE}"
@@ -111,6 +124,7 @@ file(GLOB_RECURSE tf_protos_python_srcs RELATIVE ${tensorflow_source_dir}
     "${tensorflow_source_dir}/tensorflow/core/*.proto"
     "${tensorflow_source_dir}/tensorflow/python/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
+    "${tensorflow_source_dir}/tensorflow/tensorboard/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
 )
@@ -124,6 +138,7 @@ RELATIVE_PROTOBUF_GENERATE_PYTHON(
 file(GLOB_RECURSE tf_python_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
     "${tensorflow_source_dir}/tensorflow/python/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
+    "${tensorflow_source_dir}/tensorflow/tensorboard/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
 )
@@ -187,13 +202,17 @@ add_python_module("tensorflow/python/estimator")
 add_python_module("tensorflow/python/estimator/export")
 add_python_module("tensorflow/python/estimator/inputs")
 add_python_module("tensorflow/python/estimator/inputs/queues")
+add_python_module("tensorflow/python/feature_column")
 add_python_module("tensorflow/python/framework")
+add_python_module("tensorflow/python/grappler")
 add_python_module("tensorflow/python/kernel_tests")
+add_python_module("tensorflow/python/kernel_tests/distributions")
 add_python_module("tensorflow/python/layers")
 add_python_module("tensorflow/python/lib")
 add_python_module("tensorflow/python/lib/core")
 add_python_module("tensorflow/python/lib/io")
 add_python_module("tensorflow/python/ops")
+add_python_module("tensorflow/python/ops/distributions")
 add_python_module("tensorflow/python/ops/losses")
 add_python_module("tensorflow/python/platform")
 add_python_module("tensorflow/python/platform/default")
@@ -210,7 +229,6 @@ add_python_module("tensorflow/tensorboard")
 add_python_module("tensorflow/tensorboard/backend")
 add_python_module("tensorflow/tensorboard/backend/event_processing")
 add_python_module("tensorflow/tensorboard/plugins")
-add_python_module("tensorflow/tensorboard/plugins/debugger")
 add_python_module("tensorflow/tensorboard/plugins/projector")
 add_python_module("tensorflow/tensorboard/plugins/text")
 add_python_module("tensorflow/tensorboard/scripts")
@@ -247,6 +265,11 @@ add_python_module("tensorflow/contrib/cudnn_rnn/ops")
 add_python_module("tensorflow/contrib/cudnn_rnn/python")
 add_python_module("tensorflow/contrib/cudnn_rnn/python/kernel_tests")
 add_python_module("tensorflow/contrib/cudnn_rnn/python/ops")
+add_python_module("tensorflow/contrib/data")
+add_python_module("tensorflow/contrib/data/python")
+add_python_module("tensorflow/contrib/data/python/framework")
+add_python_module("tensorflow/contrib/data/python/kernel_tests")
+add_python_module("tensorflow/contrib/data/python/ops")
 add_python_module("tensorflow/contrib/deprecated")
 add_python_module("tensorflow/contrib/distributions")
 add_python_module("tensorflow/contrib/distributions/python")
@@ -342,6 +365,9 @@ add_python_module("tensorflow/contrib/keras/python/keras/layers")
 add_python_module("tensorflow/contrib/keras/python/keras/preprocessing")
 add_python_module("tensorflow/contrib/keras/python/keras/utils")
 add_python_module("tensorflow/contrib/keras/python/keras/wrappers")
+add_python_module("tensorflow/contrib/kernel_methods")
+add_python_module("tensorflow/contrib/kernel_methods/python")
+add_python_module("tensorflow/contrib/kernel_methods/python/mappers")
 add_python_module("tensorflow/contrib/labeled_tensor")
 add_python_module("tensorflow/contrib/labeled_tensor/python")
 add_python_module("tensorflow/contrib/labeled_tensor/python/ops")
@@ -405,6 +431,11 @@ add_python_module("tensorflow/contrib/ndlstm/python")
 add_python_module("tensorflow/contrib/nn")
 add_python_module("tensorflow/contrib/nn/python")
 add_python_module("tensorflow/contrib/nn/python/ops")
+add_python_module("tensorflow/contrib/nccl")
+add_python_module("tensorflow/contrib/nccl/kernels")
+add_python_module("tensorflow/contrib/nccl/ops")
+add_python_module("tensorflow/contrib/nccl/python")
+add_python_module("tensorflow/contrib/nccl/python/ops")
 add_python_module("tensorflow/contrib/opt")
 add_python_module("tensorflow/contrib/opt/python")
 add_python_module("tensorflow/contrib/opt/python/training")
@@ -424,6 +455,8 @@ add_python_module("tensorflow/contrib/saved_model")
 add_python_module("tensorflow/contrib/saved_model/python")
 add_python_module("tensorflow/contrib/saved_model/python/saved_model")
 add_python_module("tensorflow/contrib/seq2seq")
+add_python_module("tensorflow/contrib/seq2seq/kernels")
+add_python_module("tensorflow/contrib/seq2seq/ops")
 add_python_module("tensorflow/contrib/seq2seq/python")
 add_python_module("tensorflow/contrib/seq2seq/python/kernel_tests")
 add_python_module("tensorflow/contrib/seq2seq/python/ops")
@@ -445,7 +478,9 @@ add_python_module("tensorflow/contrib/sparsemax/python")
 add_python_module("tensorflow/contrib/sparsemax/python/ops")
 add_python_module("tensorflow/contrib/specs")
 add_python_module("tensorflow/contrib/specs/python")
+add_python_module("tensorflow/contrib/staging")
 add_python_module("tensorflow/contrib/stat_summarizer")
+add_python_module("tensorflow/contrib/stateless")
 add_python_module("tensorflow/contrib/tensorboard")
 add_python_module("tensorflow/contrib/tensorboard/plugins")
 add_python_module("tensorflow/contrib/tensorboard/plugins/projector")
@@ -479,13 +514,6 @@ add_python_module("tensorflow/contrib/training/python/training")
 add_python_module("tensorflow/contrib/util")
 
 
-# Additional directories with no Python sources.
-add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist")
-add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/lib/css")
-
-
 ########################################################
 # tf_python_op_gen_main library
 ########################################################
@@ -562,10 +590,12 @@ GENERATE_PYTHON_OP_LIB("control_flow_ops"
   ADDITIONAL_LIBRARIES $<TARGET_OBJECTS:tf_no_op>)
 GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
+GENERATE_PYTHON_OP_LIB("dataset_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
 GENERATE_PYTHON_OP_LIB("logging_ops")
+GENERATE_PYTHON_OP_LIB("lookup_ops")
 GENERATE_PYTHON_OP_LIB("nn_ops")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
@@ -593,22 +623,26 @@ GENERATE_PYTHON_OP_LIB("contrib_input_pipeline_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/input_pipeline/ops/gen_input_pipeline_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_image_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/image/ops/gen_image_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_layers_bucketization_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/layers/ops/gen_bucketization_op.py)
 GENERATE_PYTHON_OP_LIB("contrib_layers_sparse_feature_cross_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/layers/ops/gen_sparse_feature_cross_op.py)
 GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/memory_stats/ops/gen_memory_stats_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_nccl_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nccl/ops/gen_nccl_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_rnn_gru_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/ops/gen_gru_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_rnn_lstm_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/ops/gen_lstm_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_seq2seq_beam_search_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/ops/gen_beam_search_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_tensor_forest_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/tensor_forest/python/ops/gen_tensor_forest_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_tensor_forest_hybrid_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/tensor_forest/hybrid/ops/gen_training_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_bigquery_reader_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_bigquery_reader_ops.py)
+GENERATE_PYTHON_OP_LIB("stateless_random_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/stateless/gen_stateless_random_ops.py)
 
 add_custom_target(tf_python_ops SOURCES ${tf_python_ops_generated_files} ${PYTHON_PROTO_GENFILES})
 add_dependencies(tf_python_ops tf_python_op_gen_main)
@@ -658,12 +692,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/lib/io/py_record_writer.cc"
     "${tensorflow_source_dir}/tensorflow/python/util/kernel_registry.h"
     "${tensorflow_source_dir}/tensorflow/python/util/kernel_registry.cc"
-    "${tensorflow_source_dir}/tensorflow/c/c_api.cc"
-    "${tensorflow_source_dir}/tensorflow/c/c_api.h"
-    "${tensorflow_source_dir}/tensorflow/c/checkpoint_reader.cc"
-    "${tensorflow_source_dir}/tensorflow/c/checkpoint_reader.h"
-    "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc"
-    "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.h"
+    "${tensorflow_source_dir}/tensorflow/cc/framework/ops.cc"
+    "${tensorflow_source_dir}/tensorflow/cc/framework/scope.cc"
     "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.cc"
 )
 
@@ -679,9 +709,12 @@ if(WIN32)
     #
     add_library(pywrap_tensorflow_internal_static STATIC
         ${pywrap_tensorflow_internal_src}
+        $<TARGET_OBJECTS:tf_c>
         $<TARGET_OBJECTS:tf_core_lib>
         $<TARGET_OBJECTS:tf_core_cpu>
         $<TARGET_OBJECTS:tf_core_framework>
+        $<TARGET_OBJECTS:tf_cc>
+        $<TARGET_OBJECTS:tf_cc_ops>
         $<TARGET_OBJECTS:tf_core_ops>
         $<TARGET_OBJECTS:tf_core_direct_session>
         $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
@@ -690,33 +723,44 @@ if(WIN32)
         $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
         $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
     )
+    
     target_include_directories(pywrap_tensorflow_internal_static PUBLIC
         ${PYTHON_INCLUDE_DIR}
         ${NUMPY_INCLUDE_DIR}
     )
-    target_link_libraries(pywrap_tensorflow_internal_static
-        tf_protos_cc
-        tf_python_protos_cc
+    #target_link_libraries(pywrap_tensorflow_internal_static
+    #	tf_protos_cc
+    #	tf_python_protos_cc
+    #)  
+    add_dependencies(pywrap_tensorflow_internal_static tf_protos_cc tf_python_protos_cc)
+    set(pywrap_tensorflow_internal_static_dependencies
+        $<TARGET_FILE:pywrap_tensorflow_internal_static>
+        $<TARGET_FILE:tf_protos_cc>
+        $<TARGET_FILE:tf_python_protos_cc>
     )
+    
     set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def")
     set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE)
 
     add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD
         COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
-            --input $<TARGET_FILE:pywrap_tensorflow_internal_static>
-            --output ${pywrap_tensorflow_deffile}
+            --input "${pywrap_tensorflow_internal_static_dependencies}"
+            --output "${pywrap_tensorflow_deffile}"
+            --target _pywrap_tensorflow_internal.pyd
     )
 endif(WIN32)
 
-
 # pywrap_tensorflow_internal is a shared library containing all of the
 # TensorFlow runtime and the standard ops and kernels. These are installed into
 # tf_python/tensorflow/python/.
 add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_internal_src}
+    $<TARGET_OBJECTS:tf_c>
     $<TARGET_OBJECTS:tf_core_lib>
     $<TARGET_OBJECTS:tf_core_cpu>
     $<TARGET_OBJECTS:tf_core_framework>
+    $<TARGET_OBJECTS:tf_cc>
+    $<TARGET_OBJECTS:tf_cc_ops>
     $<TARGET_OBJECTS:tf_core_ops>
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
@@ -735,7 +779,8 @@ target_include_directories(pywrap_tensorflow_internal PUBLIC
     ${PYTHON_INCLUDE_DIR}
     ${NUMPY_INCLUDE_DIR}
 )
-target_link_libraries(pywrap_tensorflow_internal
+
+target_link_libraries(pywrap_tensorflow_internal PRIVATE
     ${tf_core_gpu_kernels_lib}
     ${tensorflow_EXTERNAL_LIBRARIES}
     tf_protos_cc
@@ -781,13 +826,32 @@ if(WIN32)
         DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
 endif(WIN32)
 
+if(WIN32)
+    # include contrib/seq2seq as .so
+    #
+    set(tf_beam_search_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
+        "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
+        "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
+    )
+
+    set(tf_beam_search_gpu_srcs
+        "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
+    )
+
+    AddUserOps(TARGET _beam_search_ops
+        SOURCES "${tf_beam_search_srcs}"
+        GPUSOURCES ${tf_beam_search_gpu_srcs}
+        DEPENDS pywrap_tensorflow_internal tf_python_ops
+        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
+endif(WIN32)
+
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
 add_custom_target(tf_python_build_pip_package)
 add_dependencies(tf_python_build_pip_package
     pywrap_tensorflow_internal
-    tensorboard_copy_dependencies
     tf_python_copy_scripts_to_destination
     tf_python_touchup_modules
     tf_python_ops
@@ -797,9 +861,9 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
 if(WIN32)
   add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.dll
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
                                      ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
                                      ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
 else()
   add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
@@ -814,24 +878,17 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
 
 # Copy resources for TensorBoard.
+file(DOWNLOAD http://mirror.bazel.build/tensorboard/index.html ${DOWNLOAD_LOCATION}/tensorboard/index.html
+  EXPECTED_HASH SHA256=60f185c68ff3f906000df9670bf9f46588056b197da7e7b10074411a0c048dae)
+add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
+  COMMAND ${CMAKE_COMMAND} -E make_directory
+  ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/components/)
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/bazel-html-imports.html
-                                   ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
-add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/index.html
-                                   ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
-add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/dist/tf-tensorboard.html
-                                   ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/dist/)
-add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/lib/css/global.css
-                                   ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/lib/css/)
+  COMMAND ${CMAKE_COMMAND} -E copy ${DOWNLOAD_LOCATION}/tensorboard/index.html
+  ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/components/)
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tensorboard/TAG
-                                   ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/)
-add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/tensorboard_external
-                                             ${CMAKE_CURRENT_BINARY_DIR}/tf_python/external)
+  ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tensorboard/)
 
 # Copy datasets for tf.contrib.learn.
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9385ac52e903e1f0f2436066f573af5359c46770
--- /dev/null
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -0,0 +1,94 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+if(WIN32)
+  # Windows: build a static library with the same objects as tensorflow.dll.
+  # This can be used to build for a standalone exe and also helps us to
+  # find all symbols that need to be exported from the dll which is needed
+  # to provide the tensorflow c/c++ api in tensorflow.dll.
+  # From the static library we create the def file with all symbols that need to
+  # be exported from tensorflow.dll. Because there is a limit of 64K sybmols
+  # that can be exported, we filter the symbols with a python script to the namespaces
+  # we need.
+  #
+  add_library(tensorflow_static STATIC
+      $<TARGET_OBJECTS:tf_c>
+      $<TARGET_OBJECTS:tf_cc>
+      $<TARGET_OBJECTS:tf_cc_framework>
+      $<TARGET_OBJECTS:tf_cc_ops>
+      $<TARGET_OBJECTS:tf_core_lib>
+      $<TARGET_OBJECTS:tf_core_cpu>
+      $<TARGET_OBJECTS:tf_core_framework>
+      $<TARGET_OBJECTS:tf_core_ops>
+      $<TARGET_OBJECTS:tf_core_direct_session>
+      $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
+      $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
+      $<TARGET_OBJECTS:tf_core_kernels>
+      $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+      $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
+  )
+
+  add_dependencies(tensorflow_static tf_protos_cc)
+  set(tensorflow_static_dependencies
+      $<TARGET_FILE:tensorflow_static>
+      $<TARGET_FILE:tf_protos_cc>
+  )
+    
+  set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/tensorflow.def")
+  set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
+
+  add_custom_command(TARGET tensorflow_static POST_BUILD
+      COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py
+          --input "${tensorflow_static_dependencies}"
+          --output "${tensorflow_deffile}"
+          --target tensorflow.dll
+  )
+endif(WIN32)
+
+# tensorflow is a shared library containing all of the
+# TensorFlow runtime and the standard ops and kernels.
+add_library(tensorflow SHARED
+    $<TARGET_OBJECTS:tf_c>
+    $<TARGET_OBJECTS:tf_cc>
+    $<TARGET_OBJECTS:tf_cc_framework>
+    $<TARGET_OBJECTS:tf_cc_ops>
+    $<TARGET_OBJECTS:tf_core_lib>
+    $<TARGET_OBJECTS:tf_core_cpu>
+    $<TARGET_OBJECTS:tf_core_framework>
+    $<TARGET_OBJECTS:tf_core_ops>
+    $<TARGET_OBJECTS:tf_core_direct_session>
+    $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
+    $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
+    $<TARGET_OBJECTS:tf_core_kernels>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
+    ${tensorflow_deffile}
+)
+
+target_link_libraries(tensorflow PRIVATE
+    ${tf_core_gpu_kernels_lib}
+    ${tensorflow_EXTERNAL_LIBRARIES}
+    tf_protos_cc
+)
+
+# There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when
+# linking to the tensorflow library. Adding the following libraries fixes it.
+# See issue on github: https://github.com/tensorflow/tensorflow/issues/9593
+if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+    target_link_libraries(tensorflow PRIVATE gcc_s gcc)
+endif()
+
+if(WIN32)
+  add_dependencies(tensorflow tensorflow_static)
+endif(WIN32)
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index bf45bb0863193395e6b1ab2dd0cbf572d621165e..3d84f1ebb9c1fa1b2f3ccdd8d5ae8eaf182f7715 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 #cc_library(
 #    name = "stream_executor",
 #    srcs = glob(
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 2146870dfd666722f6ad672f0f41a1111f263a17..4abf3b0a645141b41f7b5637604bf3bbbb974479 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 enable_testing()
 
 #
@@ -120,7 +134,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/contrib/rnn/python/kernel_tests/*_test.py"
     )
   endif()
- 
+
   file(GLOB_RECURSE tf_test_src_py
     ${tf_test_rnn_src_py}
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/*_test.py"
@@ -130,8 +144,10 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/python/saved_model/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/training/*_test.py"
     "${tensorflow_source_dir}/tensorflow/tensorboard/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/data/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/integration_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/stateless/python/kernel_tests/*_test.py"
     # NOTE: tensor_forest tests in tensor_forest/hybrid/... still don't pass.
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/client/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/*_test.py"
@@ -141,6 +157,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
   set(tf_test_src_py_exclude
     # Python source line inspection tests are flaky on Windows (b/36375074).
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/analyzer_cli_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/debug/cli/profile_analyzer_cli_test.py"
     # Windows does not have the curses library and uses readline.
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/curses_ui_test.py"
     # generally not working
@@ -165,12 +182,14 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/cast_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/string_to_number_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/clip_ops_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/tensor_array_ops_test.py"  # Needs portpicker.
       # misc
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/variable_scope_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/reshape_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py"
       "${tensorflow_source_dir}/tensorflow/tensorboard/backend/server_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/diag_op_test.py"  # Silently failing with GPU kernel disabled.
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py"  # Depends on gemmlowp -> pthread.
       # int32/int64 mixup
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py"
@@ -189,6 +208,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/tensorboard/lib/python/http_util_test.py"
       # Broken tensorboard test due to cmake issues.
       "${tensorflow_source_dir}/tensorflow/tensorboard/plugins/debugger/plugin_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"
       # tensor_forest tests (also note that we exclude the hybrid tests for now)
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py"  # Results in wrong order.
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py"  # Results in wrong order.
@@ -198,7 +218,8 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # Newly running on Windows since TensorBoard backend move. Fail on Windows and need debug.
       "${tensorflow_source_dir}/tensorflow/tensorboard/backend/event_processing/directory_watcher_test.py"
       "${tensorflow_source_dir}/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py"
-    )
+      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on Windows.
+  )
   endif()
   list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
 
@@ -263,6 +284,7 @@ if (tensorflow_BUILD_CC_TESTS)
     "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/call_options_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/distributed_runtime/tensor_coding_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc"
+    "${tensorflow_source_dir}/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/graph_transferer_test.cc"
     "${tensorflow_source_dir}/tensorflow/core/kernels/hexagon/quantized_matmul_op_for_hexagon_test.cc"
   )
diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake
index 636caf5f3d9605ee1768a9710f727f258664dd13..6ef95989630a39eaedaddda68f7da709e7d9ab03 100644
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 set(tf_tools_proto_text_src_dir "${tensorflow_source_dir}/tensorflow/tools/proto_text")
 
 file(GLOB tf_tools_proto_text_srcs
@@ -59,10 +73,13 @@ add_executable(${transform_graph}
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
     $<TARGET_OBJECTS:tf_core_kernels>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
 target_link_libraries(${transform_graph} PUBLIC
   tf_protos_cc
+  ${tf_core_gpu_kernels_lib}
   ${tensorflow_EXTERNAL_LIBRARIES}
 )
 
@@ -78,10 +95,13 @@ add_executable(${summarize_graph}
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
     $<TARGET_OBJECTS:tf_core_kernels>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
 target_link_libraries(${summarize_graph} PUBLIC
   tf_protos_cc
+  ${tf_core_gpu_kernels_lib}
   ${tensorflow_EXTERNAL_LIBRARIES}
 )
 
@@ -97,10 +117,13 @@ add_executable(${compare_graphs}
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
     $<TARGET_OBJECTS:tf_core_kernels>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
 target_link_libraries(${compare_graphs} PUBLIC
   tf_protos_cc
+  ${tf_core_gpu_kernels_lib}
   ${tensorflow_EXTERNAL_LIBRARIES}
 )
 
diff --git a/tensorflow/contrib/cmake/tf_tutorials.cmake b/tensorflow/contrib/cmake/tf_tutorials.cmake
index d6547d6eacc724c7b7033ee099afca41f8fc7acc..858e7dda92e9e9f456d5fc56b563b2e3ec998520 100644
--- a/tensorflow/contrib/cmake/tf_tutorials.cmake
+++ b/tensorflow/contrib/cmake/tf_tutorials.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 set(tf_tutorials_example_trainer_srcs
     "${tensorflow_source_dir}/tensorflow/cc/tutorials/example_trainer.cc"
 )
diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py
index ebdc9181686683e0f9489b2ac626712accfbe0db..b16a5eadb05da79f2d5a325fea4986e6728fe021 100644
--- a/tensorflow/contrib/cmake/tools/create_def_file.py
+++ b/tensorflow/contrib/cmake/tools/create_def_file.py
@@ -44,11 +44,19 @@ UNDNAME = "undname.exe"
 DUMPBIN = "dumpbin.exe"
 
 # Exclude if matched
-EXCLUDE_RE = re.compile(r"deleting destructor|::internal::")
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::")
 
 # Include if matched before exclude
-INCLUDEPRE_RE = re.compile(r"tensorflow::internal::LogMessage|"
-                           r"tensorflow::internal::CheckOpMessageBuilder")
+INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
+                           r"tensorflow::internal::LogMessage|"
+                           r"tensorflow::internal::LogString|"
+                           r"tensorflow::internal::CheckOpMessageBuilder|"
+                           r"tensorflow::internal::PickUnusedPortOrDie|"
+                           r"tensorflow::internal::ValidateDevice|"
+                           r"tensorflow::ops::internal::Enter|"
+                           r"tensorflow::strings::internal::AppendPieces|"
+                           r"tensorflow::strings::internal::CatPieces|"
+                           r"tensorflow::io::internal::JoinPathImpl")
 
 # Include if matched after exclude
 INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
@@ -56,12 +64,27 @@ INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
                         r"functor::|"
                         r"perftools::gputools")
 
-
+# We want to identify data members explicitly in the DEF file, so that no one
+# can implicitly link against the DLL if they use one of the variables exported
+# from the DLL and the header they use does not decorate the symbol with
+# __declspec(dllimport). It is easier to detect what a data symbol does 
+# NOT look like, so doing it with the below regex.
+DATA_EXCLUDE_RE = re.compile(r"[)(]|"
+                             r"vftable|"
+                             r"vbtable|"
+                             r"vcall|"
+                             r"RTTI|"
+                             r"protobuf::internal::ExplicitlyConstructed")
+      
 def get_args():
   """Parse command line."""
+  filename_list = lambda x: x.split(";")
   parser = argparse.ArgumentParser()
-  parser.add_argument("--input", help="input library", required=True)
+  parser.add_argument("--input", type=filename_list,
+                      help="paths to input libraries separated by semicolons",
+                      required=True)
   parser.add_argument("--output", help="output deffile", required=True)
+  parser.add_argument("--target", help="name of the target", required=True)
   args = parser.parse_args()
   return args
 
@@ -70,25 +93,26 @@ def main():
   """main."""
   args = get_args()
 
-  # Pipe dumpbin to extract all linkable symbols from a lib.
+  # Pipe dumpbin to extract all linkable symbols from libs.
   # Good symbols are collected in candidates and also written to
   # a temp file.
   candidates = []
   tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
-  proc = subprocess.Popen([DUMPBIN, "/nologo", "/linkermember:1", args.input],
-                          stdout=subprocess.PIPE)
-  for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
-    cols = line.split()
-    if len(cols) < 2:
-      continue
-    sym = cols[1]
-    tmpfile.file.write(sym + "\n")
-    candidates.append(sym)
+  for lib_path in args.input:
+    proc = subprocess.Popen([DUMPBIN, "/nologo", "/linkermember:1", lib_path],
+                            stdout=subprocess.PIPE)
+    for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
+      cols = line.split()
+      if len(cols) < 2:
+        continue
+      sym = cols[1]
+      tmpfile.file.write(sym + "\n")
+      candidates.append(sym)
+    exit_code = proc.wait()
+    if exit_code != 0:
+      print("{} failed, exit={}".format(DUMPBIN, exit_code))
+      return exit_code
   tmpfile.file.close()
-  exit_code = proc.wait()
-  if exit_code != 0:
-    print("{} failed, exit={}".format(DUMPBIN, exit_code))
-    return exit_code
 
   # Run the symbols through undname to get their undecorated name
   # so we can filter on something readable.
@@ -96,9 +120,8 @@ def main():
     # track dupes
     taken = set()
 
-    # Header for the def file. Since the tensorflow.dll is actually called
-    # _pywrap_tensorflow.pyd in the python wheel, hint that in the def file.
-    def_fp.write("LIBRARY _pywrap_tensorflow_internal.pyd\n")
+    # Header for the def file.
+    def_fp.write("LIBRARY " + args.target + "\n")
     def_fp.write("EXPORTS\n")
     def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
 
@@ -119,7 +142,16 @@ def main():
         if not INCLUDE_RE.search(line):
           continue
 
-      def_fp.write("\t" + decorated + "\n")
+      if "deleting destructor" in line:
+        # Some of the symbols convered by INCLUDEPRE_RE export deleting
+        # destructor symbols, which is a bad idea.
+        # So we filter out such symbols here.
+        continue
+
+      if DATA_EXCLUDE_RE.search(line):
+        def_fp.write("\t" + decorated + "\n")
+      else:
+        def_fp.write("\t" + decorated + " DATA\n")
       taken.add(decorated)
   exit_code = proc.wait()
   if exit_code != 0:
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index 9ebf94315b017d01176530e877a4d35168ec7c8e..60c0b42a796df7c05b67751dfe3f9f76ba12c9a3 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -82,7 +82,7 @@ tf_custom_op_py_library(
 
 cuda_py_test(
     name = "cudnn_rnn_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
     additional_deps = [
         ":cudnn_rnn_py",
diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
index 6049d2afdab3e631ae91783e04472607ebc42683..86faf0cc854e94d808375b80d6e29d98711f506f 100644
--- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/env_var.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
@@ -67,7 +68,7 @@ limitations under the License.
  * TensorFlow is responsible for making sure the memory is alive long enough
  * and recycles afterwards.
  *
-*/
+ */
 namespace tensorflow {
 
 using CPUDevice = Eigen::ThreadPoolDevice;
@@ -106,6 +107,7 @@ using perftools::gputools::DeviceMemory;
 using perftools::gputools::DeviceMemoryBase;
 using perftools::gputools::ScratchAllocator;
 using perftools::gputools::port::StatusOr;
+using strings::Printf;
 
 Status ParseRNNMode(const string& str, RnnMode* rnn_mode) {
   if (str == "rnn_relu") {
@@ -203,9 +205,10 @@ DeviceMemoryBase SliceDeviceMemory(const DeviceMemoryBase& device_memory,
 }
 
 inline Status FromExecutorStatus(const perftools::gputools::port::Status& s) {
-  return s.ok() ? Status::OK() : Status(static_cast<tensorflow::error::Code>(
-                                            static_cast<int>(s.code())),
-                                        s.error_message());
+  return s.ok() ? Status::OK()
+                : Status(static_cast<tensorflow::error::Code>(
+                             static_cast<int>(s.code())),
+                         s.error_message());
 }
 
 template <typename T>
@@ -244,8 +247,7 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return perftools::gputools::port::StatusOr<
-        perftools::gputools::DeviceMemory<uint8>>(
+    return StatusOr<DeviceMemory<uint8>>(
         AsDeviceMemory<uint8>(&temporary_memory));
   }
   int64 TotalByteSize() { return total_byte_size_; }
@@ -296,6 +298,43 @@ class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
   int output_index_;
 };
 
+// A helper to allocate persistent memory for Cudnn RNN models, which is
+// expected to live between kernel invocations.
+// This class is not thread-safe.
+class CudnnRNNPersistentSpaceAllocator : public ScratchAllocator {
+ public:
+  CudnnRNNPersistentSpaceAllocator(OpKernelContext* context)
+      : context_(context) {}
+
+  virtual ~CudnnRNNPersistentSpaceAllocator() {}
+
+  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
+    return std::numeric_limits<int64>::max();
+  }
+
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(
+      perftools::gputools::Stream* stream, int64 byte_size) override {
+    if (total_byte_size_ != 0) {
+      return Status(error::FAILED_PRECONDITION,
+                    "Persistent space allocator can only be called once");
+    }
+
+    Status allocation_status = context_->allocate_persistent(
+        DT_UINT8, TensorShape({byte_size}), &handle_, nullptr);
+    if (!allocation_status.ok()) {
+      return ToExecutorStatus(allocation_status);
+    }
+    total_byte_size_ += byte_size;
+    return AsDeviceMemory<uint8>(handle_.AccessTensor(context_));
+  }
+  int64 TotalByteSize() { return total_byte_size_; }
+
+ private:
+  int64 total_byte_size_ = 0;
+  PersistentTensor handle_;
+  OpKernelContext* context_;  // not owned
+};
+
 struct CudnnModelTypes {
   RnnMode rnn_mode;
   TFRNNInputMode rnn_input_mode;
@@ -317,6 +356,16 @@ struct CudnnModelShapes {
   TensorShape input_shape;
   TensorShape output_shape;
   TensorShape hidden_state_shape;
+  // At present only fields related to cached RnnDescriptor are concerned.
+  bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
+    return num_layers == rhs.num_layers && input_size == rhs.input_size &&
+           num_units == rhs.num_units && dir_count == rhs.dir_count;
+  }
+  string RnnDescDebugString() {
+    return strings::Printf(
+        "[num_layers, input_size, num_units, dir_count]: [%d, %d, %d, %d]",
+        num_layers, input_size, num_units, dir_count);
+  }
 };
 
 // Extract and checks the forward input tensors, parameters, and shapes from the
@@ -399,11 +448,23 @@ void RestoreParams(const OpInputList params_input,
 
 }  // namespace
 
+// Note: all following kernels depend on a RnnDescriptor instance, which
+// according to Cudnn official doc should be kept around and reused across all
+// Cudnn kernels in the same model.
+// In Tensorflow, we don't pass the reference across different OpKernels,
+// rather, recreate it separately in each OpKernel, which does no cause issue:
+// CudnnDropoutDescriptor keeps a reference to a memory for
+// random number generator state. During recreation, this state is lost.
+// However, only forward-pass Cudnn APIs make use of the state.
+
 // A common base class for RNN kernels. It extracts common attributes and
 // shape validations.
 class CudnnRNNKernelCommon : public OpKernel {
  protected:
   CudnnRNNKernelCommon(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dropout", &dropout_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed", &seed_));
+    OP_REQUIRES_OK(context, context->GetAttr("seed2", &seed2_));
     string str;
     OP_REQUIRES_OK(context, context->GetAttr("rnn_mode", &str));
     OP_REQUIRES_OK(context, ParseRNNMode(str, &model_types_.rnn_mode));
@@ -413,6 +474,10 @@ class CudnnRNNKernelCommon : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("direction", &str));
     OP_REQUIRES_OK(
         context, ParseRNNDirectionMode(str, &model_types_.rnn_direction_mode));
+    // Reset CudnnRnnDescriptor and related random number generate states in
+    // every Compute() call.
+    OP_REQUIRES_OK(context, ReadBoolFromEnvVar("TF_CUDNN_RESET_RND_GEN_STATE",
+                                               false, &reset_rnd_gen_state_));
   }
 
   bool HasInputC() const { return model_types_.HasInputC(); }
@@ -422,6 +487,9 @@ class CudnnRNNKernelCommon : public OpKernel {
     return model_types_.rnn_direction_mode;
   }
   CudnnModelTypes model_types() const { return model_types_; }
+  float dropout() const { return dropout_; }
+  uint64 seed() { return (static_cast<uint64>(seed_) << 32) | seed2_; }
+  bool ResetRndGenState() { return reset_rnd_gen_state_; }
 
   template <typename T>
   Status ExtractCudnnRNNParamsInfo(OpKernelContext* context,
@@ -448,11 +516,14 @@ class CudnnRNNKernelCommon : public OpKernel {
     RnnInputMode input_mode;
     TF_RETURN_IF_ERROR(
         ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode));
+
     auto* stream = context->op_device_context()->stream();
+    // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
+    // random number generator, therefore set state_allocator to nullptr.
     auto rnn_desc_s = stream->parent()->createRnnDescriptor(
         num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
-        rnn_mode(), ToDataType<T>::value, 0.f /*dropout*/, 0 /*seed*/,
-        nullptr /*state_allocator*/);
+        rnn_mode(), ToDataType<T>::value, dropout(), seed(),
+        nullptr /* state_allocator */);
     if (!rnn_desc_s.ok()) {
       return FromExecutorStatus(rnn_desc_s);
     }
@@ -461,6 +532,11 @@ class CudnnRNNKernelCommon : public OpKernel {
   }
 
  private:
+  int seed_;
+  int seed2_;
+  float dropout_;
+  bool reset_rnd_gen_state_;
+
   CudnnModelTypes model_types_;
 };
 
@@ -560,9 +636,8 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
         context->set_output(i, input.Slice(start, end));
       } else {
         Tensor* output = nullptr;
-        OP_REQUIRES_OK(
-            context,
-            context->allocate_output(i, TensorShape({width, height}), &output));
+        OP_REQUIRES_OK(context, context->allocate_output(
+                                    i, TensorShape({width, height}), &output));
         DeviceMemoryBase data_src_ptr = SliceDeviceMemory(
             input_ptr, rnn_desc->ParamsWeightRegions()[i].offset,
             size_in_bytes);
@@ -571,14 +646,17 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
       }
     }
 
-    CHECK(num_params_ == rnn_desc->ParamsBiasRegions().size())
-        << "Number of params mismatch. Expected " << num_params_ << ", got "
-        << rnn_desc->ParamsBiasRegions().size();
+    OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(),
+                errors::InvalidArgument("Number of params mismatch. Expected ",
+                                        num_params_, ", got ",
+                                        rnn_desc->ParamsBiasRegions().size()));
     for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) {
       int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size;
       int64 size = size_in_bytes / sizeof(T);
-      CHECK(size == num_units) << "Params size mismatch. Expected " << num_units
-                               << ", got " << size;
+      OP_REQUIRES(context, size == num_units,
+                  errors::InvalidArgument("Params size mismatch. Expected ",
+                                          num_units, ", got ", size));
+
       // If data is aligned, use slice view to avoid expensive memcpy.
       bool start_aligned =
           rnn_desc->ParamsBiasRegions()[i].offset % EIGEN_MAX_ALIGN_BYTES == 0;
@@ -698,16 +776,32 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                   model_shapes.input_size, &input_mode));
-    // TODO(zhengxq): add dropout support.
     // TODO(zhengxq): cache the descriptor so we don't have to create them all
     // the time.
     auto data_type = ToDataType<T>::value;
-    auto rnn_desc_s = executor->createRnnDescriptor(
-        model_shapes.num_layers, model_shapes.num_units,
-        model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
-        data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
-    OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-    auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    {
+      mutex_lock l(mu_);
+      if (model_shapes_ == nullptr) {
+        model_shapes_.reset(new CudnnModelShapes(model_shapes));
+      } else {
+        OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
+                    errors::InvalidArgument(
+                        "Incompatible rnn model shapes inferred: expecting ",
+                        model_shapes_->RnnDescDebugString(), ", getting ",
+                        model_shapes.RnnDescDebugString(), "."));
+      }
+      if (rnn_desc_ == nullptr || ResetRndGenState()) {
+        dropout_state_allocator_.reset(
+            new CudnnRNNPersistentSpaceAllocator(context));
+        auto rnn_desc_s = executor->createRnnDescriptor(
+            model_shapes_->num_layers, model_shapes_->num_units,
+            model_shapes_->input_size, input_mode, rnn_direction_mode(),
+            rnn_mode(), data_type, dropout(), seed(),
+            dropout_state_allocator_.get());
+        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
+        rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
+      }
+    }
 
     auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
         input_shape.dim_size(0), input_shape.dim_size(1),
@@ -753,21 +847,30 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
     CudnnRNNWorkspaceAllocator workspace_allocator(context);
-    bool launch_status =
-        stream
-            ->ThenRnnForward(
-                *rnn_desc, *input_desc, input_data, *hidden_state_desc,
-                input_h_data, *hidden_state_desc, input_c_data, params_data,
-                *output_desc, &output_data, *hidden_state_desc, &output_h_data,
-                *hidden_state_desc, &output_c_data, is_training_,
-                &reserve_space_allocator, &workspace_allocator)
-            .ok();
+    bool launch_status = false;
+    {
+      mutex_lock l(mu_);
+      launch_status =
+          stream
+              ->ThenRnnForward(
+                  *rnn_desc_, *input_desc, input_data, *hidden_state_desc,
+                  input_h_data, *hidden_state_desc, input_c_data, params_data,
+                  *output_desc, &output_data, *hidden_state_desc,
+                  &output_h_data, *hidden_state_desc, &output_c_data,
+                  is_training_, &reserve_space_allocator, &workspace_allocator)
+              .ok();
+    }
     OP_REQUIRES(context, launch_status,
                 errors::Internal("Failed to call ThenRnnForward"));
   }
 
  private:
+  mutex mu_;
   bool is_training_;
+  std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
+  std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
+      GUARDED_BY(mu_);
 };
 
 REGISTER_KERNEL_BUILDER(
@@ -808,9 +911,9 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* output_h = nullptr;
     OP_REQUIRES_OK(context, context->input("output_h", &output_h));
     OP_REQUIRES(context, output_h->shape() == hidden_state_shape,
-                errors::InvalidArgument("Invalid output_h shape: ",
-                                        output_h->shape().DebugString(), " ",
-                                        hidden_state_shape.DebugString()));
+                errors::InvalidArgument(
+                    "Invalid output_h shape: ", output_h->shape().DebugString(),
+                    " ", hidden_state_shape.DebugString()));
     const Tensor* output_c = nullptr;
     if (HasInputC()) {
       // Only LSTM uses input_c and output_c. So for all other models, we only
@@ -881,15 +984,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
                                   model_shapes.input_size, &input_mode));
-    // TODO(zhengxq): add dropout support.
     // TODO(zhengxq): cache the descriptor so we don't have to create them all
     // the time.
-    auto rnn_desc_s = executor->createRnnDescriptor(
-        model_shapes.num_layers, model_shapes.num_units,
-        model_shapes.input_size, input_mode, rnn_direction_mode(), rnn_mode(),
-        data_type, 0.f /*dropout*/, 0 /*seed*/, nullptr /*state_allocator*/);
-    OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
-    auto rnn_desc = rnn_desc_s.ConsumeValueOrDie();
+    {
+      mutex_lock l(mu_);
+      if (model_shapes_ == nullptr) {
+        model_shapes_.reset(new CudnnModelShapes(model_shapes));
+      } else {
+        OP_REQUIRES(context, model_shapes_->IsCompatibleWith(model_shapes),
+                    errors::InvalidArgument(
+                        "Incompatible rnn model shapes inferred: expecting ",
+                        model_shapes_->RnnDescDebugString(), ", getting ",
+                        model_shapes.RnnDescDebugString(), "."));
+      }
+
+      if (rnn_desc_ == nullptr || ResetRndGenState()) {
+        dropout_state_allocator_.reset(
+            new CudnnRNNPersistentSpaceAllocator(context));
+        auto rnn_desc_s = executor->createRnnDescriptor(
+            model_shapes.num_layers, model_shapes.num_units,
+            model_shapes.input_size, input_mode, rnn_direction_mode(),
+            rnn_mode(), data_type, dropout(), seed(),
+            dropout_state_allocator_.get());
+        OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
+        rnn_desc_ = std::move(rnn_desc_s.ConsumeValueOrDie());
+      }
+    }
 
     auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
         input_shape.dim_size(0), input_shape.dim_size(1),
@@ -939,21 +1059,32 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
     CudnnRNNWorkspaceAllocator workspace_allocator(context);
-    bool launch_status =
-        stream
-            ->ThenRnnBackward(
-                *rnn_desc, *input_desc, input_data, *hidden_state_desc,
-                input_h_data, *hidden_state_desc, input_c_data, params_data,
-                *output_desc, output_data, *hidden_state_desc, output_h_data,
-                *hidden_state_desc, output_c_data, output_backprop_data,
-                output_h_backprop_data, output_c_backprop_data,
-                &input_backprop_data, &input_h_backprop_data,
-                &input_c_backprop_data, &params_backprop_data,
-                &reserve_space_uint8, &workspace_allocator)
-            .ok();
+    bool launch_status = false;
+    {
+      mutex_lock l(mu_);
+      launch_status =
+          stream
+              ->ThenRnnBackward(
+                  *rnn_desc_, *input_desc, input_data, *hidden_state_desc,
+                  input_h_data, *hidden_state_desc, input_c_data, params_data,
+                  *output_desc, output_data, *hidden_state_desc, output_h_data,
+                  *hidden_state_desc, output_c_data, output_backprop_data,
+                  output_h_backprop_data, output_c_backprop_data,
+                  &input_backprop_data, &input_h_backprop_data,
+                  &input_c_backprop_data, &params_backprop_data,
+                  &reserve_space_uint8, &workspace_allocator)
+              .ok();
+    }
     OP_REQUIRES(context, launch_status,
                 errors::Internal("Failed to call ThenRnnBackward"));
   }
+
+ private:
+  mutex mu_;
+  std::unique_ptr<CudnnModelShapes> model_shapes_ GUARDED_BY(mu_);
+  std::unique_ptr<RnnDescriptor> rnn_desc_ GUARDED_BY(mu_);
+  std::unique_ptr<CudnnRNNPersistentSpaceAllocator> dropout_state_allocator_
+      GUARDED_BY(mu_);
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
index 58025f7b1a5d592f54ec63f5ce36c3c7a7611c0d..2c631b064b559e19d767297e8ba5bfda06ab0880 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
@@ -35,6 +35,9 @@ input_mode: Indicate whether there is a linear projection between the input and
     input_size == num_units; otherwise, it implies 'linear_input'.
 direction: Indicates whether a bidirectional model will be used.
     dir = (direction == bidirectional) ? 2 : 1
+dropout: dropout probability. When set to 0., dropout is disabled.
+seed: the 1st part of a seed to initialize dropout.
+seed2: the 2nd part of a seed to initialize dropout.
 )doc";
 
 constexpr auto kCudnnRNNParamsBuffer = R"doc(
@@ -77,6 +80,9 @@ REGISTER_OP("CudnnRNNParamsSize")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .Output("params_size: S")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(1));
@@ -119,6 +125,7 @@ REGISTER_OP("CudnnRNN")
     .Input("input_h: T")
     .Input("input_c: T")
     .Input("params: T")
+    .SetIsStateful()
     .Output("output: T")
     .Output("output_h: T")
     .Output("output_c: T")
@@ -127,7 +134,7 @@ REGISTER_OP("CudnnRNN")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
-    .Attr("dropout: float")
+    .Attr("dropout: float = 0.0")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("is_training: bool = true")
@@ -158,7 +165,8 @@ REGISTER_OP("CudnnRNN")
 Computes the RNN from the input and initial states, with respect to the params
 buffer.
 )doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
+                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
+                         R"doc(
 is_training: Indicates whether this operation is used for inferenece or
     training.
 reserve_space: an opaque tensor that can be used in backprop calculation. It
@@ -177,6 +185,7 @@ REGISTER_OP("CudnnRNNBackprop")
     .Input("output_h_backprop: T")
     .Input("output_c_backprop: T")
     .Input("reserve_space: T")
+    .SetIsStateful()
     .Output("input_backprop: T")
     .Output("input_h_backprop: T")
     .Output("input_c_backprop: T")
@@ -185,6 +194,9 @@ REGISTER_OP("CudnnRNNBackprop")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](InferenceContext* c) {
       auto input_shape = c->input(0);
       auto input_h_shape = c->input(1);
@@ -199,7 +211,8 @@ REGISTER_OP("CudnnRNNBackprop")
     .Doc(strings::StrCat(R"doc(
 Compute the backprop of both data and weights in a RNN.
 )doc",
-                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(), R"doc(
+                         kCudnnRNNCommonAttrs, CudnnRNNForwardTensors(),
+                         R"doc(
 output_backprop: A 3-D tensor with the same shape as output in the forward pass.
 output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
     pass.
@@ -228,6 +241,9 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused));
@@ -268,6 +284,9 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
       return Status::OK();
@@ -281,7 +300,6 @@ upcoming training or inferences.
 num_params: number of parameter sets for all layers.
     Each layer may contain multiple parameter sets, with each set consisting of
     a weight matrix and a bias vector.
-)doc",
-                         kCudnnRNNCommonAttrs));
+)doc", kCudnnRNNCommonAttrs));
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index 945791578ac48bf24db721182e191a78a7643c6c..08ec3076e49696602f729772e8dc3686c281cbaa 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -38,15 +38,25 @@ from tensorflow.python.training import saver as saver_lib
 
 class CudnnRNNTest(TensorFlowTestCase):
 
-  def _CreateModel(self, rnn_mode, num_layers, num_units, input_size):
+  def _CreateModel(self,
+                   rnn_mode,
+                   num_layers,
+                   num_units,
+                   input_size,
+                   input_mode="linear_input",
+                   dropout=0.):
     if rnn_mode == "lstm":
-      model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnLSTM(
+          num_layers, num_units, input_size, dropout=dropout)
     elif rnn_mode == "gru":
-      model = cudnn_rnn_ops.CudnnGRU(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnGRU(
+          num_layers, num_units, input_size, dropout=dropout)
     elif rnn_mode == "rnn_tanh":
-      model = cudnn_rnn_ops.CudnnRNNTanh(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnRNNTanh(
+          num_layers, num_units, input_size, dropout=dropout)
     elif rnn_mode == "rnn_relu":
-      model = cudnn_rnn_ops.CudnnRNNRelu(num_layers, num_units, input_size)
+      model = cudnn_rnn_ops.CudnnRNNRelu(
+          num_layers, num_units, input_size, dropout=dropout)
     else:
       raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
     return model
@@ -58,9 +68,8 @@ class CudnnRNNTest(TensorFlowTestCase):
       params: a Variable for weight and bias parameters.
       model: a CudnnRNN model.
     """
-    params_saveable = cudnn_rnn_ops.RNNParamsSaveable(model.params_to_canonical,
-                                                      model.canonical_to_params,
-                                                      params)
+    params_saveable = cudnn_rnn_ops.RNNParamsSaveable(
+        model.params_to_canonical, model.canonical_to_params, [params])
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable)
 
   def _testSaveRestoreVariable(self, rnn_mode):
@@ -175,9 +184,12 @@ class CudnnRNNTest(TensorFlowTestCase):
         self._testOneLSTMParamsSize(num_layers, num_units, input_size)
 
   def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
-                              batch_size, seq_length, dir_count, expected,
-                              tolerance):
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
+                              batch_size, seq_length, dir_count, dropout,
+                              expected, tolerance):
+    random_seed.set_random_seed(5678)
+    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
+                              input_mode="auto_select",
+                              dropout=dropout)
     has_input_c = (rnn_mode == "lstm")
     params_size_t = model.params_size()
     input_data = array_ops.ones([seq_length, batch_size, input_size])
@@ -207,18 +219,24 @@ class CudnnRNNTest(TensorFlowTestCase):
     with self.test_session(use_gpu=True) as sess:
       sess.run(variables.global_variables_initializer())
       total_sum_v = sess.run([total_sum])
+
       self.assertAllClose(
           total_sum_v[0], expected, atol=tolerance, rtol=tolerance)
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def testSimpleInference(self):
+    # Cudnn scales result for dropout during training, therefore dropout has no
+    # impact for inference results.
+    # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
+    # demonstrative of the dropout-invariant nature of CudnnRnn.)
     test_configs = [
-        [
-            "lstm",
-            231833.22,
-            1e-2,
-            {
+        {
+            "rnn_mode": "lstm",
+            "dropout": [0., 0.5, 1.],
+            "expected": 231833.22,
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 4,
                 "num_units": 200,
                 "input_size": 200,
@@ -226,12 +244,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 10,
                 "dir_count": 1,
             },
-        ],
-        [
-            "gru",
-            56000,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "gru",
+            "dropout": [0., 0.5, 1.],
+            "expected": 56000,
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 4,
                 "num_units": 200,
                 "input_size": 200,
@@ -239,12 +258,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 10,
                 "dir_count": 1,
             },
-        ],
-        [
-            "rnn_tanh",
-            56000,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "rnn_tanh",
+            "dropout": [0., 0.5, 1.],
+            "expected": 56000,
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 4,
                 "num_units": 200,
                 "input_size": 200,
@@ -252,12 +272,13 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 10,
                 "dir_count": 1,
             },
-        ],
-        [
-            "rnn_relu",
-            130688,
-            1e-2,
-            {
+        },
+        {
+            "rnn_mode": "rnn_relu",
+            "dropout": [0., 0.5, 1.],
+            "expected": 130688,
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 8,
                 "input_size": 4,
@@ -265,24 +286,32 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 2,
                 "dir_count": 1,
             },
-        ],
+        },
     ]
     with ops.Graph().as_default():
       for config in test_configs:
-        rnn_mode = config[0]
-        expected = config[1]
-        tolerance = config[2]
-        shapes = config[3]
-        self._testOneSimpleInference(rnn_mode, shapes["num_layers"],
-                                     shapes["num_units"], shapes["input_size"],
-                                     shapes["batch_size"], shapes["seq_length"],
-                                     shapes["dir_count"], expected, tolerance)
+        rnn_mode = config["rnn_mode"]
+        dropout_list = config.get("dropout", [0.])
+        expected = config["expected"]
+        tolerance = config["tolerance"]
+        shape = config["shape"]
+        for dropout in dropout_list:
+          self._testOneSimpleInference(
+              rnn_mode, shape["num_layers"], shape["num_units"],
+              shape["input_size"], shape["batch_size"], shape["seq_length"],
+              shape["dir_count"], dropout, expected, tolerance)
 
   def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
-                             batch_size, seq_length, dir_count, tolerance):
+                             batch_size, seq_length, dir_count, dropout,
+                             tolerance):
+    # Gradient checking runs two forward ops with almost the same input. Need to
+    # make sure the drop patterns across the two runs are the same.
+    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
+    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
     has_input_c = (rnn_mode == "lstm")
     random_seed.set_random_seed(1234)
-    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size)
+    model = self._CreateModel(rnn_mode, num_layers, num_units, input_size,
+                              dropout=dropout)
     params_size_t = model.params_size()
     input_data = variables.Variable(
         random_ops.random_uniform([seq_length, batch_size, input_size]))
@@ -295,6 +324,7 @@ class CudnnRNNTest(TensorFlowTestCase):
       input_c = variables.Variable(
           random_ops.random_uniform(
               [num_layers * dir_count, batch_size, num_units]))
+
       output, output_h, output_c = model(
           input_data=input_data,
           input_h=input_h,
@@ -323,18 +353,22 @@ class CudnnRNNTest(TensorFlowTestCase):
       sess.run(variables.global_variables_initializer())
       all_inputs = [entry[0] for entry in inputs_and_shapes]
       all_shapes = [entry[1] for entry in inputs_and_shapes]
+
       err = gradient_checker.compute_gradient_error(all_inputs, all_shapes,
                                                     total_sum, [1])
+
       self.assertLess(err, tolerance)
+      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def testSimpleTraining(self):
     test_configs = [
-        [
-            "lstm",
-            1e-2,
-            {
+        {
+            "rnn_mode": "lstm",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 1e-2,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 3,
                 "input_size": 4,
@@ -342,11 +376,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 4,
                 "dir_count": 1,
             },
-        ],
-        [
-            "gru",
-            4e-3,
-            {
+        },
+        {
+            "rnn_mode": "gru",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 4e-3,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 3,
                 "input_size": 4,
@@ -354,11 +389,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 4,
                 "dir_count": 1,
             },
-        ],
-        [
-            "rnn_tanh",
-            5e-3,
-            {
+        },
+        {
+            "rnn_mode": "rnn_tanh",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 5e-3,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 3,
                 "input_size": 4,
@@ -366,11 +402,12 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 4,
                 "dir_count": 1,
             },
-        ],
-        [
-            "rnn_relu",
-            3e-1,
-            {
+        },
+        {
+            "rnn_mode": "rnn_relu",
+            "dropout": [0., 0.5, 1.],
+            "tolerance": 4e-1,
+            "shape": {
                 "num_layers": 2,
                 "num_units": 3,
                 "input_size": 4,
@@ -378,17 +415,19 @@ class CudnnRNNTest(TensorFlowTestCase):
                 "seq_length": 4,
                 "dir_count": 1,
             },
-        ],
+        },
     ]
     with ops.Graph().as_default():
       for config in test_configs:
-        rnn_mode = config[0]
-        tolerance = config[1]
-        shape = config[2]
-        self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                    shape["num_units"], shape["input_size"],
-                                    shape["batch_size"], shape["seq_length"],
-                                    shape["dir_count"], tolerance)
+        rnn_mode = config["rnn_mode"]
+        dropout_list = config.get("dropout", [0.])
+        tolerance = config["tolerance"]
+        shape = config["shape"]
+        for dropout in dropout_list:
+          self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
+                                      shape["num_units"], shape["input_size"],
+                                      shape["batch_size"], shape["seq_length"],
+                                      shape["dir_count"], dropout, tolerance)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 9ab337df15c09639dd70215666b6d1ed97cabbc0..cc0c7b082964546741c17a5cc9345330c3d8d6cc 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -23,13 +23,13 @@ from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
 
-
 _cudnn_rnn_ops_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_cudnn_rnn_ops.so"))
 
@@ -48,8 +48,8 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
   def __init__(self,
                params_to_canonical,
                canonical_to_params,
-               name="params_canonical",
-               *param_variables):
+               param_variables,
+               name="params_canonical"):
     """Creates a RNNParamsSaveable object.
 
        RNNParamsSaveable is saveable/restorable in a checkpoint file and is used
@@ -83,11 +83,11 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
           must return a scalar (e.g. in the case of cuDNN) or a tuple. This
           function could be _CudnnRNN.canonical_to_params() or a
           user-defined function.
-      name: the name of the RNNParamsSaveable object.
-      *param_variables: a list of Variables for parameters in a specific form.
+      param_variables: a list of Variables for parameters in a specific form.
           For cuDNN RNN ops, this is a single merged variable for both weights
           and biases; for other RNN ops, this might be multiple unmerged or
           partially merged variables respectively for weights and biases.
+      name: the name of the RNNParamsSaveable object.
     """
     # There is only a single merged parameter variable for cuDNN when saving.
     weights, biases = params_to_canonical(param_variables[0])
@@ -110,12 +110,12 @@ class RNNParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     if not isinstance(params, tuple):
       params = (params,)
     assign_ops = [
-        state_ops.assign(
-            variable, param, validate_shape=False)
+        state_ops.assign(variable, param, validate_shape=False)
         for variable, param in zip(self._variables, params)
     ]
     return control_flow_ops.group(*assign_ops)
 
+
 _cudnn_rnn_common_doc_string = """
   Cudnn RNN has an opaque parameter buffer that can be used for inference and
   training. But it is possible that the layout of the parameter buffers
@@ -141,7 +141,7 @@ _cudnn_rnn_common_doc_string = """
     * Once a while, the user saves the parameter buffer into model checkpoints
         with Saver.save().
     * When restoring, the user creates a RNNParamsSaveable object and uses
-      Saver.restore() to restore the paramter buffer from the canonical format
+      Saver.restore() to restore the parameter buffer from the canonical format
       to a user-defined format, as well as to restore other savable objects
       in the checkpoint file.
 """
@@ -160,11 +160,10 @@ class _CudnnRNN(object):
                num_layers,
                num_units,
                input_size,
-               input_mode="auto_select",
+               input_mode="linear_input",
                direction="unidirectional",
                dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
     """Creates a CudnnRNN model from model spec.
 
     Args:
@@ -175,16 +174,18 @@ class _CudnnRNN(object):
       input_size: the size of the input, it could be different from the
           num_units.
       input_mode: indicate whether there is a linear projection between the
-          input and The actual computation before the first layer. It could be
-          'skip_input', 'linear_input' or 'auto_select'.
+          input and the actual computation before the first layer. It could be
+          'linear_input', 'skip_input' or 'auto_select'.
+          'linear_input' (default) always applies a linear projection of input
+          onto RNN hidden state. (standard RNN behavior).
           'skip_input' is only allowed when input_size == num_units;
           'auto_select' implies 'skip_input' when input_size == num_units;
           otherwise, it implies 'linear_input'.
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
+          for behavior.
     """
     self._num_layers = num_layers
     self._num_units = num_units
@@ -193,8 +194,10 @@ class _CudnnRNN(object):
     self._input_mode = input_mode
     self._direction = direction
     self._dropout = dropout
-    self._seed = seed
-    self._seed2 = seed2
+    # get graph and op seed.
+    self._seed, self._seed2 = random_seed.get_seed(seed)
+    if self._seed is None and self._seed2 is None:
+      self._seed, self._seed2 = 0, 0
 
   def params_size(self):
     """Calculates the size of the opaque parameter buffer needed for this model.
@@ -208,6 +211,9 @@ class _CudnnRNN(object):
         input_size=self._input_size,
         T=dtypes.float32,
         S=dtypes.int32,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
         rnn_mode=self._rnn_mode,
         input_mode=self._input_mode,
         direction=self._direction)[0]
@@ -258,6 +264,9 @@ class _CudnnRNN(object):
         num_units=self._num_units,
         input_size=self._input_size,
         params=params,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
         num_params=self._num_layers * self._NUM_PARAMS_PER_LAYER,
         rnn_mode=self._rnn_mode,
         input_mode=self._input_mode,
@@ -280,6 +289,9 @@ class _CudnnRNN(object):
         input_size=self._input_size,
         weights=weights,
         biases=biases,
+        dropout=self._dropout,
+        seed=self._seed,
+        seed2=self._seed2,
         rnn_mode=self._rnn_mode,
         input_mode=self._input_mode,
         direction=self._direction)
@@ -299,8 +311,7 @@ class CudnnLSTM(_CudnnRNN):
                input_mode="auto_select",
                direction="unidirectional",
                dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
     """Creates a Cudnn LSTM model from model spec.
 
     Args:
@@ -317,8 +328,7 @@ class CudnnLSTM(_CudnnRNN):
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the seed used for initializing dropout.
     """
     super(CudnnLSTM, self).__init__(
         "lstm",
@@ -328,8 +338,7 @@ class CudnnLSTM(_CudnnRNN):
         input_mode=input_mode,
         direction=direction,
         dropout=dropout,
-        seed=seed,
-        seed2=seed2)
+        seed=seed)
 
   def __call__(self, input_data, input_h, input_c, params, is_training=True):
     """Runs the forward step for the Cudnn LSTM model.
@@ -346,11 +355,8 @@ class CudnnLSTM(_CudnnRNN):
       output_h: the final state for h.
       output_c: the final state for c.
     """
-    output, output_h, output_c = super(CudnnLSTM, self).__call__(input_data,
-                                                                 input_h,
-                                                                 input_c,
-                                                                 params,
-                                                                 is_training)
+    output, output_h, output_c = super(CudnnLSTM, self).__call__(
+        input_data, input_h, input_c, params, is_training=is_training)
     return (output, output_h, output_c)
 
 
@@ -365,8 +371,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
                input_mode="auto_select",
                direction="unidirectional",
                dropout=0.,
-               seed=0,
-               seed2=0):
+               seed=0):
     """Creates a Cudnn RNN model from model without hidden-state C.
 
     Args:
@@ -383,8 +388,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
       direction: the direction model that the model operates. Could be either
           'unidirectional' or 'bidirectional'
       dropout: whether to enable dropout. With it is 0, dropout is disabled.
-      seed: the first part of a seed that is used to initialize dropout.
-      seed2: the second part of a seed that is used to initialize dropout.
+      seed: the seed used for initializing dropout.
     """
     super(_CudnnRNNNoInputC, self).__init__(
         self._rnn_mode,
@@ -394,8 +398,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
         input_mode=input_mode,
         direction=direction,
         dropout=dropout,
-        seed=seed,
-        seed2=seed2)
+        seed=seed)
 
   def __call__(self, input_data, input_h, params, is_training=True):
     """Runs the forward step for the Cudnn LSTM model.
@@ -411,7 +414,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
       output_h: the final state for h.
     """
     output, output_h, _ = super(_CudnnRNNNoInputC, self).__call__(
-        input_data, input_h, None, params, is_training=True)
+        input_data, input_h, None, params, is_training=is_training)
     return (output, output_h)
 
 
@@ -459,6 +462,9 @@ def _cudnn_rnn_backward(op, *grad):
       output_h_backprop=grad[1],
       output_c_backprop=grad[2],
       reserve_space=op.outputs[3],
+      dropout=op.get_attr("dropout"),
+      seed=op.get_attr("seed"),
+      seed2=op.get_attr("seed2"),
       rnn_mode=op.get_attr("rnn_mode"),
       input_mode=op.get_attr("input_mode"),
       direction=op.get_attr("direction"))
diff --git a/tensorflow/tensorboard/app/BUILD b/tensorflow/contrib/data/BUILD
similarity index 61%
rename from tensorflow/tensorboard/app/BUILD
rename to tensorflow/contrib/data/BUILD
index 9afcd23e9e72a010b71bd9c39a1b52f3d3542a4b..7b916d82c1c02ffab1841e630d12a66359bd2a97 100644
--- a/tensorflow/tensorboard/app/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -1,11 +1,19 @@
-# Description:
-# Build rules for building the HTML/JS necessary for TensorBoard.
 package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+py_library(
+    name = "data",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/data/README.md b/tensorflow/contrib/data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4d14fdc39a679f38e3013027286a192a4e54d13
--- /dev/null
+++ b/tensorflow/contrib/data/README.md
@@ -0,0 +1,630 @@
+# Using the `Dataset` API for TensorFlow Input Pipelines
+
+The `Dataset` API is designed to let you build complex input pipelines from
+simple, reusable pieces. For example, the pipeline for an image model might
+aggregate data from files in a distributed file system, apply random
+perturbations to each image, and merge randomly selected images into a batch
+for training. The pipeline for a text model might involve extracting symbols
+from raw text data, converting them to embedding identifiers with a lookup
+table, and batching together sequences of different lengths. The `Dataset` API
+makes it easy to deal with large amounts of data, different data formats, and
+complicated transformations.
+
+The `Dataset` API introduces two new abstractions to TensorFlow:
+
+* A `tf.contrib.data.Dataset` represents a sequence of elements, in which
+  each element contains one or more `Tensor` objects. For example, in an image
+  pipeline, an element might be a single training example, with a pair of
+  tensors representing the image data and a label. A `Dataset` can either be a
+  *source* (e.g. `Dataset.from_tensor_slices()` constructs a dataset from one
+  or more `tf.Tensor` objects), or a *transformation* (e.g. `Dataset.batch()`
+  constructs a dataset by stacking consecutive elements of another dataset into
+  a single element).
+
+* A `tf.contrib.data.Iterator` provides the main way to extract elements from a
+  dataset. The `Iterator.get_next()` operation yields the next element of a
+  `Dataset`, and typically acts as the interface between input pipeline code and
+  your model. The simplest iterator is a "one-shot iterator", which is
+  associated with a particular `Dataset` and iterates through it once. For more
+  sophisticated uses, the `Iterator.initializer` operation enables you to
+  reinitialize and parameterize an iterator with different datasets, so that
+  you can, for example, iterate over training and validation data multiple times
+  in the same program.
+
+## Tutorial
+
+This programmers' guide includes step-by-step instructions for a variety of
+input data use cases. Also see the `Dataset` and `Iterator` class references
+for more detailed information about the API.
+
+### Basic mechanics
+
+This section of the guide describes the fundamentals of creating different kinds
+of `Dataset` and `Iterator` objects, and how to extract data from them.
+
+#### Defining a source dataset
+
+You can build a `Dataset` using one of the following *source* dataset
+constructors:
+
+* From in-memory data:
+  * `tf.contrib.data.Dataset.from_tensors()`
+  * `tf.contrib.data.Dataset.from_tensor_slices()`
+
+* From on-disk data:
+  * `tf.contrib.data.FixedLengthRecordDataset()`
+  * `tf.contrib.data.TextLineDataset()`
+  * `tf.contrib.data.TFRecordDataset()`
+
+* From parameters:
+  * `tf.contrib.data.Dataset.range()`
+
+#### Transforming a dataset
+
+The `tf.contrib.data.Dataset` class has many methods that can be chained
+together to *transform* one dataset into another:
+
+* Per-element transformations:
+  * `Dataset.filter()`
+  * `Dataset.flat_map()`
+  * `Dataset.map()`
+  * `Dataset.zip()`
+
+* Multi-element transformations:
+  * `Dataset.batch()`
+  * `Dataset.dense_to_sparse_batch()`
+  * `Dataset.group_by_window()`
+  * `Dataset.padded_batch()`
+  * `Dataset.repeat()`
+  * `Dataset.shuffle()`
+  * `Dataset.skip()`
+  * `Dataset.take()`
+
+The following sections contain examples of how to use these transformations to
+solve common problems.
+
+#### Dataset structure
+
+A dataset comprises elements that each have the same structure. An element
+contains one or more `tf.Tensor` objects, called *components*. Each component
+has a `tf.DType` representing the type of elements in the tensor, and a
+`tf.TensorShape` representing the (possibly partially specified) static shape of
+each element. The `Dataset.output_types` and `Dataset.output_shapes` properties
+allow you to inspect the inferred types and shapes of each component of a
+dataset element. The *nested structure* of these properties map to the structure
+of an element, which may be a single tensor, a tuple of tensors, or a nested
+tuple of tensors. For example:
+
+```python
+dataset1 = tf.contrib.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
+print(dataset1.output_types)  # ==> "tf.float32"
+print(dataset1.output_shapes)  # ==> "(10,)"
+
+dataset2 = tf.contrib.data.Dataset.from_tensor_slices(
+   (tf.random_uniform([4]), tf.random_uniform([4, 100], dtype=tf.int32)))
+print(dataset2.output_types)  # ==> "(tf.float32, tf.int32)"
+print(dataset2.output_shapes)  # ==> "((), (100,))"
+
+dataset3 = tf.contrib.data.Dataset.zip((dataset1, dataset2))
+print(dataset3.output_types)  # ==> (tf.float32, (tf.float32, tf.int32))
+print(dataset3.output_shapes)  # ==> "((), (100,))"
+```
+
+The `Dataset` transformations support datasets of any structure. When using the
+`Dataset.map()`, `Dataset.flat_map()` and `Dataset.filter()` transformations,
+which apply a function to each element, the element structure determines the
+arguments of the function:
+
+```python
+dataset1 = dataset1.map(lambda x: ...)
+
+dataset2 = dataset2.flat_map(lambda x, y: ...)
+
+# *N.B.* Lambda argument destructuring is not available in Python 3.
+dataset3 = dataset3.filter(lambda x, (y, z): ...)
+```
+
+#### Creating an iterator
+
+One you have built a `Dataset` to represent your input data, the next step is to
+create an `Iterator` to access elements from that dataset.  The `Dataset` API
+currently supports three kinds of iterator, in increasing level of
+sophistication:
+
+A *one-shot* iterator is the simplest form of iterator, which only supports
+iterating once through a dataset, with no need for explicit initialization.
+One-shot iterators handle almost all of the cases that the existing queue-based
+input pipelines support, but they do not support parameterization. Using the
+example of `Dataset.range()`:
+
+```python
+dataset = tf.contrib.data.Dataset.range(100)
+iterator = dataset.make_one_shot_iterator()
+next_element = iterator.get_next()
+
+for i in range(100):
+  value = sess.run(next_element)
+  assert i == value
+```
+
+An *initializable* iterator requires you to run an explicit
+`iterator.initializer` operation before using it. In exchange for this
+inconvenience, it enables you to *parameterize* the definition of the dataset,
+using one or more `tf.placeholder()` tensors that can be fed when you
+initialize the iterator. Continuing the `Dataset.range()` example:
+
+```python
+max_value = tf.placeholder(tf.int64, shape=[])
+dataset = tf.contrib.data.Dataset.range(max_value)
+iterator = dataset.make_initializable_iterator()
+next_element = iterator.get_next()
+
+# Initialize an iterator over a dataset with 10 elements.
+sess.run(iterator.initializer, feed_dict={max_value: 10})
+for i in range(10):
+  value = sess.run(next_element)
+  assert i == value
+
+# Initialize the same iterator over a dataset with 100 elements.
+sess.run(iterator.initializer, feed_dict={max_value: 100})
+for i in range(100):
+  value = sess.run(next_element)
+  assert i == value
+```
+
+A *reinitializable* iterator can be initialized from multiple different
+`Dataset` objects. For example, you might have a training input pipeline that
+uses random perturbations to the input images to improve generalization, and
+a validation input pipeline that evaluates predictions on unmodified data. These
+pipelines will typically use different `Dataset` objects that have the same
+structure (i.e. the same types and compatible shapes for each component). 
+
+```python
+training_dataset = tf.contrib.data.Dataset.range(100).map(
+    lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
+validation_dataset = tf.contrib.data.Dataset.range(50)
+
+# A reinitializable iterator is defined by its structure. We could use the
+# `output_types` and `output_shapes` properties of either `training_dataset`
+# or `validation_dataset` here, because they are compatible.
+iterator = Iterator.from_structure(training_dataset.output_types,
+                                   training_dataset.output_shapes)
+next_element = iterator.get_next()
+
+training_init_op = iterator.make_initializer(training_dataset)
+validation_init_op = iterator.make_initializer(validation_dataset)
+
+# Run 20 epochs in which the training dataset is traversed, followed by the
+# validation dataset.
+for _ in range(20):
+  # Initialize an iterator over the training dataset.
+  sess.run(training_init_op)
+  for _ in range(100):
+    sess.run(next_element)
+
+  # Initialize an iterator over the validation dataset.
+  sess.run(validation_init_op)
+  for _ in range(50):
+    sess.run(next_element)
+```
+
+#### Consuming values from an iterator
+
+The `Iterator.get_next()` method returns one or more `tf.Tensor` objects that
+correspond to the symbolic next element of an iterator. Each time these tensors
+are evaluated, they take the value of the next element in the underlying
+dataset. (Note that, like other stateful objects in TensorFlow, calling
+`Iterator.get_next()` does not immediately advance the iterator. Instead you
+must use the returned `tf.Tensor` objects in a TensorFlow expression, and pass
+the result of that expression to `tf.Session.run()` to get the next elements and
+advance the iterator.)
+
+If the iterator reaches the end of the dataset, executing
+the `Iterator.get_next()` operation will raise a `tf.errors.OutOfRangeError`.
+After this point the iterator will be in an unusable state, and you must
+initialize it again if you want to use it further.
+
+```python
+dataset = tf.contrib.data.Dataset.range(5)
+iterator = dataset.make_initializable_iterator()
+next_element = iterator.get_next()
+
+# Typically `result` will be the output of a model, or an optimizer's
+# training operation.
+result = tf.add(next_element, next_element)
+
+sess.run(iterator.initializer)
+print(sess.run(result))  # ==> "0"
+print(sess.run(result))  # ==> "2"
+print(sess.run(result))  # ==> "4"
+print(sess.run(result))  # ==> "6"
+print(sess.run(result))  # ==> "8"
+try:
+  sess.run(result)
+except tf.errors.OutOfRangeError:
+  print("End of dataset")  # ==> "End of dataset"
+```
+
+A common pattern is to wrap the "training loop" in a `try`-`except` block:
+
+```python
+sess.run(iterator.initializer)
+while True:
+  try:
+    sess.run(result)
+  except tf.errors.OutOfRangeError:
+    break
+```
+
+If each element of the dataset has a nested structure, the return value of
+`Iterator.get_next()` will be one or more `tf.Tensor` objects in the same
+nested structure:
+
+```python
+dataset1 = tf.contrib.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
+dataset2 = tf.contrib.data.Dataset.from_tensor_slices((tf.random_uniform([4]), tf.random_uniform([4, 100])))
+dataset3 = tf.contrib.data.Dataset.zip((dataset1, dataset2))
+
+iterator = dataset3.make_initializable_iterator()
+
+sess.run(iterator.initializer)
+next1, (next2, next3) = iterator.get_next()
+```
+
+Note that evaluating *any* of `next1`, `next2`, or `next3` will advance the
+iterator for all components. A typical consumer of an iterator will include all
+components in a single expression.
+
+### Reading input data
+
+#### Consuming NumPy arrays
+
+If all of your input data fit in memory, the simplest way to create a `Dataset`
+from them is to convert them to `tf.Tensor` objects and use
+`Dataset.from_tensor_slices()`.
+
+```python
+# Load the training data into two NumPy arrays, for example using `np.load()`.
+with np.load("/var/data/training_data.npy") as data:
+  features = data["features"]
+  labels = data["labels"]
+
+# Assume that each row of `features` corresponds to the same row as `labels`.
+assert features.shape[0] == labels.shape[0]
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices((features, labels))
+```
+
+Note that the above code snippet will embed the `features` and `labels` arrays
+in your TensorFlow graph as constants. This works well for a small dataset, but
+wastes memory, and can run into the 2GB limit for the `tf.GraphDef` protocol
+buffer.
+
+As an alternative, you can define the `Dataset` in terms of `tf.placeholder()`
+tensors, and *feed* the NumPy arrays when you initialize an `Iterator` over the
+dataset.
+
+```python
+# Load the training data into two NumPy arrays, for example using `np.load()`.
+with np.load("/var/data/training_data.npy") as data:
+  features = data["features"]
+  labels = data["labels"]
+
+# Assume that each row of `features` corresponds to the same row as `labels`.
+assert features.shape[0] == labels.shape[0]
+
+features_placeholder = tf.placeholder(features.dtype, features.shape)
+labels_placeholder = tf.placeholder(labels.dtype, labels.shape)
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
+# [Other transformations on `dataset`...]
+dataset = ...
+iterator = dataset.make_initializable_iterator()
+
+sess.run(iterator.initializer, feed_dict={features_placeholder: features,
+                                          labels_placeholder: labels})
+```
+
+#### Consuming TFRecord data
+
+The `Dataset` API supports a variety of file formats so that you can process
+large datasets that do not fit in memory. The TFRecord file format is a
+simple record-oriented binary format that many TensorFlow applications use for
+training data. The `tf.contrib.data.TFRecordDataset` class enables you to
+stream over the contents of one or more TFRecord files as part of an input
+pipeline.
+
+```python
+# Creates a dataset that reads all of the examples from two files.
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+```
+
+The `filenames` argument to the `TFRecordDataset` initializer can be a
+`tf.Tensor` of strings. Therefore if you have two sets of files for training
+and validation purposes, you can use a `tf.placeholder(tf.string)` to represent
+the filenames, and initialize an iterator from the appropriate filenames:
+
+```python
+filenames = tf.placeholder(tf.string, shape=[None])
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+# [Other transformations on `dataset`...]
+dataset = ...
+iterator = dataset.make_initializable_iterator()
+
+# You can feed the initializer with the appropriate filenames for the current
+# phase of execution, e.g. training vs. validation.
+
+# Initialize `iterator` with training data.
+training_filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+sess.run(iterator.initializer, feed_dict={filenames: training_filenames})
+
+# Initialize `iterator` with validation data.
+validation_filenames = ["/var/data/validation1.tfrecord", ...]
+sess.run(iterator.initializer, feed_dict={filenames: validation_filenames})
+```
+
+#### Consuming text data
+
+Many datasets are distributed as one or more text files. The
+`tf.contrib.data.TextLineDataset` provides an easy way to extract lines from
+one or more text files. Given one or more filenames, a `TextLineDataset` will
+produce one string-valued element per line of those files. Like a
+`TFRecordDataset`, `TextLineDataset` accepts `filenames` as a `tf.Tensor`, so
+you can parameterize it by passing a `tf.placeholder(tf.string)`.
+
+```python
+filenames = ["/var/data/file1.txt", "/var/data/file2.txt"]
+dataset = tf.contrib.data.TextLineDataset(filenames)
+```
+
+By default, a `TextLineDataset` yields *every* line of each file, which may
+not be desirable, for example if the file starts with a header line, or contains
+comments. These lines can be removed using the `Dataset.skip()` and
+`Dataset.filter()` transformations. To apply these transformations to each
+file separately, we use `Dataset.flat_map()` to create a nested `Dataset` for
+each file.
+
+```python
+filenames = ["/var/data/file1.txt", "/var/data/file2.txt"]
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)
+
+# Use `Dataset.flat_map()` to transform each file separately.
+# * Skip the first line (header row).
+# * Filter out lines beginning with "#" (comments).
+dataset = dataset.flat_map(
+    lambda filename: (
+        tf.contrib.data.Dataset.TextLineDataset(filename)
+        .skip(1)
+        .filter(lambda line: tf.not_equal(tf.substr(line, 0, 1), "#"))))
+```
+
+<!--
+TODO(mrry): Add these sections.
+
+#### Consuming from a Python generator
+#### Consuming from an index file and images
+-->
+
+### Preprocessing data with `Dataset.map()`
+
+The `Dataset.map(f)` transformation produces a new dataset by applying a given
+function `f` to each element of the input dataset. It is based on
+the
+[`map()` function](https://en.wikipedia.org/wiki/Map_(higher-order_function))
+that is commonly applied to lists (and other structures) in functional
+programming languages.  The function `f` takes the `tf.Tensor` objects that
+represent a single element in the input, and returns the `tf.Tensor` objects
+that will represent a single element in the new dataset. Its implementation uses
+standard TensorFlow operations to transform one element into another.
+
+This section covers common examples of how to use `Dataset.map()`.
+
+#### Parsing `tf.Example` protocol buffer messages
+
+Many input pipelines extract `tf.train.Example` protocol buffer messages from a
+TFRecord-format file (written, for example, using
+`tf.python_io.TFRecordWriter`). Each `tf.train.Example` record contains one or
+more "features", and the input pipeline typically converts these features into
+tensors.
+
+```python
+# Transforms a scalar string `example_proto` into a pair of a scalar string and
+# a scalar integer, representing an image and its label, respectively.
+def _parse_function(example_proto):
+  features = {"image": tf.FixedLenFeature((), tf.string, default_value=""),
+              "label": tf.FixedLenFeature((), tf.int32, default_value=0)}
+  parsed_features = tf.parse_single_example(example_proto, features)
+  return parsed_features["image"], parsed_features["label"]
+
+# Creates a dataset that reads all of the examples from two files, and extracts
+# the image and label features.
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(_parse_function)
+```
+
+#### Decoding image data and resizing it
+
+When training a neural network on real-world image data, it is often necessary
+to convert images of different sizes to a common size, so that they may be
+batched into a fixed size.
+
+```python
+# Reads an image from a file, decodes it into a dense tensor, and resizes it
+# to a fixed shape.
+def _parse_function(filename, label):
+  image_string = tf.read_file(filename)
+  image_decoded = tf.image.decode_image(image_string)
+  image_resized = tf.image.resize_images(image_decoded, [28, 28])
+  return image_resized, label
+
+filenames = tf.constant(["/var/data/image1.jpg", "/var/data/image2.jpg", ...])
+labels = tf.constant([0, 37, 29, 1, ...])
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices((filenames, labels))
+dataset = dataset.map(_parse_function)
+```
+
+#### Applying arbitrary Python logic with `tf.py_func()`
+
+For performance reasons, we encourage you to use TensorFlow operations for
+preprocessing your data whenever possible. However, it is sometimes useful to
+be able to call upon external Python libraries when parsing your input data,
+and you can do this by invoking the `tf.py_func()` operation in a
+`Dataset.map()` transformation.
+
+```python
+import cv2
+
+# Use a custom OpenCV function to read the image, instead of the standard
+# TensorFlow `tf.read_file()` operation.
+def _read_py_function(filename, label):
+  image_decoded = cv2.imread(image_string, cv2.IMREAD_GRAYSCALE)
+  return image_decoded, label
+
+# Use standard TensorFlow operations to resize the image to a fixed shape.
+def _resize_function(image_decoded, label):
+  image_decoded.set_shape([None, None, None])
+  image_resized = tf.image.resize_images(image_decoded, [28, 28])
+  return image_resized, label
+
+filenames = ["/var/data/image1.jpg", "/var/data/image2.jpg", ...]
+labels = [0, 37, 29, 1, ...]
+
+dataset = tf.contrib.data.Dataset.from_tensor_slices((filenames, labels))
+dataset = dataset.map(
+    lambda filename, label: tf.py_func(
+        _read_py_function, [filename, label], [tf.uint8, label.dtype]))
+dataset = dataset.map(_resize_function)
+```
+
+<!--
+TODO(mrry): Add this section.
+
+#### Handling text data with unusual sizes
+-->
+
+### Batching dataset elements
+
+#### Simple batching
+
+The simplest form of batching stacks `n` consecutive elements of a dataset into
+a single element. The `Dataset.batch()` transformation does exactly this, with
+the same constraints as the `tf.stack()` operator, applied to each component
+of the elements: i.e. for each component *i*, all elements must have a tensor
+of the exact same shape.
+
+```python
+inc_dataset = tf.contrib.data.Dataset.range(100)
+dec_dataset = tf.contrib.data.Dataset.range(0, -100, -1)
+dataset = tf.contrib.data.Dataset.zip((inc_dataset, dec_dataset))
+batched_dataset = dataset.batch(4)
+
+iterator = batched_dataset.make_one_shot_iterator()
+next_element = iterator.get_next()
+
+print(sess.run(next_element))  # ==> ([0, 1, 2,   3],   [ 0, -1,  -2,  -3])
+print(sess.run(next_element))  # ==> ([4, 5, 6,   7],   [-4, -5,  -6,  -7])
+print(sess.run(next_element))  # ==> ([8, 9, 10, 11],   [-8, -9, -10, -11])
+```
+
+#### Batching tensors with padding
+
+The above recipe works for tensors that all have the same size. However, many
+models (e.g. sequence models) work with input data that can have varying size
+(e.g. sequences of different lengths). To handle this case, the
+`Dataset.padded_batch()` transformation enables you to batch tensors of
+different shape by specifying one or more dimensions in which they may be
+padded.
+
+```python
+dataset = tf.contrib.data.Dataset.range(100)
+dataset = dataset.map(lambda x: tf.fill([tf.cast(x, tf.int32)], x))
+dataset = dataset.padded_batch(4, padded_shapes=[None])
+
+iterator = batched_dataset.make_one_shot_iterator()
+next_element = iterator.get_next()
+
+print(sess.run(next_element))  # ==> [[0, 0, 0], [1, 0, 0], [2, 2, 0], [3, 3, 3]]
+print(sess.run(next_element))  # ==> [[4, 4, 4, 4, 0, 0, 0],
+                               #      [5, 5, 5, 5, 5, 0, 0],
+                               #      [6, 6, 6, 6, 6, 6, 0],
+                               #      [7, 7, 7, 7, 7, 7, 7]]
+```
+
+The `Dataset.padded_batch()` transformation allows you to set different padding
+for each dimension of each component, and it may be variable-length (signified
+by `None` in the example above) or constant-length. It is also possible to
+override the padding value, which defaults to 0.
+
+<!--
+TODO(mrry): Add this section.
+
+#### Dense ragged -> tf.SparseTensor
+-->
+
+### Training workflows
+
+#### Processing multiple epochs
+
+The `Dataset` API offers two main ways to process multiple epochs of the same
+data.
+
+The simplest way to iterate over a dataset in multiple epochs is to use the
+`Dataset.repeat()` transformation. For example, to create a dataset that repeats
+its input for 10 epochs:
+
+```python
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(...)
+dataset = dataset.repeat(10)
+dataset = dataset.batch(32)
+```
+
+Applying the `Dataset.repeat()` transformation with no arguments will repeat
+the input indefinitely. The `Dataset.repeat()` transformation concatenates its
+arguments without signaling the end of one epoch and the beginning of the next
+epoch.
+
+If you want to receive a signal at the end of each epoch, you can write a
+training loop that catches the `tf.errors.OutOfRangeError` at the end of a
+dataset. At that point you might collect some statistics (e.g. the validation
+error) for the epoch.
+
+```python
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(...)
+dataset = dataset.batch(32)
+iterator = dataset.make_initializable_iterator()
+next_element = iterator.get_next()
+
+# Compute for 100 epochs.
+for _ in range(100):
+  sess.run(iterator.initializer)
+  while True:
+    try:
+      sess.run(next_element)
+    except tf.errors.OutOfRangeError:
+      break
+
+  # [Perform end-of-epoch calculations here.]
+```
+
+#### Randomly shuffling input data
+
+The `Dataset.shuffle()` transformation randomly shuffles the input dataset
+using a similar algorithm to `tf.RandomShuffleQueue`: it maintains a fixed-size
+buffer and chooses the next element uniformly at random from that buffer.
+
+```python
+filenames = ["/var/data/file1.tfrecord", "/var/data/file2.tfrecord"]
+dataset = tf.contrib.data.TFRecordDataset(filenames)
+dataset = dataset.map(...)
+dataset = dataset.repeat()
+dataset = dataset.shuffle(buffer_size=10000)
+dataset = dataset.batch(32)
+```
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5308ab64ace297d89dacddcf17921f428cd72b1d
--- /dev/null
+++ b/tensorflow/contrib/data/__init__.py
@@ -0,0 +1,42 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`tf.contrib.data.Dataset` API for input pipelines.
+
+@@Dataset
+@@Iterator
+@@TFRecordDataset
+@@FixedLengthRecordDataset
+@@TextLineDataset
+
+@@read_batch_features
+@@rejection_resample
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops.dataset_ops import Dataset
+from tensorflow.contrib.data.python.ops.dataset_ops import FixedLengthRecordDataset
+from tensorflow.contrib.data.python.ops.dataset_ops import Iterator
+from tensorflow.contrib.data.python.ops.dataset_ops import read_batch_features
+from tensorflow.contrib.data.python.ops.dataset_ops import rejection_resample
+from tensorflow.contrib.data.python.ops.dataset_ops import TextLineDataset
+from tensorflow.contrib.data.python.ops.dataset_ops import TFRecordDataset
+# pylint: enable=unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/data/python/framework/BUILD b/tensorflow/contrib/data/python/framework/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7b84825bb4fd8a07b63150122f84b4faab73a893
--- /dev/null
+++ b/tensorflow/contrib/data/python/framework/BUILD
@@ -0,0 +1,41 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "function",
+    srcs = ["function.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework",
+    ],
+)
+
+py_test(
+    name = "function_test",
+    size = "medium",
+    srcs = ["function_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":function",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/data/python/framework/function.py b/tensorflow/contrib/data/python/framework/function.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aa44a736adeaa672352de31401dbbe7eff24b3e
--- /dev/null
+++ b/tensorflow/contrib/data/python/framework/function.py
@@ -0,0 +1,267 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An experimental fork of the Python TensorFlow-function library.
+
+NOTE: functions are currently experimental and subject to change!
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.util import tf_inspect
+
+# NOTE(mrry): This is an experimental extension of a core class that wasn't
+# designed to be extended, so we disable protected access checks for the
+# whole file.
+# pylint: disable=protected-access
+
+
+class _ExperimentalFuncGraph(function._FuncGraph):
+  """A helper for construction a function (supporting capture-by-value).
+
+  _ExperimentalFuncGraph overrides ops.Graph's create_op() so that we can keep
+  track of every inputs into every op created inside the function.  If
+  any input is from other graphs, we keep track of it in self.capture
+  and substitute the input with a place holder.
+
+  Each captured input's corresponding place holder is converted into a
+  function argument and the caller passes in the captured tensor.
+  """
+
+  def __init__(self, capture_by_value, *args, **kwargs):
+    super(_ExperimentalFuncGraph, self).__init__(*args, **kwargs)
+    self._capture_by_value = capture_by_value
+    self._building_function = True
+    self._outer_graph = ops.get_default_graph()
+    self._vscope = vs.get_variable_scope()
+    self._old_custom_getter = self._vscope.custom_getter
+    self._captured = {}
+    self.extra_inputs = []
+    self.extra_args = []
+    self.extra_vars = []
+
+  def create_op(self, op_type, inputs, data_types, **kwargs):
+    for i, x in enumerate(inputs):
+      if x.graph is not self:
+        # Referring to a tensor from other graph.
+        if x in self._captured:
+          # Captured already.
+          inputs[i] = self._captured[x]
+        elif self._capture_by_value:
+          inputs[i] = self._add_tensor_and_parents(x)
+        else:
+          # Substitute with a placeholder.
+          self.extra_inputs.append(x)
+          ph = array_ops.placeholder(x.dtype, shape=x.get_shape())
+          # pylint: disable=protected-access
+          ph._handle_shape = x._handle_shape
+          ph._handle_dtype = x._handle_dtype
+          # pylint: enable=protected-access
+          inputs[i] = ph
+          self._captured[x] = ph
+          self.extra_args.append(ph)
+    return super(_ExperimentalFuncGraph, self).create_op(op_type, inputs,
+                                                         data_types, **kwargs)
+
+  def _add_tensor_and_parents(self, tensor):
+    op = self._add_op_and_parents(tensor.op)
+    return op.outputs[tensor.value_index]
+
+  def _add_op_and_parents(self, op):
+    op_def = function._get_op_def(op)
+    if op_def.is_stateful:
+      raise ValueError("Cannot capture a stateful node by value.")
+    elif op.type in ("Placeholder", "PlaceholderV2"):
+      raise ValueError("Cannot capture a placeholder by value.")
+
+    captured_inputs = [self._add_tensor_and_parents(x) for x in op.inputs]
+
+    captured_op = self.create_op(op.type, captured_inputs,
+                                 [o.dtype for o in op.outputs],
+                                 name=op.name, attrs=op.node_def.attr,
+                                 op_def=op_def)
+
+    for t, captured_t in zip(op.outputs, captured_op.outputs):
+      self._captured[t] = captured_t
+
+    return captured_op
+
+
+class _ExperimentalDefinedFunction(function._DefinedFunction):
+  """Overrides _DefinedFunction with support for capture-by-value."""
+
+  def __init__(self,
+               func,
+               argnames,
+               input_types,
+               func_name=None,
+               grad_func=None,
+               python_grad_func=None,
+               out_names=None,
+               shape_func=None,
+               capture_by_value=False,
+               **kwargs):
+    """Creates an _ExperimentalDefinedFunction.
+
+    Args:
+      func:  A python callable which constructs a tf function body.
+      argnames: A list of strings for function argument names.
+      input_types: The function's argument types. Can be a tuple, list of
+        tf data types.
+      func_name: The function name. Defaults to None, in which derives from
+        'func'.
+      grad_func: This function's gradient function, if not None. Defaults
+        to None.
+      python_grad_func: A python callable implementing the gradient of
+        the function python-side.
+      out_names: An optional list of strings for the function return value
+        names.
+      shape_func: An optional function mapping an op to a list of static
+        output shapes.
+      capture_by_value: Boolean (defaults to False). If True, captured values
+        will be copied into the function body.
+      **kwargs: The keyword arguments. **kwargs is passed to every call
+        site of this function.
+
+    Raises:
+      ValueError: The function definition is invalid.
+    """
+    super(_ExperimentalDefinedFunction, self).__init__(
+        func, argnames, input_types, func_name, grad_func, python_grad_func,
+        out_names, shape_func, **kwargs)
+    self._capture_by_value = capture_by_value
+
+  def _create_definition_if_needed(self):
+    """Creates the function definition if it's not created yet."""
+
+    if self._definition is not None:
+      return
+
+    # Create the func_def object.
+    temp_graph = _ExperimentalFuncGraph(capture_by_value=self._capture_by_value)
+    with temp_graph.as_default():
+      # List of placeholders for the function_def.
+      inputs = []
+      for (argname, argtype) in self._args:
+        argholder = array_ops.placeholder(argtype, name=argname)
+        inputs.append(argholder)
+      # Call func and gather the output tensors.
+      with vs.variable_scope("", custom_getter=temp_graph.getvar):
+        outputs = self._func(*inputs)
+      # If func only returned one value, make it a tuple.
+      if not isinstance(outputs, (list, tuple)):
+        outputs = (outputs,)
+      if any([_ is None for _ in outputs]):
+        raise ValueError("Function can not return None.")
+      # Ensures each output is a Tensor.
+      outputs = [ops.convert_to_tensor(_) for _ in outputs]
+    self._extra_inputs = temp_graph.extra_inputs
+    inputs.extend(temp_graph.extra_args)
+    self._sub_functions = temp_graph._functions
+
+    # Build the FunctionDef
+    self._definition = function._graph_to_function_def(
+        temp_graph, temp_graph.get_operations(), inputs, outputs,
+        out_names=self._out_names)
+
+    # Extra kwargs are treated as attrs on the function def.
+    sig_pre_func_name = self._func_name or function._get_func_name(self._func)
+    kwargs_attr = function._parse_kwargs_as_attrs(
+        sig_pre_func_name, **self._extra_kwargs)
+    for k in kwargs_attr:
+      self._definition.attr[k].CopyFrom(kwargs_attr[k])
+
+    # Hash the definition and its dependencies.
+    self._hash_str = self._create_hash_str(
+        self._definition.signature.input_arg,
+        self._definition.signature.output_arg,
+        self._definition.node_def)
+
+    # Finally, we decide the function name to use.  If not specified,
+    # make up something which is almost certainly unique (but deterministic).
+    if not self._func_name:
+      self._func_name = "_".join([function._get_func_name(self._func),
+                                  self._hash_str])
+    self._definition.signature.name = self._func_name
+    if self._func.__doc__:
+      self._definition.signature.description = self._func.__doc__
+
+
+class Defun(function.Defun):
+  """Experimental version of Defun supporting capture-by-value."""
+
+  def __init__(self, *input_types, **kwargs):
+    """Create an experimental `Defun` decorator.
+
+    Args:
+      *input_types: A list of `tf.DType`
+      **kwargs: Optional keyword arguments (see `function.Defun`) plus:
+        capture_by_value - Boolean (defaults to False). If True, captured values
+        will be copied into the function body.
+    """
+    super(Defun, self).__init__(*input_types, **kwargs)
+
+  def __call__(self, func):
+    # Various sanity checks on the callable func.
+    if not callable(func):
+      raise ValueError("func %s must be callable" % func)
+
+    # Func should not use kwargs and defaults.
+    argspec = tf_inspect.getargspec(func)
+    if argspec.keywords or argspec.defaults:
+      raise ValueError("Functions with argument defaults or keyword "
+                       "arguments are not supported.")
+
+    # Computes how many arguments 'func' has.
+    min_args = len(argspec.args)
+    max_args = min_args
+    if argspec.varargs:
+      max_args = 1000000
+    argnames = argspec.args
+    if tf_inspect.ismethod(func):
+      # 1st argument is the "class" type.
+      min_args -= 1
+      argnames = argnames[1:]
+
+    if self._input_types:
+      # If Defun is given a list of types for the inputs, the number
+      # of input types should be compatible with 'func'.
+      num = len(self._input_types)
+      if num < min_args or num > max_args:
+        raise ValueError(
+            "The function has fewer arguments than the number of specified "
+            "input types.")
+      return _ExperimentalDefinedFunction(
+          func, argnames, self._input_types, self._func_name, self._grad_func,
+          self._python_grad_func, out_names=self._out_names,
+          **self._extra_kwargs)
+
+    # 'func' expects no arguments and input types is an empty list.
+    if min_args == 0 and max_args == 0:
+      return _ExperimentalDefinedFunction(
+          func, [], [], self._func_name, self._grad_func,
+          self._python_grad_func, out_names=self._out_names,
+          **self._extra_kwargs)
+
+    # Input types are unknown. It's an overloaded function and hence
+    # its definition needs to be deferred until it's called.
+    return function._OverloadedFunction(
+        func, argnames, self._func_name, self._grad_func,
+        self._python_grad_func, out_names=self._out_names, **self._extra_kwargs)
diff --git a/tensorflow/contrib/data/python/framework/function_test.py b/tensorflow/contrib/data/python/framework/function_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c493170d28219602580acdbd18bfbd6ffc813da1
--- /dev/null
+++ b/tensorflow/contrib/data/python/framework/function_test.py
@@ -0,0 +1,59 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for experimental capture-by-value feature in TF functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.framework import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class FunctionTest(test.TestCase):
+
+  def testCaptureByValue(self):
+    g = ops.Graph()
+    with g.as_default():
+      w = constant_op.constant([[1.0]])
+      b = constant_op.constant([2.0])
+
+      # Foo() captures w and b.
+      @function.Defun(dtypes.float32, capture_by_value=True)
+      def Foo(x):
+
+        # Plus() captures b.
+        @function.Defun(dtypes.float32, capture_by_value=True)
+        def Plus(y):
+          return y + b
+
+        self.assertEqual(0, len(Plus.captured_inputs))
+
+        return Plus(math_ops.matmul(w, x))
+
+      y = Foo(constant_op.constant([[10.]]))
+
+    self.assertEqual(0, len(Foo.captured_inputs))
+
+    with self.test_session(graph=g):
+      self.assertAllEqual(y.eval(), [[12.0]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0d14fd514c39031072c5945e3612687613afd3a0
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -0,0 +1,219 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_test(
+    name = "iterator_ops_test",
+    size = "small",
+    srcs = ["iterator_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "batch_dataset_op_test",
+    size = "small",
+    srcs = ["batch_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+py_test(
+    name = "bucketing_test",
+    size = "small",
+    srcs = ["bucketing_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+py_test(
+    name = "dataset_constructor_op_test",
+    size = "small",
+    srcs = ["dataset_constructor_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "filter_dataset_op_test",
+    size = "small",
+    srcs = ["filter_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "flat_map_dataset_op_test",
+    size = "small",
+    srcs = ["flat_map_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "map_dataset_op_test",
+    size = "small",
+    srcs = ["map_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+py_test(
+    name = "range_dataset_op_test",
+    size = "small",
+    srcs = ["range_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "reader_dataset_ops_test",
+    size = "small",
+    srcs = ["reader_dataset_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "resample_test",
+    size = "medium",
+    srcs = ["resample_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_test(
+    name = "sequence_dataset_op_test",
+    size = "small",
+    srcs = ["sequence_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "shuffle_dataset_op_test",
+    size = "small",
+    srcs = ["shuffle_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "zip_dataset_op_test",
+    size = "small",
+    srcs = ["zip_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c78e1b412c44a190e8db2184f935806c1d7e9a84
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -0,0 +1,276 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class BatchDatasetTest(test.TestCase):
+
+  def testBatchDataset(self):
+    """Test an dataset that maps a TF function across its input elements."""
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(count) -> BatchDataset(batch_size).
+    components = [np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7)]
+
+    count = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+                .repeat(count).batch(batch_size).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
+                     [t.shape.as_list() for t in get_next])
+
+    with self.test_session() as sess:
+      # Batch of a finite input, where the batch_size divides the
+      # total number of elements.
+      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
+      num_batches = (28 * 7) // 14
+      for i in range(num_batches):
+        result = sess.run(get_next)
+        for component, result_component in zip(components, result):
+          for j in range(14):
+            self.assertAllEqual(component[(i*14 + j) % 7]**2,
+                                result_component[j])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Batch of a finite input, where the batch_size does not
+      # divide the total number of elements.
+      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
+
+      # We expect (num_batches - 1) full-sized batches.
+      num_batches = int(math.ceil((14 * 7) / 8))
+      for i in range(num_batches - 1):
+        result = sess.run(get_next)
+        for component, result_component in zip(components, result):
+          for j in range(8):
+            self.assertAllEqual(component[(i*8 + j) % 7]**2,
+                                result_component[j])
+      result = sess.run(get_next)
+      for component, result_component in zip(components, result):
+        for j in range((14 * 7) % 8):
+          self.assertAllEqual(component[((num_batches - 1)*8 + j) % 7]**2,
+                              result_component[j])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Batch of an empty input should fail straight away.
+      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Empty batch should be an initialization time error.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
+
+  def testPaddedBatchDataset(self):
+    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
+    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
+
+    iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens)
+                .map(lambda x: array_ops.fill([x], x)).padded_batch(
+                    4,
+                    padded_shapes=padded_shape).make_initializable_iterator())
+
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Test with random sequence lengths, and max padding.
+      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
+      sess.run(init_op, feed_dict={padded_shape: [-1],
+                                   seq_lens: random_seq_lens})
+      for i in range(8):
+        result = sess.run(get_next)
+        padded_len = np.max(result)
+        self.assertEqual((4, padded_len), result.shape)
+        for j in range(4):
+          seq_len = random_seq_lens[(i*4)+j]
+          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
+          self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test with random sequence lengths, and constant padding.
+      sess.run(init_op, feed_dict={padded_shape: [25],
+                                   seq_lens: random_seq_lens})
+      for i in range(8):
+        result = sess.run(get_next)
+        self.assertEqual((4, 25), result.shape)
+        for j in range(4):
+          seq_len = random_seq_lens[(i*4)+j]
+          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
+          self.assertAllEqual(result[j, seq_len:], [0] * (25 - seq_len))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test correct handling of empty tensors.
+      sess.run(init_op, feed_dict={padded_shape: [-1],
+                                   seq_lens: [0, 0, 0, 0]})
+      result = sess.run(get_next)
+      self.assertAllEqual([[], [], [], []], result)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test error handling with constant sequence lengths, and
+      # too-short padding.
+      sess.run(init_op, feed_dict={padded_shape: [5],
+                                   seq_lens: [6, 5, 5, 5]})
+      with self.assertRaises(errors.DataLossError):
+        result = sess.run(get_next)
+
+  def testPaddedBatchDatasetNonDefaultPadding(self):
+    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
+    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
+
+    def fill_tuple(x):
+      filled = array_ops.fill([x], x)
+      return (filled, string_ops.as_string(filled))
+    iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
+                .padded_batch(
+                    4,
+                    padded_shapes=(padded_shape, padded_shape),
+                    padding_values=(-1, "<end>")).make_initializable_iterator())
+
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Test with random sequence lengths, and max padding.
+      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
+      sess.run(init_op, feed_dict={padded_shape: [-1],
+                                   seq_lens: random_seq_lens})
+      for i in range(8):
+        result = sess.run(get_next)
+        padded_len = np.max(result[0])
+        self.assertEqual((4, padded_len), result[0].shape)
+        self.assertEqual((4, padded_len), result[1].shape)
+        for j in range(4):
+          seq_len = random_seq_lens[(i*4)+j]
+          self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
+          self.assertAllEqual(result[0][j, seq_len:],
+                              [-1] * (padded_len - seq_len))
+          self.assertAllEqual(result[1][j, :seq_len],
+                              [compat.as_bytes(str(seq_len))] * seq_len)
+          self.assertAllEqual(result[1][j, seq_len:],
+                              [b"<end>"] * (padded_len - seq_len))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testPaddedBatchDatasetShapeSpecifications(self):
+    int_placeholder = array_ops.placeholder(dtypes.int32)
+    float_placeholder = array_ops.placeholder(dtypes.float32)
+    string_placeholder = array_ops.placeholder(dtypes.string)
+    input_dataset = dataset_ops.Dataset.from_tensors(
+        (int_placeholder, float_placeholder, string_placeholder))
+
+    # Test different ways of specifying the `padded_shapes` argument.
+    dynamic_padding_from_tensor_shapes = input_dataset.padded_batch(
+        32,
+        padded_shapes=(tensor_shape.TensorShape([None]),
+                       tensor_shape.TensorShape([None, None]),
+                       tensor_shape.TensorShape([37])))
+    dynamic_padding_from_lists = input_dataset.padded_batch(
+        32, padded_shapes=([None], [None, None], [37]))
+    dynamic_padding_from_lists_with_minus_one = input_dataset.padded_batch(
+        32, padded_shapes=([-1], [-1, -1], [37]))
+    dynamic_padding_from_tensors = input_dataset.padded_batch(
+        32,
+        padded_shapes=(constant_op.constant([-1], dtype=dtypes.int64),
+                       constant_op.constant([-1, -1], dtype=dtypes.int64),
+                       constant_op.constant([37], dtype=dtypes.int64)))
+
+    for dataset in [dynamic_padding_from_tensor_shapes,
+                    dynamic_padding_from_lists,
+                    dynamic_padding_from_lists_with_minus_one,
+                    dynamic_padding_from_tensors]:
+      self.assertEqual([None, None], dataset.output_shapes[0].as_list())
+      self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
+      self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
+
+  def testDenseToSparseBatchDataset(self):
+    components = np.random.randint(12, size=(100,)).astype(np.int32)
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
+                .map(lambda x: array_ops.fill([x], x)).dense_to_sparse_batch(
+                    4, [12]).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+
+      for start in range(0, len(components), 4):
+        results = sess.run(get_next)
+        self.assertAllEqual(
+            [[i, j] for i, c in enumerate(components[start:start+4])
+             for j in range(c)], results.indices)
+        self.assertAllEqual(
+            [c for c in components[start:start+4] for _ in range(c)],
+            results.values)
+        self.assertAllEqual(
+            [min(4, len(components) - start), 12], results.dense_shape)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testDenseToSparseBatchDatasetShapeErrors(self):
+    input_tensor = array_ops.placeholder(dtypes.int32)
+    iterator = (dataset_ops.Dataset.from_tensors(input_tensor)
+                .dense_to_sparse_batch(4, [12]).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+
+    with self.test_session() as sess:
+      # Initialize with an input tensor of incompatible rank.
+      sess.run(init_op, feed_dict={input_tensor: [[1]]})
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "incompatible with the row shape"):
+        sess.run(get_next)
+
+      # Initialize with an input tensor that is larger than `row_shape`.
+      sess.run(init_op, feed_dict={input_tensor: range(13)})
+      with self.assertRaisesRegexp(errors.DataLossError,
+                                   "larger than the row shape"):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..20d66d7f231d753436ec104dd0dbfbc469076bca
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -0,0 +1,292 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class BucketingTest(test.TestCase):
+
+  def testSimple(self):
+    components = np.random.randint(100, size=(200,)).astype(np.int64)
+    iterator = dataset_ops.Iterator.from_dataset(
+        dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
+        .group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      counts = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          result = sess.run(get_next)
+          self.assertTrue(
+              all(x % 2 == 0 for x in result) or all(x % 2 == 1)
+              for x in result)
+          counts.append(result.shape[0])
+
+      self.assertEqual(len(components), sum(counts))
+      num_full_batches = len([c for c in counts if c == 4])
+      self.assertGreaterEqual(num_full_batches, 23)
+      self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
+
+  def testImmediateOutput(self):
+    components = np.array(
+        [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
+    iterator = dataset_ops.Iterator.from_dataset(
+        dataset_ops.Dataset.from_tensor_slices(components).repeat(-1)
+        .group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4), 4))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      # The input is infinite, so this test demonstrates that:
+      # 1. We produce output without having to consume the entire input,
+      # 2. Different buckets can produce output at different rates, and
+      # 3. For deterministic input, the output is deterministic.
+      for _ in range(3):
+        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+        self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
+        self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
+        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+
+  def testSmallGroups(self):
+    components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
+    iterator = dataset_ops.Iterator.from_dataset(
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+      self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
+      # The small outputs at the end are deterministically produced in key
+      # order.
+      self.assertAllEqual([0, 0, 0], sess.run(get_next))
+      self.assertAllEqual([1], sess.run(get_next))
+
+  def testReduceFuncError(self):
+    components = np.random.randint(100, size=(200,)).astype(np.int64)
+
+    def reduce_func(_, xs):
+      # Introduce an incorrect padded shape that cannot (currently) be
+      # detected at graph construction time.
+      return xs.padded_batch(
+          4,
+          padded_shapes=(tensor_shape.TensorShape([]),
+                         constant_op.constant([5], dtype=dtypes.int64) * -1))
+
+    iterator = dataset_ops.Iterator.from_dataset(
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: (x, ops.convert_to_tensor([x * x])))
+        .group_by_window(lambda x, _: x % 2, reduce_func, 32))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+  def testConsumeWindowDatasetMoreThanOnce(self):
+    components = np.random.randint(50, size=(200,)).astype(np.int64)
+
+    def reduce_func(key, window):
+      # Apply two different kinds of padding to the input: tight
+      # padding, and quantized (to a multiple of 10) padding.
+      return dataset_ops.Dataset.zip((window.padded_batch(
+          4,
+          padded_shapes=tensor_shape.TensorShape([None])), window.padded_batch(
+              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),))
+
+    iterator = dataset_ops.Iterator.from_dataset(
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x))
+        .group_by_window(
+            lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64),
+            reduce_func, 4))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      counts = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          tight_result, multiple_of_10_result = sess.run(get_next)
+          self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
+          self.assertAllEqual(tight_result,
+                              multiple_of_10_result[:, :tight_result.shape[1]])
+          counts.append(tight_result.shape[0])
+      self.assertEqual(len(components), sum(counts))
+
+
+# NOTE(mrry): These tests are based on the tests in
+# bucket_ops_test.py. Currently, different batch sizes for each key
+# are not supported, although this would be possible to add to
+# `Dataset.group_by_window()`.
+class BucketTest(test.TestCase):
+
+  def _dynamicPad(self, bucket, window, window_size):
+    # TODO(mrry): To match `tf.contrib.training.bucket()`, implement a
+    # generic form of padded_batch that pads every component
+    # dynamically and does not rely on static shape information about
+    # the arguments.
+    return dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(bucket), window.padded_batch(
+            32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape([None]),
+                 tensor_shape.TensorShape([3])))))
+
+  def testSingleBucket(self):
+    def _map_fn(v):
+      return (v, array_ops.fill([v], v),
+              array_ops.fill([3], string_ops.as_string(v)))
+
+    input_dataset = (
+        dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn))
+
+    bucketed_dataset = input_dataset.group_by_window(
+        lambda x, y, z: 0, lambda k, bucket: self._dynamicPad(k, bucket, 32),
+        32)
+
+    iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+
+      which_bucket, bucketed_values = sess.run(get_next)
+
+      self.assertEqual(0, which_bucket)
+
+      expected_scalar_int = np.arange(32, dtype=np.int64)
+      expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
+      for i in range(32):
+        expected_unk_int64[i, :i] = i
+      expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T
+
+      self.assertAllEqual(expected_scalar_int, bucketed_values[0])
+      self.assertAllEqual(expected_unk_int64, bucketed_values[1])
+      self.assertAllEqual(expected_vec3_str, bucketed_values[2])
+
+  def testEvenOddBuckets(self):
+    def _map_fn(v):
+      return (v, array_ops.fill([v], v),
+              array_ops.fill([3], string_ops.as_string(v)))
+
+    input_dataset = (
+        dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn))
+
+    bucketed_dataset = input_dataset.group_by_window(
+        lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
+        lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)
+
+    iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+
+      # Get two minibatches (one containing even values, one containing odds)
+      which_bucket_even, bucketed_values_even = sess.run(get_next)
+      which_bucket_odd, bucketed_values_odd = sess.run(get_next)
+
+      # Count number of bucket_tensors.
+      self.assertEqual(3, len(bucketed_values_even))
+      self.assertEqual(3, len(bucketed_values_odd))
+
+      # Ensure bucket 0 was used for all minibatch entries.
+      self.assertAllEqual(0, which_bucket_even)
+      self.assertAllEqual(1, which_bucket_odd)
+
+      # Test the first bucket outputted, the events starting at 0
+      expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
+      expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
+      for i in range(0, 32):
+        expected_unk_int64[i, :2 * i] = 2 * i
+        expected_vec3_str = np.vstack(
+            3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T
+
+      self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
+      self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
+      self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])
+
+      # Test the second bucket outputted, the odds starting at 1
+      expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
+      expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
+      for i in range(0, 32):
+        expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
+        expected_vec3_str = np.vstack(
+            3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T
+
+      self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
+      self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
+      self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
+
+  def testEvenOddBucketsFilterOutAllOdd(self):
+    def _map_fn(v):
+      return (v, array_ops.fill([v], v),
+              array_ops.fill([3], string_ops.as_string(v)))
+
+    input_dataset = (
+        dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
+        .filter(lambda x, y, z: math_ops.equal(x % 2, 0)))
+
+    bucketed_dataset = input_dataset.group_by_window(
+        lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
+        lambda k, bucket: self._dynamicPad(k, bucket, 32), 32)
+
+    iterator = dataset_ops.Iterator.from_dataset(bucketed_dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+
+      # Get two minibatches ([0, 2, ...] and [64, 66, ...])
+      which_bucket0, bucketed_values_even0 = sess.run(get_next)
+      which_bucket1, bucketed_values_even1 = sess.run(get_next)
+
+      # Ensure that bucket 1 was completely filtered out
+      self.assertAllEqual(0, which_bucket0)
+      self.assertAllEqual(0, which_bucket1)
+      self.assertAllEqual(
+          np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0[0])
+      self.assertAllEqual(
+          np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1[0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..acff83c239625b03af04f64cd3b43d5c3bc3fa8b
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
@@ -0,0 +1,239 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class DatasetConstructorTest(test.TestCase):
+
+  def testTensorDataset(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = [np.array(1), np.array([1, 2, 3]), np.array(37.0)]
+
+    iterator = (dataset_ops.Dataset.from_tensors(components)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testTensorSliceDataset(self):
+    """Test an dataset that represents the slices from a tuple of tensors."""
+    components = [
+        np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
+            np.array([[12], [13], [14], [15]]), 22),
+        np.array([37.0, 38.0, 39.0, 40.0])
+    ]
+
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(4):
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component[i], result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSparseTensorSliceDataset(self):
+    """Test a dataset based on slices of a `tf.SparseTensor`."""
+    st = array_ops.sparse_placeholder(dtypes.float64)
+    iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+
+    with self.test_session() as sess:
+      slices = [[1., 2., 3.], [1.], [1.], [1., 2.], [], [1., 2.], [], [], []]
+
+      # Test with sparse tensor in the appropriate order.
+      indices = np.array(
+          [[i, j] for i in range(len(slices)) for j in range(len(slices[i]))])
+      values = np.array([val for s in slices for val in s])
+      dense_shape = np.array([len(slices), max(len(s) for s in slices) + 1])
+      sparse_feed = sparse_tensor.SparseTensorValue(indices, values,
+                                                    dense_shape)
+      sess.run(init_op, feed_dict={st: sparse_feed})
+      for i, s in enumerate(slices):
+        results = sess.run(get_next)
+        self.assertAllEqual(s, results.values)
+        expected_indices = np.array(
+            [[j] for j in range(len(slices[i]))]).reshape([-1, 1])
+        self.assertAllEqual(expected_indices, results.indices)
+        self.assertAllEqual(dense_shape[1:], results.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test with sparse tensor in the reverse order, which is not
+      # currently supported.
+      reverse_order_indices = indices[::-1, :]
+      reverse_order_values = values[::-1]
+      sparse_feed = sparse_tensor.SparseTensorValue(
+          reverse_order_indices, reverse_order_values, dense_shape)
+      with self.assertRaises(errors.UnimplementedError):
+        sess.run(init_op, feed_dict={st: sparse_feed})
+
+      # Test with an empty sparse tensor.
+      empty_indices = np.empty((0, 4), dtype=np.int64)
+      empty_values = np.empty((0,), dtype=np.float64)
+      empty_dense_shape = [0, 4, 37, 9]
+      sparse_feed = sparse_tensor.SparseTensorValue(empty_indices, empty_values,
+                                                    empty_dense_shape)
+      sess.run(init_op, feed_dict={st: sparse_feed})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  # pylint: disable=g-long-lambda,unnecessary-lambda
+  def testNestedStructure(self):
+    components = (np.array([1, 2, 3]), (np.array([4., 5.]), np.array([6., 7.])),
+                  np.array([8, 9, 10]))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.shuffle(10, 10)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.repeat(-1)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.filter(lambda x, y, z: True)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.take(5)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
+
+    dataset = dataset.flat_map(
+        lambda x, y: dataset_ops.Dataset.from_tensors(((x[0], x[1]),
+                                                       (y[0], y[1])))
+    )
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
+
+    dataset = dataset.batch(32)
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([None, 3], [None, 3]), ([None, 2], [None, 2])),
+                      nest.pack_sequence_as(dataset.output_shapes, [
+                          s.as_list()
+                          for s in nest.flatten(dataset.output_shapes)
+                      ]))
+
+    iterator = dataset.make_one_shot_iterator()
+    (w, x), (y, z) = iterator.get_next()
+    self.assertEquals(dtypes.int64, w.dtype)
+    self.assertEquals(dtypes.int64, x.dtype)
+    self.assertEquals(dtypes.float64, y.dtype)
+    self.assertEquals(dtypes.float64, z.dtype)
+    self.assertEquals([None, 3], w.shape.as_list())
+    self.assertEquals([None, 3], x.shape.as_list())
+    self.assertEquals([None, 2], y.shape.as_list())
+    self.assertEquals([None, 2], z.shape.as_list())
+
+    iterator = dataset.make_initializable_iterator()
+    (w, x), (y, z) = iterator.get_next()
+    self.assertEquals(dtypes.int64, w.dtype)
+    self.assertEquals(dtypes.int64, x.dtype)
+    self.assertEquals(dtypes.float64, y.dtype)
+    self.assertEquals(dtypes.float64, z.dtype)
+    self.assertEquals([None, 3], w.shape.as_list())
+    self.assertEquals([None, 3], x.shape.as_list())
+    self.assertEquals([None, 2], y.shape.as_list())
+    self.assertEquals([None, 2], z.shape.as_list())
+
+    # Define a separate set of components with matching leading
+    # dimension for the from-slices constructor.
+    components_for_slices = (np.array([1, 2, 3]), (np.array(
+        [4., 5., 6.]), np.array([7., 8., 9.])), np.array([10, 11, 12]))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components_for_slices)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([], ([], []), []), dataset.output_shapes)
+
+  def testNonSequenceNestedStructure(self):
+    components = np.array([1, 2, 3])
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    dataset = dataset.filter(
+        lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    dataset = dataset.map(lambda x: array_ops.stack([x, x]))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([2, 3], dataset.output_shapes)
+
+    dataset = dataset.flat_map(
+        lambda x: dataset_ops.Dataset.from_tensor_slices(x))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+    self.assertEquals(dtypes.int64, get_next.dtype)
+    self.assertEquals([3], get_next.shape)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4e994479fb9e1f78b0fb7a42a231db9fb18e306
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -0,0 +1,77 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class FilterDatasetTest(test.TestCase):
+
+  def testFilterDataset(self):
+    components = [
+        np.arange(7, dtype=np.int64),
+        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
+            7, dtype=np.int64)[:, np.newaxis],
+        np.array(37.0, dtype=np.float64) * np.arange(7)
+    ]
+    count = array_ops.placeholder(dtypes.int64, shape=[])
+    modulus = array_ops.placeholder(dtypes.int64)
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(count)
+        .filter(lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      # Test that we can dynamically feed a different modulus value for each
+      # iterator.
+      def do_test(count_val, modulus_val):
+        sess.run(init_op, feed_dict={count: count_val, modulus: modulus_val})
+        for _ in range(count_val):
+          for i in [x for x in range(7) if x**2 % modulus_val == 0]:
+            result = sess.run(get_next)
+            for component, result_component in zip(components, result):
+              self.assertAllEqual(component[i]**2, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+      do_test(14, 2)
+      do_test(4, 18)
+
+      # Test an empty dataset.
+      do_test(0, 1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..705cfef9017733feb1e5606d640a7aca81bfcc74
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.client import session
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class FlatMapDatasetTest(test.TestCase):
+
+  # pylint: disable=g-long-lambda
+  def testFlatMapDataset(self):
+    repeats = [1, 2, 3, 4, 5, 0, 1]
+    components = [np.array(repeats, dtype=np.int64)]
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .flat_map(lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next, = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in repeats:
+        for _ in range(i):
+          self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testNestedFlatMapDataset(self):
+    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
+    components = [np.array(repeats, dtype=np.int64)]
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices([x])
+                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors([y])
+                            .repeat(y))).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next, = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for row in repeats:
+        for i in row:
+          for _ in range(i):
+            self.assertEqual(i, sess.run(get_next))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSharedResourceNestedFlatMapDataset(self):
+    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
+    components = [np.array(repeats, dtype=np.int64)]
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices([x])
+                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors([y])
+                            .repeat(y))).make_initializable_iterator(
+                                shared_name="shared_flat_map_iterator"))
+    init_op = iterator.initializer
+    get_next, = iterator.get_next()
+
+    # Create two concurrent sessions that share the same iterator
+    # resource on the same server, and verify that a random
+    # interleaving of `Session.run(get_next)` calls on the two
+    # sessions yields the expected result.
+    server = server_lib.Server.create_local_server()
+    with session.Session(server.target) as sess1:
+      with session.Session(server.target) as sess2:
+        for _ in range(3):
+          sess = random.choice([sess1, sess2])
+          sess.run(init_op)
+          for row in repeats:
+            for i in row:
+              for _ in range(i):
+                sess = random.choice([sess1, sess2])
+                self.assertEqual(i, sess.run(get_next))
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess = random.choice([sess1, sess2])
+          sess.run(get_next)
+  # pylint: enable=g-long-lambda
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ace04919e7c9990f16060f26a87767906d240e1b
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py
@@ -0,0 +1,252 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class IteratorTest(test.TestCase):
+
+  def testOneShotIterator(self):
+    components = [np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7)]
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+                .repeat(14).make_one_shot_iterator())
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      for _ in range(14):
+        for i in range(7):
+          result = sess.run(get_next)
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testOneShotIteratorCaptureByValue(self):
+    components = [np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7)]
+    tensor_components = [ops.convert_to_tensor(c) for c in components]
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    iterator = (dataset_ops.Dataset.from_tensor_slices(tensor_components)
+                .map(_map_fn).repeat(14).make_one_shot_iterator())
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      for _ in range(14):
+        for i in range(7):
+          result = sess.run(get_next)
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testOneShotIteratorInsideContainer(self):
+    components = [np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7)]
+
+    def within_container():
+      def _map_fn(x, y, z):
+        return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+      iterator = (dataset_ops.Dataset.from_tensor_slices(components)
+                  .map(_map_fn).repeat(14).make_one_shot_iterator())
+      return iterator.get_next()
+
+    server = server_lib.Server.create_local_server()
+
+    # Create two iterators within unique containers, and run them to
+    # make sure that the resources aren't shared.
+    #
+    # The test below would fail if cname were the same across both
+    # sessions.
+    for i in range(2):
+      with session.Session(server.target) as sess:
+        cname = "iteration%d" % i
+        with ops.container(cname):
+          get_next = within_container()
+
+        for _ in range(14):
+          for i in range(7):
+            result = sess.run(get_next)
+            for component, result_component in zip(components, result):
+              self.assertAllEqual(component[i]**2, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testSimpleSharedResource(self):
+    components = [
+        np.array(1, dtype=np.int64),
+        np.array([1, 2, 3], dtype=np.int64),
+        np.array(37.0, dtype=np.float64)
+    ]
+
+    server = server_lib.Server.create_local_server()
+
+    # Create two non-overlapping sessions that share the same iterator
+    # resource on the same server, and verify that an action of the
+    # first session (initializing the iterator) is visible in the
+    # second session.
+    with ops.Graph().as_default():
+      iterator = (dataset_ops.Dataset.from_tensors(components)
+                  .map(lambda x, y, z: (x, y, z)).make_initializable_iterator(
+                      shared_name="shared_iterator"))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+
+      with session.Session(server.target) as sess:
+        sess.run(init_op)
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+        # Re-initialize the iterator in the first session.
+        sess.run(init_op)
+
+    with ops.Graph().as_default():
+      # Re-define the iterator manually, without defining any of the
+      # functions in this graph, to ensure that we are not
+      # accidentally redefining functions with the same names in the
+      # new graph.
+      iterator = dataset_ops.Iterator.from_structure(
+          shared_name="shared_iterator",
+          output_types=[dtypes.int64, dtypes.int64, dtypes.float64],
+          output_shapes=[[], [3], []])
+      get_next = iterator.get_next()
+
+      with session.Session(server.target) as sess:
+        # Use the iterator without re-initializing in the second session.
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testNotInitializedError(self):
+    components = [np.array(1), np.array([1, 2, 3]), np.array(37.0)]
+    iterator = (dataset_ops.Dataset.from_tensors(components)
+                .make_initializable_iterator())
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(errors.FailedPreconditionError,
+                                   "iterator has not been initialized"):
+        sess.run(get_next)
+
+  def testReinitializableIterator(self):
+    dataset_3 = dataset_ops.Dataset.from_tensors(
+        constant_op.constant([1, 2, 3]))
+    dataset_4 = dataset_ops.Dataset.from_tensors(
+        constant_op.constant([4, 5, 6, 7]))
+    iterator = dataset_ops.Iterator.from_structure(dataset_3.output_types,
+                                                   [None])
+
+    dataset_3_init_op = iterator.make_initializer(dataset_3)
+    dataset_4_init_op = iterator.make_initializer(dataset_4)
+    get_next = iterator.get_next()
+
+    self.assertEqual(dataset_3.output_types, iterator.output_types)
+    self.assertEqual(dataset_4.output_types, iterator.output_types)
+    self.assertEqual([None], iterator.output_shapes.as_list())
+
+    with self.test_session() as sess:
+      # The iterator is initially uninitialized.
+      with self.assertRaises(errors.FailedPreconditionError):
+        sess.run(get_next)
+
+      # Initialize with one dataset.
+      sess.run(dataset_3_init_op)
+      self.assertAllEqual([1, 2, 3], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Initialize with a different dataset.
+      sess.run(dataset_4_init_op)
+      self.assertAllEqual([4, 5, 6, 7], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Reinitialize with the first dataset.
+      sess.run(dataset_3_init_op)
+      self.assertAllEqual([1, 2, 3], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testReinitializableIteratorStaticErrors(self):
+    # Non-matching structure for types and shapes.
+    with self.assertRaises(TypeError):
+      iterator = dataset_ops.Iterator.from_structure((dtypes.int64,
+                                                      dtypes.float64), [None])
+
+    # Test validation of dataset argument.
+    iterator = dataset_ops.Iterator.from_structure((dtypes.int64,
+                                                    dtypes.float64))
+
+    # Incompatible structure.
+    with self.assertRaises(ValueError):
+      iterator.make_initializer(
+          dataset_ops.Dataset.from_tensors(((constant_op.constant(
+              [1, 2, 3], dtype=dtypes.int64),), (constant_op.constant(
+                  [4., 5., 6., 7.], dtype=dtypes.float64),))))
+
+    # Incompatible types.
+    with self.assertRaises(TypeError):
+      iterator.make_initializer(
+          dataset_ops.Dataset.from_tensors((constant_op.constant(
+              [1, 2, 3], dtype=dtypes.int32), constant_op.constant(
+                  [4., 5., 6., 7.], dtype=dtypes.float32))))
+
+    # Incompatible shapes.
+    iterator = dataset_ops.Iterator.from_structure(
+        (dtypes.int64, dtypes.float64), ([None], []))
+    with self.assertRaises(TypeError):
+      iterator.make_initializer(
+          dataset_ops.Dataset.from_tensors((constant_op.constant(
+              [1, 2, 3], dtype=dtypes.int64), constant_op.constant(
+                  [4., 5., 6., 7.], dtype=dtypes.float64))))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7f8ebbae82c4c8ba3cb1fe9fd9bef84c1ac50f
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -0,0 +1,330 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+
+
+class MapDatasetTest(test.TestCase):
+
+  def _buildMapDataset(self, components, count):
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+    return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+            .repeat(count))
+
+  def testMapDataset(self):
+    """Test an dataset that maps a TF function across its input elements."""
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(count).
+    components = [np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7)]
+    count = array_ops.placeholder(dtypes.int64, shape=[])
+
+    dataset = self._buildMapDataset(components, count)
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      # Test single-threaded access to the iterator.
+      sess.run(init_op, feed_dict={count: 14})
+      for _ in range(14):
+        for i in range(7):
+          result = sess.run(get_next)
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test multi-threaded access to the same iterator.
+      sess.run(init_op, feed_dict={count: 18})
+      results = []
+      def iterator_thread():
+        while True:
+          try:
+            results.append(sess.run(get_next))
+          except errors.OutOfRangeError:
+            return
+      threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
+      for t in threads:
+        t.start()
+      for t in threads:
+        t.join()
+
+      # `results` will contain the same elements components**2
+      # repeated 18 times, but in a non-deterministic order. Sort the
+      # results, and assert that each element of components**2 is
+      # produced 18 times.
+      results.sort(key=lambda x: x[0])
+      for i in range(7):
+        for j in range(18):
+          for component, result_component in zip(components,
+                                                 results[i * 18 + j]):
+            self.assertAllEqual(component[i]**2, result_component)
+
+  def _buildParallelMapDataset(self, components, count, num_threads,
+                               output_buffer_size):
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+    return (dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn, num_threads=num_threads, output_buffer_size=output_buffer_size)
+            .repeat(count))
+
+  def testParallelMapDataset(self):
+    """Test an dataset that maps a TF function across its input elements."""
+    # The pipeline is TensorSliceDataset -> ParallelMapDataset(square_3) ->
+    # RepeatDataset(count).
+    components = [np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7)]
+    count = array_ops.placeholder(dtypes.int64, shape=[])
+    num_threads = array_ops.placeholder(dtypes.int32, shape=[])
+    output_buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    dataset = self._buildParallelMapDataset(components, count, num_threads,
+                                            output_buffer_size)
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      def do_test(num_threads_val, output_buffer_size_val):
+        # Test single-threaded access to the iterator.
+        sess.run(init_op, feed_dict={
+            count: 14,
+            num_threads: num_threads_val,
+            output_buffer_size: output_buffer_size_val})
+        for _ in range(14):
+          for i in range(7):
+            result = sess.run(get_next)
+            for component, result_component in zip(components, result):
+              self.assertAllEqual(component[i]**2, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+        # Test multi-threaded access to the same iterator.
+        sess.run(init_op, feed_dict={
+            count: 18,
+            num_threads: num_threads_val,
+            output_buffer_size: output_buffer_size_val})
+        results = []
+        def iterator_thread():
+          while True:
+            try:
+              results.append(sess.run(get_next))
+            except errors.OutOfRangeError:
+              return
+        threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
+        for t in threads:
+          t.start()
+        for t in threads:
+          t.join()
+
+        # `results` will contain the same elements components**2
+        # repeated 18 times, but in a non-deterministic order. Sort the
+        # results, and assert that each element of components**2 is
+        # produced 18 times.
+        results.sort(key=lambda x: x[0])
+        for i in range(7):
+          for j in range(18):
+            for component, result_component in zip(components,
+                                                   results[i * 18 + j]):
+              self.assertAllEqual(component[i]**2, result_component)
+
+      for num_threads_val, output_buffer_size_val in [
+          (1, 1), (1, 2), (2, 2), (2, 4), (8, 8), (8, 16)]:
+        do_test(num_threads_val, output_buffer_size_val)
+
+  def _testDisposeParallelMapDataset(self, explicit_dispose):
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(1000).
+    components = [np.arange(1000),
+                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
+                  np.array(37.0) * np.arange(1000)]
+
+    dataset = self._buildParallelMapDataset(components, 1000, 100, 100)
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    if explicit_dispose:
+      dispose_op = iterator.dispose_op()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+      if explicit_dispose:
+        sess.run(dispose_op)
+
+  def testExplicitDisposeParallelMapDataset(self):
+    self._testDisposeParallelMapDataset(True)
+
+  def testImplicitDisposeParallelMapDataset(self):
+    self._testDisposeParallelMapDataset(False)
+
+  def testParallelMapError(self):
+    components = [np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)]
+
+    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
+               .map(lambda x: array_ops.check_numerics(x, "message")))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+      sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaptureHashTable(self):
+    # NOTE(mrry): We must use the V2 variants of `HashTable`
+    # etc. because these produce a `tf.resource`-typed output that is
+    # compatible with the in-graph function implementation.
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.HashTable(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+
+    input_sentences = dataset_ops.Dataset.from_tensor_slices(
+        constant_op.constant([
+            "brain brain tank salad surgery",
+            "surgery brain",
+        ]))
+
+    iterator = (input_sentences
+                .map(lambda x: string_ops.string_split([x]).values)
+                .map(table.lookup)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(table.init)
+      sess.run(init_op)
+
+      print(sess.run(get_next))
+      print(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaptureQueue(self):
+    elements = np.random.randint(100, size=[200])
+    queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
+    enqueue_op = queue.enqueue_many(elements)
+    close_op = queue.close()
+    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
+                .map(lambda _: queue.dequeue()).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(enqueue_op)
+      sess.run(close_op)
+      sess.run(init_op)
+      for element in elements:
+        self.assertEqual(element, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaptureVariable(self):
+    counter_var = variable_scope.get_variable(
+        "counter", (), dtypes.int32, use_resource=True)
+    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
+                .map(lambda _: counter_var.assign_add(1))
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(counter_var.initializer)
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual(i, sess.run(counter_var))
+        self.assertEqual(i + 1, sess.run(get_next))
+      self.assertEqual(10, sess.run(counter_var))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      self.assertEqual(10, sess.run(counter_var))
+
+  def testCaptureUninitializedVariableError(self):
+    counter_var = variable_scope.get_variable(
+        "counter", (), dtypes.int32, use_resource=True)
+    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
+                .map(lambda _: counter_var.assign_add(1))
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(errors.FailedPreconditionError,
+                                   "Failed to capture resource"):
+        sess.run(init_op)
+
+  def testSeededStatefulOperatorIsProperlyStateful(self):
+    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
+                .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      random_values = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          random_values.extend(sess.run(get_next))
+      self.assertEqual(10, len(random_values))
+      self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
+      sess.run(init_op)
+      random_values_2 = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          random_values_2.extend(sess.run(get_next))
+
+      # Randomness is repeatable given same seed
+      self.assertAllClose(random_values, random_values_2)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e10e22c353c4ec07338c90a8f3919a7978139aa
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
@@ -0,0 +1,182 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test RangeDataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class RangeDatasetTest(test.TestCase):
+
+  def testStop(self):
+    stop = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(stop).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={stop: 5})
+      for i in range(5):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testStartStop(self):
+    start = array_ops.placeholder(dtypes.int64, shape=[])
+    stop = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(start,
+                                         stop).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={start: 2, stop: 5})
+      for i in range(2, 5):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testStartStopStep(self):
+    start = array_ops.placeholder(dtypes.int64, shape=[])
+    stop = array_ops.placeholder(dtypes.int64, shape=[])
+    step = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(start, stop,
+                                         step).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={start: 2, stop: 10, step: 2})
+      for i in range(2, 10, 2):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testZeroStep(self):
+    start = array_ops.placeholder(dtypes.int64, shape=[])
+    stop = array_ops.placeholder(dtypes.int64, shape=[])
+    step = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(start, stop,
+                                         step).make_initializable_iterator()
+    init_op = iterator.initializer
+
+    with self.test_session() as sess:
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(init_op, feed_dict={start: 2, stop: 10, step: 0})
+
+  def testNegativeStep(self):
+    start = array_ops.placeholder(dtypes.int64, shape=[])
+    stop = array_ops.placeholder(dtypes.int64, shape=[])
+    step = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(start, stop,
+                                         step).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={start: 2, stop: 10, step: -1})
+      # This for loop is a no-op but will ensure that the implementation is
+      # consistent with range if it ever changes.
+      for i in range(2, 10, -1):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testStopLessThanStart(self):
+    start = array_ops.placeholder(dtypes.int64, shape=[])
+    stop = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(start,
+                                         stop).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={start: 10, stop: 2})
+      # This for loop is a no-op but will ensure that the implementation is
+      # consistent with range if it ever changes.
+      for i in range(10, 2):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testStopLessThanStartWithPositiveStep(self):
+    start = array_ops.placeholder(dtypes.int64, shape=[])
+    stop = array_ops.placeholder(dtypes.int64, shape=[])
+    step = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(start, stop,
+                                         step).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={start: 10, stop: 2, step: 2})
+      # This for loop is a no-op but will ensure that the implementation is
+      # consistent with range if it ever changes.
+      for i in range(10, 2, 2):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testStopLessThanStartWithNegativeStep(self):
+    start = array_ops.placeholder(dtypes.int64, shape=[])
+    stop = array_ops.placeholder(dtypes.int64, shape=[])
+    step = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(start, stop,
+                                         step).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={start: 10, stop: 2, step: -1})
+      for i in range(10, 2, -1):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testEnumerateDataset(self):
+    components = [np.array(["a", "b"]), np.array([1, 2]), np.array([37.0, 38])]
+    start = constant_op.constant(20, dtype=dtypes.int64)
+
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components).enumerate(
+        start=start).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(dtypes.int64, get_next[0].dtype)
+    self.assertEqual((), get_next[0].shape)
+    self.assertEqual([tensor_shape.TensorShape([])] * 3,
+                     [t.shape for t in get_next[1]])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      self.assertEqual((20, [b"a", 1, 37.0]), sess.run(get_next))
+      self.assertEqual((21, [b"b", 2, 38.0]), sess.run(get_next))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d5b34b77b8fda206fdb4dce83eed4f15cf1a49c
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -0,0 +1,500 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import zlib
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class TextLineDatasetTest(test.TestCase):
+
+  def _lineText(self, f, l):
+    return compat.as_bytes("%d: %d" % (f, l))
+
+  def _createFiles(self, num_files, num_lines, crlf=False):
+    filenames = []
+    for i in range(num_files):
+      fn = os.path.join(self.get_temp_dir(), "text_line.%d.txt" % i)
+      filenames.append(fn)
+      with open(fn, "wb") as f:
+        for j in range(num_lines):
+          f.write(self._lineText(i, j))
+          # Always include a newline after the record unless it is
+          # at the end of the file, in which case we include it sometimes.
+          if j + 1 != num_lines or i == 0:
+            f.write(b"\r\n" if crlf else b"\n")
+    return filenames
+
+  def testTextLineDataset(self):
+    test_filenames = self._createFiles(2, 5, crlf=True)
+    filenames = array_ops.placeholder(dtypes.string, shape=[None])
+    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = dataset_ops.TextLineDataset(filenames).repeat(num_epochs)
+    batch_dataset = repeat_dataset.batch(batch_size)
+
+    iterator = dataset_ops.Iterator.from_structure(batch_dataset.output_types)
+    init_op = iterator.make_initializer(repeat_dataset)
+    init_batch_op = iterator.make_initializer(batch_dataset)
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Basic test: read from file 0.
+      sess.run(init_op, feed_dict={filenames: [test_filenames[0]],
+                                   num_epochs: 1})
+      for i in range(5):
+        self.assertEqual(self._lineText(0, i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from file 1.
+      sess.run(init_op, feed_dict={filenames: [test_filenames[1]],
+                                   num_epochs: 1})
+      for i in range(5):
+        self.assertEqual(self._lineText(1, i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from both files.
+      sess.run(init_op, feed_dict={filenames: test_filenames,
+                                   num_epochs: 1})
+      for j in range(2):
+        for i in range(5):
+          self.assertEqual(self._lineText(j, i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test repeated iteration through both files.
+      sess.run(init_op, feed_dict={filenames: test_filenames,
+                                   num_epochs: 10})
+      for _ in range(10):
+        for j in range(2):
+          for i in range(5):
+            self.assertEqual(self._lineText(j, i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test batched and repeated iteration through both files.
+      sess.run(init_batch_op, feed_dict={filenames: test_filenames,
+                                         num_epochs: 10,
+                                         batch_size: 5})
+      for _ in range(10):
+        self.assertAllEqual([self._lineText(0, i) for i in range(5)],
+                            sess.run(get_next))
+        self.assertAllEqual([self._lineText(1, i) for i in range(5)],
+                            sess.run(get_next))
+
+
+class FixedLengthRecordReaderTest(test.TestCase):
+
+  def setUp(self):
+    super(FixedLengthRecordReaderTest, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+    self._header_bytes = 5
+    self._record_bytes = 3
+    self._footer_bytes = 2
+
+  def _record(self, f, r):
+    return compat.as_bytes(str(f * 2 + r) * self._record_bytes)
+
+  def _createFiles(self):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
+      filenames.append(fn)
+      with open(fn, "wb") as f:
+        f.write(b"H" * self._header_bytes)
+        for j in range(self._num_records):
+          f.write(self._record(i, j))
+        f.write(b"F" * self._footer_bytes)
+    return filenames
+
+  def testFixedLengthRecordDataset(self):
+    test_filenames = self._createFiles()
+    filenames = array_ops.placeholder(dtypes.string, shape=[None])
+    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = (dataset_ops.FixedLengthRecordDataset(
+        filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
+                      .repeat(num_epochs))
+    batch_dataset = repeat_dataset.batch(batch_size)
+
+    iterator = dataset_ops.Iterator.from_structure(batch_dataset.output_types)
+    init_op = iterator.make_initializer(repeat_dataset)
+    init_batch_op = iterator.make_initializer(batch_dataset)
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Basic test: read from file 0.
+      sess.run(init_op, feed_dict={filenames: [test_filenames[0]],
+                                   num_epochs: 1})
+      for i in range(self._num_records):
+        self.assertEqual(self._record(0, i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from file 1.
+      sess.run(init_op, feed_dict={filenames: [test_filenames[1]],
+                                   num_epochs: 1})
+      for i in range(self._num_records):
+        self.assertEqual(self._record(1, i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Basic test: read from both files.
+      sess.run(init_op, feed_dict={filenames: test_filenames,
+                                   num_epochs: 1})
+      for j in range(self._num_files):
+        for i in range(self._num_records):
+          self.assertEqual(self._record(j, i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test repeated iteration through both files.
+      sess.run(init_op, feed_dict={filenames: test_filenames,
+                                   num_epochs: 10})
+      for _ in range(10):
+        for j in range(self._num_files):
+          for i in range(self._num_records):
+            self.assertEqual(self._record(j, i), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test batched and repeated iteration through both files.
+      sess.run(init_batch_op, feed_dict={filenames: test_filenames,
+                                         num_epochs: 10,
+                                         batch_size: self._num_records})
+      for _ in range(10):
+        for j in range(self._num_files):
+          self.assertAllEqual([self._record(j, i)
+                               for i in range(self._num_records)],
+                              sess.run(get_next))
+
+
+class TFRecordDatasetTest(test.TestCase):
+
+  def setUp(self):
+    super(TFRecordDatasetTest, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+
+    self.test_filenames = self._createFiles()
+
+    self.filenames = array_ops.placeholder(dtypes.string, shape=[None])
+    self.num_epochs = array_ops.placeholder_with_default(
+        constant_op.constant(1, dtypes.int64), shape=[])
+    self.compression_type = array_ops.placeholder_with_default("", shape=[])
+    self.batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = dataset_ops.TFRecordDataset(
+        self.filenames, self.compression_type).repeat(self.num_epochs)
+    batch_dataset = repeat_dataset.batch(self.batch_size)
+
+    iterator = dataset_ops.Iterator.from_structure(batch_dataset.output_types)
+    self.init_op = iterator.make_initializer(repeat_dataset)
+    self.init_batch_op = iterator.make_initializer(batch_dataset)
+    self.get_next = iterator.get_next()
+
+  def _record(self, f, r):
+    return compat.as_bytes("Record %d of file %d" % (r, f))
+
+  def _createFiles(self):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
+      filenames.append(fn)
+      writer = python_io.TFRecordWriter(fn)
+      for j in range(self._num_records):
+        writer.write(self._record(i, j))
+      writer.close()
+    return filenames
+
+  def testReadOneEpoch(self):
+    with self.test_session() as sess:
+      # Basic test: read from file 0.
+      sess.run(self.init_op,
+               feed_dict={self.filenames: [self.test_filenames[0]],
+                          self.num_epochs: 1})
+      for i in range(self._num_records):
+        self.assertAllEqual(self._record(0, i), sess.run(self.get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.get_next)
+
+      # Basic test: read from file 1.
+      sess.run(self.init_op,
+               feed_dict={self.filenames: [self.test_filenames[1]],
+                          self.num_epochs: 1})
+      for i in range(self._num_records):
+        self.assertAllEqual(self._record(1, i), sess.run(self.get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.get_next)
+
+      # Basic test: read from both files.
+      sess.run(self.init_op,
+               feed_dict={self.filenames: self.test_filenames,
+                          self.num_epochs: 1})
+      for j in range(self._num_files):
+        for i in range(self._num_records):
+          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.get_next)
+
+  def testReadTenEpochs(self):
+    with self.test_session() as sess:
+      sess.run(self.init_op, feed_dict={self.filenames: self.test_filenames,
+                                        self.num_epochs: 10})
+      for _ in range(10):
+        for j in range(self._num_files):
+          for i in range(self._num_records):
+            self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.get_next)
+
+  def testReadTenEpochsOfBatches(self):
+    with self.test_session() as sess:
+      sess.run(self.init_batch_op,
+               feed_dict={self.filenames: self.test_filenames,
+                          self.num_epochs: 10,
+                          self.batch_size: self._num_records})
+      for _ in range(10):
+        for j in range(self._num_files):
+          values = sess.run(self.get_next)
+          self.assertAllEqual([self._record(j, i)
+                               for i in range(self._num_records)], values)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.get_next)
+
+  def testReadZlibFiles(self):
+    zlib_files = []
+    for i, fn in enumerate(self.test_filenames):
+      with open(fn, "rb") as f:
+        cdata = zlib.compress(f.read())
+
+        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
+        with open(zfn, "wb") as f:
+          f.write(cdata)
+        zlib_files.append(zfn)
+
+    with self.test_session() as sess:
+      sess.run(self.init_op,
+               feed_dict={self.filenames: zlib_files,
+                          self.compression_type: "ZLIB"})
+      for j in range(self._num_files):
+        for i in range(self._num_records):
+          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.get_next)
+
+  def testReadGzipFiles(self):
+    gzip_files = []
+    for i, fn in enumerate(self.test_filenames):
+      with open(fn, "rb") as f:
+        gzfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
+        with gzip.GzipFile(gzfn, "wb") as gzf:
+          gzf.write(f.read())
+        gzip_files.append(gzfn)
+
+    with self.test_session() as sess:
+      sess.run(self.init_op,
+               feed_dict={self.filenames: gzip_files,
+                          self.compression_type: "GZIP"})
+      for j in range(self._num_files):
+        for i in range(self._num_records):
+          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(self.get_next)
+
+
+class ReadBatchFeaturesTest(test.TestCase):
+
+  def setUp(self):
+    super(ReadBatchFeaturesTest, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+    self.test_filenames = self._createFiles()
+
+  def _read_batch_features(self, filenames, num_epochs, batch_size):
+    self.filenames = filenames
+    self.num_epochs = num_epochs
+    self.batch_size = batch_size
+
+    return dataset_ops.read_batch_features(
+        file_pattern=self.filenames,
+        batch_size=self.batch_size,
+        features={
+            "file": parsing_ops.FixedLenFeature([], dtypes.int64),
+            "record": parsing_ops.FixedLenFeature([], dtypes.int64),
+            "keywords": parsing_ops.VarLenFeature(dtypes.string)
+        },
+        reader=dataset_ops.TFRecordDataset,
+        randomize_input=False,
+        num_epochs=self.num_epochs)
+
+  def _record(self, f, r):
+    example = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            "file":
+                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                    value=[f])),
+            "record":
+                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                    value=[r])),
+            "keywords":
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=self._get_keywords(f, r)))
+        }))
+    return example.SerializeToString()
+
+  def _get_keywords(self, f, r):
+    num_keywords = 1 + (f + r) % 2
+    keywords = []
+    for index in range(num_keywords):
+      keywords.append(compat.as_bytes("keyword%d" % index))
+    return keywords
+
+  def _createFiles(self):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
+      filenames.append(fn)
+      writer = python_io.TFRecordWriter(fn)
+      for j in range(self._num_records):
+        writer.write(self._record(i, j))
+      writer.close()
+    return filenames
+
+  def _next_actual_batch(self, sess):
+    file_op = self.outputs["file"]
+    keywords_indices_op = self.outputs["keywords"].indices
+    keywords_values_op = self.outputs["keywords"].values
+    keywords_dense_shape_op = self.outputs["keywords"].dense_shape
+    record_op = self.outputs["record"]
+    return sess.run([
+        file_op, keywords_indices_op, keywords_values_op,
+        keywords_dense_shape_op, record_op
+    ])
+
+  def _next_expected_batch(self, file_indices, batch_size, num_epochs):
+
+    def _next_record(file_indices):
+      for j in file_indices:
+        for i in range(self._num_records):
+          yield j, i
+
+    file_batch = []
+    keywords_batch_indices = []
+    keywords_batch_values = []
+    keywords_batch_max_len = 0
+    record_batch = []
+    batch_index = 0
+    for _ in range(num_epochs):
+      for record in _next_record(file_indices):
+        f = record[0]
+        r = record[1]
+        file_batch.append(f)
+        record_batch.append(r)
+        keywords = self._get_keywords(f, r)
+        keywords_batch_values.extend(keywords)
+        keywords_batch_indices.extend([[batch_index, i]
+                                       for i in range(len(keywords))])
+        batch_index += 1
+        keywords_batch_max_len = max(keywords_batch_max_len, len(keywords))
+        if len(file_batch) == batch_size:
+          yield [
+              file_batch, keywords_batch_indices, keywords_batch_values,
+              [batch_size, keywords_batch_max_len], record_batch
+          ]
+          file_batch = []
+          keywords_batch_indices = []
+          keywords_batch_values = []
+          keywords_batch_max_len = 0
+          record_batch = []
+          batch_index = 0
+    if file_batch:
+      yield [
+          file_batch, keywords_batch_indices, keywords_batch_values,
+          [len(file_batch), keywords_batch_max_len], record_batch
+      ]
+
+  def _verify_records(self, sess, batch_size, file_index=None, num_epochs=1):
+    if file_index is not None:
+      file_indices = [file_index]
+    else:
+      file_indices = range(self._num_files)
+
+    for expected_batch in self._next_expected_batch(file_indices, batch_size,
+                                                    num_epochs):
+      actual_batch = self._next_actual_batch(sess)
+      for i in range(len(expected_batch)):
+        self.assertAllEqual(expected_batch[i], actual_batch[i])
+
+  def testRead(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 10]:
+        with ops.Graph().as_default():
+          with self.test_session(graph=ops.get_default_graph()) as sess:
+            # Basic test: read from file 0.
+            self.outputs = self._read_batch_features(
+                filenames=self.test_filenames[0],
+                num_epochs=num_epochs,
+                batch_size=batch_size)
+            self._verify_records(sess, batch_size, 0, num_epochs=num_epochs)
+            with self.assertRaises(errors.OutOfRangeError):
+              self._next_actual_batch(sess)
+
+        with ops.Graph().as_default():
+          with self.test_session(graph=ops.get_default_graph()) as sess:
+            # Basic test: read from file 1.
+            self.outputs = self._read_batch_features(
+                filenames=self.test_filenames[1],
+                num_epochs=num_epochs,
+                batch_size=batch_size)
+            self._verify_records(sess, batch_size, 1, num_epochs=num_epochs)
+            with self.assertRaises(errors.OutOfRangeError):
+              self._next_actual_batch(sess)
+
+        with ops.Graph().as_default():
+          with self.test_session(graph=ops.get_default_graph()) as sess:
+            # Basic test: read from both files.
+            self.outputs = self._read_batch_features(
+                filenames=self.test_filenames,
+                num_epochs=num_epochs,
+                batch_size=batch_size)
+            self._verify_records(sess, batch_size, num_epochs=num_epochs)
+            with self.assertRaises(errors.OutOfRangeError):
+              self._next_actual_batch(sess)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb66acdcace0ced032f65f7c8d82e5e2b3a2cdac
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py
@@ -0,0 +1,79 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class ResampleTest(test.TestCase):
+
+  def testInitialKnownDistribution(self):
+    self._testDistribution(initial_known=True)
+
+  def testInitialNotKnownDistribution(self):
+    self._testDistribution(initial_known=False)
+
+  def _testDistribution(self, initial_known):
+    classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
+    target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
+    initial_dist = [0.2] * 5 if initial_known else None
+    iterator = dataset_ops.Iterator.from_dataset(
+        dataset_ops.rejection_resample(
+            (dataset_ops.Dataset.from_tensor_slices(classes)
+             .shuffle(200, seed=21)
+             .map(lambda c: (c, string_ops.as_string(c)))),
+            target_dist=target_dist,
+            initial_dist=initial_dist,
+            class_func=lambda c, _: c,
+            seed=27))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    variable_init_op = variables.global_variables_initializer()
+
+    with self.test_session() as sess:
+      sess.run(variable_init_op)
+      sess.run(init_op)
+      returned = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          returned.append(sess.run(get_next))
+
+    returned_classes, returned_classes_and_data = zip(*returned)
+    _, returned_data = zip(*returned_classes_and_data)
+    self.assertAllEqual([compat.as_bytes(str(c))
+                         for c in returned_classes], returned_data)
+    total_returned = len(returned_classes)
+    # Subsampling rejects a large percentage of the initial data in
+    # this case.
+    self.assertGreater(total_returned, 20000 * 0.2)
+    class_counts = np.array([
+        len([True for v in returned_classes if v == c])
+        for c in range(5)])
+    returned_dist = class_counts / total_returned
+    self.assertAllClose(target_dist, returned_dist, atol=1e-2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6362b5e450a1670ee86a21630d52295ded5957a5
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -0,0 +1,211 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class SequenceDatasetTest(test.TestCase):
+
+  def testRepeatTensorDataset(self):
+    """Test a dataset that repeats its input multiple times."""
+    components = [np.array(1), np.array([1, 2, 3]), np.array(37.0)]
+    # This placeholder can be fed when dataset-definition subgraph
+    # runs (i.e. `init_op` below) to configure the number of
+    # repetitions used in a particular iterator.
+    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+
+    iterator = (dataset_ops.Dataset.from_tensors(components)
+                .repeat(count_placeholder).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      # Test a finite repetition.
+      sess.run(init_op, feed_dict={count_placeholder: 3})
+      for _ in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component, result_component)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test a different finite repetition.
+      sess.run(init_op, feed_dict={count_placeholder: 7})
+      for _ in range(7):
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test an empty repetition.
+      sess.run(init_op, feed_dict={count_placeholder: 0})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test an infinite repetition.
+      # NOTE(mrry): There's not a good way to test that the sequence
+      # actually is infinite.
+      sess.run(init_op, feed_dict={count_placeholder: -1})
+      for _ in range(17):
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component, result_component)
+
+  def testTakeTensorDataset(self):
+    components = [np.arange(10)]
+    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
+                .take(count_placeholder).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      # Take fewer than input size
+      sess.run(init_op, feed_dict={count_placeholder: 4})
+      for i in range(4):
+        results = sess.run(get_next)
+        self.assertAllEqual(results, components[0][i:i+1])
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Take more than input size
+      sess.run(init_op, feed_dict={count_placeholder: 25})
+      for i in range(10):
+        results = sess.run(get_next)
+        self.assertAllEqual(results, components[0][i:i+1])
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Take all of input
+      sess.run(init_op, feed_dict={count_placeholder: -1})
+      for i in range(10):
+        results = sess.run(get_next)
+        self.assertAllEqual(results, components[0][i:i+1])
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Take nothing
+      sess.run(init_op, feed_dict={count_placeholder: 0})
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSkipTensorDataset(self):
+    components = [np.arange(10)]
+    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
+                .skip(count_placeholder).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      # Skip fewer than input size, we should skip
+      # the first 4 elements and then read the rest.
+      sess.run(init_op, feed_dict={count_placeholder: 4})
+      for i in range(4, 10):
+        results = sess.run(get_next)
+        self.assertAllEqual(results, components[0][i:i+1])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Skip more than input size: get nothing.
+      sess.run(init_op, feed_dict={count_placeholder: 25})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Skip exactly input size.
+      sess.run(init_op, feed_dict={count_placeholder: 10})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Set -1 for 'count': skip the entire dataset.
+      sess.run(init_op, feed_dict={count_placeholder: -1})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Skip nothing
+      sess.run(init_op, feed_dict={count_placeholder: 0})
+      for i in range(0, 10):
+        results = sess.run(get_next)
+        self.assertAllEqual(results, components[0][i:i+1])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testRepeatRepeatTensorDataset(self):
+    """Test the composition of repeat datasets."""
+    components = [np.array(1), np.array([1, 2, 3]), np.array(37.0)]
+    inner_count = array_ops.placeholder(dtypes.int64, shape=[])
+    outer_count = array_ops.placeholder(dtypes.int64, shape=[])
+
+    iterator = (dataset_ops.Dataset.from_tensors(components).repeat(inner_count)
+                .repeat(outer_count).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape for c in components],
+                     [t.shape for t in get_next])
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={inner_count: 7, outer_count: 14})
+      for _ in range(7 * 14):
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testRepeatEmptyDataset(self):
+    """Test that repeating an empty dataset does not hang."""
+    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10).skip(10)
+                .repeat(-1).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaisesRegexp(
+          errors.OutOfRangeError,
+          "Attempted to repeat an empty dataset infinitely."):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8048e4f87edf218f716e16c534162023b253d9e2
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
@@ -0,0 +1,152 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ShuffleDatasetTest(test.TestCase):
+
+  def testShuffleDataset(self):
+    components = [
+        np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+        np.array([9.0, 10.0, 11.0, 12.0])
+    ]
+    count_placeholder = array_ops.placeholder_with_default(
+        constant_op.constant(5, dtypes.int64), shape=[])
+    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
+                      .repeat(count_placeholder))
+
+    shuffle_dataset = repeat_dataset.shuffle(buffer_size_placeholder,
+                                             seed_placeholder)
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     shuffle_dataset.output_shapes)
+
+    # Create initialization ops for iterators without and with
+    # shuffling, respectively.
+    iterator = dataset_ops.Iterator.from_structure(
+        shuffle_dataset.output_types, shuffle_dataset.output_shapes)
+    init_fifo_op = iterator.make_initializer(repeat_dataset)
+    init_shuffle_op = iterator.make_initializer(shuffle_dataset)
+
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # First run without shuffling to collect the "ground truth".
+      sess.run(init_fifo_op)
+      unshuffled_elements = []
+      for _ in range(20):
+        unshuffled_elements.append(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Assert that the shuffled dataset has the same elements as the
+      # "ground truth".
+      sess.run(
+          init_shuffle_op,
+          feed_dict={buffer_size_placeholder: 100,
+                     seed_placeholder: 37})
+      shuffled_elements = []
+      for _ in range(20):
+        shuffled_elements.append(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      self.assertAllEqual(
+          sorted(unshuffled_elements), sorted(shuffled_elements))
+
+      # Assert that shuffling twice with the same seeds gives the same sequence.
+      sess.run(
+          init_shuffle_op,
+          feed_dict={buffer_size_placeholder: 100,
+                     seed_placeholder: 37})
+      reshuffled_elements_same_seed = []
+      for _ in range(20):
+        reshuffled_elements_same_seed.append(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      self.assertEqual(shuffled_elements, reshuffled_elements_same_seed)
+
+      # Assert that shuffling twice with a different seed gives a different
+      # permutation of the same elements.
+      sess.run(
+          init_shuffle_op,
+          feed_dict={buffer_size_placeholder: 100,
+                     seed_placeholder: 1037})
+      reshuffled_elements_different_seed = []
+      for _ in range(20):
+        reshuffled_elements_different_seed.append(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      self.assertNotEqual(shuffled_elements, reshuffled_elements_different_seed)
+      self.assertAllEqual(
+          sorted(shuffled_elements), sorted(reshuffled_elements_different_seed))
+
+      # Assert that the shuffled dataset has the same elements as the
+      # "ground truth" when the buffer size is smaller than the input
+      # dataset.
+      sess.run(
+          init_shuffle_op,
+          feed_dict={buffer_size_placeholder: 2,
+                     seed_placeholder: 37})
+      reshuffled_elements_small_buffer = []
+      for _ in range(20):
+        reshuffled_elements_small_buffer.append(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      self.assertAllEqual(
+          sorted(unshuffled_elements), sorted(reshuffled_elements_small_buffer))
+
+      # Test the case of shuffling an empty dataset.
+      sess.run(init_shuffle_op, feed_dict={buffer_size_placeholder: 2,
+                                           seed_placeholder: 37,
+                                           count_placeholder: 0})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testDefaultArguments(self):
+    components = np.array([0, 1, 2, 3, 4])
+    iterator = (dataset_ops.Dataset.from_tensor_slices(components).shuffle(5)
+                .repeat().make_one_shot_iterator())
+
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      counts = collections.defaultdict(lambda: 0)
+      for _ in range(10):
+        for _ in range(5):
+          counts[sess.run(get_next)] += 1
+
+    for i in range(5):
+      self.assertEqual(10, counts[i])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c47f072361c460feab7e7b33d8246c11e4af95ac
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
@@ -0,0 +1,114 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ZipDatasetTest(test.TestCase):
+
+  def testZipDataset(self):
+    component_placeholders = [
+        array_ops.placeholder(dtypes.int64),
+        array_ops.placeholder(dtypes.int64),
+        array_ops.placeholder(dtypes.float64)
+    ]
+
+    datasets = [
+        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
+        for component_placeholder in component_placeholders
+    ]
+    zipped = dataset_ops.Dataset.zip(datasets)
+
+    iterator = zipped.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      equal_length_components = [
+          np.tile(np.array([[1], [2], [3], [4]]), 20),
+          np.tile(np.array([[12], [13], [14], [15]]), 22),
+          np.array([37.0, 38.0, 39.0, 40.0])
+      ]
+      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
+          component_placeholders, equal_length_components)})
+      for i in range(4):
+        results = sess.run(get_next)
+        for component, result_component in zip(
+            equal_length_components, results):
+          self.assertAllEqual(component[i], result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      variable_length_components = [[1, 2, 3, 4], [1, 2, 3, 4, 5], [1.0, 2.0]]
+      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
+          component_placeholders, variable_length_components)})
+      for i in range(2):
+        results = sess.run(get_next)
+        for component, result_component in zip(
+            variable_length_components, results):
+          self.assertAllEqual(component[i], result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testNestedZipDataset(self):
+    component_placeholders = [
+        array_ops.placeholder(dtypes.int64, shape=[4, 20]),
+        array_ops.placeholder(dtypes.int64, shape=[4, 22]),
+        array_ops.placeholder(dtypes.float64, shape=[4])
+    ]
+
+    datasets = [
+        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
+        for component_placeholder in component_placeholders
+    ]
+    zipped = dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
+
+    iterator = zipped.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([20], get_next[0].shape)
+    self.assertEqual([22], get_next[1][0].shape)
+    self.assertEqual([], get_next[1][1].shape)
+
+    with self.test_session() as sess:
+      equal_length_components = [
+          np.tile(np.array([[1], [2], [3], [4]]), 20),
+          np.tile(np.array([[12], [13], [14], [15]]), 22),
+          np.array([37.0, 38.0, 39.0, 40.0])
+      ]
+      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
+          component_placeholders, equal_length_components)})
+      for i in range(4):
+        result1, (result2, result3) = sess.run(get_next)
+        self.assertAllEqual(equal_length_components[0][i], result1)
+        self.assertAllEqual(equal_length_components[1][i], result2)
+        self.assertAllEqual(equal_length_components[2][i], result3)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..489ec879a3e87531ceede8d4da2d3198b411969d
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -0,0 +1,31 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "dataset_ops",
+    srcs = ["dataset_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/framework:function",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae1a7dd56871b2252891e824bc7410c867e0bdf2
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/dataset_ops.py
@@ -0,0 +1,1902 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrappers for Datasets and Iterators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import numpy as np
+
+from tensorflow.contrib.data.python.framework import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.util import nest
+
+
+class Iterator(object):
+  """Represents the state of iterating through a `Dataset`."""
+
+  def __init__(self, iterator_resource, initializer, output_types,
+               output_shapes):
+    """Creates a new iterator from the given iterator resource.
+
+    NOTE(mrry): Most users will not call this initializer directly, and will
+    instead use `Iterator.from_dataset()` or `Dataset.make_one_shot_iterator()`.
+
+    Args:
+      iterator_resource: A `tf.resource` scalar `tf.Tensor` representing the
+        iterator.
+      initializer: A `tf.Operation` that should be run to initialize this
+        iterator.
+      output_types: A nested structure of `tf.DType` objects corresponding to
+        each component of an element of this iterator.
+      output_shapes: A nested structure of `tf.TensorShape` objects
+        corresponding to each component of an element of this dataset.
+    """
+    self._iterator_resource = iterator_resource
+    self._initializer = initializer
+    self._output_types = output_types
+    self._output_shapes = output_shapes
+
+  @staticmethod
+  def from_dataset(dataset, shared_name=None):
+    """Creates a new, uninitialized `Iterator` from the given `Dataset`.
+
+    To initialize this iterator, you must run its `initializer`:
+
+    ```python
+    dataset = ...
+    iterator = Iterator.from_dataset(dataset)
+    # ...
+    sess.run(iterator.initializer)
+    ```
+
+    Args:
+      dataset: A `Dataset` object.
+      shared_name: (Optional.) If non-empty, this iterator will be shared under
+        the given name across multiple sessions that share the same devices
+        (e.g. when using a remote server).
+
+    Returns:
+      An `Iterator`.
+    """
+    if shared_name is None:
+      shared_name = ""
+    iterator_resource = gen_dataset_ops.iterator(
+        container="",
+        shared_name=shared_name,
+        output_types=nest.flatten(dataset.output_types),
+        output_shapes=nest.flatten(dataset.output_shapes))
+    initializer = gen_dataset_ops.make_iterator(dataset.make_dataset_resource(),
+                                                iterator_resource)
+    return Iterator(iterator_resource, initializer, dataset.output_types,
+                    dataset.output_shapes)
+
+  @staticmethod
+  def from_structure(output_types, output_shapes=None, shared_name=None):
+    """Creates a new, uninitialized `Iterator` with the given structure.
+
+    This iterator-constructing method can be used to create an iterator that
+    is reusable with many different datasets.
+
+    The returned iterator is not bound to a particular dataset, and it has
+    no `initializer`. To initialize the iterator, run the operation returned by
+    `Iterator.make_initializer(dataset)`.
+
+    The following is an example
+
+    ```python
+    iterator = Iterator.from_structure(tf.int64, tf.TensorShape([]))
+
+    dataset_range = Dataset.range(10)
+    range_initializer = iterator.make_initializer(dataset_range)
+
+    dataset_evens = dataset_range.filter(lambda x: x % 2 == 0)
+    evens_initializer = iterator.make_initializer(dataset_evens)
+
+    # Define a model based on the iterator; in this example, the model_fn
+    # is expected to take scalar tf.int64 Tensors as input (see
+    # the definition of 'iterator' above).
+    prediction, loss = model_fn(iterator.get_next())
+
+    # Train for `num_epochs`, where for each epoch, we first iterate over
+    # dataset_range, and then iterate over dataset_evens.
+    for _ in range(num_epochs):
+      # Initialize the iterator to `dataset_range`
+      sess.run(range_initializer)
+      while True:
+        try:
+          pred, loss_val = sess.run([prediction, loss])
+        except tf.errors.OutOfRangeError:
+          break
+
+      # Initialize the iterator to `dataset_evens`
+      sess.run(evens_initializer)
+      while True:
+        try:
+          pred, loss_val = sess.run([prediction, loss])
+        except tf.errors.OutOfRangeError:
+          break
+    ```
+
+    Args:
+      output_types: A nested structure of `tf.DType` objects corresponding to
+        each component of an element of this iterator.
+      output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects
+        corresponding to each component of an element of this dataset. If
+        omitted, each component will have an unconstrainted shape.
+      shared_name: (Optional.) If non-empty, this iterator will be shared under
+        the given name across multiple sessions that share the same devices
+        (e.g. when using a remote server).
+
+    Returns:
+      An `Iterator`.
+
+    Raises:
+      TypeError: If the structures of `output_shapes` and `output_types` are
+        not the same.
+    """
+    output_types = nest.map_structure(dtypes.as_dtype, output_types)
+    if output_shapes is None:
+      output_shapes = nest.map_structure(
+          lambda _: tensor_shape.TensorShape(None), output_types)
+    else:
+      output_shapes = nest.map_structure_up_to(
+          output_types, tensor_shape.as_shape, output_shapes)
+    nest.assert_same_structure(output_types, output_shapes)
+    if shared_name is None:
+      shared_name = ""
+    iterator_resource = gen_dataset_ops.iterator(
+        container="",
+        shared_name=shared_name,
+        output_types=nest.flatten(output_types),
+        output_shapes=nest.flatten(output_shapes))
+    return Iterator(iterator_resource, None, output_types, output_shapes)
+
+  @property
+  def initializer(self):
+    """A `tf.Operation` that should be run to initialize this iterator.
+
+    Returns:
+      A `tf.Operation` that should be run to initialize this iterator
+
+    Raises:
+      ValueError: If this iterator initializes itself automatically.
+    """
+    if self._initializer is not None:
+      return self._initializer
+    else:
+      # TODO(mrry): Consider whether one-shot iterators should have
+      # initializers that simply reset their state to the beginning.
+      raise ValueError("Iterator does not have an initializer.")
+
+  def make_initializer(self, dataset):
+    """Returns a `tf.Operation` that initializes this iterator on `dataset`.
+
+    Args:
+      dataset: A `Dataset` with compatible structure to this iterator.
+
+    Returns:
+      A `tf.Operation` that can be run to initialize this iterator on the given
+      `dataset`.
+
+    Raises:
+      TypeError: If `dataset` and this iterator do not have a compatible
+        element structure.
+    """
+    nest.assert_same_structure(self._output_types, dataset.output_types)
+    nest.assert_same_structure(self._output_shapes, dataset.output_shapes)
+    for iterator_dtype, dataset_dtype in zip(
+        nest.flatten(self._output_types), nest.flatten(dataset.output_types)):
+      if iterator_dtype != dataset_dtype:
+        raise TypeError(
+            "Expected output types %r but got dataset with output types %r." %
+            (self._output_types, dataset.output_types))
+    for iterator_shape, dataset_shape in zip(
+        nest.flatten(self._output_shapes), nest.flatten(dataset.output_shapes)):
+      if not iterator_shape.is_compatible_with(dataset_shape):
+        raise TypeError("Expected output shapes compatible with %r but got "
+                        "dataset with output shapes %r." %
+                        (self._output_shapes, dataset.output_shapes))
+    return gen_dataset_ops.make_iterator(dataset.make_dataset_resource(),
+                                         self._iterator_resource)
+
+  def get_next(self, name=None):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+
+    Args:
+      name: (Optional.) A name for the created operation.
+
+    Returns:
+      A nested structure of `tf.Tensor` objects.
+    """
+    return nest.pack_sequence_as(
+        self._output_types,
+        gen_dataset_ops.iterator_get_next(
+            self._iterator_resource,
+            output_types=nest.flatten(self._output_types),
+            output_shapes=nest.flatten(self._output_shapes),
+            name=name))
+
+  def dispose_op(self, name=None):
+    """Returns a `tf.Operation` that destroys this iterator.
+
+    The returned operation may be used to release any resources consumed by
+    this iterator without closing the session.
+
+    Args:
+      name: (Optional.) A name for the created operation.
+
+    Returns:
+      A `tf.Operation`.
+    """
+    return gen_dataset_ops.iterator_dispose(self._iterator_resource, name=name)
+
+  @property
+  def output_shapes(self):
+    """Returns the shape of each component of an element of this iterator.
+
+    Returns:
+      A nested structure of `tf.TensorShape` objects corresponding to each
+      component of an element of this iterator.
+    """
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    """Returns the type of each component of an element of this iterator.
+
+    Returns:
+      A nested structure of `tf.DType` objects corresponding to each component
+      of an element of this iterator.
+    """
+    return self._output_types
+
+
+def _calculate_acceptance_probs(initial_probs, target_probs):
+  """Calculate the per-class acceptance rates.
+
+  Args:
+    initial_probs: The class probabilities of the data.
+    target_probs: The desired class proportion in minibatches.
+  Returns:
+    A list of the per-class acceptance probabilities.
+
+  This method is based on solving the following analysis:
+
+  Let F be the probability of a rejection (on any example).
+  Let p_i be the proportion of examples in the data in class i (init_probs)
+  Let a_i is the rate the rejection sampler should *accept* class i
+  Let t_i is the target proportion in the minibatches for class i (target_probs)
+
+  ```
+  F = sum_i(p_i * (1-a_i))
+    = 1 - sum_i(p_i * a_i)     using sum_i(p_i) = 1
+  ```
+
+  An example with class `i` will be accepted if `k` rejections occur, then an
+  example with class `i` is seen by the rejector, and it is accepted. This can
+  be written as follows:
+
+  ```
+  t_i = sum_k=0^inf(F^k * p_i * a_i)
+      = p_i * a_j / (1 - F)    using geometric series identity, since 0 <= F < 1
+      = p_i * a_i / sum_j(p_j * a_j)        using F from above
+  ```
+
+  Note that the following constraints hold:
+  ```
+  0 <= p_i <= 1, sum_i(p_i) = 1
+  0 <= a_i <= 1
+  0 <= t_i <= 1, sum_i(t_i) = 1
+  ```
+
+
+  A solution for a_i in terms of the other variabes is the following:
+    ```a_i = (t_i / p_i) / max_i[t_i / p_i]```
+  """
+  # Add tiny to initial_probs to avoid divide by zero.
+  denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny)
+  ratio_l = target_probs / denom
+
+  # Calculate list of acceptance probabilities.
+  max_ratio = math_ops.reduce_max(ratio_l)
+  return ratio_l / max_ratio
+
+
+def _estimate_data_distribution(c, num_examples_per_class_seen):
+  """Estimate data distribution as labels are seen.
+
+  Args:
+    c: The class labels.  Type `int32`, shape `[batch_size]`.
+    num_examples_per_class_seen: A `ResourceVariable` containing counts.
+      Type `int64`, shape `[num_classes]`.
+
+  Returns:
+    dist: The updated distribution.  Type `float32`, shape `[num_classes]`.
+  """
+  num_classes = num_examples_per_class_seen.get_shape()[0].value
+  # Update the class-count based on what labels are seen in
+  # batch.  But do this asynchronously to avoid performing a
+  # cross-device round-trip.  Just use the cached value.
+  num_examples_per_class_seen = num_examples_per_class_seen.assign_add(
+      math_ops.reduce_sum(
+          array_ops.one_hot(c, num_classes, dtype=dtypes.int64),
+          0))
+  init_prob_estimate = math_ops.truediv(
+      num_examples_per_class_seen,
+      math_ops.reduce_sum(num_examples_per_class_seen))
+  return math_ops.cast(init_prob_estimate, dtypes.float32)
+
+
+class Dataset(object):
+  """Represents a potentially large set of elements.
+
+  A `Dataset` can be used to represent an input pipeline as a
+  collection of elements (nested structures of tensors) and a "logical
+  plan" of transformations that act on those elements.
+  """
+  __metaclass__ = abc.ABCMeta
+
+  def __init__(self):
+    pass
+
+  @abc.abstractmethod
+  def make_dataset_resource(self):
+    """Creates a `tf.Tensor` of  `tf.resource` tensor representing this dataset.
+
+    Returns:
+      A scalar `tf.Tensor` of `tf.resource` type, which represents this dataset.
+    """
+    raise NotImplementedError("Dataset.make_dataset_resource")
+
+  def make_initializable_iterator(self, shared_name=None):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    **N.B.** The returned iterator will be in an uninitialized state,
+    and you must run the `iterator.initializer` operation before using it.
+
+    Args:
+      shared_name: (Optional.) If non-empty, this iterator will be shared under
+        the given name across multiple sessions that share the same devices
+        (e.g. when using a remote server).
+
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+    """
+    return Iterator.from_dataset(self, shared_name)
+
+  def make_one_shot_iterator(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    **N.B.** The returned iterator will be initialized automatically.
+    A "one-shot" iterator does not currently support re-initialization.
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+    """
+    # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
+    # a 0-argument function.
+    @function.Defun(capture_by_value=True)
+    def _make_dataset():
+      return self.make_dataset_resource()
+
+    _make_dataset.add_to_graph(ops.get_default_graph())
+
+    return Iterator(
+        gen_dataset_ops.one_shot_iterator(
+            dataset_factory=_make_dataset,
+            output_types=nest.flatten(self.output_types),
+            output_shapes=nest.flatten(self.output_shapes)), None,
+        self.output_types, self.output_shapes)
+
+  @abc.abstractproperty
+  def output_shapes(self):
+    """Returns the shape of each component of an element of this dataset.
+
+    Returns:
+      A nested structure of `tf.TensorShape` objects corresponding to each
+      component of an element of this dataset.
+    """
+    raise NotImplementedError("Dataset.output_shapes")
+
+  @abc.abstractproperty
+  def output_types(self):
+    """Returns the type of each component of an element of this dataset.
+
+    Returns:
+      A nested structure of `tf.DType` objects corresponding to each component
+      of an element of this dataset.
+    """
+    raise NotImplementedError("Dataset.output_types")
+
+  def __repr__(self):
+    output_shapes = nest.map_structure(str, self.output_shapes)
+    output_shapes = str(output_shapes).replace("'", "")
+    output_types = nest.map_structure(repr, self.output_types)
+    output_types = str(output_types).replace("'", "")
+    return ("<%s shapes: %s, types: %s>"
+            % (type(self).__name__, output_shapes, output_types))
+
+  @staticmethod
+  def from_tensors(tensors):
+    """Creates a `Dataset` with a single element, comprising the given tensors.
+
+    Args:
+      tensors: A nested structure of tensors.
+
+    Returns:
+      A `Dataset`.
+    """
+    return TensorDataset(tensors)
+
+  @staticmethod
+  def from_tensor_slices(tensors):
+    """Creates a `Dataset` whose elements are slices of the given tensors.
+
+    Args:
+      tensors: A nested structure of tensors, each having the same size in the
+        0th dimension.
+
+    Returns:
+      A `Dataset`.
+    """
+    return TensorSliceDataset(tensors)
+
+  @staticmethod
+  def from_sparse_tensor_slices(sparse_tensor):
+    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
+
+    Args:
+      sparse_tensor: A `tf.SparseTensor`.
+
+    Returns:
+      A `Dataset` of rank-(N-1) sparse tensors.
+    """
+    return SparseTensorSliceDataset(sparse_tensor)
+
+  @staticmethod
+  def range(*args):
+    """Creates a `Dataset` of a step-separated range of values.
+
+    For example:
+
+    ```python
+    Dataset.range(5) == [0, 1, 2, 3, 4]
+    Dataset.range(2, 5) == [2, 3, 4]
+    Dataset.range(1, 5, 2) == [1, 3]
+    Dataset.range(1, 5, -2) == []
+    Dataset.range(5, 1) == []
+    Dataset.range(5, 1, -2) == [5, 3]
+    ```
+
+    Args:
+      *args: follow same semantics as python's xrange.
+        len(args) == 1 -> start = 0, stop = args[0], step = 1
+        len(args) == 2 -> start = args[0], stop = args[1], step = 1
+        len(args) == 3 -> start = args[0], stop = args[1, stop = args[2]
+
+    Returns:
+      A `RangeDataset`.
+
+    Raises:
+      ValueError: if len(args) == 0.
+    """
+    return RangeDataset(*args)
+
+  @staticmethod
+  def zip(datasets):
+    """Creates a `Dataset` by zipping together the given datasets.
+
+    This method has similar semantics to the built-in `zip()` function
+    in Python, with the main difference being that the `datasets`
+    argument can be an arbitrary nested structure of `Dataset` objects.
+    For example:
+
+    ```python
+    # NOTE: The following examples use `{ ... }` to represent the
+    # contents of a dataset.
+    a = { 1, 2, 3 }
+    b = { 4, 5, 6 }
+    c = { (7, 8), (9, 10), (11, 12) }
+    d = { 13, 14 }
+
+    # The nested structure of the `datasets` argument determines the
+    # structure of elements in the resulting dataset.
+    Dataset.zip((a, b)) == { (1, 4), (2, 5), (3, 6) }
+    Dataset.zip((b, a)) == { (4, 1), (5, 2), (6, 3) }
+
+    # The `datasets` argument may contain an arbitrary number of
+    # datasets.
+    Dataset.zip((a, b, c) == { (1, 4, (7, 8)),
+                               (2, 5, (9, 10)),
+                               (3, 6, (11, 12)) }
+
+    # The number of elements in the resulting dataset is the same as
+    # the size of the smallest dataset in `datasets`.
+    Dataset.zip((a, d)) == { (1, 13), (2, 14) }
+    ```
+
+    Args:
+      datasets: A nested structure of datasets.
+
+    Returns:
+      A `Dataset`.
+    """
+    return ZipDataset(datasets)
+
+  @staticmethod
+  def read_batch_features(file_pattern,
+                          batch_size,
+                          features,
+                          reader,
+                          reader_args=None,
+                          randomize_input=True,
+                          num_epochs=None,
+                          capacity=10000):
+    """Reads batches of Examples.
+
+    Args:
+      file_pattern: A string pattern or a placeholder with list of filenames.
+      batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        consecutive elements of this dataset to combine in a single batch.
+      features: A `dict` mapping feature keys to `FixedLenFeature` or
+        `VarLenFeature` values. See `tf.parse_example`.
+      reader: A function or class that can be called with a `filenames` tensor
+        and (optional) `reader_args` and returns a `Dataset` of serialized
+        Examples.
+      reader_args: Additional arguments to pass to the reader class.
+      randomize_input: Whether the input should be randomized.
+      num_epochs: Integer specifying the number of times to read through the
+        dataset. If None, cycles through the dataset forever.
+      capacity: Capacity of the ShuffleDataset.
+
+    Returns:
+      A `Dataset`.
+    """
+    if isinstance(file_pattern, str):
+      filenames = _get_file_names(file_pattern, randomize_input)
+    else:
+      filenames = file_pattern
+    if reader_args:
+      dataset = reader(filenames, *reader_args)
+    else:
+      dataset = reader(filenames)
+    dataset = dataset.repeat(num_epochs)
+    if randomize_input:
+      dataset = dataset.shuffle(capacity)
+    dataset = dataset.map(
+        lambda x: _parse_example(nest.flatten(x), features)
+    )
+    dataset = dataset.batch(batch_size)
+    return dataset
+
+  def repeat(self, count=None):
+    """Repeats this dataset `count` times.
+
+    Args:
+      count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+        number of times the elements of this dataset should be repeated. The
+        default behavior (if `count` is `None` or `-1`) is for the elements to
+        be repeated indefinitely.
+
+    Returns:
+      A `Dataset`.
+    """
+    return RepeatDataset(self, count)
+
+  def enumerate(self, start=0):
+    """Enumerate the elements of this dataset.  Similar to python's `enumerate`.
+
+    For example:
+
+    ```python
+    # NOTE: The following examples use `{ ... }` to represent the
+    # contents of a dataset.
+    a = { 1, 2, 3 }
+    b = { (7, 8), (9, 10), (11, 12) }
+
+    # The nested structure of the `datasets` argument determines the
+    # structure of elements in the resulting dataset.
+    a.enumerate(start=5) == { (5, 1), (6, 2), (7, 3) }
+    b.enumerate() == { (0, (7, 8)), (1, (9, 10)), (2, (11, 12)) }
+
+    Args:
+      start: A `tf.int64` scalar `tf.Tensor`, representing the start
+        value for enumeration.
+
+    Returns:
+      A `Dataset`.
+    """
+    max_value = np.iinfo(dtypes.int64.as_numpy_dtype).max
+    return Dataset.zip((Dataset.range(start, max_value), self))
+
+  def shuffle(self, buffer_size, seed=None):
+    """Randomly shuffles the elements of this dataset.
+
+    Args:
+      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
+        number of elements from this dataset from which the new
+        dataset will sample.
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+        random seed that will be used to create the distribution. See
+        @{tf.set_random_seed} for behavior.
+
+    Returns:
+      A `Dataset`.
+    """
+    return ShuffleDataset(self, buffer_size, seed)
+
+  def take(self, count):
+    """Creates a `Dataset` with at most `count` elements from this dataset.
+
+    Args:
+      count: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        elements of this dataset that should be taken to form the new dataset.
+        If `count` is -1, or if `count` is greater than the size of this
+        dataset, the new dataset will contain all elements of this dataset.
+
+    Returns:
+      A `Dataset`.
+    """
+    return TakeDataset(self, count)
+
+  def skip(self, count):
+    """Creates a `Dataset` that skips `count` elements from this dataset.
+
+    Args:
+      count: A `tf.int64` scalar `tf.Tensor`, representing the number
+        of elements of this dataset that should be skipped to form the
+        new dataset.  If `count` is greater than the size of this
+        dataset, the new dataset will contain no elements.  If `count`
+        is -1, skips the entire dataset.
+
+    Returns:
+      A `Dataset`.
+    """
+    return SkipDataset(self, count)
+
+  def batch(self, batch_size):
+    """Combines consecutive elements of this dataset into batches.
+
+    Args:
+      batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        consecutive elements of this dataset to combine in a single batch.
+
+    Returns:
+      A `Dataset`.
+    """
+    return BatchDataset(self, batch_size)
+
+  def padded_batch(self, batch_size, padded_shapes, padding_values=None):
+    """Combines consecutive elements of this dataset into padded batches.
+
+    Like `Dataset.dense_to_sparse_batch()`, this method combines
+    multiple consecutive elements of this dataset, which might have
+    different shapes, into a single element. The tensors in the
+    resulting element have an additional outer dimension, and are
+    padded to the respective shape in `padded_shapes`.
+
+    Args:
+      batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        consecutive elements of this dataset to combine in a single batch.
+      padded_shapes: A nested structure of `tf.TensorShape` or
+        `tf.int64` vector tensor-like objects representing the shape
+        to which the respective component of each input element should
+        be padded prior to batching. Any unknown dimensions
+        (e.g. `tf.Dimension(None)` in a `tf.TensorShape` or `-1` in a
+        tensor-like object) will be padded to the maximum size of that
+        dimension in each batch.
+      padding_values: (Optional.) A nested structure of scalar-shaped
+        `tf.Tensor`, representing the padding values to use for the
+        respective components.  Defaults are `0` for numeric types and
+        the empty string for string types.
+
+    Returns:
+      A `Dataset`.
+    """
+    return PaddedBatchDataset(self, batch_size, padded_shapes, padding_values)
+
+  def dense_to_sparse_batch(self, batch_size, row_shape):
+    """Batches ragged elements of this dataset into `tf.SparseTensor`s.
+
+    Like `Dataset.padded_batch()`, this method combines multiple
+    consecutive elements of this dataset, which might have different
+    shapes, into a single element. The resulting element has three
+    components (`indices`, `values`, and `dense_shape`), which
+    comprise a `tf.SparseTensor` that represents the same data. The
+    `row_shape` represents the dense shape of each row in the
+    resulting `tf.SparseTensor`, to which the effective batch size is
+    prepended. For example:
+
+    ```python
+    # NOTE: The following examples use `{ ... }` to represent the
+    # contents of a dataset.
+    a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] }
+
+    a.dense_to_sparse_batch(batch_size=2, row_shape=[6]) == {
+        ([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],  # indices
+         ['a', 'b', 'c', 'a', 'b'],                 # values
+         [2, 6]),                                   # dense_shape
+        ([[2, 0], [2, 1], [2, 2], [2, 3]],
+         ['a', 'b', 'c', 'd'],
+         [1, 6])
+    }
+    ```
+
+    Args:
+      batch_size: A `tf.int64` scalar `tf.Tensor`, representing the
+        number of consecutive elements of this dataset to combine in a
+        single batch.
+      row_shape: A `tf.TensorShape` or `tf.int64` vector tensor-like
+        object representing the equivalent dense shape of a row in the
+        resulting `tf.SparseTensor`. Each element of this dataset must
+        have the same rank as `row_shape`, and must have size less
+        than or equal to `row_shape` in each dimension.
+
+    Returns:
+      A `Dataset`.
+    """
+    return DenseToSparseBatchDataset(self, batch_size, row_shape)
+
+  def group_by_window(self, key_func, reduce_func, window_size):
+    """Performs a windowed "group-by" operation on this dataset.
+
+    This method maps each consecutive element in this dataset to a key
+    using `key_func` and groups the elements by key. It then applies
+    `reduce_func` to at most `window_size` elements matching the same
+    key. All execpt the final window for each key will contain
+    `window_size` elements; the final window may be smaller.
+
+    Args:
+      key_func: A function mapping a nested structure of tensors
+        (having shapes and types defined by `self.output_shapes` and
+        `self.output_types`) to a scalar `tf.int64` tensor.
+      reduce_func: A function mapping a key and a dataset of up to `batch_size`
+        consecutive elements matching that key to another dataset.
+      window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        consecutive elements matching the same key to combine in a single
+        batch, which will be passed to `reduce_func`.
+
+    Returns:
+      A `Dataset`.
+    """
+    return GroupByWindowDataset(self, key_func, reduce_func, window_size)
+
+  def map(self, map_func, num_threads=None, output_buffer_size=None):
+    """Maps `map_func` across this datset.
+
+    Args:
+      map_func: A function mapping a nested structure of tensors (having
+        shapes and types defined by `self.output_shapes` and
+       `self.output_types`) to another nested structure of tensors.
+      num_threads: (Optional.) A `tf.int32` scalar `tf.Tensor`, representing
+        the number of threads to use for processing elements in parallel. If
+        not specified, elements will be processed sequentially without
+        buffering.
+      output_buffer_size: (Optional.) A `tf.int64` scalar `tf.Tensor`,
+        representing the maximum number of processed elements that will be
+        buffered when processing in parallel.
+
+    Returns:
+      A `Dataset`.
+    """
+    return MapDataset(self, map_func, num_threads, output_buffer_size)
+
+  def flat_map(self, map_func):
+    """Maps `map_func` across this dataset and flattens the result.
+
+    Args:
+      map_func: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to a
+        `Dataset`.
+
+    Returns:
+      A `Dataset`.
+    """
+    return FlatMapDataset(self, map_func)
+
+  def unbatch(self):
+    """Splits elements of this dataset into sequences of consecutive elements.
+
+    For example, if elements of this dataset are shaped `[B, a0, a1, ...]`,
+    where `B` may vary from element to element, then for each element in
+    this dataset, the unbatched dataset will contain `B` consecutive elements
+    of shape `[a0, a1, ...]`.
+
+    Returns:
+      A `Dataset`.
+    """
+    return self.flat_map(map_func=Dataset.from_tensor_slices)
+
+  def filter(self, predicate):
+    """Filters this dataset according to `predicate`.
+
+    Args:
+      predicate: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to a
+        scalar `tf.bool` tensor.
+
+    Returns:
+      A `Dataset`.
+    """
+    return FilterDataset(self, predicate)
+
+
+class TensorDataset(Dataset):
+  """A `Dataset` with a single element, viz. a nested structure of tensors."""
+
+  def __init__(self, tensors):
+    """See `Dataset.from_tensors()` for details."""
+    super(TensorDataset, self).__init__()
+    with ops.name_scope("tensors"):
+      self._tensors = nest.pack_sequence_as(tensors, [
+          ops.convert_to_tensor(t, name="component_%d" % i)
+          for i, t in enumerate(nest.flatten(tensors))
+      ])
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.tensor_dataset(
+        nest.flatten(self._tensors),
+        output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_shapes(self):
+    return nest.pack_sequence_as(self._tensors,
+                                 [t.shape for t in nest.flatten(self._tensors)])
+
+  @property
+  def output_types(self):
+    return nest.pack_sequence_as(self._tensors,
+                                 [t.dtype for t in nest.flatten(self._tensors)])
+
+
+class TensorSliceDataset(Dataset):
+  """A `Dataset` of slices from a nested structure of tensors."""
+
+  def __init__(self, tensors):
+    """See `Dataset.from_tensor_slices()` for details."""
+    super(TensorSliceDataset, self).__init__()
+    with ops.name_scope("tensors"):
+      flat_tensors = [
+          ops.convert_to_tensor(t, name="component_%d" % i)
+          for i, t in enumerate(nest.flatten(tensors))
+      ]
+
+    self._tensors = nest.pack_sequence_as(tensors, flat_tensors)
+    batch_dim = flat_tensors[0].get_shape()[0]
+    for t in flat_tensors[1:]:
+      batch_dim.assert_is_compatible_with(t.get_shape()[0])
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.tensor_slice_dataset(
+        nest.flatten(self._tensors),
+        output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_shapes(self):
+    return nest.pack_sequence_as(self._tensors, [
+        tensor_shape.TensorShape(t.shape[1:])
+        for t in nest.flatten(self._tensors)
+    ])
+
+  @property
+  def output_types(self):
+    return nest.pack_sequence_as(self._tensors,
+                                 [t.dtype for t in nest.flatten(self._tensors)])
+
+
+class SparseTensorSliceDataset(Dataset):
+  """A `Dataset` that splits a rank-N `tf.SparseTensor` into its rows."""
+
+  def __init__(self, sparse_tensor):
+    """See `Dataset.from_sparse_tensor_slices()` for details."""
+    super(SparseTensorSliceDataset, self).__init__()
+    if not isinstance(sparse_tensor, sparse_tensor_lib.SparseTensor):
+      raise TypeError("`sparse_tensor` must be a `tf.SparseTensor` object.")
+    self._sparse_tensor = sparse_tensor
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.sparse_tensor_slice_dataset(
+        self._sparse_tensor.indices, self._sparse_tensor.values,
+        self._sparse_tensor.dense_shape)
+
+  @property
+  def output_shapes(self):
+    indices_shape = self._sparse_tensor.indices.get_shape()
+    shape_shape = self._sparse_tensor.dense_shape.get_shape()
+    rank = (indices_shape[1] - 1).merge_with(shape_shape[0] - 1)
+    num_values = tensor_shape.Dimension(None)
+    return (tensor_shape.TensorShape([num_values, rank]),
+            tensor_shape.TensorShape([num_values]), tensor_shape.TensorShape(
+                [rank]))
+
+  @property
+  def output_types(self):
+    return (dtypes.int64, self._sparse_tensor.dtype, dtypes.int64)
+
+
+class ZipDataset(Dataset):
+  """A `Dataset` that zips its inputs together."""
+
+  def __init__(self, datasets):
+    """See `Dataset.zip()` for details."""
+    super(ZipDataset, self).__init__()
+    self._datasets = datasets
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.zip_dataset(
+        [ds.make_dataset_resource() for ds in nest.flatten(self._datasets)],
+        output_shapes=[
+            s
+            for ds in nest.flatten(self._datasets)
+            for s in nest.flatten(ds.output_shapes)
+        ],
+        output_types=[
+            t
+            for ds in nest.flatten(self._datasets)
+            for t in nest.flatten(ds.output_types)
+        ])
+
+  @property
+  def output_shapes(self):
+    return nest.pack_sequence_as(self._datasets, [
+        ds.output_shapes for ds in nest.flatten(self._datasets)])
+
+  @property
+  def output_types(self):
+    return nest.pack_sequence_as(self._datasets, [
+        ds.output_types for ds in nest.flatten(self._datasets)])
+
+
+class RepeatDataset(Dataset):
+  """A `Dataset` that repeats its input several times."""
+
+  def __init__(self, input_dataset, count):
+    """See `Dataset.repeat()` for details."""
+    super(RepeatDataset, self).__init__()
+    self._input_dataset = input_dataset
+    if count is None:
+      self._count = constant_op.constant(-1, dtype=dtypes.int64, name="count")
+    else:
+      self._count = ops.convert_to_tensor(count, dtype=dtypes.int64,
+                                          name="count")
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.repeat_dataset(
+        self._input_dataset.make_dataset_resource(),
+        count=self._count,
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class RangeDataset(Dataset):
+  """A `Dataset` of a step separated range of values."""
+
+  def __init__(self, *args):
+    """See `Dataset.range()` for details."""
+    super(RangeDataset, self).__init__()
+    self._parse_args(*args)
+
+  def _parse_args(self, *args):
+    if len(args) == 1:
+      self._start = self._build_tensor(0, "start")
+      self._stop = args[0]
+      self._step = self._build_tensor(1, "step")
+    elif len(args) == 2:
+      self._start = args[0]
+      self._stop = args[1]
+      self._step = self._build_tensor(1, "step")
+    elif len(args) == 3:
+      self._start = args[0]
+      self._stop = args[1]
+      self._step = args[2]
+    else:
+      raise ValueError("Invalid arguments to RangeDataset: %s" % str(args))
+
+  def _build_tensor(self, int64_value, name):
+    return constant_op.constant(int64_value, dtype=dtypes.int64, name=name)
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.range_dataset(
+        start=self._start,
+        stop=self._stop,
+        step=self._step,
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.int64
+
+
+class ShuffleDataset(Dataset):
+  """A `Dataset` that randomly shuffles the elements of its input."""
+
+  def __init__(self, input_dataset, buffer_size, seed=None):
+    """See `Dataset.shuffle()` for details."""
+    super(ShuffleDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._buffer_size = ops.convert_to_tensor(
+        buffer_size, dtype=dtypes.int64, name="buffer_size")
+    seed, seed2 = random_seed.get_seed(seed)
+    if seed is None:
+      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
+    else:
+      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
+    if seed2 is None:
+      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+    else:
+      self._seed2 = ops.convert_to_tensor(seed2, dtype=dtypes.int64,
+                                          name="seed2")
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.shuffle_dataset(
+        self._input_dataset.make_dataset_resource(),
+        buffer_size=self._buffer_size,
+        seed=self._seed,
+        seed2=self._seed2,
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class TakeDataset(Dataset):
+  """A `Dataset` containing the first `count` elements from its input."""
+
+  def __init__(self, input_dataset, count):
+    """See `Dataset.take()` for details."""
+    super(TakeDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._count = ops.convert_to_tensor(count, dtype=dtypes.int64, name="count")
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.take_dataset(
+        self._input_dataset.make_dataset_resource(),
+        count=self._count,
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class SkipDataset(Dataset):
+  """A `Dataset` skipping the first `count` elements from its input."""
+
+  def __init__(self, input_dataset, count):
+    """See `Dataset.skip()` for details."""
+    super(SkipDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._count = ops.convert_to_tensor(count, dtype=dtypes.int64, name="count")
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.skip_dataset(
+        self._input_dataset.make_dataset_resource(),
+        count=self._count,
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class BatchDataset(Dataset):
+  """A `Dataset` that batches contiguous elements from its input."""
+
+  def __init__(self, input_dataset, batch_size):
+    """See `Dataset.batch()` for details."""
+    super(BatchDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._batch_size = batch_size
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.batch_dataset(
+        self._input_dataset.make_dataset_resource(),
+        batch_size=self._batch_size,
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    input_shapes = self._input_dataset.output_shapes
+    return nest.pack_sequence_as(input_shapes, [
+        tensor_shape.vector(None).concatenate(s)
+        for s in nest.flatten(self._input_dataset.output_shapes)
+    ])
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+def _partial_shape_to_tensor(shape_like):
+  try:
+    # First attempt to convert the input to a shape, and return the
+    # "canonical" tensor representation, which uses `-1` in place of
+    # `None`.
+    shape_like = tensor_shape.as_shape(shape_like)
+    return ops.convert_to_tensor(
+        [dim if dim is not None else -1 for dim in shape_like.as_list()],
+        dtype=dtypes.int64)
+  except (TypeError, ValueError):
+    # The argument was not trivially convertible to a
+    # `tf.TensorShape`, so fall back on the conversion to tensor
+    # machinery.
+    return ops.convert_to_tensor(shape_like, dtype=dtypes.int64)
+
+
+def _padding_value_to_tensor(value, output_type):
+  """Converts the padding value to a tensor.
+
+  Args:
+    value: The padding value.
+    output_type: Its expected dtype.
+
+  Returns:
+    A scalar `Tensor`.
+
+  Raises:
+    ValueError: if the padding value is not a scalar.
+    TypeError: if the padding value's type does not match `output_type`.
+  """
+  value = ops.convert_to_tensor(value, name="padding_value")
+  if not value.shape.is_compatible_with(tensor_shape.scalar()):
+    raise ValueError(
+        "Padding value should be a scalar, but is not: %s" % value)
+  if value.dtype != output_type:
+    raise TypeError(
+        "Padding value tensor (%s) does not match output type: %s"
+        % (value, output_type))
+  return value
+
+
+class PaddedBatchDataset(Dataset):
+  """A `Dataset` that batches and pads contiguous elements from its input."""
+
+  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
+    """See `Dataset.batch()` for details."""
+    super(PaddedBatchDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._batch_size = batch_size
+    padding_values = (padding_values if padding_values is not None else
+                      self._default_padding(input_dataset))
+    self._padded_shapes = nest.map_structure_up_to(input_dataset.output_shapes,
+                                                   _partial_shape_to_tensor,
+                                                   padded_shapes)
+    self._padding_values = nest.map_structure_up_to(input_dataset.output_shapes,
+                                                    _padding_value_to_tensor,
+                                                    padding_values,
+                                                    input_dataset.output_types)
+
+  def _default_padding(self, input_dataset):
+    def make_zero(t):
+      if t.base_dtype == dtypes.string:
+        return ""
+      else:
+        return np.zeros_like(t.as_numpy_dtype())
+    return nest.map_structure(make_zero, input_dataset.output_types)
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.padded_batch_dataset(
+        self._input_dataset.make_dataset_resource(),
+        batch_size=self._batch_size,
+        padded_shapes=[
+            ops.convert_to_tensor(s, dtype=dtypes.int64)
+            for s in nest.flatten(self._padded_shapes)
+        ],
+        padding_values=nest.flatten(self._padding_values),
+        output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_shapes(self):
+    def _padded_shape_to_batch_shape(s):
+      return tensor_shape.vector(None).concatenate(
+          tensor_util.constant_value_as_shape(s))
+    return nest.map_structure(_padded_shape_to_batch_shape, self._padded_shapes)
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class DenseToSparseBatchDataset(Dataset):
+  """A `Dataset` that batches ragged dense elements into `tf.SparseTensor`s."""
+
+  def __init__(self, input_dataset, batch_size, row_shape):
+    """See `Dataset.dense_to_sparse_batch()` for more details."""
+    super(DenseToSparseBatchDataset, self).__init__()
+    if not isinstance(input_dataset.output_types, dtypes.DType):
+      raise TypeError("DenseToSparseDataset requires an input whose elements "
+                      "have a single component, whereas the input has %r."
+                      % input_dataset.output_types)
+    self._input_dataset = input_dataset
+    self._batch_size = batch_size
+    self._row_shape = _partial_shape_to_tensor(row_shape)
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.dense_to_sparse_batch_dataset(
+        self._input_dataset.make_dataset_resource(),
+        self._batch_size,
+        self._row_shape,
+        output_shapes=self.output_shapes,
+        output_types=self.output_types)
+
+  @property
+  def output_shapes(self):
+    num_elements = tensor_shape.Dimension(None)
+    return (tensor_shape.matrix(num_elements, self._row_shape.shape[0] + 1),
+            tensor_shape.vector(num_elements),
+            tensor_shape.vector(self._row_shape.shape[0] + 1))
+
+  @property
+  def output_types(self):
+    return (dtypes.int64, self._input_dataset.output_types, dtypes.int64)
+
+
+class _ResourceDataset(Dataset):
+  """A Dataset wrapper for a tf.resource-typed function argument."""
+
+  def __init__(self, dataset_resource, output_types, output_shapes):
+    super(_ResourceDataset, self).__init__()
+    self._dataset_resource = dataset_resource,
+    self._output_types = output_types
+    self._output_shapes = output_shapes
+
+  def make_dataset_resource(self):
+    return self._dataset_resource
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+
+class GroupByWindowDataset(Dataset):
+  """A `Dataset` that groups its input and performs a windowed reduction."""
+
+  def __init__(self, input_dataset, key_func, reduce_func, window_size):
+    """See `Dataset.group_by_window()` for details."""
+    super(GroupByWindowDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._window_size = window_size
+
+    @function.Defun(*nest.flatten(input_dataset.output_types))
+    def tf_key_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the input_dataset.
+      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+        arg.set_shape(shape)
+      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
+      if nest.is_sequence(nested_args):
+        ret = key_func(*nested_args)
+      else:
+        ret = key_func(nested_args)
+      ret = ops.convert_to_tensor(ret, dtype=dtypes.int64)
+      if ret.dtype != dtypes.int64:
+        raise ValueError("`key_func` must return a single tf.int64 tensor.")
+      return ret
+
+    self._key_func = tf_key_func
+    self._key_func.add_to_graph(ops.get_default_graph())
+
+    @function.Defun(dtypes.int64, dtypes.resource)
+    def tf_reduce_func(key, window_dataset_resource):
+      """A wrapper for Defun that facilitates shape inference."""
+      key.set_shape([])
+      window_dataset = _ResourceDataset(window_dataset_resource,
+                                        input_dataset.output_types,
+                                        input_dataset.output_shapes)
+      output_dataset = reduce_func(key, window_dataset)
+      if not isinstance(output_dataset, Dataset):
+        raise TypeError("`reduce_func` must return a `Dataset` object.")
+      self._output_types = output_dataset.output_types
+      self._output_shapes = output_dataset.output_shapes
+      return output_dataset.make_dataset_resource()
+
+    self._reduce_func = tf_reduce_func
+    self._reduce_func.add_to_graph(ops.get_default_graph())
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.group_by_window_dataset(
+        self._input_dataset.make_dataset_resource(),
+        self._key_func.captured_inputs,
+        self._reduce_func.captured_inputs,
+        self._window_size,
+        key_func=self._key_func,
+        reduce_func=self._reduce_func,
+        output_types=nest.flatten(self.output_types),
+        output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+
+def _most_specific_compatible_shape(s1, s2):
+  """Returns the most specific shape compatible with `s1` and `s2`."""
+  if s1.dims is None:
+    return s1
+  if s2.dims is None:
+    return s2
+  s1.assert_same_rank(s2)
+  dims = []
+  for dim1, dim2 in zip(s1, s2):
+    if dim1.value is None or dim2.value is None or dim1.value != dim2.value:
+      dims.append(tensor_shape.Dimension(None))
+    else:
+      dims.append(dim1.value)
+  return tensor_shape.TensorShape(dims)
+
+
+class MapDataset(Dataset):
+  """A `Dataset` that maps a function over elements in its input."""
+
+  def __init__(self,
+               input_dataset,
+               map_func,
+               num_threads=None,
+               output_buffer_size=None):
+    """See `Dataset.map()` for details."""
+    super(MapDataset, self).__init__()
+    self._input_dataset = input_dataset
+
+    self._output_shapes = None
+    self._output_types = None
+
+    @function.Defun(*nest.flatten(input_dataset.output_types))
+    def tf_map_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the input_dataset.
+      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+        arg.set_shape(shape)
+
+      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
+
+      if nest.is_sequence(nested_args):
+        ret = map_func(*nested_args)
+      else:
+        ret = map_func(nested_args)
+
+      # Extract shape information from the returned values.
+      flattened_ret = [ops.convert_to_tensor(t) for t in nest.flatten(ret)]
+      self._output_shapes = nest.pack_sequence_as(
+          ret, [t.get_shape() for t in flattened_ret])
+      self._output_types = nest.pack_sequence_as(
+          ret, [t.dtype for t in flattened_ret])
+
+      return flattened_ret
+
+    self._map_func = tf_map_func
+    self._map_func.add_to_graph(ops.get_default_graph())
+    if num_threads is not None:
+      self._num_threads = ops.convert_to_tensor(
+          num_threads, dtype=dtypes.int32, name="num_threads")
+      if output_buffer_size is not None:
+        self._output_buffer_size = ops.convert_to_tensor(
+            output_buffer_size, dtype=dtypes.int64, name="output_buffer_size")
+      else:
+        self._output_buffer_size = self._num_threads
+    else:
+      self._num_threads = None
+      self._output_buffer_size = None
+
+  def make_dataset_resource(self):
+    input_resource = self._input_dataset.make_dataset_resource()
+    if self._num_threads is None:
+      return gen_dataset_ops.map_dataset(
+          input_resource,
+          self._map_func.captured_inputs,
+          f=self._map_func,
+          output_types=nest.flatten(self.output_types),
+          output_shapes=nest.flatten(self.output_shapes))
+    else:
+      return gen_dataset_ops.parallel_map_dataset(
+          input_resource,
+          self._map_func.captured_inputs,
+          f=self._map_func,
+          num_threads=self._num_threads,
+          output_buffer_size=self._output_buffer_size,
+          output_types=nest.flatten(self.output_types),
+          output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+
+class FlatMapDataset(Dataset):
+  """A `Dataset` that maps a function over its input and flattens the result."""
+
+  def __init__(self,
+               input_dataset,
+               map_func):
+    """See `Dataset.flat_map()` for details."""
+    super(FlatMapDataset, self).__init__()
+    self._input_dataset = input_dataset
+
+    @function.Defun(*nest.flatten(input_dataset.output_types))
+    def tf_map_func(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the input_dataset.
+      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+        arg.set_shape(shape)
+
+      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
+
+      if nest.is_sequence(nested_args):
+        dataset = map_func(*nested_args)
+      else:
+        dataset = map_func(nested_args)
+
+      if not isinstance(dataset, Dataset):
+        raise TypeError("`map_func` must return a `Dataset` object.")
+
+      self._output_types = dataset.output_types
+      self._output_shapes = dataset.output_shapes
+
+      return dataset.make_dataset_resource()
+
+    self._map_func = tf_map_func
+    self._map_func.add_to_graph(ops.get_default_graph())
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.flat_map_dataset(
+        self._input_dataset.make_dataset_resource(),
+        self._map_func.captured_inputs,
+        f=self._map_func,
+        output_types=nest.flatten(self.output_types),
+        output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+
+class FilterDataset(Dataset):
+  """A `Dataset` that filters its input according to a predicate function."""
+
+  def __init__(self, input_dataset, predicate):
+    """See `Dataset.filter()` for details."""
+    super(FilterDataset, self).__init__()
+    self._input_dataset = input_dataset
+
+    @function.Defun(*nest.flatten(input_dataset.output_types))
+    def tf_predicate(*args):
+      """A wrapper for Defun that facilitates shape inference."""
+      # Pass in shape information from the input_dataset.
+      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+        arg.set_shape(shape)
+
+      nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
+
+      if nest.is_sequence(nested_args):
+        ret = predicate(*nested_args)
+      else:
+        ret = predicate(nested_args)
+
+      ret = ops.convert_to_tensor(ret, dtype=dtypes.bool)
+      if not (ret.dtype == dtypes.bool and
+              ret.shape.is_compatible_with(tensor_shape.scalar())):
+        raise ValueError("`predicate` must return a scalar boolean tensor.")
+
+      return ret
+
+    self._predicate = tf_predicate
+    self._predicate.add_to_graph(ops.get_default_graph())
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.filter_dataset(
+        self._input_dataset.make_dataset_resource(),
+        other_arguments=self._predicate.captured_inputs,
+        predicate=self._predicate,
+        output_types=nest.flatten(self.output_types),
+        output_shapes=nest.flatten(self.output_shapes))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+
+class TextLineDataset(Dataset):
+  """A `Dataset` comprising lines from one or more text files."""
+
+  def __init__(self, filenames):
+    """Creates a `TextLineDataset`.
+
+    Args:
+      filenames: A `tf.string` tensor containing one or more filenames.
+    """
+    super(TextLineDataset, self).__init__()
+    self._filenames = ops.convert_to_tensor(
+        filenames, dtype=dtypes.string, name="filenames")
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.text_line_dataset(self._filenames)
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.string
+
+
+class TFRecordDataset(Dataset):
+  """A `Dataset` comprising records from one or more TFRecord files."""
+
+  def __init__(self, filenames, compression_type=None):
+    """Creates a `TFRecordDataset`.
+
+    Args:
+      filenames: A `tf.string` tensor containing one or more filenames.
+      compression_type: A `tf.string` scalar evaluating to one of `""` (no
+        compression), `"ZLIB"`, or `"GZIP"`.
+    """
+    super(TFRecordDataset, self).__init__()
+    self._filenames = ops.convert_to_tensor(filenames, name="filenames")
+    if compression_type is not None:
+      self._compression_type = ops.convert_to_tensor(
+          compression_type, dtype=dtypes.string, name="compression_type")
+    else:
+      self._compression_type = constant_op.constant("", name="compression_type")
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.tf_record_dataset(self._filenames,
+                                             self._compression_type)
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.TensorShape([])
+
+  @property
+  def output_types(self):
+    return dtypes.string
+
+
+class FixedLengthRecordDataset(Dataset):
+  """A `Dataset` of fixed-length records from one or more binary files."""
+
+  def __init__(self,
+               filenames,
+               record_bytes,
+               header_bytes=None,
+               footer_bytes=None):
+    """Creates a `FixedLengthRecordDataset`.
+
+    Args:
+      filenames: A `tf.string` tensor containing one or more filenames.
+      record_bytes: A `tf.int64` scalar representing the number of bytes in
+        each record.
+      header_bytes: (Optional.) A `tf.int64` scalar representing the number of
+        bytes to skip at the start of a file.
+      footer_bytes: (Optional.) A `tf.int64` scalar representing the number of
+        bytes to ignore at the end of a file.
+    """
+    super(FixedLengthRecordDataset, self).__init__()
+    self._filenames = ops.convert_to_tensor(
+        filenames, dtype=dtypes.string, name="filenames")
+    self._record_bytes = ops.convert_to_tensor(
+        record_bytes, dtype=dtypes.int64, name="record_bytes")
+    if header_bytes is not None:
+      self._header_bytes = ops.convert_to_tensor(
+          header_bytes, dtype=dtypes.int64, name="header_bytes")
+    else:
+      self._header_bytes = constant_op.constant(
+          0, dtype=dtypes.int64, name="header_bytes")
+    if footer_bytes is not None:
+      self._footer_bytes = ops.convert_to_tensor(
+          footer_bytes, dtype=dtypes.int64, name="footer_bytes")
+    else:
+      self._footer_bytes = constant_op.constant(
+          0, dtype=dtypes.int64, name="footer_bytes")
+
+  def make_dataset_resource(self):
+    return gen_dataset_ops.fixed_length_record_dataset(
+        self._filenames, self._header_bytes, self._record_bytes,
+        self._footer_bytes)
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.string
+
+
+def rejection_resample(dataset, class_func, target_dist,
+                       initial_dist=None, seed=None):
+  """Resamples this dataset to achieve a target class distribution.
+
+  **NOTE** Resampling is performed via rejection sampling; some fraction
+  of the input values will be dropped.
+
+  Args:
+    dataset: A `Dataset` object.
+    class_func: A function mapping a nested structure of tensors (having
+      shapes and types defined by `dataset.output_shapes` and
+      `dataset.output_types`) to a scalar `tf.int32` tensor.  Values should
+      be in `[0, num_classes)`.
+    target_dist: A floating point type tensor, shaped `[num_classes].
+    initial_dist: (Optional.)  A floating point type tensor, shaped
+      `[num_classes]`.  If not provided, the true class distribution is
+      estimated live in a streaming fashion.
+    seed: (Optional.) Python integer seed for the resampler.
+
+  Returns:
+    A `Dataset`.
+  """
+  dist_estimation_batch_size = 32
+  target_dist = ops.convert_to_tensor(target_dist, name="initial_dist")
+  class_values_ds = dataset.map(class_func)
+  if initial_dist is not None:
+    initial_dist = ops.convert_to_tensor(
+        initial_dist, name="initial_dist")
+    acceptance_dist = _calculate_acceptance_probs(initial_dist, target_dist)
+    initial_dist_ds = Dataset.from_tensors(initial_dist).repeat()
+    acceptance_dist_ds = Dataset.from_tensors(acceptance_dist).repeat()
+  else:
+    num_classes = (target_dist.shape[0].value
+                   or array_ops.shape(target_dist)[0])
+    smoothing_constant = 10
+    num_examples_per_class_seen = resource_variable_ops.ResourceVariable(
+        initial_value=array_ops.fill(
+            [num_classes], np.int64(smoothing_constant)),
+        trainable=False,
+        name="class_count",
+        dtype=dtypes.int64)
+    def update_estimate_and_tile(c):
+      return array_ops.tile(
+          array_ops.expand_dims(
+              _estimate_data_distribution(c, num_examples_per_class_seen), 0),
+          [dist_estimation_batch_size, 1])
+    initial_dist_ds = (class_values_ds
+                       .batch(dist_estimation_batch_size)
+                       .map(update_estimate_and_tile)
+                       .unbatch())
+    acceptance_dist_ds = initial_dist_ds.map(
+        lambda initial: _calculate_acceptance_probs(initial, target_dist))
+
+  def maybe_warn_on_large_rejection(accept_dist, initial_dist):
+    proportion_rejected = math_ops.reduce_sum(
+        (1 - accept_dist) * initial_dist)
+    return control_flow_ops.cond(
+        math_ops.less(proportion_rejected, .5),
+        lambda: accept_dist,
+        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
+            accept_dist, [proportion_rejected, initial_dist, accept_dist],
+            message="Proportion of examples rejected by sampler is high: ",
+            summarize=100,
+            first_n=10))
+
+  acceptance_dist_ds = (
+      Dataset.zip((acceptance_dist_ds, initial_dist_ds))
+      .map(maybe_warn_on_large_rejection))
+
+  current_probabilities_ds = (Dataset
+                              .zip((acceptance_dist_ds, class_values_ds))
+                              .map(array_ops.gather))
+  filtered_ds = (
+      Dataset.zip((class_values_ds, current_probabilities_ds, dataset))
+      .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p))
+  return filtered_ds.map(lambda class_value, _, data: (class_value, data))
+
+
+def read_batch_features(file_pattern,
+                        batch_size,
+                        features,
+                        reader,
+                        reader_args=None,
+                        randomize_input=True,
+                        num_epochs=None,
+                        capacity=10000):
+  """Reads batches of Examples.
+
+  Example:
+
+  ```
+  serialized_examples = [
+    features {
+      feature { key: "age" value { int64_list { value: [ 0 ] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+      feature { key: "kws" value { bytes_list { value: [ "code", "art" ] } } }
+    },
+    features {
+      feature { key: "age" value { int64_list { value: [] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+      feature { key: "kws" value { bytes_list { value: [ "sports" ] } } }
+    }
+  ]
+  ```
+
+  We can use arguments:
+
+  ```
+  features: {
+    "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
+    "gender": FixedLenFeature([], dtype=tf.string),
+    "kws": VarLenFeature(dtype=tf.string),
+  }
+  ```
+
+  And the expected output is:
+
+  ```python
+  {
+    "age": [[0], [-1]],
+    "gender": [["f"], ["f"]],
+    "kws": SparseTensor(
+      indices=[[0, 0], [0, 1], [1, 0]],
+      values=["code", "art", "sports"]
+      dense_shape=[2, 2]),
+  }
+  ```
+
+  Args:
+    file_pattern: List of files or patterns of file paths containing
+      `Example` records. See `tf.gfile.Glob` for pattern rules.
+    batch_size: An int representing the number of consecutive elements of this
+      dataset to combine in a single batch.
+    features: A `dict` mapping feature keys to `FixedLenFeature` or
+      `VarLenFeature` values. See `tf.parse_example`.
+    reader: A function or class that can be called with a `filenames` tensor
+      and (optional) `reader_args` and returns a `Dataset` of serialized
+      Examples.
+    reader_args: Additional arguments to pass to the reader class.
+    randomize_input: Whether the input should be randomized.
+    num_epochs: Integer specifying the number of times to read through the
+      dataset. If None, cycles through the dataset forever.
+    capacity: Capacity of the ShuffleDataset. A large capacity ensures better
+      shuffling but would increase memory usage and startup time.
+
+  Returns:
+    A dict from keys in features to Tensor or SparseTensor objects.
+  """
+  filenames = _get_file_names(file_pattern, randomize_input)
+  if reader_args:
+    dataset = reader(filenames, *reader_args)
+  else:
+    dataset = reader(filenames)
+  dataset = dataset.repeat(num_epochs)
+  if randomize_input:
+    dataset = dataset.shuffle(capacity)
+  dataset = dataset.batch(batch_size)
+  dataset = dataset.map(lambda x: _parse_example(x, features))
+  iterator = dataset.make_one_shot_iterator()
+  outputs = iterator.get_next()
+  index = 0
+  result = {}
+  for key in sorted(features.keys()):
+    feature = features[key]
+    if isinstance(feature, parsing_ops.FixedLenFeature):
+      result[key] = outputs[index]
+      index += 1
+    else:
+      result[key] = sparse_tensor_lib.SparseTensor(
+          indices=outputs[index],
+          values=outputs[index + 1],
+          dense_shape=outputs[index + 2])
+      index += 3
+  return result
+
+
+def _parse_example(serialized, features):
+  parsed = parsing_ops.parse_example(serialized, features)
+  result = []
+  for key in sorted(features.keys()):
+    val = parsed[key]
+    if isinstance(val, sparse_tensor_lib.SparseTensor):
+      result.extend([val.indices, val.values, val.dense_shape])
+    else:
+      result.append(val)
+  return result
+
+
+def _get_file_names(file_pattern, randomize_input):
+  """Parse list of file names from pattern, optionally shuffled.
+
+  Args:
+    file_pattern: File glob pattern, or list of glob patterns.
+    randomize_input: Whether to shuffle the order of file names.
+
+  Returns:
+    List of file names matching `file_pattern`.
+
+  Raises:
+    ValueError: If `file_pattern` is empty, or pattern matches no files.
+  """
+  if isinstance(file_pattern, list):
+    if not file_pattern:
+      raise ValueError("File pattern is empty.")
+    file_names = []
+    for entry in file_pattern:
+      file_names.extend(gfile.Glob(entry))
+  else:
+    file_names = list(gfile.Glob(file_pattern))
+
+  if not file_names:
+    raise ValueError("No files match %s." % file_pattern)
+
+  # Sort files so it will be deterministic for unit tests.
+  if not randomize_input:
+    file_names = sorted(file_names)
+  return file_names
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index f55859261b3677cba29e8c5a020341a18f94bdac..8dea2763f2946bea9a4b7ef00353b10560fc700c 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:special_math_ops",
+        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -54,6 +55,7 @@ py_library(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:special_math_ops",
+        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -191,38 +193,6 @@ cuda_py_test(
     tags = ["notap"],  # http://b/30441813
 )
 
-cuda_py_test(
-    name = "bernoulli_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bernoulli_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "beta_test",
-    size = "small",
-    srcs = ["python/kernel_tests/beta_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "binomial_test",
     size = "small",
@@ -236,24 +206,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "categorical_test",
-    size = "small",
-    srcs = ["python/kernel_tests/categorical_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_ops",
-    ],
-)
-
 cuda_py_test(
     name = "chi2_test",
     srcs = ["python/kernel_tests/chi2_test.py"],
@@ -285,66 +237,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "dirichlet_test",
-    size = "small",
-    srcs = ["python/kernel_tests/dirichlet_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "dirichlet_multinomial_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/dirichlet_multinomial_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "exponential_test",
-    srcs = ["python/kernel_tests/exponential_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "gamma_test",
-    srcs = ["python/kernel_tests/gamma_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "geometric_test",
     size = "small",
@@ -377,23 +269,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "laplace_test",
-    srcs = ["python/kernel_tests/laplace_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "multinomial_test",
-    srcs = ["python/kernel_tests/multinomial_test.py"],
+    name = "mvn_diag_test",
+    size = "small",
+    srcs = ["python/kernel_tests/mvn_diag_test.py"],
     additional_deps = [
         ":distributions_py",
         "//third_party/py/numpy",
@@ -402,14 +280,15 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
 )
 
 cuda_py_test(
-    name = "mvn_diag_test",
-    size = "small",
-    srcs = ["python/kernel_tests/mvn_diag_test.py"],
+    name = "mvn_diag_plus_low_rank_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/mvn_diag_plus_low_rank_test.py"],
     additional_deps = [
         ":distributions_py",
         "//third_party/py/numpy",
@@ -424,9 +303,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "mvn_diag_plus_low_rank_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/mvn_diag_plus_low_rank_test.py"],
+    name = "mvn_full_covariance_test",
+    size = "small",
+    srcs = ["python/kernel_tests/mvn_full_covariance_test.py"],
     additional_deps = [
         ":distributions_py",
         "//third_party/py/numpy",
@@ -494,24 +373,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "normal_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/normal_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-    ],
-)
-
 cuda_py_test(
     name = "poisson_test",
     size = "small",
@@ -545,21 +406,19 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "student_t_test",
-    size = "small",
-    srcs = ["python/kernel_tests/student_t_test.py"],
+    name = "vector_laplace_diag_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/vector_laplace_diag_test.py"],
     additional_deps = [
         ":distributions_py",
         "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
-    tags = ["nomsan"],  # disable to avoid false positives from scipy.
 )
 
 cuda_py_test(
@@ -578,22 +437,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "uniform_test",
-    size = "small",
-    srcs = ["python/kernel_tests/uniform_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-    ],
-)
-
 cuda_py_test(
     name = "wishart_test",
     size = "small",
@@ -612,18 +455,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "kullback_leibler_test",
-    size = "small",
-    srcs = ["python/kernel_tests/kullback_leibler_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "normal_conjugate_posteriors_test",
     size = "small",
@@ -749,22 +580,6 @@ cuda_py_test(
     tags = ["no_pip"],
 )
 
-cuda_py_test(
-    name = "special_math_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/special_math_test.py"],
-    additional_deps = [
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-    ],
-)
-
 cuda_py_test(
     name = "distribution_util_test",
     size = "small",
@@ -813,25 +628,6 @@ filegroup(
 
 # === Bijector Tests ==========================================================
 
-cuda_py_test(
-    name = "bijector_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bijectors/bijector_test.py"],
-    additional_deps = [
-        ":bijectors_py",
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-        "//tensorflow/contrib/linalg:linalg_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "conditional_bijector_test",
     size = "small",
@@ -947,25 +743,6 @@ cuda_py_test(
     ],
 )
 
-cuda_py_test(
-    name = "identity_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bijectors/identity_test.py"],
-    additional_deps = [
-        ":bijectors_py",
-        ":distributions_py",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-        "//tensorflow/contrib/linalg:linalg_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 cuda_py_test(
     name = "inline_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 470b9edb793d5b317bb81a174b2c1b197184cfaf..1fddad53689a0d74d00c1f210d81b83975fb1d37 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -15,74 +15,6 @@
 """Classes representing statistical distributions and ops for working with them.
 
 See the @{$python/contrib.distributions} guide.
-
-## Distribution Object
-@@ReparameterizationType
-@@Distribution
-
-## Individual Distributions
-@@Binomial
-@@Bernoulli
-@@BernoulliWithSigmoidProbs
-@@Beta
-@@BetaWithSoftplusConcentration
-@@Categorical
-@@Chi2
-@@Chi2WithAbsDf
-@@Deterministic
-@@VectorDeterministic
-@@Exponential
-@@ExponentialWithSoftplusRate
-@@Gamma
-@@GammaWithSoftplusConcentrationRate
-@@Geometric
-@@InverseGamma
-@@InverseGammaWithSoftplusConcentrationRate
-@@Laplace
-@@LaplaceWithSoftplusScale
-@@Logistic
-@@NegativeBinomial
-@@Normal
-@@NormalWithSoftplusScale
-@@Poisson
-@@StudentT
-@@StudentTWithAbsDfSoftplusScale
-@@Uniform
-
-@@MultivariateNormalDiag
-@@MultivariateNormalTriL
-@@MultivariateNormalDiagPlusLowRank
-@@MultivariateNormalDiagWithSoftplusScale
-
-@@Dirichlet
-@@DirichletMultinomial
-@@Multinomial
-@@WishartCholesky
-@@WishartFull
-
-@@TransformedDistribution
-@@QuantizedDistribution
-
-@@Mixture
-
-@@ExpRelaxedOneHotCategorical
-@@OneHotCategorical
-@@RelaxedBernoulli
-@@RelaxedOneHotCategorical
-
-## Kullback-Leibler Divergence
-@@kl
-@@RegisterKL
-
-## Helper Functions
-@@matrix_diag_transform
-@@normal_conjugates_known_scale_posterior
-@@normal_conjugates_known_scale_predictive
-@@softplus_inverse
-
-## Functions for statistics of samples
-@@percentile
-
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -90,35 +22,23 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member
 
-
-from tensorflow.contrib.distributions.python.ops.bernoulli import *
-from tensorflow.contrib.distributions.python.ops.beta import *
-from tensorflow.contrib.distributions.python.ops.bijectors import *
+from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.contrib.distributions.python.ops.binomial import *
-from tensorflow.contrib.distributions.python.ops.categorical import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
 from tensorflow.contrib.distributions.python.ops.conditional_distribution import *
 from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
 from tensorflow.contrib.distributions.python.ops.deterministic import *
-from tensorflow.contrib.distributions.python.ops.dirichlet import *
-from tensorflow.contrib.distributions.python.ops.dirichlet_multinomial import *
-from tensorflow.contrib.distributions.python.ops.distribution import *
 from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform
 from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse
-from tensorflow.contrib.distributions.python.ops.exponential import *
-from tensorflow.contrib.distributions.python.ops.gamma import *
 from tensorflow.contrib.distributions.python.ops.geometric import *
 from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
-from tensorflow.contrib.distributions.python.ops.kullback_leibler import *
-from tensorflow.contrib.distributions.python.ops.laplace import *
 from tensorflow.contrib.distributions.python.ops.logistic import *
 from tensorflow.contrib.distributions.python.ops.mixture import *
-from tensorflow.contrib.distributions.python.ops.multinomial import *
 from tensorflow.contrib.distributions.python.ops.mvn_diag import *
 from tensorflow.contrib.distributions.python.ops.mvn_diag_plus_low_rank import *
+from tensorflow.contrib.distributions.python.ops.mvn_full_covariance import *
 from tensorflow.contrib.distributions.python.ops.mvn_tril import *
 from tensorflow.contrib.distributions.python.ops.negative_binomial import *
-from tensorflow.contrib.distributions.python.ops.normal import *
 from tensorflow.contrib.distributions.python.ops.normal_conjugate_posteriors import *
 from tensorflow.contrib.distributions.python.ops.onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.poisson import *
@@ -126,17 +46,101 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
 from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
 from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.sample_stats import *
-from tensorflow.contrib.distributions.python.ops.student_t import *
-from tensorflow.contrib.distributions.python.ops.transformed_distribution import *
-from tensorflow.contrib.distributions.python.ops.uniform import *
+from tensorflow.contrib.distributions.python.ops.vector_laplace_diag import *
 from tensorflow.contrib.distributions.python.ops.wishart import *
+from tensorflow.python.ops.distributions.bernoulli import *
+from tensorflow.python.ops.distributions.beta import *
+from tensorflow.python.ops.distributions.categorical import *
+from tensorflow.python.ops.distributions.dirichlet import *
+from tensorflow.python.ops.distributions.dirichlet_multinomial import *
+from tensorflow.python.ops.distributions.distribution import *
+from tensorflow.python.ops.distributions.exponential import *
+from tensorflow.python.ops.distributions.gamma import *
+from tensorflow.python.ops.distributions.kullback_leibler import *
+from tensorflow.python.ops.distributions.laplace import *
+from tensorflow.python.ops.distributions.multinomial import *
+from tensorflow.python.ops.distributions.normal import *
+from tensorflow.python.ops.distributions.student_t import *
+from tensorflow.python.ops.distributions.transformed_distribution import *
+from tensorflow.python.ops.distributions.uniform import *
 
 # pylint: enable=unused-import,wildcard-import,line-too-long,g-importing-member
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['ConditionalDistribution',
-                    'ConditionalTransformedDistribution',
-                    'FULLY_REPARAMETERIZED', 'NOT_REPARAMETERIZED']
+_allowed_symbols = [
+    'bijectors',
+    'ConditionalDistribution',
+    'ConditionalTransformedDistribution',
+    'FULLY_REPARAMETERIZED',
+    'NOT_REPARAMETERIZED',
+    'Affine',
+    'AffineLinearOperator',
+    'Bijector',
+    'Chain',
+    'CholeskyOuterProduct',
+    'Exp',
+    'Identity',
+    'Inline',
+    'Invert',
+    'PowerTransform',
+    'SigmoidCentered',
+    'SoftmaxCentered',
+    'Softplus',
+    'ReparameterizationType',
+    'Distribution',
+    'Binomial',
+    'Bernoulli',
+    'BernoulliWithSigmoidProbs',
+    'Beta',
+    'BetaWithSoftplusConcentration',
+    'Categorical',
+    'Chi2',
+    'Chi2WithAbsDf',
+    'Deterministic',
+    'VectorDeterministic',
+    'Exponential',
+    'ExponentialWithSoftplusRate',
+    'Gamma',
+    'GammaWithSoftplusConcentrationRate',
+    'Geometric',
+    'InverseGamma',
+    'InverseGammaWithSoftplusConcentrationRate',
+    'Laplace',
+    'LaplaceWithSoftplusScale',
+    'Logistic',
+    'NegativeBinomial',
+    'Normal',
+    'NormalWithSoftplusScale',
+    'Poisson',
+    'StudentT',
+    'StudentTWithAbsDfSoftplusScale',
+    'Uniform',
+    'MultivariateNormalDiag',
+    'MultivariateNormalFullCovariance',
+    'MultivariateNormalTriL',
+    'MultivariateNormalDiagPlusLowRank',
+    'MultivariateNormalDiagWithSoftplusScale',
+    'Dirichlet',
+    'DirichletMultinomial',
+    'Multinomial',
+    'VectorLaplaceDiag',
+    'WishartCholesky',
+    'WishartFull',
+    'TransformedDistribution',
+    'QuantizedDistribution',
+    'Mixture',
+    'ExpRelaxedOneHotCategorical',
+    'OneHotCategorical',
+    'RelaxedBernoulli',
+    'RelaxedOneHotCategorical',
+    'kl_divergence',
+    'RegisterKL',
+    'matrix_diag_transform',
+    'normal_conjugates_known_scale_posterior',
+    'normal_conjugates_known_scale_predictive',
+    'softplus_inverse',
+    'percentile'
+]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
index d4f83567e4ef65bbb50710dcd52cd86111cf1738..0738754b217e5842bd0fa516915f14926083d321 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_linear_operator_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib import linalg
-from tensorflow.contrib.distributions.python.ops.bijectors import affine_linear_operator as affine_linear_operator_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator import AffineLinearOperator
 from tensorflow.python.platform import test
 
 
@@ -29,7 +29,7 @@ class AffineLinearOperatorTest(test.TestCase):
 
   def testIdentity(self):
     with self.test_session():
-      affine = affine_linear_operator_lib.AffineLinearOperator(
+      affine = AffineLinearOperator(
           validate_args=True)
       x = np.array([[1, 0, -1], [2, 3, 4]], dtype=np.float32)
       y = x
@@ -48,7 +48,7 @@ class AffineLinearOperatorTest(test.TestCase):
       diag = np.array([[1, 2, 3],
                        [2, 5, 6]], dtype=np.float32)
       scale = linalg.LinearOperatorDiag(diag, is_non_singular=True)
-      affine = affine_linear_operator_lib.AffineLinearOperator(
+      affine = AffineLinearOperator(
           shift=shift, scale=scale, validate_args=True)
 
       x = np.array([[1, 0, -1], [2, 3, 4]], dtype=np.float32)
@@ -73,7 +73,7 @@ class AffineLinearOperatorTest(test.TestCase):
                         [4, 3, 2]]],
                       dtype=np.float32)
       scale = linalg.LinearOperatorTriL(tril, is_non_singular=True)
-      affine = affine_linear_operator_lib.AffineLinearOperator(
+      affine = AffineLinearOperator(
           shift=shift, scale=scale, validate_args=True)
 
       x = np.array([[[1, 0, -1],
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
index 71460a176951b46c7780a93afd78beff702d1683..e8fd6aa2f73fa3457333483111379f0d987801ff 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py
@@ -22,10 +22,10 @@ import itertools
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import affine as affine_lib
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
+from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
@@ -36,7 +36,7 @@ class AffineBijectorTest(test.TestCase):
     with self.test_session():
       mu = -1.
       # scale corresponds to 1.
-      bijector = affine_lib.Affine(shift=mu, event_ndims=0)
+      bijector = Affine(shift=mu, event_ndims=0)
       self.assertEqual("affine", bijector.name)
 
   def testNoBatchScalarViaIdentity(self):
@@ -53,7 +53,7 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # Corresponds to scale = 2
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu, scale_identity_multiplier=2., event_ndims=0)
         self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
         x = [1., 2, 3]  # Three scalar samples (no batches).
@@ -76,7 +76,7 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # Corresponds to scale = 2
-        bijector = affine_lib.Affine(shift=mu, scale_diag=[2.], event_ndims=0)
+        bijector = Affine(shift=mu, scale_diag=[2.], event_ndims=0)
         self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
         x = [1., 2, 3]  # Three scalar samples (no batches).
         self.assertAllClose([1., 3, 5], run(bijector.forward, x))
@@ -98,7 +98,7 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # Corresponds to scale = 2.
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu, scale_identity_multiplier=2., event_ndims=0)
         self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
         x = [[1., 2, 3], [4, 5, 6]]  # Weird sample shape.
@@ -126,7 +126,7 @@ class AffineBijectorTest(test.TestCase):
         mu = [1.]
         # One batch, scalar.
         # Corresponds to scale = 1.
-        bijector = affine_lib.Affine(shift=mu, event_ndims=0)
+        bijector = Affine(shift=mu, event_ndims=0)
         self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
         x = [1.]  # One sample from one batches.
         self.assertAllClose([2.], run(bijector.forward, x))
@@ -148,7 +148,7 @@ class AffineBijectorTest(test.TestCase):
         mu = [1.]
         # One batch, scalar.
         # Corresponds to scale = 1.
-        bijector = affine_lib.Affine(shift=mu, scale_diag=[1.], event_ndims=0)
+        bijector = Affine(shift=mu, scale_diag=[1.], event_ndims=0)
         self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
         x = [1.]  # One sample from one batches.
         self.assertAllClose([2.], run(bijector.forward, x))
@@ -170,7 +170,7 @@ class AffineBijectorTest(test.TestCase):
         mu = [1., -1]
         # Univariate, two batches.
         # Corresponds to scale = 1.
-        bijector = affine_lib.Affine(shift=mu, event_ndims=0)
+        bijector = Affine(shift=mu, event_ndims=0)
         self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
         x = [1., 1]  # One sample from each of two batches.
         self.assertAllClose([2., 0], run(bijector.forward, x))
@@ -192,7 +192,7 @@ class AffineBijectorTest(test.TestCase):
         mu = [1., -1]
         # Univariate, two batches.
         # Corresponds to scale = 1.
-        bijector = affine_lib.Affine(shift=mu, scale_diag=[1.], event_ndims=0)
+        bijector = Affine(shift=mu, scale_diag=[1.], event_ndims=0)
         self.assertEqual(0, bijector.event_ndims.eval())  # "is scalar"
         x = [1., 1]  # One sample from each of two batches.
         self.assertAllClose([2., 0], run(bijector.forward, x))
@@ -214,7 +214,7 @@ class AffineBijectorTest(test.TestCase):
         mu = [1., -1]
         # Multivariate
         # Corresponds to scale = [[1., 0], [0, 1.]]
-        bijector = affine_lib.Affine(shift=mu)
+        bijector = Affine(shift=mu)
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 1]
         # matmul(sigma, x) + shift
@@ -245,7 +245,7 @@ class AffineBijectorTest(test.TestCase):
         mu = [1., -1]
         # Multivariate
         # Corresponds to scale = [[2., 0], [0, 1.]]
-        bijector = affine_lib.Affine(shift=mu, scale_diag=[2., 1])
+        bijector = Affine(shift=mu, scale_diag=[2., 1])
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 1]
         # matmul(sigma, x) + shift
@@ -287,7 +287,7 @@ class AffineBijectorTest(test.TestCase):
           event_ndims: event_ndims_value
       }
 
-      bijector = affine_lib.Affine(
+      bijector = Affine(
           shift=mu, scale_diag=scale_diag, event_ndims=event_ndims)
       self.assertEqual(1, sess.run(bijector.event_ndims, feed_dict))
       self.assertAllClose([[3., 1]], sess.run(bijector.forward(x), feed_dict))
@@ -311,7 +311,7 @@ class AffineBijectorTest(test.TestCase):
         mu = [[1., -1]]
         # Corresponds to 1 2x2 matrix, with twos on the diagonal.
         scale = 2.
-        bijector = affine_lib.Affine(shift=mu, scale_identity_multiplier=scale)
+        bijector = Affine(shift=mu, scale_identity_multiplier=scale)
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [[[1., 1]]]
         self.assertAllClose([[[3., 1]]], run(bijector.forward, x))
@@ -334,7 +334,7 @@ class AffineBijectorTest(test.TestCase):
         mu = [[1., -1]]
         # Corresponds to 1 2x2 matrix, with twos on the diagonal.
         scale_diag = [[2., 2]]
-        bijector = affine_lib.Affine(shift=mu, scale_diag=scale_diag)
+        bijector = Affine(shift=mu, scale_diag=scale_diag)
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [[[1., 1]]]
         self.assertAllClose([[[3., 1]]], run(bijector.forward, x))
@@ -361,7 +361,7 @@ class AffineBijectorTest(test.TestCase):
           event_ndims: event_ndims_value
       }
 
-      bijector = affine_lib.Affine(
+      bijector = Affine(
           shift=mu, scale_diag=scale_diag, event_ndims=event_ndims)
       self.assertEqual(1, sess.run(bijector.event_ndims, feed_dict))
       self.assertAllClose([[[3., 1]]], sess.run(bijector.forward(x), feed_dict))
@@ -384,7 +384,7 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # Corresponds to scale = 2
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu,
             scale_identity_multiplier=1.,
             scale_diag=[1.],
@@ -410,7 +410,7 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # scale = [[2., 0], [2, 2]]
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu,
             scale_identity_multiplier=1.,
             scale_tril=[[1., 0], [2., 1]])
@@ -435,7 +435,7 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # scale = [[2., 0], [2, 3]]
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu, scale_diag=[1., 2.], scale_tril=[[1., 0], [2., 1]])
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [[1., 2]]  # One multivariate sample.
@@ -458,7 +458,7 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # scale = [[3., 0], [2, 4]]
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu,
             scale_identity_multiplier=1.0,
             scale_diag=[1., 2.],
@@ -484,14 +484,12 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # Corresponds to scale = [[10, 0, 0], [0, 2, 0], [0, 0, 3]]
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu,
             scale_identity_multiplier=2.,
             scale_perturb_diag=[2., 1],
-            scale_perturb_factor=[[2., 0],
-                                  [0., 0],
-                                  [0, 1]])
-        bijector_ref = affine_lib.Affine(shift=mu, scale_diag=[10., 2, 3])
+            scale_perturb_factor=[[2., 0], [0., 0], [0, 1]])
+        bijector_ref = Affine(shift=mu, scale_diag=[10., 2, 3])
 
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 2, 3]  # Vector.
@@ -522,14 +520,12 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # Corresponds to scale = [[10, 0, 0], [0, 3, 0], [0, 0, 5]]
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu,
             scale_diag=[2., 3, 4],
             scale_perturb_diag=[2., 1],
-            scale_perturb_factor=[[2., 0],
-                                  [0., 0],
-                                  [0, 1]])
-        bijector_ref = affine_lib.Affine(shift=mu, scale_diag=[10., 3, 5])
+            scale_perturb_factor=[[2., 0], [0., 0], [0, 1]])
+        bijector_ref = Affine(shift=mu, scale_diag=[10., 3, 5])
 
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 2, 3]  # Vector.
@@ -559,19 +555,13 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # Corresponds to scale = [[10, 0, 0], [1, 3, 0], [2, 3, 5]]
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu,
-            scale_tril=[[2., 0, 0],
-                        [1, 3, 0],
-                        [2, 3, 4]],
+            scale_tril=[[2., 0, 0], [1, 3, 0], [2, 3, 4]],
             scale_perturb_diag=[2., 1],
-            scale_perturb_factor=[[2., 0],
-                                  [0., 0],
-                                  [0, 1]])
-        bijector_ref = affine_lib.Affine(
-            shift=mu, scale_tril=[[10., 0, 0],
-                                  [1, 3, 0],
-                                  [2, 3, 5]])
+            scale_perturb_factor=[[2., 0], [0., 0], [0, 1]])
+        bijector_ref = Affine(
+            shift=mu, scale_tril=[[10., 0, 0], [1, 3, 0], [2, 3, 5]])
 
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
         x = [1., 2, 3]  # Vector.
@@ -601,12 +591,12 @@ class AffineBijectorTest(test.TestCase):
       for run in (static_run, dynamic_run):
         mu = -1.
         # Corresponds to scale = [[6, 0, 0], [1, 3, 0], [2, 3, 5]]
-        bijector = affine_lib.Affine(
+        bijector = Affine(
             shift=mu,
             scale_tril=[[2., 0, 0], [1, 3, 0], [2, 3, 4]],
             scale_perturb_diag=None,
             scale_perturb_factor=[[2., 0], [0., 0], [0, 1]])
-        bijector_ref = affine_lib.Affine(
+        bijector_ref = Affine(
             shift=mu, scale_tril=[[6., 0, 0], [1, 3, 0], [2, 3, 5]])
 
         self.assertEqual(1, bijector.event_ndims.eval())  # "is vector"
@@ -626,7 +616,7 @@ class AffineBijectorTest(test.TestCase):
   def testNoBatchMultivariateRaisesWhenSingular(self):
     with self.test_session():
       mu = [1., -1]
-      bijector = affine_lib.Affine(
+      bijector = Affine(
           shift=mu,
           # Has zero on the diagonal.
           scale_diag=[0., 1],
@@ -638,14 +628,14 @@ class AffineBijectorTest(test.TestCase):
     with self.test_session():
       mu = [1., -1]
       # Scale corresponds to 2x2 identity matrix.
-      bijector = affine_lib.Affine(shift=mu, event_ndims=2, validate_args=True)
+      bijector = Affine(shift=mu, event_ndims=2, validate_args=True)
       bijector.forward([1., 1.]).eval()
 
   def testScaleZeroScalarRaises(self):
     with self.test_session():
       mu = -1.
       # Check Identity matrix with zero scaling.
-      bijector = affine_lib.Affine(
+      bijector = Affine(
           shift=mu,
           scale_identity_multiplier=0.0,
           event_ndims=0,
@@ -654,16 +644,16 @@ class AffineBijectorTest(test.TestCase):
         bijector.forward(1.).eval()
 
       # Check Diag matrix with zero scaling.
-      bijector = affine_lib.Affine(
+      bijector = Affine(
           shift=mu, scale_diag=[0.0], event_ndims=0, validate_args=True)
       with self.assertRaisesOpError("Condition x > 0"):
         bijector.forward(1.).eval()
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = affine_lib.Affine(
+      bijector = Affine(
           shift=3.6, scale_identity_multiplier=0.42, event_ndims=0)
-      bijector_test_util.assert_scalar_congruency(
+      assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def _makeScale(self,
@@ -743,9 +733,9 @@ class AffineBijectorTest(test.TestCase):
         # We haven't specified enough information for the scale.
         if scale is None:
           with self.assertRaisesRegexp(ValueError, ("must be specified.")):
-            bijector = affine_lib.Affine(shift=shift, **bijector_args)
+            bijector = Affine(shift=shift, **bijector_args)
         else:
-          bijector = affine_lib.Affine(shift=shift, **bijector_args)
+          bijector = Affine(shift=shift, **bijector_args)
           np_x = x
           # For the case a vector is passed in, we need to make the shape
           # match the matrix for matmul to work.
@@ -823,7 +813,7 @@ class AffineBijectorTest(test.TestCase):
   def testScalePropertyAssertsCorrectly(self):
     with self.test_session():
       with self.assertRaises(NotImplementedError):
-        scale = affine_lib.Affine(  # pylint:disable=unused-variable
+        scale = Affine(  # pylint:disable=unused-variable
             scale_tril=[[1., 0], [2, 1]],
             scale_perturb_factor=[2., 1.]).scale
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
index ecf068bf6b587d6d987b6b46cc0d54a862f26030..20e754308449af3f0399101f4ea1bb47b3356424 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
-from tensorflow.contrib.distributions.python.ops.bijectors import chain as chain_lib
-from tensorflow.contrib.distributions.python.ops.bijectors import exp as exp_lib
-from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered as softmax_centered_lib
-from tensorflow.contrib.distributions.python.ops.bijectors import softplus as softplus_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.chain import Chain
+from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
+from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
+from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
@@ -34,8 +34,7 @@ class ChainBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      chain = chain_lib.Chain((exp_lib.Exp(event_ndims=1),
-                               softplus_lib.Softplus(event_ndims=1)))
+      chain = Chain((Exp(event_ndims=1), Softplus(event_ndims=1)))
       self.assertEqual("chain_of_exp_of_softplus", chain.name)
       x = np.asarray([[[1., 2.],
                        [2., 3.]]])
@@ -49,7 +48,7 @@ class ChainBijectorTest(test.TestCase):
 
   def testBijectorIdentity(self):
     with self.test_session():
-      chain = chain_lib.Chain()
+      chain = Chain()
       self.assertEqual("identity", chain.name)
       x = np.asarray([[[1., 2.],
                        [2., 3.]]])
@@ -60,17 +59,18 @@ class ChainBijectorTest(test.TestCase):
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = chain_lib.Chain((exp_lib.Exp(), softplus_lib.Softplus()))
-      bijector_test_util.assert_scalar_congruency(
+      bijector = Chain((Exp(), Softplus()))
+      assert_scalar_congruency(
           bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
   def testShapeGetters(self):
     with self.test_session():
-      bijector = chain_lib.Chain([
-          softmax_centered_lib.SoftmaxCentered(
+      bijector = Chain([
+          SoftmaxCentered(
               event_ndims=1, validate_args=True),
-          softmax_centered_lib.SoftmaxCentered(
-              event_ndims=0, validate_args=True)])
+          SoftmaxCentered(
+              event_ndims=0, validate_args=True)
+      ])
       x = tensor_shape.TensorShape([])
       y = tensor_shape.TensorShape([2 + 1])
       self.assertAllEqual(y, bijector.forward_event_shape(x))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
index 7b09553be9a425029a39b63c9319fc958de981c1..0ff35304283fce9ce3f9e5d31b1258394e384d7b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py
@@ -19,11 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib
-from tensorflow.contrib.distributions.python.ops import transformed_distribution as transformed_distribution_lib
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
@@ -57,7 +57,7 @@ class InvertBijectorTest(test.TestCase):
   def testScalarCongruency(self):
     with self.test_session():
       bijector = bijectors.Invert(bijectors.Exp())
-      bijector_test_util.assert_scalar_congruency(
+      assert_scalar_congruency(
           bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
   def testShapeGetters(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
index 9c2a1c8f91dfacf3182e143924396340652062ea..26e0d2a539c78540603281ae0f361987a7bf8d90 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import conditional_bijector
+from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector import ConditionalBijector
 from tensorflow.python.framework import dtypes
 from tensorflow.python.platform import test
 
 
-class _TestBijector(conditional_bijector.ConditionalBijector):
+class _TestBijector(ConditionalBijector):
 
   def __init__(self):
     super(_TestBijector, self).__init__(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
index 04ddf09b69e66f852b42252d1b36fea6f32159af..9970c0b4d86afda188d9401ebaf3c98d3fffbfdf 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/exp_test.py
@@ -20,8 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
-from tensorflow.contrib.distributions.python.ops.bijectors import exp as exp_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
@@ -30,7 +31,7 @@ class ExpBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      bijector = exp_lib.Exp(event_ndims=1)
+      bijector = Exp(event_ndims=1)
       self.assertEqual("exp", bijector.name)
       x = [[[1.], [2.]]]
       y = np.exp(x)
@@ -44,16 +45,16 @@ class ExpBijectorTest(test.TestCase):
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = exp_lib.Exp()
-      bijector_test_util.assert_scalar_congruency(
+      bijector = Exp()
+      assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=1.5, rtol=0.05)
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = exp_lib.Exp(event_ndims=0)
+      bijector = Exp(event_ndims=0)
       x = np.linspace(-10, 10, num=10).astype(np.float32)
       y = np.logspace(-10, 10, num=10).astype(np.float32)
-      bijector_test_util.assert_bijective_and_finite(bijector, x, y)
+      assert_bijective_and_finite(bijector, x, y)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
index c17c67c6f58ad856ad4dcabfd8cbdd0143b69b7f..739fa6d439a8bce993ab1b4601489d9bbcd69bee 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/inline_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import exp as exp_lib
-from tensorflow.contrib.distributions.python.ops.bijectors import inline as inline_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.exp import Exp
+from tensorflow.contrib.distributions.python.ops.bijectors.inline import Inline
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -33,8 +33,8 @@ class InlineBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      exp = exp_lib.Exp(event_ndims=1)
-      inline = inline_lib.Inline(
+      exp = Exp(event_ndims=1)
+      inline = Inline(
           forward_fn=math_ops.exp,
           inverse_fn=math_ops.log,
           inverse_log_det_jacobian_fn=(
@@ -57,7 +57,7 @@ class InlineBijectorTest(test.TestCase):
 
   def testShapeGetters(self):
     with self.test_session():
-      bijector = inline_lib.Inline(
+      bijector = Inline(
           forward_event_shape_tensor_fn=lambda x: array_ops.concat((x, [1]), 0),
           forward_event_shape_fn=lambda x: x.as_list() + [1],
           inverse_event_shape_tensor_fn=lambda x: x[:-1],
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
index 7b09553be9a425029a39b63c9319fc958de981c1..0ff35304283fce9ce3f9e5d31b1258394e384d7b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py
@@ -19,11 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib
-from tensorflow.contrib.distributions.python.ops import transformed_distribution as transformed_distribution_lib
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import transformed_distribution as transformed_distribution_lib
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
@@ -57,7 +57,7 @@ class InvertBijectorTest(test.TestCase):
   def testScalarCongruency(self):
     with self.test_session():
       bijector = bijectors.Invert(bijectors.Exp())
-      bijector_test_util.assert_scalar_congruency(
+      assert_scalar_congruency(
           bijector, lower_x=1e-3, upper_x=1.5, rtol=0.05)
 
   def testShapeGetters(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
index e95bbdf67e6e6d2b9e67a4a35b6ab9eb21330ee9..de1659aa9f4d0f7d19ec2e8185715573b78eaf2b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/power_transform_test.py
@@ -20,8 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
-from tensorflow.contrib.distributions.python.ops.bijectors import power_transform as power_transform_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import PowerTransform
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
@@ -31,7 +32,7 @@ class PowerTransformBijectorTest(test.TestCase):
   def testBijector(self):
     with self.test_session():
       c = 0.2
-      bijector = power_transform_lib.PowerTransform(
+      bijector = PowerTransform(
           power=c, event_ndims=1, validate_args=True)
       self.assertEqual("power_transform", bijector.name)
       x = np.array([[[-1.], [2.], [-5. + 1e-4]]])
@@ -49,18 +50,18 @@ class PowerTransformBijectorTest(test.TestCase):
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = power_transform_lib.PowerTransform(
+      bijector = PowerTransform(
           power=0.2, validate_args=True)
-      bijector_test_util.assert_scalar_congruency(
+      assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=1.5, rtol=0.05)
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      bijector = power_transform_lib.PowerTransform(
+      bijector = PowerTransform(
           power=0.2, event_ndims=0, validate_args=True)
       x = np.linspace(-4.999, 10, num=10).astype(np.float32)
       y = np.logspace(0.001, 10, num=10).astype(np.float32)
-      bijector_test_util.assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
+      assert_bijective_and_finite(bijector, x, y, rtol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py
index 8884da146d8325b0efba849b29123e4e6b312cd4..4ff3f334ccb59f1c117b3d35032d9e799cfd79bb 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_centered_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import sigmoid_centered as sigmoid_centered_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import SigmoidCentered
 from tensorflow.python.platform import test
 
 
@@ -29,7 +29,7 @@ class SigmoidCenteredBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      sigmoid = sigmoid_centered_lib.SigmoidCentered()
+      sigmoid = SigmoidCentered()
       self.assertEqual("sigmoid_centered", sigmoid.name)
       x = np.log([[2., 3, 4],
                   [4., 8, 12]])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
index e16f9dff22536a9fd84ba18df3afd79320ff0b6f..e4f9d72785c301284812a48c0a67614ca439ffae 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sigmoid_test.py
@@ -21,8 +21,9 @@ from __future__ import print_function
 import numpy as np
 from scipy import special
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
-from tensorflow.contrib.distributions.python.ops.bijectors import sigmoid
+from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import Sigmoid
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 
@@ -31,35 +32,27 @@ class SigmoidBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      self.assertEqual("sigmoid", sigmoid.Sigmoid().name)
+      self.assertEqual("sigmoid", Sigmoid().name)
       x = np.linspace(-10., 10., 100).reshape([2, 5, 10]).astype(np.float32)
       y = special.expit(x)
       ildj = -np.log(y) - np.log1p(-y)
-      self.assertAllClose(
-          y, sigmoid.Sigmoid().forward(x).eval(),
-          atol=0., rtol=1e-2)
-      self.assertAllClose(
-          x, sigmoid.Sigmoid().inverse(y).eval(),
-          atol=0., rtol=1e-4)
-      self.assertAllClose(
-          ildj, sigmoid.Sigmoid().inverse_log_det_jacobian(y).eval(),
-          atol=0., rtol=1e-6)
-      self.assertAllClose(
-          -ildj, sigmoid.Sigmoid().forward_log_det_jacobian(x).eval(),
-          atol=0., rtol=1e-4)
+      self.assertAllClose(y, Sigmoid().forward(x).eval(), atol=0., rtol=1e-2)
+      self.assertAllClose(x, Sigmoid().inverse(y).eval(), atol=0., rtol=1e-4)
+      self.assertAllClose(ildj, Sigmoid().inverse_log_det_jacobian(y).eval(),
+                          atol=0., rtol=1e-6)
+      self.assertAllClose(-ildj, Sigmoid().forward_log_det_jacobian(x).eval(),
+                          atol=0., rtol=1e-4)
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector_test_util.assert_scalar_congruency(
-          sigmoid.Sigmoid(), lower_x=-7., upper_x=7.)
+      assert_scalar_congruency(Sigmoid(), lower_x=-7., upper_x=7.)
 
   def testBijectiveAndFinite(self):
     with self.test_session():
       x = np.linspace(-7., 7., 100).astype(np.float32)
       eps = 1e-3
       y = np.linspace(eps, 1. - eps, 100).astype(np.float32)
-      bijector_test_util.assert_bijective_and_finite(
-          sigmoid.Sigmoid(), x, y, atol=0., rtol=1e-4)
+      assert_bijective_and_finite(Sigmoid(), x, y, atol=0., rtol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
index 97df62ec05f238574925fe80a31d3baa960b2936..62e3869db090e9c9327bc552d10234ff76ba28fd 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softmax_centered_test.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
-from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered as softmax_centered_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
 from tensorflow.python.platform import test
 
 
@@ -34,7 +34,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
 
   def testBijectorScalar(self):
     with self.test_session():
-      softmax = softmax_centered_lib.SoftmaxCentered()  # scalar by default
+      softmax = SoftmaxCentered()  # scalar by default
       self.assertEqual("softmax_centered", softmax.name)
       x = np.log([[2., 3, 4],
                   [4., 8, 12]])
@@ -59,7 +59,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
 
   def testBijectorVector(self):
     with self.test_session():
-      softmax = softmax_centered_lib.SoftmaxCentered(event_ndims=1)
+      softmax = SoftmaxCentered(event_ndims=1)
       self.assertEqual("softmax_centered", softmax.name)
       x = np.log([[2., 3, 4], [4., 8, 12]])
       y = [[0.2, 0.3, 0.4, 0.1], [0.16, 0.32, 0.48, 0.04]]
@@ -80,11 +80,11 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
     with self.test_session():
       for x, y, b in ((tensor_shape.TensorShape([]),
                        tensor_shape.TensorShape([2]),
-                       softmax_centered_lib.SoftmaxCentered(
+                       SoftmaxCentered(
                            event_ndims=0, validate_args=True)),
                       (tensor_shape.TensorShape([4]),
                        tensor_shape.TensorShape([5]),
-                       softmax_centered_lib.SoftmaxCentered(
+                       SoftmaxCentered(
                            event_ndims=1, validate_args=True))):
         self.assertAllEqual(y, b.forward_event_shape(x))
         self.assertAllEqual(y.as_list(),
@@ -95,7 +95,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
 
   def testBijectiveAndFinite(self):
     with self.test_session():
-      softmax = softmax_centered_lib.SoftmaxCentered(event_ndims=1)
+      softmax = SoftmaxCentered(event_ndims=1)
       x = np.linspace(-50, 50, num=10).reshape(5, 2).astype(np.float32)
       # Make y values on the simplex with a wide range.
       y_0 = np.ones(5).astype(np.float32)
@@ -104,7 +104,7 @@ class SoftmaxCenteredBijectorTest(test.TestCase):
       y = np.array([y_0, y_1, y_2])
       y /= y.sum(axis=0)
       y = y.T  # y.shape = [5, 3]
-      bijector_test_util.assert_bijective_and_finite(softmax, x, y)
+      assert_bijective_and_finite(softmax, x, y)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
index a7db1dcc6c9d5a5b86e4e303c616ddd0e0ae0ed9..d9af9aec50d3d69bb10f69f2ffd6ca3a24c316f8 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py
@@ -20,8 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
-from tensorflow.contrib.distributions.python.ops.bijectors import softplus as softplus_lib
+from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus
+from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite
+from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency
 from tensorflow.python.platform import test
 
 rng = np.random.RandomState(42)
@@ -40,9 +41,15 @@ class SoftplusBijectorTest(test.TestCase):
     """Inverse log det jacobian, before being reduced."""
     return -np.log(1 - np.exp(-y))
 
+  def testHingeSoftnessZeroRaises(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=0., validate_args=True)
+      with self.assertRaisesOpError("must be non-zero"):
+        bijector.forward([1., 1.]).eval()
+
   def testBijectorForwardInverseEventDimsZero(self):
     with self.test_session():
-      bijector = softplus_lib.Softplus(event_ndims=0)
+      bijector = Softplus(event_ndims=0)
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
       y = self._softplus(x)
@@ -50,9 +57,18 @@ class SoftplusBijectorTest(test.TestCase):
       self.assertAllClose(y, bijector.forward(x).eval())
       self.assertAllClose(x, bijector.inverse(y).eval())
 
+  def testBijectorForwardInverseWithHingeSoftnessEventDimsZero(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=1.5)
+      x = 2 * rng.randn(2, 10)
+      y = 1.5 * self._softplus(x / 1.5)
+
+      self.assertAllClose(y, bijector.forward(x).eval())
+      self.assertAllClose(x, bijector.inverse(y).eval())
+
   def testBijectorLogDetJacobianEventDimsZero(self):
     with self.test_session():
-      bijector = softplus_lib.Softplus(event_ndims=0)
+      bijector = Softplus(event_ndims=0)
       y = 2 * rng.rand(2, 10)
       # No reduction needed if event_dims = 0.
       ildj = self._softplus_ildj_before_reduction(y)
@@ -61,7 +77,7 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorForwardInverseEventDimsOne(self):
     with self.test_session():
-      bijector = softplus_lib.Softplus(event_ndims=1)
+      bijector = Softplus(event_ndims=1)
       self.assertEqual("softplus", bijector.name)
       x = 2 * rng.randn(2, 10)
       y = self._softplus(x)
@@ -71,7 +87,7 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testBijectorLogDetJacobianEventDimsOne(self):
     with self.test_session():
-      bijector = softplus_lib.Softplus(event_ndims=1)
+      bijector = Softplus(event_ndims=1)
       y = 2 * rng.rand(2, 10)
       ildj_before = self._softplus_ildj_before_reduction(y)
       ildj = np.sum(ildj_before, axis=1)
@@ -80,28 +96,56 @@ class SoftplusBijectorTest(test.TestCase):
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = softplus_lib.Softplus(event_ndims=0)
-      bijector_test_util.assert_scalar_congruency(
+      bijector = Softplus(event_ndims=0)
+      assert_scalar_congruency(
+          bijector, lower_x=-2., upper_x=2.)
+
+  def testScalarCongruencyWithPositiveHingeSoftness(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=1.3)
+      assert_scalar_congruency(
+          bijector, lower_x=-2., upper_x=2.)
+
+  def testScalarCongruencyWithNegativeHingeSoftness(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=-1.3)
+      assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
   def testBijectiveAndFinite32bit(self):
     with self.test_session():
-      bijector = softplus_lib.Softplus(event_ndims=0)
+      bijector = Softplus(event_ndims=0)
       x = np.linspace(-20., 20., 100).astype(np.float32)
       y = np.logspace(-10, 10, 100).astype(np.float32)
-      bijector_test_util.assert_bijective_and_finite(
+      assert_bijective_and_finite(
+          bijector, x, y, rtol=1e-2, atol=1e-2)
+
+  def testBijectiveAndFiniteWithPositiveHingeSoftness32Bit(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=1.23)
+      x = np.linspace(-20., 20., 100).astype(np.float32)
+      y = np.logspace(-10, 10, 100).astype(np.float32)
+      assert_bijective_and_finite(
+          bijector, x, y, rtol=1e-2, atol=1e-2)
+
+  def testBijectiveAndFiniteWithNegativeHingeSoftness32Bit(self):
+    with self.test_session():
+      bijector = Softplus(event_ndims=0, hinge_softness=-0.7)
+      x = np.linspace(-20., 20., 100).astype(np.float32)
+      y = -np.logspace(-10, 10, 100).astype(np.float32)
+      assert_bijective_and_finite(
           bijector, x, y, rtol=1e-2, atol=1e-2)
 
   def testBijectiveAndFinite16bit(self):
     with self.test_session():
-      bijector = softplus_lib.Softplus(event_ndims=0)
+      bijector = Softplus(event_ndims=0)
       # softplus(-20) is zero, so we can't use such a large range as in 32bit.
       x = np.linspace(-10., 20., 100).astype(np.float16)
       # Note that float16 is only in the open set (0, inf) for a smaller
       # logspace range.  The actual range was (-7, 4), so use something smaller
       # for the test.
       y = np.logspace(-6, 3, 100).astype(np.float16)
-      bijector_test_util.assert_bijective_and_finite(
+      assert_bijective_and_finite(
           bijector, x, y, rtol=1e-1, atol=1e-3)
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py
index f34d7a1fd4d22f61f0a25b168b123b6915f6bafa..d30f6e418d79f63324fd125ade1448a6007efade 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/binomial_test.py
@@ -19,7 +19,9 @@ from __future__ import print_function
 import numpy as np
 from scipy import stats
 from tensorflow.contrib.distributions.python.ops import binomial
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -69,19 +71,25 @@ class BinomialTest(test.TestCase):
       self.assertEqual((1, 3), binom.logits.get_shape())
       self.assertAllClose(logits, binom.logits.eval())
 
-  def testPmfNandCountsAgree(self):
+  def testPmfAndCdfNandCountsAgree(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
     with self.test_session():
       binom = binomial.Binomial(total_count=n, probs=p, validate_args=True)
       binom.prob([2., 3, 2]).eval()
       binom.prob([3., 1, 2]).eval()
+      binom.cdf([2., 3, 2]).eval()
+      binom.cdf([3., 1, 2]).eval()
       with self.assertRaisesOpError("Condition x >= 0.*"):
         binom.prob([-1., 4, 2]).eval()
       with self.assertRaisesOpError("Condition x <= y.*"):
         binom.prob([7., 3, 0]).eval()
+      with self.assertRaisesOpError("Condition x >= 0.*"):
+        binom.cdf([-1., 4, 2]).eval()
+      with self.assertRaisesOpError("Condition x <= y.*"):
+        binom.cdf([7., 3, 0]).eval()
 
-  def testPmfNonIntegerCounts(self):
+  def testPmfAndCdfNonIntegerCounts(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
     with self.test_session():
@@ -89,50 +97,73 @@ class BinomialTest(test.TestCase):
       binom = binomial.Binomial(total_count=n, probs=p, validate_args=True)
       binom.prob([2., 3, 2]).eval()
       binom.prob([3., 1, 2]).eval()
+      binom.cdf([2., 3, 2]).eval()
+      binom.cdf([3., 1, 2]).eval()
+      placeholder = array_ops.placeholder(dtypes.float32)
       # Both equality and integer checking fail.
       with self.assertRaisesOpError(
           "cannot contain fractional components."):
-        binom.prob([1.0, 2.5, 1.5]).eval()
+        binom.prob(placeholder).eval(feed_dict={placeholder: [1.0, 2.5, 1.5]})
+      with self.assertRaisesOpError(
+          "cannot contain fractional components."):
+        binom.cdf(placeholder).eval(feed_dict={placeholder: [1.0, 2.5, 1.5]})
 
       binom = binomial.Binomial(total_count=n, probs=p, validate_args=False)
       binom.prob([1., 2., 3.]).eval()
+      binom.cdf([1., 2., 3.]).eval()
       # Non-integer arguments work.
       binom.prob([1.0, 2.5, 1.5]).eval()
+      binom.cdf([1.0, 2.5, 1.5]).eval()
 
-  def testPmfBothZeroBatches(self):
+  def testPmfAndCdfBothZeroBatches(self):
     with self.test_session():
       # Both zero-batches.  No broadcast
       p = 0.5
       counts = 1.
-      pmf = binomial.Binomial(total_count=1., probs=p).prob(counts)
+      binom = binomial.Binomial(total_count=1., probs=p)
+      pmf = binom.prob(counts)
+      cdf = binom.cdf(counts)
       self.assertAllClose(0.5, pmf.eval())
+      self.assertAllClose(stats.binom.cdf(counts, n=1, p=p), cdf.eval())
       self.assertEqual((), pmf.get_shape())
+      self.assertEqual((), cdf.get_shape())
 
-  def testPmfBothZeroBatchesNontrivialN(self):
+  def testPmfAndCdfBothZeroBatchesNontrivialN(self):
     with self.test_session():
       # Both zero-batches.  No broadcast
       p = 0.1
       counts = 3.
       binom = binomial.Binomial(total_count=5., probs=p)
       pmf = binom.prob(counts)
+      cdf = binom.cdf(counts)
       self.assertAllClose(stats.binom.pmf(counts, n=5., p=p), pmf.eval())
+      self.assertAllClose(stats.binom.cdf(counts, n=5., p=p), cdf.eval())
       self.assertEqual((), pmf.get_shape())
+      self.assertEqual((), cdf.get_shape())
 
-  def testPmfPStretchedInBroadcastWhenSameRank(self):
+  def testPmfAndCdfPStretchedInBroadcastWhenSameRank(self):
     with self.test_session():
       p = [[0.1, 0.9]]
       counts = [[1., 2.]]
-      pmf = binomial.Binomial(total_count=3., probs=p).prob(counts)
+      binom = binomial.Binomial(total_count=3., probs=p)
+      pmf = binom.prob(counts)
+      cdf = binom.cdf(counts)
       self.assertAllClose(stats.binom.pmf(counts, n=3., p=p), pmf.eval())
+      self.assertAllClose(stats.binom.cdf(counts, n=3., p=p), cdf.eval())
       self.assertEqual((1, 2), pmf.get_shape())
+      self.assertEqual((1, 2), cdf.get_shape())
 
-  def testPmfPStretchedInBroadcastWhenLowerRank(self):
+  def testPmfAndCdfPStretchedInBroadcastWhenLowerRank(self):
     with self.test_session():
       p = [0.1, 0.4]
       counts = [[1.], [0.]]
-      pmf = binomial.Binomial(total_count=1., probs=p).prob(counts)
+      binom = binomial.Binomial(total_count=1., probs=p)
+      pmf = binom.prob(counts)
+      cdf = binom.cdf(counts)
       self.assertAllClose([[0.1, 0.4], [0.9, 0.6]], pmf.eval())
+      self.assertAllClose([[1.0, 1.0], [0.9, 0.6]], cdf.eval())
       self.assertEqual((2, 2), pmf.get_shape())
+      self.assertEqual((2, 2), cdf.get_shape())
 
   def testBinomialMean(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
index eacdfd9ccc559e5d1d884b440aa3a294fd92edd1..545471907f1eabc822b3d28ea9c57e183a09ff50 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/conditional_transformed_distribution_test.py
@@ -22,7 +22,7 @@ import numpy as np
 
 from tensorflow.contrib import distributions
 from tensorflow.contrib.distributions.python.kernel_tests import transformed_distribution_test
-from tensorflow.contrib.distributions.python.ops.bijectors import conditional_bijector
+from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector import ConditionalBijector
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -32,7 +32,7 @@ from tensorflow.python.platform import test
 ds = distributions
 
 
-class _ChooseLocation(conditional_bijector.ConditionalBijector):
+class _ChooseLocation(ConditionalBijector):
   """A Bijector which chooses between one of two location parameters."""
 
   def __init__(self, loc, name="ChooseLocation"):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index 5a4a6720f765cdce42aff74387fa4c036fd52f7e..1c67a1b8f6e7eba052555e0f930e1db80d8f1b12 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -24,9 +24,11 @@ import numpy as np
 from scipy import special
 
 from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.linalg.python.ops import linear_operator_diag
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
@@ -41,42 +43,44 @@ from tensorflow.python.platform import tf_logging as logging
 class AssertCloseTest(test.TestCase):
 
   def testAssertCloseIntegerDtype(self):
-    x = [1, 5, 10, 15, 20]
+    x = array_ops.placeholder(dtypes.int32)
     y = x
-    z = [2, 5, 10, 15, 20]
+    z = array_ops.placeholder(dtypes.int32)
+    feed_dict = {x: [1, 5, 10, 15, 20], z: [2, 5, 10, 15, 20]}
     with self.test_session():
       with ops.control_dependencies([distribution_util.assert_close(x, y)]):
-        array_ops.identity(x).eval()
+        array_ops.identity(x).eval(feed_dict=feed_dict)
 
       with ops.control_dependencies([distribution_util.assert_close(y, x)]):
-        array_ops.identity(x).eval()
+        array_ops.identity(x).eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("Condition x ~= y"):
         with ops.control_dependencies([distribution_util.assert_close(x, z)]):
-          array_ops.identity(x).eval()
+          array_ops.identity(x).eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("Condition x ~= y"):
         with ops.control_dependencies([distribution_util.assert_close(y, z)]):
-          array_ops.identity(y).eval()
+          array_ops.identity(y).eval(feed_dict=feed_dict)
 
   def testAssertCloseNonIntegerDtype(self):
-    x = np.array([1., 5, 10, 15, 20], dtype=np.float32)
+    x = array_ops.placeholder(dtypes.float32)
     y = x + 1e-8
-    z = [2., 5, 10, 15, 20]
+    z = array_ops.placeholder(dtypes.float32)
+    feed_dict = {x: [1., 5, 10, 15, 20], z: [2., 5, 10, 15, 20]}
     with self.test_session():
       with ops.control_dependencies([distribution_util.assert_close(x, y)]):
-        array_ops.identity(x).eval()
+        array_ops.identity(x).eval(feed_dict=feed_dict)
 
       with ops.control_dependencies([distribution_util.assert_close(y, x)]):
-        array_ops.identity(x).eval()
+        array_ops.identity(x).eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("Condition x ~= y"):
         with ops.control_dependencies([distribution_util.assert_close(x, z)]):
-          array_ops.identity(x).eval()
+          array_ops.identity(x).eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("Condition x ~= y"):
         with ops.control_dependencies([distribution_util.assert_close(y, z)]):
-          array_ops.identity(y).eval()
+          array_ops.identity(y).eval(feed_dict=feed_dict)
 
   def testAssertCloseEpsilon(self):
     x = [0., 5, 10, 15, 20]
@@ -98,30 +102,106 @@ class AssertCloseTest(test.TestCase):
 
   def testAssertIntegerForm(self):
     # This should only be detected as an integer.
-    x = [1., 5, 10, 15, 20]
-    y = [1.1, 5, 10, 15, 20]
+    x = array_ops.placeholder(dtypes.float32)
+    y = array_ops.placeholder(dtypes.float32)
     # First component isn't less than float32.eps = 1e-7
-    z = [1.0001, 5, 10, 15, 20]
+    z = array_ops.placeholder(dtypes.float32)
     # This shouldn"t be detected as an integer.
-    w = [1e-8, 5, 10, 15, 20]
+    w = array_ops.placeholder(dtypes.float32)
+    feed_dict = {x: [1., 5, 10, 15, 20], y: [1.1, 5, 10, 15, 20],
+                 z: [1.0001, 5, 10, 15, 20], w: [1e-8, 5, 10, 15, 20]}
     with self.test_session():
       with ops.control_dependencies([distribution_util.assert_integer_form(x)]):
-        array_ops.identity(x).eval()
+        array_ops.identity(x).eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("x has non-integer components"):
         with ops.control_dependencies(
             [distribution_util.assert_integer_form(y)]):
-          array_ops.identity(y).eval()
+          array_ops.identity(y).eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("x has non-integer components"):
         with ops.control_dependencies(
             [distribution_util.assert_integer_form(z)]):
-          array_ops.identity(z).eval()
+          array_ops.identity(z).eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("x has non-integer components"):
         with ops.control_dependencies(
             [distribution_util.assert_integer_form(w)]):
-          array_ops.identity(w).eval()
+          array_ops.identity(w).eval(feed_dict=feed_dict)
+
+
+class ShapesFromLocAndScaleTest(test.TestCase):
+
+  def test_static_loc_static_scale_non_matching_event_size_raises(self):
+    loc = constant_op.constant(np.zeros((2, 4)))
+    scale = linear_operator_diag.LinearOperatorDiag(np.ones((5, 1, 3)))
+    with self.assertRaisesRegexp(ValueError, "could not be broadcast"):
+      distribution_util.shapes_from_loc_and_scale(loc, scale)
+
+  def test_static_loc_static_scale(self):
+    loc = constant_op.constant(np.zeros((2, 3)))
+    scale = linear_operator_diag.LinearOperatorDiag(np.ones((5, 1, 3)))
+    batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+        loc, scale)
+
+    self.assertEqual(tensor_shape.TensorShape([5, 2]), batch_shape)
+    self.assertEqual(tensor_shape.TensorShape([3]), event_shape)
+
+  def test_static_loc_dynamic_scale(self):
+    loc = constant_op.constant(np.zeros((2, 3)))
+    diag = array_ops.placeholder(dtypes.float64)
+    scale = linear_operator_diag.LinearOperatorDiag(diag)
+    with self.test_session() as sess:
+      batch_shape, event_shape = sess.run(
+          distribution_util.shapes_from_loc_and_scale(loc, scale),
+          feed_dict={diag: np.ones((5, 1, 3))})
+      self.assertAllEqual([5, 2], batch_shape)
+      self.assertAllEqual([3], event_shape)
+
+  def test_dynamic_loc_static_scale(self):
+    loc = array_ops.placeholder(dtypes.float64)
+    diag = constant_op.constant(np.ones((5, 2, 3)))
+    scale = linear_operator_diag.LinearOperatorDiag(diag)
+    with self.test_session():
+      batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+          loc, scale)
+      # batch_shape depends on both args, and so is dynamic.  Since loc did not
+      # have static shape, we inferred event shape entirely from scale, and this
+      # is available statically.
+      self.assertAllEqual(
+          [5, 2], batch_shape.eval(feed_dict={loc: np.zeros((2, 3))}))
+      self.assertAllEqual([3], event_shape)
+
+  def test_dynamic_loc_dynamic_scale(self):
+    loc = array_ops.placeholder(dtypes.float64)
+    diag = array_ops.placeholder(dtypes.float64)
+    scale = linear_operator_diag.LinearOperatorDiag(diag)
+    with self.test_session() as sess:
+      batch_shape, event_shape = sess.run(
+          distribution_util.shapes_from_loc_and_scale(loc, scale),
+          feed_dict={diag: np.ones((5, 2, 3)), loc: np.zeros((2, 3))})
+      self.assertAllEqual([5, 2], batch_shape)
+      self.assertAllEqual([3], event_shape)
+
+  def test_none_loc_static_scale(self):
+    loc = None
+    scale = linear_operator_diag.LinearOperatorDiag(np.ones((5, 1, 3)))
+    batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+        loc, scale)
+
+    self.assertEqual(tensor_shape.TensorShape([5, 1]), batch_shape)
+    self.assertEqual(tensor_shape.TensorShape([3]), event_shape)
+
+  def test_none_loc_dynamic_scale(self):
+    loc = None
+    diag = array_ops.placeholder(dtypes.float64)
+    scale = linear_operator_diag.LinearOperatorDiag(diag)
+    with self.test_session() as sess:
+      batch_shape, event_shape = sess.run(
+          distribution_util.shapes_from_loc_and_scale(loc, scale),
+          feed_dict={diag: np.ones((5, 1, 3))})
+      self.assertAllEqual([5, 1], batch_shape)
+      self.assertAllEqual([3], event_shape)
 
 
 class GetLogitsAndProbsTest(test.TestCase):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py b/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
index 3dbad7b607387c240cc57a437afe5e73d2aea6be..9ef68c4c2cbdbfab48602d2fd98fe30acede06f3 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/geometric_test.py
@@ -22,7 +22,9 @@ import numpy as np
 from scipy import stats
 from tensorflow.contrib.distributions.python.ops import geometric
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -74,12 +76,13 @@ class GeometricTest(test.TestCase):
     with self.test_session():
       batch_size = 6
       probs = constant_op.constant([.9] * batch_size)
-      x = np.array([2.5, 3.2, 4.3, 5.1, 6., 7.], dtype=np.float32)
+      x = array_ops.placeholder(dtypes.float32, shape=[6])
+      feed_dict = {x: [2.5, 3.2, 4.3, 5.1, 6., 7.]}
       geom = geometric.Geometric(probs=probs)
 
       with self.assertRaisesOpError("Condition x == y"):
         log_prob = geom.log_prob(x)
-        log_prob.eval()
+        log_prob.eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("Condition x >= 0"):
         log_prob = geom.log_prob(np.array([-1.], dtype=np.float32))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py b/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
index 7d7560c3f5954bf585e6fcb0f7df9a95c0d60f7a..eb9028e5df0af5e3f6a2adb719fc0200dc65f01c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/logistic_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 from scipy import stats
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.contrib.distributions.python.ops import logistic
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.platform import test
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
index e8ef52547db83c9d7c0257f718f3925c7454bcb9..aa523a95118e2dbddd3b0d7005a0b26e510e6a77 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import contextlib
 
 import numpy as np
+from scipy import stats
+
 from tensorflow.contrib import distributions
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -523,6 +525,104 @@ class MixtureTest(test.TestCase):
 
         self.assertAllClose(true_entropy_lower_bound, entropy_lower_bound_value)
 
+  def testCdfScalarUnivariate(self):
+    """Tests CDF against scipy for a mixture of seven gaussians."""
+    # Construct a mixture of gaussians with seven components.
+    n_components = 7
+
+    # pre-softmax mixture probabilities.
+    mixture_weight_logits = np.random.uniform(
+        low=-1, high=1, size=(n_components,)).astype(np.float32)
+
+    def _scalar_univariate_softmax(x):
+      e_x = np.exp(x - np.max(x))
+      return e_x / e_x.sum()
+
+    # Construct the distributions_py.Mixture object.
+    mixture_weights = _scalar_univariate_softmax(mixture_weight_logits)
+    means = [np.random.uniform(low=-10, high=10, size=()).astype(np.float32)
+             for _ in range(n_components)]
+    sigmas = [np.ones(shape=(), dtype=np.float32) for _ in range(n_components)]
+    cat_tf = distributions_py.Categorical(probs=mixture_weights)
+    components_tf = [distributions_py.Normal(loc=mu, scale=sigma)
+                     for (mu, sigma) in zip(means, sigmas)]
+    mixture_tf = distributions_py.Mixture(cat=cat_tf, components=components_tf)
+
+    x_tensor = array_ops.placeholder(shape=(), dtype=dtypes.float32)
+
+    # These are two test cases to verify.
+    xs_to_check = [
+        np.array(1.0, dtype=np.float32),
+        np.array(np.random.randn()).astype(np.float32)
+    ]
+
+    # Carry out the test for both d.cdf and exp(d.log_cdf).
+    x_cdf_tf = mixture_tf.cdf(x_tensor)
+    x_log_cdf_tf = mixture_tf.log_cdf(x_tensor)
+
+    with self.test_session() as sess:
+      for x_feed in xs_to_check:
+        x_cdf_tf_result, x_log_cdf_tf_result = sess.run(
+            [x_cdf_tf, x_log_cdf_tf], feed_dict={x_tensor: x_feed})
+
+        # Compute the cdf with scipy.
+        scipy_component_cdfs = [stats.norm.cdf(x=x_feed, loc=mu, scale=sigma)
+                                for (mu, sigma) in zip(means, sigmas)]
+        scipy_cdf_result = np.dot(mixture_weights,
+                                  np.array(scipy_component_cdfs))
+        self.assertAllClose(x_cdf_tf_result, scipy_cdf_result)
+        self.assertAllClose(np.exp(x_log_cdf_tf_result), scipy_cdf_result)
+
+  def testCdfBatchUnivariate(self):
+    """Tests against scipy for a (batch of) mixture(s) of seven gaussians."""
+    n_components = 7
+    batch_size = 5
+    mixture_weight_logits = np.random.uniform(
+        low=-1, high=1, size=(batch_size, n_components)).astype(np.float32)
+
+    def _batch_univariate_softmax(x):
+      e_x = np.exp(x)
+      e_x_sum = np.expand_dims(np.sum(e_x, axis=1), axis=1)
+      return e_x / np.tile(e_x_sum, reps=[1, x.shape[1]])
+
+    psize = (batch_size,)
+    mixture_weights = _batch_univariate_softmax(mixture_weight_logits)
+    means = [np.random.uniform(low=-10, high=10, size=psize).astype(np.float32)
+             for _ in range(n_components)]
+    sigmas = [np.ones(shape=psize, dtype=np.float32)
+              for _ in range(n_components)]
+    cat_tf = distributions_py.Categorical(probs=mixture_weights)
+    components_tf = [distributions_py.Normal(loc=mu, scale=sigma)
+                     for (mu, sigma) in zip(means, sigmas)]
+    mixture_tf = distributions_py.Mixture(cat=cat_tf, components=components_tf)
+
+    x_tensor = array_ops.placeholder(shape=psize, dtype=dtypes.float32)
+    xs_to_check = [
+        np.array([1.0, 5.9, -3, 0.0, 0.0], dtype=np.float32),
+        np.random.randn(batch_size).astype(np.float32)
+    ]
+
+    x_cdf_tf = mixture_tf.cdf(x_tensor)
+    x_log_cdf_tf = mixture_tf.log_cdf(x_tensor)
+
+    with self.test_session() as sess:
+      for x_feed in xs_to_check:
+        x_cdf_tf_result, x_log_cdf_tf_result = sess.run(
+            [x_cdf_tf, x_log_cdf_tf],
+            feed_dict={x_tensor: x_feed})
+
+        # Compute the cdf with scipy.
+        scipy_component_cdfs = [stats.norm.cdf(x=x_feed, loc=mu, scale=sigma)
+                                for (mu, sigma) in zip(means, sigmas)]
+        weights_and_cdfs = zip(np.transpose(mixture_weights, axes=[1, 0]),
+                               scipy_component_cdfs)
+        final_cdf_probs_per_component = [
+            np.multiply(c_p_value, d_cdf_value)
+            for (c_p_value, d_cdf_value) in weights_and_cdfs]
+        scipy_cdf_result = np.sum(final_cdf_probs_per_component, axis=0)
+        self.assertAllClose(x_cdf_tf_result, scipy_cdf_result)
+        self.assertAllClose(np.exp(x_log_cdf_tf_result), scipy_cdf_result)
+
 
 class MixtureBenchmark(test.Benchmark):
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
index 29cb3eb9b0d7636e16d39493d896f35af455d872..a924d2e383419702471609e14e49f7e52ea34ad9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_plus_low_rank_test.py
@@ -145,8 +145,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
     true_covariance = np.matmul(true_scale, true_scale.T)
     true_variance = np.diag(true_covariance)
     true_stddev = np.sqrt(true_variance)
-    true_det_covariance = np.linalg.det(true_covariance)
-    true_log_det_covariance = np.log(true_det_covariance)
 
     with self.test_session() as sess:
       dist = ds.MultivariateNormalDiagPlusLowRank(
@@ -185,19 +183,19 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
 
       sample_kl_identity = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_identity.log_prob(samps), 0)
-      analytical_kl_identity = ds.kl(dist, mvn_identity)
+      analytical_kl_identity = ds.kl_divergence(dist, mvn_identity)
 
       sample_kl_scaled = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_scaled.log_prob(samps), 0)
-      analytical_kl_scaled = ds.kl(dist, mvn_scaled)
+      analytical_kl_scaled = ds.kl_divergence(dist, mvn_scaled)
 
       sample_kl_diag = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_diag.log_prob(samps), 0)
-      analytical_kl_diag = ds.kl(dist, mvn_diag)
+      analytical_kl_diag = ds.kl_divergence(dist, mvn_diag)
 
       sample_kl_chol = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_chol.log_prob(samps), 0)
-      analytical_kl_chol = ds.kl(dist, mvn_chol)
+      analytical_kl_chol = ds.kl_divergence(dist, mvn_chol)
 
       n = int(10e3)
       baseline = ds.MultivariateNormalDiag(
@@ -208,19 +206,21 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
 
       sample_kl_identity_diag_baseline = math_ops.reduce_mean(
           baseline.log_prob(samps) - mvn_identity.log_prob(samps), 0)
-      analytical_kl_identity_diag_baseline = ds.kl(baseline, mvn_identity)
+      analytical_kl_identity_diag_baseline = ds.kl_divergence(
+          baseline, mvn_identity)
 
       sample_kl_scaled_diag_baseline = math_ops.reduce_mean(
           baseline.log_prob(samps) - mvn_scaled.log_prob(samps), 0)
-      analytical_kl_scaled_diag_baseline = ds.kl(baseline, mvn_scaled)
+      analytical_kl_scaled_diag_baseline = ds.kl_divergence(
+          baseline, mvn_scaled)
 
       sample_kl_diag_diag_baseline = math_ops.reduce_mean(
           baseline.log_prob(samps) - mvn_diag.log_prob(samps), 0)
-      analytical_kl_diag_diag_baseline = ds.kl(baseline, mvn_diag)
+      analytical_kl_diag_diag_baseline = ds.kl_divergence(baseline, mvn_diag)
 
       sample_kl_chol_diag_baseline = math_ops.reduce_mean(
           baseline.log_prob(samps) - mvn_chol.log_prob(samps), 0)
-      analytical_kl_chol_diag_baseline = ds.kl(baseline, mvn_chol)
+      analytical_kl_chol_diag_baseline = ds.kl_divergence(baseline, mvn_chol)
 
       [
           sample_mean_,
@@ -229,8 +229,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
           analytical_covariance_,
           analytical_variance_,
           analytical_stddev_,
-          analytical_log_det_covariance_,
-          analytical_det_covariance_,
           scale_,
           sample_kl_identity_, analytical_kl_identity_,
           sample_kl_scaled_, analytical_kl_scaled_,
@@ -248,8 +246,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
           dist.covariance(),
           dist.variance(),
           dist.stddev(),
-          dist.log_det_covariance(),
-          dist.det_covariance(),
           scale,
           sample_kl_identity, analytical_kl_identity,
           sample_kl_scaled, analytical_kl_scaled,
@@ -264,8 +260,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
 
       sample_variance_ = np.diag(sample_covariance_)
       sample_stddev_ = np.sqrt(sample_variance_)
-      sample_det_covariance_ = np.linalg.det(sample_covariance_)
-      sample_log_det_covariance_ = np.log(sample_det_covariance_)
 
       logging.vlog(2, "true_mean:\n{}  ".format(true_mean))
       logging.vlog(2, "sample_mean:\n{}".format(sample_mean_))
@@ -284,20 +278,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
       logging.vlog(2, "sample_stddev:\n{}".format(sample_stddev_))
       logging.vlog(2, "analytical_stddev:\n{}".format(analytical_stddev_))
 
-      logging.vlog(2, "true_log_det_covariance:\n{}".format(
-          true_log_det_covariance))
-      logging.vlog(2, "sample_log_det_covariance:\n{}".format(
-          sample_log_det_covariance_))
-      logging.vlog(2, "analytical_log_det_covariance:\n{}".format(
-          analytical_log_det_covariance_))
-
-      logging.vlog(2, "true_det_covariance:\n{}".format(
-          true_det_covariance))
-      logging.vlog(2, "sample_det_covariance:\n{}".format(
-          sample_det_covariance_))
-      logging.vlog(2, "analytical_det_covariance:\n{}".format(
-          analytical_det_covariance_))
-
       logging.vlog(2, "true_scale:\n{}".format(true_scale))
       logging.vlog(2, "scale:\n{}".format(scale_))
 
@@ -351,17 +331,6 @@ class MultivariateNormalDiagPlusLowRankTest(test.TestCase):
       self.assertAllClose(true_stddev, analytical_stddev_,
                           atol=0., rtol=1e-6)
 
-      self.assertAllClose(true_log_det_covariance, sample_log_det_covariance_,
-                          atol=0., rtol=0.02)
-      self.assertAllClose(true_log_det_covariance,
-                          analytical_log_det_covariance_,
-                          atol=0., rtol=1e-6)
-
-      self.assertAllClose(true_det_covariance, sample_det_covariance_,
-                          atol=0., rtol=0.02)
-      self.assertAllClose(true_det_covariance, analytical_det_covariance_,
-                          atol=0., rtol=1e-5)
-
       self.assertAllClose(true_scale, scale_,
                           atol=0., rtol=1e-6)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
index aa9d45f151deb07cb7fc001a94d255f3de8d5e3b..3f4582eb7ee1319684a9209465046bb241337f9d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_diag_test.py
@@ -103,6 +103,39 @@ class MultivariateNormalDiagTest(test.TestCase):
       self.assertAllClose(cov_mat, np.cov(samps.T),
                           atol=0.05, rtol=0.05)
 
+  def testSingularScaleRaises(self):
+    mu = [-1., 1]
+    diag = [1., 0]
+    with self.test_session():
+      dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
+      with self.assertRaisesOpError("Singular"):
+        dist.sample().eval()
+
+  def testSampleWithBroadcastScale(self):
+    # mu corresponds to a 2-batch of 3-variate normals
+    mu = np.zeros([2, 3])
+
+    # diag corresponds to no batches of 3-variate normals
+    diag = np.ones([3])
+
+    with self.test_session():
+      dist = ds.MultivariateNormalDiag(mu, diag, validate_args=True)
+
+      mean = dist.mean()
+      self.assertAllEqual([2, 3], mean.get_shape())
+      self.assertAllClose(mu, mean.eval())
+
+      n = int(1e3)
+      samps = dist.sample(n, seed=0).eval()
+      cov_mat = array_ops.matrix_diag(diag).eval()**2
+      sample_cov = np.matmul(samps.transpose([1, 2, 0]),
+                             samps.transpose([1, 0, 2])) / n
+
+      self.assertAllClose(mu, samps.mean(axis=0),
+                          atol=0.10, rtol=0.05)
+      self.assertAllClose([cov_mat, cov_mat], sample_cov,
+                          atol=0.10, rtol=0.05)
+
   def testCovariance(self):
     with self.test_session():
       mvn = ds.MultivariateNormalDiag(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dfab46ebe47b7ed6bccda59fd2f3f9cfd438479
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py
@@ -0,0 +1,169 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MultivariateNormalFullCovariance."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy import stats
+from tensorflow.contrib import distributions
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+ds = distributions
+rng = np.random.RandomState(42)
+
+
+class MultivariateNormalFullCovarianceTest(test.TestCase):
+
+  def _random_pd_matrix(self, *shape):
+    mat = rng.rand(*shape)
+    chol = ds.matrix_diag_transform(mat, transform=nn_ops.softplus)
+    chol = array_ops.matrix_band_part(chol, -1, 0)
+    return math_ops.matmul(chol, chol, adjoint_b=True).eval()
+
+  def testRaisesIfInitializedWithNonSymmetricMatrix(self):
+    with self.test_session():
+      mu = [1., 2.]
+      sigma = [[1., 0.], [1., 1.]]  # Nonsingular, but not symmetric
+      mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
+      with self.assertRaisesOpError("not symmetric"):
+        mvn.covariance().eval()
+
+  def testDoesNotRaiseIfInitializedWithSymmetricMatrix(self):
+    with self.test_session():
+      mu = rng.rand(10)
+      sigma = self._random_pd_matrix(10, 10)
+      mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
+      # Should not raise
+      mvn.covariance().eval()
+
+  def testLogPDFScalarBatch(self):
+    with self.test_session():
+      mu = rng.rand(2)
+      sigma = self._random_pd_matrix(2, 2)
+      mvn = ds.MultivariateNormalFullCovariance(mu, sigma, validate_args=True)
+      x = rng.rand(2)
+
+      log_pdf = mvn.log_prob(x)
+      pdf = mvn.prob(x)
+
+      scipy_mvn = stats.multivariate_normal(mean=mu, cov=sigma)
+
+      expected_log_pdf = scipy_mvn.logpdf(x)
+      expected_pdf = scipy_mvn.pdf(x)
+      self.assertEqual((), log_pdf.get_shape())
+      self.assertEqual((), pdf.get_shape())
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(expected_pdf, pdf.eval())
+
+  def testLogPDFScalarBatchCovarianceNotProvided(self):
+    with self.test_session():
+      mu = rng.rand(2)
+      mvn = ds.MultivariateNormalFullCovariance(
+          mu, covariance_matrix=None, validate_args=True)
+      x = rng.rand(2)
+
+      log_pdf = mvn.log_prob(x)
+      pdf = mvn.prob(x)
+
+      # Initialize a scipy_mvn with the default covariance.
+      scipy_mvn = stats.multivariate_normal(mean=mu, cov=np.eye(2))
+
+      expected_log_pdf = scipy_mvn.logpdf(x)
+      expected_pdf = scipy_mvn.pdf(x)
+      self.assertEqual((), log_pdf.get_shape())
+      self.assertEqual((), pdf.get_shape())
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(expected_pdf, pdf.eval())
+
+  def testShapes(self):
+    with self.test_session():
+      mu = rng.rand(3, 5, 2)
+      covariance = self._random_pd_matrix(3, 5, 2, 2)
+
+      mvn = ds.MultivariateNormalFullCovariance(
+          mu, covariance, validate_args=True)
+
+      # Shapes known at graph construction time.
+      self.assertEqual((2,), tuple(mvn.event_shape.as_list()))
+      self.assertEqual((3, 5), tuple(mvn.batch_shape.as_list()))
+
+      # Shapes known at runtime.
+      self.assertEqual((2,), tuple(mvn.event_shape_tensor().eval()))
+      self.assertEqual((3, 5), tuple(mvn.batch_shape_tensor().eval()))
+
+  def _random_mu_and_sigma(self, batch_shape, event_shape):
+    # This ensures sigma is positive def.
+    mat_shape = batch_shape + event_shape + event_shape
+    mat = rng.randn(*mat_shape)
+    perm = np.arange(mat.ndim)
+    perm[-2:] = [perm[-1], perm[-2]]
+    sigma = np.matmul(mat, np.transpose(mat, perm))
+
+    mu_shape = batch_shape + event_shape
+    mu = rng.randn(*mu_shape)
+
+    return mu, sigma
+
+  def testKLBatch(self):
+    batch_shape = (2,)
+    event_shape = (3,)
+    with self.test_session():
+      mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape)
+      mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape)
+      mvn_a = ds.MultivariateNormalFullCovariance(
+          loc=mu_a,
+          covariance_matrix=sigma_a,
+          validate_args=True)
+      mvn_b = ds.MultivariateNormalFullCovariance(
+          loc=mu_b,
+          covariance_matrix=sigma_b,
+          validate_args=True)
+
+      kl = ds.kl_divergence(mvn_a, mvn_b)
+      self.assertEqual(batch_shape, kl.get_shape())
+
+      kl_v = kl.eval()
+      expected_kl_0 = _compute_non_batch_kl(mu_a[0, :], sigma_a[0, :, :],
+                                            mu_b[0, :], sigma_b[0, :])
+      expected_kl_1 = _compute_non_batch_kl(mu_a[1, :], sigma_a[1, :, :],
+                                            mu_b[1, :], sigma_b[1, :])
+      self.assertAllClose(expected_kl_0, kl_v[0])
+      self.assertAllClose(expected_kl_1, kl_v[1])
+
+
+def _compute_non_batch_kl(mu_a, sigma_a, mu_b, sigma_b):
+  """Non-batch KL for N(mu_a, sigma_a), N(mu_b, sigma_b)."""
+  # Check using numpy operations
+  # This mostly repeats the tensorflow code _kl_mvn_mvn(), but in numpy.
+  # So it is important to also check that KL(mvn, mvn) = 0.
+  sigma_b_inv = np.linalg.inv(sigma_b)
+
+  t = np.trace(sigma_b_inv.dot(sigma_a))
+  q = (mu_b - mu_a).dot(sigma_b_inv).dot(mu_b - mu_a)
+  k = mu_a.shape[0]
+  l = np.log(np.linalg.det(sigma_b) / np.linalg.det(sigma_a))
+
+  return 0.5 * (t + q - k + l)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
index dd7283bb57ff41f580a0f400cc225722da574e3f..685f32883dae5b8513badeb05e1508cd611d6e93 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py
@@ -151,6 +151,14 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(sample_values.mean(axis=0), mu, atol=1e-2)
       self.assertAllClose(np.cov(sample_values, rowvar=0), sigma, atol=0.06)
 
+  def testSingularScaleRaises(self):
+    with self.test_session():
+      mu = None
+      chol = [[1., 0.], [0., 0.]]
+      mvn = ds.MultivariateNormalTriL(mu, chol, validate_args=True)
+      with self.assertRaisesOpError("Singular operator"):
+        mvn.sample().eval()
+
   def testSampleWithSampleShape(self):
     with self.test_session():
       mu = self._rng.rand(3, 5, 2)
@@ -241,7 +249,7 @@ class MultivariateNormalTriLTest(test.TestCase):
           scale_tril=np.linalg.cholesky(sigma_b),
           validate_args=True)
 
-      kl = ds.kl(mvn_a, mvn_b)
+      kl = ds.kl_divergence(mvn_a, mvn_b)
       self.assertEqual(batch_shape, kl.get_shape())
 
       kl_v = kl.eval()
@@ -263,7 +271,7 @@ class MultivariateNormalTriLTest(test.TestCase):
           scale_tril=np.linalg.cholesky(sigma_b),
           validate_args=True)
 
-      kl = ds.kl(mvn_a, mvn_b)
+      kl = ds.kl_divergence(mvn_a, mvn_b)
       self.assertEqual(batch_shape, kl.get_shape())
 
       kl_v = kl.eval()
@@ -285,7 +293,7 @@ class MultivariateNormalTriLTest(test.TestCase):
           validate_args=True)
 
       # Should be zero since KL(p || p) = =.
-      kl = ds.kl(mvn_a, mvn_a)
+      kl = ds.kl_divergence(mvn_a, mvn_a)
       self.assertEqual(batch_shape, kl.get_shape())
 
       kl_v = kl.eval()
@@ -300,8 +308,6 @@ class MultivariateNormalTriLTest(test.TestCase):
     true_covariance = np.matmul(true_scale, true_scale.T)
     true_variance = np.diag(true_covariance)
     true_stddev = np.sqrt(true_variance)
-    true_det_covariance = np.linalg.det(true_covariance)
-    true_log_det_covariance = np.log(true_det_covariance)
 
     with self.test_session() as sess:
       dist = ds.MultivariateNormalTriL(
@@ -323,7 +329,7 @@ class MultivariateNormalTriLTest(test.TestCase):
 
       sample_kl_chol = math_ops.reduce_mean(
           dist.log_prob(samps) - mvn_chol.log_prob(samps), 0)
-      analytical_kl_chol = ds.kl(dist, mvn_chol)
+      analytical_kl_chol = ds.kl_divergence(dist, mvn_chol)
 
       scale = dist.scale.to_dense()
 
@@ -334,8 +340,6 @@ class MultivariateNormalTriLTest(test.TestCase):
           analytical_covariance_,
           analytical_variance_,
           analytical_stddev_,
-          analytical_log_det_covariance_,
-          analytical_det_covariance_,
           sample_kl_chol_, analytical_kl_chol_,
           scale_,
       ] = sess.run([
@@ -345,16 +349,12 @@ class MultivariateNormalTriLTest(test.TestCase):
           dist.covariance(),
           dist.variance(),
           dist.stddev(),
-          dist.log_det_covariance(),
-          dist.det_covariance(),
           sample_kl_chol, analytical_kl_chol,
           scale,
       ])
 
       sample_variance_ = np.diag(sample_covariance_)
       sample_stddev_ = np.sqrt(sample_variance_)
-      sample_det_covariance_ = np.linalg.det(sample_covariance_)
-      sample_log_det_covariance_ = np.log(sample_det_covariance_)
 
       logging.vlog(2, "true_mean:\n{}  ".format(true_mean))
       logging.vlog(2, "sample_mean:\n{}".format(sample_mean_))
@@ -373,21 +373,6 @@ class MultivariateNormalTriLTest(test.TestCase):
       logging.vlog(2, "sample_stddev:\n{}".format(sample_stddev_))
       logging.vlog(2, "analytical_stddev:\n{}".format(analytical_stddev_))
 
-      logging.vlog(
-          2, "true_log_det_covariance:\n{}".format(true_log_det_covariance))
-      logging.vlog(
-          2, "sample_log_det_covariance:\n{}".format(
-              sample_log_det_covariance_))
-      logging.vlog(2, "analytical_log_det_covariance:\n{}".format(
-          analytical_log_det_covariance_))
-
-      logging.vlog(2, "true_det_covariance:\n{}".format(true_det_covariance))
-      logging.vlog(
-          2, "sample_det_covariance:\n{}".format(sample_det_covariance_))
-      logging.vlog(
-          2, "analytical_det_covariance:\n{}".format(
-              analytical_det_covariance_))
-
       logging.vlog(2, "true_scale:\n{}".format(true_scale))
       logging.vlog(2, "scale:\n{}".format(scale_))
 
@@ -414,17 +399,6 @@ class MultivariateNormalTriLTest(test.TestCase):
       self.assertAllClose(true_stddev, analytical_stddev_,
                           atol=0., rtol=1e-6)
 
-      self.assertAllClose(true_log_det_covariance, sample_log_det_covariance_,
-                          atol=0., rtol=0.04)
-      self.assertAllClose(true_log_det_covariance,
-                          analytical_log_det_covariance_,
-                          atol=0., rtol=1e-6)
-
-      self.assertAllClose(true_det_covariance, sample_det_covariance_,
-                          atol=0., rtol=0.03)
-      self.assertAllClose(true_det_covariance, analytical_det_covariance_,
-                          atol=0., rtol=1e-6)
-
       self.assertAllClose(true_scale, scale_,
                           atol=0., rtol=1e-6)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py b/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py
index f55de9939619a0ce29855e45300ea7bdaf6b6bbc..c1a74c6483b9843c609ac94054a8c27476f7d7ff 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/negative_binomial_test.py
@@ -21,6 +21,7 @@ import numpy as np
 from scipy import stats
 from tensorflow.contrib.distributions.python.ops import negative_binomial
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -140,17 +141,18 @@ class NegativeBinomialTest(test.TestCase):
       batch_size = 6
       probs = [.9] * batch_size
       total_count = 5.
-      x = np.array([2.5, 3.2, 4.3, 5.1, 6., 7.], dtype=np.float32)
+      x = array_ops.placeholder(dtypes.float32, shape=[6])
+      feed_dict = {x: [2.5, 3.2, 4.3, 5.1, 6., 7.]}
       negbinom = negative_binomial.NegativeBinomial(
           total_count=total_count, probs=probs, validate_args=True)
 
       with self.assertRaisesOpError("Condition x == y"):
         log_pmf = negbinom.log_prob(x)
-        log_pmf.eval()
+        log_pmf.eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("Condition x >= 0"):
         log_pmf = negbinom.log_prob([-1.])
-        log_pmf.eval()
+        log_pmf.eval(feed_dict=feed_dict)
 
       negbinom = negative_binomial.NegativeBinomial(
           total_count=total_count, probs=probs, validate_args=False)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py b/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
index 56ad4a081bc2306e6d2e147caf4feda80292d988..111f88eeb50fa9ef134dbe30d4a0be0eec7a0d26 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/onehot_categorical_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.contrib.distributions.python.ops import onehot_categorical
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -27,6 +26,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
 
 
@@ -178,8 +178,8 @@ class OneHotCategoricalTest(test.TestCase):
           kl_expected = np.sum(
               prob_p * (np.log(prob_p) - np.log(prob_q)), axis=-1)
 
-          kl_actual = kullback_leibler.kl(p, q)
-          kl_same = kullback_leibler.kl(p, p)
+          kl_actual = kullback_leibler.kl_divergence(p, q)
+          kl_same = kullback_leibler.kl_divergence(p, p)
           x = p.sample(int(2e4), seed=0)
           x = math_ops.cast(x, dtype=dtypes.float32)
           # Compute empirical KL(p||q).
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
index 49ece78b0d2e169ce5d73260e9220e0277a305fb..6549992633dcc384f26950f4c80ade60f337b78d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_cholesky_test.py
@@ -19,16 +19,14 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib import distributions
 from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.platform import test
 
-distributions = distributions_lib
-
 
 def softplus(x):
   return np.log(1 + np.exp(x))
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py
index dd59c649e1012a9e19be6e852381aff8f51e69ed..35a7c7e60392347fa47470ce5c57d1056cab9c76 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/operator_pd_full_test.py
@@ -47,7 +47,7 @@ class OperatorPDFullTest(test.TestCase):
       operator = operator_pd_full.OperatorPDFull(matrix, verify_pd=True)
       # Could fail inside Cholesky decomposition, or later when we test the
       # diag.
-      with self.assertRaisesOpError("x > 0|LLT"):
+      with self.assertRaisesOpError("x > 0|Cholesky"):
         operator.to_dense().eval()
 
   def testNonSymmetricMatrixRaises(self):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py b/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
index b1a9478b4361e0405f921a0635e259a2629f973e..f157c0d3edd6e56083b7914d89dcd1e5b9420f78 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/poisson_test.py
@@ -21,7 +21,9 @@ import numpy as np
 from scipy import stats
 from tensorflow.contrib.distributions.python.ops import poisson as poisson_lib
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -64,17 +66,18 @@ class PoissonTest(test.TestCase):
     with self.test_session():
       batch_size = 6
       lam = constant_op.constant([3.0] * batch_size)
-      x = [2.5, 3.2, 4.3, 5.1, 6., 7.]
+      x = array_ops.placeholder(dtypes.float32, shape=[6])
+      feed_dict = {x: [2.5, 3.2, 4.3, 5.1, 6., 7.]}
       poisson = poisson_lib.Poisson(rate=lam, validate_args=True)
 
       # Non-integer
       with self.assertRaisesOpError("cannot contain fractional components"):
         log_pmf = poisson.log_prob(x)
-        log_pmf.eval()
+        log_pmf.eval(feed_dict=feed_dict)
 
       with self.assertRaisesOpError("Condition x >= 0"):
         log_pmf = poisson.log_prob([-1.])
-        log_pmf.eval()
+        log_pmf.eval(feed_dict=feed_dict)
 
       poisson = poisson_lib.Poisson(rate=lam, validate_args=False)
       log_pmf = poisson.log_prob(x)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
index 0e2d1437323590ed115e92fc3134f36a1c2d593f..6a7ee3a8bfab40eab199f52b86d94f9e879c5872 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py
@@ -373,15 +373,16 @@ class QuantizedDistributionTest(test.TestCase):
 
   def testCutoffsMustBeIntegerValuedIfValidateArgsTrue(self):
     with self.test_session():
+      low = array_ops.placeholder(dtypes.float32)
       qdist = distributions.QuantizedDistribution(
           distribution=distributions.Normal(loc=0., scale=1.),
-          low=1.5,
+          low=low,
           high=10.,
           validate_args=True)
 
       self.assertTrue(qdist.validate_args)  # Default is True.
       with self.assertRaisesOpError("has non-integer components"):
-        qdist.sample().eval()
+        qdist.sample().eval(feed_dict={low: 1.5})
 
   def testCutoffsCanBeFloatValuedIfValidateArgsFalse(self):
     with self.test_session():
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c355adeedbfff1072281a81de726ddb0ece07882
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_laplace_diag_test.py
@@ -0,0 +1,215 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for VectorLaplaceLinearOperator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.contrib import distributions
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+ds = distributions
+
+
+class VectorLaplaceDiagTest(test.TestCase):
+  """Well tested because this is a simple override of the base class."""
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  def testScalarParams(self):
+    mu = -1.
+    diag = -5.
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, "at least 1 dimension"):
+        ds.VectorLaplaceDiag(mu, diag)
+
+  def testVectorParams(self):
+    mu = [-1.]
+    diag = [-5.]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      self.assertAllEqual([3, 1], dist.sample(3).get_shape())
+
+  def testDistWithBatchShapeOneThenTransformedThroughSoftplus(self):
+    # This complex combination of events resulted in a loss of static shape
+    # information when tensor_util.constant_value(self._needs_rotation) was
+    # being used incorrectly (resulting in always rotating).
+    # Batch shape = [1], event shape = [3]
+    mu = array_ops.zeros((1, 3))
+    diag = array_ops.ones((1, 3))
+    with self.test_session():
+      base_dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      dist = ds.TransformedDistribution(
+          base_dist,
+          validate_args=True,
+          bijector=bijectors.Softplus(event_ndims=1))
+      samps = dist.sample(5)  # Shape [5, 1, 3].
+      self.assertAllEqual([5, 1], dist.log_prob(samps).get_shape())
+
+  def testMean(self):
+    mu = [-1., 1]
+    diag = [1., -5]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      self.assertAllEqual(mu, dist.mean().eval())
+
+  def testMeanWithBroadcastLoc(self):
+    mu = [-1.]
+    diag = [1., -5]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      self.assertAllEqual([-1., -1.], dist.mean().eval())
+
+  def testSample(self):
+    mu = [-1., 1]
+    diag = [1., -2]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      samps = dist.sample(int(1e4), seed=0).eval()
+      cov_mat = 2. * array_ops.matrix_diag(diag).eval()**2
+
+      self.assertAllClose(mu, samps.mean(axis=0),
+                          atol=0., rtol=0.05)
+      self.assertAllClose(cov_mat, np.cov(samps.T),
+                          atol=0.05, rtol=0.05)
+
+  def testSingularScaleRaises(self):
+    mu = [-1., 1]
+    diag = [1., 0]
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+      with self.assertRaisesOpError("Singular"):
+        dist.sample().eval()
+
+  def testSampleWithBroadcastScale(self):
+    # mu corresponds to a 2-batch of 3-variate normals
+    mu = np.zeros([2, 3])
+
+    # diag corresponds to no batches of 3-variate normals
+    diag = np.ones([3])
+
+    with self.test_session():
+      dist = ds.VectorLaplaceDiag(mu, diag, validate_args=True)
+
+      mean = dist.mean()
+      self.assertAllEqual([2, 3], mean.get_shape())
+      self.assertAllClose(mu, mean.eval())
+
+      n = int(1e4)
+      samps = dist.sample(n, seed=0).eval()
+      cov_mat = 2. * array_ops.matrix_diag(diag).eval()**2
+      sample_cov = np.matmul(samps.transpose([1, 2, 0]),
+                             samps.transpose([1, 0, 2])) / n
+
+      self.assertAllClose(mu, samps.mean(axis=0),
+                          atol=0.10, rtol=0.05)
+      self.assertAllClose([cov_mat, cov_mat], sample_cov,
+                          atol=0.10, rtol=0.05)
+
+  def testCovariance(self):
+    with self.test_session():
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          2. * np.diag(np.ones([3], dtype=np.float32)),
+          vla.covariance().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllEqual([2], vla.batch_shape)
+      self.assertAllEqual([3], vla.event_shape)
+      self.assertAllClose(
+          2. * np.array([[[3., 0, 0],
+                          [0, 3, 0],
+                          [0, 0, 3]],
+                         [[2, 0, 0],
+                          [0, 2, 0],
+                          [0, 0, 2]]])**2.,
+          vla.covariance().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1], [4, 5, 6]])
+      self.assertAllEqual([2], vla.batch_shape)
+      self.assertAllEqual([3], vla.event_shape)
+      self.assertAllClose(
+          2. * np.array([[[3., 0, 0],
+                          [0, 2, 0],
+                          [0, 0, 1]],
+                         [[4, 0, 0],
+                          [0, 5, 0],
+                          [0, 0, 6]]])**2.,
+          vla.covariance().eval())
+
+  def testVariance(self):
+    with self.test_session():
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          2. * np.ones([3], dtype=np.float32),
+          vla.variance().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllClose(
+          2. * np.array([[3., 3, 3],
+                         [2, 2, 2]])**2.,
+          vla.variance().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1],
+                      [4, 5, 6]])
+      self.assertAllClose(
+          2. * np.array([[3., 2, 1],
+                         [4, 5, 6]])**2.,
+          vla.variance().eval())
+
+  def testStddev(self):
+    with self.test_session():
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([2, 3], dtype=dtypes.float32))
+      self.assertAllClose(
+          np.sqrt(2) * np.ones([3], dtype=np.float32),
+          vla.stddev().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_identity_multiplier=[3., 2.])
+      self.assertAllClose(
+          np.sqrt(2) * np.array([[3., 3, 3],
+                                 [2, 2, 2]]),
+          vla.stddev().eval())
+
+      vla = ds.VectorLaplaceDiag(
+          loc=array_ops.zeros([3], dtype=dtypes.float32),
+          scale_diag=[[3., 2, 1], [4, 5, 6]])
+      self.assertAllClose(
+          np.sqrt(2) * np.array([[3., 2, 1],
+                                 [4, 5, 6]]),
+          vla.stddev().eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py b/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py
index 9d0ffd637633294d80bafe03409ee73ebb674d14..b8a3a262ce02c170cc3a69bdef65ec6601152f76 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/vector_student_t_test.py
@@ -38,7 +38,7 @@ class _FakeVectorStudentT(object):
 
   Other `Vector*` implementations need only test new code. That we don't need
   to test every Vector* distribution is good because there aren't SciPy
-  analogues and reimplementing everything in NumPy sort of defeats the point of
+  analogs and reimplementing everything in NumPy sort of defeats the point of
   having the `TransformedDistribution + Affine` API.
   """
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
index 1fa6ca0906d0ee951c7cc65fc5a70e197af589ca..d9dc978f23d4dc35bcfc0b910853f7cee083cde4 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
@@ -339,7 +339,7 @@ class WishartCholeskyTest(test.TestCase):
                             chol_scale_deferred: chol_scale})
 
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "LLT decomposition was not successful"):
+                                   "Cholesky decomposition was not successful"):
         chol_w = distributions.WishartFull(
             df=df_deferred, scale=chol_scale_deferred)
         # np.ones((3, 3)) is not positive, definite.
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index 2e83ed6e41dd6014f5b81f6d479d3352cd4d3d15..1684a5fffe13fa8a074ae7ede0182a9d145300c7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """Bijector Ops.
 
-See the @{$python/contrib.distributions.bijectors} guide.
-
 @@Affine
 @@AffineLinearOperator
 @@Bijector
@@ -41,12 +39,10 @@ from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops.bijectors.affine import *
 from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator import *
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector import *
 from tensorflow.contrib.distributions.python.ops.bijectors.chain import *
 from tensorflow.contrib.distributions.python.ops.bijectors.cholesky_outer_product import *
 from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector import *
 from tensorflow.contrib.distributions.python.ops.bijectors.exp import *
-from tensorflow.contrib.distributions.python.ops.bijectors.identity import *
 from tensorflow.contrib.distributions.python.ops.bijectors.inline import *
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
@@ -54,5 +50,11 @@ from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import *
+from tensorflow.python.ops.distributions.bijector import *
+from tensorflow.python.ops.distributions.identity_bijector import Identity
 
 # pylint: enable=unused-import,wildcard-import,line-too-long,g-importing-member
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
index 429f7eac4d56d5d6f74b564dfd554322e7f6e008..d44e258bd280b10b694211b4b536a98b13a7f431 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
@@ -22,7 +22,6 @@ from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.contrib.distributions.python.ops import operator_pd_diag
 from tensorflow.contrib.distributions.python.ops import operator_pd_identity
 from tensorflow.contrib.distributions.python.ops import operator_pd_vdvt_update
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +31,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
index a8e93bd2c761c430c5ce0415d1ba1240940c4182..ae380b5cb2bc39e06aa1e187c134d7e92f6cd92f 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
 from tensorflow.contrib.linalg.python.ops import linear_operator
 from tensorflow.python.framework import constant_op
@@ -27,6 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
@@ -193,7 +193,7 @@ class AffineLinearOperator(bijector.Bijector):
           y, expand_batch_dim=False)
       with ops.control_dependencies(self._maybe_collect_assertions() if
                                     self.validate_args else []):
-        y = self.scale.apply(y)
+        y = self.scale.matmul(y)
       y = self._shaper.undo_make_batch_of_event_sample_matrices(
           y, sample_shape, expand_batch_dim=False)
     if self.shift is not None:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
index 0b72c5aadfe26b0c856b302864730f131505a947..defa36a14048d35c6264c7227840ed70dcc77cbb 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import itertools
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import constant_op
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
index e605aec9c3590fe06ccd54e99c1de59f8ec5f7eb..dc05b2f611a52dc29717c69df77a1576aa6b5693 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -29,6 +27,8 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
index a1b2aef830927124b26ab195f8afca7c1258fef2..ccb1f029277bc07011df7be047a075274f2b3a27 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = ["ConditionalBijector"]
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
index 1f9ec0b1718a4db9f8f4062e0bd406251f600682..fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
index 73d2162ac3c93a1b1142718ab11abebe256353f8..7f28a298572642e9ced7c0b88f9601a0d1751141 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector as bijector_lib
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
 __all__ = [
     "Invert",
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
index 9963d8a7fe1786c44c50de707516b171f64d1066..c37db61720d10949f294ff7b2e9778ba6efa57f0 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
index b8d8152ce3817a28bd1db3952a54aac9afbd27e4..a640dfe7dfbcce96261589c7fc49107deaefdd54 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
index 87a5aca1d97d7a8d0a8991883d8e7987b842a947..8645cc1b6b04be75a419342591272f07a4a1711c 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -30,6 +29,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
index a1e72ef7f61f899b78dd63e8a0daa35a8c3c9b13..81957fcf78922fa15fd20a25d144071f431161ae 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
@@ -18,10 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -38,6 +41,22 @@ class Softplus(bijector.Bijector):
   * `softplus(x) approx x`, for large `x`, so it does not overflow as easily as
     the `Exp` `Bijector`.
 
+  The optional nonzero `hinge_softness` parameter changes the transition at
+  zero.  With `hinge_softness = c`, the bijector is:
+
+    ```f_c(x) := c * g(x / c) = c * Log[1 + exp(x / c)].```
+
+  For large `x >> 1`, `c * Log[1 + exp(x / c)] approx c * Log[exp(x / c)] = x`,
+  so the behavior for large `x` is the same as the standard softplus.
+
+  As `c > 0` approaches 0 from the right, `f_c(x)` becomes less and less soft,
+  approaching `max(0, x)`.
+
+  * `c = 1` is the default.
+  * `c > 0` but small means `f(x) approx ReLu(x) = max(0, x)`.
+  * `c < 0` flips sign and reflects around the `y-axis`: `f_{-c}(x) = -f_c(-x)`.
+  * `c = 0` results in a non-bijective transformation and triggers an exception.
+
     Example Use:
 
     ```python
@@ -45,9 +64,9 @@ class Softplus(bijector.Bijector):
     # batch ndim and 2 event ndims (i.e., vector of matrices).
     softplus = Softplus(event_ndims=2)
     x = [[[1., 2],
-           [3, 4]],
-          [[5, 6],
-           [7, 8]]]
+          [3, 4]],
+         [[5, 6],
+          [7, 8]]]
     log(1 + exp(x)) == softplus.forward(x)
     log(exp(x) - 1) == softplus.inverse(x)
     ```
@@ -56,20 +75,48 @@ class Softplus(bijector.Bijector):
     reduction over the event space.
   """
 
+  @distribution_util.AppendDocstring(
+      kwargs_dict={
+          "hinge_softness": (
+              "Nonzero floating point `Tensor`.  Controls the softness of what "
+              "would otherwise be a kink at the origin.  Default is 1.0")})
   def __init__(self,
                event_ndims=0,
+               hinge_softness=None,
                validate_args=False,
                name="softplus"):
+    with ops.name_scope(name, values=[hinge_softness]):
+      if hinge_softness is not None:
+        self._hinge_softness = ops.convert_to_tensor(
+            hinge_softness, name="hinge_softness")
+      else:
+        self._hinge_softness = None
+      if validate_args:
+        nonzero_check = check_ops.assert_none_equal(
+            ops.convert_to_tensor(
+                0, dtype=self.hinge_softness.dtype),
+            self.hinge_softness,
+            message="hinge_softness must be non-zero")
+        self._hinge_softness = control_flow_ops.with_dependencies(
+            [nonzero_check], self.hinge_softness)
+
     super(Softplus, self).__init__(
         event_ndims=event_ndims,
         validate_args=validate_args,
         name=name)
 
   def _forward(self, x):
-    return nn_ops.softplus(x)
+    if self.hinge_softness is None:
+      return nn_ops.softplus(x)
+    hinge_softness = math_ops.cast(self.hinge_softness, x.dtype)
+    return hinge_softness * nn_ops.softplus(x / hinge_softness)
 
   def _inverse(self, y):
-    return distribution_util.softplus_inverse(y)
+    if self.hinge_softness is None:
+      return distribution_util.softplus_inverse(y)
+    hinge_softness = math_ops.cast(self.hinge_softness, y.dtype)
+    return hinge_softness * distribution_util.softplus_inverse(
+        y / hinge_softness)
 
   def _inverse_log_det_jacobian(self, y):
     # Could also do:
@@ -81,9 +128,17 @@ class Softplus(bijector.Bijector):
     #           = 1 / (1 - exp{-Y}),
     # which is the most stable for large Y > 0. For small Y, we use
     # 1 - exp{-Y} approx Y.
+    if self.hinge_softness is not None:
+      y /= math_ops.cast(self.hinge_softness, y.dtype)
     return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
                                 axis=self._event_dims_tensor(y))
 
   def _forward_log_det_jacobian(self, x):
+    if self.hinge_softness is not None:
+      x /= math_ops.cast(self.hinge_softness, x.dtype)
     return -math_ops.reduce_sum(nn_ops.softplus(-x),
                                 axis=self._event_dims_tensor(x))
+
+  @property
+  def hinge_softness(self):
+    return self._hinge_softness
diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py
index ee7751c9969621d20facfe3df70ea15afcbbc900..9304a56491ece71fe9d8151a28a3f087882222a2 100644
--- a/tensorflow/contrib/distributions/python/ops/binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/binomial.py
@@ -17,8 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -27,6 +25,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 _binomial_sample_note = """
@@ -42,6 +42,28 @@ to integer values.
 """
 
 
+def _bdtr(k, n, p):
+  """The binomial cumulative distribution function.
+
+  Args:
+    k: floating point `Tensor`.
+    n: floating point `Tensor`.
+    p: floating point `Tensor`.
+
+  Returns:
+    `sum_{j=0}^k p^j (1 - p)^(n - j)`.
+  """
+  # Trick for getting safe backprop/gradients into n, k when
+  #   betainc(a = 0, ..) = nan
+  # Write:
+  #   where(unsafe, safe_output, betainc(where(unsafe, safe_input, input)))
+  ones = array_ops.ones_like(n - k)
+  k_eq_n = math_ops.equal(k, n)
+  safe_dn = array_ops.where(k_eq_n, ones, n - k)
+  dk = math_ops.betainc(a=safe_dn, b=k + 1, x=1 - p)
+  return array_ops.where(k_eq_n, ones, dk)
+
+
 class Binomial(distribution.Distribution):
   """Binomial distribution.
 
@@ -201,6 +223,18 @@ class Binomial(distribution.Distribution):
   def _prob(self, counts):
     return math_ops.exp(self._log_prob(counts))
 
+  def _cdf(self, counts):
+    counts = self._maybe_assert_valid_sample(counts)
+    probs = self.probs
+    if not (counts.shape.is_fully_defined()
+            and self.probs.shape.is_fully_defined()
+            and counts.shape.is_compatible_with(self.probs.shape)):
+      # If both shapes are well defined and equal, we skip broadcasting.
+      probs += array_ops.zeros_like(counts)
+      counts += array_ops.zeros_like(self.probs)
+
+    return _bdtr(k=counts, n=self.total_count, p=probs)
+
   def _log_unnormalized_prob(self, counts):
     counts = self._maybe_assert_valid_sample(counts)
     return (counts * math_ops.log(self.probs) +
@@ -235,7 +269,7 @@ class Binomial(distribution.Distribution):
             message="total_count must be non-negative."),
         distribution_util.assert_integer_form(
             total_count,
-            message="total_count cannot contain fractional componentes."),
+            message="total_count cannot contain fractional components."),
     ], total_count)
 
   def _maybe_assert_valid_sample(self, counts, check_integer=True):
diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py
index 45d3accdd6c0c77b698d38eb1e992f8fcce05741..bdd5571c966a74e58e4f9f8eed2628f131a1b92e 100644
--- a/tensorflow/contrib/distributions/python/ops/chi2.py
+++ b/tensorflow/contrib/distributions/python/ops/chi2.py
@@ -18,11 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import gamma
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import gamma
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_distribution.py
index e3ca5c5468e64234b283876f369661ee0ffac8ee..ef25d4aedd6a2cd9a342bb5911f4f35fec7b3d74 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_distribution.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class ConditionalDistribution(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
index 06d0549c35329c5c3255c8cac65b3ed659657be6..2e1e68cf0587b69f055d8d747672d99383f75ed6 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
@@ -18,9 +18,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import conditional_distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index 6faa2728426d202a97ccd66273b406ad53f7e24c..850d08d1bd69ebc7661557d648e2bffe77e6a908 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -22,7 +22,6 @@ import abc
 
 import six
 
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +31,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution
 
 __all__ = [
     "Deterministic",
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index a0872677a96db7abe158e794f2beed9bbca37156..5e3b42dd2aa5e85fab23820fc63a69be77c3ac27 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -18,619 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-import hashlib
-import math
-import numpy as np
-
 from tensorflow.contrib import linalg
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-
-
-def assert_close(
-    x, y, data=None, summarize=None, message=None, name="assert_close"):
-  """Assert that that x and y are within machine epsilon of each other.
-
-  Args:
-    x: Floating-point `Tensor`
-    y: Floating-point `Tensor`
-    data: The tensors to print out if the condition is `False`. Defaults to
-      error message and first few entries of `x` and `y`.
-    summarize: Print this many entries of each tensor.
-    message: A string to prefix to the default message.
-    name: A name for this operation (optional).
-
-  Returns:
-    Op raising `InvalidArgumentError` if |x - y| > machine epsilon.
-  """
-  message = message or ""
-  x = ops.convert_to_tensor(x, name="x")
-  y = ops.convert_to_tensor(y, name="y")
-
-  if data is None:
-    data = [
-        message,
-        "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
-        y.name, y
-    ]
-
-  if x.dtype.is_integer:
-    return check_ops.assert_equal(
-        x, y, data=data, summarize=summarize, message=message, name=name)
-
-  with ops.name_scope(name, "assert_close", [x, y, data]):
-    tol = np.finfo(x.dtype.as_numpy_dtype).eps
-    condition = math_ops.reduce_all(math_ops.less_equal(math_ops.abs(x-y), tol))
-    return control_flow_ops.Assert(
-        condition, data, summarize=summarize)
-
-
-def assert_integer_form(
-    x, data=None, summarize=None, message=None, name="assert_integer_form"):
-  """Assert that x has integer components (or floats equal to integers).
-
-  Args:
-    x: Floating-point `Tensor`
-    data: The tensors to print out if the condition is `False`. Defaults to
-      error message and first few entries of `x` and `y`.
-    summarize: Print this many entries of each tensor.
-    message: A string to prefix to the default message.
-    name: A name for this operation (optional).
-
-  Returns:
-    Op raising `InvalidArgumentError` if round(x) != x.
-  """
-
-  message = message or "x has non-integer components"
-  x = ops.convert_to_tensor(x, name="x")
-  casted_x = math_ops.to_int64(x)
-  return check_ops.assert_equal(
-      x, math_ops.cast(math_ops.round(casted_x), x.dtype),
-      data=data, summarize=summarize, message=message, name=name)
-
-
-def assert_symmetric(matrix):
-  matrix_t = array_ops.matrix_transpose(matrix)
-  return control_flow_ops.with_dependencies(
-      [check_ops.assert_equal(matrix, matrix_t)], matrix)
-
-
-def embed_check_nonnegative_discrete(x, check_integer=True):
-  """Assert x is a non-negative tensor, and optionally of integers."""
-  assertions = [check_ops.assert_non_negative(
-      x, message="x must be non-negative.")]
-  if check_integer:
-    assertions += [assert_integer_form(
-        x, message="x cannot contain fractional components.")]
-  return control_flow_ops.with_dependencies(assertions, x)
-
-
-def same_dynamic_shape(a, b):
-  """Returns whether a and b have the same dynamic shape.
-
-  Args:
-    a: `Tensor`
-    b: `Tensor`
-
-  Returns:
-    `bool` `Tensor` representing if both tensors have the same shape.
-  """
-  a = ops.convert_to_tensor(a, name="a")
-  b = ops.convert_to_tensor(b, name="b")
-
-  # Here we can't just do math_ops.equal(a.shape, b.shape), since
-  # static shape inference may break the equality comparison between
-  # shape(a) and shape(b) in math_ops.equal.
-  def all_shapes_equal():
-    return math_ops.reduce_all(math_ops.equal(
-        array_ops.concat([array_ops.shape(a), array_ops.shape(b)], 0),
-        array_ops.concat([array_ops.shape(b), array_ops.shape(a)], 0)))
-
-  # One of the shapes isn't fully defined, so we need to use the dynamic
-  # shape.
-  return control_flow_ops.cond(
-      math_ops.equal(array_ops.rank(a), array_ops.rank(b)),
-      all_shapes_equal,
-      lambda: constant_op.constant(False))
-
-
-def get_logits_and_probs(logits=None,
-                         probs=None,
-                         multidimensional=False,
-                         validate_args=False,
-                         name="get_logits_and_probs"):
-  """Converts logit to probabilities (or vice-versa), and returns both.
-
-  Args:
-    logits: Floating-point `Tensor` representing log-odds.
-    probs: Floating-point `Tensor` representing probabilities.
-    multidimensional: Python `bool`, default `False`.
-      If `True`, represents whether the last dimension of `logits` or `probs`,
-      a `[N1, N2, ...  k]` dimensional tensor, representing the
-      logit or probability of `shape[-1]` classes.
-    validate_args: Python `bool`, default `False`. When `True`, either assert
-      `0 <= probs <= 1` (if not `multidimensional`) or that the last dimension
-      of `probs` sums to one.
-    name: A name for this operation (optional).
-
-  Returns:
-    logits, probs: Tuple of `Tensor`s. If `probs` has an entry that is `0` or
-      `1`, then the corresponding entry in the returned logit will be `-Inf` and
-      `Inf` respectively.
-
-  Raises:
-    ValueError: if neither `probs` nor `logits` were passed in, or both were.
-  """
-  with ops.name_scope(name, values=[probs, logits]):
-    if (probs is None) == (logits is None):
-      raise ValueError("Must pass probs or logits, but not both.")
-
-    if probs is None:
-      logits = ops.convert_to_tensor(logits, name="logits")
-      if multidimensional:
-        return logits, nn.softmax(logits, name="probs")
-      return logits, math_ops.sigmoid(logits, name="probs")
-
-    probs = ops.convert_to_tensor(probs, name="probs")
-    if validate_args:
-      with ops.name_scope("validate_probs"):
-        one = constant_op.constant(1., probs.dtype)
-        dependencies = [check_ops.assert_non_negative(probs)]
-        if multidimensional:
-          dependencies += [assert_close(math_ops.reduce_sum(probs, -1), one,
-                                        message="probs does not sum to 1.")]
-        else:
-          dependencies += [check_ops.assert_less_equal(
-              probs, one, message="probs has components greater than 1.")]
-        probs = control_flow_ops.with_dependencies(dependencies, probs)
-
-    with ops.name_scope("logits"):
-      if multidimensional:
-        # Here we don't compute the multidimensional case, in a manner
-        # consistent with respect to the unidimensional case. We do so
-        # following the TF convention. Typically, you might expect to see
-        # logits = log(probs) - log(probs[pivot]). A side-effect of
-        # being consistent with the TF approach is that the unidimensional case
-        # implicitly handles the second dimension but the multidimensional case
-        # explicitly keeps the pivot dimension.
-        return math_ops.log(probs), probs
-      return math_ops.log(probs) - math_ops.log1p(-1. * probs), probs
-
-
-def log_combinations(n, counts, name="log_combinations"):
-  """Multinomial coefficient.
-
-  Given `n` and `counts`, where `counts` has last dimension `k`, we compute
-  the multinomial coefficient as:
-
-  ```n! / sum_i n_i!```
-
-  where `i` runs over all `k` classes.
-
-  Args:
-    n: Floating-point `Tensor` broadcastable with `counts`. This represents `n`
-      outcomes.
-    counts: Floating-point `Tensor` broadcastable with `n`. This represents
-      counts in `k` classes, where `k` is the last dimension of the tensor.
-    name: A name for this operation (optional).
-
-  Returns:
-    `Tensor` representing the multinomial coefficient between `n` and `counts`.
-  """
-  # First a bit about the number of ways counts could have come in:
-  # E.g. if counts = [1, 2], then this is 3 choose 2.
-  # In general, this is (sum counts)! / sum(counts!)
-  # The sum should be along the last dimension of counts. This is the
-  # "distribution" dimension. Here n a priori represents the sum of counts.
-  with ops.name_scope(name, values=[n, counts]):
-    n = ops.convert_to_tensor(n, name="n")
-    counts = ops.convert_to_tensor(counts, name="counts")
-    total_permutations = math_ops.lgamma(n + 1)
-    counts_factorial = math_ops.lgamma(counts + 1)
-    redundant_permutations = math_ops.reduce_sum(counts_factorial, axis=[-1])
-    return total_permutations - redundant_permutations
-
-
-def matrix_diag_transform(matrix, transform=None, name=None):
-  """Transform diagonal of [batch-]matrix, leave rest of matrix unchanged.
-
-  Create a trainable covariance defined by a Cholesky factor:
-
-  ```python
-  # Transform network layer into 2 x 2 array.
-  matrix_values = tf.contrib.layers.fully_connected(activations, 4)
-  matrix = tf.reshape(matrix_values, (batch_size, 2, 2))
-
-  # Make the diagonal positive. If the upper triangle was zero, this would be a
-  # valid Cholesky factor.
-  chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
-
-  # OperatorPDCholesky ignores the upper triangle.
-  operator = OperatorPDCholesky(chol)
-  ```
-
-  Example of heteroskedastic 2-D linear regression.
-
-  ```python
-  # Get a trainable Cholesky factor.
-  matrix_values = tf.contrib.layers.fully_connected(activations, 4)
-  matrix = tf.reshape(matrix_values, (batch_size, 2, 2))
-  chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
-
-  # Get a trainable mean.
-  mu = tf.contrib.layers.fully_connected(activations, 2)
-
-  # This is a fully trainable multivariate normal!
-  dist = tf.contrib.distributions.MVNCholesky(mu, chol)
-
-  # Standard log loss. Minimizing this will "train" mu and chol, and then dist
-  # will be a distribution predicting labels as multivariate Gaussians.
-  loss = -1 * tf.reduce_mean(dist.log_prob(labels))
-  ```
-
-  Args:
-    matrix:  Rank `R` `Tensor`, `R >= 2`, where the last two dimensions are
-      equal.
-    transform:  Element-wise function mapping `Tensors` to `Tensors`. To
-      be applied to the diagonal of `matrix`. If `None`, `matrix` is returned
-      unchanged. Defaults to `None`.
-    name:  A name to give created ops.
-      Defaults to "matrix_diag_transform".
-
-  Returns:
-    A `Tensor` with same shape and `dtype` as `matrix`.
-  """
-  with ops.name_scope(name, "matrix_diag_transform", [matrix]):
-    matrix = ops.convert_to_tensor(matrix, name="matrix")
-    if transform is None:
-      return matrix
-    # Replace the diag with transformed diag.
-    diag = array_ops.matrix_diag_part(matrix)
-    transformed_diag = transform(diag)
-    transformed_mat = array_ops.matrix_set_diag(matrix, transformed_diag)
-
-  return transformed_mat
-
-
-def rotate_transpose(x, shift, name="rotate_transpose"):
-  """Circularly moves dims left or right.
-
-  Effectively identical to:
-
-  ```python
-  numpy.transpose(x, numpy.roll(numpy.arange(len(x.shape)), shift))
-  ```
-
-  When `validate_args=False` additional graph-runtime checks are
-  performed. These checks entail moving data from to GPU to CPU.
-
-  Example:
-
-    ```python
-    x = ...  # Tensor of shape [1, 2, 3, 4].
-    rotate_transpose(x, -1)  # result shape: [2, 3, 4, 1]
-    rotate_transpose(x, -2)  # result shape: [3, 4, 1, 2]
-    rotate_transpose(x,  1)  # result shape: [4, 1, 2, 3]
-    rotate_transpose(x,  2)  # result shape: [3, 4, 1, 2]
-    rotate_transpose(x, 7) == rotate_transpose(x, 3)
-    rotate_transpose(x, -7) == rotate_transpose(x, -3)
-    ```
-
-  Args:
-    x: `Tensor`.
-    shift: `Tensor`. Number of dimensions to transpose left (shift<0) or
-      transpose right (shift>0).
-    name: Python `str`. The name to give this op.
-
-  Returns:
-    rotated_x: Input `Tensor` with dimensions circularly rotated by shift.
-
-  Raises:
-    TypeError: if shift is not integer type.
-  """
-  with ops.name_scope(name, values=[x, shift]):
-    x = ops.convert_to_tensor(x, name="x")
-    shift = ops.convert_to_tensor(shift, name="shift")
-    # We do not assign back to preserve constant-ness.
-    check_ops.assert_integer(shift)
-    shift_value_static = tensor_util.constant_value(shift)
-    ndims = x.get_shape().ndims
-    if ndims is not None and shift_value_static is not None:
-      if ndims < 2: return x
-      shift_value_static = np.sign(shift_value_static) * (
-          abs(shift_value_static) % ndims)
-      if shift_value_static == 0: return x
-      perm = np.roll(np.arange(ndims), shift_value_static)
-      return array_ops.transpose(x, perm=perm)
-    else:
-      # Consider if we always had a positive shift, and some specified
-      # direction.
-      # When shifting left we want the new array:
-      #   last(x, n-shift) + first(x, shift)
-      # and if shifting right then we want:
-      #   last(x, shift) + first(x, n-shift)
-      # Observe that last(a) == slice(a, n) and first(a) == slice(0, a).
-      # Also, we can encode direction and shift as one: direction * shift.
-      # Combining these facts, we have:
-      #   a = cond(shift<0, -shift, n-shift)
-      #   last(x, n-a) + first(x, a) == x[a:n] + x[0:a]
-      # Finally, we transform shift by modulo length so it can be specified
-      # independently from the array upon which it operates (like python).
-      ndims = array_ops.rank(x)
-      shift = array_ops.where(math_ops.less(shift, 0),
-                              math_ops.mod(-shift, ndims),
-                              ndims - math_ops.mod(shift, ndims))
-      first = math_ops.range(0, shift)
-      last = math_ops.range(shift, ndims)
-      perm = array_ops.concat([last, first], 0)
-      return array_ops.transpose(x, perm=perm)
-
-
-def pick_vector(cond,
-                true_vector,
-                false_vector,
-                name="pick_vector"):
-  """Picks possibly different length row `Tensor`s based on condition.
-
-  Value `Tensor`s should have exactly one dimension.
-
-  If `cond` is a python Boolean or `tf.constant` then either `true_vector` or
-  `false_vector` is immediately returned. I.e., no graph nodes are created and
-  no validation happens.
-
-  Args:
-    cond: `Tensor`. Must have `dtype=tf.bool` and be scalar.
-    true_vector: `Tensor` of one dimension. Returned when cond is `True`.
-    false_vector: `Tensor` of one dimension. Returned when cond is `False`.
-    name: Python `str`. The name to give this op.
-
-  Example:
-
-  ```python
-  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))
-  # result is tensor: [10, 11].
-  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))
-  # result is tensor: [15, 16, 17].
-  ```
-
-  Returns:
-    true_or_false_vector: `Tensor`.
-
-  Raises:
-    TypeError: if `cond.dtype != tf.bool`
-    TypeError: if `cond` is not a constant and
-      `true_vector.dtype != false_vector.dtype`
-  """
-  with ops.name_scope(name, values=(cond, true_vector, false_vector)):
-    cond = ops.convert_to_tensor(cond, name="cond")
-    if cond.dtype != dtypes.bool:
-      raise TypeError("%s.dtype=%s which is not %s" %
-                      (cond.name, cond.dtype, dtypes.bool))
-    cond_value_static = tensor_util.constant_value(cond)
-    if cond_value_static is not None:
-      return true_vector if cond_value_static else false_vector
-    true_vector = ops.convert_to_tensor(true_vector, name="true_vector")
-    false_vector = ops.convert_to_tensor(false_vector, name="false_vector")
-    if true_vector.dtype != false_vector.dtype:
-      raise TypeError(
-          "%s.dtype=%s does not match %s.dtype=%s"
-          % (true_vector.name, true_vector.dtype,
-             false_vector.name, false_vector.dtype))
-    n = array_ops.shape(true_vector)[0]
-    return array_ops.slice(
-        array_ops.concat([true_vector, false_vector], 0),
-        [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)])
-
-
-def gen_new_seed(seed, salt):
-  """Generate a new seed, from the given seed and salt."""
-  if seed is None:
-    return None
-  string = (str(seed) + salt).encode("utf-8")
-  return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
-
-
-def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
-  """Creates a (batch of) lower triangular matrix from a vector of inputs.
-
-  If `x.get_shape()` is `[b1, b2, ..., bK, d]` then the output shape is `[b1,
-  b2, ..., bK, n, n]` where `n` is such that `d = n(n+1)/2`, i.e.,
-  `n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))`.
-
-  Although the non-batch complexity is O(n**2), large constants and sub-optimal
-  vectorization means the complexity of this function is 5x slower than zeroing
-  out the upper triangular, i.e., `tf.matrix_band_part(X, -1, 0)`. This
-  function becomes competitive only when several matmul/cholesky/etc ops can be
-  ellided in constructing the input. Example: wiring a fully connected layer as
-  a covariance matrix; this function reduces the final layer by 2x and possibly
-  reduces the network arch complexity considerably. In most cases it is better
-  to simply build a full matrix and zero out the upper triangular elements,
-  e.g., `tril = tf.matrix_band_part(full, -1, 0)`, rather than directly
-  construct a lower triangular.
-
-  Example:
-
-  ```python
-  fill_lower_triangular([1, 2, 3, 4, 5, 6])
-  # Returns: [[1, 0, 0],
-  #           [2, 3, 0],
-  #           [4, 5, 6]]
-  ```
-
-  For comparison, a pure numpy version of this function can be found in
-  `distribution_util_test.py`, function `_fill_lower_triangular`.
-
-  Args:
-    x: `Tensor` representing lower triangular elements.
-    validate_args: Python `bool`, default `False`. Whether to ensure the shape
-      of `x` can be mapped to a lower triangular matrix (controls non-static
-      checks only).
-    name: Python `str`. The name to give this op.
-
-  Returns:
-    tril: `Tensor` with lower triangular elements filled from `x`.
-
-  Raises:
-    ValueError: if shape if `x` has static shape which cannot be mapped to a
-      lower triangular matrix.
-  """
-  # TODO(jvdillon): Replace this code with dedicated op when it exists.
-  with ops.name_scope(name, values=[x]):
-    x = ops.convert_to_tensor(x, name="x")
-    if (x.get_shape().ndims is not None and
-        x.get_shape()[-1].value is not None):
-      d = x.get_shape()[-1].value
-      # d = n(n+1)/2 implies n is:
-      n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))
-      d_inferred = n * (n + 1) /2
-      if d != d_inferred:
-        raise ValueError("Input cannot be mapped to a lower triangular; "
-                         "n*(n+1)/2 = %d != %d" % (d_inferred, d))
-      final_shape = x.get_shape()[:-1].concatenate(
-          tensor_shape.TensorShape([n, n]))
-    else:
-      d = math_ops.cast(array_ops.shape(x)[-1], dtype=dtypes.float32)
-      # d = n(n+1)/2 implies n is:
-      n = math_ops.cast(0.5 * (dtypes.sqrt(1. + 8. * d) - 1.),
-                        dtype=dtypes.int32)
-      if validate_args:
-        is_valid_input_shape = check_ops.assert_equal(
-            n * (n + 1) / 2, d,
-            message="Input cannot be mapped to a lower triangular.")
-        n = control_flow_ops.with_dependencies([is_valid_input_shape], n)
-      final_shape = x.get_shape()[:-1].concatenate(
-          tensor_shape.TensorShape([None, None]))
-
-    def tril_ids(n):
-      """Internal helper to create vector of linear indices into y."""
-      # Build the ids statically; chose 512 because it implies 1MiB.
-      if not tensor_util.is_tensor(n) and n <= 512:
-        ids = np.arange(n**2, dtype=np.int32)
-        rows = (ids / n).astype(np.int32)  # Implicit floor.
-        # We need to stop incrementing the index when we encounter
-        # upper-triangular elements. The idea here is to compute the
-        # lower-right number of zeros then by "symmetry" subtract this from the
-        # total number of zeros, n(n-1)/2.
-        # Then we note that: n(n-1)/2 - (n-r)*(n-r-1)/2 = r(2n-r-1)/2
-        offset = (rows * (2 * n - rows - 1) / 2).astype(np.int32)
-        # We could also zero out when (rows < cols) == (rows < ids-n*rows).
-        # mask = (ids <= (n + 1) * rows).astype(np.int32)
-      else:
-        ids = math_ops.range(n**2)
-        rows = math_ops.cast(ids / n, dtype=dtypes.int32)
-        offset = math_ops.cast(rows * (2 * n - rows - 1) / 2,
-                               dtype=dtypes.int32)
-      return ids - offset
-
-    # Special-case non-batch case.
-    if x.get_shape().ndims == 1:
-      y = array_ops.gather(x, array_ops.reshape(tril_ids(n), [n, n]))
-      y = array_ops.matrix_band_part(y, -1, 0)
-      y.set_shape(y.get_shape().merge_with(final_shape))
-      return y
-
-    # Make ids for each batch dim.
-    if (x.get_shape().ndims is not None and
-        x.get_shape()[:-1].is_fully_defined()):
-      batch_shape = np.asarray(x.get_shape()[:-1].as_list(), dtype=np.int32)
-      m = np.prod(batch_shape).astype(np.int32)
-    else:
-      batch_shape = array_ops.shape(x)[:-1]
-      m = array_ops.reduce_prod(array_ops.shape(x)[:-1])
-    batch_ids = math_ops.range(m)
-
-    # Assemble the tril_ids into batch,tril_id pairs.
-    idx = array_ops.stack([
-        array_ops.tile(array_ops.expand_dims(batch_ids, 1), [1, n * n]),
-        array_ops.tile(array_ops.expand_dims(tril_ids(n), 0), [m, 1])
-    ])
-    idx = array_ops.transpose(idx, [1, 2, 0])
-
-    # Gather up, reshape, and return.
-    y = array_ops.reshape(x, [-1, d])
-    y = array_ops.gather_nd(y, idx)
-    y = array_ops.reshape(y, array_ops.concat([batch_shape, [n, n]], 0))
-    y = array_ops.matrix_band_part(y, -1, 0)
-    y.set_shape(y.get_shape().merge_with(final_shape))
-    return y
-
-
-# TODO(jvdillon): Merge this test back into:
-# tensorflow/python/ops/softplus_op_test.py
-# once TF core is accepting new ops.
-def softplus_inverse(x, name=None):
-  """Computes the inverse softplus, i.e., x = softplus_inverse(softplus(x)).
-
-  Mathematically this op is equivalent to:
-
-  ```none
-  softplus_inverse = log(exp(x) - 1.)
-  ```
-
-  Args:
-    x: `Tensor`. Non-negative (not enforced), floating-point.
-    name: A name for the operation (optional).
-
-  Returns:
-    `Tensor`. Has the same type/shape as input `x`.
-  """
-  with ops.name_scope(name, "softplus_inverse", values=[x]):
-    x = ops.convert_to_tensor(x, name="x")
-    # We begin by deriving a more numerically stable softplus_inverse:
-    # x = softplus(y) = Log[1 + exp{y}], (which means x > 0).
-    # ==> exp{x} = 1 + exp{y}                                (1)
-    # ==> y = Log[exp{x} - 1]                                (2)
-    #       = Log[(exp{x} - 1) / exp{x}] + Log[exp{x}]
-    #       = Log[(1 - exp{-x}) / 1] + Log[exp{x}]
-    #       = Log[1 - exp{-x}] + x                           (3)
-    # (2) is the "obvious" inverse, but (3) is more stable than (2) for large x.
-    # For small x (e.g. x = 1e-10), (3) will become -inf since 1 - exp{-x} will
-    # be zero. To fix this, we use 1 - exp{-x} approx x for small x > 0.
-    #
-    # In addition to the numerically stable derivation above, we clamp
-    # small/large values to be congruent with the logic in:
-    # tensorflow/core/kernels/softplus_op.h
-    #
-    # Finally, we set the input to one whenever the input is too large or too
-    # small. This ensures that no unchosen codepath is +/- inf. This is
-    # necessary to ensure the gradient doesn't get NaNs. Recall that the
-    # gradient of `where` behaves like `pred*pred_true + (1-pred)*pred_false`
-    # thus an `inf` in an unselected path results in `0*inf=nan`. We are careful
-    # to overwrite `x` with ones only when we will never actually use this
-    # value. Note that we use ones and not zeros since `log(expm1(0.)) = -inf`.
-    threshold = np.log(np.finfo(x.dtype.as_numpy_dtype).eps) + 2.
-    is_too_small = math_ops.less(x, np.exp(threshold))
-    is_too_large = math_ops.greater(x, -threshold)
-    too_small_value = math_ops.log(x)
-    too_large_value = x
-    # This `where` will ultimately be a NOP because we won't select this
-    # codepath whenever we used the surrogate `ones_like`.
-    x = array_ops.where(math_ops.logical_or(is_too_small, is_too_large),
-                        array_ops.ones_like(x), x)
-    y = x + math_ops.log(-math_ops.expm1(-x))  # == log(expm1(x))
-    return array_ops.where(is_too_small, too_small_value,
-                           array_ops.where(is_too_large, too_large_value, y))
-
-
-# TODO(b/35290280): Add unit-tests.
-def dimension_size(x, axis):
-  """Returns the size of a specific dimension."""
-  # Since tf.gather isn't "constant-in, constant-out", we must first check the
-  # static shape or fallback to dynamic shape.
-  num_rows = (None if x.get_shape().ndims is None
-              else x.get_shape()[axis].value)
-  if num_rows is not None:
-    return num_rows
-  return array_ops.shape(x)[axis]
+from tensorflow.python.ops.distributions import util
+from tensorflow.python.ops.distributions.util import *  # pylint: disable=wildcard-import
 
 
 # TODO(b/35290280): Add unit-tests.
@@ -648,13 +44,11 @@ def make_diag_scale(loc, scale_diag, scale_identity_multiplier,
           check_ops.assert_positive(
               x, message="diagonal part must be positive"),
       ], x)
-    # TODO(b/35157376): Use `assert_none_equal` once it exists.
     return control_flow_ops.with_dependencies([
-        check_ops.assert_greater(
-            math_ops.abs(x),
+        check_ops.assert_none_equal(
+            x,
             array_ops.zeros([], x.dtype),
-            message="diagonal part must be non-zero"),
-    ], x)
+            message="diagonal part must be non-zero")], x)
 
   with ops.name_scope(name, "make_diag_scale",
                       values=[loc, scale_diag, scale_identity_multiplier]):
@@ -678,7 +72,7 @@ def make_diag_scale(loc, scale_diag, scale_identity_multiplier,
       raise ValueError(
           "Cannot infer `event_shape` unless `loc` is specified.")
 
-    num_rows = dimension_size(loc, -1)
+    num_rows = util.dimension_size(loc, -1)
 
     if scale_identity_multiplier is None:
       return linalg.LinearOperatorIdentity(
@@ -697,62 +91,108 @@ def make_diag_scale(loc, scale_diag, scale_identity_multiplier,
         assert_proper_shapes=validate_args)
 
 
-class AppendDocstring(object):
-  """Helper class to promote private subclass docstring to public counterpart.
+def shapes_from_loc_and_scale(loc, scale, name="shapes_from_loc_and_scale"):
+  """Infer distribution batch and event shapes from a location and scale.
 
-  Example:
+  Location and scale family distributions determine their batch/event shape by
+  broadcasting the `loc` and `scale` args.  This helper does that broadcast,
+  statically if possible.
 
-  ```python
-  class TransformedDistribution(Distribution):
-    @distribution_util.AppendDocstring(
-      additional_note="A special note!",
-      kwargs_dict={"foo": "An extra arg."})
-    def _prob(self, y, foo=None):
-      pass
-  ```
+  Batch shape broadcasts as per the normal rules.
+  We allow the `loc` event shape to broadcast up to that of `scale`.  We do not
+  allow `scale`'s event shape to change.  Therefore, the last dimension of `loc`
+  must either be size `1`, or the same as `scale.range_dimension`.
 
-  In this case, the `AppendDocstring` decorator appends the `additional_note` to
-  the docstring of `prob` (not `_prob`) and adds a new `kwargs`
-  section with each dictionary item as a bullet-point.
+  See `MultivariateNormalLinearOperator` for a usage example.
 
-  For a more detailed example, see `TransformedDistribution`.
-  """
+  Args:
+    loc:  `N-D` `Tensor` with `N >= 1` (already converted to tensor) or `None`.
+      If `None`, both batch and event shape are determined by `scale`.
+    scale:  A `LinearOperator` instance.
+    name:  A string name to prepend to created ops.
 
-  def __init__(self, additional_note="", kwargs_dict=None):
-    """Initializes the AppendDocstring object.
+  Returns:
+    batch_shape:  `TensorShape` (if broadcast is done statically), or `Tensor`.
+    event_shape:  `TensorShape` (if broadcast is done statically), or `Tensor`.
 
-    Args:
-      additional_note: Python string added as additional docstring to public
-        version of function.
-      kwargs_dict: Python string/string dictionary representing
-        specific kwargs expanded from the **kwargs input.
+  Raises:
+    ValueError:  If the last dimension of `loc` is determined statically to be
+      different than the range of `scale`.
+  """
+  with ops.name_scope(name, values=[loc] + scale.graph_parents):
+    # Get event shape.
+    event_size = scale.range_dimension_tensor()
+    event_size_const = tensor_util.constant_value(event_size)
+    if event_size_const is not None:
+      event_shape = event_size_const.reshape([1])
+    else:
+      event_shape = event_size[array_ops.newaxis]
 
-    Raises:
-      ValueError: if kwargs_dict.key contains whitespace.
-      ValueError: if kwargs_dict.value contains newlines.
-    """
-    self._additional_note = additional_note
-    if kwargs_dict:
-      bullets = []
-      for key in sorted(kwargs_dict.keys()):
-        value = kwargs_dict[key]
-        if any(x.isspace() for x in key):
+    # Static check that event shapes match.
+    if loc is not None:
+      loc_event_size = loc.get_shape()[-1].value
+      if loc_event_size is not None and event_size_const is not None:
+        if loc_event_size != 1 and loc_event_size != event_size_const:
           raise ValueError(
-              "Parameter name \"%s\" contains whitespace." % key)
-        value = value.lstrip()
-        if "\n" in value:
-          raise ValueError(
-              "Parameter description for \"%s\" contains newlines." % key)
-        bullets.append("*  `%s`: %s" % (key, value))
-      self._additional_note += ("\n\n##### `kwargs`:\n\n" +
-                                "\n".join(bullets))
+              "Event size of 'scale' (%d) could not be broadcast up to that of "
+              "'loc' (%d)." % (loc_event_size, event_size_const))
 
-  def __call__(self, fn):
-    @functools.wraps(fn)
-    def _fn(*args, **kwargs):
-      return fn(*args, **kwargs)
-    if _fn.__doc__ is None:
-      _fn.__doc__ = self._additional_note
+    # Get batch shape.
+    batch_shape = scale.batch_shape_tensor()
+    if loc is None:
+      batch_shape_const = tensor_util.constant_value(batch_shape)
+      batch_shape = (
+          batch_shape_const if batch_shape_const is not None else batch_shape)
     else:
-      _fn.__doc__ += "\n%s" % self._additional_note
-    return _fn
+      loc_batch_shape = loc.get_shape().with_rank_at_least(1)[:-1]
+      if (loc.get_shape().ndims is None or
+          not loc_batch_shape.is_fully_defined()):
+        loc_batch_shape = array_ops.shape(loc)[:-1]
+      else:
+        loc_batch_shape = ops.convert_to_tensor(loc_batch_shape,
+                                                name="loc_batch_shape")
+      batch_shape = prefer_static_broadcast_shape(batch_shape, loc_batch_shape)
+
+  return batch_shape, event_shape
+
+
+def prefer_static_broadcast_shape(
+    shape1, shape2, name="prefer_static_broadcast_shape"):
+  """Convenience function which statically broadcasts shape when possible.
+
+  Args:
+    shape1:  `1-D` integer `Tensor`.  Already converted to tensor!
+    shape2:  `1-D` integer `Tensor`.  Already converted to tensor!
+    name:  A string name to prepend to created ops.
+
+  Returns:
+    The broadcast shape, either as `TensorShape` (if broadcast can be done
+      statically), or as a `Tensor`.
+  """
+  with ops.name_scope(name, values=[shape1, shape2]):
+    if (tensor_util.constant_value(shape1) is not None and
+        tensor_util.constant_value(shape2) is not None):
+      return array_ops.broadcast_static_shape(
+          tensor_shape.TensorShape(tensor_util.constant_value(shape1)),
+          tensor_shape.TensorShape(tensor_util.constant_value(shape2)))
+    return array_ops.broadcast_dynamic_shape(shape1, shape2)
+
+
+def is_diagonal_scale(scale):
+  """Returns `True` if `scale` is a `LinearOperator` that is known to be diag.
+
+  Args:
+    scale:  `LinearOperator` instance.
+
+  Returns:
+    Python `bool`.
+
+  Raises:
+    TypeError:  If `scale` is not a `LinearOperator`.
+  """
+  if not isinstance(scale, linalg.LinearOperator):
+    raise TypeError("Expected argument 'scale' to be instance of LinearOperator"
+                    ". Found: %s" % scale)
+  return (isinstance(scale, linalg.LinearOperatorIdentity) or
+          isinstance(scale, linalg.LinearOperatorScaledIdentity) or
+          isinstance(scale, linalg.LinearOperatorDiag))
diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py
index aa7aef6681f16eb8220ebecccd4208ff4328ca4a..918200830c35536e110b9a2ce4fdf35e55caac18 100644
--- a/tensorflow/contrib/distributions/python/ops/geometric.py
+++ b/tensorflow/contrib/distributions/python/ops/geometric.py
@@ -19,8 +19,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -31,6 +29,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Geometric(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index db26c2b627eaefeacb5329268420abf63d43aa42..ba8d3c639b397422f0f6210ba9f48650f0da1e3e 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import math
 import numpy as np
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -29,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
 
 
 class _Gumbel(distribution.Distribution):
@@ -44,7 +44,7 @@ class _Gumbel(distribution.Distribution):
 
   where `loc = mu` and `scale = sigma`.
 
-  The cumulative densifyt function of this distribution is,
+  The cumulative density function of this distribution is,
 
   ```cdf(x; mu, sigma) = exp(-exp(-(x - mu) / sigma))```
 
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 7b91b00c2cddf13ebee1be6cf6beac68b9df09f4..956dee38a378813434656a28a69c89b6ec1e8b72 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +30,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index 6af16041c08b242c69841742c0f2b4dcae908a00..ce1a459cae9f409d4f7aeed1508eefe547863fae 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import math
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -31,6 +30,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
 
 
 class Logistic(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index 6417d7bd6d4a392d8d974d168d6e5d2e65c7ba64..f3b09f60f3e906daf073eacb90834920f506bb96 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import categorical
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -31,6 +28,9 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import categorical
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Mixture(distribution.Distribution):
@@ -229,6 +229,19 @@ class Mixture(distribution.Distribution):
       log_sum_exp = math_ops.reduce_logsumexp(concat_log_probs, [0])
       return log_sum_exp
 
+  def _log_cdf(self, x):
+    with ops.control_dependencies(self._assertions):
+      x = ops.convert_to_tensor(x, name="x")
+      distribution_log_cdfs = [d.log_cdf(x) for d in self.components]
+      cat_log_probs = self._cat_probs(log_probs=True)
+      final_log_cdfs = [
+          cat_lp + d_lcdf
+          for (cat_lp, d_lcdf) in zip(cat_log_probs, distribution_log_cdfs)
+      ]
+      concatted_log_cdfs = array_ops.stack(final_log_cdfs, axis=0)
+      mixture_log_cdf = math_ops.reduce_logsumexp(concatted_log_cdfs, [0])
+      return mixture_log_cdf
+
   def _prob(self, x):
     return math_ops.exp(self._log_prob(x))
 
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index 946c1dc16b5fd69d514e8435a5d3efa94ead8189..163cf75d990d5fe7ec1e3aaf0040fc71f61774a7 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -146,8 +146,8 @@ class MultivariateNormalDiag(
     The `batch_shape` is the broadcast shape between `loc` and `scale`
     arguments.
 
-    The `event_shape` is given by the last dimension of `loc` or the last
-    dimension of the matrix implied by `scale`.
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
 
     Recall that `covariance = scale @ scale.T`. A (non-batch) `scale` matrix is:
 
@@ -197,11 +197,14 @@ class MultivariateNormalDiag(
     with ops.name_scope(name):
       with ops.name_scope("init", values=[
           loc, scale_diag, scale_identity_multiplier]):
+        # No need to validate_args while making diag_scale.  The returned
+        # LinearOperatorDiag has an assert_non_singular method that is called by
+        # the Bijector.
         scale = distribution_util.make_diag_scale(
             loc=loc,
             scale_diag=scale_diag,
             scale_identity_multiplier=scale_identity_multiplier,
-            validate_args=validate_args,
+            validate_args=False,
             assert_positive=False)
     super(MultivariateNormalDiag, self).__init__(
         loc=loc,
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 1d4132cfd4ba58ae486fcc785244f70544985425..ee3e02e0203a3338b7e6a40b7e3ff30c0a0940f0 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -155,8 +155,8 @@ class MultivariateNormalDiagPlusLowRank(
     The `batch_shape` is the broadcast shape between `loc` and `scale`
     arguments.
 
-    The `event_shape` is given by the last dimension of `loc` or the last
-    dimension of the matrix implied by `scale`.
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
 
     Recall that `covariance = scale @ scale.T`. A (non-batch) `scale` matrix is:
 
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
new file mode 100644
index 0000000000000000000000000000000000000000..84809d8dc45dcafbdfa5e8771355d712812706e7
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -0,0 +1,187 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Multivariate Normal distribution class initialized with a full covariance."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import mvn_tril
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
+
+
+__all__ = [
+    "MultivariateNormalFullCovariance",
+]
+
+
+class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
+  """The multivariate normal distribution on `R^k`.
+
+  The Multivariate Normal distribution is defined over `R^k` and parameterized
+  by a (batch of) length-`k` `loc` vector (aka "mu") and a (batch of) `k x k`
+  `covariance_matrix` matrices that are the covariance.
+  This is different than the other multivariate normals, which are parameterized
+  by a matrix more akin to the standard deviation.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is, with `@` as matrix multiplication,
+
+  ```none
+  pdf(x; loc, covariance_matrix) = exp(-0.5 ||y||**2) / Z,
+  y = (x - loc)^T @ inv(covariance_matrix) @ (x - loc)
+  Z = (2 pi)**(0.5 k) |det(covariance_matrix)|**(0.5).
+  ```
+
+  where:
+
+  * `loc` is a vector in `R^k`,
+  * `covariance_matrix` is an `R^{k x k}` symmetric positive definite matrix,
+  * `Z` denotes the normalization constant, and,
+  * `||y||**2` denotes the squared Euclidean norm of `y`.
+
+  Additional leading dimensions (if any) in `loc` and `covariance_matrix` allow
+  for batch dimensions.
+
+  The MultivariateNormal distribution is a member of the [location-scale
+  family](https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
+  constructed e.g. as,
+
+  ```none
+  X ~ MultivariateNormal(loc=0, scale=1)   # Identity scale, zero shift.
+  scale = Cholesky(covariance_matrix)
+  Y = scale @ X + loc
+  ```
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+
+  # Initialize a single 3-variate Gaussian.
+  mu = [1., 2, 3]
+  cov = [[ 0.36,  0.12,  0.06],
+         [ 0.12,  0.29, -0.13],
+         [ 0.06, -0.13,  0.26]]
+  mvn = ds.MultivariateNormalFullCovariance(
+      loc=mu,
+      covariance_matrix=cov)
+
+  mvn.mean().eval()
+  # ==> [1., 2, 3]
+
+  # Covariance agrees with covariance_matrix.
+  mvn.covariance().eval()
+  # ==> [[ 0.36,  0.12,  0.06],
+  #      [ 0.12,  0.29, -0.13],
+  #      [ 0.06, -0.13,  0.26]]
+
+  # Compute the pdf of an observation in `R^3` ; return a scalar.
+  mvn.prob([-1., 0, 1]).eval()  # shape: []
+
+  # Initialize a 2-batch of 3-variate Gaussians.
+  mu = [[1., 2, 3],
+        [11, 22, 33]]              # shape: [2, 3]
+  covariance_matrix = ...  # shape: [2, 3, 3], symmetric, positive definite.
+  mvn = ds.MultivariateNormalFullCovariance(
+      loc=mu,
+      covariance=covariance_matrix)
+
+  # Compute the pdf of two `R^3` observations; return a length-2 vector.
+  x = [[-0.9, 0, 0.1],
+       [-10, 0, 9]]     # shape: [2, 3]
+  mvn.prob(x).eval()    # shape: [2]
+
+  ```
+
+  """
+
+  def __init__(self,
+               loc=None,
+               covariance_matrix=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="MultivariateNormalFullCovariance"):
+    """Construct Multivariate Normal distribution on `R^k`.
+
+    The `batch_shape` is the broadcast shape between `loc` and
+    `covariance_matrix` arguments.
+
+    The `event_shape` is given by last dimension of the matrix implied by
+    `covariance_matrix`. The last dimension of `loc` (if provided) must
+    broadcast with this.
+
+    A non-batch `covariance_matrix` matrix is a `k x k` symmetric positive
+    definite matrix.  In other words it is (real) symmetric with all eigenvalues
+    strictly positive.
+
+    Additional leading dimensions (if any) will index batches.
+
+    Args:
+      loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
+        implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
+        `b >= 0` and `k` is the event size.
+      covariance_matrix: Floating-point, symmetric positive definite `Tensor` of
+        same `dtype` as `loc`.  The strict upper triangle of `covariance_matrix`
+        is ignored, so if `covariance_matrix` is not symmetric no error will be
+        raised (unless `validate_args is True`).  `covariance_matrix` has shape
+        `[B1, ..., Bb, k, k]` where `b >= 0` and `k` is the event size.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+
+    Raises:
+      ValueError: if neither `loc` nor `covariance_matrix` are specified.
+    """
+    parameters = locals()
+
+    # Convert the covariance_matrix up to a scale_tril and call MVNTriL.
+    with ops.name_scope(name):
+      with ops.name_scope("init", values=[loc, covariance_matrix]):
+        if covariance_matrix is None:
+          scale_tril = None
+        else:
+          covariance_matrix = ops.convert_to_tensor(
+              covariance_matrix, name="covariance_matrix")
+          if validate_args:
+            assert_symmetric = check_ops.assert_equal(
+                covariance_matrix,
+                array_ops.matrix_transpose(covariance_matrix),
+                message="Matrix was not symmetric.")
+            covariance_matrix = control_flow_ops.with_dependencies(
+                [assert_symmetric], covariance_matrix)
+          # No need to validate that covariance_matrix is non-singular.
+          # LinearOperatorTriL has an assert_non_singular method that is called
+          # by the Bijector.
+          # However, cholesky() ignores the upper triangular part, so we do need
+          # to separately assert symmetric.
+          scale_tril = linalg_ops.cholesky(covariance_matrix)
+        super(MultivariateNormalFullCovariance, self).__init__(
+            loc=loc,
+            scale_tril=scale_tril,
+            validate_args=validate_args,
+            allow_nan_stats=allow_nan_stats)
+    self._parameters = parameters
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index 734648d2d6401be6ae79565af5e6f5163b6992c9..b25250d3671ff68a8362c7f2eaa8f586900f27e2 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -21,14 +21,13 @@ from __future__ import print_function
 from tensorflow.contrib import linalg
 from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import normal
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal
+from tensorflow.python.ops.distributions import transformed_distribution
 
 
 __all__ = [
@@ -60,7 +59,7 @@ class MultivariateNormalLinearOperator(
 
   The Multivariate Normal distribution is defined over `R^k` and parameterized
   by a (batch of) length-`k` `loc` vector (aka "mu") and a (batch of) `k x k`
-  `scale` matrix; `covariance = scale @ scale.T` where `@` denotes
+  `scale` matrix; `covariance = scale @ scale.T`, where `@` denotes
   matrix-multiplication.
 
   #### Mathematical Details
@@ -147,8 +146,8 @@ class MultivariateNormalLinearOperator(
     The `batch_shape` is the broadcast shape between `loc` and `scale`
     arguments.
 
-    The `event_shape` is given by the last dimension of `loc` or the last
-    dimension of the matrix implied by `scale`.
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
 
     Recall that `covariance = scale @ scale.T`.
 
@@ -179,12 +178,12 @@ class MultivariateNormalLinearOperator(
     if not scale.dtype.is_floating:
       raise TypeError("`scale` parameter must have floating-point dtype.")
 
-    # Since expand_dims doesn't preserve constant-ness, we obtain the
-    # non-dynamic value if possible.
-    event_shape = scale.domain_dimension_tensor()
-    if tensor_util.constant_value(event_shape) is not None:
-      event_shape = tensor_util.constant_value(event_shape)
-    event_shape = event_shape[array_ops.newaxis]
+    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+      # Since expand_dims doesn't preserve constant-ness, we obtain the
+      # non-dynamic value if possible.
+      loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
+      batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+          loc, scale)
 
     super(MultivariateNormalLinearOperator, self).__init__(
         distribution=normal.Normal(
@@ -192,7 +191,7 @@ class MultivariateNormalLinearOperator(
             scale=array_ops.ones([], dtype=scale.dtype)),
         bijector=bijectors.AffineLinearOperator(
             shift=loc, scale=scale, validate_args=validate_args),
-        batch_shape=scale.batch_shape_tensor(),
+        batch_shape=batch_shape,
         event_shape=event_shape,
         validate_args=validate_args,
         name=name)
@@ -208,18 +207,6 @@ class MultivariateNormalLinearOperator(
     """The `scale` `LinearOperator` in `Y = scale @ X + loc`."""
     return self.bijector.scale
 
-  def log_det_covariance(self, name="log_det_covariance"):
-    """Log of determinant of covariance matrix."""
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.scale.graph_parents):
-        return 2. * self.scale.log_abs_determinant()
-
-  def det_covariance(self, name="det_covariance"):
-    """Determinant of covariance matrix."""
-    with ops.name_scope(self.name):
-      with ops.name_scope(name, values=self.scale.graph_parents):
-        return math_ops.exp(2.* self.scale.log_abs_determinant())
-
   @distribution_util.AppendDocstring(_mvn_sample_note)
   def _log_prob(self, x):
     return super(MultivariateNormalLinearOperator, self)._log_prob(x)
@@ -248,41 +235,32 @@ class MultivariateNormalLinearOperator(
     return array_ops.identity(self.loc) + array_ops.zeros(shape, self.dtype)
 
   def _covariance(self):
-    if (isinstance(self.scale, linalg.LinearOperatorIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorDiag)):
+    if distribution_util.is_diagonal_scale(self.scale):
       return array_ops.matrix_diag(math_ops.square(self.scale.diag_part()))
     else:
-      # TODO(b/35040238): Remove transpose once LinOp supports `transpose`.
-      return self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense()))
+      return self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
 
   def _variance(self):
-    if (isinstance(self.scale, linalg.LinearOperatorIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorDiag)):
+    if distribution_util.is_diagonal_scale(self.scale):
       return math_ops.square(self.scale.diag_part())
     elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
           and self.scale.is_self_adjoint):
       return array_ops.matrix_diag_part(
-          self.scale.apply(self.scale.to_dense()))
+          self.scale.matmul(self.scale.to_dense()))
     else:
-      # TODO(b/35040238): Remove transpose once LinOp supports `transpose`.
       return array_ops.matrix_diag_part(
-          self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense())))
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True))
 
   def _stddev(self):
-    if (isinstance(self.scale, linalg.LinearOperatorIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorScaledIdentity) or
-        isinstance(self.scale, linalg.LinearOperatorDiag)):
+    if distribution_util.is_diagonal_scale(self.scale):
       return math_ops.abs(self.scale.diag_part())
     elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
           and self.scale.is_self_adjoint):
       return math_ops.sqrt(array_ops.matrix_diag_part(
-          self.scale.apply(self.scale.to_dense())))
+          self.scale.matmul(self.scale.to_dense())))
     else:
-      # TODO(b/35040238): Remove transpose once LinOp supports `transpose`.
       return math_ops.sqrt(array_ops.matrix_diag_part(
-          self.scale.apply(array_ops.matrix_transpose(self.scale.to_dense()))))
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)))
 
   def _mode(self):
     return self._mean()
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index e2a31c862494208477005b2be67dce1f9e95658d..d662b25e1e1dc1dc1053c22aef9fe6b7a440cdc0 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -19,13 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import linalg
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import mvn_linear_operator as mvn_linop
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -55,7 +51,7 @@ class MultivariateNormalTriL(
   where:
 
   * `loc` is a vector in `R^k`,
-  * `scale` is a linear operator in `R^{k x k}`, `cov = scale @ scale.T`,
+  * `scale` is a matrix in `R^{k x k}`, `covariance = scale @ scale.T`,
   * `Z` denotes the normalization constant, and,
   * `||y||**2` denotes the squared Euclidean norm of `y`.
 
@@ -140,8 +136,8 @@ class MultivariateNormalTriL(
     The `batch_shape` is the broadcast shape between `loc` and `scale`
     arguments.
 
-    The `event_shape` is given by the last dimension of `loc` or the last
-    dimension of the matrix implied by `scale`.
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
 
     Recall that `covariance = scale @ scale.T`. A (non-batch) `scale` matrix is:
 
@@ -191,14 +187,9 @@ class MultivariateNormalTriL(
               is_positive_definite=True,
               assert_proper_shapes=validate_args)
         else:
-          if validate_args:
-            scale_tril = control_flow_ops.with_dependencies([
-                # TODO(b/35157376): Use `assert_none_equal` once it exists.
-                check_ops.assert_greater(
-                    math_ops.abs(array_ops.matrix_diag_part(scale_tril)),
-                    array_ops.zeros([], scale_tril.dtype),
-                    message="`scale_tril` must have non-zero diagonal"),
-            ], scale_tril)
+          # No need to validate that scale_tril is non-singular.
+          # LinearOperatorTriL has an assert_non_singular method that is called
+          # by the Bijector.
           scale = linalg.LinearOperatorTriL(
               scale_tril,
               is_non_singular=True,
diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
index 294b733c3c0187e51be3704a50d63cae12cc1d6a..8895fd8b465bf1f1e6f6b818cfbfc1aaa86a522e 100644
--- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py
+++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -27,6 +25,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class NegativeBinomial(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py b/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
index 10e934326a1f0983325f8c99eaa7b03a0e3589be..4025285780b63560181b912635325ce7ebdc3ec2 100644
--- a/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
+++ b/tensorflow/contrib/distributions/python/ops/normal_conjugate_posteriors.py
@@ -18,9 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.normal import Normal  # pylint: disable=line-too-long
-
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import normal
 
 
 def normal_conjugates_known_scale_posterior(prior, scale, s, n):
@@ -65,7 +64,7 @@ def normal_conjugates_known_scale_posterior(prior, scale, s, n):
     TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
       Normal object.
   """
-  if not isinstance(prior, Normal):
+  if not isinstance(prior, normal.Normal):
     raise TypeError("Expected prior to be an instance of type Normal")
 
   if s.dtype != prior.dtype:
@@ -77,7 +76,7 @@ def normal_conjugates_known_scale_posterior(prior, scale, s, n):
   scale0_2 = math_ops.square(prior.scale)
   scale_2 = math_ops.square(scale)
   scalep_2 = 1.0/(1/scale0_2 + n/scale_2)
-  return Normal(
+  return normal.Normal(
       loc=(prior.loc/scale0_2 + s/scale_2) * scalep_2,
       scale=math_ops.sqrt(scalep_2))
 
@@ -131,7 +130,7 @@ def normal_conjugates_known_scale_predictive(prior, scale, s, n):
     TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
       Normal object.
   """
-  if not isinstance(prior, Normal):
+  if not isinstance(prior, normal.Normal):
     raise TypeError("Expected prior to be an instance of type Normal")
 
   if s.dtype != prior.dtype:
@@ -143,6 +142,6 @@ def normal_conjugates_known_scale_predictive(prior, scale, s, n):
   scale0_2 = math_ops.square(prior.scale)
   scale_2 = math_ops.square(scale)
   scalep_2 = 1.0/(1/scale0_2 + n/scale_2)
-  return Normal(
+  return normal.Normal(
       loc=(prior.loc/scale0_2 + s/scale_2) * scalep_2,
       scale=math_ops.sqrt(scalep_2 + scale_2))
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index 1679a797e131ca7f24b010bceb600569560f456d..b76cebf79fad09ebec68f2459c6fe80794ea81c0 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -18,9 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -29,6 +26,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class OneHotCategorical(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_full.py b/tensorflow/contrib/distributions/python/ops/operator_pd_full.py
index 548374bcde67fec955247e1bb3ab63ae99d19014..3ca341bb830b0baafa75765abe7f695021bfed1e 100644
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_full.py
+++ b/tensorflow/contrib/distributions/python/ops/operator_pd_full.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py b/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py
index e6947bf60935528ed8fc6f2ba76e0be0980c76f3..4cee2997909dbd105fd045be9ea1238a343a2c27 100644
--- a/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py
+++ b/tensorflow/contrib/distributions/python/ops/operator_pd_identity.py
@@ -115,7 +115,7 @@ class OperatorPDIdentity(operator_pd.OperatorPDBase):
     """Static check that the argument `x` is proper `shape`, `dtype`."""
     # x is a typical argument e.g. to matmul or solve.  In both cases, x should
     # have the same type/shape since this is a square matrix.  These checks are
-    # ususally not needed since we ususally have some tensor backing this
+    # usually not needed since we usually have some tensor backing this
     # distribution, and the calls to tf.matmul do a shape/type check.
     #
     # Static checks only for efficiency, the identity should be fast.
diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py
index 3c4f1cc1d895c47bf629f985467053d57c144edb..d9929183c1a85f2ed16f289c795c4c7bf46caec0 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -28,6 +26,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 __all__ = [
     "Poisson",
diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
index 6241dbfcf021bf50668a1ac17d31c8bf34777f49..8aebb79b9138cce1373e6472d17cf9072d2bc285 100644
--- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution as distributions
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution as distributions
+from tensorflow.python.ops.distributions import util as distribution_util
 
 __all__ = ["QuantizedDistribution"]
 
@@ -232,7 +232,7 @@ class QuantizedDistribution(distributions.Distribution):
       graph_parents = self._dist._graph_parents  # pylint: disable=protected-access
 
       checks = []
-      if low is not None and high is not None:
+      if validate_args and low is not None and high is not None:
         message = "low must be strictly less than high."
         checks.append(
             check_ops.assert_less(
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index 7fee2e1f3a16f6c66c41093146f6c2e420f9d595..5b57a95c55eca7f3d6301c1e87a6cf52f040ab26 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -18,14 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import logistic
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
-from tensorflow.contrib.distributions.python.ops.bijectors import sigmoid as sigmoid_lib
+# Bijectors must be directly imported because `remove_undocumented` prevents
+# individual file imports.
+from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import Sigmoid
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
@@ -171,12 +173,13 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
       self._logits, self._probs = distribution_util.get_logits_and_probs(
           logits=logits, probs=probs, validate_args=validate_args)
       super(RelaxedBernoulli, self).__init__(
-          distribution=logistic.Logistic(self._logits / self._temperature,
-                                         1. / self._temperature,
-                                         validate_args=validate_args,
-                                         allow_nan_stats=allow_nan_stats,
-                                         name=name + "/Logistic"),
-          bijector=sigmoid_lib.Sigmoid(validate_args=validate_args),
+          distribution=logistic.Logistic(
+              self._logits / self._temperature,
+              1. / self._temperature,
+              validate_args=validate_args,
+              allow_nan_stats=allow_nan_stats,
+              name=name + "/Logistic"),
+          bijector=Sigmoid(validate_args=validate_args),
           validate_args=validate_args,
           name=name)
     self._parameters = parameters
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index e81c5935a11d1a53f058b3de7563bc8b4c0137b4..da1cd72a6f13f7c585a60d0be122c212671fe5e8 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -31,6 +28,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class ExpRelaxedOneHotCategorical(distribution.Distribution):
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index 0b1ceefe7bf944a0713c20ad378c64f58464c0fc..26cf922d0afe0c8c07da1e3e8da43e1d5cea25c4 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -44,7 +44,7 @@ def percentile(x,
                keep_dims=False,
                validate_args=False,
                name=None):
-  """Compute the `q`-th percentile of `x` along leading (sample) dimensions.
+  """Compute the `q`-th percentile of `x`.
 
   Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
   way from the minimum to the maximum in in a sorted copy of `x`.
@@ -58,7 +58,7 @@ def percentile(x,
 
 
   ```python
-  # Get 30th percentile with default ('linear') interpolation.
+  # Get 30th percentile with default ('nearest') interpolation.
   x = [1., 2., 3., 4.]
   percentile(x, q=30.)
   ==> 2.0
@@ -91,11 +91,10 @@ def percentile(x,
     axis:  Optional `0-D` or `1-D` integer `Tensor` with constant values.
       The axis that hold independent samples over which to return the desired
       percentile.  If `None` (the default), treat every dimension as a sample
-      dimension, returning a scalar
+      dimension, returning a scalar.
     interpolation : {"lower", "higher", "nearest"}.  Default: "nearest"
       This optional parameter specifies the interpolation method to
-      use when the desired quantile lies between two data points
-      `i < j`:
+      use when the desired quantile lies between two data points `i < j`:
         * lower: `i`.
         * higher: `j`.
         * nearest: `i` or `j`, whichever is nearest.
diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py
index b1271d61c0ad45f48733fd61c541417e58f0c618..516d7b60fecbffec197a40ae361204a9b620988a 100644
--- a/tensorflow/contrib/distributions/python/ops/shape.py
+++ b/tensorflow/contrib/distributions/python/ops/shape.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 import contextlib
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -27,6 +26,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class _DistributionShape(object):
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e3867809a820f49cfa7f5282c47f786626481a6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
@@ -0,0 +1,232 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution of a vectorized Laplace, with uncorrelated components."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops import vector_laplace_linear_operator as vector_laplace_linop
+from tensorflow.python.framework import ops
+
+
+__all__ = [
+    "VectorLaplaceDiag",
+]
+
+
+class VectorLaplaceDiag(
+    vector_laplace_linop.VectorLaplaceLinearOperator):
+  """The vectorization of the Laplace distribution on `R^k`.
+
+  The vector laplace distribution is defined over `R^k`, and parameterized by
+  a (batch of) length-`k` `loc` vector (the means) and a (batch of) `k x k`
+  `scale` matrix:  `covariance = 2 * scale @ scale.T`, where `@` denotes
+  matrix-multiplication.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(x; loc, scale) = exp(-||y||_1) / Z,
+  y = inv(scale) @ (x - loc),
+  Z = 2**k |det(scale)|,
+  ```
+
+  where:
+
+  * `loc` is a vector in `R^k`,
+  * `scale` is a linear operator in `R^{k x k}`, `cov = scale @ scale.T`,
+  * `Z` denotes the normalization constant, and,
+  * `||y||_1` denotes the `l1` norm of `y`, `sum_i |y_i|.
+
+  A (non-batch) `scale` matrix is:
+
+  ```none
+  scale = diag(scale_diag + scale_identity_multiplier * ones(k))
+  ```
+
+  where:
+
+  * `scale_diag.shape = [k]`, and,
+  * `scale_identity_multiplier.shape = []`.
+
+  Additional leading dimensions (if any) will index batches.
+
+  If both `scale_diag` and `scale_identity_multiplier` are `None`, then
+  `scale` is the Identity matrix.
+
+  The VectorLaplace distribution is a member of the [location-scale
+  family](https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
+  constructed as,
+
+  ```none
+  X = (X_1, ..., X_k), each X_i ~ Laplace(loc=0, scale=1)
+  Y = (Y_1, ...,Y_k) = scale @ X + loc
+  ```
+
+  #### About `VectorLaplace` and `Vector` distributions in TensorFlow.
+
+  The `VectorLaplace` is a non-standard distribution that has useful properties.
+
+  The marginals `Y_1, ..., Y_k` are *not* Laplace random variables, due to
+  the fact that the sum of Laplace random variables is not Laplace.
+
+  Instead, `Y` is a vector whose components are linear combinations of Laplace
+  random variables.  Thus, `Y` lives in the vector space generated by `vectors`
+  of Laplace distributions.  This allows the user to decide the mean and
+  covariance (by setting `loc` and `scale`), while preserving some properties of
+  the Laplace distribution.  In particular, the tails of `Y_i` will be (up to
+  polynomial factors) exponentially decaying.
+
+  To see this last statement, note that the pdf of `Y_i` is the convolution of
+  the pdf of `k` independent Laplace random variables.  One can then show by
+  induction that distributions with exponential (up to polynomial factors) tails
+  are closed under convolution.
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+
+  # Initialize a single 2-variate VectorLaplace.
+  vla = ds.VectorLaplaceDiag(
+      loc=[1., -1],
+      scale_diag=[1, 2.])
+
+  vla.mean().eval()
+  # ==> [1., -1]
+
+  vla.stddev().eval()
+  # ==> [1., 2] * sqrt(2)
+
+  # Evaluate this on an observation in `R^2`, returning a scalar.
+  vla.prob([-1., 0]).eval()  # shape: []
+
+  # Initialize a 3-batch, 2-variate scaled-identity VectorLaplace.
+  vla = ds.VectorLaplaceDiag(
+      loc=[1., -1],
+      scale_identity_multiplier=[1, 2., 3])
+
+  vla.mean().eval()  # shape: [3, 2]
+  # ==> [[1., -1]
+  #      [1, -1],
+  #      [1, -1]]
+
+  vla.stddev().eval()  # shape: [3, 2]
+  # ==> sqrt(2) * [[1., 1],
+  #                [2, 2],
+  #                [3, 3]]
+
+  # Evaluate this on an observation in `R^2`, returning a length-3 vector.
+  vla.prob([-1., 0]).eval()  # shape: [3]
+
+  # Initialize a 2-batch of 3-variate VectorLaplace's.
+  vla = ds.VectorLaplaceDiag(
+      loc=[[1., 2, 3],
+           [11, 22, 33]]           # shape: [2, 3]
+      scale_diag=[[1., 2, 3],
+                  [0.5, 1, 1.5]])  # shape: [2, 3]
+
+  # Evaluate this on a two observations, each in `R^3`, returning a length-2
+  # vector.
+  x = [[-1., 0, 1],
+       [-11, 0, 11.]]   # shape: [2, 3].
+  vla.prob(x).eval()    # shape: [2]
+  ```
+
+  """
+
+  def __init__(self,
+               loc=None,
+               scale_diag=None,
+               scale_identity_multiplier=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="VectorLaplaceDiag"):
+    """Construct Vector Laplace distribution on `R^k`.
+
+    The `batch_shape` is the broadcast shape between `loc` and `scale`
+    arguments.
+
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
+
+    Recall that `covariance = 2 * scale @ scale.T`.
+
+    ```none
+    scale = diag(scale_diag + scale_identity_multiplier * ones(k))
+    ```
+
+    where:
+
+    * `scale_diag.shape = [k]`, and,
+    * `scale_identity_multiplier.shape = []`.
+
+    Additional leading dimensions (if any) will index batches.
+
+    If both `scale_diag` and `scale_identity_multiplier` are `None`, then
+    `scale` is the Identity matrix.
+
+    Args:
+      loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
+        implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
+        `b >= 0` and `k` is the event size.
+      scale_diag: Non-zero, floating-point `Tensor` representing a diagonal
+        matrix added to `scale`. May have shape `[B1, ..., Bb, k]`, `b >= 0`,
+        and characterizes `b`-batches of `k x k` diagonal matrices added to
+        `scale`. When both `scale_identity_multiplier` and `scale_diag` are
+        `None` then `scale` is the `Identity`.
+      scale_identity_multiplier: Non-zero, floating-point `Tensor` representing
+        a scaled-identity-matrix added to `scale`. May have shape
+        `[B1, ..., Bb]`, `b >= 0`, and characterizes `b`-batches of scaled
+        `k x k` identity matrices added to `scale`. When both
+        `scale_identity_multiplier` and `scale_diag` are `None` then `scale` is
+        the `Identity`.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+
+    Raises:
+      ValueError: if at most `scale_identity_multiplier` is specified.
+    """
+    parameters = locals()
+    with ops.name_scope(name):
+      with ops.name_scope("init", values=[
+          loc, scale_diag, scale_identity_multiplier]):
+        # No need to validate_args while making diag_scale.  The returned
+        # LinearOperatorDiag has an assert_non_singular method that is called by
+        # the Bijector.
+        scale = distribution_util.make_diag_scale(
+            loc=loc,
+            scale_diag=scale_diag,
+            scale_identity_multiplier=scale_identity_multiplier,
+            validate_args=False,
+            assert_positive=False)
+    super(VectorLaplaceDiag, self).__init__(
+        loc=loc,
+        scale=scale,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        name=name)
+    self._parameters = parameters
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd2c46d94de9c031768be1410990b180b30497d2
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -0,0 +1,294 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Vectorized Laplace distribution class, directly using LinearOpeartor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import linalg
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import laplace
+from tensorflow.python.ops.distributions import transformed_distribution
+
+
+__all__ = [
+    "VectorLaplaceLinearOperator"
+]
+
+_mvn_sample_note = """
+`value` is a batch vector with compatible shape if `value` is a `Tensor` whose
+shape can be broadcast up to either:
+
+```python
+self.batch_shape + self.event_shape
+```
+
+or
+
+```python
+[M1, ..., Mm] + self.batch_shape + self.event_shape
+```
+
+"""
+
+
+class VectorLaplaceLinearOperator(
+    transformed_distribution.TransformedDistribution):
+  """The vectorization of the Laplace distribution on `R^k`.
+
+  The vector laplace distribution is defined over `R^k`, and parameterized by
+  a (batch of) length-`k` `loc` vector (the means) and a (batch of) `k x k`
+  `scale` matrix:  `covariance = 2 * scale @ scale.T`, where `@` denotes
+  matrix-multiplication.
+
+  #### Mathematical Details
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(x; loc, scale) = exp(-||y||_1) / Z,
+  y = inv(scale) @ (x - loc),
+  Z = 2**k |det(scale)|,
+  ```
+
+  where:
+
+  * `loc` is a vector in `R^k`,
+  * `scale` is a linear operator in `R^{k x k}`, `cov = scale @ scale.T`,
+  * `Z` denotes the normalization constant, and,
+  * `||y||_1` denotes the `l1` norm of `y`, `sum_i |y_i|.
+
+  The VectorLaplace distribution is a member of the [location-scale
+  family](https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
+  constructed as,
+
+  ```none
+  X = (X_1, ..., X_k), each X_i ~ Laplace(loc=0, scale=1)
+  Y = (Y_1, ...,Y_k) = scale @ X + loc
+  ```
+
+  #### About `VectorLaplace` and `Vector` distributions in TensorFlow.
+
+  The `VectorLaplace` is a non-standard distribution that has useful properties.
+
+  The marginals `Y_1, ..., Y_k` are *not* Laplace random variables, due to
+  the fact that the sum of Laplace random variables is not Laplace.
+
+  Instead, `Y` is a vector whose components are linear combinations of Laplace
+  random variables.  Thus, `Y` lives in the vector space generated by `vectors`
+  of Laplace distributions.  This allows the user to decide the mean and
+  covariance (by setting `loc` and `scale`), while preserving some properties of
+  the Laplace distribution.  In particular, the tails of `Y_i` will be (up to
+  polynomial factors) exponentially decaying.
+
+  To see this last statement, note that the pdf of `Y_i` is the convolution of
+  the pdf of `k` independent Laplace random variables.  One can then show by
+  induction that distributions with exponential (up to polynomial factors) tails
+  are closed under convolution.
+
+
+  #### Examples
+
+  ```python
+  ds = tf.contrib.distributions
+  la = tf.contrib.linalg
+
+  # Initialize a single 3-variate VectorLaplace with some desired covariance.
+  mu = [1., 2, 3]
+  cov = [[ 0.36,  0.12,  0.06],
+         [ 0.12,  0.29, -0.13],
+         [ 0.06, -0.13,  0.26]]
+
+  scale = tf.cholesky(cov)
+  # ==> [[ 0.6,  0. ,  0. ],
+  #      [ 0.2,  0.5,  0. ],
+  #      [ 0.1, -0.3,  0.4]])
+
+  # Divide scale by sqrt(2) so that the final covariance will be what we want.
+  vla = ds.VectorLaplaceLinearOperator(
+      loc=mu,
+      scale=la.LinearOperatorTriL(scale / tf.sqrt(2)))
+
+  # Covariance agrees with cholesky(cov) parameterization.
+  vla.covariance().eval()
+  # ==> [[ 0.36,  0.12,  0.06],
+  #      [ 0.12,  0.29, -0.13],
+  #      [ 0.06, -0.13,  0.26]]
+
+  # Compute the pdf of an`R^3` observation; return a scalar.
+  vla.prob([-1., 0, 1]).eval()  # shape: []
+
+  # Initialize a 2-batch of 3-variate Vector Laplace's.
+  mu = [[1., 2, 3],
+        [11, 22, 33]]              # shape: [2, 3]
+  scale_diag = [[1., 2, 3],
+                [0.5, 1, 1.5]]     # shape: [2, 3]
+
+  vla = ds.VectorLaplaceLinearOperator(
+      loc=mu,
+      scale=la.LinearOperatorDiag(scale_diag))
+
+  # Compute the pdf of two `R^3` observations; return a length-2 vector.
+  x = [[-0.9, 0, 0.1],
+       [-10, 0, 9]]     # shape: [2, 3]
+  vla.prob(x).eval()    # shape: [2]
+  ```
+
+  """
+
+  def __init__(self,
+               loc=None,
+               scale=None,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="VectorLaplaceLinearOperator"):
+    """Construct Vector Laplace distribution on `R^k`.
+
+    The `batch_shape` is the broadcast shape between `loc` and `scale`
+    arguments.
+
+    The `event_shape` is given by last dimension of the matrix implied by
+    `scale`. The last dimension of `loc` (if provided) must broadcast with this.
+
+    Recall that `covariance = 2 * scale @ scale.T`.
+
+    Additional leading dimensions (if any) will index batches.
+
+    Args:
+      loc: Floating-point `Tensor`. If this is set to `None`, `loc` is
+        implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where
+        `b >= 0` and `k` is the event size.
+      scale: Instance of `LinearOperator` with same `dtype` as `loc` and shape
+        `[B1, ..., Bb, k, k]`.
+      validate_args: Python `bool`, default `False`. Whether to validate input
+        with asserts. If `validate_args` is `False`, and the inputs are
+        invalid, correct behavior is not guaranteed.
+      allow_nan_stats: Python `bool`, default `True`. If `False`, raise an
+        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
+        batch member If `True`, batch members with valid parameters leading to
+        undefined statistics will return NaN for this statistic.
+      name: The name to give Ops created by the initializer.
+
+    Raises:
+      ValueError: if `scale` is unspecified.
+      TypeError: if not `scale.dtype.is_floating`
+    """
+    parameters = locals()
+    if scale is None:
+      raise ValueError("Missing required `scale` parameter.")
+    if not scale.dtype.is_floating:
+      raise TypeError("`scale` parameter must have floating-point dtype.")
+
+    with ops.name_scope(name, values=[loc] + scale.graph_parents):
+      # Since expand_dims doesn't preserve constant-ness, we obtain the
+      # non-dynamic value if possible.
+      loc = ops.convert_to_tensor(loc, name="loc") if loc is not None else loc
+      batch_shape, event_shape = distribution_util.shapes_from_loc_and_scale(
+          loc, scale)
+
+      super(VectorLaplaceLinearOperator, self).__init__(
+          distribution=laplace.Laplace(
+              loc=array_ops.zeros([], dtype=scale.dtype),
+              scale=array_ops.ones([], dtype=scale.dtype)),
+          bijector=bijectors.AffineLinearOperator(
+              shift=loc, scale=scale, validate_args=validate_args),
+          batch_shape=batch_shape,
+          event_shape=event_shape,
+          validate_args=validate_args,
+          name=name)
+      self._parameters = parameters
+
+  @property
+  def loc(self):
+    """The `loc` `Tensor` in `Y = scale @ X + loc`."""
+    return self.bijector.shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + loc`."""
+    return self.bijector.scale
+
+  @distribution_util.AppendDocstring(_mvn_sample_note)
+  def _log_prob(self, x):
+    return super(VectorLaplaceLinearOperator, self)._log_prob(x)
+
+  @distribution_util.AppendDocstring(_mvn_sample_note)
+  def _prob(self, x):
+    return super(VectorLaplaceLinearOperator, self)._prob(x)
+
+  def _mean(self):
+    shape = self.batch_shape.concatenate(self.event_shape)
+    has_static_shape = shape.is_fully_defined()
+    if not has_static_shape:
+      shape = array_ops.concat([
+          self.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], 0)
+
+    if self.loc is None:
+      return array_ops.zeros(shape, self.dtype)
+
+    if has_static_shape and shape == self.loc.get_shape():
+      return array_ops.identity(self.loc)
+
+    # Add dummy tensor of zeros to broadcast.  This is only necessary if shape
+    # != self.loc.shape, but we could not determine if this is the case.
+    return array_ops.identity(self.loc) + array_ops.zeros(shape, self.dtype)
+
+  def _covariance(self):
+    # Let
+    #   W = (w1,...,wk), with wj ~ iid Laplace(0, 1).
+    # Then this distribution is
+    #   X = loc + LW,
+    # and since E[X] = loc,
+    #   Cov(X) = E[LW W^T L^T] = L E[W W^T] L^T.
+    # Since E[wi wj] = 0 if i != j, and 2 if i == j, we have
+    #   Cov(X) = 2 LL^T
+    if distribution_util.is_diagonal_scale(self.scale):
+      return 2. * array_ops.matrix_diag(math_ops.square(self.scale.diag_part()))
+    else:
+      return 2. * self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)
+
+  def _variance(self):
+    if distribution_util.is_diagonal_scale(self.scale):
+      return 2. * math_ops.square(self.scale.diag_part())
+    elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
+          and self.scale.is_self_adjoint):
+      return array_ops.matrix_diag_part(
+          2. * self.scale.matmul(self.scale.to_dense()))
+    else:
+      return 2. * array_ops.matrix_diag_part(
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True))
+
+  def _stddev(self):
+    if distribution_util.is_diagonal_scale(self.scale):
+      return np.sqrt(2) * math_ops.abs(self.scale.diag_part())
+    elif (isinstance(self.scale, linalg.LinearOperatorUDVHUpdate)
+          and self.scale.is_self_adjoint):
+      return np.sqrt(2) * math_ops.sqrt(array_ops.matrix_diag_part(
+          self.scale.matmul(self.scale.to_dense())))
+    else:
+      return np.sqrt(2) * math_ops.sqrt(array_ops.matrix_diag_part(
+          self.scale.matmul(self.scale.to_dense(), adjoint_arg=True)))
+
+  def _mode(self):
+    return self._mean()
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 2f4b33b7e7c74e4363a32feb1e40d6678fbbc29e..ae804b61727b820b2af3c32f05818324bfbccf93 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -19,14 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import student_t
-from tensorflow.contrib.distributions.python.ops import transformed_distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import student_t
+from tensorflow.python.ops.distributions import transformed_distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 # TODO(jvdillon): Add unittests for this once we know where will put this code
@@ -191,7 +191,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   ```
 
   For more examples of how to construct the `scale` matrix, see the
-  `bijectors.Affine` docstring.
+  `tf.contrib.distributions.bijectors.Affine` docstring.
 
   """
 
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index 6ed2c4dfb09353ff7b023dc87b00b28df293ed43..e162a796100ae877c92932c0a805787526eb7ce0 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 import math
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.distributions.python.ops import operator_pd_cholesky
 from tensorflow.contrib.distributions.python.ops import operator_pd_full
 from tensorflow.python.framework import constant_op
@@ -35,6 +33,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index aa26c6060fb45bb1e19e27bfd54e68a8b8c1b575..60e7c8f160a8e07e7c8d59ce731395586c7ab474 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -35,6 +35,7 @@ tf_custom_op_py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":factorization_ops_test_utils_py",
         ":gen_clustering_ops",
         ":gen_factorization_ops",
         "//tensorflow/contrib/framework:framework_py",
@@ -137,6 +138,7 @@ tf_py_test(
         "//tensorflow/python:platform_test",
     ],
     tags = [
+        "no_pip",  # b/38283730
         "notsan",  # Flaky: b/30756419
     ],
 )
@@ -161,12 +163,28 @@ tf_py_test(
     ],
 )
 
+py_library(
+    name = "factorization_ops_test_utils_py",
+    srcs = [
+        "python/ops/factorization_ops_test_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_ops",
+    ],
+)
+
 tf_py_test(
     name = "factorization_ops_test",
     srcs = ["python/ops/factorization_ops_test.py"],
     additional_deps = [
         ":factorization_py",
         ":factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",
+        ":factorization_ops_test_utils_py",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -185,6 +203,7 @@ tf_py_test(
     additional_deps = [
         ":factorization_py",
         ":factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",
+        ":factorization_ops_test_utils_py",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/factorization/kernels/BUILD b/tensorflow/contrib/factorization/kernels/BUILD
index 195a58e04081c382d9fbab61c5b7177628156955..f10a5586b8fc70c2c02a322aecaedbd7f72f79b7 100644
--- a/tensorflow/contrib/factorization/kernels/BUILD
+++ b/tensorflow/contrib/factorization/kernels/BUILD
@@ -6,8 +6,6 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-
 cc_library(
     name = "all_kernels",
     deps = [
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 253f2d81e58d74575c9980625db222103da42999..d3fa233a1245a2794f8302212b8ad599d0bb42e1 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -34,7 +34,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.embedding_ops import embedding_lookup
 from tensorflow.python.platform import resource_loader
 
@@ -327,31 +327,34 @@ class KMeans(object):
         cluster_centers_updated back to cluster_centers.
     """
     init_value = array_ops.constant([], dtype=dtypes.float32)
-    cluster_centers = variables.Variable(init_value,
-                                         name='clusters',
-                                         validate_shape=False)
-    cluster_centers_initialized = variables.Variable(False,
-                                                     dtype=dtypes.bool,
-                                                     name='initialized')
+    cluster_centers = variable_scope.variable(init_value,
+                                              name='clusters',
+                                              validate_shape=False)
+    cluster_centers_initialized = variable_scope.variable(False,
+                                                          dtype=dtypes.bool,
+                                                          name='initialized')
 
     if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
       # Copy of cluster centers actively updated each step according to
       # mini-batch update rule.
-      cluster_centers_updated = variables.Variable(init_value,
-                                                   name='clusters_updated',
-                                                   validate_shape=False)
+      cluster_centers_updated = variable_scope.variable(init_value,
+                                                        name='clusters_updated',
+                                                        validate_shape=False)
       # How many steps till we copy the updated clusters to cluster_centers.
-      update_in_steps = variables.Variable(self._mini_batch_steps_per_iteration,
-                                           dtype=dtypes.int64,
-                                           name='update_in_steps')
+      update_in_steps = variable_scope.variable(
+          self._mini_batch_steps_per_iteration,
+          dtype=dtypes.int64,
+          name='update_in_steps')
       # Count of points assigned to cluster_centers_updated.
-      cluster_counts = variables.Variable(array_ops.zeros([self._num_clusters],
-                                                          dtype=dtypes.int64))
+      cluster_counts = variable_scope.variable(
+          array_ops.zeros([self._num_clusters],
+                          dtype=dtypes.int64))
     else:
       cluster_centers_updated = cluster_centers
       update_in_steps = None
-      cluster_counts = (variables.Variable(array_ops.ones([self._num_clusters],
-                                                          dtype=dtypes.int64))
+      cluster_counts = (variable_scope.variable(array_ops.ones(
+          [self._num_clusters],
+          dtype=dtypes.int64))
                         if self._use_mini_batch else None)
     return (cluster_centers,
             cluster_centers_initialized,
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
index b853652629c1255f350e153aaef1d20800b55ed5..000e2403a7009c06284c8e1079eb4c02bb7a7add 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
@@ -40,6 +40,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import resource_loader
 
@@ -190,7 +191,8 @@ class WALSModel(object):
                num_col_shards=1,
                row_weights=1,
                col_weights=1,
-               use_factors_weights_cache=True):
+               use_factors_weights_cache=True,
+               use_gramian_cache=True):
     """Creates model for WALS matrix factorization.
 
     Args:
@@ -224,6 +226,8 @@ class WALSModel(object):
       col_weights: See row_weights.
       use_factors_weights_cache: When True, the factors and weights will be
         cached on the workers before the updates start. Defaults to True.
+      use_gramian_cache: When True, the Gramians will be cached on the workers
+        before the updates start. Defaults to True.
     """
     self._input_rows = input_rows
     self._input_cols = input_cols
@@ -243,6 +247,7 @@ class WALSModel(object):
                                                   self._num_col_shards,
                                                   "col_weights")
     self._use_factors_weights_cache = use_factors_weights_cache
+    self._use_gramian_cache = use_gramian_cache
     self._row_factors = self._create_factors(self._input_rows,
                                              self._n_components,
                                              self._num_row_shards, row_init,
@@ -324,7 +329,7 @@ class WALSModel(object):
       var_name = "%s_shard_%d" % (name, i)
       var_init = make_initializer(i, size)
       sharded_matrix.append(
-          variables.Variable(
+          variable_scope.variable(
               var_init, dtype=dtypes.float32, name=var_name))
 
     return sharded_matrix
@@ -383,7 +388,7 @@ class WALSModel(object):
       var_name = "%s_shard_%d" % (name, i)
       var_init = make_wt_initializer(i, size)
       sharded_weight.append(
-          variables.Variable(
+          variable_scope.variable(
               var_init, dtype=dtypes.float32, name=var_name))
 
     return sharded_weight
@@ -400,7 +405,7 @@ class WALSModel(object):
     Returns:
       A gramian Tensor with shape of [n_components, n_components].
     """
-    return variables.Variable(
+    return variable_scope.variable(
         array_ops.zeros([n_components, n_components]),
         dtype=dtypes.float32,
         name=name)
@@ -408,7 +413,7 @@ class WALSModel(object):
   @staticmethod
   def _transient_var(name):
     """Helper function to create a Variable."""
-    return variables.Variable(
+    return variable_scope.variable(
         1.0,
         trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES],
@@ -495,10 +500,13 @@ class WALSModel(object):
     """Creates local cache of factors, weights and gramian for rows and columns.
 
     Note that currently the caching strategy is as follows:
-    When initiating a row(column) update, the column(row) gramian is computed
-    and cached while the row gramian is reset; optionally, column(row) factors
-    and weights are cached and row(column) factors and weights are reset when
-    use_factors_weights_cache is True.
+    When initiating a row (resp. column) update:
+      - The column (resp. row) gramian is computed.
+      - Optionally, if use_gramian_cache is True, the column (resp. row) Gramian
+        is cached, while the row (resp. column) gramian is reset.
+      - Optionally, if use_factors_weights_cache is True, the column (resp. row)
+        factors and weights are cached, while the row (resp. column) factors and
+        weights are reset.
     """
 
     (self._row_factors_cache, row_factors_cache_init,
@@ -515,18 +523,20 @@ class WALSModel(object):
         self._row_weights,
         "row_wt_cache",
         pass_through=not self._use_factors_weights_cache)
-
     (self._col_wt_cache, col_wt_cache_init, _) = self._cached_copy(
         self._col_weights,
         "col_wt_cache",
         pass_through=not self._use_factors_weights_cache)
-
     (self._row_gramian_cache, row_gramian_cache_init,
      row_gramian_cache_reset) = self._cached_copy(
-         self._row_gramian, "row_gramian_cache", pass_through=False)
+         self._row_gramian,
+         "row_gramian_cache",
+         pass_through=not self._use_gramian_cache)
     (self._col_gramian_cache, col_gramian_cache_init,
      col_gramian_cache_reset) = self._cached_copy(
-         self._col_gramian, "col_gramian_cache", pass_through=False)
+         self._col_gramian,
+         "col_gramian_cache",
+         pass_through=not self._use_gramian_cache)
 
     self._row_updates_init = control_flow_ops.group(col_factors_cache_init,
                                                     row_factors_cache_reset,
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
index 40b8550ac83b3abb868c8de00aff96949054b9e0..bcee881854586571061264155b2b346c88d4860c 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
@@ -18,160 +18,56 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import random
-
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.factorization.python.ops import factorization_ops
-from tensorflow.python.framework import constant_op
+from tensorflow.contrib.factorization.python.ops import factorization_ops_test_utils
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
-INPUT_MATRIX = np.array(
-    [[0.1, 0.0, 0.2, 0.0, 0.4, 0.5, 0.0],
-     [0.0, 1.1, 0.0, 1.3, 1.4, 0.0, 1.6],
-     [2.0, 0.0, 0.0, 2.3, 0.0, 2.5, 0.0],
-     [3.0, 0.0, 3.2, 3.3, 0.0, 3.5, 0.0],
-     [0.0, 4.1, 0.0, 0.0, 4.4, 0.0, 4.6]]).astype(np.float32)
-
-
-def np_matrix_to_tf_sparse(np_matrix,
-                           row_slices=None,
-                           col_slices=None,
-                           transpose=False,
-                           shuffle=False):
-  """Simple util to slice non-zero np matrix elements as tf.SparseTensor."""
-  indices = np.nonzero(np_matrix)
-
-  # Only allow slices of whole rows or whole columns.
-  assert not (row_slices is not None and col_slices is not None)
-
-  if row_slices is not None:
-    selected_ind = np.concatenate(
-        [np.where(indices[0] == r)[0] for r in row_slices], 0)
-    indices = (indices[0][selected_ind], indices[1][selected_ind])
-
-  if col_slices is not None:
-    selected_ind = np.concatenate(
-        [np.where(indices[1] == c)[0] for c in col_slices], 0)
-    indices = (indices[0][selected_ind], indices[1][selected_ind])
-
-  if shuffle:
-    shuffled_ind = [x for x in range(len(indices[0]))]
-    random.shuffle(shuffled_ind)
-    indices = (indices[0][shuffled_ind], indices[1][shuffled_ind])
-
-  ind = (np.concatenate((np.expand_dims(indices[1], 1),
-                         np.expand_dims(indices[0], 1)), 1).astype(np.int64) if
-         transpose else np.concatenate((np.expand_dims(indices[0], 1),
-                                        np.expand_dims(indices[1], 1)),
-                                       1).astype(np.int64))
-  val = np_matrix[indices].astype(np.float32)
-  shape = (np.array([max(indices[1]) + 1, max(indices[0]) + 1]).astype(np.int64)
-           if transpose else np.array(
-               [max(indices[0]) + 1, max(indices[1]) + 1]).astype(np.int64))
-  return sparse_tensor.SparseTensor(ind, val, shape)
-
-
-def sparse_input():
-  return np_matrix_to_tf_sparse(INPUT_MATRIX)
-
-
-def count_rows(sp_input):
-  return math_ops.cast(
-      array_ops.shape(array_ops.unique(sp_input.indices[:, 0])[0])[0],
-      dtypes.float32)
-
-
-def count_cols(sp_input):
-  return math_ops.cast(
-      array_ops.shape(array_ops.unique(sp_input.indices[:, 1])[0])[0],
-      dtypes.float32)
-
-
-def calculate_loss(input_mat, row_factors, col_factors, regularization=None,
-                   w0=1., row_weights=None, col_weights=None):
-  """Calculates the loss of a given factorization.
-
-  Using a non distributed method, different than the one implemented in the
-  WALS model. The weight of an observed entry (i, j) (i.e. such that
-  input_mat[i, j] is non zero) is (w0 + row_weights[i]col_weights[j]).
-
-  Args:
-    input_mat: The input matrix, a SparseTensor of rank 2.
-    row_factors: The row factors, a dense Tensor of rank 2.
-    col_factors: The col factors, a dense Tensor of rank 2.
-    regularization: the regularization coefficient, a scalar.
-    w0: the weight of unobserved entries. A scalar.
-    row_weights: A dense tensor of rank 1.
-    col_weights: A dense tensor of rank 1.
-
-  Returns:
-    The total loss.
-  """
-  wr = (array_ops.expand_dims(row_weights, 1) if row_weights is not None
-        else constant_op.constant(1.))
-  wc = (array_ops.expand_dims(col_weights, 0) if col_weights is not None
-        else constant_op.constant(1.))
-  reg = (regularization if regularization is not None
-         else constant_op.constant(0.))
-
-  row_indices, col_indices = array_ops.split(input_mat.indices,
-                                             axis=1,
-                                             num_or_size_splits=2)
-  gathered_row_factors = array_ops.gather(row_factors, row_indices)
-  gathered_col_factors = array_ops.gather(col_factors, col_indices)
-  sp_approx_vals = array_ops.squeeze(math_ops.matmul(
-      gathered_row_factors, gathered_col_factors, adjoint_b=True))
-  sp_approx = sparse_tensor.SparseTensor(
-      indices=input_mat.indices,
-      values=sp_approx_vals,
-      dense_shape=input_mat.dense_shape)
-
-  sp_approx_sq = math_ops.square(sp_approx)
-  row_norm = math_ops.reduce_sum(math_ops.square(row_factors))
-  col_norm = math_ops.reduce_sum(math_ops.square(col_factors))
-  row_col_norm = math_ops.reduce_sum(math_ops.square(math_ops.matmul(
-      row_factors, col_factors, transpose_b=True)))
-
-  resid = sparse_ops.sparse_add(input_mat, sp_approx * (-1))
-  resid_sq = math_ops.square(resid)
-  loss = w0 * (
-      sparse_ops.sparse_reduce_sum(resid_sq) -
-      sparse_ops.sparse_reduce_sum(sp_approx_sq)
-      )
-  loss += (sparse_ops.sparse_reduce_sum(wr * (resid_sq * wc)) +
-           w0 * row_col_norm + reg * (row_norm + col_norm))
-  return loss.eval()
-
-
-def calculate_loss_from_wals_model(wals_model, sp_inputs):
-  current_rows = embedding_ops.embedding_lookup(
-      wals_model.row_factors, math_ops.range(wals_model._input_rows),
-      partition_strategy="div")
-  current_cols = embedding_ops.embedding_lookup(
-      wals_model.col_factors, math_ops.range(wals_model._input_cols),
-      partition_strategy="div")
-  row_wts = embedding_ops.embedding_lookup(
-      wals_model._row_weights, math_ops.range(wals_model._input_rows),
-      partition_strategy="div")
-  col_wts = embedding_ops.embedding_lookup(
-      wals_model._col_weights, math_ops.range(wals_model._input_cols),
-      partition_strategy="div")
-  return calculate_loss(
-      sp_inputs, current_rows, current_cols, wals_model._regularization,
-      wals_model._unobserved_weight, row_wts, col_wts)
+
+INPUT_MATRIX = factorization_ops_test_utils.INPUT_MATRIX
+np_matrix_to_tf_sparse = factorization_ops_test_utils.np_matrix_to_tf_sparse
 
 
 class WalsModelTest(test.TestCase):
 
+  def sparse_input(self):
+    return np_matrix_to_tf_sparse(INPUT_MATRIX)
+
+  def count_rows(self, sp_input):
+    return math_ops.cast(
+        array_ops.shape(array_ops.unique(sp_input.indices[:, 0])[0])[0],
+        dtypes.float32)
+
+  def count_cols(self, sp_input):
+    return math_ops.cast(
+        array_ops.shape(array_ops.unique(sp_input.indices[:, 1])[0])[0],
+        dtypes.float32)
+
+  def calculate_loss_from_wals_model(self, wals_model, sp_inputs):
+    current_rows = embedding_ops.embedding_lookup(
+        wals_model.row_factors, math_ops.range(wals_model._input_rows),
+        partition_strategy="div")
+    current_cols = embedding_ops.embedding_lookup(
+        wals_model.col_factors, math_ops.range(wals_model._input_cols),
+        partition_strategy="div")
+    row_wts = embedding_ops.embedding_lookup(
+        wals_model._row_weights, math_ops.range(wals_model._input_rows),
+        partition_strategy="div")
+    col_wts = embedding_ops.embedding_lookup(
+        wals_model._col_weights, math_ops.range(wals_model._input_cols),
+        partition_strategy="div")
+    return factorization_ops_test_utils.calculate_loss(
+        sp_inputs, current_rows, current_cols, wals_model._regularization,
+        wals_model._unobserved_weight, row_wts, col_wts)
+
   def setUp(self):
     self.col_init = [
         # shard 0
@@ -208,7 +104,7 @@ class WalsModelTest(test.TestCase):
                               use_factors_weights_cache,
                               compute_loss=False):
     with ops.Graph().as_default(), self.test_session() as sess:
-      self._wals_inputs = sparse_input()
+      self._wals_inputs = self.sparse_input()
       sp_feeder = array_ops.sparse_placeholder(dtypes.float32)
       num_rows = 5
       num_cols = 7
@@ -282,10 +178,10 @@ class WalsModelTest(test.TestCase):
       if compute_loss:
         # Test loss computation after the row update
         loss = sum(
-            sess.run(factor_loss * count_rows(inp) / num_rows,
+            sess.run(factor_loss * self.count_rows(inp) / num_rows,
                      feed_dict={sp_feeder: inp})
             for inp in input_scattered_rows)
-        true_loss = calculate_loss_from_wals_model(
+        true_loss = self.calculate_loss_from_wals_model(
             wals_model, self._wals_inputs)
         self.assertNear(
             loss, true_loss, err=.001,
@@ -355,10 +251,10 @@ class WalsModelTest(test.TestCase):
       if compute_loss:
         # Test loss computation after the column update.
         loss = sum(
-            sess.run(factor_loss * count_cols(inp) / num_cols,
+            sess.run(factor_loss * self.count_cols(inp) / num_cols,
                      feed_dict={sp_feeder: inp})
             for inp in input_scattered_cols_non_duplicate)
-        true_loss = calculate_loss_from_wals_model(
+        true_loss = self.calculate_loss_from_wals_model(
             wals_model, self._wals_inputs)
         self.assertNear(
             loss, true_loss, err=.001,
@@ -368,7 +264,7 @@ class WalsModelTest(test.TestCase):
   def _run_test_process_input_transposed(self, use_factors_weights_cache,
                                          compute_loss=False):
     with ops.Graph().as_default(), self.test_session() as sess:
-      self._wals_inputs = sparse_input()
+      self._wals_inputs = self.sparse_input()
       sp_feeder = array_ops.sparse_placeholder(dtypes.float32)
       num_rows = 5
       num_cols = 7
@@ -448,10 +344,10 @@ class WalsModelTest(test.TestCase):
       if compute_loss:
         # Test loss computation after the row update
         loss = sum(
-            sess.run(factor_loss * count_cols(inp) / num_rows,
+            sess.run(factor_loss * self.count_cols(inp) / num_rows,
                      feed_dict={sp_feeder: inp})
             for inp in input_scattered_rows_non_duplicate)
-        true_loss = calculate_loss_from_wals_model(
+        true_loss = self.calculate_loss_from_wals_model(
             wals_model, self._wals_inputs)
         self.assertNear(
             loss, true_loss, err=.001,
@@ -516,10 +412,10 @@ class WalsModelTest(test.TestCase):
       if compute_loss:
         # Test loss computation after the col update
         loss = sum(
-            sess.run(factor_loss * count_rows(inp) / num_cols,
+            sess.run(factor_loss * self.count_rows(inp) / num_cols,
                      feed_dict={sp_feeder: inp})
             for inp in input_scattered_cols_non_duplicate)
-        true_loss = calculate_loss_from_wals_model(
+        true_loss = self.calculate_loss_from_wals_model(
             wals_model, self._wals_inputs)
         self.assertNear(
             loss, true_loss, err=.001,
@@ -534,7 +430,7 @@ class WalsModelTest(test.TestCase):
   # Here we test that those two give identical results.
   def _run_test_als(self, use_factors_weights_cache):
     with ops.Graph().as_default(), self.test_session():
-      self._wals_inputs = sparse_input()
+      self._wals_inputs = self.sparse_input()
       col_init = np.random.rand(7, 3)
       als_model = factorization_ops.WALSModel(
           5,
@@ -613,7 +509,7 @@ class WalsModelTest(test.TestCase):
 
   def _run_test_als_transposed(self, use_factors_weights_cache):
     with ops.Graph().as_default(), self.test_session():
-      self._wals_inputs = sparse_input()
+      self._wals_inputs = self.sparse_input()
       col_init = np.random.rand(7, 3)
       als_model = factorization_ops.WALSModel(
           5,
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test_utils.py b/tensorflow/contrib/factorization/python/ops/factorization_ops_test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ead9474805c9ee2ac52cb757660a87d3cfcbb76e
--- /dev/null
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test_utils.py
@@ -0,0 +1,151 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utils for factorization_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
+
+
+INPUT_MATRIX = np.array(
+    [[0.1, 0.0, 0.2, 0.0, 0.4, 0.5, 0.0],
+     [0.0, 1.1, 0.0, 1.3, 1.4, 0.0, 1.6],
+     [2.0, 0.0, 0.0, 2.3, 0.0, 2.5, 0.0],
+     [3.0, 0.0, 3.2, 3.3, 0.0, 3.5, 0.0],
+     [0.0, 4.1, 0.0, 0.0, 4.4, 0.0, 4.6]]).astype(np.float32)
+
+
+def remove_empty_rows_columns(np_matrix):
+  """Simple util to remove empty rows and columns of a matrix.
+
+  Args:
+    np_matrix: A numpy array.
+  Returns:
+    A tuple consisting of:
+    mat: A numpy matrix obtained by removing empty rows and columns from
+      np_matrix.
+    nz_row_ids: A numpy array of the ids of non-empty rows, such that
+      nz_row_ids[i] is the old row index corresponding to new index i.
+    nz_col_ids: A numpy array of the ids of non-empty columns, such that
+      nz_col_ids[j] is the old column index corresponding to new index j.
+  """
+  nz_row_ids = np.where(np.sum(np_matrix, axis=1) != 0)[0]
+  nz_col_ids = np.where(np.sum(np_matrix, axis=0) != 0)[0]
+  mat = np_matrix[np.ix_(nz_row_ids, nz_col_ids)]
+  return mat, nz_row_ids, nz_col_ids
+
+
+def np_matrix_to_tf_sparse(np_matrix,
+                           row_slices=None,
+                           col_slices=None,
+                           transpose=False,
+                           shuffle=False):
+  """Simple util to slice non-zero np matrix elements as tf.SparseTensor."""
+  indices = np.nonzero(np_matrix)
+
+  # Only allow slices of whole rows or whole columns.
+  assert not (row_slices is not None and col_slices is not None)
+
+  if row_slices is not None:
+    selected_ind = np.concatenate(
+        [np.where(indices[0] == r)[0] for r in row_slices], 0)
+    indices = (indices[0][selected_ind], indices[1][selected_ind])
+
+  if col_slices is not None:
+    selected_ind = np.concatenate(
+        [np.where(indices[1] == c)[0] for c in col_slices], 0)
+    indices = (indices[0][selected_ind], indices[1][selected_ind])
+
+  if shuffle:
+    shuffled_ind = [x for x in range(len(indices[0]))]
+    random.shuffle(shuffled_ind)
+    indices = (indices[0][shuffled_ind], indices[1][shuffled_ind])
+
+  ind = (np.concatenate((np.expand_dims(indices[1], 1),
+                         np.expand_dims(indices[0], 1)), 1).astype(np.int64) if
+         transpose else np.concatenate((np.expand_dims(indices[0], 1),
+                                        np.expand_dims(indices[1], 1)),
+                                       1).astype(np.int64))
+  val = np_matrix[indices].astype(np.float32)
+  shape = (np.array([max(indices[1]) + 1, max(indices[0]) + 1]).astype(np.int64)
+           if transpose else np.array(
+               [max(indices[0]) + 1, max(indices[1]) + 1]).astype(np.int64))
+  return sparse_tensor.SparseTensor(ind, val, shape)
+
+
+def calculate_loss(input_mat, row_factors, col_factors, regularization=None,
+                   w0=1., row_weights=None, col_weights=None):
+  """Calculates the loss of a given factorization.
+
+  Using a non distributed method, different than the one implemented in the
+  WALS model. The weight of an observed entry (i, j) (i.e. such that
+  input_mat[i, j] is non zero) is (w0 + row_weights[i]col_weights[j]).
+
+  Args:
+    input_mat: The input matrix, a SparseTensor of rank 2.
+    row_factors: The row factors, a dense Tensor of rank 2.
+    col_factors: The col factors, a dense Tensor of rank 2.
+    regularization: the regularization coefficient, a scalar.
+    w0: the weight of unobserved entries. A scalar.
+    row_weights: A dense tensor of rank 1.
+    col_weights: A dense tensor of rank 1.
+
+  Returns:
+    The total loss.
+  """
+  wr = (array_ops.expand_dims(row_weights, 1) if row_weights is not None
+        else constant_op.constant(1.))
+  wc = (array_ops.expand_dims(col_weights, 0) if col_weights is not None
+        else constant_op.constant(1.))
+  reg = (regularization if regularization is not None
+         else constant_op.constant(0.))
+
+  row_indices, col_indices = array_ops.split(input_mat.indices,
+                                             axis=1,
+                                             num_or_size_splits=2)
+  gathered_row_factors = array_ops.gather(row_factors, row_indices)
+  gathered_col_factors = array_ops.gather(col_factors, col_indices)
+  sp_approx_vals = array_ops.squeeze(math_ops.matmul(
+      gathered_row_factors, gathered_col_factors, adjoint_b=True))
+  sp_approx = sparse_tensor.SparseTensor(
+      indices=input_mat.indices,
+      values=sp_approx_vals,
+      dense_shape=input_mat.dense_shape)
+
+  sp_approx_sq = math_ops.square(sp_approx)
+  row_norm = math_ops.reduce_sum(math_ops.square(row_factors))
+  col_norm = math_ops.reduce_sum(math_ops.square(col_factors))
+  row_col_norm = math_ops.reduce_sum(math_ops.square(math_ops.matmul(
+      row_factors, col_factors, transpose_b=True)))
+
+  resid = sparse_ops.sparse_add(input_mat, sp_approx * (-1))
+  resid_sq = math_ops.square(resid)
+  loss = w0 * (
+      sparse_ops.sparse_reduce_sum(resid_sq) -
+      sparse_ops.sparse_reduce_sum(sp_approx_sq)
+      )
+  loss += (sparse_ops.sparse_reduce_sum(wr * (resid_sq * wc)) +
+           w0 * row_col_norm + reg * (row_norm + col_norm))
+  return loss.eval()
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index 8d78067b9a50e73cc1e8bd8011b99de53c45843f..b092eab316664705a455b88a524a77917f141b37 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -33,7 +33,7 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.embedding_ops import embedding_lookup
 from tensorflow.python.summary import summary
 
@@ -85,7 +85,7 @@ def _init_clusters_random(data, num_clusters, random_seed):
         maxval=math_ops.cast(num_data, dtypes.int64),
         seed=random_seed,
         dtype=dtypes.int64)
-  indices = math_ops.cast(indices, dtypes.int32) % num_data
+  indices %= math_ops.cast(num_data, dtypes.int64)
   clusters_init = embedding_lookup(data, indices, partition_strategy='div')
   return clusters_init
 
@@ -161,14 +161,14 @@ class GmmAlgorithm(object):
     first_shard = data[0]
     # Initialize means: num_classes X 1 X dimensions.
     if initial_means is not None:
-      self._means = variables.Variable(
+      self._means = variable_scope.variable(
           array_ops.expand_dims(initial_means, 1),
           name=self.CLUSTERS_VARIABLE,
           validate_shape=False,
           dtype=dtypes.float32)
     else:
       # Sample data randomly
-      self._means = variables.Variable(
+      self._means = variable_scope.variable(
           array_ops.expand_dims(
               _init_clusters_random(data, self._num_classes, self._random_seed),
               1),
@@ -187,11 +187,11 @@ class GmmAlgorithm(object):
       covs = array_ops.tile(
           array_ops.expand_dims(array_ops.diag_part(cov), 0),
           [self._num_classes, 1])
-    self._covs = variables.Variable(
+    self._covs = variable_scope.variable(
         covs, name=self.CLUSTERS_COVS_VARIABLE, validate_shape=False)
     # Mixture weights, representing the probability that a randomly
     # selected unobservable data (in EM terms) was generated by component k.
-    self._alpha = variables.Variable(
+    self._alpha = variable_scope.variable(
         array_ops.tile([1.0 / self._num_classes], [self._num_classes]),
         name=self.CLUSTERS_WEIGHT,
         validate_shape=False)
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index 3fd2cbbec2b541475c0660da83c06868997c30d2..2f7bf480415ddccab33b89210e2245c00f413093 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.factorization.python.ops import factorization_ops
 from tensorflow.contrib.framework.python.ops import variables as framework_variables
+from tensorflow.contrib.learn.python.learn.estimators import estimator
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -26,7 +29,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import session_run_hook
 
@@ -67,6 +70,7 @@ class _SweepHook(session_run_hook.SessionRunHook):
         order. These are typically local initialization ops (such as cache
         initialization).
     """
+    # TODO(walidk): Provide a counter for the number of completed sweeps.
     self._num_rows = num_rows
     self._num_cols = num_cols
     self._row_prep_ops = row_prep_ops
@@ -122,14 +126,14 @@ class _SweepHook(session_run_hook.SessionRunHook):
     """
     processed_rows_init = array_ops.fill(dims=[self._num_rows], value=False)
     with ops.colocate_with(processed_rows_init):
-      processed_rows = variables.Variable(
+      processed_rows = variable_scope.variable(
           processed_rows_init,
           collections=[ops.GraphKeys.GLOBAL_VARIABLES],
           trainable=False,
           name="sweep_hook_processed_rows")
     processed_cols_init = array_ops.fill(dims=[self._num_cols], value=False)
     with ops.colocate_with(processed_cols_init):
-      processed_cols = variables.Variable(
+      processed_cols = variable_scope.variable(
           processed_cols_init,
           collections=[ops.GraphKeys.GLOBAL_VARIABLES],
           trainable=False,
@@ -221,3 +225,325 @@ class _SweepHook(session_run_hook.SessionRunHook):
     self._is_sweep_done = run_values.results[0]
     logging.info("Partial fit done.")
 
+
+def _wals_factorization_model_function(features, labels, mode, params):
+  """Model function for the WALSFactorization estimator.
+
+  Args:
+    features: Dictionary of features. See WALSMatrixFactorization.
+    labels: Must be None.
+    mode: A model_fn.ModeKeys object.
+    params: Dictionary of parameters containing arguments passed to the
+      WALSMatrixFactorization constructor.
+
+  Returns:
+    A ModelFnOps object.
+  """
+  assert labels is None
+  use_factors_weights_cache = (
+      params["use_factors_weights_cache_for_training"]
+      and mode == model_fn.ModeKeys.TRAIN)
+  use_gramian_cache = (
+      params["use_gramian_cache_for_training"]
+      and mode == model_fn.ModeKeys.TRAIN)
+  model = factorization_ops.WALSModel(
+      params["num_rows"],
+      params["num_cols"],
+      params["embedding_dimension"],
+      unobserved_weight=params["unobserved_weight"],
+      regularization=params["regularization_coeff"],
+      row_init=params["row_init"],
+      col_init=params["col_init"],
+      num_row_shards=params["num_row_shards"],
+      num_col_shards=params["num_col_shards"],
+      row_weights=params["row_weights"],
+      col_weights=params["col_weights"],
+      use_factors_weights_cache=use_factors_weights_cache,
+      use_gramian_cache=use_gramian_cache)
+
+  # Get input rows and cols. We either update rows or columns depending on
+  # the value of row_sweep, which is maintained using a session hook
+  input_rows = features[WALSMatrixFactorization.INPUT_ROWS]
+  input_cols = features[WALSMatrixFactorization.INPUT_COLS]
+  input_row_indices, _ = array_ops.unique(input_rows.indices[:, 0])
+  input_col_indices, _ = array_ops.unique(input_cols.indices[:, 0])
+
+  # Train ops, controlled using the SweepHook
+  # We need to run the following ops:
+  # Before a row sweep:
+  #   row_update_prep_gramian_op
+  #   initialize_row_update_op
+  # During a row sweep:
+  #   update_row_factors_op
+  # Before a col sweep:
+  #   col_update_prep_gramian_op
+  #   initialize_col_update_op
+  # During a col sweep:
+  #   update_col_factors_op
+
+  is_row_sweep_var = variable_scope.variable(
+      True, "is_row_sweep",
+      collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+  # The row sweep is determined by is_row_sweep_var (controlled by the
+  # sweep_hook) in TRAIN mode, and manually in EVAL mode.
+  is_row_sweep = (features[WALSMatrixFactorization.PROJECT_ROW]
+                  if mode == model_fn.ModeKeys.EVAL else is_row_sweep_var)
+
+  def update_row_factors():
+    return model.update_row_factors(sp_input=input_rows, transpose_input=False)
+  def update_col_factors():
+    return model.update_col_factors(sp_input=input_cols, transpose_input=True)
+  _, train_op, loss = control_flow_ops.cond(
+      is_row_sweep, update_row_factors, update_col_factors)
+
+  row_prep_ops = [model.row_update_prep_gramian_op,
+                  model.initialize_row_update_op]
+  col_prep_ops = [model.col_update_prep_gramian_op,
+                  model.initialize_col_update_op]
+  cache_init_ops = [model.worker_init]
+
+  sweep_hook = _SweepHook(
+      is_row_sweep_var,
+      train_op,
+      params["num_rows"],
+      params["num_cols"],
+      input_row_indices,
+      input_col_indices,
+      row_prep_ops,
+      col_prep_ops,
+      cache_init_ops,
+  )
+
+  # Prediction ops (only return predictions in INFER mode)
+  predictions = {}
+  if mode == model_fn.ModeKeys.INFER:
+    project_row = features[WALSMatrixFactorization.PROJECT_ROW]
+    projection_weights = features.get(
+        WALSMatrixFactorization.PROJECTION_WEIGHTS)
+    def get_row_projection():
+      return model.project_row_factors(
+          sp_input=input_rows,
+          projection_weights=projection_weights,
+          transpose_input=False)
+    def get_col_projection():
+      return model.project_col_factors(
+          sp_input=input_cols,
+          projection_weights=projection_weights,
+          transpose_input=True)
+
+    predictions[WALSMatrixFactorization.PROJECTION_RESULT] = (
+        control_flow_ops.cond(
+            project_row, get_row_projection, get_col_projection))
+
+  return model_fn.ModelFnOps(
+      mode=mode,
+      predictions=predictions,
+      loss=loss,
+      eval_metric_ops={},
+      train_op=train_op,
+      training_hooks=[sweep_hook])
+
+
+class WALSMatrixFactorization(estimator.Estimator):
+  """An Estimator for Weighted Matrix Factorization, using the WALS method.
+
+  WALS (Weighted Alternating Least Squares) is an algorithm for weighted matrix
+  factorization. It computes a low-rank approximation of a given sparse (n x m)
+  matrix A, by a product of two matrices, U * V^T, where U is a (n x k) matrix
+  and V is a (m x k) matrix. Here k is the rank of the approximation, also
+  called the embedding dimension. We refer to U as the row factors, and V as the
+  column factors.
+  See tensorflow/contrib/factorization/g3doc/wals.md for the precise problem
+  formulation.
+
+  The training proceeds in sweeps: during a row_sweep, we fix V and solve for U.
+  During a column sweep, we fix U and solve for V. Each one of these problems is
+  an unconstrained quadratic minimization problem and can be solved exactly (it
+  can also be solved in mini-batches, since the solution decouples nicely).
+  The alternating between sweeps is achieved by using a hook during training,
+  which is responsible for keeping track of the sweeps and running preparation
+  ops at the beginning of each sweep. It also updates the global_step variable,
+  which keeps track of the number of batches processed since the beginning of
+  training.
+  The current implementation assumes that the training is run on a single
+  machine, and will fail if config.num_worker_replicas is not equal to one.
+  Training is done by calling self.fit(input_fn=input_fn), where input_fn
+  provides two tensors: one for rows of the input matrix, and one for rows of
+  the transposed input matrix (i.e. columns of the original matrix). Note that
+  during a row sweep, only row batches are processed (ignoring column batches)
+  and vice-versa.
+  Also note that every row (respectively every column) of the input matrix
+  must be processed at least once for the sweep to be considered complete. In
+  particular, training will not make progress if input_fn does not generate some
+  rows.
+
+  For prediction, given a new set of input rows A' (e.g. new rows of the A
+  matrix), we compute a corresponding set of row factors U', such that U' * V^T
+  is a good approximation of A'. We call this operation a row projection. A
+  similar operation is defined for columns.
+  Projection is done by calling self.get_projections(input_fn=input_fn), where
+  input_fn satisfies the constraints given below.
+
+  The input functions must satisfy the following constraints: Calling input_fn
+  must return a tuple (features, labels) where labels is None, and features is
+  a dict containing the following keys:
+  TRAIN:
+    - WALSMatrixFactorization.INPUT_ROWS: float32 SparseTensor (matrix).
+      Rows of the input matrix to process (or to project).
+    - WALSMatrixFactorization.INPUT_COLS: float32 SparseTensor (matrix).
+      Columns of the input matrix to process (or to project), transposed.
+  INFER:
+    - WALSMatrixFactorization.INPUT_ROWS: float32 SparseTensor (matrix).
+      Rows to project.
+    - WALSMatrixFactorization.INPUT_COLS: float32 SparseTensor (matrix).
+      Columns to project.
+    - WALSMatrixFactorization.PROJECT_ROW: Boolean Tensor. Whether to project
+      the rows or columns.
+    - WALSMatrixFactorization.PROJECTION_WEIGHTS (Optional): float32 Tensor
+      (vector). The weights to use in the projection.
+  EVAL:
+    - WALSMatrixFactorization.INPUT_ROWS: float32 SparseTensor (matrix).
+      Rows to project.
+    - WALSMatrixFactorization.INPUT_COLS: float32 SparseTensor (matrix).
+      Columns to project.
+    - WALSMatrixFactorization.PROJECT_ROW: Boolean Tensor. Whether to project
+      the rows or columns.
+  """
+  # Keys to be used in model_fn
+  # Features keys
+  INPUT_ROWS = "input_rows"
+  INPUT_COLS = "input_cols"
+  PROJECT_ROW = "project_row"
+  PROJECTION_WEIGHTS = "projection_weights"
+  # Predictions key
+  PROJECTION_RESULT = "projection"
+
+  def __init__(self,
+               num_rows,
+               num_cols,
+               embedding_dimension,
+               unobserved_weight=0.1,
+               regularization_coeff=None,
+               row_init="random",
+               col_init="random",
+               num_row_shards=1,
+               num_col_shards=1,
+               row_weights=1,
+               col_weights=1,
+               use_factors_weights_cache_for_training=True,
+               use_gramian_cache_for_training=True,
+               model_dir=None,
+               config=None):
+    """Creates a model for matrix factorization using the WALS method.
+
+    Args:
+      num_rows: Total number of rows for input matrix.
+      num_cols: Total number of cols for input matrix.
+      embedding_dimension: Dimension to use for the factors.
+      unobserved_weight: Weight of the unobserved entries of matrix.
+      regularization_coeff: Weight of the L2 regularization term. Defaults to
+        None, in which case the problem is not regularized.
+      row_init: Initializer for row factor. Must be either:
+        - A tensor: The row factor matrix is initialized to this tensor,
+        - A numpy constant,
+        - "random": The rows are initialized using a normal distribution.
+      col_init: Initializer for column factor. See row_init.
+      num_row_shards: Number of shards to use for the row factors.
+      num_col_shards: Number of shards to use for the column factors.
+      row_weights: Must be in one of the following three formats:
+        - None: In this case, the weight of every entry is the unobserved_weight
+          and the problem simplifies to ALS. Note that, in this case,
+          col_weights must also be set to "None".
+        - List of lists of non-negative scalars, of the form
+          [[w_0, w_1, ...], [w_k, ... ], [...]],
+          where the number of inner lists equal to the number of row factor
+          shards and the elements in each inner list are the weights for the
+          rows of that shard. In this case,
+          w_ij = unonbserved_weight + row_weights[i] * col_weights[j].
+        - A non-negative scalar: This value is used for all row weights.
+          Note that it is allowed to have row_weights as a list and col_weights
+          as a scalar, or vice-versa.
+      col_weights: See row_weights.
+      use_factors_weights_cache_for_training: Boolean, whether the factors and
+        weights will be cached on the workers before the updates start, during
+        training. Defaults to True.
+        Note that caching is disabled during prediction.
+      use_gramian_cache_for_training: Boolean, whether the Gramians will be
+        cached on the workers before the updates start, during training.
+        Defaults to True. Note that caching is disabled during prediction.
+      model_dir: The directory to save the model results and log files.
+      config: A Configuration object. See Estimator.
+
+    Raises:
+      ValueError: If config.num_worker_replicas is strictly greater than one.
+        The current implementation only supports running on a single worker.
+    """
+    # TODO(walidk): Support power-law based weight computation.
+    # TODO(walidk): Add factor lookup by indices, with caching.
+    # TODO(walidk): Support caching during prediction.
+    # TODO(walidk): Provide input pipelines that handle missing rows.
+
+    params = {
+        "num_rows": num_rows,
+        "num_cols": num_cols,
+        "embedding_dimension": embedding_dimension,
+        "unobserved_weight": unobserved_weight,
+        "regularization_coeff": regularization_coeff,
+        "row_init": row_init,
+        "col_init": col_init,
+        "num_row_shards": num_row_shards,
+        "num_col_shards": num_col_shards,
+        "row_weights": row_weights,
+        "col_weights": col_weights,
+        "use_factors_weights_cache_for_training":
+            use_factors_weights_cache_for_training,
+        "use_gramian_cache_for_training": use_gramian_cache_for_training
+    }
+    self._row_factors_names = ["row_factors_shard_%d" % i
+                               for i in range(num_row_shards)]
+    self._col_factors_names = ["col_factors_shard_%d" % i
+                               for i in range(num_col_shards)]
+
+    super(WALSMatrixFactorization, self).__init__(
+        model_fn=_wals_factorization_model_function,
+        params=params,
+        model_dir=model_dir,
+        config=config)
+
+    if self._config is not None and self._config.num_worker_replicas > 1:
+      raise ValueError("WALSMatrixFactorization must be run on a single worker "
+                       "replica.")
+
+  def get_row_factors(self):
+    """Returns the row factors of the model, loading them from checkpoint.
+
+    Should only be run after training.
+
+    Returns:
+      A list of the row factors of the model.
+    """
+    return [self.get_variable_value(name) for name in self._row_factors_names]
+
+  def get_col_factors(self):
+    """Returns the column factors of the model, loading them from checkpoint.
+
+    Should only be run after training.
+
+    Returns:
+      A list of the column factors of the model.
+    """
+    return [self.get_variable_value(name) for name in self._col_factors_names]
+
+  def get_projections(self, input_fn):
+    """Computes the projections of the rows or columns given in input_fn.
+
+    Runs predict() with the given input_fn, and returns the results. Should only
+    be run after training.
+
+    Args:
+      input_fn: Input function which specifies the rows or columns to project.
+    Returns:
+      A generator of the projected factors.
+    """
+    return (result[WALSMatrixFactorization.PROJECTION_RESULT]
+            for result in self.predict(input_fn=input_fn))
diff --git a/tensorflow/contrib/factorization/python/ops/wals_test.py b/tensorflow/contrib/factorization/python/ops/wals_test.py
index 2ae2d3ab058163879bd98c8e288dd0b19d3e5bab..323b89a5cd7f1e6c4697aaec6cfae7020e516540 100644
--- a/tensorflow/contrib/factorization/python/ops/wals_test.py
+++ b/tensorflow/contrib/factorization/python/ops/wals_test.py
@@ -18,16 +18,392 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+import json
+import numpy as np
+
+from tensorflow.contrib.factorization.python.ops import factorization_ops_test_utils
 from tensorflow.contrib.factorization.python.ops import wals as wals_lib
+from tensorflow.contrib.learn.python.learn import run_config
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import session_run_hook
 
 
+class WALSMatrixFactorizationTest(test.TestCase):
+  INPUT_MATRIX = factorization_ops_test_utils.INPUT_MATRIX
+
+  def np_array_to_sparse(self, np_array):
+    """Transforms an np.array to a tf.SparseTensor."""
+    return factorization_ops_test_utils.np_matrix_to_tf_sparse(np_array)
+
+  def calculate_loss(self):
+    """Calculates the loss of the current (trained) model."""
+    current_rows = embedding_ops.embedding_lookup(
+        self._model.get_row_factors(), math_ops.range(self._num_rows),
+        partition_strategy='div')
+    current_cols = embedding_ops.embedding_lookup(
+        self._model.get_col_factors(), math_ops.range(self._num_cols),
+        partition_strategy='div')
+    row_wts = embedding_ops.embedding_lookup(
+        self._row_weights, math_ops.range(self._num_rows),
+        partition_strategy='div')
+    col_wts = embedding_ops.embedding_lookup(
+        self._col_weights, math_ops.range(self._num_cols),
+        partition_strategy='div')
+    sp_inputs = self.np_array_to_sparse(self.INPUT_MATRIX)
+    return factorization_ops_test_utils.calculate_loss(
+        sp_inputs, current_rows, current_cols, self._regularization_coeff,
+        self._unobserved_weight, row_wts, col_wts)
+
+  # TODO(walidk): Replace with input_reader_utils functions once open sourced.
+  def remap_sparse_tensor_rows(self, sp_x, row_ids, shape):
+    """Remaps the row ids of a tf.SparseTensor."""
+    old_row_ids, old_col_ids = array_ops.split(
+        value=sp_x.indices, num_or_size_splits=2, axis=1)
+    new_row_ids = array_ops.gather(row_ids, old_row_ids)
+    new_indices = array_ops.concat([new_row_ids, old_col_ids], 1)
+    return sparse_tensor.SparseTensor(
+        indices=new_indices, values=sp_x.values, dense_shape=shape)
+
+  # TODO(walidk): Add an option to shuffle inputs.
+  def input_fn(self, np_matrix, batch_size, mode,
+               project_row=None, projection_weights=None,
+               remove_empty_rows_columns=False):
+    """Returns an input_fn that selects row and col batches from np_matrix.
+
+    This simple utility creates an input function from a numpy_array. The
+    following transformations are performed:
+    * The empty rows and columns in np_matrix are removed (if
+      remove_empty_rows_columns is true)
+    * np_matrix is converted to a SparseTensor.
+    * The rows of the sparse matrix (and the rows of its transpose) are batched.
+    * A features dictionary is created, which contains the row / column batches.
+
+    In TRAIN mode, one only needs to specify the np_matrix and the batch_size.
+    In INFER and EVAL modes, one must also provide project_row, a boolean which
+    specifies whether we are projecting rows or columns.
+
+    Args:
+      np_matrix: A numpy array. The input matrix to use.
+      batch_size: Integer.
+      mode: Can be one of model_fn.ModeKeys.{TRAIN, INFER, EVAL}.
+      project_row: A boolean. Used in INFER and EVAL modes. Specifies whether
+        to project rows or columns.
+      projection_weights: A float numpy array. Used in INFER mode. Specifies
+        the weights to use in the projection (the weights are optional, and
+        default to 1.).
+      remove_empty_rows_columns: A boolean. When true, this will remove empty
+        rows and columns in the np_matrix. Note that this will result in
+        modifying the indices of the input matrix. The mapping from new indices
+        to old indices is returned in the form of two numpy arrays.
+
+    Returns:
+      A tuple consisting of:
+      _fn: A callable. Calling _fn returns a features dict.
+      nz_row_ids: A numpy array of the ids of non-empty rows, such that
+        nz_row_ids[i] is the old row index corresponding to new index i.
+      nz_col_ids: A numpy array of the ids of non-empty columns, such that
+        nz_col_ids[j] is the old column index corresponding to new index j.
+    """
+    if remove_empty_rows_columns:
+      np_matrix, nz_row_ids, nz_col_ids = (
+          factorization_ops_test_utils.remove_empty_rows_columns(np_matrix))
+    else:
+      nz_row_ids = np.arange(np.shape(np_matrix)[0])
+      nz_col_ids = np.arange(np.shape(np_matrix)[1])
+
+    def extract_features(row_batch, col_batch, shape):
+      row_ids = row_batch[0]
+      col_ids = col_batch[0]
+      rows = self.remap_sparse_tensor_rows(row_batch[1], row_ids, shape)
+      cols = self.remap_sparse_tensor_rows(col_batch[1], col_ids, shape)
+      features = {
+          wals_lib.WALSMatrixFactorization.INPUT_ROWS: rows,
+          wals_lib.WALSMatrixFactorization.INPUT_COLS: cols,
+      }
+      return features
+
+    def _fn():
+      num_rows = np.shape(np_matrix)[0]
+      num_cols = np.shape(np_matrix)[1]
+      row_ids = math_ops.range(num_rows, dtype=dtypes.int64)
+      col_ids = math_ops.range(num_cols, dtype=dtypes.int64)
+      sp_mat = self.np_array_to_sparse(np_matrix)
+      sp_mat_t = sparse_ops.sparse_transpose(sp_mat)
+      row_batch = input_lib.batch(
+          [row_ids, sp_mat],
+          batch_size=min(batch_size, num_rows),
+          capacity=10,
+          enqueue_many=True)
+      col_batch = input_lib.batch(
+          [col_ids, sp_mat_t],
+          batch_size=min(batch_size, num_cols),
+          capacity=10,
+          enqueue_many=True)
+
+      features = extract_features(row_batch, col_batch, sp_mat.dense_shape)
+
+      if mode == model_fn.ModeKeys.INFER or mode == model_fn.ModeKeys.EVAL:
+        self.assertTrue(
+            project_row is not None,
+            msg='project_row must be specified in INFER or EVAL mode.')
+        features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
+            constant_op.constant(project_row))
+
+      if mode == model_fn.ModeKeys.INFER and projection_weights is not None:
+        weights_batch = input_lib.batch(
+            projection_weights,
+            batch_size=batch_size,
+            capacity=10,
+            enqueue_many=True)
+        features[wals_lib.WALSMatrixFactorization.PROJECTION_WEIGHTS] = (
+            weights_batch)
+
+      labels = None
+      return features, labels
+
+    return _fn, nz_row_ids, nz_col_ids
+
+  @property
+  def input_matrix(self):
+    return self.INPUT_MATRIX
+
+  @property
+  def row_steps(self):
+    return np.ceil(self._num_rows / self.batch_size)
+
+  @property
+  def col_steps(self):
+    return np.ceil(self._num_cols / self.batch_size)
+
+  @property
+  def batch_size(self):
+    return 2
+
+  @property
+  def use_cache(self):
+    return False
+
+  def setUp(self):
+    self._num_rows = 5
+    self._num_cols = 7
+    self._embedding_dimension = 3
+    self._unobserved_weight = 0.1
+    self._num_row_shards = 2
+    self._num_col_shards = 3
+    self._regularization_coeff = 0.01
+    self._col_init = [
+        # Shard 0.
+        [[-0.36444709, -0.39077035, -0.32528427],
+         [1.19056475, 0.07231052, 2.11834812],
+         [0.93468881, -0.71099287, 1.91826844]],
+        # Shard 1.
+        [[1.18160152, 1.52490723, -0.50015002],
+         [1.82574749, -0.57515913, -1.32810032]],
+        # Shard 2.
+        [[-0.15515432, -0.84675711, 0.13097958],
+         [-0.9246484, 0.69117504, 1.2036494]],
+    ]
+    self._row_weights = [[0.1, 0.2, 0.3], [0.4, 0.5]]
+    self._col_weights = [[0.1, 0.2, 0.3], [0.4, 0.5], [0.6, 0.7]]
+
+    # Values of row and column factors after running one iteration or factor
+    # updates.
+    self._row_factors_0 = [[0.097689, -0.219293, -0.020780],
+                           [0.50842, 0.64626, 0.22364],
+                           [0.401159, -0.046558, -0.192854]]
+    self._row_factors_1 = [[1.20597, -0.48025, 0.35582],
+                           [1.5564, 1.2528, 1.0528]]
+    self._col_factors_0 = [[2.4725, -1.2950, -1.9980],
+                           [0.44625, 1.50771, 1.27118],
+                           [1.39801, -2.10134, 0.73572]]
+    self._col_factors_1 = [[3.36509, -0.66595, -3.51208],
+                           [0.57191, 1.59407, 1.33020]]
+    self._col_factors_2 = [[3.3459, -1.3341, -3.3008],
+                           [0.57366, 1.83729, 1.26798]]
+    self._model = wals_lib.WALSMatrixFactorization(
+        self._num_rows,
+        self._num_cols,
+        self._embedding_dimension,
+        self._unobserved_weight,
+        col_init=self._col_init,
+        regularization_coeff=self._regularization_coeff,
+        num_row_shards=self._num_row_shards,
+        num_col_shards=self._num_col_shards,
+        row_weights=self._row_weights,
+        col_weights=self._col_weights,
+        use_factors_weights_cache_for_training=self.use_cache,
+        use_gramian_cache_for_training=self.use_cache)
+
+  def test_fit(self):
+    # Row sweep.
+    input_fn = self.input_fn(np_matrix=self.input_matrix,
+                             batch_size=self.batch_size,
+                             mode=model_fn.ModeKeys.TRAIN,
+                             remove_empty_rows_columns=True)[0]
+    self._model.fit(input_fn=input_fn, steps=self.row_steps)
+    row_factors = self._model.get_row_factors()
+    self.assertAllClose(row_factors[0], self._row_factors_0, atol=1e-3)
+    self.assertAllClose(row_factors[1], self._row_factors_1, atol=1e-3)
+
+    # Col sweep.
+    # Running fit a second time will resume training from the checkpoint.
+    input_fn = self.input_fn(np_matrix=self.input_matrix,
+                             batch_size=self.batch_size,
+                             mode=model_fn.ModeKeys.TRAIN,
+                             remove_empty_rows_columns=True)[0]
+    self._model.fit(input_fn=input_fn, steps=self.col_steps)
+    col_factors = self._model.get_col_factors()
+    self.assertAllClose(col_factors[0], self._col_factors_0, atol=1e-3)
+    self.assertAllClose(col_factors[1], self._col_factors_1, atol=1e-3)
+    self.assertAllClose(col_factors[2], self._col_factors_2, atol=1e-3)
+
+  def test_predict(self):
+    input_fn = self.input_fn(np_matrix=self.input_matrix,
+                             batch_size=self.batch_size,
+                             mode=model_fn.ModeKeys.TRAIN,
+                             remove_empty_rows_columns=True,
+                            )[0]
+    # Project rows 1 and 4 from the input matrix.
+    proj_input_fn = self.input_fn(
+        np_matrix=self.INPUT_MATRIX[[1, 4], :],
+        batch_size=2,
+        mode=model_fn.ModeKeys.INFER,
+        project_row=True,
+        projection_weights=[[0.2, 0.5]])[0]
+
+    self._model.fit(input_fn=input_fn, steps=self.row_steps)
+    projections = self._model.get_projections(proj_input_fn)
+    projected_rows = list(itertools.islice(projections, 2))
+
+    self.assertAllClose(
+        projected_rows,
+        [self._row_factors_0[1], self._row_factors_1[1]],
+        atol=1e-3)
+
+    # Project columns 5, 3, 1 from the input matrix.
+    proj_input_fn = self.input_fn(
+        np_matrix=self.INPUT_MATRIX[:, [5, 3, 1]],
+        batch_size=3,
+        mode=model_fn.ModeKeys.INFER,
+        project_row=False,
+        projection_weights=[[0.6, 0.4, 0.2]])[0]
+
+    self._model.fit(input_fn=input_fn, steps=self.col_steps)
+    projections = self._model.get_projections(proj_input_fn)
+    projected_cols = list(itertools.islice(projections, 3))
+    self.assertAllClose(
+        projected_cols,
+        [self._col_factors_2[0], self._col_factors_1[0],
+         self._col_factors_0[1]],
+        atol=1e-3)
+
+  def test_eval(self):
+    # Do a row sweep then evaluate the model on row inputs.
+    # The evaluate function returns the loss of the projected rows, but since
+    # projection is idempotent, the eval loss must match the model loss.
+    input_fn = self.input_fn(np_matrix=self.input_matrix,
+                             batch_size=self.batch_size,
+                             mode=model_fn.ModeKeys.TRAIN,
+                             remove_empty_rows_columns=True,
+                            )[0]
+    self._model.fit(input_fn=input_fn, steps=self.row_steps)
+    eval_input_fn_row = self.input_fn(np_matrix=self.input_matrix,
+                                      batch_size=1,
+                                      mode=model_fn.ModeKeys.EVAL,
+                                      project_row=True,
+                                      remove_empty_rows_columns=True)[0]
+    loss = self._model.evaluate(
+        input_fn=eval_input_fn_row, steps=self._num_rows)['loss']
+
+    with self.test_session():
+      true_loss = self.calculate_loss()
+
+    self.assertNear(
+        loss, true_loss, err=.001,
+        msg="""After row update, eval loss = {}, does not match the true
+        loss = {}.""".format(loss, true_loss))
+
+    # Do a col sweep then evaluate the model on col inputs.
+    self._model.fit(input_fn=input_fn, steps=self.col_steps)
+    eval_input_fn_col = self.input_fn(np_matrix=self.input_matrix,
+                                      batch_size=1,
+                                      mode=model_fn.ModeKeys.EVAL,
+                                      project_row=False,
+                                      remove_empty_rows_columns=True)[0]
+    loss = self._model.evaluate(
+        input_fn=eval_input_fn_col, steps=self._num_cols)['loss']
+
+    with self.test_session():
+      true_loss = self.calculate_loss()
+
+    self.assertNear(
+        loss, true_loss, err=.001,
+        msg="""After row update, eval loss = {}, does not match the true
+        loss = {}.""".format(loss, true_loss))
+
+
+class WALSMatrixFactorizationTestCached(WALSMatrixFactorizationTest):
+
+  @property
+  def use_cache(self):
+    return True
+
+
+class WALSMatrixFactorizationTestFullBatch(WALSMatrixFactorizationTest):
+
+  @property
+  def batch_size(self):
+    return 100
+
+
+class WALSMatrixFactorizaiontTestPaddedInput(WALSMatrixFactorizationTest):
+  PADDED_INPUT_MATRIX = np.pad(
+      WALSMatrixFactorizationTest.INPUT_MATRIX,
+      [(1, 0), (1, 0)], mode='constant')
+
+  @property
+  def input_matrix(self):
+    return self.PADDED_INPUT_MATRIX
+
+
+class WALSMatrixFactorizationUnsupportedTest(test.TestCase):
+
+  def setUp(self):
+    pass
+
+  def testDistributedWALSUnsupported(self):
+    tf_config = {
+        'cluster': {
+            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
+            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4']
+        },
+        'task': {
+            'type': run_config_lib.TaskType.WORKER,
+            'index': 1
+        }
+    }
+    with test.mock.patch.dict('os.environ',
+                              {'TF_CONFIG': json.dumps(tf_config)}):
+      config = run_config.RunConfig()
+    self.assertEqual(config.num_worker_replicas, 2)
+    with self.assertRaises(ValueError):
+      self._model = wals_lib.WALSMatrixFactorization(1, 1, 1, config=config)
+
+
 class SweepHookTest(test.TestCase):
 
   def setUp(self):
@@ -45,7 +421,7 @@ class SweepHookTest(test.TestCase):
 
   def run_hook_with_indices(self, sweep_hook, row_indices, col_indices):
     with self.test_session() as sess:
-      # Before run
+      # Before run.
       run_context = session_run_hook.SessionRunContext(
           original_args=None, session=sess)
       sess_run_args = sweep_hook.before_run(run_context)
@@ -53,11 +429,11 @@ class SweepHookTest(test.TestCase):
           self._input_row_indices_ph: row_indices,
           self._input_col_indices_ph: col_indices
       }
-      # Run
+      # Run.
       run_results = sess.run(sess_run_args.fetches, feed_dict=feed_dict)
       run_values = session_run_hook.SessionRunValues(
           results=run_results, options=None, run_metadata=None)
-      # After run
+      # After run.
       sweep_hook.after_run(run_context, run_values)
 
   def test_row_sweep(self):
@@ -74,9 +450,9 @@ class SweepHookTest(test.TestCase):
           self._col_prep_ops,
           self._init_ops)
 
-      # Initialize variables
+      # Initialize variables.
       sess.run([variables.global_variables_initializer()])
-      # Row sweep
+      # Row sweep.
       self.run_hook_with_indices(sweep_hook, [], [])
       self.assertTrue(sess.run(self._init_done),
                       msg='init ops not run by the sweep_hook')
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index a758bb92aaa6177af524d37f4f34dde43a3ddd16..a4dd3a642fdfec1aeca7b82d30ccb7b291d4bc39 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -70,8 +70,7 @@ bool IsBinaryInstalled(const string& binary_name) {
     const string binary_path = io::JoinPath(dir, binary_name);
     char absolute_path[PATH_MAX + 1];
     if (::realpath(binary_path.c_str(), absolute_path) == NULL) {
-      LOG(ERROR) << "Invalid binary path: " << binary_path;
-      return false;
+      continue;
     }
     struct stat statinfo;
     int result = ::stat(absolute_path, &statinfo);
@@ -142,7 +141,7 @@ template <typename UInt>
 string LittleEndianData(UInt data) {
   static_assert(std::is_unsigned<UInt>::value, "UInt must be unsigned");
   string str;
-  for (int i = 0; i < sizeof(UInt); ++i) {
+  for (size_t i = 0; i < sizeof(UInt); ++i) {
     const unsigned char bits = static_cast<unsigned char>(data & 0xFFU);
     char ch;
     ::memcpy(&ch, &bits, sizeof(bits));
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 6fc4814d53ab2a1388eba590768f13403f22ff68..d14fc5d6856376db9baa6d8c6599ff437fc0db54 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -40,6 +40,9 @@ See the @{$python/contrib.framework} guide.
 @@has_arg_scope
 @@arg_scoped_arguments
 
+@@prepend_name_scope
+@@strip_name_scope
+
 @@add_model_variable
 @@assert_global_step
 @@assert_or_get_global_step
@@ -53,6 +56,7 @@ See the @{$python/contrib.framework} guide.
 @@get_or_create_global_step
 @@get_local_variables
 @@get_model_variables
+@@get_name_scope
 @@get_trainable_variables
 @@get_unique_variable
 @@get_variables_by_name
@@ -81,6 +85,9 @@ from tensorflow.contrib.framework.python.framework import *
 from tensorflow.contrib.framework.python.ops import *
 # pylint: enable=unused-import,wildcard-import
 
+from tensorflow.python.framework.ops import prepend_name_scope
+from tensorflow.python.framework.ops import strip_name_scope
+
 from tensorflow.python.util.all_util import remove_undocumented
 
 
diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py
index 5d078236ac331c584faa503edcc50132ae980037..36de1e3f8216e0e7995838d1a145ed6dc554dc6b 100644
--- a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py
+++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py
@@ -272,7 +272,7 @@ def init_from_checkpoint(checkpoint_dir, assignment_map):
       # and create variable to variable mapping.
       scope_variables = set()
       for var_name in var_scope._vars:
-        if var_name.startswith(scopes):
+        if not scopes or var_name.startswith(scopes + "/"):
           # Consume /part_ if partitioned variable.
           if "/part_" in var_name:
             var_name = var_name[:var_name.index("/part_")]
diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
index 51ca5ec1251dd97461fd984d6a84ff379380c81a..9396f027d31e2bbfebb868f984847c69242b364d 100644
--- a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
+++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py
@@ -144,6 +144,25 @@ class CheckpointsTest(test.TestCase):
         # Check that tensors are not explicitly in the graph.
         self.assertLess(len(str(session.graph.as_graph_def())), 27000)
 
+  def testInitWithScopeDoesNotCaptureSuffixes(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      _, _, _, v4 = _create_checkpoints(session, checkpoint_dir)
+
+    with ops.Graph().as_default() as g:
+      with variable_scope.variable_scope("useful_scope"):
+        my4 = variable_scope.get_variable("var4", [9, 9])
+      with variable_scope.variable_scope("useful_scope_1"):
+        my5_init = [[1.0, 2.0], [3.0, 4.0]]
+        my5 = variable_scope.get_variable("var5", initializer=my5_init)
+
+      checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                            {"useful_scope/": "useful_scope/"})
+      with self.test_session(graph=g) as session:
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(my4.eval(session), v4)
+        self.assertAllEqual(my5.eval(session), my5_init)
+
   def testInitFromRootCheckpoint(self):
     checkpoint_dir = self.get_temp_dir()
     with self.test_session() as session:
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index ad84cd681aa2ccc266db8df60222b155246225ff..9c194ec202ab6150278b26e844b9d3e97a7d6761 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -61,8 +61,9 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import contextlib
-import functools
+
+from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util import tf_decorator
 
 __all__ = ['arg_scope',
            'add_arg_scope',
@@ -106,7 +107,7 @@ def _add_op(op):
     _DECORATED_OPS[key_op] = _kwarg_names(op)
 
 
-@contextlib.contextmanager
+@tf_contextlib.contextmanager
 def arg_scope(list_ops_or_scope, **kwargs):
   """Stores the default arguments for the given set of list_ops.
 
@@ -170,7 +171,6 @@ def add_arg_scope(func):
   Returns:
     A tuple with the decorated function func_with_args().
   """
-  @functools.wraps(func)
   def func_with_args(*args, **kwargs):
     current_scope = _current_arg_scope()
     current_args = kwargs
@@ -181,8 +181,7 @@ def add_arg_scope(func):
     return func(*args, **current_args)
   _add_op(func)
   setattr(func_with_args, '_key_op', _key_op(func))
-  setattr(func_with_args, '__doc__', func.__doc__)
-  return func_with_args
+  return tf_decorator.make_decorator(func, func_with_args)
 
 
 def has_arg_scope(func):
diff --git a/tensorflow/contrib/framework/python/ops/ops.py b/tensorflow/contrib/framework/python/ops/ops.py
index f403942fe7d382c730cdd1d1dcf2364ddb802eb7..4fccc2ceac7cdcdb3ce6de86e9591d1f7eb53212 100644
--- a/tensorflow/contrib/framework/python/ops/ops.py
+++ b/tensorflow/contrib/framework/python/ops/ops.py
@@ -21,7 +21,8 @@ from __future__ import print_function
 from tensorflow.python.framework import ops
 
 
-__all__ = ['get_graph_from_inputs']
+__all__ = ['get_graph_from_inputs',
+           'get_name_scope']
 
 
 def get_graph_from_inputs(op_input_list, graph=None):
@@ -52,3 +53,21 @@ def get_graph_from_inputs(op_input_list, graph=None):
   """
   # pylint: disable=protected-access
   return ops._get_graph_from_inputs(op_input_list, graph)
+
+
+def get_name_scope():
+  """Returns the current name scope of the default graph.
+
+  For example:
+
+    ```python
+    with tf.name_scope('scope1'):
+      with tf.name_scope('scope2'):
+        print(tf.contrib.framework.get_name_scope())
+    ```
+    would print the string `scope1/scope2`.
+
+  Returns:
+    A string represnting the current name scope.
+  """
+  return ops.get_default_graph().get_name_scope()
diff --git a/tensorflow/contrib/framework/python/ops/ops_test.py b/tensorflow/contrib/framework/python/ops/ops_test.py
index 321ca6b82d4215f3bbbd5679a46a5ad8ce9890da..19bcb5d22e0f82f0827cf478a231b73f46653e93 100644
--- a/tensorflow/contrib/framework/python/ops/ops_test.py
+++ b/tensorflow/contrib/framework/python/ops/ops_test.py
@@ -57,6 +57,15 @@ class OpsTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "not from the passed-in graph"):
       ops_lib.get_graph_from_inputs(values, g1)
 
+  def testGetNameScope(self):
+    with ops.name_scope("scope1"):
+      with ops.name_scope("scope2"):
+        with ops.name_scope("scope3"):
+          self.assertEqual("scope1/scope2/scope3", ops_lib.get_name_scope())
+        self.assertEqual("scope1/scope2", ops_lib.get_name_scope())
+      self.assertEqual("scope1", ops_lib.get_name_scope())
+    self.assertEqual("", ops_lib.get_name_scope())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 4f8e266cbe01267779d2ee9bc9f8c60c7d155d90..f02a7c636068b651c60f58009ad056036473d655 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -32,7 +32,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.platform import resource_loader
@@ -157,7 +156,7 @@ def local_variable(initial_value, validate_shape=True, name=None):
   Returns:
     New variable.
   """
-  return variables.Variable(
+  return variable_scope.variable(
       initial_value, trainable=False,
       collections=[ops.GraphKeys.LOCAL_VARIABLES],
       validate_shape=validate_shape, name=name)
diff --git a/tensorflow/contrib/graph_editor/util.py b/tensorflow/contrib/graph_editor/util.py
index 01c31ffc18af06c2b6a050cdb2cdc905612ac23d..959905e9826fe439112078a32fef9a5f5b96e9ac 100644
--- a/tensorflow/contrib/graph_editor/util.py
+++ b/tensorflow/contrib/graph_editor/util.py
@@ -39,10 +39,21 @@ __all__ = [
 
 
 def concatenate_unique(la, lb):
-  """Add all the elements of lb in la if they are not there already."""
+  """Add all the elements of `lb` to `la` if they are not there already.
+
+  The elements added to `la` maintain ordering with respect to `lb`.
+
+  Args:
+    la: List of Python objects.
+    lb: List of Python objects.
+  Returns:
+    `la`: The list `la` with missing elements from `lb`.
+  """
+  la_set = set(la)
   for l in lb:
-    if l not in la:
+    if l not in la_set:
       la.append(l)
+      la_set.add(l)
   return la
 
 
@@ -119,7 +130,7 @@ def transform_tree(tree, fn, iterable_type=tuple):
     tree: iterable or not. If iterable, its elements (child) can also be
       iterable or not.
     fn: function to apply to each leaves.
-    iterable_type: type use to construct the resulting tree for unknwon
+    iterable_type: type use to construct the resulting tree for unknown
       iterable, typically `list` or `tuple`.
   Returns:
     A tree whose leaves has been transformed by `fn`.
diff --git a/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py b/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
index 758e0bcc07ce6ddc6dfd36070d4b1eb4ede8ca80..280271a42dc7fc007c4c0c06b64e4532472a728c 100644
--- a/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
+++ b/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
@@ -34,180 +34,228 @@ from tensorflow.python.platform import test
 class GridRNNCellTest(test.TestCase):
 
   def testGrid2BasicLSTMCell(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=False) as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.2)) as root_scope:
         x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 8])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
+             (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
         cell = grid_rnn_cell.Grid2BasicLSTMCell(2)
-        self.assertEqual(cell.state_size, 8)
+        self.assertEqual(cell.state_size, ((2, 2), (2, 2)))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 8))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
+        self.assertEqual(s[1].c.get_shape(), (1, 2))
+        self.assertEqual(s[1].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
-            x: np.array([[1., 1., 1.]]),
-            m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]])
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
         })
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 8))
-        self.assertAllClose(res[0], [[0.36617181, 0.36617181]])
-        self.assertAllClose(res[1], [[0.71053141, 0.71053141, 0.36617181,
-                                      0.36617181, 0.72320831, 0.80555487,
-                                      0.39102408, 0.42150158]])
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertEqual(res_s[0].c.shape, (1, 2))
+        self.assertEqual(res_s[0].h.shape, (1, 2))
+        self.assertEqual(res_s[1].c.shape, (1, 2))
+        self.assertEqual(res_s[1].h.shape, (1, 2))
+
+        self.assertAllClose(res_g, ([[0.36617181, 0.36617181]],))
+        self.assertAllClose(
+            res_s, (([[0.71053141, 0.71053141]], [[0.36617181, 0.36617181]]),
+                    ([[0.72320831, 0.80555487]], [[0.39102408, 0.42150158]])))
 
         # emulate a loop through the input sequence,
         # where we call cell() multiple times
         root_scope.reuse_variables()
         g2, s2 = cell(x, m)
-        self.assertEqual(g2.get_shape(), (1, 2))
-        self.assertEqual(s2.get_shape(), (1, 8))
-
-        res = sess.run([g2, s2], {x: np.array([[2., 2., 2.]]), m: res[1]})
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 8))
-        self.assertAllClose(res[0], [[0.58847463, 0.58847463]])
-        self.assertAllClose(res[1], [[1.40469193, 1.40469193, 0.58847463,
-                                      0.58847463, 0.97726452, 1.04626071,
-                                      0.4927212, 0.51137757]])
+        self.assertEqual(g2[0].get_shape(), (1, 2))
+        self.assertEqual(s2[0].c.get_shape(), (1, 2))
+        self.assertEqual(s2[0].h.get_shape(), (1, 2))
+        self.assertEqual(s2[1].c.get_shape(), (1, 2))
+        self.assertEqual(s2[1].h.get_shape(), (1, 2))
+
+        res_g2, res_s2 = sess.run([g2, s2],
+                                  {x: np.array([[2., 2., 2.]]),
+                                   m: res_s})
+        self.assertEqual(res_g2[0].shape, (1, 2))
+        self.assertEqual(res_s2[0].c.shape, (1, 2))
+        self.assertEqual(res_s2[0].h.shape, (1, 2))
+        self.assertEqual(res_s2[1].c.shape, (1, 2))
+        self.assertEqual(res_s2[1].h.shape, (1, 2))
+        self.assertAllClose(res_g2[0], [[0.58847463, 0.58847463]])
+        self.assertAllClose(
+            res_s2, (([[1.40469193, 1.40469193]], [[0.58847463, 0.58847463]]),
+                     ([[0.97726452, 1.04626071]], [[0.4927212, 0.51137757]])))
 
   def testGrid2BasicLSTMCellTied(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=False) as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.2)):
         x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 8])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
+             (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
         cell = grid_rnn_cell.Grid2BasicLSTMCell(2, tied=True)
-        self.assertEqual(cell.state_size, 8)
+        self.assertEqual(cell.state_size, ((2, 2), (2, 2)))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 8))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
+        self.assertEqual(s[1].c.get_shape(), (1, 2))
+        self.assertEqual(s[1].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
-            x: np.array([[1., 1., 1.]]),
-            m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]])
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
         })
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 8))
-        self.assertAllClose(res[0], [[0.36617181, 0.36617181]])
-        self.assertAllClose(res[1], [[0.71053141, 0.71053141, 0.36617181,
-                                      0.36617181, 0.72320831, 0.80555487,
-                                      0.39102408, 0.42150158]])
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertEqual(res_s[0].c.shape, (1, 2))
+        self.assertEqual(res_s[0].h.shape, (1, 2))
+        self.assertEqual(res_s[1].c.shape, (1, 2))
+        self.assertEqual(res_s[1].h.shape, (1, 2))
 
-        res = sess.run([g, s], {x: np.array([[1., 1., 1.]]), m: res[1]})
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 8))
-        self.assertAllClose(res[0], [[0.36703536, 0.36703536]])
-        self.assertAllClose(res[1], [[0.71200621, 0.71200621, 0.36703536,
-                                      0.36703536, 0.80941606, 0.87550586,
-                                      0.40108523, 0.42199609]])
+        self.assertAllClose(res_g[0], [[0.36617181, 0.36617181]])
+        self.assertAllClose(
+            res_s, (([[0.71053141, 0.71053141]], [[0.36617181, 0.36617181]]),
+                    ([[0.72320831, 0.80555487]], [[0.39102408, 0.42150158]])))
+
+        res_g, res_s = sess.run([g, s], {x: np.array([[1., 1., 1.]]), m: res_s})
+        self.assertEqual(res_g[0].shape, (1, 2))
+
+        self.assertAllClose(res_g[0], [[0.36703536, 0.36703536]])
+        self.assertAllClose(
+            res_s, (([[0.71200621, 0.71200621]], [[0.36703536, 0.36703536]]),
+                    ([[0.80941606, 0.87550586]], [[0.40108523, 0.42199609]])))
 
   def testGrid2BasicLSTMCellWithRelu(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=False) as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.2)):
         x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 4])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)
         cell = grid_rnn_cell.Grid2BasicLSTMCell(
             2, tied=False, non_recurrent_fn=nn_ops.relu)
-        self.assertEqual(cell.state_size, 4)
+        self.assertEqual(cell.state_size, ((2, 2),))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 4))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run(
-            [g, s],
-            {x: np.array([[1., 1., 1.]]),
-             m: np.array([[0.1, 0.2, 0.3, 0.4]])})
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 4))
-        self.assertAllClose(res[0], [[0.31667367, 0.31667367]])
-        self.assertAllClose(res[1], [[0.29530135, 0.37520045, 0.17044567,
-                                      0.21292259]])
+        res_g, res_s = sess.run([g, s], {
+            x: np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),)
+        })
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertAllClose(res_g[0], [[0.31667367, 0.31667367]])
+        self.assertAllClose(res_s, (([[0.29530135, 0.37520045]],
+                                     [[0.17044567, 0.21292259]]),))
 
   """LSTMCell
   """
 
   def testGrid2LSTMCell(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=False) as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 8])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
+             (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
         cell = grid_rnn_cell.Grid2LSTMCell(2, use_peepholes=True)
-        self.assertEqual(cell.state_size, 8)
+        self.assertEqual(cell.state_size, ((2, 2), (2, 2)))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 8))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
+        self.assertEqual(s[1].c.get_shape(), (1, 2))
+        self.assertEqual(s[1].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
-            x: np.array([[1., 1., 1.]]),
-            m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]])
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
         })
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 8))
-        self.assertAllClose(res[0], [[0.95686918, 0.95686918]])
-        self.assertAllClose(res[1], [[2.41515064, 2.41515064, 0.95686918,
-                                      0.95686918, 1.38917875, 1.49043763,
-                                      0.83884692, 0.86036491]])
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertEqual(res_s[0].c.shape, (1, 2))
+        self.assertEqual(res_s[0].h.shape, (1, 2))
+        self.assertEqual(res_s[1].c.shape, (1, 2))
+        self.assertEqual(res_s[1].h.shape, (1, 2))
+
+        self.assertAllClose(res_g[0], [[0.95686918, 0.95686918]])
+        self.assertAllClose(
+            res_s, (([[2.41515064, 2.41515064]], [[0.95686918, 0.95686918]]),
+                    ([[1.38917875, 1.49043763]], [[0.83884692, 0.86036491]])))
 
   def testGrid2LSTMCellTied(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=False) as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 8])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
+             (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
         cell = grid_rnn_cell.Grid2LSTMCell(2, tied=True, use_peepholes=True)
-        self.assertEqual(cell.state_size, 8)
+        self.assertEqual(cell.state_size, ((2, 2), (2, 2)))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 8))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
+        self.assertEqual(s[1].c.get_shape(), (1, 2))
+        self.assertEqual(s[1].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
-            x: np.array([[1., 1., 1.]]),
-            m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]])
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])))
         })
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 8))
-        self.assertAllClose(res[0], [[0.95686918, 0.95686918]])
-        self.assertAllClose(res[1], [[2.41515064, 2.41515064, 0.95686918,
-                                      0.95686918, 1.38917875, 1.49043763,
-                                      0.83884692, 0.86036491]])
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertEqual(res_s[0].c.shape, (1, 2))
+        self.assertEqual(res_s[0].h.shape, (1, 2))
+        self.assertEqual(res_s[1].c.shape, (1, 2))
+        self.assertEqual(res_s[1].h.shape, (1, 2))
+
+        self.assertAllClose(res_g[0], [[0.95686918, 0.95686918]])
+        self.assertAllClose(
+            res_s, (([[2.41515064, 2.41515064]], [[0.95686918, 0.95686918]]),
+                    ([[1.38917875, 1.49043763]], [[0.83884692, 0.86036491]])))
 
   def testGrid2LSTMCellWithRelu(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 4])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)
         cell = grid_rnn_cell.Grid2LSTMCell(
             2, use_peepholes=True, non_recurrent_fn=nn_ops.relu)
-        self.assertEqual(cell.state_size, 4)
+        self.assertEqual(cell.state_size, ((2, 2),))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 4))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run(
-            [g, s],
-            {x: np.array([[1., 1., 1.]]),
-             m: np.array([[0.1, 0.2, 0.3, 0.4]])})
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 4))
-        self.assertAllClose(res[0], [[2.1831727, 2.1831727]])
-        self.assertAllClose(res[1], [[0.92270052, 1.02325559, 0.66159075,
-                                      0.70475441]])
+        res_g, res_s = sess.run([g, s], {
+            x: np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),)
+        })
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertAllClose(res_g[0], [[2.1831727, 2.1831727]])
+        self.assertAllClose(res_s, (([[0.92270052, 1.02325559]],
+                                     [[0.66159075, 0.70475441]]),))
 
   """RNNCell
   """
@@ -217,74 +265,84 @@ class GridRNNCellTest(test.TestCase):
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([2, 2])
-        m = array_ops.zeros([2, 4])
+        m = (array_ops.zeros([2, 2]), array_ops.zeros([2, 2]))
         cell = grid_rnn_cell.Grid2BasicRNNCell(2)
-        self.assertEqual(cell.state_size, 4)
+        self.assertEqual(cell.state_size, (2, 2))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (2, 2))
-        self.assertEqual(s.get_shape(), (2, 4))
+        self.assertEqual(g[0].get_shape(), (2, 2))
+        self.assertEqual(s[0].get_shape(), (2, 2))
+        self.assertEqual(s[1].get_shape(), (2, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
-            x: np.array([[1., 1.], [2., 2.]]),
-            m: np.array([[0.1, 0.1, 0.1, 0.1], [0.2, 0.2, 0.2, 0.2]])
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1.], [2., 2.]]),
+            m: (np.array([[0.1, 0.1], [0.2, 0.2]]), np.array([[0.1, 0.1],
+                                                              [0.2, 0.2]]))
         })
-        self.assertEqual(res[0].shape, (2, 2))
-        self.assertEqual(res[1].shape, (2, 4))
-        self.assertAllClose(res[0], [[0.94685763, 0.94685763],
-                                     [0.99480951, 0.99480951]])
-        self.assertAllClose(res[1],
-                            [[0.94685763, 0.94685763, 0.80049908, 0.80049908],
-                             [0.99480951, 0.99480951, 0.97574311, 0.97574311]])
+        self.assertEqual(res_g[0].shape, (2, 2))
+        self.assertEqual(res_s[0].shape, (2, 2))
+        self.assertEqual(res_s[1].shape, (2, 2))
+
+        self.assertAllClose(res_g, ([[0.94685763, 0.94685763],
+                                     [0.99480951, 0.99480951]],))
+        self.assertAllClose(
+            res_s, ([[0.94685763, 0.94685763], [0.99480951, 0.99480951]],
+                    [[0.80049908, 0.80049908], [0.97574311, 0.97574311]]))
 
   def testGrid2BasicRNNCellTied(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([2, 2])
-        m = array_ops.zeros([2, 4])
+        m = (array_ops.zeros([2, 2]), array_ops.zeros([2, 2]))
         cell = grid_rnn_cell.Grid2BasicRNNCell(2, tied=True)
-        self.assertEqual(cell.state_size, 4)
+        self.assertEqual(cell.state_size, (2, 2))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (2, 2))
-        self.assertEqual(s.get_shape(), (2, 4))
+        self.assertEqual(g[0].get_shape(), (2, 2))
+        self.assertEqual(s[0].get_shape(), (2, 2))
+        self.assertEqual(s[1].get_shape(), (2, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
-            x: np.array([[1., 1.], [2., 2.]]),
-            m: np.array([[0.1, 0.1, 0.1, 0.1], [0.2, 0.2, 0.2, 0.2]])
+        res_g, res_s = sess.run([g, s], {
+            x:
+                np.array([[1., 1.], [2., 2.]]),
+            m: (np.array([[0.1, 0.1], [0.2, 0.2]]), np.array([[0.1, 0.1],
+                                                              [0.2, 0.2]]))
         })
-        self.assertEqual(res[0].shape, (2, 2))
-        self.assertEqual(res[1].shape, (2, 4))
-        self.assertAllClose(res[0], [[0.94685763, 0.94685763],
-                                     [0.99480951, 0.99480951]])
-        self.assertAllClose(res[1],
-                            [[0.94685763, 0.94685763, 0.80049908, 0.80049908],
-                             [0.99480951, 0.99480951, 0.97574311, 0.97574311]])
+        self.assertEqual(res_g[0].shape, (2, 2))
+        self.assertEqual(res_s[0].shape, (2, 2))
+        self.assertEqual(res_s[1].shape, (2, 2))
+
+        self.assertAllClose(res_g, ([[0.94685763, 0.94685763],
+                                     [0.99480951, 0.99480951]],))
+        self.assertAllClose(
+            res_s, ([[0.94685763, 0.94685763], [0.99480951, 0.99480951]],
+                    [[0.80049908, 0.80049908], [0.97574311, 0.97574311]]))
 
   def testGrid2BasicRNNCellWithRelu(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
+        m = (array_ops.zeros([1, 2]),)
         cell = grid_rnn_cell.Grid2BasicRNNCell(2, non_recurrent_fn=nn_ops.relu)
-        self.assertEqual(cell.state_size, 2)
+        self.assertEqual(cell.state_size, (2,))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 2))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s],
-                       {x: np.array([[1., 1.]]),
-                        m: np.array([[0.1, 0.1]])})
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 2))
-        self.assertAllClose(res[0], [[1.80049896, 1.80049896]])
-        self.assertAllClose(res[1], [[0.80049896, 0.80049896]])
+        res_g, res_s = sess.run(
+            [g, s], {x: np.array([[1., 1.]]),
+                     m: np.array([[0.1, 0.1]])})
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertEqual(res_s[0].shape, (1, 2))
+        self.assertAllClose(res_g, ([[1.80049896, 1.80049896]],))
+        self.assertAllClose(res_s, ([[0.80049896, 0.80049896]],))
 
   """1-LSTM
   """
@@ -294,51 +352,59 @@ class GridRNNCellTest(test.TestCase):
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)) as root_scope:
         x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 4])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)
         cell = grid_rnn_cell.Grid1LSTMCell(2, use_peepholes=True)
-        self.assertEqual(cell.state_size, 4)
+        self.assertEqual(cell.state_size, ((2, 2),))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 4))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run(
-            [g, s],
-            {x: np.array([[1., 1., 1.]]),
-             m: np.array([[0.1, 0.2, 0.3, 0.4]])})
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 4))
-        self.assertAllClose(res[0], [[0.91287315, 0.91287315]])
-        self.assertAllClose(res[1],
-                            [[2.26285243, 2.26285243, 0.91287315, 0.91287315]])
+        res_g, res_s = sess.run([g, s], {
+            x: np.array([[1., 1., 1.]]),
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),)
+        })
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertEqual(res_s[0].c.shape, (1, 2))
+        self.assertEqual(res_s[0].h.shape, (1, 2))
+
+        self.assertAllClose(res_g, ([[0.91287315, 0.91287315]],))
+        self.assertAllClose(res_s, (([[2.26285243, 2.26285243]],
+                                     [[0.91287315, 0.91287315]]),))
 
         root_scope.reuse_variables()
 
         x2 = array_ops.zeros([0, 0])
         g2, s2 = cell(x2, m)
-        self.assertEqual(g2.get_shape(), (1, 2))
-        self.assertEqual(s2.get_shape(), (1, 4))
+        self.assertEqual(g2[0].get_shape(), (1, 2))
+        self.assertEqual(s2[0].c.get_shape(), (1, 2))
+        self.assertEqual(s2[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g2, s2], {m: res[1]})
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 4))
-        self.assertAllClose(res[0], [[0.9032144, 0.9032144]])
-        self.assertAllClose(res[1],
-                            [[2.79966092, 2.79966092, 0.9032144, 0.9032144]])
+        res_g2, res_s2 = sess.run([g2, s2], {m: res_s})
+        self.assertEqual(res_g2[0].shape, (1, 2))
+        self.assertEqual(res_s2[0].c.shape, (1, 2))
+        self.assertEqual(res_s2[0].h.shape, (1, 2))
+
+        self.assertAllClose(res_g2, ([[0.9032144, 0.9032144]],))
+        self.assertAllClose(res_s2, (([[2.79966092, 2.79966092]],
+                                      [[0.9032144, 0.9032144]]),))
 
         g3, s3 = cell(x2, m)
-        self.assertEqual(g3.get_shape(), (1, 2))
-        self.assertEqual(s3.get_shape(), (1, 4))
+        self.assertEqual(g3[0].get_shape(), (1, 2))
+        self.assertEqual(s3[0].c.get_shape(), (1, 2))
+        self.assertEqual(s3[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g3, s3], {m: res[1]})
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 4))
-        self.assertAllClose(res[0], [[0.92727238, 0.92727238]])
-        self.assertAllClose(res[1],
-                            [[3.3529923, 3.3529923, 0.92727238, 0.92727238]])
+        res_g3, res_s3 = sess.run([g3, s3], {m: res_s2})
+        self.assertEqual(res_g3[0].shape, (1, 2))
+        self.assertEqual(res_s3[0].c.shape, (1, 2))
+        self.assertEqual(res_s3[0].h.shape, (1, 2))
+        self.assertAllClose(res_g3, ([[0.92727238, 0.92727238]],))
+        self.assertAllClose(res_s3, (([[3.3529923, 3.3529923]],
+                                      [[0.92727238, 0.92727238]]),))
 
   """3-LSTM
   """
@@ -348,32 +414,42 @@ class GridRNNCellTest(test.TestCase):
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 12])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
+             (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),
+             (array_ops.zeros([1, 2]), array_ops.zeros([1, 2])))
         cell = grid_rnn_cell.Grid3LSTMCell(2, use_peepholes=True)
-        self.assertEqual(cell.state_size, 12)
+        self.assertEqual(cell.state_size, ((2, 2), (2, 2), (2, 2)))
 
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (1, 2))
-        self.assertEqual(s.get_shape(), (1, 12))
+        self.assertEqual(g[0].get_shape(), (1, 2))
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
+        self.assertEqual(s[1].c.get_shape(), (1, 2))
+        self.assertEqual(s[1].h.get_shape(), (1, 2))
+        self.assertEqual(s[2].c.get_shape(), (1, 2))
+        self.assertEqual(s[2].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {
+        res_g, res_s = sess.run([g, s], {
             x:
                 np.array([[1., 1., 1.]]),
-            m:
-                np.array([[
-                    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, -0.1, -0.2, -0.3,
-                    -0.4
-                ]])
+            m: ((np.array([[0.1, 0.2]]), np.array([[0.3, 0.4]])),
+                (np.array([[0.5, 0.6]]), np.array([[0.7, 0.8]])), (np.array(
+                    [[-0.1, -0.2]]), np.array([[-0.3, -0.4]])))
         })
-        self.assertEqual(res[0].shape, (1, 2))
-        self.assertEqual(res[1].shape, (1, 12))
-
-        self.assertAllClose(res[0], [[0.96892911, 0.96892911]])
-        self.assertAllClose(res[1], [[2.45227885, 2.45227885, 0.96892911,
-                                      0.96892911, 1.33592629, 1.4373529,
-                                      0.80867189, 0.83247656, 0.7317788,
-                                      0.63205892, 0.56548983, 0.50446129]])
+        self.assertEqual(res_g[0].shape, (1, 2))
+        self.assertEqual(res_s[0].c.shape, (1, 2))
+        self.assertEqual(res_s[0].h.shape, (1, 2))
+        self.assertEqual(res_s[1].c.shape, (1, 2))
+        self.assertEqual(res_s[1].h.shape, (1, 2))
+        self.assertEqual(res_s[2].c.shape, (1, 2))
+        self.assertEqual(res_s[2].h.shape, (1, 2))
+
+        self.assertAllClose(res_g, ([[0.96892911, 0.96892911]],))
+        self.assertAllClose(
+            res_s, (([[2.45227885, 2.45227885]], [[0.96892911, 0.96892911]]),
+                    ([[1.33592629, 1.4373529]], [[0.80867189, 0.83247656]]),
+                    ([[0.7317788, 0.63205892]], [[0.56548983, 0.50446129]])))
 
   """Edge cases
   """
@@ -383,7 +459,7 @@ class GridRNNCellTest(test.TestCase):
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([3, 2])
-        m = array_ops.zeros([0, 0])
+        m = ()
 
         # this is equivalent to relu
         cell = grid_rnn_cell.GridRNNCell(
@@ -394,21 +470,22 @@ class GridRNNCellTest(test.TestCase):
             non_recurrent_dims=0,
             non_recurrent_fn=nn_ops.relu)
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (3, 2))
-        self.assertEqual(s.get_shape(), (0, 0))
+        self.assertEqual(g[0].get_shape(), (3, 2))
+        self.assertEqual(s, ())
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([g, s], {x: np.array([[1., -1.], [-2, 1], [2, -1]])})
-        self.assertEqual(res[0].shape, (3, 2))
-        self.assertEqual(res[1].shape, (0, 0))
-        self.assertAllClose(res[0], [[0, 0], [0, 0], [0.5, 0.5]])
+        res_g, res_s = sess.run([g, s],
+                                {x: np.array([[1., -1.], [-2, 1], [2, -1]])})
+        self.assertEqual(res_g[0].shape, (3, 2))
+        self.assertEqual(res_s, ())
+        self.assertAllClose(res_g, ([[0, 0], [0, 0], [0.5, 0.5]],))
 
   def testGridRNNEdgeCasesNoOutput(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 4])
+        m = ((array_ops.zeros([1, 2]), array_ops.zeros([1, 2])),)
 
         # This cell produces no output
         cell = grid_rnn_cell.GridRNNCell(
@@ -419,16 +496,18 @@ class GridRNNCellTest(test.TestCase):
             non_recurrent_dims=0,
             non_recurrent_fn=nn_ops.relu)
         g, s = cell(x, m)
-        self.assertEqual(g.get_shape(), (0, 0))
-        self.assertEqual(s.get_shape(), (1, 4))
+        self.assertEqual(g, ())
+        self.assertEqual(s[0].c.get_shape(), (1, 2))
+        self.assertEqual(s[0].h.get_shape(), (1, 2))
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run(
-            [g, s],
-            {x: np.array([[1., 1.]]),
-             m: np.array([[0.1, 0.1, 0.1, 0.1]])})
-        self.assertEqual(res[0].shape, (0, 0))
-        self.assertEqual(res[1].shape, (1, 4))
+        res_g, res_s = sess.run([g, s], {
+            x: np.array([[1., 1.]]),
+            m: ((np.array([[0.1, 0.1]]), np.array([[0.1, 0.1]])),)
+        })
+        self.assertEqual(res_g, ())
+        self.assertEqual(res_s[0].c.shape, (1, 2))
+        self.assertEqual(res_s[0].h.shape, (1, 2))
 
   """Test with tf.nn.rnn
   """
@@ -451,20 +530,29 @@ class GridRNNCellTest(test.TestCase):
       outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
     self.assertEqual(len(outputs), len(inputs))
-    self.assertEqual(state.get_shape(), (batch_size, 8))
+    self.assertEqual(state[0].c.get_shape(), (batch_size, 2))
+    self.assertEqual(state[0].h.get_shape(), (batch_size, 2))
+    self.assertEqual(state[1].c.get_shape(), (batch_size, 2))
+    self.assertEqual(state[1].h.get_shape(), (batch_size, 2))
 
     for out, inp in zip(outputs, inputs):
-      self.assertEqual(out.get_shape()[0], inp.get_shape()[0])
-      self.assertEqual(out.get_shape()[1], num_units)
-      self.assertEqual(out.dtype, inp.dtype)
+      self.assertEqual(len(out), 1)
+      self.assertEqual(out[0].get_shape()[0], inp.get_shape()[0])
+      self.assertEqual(out[0].get_shape()[1], num_units)
+      self.assertEqual(out[0].dtype, inp.dtype)
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
 
       input_value = np.ones((batch_size, input_size))
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
-      for v in values:
-        self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[:-1]:
+        for v in tp:
+          self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[-1]:
+        for st in tp:
+          for v in st:
+            self.assertTrue(np.all(np.isfinite(v)))
 
   def testGrid2LSTMCellReLUWithRNN(self):
     batch_size = 3
@@ -478,27 +566,33 @@ class GridRNNCellTest(test.TestCase):
           num_units=num_units, non_recurrent_fn=nn_ops.relu)
 
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
 
       outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
     self.assertEqual(len(outputs), len(inputs))
-    self.assertEqual(state.get_shape(), (batch_size, 4))
+    self.assertEqual(state[0].c.get_shape(), (batch_size, 2))
+    self.assertEqual(state[0].h.get_shape(), (batch_size, 2))
 
     for out, inp in zip(outputs, inputs):
-      self.assertEqual(out.get_shape()[0], inp.get_shape()[0])
-      self.assertEqual(out.get_shape()[1], num_units)
-      self.assertEqual(out.dtype, inp.dtype)
+      self.assertEqual(len(out), 1)
+      self.assertEqual(out[0].get_shape()[0], inp.get_shape()[0])
+      self.assertEqual(out[0].get_shape()[1], num_units)
+      self.assertEqual(out[0].dtype, inp.dtype)
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
 
       input_value = np.ones((batch_size, input_size))
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
-      for v in values:
-        self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[:-1]:
+        for v in tp:
+          self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[-1]:
+        for st in tp:
+          for v in st:
+            self.assertTrue(np.all(np.isfinite(v)))
 
   def testGrid3LSTMCellReLUWithRNN(self):
     batch_size = 3
@@ -512,27 +606,35 @@ class GridRNNCellTest(test.TestCase):
           num_units=num_units, non_recurrent_fn=nn_ops.relu)
 
       inputs = max_length * [
-          array_ops.placeholder(
-              dtypes.float32, shape=(batch_size, input_size))
+          array_ops.placeholder(dtypes.float32, shape=(batch_size, input_size))
       ]
 
       outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
     self.assertEqual(len(outputs), len(inputs))
-    self.assertEqual(state.get_shape(), (batch_size, 8))
+    self.assertEqual(state[0].c.get_shape(), (batch_size, 2))
+    self.assertEqual(state[0].h.get_shape(), (batch_size, 2))
+    self.assertEqual(state[1].c.get_shape(), (batch_size, 2))
+    self.assertEqual(state[1].h.get_shape(), (batch_size, 2))
 
     for out, inp in zip(outputs, inputs):
-      self.assertEqual(out.get_shape()[0], inp.get_shape()[0])
-      self.assertEqual(out.get_shape()[1], num_units)
-      self.assertEqual(out.dtype, inp.dtype)
+      self.assertEqual(len(out), 1)
+      self.assertEqual(out[0].get_shape()[0], inp.get_shape()[0])
+      self.assertEqual(out[0].get_shape()[1], num_units)
+      self.assertEqual(out[0].dtype, inp.dtype)
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
 
       input_value = np.ones((batch_size, input_size))
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
-      for v in values:
-        self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[:-1]:
+        for v in tp:
+          self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[-1]:
+        for st in tp:
+          for v in st:
+            self.assertTrue(np.all(np.isfinite(v)))
 
   def testGrid1LSTMCellWithRNN(self):
     batch_size = 3
@@ -553,20 +655,91 @@ class GridRNNCellTest(test.TestCase):
       outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
     self.assertEqual(len(outputs), len(inputs))
-    self.assertEqual(state.get_shape(), (batch_size, 4))
+    self.assertEqual(state[0].c.get_shape(), (batch_size, 2))
+    self.assertEqual(state[0].h.get_shape(), (batch_size, 2))
 
     for out, inp in zip(outputs, inputs):
-      self.assertEqual(out.get_shape(), (3, num_units))
-      self.assertEqual(out.dtype, inp.dtype)
+      self.assertEqual(len(out), 1)
+      self.assertEqual(out[0].get_shape(), (3, num_units))
+      self.assertEqual(out[0].dtype, inp.dtype)
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
 
       input_value = np.ones((batch_size, input_size))
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
-      for v in values:
-        self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[:-1]:
+        for v in tp:
+          self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[-1]:
+        for st in tp:
+          for v in st:
+            self.assertTrue(np.all(np.isfinite(v)))
+
+  def testGrid2LSTMCellWithRNNAndDynamicBatchSize(self):
+    """Test for #4296."""
+    input_size = 5
+    max_length = 6  # unrolled up to this length
+    num_units = 2
+
+    with variable_scope.variable_scope(
+        'root', initializer=init_ops.constant_initializer(0.5)):
+      cell = grid_rnn_cell.Grid2LSTMCell(num_units=num_units)
 
+      inputs = max_length * [
+          array_ops.placeholder(dtypes.float32, shape=(None, input_size))
+      ]
+
+      outputs, state = core_rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
+
+    self.assertEqual(len(outputs), len(inputs))
+
+    for out, inp in zip(outputs, inputs):
+      self.assertEqual(len(out), 1)
+      self.assertTrue(out[0].get_shape()[0].value is None)
+      self.assertEqual(out[0].get_shape()[1], num_units)
+      self.assertEqual(out[0].dtype, inp.dtype)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+
+      input_value = np.ones((3, input_size))
+      values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
+      for tp in values[:-1]:
+        for v in tp:
+          self.assertTrue(np.all(np.isfinite(v)))
+      for tp in values[-1]:
+        for st in tp:
+          for v in st:
+            self.assertTrue(np.all(np.isfinite(v)))
+
+  def testGrid2LSTMCellLegacy(self):
+    """Test for legacy case (when state_is_tuple=False)."""
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          'root', initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 8])
+        cell = grid_rnn_cell.Grid2LSTMCell(
+            2, use_peepholes=True, state_is_tuple=False, output_is_tuple=False)
+        self.assertEqual(cell.state_size, 8)
+
+        g, s = cell(x, m)
+        self.assertEqual(g.get_shape(), (1, 2))
+        self.assertEqual(s.get_shape(), (1, 8))
+
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, s], {
+            x: np.array([[1., 1., 1.]]),
+            m: np.array([[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+        self.assertEqual(res[1].shape, (1, 8))
+        self.assertAllClose(res[0], [[0.95686918, 0.95686918]])
+        self.assertAllClose(res[1], [[
+            2.41515064, 2.41515064, 0.95686918, 0.95686918, 1.38917875,
+            1.49043763, 0.83884692, 0.86036491
+        ]])
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py b/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
index 269b2245819f6d41c2eaf5ac345d149c51f233a9..252788140f8c1906718c150574b963385b6ecfa1 100644
--- a/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
+++ b/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
@@ -25,6 +25,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import variable_scope as vs
+
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.contrib import layers
 from tensorflow.contrib import rnn
 
@@ -53,7 +55,9 @@ class GridRNNCell(rnn.RNNCell):
                non_recurrent_dims=None,
                tied=False,
                cell_fn=None,
-               non_recurrent_fn=None):
+               non_recurrent_fn=None,
+               state_is_tuple=True,
+               output_is_tuple=True):
     """Initialize the parameters of a Grid RNN cell
 
     Args:
@@ -68,26 +72,47 @@ class GridRNNCell(rnn.RNNCell):
       non_recurrent_dims: int or list, List of dimensions that are not
         recurrent.
               The transfer function for non-recurrent dimensions is specified
-                via `non_recurrent_fn`,
-              which is default to be `tensorflow.nn.relu`.
+                via `non_recurrent_fn`, which is
+                default to be `tensorflow.nn.relu`.
       tied: bool, Whether to share the weights among the dimensions of this
         GridRNN cell.
               If there are non-recurrent dimensions in the grid, weights are
-                shared between each
-              group of recurrent and non-recurrent dimensions.
-      cell_fn: function, a function which returns the recurrent cell object. Has
-        to be in the following signature:
-              def cell_func(num_units, input_size):
+                shared between each group of recurrent and non-recurrent
+                dimensions.
+      cell_fn: function, a function which returns the recurrent cell object.
+        Has to be in the following signature:
+              ```
+              def cell_func(num_units):
                 # ...
-
+              ```
               and returns an object of type `RNNCell`. If None, LSTMCell with
                 default parameters will be used.
+        Note that if you use a custom RNNCell (with `cell_fn`), it is your
+        responsibility to make sure the inner cell use `state_is_tuple=True`.
+
       non_recurrent_fn: a tensorflow Op that will be the transfer function of
         the non-recurrent dimensions
+      state_is_tuple: If True, accepted and returned states are tuples of the
+        states of the recurrent dimensions. If False, they are concatenated
+        along the column axis. The latter behavior will soon be deprecated.
+
+        Note that if you use a custom RNNCell (with `cell_fn`), it is your
+        responsibility to make sure the inner cell use `state_is_tuple=True`.
+
+      output_is_tuple: If True, the output is a tuple of the outputs of the
+        recurrent dimensions. If False, they are concatenated along the
+        column axis. The later behavior will soon be deprecated.
 
     Raises:
       TypeError: if cell_fn does not return an RNNCell instance.
     """
+    if not state_is_tuple:
+      logging.warning('%s: Using a concatenated state is slower and will '
+                      'soon be deprecated.  Use state_is_tuple=True.', self)
+    if not output_is_tuple:
+      logging.warning('%s: Using a concatenated output is slower and will'
+                      'soon be deprecated.  Use output_is_tuple=True.', self)
+
     if num_dims < 1:
       raise ValueError('dims must be >= 1: {}'.format(num_dims))
 
@@ -96,37 +121,41 @@ class GridRNNCell(rnn.RNNCell):
                                      non_recurrent_fn or nn.relu, tied,
                                      num_units)
 
-    cell_input_size = (self._config.num_dims - 1) * num_units
+    self._state_is_tuple = state_is_tuple
+    self._output_is_tuple = output_is_tuple
+
     if cell_fn is None:
       my_cell_fn = functools.partial(
-          rnn.LSTMCell,
-          num_units=num_units, input_size=cell_input_size,
-          state_is_tuple=False)
+          rnn.LSTMCell, num_units=num_units, state_is_tuple=state_is_tuple)
     else:
-      my_cell_fn = lambda: cell_fn(num_units, cell_input_size)
+      my_cell_fn = lambda: cell_fn(num_units)
     if tied:
       self._cells = [my_cell_fn()] * num_dims
     else:
       self._cells = [my_cell_fn() for _ in range(num_dims)]
     if not isinstance(self._cells[0], rnn.RNNCell):
-      raise TypeError(
-          'cell_fn must return an RNNCell instance, saw: %s'
-          % type(self._cells[0]))
+      raise TypeError('cell_fn must return an RNNCell instance, saw: %s' %
+                      type(self._cells[0]))
 
-  @property
-  def input_size(self):
-    # temporarily using num_units as the input_size of each dimension.
-    # The actual input size only determined when this cell get invoked,
-    # so this information can be considered unreliable.
-    return self._config.num_units * len(self._config.inputs)
+    if self._output_is_tuple:
+      self._output_size = tuple(self._cells[0].output_size
+                                for _ in self._config.outputs)
+    else:
+      self._output_size = self._cells[0].output_size * len(self._config.outputs)
+
+    if self._state_is_tuple:
+      self._state_size = tuple(self._cells[0].state_size
+                               for _ in self._config.recurrents)
+    else:
+      self._state_size = self._cell_state_size() * len(self._config.recurrents)
 
   @property
   def output_size(self):
-    return self._cells[0].output_size * len(self._config.outputs)
+    return self._output_size
 
   @property
   def state_size(self):
-    return self._cells[0].state_size * len(self._config.recurrents)
+    return self._state_size
 
   def __call__(self, inputs, state, scope=None):
     """Run one step of GridRNN.
@@ -145,76 +174,148 @@ class GridRNNCell(rnn.RNNCell):
       - A 2D, batch x state_size, Tensor representing the new state of the cell
         after reading "inputs" when previous state was "state".
     """
-    state_sz = state.get_shape().as_list()[1]
-    if self.state_size != state_sz:
-      raise ValueError(
-          'Actual state size not same as specified: {} vs {}.'.format(
-              state_sz, self.state_size))
-
     conf = self._config
-    dtype = inputs.dtype if inputs is not None else state.dtype
+    dtype = inputs.dtype
 
-    # c_prev is `m`, and m_prev is `h` in the paper.
-    # Keep c and m here for consistency with the codebase
-    c_prev = [None] * self._config.num_dims
-    m_prev = [None] * self._config.num_dims
-    cell_output_size = self._cells[0].state_size - conf.num_units
-
-    # for LSTM   : state = memory cell + output, hence cell_output_size > 0
-    # for GRU/RNN: state = output (whose size is equal to _num_units),
-    #              hence cell_output_size = 0
-    for recurrent_dim, start_idx in zip(self._config.recurrents, range(
-        0, self.state_size, self._cells[0].state_size)):
-      if cell_output_size > 0:
-        c_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx],
-                                                [-1, conf.num_units])
-        m_prev[recurrent_dim] = array_ops.slice(
-            state, [0, start_idx + conf.num_units], [-1, cell_output_size])
-      else:
-        m_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx],
-                                                [-1, conf.num_units])
+    c_prev, m_prev, cell_output_size = self._extract_states(state)
 
     new_output = [None] * conf.num_dims
     new_state = [None] * conf.num_dims
 
     with vs.variable_scope(scope or type(self).__name__):  # GridRNNCell
+      # project input, populate c_prev and m_prev
+      self._project_input(inputs, c_prev, m_prev, cell_output_size > 0)
 
-      # project input
-      if inputs is not None and sum(inputs.get_shape().as_list()) > 0 and len(
-          conf.inputs) > 0:
-        input_splits = array_ops.split(
-            value=inputs, num_or_size_splits=len(conf.inputs), axis=1)
-        input_sz = input_splits[0].get_shape().as_list()[1]
-
-        for i, j in enumerate(conf.inputs):
-          input_project_m = vs.get_variable(
-              'project_m_{}'.format(j), [input_sz, conf.num_units], dtype=dtype)
-          m_prev[j] = math_ops.matmul(input_splits[i], input_project_m)
-
-          if cell_output_size > 0:
-            input_project_c = vs.get_variable(
-                'project_c_{}'.format(j), [input_sz, conf.num_units],
-                dtype=dtype)
-            c_prev[j] = math_ops.matmul(input_splits[i], input_project_c)
-
+      # propagate along dimensions, first for non-priority dimensions
+      # then priority dimensions
       _propagate(conf.non_priority, conf, self._cells, c_prev, m_prev,
                  new_output, new_state, True)
       _propagate(conf.priority, conf, self._cells,
                  c_prev, m_prev, new_output, new_state, False)
 
+      # collect outputs and states
       output_tensors = [new_output[i] for i in self._config.outputs]
-      output = array_ops.zeros(
-          [0, 0], dtype) if len(output_tensors) == 0 else array_ops.concat(
-              output_tensors, 1)
+      if self._output_is_tuple:
+        output = tuple(output_tensors)
+      else:
+        if output_tensors:
+          output = array_ops.concat(output_tensors, 1)
+        else:
+          output = array_ops.zeros([0, 0], dtype)
 
-      state_tensors = [new_state[i] for i in self._config.recurrents]
-      states = array_ops.zeros(
-          [0, 0],
-          dtype) if len(state_tensors) == 0 else array_ops.concat(state_tensors,
-                                                                  1)
+      if self._state_is_tuple:
+        states = tuple(new_state[i] for i in self._config.recurrents)
+      else:
+        # concat each state first, then flatten the whole thing
+        state_tensors = [
+            x for i in self._config.recurrents for x in new_state[i]
+        ]
+        if state_tensors:
+          states = array_ops.concat(state_tensors, 1)
+        else:
+          states = array_ops.zeros([0, 0], dtype)
 
     return output, states
 
+  def _extract_states(self, state):
+    """Extract the cell and previous output tensors from the given state.
+
+    Args:
+      state: The RNN state.
+
+    Returns:
+      Tuple of the cell value, previous output, and cell_output_size.
+
+    Raises:
+      ValueError: If len(self._config.recurrents) != len(state).
+    """
+    conf = self._config
+
+    # c_prev is `m` (cell value), and
+    # m_prev is `h` (previous output) in the paper.
+    # Keeping c and m here for consistency with the codebase
+    c_prev = [None] * conf.num_dims
+    m_prev = [None] * conf.num_dims
+
+    # for LSTM   : state = memory cell + output, hence cell_output_size > 0
+    # for GRU/RNN: state = output (whose size is equal to _num_units),
+    #              hence cell_output_size = 0
+    total_cell_state_size = self._cell_state_size()
+    cell_output_size = total_cell_state_size - conf.num_units
+
+    if self._state_is_tuple:
+      if len(conf.recurrents) != len(state):
+        raise ValueError('Expected state as a tuple of {} '
+                         'element'.format(len(conf.recurrents)))
+
+      for recurrent_dim, recurrent_state in zip(conf.recurrents, state):
+        if cell_output_size > 0:
+          c_prev[recurrent_dim], m_prev[recurrent_dim] = recurrent_state
+        else:
+          m_prev[recurrent_dim] = recurrent_state
+    else:
+      for recurrent_dim, start_idx in zip(conf.recurrents,
+                                          range(0, self.state_size,
+                                                total_cell_state_size)):
+        if cell_output_size > 0:
+          c_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx],
+                                                  [-1, conf.num_units])
+          m_prev[recurrent_dim] = array_ops.slice(
+              state, [0, start_idx + conf.num_units], [-1, cell_output_size])
+        else:
+          m_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx],
+                                                  [-1, conf.num_units])
+    return c_prev, m_prev, cell_output_size
+
+  def _project_input(self, inputs, c_prev, m_prev, with_c):
+    """Fills in c_prev and m_prev with projected input, for input dimensions.
+
+    Args:
+      inputs: inputs tensor
+      c_prev: cell value
+      m_prev: previous output
+      with_c: boolean; whether to include project_c.
+
+    Raises:
+      ValueError: if len(self._config.input) != len(inputs)
+    """
+    conf = self._config
+
+    if (inputs is not None and inputs.get_shape().with_rank(2)[1].value > 0 and
+        conf.inputs):
+      if isinstance(inputs, tuple):
+        if len(conf.inputs) != len(inputs):
+          raise ValueError('Expect inputs as a tuple of {} '
+                           'tensors'.format(len(conf.inputs)))
+        input_splits = inputs
+      else:
+        input_splits = array_ops.split(
+            value=inputs, num_or_size_splits=len(conf.inputs), axis=1)
+      input_sz = input_splits[0].get_shape().with_rank(2)[1].value
+
+      for i, j in enumerate(conf.inputs):
+        input_project_m = vs.get_variable(
+            'project_m_{}'.format(j), [input_sz, conf.num_units],
+            dtype=inputs.dtype)
+        m_prev[j] = math_ops.matmul(input_splits[i], input_project_m)
+
+        if with_c:
+          input_project_c = vs.get_variable(
+              'project_c_{}'.format(j), [input_sz, conf.num_units],
+              dtype=inputs.dtype)
+          c_prev[j] = math_ops.matmul(input_splits[i], input_project_c)
+
+  def _cell_state_size(self):
+    """Total size of the state of the inner cell used in this grid.
+
+    Returns:
+      Total size of the state of the inner cell.
+    """
+    state_sizes = self._cells[0].state_size
+    if isinstance(state_sizes, tuple):
+      return sum(state_sizes)
+    return state_sizes
+
 
 """Specialized cells, for convenience
 """
@@ -223,11 +324,17 @@ class GridRNNCell(rnn.RNNCell):
 class Grid1BasicRNNCell(GridRNNCell):
   """1D BasicRNN cell"""
 
-  def __init__(self, num_units):
+  def __init__(self, num_units, state_is_tuple=True, output_is_tuple=True):
     super(Grid1BasicRNNCell, self).__init__(
-        num_units=num_units, num_dims=1,
-        input_dims=0, output_dims=0, priority_dims=0, tied=False,
-        cell_fn=lambda n, i: rnn.BasicRNNCell(num_units=n, input_size=i))
+        num_units=num_units,
+        num_dims=1,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=False,
+        cell_fn=lambda n: rnn.BasicRNNCell(num_units=n),
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid2BasicRNNCell(GridRNNCell):
@@ -240,71 +347,112 @@ class Grid2BasicRNNCell(GridRNNCell):
   specified.
   """
 
-  def __init__(self, num_units, tied=False, non_recurrent_fn=None):
+  def __init__(self,
+               num_units,
+               tied=False,
+               non_recurrent_fn=None,
+               state_is_tuple=True,
+               output_is_tuple=True):
     super(Grid2BasicRNNCell, self).__init__(
-        num_units=num_units, num_dims=2,
-        input_dims=0, output_dims=0, priority_dims=0, tied=tied,
+        num_units=num_units,
+        num_dims=2,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
         non_recurrent_dims=None if non_recurrent_fn is None else 0,
-        cell_fn=lambda n, i: rnn.BasicRNNCell(num_units=n, input_size=i),
-        non_recurrent_fn=non_recurrent_fn)
+        cell_fn=lambda n: rnn.BasicRNNCell(num_units=n),
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid1BasicLSTMCell(GridRNNCell):
-  """1D BasicLSTM cell"""
+  """1D BasicLSTM cell."""
 
-  def __init__(self, num_units, forget_bias=1):
+  def __init__(self,
+               num_units,
+               forget_bias=1,
+               state_is_tuple=True,
+               output_is_tuple=True):
+    def cell_fn(n):
+      return rnn.BasicLSTMCell(num_units=n, forget_bias=forget_bias)
     super(Grid1BasicLSTMCell, self).__init__(
-        num_units=num_units, num_dims=1,
-        input_dims=0, output_dims=0, priority_dims=0, tied=False,
-        cell_fn=lambda n, i: rnn.BasicLSTMCell(
-            num_units=n,
-            forget_bias=forget_bias, input_size=i,
-            state_is_tuple=False))
+        num_units=num_units,
+        num_dims=1,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=False,
+        cell_fn=cell_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid2BasicLSTMCell(GridRNNCell):
-  """2D BasicLSTM cell
+  """2D BasicLSTM cell.
 
-    This creates a 2D cell which receives input and gives output in the first
-    dimension.
+  This creates a 2D cell which receives input and gives output in the first
+  dimension.
 
-    The first dimension can optionally be non-recurrent if `non_recurrent_fn` is
-    specified.
+  The first dimension can optionally be non-recurrent if `non_recurrent_fn` is
+  specified.
   """
 
   def __init__(self,
                num_units,
                tied=False,
                non_recurrent_fn=None,
-               forget_bias=1):
+               forget_bias=1,
+               state_is_tuple=True,
+               output_is_tuple=True):
+    def cell_fn(n):
+      return rnn.BasicLSTMCell(num_units=n, forget_bias=forget_bias)
     super(Grid2BasicLSTMCell, self).__init__(
-        num_units=num_units, num_dims=2,
-        input_dims=0, output_dims=0, priority_dims=0, tied=tied,
+        num_units=num_units,
+        num_dims=2,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
         non_recurrent_dims=None if non_recurrent_fn is None else 0,
-        cell_fn=lambda n, i: rnn.BasicLSTMCell(
-            num_units=n, forget_bias=forget_bias, input_size=i,
-            state_is_tuple=False),
-        non_recurrent_fn=non_recurrent_fn)
+        cell_fn=cell_fn,
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid1LSTMCell(GridRNNCell):
-  """1D LSTM cell
+  """1D LSTM cell.
 
-    This is different from Grid1BasicLSTMCell because it gives options to
-    specify the forget bias and enabling peepholes
+  This is different from Grid1BasicLSTMCell because it gives options to
+  specify the forget bias and enabling peepholes.
   """
 
-  def __init__(self, num_units, use_peepholes=False, forget_bias=1.0):
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               output_is_tuple=True):
+
+    def cell_fn(n):
+      return rnn.LSTMCell(
+          num_units=n, forget_bias=forget_bias, use_peepholes=use_peepholes)
+
     super(Grid1LSTMCell, self).__init__(
-        num_units=num_units, num_dims=1,
-        input_dims=0, output_dims=0, priority_dims=0,
-        cell_fn=lambda n, i: rnn.LSTMCell(
-            num_units=n, input_size=i, use_peepholes=use_peepholes,
-            forget_bias=forget_bias, state_is_tuple=False))
+        num_units=num_units,
+        num_dims=1,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        cell_fn=cell_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid2LSTMCell(GridRNNCell):
-  """2D LSTM cell
+  """2D LSTM cell.
 
     This creates a 2D cell which receives input and gives output in the first
     dimension.
@@ -317,19 +465,30 @@ class Grid2LSTMCell(GridRNNCell):
                tied=False,
                non_recurrent_fn=None,
                use_peepholes=False,
-               forget_bias=1.0):
+               forget_bias=1.0,
+               state_is_tuple=True,
+               output_is_tuple=True):
+
+    def cell_fn(n):
+      return rnn.LSTMCell(
+          num_units=n, forget_bias=forget_bias, use_peepholes=use_peepholes)
+
     super(Grid2LSTMCell, self).__init__(
-        num_units=num_units, num_dims=2,
-        input_dims=0, output_dims=0, priority_dims=0, tied=tied,
+        num_units=num_units,
+        num_dims=2,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
         non_recurrent_dims=None if non_recurrent_fn is None else 0,
-        cell_fn=lambda n, i: rnn.LSTMCell(
-            num_units=n, input_size=i, forget_bias=forget_bias,
-            use_peepholes=use_peepholes, state_is_tuple=False),
-        non_recurrent_fn=non_recurrent_fn)
+        cell_fn=cell_fn,
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid3LSTMCell(GridRNNCell):
-  """3D BasicLSTM cell
+  """3D BasicLSTM cell.
 
     This creates a 2D cell which receives input and gives output in the first
     dimension.
@@ -343,19 +502,30 @@ class Grid3LSTMCell(GridRNNCell):
                tied=False,
                non_recurrent_fn=None,
                use_peepholes=False,
-               forget_bias=1.0):
+               forget_bias=1.0,
+               state_is_tuple=True,
+               output_is_tuple=True):
+
+    def cell_fn(n):
+      return rnn.LSTMCell(
+          num_units=n, forget_bias=forget_bias, use_peepholes=use_peepholes)
+
     super(Grid3LSTMCell, self).__init__(
-        num_units=num_units, num_dims=3,
-        input_dims=0, output_dims=0, priority_dims=0, tied=tied,
+        num_units=num_units,
+        num_dims=3,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
         non_recurrent_dims=None if non_recurrent_fn is None else 0,
-        cell_fn=lambda n, i: rnn.LSTMCell(
-            num_units=n, input_size=i, forget_bias=forget_bias,
-            use_peepholes=use_peepholes, state_is_tuple=False),
-        non_recurrent_fn=non_recurrent_fn)
+        cell_fn=cell_fn,
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
 class Grid2GRUCell(GridRNNCell):
-  """2D LSTM cell
+  """2D LSTM cell.
 
     This creates a 2D cell which receives input and gives output in the first
     dimension.
@@ -363,21 +533,31 @@ class Grid2GRUCell(GridRNNCell):
     specified.
   """
 
-  def __init__(self, num_units, tied=False, non_recurrent_fn=None):
+  def __init__(self,
+               num_units,
+               tied=False,
+               non_recurrent_fn=None,
+               state_is_tuple=True,
+               output_is_tuple=True):
     super(Grid2GRUCell, self).__init__(
-        num_units=num_units, num_dims=2,
-        input_dims=0, output_dims=0, priority_dims=0, tied=tied,
+        num_units=num_units,
+        num_dims=2,
+        input_dims=0,
+        output_dims=0,
+        priority_dims=0,
+        tied=tied,
         non_recurrent_dims=None if non_recurrent_fn is None else 0,
-        cell_fn=lambda n, i: rnn.GRUCell(num_units=n, input_size=i),
-        non_recurrent_fn=non_recurrent_fn)
+        cell_fn=lambda n: rnn.GRUCell(num_units=n),
+        non_recurrent_fn=non_recurrent_fn,
+        state_is_tuple=state_is_tuple,
+        output_is_tuple=output_is_tuple)
 
 
-"""Helpers
-"""
+# Helpers
 
-_GridRNNDimension = namedtuple(
-    '_GridRNNDimension',
-    ['idx', 'is_input', 'is_output', 'is_priority', 'non_recurrent_fn'])
+_GridRNNDimension = namedtuple('_GridRNNDimension', [
+    'idx', 'is_input', 'is_output', 'is_priority', 'non_recurrent_fn'
+])
 
 _GridRNNConfig = namedtuple('_GridRNNConfig',
                             ['num_dims', 'dims', 'inputs', 'outputs',
@@ -387,7 +567,6 @@ _GridRNNConfig = namedtuple('_GridRNNConfig',
 
 def _parse_rnn_config(num_dims, ls_input_dims, ls_output_dims, ls_priority_dims,
                       ls_non_recurrent_dims, non_recurrent_fn, tied, num_units):
-
   def check_dim_list(ls):
     if ls is None:
       ls = []
@@ -412,8 +591,8 @@ def _parse_rnn_config(num_dims, ls_input_dims, ls_output_dims, ls_priority_dims,
             is_input=(i in input_dims),
             is_output=(i in output_dims),
             is_priority=(i in priority_dims),
-            non_recurrent_fn=non_recurrent_fn if i in non_recurrent_dims else
-            None))
+            non_recurrent_fn=non_recurrent_fn
+            if i in non_recurrent_dims else None))
   return _GridRNNConfig(
       num_dims=num_dims,
       dims=rnn_dims,
@@ -440,34 +619,40 @@ def _propagate(dim_indices, conf, cells, c_prev, m_prev, new_output, new_state,
   if conf.num_dims > 1:
     ls_cell_inputs = [None] * (conf.num_dims - 1)
     for d in conf.dims[:-1]:
-      ls_cell_inputs[d.idx] = new_output[d.idx] if new_output[
-          d.idx] is not None else m_prev[d.idx]
+      if new_output[d.idx] is None:
+        ls_cell_inputs[d.idx] = m_prev[d.idx]
+      else:
+        ls_cell_inputs[d.idx] = new_output[d.idx]
     cell_inputs = array_ops.concat(ls_cell_inputs, 1)
   else:
     cell_inputs = array_ops.zeros([m_prev[0].get_shape().as_list()[0], 0],
                                   m_prev[0].dtype)
 
-  last_dim_output = new_output[-1] if new_output[-1] is not None else m_prev[-1]
+  last_dim_output = (new_output[-1]
+                     if new_output[-1] is not None else m_prev[-1])
 
   for i in dim_indices:
     d = conf.dims[i]
     if d.non_recurrent_fn:
-      linear_args = array_ops.concat(
-          [cell_inputs, last_dim_output],
-          1) if conf.num_dims > 1 else last_dim_output
+      if conf.num_dims > 1:
+        linear_args = array_ops.concat([cell_inputs, last_dim_output], 1)
+      else:
+        linear_args = last_dim_output
       with vs.variable_scope('non_recurrent' if conf.tied else
                              'non_recurrent/cell_{}'.format(i)):
         if conf.tied and not (first_call and i == dim_indices[0]):
           vs.get_variable_scope().reuse_variables()
-        new_output[d.idx] = layers.legacy_fully_connected(
+
+        new_output[d.idx] = layers.fully_connected(
             linear_args,
-            num_output_units=conf.num_units,
+            num_outputs=conf.num_units,
             activation_fn=d.non_recurrent_fn,
-            weight_init=vs.get_variable_scope().initializer or
-            layers.initializers.xavier_initializer)
+            weights_initializer=(vs.get_variable_scope().initializer or
+                                 layers.initializers.xavier_initializer),
+            weights_regularizer=vs.get_variable_scope().regularizer)
     else:
       if c_prev[i] is not None:
-        cell_state = array_ops.concat([c_prev[i], last_dim_output], 1)
+        cell_state = (c_prev[i], last_dim_output)
       else:
         # for GRU/RNN, the state is just the previous output
         cell_state = last_dim_output
diff --git a/tensorflow/contrib/hooks/README.md b/tensorflow/contrib/hooks/README.md
index c7f88bb1113f045415eba8fe5fef9adfa5bd068c..84dd6ac8792c04f46b61a792a06b905c414a936c 100644
--- a/tensorflow/contrib/hooks/README.md
+++ b/tensorflow/contrib/hooks/README.md
@@ -5,7 +5,7 @@ of `SessionRunHook` and are to be used with helpers like `MonitoredSession`
 and `learn.Estimator` that wrap `tensorflow.Session`.
 
 The hooks are called between invocations of `Session.run()` to perform custom
-behaviour.
+behavior.
 
 For example the `ProfilerHook` periodically collects `RunMetadata` after
 `Session.run()` and saves profiling information that can be viewed in a
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..fa75943d786859914303317dba633a9611a19366
--- /dev/null
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -0,0 +1,36 @@
+# Description:
+#   Contains a tool to dump TensorFlow ops which are not supported
+#   in TensorFlow HVX runtime.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
+
+cc_binary(
+    name = "hvx_ops_support_checker",
+    testonly = 1,
+    srcs = ["hvx_ops_support_checker_main.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
+        "//tensorflow/core/kernels/hexagon:graph_transferer",
+        "//tensorflow/tools/graph_transforms:transform_utils",
+    ],
+)
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6ae7c4a7420e8d7a58bc0a83e14e792b442f6d5d
--- /dev/null
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc
@@ -0,0 +1,151 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// bazel build tensorflow/contrib/hvx/hvx_ops_support_checker &&
+// bazel-bin/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker
+// \
+// --in_graph=graph_def.pb
+
+#include <unordered_set>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
+#include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
+#include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/tools/graph_transforms/transform_utils.h"
+
+namespace tensorflow {
+namespace {
+static int ParseFlags(int argc, char* argv[], string* in_graph) {
+  std::vector<Flag> flag_list = {
+      Flag("in_graph", in_graph, "input graph file name"),
+  };
+  CHECK(Flags::Parse(&argc, argv, flag_list));
+  // We need to call this to set up global state for TensorFlow.
+  port::InitMain(argv[0], &argc, &argv);
+
+  string usage = Flags::Usage(argv[0], flag_list);
+  CHECK(!in_graph->empty()) << "in_graph graph can't be empty.\n" << usage;
+
+  return 0;
+}
+
+static void SummarizeNode(const NodeDef& node_def) {
+  LOG(INFO) << "Node(" << node_def.name() << ")";
+  LOG(INFO) << "  op: " << node_def.op();
+  for (const string& input : node_def.input()) {
+    LOG(INFO) << " Input: " << input;
+  }
+}
+
+static void DumpRemoteFusedGraph(const NodeDef& node_def) {
+  LOG(INFO) << "Remote fused graph found.";
+  RemoteFusedGraphExecuteInfo info;
+  string serialized_proto;
+  GetNodeAttr(node_def,
+              RemoteFusedGraphExecuteUtils::
+                  ATTR_SERIALIZED_REMOTE_FUSED_GRAPH_EXECUTE_INFO,
+              &serialized_proto)
+      .IgnoreError();
+  info.ParseFromString(serialized_proto);
+  LOG(INFO) << "Node name: " << node_def.name();
+  LOG(INFO) << "Executor name: " << info.executor_name();
+  for (const string& input : info.graph_input_node_name()) {
+    LOG(INFO) << "Input: " << input;
+  }
+  for (const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type :
+       info.default_graph_input_tensor_shape()) {
+    LOG(INFO) << "Input shape type: " << shape_type.DebugString();
+  }
+  for (const string& output : info.graph_output_node_name()) {
+    LOG(INFO) << "Output: " << output;
+  }
+  for (const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type :
+       info.default_graph_output_tensor_shape()) {
+    LOG(INFO) << "Output shape type: " << shape_type.DebugString();
+  }
+  const int subgraph_node_size = info.remote_graph().node_size();
+  LOG(INFO) << "Nodes in the graph: " << subgraph_node_size;
+  for (int i = 0; i < subgraph_node_size; ++i) {
+    LOG(INFO) << "node(" << i << "): " << info.remote_graph().node(i).name();
+  }
+}
+
+static void CheckOpsSupport(const GraphDef& graph_def) {
+  const IGraphTransferOpsDefinitions& ops_definition =
+      HexagonOpsDefinitions::getInstance();
+  LOG(INFO) << "Checking " << graph_def.node_size() << " nodes";
+
+  std::unordered_set<string> unsupported_ops;
+  bool all_supported = true;
+  bool contains_remote_graph = false;
+  for (const NodeDef& node : graph_def.node()) {
+    if (node.op() == "RemoteFusedGraphExecute") {
+      contains_remote_graph = true;
+      DumpRemoteFusedGraph(node);
+      continue;
+    }
+    // TODO(satok): Set correct data type if it's given.
+    const int op_id = ops_definition.GetOpIdFor(node.op(), {});
+    if (op_id == IGraphTransferOpsDefinitions::INVALID_OP_ID) {
+      all_supported = false;
+      LOG(ERROR) << "OP type: " << node.op() << " is not supported on hvx. "
+                 << "Name = " << node.name();
+      unsupported_ops.emplace(node.op());
+    }
+  }
+
+  LOG(INFO) << "\n";
+  LOG(INFO) << "Unsupported ops:";
+  int count = 0;
+  for (const string& op_type : unsupported_ops) {
+    LOG(INFO) << "(" << (++count) << ") " << op_type;
+  }
+  if (count == 0) {
+    LOG(INFO) << "All ops supported!";
+  } else {
+    LOG(INFO) << count << " ops are not supported.";
+  }
+
+  if (contains_remote_graph) {
+    for (const NodeDef& node : graph_def.node()) {
+      SummarizeNode(node);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  tensorflow::string in_graph;
+  const int ret = tensorflow::ParseFlags(argc, argv, &in_graph);
+  if (ret != 0) {
+    return ret;
+  }
+
+  tensorflow::GraphDef graph_def;
+  TF_CHECK_OK(tensorflow::graph_transforms::LoadTextOrBinaryGraphFile(
+      in_graph, &graph_def));
+
+  tensorflow::CheckOpsSupport(graph_def);
+  return 0;
+}
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
old mode 100644
new mode 100755
index c31ac7b3242117459c8461fc57c8b2b82a593761..a095f0e048a9b5831222d153a6c84cfa8ccd0dce
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -20,6 +20,7 @@ load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 tf_custom_op_library(
     name = "python/ops/_image_ops.so",
     srcs = [
+        "kernels/bipartite_match_op.cc",
         "kernels/image_ops.cc",
         "kernels/image_ops.h",
         "ops/image_ops.cc",
@@ -33,6 +34,7 @@ tf_custom_op_library(
 tf_kernel_library(
     name = "image_ops_kernels",
     srcs = [
+        "kernels/bipartite_match_op.cc",
         "kernels/image_ops.cc",
         "kernels/image_ops.h",
     ],
@@ -87,6 +89,7 @@ cuda_py_test(
     srcs = ["python/kernel_tests/image_ops_test.py"],
     additional_deps = [
         ":image_py",
+        ":single_image_random_dot_stereograms_py",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -96,6 +99,33 @@ cuda_py_test(
     ],
 )
 
+tf_custom_op_library(
+    name = "python/ops/_single_image_random_dot_stereograms.so",
+    srcs = [
+        "kernels/single_image_random_dot_stereograms_ops.cc",
+        "ops/single_image_random_dot_stereograms_ops.cc",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["single_image_random_dot_stereograms_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "single_image_random_dot_stereograms_ops",
+    deps = [":single_image_random_dot_stereograms_ops_op_lib"],
+)
+
+py_library(
+    name = "single_image_random_dot_stereograms_py",
+    srcs = glob(["python/ops/single*.py"]) + ["__init__.py"],
+    data = [":python/ops/_single_image_random_dot_stereograms.so"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":single_image_random_dot_stereograms_ops",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
old mode 100644
new mode 100755
index 4ad599b1f85534ccbbd8284517e5ac158359cc05..fee1a6c2bc951214cb64a9abf80b584ee839c5b0
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -25,6 +25,8 @@ projective transforms (including rotation) are supported.
 @@compose_transforms
 @@rotate
 @@transform
+@@bipartite_match
+@@single_image_random_dot_stereograms
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -35,6 +37,7 @@ from tensorflow.contrib.image.python.ops.image_ops import angles_to_projective_t
 from tensorflow.contrib.image.python.ops.image_ops import compose_transforms
 from tensorflow.contrib.image.python.ops.image_ops import rotate
 from tensorflow.contrib.image.python.ops.image_ops import transform
+from tensorflow.contrib.image.python.ops.single_image_random_dot_stereograms import single_image_random_dot_stereograms
 
 from tensorflow.python.util.all_util import remove_undocumented
 
diff --git a/tensorflow/contrib/image/kernels/bipartite_match_op.cc b/tensorflow/contrib/image/kernels/bipartite_match_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d207c388b159c4ad0f25032811e97b153fd50d6
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/bipartite_match_op.cc
@@ -0,0 +1,134 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <queue>
+#include <vector>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace {
+
+struct DistancePair {
+  DistancePair(int i1, int i2, double d) : index1(i1), index2(i2), dist(d) {}
+
+  bool operator<(const DistancePair& b1) const { return b1.dist < dist; }
+
+  int index1, index2;
+  float dist;
+};
+
+}  // namespace
+
+namespace tensorflow {
+
+class BipartiteMatchOp : public OpKernel {
+ public:
+  explicit BipartiteMatchOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("top_k", &top_k_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_distance_mat = context->input(0);
+    OP_REQUIRES(context, input_distance_mat.dims() == 2,
+                errors::InvalidArgument(
+                    "distance_mat should be 2-dimensional, but got ",
+                    input_distance_mat.shape().DebugString()));
+    const int num_input_rows = input_distance_mat.dim_size(0);
+    const int num_input_columns = input_distance_mat.dim_size(1);
+
+    const Tensor& input_num_valid_rows = context->input(1);
+    OP_REQUIRES(
+        context, input_num_valid_rows.NumElements() == 1,
+        errors::InvalidArgument(
+            "num_valid_rows argument should be a tensor with 1 element, "
+            "but got ",
+            input_num_valid_rows.NumElements()));
+
+    const float num_valid_rows_f = input_num_valid_rows.flat<float>()(0);
+    int num_valid_rows = num_input_rows;
+    // If num_valid_rows_f is non-negative, use it to set num_valid_rows.
+    if (num_valid_rows_f >= 0) {
+      num_valid_rows = static_cast<int>(num_valid_rows_f + 0.1);
+    }
+    OP_REQUIRES(
+        context, num_input_rows >= num_valid_rows,
+        errors::InvalidArgument("There should be at least ", num_valid_rows,
+                                " rows in distance_mat, but only got ",
+                                num_input_rows, " rows."));
+
+    // If negative or zero then set it to the maximum possible matches.
+    auto valid_top_k = top_k_;
+
+    if (valid_top_k <= 0) {
+      valid_top_k = num_valid_rows * num_input_columns;
+    }
+
+    // Create output tensors.
+    Tensor* row_to_column_match_indices = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({num_input_rows}),
+                                            &row_to_column_match_indices));
+    Tensor* column_to_row_match_indices = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, TensorShape({num_input_columns}),
+                                            &column_to_row_match_indices));
+
+    typename TTypes<float, 2>::ConstTensor distance_mat =
+        input_distance_mat.shaped<float, 2>(
+            {num_input_rows, num_input_columns});
+
+    // Greedy bi-partite matching.
+    std::priority_queue<DistancePair> match_queue;
+
+    for (int index1 = 0; index1 < num_valid_rows; index1++) {
+      for (int index2 = 0; index2 < num_input_columns; index2++) {
+        match_queue.push(
+            DistancePair(index1, index2, distance_mat(index1, index2)));
+      }
+    }
+
+    std::vector<int> row_to_col_match_vec(num_input_rows, -1);
+    std::vector<int> col_to_row_match_vec(num_input_columns, -1);
+    int index = 0;
+    while (!match_queue.empty()) {
+      const auto& match = match_queue.top();
+      if (row_to_col_match_vec[match.index1] == -1 &&
+          col_to_row_match_vec[match.index2] == -1) {
+        row_to_col_match_vec[match.index1] = match.index2;
+        col_to_row_match_vec[match.index2] = match.index1;
+
+        index++;
+        if (index >= valid_top_k) {
+          break;
+        }
+      }
+      match_queue.pop();
+    }
+
+    // Set the output tensors.
+    row_to_column_match_indices->vec<int>() =
+        TTypes<int>::Vec(row_to_col_match_vec.data(), num_input_rows);
+    column_to_row_match_indices->vec<int>() =
+        TTypes<int>::Vec(col_to_row_match_vec.data(), num_input_columns);
+  }
+
+ private:
+  int top_k_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BipartiteMatch").Device(DEVICE_CPU),
+                        BipartiteMatchOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc
index 8d50541771b55c5958674c489329202f3da207b3..8a97f07732c4be43192f6ea8f6934118b49875f8 100644
--- a/tensorflow/contrib/image/kernels/image_ops.cc
+++ b/tensorflow/contrib/image/kernels/image_ops.cc
@@ -43,13 +43,29 @@ template class FillProjectiveTransform<CPUDevice, double>;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 using functor::FillProjectiveTransform;
+using generator::INTERPOLATION_BILINEAR;
+using generator::INTERPOLATION_NEAREST;
+using generator::Interpolation;
 using generator::ProjectiveGenerator;
 
 template <typename Device, typename T>
 class ImageProjectiveTransform : public OpKernel {
+ private:
+  Interpolation interpolation_;
+
  public:
-  explicit ImageProjectiveTransform(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {}
+  explicit ImageProjectiveTransform(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    string interpolation_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("interpolation", &interpolation_str));
+    if (interpolation_str == "NEAREST") {
+      interpolation_ = INTERPOLATION_NEAREST;
+    } else if (interpolation_str == "BILINEAR") {
+      interpolation_ = INTERPOLATION_BILINEAR;
+    } else {
+      LOG(FATAL) << "Invalid interpolation " << interpolation_str
+                 << ". Supported types: NEAREST, BILINEAR";
+    }
+  }
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor& images_t = ctx->input(0);
@@ -68,8 +84,8 @@ class ImageProjectiveTransform : public OpKernel {
     Tensor* output_t;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t));
     auto output = output_t->tensor<T, 4>();
-    const FillProjectiveTransform<Device, T> functor;
-    functor(ctx->eigen_device<Device>(), &output, images, transform);
+    (FillProjectiveTransform<Device, T>(interpolation_))(
+        ctx->eigen_device<Device>(), &output, images, transform);
   }
 };
 
diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h
index 92b908a1c68ef7db173410243a5f900110e81f4d..692e33fcf30b5b3b323ef26fab0c88fbfaab0f20 100644
--- a/tensorflow/contrib/image/kernels/image_ops.h
+++ b/tensorflow/contrib/image/kernels/image_ops.h
@@ -28,6 +28,8 @@ namespace tensorflow {
 
 namespace generator {
 
+enum Interpolation { INTERPOLATION_NEAREST, INTERPOLATION_BILINEAR };
+
 using Eigen::array;
 using Eigen::DenseIndex;
 
@@ -36,20 +38,19 @@ class ProjectiveGenerator {
  private:
   typename TTypes<T, 4>::ConstTensor input_;
   typename TTypes<float>::ConstMatrix transforms_;
+  const Interpolation interpolation_;
 
  public:
   static const int kNumParameters = 8;
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   ProjectiveGenerator(typename TTypes<T, 4>::ConstTensor input,
-                      typename TTypes<float>::ConstMatrix transforms)
-      : input_(input), transforms_(transforms) {}
+                      typename TTypes<float>::ConstMatrix transforms,
+                      const Interpolation interpolation)
+      : input_(input), transforms_(transforms), interpolation_(interpolation) {}
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
   operator()(const array<DenseIndex, 4>& coords) const {
-    array<DenseIndex, 4> input_coords;
-    input_coords[0] = coords[0];
-
     const int64 output_y = coords[1];
     const int64 output_x = coords[2];
     const float* transform =
@@ -57,24 +58,73 @@ class ProjectiveGenerator {
             ? transforms_.data()
             : &transforms_.data()[transforms_.dimension(1) * coords[0]];
     float projection = transform[6] * output_x + transform[7] * output_y + 1.f;
-    const int64 input_x = std::round(
+    const float input_x =
         (transform[0] * output_x + transform[1] * output_y + transform[2]) /
-        projection);
-    const int64 input_y = std::round(
+        projection;
+    const float input_y =
         (transform[3] * output_x + transform[4] * output_y + transform[5]) /
-        projection);
-
-    if (!(0 <= input_y && input_y < input_.dimension(1) && 0 <= input_x &&
-          input_x < input_.dimension(2))) {
-      // TODO(ringwalt): Add a fill value input.
-      return T(0);
+        projection;
+
+    // TODO(ringwalt): Add a fill value input.
+    static const T fill_value = T(0);
+    switch (interpolation_) {
+      case INTERPOLATION_NEAREST:
+        // Switch the order of x and y again for indexing into the image.
+        return nearest_interpolation(coords[0], input_y, input_x, coords[3],
+                                     fill_value);
+      case INTERPOLATION_BILINEAR:
+        return bilinear_interpolation(coords[0], input_y, input_x, coords[3],
+                                      fill_value);
     }
-    input_coords[1] = input_y;
-    input_coords[2] = input_x;
+  }
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
+  nearest_interpolation(const DenseIndex batch, const float y, const float x,
+                        const DenseIndex channel, const T fill_value) const {
+    return read_with_fill_value(batch, DenseIndex(std::round(y)),
+                                DenseIndex(std::round(x)), channel, fill_value);
+  }
 
-    input_coords[3] = coords[3];
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
+  bilinear_interpolation(const DenseIndex batch, const float y, const float x,
+                         const DenseIndex channel, const T fill_value) const {
+    const float y_floor = std::floor(y);
+    const float x_floor = std::floor(x);
+    const float y_ceil = y_floor + 1;
+    const float x_ceil = x_floor + 1;
+    // f(x, y_floor) = (x_ceil - x) / (x_ceil - x_floor) * f(x_floor, y_floor)
+    //               + (x - x_floor) / (x_ceil - x_floor) * f(x_ceil, y_floor)
+    const float value_yfloor =
+        (x_ceil - x) * read_with_fill_value(batch, DenseIndex(y_floor),
+                                            DenseIndex(x_floor), channel,
+                                            fill_value) +
+        (x - x_floor) * read_with_fill_value(batch, DenseIndex(y_floor),
+                                             DenseIndex(x_ceil), channel,
+                                             fill_value);
+    // f(x, y_ceil) = (x_ceil - x) / (x_ceil - x_floor) * f(x_floor, y_ceil)
+    //              + (x - x_floor) / (x_ceil - x_floor) * f(x_ceil, y_ceil)
+    const float value_yceil =
+        (x_ceil - x) * read_with_fill_value(batch, DenseIndex(y_ceil),
+                                            DenseIndex(x_floor), channel,
+                                            fill_value) +
+        (x - x_floor) * read_with_fill_value(batch, DenseIndex(y_ceil),
+                                             DenseIndex(x_ceil), channel,
+                                             fill_value);
+    // f(x, y) = (y_ceil - y) / (y_ceil - y_floor) * f(x, y_floor)
+    //         + (y - y_floor) / (y_ceil - y_floor) * f(x, y_ceil)
+    return T((y_ceil - y) * value_yfloor + (y - y_floor) * value_yceil);
+  }
 
-    return input_(input_coords);
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T read_with_fill_value(
+      const DenseIndex batch, const DenseIndex y, const DenseIndex x,
+      const DenseIndex channel, const T fill_value) const {
+    // batch and channel must be correct, because they are passed unchanged from
+    // the input.
+    return (0 <= y && y < input_.dimension(1) && 0 <= x &&
+            x < input_.dimension(2))
+               ? input_(array<DenseIndex, 4>{batch, y, x, channel})
+               : fill_value;
   }
 };
 
@@ -85,6 +135,7 @@ class ProjectiveGenerator {
 // some Eigen device code.
 namespace functor {
 
+using generator::Interpolation;
 using generator::ProjectiveGenerator;
 
 template <typename Device, typename T>
@@ -92,15 +143,17 @@ struct FillProjectiveTransform {
   typedef typename TTypes<T, 4>::Tensor OutputType;
   typedef typename TTypes<T, 4>::ConstTensor InputType;
   typedef typename TTypes<float, 2>::ConstTensor TransformsType;
+  const Interpolation interpolation_;
 
-  FillProjectiveTransform() {}
+  FillProjectiveTransform(Interpolation interpolation)
+      : interpolation_(interpolation) {}
 
   EIGEN_ALWAYS_INLINE
   void operator()(const Device& device, OutputType* output,
                   const InputType& images,
                   const TransformsType& transform) const {
-    ProjectiveGenerator<Device, T> generator(images, transform);
-    output->device(device) = images.generate(generator);
+    output->device(device) = images.generate(
+        ProjectiveGenerator<Device, T>(images, transform, interpolation_));
   }
 };
 
diff --git a/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
new file mode 100755
index 0000000000000000000000000000000000000000..23efd359d578c05afa0bb6766b9ed2661dc10d78
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/single_image_random_dot_stereograms_ops.cc
@@ -0,0 +1,424 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+template <typename T>
+class SingleImageRandomDotStereogramsOp : public OpKernel {
+ private:
+  int E2Epixels;  // Pixels from eye to eye = eye_to_eye_inches * DPI
+
+  int input_Xvalue;  // X value of input Z values (width)
+  int input_Yvalue;  // Y value of input Z values (height)
+
+  int output_Ximage;  // X value of output image (width)
+  int output_Yimage;  // Y value of output image (height)
+  int output_Cimage;  // color value of output image (color, 1 or 3)  (3 not
+                      // implemented)
+
+  int data_box_left;    // X starting value for DATA window
+  int data_box_top;     // Y starting value for DATA window
+  int data_box_width;   // width of scan line
+  int data_box_height;  // hight of image
+
+  int converge_dot_box_end;  // Row convergences dots end on
+
+  uint8* outputImage;  // Output Image flat as a buffer (Tensor Connection)
+  double* ZBuffer;     // For internal use, allow for MASK, etc later, actual Z
+                       // used for Stereogram, XxY (X is the row index, y is col
+                       // index like a screen)
+                       // 0 (far) -> 1.0(near) range
+  bool hidden_surface_removal;
+  int convergence_dots_size;
+  int dots_per_inch;
+  float eye_separation;
+  float mu;
+  bool normalize;
+  float normalize_max;
+  float normalize_min;
+  float border_level;
+  int number_colors;
+  ::tensorflow::TensorShapeProto output_image_shape;
+  ::tensorflow::TensorShapeProto output_data_window;
+
+  uint8 Cblack = (uint8)0;
+  uint8 Cwhite = (uint8)255;
+
+  int indexMode = 0;  // 0 - truncate XY, 1 - round XY, 2 - Interpolate XY (not
+                      // implemented yet, keep default of 0)
+  int interp_x, interp_y;  // 1 - yes, 0 - no  interpolation directions (not
+                           // implemented yet)
+
+  bool debugging = false;
+
+  inline int separation(double z) {
+    return (std::round((1 - mu * z) * E2Epixels / (2 - mu * z)));
+  }
+
+  inline int get_far_width() { return (separation(0.0)); }
+  inline int get_near_width() { return (separation(1.0)); }
+
+ public:
+  explicit SingleImageRandomDotStereogramsOp(OpKernelConstruction* context)
+      : OpKernel(context) {  // Constructor
+    OP_REQUIRES_OK(context, context->GetAttr("hidden_surface_removal",
+                                             &hidden_surface_removal));
+    OP_REQUIRES_OK(context, context->GetAttr("convergence_dots_size",
+                                             &convergence_dots_size));
+    OP_REQUIRES_OK(context, context->GetAttr("dots_per_inch", &dots_per_inch));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("eye_separation", &eye_separation));
+    OP_REQUIRES_OK(context, context->GetAttr("mu", &mu));
+    OP_REQUIRES_OK(context, context->GetAttr("normalize", &normalize));
+    OP_REQUIRES_OK(context, context->GetAttr("normalize_max", &normalize_max));
+    OP_REQUIRES_OK(context, context->GetAttr("normalize_min", &normalize_min));
+    OP_REQUIRES_OK(context, context->GetAttr("border_level", &border_level));
+    OP_REQUIRES_OK(context, context->GetAttr("number_colors", &number_colors));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("output_image_shape", &output_image_shape));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("output_data_window", &output_data_window));
+
+    E2Epixels =
+        eye_separation * dots_per_inch;  // Initialize pixels from eye to eye
+  }
+
+  ~SingleImageRandomDotStereogramsOp() {  // Destructor
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor = context->input(0);
+    input_Xvalue = input_tensor.shape().dim_size(
+        1);  // X value is the number of columns of the input matrix
+    input_Yvalue =
+        input_tensor.shape().dim_size(0);  // Y value is the number of rows
+
+    output_Ximage = output_image_shape.dim(0).size();
+    output_Yimage = output_image_shape.dim(1).size();
+    output_Cimage = output_image_shape.dim(2).size();
+
+    if (number_colors > 256)  // Go to full color image
+      output_Cimage = 3;
+
+    int data_Xwindow = output_data_window.dim(0).size();
+    int data_Ywindow = output_data_window.dim(1).size();
+
+    int deltaX_border_image = output_Ximage - data_Xwindow;
+    int deltaY_border_image = output_Yimage - data_Ywindow;
+
+    if (convergence_dots_size >
+        0)  // 3 frame sections in Y direction due to DOTS
+    {
+      deltaY_border_image =
+          deltaY_border_image -
+          convergence_dots_size;  // Take off space for Convergence Dots
+      deltaY_border_image = std::max(0, deltaY_border_image);
+      data_box_top = deltaY_border_image / 3;
+
+      if (deltaY_border_image >= 0) {
+        converge_dot_box_end = output_Yimage - 1 - data_box_top;
+      } else {
+        converge_dot_box_end = output_Yimage - 1;
+      }
+    } else  // Otherwise only 2, no convergence dot
+    {
+      data_box_top = deltaY_border_image / 2;  // Center DATA in Y dimension
+      converge_dot_box_end = output_Yimage - 1;
+    }
+
+    data_box_left = deltaX_border_image / 2;  // Center DATA in X dimension
+    data_box_width = data_Xwindow;             // width of scan line
+    data_box_height = data_Ywindow;            // hight of image
+
+    const T* inputZ = input_tensor.flat<T>().data();  // Flatten input Z buffer
+
+    BuildZBuffer(inputZ);
+
+    // Output a scalar string.
+    Tensor* output_tensor = NULL;
+    OP_REQUIRES_OK(
+        context,
+        context->allocate_output(
+            0, TensorShape({output_Yimage, output_Ximage, output_Cimage}),
+            &output_tensor));
+
+    outputImage = output_tensor->flat<uint8>().data();
+
+    generate_stereogram();
+
+    delete[] ZBuffer;
+  }
+
+  //***************************************************************************
+  //***************************************************************************
+  // Move input into standard Z format to reduce complexity of algorithm
+  //
+  void BuildZBuffer(const T* Z, bool log = false) {
+    double MaxValue = 1.0;
+    double MinValue = 0.0;
+    ZBuffer = new double[input_Xvalue * input_Yvalue];  // Used to computer
+                                                        // final Z values before
+                                                        // rendering to output
+
+    if (normalize) {
+      // Init Min/Max to first value
+      if (normalize_max < normalize_min)  // Autoscale if MIN>MAX
+      {
+        MaxValue = (double)*Z;
+        MinValue = (double)*Z;
+
+        for (int y = 0; y < input_Yvalue; ++y)
+          for (int x = 0; x < input_Xvalue; ++x) {
+            double value = getZfromInputImage(Z, x, y);
+            if (value > MaxValue) MaxValue = value;
+            if (value < MinValue) MinValue = value;
+          }
+      } else {
+        MaxValue = normalize_max;
+        MinValue = normalize_min;
+      }
+    }
+
+    for (int y = 0; y < input_Yvalue; ++y)
+      for (int x = 0; x < input_Xvalue; ++x) {
+        double value = getZfromInputImage(Z, x, y);
+
+        if (normalize) {
+          value = (value - MinValue) / (MaxValue - MinValue);
+        }
+
+        if (value > 1.0) value = 1.0;
+        if (value < 0.0) value = 0.0;
+
+        *(ZBuffer + (input_Xvalue * y + x)) = value;
+      }
+  }
+
+  //***************************************************************************
+  //***************************************************************************
+  double getZfromInputImage(const T* Z, int x, int y) {
+    double return_val;
+
+    return_val = (double)*(Z + input_Xvalue * y + x);  // Get value
+    return return_val;
+  }
+
+  //***************************************************************************
+  //***************************************************************************
+  // All normalized, not checking required
+  // Possible Projection issue if DATA is bigger or smaller than Input
+  //  Modes include:
+  //         Truncate value (Default)
+  //         Round-off value
+  //         Interpolate between values
+  //
+  double getZfromZbuffer(double x, double y) {
+    int xi, yi;
+
+    switch (indexMode) {
+      case 0:  // Truncate
+        xi = int(x);
+        yi = int(y);
+        return (*(ZBuffer + (xi + input_Xvalue * yi)));
+        break;
+      case 1:  // Round-off
+        xi = std::round(x);
+        yi = std::round(y);
+        return (*(ZBuffer + (xi + input_Xvalue * yi)));
+        break;
+      case 2:  // Interpolate (Not implemented yet, will need 4 points
+               // [x,y],[x+1,y],[x,y+1],[x+1,y+1], then interpolate)
+        xi = int(x);
+        yi = int(y);
+        return (*(ZBuffer + (xi + input_Xvalue * yi)));
+        break;
+      default:  // Round-off is the default
+        xi = int(x + 0.5);
+        yi = int(y + 0.5);
+        return (*(ZBuffer + (xi + input_Xvalue * yi)));
+        break;
+    }
+  }
+
+  //***************************************************************************
+  //***************************************************************************
+
+  int getOutputImageIndex(int x, int y,
+                          int channel) {  // No error checking for some
+                                          // optimization, calling routine
+                                          // required to make sure there is no
+                                          // violation
+    return ((output_Ximage * output_Cimage) * y + x * output_Cimage + channel);
+  }
+
+  //***************************************************************************
+  //***************************************************************************
+
+  double getZFromOutputPixel(int x, int y) {
+    double xofz, yofz, returnval;
+
+    // Convert pixel units to Z units, do this as "double"
+
+    xofz =
+        (double)input_Xvalue * (x - data_box_left) / ((double)data_box_width);
+    yofz =
+        (double)input_Yvalue * (y - data_box_top) / ((double)data_box_height);
+
+    if ((xofz < 0) || (yofz < 0) || (yofz >= input_Yvalue) ||
+        (xofz >= input_Xvalue)) {  // Top of left side border hit or  Right
+                                   // side or bottom border hit
+                                   // Send BORDER Z value
+      return (border_level);
+    }
+
+    {  // in data set Z interpolate if need
+      double gz;
+
+      gz = getZfromZbuffer(xofz, yofz);
+
+      returnval = gz;
+    }
+
+    return (returnval);
+  }
+
+  //***************************************************************************
+  //***************************************************************************
+
+  void generate_stereogram() {
+    int s, left, right, visible, t, l;
+    double zt, gz;
+    // Scan line
+    uint8* pix;  // Scan row color for each pixel
+    int* same;   // Used to determine if Pixel needs to be the same as another
+                 // pixel in the row
+
+    pix = new uint8[output_Ximage * output_Cimage];
+    same = new int[output_Ximage];
+
+    for (int y = 0; y < output_Yimage; ++y) {
+      // Set no dependencies on any pixels, tie each one back to itself
+      for (int x = 0; x < output_Ximage; ++x) same[x] = x;
+
+      for (int x = 0; x < output_Ximage; ++x) {
+        gz = getZFromOutputPixel(x, y);
+        s = separation(gz);
+        left = x - s / 2;
+        right = left + s;
+
+        if ((left >= 0) && (right < output_Ximage)) {
+          t = 1;
+          visible = 1;
+          if (hidden_surface_removal) do {
+              zt = gz + 2 * (2 - mu * gz) * t / (mu * E2Epixels);
+              visible = (getZFromOutputPixel(x - t, y) < zt) &&
+                        (getZFromOutputPixel(x + t, y) < zt);
+              ++t;
+            } while ((visible) && (zt < 1));
+
+          if (visible) {
+            l = same[left];
+            while ((l != left) && (l != right))
+              if (l < right) {
+                left = l;
+                l = same[left];
+              } else {
+                same[left] = right;
+                left = right;
+                l = same[left];
+                right = l;
+              }
+            same[left] = right;
+          }
+        }
+      }
+      // Set colors for scan row, use channels and number_colors
+      for (int x = output_Ximage - 1; x >= 0; x--) {
+        for (int channel = 0; channel < output_Cimage; ++channel) {
+          if (same[x] == x) {  // Pick a random color
+            if (number_colors == 2) {
+              if ((rand() % 2) == 0) {
+                pix[x * output_Cimage + channel] = Cblack;
+              } else {
+                pix[x * output_Cimage + channel] = Cwhite;
+              }
+            } else {
+              pix[x * output_Cimage + channel] = rand() % 256;
+            }
+          } else
+            pix[x * output_Cimage + channel] =
+                pix[same[x] * output_Cimage + channel];
+
+          setpixel(x, y, channel, pix[x * output_Cimage + channel]);
+        }
+      }
+    }
+
+    draw_convergence_dots();
+
+    delete[] pix;
+    delete[] same;
+  }
+
+  //***************************************************************************
+  //***************************************************************************
+
+  void draw_convergence_dots() {
+    int x1, x2;  // center position for convergence dots
+
+    if (convergence_dots_size == 0)  // No dot, return
+      return;
+
+    x1 = output_Ximage / 2 - get_far_width() / 2;
+    x2 = output_Ximage / 2 + get_far_width() / 2;
+
+    for (int lloop = 0; lloop < convergence_dots_size; ++lloop)
+      for (int wloop = 0; wloop < convergence_dots_size; ++wloop)
+        for (int channel = 0; channel < output_Cimage; ++channel) {
+          setpixel(x1 - (convergence_dots_size / 2) + wloop,
+                   converge_dot_box_end - lloop, channel, Cblack);
+          setpixel(x2 - (convergence_dots_size / 2) + wloop,
+                   converge_dot_box_end - lloop, channel, Cblack);
+        }
+  }
+
+  //***************************************************************************
+  //***************************************************************************
+
+  void setpixel(int x, int y, int channel, uint8 color) {
+    *(outputImage + getOutputImageIndex(x, y, channel)) = color;
+  }
+};
+
+#define REGISTER_KERNEL(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("SingleImageRandomDotStereograms") \
+                              .Device(DEVICE_CPU)                 \
+                              .TypeConstraint<T>("T"),            \
+                          SingleImageRandomDotStereogramsOp<T>);
+
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(int64);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+
+#undef REGISTER_KERNEL
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 18c16cf1bb62fff8920109344cf6052296000a4c..740854930c43f38583519232bf7a87cea89f7846 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -20,16 +20,17 @@ limitations under the License.
 namespace tensorflow {
 
 using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
 
 // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc.
 // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0).
-// TODO(ringwalt): Add an "interpolation" argument with "none", "bilinear", etc.
 // TODO(ringwalt): Add an "output_shape" argument. This is sufficient to
 // implement "same" and "valid" modes in the Python function.
 REGISTER_OP("ImageProjectiveTransform")
     .Input("images: dtype")
     .Input("transforms: float32")
     .Attr("dtype: {uint8, int32, int64, float32, float64}")
+    .Attr("interpolation: string")
     .Output("transformed_images: dtype")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(0));
@@ -59,4 +60,44 @@ transformed_images: 4D `Tensor`, image(s) in NHWC format, generated by applying
 the `transforms` to the `images`. Satisfies the description above.
 )doc");
 
+REGISTER_OP("BipartiteMatch")
+    .Input("distance_mat: float")
+    .Input("num_valid_rows: float")
+    .Attr("top_k: int = -1")
+    .Output("row_to_col_match_indices: int32")
+    .Output("col_to_row_match_indices: int32")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
+      c->set_output(0, c->MakeShape({c->Dim(input, 0)}));
+      c->set_output(1, c->MakeShape({c->Dim(input, 1)}));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Find bipartite matching based on a given distance matrix.
+
+A greedy bi-partite matching algorithm is used to obtain the matching with the
+(greedy) minimum distance.
+
+distance_mat: A 2-D float tensor of shape `[num_rows, num_columns]`. It is a
+  pair-wise distance matrix between the entities represented by each row and
+  each column. It is an asymmetric matrix. The smaller the distance is, the more
+  similar the pairs are. The bipartite matching is to minimize the distances.
+num_valid_rows: A scalar or a 1-D tensor with one element describing the
+  number of valid rows of distance_mat to consider for the bipartite matching.
+  If set to be negative, then all rows from `distance_mat` are used.
+top_k: A scalar that specifies the number of top-k matches to retrieve.
+  If set to be negative, then is set according to the maximum number of
+  matches from `distance_mat`.
+row_to_col_match_indices: A vector of length num_rows, which is the number of
+  rows of the input `distance_matrix`.
+  If `row_to_col_match_indices[i]` is not -1, row i is matched to column
+  `row_to_col_match_indices[i]`.
+col_to_row_match_indices: A vector of length num_columns, which is the number
+  of columns of the input ditance matrix.
+  If `col_to_row_match_indices[j]` is not -1, column j is matched to row
+  `col_to_row_match_indices[j]`.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
new file mode 100755
index 0000000000000000000000000000000000000000..8a7cc562565fbbae15c600774712e18284a3b7aa
--- /dev/null
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -0,0 +1,93 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+REGISTER_OP("SingleImageRandomDotStereograms")
+    .Attr("T: {double,float,int64,int32}")
+    .Input("depth_values: T")
+    .Output("image: uint8")
+    .Attr("hidden_surface_removal: bool = true")
+    .Attr("convergence_dots_size: int = 8")
+    .Attr("dots_per_inch: int = 72")
+    .Attr("eye_separation: float = 2.5")
+    .Attr("mu: float = .3333")
+    .Attr("normalize: bool = true")
+    .Attr("normalize_max: float = -100.0")
+    .Attr("normalize_min: float = 100.0")
+    .Attr("border_level: float = 0.0")
+    .Attr("number_colors: int = 256")
+    .Attr(
+        "output_image_shape: shape = { dim {size:1024} dim {size: 768} dim "
+        "{size: 1}}")
+    .Attr("output_data_window: shape = { dim {size:1022} dim {size: 757}}")
+    .Doc(R"doc(
+Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
+
+Given the 2-D tensor 'depth_values' with encoded Z values, this operation will 
+encode 3-D data into a 2-D image.  The output of this Op is suitable for the
+encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
+encode 3-D data witin the image.
+
+This Op is based upon:
+'http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper'
+
+Example use which outputs a SIRDS image as picture_out.png:
+```python
+img=[[1,2,3,3,2,1],
+     [1,2,3,4,5,2],
+     [1,2,3,4,5,3],
+     [1,2,3,4,5,4],
+     [6,5,4,4,5,5]]
+
+session = tf.InteractiveSession()
+
+sirds = single_image_random_dot_stereograms(img,convergence_dots_size=8,number_colors=256,normalize=True)
+
+out = sirds.eval()
+
+png = tf.image.encode_png(out).eval()
+
+with open('picture_out.png', 'wb') as f:
+    f.write(png)
+```
+
+depth_values: Z values of data to encode into 'output_data_window' window, 
+  lower values are further away {0.0 floor(far), 1.0 ceiling(near) after normalization}, must be 2-D tensor
+hidden_surface_removal: Activate hidden surface removal
+convergence_dots_size: Black dot size in pixels to help view converge image, drawn on bottom of image
+dots_per_inch: Output device in dots/inch
+eye_separation: Separation between eyes in inches
+mu: Depth of field, Fraction of viewing distance (eg. 1/3 = .3333)
+normalize: Normalize input data to [0.0, 1.0] 
+normalize_max: Fix MAX value for Normalization - if < MIN, autoscale
+normalize_min: Fix MIN value for Normalization - if > MAX, autoscale
+border_level: Value of border depth 0.0 {far} to 1.0 {near}
+number_colors: 2 (Black & White),256 (grayscale), and Numbers > 256 (Full Color) are all that are supported currently
+output_image_shape: Output size of returned image in X,Y, Channels 1-grayscale, 3 color (1024, 768, 1),
+  channels will be updated to 3 if 'number_colors' > 256
+output_data_window: Size of "DATA" window, must be equal to or smaller than 'output_image_shape', will be centered
+  and use 'convergence_dots_size' for best fit to avoid overlap if possible
+
+image:= A tensor of size 'output_image_shape' with the encloded 'depth_values'
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index 4ce33de24a017e70a377a0262fe5f1405e9ad045..b8a0706b61449ebebeb2f1dc98b438f9dd620aa3 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
@@ -110,6 +111,139 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
                              [0, 1, 0, 1],
                              [0, 1, 1, 1]])
 
+  def test_bilinear(self):
+    with self.test_session():
+      image = constant_op.constant(
+          [[0, 0, 0, 0, 0],
+           [0, 1, 1, 1, 0],
+           [0, 1, 0, 1, 0],
+           [0, 1, 1, 1, 0],
+           [0, 0, 0, 0, 0]],
+          dtypes.float32)
+      # The following result matches:
+      # >>> scipy.ndimage.rotate(image, 45, order=1, reshape=False)
+      # which uses spline interpolation of order 1, equivalent to bilinear
+      # interpolation.
+      self.assertAllClose(
+          image_ops.rotate(image, np.pi / 4.0, interpolation="BILINEAR").eval(),
+          [[0.000, 0.000, 0.343, 0.000, 0.000],
+           [0.000, 0.586, 0.914, 0.586, 0.000],
+           [0.343, 0.914, 0.000, 0.914, 0.343],
+           [0.000, 0.586, 0.914, 0.586, 0.000],
+           [0.000, 0.000, 0.343, 0.000, 0.000]],
+          atol=0.001)
+      self.assertAllClose(
+          image_ops.rotate(image, np.pi / 4.0, interpolation="NEAREST").eval(),
+          [[0, 0, 1, 0, 0],
+           [0, 1, 1, 1, 0],
+           [1, 1, 0, 1, 1],
+           [0, 1, 1, 1, 0],
+           [0, 0, 1, 0, 0]])
+
+  def test_bilinear_uint8(self):
+    with self.test_session():
+      image = constant_op.constant(
+          np.asarray(
+              [[0.0, 0.0, 0.0, 0.0, 0.0],
+               [0.0, 255, 255, 255, 0.0],
+               [0.0, 255, 0.0, 255, 0.0],
+               [0.0, 255, 255, 255, 0.0],
+               [0.0, 0.0, 0.0, 0.0, 0.0]],
+              np.uint8),
+          dtypes.uint8)
+      # == np.rint((expected image above) * 255)
+      self.assertAllEqual(
+          image_ops.rotate(image, np.pi / 4.0, interpolation="BILINEAR").eval(),
+          [[0.0, 0.0, 87., 0.0, 0.0],
+           [0.0, 149, 233, 149, 0.0],
+           [87., 233, 0.0, 233, 87.],
+           [0.0, 149, 233, 149, 0.0],
+           [0.0, 0.0, 87., 0.0, 0.0]])
+
+  def _test_grad(self, shape_to_test):
+    with self.test_session():
+      test_image_shape = shape_to_test
+      test_image = np.random.randn(*test_image_shape)
+      test_image_tensor = constant_op.constant(
+          test_image, shape=test_image_shape)
+      test_transform = image_ops.angles_to_projective_transforms(
+          np.pi / 2, 4, 4)
+
+      output_shape = test_image_shape
+      output = image_ops.transform(test_image_tensor, test_transform)
+      left_err = gradient_checker.compute_gradient_error(
+          test_image_tensor,
+          test_image_shape,
+          output,
+          output_shape,
+          x_init_value=test_image)
+      self.assertLess(left_err, 1e-10)
+
+  def test_grad(self):
+    self._test_grad([16, 16])
+    self._test_grad([4, 12, 12])
+    self._test_grad([3, 4, 12, 12])
+
+
+class BipartiteMatchTest(test_util.TensorFlowTestCase):
+
+  def _BipartiteMatchTest(self, distance_mat, distance_mat_shape,
+                          num_valid_rows,
+                          expected_row_to_col_match,
+                          expected_col_to_row_match):
+    distance_mat_np = np.array(distance_mat, dtype=np.float32).reshape(
+        distance_mat_shape)
+    expected_row_to_col_match_np = np.array(expected_row_to_col_match,
+                                            dtype=np.int32)
+    expected_col_to_row_match_np = np.array(expected_col_to_row_match,
+                                            dtype=np.int32)
+
+    with self.test_session():
+      distance_mat_tf = constant_op.constant(distance_mat_np,
+                                             shape=distance_mat_shape)
+      location_to_prior, prior_to_location = image_ops.bipartite_match(
+          distance_mat_tf, num_valid_rows)
+      location_to_prior_np = location_to_prior.eval()
+      prior_to_location_np = prior_to_location.eval()
+      self.assertAllEqual(location_to_prior_np, expected_row_to_col_match_np)
+      self.assertAllEqual(prior_to_location_np, expected_col_to_row_match_np)
+
+  def testBipartiteMatch(self):
+    distance_mat = [0.5, 0.8, 0.1,
+                    0.3, 0.2, 0.15]
+    num_valid_rows = 2
+    expected_row_to_col_match = [2, 1]
+    expected_col_to_row_match = [-1, 1, 0]
+    self._BipartiteMatchTest(distance_mat, [2, 3], num_valid_rows,
+                             expected_row_to_col_match,
+                             expected_col_to_row_match)
+
+    # The case of num_valid_rows less than num-of-rows-in-distance-mat.
+    num_valid_rows = 1
+    expected_row_to_col_match = [2, -1]
+    expected_col_to_row_match = [-1, -1, 0]
+    self._BipartiteMatchTest(distance_mat, [2, 3], num_valid_rows,
+                             expected_row_to_col_match,
+                             expected_col_to_row_match)
+
+    # The case of num_valid_rows being 0.
+    num_valid_rows = 0
+    expected_row_to_col_match = [-1, -1]
+    expected_col_to_row_match = [-1, -1, -1]
+    self._BipartiteMatchTest(distance_mat, [2, 3], num_valid_rows,
+                             expected_row_to_col_match,
+                             expected_col_to_row_match)
+
+    # The case of num_valid_rows less being -1.
+    num_valid_rows = -1
+    # The expected results are the same as num_valid_rows being 2.
+    expected_row_to_col_match = [2, 1]
+    expected_col_to_row_match = [-1, 1, 0]
+    self._BipartiteMatchTest(distance_mat, [2, 3], num_valid_rows,
+                             expected_row_to_col_match,
+                             expected_col_to_row_match)
+
 
 if __name__ == "__main__":
   googletest.main()
+
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index 889f361b19ee16e3492bee668919c046cef3a9d0..b396dcea2118a2aa602d71e0316ba7f272ff9599 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import resource_loader
 
@@ -36,7 +37,7 @@ _IMAGE_DTYPES = set(
 ops.RegisterShape("ImageProjectiveTransform")(common_shapes.call_cpp_shape_fn)
 
 
-def rotate(images, angles):
+def rotate(images, angles, interpolation="NEAREST"):
   """Rotate image(s) by the passed angle(s) in radians.
 
   Args:
@@ -45,6 +46,7 @@ def rotate(images, angles):
        (num_rows, num_columns) (HW).
     angles: A scalar angle to rotate all images by, or (if images has rank 4)
        a vector of length num_images, with an angle for each image in the batch.
+    interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
 
   Returns:
     Image(s) with the same type and shape as `images`, rotated by the given
@@ -69,7 +71,8 @@ def rotate(images, angles):
   image_width = math_ops.cast(array_ops.shape(images)[2], dtypes.float32)[None]
   output = transform(
       images,
-      angles_to_projective_transforms(angles, image_width, image_height))
+      angles_to_projective_transforms(angles, image_height, image_width),
+      interpolation=interpolation)
   if len(image_or_images.get_shape()) == 2:
     return output[0, :, :, 0]
   elif len(image_or_images.get_shape()) == 3:
@@ -119,7 +122,7 @@ def angles_to_projective_transforms(angles, image_height, image_width):
       axis=1)
 
 
-def transform(images, transforms):
+def transform(images, transforms, interpolation="NEAREST"):
   """Applies the given transform(s) to the image(s).
 
   Args:
@@ -133,6 +136,7 @@ def transform(images, transforms):
        `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
        where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
        the transform mapping input points to output points.
+     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
 
   Returns:
     Image(s) with the same type and shape as `images`, with the given
@@ -162,8 +166,8 @@ def transform(images, transforms):
     transforms = transform_or_transforms
   else:
     raise TypeError("Transforms should have rank 1 or 2.")
-  # pylint: disable=protected-access
-  output = gen_image_ops.image_projective_transform(images, transforms)
+  output = gen_image_ops.image_projective_transform(
+      images, transforms, interpolation=interpolation.upper())
   if len(image_or_images.get_shape()) == 2:
     return output[0, :, :, 0]
   elif len(image_or_images.get_shape()) == 3:
@@ -214,4 +218,82 @@ def _transform_matrices_to_flat(transform_matrices):
   return transforms[:, :8]
 
 
-ops.NotDifferentiable("ImageProjectiveTransform")
+@ops.RegisterGradient("ImageProjectiveTransform")
+def _image_projective_transform_grad(op, grad):
+  """Computes the gradient for ImageProjectiveTransform."""
+  images = op.inputs[0]
+  transforms = op.inputs[1]
+  interpolation = op.get_attr("interpolation")
+
+  image_or_images = ops.convert_to_tensor(images, name="images")
+  transform_or_transforms = ops.convert_to_tensor(
+      transforms, name="transforms", dtype=dtypes.float32)
+
+  if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES:
+    raise TypeError("Invalid dtype %s." % image_or_images.dtype)
+  if len(image_or_images.get_shape()) == 2:
+    images = image_or_images[None, :, :, None]
+  elif len(image_or_images.get_shape()) == 3:
+    images = image_or_images[None, :, :, :]
+  elif len(image_or_images.get_shape()) == 4:
+    images = image_or_images
+  else:
+    raise TypeError("Images should have rank between 2 and 4")
+  if len(transform_or_transforms.get_shape()) == 1:
+    transforms = transform_or_transforms[None]
+  elif len(transform_or_transforms.get_shape()) == 2:
+    transforms = transform_or_transforms
+  else:
+    raise TypeError("Transforms should have rank 1 or 2.")
+
+  # Invert transformations
+  transforms = _flat_transforms_to_matrices(transforms=transforms)
+  inverse = linalg_ops.matrix_inverse(transforms)
+  transforms = _transform_matrices_to_flat(inverse)
+  output = gen_image_ops.image_projective_transform(
+      grad, transforms, interpolation=interpolation)
+  if len(image_or_images.get_shape()) == 2:
+    return [output[0, :, :, 0], None]
+  elif len(image_or_images.get_shape()) == 3:
+    return [output[0, :, :, :], None]
+  else:
+    return [output, None]
+
+
+def bipartite_match(
+    distance_mat,
+    num_valid_rows,
+    top_k=-1):
+  """Find bipartite matching based on a given distance matrix.
+
+  A greedy bi-partite matching algorithm is used to obtain the matching with
+  the (greedy) minimum distance.
+
+  Args:
+    distance_mat: A 2-D float tensor of shape `[num_rows, num_columns]`. It is a
+      pair-wise distance matrix between the entities represented by each row and
+      each column. It is an asymmetric matrix. The smaller the distance is, the
+      more similar the pairs are. The bipartite matching is to minimize the
+      distances.
+    num_valid_rows: A scalar or a 1-D tensor with one element describing the
+      number of valid rows of distance_mat to consider for the bipartite
+      matching. If set to be negative, then all rows from `distance_mat` are
+      used.
+    top_k: A scalar that specifies the number of top-k matches to retrieve.
+      If set to be negative, then is set according to the maximum number of
+      matches from `distance_mat`.
+
+  Returns:
+    row_to_col_match_indices: A vector of length num_rows, which is the number
+      of rows of the input `distance_matrix`. If `row_to_col_match_indices[i]`
+      is not -1, row i is matched to column `row_to_col_match_indices[i]`.
+    col_to_row_match_indices: A vector of length num_columns, which is the
+      number of columns of the input ditance matrix.
+      If `col_to_row_match_indices[j]` is not -1, column j is matched to row
+      `col_to_row_match_indices[j]`.
+  """
+  result = gen_image_ops.bipartite_match(distance_mat, num_valid_rows, top_k)
+  return result
+
+
+ops.NotDifferentiable("BipartiteMatch")
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
new file mode 100755
index 0000000000000000000000000000000000000000..79261c5e7501566537ee9492b5aa64570599e862
--- /dev/null
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -0,0 +1,125 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python layer for image_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import resource_loader
+
+_sirds_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile(
+        "_single_image_random_dot_stereograms.so"))
+
+def single_image_random_dot_stereograms(
+    depth_values,
+    hidden_surface_removal=None,
+    convergence_dots_size=None,
+    dots_per_inch=None,
+    eye_separation=None, mu=None,
+    normalize=None, normalize_max=None,
+    normalize_min=None,
+    border_level=None,
+    number_colors=None,
+    output_image_shape=None,
+    output_data_window=None):
+  """Output a RandomDotStereogram Tensor for export via encode_PNG/JPG OP.
+
+  Given the 2-D tensor 'depth_values' with encoded Z values, this operation
+  will encode 3-D data into a 2-D image.  The output of this Op is suitable
+  for the encode_PNG/JPG ops.  Be careful with image compression as this may
+  corrupt the encode 3-D data witin the image.
+
+  Based upon [this paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
+
+  This outputs a SIRDS image as picture_out.png:
+
+  ```python
+  img=[[1,2,3,3,2,1],
+       [1,2,3,4,5,2],
+       [1,2,3,4,5,3],
+       [1,2,3,4,5,4],
+       [6,5,4,4,5,5]]
+  session = tf.InteractiveSession()
+  sirds = single_image_random_dot_stereograms(
+      img,
+      convergence_dots_size=8,
+      number_colors=256,normalize=True)
+
+  out = sirds.eval()
+  png = tf.image.encode_png(out).eval()
+  with open('picture_out.png', 'wb') as f:
+    f.write(png)
+  ```
+
+  Args:
+    depth_values: A `Tensor`. Must be one of the following types: 
+      `float64`, `float32`, `int64`, `int32`.  Z values of data to encode
+      into 'output_data_window' window, lower further away {0.0 floor(far),
+      1.0 ceiling(near) after norm}, must be 2-D tensor
+    hidden_surface_removal: An optional `bool`. Defaults to `True`.
+      Activate hidden surface removal
+    convergence_dots_size: An optional `int`. Defaults to `8`.
+      Black dot size in pixels to help view converge image, drawn on bottom
+      of the image
+    dots_per_inch: An optional `int`. Defaults to `72`.
+      Output device in dots/inch
+    eye_separation: An optional `float`. Defaults to `2.5`.
+      Separation between eyes in inches
+    mu: An optional `float`. Defaults to `0.3333`.
+      Depth of field, Fraction of viewing distance (eg. 1/3 = 0.3333)
+    normalize: An optional `bool`. Defaults to `True`.
+      Normalize input data to [0.0, 1.0] 
+    normalize_max: An optional `float`. Defaults to `-100`.
+      Fix MAX value for Normalization (0.0) - if < MIN, autoscale
+    normalize_min: An optional `float`. Defaults to `100`.
+      Fix MIN value for Normalization (0.0) - if > MAX, autoscale
+    border_level: An optional `float`. Defaults to `0`.
+      Value of bord in depth 0.0 {far} to 1.0 {near} 
+    number_colors: An optional `int`. Defaults to `256`. 2 (Black &
+      White), 256 (grayscale), and Numbers > 256 (Full Color) are
+      supported
+    output_image_shape: An optional `tf.TensorShape` or list of `ints`. 
+      Defaults to shape `[1024, 768, 1]`. Defines output shape of returned
+      image in '[X,Y, Channels]' 1-grayscale, 3 color; channels will be
+      updated to 3 if number_colors > 256
+    output_data_window: An optional `tf.TensorShape` or list of `ints`.
+      Defaults to `[1022, 757]`. Size of "DATA" window, must be equal to or
+      smaller than `output_image_shape`, will be centered and use
+      `convergence_dots_size` for best fit to avoid overlap if possible
+
+  Returns:
+    A `Tensor` of type `uint8` of shape 'output_image_shape' with encoded
+    'depth_values'
+  """
+
+  result = _sirds_ops.single_image_random_dot_stereograms(
+      depth_values=depth_values,
+      hidden_surface_removal=hidden_surface_removal,
+      convergence_dots_size=convergence_dots_size,
+      dots_per_inch=dots_per_inch,
+      eye_separation=eye_separation, mu=mu,
+      normalize=normalize,
+      normalize_max=normalize_max,
+      normalize_min=normalize_min,
+      border_level=border_level,
+      number_colors=number_colors,
+      output_image_shape=output_image_shape,
+      output_data_window=output_data_window)
+  return result
+
+ops.NotDifferentiable("SingleImageRandomDotStereograms")
diff --git a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
index 20c49d5b6a9d0e474db1c8f9f0faf62fdb40e2df..27df3d3d71cb2d3b1efaca13d87c173c76c330a8 100644
--- a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
@@ -323,10 +323,10 @@ didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
       auto predictions = output->flat<float>();
 
       NSMutableDictionary *newValues = [NSMutableDictionary dictionary];
-      for (int index = 0; index < predictions.size(); index += 1) {
+      for (int index = 0; index < predictions.size(); ++index) {
         const float predictionValue = predictions(index);
         if (predictionValue > 0.05f) {
-          std::string label = labels[index % predictions.size()];
+          std::string label = labels[index];
           NSString *labelObject = [NSString stringWithUTF8String:label.c_str()];
           NSNumber *valueObject = [NSNumber numberWithFloat:predictionValue];
           [newValues setObject:valueObject forKey:labelObject];
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
index 449b0a3f501770a20720c74ffc5aa49c3a685d21..b1b8fc49b64de8305b8cb2bcd01e2884ca9e9805 100644
--- a/tensorflow/contrib/keras/BUILD
+++ b/tensorflow/contrib/keras/BUILD
@@ -107,6 +107,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/tensorboard:projector",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -118,6 +119,7 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
@@ -133,9 +135,10 @@ py_library(
 
 py_test(
     name = "integration_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/keras/integration_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         ":testing_utils",
@@ -193,6 +196,7 @@ py_test(
     size = "medium",
     srcs = ["python/keras/optimizers_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         ":testing_utils",
@@ -306,6 +310,7 @@ py_test(
     size = "medium",
     srcs = ["python/keras/layers/convolutional_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         ":testing_utils",
@@ -401,6 +406,7 @@ py_test(
     size = "medium",
     srcs = ["python/keras/layers/simplernn_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         ":testing_utils",
@@ -459,6 +465,7 @@ py_test(
     size = "small",
     srcs = ["python/keras/wrappers/scikit_learn_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         ":testing_utils",
@@ -504,6 +511,7 @@ py_test(
     size = "small",
     srcs = ["python/keras/callbacks_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         ":testing_utils",
@@ -516,6 +524,7 @@ py_test(
     size = "small",
     srcs = ["python/keras/engine/training_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
         ":testing_utils",
diff --git a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
index 2f579f2d2812c6e9ac54a0900363eb5f64871941..36db34f592d839619112a1945c31fbcdbd2cfaf4 100644
--- a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py
@@ -29,6 +29,7 @@ from tensorflow.contrib.keras.python.keras.callbacks import ModelCheckpoint
 from tensorflow.contrib.keras.python.keras.callbacks import ProgbarLogger
 from tensorflow.contrib.keras.python.keras.callbacks import ReduceLROnPlateau
 from tensorflow.contrib.keras.python.keras.callbacks import RemoteMonitor
+from tensorflow.contrib.keras.python.keras.callbacks import TensorBoard
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/keras/python/keras/__init__.py b/tensorflow/contrib/keras/python/keras/__init__.py
index cdfc40dff1dcaf139280ee92219c1dd7698b631d..ec316253dbacb90d71bc9cafe67533c5449ef4b3 100644
--- a/tensorflow/contrib/keras/python/keras/__init__.py
+++ b/tensorflow/contrib/keras/python/keras/__init__.py
@@ -37,4 +37,4 @@ from tensorflow.contrib.keras.python.keras import utils
 from tensorflow.contrib.keras.python.keras import wrappers
 
 
-__version__ = '2.0.0-tf'
+__version__ = '2.0.2-tf'
diff --git a/tensorflow/contrib/keras/python/keras/activations.py b/tensorflow/contrib/keras/python/keras/activations.py
index 1eac52dfad6b5728aed701a96ffb58579e5f25ee..67762c83ba2960c71f287c35a6f9b47c3288901b 100644
--- a/tensorflow/contrib/keras/python/keras/activations.py
+++ b/tensorflow/contrib/keras/python/keras/activations.py
@@ -24,18 +24,28 @@ from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
 
 
-def softmax(x):
+def softmax(x, axis=-1):
+  """Softmax activation function.
+
+  Arguments:
+      x : Tensor.
+      axis: Integer, axis along which the softmax normalization is applied.
+
+  Returns:
+      Tensor, output of softmax transformation.
+
+  Raises:
+      ValueError: In case `dim(x) == 1`.
+  """
   ndim = K.ndim(x)
   if ndim == 2:
     return K.softmax(x)
-  elif ndim == 3:
-    e = K.exp(x - K.max(x, axis=-1, keepdims=True))
-    s = K.sum(e, axis=-1, keepdims=True)
+  elif ndim > 2:
+    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
+    s = K.sum(e, axis=axis, keepdims=True)
     return e / s
   else:
-    raise ValueError('Cannot apply softmax to a tensor '
-                     'that is not 2D or 3D. '
-                     'Here, ndim=' + str(ndim))
+    raise ValueError('Cannot apply softmax to a tensor that is 1D')
 
 
 def elu(x, alpha=1.0):
diff --git a/tensorflow/contrib/keras/python/keras/applications/resnet50.py b/tensorflow/contrib/keras/python/keras/applications/resnet50.py
index 546fcb9433abdcbcab877a14ed5274e0d4352124..640cc9a3868ad6491f3a5e0e404d29644f02795a 100644
--- a/tensorflow/contrib/keras/python/keras/applications/resnet50.py
+++ b/tensorflow/contrib/keras/python/keras/applications/resnet50.py
@@ -59,7 +59,7 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
 
   Arguments:
       input_tensor: input tensor
-      kernel_size: defualt 3, the kernel size of middle conv layer at main path
+      kernel_size: default 3, the kernel size of middle conv layer at main path
       filters: list of integers, the filterss of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
@@ -98,7 +98,7 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
 
   Arguments:
       input_tensor: input tensor
-      kernel_size: defualt 3, the kernel size of middle conv layer at main path
+      kernel_size: default 3, the kernel size of middle conv layer at main path
       filters: list of integers, the filterss of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
@@ -163,8 +163,8 @@ def ResNet50(include_top=True,
   specified in your Keras config file.
 
   Arguments:
-      include_top: whether to include the 3 fully-connected
-          layers at the top of the network.
+      include_top: whether to include the fully-connected
+          layer at the top of the network.
       weights: one of `None` (random initialization)
           or "imagenet" (pre-training on ImageNet).
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
diff --git a/tensorflow/contrib/keras/python/keras/backend.py b/tensorflow/contrib/keras/python/keras/backend.py
index 9769bce3b059f9c291eb33a915d1617ef0a136f8..8bc3327552b48bb9551e205bb0a4e5897595a2ac 100644
--- a/tensorflow/contrib/keras/python/keras/backend.py
+++ b/tensorflow/contrib/keras/python/keras/backend.py
@@ -21,8 +21,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import defaultdict
-import errno
 import json
 import os
 import warnings
@@ -93,7 +91,7 @@ _IMAGE_DATA_FORMAT = 'channels_last'
 def backend():
   """Publicly accessible method for determining the current backend.
 
-  Only exists for API compatibily with multi-backend Keras.
+  Only exists for API compatibility with multi-backend Keras.
 
   Returns:
       The string "tensorflow".
@@ -246,17 +244,40 @@ def set_image_data_format(data_format):
 
 
 def get_uid(prefix=''):
-  global _GRAPH_UID_DICTS  # pylint: disable=global-variable-not-assigned
-  graph = ops.get_default_graph()
-  if graph not in _GRAPH_UID_DICTS:
-    _GRAPH_UID_DICTS[graph] = defaultdict(int)
-  _GRAPH_UID_DICTS[graph][prefix] += 1
-  return _GRAPH_UID_DICTS[graph][prefix]
+  """Associates a string prefix with an integer counter in a TensorFlow graph.
+
+  Arguments:
+    prefix: String prefix to index.
+
+  Returns:
+    Unique integer ID.
+
+  Example:
+
+  ```
+    >>> get_uid('dense')
+    1
+    >>> get_uid('dense')
+    2
+  ```
+  """
+  layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS')
+  if not layer_name_uids_collection:
+    layer_name_uids = {}
+    ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids)
+  else:
+    layer_name_uids = layer_name_uids_collection[0]
+  if prefix not in layer_name_uids:
+    layer_name_uids[prefix] = 1
+  else:
+    layer_name_uids[prefix] += 1
+  return layer_name_uids[prefix]
 
 
 def reset_uids():
-  global _GRAPH_UID_DICTS
-  _GRAPH_UID_DICTS = {}
+  layer_name_uids_collection = ops.get_collection_ref('LAYER_NAME_UIDS')
+  if layer_name_uids_collection:
+    layer_name_uids_collection.pop()
 
 
 def clear_session():
@@ -270,6 +291,7 @@ def clear_session():
   reset_uids()
   _SESSION = None
   phase = array_ops.placeholder(dtype='bool', name='keras_learning_phase')
+  _GRAPH_LEARNING_PHASES = {}
   _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = phase
 
 
@@ -351,7 +373,8 @@ def get_session():
       _SESSION = session_module.Session(config=config)
     session = _SESSION
   if not _MANUAL_VAR_INIT:
-    _initialize_variables()
+    with session.graph.as_default():
+      _initialize_variables()
   return session
 
 
@@ -1257,6 +1280,34 @@ def prod(x, axis=None, keepdims=False):
   return math_ops.reduce_prod(x, reduction_indices=axis, keep_dims=keepdims)
 
 
+def cumsum(x, axis=0):
+  """Cumulative sum of the values in a tensor, alongside the specified axis.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to compute the sum.
+
+  Returns:
+      A tensor of the cumulative sum of values of `x` along `axis`.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.cumsum(x, axis=axis)
+
+
+def cumprod(x, axis=0):
+  """Cumulative product of the values in a tensor, alongside the specified axis.
+
+  Arguments:
+      x: A tensor or variable.
+      axis: An integer, the axis to compute the product.
+
+  Returns:
+      A tensor of the cumulative product of values of `x` along `axis`.
+  """
+  axis = _normalize_axis(axis, ndim(x))
+  return math_ops.cumprod(x, axis=axis)
+
+
 def var(x, axis=None, keepdims=False):
   """Variance of a tensor, alongside the specified axis.
 
@@ -1330,8 +1381,7 @@ def any(x, axis=None, keepdims=False):
   """
   axis = _normalize_axis(axis, ndim(x))
   x = math_ops.cast(x, dtypes_module.bool)
-  x = math_ops.reduce_any(x, reduction_indices=axis, keep_dims=keepdims)
-  return math_ops.cast(x, dtypes_module.uint8)
+  return math_ops.reduce_any(x, reduction_indices=axis, keep_dims=keepdims)
 
 
 def all(x, axis=None, keepdims=False):
@@ -1347,8 +1397,7 @@ def all(x, axis=None, keepdims=False):
   """
   axis = _normalize_axis(axis, ndim(x))
   x = math_ops.cast(x, dtypes_module.bool)
-  x = math_ops.reduce_all(x, reduction_indices=axis, keep_dims=keepdims)
-  return math_ops.cast(x, dtypes_module.uint8)
+  return math_ops.reduce_all(x, reduction_indices=axis, keep_dims=keepdims)
 
 
 def argmax(x, axis=-1):
@@ -1645,7 +1694,7 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   """
   mean, var = nn.moments(
       x, reduction_axes, shift=None, name=None, keep_dims=False)
-  if sorted(reduction_axes) == range(ndim(x))[:-1]:
+  if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
     normed = nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
   else:
     # need broadcasting
@@ -2324,8 +2373,8 @@ def rnn(step_function,
           (no time dimension),
           containing the initial values for the states used in
           the step function.
-      go_backwards: boolean. If True, do the iteration over
-          the time dimension in reverse order.
+      go_backwards: boolean. If True, do the iteration over the time
+          dimension in reverse order and return the reversed sequence.
       mask: binary tensor with shape `(samples, time, 1)`,
           with a zero for every element that is masked.
       constants: a list of constant values passed at each step.
@@ -2414,9 +2463,9 @@ def rnn(step_function,
         states = return_states
         successive_outputs.append(output)
         successive_states.append(states)
-        last_output = successive_outputs[-1]
-        new_states = successive_states[-1]
-        outputs = array_ops.stack(successive_outputs)
+      last_output = successive_outputs[-1]
+      new_states = successive_states[-1]
+      outputs = array_ops.stack(successive_outputs)
     else:
       for inp in input_list:
         output, states = step_function(inp, states + constants)
@@ -2568,7 +2617,7 @@ def in_train_phase(x, alt, training=None):
           (tensor or callable that returns a tensor).
       training: Optional scalar tensor
           (or Python boolean, or Python integer)
-          specifing the learning phase.
+          specifying the learning phase.
 
   Returns:
       Either `x` or `alt` based on the `training` flag.
@@ -2611,7 +2660,7 @@ def in_test_phase(x, alt, training=None):
           (tensor or callable that returns a tensor).
       training: Optional scalar tensor
           (or Python boolean, or Python integer)
-          specifing the learning phase.
+          specifying the learning phase.
 
   Returns:
       Either `x` or `alt` based on `K.learning_phase`.
@@ -2872,13 +2921,14 @@ def in_top_k(predictions, targets, k):
   """Returns whether the `targets` are in the top `k` `predictions`.
 
   Arguments:
-      predictions: A tensor of shape `batch_size` x classes and type `float32`.
-      targets: A tensor of shape batch_size and type `int32` or `int64`.
+      predictions: A tensor of shape `(batch_size, classes)` and type `float32`.
+      targets: A 1D tensor of length `batch_size` and type `int32` or `int64`.
       k: An `int`, number of top elements to consider.
 
   Returns:
-      A tensor of shape `batch_size` and type `bool`. `output_i` is `True` if
-      `targets_i` is within top-k values of `predictions_i`
+      A 1D tensor of length `batch_size` and type `bool`.
+      `output[i]` is `True` if `predictions[i, targets[i]]` is within top-`k`
+      values of `predictions[i]`.
   """
   return nn.in_top_k(predictions, targets, k)
 
@@ -3426,8 +3476,9 @@ def ctc_label_dense_to_sparse(labels, label_lengths):
   max_num_labels_tns = array_ops.stack([label_shape[1]])
 
   def range_less_than(_, current_input):
-    return array_ops.expand_dims(math_ops.range(
-        label_shape[1]), 0) < array_ops.fill(max_num_labels_tns, current_input)
+    return array_ops.expand_dims(
+        math_ops.range(label_shape[1]), 0) < array_ops.fill(
+            max_num_labels_tns, current_input)
 
   init = math_ops.cast(
       array_ops.fill([1, label_shape[1]], 0), dtypes_module.bool)
@@ -3534,19 +3585,19 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
 # HIGH ORDER FUNCTIONS
 
 
-def map_fn(fn, elems, name=None):
+def map_fn(fn, elems, name=None, dtype=None):
   """Map the function fn over the elements elems and return the outputs.
 
   Arguments:
       fn: Callable that will be called upon each element in elems
       elems: tensor
       name: A string name for the map node in the graph
+      dtype: Output data type.
 
   Returns:
-      Tensor with first dimension equal to the elems and second depending on
-      fn
+      Tensor with dtype `dtype`.
   """
-  return functional_ops.map_fn(fn, elems, name=name)
+  return functional_ops.map_fn(fn, elems, name=name, dtype=dtype)
 
 
 def foldl(fn, elems, initializer=None, name=None):
@@ -3560,7 +3611,7 @@ def foldl(fn, elems, initializer=None, name=None):
       name: A string name for the foldl node in the graph
 
   Returns:
-      Same type and shape as initializer
+      Tensor with same type and shape as `initializer`.
   """
   return functional_ops.foldl(fn, elems, initializer=initializer, name=name)
 
@@ -3583,27 +3634,42 @@ def foldr(fn, elems, initializer=None, name=None):
 
 # Load Keras default configuration from config file if present.
 _keras_base_dir = os.path.expanduser('~')
-if not os.access(_keras_base_dir, os.W_OK):
-  _keras_base_dir = '/tmp'
 _keras_dir = os.path.join(_keras_base_dir, '.keras')
-if not os.path.exists(_keras_dir):
-  try:
-    os.makedirs(_keras_dir)
-  except OSError as e:
-    if e.errno == errno.EEXIST:
-      pass
-    else:
-      raise
 _config_path = os.path.expanduser(os.path.join(_keras_dir, 'keras.json'))
 if os.path.exists(_config_path):
-  _config = json.load(open(_config_path))
+  try:
+    _config = json.load(open(_config_path))
+  except ValueError:
+    _config = {}
   _floatx = _config.get('floatx', floatx())
   assert _floatx in {'float16', 'float32', 'float64'}
   _epsilon = _config.get('epsilon', epsilon())
   assert isinstance(_epsilon, float)
-  _backend = backend()
   _image_data_format = _config.get('image_data_format', image_data_format())
   assert _image_data_format in {'channels_last', 'channels_first'}
   set_floatx(_floatx)
   set_epsilon(_epsilon)
   set_image_data_format(_image_data_format)
+
+# Save config file.
+if not os.path.exists(_keras_dir):
+  try:
+    os.makedirs(_keras_dir)
+  except OSError:
+    # Except permission denied and potential race conditions
+    # in multi-threaded environments.
+    pass
+
+if not os.path.exists(_config_path):
+  _config = {
+      'floatx': floatx(),
+      'epsilon': epsilon(),
+      'backend': 'tensorflow',
+      'image_data_format': image_data_format()
+  }
+  try:
+    with open(_config_path, 'w') as f:
+      f.write(json.dumps(_config, indent=4))
+  except IOError:
+    # Except permission denied.
+    pass
diff --git a/tensorflow/contrib/keras/python/keras/backend_test.py b/tensorflow/contrib/keras/python/keras/backend_test.py
index fd9db1f3273433a73342f56740f151c5ee032d38..2da5aee58e5633fa0461a08d352d696f710d9620 100644
--- a/tensorflow/contrib/keras/python/keras/backend_test.py
+++ b/tensorflow/contrib/keras/python/keras/backend_test.py
@@ -18,12 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
-
 import numpy as np
 
 from tensorflow.contrib.keras.python import keras
 from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
 
 
 def compare_single_input_op_to_numpy(keras_op,
@@ -207,7 +206,7 @@ class BackendLinearAlgebraTest(test.TestCase):
         compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
                                          keras_kwargs={'axis': -1},
                                          np_kwargs={'axis': -1})
-        if 'keepdims' in inspect.getargspec(keras_op).args:
+        if 'keepdims' in tf_inspect.getargspec(keras_op).args:
           compare_single_input_op_to_numpy(keras_op, np_op,
                                            input_shape=(4, 7, 5),
                                            keras_kwargs={'axis': 1,
diff --git a/tensorflow/contrib/keras/python/keras/callbacks.py b/tensorflow/contrib/keras/python/keras/callbacks.py
index 345db2791c9ab2772d676af5ef22b5f460f48530..a533e0fbda8933f4b570b22644348f9b8e26ee2b 100644
--- a/tensorflow/contrib/keras/python/keras/callbacks.py
+++ b/tensorflow/contrib/keras/python/keras/callbacks.py
@@ -31,8 +31,10 @@ import numpy as np
 
 from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import Progbar
+from tensorflow.contrib.tensorboard.plugins import projector
 from tensorflow.python.ops import array_ops
 from tensorflow.python.summary import summary as tf_summary
+from tensorflow.python.training import saver as saver_lib
 
 
 # pylint: disable=g-import-not-at-top
@@ -108,9 +110,9 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_begin)
     if (self._delta_t_batch > 0. and
         delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
-      warnings.warn('Method on_batch_begin() is slow compared '
-                    'to the batch update (%f). Check your callbacks.' %
-                    delta_t_median)
+      warnings.warn(
+          'Method on_batch_begin() is slow compared '
+          'to the batch update (%f). Check your callbacks.' % delta_t_median)
     self._t_enter_batch = time.time()
 
   def on_batch_end(self, batch, logs=None):
@@ -131,9 +133,9 @@ class CallbackList(object):
     delta_t_median = np.median(self._delta_ts_batch_end)
     if (self._delta_t_batch > 0. and
         (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)):
-      warnings.warn('Method on_batch_end() is slow compared '
-                    'to the batch update (%f). Check your callbacks.' %
-                    delta_t_median)
+      warnings.warn(
+          'Method on_batch_end() is slow compared '
+          'to the batch update (%f). Check your callbacks.' % delta_t_median)
 
   def on_train_begin(self, logs=None):
     """Called at the beginning of training.
@@ -585,6 +587,7 @@ class LearningRateScheduler(Callback):
 
 
 class TensorBoard(Callback):
+  # pylint: disable=line-too-long
   """Tensorboard basic visualizations.
 
   This callback writes a log for TensorBoard, which allows
@@ -603,19 +606,36 @@ class TensorBoard(Callback):
           write_graph is set to True.
       write_images: whether to write model weights to visualize as
           image in Tensorboard.
+      embeddings_freq: frequency (in epochs) at which selected embedding
+          layers will be saved.
+      embeddings_layer_names: a list of names of layers to keep eye on. If
+          None or empty list all the embedding layer will be watched.
+      embeddings_metadata: a dictionary which maps layer name to a file name
+          in which metadata for this embedding layer is saved. See the
+          [details](https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
+          about metadata files format. In case if the same metadata file is
+          used for all embedding layers, string can be passed.
   """
 
+  # pylint: enable=line-too-long
+
   def __init__(self,
                log_dir='./logs',
                histogram_freq=0,
                write_graph=True,
-               write_images=False):
+               write_images=False,
+               embeddings_freq=0,
+               embeddings_layer_names=None,
+               embeddings_metadata=None):
     super(TensorBoard, self).__init__()
     self.log_dir = log_dir
     self.histogram_freq = histogram_freq
     self.merged = None
     self.write_graph = write_graph
     self.write_images = write_images
+    self.embeddings_freq = embeddings_freq
+    self.embeddings_layer_names = embeddings_layer_names
+    self.embeddings_metadata = embeddings_metadata or {}
 
   def set_model(self, model):
     self.model = model
@@ -644,6 +664,47 @@ class TensorBoard(Callback):
     else:
       self.writer = tf_summary.FileWriter(self.log_dir)
 
+    if self.embeddings_freq:
+      self.saver = saver_lib.Saver()
+
+      embeddings_layer_names = self.embeddings_layer_names
+
+      if not embeddings_layer_names:
+        embeddings_layer_names = [
+            layer.name for layer in self.model.layers
+            if type(layer).__name__ == 'Embedding'
+        ]
+
+      embeddings = {
+          layer.name: layer.weights[0]
+          for layer in self.model.layers if layer.name in embeddings_layer_names
+      }
+
+      embeddings_metadata = {}
+
+      if not isinstance(self.embeddings_metadata, str):
+        embeddings_metadata = self.embeddings_metadata
+      else:
+        embeddings_metadata = {
+            layer_name: self.embeddings_metadata
+            for layer_name in embeddings.keys()
+        }
+
+      config = projector.ProjectorConfig()
+      self.embeddings_logs = []
+
+      for layer_name, tensor in embeddings.items():
+        embedding = config.embeddings.add()
+        embedding.tensor_name = tensor.name
+
+        self.embeddings_logs.append(
+            os.path.join(self.log_dir, layer_name + '.ckpt'))
+
+        if layer_name in embeddings_metadata:
+          embedding.metadata_path = embeddings_metadata[layer_name]
+
+      projector.visualize_embeddings(self.writer, config)
+
   def on_epoch_end(self, epoch, logs=None):
     logs = logs or {}
 
@@ -663,6 +724,11 @@ class TensorBoard(Callback):
         summary_str = result[0]
         self.writer.add_summary(summary_str, epoch)
 
+    if self.embeddings_freq and self.embeddings_logs:
+      if epoch % self.embeddings_freq == 0:
+        for log in self.embeddings_logs:
+          self.saver.save(self.sess, log, epoch)
+
     for name, value in logs.items():
       if name in ['batch', 'size']:
         continue
diff --git a/tensorflow/contrib/keras/python/keras/datasets/imdb.py b/tensorflow/contrib/keras/python/keras/datasets/imdb.py
index 2688e8bedecc422ad507005a2843044b6cc2c2d3..5c087fe63f5f03c8741336ef285239f39470997e 100644
--- a/tensorflow/contrib/keras/python/keras/datasets/imdb.py
+++ b/tensorflow/contrib/keras/python/keras/datasets/imdb.py
@@ -41,7 +41,7 @@ def load_data(path='imdb.npz',
       num_words: max number of words to include. Words are ranked
           by how often they occur (in the training set) and only
           the most frequent words are kept
-      skip_top: skip the top N most frequently occuring words
+      skip_top: skip the top N most frequently occurring words
           (which may not be informative).
       maxlen: truncate sequences after this length.
       seed: random seed for sample shuffling.
@@ -100,10 +100,10 @@ def load_data(path='imdb.npz',
         new_labels.append(y)
     xs = new_xs
     labels = new_labels
-  if not xs:
-    raise ValueError('After filtering for sequences shorter than maxlen=' + str(
-        maxlen) + ', no sequence was kept. '
-                     'Increase maxlen.')
+    if not xs:
+      raise ValueError('After filtering for sequences shorter than maxlen=' +
+                       str(maxlen) + ', no sequence was kept. '
+                       'Increase maxlen.')
   if not num_words:
     num_words = max([max(x) for x in xs])
 
diff --git a/tensorflow/contrib/keras/python/keras/datasets/reuters.py b/tensorflow/contrib/keras/python/keras/datasets/reuters.py
index 81e940a8463a2fad1b7006e62e1a801578095a3a..b1c22fee63d23d10d7f6b6ddac2245915688ad4a 100644
--- a/tensorflow/contrib/keras/python/keras/datasets/reuters.py
+++ b/tensorflow/contrib/keras/python/keras/datasets/reuters.py
@@ -43,7 +43,7 @@ def load_data(path='reuters.npz',
       num_words: max number of words to include. Words are ranked
           by how often they occur (in the training set) and only
           the most frequent words are kept
-      skip_top: skip the top N most frequently occuring words
+      skip_top: skip the top N most frequently occurring words
           (which may not be informative).
       maxlen: truncate sequences after this length.
       test_split: Fraction of the dataset to be used as test data.
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology.py b/tensorflow/contrib/keras/python/keras/engine/topology.py
index 0f506ff0a46b358f591d180dd2271ac6ae6105e1..6f45bf74336c51bd6ecd2c519ca23bb19c0b5f2d 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology.py
@@ -20,7 +20,6 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import inspect
 import json
 import os
 import re
@@ -30,11 +29,13 @@ import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.keras.python.keras import backend as K
-from tensorflow.contrib.keras.python.keras import initializers
 from tensorflow.contrib.keras.python.keras.utils import conv_utils
 from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.contrib.keras.python.keras.utils.layer_utils import print_summary as print_layer_summary
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.util import tf_inspect
 
 
 # pylint: disable=g-import-not-at-top
@@ -207,7 +208,7 @@ class Node(object):
     }
 
 
-class Layer(object):
+class Layer(tf_base_layers.Layer):
   """Abstract base layer class.
 
   # Properties
@@ -272,42 +273,53 @@ class Layer(object):
   # Internal methods:
       build(input_shape)
       _add_inbound_node(layer, index=0)
-      assert_input_compatibility()
   """
 
   def __init__(self, **kwargs):
-    self.input_spec = None
-    self.supports_masking = False
-
-    # These properties will be set upon call of self.build()
-    self._trainable_weights = []
-    self._non_trainable_weights = []
-    self._constraints = {}  # dict {tensor: constraint instance}
-    self.built = False
-
-    # These lists will be filled via successive calls
-    # to self._add_inbound_node().
-    self.inbound_nodes = []
-    self.outbound_nodes = []
-
     # These properties should be set by the user via keyword arguments.
     # note that 'dtype', 'input_shape' and 'batch_input_shape'
     # are only applicable to input layers: do not pass these keywords
     # to non-input layers.
     allowed_kwargs = {
-        'input_shape', 'batch_input_shape', 'batch_size', 'dtype', 'name',
-        'trainable', 'weights'
+        'input_shape',
+        'batch_input_shape',
+        'batch_size',
+        'dtype',
+        'name',
+        'trainable',
+        'weights',
     }
+    # Validate optional keyword arguments.
     for kwarg in kwargs:
       if kwarg not in allowed_kwargs:
         raise TypeError('Keyword argument not understood:', kwarg)
+
+    # Get layer name.
     name = kwargs.get('name')
-    if not name:
-      prefix = self.__class__.__name__
-      name = _to_snake_case(prefix) + '_' + str(K.get_uid(prefix))
-    self.name = name
 
-    self.trainable = kwargs.get('trainable', True)
+    # Get `trainable` status.
+    trainable = kwargs.get('trainable', True)
+
+    # Get `dtype`.
+    dtype = kwargs.get('dtype')
+    if dtype is None:
+      dtype = K.floatx()
+
+    # Call super, which will set all properties common to Keras layers
+    # and core TF layers.
+    super(Layer, self).__init__(name=name, dtype=dtype, trainable=trainable)
+
+    # Add properties that are Keras-only for now.
+    self.input_spec = None
+    self.supports_masking = False
+    self._constraints = {}  # dict {tensor: constraint instance}
+
+    # These lists will be filled via successive calls
+    # to self._add_inbound_node().
+    self.inbound_nodes = []
+    self.outbound_nodes = []
+
+    # Manage input shape information if passed.
     if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
       # In this case we will later create an input layer
       # to insert before the current layer
@@ -320,8 +332,8 @@ class Layer(object):
           batch_size = None
         batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
       self.batch_input_shape = batch_input_shape
-      dtype = kwargs.get('dtype', K.floatx())
-      self.dtype = dtype
+
+    # Manage initial weight values if passed.
     if 'weights' in kwargs:
       self._initial_weights = kwargs['weights']
     else:
@@ -335,161 +347,45 @@ class Layer(object):
   def constraints(self, constraints):
     self._constraints = constraints
 
-  @property
-  def trainable_weights(self):
-    trainable = getattr(self, 'trainable', True)
-    if trainable:
-      return self._trainable_weights
-    else:
-      return []
-
-  @trainable_weights.setter
-  def trainable_weights(self, weights):
-    self._trainable_weights = weights
-
-  @property
-  def non_trainable_weights(self):
-    trainable = getattr(self, 'trainable', True)
-    if not trainable:
-      return self._trainable_weights + self._non_trainable_weights
-    else:
-      return self._non_trainable_weights
-
-  @non_trainable_weights.setter
-  def non_trainable_weights(self, weights):
-    self._non_trainable_weights = weights
-
   def add_weight(self,
+                 name,
                  shape,
-                 initializer,
-                 name=None,
-                 trainable=True,
+                 dtype=None,
+                 initializer=None,
                  regularizer=None,
+                 trainable=True,
                  constraint=None):
     """Adds a weight variable to the layer.
 
     Arguments:
+        name: String, the name for the weight variable.
         shape: The shape tuple of the weight.
+        dtype: The dtype of the weight.
         initializer: An Initializer instance (callable).
-        name: String, the name for the weight variable.
+        regularizer: An optional Regularizer instance.
         trainable: A boolean, whether the weight should
             be trained via backprop or not (assuming
             that the layer itself is also trainable).
-        regularizer: An optional Regularizer instance.
         constraint: An optional Constraint instance.
 
     Returns:
         The created weight variable.
     """
-    shape = tuple(tensor_shape.TensorShape(shape).as_list())
-    initializer = initializers.get(initializer)
-    weight = K.variable(initializer(shape), dtype=K.floatx(), name=name)
-    if regularizer is not None:
-      self.add_loss(regularizer(weight))
+    if dtype is None:
+      dtype = K.floatx()
+    weight = self.add_variable(
+        name, shape, dtype=dtype,
+        initializer=initializer, regularizer=regularizer, trainable=trainable)
     if constraint is not None:
       self.constraints[weight] = constraint
-    if trainable:
-      self._trainable_weights.append(weight)
-    else:
-      self._non_trainable_weights.append(weight)
     return weight
 
-  def assert_input_compatibility(self, inputs):
-    """Checks compatibility between the layer and provided inputs.
-
-    This checks that the tensor(s) `input`
-    verify the input assumptions of the layer
-    (if any). If not, exceptions are raised.
-
-    Arguments:
-        inputs: input tensor or list of input tensors.
-
-    Raises:
-        ValueError: in case of mismatch between
-            the provided inputs and the expectations of the layer.
-    """
-    if not self.input_spec:
-      return
-    if not isinstance(self.input_spec, (list, tuple)):
-      input_spec = _to_list(self.input_spec)
-    else:
-      input_spec = self.input_spec
-    inputs = _to_list(inputs)
-    if len(inputs) != len(input_spec):
-      raise ValueError('Layer ' + self.name + ' expects ' + str(
-          len(input_spec)) + ' inputs, '
-                       'but it received ' + str(len(inputs)) +
-                       ' input tensors. Input received: ' + str(input))
-    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
-      if spec is None:
-        continue
-
-      # Check ndim.
-      if spec.ndim is not None:
-        if K.ndim(x) != spec.ndim:
-          raise ValueError('Input ' + str(
-              input_index) + ' is incompatible with layer ' + self.name +
-                           ': expected ndim=' + str(
-                               spec.ndim) + ', found ndim=' + str(K.ndim(x)))
-      if spec.max_ndim is not None:
-        ndim = K.ndim(x)
-        if ndim is not None and ndim > spec.max_ndim:
-          raise ValueError('Input ' + str(
-              input_index) + ' is incompatible with layer ' + self.name +
-                           ': expected max_ndim=' + str(spec.max_ndim) +
-                           ', found ndim=' + str(K.ndim(x)))
-      if spec.min_ndim is not None:
-        ndim = K.ndim(x)
-        if ndim is not None and ndim < spec.min_ndim:
-          raise ValueError('Input ' + str(
-              input_index) + ' is incompatible with layer ' + self.name +
-                           ': expected min_ndim=' + str(spec.min_ndim) +
-                           ', found ndim=' + str(K.ndim(x)))
-      # Check dtype.
-      if spec.dtype is not None:
-        if K.dtype(x) != spec.dtype:
-          raise ValueError('Input ' + str(
-              input_index) + ' is incompatible with layer ' + self.name +
-                           ': expected dtype=' + str(
-                               spec.dtype) + ', found dtype=' + str(K.dtype(x)))
-      # Check specific shape axes.
-      if spec.axes:
-        try:
-          x_shape = K.int_shape(x)
-        except TypeError:
-          x_shape = None
-        if x_shape is not None:
-          for axis, value in spec.axes.items():
-            if hasattr(value, 'value'):
-              value = value.value
-            if value is not None and x_shape[int(axis)] not in {value, None}:
-              raise ValueError(
-                  'Input ' + str(input_index) + ' is incompatible with layer ' +
-                  self.name + ': expected axis ' + str(
-                      axis) + ' of input shape to have '
-                  'value ' + str(value) + ' but got shape ' + str(x_shape))
-      # Check shape.
-      if spec.shape is not None:
-        try:
-          x_shape = K.int_shape(x)
-        except TypeError:
-          x_shape = None
-        if x_shape is not None:
-          for spec_dim, dim in zip(spec.shape, x_shape):
-            if hasattr(spec_dim, 'value'):
-              spec_dim = spec_dim.value
-            if spec_dim is not None and dim is not None:
-              if spec_dim != dim:
-                raise ValueError('Input ' + str(
-                    input_index) + ' is incompatible with layer ' + self.name +
-                                 ': expected shape=' + str(spec.shape) +
-                                 ', found shape=' + str(x_shape))
-
-  def call(self, inputs):
+  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
     """This is where the layer's logic lives.
 
     Arguments:
-        inputs: input tensor, or list/tuple of input tensors.
+        inputs: Input tensor, or list/tuple of input tensors.
+        **kwargs: Additional keyword arguments.
 
     Returns:
         A tensor or list/tuple of tensors.
@@ -518,65 +414,43 @@ class Layer(object):
         ValueError: in case the layer is missing shape information
             for its `build` call.
     """
+    if isinstance(inputs, list):
+      inputs = inputs[:]
+
+    # Handle mask propagation.
+    previous_mask = _collect_previous_mask(inputs)
+    user_kwargs = copy.copy(kwargs)
+    if not _is_all_none(previous_mask):
+      # The previous layer generated a mask.
+      if 'mask' in tf_inspect.getargspec(self.call).args:
+        if 'mask' not in kwargs:
+          # If mask is explicitly passed to __call__,
+          # we should override the default mask.
+          kwargs['mask'] = previous_mask
+
+    # Actually call the layer (optionally building it).
+    output = super(Layer, self).__call__(inputs, **kwargs)
+
+    # Handle mask computation.
     with K.name_scope(self.name):
-      # Handle laying building (weight creating, input spec locking).
-      if not self.built:
-        # Raise exceptions in case the input is not compatible
-        # with the input_spec specified in the layer constructor.
-        self.assert_input_compatibility(inputs)
-
-        # Collect input shapes to build layer.
-        input_shapes = []
-        for x_elem in _to_list(inputs):
-          input_shapes.append(K.int_shape(x_elem))
-        if len(input_shapes) == 1:
-          self.build(input_shapes[0])
-        else:
-          self.build(input_shapes)
-        self.built = True
-
-        # Load weights that were specified at layer instantiation.
-        if self._initial_weights is not None:
-          self.set_weights(self._initial_weights)
-
-      # Raise exceptions in case the input is not compatible
-      # with the input_spec set at build time.
-      self.assert_input_compatibility(inputs)
-
-      # Handle mask propagation.
-      previous_mask = _collect_previous_mask(inputs)
-      if not _is_all_none(previous_mask):
-        # The previous layer generated a mask.
-        if 'mask' in inspect.getargspec(self.call).args:
-          if 'mask' not in kwargs:
-            # If mask is explicitly passed to __call__,
-            # we should override the default mask.
-            kwargs['mask'] = previous_mask
-
-      # Actually call the layer, collecting output(s), mask(s), and shape(s).
-      output = self.call(inputs, **kwargs)
       output_mask = self.compute_mask(inputs, previous_mask)
 
-      # Add an inbound node to the layer, so that it keeps track
-      # of the call and of all new variables created during the call.
-      # This also updates the layer history of the output tensor(s).
-      # If the input tensor(s) had not previous Keras history,
-      # this does nothing.
-      self._add_inbound_node(
-          input_tensors=inputs,
-          output_tensors=output,
-          input_masks=previous_mask,
-          output_masks=output_mask,
-          arguments=kwargs)
-
-      # Apply activity regularizer if any:
-      if hasattr(
-          self,
-          'activity_regularizer') and self.activity_regularizer is not None:
-        regularization_losses = [
-            self.activity_regularizer(x) for x in _to_list(output)
-        ]
-        self.add_loss(regularization_losses, _to_list(inputs))
+    # Add an inbound node to the layer, so that it keeps track
+    # of the call and of all new variables created during the call.
+    # This also updates the layer history of the output tensor(s).
+    # If the input tensor(s) had not previous Keras history,
+    # this does nothing.
+    self._add_inbound_node(
+        input_tensors=inputs,
+        output_tensors=output,
+        input_masks=previous_mask,
+        output_masks=output_mask,
+        arguments=user_kwargs)
+
+    # Optionally load weight values that were specified at layer instantiation.
+    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
+      self.set_weights(self._initial_weights)
+      del self._initial_weights
     return output
 
   def _add_inbound_node(self,
@@ -678,7 +552,7 @@ class Layer(object):
                           'but was passed an input_mask: ' + str(mask))
       # masking not explicitly supported: return None as mask
       return None
-    # if masking is explictly supported, by default
+    # if masking is explicitly supported, by default
     # carry over the input mask
     return mask
 
@@ -720,9 +594,9 @@ class Layer(object):
       raise RuntimeError('The layer has never been called '
                          'and thus has no defined ' + attr_name + '.')
     if not len(self.inbound_nodes) > node_index:
-      raise ValueError('Asked to get ' + attr_name + ' at node ' + str(
-          node_index) + ', but the layer has only ' + str(
-              len(self.inbound_nodes)) + ' inbound nodes.')
+      raise ValueError('Asked to get ' + attr_name + ' at node ' +
+                       str(node_index) + ', but the layer has only ' +
+                       str(len(self.inbound_nodes)) + ' inbound nodes.')
     values = getattr(self.inbound_nodes[node_index], attr)
     if len(values) == 1:
       return values[0]
@@ -922,14 +796,14 @@ class Layer(object):
 
   @property
   def input_shape(self):
-    """Retrieves the input shape tuple(s) of a layer.
+    """Retrieves the input shape(s) of a layer.
 
     Only applicable if the layer has exactly one inbound node,
     i.e. if it is connected to one incoming layer.
 
     Returns:
-        Input shape tuple
-        (or list of input shape tuples, one tuple per input tensor).
+        Input shape, as `TensorShape`
+        (or list of `TensorShape`, one tuple per input tensor).
 
     Raises:
         AttributeError: if the layer is connected to
@@ -960,14 +834,14 @@ class Layer(object):
 
   @property
   def output_shape(self):
-    """Retrieves the output shape tuple(s) of a layer.
+    """Retrieves the output shape(s) of a layer.
 
     Only applicable if the layer has one inbound node,
     or if all inbound nodes have the same output shape.
 
     Returns:
-        Output shape tuple
-        (or list of input shape tuples, one tuple per output tensor).
+        Output shape, as `TensorShape`
+        (or list of `TensorShape`, one tuple per output tensor).
 
     Raises:
         AttributeError: if the layer is connected to
@@ -996,114 +870,6 @@ class Layer(object):
                            'Use `get_output_shape_at(node_index)` '
                            'instead.')
 
-  def add_loss(self, losses, inputs=None):
-    """Add losses to the layer.
-
-    The loss may potentially be conditional on some inputs tensors,
-    for instance activity losses are conditional on the layer's inputs.
-
-    Arguments:
-        losses: loss tensor or list of loss tensors
-            to add to the layer.
-        inputs: input tensor or list of inputs tensors to mark
-            the losses as conditional on these inputs.
-            If None is passed, the loss is assumed unconditional
-            (e.g. L2 weight regularization, which only depends
-            on the layer's weights variables, not on any inputs tensors).
-    """
-    if losses is None:
-      return
-    # Update self.losses
-    losses = _to_list(losses)
-    if not hasattr(self, 'losses'):
-      self.losses = []
-    try:
-      self.losses += losses
-    except AttributeError:
-      # In case self.losses isn't settable
-      # (i.e. it's a getter method).
-      # In that case the `losses` property is
-      # auto-computed and shouldn't be set.
-      pass
-    # Update self._per_input_updates
-    if not hasattr(self, '_per_input_losses'):
-      self._per_input_losses = {}
-    if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
-    else:
-      # Updates indexed by None are unconditional
-      # rather than input-dependent
-      inputs_hash = None
-    if inputs_hash not in self._per_input_losses:
-      self._per_input_losses[inputs_hash] = []
-    self._per_input_losses[inputs_hash] += losses
-
-  def add_update(self, updates, inputs=None):
-    """Add updates to the layer.
-
-    The updates may potentially be conditional on some inputs tensors,
-    for instance batch norm updates are conditional on the layer's inputs.
-
-    Arguments:
-        updates: update op or list of update ops
-            to add to the layer.
-        inputs: input tensor or list of inputs tensors to mark
-            the updates as conditional on these inputs.
-            If None is passed, the updates are assumed unconditional.
-    """
-    if updates is None:
-      return
-    # Update self.updates
-    updates = _to_list(updates)
-    if not hasattr(self, 'updates'):
-      self.updates = []
-    try:
-      self.updates += updates
-    except AttributeError:
-      # In case self.updates isn't settable
-      # (i.e. it's a getter method).
-      # In that case the `updates` property is
-      # auto-computed and shouldn't be set.
-      pass
-    # Update self._per_input_updates
-    if not hasattr(self, '_per_input_updates'):
-      self._per_input_updates = {}
-    if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
-    else:
-      # Updates indexed by None are unconditional
-      # rather than input-dependent
-      inputs_hash = None
-    if inputs_hash not in self._per_input_updates:
-      self._per_input_updates[inputs_hash] = []
-    self._per_input_updates[inputs_hash] += updates
-
-  def get_updates_for(self, inputs):
-    if not hasattr(self, '_per_input_updates'):
-      return []
-    if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
-    else:
-      inputs_hash = None
-    if inputs_hash in self._per_input_updates:
-      return self._per_input_updates[inputs_hash]
-    return []
-
-  def get_losses_for(self, inputs):
-    if not hasattr(self, '_per_input_losses'):
-      return []
-    if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
-    else:
-      inputs_hash = None
-    if inputs_hash in self._per_input_losses:
-      return self._per_input_losses[inputs_hash]
-    return []
-
-  @property
-  def weights(self):
-    return self.trainable_weights + self.non_trainable_weights
-
   def set_weights(self, weights):
     """Sets the weights of the layer, from Numpy arrays.
 
@@ -1121,10 +887,10 @@ class Layer(object):
     params = self.weights
     if len(params) != len(weights):
       raise ValueError('You called `set_weights(weights)` on layer "' +
-                       self.name + '" with a  weight list of length ' + str(
-                           len(weights)) + ', but the layer was expecting ' +
-                       str(len(params)) + ' weights. Provided weights: ' + str(
-                           weights)[:50] + '...')
+                       self.name + '" with a  weight list of length ' +
+                       str(len(weights)) + ', but the layer was expecting ' +
+                       str(len(params)) + ' weights. Provided weights: ' +
+                       str(weights)[:50] + '...')
     if not params:
       return
     weight_value_tuples = []
@@ -1237,9 +1003,12 @@ class InputLayer(Layer):
     if not name:
       prefix = 'input'
       name = prefix + '_' + str(K.get_uid(prefix))
+    if not dtype:
+      if input_tensor is None:
+        dtype = K.floatx()
+      else:
+        dtype = K.dtype(input_tensor)
     super(InputLayer, self).__init__(dtype=dtype, name=name)
-
-    self.trainable = False
     self.built = True
     self.sparse = sparse
 
@@ -1267,15 +1036,7 @@ class InputLayer(Layer):
         batch_input_shape = (batch_size,) + tuple(input_shape)
     else:
       batch_input_shape = tuple(batch_input_shape)
-
-    if not dtype:
-      if input_tensor is None:
-        dtype = K.floatx()
-      else:
-        dtype = K.dtype(input_tensor)
-
     self.batch_input_shape = batch_input_shape
-    self.dtype = dtype
 
     if input_tensor is None:
       self.is_placeholder = True
@@ -1417,7 +1178,7 @@ class Container(Layer):
       get_weights
       set_weights
       get_config
-      get_output_shape_for
+      compute_output_shape
 
   # Class Methods
       from_config
@@ -1429,9 +1190,18 @@ class Container(Layer):
       prefix = self.__class__.__name__.lower()
       name = prefix + '_' + str(K.get_uid(prefix))
     self.name = name
-
     self.supports_masking = False
     self.trainable = True
+    self._per_input_losses = {}
+    self._per_input_updates = {}
+
+    # The following properties are not actually used by Keras;
+    # they exist for compatibility with TF.
+    self._updates = []
+    self._scope = None
+    self._reuse = None
+    self._base_name = name
+    self._graph = ops.get_default_graph()
 
     # Container-specific properties.
     if isinstance(inputs, (list, tuple)):
@@ -1482,8 +1252,8 @@ class Container(Layer):
       if not hasattr(x, '_keras_history'):
         cls_name = self.__class__.__name__
         raise TypeError('Input tensors to a ' + cls_name + ' ' +
-                        'must be Keras tensors. Found: ' + str(
-                            x) + ' (missing Keras metadata).')
+                        'must be Keras tensors. Found: ' + str(x) +
+                        ' (missing Keras metadata).')
       # Check that x is an input tensor.
       layer, node_index, tensor_index = x._keras_history
       if len(layer.inbound_nodes) > 1 or (
@@ -1691,8 +1461,8 @@ class Container(Layer):
                                  'cannot obtain value for tensor ' + str(x) +
                                  ' at layer "' + layer.name + '". '
                                  'The following previous layers '
-                                 'were accessed without issue: ' + str(
-                                     layers_with_complete_input))
+                                 'were accessed without issue: ' +
+                                 str(layers_with_complete_input))
           for x in node.output_tensors:
             computable_tensors.append(x)
           layers_with_complete_input.append(layer.name)
@@ -1706,8 +1476,8 @@ class Container(Layer):
     all_names = [layer.name for layer in self.layers]
     for name in all_names:
       if all_names.count(name) != 1:
-        raise RuntimeError('The name "' + name + '" is used ' + str(
-            all_names.count(name)) + ' times in the model. '
+        raise RuntimeError('The name "' + name + '" is used ' +
+                           str(all_names.count(name)) + ' times in the model. '
                            'All layer names should be unique.')
 
     # Layer parameters.
@@ -1756,9 +1526,9 @@ class Container(Layer):
     # without the container being notified of it.
     if index is not None:
       if len(self.layers) <= index:
-        raise ValueError('Was asked to retrieve layer at index ' +
-                         str(index) + ' but model only has ' + str(
-                             len(self.layers)) + ' layers.')
+        raise ValueError('Was asked to retrieve layer at index ' + str(index) +
+                         ' but model only has ' + str(len(self.layers)) +
+                         ' layers.')
       else:
         return self.layers[index]
     else:
@@ -1776,7 +1546,7 @@ class Container(Layer):
     """Retrieve the model's updates.
 
     Will only include updates that are either
-    inconditional, or conditional on inputs to this model
+    unconditional, or conditional on inputs to this model
     (e.g. will not include updates that depend on tensors
     that aren't inputs to this model).
 
@@ -1786,19 +1556,16 @@ class Container(Layer):
     updates = []
     for layer in self.layers:
       if hasattr(layer, 'updates'):
-        if len(layer.inbound_nodes) == 1:
-          updates += layer.updates
-        else:
-          # Collect updates that are dependent on inputs
-          # that are part of the model.
-          for node_index, node in enumerate(layer.inbound_nodes):
-            node_key = layer.name + '_ib-' + str(node_index)
-            if node_key in self.container_nodes:
-              # The model owns this layer node.
-              inputs = node.input_tensors
-              updates += layer.get_updates_for(inputs)
-          # Collect unconditional updates.
-          updates += layer.get_updates_for(None)
+        # Collect updates that are dependent on inputs
+        # that are part of the model.
+        for node_index, node in enumerate(layer.inbound_nodes):
+          node_key = layer.name + '_ib-' + str(node_index)
+          if node_key in self.container_nodes:
+            # The model owns this layer node.
+            inputs = node.input_tensors
+            updates += layer.get_updates_for(inputs)
+        # Collect unconditional updates.
+        updates += layer.get_updates_for(None)
     return updates
 
   @property
@@ -1806,7 +1573,7 @@ class Container(Layer):
     """Retrieve the model's losses.
 
     Will only include losses that are either
-    inconditional, or conditional on inputs to this model
+    unconditional, or conditional on inputs to this model
     (e.g. will not include losses that depend on tensors
     that aren't inputs to this model).
 
@@ -1817,22 +1584,18 @@ class Container(Layer):
     # Retrieve losses for all internal layers.
     for layer in self.layers:
       if hasattr(layer, 'losses'):
-        if len(layer.inbound_nodes) == 1:
-          losses += layer.losses
-        else:
-          # Collect losses that are dependent on inputs
-          # that are part of the model.
-          for node_index, node in enumerate(layer.inbound_nodes):
-            node_key = layer.name + '_ib-' + str(node_index)
-            if node_key in self.container_nodes:
-              # The model owns this layer node.
-              inputs = node.input_tensors
-              losses += layer.get_losses_for(inputs)
-          # Collect unconditional losses.
-          losses += layer.get_losses_for(None)
+        # Collect losses that are dependent on inputs
+        # that are part of the model.
+        for node_index, node in enumerate(layer.inbound_nodes):
+          node_key = layer.name + '_ib-' + str(node_index)
+          if node_key in self.container_nodes:
+            # The model owns this layer node.
+            inputs = node.input_tensors
+            losses += layer.get_losses_for(inputs)
+        # Collect unconditional losses.
+        losses += layer.get_losses_for(None)
     # Add any potential unconditional model-level loss.
-    if hasattr(self, '_per_input_losses'):
-      losses += self._per_input_losses.get(None, [])
+    losses += self.get_losses_for(None)
     return losses
 
   @property
@@ -2009,9 +1772,9 @@ class Container(Layer):
         input_shapes = [None]
 
     if len(input_shapes) != len(self.input_layers):
-      raise ValueError('Invalid input_shape argument ' +
-                       str(input_shape) + ': model has ' + str(
-                           len(self.input_layers)) + ' tensor inputs.')
+      raise ValueError('Invalid input_shape argument ' + str(input_shape) +
+                       ': model has ' + str(len(self.input_layers)) +
+                       ' tensor inputs.')
 
     cache_key = ','.join([str(x) for x in input_shapes])
     if cache_key in self._output_shape_cache:
@@ -2029,7 +1792,7 @@ class Container(Layer):
       for i in range(len(input_shapes)):
         layer = self.input_layers[i]
         input_shape = input_shapes[i]
-        # It's an input layer: get_output_shape_for is identity,
+        # It's an input layer: compute_output_shape is identity,
         # and there is only one node and one tensor output.
         shape_key = layer.name + '_0_0'
         layers_to_output_shapes[shape_key] = input_shape
@@ -2144,6 +1907,7 @@ class Container(Layer):
         for x in reference_input_tensors:
           if str(id(x)) in tensor_map:
             computed_data.append(tensor_map[str(id(x))])
+
         if len(computed_data) == len(reference_input_tensors):
           # call layer
           with K.name_scope(layer.name):
@@ -2153,7 +1917,7 @@ class Container(Layer):
               kwargs = {}
             if len(computed_data) == 1:
               computed_tensor, computed_mask = computed_data[0]
-              if 'mask' in inspect.getargspec(layer.call).args:
+              if 'mask' in tf_inspect.getargspec(layer.call).args:
                 if 'mask' not in kwargs:
                   kwargs['mask'] = computed_mask
               output_tensors = _to_list(layer.call(computed_tensor, **kwargs))
@@ -2164,23 +1928,30 @@ class Container(Layer):
             else:
               computed_tensors = [x[0] for x in computed_data]
               computed_masks = [x[1] for x in computed_data]
-              if 'mask' in inspect.getargspec(layer.call).args:
+              if 'mask' in tf_inspect.getargspec(layer.call).args:
                 if 'mask' not in kwargs:
                   kwargs['mask'] = computed_masks
               output_tensors = _to_list(layer.call(computed_tensors, **kwargs))
               output_masks = _to_list(
                   layer.compute_mask(computed_tensors, computed_masks))
 
+            # Apply activity regularizer if any:
+            if hasattr(layer, 'activity_regularizer'
+                      ) and layer.activity_regularizer is not None:
+              regularization_losses = [
+                  layer.activity_regularizer(x) for x in computed_tensors
+              ]
+              layer.add_loss(regularization_losses, computed_tensors)
+
           # Update model updates and losses:
-          layer_inputs = [x[0] for x in computed_data]
           # Keep track of updates that depend on the inputs
           # (e.g. BN updates).
-          self.add_update(layer.get_updates_for(layer_inputs), inputs)
+          self.add_update(layer.get_updates_for(computed_tensors), inputs)
           # Keep track of unconditional updates (e.g. a counter).
           self.add_update(layer.get_updates_for(None), None)
           # Keep track of losses that depend on the inputs
           # (e.g. activity regularizers).
-          self.add_loss(layer.get_losses_for(layer_inputs), inputs)
+          self.add_loss(layer.get_losses_for(computed_tensors), inputs)
           # Keep track of unconditional losses
           # (e.g. weight regularizers).
           self.add_loss(layer.get_losses_for(None), None)
@@ -2404,7 +2175,7 @@ class Container(Layer):
       output_tensors.append(layer_output_tensors[tensor_index])
     return cls(inputs=input_tensors, outputs=output_tensors, name=name)
 
-  def save(self, filepath, overwrite=True):
+  def save(self, filepath, overwrite=True, include_optimizer=True):
     """Save the model to a single HDF5 file.
 
     The savefile includes:
@@ -2425,6 +2196,7 @@ class Container(Layer):
         filepath: String, path to the file to save the weights to.
         overwrite: Whether to silently overwrite any existing file at the
             target location, or provide the user with a manual prompt.
+        include_optimizer: If True, save optimizer's state together.
 
     Example:
 
@@ -2440,7 +2212,7 @@ class Container(Layer):
     ```
     """
     from tensorflow.contrib.keras.python.keras.models import save_model  # pylint: disable=g-import-not-at-top
-    save_model(self, filepath, overwrite)
+    save_model(self, filepath, overwrite, include_optimizer)
 
   def save_weights(self, filepath, overwrite=True):
     """Dumps all layer weights to a HDF5 file.
@@ -2869,9 +2641,9 @@ def load_weights_from_hdf5_group(f, layers):
   layer_names = filtered_layer_names
   if len(layer_names) != len(filtered_layers):
     raise ValueError('You are trying to load a weight file '
-                     'containing ' + str(len(
-                         layer_names)) + ' layers into a model with ' + str(
-                             len(filtered_layers)) + ' layers.')
+                     'containing ' + str(len(layer_names)) +
+                     ' layers into a model with ' + str(len(filtered_layers)) +
+                     ' layers.')
 
   # We batch weight value assignments in a single backend call
   # which provides a speedup in TensorFlow.
@@ -2890,8 +2662,8 @@ def load_weights_from_hdf5_group(f, layers):
                        'correspond to layer ' + name + ' in the save file. '
                        'However the new layer ' + layer.name + ' expects ' +
                        str(len(symbolic_weights)) +
-                       ' weights, but the saved weights have ' + str(
-                           len(weight_values)) + ' elements.')
+                       ' weights, but the saved weights have ' +
+                       str(len(weight_values)) + ' elements.')
     weight_value_tuples += zip(symbolic_weights, weight_values)
   K.batch_set_value(weight_value_tuples)
 
@@ -2942,10 +2714,10 @@ def load_weights_from_hdf5_group_by_name(f, layers):
       weight_values = preprocess_weights_for_loading(
           layer, weight_values, original_keras_version, original_backend)
       if len(weight_values) != len(symbolic_weights):
-        raise ValueError('Layer #' + str(
-            k) + ' (named "' + layer.name + '") expects ' + str(
-                len(symbolic_weights)) + ' weight(s), but the saved weights' +
-                         ' have ' + str(len(weight_values)) + ' element(s).')
+        raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
+                         '") expects ' + str(len(symbolic_weights)) +
+                         ' weight(s), but the saved weights' + ' have ' +
+                         str(len(weight_values)) + ' element(s).')
       # Set values.
       for i in range(len(weight_values)):
         weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
diff --git a/tensorflow/contrib/keras/python/keras/engine/topology_test.py b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
index eb095b14a9742995249de21c8d4613409896c988..531ed4be3e3672eb45f982ff6d9bb471bf47d7cc 100644
--- a/tensorflow/contrib/keras/python/keras/engine/topology_test.py
+++ b/tensorflow/contrib/keras/python/keras/engine/topology_test.py
@@ -490,8 +490,8 @@ class TopologyConstructionTest(test.TestCase):
     m, n = model([j, k])
     tf_model = keras.models.Model([j, k], [m, n])
 
-    j_tf = array_ops.placeholder(dtype=dtypes.float32)
-    k_tf = array_ops.placeholder(dtype=dtypes.float32)
+    j_tf = array_ops.placeholder(dtype=dtypes.float32, shape=(None, 32))
+    k_tf = array_ops.placeholder(dtype=dtypes.float32, shape=(None, 32))
     m_tf, n_tf = tf_model([j_tf, k_tf])
     self.assertListEqual(m_tf.get_shape().as_list(), [None, 64])
     self.assertListEqual(n_tf.get_shape().as_list(), [None, 5])
diff --git a/tensorflow/contrib/keras/python/keras/engine/training.py b/tensorflow/contrib/keras/python/keras/engine/training.py
index efd437f6f66faf1dc081ce041168426648b9dc51..96d1c2f262259a0cd7030736997d9501468e2075 100644
--- a/tensorflow/contrib/keras/python/keras/engine/training.py
+++ b/tensorflow/contrib/keras/python/keras/engine/training.py
@@ -84,14 +84,14 @@ def _standardize_input_data(data,
   elif isinstance(data, list):
     if len(data) != len(names):
       if data and hasattr(data[0], 'shape'):
-        raise ValueError('Error when checking ' + exception_prefix +
-                         ': the list of Numpy arrays '
-                         'that you are passing to your model '
-                         'is not the size the model expected. '
-                         'Expected to see ' + str(len(
-                             names)) + ' arrays but instead got '
-                         'the following list of ' + str(len(
-                             data)) + ' arrays: ' + str(data)[:200] + '...')
+        raise ValueError(
+            'Error when checking ' + exception_prefix +
+            ': the list of Numpy arrays '
+            'that you are passing to your model '
+            'is not the size the model expected. '
+            'Expected to see ' + str(len(names)) + ' arrays but instead got '
+            'the following list of ' + str(len(data)) + ' arrays: ' +
+            str(data)[:200] + '...')
       else:
         if len(names) == 1:
           data = [np.asarray(data)]
@@ -100,8 +100,8 @@ def _standardize_input_data(data,
                            ': you are passing a list as '
                            'input to your model, '
                            'but the model expects '
-                           'a list of ' + str(len(
-                               names)) + ' Numpy arrays instead. '
+                           'a list of ' + str(len(names)) +
+                           ' Numpy arrays instead. '
                            'The list you passed was: ' + str(data)[:200])
     arrays = data
   else:
@@ -133,8 +133,8 @@ def _standardize_input_data(data,
       array = arrays[i]
       if len(array.shape) != len(shapes[i]):
         raise ValueError(
-            'Error when checking ' + exception_prefix + ': expected ' + names[
-                i] + ' to have ' + str(len(shapes[i])) +
+            'Error when checking ' + exception_prefix + ': expected ' + names[i]
+            + ' to have ' + str(len(shapes[i])) +
             ' dimensions, but got array with shape ' + str(array.shape))
       for j, (dim, ref_dim) in enumerate(zip(array.shape, shapes[i])):
         if not j and not check_batch_axis:
@@ -143,8 +143,8 @@ def _standardize_input_data(data,
         if ref_dim:
           if ref_dim != dim:
             raise ValueError('Error when checking ' + exception_prefix +
-                             ': expected ' + names[i] + ' to have shape ' + str(
-                                 shapes[i]) + ' but got array with shape ' +
+                             ': expected ' + names[i] + ' to have shape ' +
+                             str(shapes[i]) + ' but got array with shape ' +
                              str(array.shape))
   return arrays
 
@@ -175,9 +175,9 @@ def _standardize_sample_or_class_weights(x_weight, output_names, weight_type):
       return [x_weight]
   if isinstance(x_weight, list):
     if len(x_weight) != len(output_names):
-      raise ValueError('Provided `' + weight_type + '` was a list of ' + str(
-          len(x_weight)) + ' elements, but the model has ' + str(
-              len(output_names)) + ' outputs. '
+      raise ValueError('Provided `' + weight_type + '` was a list of ' +
+                       str(len(x_weight)) + ' elements, but the model has ' +
+                       str(len(output_names)) + ' outputs. '
                        'You should provide one `' + weight_type + '`'
                        'array per model output.')
     return x_weight
@@ -239,13 +239,13 @@ def _check_array_lengths(inputs, targets, weights):
                      'and ' + str(list(set_y)[0]) + ' target samples.')
   if set_y and set_w and list(set_y)[0] != list(set_w)[0]:
     raise ValueError('Sample_weight arrays should have '
-                     'the same number of samples as target arrays. Got ' + str(
-                         list(set_y)[0]) + ' input samples and ' + str(
-                             list(set_w)[0]) + ' target samples.')
+                     'the same number of samples as target arrays. Got ' +
+                     str(list(set_y)[0]) + ' input samples and ' +
+                     str(list(set_w)[0]) + ' target samples.')
 
 
 def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
-  """Does validation on the compatiblity of targets and loss functions.
+  """Does validation on the compatibility of targets and loss functions.
 
   This helps prevent users from using loss functions incorrectly.
 
@@ -284,8 +284,8 @@ def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
     if loss.__name__ in key_losses:
       for target_dim, out_dim in zip(y.shape[1:], shape[1:]):
         if out_dim is not None and target_dim != out_dim:
-          raise ValueError('A target array with shape ' + str(
-              y.shape) + ' was passed for an output of shape ' + str(shape) +
+          raise ValueError('A target array with shape ' + str(y.shape) +
+                           ' was passed for an output of shape ' + str(shape) +
                            ' while using as loss `' + loss.__name__ + '`. '
                            'This loss expects '
                            'targets to have the same shape '
@@ -535,14 +535,14 @@ def _standardize_weights(y,
                        'outputs that are at least 3D, i.e. that have '
                        'a time dimension.')
     if sample_weight is not None and len(sample_weight.shape) != 2:
-      raise ValueError('Found a sample_weight array with shape ' + str(
-          sample_weight.shape) + '. '
+      raise ValueError('Found a sample_weight array with shape ' +
+                       str(sample_weight.shape) + '. '
                        'In order to use timestep-wise sample weighting, '
                        'you should pass a 2D sample_weight array.')
   else:
     if sample_weight is not None and len(sample_weight.shape) != 1:
-      raise ValueError('Found a sample_weight array with shape ' + str(
-          sample_weight.shape) + '. '
+      raise ValueError('Found a sample_weight array with shape ' +
+                       str(sample_weight.shape) + '. '
                        'In order to use timestep-wise sample weights, '
                        'you should specify '
                        'sample_weight_mode="temporal" '
@@ -552,15 +552,15 @@ def _standardize_weights(y,
 
   if sample_weight is not None:
     if len(sample_weight.shape) > len(y.shape):
-      raise ValueError('Found a sample_weight with shape' + str(
-          sample_weight.shape) + '.'
+      raise ValueError('Found a sample_weight with shape' +
+                       str(sample_weight.shape) + '.'
                        'Expected sample_weight with rank '
                        'less than or equal to ' + str(len(y.shape)))
 
     if y.shape[:sample_weight.ndim] != sample_weight.shape:
-      raise ValueError('Found a sample_weight array with shape ' + str(
-          sample_weight.shape) + ' for an input with shape ' + str(y.shape) +
-                       '. '
+      raise ValueError('Found a sample_weight array with shape ' +
+                       str(sample_weight.shape) + ' for an input with shape ' +
+                       str(y.shape) + '. '
                        'sample_weight cannot be broadcast.')
     return sample_weight
   elif isinstance(class_weight, dict):
@@ -728,16 +728,17 @@ class Model(Container):
         if name not in self.output_names:
           raise ValueError('Unknown entry in loss '
                            'dictionary: "' + name + '". '
-                           'Only expected the following keys: ' + str(
-                               self.output_names))
+                           'Only expected the following keys: ' +
+                           str(self.output_names))
       loss_functions = []
       for name in self.output_names:
         if name not in loss:
-          warnings.warn('Output "' + name + '" missing from loss dictionary. '
-                        'We assume this was done on purpose, '
-                        'and we will not be expecting '
-                        'any data to be passed to "' + name +
-                        '" during training.')
+          warnings.warn(
+              'Output "' + name + '" missing from loss dictionary. '
+              'We assume this was done on purpose, '
+              'and we will not be expecting '
+              'any data to be passed to "' + name + '" during training.',
+              stacklevel=2)
         loss_functions.append(losses.get(loss.get(name)))
     elif isinstance(loss, list):
       if len(loss) != len(self.outputs):
@@ -780,8 +781,8 @@ class Model(Container):
         if name not in self.output_names:
           raise ValueError('Unknown entry in loss_weights '
                            'dictionary: "' + name + '". '
-                           'Only expected the following keys: ' + str(
-                               self.output_names))
+                           'Only expected the following keys: ' +
+                           str(self.output_names))
       loss_weights_list = []
       for name in self.output_names:
         loss_weights_list.append(loss_weights.get(name, 1.))
@@ -790,12 +791,12 @@ class Model(Container):
         raise ValueError('When passing a list as loss_weights, '
                          'it should have one entry per model outputs. '
                          'The model has ' + str(len(self.outputs)) +
-                         ' outputs, but you passed loss_weights=' + str(
-                             loss_weights))
+                         ' outputs, but you passed loss_weights=' +
+                         str(loss_weights))
       loss_weights_list = loss_weights
     else:
-      raise TypeError('Could not interpret loss_weights argument: ' + str(
-          loss_weights) + ' - expected a list of dicts.')
+      raise TypeError('Could not interpret loss_weights argument: ' +
+                      str(loss_weights) + ' - expected a list of dicts.')
 
     # Prepare sample weights.
     sample_weights = []
@@ -805,8 +806,8 @@ class Model(Container):
         if name not in self.output_names:
           raise ValueError('Unknown entry in '
                            'sample_weight_mode dictionary: "' + name + '". '
-                           'Only expected the following keys: ' + str(
-                               self.output_names))
+                           'Only expected the following keys: ' +
+                           str(self.output_names))
       for i, name in enumerate(self.output_names):
         if i in skip_indices:
           weight = None
@@ -940,8 +941,8 @@ class Model(Container):
           # (because of class mode duality)
           output_shape = self.internal_output_shapes[i]
           acc_fn = None
-          if output_shape[-1] == 1 or self.loss_functions[
-              i] == losses.binary_crossentropy:
+          if (output_shape[-1] == 1 or
+              self.loss_functions[i] == losses.binary_crossentropy):
             # case: binary accuracy
             acc_fn = metrics_module.binary_accuracy
           elif self.loss_functions[i] == losses.sparse_categorical_crossentropy:
@@ -1202,7 +1203,7 @@ class Model(Container):
       if batch_index == 0:
         for batch_out in batch_outs:
           shape = (samples,) + batch_out.shape[1:]
-          outs.append(np.zeros(shape, dtype=K.floatx()))
+          outs.append(np.zeros(shape, dtype=batch_out.dtype))
 
       for i, batch_out in enumerate(batch_outs):
         outs[i][batch_start:batch_end] = batch_out
@@ -1320,8 +1321,8 @@ class Model(Container):
         raise ValueError('In a stateful network, '
                          'you should only pass inputs with '
                          'a number of samples that can be '
-                         'divided by the batch size. Found: ' + str(
-                             x[0].shape[0]) + ' samples')
+                         'divided by the batch size. Found: ' +
+                         str(x[0].shape[0]) + ' samples')
     return x, y, sample_weights
 
   def fit(self,
@@ -1412,11 +1413,11 @@ class Model(Container):
       elif len(validation_data) == 3:
         val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
       else:
-        raise ValueError('When passing validation_data, '
-                         'it must contain 2 (x_val, y_val) '
-                         'or 3 (x_val, y_val, val_sample_weights) '
-                         'items, however it contains %d items' %
-                         len(validation_data))
+        raise ValueError(
+            'When passing validation_data, '
+            'it must contain 2 (x_val, y_val) '
+            'or 3 (x_val, y_val, val_sample_weights) '
+            'items, however it contains %d items' % len(validation_data))
 
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
@@ -1567,8 +1568,8 @@ class Model(Container):
         raise ValueError('In a stateful network, '
                          'you should only pass inputs with '
                          'a number of samples that can be '
-                         'divided by the batch size. Found: ' + str(
-                             x[0].shape[0]) + ' samples. '
+                         'divided by the batch size. Found: ' +
+                         str(x[0].shape[0]) + ' samples. '
                          'Batch size: ' + str(batch_size) + '.')
 
     # prepare inputs, delegate logic to _predict_loop
@@ -1718,7 +1719,7 @@ class Model(Container):
             - a tuple (inputs, targets, sample_weights).
             All arrays should contain the same number of samples.
             The generator is expected to loop over its data
-            indefinitely. An epoch finishes when `samples_per_epoch`
+            indefinitely. An epoch finishes when `steps_per_epoch`
             samples have been seen by the model.
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
@@ -1767,7 +1768,7 @@ class Model(Container):
                 f.close()
 
         model.fit_generator(generate_arrays_from_file('/my_file.txt'),
-                            samples_per_epoch=10000, epochs=10)
+                            steps_per_epoch=10000, epochs=10)
     ```
 
     Raises:
@@ -2028,7 +2029,8 @@ class Model(Container):
                         steps,
                         max_q_size=10,
                         workers=1,
-                        pickle_safe=False):
+                        pickle_safe=False,
+                        verbose=0):
     """Generates predictions for the input samples from a data generator.
 
     The generator should return the same kind of data as accepted by
@@ -2048,6 +2050,7 @@ class Model(Container):
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
+        verbose: verbosity mode, 0 or 1.
 
     Returns:
         Numpy array(s) of predictions.
@@ -2067,6 +2070,9 @@ class Model(Container):
       enqueuer = GeneratorEnqueuer(generator, pickle_safe=pickle_safe)
       enqueuer.start(workers=workers, max_q_size=max_q_size)
 
+      if verbose == 1:
+        progbar = Progbar(target=steps)
+
       while steps_done < steps:
         generator_output = None
         while enqueuer.is_running():
@@ -2103,6 +2109,8 @@ class Model(Container):
         for i, out in enumerate(outs):
           all_outs[i].append(out)
         steps_done += 1
+        if verbose == 1:
+          progbar.update(steps_done)
 
     finally:
       if enqueuer is not None:
diff --git a/tensorflow/contrib/keras/python/keras/initializers.py b/tensorflow/contrib/keras/python/keras/initializers.py
index 621069f424bd6380920e89c4c7497196aad3f5f1..b0b71e7cb4b1dac6f7edbec45d0cb8760dd40f86 100644
--- a/tensorflow/contrib/keras/python/keras/initializers.py
+++ b/tensorflow/contrib/keras/python/keras/initializers.py
@@ -18,244 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as np
 import six
 
-from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.framework import tensor_shape
-
-
-class Initializer(object):
-  """Initializer base class: all initializers inherit from this class.
-  """
-
-  def __call__(self, shape, dtype=None):
-    raise NotImplementedError
-
-  def get_config(self):
-    return {}
-
-  @classmethod
-  def from_config(cls, config):
-    return cls(**config)
-
-
-class Zeros(Initializer):
-  """Initializer that generates tensors initialized to 0."""
-
-  def __call__(self, shape, dtype=None):
-    return K.constant(0, shape=shape, dtype=dtype)
-
-
-class Ones(Initializer):
-  """Initializer that generates tensors initialized to 1."""
-
-  def __call__(self, shape, dtype=None):
-    return K.constant(1, shape=shape, dtype=dtype)
-
-
-class Constant(Initializer):
-  """Initializer that generates tensors initialized to a constant value.
-
-  Arguments:
-      value: float; the value of the generator tensors.
-  """
-
-  def __init__(self, value=0):
-    self.value = value
-
-  def __call__(self, shape, dtype=None):
-    return K.constant(self.value, shape=shape, dtype=dtype)
-
-  def get_config(self):
-    return {'value': self.value}
-
-
-class RandomNormal(Initializer):
-  """Initializer that generates tensors with a normal distribution.
-
-  Arguments:
-      mean: a python scalar or a scalar tensor. Mean of the random values
-        to generate.
-      stddev: a python scalar or a scalar tensor. Standard deviation of the
-        random values to generate.
-      seed: A Python integer. Used to seed the random generator.
-  """
-
-  def __init__(self, mean=0., stddev=0.05, seed=None):
-    self.mean = mean
-    self.stddev = stddev
-    self.seed = seed
-
-  def __call__(self, shape, dtype=None):
-    return K.random_normal(
-        shape, self.mean, self.stddev, dtype=dtype, seed=self.seed)
-
-  def get_config(self):
-    return {'mean': self.mean, 'stddev': self.stddev, 'seed': self.seed}
-
-
-class RandomUniform(Initializer):
-  """Initializer that generates tensors with a uniform distribution.
-
-  Arguments:
-      minval: A python scalar or a scalar tensor. Lower bound of the range
-        of random values to generate.
-      maxval: A python scalar or a scalar tensor. Upper bound of the range
-        of random values to generate.  Defaults to 1 for float types.
-      seed: A Python integer. Used to seed the random generator.
-  """
-
-  def __init__(self, minval=-0.05, maxval=0.05, seed=None):
-    self.minval = minval
-    self.maxval = maxval
-    self.seed = seed
-
-  def __call__(self, shape, dtype=None):
-    return K.random_uniform(
-        shape, self.minval, self.maxval, dtype=dtype, seed=self.seed)
-
-  def get_config(self):
-    return {
-        'minval': self.minval,
-        'maxval': self.maxval,
-        'seed': self.seed,
-    }
-
-
-class TruncatedNormal(Initializer):
-  """Initializer that generates a truncated normal distribution.
-
-  These values are similar to values from a `random_normal_initializer`
-  except that values more than two standard deviations from the mean
-  are discarded and re-drawn. This is the recommended initializer for
-  neural network weights and filters.
-
-  Arguments:
-      mean: a python scalar or a scalar tensor. Mean of the random values
-        to generate.
-      stddev: a python scalar or a scalar tensor. Standard deviation of the
-        random values to generate.
-      seed: A Python integer. Used to seed the random generator.
-  """
-
-  def __init__(self, mean=0., stddev=0.05, seed=None):
-    self.mean = mean
-    self.stddev = stddev
-    self.seed = seed
-
-  def __call__(self, shape, dtype=None):
-    return K.truncated_normal(
-        shape, self.mean, self.stddev, dtype=dtype, seed=self.seed)
-
-  def get_config(self):
-    return {'mean': self.mean, 'stddev': self.stddev, 'seed': self.seed}
-
-
-class VarianceScaling(Initializer):
-  """Initializer capable of adapting its scale to the shape of weights.
-
-  With `distribution="normal"`, samples are drawn from a truncated normal
-  distribution centered on zero, with `stddev = sqrt(scale / n)` where n is:
-      - number of input units in the weight tensor, if mode = "fan_in"
-      - number of output units, if mode = "fan_out"
-      - average of the numbers of input and output units, if mode = "fan_avg"
-
-  With `distribution="uniform"`,
-  samples are drawn from a uniform distribution
-  within [-limit, limit], with `limit = sqrt(3 * scale / n)`.
-
-  Arguments:
-      scale: Scaling factor (positive float).
-      mode: One of "fan_in", "fan_out", "fan_avg".
-      distribution: Random distribution to use. One of "normal", "uniform".
-      seed: A Python integer. Used to seed the random generator.
-
-  Raises:
-      ValueError: In case of an invalid value for the "scale", mode" or
-        "distribution" arguments.
-  """
-
-  def __init__(self, scale=1.0, mode='fan_in', distribution='normal',
-               seed=None):
-    if scale <= 0.:
-      raise ValueError('`scale` must be a positive float. Got:', scale)
-    mode = mode.lower()
-    if mode not in {'fan_in', 'fan_out', 'fan_avg'}:
-      raise ValueError('Invalid `mode` argument: '
-                       'expected on of {"fan_in", "fan_out", "fan_avg"} '
-                       'but got', mode)
-    distribution = distribution.lower()
-    if distribution not in {'normal', 'uniform'}:
-      raise ValueError('Invalid `distribution` argument: '
-                       'expected one of {"normal", "uniform"} '
-                       'but got', distribution)
-    self.scale = scale
-    self.mode = mode
-    self.distribution = distribution
-    self.seed = seed
-
-  def __call__(self, shape, dtype=None):
-    fan_in, fan_out = _compute_fans(shape)
-    scale = self.scale
-    if self.mode == 'fan_in':
-      scale /= max(1., fan_in)
-    elif self.mode == 'fan_out':
-      scale /= max(1., fan_out)
-    else:
-      scale /= max(1., float(fan_in + fan_out) / 2)
-    if self.distribution == 'normal':
-      stddev = math.sqrt(scale)
-      return K.truncated_normal(shape, 0., stddev, dtype=dtype, seed=self.seed)
-    else:
-      limit = math.sqrt(3. * scale)
-      return K.random_uniform(shape, -limit, limit, dtype=dtype, seed=self.seed)
-
-  def get_config(self):
-    return {
-        'scale': self.scale,
-        'mode': self.mode,
-        'distribution': self.distribution,
-        'seed': self.seed
-    }
-
-
-class Orthogonal(Initializer):
-  """Initializer that generates a random orthogonal matrix.
-
-  Arguments:
-      gain: Multiplicative factor to apply to the orthogonal matrix.
-      seed: A Python integer. Used to seed the random generator.
-
-  References:
-      Saxe et al., http://arxiv.org/abs/1312.6120
-  """
-
-  def __init__(self, gain=1., seed=None):
-    self.gain = gain
-    self.seed = seed
-
-  def __call__(self, shape, dtype=None):
-    num_rows = 1
-    for dim in shape[:-1]:
-      num_rows *= dim
-    num_cols = shape[-1]
-    flat_shape = (num_rows, num_cols)
-    if self.seed is not None:
-      np.random.seed(self.seed)
-    a = np.random.normal(0.0, 1.0, flat_shape)
-    u, _, v = np.linalg.svd(a, full_matrices=False)
-    # Pick the one with the correct shape.
-    q = u if u.shape == flat_shape else v
-    q = q.reshape(shape)
-    return self.gain * q[:shape[0], :shape[1]]
-
-  def get_config(self):
-    return {'gain': self.gain, 'seed': self.seed}
+from tensorflow.python.ops.init_ops import Constant
+from tensorflow.python.ops.init_ops import Initializer
+from tensorflow.python.ops.init_ops import Ones
+from tensorflow.python.ops.init_ops import Orthogonal
+from tensorflow.python.ops.init_ops import RandomNormal
+from tensorflow.python.ops.init_ops import RandomUniform
+from tensorflow.python.ops.init_ops import TruncatedNormal
+from tensorflow.python.ops.init_ops import VarianceScaling
+from tensorflow.python.ops.init_ops import Zeros
 
 
 class Identity(Initializer):
@@ -403,47 +179,6 @@ orthogonal = Orthogonal
 # Utility functions
 
 
-def _compute_fans(shape, data_format='channels_last'):
-  """Computes the number of input and output units for a weight shape.
-
-  Arguments:
-      shape: Integer shape tuple.
-      data_format: Image data format to use for convolution kernels.
-          Note that all kernels in Keras are standardized on the
-          `channels_last` ordering (even when inputs are set
-          to `channels_first`).
-
-  Returns:
-      A tuple of scalars, `(fan_in, fan_out)`.
-
-  Raises:
-      ValueError: in case of invalid `data_format` argument.
-  """
-  shape = tensor_shape.TensorShape(shape).as_list()
-  if len(shape) == 2:
-    fan_in = shape[0]
-    fan_out = shape[1]
-  elif len(shape) in {3, 4, 5}:
-    # Assuming convolution kernels (1D, 2D or 3D).
-    # TH kernel shape: (depth, input_depth, ...)
-    # TF kernel shape: (..., input_depth, depth)
-    if data_format == 'channels_first':
-      receptive_field_size = np.prod(shape[2:])
-      fan_in = shape[1] * receptive_field_size
-      fan_out = shape[0] * receptive_field_size
-    elif data_format == 'channels_last':
-      receptive_field_size = np.prod(shape[:2])
-      fan_in = shape[-2] * receptive_field_size
-      fan_out = shape[-1] * receptive_field_size
-    else:
-      raise ValueError('Invalid data_format: ' + data_format)
-  else:
-    # No specific assumptions.
-    fan_in = math.sqrt(np.prod(shape))
-    fan_out = math.sqrt(np.prod(shape))
-  return fan_in, fan_out
-
-
 def serialize(initializer):
   return serialize_keras_object(initializer)
 
diff --git a/tensorflow/contrib/keras/python/keras/initializers_test.py b/tensorflow/contrib/keras/python/keras/initializers_test.py
index 7436fbb39043e3a59c28e03c62ab85186f788d84..0a07eddd89ac68f2f408e419f92e189ad525ff1f 100644
--- a/tensorflow/contrib/keras/python/keras/initializers_test.py
+++ b/tensorflow/contrib/keras/python/keras/initializers_test.py
@@ -21,121 +21,132 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.keras.python import keras
+from tensorflow.python.ops import init_ops
 from tensorflow.python.platform import test
 
 
-def _runner(init, shape, target_mean=None, target_std=None,
-            target_max=None, target_min=None):
-  variable = keras.backend.variable(init(shape))
-  output = keras.backend.get_value(variable)
-  lim = 3e-2
-  if target_std is not None:
-    assert abs(output.std() - target_std) < lim, output.std()
-  if target_mean is not None:
-    assert abs(output.mean() - target_mean) < lim, output.mean()
-  if target_max is not None:
-    assert abs(output.max() - target_max) < lim, output.max()
-  if target_min is not None:
-    assert abs(output.min() - target_min) < lim, output.min()
-
-
 class KerasInitializersTest(test.TestCase):
 
+  def _runner(self, init, shape, target_mean=None, target_std=None,
+              target_max=None, target_min=None):
+    variable = keras.backend.variable(init(shape))
+    output = keras.backend.get_value(variable)
+    lim = 3e-2
+    if target_std is not None:
+      self.assertGreater(lim, abs(output.std() - target_std))
+    if target_mean is not None:
+      self.assertGreater(lim, abs(output.mean() - target_mean))
+    if target_max is not None:
+      self.assertGreater(lim, abs(output.max() - target_max))
+    if target_min is not None:
+      self.assertGreater(lim, abs(output.min() - target_min))
+
+    # Test serialization (assumes deterministic behavior).
+    config = init.get_config()
+    reconstructed_init = init.__class__.from_config(config)
+    variable = keras.backend.variable(reconstructed_init(shape))
+    output_2 = keras.backend.get_value(variable)
+    self.assertAllClose(output, output_2, atol=1e-4)
+
   def test_uniform(self):
     tensor_shape = (9, 6, 7)
     with self.test_session():
-      _runner(keras.initializers.RandomUniform(minval=-1, maxval=1, seed=124),
-              tensor_shape,
-              target_mean=0., target_max=1, target_min=-1)
+      self._runner(keras.initializers.RandomUniform(minval=-1,
+                                                    maxval=1,
+                                                    seed=124),
+                   tensor_shape,
+                   target_mean=0., target_max=1, target_min=-1)
 
   def test_normal(self):
     tensor_shape = (8, 12, 99)
     with self.test_session():
-      _runner(keras.initializers.RandomNormal(mean=0, stddev=1, seed=153),
-              tensor_shape,
-              target_mean=0., target_std=1)
+      self._runner(keras.initializers.RandomNormal(mean=0, stddev=1, seed=153),
+                   tensor_shape,
+                   target_mean=0., target_std=1)
 
   def test_truncated_normal(self):
     tensor_shape = (12, 99, 7)
     with self.test_session():
-      _runner(keras.initializers.TruncatedNormal(mean=0, stddev=1, seed=126),
-              tensor_shape,
-              target_mean=0., target_std=None, target_max=2)
+      self._runner(keras.initializers.TruncatedNormal(mean=0,
+                                                      stddev=1,
+                                                      seed=126),
+                   tensor_shape,
+                   target_mean=0., target_std=None, target_max=2)
 
   def test_constant(self):
     tensor_shape = (5, 6, 4)
     with self.test_session():
-      _runner(keras.initializers.Constant(2), tensor_shape,
-              target_mean=2, target_max=2, target_min=2)
+      self._runner(keras.initializers.Constant(2), tensor_shape,
+                   target_mean=2, target_max=2, target_min=2)
 
   def test_lecun_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
-      fan_in, _ = keras.initializers._compute_fans(tensor_shape)
+      fan_in, _ = init_ops._compute_fans(tensor_shape)
       scale = np.sqrt(3. / fan_in)
-      _runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
-              target_mean=0., target_max=scale, target_min=-scale)
+      self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
+                   target_mean=0., target_max=scale, target_min=-scale)
 
   def test_glorot_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
-      fan_in, fan_out = keras.initializers._compute_fans(tensor_shape)
+      fan_in, fan_out = init_ops._compute_fans(tensor_shape)
       scale = np.sqrt(6. / (fan_in + fan_out))
-      _runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
-              target_mean=0., target_max=scale, target_min=-scale)
+      self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
+                   target_mean=0., target_max=scale, target_min=-scale)
 
   def test_he_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
-      fan_in, _ = keras.initializers._compute_fans(tensor_shape)
+      fan_in, _ = init_ops._compute_fans(tensor_shape)
       scale = np.sqrt(6. / fan_in)
-      _runner(keras.initializers.he_uniform(seed=123), tensor_shape,
-              target_mean=0., target_max=scale, target_min=-scale)
+      self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
+                   target_mean=0., target_max=scale, target_min=-scale)
 
   def test_glorot_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
-      fan_in, fan_out = keras.initializers._compute_fans(tensor_shape)
+      fan_in, fan_out = init_ops._compute_fans(tensor_shape)
       scale = np.sqrt(2. / (fan_in + fan_out))
-      _runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
-              target_mean=0., target_std=None, target_max=2 * scale)
+      self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
+                   target_mean=0., target_std=None, target_max=2 * scale)
 
   def test_he_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
-      fan_in, _ = keras.initializers._compute_fans(tensor_shape)
+      fan_in, _ = init_ops._compute_fans(tensor_shape)
       scale = np.sqrt(2. / fan_in)
-      _runner(keras.initializers.he_normal(seed=123), tensor_shape,
-              target_mean=0., target_std=None, target_max=2 * scale)
+      self._runner(keras.initializers.he_normal(seed=123), tensor_shape,
+                   target_mean=0., target_std=None, target_max=2 * scale)
 
   def test_orthogonal(self):
-    tensor_shape = (7, 8)
+    tensor_shape = (20, 20)
     with self.test_session():
-      _runner(keras.initializers.orthogonal(seed=123), tensor_shape,
-              target_mean=0.)
+      self._runner(keras.initializers.orthogonal(seed=123), tensor_shape,
+                   target_mean=0.)
 
   def test_identity(self):
     with self.test_session():
       tensor_shape = (3, 4, 5)
       with self.assertRaises(ValueError):
-        _runner(keras.initializers.identity(), tensor_shape,
-                target_mean=1. / tensor_shape[0], target_max=1.)
+        self._runner(keras.initializers.identity(), tensor_shape,
+                     target_mean=1. / tensor_shape[0], target_max=1.)
 
       tensor_shape = (3, 3)
-      _runner(keras.initializers.identity(), tensor_shape,
-              target_mean=1. / tensor_shape[0], target_max=1.)
+      self._runner(keras.initializers.identity(), tensor_shape,
+                   target_mean=1. / tensor_shape[0], target_max=1.)
 
   def test_zero(self):
     tensor_shape = (4, 5)
     with self.test_session():
-      _runner(keras.initializers.zeros(), tensor_shape,
-              target_mean=0., target_max=0.)
+      self._runner(keras.initializers.zeros(), tensor_shape,
+                   target_mean=0., target_max=0.)
 
   def test_one(self):
     tensor_shape = (4, 5)
     with self.test_session():
-      _runner(keras.initializers.ones(), tensor_shape,
-              target_mean=1., target_max=1.)
+      self._runner(keras.initializers.ones(), tensor_shape,
+                   target_mean=1., target_max=1.)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/keras/python/keras/integration_test.py b/tensorflow/contrib/keras/python/keras/integration_test.py
index f42f81b286ed5216188cfafbe189100f7f78c59d..16d0713b31ffaba3f489bbf8d4bda5b7c74a0993 100644
--- a/tensorflow/contrib/keras/python/keras/integration_test.py
+++ b/tensorflow/contrib/keras/python/keras/integration_test.py
@@ -33,13 +33,13 @@ class KerasIntegrationTest(test.TestCase):
       (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
           train_samples=200,
           test_samples=100,
-          input_shape=(8,),
+          input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
       y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential([
-          keras.layers.Dense(8,
+          keras.layers.Dense(16,
                              activation='relu',
                              input_shape=x_train.shape[1:]),
           keras.layers.Dropout(0.1),
@@ -51,7 +51,7 @@ class KerasIntegrationTest(test.TestCase):
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
                           validation_data=(x_test, y_test),
                           verbose=2)
-      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.85)
 
   def test_vector_classification_functional(self):
     with self.test_session():
@@ -59,13 +59,13 @@ class KerasIntegrationTest(test.TestCase):
       (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
           train_samples=200,
           test_samples=100,
-          input_shape=(8,),
+          input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
       y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
-      x = keras.layers.Dense(8, activation='relu')(inputs)
+      x = keras.layers.Dense(16, activation='relu')(inputs)
       x = keras.layers.Dropout(0.1)(x)
       outputs = keras.layers.Dense(y_train.shape[-1], activation='softmax')(x)
 
@@ -76,7 +76,7 @@ class KerasIntegrationTest(test.TestCase):
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
                           validation_data=(x_test, y_test),
                           verbose=2)
-      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.85)
 
   def test_temporal_classification_declarative(self):
     with self.test_session():
@@ -99,7 +99,7 @@ class KerasIntegrationTest(test.TestCase):
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
                           validation_data=(x_test, y_test),
                           verbose=2)
-      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.85)
 
   def test_image_classification_declarative(self):
     with self.test_session():
@@ -130,7 +130,7 @@ class KerasIntegrationTest(test.TestCase):
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
                           validation_data=(x_test, y_test),
                           verbose=2)
-      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.85)
 
   def test_video_classification_functional(self):
     with self.test_session():
@@ -159,7 +159,7 @@ class KerasIntegrationTest(test.TestCase):
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
                           validation_data=(x_test, y_test),
                           verbose=2)
-      self.assertTrue(history.history['val_acc'][-1] > 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.85)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
index b3abfc29d250514c5d2544df04a6d6c343a82755..2c957ece4466660cd5e62aa1fcc9dc9f9052091d 100644
--- a/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
+++ b/tensorflow/contrib/keras/python/keras/layers/advanced_activations.py
@@ -120,7 +120,7 @@ class PReLU(Layer):
         param_shape[i - 1] = 1
         self.param_broadcast[i - 1] = True
     self.alpha = self.add_weight(
-        param_shape,
+        shape=param_shape,
         name='alpha',
         initializer=self.alpha_initializer,
         regularizer=self.alpha_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional.py b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
index 1a28399a28fb1d6203dc8754df03b3adb670e880..9ee5aa21217d9944cb09885935f1290e74c26eb2 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional.py
@@ -37,233 +37,11 @@ from tensorflow.contrib.keras.python.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
 from tensorflow.contrib.keras.python.keras.utils import conv_utils
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import convolutional as tf_convolutional_layers
 
 
-class _Conv(Layer):
-  """Abstract nD convolution layer (private, used as implementation base).
-
-  This layer creates a convolution kernel that is convolved
-  with the layer input to produce a tensor of outputs.
-  If `use_bias` is True, a bias vector is created and added to the outputs.
-  Finally, if `activation` is not `None`,
-  it is applied to the outputs as well.
-
-  Arguments:
-      rank: An integer, the rank of the convolution,
-          e.g. "2" for 2D convolution.
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number output of filters in the convolution).
-      kernel_size: An integer or tuple/list of n integers, specifying the
-          dimensions of the convolution window.
-      strides: An integer or tuple/list of n integers,
-          specifying the strides of the convolution.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, ..., channels)` while `channels_first` corresponds to
-          inputs with shape `(batch, channels, ...)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: An integer or tuple/list of n integers, specifying
-          the dilation rate to use for dilated convolution.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any `strides` value != 1.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      kernel_initializer: Initializer for the `kernel` weights matrix.
-      bias_initializer: Initializer for the bias vector.
-      kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
-      kernel_constraint: Constraint function applied to the kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-  """
-
-  def __init__(self,
-               rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format=None,
-               dilation_rate=1,
-               activation=None,
-               use_bias=True,
-               kernel_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               kernel_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               bias_constraint=None,
-               **kwargs):
-    super(_Conv, self).__init__(**kwargs)
-    self.rank = rank
-    self.filters = filters
-    self.kernel_size = conv_utils.normalize_tuple(kernel_size, rank,
-                                                  'kernel_size')
-    self.strides = conv_utils.normalize_tuple(strides, rank, 'strides')
-    self.padding = conv_utils.normalize_padding(padding)
-    self.data_format = conv_utils.normalize_data_format(data_format)
-    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, rank,
-                                                    'dilation_rate')
-    self.activation = activations.get(activation)
-    self.use_bias = use_bias
-    self.kernel_initializer = initializers.get(kernel_initializer)
-    self.bias_initializer = initializers.get(bias_initializer)
-    self.kernel_regularizer = regularizers.get(kernel_regularizer)
-    self.bias_regularizer = regularizers.get(bias_regularizer)
-    self.activity_regularizer = regularizers.get(activity_regularizer)
-    self.kernel_constraint = constraints.get(kernel_constraint)
-    self.bias_constraint = constraints.get(bias_constraint)
-    self.input_spec = InputSpec(ndim=self.rank + 2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis]
-    kernel_shape = self.kernel_size + (input_dim, self.filters)
-
-    self.kernel = self.add_weight(
-        kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
-    if self.use_bias:
-      self.bias = self.add_weight(
-          (self.filters,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-    # Set input spec.
-    self.input_spec = InputSpec(
-        ndim=self.rank + 2, axes={channel_axis: input_dim})
-    self.built = True
-
-  def call(self, inputs):
-    if self.rank == 1:
-      outputs = K.conv1d(
-          inputs,
-          self.kernel,
-          strides=self.strides[0],
-          padding=self.padding,
-          data_format=self.data_format,
-          dilation_rate=self.dilation_rate[0])
-    if self.rank == 2:
-      outputs = K.conv2d(
-          inputs,
-          self.kernel,
-          strides=self.strides,
-          padding=self.padding,
-          data_format=self.data_format,
-          dilation_rate=self.dilation_rate)
-    if self.rank == 3:
-      outputs = K.conv3d(
-          inputs,
-          self.kernel,
-          strides=self.strides,
-          padding=self.padding,
-          data_format=self.data_format,
-          dilation_rate=self.dilation_rate)
-
-    if self.use_bias:
-      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_last':
-      space = input_shape[1:-1]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = conv_utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0]] + new_space +
-                                      [self.filters])
-    else:
-      space = input_shape[2:]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = conv_utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0], self.filters] +
-                                      new_space)
-
-  def get_config(self):
-    config = {
-        'rank':
-            self.rank,
-        'filters':
-            self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'data_format':
-            self.data_format,
-        'dilation_rate':
-            self.dilation_rate,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'bias_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(_Conv, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-
-class Conv1D(_Conv):
-  """1D convolution layer (e.g.
-
-  temporal convolution).
+class Conv1D(tf_convolutional_layers.Conv1D, Layer):
+  """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
   with the layer input over a single spatial (or temporal) dimension
@@ -289,7 +67,7 @@ class Conv1D(_Conv):
           any `dilation_rate` value != 1.
       padding: One of `"valid"`, `"causal"` or `"same"` (case-insensitive).
           `"causal"` results in causal (dilated) convolutions, e.g. output[t]
-          depends solely on input[:t-1]. Useful when modeling temporal data
+          does not depend on input[t+1:]. Useful when modeling temporal data
           where the model should not violate the temporal order.
           See [WaveNet: A Generative Model for Raw Audio, section
             2.1](https://arxiv.org/abs/1609.03499).
@@ -336,33 +114,55 @@ class Conv1D(_Conv):
                bias_constraint=None,
                **kwargs):
     super(Conv1D, self).__init__(
-        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format='channels_last',
         dilation_rate=dilation_rate,
-        activation=activation,
+        activation=activations.get(activation),
         use_bias=use_bias,
-        kernel_initializer=kernel_initializer,
-        bias_initializer=bias_initializer,
-        kernel_regularizer=kernel_regularizer,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        kernel_constraint=kernel_constraint,
-        bias_constraint=bias_constraint,
+        kernel_initializer=initializers.get(kernel_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        kernel_regularizer=regularizers.get(kernel_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
-    self.input_spec = InputSpec(ndim=3)
+    # TODO(fchollet): move weight constraint support to core layers.
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+  def build(self, input_shape):
+    super(Conv1D, self).build(input_shape)
+    # TODO(fchollet): move weight constraint support to core layers.
+    if self.kernel_constraint:
+      self.constraints[self.kernel] = self.kernel_constraint
+    if self.use_bias and self.bias_constraint:
+      self.constraints[self.bias] = self.bias_constraint
 
   def get_config(self):
-    config = super(Conv1D, self).get_config()
-    config.pop('rank')
-    config.pop('data_format')
-    return config
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-class Conv2D(_Conv):
+class Conv2D(tf_convolutional_layers.Conv2D, Layer):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -395,9 +195,9 @@ class Conv2D(_Conv):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -452,36 +252,60 @@ class Conv2D(_Conv):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
     super(Conv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
-        activation=activation,
+        activation=activations.get(activation),
         use_bias=use_bias,
-        kernel_initializer=kernel_initializer,
-        bias_initializer=bias_initializer,
-        kernel_regularizer=kernel_regularizer,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        kernel_constraint=kernel_constraint,
-        bias_constraint=bias_constraint,
+        kernel_initializer=initializers.get(kernel_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        kernel_regularizer=regularizers.get(kernel_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
-    self.input_spec = InputSpec(ndim=4)
+    # TODO(fchollet): move weight constraint support to core layers.
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
 
-  def get_config(self):
-    config = super(Conv2D, self).get_config()
-    config.pop('rank')
-    return config
+  def build(self, input_shape):
+    super(Conv2D, self).build(input_shape)
+    # TODO(fchollet): move weight constraint support to core layers.
+    if self.kernel_constraint:
+      self.constraints[self.kernel] = self.kernel_constraint
+    if self.use_bias and self.bias_constraint:
+      self.constraints[self.bias] = self.bias_constraint
 
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
-class Conv3D(_Conv):
-  """3D convolution layer (e.g.
 
-  spatial convolution over volumes).
+class Conv3D(tf_convolutional_layers.Conv3D, Layer):
+  """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
   with the layer input to produce a tensor of
@@ -577,33 +401,59 @@ class Conv3D(_Conv):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
     super(Conv3D, self).__init__(
-        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
-        activation=activation,
+        activation=activations.get(activation),
         use_bias=use_bias,
-        kernel_initializer=kernel_initializer,
-        bias_initializer=bias_initializer,
-        kernel_regularizer=kernel_regularizer,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        kernel_constraint=kernel_constraint,
-        bias_constraint=bias_constraint,
+        kernel_initializer=initializers.get(kernel_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        kernel_regularizer=regularizers.get(kernel_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
-    self.input_spec = InputSpec(ndim=5)
+    # TODO(fchollet): move weight constraint support to core layers.
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+  def build(self, input_shape):
+    super(Conv3D, self).build(input_shape)
+    # TODO(fchollet): move weight constraint support to core layers.
+    if self.kernel_constraint:
+      self.constraints[self.kernel] = self.kernel_constraint
+    if self.use_bias and self.bias_constraint:
+      self.constraints[self.bias] = self.bias_constraint
 
   def get_config(self):
-    config = super(Conv3D, self).get_config()
-    config.pop('rank')
-    return config
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-class Conv2DTranspose(Conv2D):
+class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -621,7 +471,7 @@ class Conv2DTranspose(Conv2D):
 
   Arguments:
       filters: Integer, the dimensionality of the output space
-          (i.e. the number output of filters in the convolution).
+          (i.e. the number of output filters in the convolution).
       kernel_size: An integer or tuple/list of 2 integers, specifying the
           width and height of the 2D convolution window.
           Can be a single integer to specify the same value for
@@ -637,9 +487,9 @@ class Conv2DTranspose(Conv2D):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -688,7 +538,7 @@ class Conv2DTranspose(Conv2D):
                kernel_size,
                strides=(1, 1),
                padding='valid',
-               data_format='channels_last',
+               data_format=None,
                activation=None,
                use_bias=True,
                kernel_initializer='glorot_uniform',
@@ -699,121 +549,57 @@ class Conv2DTranspose(Conv2D):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
     super(Conv2DTranspose, self).__init__(
-        filters,
-        kernel_size,
+        filters=filters,
+        kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
-        activation=activation,
+        activation=activations.get(activation),
         use_bias=use_bias,
-        kernel_initializer=kernel_initializer,
-        bias_initializer=bias_initializer,
-        kernel_regularizer=kernel_regularizer,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        kernel_constraint=kernel_constraint,
-        bias_constraint=bias_constraint,
+        kernel_initializer=initializers.get(kernel_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        kernel_regularizer=regularizers.get(kernel_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
-    self.input_spec = InputSpec(ndim=4)
+    # TODO(fchollet): move weight constraint support to core layers.
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
 
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if len(input_shape) != 4:
-      raise ValueError(
-          'Inputs should have rank ' + str(4) + '; Received input shape:',
-          str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis]
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_weight(
-        kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
-    if self.use_bias:
-      self.bias = self.add_weight(
-          (self.filters,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-    # Set input spec.
-    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
-    self.built = True
-
-  def call(self, inputs):
-    input_shape = K.shape(inputs)
-    batch_size = input_shape[0]
-    if self.data_format == 'channels_first':
-      h_axis, w_axis = 2, 3
-    else:
-      h_axis, w_axis = 1, 2
-
-    height, width = input_shape[h_axis], input_shape[w_axis]
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_height = conv_utils.deconv_length(height, stride_h, kernel_h,
-                                          self.padding)
-    out_width = conv_utils.deconv_length(width, stride_w, kernel_w,
-                                         self.padding)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_height, out_width)
-    else:
-      output_shape = (batch_size, out_height, out_width, self.filters)
-
-    outputs = K.conv2d_transpose(
-        inputs,
-        self.kernel,
-        output_shape,
-        self.strides,
-        padding=self.padding,
-        data_format=self.data_format)
-
-    if self.bias:
-      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[h_axis] = conv_utils.deconv_length(
-        output_shape[h_axis], stride_h, kernel_h, self.padding)
-    output_shape[w_axis] = conv_utils.deconv_length(
-        output_shape[w_axis], stride_w, kernel_w, self.padding)
-    return tensor_shape.TensorShape(output_shape)
+    super(Conv2DTranspose, self).build(input_shape)
+    # TODO(fchollet): move weight constraint support to core layers.
+    if self.kernel_constraint:
+      self.constraints[self.kernel] = self.kernel_constraint
+    if self.use_bias and self.bias_constraint:
+      self.constraints[self.bias] = self.bias_constraint
 
   def get_config(self):
-    config = super(Conv2DTranspose, self).get_config()
-    config.pop('dilation_rate')
-    return config
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv2DTranspose, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-class SeparableConv2D(Conv2D):
+class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
   """Depthwise separable 2D convolution.
 
   Separable convolutions consist in first performing
@@ -845,9 +631,9 @@ class SeparableConv2D(Conv2D):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -909,126 +695,68 @@ class SeparableConv2D(Conv2D):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
     super(SeparableConv2D, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
-        activation=activation,
+        activation=activations.get(activation),
         use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
+        depthwise_initializer=initializers.get(depthwise_initializer),
+        pointwise_initializer=initializers.get(pointwise_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        depthwise_regularizer=regularizers.get(depthwise_regularizer),
+        pointwise_regularizer=regularizers.get(pointwise_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = initializers.get(depthwise_initializer)
-    self.pointwise_initializer = initializers.get(pointwise_initializer)
-    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
-    self.pointwise_regularizer = regularizers.get(pointwise_regularizer)
+    # TODO(fchollet): move weight constraint support to core layers.
     self.depthwise_constraint = constraints.get(depthwise_constraint)
     self.pointwise_constraint = constraints.get(pointwise_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
 
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if len(input_shape) < 4:
-      raise ValueError('Inputs to `SeparableConv2D` should have rank 4. '
-                       'Received input shape:', str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = 3
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs to '
-                       '`SeparableConv2D` '
-                       'should be defined. Found `None`.')
-    input_dim = int(input_shape[channel_axis])
-    depthwise_kernel_shape = (self.kernel_size[0], self.kernel_size[1],
-                              input_dim, self.depth_multiplier)
-    pointwise_kernel_shape = (1, 1, self.depth_multiplier * input_dim,
-                              self.filters)
-
-    self.depthwise_kernel = self.add_weight(
-        depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        name='depthwise_kernel',
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint)
-    self.pointwise_kernel = self.add_weight(
-        pointwise_kernel_shape,
-        initializer=self.pointwise_initializer,
-        name='pointwise_kernel',
-        regularizer=self.pointwise_regularizer,
-        constraint=self.pointwise_constraint)
-
-    if self.use_bias:
-      self.bias = self.add_weight(
-          (self.filters,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-    # Set input spec.
-    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
-    self.built = True
-
-  def call(self, inputs):
-    outputs = K.separable_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        self.pointwise_kernel,
-        data_format=self.data_format,
-        strides=self.strides,
-        padding=self.padding)
-
-    if self.bias:
-      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-    else:
-      rows = input_shape[1]
-      cols = input_shape[2]
-
-    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
-                                         self.padding, self.strides[0])
-    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
-                                         self.padding, self.strides[1])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], self.filters, rows, cols])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, self.filters])
+    super(SeparableConv2D, self).build(input_shape)
+    # TODO(fchollet): move weight constraint support to core layers.
+    if self.depthwise_constraint:
+      self.constraints[self.depthwise_kernel] = self.depthwise_constraint
+    if self.pointwise_constraint:
+      self.constraints[self.pointwise_kernel] = self.pointwise_constraint
+    if self.use_bias and self.bias_constraint:
+      self.constraints[self.bias] = self.bias_constraint
 
   def get_config(self):
-    config = super(SeparableConv2D, self).get_config()
-    config.pop('kernel_initializer')
-    config.pop('kernel_regularizer')
-    config.pop('kernel_constraint')
-    config['depth_multiplier'] = self.depth_multiplier
-    config['depthwise_initializer'] = initializers.serialize(
-        self.depthwise_initializer)
-    config['pointwise_initializer'] = initializers.serialize(
-        self.pointwise_initializer)
-    config['depthwise_regularizer'] = regularizers.serialize(
-        self.depthwise_regularizer)
-    config['pointwise_regularizer'] = regularizers.serialize(
-        self.pointwise_regularizer)
-    config['depthwise_constraint'] = constraints.serialize(
-        self.depthwise_constraint)
-    config['pointwise_constraint'] = constraints.serialize(
-        self.pointwise_constraint)
-    return config
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'depthwise_initializer': initializers.serialize(
+            self.depthwise_initializer),
+        'pointwise_initializer': initializers.serialize(
+            self.pointwise_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'depthwise_regularizer': regularizers.serialize(
+            self.depthwise_regularizer),
+        'pointwise_regularizer': regularizers.serialize(
+            self.pointwise_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'depthwise_constraint': constraints.serialize(
+            self.depthwise_constraint),
+        'pointwise_constraint': constraints.serialize(
+            self.pointwise_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(SeparableConv2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
 class UpSampling1D(Layer):
@@ -1079,9 +807,9 @@ class UpSampling2D(Layer):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -1110,17 +838,17 @@ class UpSampling2D(Layer):
   def _compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
-      height = self.size[0] * input_shape[2] if input_shape[
-          2] is not None else None
-      width = self.size[1] * input_shape[3] if input_shape[
-          3] is not None else None
+      height = self.size[0] * input_shape[
+          2] if input_shape[2] is not None else None
+      width = self.size[1] * input_shape[
+          3] if input_shape[3] is not None else None
       return tensor_shape.TensorShape(
           [input_shape[0], input_shape[1], height, width])
     else:
-      height = self.size[0] * input_shape[1] if input_shape[
-          1] is not None else None
-      width = self.size[1] * input_shape[2] if input_shape[
-          2] is not None else None
+      height = self.size[0] * input_shape[
+          1] if input_shape[1] is not None else None
+      width = self.size[1] * input_shape[
+          2] if input_shape[2] is not None else None
       return tensor_shape.TensorShape(
           [input_shape[0], height, width, input_shape[3]])
 
@@ -1177,21 +905,21 @@ class UpSampling3D(Layer):
   def _compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
-      dim1 = self.size[0] * input_shape[2] if input_shape[
-          2] is not None else None
-      dim2 = self.size[1] * input_shape[3] if input_shape[
-          3] is not None else None
-      dim3 = self.size[2] * input_shape[4] if input_shape[
-          4] is not None else None
+      dim1 = self.size[0] * input_shape[
+          2] if input_shape[2] is not None else None
+      dim2 = self.size[1] * input_shape[
+          3] if input_shape[3] is not None else None
+      dim3 = self.size[2] * input_shape[
+          4] if input_shape[4] is not None else None
       return tensor_shape.TensorShape(
           [input_shape[0], input_shape[1], dim1, dim2, dim3])
     else:
-      dim1 = self.size[0] * input_shape[1] if input_shape[
-          1] is not None else None
-      dim2 = self.size[1] * input_shape[2] if input_shape[
-          2] is not None else None
-      dim3 = self.size[2] * input_shape[3] if input_shape[
-          3] is not None else None
+      dim1 = self.size[0] * input_shape[
+          1] if input_shape[1] is not None else None
+      dim2 = self.size[1] * input_shape[
+          2] if input_shape[2] is not None else None
+      dim3 = self.size[2] * input_shape[
+          3] if input_shape[3] is not None else None
       return tensor_shape.TensorShape(
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
@@ -1230,9 +958,10 @@ class ZeroPadding1D(Layer):
     self.input_spec = InputSpec(ndim=3)
 
   def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    length = input_shape[1] + self.padding[0] + self.padding[1] if input_shape[
-        1] is not None else None
+    if input_shape[1] is not None:
+      length = input_shape[1] + self.padding[0] + self.padding[1]
+    else:
+      length = None
     return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
 
   def call(self, inputs):
@@ -1257,7 +986,7 @@ class ZeroPadding2D(Layer):
           - If tuple of 2 ints:
               interpreted as two different
               symmetric padding values for height and width:
-              `(symmetric_height_pad, symmetrc_width_pad)`.
+              `(symmetric_height_pad, symmetric_width_pad)`.
           - If tuple of 2 tuples of 2 ints:
               interpreted as
               `((top_pad, bottom_pad), (left_pad, right_pad))`
@@ -1265,9 +994,9 @@ class ZeroPadding2D(Layer):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -1313,17 +1042,25 @@ class ZeroPadding2D(Layer):
   def _compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
-      rows = input_shape[2] + self.padding[0][0] + self.padding[0][
-          1] if input_shape[2] is not None else None
-      cols = input_shape[3] + self.padding[1][0] + self.padding[1][
-          1] if input_shape[3] is not None else None
+      if input_shape[2] is not None:
+        rows = input_shape[2] + self.padding[0][0] + self.padding[0][1]
+      else:
+        rows = None
+      if input_shape[3] is not None:
+        cols = input_shape[3] + self.padding[1][0] + self.padding[1][1]
+      else:
+        cols = None
       return tensor_shape.TensorShape(
           [input_shape[0], input_shape[1], rows, cols])
-    else:
-      rows = input_shape[1] + self.padding[0][0] + self.padding[0][
-          1] if input_shape[1] is not None else None
-      cols = input_shape[2] + self.padding[1][0] + self.padding[1][
-          1] if input_shape[2] is not None else None
+    elif self.data_format == 'channels_last':
+      if input_shape[1] is not None:
+        rows = input_shape[1] + self.padding[0][0] + self.padding[0][1]
+      else:
+        rows = None
+      if input_shape[2] is not None:
+        cols = input_shape[2] + self.padding[1][0] + self.padding[1][1]
+      else:
+        cols = None
       return tensor_shape.TensorShape(
           [input_shape[0], rows, cols, input_shape[3]])
 
@@ -1414,21 +1151,33 @@ class ZeroPadding3D(Layer):
   def _compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     if self.data_format == 'channels_first':
-      dim1 = input_shape[2] + 2 * self.padding[0][0] if input_shape[
-          2] is not None else None
-      dim2 = input_shape[3] + 2 * self.padding[1][0] if input_shape[
-          3] is not None else None
-      dim3 = input_shape[4] + 2 * self.padding[2][0] if input_shape[
-          4] is not None else None
+      if input_shape[2] is not None:
+        dim1 = input_shape[2] + 2 * self.padding[0][0]
+      else:
+        dim1 = None
+      if input_shape[3] is not None:
+        dim2 = input_shape[3] + 2 * self.padding[1][0]
+      else:
+        dim2 = None
+      if input_shape[4] is not None:
+        dim3 = input_shape[4] + 2 * self.padding[2][0]
+      else:
+        dim3 = None
       return tensor_shape.TensorShape(
           [input_shape[0], input_shape[1], dim1, dim2, dim3])
-    else:
-      dim1 = input_shape[1] + 2 * self.padding[0][1] if input_shape[
-          1] is not None else None
-      dim2 = input_shape[2] + 2 * self.padding[1][1] if input_shape[
-          2] is not None else None
-      dim3 = input_shape[3] + 2 * self.padding[2][1] if input_shape[
-          3] is not None else None
+    elif self.data_format == 'channels_last':
+      if input_shape[1] is not None:
+        dim1 = input_shape[1] + 2 * self.padding[0][1]
+      else:
+        dim1 = None
+      if input_shape[2] is not None:
+        dim2 = input_shape[2] + 2 * self.padding[1][1]
+      else:
+        dim2 = None
+      if input_shape[3] is not None:
+        dim3 = input_shape[3] + 2 * self.padding[2][1]
+      else:
+        dim3 = None
       return tensor_shape.TensorShape(
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
@@ -1498,7 +1247,7 @@ class Cropping2D(Layer):
           - If tuple of 2 ints:
               interpreted as two different
               symmetric cropping values for height and width:
-              `(symmetric_height_crop, symmetrc_width_crop)`.
+              `(symmetric_height_crop, symmetric_width_crop)`.
           - If tuple of 2 tuples of 2 ints:
               interpreted as
               `((top_crop, bottom_crop), (left_crop, right_crop))`
@@ -1506,9 +1255,9 @@ class Cropping2D(Layer):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -1700,21 +1449,33 @@ class Cropping3D(Layer):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     # pylint: disable=invalid-unary-operand-type
     if self.data_format == 'channels_first':
-      dim1 = input_shape[2] - self.cropping[0][0] - self.cropping[0][
-          1] if input_shape[2] is not None else None
-      dim2 = input_shape[3] - self.cropping[1][0] - self.cropping[1][
-          1] if input_shape[3] is not None else None
-      dim3 = input_shape[4] - self.cropping[2][0] - self.cropping[2][
-          1] if input_shape[4] is not None else None
+      if input_shape[2] is not None:
+        dim1 = input_shape[2] - self.cropping[0][0] - self.cropping[0][1]
+      else:
+        dim1 = None
+      if input_shape[3] is not None:
+        dim2 = input_shape[3] - self.cropping[1][0] - self.cropping[1][1]
+      else:
+        dim2 = None
+      if input_shape[4] is not None:
+        dim3 = input_shape[4] - self.cropping[2][0] - self.cropping[2][1]
+      else:
+        dim3 = None
       return tensor_shape.TensorShape(
           [input_shape[0], input_shape[1], dim1, dim2, dim3])
-    else:
-      dim1 = input_shape[1] - self.cropping[0][0] - self.cropping[0][
-          1] if input_shape[1] is not None else None
-      dim2 = input_shape[2] - self.cropping[1][0] - self.cropping[1][
-          1] if input_shape[2] is not None else None
-      dim3 = input_shape[3] - self.cropping[2][0] - self.cropping[2][
-          1] if input_shape[3] is not None else None
+    elif self.data_format == 'channels_last':
+      if input_shape[1] is not None:
+        dim1 = input_shape[1] - self.cropping[0][0] - self.cropping[0][1]
+      else:
+        dim1 = None
+      if input_shape[2] is not None:
+        dim2 = input_shape[2] - self.cropping[1][0] - self.cropping[1][1]
+      else:
+        dim2 = None
+      if input_shape[3] is not None:
+        dim3 = input_shape[3] - self.cropping[2][0] - self.cropping[2][1]
+      else:
+        dim3 = None
       return tensor_shape.TensorShape(
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
     # pylint: enable=invalid-unary-operand-type
@@ -1738,14 +1499,14 @@ class Cropping3D(Layer):
         return inputs[:, :, self.cropping[0][0]:, self.cropping[1][
             0]:-self.cropping[1][1], self.cropping[2][0]:-self.cropping[2][1]]
       elif self.cropping[1][1] == 0:
-        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][
-            1], self.cropping[1][0]:, self.cropping[2][0]:-self.cropping[2][1]]
+        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1], self.
+                      cropping[1][0]:, self.cropping[2][0]:-self.cropping[2][1]]
       elif self.cropping[2][1] == 0:
-        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][
-            1], self.cropping[1][0]:-self.cropping[1][1], self.cropping[2][0]:]
-      return inputs[:, :, self.cropping[0][0]:-self.cropping[0][
-          1], self.cropping[1][0]:-self.cropping[1][1], self.cropping[2][0]:
-                    -self.cropping[2][1]]
+        return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1], self.
+                      cropping[1][0]:-self.cropping[1][1], self.cropping[2][0]:]
+      return inputs[:, :, self.cropping[0][0]:-self.cropping[0][1],
+                    self.cropping[1][0]:-self.cropping[1][1], self.cropping[2][
+                        0]:-self.cropping[2][1]]
     else:
       if self.cropping[0][1] == self.cropping[1][1] == self.cropping[2][1] == 0:
         return inputs[:, self.cropping[0][0]:, self.cropping[1][0]:,
@@ -1761,19 +1522,19 @@ class Cropping3D(Layer):
                       -self.cropping[1][1], self.cropping[2][0]:, :]
       elif self.cropping[0][1] == 0:
         return inputs[:, self.cropping[0][0]:, self.cropping[1][
-            0]:-self.cropping[1][1], self.cropping[2][0]:-self.cropping[2][
-                1], :]
+            0]:-self.cropping[1][1], self.cropping[2][0]:
+                      -self.cropping[2][1], :]
       elif self.cropping[1][1] == 0:
-        return inputs[:, self.cropping[0][0]:-self.cropping[0][
-            1], self.cropping[1][0]:, self.cropping[2][0]:-self.cropping[2][
-                1], :]
+        return inputs[:, self.cropping[0][
+            0]:-self.cropping[0][1], self.cropping[1][0]:, self.cropping[2][0]:
+                      -self.cropping[2][1], :]
       elif self.cropping[2][1] == 0:
-        return inputs[:, self.cropping[0][0]:-self.cropping[0][
-            1], self.cropping[1][0]:-self.cropping[1][1], self.cropping[2][
-                0]:, :]
+        return inputs[:, self.cropping[0][0]:-self.cropping[0][1],
+                      self.cropping[1][0]:-self.cropping[1][1], self.cropping[
+                          2][0]:, :]
       return inputs[:, self.cropping[0][0]:-self.cropping[0][1], self.cropping[
-          1][0]:-self.cropping[1][1], self.cropping[2][0]:-self.cropping[2][  # pylint: disable=invalid-unary-operand-type
-              1], :]
+          1][0]:-self.cropping[1][1], self.cropping[2][0]:  # pylint: disable=invalid-unary-operand-type
+                    -self.cropping[2][1], :]  # pylint: disable=invalid-unary-operand-type
     # pylint: enable=invalid-unary-operand-type
 
   def get_config(self):
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
index 4ed5046dc310a2eafafbb1763437b84dbbd90790..30325b7148ee0a425cb5c47135ab4a6b8495868e 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_recurrent.py
@@ -357,7 +357,7 @@ class ConvLSTM2D(ConvRecurrent2D):
       self.states = [None, None]
 
     if self.data_format == 'channels_first':
-      channel_axis = 1
+      channel_axis = 2
     else:
       channel_axis = -1
     if input_shape[channel_axis] is None:
@@ -369,20 +369,20 @@ class ConvLSTM2D(ConvRecurrent2D):
     recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4)
 
     self.kernel = self.add_weight(
-        kernel_shape,
+        shape=kernel_shape,
         initializer=self.kernel_initializer,
         name='kernel',
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
-        recurrent_kernel_shape,
+        shape=recurrent_kernel_shape,
         initializer=self.recurrent_initializer,
         name='recurrent_kernel',
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.filters * 4,),
+          shape=(self.filters * 4,),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
index 845e9eee1225d5b8ed1574798fcf7df069ec58e4..3b7f31a3e9e3764bcb6b3fc019e4df928feb2ee5 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
@@ -27,24 +27,7 @@ from tensorflow.python.platform import test
 
 class Convolution1DTest(test.TestCase):
 
-  def test_causal_dilated_conv1d(self):
-    # Causal:
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.Conv1D,
-          input_data=np.reshape(np.arange(4, dtype='float32'), (1, 4, 1)),
-          kwargs={
-              'filters': 1,
-              'kernel_size': 2,
-              'dilation_rate': 1,
-              'padding': 'causal',
-              'kernel_initializer': 'ones',
-              'use_bias': False,
-          },
-          expected_output=[[[0], [1], [3], [5]]])
-
   def test_dilated_conv1d(self):
-    # Non-causal:
     with self.test_session():
       testing_utils.layer_test(
           keras.layers.Conv1D,
diff --git a/tensorflow/contrib/keras/python/keras/layers/core.py b/tensorflow/contrib/keras/python/keras/layers/core.py
index 1207cc119f20f2044f66ca767c9e32f1e639de45..d287fa56d91840e710c93afc0a84b875b9ec3aef 100644
--- a/tensorflow/contrib/keras/python/keras/layers/core.py
+++ b/tensorflow/contrib/keras/python/keras/layers/core.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import inspect
 import types as python_types
 
 import numpy as np
@@ -35,6 +34,8 @@ from tensorflow.contrib.keras.python.keras.utils.generic_utils import deserializ
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_dump
 from tensorflow.contrib.keras.python.keras.utils.generic_utils import func_load
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.util import tf_inspect
 
 
 class Masking(Layer):
@@ -84,11 +85,11 @@ class Masking(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class Dropout(Layer):
+class Dropout(tf_core_layers.Dropout, Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting
-  a fraction `p` of input units to 0 at each update during training time,
+  a fraction `rate` of input units to 0 at each update during training time,
   which helps prevent overfitting.
 
   Arguments:
@@ -103,24 +104,18 @@ class Dropout(Layer):
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
-    super(Dropout, self).__init__(**kwargs)
-    self.rate = min(1., max(0., rate))
-    self.noise_shape = noise_shape
-    self.seed = seed
     self.supports_masking = True
-
-  def _get_noise_shape(self, _):
-    return self.noise_shape
+    # Inheritance call order:
+    # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
+    super(Dropout, self).__init__(**kwargs)
 
   def call(self, inputs, training=None):
-    if 0. < self.rate < 1.:
-      noise_shape = self._get_noise_shape(inputs)
-
-      def dropped_inputs():
-        return K.dropout(inputs, self.rate, noise_shape, seed=self.seed)
-
-      return K.in_train_phase(dropped_inputs, inputs, training=training)
-    return inputs
+    if training is None:
+      training = K.learning_phase()
+    output = super(Dropout, self).call(inputs, training=training)
+    if training is K.learning_phase():
+      output._uses_learning_phase = True  # pylint: disable=protected-access
+    return output
 
   def get_config(self):
     config = {'rate': self.rate}
@@ -140,7 +135,7 @@ class SpatialDropout1D(Dropout):
   between feature maps and should be used instead.
 
   Arguments:
-      p: float between 0 and 1. Fraction of the input units to drop.
+      rate: float between 0 and 1. Fraction of the input units to drop.
 
   Input shape:
       3D tensor with shape:
@@ -595,7 +590,7 @@ class Lambda(Layer):
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
-    arg_spec = inspect.getargspec(self.function)
+    arg_spec = tf_inspect.getargspec(self.function)
     if 'mask' in arg_spec.args:
       arguments['mask'] = mask
     return self.function(inputs, **arguments)
@@ -643,7 +638,7 @@ class Lambda(Layer):
     return cls(**config)
 
 
-class Dense(Layer):
+class Dense(tf_core_layers.Dense, Layer):
   """Just your regular densely-connected NN layer.
 
   `Dense` implements the operation:
@@ -712,80 +707,45 @@ class Dense(Layer):
                **kwargs):
     if 'input_shape' not in kwargs and 'input_dim' in kwargs:
       kwargs['input_shape'] = (kwargs.pop('input_dim'),)
-    super(Dense, self).__init__(**kwargs)
-    self.units = units
-    self.activation = activations.get(activation)
-    self.use_bias = use_bias
-    self.kernel_initializer = initializers.get(kernel_initializer)
-    self.bias_initializer = initializers.get(bias_initializer)
-    self.kernel_regularizer = regularizers.get(kernel_regularizer)
-    self.bias_regularizer = regularizers.get(bias_regularizer)
-    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+    # Inheritance call order:
+    # 1) tf.layers.Dense, 2) keras.layers.Layer, 3) tf.layers.Layer
+    super(Dense, self).__init__(
+        units,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        kernel_initializer=initializers.get(kernel_initializer),
+        bias_initializer=initializers.get(bias_initializer),
+        kernel_regularizer=regularizers.get(kernel_regularizer),
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        **kwargs)
+    # TODO(fchollet): move weight constraint support to core layers.
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.bias_constraint = constraints.get(bias_constraint)
-    self.input_spec = InputSpec(min_ndim=2)
     self.supports_masking = True
 
   def build(self, input_shape):
-    assert len(input_shape) >= 2
-    input_dim = input_shape[-1]
-
-    self.kernel = self.add_weight(
-        (input_dim, self.units),
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
-    if self.use_bias:
-      self.bias = self.add_weight(
-          (self.units,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-    self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
-    self.built = True
-
-  def call(self, inputs):
-    output = K.dot(inputs, self.kernel)
-    if self.use_bias:
-      output = K.bias_add(output, self.bias)
-    if self.activation is not None:
-      output = self.activation(output)
-    return output
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    assert input_shape and len(input_shape) >= 2
-    assert input_shape[-1]
-    output_shape = list(input_shape)
-    output_shape[-1] = self.units
-    return tensor_shape.TensorShape(output_shape)
+    super(Dense, self).build(input_shape)
+    # TODO(fchollet): move weight constraint support to core layers.
+    if self.kernel_constraint:
+      self.constraints[self.kernel] = self.kernel_constraint
+    if self.use_bias and self.bias_constraint:
+      self.constraints[self.bias] = self.bias_constraint
 
   def get_config(self):
     config = {
-        'units':
-            self.units,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'bias_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
+        'units': self.units,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
     }
     base_config = super(Dense, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/core_test.py b/tensorflow/contrib/keras/python/keras/layers/core_test.py
index d7aa8413bbe6fd3d22f7a6aa39bea007915206a3..7066af0ef6ce80919fa8b0ab77f9729d7d4c778c 100644
--- a/tensorflow/contrib/keras/python/keras/layers/core_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/core_test.py
@@ -165,24 +165,23 @@ class CoreLayersTest(test.TestCase):
           3,
           kernel_regularizer=keras.regularizers.l1(0.01),
           bias_regularizer='l1',
-          activity_regularizer='l2')
-      layer.build((None, 4))
-      assert len(layer.losses) == 2
+          activity_regularizer='l2',
+          name='dense_reg')
       layer(keras.backend.variable(np.ones((2, 4))))
-      assert len(layer.losses) == 3
+      self.assertEqual(3, len(layer.losses))
 
     # Test constraints
     with self.test_session():
       layer = keras.layers.Dense(
           3, kernel_constraint='max_norm', bias_constraint='max_norm')
-      layer.build((None, 4))
-      assert len(layer.constraints) == 2
+      layer(keras.backend.variable(np.ones((2, 4))))
+      self.assertEqual(2, len(layer.constraints))
 
   def test_activity_regularization(self):
     with self.test_session():
       layer = keras.layers.ActivityRegularization(l1=0.1)
       layer(keras.backend.variable(np.ones((2, 4))))
-      assert len(layer.losses) == 1
+      self.assertEqual(1, len(layer.losses))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/keras/python/keras/layers/embeddings.py b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
index 5ba7d7db8a0f068038e646f78edb1bb7b048ee92..bc0bae67d05275346e40791e2a6d58a6b89bdf30 100644
--- a/tensorflow/contrib/keras/python/keras/layers/embeddings.py
+++ b/tensorflow/contrib/keras/python/keras/layers/embeddings.py
@@ -53,8 +53,8 @@ class Embedding(Layer):
   ```
 
   Arguments:
-    input_dim: int > 0. Size of the vocabulary, ie.
-        1 + maximum integer index occurring in the input data.
+    input_dim: int > 0. Size of the vocabulary,
+        i.e. maximum integer index + 1.
     output_dim: int >= 0. Dimension of the dense embedding.
     embeddings_initializer: Initializer for the `embeddings` matrix.
     embeddings_regularizer: Regularizer function applied to
@@ -68,7 +68,8 @@ class Embedding(Layer):
         If this is `True` then all subsequent layers
         in the model need to support masking or an exception will be raised.
         If mask_zero is set to True, as a consequence, index 0 cannot be
-        used in the vocabulary (input_dim should equal `|vocabulary| + 2`).
+        used in the vocabulary (input_dim should equal size of
+        vocabulary + 1).
     input_length: Length of input sequences, when it is constant.
         This argument is required if you are going to connect
         `Flatten` then `Dense` layers upstream
@@ -115,7 +116,7 @@ class Embedding(Layer):
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
     self.embeddings = self.add_weight(
-        (self.input_dim, self.output_dim),
+        shape=(self.input_dim, self.output_dim),
         initializer=self.embeddings_initializer,
         name='embeddings',
         regularizer=self.embeddings_regularizer,
diff --git a/tensorflow/contrib/keras/python/keras/layers/local.py b/tensorflow/contrib/keras/python/keras/layers/local.py
index 3bf5ee4f0fcde01364a1189c6fb71dfb16d424c1..863674c1cbd95f2a93e24297278fc9e60800bc14 100644
--- a/tensorflow/contrib/keras/python/keras/layers/local.py
+++ b/tensorflow/contrib/keras/python/keras/layers/local.py
@@ -59,7 +59,8 @@ class LocallyConnected1D(Layer):
           specifying the stride length of the convolution.
           Specifying any stride value != 1 is incompatible with specifying
           any `dilation_rate` value != 1.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
+      padding: Currently only supports `"valid"` (case-insensitive).
+          `"same"` may be supported in the future.
       activation: Activation function to use.
           If you don't specify anything, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
@@ -129,14 +130,14 @@ class LocallyConnected1D(Layer):
     self.kernel_shape = (output_length, self.kernel_size[0] * input_dim,
                          self.filters)
     self.kernel = self.add_weight(
-        self.kernel_shape,
+        shape=self.kernel_shape,
         initializer=self.kernel_initializer,
         name='kernel',
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (output_length, self.filters),
+          shape=(output_length, self.filters),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
@@ -188,7 +189,7 @@ class LocallyConnected1D(Layer):
         'kernel_initializer':
             initializers.serialize(self.kernel_initializer),
         'bias_initializer':
-            initializers.serialize(self.kernel_initializer),
+            initializers.serialize(self.bias_initializer),
         'kernel_regularizer':
             regularizers.serialize(self.kernel_regularizer),
         'bias_regularizer':
@@ -239,16 +240,15 @@ class LocallyConnected2D(Layer):
           specifying the strides of the convolution along the width and height.
           Can be a single integer to specify the same value for
           all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `"valid"` or `"same"` (case-insensitive).
+      padding: Currently only support `"valid"` (case-insensitive).
+          `"same"` will be supported in future.
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -336,17 +336,18 @@ class LocallyConnected2D(Layer):
                                                self.padding, self.strides[1])
     self.output_row = output_row
     self.output_col = output_col
-    self.kernel_shape = (output_row * output_col, self.kernel_size[0] *
-                         self.kernel_size[1] * input_filter, self.filters)
+    self.kernel_shape = (
+        output_row * output_col,
+        self.kernel_size[0] * self.kernel_size[1] * input_filter, self.filters)
     self.kernel = self.add_weight(
-        self.kernel_shape,
+        shape=self.kernel_shape,
         initializer=self.kernel_initializer,
         name='kernel',
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (output_row, output_col, self.filters),
+          shape=(output_row, output_col, self.filters),
           initializer=self.bias_initializer,
           name='bias',
           regularizer=self.bias_regularizer,
@@ -460,7 +461,7 @@ class LocallyConnected2D(Layer):
         'kernel_initializer':
             initializers.serialize(self.kernel_initializer),
         'bias_initializer':
-            initializers.serialize(self.kernel_initializer),
+            initializers.serialize(self.bias_initializer),
         'kernel_regularizer':
             regularizers.serialize(self.kernel_regularizer),
         'bias_regularizer':
diff --git a/tensorflow/contrib/keras/python/keras/layers/merge.py b/tensorflow/contrib/keras/python/keras/layers/merge.py
index eea4313d31c5f52b64b5dfdbcd3ff3cb7c81b8df..b4bb9935fdef01f28896056a745d66c91a31d745 100644
--- a/tensorflow/contrib/keras/python/keras/layers/merge.py
+++ b/tensorflow/contrib/keras/python/keras/layers/merge.py
@@ -41,6 +41,44 @@ class _Merge(Layer):
   def _merge_function(self, inputs):
     raise NotImplementedError
 
+  def _compute_elemwise_op_output_shape(self, shape1, shape2):
+    """Computes the shape of the resultant of an elementwise operation.
+
+    Arguments:
+        shape1: tuple or None. Shape of the first tensor
+        shape2: tuple or None. Shape of the second tensor
+
+    Returns:
+        expected output shape when an element-wise operation is
+        carried out on 2 tensors with shapes shape1 and shape2.
+        tuple or None.
+
+    Raises:
+        ValueError: if shape1 and shape2 are not compatible for
+            element-wise operations.
+    """
+    if None in [shape1, shape2]:
+      return None
+    elif len(shape1) < len(shape2):
+      return self._compute_elemwise_op_output_shape(shape2, shape1)
+    elif not shape2:
+      return shape1
+    output_shape = list(shape1[:-len(shape2)])
+    for i, j in zip(shape1[-len(shape2):], shape2):
+      if i is None or j is None:
+        output_shape.append(None)
+      elif i == 1:
+        output_shape.append(j)
+      elif j == 1:
+        output_shape.append(i)
+      else:
+        if i != j:
+          raise ValueError('Operands could not be broadcast '
+                           'together with shapes ' + str(shape1) + ' ' +
+                           str(shape2))
+        output_shape.append(i)
+    return tuple(output_shape)
+
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list):
@@ -49,23 +87,109 @@ class _Merge(Layer):
       raise ValueError('A merge layer should be called '
                        'on a list of at least 2 inputs. '
                        'Got ' + str(len(input_shape)) + ' inputs.')
-    if all([shape is None for shape in input_shape]):
-      return
-    input_shapes = [
-        tuple(tensor_shape.TensorShape(shape).as_list())
-        for shape in input_shape
-    ]
-    # TODO(fchollet): handle shapes with None entries.
-    input_shapes_set = set(input_shapes)
-    if None in input_shapes_set:
-      input_shapes_set.remove(None)
-    if len(input_shapes_set) > 1:
-      raise ValueError('Only tensors of same shape can '
-                       'be merged by layer' + self.name +
-                       ' Got input shapes: %s' % input_shapes)
+    input_shape = [tensor_shape.TensorShape(s).as_list() for s in input_shape]
+    batch_sizes = [s[0] for s in input_shape if s is not None]
+    batch_sizes = set(batch_sizes)
+    batch_sizes -= set([None])
+    if len(batch_sizes) > 1:
+      raise ValueError('Can not merge tensors with different '
+                       'batch sizes. Got tensors with shapes : ' +
+                       str(input_shape))
+    if input_shape[0] is None:
+      output_shape = None
+    else:
+      output_shape = input_shape[0][1:]
+    for i in range(1, len(input_shape)):
+      if input_shape[i] is None:
+        shape = None
+      else:
+        shape = input_shape[i][1:]
+      output_shape = self._compute_elemwise_op_output_shape(output_shape, shape)
+    # If the inputs have different ranks, we have to reshape them
+    # to make them broadcastable.
+    if None not in input_shape and len(set(map(len, input_shape))) == 1:
+      self._reshape_required = False
+    else:
+      self._reshape_required = True
+    self.built = True
 
   def call(self, inputs):
-    return self._merge_function(inputs)
+    if self._reshape_required:
+      reshaped_inputs = []
+      input_ndims = list(map(K.ndim, inputs))
+      if None not in input_ndims:
+        # If ranks of all inputs are available,
+        # we simply expand each of them at axis=1
+        # until all of them have the same rank.
+        max_ndim = max(input_ndims)
+        for x in inputs:
+          x_ndim = K.ndim(x)
+          for _ in range(max_ndim - x_ndim):
+            x = K.expand_dims(x, 1)
+          reshaped_inputs.append(x)
+        return self._merge_function(reshaped_inputs)
+      else:
+        # Transpose all inputs so that batch size is the last dimension.
+        # (batch_size, dim1, dim2, ... ) -> (dim1, dim2, ... , batch_size)
+        transposed = False
+        for x in inputs:
+          x_ndim = K.ndim(x)
+          if x_ndim is None:
+            x_shape = K.shape(x)
+            batch_size = x_shape[0]
+            new_shape = K.concatenate([x_shape[1:], K.expand_dims(batch_size)])
+            x_transposed = K.reshape(x,
+                                     K.stack([batch_size, K.prod(x_shape[1:])]))
+            x_transposed = K.permute_dimensions(x_transposed, (1, 0))
+            x_transposed = K.reshape(x_transposed, new_shape)
+            reshaped_inputs.append(x_transposed)
+            transposed = True
+          elif x_ndim > 1:
+            dims = list(range(1, x_ndim)) + [0]
+            reshaped_inputs.append(K.permute_dimensions(x, dims))
+            transposed = True
+          else:
+            # We don't transpose inputs if they are 1D vectors or scalars.
+            reshaped_inputs.append(x)
+        y = self._merge_function(reshaped_inputs)
+        y_ndim = K.ndim(y)
+        if transposed:
+          # If inputs have been transposed, we have to transpose the output too.
+          if y_ndim is None:
+            y_shape = K.shape(y)
+            y_ndim = K.shape(y_shape)[0]
+            batch_size = y_shape[y_ndim - 1]
+            new_shape = K.concatenate(
+                [K.expand_dims(batch_size), y_shape[:y_ndim - 1]])
+            y = K.reshape(y, (-1, batch_size))
+            y = K.permute_dimensions(y, (1, 0))
+            y = K.reshape(y, new_shape)
+          elif y_ndim > 1:
+            dims = [y_ndim - 1] + list(range(y_ndim - 1))
+            y = K.permute_dimensions(y, dims)
+        return y
+    else:
+      return self._merge_function(inputs)
+
+  def compute_output_shape(self, input_shape):
+    if input_shape[0] is None:
+      output_shape = None
+    else:
+      output_shape = input_shape[0][1:]
+    for i in range(1, len(input_shape)):
+      if input_shape[i] is None:
+        shape = None
+      else:
+        shape = input_shape[i][1:]
+      output_shape = self._compute_elemwise_op_output_shape(output_shape, shape)
+    batch_sizes = [s[0] for s in input_shape if s is not None]
+    batch_sizes = set(batch_sizes)
+    batch_sizes -= set([None])
+    if len(batch_sizes) == 1:
+      output_shape = (list(batch_sizes)[0],) + output_shape
+    else:
+      output_shape = (None,) + output_shape
+    return output_shape
 
   def compute_mask(self, inputs, mask=None):
     if mask is None:
@@ -179,6 +303,7 @@ class Concatenate(_Merge):
                        'inputs with matching shapes '
                        'except for the concat axis. '
                        'Got inputs shapes: %s' % (input_shape))
+    self.built = True
 
   def call(self, inputs):
     if not isinstance(inputs, list):
@@ -219,8 +344,8 @@ class Concatenate(_Merge):
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
         # Input is unmasked. Append all 1s to masks,
-        # but cast it to uint8 first
-        masks.append(K.cast(K.ones_like(input_i), 'uint8'))
+        # but cast it to bool first
+        masks.append(K.cast(K.ones_like(input_i), 'bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
         masks.append(K.expand_dims(mask_i))
@@ -291,6 +416,7 @@ class Dot(_Merge):
       raise ValueError('Dimension incompatibility '
                        '%s != %s. ' % (shape1[axes[0]], shape2[axes[1]]) +
                        'Layer shapes: %s, %s' % (shape1, shape2))
+    self.built = True
 
   def call(self, inputs):
     x1 = inputs[0]
diff --git a/tensorflow/contrib/keras/python/keras/layers/normalization.py b/tensorflow/contrib/keras/python/keras/layers/normalization.py
index 41c618cc79d6d8521f0d12bf0024ee3762c9f59b..4537814b61a153d7518e56cbb1b23db3e735dec9 100644
--- a/tensorflow/contrib/keras/python/keras/layers/normalization.py
+++ b/tensorflow/contrib/keras/python/keras/layers/normalization.py
@@ -22,12 +22,11 @@ from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras import constraints
 from tensorflow.contrib.keras.python.keras import initializers
 from tensorflow.contrib.keras.python.keras import regularizers
-from tensorflow.contrib.keras.python.keras.engine import InputSpec
 from tensorflow.contrib.keras.python.keras.engine import Layer
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import normalization as tf_normalization_layers
 
 
-class BatchNormalization(Layer):
+class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
   Normalize the activations of the previous layer at each batch,
@@ -86,148 +85,59 @@ class BatchNormalization(Layer):
                beta_constraint=None,
                gamma_constraint=None,
                **kwargs):
-    super(BatchNormalization, self).__init__(**kwargs)
     self.supports_masking = True
-    self.axis = axis
-    self.momentum = momentum
-    self.epsilon = epsilon
-    self.center = center
-    self.scale = scale
-    self.beta_initializer = initializers.get(beta_initializer)
-    self.gamma_initializer = initializers.get(gamma_initializer)
-    self.moving_mean_initializer = initializers.get(moving_mean_initializer)
-    self.moving_variance_initializer = initializers.get(
-        moving_variance_initializer)
-    self.beta_regularizer = regularizers.get(beta_regularizer)
-    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    super(BatchNormalization, self).__init__(
+        axis=axis,
+        momentum=momentum,
+        epsilon=epsilon,
+        center=center,
+        scale=scale,
+        beta_initializer=initializers.get(beta_initializer),
+        gamma_initializer=initializers.get(gamma_initializer),
+        moving_mean_initializer=initializers.get(moving_mean_initializer),
+        moving_variance_initializer=initializers.get(
+            moving_variance_initializer),
+        beta_regularizer=regularizers.get(beta_regularizer),
+        gamma_regularizer=regularizers.get(gamma_regularizer),
+        **kwargs
+    )
+    # TODO(fchollet): move weight constraint support to core layers.
     self.beta_constraint = constraints.get(beta_constraint)
     self.gamma_constraint = constraints.get(gamma_constraint)
 
   def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    dim = input_shape[self.axis]
-    if dim is None:
-      raise ValueError('Axis ' + str(self.axis) + ' of '
-                       'input tensor should have a defined dimension '
-                       'but the layer received an input with shape ' + str(
-                           input_shape) + '.')
-    self.input_spec = InputSpec(ndim=len(input_shape), axes={self.axis: dim})
-    shape = (dim,)
-
-    if self.scale:
-      self.gamma = self.add_weight(
-          shape,
-          name='gamma',
-          initializer=self.gamma_initializer,
-          regularizer=self.gamma_regularizer,
-          constraint=self.gamma_constraint)
-    else:
-      self.gamma = None
-    if self.center:
-      self.beta = self.add_weight(
-          shape,
-          name='beta',
-          initializer=self.beta_initializer,
-          regularizer=self.beta_regularizer,
-          constraint=self.beta_constraint)
-    else:
-      self.beta = None
-    self.moving_mean = self.add_weight(
-        shape,
-        name='moving_mean',
-        initializer=self.moving_mean_initializer,
-        trainable=False)
-    self.moving_variance = self.add_weight(
-        shape,
-        name='moving_variance',
-        initializer=self.moving_variance_initializer,
-        trainable=False)
-    self.built = True
+    super(BatchNormalization, self).build(input_shape)
+    # TODO(fchollet): move weight constraint support to core layers.
+    if self.center and self.beta_constraint:
+      self.constraints[self.beta] = self.beta_constraint
+    if self.scale and self.gamma_constraint:
+      self.constraints[self.gamma] = self.gamma_constraint
 
   def call(self, inputs, training=None):
-    input_shape = inputs.get_shape().as_list()
-    # Prepare broadcasting shape.
-    ndim = len(input_shape)
-    reduction_axes = list(range(len(input_shape)))
-    del reduction_axes[self.axis]
-    broadcast_shape = [1] * len(input_shape)
-    broadcast_shape[self.axis] = input_shape[self.axis]
-
-    # Determines whether broadcasting is needed.
-    needs_broadcasting = (sorted(reduction_axes) != range(ndim)[:-1])
-
-    normed, mean, variance = K.normalize_batch_in_training(
-        inputs, self.gamma, self.beta, reduction_axes, epsilon=self.epsilon)
-
-    if training in {0, False}:
-      return normed
-    else:
-      self.add_update([
-          K.moving_average_update(self.moving_mean, mean, self.momentum),
-          K.moving_average_update(self.moving_variance, variance, self.momentum)
-      ], inputs)
-
-      def normalize_inference():
-        if needs_broadcasting:
-          # In this case we must explictly broadcast all parameters.
-          broadcast_moving_mean = K.reshape(self.moving_mean, broadcast_shape)
-          broadcast_moving_variance = K.reshape(self.moving_variance,
-                                                broadcast_shape)
-          if self.center:
-            broadcast_beta = K.reshape(self.beta, broadcast_shape)
-          else:
-            broadcast_beta = None
-          if self.scale:
-            broadcast_gamma = K.reshape(self.gamma, broadcast_shape)
-          else:
-            broadcast_gamma = None
-          return K.batch_normalization(
-              inputs,
-              broadcast_moving_mean,
-              broadcast_moving_variance,
-              broadcast_beta,
-              broadcast_gamma,
-              epsilon=self.epsilon)
-        else:
-          return K.batch_normalization(
-              inputs,
-              self.moving_mean,
-              self.moving_variance,
-              self.beta,
-              self.gamma,
-              epsilon=self.epsilon)
-
-    # Pick the normalized form corresponding to the training phase.
-    return K.in_train_phase(normed, normalize_inference, training=training)
+    if training is None:
+      training = K.learning_phase()
+    output = super(BatchNormalization, self).call(inputs, training=training)
+    if training is K.learning_phase():
+      output._uses_learning_phase = True  # pylint: disable=protected-access
+    return output
 
   def get_config(self):
     config = {
-        'axis':
-            self.axis,
-        'momentum':
-            self.momentum,
-        'epsilon':
-            self.epsilon,
-        'center':
-            self.center,
-        'scale':
-            self.scale,
-        'beta_initializer':
-            initializers.serialize(self.beta_initializer),
-        'gamma_initializer':
-            initializers.serialize(self.gamma_initializer),
+        'axis': self.axis,
+        'momentum': self.momentum,
+        'epsilon': self.epsilon,
+        'center': self.center,
+        'scale': self.scale,
+        'beta_initializer': initializers.serialize(self.beta_initializer),
+        'gamma_initializer': initializers.serialize(self.gamma_initializer),
         'moving_mean_initializer':
             initializers.serialize(self.moving_mean_initializer),
         'moving_variance_initializer':
             initializers.serialize(self.moving_variance_initializer),
-        'beta_regularizer':
-            regularizers.serialize(self.beta_regularizer),
-        'gamma_regularizer':
-            regularizers.serialize(self.gamma_regularizer),
-        'beta_constraint':
-            constraints.serialize(self.beta_constraint),
-        'gamma_constraint':
-            constraints.serialize(self.gamma_constraint)
+        'beta_regularizer': regularizers.serialize(self.beta_regularizer),
+        'gamma_regularizer': regularizers.serialize(self.gamma_regularizer),
+        'beta_constraint': constraints.serialize(self.beta_constraint),
+        'gamma_constraint': constraints.serialize(self.gamma_constraint)
     }
     base_config = super(BatchNormalization, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/contrib/keras/python/keras/layers/normalization_test.py b/tensorflow/contrib/keras/python/keras/layers/normalization_test.py
index 51e23b84949a6f9640f141600b74d0924cd8edcb..dc410f84d8563aa9e112b270a08c91cf73771930 100644
--- a/tensorflow/contrib/keras/python/keras/layers/normalization_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/normalization_test.py
@@ -116,19 +116,21 @@ class NoiseLayersTest(test.TestCase):
     """
     with self.test_session():
       # Test single layer reuse
-      bn = keras.layers.BatchNormalization(input_shape=(10,))
+      bn = keras.layers.BatchNormalization()
       x1 = keras.layers.Input(shape=(10,))
-      bn(x1)
+      _ = bn(x1)
 
       x2 = keras.layers.Input(shape=(10,))
       y2 = bn(x2)
 
       x = np.random.normal(loc=5.0, scale=10.0, size=(2, 10))
       model = keras.models.Model(x2, y2)
-      assert len(model.updates) == 2
+
       model.compile('sgd', 'mse')
       model.train_on_batch(x, x)
 
+      assert len(model.updates) == 2
+
       # Test model-level reuse
       x3 = keras.layers.Input(shape=(10,))
       y3 = model(x3)
diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling.py b/tensorflow/contrib/keras/python/keras/layers/pooling.py
index e31caed3ecccc70cf1af954ac7bbc03d5bcf2c96..704f05e494e9d7109f234e5e73cb08937cdc7f9e 100644
--- a/tensorflow/contrib/keras/python/keras/layers/pooling.py
+++ b/tensorflow/contrib/keras/python/keras/layers/pooling.py
@@ -23,51 +23,10 @@ from tensorflow.contrib.keras.python.keras.engine import InputSpec
 from tensorflow.contrib.keras.python.keras.engine import Layer
 from tensorflow.contrib.keras.python.keras.utils import conv_utils
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import pooling as tf_pooling_layers
 
 
-class _Pooling1D(Layer):
-  """Abstract class for different pooling 1D layers.
-  """
-
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
-    super(_Pooling1D, self).__init__(**kwargs)
-    if strides is None:
-      strides = pool_size
-    self.pool_size = conv_utils.normalize_tuple(pool_size, 1, 'pool_size')
-    self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
-    self.padding = conv_utils.normalize_padding(padding)
-    self.input_spec = InputSpec(ndim=3)
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    length = conv_utils.conv_output_length(input_shape[1], self.pool_size[0],
-                                           self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
-
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    raise NotImplementedError
-
-  def call(self, inputs):
-    inputs = K.expand_dims(inputs, 2)  # add dummy last dimension
-    output = self._pooling_function(
-        inputs=inputs,
-        pool_size=self.pool_size + (1,),
-        strides=self.strides + (1,),
-        padding=self.padding,
-        data_format='channels_last')
-    return K.squeeze(output, 2)  # remove dummy last dimension
-
-  def get_config(self):
-    config = {
-        'strides': self.strides,
-        'pool_size': self.pool_size,
-        'padding': self.padding
-    }
-    base_config = super(_Pooling1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
-
-class MaxPooling1D(_Pooling1D):
+class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
   """Max pooling operation for temporal data.
 
   Arguments:
@@ -85,15 +44,21 @@ class MaxPooling1D(_Pooling1D):
   """
 
   def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
+    if strides is None:
+      strides = pool_size
     super(MaxPooling1D, self).__init__(pool_size, strides, padding, **kwargs)
 
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    output = K.pool2d(
-        inputs, pool_size, strides, padding, data_format, pool_mode='max')
-    return output
+  def get_config(self):
+    config = {
+        'strides': self.strides,
+        'pool_size': self.pool_size,
+        'padding': self.padding
+    }
+    base_config = super(MaxPooling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-class AveragePooling1D(_Pooling1D):
+class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
   """Average pooling for temporal data.
 
   Arguments:
@@ -111,78 +76,22 @@ class AveragePooling1D(_Pooling1D):
   """
 
   def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
-    super(AveragePooling1D, self).__init__(pool_size, strides, padding,
-                                           **kwargs)
-
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    output = K.pool2d(
-        inputs, pool_size, strides, padding, data_format, pool_mode='avg')
-    return output
-
-
-class _Pooling2D(Layer):
-  """Abstract class for different pooling 2D layers.
-  """
-
-  def __init__(self,
-               pool_size=(2, 2),
-               strides=None,
-               padding='valid',
-               data_format=None,
-               **kwargs):
-    super(_Pooling2D, self).__init__(**kwargs)
-    data_format = conv_utils.normalize_data_format(data_format)
     if strides is None:
       strides = pool_size
-    self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size')
-    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = conv_utils.normalize_padding(padding)
-    self.data_format = conv_utils.normalize_data_format(data_format)
-    self.input_spec = InputSpec(ndim=4)
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-    else:
-      rows = input_shape[1]
-      cols = input_shape[2]
-    rows = conv_utils.conv_output_length(rows, self.pool_size[0], self.padding,
-                                         self.strides[0])
-    cols = conv_utils.conv_output_length(cols, self.pool_size[1], self.padding,
-                                         self.strides[1])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], rows, cols])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, input_shape[3]])
-
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    raise NotImplementedError
-
-  def call(self, inputs):
-    output = self._pooling_function(
-        inputs=inputs,
-        pool_size=self.pool_size,
-        strides=self.strides,
-        padding=self.padding,
-        data_format=self.data_format)
-    return output
+    super(AveragePooling1D, self).__init__(pool_size, strides, padding,
+                                           **kwargs)
 
   def get_config(self):
     config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
         'strides': self.strides,
-        'data_format': self.data_format
+        'pool_size': self.pool_size,
+        'padding': self.padding
     }
-    base_config = super(_Pooling2D, self).get_config()
+    base_config = super(AveragePooling1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class MaxPooling2D(_Pooling2D):
+class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
   """Max pooling operation for spatial data.
 
   Arguments:
@@ -199,9 +108,9 @@ class MaxPooling2D(_Pooling2D):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -229,16 +138,25 @@ class MaxPooling2D(_Pooling2D):
                padding='valid',
                data_format=None,
                **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
+    if strides is None:
+      strides = pool_size
     super(MaxPooling2D, self).__init__(pool_size, strides, padding, data_format,
                                        **kwargs)
 
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    output = K.pool2d(
-        inputs, pool_size, strides, padding, data_format, pool_mode='max')
-    return output
+  def get_config(self):
+    config = {
+        'pool_size': self.pool_size,
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
+    }
+    base_config = super(MaxPooling2D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-class AveragePooling2D(_Pooling2D):
+class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
   """Average pooling operation for spatial data.
 
   Arguments:
@@ -255,9 +173,9 @@ class AveragePooling2D(_Pooling2D):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -285,68 +203,12 @@ class AveragePooling2D(_Pooling2D):
                padding='valid',
                data_format=None,
                **kwargs):
-    super(AveragePooling2D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
-
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    output = K.pool2d(
-        inputs, pool_size, strides, padding, data_format, pool_mode='avg')
-    return output
-
-
-class _Pooling3D(Layer):
-  """Abstract class for different pooling 3D layers.
-  """
-
-  def __init__(self,
-               pool_size=(2, 2, 2),
-               strides=None,
-               padding='valid',
-               data_format=None,
-               **kwargs):
-    super(_Pooling3D, self).__init__(**kwargs)
+    if data_format is None:
+      data_format = K.image_data_format()
     if strides is None:
       strides = pool_size
-    self.pool_size = conv_utils.normalize_tuple(pool_size, 3, 'pool_size')
-    self.strides = conv_utils.normalize_tuple(strides, 3, 'strides')
-    self.padding = conv_utils.normalize_padding(padding)
-    self.data_format = conv_utils.normalize_data_format(data_format)
-    self.input_spec = InputSpec(ndim=5)
-
-  def _compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      len_dim1 = input_shape[2]
-      len_dim2 = input_shape[3]
-      len_dim3 = input_shape[4]
-    else:
-      len_dim1 = input_shape[1]
-      len_dim2 = input_shape[2]
-      len_dim3 = input_shape[3]
-    len_dim1 = conv_utils.conv_output_length(len_dim1, self.pool_size[0],
-                                             self.padding, self.strides[0])
-    len_dim2 = conv_utils.conv_output_length(len_dim2, self.pool_size[1],
-                                             self.padding, self.strides[1])
-    len_dim3 = conv_utils.conv_output_length(len_dim3, self.pool_size[2],
-                                             self.padding, self.strides[2])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
-
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    raise NotImplementedError
-
-  def call(self, inputs):
-    output = self._pooling_function(
-        inputs=inputs,
-        pool_size=self.pool_size,
-        strides=self.strides,
-        padding=self.padding,
-        data_format=self.data_format)
-    return output
+    super(AveragePooling2D, self).__init__(pool_size, strides, padding,
+                                           data_format, **kwargs)
 
   def get_config(self):
     config = {
@@ -355,11 +217,11 @@ class _Pooling3D(Layer):
         'strides': self.strides,
         'data_format': self.data_format
     }
-    base_config = super(_Pooling3D, self).get_config()
+    base_config = super(AveragePooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class MaxPooling3D(_Pooling3D):
+class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -402,16 +264,25 @@ class MaxPooling3D(_Pooling3D):
                padding='valid',
                data_format=None,
                **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
+    if strides is None:
+      strides = pool_size
     super(MaxPooling3D, self).__init__(pool_size, strides, padding, data_format,
                                        **kwargs)
 
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    output = K.pool3d(
-        inputs, pool_size, strides, padding, data_format, pool_mode='max')
-    return output
+  def get_config(self):
+    config = {
+        'pool_size': self.pool_size,
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
+    }
+    base_config = super(MaxPooling3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-class AveragePooling3D(_Pooling3D):
+class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -454,13 +325,22 @@ class AveragePooling3D(_Pooling3D):
                padding='valid',
                data_format=None,
                **kwargs):
+    if data_format is None:
+      data_format = K.image_data_format()
+    if strides is None:
+      strides = pool_size
     super(AveragePooling3D, self).__init__(pool_size, strides, padding,
                                            data_format, **kwargs)
 
-  def _pooling_function(self, inputs, pool_size, strides, padding, data_format):
-    output = K.pool3d(
-        inputs, pool_size, strides, padding, data_format, pool_mode='avg')
-    return output
+  def get_config(self):
+    config = {
+        'pool_size': self.pool_size,
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
+    }
+    base_config = super(AveragePooling3D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
 class _GlobalPooling1D(Layer):
@@ -542,9 +422,9 @@ class GlobalAveragePooling2D(_GlobalPooling2D):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -577,9 +457,9 @@ class GlobalMaxPooling2D(_GlobalPooling2D):
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
           `channels_last` corresponds to inputs with shape
-          `(batch, width, height, channels)` while `channels_first`
+          `(batch, height, width, channels)` while `channels_first`
           corresponds to inputs with shape
-          `(batch, channels, width, height)`.
+          `(batch, channels, height, width)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
diff --git a/tensorflow/contrib/keras/python/keras/layers/recurrent.py b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
index 06986d3eaad812eae6985bd8a9036c6f9e543f8f..e608921adda74316b20647c3315e7275090324d0 100644
--- a/tensorflow/contrib/keras/python/keras/layers/recurrent.py
+++ b/tensorflow/contrib/keras/python/keras/layers/recurrent.py
@@ -105,8 +105,16 @@ class Recurrent(Layer):
       # now model.output_shape == (None, 32)
       # note: `None` is the batch dimension.
 
-      # for subsequent layers, not need to specify the input size:
+      # for subsequent layers, no need to specify the input size:
       model.add(LSTM(16))
+
+      # to stack recurrent layers, you must use return_sequences=True
+      # on any recurrent layer that feeds into another recurrent layer.
+      # note that you only need to specify the input size on the first layer.
+      model = Sequential()
+      model.add(LSTM(64, input_dim=64, input_length=10, return_sequences=True))
+      model.add(LSTM(32, return_sequences=True))
+      model.add(LSTM(10))
   ```
 
   Arguments:
@@ -116,7 +124,8 @@ class Recurrent(Layer):
       return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       go_backwards: Boolean (default False).
-          If True, process the input sequence backwards.
+          If True, process the input sequence backwards and return the
+          reversed sequence.
       stateful: Boolean (default False). If True, the last state
           for each sample at index i in a batch will be used as initial
           state for the sample of index i in the following batch.
@@ -296,9 +305,9 @@ class Recurrent(Layer):
       initial_states = self.get_initial_states(inputs)
 
     if len(initial_states) != len(self.states):
-      raise ValueError('Layer has ' + str(
-          len(self.states)) + ' states but was passed ' + str(
-              len(initial_states)) + ' initial states.')
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' + str(len(initial_states)) +
+                       ' initial states.')
     input_shape = K.int_shape(inputs)
     if self.unroll and input_shape[1] is None:
       raise ValueError('Cannot unroll a RNN if the '
@@ -372,8 +381,8 @@ class Recurrent(Layer):
       if states_value:
         value = states_value[i]
         if value.shape != (batch_size, self.units):
-          raise ValueError('Expected state #' + str(
-              i) + ' to have shape ' + str((batch_size, self.units)) +
+          raise ValueError('Expected state #' + str(i) + ' to have shape ' +
+                           str((batch_size, self.units)) +
                            ' but got array with shape ' + str(value.shape))
       else:
         value = np.zeros((batch_size, self.units))
@@ -398,6 +407,7 @@ class SimpleRNN(Recurrent):
       units: Positive integer, dimensionality of the output space.
       activation: Activation function to use.
           If you don't specify anything, no activation is applied
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
@@ -483,20 +493,20 @@ class SimpleRNN(Recurrent):
       self.reset_states()
 
     self.kernel = self.add_weight(
-        (self.input_dim, self.units),
+        shape=(self.input_dim, self.units),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
-        (self.units, self.units),
+        shape=(self.units, self.units),
         name='recurrent_kernel',
         initializer=self.recurrent_initializer,
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.units,),
+          shape=(self.units,),
           name='bias',
           initializer=self.bias_initializer,
           regularizer=self.bias_regularizer,
@@ -547,7 +557,7 @@ class SimpleRNN(Recurrent):
 
   def get_constants(self, inputs, training=None):
     constants = []
-    if self.implementation == 0 and 0 < self.dropout < 1:
+    if self.implementation != 0 and 0 < self.dropout < 1:
       input_shape = K.int_shape(inputs)
       input_dim = input_shape[-1]
       ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
@@ -619,7 +629,7 @@ class GRU(Recurrent):
   Arguments:
       units: Positive integer, dimensionality of the output space.
       activation: Activation function to use.
-          If you don't specify anything, no activation is applied
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
           for the recurrent step.
@@ -713,13 +723,13 @@ class GRU(Recurrent):
       self.reset_states()
 
     self.kernel = self.add_weight(
-        (self.input_dim, self.units * 3),
+        shape=(self.input_dim, self.units * 3),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
-        (self.units, self.units * 3),
+        shape=(self.units, self.units * 3),
         name='recurrent_kernel',
         initializer=self.recurrent_initializer,
         regularizer=self.recurrent_regularizer,
@@ -727,9 +737,9 @@ class GRU(Recurrent):
 
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.units * 3,),
+          shape=(self.units * 3,),
           name='bias',
-          initializer='zero',
+          initializer=self.bias_initializer,
           regularizer=self.bias_regularizer,
           constraint=self.bias_constraint)
     else:
@@ -738,8 +748,8 @@ class GRU(Recurrent):
     self.kernel_z = self.kernel[:, :self.units]
     self.recurrent_kernel_z = self.recurrent_kernel[:, :self.units]
     self.kernel_r = self.kernel[:, self.units:self.units * 2]
-    self.recurrent_kernel_r = self.recurrent_kernel[:, self.units:self.units *
-                                                    2]
+    self.recurrent_kernel_r = self.recurrent_kernel[:, self.units:
+                                                    self.units * 2]
     self.kernel_h = self.kernel[:, self.units * 2:]
     self.recurrent_kernel_h = self.recurrent_kernel[:, self.units * 2:]
 
@@ -792,7 +802,7 @@ class GRU(Recurrent):
 
   def get_constants(self, inputs, training=None):
     constants = []
-    if self.implementation == 0 and 0 < self.dropout < 1:
+    if self.implementation != 0 and 0 < self.dropout < 1:
       input_shape = K.int_shape(inputs)
       input_dim = input_shape[-1]
       ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
@@ -861,7 +871,7 @@ class GRU(Recurrent):
         if self.use_bias:
           x_z = K.bias_add(x_z, self.bias_z)
           x_r = K.bias_add(x_r, self.bias_r)
-          x_h = K.bias_add(x_r, self.bias_h)
+          x_h = K.bias_add(x_h, self.bias_h)
       else:
         raise ValueError('Unknown `implementation` mode.')
       z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0],
@@ -924,7 +934,7 @@ class LSTM(Recurrent):
   Arguments:
       units: Positive integer, dimensionality of the output space.
       activation: Activation function to use.
-          If you don't specify anything, no activation is applied
+          If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
           for the recurrent step.
@@ -1029,13 +1039,13 @@ class LSTM(Recurrent):
       self.reset_states()
 
     self.kernel = self.add_weight(
-        (self.input_dim, self.units * 4),
+        shape=(self.input_dim, self.units * 4),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
-        (self.units, self.units * 4),
+        shape=(self.units, self.units * 4),
         name='recurrent_kernel',
         initializer=self.recurrent_initializer,
         regularizer=self.recurrent_regularizer,
@@ -1043,7 +1053,7 @@ class LSTM(Recurrent):
 
     if self.use_bias:
       self.bias = self.add_weight(
-          (self.units * 4,),
+          shape=(self.units * 4,),
           name='bias',
           initializer=self.bias_initializer,
           regularizer=self.bias_regularizer,
@@ -1061,10 +1071,10 @@ class LSTM(Recurrent):
     self.kernel_o = self.kernel[:, self.units * 3:]
 
     self.recurrent_kernel_i = self.recurrent_kernel[:, :self.units]
-    self.recurrent_kernel_f = self.recurrent_kernel[:, self.units:self.units *
-                                                    2]
-    self.recurrent_kernel_c = self.recurrent_kernel[:, self.units * 2:self.units
-                                                    * 3]
+    self.recurrent_kernel_f = self.recurrent_kernel[:, self.units:
+                                                    self.units * 2]
+    self.recurrent_kernel_c = self.recurrent_kernel[:, self.units * 2:
+                                                    self.units * 3]
     self.recurrent_kernel_o = self.recurrent_kernel[:, self.units * 3:]
 
     if self.use_bias:
@@ -1127,7 +1137,7 @@ class LSTM(Recurrent):
 
   def get_constants(self, inputs, training=None):
     constants = []
-    if self.implementation == 0 and 0 < self.dropout < 1:
+    if self.implementation != 0 and 0 < self.dropout < 1:
       input_shape = K.int_shape(inputs)
       input_dim = input_shape[-1]
       ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
@@ -1199,8 +1209,8 @@ class LSTM(Recurrent):
                                                 self.recurrent_kernel_i))
       f = self.recurrent_activation(x_f + K.dot(h_tm1 * rec_dp_mask[1],
                                                 self.recurrent_kernel_f))
-      c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1 * rec_dp_mask[2],
-                                                      self.recurrent_kernel_c))
+      c = f * c_tm1 + i * self.activation(
+          x_c + K.dot(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c))
       o = self.recurrent_activation(x_o + K.dot(h_tm1 * rec_dp_mask[3],
                                                 self.recurrent_kernel_o))
     h = o * self.activation(c)
diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers.py b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
index 75b4810e40bd2fb962311b9ad9f11f182ccb6c72..092501cb1149d30f987c34934871c1bae55eccdf 100644
--- a/tensorflow/contrib/keras/python/keras/layers/wrappers.py
+++ b/tensorflow/contrib/keras/python/keras/layers/wrappers.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=protected-access
 """Wrapper layers: layers that augment the functionality of another layer.
 """
 from __future__ import absolute_import
@@ -24,6 +25,7 @@ from tensorflow.contrib.keras.python.keras import backend as K
 from tensorflow.contrib.keras.python.keras.engine import InputSpec
 from tensorflow.contrib.keras.python.keras.engine import Layer
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util import tf_inspect
 
 
 class Wrapper(Layer):
@@ -42,19 +44,53 @@ class Wrapper(Layer):
     super(Wrapper, self).__init__(**kwargs)
 
   def build(self, input_shape=None):
-    # Assumes that self.layer is already set.
-    # Should be called at the end of .build() in the children classes.
-    self.trainable_weights = getattr(self.layer, 'trainable_weights', [])
-    self.non_trainable_weights = getattr(self.layer, 'non_trainable_weights',
-                                         [])
-    self.updates = getattr(self.layer, 'updates', [])
-    self.losses = getattr(self.layer, 'losses', [])
-    self.constraints = getattr(self.layer, 'constraints', {})
     self.built = True
 
+  @property
+  def activity_regularizer(self):
+    if hasattr(self.layer, 'activity_regularizer'):
+      return self.layer.activity_regularizer
+    else:
+      return None
+
+  @property
+  def trainable_weights(self):
+    return self.layer.trainable_weights
+
+  @property
+  def non_trainable_weights(self):
+    return self.layer.non_trainable_weights
+
+  @property
+  def updates(self):
+    if hasattr(self.layer, 'updates'):
+      return self.layer.updates
+    return []
+
+  def get_updates_for(self, inputs=None):
+    if inputs is None:
+      updates = self.layer.get_updates_for(None)
+      return updates + super(Wrapper, self).get_updates_for(None)
+    return super(Wrapper, self).get_updates_for(inputs)
+
+  @property
+  def losses(self):
+    if hasattr(self.layer, 'losses'):
+      return self.layer.losses
+    return []
+
+  def get_losses_for(self, inputs=None):
+    if inputs is None:
+      losses = self.layer.get_losses_for(None)
+      return losses + super(Wrapper, self).get_losses_for(None)
+    return super(Wrapper, self).get_losses_for(inputs)
+
+  @property
+  def constraints(self):
+    return self.layer.constraints
+
   def get_weights(self):
-    weights = self.layer.get_weights()
-    return weights
+    return self.layer.get_weights()
 
   def set_weights(self, weights):
     self.layer.set_weights(weights)
@@ -70,9 +106,10 @@ class Wrapper(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
   @classmethod
-  def from_config(cls, config):
+  def from_config(cls, config, custom_objects=None):
     from tensorflow.contrib.keras.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
-    layer = deserialize_layer(config.pop('layer'))
+    layer = deserialize_layer(
+        config.pop('layer'), custom_objects=custom_objects)
     return cls(layer, **config)
 
 
@@ -129,11 +166,12 @@ class TimeDistributed(Wrapper):
       self.layer.build(child_input_shape)
       self.layer.built = True
     super(TimeDistributed, self).build()
+    self.built = True
 
   def _compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    child_input_shape = tensor_shape.TensorShape([input_shape[0]] + input_shape[
-        2:])
+    child_input_shape = tensor_shape.TensorShape([input_shape[0]] +
+                                                 input_shape[2:])
     child_output_shape = self.layer._compute_output_shape(  # pylint: disable=protected-access
         child_input_shape).as_list()
     timesteps = input_shape[1]
@@ -188,12 +226,15 @@ class Bidirectional(Wrapper):
           If None, the outputs will not be combined,
           they will be returned as a list.
 
+  Raises:
+      ValueError: In case of invalid `merge_mode` argument.
+
   Examples:
 
   ```python
       model = Sequential()
       model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5,
-        10)))
+      10)))
       model.add(Bidirectional(LSTM(10)))
       model.add(Dense(5))
       model.add(Activation('softmax'))
@@ -242,29 +283,47 @@ class Bidirectional(Wrapper):
       shape = self.forward_layer._compute_output_shape(input_shape)  # pylint: disable=protected-access
       return [shape, copy.copy(shape)]
 
-  def call(self, inputs, mask=None):
-    y = self.forward_layer.call(inputs, mask)
-    y_rev = self.backward_layer.call(inputs, mask)
+  def call(self, inputs, training=None, mask=None):
+    kwargs = {}
+    func_args = tf_inspect.getargspec(self.layer.call).args
+    if 'training' in func_args:
+      kwargs['training'] = training
+    if 'mask' in func_args:
+      kwargs['mask'] = mask
+
+    y = self.forward_layer.call(inputs, **kwargs)
+    y_rev = self.backward_layer.call(inputs, **kwargs)
     if self.return_sequences:
       y_rev = K.reverse(y_rev, 1)
     if self.merge_mode == 'concat':
-      return K.concatenate([y, y_rev])
+      output = K.concatenate([y, y_rev])
     elif self.merge_mode == 'sum':
-      return y + y_rev
+      output = y + y_rev
     elif self.merge_mode == 'ave':
-      return (y + y_rev) / 2
+      output = (y + y_rev) / 2
     elif self.merge_mode == 'mul':
-      return y * y_rev
+      output = y * y_rev
     elif self.merge_mode is None:
-      return [y, y_rev]
+      output = [y, y_rev]
+
+    # Properly set learning phase
+    if 0 < self.layer.dropout + self.layer.recurrent_dropout:
+      if self.merge_mode is None:
+        for out in output:
+          out._uses_learning_phase = True
+      else:
+        output._uses_learning_phase = True
+    return output
 
   def reset_states(self):
     self.forward_layer.reset_states()
     self.backward_layer.reset_states()
 
   def build(self, input_shape):
-    self.forward_layer.build(input_shape)
-    self.backward_layer.build(input_shape)
+    with K.name_scope(self.forward_layer.name):
+      self.forward_layer.build(input_shape)
+    with K.name_scope(self.backward_layer.name):
+      self.backward_layer.build(input_shape)
     self.built = True
 
   def compute_mask(self, inputs, mask):
diff --git a/tensorflow/contrib/keras/python/keras/metrics.py b/tensorflow/contrib/keras/python/keras/metrics.py
index d7266c94cf78c0139f244bf3307ce4406030df2f..59d380f73bd8592b1593d52569c947d8ceb8ab7a 100644
--- a/tensorflow/contrib/keras/python/keras/metrics.py
+++ b/tensorflow/contrib/keras/python/keras/metrics.py
@@ -43,12 +43,15 @@ def binary_accuracy(y_true, y_pred):
 
 
 def categorical_accuracy(y_true, y_pred):
-  return K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1))
+  return K.cast(
+      K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), K.floatx())
 
 
 def sparse_categorical_accuracy(y_true, y_pred):
-  return K.equal(
-      K.max(y_true, axis=-1), K.cast(K.argmax(y_pred, axis=-1), K.floatx()))
+  return K.cast(
+      K.equal(
+          K.max(y_true, axis=-1), K.cast(K.argmax(y_pred, axis=-1),
+                                         K.floatx())), K.floatx())
 
 
 def top_k_categorical_accuracy(y_true, y_pred, k=5):
diff --git a/tensorflow/contrib/keras/python/keras/models.py b/tensorflow/contrib/keras/python/keras/models.py
index 2be4431d03d3970aafe27f9ce6d4eaee3ddcac9a..1c041091fc1619e90080234f0158a28602194d5e 100644
--- a/tensorflow/contrib/keras/python/keras/models.py
+++ b/tensorflow/contrib/keras/python/keras/models.py
@@ -35,6 +35,7 @@ from tensorflow.contrib.keras.python.keras.engine.topology import Input
 from tensorflow.contrib.keras.python.keras.engine.topology import Layer
 from tensorflow.contrib.keras.python.keras.engine.training import Model
 from tensorflow.contrib.keras.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.python.framework import ops
 
 
 # pylint: disable=g-import-not-at-top
@@ -50,7 +51,7 @@ except ImportError:
 # pylint: enable=g-import-not-at-top
 
 
-def save_model(model, filepath, overwrite=True):
+def save_model(model, filepath, overwrite=True, include_optimizer=True):
   """Save a model to a HDF5 file.
 
   The saved model contains:
@@ -68,6 +69,7 @@ def save_model(model, filepath, overwrite=True):
       overwrite: Whether we should overwrite any existing
           model at the target location, or instead
           ask the user with a manual prompt.
+      include_optimizer: If True, save optimizer's state together.
 
   Raises:
       ImportError: if h5py is not available.
@@ -129,7 +131,7 @@ def save_model(model, filepath, overwrite=True):
   model_layers = model.layers
   topology.save_weights_to_hdf5_group(model_weights_group, model_layers)
 
-  if hasattr(model, 'optimizer'):
+  if include_optimizer and hasattr(model, 'optimizer'):
     if isinstance(model.optimizer, optimizers.TFOptimizer):
       warnings.warn(
           'TensorFlow optimizers do not '
@@ -207,7 +209,7 @@ def load_model(filepath, custom_objects=None):
       ValueError: In case of an invalid savefile.
   """
   if h5py is None:
-    raise ImportError('`save_model` requires h5py.')
+    raise ImportError('`load_model` requires h5py.')
 
   if not custom_objects:
     custom_objects = {}
@@ -219,7 +221,7 @@ def load_model(filepath, custom_objects=None):
         obj: object, dict, or list.
 
     Returns:
-        The same structure, where occurences
+        The same structure, where occurrences
             of a custom object name have been replaced
             with the custom object.
     """
@@ -234,7 +236,14 @@ def load_model(filepath, custom_objects=None):
     if isinstance(obj, dict):
       deserialized = {}
       for key, value in obj.items():
-        if value in custom_objects:
+        deserialized[key] = []
+        if isinstance(value, list):
+          for element in value:
+            if element in custom_objects:
+              deserialized[key].append(custom_objects[element])
+            else:
+              deserialized[key].append(element)
+        elif value in custom_objects:
           deserialized[key] = custom_objects[value]
         else:
           deserialized[key] = value
@@ -412,6 +421,14 @@ class Sequential(Model):
       name = prefix + str(K.get_uid(prefix))
     self.name = name
 
+    # The following properties are not actually used by Keras;
+    # they exist for compatibility with TF's variable scoping mechanism.
+    self._updates = []
+    self._scope = None
+    self._reuse = None
+    self._base_name = name
+    self._graph = ops.get_default_graph()
+
     # Add to the model any layers passed to the constructor.
     if layers:
       for layer in layers:
@@ -1006,7 +1023,7 @@ class Sequential(Model):
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
-            be equal to the number of unique samples if your dataset
+            be equal to the number of unique samples of your dataset
             divided by the batch size.
         epochs: Integer, total number of iterations on the data.
         verbose: Verbosity mode, 0, 1, or 2.
@@ -1017,8 +1034,10 @@ class Sequential(Model):
             - A tuple (inputs, targets, sample_weights).
         validation_steps: Only relevant if `validation_data`
             is a generator.
-            Number of samples to use from validation generator
-            at the end of every epoch.
+            Number of steps to yield from validation generator
+            at the end of every epoch. It should typically
+            be equal to the number of unique samples of your
+            validation dataset divided by the batch size.
         class_weight: Dictionary mapping class indices to a weight
             for the class.
         max_q_size: Maximum size for the generator queue
@@ -1050,7 +1069,7 @@ class Sequential(Model):
                     # and labels, from each line in the file
                     x, y = process_line(line)
                     yield (x, y)
-                f.close()
+                    f.close()
 
         model.fit_generator(generate_arrays_from_file('/my_file.txt'),
                             samples_per_epoch=10000, epochs=10)
@@ -1119,7 +1138,8 @@ class Sequential(Model):
                         steps,
                         max_q_size=10,
                         workers=1,
-                        pickle_safe=False):
+                        pickle_safe=False,
+                        verbose=0):
     """Generates predictions for the input samples from a data generator.
 
     The generator should return the same kind of data as accepted by
@@ -1136,6 +1156,7 @@ class Sequential(Model):
             relies on multiprocessing, you should not pass
             non picklable arguments to the generator
             as they can't be passed easily to children processes.
+        verbose: verbosity mode, 0 or 1.
 
     Returns:
         A Numpy array of predictions.
@@ -1147,7 +1168,8 @@ class Sequential(Model):
         steps,
         max_q_size=max_q_size,
         workers=workers,
-        pickle_safe=pickle_safe)
+        pickle_safe=pickle_safe,
+        verbose=verbose)
 
   def get_config(self):
     config = []
@@ -1159,9 +1181,9 @@ class Sequential(Model):
     return copy.deepcopy(config)
 
   @classmethod
-  def from_config(cls, config):
+  def from_config(cls, config, custom_objects=None):
     model = cls()
     for conf in config:
-      layer = layer_module.deserialize(conf)
+      layer = layer_module.deserialize(conf, custom_objects=custom_objects)
       model.add(layer)
     return model
diff --git a/tensorflow/contrib/keras/python/keras/optimizers.py b/tensorflow/contrib/keras/python/keras/optimizers.py
index b50c18b0e1ce1cf6bdcf20d324a6fe72c1914b57..75fce5c96f6bdb766239ccc59de18a24e7ebdd3f 100644
--- a/tensorflow/contrib/keras/python/keras/optimizers.py
+++ b/tensorflow/contrib/keras/python/keras/optimizers.py
@@ -596,8 +596,9 @@ class Nadam(Optimizer):
     # Due to the recommendations in [2], i.e. warming momentum schedule
     momentum_cache_t = self.beta_1 * (1. - 0.5 *
                                       (K.pow(0.96, t * self.schedule_decay)))
-    momentum_cache_t_1 = self.beta_1 * (
-        1. - 0.5 * (K.pow(0.96, (t + 1) * self.schedule_decay)))
+    momentum_cache_t_1 = self.beta_1 * (1. - 0.5 *
+                                        (K.pow(0.96,
+                                               (t + 1) * self.schedule_decay)))
     m_schedule_new = self.m_schedule * momentum_cache_t
     m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
     self.updates.append((self.m_schedule, m_schedule_new))
@@ -615,8 +616,8 @@ class Nadam(Optimizer):
       m_t_prime = m_t / (1. - m_schedule_next)
       v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
       v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
-      m_t_bar = (1. - momentum_cache_t
-                ) * g_prime + momentum_cache_t_1 * m_t_prime
+      m_t_bar = (
+          1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime
 
       self.updates.append(K.update(m, m_t))
       self.updates.append(K.update(v, v_t))
diff --git a/tensorflow/contrib/keras/python/keras/optimizers_test.py b/tensorflow/contrib/keras/python/keras/optimizers_test.py
index b3aaddb7c0c837b274c06b4cfb4e5aa4ab68afcf..af5e3c99b96344db7d410d7ff5e31d5f60fa64e9 100644
--- a/tensorflow/contrib/keras/python/keras/optimizers_test.py
+++ b/tensorflow/contrib/keras/python/keras/optimizers_test.py
@@ -41,7 +41,7 @@ def _test_optimizer(optimizer, target=0.75):
                                                       input_shape=(10,),
                                                       num_classes=2)
   y_train = keras.utils.to_categorical(y_train)
-  model = _get_model(x_train.shape[1], 10, y_train.shape[1])
+  model = _get_model(x_train.shape[1], 20, y_train.shape[1])
   model.compile(loss='categorical_crossentropy',
                 optimizer=optimizer,
                 metrics=['accuracy'])
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/image.py b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
index 86c7650a073b2ecc5bddb03cdd4410b9b41e1956..8cceb441dfe7dc72d21b5844895217922351d632 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/image.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/image.py
@@ -711,8 +711,8 @@ class ImageDataGenerator(object):
           'following the data format convention "' + self.data_format + '" '
           '(channels on axis ' + str(self.channel_axis) + '), i.e. expected '
           'either 1, 3 or 4 channels on axis ' + str(self.channel_axis) + '. '
-          'However, it was passed an array with shape ' + str(
-              x.shape) + ' (' + str(x.shape[self.channel_axis]) + ' channels).')
+          'However, it was passed an array with shape ' + str(x.shape) + ' (' +
+          str(x.shape[self.channel_axis]) + ' channels).')
 
     if seed is not None:
       np.random.seed(seed)
@@ -785,7 +785,7 @@ class Iterator(object):
           index_array = np.random.permutation(n)
 
       current_index = (self.batch_index * batch_size) % n
-      if n >= current_index + batch_size:
+      if n > current_index + batch_size:
         current_batch_size = batch_size
         self.batch_index += 1
       else:
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py b/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
index 5a24a63b0140620b9ffc7ef9621e00b903d986c3..692a359ead371b858ad447a1c71dc781c40a1bba 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/sequence.py
@@ -156,7 +156,7 @@ def skipgrams(sequence,
           of word indices (integers). If using a `sampling_table`,
           word indices are expected to match the rank
           of the words in a reference dataset (e.g. 10 would encode
-          the 10-th most frequently occuring token).
+          the 10-th most frequently occurring token).
           Note that index 0 is expected to be a non-word and will be skipped.
       vocabulary_size: int. maximum possible word index + 1
       window_size: int. actually half-window.
diff --git a/tensorflow/contrib/keras/python/keras/preprocessing/text.py b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
index b164b613d277a3f30d3cfe83f9d13db7548b292f..5b89c8035c26456a3031598f535862a643ba8a50 100644
--- a/tensorflow/contrib/keras/python/keras/preprocessing/text.py
+++ b/tensorflow/contrib/keras/python/keras/preprocessing/text.py
@@ -280,8 +280,8 @@ class Tokenizer(object):
           # Use weighting scheme 2 in
           # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
           tf = 1 + np.log(c)
-          idf = np.log(1 + self.document_count / (1 + self.index_docs.get(j, 0)
-                                                 ))
+          idf = np.log(1 + self.document_count /
+                       (1 + self.index_docs.get(j, 0)))
           x[i][j] = tf * idf
         else:
           raise ValueError('Unknown vectorization mode:', mode)
diff --git a/tensorflow/contrib/keras/python/keras/testing_utils.py b/tensorflow/contrib/keras/python/keras/testing_utils.py
index baba5447d99397c1e187805609f91af6947b3cc0..bf6f661adff4a22626763f207ef91839766da774 100644
--- a/tensorflow/contrib/keras/python/keras/testing_utils.py
+++ b/tensorflow/contrib/keras/python/keras/testing_utils.py
@@ -18,11 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
-
 import numpy as np
 
 from tensorflow.contrib.keras.python import keras
+from tensorflow.python.util import tf_inspect
 
 
 def get_test_data(train_samples,
@@ -98,7 +97,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   layer.set_weights(weights)
 
   # test and instantiation from weights
-  if 'weights' in inspect.getargspec(layer_cls.__init__):
+  if 'weights' in tf_inspect.getargspec(layer_cls.__init__):
     kwargs['weights'] = weights
     layer = layer_cls(**kwargs)
 
diff --git a/tensorflow/contrib/keras/python/keras/utils/conv_utils.py b/tensorflow/contrib/keras/python/keras/utils/conv_utils.py
index ffc131ec4fdfb0bedd7a8846d225f702b1cbeb65..7d4fdda296205e2da03d6590db249fa79879996c 100644
--- a/tensorflow/contrib/keras/python/keras/utils/conv_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/conv_utils.py
@@ -47,11 +47,11 @@ def normalize_tuple(value, n, name):
     try:
       value_tuple = tuple(value)
     except TypeError:
-      raise ValueError('The `' + name + '` argument must be a tuple of ' + str(
-          n) + ' integers. Received: ' + str(value))
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
     if len(value_tuple) != n:
-      raise ValueError('The `' + name + '` argument must be a tuple of ' + str(
-          n) + ' integers. Received: ' + str(value))
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
     for single_value in value_tuple:
       try:
         int(single_value)
@@ -69,8 +69,8 @@ def normalize_data_format(value):
   data_format = value.lower()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('The `data_format` argument must be one of '
-                     '"channels_first", "channels_last". Received: ' + str(
-                         value))
+                     '"channels_first", "channels_last". Received: ' +
+                     str(value))
   return data_format
 
 
diff --git a/tensorflow/contrib/keras/python/keras/utils/data_utils.py b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
index b2d5427c973111201354202547fb260e17b99c5a..5a42444308dab78afac9df4aab7c651c4aef5a7d 100644
--- a/tensorflow/contrib/keras/python/keras/utils/data_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/data_utils.py
@@ -23,7 +23,9 @@ import os
 import shutil
 import sys
 import tarfile
+import zipfile
 
+import six
 from six.moves.urllib.error import HTTPError
 from six.moves.urllib.error import URLError
 from six.moves.urllib.request import urlopen
@@ -73,24 +75,108 @@ else:
   from six.moves.urllib.request import urlretrieve  # pylint: disable=g-import-not-at-top
 
 
-def get_file(fname, origin, untar=False, md5_hash=None,
-             cache_subdir='datasets'):
+def _extract_archive(file_path, path='.', archive_format='auto'):
+  """Extracts an archive if it matches tar, tar.gz, tar.bz, or zip formats.
+
+  Arguments:
+      file_path: path to the archive file
+      path: path to extract the archive file
+      archive_format: Archive format to try for extracting the file.
+          Options are 'auto', 'tar', 'zip', and None.
+          'tar' includes tar, tar.gz, and tar.bz files.
+          The default 'auto' is ['tar', 'zip'].
+          None or an empty list will return no matches found.
+
+  Returns:
+      True if a match was found and an archive extraction was completed,
+      False otherwise.
+  """
+  if archive_format is None:
+    return False
+  if archive_format is 'auto':
+    archive_format = ['tar', 'zip']
+  if isinstance(archive_format, six.string_types):
+    archive_format = [archive_format]
+
+  for archive_type in archive_format:
+    if archive_type is 'tar':
+      open_fn = tarfile.open
+      is_match_fn = tarfile.is_tarfile
+    if archive_type is 'zip':
+      open_fn = zipfile.ZipFile
+      is_match_fn = zipfile.is_zipfile
+
+    if is_match_fn(file_path):
+      with open_fn(file_path) as archive:
+        try:
+          archive.extractall(path)
+        except (tarfile.TarError, RuntimeError, KeyboardInterrupt):
+          if os.path.exists(path):
+            if os.path.isfile(path):
+              os.remove(path)
+            else:
+              shutil.rmtree(path)
+          raise
+      return True
+  return False
+
+
+def get_file(fname,
+             origin,
+             untar=False,
+             md5_hash=None,
+             file_hash=None,
+             cache_subdir='datasets',
+             hash_algorithm='auto',
+             extract=False,
+             archive_format='auto',
+             cache_dir=None):
   """Downloads a file from a URL if it not already in the cache.
 
-  Passing the MD5 hash will verify the file after download
-  as well as if it is already present in the cache.
+  By default the file at the url `origin` is downloaded to the
+  cache_dir `~/.keras`, placed in the cache_subdir `datasets`,
+  and given the filename `fname`. The final location of a file
+  `example.txt` would therefore be `~/.keras/datasets/example.txt`.
+
+  Files in tar, tar.gz, tar.bz, and zip formats can also be extracted.
+  Passing a hash will verify the file after download. The command line
+  programs `shasum` and `sha256sum` can compute the hash.
 
   Arguments:
-      fname: name of the file
-      origin: original URL of the file
-      untar: boolean, whether the file should be decompressed
-      md5_hash: MD5 hash of the file for verification
-      cache_subdir: directory being used as the cache
+      fname: Name of the file. If an absolute path `/path/to/file.txt` is
+          specified the file will be saved at that location.
+      origin: Original URL of the file.
+      untar: Deprecated in favor of 'extract'.
+          boolean, whether the file should be decompressed
+      md5_hash: Deprecated in favor of 'file_hash'.
+          md5 hash of the file for verification
+      file_hash: The expected hash string of the file after download.
+          The sha256 and md5 hash algorithms are both supported.
+      cache_subdir: Subdirectory under the Keras cache dir where the file is
+          saved. If an absolute path `/path/to/folder` is
+          specified the file will be saved at that location.
+      hash_algorithm: Select the hash algorithm to verify the file.
+          options are 'md5', 'sha256', and 'auto'.
+          The default 'auto' detects the hash algorithm in use.
+      extract: True tries extracting the file as an Archive, like tar or zip.
+      archive_format: Archive format to try for extracting the file.
+          Options are 'auto', 'tar', 'zip', and None.
+          'tar' includes tar, tar.gz, and tar.bz files.
+          The default 'auto' is ['tar', 'zip'].
+          None or an empty list will return no matches found.
+      cache_dir: Location to store cached files, when None it
+          defaults to the [Keras
+            Directory](/faq/#where-is-the-keras-configuration-filed-stored).
 
   Returns:
       Path to the downloaded file
   """
-  datadir_base = os.path.expanduser(os.path.join('~', '.keras'))
+  if cache_dir is None:
+    cache_dir = os.path.expanduser(os.path.join('~', '.keras'))
+  if md5_hash is not None and file_hash is None:
+    file_hash = md5_hash
+    hash_algorithm = 'md5'
+  datadir_base = os.path.expanduser(cache_dir)
   if not os.access(datadir_base, os.W_OK):
     datadir_base = os.path.join('/tmp', '.keras')
   datadir = os.path.join(datadir_base, cache_subdir)
@@ -106,10 +192,12 @@ def get_file(fname, origin, untar=False, md5_hash=None,
   download = False
   if os.path.exists(fpath):
     # File found; verify integrity if a hash was provided.
-    if md5_hash is not None:
-      if not validate_file(fpath, md5_hash):
+    if file_hash is not None:
+      if not validate_file(fpath, file_hash, algorithm=hash_algorithm):
         print('A local file was found, but it seems to be '
-              'incomplete or outdated.')
+              'incomplete or outdated because the ' + hash_algorithm +
+              ' file hash does not match the original value of ' + file_hash +
+              ' so we will re-download the data.')
         download = True
   else:
     download = True
@@ -141,38 +229,68 @@ def get_file(fname, origin, untar=False, md5_hash=None,
 
   if untar:
     if not os.path.exists(untar_fpath):
-      print('Untaring file...')
-      tfile = tarfile.open(fpath, 'r:gz')
-      try:
-        tfile.extractall(path=datadir)
-      except (Exception, KeyboardInterrupt) as e:
-        if os.path.exists(untar_fpath):
-          if os.path.isfile(untar_fpath):
-            os.remove(untar_fpath)
-          else:
-            shutil.rmtree(untar_fpath)
-        raise
-      tfile.close()
+      _extract_archive(fpath, datadir, archive_format='tar')
     return untar_fpath
 
+  if extract:
+    _extract_archive(fpath, datadir, archive_format)
+
   return fpath
 
 
-def validate_file(fpath, md5_hash):
-  """Validates a file against a MD5 hash.
+def _hash_file(fpath, algorithm='sha256', chunk_size=65535):
+  """Calculates a file sha256 or md5 hash.
+
+  Example:
+
+  ```python
+     >>> from keras.data_utils import _hash_file
+     >>> _hash_file('/path/to/file.zip')
+     'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
+  ```
+
+  Arguments:
+      fpath: path to the file being validated
+      algorithm: hash algorithm, one of 'auto', 'sha256', or 'md5'.
+          The default 'auto' detects the hash algorithm in use.
+      chunk_size: Bytes to read at a time, important for large files.
+
+  Returns:
+      The file hash
+  """
+  if (algorithm is 'sha256') or (algorithm is 'auto' and len(hash) is 64):
+    hasher = hashlib.sha256()
+  else:
+    hasher = hashlib.md5()
+
+  with open(fpath, 'rb') as fpath_file:
+    for chunk in iter(lambda: fpath_file.read(chunk_size), b''):
+      hasher.update(chunk)
+
+  return hasher.hexdigest()
+
+
+def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
+  """Validates a file against a sha256 or md5 hash.
 
   Arguments:
       fpath: path to the file being validated
-      md5_hash: the MD5 hash being validated against
+      file_hash:  The expected hash string of the file.
+          The sha256 and md5 hash algorithms are both supported.
+      algorithm: Hash algorithm, one of 'auto', 'sha256', or 'md5'.
+          The default 'auto' detects the hash algorithm in use.
+      chunk_size: Bytes to read at a time, important for large files.
 
   Returns:
       Whether the file is valid
   """
-  hasher = hashlib.md5()
-  with open(fpath, 'rb') as f:
-    buf = f.read()
-    hasher.update(buf)
-  if str(hasher.hexdigest()) == str(md5_hash):
+  if ((algorithm is 'sha256') or
+      (algorithm is 'auto' and len(file_hash) is 64)):
+    hasher = 'sha256'
+  else:
+    hasher = 'md5'
+
+  if str(_hash_file(fpath, hasher, chunk_size)) == str(file_hash):
     return True
   else:
     return False
diff --git a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
index c1e02968353bc4db1b722fe531811c88f2ba8e35..27cc23f232dfe5e753a257809468974cbf90fd93 100644
--- a/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/generic_utils.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import marshal
 import sys
 import time
@@ -26,6 +25,8 @@ import types as python_types
 import numpy as np
 import six
 
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 _GLOBAL_CUSTOM_OBJECTS = {}
 
@@ -116,6 +117,7 @@ def get_custom_objects():
 
 
 def serialize_keras_object(instance):
+  _, instance = tf_decorator.unwrap(instance)
   if instance is None:
     return None
   if hasattr(instance, 'get_config'):
@@ -149,14 +151,14 @@ def deserialize_keras_object(identifier,
       if cls is None:
         raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
     if hasattr(cls, 'from_config'):
-      arg_spec = inspect.getargspec(cls.from_config)
+      arg_spec = tf_inspect.getargspec(cls.from_config)
       if 'custom_objects' in arg_spec.args:
         custom_objects = custom_objects or {}
         return cls.from_config(
             config['config'],
             custom_objects=dict(
-                list(_GLOBAL_CUSTOM_OBJECTS.items()) + list(
-                    custom_objects.items())))
+                list(_GLOBAL_CUSTOM_OBJECTS.items()) +
+                list(custom_objects.items())))
       return cls.from_config(config['config'])
     else:
       # Then `cls` may be a function returning a class.
@@ -172,7 +174,8 @@ def deserialize_keras_object(identifier,
     else:
       fn = module_objects.get(function_name)
       if fn is None:
-        raise ValueError('Unknown ' + printable_module_name, ':' + class_name)
+        raise ValueError('Unknown ' + printable_module_name,
+                         ':' + function_name)
     return fn
   else:
     raise ValueError('Could not interpret serialized ' + printable_module_name +
@@ -215,6 +218,8 @@ def func_load(code, defaults=None, closure=None, globs=None):
   """
   if isinstance(code, (tuple, list)):  # unpack previous dump
     code, defaults, closure = code
+    if isinstance(defaults, list):
+      defaults = tuple(defaults)
   code = marshal.loads(code.encode('raw_unicode_escape'))
   if globs is None:
     globs = globals()
diff --git a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
index 32e0de7d3dc19a4585f5197381ddae9229ad05e9..26878fdd57fb5b4e59e3d8ce27a3f65766d7b98e 100644
--- a/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/layer_utils.py
@@ -171,7 +171,7 @@ def count_total_params(layers, layer_set=None):
           [K.count_params(p) for p in layer.trainable_weights])
       non_trainable_count += np.sum(
           [K.count_params(p) for p in layer.non_trainable_weights])
-  return trainable_count, non_trainable_count
+  return int(trainable_count), int(non_trainable_count)
 
 
 def convert_all_kernels_in_model(model):
diff --git a/tensorflow/contrib/keras/python/keras/utils/vis_utils.py b/tensorflow/contrib/keras/python/keras/utils/vis_utils.py
index 49efa6040d2b23640bcec293207f045c96bd9fae..9e2ee86424595c20c516df0d99553e7c7b9d88c3 100644
--- a/tensorflow/contrib/keras/python/keras/utils/vis_utils.py
+++ b/tensorflow/contrib/keras/python/keras/utils/vis_utils.py
@@ -18,17 +18,24 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-
+import sys
 
 try:
   # pydot-ng is a fork of pydot that is better maintained.
   import pydot_ng as pydot  # pylint: disable=g-import-not-at-top
 except ImportError:
   # Fall back on pydot if necessary.
+  # Silence a `print` statement that occurs in case of import error,
+  # by temporarily replacing sys.stdout.
+  _stdout = sys.stdout
+  sys.stdout = sys.stderr
   try:
     import pydot  # pylint: disable=g-import-not-at-top
   except ImportError:
     pydot = None
+  finally:
+    # Restore sys.stdout.
+    sys.stdout = _stdout
 
 
 def _check_pydot():
diff --git a/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn.py b/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn.py
index ecda890fec966e7151ed50d40825fde72ea5ffd2..0d04fc120f112d91cfbdcc59d1555d5fcb57e0ed 100644
--- a/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn.py
+++ b/tensorflow/contrib/keras/python/keras/wrappers/scikit_learn.py
@@ -19,13 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import inspect
 import types
 
 import numpy as np
 
 from tensorflow.contrib.keras.python.keras.models import Sequential
 from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical
+from tensorflow.python.util import tf_inspect
 
 
 class BaseWrapper(object):
@@ -97,7 +97,7 @@ class BaseWrapper(object):
 
     legal_params = []
     for fn in legal_params_fns:
-      legal_params += inspect.getargspec(fn)[0]
+      legal_params += tf_inspect.getargspec(fn)[0]
     legal_params = set(legal_params)
 
     for params_name in params:
@@ -109,7 +109,7 @@ class BaseWrapper(object):
     """Gets parameters for this estimator.
 
     Arguments:
-        **params: ignored (exists for API compatiblity).
+        **params: ignored (exists for API compatibility).
 
     Returns:
         Dictionary of parameter names mapped to their values.
@@ -182,7 +182,7 @@ class BaseWrapper(object):
     """
     override = override or {}
     res = {}
-    fn_args = inspect.getargspec(fn)[0]
+    fn_args = tf_inspect.getargspec(fn)[0]
     for name, value in self.sk_params.items():
       if name in fn_args:
         res.update({name: value})
@@ -194,6 +194,36 @@ class KerasClassifier(BaseWrapper):
   """Implementation of the scikit-learn classifier API for Keras.
   """
 
+  def fit(self, x, y, **kwargs):
+    """Constructs a new model with `build_fn` & fit the model to `(x, y)`.
+
+    Arguments:
+        x : array-like, shape `(n_samples, n_features)`
+            Training samples where n_samples in the number of samples
+            and n_features is the number of features.
+        y : array-like, shape `(n_samples,)` or `(n_samples, n_outputs)`
+            True labels for X.
+        **kwargs: dictionary arguments
+            Legal arguments are the arguments of `Sequential.fit`
+
+    Returns:
+        history : object
+            details about the training history at each epoch.
+
+    Raises:
+        ValueError: In case of invalid shape for `y` argument.
+    """
+    y = np.array(y)
+    if len(y.shape) == 2 and y.shape[1] > 1:
+      self.classes_ = np.arange(y.shape[1])
+    elif (len(y.shape) == 2 and y.shape[1] == 1) or len(y.shape) == 1:
+      self.classes_ = np.unique(y)
+      y = np.searchsorted(self.classes_, y)
+    else:
+      raise ValueError('Invalid shape for y: ' + str(y.shape))
+    self.n_classes_ = len(self.classes_)
+    return super(KerasClassifier, self).fit(x, y, **kwargs)
+
   def predict(self, x, **kwargs):
     """Returns the class predictions for the given test data.
 
@@ -210,7 +240,8 @@ class KerasClassifier(BaseWrapper):
             Class predictions.
     """
     kwargs = self.filter_sk_params(Sequential.predict_classes, kwargs)
-    return self.model.predict_classes(x, **kwargs)
+    classes = self.model.predict_classes(x, **kwargs)
+    return self.classes_[classes]
 
   def predict_proba(self, x, **kwargs):
     """Returns class probability estimates for the given test data.
@@ -261,6 +292,7 @@ class KerasClassifier(BaseWrapper):
             compute accuracy. You should pass `metrics=["accuracy"]` to
             the `.compile()` method of the model.
     """
+    y = np.searchsorted(self.classes_, y)
     kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
 
     loss_name = self.model.loss
diff --git a/tensorflow/contrib/kernel_methods/BUILD b/tensorflow/contrib/kernel_methods/BUILD
index b37cbc119f427bd14be584f8991197bea35afee1..fccaa3abd4d70519741b8e375a077d95b51f1b2c 100644
--- a/tensorflow/contrib/kernel_methods/BUILD
+++ b/tensorflow/contrib/kernel_methods/BUILD
@@ -21,9 +21,14 @@ py_library(
         ":dense_kernel_mapper_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
         "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -31,6 +36,7 @@ py_library(
     name = "dense_kernel_mapper_py",
     srcs = ["python/mappers/dense_kernel_mapper.py"],
     srcs_version = "PY2AND3",
+    deps = ["@six_archive//:six"],
 )
 
 py_test(
@@ -40,12 +46,12 @@ py_test(
     deps = [
         ":dense_kernel_mapper_py",
         ":kernel_methods",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
-        "//tensorflow/python:ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
     ],
 )
 
@@ -55,10 +61,12 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":kernel_methods",
-        "//tensorflow/python:client_testlib",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/learn",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/kernel_methods/README.md b/tensorflow/contrib/kernel_methods/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..44ed9670a09ece8fb11e79a3e58725e2a54e513b
--- /dev/null
+++ b/tensorflow/contrib/kernel_methods/README.md
@@ -0,0 +1,53 @@
+# TensorFlow contrib kernel_methods.
+
+This module contains operations and estimators that enable the use of primal
+(explicit) kernel methods in TensorFlow. See also the [tutorial](https://www.tensorflow.org/code/tensorflow/contrib/kernel_methods/g3doc/tutorial.md) on how to use this module to improve the quality of
+classification or regression tasks.
+
+## Kernel Mappers
+Implement explicit kernel mapping Ops over tensors. Kernel mappers add
+Tensor-In-Tensor-Out (TITO) Ops to the TensorFlow graph. They can be used in
+conjunction with other layers or ML models.
+
+Sample usage:
+
+```python
+kernel_mapper = tf.contrib.kernel_methods.SomeKernelMapper(...)
+out_tensor = kernel_mapper.map(in_tensor)
+...  # code that consumes out_tensor.
+```
+
+Currently, there is a [RandomFourierFeatureMapper](https://www.tensorflow.org/code/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py) implemented that maps dense input to dense
+output. More mappers are on the way.
+
+## Kernel-based Estimators
+These are estimators inheriting from the @{tf.contrib.learn.Estimator} class and
+use kernel mappers internally to discover non-linearities in the data. These
+canned estimators map their input features using kernel mapper Ops and then
+apply linear models to the mapped features. Combining kernel mappers with linear
+models and different loss functions leads to a variety of models: linear and
+non-linear SVMs, linear regression (with and without kernels) and (multinomial)
+logistic regression (with and without kernels).
+
+Currently there is a [KernelLinearClassifier](https://www.tensorflow.org/code/tensorflow/contrib/kernel_methods/python/kernel_estimators.py) implemented but more pre-packaged estimators
+are on the way.
+
+Sample usage:
+
+```python
+real_column_a = tf.contrib.layers.real_valued_column(name='real_column_a',...)
+sparse_column_b = tf.contrib.layers.sparse_column_with_hash_bucket(...)
+kernel_mappers = {real_column_a : [tf.contrib.kernel_methods.SomeKernelMapper(...)]}
+optimizer = ...
+
+kernel_classifier = tf.contrib.kernel_methods.KernelLinearClassifier(
+    feature_columns=[real_column_a, sparse_column_b],
+    model_dir=...,
+    optimizer=optimizer,
+    kernel_mappers=kernel_mappers)
+
+# Construct input_fns
+kernel_classifier.fit(...)
+kernel_classifier.evaluate(...)
+```
+
diff --git a/tensorflow/contrib/kernel_methods/__init__.py b/tensorflow/contrib/kernel_methods/__init__.py
index 1a3a0ab77a6d24e39dd9fc3626410141969f1318..7272e5951605617ffcde46bab2743bde70123f35 100644
--- a/tensorflow/contrib/kernel_methods/__init__.py
+++ b/tensorflow/contrib/kernel_methods/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Ops for evaluation metrics and summary statistics.
+"""Ops and estimators that enable explicit kernel methods in TensorFlow.
 
 @@KernelLinearClassifier
 @@RandomFourierFeatureMapper
@@ -22,7 +22,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.kernel_methods.python.kernel_estimators import KernelLinearClassifier
-from tensorflow.contrib.kernel_methods.python.mappers import dense_kernel_mapper
 from tensorflow.contrib.kernel_methods.python.mappers.random_fourier_features import RandomFourierFeatureMapper
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/contrib/kernel_methods/g3doc/acc-vs-trn_time.png b/tensorflow/contrib/kernel_methods/g3doc/acc-vs-trn_time.png
new file mode 100644
index 0000000000000000000000000000000000000000..1028bb390179cd21dba0b41b53a0b1eff4e345a4
Binary files /dev/null and b/tensorflow/contrib/kernel_methods/g3doc/acc-vs-trn_time.png differ
diff --git a/tensorflow/contrib/kernel_methods/g3doc/acc_vs_outdim.png b/tensorflow/contrib/kernel_methods/g3doc/acc_vs_outdim.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3384e053b24d0225576a8610147b042fb7b1708
Binary files /dev/null and b/tensorflow/contrib/kernel_methods/g3doc/acc_vs_outdim.png differ
diff --git a/tensorflow/contrib/kernel_methods/g3doc/kernel_mapping.png b/tensorflow/contrib/kernel_methods/g3doc/kernel_mapping.png
new file mode 100644
index 0000000000000000000000000000000000000000..e63303dab45d671acbeb839b22726ac0877dffae
Binary files /dev/null and b/tensorflow/contrib/kernel_methods/g3doc/kernel_mapping.png differ
diff --git a/tensorflow/contrib/kernel_methods/g3doc/tutorial.md b/tensorflow/contrib/kernel_methods/g3doc/tutorial.md
new file mode 100644
index 0000000000000000000000000000000000000000..64c2adf9f39f8b49ca2135d5084a526789c1f4fd
--- /dev/null
+++ b/tensorflow/contrib/kernel_methods/g3doc/tutorial.md
@@ -0,0 +1,279 @@
+# Improving Linear Models Using Explicit Kernel Methods
+
+In this tutorial, we demonstrate how combining (explicit) kernel methods with
+linear models can drastically increase the latters' quality of predictions
+without significantly increasing training and inference times. Unlike dual
+kernel methods, explicit (primal) kernel methods scale well with the size of the
+training dataset both in terms of training/inference times and in terms of
+memory requirements.
+
+Currently, explicit kernel mappings are supported for dense features. Support
+for sparse features is in the works.
+
+We will use [tf.contrib.learn](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn) (TensorFlow's high-level Machine Learning API) Estimators for our ML models. The
+tf.contrib.learn API reduces the boilerplate code one needs to write for
+configuring, training and evaluating models and will let us focus on the core
+ideas. If you are not familiar with this API, [tf.contrib.learn Quickstart](https://www.tensorflow.org/get_started/tflearn) is a good place to start. We
+will use MNIST, a widely-used dataset containing images of handwritten digits
+(between 0 and 9). The tutorial consists of the following steps:
+
+* Load and prepare MNIST data for classification.
+* Construct a simple linear model, train it and evaluate it on the eval data.
+* Replace the linear model with a kernelized linear model, re-train and
+re-evaluate.
+
+## Load and prepare MNIST data for classification
+The first step is to prepare the data to be fed to the ML models. The following
+utility command from tf.contrib.learn loads the MNIST dataset:
+
+```python
+data = tf.contrib.learn.datasets.mnist.load_mnist()
+```
+This loads the entire MNIST dataset (containing 70K samples) and splits it into
+train, validation and test data with 55K, 5K and 10K samples respectively. Each
+split contains one numpy array for images (with shape [sample_size, 784]) and
+one for labels (with shape [sample_size, 1]). In this tutorial, we only use the
+train and validation splits (to train and evaluate our models respectively).
+
+In order to feed data to a tf.contrib.learn Estimator, it is helpful to convert
+it to Tensors. For this, we will use an `input function` which adds Ops to the
+TensorFlow graph that, when executed, create mini-batches of Tensors to be used
+downstream. For more background on input functions, check
+[Building Input Functions with tf.contrib.learn](https://www.tensorflow.org/get_started/input_fn).
+In this example, we will use the `tf.train.shuffle_batch` Op which, besides
+converting numpy arrays to Tensors, allows us to specify the batch_size and
+whether to randomize the input every time the input_fn Ops are executed
+(randomization typically expedites convergence during training). The full code
+for loading and preparing the data is shown in the snippet below. In this
+example, we use mini-batches of size 256 for training and the entire sample (5K
+entries) for evaluation. Feel free to experiment with different batch sizes.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+def get_input_fn(dataset_split, batch_size, capacity=10000, min_after_dequeue=3000):
+
+  def _input_fn():
+    images_batch, labels_batch = tf.train.shuffle_batch(
+        tensors=[dataset_split.images, dataset_split.labels.astype(np.int32)],
+        batch_size=batch_size,
+        capacity=capacity,
+        min_after_dequeue=min_after_dequeue,
+        enqueue_many=True,
+        num_threads=4)
+    features_map = {'images': images_batch}
+    return features_map, labels_batch
+
+  return _input_fn
+
+data = tf.contrib.learn.datasets.mnist.load_mnist()
+
+train_input_fn = get_input_fn(data.train, batch_size=256)
+eval_input_fn = get_input_fn(data.validation, batch_size=5000)
+
+```
+
+## Training a simple linear model
+We can now train a linear model over the MNIST dataset. We will use the
+[tf.contrib.learn.LinearClassifier](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py) estimator with 10 classes (representing the 10 digits).
+The input features form a 784-dimensional (dense) vector which can be specified
+as follows:
+
+```python
+image_column = tf.contrib.layers.real_valued_column('images', dimension=784)
+```
+
+The full code for constructing, training and evaluating a LinearClassifier
+estimator is shown below.
+
+```python
+import time
+
+# Specify the feature(s) to be used by the estimator.
+image_column = tf.contrib.layers.real_valued_column('images', dimension=784)
+estimator = tf.contrib.learn.LinearClassifier(feature_columns=[image_column], n_classes=10)
+
+# Train.
+start = time.time()
+estimator.fit(input_fn=train_input_fn, steps=2000)
+end = time.time()
+print('Elapsed time: {} seconds'.format(end - start))
+
+# Evaluate and report metrics.
+eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1)
+print(eval_metrics)
+```
+On eval data, the loss (i.e., the value of the objective function being
+minimized during training) lies between **0.25** and **0.30** (depending on the
+parameters used) while the accuracy of the classifier is approximately **92.5%**
+(training is randomized so the exact loss and accuracy will vary). Also, the
+training time is around 25 seconds (this will also vary based on the machine you
+run the code on).
+
+In addition to experimenting with the (training) batch size and the number of
+training steps, there are a couple other parameters that can be tuned as well.
+For instance, you can change the optimization method used to minimize the loss
+by explicitly selecting another optimizer from the collection of
+[available optimizers](https://www.tensorflow.org/code/tensorflow/python/training).
+As an example, the following code constructs a LinearClassifier estimator that
+uses the Follow-The-Regularized-Leader (FTRL) optimization strategy with a
+specific learning rate and L2-regularization.
+
+
+```python
+optimizer = tf.train.FtrlOptimizer(learning_rate=5.0, l2_regularization_strength=1.0)
+estimator = tf.contrib.learn.LinearClassifier(
+    feature_columns=[image_column], n_classes=10, optimizer=optimizer)
+```
+
+Regardless of the values of the parameters, the max accuracy a linear model can
+achieve on this dataset caps at around **93%**.
+
+## Using explicit kernel mappings with the linear model.
+The relatively high error (~7%) of the linear model over MNIST indicates that
+the input data is not linearly separable. We will use explicit kernel mappings
+to reduce the classification error.
+
+**Intuition:** The high-level idea is to use a non-linear map to transform the
+input space to another feature space (of possibly higher dimension) where the
+(transformed) features are (almost) linearly separable and then apply a linear
+model on the mapped features. This is shown in the following figure:
+
+![image](./kernel_mapping.png)
+
+**Technical details overview:** In this example we will use **Random Fourier
+Features** (introduced in the
+["Random Features for Large-Scale Kernel Machines"](https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf) paper by
+Rahimi and Recht) to map the input data. Random Fourier Features map a vector
+\\(\mathbf{x} \in \mathbb{R}^d\\) to \\(\mathbf{x'} \in \mathbb{R}^D\\) via the
+following mapping:
+
+$$
+RFFM(\cdot): \mathbb{R}^d \to \mathbb{R}^D, \quad
+RFFM(\mathbf{x}) =  \cos(\mathbf{\Omega} \cdot \mathbf{x}+ \mathbf{b})
+$$
+
+where \\(\mathbf{\Omega} \in \mathbb{R}^{D \times d}\\),
+\\(\mathbf{x} \in \mathbb{R}^d,\\) \\(\mathbf{b} \in \mathbb{R}^D\\) and the
+cosine is applied element-wise.
+
+In this example, the entries of \\(\mathbf{\Omega}\\) and \\(\mathbf{b}\\) are
+sampled from distributions such that the mapping satisfies the following
+property:
+
+$$
+RFFM(\mathbf{x})^T \cdot RFFM(\mathbf{y}) \approx
+e^{-\frac{\|\mathbf{x} - \mathbf{y}\|^2}{2 \sigma^2}}
+$$
+
+The right-hand-side quantity of the expression above is known as the RBF (or
+Gaussian) kernel function. This function is one of the most-widely used kernel
+functions in Machine Learning and measures (implicitly) similarity in a
+different (much higher dimensional) space than the original one. See
+[Radial basis function kernel](https://en.wikipedia.org/wiki/Radial_basis_function_kernel)
+for more details.
+
+**Kernel Classifier:** `tf.contrib.kernel_methods.KernelLinearClassifier` is a
+pre-packaged `tf.contrib.learn` estimator that combines the power of explicit
+kernel mappings with linear models. Its API is very similar to that of the
+LinearClassifier with the additional ability to specify a list of explicit
+kernel mappings to be applied to each feature used by the classifier. The
+following code snippet demonstrates how to replace LinearClassifier with
+KernelLinearClassifier.
+
+
+```python
+# Specify the feature(s) to be used by the estimator. This is identical to the
+# code used for the LinearClassifier.
+image_column = tf.contrib.layers.real_valued_column('images', dimension=784)
+optimizer = tf.train.FtrlOptimizer(
+   learning_rate=50.0, l2_regularization_strength=0.001)
+
+
+kernel_mapper = tf.contrib.kernel_methods.RandomFourierFeatureMapper(
+  input_dim=784, output_dim=2000, stddev=5.0, name='rffm')
+kernel_mappers = {image_column: [kernel_mapper]}
+estimator = tf.contrib.kernel_methods.KernelLinearClassifier(
+   n_classes=10, optimizer=optimizer, kernel_mappers=kernel_mappers)
+
+# Train.
+start = time.time()
+estimator.fit(input_fn=train_input_fn, steps=2000)
+end = time.time()
+print('Elapsed time: {} seconds'.format(end - start))
+
+# Evaluate and report metrics.
+eval_metrics = estimator.evaluate(input_fn=eval_input_fn, steps=1)
+print(eval_metrics)
+```
+The only additional parameter passed to `KernelLinearClassifier` is a dictionary
+from feature_columns to a list of kernel mappings to be applied to the
+corresponding feature column. In this example, the lines
+
+```python
+kernel_mapper = tf.contrib.kernel_methods.RandomFourierFeatureMapper(
+  input_dim=784, output_dim=2000, stddev=5.0, name='rffm')
+kernel_mappers = {image_column: [kernel_mapper]}
+estimator = tf.contrib.kernel_methods.KernelLinearClassifier(
+   n_classes=10, optimizer=optimizer, kernel_mappers=kernel_mappers)
+```
+instruct the classifier to first map the initial 784-dimensional images to
+2000-dimensional vectors using random Fourier features and then learn a linear
+model on the transformed vectors. Note that, besides the output dimension, there
+is one more parameter (stddev) involved. This parameter is the standard
+deviation (\\(\sigma\\)) of the approximated RBF kernel and controls the
+similarity measure used in classification. This parameter is typically
+determined via hyperparameter tuning.
+
+Running the code above yields a loss of approximately **0.10** while the
+accuracy is increased to approximately **97%** on eval data (an increase of 4%
+over the plain linear model). The training time hovers around 35 seconds. We can
+increase the accuracy even more, by increasing the output dimension of the
+mapping and tuning the standard deviation even more.
+
+**On the role of stddev:** The classification quality is very sensitive to the
+value of the stddev parameter used to define the similarity measure between the
+pairs of input features. The following table shows the accuracy of the
+classifier on the eval data for different values of stddev (for all experiments
+the output dimension was fixed to 3000). The optimal value is stddev=5.0. Notice
+how too small or too high stddev values can dramatically decrease the accuracy
+of the classification.
+
+stddev | eval accuracy
+:----- | :------------
+1.0    | 0.1362
+2.0    | 0.4764
+4.0    | 0.9654
+5.0    | 0.9766
+8.0    | 0.9714
+16.0   | 0.8878
+
+**On the role of the output dimension:** Intuitively, the larger the output
+dimension of the mapping, the closer the inner product of two mapped vectors
+approximates the kernel which typically translates to better classification
+accuracy. Another way to think about this is that the output dimension equals
+the number of weights of the linear model (the larger this dimension, the larger
+the "degrees of freedom" of the model). However, after a certain threshold,
+higher output dimensions increase the accuracy by very little (while still
+increasing the training time). This is shown in the following 2 Figures which
+depict the eval accuracy as a function of the output dimension and the training
+time respectively.
+
+![image](./acc_vs_outdim.png)  ![image](./acc-vs-trn_time.png)
+
+
+## Explicit kernel mappings: summary and practical tips
+* Explicit kernel mappings combine the predictive power of non-linear models
+with the scalability of linear models.
+* Unlike traditional dual kernel methods, they can scale to millions or hundreds
+of millions of samples.
+* Random Fourier Features can be particularly effective for datasets with dense
+features.
+* The parameters of the kernel mapping are often data-dependent. Model quality
+can be very sensitive to these parameters. Use hyperparameter tuning to find the
+optimal values.
+* If you have multiple numerical features, concatinate them into a single
+multi-dimensional feature and apply the kernel mapping to the concatenated
+vector.
+
diff --git a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
index 80370824878d384ecd1ed70bd2f18e38886ec452..de7530231db4ea4f50996a67eb8c0d6936db9dd3 100644
--- a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
+++ b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
@@ -59,7 +59,7 @@ def _update_features_and_columns(features, feature_columns,
                                  kernel_mappers_dict):
   """Updates features and feature_columns based on provided kernel mappers.
 
-  Currently supports the update of RealValuedColumns only.
+  Currently supports the update of `RealValuedColumn`s only.
 
   Args:
     features: Initial features dict. The key is a `string` (feature column name)
@@ -176,7 +176,7 @@ class _KernelEstimator(estimator.Estimator):
         "head": head,
         _FEATURE_COLUMNS: feature_columns or [],
         _OPTIMIZER: optimizer,
-        _KERNEL_MAPPERS: kernel_mappers
+        _KERNEL_MAPPERS: kernel_mappers,
     }
     super(_KernelEstimator, self).__init__(
         model_fn=_kernel_model_fn,
@@ -196,8 +196,10 @@ class KernelLinearClassifier(_KernelEstimator):
   The user can provide a list of kernel mappers to be applied to all or a subset
   of existing feature_columns. This way, the user can effectively provide 2
   types of feature columns:
-  - those passed as elements of feature_columns in the classifier's constructor
-  - those appearing as a key of the kernel_mappers dict.
+
+  * those passed as elements of feature_columns in the classifier's constructor
+  * those appearing as a key of the kernel_mappers dict.
+
   If a column appears in feature_columns only, no mapping is applied to it. If
   it appears as a key in kernel_mappers, the corresponding kernel mappers are
   applied to it. Note that it is possible that a column appears in both places.
@@ -237,8 +239,9 @@ class KernelLinearClassifier(_KernelEstimator):
   kernel_classifier.predict(...)
   ```
 
-  Input of `fit` and `evaluate` should have following features,
-    otherwise there will be a `KeyError`:
+  Input of `fit` and `evaluate` should have following features, otherwise there
+  will be a `KeyError`:
+
   * if `weight_column_name` is not `None`, a feature with
     `key=weight_column_name` whose value is a `Tensor`.
   * for each `column` in `feature_columns`:
@@ -300,6 +303,7 @@ class KernelLinearClassifier(_KernelEstimator):
         weight_column_name=weight_column_name,
         head=head_lib.multi_class_head(
             n_classes=n_classes, weight_column_name=weight_column_name),
+        optimizer=optimizer,
         kernel_mappers=kernel_mappers,
         config=config)
 
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 270a2439703288a63c12c7a5334875967d544697..9dc01124ab195ae17b8795a11e4ebefe3f2c746b 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -31,26 +31,31 @@ from tensorflow.python.ops import math_ops
 # TODO(sibyl-vie3Poto,felixyu): add an option to control whether the parameters in the
 # kernel map are trainable.
 class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
-  """Class that implements Random Fourier Feature Mapping.
+  r"""Class that implements Random Fourier Feature Mapping (RFFM) in TensorFlow.
 
   The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
-    exp(-||x-y||_2^2 / (2 * sigma^2))
+  ```
+  exp(-||x-y||_2^2 / (2 * sigma^2))
+  ```
 
   The implementation of RFFM is based on the following paper:
   "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
   (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
 
-  The mapping uses a matrix Omega in R^{d x D} and a bias vector b in R^D where
-  d is the input dimension (number of dense input features) and D is the output
-  dimension (i.e., dimension of the feature space the input is mapped to). Each
-  entry of Omega is sampled i.i.d. from a (scaled) Gaussian distribution and
-  each entry of the bias vector is sampled i.i.d. and uniformly from [0, 2*pi].
+  The mapping uses a matrix `Omega \in R^{d x D}` and a bias vector `b \in R^D`
+  where `d` is the input dimension (number of dense input features) and `D` is
+  the output dimension (i.e., dimension of the feature space the input is mapped
+  to). Each entry of `Omega` is sampled i.i.d. from a (scaled) Gaussian
+  distribution and each entry of `b` is sampled independently and uniformly from
+  [0, 2 * pi].
 
   For a single input feature vector x in R^d, its RFFM is defined as:
-              sqrt(2/D) * cos(x * Omega + b)
-  where cos is the element-wise cosine function and x, b are represented as row
-  vectors. The aforementioned paper shows that the linear kernel of RFFM-mapped
-  vectors approximates the Gaussian kernel of the initial vectors.
+  ```
+      sqrt(2/D) * cos(x * Omega + b)
+  ```
+  where `cos` is the element-wise cosine function and `x, b` are represented as
+  row vectors. The aforementioned paper shows that the linear kernel of
+  RFFM-mapped vectors approximates the Gaussian kernel of the initial vectors.
 
   """
 
@@ -63,8 +68,8 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
       stddev: The standard deviation of the Gaussian kernel to be approximated.
         The error of the classifier trained using this approximation is very
         sensitive to this parameter.
-      seed: An integer used to initialize the parameters (Omega and bias) of the
-        mapper. For repeatable sequences across different invocations of the
+      seed: An integer used to initialize the parameters (`Omega` and `b`) of
+        the mapper. For repeatable sequences across different invocations of the
         mapper object (for instance, to ensure consistent mapping both at
         training and eval/inference if these happen in different invocations),
         set this to the same integer.
@@ -83,13 +88,13 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
 
   @property
   def name(self):
-    """Returns a name for the RandomFourierFeatureMapper instance.
+    """Returns a name for the `RandomFourierFeatureMapper` instance.
 
-    If the name provided in the constructor is None, then the object's unique id
-    is returned.
+    If the name provided in the constructor is `None`, then the object's unique
+    id is returned.
 
     Returns:
-      A name for the RandomFourierFeatureMapper instance.
+      A name for the `RandomFourierFeatureMapper` instance.
     """
     return self._name or str(id(self))
 
@@ -105,15 +110,15 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
     """Maps each row of input_tensor using random Fourier features.
 
     Args:
-      input_tensor: tensor containing input features. It's shape is
+      input_tensor: a `Tensor` containing input features. It's shape is
       [batch_size, self._input_dim].
 
     Returns:
-      A tensor of shape [batch_size, self._output_dim] containing RFFM-mapped
+      A `Tensor` of shape [batch_size, self._output_dim] containing RFFM-mapped
       features.
 
     Raises:
-      InvalidShapeError: if the shape of the input_tensor is inconsistent with
+      InvalidShapeError: if the shape of the `input_tensor` is inconsistent with
         expected input dimension.
     """
     input_tensor_shape = input_tensor.get_shape()
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
index 200d00b6637e002fb88b9744ab89b3c448c81fcd..6f4a264485993ab737723171409042b4a9673669 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
@@ -85,7 +85,7 @@ class RandomFourierFeatureMapperTest(TensorFlowTestCase):
       mapped_x = rffm.map(x)
       mapped_x_copy = rffm.map(x)
       # Two different evaluations of tensors output by map on the same input
-      # are identical because the same paramaters are used for the mappings.
+      # are identical because the same parameters are used for the mappings.
       self.assertAllClose(mapped_x.eval(), mapped_x_copy.eval(), atol=0.001)
 
   def testTwoMapperObjects(self):
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py b/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
index 4a939cb22c54a3a1eaa2466cae676211a81d2027..80fa17ec1f7f0ddaaaabaee4536a2d8ca0c93657 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/_typecheck.py
@@ -24,9 +24,9 @@ from __future__ import print_function
 
 import collections
 import functools
-import inspect
 import re
 
+from tensorflow.python.util import tf_inspect
 
 # used for register_type_abbreviation and _type_repr below.
 _TYPE_ABBREVIATIONS = {}
@@ -230,7 +230,7 @@ def accepts(*types):
 
   def check_accepts(f):
     """Check the types."""
-    spec = inspect.getargspec(f)
+    spec = tf_inspect.getargspec(f)
 
     num_function_arguments = len(spec.args)
     if len(types) != num_function_arguments:
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/core.py b/tensorflow/contrib/labeled_tensor/python/ops/core.py
index 393c7f93f367e3fea9b91ebce773bd1596a77cf4..d886a17c498c835c83764baa31e6b803760919bc 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/core.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/core.py
@@ -618,7 +618,7 @@ def identity(labeled_tensor, name=None):
 def slice_function(labeled_tensor, selection, name=None):
   """Slice out a subset of the tensor.
 
-  This is an analogue of tf.slice.
+  This is an analog of tf.slice.
   For example:
   >>> tensor = tf.reshape(tf.range(0, 6), [3, 2])
   >>> labeled_tensor = lt.LabeledTensor(tensor, ['a', ('b', ['foo', 'bar'])])
@@ -810,7 +810,7 @@ def axis_order_scope(axis_order=None):
   Example usage:
 
     with lt.axis_order_scope(['x', 'y', 'z']):
-      # result is guranteed to have the correct axis order
+      # result is guaranteed to have the correct axis order
       result = w + b
 
   You can nest scopes, in which case only the inner-most scope applies, e.g.,
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops.py b/tensorflow/contrib/labeled_tensor/python/ops/ops.py
index 98842494face96158d24bd89caaf5532024a39ef..c957b41a49b292225e547ce17b0c5a247810325a 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops.py
@@ -51,8 +51,7 @@ def _gather_1d_on_axis(labeled_tensor, indexer, axis, name=None):
 @tc.returns(core.LabeledTensor)
 @tc.accepts(core.LabeledTensorLike,
             tc.Mapping(string_types,
-                       tc.Union(slice, collections.Hashable,
-                                collections.Sequence)),
+                       tc.Union(slice, collections.Hashable, list)),
             tc.Optional(string_types))
 def select(labeled_tensor, selection, name=None):
   """Slice out a subset of the tensor.
@@ -110,23 +109,22 @@ def select(labeled_tensor, selection, name=None):
 
         slices[axis_name] = slice(start, stop)
 
-      else:
-        # We're allowing anything NumPy treats as a scalar or 1D array.
-        value = np.asarray(value)
-        if value.ndim == 0:
-          slices[axis_name] = axis.index(value.item())
-        elif value.ndim == 1:
-          if indexers:
-            raise NotImplementedError(
-                'select does not yet support more than one list selection at '
-                'the same time')
-          indexer = [axis.index(v) for v in value.tolist()]
-          indexers[axis_name] = ops.convert_to_tensor(
-              indexer, dtype=dtypes.int64)
-        else:
+      # Needs to be after checking for slices, since slice objects claim to be
+      # instances of collections.Hashable but hash() on them fails.
+      elif isinstance(value, collections.Hashable):
+        slices[axis_name] = axis.index(value)
+
+      elif isinstance(value, list):
+        if indexers:
           raise NotImplementedError(
-              'select does not yet support selections with more than one '
-              'dimension: %s on axis %r' % (value, axis_name))
+              'select does not yet support more than one list selection at '
+              'the same time')
+        indexer = [axis.index(v) for v in value]
+        indexers[axis_name] = ops.convert_to_tensor(indexer, dtype=dtypes.int64)
+
+      else:
+        # If type checking is working properly, this shouldn't be possible.
+        raise TypeError('cannot handle arbitrary types')
 
     if indexers and slices:
       raise NotImplementedError(
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
index ea5e008752391053cbe77b88f927642be07a125a..0727f4cf88728dc3d919e662d65c93a658ac730b 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops_test.py
@@ -121,6 +121,13 @@ class SelectTest(Base):
     golden_lt = core.LabeledTensor(self.tensor[1, 1, :, :], [self.a2, self.a3])
     self.assertLabeledTensorsEqual(select_lt, golden_lt)
 
+  def test_tuple(self):
+    original_lt = core.LabeledTensor(constant_op.constant([5, 6]),
+                                     [('x', [(1, 2), (3, 4)])])
+    select_lt = ops.select(original_lt, {'x': (1, 2)})
+    golden_lt = core.LabeledTensor(constant_op.constant(5), [])
+    self.assertLabeledTensorsEqual(select_lt, golden_lt)
+
   def test_invalid_input(self):
     with self.assertRaises(ValueError):
       ops.select(self.original_lt, {'foo': 1})
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index f5eb723e2db94fb2f410ad0783e5a3adc8a6c4e8..03af3771495fb3919062f1cafd3b757f87c2344d 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -18,35 +18,6 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 
-tf_custom_op_library(
-    # TODO(sibyl-Mooth6ku,ptucker): Understand why 'python/ops/_' is needed and fix it.
-    name = "python/ops/_bucketization_op.so",
-    srcs = [
-        "ops/bucketization_op.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/layers/kernels:bucketization_kernel",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["bucketization_op"],
-)
-
-tf_gen_op_wrapper_py(
-    name = "bucketization_op",
-    deps = [":bucketization_op_op_lib"],
-)
-
-tf_kernel_library(
-    name = "bucketization_op_kernel",
-    deps = [
-        "//tensorflow/contrib/layers/kernels:bucketization_kernel",
-        "//tensorflow/core:framework",
-    ],
-    alwayslink = 1,
-)
-
 tf_custom_op_library(
     # TODO(sibyl-Mooth6ku,ptucker): Understand why 'python/ops/_' is needed and fix it.
     name = "python/ops/_sparse_feature_cross_op.so",
@@ -97,18 +68,14 @@ tf_custom_op_py_library(
         "python/ops/sparse_ops.py",
     ],
     dso = [
-        ":python/ops/_bucketization_op.so",
         ":python/ops/_sparse_feature_cross_op.so",
     ],
     kernels = [
-        ":bucketization_op_kernel",
         ":sparse_feature_cross_op_kernel",
-        ":bucketization_op_op_lib",
         ":sparse_feature_cross_op_op_lib",
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":bucketization_op",
         ":sparse_feature_cross_op",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/lookup:lookup_py",
@@ -118,6 +85,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:clip_ops",
+        "//tensorflow/python:common_shapes",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework",
@@ -131,13 +99,16 @@ tf_custom_op_py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:standard_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
         "@six_archive//:six",
     ],
 )
@@ -312,22 +283,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "bucketization_op_test",
-    size = "small",
-    srcs = ["python/kernel_tests/bucketization_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":layers_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_test(
     name = "sparse_feature_cross_op_test",
     size = "medium",
@@ -349,6 +304,7 @@ py_test(
 py_test(
     name = "embedding_ops_test",
     size = "small",
+    timeout = "moderate",
     srcs = ["python/layers/embedding_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/contrib/layers/README.md b/tensorflow/contrib/layers/README.md
index 7b374172f5c036fc99e37b05401c305b08814105..9310b194dff99529be6f1d5875bf242e810ef041 100644
--- a/tensorflow/contrib/layers/README.md
+++ b/tensorflow/contrib/layers/README.md
@@ -18,17 +18,14 @@ these arguments.
      …,
      weight_init=<DEFAULT>,
      bias_init=<DEFAULT>,
-     weight_collections=(tf.GraphKeys.WEIGHTS,),
-     bias_collections=(tf.GraphKeys.BIASES,),
-     output_collections=(tf.GraphKeys.ACTIVATIONS,),
      weight_regularizer=None,
      bias_regularizer=None,
      name=None) : Tensor`
 
 `x` is the input tensor.
 
-Weights, biases, and activations (i.e., outputs) are, by default, added to the specified collections. Weights and biases are also added to
-`tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.TRAINABLE_VARIABLES`.
+Weights and biases are added to `tf.GraphKeys.GLOBAL_VARIABLES` and
+`tf.GraphKeys.TRAINABLE_VARIABLES`.
 
 ## optimizers.py
 
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 8e2516dcd579459d3a801a35b68a689526f59781..299a8867099f1805077914c74452fa4545884660 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -24,6 +24,7 @@ See the @{$python/contrib.layers} guide.
 @@conv2d_transpose
 @@convolution2d_transpose
 @@dropout
+@@elu
 @@embedding_lookup_unique
 @@flatten
 @@fully_connected
@@ -99,6 +100,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = ['bias_add',
                     'conv2d',
+                    'elu',
                     'feature_column',
                     'legacy_fully_connected',
                     'legacy_linear',
diff --git a/tensorflow/contrib/layers/kernels/BUILD b/tensorflow/contrib/layers/kernels/BUILD
index 7a2d6d8c4f714891e875db89b1cc770aa0b6b4db..15b984f93893b9da3a202129b7532c37338fb4d4 100644
--- a/tensorflow/contrib/layers/kernels/BUILD
+++ b/tensorflow/contrib/layers/kernels/BUILD
@@ -7,17 +7,6 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
-cc_library(
-    name = "bucketization_kernel",
-    srcs = ["bucketization_kernel.cc"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
-        "@protobuf//:protobuf_headers",
-    ],
-    alwayslink = 1,
-)
-
 cc_library(
     name = "sparse_feature_cross_kernel",
     srcs = ["sparse_feature_cross_kernel.cc"],
diff --git a/tensorflow/contrib/layers/ops/bucketization_op.cc b/tensorflow/contrib/layers/ops/bucketization_op.cc
deleted file mode 100644
index d90d47a1eb4ecc2c0ba49506ead955eaf40fa81c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/layers/ops/bucketization_op.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-
-namespace tensorflow {
-
-REGISTER_OP("Bucketize")
-    .Input("input: T")
-    .Output("output: int32")
-    .Attr("T: {int32, int64, float, double}")
-    .Attr("boundaries: list(float)")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Bucketizes 'input' based on 'boundaries'.
-
-For example, if the inputs are
-    boundaries = [0, 10, 100]
-    input = [[-5, 10000]
-             [150,   10]
-             [5,    100]]
-
-then the output will be
-    output = [[0, 3]
-              [3, 2]
-              [1, 3]]
-
-input: Any shape of Tensor contains with int or float type.
-boundaries: A sorted list of floats gives the boundary of the buckets.
-output: Same shape with 'input', each value of input replaced with bucket index.
-
-)doc");
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index e42e885364c478f3302953c68cbfa03cc7cb8398..f8f4122d1db4470701cd1d9599add842349943f4 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -22,11 +22,13 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
 from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import embedding_ops
@@ -98,7 +100,13 @@ def safe_embedding_lookup_sparse(embedding_weights,
     logging.warn("The default value of combiner will change from \"mean\" "
                  "to \"sqrtn\" after 2016/11/01.")
     combiner = "mean"
-  if embedding_weights is None or len(embedding_weights) < 1:
+  if embedding_weights is None:
+    raise ValueError("Missing embedding_weights %s." % embedding_weights)
+  if isinstance(embedding_weights, variables.PartitionedVariable):
+    embedding_weights = list(embedding_weights)  # get underlying Variables.
+  if not isinstance(embedding_weights, list):
+    embedding_weights = [embedding_weights]
+  if len(embedding_weights) < 1:
     raise ValueError("Missing embedding_weights %s." % embedding_weights)
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
@@ -349,7 +357,7 @@ def _sampled_scattered_embedding_lookup(
     # No need to validate the indices since we have checked the params
     # dimensions and we know the largest id.
     result = embedding_ops.embedding_lookup(
-        params, ids, partition_strategy="div", validate_indices=False)
+        params, ids, partition_strategy="div")
 
     return array_ops.reshape(result,
                              array_ops.concat([values_shape, [dimension]], 0))
@@ -555,8 +563,13 @@ def _sampled_scattered_embedding_lookup_sparse(params,
                                          name=name_scope)
 
 
-def embedding_lookup_sparse_with_distributed_aggregation(params, sp_ids,
-    sp_weights, partition_strategy="mod", name=None, combiner=None,
+def embedding_lookup_sparse_with_distributed_aggregation(
+    params,
+    sp_ids,
+    sp_weights,
+    partition_strategy="mod",
+    name=None,
+    combiner=None,
     max_norm=None):
   """Computes embeddings for the given ids and weights.
 
@@ -638,8 +651,13 @@ def embedding_lookup_sparse_with_distributed_aggregation(params, sp_ids,
 
     weights = None if ignore_weights else sp_weights.values
     embeddings = _embedding_lookup_with_distributed_aggregation(
-        params, ids, partition_strategy=partition_strategy, max_norm=max_norm,
-        weights=weights, idx=idx, segment_ids=segment_ids)
+        params,
+        ids,
+        partition_strategy=partition_strategy,
+        max_norm=max_norm,
+        weights=weights,
+        idx=idx,
+        segment_ids=segment_ids)
     # Set weights to all one if ignore weights.
     if ignore_weights:
       weights = array_ops.fill([array_ops.shape(segment_ids)[0]], 1)
@@ -648,13 +666,13 @@ def embedding_lookup_sparse_with_distributed_aggregation(params, sp_ids,
     # Reshape weights.
     ones = array_ops.fill(
         array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
-    bcast_weights_shape = array_ops.concat([array_ops.shape(weights), ones],
-                                           0)
+    bcast_weights_shape = array_ops.concat([array_ops.shape(weights), ones], 0)
     orig_weights_shape = weights.get_shape()
     weights = array_ops.reshape(weights, bcast_weights_shape)
     if embeddings.get_shape().ndims is not None:
-      weights.set_shape(orig_weights_shape.concatenate(
-          [1 for _ in range(embeddings.get_shape().ndims - 1)]))
+      weights.set_shape(
+          orig_weights_shape.concatenate(
+              [1 for _ in range(embeddings.get_shape().ndims - 1)]))
 
     if combiner == "mean":
       weight_sum = math_ops.segment_sum(weights, segment_ids)
@@ -669,24 +687,29 @@ def embedding_lookup_sparse_with_distributed_aggregation(params, sp_ids,
     return embeddings
 
 
-def _do_gather(params, ids, validate_indices=True, name=None):
+def _do_gather(params, ids, name=None):
   """Deals with doing gather differently for resource variables."""
   if isinstance(params, resource_variable_ops.ResourceVariable):
     return params.sparse_read(ids, name=name)
-  return array_ops.gather(
-      params, ids, name=name, validate_indices=validate_indices)
-
-
-def _embedding_lookup_with_distributed_aggregation(params, ids,
-    partition_strategy="mod", name=None, validate_indices=True, max_norm=None,
-    weights=None, idx=None, segment_ids=None):
-  """ Lookup helper for embedding_lookup_sparse_with_distributed_aggregation."""
+  return array_ops.gather(params, ids, name=name)
+
+
+def _embedding_lookup_with_distributed_aggregation(params,
+                                                   ids,
+                                                   partition_strategy="mod",
+                                                   name=None,
+                                                   max_norm=None,
+                                                   weights=None,
+                                                   idx=None,
+                                                   segment_ids=None):
+  """Lookup helper for embedding_lookup_sparse_with_distributed_aggregation."""
   if params is None or params == []:  # pylint: disable=g-explicit-bool-comparison
     raise ValueError("Need at least one param")
   if isinstance(params, variables.PartitionedVariable):
     params = list(params)  # Iterate to get the underlying Variables.
   if not isinstance(params, list):
     params = [params]
+
   def maybe_normalize(x):
     if max_norm is not None:
       if x.get_shape().ndims is not None:
@@ -695,18 +718,17 @@ def _embedding_lookup_with_distributed_aggregation(params, ids,
         ndims = array_ops.size(array_ops.shape(x))
       return clip_ops.clip_by_norm(x, max_norm, axes=list(range(1, ndims)))
     return x
+
   with ops.name_scope(name, "embedding_lookup_with_distributed_aggregation",
-      params + [ids]) as name:
+                      params + [ids]) as name:
     np = len(params)  # Number of partitions
     # Preserve the resource variable status to avoid accidental dense reads.
-    if not any(isinstance(p, resource_variable_ops.ResourceVariable)
-               for p in params):
+    if not any(
+        isinstance(p, resource_variable_ops.ResourceVariable) for p in params):
       params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
     if np == 1:
       with ops.colocate_with(params[0]):
-        ret = maybe_normalize(
-            _do_gather(
-                params[0], ids, validate_indices=validate_indices))
+        ret = maybe_normalize(_do_gather(params[0], ids))
         ignore_weights = weights is None
         if not ignore_weights:
           if weights.dtype != ret.dtype:
@@ -720,8 +742,9 @@ def _embedding_lookup_with_distributed_aggregation(params, ids,
           weights = array_ops.reshape(weights, bcast_weights_shape)
           # Set weights shape after reshape
           if ret.get_shape().ndims is not None:
-            weights.set_shape(orig_weights_shape.concatenate(
-                [1 for _ in range(ret.get_shape().ndims - 1)]))
+            weights.set_shape(
+                orig_weights_shape.concatenate(
+                    [1 for _ in range(ret.get_shape().ndims - 1)]))
           ret *= weights
           return math_ops.segment_sum(ret, segment_ids, name=name)
         else:
@@ -757,18 +780,16 @@ def _embedding_lookup_with_distributed_aggregation(params, ids,
         ids_per_partition = num_total_ids // np
         extras = num_total_ids % np
 
-        p_assignments = math_ops.maximum(
-            flat_ids // (ids_per_partition + 1),
-            (flat_ids - extras) // ids_per_partition)
+        p_assignments = math_ops.maximum(flat_ids // (ids_per_partition + 1), (
+            flat_ids - extras) // ids_per_partition)
 
         # Emulate a conditional using a boolean indicator tensor
-        is_in_first_extras_partitions = math_ops.cast(
-            p_assignments < extras, flat_ids.dtype)
-        new_ids = (
-            is_in_first_extras_partitions * (
-                flat_ids % (ids_per_partition + 1)) +
-            (1 - is_in_first_extras_partitions) * (
-                (flat_ids - extras) % ids_per_partition))
+        is_in_first_extras_partitions = math_ops.cast(p_assignments < extras,
+                                                      flat_ids.dtype)
+        new_ids = (is_in_first_extras_partitions * (flat_ids %
+                                                    (ids_per_partition + 1)) +
+                   (1 - is_in_first_extras_partitions) * (
+                       (flat_ids - extras) % ids_per_partition))
       else:
         raise ValueError("Unrecognized partition strategy: " +
                          partition_strategy)
@@ -785,9 +806,7 @@ def _embedding_lookup_with_distributed_aggregation(params, ids,
       partitioned_result = []
       for p in xrange(np):
         with ops.colocate_with(params[p]):
-          partitioned_result.append(
-              _do_gather(params[p], gather_ids[p],
-                         validate_indices=validate_indices))
+          partitioned_result.append(_do_gather(params[p], gather_ids[p]))
 
       ignore_weights = weights is None
       if not ignore_weights:
@@ -802,17 +821,21 @@ def _embedding_lookup_with_distributed_aggregation(params, ids,
       if element_shape.is_fully_defined():
         for p in xrange(np):
           with ops.colocate_with(params[p]):
-            partitioned_result[p] = array_ops.reshape(partitioned_result[p],
-                array_ops.concat(
-                    [array_ops.shape(pindices[p]), element_shape], 0))
+            partitioned_result[p] = array_ops.reshape(
+                partitioned_result[p],
+                array_ops.concat([array_ops.shape(pindices[p]), element_shape],
+                                 0))
       else:
         with ops.colocate_with(params[0]):
           params_shape = array_ops.shape(params[0])
         for p in xrange(np):
           with ops.colocate_with(params[p]):
-            partitioned_result[p] = array_ops.reshape(partitioned_result[p],
-                array_ops.concat([array_ops.shape(pindices[p]),
-                    array_ops.slice(params_shape, [1], [-1])], 0))
+            partitioned_result[p] = array_ops.reshape(
+                partitioned_result[p],
+                array_ops.concat([
+                    array_ops.shape(pindices[p]), array_ops.slice(
+                        params_shape, [1], [-1])
+                ], 0))
       # Normalize each partition result.
       for p in xrange(np):
         with ops.colocate_with(params[p]):
@@ -823,7 +846,7 @@ def _embedding_lookup_with_distributed_aggregation(params, ids,
           with ops.colocate_with(params[p]):
             if partitioned_weight[p].dtype != partitioned_result[p].dtype:
               partitioned_weight[p] = math_ops.cast(partitioned_weight[p],
-                  partitioned_result[p].dtype)
+                                                    partitioned_result[p].dtype)
             # Reshape partition weights.
             ones = array_ops.fill(
                 array_ops.expand_dims(
@@ -834,9 +857,12 @@ def _embedding_lookup_with_distributed_aggregation(params, ids,
             partitioned_weight[p] = array_ops.reshape(partitioned_weight[p],
                                                       bcast_weights_shape)
             if partitioned_result[p].get_shape().ndims is not None:
-              partitioned_weight[p].set_shape(orig_weights_shape.concatenate(
-                  [1 for _ in range(
-                      partitioned_result[p].get_shape().ndims - 1)]))
+              partitioned_weight[p].set_shape(
+                  orig_weights_shape.concatenate([
+                      1
+                      for _ in range(partitioned_result[p].get_shape().ndims -
+                                     1)
+                  ]))
             partitioned_result[p] *= partitioned_weight[p]
       partitioned_segment_ids = []
       for p in xrange(np):
@@ -874,5 +900,7 @@ def _embedding_lookup_with_distributed_aggregation(params, ids,
       concat_segment_ids = array_ops.concat(partitioned_segment_ids, 0)
       concat_partitioned_result = array_ops.concat(partitioned_result, 0)
       return math_ops.unsorted_segment_sum(
-          concat_partitioned_result, concat_segment_ids,
-          math_ops.reduce_max(concat_segment_ids) + 1, name=name)
+          concat_partitioned_result,
+          concat_segment_ids,
+          math_ops.reduce_max(concat_segment_ids) + 1,
+          name=name)
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
index eb38d70c52c5599ea4c4ec36ba33155259effaad..bf2514498202e9227c2d74c036c7eecba5ccdf2c 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
@@ -31,8 +31,9 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
-from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.platform import test
@@ -145,8 +146,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       self.assertAllClose(
           embedding_lookup_result,
           [(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4,
-           [0] * 4, embedding_weights[0][2],
-           (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
+           [0] * 4, embedding_weights[0][2], (
+               embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
 
   def test_safe_embedding_lookup_sparse_partitioned(self):
     with self.test_session():
@@ -171,8 +172,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
                         embedding_weights, sparse_ids)
       embedding_weights = [
-          constant_op.constant(
-              w, dtype=dtypes.float64) for w in embedding_weights
+          constant_op.constant(w, dtype=dtypes.float64)
+          for w in embedding_weights
       ]
       self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
                         embedding_weights, sparse_ids, sparse_weights)
@@ -185,11 +186,10 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
           embedding_weights, sparse_ids, sparse_weights).eval())
 
-      self.assertAllClose(
-          embedding_lookup_result,
-          [[(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) /
-            3.0, [0] * 4, [0] * 4],
-           [embedding_weights[0][2], [0] * 4, [0] * 4]])
+      self.assertAllClose(embedding_lookup_result, [[
+          (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0,
+          [0] * 4, [0] * 4
+      ], [embedding_weights[0][2], [0] * 4, [0] * 4]])
 
   def test_safe_embedding_lookup_sparse_3d_return_special_vector(self):
     with self.test_session():
@@ -215,14 +215,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
           embedding_weights, sparse_ids, None).eval())
 
-      self.assertAllClose(
-          embedding_lookup_result,
-          [[(embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4,
-            [0] * 4], [
-                embedding_weights[0][2],
-                (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0,
-                [0] * 4
-            ]])
+      self.assertAllClose(embedding_lookup_result, [[(
+          embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [
+              0
+          ] * 4], [
+              embedding_weights[0][2],
+              (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4
+          ]])
 
   def test_safe_embedding_lookup_sparse_3d_partitioned(self):
     with self.test_session():
@@ -233,13 +232,12 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
           embedding_weights, sparse_ids, None).eval())
 
       embedding_weights = list(itertools.chain(*embedding_weights))
-      self.assertAllClose(embedding_lookup_result,
-                          [[(embedding_weights[0] + embedding_weights[1]) / 2.0,
-                            [0] * 4, [0] * 4], [
-                                embedding_weights[2],
-                                (embedding_weights[0] + embedding_weights[1]) /
-                                2.0, [0] * 4
-                            ]])
+      self.assertAllClose(embedding_lookup_result, [[
+          (embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4, [0] * 4
+      ], [
+          embedding_weights[2],
+          (embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4
+      ]])
 
   def test_safe_embedding_lookup_sparse_3d_partitioned_inconsistent_weights(
       self):
@@ -251,8 +249,8 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
                         embedding_weights, sparse_ids)
       embedding_weights = [
-          constant_op.constant(
-              w, dtype=dtypes.float64) for w in embedding_weights
+          constant_op.constant(w, dtype=dtypes.float64)
+          for w in embedding_weights
       ]
       self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
                         embedding_weights, sparse_ids, sparse_weights)
@@ -301,8 +299,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
       self.assertAllEqual(embedding_lookup_result[0],
                           embedding_lookup_result[1])
       # Different embedding expected for different value.
-      embedding_diff = np.min((embedding_lookup_result[2] -
-                               embedding_lookup_result[0])**2)
+      embedding_diff = np.min(
+          (embedding_lookup_result[2] - embedding_lookup_result[0])**2)
       self.assertGreater(embedding_diff, 0)
 
   def test_scattered_embedding_coverage(self):
@@ -320,8 +318,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
   def test_scattered_embedding_multi_dimension(self):
     with self.test_session():
       embedding_weights = self._random_weights()
-      values = constant_op.constant(
-          [["foo", "bar", "bar"], ["bar", "bar", "foo"]])
+      values = constant_op.constant([["foo", "bar", "bar"],
+                                     ["bar", "bar", "foo"]])
 
       embedding_lookup_result = embedding_ops.scattered_embedding_lookup(
           embedding_weights, values, dimension=10).eval()
@@ -340,8 +338,8 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
 
       embedding_lookup_result = (
           embedding_ops.scattered_embedding_lookup_sparse(
-              embedding_weights, sparse_tensor, dimension=5, combiner="mean")
-          .eval())
+              embedding_weights, sparse_tensor, dimension=5,
+              combiner="mean").eval())
 
       self.assertAllEqual(embedding_lookup_result.shape, [5, 5])
       # Same non-zero embedding for the empty rows filled with a default value.
@@ -433,8 +431,8 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase):
   def test_hashed_embedding_multi_dimension(self):
     with self.test_session():
       embedding_weights = self._random_weights()
-      values = constant_op.constant(
-          [["foo", "bar", "bar"], ["bar", "bar", "foo"]])
+      values = constant_op.constant([["foo", "bar", "bar"],
+                                     ["bar", "bar", "foo"]])
       sampled_candidates = constant_op.constant(
           [[[1, 3, 4, 6], [1, 7, 8, 9], [1, 7, 8, 9]],
            [[1, 7, 8, 9], [1, 7, 8, 9], [1, 3, 4, 6]]])
@@ -491,8 +489,8 @@ class SampledScatteredEmbeddingLookupSparseTest(test.TestCase):
       result = embedding_ops._sampled_scattered_embedding_lookup_sparse(
           params, sp_values, dimension=5, hash_key=self._hash_key)
 
-      self.assertAllClose(result.eval(), [[0., 0., 0., 0., 0.],
-                                          [.3, .2, .2, .3, .1],
+      self.assertAllClose(result.eval(), [[0., 0., 0., 0.,
+                                           0.], [.3, .2, .2, .3, .1],
                                           [0., 0., 0., 0., 0.]])
 
   def test_output_values_with_sampled_candidates(self):
@@ -631,8 +629,8 @@ def _EmbeddingResult(params,
         else:
           partition = extras + (i - threshold) // ids_per_partition
           offset = (i - threshold) % ids_per_partition
-        val = np.copy(params[_PName(partition) + ":0"][
-            offset, :]) * weight_value
+        val = np.copy(
+            params[_PName(partition) + ":0"][offset, :]) * weight_value
       else:
         assert False
       if value_aggregation is None:
@@ -707,19 +705,19 @@ class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
     grouped_ignored_weights = self._GroupByBatchEntry(
         np.ones(np.sum(vals_per_batch_entry)), vals_per_batch_entry)
 
-    for num_shards, combiner, dtype, ignore_weights in itertools.product([1, 5],
-        ["sum", "mean", "sqrtn"], [dtypes.float32, dtypes.float64],
-        [True, False]):
+    for num_shards, combiner, dtype, ignore_weights in itertools.product(
+        [1, 5], ["sum", "mean", "sqrtn"], [dtypes.float32,
+                                           dtypes.float64], [True, False]):
 
       with self.test_session():
         p, params, feed_dict = _EmbeddingParams(
             num_shards, vocab_size, shape=param_shape, dtype=dtype)
         embedding_sum = \
             embedding_ops.embedding_lookup_sparse_with_distributed_aggregation(
-            p,
-            sp_ids,
-            None if ignore_weights else sp_weights,
-            combiner=combiner)
+                p,
+                sp_ids,
+                None if ignore_weights else sp_weights,
+                combiner=combiner)
 
         self.assertEqual(embedding_sum.get_shape().as_list(),
                          expected_lookup_result_shape)
@@ -731,8 +729,8 @@ class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
             grouped_ids,
             num_shards,
             vocab_size,
-            weight_vals=grouped_ignored_weights if ignore_weights else
-            grouped_weights)
+            weight_vals=grouped_ignored_weights
+            if ignore_weights else grouped_weights)
         if combiner == "mean":
           np_embedding_sum /= np.reshape(np_weight_sum, (batch_size, 1, 1))
         if combiner == "sqrtn":
@@ -744,12 +742,12 @@ class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
     vocab_size = 12
     batch_size = 4
     param_shape = [2, 3]
-    sp_ids, sp_weights, _, _, _ = (
-        self._RandomIdsAndWeights(batch_size, vocab_size))
+    sp_ids, sp_weights, _, _, _ = (self._RandomIdsAndWeights(
+        batch_size, vocab_size))
 
-    for num_shards, combiner, dtype, ignore_weights in itertools.product([1, 3],
-        ["sum", "mean", "sqrtn"], [dtypes.float32, dtypes.float64],
-        [True, False]):
+    for num_shards, combiner, dtype, ignore_weights in itertools.product(
+        [1, 3], ["sum", "mean", "sqrtn"], [dtypes.float32,
+                                           dtypes.float64], [True, False]):
       with self.test_session():
         x, params, _ = _EmbeddingParams(
             num_shards, vocab_size, shape=param_shape, dtype=dtype)
diff --git a/tensorflow/contrib/layers/python/layers/encoders.py b/tensorflow/contrib/layers/python/layers/encoders.py
index 8b6abb4b456fa92a4f1b3f293a123d5397fc6f68..89c9d37bd09cb6c43eebb91f3a16600eae9cb490 100644
--- a/tensorflow/contrib/layers/python/layers/encoders.py
+++ b/tensorflow/contrib/layers/python/layers/encoders.py
@@ -121,7 +121,7 @@ def embed_sequence(ids,
     `Tensor` of `[batch_size, doc_length, embed_dim]` with embedded sequences.
 
   Raises:
-    ValueError: if `embed_dim` or `vocab_size` are not specified when not
+    ValueError: if `embed_dim` or `vocab_size` are not specified when
       `reuse` is `None` or `False`.
   """
   if not (reuse or (vocab_size and embed_dim)):
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 282c556424ed5bc91153212c8260b13340d7c63e..e1a27335abe34041fbad3d59f0f336e088a10319 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -39,9 +39,13 @@ should choose depends on (1) the feature type and (2) the model type.
      age_column = real_valued_column("age")
 
    To feed sparse features into DNN models, wrap the column with
-   `embedding_column` or `one_hot_column`. `one_hot_column` is recommended for
-   features with only a few possible values. For features with many possible
-   values, `embedding_column` is recommended.
+   `embedding_column` or `one_hot_column`. `one_hot_column` will create a dense
+   boolean tensor with an entry for each possible value, and thus the
+   computation cost is linear in the number of possible values versus the number
+   of values that occur in the sparse tensor. Thus using a "one_hot_column" is
+   only recommended for features with only a few possible values. For features
+   with many possible values or for very sparse features, `embedding_column` is
+   recommended.
 
      embedded_dept_column = embedding_column(
        sparse_column_with_keys("department", ["math", "philosphy", ...]),
@@ -49,7 +53,9 @@ should choose depends on (1) the feature type and (2) the model type.
 
 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
 
-   Sparse features can be fed directly into linear models.
+   Sparse features can be fed directly into linear models. When doing so
+   an embedding_lookups are used to efficiently perform the sparse matrix
+   multiplication.
 
      dept_column = sparse_column_with_keys("department",
        ["math", "philosophy", "english"])
@@ -125,18 +131,27 @@ import math
 import six
 
 from tensorflow.contrib import lookup
+from tensorflow.contrib.framework.python.framework import checkpoint_utils
+from tensorflow.contrib.framework.python.framework import experimental
+from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.contrib.layers.python.layers import embedding_ops
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.layers.python.ops import bucketization_op
 from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
 from tensorflow.contrib.layers.python.ops import sparse_ops as contrib_sparse_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_py
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
 
@@ -282,11 +297,13 @@ class _FeatureColumn(object):
 
 
 # TODO(b/30410315): Support warm starting in all feature columns.
-class _SparseColumn(_FeatureColumn,
-                    collections.namedtuple("_SparseColumn",
-                                           ["column_name", "is_integerized",
-                                            "bucket_size", "lookup_config",
-                                            "combiner", "dtype"])):
+class _SparseColumn(
+    _FeatureColumn,
+    fc_core._CategoricalColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_SparseColumn", [
+        "column_name", "is_integerized", "bucket_size", "lookup_config",
+        "combiner", "dtype"
+    ])):
   """Represents a sparse feature column also known as categorical features.
 
   Instances of this class are immutable. A sparse column means features are
@@ -310,7 +327,7 @@ class _SparseColumn(_FeatureColumn,
         * "mean": do l1 normalization on features in the column
         * "sqrtn": do l2 normalization on features in the column
       For more information: `tf.embedding_lookup_sparse`.
-    dtype: Type of features, such as `tf.string` or `tf.int64`.
+    dtype: Type of features, either `tf.string` or `tf.int64`.
 
   Raises:
     TypeError: if lookup_config is not a _SparseIdLookupConfig.
@@ -417,9 +434,8 @@ class _SparseColumn(_FeatureColumn,
         initializer=init_ops.zeros_initializer(),
         combiner=self.combiner)
 
-  def _get_input_sparse_tensor(self, columns_to_tensors):
-    """Looks up the input tensor for transformation and sparsify it if dense."""
-    input_tensor = columns_to_tensors[self.name]
+  def _get_input_sparse_tensor(self, input_tensor):
+    """sparsify input_tensor if dense."""
     if not isinstance(input_tensor, sparse_tensor_py.SparseTensor):
       # To avoid making any assumptions about which values are to be ignored,
       # we set ignore_value to -1 for numeric tensors to avoid excluding valid
@@ -435,7 +451,7 @@ class _SparseColumn(_FeatureColumn,
     return input_tensor
 
   def is_compatible(self, other_column):
-    """Check compatability of two sparse columns."""
+    """Check compatibility of two sparse columns."""
     if self.lookup_config and other_column.lookup_config:
       return self.lookup_config == other_column.lookup_config
     compatible = (self.length == other_column.length and
@@ -446,18 +462,44 @@ class _SparseColumn(_FeatureColumn,
                    format(self.name, other_column.name))
     return compatible
 
-
-class _SparseColumnIntegerized(_SparseColumn):
-  """See `sparse_column_with_integerized_feature`."""
+  @abc.abstractmethod
+  def _do_transform(self, input_tensor):
+    pass
 
   def insert_transformed_feature(self, columns_to_tensors):
     """Handles sparse column to id conversion."""
-    input_tensor = self._get_input_sparse_tensor(columns_to_tensors)
+    input_tensor = self._get_input_sparse_tensor(columns_to_tensors[self.name])
+    columns_to_tensors[self] = self._do_transform(input_tensor)
+
+  def _transform_feature(self, inputs):
+    input_tensor = self._get_input_sparse_tensor(inputs.get(self.name))
+    return self._do_transform(input_tensor)
+
+  @property
+  def _parse_example_config(self):
+    return self.config
 
+  @property
+  def _num_buckets(self):
+    return self.length
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    input_tensor = inputs.get(self)
+    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
+        self.id_tensor(input_tensor), self.weight_tensor(input_tensor))
+
+
+class _SparseColumnIntegerized(_SparseColumn):
+  """See `sparse_column_with_integerized_feature`."""
+
+  def _do_transform(self, input_tensor):
     sparse_id_values = math_ops.mod(input_tensor.values, self.bucket_size,
                                     name="mod")
-    columns_to_tensors[self] = sparse_tensor_py.SparseTensor(
-        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
+    return sparse_tensor_py.SparseTensor(input_tensor.indices, sparse_id_values,
+                                         input_tensor.dense_shape)
 
 
 def sparse_column_with_integerized_feature(column_name,
@@ -508,10 +550,7 @@ def sparse_column_with_integerized_feature(column_name,
 class _SparseColumnHashed(_SparseColumn):
   """See `sparse_column_with_hash_bucket`."""
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    """Handles sparse column to id conversion."""
-    input_tensor = self._get_input_sparse_tensor(columns_to_tensors)
-
+  def _do_transform(self, input_tensor):
     if self.dtype.is_integer:
       sparse_values = string_ops.as_string(input_tensor.values)
     else:
@@ -519,8 +558,8 @@ class _SparseColumnHashed(_SparseColumn):
 
     sparse_id_values = string_ops.string_to_hash_bucket_fast(
         sparse_values, self.bucket_size, name="lookup")
-    columns_to_tensors[self] = sparse_tensor_py.SparseTensor(
-        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
+    return sparse_tensor_py.SparseTensor(input_tensor.indices, sparse_id_values,
+                                         input_tensor.dense_shape)
 
 
 def sparse_column_with_hash_bucket(column_name,
@@ -563,16 +602,13 @@ def sparse_column_with_hash_bucket(column_name,
 class _SparseColumnKeys(_SparseColumn):
   """See `sparse_column_with_keys`."""
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    """Handles sparse column to id conversion."""
-    input_tensor = self._get_input_sparse_tensor(columns_to_tensors)
-
+  def _do_transform(self, input_tensor):
     table = lookup.index_table_from_tensor(
         mapping=tuple(self.lookup_config.keys),
         default_value=self.lookup_config.default_value,
         dtype=self.dtype,
         name="lookup")
-    columns_to_tensors[self] = table.lookup(input_tensor)
+    return table.lookup(input_tensor)
 
 
 def sparse_column_with_keys(
@@ -612,9 +648,7 @@ def sparse_column_with_keys(
 class _SparseColumnVocabulary(_SparseColumn):
   """See `sparse_column_with_vocabulary_file`."""
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    """Handles sparse column to id conversion."""
-    st = self._get_input_sparse_tensor(columns_to_tensors)
+  def _do_transform(self, st):
     if self.dtype.is_integer:
       sparse_string_values = string_ops.as_string(st.values)
       sparse_string_tensor = sparse_tensor_py.SparseTensor(st.indices,
@@ -623,13 +657,13 @@ class _SparseColumnVocabulary(_SparseColumn):
     else:
       sparse_string_tensor = st
 
-    table = lookup.string_to_index_table_from_file(
+    table = lookup.index_table_from_file(
         vocabulary_file=self.lookup_config.vocabulary_file,
         num_oov_buckets=self.lookup_config.num_oov_buckets,
         vocab_size=self.lookup_config.vocab_size,
         default_value=self.lookup_config.default_value,
         name=self.name + "_lookup")
-    columns_to_tensors[self] = table.lookup(sparse_string_tensor)
+    return table.lookup(sparse_string_tensor)
 
 
 def sparse_column_with_vocabulary_file(column_name,
@@ -685,9 +719,12 @@ def sparse_column_with_vocabulary_file(column_name,
       dtype=dtype)
 
 
-class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple(
-    "_WeightedSparseColumn",
-    ["sparse_id_column", "weight_column_name", "dtype"])):
+class _WeightedSparseColumn(
+    _FeatureColumn,
+    fc_core._CategoricalColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_WeightedSparseColumn",
+                           ["sparse_id_column", "weight_column_name",
+                            "dtype"])):
   """See `weighted_sparse_column`."""
 
   def __new__(cls, sparse_id_column, weight_column_name, dtype):
@@ -716,22 +753,6 @@ class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple(
     """Returns a string which will be used as a key when we do sorting."""
     return "{}".format(self)
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    """Inserts a tuple with the id and weight tensors."""
-    if self.sparse_id_column not in columns_to_tensors:
-      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
-
-    weight_tensor = columns_to_tensors[self.weight_column_name]
-    if not isinstance(weight_tensor, sparse_tensor_py.SparseTensor):
-      # The weight tensor can be a regular Tensor. In such case, sparsify it.
-      weight_tensor = contrib_sparse_ops.dense_to_sparse_tensor(weight_tensor)
-    if not self.dtype.is_floating:
-      weight_tensor = math_ops.to_float(weight_tensor)
-    columns_to_tensors[self] = tuple([
-        columns_to_tensors[self.sparse_id_column],
-        weight_tensor
-    ])
-
   def id_tensor(self, input_tensor):
     """Returns the id tensor from the given transformed input_tensor."""
     return input_tensor[0]
@@ -759,6 +780,43 @@ class _WeightedSparseColumn(_FeatureColumn, collections.namedtuple(
         initializer=init_ops.zeros_initializer(),
         combiner=self.sparse_id_column.combiner)
 
+  def _do_transform(self, id_tensor, weight_tensor):
+    if not isinstance(weight_tensor, sparse_tensor_py.SparseTensor):
+      # The weight tensor can be a regular Tensor. In such case, sparsify it.
+      weight_tensor = contrib_sparse_ops.dense_to_sparse_tensor(weight_tensor)
+    if not self.dtype.is_floating:
+      weight_tensor = math_ops.to_float(weight_tensor)
+    return tuple([id_tensor, weight_tensor])
+
+  def insert_transformed_feature(self, columns_to_tensors):
+    """Inserts a tuple with the id and weight tensors."""
+    if self.sparse_id_column not in columns_to_tensors:
+      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
+
+    weight_tensor = columns_to_tensors[self.weight_column_name]
+    columns_to_tensors[self] = self._do_transform(
+        columns_to_tensors[self.sparse_id_column], weight_tensor)
+
+  def _transform_feature(self, inputs):
+    return self._do_transform(
+        inputs.get(self.sparse_id_column), inputs.get(self.weight_column_name))
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+  @property
+  def _num_buckets(self):
+    return self.length
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    input_tensor = inputs.get(self)
+    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
+        self.id_tensor(input_tensor), self.weight_tensor(input_tensor))
+
 
 def weighted_sparse_column(sparse_id_column,
                            weight_column_name,
@@ -791,9 +849,11 @@ def weighted_sparse_column(sparse_id_column,
       weight or value of the corresponding sparse id feature.
     dtype: Type of weights, such as `tf.float32`. Only floating and integer
       weights are supported.
+
   Returns:
     A _WeightedSparseColumn composed of two sparse features: one represents id,
     the other represents weight (value) of the id feature in that example.
+
   Raises:
     ValueError: if dtype is not convertible to float.
   """
@@ -804,9 +864,10 @@ def weighted_sparse_column(sparse_id_column,
   return _WeightedSparseColumn(sparse_id_column, weight_column_name, dtype)
 
 
-class _OneHotColumn(_FeatureColumn,
-                    collections.namedtuple("_OneHotColumn",
-                                           ["sparse_id_column"])):
+class _OneHotColumn(
+    _FeatureColumn,
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_OneHotColumn", ["sparse_id_column"])):
   """Represents a one-hot column for use in deep networks.
 
   Args:
@@ -854,7 +915,7 @@ class _OneHotColumn(_FeatureColumn,
       output_rank: the desired rank of the output `Tensor`.
 
     Returns:
-      A multihot Tensor to be fed into the first layer of neural network.
+      A multi-hot Tensor to be fed into the first layer of neural network.
 
     Raises:
       ValueError: When using one_hot_column with weighted_sparse_column.
@@ -886,12 +947,31 @@ class _OneHotColumn(_FeatureColumn,
     return math_ops.reduce_sum(
         one_hot_id_tensor, reduction_indices=[output_rank - 1])
 
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([self.length])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    return inputs.get(self)
+
+  def _transform_feature(self, inputs):
+    return self._to_dnn_input_layer(inputs.get(self.sparse_id_column))
 
-class _EmbeddingColumn(_FeatureColumn, collections.namedtuple(
-    "_EmbeddingColumn",
-    ["sparse_id_column", "dimension", "combiner", "initializer",
-     "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
-     "shared_vocab_size", "max_norm", "trainable"])):
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+
+class _EmbeddingColumn(
+    _FeatureColumn,
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_EmbeddingColumn", [
+        "sparse_id_column", "dimension", "combiner", "initializer",
+        "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
+        "shared_vocab_size", "max_norm", "trainable"
+    ])):
   """Represents an embedding column.
 
   Args:
@@ -1016,6 +1096,139 @@ class _EmbeddingColumn(_FeatureColumn, collections.namedtuple(
     raise ValueError("Column {} is not supported in linear models. "
                      "Please use sparse_column.".format(self))
 
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([self.dimension])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    return _embeddings_from_arguments(
+        self,
+        self._deep_embedding_lookup_arguments(inputs.get(self)),
+        weight_collections, trainable)
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.sparse_id_column)
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+
+def _is_variable(v):
+  """Returns true if `v` is a variable."""
+  return isinstance(v, (variables.Variable,
+                        resource_variable_ops.ResourceVariable))
+
+
+def _embeddings_from_arguments(column,
+                               args,
+                               weight_collections,
+                               trainable,
+                               output_rank=2):
+  """Returns embeddings for a column based on the computed arguments.
+
+  Args:
+   column: the column name.
+   args: the _DeepEmbeddingLookupArguments for this column.
+   weight_collections: collections to store weights in.
+   trainable: whether these embeddings should be trainable.
+   output_rank: the desired rank of the returned `Tensor`. Inner dimensions will
+     be combined to produce the desired rank.
+
+  Returns:
+   the embeddings.
+
+  Raises:
+   ValueError: if not possible to create.
+  """
+  # pylint: disable=protected-access
+  input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
+  weight_tensor = None
+  if args.weight_tensor is not None:
+    weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank)
+  # pylint: enable=protected-access
+
+  # This option is only enabled for scattered_embedding_column.
+  if args.hash_key:
+    embeddings = contrib_variables.model_variable(
+        name="weights",
+        shape=[args.vocab_size],
+        dtype=dtypes.float32,
+        initializer=args.initializer,
+        trainable=(trainable and args.trainable),
+        collections=weight_collections)
+
+    return embedding_ops.scattered_embedding_lookup_sparse(
+        embeddings,
+        input_tensor,
+        args.dimension,
+        hash_key=args.hash_key,
+        combiner=args.combiner,
+        name="lookup")
+
+  if args.shared_embedding_name is not None:
+    shared_embedding_collection_name = (
+        "SHARED_EMBEDDING_COLLECTION_" + args.shared_embedding_name.upper())
+    graph = ops.get_default_graph()
+    shared_embedding_collection = (
+        graph.get_collection_ref(shared_embedding_collection_name))
+    shape = [args.vocab_size, args.dimension]
+    if shared_embedding_collection:
+      if len(shared_embedding_collection) > 1:
+        raise ValueError(
+            "Collection %s can only contain one "
+            "(partitioned) variable." % shared_embedding_collection_name)
+      else:
+        embeddings = shared_embedding_collection[0]
+        if embeddings.get_shape() != shape:
+          raise ValueError(
+              "The embedding variable with name {} already "
+              "exists, but its shape does not match required "
+              "embedding shape  here. Please make sure to use "
+              "different shared_embedding_name for different "
+              "shared embeddings.".format(args.shared_embedding_name))
+    else:
+      embeddings = contrib_variables.model_variable(
+          name=args.shared_embedding_name,
+          shape=shape,
+          dtype=dtypes.float32,
+          initializer=args.initializer,
+          trainable=(trainable and args.trainable),
+          collections=weight_collections)
+      graph.add_to_collection(shared_embedding_collection_name, embeddings)
+  else:
+    embeddings = contrib_variables.model_variable(
+        name="weights",
+        shape=[args.vocab_size, args.dimension],
+        dtype=dtypes.float32,
+        initializer=args.initializer,
+        trainable=(trainable and args.trainable),
+        collections=weight_collections)
+
+  if _is_variable(embeddings):
+    embeddings = [embeddings]
+  else:
+    embeddings = embeddings._get_variable_list()  # pylint: disable=protected-access
+  # pylint: disable=protected-access
+  _maybe_restore_from_checkpoint(column._checkpoint_path(), embeddings)
+  return embedding_ops.safe_embedding_lookup_sparse(
+      embeddings,
+      input_tensor,
+      sparse_weights=weight_tensor,
+      combiner=args.combiner,
+      name=column.name + "weights",
+      max_norm=args.max_norm)
+
+
+def _maybe_restore_from_checkpoint(checkpoint_path, variable):
+  if checkpoint_path is not None:
+    path, tensor_name = checkpoint_path
+    weights_to_restore = variable
+    if len(variable) == 1:
+      weights_to_restore = variable[0]
+    checkpoint_utils.init_from_checkpoint(path,
+                                          {tensor_name: weights_to_restore})
+
 
 def one_hot_column(sparse_id_column):
   """Creates an `_OneHotColumn` for a one-hot or multi-hot repr in a DNN.
@@ -1183,10 +1396,11 @@ def shared_embedding_columns(sparse_id_columns,
 
 class _ScatteredEmbeddingColumn(
     _FeatureColumn,
-    collections.namedtuple(
-        "_ScatteredEmbeddingColumn",
-        ["column_name", "size", "dimension", "hash_key", "combiner",
-         "initializer"])):
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_ScatteredEmbeddingColumn", [
+        "column_name", "size", "dimension", "hash_key", "combiner",
+        "initializer"
+    ])):
   """See `scattered_embedding_column`."""
 
   def __new__(cls,
@@ -1239,6 +1453,23 @@ class _ScatteredEmbeddingColumn(
         max_norm=None,
         trainable=True)
 
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([self.dimension])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    return _embeddings_from_arguments(
+        self,
+        self._deep_embedding_lookup_arguments(inputs.get(self)),
+        weight_collections, trainable)
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.column_name)
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
 
 def scattered_embedding_column(column_name,
                                size,
@@ -1359,19 +1590,157 @@ def _reshape_real_valued_tensor(input_tensor, output_rank, column_name=None):
   return layers._inner_flatten(input_tensor, output_rank)  # pylint: disable=protected-access
 
 
-class _RealValuedColumn(_FeatureColumn, collections.namedtuple(
-    "_RealValuedColumn",
-    ["column_name", "dimension", "default_value", "dtype", "normalizer"])):
-  """Represents a real valued feature column also known as continuous features.
+class _RealValuedVarLenColumn(_FeatureColumn, collections.namedtuple(
+    "_RealValuedVarLenColumn",
+    ["column_name", "default_value", "dtype", "normalizer", "is_sparse"])):
+  """Represents a real valued feature column for variable length Features.
 
-  Instances of this class are immutable. A real valued column with a specified
-  dimension means features are dense, otherwise they're sparse.
-  In the dense case, the dictionary returned by InputBuilder contains a
+  Instances of this class are immutable.
+  If is_sparse=False, the dictionary returned by InputBuilder contains a
   ("column_name", Tensor) pair with a Tensor shape of (batch_size, dimension).
-  In the sparse shape, the dictionary contains a ("column_name", SparseTensor)
+  If is_sparse=True, the dictionary contains a ("column_name", SparseTensor)
   pair instead with shape inferred after parsing.
   """
 
+  @property
+  def name(self):
+    return self.column_name
+
+  @property
+  def config(self):
+    if self.is_sparse:
+      return {self.column_name: parsing_ops.VarLenFeature(self.dtype)}
+    else:
+      return {self.column_name: parsing_ops.FixedLenSequenceFeature(
+          [], self.dtype, allow_missing=True,
+          default_value=self.default_value)}
+
+  @property
+  def key(self):
+    """Returns a string which will be used as a key when we do sorting."""
+    return self._key_without_properties(["normalizer"])
+
+  @property
+  def normalizer_fn(self):
+    """Returns the function used to normalize the column."""
+    return self.normalizer
+
+  def _normalized_input_tensor(self, input_tensor):
+    """Returns the input tensor after custom normalization is applied."""
+    if self.normalizer is None:
+      return input_tensor
+    if self.is_sparse:
+      return sparse_tensor_py.SparseTensor(
+          input_tensor.indices,
+          self.normalizer(input_tensor.values),
+          input_tensor.dense_shape)
+    else:
+      return self.normalizer(input_tensor)
+
+  def insert_transformed_feature(self, columns_to_tensors):
+    """Apply transformation and inserts it into columns_to_tensors.
+
+    Args:
+      columns_to_tensors: A mapping from feature columns to tensors. 'string'
+        key means a base feature (not-transformed). It can have _FeatureColumn
+        as a key too. That means that _FeatureColumn is already transformed.
+    """
+    # Transform the input tensor according to the normalizer function.
+    input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name])
+    columns_to_tensors[self] = math_ops.to_float(input_tensor)
+
+  # pylint: disable=unused-argument
+  def _to_dnn_input_layer(self,
+                          input_tensor,
+                          weight_collections=None,
+                          trainable=True,
+                          output_rank=2):
+    return _reshape_real_valued_tensor(
+        self._to_dense_tensor(input_tensor), output_rank, self.name)
+
+  def _to_dense_tensor(self, input_tensor):
+    if not self.is_sparse:
+      return input_tensor
+    raise ValueError("Set is_sparse to False if you want a dense Tensor for "
+                     "column_name: {}".format(self.name))
+
+
+@experimental
+def _real_valued_var_len_column(column_name,
+                                default_value=None,
+                                dtype=dtypes.float32,
+                                normalizer=None,
+                                is_sparse=False):
+  """Creates a `_RealValuedVarLenColumn` for variable-length numeric data.
+
+  Note, this is not integrated with any of the DNNEstimators, except the RNN
+  ones DynamicRNNEstimator and the StateSavingRNNEstimator.
+
+  It can either create a parsing config for a SparseTensor (with is_sparse=True)
+  or a padded Tensor.
+  The (dense_)shape of the result will be [batch_size, None], which can be used
+  with is_sparse=False as input into an RNN (see DynamicRNNEstimator or
+  StateSavingRNNEstimator) or with is_sparse=True as input into a tree (see
+  gtflow).
+
+  Use real_valued_column if the Feature has a fixed length. Use some
+  SparseColumn for columns to be embedded / one-hot-encoded.
+
+  Args:
+    column_name: A string defining real valued column name.
+    default_value: A scalar value compatible with dtype. Needs to be specified
+      if is_sparse=False.
+    dtype: Defines the type of values. Default value is tf.float32. Needs to be
+      convertible to tf.float32.
+    normalizer: If not None, a function that can be used to normalize the value
+      of the real valued column after default_value is applied for parsing.
+      Normalizer function takes the input tensor as its argument, and returns
+      the output tensor. (e.g. lambda x: (x - 3.0) / 4.2). Note that for
+      is_sparse=False, the normalizer will be run on the values of the
+      `SparseTensor`.
+    is_sparse: A boolean defining whether to create a SparseTensor or a Tensor.
+  Returns:
+    A _RealValuedSparseColumn.
+  Raises:
+    TypeError: if default_value is not a scalar value compatible with dtype.
+    TypeError: if dtype is not convertible to tf.float32.
+    ValueError: if default_value is None and is_sparse is False.
+  """
+  if not (dtype.is_integer or dtype.is_floating):
+    raise TypeError("dtype must be convertible to float. "
+                    "dtype: {}, column_name: {}".format(dtype, column_name))
+
+  if default_value is None and not is_sparse:
+    raise ValueError("default_value must be provided when is_sparse=False to "
+                     "parse a padded Tensor. "
+                     "column_name: {}".format(column_name))
+  if isinstance(default_value, list):
+    raise ValueError(
+        "Only scalar default value. default_value: {}, column_name: {}".format(
+            default_value, column_name))
+  if default_value is not None:
+    if dtype.is_integer:
+      default_value = int(default_value)
+    elif dtype.is_floating:
+      default_value = float(default_value)
+
+  return _RealValuedVarLenColumn(column_name, default_value, dtype, normalizer,
+                                 is_sparse)
+
+
+class _RealValuedColumn(
+    _FeatureColumn,
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple(
+        "_RealValuedColumn",
+        ["column_name", "dimension", "default_value", "dtype", "normalizer"])):
+  """Represents a real valued feature column also known as continuous features.
+
+  Instances of this class are immutable. The dictionary returned by InputBuilder
+  contains a ("column_name", Tensor) pair with a Tensor shape of
+  (batch_size, dimension).
+  """
+
   def __new__(cls, column_name, dimension, default_value,
               dtype, normalizer):
     if default_value is not None:
@@ -1386,15 +1755,12 @@ class _RealValuedColumn(_FeatureColumn, collections.namedtuple(
 
   @property
   def config(self):
-    if self.dimension is None:
-      return {self.column_name: parsing_ops.VarLenFeature(self.dtype)}
-    else:
-      default_value = self.default_value
-      if default_value is not None:
-        default_value = list(default_value)
-      return {self.column_name: parsing_ops.FixedLenFeature([self.dimension],
-                                                            self.dtype,
-                                                            default_value)}
+    default_value = self.default_value
+    if default_value is not None:
+      default_value = list(default_value)
+    return {self.column_name: parsing_ops.FixedLenFeature([self.dimension],
+                                                          self.dtype,
+                                                          default_value)}
 
   @property
   def key(self):
@@ -1435,13 +1801,25 @@ class _RealValuedColumn(_FeatureColumn, collections.namedtuple(
     return _reshape_real_valued_tensor(input_tensor, output_rank, self.name)
 
   def _to_dense_tensor(self, input_tensor):
-    if isinstance(input_tensor, sparse_tensor_py.SparseTensor):
-      default_value = (self.default_value[0] if self.default_value is not None
-                       else 0)
-      return sparse_ops.sparse_tensor_to_dense(
-          input_tensor, default_value=default_value)
     return input_tensor
 
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([self.dimension])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    return inputs.get(self)
+
+  def _transform_feature(self, inputs):
+    return math_ops.to_float(
+        self._normalized_input_tensor(inputs.get(self.name)))
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
 
 def real_valued_column(column_name,
                        dimension=1,
@@ -1453,10 +1831,7 @@ def real_valued_column(column_name,
   Args:
     column_name: A string defining real valued column name.
     dimension: An integer specifying dimension of the real valued column.
-      The default is 1. When dimension is not None, the Tensor representing
-      the _RealValuedColumn will have the shape of [batch_size, dimension].
-      A None dimension means the feature column should be treat as variable
-      length and will be parsed as a `SparseTensor`.
+      The default is 1.
     default_value: A single value compatible with dtype or a list of values
       compatible with dtype which the column takes on during tf.Example parsing
       if data is missing. When dimension is not None, a default value of None
@@ -1484,15 +1859,19 @@ def real_valued_column(column_name,
     ValueError: if dtype is not convertible to tf.float32.
   """
 
-  if dimension is not None:
-    if not isinstance(dimension, int):
-      raise TypeError("dimension must be an integer. "
-                      "dimension: {}, column_name: {}".format(dimension,
-                                                              column_name))
-    if dimension < 1:
-      raise ValueError("dimension must be greater than 0. "
-                       "dimension: {}, column_name: {}".format(dimension,
-                                                               column_name))
+  if dimension is None:
+    raise TypeError("dimension must be an integer. Use the "
+                    "_real_valued_var_len_column for variable length features."
+                    "dimension: {}, column_name: {}".format(dimension,
+                                                            column_name))
+  if not isinstance(dimension, int):
+    raise TypeError("dimension must be an integer. "
+                    "dimension: {}, column_name: {}".format(dimension,
+                                                            column_name))
+  if dimension < 1:
+    raise ValueError("dimension must be greater than 0. "
+                     "dimension: {}, column_name: {}".format(dimension,
+                                                             column_name))
 
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError("dtype must be convertible to float. "
@@ -1523,11 +1902,6 @@ def real_valued_column(column_name,
                                normalizer)
 
   if isinstance(default_value, list):
-    if dimension is None:
-      raise ValueError(
-          "Only scalar default value is supported when dimension is None. "
-          "default_value: {}, column_name: {}".format(
-              default_value, column_name))
     if len(default_value) != dimension:
       raise ValueError(
           "The length of default_value must be equal to dimension. "
@@ -1561,8 +1935,12 @@ def real_valued_column(column_name,
                       default_value, dtype, column_name))
 
 
-class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
-    "_BucketizedColumn", ["source_column", "boundaries"])):
+class _BucketizedColumn(
+    _FeatureColumn,
+    fc_core._CategoricalColumn,  # pylint: disable=protected-access
+    fc_core._DenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_BucketizedColumn", ["source_column",
+                                                 "boundaries"])):
   """Represents a bucketization transformation also known as binning.
 
   Instances of this class are immutable. Values in `source_column` will be
@@ -1632,15 +2010,6 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
     """Returns a string which will be used as a key when we do sorting."""
     return "{}".format(self)
 
-  def insert_transformed_feature(self, columns_to_tensors):
-    # Bucketize the source column.
-    if self.source_column not in columns_to_tensors:
-      self.source_column.insert_transformed_feature(columns_to_tensors)
-    columns_to_tensors[self] = bucketization_op.bucketize(
-        columns_to_tensors[self.source_column],
-        boundaries=list(self.boundaries),
-        name="bucketize")
-
   # pylint: disable=unused-argument
   def _to_dnn_input_layer(self,
                           input_tensor,
@@ -1698,6 +2067,43 @@ class _BucketizedColumn(_FeatureColumn, collections.namedtuple(
         initializer=init_ops.zeros_initializer(),
         combiner="sum")
 
+  def _transform_feature(self, inputs):
+    """Handles cross transformation."""
+    # Bucketize the source column.
+    return bucketization_op.bucketize(
+        inputs.get(self.source_column),
+        boundaries=list(self.boundaries),
+        name="bucketize")
+
+  def insert_transformed_feature(self, columns_to_tensors):
+    """Handles sparse column to id conversion."""
+    columns_to_tensors[self] = self._transform_feature(
+        _LazyBuilderByColumnsToTensor(columns_to_tensors))
+
+  @property
+  def _parse_example_config(self):
+    return self.config
+
+  @property
+  def _num_buckets(self):
+    return self.length * self.source_column.dimension
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    return fc_core._CategoricalColumn.IdWeightPair(  # pylint: disable=protected-access
+        self.to_sparse_tensor(inputs.get(self)), None)
+
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape(
+        [self.length * self.source_column.dimension])
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    return self._to_dnn_input_layer(
+        inputs.get(self), weight_collections, trainable)
+
 
 def bucketized_column(source_column, boundaries):
   """Creates a _BucketizedColumn for discretizing dense input.
@@ -1716,13 +2122,14 @@ def bucketized_column(source_column, boundaries):
   return _BucketizedColumn(source_column, boundaries)
 
 
-class _CrossedColumn(_FeatureColumn,
-                     collections.namedtuple("_CrossedColumn",
-                                            ["columns", "hash_bucket_size",
-                                             "hash_key",
-                                             "combiner", "ckpt_to_load_from",
-                                             "tensor_name_in_ckpt"])):
-  """Represents a cross transformation also known as conjuction or combination.
+class _CrossedColumn(
+    _FeatureColumn,
+    fc_core._CategoricalColumn,  # pylint: disable=protected-access
+    collections.namedtuple("_CrossedColumn", [
+        "columns", "hash_bucket_size", "hash_key", "combiner",
+        "ckpt_to_load_from", "tensor_name_in_ckpt"
+    ])):
+  """Represents a cross transformation also known as conjunction or combination.
 
   Instances of this class are immutable. It crosses given `columns`. Crossed
   column output will be hashed to hash_bucket_size.
@@ -1839,12 +2246,37 @@ class _CrossedColumn(_FeatureColumn,
     """Returns the id tensor from the given transformed input_tensor."""
     return input_tensor
 
-  # pylint: disable=unused-argument
   def weight_tensor(self, input_tensor):
     """Returns the weight tensor from the given transformed input_tensor."""
+    del input_tensor
     return None
 
-  def insert_transformed_feature(self, columns_to_tensors):
+  def _to_dnn_input_layer(self,
+                          input_tensor,
+                          weight_collections=None,
+                          trainable=True,
+                          output_rank=2):
+    del input_tensor
+    del weight_collections
+    del trainable
+    del output_rank
+    raise ValueError("CrossedColumn is not supported in DNN. "
+                     "Please use embedding_column. column: {}".format(self))
+
+  def _checkpoint_path(self):
+    if self.ckpt_to_load_from is not None:
+      return self.ckpt_to_load_from, self.tensor_name_in_ckpt
+    return None
+
+  def _wide_embedding_lookup_arguments(self, input_tensor):
+    return _LinearEmbeddingLookupArguments(
+        input_tensor=input_tensor,
+        weight_tensor=None,
+        vocab_size=self.length,
+        initializer=init_ops.zeros_initializer(),
+        combiner=self.combiner)
+
+  def _transform_feature(self, inputs):
     """Handles cross transformation."""
 
     def _collect_leaf_level_columns(cross):
@@ -1860,42 +2292,57 @@ class _CrossedColumn(_FeatureColumn,
     feature_tensors = []
     for c in _collect_leaf_level_columns(self):
       if isinstance(c, _SparseColumn):
-        feature_tensors.append(columns_to_tensors[c.name])
+        feature_tensors.append(inputs.get(c.name))
       else:
-        if c not in columns_to_tensors:
-          c.insert_transformed_feature(columns_to_tensors)
         if isinstance(c, _BucketizedColumn):
-          feature_tensors.append(c.to_sparse_tensor(columns_to_tensors[c]))
+          feature_tensors.append(c.to_sparse_tensor(inputs.get(c)))
         else:
-          feature_tensors.append(columns_to_tensors[c])
-    columns_to_tensors[self] = sparse_feature_cross_op.sparse_feature_cross(
+          feature_tensors.append(inputs.get(c))
+    return sparse_feature_cross_op.sparse_feature_cross(
         feature_tensors,
         hashed_output=True,
         num_buckets=self.hash_bucket_size,
         hash_key=self.hash_key,
         name="cross")
 
-  # pylint: disable=unused-argument
-  def _to_dnn_input_layer(self,
-                          input_tensor,
-                          weight_collections=None,
-                          trainable=True,
-                          output_rank=2):
-    raise ValueError("CrossedColumn is not supported in DNN. "
-                     "Please use embedding_column. column: {}".format(self))
+  def insert_transformed_feature(self, columns_to_tensors):
+    """Handles sparse column to id conversion."""
+    columns_to_tensors[self] = self._transform_feature(
+        _LazyBuilderByColumnsToTensor(columns_to_tensors))
 
-  def _checkpoint_path(self):
-    if self.ckpt_to_load_from is not None:
-      return self.ckpt_to_load_from, self.tensor_name_in_ckpt
-    return None
+  @property
+  def _parse_example_config(self):
+    return self.config
 
-  def _wide_embedding_lookup_arguments(self, input_tensor):
-    return _LinearEmbeddingLookupArguments(
-        input_tensor=input_tensor,
-        weight_tensor=None,
-        vocab_size=self.length,
-        initializer=init_ops.zeros_initializer(),
-        combiner=self.combiner)
+  @property
+  def _num_buckets(self):
+    return self.length
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    return fc_core._CategoricalColumn.IdWeightPair(inputs.get(self), None)  # pylint: disable=protected-access
+
+
+class _LazyBuilderByColumnsToTensor(object):
+
+  def __init__(self, columns_to_tensors):
+    self._columns_to_tensors = columns_to_tensors
+
+  def get(self, key):
+    """Gets the transformed feature column."""
+    if key in self._columns_to_tensors:
+      return self._columns_to_tensors[key]
+    if isinstance(key, str):
+      raise ValueError(
+          "features dictionary doesn't contain key ({})".format(key))
+    if not isinstance(key, _FeatureColumn):
+      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
+                      "Provided: {}".format(key))
+
+    key.insert_transformed_feature(self._columns_to_tensors)
+    return self._columns_to_tensors[key]
 
 
 def crossed_column(columns, hash_bucket_size, combiner="sum",
@@ -2020,6 +2467,7 @@ def _get_feature_config(feature_column):
         "Given column is {}".format(feature_column))
   if isinstance(feature_column, (_SparseColumn, _WeightedSparseColumn,
                                  _EmbeddingColumn, _RealValuedColumn,
+                                 _RealValuedVarLenColumn,
                                  _BucketizedColumn, _CrossedColumn,
                                  _OneHotColumn, _ScatteredEmbeddingColumn)):
     return feature_column.config
@@ -2089,7 +2537,8 @@ def _create_sequence_feature_spec_for_parsing(sequence_feature_columns,
   feature_spec = create_feature_spec_for_parsing(sequence_feature_columns)
   sequence_feature_spec = {}
   for key, feature in feature_spec.items():
-    if isinstance(feature, parsing_ops.VarLenFeature):
+    if (isinstance(feature, parsing_ops.VarLenFeature) or
+        isinstance(feature, parsing_ops.FixedLenSequenceFeature)):
       sequence_feature = feature
     elif isinstance(feature, parsing_ops.FixedLenFeature):
       default_is_set = feature.default_value is not None
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index 7f1bfc9605bd1df75fa5b048740ba00b2a6bf022..fa0047f05d893f6543ddb1680824a32469e13293 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -18,7 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.framework import checkpoint_utils
+import functools
+
 from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
 from tensorflow.contrib.layers.python.layers import embedding_ops
@@ -34,106 +35,60 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
 
 
-def _embeddings_from_arguments(column,
-                               args,
-                               weight_collections,
-                               trainable,
-                               output_rank=2):
-  """Returns embeddings for a column based on the computed arguments.
+def _maybe_reshape_input_tensor(tensor, column_name, output_rank):
+  """Reshape the input tensor by the following rule.
 
-  Args:
-   column: the column name.
-   args: the _DeepEmbeddingLookupArguments for this column.
-   weight_collections: collections to store weights in.
-   trainable: whether these embeddings should be trainable.
-   output_rank: the desired rank of the returned `Tensor`. Inner dimensions will
-     be combined to produce the desired rank.
+  1. If `output_rank > input_rank + 1`, raise a `ValueError`.
+  2. If `output_rank == input_rank + 1`, expand the tensor by one dimension.
+  3. If `output_rank == input_rank`, do nothing.
+  4. If `output_rank < input_rank`, flatten the inner dimensions of the tensor.
 
+  Args:
+    tensor: A Tensor or SparseTensor to be reshaped.
+    column_name: A string name of the feature column for the tensor.
+    output_rank: the desired rank of the tensor.
   Returns:
-   the embeddings.
-
+    A reshaped Tensor or SparseTensor.
   Raises:
-   ValueError: if not possible to create.
+    ValueError: if `output_rank > input_rank + 1` for the input tensor.
   """
-  # pylint: disable=protected-access
-  input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
-  weight_tensor = None
-  if args.weight_tensor is not None:
-    weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank)
-  # pylint: enable=protected-access
-
-  # This option is only enabled for scattered_embedding_column.
-  if args.hash_key:
-    embeddings = contrib_variables.model_variable(
-        name='weights',
-        shape=[args.vocab_size],
-        dtype=dtypes.float32,
-        initializer=args.initializer,
-        trainable=(trainable and args.trainable),
-        collections=weight_collections)
-
-    return embedding_ops.scattered_embedding_lookup_sparse(
-        embeddings, input_tensor, args.dimension,
-        hash_key=args.hash_key,
-        combiner=args.combiner, name='lookup')
-
-  if args.shared_embedding_name is not None:
-    shared_embedding_collection_name = (
-        'SHARED_EMBEDDING_COLLECTION_' + args.shared_embedding_name.upper())
-    graph = ops.get_default_graph()
-    shared_embedding_collection = (
-        graph.get_collection_ref(shared_embedding_collection_name))
-    shape = [args.vocab_size, args.dimension]
-    if shared_embedding_collection:
-      if len(shared_embedding_collection) > 1:
-        raise ValueError('Collection %s can only contain one '
-                         '(partitioned) variable.'
-                         % shared_embedding_collection_name)
-      else:
-        embeddings = shared_embedding_collection[0]
-        if embeddings.get_shape() != shape:
-          raise ValueError('The embedding variable with name {} already '
-                           'exists, but its shape does not match required '
-                           'embedding shape  here. Please make sure to use '
-                           'different shared_embedding_name for different '
-                           'shared embeddings.'.format(
-                               args.shared_embedding_name))
+  input_rank = tensor.get_shape().ndims
+
+  if input_rank is None and isinstance(tensor, sparse_tensor_py.SparseTensor):
+    # Try to get the rank of a sparse tensor by its dense_shape's shape.
+    input_rank = tensor.dense_shape.get_shape().as_list()[0]
+
+  if input_rank is None:
+    raise ValueError('Error while processing column {}. Rank of input Tensor '
+                     'can not be None.'.format(column_name))
+
+  if output_rank > input_rank + 1:
+    raise ValueError('Error while processing column {}. Rank of input Tensor '
+                     '({}) should be the same as output_rank ({}). For '
+                     'example, sequence data should typically be 3 '
+                     'dimensional (rank 3) while non-sequence data is '
+                     'typically 2 dimensional (rank 2).'.format(
+                         column_name, input_rank, output_rank))
+  elif output_rank == input_rank + 1:
+    # Expand the tensor's shape by 1 dimension.
+    if isinstance(tensor, sparse_tensor_py.SparseTensor):
+      output_shape = array_ops.concat([tensor.dense_shape, [1]], 0)
+      return sparse_ops.sparse_reshape(tensor, output_shape)
     else:
-      embeddings = contrib_variables.model_variable(
-          name=args.shared_embedding_name,
-          shape=shape,
-          dtype=dtypes.float32,
-          initializer=args.initializer,
-          trainable=(trainable and args.trainable),
-          collections=weight_collections)
-      graph.add_to_collection(shared_embedding_collection_name, embeddings)
+      reshaped = array_ops.expand_dims(tensor, -1)
+      # Try to calculate the new shape.
+      static_shape = tensor.get_shape()
+      if static_shape is not None and static_shape.dims is not None:
+        reshaped.set_shape(static_shape.as_list() + [1])
+      return reshaped
+  elif output_rank < input_rank:
+    return layers._inner_flatten(tensor, output_rank)  # pylint: disable=protected-access
   else:
-    embeddings = contrib_variables.model_variable(
-        name='weights',
-        shape=[args.vocab_size, args.dimension],
-        dtype=dtypes.float32,
-        initializer=args.initializer,
-        trainable=(trainable and args.trainable),
-        collections=weight_collections)
-
-  if isinstance(embeddings, variables.Variable):
-    embeddings = [embeddings]
-  else:
-    embeddings = embeddings._get_variable_list()  # pylint: disable=protected-access
-  # pylint: disable=protected-access
-  _maybe_restore_from_checkpoint(
-      column._checkpoint_path(), embeddings)
-  return embedding_ops.safe_embedding_lookup_sparse(
-      embeddings,
-      input_tensor,
-      sparse_weights=weight_tensor,
-      combiner=args.combiner,
-      name=column.name + 'weights',
-      max_norm=args.max_norm)
+    return tensor
 
 
 def _input_from_feature_columns(columns_to_tensors,
@@ -160,16 +115,23 @@ def _input_from_feature_columns(columns_to_tensors,
                                          default_name=column.name,
                                          values=columns_to_tensors.values()):
         transformed_tensor = transformer.transform(column)
+        if output_rank == 3:
+          transformed_tensor = nest.map_structure(
+              functools.partial(
+                  _maybe_reshape_input_tensor,
+                  column_name=column.name,
+                  output_rank=output_rank), transformed_tensor)
         try:
           # pylint: disable=protected-access
           arguments = column._deep_embedding_lookup_arguments(
               transformed_tensor)
-          output_tensors.append(_embeddings_from_arguments(
-              column,
-              arguments,
-              weight_collections,
-              trainable,
-              output_rank=output_rank))
+          output_tensors.append(
+              fc._embeddings_from_arguments(  # pylint: disable=protected-access
+                  column,
+                  arguments,
+                  weight_collections,
+                  trainable,
+                  output_rank=output_rank))
 
         except NotImplementedError as ee:
           try:
@@ -325,7 +287,7 @@ def _create_embedding_lookup(column,
         initializer=embedding_lookup_arguments.initializer,
         trainable=trainable,
         collections=weight_collections)
-    if isinstance(variable, variables.Variable):
+    if fc._is_variable(variable):  # pylint: disable=protected-access
       variable = [variable]
     else:
       variable = variable._get_variable_list()  # pylint: disable=protected-access
@@ -338,16 +300,6 @@ def _create_embedding_lookup(column,
     return variable, predictions
 
 
-def _maybe_restore_from_checkpoint(checkpoint_path, variable):
-  if checkpoint_path is not None:
-    path, tensor_name = checkpoint_path
-    weights_to_restore = variable
-    if len(variable) == 1:
-      weights_to_restore = variable[0]
-    checkpoint_utils.init_from_checkpoint(path,
-                                          {tensor_name: weights_to_restore})
-
-
 def _create_joint_embedding_lookup(columns_to_tensors,
                                    embedding_lookup_arguments,
                                    num_outputs,
@@ -383,7 +335,7 @@ def _create_joint_embedding_lookup(columns_to_tensors,
         initializer=init_ops.zeros_initializer(),
         trainable=trainable,
         collections=weight_collections)
-    if isinstance(variable, variables.Variable):
+    if fc._is_variable(variable):  # pylint: disable=protected-access
       variable = [variable]
     else:
       variable = variable._get_variable_list()  # pylint: disable=protected-access
@@ -548,7 +500,8 @@ def weighted_sum_from_feature_columns(columns_to_tensors,
             default_name=column.name,
             values=columns_to_tensors.values()):
           tensor = column._to_dense_tensor(transformed_tensor)
-          tensor = fc._reshape_real_valued_tensor(tensor, 2, column.name)
+          tensor = _maybe_reshape_input_tensor(
+              tensor, column.name, output_rank=2)
           variable = [
               contrib_variables.model_variable(
                   name='weight',
@@ -565,7 +518,7 @@ def weighted_sum_from_feature_columns(columns_to_tensors,
           predictions, shape=(-1, num_outputs)))
       column_to_variable[column] = variable
       _log_variable(variable)
-      _maybe_restore_from_checkpoint(column._checkpoint_path(), variable)
+      fc._maybe_restore_from_checkpoint(column._checkpoint_path(), variable)  # pylint: disable=protected-access
     # pylint: enable=protected-access
     predictions_no_bias = math_ops.add_n(output_tensors)
     bias = contrib_variables.model_variable(
@@ -758,10 +711,10 @@ def parse_feature_columns_from_sequence_examples(
 def _log_variable(variable):
   if isinstance(variable, list):
     for var in variable:
-      if isinstance(variable, variables.Variable):
+      if fc._is_variable(variable):  # pylint: disable=protected-access
         logging.info('Created variable %s, with device=%s', var.name,
                      var.device)
-  elif isinstance(variable, variables.Variable):
+  elif fc._is_variable(variable):  # pylint: disable=protected-access
     logging.info('Created variable %s, with device=%s', variable.name,
                  variable.device)
 
@@ -903,7 +856,8 @@ def _add_variable_collection(weight_collections):
 # pylint: disable=protected-access
 _SUPPORTED_SEQUENCE_COLUMNS = (fc._OneHotColumn,
                                fc._EmbeddingColumn,
-                               fc._RealValuedColumn)
+                               fc._RealValuedColumn,
+                               fc._RealValuedVarLenColumn)
 
 _FORBIDDEN_SEQUENCE_COLUMNS = (fc._ScatteredEmbeddingColumn,
                                fc._BucketizedColumn,
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 35daef9cc6e1639660e9228823aace6adff56c30..797a7c11dbf9f3d044687e93ba9f9fe4df3a8357 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -27,14 +27,15 @@ from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import feature_column_ops
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
@@ -52,8 +53,8 @@ class TransformerTest(test.TestCase):
       self.assertAllEqual(output.eval(), [[20.], [110], [-3]])
 
   def testSparseRealValuedColumnIdentityTransformation(self):
-    sparse_real_valued = feature_column.real_valued_column(
-        "rating", dimension=None)
+    sparse_real_valued = feature_column._real_valued_var_len_column(
+        "rating", is_sparse=True)
     rating_tensor = sparse_tensor.SparseTensor(
         values=[2.0, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
     features = {"rating": rating_tensor}
@@ -68,11 +69,10 @@ class TransformerTest(test.TestCase):
   def testSparseRealValuedColumnWithTransformation(self):
 
     def square_fn(x):
-      return sparse_tensor.SparseTensor(
-          values=x.values**2, indices=x.indices, dense_shape=x.dense_shape)
+      return x**2
 
-    sparse_real_valued = feature_column.real_valued_column(
-        "rating", dimension=None, normalizer=square_fn)
+    sparse_real_valued = feature_column._real_valued_var_len_column(
+        "rating", normalizer=square_fn, is_sparse=True)
     rating_tensor = sparse_tensor.SparseTensor(
         values=[2.0, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
     features = {"rating": rating_tensor}
@@ -224,7 +224,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(keys_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[keys_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[keys_sparse].values.eval(), [1, 2, 0])
       self.assertAllEqual(output[keys_sparse].indices.eval(),
@@ -242,7 +242,7 @@ class TransformerTest(test.TestCase):
     output = feature_column_ops._Transformer(features).transform(keys_sparse)
 
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       # While the input is a dense Tensor, the output should be a SparseTensor.
       self.assertIsInstance(output, sparse_tensor.SparseTensor)
       self.assertEqual(output.dtype, dtypes.int64)
@@ -311,7 +311,7 @@ class TransformerTest(test.TestCase):
     self.assertIn(weighted_ids, output)
 
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output[weighted_ids][0].dense_shape.eval(),
                           ids_tensor.dense_shape.eval())
       self.assertAllEqual(output[weighted_ids][0].indices.eval(),
@@ -341,7 +341,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(vocab_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[vocab_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[vocab_sparse].values.eval(), [1, 2, 0])
       self.assertAllEqual(output[vocab_sparse].indices.eval(),
@@ -363,7 +363,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(vocab_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[vocab_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[vocab_sparse].values.eval(), [1, 2, 0, 1])
       self.assertAllEqual(output[vocab_sparse].indices.eval(),
@@ -387,7 +387,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(vocab_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[vocab_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[vocab_sparse].values.eval(), [1, 2, 0])
       self.assertAllEqual(output[vocab_sparse].indices.eval(),
@@ -409,7 +409,7 @@ class TransformerTest(test.TestCase):
     self.assertEqual(len(output), 1)
     self.assertIn(vocab_sparse, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertEqual(output[vocab_sparse].values.dtype, dtypes.int64)
       self.assertAllEqual(output[vocab_sparse].values.eval(), [1, 2, 0, 1])
       self.assertAllEqual(output[vocab_sparse].indices.eval(),
@@ -561,13 +561,31 @@ class CreateInputLayersForDNNsTest(test.TestCase):
       feature_column_ops.input_from_feature_columns(
           features, {"feature": real_valued})
 
+  def testSparseTensorRealValuedColumn(self):
+    var_len_sparse_real_valued_column = (
+        feature_column._real_valued_var_len_column("rating", is_sparse=True))
+    features = {
+        "ids":
+            sparse_tensor.SparseTensor(
+                values=["c", "b", "a"],
+                indices=[[0, 0], [1, 0], [2, 0]],
+                dense_shape=[3, 1]),
+        "income":
+            constant_op.constant([[20.3, 10], [110.3, 0.4], [-3.0, 30.4]]),
+        "rating":
+            sparse_tensor.SparseTensor(
+                values=[3.5, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
+    }
+    with self.assertRaisesRegexp(
+        ValueError,
+        "dd"):
+      feature_column_ops.input_from_feature_columns(
+          features, [var_len_sparse_real_valued_column])
+
   def testAllDNNColumns(self):
     sparse_column = feature_column.sparse_column_with_keys(
         "ids", ["a", "b", "c", "unseen"])
-
     real_valued_column = feature_column.real_valued_column("income", 2)
-    sparse_real_valued_column = feature_column.real_valued_column(
-        "rating", dimension=None)
     one_hot_column = feature_column.one_hot_column(sparse_column)
     embedding_column = feature_column.embedding_column(sparse_column, 10)
     features = {
@@ -578,18 +596,16 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                 dense_shape=[3, 1]),
         "income":
             constant_op.constant([[20.3, 10], [110.3, 0.4], [-3.0, 30.4]]),
-        "rating":
-            sparse_tensor.SparseTensor(
-                values=[3.5, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
     }
-    output = feature_column_ops.input_from_feature_columns(features, [
-        one_hot_column, embedding_column, real_valued_column,
-        sparse_real_valued_column
-    ])
+    columns = [one_hot_column, embedding_column, real_valued_column]
+    output = feature_column_ops.input_from_feature_columns(features, columns)
+    output_core = fc_core.input_layer(features, columns)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
-      self.assertAllEqual(output.eval().shape, [3, 3 + 4 + 10])
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual(output.eval().shape, [3, 2 + 4 + 10])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
   def testRealValuedColumn(self):
     real_valued = feature_column.real_valued_column("price")
@@ -598,6 +614,9 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [real_valued])
     with self.test_session():
       self.assertAllClose(output.eval(), features["price"].eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [real_valued]).eval())
 
   def testRealValuedColumnWithMultiDimensions(self):
     real_valued = feature_column.real_valued_column("price", 2)
@@ -608,17 +627,31 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [real_valued])
     with self.test_session():
       self.assertAllClose(output.eval(), features["price"].eval())
-
-  def testRealValuedColumnSparse(self):
-    sparse_real_valued = feature_column.real_valued_column(
-        "rating", dimension=None, default_value=-1)
-    rating_tensor = sparse_tensor.SparseTensor(
-        values=[2.0, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
-    features = {"rating": rating_tensor}
-    output = feature_column_ops.input_from_feature_columns(features,
-                                                           [sparse_real_valued])
-    with self.test_session():
-      self.assertAllClose(output.eval(), [[2.0], [-1.0], [5.0]])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [real_valued]).eval())
+
+  def testRealValuedColumnDense(self):
+    var_len_real_valued = feature_column._real_valued_var_len_column(
+        "rating", default_value=-1)
+    rating = np.array([[0., 1., 2., -1.],
+                       [3., 4., 5., 6.]])
+    features = {"rating": constant_op.constant(rating)}
+    with self.test_session() as sess:
+      output = sess.run(feature_column_ops.input_from_feature_columns(
+          features, [var_len_real_valued]))
+    self.assertAllClose(rating, output)
+
+  def testRealValuedColumnTypeConversion(self):
+    var_len_real_valued = feature_column._real_valued_var_len_column(
+        "rating", default_value=-1)
+    rating = np.array([[0, 1, 2, -1],
+                       [3, 4, 5, 6]])
+    features = {"rating": constant_op.constant(rating, dtype=dtypes.int64)}
+    with self.test_session() as sess:
+      output = sess.run(feature_column_ops.input_from_feature_columns(
+          features, [var_len_real_valued]))
+    self.assertAllClose(rating.astype(np.float32), output)
 
   def testRealValuedColumnWithNormalizer(self):
     real_valued = feature_column.real_valued_column(
@@ -628,6 +661,9 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [real_valued])
     with self.test_session():
       self.assertAllClose(output.eval(), features["price"].eval() - 2)
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [real_valued]).eval())
 
   def testRealValuedColumnWithMultiDimensionsAndNormalizer(self):
     real_valued = feature_column.real_valued_column(
@@ -639,6 +675,9 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [real_valued])
     with self.test_session():
       self.assertAllClose(output.eval(), features["price"].eval() - 2)
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [real_valued]).eval())
 
   def testBucketizedColumnWithNormalizerSucceedsForDNN(self):
     bucket = feature_column.bucketized_column(
@@ -651,6 +690,8 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     expected = [[0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
     with self.test_session():
       self.assertAllClose(output.eval(), expected)
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [bucket]).eval())
 
   def testBucketizedColumnWithMultiDimensionsSucceedsForDNN(self):
     bucket = feature_column.bucketized_column(
@@ -665,6 +706,8 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                 [1, 0, 0, 0, 1, 0, 0, 0]]
     with self.test_session():
       self.assertAllClose(output.eval(), expected)
+      self.assertAllClose(output.eval(),
+                          fc_core.input_layer(features, [bucket]).eval())
 
   def testOneHotColumnFromWeightedSparseColumnSucceedsForDNN(self):
     ids_column = feature_column.sparse_column_with_keys(
@@ -683,11 +726,14 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     one_hot_column = feature_column.one_hot_column(weighted_ids_column)
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_column])
+    output_core = fc_core.input_layer(features, [one_hot_column])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual([[0, 0, 10., 0], [0, 20., 0, 0], [30., 0, 40., 0]],
                           output.eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testOneHotColumnFromSparseColumnWithKeysSucceedsForDNN(self):
     ids_column = feature_column.sparse_column_with_keys(
@@ -700,12 +746,15 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     features = {"ids": ids_tensor}
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_sparse])
+    output_core = fc_core.input_layer(features, [one_hot_sparse])
 
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual([[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 0, 0]],
                           output.eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testOneHotColumnFromMultivalentSparseColumnWithKeysSucceedsForDNN(self):
     ids_column = feature_column.sparse_column_with_keys(
@@ -718,12 +767,15 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     features = {"ids": ids_tensor}
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_sparse])
+    output_core = fc_core.input_layer(features, [one_hot_sparse])
 
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual([[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 1, 0]],
                           output.eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testOneHotColumnFromSparseColumnWithIntegerizedFeaturePassesForDNN(self):
     ids_column = feature_column.sparse_column_with_integerized_feature(
@@ -738,10 +790,13 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     }
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_sparse])
+    output_core = fc_core.input_layer(features, [one_hot_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual([[0, 0, 1, 0], [0, 1, 0, 0], [1, 0, 1, 0]],
                           output.eval())
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testOneHotColumnFromSparseColumnWithHashBucketSucceedsForDNN(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("feat", 10)
@@ -753,10 +808,13 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     one_hot_sparse = feature_column.one_hot_column(hashed_sparse)
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [one_hot_sparse])
+    output_core = fc_core.input_layer(features, [one_hot_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual([3, 10], output.eval().shape)
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testEmbeddingColumnSucceedsForDNN(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -768,9 +826,12 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     embeded_sparse = feature_column.embedding_column(hashed_sparse, 10)
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [embeded_sparse])
+    output_core = fc_core.input_layer(features, [embeded_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(output.eval().shape, [4, 10])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
   def testScatteredEmbeddingColumnSucceedsForDNN(self):
     wire_tensor = sparse_tensor.SparseTensor(
@@ -786,14 +847,24 @@ class CreateInputLayersForDNNsTest(test.TestCase):
         features, [embedded_sparse], weight_collections=["my_collection"])
     weights = ops.get_collection("my_collection")
     grad = gradients_impl.gradients(output, weights)
+    # Calcuates the tensors calculated by FC core libs. Later, the values will
+    # be compared with the contrib version.
+    output_core = fc_core.input_layer(
+        features, [embedded_sparse], weight_collections=["my_collection_core"])
+    weights_core = ops.get_collection("my_collection_core")
+    grad_core = gradients_impl.gradients(output_core, weights_core)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       gradient_values = []
+      gradient_values_core = []
       # Collect the gradient from the different partitions (one in this test)
       for p in range(len(grad)):
         gradient_values.extend(grad[p].values.eval())
+        gradient_values_core.extend(grad_core[p].values.eval())
       gradient_values.sort()
+      gradient_values_core.sort()
       self.assertAllEqual(gradient_values, [0.5] * 6 + [2] * 3)
+      self.assertAllEqual(gradient_values, gradient_values_core)
 
   def testEmbeddingColumnWithInitializerSucceedsForDNN(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -809,12 +880,15 @@ class CreateInputLayersForDNNsTest(test.TestCase):
         initializer=init_ops.constant_initializer(init_value))
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [embeded_sparse])
+    output_core = fc_core.input_layer(features, [embeded_sparse])
 
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       output_eval = output.eval()
       self.assertAllEqual(output_eval.shape, [2, 10])
       self.assertAllClose(output_eval, np.tile(init_value, [2, 10]))
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testEmbeddingColumnWithMultipleInitializersFails(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -860,10 +934,14 @@ class CreateInputLayersForDNNsTest(test.TestCase):
     embeded_sparse = feature_column.embedding_column(weighted_ids, 10)
     output = feature_column_ops.input_from_feature_columns(features,
                                                            [embeded_sparse])
+    output_core = fc_core.input_layer(features, [embeded_sparse])
+
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output.eval().shape, [2, 10])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
   def testEmbeddingColumnWithIntegerWeightedSparseColumnSucceedsForDNN(self):
     """Same as the previous test, but with integer weights."""
@@ -885,7 +963,7 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [embeded_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output.eval().shape, [2, 10])
 
   def testEmbeddingColumnWithCrossedColumnSucceedsForDNN(self):
@@ -936,7 +1014,7 @@ class CreateInputLayersForDNNsTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           "Error creating input layer for column: ids_weighted_by_weights"):
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         feature_column_ops.input_from_feature_columns(features, [weighted_ids])
 
   def testCrossedColumnFailsForDNN(self):
@@ -1043,7 +1121,7 @@ class CreateInputLayersForDNNsTest(test.TestCase):
                                                            [embeded_sparse])
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       # score: (sum of weights)
       self.assertAllEqual(output.eval(), [[10.], [50.], [0.]])
 
@@ -1196,6 +1274,19 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
       model_inputs = sess.run(model_input_tensor)
     self.assertAllClose(measurement_input, model_inputs)
 
+  def testRealValuedVarLenColumn(self):
+    var_len_real_valued = feature_column._real_valued_var_len_column(
+        "rating", default_value=-1)
+    rating = np.array([[0., 1., 2., -1.],
+                       [3., 4., 5., 6.]])
+    features = {"rating": constant_op.constant(rating)}
+    with self.test_session() as sess:
+      output = sess.run(
+          feature_column_ops.sequence_input_from_feature_columns(
+              features, [var_len_real_valued]))
+    reshaped_rating = np.reshape(rating, [2, 4, 1])
+    self.assertAllClose(reshaped_rating, output)
+
   def testRealValuedColumnWithExtraDimensions(self):
     batch_size = 4
     sequence_length = 8
@@ -1281,7 +1372,7 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     expected_input_shape = np.array([4, 3, 4])
@@ -1315,7 +1406,7 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     expected_input_shape = np.array([4, 3, hash_buckets])
@@ -1345,7 +1436,36 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
+      model_input = sess.run(model_input_tensor)
+
+    self.assertAllEqual(expected_input_shape, model_input.shape)
+
+  def testEmbeddingColumnWithAutoReshape(self):
+    hash_buckets = 10
+    embedding_dimension = 5
+    ids_tensor = sparse_tensor.SparseTensor(
+        values=["c", "b",
+                "a", "c", "b",
+                "b"],
+        indices=[[0, 0], [0, 1],
+                 [1, 0], [1, 1], [1, 2],
+                 [3, 2]],
+        dense_shape=[4, 3])
+
+    expected_input_shape = np.array([4, 3, embedding_dimension])
+
+    hashed_ids_column = feature_column.sparse_column_with_hash_bucket(
+        "ids", hash_buckets)
+    embedded_column = feature_column.embedding_column(hashed_ids_column,
+                                                      embedding_dimension)
+    columns_to_tensors = {"ids": ids_tensor}
+    model_input_tensor = feature_column_ops.sequence_input_from_feature_columns(
+        columns_to_tensors, [embedded_column])
+
+    with self.test_session() as sess:
+      variables_lib.global_variables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     self.assertAllEqual(expected_input_shape, model_input.shape)
@@ -1375,14 +1495,14 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
                                                embedding_weights)
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input, gradients = sess.run([model_input_tensor, gradient_tensor])
 
     expected_input_shape = [4, 3, embedding_dimension]
     self.assertAllEqual(expected_input_shape, model_input.shape)
 
-    # `ids_tensor` consists of 7 instances of <empty>, 3 occurences of "b",
-    # 2 occurences of "c" and 1 instance of "a".
+    # `ids_tensor` consists of 7 instances of <empty>, 3 occurrences of "b",
+    # 2 occurrences of "c" and 1 instance of "a".
     expected_gradient_values = sorted([0., 3., 2., 1.] * embedding_dimension)
     actual_gradient_values = np.sort(gradients[0].values, axis=None)
     self.assertAllClose(expected_gradient_values, actual_gradient_values)
@@ -1442,7 +1562,7 @@ class SequenceInputFromFeatureColumnTest(test.TestCase):
 
     with self.test_session() as sess:
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       model_input = sess.run(model_input_tensor)
 
     expected_input_shape = [
@@ -1476,9 +1596,12 @@ class WeightedSumTest(test.TestCase):
     features = {"wire": wire_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [hashed_sparse], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [hashed_sparse], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testSparseIntColumn(self):
     """Tests a sparse column with int values."""
@@ -1491,9 +1614,12 @@ class WeightedSumTest(test.TestCase):
     features = {"wire": wire_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [hashed_sparse], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [hashed_sparse], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testSparseColumnWithDenseInputTensor(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -1502,9 +1628,12 @@ class WeightedSumTest(test.TestCase):
     features = {"wire": wire_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [hashed_sparse], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [hashed_sparse], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testWeightedSparseColumn(self):
     ids = feature_column.sparse_column_with_keys("ids",
@@ -1521,10 +1650,13 @@ class WeightedSumTest(test.TestCase):
     features = {"ids": ids_tensor, "weights": weights_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [weighted_ids], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [weighted_ids], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testWeightedSparseColumnWithDenseInputTensor(self):
     ids = feature_column.sparse_column_with_keys(
@@ -1536,11 +1668,14 @@ class WeightedSumTest(test.TestCase):
     features = {"ids": ids_tensor, "weights": weights_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [weighted_ids], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [weighted_ids], units=5)
 
     with self.test_session():
       variables_lib.global_variables_initializer().run()
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testCrossedColumn(self):
     a = feature_column.sparse_column_with_hash_bucket(
@@ -1555,9 +1690,12 @@ class WeightedSumTest(test.TestCase):
     features = {"aaa": wire_tensor, "bbb": wire_tensor}
     logits, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [crossed], num_outputs=5)
+    logits_core = fc_core.linear_model(features, [crossed], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(logits.eval().shape, [2, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(logits.eval(), logits_core.eval())
 
   def testEmbeddingColumn(self):
     hashed_sparse = feature_column.sparse_column_with_hash_bucket("wire", 10)
@@ -1591,9 +1729,11 @@ class WeightedSumTest(test.TestCase):
       output, column_to_variable, _ = (
           feature_column_ops.weighted_sum_from_feature_columns(
               features, [movies], num_outputs=1))
+      logits_core = fc_core.linear_model(features, [movies])
+
       with self.test_session() as sess:
         variables_lib.initialize_all_variables().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[movies][0]
         self.assertEqual(weights.get_shape(), (3, 1))
@@ -1601,6 +1741,8 @@ class WeightedSumTest(test.TestCase):
         # score for first example = 0.3 (matrix) + 0.1 (head-on) = 0.4
         # score for second example = 0.5 (winter sleep)
         self.assertAllClose(output.eval(), [[0.4], [0.5]])
+        # Cross compatibility: Core builder output should equal to contrib.
+        self.assertAllEqual(output.eval().shape, logits_core.eval().shape)
 
   def testRealValuedColumnWithMultiDimensions(self):
     real_valued = feature_column.real_valued_column("price", 2)
@@ -1645,9 +1787,13 @@ class WeightedSumTest(test.TestCase):
     }
     output, _, _ = feature_column_ops.weighted_sum_from_feature_columns(
         features, [real_valued, bucket, hashed_sparse, crossed], num_outputs=5)
+    output_core = fc_core.linear_model(
+        features, [real_valued, bucket, hashed_sparse, crossed], units=5)
     with self.test_session():
       variables_lib.global_variables_initializer().run()
       self.assertAllEqual(output.eval().shape, [3, 5])
+      # Verify cross compatibility: Core builder output should equal to contrib.
+      self.assertAllEqual(output.eval(), output_core.eval())
 
   def testPredictions(self):
     language = feature_column.sparse_column_with_keys(
@@ -1668,7 +1814,7 @@ class WeightedSumTest(test.TestCase):
               features, [age, language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         self.assertAllClose(output.eval(), [[0.], [0.]])
 
@@ -1708,7 +1854,7 @@ class WeightedSumTest(test.TestCase):
       self.assertEqual(len(variables), 1)
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         self.assertAllClose(output.eval(), [[0.], [0.]])
 
@@ -1772,7 +1918,7 @@ class WeightedSumTest(test.TestCase):
               features, [weighted_language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         self.assertAllClose(output.eval(), [[0.], [0.]])
 
@@ -1800,7 +1946,7 @@ class WeightedSumTest(test.TestCase):
               features, [language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         # score: 0.1 + language_weight['hindi'] + language_weight['english']
         sess.run(bias.assign([0.1]))
@@ -1823,7 +1969,7 @@ class WeightedSumTest(test.TestCase):
               features, [movies], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[movies][0]
         self.assertEqual(weights.get_shape(), (15, 1))
@@ -1857,7 +2003,7 @@ class WeightedSumTest(test.TestCase):
               features, [country_language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[country_language][0]
         sess.run(weights.assign(weights + 0.4))
@@ -1881,7 +2027,7 @@ class WeightedSumTest(test.TestCase):
               features, [language_language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[language_language][0]
         sess.run(weights.assign(weights + 0.4))
@@ -1914,7 +2060,7 @@ class WeightedSumTest(test.TestCase):
               features, [country_language], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[country_language][0]
         sess.run(weights.assign(weights + 0.4))
@@ -1955,7 +2101,7 @@ class WeightedSumTest(test.TestCase):
                 scope=scope))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         self.assertEqual(2, len(column_to_variable[country]))
         self.assertEqual(3, len(column_to_variable[language]))
@@ -1992,7 +2138,7 @@ class WeightedSumTest(test.TestCase):
               features, [country, age, incomes], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         incomes_weights = column_to_variable[incomes][0]
         sess.run(incomes_weights.assign([[0.1], [0.2], [0.3]]))
@@ -2004,7 +2150,9 @@ class WeightedSumTest(test.TestCase):
     age = feature_column.real_valued_column("age")
     # The following RealValuedColumn has no predefined dimension so it
     # can be missing.
-    height = feature_column.real_valued_column("height", dimension=None)
+    height = feature_column._real_valued_var_len_column("height",
+                                                        default_value=0,
+                                                        is_sparse=False)
     # The following RealValuedColumn has 3 dimensions.
     incomes = feature_column.real_valued_column("incomes", 3)
     with ops.Graph().as_default():
@@ -2014,10 +2162,7 @@ class WeightedSumTest(test.TestCase):
           "incomes":
               constant_op.constant([[100., 200., 300.], [10., 20., 30.]]),
           "height":
-              sparse_tensor.SparseTensor(
-                  values=[5.0, 4.0, 6.0],
-                  indices=[[0, 0], [0, 1], [1, 1]],
-                  dense_shape=[2, 2]),
+              constant_op.constant([[5., 4.], [0., 6.]]),
           "country":
               sparse_tensor.SparseTensor(
                   values=["US", "SV"],
@@ -2029,7 +2174,7 @@ class WeightedSumTest(test.TestCase):
               features, [country, age, height, incomes], num_outputs=5))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         height_weights = column_to_variable[height][0]
         sess.run(
@@ -2057,9 +2202,12 @@ class WeightedSumTest(test.TestCase):
       output, column_to_variable, _ = (
           feature_column_ops.weighted_sum_from_feature_columns(
               features, [bucket], num_outputs=1))
+      output_core = fc_core.linear_model(features, [bucket])
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
+        # Cross compatibility: Core builder output should equal to contrib.
+        self.assertAllEqual(output.eval(), output_core.eval())
 
         sess.run(column_to_variable[bucket][0].assign([[0.1], [0.2], [0.3],
                                                        [0.4]]))
@@ -2085,9 +2233,12 @@ class WeightedSumTest(test.TestCase):
       output, column_to_variable, _ = (
           feature_column_ops.weighted_sum_from_feature_columns(
               features, [bucket, country], num_outputs=1))
+      output_core = fc_core.linear_model(features, [bucket, country])
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
+        # Cross compatibility: Core builder output should equal to contrib.
+        self.assertAllEqual(output.eval(), output_core.eval())
 
         # dimension = 2, bucket_size = 4, num_classes = 1
         sess.run(column_to_variable[bucket][0].assign(
@@ -2116,7 +2267,7 @@ class WeightedSumTest(test.TestCase):
               features, [bucket, country], num_outputs=5))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         # dimension = 2, bucket_size = 4, num_classes = 5
         sess.run(column_to_variable[bucket][0].assign(
@@ -2152,7 +2303,7 @@ class WeightedSumTest(test.TestCase):
               features, [country_price], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[country_price][0]
         sess.run(weights.assign(weights + 0.4))
@@ -2191,7 +2342,7 @@ class WeightedSumTest(test.TestCase):
               features, [country_language_price], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[country_language_price][0]
         sess.run(weights.assign(weights + 0.4))
@@ -2215,7 +2366,7 @@ class WeightedSumTest(test.TestCase):
               features, [product], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         product_weights = column_to_variable[product][0]
         sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
         self.assertAllClose(output.eval(), [[0.1], [0.5], [0.3]])
@@ -2230,7 +2381,7 @@ class WeightedSumTest(test.TestCase):
               features, [product], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         product_weights = column_to_variable[product][0]
         sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
         self.assertAllClose(output.eval(), [[0.1], [0.5], [0.3]])
@@ -2245,7 +2396,7 @@ class WeightedSumTest(test.TestCase):
               features, [product], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         product_weights = column_to_variable[product][0]
         sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
         self.assertAllClose(output.eval(), [[0.6], [0.7]])
@@ -2266,7 +2417,7 @@ class WeightedSumTest(test.TestCase):
               features, [product], num_outputs=1))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         product_weights = column_to_variable[product][0]
         sess.run(product_weights.assign([[0.1], [0.2], [0.3], [0.4], [0.5]]))
         self.assertAllClose(output.eval(), [[0.1], [0.5], [0.3]])
@@ -2278,7 +2429,7 @@ class WeightedSumTest(test.TestCase):
           features, [feature_column.real_valued_column("age")], num_outputs=3)
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         sess.run(bias.assign([0.1, 0.2, 0.3]))
         self.assertAllClose(output.eval(), [[0.1, 0.2, 0.3], [0.1, 0.2, 0.3],
                                             [0.1, 0.2, 0.3], [0.1, 0.2, 0.3]])
@@ -2292,7 +2443,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (1, 3))
         sess.run(weights.assign([[0.01, 0.03, 0.05]]))
@@ -2316,7 +2467,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (5, 3))
         sess.run(
@@ -2342,7 +2493,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (5, 3))
@@ -2382,7 +2533,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (5, 3))
@@ -2411,7 +2562,7 @@ class WeightedSumTest(test.TestCase):
               features, [column], num_outputs=3))
       with self.test_session() as sess:
         variables_lib.global_variables_initializer().run()
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[column][0]
         self.assertEqual(weights.get_shape(), (5, 3))
@@ -2476,7 +2627,7 @@ class ParseExampleTest(test.TestCase):
     self.assertIn(bucket, output)
     self.assertIn(wire_cast, output)
     with self.test_session():
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(output[bucket].eval(), [[2, 3, 0]])
       self.assertAllEqual(output[wire_cast].indices.eval(), [[0, 0], [0, 1]])
       self.assertAllEqual(output[wire_cast].values.eval(), [2, 0])
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index dc706d24f274e23ed693668680c78dc38c196865..b6a8b6bdda390bc685352021b6a881457cd5740c 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -173,7 +173,7 @@ class FeatureColumnTest(test.TestCase):
     for i in range(len(b1_value)):
       self.assertAllClose(b1_value[i], b2_value[i])
 
-    # Test the case when a shared_embedding_name is explictly specified.
+    # Test the case when a shared_embedding_name is explicitly specified.
     d = fc.shared_embedding_columns(
         [a1, a2],
         dimension=4,
@@ -287,6 +287,51 @@ class FeatureColumnTest(test.TestCase):
     self.assertEqual(column.name, "a_one_hot")
     self.assertEqual(column.length, 4)
 
+  def testRealValuedVarLenColumn(self):
+    c = fc._real_valued_var_len_column("ccc", is_sparse=True)
+    self.assertTrue(c.is_sparse)
+    self.assertTrue(c.default_value is None)
+    # default_value is an integer.
+    c5 = fc._real_valued_var_len_column("c5", default_value=2)
+    self.assertEqual(c5.default_value, 2)
+    # default_value is a float.
+    d4 = fc._real_valued_var_len_column("d4", is_sparse=True)
+    self.assertEqual(d4.default_value, None)
+    self.assertEqual(d4.is_sparse, True)
+    # Default value is a list but dimension is None.
+    with self.assertRaisesRegexp(ValueError,
+                                 "Only scalar default value.*"):
+      fc._real_valued_var_len_column("g5", default_value=[2., 3.])
+
+  def testRealValuedVarLenColumnDtypes(self):
+    rvc = fc._real_valued_var_len_column("rvc", is_sparse=True)
+    self.assertDictEqual(
+        {
+            "rvc": parsing_ops.VarLenFeature(dtype=dtypes.float32)
+        }, rvc.config)
+
+    rvc = fc._real_valued_var_len_column("rvc", default_value=0,
+                                         is_sparse=False)
+    self.assertDictEqual(
+        {
+            "rvc": parsing_ops.FixedLenSequenceFeature(shape=[],
+                                                       dtype=dtypes.float32,
+                                                       allow_missing=True,
+                                                       default_value=0.0)
+        }, rvc.config)
+
+    rvc = fc._real_valued_var_len_column("rvc", dtype=dtypes.int32,
+                                         default_value=0, is_sparse=True)
+    self.assertDictEqual(
+        {
+            "rvc": parsing_ops.VarLenFeature(dtype=dtypes.int32)
+        }, rvc.config)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 "dtype must be convertible to float"):
+      fc._real_valued_var_len_column("rvc", dtype=dtypes.string,
+                                     default_value="", is_sparse=True)
+
   def testRealValuedColumn(self):
     a = fc.real_valued_column("aaa")
     self.assertEqual(a.name, "aaa")
@@ -294,9 +339,6 @@ class FeatureColumnTest(test.TestCase):
     b = fc.real_valued_column("bbb", 10)
     self.assertEqual(b.dimension, 10)
     self.assertTrue(b.default_value is None)
-    c = fc.real_valued_column("ccc", dimension=None)
-    self.assertIsNone(c.dimension)
-    self.assertTrue(c.default_value is None)
 
     with self.assertRaisesRegexp(TypeError, "dimension must be an integer"):
       fc.real_valued_column("d3", dimension=1.0)
@@ -319,8 +361,6 @@ class FeatureColumnTest(test.TestCase):
     c4 = fc.real_valued_column(
         "c4", dimension=4, default_value=2, dtype=dtypes.int32)
     self.assertListEqual(list(c4.default_value), [2, 2, 2, 2])
-    c5 = fc.real_valued_column("c5", dimension=None, default_value=2)
-    self.assertListEqual(list(c5.default_value), [2])
 
     # default_value is a float.
     d1 = fc.real_valued_column("d1", default_value=2.)
@@ -330,8 +370,6 @@ class FeatureColumnTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError,
                                  "default_value must be compatible with dtype"):
       fc.real_valued_column("d3", default_value=2., dtype=dtypes.int32)
-    d4 = fc.real_valued_column("d4", dimension=None, default_value=2.)
-    self.assertListEqual(list(d4.default_value), [2.])
 
     # default_value is neither integer nor float.
     with self.assertRaisesRegexp(TypeError,
@@ -362,12 +400,6 @@ class FeatureColumnTest(test.TestCase):
         ValueError, "The length of default_value must be equal to dimension"):
       fc.real_valued_column("g4", dimension=3, default_value=[2.])
 
-    # Default value is a list but dimension is None.
-    with self.assertRaisesRegexp(ValueError,
-                                 "Only scalar default value is supported "
-                                 "when dimension is None"):
-      fc.real_valued_column("g5", dimension=None, default_value=[2., 3.])
-
     # Test that the normalizer_fn gets stored for a real_valued_column
     normalizer = lambda x: x - 1
     h1 = fc.real_valued_column("h1", normalizer=normalizer)
@@ -404,30 +436,13 @@ class FeatureColumnTest(test.TestCase):
   def testRealValuedColumnDensification(self):
     """Tests densification behavior of `RealValuedColumn`."""
     # No default value, dimension 1 float.
-    real_valued_column = fc.real_valued_column(
-        "sparse_real_valued1", dimension=None)
+    real_valued_column = fc._real_valued_var_len_column(
+        "sparse_real_valued1", is_sparse=True)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
         values=[2.0, 5.0], indices=[[0, 0], [2, 0]], dense_shape=[3, 1])
-    densified_output = real_valued_column._to_dnn_input_layer(sparse_tensor)
-
-    # With default value, dimension 2 int.
-    real_valued_column_with_default = fc.real_valued_column(
-        "sparse_real_valued2",
-        dimension=None,
-        default_value=-1,
-        dtype=dtypes.int32)
-    sparse_tensor2 = sparse_tensor_lib.SparseTensor(
-        values=[2, 5, 9, 0],
-        indices=[[0, 0], [1, 1], [2, 0], [2, 1]],
-        dense_shape=[3, 2])
-    densified_output2 = real_valued_column_with_default._to_dnn_input_layer(
-        sparse_tensor2)
-
-    with self.test_session() as sess:
-      densified_output_eval, densified_output_eval2 = sess.run(
-          [densified_output, densified_output2])
-      self.assertAllEqual(densified_output_eval, [[2.0], [0.0], [5.0]])
-      self.assertAllEqual(densified_output_eval2, [[2, -1], [-1, 5], [9, 0]])
+    with self.assertRaisesRegexp(
+        ValueError, "Set is_sparse to False"):
+      real_valued_column._to_dnn_input_layer(sparse_tensor)
 
   def testRealValuedColumnDeepCopy(self):
     column = fc.real_valued_column(
@@ -452,9 +467,11 @@ class FeatureColumnTest(test.TestCase):
               column_name="bbb", bucket_size=10), [0])
 
   def testBucketizedColumnRequiresRealValuedColumnDimension(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 "source_column must have a defined dimension"):
-      fc.bucketized_column(fc.real_valued_column("bbb", dimension=None), [0])
+    with self.assertRaisesRegexp(
+        TypeError, "source_column must be an instance of _RealValuedColumn.*"):
+      fc.bucketized_column(fc._real_valued_var_len_column("bbb",
+                                                          is_sparse=True),
+                           [0])
 
   def testBucketizedColumnRequiresSortedBuckets(self):
     with self.assertRaisesRegexp(ValueError,
@@ -564,12 +581,6 @@ class FeatureColumnTest(test.TestCase):
         },
         rvc.config)
 
-    rvc = fc.real_valued_column("rvc", dimension=None)
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.VarLenFeature(dtype=dtypes.float32)
-        }, rvc.config)
-
     rvc = fc.real_valued_column("rvc", dtype=dtypes.int32)
     self.assertDictEqual(
         {
@@ -578,20 +589,10 @@ class FeatureColumnTest(test.TestCase):
         },
         rvc.config)
 
-    rvc = fc.real_valued_column("rvc", dimension=None, dtype=dtypes.int32)
-    self.assertDictEqual(
-        {
-            "rvc": parsing_ops.VarLenFeature(dtype=dtypes.int32)
-        }, rvc.config)
-
     with self.assertRaisesRegexp(ValueError,
                                  "dtype must be convertible to float"):
       fc.real_valued_column("rvc", dtype=dtypes.string)
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "dtype must be convertible to float"):
-      fc.real_valued_column("rvc", dimension=None, dtype=dtypes.string)
-
   def testSparseColumnDtypes(self):
     sc = fc.sparse_column_with_integerized_feature("sc", 10)
     self.assertDictEqual(
@@ -696,8 +697,11 @@ class FeatureColumnTest(test.TestCase):
                                                 "str_id_weights_column")
     real_valued_col1 = fc.real_valued_column("real_valued_column1")
     real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
-    real_valued_col3 = fc.real_valued_column(
-        "real_valued_column3", dimension=None)
+    real_valued_col3 = fc._real_valued_var_len_column(
+        "real_valued_column3", is_sparse=True)
+    real_valued_col4 = fc._real_valued_var_len_column(
+        "real_valued_column4", dtype=dtypes.int64, default_value=0,
+        is_sparse=False)
     bucketized_col1 = fc.bucketized_column(
         fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
     bucketized_col2 = fc.bucketized_column(
@@ -713,8 +717,8 @@ class FeatureColumnTest(test.TestCase):
     feature_columns = set([
         sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
         int64_sparse_id_col, real_valued_col1, real_valued_col2,
-        real_valued_col3, bucketized_col1, bucketized_col2, cross_col,
-        one_hot_col, scattered_embedding_col
+        real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
+        cross_col, one_hot_col, scattered_embedding_col
     ])
     expected_config = {
         "sparse_column":
@@ -737,6 +741,9 @@ class FeatureColumnTest(test.TestCase):
                 [5], dtype=dtypes.float32),
         "real_valued_column3":
             parsing_ops.VarLenFeature(dtype=dtypes.float32),
+        "real_valued_column4":
+            parsing_ops.FixedLenSequenceFeature(
+                [], dtype=dtypes.int64, allow_missing=True, default_value=0),
         "real_valued_column_for_bucketization1":
             parsing_ops.FixedLenFeature(
                 [1], dtype=dtypes.float32),
@@ -773,14 +780,17 @@ class FeatureColumnTest(test.TestCase):
         "real_valued_column3", default_value=[8])
     real_valued_col4 = fc.real_valued_column(
         "real_valued_column4", 3, default_value=[1, 0, 6])
-    real_valued_col5 = fc.real_valued_column(
-        "real_valued_column5", dimension=None, default_value=2)
+    real_valued_col5 = fc._real_valued_var_len_column(
+        "real_valued_column5", default_value=2, is_sparse=True)
+    real_valued_col6 = fc._real_valued_var_len_column(
+        "real_valued_column6", dtype=dtypes.int64, default_value=1,
+        is_sparse=False)
     feature_columns = [
         real_valued_col1, real_valued_col2, real_valued_col3, real_valued_col4,
-        real_valued_col5
+        real_valued_col5, real_valued_col6
     ]
     config = fc.create_feature_spec_for_parsing(feature_columns)
-    self.assertEqual(5, len(config))
+    self.assertEqual(6, len(config))
     self.assertDictEqual(
         {
             "real_valued_column1":
@@ -798,7 +808,11 @@ class FeatureColumnTest(test.TestCase):
                 parsing_ops.FixedLenFeature(
                     [3], dtype=dtypes.float32, default_value=[1., 0., 6.]),
             "real_valued_column5":
-                parsing_ops.VarLenFeature(dtype=dtypes.float32)
+                parsing_ops.VarLenFeature(dtype=dtypes.float32),
+            "real_valued_column6":
+                parsing_ops.FixedLenSequenceFeature(
+                    [], dtype=dtypes.int64, allow_missing=True,
+                    default_value=1)
         },
         config)
 
@@ -816,12 +830,14 @@ class FeatureColumnTest(test.TestCase):
     real_valued_col1 = fc.real_valued_column("real_valued_column", dimension=2)
     real_valued_col2 = fc.real_valued_column(
         "real_valued_default_column", dimension=5, default_value=3.0)
-    real_valued_col3 = fc.real_valued_column(
-        "real_valued_var_len_column", dimension=None, default_value=3.0)
+    real_valued_col3 = fc._real_valued_var_len_column(
+        "real_valued_var_len_column", default_value=3.0, is_sparse=True)
+    real_valued_col4 = fc._real_valued_var_len_column(
+        "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)
 
     feature_columns = set([
         sparse_col, embedding_col, weighted_id_col, real_valued_col1,
-        real_valued_col2, real_valued_col3
+        real_valued_col2, real_valued_col3, real_valued_col4
     ])
 
     feature_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)
@@ -842,7 +858,11 @@ class FeatureColumnTest(test.TestCase):
             parsing_ops.FixedLenSequenceFeature(
                 shape=[5], dtype=dtypes.float32, allow_missing=True),
         "real_valued_var_len_column":
-            parsing_ops.VarLenFeature(dtype=dtypes.float32)
+            parsing_ops.VarLenFeature(dtype=dtypes.float32),
+        "real_valued_var_len_dense_column":
+            parsing_ops.FixedLenSequenceFeature(
+                shape=[], dtype=dtypes.float32, allow_missing=True,
+                default_value=4.0),
     }
 
     self.assertDictEqual(expected_feature_spec, feature_spec)
@@ -851,8 +871,8 @@ class FeatureColumnTest(test.TestCase):
     sparse_col = fc.sparse_column_with_hash_bucket(
         "sparse_column", hash_bucket_size=100)
     real_valued_col = fc.real_valued_column("real_valued_column", 5)
-    vlen_real_valued_col = fc.real_valued_column(
-        "vlen_real_valued_column", dimension=None)
+    vlen_real_valued_col = fc._real_valued_var_len_column(
+        "vlen_real_valued_column", is_sparse=True)
 
     bucketized_col = fc.bucketized_column(
         fc.real_valued_column("real_valued_column_for_bucketization"), [0, 4])
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index 4359d0c63e3b743f926f4e6cf231e5b9c69becc2..271b3c01ffc86aeb031ec2737c96b926e6d16697 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -34,9 +34,10 @@ def xavier_initializer(uniform=True, seed=None, dtype=dtypes.float32):
   This function implements the weight initialization from:
 
   Xavier Glorot and Yoshua Bengio (2010):
-           Understanding the difficulty of training deep feedforward neural
+           [Understanding the difficulty of training deep feedforward neural
            networks. International conference on artificial intelligence and
-           statistics.
+           statistics.](
+           http://www.jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf)
 
   This initializer is designed to keep the scale of the gradients roughly the
   same in all layers. In uniform distribution this ends up being the range:
@@ -46,8 +47,7 @@ def xavier_initializer(uniform=True, seed=None, dtype=dtypes.float32):
   Args:
     uniform: Whether to use uniform or normal distributed random initialization.
     seed: A Python integer. Used to create random seeds. See
-      [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed)
-      for behavior.
+          @{tf.set_random_seed} for behavior.
     dtype: The data type. Only floating point types are supported.
 
   Returns:
@@ -97,8 +97,7 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False,
     mode: String.  'FAN_IN', 'FAN_OUT', 'FAN_AVG'.
     uniform: Whether to use uniform or normal distributed random initialization.
     seed: A Python integer. Used to create random seeds. See
-      [`set_random_seed`](../../api_docs/python/constant_op.md#set_random_seed)
-      for behavior.
+          @{tf.set_random_seed} for behavior.
     dtype: The data type. Only floating point types are supported.
 
   Returns:
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 65dcf8577f0c21b7d8c6551f2168258091e1cf74..7a429f75bbf2abe3eeb6bc3b5ac53d2be7e845e4 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -59,6 +59,7 @@ __all__ = ['avg_pool2d',
            'convolution2d_in_plane',
            'convolution2d_transpose',
            'dropout',
+           'elu',
            'flatten',
            'fully_connected',
            'layer_norm',
@@ -277,7 +278,7 @@ def _fused_batch_norm(
         trainable=trainable_gamma)
 
     # Create moving_mean and moving_variance variables and add them to the
-    # appropiate collections.
+    # appropriate collections.
     moving_mean_collections = utils.get_variable_collections(
         variables_collections, 'moving_mean')
     moving_mean_initializer = param_initializers.get(
@@ -379,7 +380,10 @@ def batch_norm(inputs,
                fused=False,
                data_format=DATA_FORMAT_NHWC,
                zero_debias_moving_mean=False,
-               scope=None):
+               scope=None,
+               renorm=False,
+               renorm_clipping=None,
+               renorm_decay=0.99):
   """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
     "Batch Normalization: Accelerating Deep Network Training by Reducing
@@ -446,6 +450,19 @@ def batch_norm(inputs,
     zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
       pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
     scope: Optional scope for `variable_scope`.
+    renorm: Whether to use Batch Renormalization
+      (https://arxiv.org/abs/1702.03275). This adds extra variables during
+      training. The inference is the same for either value of this parameter.
+    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
+      scalar `Tensors` used to clip the renorm correction. The correction
+      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
+      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
+      dmax are set to inf, 0, inf, respectively.
+    renorm_decay: Momentum used to update the moving means and standard
+      deviations with renorm. Unlike `momentum`, this affects training
+      and should be neither too small (which would add noise) nor too large
+      (which would give stale estimates). Note that `decay` is still applied
+      to get the means and variances for inference.
 
   Returns:
     A `Tensor` representing the output of the operation.
@@ -464,6 +481,8 @@ def batch_norm(inputs,
     if param_regularizers is not None:
       raise ValueError('Regularizers are not currently '
                        'supported for fused batch norm.')
+    if renorm:
+      raise ValueError('Renorm is not supported for fused batch norm.')
     return _fused_batch_norm(
         inputs,
         decay=decay,
@@ -524,6 +543,9 @@ def batch_norm(inputs,
           beta_regularizer=beta_regularizer,
           gamma_regularizer=gamma_regularizer,
           trainable=trainable,
+          renorm=renorm,
+          renorm_clipping=renorm_clipping,
+          renorm_momentum=renorm_decay,
           name=sc.name,
           _scope=sc,
           _reuse=reuse)
@@ -551,6 +573,9 @@ def batch_norm(inputs,
     # Custom updates collections are not supported because the update logic
     # is different in this case, in particular w.r.t. "forced updates" and
     # update op reuse.
+    if renorm:
+      raise ValueError('renorm is not supported with batch_weights, '
+                       'updates_collections or zero_debias_moving_mean')
     inputs_shape = inputs.get_shape()
     inputs_rank = inputs_shape.ndims
     if inputs_rank is None:
@@ -607,7 +632,7 @@ def batch_norm(inputs,
                                        trainable=trainable)
 
     # Create moving_mean and moving_variance variables and add them to the
-    # appropiate collections. We disable variable partitioning while creating
+    # appropriate collections. We disable variable partitioning while creating
     # them, because assign_moving_average is not yet supported for partitioned
     # variables.
     partitioner = variable_scope.get_variable_scope().partitioner
@@ -819,7 +844,7 @@ def convolution(inputs,
   variable would be created and added the activations. Finally, if
   `activation_fn` is not `None`, it is applied to the activations as well.
 
-  Performs a'trous convolution with input stride/dilation rate equal to `rate`
+  Performs atrous convolution with input stride/dilation rate equal to `rate`
   if a value > 1 for any dimension of `rate` is specified.  In this case
   `stride` values != 1 are not supported.
 
@@ -845,7 +870,7 @@ def convolution(inputs,
       "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
       For N=3, the valid values are "NDHWC" (default) and "NCDHW".
     rate: A sequence of N positive integers specifying the dilation rate to use
-      for a'trous convolution.  Can be a single integer to specify the same
+      for atrous convolution.  Can be a single integer to specify the same
       value for all spatial dimensions.  Specifying any `rate` value != 1 is
       incompatible with specifying any `stride` value != 1.
     activation_fn: Activation function. The default value is a ReLU function.
@@ -1062,7 +1087,7 @@ def convolution2d_transpose(
   """Adds a convolution2d_transpose with an optional batch normalization layer.
 
   The function creates a variable called `weights`, representing the
-  kernel, that is convolved with the input. If `batch_norm_params` is `None`, a
+  kernel, that is convolved with the input. If `normalizer_fn` is `None`, a
   second variable called 'biases' is added to the result of the operation.
 
   Args:
@@ -1241,6 +1266,13 @@ def flatten(inputs,
 
 def _sparse_inner_flatten(inputs, new_rank):
   """Helper function for `inner_flatten`."""
+  inputs_rank = inputs.dense_shape.get_shape().as_list()[0]
+  if inputs_rank < new_rank:
+    raise ValueError(
+        'Inputs has rank less than new_rank. {} must have rank at least'
+        ' {}. Received rank {}, shape {}'.format(inputs, new_rank, inputs_rank,
+                                                 inputs.get_shape()))
+
   outer_dimensions = inputs.dense_shape[:new_rank - 1]
   inner_dimensions = inputs.dense_shape[new_rank - 1:]
   new_shape = array_ops.concat((outer_dimensions,
@@ -1815,9 +1847,9 @@ def separable_convolution2d(
   This op first performs a depthwise convolution that acts separately on
   channels, creating a variable called `depthwise_weights`. If `num_outputs`
   is not None, it adds a pointwise convolution that mixes channels, creating a
-  variable called `pointwise_weights`. Then, if `batch_norm_params` is None,
-  it adds bias to the result, creating a variable called 'biases', otherwise
-  it adds a batch normalization layer. It finally applies an activation function
+  variable called `pointwise_weights`. Then, if `normalizer_fn` is None,
+  it adds bias to the result, creating a variable called 'biases', otherwise,
+  the `normalizer_fn` is applied. It finally applies an activation function
   to produce the end result.
 
   Args:
@@ -1833,7 +1865,7 @@ def separable_convolution2d(
       depthwise convolution stride. Can be an int if both strides are the same.
     padding: One of 'VALID' or 'SAME'.
     rate: A list of length 2: [rate_height, rate_width], specifying the dilation
-      rates for a'trous convolution. Can be an int if both rates are the same.
+      rates for atrous convolution. Can be an int if both rates are the same.
       If any value is larger than one, then both stride values need to be one.
     activation_fn: Activation function. The default value is a ReLU function.
       Explicitly set it to None to skip it and maintain a linear activation.
@@ -1942,6 +1974,7 @@ def separable_convolution2d(
                                             dtype=dtype,
                                             initializer=biases_initializer,
                                             regularizer=biases_regularizer,
+                                            trainable=trainable,
                                             collections=biases_collections)
           outputs = nn.bias_add(outputs, biases)
 
@@ -2201,6 +2234,7 @@ def legacy_fully_connected(x,
 
 # TODO(eiderm): Verify and fix autocomplete in colab (also relu6).
 # Simple aliases which remove the activation_fn parameter.
+elu = functools.partial(fully_connected, activation_fn=nn.elu)
 legacy_relu = functools.partial(legacy_fully_connected, activation_fn=nn.relu)
 legacy_linear = functools.partial(legacy_fully_connected, activation_fn=None)
 relu = functools.partial(fully_connected, activation_fn=nn.relu)
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 3bc31a26249b5fb4ce78cd7ca41e068698d156f0..9acce027bce62d017755d5ceff326c6d2af5dd4f 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -71,7 +71,6 @@ class AvgPool2DTest(test.TestCase):
     height, width = 3, 6
     images = np.random.uniform(size=(5, 2, height, width))
     output = _layers.avg_pool2d(images, [3, 3], data_format='NCHW')
-    self.assertEquals(output.op.name, 'AvgPool2D/AvgPool')
     self.assertListEqual(output.get_shape().as_list(), [5, 2, 1, 2])
 
   def testCollectOutputs(self):
@@ -247,7 +246,7 @@ class ConvolutionTest(test.TestCase):
   def testCreateConv(self):
     height, width = 7, 9
     with self.test_session():
-      images = np.random.uniform(size=(5, height, width, 4))
+      images = np.random.uniform(size=(5, height, width, 4)).astype(np.float32)
       output = layers_lib.convolution2d(images, 32, [3, 3])
       self.assertEqual(output.op.name, 'Conv/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32])
@@ -259,7 +258,7 @@ class ConvolutionTest(test.TestCase):
   def testCreateConvNCHW(self):
     height, width = 7, 9
     with self.test_session():
-      images = np.random.uniform(size=(5, 4, height, width))
+      images = np.random.uniform(size=(5, 4, height, width)).astype(np.float32)
       output = layers_lib.convolution2d(images, 32, [3, 3], data_format='NCHW')
       self.assertEqual(output.op.name, 'Conv/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 32, height, width])
@@ -1465,6 +1464,30 @@ class PartialFlattenTest(test.TestCase):
     flattened5 = _layers._inner_flatten(inputs, 5)
     self.assertEqual([2, None, 4, None, 30], flattened5.get_shape().as_list())
 
+  def testDenseFlattenRankAssertion(self):
+    """Test `_inner_flatten` rank assertion for dense tensors."""
+    shape = [2, 3]
+    new_rank = 3
+    inputs = array_ops.placeholder(dtypes.int32)
+    inputs.set_shape(shape)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'inputs has rank less than new_rank'):
+      _layers._inner_flatten(inputs, new_rank)
+
+  def testSparseFlattenRankAssertion(self):
+    """Test `_inner_flatten` rank assertion for sparse tensors."""
+    shape = [2, 3]
+    new_rank = 3
+    np.random.seed(10301)
+    random_ = np.random.rand(*shape)
+    indices, values, _ = _sparsify(random_)
+    inputs = sparse_tensor.SparseTensor(indices, values, shape)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Inputs has rank less than new_rank'):
+      _layers._inner_flatten(inputs, new_rank)
+
 
 class FCTest(test.TestCase):
 
@@ -2668,7 +2691,6 @@ class MaxPool2DTest(test.TestCase):
     height, width = 3, 6
     images = np.random.uniform(size=(5, 3, height, width)).astype(np.float32)
     output = _layers.max_pool2d(images, [3, 3], data_format='NCHW')
-    self.assertEquals(output.op.name, 'MaxPool2D/MaxPool')
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 1, 2])
 
   def testCollectOutputs(self):
@@ -2756,7 +2778,7 @@ class RepeatTests(test.TestCase):
   def testRepeat(self):
     height, width = 3, 3
     with self.test_session():
-      images = np.random.uniform(size=(5, height, width, 3))
+      images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
       self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
@@ -2791,15 +2813,6 @@ class SeparableConv2dTest(test.TestCase):
       self.assertEqual(output.op.name, 'SeparableConv2d/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32])
 
-  def testCreateConvFloat64(self):
-    height, width = 3, 3
-    with self.test_session():
-      images = random_ops.random_uniform(
-          (5, height, width, 3), seed=1, dtype=dtypes.float64)
-      output = layers_lib.separable_conv2d(images, 32, [3, 3], 2)
-      self.assertEqual(output.op.name, 'SeparableConv2d/Relu')
-      self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32])
-
   def testCreateDepthwiseConv(self):
     height, width = 3, 3
     with self.test_session():
@@ -2979,6 +2992,20 @@ class SeparableConv2dTest(test.TestCase):
       sess.run(init_op)
       sess.run(net, feed_dict={images_placeholder: images})
 
+  def testTrainableFlagIsPassedOn(self):
+    for trainable in [True, False]:
+      for num_filters in [None, 8]:
+        with ops.Graph().as_default():
+          input_size = [5, 10, 12, 3]
+
+          images = random_ops.random_uniform(input_size, seed=1)
+          layers_lib.separable_conv2d(
+              images, num_filters, [3, 3], 1, trainable=trainable)
+          model_variables = variables.get_model_variables()
+          trainable_variables = variables_lib.trainable_variables()
+          for model_variable in model_variables:
+            self.assertEqual(trainable, model_variable in trainable_variables)
+
 
 class ScaleGradientTests(test.TestCase):
   """Simple tests of the scale_gradient function."""
@@ -3079,6 +3106,15 @@ class StackTests(test.TestCase):
       self.assertEqual(output.op.name, 'Stack/fully_connected_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 30])
 
+  def testStackElu(self):
+    height, width = 3, 3
+    with self.test_session():
+      images = random_ops.random_uniform(
+          (5, height * width * 3), seed=1, name='images')
+      output = _layers.stack(images, layers_lib.elu, [10, 20, 30])
+      self.assertEqual(output.op.name, 'Stack/fully_connected_3/Elu')
+      self.assertListEqual(output.get_shape().as_list(), [5, 30])
+
   def testStackConvolution2d(self):
     height, width = 3, 3
     with self.test_session():
diff --git a/tensorflow/contrib/layers/python/layers/optimizers.py b/tensorflow/contrib/layers/python/layers/optimizers.py
index c804303671e4ed48987168a419190d2e98bdc452..1a6dfc12e9bf4a5bd970581a88741d9adef02724 100644
--- a/tensorflow/contrib/layers/python/layers/optimizers.py
+++ b/tensorflow/contrib/layers/python/layers/optimizers.py
@@ -148,6 +148,7 @@ def optimize_loss(loss,
         * `clip_gradients` is not float or callable.
         * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
           `global_step` is available.
+        * `gradients` is empty
   """
   loss = ops.convert_to_tensor(loss)
   contrib_framework.assert_scalar(loss)
@@ -244,6 +245,10 @@ def optimize_loss(loss,
     # Multiply some gradients.
     if gradient_multipliers is not None:
       gradients = _multiply_gradients(gradients, gradient_multipliers)
+      if not gradients:
+        raise ValueError(
+            "Empty list of (gradient, var) pairs encountered. This is most "
+            "likely to be caused by an improper value of gradient_multipliers.")
 
     if "gradient_norm" in summaries:
       summary.scalar("global_norm/gradient_norm",
@@ -255,8 +260,8 @@ def optimize_loss(loss,
     elif callable(clip_gradients):
       gradients = clip_gradients(gradients)
     elif clip_gradients is not None:
-      raise ValueError("Unknown type %s for clip_gradients" %
-                       type(clip_gradients))
+      raise ValueError(
+          "Unknown type %s for clip_gradients" % type(clip_gradients))
 
     # Add scalar summary for loss.
     if "loss" in summaries:
diff --git a/tensorflow/contrib/layers/python/layers/target_column_test.py b/tensorflow/contrib/layers/python/layers/target_column_test.py
index 1baa663151ac4f6183a845aa2837dfbe7b0f0e53..d5d03fb1ebcea09f8fd5425a41b0bd12c5139e4d 100644
--- a/tensorflow/contrib/layers/python/layers/target_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/target_column_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 
 class RegressionTargetColumnTest(test.TestCase):
 
-  # TODO(zakaria): test multilabel regresssion.
+  # TODO(zakaria): test multilabel regression.
   def testRegression(self):
     target_column = target_column_lib.regression_target()
     with ops.Graph().as_default(), session.Session() as sess:
diff --git a/tensorflow/contrib/layers/python/ops/bucketization_op.py b/tensorflow/contrib/layers/python/ops/bucketization_op.py
index b941a9b82227e40fb14400daa8bd7b34bed982b4..f498352855f656666e66a889d8db274cec755028 100644
--- a/tensorflow/contrib/layers/python/ops/bucketization_op.py
+++ b/tensorflow/contrib/layers/python/ops/bucketization_op.py
@@ -17,13 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.layers.ops import gen_bucketization_op
-from tensorflow.contrib.util import loader
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import resource_loader
-
-_bucketization_op = loader.load_op_library(
-    resource_loader.get_path_to_datafile("_bucketization_op.so"))
+from tensorflow.python.ops import math_ops
 
 
 def bucketize(input_tensor, boundaries, name=None):
@@ -43,10 +37,5 @@ def bucketize(input_tensor, boundaries, name=None):
   Raises:
     TypeError: If boundaries is not a list.
   """
-  if not isinstance(boundaries, list):
-    raise TypeError("boundaries must be a list")
-
-  return gen_bucketization_op.bucketize(input_tensor, boundaries, name=name)
-
-
-ops.NotDifferentiable("Bucketize")
+  return math_ops._bucketize(  # pylint: disable=protected-access
+      input_tensor, boundaries=boundaries, name=name)
diff --git a/tensorflow/contrib/layers/python/ops/sparse_ops.py b/tensorflow/contrib/layers/python/ops/sparse_ops.py
index 8c24f37d77e71f2d01feb15a46813e0f82b5d65c..114f312d276001be7b3cbe7db086fb2ac927ed56 100644
--- a/tensorflow/contrib/layers/python/ops/sparse_ops.py
+++ b/tensorflow/contrib/layers/python/ops/sparse_ops.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import compat
 
 
 def _multiplier_helper(shape):
@@ -37,14 +38,24 @@ def _multiplier_helper(shape):
   return multipliers
 
 
+def _ignore_value(dtype):
+  if dtype == dtypes.string:
+    # Exception due to TF strings are converted to numpy objects by default.
+    return ""
+  # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
+  # constructing a new numpy object of the given type, which yields the
+  # default value for that type.
+  return dtype.as_numpy_dtype()
+
+
 def dense_to_sparse_tensor(dense_tensor, ignore_value=None):
-  """Converts a dense Tensor to a SparseTensor, dropping ignore_value cells.
+  """Converts dense `Tensor` to `SparseTensor`, dropping `ignore_value` cells.
 
   Args:
-    dense_tensor: A `Tensor`.
+    dense_tensor: A `Tensor`. This must have a statically defined rank.
     ignore_value: Entries in `dense_tensor` equal to this value will be
       absent from the return `SparseTensor`. If `None`, default value of
-      dense_tensor's dtype will be used (e.g. '' for `str`, 0 for `int`).
+      `dense_tensor` dtype will be used (e.g. '' for `str`, 0 for `int`).
 
   Returns:
     A `SparseTensor` with the same shape as `dense_tensor`.
@@ -58,11 +69,7 @@ def dense_to_sparse_tensor(dense_tensor, ignore_value=None):
       # TODO(b/32318825): Implement dense_to_sparse_tensor for undefined rank.
       raise ValueError("dense_tensor.get_shape() should be defined, got None.")
     if ignore_value is None:
-      if dense_t.dtype == dtypes.string:
-        # Exception due to TF strings are converted to numpy objects by default.
-        ignore_value = ""
-      else:
-        ignore_value = dense_t.dtype.as_numpy_dtype()
+      ignore_value = _ignore_value(dense_t.dtype)
     dense_shape = math_ops.cast(array_ops.shape(dense_t), dtypes.int64)
     indices = array_ops.where(
         math_ops.not_equal(dense_t, math_ops.cast(ignore_value, dense_t.dtype)))
@@ -81,3 +88,129 @@ def dense_to_sparse_tensor(dense_tensor, ignore_value=None):
       flat_indices = math_ops.add(flat_indices, offsets)
     values = array_ops.gather(flat_tensor, flat_indices)
     return sparse_tensor.SparseTensor(indices, values, dense_shape)
+
+
+# TODO(ptucker): Support integer dtype arg, and cast values back to that.
+def indicators_to_sparse_ids(indicators, ignore_value=None, dtype=dtypes.int64):
+  """Convert a dense indicator tensor to sparse IDs.
+
+  This is commonly used for converting a dense classification label to sparse.
+  In the following example, we have an input of shape (2, 2, num_classes),
+  where num_classes=4.
+
+  indicators = [
+    [[0, 0, 1, 0], [0, 0, 0, 0]],
+    [[1, 0, 1, 1], [0, 0, 1, 0]],
+  ]
+  indicator_to_sparse_ids(indicators) => [
+    [[2], []],
+    [[0, 2, 3], [2]],
+  ]
+
+  Args:
+    indicators: Dense `Tensor` of shape `(d0, ..., dn, num_classes)`. This must
+      have a statically defined rank. `ignore_value` values are ignored. For
+      other values (typically, ones), the index along the last dimension is
+      returned.
+    ignore_value: Entries in `indicators` equal to this value will be
+      absent from the returned `SparseTensor`. If `None`, default value of
+      `indicators` dtype will be used (e.g. '' for `str`, 0 for `int`).
+    dtype: Type of result, must be integer type.
+
+  Returns:
+    `tf.int64` `SparseTensor` of shape `(d0, ..., dn, max_num_labels)`,
+      where `max_num_labels` is the maximum number of non-zero values in any
+      row (in the example above, row (1, 1) has 3 non-zero values, so the result
+      shape is (2, 2, 3)). The values of this `SparseTensor` are in the range
+      `[0, num_classes)` and correspond to the index of non-empty values along
+      the last dimension of `indicators`.
+
+  Raises:
+    ValueError: if `dtype` is not integer.
+  """
+  if not dtype.is_integer:
+    raise ValueError("Invalid dtype {} not integer.".format(dtype))
+  with ops.name_scope(
+      None, "indicators_to_sparse_ids", (indicators, ignore_value)):
+    # Convert indicators to binary ones and zeros. We use int64 since
+    # SparseTensor requires int64 indices.
+    indicators = ops.convert_to_tensor(indicators, name="indicators")
+    if ignore_value is None:
+      ignore_value = _ignore_value(indicators.dtype)
+    missing_indicators = math_ops.equal(
+        indicators, ignore_value, name="missing")
+    zeros_like_indicators = array_ops.zeros_like(
+        indicators, dtype=dtypes.int64, name="zeros")
+    binary_indicators = array_ops.where(
+        missing_indicators, zeros_like_indicators,
+        array_ops.ones_like(indicators, dtype=dtypes.int64, name="ones"),
+        name="binary_indicators")
+
+    # Use cumsum along the last dimension to generate per-row indexes.
+    # Note that these are 1-based (since 0 indicates missing values), so they're
+    # off-by-1 from the actual indices. We'll subtract 1 below. Since they're
+    # off-by-one, the max value is the size of last dimension (i.e.,
+    # last_index + 1).
+    row_index_indicators = array_ops.where(
+        missing_indicators, zeros_like_indicators,
+        math_ops.cumsum(binary_indicators, axis=-1), "row_index_indicators")
+    result_last_dim = array_ops.reshape(
+        math_ops.reduce_max(row_index_indicators), shape=(1,),
+        name="result_last_dim")
+
+    # Convert to a SparseTensor. The values of this SparseTensor are the last
+    # indices of our result, and the last indices of this SparseTensor (i.e.,
+    # the class IDs indicated by `indicators`) are the values of our result, so
+    # we use unstack/stack to swap them.
+    sparse_row_index_indicators = dense_to_sparse_tensor(
+        row_index_indicators, ignore_value=0)
+    index_columns = array_ops.unstack(
+        sparse_row_index_indicators.indices, axis=1)
+    return sparse_tensor.SparseTensor(
+        indices=array_ops.stack(
+            index_columns[0:-1] + [sparse_row_index_indicators.values - 1],
+            axis=1, name="indices"),
+        values=math_ops.cast(index_columns[-1], dtype=dtype, name="values"),
+        dense_shape=array_ops.concat(
+            (sparse_row_index_indicators.dense_shape[0:-1], result_last_dim),
+            axis=0, name="dense_shape"))
+
+
+def sparse_row_envelope(sparse_input, row_axis=0, col_axis=1, name=None):
+  """Returns the length of each 'row' in a `SparseTensor`.
+
+  For example, if `sparse_input` has indices `[[0,0], [2, 0], [2, 1], [2, 2]]`
+  and shape `[3, 3]`, this function will return `[1, 0, 3]`.
+
+  Args:
+    sparse_input: a `SparseTensor` of rank at least 2.
+    row_axis: An integer. The axis for the row of the envelope matrix. Default
+      is 0.
+    col_axis: An integer. The axis for the col of the envelope matrix. Default
+      is 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A one-dimensional `Tensor` whose entries correspond to the length of each
+    row of `SparseTensor`.
+
+  Raises:
+    ValueError: If row_axis and col_axis are the same axis or they are not
+      integers.
+  """
+  if not (isinstance(row_axis, compat.integral_types) and
+          isinstance(col_axis, compat.integral_types)):
+    raise ValueError("`row_axis` and `col_axis` must be integers.")
+
+  if row_axis == col_axis:
+    raise ValueError("Row and column can not be the same axis.")
+
+  with ops.name_scope(name, "sparse_row_envelope", [sparse_input]):
+    indices = sparse_input.indices
+    row_indices = indices[:, row_axis]
+    col_indices = indices[:, col_axis]
+    num_rows = math_ops.cast(sparse_input.dense_shape[row_axis], dtypes.int32)
+    row_envelope = math_ops.unsorted_segment_max(
+        col_indices + 1, row_indices, num_rows, name=name)
+    zeros = array_ops.zeros_like(row_envelope)
+    return array_ops.where(row_envelope > zeros, row_envelope, zeros)
diff --git a/tensorflow/contrib/layers/python/ops/sparse_ops_test.py b/tensorflow/contrib/layers/python/ops/sparse_ops_test.py
index b27174e43760f3c0f18ae57917931ca3f9cf4981..9a9582dcad96ade282d0a9b51bd5fd77598531ae 100644
--- a/tensorflow/contrib/layers/python/ops/sparse_ops_test.py
+++ b/tensorflow/contrib/layers/python/ops/sparse_ops_test.py
@@ -22,11 +22,24 @@ import numpy as np
 
 from tensorflow.contrib.layers.python.ops import sparse_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class SparseOpsTest(test.TestCase):
+def _assert_sparse_tensor_value(test_case, expected, actual):
+  test_case.assertEqual(np.int64, np.array(actual.indices).dtype)
+  test_case.assertAllEqual(expected.indices, actual.indices)
+
+  test_case.assertEqual(
+      np.array(expected.values).dtype, np.array(actual.values).dtype)
+  test_case.assertAllEqual(expected.values, actual.values)
+
+  test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
+  test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
+
+
+class DenseToSparseTensorTest(test.TestCase):
 
   def test_dense_to_sparse_tensor_1d(self):
     with self.test_session() as sess:
@@ -134,5 +147,164 @@ class SparseOpsTest(test.TestCase):
         sparse_ops.dense_to_sparse_tensor(tensor)
 
 
+class SparseRowEnvelopeTest(test.TestCase):
+
+  def test_sparse_row_envelope(self):
+    expected_sparse_row_envelope = [1, 0, 3]
+    with self.test_session() as sess:
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=[[0, 0], [2, 0], [2, 1], [2, 2]],
+          values=[0, 1, 2, 3],
+          dense_shape=[3, 3])
+      sparse_row_envelope = sess.run(
+          sparse_ops.sparse_row_envelope(sparse_input))
+      self.assertAllEqual(expected_sparse_row_envelope,
+                          sparse_row_envelope)
+
+  def test_sparse_row_envelope_unsorted_indices(self):
+    expected_sparse_row_envelope = [1, 0, 3]
+    with self.test_session() as sess:
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=[[2, 0], [2, 2], [2, 1], [0, 0]],
+          values=[0, 1, 2, 3],
+          dense_shape=[3, 3])
+      sparse_row_envelope = sess.run(
+          sparse_ops.sparse_row_envelope(sparse_input))
+      self.assertAllEqual(expected_sparse_row_envelope,
+                          sparse_row_envelope)
+
+  def test_sparse_row_envelope_empty_in_the_end(self):
+    expected_sparse_row_envelope = [1, 0, 3, 0, 0]
+    with self.test_session() as sess:
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=[[0, 0], [2, 0], [2, 1], [2, 2]],
+          values=[0, 1, 2, 3],
+          dense_shape=[5, 3])
+      sparse_row_envelope = sess.run(
+          sparse_ops.sparse_row_envelope(sparse_input))
+      self.assertAllEqual(expected_sparse_row_envelope,
+                          sparse_row_envelope)
+
+  def test_sparse_row_envelope_empty_3d(self):
+    expected_sparse_row_envelope = [1, 0, 3, 0, 0]
+    with self.test_session() as sess:
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=[[0, 0, 0], [0, 2, 0], [0, 2, 1], [0, 2, 2]],
+          values=[0, 1, 2, 3],
+          dense_shape=[1, 5, 3])
+      sparse_row_envelope = sess.run(
+          sparse_ops.sparse_row_envelope(sparse_input, 1, 2))
+      self.assertAllEqual(expected_sparse_row_envelope,
+                          sparse_row_envelope)
+
+
+class IndicatorToSparseIdsTest(test.TestCase):
+
+  def test_indicators_to_sparse_ids_1d(self):
+    indicators = (0, 0, 1, 0)
+    sparse_ids = sparse_ops.indicators_to_sparse_ids(indicators)
+    with self.test_session():
+      _assert_sparse_tensor_value(self, sparse_tensor.SparseTensorValue(
+          indices=((0,),),
+          values=(2,),
+          dense_shape=(1,),
+      ), sparse_ids.eval())
+
+  def test_indicators_to_sparse_ids_2d(self):
+    indicators = (
+        (0, 0, 1, 0),
+        (1, 0, 0, 1),
+    )
+    sparse_ids = sparse_ops.indicators_to_sparse_ids(indicators)
+    with self.test_session():
+      _assert_sparse_tensor_value(self, sparse_tensor.SparseTensorValue(
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(2, 0, 3),
+          dense_shape=(2, 2),
+      ), sparse_ids.eval())
+
+  def test_indicators_to_sparse_ids_3d(self):
+    indicators = (
+        ((0, 0, 1, 0, 0), (0, 0, 0, 0, 0)),
+        ((1, 0, 0, 1, 0), (0, 0, 1, 0, 0)),
+        ((0, 0, 0, 0, 0), (0, 0, 0, 0, 0)),
+        ((1, 0, 0, 1, 1), (0, 0, 1, 0, 0)),
+    )
+    sparse_ids = sparse_ops.indicators_to_sparse_ids(indicators)
+    with self.test_session():
+      _assert_sparse_tensor_value(self, sparse_tensor.SparseTensorValue(
+          indices=(
+              (0, 0, 0),
+              (1, 0, 0), (1, 0, 1), (1, 1, 0),
+              (3, 0, 0), (3, 0, 1), (3, 0, 2), (3, 1, 0)
+          ), values=(
+              2,
+              0, 3, 2,
+              0, 3, 4, 2
+          ), dense_shape=(4, 2, 3),
+      ), sparse_ids.eval())
+
+  def test_indicators_to_sparse_ids_ignore_value(self):
+    indicators = (
+        ((-1, -1, 10, -1), (-1, -1, -1, -1)),
+        ((11, -1, -1, 12), (-1, -1, 13, -1)),
+    )
+    sparse_ids = sparse_ops.indicators_to_sparse_ids(
+        indicators, ignore_value=-1)
+    with self.test_session():
+      _assert_sparse_tensor_value(self, sparse_tensor.SparseTensorValue(
+          indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+          values=(2, 0, 3, 2),
+          dense_shape=(2, 2, 2),
+      ), sparse_ids.eval())
+
+  def test_string_indicators_to_sparse_ids(self):
+    indicators = (
+        (('', '', 'A', ''), ('', '', '', '')),
+        (('B', '', '', 'C'), ('', '', 'D', '')),
+    )
+    sparse_ids = sparse_ops.indicators_to_sparse_ids(indicators)
+    with self.test_session():
+      _assert_sparse_tensor_value(self, sparse_tensor.SparseTensorValue(
+          indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+          values=(2, 0, 3, 2),
+          dense_shape=(2, 2, 2),
+      ), sparse_ids.eval())
+
+  def test_string_indicators_to_sparse_ids_ignore_value(self):
+    indicators = (
+        (('x', 'x', 'A', 'x'), ('x', 'x', 'x', 'x')),
+        (('B', 'x', 'x', 'C'), ('x', 'x', 'D', 'x')),
+    )
+    sparse_ids = sparse_ops.indicators_to_sparse_ids(
+        indicators, ignore_value='x')
+    with self.test_session():
+      _assert_sparse_tensor_value(self, sparse_tensor.SparseTensorValue(
+          indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+          values=(2, 0, 3, 2),
+          dense_shape=(2, 2, 2),
+      ), sparse_ids.eval())
+
+  def test_indicators_to_sparse_ids_unknown_dims(self):
+    indicators_values = (
+        ((0, 0, 1, 0), (0, 0, 0, 0)),
+        ((1, 0, 0, 1), (0, 0, 1, 0)),
+    )
+    indicators = array_ops.placeholder(
+        dtype=dtypes.int32, shape=(None, None, None))
+    sparse_ids = sparse_ops.indicators_to_sparse_ids(indicators)
+    with self.test_session():
+      _assert_sparse_tensor_value(self, sparse_tensor.SparseTensorValue(
+          indices=((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+          values=(2, 0, 3, 2),
+          dense_shape=(2, 2, 2),
+      ), sparse_ids.eval(feed_dict={indicators: indicators_values}))
+
+  def test_indicators_to_sparse_ids_unknown_rank(self):
+    indicators = array_ops.placeholder(dtype=dtypes.int32)
+    with self.assertRaisesRegexp(ValueError, r'shape.*should be defined'):
+      sparse_ops.indicators_to_sparse_ids(indicators)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 90d4d7e3a8fde9ddb5e491c6dacec6650560bcce..f313c461439db490830a8a5e52e73f9339865a97 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -68,6 +68,7 @@ py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/estimator:inputs",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/saved_model:builder",
         "//tensorflow/python/saved_model:loader",
         "//tensorflow/python/saved_model:signature_constants",
@@ -836,6 +837,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "model_fn_test",
+    size = "small",
+    srcs = ["python/learn/estimators/model_fn_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":learn",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "multioutput_test",
     size = "small",
@@ -1018,10 +1032,12 @@ py_test(
 py_test(
     name = "export_test",
     size = "small",
+    timeout = "moderate",
     srcs = ["python/learn/utils/export_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "manual",  # http://b/31032996
+        "notap",  # TODO(b/37950026): Test is flaky
     ],
     deps = [
         ":learn",
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index bd56066b1bc95bebb5045a444d3f4a8d9c74a164..c06343b731d3de785ff51650cdeb478b7375cc56 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -38,6 +38,7 @@ See the @{$python/contrib.learn} guide.
 @@LinearEstimator
 @@LinearRegressor
 @@LogisticRegressor
+@@StateSavingRnnEstimator
 @@SVM
 @@SKCompat
 
@@ -87,9 +88,11 @@ from __future__ import print_function
 from tensorflow.contrib.learn.python.learn import *
 # pylint: enable=wildcard-import
 
+from tensorflow.contrib.learn.python.learn import learn_runner_lib as learn_runner
+
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['datasets', 'head', 'io', 'models',
+_allowed_symbols = ['datasets', 'head', 'io', 'learn_runner', 'models',
                     'monitors', 'NotFittedError', 'ops', 'preprocessing',
                     'utils', 'graph_actions']
 
diff --git a/tensorflow/contrib/learn/python/learn/README.md b/tensorflow/contrib/learn/python/learn/README.md
deleted file mode 100644
index 6a7b0ea61417bb69f920015725b9ba0afdcf1ef1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/learn/python/learn/README.md
+++ /dev/null
@@ -1,247 +0,0 @@
-# TF Learn
-
-TF Learn is a simplified interface for TensorFlow, to get people started on predictive analytics and data mining. The library covers a variety of needs: from linear models to *Deep Learning* applications like text and image understanding.
-
-### Why *TensorFlow*?
-
-* TensorFlow provides a good backbone for building different shapes of machine learning applications.
-* It will continue to evolve both in the distributed direction and as general pipelinining machinery.
-
-### Why *TensorFlow Learn*?
-
-- To smooth the transition from the [scikit-learn](http://scikit-learn.org/stable/) world of one-liner machine learning into the more open world of building different shapes of ML models. You can start by using [fit](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#fit)/[predict](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator#predict) and slide into TensorFlow APIs as you are getting comfortable.
-- To provide a set of reference models that will be easy to integrate with existing code.
-
-## Installation
-
-[Install TensorFlow](https://www.tensorflow.org/install/), and then simply import `learn` via `from tensorflow.contrib.learn` or use `tf.contrib.learn`.
-
-Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [pandas](http://pandas.pydata.org/) for additional functionality.
-
-### Tutorials
-
--   [TF Learn Quickstart](https://www.tensorflow.org/get_started/tflearn). Build,
-    train, and evaluate a neural network with just a few lines of code.
--   [Input Functions](https://www.tensorflow.org/get_started/input_fn). Learn how
-    to create input functions to feed data into your models.
--   [Linear Model](https://www.tensorflow.org/tutorials/wide). Learn the basics
-    of building linear models.
--   [Wide and Deep Learning](https://www.tensorflow.org/tutorials/wide_and_deep).
-    Jointly train a linear model and a deep neural network.
--   [Logging and Monitoring](https://www.tensorflow.org/get_started/monitors).
-    Use the Monitor API to audit training of a neural network.
--   [Custom Estimators](https://www.tensorflow.org/extend/estimators). Learn
-    how to create a custom estimator.
--   More coming soon.
-
-### Community
-
-- Twitter [#tensorflow](https://twitter.com/search?q=tensorflow&src=typd).
-- StackOverflow with [tensorflow tag](http://stackoverflow.com/questions/tagged/tensorflow) for questions and struggles.
-- GitHub [issues](https://github.com/tensorflow/tensorflow/issues) for technical discussions and feature requests.
-
-### Existing Estimator Implementations
-
--   [`LinearClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearClassifier))
--   [`LinearRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/linear.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/LinearRegressor))
--   [`DNNClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNClassifier))
--   [`DNNRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNRegressor))
--   [`DNNLinearCombinedClassifier`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedClassifier))
--   [`DNNLinearCombinedRegressor`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py)
-    ([docs](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/DNNLinearCombinedRegressor))
--   [`SVM`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/svm.py)
-    ([docs](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/g3doc/svm.md))
--   [`GMM`](https://www.tensorflow.org/code/tensorflow/contrib/factorization/python/ops/gmm.py)
-    ([docs](https://www.tensorflow.org/code/tensorflow/contrib/factorization/g3doc/gmm.md))
--   [`KMeansClustering`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/kmeans.py)
-    ([docs](https://www.tensorflow.org/code/tensorflow/contrib/factorization/g3doc/kmeans.md))
-
-### Usage Examples
-
-Below are a few simple examples of the API. For more examples, please see [examples](https://www.tensorflow.org/code/tensorflow/examples/learn).
-
-General tips:
-
--  It's useful to rescale a dataset to 0 mean and unit standard deviation before passing it to an [`Estimator`](https://www.tensorflow.org/api_docs/python/tf/contrib/learn/Estimator). [Stochastic Gradient Descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent) doesn't always do the right thing when variable are at very different scales.
-
--  Categorical variables should be managed before passing input to the estimator.
-
-## Linear Classifier
-
-Simple linear classification:
-
-```python
-import tensorflow.contrib.learn.python.learn as learn
-from sklearn import datasets, metrics
-
-iris = datasets.load_iris()
-feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
-classifier = learn.LinearClassifier(n_classes=3, feature_columns=feature_columns)
-classifier.fit(iris.data, iris.target, steps=200, batch_size=32)
-iris_predictions = list(classifier.predict(iris.data, as_iterable=True))
-score = metrics.accuracy_score(iris.target, iris_predictions)
-print("Accuracy: %f" % score)
-```
-
-## Linear Regressor
-
-Simple linear regression:
-
-```python
-import tensorflow.contrib.learn.python.learn as learn
-from sklearn import datasets, metrics, preprocessing
-
-boston = datasets.load_boston()
-x = preprocessing.StandardScaler().fit_transform(boston.data)
-feature_columns = learn.infer_real_valued_columns_from_input(x)
-regressor = learn.LinearRegressor(feature_columns=feature_columns)
-regressor.fit(x, boston.target, steps=200, batch_size=32)
-boston_predictions = list(regressor.predict(x, as_iterable=True))
-score = metrics.mean_squared_error(boston_predictions, boston.target)
-print ("MSE: %f" % score)
-```
-
-## Deep Neural Network
-
-Example of 3 layer network with 10, 20 and 10 hidden units respectively:
-
-```python
-import tensorflow.contrib.learn.python.learn as learn
-from sklearn import datasets, metrics
-
-iris = datasets.load_iris()
-feature_columns = learn.infer_real_valued_columns_from_input(iris.data)
-classifier = learn.DNNClassifier(hidden_units=[10, 20, 10], n_classes=3, feature_columns=feature_columns)
-classifier.fit(iris.data, iris.target, steps=200, batch_size=32)
-iris_predictions = list(classifier.predict(iris.data, as_iterable=True))
-score = metrics.accuracy_score(iris.target, iris_predictions)
-print("Accuracy: %f" % score)
-```
-
-## Custom model
-
-Example of how to pass a custom model to the Estimator:
-
-```python
-from sklearn import datasets
-from sklearn import metrics
-import tensorflow as tf
-import tensorflow.contrib.layers.python.layers as layers
-import tensorflow.contrib.learn.python.learn as learn
-
-iris = datasets.load_iris()
-
-def my_model(features, labels):
-  """DNN with three hidden layers."""
-  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  labels = tf.one_hot(labels, 3, 1, 0)
-
-  # Create three fully connected layers respectively of size 10, 20, and 10.
-  features = layers.stack(features, layers.fully_connected, [10, 20, 10])
-
-  # Create two tensors respectively for prediction and loss.
-  prediction, loss = (
-      tf.contrib.learn.models.logistic_regression(features, labels)
-  )
-
-  # Create a tensor for training op.
-  train_op = tf.contrib.layers.optimize_loss(
-      loss, tf.contrib.framework.get_global_step(), optimizer='Adagrad',
-      learning_rate=0.1)
-
-  return {'class': tf.argmax(prediction, 1), 'prob': prediction}, loss, train_op
-
-classifier = learn.Estimator(model_fn=my_model)
-classifier.fit(iris.data, iris.target, steps=1000)
-
-y_predicted = [
-  p['class'] for p in classifier.predict(iris.data, as_iterable=True)]
-score = metrics.accuracy_score(iris.target, y_predicted)
-print('Accuracy: {0:f}'.format(score))
-```
-
-## Saving / Restoring models
-
-Each estimator supports a `model_dir` argument, which takes a folder path where all model information will be saved:
-
-```python
-classifier = learn.DNNClassifier(..., model_dir="/tmp/my_model")
-```
-
-If you run multiple `fit` operations on the same `Estimator`, training will resume where the last operation left off, e.g.:
-
-<pre><strong>classifier = learn.DNNClassifier(..., model_dir="/tmp/my_model")
-classifier.fit(..., steps=300)</strong>
-INFO:tensorflow:Create CheckpointSaverHook
-INFO:tensorflow:loss = 2.40115, step = 1
-INFO:tensorflow:Saving checkpoints for 1 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:loss = 0.338706, step = 101
-INFO:tensorflow:loss = 0.159414, step = 201
-INFO:tensorflow:Saving checkpoints for 300 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:Loss for final step: 0.0953846.
-
-<strong>classifier.fit(..., steps=300)</strong>
-INFO:tensorflow:Create CheckpointSaverHook
-INFO:tensorflow:loss = 0.113173, step = 301
-INFO:tensorflow:Saving checkpoints for 301 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:loss = 0.175782, step = 401
-INFO:tensorflow:loss = 0.119735, step = 501
-INFO:tensorflow:Saving checkpoints for 600 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:Loss for final step: 0.0518137.</pre>
-
-To restore checkpoints to a new `Estimator`, just pass it the same `model_dir` argument, e.g.:
-
-<pre><strong>classifier = learn.DNNClassifier(..., model_dir="/tmp/my_model")
-classifier.fit(..., steps=300)</strong>
-INFO:tensorflow:Create CheckpointSaverHook
-INFO:tensorflow:loss = 1.16335, step = 1
-INFO:tensorflow:Saving checkpoints for 1 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:loss = 0.176995, step = 101
-INFO:tensorflow:loss = 0.184573, step = 201
-INFO:tensorflow:Saving checkpoints for 300 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:Loss for final step: 0.0512496.
-
-<strong>classifier2 = learn.DNNClassifier(..., model_dir="/tmp/my_model")
-classifier2.fit(..., steps=300)</strong>
-INFO:tensorflow:Create CheckpointSaverHook
-INFO:tensorflow:loss = 0.0543797, step = 301
-INFO:tensorflow:Saving checkpoints for 301 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:loss = 0.101036, step = 401
-INFO:tensorflow:loss = 0.137956, step = 501
-INFO:tensorflow:Saving checkpoints for 600 into /tmp/leftoff/model.ckpt.
-INFO:tensorflow:Loss for final step: 0.0162506.</pre>
-
-## Summaries
-
-If you supply a `model_dir` argument to your `Estimator`s, TensorFlow will write summaries for ``loss`` and histograms for variables in this directory. (You can also add custom summaries in your custom model function by calling [Summary](https://www.tensorflow.org/api_guides/python/summary) operations.)
-
-To view the summaries in TensorBoard, run the following command, where `logdir` is the `model_dir` for your `Estimator`:
-
-```shell
-tensorboard --logdir=/tmp/tf_examples/my_model_1
-```
-
-and then load the reported URL.
-
-**Graph visualization**
-
-![Text classification RNN Graph](https://raw.githubusercontent.com/tensorflow/skflow/master/g3doc/images/text_classification_rnn_graph.png)
-
-**Loss visualization**
-
-![Text classification RNN Loss](https://raw.githubusercontent.com/tensorflow/skflow/master/g3doc/images/text_classification_rnn_loss.png)
-
-## More examples
-
-See the [examples folder](https://www.tensorflow.org/code/tensorflow/examples/learn) for:
-
--  An easy way to handle [categorical variables](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification.py) (words are just an example of a categorical variable)
--  Text Classification: see examples for [RNN](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_character_rnn.py) and [CNN](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_character_cnn.py) on characters
--  [Digit recognition using a CNN](https://www.tensorflow.org/code/tensorflow/examples/learn/mnist.py)
--  And much more!
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py b/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py
index dd836c1dec361dacfa03df7f8abce648bf438f8d..cc08a47c3911b64fc64feb5e5a6b280b87023ff4 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/dataframe.py
@@ -24,10 +24,13 @@ import collections
 from .series import Series
 from .transform import Transform
 
+from tensorflow.python.util.deprecation import deprecated
+
 
 class DataFrame(object):
   """A DataFrame is a container for ingesting and preprocessing data."""
 
+  @deprecated("2017-06-15", "contrib/learn/dataframe/** is deprecated.")
   def __init__(self):
     self._columns = {}
 
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py b/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py
index 0ca8afe498b591341a030b0612834e7a12d6f054..377299ed57d87f855d8cf4c557e9d075d94f3b27 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/estimator_utils.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.learn.python.learn.dataframe import series as ss
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.util.deprecation import deprecated
 
 
 def _to_feature_spec(tensor, default_value=None):
@@ -89,6 +90,7 @@ def _build_alternate_universe(
   return new_feature_series_dict, feature_specs
 
 
+@deprecated("2017-06-15", "contrib/learn/dataframe/** is deprecated.")
 def to_feature_columns_and_input_fn(dataframe,
                                     base_input_keys_with_defaults,
                                     feature_keys,
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
index e71ad9b50b400a360b697ba89918d0045ded7493..c3ac4ad84d14b142d0b0ebdb66c5abcc38e5db3a 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_functions.py
@@ -18,11 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
 # pylint: disable=unused-import
 from tensorflow.python.estimator.inputs.queues.feeding_functions import _ArrayFeedFn
-from tensorflow.python.estimator.inputs.queues.feeding_functions import _enqueue_data as enqueue_data
+from tensorflow.python.estimator.inputs.queues.feeding_functions import _enqueue_data
+from tensorflow.python.estimator.inputs.queues.feeding_functions import _GeneratorFeedFn
 from tensorflow.python.estimator.inputs.queues.feeding_functions import _OrderedDictNumpyFeedFn
 from tensorflow.python.estimator.inputs.queues.feeding_functions import _PandasFeedFn
-from tensorflow.python.estimator.inputs.queues.feeding_functions import _GeneratorFeedFn
 # pylint: enable=unused-import
+
+from tensorflow.python.util.deprecation import deprecated
+
+
+@deprecated('2017-06-15', 'Moved to tf.contrib.training.enqueue_data.')
+def enqueue_data(*args, **kwargs):
+  return _enqueue_data(*args, **kwargs)
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_queue_runner.py b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_queue_runner.py
index d055555b01028106acbe8198aa858d89b06abf02..4f6a4b82a68b1b73ea7440088c941266baeff9cb 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_queue_runner.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/queues/feeding_queue_runner.py
@@ -19,6 +19,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import
-from tensorflow.python.estimator.inputs.queues.feeding_queue_runner import _FeedingQueueRunner as FeedingQueueRunner
-# pylint: enable=unused-import
+from tensorflow.python.estimator.inputs.queues.feeding_queue_runner import _FeedingQueueRunner
+from tensorflow.python.util.deprecation import deprecated
+
+
+class FeedingQueueRunner(_FeedingQueueRunner):
+
+  @deprecated('2017-06-15', 'Moved to tf.contrib.training.FeedingQueueRunner.')
+  def __init__(self, *args, **kwargs):
+    super(FeedingQueueRunner, self).__init__(*args, **kwargs)
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/series.py b/tensorflow/contrib/learn/python/learn/dataframe/series.py
index 5893db3aad2a3d2eab5b29fbbbd42753e7dfd7c6..39ffcd7469735745e97d6ccaf691324f2128a8b7 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/series.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/series.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from abc import ABCMeta
 
+from tensorflow.python.util.deprecation import deprecated
+
 
 class Series(object):
   """A single output series.
@@ -106,6 +108,7 @@ class Series(object):
 class PredefinedSeries(Series):
   """A `Series` that requires the cache to already map a given name."""
 
+  @deprecated("2017-06-15", "contrib/learn/dataframe/** is deprecated.")
   def __init__(self, name, feature_spec):
     super(PredefinedSeries, self).__init__()
     self._name = name
@@ -144,6 +147,7 @@ class PredefinedSeries(Series):
 class TransformedSeries(Series):
   """A `Series` that results from applying a `Transform` to a list of inputs."""
 
+  @deprecated("2017-06-15", "contrib/learn/dataframe/** is deprecated.")
   def __init__(self, input_series, transform, output_name):
     super(TransformedSeries, self).__init__()
     self._input_series = input_series
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py b/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
index b17a4b8d05b13940b6fbe65b7b775e2c352fe630..f316c5c9804321437011cf76fc59e6ecd6c68b9b 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/tensorflow_dataframe.py
@@ -97,7 +97,7 @@ class TensorFlowDataFrame(df.DataFrame):
       graph: the `Graph` in which the `DataFrame` should be built.
       session: the `Session` in which to run the columns of the `DataFrame`.
       start_queues: if true, queues will be started before running and halted
-        after producting `n` batches.
+        after producing `n` batches.
       initialize_variables: if true, variables will be initialized.
       **kwargs: Additional keyword arguments e.g. `num_epochs`.
 
diff --git a/tensorflow/contrib/learn/python/learn/dataframe/transform.py b/tensorflow/contrib/learn/python/learn/dataframe/transform.py
index c28da59ac76130d25bf330abb5d9ffa01fac1aca..c04eea4a222c8e3ad1796b9aedaf7cd3fb146402 100644
--- a/tensorflow/contrib/learn/python/learn/dataframe/transform.py
+++ b/tensorflow/contrib/learn/python/learn/dataframe/transform.py
@@ -24,11 +24,13 @@ from abc import abstractmethod
 from abc import abstractproperty
 
 import collections
-import inspect
 
 from .series import Series
 from .series import TransformedSeries
 
+from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.deprecation import deprecated
+
 
 def _make_list_of_series(x):
   """Converts `x` into a list of `Series` if possible.
@@ -86,6 +88,7 @@ def _make_tuple_of_string(x):
                   "got %s" % type(x).__name__)
 
 
+@deprecated("2017-06-15", "contrib/learn/dataframe/** is deprecated.")
 def parameter(func):
   """Tag functions annotated with `@parameter` for later retrieval.
 
@@ -109,6 +112,7 @@ class Transform(object):
 
   __metaclass__ = ABCMeta
 
+  @deprecated("2017-06-15", "contrib/learn/dataframe/** is deprecated.")
   def __init__(self):
     self._return_type = None
 
@@ -120,7 +124,7 @@ class Transform(object):
   def parameters(self):
     """A dict of names to values of properties marked with `@parameter`."""
     property_param_names = [name
-                            for name, func in inspect.getmembers(type(self))
+                            for name, func in tf_inspect.getmembers(type(self))
                             if (hasattr(func, "fget") and hasattr(
                                 getattr(func, "fget"), "is_parameter"))]
     return {name: getattr(self, name) for name in property_param_names}
diff --git a/tensorflow/contrib/learn/python/learn/datasets/mnist.py b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
index 01262ff5f81053a3407809aa680589af27cec783..13f213c197fb44bef22574fdf557d0b9e420d003 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/mnist.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/mnist.py
@@ -26,8 +26,10 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
 
-SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
+# CVDF mirror of http://yann.lecun.com/exdb/mnist/
+SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
 
 
 def _read32(bytestream):
@@ -108,12 +110,16 @@ class DataSet(object):
                fake_data=False,
                one_hot=False,
                dtype=dtypes.float32,
-               reshape=True):
+               reshape=True,
+               seed=None):
     """Construct a DataSet.
     one_hot arg is used only if fake_data is true.  `dtype` can be either
     `uint8` to leave the input as `[0, 255]`, or `float32` to rescale into
-    `[0, 1]`.
+    `[0, 1]`.  Seed arg provides for convenient deterministic testing.
     """
+    seed1, seed2 = random_seed.get_seed(seed)
+    # If op level seed is not set, use whatever graph level seed is returned
+    numpy.random.seed(seed1 if seed is None else seed2)
     dtype = dtypes.as_dtype(dtype).base_dtype
     if dtype not in (dtypes.uint8, dtypes.float32):
       raise TypeError('Invalid image dtype %r, expected uint8 or float32' %
@@ -207,11 +213,13 @@ def read_data_sets(train_dir,
                    one_hot=False,
                    dtype=dtypes.float32,
                    reshape=True,
-                   validation_size=5000):
+                   validation_size=5000,
+                   seed=None):
   if fake_data:
 
     def fake():
-      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
+      return DataSet(
+          [], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)
 
     train = fake()
     validation = fake()
@@ -253,12 +261,16 @@ def read_data_sets(train_dir,
   train_images = train_images[validation_size:]
   train_labels = train_labels[validation_size:]
 
-  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
-  validation = DataSet(validation_images,
-                       validation_labels,
-                       dtype=dtype,
-                       reshape=reshape)
-  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
+  train = DataSet(
+      train_images, train_labels, dtype=dtype, reshape=reshape, seed=seed)
+  validation = DataSet(
+      validation_images,
+      validation_labels,
+      dtype=dtype,
+      reshape=reshape,
+      seed=seed)
+  test = DataSet(
+      test_images, test_labels, dtype=dtype, reshape=reshape, seed=seed)
 
   return base.Datasets(train=train, validation=validation, test=test)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/__init__.py b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
index 118e974c6add1351f33a74943f6c534cd45ab59b..a40cbc0449071d86bd879c330677ec649605523d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/__init__.py
@@ -323,6 +323,7 @@ from tensorflow.contrib.learn.python.learn.estimators.metric_key import MetricKe
 from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModeKeys
 from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModelFnOps
 from tensorflow.contrib.learn.python.learn.estimators.prediction_key import PredictionKey
+from tensorflow.contrib.learn.python.learn.estimators.rnn_common import PredictionType
 from tensorflow.contrib.learn.python.learn.estimators.run_config import ClusterConfig
 from tensorflow.contrib.learn.python.learn.estimators.run_config import Environment
 from tensorflow.contrib.learn.python.learn.estimators.run_config import RunConfig
diff --git a/tensorflow/contrib/learn/python/learn/estimators/constants.py b/tensorflow/contrib/learn/python/learn/estimators/constants.py
index a62f1815b21d9cd85bc3cf057bf3be3c1958a797..fc69e810244a182b864be856e6720f8584f7aa65 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/constants.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/constants.py
@@ -38,3 +38,8 @@ class ProblemType(object):
   CLASSIFICATION = 1
   LINEAR_REGRESSION = 2
   LOGISTIC_REGRESSION = 3
+
+
+# CollectionDef key for the input feature keys.
+# TODO(b/34388557): This is a stopgap; please follow the bug to learn of changes
+COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS = "input_feature_keys"
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index 285ed4c186eca552c8e54cb0963eec6da9aa19a7..5e6288af99e1cdcd7b7f7bd7f51bddd41cb31c37 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -24,6 +24,7 @@ from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
 from tensorflow.contrib.learn.python.learn.estimators import dnn_linear_combined
@@ -32,6 +33,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
@@ -125,11 +127,20 @@ def _dnn_model_fn(features, labels, mode, params, config=None):
         "input_from_feature_columns",
         values=tuple(six.itervalues(features)),
         partitioner=input_layer_partitioner) as input_layer_scope:
-      net = layers.input_from_feature_columns(
-          columns_to_tensors=features,
-          feature_columns=feature_columns,
-          weight_collections=[parent_scope],
-          scope=input_layer_scope)
+      if all([
+          isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
+          for fc in feature_columns
+      ]):
+        net = layers.input_from_feature_columns(
+            columns_to_tensors=features,
+            feature_columns=feature_columns,
+            weight_collections=[parent_scope],
+            scope=input_layer_scope)
+      else:
+        net = fc_core.input_layer(
+            features=features,
+            feature_columns=feature_columns,
+            weight_collections=[parent_scope])
 
     for layer_id, num_hidden_units in enumerate(hidden_units):
       with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index 37fe6faa3127b512eaf6e0bb60684deec09ffbc5..726612235050def6e7addb503cc6646a25de0e42 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -33,13 +33,14 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
 
@@ -100,9 +101,13 @@ def _linear_learning_rate(num_linear_feature_columns):
 
 
 def _add_hidden_layer_summary(value, tag):
-  logging_ops.scalar_summary("%s/fraction_of_zero_values" % tag,
-                             nn.zero_fraction(value))
-  logging_ops.histogram_summary("%s/activation" % tag, value)
+  summary.scalar("%s/fraction_of_zero_values" % tag, nn.zero_fraction(value))
+  summary.histogram("%s/activation" % tag, value)
+
+
+def _add_layer_summary(value, tag):
+  summary.scalar("%s/fraction_of_zero_values" % tag, nn.zero_fraction(value))
+  summary.histogram("%s/activation" % tag, value)
 
 
 def _get_embedding_variable(column, collection_key, input_layer_scope):
@@ -226,11 +231,20 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
           "input_from_feature_columns",
           values=tuple(six.itervalues(features)),
           partitioner=input_layer_partitioner) as dnn_input_scope:
-        net = layers.input_from_feature_columns(
-            columns_to_tensors=features,
-            feature_columns=dnn_feature_columns,
-            weight_collections=[dnn_parent_scope],
-            scope=dnn_input_scope)
+        if all([
+            isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
+            for fc in dnn_feature_columns
+        ]):
+          net = layers.input_from_feature_columns(
+              columns_to_tensors=features,
+              feature_columns=dnn_feature_columns,
+              weight_collections=[dnn_parent_scope],
+              scope=dnn_input_scope)
+        else:
+          net = fc_core.input_layer(
+              features=features,
+              feature_columns=dnn_feature_columns,
+              weight_collections=[dnn_parent_scope])
 
       for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
         with variable_scope.variable_scope(
@@ -247,7 +261,7 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
                 net,
                 keep_prob=(1.0 - dnn_dropout))
         # TODO(b/31209633): Consider adding summary before dropout.
-        _add_hidden_layer_summary(net, dnn_hidden_layer_scope.name)
+        _add_layer_summary(net, dnn_hidden_layer_scope.name)
 
       with variable_scope.variable_scope(
           "logits",
@@ -258,7 +272,7 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
             activation_fn=None,
             variables_collections=[dnn_parent_scope],
             scope=dnn_logits_scope)
-      _add_hidden_layer_summary(dnn_logits, dnn_logits_scope.name)
+      _add_layer_summary(dnn_logits, dnn_logits_scope.name)
 
   # Build Linear logits.
   linear_parent_scope = "linear"
@@ -273,20 +287,30 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
         linear_parent_scope,
         values=tuple(six.itervalues(features)),
         partitioner=linear_partitioner) as scope:
-      if joint_linear_weights:
-        linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns(
-            columns_to_tensors=features,
-            feature_columns=linear_feature_columns,
-            num_outputs=head.logits_dimension,
-            weight_collections=[linear_parent_scope],
-            scope=scope)
+      if all([isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
+              for fc in linear_feature_columns]):
+        if joint_linear_weights:
+          linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns(
+              columns_to_tensors=features,
+              feature_columns=linear_feature_columns,
+              num_outputs=head.logits_dimension,
+              weight_collections=[linear_parent_scope],
+              scope=scope)
+        else:
+          linear_logits, _, _ = layers.weighted_sum_from_feature_columns(
+              columns_to_tensors=features,
+              feature_columns=linear_feature_columns,
+              num_outputs=head.logits_dimension,
+              weight_collections=[linear_parent_scope],
+              scope=scope)
       else:
-        linear_logits, _, _ = layers.weighted_sum_from_feature_columns(
-            columns_to_tensors=features,
+        linear_logits = fc_core.linear_model(
+            features=features,
             feature_columns=linear_feature_columns,
-            num_outputs=head.logits_dimension,
-            weight_collections=[linear_parent_scope],
-            scope=scope)
+            units=head.logits_dimension,
+            weight_collections=[linear_parent_scope])
+
+      _add_layer_summary(linear_logits, scope.name)
 
   # Combine logits and build full model.
   if dnn_logits is not None and linear_logits is not None:
@@ -499,9 +523,36 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
     ...
   def input_fn_eval: # returns x, y (where y represents label's class index).
     ...
+  def input_fn_predict: # returns x, None.
+    ...
+  estimator.fit(input_fn=input_fn_train)
+  estimator.evaluate(input_fn=input_fn_eval)
+  # predict_classes returns class indices.
+  estimator.predict_classes(input_fn=input_fn_predict)
+  ```
+
+  If the user specifies `label_keys` in constructor, labels must be strings from
+  the `label_keys` vocabulary. Example:
+
+  ```python
+  label_keys = ['label0', 'label1', 'label2']
+  estimator = DNNLinearCombinedClassifier(
+      n_classes=n_classes,
+      linear_feature_columns=[sparse_feature_a_x_sparse_feature_b],
+      dnn_feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
+      dnn_hidden_units=[1000, 500, 100],
+      label_keys=label_keys)
+
+  def input_fn_train: # returns x, y (where y is one of label_keys).
+    pass
   estimator.fit(input_fn=input_fn_train)
+
+  def input_fn_eval: # returns x, y (where y is one of label_keys).
+    pass
   estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(x=x) # returns predicted labels (i.e. label's class index).
+  def input_fn_predict: # returns x, None
+  # predict_classes returns one of label_keys.
+  estimator.predict_classes(input_fn=input_fn_predict)
   ```
 
   Input of `fit` and `evaluate` should have following features,
@@ -541,6 +592,7 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
                feature_engineering_fn=None,
                embedding_lr_multipliers=None,
                input_layer_min_slice_size=None,
+               label_keys=None,
                fix_global_step_increment_bug=False):
     """Constructs a DNNLinearCombinedClassifier instance.
 
@@ -592,6 +644,8 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
         learning rate for the embedding variables.
       input_layer_min_slice_size: Optional. The min slice size of input layer
         partitions. If not provided, will use the default of 64M.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
       fix_global_step_increment_bug: If `False`, the estimator needs two fit
         steps to optimize both linear and dnn parts. If `True`, this bug is
         fixed. New users must set this to `True`, but it the default value is
@@ -605,7 +659,8 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
     head = head_lib.multi_class_head(
         n_classes=n_classes,
         weight_column_name=weight_column_name,
-        enable_centered_bias=enable_centered_bias)
+        enable_centered_bias=enable_centered_bias,
+        label_keys=label_keys)
     linear_feature_columns = tuple(linear_feature_columns or [])
     dnn_feature_columns = tuple(dnn_feature_columns or [])
     self._feature_columns = linear_feature_columns + dnn_feature_columns
@@ -816,9 +871,11 @@ class DNNLinearCombinedRegressor(estimator.Estimator):
     ...
   def input_fn_eval: # returns x, y
     ...
+  def input_fn_predict: # returns x, None
+    ...
   estimator.train(input_fn_train)
   estimator.evaluate(input_fn_eval)
-  estimator.predict(x)
+  estimator.predict(input_fn_predict)
   ```
 
   Input of `fit`, `train`, and `evaluate` should have following features,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index 301211ee82233e55a340ea706ca28dc82bd7dcc0..181a8cab1ce5f325686b51db2fb4fc2c7ee35110 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -36,6 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -420,6 +421,52 @@ class DNNLinearCombinedClassifierTest(test.TestCase):
     scores = classifier.evaluate(input_fn=_input_fn, steps=100)
     _assert_metrics_in_range(('accuracy', 'auc'), scores)
 
+  def testEstimatorWithCoreFeatureColumns(self):
+    """Tests binary classification using Tensor data as input."""
+
+    def _input_fn():
+      iris = test_data.prepare_iris_data_for_logistic_regression()
+      features = {}
+      for i in range(4):
+        # The following shows how to provide the Tensor data for
+        # RealValuedColumns.
+        features.update({
+            str(i):
+                array_ops.reshape(
+                    constant_op.constant(iris.data[:, i], dtype=dtypes.float32),
+                    [-1, 1])
+        })
+      # The following shows how to provide the SparseTensor data for
+      # a SparseColumn.
+      features['dummy_sparse_column'] = sparse_tensor.SparseTensor(
+          values=['en', 'fr', 'zh'],
+          indices=[[0, 0], [0, 1], [60, 0]],
+          dense_shape=[len(iris.target), 2])
+      labels = array_ops.reshape(
+          constant_op.constant(iris.target, dtype=dtypes.int32), [-1, 1])
+      return features, labels
+
+    iris = test_data.prepare_iris_data_for_logistic_regression()
+    cont_features = [fc_core.numeric_column(str(i)) for i in range(4)]
+    linear_features = [
+        fc_core.bucketized_column(
+            cont_features[i],
+            sorted(set(test_data.get_quantile_based_buckets(
+                iris.data[:, i], 10)))) for i in range(4)
+    ]
+    linear_features.append(
+        fc_core.categorical_column_with_hash_bucket(
+            'dummy_sparse_column', hash_bucket_size=100))
+
+    classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
+        linear_feature_columns=linear_features,
+        dnn_feature_columns=cont_features,
+        dnn_hidden_units=[3, 3])
+
+    classifier.fit(input_fn=_input_fn, steps=100)
+    scores = classifier.evaluate(input_fn=_input_fn, steps=100)
+    _assert_metrics_in_range(('accuracy', 'auc'), scores)
+
   def testTrainWithPartitionedVariables(self):
     """Tests training with partitioned variables."""
 
@@ -493,6 +540,59 @@ class DNNLinearCombinedClassifierTest(test.TestCase):
         input_fn=test_data.iris_input_multiclass_fn, steps=100)
     _assert_metrics_in_range(('accuracy',), scores)
 
+  def testMultiClassLabelKeys(self):
+    """Tests n_classes > 2 with label_keys vocabulary for labels."""
+    # Byte literals needed for python3 test to pass.
+    label_keys = [b'label0', b'label1', b'label2']
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [0.2], [.1]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      labels = constant_op.constant(
+          [[label_keys[1]], [label_keys[0]], [label_keys[0]]],
+          dtype=dtypes.string)
+      return features, labels
+
+    language_column = feature_column.sparse_column_with_hash_bucket(
+        'language', hash_bucket_size=20)
+
+    classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
+        n_classes=3,
+        linear_feature_columns=[language_column],
+        dnn_feature_columns=[
+            feature_column.embedding_column(
+                language_column, dimension=1),
+            feature_column.real_valued_column('age')
+        ],
+        dnn_hidden_units=[3, 3],
+        label_keys=label_keys)
+
+    classifier.fit(input_fn=_input_fn, steps=50)
+
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    _assert_metrics_in_range(('accuracy',), scores)
+    self.assertIn('loss', scores)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predicted_classes = list(
+        classifier.predict_classes(
+            input_fn=predict_input_fn, as_iterable=True))
+    self.assertEqual(3, len(predicted_classes))
+    for pred in predicted_classes:
+      self.assertIn(pred, label_keys)
+    predictions = list(
+        classifier.predict(input_fn=predict_input_fn, as_iterable=True))
+    self.assertAllEqual(predicted_classes, predictions)
+
   def testLoss(self):
     """Tests loss calculation."""
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 193091511990d8c947f39a1e1d87f454207e6db0..615af24cd306d1b384b4668b91d715f251ffee01 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -38,6 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
@@ -325,6 +326,49 @@ class DNNClassifierTest(test.TestCase):
       for i in range(expected_n_classes):
         self._assertInRange(0.0, 1.0, probabilities[b][i])
 
+  def testEstimatorWithCoreFeatureColumns(self):
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [0.2], [.1]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+
+    language_column = fc_core.categorical_column_with_hash_bucket(
+        'language', hash_bucket_size=20)
+    feature_columns = [
+        fc_core.embedding_column(language_column, dimension=1),
+        fc_core.numeric_column('age')
+    ]
+
+    classifier = dnn.DNNClassifier(
+        n_classes=2,
+        feature_columns=feature_columns,
+        hidden_units=[10, 10],
+        config=run_config.RunConfig(tf_random_seed=1))
+
+    classifier.fit(input_fn=_input_fn, steps=50)
+
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self._assertInRange(0.0, 1.0, scores['accuracy'])
+    self.assertIn('loss', scores)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predicted_classes = list(
+        classifier.predict_classes(input_fn=predict_input_fn, as_iterable=True))
+    self._assertBinaryPredictions(3, predicted_classes)
+    predictions = list(
+        classifier.predict(input_fn=predict_input_fn, as_iterable=True))
+    self.assertAllEqual(predicted_classes, predictions)
+
   def testLogisticRegression_TensorData(self):
     """Tests binary classification using tensor data as input."""
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
index 525f84d511528e9fec35df7d9a2a854f38f26f02..1724d7599d09873f969555cc9382c0753eba463f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.framework import deprecated
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
@@ -116,7 +115,7 @@ def dict_to_state_tuple(input_dict, cell):
 
 
 def _concatenate_context_input(sequence_input, context_input):
-  """Replicates `context_input` accross all timesteps of `sequence_input`.
+  """Replicates `context_input` across all timesteps of `sequence_input`.
 
   Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
   This value is appended to `sequence_input` on dimension 2 and the result is
@@ -178,7 +177,7 @@ def build_sequence_input(features,
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features i.e. features that apply accross all time
+      describing context features i.e. features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
     weight_collections: List of graph collections to which weights are added.
@@ -420,7 +419,7 @@ def _get_dynamic_rnn_model_fn(
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
+      describing context features, i.e., features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
     predict_probabilities: A boolean indicating whether to predict probabilities
@@ -540,20 +539,6 @@ def _get_dynamic_rnn_model_fn(
   return _dynamic_rnn_model_fn
 
 
-def _get_dropout_and_num_units(num_units,
-                               num_rnn_layers,
-                               input_keep_probability,
-                               output_keep_probability):
-  """Helper function for deprecated factory functions."""
-  dropout_keep_probabilities = None
-  num_units = [num_units for _ in range(num_rnn_layers)]
-  if input_keep_probability or output_keep_probability:
-    dropout_keep_probabilities = ([input_keep_probability]
-                                  + [1.0] * (num_rnn_layers - 1)
-                                  + [output_keep_probability])
-  return dropout_keep_probabilities, num_units
-
-
 class DynamicRnnEstimator(estimator.Estimator):
 
   def __init__(self,
@@ -612,13 +597,13 @@ class DynamicRnnEstimator(estimator.Estimator):
         `ProblemType.CLASSIFICATION` or `ProblemType.LINEAR_REGRESSION`.
       prediction_type: whether the `Estimator` should return a value for each
         step in the sequence, or just a single value for the final time step.
-        Must be one of `ProblemType.SINGLE_VALUE` or
-        `ProblemType.MULTIPLE_VALUE`.
+        Must be one of `PredictionType.SINGLE_VALUE` or
+        `PredictionType.MULTIPLE_VALUE`.
       sequence_feature_columns: An iterable containing all the feature columns
         describing sequence features. All items in the iterable should be
         instances of classes derived from `FeatureColumn`.
       context_feature_columns: An iterable containing all the feature columns
-        describing context features, i.e., features that apply accross all time
+        describing context features, i.e., features that apply across all time
         steps. All items in the set should be instances of classes derived from
         `FeatureColumn`.
       num_classes: the number of classes for a classification problem. Only
@@ -704,339 +689,3 @@ class DynamicRnnEstimator(estimator.Estimator):
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
-
-
-@deprecated('2017-04-01',
-            'multi_value_rnn_regressor is deprecated. '
-            'Please construct a DynamicRnnEstimator directly.')
-def multi_value_rnn_regressor(num_units,
-                              sequence_feature_columns,
-                              context_feature_columns=None,
-                              cell_type='basic_rnn',
-                              num_rnn_layers=1,
-                              optimizer_type='SGD',
-                              learning_rate=0.1,
-                              momentum=None,
-                              gradient_clipping_norm=5.0,
-                              input_keep_probability=None,
-                              output_keep_probability=None,
-                              model_dir=None,
-                              config=None,
-                              feature_engineering_fn=None):
-  """Creates a `DynamicRnnEstimator` for multi-value regression.
-
-  Returns an `Estimator` that given input sequences, processes them in a dynamic
-  recurrent network and outputs a sequence of continuous values.
-
-  Args:
-    num_units: The size of the RNN cells.
-    sequence_feature_columns: An iterable containing all the feature columns
-      describing sequence features. All items in the set should be instances
-      of classes derived from `FeatureColumn`.
-    context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
-      steps. All items in the set should be instances of classes derived from
-      `FeatureColumn`.
-    cell_type: A subclass of `RNNCell` or one of 'basic_rnn,' 'lstm' or 'gru'.
-    num_rnn_layers: Number of RNN layers. Leave this at its default value 1
-      if passing a `cell_type` that is already a MultiRNNCell.
-    optimizer_type: The type of optimizer to use. Either a subclass of
-      `Optimizer`, an instance of an `Optimizer`, a callback that returns an
-      optimizer, or a string. Strings must be one of 'Adagrad', 'Adam',
-      'Ftrl', 'Momentum', 'RMSProp' or 'SGD. See `layers.optimize_loss` for
-      more details.
-    learning_rate: Learning rate. This argument has no effect if `optimizer`
-      is an instance of an `Optimizer`.
-    momentum: Momentum value. Only used if `optimizer_type` is 'Momentum'.
-    gradient_clipping_norm: Parameter used for gradient clipping. If `None`,
-      then no clipping is performed.
-    input_keep_probability: Probability to keep inputs to `cell`. If `None`,
-      no dropout is applied.
-    output_keep_probability: Probability to keep outputs of `cell`. If `None`,
-      no dropout is applied.
-    model_dir: The directory in which to save and restore the model graph,
-      parameters, etc.
-    config: A `RunConfig` instance.
-    feature_engineering_fn: Takes features and labels which are the output of
-      `input_fn` and returns features and labels which will be fed into
-      `model_fn`. Please check `model_fn` for a definition of features and
-      labels.
-  Returns:
-    An initialized `Estimator`.
-  """
-  dropout_keep_probabilities, num_units = _get_dropout_and_num_units(
-      num_units,
-      num_rnn_layers,
-      input_keep_probability,
-      output_keep_probability)
-  return DynamicRnnEstimator(
-      problem_type=constants.ProblemType.LINEAR_REGRESSION,
-      prediction_type=rnn_common.PredictionType.MULTIPLE_VALUE,
-      sequence_feature_columns=sequence_feature_columns,
-      context_feature_columns=context_feature_columns,
-      num_units=num_units,
-      cell_type=cell_type,
-      optimizer=optimizer_type,
-      learning_rate=learning_rate,
-      momentum=momentum,
-      gradient_clipping_norm=gradient_clipping_norm,
-      dropout_keep_probabilities=dropout_keep_probabilities,
-      model_dir=model_dir,
-      feature_engineering_fn=feature_engineering_fn,
-      config=config)
-
-
-@deprecated('2017-04-01',
-            'multi_value_rnn_classifier is deprecated. '
-            'Please construct a DynamicRNNEstimator directly.')
-def multi_value_rnn_classifier(num_classes,
-                               num_units,
-                               sequence_feature_columns,
-                               context_feature_columns=None,
-                               cell_type='basic_rnn',
-                               num_rnn_layers=1,
-                               optimizer_type='SGD',
-                               learning_rate=0.1,
-                               predict_probabilities=False,
-                               momentum=None,
-                               gradient_clipping_norm=5.0,
-                               input_keep_probability=None,
-                               output_keep_probability=None,
-                               model_dir=None,
-                               config=None,
-                               feature_engineering_fn=None):
-  """Creates a `DynamicRNNEstimator` for multi-value classification.
-
-  Returns an `Estimator` that given input sequences, processes them in a dynamic
-  recurrent network and outputs a sequence of classifications, along with
-  (optionally) a probability distribution over classes.
-
-  Args:
-    num_classes: The number of classes for categorization.
-    num_units: The size of the RNN cells.
-    sequence_feature_columns: An iterable containing all the feature columns
-      describing sequence features. All items in the set should be instances
-      of classes derived from `FeatureColumn`.
-    context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
-      steps. All items in the set should be instances of classes derived from
-      `FeatureColumn`.
-    cell_type: A subclass of `RNNCell` or one of 'basic_rnn,' 'lstm' or 'gru'.
-    num_rnn_layers: Number of RNN layers. Leave this at its default value 1
-      if passing a `cell_type` that is already a MultiRNNCell.
-    optimizer_type: The type of optimizer to use. Either a subclass of
-      `Optimizer`, an instance of an `Optimizer`, a callback that returns an
-      optimizer, or a string. Strings must be one of 'Adagrad', 'Adam',
-      'Ftrl', 'Momentum', 'RMSProp' or 'SGD. See `layers.optimize_loss` for
-      more details.
-    learning_rate: Learning rate. This argument has no effect if `optimizer`
-      is an instance of an `Optimizer`.
-    predict_probabilities: A boolean indicating whether to predict probabilities
-      for all classes.
-    momentum: Momentum value. Only used if `optimizer_type` is 'Momentum'.
-    gradient_clipping_norm: Parameter used for gradient clipping. If `None`,
-      then no clipping is performed.
-    input_keep_probability: Probability to keep inputs to `cell`. If `None`,
-      no dropout is applied.
-    output_keep_probability: Probability to keep outputs of `cell`. If `None`,
-      no dropout is applied.
-    model_dir: The directory in which to save and restore the model graph,
-      parameters, etc.
-    config: A `RunConfig` instance.
-    feature_engineering_fn: Takes features and labels which are the output of
-      `input_fn` and returns features and labels which will be fed into
-      `model_fn`. Please check `model_fn` for a definition of features and
-      labels.
-  Returns:
-    An initialized `Estimator`.
-  """
-  dropout_keep_probabilities, num_units = _get_dropout_and_num_units(
-      num_units,
-      num_rnn_layers,
-      input_keep_probability,
-      output_keep_probability)
-  return DynamicRnnEstimator(
-      problem_type=constants.ProblemType.CLASSIFICATION,
-      prediction_type=rnn_common.PredictionType.MULTIPLE_VALUE,
-      num_classes=num_classes,
-      sequence_feature_columns=sequence_feature_columns,
-      context_feature_columns=context_feature_columns,
-      num_units=num_units,
-      cell_type=cell_type,
-      optimizer=optimizer_type,
-      learning_rate=learning_rate,
-      predict_probabilities=predict_probabilities,
-      momentum=momentum,
-      gradient_clipping_norm=gradient_clipping_norm,
-      dropout_keep_probabilities=dropout_keep_probabilities,
-      model_dir=model_dir,
-      feature_engineering_fn=feature_engineering_fn,
-      config=config)
-
-
-@deprecated('2017-04-01',
-            'single_value_rnn_regressor is deprecated. '
-            'Please construct a DynamicRnnEstimator directly.')
-def single_value_rnn_regressor(num_units,
-                               sequence_feature_columns,
-                               context_feature_columns=None,
-                               cell_type='basic_rnn',
-                               num_rnn_layers=1,
-                               optimizer_type='SGD',
-                               learning_rate=0.1,
-                               momentum=None,
-                               gradient_clipping_norm=5.0,
-                               input_keep_probability=None,
-                               output_keep_probability=None,
-                               model_dir=None,
-                               config=None,
-                               feature_engineering_fn=None):
-  """Creates a `DynamicRnnEstimator` for single-value regression.
-
-  Returns an `Estimator` that given input sequences, processes them in a dynamic
-  recurrent network and outputs a single continuous values.
-
-  Args:
-    num_units: The size of the RNN cells.
-    sequence_feature_columns: An iterable containing all the feature columns
-      describing sequence features. All items in the set should be instances
-      of classes derived from `FeatureColumn`.
-    context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
-      steps. All items in the set should be instances of classes derived from
-      `FeatureColumn`.
-    cell_type: A subclass of `RNNCell` or one of 'basic_rnn,' 'lstm' or 'gru'.
-    num_rnn_layers: Number of RNN layers. Leave this at its default value 1
-      if passing a `cell_type` that is already a MultiRNNCell.
-    optimizer_type: The type of optimizer to use. Either a subclass of
-      `Optimizer`, an instance of an `Optimizer`, a callback that returns an
-      optimizer, or a string. Strings must be one of 'Adagrad', 'Adam',
-      'Ftrl', 'Momentum', 'RMSProp' or 'SGD. See `layers.optimize_loss` for
-      more details.
-    learning_rate: Learning rate. This argument has no effect if `optimizer`
-      is an instance of an `Optimizer`.
-    momentum: Momentum value. Only used if `optimizer_type` is 'Momentum'.
-    gradient_clipping_norm: Parameter used for gradient clipping. If `None`,
-      then no clipping is performed.
-    input_keep_probability: Probability to keep inputs to `cell`. If `None`,
-      no dropout is applied.
-    output_keep_probability: Probability to keep outputs of `cell`. If `None`,
-      no dropout is applied.
-    model_dir: The directory in which to save and restore the model graph,
-      parameters, etc.
-    config: A `RunConfig` instance.
-    feature_engineering_fn: Takes features and labels which are the output of
-      `input_fn` and returns features and labels which will be fed into
-      `model_fn`. Please check `model_fn` for a definition of features and
-      labels.
-  Returns:
-    An initialized `Estimator`.
-  """
-  dropout_keep_probabilities, num_units = _get_dropout_and_num_units(
-      num_units,
-      num_rnn_layers,
-      input_keep_probability,
-      output_keep_probability)
-  return DynamicRnnEstimator(
-      problem_type=constants.ProblemType.LINEAR_REGRESSION,
-      prediction_type=rnn_common.PredictionType.SINGLE_VALUE,
-      sequence_feature_columns=sequence_feature_columns,
-      context_feature_columns=context_feature_columns,
-      num_units=num_units,
-      cell_type=cell_type,
-      optimizer=optimizer_type,
-      learning_rate=learning_rate,
-      momentum=momentum,
-      gradient_clipping_norm=gradient_clipping_norm,
-      dropout_keep_probabilities=dropout_keep_probabilities,
-      model_dir=model_dir,
-      feature_engineering_fn=feature_engineering_fn,
-      config=config)
-
-
-@deprecated('2017-04-01',
-            'single_value_rnn_classifier is deprecated. '
-            'Please construct a DynamicRnnEstimator directly.')
-def single_value_rnn_classifier(num_classes,
-                                num_units,
-                                sequence_feature_columns,
-                                context_feature_columns=None,
-                                cell_type='basic_rnn',
-                                num_rnn_layers=1,
-                                optimizer_type='SGD',
-                                learning_rate=0.1,
-                                predict_probabilities=False,
-                                momentum=None,
-                                gradient_clipping_norm=5.0,
-                                input_keep_probability=None,
-                                output_keep_probability=None,
-                                model_dir=None,
-                                config=None,
-                                feature_engineering_fn=None):
-  """Creates a `DynamicRnnEstimator` for single-value classification.
-
-  Returns an `Estimator` that given input sequences, processes them in a dynamic
-  recurrent network and outputs a single classifications, along with
-  (optionally) a probability distribution over classes.
-
-  Args:
-    num_classes: The number of classes for categorization.
-    num_units: The size of the RNN cells.
-    sequence_feature_columns: An iterable containing all the feature columns
-      describing sequence features. All items in the set should be instances
-      of classes derived from `FeatureColumn`.
-    context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
-      steps. All items in the set should be instances of classes derived from
-      `FeatureColumn`.
-    cell_type: A subclass of `RNNCell` or one of 'basic_rnn,' 'lstm' or 'gru'.
-    num_rnn_layers: Number of RNN layers. Leave this at its default value 1
-      if passing a `cell_type` that is already a MultiRNNCell.
-    optimizer_type: The type of optimizer to use. Either a subclass of
-      `Optimizer`, an instance of an `Optimizer`, a callback that returns an
-      optimizer, or a string. Strings must be one of 'Adagrad', 'Adam',
-      'Ftrl', 'Momentum', 'RMSProp' or 'SGD. See `layers.optimize_loss` for
-      more details.
-    learning_rate: Learning rate. This argument has no effect if `optimizer`
-      is an instance of an `Optimizer`.
-    predict_probabilities: A boolean indicating whether to predict probabilities
-      for all classes.
-    momentum: Momentum value. Only used if `optimizer_type` is 'Momentum'.
-    gradient_clipping_norm: Parameter used for gradient clipping. If `None`,
-      then no clipping is performed.
-    input_keep_probability: Probability to keep inputs to `cell`. If `None`,
-      no dropout is applied.
-    output_keep_probability: Probability to keep outputs of `cell`. If `None`,
-      no dropout is applied.
-    model_dir: The directory in which to save and restore the model graph,
-      parameters, etc.
-    config: A `RunConfig` instance.
-    feature_engineering_fn: Takes features and labels which are the output of
-      `input_fn` and returns features and labels which will be fed into
-      `model_fn`. Please check `model_fn` for a definition of features and
-      labels.
-  Returns:
-    An initialized `Estimator`.
-  """
-  dropout_keep_probabilities, num_units = _get_dropout_and_num_units(
-      num_units,
-      num_rnn_layers,
-      input_keep_probability,
-      output_keep_probability)
-  return DynamicRnnEstimator(
-      problem_type=constants.ProblemType.CLASSIFICATION,
-      prediction_type=rnn_common.PredictionType.SINGLE_VALUE,
-      num_classes=num_classes,
-      sequence_feature_columns=sequence_feature_columns,
-      context_feature_columns=context_feature_columns,
-      num_units=num_units,
-      cell_type=cell_type,
-      optimizer=optimizer_type,
-      learning_rate=learning_rate,
-      predict_probabilities=predict_probabilities,
-      momentum=momentum,
-      gradient_clipping_norm=gradient_clipping_norm,
-      dropout_keep_probabilities=dropout_keep_probabilities,
-      model_dir=model_dir,
-      feature_engineering_fn=feature_engineering_fn,
-      config=config)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
index 43b3d2a78fc66eb6be542dce36c49464909e1af2..6fc028ab7069eaca46a736f1e96b36e31771a3bd 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
@@ -38,8 +38,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -157,7 +157,7 @@ class DynamicRnnEstimatorTest(test.TestCase):
         self.context_feature_columns)
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
-      sess.run(data_flow_ops.tables_initializer())
+      sess.run(lookup_ops.tables_initializer())
       sequence_input_val = sess.run(sequence_input)
     expected_shape = np.array([
         3,  # expected batch size
@@ -178,7 +178,7 @@ class DynamicRnnEstimatorTest(test.TestCase):
     # Obtain values of activations and final state.
     with session.Session() as sess:
       sess.run(variables.global_variables_initializer())
-      sess.run(data_flow_ops.tables_initializer())
+      sess.run(lookup_ops.tables_initializer())
       activations, final_state = sess.run([activations_t, final_state_t])
 
     expected_activations_shape = np.array([3, 2, self.NUM_LABEL_COLUMNS])
@@ -410,56 +410,6 @@ class DynamicRnnEstimatorTest(test.TestCase):
       state_piece = prediction_dict[dynamic_rnn_estimator._get_state_name(i)]
       self.assertListEqual(list(state_piece.shape), [batch_size, state_size])
 
-  def testLegacyConstructor(self):
-    """Exercise legacy constructor function."""
-    num_units = 16
-    num_layers = 6
-    output_keep_prob = 0.9
-    input_keep_prob = 0.7
-    batch_size = 11
-    learning_rate = 0.1
-    train_sequence_length = 21
-    train_steps = 121
-
-    def get_input_fn(batch_size, sequence_length, state_dict, starting_step=0):
-
-      def input_fn():
-        sequence = constant_op.constant(
-            [[(starting_step + i + j) % 2 for j in range(sequence_length + 1)]
-             for i in range(batch_size)],
-            dtype=dtypes.int32)
-        labels = array_ops.slice(sequence, [0, 0],
-                                 [batch_size, sequence_length])
-        inputs = array_ops.expand_dims(
-            math_ops.to_float(
-                array_ops.slice(sequence, [0, 1], [batch_size, sequence_length
-                                                  ])), 2)
-        input_dict = state_dict
-        input_dict['inputs'] = inputs
-        return input_dict, labels
-
-      return input_fn
-
-    seq_columns = [feature_column.real_valued_column('inputs', dimension=1)]
-    config = run_config.RunConfig(tf_random_seed=21212)
-
-    model_dir = tempfile.mkdtemp()
-    sequence_estimator = dynamic_rnn_estimator.multi_value_rnn_classifier(
-        num_classes=2,
-        num_units=num_units,
-        num_rnn_layers=num_layers,
-        input_keep_probability=input_keep_prob,
-        output_keep_probability=output_keep_prob,
-        sequence_feature_columns=seq_columns,
-        learning_rate=learning_rate,
-        config=config,
-        model_dir=model_dir)
-
-    train_input_fn = get_input_fn(
-        batch_size, train_sequence_length, state_dict={})
-
-    sequence_estimator.fit(input_fn=train_input_fn, steps=train_steps)
-
   def testMultipleRuns(self):
     """Tests resuming training by feeding state."""
     cell_sizes = [4, 7]
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 107454dca1a0be790eb8077c844d532b63820174..ddd3d087e7b2866ca3d4d995242002225d2d3720 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 
 import abc
 import copy
-import inspect
 import os
 import tempfile
 
@@ -41,6 +40,7 @@ from tensorflow.contrib.learn.python.learn import metric_spec
 from tensorflow.contrib.learn.python.learn import monitors as monitor_lib
 from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import _sklearn as sklearn
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import metric_key
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 from tensorflow.contrib.learn.python.learn.estimators import run_config
@@ -57,7 +57,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -70,6 +70,8 @@ from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
 from tensorflow.python.training import summary_io
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 
 AS_ITERABLE_DATE = '2016-09-15'
@@ -87,7 +89,7 @@ SCIKIT_DECOUPLE_INSTRUCTIONS = (
 
 
 def _verify_input_args(x, y, input_fn, feed_fn, batch_size):
-  """Verifies validity of co-existance of input arguments."""
+  """Verifies validity of co-existence of input arguments."""
   if input_fn is None:
     if x is None:
       raise ValueError('Either x or input_fn must be provided.')
@@ -173,17 +175,27 @@ def infer_real_valued_columns_from_input(x):
   return infer_real_valued_columns_from_input_fn(input_fn)
 
 
-def _get_arguments(func):
-  """Returns list of arguments this function has."""
-  if hasattr(func, '__code__'):
-    # Regular function.
-    return inspect.getargspec(func).args
-  elif hasattr(func, '__call__'):
-    # Callable object.
-    return _get_arguments(func.__call__)
-  elif hasattr(func, 'func'):
-    # Partial function.
-    return _get_arguments(func.func)
+def _model_fn_args(fn):
+  """Get argument names for function-like object.
+
+  Args:
+    fn: Function, or function-like object (e.g., result of `functools.partial`).
+
+  Returns:
+    `tuple` of string argument names.
+
+  Raises:
+    ValueError: if partial function has positionally bound arguments
+  """
+  _, fn = tf_decorator.unwrap(fn)
+  if hasattr(fn, 'func') and hasattr(fn, 'keywords') and hasattr(fn, 'args'):
+    # Handle functools.partial and similar objects.
+    return tuple([
+        arg for arg in tf_inspect.getargspec(fn.func).args[len(fn.args):]
+        if arg not in set(fn.keywords.keys())
+    ])
+  # Handle function.
+  return tuple(tf_inspect.getargspec(fn).args)
 
 
 def _get_replica_device_setter(config):
@@ -319,14 +331,21 @@ def _write_dict_to_summary(output_dir,
   for key in dictionary:
     if dictionary[key] is None:
       continue
+    if key == 'global_step':
+      continue
     value = summary_proto.value.add()
     value.tag = key
     if (isinstance(dictionary[key], np.float32) or
         isinstance(dictionary[key], float)):
       value.simple_value = float(dictionary[key])
+    elif (isinstance(dictionary[key], np.int64) or
+          isinstance(dictionary[key], np.int32) or
+          isinstance(dictionary[key], int)):
+      value.simple_value = int(dictionary[key])
     else:
-      logging.warn('Skipping summary for %s, must be a float or np.float32.',
-                   key)
+      logging.warn(
+          'Skipping summary for %s, must be a float, np.float32, np.int64, np.int32 or int.',
+          key)
   summary_writer.add_summary(summary_proto, current_global_step)
   summary_writer.flush()
 
@@ -339,7 +358,7 @@ class BaseEstimator(
   """
   __metaclass__ = abc.ABCMeta
 
-  # Note that for Google users, this is overriden with
+  # Note that for Google users, this is overridden with
   # learn_runner.EstimatorConfig.
   # TODO(wicke): Remove this once launcher takes over config functionality
   _Config = run_config.RunConfig  # pylint: disable=invalid-name
@@ -360,7 +379,11 @@ class BaseEstimator(
       logging.info('Using default config.')
     else:
       self._config = config
-    logging.info('Using config: %s', str(vars(self._config)))
+
+    if self._config.session_config is None:
+      self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    else:
+      self._session_config = self._config.session_config
 
     # Model directory.
     if (model_dir is not None) and (self._config.model_dir is not None):
@@ -380,6 +403,7 @@ class BaseEstimator(
                       self._model_dir)
     if self._config.model_dir is None:
       self._config = self._config.replace(model_dir=self._model_dir)
+    logging.info('Using config: %s', str(vars(self._config)))
 
     # Set device function depending if there are replicas or not.
     self._device_fn = _get_replica_device_setter(self._config)
@@ -679,7 +703,7 @@ class BaseEstimator(
   def _get_eval_ops(self, features, labels, metrics):
     """Method that builds model graph and returns evaluation ops.
 
-    Expected to be overriden by sub-classes that require custom support.
+    Expected to be overridden by sub-classes that require custom support.
 
     Args:
       features: `Tensor` or `dict` of `Tensor` objects.
@@ -829,7 +853,7 @@ class BaseEstimator(
           eval_ops=update_op,
           final_ops=eval_dict,
           hooks=hooks,
-          config=config_pb2.ConfigProto(allow_soft_placement=True))
+          config=self._session_config)
       current_global_step = eval_results[global_step_key]
 
       _write_dict_to_summary(eval_dir, eval_results, current_global_step)
@@ -864,7 +888,7 @@ class BaseEstimator(
           session_creator=monitored_session.ChiefSessionCreator(
               checkpoint_filename_with_path=checkpoint_path,
               scaffold=infer_ops.scaffold,
-              config=config_pb2.ConfigProto(allow_soft_placement=True)))
+              config=self._session_config))
       if not as_iterable:
         with mon_sess:
           if not mon_sess.should_stop():
@@ -949,7 +973,8 @@ class BaseEstimator(
             saver.Saver(
                 sharded=True,
                 max_to_keep=self._config.keep_checkpoint_max,
-                defer_build=True))
+                defer_build=True,
+                save_relative_paths=True))
 
       chief_hooks = []
       if (self._config.save_checkpoints_secs or
@@ -976,7 +1001,7 @@ class BaseEstimator(
           chief_only_hooks=chief_hooks + model_fn_ops.training_chief_hooks,
           save_checkpoint_secs=0,  # Saving is handled by a hook.
           save_summaries_steps=self._config.save_summary_steps,
-          config=config_pb2.ConfigProto(allow_soft_placement=True)
+          config=self._session_config
       ) as mon_sess:
         loss = None
         while not mon_sess.should_stop():
@@ -1065,10 +1090,11 @@ class Estimator(BaseEstimator):
     super(Estimator, self).__init__(model_dir=model_dir, config=config)
     if model_fn is not None:
       # Check number of arguments of the given function matches requirements.
-      model_fn_args = _get_arguments(model_fn)
+      model_fn_args = _model_fn_args(model_fn)
       if params is not None and 'params' not in model_fn_args:
-        raise ValueError('Estimator\'s model_fn (%s) has less than 4 '
-                         'arguments, but not None params (%s) are passed.' %
+        raise ValueError('Estimator\'s model_fn (%s) does not have a params '
+                         'argument, but params (%s) were passed to the '
+                         'Estimator\'s constructor.' %
                          (model_fn, params))
       if params is None and 'params' in model_fn_args:
         logging.warning('Estimator\'s model_fn (%s) includes params '
@@ -1095,7 +1121,7 @@ class Estimator(BaseEstimator):
       ValueError: if model_fn returns invalid objects.
     """
     features, labels = self._feature_engineering_fn(features, labels)
-    model_fn_args = _get_arguments(self._model_fn)
+    model_fn_args = _model_fn_args(self._model_fn)
     kwargs = {}
     if 'mode' in model_fn_args:
       kwargs['mode'] = mode
@@ -1123,7 +1149,7 @@ class Estimator(BaseEstimator):
   def _get_train_ops(self, features, labels):
     """Method that builds model graph and returns trainer ops.
 
-    Expected to be overriden by sub-classes that require custom support.
+    Expected to be overridden by sub-classes that require custom support.
     This implementation uses `model_fn` passed as parameter to constructor to
     build model.
 
@@ -1139,7 +1165,7 @@ class Estimator(BaseEstimator):
   def _get_eval_ops(self, features, labels, metrics):
     """Method that builds model graph and returns evaluation ops.
 
-    Expected to be overriden by sub-classes that require custom support.
+    Expected to be overridden by sub-classes that require custom support.
     This implementation uses `model_fn` passed as parameter to constructor to
     build model.
 
@@ -1178,7 +1204,7 @@ class Estimator(BaseEstimator):
   def _get_predict_ops(self, features):
     """Method that builds model graph and returns prediction ops.
 
-    Expected to be overriden by sub-classes that require custom support.
+    Expected to be overridden by sub-classes that require custom support.
     This implementation uses `model_fn` passed as parameter to constructor to
     build model.
 
@@ -1235,6 +1261,13 @@ class Estimator(BaseEstimator):
       input_alternatives, features = (
           saved_model_export_utils.get_input_alternatives(input_ops))
 
+      # TODO(b/34388557) This is a stopgap, pending recording model provenance.
+      # Record which features are expected at serving time.  It is assumed that
+      # these are the features that were used in training.
+      for feature_key in input_ops.features.keys():
+        ops.add_to_collection(
+            constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS, feature_key)
+
       # Call the model_fn and collect the output alternatives.
       model_fn_ops = self._call_model_fn(features, None,
                                          model_fn_lib.ModeKeys.INFER)
@@ -1263,14 +1296,11 @@ class Estimator(BaseEstimator):
       else:
         saver_for_restore = saver.Saver(sharded=True)
       with tf_session.Session('') as session:
-        variables.initialize_local_variables()
-        data_flow_ops.tables_initializer()
-        resources.initialize_resources(resources.shared_resources())
         saver_for_restore.restore(session, checkpoint_path)
         init_op = control_flow_ops.group(
             variables.local_variables_initializer(),
             resources.initialize_resources(resources.shared_resources()),
-            data_flow_ops.tables_initializer())
+            lookup_ops.tables_initializer())
 
         # Perform the export
         builder = saved_model_builder.SavedModelBuilder(export_dir)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 1dc362beb8975571d240aff2b2e7655877e0b704..c95df75356b70663180c5e3fbb5bb5b6d84aeffa 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -28,6 +28,8 @@ import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from google.protobuf import text_format
+
 from tensorflow.contrib import learn
 from tensorflow.contrib import lookup
 from tensorflow.contrib.framework.python.ops import variables
@@ -38,6 +40,7 @@ from tensorflow.contrib.learn.python.learn import models
 from tensorflow.contrib.learn.python.learn import monitors as monitors_lib
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import _sklearn
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import linear
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
@@ -49,9 +52,10 @@ from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import variables as variables_lib
@@ -60,9 +64,9 @@ from tensorflow.python.platform import test
 from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import checkpoint_state_pb2
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import monitored_session
-from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
@@ -81,18 +85,6 @@ def boston_input_fn(num_epochs=None):
   return features, labels
 
 
-def boston_input_fn_with_queue(num_epochs=None):
-  features, labels = boston_input_fn(num_epochs=num_epochs)
-
-  # Create a minimal queue runner.
-  fake_queue = data_flow_ops.FIFOQueue(30, dtypes.int32)
-  queue_runner = queue_runner_impl.QueueRunner(fake_queue,
-                                               [constant_op.constant(0)])
-  queue_runner_impl.add_queue_runner(queue_runner)
-
-  return features, labels
-
-
 def iris_input_fn():
   iris = base.load_iris()
   features = array_ops.reshape(
@@ -295,33 +287,93 @@ class CheckCallsMonitor(monitors_lib.BaseMonitor):
             self.begin_calls == self.expect_calls)
 
 
-class EstimatorTest(test.TestCase):
+def _model_fn_ops(
+    expected_features, expected_labels, actual_features, actual_labels, mode):
+  assert_ops = tuple([
+      check_ops.assert_equal(
+          expected_features[k], actual_features[k], name='assert_%s' % k)
+      for k in expected_features
+  ] + [
+      check_ops.assert_equal(
+          expected_labels, actual_labels, name='assert_labels')
+  ])
+  with ops.control_dependencies(assert_ops):
+    return model_fn.ModelFnOps(
+        mode=mode,
+        predictions=constant_op.constant(0.),
+        loss=constant_op.constant(0.),
+        train_op=constant_op.constant(0.))
+
+
+def _make_input_fn(features, labels):
+  def _input_fn():
+    return {
+        k: constant_op.constant(v)
+        for k, v in six.iteritems(features)
+    }, constant_op.constant(labels)
+  return _input_fn
 
-  def testExperimentIntegration(self):
-    exp = experiment.Experiment(
-        estimator=estimator.Estimator(model_fn=linear_model_fn),
-        train_input_fn=boston_input_fn,
-        eval_input_fn=boston_input_fn)
-    exp.test()
+
+class EstimatorModelFnTest(test.TestCase):
 
   def testModelFnArgs(self):
-    expected_param = {'some_param': 'some_value'}
+    features = {'x': 42., 'y': 43.}
+    labels = 44.
+    expected_params = {'some_param': 'some_value'}
     expected_config = run_config.RunConfig()
     expected_config.i_am_test = True
 
-    def _argument_checker(features, labels, mode, params, config):
-      _, _ = features, labels
+    # TODO(ptucker): We have to roll our own mock since Estimator._get_arguments
+    # doesn't work with mock fns.
+    model_fn_call_count = [0]
+
+    # `features` and `labels` are passed by position, `arg0` and `arg1` here.
+    def _model_fn(arg0, arg1, mode, params, config):
+      model_fn_call_count[0] += 1
+      self.assertItemsEqual(features.keys(), arg0.keys())
       self.assertEqual(model_fn.ModeKeys.TRAIN, mode)
-      self.assertEqual(expected_param, params)
+      self.assertEqual(expected_params, params)
       self.assertTrue(config.i_am_test)
-      return constant_op.constant(0.), constant_op.constant(
-          0.), constant_op.constant(0.)
+      return _model_fn_ops(features, labels, arg0, arg1, mode)
 
     est = estimator.Estimator(
-        model_fn=_argument_checker,
-        params=expected_param,
+        model_fn=_model_fn, params=expected_params, config=expected_config)
+    self.assertEqual(0, model_fn_call_count[0])
+    est.fit(input_fn=_make_input_fn(features, labels), steps=1)
+    self.assertEqual(1, model_fn_call_count[0])
+
+  def testPartialModelFnArgs(self):
+    features = {'x': 42., 'y': 43.}
+    labels = 44.
+    expected_params = {'some_param': 'some_value'}
+    expected_config = run_config.RunConfig()
+    expected_config.i_am_test = True
+    expected_foo = 45.
+    expected_bar = 46.
+
+    # TODO(ptucker): We have to roll our own mock since Estimator._get_arguments
+    # doesn't work with mock fns.
+    model_fn_call_count = [0]
+
+    # `features` and `labels` are passed by position, `arg0` and `arg1` here.
+    def _model_fn(arg0, arg1, foo, mode, params, config, bar):
+      model_fn_call_count[0] += 1
+      self.assertEqual(expected_foo, foo)
+      self.assertEqual(expected_bar, bar)
+      self.assertItemsEqual(features.keys(), arg0.keys())
+      self.assertEqual(model_fn.ModeKeys.TRAIN, mode)
+      self.assertEqual(expected_params, params)
+      self.assertTrue(config.i_am_test)
+      return _model_fn_ops(features, labels, arg0, arg1, mode)
+    partial_model_fn = functools.partial(
+        _model_fn, foo=expected_foo, bar=expected_bar)
+
+    est = estimator.Estimator(
+        model_fn=partial_model_fn, params=expected_params,
         config=expected_config)
-    est.fit(input_fn=boston_input_fn, steps=1)
+    self.assertEqual(0, model_fn_call_count[0])
+    est.fit(input_fn=_make_input_fn(features, labels), steps=1)
+    self.assertEqual(1, model_fn_call_count[0])
 
   def testModelFnWithModelDir(self):
     expected_param = {'some_param': 'some_value'}
@@ -447,6 +499,16 @@ class EstimatorTest(test.TestCase):
     est.export_savedmodel(est.model_dir + '/export', serving_input_fn)
     self.assertTrue(self.mock_saver.restore.called)
 
+
+class EstimatorTest(test.TestCase):
+
+  def testExperimentIntegration(self):
+    exp = experiment.Experiment(
+        estimator=estimator.Estimator(model_fn=linear_model_fn),
+        train_input_fn=boston_input_fn,
+        eval_input_fn=boston_input_fn)
+    exp.test()
+
   def testCheckpointSaverHookSuppressesTheDefaultOne(self):
     saver_hook = test.mock.Mock(
         spec=basic_session_run_hooks.CheckpointSaverHook)
@@ -616,6 +678,38 @@ class EstimatorTest(test.TestCase):
         metrics={'MSE': metric_ops.streaming_mean_squared_error})
     self.assertLess(scores3['MSE'], scores['MSE'])
 
+  def test_checkpoint_contains_relative_paths(self):
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(
+        model_dir=tmpdir,
+        model_fn=linear_model_fn_with_model_fn_ops)
+    est.fit(input_fn=boston_input_fn, steps=5)
+
+    checkpoint_file_content = file_io.read_file_to_string(
+        os.path.join(tmpdir, 'checkpoint'))
+    ckpt = checkpoint_state_pb2.CheckpointState()
+    text_format.Merge(checkpoint_file_content, ckpt)
+    self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
+    self.assertAllEqual(
+        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+
+  def test_train_save_copy_reload(self):
+    tmpdir = tempfile.mkdtemp()
+    model_dir1 = os.path.join(tmpdir, 'model_dir1')
+    est1 = estimator.Estimator(
+        model_dir=model_dir1,
+        model_fn=linear_model_fn_with_model_fn_ops)
+    est1.fit(input_fn=boston_input_fn, steps=5)
+
+    model_dir2 = os.path.join(tmpdir, 'model_dir2')
+    os.renames(model_dir1, model_dir2)
+    est2 = estimator.Estimator(
+        model_dir=model_dir2,
+        model_fn=linear_model_fn_with_model_fn_ops)
+    self.assertEqual(5, est2.get_variable_value('global_step'))
+    est2.fit(input_fn=boston_input_fn, steps=5)
+    self.assertEqual(10, est2.get_variable_value('global_step'))
+
   def testEstimatorParams(self):
     boston = base.load_boston()
     est = estimator.SKCompat(
@@ -852,6 +946,10 @@ class EstimatorTest(test.TestCase):
         self.assertTrue('input_example_tensor' in graph_ops)
         self.assertTrue('ParseExample/ParseExample' in graph_ops)
         self.assertTrue('linear/linear/feature/matmul' in graph_ops)
+        self.assertSameElements(
+            ['bogus_lookup', 'feature'],
+            graph.get_collection(
+                constants.COLLECTION_DEF_KEY_FOR_INPUT_FEATURE_KEYS))
 
     # cleanup
     gfile.DeleteRecursively(tmpdir)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test_utils.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test_utils.py
index eb0cf51e098b0ac326270a6fbc1b56f7e4cd3a80..fd47710e3015de9ae6a453f98978b0ef8f88968c 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test_utils.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test_utils.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
+from tensorflow.python.util import tf_inspect
 
 
 def assert_estimator_contract(tester, estimator_class):
@@ -31,7 +31,7 @@ def assert_estimator_contract(tester, estimator_class):
     tester: A tf.test.TestCase.
     estimator_class: 'type' object of pre-canned estimator.
   """
-  attributes = inspect.getmembers(estimator_class)
+  attributes = tf_inspect.getmembers(estimator_class)
   attribute_names = [a[0] for a in attributes]
 
   tester.assertTrue('config' in attribute_names)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 028a13ca20a1d1684a52ca4de4fe850c2b2cff62..d270d89c12b16dcfdaf989a83896d82c88999297 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
-import inspect
 
 import six
 
@@ -42,10 +41,14 @@ from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import training
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 
 class Head(object):
@@ -160,10 +163,10 @@ class Head(object):
           ModeFnOps.loss to compute and apply gradients.
       logits: logits `Tensor` to be used by the head.
       logits_input: `Tensor` from which to build logits, often needed when you
-        don't want to compute the logits. Typicaly this is the activation of the
-        last hidden layer in a DNN. Some heads (like the ones responsible for
-        candidate sampling) intrinsically avoid computing full logits and only
-        accepts logits_input.
+        don't want to compute the logits. Typically this is the activation of
+        the last hidden layer in a DNN. Some heads (like the ones responsible
+        for candidate sampling) intrinsically avoid computing full logits and
+        only accepts logits_input.
       scope: Optional scope for `variable_scope`.
 
     Returns:
@@ -376,7 +379,12 @@ def multi_label_head(n_classes,
                      loss_fn=None):
   """Creates a Head for multi label classification.
 
-  The Head uses sigmoid cross entropy loss.
+  Multi-label classification handles the case where each example may have zero
+  or more associated labels, from a discrete set.  This is distinct from
+  `multi_class_head` which has exactly one label from a discrete set.
+
+  This head by default uses sigmoid cross entropy loss, which expects as input
+  a multi-hot tensor of shape `(batch_size, num_classes)`.
 
   Args:
     n_classes: Integer, number of classes, must be >= 2
@@ -611,6 +619,8 @@ def _create_model_fn_ops(features,
   if (mode != model_fn.ModeKeys.INFER) and (labels is not None):
     weight_tensor = _weight_tensor(features, weight_column_name)
     loss, weighted_average_loss = loss_fn(labels, logits, weight_tensor)
+    # Uses the deprecated API to set the tag explicitly.
+    # Without it, training and eval losses will show up in different graphs.
     logging_ops.scalar_summary(
         _summary_key(head_name, mkey.LOSS), weighted_average_loss)
 
@@ -816,7 +826,8 @@ class _BinaryLogisticHead(_SingleHead):
           loss_fn=self._loss_fn,
           logits_to_predictions_fn=self._logits_to_predictions,
           metrics_fn=self._metrics,
-          create_output_alternatives_fn=self._create_output_alternatives,
+          create_output_alternatives_fn=_classification_output_alternatives(
+              self.head_name, self._problem_type),
           labels=labels,
           train_op_fn=train_op_fn,
           logits=logits,
@@ -885,6 +896,8 @@ class _BinaryLogisticHead(_SingleHead):
           _indicator_labels_streaming_mean(labels, weights))
       metrics[_summary_key(self.head_name, mkey.AUC)] = (
           _streaming_auc(logistic, labels, weights))
+      metrics[_summary_key(self.head_name, mkey.AUC_PR)] = (
+          _streaming_auc(logistic, labels, weights, curve="PR"))
 
       for threshold in self._thresholds:
         metrics[_summary_key(
@@ -913,12 +926,21 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None):
     if not labels.dtype.is_integer:
       raise ValueError("Labels dtype should be integer "
                        "Instead got %s." % labels.dtype)
-    # TODO(ptucker): This will break for dynamic shapes.
+
     # sparse_softmax_cross_entropy_with_logits requires [batch_size] labels.
+    is_squeezed_labels = False
+    # TODO(ptucker): This will break for dynamic shapes.
     if len(labels.get_shape()) == 2:
       labels = array_ops.squeeze(labels, squeeze_dims=(1,))
+      is_squeezed_labels = True
+
     loss = nn.sparse_softmax_cross_entropy_with_logits(
         labels=labels, logits=logits, name=name)
+
+    # Restore squeezed dimension, if necessary, so loss matches weights shape.
+    if is_squeezed_labels:
+      loss = array_ops.expand_dims(loss, axis=(1,))
+
     return _compute_weighted_loss(loss, weights)
 
 
@@ -1009,7 +1031,8 @@ class _MultiClassHead(_SingleHead):
           loss_fn=self._wrapped_loss_fn,
           logits_to_predictions_fn=self._logits_to_predictions,
           metrics_fn=self._metrics,
-          create_output_alternatives_fn=self._create_output_alternatives,
+          create_output_alternatives_fn=_classification_output_alternatives(
+              self.head_name, self._problem_type, self._label_keys),
           labels=labels,
           train_op_fn=train_op_fn,
           logits=logits,
@@ -1113,31 +1136,12 @@ class _MultiClassHead(_SingleHead):
 
     return metrics
 
-  def _create_output_alternatives(self, predictions):
-    """See superclass."""
-    probabilities = predictions[prediction_key.PredictionKey.PROBABILITIES]
-    batch_size = array_ops.shape(probabilities)[0]
-    if self._label_keys:
-      classes = array_ops.tile(
-          input=array_ops.expand_dims(input=self._label_keys, axis=0),
-          multiples=[batch_size, 1])
-    else:
-      classes = array_ops.tile(
-          input=array_ops.expand_dims(
-              input=math_ops.range(self.logits_dimension), axis=0),
-          multiples=[batch_size, 1])
-    predictions_for_serving = {
-        prediction_key.PredictionKey.CLASSES: classes,
-        prediction_key.PredictionKey.PROBABILITIES: probabilities,
-    }
-    return {self._head_name: (self._problem_type, predictions_for_serving)}
-
 
 def _to_labels_tensor(labels, label_name):
   """Returns label as a tensor.
 
   Args:
-    labels: Label `Tensor` or `SparseTensor` or a dict containig labels.
+    labels: Label `Tensor` or `SparseTensor` or a dict containing labels.
     label_name: Label name if labels is a dict.
 
   Returns:
@@ -1226,6 +1230,7 @@ class _BinarySvmHead(_SingleHead):
           loss_fn=self._loss_fn,
           logits_to_predictions_fn=self._logits_to_predictions,
           metrics_fn=self._metrics,
+          # TODO(zakaria): Handle labels for export.
           create_output_alternatives_fn=self._create_output_alternatives,
           labels=labels,
           train_op_fn=train_op_fn,
@@ -1325,7 +1330,8 @@ class _MultiLabelHead(_SingleHead):
           loss_fn=self._loss_fn,
           logits_to_predictions_fn=self._logits_to_predictions,
           metrics_fn=self._metrics,
-          create_output_alternatives_fn=self._create_output_alternatives,
+          create_output_alternatives_fn=_classification_output_alternatives(
+              self.head_name, self._problem_type),
           labels=labels,
           train_op_fn=train_op_fn,
           logits=logits,
@@ -1374,6 +1380,8 @@ class _MultiLabelHead(_SingleHead):
           metrics_lib.streaming_accuracy(classes, labels, weights))
       metrics[_summary_key(self.head_name, mkey.AUC)] = _streaming_auc(
           probabilities, labels, weights)
+      metrics[_summary_key(self.head_name, mkey.AUC_PR)] = _streaming_auc(
+          probabilities, labels, weights, curve="PR")
 
       for class_id in self._metric_class_ids:
         # TODO(ptucker): Add per-class accuracy, precision, recall.
@@ -1391,6 +1399,9 @@ class _MultiLabelHead(_SingleHead):
                 _predictions_streaming_mean(logits, weights, class_id))
         metrics[_summary_key(self.head_name, mkey.CLASS_AUC % class_id)] = (
             _streaming_auc(probabilities, labels, weights, class_id))
+        metrics[_summary_key(self.head_name, mkey.CLASS_AUC_PR % class_id)] = (
+            _streaming_auc(probabilities, labels, weights, class_id,
+                           curve="PR"))
 
     return metrics
 
@@ -1564,7 +1575,7 @@ class _MultiHead(Head):
     Args:
       all_model_fn_ops: list of ModelFnOps for the individual heads.
       train_op_fn: Function to create train op. See `create_model_fn_ops`
-          documentaion for more details.
+          documentation for more details.
 
     Returns:
       ModelFnOps that merges all heads for TRAIN.
@@ -1635,12 +1646,27 @@ class _MultiHead(Head):
 
 
 def _weight_tensor(features, weight_column_name):
-  """Returns weights as 1d `Tensor`."""
+  """Returns weights as `Tensor` of rank 0, or at least 2."""
   if not weight_column_name:
     return None
-  with ops.name_scope(None, "weight_tensor",
-                      tuple(six.itervalues(features))):
-    return math_ops.to_float(features[weight_column_name])
+  if weight_column_name not in features:
+    raise ValueError("Weights {} missing from features.".format(
+        weight_column_name))
+  with ops.name_scope(None, "weight_tensor", tuple(six.itervalues(features))):
+    weight_tensor = math_ops.to_float(features[weight_column_name])
+    shape = weight_tensor.get_shape()
+    rank = shape.ndims
+    # We don't bother with expanding dims of non-staticly shaped tensors or
+    # scalars, and >1d is already in a good format.
+    if rank == 1:
+      logging.warning(
+          "Weights {} has shape {}, expanding to make it 2d.",
+          weight_column_name, shape)
+      return (
+          sparse_ops.sparse_reshape(weight_tensor, (-1, 1))
+          if isinstance(weight_tensor, sparse_tensor.SparseTensor) else
+          array_ops.reshape(weight_tensor, (-1, 1)))
+    return weight_tensor
 
 
 # TODO(zakaria): This function is needed for backward compatibility and should
@@ -1665,19 +1691,16 @@ def _compute_weighted_loss(loss_unweighted, weight, name="loss"):
     name: Optional name
 
   Returns:
-    A tuple of losses. First one for training and the second one for reproting.
+    A tuple of losses. First one for training and the second one for reporting.
   """
   with ops.name_scope(name, values=(loss_unweighted, weight)) as name_scope:
     if weight is None:
       loss = math_ops.reduce_mean(loss_unweighted, name=name_scope)
       return loss, loss
+    weight = weights_broadcast_ops.broadcast_weights(weight, loss_unweighted)
     with ops.name_scope(None, "weighted_loss",
                         (loss_unweighted, weight)) as name:
-      weighted_loss = math_ops.multiply(
-          array_ops.reshape(loss_unweighted, shape=(-1,)),
-          array_ops.reshape(weight, shape=(-1,)), name=name)
-    # TODO(ptucker): This might be wrong if weights are broadcast to loss shape.
-    # We should use tf.losses here.
+      weighted_loss = math_ops.multiply(loss_unweighted, weight, name=name)
     weighted_loss_mean = math_ops.reduce_mean(weighted_loss, name=name_scope)
     weighted_loss_normalized = math_ops.div(
         math_ops.reduce_sum(weighted_loss),
@@ -1706,9 +1729,10 @@ def _check_mode_valid(mode):
 
 def _get_arguments(func):
   """Returns a spec of given func."""
+  _, func = tf_decorator.unwrap(func)
   if hasattr(func, "__code__"):
     # Regular function.
-    return inspect.getargspec(func)
+    return tf_inspect.getargspec(func)
   elif hasattr(func, "__call__"):
     # Callable object.
     return _get_arguments(func.__call__)
@@ -1742,7 +1766,7 @@ def _centered_bias(logits_dimension, head_name=None):
   # Do not create a variable with variable_scope.get_variable, because that may
   # create a PartitionedVariable, which does not support indexing, so
   # summary.scalar will not work.
-  centered_bias = variables.Variable(
+  centered_bias = variable_scope.variable(
       name="centered_bias_weight",
       initial_value=array_ops.zeros(shape=(logits_dimension,)),
       trainable=True)
@@ -1811,8 +1835,13 @@ def _float_weights_or_none(weights):
 
 
 def _indicator_labels_streaming_mean(labels, weights=None, class_id=None):
-  labels = ops.convert_to_tensor(labels)
+  labels = math_ops.to_float(labels)
+  weights = _float_weights_or_none(weights)
+  if weights is not None:
+    weights = weights_broadcast_ops.broadcast_weights(weights, labels)
   if class_id is not None:
+    if weights is not None:
+      weights = weights[:, class_id]
     labels = labels[:, class_id]
   return metrics_lib.streaming_mean(labels, weights=weights)
 
@@ -1820,11 +1849,13 @@ def _indicator_labels_streaming_mean(labels, weights=None, class_id=None):
 def _predictions_streaming_mean(predictions,
                                 weights=None,
                                 class_id=None):
-  predictions = ops.convert_to_tensor(predictions)
+  predictions = math_ops.to_float(predictions)
+  weights = _float_weights_or_none(weights)
   if weights is not None:
-    weights = ops.convert_to_tensor(weights)
-
+    weights = weights_broadcast_ops.broadcast_weights(weights, predictions)
   if class_id is not None:
+    if weights is not None:
+      weights = weights[:, class_id]
     predictions = predictions[:, class_id]
   return metrics_lib.streaming_mean(predictions, weights=weights)
 
@@ -1857,16 +1888,23 @@ def _class_labels_streaming_mean(labels, weights, class_id):
       weights=weights)
 
 
-def _streaming_auc(predictions, labels, weights=None, class_id=None):
-  predictions = ops.convert_to_tensor(predictions)
-  labels = ops.convert_to_tensor(labels)
+def _streaming_auc(predictions, labels, weights=None, class_id=None,
+                   curve="ROC"):
+  # pylint: disable=missing-docstring
+  predictions = math_ops.to_float(predictions)
+  if labels.dtype.base_dtype != dtypes.bool:
+    logging.warning("Casting %s labels to bool.", labels.dtype)
+    labels = math_ops.cast(labels, dtypes.bool)
+  weights = _float_weights_or_none(weights)
+  if weights is not None:
+    weights = weights_broadcast_ops.broadcast_weights(weights, predictions)
   if class_id is not None:
+    if weights is not None:
+      weights = weights[:, class_id]
     predictions = predictions[:, class_id]
     labels = labels[:, class_id]
   return metrics_lib.streaming_auc(
-      predictions,
-      math_ops.cast(labels, dtypes.bool),
-      weights=_float_weights_or_none(weights))
+      predictions, labels, weights=weights, curve=curve)
 
 
 def _assert_class_id(class_id, num_classes=None):
@@ -1901,6 +1939,71 @@ def _streaming_recall_at_threshold(predictions, labels, weights, threshold):
   return array_ops.squeeze(precision_tensor), array_ops.squeeze(update_op)
 
 
+def _classification_output_alternatives(head_name, problem_type,
+                                        label_keys=None):
+  """Creates a func to generate output alternatives for classification.
+
+  Servo expects classes to be a string tensor, and have the same dimensions
+  as the probabilities tensor. It should contain the labels of the corresponding
+  entries in probabilities. This function creates a new classes tensor that
+  satisfies these conditions and can be exported.
+
+  Args:
+    head_name: Name of the head.
+    problem_type: `ProblemType`
+    label_keys: Optional label keys
+
+  Returns:
+    A function to generate output alternatives.
+  """
+  def _create_output_alternatives(predictions):
+    """Creates output alternative for the Head.
+
+    Args:
+      predictions: a dict of {tensor_name: Tensor}, where 'tensor_name' is a
+        symbolic name for an output Tensor possibly but not necessarily taken
+        from `PredictionKey`, and 'Tensor' is the corresponding output Tensor
+        itself.
+
+    Returns:
+      `dict` of {submodel_name: (problem_type, {tensor_name: Tensor})}, where
+      'submodel_name' is a submodel identifier that should be consistent across
+      the pipeline (here likely taken from the head_name),
+      'problem_type' is a `ProblemType`,
+      'tensor_name' is a symbolic name for an output Tensor possibly but not
+       necessarily taken from `PredictionKey`, and
+      'Tensor' is the corresponding output Tensor itself.
+
+    Raises:
+      ValueError: if predictions does not have PredictionKey.PROBABILITIES key.
+    """
+    probabilities = predictions.get(prediction_key.PredictionKey.PROBABILITIES)
+    if probabilities is None:
+      raise ValueError("%s missing in predictions" %
+                       prediction_key.PredictionKey.PROBABILITIES)
+
+    with ops.name_scope(None, "_classification_output_alternatives",
+                        (probabilities,)):
+      batch_size = array_ops.shape(probabilities)[0]
+      if label_keys:
+        classes = array_ops.tile(
+            input=array_ops.expand_dims(input=label_keys, axis=0),
+            multiples=[batch_size, 1],
+            name="classes_tensor")
+      else:
+        n = array_ops.shape(probabilities)[1]
+        classes = array_ops.tile(
+            input=array_ops.expand_dims(input=math_ops.range(n), axis=0),
+            multiples=[batch_size, 1])
+        classes = string_ops.as_string(classes, name="classes_tensor")
+
+    exported_predictions = {
+        prediction_key.PredictionKey.PROBABILITIES: probabilities,
+        prediction_key.PredictionKey.CLASSES: classes}
+    return {head_name: (problem_type, exported_predictions)}
+
+  return _create_output_alternatives
+
 # Aliases
 # TODO(zakaria): Remove these aliases, See b/34751732
 _regression_head = regression_head
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
index ecc1d9ff9e139f7764a064afce592ea411f9a254..012b919d63147f2472ff6a4fc03f0dee7a60968a 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@@ -32,11 +32,10 @@ from tensorflow.core.framework import summary_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses as losses_lib
 from tensorflow.python.platform import test
-# pylint: enable=g-bad-todo,g-import-not-at-top
 
 
 def _assert_variables(test_case,
@@ -226,20 +225,56 @@ class RegressionHeadTest(test.TestCase):
       _assert_summary_tags(self, ["loss"])
       _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
 
-  def testRegressionWithWeights(self):
+  def testRegressionWithScalarWeights(self):
+    head = head_lib.regression_head(weight_column_name="label_weight")
+    with ops.Graph().as_default(), session.Session():
+      weights = 2.
+      labels = ((0.,), (1.,), (1.,))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=((1.,), (1.,), (3.,)))
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      _assert_metrics(self, (weights * 5.) / len(labels), {
+          "loss": (weights * 5.) / (weights * len(labels))
+      }, model_fn_ops)
+
+  def testRegressionWith1DWeights(self):
+    head = head_lib.regression_head(weight_column_name="label_weight")
+    with ops.Graph().as_default(), session.Session():
+      weights = (2., 5., 0.)
+      labels = ((0.,), (1.,), (1.,))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=((1.,), (1.,), (3.,)))
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      _assert_metrics(self, 2. / len(labels), {"loss": 2. / np.sum(weights)},
+                      model_fn_ops)
+
+  def testRegressionWith2DWeights(self):
     head = head_lib.regression_head(weight_column_name="label_weight")
     with ops.Graph().as_default(), session.Session():
       weights = ((2.,), (5.,), (0.,))
+      labels = ((0.,), (1.,), (1.,))
       model_fn_ops = head.create_model_fn_ops(
           features={"label_weight": weights},
-          labels=((0.,), (1.,), (1.,)),
+          labels=labels,
           mode=model_fn.ModeKeys.TRAIN,
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1.,), (1.,), (3.,)))
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
       _assert_summary_tags(self, ["loss"])
-      _assert_metrics(self, 2. / len(weights), {"loss": 2. / np.sum(weights)},
+      _assert_metrics(self, 2. / len(labels), {"loss": 2. / np.sum(weights)},
                       model_fn_ops)
 
   def testRegressionWithCenteredBias(self):
@@ -260,8 +295,10 @@ class RegressionHeadTest(test.TestCase):
           ),
           expected_trainable=("regression_head/centered_bias_weight:0",))
       variables.global_variables_initializer().run()
-      _assert_summary_tags(
-          self, ["loss", "regression_head/centered_bias/bias_0"])
+      _assert_summary_tags(self, [
+          "loss",
+          "regression_head/centered_bias/bias_0"
+      ])
       _assert_metrics(self, 5. / 3, {"loss": 5. / 3}, model_fn_ops)
 
   def testRegressionErrorInSparseTensorLabels(self):
@@ -297,11 +334,15 @@ class MultiLabelHeadTest(test.TestCase):
   def _expected_eval_metrics(self, expected_loss):
     return {
         "accuracy": 1. / 3,
-        "auc": 1. / 4,
         "loss": expected_loss,
+        "auc": 1. / 4,
         "auc/class0": 1.,
         "auc/class1": 1.,
         "auc/class2": 0.,
+        "auc_precision_recall": 0.166667,
+        "auc_precision_recall/class0": 0,
+        "auc_precision_recall/class1": 0.,
+        "auc_precision_recall/class2": 1.,
         "labels/actual_label_mean/class0": self._labels[0][0],
         "labels/actual_label_mean/class1": self._labels[0][1],
         "labels/actual_label_mean/class2": self._labels[0][2],
@@ -417,7 +458,7 @@ class MultiLabelHeadTest(test.TestCase):
             {}, model_fn.ModeKeys.TRAIN, self._labels, head_lib.no_op_train_fn,
             logits_input=((0., 0.),), logits=self._logits)
 
-  def testMultiLabelEvalMode(self):
+  def testMultiLabelEval(self):
     n_classes = 3
     head = head_lib.multi_label_head(
         n_classes=n_classes, metric_class_ids=range(n_classes))
@@ -433,7 +474,7 @@ class MultiLabelHeadTest(test.TestCase):
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
 
-  def testMultiClassEvalModeWithLargeLogits(self):
+  def testMultiClassEvalWithLargeLogits(self):
     n_classes = 3
     head = head_lib.multi_label_head(
         n_classes=n_classes, metric_class_ids=range(n_classes))
@@ -472,6 +513,36 @@ class MultiLabelHeadTest(test.TestCase):
       _assert_metrics(self, expected_loss,
                       expected_eval_metrics, model_fn_ops)
 
+  def testMultiLabelInfer(self):
+    n_classes = 3
+    head = head_lib.multi_label_head(n_classes=n_classes, head_name="head_name")
+    with ops.Graph().as_default(), session.Session():
+      model_fn_ops = head.create_model_fn_ops(
+          {}, model_fn.ModeKeys.INFER, self._labels, head_lib.no_op_train_fn,
+          logits=((1., 0., 0.), (0., 0., 1)))
+      self.assertIsNone(model_fn_ops.train_op)
+      _assert_no_variables(self)
+      with session.Session():
+        self.assertListEqual(
+            [1, 0, 0], model_fn_ops.predictions["classes"].eval().tolist()[0])
+        self.assertItemsEqual(
+            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertEqual(
+            constants.ProblemType.CLASSIFICATION,
+            model_fn_ops.output_alternatives["head_name"][0])
+
+        predictions_for_serving = (
+            model_fn_ops.output_alternatives["head_name"][1])
+        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertAllEqual(
+            [[b"0", b"1", b"2"], [b"0", b"1", b"2"]],
+            predictions_for_serving["classes"].eval())
+        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertAllClose(
+            [[0.731059, 0.5, 0.5],
+             [0.5, 0.5, 0.731059,]],
+            predictions_for_serving["probabilities"].eval())
+
   def testMultiLabelWithLabelName(self):
     n_classes = 3
     label_name = "my_label"
@@ -490,7 +561,7 @@ class MultiLabelHeadTest(test.TestCase):
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
 
-  def testMultiLabelWithWeight(self):
+  def testMultiLabelWithScalarWeight(self):
     n_classes = 3
     head = head_lib.multi_label_head(
         n_classes=n_classes,
@@ -507,7 +578,42 @@ class MultiLabelHeadTest(test.TestCase):
       _assert_no_variables(self)
       _assert_summary_tags(self, ["loss"])
       _assert_metrics(self, .089985214,
-                      self._expected_eval_metrics(2.69956), model_fn_ops)
+                      self._expected_eval_metrics(.89985214), model_fn_ops)
+
+  def testMultiLabelWith1DWeight(self):
+    n_classes = 3
+    head = head_lib.multi_label_head(
+        n_classes=n_classes,
+        weight_column_name="label_weight",
+        metric_class_ids=range(n_classes))
+    with ops.Graph().as_default(), session.Session():
+      with self.assertRaisesRegexp(
+          ValueError, "weights can not be broadcast to values"):
+        head.create_model_fn_ops(
+            features={"label_weight": (.1, .1, .1)},
+            labels=self._labels,
+            mode=model_fn.ModeKeys.TRAIN,
+            train_op_fn=head_lib.no_op_train_fn,
+            logits=self._logits)
+
+  def testMultiLabelWith2DWeight(self):
+    n_classes = 3
+    head = head_lib.multi_label_head(
+        n_classes=n_classes,
+        weight_column_name="label_weight",
+        metric_class_ids=range(n_classes))
+    with ops.Graph().as_default(), session.Session():
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": ((.1, .1, .1),)},
+          labels=self._labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._logits)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      _assert_metrics(self, .089985214,
+                      self._expected_eval_metrics(.89985214), model_fn_ops)
 
   def testMultiLabelWithCustomLoss(self):
     n_classes = 3
@@ -526,8 +632,9 @@ class MultiLabelHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
       _assert_summary_tags(self, ["loss"])
-      _assert_metrics(self, 0.089985214,
-                      self._expected_eval_metrics(0.089985214), model_fn_ops)
+      expected_loss = .089985214
+      _assert_metrics(self, expected_loss,
+                      self._expected_eval_metrics(expected_loss), model_fn_ops)
 
   def testMultiLabelWithCenteredBias(self):
     n_classes = 3
@@ -621,6 +728,7 @@ class BinaryClassificationHeadTest(test.TestCase):
         "accuracy/baseline_label_mean": label_mean,
         "accuracy/threshold_0.500000_mean": 1. / 2,
         "auc": 1. / 2,
+        "auc_precision_recall": 0.749999,
         "labels/actual_label_mean": label_mean,
         "labels/prediction_mean": .731059,  # softmax
         "loss": expected_loss,
@@ -691,7 +799,7 @@ class BinaryClassificationHeadTest(test.TestCase):
             {}, model_fn.ModeKeys.TRAIN, self._labels, head_lib.no_op_train_fn,
             logits_input=((0., 0.), (0., 0.)), logits=self._logits)
 
-  def testBinaryClassificationEvalMode(self):
+  def testBinaryClassificationEval(self):
     n_classes = 2
     head = head_lib.multi_class_head(n_classes=n_classes)
     with ops.Graph().as_default(), session.Session():
@@ -708,20 +816,34 @@ class BinaryClassificationHeadTest(test.TestCase):
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
 
-  def testBinaryClassificationInferMode(self):
+  def testBinaryClassificationInfer(self):
     n_classes = 2
-    head = head_lib.multi_class_head(n_classes=n_classes)
+    head = head_lib.multi_class_head(n_classes=n_classes, head_name="head_name")
     with ops.Graph().as_default(), session.Session():
       # logloss: z:label, x:logit
       # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
       model_fn_ops = head.create_model_fn_ops(
           {}, model_fn.ModeKeys.INFER, self._labels, head_lib.no_op_train_fn,
           logits=self._logits)
-      self._assert_output_alternatives(model_fn_ops)
       self.assertIsNone(model_fn_ops.train_op)
       _assert_no_variables(self)
+      with session.Session():
+        self.assertListEqual(
+            [1, 1], list(model_fn_ops.predictions["classes"].eval()))
+        self.assertItemsEqual(
+            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertEqual(
+            constants.ProblemType.LOGISTIC_REGRESSION,
+            model_fn_ops.output_alternatives["head_name"][0])
+        predictions_for_serving = (
+            model_fn_ops.output_alternatives["head_name"][1])
+        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        predicted_classes = predictions_for_serving["classes"].eval().tolist()
+        self.assertListEqual(
+            [b"0", b"1"], predicted_classes[0])
+        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
 
-  def testBinaryClassificationInferMode_withWightColumn(self):
+  def testBinaryClassificationInferMode_withWeightColumn(self):
     n_classes = 2
     head = head_lib.multi_class_head(n_classes=n_classes,
                                      weight_column_name="label_weight")
@@ -773,7 +895,42 @@ class BinaryClassificationHeadTest(test.TestCase):
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
 
-  def testBinaryClassificationWithWeights(self):
+  def testBinaryClassificationWith1DWeights(self):
+    n_classes = 2
+    head = head_lib.multi_class_head(
+        n_classes=n_classes, weight_column_name="label_weight")
+    with ops.Graph().as_default(), session.Session():
+      weights = (1., 0.)
+      # logloss: z:label, x:logit
+      # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=self._labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._logits)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      expected_total_loss = .31326166
+      _assert_metrics(
+          self,
+          expected_total_loss / len(weights),
+          {
+              "accuracy": 1. / 1,
+              "accuracy/baseline_label_mean": 1. / 1,
+              "accuracy/threshold_0.500000_mean": 1. / 1,
+              "auc": 0. / 1,
+              "labels/actual_label_mean": 1. / 1,
+              "labels/prediction_mean": .731059,  # softmax
+              # eval loss is weighted loss divided by sum of weights.
+              "loss": expected_total_loss,
+              "precision/positive_threshold_0.500000_mean": 1. / 1,
+              "recall/positive_threshold_0.500000_mean": 1. / 1,
+          },
+          model_fn_ops)
+
+  def testBinaryClassificationWith2DWeights(self):
     n_classes = 2
     head = head_lib.multi_class_head(
         n_classes=n_classes, weight_column_name="label_weight")
@@ -825,7 +982,7 @@ class BinaryClassificationHeadTest(test.TestCase):
       _assert_summary_tags(self, ["loss"])
       # logloss: z:label, x:logit
       # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
-      # expected_loss is (total_weighted_loss)/1 since htere is 1 nonzero
+      # expected_loss is (total_weighted_loss)/1 since there is 1 nonzero
       # weight.
       expected_loss = 0.062652342
       _assert_metrics(
@@ -861,8 +1018,10 @@ class BinaryClassificationHeadTest(test.TestCase):
                "Adagrad:0"),),
           expected_trainable=("binary_logistic_head/centered_bias_weight:0",))
       variables.global_variables_initializer().run()
-      _assert_summary_tags(
-          self, ["loss", "binary_logistic_head/centered_bias/bias_0"])
+      _assert_summary_tags(self, [
+          "loss",
+          "binary_logistic_head/centered_bias/bias_0"
+      ])
       expected_loss = .81326175
       _assert_metrics(self, expected_loss,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
@@ -879,7 +1038,7 @@ class MultiClassHeadTest(test.TestCase):
 
   def setUp(self):
     self._logits = ((1., 0., 0.),)
-    self._labels = (2,)
+    self._labels = ((2,),)
 
   def _expected_eval_metrics(self, expected_loss):
     return {
@@ -1006,7 +1165,7 @@ class MultiClassHeadTest(test.TestCase):
                             "multi_class_head/centered_bias/bias_1",
                             "multi_class_head/centered_bias/bias_2"])
 
-  def testMultiClassEvalMode(self):
+  def testMultiClassEval(self):
     n_classes = 3
     head = head_lib.multi_class_head(
         n_classes=n_classes, metric_class_ids=range(n_classes))
@@ -1059,7 +1218,7 @@ class MultiClassHeadTest(test.TestCase):
       _assert_metrics(self, expected_loss,
                       expected_eval_metrics, model_fn_ops)
 
-  def testMultiClassWithWeight(self):
+  def testMultiClassWithScalarWeight(self):
     n_classes = 3
     head = head_lib.multi_class_head(
         n_classes=n_classes,
@@ -1082,6 +1241,54 @@ class MultiClassHeadTest(test.TestCase):
       _assert_metrics(self, expected_loss * weight,
                       self._expected_eval_metrics(expected_loss), model_fn_ops)
 
+  def testMultiClassWith1DWeight(self):
+    n_classes = 3
+    head = head_lib.multi_class_head(
+        n_classes=n_classes,
+        weight_column_name="label_weight",
+        metric_class_ids=range(n_classes))
+    with ops.Graph().as_default(), session.Session():
+      weight = .1
+      weights = (weight,)
+      # logloss: z:label, x:logit
+      # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=self._labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._logits)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      expected_loss = 1.5514447
+      _assert_metrics(self, expected_loss * weight,
+                      self._expected_eval_metrics(expected_loss), model_fn_ops)
+
+  def testMultiClassWith2DWeight(self):
+    n_classes = 3
+    head = head_lib.multi_class_head(
+        n_classes=n_classes,
+        weight_column_name="label_weight",
+        metric_class_ids=range(n_classes))
+    with ops.Graph().as_default(), session.Session():
+      weight = .1
+      weights = ((weight,),)
+      # logloss: z:label, x:logit
+      # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
+      model_fn_ops = head.create_model_fn_ops(
+          features={"label_weight": weights},
+          labels=self._labels,
+          mode=model_fn.ModeKeys.TRAIN,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._logits)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      expected_loss = 1.5514447
+      _assert_metrics(self, expected_loss * weight,
+                      self._expected_eval_metrics(expected_loss), model_fn_ops)
+
   def testMultiClassWithCustomLoss(self):
     n_classes = 3
     head = head_lib.multi_class_head(
@@ -1118,7 +1325,7 @@ class MultiClassHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1., 0., 0.), (0., 0., 1.),))
       with session.Session():
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         self.assertAllEqual(
             [0, 2],
             model_fn_ops.predictions["classes"].eval())
@@ -1131,7 +1338,7 @@ class MultiClassHeadTest(test.TestCase):
             model_fn_ops.output_alternatives["head_name"][1])
         self.assertIn("classes", six.iterkeys(predictions_for_serving))
         self.assertAllEqual(
-            [[0, 1, 2], [0, 1, 2]],
+            [[b"0", b"1", b"2"], [b"0", b"1", b"2"]],
             predictions_for_serving["classes"].eval())
         self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
         self.assertAllClose(
@@ -1170,7 +1377,7 @@ class MultiClassHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1., 0., 0.), (0., 0., 1.),))
       with session.Session():
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         self.assertAllEqual(
             [b"key0", b"key2"],
             model_fn_ops.predictions["classes"].eval())
@@ -1205,7 +1412,7 @@ class MultiClassHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((1., 0., 0.),))
       with session.Session():
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         self.assertIsNone(model_fn_ops.train_op)
         _assert_no_variables(self)
         _assert_summary_tags(self, ["loss"])
@@ -1231,7 +1438,7 @@ class MultiClassHeadTest(test.TestCase):
           train_op_fn=head_lib.no_op_train_fn,
           logits=((0., 0., 1.),))
       with session.Session():
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
         self.assertIsNone(model_fn_ops.train_op)
         _assert_no_variables(self)
         _assert_summary_tags(self, ["loss"])
@@ -1361,11 +1568,12 @@ class BinarySvmHeadTest(test.TestCase):
           "loss": expected_loss,
       }, model_fn_ops)
 
-  def testBinarySVMWithWeights(self):
+  def testBinarySVMWith1DWeights(self):
     head = head_lib.binary_svm_head(weight_column_name="weights")
     with ops.Graph().as_default(), session.Session():
       weights = (7., 11.)
       model_fn_ops = head.create_model_fn_ops(
+          # We have to add an extra dim here for weights broadcasting to work.
           features={"weights": weights},
           mode=model_fn.ModeKeys.TRAIN,
           labels=self._labels,
@@ -1374,11 +1582,30 @@ class BinarySvmHeadTest(test.TestCase):
       self._assert_output_alternatives(model_fn_ops)
       _assert_no_variables(self)
       _assert_summary_tags(self, ["loss"])
-      expected_weighted_sum = np.sum(
-          np.multiply(weights, self._expected_losses))
-      _assert_metrics(self, expected_weighted_sum / len(weights), {
+      expected_weighted_losses = np.multiply(weights, self._expected_losses)
+      _assert_metrics(self, np.mean(expected_weighted_losses), {
           "accuracy": 1.,
-          "loss": expected_weighted_sum / np.sum(weights),
+          "loss": np.sum(expected_weighted_losses) / np.sum(weights),
+      }, model_fn_ops)
+
+  def testBinarySVMWith2DWeights(self):
+    head = head_lib.binary_svm_head(weight_column_name="weights")
+    with ops.Graph().as_default(), session.Session():
+      weights = (7., 11.)
+      model_fn_ops = head.create_model_fn_ops(
+          # We have to add an extra dim here for weights broadcasting to work.
+          features={"weights": tuple([(w,) for w in weights])},
+          mode=model_fn.ModeKeys.TRAIN,
+          labels=self._labels,
+          train_op_fn=head_lib.no_op_train_fn,
+          logits=self._predictions)
+      self._assert_output_alternatives(model_fn_ops)
+      _assert_no_variables(self)
+      _assert_summary_tags(self, ["loss"])
+      expected_weighted_losses = np.multiply(weights, self._expected_losses)
+      _assert_metrics(self, np.mean(expected_weighted_losses), {
+          "accuracy": 1.,
+          "loss": np.sum(expected_weighted_losses) / np.sum(weights),
       }, model_fn_ops)
 
   def testBinarySVMWithCenteredBias(self):
@@ -1400,8 +1627,10 @@ class BinarySvmHeadTest(test.TestCase):
           ),
           expected_trainable=("binary_svm_head/centered_bias_weight:0",))
       variables.global_variables_initializer().run()
-      _assert_summary_tags(
-          self, ["loss", "binary_svm_head/centered_bias/bias_0"])
+      _assert_summary_tags(self, [
+          "loss",
+          "binary_svm_head/centered_bias/bias_0"
+      ])
       expected_loss = np.average(self._expected_losses)
       _assert_metrics(self, expected_loss, {
           "accuracy": 1.,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index a0f501dfbaf2d9d0f2d562a47b50274a07344ed7..a473cf46d59e25e5d20e4da271a92f8249003782 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -27,11 +27,12 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModelFnOps
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.summary import summary
 from tensorflow.python.ops.control_flow_ops import with_dependencies
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training.session_run_hook import SessionRunArgs
 
@@ -118,7 +119,7 @@ def _kmeans_clustering_model_fn(features, labels, mode, params, config):
            'kmeans_plus_plus_num_retries')).training_graph()
   incr_step = state_ops.assign_add(variables.get_global_step(), 1)
   loss = math_ops.reduce_sum(losses, name=KMeansClustering.LOSS_OP_NAME)
-  logging_ops.scalar_summary('loss/raw', loss)
+  summary.scalar('loss/raw', loss)
   training_op = with_dependencies([training_op, incr_step], loss)
   predictions = {
       KMeansClustering.ALL_SCORES: all_scores[0],
@@ -257,4 +258,3 @@ class KMeansClustering(estimator.Estimator):
   def clusters(self):
     """Returns cluster centers."""
     return super(KMeansClustering, self).get_variable_value(self.CLUSTERS)
-
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index d1b4aedb81e0565cf4c8bbc85cd0baaac647f446..8a595a79016281b39a4f0f4d36083a1033085198 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -27,11 +27,13 @@ from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -148,17 +150,24 @@ def _linear_model_fn(features, labels, mode, params, config=None):
       parent_scope,
       values=tuple(six.itervalues(features)),
       partitioner=partitioner) as scope:
-    if joint_weights:
-      layer_fn = layers.joint_weighted_sum_from_feature_columns
+    if all([isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
+            for fc in feature_columns]):
+      if joint_weights:
+        layer_fn = layers.joint_weighted_sum_from_feature_columns
+      else:
+        layer_fn = layers.weighted_sum_from_feature_columns
+      logits, _, _ = layer_fn(
+          columns_to_tensors=features,
+          feature_columns=feature_columns,
+          num_outputs=head.logits_dimension,
+          weight_collections=[parent_scope],
+          scope=scope)
     else:
-      layer_fn = layers.weighted_sum_from_feature_columns
-        
-    logits, _, _ = layer_fn(
-            columns_to_tensors=features,
-            feature_columns=feature_columns,
-            num_outputs=head.logits_dimension,
-            weight_collections=[parent_scope],
-            scope=scope)
+      logits = fc_core.linear_model(
+          features=features,
+          feature_columns=feature_columns,
+          units=head.logits_dimension,
+          weight_collections=[parent_scope])
 
     def _train_op_fn(loss):
       global_step = contrib_variables.get_global_step()
@@ -333,9 +342,34 @@ class LinearClassifier(estimator.Estimator):
     ...
   def input_fn_eval: # returns x, y (where y represents label's class index).
     ...
+  def input_fn_predict: # returns x, None.
+    ...
   estimator.fit(input_fn=input_fn_train)
   estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(x=x) # returns predicted labels (i.e. label's class index).
+  # predict_classes returns class indices.
+  estimator.predict_classes(input_fn=input_fn_predict)
+  ```
+
+  If the user specifies `label_keys` in constructor, labels must be strings from
+  the `label_keys` vocabulary. Example:
+
+  ```python
+  label_keys = ['label0', 'label1', 'label2']
+  estimator = LinearClassifier(
+      n_classes=n_classes,
+      feature_columns=[sparse_column_a, sparse_feature_a_x_sparse_feature_b],
+      label_keys=label_keys)
+
+  def input_fn_train: # returns x, y (where y is one of label_keys).
+    pass
+  estimator.fit(input_fn=input_fn_train)
+
+  def input_fn_eval: # returns x, y (where y is one of label_keys).
+    pass
+  estimator.evaluate(input_fn=input_fn_eval)
+  def input_fn_predict: # returns x, None
+  # predict_classes returns one of label_keys.
+  estimator.predict_classes(input_fn=input_fn_predict)
   ```
 
   Input of `fit` and `evaluate` should have following features,
@@ -363,7 +397,8 @@ class LinearClassifier(estimator.Estimator):
                enable_centered_bias=False,
                _joint_weight=False,
                config=None,
-               feature_engineering_fn=None):
+               feature_engineering_fn=None,
+               label_keys=None):
     """Construct a `LinearClassifier` estimator object.
 
     Args:
@@ -398,6 +433,8 @@ class LinearClassifier(estimator.Estimator):
                         labels which are the output of `input_fn` and
                         returns features and labels which will be fed
                         into the model.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
 
     Returns:
       A `LinearClassifier` estimator.
@@ -419,7 +456,8 @@ class LinearClassifier(estimator.Estimator):
     head = head_lib.multi_class_head(
         n_classes,
         weight_column_name=weight_column_name,
-        enable_centered_bias=enable_centered_bias)
+        enable_centered_bias=enable_centered_bias,
+        label_keys=label_keys)
     params = {
         "head": head,
         "feature_columns": feature_columns,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
index fc6437745283f3e1ff12b8f0d7a479d68340f982..145d5c40fa2d6072ed4b01535e8da3e9f550ec94 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
@@ -37,6 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer as sdca_optimizer_lib
 from tensorflow.contrib.metrics.python.ops import metric_ops
+from tensorflow.python.feature_column import feature_column as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
@@ -172,6 +173,49 @@ class LinearClassifierTest(test.TestCase):
     scores = classifier.evaluate(x=train_x, y=train_y, steps=1)
     self.assertGreater(scores['accuracy'], 0.9)
 
+  def testMultiClassLabelKeys(self):
+    """Tests n_classes > 2 with label_keys vocabulary for labels."""
+    # Byte literals needed for python3 test to pass.
+    label_keys = [b'label0', b'label1', b'label2']
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      labels = constant_op.constant(
+          [[label_keys[1]], [label_keys[0]], [label_keys[0]]],
+          dtype=dtypes.string)
+      return features, labels
+
+    language_column = feature_column_lib.sparse_column_with_hash_bucket(
+        'language', hash_bucket_size=20)
+
+    classifier = linear.LinearClassifier(
+        n_classes=3,
+        feature_columns=[language_column],
+        label_keys=label_keys)
+
+    classifier.fit(input_fn=_input_fn, steps=50)
+
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self.assertGreater(scores['accuracy'], 0.9)
+    self.assertIn('loss', scores)
+    predict_input_fn = functools.partial(_input_fn, num_epochs=1)
+    predicted_classes = list(
+        classifier.predict_classes(
+            input_fn=predict_input_fn, as_iterable=True))
+    self.assertEqual(3, len(predicted_classes))
+    for pred in predicted_classes:
+      self.assertIn(pred, label_keys)
+    predictions = list(
+        classifier.predict(input_fn=predict_input_fn, as_iterable=True))
+    self.assertAllEqual(predicted_classes, predictions)
+
   def testLogisticRegression_MatrixData(self):
     """Tests binary classification using matrix data as input."""
 
@@ -192,6 +236,32 @@ class LinearClassifierTest(test.TestCase):
     scores = classifier.evaluate(input_fn=_input_fn, steps=1)
     self.assertGreater(scores['accuracy'], 0.9)
 
+  def testEstimatorWithCoreFeatureColumns(self):
+
+    def _input_fn(num_epochs=None):
+      features = {
+          'age':
+              input_lib.limit_epochs(
+                  constant_op.constant([[.8], [0.2], [.1]]),
+                  num_epochs=num_epochs),
+          'language':
+              sparse_tensor.SparseTensor(
+                  values=input_lib.limit_epochs(
+                      ['en', 'fr', 'zh'], num_epochs=num_epochs),
+                  indices=[[0, 0], [0, 1], [2, 0]],
+                  dense_shape=[3, 2])
+      }
+      return features, constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
+
+    language_column = fc_core.categorical_column_with_hash_bucket(
+        'language', hash_bucket_size=20)
+    feature_columns = [language_column, fc_core.numeric_column('age')]
+
+    classifier = linear.LinearClassifier(feature_columns=feature_columns)
+    classifier.fit(input_fn=_input_fn, steps=100)
+    scores = classifier.evaluate(input_fn=_input_fn, steps=1)
+    self.assertGreater(scores['accuracy'], 0.9)
+
   def testLogisticRegression_MatrixData_Labels1D(self):
     """Same as the last test, but labels shape is [100] instead of [100, 1]."""
 
@@ -739,7 +809,7 @@ class LinearClassifierTest(test.TestCase):
           'example_id':
               constant_op.constant(['1', '2', '3']),
           'price':
-              constant_op.constant([[0.4], [0.6], [0.3]]),
+              constant_op.constant([0.4, 0.6, 0.3]),
           'country':
               sparse_tensor.SparseTensor(
                   values=['IT', 'US', 'GB'],
@@ -1408,7 +1478,7 @@ class LinearRegressorTest(test.TestCase):
           'example_id':
               constant_op.constant(['1', '2', '3']),
           'price':
-              constant_op.constant([[0.6], [0.8], [0.3]]),
+              constant_op.constant([0.6, 0.8, 0.3]),
           'sq_footage':
               constant_op.constant([[900.0], [700.0], [600.0]]),
           'country':
diff --git a/tensorflow/contrib/learn/python/learn/estimators/metric_key.py b/tensorflow/contrib/learn/python/learn/estimators/metric_key.py
index 10ac888eca7a0fe8c42a8646187cc3252276d444..99388f116b345bd038f2985606c6203011597ea2 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/metric_key.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/metric_key.py
@@ -22,7 +22,9 @@ class MetricKey(object):
   """Metric key strings."""
   LOSS = "loss"
   AUC = "auc"
+  AUC_PR = "auc_precision_recall"
   CLASS_AUC = "auc/class%d"
+  CLASS_AUC_PR = "auc_precision_recall/class%d"
   PREDICTION_MEAN = "labels/prediction_mean"
   CLASS_PREDICTION_MEAN = "labels/prediction_mean/class%d"
   CLASS_LOGITS_MEAN = "labels/logits_mean/class%d"
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
index 3c812a46597972bfab84f843de0832ec09497213..8a327ab01f2b272f687a9507deb89225c1e5d38c 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
@@ -25,10 +25,17 @@ import six
 
 from tensorflow.contrib import framework as contrib_framework
 from tensorflow.contrib.framework import get_graph_from_inputs
-
+from tensorflow.contrib.learn.python.learn.estimators import constants
+from tensorflow.contrib.learn.python.learn.estimators import metric_key
+from tensorflow.contrib.learn.python.learn.estimators import prediction_key
+from tensorflow.python.estimator import model_fn as core_model_fn_lib
+from tensorflow.python.estimator.export import export_output as core_export_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import session_run_hook
 
 
@@ -46,12 +53,17 @@ class ModeKeys(object):
   EVAL = 'eval'
   INFER = 'infer'
 
+  @classmethod
+  def validate(cls, key):
+    if key not in (cls.TRAIN, cls.EVAL, cls.INFER):
+      raise ValueError('Invalid mode %s.' % key)
+
 
 class ModelFnOps(
     collections.namedtuple('ModelFnOps', [
         'predictions', 'loss', 'train_op', 'eval_metric_ops',
         'output_alternatives', 'training_chief_hooks', 'training_hooks',
-        'scaffold'
+        'scaffold', 'mode'
     ])):
   """Ops returned from a model_fn."""
 
@@ -112,13 +124,15 @@ class ModelFnOps(
     Raises:
       ValueError: If validation fails.
     """
+    ModeKeys.validate(mode)
+
     # Assert all ops are from the same graph.
     get_graph_from_inputs((predictions, loss, train_op))
 
     # Validate train_op.
     if train_op is None:
       if mode == ModeKeys.TRAIN:
-        raise ValueError('Missing training_op.')
+        raise ValueError('Missing train_op.')
     elif not isinstance(train_op, ops.Operation):
       # TODO(ptucker): Should this be allowed? Consider raising error.
       train_op = ops.convert_to_tensor(train_op).op
@@ -176,4 +190,93 @@ class ModelFnOps(
         output_alternatives=output_alternatives,
         training_chief_hooks=training_chief_hooks,
         training_hooks=training_hooks,
-        scaffold=scaffold)
+        scaffold=scaffold,
+        mode=mode)
+
+  def estimator_spec(self, default_serving_output_alternative_key=None):
+    """Creates an equivalent `EstimatorSpec`.
+
+    Args:
+      default_serving_output_alternative_key: Required for multiple heads. If
+        you have multiple entries in `output_alternatives` dict (comparable to
+        multiple heads), `EstimatorSpec` requires a default head that will be
+        used if a Servo request does not explicitly mention which head to infer
+        on. Pass the key of the output alternative here that you want to
+        designate as default. A separate ExportOutpout for this default head
+        wil be added to the export_outputs dict with the special key
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, unless there is
+        already an enry in output_alternatives with this special key.
+
+    Returns:
+      Instance of `EstimatorSpec` that is equivalent to this `ModelFnOps`
+
+    Raises:
+      ValueError: If problem type is unknown.
+    """
+    def _scores(output_tensors):
+      scores = output_tensors.get(prediction_key.PredictionKey.SCORES)
+      if scores is None:
+        scores = output_tensors.get(prediction_key.PredictionKey.PROBABILITIES)
+      return scores
+
+    def _classes(output_tensors):  # pylint: disable=missing-docstring
+      classes = output_tensors.get(prediction_key.PredictionKey.CLASSES)
+      if classes is None:
+        logging.warning(
+            'classes is None, Servo inference will not have class ids.')
+        return None
+      elif classes.dtype != dtypes.string:
+        # Servo classification can only serve string classes
+        logging.warning(
+            'classes is not string, Servo inference will not have class ids.')
+        return None
+
+      return classes
+
+    def _export_output(problem_type, predictions):  # pylint: disable=missing-docstring
+      if problem_type == constants.ProblemType.LINEAR_REGRESSION:
+        return core_export_lib.RegressionOutput(_scores(predictions))
+
+      if (problem_type == constants.ProblemType.CLASSIFICATION or
+          problem_type == constants.ProblemType.LOGISTIC_REGRESSION):
+        return core_export_lib.ClassificationOutput(
+            scores=_scores(predictions), classes=_classes(predictions))
+
+      if problem_type == constants.ProblemType.UNSPECIFIED:
+        return core_export_lib.PredictOutput(predictions)
+
+      raise ValueError('Unknown problem_type=%s' % problem_type)
+
+    # Converts output_alternatives
+    export_outputs_dict = None
+    if self.output_alternatives:
+      output_alternatives = self.output_alternatives
+      # Adds default output_alternative if needed.
+      if (len(output_alternatives) > 1 and
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY not in
+          output_alternatives):
+        output_alternatives = output_alternatives.copy()
+        output_alternatives[
+            signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = (
+                output_alternatives[default_serving_output_alternative_key])
+      export_outputs_dict = {key: _export_output(*val) for key, val in
+                             output_alternatives.items()}
+
+    def _get_eval_metric_ops():
+      """Returns self.eval_metric_ops without loss metric."""
+      result = {}
+      for key, value in six.iteritems(self.eval_metric_ops):
+        if key != metric_key.MetricKey.LOSS:
+          result[key] = value
+      return result
+
+    return core_model_fn_lib.EstimatorSpec(
+        mode=self.mode,
+        predictions=self.predictions,
+        loss=self.loss,
+        train_op=self.train_op,
+        eval_metric_ops=_get_eval_metric_ops(),
+        export_outputs=export_outputs_dict,
+        training_chief_hooks=self.training_chief_hooks,
+        training_hooks=self.training_hooks,
+        scaffold=self.scaffold)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ebfeb0f1616fb2cdbd96604b020d44227831968
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn_test.py
@@ -0,0 +1,297 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ModelFnOps tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.learn.python.learn.estimators import constants
+from tensorflow.contrib.learn.python.learn.estimators import model_fn
+from tensorflow.python.client import session
+from tensorflow.python.estimator.export import export_output as core_export_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import monitored_session
+
+
+class ModelFnopsTest(test.TestCase):
+  """Multi-output tests."""
+
+  def create_predictions(self):
+    probabilities = constant_op.constant([1., 1., 1.])
+    scores = constant_op.constant([1., 2., 3.])
+    classes = constant_op.constant([b"0", b"1", b"2"])
+    return {
+        "probabilities": probabilities,
+        "scores": scores,
+        "classes": classes}
+
+  def create_model_fn_ops(self, predictions, output_alternatives,
+                          mode=model_fn.ModeKeys.INFER):
+
+    return model_fn.ModelFnOps(
+        model_fn.ModeKeys.INFER,
+        predictions=predictions,
+        loss=constant_op.constant([1]),
+        train_op=control_flow_ops.no_op(),
+        eval_metric_ops={
+            "metric_key": (constant_op.constant(1.), control_flow_ops.no_op()),
+            "loss": (constant_op.constant(1.), control_flow_ops.no_op()),
+        },
+        training_chief_hooks=[basic_session_run_hooks.StepCounterHook()],
+        training_hooks=[basic_session_run_hooks.StepCounterHook()],
+        output_alternatives=output_alternatives,
+        scaffold=monitored_session.Scaffold())
+
+  def assertEquals_except_export_and_eval_loss(
+      self, model_fn_ops, estimator_spec):
+    expected_eval_metric_ops = {}
+    for key, value in six.iteritems(model_fn_ops.eval_metric_ops):
+      if key != "loss":
+        expected_eval_metric_ops[key] = value
+    self.assertEqual(model_fn_ops.predictions, estimator_spec.predictions)
+    self.assertEqual(model_fn_ops.loss, estimator_spec.loss)
+    self.assertEqual(model_fn_ops.train_op, estimator_spec.train_op)
+    self.assertEqual(expected_eval_metric_ops,
+                     estimator_spec.eval_metric_ops)
+    self.assertAllEqual(model_fn_ops.training_chief_hooks,
+                        estimator_spec.training_chief_hooks)
+    self.assertAllEqual(model_fn_ops.training_hooks,
+                        estimator_spec.training_hooks)
+    self.assertEqual(model_fn_ops.scaffold, estimator_spec.scaffold)
+
+  def testEstimatorSpec_except_export(self):
+    predictions = self.create_predictions()
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, None, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+  def testEstimatorSpec_export_regression_with_scores(self):
+    predictions = self.create_predictions()
+    output_alternatives = {"regression_head": (
+        constants.ProblemType.LINEAR_REGRESSION, predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      regression_output = estimator_spec.export_outputs["regression_head"]
+      self.assertTrue(isinstance(
+          regression_output, core_export_lib.RegressionOutput))
+      self.assertAllEqual(predictions["scores"].eval(),
+                          regression_output.value.eval())
+
+  def testEstimatorSpec_export_regression_with_probabilities(self):
+    predictions = self.create_predictions()
+    output_alternatives_predictions = predictions.copy()
+    del output_alternatives_predictions["scores"]
+    output_alternatives = {"regression_head": (
+        constants.ProblemType.LINEAR_REGRESSION,
+        output_alternatives_predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      regression_output = estimator_spec.export_outputs["regression_head"]
+      self.assertTrue(isinstance(
+          regression_output, core_export_lib.RegressionOutput))
+      self.assertAllEqual(predictions["probabilities"].eval(),
+                          regression_output.value.eval())
+
+  def testEstimatorSpec_export_classsification(self):
+    predictions = self.create_predictions()
+    output_alternatives = {"classification_head": (
+        constants.ProblemType.CLASSIFICATION, predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      classification_output = estimator_spec.export_outputs[
+          "classification_head"]
+      self.assertTrue(isinstance(classification_output,
+                                 core_export_lib.ClassificationOutput))
+      self.assertAllEqual(predictions["scores"].eval(),
+                          classification_output.scores.eval())
+      self.assertAllEqual(predictions["classes"].eval(),
+                          classification_output.classes.eval())
+
+  def testEstimatorSpec_export_classsification_with_missing_scores(self):
+    predictions = self.create_predictions()
+    output_alternatives_predictions = predictions.copy()
+    del output_alternatives_predictions["scores"]
+    output_alternatives = {"classification_head": (
+        constants.ProblemType.CLASSIFICATION, output_alternatives_predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      classification_output = estimator_spec.export_outputs[
+          "classification_head"]
+      self.assertTrue(isinstance(classification_output,
+                                 core_export_lib.ClassificationOutput))
+      self.assertAllEqual(predictions["probabilities"].eval(),
+                          classification_output.scores.eval())
+      self.assertAllEqual(predictions["classes"].eval(),
+                          classification_output.classes.eval())
+
+  def testEstimatorSpec_export_classsification_with_missing_scores_proba(self):
+    predictions = self.create_predictions()
+    output_alternatives_predictions = predictions.copy()
+    del output_alternatives_predictions["scores"]
+    del output_alternatives_predictions["probabilities"]
+    output_alternatives = {"classification_head": (
+        constants.ProblemType.CLASSIFICATION, output_alternatives_predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      classification_output = estimator_spec.export_outputs[
+          "classification_head"]
+      self.assertTrue(isinstance(classification_output,
+                                 core_export_lib.ClassificationOutput))
+      self.assertIsNone(classification_output.scores)
+      self.assertAllEqual(predictions["classes"].eval(),
+                          classification_output.classes.eval())
+
+  def testEstimatorSpec_export_classsification_with_missing_classes(self):
+    predictions = self.create_predictions()
+    output_alternatives_predictions = predictions.copy()
+    del output_alternatives_predictions["classes"]
+    output_alternatives = {"classification_head": (
+        constants.ProblemType.CLASSIFICATION, output_alternatives_predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      classification_output = estimator_spec.export_outputs[
+          "classification_head"]
+      self.assertTrue(isinstance(classification_output,
+                                 core_export_lib.ClassificationOutput))
+      self.assertAllEqual(predictions["scores"].eval(),
+                          classification_output.scores.eval())
+      self.assertIsNone(classification_output.classes)
+
+  def testEstimatorSpec_export_classsification_with_nonstring_classes(self):
+    predictions = self.create_predictions()
+    output_alternatives_predictions = predictions.copy()
+    output_alternatives_predictions["classes"] = constant_op.constant(
+        [1, 2, 3])
+    output_alternatives = {"classification_head": (
+        constants.ProblemType.CLASSIFICATION, output_alternatives_predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      classification_output = estimator_spec.export_outputs[
+          "classification_head"]
+      self.assertTrue(isinstance(classification_output,
+                                 core_export_lib.ClassificationOutput))
+      self.assertAllEqual(predictions["scores"].eval(),
+                          classification_output.scores.eval())
+      self.assertIsNone(classification_output.classes)
+
+  def testEstimatorSpec_export_logistic(self):
+    predictions = self.create_predictions()
+    output_alternatives = {"logistic_head": (
+        constants.ProblemType.LOGISTIC_REGRESSION, predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      logistic_output = estimator_spec.export_outputs["logistic_head"]
+      self.assertTrue(isinstance(logistic_output,
+                                 core_export_lib.ClassificationOutput))
+      self.assertAllEqual(predictions["scores"].eval(),
+                          logistic_output.scores.eval())
+      self.assertAllEqual(predictions["classes"].eval(),
+                          logistic_output.classes.eval())
+
+  def testEstimatorSpec_export_unspecified(self):
+    predictions = self.create_predictions()
+    output_alternatives = {"unspecified_head": (
+        constants.ProblemType.UNSPECIFIED, predictions)}
+
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec()
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      unspecified_output = estimator_spec.export_outputs["unspecified_head"]
+      self.assertTrue(isinstance(unspecified_output,
+                                 core_export_lib.PredictOutput))
+      self.assertEqual(predictions, unspecified_output.outputs)
+
+  def testEstimatorSpec_export_multihead(self):
+    predictions = self.create_predictions()
+    output_alternatives = {
+        "regression_head": (
+            constants.ProblemType.LINEAR_REGRESSION, predictions),
+        "classification_head": (
+            constants.ProblemType.CLASSIFICATION, predictions)}
+    model_fn_ops = self.create_model_fn_ops(
+        predictions, output_alternatives, mode=model_fn.ModeKeys.INFER)
+
+    estimator_spec = model_fn_ops.estimator_spec("regression_head")
+    self.assertEquals_except_export_and_eval_loss(model_fn_ops, estimator_spec)
+
+    with session.Session():
+      regression_output = estimator_spec.export_outputs["regression_head"]
+      self.assertTrue(isinstance(
+          regression_output, core_export_lib.RegressionOutput))
+      self.assertAllEqual(predictions["scores"].eval(),
+                          regression_output.value.eval())
+
+      default_output = estimator_spec.export_outputs[
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+      self.assertTrue(isinstance(default_output,
+                                 core_export_lib.RegressionOutput))
+      self.assertAllEqual(predictions["scores"].eval(),
+                          default_output.value.eval())
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/learn/python/learn/estimators/rnn_common.py b/tensorflow/contrib/learn/python/learn/estimators/rnn_common.py
index f20dc788349aacdf87d8367c32e32f68958739d1..0f09b111bd8dee03633402fbda7654bc4dcdbddc 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/rnn_common.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/rnn_common.py
@@ -66,8 +66,8 @@ def _get_single_cell(cell_type, num_units):
     ValueError: `cell_type` is an invalid `RNNCell` name.
     TypeError: `cell_type` is not a string or a subclass of `RNNCell`.
   """
-  cell_type = _CELL_TYPES.get(cell_type)
-  if cell_type is None and not issubclass(cell_type, contrib_rnn.RNNCell):
+  cell_type = _CELL_TYPES.get(cell_type, cell_type)
+  if not cell_type or not issubclass(cell_type, contrib_rnn.RNNCell):
     raise ValueError('The supported cell types are {}; got {}'.format(
         list(_CELL_TYPES.keys()), cell_type))
   return cell_type(num_units=num_units)
@@ -119,7 +119,7 @@ def apply_dropout(cells, dropout_keep_probabilities, random_seed=None):
   """
   if len(dropout_keep_probabilities) != len(cells) + 1:
     raise ValueError(
-        'The number of dropout probabilites must be one greater than the '
+        'The number of dropout probabilities must be one greater than the '
         'number of cells. Got {} cells and {} dropout probabilities.'.format(
             len(cells), len(dropout_keep_probabilities)))
   wrapped_cells = [
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index bc7465bbc22fab3b9aa02c8e6da864faed7e99f1..3aaee5862df4fdc5af8ead37e6ebff7944e29dbd 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import copy
 import json
 import os
 
@@ -27,9 +26,25 @@ import six
 
 from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.estimator import run_config as core_run_config
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 
 
+# A list of the property names in RunConfig user allows to change. They will
+# not affect the execution framework, so when execution framework checks the
+# `uid` of the RunConfig, it should be ingored.
+_DEFAULT_UID_WHITE_LIST = [
+    'tf_random_seed',
+    'save_summary_steps',
+    'save_checkpoints_steps',
+    'save_checkpoints_secs',
+    'session_config',
+    'keep_checkpoint_max',
+    'keep_checkpoint_every_n_hours',
+]
+
+
 class Environment(object):
   # For running general distributed training.
   CLOUD = 'cloud'
@@ -88,9 +103,9 @@ class ClusterConfig(object):
     ```
       cluster = {'ps': ['host1:2222', 'host2:2222'],
                  'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
-      os.environ['TF_CONFIG'] = json.dumps({
+      os.environ['TF_CONFIG'] = json.dumps(
           {'cluster': cluster,
-           'task': {'type': 'worker', 'index': 1}}})
+           'task': {'type': 'worker', 'index': 1}})
       config = ClusterConfig()
       assert config.master == 'host4:2222'
       assert config.task_id == 1
@@ -192,9 +207,11 @@ class ClusterConfig(object):
     return int(task_index) if task_index else 0
 
 
-class RunConfig(ClusterConfig):
+class RunConfig(ClusterConfig, core_run_config.RunConfig):
   """This class specifies the configurations for an `Estimator` run.
 
+  This class is the implementation of ${tf.estimator.RunConfig} interface.
+
   If you're a Google-internal user using command line flags with
   `learn_runner.py` (for instance, to do distributed training or to use
   parameter servers), you probably want to use `learn_runner.EstimatorConfig`
@@ -214,7 +231,8 @@ class RunConfig(ClusterConfig):
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
                evaluation_master='',
-               model_dir=None):
+               model_dir=None,
+               session_config=None):
     """Constructor.
 
     Note that the superclass `ClusterConfig` may set properties like
@@ -245,7 +263,12 @@ class RunConfig(ClusterConfig):
         the feature.
       evaluation_master: the master on which to perform evaluation.
       model_dir: directory where model parameters, graph etc are saved. If
-        `None`, see `Estimator` about where the model will be saved.
+        `None`, will use `model_dir` property in `TF_CONFIG` environment
+        variable. If both are set, must have same value. If both are `None`, see
+        `Estimator` about where the model will be saved.
+      session_config: a ConfigProto used to set session parameters, or None.
+        Note - using this argument, it is easy to provide settings which break
+        otherwise perfectly good models. Use with care.
     """
     super(RunConfig, self).__init__(
         master=master, evaluation_master=evaluation_master)
@@ -261,6 +284,7 @@ class RunConfig(ClusterConfig):
     self._tf_random_seed = tf_random_seed
     self._save_summary_steps = save_summary_steps
     self._save_checkpoints_secs = save_checkpoints_secs
+    self._session_config = session_config
     if save_checkpoints_secs == RunConfig._USE_DEFAULT:
       if save_checkpoints_steps is None:
         self._save_checkpoints_secs = 600
@@ -272,52 +296,38 @@ class RunConfig(ClusterConfig):
     # create Scaffold and Saver in their model_fn to set these.
     self._keep_checkpoint_max = keep_checkpoint_max
     self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours
-    self._model_dir = model_dir
-
-  def replace(self, **kwargs):
-    """Returns a new instance of `RunConfig` replacing specified properties.
-
-    Only the properties in the following list are allowed to be replaced:
-      - `model_dir`.
-
-    Args:
-      **kwargs: keyword named properties with new values.
-
-    Raises:
-      ValueError: If any property name in `kwargs` does not exist or is not
-        allowed to be replaced.
-
-    Returns:
-      a new instance of `RunConfig`.
-    """
-
-    new_copy = copy.deepcopy(self)
-
-    # TODO(b/33295821): Allow more fields to be replaced.
-    for key, new_value in six.iteritems(kwargs):
-      if key == 'model_dir':
-        new_copy._model_dir = new_value  # pylint: disable=protected-access
-        continue
-
-      raise ValueError('{} is not supported by RunConfig replace'.format(key))
-
-    return new_copy
+    self._model_dir = _get_model_dir(model_dir)
 
   @experimental
-  def uid(self):
+  def uid(self, whitelist=None):
     """Generates a 'Unique Identifier' based on all internal fields.
 
     Caller should use the uid string to check `RunConfig` instance integrity
     in one session use, but should not rely on the implementation details, which
     is subject to change.
 
+    Args:
+      whitelist: A list of the string names of the properties uid should not
+        include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which
+        includes most properties user allowes to change.
+
     Returns:
       A uid string.
     """
-    # TODO(b/33295821): Allows user to specify a whitelist.
+    if whitelist is None:
+      whitelist = _DEFAULT_UID_WHITE_LIST
+
     state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')}
+    # Pop out the keys in whitelist.
+    for k in whitelist:
+      state.pop('_' + k, None)
+
     ordered_state = collections.OrderedDict(
         sorted(state.items(), key=lambda t: t[0]))
+    # For class instance without __repr__, some special cares are required.
+    # Otherwise, the object address will be used.
+    if '_cluster_spec' in ordered_state:
+      ordered_state['_cluster_spec'] = ordered_state['_cluster_spec'].as_dict()
     return ', '.join(
         '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state))
 
@@ -345,6 +355,10 @@ class RunConfig(ClusterConfig):
   def save_checkpoints_steps(self):
     return self._save_checkpoints_steps
 
+  @property
+  def session_config(self):
+    return self._session_config
+
   @property
   def keep_checkpoint_max(self):
     return self._keep_checkpoint_max
@@ -396,3 +410,21 @@ def _get_master(cluster_spec, task_type, task_id):
   # For backwards compatibility, we return empty string if task_type was
   # not set (task_type did not previously exist).
   return ''
+
+
+def _get_model_dir(model_dir):
+  """Returns `model_dir` based user provided `model_dir` or `TF_CONFIG`."""
+
+  model_dir_in_tf_config = json.loads(
+      os.environ.get('TF_CONFIG') or '{}').get('model_dir', None)
+  if model_dir_in_tf_config is not None:
+    if model_dir is not None and model_dir_in_tf_config != model_dir:
+      raise ValueError(
+          '`model_dir` provided in RunConfig construct, if set, '
+          'must have the same value as the model_dir in TF_CONFIG. '
+          'model_dir: {}\nTF_CONFIG["model_dir"]: {}.\n'.format(
+              model_dir, model_dir_in_tf_config))
+
+    logging.info('Using model_dir in TF_CONFIG: %s', model_dir_in_tf_config)
+
+  return model_dir or model_dir_in_tf_config
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py b/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py
index 4d312ca8eeacd99de93f1c8d671ae241bb0f7dc6..6e2a2690ae4629b29aad1e550448d8609b20a5a4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py
@@ -18,15 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
 import json
 
-from tensorflow.contrib.learn.python.learn import run_config
 from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.estimator import run_config as core_run_config
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
 TEST_DIR = "test_dir"
 ANOTHER_TEST_DIR = "another_test_dir"
+MASTER = "master_"
 RANDOM_SEED = 123
 
 patch = test.mock.patch
@@ -34,8 +37,12 @@ patch = test.mock.patch
 
 class RunConfigTest(test.TestCase):
 
+  def test_instance_of_core_run_config(self):
+    config = run_config_lib.RunConfig()
+    self.assertTrue(isinstance(config, core_run_config.RunConfig))
+
   def test_defaults_with_no_tf_config(self):
-    config = run_config.RunConfig()
+    config = run_config_lib.RunConfig()
     self.assertEqual(config.master, "")
     self.assertEqual(config.task_id, 0)
     self.assertEqual(config.num_ps_replicas, 0)
@@ -56,7 +63,7 @@ class RunConfigTest(test.TestCase):
         }
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
 
     self.assertEqual(config.master, "grpc://host4:4")
     self.assertEqual(config.task_id, 1)
@@ -80,7 +87,7 @@ class RunConfigTest(test.TestCase):
         }
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig(
+      config = run_config_lib.RunConfig(
           master="localhost:0", evaluation_master="localhost:9991")
 
     self.assertEqual(config.master, "localhost:0")
@@ -95,7 +102,7 @@ class RunConfigTest(test.TestCase):
   def test_single_node_in_cluster_spec_produces_empty_master(self):
     tf_config = {"cluster": {run_config_lib.TaskType.WORKER: ["host1:1"]}}
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
       self.assertEqual(config.master, "")
 
   def test_no_task_type_produces_empty_master(self):
@@ -107,7 +114,7 @@ class RunConfigTest(test.TestCase):
         # Omits "task": {"type": "worker}
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
       self.assertEqual(config.master, "")
 
   def test_invalid_job_name_raises(self):
@@ -125,7 +132,7 @@ class RunConfigTest(test.TestCase):
         "os.environ",
         {"TF_CONFIG": json.dumps(tf_config)}), self.assertRaisesRegexp(
             ValueError, expected_msg_regexp):
-      run_config.RunConfig()
+      run_config_lib.RunConfig()
 
   def test_illegal_task_index_raises(self):
     tf_config = {
@@ -143,7 +150,7 @@ class RunConfigTest(test.TestCase):
         "os.environ",
         {"TF_CONFIG": json.dumps(tf_config)}), self.assertRaisesRegexp(
             ValueError, expected_msg_regexp):
-      run_config.RunConfig()
+      run_config_lib.RunConfig()
 
   def test_is_chief_from_cloud_tf_config(self):
     # is_chief should be true when ["task"]["type"] == "master" and
@@ -162,7 +169,7 @@ class RunConfigTest(test.TestCase):
         "environment": "cloud"
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
 
     self.assertTrue(config.is_chief)
 
@@ -182,7 +189,7 @@ class RunConfigTest(test.TestCase):
         "environment": "random"
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
 
     self.assertTrue(config.is_chief)
 
@@ -200,26 +207,47 @@ class RunConfigTest(test.TestCase):
         "environment": "random"
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
 
     self.assertFalse(config.is_chief)
 
   def test_default_is_chief_from_tf_config_without_job_name(self):
     tf_config = {"cluster": {}, "task": {}}
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
 
     self.assertTrue(config.is_chief)
 
   def test_model_dir(self):
-    empty_config = run_config.RunConfig()
+    empty_config = run_config_lib.RunConfig()
     self.assertIsNone(empty_config.model_dir)
 
-    config = run_config.RunConfig(model_dir=TEST_DIR)
+    config = run_config_lib.RunConfig(model_dir=TEST_DIR)
     self.assertEqual(TEST_DIR, config.model_dir)
 
+  def test_model_dir_in_tf_config(self):
+    tf_config = {"model_dir": TEST_DIR}
+    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
+      run_config = run_config_lib.RunConfig()
+    self.assertEqual(TEST_DIR, run_config.model_dir)
+
+  def test_model_dir_both_in_tf_config_and_constructor(self):
+    tf_config = {"model_dir": TEST_DIR}
+    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
+      run_config = run_config_lib.RunConfig(model_dir=TEST_DIR)
+    self.assertEqual(TEST_DIR, run_config.model_dir)
+
+  def test_model_dir_fail_if_constructor_value_mismatch_tf_config(self):
+    tf_config = {"model_dir": TEST_DIR}
+    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
+      with self.assertRaisesRegexp(
+          ValueError,
+          "`model_dir` provided in RunConfig .* must have "
+          "the same value .* in TF_CONFIG"):
+        run_config_lib.RunConfig(model_dir=TEST_DIR + "/sub_dir")
+
   def test_replace(self):
-    config = run_config.RunConfig(
+    config = run_config_lib.RunConfig(
         tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
     self.assertEqual(TEST_DIR, config.model_dir)
     self.assertEqual(RANDOM_SEED, config.tf_random_seed)
@@ -227,19 +255,10 @@ class RunConfigTest(test.TestCase):
     new_config = config.replace(model_dir=ANOTHER_TEST_DIR)
     self.assertEqual(ANOTHER_TEST_DIR, new_config.model_dir)
     self.assertEqual(RANDOM_SEED, new_config.tf_random_seed)
-
-    self.assertEqual(TEST_DIR, config.model_dir)
     self.assertEqual(RANDOM_SEED, config.tf_random_seed)
 
-    with self.assertRaises(ValueError):
-      # tf_random_seed is not allowed to be replaced.
-      config.replace(tf_random_seed=RANDOM_SEED)
-
-    with self.assertRaises(ValueError):
-      config.replace(some_undefined_property=RANDOM_SEED)
-
-  def test_uid(self):
-    config = run_config.RunConfig(
+  def test_uid_for_different_configs(self):
+    config = run_config_lib.RunConfig(
         tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
 
     expected_uid = config.uid()
@@ -248,7 +267,78 @@ class RunConfigTest(test.TestCase):
       self.assertEqual(expected_uid, config.uid())
 
     new_config = config.replace(model_dir=ANOTHER_TEST_DIR)
+    self.assertEqual(TEST_DIR, config.model_dir)
     self.assertNotEqual(expected_uid, new_config.uid())
+    self.assertEqual(ANOTHER_TEST_DIR, new_config.model_dir)
+
+  def test_uid_for_whitelist(self):
+    whitelist = ["model_dir"]
+    config = run_config_lib.RunConfig(
+        tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
+
+    expected_uid = config.uid(whitelist)
+    self.assertEqual(expected_uid, config.uid(whitelist))
+
+    new_config = config.replace(model_dir=ANOTHER_TEST_DIR)
+    self.assertEqual(TEST_DIR, config.model_dir)
+    self.assertEqual(expected_uid, new_config.uid(whitelist))
+    self.assertEqual(ANOTHER_TEST_DIR, new_config.model_dir)
+
+  def test_uid_for_default_whitelist(self):
+    config = run_config_lib.RunConfig(
+        tf_random_seed=11,
+        save_summary_steps=12,
+        save_checkpoints_steps=13,
+        save_checkpoints_secs=14,
+        session_config=config_pb2.ConfigProto(allow_soft_placement=True),
+        keep_checkpoint_max=16,
+        keep_checkpoint_every_n_hours=17)
+    self.assertEqual(11, config.tf_random_seed)
+    self.assertEqual(12, config.save_summary_steps)
+    self.assertEqual(13, config.save_checkpoints_steps)
+    self.assertEqual(14, config.save_checkpoints_secs)
+    self.assertEqual(config_pb2.ConfigProto(allow_soft_placement=True),
+                     config.session_config)
+    self.assertEqual(16, config.keep_checkpoint_max)
+    self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+
+    new_config = run_config_lib.RunConfig(
+        tf_random_seed=21,
+        save_summary_steps=22,
+        save_checkpoints_steps=23,
+        save_checkpoints_secs=24,
+        session_config=config_pb2.ConfigProto(allow_soft_placement=False),
+        keep_checkpoint_max=26,
+        keep_checkpoint_every_n_hours=27)
+    self.assertEqual(config.uid(), new_config.uid())
+    # model_dir is not on the default whitelist.
+    self.assertNotEqual(config.uid(whitelist=[]),
+                        new_config.uid(whitelist=[]))
+    new_config = new_config.replace(model_dir=ANOTHER_TEST_DIR)
+    self.assertNotEqual(config.uid(), new_config.uid())
+
+  def test_uid_for_deepcopy(self):
+    tf_config = {
+        "cluster": {
+            run_config_lib.TaskType.PS: ["host1:1", "host2:2"],
+            run_config_lib.TaskType.WORKER: ["host3:3", "host4:4", "host5:5"]
+        },
+        "task": {
+            "type": run_config_lib.TaskType.WORKER,
+            "index": 1
+        }
+    }
+    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
+      config = run_config_lib.RunConfig(
+          tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
+    self.assertEqual(config.cluster_spec.as_dict(), tf_config["cluster"])
+
+    config = run_config_lib.RunConfig(
+        tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR)
+
+    expected_uid = config.uid()
+    new_config = copy.deepcopy(config)
+    self.assertEqual(expected_uid, new_config.uid())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
index e09278bc63626ed216d3c87c99d507c4b74fe312..9cb4c3515a96c48d3c9ca53249e68096c5b26dcf 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.contrib import layers
 from tensorflow.contrib import rnn as rnn_cell
-from tensorflow.contrib.framework.python.framework import deprecated
 from tensorflow.contrib.layers.python.layers import feature_column_ops
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.estimators import constants
@@ -145,7 +144,7 @@ def _prepare_features_for_sqss(features, labels, mode,
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
+      describing context features, i.e., features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
 
@@ -262,7 +261,7 @@ def _read_batch(cell,
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
+      describing context features, i.e., features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
     num_threads: The Python integer number of threads enqueuing input examples
@@ -421,7 +420,7 @@ def _get_rnn_model_fn(cell_type,
       describing sequence features. All items in the set should be instances
       of classes derived from `FeatureColumn`.
     context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
+      describing context features, i.e., features that apply across all time
       steps. All items in the set should be instances of classes derived from
       `FeatureColumn`.
     predict_probabilities: A boolean indicating whether to predict probabilities
@@ -564,7 +563,7 @@ class StateSavingRnnEstimator(estimator.Estimator):
         describing sequence features. All items in the set should be instances
         of classes derived from `FeatureColumn`.
       context_feature_columns: An iterable containing all the feature columns
-        describing context features, i.e., features that apply accross all time
+        describing context features, i.e., features that apply across all time
         steps. All items in the set should be instances of classes derived from
         `FeatureColumn`.
       num_classes: The number of classes for categorization. Used only and
@@ -652,180 +651,3 @@ class StateSavingRnnEstimator(estimator.Estimator):
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
-
-
-@deprecated('2017-04-01', 'multi_value_rnn_regressor is deprecated. '
-            'Please construct a StateSavingRnnEstimator directly.')
-def multi_value_rnn_regressor(num_units,
-                              num_unroll,
-                              batch_size,
-                              sequence_feature_columns,
-                              context_feature_columns=None,
-                              num_rnn_layers=1,
-                              optimizer_type='SGD',
-                              learning_rate=0.1,
-                              momentum=None,
-                              gradient_clipping_norm=5.0,
-                              dropout_keep_probabilities=None,
-                              model_dir=None,
-                              config=None,
-                              feature_engineering_fn=None,
-                              num_threads=3,
-                              queue_capacity=1000,
-                              seed=None):
-  """Creates a RNN `Estimator` that predicts sequences of values.
-
-  Args:
-    num_units: The size of the RNN cells.
-    num_unroll: Python integer, how many time steps to unroll at a time.
-      The input sequences of length `k` are then split into `k / num_unroll`
-      many segments.
-    batch_size: Python integer, the size of the minibatch.
-    sequence_feature_columns: An iterable containing all the feature columns
-      describing sequence features. All items in the set should be instances
-      of classes derived from `FeatureColumn`.
-    context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
-      steps. All items in the set should be instances of classes derived from
-      `FeatureColumn`.
-    num_rnn_layers: Number of RNN layers.
-    optimizer_type: The type of optimizer to use. Either a subclass of
-      `Optimizer`, an instance of an `Optimizer` or a string. Strings must be
-      one of 'Adagrad', 'Momentum' or 'SGD'.
-    learning_rate: Learning rate. This argument has no effect if `optimizer`
-      is an instance of an `Optimizer`.
-    momentum: Momentum value. Only used if `optimizer_type` is 'Momentum'.
-    gradient_clipping_norm: Parameter used for gradient clipping. If `None`,
-      then no clipping is performed.
-    dropout_keep_probabilities: a list of dropout keep probabilities or `None`.
-        If given a list, it must have length `num_rnn_layers + 1`.
-    model_dir: The directory in which to save and restore the model graph,
-      parameters, etc.
-    config: A `RunConfig` instance.
-    feature_engineering_fn: Takes features and labels which are the output of
-      `input_fn` and returns features and labels which will be fed into
-      `model_fn`. Please check `model_fn` for a definition of features and
-      labels.
-    num_threads: The Python integer number of threads enqueuing input examples
-      into a queue. Defaults to 3.
-    queue_capacity: The max capacity of the queue in number of examples.
-      Needs to be at least `batch_size`. Defaults to 1000. When iterating
-      over the same input example multiple times reusing their keys the
-      `queue_capacity` must be smaller than the number of examples.
-    seed: Fixes the random seed used for generating input keys by the SQSS.
-  Returns:
-    An initialized `Estimator`.
-  """
-  num_units = [num_units for _ in range(num_rnn_layers)]
-  return StateSavingRnnEstimator(
-      constants.ProblemType.LINEAR_REGRESSION,
-      num_unroll,
-      batch_size,
-      sequence_feature_columns,
-      context_feature_columns=context_feature_columns,
-      num_classes=None,
-      num_units=num_units,
-      cell_type='lstm',
-      optimizer_type=optimizer_type,
-      learning_rate=learning_rate,
-      predict_probabilities=False,
-      momentum=momentum,
-      gradient_clipping_norm=gradient_clipping_norm,
-      dropout_keep_probabilities=dropout_keep_probabilities,
-      model_dir=model_dir,
-      config=config,
-      feature_engineering_fn=feature_engineering_fn,
-      num_threads=num_threads,
-      queue_capacity=queue_capacity,
-      seed=seed)
-
-
-@deprecated('2017-04-01', 'multi_value_rnn_classifier is deprecated. '
-            'Please construct a StateSavingRnnEstimator directly.')
-def multi_value_rnn_classifier(num_classes,
-                               num_units,
-                               num_unroll,
-                               batch_size,
-                               sequence_feature_columns,
-                               context_feature_columns=None,
-                               num_rnn_layers=1,
-                               optimizer_type='SGD',
-                               learning_rate=0.1,
-                               predict_probabilities=False,
-                               momentum=None,
-                               gradient_clipping_norm=5.0,
-                               dropout_keep_probabilities=None,
-                               model_dir=None,
-                               config=None,
-                               feature_engineering_fn=None,
-                               num_threads=3,
-                               queue_capacity=1000,
-                               seed=None):
-  """Creates a RNN `Estimator` that predicts sequences of labels.
-
-  Args:
-    num_classes: The number of classes for categorization.
-    num_units: The size of the RNN cells.
-    num_unroll: Python integer, how many time steps to unroll at a time.
-      The input sequences of length `k` are then split into `k / num_unroll`
-      many segments.
-    batch_size: Python integer, the size of the minibatch.
-    sequence_feature_columns: An iterable containing all the feature columns
-      describing sequence features. All items in the set should be instances
-      of classes derived from `FeatureColumn`.
-    context_feature_columns: An iterable containing all the feature columns
-      describing context features, i.e., features that apply accross all time
-      steps. All items in the set should be instances of classes derived from
-      `FeatureColumn`.
-    num_rnn_layers: Number of RNN layers.
-    optimizer_type: The type of optimizer to use. Either a subclass of
-      `Optimizer`, an instance of an `Optimizer` or a string. Strings must be
-      one of 'Adagrad', 'Momentum' or 'SGD'.
-    learning_rate: Learning rate. This argument has no effect if `optimizer`
-      is an instance of an `Optimizer`.
-    predict_probabilities: A boolean indicating whether to predict probabilities
-      for all classes.
-    momentum: Momentum value. Only used if `optimizer_type` is 'Momentum'.
-    gradient_clipping_norm: Parameter used for gradient clipping. If `None`,
-      then no clipping is performed.
-    dropout_keep_probabilities: a list of dropout keep probabilities or `None`.
-        If given a list, it must have length `num_rnn_layers + 1`.
-    model_dir: The directory in which to save and restore the model graph,
-      parameters, etc.
-    config: A `RunConfig` instance.
-    feature_engineering_fn: Takes features and labels which are the output of
-      `input_fn` and returns features and labels which will be fed into
-      `model_fn`. Please check `model_fn` for a definition of features and
-      labels.
-    num_threads: The Python integer number of threads enqueuing input examples
-      into a queue. Defaults to 3.
-    queue_capacity: The max capacity of the queue in number of examples.
-      Needs to be at least `batch_size`. Defaults to 1000. When iterating
-      over the same input example multiple times reusing their keys the
-      `queue_capacity` must be smaller than the number of examples.
-    seed: Fixes the random seed used for generating input keys by the SQSS.
-  Returns:
-    An initialized `Estimator`.
-  """
-  num_units = [num_units for _ in range(num_rnn_layers)]
-  return StateSavingRnnEstimator(
-      constants.ProblemType.CLASSIFICATION,
-      num_unroll,
-      batch_size,
-      sequence_feature_columns,
-      context_feature_columns=context_feature_columns,
-      num_classes=num_classes,
-      num_units=num_units,
-      cell_type='lstm',
-      optimizer_type=optimizer_type,
-      learning_rate=learning_rate,
-      predict_probabilities=predict_probabilities,
-      momentum=momentum,
-      gradient_clipping_norm=gradient_clipping_norm,
-      dropout_keep_probabilities=dropout_keep_probabilities,
-      model_dir=model_dir,
-      config=config,
-      feature_engineering_fn=feature_engineering_fn,
-      num_threads=num_threads,
-      queue_capacity=queue_capacity,
-      seed=seed)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
index f5bd03429c64abfcd36f0ec97728824d0f6f223a..442247409dbc49052466c8b476be2ad1c840a814 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/state_saving_rnn_estimator_test.py
@@ -35,8 +35,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -55,7 +55,7 @@ class PrepareInputsForRnnTest(test.TestCase):
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
-      sess.run(data_flow_ops.initialize_all_tables())
+      sess.run(lookup_ops.tables_initializer())
       features_val = sess.run(features_by_time)
       self.assertAllEqual(expected, features_val)
 
@@ -316,7 +316,7 @@ class StateSavingRnnEstimatorTest(test.TestCase):
 
     with self.test_session() as sess:
       sess.run(variables.global_variables_initializer())
-      sess.run(data_flow_ops.initialize_all_tables())
+      sess.run(lookup_ops.tables_initializer())
       actual_sequence, actual_context = sess.run(
           [sequence, context])
       assert_equal(expected_sequence, actual_sequence)
@@ -455,56 +455,6 @@ class LegacyConstructorTest(test.TestCase):
       return {'inputs': inputs}, labels
     return input_fn
 
-  def testClassifierConstructor(self):
-    batch_size = 16
-    num_classes = 2
-    num_unroll = 32
-    sequence_length = 32
-    num_units = 4
-    learning_rate = 0.5
-    steps = 100
-    input_fn = self._get_input_fn(sequence_length,
-                                  seed=1234)
-    model_dir = tempfile.mkdtemp()
-    seq_columns = [
-        feature_column.real_valued_column(
-            'inputs', dimension=num_units)
-    ]
-    estimator = ssre.multi_value_rnn_classifier(num_classes,
-                                                num_units,
-                                                num_unroll,
-                                                batch_size,
-                                                seq_columns,
-                                                learning_rate=learning_rate,
-                                                model_dir=model_dir,
-                                                queue_capacity=batch_size+2,
-                                                seed=1234)
-    estimator.fit(input_fn=input_fn, steps=steps)
-
-  def testRegressorConstructor(self):
-    batch_size = 16
-    num_unroll = 32
-    sequence_length = 32
-    num_units = 4
-    learning_rate = 0.5
-    steps = 100
-    input_fn = self._get_input_fn(sequence_length,
-                                  seed=4321)
-    model_dir = tempfile.mkdtemp()
-    seq_columns = [
-        feature_column.real_valued_column(
-            'inputs', dimension=num_units)
-    ]
-    estimator = ssre.multi_value_rnn_regressor(num_units,
-                                               num_unroll,
-                                               batch_size,
-                                               seq_columns,
-                                               learning_rate=learning_rate,
-                                               model_dir=model_dir,
-                                               queue_capacity=batch_size+2,
-                                               seed=1234)
-    estimator.fit(input_fn=input_fn, steps=steps)
-
 
 # TODO(jtbates): move all tests below to a benchmark test.
 class StateSavingRNNEstimatorLearningTest(test.TestCase):
@@ -575,7 +525,7 @@ class StateSavingRNNEstimatorLearningTest(test.TestCase):
     num_classes = 2
     num_unroll = 32
     sequence_length = 32
-    train_steps = 200
+    train_steps = 300
     eval_steps = 20
     num_units = [4]
     learning_rate = 0.5
diff --git a/tensorflow/contrib/learn/python/learn/estimators/svm_test.py b/tensorflow/contrib/learn/python/learn/estimators/svm_test.py
index ccb33cae1e570db29528812903045f6b384c6f2a..f67f181d1ad629825aa7834f44199409cf15f774 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/svm_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/svm_test.py
@@ -59,9 +59,9 @@ class SVMTest(test.TestCase):
     def input_fn():
       return {
           'example_id': constant_op.constant(['1', '2', '3']),
-          'feature1': constant_op.constant([[0.5], [1.0], [1.0]]),
-          'feature2': constant_op.constant([[1.0], [-1.0], [0.5]]),
-      }, constant_op.constant([[1], [0], [1]])
+          'feature1': constant_op.constant([0.5, 1.0, 1.0]),
+          'feature2': constant_op.constant([1.0, -1.0, 0.5]),
+      }, constant_op.constant([1, 0, 1])
 
     feature1 = feature_column.real_valued_column('feature1')
     feature2 = feature_column.real_valued_column('feature2')
@@ -142,7 +142,7 @@ class SVMTest(test.TestCase):
     def input_fn():
       return {
           'example_id': constant_op.constant(['1', '2', '3']),
-          'feature1': constant_op.constant([[0.5], [1.0], [1.0]]),
+          'feature1': constant_op.constant([0.5, 1.0, 1.0]),
           'feature2': constant_op.constant([[1.0], [-1.0], [0.5]]),
       }, constant_op.constant([[1], [0], [1]])
 
@@ -223,7 +223,7 @@ class SVMTest(test.TestCase):
           'example_id':
               constant_op.constant(['1', '2', '3']),
           'price':
-              constant_op.constant([[0.6], [0.8], [0.3]]),
+              constant_op.constant([0.6, 0.8, 0.3]),
           'sq_footage':
               constant_op.constant([[900.0], [700.0], [600.0]]),
           'country':
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index a8f8d995fe666a890f3ebf2f75f253d5535b2857..c60ecac5df4362b4300ecb668dbc57e30ab04ab6 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -53,7 +53,7 @@ class Experiment(object):
   """
 
   # TODO(ispir): remove delay_workers_by_global_step and make global step based
-  # waiting as only behaviour.
+  # waiting as only behavior.
   @deprecated_args(
       "2016-10-23",
       "local_eval_frequency is deprecated as local_run will be renamed to "
@@ -76,7 +76,7 @@ class Experiment(object):
                local_eval_frequency=None,
                eval_delay_secs=120,
                continuous_eval_throttle_secs=60,
-               min_eval_frequency=1,
+               min_eval_frequency=None,
                delay_workers_by_global_step=False,
                export_strategies=None,
                train_steps_per_iteration=None):
@@ -97,7 +97,8 @@ class Experiment(object):
         finite number of batches (generally, 1 epoch over the evaluation data).
       eval_metrics: `dict` of string, metric function. If `None`, default set
         is used. This should be `None` if the `estimator` is
-        ${tf.estimator.Estimator}.
+        ${tf.estimator.Estimator}. If metrics are provided they will be
+        *appended* to the default set.
       train_steps: Perform this many steps of training. `None`, the default,
         means train forever.
       eval_steps: `evaluate` runs until input is exhausted (or another exception
@@ -115,9 +116,13 @@ class Experiment(object):
       min_eval_frequency: (applies only to train_and_evaluate). the minimum
         number of steps between evaluations. Of course, evaluation does not
         occur if no new snapshot is available, hence, this is the minimum.
+        If 0, the evaluation will only happen after training.
+        If None, defaults to 1, unless model_dir is on GCS, in which case the
+        default is 1000.
       delay_workers_by_global_step: if `True` delays training workers
         based on global step instead of time.
-      export_strategies: A list of `ExportStrategy`s, or a single one, or None.
+      export_strategies: Iterable of `ExportStrategy`s, or a single one, or
+        `None`.
       train_steps_per_iteration: (applies only to continuous_train_and_eval).
         Perform this many (integer) number of train steps for each
         training-evaluation iteration. With a small value, the model will be
@@ -155,7 +160,13 @@ class Experiment(object):
     self._local_eval_frequency = local_eval_frequency
     self._eval_delay_secs = eval_delay_secs
     self._continuous_eval_throttle_secs = continuous_eval_throttle_secs
-    self._min_eval_frequency = min_eval_frequency
+    # Using 1 on a non-cached file system requires a lot of overhead to
+    # read the checkpoint state file. This is particular bad on GCS, so
+    # we use a different default. This is a temporary band-aid, to be
+    # fixed holistically later (b/36498507).
+    default_min_eval_frequency = 1000 if _is_gcs(estimator.model_dir) else 1
+    self._min_eval_frequency = min_eval_frequency if (
+        min_eval_frequency is not None) else default_min_eval_frequency
     self._delay_workers_by_global_step = delay_workers_by_global_step
     self._train_monitors = train_monitors[:] if train_monitors else []
     self._eval_hooks = eval_hooks[:] if eval_hooks else []
@@ -183,16 +194,19 @@ class Experiment(object):
   def eval_steps(self):
     return self._eval_steps
 
-  def _set_export_strategies(self, value):
-    if value is None:
-      self._export_strategies = []
-    elif isinstance(value, list):
-      self._export_strategies = value[:]
-    elif isinstance(value, export_strategy.ExportStrategy):
-      self._export_strategies = [value]
-    else:
-      raise ValueError("`export_strategies` must be an ExportStrategy, "
-                       "a list of ExportStrategies, or None.")
+  def _set_export_strategies(self, values):  # pylint: disable=missing-docstring
+    export_strategies = []
+    if values:
+      if isinstance(values, export_strategy.ExportStrategy):
+        export_strategies.append(values)
+      else:
+        for value in values:
+          if not isinstance(value, export_strategy.ExportStrategy):
+            raise ValueError("`export_strategies` must be an ExportStrategy,"
+                             " an iterable of ExportStrategy, or `None`,"
+                             " found %s." % value)
+          export_strategies.append(value)
+    self._export_strategies = tuple(export_strategies)
 
   def extend_train_hooks(self, additional_hooks):
     """Extends the hooks for training."""
@@ -442,7 +456,7 @@ class Experiment(object):
     """Interleaves training and evaluation.
 
     The frequency of evaluation is controlled by the contructor arg
-    `min_eval_frequency`. When this parameter is None or 0, evaluation happens
+    `min_eval_frequency`. When this parameter is 0, evaluation happens
     only after training has completed. Note that evaluation cannot happen
     more frequently than checkpoints are taken. If no new snapshots are
     available when evaluation is supposed to occur, then evaluation doesn't
@@ -497,6 +511,8 @@ class Experiment(object):
     (via constructor). The model will be first trained for
     `train_steps_per_iteration`, and then be evaluated in turns.
 
+    This method is intended for single machine usage.
+
     This differs from `train_and_evaluate` as follows:
       1. The procedure will have train and evaluation in turns. The model
       will be trained for a number of steps (usuallly smaller than `train_steps`
@@ -534,7 +550,7 @@ class Experiment(object):
     eval_result = None
 
     # Set the default value for train_steps_per_iteration, which will be
-    # overriden by other settings.
+    # overridden by other settings.
     train_steps_per_iteration = 1000
     if self._train_steps_per_iteration is not None:
       train_steps_per_iteration = self._train_steps_per_iteration
@@ -633,6 +649,10 @@ class Experiment(object):
     if _sentinel is not None:
       raise ValueError("_call_train should be called with keyword args only")
 
+    # Estimator in core cannot work with monitors. We need to convert them
+    # to hooks. For Estimator in contrib, it is converted internally. So, it is
+    # safe to convert for both cases.
+    hooks = monitors.replace_monitors_with_hooks(hooks, self._estimator)
     if self._core_estimator_used:
       return self._estimator.train(input_fn=input_fn,
                                    steps=steps,
@@ -694,3 +714,7 @@ def _new_attr_context(obj, attr):
     yield
   finally:
     setattr(obj, attr, saved)
+
+
+def _is_gcs(model_dir):
+  return model_dir and model_dir.startswith("gs://")
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index 00ed062b0aa9d01bf9470ed8b72c4c9176fffb20..17feeb273625947fc3d59f0b3de71d08848c95e0 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -22,9 +22,9 @@ import os
 import tempfile
 import time
 
+from tensorflow.contrib.learn.python.learn import estimator as estimator_lib
 from tensorflow.contrib.learn.python.learn import evaluable
 from tensorflow.contrib.learn.python.learn import experiment
-from tensorflow.contrib.learn.python.learn import monitors
 from tensorflow.contrib.learn.python.learn import run_config
 from tensorflow.contrib.learn.python.learn import trainable
 from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
@@ -39,6 +39,7 @@ from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_inspect
 
 
 class SheepCounter(object):
@@ -120,6 +121,15 @@ class TestBaseEstimator(object):
         compat.as_bytes(export_dir_base), compat.as_bytes('bogus_timestamp'))
 
 
+def _check_method_supports_args(method, kwargs):
+  """Checks that the given method supports the given args."""
+  supported_args = tuple(tf_inspect.getargspec(method).args)
+  for kwarg in kwargs:
+    if kwarg not in supported_args:
+      raise ValueError(
+          'Argument `{}` is not supported in method {}.'.format(kwarg, method))
+
+
 class TestEstimator(
     TestBaseEstimator, evaluable.Evaluable, trainable.Trainable):
 
@@ -127,9 +137,12 @@ class TestEstimator(
     super(TestEstimator, self).__init__(config, max_evals, eval_dict)
     tf_logging.info('Create Estimator')
 
+  def evaluate(self, **kwargs):
+    _check_method_supports_args(evaluable.Evaluable.evaluate, kwargs)
+    return super(TestEstimator, self).evaluate(**kwargs)
+
   def fit(self, **kwargs):
-    if 'hooks' in kwargs:
-      raise ValueError('`hooks` is defined in core Estimator')
+    _check_method_supports_args(trainable.Trainable.fit, kwargs)
     if 'monitors' in kwargs:
       self.monitors = kwargs['monitors']
     return super(TestEstimator, self).train(**kwargs)
@@ -137,6 +150,13 @@ class TestEstimator(
   def train(self, **kwargs):
     raise ValueError('`train` is not defined in Estimator.')
 
+  def export_savedmodel(
+      self, export_dir_base, serving_input_fn, **kwargs):
+    _check_method_supports_args(
+        estimator_lib.Estimator.export_savedmodel, kwargs)
+    return super(TestEstimator, self).export_savedmodel(
+        export_dir_base, serving_input_fn, **kwargs)
+
 
 class TestCoreEstimator(TestBaseEstimator, core_estimator.Estimator):
 
@@ -145,17 +165,22 @@ class TestCoreEstimator(TestBaseEstimator, core_estimator.Estimator):
     tf_logging.info('Create Core Estimator')
 
   def evaluate(self, **kwargs):
-    if 'eval_metrics' in kwargs:
-      raise ValueError('`eval_metrics` is not defined in core Estimator')
+    _check_method_supports_args(core_estimator.Estimator.evaluate, kwargs)
     return super(TestCoreEstimator, self).evaluate(**kwargs)
 
   def train(self, **kwargs):
-    if 'monitors' in kwargs:
-      raise ValueError('`monitors` is not defined in core Estimator')
+    _check_method_supports_args(core_estimator.Estimator.train, kwargs)
     if 'hooks' in kwargs:
       self.monitors = kwargs['hooks']
     return super(TestCoreEstimator, self).train(**kwargs)
 
+  def export_savedmodel(
+      self, export_dir_base, serving_input_receiver_fn, **kwargs):
+    _check_method_supports_args(
+        core_estimator.Estimator.export_savedmodel, kwargs)
+    return super(TestCoreEstimator, self).export_savedmodel(
+        export_dir_base, serving_input_receiver_fn, **kwargs)
+
 
 class _NoopHook(session_run_hook.SessionRunHook):
   pass
@@ -185,6 +210,23 @@ class ExperimentTest(test.TestCase):
           eval_input_fn='eval_input',
           eval_metrics='eval_metrics')
 
+  def test_default_output_alternative_key_core_estimator(self):
+    est = TestCoreEstimator()
+    export_strategy = saved_model_export_utils.make_export_strategy(
+        est,
+        default_output_alternative_key='export_key',
+        exports_to_keep=None)
+    ex = experiment.Experiment(
+        est,
+        train_input_fn='train_input',
+        eval_input_fn='eval_input',
+        train_steps=100,
+        eval_steps=100,
+        export_strategies=export_strategy)
+    with self.assertRaisesRegexp(
+        ValueError, 'default_output_alternative_key is not supported'):
+      ex.train_and_evaluate()
+
   def test_train(self):
     for est in self._estimators_for_tests():
       eval_metrics = 'eval_metrics' if not isinstance(
@@ -461,7 +503,8 @@ class ExperimentTest(test.TestCase):
       self.assertEqual(1, est.eval_count)
       self.assertEqual(1, len(est.monitors))
       self.assertEqual([noop_hook], est.eval_hooks)
-      self.assertTrue(isinstance(est.monitors[0], monitors.ValidationMonitor))
+      self.assertTrue(isinstance(est.monitors[0],
+                                 session_run_hook.SessionRunHook))
 
   def test_train_hooks_extend_does_not_mutate_input_hooks(self):
     for est in self._estimators_for_tests():
@@ -484,12 +527,33 @@ class ExperimentTest(test.TestCase):
       self.assertAllEqual([noop_hook, another_noop_hook], ex._train_monitors)
       self.assertAllEqual([noop_hook], input_hooks)
 
+  def test_invalid_export_strategies(self):
+    for est in self._estimators_for_tests():
+      with self.assertRaisesRegexp(ValueError, 'ExportStrategy'):
+        experiment.Experiment(
+            est,
+            train_input_fn='train_input',
+            eval_input_fn='eval_input',
+            train_steps=100,
+            eval_steps=100,
+            export_strategies='not_an_export_strategy')
+      with self.assertRaisesRegexp(ValueError, 'ExportStrategy'):
+        experiment.Experiment(
+            est,
+            train_input_fn='train_input',
+            eval_input_fn='eval_input',
+            train_steps=100,
+            eval_steps=100,
+            export_strategies=['not_an_export_srategy'])
+
   def test_export_strategies_reset(self):
     for est in self._estimators_for_tests():
       eval_metrics = 'eval_metrics' if not isinstance(
           est, core_estimator.Estimator) else None
       export_strategy_1 = saved_model_export_utils.make_export_strategy(
-          est, 'export_input_1', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_1',
+          exports_to_keep=None)
 
       ex = experiment.Experiment(
           est,
@@ -498,7 +562,7 @@ class ExperimentTest(test.TestCase):
           eval_metrics=eval_metrics,
           train_steps=100,
           eval_steps=100,
-          export_strategies=[export_strategy_1])
+          export_strategies=(export_strategy_1,))
       ex.train_and_evaluate()
       self.assertEqual(1, est.export_count)
 
@@ -512,9 +576,13 @@ class ExperimentTest(test.TestCase):
       # After reset with list, the count should increase with the number of
       # items.
       export_strategy_2 = saved_model_export_utils.make_export_strategy(
-          est, 'export_input_2', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_2',
+          exports_to_keep=None)
       export_strategy_3 = saved_model_export_utils.make_export_strategy(
-          est, 'export_input_3', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_3',
+          exports_to_keep=None)
 
       old_es = ex.reset_export_strategies(
           [export_strategy_2, export_strategy_3])
@@ -528,7 +596,9 @@ class ExperimentTest(test.TestCase):
           est, core_estimator.Estimator) else None
       noop_hook = _NoopHook()
       export_strategy = saved_model_export_utils.make_export_strategy(
-          est, 'export_input', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_input',
+          exports_to_keep=None)
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
@@ -544,7 +614,61 @@ class ExperimentTest(test.TestCase):
       self.assertEqual(1, est.export_count)
       self.assertEqual(1, len(est.monitors))
       self.assertEqual([noop_hook], est.eval_hooks)
-      self.assertTrue(isinstance(est.monitors[0], monitors.ValidationMonitor))
+      self.assertTrue(isinstance(est.monitors[0],
+                                 session_run_hook.SessionRunHook))
+
+  def test_train_and_evaluate_with_no_eval_during_training(self):
+    for est in self._estimators_for_tests():
+      eval_metrics = 'eval_metrics' if not isinstance(
+          est, core_estimator.Estimator) else None
+      noop_hook = _NoopHook()
+      ex = experiment.Experiment(
+          est,
+          train_input_fn='train_input',
+          eval_input_fn='eval_input',
+          eval_metrics=eval_metrics,
+          eval_hooks=[noop_hook],
+          train_steps=100,
+          eval_steps=100,
+          min_eval_frequency=0)
+      ex.train_and_evaluate()
+      self.assertEqual(1, est.fit_count)
+      self.assertEqual(1, est.eval_count)
+      self.assertEqual(0, len(est.monitors))
+
+  def test_min_eval_frequency_defaults(self):
+    def dummy_model_fn(features, labels):  # pylint: disable=unused-argument
+      pass
+
+    # The default value when model_dir is on GCS is 1000
+    estimator = core_estimator.Estimator(dummy_model_fn, 'gs://dummy_bucket')
+    ex = experiment.Experiment(
+        estimator, train_input_fn=None, eval_input_fn=None)
+    self.assertEquals(ex._min_eval_frequency, 1000)
+
+    # The default value when model_dir is not on GCS is 1
+    estimator = core_estimator.Estimator(dummy_model_fn, '/tmp/dummy')
+    ex = experiment.Experiment(
+        estimator, train_input_fn=None, eval_input_fn=None)
+    self.assertEquals(ex._min_eval_frequency, 1)
+
+    # Make sure default not used when explicitly set
+    estimator = core_estimator.Estimator(dummy_model_fn, 'gs://dummy_bucket')
+    ex = experiment.Experiment(
+        estimator,
+        min_eval_frequency=123,
+        train_input_fn=None,
+        eval_input_fn=None)
+    self.assertEquals(ex._min_eval_frequency, 123)
+
+    # Make sure default not used when explicitly set as 0
+    estimator = core_estimator.Estimator(dummy_model_fn, 'gs://dummy_bucket')
+    ex = experiment.Experiment(
+        estimator,
+        min_eval_frequency=0,
+        train_input_fn=None,
+        eval_input_fn=None)
+    self.assertEquals(ex._min_eval_frequency, 0)
 
   def test_continuous_train_and_eval(self):
     for est in self._estimators_for_tests(eval_dict={'global_step': 100}):
@@ -552,7 +676,9 @@ class ExperimentTest(test.TestCase):
           est, core_estimator.Estimator) else None
       noop_hook = _NoopHook()
       export_strategy = saved_model_export_utils.make_export_strategy(
-          est, 'export_input', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_input',
+          exports_to_keep=None)
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
@@ -573,7 +699,9 @@ class ExperimentTest(test.TestCase):
       eval_metrics = 'eval_metrics' if not isinstance(
           est, core_estimator.Estimator) else None
       export_strategy = saved_model_export_utils.make_export_strategy(
-          est, 'export_input', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_input',
+          exports_to_keep=None)
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
@@ -723,12 +851,14 @@ class ExperimentTest(test.TestCase):
   def test_test(self):
     for est in self._estimators_for_tests():
       exp_strategy = saved_model_export_utils.make_export_strategy(
-          est, 'export_input', exports_to_keep=None)
+          est,
+          None if isinstance(est, core_estimator.Estimator) else 'export_input',
+          exports_to_keep=None)
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
           eval_input_fn='eval_input',
-          export_strategies=[exp_strategy])
+          export_strategies=(exp_strategy,))
       ex.test()
       self.assertEqual(1, est.fit_count)
       self.assertEqual(1, est.eval_count)
diff --git a/tensorflow/contrib/learn/python/learn/export_strategy.py b/tensorflow/contrib/learn/python/learn/export_strategy.py
index c62b8861a1e1e026145da51236ddfd1c9f3aa4b5..f276aab0e6beb011a21c20fa194dd5212db796d1 100644
--- a/tensorflow/contrib/learn/python/learn/export_strategy.py
+++ b/tensorflow/contrib/learn/python/learn/export_strategy.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """ExportStrategy class represents different flavors of model export."""
 
 from __future__ import absolute_import
@@ -20,13 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import inspect
+
+from tensorflow.python.util import tf_inspect
 
 __all__ = ['ExportStrategy']
 
 
-class ExportStrategy(collections.namedtuple('ExportStrategy',
-                                            ['name', 'export_fn'])):
+class ExportStrategy(
+    collections.namedtuple('ExportStrategy', ['name', 'export_fn'])):
   """A class representing a type of model export.
 
   Typically constructed by a utility function specific to the exporter, such as
@@ -74,7 +74,7 @@ class ExportStrategy(collections.namedtuple('ExportStrategy',
     """
     # don't break existing export_fns that don't accept checkpoint_path and
     # eval_result
-    export_fn_args = inspect.getargspec(self.export_fn).args
+    export_fn_args = tf_inspect.getargspec(self.export_fn).args
     kwargs = {}
     if 'checkpoint_path' in export_fn_args:
       kwargs['checkpoint_path'] = checkpoint_path
diff --git a/tensorflow/contrib/learn/python/learn/graph_actions.py b/tensorflow/contrib/learn/python/learn/graph_actions.py
index 4b7867f2d0013012c7d988bbd84fa591942b7e04..98365c05f663e5d2a06703457fc5663d7135f7d9 100644
--- a/tensorflow/contrib/learn/python/learn/graph_actions.py
+++ b/tensorflow/contrib/learn/python/learn/graph_actions.py
@@ -37,8 +37,8 @@ from tensorflow.python.client import session as tf_session
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -429,11 +429,14 @@ def _get_ready_op():
 
 
 def _get_local_init_op():
+  """Returns the local init ops to initialize tables and local variables."""
   local_init_op = _get_first_op_from_collection(
       ops.GraphKeys.LOCAL_INIT_OP)
   if local_init_op is None:
-    op_list = [variables.local_variables_initializer(),
-               data_flow_ops.tables_initializer()]
+    op_list = [
+        variables.local_variables_initializer(),
+        lookup_ops.tables_initializer()
+    ]
     if op_list:
       local_init_op = control_flow_ops.group(*op_list)
       ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
@@ -680,7 +683,7 @@ def run_feeds_iter(output_dict, feed_dicts, restore_checkpoint_path=None):
       else:
         session.run(variables.global_variables_initializer())
       session.run(variables.local_variables_initializer())
-      session.run(data_flow_ops.tables_initializer())
+      session.run(lookup_ops.tables_initializer())
       coord = coordinator.Coordinator()
       threads = None
       try:
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
index 456792835827f86c0fbc42822e688240e6643ed4..06c3782a471537cf3879450e6bd20899a35d96ac 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/__init__.py
@@ -21,14 +21,14 @@ from __future__ import print_function
 from tensorflow.contrib.learn.python.learn.learn_io.dask_io import extract_dask_data
 from tensorflow.contrib.learn.python.learn.learn_io.dask_io import extract_dask_labels
 from tensorflow.contrib.learn.python.learn.learn_io.dask_io import HAS_DASK
-from tensorflow.contrib.learn.python.learn.learn_io.graph_io import _read_keyed_batch_examples_shared_queue
-from tensorflow.contrib.learn.python.learn.learn_io.graph_io import _read_keyed_batch_features_shared_queue
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import queue_parsed_features
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_batch_examples
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_batch_features
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_batch_record_features
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_examples
+from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_examples_shared_queue
 from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_features
+from tensorflow.contrib.learn.python.learn.learn_io.graph_io import read_keyed_batch_features_shared_queue
 from tensorflow.contrib.learn.python.learn.learn_io.numpy_io import numpy_input_fn
 from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_data
 from tensorflow.contrib.learn.python.learn.learn_io.pandas_io import extract_pandas_labels
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
index 7d08f9b4523f3876845138d033295f636a54ee79..c302c7725a4369b3da3da2eab68198ee0d8d5379 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io.py
@@ -18,8 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from types import FunctionType, GeneratorType
 from collections import Container
+from types import FunctionType
+from types import GeneratorType
 
 from tensorflow.contrib.learn.python.learn.dataframe.queues import feeding_functions
 
@@ -33,7 +34,7 @@ def generator_input_fn(x,
                        num_threads=1):
   """Returns input function that would dicts of numpy arrays
        yielded from a generator.
-  
+
   It is assumed that every dict yielded from the dictionary represents
   a single sample. The generator should consume a single epoch of the data.
 
@@ -82,47 +83,44 @@ def generator_input_fn(x,
     KeyError: `key` mismatch between dicts emitted from `x()`
   """
   if not isinstance(x, FunctionType):
-    raise TypeError('x must be generator function; got {}'.format(
-        type(x).__name__))
+    raise TypeError(
+        'x must be generator function; got {}'.format(type(x).__name__))
   generator = x()
   if not isinstance(generator, GeneratorType):
-    raise TypeError('x() must be generator; got {}'.format(
-        type(generator).__name__))
+    raise TypeError(
+        'x() must be generator; got {}'.format(type(generator).__name__))
   data = next(generator)
   if not isinstance(data, dict):
-    raise TypeError('x() must yield dict; got {}'.format(
-        type(data).__name__))
+    raise TypeError('x() must yield dict; got {}'.format(type(data).__name__))
   input_keys = sorted(next(x()).keys())
   if target_key is not None:
     if isinstance(target_key, str):
       target_key = [target_key]
-    elif isinstance(target_key,  Container):
+    elif isinstance(target_key, Container):
       for item in target_key:
         if not isinstance(item, str):
-          raise TypeError(
-              'target_key must be str or Container of str; got {}'.format(
-                  type(item).__name__))
+          raise TypeError('target_key must be str or Container of str; got {}'.
+                          format(type(item).__name__))
         if item not in input_keys:
           raise KeyError(
               'target_key not in yielded dict. Expected {} keys; got {}'.format(
                   input_keys, item))
     else:
-      raise TypeError(
-          'target_key must be str or Container of str; got {}'.format(
-              type(target_key).__name__))
+      raise TypeError('target_key must be str or Container of str; got {}'.
+                      format(type(target_key).__name__))
 
   def _generator_input_fn():
     """generator input function."""
     queue = feeding_functions.enqueue_data(
-      x,
-      queue_capacity,
-      shuffle=shuffle,
-      num_threads=num_threads,
-      enqueue_size=batch_size,
-      num_epochs=num_epochs)
+        x,
+        queue_capacity,
+        shuffle=shuffle,
+        num_threads=num_threads,
+        enqueue_size=batch_size,
+        num_epochs=num_epochs)
 
-    features = (queue.dequeue_many(batch_size) if num_epochs is None
-                else queue.dequeue_up_to(batch_size))
+    features = (queue.dequeue_many(batch_size)
+                if num_epochs is None else queue.dequeue_up_to(batch_size))
     if not isinstance(features, list):
       features = [features]
     features = dict(zip(input_keys, features))
@@ -133,4 +131,5 @@ def generator_input_fn(x,
         target = features.pop(target_key[0])
       return features, target
     return features
+
   return _generator_input_fn
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
index 8d3cdec819cd8809e04fb949081e672685217770..bc767ec18b1fac6a304c6cfb9364bbcd9197eae6 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
@@ -35,17 +35,24 @@ from tensorflow.python.training import queue_runner_impl
 
 
 class GeneratorIoTest(test.TestCase):
+
   def testGeneratorInputFn(self):
 
     def generator():
       for index in range(2):
-        yield {'a': np.ones(1) * index,
-               'b': np.ones(1) * index + 32,
-               'label': np.ones(1) * index - 32}
+        yield {
+            'a': np.ones(1) * index,
+            'b': np.ones(1) * index + 32,
+            'label': np.ones(1) * index - 32
+        }
 
     with self.test_session() as session:
       input_fn = generator_io.generator_input_fn(
-        generator, target_key='label', batch_size=2, shuffle=False, num_epochs=1)
+          generator,
+          target_key='label',
+          batch_size=2,
+          shuffle=False,
+          num_epochs=1)
       features, target = input_fn()
 
       coord = coordinator.Coordinator()
@@ -71,7 +78,7 @@ class GeneratorIoTest(test.TestCase):
 
     with self.test_session() as session:
       input_fn = generator_io.generator_input_fn(
-        generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+          generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
       features = input_fn()
 
       coord = coordinator.Coordinator()
@@ -91,15 +98,20 @@ class GeneratorIoTest(test.TestCase):
 
     def generator():
       for index in range(2):
-        yield {'a': np.ones(1) * index,
-               'b': np.ones(1) * index + 32,
-               'label': np.ones(1) * index - 32,
-               'label2': np.ones(1) * index - 64,
-               }
+        yield {
+            'a': np.ones(1) * index,
+            'b': np.ones(1) * index + 32,
+            'label': np.ones(1) * index - 32,
+            'label2': np.ones(1) * index - 64,
+        }
 
     with self.test_session() as session:
       input_fn = generator_io.generator_input_fn(
-        generator, target_key=['label','label2'], batch_size=2, shuffle=False, num_epochs=1)
+          generator,
+          target_key=['label', 'label2'],
+          batch_size=2,
+          shuffle=False,
+          num_epochs=1)
       features, target = input_fn()
 
       coord = coordinator.Coordinator()
@@ -108,8 +120,10 @@ class GeneratorIoTest(test.TestCase):
       res = session.run([features, target])
       self.assertAllEqual(res[0]['a'], np.asarray([0, 1]).reshape(-1, 1))
       self.assertAllEqual(res[0]['b'], np.asarray([32, 33]).reshape(-1, 1))
-      self.assertAllEqual(res[1]['label'], np.asarray([-32, -31]).reshape(-1, 1))
-      self.assertAllEqual(res[1]['label2'], np.asarray([-64, -63]).reshape(-1, 1))
+      self.assertAllEqual(res[1]['label'], np.asarray([-32, -31]).reshape(
+          -1, 1))
+      self.assertAllEqual(res[1]['label2'],
+                          np.asarray([-64, -63]).reshape(-1, 1))
 
       session.run([features])
       with self.assertRaises(errors.OutOfRangeError):
@@ -122,22 +136,34 @@ class GeneratorIoTest(test.TestCase):
 
     def generator():
       for index in range(100):
-        yield {'a': np.ones((10, 10)) * index,
-               'b': np.ones((5, 5)) * index + 32,
-               'label': np.ones((3, 3)) * index - 32}
+        yield {
+            'a': np.ones((10, 10)) * index,
+            'b': np.ones((5, 5)) * index + 32,
+            'label': np.ones((3, 3)) * index - 32
+        }
 
     with self.test_session() as session:
       input_fn = generator_io.generator_input_fn(
-        generator, target_key="label", batch_size=2, shuffle=False, num_epochs=1)
+          generator,
+          target_key='label',
+          batch_size=2,
+          shuffle=False,
+          num_epochs=1)
       features, target = input_fn()
 
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(session, coord=coord)
 
       res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], np.vstack((np.zeros((10, 10)), np.ones((10, 10)))).reshape(2, 10, 10))
-      self.assertAllEqual(res[0]['b'], np.vstack((np.zeros((5, 5)), np.ones((5, 5)))).reshape(2, 5, 5) + 32)
-      self.assertAllEqual(res[1], np.vstack((np.zeros((3, 3)), np.ones((3, 3)))).reshape(2, 3, 3) - 32)
+      self.assertAllEqual(res[0]['a'],
+                          np.vstack((np.zeros((10, 10)), np.ones(
+                              (10, 10)))).reshape(2, 10, 10))
+      self.assertAllEqual(res[0]['b'],
+                          np.vstack((np.zeros((5, 5)), np.ones(
+                              (5, 5)))).reshape(2, 5, 5) + 32)
+      self.assertAllEqual(res[1],
+                          np.vstack((np.zeros((3, 3)), np.ones(
+                              (3, 3)))).reshape(2, 3, 3) - 32)
 
       coord.request_stop()
       coord.join(threads)
@@ -147,82 +173,97 @@ class GeneratorIoTest(test.TestCase):
     with self.test_session():
       with self.assertRaisesRegexp(TypeError, 'x must be generator function'):
         failing_input_fn = generator_io.generator_input_fn(
-          x, batch_size=2, shuffle=False, num_epochs=1)
+            x, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
 
   def testGeneratorInputFnWithXAsNonGenerator(self):
+
     def generator():
       return np.arange(32, 36)
+
     with self.test_session():
-      with self.assertRaisesRegexp(TypeError, "x\(\) must be generator"):
+      with self.assertRaisesRegexp(TypeError, 'x\(\) must be generator'):
         failing_input_fn = generator_io.generator_input_fn(
-          generator, batch_size=2, shuffle=False, num_epochs=1)
+            generator, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
 
   def testGeneratorInputFnWithXAsNonGeneratorYieldingDicts(self):
+
     def generator():
       yield np.arange(32, 36)
+
     with self.test_session():
-      with self.assertRaisesRegexp(TypeError, "x\(\) must yield dict"):
+      with self.assertRaisesRegexp(TypeError, 'x\(\) must yield dict'):
         failing_input_fn = generator_io.generator_input_fn(
-          generator, batch_size=2, shuffle=False, num_epochs=1)
+            generator, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
 
   def testGeneratorInputFNWithTargetLabelNotString(self):
+
     def generator():
       for index in range(2):
-        yield {'a': np.ones((10, 10)) * index,
-               'b': np.ones((5, 5)) * index + 32,
-               'label': np.ones((3, 3)) * index - 32}
+        yield {
+            'a': np.ones((10, 10)) * index,
+            'b': np.ones((5, 5)) * index + 32,
+            'label': np.ones((3, 3)) * index - 32
+        }
 
     y = np.arange(32, 36)
     with self.test_session():
       with self.assertRaisesRegexp(TypeError, 'target_key must be str or'
-                                              ' Container of str'):
+                                   ' Container of str'):
         failing_input_fn = generator_io.generator_input_fn(
-          generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+            generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
 
   def testGeneratorInputFNWithTargetLabelListNotString(self):
+
     def generator():
       for index in range(2):
-        yield {'a': np.ones((10, 10)) * index,
-               'b': np.ones((5, 5)) * index + 32,
-               'label': np.ones((3, 3)) * index - 32}
+        yield {
+            'a': np.ones((10, 10)) * index,
+            'b': np.ones((5, 5)) * index + 32,
+            'label': np.ones((3, 3)) * index - 32
+        }
 
-    y = ["label", np.arange(10)]
+    y = ['label', np.arange(10)]
     with self.test_session():
       with self.assertRaisesRegexp(TypeError, 'target_key must be str or'
-                                              ' Container of str'):
+                                   ' Container of str'):
         failing_input_fn = generator_io.generator_input_fn(
-          generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+            generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
 
   def testGeneratorInputFNWithTargetLabelNotInDict(self):
+
     def generator():
       for index in range(2):
-        yield {'a': np.ones((10, 10)) * index,
-               'b': np.ones((5, 5)) * index + 32,
-               'label': np.ones((3, 3)) * index - 32}
+        yield {
+            'a': np.ones((10, 10)) * index,
+            'b': np.ones((5, 5)) * index + 32,
+            'label': np.ones((3, 3)) * index - 32
+        }
 
-    y = ["label", "target"]
+    y = ['label', 'target']
     with self.test_session():
-      with self.assertRaisesRegexp(KeyError,
-                                   'target_key not in yielded dict'):
+      with self.assertRaisesRegexp(KeyError, 'target_key not in yielded dict'):
         failing_input_fn = generator_io.generator_input_fn(
-          generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
+            generator, target_key=y, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
 
   def testGeneratorInputFnWithNoTargetKey(self):
+
     def generator():
       for index in range(2):
-        yield {'a': np.ones(1) * index,
-               'b': np.ones(1) * index + 32,
-               'label': np.ones(1) * index - 32}
+        yield {
+            'a': np.ones(1) * index,
+            'b': np.ones(1) * index + 32,
+            'label': np.ones(1) * index - 32
+        }
 
     with self.test_session() as session:
       input_fn = generator_io.generator_input_fn(
-        generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+          generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
       features = input_fn()
 
       coord = coordinator.Coordinator()
@@ -241,15 +282,18 @@ class GeneratorIoTest(test.TestCase):
       coord.join(threads)
 
   def testGeneratorInputFnWithBatchLargerthanData(self):
+
     def generator():
       for index in range(2):
-        yield {'a': np.ones(1) * index,
-               'b': np.ones(1) * index + 32,
-               'label': np.ones(1) * index - 32}
+        yield {
+            'a': np.ones(1) * index,
+            'b': np.ones(1) * index + 32,
+            'label': np.ones(1) * index - 32
+        }
 
     with self.test_session() as session:
       input_fn = generator_io.generator_input_fn(
-        generator, target_key=None, batch_size=4, shuffle=False, num_epochs=1)
+          generator, target_key=None, batch_size=4, shuffle=False, num_epochs=1)
       features = input_fn()
 
       coord = coordinator.Coordinator()
@@ -268,19 +312,24 @@ class GeneratorIoTest(test.TestCase):
       coord.join(threads)
 
   def testGeneratorInputFnWithMismatchinGeneratorKeys(self):
+
     def generator():
       index = 0
-      yield {'a': np.ones(1) * index,
-             'b': np.ones(1) * index + 32,
-             'label': np.ones(1) * index - 32}
+      yield {
+          'a': np.ones(1) * index,
+          'b': np.ones(1) * index + 32,
+          'label': np.ones(1) * index - 32
+      }
       index = 1
-      yield {'a': np.ones(1) * index,
-             'c': np.ones(1) * index + 32,
-             'label': np.ones(1) * index - 32}
+      yield {
+          'a': np.ones(1) * index,
+          'c': np.ones(1) * index + 32,
+          'label': np.ones(1) * index - 32
+      }
 
     with self.test_session() as session:
       input_fn = generator_io.generator_input_fn(
-        generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
+          generator, target_key=None, batch_size=2, shuffle=False, num_epochs=1)
       features = input_fn()
 
       coord = coordinator.Coordinator()
@@ -290,9 +339,10 @@ class GeneratorIoTest(test.TestCase):
         session.run([features])
 
       with self.assertRaisesRegex(KeyError, 'key mismatch between dicts emitted'
-                                            ' by GenFunExpected'):
+                                  ' by GenFunExpected'):
         coord.request_stop()
         coord.join(threads)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index 0f317b7bb0441bb6bfff1c232f71cb135d17fec0..6b552f59d080ab977876e5ff99628f51baab0856 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -174,17 +174,17 @@ def read_keyed_batch_examples(file_pattern,
       seed=seed)
 
 
-def _read_keyed_batch_examples_shared_queue(file_pattern,
-                                            batch_size,
-                                            reader,
-                                            randomize_input=True,
-                                            num_epochs=None,
-                                            queue_capacity=10000,
-                                            num_threads=1,
-                                            read_batch_size=1,
-                                            parse_fn=None,
-                                            name=None,
-                                            seed=None):
+def read_keyed_batch_examples_shared_queue(file_pattern,
+                                           batch_size,
+                                           reader,
+                                           randomize_input=True,
+                                           num_epochs=None,
+                                           queue_capacity=10000,
+                                           num_threads=1,
+                                           read_batch_size=1,
+                                           parse_fn=None,
+                                           name=None,
+                                           seed=None):
   """Adds operations to read, queue, batch `Example` protos.
 
   Given file pattern (or list of files), will setup a shared queue for file
@@ -359,8 +359,9 @@ def _read_keyed_batch_examples_helper(file_pattern,
   # Check input parameters are given and reasonable.
   if (not queue_capacity) or (queue_capacity <= 0):
     raise ValueError('Invalid queue_capacity %s.' % queue_capacity)
-  if (batch_size is None) or ((not isinstance(batch_size, ops.Tensor)) and
-                              (batch_size <= 0 or batch_size > queue_capacity)):
+  if (batch_size is None) or (
+      (not isinstance(batch_size, ops.Tensor)) and
+      (batch_size <= 0 or batch_size >= queue_capacity)):
     raise ValueError('Invalid batch_size %s, with queue_capacity %s.' %
                      (batch_size, queue_capacity))
   if (read_batch_size is None) or (
@@ -511,18 +512,18 @@ def read_keyed_batch_features(file_pattern,
         name=scope)
 
 
-def _read_keyed_batch_features_shared_queue(file_pattern,
-                                            batch_size,
-                                            features,
-                                            reader,
-                                            randomize_input=True,
-                                            num_epochs=None,
-                                            queue_capacity=10000,
-                                            reader_num_threads=1,
-                                            feature_queue_capacity=100,
-                                            num_queue_runners=2,
-                                            parse_fn=None,
-                                            name=None):
+def read_keyed_batch_features_shared_queue(file_pattern,
+                                           batch_size,
+                                           features,
+                                           reader,
+                                           randomize_input=True,
+                                           num_epochs=None,
+                                           queue_capacity=10000,
+                                           reader_num_threads=1,
+                                           feature_queue_capacity=100,
+                                           num_queue_runners=2,
+                                           parse_fn=None,
+                                           name=None):
   """Adds operations to read, queue, batch and parse `Example` protos.
 
   Given file pattern (or list of files), will setup a shared queue for file
@@ -570,7 +571,7 @@ def _read_keyed_batch_features_shared_queue(file_pattern,
   """
 
   with ops.name_scope(name, 'read_batch_features', [file_pattern]) as scope:
-    keys, examples = _read_keyed_batch_examples_shared_queue(
+    keys, examples = read_keyed_batch_examples_shared_queue(
         file_pattern,
         batch_size,
         reader,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index 83643689e1acfd1cbe5eabaaea8ffdee9f23890e..f25f7caf61574f4d6cbd4d64b99a5d4f18b6fb44 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -26,7 +26,6 @@ import tempfile
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.learn.python.learn.learn_io import graph_io
-from tensorflow.contrib.learn.python.learn.learn_io.graph_io import _read_keyed_batch_examples_shared_queue
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
@@ -112,6 +111,18 @@ class GraphIOTest(test.TestCase):
         queue_capacity=queue_capacity,
         num_threads=num_threads,
         name=name)
+    self.assertRaisesRegexp(
+        ValueError,
+        "Invalid batch_size",
+        graph_io.read_batch_examples,
+        _VALID_FILE_PATTERN,
+        default_batch_size,
+        io_ops.TFRecordReader,
+        False,
+        num_epochs=None,
+        queue_capacity=default_batch_size,
+        num_threads=num_threads,
+        name=name)
     self.assertRaisesRegexp(
         ValueError,
         "Invalid queue_capacity",
@@ -356,7 +367,7 @@ class GraphIOTest(test.TestCase):
     ]
     filename = self._create_temp_file("".join(json_lines))
     batch_size = 10000
-    queue_capacity = 10000
+    queue_capacity = 100000
     name = "my_large_batch"
 
     features = {"sequence": parsing_ops.FixedLenFeature([], dtypes_lib.string)}
@@ -452,7 +463,7 @@ class GraphIOTest(test.TestCase):
     name = "my_batch"
 
     with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
-      keys, inputs = _read_keyed_batch_examples_shared_queue(
+      keys, inputs = graph_io.read_keyed_batch_examples_shared_queue(
           filenames,
           batch_size,
           reader=io_ops.TextLineReader,
@@ -516,7 +527,7 @@ class GraphIOTest(test.TestCase):
 
     with ops.Graph().as_default() as g1, session_lib.Session(
         server.target, graph=g1) as session:
-      keys, inputs = _read_keyed_batch_examples_shared_queue(
+      keys, inputs = graph_io.read_keyed_batch_examples_shared_queue(
           filenames,
           batch_size,
           reader=io_ops.TextLineReader,
@@ -545,7 +556,7 @@ class GraphIOTest(test.TestCase):
 
     with ops.Graph().as_default() as g2, session_lib.Session(
         server.target, graph=g2) as session:
-      keys, inputs = _read_keyed_batch_examples_shared_queue(
+      keys, inputs = graph_io.read_keyed_batch_examples_shared_queue(
           filenames,
           batch_size,
           reader=io_ops.TextLineReader,
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner.py b/tensorflow/contrib/learn/python/learn/learn_runner.py
index 183ab438b6f2f658da98d6f655c97b4a59ac9a06..943c55531405ebbd301700c9de41ed198bc3995c 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner.py
@@ -19,8 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
 from tensorflow.contrib.learn.python.learn.experiment import Experiment
+from tensorflow.contrib.training.python.training import hparam as hparam_lib
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -46,7 +47,52 @@ def _execute_schedule(experiment, schedule):
   return task()
 
 
-def run(experiment_fn, output_dir, schedule=None):
+def _wrapped_experiment_fn_with_uid_check(experiment_fn, require_hparams=False):
+  """Wraps the `RunConfig` uid check with `experiment_fn`.
+
+  For `experiment_fn` which takes `run_config`, it is expected that the
+  `run_config` is passed to the Estimator correctly. Toward that, the wrapped
+  `experiment_fn` compares the `uid` of the `RunConfig` instance.
+
+  Args:
+    experiment_fn: The original `experiment_fn` which takes `run_config` and
+      `hparams`.
+    require_hparams: If True, the `hparams` passed to `experiment_fn` cannot be
+      `None`.
+
+  Returns:
+    A experiment_fn with same signature.
+  """
+  def wrapped_experiment_fn(run_config, hparams):
+    """Calls experiment_fn and checks the uid of `RunConfig`."""
+    if not isinstance(run_config, run_config_lib.RunConfig):
+      raise ValueError('`run_config` must be `RunConfig` instance')
+    if not run_config.model_dir:
+      raise ValueError(
+          'Must specify a model directory `model_dir` in `run_config`.')
+    if hparams is not None and not isinstance(hparams, hparam_lib.HParams):
+      raise ValueError('`hparams` must be `HParams` instance')
+    if require_hparams and hparams is None:
+      raise ValueError('`hparams` cannot be `None`.')
+
+    expected_uid = run_config.uid()
+    experiment = experiment_fn(run_config, hparams)
+
+    if not isinstance(experiment, Experiment):
+      raise TypeError('Experiment builder did not return an Experiment '
+                      'instance, got %s instead.' % type(experiment))
+
+    if experiment.estimator.config.uid() != expected_uid:
+      raise RuntimeError(
+          '`RunConfig` instance is expected to be used by the `Estimator` '
+          'inside the `Experiment`. expected {}, but got {}'.format(
+              expected_uid, experiment.estimator.config.uid()))
+    return experiment
+  return wrapped_experiment_fn
+
+
+def run(experiment_fn, output_dir=None, schedule=None, run_config=None,
+        hparams=None):
   """Make and run an experiment.
 
   It creates an Experiment by calling `experiment_fn`. Then it calls the
@@ -62,7 +108,34 @@ def run(experiment_fn, output_dir, schedule=None):
   If the experiment's config does not include a task type, then an exception
   is raised.
 
-  Example:
+  Example with `run_config` (Recommended):
+  ```
+    def _create_my_experiment(run_config, hparams):
+
+        # You can change a subset of the run_config properties as
+        #   run_config = run_config.replace(save_checkpoints_steps=500)
+
+        return tf.contrib.learn.Experiment(
+          estimator=my_estimator(config=run_config, hparams=hparams),
+          train_input_fn=my_train_input,
+          eval_input_fn=my_eval_input)
+
+    learn_runner.run(
+      experiment_fn=_create_my_experiment,
+      run_config=run_config_lib.RunConfig(model_dir="some/output/dir"),
+      schedule="train_and_evaluate",
+      hparams=_create_default_hparams())
+  ```
+  or simply as
+  ```
+    learn_runner.run(
+      experiment_fn=_create_my_experiment,
+      run_config=run_config_lib.RunConfig(model_dir="some/output/dir"))
+  ```
+  if `hparams` is not used by the `Estimator`. On a single machine, `schedule`
+  defaults to `train_and_evaluate`.
+
+  Example with `output_dir` (deprecated):
   ```
     def _create_my_experiment(output_dir):
         return tf.contrib.learn.Experiment(
@@ -76,37 +149,63 @@ def run(experiment_fn, output_dir, schedule=None):
       schedule="train")
   ```
   Args:
-    experiment_fn: A function that creates an `Experiment`. It should accept an
-      argument `output_dir` which should be used to create the `Estimator`
-      (passed as `model_dir` to its constructor). It must return an
-      `Experiment`.
-    output_dir: Base output directory.
+    experiment_fn: A function that creates an `Experiment`. It could be one of
+      the two following signatures:
+      1) [Deprecated] It accepts an argument `output_dir` which should be used
+      to create the `Estimator` (passed as `model_dir` to its constructor). It
+      must return an `Experiment`. For this case, `run_config` and `hparams`
+      must be None.
+      2) It accepts two arguments `run_config` and `hparams`, which should be
+      used to create the `Estimator` (`run_config` passed as `config` to its
+      constructor; `hparams` used as the hyper-paremeters of the model).
+      It must return an `Experiment`. For this case, `output_dir` must be None.
+    output_dir: Base output directory [Deprecated].
     schedule: The name of the  method in the `Experiment` to run.
+    run_config: `RunConfig` instance. The `run_config.model_dir` must be
+      non-empty. If `run_config` is set, `output_dir` must be None.
+    hparams: `HParams` instance. The default hyper-parameters, which will be
+      passed to the `experiment_fn` if `run_config` is not None.
 
   Returns:
     The return value of function `schedule`.
 
   Raises:
-    ValueError: If `output_dir` is empty, `schedule` is None but no task
-      type is set in the built experiment's config, the task type has no
-      default, or `schedule` doesn't reference a member of `Experiment`.
+    ValueError: If both `output_dir` and `run_config` are empty or set,
+      `schedule` is None but no task type is set in the built experiment's
+      config, the task type has no default, `run_config.model_dir` is empty or
+      `schedule` doesn't reference a member of `Experiment`.
     TypeError: `schedule` references non-callable member.
   """
-  if not output_dir:
-    raise ValueError('Must specify an output directory')
+
+  if output_dir is not None and run_config is not None:
+    raise ValueError('Cannot provide both `output_dir` and `run_config`')
+
+  if output_dir is None and run_config is None:
+    raise ValueError('Must set value for `output_dir` or `run_config`')
+
   if not callable(experiment_fn):
     raise TypeError('Experiment builder "%s" is not callable.' %
                     experiment_fn)
 
-  # Call the builder
-  experiment = experiment_fn(output_dir=output_dir)
-  if not isinstance(experiment, Experiment):
-    raise TypeError('Experiment builder did not return an Experiment '
-                    'instance, got %s instead.' % type(experiment))
+  experiment = None
+  if run_config is not None:
+    wrapped_experiment_fn = _wrapped_experiment_fn_with_uid_check(experiment_fn)
+    experiment = wrapped_experiment_fn(run_config=run_config, hparams=hparams)
+  else:
+    if not output_dir:
+      raise ValueError('Must specify an output directory')
+    if hparams is not None:
+      raise ValueError(
+          'Must set `hparams` as None for `experiment_fn` with `output_dir`.')
+    # Call the builder
+    experiment = experiment_fn(output_dir=output_dir)
+    if not isinstance(experiment, Experiment):
+      raise TypeError('Experiment builder did not return an Experiment '
+                      'instance, got %s instead.' % type(experiment))
 
   # Get the schedule
-  config = experiment.estimator.config
-  schedule = schedule or _get_default_schedule(config)
+  run_config = run_config or experiment.estimator.config
+  schedule = schedule or _get_default_schedule(run_config)
 
   return _execute_schedule(experiment, schedule)
 
@@ -122,11 +221,11 @@ def tune(experiment_fn, tuner):
 
   Example:
   ```
-    def _create_my_experiment(config, hparams):
+    def _create_my_experiment(run_config, hparams):
       hidden_units = [hparams.unit_per_layer] * hparams.num_hidden_layers
 
       return tf.contrib.learn.Experiment(
-          estimator=DNNClassifier(config=config, hidden_units=hidden_units),
+          estimator=DNNClassifier(config=run_config, hidden_units=hidden_units),
           train_input_fn=my_train_input,
           eval_input_fn=my_eval_input)
 
@@ -136,13 +235,16 @@ def tune(experiment_fn, tuner):
   ```
   Args:
     experiment_fn: A function that creates an `Experiment`. It should accept an
-      argument `config` which should be used to create the `Estimator` (passed
-      as `config` to its constructor), and an argument `hparams`, which should
-      be used for hyper-parameters tuning. It must return an `Experiment`.
+      argument `run_config` which should be used to create the `Estimator` (
+      passed as `config` to its constructor), and an argument `hparams`, which
+      should be used for hyper-parameters tuning. It must return an
+      `Experiment`.
     tuner: A `Tuner` instance.
   """
   while tuner.next_trial():
-    tuner.run_experiment(experiment_fn)
+    tuner.run_experiment(
+        _wrapped_experiment_fn_with_uid_check(
+            experiment_fn, require_hparams=True))
 
 
 def _is_distributed(config):
@@ -168,13 +270,13 @@ def _get_default_schedule(config):
   if not config.task_type:
     raise ValueError('Must specify a schedule')
 
-  if config.task_type == run_config.TaskType.MASTER:
+  if config.task_type == run_config_lib.TaskType.MASTER:
     # TODO(rhaertel): handle the case where there is more than one master
     # or explicitly disallow such a case.
     return 'train_and_evaluate'
-  elif config.task_type == run_config.TaskType.PS:
+  elif config.task_type == run_config_lib.TaskType.PS:
     return 'run_std_server'
-  elif config.task_type == run_config.TaskType.WORKER:
+  elif config.task_type == run_config_lib.TaskType.WORKER:
     return 'train'
 
   raise ValueError('No default schedule for task type: %s' % (config.task_type))
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner_lib.py b/tensorflow/contrib/learn/python/learn/learn_runner_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d9b1c7716f0ab1f2274ca53406175240b613027
--- /dev/null
+++ b/tensorflow/contrib/learn/python/learn/learn_runner_lib.py
@@ -0,0 +1,30 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to run and tune an Experiment.
+
+@@run
+@@tune
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.learn.python.learn.learn_runner import *  # pylint: disable=wildcard-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = []
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/learn/python/learn/learn_runner_test.py b/tensorflow/contrib/learn/python/learn/learn_runner_test.py
index 1afd6a5c827f62985630c015dddbe3a5bc58ec35..b61a42a1c762608df1344a5188176fab1dc25b65 100644
--- a/tensorflow/contrib/learn/python/learn/learn_runner_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_runner_test.py
@@ -24,26 +24,50 @@ import os
 from tensorflow.contrib.learn.python.learn import evaluable  # pylint: disable=g-import-not-at-top
 from tensorflow.contrib.learn.python.learn import experiment
 from tensorflow.contrib.learn.python.learn import learn_runner
-from tensorflow.contrib.learn.python.learn import run_config
 from tensorflow.contrib.learn.python.learn import trainable
+
 from tensorflow.contrib.learn.python.learn.estimators import run_config as run_config_lib
+from tensorflow.contrib.training.python.training import hparam as hparam_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 patch = test.mock.patch
 
+_MODIR_DIR = "/tmp"
+_HPARAMS = hparam_lib.HParams(learning_rate=0.01)
+_MUST_SPECIFY_OUTPUT_DIR_MSG = "Must specify an output directory"
+_MISSING_MODEL_DIR_ERR_MSG = (
+    "Must specify a model directory `model_dir` in `run_config`.")
+_EXP_NOT_CALLABLE_MSG = "Experiment builder .* is not callable"
+_INVALID_HPARAMS_ERR_MSG = "`hparams` must be `HParams` instance"
+_NOT_EXP_TYPE_MSG = "Experiment builder did not return an Experiment"
+_NON_EXIST_TASK_MSG = "Schedule references non-existent task"
+_NON_CALLABLE_MSG = "Schedule references non-callable member"
+_MUST_SPECIFY_OUTPUT_DIR_OR_CONFIG_MSG = (
+    "Must set value for `output_dir` or `run_config`")
+_HPARAMS_CANNOT_BE_SET_FOR_OUTPUT_DIR_MSG = (
+    "Must set `hparams` as None for `experiment_fn` with `output_dir`.")
+_CANNOT_SET_BOTH_OUTPUT_DIR_AND_CONFIG_MSG = (
+    "Cannot provide both `output_dir` and `run_config`")
+_INVALID_RUN_CONFIG_TYPE_MSG = "`run_config` must be `RunConfig` instance"
+_RUN_CONFIG_UID_CHECK_ERR_MSG = (
+    "`RunConfig` instance is expected to be used by the `Estimator`")
+
 
 class TestExperiment(experiment.Experiment):
 
-  def __init__(self, default=None, config=None):
+  def __init__(self, default=None, config=None, model_dir=None):
     self.default = default
     self.config = config
+    internal_model_dir = model_dir or config.model_dir
+    self._model_dir = internal_model_dir
 
     class Estimator(evaluable.Evaluable, trainable.Trainable):
       config = self.config
 
+      @property
       def model_dir(self):
-        raise NotImplementedError
+        return internal_model_dir
 
       def fit(self, x=None, y=None, input_fn=None, steps=None, batch_size=None,
               monitors=None, max_steps=None):
@@ -57,16 +81,16 @@ class TestExperiment(experiment.Experiment):
     super(TestExperiment, self).__init__(Estimator(), None, None)
 
   def local_run(self):
-    return "local_run"
+    return "local_run-{}".format(self._model_dir)
 
   def train(self):
-    return "train"
+    return "train-{}".format(self._model_dir)
 
   def run_std_server(self):
-    return "run_std_server"
+    return "run_std_server-{}".format(self._model_dir)
 
   def train_and_evaluate(self):
-    return "train_and_evaluate"
+    return "train_and_evaluate-{}".format(self._model_dir)
 
   def simple_task(self):
     return "simple_task, default=%s." % self.default
@@ -75,7 +99,20 @@ class TestExperiment(experiment.Experiment):
 # pylint: disable=unused-argument
 def build_experiment(output_dir):
   tf_logging.info("In default build_experiment.")
-  return TestExperiment()
+  return TestExperiment(model_dir=output_dir)
+
+
+def build_experiment_fn_for_output_dir(run_config=None):
+  def _build_experiment(output_dir):
+    tf_logging.info("In default build_experiment.")
+    return TestExperiment(config=run_config, model_dir=output_dir)
+  return _build_experiment
+
+
+def build_experiment_for_run_config(run_config, hparams):
+  if hparams is not None and hparams != _HPARAMS:
+    raise ValueError("hparams is not set correctly")
+  return TestExperiment(config=run_config)
 
 
 def build_non_experiment(output_dir):
@@ -98,7 +135,7 @@ def build_non_distributed_cluster_spec():
   return {"foo": ["localhost:1234"]}
 
 
-class MainTest(test.TestCase):
+class LearnRunnerRunWithOutputDirTest(test.TestCase):
 
   def setUp(self):
     # Ensure the TF_CONFIG environment variable is unset for all tests.
@@ -108,16 +145,170 @@ class MainTest(test.TestCase):
     self.assertEqual(
         "simple_task, default=None.",
         learn_runner.run(build_experiment,
-                         output_dir="/tmp",
+                         output_dir=_MODIR_DIR,
                          schedule="simple_task"))
 
   def test_run_with_explicit_local_run(self):
     self.assertEqual(
-        "local_run",
+        "local_run-" + _MODIR_DIR,
         learn_runner.run(build_experiment,
-                         output_dir="/tmp",
+                         output_dir=_MODIR_DIR,
+                         schedule="local_run"))
+
+  def test_fail_output_dir_and_run_config_are_both_set(self):
+    with self.assertRaisesRegexp(
+        ValueError, _CANNOT_SET_BOTH_OUTPUT_DIR_AND_CONFIG_MSG):
+      learn_runner.run(build_experiment,
+                       output_dir=_MODIR_DIR,
+                       schedule="simple_task",
+                       run_config=run_config_lib.RunConfig())
+
+  def test_fail_empty_output_dir(self):
+    with self.assertRaisesRegexp(ValueError, _MUST_SPECIFY_OUTPUT_DIR_MSG):
+      learn_runner.run(build_experiment, output_dir="", schedule="simple_task")
+
+  def test_fail_no_output_dir(self):
+    with self.assertRaisesRegexp(
+        ValueError, _MUST_SPECIFY_OUTPUT_DIR_OR_CONFIG_MSG):
+      learn_runner.run(build_experiment, None, "simple_task")
+
+  def test_fail_hparams_are_set(self):
+    hparams = _HPARAMS
+    with self.assertRaisesRegexp(
+        ValueError, _HPARAMS_CANNOT_BE_SET_FOR_OUTPUT_DIR_MSG):
+      learn_runner.run(
+          build_experiment, _MODIR_DIR, schedule="simple_task", hparams=hparams)
+
+  def test_fail_non_callable(self):
+    with self.assertRaisesRegexp(TypeError, _EXP_NOT_CALLABLE_MSG):
+      learn_runner.run("not callable", _MODIR_DIR, "simple_test")
+
+  def test_fail_not_experiment(self):
+    with self.assertRaisesRegexp(TypeError, _NOT_EXP_TYPE_MSG):
+      learn_runner.run(build_non_experiment, _MODIR_DIR, "simple_test")
+
+  def test_fail_non_existent_task(self):
+    with self.assertRaisesRegexp(ValueError, _NON_EXIST_TASK_MSG):
+      learn_runner.run(build_experiment, _MODIR_DIR, "mirage")
+
+  def test_fail_non_callable_task(self):
+    with self.assertRaisesRegexp(TypeError, _NON_CALLABLE_MSG):
+      learn_runner.run(build_experiment, _MODIR_DIR, "default")
+
+
+class LearnRunnerRunWithRunConfigTest(test.TestCase):
+
+  def setUp(self):
+    # Ensure the TF_CONFIG environment variable is unset for all tests.
+    os.environ.pop("TF_CONFIG", None)
+
+  def test_run_with_custom_schedule(self):
+    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+    self.assertEqual(
+        "simple_task, default=None.",
+        learn_runner.run(build_experiment_for_run_config,
+                         run_config=run_config,
+                         schedule="simple_task"))
+
+  def test_run_with_hparams(self):
+    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+    self.assertEqual(
+        "simple_task, default=None.",
+        learn_runner.run(build_experiment_for_run_config,
+                         run_config=run_config,
+                         schedule="simple_task",
+                         hparams=_HPARAMS))
+
+  def test_run_with_explicit_local_run(self):
+    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+    self.assertEqual(
+        "local_run-" + _MODIR_DIR,
+        learn_runner.run(build_experiment_for_run_config,
+                         run_config=run_config,
                          schedule="local_run"))
 
+  def test_fail_empty_output_dir(self):
+    run_config = run_config_lib.RunConfig(model_dir="")
+    with self.assertRaisesRegexp(ValueError, _MISSING_MODEL_DIR_ERR_MSG):
+      learn_runner.run(build_experiment_for_run_config,
+                       run_config=run_config,
+                       schedule="local_run")
+
+  def test_fail_no_output_dir(self):
+    run_config = run_config_lib.RunConfig()
+    with self.assertRaisesRegexp(ValueError, _MISSING_MODEL_DIR_ERR_MSG):
+      learn_runner.run(build_experiment_for_run_config,
+                       run_config=run_config,
+                       schedule="local_run")
+
+  def test_fail_invalid_run_config_type(self):
+    run_config = "invalid_run_config"
+    with self.assertRaisesRegexp(ValueError, _INVALID_RUN_CONFIG_TYPE_MSG):
+      learn_runner.run(build_experiment_for_run_config,
+                       run_config=run_config,
+                       schedule="local_run")
+
+  def test_fail_invalid_hparams_type(self):
+    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+    with self.assertRaisesRegexp(ValueError, _INVALID_HPARAMS_ERR_MSG):
+      learn_runner.run(build_experiment_for_run_config,
+                       run_config=run_config,
+                       schedule="local_run",
+                       hparams=["hparams"])
+
+  def test_fail_non_callable(self):
+    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+    with self.assertRaisesRegexp(TypeError, _EXP_NOT_CALLABLE_MSG):
+      learn_runner.run("not callable",
+                       run_config=run_config,
+                       schedule="simple_task")
+
+  def test_fail_not_experiment(self):
+    def _experiment_fn(run_config, hparams):
+      del run_config, hparams  # unused.
+      return "not experiment"
+
+    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+    with self.assertRaisesRegexp(TypeError, _NOT_EXP_TYPE_MSG):
+      learn_runner.run(_experiment_fn,
+                       run_config=run_config,
+                       schedule="simple_task")
+
+  def test_fail_non_existent_task(self):
+    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+    with self.assertRaisesRegexp(ValueError, _NON_EXIST_TASK_MSG):
+      learn_runner.run(build_experiment_for_run_config,
+                       run_config=run_config,
+                       schedule="mirage")
+
+  def test_fail_non_callable_task(self):
+    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+    with self.assertRaisesRegexp(TypeError, _NON_CALLABLE_MSG):
+      learn_runner.run(build_experiment_for_run_config,
+                       run_config=run_config,
+                       schedule="default")
+
+  def test_basic_run_config_uid_check(self):
+    expected_run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
+
+    def _experiment_fn(run_config, hparams):
+      del run_config, hparams  # unused.
+      # Explicitly use a new run_config.
+      new_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR + "/123")
+
+      return TestExperiment(config=new_config)
+
+    with self.assertRaisesRegexp(RuntimeError, _RUN_CONFIG_UID_CHECK_ERR_MSG):
+      learn_runner.run(experiment_fn=_experiment_fn,
+                       run_config=expected_run_config)
+
+
+class LearnRunnerDefaultScheduleTest(test.TestCase):
+
+  def setUp(self):
+    # Ensure the TF_CONFIG environment variable is unset for all tests.
+    os.environ.pop("TF_CONFIG", None)
+
   def test_schedule_from_tf_config_runs_train_on_worker(self):
     os.environ["TF_CONFIG"] = json.dumps({
         "cluster": build_distributed_cluster_spec(),
@@ -126,11 +317,12 @@ class MainTest(test.TestCase):
         }
     })
     # RunConfig constructor will set job_name from TF_CONFIG.
-    config = run_config.RunConfig()
+    config = run_config_lib.RunConfig()
     self.assertEqual(
-        "train",
-        learn_runner.run(lambda output_dir: TestExperiment(config=config),
-                         output_dir="/tmp"))
+        "train-" + _MODIR_DIR,
+        learn_runner.run(
+            build_experiment_fn_for_output_dir(config),
+            output_dir=_MODIR_DIR))
 
   def test_schedule_from_tf_config_runs_train_and_evaluate_on_master(self):
     tf_config = {
@@ -140,11 +332,12 @@ class MainTest(test.TestCase):
         }
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
       self.assertEqual(
-          "train_and_evaluate",
-          learn_runner.run(lambda output_dir: TestExperiment(config=config),
-                           output_dir="/tmp"))
+          "train_and_evaluate-" + _MODIR_DIR,
+          learn_runner.run(
+              build_experiment_fn_for_output_dir(config),
+              output_dir=_MODIR_DIR))
 
   def test_schedule_from_tf_config_runs_serve_on_ps(self):
     tf_config = {
@@ -154,30 +347,27 @@ class MainTest(test.TestCase):
         }
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
       self.assertEqual(
-          "run_std_server",
-          learn_runner.run(lambda output_dir: TestExperiment(config=config),
-                           output_dir="/tmp"))
-
-  def test_fail_no_output_dir(self):
-    self.assertRaisesRegexp(ValueError, "Must specify an output directory",
-                            learn_runner.run, build_experiment, "",
-                            "simple_task")
+          "run_std_server-" + _MODIR_DIR,
+          learn_runner.run(
+              build_experiment_fn_for_output_dir(config),
+              output_dir=_MODIR_DIR))
 
   def test_no_schedule_and_no_config_runs_train_and_evaluate(self):
     self.assertEqual(
-        "train_and_evaluate",
-        learn_runner.run(build_experiment, output_dir="/tmp"))
+        "train_and_evaluate-" + _MODIR_DIR,
+        learn_runner.run(build_experiment, output_dir=_MODIR_DIR))
 
   def test_no_schedule_and_non_distributed_runs_train_and_evaluate(self):
     tf_config = {"cluster": build_non_distributed_cluster_spec()}
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
       self.assertEqual(
-          "train_and_evaluate",
-          learn_runner.run(lambda output_dir: TestExperiment(config=config),
-                           output_dir="/tmp"))
+          "train_and_evaluate-" + _MODIR_DIR,
+          learn_runner.run(
+              build_experiment_fn_for_output_dir(config),
+              output_dir=_MODIR_DIR))
 
   def test_fail_task_type_with_no_default_schedule(self):
     tf_config = {
@@ -187,43 +377,24 @@ class MainTest(test.TestCase):
         }
     }
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
       create_experiment_fn = lambda output_dir: TestExperiment(config=config)
-      self.assertRaisesRegexp(ValueError, "No default schedule",
-                              learn_runner.run, create_experiment_fn, "/tmp")
-
-  def test_fail_non_callable(self):
-    self.assertRaisesRegexp(TypeError, "Experiment builder .* is not callable",
-                            learn_runner.run, "not callable", "/tmp",
-                            "simple_test")
-
-  def test_fail_not_experiment(self):
-    self.assertRaisesRegexp(TypeError,
-                            "Experiment builder did not return an Experiment",
-                            learn_runner.run, build_non_experiment, "/tmp",
-                            "simple_test")
-
-  def test_fail_non_existent_task(self):
-    self.assertRaisesRegexp(ValueError, "Schedule references non-existent task",
-                            learn_runner.run, build_experiment, "/tmp",
-                            "mirage")
-
-  def test_fail_non_callable_task(self):
-    self.assertRaisesRegexp(TypeError,
-                            "Schedule references non-callable member",
-                            learn_runner.run, build_experiment, "/tmp",
-                            "default")
+      self.assertRaisesRegexp(ValueError,
+                              "No default schedule",
+                              learn_runner.run,
+                              create_experiment_fn,
+                              _MODIR_DIR)
 
   def test_fail_schedule_from_config_with_no_task_type(self):
     tf_config = {"cluster": build_distributed_cluster_spec()}
     with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
-      config = run_config.RunConfig()
+      config = run_config_lib.RunConfig()
       self.assertRaisesRegexp(
           ValueError,
           "Must specify a schedule",
           learn_runner.run,
           lambda output_dir: TestExperiment(config=config),
-          output_dir="/tmp")
+          output_dir=_MODIR_DIR)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py
index 7be5748fa454b6cd182ee240121c877f0d272c72..eafc925ad68361e097456a6601dcd9e1297389ba 100644
--- a/tensorflow/contrib/learn/python/learn/metric_spec.py
+++ b/tensorflow/contrib/learn/python/learn/metric_spec.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import six
 
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_inspect
 
 
 def _assert_named_args(sentinel):
@@ -43,11 +43,11 @@ def _args(fn):
   if hasattr(fn, 'func') and hasattr(fn, 'keywords'):
     # Handle functools.partial and similar objects.
     return tuple([
-        arg for arg in inspect.getargspec(fn.func).args
+        arg for arg in tf_inspect.getargspec(fn.func).args
         if arg not in set(fn.keywords.keys())
     ])
   # Handle function.
-  return tuple(inspect.getargspec(fn).args)
+  return tuple(tf_inspect.getargspec(fn).args)
 
 
 _CANONICAL_LABELS_ARG = 'labels'
diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py
index fa9f52e9223f17f970747bbdf03c42841e0d83ed..e97992fd209ddd6ad6ada2baef406b059f834255 100644
--- a/tensorflow/contrib/learn/python/learn/monitors.py
+++ b/tensorflow/contrib/learn/python/learn/monitors.py
@@ -35,7 +35,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import os
 import time
 
@@ -48,11 +47,13 @@ from tensorflow.contrib.learn.python.learn import session_run_hook
 from tensorflow.contrib.learn.python.learn.summary_writer_cache import SummaryWriterCache
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import summary_io
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import tf_inspect
 
 
 # TODO(ptucker): Split each monitor class into a separate file.
@@ -473,7 +474,7 @@ class LoggingTrainable(EveryN):
 
   def every_n_step_begin(self, step):
     super(LoggingTrainable, self).every_n_step_begin(step)
-    # Get a list of trainable variables at the begining of every N steps.
+    # Get a list of trainable variables at the beginning of every N steps.
     # We cannot get this in __init__ because train_op has not been generated.
     trainables = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=self._scope)
@@ -634,6 +635,23 @@ class ValidationMonitor(EveryN):
     """Returns the best early stopping metric value found so far."""
     return self._best_value
 
+  def _evaluate_estimator(self):
+    if isinstance(self._estimator, core_estimator.Estimator):
+      if any((x is not None for x in
+              [self.x, self.y, self.batch_size, self.metrics])):
+        raise ValueError(
+            "tf.estimator.Estimator does not support following "
+            "arguments: x, y, batch_size, metrics. Should set as `None` "
+            "in ValidationMonitor")
+      return self._estimator.evaluate(
+          input_fn=self.input_fn, steps=self.eval_steps, hooks=self.hooks,
+          name=self.name)
+    else:
+      return self._estimator.evaluate(
+          x=self.x, y=self.y, input_fn=self.input_fn,
+          batch_size=self.batch_size, steps=self.eval_steps,
+          metrics=self.metrics, hooks=self.hooks, name=self.name)
+
   def every_n_step_end(self, step, outputs):
     super(ValidationMonitor, self).every_n_step_end(step, outputs)
     # TODO(mdan): The use of step below is probably misleading.
@@ -656,10 +674,7 @@ class ValidationMonitor(EveryN):
     self._latest_path_step = step
 
     # Run evaluation and log it.
-    validation_outputs = self._estimator.evaluate(
-        x=self.x, y=self.y, input_fn=self.input_fn, batch_size=self.batch_size,
-        steps=self.eval_steps, metrics=self.metrics, hooks=self.hooks,
-        name=self.name)
+    validation_outputs = self._evaluate_estimator()
     stats = []
     for name in validation_outputs:
       stats.append("%s = %s" % (name, str(validation_outputs[name])))
@@ -919,6 +934,10 @@ class ExportMonitor(EveryN):
   def every_n_step_end(self, step, outputs):
     super(ExportMonitor, self).every_n_step_end(step, outputs)
     try:
+      if isinstance(self._estimator, core_estimator.Estimator):
+        raise ValueError(
+            "ExportMonitor does not support `tf.estimator.Estimator. `. "
+            "Please pass an ExportStrategy to Experiment instead.")
       self._last_export_dir = self._estimator.export(
           self.export_dir,
           exports_to_keep=self.exports_to_keep,
@@ -946,6 +965,10 @@ class ExportMonitor(EveryN):
       logging.info("Skipping export at the end since model has not been saved "
                    "yet.")
       return
+    if isinstance(self._estimator, core_estimator.Estimator):
+      raise ValueError(
+          "ExportMonitor does not support `tf.estimator.Estimator. `. "
+          "Please pass an ExportStrategy to Experiment instead.")
     try:
       self._last_export_dir = self._estimator.export(
           self.export_dir,
@@ -1164,7 +1187,7 @@ class RunHookAdapterForMonitors(session_run_hook.SessionRunHook):
   def end(self, session):
     self._last_step = None
     for m in self._monitors:
-      if "session" in inspect.getargspec(m.end).args:
+      if "session" in tf_inspect.getargspec(m.end).args:
         m.end(session=session)
       else:
         m.end()
diff --git a/tensorflow/contrib/learn/python/learn/monitors_test.py b/tensorflow/contrib/learn/python/learn/monitors_test.py
index f9ee03c94437e8d5e671b418a90c6a95e2037c40..221d5f1fef6b4a887e7d8f9f041d66db44b47e3e 100644
--- a/tensorflow/contrib/learn/python/learn/monitors_test.py
+++ b/tensorflow/contrib/learn/python/learn/monitors_test.py
@@ -31,6 +31,7 @@ from tensorflow.contrib.framework.python.ops import variables as variables_lib
 from tensorflow.contrib.learn.python import learn
 from tensorflow.contrib.learn.python.learn import estimators
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
@@ -449,6 +450,62 @@ class MonitorsTest(test.TestCase):
       monitor.epoch_end(epoch=0)
       monitor.end()
 
+  @test.mock.patch.object(saver, 'latest_checkpoint')
+  def test_validation_monitor_with_core_estimator(self, mock_latest_checkpoint):
+    estimator = test.mock.Mock(spec=core_estimator.Estimator)
+    model_dir = 'model/dir'
+    estimator.model_dir = model_dir
+    validation_outputs = {'loss': None}
+    estimator.evaluate.return_value = validation_outputs
+
+    monitor = learn.monitors.ValidationMonitor(
+        input_fn=lambda: constant_op.constant(2.0),
+        every_n_steps=0, early_stopping_rounds=2)
+    self._assert_validation_monitor(monitor)
+    monitor.set_estimator(estimator)
+    with ops.Graph().as_default() as g, self.test_session(g):
+      monitor.begin(max_steps=100)
+      monitor.epoch_begin(epoch=0)
+      self.assertEqual(0, estimator.evaluate.call_count)
+
+      # Step 0, initial loss.
+      step = 0
+      mock_latest_checkpoint.return_value = '%s/ckpt.%s' % (model_dir, step)
+      validation_outputs['loss'] = 42.0
+      self.assertEqual(0, len(monitor.step_begin(step=step)))
+      self.assertFalse(monitor.step_end(step=step, output={}))
+      self.assertEqual(1, estimator.evaluate.call_count)
+      self._assert_validation_monitor(
+          monitor, expected_best_step=0, expected_best_value=42.0)
+      monitor.post_step(step=step, session=None)
+
+  @test.mock.patch.object(saver, 'latest_checkpoint')
+  def test_validation_monitor_fail_with_core_estimator_and_metrics(
+      self, mock_latest_checkpoint):
+    estimator = test.mock.Mock(spec=core_estimator.Estimator)
+    model_dir = 'model/dir'
+    estimator.model_dir = model_dir
+    validation_outputs = {'loss': None}
+    estimator.evaluate.return_value = validation_outputs
+
+    monitor = learn.monitors.ValidationMonitor(
+        input_fn=lambda: constant_op.constant(2.0),
+        metrics=constant_op.constant(2.0),
+        every_n_steps=0, early_stopping_rounds=2)
+    monitor.set_estimator(estimator)
+    with ops.Graph().as_default() as g, self.test_session(g):
+      monitor.begin(max_steps=100)
+      monitor.epoch_begin(epoch=0)
+
+      with self.assertRaisesRegexp(
+          ValueError,
+          'tf.estimator.Estimator does not support .* metrics'):
+        step = 0
+        mock_latest_checkpoint.return_value = '%s/ckpt.%s' % (model_dir, step)
+        validation_outputs['loss'] = 42.0
+        self.assertEqual(0, len(monitor.step_begin(step=step)))
+        self.assertFalse(monitor.step_end(step=step, output={}))
+
   def test_graph_dump(self):
     monitor0 = learn.monitors.GraphDump()
     monitor1 = learn.monitors.GraphDump()
diff --git a/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops.py b/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops.py
index 0faba7cee5ea961d9a289db20f59516fd0e286d9..45727faab4362abeab18f77861353eb53976023a 100644
--- a/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/seq2seq_ops.py
@@ -140,7 +140,7 @@ def rnn_seq2seq(encoder_inputs,
     scope: Scope to use, if None new will be produced.
 
   Returns:
-    List of tensors for outputs and states for trianing and sampling sub-graphs.
+    List of tensors for outputs and states for training and sampling sub-graphs.
   """
   with vs.variable_scope(scope or "rnn_seq2seq"):
     _, last_enc_state = rnn.static_rnn(
diff --git a/tensorflow/contrib/learn/python/learn/preprocessing/categorical_vocabulary.py b/tensorflow/contrib/learn/python/learn/preprocessing/categorical_vocabulary.py
index 9d4fed9998783fc86dc9b99a634e9758c311689b..5709955c49fba50ca4a299a443a2902bbd9c6b23 100644
--- a/tensorflow/contrib/learn/python/learn/preprocessing/categorical_vocabulary.py
+++ b/tensorflow/contrib/learn/python/learn/preprocessing/categorical_vocabulary.py
@@ -128,9 +128,9 @@ class CategoricalVocabulary(object):
       Class name.
 
     Raises:
-      ValueError: if this vocabulary wasn't initalized with support_reverse.
+      ValueError: if this vocabulary wasn't initialized with support_reverse.
     """
     if not self._support_reverse:
-      raise ValueError("This vocabulary wasn't initalized with "
+      raise ValueError("This vocabulary wasn't initialized with "
                        "support_reverse to support reverse() function.")
     return self._reverse_mapping[class_id]
diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py
index 2d1d46042518e07f9e6246dcdb6d6809d972ee7e..972fec026f25d39dca75e8c5bafffb57fcd323fa 100644
--- a/tensorflow/contrib/learn/python/learn/trainable.py
+++ b/tensorflow/contrib/learn/python/learn/trainable.py
@@ -49,7 +49,7 @@ class Trainable(object):
       steps: Number of steps for which to train model. If `None`, train forever.
         'steps' works incrementally. If you call two times fit(steps=10) then
         training occurs in total 20 steps. If you don't want to have incremental
-        behaviour please set `max_steps` instead. If set, `max_steps` must be
+        behavior please set `max_steps` instead. If set, `max_steps` must be
         `None`.
       batch_size: minibatch size to use on the input, defaults to first
         dimension of `x`. Must be `None` if `input_fn` is provided.
diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py
index b53be292830c00eb4eb03cdd2cd0965b790aa170..6af2287761299f6725f9547917101c18b0cc0164 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export.py
@@ -28,7 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as tf_saver
@@ -67,17 +67,17 @@ def _export_graph(graph, saver, checkpoint_path, export_dir,
   with graph.as_default():
     with tf_session.Session('') as session:
       variables.local_variables_initializer()
-      data_flow_ops.tables_initializer()
+      lookup_ops.tables_initializer()
       saver.restore(session, checkpoint_path)
 
       export = exporter.Exporter(saver)
-      export.init(init_op=control_flow_ops.group(
-          variables.local_variables_initializer(),
-          data_flow_ops.tables_initializer()),
-                  default_graph_signature=default_graph_signature,
-                  named_graph_signatures=named_graph_signatures,
-                  assets_collection=ops.get_collection(
-                      ops.GraphKeys.ASSET_FILEPATHS))
+      export.init(
+          init_op=control_flow_ops.group(
+              variables.local_variables_initializer(),
+              lookup_ops.tables_initializer()),
+          default_graph_signature=default_graph_signature,
+          named_graph_signatures=named_graph_signatures,
+          assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))
       return export.export(export_dir, contrib_variables.get_global_step(),
                            session, exports_to_keep=exports_to_keep)
 
@@ -89,7 +89,7 @@ def _export_graph(graph, saver, checkpoint_path, export_dir,
 def generic_signature_fn(examples, unused_features, predictions):
   """Creates generic signature from given examples and predictions.
 
-  This is needed for backward compatibility with default behaviour of
+  This is needed for backward compatibility with default behavior of
   export_estimator.
 
   Args:
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc_test.py b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
index d3270dcc1622902088fbb783bb38990e8a990043..9c63096d0ee85320eb020fc59c576bacc141cbc0 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
@@ -29,10 +29,6 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 
-def tearDownModule():
-  gfile.DeleteRecursively(test.get_temp_dir())
-
-
 class GcTest(test_util.TensorFlowTestCase):
 
   def testLargestExportVersions(self):
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 7ad3779314bab8be458cc078a9449048c56a9ffe..3f0f3092534e6c886bb24d368e0e60322213e1d2 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -42,6 +42,7 @@ from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import gc
 from tensorflow.contrib.learn.python.learn.utils import input_fn_utils
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.platform import gfile
@@ -308,7 +309,7 @@ def get_most_recent_export(export_dir_base):
                      directories.
 
   Returns:
-    A gc.Path, whith is just a namedtuple of (path, export_version).
+    A gc.Path, with is just a namedtuple of (path, export_version).
   """
   select_filter = gc.largest_export_versions(1)
   results = select_filter(gc.get_paths(export_dir_base,
@@ -352,7 +353,8 @@ def make_export_strategy(serving_input_fn,
       `InputFnOps`.
     default_output_alternative_key: the name of the head to serve when an
       incoming serving request does not explicitly request a specific head.
-      Not needed for single-headed models.
+      Must be `None` if the estimator inherits from ${tf.estimator.Estimator}
+      or for single-headed models.
     assets_extra: A dict specifying how to populate the assets.extra directory
       within the exported SavedModel.  Each key should give the destination
       path (including the filename) relative to the assets.extra directory.
@@ -384,14 +386,30 @@ def make_export_strategy(serving_input_fn,
 
     Returns:
       The string path to the exported directory.
+
+    Raises:
+      ValueError: If `estimator` is a ${tf.estimator.Estimator} instance
+        and `default_output_alternative_key` was specified.
     """
-    export_result = estimator.export_savedmodel(
-        export_dir_base,
-        serving_input_fn,
-        default_output_alternative_key=default_output_alternative_key,
-        assets_extra=assets_extra,
-        as_text=as_text,
-        checkpoint_path=checkpoint_path)
+    if isinstance(estimator, core_estimator.Estimator):
+      if default_output_alternative_key is not None:
+        raise ValueError(
+            'default_output_alternative_key is not supported in core '
+            'Estimator. Given: {}'.format(default_output_alternative_key))
+      export_result = estimator.export_savedmodel(
+          export_dir_base,
+          serving_input_fn,
+          assets_extra=assets_extra,
+          as_text=as_text,
+          checkpoint_path=checkpoint_path)
+    else:
+      export_result = estimator.export_savedmodel(
+          export_dir_base,
+          serving_input_fn,
+          default_output_alternative_key=default_output_alternative_key,
+          assets_extra=assets_extra,
+          as_text=as_text,
+          checkpoint_path=checkpoint_path)
 
     garbage_collect_exports(export_dir_base, exports_to_keep)
     return export_result
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
index 157744bed1d52fc5e60edd99b275c7c268aa8a7b..a15eadd018f1f92377039abe0e9434e10fab5a34 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
@@ -109,7 +109,7 @@ class SavedModelExportUtilsTest(test.TestCase):
     self.assertEqual(actual_signature_def, expected_signature_def)
 
   def test_build_standardized_signature_def_classification2(self):
-    """Tests multiple output tensors that include classes and probabilites."""
+    """Tests multiple output tensors that include classes and probabilities."""
     input_tensors = {
         "input-1":
             array_ops.placeholder(
diff --git a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
index 8dcfb775b2711a356c2ff31f95019bfa5df08ee7..824a9648693aba7cd73872c34864b8b8b61473b0 100644
--- a/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
+++ b/tensorflow/contrib/legacy_seq2seq/python/kernel_tests/seq2seq_test.py
@@ -837,7 +837,7 @@ class Seq2SeqTest(test.TestCase):
   #     with variable_scope.variable_scope("new"):
   #       _, losses2 = SampleGRUSeq2Seq
   #           inp, out, weights, per_example_loss=True)
-  #       # First loss is scalar, the second one is a 1-dimensinal tensor.
+  #       # First loss is scalar, the second one is a 1-dimensional tensor.
   #       self.assertEqual([], losses1[0].get_shape().as_list())
   #       self.assertEqual([None], losses2[0].get_shape().as_list())
 
@@ -942,8 +942,8 @@ class Seq2SeqTest(test.TestCase):
         perplexities[bucket].append(math.exp(float(res[1])))
       for bucket in range(len(buckets)):
         if len(perplexities[bucket]) > 1:  # Assert that perplexity went down.
-          self.assertLess(perplexities[bucket][-1],  # 10% margin of error.
-                          1.1 * perplexities[bucket][0])
+          self.assertLess(perplexities[bucket][-1],  # 20% margin of error.
+                          1.2 * perplexities[bucket][0])
 
   def testModelWithBooleanFeedPrevious(self):
     """Test the model behavior when feed_previous is True.
diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index 9b196e2cf50e6ba9bf58068db362678fb69f14e9..9b4f36da15d6089566f06530deca9df0858a33cc 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -30,7 +30,7 @@ cuda_py_tests(
 
 cuda_py_tests(
     name = "linear_operator_addition_test",
-    size = "medium",
+    size = "small",
     srcs = ["python/kernel_tests/linear_operator_addition_test.py"],
     additional_deps = [
         ":linalg_py",
@@ -43,7 +43,6 @@ cuda_py_tests(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 5,
 )
 
 cuda_py_tests(
@@ -61,7 +60,6 @@ cuda_py_tests(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 5,
 )
 
 cuda_py_tests(
@@ -79,7 +77,6 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
-    shard_count = 5,
 )
 
 cuda_py_tests(
@@ -96,7 +93,6 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
-    shard_count = 5,
 )
 
 cuda_py_tests(
@@ -112,7 +108,6 @@ cuda_py_tests(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 5,
 )
 
 cuda_py_tests(
@@ -128,7 +123,6 @@ cuda_py_tests(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 5,
 )
 
 cuda_py_tests(
@@ -149,7 +143,7 @@ cuda_py_tests(
 
 cuda_py_tests(
     name = "linear_operator_util_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/kernel_tests/linear_operator_util_test.py"],
     additional_deps = [
         ":linalg_py",
@@ -160,7 +154,6 @@ cuda_py_tests(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 5,
 )
 
 py_library(
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
index 998073e28bd36f8e2ae5ef5e547f302225886f51..e2a7f5fbe10caaf578134dbea4395fd19f1a3a96 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py
@@ -65,18 +65,21 @@ class SquareLinearOperatorCompositionTest(
       # feed_dict.
       matrices = sess.run(matrices)
       operator = linalg.LinearOperatorComposition(
-          [linalg.LinearOperatorFullMatrix(m_ph) for m_ph in matrices_ph])
+          [linalg.LinearOperatorFullMatrix(m_ph) for m_ph in matrices_ph],
+          is_square=True)
       feed_dict = {m_ph: m for (m_ph, m) in zip(matrices_ph, matrices)}
     else:
       operator = linalg.LinearOperatorComposition(
           [linalg.LinearOperatorFullMatrix(m) for m in matrices])
       feed_dict = None
+      # Should be auto-set.
+      self.assertTrue(operator.is_square)
 
     # Convert back to Tensor.  Needed if use_placeholder, since then we have
     # already evaluated each matrix to a numpy array.
-    apply_order_list = list(reversed(matrices))
-    mat = ops.convert_to_tensor(apply_order_list[0])
-    for other_mat in apply_order_list[1:]:
+    matmul_order_list = list(reversed(matrices))
+    mat = ops.convert_to_tensor(matmul_order_list[0])
+    for other_mat in matmul_order_list[1:]:
       mat = math_ops.matmul(other_mat, mat)
 
     return operator, mat, feed_dict
@@ -185,9 +188,9 @@ class NonSquareLinearOperatorCompositionTest(
 
     # Convert back to Tensor.  Needed if use_placeholder, since then we have
     # already evaluated each matrix to a numpy array.
-    apply_order_list = list(reversed(matrices))
-    mat = ops.convert_to_tensor(apply_order_list[0])
-    for other_mat in apply_order_list[1:]:
+    matmul_order_list = list(reversed(matrices))
+    mat = ops.convert_to_tensor(matmul_order_list[0])
+    for other_mat in matmul_order_list[1:]:
       mat = math_ops.matmul(other_mat, mat)
 
     return operator, mat, feed_dict
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py
index 3bb81a4333cf678153d5643d79359021e6614df8..397bfa22156e2f9398180b8fa57f34a10334906d 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_diag_test.py
@@ -122,7 +122,7 @@ class LinearOperatorDiagTest(
     with self.assertRaisesRegexp(ValueError, "must have at least 1 dimension"):
       linalg.LinearOperatorDiag(1.)
 
-  def test_broadcast_apply_and_solve(self):
+  def test_broadcast_matmul_and_solve(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.matmul cannot handle.
     # In particular, tf.matmul does not broadcast.
@@ -130,7 +130,7 @@ class LinearOperatorDiagTest(
       x = random_ops.random_normal(shape=(2, 2, 3, 4))
 
       # This LinearOperatorDiag will be brodacast to (2, 2, 3, 3) during solve
-      # and apply with 'x' as the argument.
+      # and matmul with 'x' as the argument.
       diag = random_ops.random_uniform(shape=(2, 1, 3))
       operator = linalg.LinearOperatorDiag(diag, is_self_adjoint=True)
       self.assertAllEqual((2, 1, 3, 3), operator.shape)
@@ -140,10 +140,10 @@ class LinearOperatorDiagTest(
       mat = array_ops.matrix_diag(diag_broadcast)
       self.assertAllEqual((2, 2, 3, 3), mat.get_shape())  # being pedantic.
 
-      operator_apply = operator.apply(x)
-      mat_apply = math_ops.matmul(mat, x)
-      self.assertAllEqual(operator_apply.get_shape(), mat_apply.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, mat_apply]))
+      operator_matmul = operator.matmul(x)
+      mat_matmul = math_ops.matmul(mat, x)
+      self.assertAllEqual(operator_matmul.get_shape(), mat_matmul.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, mat_matmul]))
 
       operator_solve = operator.solve(x)
       mat_solve = linalg_ops.matrix_solve(mat, x)
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py
index 93cbb48e1b286650130dad1b7dd13526ac46792e..528bc3ed124e96fe4630a3a99beb8c18635b6f8e 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py
@@ -17,12 +17,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.contrib import linalg as linalg_lib
 from tensorflow.contrib.linalg.python.ops import linear_operator_test_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
@@ -45,9 +48,10 @@ class SquareLinearOperatorFullMatrixTest(
       # values are random and we want the same value used for both mat and
       # feed_dict.
       matrix = matrix.eval()
-      operator = linalg.LinearOperatorFullMatrix(matrix)
+      operator = linalg.LinearOperatorFullMatrix(matrix_ph, is_square=True)
       feed_dict = {matrix_ph: matrix}
     else:
+      # is_square should be auto-detected here.
       operator = linalg.LinearOperatorFullMatrix(matrix)
       feed_dict = None
 
@@ -68,6 +72,46 @@ class SquareLinearOperatorFullMatrixTest(
     self.assertTrue(operator.is_positive_definite)
     self.assertTrue(operator.is_non_singular)
     self.assertFalse(operator.is_self_adjoint)
+    # Auto-detected.
+    self.assertTrue(operator.is_square)
+
+  def test_assert_non_singular_raises_if_cond_too_big_but_finite(self):
+    with self.test_session():
+      tril = linear_operator_test_util.random_tril_matrix(
+          shape=(50, 50), dtype=np.float32)
+      diag = np.logspace(-2, 2, 50).astype(np.float32)
+      tril = array_ops.matrix_set_diag(tril, diag)
+      matrix = math_ops.matmul(tril, tril, transpose_b=True).eval()
+      operator = linalg.LinearOperatorFullMatrix(matrix)
+      with self.assertRaisesOpError("Singular matrix"):
+        # Ensure that we have finite condition number...just HUGE.
+        cond = np.linalg.cond(matrix)
+        self.assertTrue(np.isfinite(cond))
+        self.assertGreater(cond, 1e12)
+        operator.assert_non_singular().run()
+
+  def test_assert_non_singular_raises_if_cond_infinite(self):
+    with self.test_session():
+      matrix = [[1., 1.], [1., 1.]]
+      # We don't pass the is_self_adjoint hint here, which means we take the
+      # generic code path.
+      operator = linalg.LinearOperatorFullMatrix(matrix)
+      with self.assertRaisesOpError("Singular matrix"):
+        operator.assert_non_singular().run()
+
+  def test_assert_self_adjoint(self):
+    matrix = [[0., 1.], [0., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    with self.test_session():
+      with self.assertRaisesOpError("not equal to its adjoint"):
+        operator.assert_self_adjoint().run()
+
+  def test_assert_positive_definite(self):
+    matrix = [[1., 1.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=True)
+    with self.test_session():
+      with self.assertRaisesOpError("Cholesky decomposition was not success"):
+        operator.assert_positive_definite().run()
 
 
 class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
@@ -104,8 +148,9 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
       # values are random and we want the same value used for both mat and
       # feed_dict.
       matrix = matrix.eval()
+      # is_square is auto-set because of self_adjoint/pd.
       operator = linalg.LinearOperatorFullMatrix(
-          matrix, is_self_adjoint=True, is_positive_definite=True)
+          matrix_ph, is_self_adjoint=True, is_positive_definite=True)
       feed_dict = {matrix_ph: matrix}
     else:
       operator = linalg.LinearOperatorFullMatrix(
@@ -129,7 +174,36 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
 
     # Should be auto-set
     self.assertTrue(operator.is_non_singular)
-    self.assertTrue(operator._is_spd)
+    self.assertTrue(operator._can_use_cholesky)
+    self.assertTrue(operator.is_square)
+
+  def test_assert_non_singular(self):
+    matrix = [[1., 1.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_self_adjoint=True, is_positive_definite=True)
+    with self.test_session():
+      # Cholesky decomposition may fail, so the error is not specific to
+      # non-singular.
+      with self.assertRaisesOpError(""):
+        operator.assert_non_singular().run()
+
+  def test_assert_self_adjoint(self):
+    matrix = [[0., 1.], [0., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_self_adjoint=True, is_positive_definite=True)
+    with self.test_session():
+      with self.assertRaisesOpError("not equal to its adjoint"):
+        operator.assert_self_adjoint().run()
+
+  def test_assert_positive_definite(self):
+    matrix = [[1., 1.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_self_adjoint=True, is_positive_definite=True)
+    with self.test_session():
+      # Cholesky decomposition may fail, so the error is not specific to
+      # non-singular.
+      with self.assertRaisesOpError(""):
+        operator.assert_positive_definite().run()
 
 
 class NonSquareLinearOperatorFullMatrixTest(
@@ -144,7 +218,7 @@ class NonSquareLinearOperatorFullMatrixTest(
       # values are random and we want the same value used for both mat and
       # feed_dict.
       matrix = matrix.eval()
-      operator = linalg.LinearOperatorFullMatrix(matrix)
+      operator = linalg.LinearOperatorFullMatrix(matrix_ph)
       feed_dict = {matrix_ph: matrix}
     else:
       operator = linalg.LinearOperatorFullMatrix(matrix)
@@ -157,16 +231,14 @@ class NonSquareLinearOperatorFullMatrixTest(
     return operator, mat, feed_dict
 
   def test_is_x_flags(self):
-    # Matrix with two positive eigenvalues.
-    matrix = [[3., 0.], [1., 1.]]
+    matrix = [[3., 2., 1.], [1., 1., 1.]]
     operator = linalg.LinearOperatorFullMatrix(
         matrix,
-        is_positive_definite=True,
-        is_non_singular=True,
         is_self_adjoint=False)
-    self.assertTrue(operator.is_positive_definite)
-    self.assertTrue(operator.is_non_singular)
+    self.assertEqual(operator.is_positive_definite, None)
+    self.assertEqual(operator.is_non_singular, None)
     self.assertFalse(operator.is_self_adjoint)
+    self.assertFalse(operator.is_square)
 
   def test_matrix_must_have_at_least_two_dims_or_raises(self):
     with self.assertRaisesRegexp(ValueError, "at least 2 dimensions"):
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_identity_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_identity_test.py
index 36a255f3d506c96a2f67263c383e0f763cf47ccb..5faf2c432b6610863864717fd5f693b1aa781915 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_identity_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_identity_test.py
@@ -77,14 +77,14 @@ class LinearOperatorIdentityTest(
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_self_adjoint().run()  # Should not fail
 
-  def test_float16_apply(self):
+  def test_float16_matmul(self):
     # float16 cannot be tested by base test class because tf.matrix_solve does
     # not work with float16.
     with self.test_session():
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows=2, dtype=dtypes.float16)
       x = rng.randn(2, 3).astype(np.float16)
-      y = operator.apply(x)
+      y = operator.matmul(x)
       self.assertAllClose(x, y.eval())
 
   def test_non_scalar_num_rows_raises_static(self):
@@ -147,7 +147,7 @@ class LinearOperatorIdentityTest(
     operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
     x = rng.randn(3, 3).astype(np.float32)
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
-      operator.apply(x)
+      operator.matmul(x)
 
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
@@ -156,7 +156,7 @@ class LinearOperatorIdentityTest(
     with self.test_session():
       operator = linalg_lib.LinearOperatorIdentity(
           num_rows, assert_proper_shapes=True)
-      y = operator.apply(x)
+      y = operator.matmul(x)
       with self.assertRaisesOpError("Incompatible.*dimensions"):
         y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)})
 
@@ -168,11 +168,11 @@ class LinearOperatorIdentityTest(
       x = random_ops.random_normal(shape=(1, 2, 3, 4))
       operator = linalg_lib.LinearOperatorIdentity(num_rows=3, dtype=x.dtype)
 
-      operator_apply = operator.apply(x)
+      operator_matmul = operator.matmul(x)
       expected = x
 
-      self.assertAllEqual(operator_apply.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, expected]))
+      self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, expected]))
 
   def test_default_batch_shape_broadcasts_with_everything_dynamic(self):
     # These cannot be done in the automated (base test class) tests since they
@@ -182,15 +182,15 @@ class LinearOperatorIdentityTest(
       x = array_ops.placeholder(dtypes.float32)
       operator = linalg_lib.LinearOperatorIdentity(num_rows=3, dtype=x.dtype)
 
-      operator_apply = operator.apply(x)
+      operator_matmul = operator.matmul(x)
       expected = x
 
       feed_dict = {x: rng.randn(1, 2, 3, 4)}
 
       self.assertAllClose(
-          *sess.run([operator_apply, expected], feed_dict=feed_dict))
+          *sess.run([operator_matmul, expected], feed_dict=feed_dict))
 
-  def test_broadcast_apply_static_shapes(self):
+  def test_broadcast_matmul_static_shapes(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
@@ -204,14 +204,14 @@ class LinearOperatorIdentityTest(
       # Batch matrix of zeros with the broadcast shape of x and operator.
       zeros = array_ops.zeros(shape=(2, 2, 3, 4), dtype=x.dtype)
 
-      # Expected result of apply and solve.
+      # Expected result of matmul and solve.
       expected = x + zeros
 
-      operator_apply = operator.apply(x)
-      self.assertAllEqual(operator_apply.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, expected]))
+      operator_matmul = operator.matmul(x)
+      self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, expected]))
 
-  def test_broadcast_apply_dynamic_shapes(self):
+  def test_broadcast_matmul_dynamic_shapes(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
@@ -229,12 +229,12 @@ class LinearOperatorIdentityTest(
       # Batch matrix of zeros with the broadcast shape of x and operator.
       zeros = array_ops.zeros(shape=(2, 2, 3, 4), dtype=x.dtype)
 
-      # Expected result of apply and solve.
+      # Expected result of matmul and solve.
       expected = x + zeros
 
-      operator_apply = operator.apply(x)
+      operator_matmul = operator.matmul(x)
       self.assertAllClose(
-          *sess.run([operator_apply, expected], feed_dict=feed_dict))
+          *sess.run([operator_matmul, expected], feed_dict=feed_dict))
 
   def test_is_x_flags(self):
     # The is_x flags are by default all True.
@@ -332,7 +332,7 @@ class LinearOperatorScaledIdentityTest(
       with self.assertRaisesOpError("not self-adjoint"):
         operator.assert_self_adjoint().run()
 
-  def test_float16_apply(self):
+  def test_float16_matmul(self):
     # float16 cannot be tested by base test class because tf.matrix_solve does
     # not work with float16.
     with self.test_session():
@@ -340,7 +340,7 @@ class LinearOperatorScaledIdentityTest(
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=2, multiplier=multiplier)
       x = rng.randn(2, 3).astype(np.float16)
-      y = operator.apply(x)
+      y = operator.matmul(x)
       self.assertAllClose(multiplier[..., None, None] * x, y.eval())
 
   def test_non_scalar_num_rows_raises_static(self):
@@ -354,7 +354,7 @@ class LinearOperatorScaledIdentityTest(
         num_rows=2, multiplier=2.2)
     x = rng.randn(3, 3).astype(np.float32)
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
-      operator.apply(x)
+      operator.matmul(x)
 
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
@@ -363,11 +363,11 @@ class LinearOperatorScaledIdentityTest(
     with self.test_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows, multiplier=[1., 2], assert_proper_shapes=True)
-      y = operator.apply(x)
+      y = operator.matmul(x)
       with self.assertRaisesOpError("Incompatible.*dimensions"):
         y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)})
 
-  def test_broadcast_apply_and_solve(self):
+  def test_broadcast_matmul_and_solve(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
@@ -383,11 +383,11 @@ class LinearOperatorScaledIdentityTest(
       # Batch matrix of zeros with the broadcast shape of x and operator.
       zeros = array_ops.zeros(shape=(2, 2, 3, 4), dtype=x.dtype)
 
-      # Test apply
+      # Test matmul
       expected = x * 2.2 + zeros
-      operator_apply = operator.apply(x)
-      self.assertAllEqual(operator_apply.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, expected]))
+      operator_matmul = operator.matmul(x)
+      self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, expected]))
 
       # Test solve
       expected = x / 2.2 + zeros
@@ -395,7 +395,7 @@ class LinearOperatorScaledIdentityTest(
       self.assertAllEqual(operator_solve.get_shape(), expected.get_shape())
       self.assertAllClose(*sess.run([operator_solve, expected]))
 
-  def test_broadcast_apply_and_solve_scalar_scale_multiplier(self):
+  def test_broadcast_matmul_and_solve_scalar_scale_multiplier(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
     # In particular, tf.batch_matmul does not broadcast.
@@ -409,11 +409,11 @@ class LinearOperatorScaledIdentityTest(
       operator = linalg_lib.LinearOperatorScaledIdentity(
           num_rows=3, multiplier=2.2)
 
-      # Test apply
+      # Test matmul
       expected = x * 2.2
-      operator_apply = operator.apply(x)
-      self.assertAllEqual(operator_apply.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_apply, expected]))
+      operator_matmul = operator.matmul(x)
+      self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
+      self.assertAllClose(*sess.run([operator_matmul, expected]))
 
       # Test solve
       expected = x / 2.2
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
index 8b419700db01e5988f68468b4204a4817bd44110..78a4822c177c8d36fcbe82d3b557b2e6cb3630af 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -54,9 +55,12 @@ class LinearOperatorShape(linalg.LinearOperator):
   def _shape_tensor(self):
     return constant_op.constant(self._stored_shape, dtype=dtypes.int32)
 
+  def _matmul(self):
+    raise NotImplementedError("Not needed for this test.")
 
-class LinearOperatorApplyOnly(linalg.LinearOperator):
-  """LinearOperator that simply wraps a [batch] matrix and implements apply."""
+
+class LinearOperatorMatmulSolve(linalg.LinearOperator):
+  """LinearOperator that wraps a [batch] matrix and implements matmul/solve."""
 
   def __init__(self,
                matrix,
@@ -65,8 +69,8 @@ class LinearOperatorApplyOnly(linalg.LinearOperator):
                is_positive_definite=None,
                is_square=None):
     self._matrix = ops.convert_to_tensor(matrix, name="matrix")
-    super(LinearOperatorApplyOnly, self).__init__(
-        dtype=matrix.dtype,
+    super(LinearOperatorMatmulSolve, self).__init__(
+        dtype=self._matrix.dtype,
         is_non_singular=is_non_singular,
         is_self_adjoint=is_self_adjoint,
         is_positive_definite=is_positive_definite,
@@ -78,8 +82,15 @@ class LinearOperatorApplyOnly(linalg.LinearOperator):
   def _shape_tensor(self):
     return array_ops.shape(self._matrix)
 
-  def _apply(self, x, adjoint=False):
-    return math_ops.matmul(self._matrix, x, adjoint_a=adjoint)
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    x = ops.convert_to_tensor(x, name="x")
+    return math_ops.matmul(
+        self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    rhs = ops.convert_to_tensor(rhs, name="rhs")
+    assert not adjoint_arg, "Not implemented for this test class."
+    return linalg_ops.matrix_solve(self._matrix, rhs, adjoint=adjoint)
 
 
 class LinearOperatorTest(test.TestCase):
@@ -118,7 +129,7 @@ class LinearOperatorTest(test.TestCase):
 
   def test_generic_to_dense_method_non_square_matrix_static(self):
     matrix = rng.randn(2, 3, 4)
-    operator = LinearOperatorApplyOnly(matrix)
+    operator = LinearOperatorMatmulSolve(matrix)
     with self.test_session():
       operator_dense = operator.to_dense()
       self.assertAllEqual((2, 3, 4), operator_dense.get_shape())
@@ -127,12 +138,30 @@ class LinearOperatorTest(test.TestCase):
   def test_generic_to_dense_method_non_square_matrix_tensor(self):
     matrix = rng.randn(2, 3, 4)
     matrix_ph = array_ops.placeholder(dtypes.float64)
-    operator = LinearOperatorApplyOnly(matrix_ph)
+    operator = LinearOperatorMatmulSolve(matrix_ph)
     with self.test_session():
       operator_dense = operator.to_dense()
       self.assertAllClose(
           matrix, operator_dense.eval(feed_dict={matrix_ph: matrix}))
 
+  def test_matvec(self):
+    matrix = [[1., 0], [0., 2.]]
+    operator = LinearOperatorMatmulSolve(matrix)
+    x = [1., 1.]
+    with self.test_session():
+      y = operator.matvec(x)
+      self.assertAllEqual((2,), y.get_shape())
+      self.assertAllClose([1., 2.], y.eval())
+
+  def test_solvevec(self):
+    matrix = [[1., 0], [0., 2.]]
+    operator = LinearOperatorMatmulSolve(matrix)
+    y = [1., 1.]
+    with self.test_session():
+      x = operator.solvevec(y)
+      self.assertAllEqual((2,), x.get_shape())
+      self.assertAllClose([1., 1 / 2.], x.eval())
+
   def test_is_square_set_to_true_for_square_static_shapes(self):
     operator = LinearOperatorShape(shape=(2, 4, 4))
     self.assertTrue(operator.is_square)
@@ -148,11 +177,11 @@ class LinearOperatorTest(test.TestCase):
   def test_is_square_set_inconsistent_with_other_hints_raises(self):
     with self.assertRaisesRegexp(ValueError, "is always square"):
       matrix = array_ops.placeholder(dtypes.float32)
-      LinearOperatorApplyOnly(matrix, is_non_singular=True, is_square=False)
+      LinearOperatorMatmulSolve(matrix, is_non_singular=True, is_square=False)
 
     with self.assertRaisesRegexp(ValueError, "is always square"):
       matrix = array_ops.placeholder(dtypes.float32)
-      LinearOperatorApplyOnly(
+      LinearOperatorMatmulSolve(
           matrix, is_positive_definite=True, is_square=False)
 
   def test_non_square_operators_raise_on_determinant_and_solve(self):
@@ -166,16 +195,16 @@ class LinearOperatorTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, "is always square"):
       matrix = array_ops.placeholder(dtypes.float32)
-      LinearOperatorApplyOnly(
+      LinearOperatorMatmulSolve(
           matrix, is_positive_definite=True, is_square=False)
 
   def test_is_square_manual_set_works(self):
     matrix = array_ops.placeholder(dtypes.float32)
     # Default is None.
-    operator = LinearOperatorApplyOnly(matrix)
+    operator = LinearOperatorMatmulSolve(matrix)
     self.assertEqual(None, operator.is_square)
     # Set to True
-    operator = LinearOperatorApplyOnly(matrix, is_square=True)
+    operator = LinearOperatorMatmulSolve(matrix, is_square=True)
     self.assertTrue(operator.is_square)
 
 
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_udvh_update_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_udvh_update_test.py
index 7abe12f1a477bff5183288be2bd5628270d9deab..f28213096b75a252eab74d7b3b4ae3f8498cb2fb 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_udvh_update_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_udvh_update_test.py
@@ -56,10 +56,11 @@ class BaseLinearOperatorUDVHUpdatetest(object):
 
   @property
   def _shapes_to_test(self):
-    # Add the (2, 10, 10) shape at the end to get something slightly larger than
-    # the other tests.  Doing this because this operator makes use of inversion
-    # and determinant lemmas that are known to have stability issues.
-    return [(0, 0), (1, 1), (1, 3, 3), (3, 4, 4), (2, 1, 4, 4), (2, 10, 10)]
+    # Previously we had a (2, 10, 10) shape at the end.  We did this to test the
+    # inversion and determinant lemmas on not-tiny matrices, since these are
+    # known to have stability issues.  This resulted in test timeouts, so this
+    # shape has been removed, but rest assured, the tests did pass.
+    return [(0, 0), (1, 1), (1, 3, 3), (3, 4, 4), (2, 1, 4, 4)]
 
   def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
     # Recall A = L + UDV^H
diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py
index a06af336e717b2ebb2bafff7d19c76dfd8631643..f047f4b9787cb6aa171d8d4ca2531878e6301cfb 100644
--- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py
+++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_util_test.py
@@ -229,6 +229,29 @@ class MatmulWithBroadcastTest(test.TestCase):
       self.assertAllEqual(expected, result)
 
 
+class MatrixAdjointTest(test.TestCase):
+
+  def testNonBatchMatrix(self):
+    a = [[1, 2, 3j], [4, 5, -6j]]  # Shape (2, 3)
+    expected = [[1, 4], [2, 5], [-3j, 6j]]  # Shape (3, 2)
+    with self.test_session():
+      a_adj = linear_operator_util.matrix_adjoint(a)
+      self.assertEqual((3, 2), a_adj.get_shape())
+      self.assertAllClose(expected, a_adj.eval())
+
+  def testBatchMatrix(self):
+    matrix_0 = [[1j, 2, 3], [4, 5, 6]]
+    matrix_0_a = [[-1j, 4], [2, 5], [3, 6]]
+    matrix_1 = [[11, 22, 33], [44, 55, 66j]]
+    matrix_1_a = [[11, 44], [22, 55], [33, -66j]]
+    batch_matrix = [matrix_0, matrix_1]  # Shape (2, 2, 3)
+    expected_adj = [matrix_0_a, matrix_1_a]  # Shape (2, 3, 2)
+    with self.test_session():
+      matrix_adj = linear_operator_util.matrix_adjoint(batch_matrix)
+      self.assertEqual((2, 3, 2), matrix_adj.get_shape())
+      self.assertAllEqual(expected_adj, matrix_adj.eval())
+
+
 class DomainDimensionStubOperator(object):
 
   def __init__(self, domain_dimension):
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator.py b/tensorflow/contrib/linalg/python/ops/linear_operator.py
index 5052a0b15cf355b50a45eef0a0eed314108d0d58..6cdfa8618932d0e9ae1198d68e78f36583022390 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator.py
@@ -18,19 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import contextlib
 
+import numpy as np
+
 from tensorflow.contrib import framework as contrib_framework
 from tensorflow.contrib.linalg.python.ops import linear_operator_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
 
 __all__ = ["LinearOperator"]
 
 
 # TODO(langmore) Use matrix_solve_ls for singular or non-square matrices.
-# TODO(langmore) Add adjoint_x arg to apply, solve.
 class LinearOperator(object):
   """Base class defining a [batch of] linear operator[s].
 
@@ -46,16 +51,14 @@ class LinearOperator(object):
   To enable a public method, subclasses should implement the leading-underscore
   version of the method.  The argument signature should be identical except for
   the omission of `name="..."`.  For example, to enable
-  `apply(x, adjoint=False, name="apply")` a subclass should implement
-  `_apply(x, adjoint=False)`.
+  `matmul(x, adjoint=False, name="matmul")` a subclass should implement
+  `_matmul(x, adjoint=False)`.
 
   #### Performance contract
 
-  Subclasses should implement a method only if it can be done with a reasonable
-  performance increase over generic dense operations, either in time, parallel
-  scalability, or memory usage.  For example, if the determinant can only be
-  computed using `tf.matrix_determinant(self.to_dense())`, then determinants
-  should not be implemented.
+  Subclasses should only implement the assert methods
+  (e.g. `assert_non_singular`) if they can be done in less than `O(N^3)`
+  time.
 
   Class docstrings should contain an explanation of computational complexity.
   Since this is a high-performance library, attention should be paid to detail,
@@ -69,7 +72,7 @@ class LinearOperator(object):
 
   An example is:
 
-  `x` is a batch matrix with compatible shape for `apply` if
+  `x` is a batch matrix with compatible shape for `matmul` if
 
   ```
   operator.shape = [B1,...,Bb] + [M, N],  b >= 0,
@@ -101,12 +104,12 @@ class LinearOperator(object):
   operator.shape()
   ==> [2, 4, 4]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> Shape [2] Tensor
 
   x = ... Shape [2, 4, 5] Tensor
 
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4, 5] Tensor
   ```
 
@@ -132,6 +135,7 @@ class LinearOperator(object):
   * If `is_X == None` (the default), callers should have no expectation either
     way.
   """
+  __metaclass__ = abc.ABCMeta
 
   def __init__(self,
                dtype,
@@ -147,7 +151,7 @@ class LinearOperator(object):
     **Subclasses should copy-paste this `__init__` documentation.**
 
     Args:
-      dtype: The type of the this `LinearOperator`.  Arguments to `apply` and
+      dtype: The type of the this `LinearOperator`.  Arguments to `matmul` and
         `solve` will have to be this type.
       graph_parents: Python list of graph prerequisites of this `LinearOperator`
         Typically tensors that are passed during initialization.
@@ -168,17 +172,23 @@ class LinearOperator(object):
       ValueError:  If hints are set incorrectly.
     """
     # Check and auto-set flags.
-    if is_square is False:
-      if is_non_singular or is_positive_definite:
-        raise ValueError(
-            "A non-singular or positive definite operator is always square.")
-    self._is_square_set_by_user = is_square
-
     if is_positive_definite:
       if is_non_singular is False:
         raise ValueError("A positive definite matrix is always non-singular.")
       is_non_singular = True
 
+    if is_non_singular:
+      if is_square is False:
+        raise ValueError("A non-singular matrix is always square.")
+      is_square = True
+
+    if is_self_adjoint:
+      if is_square is False:
+        raise ValueError("A self-adjoint matrix is always square.")
+      is_square = True
+
+    self._is_square_set_or_implied_by_hints = is_square
+
     graph_parents = [] if graph_parents is None else graph_parents
     for i, t in enumerate(graph_parents):
       if t is None or not contrib_framework.is_tensor(t):
@@ -240,15 +250,16 @@ class LinearOperator(object):
     """Return `True/False` depending on if this operator is square."""
     # Static checks done after __init__.  Why?  Because domain/range dimension
     # sometimes requires lots of work done in the derived class after init.
-    static_square_check = self.domain_dimension == self.range_dimension
-    if self._is_square_set_by_user is False and static_square_check:
+    auto_square_check = self.domain_dimension == self.range_dimension
+    if self._is_square_set_or_implied_by_hints is False and auto_square_check:
       raise ValueError(
           "User set is_square hint to False, but the operator was square.")
-    if self._is_square_set_by_user is None:
-      return static_square_check
+    if self._is_square_set_or_implied_by_hints is None:
+      return auto_square_check
 
-    return self._is_square_set_by_user
+    return self._is_square_set_or_implied_by_hints
 
+  @abc.abstractmethod
   def _shape(self):
     # Write this in derived class to enable all static shape methods.
     raise NotImplementedError("_shape is not implemented.")
@@ -266,6 +277,7 @@ class LinearOperator(object):
     """
     return self._shape()
 
+  @abc.abstractmethod
   def _shape_tensor(self):
     raise NotImplementedError("_shape_tensor is not implemented.")
 
@@ -368,8 +380,7 @@ class LinearOperator(object):
           self._cached_tensor_rank_tensor = ops.convert_to_tensor(
               self.tensor_rank)
         else:
-          self._cached_tensor_rank_tensor = array_ops.size(
-              self.shape_tensor())
+          self._cached_tensor_rank_tensor = array_ops.size(self.shape_tensor())
       return self._cached_tensor_rank_tensor
 
   @property
@@ -449,14 +460,70 @@ class LinearOperator(object):
       return self._cached_range_dimension_tensor
 
   def _assert_non_singular(self):
+    """Private default implementation of _assert_non_singular."""
+    logging.warn(
+        "Using (possibly slow) default implementation of assert_non_singular."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    if self._can_use_cholesky():
+      return self.assert_positive_definite()
+    else:
+      singular_values = linalg_ops.svd(
+          self._get_cached_dense_matrix(), compute_uv=False)
+      # TODO(langmore) Add .eig and .cond as methods.
+      cond = (math_ops.reduce_max(singular_values, axis=-1) /
+              math_ops.reduce_min(singular_values, axis=-1))
+      return check_ops.assert_less(
+          cond,
+          self._max_condition_number_to_be_non_singular(),
+          message="Singular matrix up to precision epsilon.")
     raise NotImplementedError("assert_non_singular is not implemented.")
 
+  def _max_condition_number_to_be_non_singular(self):
+    """Return the maximum condition number that we consider nonsingular."""
+    with ops.name_scope("max_nonsingular_condition_number"):
+      dtype_eps = np.finfo(self.dtype.as_numpy_dtype).eps
+      eps = math_ops.cast(
+          math_ops.reduce_max([
+              100.,
+              math_ops.cast(self.range_dimension_tensor(), self.dtype),
+              math_ops.cast(self.domain_dimension_tensor(), self.dtype)
+          ]), self.dtype) * dtype_eps
+      return 1. / eps
+
   def assert_non_singular(self, name="assert_non_singular"):
-    """Returns an `Op` that asserts this operator is non singular."""
+    """Returns an `Op` that asserts this operator is non singular.
+
+    This operator is considered non-singular if
+
+    ```
+    ConditionNumber < max{100, range_dimension, domain_dimension} * eps,
+    eps := np.finfo(self.dtype.as_numpy_dtype).eps
+    ```
+
+    Args:
+      name:  A string name to prepend to created ops.
+
+    Returns:
+      An `Assert` `Op`, that, when run, will raise an `InvalidArgumentError` if
+        the operator is singular.
+    """
     with self._name_scope(name):
       return self._assert_non_singular()
 
   def _assert_positive_definite(self):
+    """Default implementation of _assert_positive_definite."""
+    logging.warn(
+        "Using (possibly slow) default implementation of "
+        "assert_positive_definite."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    # If the operator is self-adjoint, then checking that
+    # Cholesky decomposition succeeds + results in positive diag is necessary
+    # and sufficient.
+    if self.is_self_adjoint:
+      return check_ops.assert_positive(
+          array_ops.matrix_diag_part(self._get_cached_chol()),
+          message="Matrix was not positive definite.")
+    # We have no generic check for positive definite.
     raise NotImplementedError("assert_positive_definite is not implemented.")
 
   def assert_positive_definite(self, name="assert_positive_definite"):
@@ -470,16 +537,35 @@ class LinearOperator(object):
       name:  A name to give this `Op`.
 
     Returns:
-      An `Op` that asserts this operator is positive definite.
+      An `Assert` `Op`, that, when run, will raise an `InvalidArgumentError` if
+        the operator is not positive definite.
     """
     with self._name_scope(name):
       return self._assert_positive_definite()
 
   def _assert_self_adjoint(self):
-    raise NotImplementedError("assert_self_adjoint is not implemented.")
+    dense = self._get_cached_dense_matrix()
+    logging.warn(
+        "Using (possibly slow) default implementation of assert_self_adjoint."
+        "  Requires conversion to a dense matrix.")
+    return check_ops.assert_equal(
+        dense,
+        linear_operator_util.matrix_adjoint(dense),
+        message="Matrix was not equal to its adjoint.")
 
   def assert_self_adjoint(self, name="assert_self_adjoint"):
-    """Returns an `Op` that asserts this operator is self-adjoint."""
+    """Returns an `Op` that asserts this operator is self-adjoint.
+
+    Here we check that this operator is *exactly* equal to its hermitian
+    transpose.
+
+    Args:
+      name:  A string name to prepend to created ops.
+
+    Returns:
+      An `Assert` `Op`, that, when run, will raise an `InvalidArgumentError` if
+        the operator is not self-adjoint.
+    """
     with self._name_scope(name):
       return self._assert_self_adjoint()
 
@@ -487,19 +573,36 @@ class LinearOperator(object):
     """Check that arg.dtype == self.dtype."""
     if arg.dtype != self.dtype:
       raise TypeError(
-          "Expected argument to have dtype %s.  Found: %s in tensor %s"
-          % (self.dtype, arg.dtype, arg))
+          "Expected argument to have dtype %s.  Found: %s in tensor %s" %
+          (self.dtype, arg.dtype, arg))
 
-  def _apply(self, x, adjoint=False):
-    raise NotImplementedError("_apply is not implemented.")
+  @abc.abstractmethod
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    raise NotImplementedError("_matmul is not implemented.")
 
-  def apply(self, x, adjoint=False, name="apply"):
-    """Transform `x` with left multiplication:  `x --> Ax`.
+  def matmul(self, x, adjoint=False, adjoint_arg=False, name="matmul"):
+    """Transform [batch] matrix `x` with left multiplication:  `x --> Ax`.
+
+    ```python
+    # Make an operator acting like batch matrix A.  Assume A.shape = [..., M, N]
+    operator = LinearOperator(...)
+    operator.shape = [..., M, N]
+
+    X = ... # shape [..., N, R], batch matrix, R > 0.
+
+    Y = operator.matmul(X)
+    Y.shape
+    ==> [..., M, R]
+
+    Y[..., :, r] = sum_j A[..., :, j] X[j, r]
+    ```
 
     Args:
       x: `Tensor` with compatible shape and same `dtype` as `self`.
         See class docstring for definition of compatibility.
-      adjoint: Python `bool`.  If `True`, left multiply by the adjoint.
+      adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
+      adjoint_arg:  Python `bool`.  If `True`, compute `A x^H` where `x^H` is
+        the hermitian transpose (transposition and complex conjugation).
       name:  A name for this `Op.
 
     Returns:
@@ -508,14 +611,59 @@ class LinearOperator(object):
     with self._name_scope(name, values=[x]):
       x = ops.convert_to_tensor(x, name="x")
       self._check_input_dtype(x)
-      if adjoint:
-        self.shape[-2].assert_is_compatible_with(x.get_shape()[-2])
-      else:
-        self.shape[-1].assert_is_compatible_with(x.get_shape()[-2])
-      return self._apply(x, adjoint=adjoint)
+
+      self_dim = -2 if adjoint else -1
+      arg_dim = -1 if adjoint_arg else -2
+      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim])
+
+      return self._matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+  def _matvec(self, x, adjoint=False):
+    x_mat = array_ops.expand_dims(x, axis=-1)
+    y_mat = self.matmul(x_mat, adjoint=adjoint)
+    return array_ops.squeeze(y_mat, axis=-1)
+
+  def matvec(self, x, adjoint=False, name="matvec"):
+    """Transform [batch] vector `x` with left multiplication:  `x --> Ax`.
+
+    ```python
+    # Make an operator acting like batch matric A.  Assume A.shape = [..., M, N]
+    operator = LinearOperator(...)
+
+    X = ... # shape [..., N], batch vector
+
+    Y = operator.matvec(X)
+    Y.shape
+    ==> [..., M]
+
+    Y[..., :] = sum_j A[..., :, j] X[..., j]
+    ```
+
+    Args:
+      x: `Tensor` with compatible shape and same `dtype` as `self`.
+        `x` is treated as a [batch] vector meaning for every set of leading
+        dimensions, the last dimension defines a vector.
+        See class docstring for definition of compatibility.
+      adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
+      name:  A name for this `Op.
+
+    Returns:
+      A `Tensor` with shape `[..., M]` and same `dtype` as `self`.
+    """
+    with self._name_scope(name, values=[x]):
+      x = ops.convert_to_tensor(x, name="x")
+      self._check_input_dtype(x)
+      self_dim = -2 if adjoint else -1
+      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[-1])
+      return self._matvec(x, adjoint=adjoint)
 
   def _determinant(self):
-    raise NotImplementedError("_det is not implemented.")
+    logging.warn(
+        "Using (possibly slow) default implementation of determinant."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    if self._can_use_cholesky():
+      return math_ops.exp(self.log_abs_determinant())
+    return linalg_ops.matrix_determinant(self._matrix)
 
   def determinant(self, name="det"):
     """Determinant for every batch member.
@@ -537,7 +685,14 @@ class LinearOperator(object):
       return self._determinant()
 
   def _log_abs_determinant(self):
-    raise NotImplementedError("_log_abs_det is not implemented.")
+    logging.warn(
+        "Using (possibly slow) default implementation of determinant."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    if self._can_use_cholesky():
+      diag = array_ops.matrix_diag_part(self._get_cached_chol())
+      return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
+    abs_det = math_ops.abs(self.determinant())
+    return math_ops.log(abs_det)
 
   def log_abs_determinant(self, name="log_abs_det"):
     """Log absolute value of determinant for every batch member.
@@ -558,37 +713,53 @@ class LinearOperator(object):
     with self._name_scope(name):
       return self._log_abs_determinant()
 
-  def _solve(self, rhs, adjoint=False):
-    # Since this is an exact solve method for all rhs, this will only be
-    # available for non-singular (batch) operators, in particular the operator
-    # must be square.
-    raise NotImplementedError("_solve is not implemented.")
-
-  def solve(self, rhs, adjoint=False, name="solve"):
-    """Solve `R` (batch) systems of equations exactly: `A X = rhs`.
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    """Default implementation of _solve."""
+    if self.is_square is False:
+      raise NotImplementedError(
+          "Solve is not yet implemented for non-square operators.")
+    logging.warn(
+        "Using (possibly slow) default implementation of solve."
+        "  Requires conversion to a dense matrix and O(N^3) operations.")
+    rhs = linear_operator_util.matrix_adjoint(rhs) if adjoint_arg else rhs
+    if self._can_use_cholesky():
+      return linalg_ops.cholesky_solve(self._get_cached_chol(), rhs)
+    return linalg_ops.matrix_solve(
+        self._get_cached_dense_matrix(), rhs, adjoint=adjoint)
+
+  def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"):
+    """Solve (exact or approx) `R` (batch) systems of equations: `A X = rhs`.
+
+    The returned `Tensor` will be close to an exact solution if `A` is well
+    conditioned. Otherwise closeness will vary. See class docstring for details.
 
     Examples:
 
     ```python
-    # Create an operator acting like a 10 x 2 x 2 matrix.
+    # Make an operator acting like batch matrix A.  Assume A.shape = [..., M, N]
     operator = LinearOperator(...)
-    operator.shape # = 10 x 2 x 2
+    operator.shape = [..., M, N]
 
-    # Solve one linear system (R = 1) for every member of the length 10 batch.
-    RHS = ... # shape 10 x 2 x 1
-    X = operator.solve(RHS)  # shape 10 x 2 x 1
+    # Solve R > 0 linear systems for every member of the batch.
+    RHS = ... # shape [..., M, R]
 
-    # Solve five linear systems (R = 5) for every member of the length 10 batch.
-    RHS = ... # shape 10 x 2 x 5
     X = operator.solve(RHS)
-    X[3, :, 2]  # Solution to the linear system A[3, :, :] X = RHS[3, :, 2]
+    # X[..., :, r] is the solution to the r'th linear system
+    # sum_j A[..., :, j] X[..., j, r] = RHS[..., :, r]
+
+    operator.matmul(X)
+    ==> RHS
     ```
 
     Args:
       rhs: `Tensor` with same `dtype` as this operator and compatible shape.
+        `rhs` is treated like a [batch] matrix meaning for every set of leading
+        dimensions, the last two dimensions defines a matrix.
         See class docstring for definition of compatibility.
       adjoint: Python `bool`.  If `True`, solve the system involving the adjoint
-        of this `LinearOperator`.
+        of this `LinearOperator`:  `A^H X = rhs`.
+      adjoint_arg:  Python `bool`.  If `True`, solve `A X = rhs^H` where `rhs^H`
+        is the hermitian transpose (transposition and complex conjugation).
       name:  A name scope to use for ops added by this method.
 
     Returns:
@@ -608,14 +779,70 @@ class LinearOperator(object):
     with self._name_scope(name, values=[rhs]):
       rhs = ops.convert_to_tensor(rhs, name="rhs")
       self._check_input_dtype(rhs)
-      if adjoint:
-        self.shape[-1].assert_is_compatible_with(rhs.get_shape()[-2])
-      else:
-        self.shape[-2].assert_is_compatible_with(rhs.get_shape()[-2])
-      return self._solve(rhs, adjoint=adjoint)
+
+      self_dim = -1 if adjoint else -2
+      arg_dim = -1 if adjoint_arg else -2
+      self.shape[self_dim].assert_is_compatible_with(rhs.get_shape()[arg_dim])
+
+      return self._solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+  def _solvevec(self, rhs, adjoint=False):
+    """Default implementation of _solvevec."""
+    rhs_mat = array_ops.expand_dims(rhs, axis=-1)
+    solution_mat = self.solve(rhs_mat, adjoint=adjoint)
+    return array_ops.squeeze(solution_mat, axis=-1)
+
+  def solvevec(self, rhs, adjoint=False, name="solve"):
+    """Solve single equation with best effort: `A X = rhs`.
+
+    The returned `Tensor` will be close to an exact solution if `A` is well
+    conditioned. Otherwise closeness will vary. See class docstring for details.
+
+    Examples:
+
+    ```python
+    # Make an operator acting like batch matrix A.  Assume A.shape = [..., M, N]
+    operator = LinearOperator(...)
+    operator.shape = [..., M, N]
+
+    # Solve one linear system for every member of the batch.
+    RHS = ... # shape [..., M]
+
+    X = operator.solvevec(RHS)
+    # X is the solution to the linear system
+    # sum_j A[..., :, j] X[..., j] = RHS[..., :]
+
+    operator.matvec(X)
+    ==> RHS
+    ```
+
+    Args:
+      rhs: `Tensor` with same `dtype` as this operator.
+        `rhs` is treated like a [batch] vector meaning for every set of leading
+        dimensions, the last dimension defines a vector.  See class docstring
+        for definition of compatibility regarding batch dimensions.
+      adjoint: Python `bool`.  If `True`, solve the system involving the adjoint
+        of this `LinearOperator`:  `A^H X = rhs`.
+      name:  A name scope to use for ops added by this method.
+
+    Returns:
+      `Tensor` with shape `[...,N]` and same `dtype` as `rhs`.
+
+    Raises:
+      NotImplementedError:  If `self.is_non_singular` or `is_square` is False.
+    """
+    with self._name_scope(name, values=[rhs]):
+      rhs = ops.convert_to_tensor(rhs, name="rhs")
+      self._check_input_dtype(rhs)
+      self_dim = -1 if adjoint else -2
+      self.shape[self_dim].assert_is_compatible_with(rhs.get_shape()[-1])
+
+      return self._solvevec(rhs, adjoint=adjoint)
 
   def _to_dense(self):
     """Generic and often inefficient implementation.  Override often."""
+    logging.warn("Using (possibly slow) default implementation of to_dense."
+                 "  Converts by self.matmul(identity).")
     if self.batch_shape.is_fully_defined():
       batch_shape = self.batch_shape
     else:
@@ -627,7 +854,7 @@ class LinearOperator(object):
       n = self.domain_dimension_tensor()
 
     eye = linalg_ops.eye(num_rows=n, batch_shape=batch_shape, dtype=self.dtype)
-    return self.apply(eye)
+    return self.matmul(eye)
 
   def to_dense(self, name="to_dense"):
     """Return a dense (batch) matrix representing this operator."""
@@ -636,7 +863,7 @@ class LinearOperator(object):
 
   def _diag_part(self):
     """Generic and often inefficient implementation.  Override often."""
-    return array_ops.matrix_diag_part(self.to_dense())
+    return array_ops.matrix_diag_part(self._get_cached_dense_matrix())
 
   def diag_part(self, name="diag_part"):
     """Efficiently get the [batch] diagonal part of this operator.
@@ -668,7 +895,7 @@ class LinearOperator(object):
 
   def _add_to_tensor(self, x):
     # Override if a more efficient implementation is available.
-    return self.to_dense() + x
+    return self._get_cached_dense_matrix() + x
 
   def add_to_tensor(self, x, name="add_to_tensor"):
     """Add matrix represented by this operator to `x`.  Equivalent to `A + x`.
@@ -684,3 +911,18 @@ class LinearOperator(object):
       x = ops.convert_to_tensor(x, name="x")
       self._check_input_dtype(x)
       return self._add_to_tensor(x)
+
+  def _can_use_cholesky(self):
+    # TODO(langmore) Add complex types when tf.cholesky can use them.
+    return (not self.dtype.is_complex and self.is_self_adjoint and
+            self.is_positive_definite)
+
+  def _get_cached_dense_matrix(self):
+    if not hasattr(self, "_cached_dense_matrix"):
+      self._cached_dense_matrix = self.to_dense()
+    return self._cached_dense_matrix
+
+  def _get_cached_chol(self):
+    if not hasattr(self, "_cached_chol"):
+      self._cached_chol = linalg_ops.cholesky(self._get_cached_dense_matrix())
+    return self._cached_chol
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_addition.py b/tensorflow/contrib/linalg/python/ops/linear_operator_addition.py
index 7617e1b591d8f6dff3513a226e1faacb6dafe8d4..16c4c6e6d67f17d1674b8d1d39f006bc688bc6ce 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_addition.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_addition.py
@@ -43,7 +43,7 @@ def add_operators(operators,
   Given operators `[A1, A2,...]`, this `Op` returns a possibly shorter list of
   operators `[B1, B2,...]` such that
 
-  ```sum_k Ak.apply(x) = sum_k Bk.apply(x).```
+  ```sum_k Ak.matmul(x) = sum_k Bk.matmul(x).```
 
   The operators `Bk` result by adding some of the `Ak`, as allowed by
   `addition_tiers`.
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
index 9f3a4d230f7eab7b92bf1fcaf24a5af8ff045853..9dec621ab29d0bd19aa9cdbe1393755d68366b38 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py
@@ -63,11 +63,11 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 5 linear operators.
@@ -83,7 +83,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
 
   # Create a shape [2, 3, 6, 2] vector.
   x = tf.random_normal(shape=[2, 3, 6, 2])
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 3, 4, 2] Tensor
   ```
 
@@ -96,7 +96,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -112,12 +112,13 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=None,
                name=None):
     r"""Initialize a `LinearOperatorComposition`.
 
     `LinearOperatorComposition` is initialized with a list of operators
-    `[op_1,...,op_J]`.  For the `apply` method to be well defined, the
-    composition `op_i.apply(op_{i+1}(x))` must be defined.  Other methods have
+    `[op_1,...,op_J]`.  For the `matmul` method to be well defined, the
+    composition `op_i.matmul(op_{i+1}(x))` must be defined.  Other methods have
     similar constraints.
 
     Args:
@@ -132,6 +133,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.  Default is the individual
         operators names joined with `_o_`.
 
@@ -177,6 +179,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
   @property
@@ -225,18 +228,19 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
 
     return array_ops.concat((batch_shape, matrix_shape), 0)
 
-  def _apply(self, x, adjoint=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     # If self.operators = [A, B], and not adjoint, then
-    # apply_order_list = [B, A].
-    # As a result, we return A.apply(B.apply(x))
+    # matmul_order_list = [B, A].
+    # As a result, we return A.matmul(B.matmul(x))
     if adjoint:
-      apply_order_list = self.operators
+      matmul_order_list = self.operators
     else:
-      apply_order_list = list(reversed(self.operators))
+      matmul_order_list = list(reversed(self.operators))
 
-    result = x
-    for operator in apply_order_list:
-      result = operator.apply(result, adjoint=adjoint)
+    result = matmul_order_list[0].matmul(
+        x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+    for operator in matmul_order_list[1:]:
+      result = operator.matmul(result, adjoint=adjoint)
     return result
 
   def _determinant(self):
@@ -251,7 +255,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
       result += operator.log_abs_determinant()
     return result
 
-  def _solve(self, rhs, adjoint=False):
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     # TODO(langmore) Implement solve using solve_ls if some intermediate
     # operator maps to a high dimensional space.
     # In that case, an exact solve may still be possible.
@@ -264,8 +268,9 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
     else:
       solve_order_list = self.operators
 
-    solution = rhs
-    for operator in solve_order_list:
+    solution = solve_order_list[0].solve(
+        rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
+    for operator in solve_order_list[1:]:
       solution = operator.solve(solution, adjoint=adjoint)
     return solution
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
index 0cd7e72a8b64128a819cdfa302dd5818da446bb0..56bc967706a9f2b15aabead4d6864d02e3e5ed08 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py
@@ -52,11 +52,11 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
@@ -68,13 +68,13 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   # operator.batch_shape = [2, 3].
   y = tf.random_normal(shape=[2, 1, 4, 2])
   x = operator.solve(y)
-  ==> operator.apply(x) = y
+  ==> operator.matmul(x) = y
   ```
 
   #### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
@@ -87,7 +87,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   Suppose `operator` is a `LinearOperatorDiag` of shape `[N, N]`,
   and `x.shape = [N, R]`.  Then
 
-  * `operator.apply(x)` involves `N * R` multiplications.
+  * `operator.matmul(x)` involves `N * R` multiplications.
   * `operator.solve(x)` involves `N` divisions and `N * R` multiplications.
   * `operator.determinant()` involves a size `N` `reduce_prod`.
 
@@ -97,7 +97,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -113,6 +113,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=None,
                name="LinearOperatorDiag"):
     r"""Initialize a `LinearOperatorDiag`.
 
@@ -129,6 +130,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
     Raises:
@@ -147,12 +149,17 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
         else:
           is_self_adjoint = True
 
+      if is_square is False:
+        raise ValueError("Only square diagonal operators currently supported.")
+      is_square = True
+
       super(LinearOperatorDiag, self).__init__(
           dtype=self._diag.dtype,
           graph_parents=[self._diag],
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
   def _check_diag(self, diag):
@@ -206,8 +213,9 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
             "This diagonal operator contained non-zero imaginary values.  "
             " Thus it was not self-adjoint."))
 
-  def _apply(self, x, adjoint=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     diag_term = math_ops.conj(self._diag) if adjoint else self._diag
+    x = linear_operator_util.matrix_adjoint(x) if adjoint_arg else x
     diag_mat = array_ops.expand_dims(diag_term, -1)
     return diag_mat * x
 
@@ -218,8 +226,9 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
     return math_ops.reduce_sum(
         math_ops.log(math_ops.abs(self._diag)), reduction_indices=[-1])
 
-  def _solve(self, rhs, adjoint=False):
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     diag_term = math_ops.conj(self._diag) if adjoint else self._diag
+    rhs = linear_operator_util.matrix_adjoint(rhs) if adjoint_arg else rhs
     inv_diag_mat = array_ops.expand_dims(1. / diag_term, -1)
     return rhs * inv_diag_mat
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py b/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
index f9349682215613c4fddd6edae372dc5c00b37f32..67889511cbffcbec934855d67914e40b157bdc91 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py
@@ -22,7 +22,6 @@ from tensorflow.contrib.linalg.python.ops import linear_operator
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 
 __all__ = ["LinearOperatorFullMatrix"]
@@ -48,11 +47,11 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
@@ -63,7 +62,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   #### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [M, N],  with b >= 0
@@ -82,7 +81,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   In all cases, suppose `operator` is a `LinearOperatorFullMatrix` of shape
   `[M, N]`, and `x.shape = [N, R]`.  Then
 
-  * `operator.apply(x)` is `O(M * N * R)`.
+  * `operator.matmul(x)` is `O(M * N * R)`.
   * If `M=N`, `operator.solve(x)` is `O(N^3 * R)`.
   * If `M=N`, `operator.determinant()` is `O(N^3)`.
 
@@ -92,7 +91,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -108,6 +107,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=None,
                name="LinearOperatorFullMatrix"):
     r"""Initialize a `LinearOperatorFullMatrix`.
 
@@ -123,6 +123,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
     Raises:
@@ -133,19 +134,13 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
       self._matrix = ops.convert_to_tensor(matrix, name="matrix")
       self._check_matrix(self._matrix)
 
-      # Special treatment for (real) Symmetric Positive Definite.
-      self._is_spd = (
-          (not self._matrix.dtype.is_complex)
-          and is_self_adjoint and is_positive_definite)
-      if self._is_spd:
-        self._chol = linalg_ops.cholesky(self._matrix)
-
       super(LinearOperatorFullMatrix, self).__init__(
           dtype=self._matrix.dtype,
           graph_parents=[self._matrix],
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
   def _check_matrix(self, matrix):
@@ -172,25 +167,9 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
   def _shape_tensor(self):
     return array_ops.shape(self._matrix)
 
-  def _apply(self, x, adjoint=False):
-    return math_ops.matmul(self._matrix, x, adjoint_a=adjoint)
-
-  def _determinant(self):
-    if self._is_spd:
-      return math_ops.exp(self.log_abs_determinant())
-    return linalg_ops.matrix_determinant(self._matrix)
-
-  def _log_abs_determinant(self):
-    if self._is_spd:
-      diag = array_ops.matrix_diag_part(self._chol)
-      return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
-    abs_det = math_ops.abs(self.determinant())
-    return math_ops.log(abs_det)
-
-  def _solve(self, rhs, adjoint=False):
-    if self._is_spd:
-      return linalg_ops.cholesky_solve(self._chol, rhs)
-    return linalg_ops.matrix_solve(self._matrix, rhs, adjoint=adjoint)
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    return math_ops.matmul(
+        self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _to_dense(self):
     return self._matrix
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
index 60d8b2cdc03dcadb792ecb3548f0735d19954b1e..acba1c7035d738d878d801463b857104b98cfc83 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py
@@ -112,11 +112,11 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> 0.
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor, same as x.
 
   y = tf.random_normal(shape=[3, 2, 4])
@@ -141,20 +141,20 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
   # to detect that no broadcast is necessary because both x and the operator
   # have statically defined shape.
   x = ... Shape [2, 2, 3]
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 2, 3] Tensor, same as x
 
   # Here the operator and x have different batch_shape, and are broadcast.
   # This requires a copy, since the output is different size than the input.
   x = ... Shape [1, 2, 3]
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 2, 3] Tensor, equal to [x, x]
   ```
 
   ### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
@@ -166,21 +166,21 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
 
   If `batch_shape` initialization arg is `None`:
 
-  * `operator.apply(x)` is `O(1)`
+  * `operator.matmul(x)` is `O(1)`
   * `operator.solve(x)` is `O(1)`
   * `operator.determinant()` is `O(1)`
 
   If `batch_shape` initialization arg is provided, and static checks cannot
   rule out the need to broadcast:
 
-  * `operator.apply(x)` is `O(D1*...*Dd*N*R)`
+  * `operator.matmul(x)` is `O(D1*...*Dd*N*R)`
   * `operator.solve(x)` is `O(D1*...*Dd*N*R)`
   * `operator.determinant()` is `O(B1*...*Bb)`
 
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -198,6 +198,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
                is_non_singular=True,
                is_self_adjoint=True,
                is_positive_definite=True,
+               is_square=True,
                assert_proper_shapes=False,
                name="LinearOperatorIdentity"):
     r"""Initialize a `LinearOperatorIdentity`.
@@ -224,6 +225,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
         If `True`, and static checks are inconclusive, add asserts to the graph.
@@ -248,12 +250,15 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         raise ValueError("An identity operator is always non-singular.")
       if not is_positive_definite:
         raise ValueError("An identity operator is always positive-definite.")
+      if not is_square:
+        raise ValueError("An identity operator is always square.")
 
       super(LinearOperatorIdentity, self).__init__(
           dtype=dtype,
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
       self._num_rows = linear_operator_util.shape_tensor(
@@ -329,8 +334,9 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
     zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
     return x + zeros
 
-  def _apply(self, x, adjoint=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     # Note that adjoint has no effect since this matrix is self-adjoint.
+    x = linear_operator_util.matrix_adjoint(x) if adjoint_arg else x
     if self._assert_proper_shapes:
       aps = linear_operator_util.assert_compatible_matrix_dimensions(
           self, x)
@@ -343,8 +349,8 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
   def _log_abs_determinant(self):
     return array_ops.zeros(shape=self.batch_shape_tensor(), dtype=self.dtype)
 
-  def _solve(self, rhs, adjoint=False):
-    return self._apply(rhs)
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    return self._matmul(rhs, adjoint_arg=adjoint_arg)
 
   def _diag_part(self):
     return self._ones_diag()
@@ -458,11 +464,11 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> 2 * Log[3]
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> 3 * x
 
   y = tf.random_normal(shape=[3, 2, 4])
@@ -480,19 +486,19 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         [0., 5.]]]
 
   x = ... Shape [2, 2, 3]
-  operator.apply(x)
+  operator.matmul(x)
   ==> 5 * x
 
   # Here the operator and x have different batch_shape, and are broadcast.
   x = ... Shape [1, 2, 3]
-  operator.apply(x)
+  operator.matmul(x)
   ==> 5 * x
   ```
 
   ### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
@@ -502,14 +508,14 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
 
   ### Performance
 
-  * `operator.apply(x)` is `O(D1*...*Dd*N*R)`
+  * `operator.matmul(x)` is `O(D1*...*Dd*N*R)`
   * `operator.solve(x)` is `O(D1*...*Dd*N*R)`
   * `operator.determinant()` is `O(D1*...*Dd)`
 
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -526,6 +532,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=True,
                assert_proper_shapes=False,
                name="LinearOperatorScaledIdentity"):
     r"""Initialize a `LinearOperatorScaledIdentity`.
@@ -549,6 +556,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
         If `True`, and static checks are inconclusive, add asserts to the graph.
@@ -560,6 +568,9 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
     """
     self._assert_proper_shapes = assert_proper_shapes
 
+    if not is_square:
+      raise ValueError("A ScaledIdentity operator is always square.")
+
     with ops.name_scope(name, values=[multiplier, num_rows]):
       self._multiplier = ops.convert_to_tensor(multiplier, name="multiplier")
 
@@ -568,6 +579,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
       # Shape [B1,...Bb, 1, 1]
@@ -616,7 +628,8 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         imag_multiplier,
         message="LinearOperator was not self-adjoint")
 
-  def _apply(self, x, adjoint=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    x = linear_operator_util.matrix_adjoint(x) if adjoint_arg else x
     if adjoint:
       matrix = self._multiplier_matrix_conj
     else:
@@ -634,7 +647,8 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
     return self._num_rows_cast_to_real_dtype * math_ops.log(
         self._abs_multiplier)
 
-  def _solve(self, rhs, adjoint=False):
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    rhs = linear_operator_util.matrix_adjoint(rhs) if adjoint_arg else rhs
     if adjoint:
       matrix = self._multiplier_matrix_conj
     else:
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
index 0b7fc3da3963a696c806fc4ada4bd132bc849960..b2d7b10157b02ff2814de12459b1e417c22128b5 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_test_util.py
@@ -23,6 +23,7 @@ import numpy as np
 import six
 
 from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
+from tensorflow.contrib.linalg.python.ops import linear_operator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -115,7 +116,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   @abc.abstractmethod
   def _make_x(self, operator, adjoint):
-    """Make an 'x' appropriate for calling operator.apply(x).
+    """Make an 'x' appropriate for calling operator.matmul(x).
 
     Args:
       operator:  A `LinearOperator`
@@ -207,24 +208,32 @@ class LinearOperatorDerivedClassTest(test.TestCase):
                 feed_dict=feed_dict)
             self.assertAC(op_log_abs_det_v, mat_log_abs_det_v)
 
-  def test_apply(self):
-    self._skip_if_tests_to_skip_contains("apply")
+  def test_matmul(self):
+    self._skip_if_tests_to_skip_contains("matmul")
     for use_placeholder in False, True:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
           for adjoint in False, True:
-            with self.test_session(graph=ops.Graph()) as sess:
-              sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-              operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                  shape, dtype, use_placeholder=use_placeholder)
-              x = self._make_x(operator, adjoint=adjoint)
-              op_apply = operator.apply(x, adjoint=adjoint)
-              mat_apply = math_ops.matmul(mat, x, adjoint_a=adjoint)
-              if not use_placeholder:
-                self.assertAllEqual(op_apply.get_shape(), mat_apply.get_shape())
-              op_apply_v, mat_apply_v = sess.run([op_apply, mat_apply],
-                                                 feed_dict=feed_dict)
-              self.assertAC(op_apply_v, mat_apply_v)
+            for adjoint_arg in False, True:
+              with self.test_session(graph=ops.Graph()) as sess:
+                sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+                operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
+                    shape, dtype, use_placeholder=use_placeholder)
+                x = self._make_x(operator, adjoint=adjoint)
+                # If adjoint_arg, compute A X^H^H = A X.
+                if adjoint_arg:
+                  op_matmul = operator.matmul(
+                      linear_operator_util.matrix_adjoint(x),
+                      adjoint=adjoint, adjoint_arg=adjoint_arg)
+                else:
+                  op_matmul = operator.matmul(x, adjoint=adjoint)
+                mat_matmul = math_ops.matmul(mat, x, adjoint_a=adjoint)
+                if not use_placeholder:
+                  self.assertAllEqual(
+                      op_matmul.get_shape(), mat_matmul.get_shape())
+                op_matmul_v, mat_matmul_v = sess.run(
+                    [op_matmul, mat_matmul], feed_dict=feed_dict)
+                self.assertAC(op_matmul_v, mat_matmul_v)
 
   def test_solve(self):
     self._skip_if_tests_to_skip_contains("solve")
@@ -232,18 +241,27 @@ class LinearOperatorDerivedClassTest(test.TestCase):
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
           for adjoint in False, True:
-            with self.test_session(graph=ops.Graph()) as sess:
-              sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
-              operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
-                  shape, dtype, use_placeholder=use_placeholder)
-              rhs = self._make_rhs(operator, adjoint=adjoint)
-              op_solve = operator.solve(rhs, adjoint=adjoint)
-              mat_solve = linalg_ops.matrix_solve(mat, rhs, adjoint=adjoint)
-              if not use_placeholder:
-                self.assertAllEqual(op_solve.get_shape(), mat_solve.get_shape())
-              op_solve_v, mat_solve_v = sess.run([op_solve, mat_solve],
-                                                 feed_dict=feed_dict)
-              self.assertAC(op_solve_v, mat_solve_v)
+            for adjoint_arg in False, True:
+              with self.test_session(graph=ops.Graph()) as sess:
+                sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+                operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
+                    shape, dtype, use_placeholder=use_placeholder)
+                rhs = self._make_rhs(operator, adjoint=adjoint)
+                # If adjoint_arg, solve A X = (rhs^H)^H = rhs.
+                if adjoint_arg:
+                  op_solve = operator.solve(
+                      linear_operator_util.matrix_adjoint(rhs),
+                      adjoint=adjoint, adjoint_arg=adjoint_arg)
+                else:
+                  op_solve = operator.solve(
+                      rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
+                mat_solve = linalg_ops.matrix_solve(mat, rhs, adjoint=adjoint)
+                if not use_placeholder:
+                  self.assertAllEqual(
+                      op_solve.get_shape(), mat_solve.get_shape())
+                op_solve_v, mat_solve_v = sess.run([op_solve, mat_solve],
+                                                   feed_dict=feed_dict)
+                self.assertAC(op_solve_v, mat_solve_v)
 
   def test_add_to_tensor(self):
     self._skip_if_tests_to_skip_contains("add_to_tensor")
@@ -358,7 +376,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
         "_make_rhs not implemented because we don't test solve")
 
   def _make_x(self, operator, adjoint):
-    # Return the number of systems for the argument 'x' for .apply(x)
+    # Return the number of systems for the argument 'x' for .matmul(x)
     r = self._get_num_systems(operator)
     # If operator.shape = [B1,...,Bb, M, N] this returns a random matrix of
     # shape [B1,...,Bb, N, R], R = 1 or 2.
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
index 38461ce8a22067c77c37587d8f1ce2c084d64202..8a152a9b475f4e3fdfd8e3045ab1028eb467997b 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py
@@ -53,11 +53,11 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   operator.shape
   ==> [2, 2]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [2, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [2, 4] Tensor
 
   # Create a [2, 3] batch of 4 x 4 linear operators.
@@ -68,7 +68,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   #### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [N, N],  with b >= 0
@@ -80,7 +80,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   Suppose `operator` is a `LinearOperatorTriL` of shape `[N, N]`,
   and `x.shape = [N, R]`.  Then
 
-  * `operator.apply(x)` involves `N^2 * R` multiplications.
+  * `operator.matmul(x)` involves `N^2 * R` multiplications.
   * `operator.solve(x)` involves `N * R` size `N` back-substitutions.
   * `operator.determinant()` involves a size `N` `reduce_prod`.
 
@@ -90,7 +90,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
   #### Matrix property hints
 
   This `LinearOperator` is initialized with boolean flags of the form `is_X`,
-  for `X = non_singular, self_adjoint, positive_definite`.
+  for `X = non_singular, self_adjoint, positive_definite, square`.
   These have the following meaning
   * If `is_X == True`, callers should expect the operator to have the
     property `X`.  This is a promise that should be fulfilled, but is *not* a
@@ -106,6 +106,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
                is_non_singular=None,
                is_self_adjoint=None,
                is_positive_definite=None,
+               is_square=None,
                name="LinearOperatorTriL"):
     r"""Initialize a `LinearOperatorTriL`.
 
@@ -126,12 +127,19 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
         self-adjoint to be positive-definite.  See:
         https://en.wikipedia.org/wiki/Positive-definite_matrix\
             #Extension_for_non_symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
     Raises:
       TypeError:  If `diag.dtype` is not an allowed type.
+      ValueError:  If `is_square` is `False`.
     """
 
+    if is_square is False:
+      raise ValueError(
+          "Only square lower triangular operators supported at this time.")
+    is_square = True
+
     with ops.name_scope(name, values=[tril]):
       self._tril = ops.convert_to_tensor(tril, name="tril")
       self._check_tril(self._tril)
@@ -144,6 +152,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
           is_non_singular=is_non_singular,
           is_self_adjoint=is_self_adjoint,
           is_positive_definite=is_positive_definite,
+          is_square=is_square,
           name=name)
 
   def _check_tril(self, tril):
@@ -173,8 +182,9 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
         self._diag,
         message="Singular operator:  Diagonal contained zero values.")
 
-  def _apply(self, x, adjoint=False):
-    return math_ops.matmul(self._tril, x, adjoint_a=adjoint)
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    return math_ops.matmul(
+        self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _determinant(self):
     return math_ops.reduce_prod(self._diag, reduction_indices=[-1])
@@ -183,7 +193,8 @@ class LinearOperatorTriL(linear_operator.LinearOperator):
     return math_ops.reduce_sum(
         math_ops.log(math_ops.abs(self._diag)), reduction_indices=[-1])
 
-  def _solve(self, rhs, adjoint=False):
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    rhs = linear_operator_util.matrix_adjoint(rhs) if adjoint_arg else rhs
     return linalg_ops.matrix_triangular_solve(
         self._tril, rhs, lower=True, adjoint=adjoint)
 
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py b/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
index 89b5c1ab1b9fd3f1792f15c6022aa2427736801d..546d899e74e53d529dd58fc75a4e06f2fb920d1b 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_udvh_update.py
@@ -74,18 +74,18 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
   operator.shape
   ==> [3, 3]
 
-  operator.log_determinant()
+  operator.log_abs_determinant()
   ==> scalar Tensor
 
   x = ... Shape [3, 4] Tensor
-  operator.apply(x)
+  operator.matmul(x)
   ==> Shape [3, 4] Tensor
   ```
 
   ### Shape compatibility
 
   This operator acts on [batch] matrix with compatible shape.
-  `x` is a batch matrix with compatible shape for `apply` and `solve` if
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
 
   ```
   operator.shape = [B1,...,Bb] + [M, N],  with b >= 0
@@ -95,15 +95,15 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
   ### Performance
 
   Suppose `operator` is a `LinearOperatorUDVHUpdate` of shape `[M, N]`,
-  made from a rank `K` update of `base_operator` which performs `.apply(x)` on
-  `x` having `x.shape = [N, R]` with `O(L_apply*N*R)` complexity (and similarly
+  made from a rank `K` update of `base_operator` which performs `.matmul(x)` on
+  `x` having `x.shape = [N, R]` with `O(L_matmul*N*R)` complexity (and similarly
   for `solve`, `determinant`.  Then, if `x.shape = [N, R]`,
 
-  * `operator.apply(x)` is `O(L_apply*N*R + K*N*R)`
+  * `operator.matmul(x)` is `O(L_matmul*N*R + K*N*R)`
 
   and if `M = N`,
 
-  * `operator.solve(x)` is `O(L_apply*N*R + N*K*R + K^2*R + K^3)`
+  * `operator.solve(x)` is `O(L_matmul*N*R + N*K*R + K^2*R + K^3)`
   * `operator.determinant()` is `O(L_determinant + L_solve*N*K + K^2*N + K^3)`
 
   If instead `operator` and `x` have shape `[B1,...,Bb, M, N]` and
@@ -348,22 +348,22 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
     return array_ops.concat(
         [batch_shape, self.base_operator.shape_tensor()[-2:]], axis=0)
 
-  def _apply(self, x, adjoint=False):
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
     u = self.u
     v = self.v
     l = self.base_operator
     d = self.diag_operator
 
-    leading_term = l.apply(x, adjoint=adjoint)
+    leading_term = l.matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
     if adjoint:
-      uh_x = math_ops.matmul(u, x, adjoint_a=True)
-      d_uh_x = d.apply(uh_x, adjoint=adjoint)
+      uh_x = math_ops.matmul(u, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      d_uh_x = d.matmul(uh_x, adjoint=adjoint)
       v_d_uh_x = math_ops.matmul(v, d_uh_x)
       return leading_term + v_d_uh_x
     else:
-      vh_x = math_ops.matmul(v, x, adjoint_a=True)
-      d_vh_x = d.apply(vh_x, adjoint=adjoint)
+      vh_x = math_ops.matmul(v, x, adjoint_a=True, adjoint_b=adjoint_arg)
+      d_vh_x = d.matmul(vh_x, adjoint=adjoint)
       u_d_vh_x = math_ops.matmul(u, d_vh_x)
       return leading_term + u_d_vh_x
 
@@ -398,7 +398,7 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
 
     return log_abs_det_c + log_abs_det_d + log_abs_det_l
 
-  def _solve(self, rhs, adjoint=False):
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     if self.base_operator.is_non_singular is False:
       raise ValueError(
           "Solve not implemented unless this is a perturbation of a "
@@ -421,7 +421,7 @@ class LinearOperatorUDVHUpdate(linear_operator.LinearOperator):
       u = self.u
 
     # L^{-1} rhs
-    linv_rhs = l.solve(rhs, adjoint=adjoint)
+    linv_rhs = l.solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
     # V^H L^{-1} rhs
     vh_linv_rhs = math_ops.matmul(v, linv_rhs, adjoint_a=True)
     # C^{-1} V^H L^{-1} rhs
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_util.py b/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
index a52a235677fd16cac32dea924f6397f960667434..2659bd32e9a96b2117b7af1350e8773e1321d855 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_util.py
@@ -69,10 +69,10 @@ def assert_zero_imag_part(x, message=None, name="assert_zero_imag_part"):
 
 
 def assert_compatible_matrix_dimensions(operator, x):
-  """Assert that an argument to solve/apply has proper domain dimension.
+  """Assert that an argument to solve/matmul has proper domain dimension.
 
   If `operator.shape[-2:] = [M, N]`, and `x.shape[-2:] = [Q, R]`, then
-  `operator.apply(x)` is defined only if `N = Q`.  This `Op` returns an
+  `operator.matmul(x)` is defined only if `N = Q`.  This `Op` returns an
   `Assert` that "fires" if this is not the case.  Static checks are already
   done by the base class `LinearOperator`.
 
@@ -289,6 +289,53 @@ def matmul_with_broadcast(a,
         b_is_sparse=b_is_sparse)
 
 
+def matrix_adjoint(a, name="matrix_adjoint"):
+  """Transposes last two dimensions of tensor `a`, and takes complex conjugate.
+
+  If `a` is real valued, the result is equivalent to `matrix_transpose`.
+
+  For example:
+
+  ```python
+  # Matrix with no batch dimension.
+  # 'x' is [[1 2 3j]
+  #         [4 5 -6j]]
+  tf.matrix_adjoint(x) ==> [[1 4]
+                            [2 5]
+                            [-3j 6j]]
+
+  # Matrix with two batch dimensions.
+  # x.shape is [1, 2, 3, 4]
+  # tf.matrix_adjoint(x) is shape [1, 2, 4, 3]
+  ```
+
+  Note that `tf.matmul` provides kwargs allowing for adjoint of arguments.  This
+  is done with minimal cost, and is preferable to using this function. E.g.
+
+  ```
+  # Good!  Adjoint is taken at minimal additional cost.
+  tf.matmul(matrix, b, adjoint_b=True)
+
+  # Inefficient!
+  tf.matmul(matrix, tf.matrix_adjoint(b))
+  ```
+
+  Args:
+    a: A `Tensor` with `rank >= 2`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A batch matrix `Tensor` with same `dtype` as `a`.
+
+  Raises:
+    ValueError:  If `a` is determined statically to have `rank < 2`.
+  """
+  with ops.name_scope(name, values=[a]):
+    a = ops.convert_to_tensor(a, name="a")
+    a_transpose = array_ops.matrix_transpose(a)
+    return math_ops.conj(a_transpose)
+
+
 def shape_tensor(shape, name=None):
   """Convert Tensor using default type, unless empty list or tuple."""
   # Works just like random_ops._ShapeTensor.
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index 21f02b0a9674c17fd2c143e7697a65fc9c41d2b5..1fde6e5c6cb0e2d6097c63dcd707c35a491acaaa 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -111,13 +111,11 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":sdca_ops_py",
-        ":sparse_feature_column_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
@@ -129,6 +127,7 @@ py_test(
     name = "sdca_estimator_test",
     srcs = ["python/sdca_estimator_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":sdca_estimator_py",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index eb1bdeff597543bf22a2eab439ec90619db48856..70f777f08bd5b8157e601f19019075d3e7543811 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -1058,7 +1058,7 @@ class SdcaFprintTest(SdcaModelTest):
   def testFprint(self):
     with self._single_threaded_test_session():
       in_data = constant_op.constant(['abc', 'very looooooong string', 'def'])
-      out_data = gen_sdca_ops._sdca_fprint(in_data)
+      out_data = gen_sdca_ops.sdca_fprint(in_data)
       self.assertAllEqual([[4143508125394299908, -6879828354153669051],
                            [5849691694103072671, -4874542629849009556],
                            [603227410218889250, 8762207001949257490]],
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index d8f140f5ed167868b18a34ec2f591f44977eb1b3..13f2f0f5021ea4dd339b671e20cb718f4db509f9 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -307,7 +307,7 @@ class SdcaModel(object):
           sparse_features_values.append(sf.feature_values)
 
       # pylint: disable=protected-access
-      example_ids_hashed = gen_sdca_ops._sdca_fprint(
+      example_ids_hashed = gen_sdca_ops.sdca_fprint(
           internal_convert_to_tensor(self._examples['example_ids']))
       # pylint: enable=protected-access
       example_state_data = self._hashtable.lookup(example_ids_hashed)
@@ -328,7 +328,7 @@ class SdcaModel(object):
           sparse_weights.append(array_ops.gather(w, sparse_indices[-1]))
 
       # pylint: disable=protected-access
-      esu, sfw, dfw = gen_sdca_ops._sdca_optimizer(
+      esu, sfw, dfw = gen_sdca_ops.sdca_optimizer(
           sparse_example_indices,
           sparse_feature_indices,
           sparse_features_values,
@@ -390,7 +390,7 @@ class SdcaModel(object):
           with ops.device(var.device):
             # pylint: disable=protected-access
             update_ops.append(
-                gen_sdca_ops._sdca_shrink_l1(
+                gen_sdca_ops.sdca_shrink_l1(
                     self._convert_n_to_tensor(
                         [var], as_ref=True),
                     l1=self._symmetric_l1_regularization(),
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index 733b03eed36251a4079f48619fc85274981118b8..f4961ab9dbf98905df65c3b5be057fde1edca768 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -24,13 +24,10 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
-from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
-from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import session_run_hook
 
@@ -76,131 +73,6 @@ def _add_bias_column(feature_columns, columns_to_tensors, bias_variable,
   columns_to_variables[bias_column] = [bias_variable]
 
 
-def _get_sdca_train_step(optimizer, columns_to_variables, weight_column_name,
-                         loss_type, features, targets, global_step):
-  """Returns the training operation of an SdcaModel optimizer."""
-
-  def _dense_tensor_to_sparse_feature_column(dense_tensor):
-    """Returns SparseFeatureColumn for the input dense_tensor."""
-    ignore_value = 0.0
-    sparse_indices = array_ops.where(
-        math_ops.not_equal(dense_tensor,
-                           math_ops.cast(ignore_value, dense_tensor.dtype)))
-    sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
-    # TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
-    # very sparse features with weights and not weights.
-    return SparseFeatureColumn(
-        array_ops.reshape(
-            array_ops.split(value=sparse_indices, num_or_size_splits=2,
-                            axis=1)[0], [-1]),
-        array_ops.reshape(
-            array_ops.split(value=sparse_indices, num_or_size_splits=2,
-                            axis=1)[1], [-1]),
-        array_ops.reshape(math_ops.to_float(sparse_values), [-1]))
-
-  def _training_examples_and_variables():
-    """Returns dictionaries for training examples and variables."""
-    batch_size = targets.get_shape()[0]
-
-    # Iterate over all feature columns and create appropriate lists for dense
-    # and sparse features as well as dense and sparse weights (variables) for
-    # SDCA.
-    # TODO(sibyl-vie3Poto): Reshape variables stored as values in column_to_variables
-    # dict as 1-dimensional tensors.
-    dense_features, sparse_features, sparse_feature_with_values = [], [], []
-    dense_feature_weights = []
-    sparse_feature_weights, sparse_feature_with_values_weights = [], []
-    for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
-      transformed_tensor = features[column]
-      if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
-        # A real-valued column corresponds to a dense feature in SDCA. A
-        # transformed tensor corresponding to a RealValuedColumn has rank 2
-        # (its shape is typically [batch_size, column.dimension]) and so it
-        # can be passed to SDCA as is.
-        dense_features.append(transformed_tensor)
-        # For real valued columns, the variables list contains exactly one
-        # element.
-        dense_feature_weights.append(columns_to_variables[column][0])
-      elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
-        # A bucketized column corresponds to a sparse feature in SDCA. The
-        # bucketized feature is "sparsified" for SDCA by converting it to a
-        # SparseFeatureColumn respresenting the one-hot encoding of the
-        # bucketized feature.
-        #
-        # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
-        # bucketized feature column to a dense feature in SDCA. This will likely
-        # depend on the number of buckets.
-        dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor)  # pylint: disable=protected-access
-        sparse_feature_column = _dense_tensor_to_sparse_feature_column(
-            dense_bucket_tensor)
-        sparse_feature_with_values.append(sparse_feature_column)
-        # For bucketized columns, the variables list contains exactly one
-        # element.
-        sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
-      elif isinstance(
-          column,
-          (
-              layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
-              layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
-        sparse_features.append(
-            SparseFeatureColumn(
-                array_ops.reshape(
-                    array_ops.split(
-                        value=transformed_tensor.indices,
-                        num_or_size_splits=2,
-                        axis=1)[0], [-1]),
-                array_ops.reshape(transformed_tensor.values, [-1]), None))
-        sparse_feature_weights.append(columns_to_variables[column][0])
-      elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
-        id_tensor = column.id_tensor(transformed_tensor)
-        weight_tensor = column.weight_tensor(transformed_tensor)
-        sparse_feature_with_values.append(
-            SparseFeatureColumn(
-                array_ops.reshape(
-                    array_ops.split(
-                        value=id_tensor.indices, num_or_size_splits=2, axis=1)[
-                            0], [-1]),
-                array_ops.reshape(id_tensor.values, [-1]),
-                array_ops.reshape(weight_tensor.values, [-1])))
-        sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
-      else:
-        raise ValueError("SDCAOptimizer does not support column type {}".format(
-            type(column).__name__))
-
-    example_weights = array_ops.reshape(
-        features[weight_column_name],
-        shape=[-1]) if weight_column_name else array_ops.ones([batch_size])
-    example_ids = features[optimizer.example_id_column]
-    sparse_feature_with_values.extend(sparse_features)
-    sparse_feature_with_values_weights.extend(sparse_feature_weights)
-    examples = dict(
-        sparse_features=sparse_feature_with_values,
-        dense_features=dense_features,
-        example_labels=math_ops.to_float(
-            array_ops.reshape(targets, shape=[-1])),
-        example_weights=example_weights,
-        example_ids=example_ids)
-    sdca_variables = dict(
-        sparse_features_weights=sparse_feature_with_values_weights,
-        dense_features_weights=dense_feature_weights)
-    return examples, sdca_variables
-
-  training_examples, training_variables = _training_examples_and_variables()
-  sdca_model = sdca_ops.SdcaModel(
-      examples=training_examples,
-      variables=training_variables,
-      options=dict(
-          symmetric_l1_regularization=optimizer.symmetric_l1_regularization,
-          symmetric_l2_regularization=optimizer.symmetric_l2_regularization,
-          num_loss_partitions=optimizer.num_loss_partitions,
-          num_table_shards=optimizer.num_table_shards,
-          loss_type=loss_type))
-  train_op = sdca_model.minimize(global_step=global_step)
-  return sdca_model, train_op
-
-
 def sdca_model_fn(features, labels, mode, params, config=None):
   """A model_fn for linear models that use the SDCA optimizer.
 
@@ -283,9 +155,9 @@ def sdca_model_fn(features, labels, mode, params, config=None):
 
   def _train_op_fn(unused_loss):
     global_step = contrib_variables.get_global_step()
-    sdca_model, train_op = _get_sdca_train_step(optimizer, columns_to_variables,
-                                                weight_column_name, loss_type,
-                                                features, labels, global_step)
+    sdca_model, train_op = optimizer.get_train_step(
+        columns_to_variables, weight_column_name, loss_type, features, labels,
+        global_step)
     if update_weights_hook is not None:
       update_weights_hook.set_parameters(sdca_model, train_op)
     return train_op
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 81434621bd6fbc0e80ddc9e0006122ddc10e48df..32b7f956e476ca79cc77338cde496cd0c517c401 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -35,7 +35,7 @@ class SDCALogisticClassifierTest(test.TestCase):
     def input_fn():
       return {
           'example_id': constant_op.constant(['1', '2']),
-          'maintenance_cost': constant_op.constant([[500.0], [200.0]]),
+          'maintenance_cost': constant_op.constant([500.0, 200.0]),
           'sq_footage': constant_op.constant([[800.0], [600.0]]),
           'weights': constant_op.constant([[1.0], [1.0]])
       }, constant_op.constant([[0], [1]])
@@ -77,7 +77,7 @@ class SDCALogisticClassifierTest(test.TestCase):
     def input_fn():
       return {
           'example_id': constant_op.constant(['1', '2', '3']),
-          'price': constant_op.constant([[600.0], [1000.0], [400.0]]),
+          'price': constant_op.constant([600.0, 1000.0, 400.0]),
           'sq_footage': constant_op.constant([[1000.0], [600.0], [700.0]]),
           'weights': constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
@@ -196,7 +196,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'price':
               constant_op.constant([[0.6], [0.8], [0.3]]),
           'sq_footage':
-              constant_op.constant([[900.0], [700.0], [600.0]]),
+              constant_op.constant([900.0, 700.0, 600.0]),
           'country':
               sparse_tensor.SparseTensor(
                   values=['IT', 'US', 'GB'],
@@ -296,7 +296,7 @@ class SDCALinearRegressorTest(test.TestCase):
           'example_id':
               constant_op.constant(['1', '2', '3']),
           'price':
-              constant_op.constant([[0.4], [0.6], [0.3]]),
+              constant_op.constant([0.4, 0.6, 0.3]),
           'country':
               sparse_tensor.SparseTensor(
                   values=['IT', 'US', 'GB'],
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index f9d69d6dea9dd3ac3eca3b64be15c8dd131b862d..92d022f2a30ffeb77e81d3bd01365afcd14826b5 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -19,7 +19,9 @@ from __future__ import print_function
 from tensorflow.contrib import layers
 from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
 from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -99,16 +101,16 @@ class SDCAOptimizer(object):
   def symmetric_l2_regularization(self):
     return self._symmetric_l2_regularization
 
-  def get_train_step(self, columns_to_variables,
-                     weight_column_name, loss_type, features, targets,
-                     global_step):
+  def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
+                     features, targets, global_step):
     """Returns the training operation of an SdcaModel optimizer."""
 
-    def _tensor_to_sparse_feature_column(dense_tensor):
+    def _dense_tensor_to_sparse_feature_column(dense_tensor):
       """Returns SparseFeatureColumn for the input dense_tensor."""
       ignore_value = 0.0
-      sparse_indices = array_ops.where(math_ops.not_equal(
-          dense_tensor, math_ops.cast(ignore_value, dense_tensor.dtype)))
+      sparse_indices = array_ops.where(
+          math_ops.not_equal(dense_tensor,
+                             math_ops.cast(ignore_value, dense_tensor.dtype)))
       sparse_values = array_ops.gather_nd(dense_tensor, sparse_indices)
       # TODO(sibyl-Aix6ihai, sibyl-vie3Poto): Makes this efficient, as now SDCA supports
       # very sparse features with weights and not weights.
@@ -133,34 +135,48 @@ class SDCAOptimizer(object):
       dense_features, sparse_features, sparse_feature_with_values = [], [], []
       dense_feature_weights = []
       sparse_feature_weights, sparse_feature_with_values_weights = [], []
-      # pylint: disable=protected-access
       for column in sorted(columns_to_variables.keys(), key=lambda x: x.key):
         transformed_tensor = features[column]
-        if isinstance(column, layers.feature_column._RealValuedColumn):
+        if isinstance(column, layers.feature_column._RealValuedColumn):  # pylint: disable=protected-access
           # A real-valued column corresponds to a dense feature in SDCA. A
-          # transformed tensor corresponding to a RealValuedColumn has rank 2
-          # (its shape is typically [batch_size, column.dimension]) and so it
-          # can be passed to SDCA as is.
+          # transformed tensor corresponding to a RealValuedColumn should have
+          # rank at most 2. In order to be passed to SDCA, its rank needs to be
+          # exactly 2 (i.e., its shape should be [batch_size, column.dim]).
+          check_rank_op = control_flow_ops.Assert(
+              math_ops.less_equal(array_ops.rank(transformed_tensor), 2),
+              ['transformed_tensor shouls have rank at most 2.'])
+          # Reshape to [batch_size, dense_column_dimension].
+          with ops.control_dependencies([check_rank_op]):
+            transformed_tensor = array_ops.reshape(transformed_tensor, [
+                array_ops.shape(transformed_tensor)[0], -1
+            ])
+
           dense_features.append(transformed_tensor)
           # For real valued columns, the variables list contains exactly one
           # element.
           dense_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._BucketizedColumn):
+        elif isinstance(column, layers.feature_column._BucketizedColumn):  # pylint: disable=protected-access
           # A bucketized column corresponds to a sparse feature in SDCA. The
           # bucketized feature is "sparsified" for SDCA by converting it to a
           # SparseFeatureColumn respresenting the one-hot encoding of the
           # bucketized feature.
-          dense_bucket_tensor = layers.input_from_feature_columns(
-              {column: transformed_tensor}, [column])
-          sparse_feature_column = _tensor_to_sparse_feature_column(
+          #
+          # TODO(sibyl-vie3Poto): Explore whether it is more efficient to translate a
+          # bucketized feature column to a dense feature in SDCA. This will
+          # likely depend on the number of buckets.
+          dense_bucket_tensor = column._to_dnn_input_layer(transformed_tensor)  # pylint: disable=protected-access
+          sparse_feature_column = _dense_tensor_to_sparse_feature_column(
               dense_bucket_tensor)
           sparse_feature_with_values.append(sparse_feature_column)
           # For bucketized columns, the variables list contains exactly one
           # element.
           sparse_feature_with_values_weights.append(
               columns_to_variables[column][0])
-        elif isinstance(column, (layers.feature_column._CrossedColumn,
-                                 layers.feature_column._SparseColumn)):
+        elif isinstance(
+            column,
+            (
+                layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
+                layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
           sparse_features.append(
               SparseFeatureColumn(
                   array_ops.reshape(
@@ -168,10 +184,9 @@ class SDCAOptimizer(object):
                           value=transformed_tensor.indices,
                           num_or_size_splits=2,
                           axis=1)[0], [-1]),
-                  array_ops.reshape(transformed_tensor.values, [-1]),
-                  None))
+                  array_ops.reshape(transformed_tensor.values, [-1]), None))
           sparse_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._WeightedSparseColumn):
+        elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
           id_tensor = column.id_tensor(transformed_tensor)
           weight_tensor = column.weight_tensor(transformed_tensor)
           sparse_feature_with_values.append(
@@ -183,11 +198,10 @@ class SDCAOptimizer(object):
                   array_ops.reshape(id_tensor.values, [-1]),
                   array_ops.reshape(weight_tensor.values, [-1])))
           sparse_feature_with_values_weights.append(
-            columns_to_variables[column][0])
+              columns_to_variables[column][0])
         else:
           raise ValueError('SDCAOptimizer does not support column type %s.' %
                            type(column).__name__)
-      # pylint: enable=protected-access
 
       example_weights = array_ops.reshape(
           features[weight_column_name],
@@ -195,12 +209,13 @@ class SDCAOptimizer(object):
       example_ids = features[self._example_id_column]
       sparse_feature_with_values.extend(sparse_features)
       sparse_feature_with_values_weights.extend(sparse_feature_weights)
-      examples = dict(sparse_features=sparse_feature_with_values,
-                      dense_features=dense_features,
-                      example_labels=math_ops.to_float(array_ops.reshape(
-                          targets, shape=[-1])),
-                      example_weights=example_weights,
-                      example_ids=example_ids)
+      examples = dict(
+          sparse_features=sparse_feature_with_values,
+          dense_features=dense_features,
+          example_labels=math_ops.to_float(
+              array_ops.reshape(targets, shape=[-1])),
+          example_weights=example_weights,
+          example_ids=example_ids)
       sdca_variables = dict(
           sparse_features_weights=sparse_feature_with_values_weights,
           dense_features_weights=dense_feature_weights)
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index b3316ee8c4fe167385dcc33135cc877c81a3509d..b0475c41c954713f0711fd497710478bacfdece4 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -9,6 +9,7 @@ package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+# TODO(yleon): Refactor after one we switching to the V2 kernels.
 py_library(
     name = "lookup_py",
     srcs = [
@@ -19,9 +20,9 @@ py_library(
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:data_flow_ops_gen",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:lookup_ops_gen",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:training",
@@ -39,11 +40,11 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:lookup_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index eec197782da73863cf614dc7de6c257ceefb6616..65474f03fa01acd258c8dcc5c4e1bc3b765af440 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Lookup table Operations."""
-# pylint: disable=g-bad-name
+"""Lookup table operations."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -27,7 +27,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_data_flow_ops
+from tensorflow.python.ops import gen_lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.training.saver import BaseSaverBuilder
@@ -151,7 +151,7 @@ class InitializableLookupTableBase(LookupInterface):
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as scope:
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_size(self._table_ref, name=scope)
+      return gen_lookup_ops._lookup_table_size(self._table_ref, name=scope)
       # pylint: enable=protected-access
 
   def lookup(self, keys, name=None):
@@ -182,7 +182,7 @@ class InitializableLookupTableBase(LookupInterface):
         name, "%s_Lookup" % self._name,
         (self._table_ref, key_tensor, self._default_value)) as scope:
       # pylint: disable=protected-access
-      values = gen_data_flow_ops._lookup_table_find(
+      values = gen_lookup_ops._lookup_table_find(
           self._table_ref, key_tensor, self._default_value, name=scope)
       # pylint: enable=protected-access
 
@@ -229,7 +229,7 @@ class HashTable(InitializableLookupTableBase):
     with ops.name_scope(
         name, "hash_table", (initializer, default_value)) as scope:
       # pylint: disable=protected-access
-      table_ref = gen_data_flow_ops._hash_table(
+      table_ref = gen_lookup_ops._hash_table(
           shared_name=shared_name,
           key_dtype=initializer.key_dtype,
           value_dtype=initializer.value_dtype,
@@ -308,10 +308,8 @@ class KeyValueTensorInitializer(TableInitializerBase):
         self._name,
         values=(table.table_ref, self._keys, self._values)) as scope:
       # pylint: disable=protected-access
-      init_op = gen_data_flow_ops._initialize_table(table.table_ref,
-                                                    self._keys,
-                                                    self._values,
-                                                    name=scope)
+      init_op = gen_lookup_ops._initialize_table(
+          table.table_ref, self._keys, self._values, name=scope)
       # pylint: enable=protected-access
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     return init_op
@@ -477,7 +475,7 @@ class TextFileInitializer(TableInitializerBase):
                                        dtypes.string,
                                        name="asset_filepath")
       # pylint: disable=protected-access
-      init_op = gen_data_flow_ops._initialize_table_from_text_file(
+      init_op = gen_lookup_ops._initialize_table_from_text_file(
           table.table_ref,
           filename,
           self._key_index,
@@ -608,7 +606,7 @@ class HasherSpec(collections.namedtuple("HasherSpec", ["hasher", "key"])):
   __slots__ = ()
 
 
-FastHashSpec = HasherSpec("fasthash", None)
+FastHashSpec = HasherSpec("fasthash", None)  # pylint: disable=invalid-name
 
 
 class StrongHashSpec(HasherSpec):
@@ -645,19 +643,21 @@ class IdTableWithHashBuckets(LookupInterface):
 
   For example, if an instance of `IdTableWithHashBuckets` is initialized with a
   string-to-id table that maps:
+
   - emerson -> 0
   - lake -> 1
   - palmer -> 2
 
   The `IdTableWithHashBuckets` object will performs the following mapping:
+
   - emerson -> 0
   - lake -> 1
   - palmer -> 2
   - <other term> -> bucket id between 3 and 3 + num_oov_buckets, calculated by:
     hash(<term>) % num_oov_buckets + vocab_size
 
-  If input_tensor is ["emerson", "lake", "palmer", "king", "crimson"],
-  the lookup result is [0, 1, 2, 4, 7]
+  If input_tensor is `["emerson", "lake", "palmer", "king", "crimson"]`,
+  the lookup result is `[0, 1, 2, 4, 7]`
 
   If `table` is None, only out-of-vocabulary buckets are used.
 
@@ -882,7 +882,7 @@ def index_table_from_file(vocabulary_file=None,
     name: A name for this op (optional).
 
   Returns:
-    The lookup table to map a string `Tensor` to index `int64` `Tensor`.
+    The lookup table to map a `key_dtype` `Tensor` to index `int64` `Tensor`.
 
   Raises:
     ValueError: If `vocabulary_file` is not set.
@@ -974,7 +974,7 @@ def index_table_from_tensor(mapping,
   Sample Usages:
 
   ```python
-  mapping_strings = t.constant(["emerson", "lake", "palmer")
+  mapping_strings = t.constant(["emerson", "lake", "palmer"])
   table = tf.contrib.lookup.index_table_from_tensor(
       mapping=mapping_strings, num_oov_buckets=1, default_value=-1)
   features = tf.constant(["emerson", "lake", "and", "palmer"])
@@ -1066,7 +1066,7 @@ def string_to_index(tensor, mapping, default_value=-1, name=None):
   For example:
 
   ```python
-  mapping_strings = tf.constant(["emerson", "lake", "palmer")
+  mapping_strings = tf.constant(["emerson", "lake", "palmer"])
   feats = tf.constant(["emerson", "lake", "and", "palmer"])
   ids = tf.contrib.lookup.string_to_index(
       feats, mapping=mapping_strings, default_value=-1)
@@ -1333,14 +1333,14 @@ class MutableHashTable(LookupInterface):
     use_node_name_sharing = checkpoint and shared_name is None
     # pylint: disable=protected-access
     if self._default_value.get_shape().ndims == 0:
-      self._table_ref = gen_data_flow_ops._mutable_hash_table(
+      self._table_ref = gen_lookup_ops._mutable_hash_table(
           shared_name=shared_name,
           use_node_name_sharing=use_node_name_sharing,
           key_dtype=key_dtype,
           value_dtype=value_dtype,
           name=name)
     else:
-      self._table_ref = gen_data_flow_ops._mutable_hash_table_of_tensors(
+      self._table_ref = gen_lookup_ops._mutable_hash_table_of_tensors(
           shared_name=shared_name,
           use_node_name_sharing=use_node_name_sharing,
           key_dtype=key_dtype,
@@ -1368,7 +1368,7 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as name:
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_size(self._table_ref, name=name)
+      return gen_lookup_ops._lookup_table_size(self._table_ref, name=name)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -1394,10 +1394,8 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_find" % self._name,
                         (self._table_ref, keys, self._default_value)) as name:
       # pylint: disable=protected-access
-      values = gen_data_flow_ops._lookup_table_find(self._table_ref,
-                                                    keys,
-                                                    self._default_value,
-                                                    name=name)
+      values = gen_lookup_ops._lookup_table_find(
+          self._table_ref, keys, self._default_value, name=name)
 
     values.set_shape(keys.get_shape().concatenate(self._value_shape))
     return values
@@ -1423,7 +1421,7 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
                         [self._table_ref, keys, values]) as name:
       # pylint: disable=protected-access
-      op = gen_data_flow_ops._lookup_table_insert(
+      op = gen_lookup_ops._lookup_table_insert(
           self._table_ref, keys, values, name=name)
       return op
 
@@ -1440,11 +1438,8 @@ class MutableHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_export_values" % self._name,
                         [self._table_ref]) as name:
       # pylint: disable=protected-access
-      exported_keys, exported_values = gen_data_flow_ops._lookup_table_export(
-          self._table_ref,
-          self._key_dtype,
-          self._value_dtype,
-          name=name)
+      exported_keys, exported_values = gen_lookup_ops._lookup_table_export(
+          self._table_ref, self._key_dtype, self._value_dtype, name=name)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
         self._value_shape))
@@ -1464,7 +1459,7 @@ class MutableHashTable(LookupInterface):
 
     def restore(self, restored_tensors, unused_restored_shapes):
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_import(
+      return gen_lookup_ops._lookup_table_import(
           self.op._table_ref, restored_tensors[0], restored_tensors[1])
 
 
@@ -1539,7 +1534,7 @@ class MutableDenseHashTable(LookupInterface):
     use_node_name_sharing = checkpoint and shared_name is None
     empty_key = ops.convert_to_tensor(empty_key, dtype=key_dtype)
     # pylint: disable=protected-access
-    self._table_ref = gen_data_flow_ops._mutable_dense_hash_table(
+    self._table_ref = gen_lookup_ops._mutable_dense_hash_table(
         empty_key=empty_key,
         shared_name=shared_name,
         use_node_name_sharing=use_node_name_sharing,
@@ -1567,7 +1562,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_Size" % self._name,
                         [self._table_ref]) as name:
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_size(self._table_ref, name=name)
+      return gen_lookup_ops._lookup_table_size(self._table_ref, name=name)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -1593,7 +1588,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_find" % self._name,
                         [self._table_ref, keys]) as name:
       # pylint: disable=protected-access
-      values = gen_data_flow_ops._lookup_table_find(
+      values = gen_lookup_ops._lookup_table_find(
           self._table_ref, keys, self._default_value, name=name)
 
     if keys.get_shape().ndims is not None and keys.get_shape().ndims > 0:
@@ -1623,7 +1618,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
                         [self._table_ref, keys, values]) as name:
       # pylint: disable=protected-access
-      op = gen_data_flow_ops._lookup_table_insert(
+      op = gen_lookup_ops._lookup_table_insert(
           self._table_ref, keys, values, name=name)
       return op
 
@@ -1640,7 +1635,7 @@ class MutableDenseHashTable(LookupInterface):
     with ops.name_scope(name, "%s_lookup_table_export_values" % self._name,
                         [self._table_ref]) as name:
       # pylint: disable=protected-access
-      exported_keys, exported_values = gen_data_flow_ops._lookup_table_export(
+      exported_keys, exported_values = gen_lookup_ops._lookup_table_export(
           self._table_ref, self._key_dtype, self._value_dtype, name=name)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
@@ -1661,6 +1656,5 @@ class MutableDenseHashTable(LookupInterface):
 
     def restore(self, restored_tensors, unused_restored_shapes):
       # pylint: disable=protected-access
-      return gen_data_flow_ops._lookup_table_import(self.op._table_ref,
-                                                    restored_tensors[0],
-                                                    restored_tensors[1])
+      return gen_lookup_ops._lookup_table_import(
+          self.op._table_ref, restored_tensors[0], restored_tensors[1])
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 0ec40a63f26e7139bda8cc73dcec034ff47a0532..5ec169b6db4f60439a3b9f233e30a862669fa7de 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
@@ -125,7 +125,7 @@ class HashTableOpTest(test.TestCase):
       table3 = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(3, table1.size().eval())
       self.assertAllEqual(3, table2.size().eval())
       self.assertAllEqual(3, table3.size().eval())
@@ -1184,7 +1184,7 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_int32_index_table_from_file(self):
@@ -1198,7 +1198,7 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_int64_index_table_from_file(self):
@@ -1212,7 +1212,7 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_index_table_from_file_with_default_value(self):
@@ -1224,7 +1224,7 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, default_value), ids.eval())
 
   def test_index_table_from_file_with_oov_buckets(self):
@@ -1236,7 +1236,7 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual(
           (
               1,  # From vocabulary file.
@@ -1259,7 +1259,7 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, -1, -1), ids.eval())
       self.assertEqual(2, table.size().eval())
 
@@ -1286,7 +1286,7 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, -1), ids.eval())
       self.assertEqual(3, table.size().eval())
 
@@ -1345,7 +1345,7 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_int32_index_table_from_tensor_with_tensor_init(self):
@@ -1356,7 +1356,7 @@ class IndexTableFromTensor(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_int64_index_table_from_tensor_with_tensor_init(self):
@@ -1367,7 +1367,7 @@ class IndexTableFromTensor(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
   def test_index_table_from_tensor_with_default_value(self):
@@ -1378,7 +1378,7 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       self.assertRaises(errors_impl.OpError, ids.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, default_value), ids.eval())
 
   def test_index_table_from_tensor_missing_mapping(self):
@@ -1394,7 +1394,7 @@ class IndexTableFromTensor(test.TestCase):
       self.assertRaises(errors_impl.OpError, ids.eval)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "keys and values cannot be empty"):
-        data_flow_ops.tables_initializer().run()
+        lookup_ops.tables_initializer().run()
 
   def test_index_table_from_tensor_with_invalid_hashers(self):
     with self.test_session():
@@ -1422,7 +1422,7 @@ class StringToIndexTest(test.TestCase):
       indices = lookup.string_to_index(feats, mapping=mapping_strings)
 
       self.assertRaises(errors_impl.OpError, indices.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((1, 2, -1), indices.eval())
 
@@ -1433,7 +1433,7 @@ class StringToIndexTest(test.TestCase):
       _ = lookup.string_to_index(feats, mapping=mapping_strings)
 
       self.assertRaises(errors_impl.OpError,
-                        data_flow_ops.tables_initializer().run)
+                        lookup_ops.tables_initializer().run)
 
   def test_string_to_index_with_default_value(self):
     default_value = -42
@@ -1444,7 +1444,7 @@ class StringToIndexTest(test.TestCase):
           feats, mapping=mapping_strings, default_value=default_value)
       self.assertRaises(errors_impl.OpError, indices.eval)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, default_value), indices.eval())
 
 
@@ -1463,7 +1463,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file)
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           features.eval())
 
@@ -1475,7 +1475,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file, default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
                           features.eval())
 
@@ -1489,7 +1489,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
           default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", default_value, default_value),
                           features.eval())
 
@@ -1501,7 +1501,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
       self.assertRaises(errors_impl.OpError, features.eval)
-      init = data_flow_ops.tables_initializer()
+      init = lookup_ops.tables_initializer()
       self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                               "Invalid vocab_size", init.run)
 
@@ -1513,7 +1513,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", b"UNK"), features.eval())
 
 
@@ -1528,7 +1528,7 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       features = table.lookup(indices)
       self.assertRaises(errors_impl.OpError, features.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           features.eval())
@@ -1540,7 +1540,7 @@ class IndexToStringTableFromTensorTest(test.TestCase):
           mapping=mapping_strings)
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       features = table.lookup(indices)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"hello", b"hello", b"UNK"), features.eval())
 
   def test_index_to_string_with_default_value(self):
@@ -1553,7 +1553,7 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       features = table.lookup(indices)
       self.assertRaises(errors_impl.OpError, features.eval)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
                           features.eval())
 
@@ -1567,7 +1567,7 @@ class IndexToStringTest(test.TestCase):
       feats = lookup.index_to_string(indices, mapping=mapping_strings)
 
       self.assertRaises(errors_impl.OpError, feats.eval)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           feats.eval())
@@ -1577,11 +1577,11 @@ class IndexToStringTest(test.TestCase):
       mapping_strings = constant_op.constant(["hello", "hello"])
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       feats = lookup.index_to_string(indices, mapping=mapping_strings)
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"hello", b"hello", b"UNK"), feats.eval())
 
       self.assertRaises(errors_impl.OpError,
-                        data_flow_ops.tables_initializer().run)
+                        lookup_ops.tables_initializer().run)
 
   def test_index_to_string_with_default_value(self):
     default_value = b"NONE"
@@ -1592,7 +1592,7 @@ class IndexToStringTest(test.TestCase):
           indices, mapping=mapping_strings, default_value=default_value)
       self.assertRaises(errors_impl.OpError, feats.eval)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value), feats.eval())
 
 
@@ -1755,7 +1755,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           default_value,
           shared_name=shared_name)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
 
@@ -2081,7 +2081,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           hasher_spec=lookup.StrongHashSpec((1, 2)),
           name="table2")
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       input_string = constant_op.constant(
           ["fruit", "brain", "salad", "surgery", "UNK"])
@@ -2167,7 +2167,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               default_value2),
           oov_buckets)
 
-      data_flow_ops.tables_initializer().run()
+      lookup_ops.tables_initializer().run()
 
       input_string_1 = constant_op.constant(
           ["brain", "salad", "surgery", "UNK"])
diff --git a/tensorflow/contrib/losses/__init__.py b/tensorflow/contrib/losses/__init__.py
index 9861ecc1f87a0f453e07b267f727f0c44439cd61..790bf61367d85b79bae4b153328b229b10721b38 100644
--- a/tensorflow/contrib/losses/__init__.py
+++ b/tensorflow/contrib/losses/__init__.py
@@ -22,10 +22,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,wildcard-import
-from tensorflow.contrib.losses.python import losses
+# pylint: disable=wildcard-import
 from tensorflow.contrib.losses.python.losses import *
-# pylint: enable=unused-import,wildcard-import
+# pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
-remove_undocumented(__name__, doc_string_modules=[losses])
+
+_allowed_symbols = [
+    'absolute_difference',
+    'add_loss',
+    'hinge_loss',
+    'compute_weighted_loss',
+    'cosine_distance',
+    'get_losses',
+    'get_regularization_losses',
+    'get_total_loss',
+    'log_loss',
+    'mean_pairwise_squared_error',
+    'mean_squared_error',
+    'sigmoid_cross_entropy',
+    'softmax_cross_entropy',
+    'sparse_softmax_cross_entropy',
+]
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/losses/python/losses/__init__.py b/tensorflow/contrib/losses/python/losses/__init__.py
index 1b57f0baeef0c1e7016dcd95c725ee88566b1a9d..6e9d1d4a773b3a2c9b7b1accbb3ccb3000c8164a 100644
--- a/tensorflow/contrib/losses/python/losses/__init__.py
+++ b/tensorflow/contrib/losses/python/losses/__init__.py
@@ -12,127 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""## Loss operations for use in neural networks.
+"""Ops for building neural network losses.
 
-Note: By default all the losses are collected into the `GraphKeys.LOSSES`
-collection.
-
-All of the loss functions take a pair of predictions and ground truth labels,
-from which the loss is computed. It is assumed that the shape of both these
-tensors is of the form [batch_size, d1, ... dN] where `batch_size` is the number
-of samples in the batch and `d1` ... `dN` are the remaining dimensions.
-
-It is common, when training with multiple loss functions, to adjust the relative
-strengths of individual losses. This is performed by rescaling the losses via
-a `weight` parameter passed to the loss functions. For example, if we were
-training with both log_loss and sum_of_squares_loss, and we wished that the
-log_loss penalty be twice as severe as the sum_of_squares_loss, we would
-implement this as:
-
-  # Explicitely set the weight.
-  tf.contrib.losses.log(predictions, labels, weight=2.0)
-
-  # Uses default weight of 1.0
-  tf.contrib.losses.sum_of_squares(predictions, labels)
-
-  # All the losses are collected into the `GraphKeys.LOSSES` collection.
-  losses = tf.get_collection(tf.GraphKeys.LOSSES)
-
-While specifying a scalar loss rescales the loss over the entire batch,
-we sometimes want to rescale the loss per batch sample. For example, if we have
-certain examples that matter more to us to get correctly, we might want to have
-a higher loss that other samples whose mistakes matter less. In this case, we
-can provide a weight vector of length `batch_size` which results in the loss
-for each sample in the batch being scaled by the corresponding weight element.
-For example, consider the case of a classification problem where we want to
-maximize our accuracy but we especially interested in obtaining high accuracy
-for a specific class:
-
-  inputs, labels = LoadData(batch_size=3)
-  logits = MyModelPredictions(inputs)
-
-  # Ensures that the loss for examples whose ground truth class is `3` is 5x
-  # higher than the loss for all other examples.
-  weight = tf.multiply(4, tf.cast(tf.equal(labels, 3), tf.float32)) + 1
-
-  onehot_labels = tf.one_hot(labels, num_classes=5)
-  tf.contrib.losses.softmax_cross_entropy(logits, onehot_labels, weight=weight)
-
-Finally, in certain cases, we may want to specify a different loss for every
-single measurable value. For example, if we are performing per-pixel depth
-prediction, or per-pixel denoising, a single batch sample has P values where P
-is the number of pixels in the image. For many losses, the number of measurable
-values matches the number of elements in the predictions and labels tensors.
-For others, such as softmax_cross_entropy and cosine_distance, the
-loss functions reduces the dimensions of the inputs to produces a tensor of
-losses for each measurable value. For example, softmax_cross_entropy takes as
-input predictions and labels of dimension [batch_size, num_classes] but the
-number of measurable values is [batch_size]. Consequently, when passing a weight
-tensor to specify a different loss for every measurable value, the dimension of
-the tensor will depend on the loss being used.
-
-For a concrete example, consider the case of per-pixel depth prediction where
-certain ground truth depth values are missing (due to sensor noise in the
-capture process). In this case, we want to assign zero weight to losses for
-these predictions.
-
-  # 'depths' that are missing have a value of 0:
-  images, depths = LoadData(...)
-  predictions = MyModelPredictions(images)
-
-  weight = tf.cast(tf.greater(depths, 0), tf.float32)
-  loss  = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
-
-Note that when using weights for the losses, the final average is computed
-by rescaling the losses by the weights and then dividing by the total number of
-non-zero samples. For an arbitrary set of weights, this may not necessarily
-produce a weighted average. Instead, it simply and transparently rescales the
-per-element losses before averaging over the number of observations. For example
-if the losses computed by the loss function is an array [4, 1, 2, 3] and the
-weights are an array [1, 0.5, 3, 9], then the average loss is:
-
-  (4*1 + 1*0.5 + 2*3 + 3*9) / 4
-
-However, with a single loss function and an arbitrary set of weights, one can
-still easily create a loss function such that the resulting loss is a
-weighted average over the individual prediction errors:
-
-  images, labels = LoadData(...)
-  predictions = MyModelPredictions(images)
-
-  weight = MyComplicatedWeightingFunction(labels)
-  weight = tf.div(weight, tf.size(weight))
-  loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
-
-@@absolute_difference
-@@add_loss
-@@hinge_loss
-@@compute_weighted_loss
-@@cosine_distance
-@@get_losses
-@@get_regularization_losses
-@@get_total_loss
-@@log_loss
-@@mean_pairwise_squared_error
-@@mean_squared_error
-@@sigmoid_cross_entropy
-@@softmax_cross_entropy
-@@sparse_softmax_cross_entropy
-
-The following are deprecated in favor of `mean_pairwise_squared_error` and
-`mean_squared_error`.
-@@sum_of_pairwise_squares
-@@sum_of_squares
+See @{$python/contrib.losses}.
 """
 
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,wildcard-import
+# pylint: disable=wildcard-import
 from tensorflow.contrib.losses.python.losses.loss_ops import *
-from tensorflow.python.util.all_util import make_all
-# pylint: enable=unused-import,wildcard-import
-
-__all__ = make_all(__name__)
+# pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 5ca8c8a18bc08f06e3afdf97d11bdf44f72b1a9b..f6d3601c7dc6002673a7d056313939bf99cbaa44 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -236,7 +236,7 @@ def get_regularization_losses(scope=None):
     scope: an optional scope for filtering the losses to return.
 
   Returns:
-    A list of loss variables.
+    A list of regularization losses as Tensors.
   """
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 4db818a3a9c059263195b54b82ebdcdd3a521515..305ed0d11ec11ef24971a47f6b4d7f3bb25f82b2 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -281,6 +281,10 @@ ifeq ($(TARGET),ANDROID)
 		CXXFLAGS += -DUSE_HEXAGON_LIBS
 	endif
 
+	ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
+		CXXFLAGS += -DENABLE_EXPERIMENTAL_HEXNN_OPS
+	endif
+
 endif  # ANDROID
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
@@ -293,7 +297,7 @@ ifeq ($(TARGET),IOS)
 	IPHONESIMULATOR_SYSROOT := $(shell xcrun --sdk iphonesimulator \
 	--show-sdk-path)
 	IOS_SDK_VERSION := $(shell xcrun --sdk iphoneos --show-sdk-version)
-	MIN_SDK_VERSION := 8.2
+	MIN_SDK_VERSION := 8.0
 # Override IOS_ARCH with ARMV7, ARMV7S, ARM64, or I386.
 	IOS_ARCH := X86_64
 	ifeq ($(IOS_ARCH),ARMV7)
@@ -370,6 +374,7 @@ ifeq ($(TARGET),IOS)
 	ifeq ($(IOS_ARCH),I386)
 		CXXFLAGS += -mios-simulator-version-min=$(MIN_SDK_VERSION) \
 		-arch i386 \
+		-mno-sse \
 		-fembed-bitcode \
 		-D__thread= \
 		-DUSE_GEMM_FOR_CONV \
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index f061b58775e87a56c745b0cfb1c7c4bc5cbcde4c..9ba5c035a269e4a76a7f6214394c6577ed6a6471 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -295,7 +295,7 @@ itself, you'll see it's broken up into host and target sections. If you are
 cross-compiling, you should look at customizing the target settings to match
 what you need for your desired system.
 
-## Dependency Managment
+## Dependency Management
 
 The Makefile loads in a list of dependencies stored in text files. These files
 are generated from the main Bazel build by running 
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 2f3649dac3a958534715bdbe5bf8e4983a835f16..161f2df5b27044971c6fd7e13c321c95e0ab4d02 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -19,6 +19,7 @@ set -e
 
 usage() {
   echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-s:t:Tx:X]"
+  echo "-E enable experimental hexnn ops"
   echo "-s [sub_makefiles] sub makefiles separated by white space"
   echo "-t [build_target] build target for Android makefile [default=all]"
   echo "-T only build tensorflow"
@@ -31,8 +32,9 @@ if [[ -z "${NDK_ROOT}" ]]; then
     exit 1
 fi
 
-while getopts "s:t:Tx:" opt_name; do
+while getopts "Es:t:Tx:" opt_name; do
   case "$opt_name" in
+    E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
     s) SUB_MAKEFILES="${OPTARG}";;
     t) BUILD_TARGET="${OPTARG}";;
     T) ONLY_MAKE_TENSORFLOW="true";;
@@ -83,16 +85,20 @@ if [[ "${USE_HEXAGON}" == "true" ]]; then
     HEXAGON_INCLUDE=$(cd "tensorflow/core/platform/hexagon" >/dev/null && pwd)
 fi
 
+if [[ "${ENABLE_EXPERIMENTAL_HEXNN_OPS}" == "true" ]]; then
+    EXTRA_MAKE_ARGS+=("ENABLE_EXPERIMENTAL_HEXNN_OPS=true")
+fi
+
 if [[ -z "${BUILD_TARGET}" ]]; then
     make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
          TARGET=ANDROID NDK_ROOT="${NDK_ROOT}" CC_PREFIX="${CC_PREFIX}" \
 HEXAGON_LIBS="${HEXAGON_LIBS}" HEXAGON_INCLUDE="${HEXAGON_INCLUDE}" \
-SUB_MAKEFILES="${SUB_MAKEFILES}"
+SUB_MAKEFILES="${SUB_MAKEFILES}" ${EXTRA_MAKE_ARGS[@]}
 else
     # BUILD_TARGET explicitly uncommented to allow multiple targets to be
     # passed to make in a single build_all_android.sh invocation.
     make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
          TARGET=ANDROID NDK_ROOT="${NDK_ROOT}" CC_PREFIX="${CC_PREFIX}" \
 HEXAGON_LIBS="${HEXAGON_LIBS}" HEXAGON_INCLUDE="${HEXAGON_INCLUDE}" \
-SUB_MAKEFILES="${SUB_MAKEFILES}" ${BUILD_TARGET}
+SUB_MAKEFILES="${SUB_MAKEFILES}" ${EXTRA_MAKE_ARGS[@]} ${BUILD_TARGET}
 fi
diff --git a/tensorflow/contrib/makefile/build_helper.subr b/tensorflow/contrib/makefile/build_helper.subr
index 717e459beb1b695b877feac3f8c4a886d13e8a85..d58b2c0a9be80da28159e869b0ed2c331e2f0191 100644
--- a/tensorflow/contrib/makefile/build_helper.subr
+++ b/tensorflow/contrib/makefile/build_helper.subr
@@ -31,7 +31,7 @@ get_cpu_count() {
 }
 
 get_job_count() {
-  echo $(($(get_cpu_count) * 2))
+  echo $(($(get_cpu_count)))
 }
 
 make_host_protoc() {
@@ -59,12 +59,22 @@ make_host_protoc() {
 }
 
 download_and_push() {
-    URL="$1"
-    LOCAL_DEST="$2"
-    ANDROID_DEST="$3"
-    curl -Ls "${URL}" -o "${LOCAL_DEST}"
-    if [[ ! -z "${ANDROID_DEST}" ]]; then
-        adb shell mkdir -p "${ANDROID_DEST}"
-        adb push "${LOCAL_DEST}" "${ANDROID_DEST}"
+  URL="$1"
+  LOCAL_DEST="$2"
+  ANDROID_DEST="$3"
+  SKIP_DOWNLOAD_IF_EXIST="$4"
+  if [[ "${SKIP_DOWNLOAD_IF_EXIST}" == "true" ]]; then
+    ANDROID_DEST_FILE_PATH="${ANDROID_DEST}/$(basename "${LOCAL_DEST}")"
+    if adb shell test -f "${ANDROID_DEST_FILE_PATH}"; then
+        echo "${ANDROID_DEST_FILE_PATH} already existins, skip download" 1>&2
+      return 0
     fi
+  fi
+
+  curl -Ls "${URL}" -o "${LOCAL_DEST}"
+
+  if [[ ! -z "${ANDROID_DEST}" ]]; then
+    adb shell mkdir -p "${ANDROID_DEST}"
+    adb push "${LOCAL_DEST}" "${ANDROID_DEST}"
+  fi
 }
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index 12f34b38d0875864e0514ce51afab197a4ab8a50..d1012a6c9351eddcc306fb5261e872230aaf6deb 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -41,7 +41,7 @@ IPHONEOS_SYSROOT=`xcrun --sdk iphoneos --show-sdk-path`
 IPHONESIMULATOR_PLATFORM=`xcrun --sdk iphonesimulator --show-sdk-platform-path`
 IPHONESIMULATOR_SYSROOT=`xcrun --sdk iphonesimulator --show-sdk-path`
 IOS_SDK_VERSION=`xcrun --sdk iphoneos --show-sdk-version`
-MIN_SDK_VERSION=8.2
+MIN_SDK_VERSION=8.0
 
 CFLAGS="-DNDEBUG -Os -pipe -fPIC -fno-exceptions"
 CXXFLAGS="${CFLAGS} -std=c++11 -stdlib=libc++"
diff --git a/tensorflow/contrib/makefile/compile_linux_protobuf.sh b/tensorflow/contrib/makefile/compile_linux_protobuf.sh
index 480fbcc215cceb0ec7b462d2543f2e93adb93651..6eb061a3c96e74ddd0e3e6ee350278c5096be399 100755
--- a/tensorflow/contrib/makefile/compile_linux_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_linux_protobuf.sh
@@ -38,7 +38,7 @@ then
   exit 1
 fi
 
-./configure --prefix="${GENDIR}"
+./configure --prefix="${GENDIR}" --with-pic
 if [ $? -ne 0 ]
 then
   echo "./configure command failed."
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 58d1dc6f0afd22d894e2bfb5fd9dc2bf5554f953..f123111df84fa59e8c0da94329f8c0103f88a0ed 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -30,9 +30,13 @@ RE2_URL="$(grep -o 'http.*github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" |
 replace_by_sed() {
   local regex="${1}"
   shift
-  if echo "${OSTYPE}" | grep -q darwin; then
+  # Detect the version of sed by the return value of "--version" flag. GNU-sed
+  # supports "--version" while BSD-sed doesn't.
+  if ! sed --version >/dev/null 2>&1; then
+    # BSD-sed.
     sed -i '' -e "${regex}" "$@"
   else
+    # GNU-sed.
     sed -i -e "${regex}" "$@"
   fi
 }
diff --git a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
index c0969e6dee2553a46fb98d6c1327f2629a64c7fd..5ade8942af39f1d308c5f6e308e1cee754510926 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
@@ -7,9 +7,11 @@ tensorflow/core/protobuf/saver.pb.cc
 tensorflow/core/protobuf/queue_runner.pb.cc
 tensorflow/core/protobuf/named_tensor.pb.cc
 tensorflow/core/protobuf/meta_graph.pb.cc
+tensorflow/core/protobuf/cluster.pb.cc
 tensorflow/core/protobuf/config.pb.cc
 tensorflow/core/protobuf/rewriter_config.pb.cc
 tensorflow/core/protobuf/debug.pb.cc
+tensorflow/core/protobuf/device_properties.pb.cc
 tensorflow/core/lib/core/error_codes.pb.cc
 tensorflow/core/framework/versions.pb.cc
 tensorflow/core/framework/variable.pb.cc
@@ -35,3 +37,4 @@ tensorflow/core/framework/attr_value.pb.cc
 tensorflow/core/framework/allocation_description.pb.cc
 tensorflow/core/example/feature.pb.cc
 tensorflow/core/example/example.pb.cc
+tensorflow/core/grappler/costs/op_performance_data.pb.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
index 132b4775962aaef478f90bd254702015ba498cd6..1f0ad06cdc5b98ae9c08ea63dad70eb02b6ef46b 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@@ -7,8 +7,10 @@ tensorflow/core/protobuf/saver.pb.h
 tensorflow/core/protobuf/queue_runner.pb.h
 tensorflow/core/protobuf/named_tensor.pb.h
 tensorflow/core/protobuf/meta_graph.pb.h
+tensorflow/core/protobuf/cluster.pb.h
 tensorflow/core/protobuf/config.pb.h
 tensorflow/core/protobuf/debug.pb.h
+tensorflow/core/protobuf/device_properties.pb.h
 tensorflow/core/protobuf/rewriter_config.pb.h
 tensorflow/core/protobuf/tensor_bundle.pb.h
 tensorflow/core/lib/core/error_codes.pb.h
@@ -36,3 +38,4 @@ tensorflow/core/framework/attr_value.pb.h
 tensorflow/core/framework/allocation_description.pb.h
 tensorflow/core/example/feature.pb.h
 tensorflow/core/example/example.pb.h
+tensorflow/core/grappler/costs/op_performance_data.pb.h
diff --git a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
index 3d72247cd8741df5e2b7f97e11158a45163af2f6..861bb885c7031b996b48dbc50887cfce55c638f3 100755
--- a/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
+++ b/tensorflow/contrib/makefile/samples/build_and_run_inception_hexagon.sh
@@ -19,15 +19,25 @@ set -e
 
 usage() {
   echo "Usage: QUALCOMM_SDK=<path to qualcomm sdk. Not needed if you specify -p> NDK_ROOT=<path to ndk root> $(basename "$0")"
+  echo "Optional: NNLIB_DIR=<path to downloaded nnlib dir>"
   echo "-b build only"
+  echo "-c test count"
+  echo "-E enable experimental hexnn ops"
   echo "-p use prebuilt hexagon binaries"
+  echo "-s skip download if files already exist"
   exit 1
 }
 
-while getopts "bp" opt_name; do
+TEST_COUNT=1
+SKIP_DOWNLOAD_IF_EXIST=false
+
+while getopts "bc:Eps" opt_name; do
   case "$opt_name" in
     b) BUILD_ONLY="true";;
+    c) TEST_COUNT="${OPTARG}";;
+    E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
     p) USE_PREBUILT_HEXAOGON_BINARIES="true";;
+    s) SKIP_DOWNLOAD_IF_EXIST="true";;
     *) usage;;
   esac
 done
@@ -79,14 +89,27 @@ if [[ "${USE_PREBUILT_HEXAOGON_BINARIES}" == "true" ]]; then
         NN_LIB_PUSH_DEST="/vendor/lib/rfsa/adsp"
     fi
     download_and_push "${URL_BASE}/deps/hexagon/libhexagon_controller.so" \
-"${GEN_LIBS_DIR}/libhexagon_controller.so" "${CONTROLLER_PUSH_DEST}"
+"${GEN_LIBS_DIR}/libhexagon_controller.so" "${CONTROLLER_PUSH_DEST}" \
+"${SKIP_DOWNLOAD_IF_EXIST}"
 
     download_and_push "${URL_BASE}/deps/hexagon/libhexagon_nn_skel.so" \
-"${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "${NN_LIB_PUSH_DEST}"
+"${GEN_LIBS_DIR}/libhexagon_nn_skel.so" "${NN_LIB_PUSH_DEST}" \
+"${SKIP_DOWNLOAD_IF_EXIST}"
 else
     echo "Build hexagon binaries from source code"
     cd "${GEN_DIR}"
-    git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
+    if [[ -z "${NNLIB_DIR}" ]]; then
+      git clone https://source.codeaurora.org/quic/hexagon_nn/nnlib
+    else
+      if [[ ! -f "${NNLIB_DIR}/Makefile" ]]; then
+        echo "Couldn't locate ${NNLIB_DIR}/Makefile" 1>&2
+        exit 1
+      fi
+      echo "Use nnlib in ${NNLIB_DIR}" 1>&2
+      GEN_NNLIB_DIR="${GEN_DIR}/nnlib"
+      mkdir -p "${GEN_NNLIB_DIR}"
+      cp -af "${NNLIB_DIR}/"* "${GEN_NNLIB_DIR}"
+    fi
 
     cd "${QUALCOMM_SDK}"
     source "${QUALCOMM_SDK}/setup_sdk_env.sh"
@@ -137,7 +160,11 @@ fi
 if [[ -d "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/protobuf" &&
       -d "${TF_ROOT_DIR}/tensorflow/contrib/makefile/gen/protobuf-host" ]]; then
     echo "generated protobuf and protobuf-host found."
-    extra_args+=("-T")
+    EXTRA_ARGS+=("-T")
+fi
+
+if [[ "${ENABLE_EXPERIMENTAL_HEXNN_OPS}" == "true" ]]; then
+    EXTRA_ARGS+=("-E")
 fi
 
 if [[ -z "${CC_PREFIX}" ]]; then
@@ -147,7 +174,7 @@ fi
 CC_PREFIX=${CC_PREFIX} NDK_ROOT=${NDK_ROOT} "${BUILD_ALL_ANDROID_PATH}" \
 -x "${GEN_LIBS_DIR}" \
 -s "${TF_ROOT_DIR}/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in" \
--t "hexagon_graph_execution" ${extra_args[@]}
+-t "hexagon_graph_execution" ${EXTRA_ARGS[@]}
 
 echo "Download and push inception image"
 HEXAGON_DOWNLOAD_PATH=\
@@ -160,15 +187,18 @@ if [[ "${BUILD_ONLY}" != "true" ]]; then
 fi
 
 download_and_push "${URL_BASE}/example_images/img_299x299.bmp" \
-"${GEN_DOWNLOAD_DIR}/img_299x299.bmp" "${BIN_PUSH_DEST}"
+"${GEN_DOWNLOAD_DIR}/img_299x299.bmp" "${BIN_PUSH_DEST}" \
+"${SKIP_DOWNLOAD_IF_EXIST}"
 
 download_and_push \
 "${URL_BASE}/models/tensorflow_inception_v3_stripped_optimized_quantized.pb" \
 "${GEN_DOWNLOAD_DIR}/tensorflow_inception_v3_stripped_optimized_quantized.pb" \
-"${BIN_PUSH_DEST}"
+"${BIN_PUSH_DEST}" \
+"${SKIP_DOWNLOAD_IF_EXIST}"
 
 download_and_push "${URL_BASE}/models/imagenet_comp_graph_label_strings.txt" \
-"${GEN_DOWNLOAD_DIR}/imagenet_comp_graph_label_strings.txt" "${BIN_PUSH_DEST}"
+"${GEN_DOWNLOAD_DIR}/imagenet_comp_graph_label_strings.txt" "${BIN_PUSH_DEST}" \
+"${SKIP_DOWNLOAD_IF_EXIST}"
 
 # By default this script runs a test to fuse and run the model
 gtest_args+=("--gtest_filter=GraphTransferer.RunInceptionV3OnHexagonExampleWithTfRuntime")
@@ -195,6 +225,9 @@ if [[ "${BUILD_ONLY}" != "true" ]]; then
     adb shell chmod "${ANDROID_EXEC_FILE_MODE}" \
         "/data/local/tmp/hexagon_graph_execution"
     adb wait-for-device
-    adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \
-        "/data/local/tmp/hexagon_graph_execution" ${gtest_args[@]}
+
+    for i in $(seq 1 "${TEST_COUNT}"); do
+      adb shell 'LD_LIBRARY_PATH=/data/local/tmp:$LD_LIBRARY_PATH' \
+          "/data/local/tmp/hexagon_graph_execution" ${gtest_args[@]}
+    done
 fi
diff --git a/tensorflow/contrib/makefile/sub_makefiles/quantization/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/quantization/Makefile.in
index bc7a238fdba621553a98ec92790e9670e1af21a4..6ba41d5d12a3f5243b797b253fed46aceae9ba9c 100644
--- a/tensorflow/contrib/makefile/sub_makefiles/quantization/Makefile.in
+++ b/tensorflow/contrib/makefile/sub_makefiles/quantization/Makefile.in
@@ -50,6 +50,7 @@ tensorflow/core/kernels/hexagon/graph_transferer_test.cc \
 tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \
 tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc \
 tensorflow/core/kernels/remote_fused_graph_execute_op.cc \
+tensorflow/core/kernels/remote_fused_graph_execute_utils.cc \
 tensorflow/core/ops/remote_fused_graph_ops.cc \
 tensorflow/core/platform/posix/test.cc
 
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 4a119f7ffc0ca64d63fdc364c5f89181e51425b4..b9cd91e519d2366dc384feb1fb991fb2b50bb8fe 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -4,6 +4,7 @@ tensorflow/core/kernels/variable_ops.cc
 tensorflow/core/kernels/unpack_op.cc
 tensorflow/core/kernels/transpose_op.cc
 tensorflow/core/kernels/transpose_functor_cpu.cc
+tensorflow/core/kernels/training_op_helpers.cc
 tensorflow/core/kernels/training_ops.cc
 tensorflow/core/kernels/topk_op.cc
 tensorflow/core/kernels/tile_ops.cc
@@ -73,6 +74,7 @@ tensorflow/core/kernels/reduction_ops_mean.cc
 tensorflow/core/kernels/reduction_ops_max.cc
 tensorflow/core/kernels/reduction_ops_common.cc
 tensorflow/core/kernels/reduction_ops_any.cc
+tensorflow/core/kernels/reduction_ops_all.cc
 tensorflow/core/kernels/queue_ops.cc
 tensorflow/core/kernels/queue_base.cc
 tensorflow/core/kernels/pooling_ops_common.cc
@@ -101,12 +103,14 @@ tensorflow/core/kernels/identity_op.cc
 tensorflow/core/kernels/gather_op.cc
 tensorflow/core/kernels/gather_functor.cc
 tensorflow/core/kernels/fused_batch_norm_op.cc
+tensorflow/core/kernels/function_ops.cc
 tensorflow/core/kernels/fill_functor.cc
 tensorflow/core/kernels/fifo_queue.cc
 tensorflow/core/kernels/fake_quant_ops.cc
 tensorflow/core/kernels/example_parsing_ops.cc
 tensorflow/core/kernels/dynamic_stitch_op.cc
 tensorflow/core/kernels/dynamic_partition_op.cc
+tensorflow/core/kernels/decode_bmp_op.cc
 tensorflow/core/kernels/depthtospace_op.cc
 tensorflow/core/kernels/spacetodepth_op.cc
 tensorflow/core/kernels/dense_update_ops.cc
@@ -136,12 +140,15 @@ tensorflow/core/kernels/cwise_op_less.cc
 tensorflow/core/kernels/cwise_op_isfinite.cc
 tensorflow/core/kernels/cwise_op_greater_equal.cc
 tensorflow/core/kernels/cwise_op_greater.cc
+tensorflow/core/kernels/cwise_op_floor_div.cc
+tensorflow/core/kernels/cwise_op_floor.cc
 tensorflow/core/kernels/cwise_op_exp.cc
 tensorflow/core/kernels/cwise_op_equal_to_2.cc
 tensorflow/core/kernels/cwise_op_equal_to_1.cc
 tensorflow/core/kernels/cwise_op_div.cc
 tensorflow/core/kernels/cwise_op_add_2.cc
 tensorflow/core/kernels/cwise_op_add_1.cc
+tensorflow/core/kernels/cwise_op_abs.cc
 tensorflow/core/kernels/ctc_decoder_ops.cc
 tensorflow/core/kernels/crop_and_resize_op.cc
 tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -223,4 +230,4 @@ tensorflow/core/ops/array_grad.cc
 tensorflow/core/kernels/spacetobatch_functor.cc
 tensorflow/core/kernels/spacetobatch_op.cc
 tensorflow/core/kernels/batchtospace_op.cc
-
+tensorflow/core/kernels/warn_about_ints.cc
diff --git a/tensorflow/contrib/makefile/tf_pb_text_files.txt b/tensorflow/contrib/makefile/tf_pb_text_files.txt
index f1da05e4c6ec197ba5e4a8fe0296de5464e3fe92..c39257ffa91fef184e8bd5258b19c4323a1b7fe0 100644
--- a/tensorflow/contrib/makefile/tf_pb_text_files.txt
+++ b/tensorflow/contrib/makefile/tf_pb_text_files.txt
@@ -1,6 +1,7 @@
 tensorflow/core/util/saved_tensor_slice.pb_text.cc
 tensorflow/core/util/memmapped_file_system.pb_text.cc
 tensorflow/core/protobuf/saver.pb_text.cc
+tensorflow/core/protobuf/cluster.pb_text.cc
 tensorflow/core/protobuf/config.pb_text.cc
 tensorflow/core/protobuf/debug.pb_text.cc
 tensorflow/core/protobuf/rewriter_config.pb_text.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 2a78ea610166410c8b4a899786c5b021ddebdba3..36d9cb74a704172a44e77952d021cab671806b03 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -7,8 +7,10 @@ tensorflow/core/protobuf/saver.proto
 tensorflow/core/protobuf/queue_runner.proto
 tensorflow/core/protobuf/named_tensor.proto
 tensorflow/core/protobuf/meta_graph.proto
+tensorflow/core/protobuf/cluster.proto
 tensorflow/core/protobuf/config.proto
 tensorflow/core/protobuf/debug.proto
+tensorflow/core/protobuf/device_properties.proto
 tensorflow/core/protobuf/rewriter_config.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/lib/core/error_codes.proto
diff --git a/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py b/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
index 0f3a5f13136965ffc98ca6bd95fd3e21b9c305b3..ec25c032f0588e5aaa0192349288d45e503baecf 100644
--- a/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
+++ b/tensorflow/contrib/memory_stats/python/kernel_tests/memory_stats_ops_test.py
@@ -49,7 +49,7 @@ class MemoryStatsOpsTest(test_util.TensorFlowTestCase):
   # The memory for matrix "a" can be reused for matrix "d". Therefore, this
   # computation needs space for only three matrix plus some small overhead.
   def testChainOfMatmul(self):
-    # MaxBytesInUse is registerd on GPU only. See kernels/memory_stats_ops.cc.
+    # MaxBytesInUse is registered on GPU only. See kernels/memory_stats_ops.cc.
     if not test.is_gpu_available():
       return
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 90c1440f085adb6d3486c1a0e578b0669d90c857..727cdd9597a6267702f705497ac6af6819a51e6a 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -35,7 +35,6 @@ from tensorflow.python.ops import metrics_impl
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
 
 
 def _safe_div(numerator, denominator, name):
@@ -73,7 +72,7 @@ def _create_local(name, shape, collections=None, validate_shape=True,
   # Make sure local variables are added to tf.GraphKeys.LOCAL_VARIABLES
   collections = list(collections or [])
   collections += [ops.GraphKeys.LOCAL_VARIABLES]
-  return variables.Variable(
+  return variable_scope.variable(
       initial_value=array_ops.zeros(shape, dtype=dtype),
       name=name,
       trainable=False,
@@ -1339,6 +1338,87 @@ def streaming_sparse_precision_at_top_k(top_k_predictions,
         name=name_scope)
 
 
+def sparse_recall_at_top_k(labels,
+                           top_k_predictions,
+                           class_id=None,
+                           weights=None,
+                           metrics_collections=None,
+                           updates_collections=None,
+                           name=None):
+  """Computes recall@k of top-k predictions with respect to sparse labels.
+
+  If `class_id` is specified, we calculate recall by considering only the
+      entries in the batch for which `class_id` is in the label, and computing
+      the fraction of them for which `class_id` is in the top-k `predictions`.
+  If `class_id` is not specified, we'll calculate recall as how often on
+      average a class among the labels of a batch entry is in the top-k
+      `predictions`.
+
+  `sparse_recall_at_top_k` creates two local variables, `true_positive_at_<k>`
+  and `false_negative_at_<k>`, that are used to compute the recall_at_k
+  frequency. This frequency is ultimately returned as `recall_at_<k>`: an
+  idempotent operation that simply divides `true_positive_at_<k>` by total
+  (`true_positive_at_<k>` + `false_negative_at_<k>`).
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  `recall_at_<k>`. Set operations applied to `top_k` and `labels` calculate the
+  true positives and false negatives weighted by `weights`. Then `update_op`
+  increments `true_positive_at_<k>` and `false_negative_at_<k>` using these
+  values.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels], where N >= 1 and num_labels is the number of
+      target classes for the associated prediction. Commonly, N=1 and `labels`
+      has shape [batch_size, num_labels]. [D1, ... DN] must match
+      `top_k_predictions`. Values should be in range [0, num_classes), where
+      num_classes is the last dimension of `predictions`. Values outside this
+      range always count towards `false_negative_at_<k>`.
+    top_k_predictions: Integer `Tensor` with shape [D1, ... DN, k] where
+      N >= 1. Commonly, N=1 and top_k_predictions has shape [batch size, k].
+      The final dimension contains the indices of top-k labels. [D1, ... DN]
+      must match `labels`.
+    class_id: Integer class ID for which we want binary metrics. This should be
+      in range [0, num_classes), where num_classes is the last dimension of
+      `predictions`. If class_id is outside this range, the method returns NAN.
+    weights: `Tensor` whose rank is either 0, or n-1, where n is the rank of
+      `labels`. If the latter, it must be broadcastable to `labels` (i.e., all
+      dimensions must be either `1`, or the same as the corresponding `labels`
+      dimension).
+    metrics_collections: An optional list of collections that values should
+      be added to.
+    updates_collections: An optional list of collections that updates should
+      be added to.
+    name: Name of new update operation, and namespace for other dependent ops.
+
+  Returns:
+    recall: Scalar `float64` `Tensor` with the value of `true_positives` divided
+      by the sum of `true_positives` and `false_negatives`.
+    update_op: `Operation` that increments `true_positives` and
+      `false_negatives` variables appropriately, and whose value matches
+      `recall`.
+
+  Raises:
+    ValueError: If `weights` is not `None` and its shape doesn't match
+    `predictions`, or if either `metrics_collections` or `updates_collections`
+    are not a list or tuple.
+  """
+  default_name = _at_k_name('recall', class_id=class_id)
+  with ops.name_scope(name, default_name, (top_k_predictions, labels,
+                                           weights)) as name_scope:
+    return metrics_impl._sparse_recall_at_top_k(  # pylint: disable=protected-access
+        labels=labels,
+        predictions_idx=top_k_predictions,
+        class_id=class_id,
+        weights=weights,
+        metrics_collections=metrics_collections,
+        updates_collections=updates_collections,
+        name=name_scope)
+
+
 def streaming_sparse_average_precision_at_k(predictions,
                                             labels,
                                             k,
@@ -2289,6 +2369,7 @@ def _remove_squeezable_dimensions(predictions, labels, weights):
 __all__ = [
     'aggregate_metric_map',
     'aggregate_metrics',
+    'sparse_recall_at_top_k',
     'streaming_accuracy',
     'streaming_auc',
     'streaming_false_negatives',
@@ -2311,7 +2392,9 @@ __all__ = [
     'streaming_root_mean_squared_error',
     'streaming_sensitivity_at_specificity',
     'streaming_sparse_average_precision_at_k',
+    'streaming_sparse_average_precision_at_top_k',
     'streaming_sparse_precision_at_k',
+    'streaming_sparse_precision_at_top_k',
     'streaming_sparse_recall_at_k',
     'streaming_specificity_at_sensitivity',
     'streaming_true_negatives',
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index b960e1310ecc2a1dfd18beef56204d0f60893126..f97f03e30e1ccd50a70e2978d7ad7249516bdbe9 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -1507,7 +1507,7 @@ class StreamingAUCTest(test.TestCase):
       self.assertAlmostEqual(1, auc.eval(), 6)
 
   def np_auc(self, predictions, labels, weights):
-    """Computes the AUC explicitely using Numpy.
+    """Computes the AUC explicitly using Numpy.
 
     Args:
       predictions: an ndarray with shape [N].
@@ -2958,8 +2958,38 @@ class StreamingSparseRecallTest(test.TestCase):
         self.assertEqual(expected, update.eval())
         self.assertEqual(expected, metric.eval())
 
+  def _test_sparse_recall_at_top_k(self,
+                                   labels,
+                                   top_k_predictions,
+                                   expected,
+                                   class_id=None,
+                                   weights=None):
+    with ops.Graph().as_default() as g, self.test_session(g):
+      if weights is not None:
+        weights = constant_op.constant(weights, dtypes_lib.float32)
+      metric, update = metric_ops.sparse_recall_at_top_k(
+          labels=labels,
+          top_k_predictions=constant_op.constant(top_k_predictions,
+                                                 dtypes_lib.int32),
+          class_id=class_id,
+          weights=weights)
+
+      # Fails without initialized vars.
+      self.assertRaises(errors_impl.OpError, metric.eval)
+      self.assertRaises(errors_impl.OpError, update.eval)
+      variables.variables_initializer(variables.local_variables()).run()
+
+      # Run per-step op and assert expected values.
+      if math.isnan(expected):
+        self.assertTrue(math.isnan(update.eval()))
+        self.assertTrue(math.isnan(metric.eval()))
+      else:
+        self.assertEqual(expected, update.eval())
+        self.assertEqual(expected, metric.eval())
+
   def test_one_label_at_k1_nan(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    top_k_predictions = [[3], [3]]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 0, 1], [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
@@ -2970,9 +3000,12 @@ class StreamingSparseRecallTest(test.TestCase):
       for class_id in (-1, 0, 1, 4):
         self._test_streaming_sparse_recall_at_k(
             predictions, labels, k=1, expected=NAN, class_id=class_id)
+        self._test_sparse_recall_at_top_k(
+            labels, top_k_predictions, expected=NAN, class_id=class_id)
 
   def test_one_label_at_k1_no_predictions(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    top_k_predictions = [[3], [3]]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 0, 1], [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
@@ -2981,9 +3014,12 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 2: 0 predictions.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=0.0, class_id=2)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.0, class_id=2)
 
   def test_one_label_at_k1(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    top_k_predictions = [[3], [3]]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 0, 1], [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
@@ -2992,13 +3028,18 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 3: 1 label, 2 predictions, 1 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 1, class_id=3)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 1, class_id=3)
 
       # All classes: 2 labels, 2 predictions, 1 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 2)
 
   def test_one_label_at_k1_weighted(self):
     predictions = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.2, 0.3, 0.4]]
+    top_k_predictions = [[3], [3]]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 0, 1], [0, 0, 1, 0]])
     dense_labels = np.array([[3], [2]], dtype=np.int64)
@@ -3007,6 +3048,8 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 3: 1 label, 2 predictions, 1 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=NAN, class_id=3, weights=(0.0,))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=NAN, class_id=3, weights=(0.0,))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3014,6 +3057,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=1.0 / 1,
           class_id=3,
           weights=(1.0,))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=1.0 / 1,
+          class_id=3,
+          weights=(1.0,))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3021,6 +3070,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=1.0 / 1,
           class_id=3,
           weights=(2.0,))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=1.0 / 1,
+          class_id=3,
+          weights=(2.0,))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3028,6 +3083,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=NAN,
           class_id=3,
           weights=(0.0, 0.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=NAN,
+          class_id=3,
+          weights=(0.0, 0.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3035,6 +3096,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=NAN,
           class_id=3,
           weights=(0.0, 1.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=NAN,
+          class_id=3,
+          weights=(0.0, 1.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3042,6 +3109,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=1.0 / 1,
           class_id=3,
           weights=(1.0, 0.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=1.0 / 1,
+          class_id=3,
+          weights=(1.0, 0.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3049,6 +3122,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=1.0 / 1,
           class_id=3,
           weights=(1.0, 1.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=1.0 / 1,
+          class_id=3,
+          weights=(1.0, 1.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3056,6 +3135,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=2.0 / 2,
           class_id=3,
           weights=(2.0, 3.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=2.0 / 2,
+          class_id=3,
+          weights=(2.0, 3.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3063,6 +3148,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=3.0 / 3,
           class_id=3,
           weights=(3.0, 2.0))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=3.0 / 3,
+          class_id=3,
+          weights=(3.0, 2.0))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3070,6 +3161,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=0.3 / 0.3,
           class_id=3,
           weights=(0.3, 0.6))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=0.3 / 0.3,
+          class_id=3,
+          weights=(0.3, 0.6))
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3077,32 +3174,70 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=0.6 / 0.6,
           class_id=3,
           weights=(0.6, 0.3))
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=0.6 / 0.6,
+          class_id=3,
+          weights=(0.6, 0.3))
 
       # All classes: 2 labels, 2 predictions, 1 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=NAN, weights=(0.0,))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=NAN, weights=(0.0,))
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2, weights=(1.0,))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 2, weights=(1.0,))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2, weights=(2.0,))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 2, weights=(2.0,))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 1, weights=(1.0, 0.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 1, weights=(1.0, 0.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=0.0 / 1, weights=(0.0, 1.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.0 / 1, weights=(0.0, 1.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2, weights=(1.0, 1.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 2, weights=(1.0, 1.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=2.0 / 5, weights=(2.0, 3.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=2.0 / 5, weights=(2.0, 3.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=3.0 / 5, weights=(3.0, 2.0))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=3.0 / 5, weights=(3.0, 2.0))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=0.3 / 0.9, weights=(0.3, 0.6))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.3 / 0.9, weights=(0.3, 0.6))
+
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=1, expected=0.6 / 0.9, weights=(0.6, 0.3))
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.6 / 0.9, weights=(0.6, 0.3))
 
   def test_three_labels_at_k5_nan(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    top_k_predictions = [
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]])
     dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64)
@@ -3112,10 +3247,16 @@ class StreamingSparseRecallTest(test.TestCase):
       for class_id in (0, 3, 4, 6, 9, 10):
         self._test_streaming_sparse_recall_at_k(
             predictions, labels, k=5, expected=NAN, class_id=class_id)
+        self._test_sparse_recall_at_top_k(
+            labels, top_k_predictions, expected=NAN, class_id=class_id)
 
   def test_three_labels_at_k5_no_predictions(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    top_k_predictions = [
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]])
     dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64)
@@ -3124,10 +3265,16 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 8: 1 label, no predictions.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=0.0 / 1, class_id=8)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.0 / 1, class_id=8)
 
   def test_three_labels_at_k5(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    top_k_predictions = [
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ]
     sparse_labels = _binary_2d_label_to_sparse_value(
         [[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]])
     dense_labels = np.array([[2, 7, 8], [1, 2, 5]], dtype=np.int64)
@@ -3136,23 +3283,35 @@ class StreamingSparseRecallTest(test.TestCase):
       # Class 2: 2 labels, both correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=2.0 / 2, class_id=2)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=2.0 / 2, class_id=2)
 
       # Class 5: 1 label, incorrect.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=1.0 / 1, class_id=5)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=1.0 / 1, class_id=5)
 
       # Class 7: 1 label, incorrect.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=0.0 / 1, class_id=7)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=0.0 / 1, class_id=7)
 
       # All classes: 6 labels, 3 correct.
       self._test_streaming_sparse_recall_at_k(
           predictions, labels, k=5, expected=3.0 / 6)
+      self._test_sparse_recall_at_top_k(
+          labels, top_k_predictions, expected=3.0 / 6)
 
   def test_three_labels_at_k5_some_out_of_range(self):
     """Tests that labels outside the [0, n_classes) count in denominator."""
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
+    top_k_predictions = [
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ]
     sp_labels = sparse_tensor.SparseTensorValue(
         indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 1], [1, 2],
                  [1, 3]],
@@ -3167,6 +3326,11 @@ class StreamingSparseRecallTest(test.TestCase):
         k=5,
         expected=2.0 / 2,
         class_id=2)
+    self._test_sparse_recall_at_top_k(
+        sp_labels,
+        top_k_predictions,
+        expected=2.0 / 2,
+        class_id=2)
 
     # Class 5: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -3175,6 +3339,11 @@ class StreamingSparseRecallTest(test.TestCase):
         k=5,
         expected=1.0 / 1,
         class_id=5)
+    self._test_sparse_recall_at_top_k(
+        sp_labels,
+        top_k_predictions,
+        expected=1.0 / 1,
+        class_id=5)
 
     # Class 7: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -3183,16 +3352,30 @@ class StreamingSparseRecallTest(test.TestCase):
         k=5,
         expected=0.0 / 1,
         class_id=7)
+    self._test_sparse_recall_at_top_k(
+        sp_labels,
+        top_k_predictions,
+        expected=0.0 / 1,
+        class_id=7)
 
     # All classes: 8 labels, 3 correct.
     self._test_streaming_sparse_recall_at_k(
         predictions=predictions, labels=sp_labels, k=5, expected=3.0 / 8)
+    self._test_sparse_recall_at_top_k(
+        sp_labels, top_k_predictions, expected=3.0 / 8)
 
   def test_3d_nan(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     sparse_labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
          [[0, 1, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1, 1, 0]]])
@@ -3207,12 +3390,21 @@ class StreamingSparseRecallTest(test.TestCase):
       for class_id in (0, 3, 4, 6, 9, 10):
         self._test_streaming_sparse_recall_at_k(
             predictions, labels, k=5, expected=NAN, class_id=class_id)
+        self._test_sparse_recall_at_top_k(
+            labels, top_k_predictions, expected=NAN, class_id=class_id)
 
   def test_3d_no_predictions(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     sparse_labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
           [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
@@ -3229,12 +3421,21 @@ class StreamingSparseRecallTest(test.TestCase):
       for class_id in (1, 8):
         self._test_streaming_sparse_recall_at_k(
             predictions, labels, k=5, expected=0.0, class_id=class_id)
+        self._test_sparse_recall_at_top_k(
+            labels, top_k_predictions, expected=0.0, class_id=class_id)
 
   def test_3d(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
           [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
@@ -3244,24 +3445,39 @@ class StreamingSparseRecallTest(test.TestCase):
     # Class 2: 4 labels, all correct.
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=4.0 / 4, class_id=2)
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=4.0 / 4, class_id=2)
 
     # Class 5: 2 labels, both correct.
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=2.0 / 2, class_id=5)
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=2.0 / 2, class_id=5)
 
     # Class 7: 2 labels, 1 incorrect.
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=1.0 / 2, class_id=7)
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=1.0 / 2, class_id=7)
 
     # All classes: 12 labels, 7 correct.
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=7.0 / 12)
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=7.0 / 12)
 
   def test_3d_ignore_all(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
           [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
@@ -3276,6 +3492,12 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=NAN,
           class_id=class_id,
           weights=[[0], [0]])
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=NAN,
+          class_id=class_id,
+          weights=[[0], [0]])
       self._test_streaming_sparse_recall_at_k(
           predictions,
           labels,
@@ -3283,16 +3505,33 @@ class StreamingSparseRecallTest(test.TestCase):
           expected=NAN,
           class_id=class_id,
           weights=[[0, 0], [0, 0]])
+      self._test_sparse_recall_at_top_k(
+          labels,
+          top_k_predictions,
+          expected=NAN,
+          class_id=class_id,
+          weights=[[0, 0], [0, 0]])
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=NAN, weights=[[0], [0]])
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=NAN, weights=[[0], [0]])
     self._test_streaming_sparse_recall_at_k(
         predictions, labels, k=5, expected=NAN, weights=[[0, 0], [0, 0]])
+    self._test_sparse_recall_at_top_k(
+        labels, top_k_predictions, expected=NAN, weights=[[0, 0], [0, 0]])
 
   def test_3d_ignore_some(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
                    [[0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6],
                     [0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9]]]
+    top_k_predictions = [[
+        [9, 4, 6, 2, 0],
+        [5, 7, 2, 9, 6],
+    ], [
+        [5, 7, 2, 9, 6],
+        [9, 4, 6, 2, 0],
+    ]]
     labels = _binary_3d_label_to_sparse_value(
         [[[0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
           [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]],
@@ -3307,6 +3546,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=2.0 / 2.0,
         class_id=2,
         weights=[[1], [0]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=2.0 / 2.0,
+        class_id=2,
+        weights=[[1], [0]])
 
     # Class 2: 2 labels, both correct.
     self._test_streaming_sparse_recall_at_k(
@@ -3316,6 +3561,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=2.0 / 2.0,
         class_id=2,
         weights=[[0], [1]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=2.0 / 2.0,
+        class_id=2,
+        weights=[[0], [1]])
 
     # Class 7: 1 label, correct.
     self._test_streaming_sparse_recall_at_k(
@@ -3325,6 +3576,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=1.0 / 1.0,
         class_id=7,
         weights=[[0], [1]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=1.0 / 1.0,
+        class_id=7,
+        weights=[[0], [1]])
 
     # Class 7: 1 label, incorrect.
     self._test_streaming_sparse_recall_at_k(
@@ -3334,6 +3591,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=0.0 / 1.0,
         class_id=7,
         weights=[[1], [0]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=0.0 / 1.0,
+        class_id=7,
+        weights=[[1], [0]])
 
     # Class 7: 2 labels, 1 correct.
     self._test_streaming_sparse_recall_at_k(
@@ -3343,6 +3606,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=1.0 / 2.0,
         class_id=7,
         weights=[[1, 0], [1, 0]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=1.0 / 2.0,
+        class_id=7,
+        weights=[[1, 0], [1, 0]])
 
     # Class 7: No labels.
     self._test_streaming_sparse_recall_at_k(
@@ -3352,6 +3621,12 @@ class StreamingSparseRecallTest(test.TestCase):
         expected=NAN,
         class_id=7,
         weights=[[0, 1], [0, 1]])
+    self._test_sparse_recall_at_top_k(
+        labels,
+        top_k_predictions,
+        expected=NAN,
+        class_id=7,
+        weights=[[0, 1], [0, 1]])
 
   def test_sparse_tensor_value(self):
     predictions = [[0.1, 0.3, 0.2, 0.4],
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index c7f32baa2d55f80276c31b63e92f4862394d2d69..9d02db6fc839309b1e96bbe2ead4bb3ea815976f 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -1,10 +1,8 @@
 # Description:
 #   Wrap NVIDIA (https://github.com/NVIDIA/nccl) NCCL with tensorflow ops.
 #   APIs are meant to change over time.
-package(
-    default_visibility = ["//visibility:private"],
-    features = ["-parse_headers"],
-)
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -12,13 +10,15 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cuda_cc_test",
     "tf_custom_op_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
 )
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_custom_op_library(
     name = "python/ops/_nccl_ops.so",
@@ -34,6 +34,50 @@ tf_custom_op_library(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "nccl_manager_test",
+    size = "medium",
+    srcs = if_cuda(
+        [
+            "kernels/nccl_manager.cc",
+            "kernels/nccl_manager.h",
+            "kernels/nccl_manager_test.cc",
+        ],
+        [],
+    ),
+    # Disabled on jenkins until errors finding nvmlShutdown are found.
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = if_cuda(
+        [
+            "@nccl_archive//:nccl",
+            "//tensorflow/core:cuda",
+        ],
+        [],
+    ) + [
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+tf_kernel_library(
+    name = "nccl_kernels",
+    srcs = [
+        "kernels/nccl_manager.cc",
+        "kernels/nccl_manager.h",
+        "kernels/nccl_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_headers_lib",
+        "@nccl_archive//:nccl",
+    ],
+    alwayslink = 1,
+)
+
 tf_gen_op_libs(
     op_lib_names = ["nccl_ops"],
     deps = [
@@ -46,15 +90,19 @@ tf_gen_op_wrapper_py(
     deps = [":nccl_ops_op_lib"],
 )
 
-py_library(
+tf_custom_op_py_library(
     name = "nccl_py",
     srcs = [
         "__init__.py",
         "python/ops/nccl_ops.py",
     ],
-    data = [
+    dso = [
         ":python/ops/_nccl_ops.so",
     ],
+    kernels = [
+        ":nccl_kernels",
+        ":nccl_ops_op_lib",
+    ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
@@ -76,35 +124,10 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    # Disabled on jenkins until errors finding nvmlShutdown are found.
     tags = [
         "manual",
-        "requires_cudnn5",
-    ],
-)
-
-tf_cuda_cc_test(
-    name = "nccl_manager_test",
-    size = "medium",
-    srcs = if_cuda(
-        [
-            "kernels/nccl_manager.cc",
-            "kernels/nccl_manager.h",
-            "kernels/nccl_manager_test.cc",
-        ],
-        [],
-    ),
-    tags = ["manual"],  # Disabled until errors finding nvmlShutdown are found.
-    deps = if_cuda(
-        [
-            "@nccl_archive//:nccl",
-            "//tensorflow/core",
-            "//tensorflow/core:cuda",
-        ],
-        [],
-    ) + [
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
+        "notap",
     ],
 )
 
diff --git a/tensorflow/contrib/nccl/__init__.py b/tensorflow/contrib/nccl/__init__.py
index 0275ed6079825e9c13bbe265077966c9e3231557..d851c522c03d5da93544f8f0f9439affbd566c33 100644
--- a/tensorflow/contrib/nccl/__init__.py
+++ b/tensorflow/contrib/nccl/__init__.py
@@ -12,13 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Ops for nccl AllReduce."""
+"""Functions for using NVIDIA nccl collective ops.
+
+@@all_max
+@@all_min
+@@all_prod
+@@all_sum
+@@broadcast
+
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.nccl.python.ops.nccl_ops import *
-# pylint: enable=wildcard-import
+from tensorflow.contrib.nccl.python.ops.nccl_ops import all_max
+from tensorflow.contrib.nccl.python.ops.nccl_ops import all_min
+from tensorflow.contrib.nccl.python.ops.nccl_ops import all_prod
+from tensorflow.contrib.nccl.python.ops.nccl_ops import all_sum
+from tensorflow.contrib.nccl.python.ops.nccl_ops import broadcast
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index dfdfbc8eeab6c48642e7c8f19d7235150bbc6443..b289c91bb8ab3cfddcc1c0e25e0dccf4295fa160 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 
+#include <utility>
+
 #ifdef GOOGLE_CUDA
 
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -287,7 +289,7 @@ void NcclManager::AddBroadcastSend(
     const Tensor* in_t, DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
-                      executor, gpu_device_id, done_callback));
+                      executor, gpu_device_id, std::move(done_callback)));
   participant->root = true;
   AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
                  kBroadcast, ncclSum /* unused */);
@@ -300,7 +302,7 @@ void NcclManager::AddBroadcastRecv(
     Tensor* out_t, DoneCallback done_callback) {
   std::unique_ptr<Participant> participant(
       new Participant(nullptr /* in_t */, out_t, event_mgr, tensor_stream,
-                      executor, gpu_device_id, done_callback));
+                      executor, gpu_device_id, std::move(done_callback)));
   AddParticipant(num_devices, key, std::move(participant), out_t->dtype(),
                  kBroadcast, ncclSum /* unused */);
 }
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 2173e13b91fedd4b873f9391e9c988a4d3c327d0..1b3351713dcf6471df7574fe7058019a7081204d 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -8,14 +8,18 @@ exports_files(["LICENSE"])
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/delay_compensated_gradient_descent.py",
+        "python/training/drop_stale_gradient_optimizer.py",
         "python/training/external_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
         "python/training/moving_average_optimizer.py",
+        "python/training/nadam_optimizer.py",
         "python/training/variable_clipping_optimizer.py",
     ],
     srcs_version = "PY2AND3",
@@ -34,6 +38,24 @@ py_library(
     ],
 )
 
+py_test(
+    name = "delay_compensated_gradient_descent_test",
+    srcs = ["python/training/delay_compensated_gradient_descent_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:extra_py_tests_deps",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "external_optimizer_test",
     srcs = ["python/training/external_optimizer_test.py"],
@@ -104,6 +126,39 @@ py_test(
     ],
 )
 
+py_test(
+    name = "nadam_optimizer_test",
+    srcs = ["python/training/nadam_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_py_test(
+    name = "drop_stale_gradient_optimizer_test",
+    srcs = ["python/training/drop_stale_gradient_optimizer_test.py"],
+    additional_deps = [
+        ":opt_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index f88976a43813ad58415d22e3af1736c310bfa204..f4cb7456ccc5a261908ca7e87b3284eaa9c34d46 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -19,18 +19,24 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.delay_compensated_gradient_descent import *
+from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
+from tensorflow.contrib.opt.python.training.nadam_optimizer import *
 from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
+from tensorflow.contrib.opt.python.training.nadam_optimizer import *
 from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['ExternalOptimizerInterface',
-                    'LazyAdamOptimizer',
-                    'MovingAverageOptimizer',
-                    'ScipyOptimizerInterface',
-                    'VariableClippingOptimizer']
+
+_allowed_symbols = [
+    'DelayCompensatedGradientDescentOptimizer',
+    'DropStaleGradientOptimizer', 'ExternalOptimizerInterface',
+    'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer',
+    'ScipyOptimizerInterface', 'VariableClippingOptimizer'
+]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent.py b/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a5e67ef68e23c78b8860929509e3e4dc2b9e515
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent.py
@@ -0,0 +1,256 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""DelayCompensatedGradientDescentOptimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class _RefVariableAsynchronousProcessor(optimizer._RefVariableProcessor):
+  """Processor for Variable."""
+  def update_op_asynchronous(self, optimizer, g, index):
+    if isinstance(g, ops.Tensor):
+      return optimizer._apply_dense(g, self._v, index)
+    else:
+      assert isinstance(g, ops.IndexedSlices), ("Gradient ", g, " is neither a "
+                                                "tensor nor IndexedSlices.")
+      # pylint: disable=protected-access
+      return optimizer._apply_sparse_duplicate_indices(g, self._v, index)
+
+
+class _DenseResourceVariableAsynchronousProcessor(optimizer._DenseResourceVariableProcessor):
+  """Processor for dense ResourceVariables."""
+  def update_op_asynchronous(self, optimizer, g, index):
+    # pylint: disable=protected-access
+    if isinstance(g, ops.IndexedSlices):
+      return optimizer._resource_apply_sparse_duplicate_indices(
+        g.values, self._v, g.indices, index)
+    return optimizer._resource_apply_dense(g, self._v, index)
+
+
+def _get_processor(v):
+  """The processor of v."""
+  if v.op.type == "VarHandleOp":
+    return _DenseResourceVariableAsynchronousProcessor(v)
+  if isinstance(v, variables.Variable):
+    return _RefVariableAsynchronousProcessor(v)
+  raise NotImplementedError("Trying to optimize unsupported type ", v)
+
+
+class DelayCompensatedGradientDescentOptimizer(optimizer.Optimizer):
+  """Optimizer that implements gradient descent with delay compensation.
+
+  See [Zheng, Shuxin, et al., 2016](https://arxiv.org/abs/1609.08326)
+  ([pdf](https://arxiv.org/pdf/1609.08326.pdf)).
+  """
+
+  def __init__(self, learning_rate, variance_parameter, num_workers=1,
+               use_locking=False, name="DelayCompensatedGradientDescent"):
+    """Construct a new gradient descent optimizer with delay compensation.
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning
+        rate to use.
+      variance_parameter: A Tensor or a floating point value. The lambda
+        value to use.
+      num_workers: A value to indicate number of workers computing gradients
+        asynchronously.
+      use_locking: If True use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "DelayCompensatedGradientDescent".
+      """
+    if num_workers <= 0:
+      raise ValueError("num_workers must be positive: %s" % num_workers)
+    super(DelayCompensatedGradientDescentOptimizer, self).__init__(
+          use_locking, name)
+    self._learning_rate = learning_rate
+    self._lambda = variance_parameter
+    self._num_workers = num_workers
+
+  def minimize(self, loss, global_step=None, var_list=None,
+               gate_gradients=optimizer.Optimizer.GATE_OP, aggregation_method=None,
+               colocate_gradients_with_ops=False, name=None,
+               grad_loss=None, worker_index=None):
+    """Add operations to minimize `loss` by updating `var_list`.
+
+    This method simply combines calls `compute_gradients()` and
+    `apply_gradients()`. If you want to process the gradient before applying
+    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    of using this function.
+
+    Args:
+      loss: A `Tensor` containing the value to minimize.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      var_list: Optional list or tuple of `Variable` objects to update to
+        minimize `loss`.  Defaults to the list of variables collected in
+        the graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or  `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      name: Optional name for the returned operation.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      worker_index: Optional. A value to indicate the instance of worker
+        minimizing if computing asynchronously.
+
+    Returns:
+      An Operation that updates the variables in `var_list`.  If `global_step`
+      was not `None`, that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If some of the variables are not `Variable` objects.
+    """
+    if (worker_index < 0 and worker_index is not None) or worker_index >= self._num_workers:
+      raise ValueError("worker index must be in the range [0, num_workers): %s" %
+                        worker_index)
+    grads_and_vars = self.compute_gradients(
+        loss, var_list=var_list, gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss)
+
+    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
+    if not vars_with_grad:
+      raise ValueError(
+          "No gradients provided for any variable, check your graph for ops"
+          " that do not support gradients, between variables %s and loss %s." %
+          ([str(v) for _, v in grads_and_vars], loss))
+
+    return self.apply_gradients(grads_and_vars, global_step=global_step,
+                                name=name, worker_index=worker_index)
+
+  def apply_gradients(self,
+                      grads_and_vars,
+                      global_step=None,
+                      name=None,
+                      worker_index=None):
+    """Apply gradients to variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+      worker_index: Optional value to indicate the instance of worker
+        minimizing if computing asynchronously.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    # This is a default implementation of apply_gradients() that can be shared
+    # by most optimizers.  It relies on the subclass implementing the following
+    # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
+
+    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
+    if not grads_and_vars:
+      raise ValueError("No variables provided.")
+    converted_grads_and_vars = []
+    for g, v in grads_and_vars:
+      if g is not None:
+        try:
+          # Convert the grad to Tensor or IndexedSlices if necessary.
+          g = ops.convert_to_tensor_or_indexed_slices(g)
+        except TypeError:
+          raise TypeError(
+              "Gradient must be convertible to a Tensor"
+              " or IndexedSlices, or None: %s" % g)
+        if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
+          raise TypeError(
+              "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+      p = _get_processor(v)
+      converted_grads_and_vars.append((g, v, p))
+
+    converted_grads_and_vars = tuple(converted_grads_and_vars)
+    var_list = [v for g, v, _ in converted_grads_and_vars if g is not None]
+    if not var_list:
+      raise ValueError("No gradients provided for any variable: %s." %
+                       ([str(v) for _, _, v in converted_grads_and_vars],))
+    with ops.control_dependencies(None):
+      self._create_slots([optimizer._get_variable_for(v) for v in var_list])
+    update_ops = []
+    with ops.name_scope(name, self._name) as name:
+      self._prepare()
+      for grad, var, processor in converted_grads_and_vars:
+        if grad is None:
+          continue
+        # We colocate all ops created in _apply_dense or _apply_sparse
+        # on the same device as the variable.
+        with ops.name_scope("update_" + var.op.name), ops.colocate_with(var):
+          if worker_index is None:
+            update_ops.append(processor.update_op(self, grad))
+          else:
+            update_ops.append(processor.update_op_asynchronous(self, grad,
+                                                               worker_index))
+      if global_step is None:
+        apply_updates = self._finish(update_ops, name)
+      else:
+        with ops.control_dependencies([self._finish(update_ops, "update")]):
+          with ops.colocate_with(global_step):
+            apply_updates = state_ops.assign_add(global_step, 1, name=name).op
+
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      if apply_updates not in train_op:
+        train_op.append(apply_updates)
+
+      return apply_updates
+
+  def _create_slots(self, var_list):
+    """Initialize slots for all the vars of each worker to store
+        the previous values of it
+    """
+    for index in range(self._num_workers):
+      for v in var_list:
+        var2 = array_ops.identity(v.initialized_value())
+        self._get_or_make_slot(v, var2, "shadow_{0}".format(index),
+                               self._name)
+
+  def _resource_apply_dense(self, grad, var, worker_index=0):
+    # Get previous value of the variable from the slot
+    shadow = self.get_slot(var, "shadow_{0}".format(worker_index))
+    return training_ops.apply_delay_compensated_gradient_descent(
+        var.handle,
+        math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
+        grad,
+        math_ops.cast(self._lambda_tensor, grad.dtype.base_dtype),
+        shadow.handle,
+        use_locking=self._use_locking)
+
+  def _prepare(self):
+    self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
+                                                       name="learning_rate")
+    self._lambda_tensor = ops.convert_to_tensor(self._lambda,
+                                                name="lambda")
diff --git a/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent_test.py b/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dbd8416a087fb3ccbd7c90e4146d72fc24d6dcb
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/delay_compensated_gradient_descent_test.py
@@ -0,0 +1,132 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for DelayCompensatedGradientDescentOptimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.contrib.opt.python.training import delay_compensated_gradient_descent
+
+
+class DelayCompensatedGradientDescentOptimizerTest(test.TestCase):
+
+  def testBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        optimizer = (delay_compensated_gradient_descent.
+                     DelayCompensatedGradientDescentOptimizer)(
+                         learning_rate=3.0,
+                         variance_parameter=2.0,
+                         num_workers=1)
+        sgd_op = optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]), worker_index=0)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], var0.eval())
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], var1.eval())
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = constant_op.constant(3.0)
+        optimizer = (delay_compensated_gradient_descent.
+                     DelayCompensatedGradientDescentOptimizer)(
+                         learning_rate=3.0,
+                         variance_parameter=2.0,
+                         num_workers=1)
+        sgd_op = optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]), worker_index=0)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], var0.eval())
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], var1.eval())
+
+    def testGradWrtRef(self):
+      for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+        with self.test_session():
+          optimizer = (delay_compensated_gradient_descent.
+                       DelayCompensatedGradientDescentOptimizer)(
+                           learning_rate=3.0,
+                           variance_parameter=2.0,
+                           num_workers=1)
+          values = [1.0, 3.0]
+          vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+          grads_and_vars = optimizer.compute_gradients(
+              vars_[0] + vars_[1], vars_)
+          variables.global_variables_initializer().run()
+          for grad, _ in grads_and_vars:
+            self.assertAllCloseAccordingToType([1.0], grad.eval())
+
+  def testWithGlobalStep(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        global_step = variables.Variable(0, trainable=False)
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        optimizer = (delay_compensated_gradient_descent.
+                     DelayCompensatedGradientDescentOptimizer)(
+                         learning_rate=3.0,
+                         variance_parameter=2.0,
+                         num_workers=1)
+        sgd_op = optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]),
+            global_step=global_step,
+            worker_index=0)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params and global_step
+        self.assertAllCloseAccordingToType(
+            [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], var0.eval())
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], var1.eval())
+        self.assertAllCloseAccordingToType(1, global_step.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f20c172ee376d0a808a21fe96bec80367bf2e9f4
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer.py
@@ -0,0 +1,113 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Wrapper optimizer for checking and dropping stale gradients."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_util
+
+
+class DropStaleGradientOptimizer(optimizer.Optimizer):
+  """Wrapper optimizer that checks and drops stale gradient.
+
+  This optimizer records the global step for each worker before computing
+  gradients and compares it with the global step at the time of applying the
+  gradients. If the difference is larger than a threshold, it will drop all
+  the computed gradients.
+  """
+
+  def __init__(self,
+               opt,
+               staleness,
+               use_locking=False,
+               name="DropStaleGradient"):
+    """Constructs a new DropStaleGradientOptimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to compute and apply the
+           gradients. Must be one of the Optimizer classes.
+      staleness: The maximum staleness allowed for the optimizer.
+      use_locking: If `True` use locks for clip update operations.
+      name: Optional name prefix for the operations created when applying
+            gradients. Defaults to "DropStaleGradient".
+    """
+    super(DropStaleGradientOptimizer, self).__init__(use_locking, name)
+    self._opt = opt
+    self._staleness = staleness
+
+  def compute_gradients(self, loss, *args, **kwargs):
+    # Record current global step for worker.
+    with ops.colocate_with(loss):
+      self._local_step = training_util.get_global_step() + 0
+
+    with ops.control_dependencies([self._local_step]):
+      loss = gen_array_ops.identity(loss)
+      return self._opt.compute_gradients(loss, *args, **kwargs)
+
+  def get_slot(self, *args, **kwargs):
+    return self._opt.get_slot(*args, **kwargs)
+
+  def get_slot_names(self, *args, **kwargs):
+    return self._opt.get_slot_names(*args, **kwargs)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    gradients = []
+    # Number of stale gradients.
+    stale_counter = variable_scope.get_variable(
+        "stale_counter", [],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False)
+
+    def _AcceptGradientOp():
+      with ops.control_dependencies(
+          [self._opt.apply_gradients(
+              grads_and_vars, global_step=global_step, name=name)]):
+        return gen_array_ops.identity(0.0)
+
+    def _DropGradientOp():
+      return gen_array_ops.identity(1.0)
+
+    for grad_and_var in grads_and_vars:
+      grad = grad_and_var[0]
+      if isinstance(grad, ops.Tensor):
+        gradients.append(grad)
+      elif grad is not None:
+        gradients.append(grad.op)
+
+    with ops.control_dependencies(gradients), ops.colocate_with(global_step):
+      staleness = gen_array_ops.reshape(
+          global_step - self._local_step, shape=())
+
+    conditional_update = stale_counter.assign_add(control_flow_ops.cond(
+        gen_math_ops.less_equal(staleness, self._staleness),
+        _AcceptGradientOp, _DropGradientOp))
+
+    summary.scalar(
+        "Gradient staleness percentage",
+        stale_counter / (math_ops.cast(global_step + 1, dtypes.float32)))
+    return conditional_update
diff --git a/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..53232082e16fa76db0befb3cdc1e6579f998a7b5
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/drop_stale_gradient_optimizer_test.py
@@ -0,0 +1,297 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DropStaleGradientOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+
+from tensorflow.contrib.opt.python.training import drop_stale_gradient_optimizer
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training_util
+
+
+# Creates the workers and return their sessions, graphs, train_ops.
+def _get_workers(num_workers, staleness):
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  cluster_dict = {
+      'worker': ['localhost:%s' % port for port in worker_ports],
+      'ps': ['localhost:%s' % portpicker.pick_unused_port()]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+  workers = [
+      server_lib.Server(
+          cs, job_name='worker', task_index=ix, start=True)
+      for ix in range(num_workers)
+  ]
+  server_lib.Server(cs, job_name='ps', task_index=0, start=True)
+
+  sessions = []
+  graphs = []
+  train_ops = []
+
+  # To simulate stale cases, maintaining two queues for computing and
+  # applying gradients respectively. In the phase of computing gradients,
+  # all workers except chief worker compute gradients together and chief worker
+  # computes after all other worers' computing finished. In the phase of
+  # applying gradients, chief worker will first apply gradients, then all other
+  # workers will apply gradients one by one. Therefore, the chief worker will
+  # always have 0 staleness, each of all other workers will have a unique
+  # staleness value from [1, num_workers).
+  for worker_id in range(num_workers):
+    graph = ops.Graph()
+    with graph.as_default():
+      global_step = training_util.create_global_step()
+      var_0 = variables.Variable(0.0, name='v0')
+      var_1 = variables.Variable(1.0, name='v1')
+      compute_gradients_queue = data_flow_ops.FIFOQueue(
+          -1, global_step.dtype.base_dtype, shapes=(),
+          name='compute_gradients_queue', shared_name='compute_gradients_queue')
+      apply_gradients_queue = data_flow_ops.FIFOQueue(
+          -1, global_step.dtype.base_dtype, shapes=(),
+          name='apply_gradients_queue', shared_name='apply_gradients_queue')
+
+      # Gradients for loss on var_0 and var_1 will be 1.0.
+      loss = 0 - var_0 - var_1
+      sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+      stale_check_opt = (
+          drop_stale_gradient_optimizer.DropStaleGradientOptimizer(
+              sgd_opt, staleness))
+
+      # Compute gradients.
+      if worker_id == 0:
+        with ops.control_dependencies(
+            [compute_gradients_queue.dequeue_many(num_workers - 1)]):
+          grad_and_vars = stale_check_opt.compute_gradients(loss)
+      else:
+        grad_and_vars = stale_check_opt.compute_gradients(loss)
+        with ops.control_dependencies([t[0] for t in grad_and_vars]):
+          worker_enqueue_op = compute_gradients_queue.enqueue(global_step)
+
+      # Apply gradients.
+      if worker_id == 0:
+        with ops.control_dependencies(
+            [stale_check_opt.apply_gradients(grad_and_vars, global_step)]):
+          train_op = apply_gradients_queue.enqueue(global_step)
+      else:
+        with ops.control_dependencies([worker_enqueue_op]):
+          with ops.control_dependencies([apply_gradients_queue.dequeue()]):
+            with ops.control_dependencies(
+                [stale_check_opt.apply_gradients(
+                    grad_and_vars, global_step)]):
+              train_op = apply_gradients_queue.enqueue(global_step)
+
+      sess = session.Session(workers[worker_id].target)
+
+    sessions.append(sess)
+    graphs.append(graph)
+    train_ops.append(train_op)
+
+  return sessions, graphs, train_ops
+
+
+class DropStaleGradientOptimizerTest(test.TestCase):
+
+  def _run(self, train_op, sess):
+    sess.run(train_op)
+
+  def test1Worker(self):
+    num_workers = 1
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    # Verify the updated value after 1 step.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+  def test1WorkerNegativeStaleness(self):
+    num_workers = 1
+    sessions, graphs, train_ops = _get_workers(num_workers, -1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    # Verify no updates because max staleness is negative.
+    self.assertAllEqual(0, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+
+  def test2WorkersStaleness0(self):
+    num_workers = 2
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_0.start()
+    thread_1.start()
+    thread_0.join()
+    thread_1.join()
+
+    # With 2 workers and max staleness set to 0, only chief worker will update
+    # var_0 and var_1.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+
+  def test2WorkersStaleness1(self):
+    num_workers = 2
+    sessions, graphs, train_ops = _get_workers(num_workers, 1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_0.start()
+    thread_1.start()
+    thread_0.join()
+    thread_1.join()
+
+    # With 2 workers and max staleness set to 1, both workers will update
+    # var_0 and var_1.
+    self.assertAllEqual(2, sessions[0].run(global_step))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
+
+  def test3WorkersStaleness0(self):
+    num_workers = 3
+    sessions, graphs, train_ops = _get_workers(num_workers, 0)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_2 = self.checkedThread(
+        target=self._run, args=(train_ops[2], sessions[2]))
+    thread_0.start()
+    thread_1.start()
+    thread_2.start()
+    thread_0.join()
+    thread_1.join()
+    thread_2.join()
+
+    # With 3 workers and max staleness set to 0, only chief worker will update
+    # var_0 and var_1.
+    self.assertAllEqual(1, sessions[0].run(global_step))
+    self.assertAllEqual(2.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 1.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 1.0, sessions[0].run(var_1))
+
+  def test3WorkersStaleness1(self):
+    num_workers = 3
+    sessions, graphs, train_ops = _get_workers(num_workers, 1)
+    with graphs[0].as_default():
+      sessions[0].run(variables.global_variables_initializer())
+    global_step = training_util.get_global_step(graphs[0])
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    stale_counter = graphs[0].get_tensor_by_name('stale_counter:0')
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    thread_0 = self.checkedThread(
+        target=self._run, args=(train_ops[0], sessions[0]))
+    thread_1 = self.checkedThread(
+        target=self._run, args=(train_ops[1], sessions[1]))
+    thread_2 = self.checkedThread(
+        target=self._run, args=(train_ops[2], sessions[2]))
+    thread_0.start()
+    thread_1.start()
+    thread_2.start()
+    thread_0.join()
+    thread_1.join()
+    thread_2.join()
+
+    # With 3 workers and max staleness set to 1, chief worker and only one of
+    # the two other workers will update var_0 and var_1.
+    self.assertAllEqual(2, sessions[0].run(global_step))
+    self.assertAllEqual(1.0, sessions[0].run(stale_counter))
+    self.assertAllEqual(0.0 + 2.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0 + 2.0, sessions[0].run(var_1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index db04cd25607779e7ed0faa720e8bc00c4c5786e3..0909760b383d3b810f4f208763b3c10d3e902ee6 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -99,8 +99,13 @@ class ExternalOptimizerInterface(object):
         slice(start, end) for start, end in zip(accumulated_dims[:-1],
                                                 accumulated_dims[1:])]
 
-  def minimize(self, session=None, feed_dict=None, fetches=None,
-               step_callback=None, loss_callback=None, **run_kwargs):
+  def minimize(self,
+               session=None,
+               feed_dict=None,
+               fetches=None,
+               step_callback=None,
+               loss_callback=None,
+               **run_kwargs):
     """Minimize a scalar `Tensor`.
 
     Variables subject to optimization are updated in-place at the end of
@@ -120,7 +125,7 @@ class ExternalOptimizerInterface(object):
         flattened into a single vector.
       loss_callback: A function to be called every time the loss and gradients
         are computed, with evaluated fetches supplied as positional arguments.
-      run_kwargs: kwargs to pass to `session.run`.
+      **run_kwargs: kwargs to pass to `session.run`.
     """
     session = session or ops.get_default_session()
     feed_dict = feed_dict or {}
@@ -161,9 +166,10 @@ class ExternalOptimizerInterface(object):
                 for packing_slice in self._packing_slices]
 
     # Set optimization variables to their new values.
-    session.run(self._var_updates,
-                feed_dict=dict(zip(self._update_placeholders, var_vals)),
-                **run_kwargs)
+    session.run(
+        self._var_updates,
+        feed_dict=dict(zip(self._update_placeholders, var_vals)),
+        **run_kwargs)
 
   def _minimize(self, initial_val, loss_grad_func, equality_funcs,
                 equality_grad_funcs, inequality_funcs, inequality_grad_funcs,
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4421ecfe6b0af9759c6aaa51d644f1211965b6a
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
@@ -0,0 +1,93 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Nadam for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import training_ops
+
+
+class NadamOptimizer(adam.AdamOptimizer):
+  """Optimizer that implements the Nadam algorithm.
+
+  See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
+  """
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    return training_ops.apply_adam(
+        var,
+        m,
+        v,
+        math_ops.cast(self._beta1_power, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking,
+        use_nesterov=True).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    return training_ops.resource_apply_adam(
+        var.handle,
+        m.handle,
+        v.handle,
+        math_ops.cast(self._beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking,
+        use_nesterov=True)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
+    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta1_t)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+      # m_bar = (1 - beta1) * g_t + beta1 * m_t
+      m_bar = m_scaled_g_values + beta1_t * m_t
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(
+        var, lr * m_bar / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_bar, v_t])
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0a257d264f83ae0a54cdc0e9265d6e7098b7b56
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
@@ -0,0 +1,159 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Nadam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import nadam_optimizer
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def nadam_update_numpy(param,
+                       g_t,
+                       t,
+                       m,
+                       v,
+                       alpha=0.001,
+                       beta1=0.9,
+                       beta2=0.999,
+                       epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  m_bar = (1 - beta1) * g_t + beta1 * m_t
+
+  param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class NadamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = nadam_optimizer.NadamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Nadam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def doTestBasic(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = nadam_optimizer.NadamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Nadam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic(use_resource=False)
+
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 89665af2a9b25aaa32fd15ada0e4ca225c617576..9d67563eddd47b19f404ed589002db2fe5de467f 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -51,6 +51,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:partitioned_variables",
@@ -303,6 +304,7 @@ filegroup(
         exclude = [
             "**/METADATA",
             "**/OWNERS",
+            "tools/**",
         ],
     ),
     visibility = ["//tensorflow:__subpackages__"],
@@ -350,3 +352,27 @@ tf_kernel_library(
         "//third_party/eigen3",
     ],
 )
+
+py_binary(
+    name = "checkpoint_convert",
+    srcs = ["python/tools/checkpoint_convert.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_test(
+    name = "checkpoint_convert_test",
+    size = "small",
+    srcs = ["python/tools/checkpoint_convert_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":checkpoint_convert",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index ce1ed7f491b27ac13de7c1052481aef2ca04a22d..2420c3e179b73ac52ad6222bb9944acbef156971 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -42,7 +42,11 @@ See @{$python/contrib.rnn} guide.
 @@GridLSTMCell
 @@BidirectionalGridLSTMCell
 @@NASCell
+@@UGRNNCell
+@@IntersectionRNNCell
 @@PhasedLSTMCell
+@@HighwayWrapper
+@@GLSTMCell
 
 ### RNNCell wrappers
 @@AttentionCellWrapper
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops.cc b/tensorflow/contrib/rnn/ops/lstm_ops.cc
index 2de40825c906e14257872e87342a085e8939796b..699cc6c88a4634334b2621a7f48cbbeae1dc9a45 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops.cc
@@ -78,7 +78,7 @@ ci = tanh(ci)
 cs = ci .* i + cs_prev .* f
 cs = clip(cs, cell_clip)
 
-o = sigmoid(cs * wco + f)
+o = sigmoid(cs * wco + o)
 co = tanh(cs)
 h = co .* o
 ```
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
index 7c4d003b833b39aded2f8002cf0a864331691e4f..544cd163c50062093acf7f5e942f67606936c0e3 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops_test.cc
@@ -37,7 +37,7 @@ class LSTMOpsTest : public ::testing::Test {
   }
 };
 
-static string JoinedCopies(string s, int copies) {
+static string JoinedCopies(const string& s, int copies) {
   string res;
   for (int i = 0; i < copies; ++i) {
     strings::StrAppend(&res, i > 0 ? ";" : "", s);
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 697f803b24e2ac2b10efe85fccd23d557454c141..0f207b088d36a2ac585742e5ff0def7e30b82ad2 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -27,6 +27,7 @@ import numpy as np
 
 from tensorflow.contrib.rnn.python.ops import core_rnn_cell_impl
 from tensorflow.contrib.rnn.python.ops.core_rnn_cell_impl import _linear as linear
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -73,7 +74,41 @@ class RNNCellTest(test.TestCase):
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 2])
-        g, _ = core_rnn_cell_impl.BasicRNNCell(2)(x, m)
+        cell = core_rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertEqual(
+            ["root/basic_rnn_cell/%s:0"
+             % core_rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+             "root/basic_rnn_cell/%s:0"
+             % core_rnn_cell_impl._BIAS_VARIABLE_NAME],
+            [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [g], {x.name: np.array([[1., 1.]]),
+                  m.name: np.array([[0.1, 0.1]])})
+        self.assertEqual(res[0].shape, (1, 2))
+
+  def testBasicRNNCellNotTrainable(self):
+    with self.test_session() as sess:
+      def not_trainable_getter(getter, *args, **kwargs):
+        kwargs["trainable"] = False
+        return getter(*args, **kwargs)
+
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5),
+          custom_getter=not_trainable_getter):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = core_rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertFalse(cell.trainable_variables)
+        self.assertEqual(
+            ["root/basic_rnn_cell/%s:0"
+             % core_rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+             "root/basic_rnn_cell/%s:0"
+             % core_rnn_cell_impl._BIAS_VARIABLE_NAME],
+            [v.name for v in cell.non_trainable_variables])
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(
             [g], {x.name: np.array([[1., 1.]]),
@@ -113,10 +148,23 @@ class RNNCellTest(test.TestCase):
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 8])
-        g, out_m = core_rnn_cell_impl.MultiRNNCell(
+        cell = core_rnn_cell_impl.MultiRNNCell(
             [core_rnn_cell_impl.BasicLSTMCell(
                 2, state_is_tuple=False) for _ in range(2)],
-            state_is_tuple=False)(x, m)
+            state_is_tuple=False)
+        g, out_m = cell(x, m)
+        expected_variable_names = [
+            "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0"
+            % core_rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0"
+            % core_rnn_cell_impl._BIAS_VARIABLE_NAME,
+            "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0"
+            % core_rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0"
+            % core_rnn_cell_impl._BIAS_VARIABLE_NAME]
+        self.assertEqual(
+            expected_variable_names, [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(
             [g, out_m],
@@ -124,15 +172,7 @@ class RNNCellTest(test.TestCase):
              m.name: 0.1 * np.ones([1, 8])})
         self.assertEqual(len(res), 2)
         variables = variables_lib.global_variables()
-        self.assertEqual(4, len(variables))
-        self.assertEquals(variables[0].op.name,
-                          "root/multi_rnn_cell/cell_0/basic_lstm_cell/weights")
-        self.assertEquals(variables[1].op.name,
-                          "root/multi_rnn_cell/cell_0/basic_lstm_cell/biases")
-        self.assertEquals(variables[2].op.name,
-                          "root/multi_rnn_cell/cell_1/basic_lstm_cell/weights")
-        self.assertEquals(variables[3].op.name,
-                          "root/multi_rnn_cell/cell_1/basic_lstm_cell/biases")
+        self.assertEqual(expected_variable_names, [v.name for v in variables])
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
         expected_mem = np.array([[
@@ -154,6 +194,44 @@ class RNNCellTest(test.TestCase):
              m.name: 0.1 * np.ones([1, 4])})
         self.assertEqual(len(res), 2)
 
+  def testBasicLSTMCellDimension0Error(self):
+    """Tests that dimension 0 in both(x and m) shape must be equal."""
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size - 1, state_size])
+        with self.assertRaises(ValueError):
+          g, out_m = core_rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run([g, out_m],
+                   {x.name: 1 * np.ones([batch_size, input_size]),
+               m.name: 0.1 * np.ones([batch_size - 1, state_size])})
+
+  def testBasicLSTMCellStateSizeError(self):
+    """Tests that state_size must be num_units * 2."""
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 3 # state_size must be num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        with self.assertRaises(ValueError):
+          g, out_m = core_rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run([g, out_m],
+                   {x.name: 1 * np.ones([batch_size, input_size]),
+                    m.name: 0.1 * np.ones([batch_size, state_size])})
+
   def testBasicLSTMCellStateTupleType(self):
     with self.test_session():
       with variable_scope.variable_scope(
@@ -269,10 +347,10 @@ class RNNCellTest(test.TestCase):
             state_is_tuple=False)
         cell(x, m)  # Execute to create variables
       variables = variables_lib.global_variables()
-      self.assertEquals(variables[0].op.name, "root/lstm_cell/weights")
-      self.assertEquals(variables[1].op.name, "root/lstm_cell/biases")
+      self.assertEquals(variables[0].op.name, "root/lstm_cell/kernel")
+      self.assertEquals(variables[1].op.name, "root/lstm_cell/bias")
       self.assertEquals(variables[2].op.name,
-                        "root/lstm_cell/projection/weights")
+                        "root/lstm_cell/projection/kernel")
 
   def testOutputProjectionWrapper(self):
     with self.test_session() as sess:
@@ -340,28 +418,56 @@ class RNNCellTest(test.TestCase):
       outputs, _ = cell(x, m)
       self.assertTrue("cpu:14159" in outputs.device.lower())
 
-  def testUsingSecondCellInScopeWithExistingVariablesFails(self):
-    # This test should go away when this behavior is no longer an
-    # error (Approx. May 2017)
-    cell1 = core_rnn_cell_impl.LSTMCell(3)
-    cell2 = core_rnn_cell_impl.LSTMCell(3)
-    x = array_ops.zeros([1, 3])
-    m = core_rnn_cell_impl.LSTMStateTuple(*[array_ops.zeros([1, 3])] * 2)
-    cell1(x, m)
-    with self.assertRaisesRegexp(ValueError, r"LSTMCell\(..., reuse=True\)"):
-      cell2(x, m)
-
-  def testUsingCellInDifferentScopeFromFirstCallFails(self):
-    # This test should go away when this behavior is no longer an
-    # error (Approx. May 2017)
-    cell = core_rnn_cell_impl.LSTMCell(3)
-    x = array_ops.zeros([1, 3])
-    m = core_rnn_cell_impl.LSTMStateTuple(*[array_ops.zeros([1, 3])] * 2)
-    with variable_scope.variable_scope("scope1"):
-      cell(x, m)
-    with variable_scope.variable_scope("scope2"):
-      with self.assertRaisesRegexp(ValueError, r"Attempt to reuse RNNCell"):
-        cell(x, m)
+  def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
+    if not test.is_gpu_available():
+      # Can't perform this test w/o a GPU
+      return
+
+    with self.test_session(use_gpu=True) as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 1, 3])
+        cell = core_rnn_cell_impl.DeviceWrapper(
+            core_rnn_cell_impl.GRUCell(3), "/gpu:0")
+        with ops.device("/cpu:0"):
+          outputs, _ = rnn.dynamic_rnn(
+              cell=cell, inputs=x, dtype=dtypes.float32)
+        run_metadata = config_pb2.RunMetadata()
+        opts = config_pb2.RunOptions(
+            trace_level=config_pb2.RunOptions.FULL_TRACE)
+
+        sess.run([variables_lib.global_variables_initializer()])
+        _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
+
+      step_stats = run_metadata.step_stats
+      ix = 0 if "gpu" in step_stats.dev_stats[0].device else 1
+      gpu_stats = step_stats.dev_stats[ix].node_stats
+      cpu_stats = step_stats.dev_stats[1 - ix].node_stats
+      self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
+      self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
+
+  # def testUsingSecondCellInScopeWithExistingVariablesFails(self):
+  #   # This test should go away when this behavior is no longer an
+  #   # error (Approx. May 2017)
+  #   cell1 = core_rnn_cell_impl.LSTMCell(3)
+  #   cell2 = core_rnn_cell_impl.LSTMCell(3)
+  #   x = array_ops.zeros([1, 3])
+  #   m = core_rnn_cell_impl.LSTMStateTuple(*[array_ops.zeros([1, 3])] * 2)
+  #   cell1(x, m)
+  #   with self.assertRaisesRegexp(ValueError, r"LSTMCell\(..., reuse=True\)"):
+  #     cell2(x, m)
+
+  # def testUsingCellInDifferentScopeFromFirstCallFails(self):
+  #   # This test should go away when this behavior is no longer an
+  #   # error (Approx. May 2017)
+  #   cell = core_rnn_cell_impl.LSTMCell(3)
+  #   x = array_ops.zeros([1, 3])
+  #   m = core_rnn_cell_impl.LSTMStateTuple(*[array_ops.zeros([1, 3])] * 2)
+  #   with variable_scope.variable_scope("scope1"):
+  #     cell(x, m)
+  #   with variable_scope.variable_scope("scope2"):
+  #     with self.assertRaisesRegexp(ValueError, r"Attempt to reuse RNNCell"):
+  #       cell(x, m)
 
   def testEmbeddingWrapper(self):
     with self.test_session() as sess:
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 4358fe475fc76d17c5e53d4128b2ec9509ee5cc8..54e3a0dadf36b68473cce996aef266888aaece34 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -521,7 +521,7 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       sess.run(outputs, feed_dict={inputs[0]: input_value})
 
-  def testStateTupleWithProjAndSequenceLength(self):
+  def _testStateTupleWithProjAndSequenceLength(self):
     num_units = 3
     input_size = 5
     batch_size = 2
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
index 9a96d4e85600fa8045651f23fae5e51bfebdf118..3a5cbf604dc3220c337c0f3ff2fe4e90772b94d6 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
@@ -68,8 +68,8 @@ class LSTMBlockCellTest(test.TestCase):
         m3 = array_ops.zeros([1, 2])
         g, ((out_m0, out_m1),
             (out_m2, out_m3)) = core_rnn_cell_impl.MultiRNNCell(
-                [lstm_ops.LSTMBlockCell(2)] * 2, state_is_tuple=True)(x, (
-                    (m0, m1), (m2, m3)))
+                [lstm_ops.LSTMBlockCell(2) for _ in range(2)],
+                state_is_tuple=True)(x, ((m0, m1), (m2, m3)))
         sess.run([variables.global_variables_initializer()])
         res = sess.run([g, out_m0, out_m1, out_m2, out_m3], {
             x.name: np.array([[1., 1.]]),
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index d3af4de721127e66ea3f1ed97da153d6b076a132..334baa5f9c5a65a81fe17359186003dadf738be3 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -66,7 +66,7 @@ class RNNCellTest(test.TestCase):
         x = array_ops.zeros([batch_size, input_size])
         m = array_ops.zeros([batch_size, state_size])
         output, state = rnn_cell.CoupledInputForgetGateLSTMCell(
-            num_units=num_units, forget_bias=1.0)(x, m)
+            num_units=num_units, forget_bias=1.0, state_is_tuple=False)(x, m)
         sess.run([variables.global_variables_initializer()])
         res = sess.run([output, state], {
             x.name:
@@ -569,7 +569,7 @@ class RNNCellTest(test.TestCase):
               self.assertTrue(
                   float(np.linalg.norm((state[0, :] - state[i, :]))) > 1e-6)
 
-  def testAttentionCellWrapperCorrectResult(self):
+  def _testAttentionCellWrapperCorrectResult(self):
     num_units = 4
     attn_length = 6
     batch_size = 2
@@ -849,14 +849,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_state_c = np.array(
-          [[2.954548e-01, 8.354891e-04],
-           [2.834632e-01, 8.158963e-01],
-           [2.291694e-01, 1.325745e-04]],
+          [[0.00072015, 0.00036633], [0.00083481, 0.00047266],
+           [0.00085111, 0.00053054]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[2.116566e-01, 5.985238e-04],
-           [2.137760e-01, 6.153145e-01],
-           [1.742966e-01, 1.008306e-04]],
+          [[0.0005159, 0.00026243], [0.00062958, 0.00035646],
+           [0.00064732, 0.00040351]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
@@ -882,6 +880,88 @@ class RNNCellTest(test.TestCase):
         self.assertAllClose(res[1].c, expected_state_c)
         self.assertAllClose(res[1].h, expected_state_h)
 
+  def testHighwayWrapper(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "base_cell", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 3])
+        base_cell = core_rnn_cell_impl.GRUCell(3)
+        g, m_new = base_cell(x, m)
+      with variable_scope.variable_scope(
+          "hw_cell", initializer=init_ops.constant_initializer(0.5)):
+        hw_cell = rnn_cell.HighwayWrapper(
+            core_rnn_cell_impl.GRUCell(3), carry_bias_init=-100.0)
+        g_res, m_new_res = hw_cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+      res = sess.run([g, g_res, m_new, m_new_res], {
+          x: np.array([[1., 1., 1.]]),
+          m: np.array([[0.1, 0.1, 0.1]])
+      })
+      # As carry_bias_init is very negative, the carry gate is 'open' and the
+      # transform gate is 'closed'. This means the output equals the input.
+      self.assertAllClose(res[1], res[0])
+      # States are left untouched
+      self.assertAllClose(res[2], res[3])
+
+  def testGLSTMCell(self):
+    # Ensure that G-LSTM matches LSTM when number_of_groups = 1
+    batch_size = 2
+    num_units = 4
+    number_of_groups = 1
+
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root1", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.ones([batch_size, num_units])
+        # When number_of_groups = 1, G-LSTM is equivalent to regular LSTM
+        gcell = rnn_cell.GLSTMCell(num_units=num_units,
+                                   number_of_groups=number_of_groups)
+        cell = core_rnn_cell_impl.LSTMCell(num_units=num_units)
+        self.assertTrue(isinstance(gcell.state_size, tuple))
+        zero_state = gcell.zero_state(batch_size=batch_size,
+                                      dtype=dtypes.float32)
+        gh, gs = gcell(x, zero_state)
+        h, g = cell(x, zero_state)
+
+        sess.run([variables.global_variables_initializer()])
+        glstm_result = sess.run([gh, gs])
+        lstm_result = sess.run([h, g])
+
+        self.assertAllClose(glstm_result[0], lstm_result[0], 1e-5)
+        self.assertAllClose(glstm_result[1], lstm_result[1], 1e-5)
+
+    # Test that G-LSTM subgroup act like corresponding sub-LSTMs
+    batch_size = 2
+    num_units = 4
+    number_of_groups = 2
+
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root2", initializer=init_ops.constant_initializer(0.5)):
+        # input for G-LSTM with 2 groups
+        glstm_input = array_ops.ones([batch_size, num_units])
+        gcell = rnn_cell.GLSTMCell(num_units=num_units,
+                                   number_of_groups=number_of_groups)
+        gcell_zero_state = gcell.zero_state(batch_size=batch_size,
+                                            dtype=dtypes.float32)
+        gh, gs = gcell(glstm_input, gcell_zero_state)
+
+        # input for LSTM cell simulating single G-LSTM group
+        lstm_input = array_ops.ones([batch_size, num_units / number_of_groups])
+        # note division by number_of_groups. This cell one simulates G-LSTM group
+        cell = core_rnn_cell_impl.LSTMCell(num_units=
+                                           int(num_units / number_of_groups))
+        cell_zero_state = cell.zero_state(batch_size=batch_size,
+                                          dtype=dtypes.float32)
+        h, g = cell(lstm_input, cell_zero_state)
+
+        sess.run([variables.global_variables_initializer()])
+        [gh_res, h_res] = sess.run([gh, h])
+        self.assertAllClose(gh_res[:, 0:int(num_units / number_of_groups)],
+                            h_res, 1e-5)
+        self.assertAllClose(gh_res[:, int(num_units / number_of_groups):],
+                            h_res, 1e-5)
 
 class LayerNormBasicLSTMCellTest(test.TestCase):
 
diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn.py b/tensorflow/contrib/rnn/python/ops/core_rnn.py
index d254e717d5556b98161c2f66e1670233cfa53b4a..3ce075ce9c344eedd6018ec2ce400259f3a9aeff 100644
--- a/tensorflow/contrib/rnn/python/ops/core_rnn.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.rnn.python.ops import core_rnn_cell
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -31,7 +30,8 @@ from tensorflow.python.util import nest
 
 
 # pylint: disable=protected-access
-_state_size_with_prefix = rnn_cell_impl._state_size_with_prefix
+_concat = rnn_cell_impl._concat
+_like_rnncell = rnn_cell_impl._like_rnncell
 _infer_state_dtype = rnn._infer_state_dtype
 _reverse_seq = rnn._reverse_seq
 _rnn_step = rnn._rnn_step
@@ -99,7 +99,7 @@ def static_rnn(cell, inputs, initial_state=None, dtype=None,
       (column size) cannot be inferred from inputs via shape inference.
   """
 
-  if not isinstance(cell, core_rnn_cell.RNNCell):
+  if not _like_rnncell(cell):
     raise TypeError("cell must be an instance of RNNCell")
   if not nest.is_sequence(inputs):
     raise TypeError("inputs must be a sequence")
@@ -159,11 +159,10 @@ def static_rnn(cell, inputs, initial_state=None, dtype=None,
             "sequence_length must be a vector of length batch_size")
       def _create_zero_output(output_size):
         # convert int to TensorShape if necessary
-        size = _state_size_with_prefix(output_size, prefix=[batch_size])
+        size = _concat(batch_size, output_size)
         output = array_ops.zeros(
             array_ops.stack(size), _infer_state_dtype(dtype, state))
-        shape = _state_size_with_prefix(
-            output_size, prefix=[fixed_batch_size.value])
+        shape = _concat(fixed_batch_size.value, output_size, static=True)
         output.set_shape(tensor_shape.TensorShape(shape))
         return output
 
@@ -320,9 +319,9 @@ def static_bidirectional_rnn(cell_fw, cell_bw, inputs,
     ValueError: If inputs is None or an empty list.
   """
 
-  if not isinstance(cell_fw, core_rnn_cell.RNNCell):
+  if not _like_rnncell(cell_fw):
     raise TypeError("cell_fw must be an instance of RNNCell")
-  if not isinstance(cell_bw, core_rnn_cell.RNNCell):
+  if not _like_rnncell(cell_bw):
     raise TypeError("cell_bw must be an instance of RNNCell")
   if not nest.is_sequence(inputs):
     raise TypeError("inputs must be a sequence")
diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
index f44302638eb9948a308ea60b31a1a58ccacdff10..0bc4cea4658a3d4ea0e921c7bc24393865a66523 100644
--- a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py
@@ -27,7 +27,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import contextlib
 import hashlib
 import math
 import numbers
@@ -43,76 +42,34 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
 
 from tensorflow.python.ops.math_ops import sigmoid
 from tensorflow.python.ops.math_ops import tanh
-from tensorflow.python.ops.rnn_cell_impl import _RNNCell as RNNCell
 
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
-_BIAS_VARIABLE_NAME = "biases"
-_WEIGHTS_VARIABLE_NAME = "weights"
+# pylint: disable=protected-access
+RNNCell = rnn_cell_impl._RNNCell  # pylint: disable=invalid-name
+_like_rnncell = rnn_cell_impl._like_rnncell
+# pylint: enable=protected-access
 
-
-@contextlib.contextmanager
-def _checked_scope(cell, scope, reuse=None, **kwargs):
-  if reuse is not None:
-    kwargs["reuse"] = reuse
-  with vs.variable_scope(scope, **kwargs) as checking_scope:
-    scope_name = checking_scope.name
-    if hasattr(cell, "_scope"):
-      cell_scope = cell._scope  # pylint: disable=protected-access
-      if cell_scope.name != checking_scope.name:
-        raise ValueError(
-            "Attempt to reuse RNNCell %s with a different variable scope than "
-            "its first use.  First use of cell was with scope '%s', this "
-            "attempt is with scope '%s'.  Please create a new instance of the "
-            "cell if you would like it to use a different set of weights.  "
-            "If before you were using: MultiRNNCell([%s(...)] * num_layers), "
-            "change to: MultiRNNCell([%s(...) for _ in range(num_layers)]).  "
-            "If before you were using the same cell instance as both the "
-            "forward and reverse cell of a bidirectional RNN, simply create "
-            "two instances (one for forward, one for reverse).  "
-            "In May 2017, we will start transitioning this cell's behavior "
-            "to use existing stored weights, if any, when it is called "
-            "with scope=None (which can lead to silent model degradation, so "
-            "this error will remain until then.)"
-            % (cell, cell_scope.name, scope_name, type(cell).__name__,
-               type(cell).__name__))
-    else:
-      weights_found = False
-      try:
-        with vs.variable_scope(checking_scope, reuse=True):
-          vs.get_variable(_WEIGHTS_VARIABLE_NAME)
-        weights_found = True
-      except ValueError:
-        pass
-      if weights_found and reuse is None:
-        raise ValueError(
-            "Attempt to have a second RNNCell use the weights of a variable "
-            "scope that already has weights: '%s'; and the cell was not "
-            "constructed as %s(..., reuse=True).  "
-            "To share the weights of an RNNCell, simply "
-            "reuse it in your second calculation, or create a new one with "
-            "the argument reuse=True." % (scope_name, type(cell).__name__))
-
-    # Everything is OK.  Update the cell's scope and yield it.
-    cell._scope = checking_scope  # pylint: disable=protected-access
-    yield checking_scope
+_BIAS_VARIABLE_NAME = "bias"
+_WEIGHTS_VARIABLE_NAME = "kernel"
 
 
 class BasicRNNCell(RNNCell):
   """The most basic RNN cell."""
 
   def __init__(self, num_units, input_size=None, activation=tanh, reuse=None):
+    super(BasicRNNCell, self).__init__(_reuse=reuse)
     if input_size is not None:
       logging.warn("%s: The input_size parameter is deprecated.", self)
     self._num_units = num_units
     self._activation = activation
-    self._reuse = reuse
 
   @property
   def state_size(self):
@@ -122,23 +79,24 @@ class BasicRNNCell(RNNCell):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
-    with _checked_scope(self, scope or "basic_rnn_cell", reuse=self._reuse):
-      output = self._activation(
-          _linear([inputs, state], self._num_units, True))
+    output = self._activation(_linear([inputs, state], self._num_units, True))
     return output, output
 
 
 class GRUCell(RNNCell):
   """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078)."""
 
-  def __init__(self, num_units, input_size=None, activation=tanh, reuse=None):
+  def __init__(self, num_units, input_size=None, activation=tanh, reuse=None,
+               kernel_initializer=None, bias_initializer=None):
+    super(GRUCell, self).__init__(_reuse=reuse)
     if input_size is not None:
       logging.warn("%s: The input_size parameter is deprecated.", self)
     self._num_units = num_units
     self._activation = activation
-    self._reuse = reuse
+    self._kernel_initializer = kernel_initializer
+    self._bias_initializer = bias_initializer
 
   @property
   def state_size(self):
@@ -148,21 +106,23 @@ class GRUCell(RNNCell):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Gated recurrent unit (GRU) with nunits cells."""
-    with _checked_scope(self, scope or "gru_cell", reuse=self._reuse):
-      with vs.variable_scope("gates"):  # Reset gate and update gate.
-        # We start with bias of 1.0 to not reset and not update.
-        value = sigmoid(_linear(
-          [inputs, state], 2 * self._num_units, True, 1.0))
-        r, u = array_ops.split(
-            value=value,
-            num_or_size_splits=2,
-            axis=1)
-      with vs.variable_scope("candidate"):
-        c = self._activation(_linear([inputs, r * state],
-                                     self._num_units, True))
-      new_h = u * state + (1 - u) * c
+    with vs.variable_scope("gates"):  # Reset gate and update gate.
+      # We start with bias of 1.0 to not reset and not update.
+      bias_ones = self._bias_initializer
+      if self._bias_initializer is None:
+        dtype = [a.dtype for a in [inputs, state]][0]
+        bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
+      value = sigmoid(
+          _linear([inputs, state], 2 * self._num_units, True, bias_ones,
+                  self._kernel_initializer))
+      r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
+    with vs.variable_scope("candidate"):
+      c = self._activation(
+          _linear([inputs, r * state], self._num_units, True,
+                  self._bias_initializer, self._kernel_initializer))
+    new_h = u * state + (1 - u) * c
     return new_h, new_h
 
 
@@ -217,6 +177,7 @@ class BasicLSTMCell(RNNCell):
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(BasicLSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -226,7 +187,6 @@ class BasicLSTMCell(RNNCell):
     self._forget_bias = forget_bias
     self._state_is_tuple = state_is_tuple
     self._activation = activation
-    self._reuse = reuse
 
   @property
   def state_size(self):
@@ -237,28 +197,28 @@ class BasicLSTMCell(RNNCell):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Long short-term memory cell (LSTM)."""
-    with _checked_scope(self, scope or "basic_lstm_cell", reuse=self._reuse):
-      # Parameters of gates are concatenated into one multiply for efficiency.
-      if self._state_is_tuple:
-        c, h = state
-      else:
-        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
-      concat = _linear([inputs, h], 4 * self._num_units, True)
+    # Parameters of gates are concatenated into one multiply for efficiency.
+    if self._state_is_tuple:
+      c, h = state
+    else:
+      c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
 
-      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
-      i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
+    concat = _linear([inputs, h], 4 * self._num_units, True)
 
-      new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
-               self._activation(j))
-      new_h = self._activation(new_c) * sigmoid(o)
+    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+    i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
 
-      if self._state_is_tuple:
-        new_state = LSTMStateTuple(new_c, new_h)
-      else:
-        new_state = array_ops.concat([new_c, new_h], 1)
-      return new_h, new_state
+    new_c = (
+        c * sigmoid(f + self._forget_bias) + sigmoid(i) * self._activation(j))
+    new_h = self._activation(new_c) * sigmoid(o)
+
+    if self._state_is_tuple:
+      new_state = LSTMStateTuple(new_c, new_h)
+    else:
+      new_state = array_ops.concat([new_c, new_h], 1)
+    return new_h, new_state
 
 
 class LSTMCell(RNNCell):
@@ -319,6 +279,7 @@ class LSTMCell(RNNCell):
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(LSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -341,7 +302,6 @@ class LSTMCell(RNNCell):
     self._forget_bias = forget_bias
     self._state_is_tuple = state_is_tuple
     self._activation = activation
-    self._reuse = reuse
 
     if num_proj:
       self._state_size = (
@@ -362,7 +322,7 @@ class LSTMCell(RNNCell):
   def output_size(self):
     return self._output_size
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run one step of LSTM.
 
     Args:
@@ -371,7 +331,6 @@ class LSTMCell(RNNCell):
         `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
         tuple of state Tensors, both `2-D`, with column sizes `c_state` and
         `m_state`.
-      scope: VariableScope for the created subgraph; defaults to "lstm_cell".
 
     Returns:
       A tuple containing:
@@ -400,9 +359,8 @@ class LSTMCell(RNNCell):
     input_size = inputs.get_shape().with_rank(2)[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
-    with _checked_scope(self, scope or "lstm_cell",
-                        initializer=self._initializer,
-                        reuse=self._reuse) as unit_scope:
+    scope = vs.get_variable_scope()
+    with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
       if self._num_unit_shards is not None:
         unit_scope.set_partitioner(
             partitioned_variables.fixed_size_partitioner(
@@ -466,12 +424,13 @@ class OutputProjectionWrapper(RNNCell):
   if needed or directly feed into a softmax.
   """
 
-  def __init__(self, cell, output_size, reuse=None):
+  def __init__(self, cell, output_size, activation=None, reuse=None):
     """Create a cell with output projection.
 
     Args:
       cell: an RNNCell, a projection to output_size is added to it.
       output_size: integer, the size of the output after projection.
+      activation: (optional) an optional activation function.
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
@@ -480,13 +439,14 @@ class OutputProjectionWrapper(RNNCell):
       TypeError: if cell is not an RNNCell.
       ValueError: if output_size is not positive.
     """
-    if not isinstance(cell, RNNCell):
+    super(OutputProjectionWrapper, self).__init__(_reuse=reuse)
+    if not _like_rnncell(cell):
       raise TypeError("The parameter cell is not RNNCell.")
     if output_size < 1:
       raise ValueError("Parameter output_size must be > 0: %d." % output_size)
     self._cell = cell
     self._output_size = output_size
-    self._reuse = reuse
+    self._activation = activation
 
   @property
   def state_size(self):
@@ -500,13 +460,12 @@ class OutputProjectionWrapper(RNNCell):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
       return self._cell.zero_state(batch_size, dtype)
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run the cell and output projection on inputs, starting from state."""
     output, res_state = self._cell(inputs, state)
-    # Default scope: "OutputProjectionWrapper"
-    with _checked_scope(self, scope or "output_projection_wrapper",
-                        reuse=self._reuse):
-      projected = _linear(output, self._output_size, True)
+    projected = _linear(output, self._output_size, True)
+    if self._activation:
+      projected = self._activation(projected)
     return projected, res_state
 
 
@@ -518,23 +477,30 @@ class InputProjectionWrapper(RNNCell):
   do the projection on this batch-concatenated sequence, then split it.
   """
 
-  def __init__(self, cell, num_proj, input_size=None):
+  def __init__(self, cell, num_proj, activation=None, input_size=None,
+               reuse=None):
     """Create a cell with input projection.
 
     Args:
       cell: an RNNCell, a projection of inputs is added before it.
       num_proj: Python integer.  The dimension to project to.
+      activation: (optional) an optional activation function.
       input_size: Deprecated and unused.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already has
+        the given variables, an error is raised.
 
     Raises:
       TypeError: if cell is not an RNNCell.
     """
+    super(InputProjectionWrapper, self).__init__(_reuse=reuse)
     if input_size is not None:
       logging.warn("%s: The input_size parameter is deprecated.", self)
-    if not isinstance(cell, RNNCell):
+    if not _like_rnncell(cell):
       raise TypeError("The parameter cell is not RNNCell.")
     self._cell = cell
     self._num_proj = num_proj
+    self._activation = activation
 
   @property
   def state_size(self):
@@ -548,11 +514,12 @@ class InputProjectionWrapper(RNNCell):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
       return self._cell.zero_state(batch_size, dtype)
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run the input projection and then the cell."""
     # Default scope: "InputProjectionWrapper"
-    with vs.variable_scope(scope or "input_projection_wrapper"):
-      projected = _linear(inputs, self._num_proj, True)
+    projected = _linear(inputs, self._num_proj, True)
+    if self._activation:
+      projected = self._activation(projected)
     return self._cell(projected, state)
 
 
@@ -605,7 +572,7 @@ class DropoutWrapper(RNNCell):
       TypeError: if cell is not an RNNCell.
       ValueError: if any of the keep_probs are not between 0 and 1.
     """
-    if not isinstance(cell, RNNCell):
+    if not _like_rnncell(cell):
       raise TypeError("The parameter cell is not a RNNCell.")
     with ops.name_scope("DropoutWrapperInit"):
       def tensor_and_const_value(v):
@@ -803,7 +770,8 @@ class DeviceWrapper(RNNCell):
 
   def zero_state(self, batch_size, dtype):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
-      return self._cell.zero_state(batch_size, dtype)
+      with ops.device(self._device):
+        return self._cell.zero_state(batch_size, dtype)
 
   def __call__(self, inputs, state, scope=None):
     """Run the cell on specified device."""
@@ -838,7 +806,8 @@ class EmbeddingWrapper(RNNCell):
       TypeError: if cell is not an RNNCell.
       ValueError: if embedding_classes is not positive.
     """
-    if not isinstance(cell, RNNCell):
+    super(EmbeddingWrapper, self).__init__(_reuse=reuse)
+    if not _like_rnncell(cell):
       raise TypeError("The parameter cell is not RNNCell.")
     if embedding_classes <= 0 or embedding_size <= 0:
       raise ValueError("Both embedding_classes and embedding_size must be > 0: "
@@ -847,7 +816,6 @@ class EmbeddingWrapper(RNNCell):
     self._embedding_classes = embedding_classes
     self._embedding_size = embedding_size
     self._initializer = initializer
-    self._reuse = reuse
 
   @property
   def state_size(self):
@@ -861,31 +829,31 @@ class EmbeddingWrapper(RNNCell):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
       return self._cell.zero_state(batch_size, dtype)
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run the cell on embedded inputs."""
-    with _checked_scope(self, scope or "embedding_wrapper", reuse=self._reuse):
-      with ops.device("/cpu:0"):
-        if self._initializer:
-          initializer = self._initializer
-        elif vs.get_variable_scope().initializer:
-          initializer = vs.get_variable_scope().initializer
-        else:
-          # Default initializer for embeddings should have variance=1.
-          sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
-          initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
+    with ops.device("/cpu:0"):
+      if self._initializer:
+        initializer = self._initializer
+      elif vs.get_variable_scope().initializer:
+        initializer = vs.get_variable_scope().initializer
+      else:
+        # Default initializer for embeddings should have variance=1.
+        sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
+        initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
 
-        if type(state) is tuple:
-          data_type = state[0].dtype
-        else:
-          data_type = state.dtype
+      if type(state) is tuple:
+        data_type = state[0].dtype
+      else:
+        data_type = state.dtype
+
+      embedding = vs.get_variable(
+          "embedding", [self._embedding_classes, self._embedding_size],
+          initializer=initializer,
+          dtype=data_type)
+      embedded = embedding_ops.embedding_lookup(embedding,
+                                                array_ops.reshape(inputs, [-1]))
 
-        embedding = vs.get_variable(
-            "embedding", [self._embedding_classes, self._embedding_size],
-            initializer=initializer,
-            dtype=data_type)
-        embedded = embedding_ops.embedding_lookup(
-            embedding, array_ops.reshape(inputs, [-1]))
-    return self._cell(embedded, state)
+      return self._cell(embedded, state)
 
 
 class MultiRNNCell(RNNCell):
@@ -905,6 +873,7 @@ class MultiRNNCell(RNNCell):
       ValueError: if cells is empty (not allowed), or at least one of the cells
         returns a state tuple but the flag `state_is_tuple` is `False`.
     """
+    super(MultiRNNCell, self).__init__()
     if not cells:
       raise ValueError("Must specify at least one cell for MultiRNNCell.")
     if not nest.is_sequence(cells):
@@ -939,28 +908,29 @@ class MultiRNNCell(RNNCell):
         # presumably does not contain TensorArrays or anything else fancy
         return super(MultiRNNCell, self).zero_state(batch_size, dtype)
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run this multi-layer cell on inputs, starting from state."""
-    with vs.variable_scope(scope or "multi_rnn_cell"):
-      cur_state_pos = 0
-      cur_inp = inputs
-      new_states = []
-      for i, cell in enumerate(self._cells):
-        with vs.variable_scope("cell_%d" % i):
-          if self._state_is_tuple:
-            if not nest.is_sequence(state):
-              raise ValueError(
-                  "Expected state to be a tuple of length %d, but received: %s"
-                  % (len(self.state_size), state))
-            cur_state = state[i]
-          else:
-            cur_state = array_ops.slice(
-                state, [0, cur_state_pos], [-1, cell.state_size])
-            cur_state_pos += cell.state_size
-          cur_inp, new_state = cell(cur_inp, cur_state)
-          new_states.append(new_state)
+    cur_state_pos = 0
+    cur_inp = inputs
+    new_states = []
+    for i, cell in enumerate(self._cells):
+      with vs.variable_scope("cell_%d" % i):
+        if self._state_is_tuple:
+          if not nest.is_sequence(state):
+            raise ValueError(
+                "Expected state to be a tuple of length %d, but received: %s" %
+                (len(self.state_size), state))
+          cur_state = state[i]
+        else:
+          cur_state = array_ops.slice(state, [0, cur_state_pos],
+                                      [-1, cell.state_size])
+          cur_state_pos += cell.state_size
+        cur_inp, new_state = cell(cur_inp, cur_state)
+        new_states.append(new_state)
+
     new_states = (tuple(new_states) if self._state_is_tuple else
                   array_ops.concat(new_states, 1))
+
     return cur_inp, new_states
 
 
@@ -1009,14 +979,19 @@ class _SlimRNNCell(RNNCell):
     return output, state
 
 
-def _linear(args, output_size, bias, bias_start=0.0):
+def _linear(args,
+            output_size,
+            bias,
+            bias_initializer=None,
+            kernel_initializer=None):
   """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
 
   Args:
     args: a 2D Tensor or a list of 2D, batch x n, Tensors.
     output_size: int, second dimension of W[i].
     bias: boolean, whether to add a bias term or not.
-    bias_start: starting value to initialize the bias; 0 by default.
+    bias_initializer: starting value to initialize the bias; None by default.
+    kernel_initializer: starting value to initialize the weight; None by default.
 
   Returns:
     A 2D Tensor with shape [batch x output_size] equal to
@@ -1048,7 +1023,9 @@ def _linear(args, output_size, bias, bias_start=0.0):
   scope = vs.get_variable_scope()
   with vs.variable_scope(scope) as outer_scope:
     weights = vs.get_variable(
-        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], dtype=dtype)
+        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
+        dtype=dtype,
+        initializer=kernel_initializer)
     if len(args) == 1:
       res = math_ops.matmul(args[0], weights)
     else:
@@ -1057,8 +1034,10 @@ def _linear(args, output_size, bias, bias_start=0.0):
       return res
     with vs.variable_scope(outer_scope) as inner_scope:
       inner_scope.set_partitioner(None)
+      if bias_initializer is None:
+        bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
       biases = vs.get_variable(
           _BIAS_VARIABLE_NAME, [output_size],
           dtype=dtype,
-          initializer=init_ops.constant_initializer(bias_start, dtype=dtype))
+          initializer=bias_initializer)
     return nn_ops.bias_add(res, biases)
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 1884629b0c9700ecf6d836912bcce9d977df05fe..0e70939cceacd9af0f2324e5aafe5a0398e00c6c 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -353,8 +353,8 @@ class LSTMBlockCell(core_rnn_cell.RNNCell):
     self._forget_bias = forget_bias
     self._use_peephole = use_peephole
     self._names = {
-        "W": "weights",
-        "b": "biases",
+        "W": "kernel",
+        "b": "bias",
         "wci": "w_i_diag",
         "wco": "w_o_diag",
         "wcf": "w_f_diag",
@@ -625,10 +625,10 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
       time_len = array_ops.shape(inputs)[0]
     input_size = inputs_shape[2].value
     w = vs.get_variable(
-        "weights",
+        "kernel",
         [input_size + self._num_units, self._num_units * 4], dtype=dtype)
     b = vs.get_variable(
-        "biases", [w.get_shape().with_rank(2)[1]],
+        "bias", [w.get_shape().with_rank(2)[1]],
         initializer=init_ops.constant_initializer(0.0),
         dtype=dtype)
     if self._use_peephole:
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 2cd1814213169958251011e6c8cfe2e5f6041a4d..566c84443d1b7fe8c606c07c8f1f875720ac2374 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -34,14 +34,12 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
-_checked_scope = core_rnn_cell_impl._checked_scope  # pylint: disable=protected-access
-
-
 def _get_concat_variable(name, shape, dtype, num_shards):
   """Get a sharded variable concatenated into one tensor."""
   sharded_variable = _get_sharded_variable(name, shape, dtype, num_shards)
@@ -109,7 +107,7 @@ class CoupledInputForgetGateLSTMCell(core_rnn_cell.RNNCell):
   def __init__(self, num_units, use_peepholes=False,
                initializer=None, num_proj=None, proj_clip=None,
                num_unit_shards=1, num_proj_shards=1,
-               forget_bias=1.0, state_is_tuple=False,
+               forget_bias=1.0, state_is_tuple=True,
                activation=math_ops.tanh, reuse=None):
     """Initialize the parameters for an LSTM cell.
 
@@ -138,6 +136,7 @@ class CoupledInputForgetGateLSTMCell(core_rnn_cell.RNNCell):
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
       logging.warn(
           "%s: Using a concatenated state is slower and will soon be "
@@ -173,7 +172,7 @@ class CoupledInputForgetGateLSTMCell(core_rnn_cell.RNNCell):
   def output_size(self):
     return self._output_size
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run one step of LSTM.
 
     Args:
@@ -182,7 +181,6 @@ class CoupledInputForgetGateLSTMCell(core_rnn_cell.RNNCell):
         `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
         tuple of state Tensors, both `2-D`, with column sizes `c_state` and
         `m_state`.
-      scope: VariableScope for the created subgraph; defaults to "LSTMCell".
 
     Returns:
       A tuple containing:
@@ -212,51 +210,49 @@ class CoupledInputForgetGateLSTMCell(core_rnn_cell.RNNCell):
     input_size = inputs.get_shape().with_rank(2)[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
-    with _checked_scope(self, scope or "coupled_input_forget_gate_lstm_cell",
-                        initializer=self._initializer, reuse=self._reuse):
-      concat_w = _get_concat_variable(
-          "W", [input_size.value + num_proj, 3 * self._num_units],
-          dtype, self._num_unit_shards)
+    concat_w = _get_concat_variable(
+        "W", [input_size.value + num_proj, 3 * self._num_units],
+        dtype, self._num_unit_shards)
 
-      b = vs.get_variable(
-          "B",
-          shape=[3 * self._num_units],
-          initializer=init_ops.zeros_initializer(),
-          dtype=dtype)
+    b = vs.get_variable(
+        "B",
+        shape=[3 * self._num_units],
+        initializer=init_ops.zeros_initializer(),
+        dtype=dtype)
 
-      # j = new_input, f = forget_gate, o = output_gate
-      cell_inputs = array_ops.concat([inputs, m_prev], 1)
-      lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
-      j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1)
+    # j = new_input, f = forget_gate, o = output_gate
+    cell_inputs = array_ops.concat([inputs, m_prev], 1)
+    lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
+    j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1)
 
-      # Diagonal connections
-      if self._use_peepholes:
-        w_f_diag = vs.get_variable(
-            "W_F_diag", shape=[self._num_units], dtype=dtype)
-        w_o_diag = vs.get_variable(
-            "W_O_diag", shape=[self._num_units], dtype=dtype)
+    # Diagonal connections
+    if self._use_peepholes:
+      w_f_diag = vs.get_variable(
+          "W_F_diag", shape=[self._num_units], dtype=dtype)
+      w_o_diag = vs.get_variable(
+          "W_O_diag", shape=[self._num_units], dtype=dtype)
 
-      if self._use_peepholes:
-        f_act = sigmoid(f + self._forget_bias + w_f_diag * c_prev)
-      else:
-        f_act = sigmoid(f + self._forget_bias)
-      c = (f_act * c_prev + (1 - f_act) * self._activation(j))
+    if self._use_peepholes:
+      f_act = sigmoid(f + self._forget_bias + w_f_diag * c_prev)
+    else:
+      f_act = sigmoid(f + self._forget_bias)
+    c = (f_act * c_prev + (1 - f_act) * self._activation(j))
 
-      if self._use_peepholes:
-        m = sigmoid(o + w_o_diag * c) * self._activation(c)
-      else:
-        m = sigmoid(o) * self._activation(c)
+    if self._use_peepholes:
+      m = sigmoid(o + w_o_diag * c) * self._activation(c)
+    else:
+      m = sigmoid(o) * self._activation(c)
 
-      if self._num_proj is not None:
-        concat_w_proj = _get_concat_variable(
-            "W_P", [self._num_units, self._num_proj],
-            dtype, self._num_proj_shards)
+    if self._num_proj is not None:
+      concat_w_proj = _get_concat_variable(
+          "W_P", [self._num_units, self._num_proj],
+          dtype, self._num_proj_shards)
 
-        m = math_ops.matmul(m, concat_w_proj)
-        if self._proj_clip is not None:
-          # pylint: disable=invalid-unary-operand-type
-          m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
-          # pylint: enable=invalid-unary-operand-type
+      m = math_ops.matmul(m, concat_w_proj)
+      if self._proj_clip is not None:
+        # pylint: disable=invalid-unary-operand-type
+        m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+        # pylint: enable=invalid-unary-operand-type
 
     new_state = (core_rnn_cell.LSTMStateTuple(c, m) if self._state_is_tuple else
                  array_ops.concat([c, m], 1))
@@ -301,6 +297,7 @@ class TimeFreqLSTMCell(core_rnn_cell.RNNCell):
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(TimeFreqLSTMCell, self).__init__(_reuse=reuse)
     self._num_units = num_units
     self._use_peepholes = use_peepholes
     self._cell_clip = cell_clip
@@ -321,14 +318,12 @@ class TimeFreqLSTMCell(core_rnn_cell.RNNCell):
   def state_size(self):
     return self._state_size
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run one step of LSTM.
 
     Args:
       inputs: input Tensor, 2D, batch x num_units.
       state: state Tensor, 2D, batch x state_size.
-      scope: VariableScope for the created subgraph; defaults to
-        "TimeFreqLSTMCell".
 
     Returns:
       A tuple containing:
@@ -347,63 +342,63 @@ class TimeFreqLSTMCell(core_rnn_cell.RNNCell):
     freq_inputs = self._make_tf_features(inputs)
     dtype = inputs.dtype
     actual_input_size = freq_inputs[0].get_shape().as_list()[1]
-    with _checked_scope(self, scope or "time_freq_lstm_cell",
-                        initializer=self._initializer, reuse=self._reuse):
-      concat_w = _get_concat_variable(
-          "W", [actual_input_size + 2*self._num_units, 4 * self._num_units],
-          dtype, self._num_unit_shards)
-      b = vs.get_variable(
-          "B",
-          shape=[4 * self._num_units],
-          initializer=init_ops.zeros_initializer(),
-          dtype=dtype)
 
-      # Diagonal connections
-      if self._use_peepholes:
-        w_f_diag = vs.get_variable(
-            "W_F_diag", shape=[self._num_units], dtype=dtype)
-        w_i_diag = vs.get_variable(
-            "W_I_diag", shape=[self._num_units], dtype=dtype)
-        w_o_diag = vs.get_variable(
-            "W_O_diag", shape=[self._num_units], dtype=dtype)
-
-      # initialize the first freq state to be zero
-      m_prev_freq = array_ops.zeros([int(inputs.get_shape()[0]),
-                                     self._num_units], dtype)
-      for fq in range(len(freq_inputs)):
-        c_prev = array_ops.slice(state, [0, 2*fq*self._num_units],
-                                 [-1, self._num_units])
-        m_prev = array_ops.slice(state, [0, (2*fq+1)*self._num_units],
-                                 [-1, self._num_units])
-        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
-        cell_inputs = array_ops.concat([freq_inputs[fq], m_prev, m_prev_freq],
-                                       1)
-        lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
-        i, j, f, o = array_ops.split(
-            value=lstm_matrix, num_or_size_splits=4, axis=1)
+    concat_w = _get_concat_variable(
+        "W", [actual_input_size + 2*self._num_units, 4 * self._num_units],
+        dtype, self._num_unit_shards)
 
-        if self._use_peepholes:
-          c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
-               sigmoid(i + w_i_diag * c_prev) * tanh(j))
-        else:
-          c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * tanh(j))
+    b = vs.get_variable(
+        "B",
+        shape=[4 * self._num_units],
+        initializer=init_ops.zeros_initializer(),
+        dtype=dtype)
 
-        if self._cell_clip is not None:
-          # pylint: disable=invalid-unary-operand-type
-          c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
-          # pylint: enable=invalid-unary-operand-type
+    # Diagonal connections
+    if self._use_peepholes:
+      w_f_diag = vs.get_variable(
+          "W_F_diag", shape=[self._num_units], dtype=dtype)
+      w_i_diag = vs.get_variable(
+          "W_I_diag", shape=[self._num_units], dtype=dtype)
+      w_o_diag = vs.get_variable(
+          "W_O_diag", shape=[self._num_units], dtype=dtype)
 
-        if self._use_peepholes:
-          m = sigmoid(o + w_o_diag * c) * tanh(c)
-        else:
-          m = sigmoid(o) * tanh(c)
-        m_prev_freq = m
-        if fq == 0:
-          state_out = array_ops.concat([c, m], 1)
-          m_out = m
-        else:
-          state_out = array_ops.concat([state_out, c, m], 1)
-          m_out = array_ops.concat([m_out, m], 1)
+    # initialize the first freq state to be zero
+    m_prev_freq = array_ops.zeros([int(inputs.get_shape()[0]),
+                                   self._num_units], dtype)
+    for fq in range(len(freq_inputs)):
+      c_prev = array_ops.slice(state, [0, 2*fq*self._num_units],
+                               [-1, self._num_units])
+      m_prev = array_ops.slice(state, [0, (2*fq+1)*self._num_units],
+                               [-1, self._num_units])
+      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+      cell_inputs = array_ops.concat([freq_inputs[fq], m_prev, m_prev_freq],
+                                     1)
+      lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
+      i, j, f, o = array_ops.split(
+          value=lstm_matrix, num_or_size_splits=4, axis=1)
+
+      if self._use_peepholes:
+        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
+             sigmoid(i + w_i_diag * c_prev) * tanh(j))
+      else:
+        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * tanh(j))
+
+      if self._cell_clip is not None:
+        # pylint: disable=invalid-unary-operand-type
+        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+        # pylint: enable=invalid-unary-operand-type
+
+      if self._use_peepholes:
+        m = sigmoid(o + w_o_diag * c) * tanh(c)
+      else:
+        m = sigmoid(o) * tanh(c)
+      m_prev_freq = m
+      if fq == 0:
+        state_out = array_ops.concat([c, m], 1)
+        m_out = m
+      else:
+        state_out = array_ops.concat([state_out, c, m], 1)
+        m_out = array_ops.concat([m_out, m], 1)
     return m_out, state_out
 
   def _make_tf_features(self, input_feat):
@@ -457,7 +452,7 @@ class GridLSTMCell(core_rnn_cell.RNNCell):
                start_freqindex_list=None,
                end_freqindex_list=None,
                couple_input_forget_gates=False,
-               state_is_tuple=False,
+               state_is_tuple=True,
                reuse=None):
     """Initialize the parameters for an LSTM cell.
 
@@ -471,7 +466,7 @@ class GridLSTMCell(core_rnn_cell.RNNCell):
         state is clipped by this value prior to the cell output activation.
       initializer: (optional) The initializer to use for the weight and
         projection matrices, default None.
-      num_unit_shards: (optional) int, defualt 1, How to split the weight
+      num_unit_shards: (optional) int, default 1, How to split the weight
         matrix. If > 1,the weight matrix is stored across num_unit_shards.
       forget_bias: (optional) float, default 1.0, The initial bias of the
         forget gates, used to reduce the scale of forgetting at the beginning
@@ -499,6 +494,7 @@ class GridLSTMCell(core_rnn_cell.RNNCell):
     Raises:
       ValueError: if the num_frequency_blocks list is not specified
     """
+    super(GridLSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -550,15 +546,13 @@ class GridLSTMCell(core_rnn_cell.RNNCell):
   def state_tuple_type(self):
     return self._state_tuple_type
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run one step of LSTM.
 
     Args:
       inputs: input Tensor, 2D, [batch, feature_size].
       state: Tensor or tuple of Tensors, 2D, [batch, state_size], depends on the
         flag self._state_is_tuple.
-      scope: (optional) VariableScope for the created subgraph; if None, it
-        defaults to "GridLSTMCell".
 
     Returns:
       A tuple containing:
@@ -571,23 +565,21 @@ class GridLSTMCell(core_rnn_cell.RNNCell):
       ValueError: if an input_size was specified and the provided inputs have
         a different dimension.
     """
-    batch_size = int(inputs.get_shape()[0])
+    batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
     freq_inputs = self._make_tf_features(inputs)
-    with _checked_scope(self, scope or "grid_lstm_cell",
-                        initializer=self._initializer, reuse=self._reuse):
-      m_out_lst = []
-      state_out_lst = []
-      for block in range(len(freq_inputs)):
-        m_out_lst_current, state_out_lst_current = self._compute(
-            freq_inputs[block], block, state, batch_size,
-            state_is_tuple=self._state_is_tuple)
-        m_out_lst.extend(m_out_lst_current)
-        state_out_lst.extend(state_out_lst_current)
-      if self._state_is_tuple:
-        state_out = self._state_tuple_type(*state_out_lst)
-      else:
-        state_out = array_ops.concat(state_out_lst, 1)
-      m_out = array_ops.concat(m_out_lst, 1)
+    m_out_lst = []
+    state_out_lst = []
+    for block in range(len(freq_inputs)):
+      m_out_lst_current, state_out_lst_current = self._compute(
+          freq_inputs[block], block, state, batch_size,
+          state_is_tuple=self._state_is_tuple)
+      m_out_lst.extend(m_out_lst_current)
+      state_out_lst.extend(state_out_lst_current)
+    if self._state_is_tuple:
+      state_out = self._state_tuple_type(*state_out_lst)
+    else:
+      state_out = array_ops.concat(state_out_lst, 1)
+    m_out = array_ops.concat(m_out_lst, 1)
     return m_out, state_out
 
   def _compute(self, freq_inputs, block, state, batch_size,
@@ -974,14 +966,12 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
         *([num_units, num_units] * self._total_blocks * 2))
     self._output_size = 2 * num_units * self._total_blocks * 2
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run one step of LSTM.
 
     Args:
       inputs: input Tensor, 2D, [batch, num_units].
       state: tuple of Tensors, 2D, [batch, state_size].
-      scope: (optional) VariableScope for the created subgraph; if None, it
-        defaults to "BidirectionalGridLSTMCell".
 
     Returns:
       A tuple containing:
@@ -994,7 +984,7 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
       ValueError: if an input_size was specified and the provided inputs have
         a different dimension.
     """
-    batch_size = int(inputs.get_shape()[0])
+    batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
     fwd_inputs = self._make_tf_features(inputs)
     if self._backward_slice_offset:
       bwd_inputs = self._make_tf_features(inputs, self._backward_slice_offset)
@@ -1002,29 +992,27 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
       bwd_inputs = fwd_inputs
 
     # Forward processing
-    with _checked_scope(self, scope or "bidirectional_grid_lstm_cell",
-                        initializer=self._initializer, reuse=self._reuse):
-      with vs.variable_scope("fwd"):
-        fwd_m_out_lst = []
-        fwd_state_out_lst = []
-        for block in range(len(fwd_inputs)):
-          fwd_m_out_lst_current, fwd_state_out_lst_current = self._compute(
-              fwd_inputs[block], block, state, batch_size,
-              state_prefix="fwd_state", state_is_tuple=True)
-          fwd_m_out_lst.extend(fwd_m_out_lst_current)
-          fwd_state_out_lst.extend(fwd_state_out_lst_current)
-      # Backward processing
-      bwd_m_out_lst = []
-      bwd_state_out_lst = []
-      with vs.variable_scope("bwd"):
-        for block in range(len(bwd_inputs)):
-          # Reverse the blocks
-          bwd_inputs_reverse = bwd_inputs[block][::-1]
-          bwd_m_out_lst_current, bwd_state_out_lst_current = self._compute(
-              bwd_inputs_reverse, block, state, batch_size,
-              state_prefix="bwd_state", state_is_tuple=True)
-          bwd_m_out_lst.extend(bwd_m_out_lst_current)
-          bwd_state_out_lst.extend(bwd_state_out_lst_current)
+    with vs.variable_scope("fwd"):
+      fwd_m_out_lst = []
+      fwd_state_out_lst = []
+      for block in range(len(fwd_inputs)):
+        fwd_m_out_lst_current, fwd_state_out_lst_current = self._compute(
+            fwd_inputs[block], block, state, batch_size,
+            state_prefix="fwd_state", state_is_tuple=True)
+        fwd_m_out_lst.extend(fwd_m_out_lst_current)
+        fwd_state_out_lst.extend(fwd_state_out_lst_current)
+    # Backward processing
+    bwd_m_out_lst = []
+    bwd_state_out_lst = []
+    with vs.variable_scope("bwd"):
+      for block in range(len(bwd_inputs)):
+        # Reverse the blocks
+        bwd_inputs_reverse = bwd_inputs[block][::-1]
+        bwd_m_out_lst_current, bwd_state_out_lst_current = self._compute(
+            bwd_inputs_reverse, block, state, batch_size,
+            state_prefix="bwd_state", state_is_tuple=True)
+        bwd_m_out_lst.extend(bwd_m_out_lst_current)
+        bwd_state_out_lst.extend(bwd_state_out_lst_current)
     state_out = self._state_tuple_type(*(fwd_state_out_lst + bwd_state_out_lst))
     # Outputs are always concated as it is never used separately.
     m_out = array_ops.concat(fwd_m_out_lst + bwd_m_out_lst, 1)
@@ -1043,7 +1031,7 @@ class AttentionCellWrapper(core_rnn_cell.RNNCell):
   """
 
   def __init__(self, cell, attn_length, attn_size=None, attn_vec_size=None,
-               input_size=None, state_is_tuple=False, reuse=None):
+               input_size=None, state_is_tuple=True, reuse=None):
     """Create a cell with attention.
 
     Args:
@@ -1069,7 +1057,8 @@ class AttentionCellWrapper(core_rnn_cell.RNNCell):
       ValueError: if cell returns a state tuple but the flag
           `state_is_tuple` is `False` or if attn_length is zero or less.
     """
-    if not isinstance(cell, core_rnn_cell.RNNCell):
+    super(AttentionCellWrapper, self).__init__(_reuse=reuse)
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("The parameter cell is not RNNCell.")
     if nest.is_sequence(cell.state_size) and not state_is_tuple:
       raise ValueError("Cell returns tuple of states, but the flag "
@@ -1107,42 +1096,40 @@ class AttentionCellWrapper(core_rnn_cell.RNNCell):
   def output_size(self):
     return self._attn_size
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Long short-term memory cell with attention (LSTMA)."""
-    with _checked_scope(self, scope or "attention_cell_wrapper",
-                        reuse=self._reuse):
-      if self._state_is_tuple:
-        state, attns, attn_states = state
-      else:
-        states = state
-        state = array_ops.slice(states, [0, 0], [-1, self._cell.state_size])
-        attns = array_ops.slice(
-            states, [0, self._cell.state_size], [-1, self._attn_size])
-        attn_states = array_ops.slice(
-            states, [0, self._cell.state_size + self._attn_size],
-            [-1, self._attn_size * self._attn_length])
-      attn_states = array_ops.reshape(attn_states,
-                                      [-1, self._attn_length, self._attn_size])
-      input_size = self._input_size
-      if input_size is None:
-        input_size = inputs.get_shape().as_list()[1]
-      inputs = _linear([inputs, attns], input_size, True)
-      lstm_output, new_state = self._cell(inputs, state)
-      if self._state_is_tuple:
-        new_state_cat = array_ops.concat(nest.flatten(new_state), 1)
-      else:
-        new_state_cat = new_state
-      new_attns, new_attn_states = self._attention(new_state_cat, attn_states)
-      with vs.variable_scope("attn_output_projection"):
-        output = _linear([lstm_output, new_attns], self._attn_size, True)
-      new_attn_states = array_ops.concat(
-          [new_attn_states, array_ops.expand_dims(output, 1)], 1)
-      new_attn_states = array_ops.reshape(
-          new_attn_states, [-1, self._attn_length * self._attn_size])
-      new_state = (new_state, new_attns, new_attn_states)
-      if not self._state_is_tuple:
-        new_state = array_ops.concat(list(new_state), 1)
-      return output, new_state
+    if self._state_is_tuple:
+      state, attns, attn_states = state
+    else:
+      states = state
+      state = array_ops.slice(states, [0, 0], [-1, self._cell.state_size])
+      attns = array_ops.slice(
+          states, [0, self._cell.state_size], [-1, self._attn_size])
+      attn_states = array_ops.slice(
+          states, [0, self._cell.state_size + self._attn_size],
+          [-1, self._attn_size * self._attn_length])
+    attn_states = array_ops.reshape(attn_states,
+                                    [-1, self._attn_length, self._attn_size])
+    input_size = self._input_size
+    if input_size is None:
+      input_size = inputs.get_shape().as_list()[1]
+    inputs = _linear([inputs, attns], input_size, True)
+    lstm_output, new_state = self._cell(inputs, state)
+    if self._state_is_tuple:
+      new_state_cat = array_ops.concat(nest.flatten(new_state), 1)
+    else:
+      new_state_cat = new_state
+    new_attns, new_attn_states = self._attention(new_state_cat, attn_states)
+    with vs.variable_scope("attn_output_projection"):
+      output = _linear([lstm_output, new_attns], self._attn_size, True)
+    new_attn_states = array_ops.concat(
+        [new_attn_states, array_ops.expand_dims(output, 1)], 1)
+    new_attn_states = array_ops.reshape(
+        new_attn_states, [-1, self._attn_length * self._attn_size])
+    new_state = (new_state, new_attns, new_attn_states)
+    if not self._state_is_tuple:
+      new_state = array_ops.concat(list(new_state), 1)
+    return output, new_state
 
   def _attention(self, query, attn_states):
     conv2d = nn_ops.conv2d
@@ -1168,6 +1155,89 @@ class AttentionCellWrapper(core_rnn_cell.RNNCell):
       return new_attns, new_attn_states
 
 
+class HighwayWrapper(core_rnn_cell.RNNCell):
+  """RNNCell wrapper that adds highway connection on cell input and output.
+
+  Based on:
+    R. K. Srivastava, K. Greff, and J. Schmidhuber, "Highway networks",
+    arXiv preprint arXiv:1505.00387, 2015.
+    https://arxiv.org/abs/1505.00387
+  """
+
+  def __init__(self, cell,
+               couple_carry_transform_gates=True,
+               carry_bias_init=1.0):
+    """Constructs a `HighwayWrapper` for `cell`.
+
+    Args:
+      cell: An instance of `RNNCell`.
+      couple_carry_transform_gates: boolean, should the Carry and Transform gate
+        be coupled.
+      carry_bias_init: float, carry gates bias initialization.
+    """
+    self._cell = cell
+    self._couple_carry_transform_gates = couple_carry_transform_gates
+    self._carry_bias_init = carry_bias_init
+
+  @property
+  def state_size(self):
+    return self._cell.state_size
+
+  @property
+  def output_size(self):
+    return self._cell.output_size
+
+  def zero_state(self, batch_size, dtype):
+    with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
+      return self._cell.zero_state(batch_size, dtype)
+
+  def _highway(self, inp, out):
+    input_size = inp.get_shape().with_rank(2)[1].value
+    carry_weight = vs.get_variable("carry_w", [input_size, input_size])
+    carry_bias = vs.get_variable(
+        "carry_b", [input_size],
+        initializer=init_ops.constant_initializer(
+            self._carry_bias_init))
+    carry = math_ops.sigmoid(nn_ops.xw_plus_b(inp, carry_weight, carry_bias))
+    if self._couple_carry_transform_gates:
+      transform = 1 - carry
+    else:
+      transform_weight = vs.get_variable("transform_w",
+                                         [input_size, input_size])
+      transform_bias = vs.get_variable(
+          "transform_b", [input_size],
+          initializer=init_ops.constant_initializer(
+              -self._carry_bias_init))
+      transform = math_ops.sigmoid(nn_ops.xw_plus_b(inp,
+                                                    transform_weight,
+                                                    transform_bias))
+    return inp * carry + out * transform
+
+  def __call__(self, inputs, state, scope=None):
+    """Run the cell and add its inputs to its outputs.
+
+    Args:
+      inputs: cell inputs.
+      state: cell state.
+      scope: optional cell scope.
+
+    Returns:
+      Tuple of cell outputs and new state.
+
+    Raises:
+      TypeError: If cell inputs and outputs have different structure (type).
+      ValueError: If cell inputs and outputs have different structure (value).
+    """
+    outputs, new_state = self._cell(inputs, state, scope=scope)
+    nest.assert_same_structure(inputs, outputs)
+    # Ensure shapes match
+    def assert_shape_match(inp, out):
+      inp.get_shape().assert_is_compatible_with(out.get_shape())
+    nest.map_structure(assert_shape_match, inputs, outputs)
+    res_outputs = nest.map_structure(self._highway, inputs, outputs)
+    return (res_outputs, new_state)
+
+
 class LayerNormBasicLSTMCell(core_rnn_cell.RNNCell):
   """LSTM unit with layer normalization and recurrent dropout.
 
@@ -1213,6 +1283,7 @@ class LayerNormBasicLSTMCell(core_rnn_cell.RNNCell):
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(LayerNormBasicLSTMCell, self).__init__(_reuse=reuse)
 
     if input_size is not None:
       logging.warn("%s: The input_size parameter is deprecated.", self)
@@ -1249,41 +1320,38 @@ class LayerNormBasicLSTMCell(core_rnn_cell.RNNCell):
   def _linear(self, args):
     out_size = 4 * self._num_units
     proj_size = args.get_shape()[-1]
-    weights = vs.get_variable("weights", [proj_size, out_size])
+    weights = vs.get_variable("kernel", [proj_size, out_size])
     out = math_ops.matmul(args, weights)
     if not self._layer_norm:
-      bias = vs.get_variable("biases", [out_size])
+      bias = vs.get_variable("bias", [out_size])
       out = nn_ops.bias_add(out, bias)
     return out
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """LSTM cell with layer normalization and recurrent dropout."""
+    c, h = state
+    args = array_ops.concat([inputs, h], 1)
+    concat = self._linear(args)
 
-    with _checked_scope(self, scope or "layer_norm_basic_lstm_cell",
-                        reuse=self._reuse):
-      c, h = state
-      args = array_ops.concat([inputs, h], 1)
-      concat = self._linear(args)
+    i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
+    if self._layer_norm:
+      i = self._norm(i, "input")
+      j = self._norm(j, "transform")
+      f = self._norm(f, "forget")
+      o = self._norm(o, "output")
 
-      i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
-      if self._layer_norm:
-        i = self._norm(i, "input")
-        j = self._norm(j, "transform")
-        f = self._norm(f, "forget")
-        o = self._norm(o, "output")
+    g = self._activation(j)
+    if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1:
+      g = nn_ops.dropout(g, self._keep_prob, seed=self._seed)
 
-      g = self._activation(j)
-      if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1:
-        g = nn_ops.dropout(g, self._keep_prob, seed=self._seed)
+    new_c = (c * math_ops.sigmoid(f + self._forget_bias)
+             + math_ops.sigmoid(i) * g)
+    if self._layer_norm:
+      new_c = self._norm(new_c, "state")
+    new_h = self._activation(new_c) * math_ops.sigmoid(o)
 
-      new_c = (c * math_ops.sigmoid(f + self._forget_bias)
-               + math_ops.sigmoid(i) * g)
-      if self._layer_norm:
-        new_c = self._norm(new_c, "state")
-      new_h = self._activation(new_c) * math_ops.sigmoid(o)
-
-      new_state = core_rnn_cell.LSTMStateTuple(new_c, new_h)
-      return new_h, new_state
+    new_state = core_rnn_cell.LSTMStateTuple(new_c, new_h)
+    return new_h, new_state
 
 
 class NASCell(core_rnn_cell.RNNCell):
@@ -1313,6 +1381,7 @@ class NASCell(core_rnn_cell.RNNCell):
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(NASCell, self).__init__(_reuse=reuse)
     self._num_units = num_units
     self._num_proj = num_proj
     self._use_biases = use_biases
@@ -1333,14 +1402,13 @@ class NASCell(core_rnn_cell.RNNCell):
   def output_size(self):
     return self._output_size
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run one step of NAS Cell.
 
     Args:
       inputs: input Tensor, 2D, batch x num_units.
       state: This must be a tuple of state Tensors, both `2-D`, with column
         sizes `c_state` and `m_state`.
-      scope: VariableScope for the created subgraph; defaults to "nas_rnn".
 
     Returns:
       A tuple containing:
@@ -1368,71 +1436,70 @@ class NASCell(core_rnn_cell.RNNCell):
     input_size = inputs.get_shape().with_rank(2)[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
-    with _checked_scope(self, scope or "nas_rnn", reuse=self._reuse):
-      # Variables for the NAS cell. W_m is all matrices multiplying the
-      # hiddenstate and W_inputs is all matrices multiplying the inputs.
-      concat_w_m = vs.get_variable(
-          "recurrent_weights", [num_proj, 8 * self._num_units],
-          dtype)
-      concat_w_inputs = vs.get_variable(
-          "weights", [input_size.value, 8 * self._num_units],
+    # Variables for the NAS cell. W_m is all matrices multiplying the
+    # hiddenstate and W_inputs is all matrices multiplying the inputs.
+    concat_w_m = vs.get_variable(
+        "recurrent_kernel", [num_proj, 8 * self._num_units],
+        dtype)
+    concat_w_inputs = vs.get_variable(
+        "kernel", [input_size.value, 8 * self._num_units],
+        dtype)
+
+    m_matrix = math_ops.matmul(m_prev, concat_w_m)
+    inputs_matrix = math_ops.matmul(inputs, concat_w_inputs)
+
+    if self._use_biases:
+      b = vs.get_variable(
+          "bias",
+          shape=[8 * self._num_units],
+          initializer=init_ops.zeros_initializer(),
+          dtype=dtype)
+      m_matrix = nn_ops.bias_add(m_matrix, b)
+
+    # The NAS cell branches into 8 different splits for both the hiddenstate
+    # and the input
+    m_matrix_splits = array_ops.split(axis=1, num_or_size_splits=8,
+                                      value=m_matrix)
+    inputs_matrix_splits = array_ops.split(axis=1, num_or_size_splits=8,
+                                           value=inputs_matrix)
+
+    # First layer
+    layer1_0 = sigmoid(inputs_matrix_splits[0] + m_matrix_splits[0])
+    layer1_1 = relu(inputs_matrix_splits[1] + m_matrix_splits[1])
+    layer1_2 = sigmoid(inputs_matrix_splits[2] + m_matrix_splits[2])
+    layer1_3 = relu(inputs_matrix_splits[3] * m_matrix_splits[3])
+    layer1_4 = tanh(inputs_matrix_splits[4] + m_matrix_splits[4])
+    layer1_5 = sigmoid(inputs_matrix_splits[5] + m_matrix_splits[5])
+    layer1_6 = tanh(inputs_matrix_splits[6] + m_matrix_splits[6])
+    layer1_7 = sigmoid(inputs_matrix_splits[7] + m_matrix_splits[7])
+
+    # Second layer
+    l2_0 = tanh(layer1_0 * layer1_1)
+    l2_1 = tanh(layer1_2 + layer1_3)
+    l2_2 = tanh(layer1_4 * layer1_5)
+    l2_3 = sigmoid(layer1_6 + layer1_7)
+
+    # Inject the cell
+    l2_0 = tanh(l2_0 + c_prev)
+
+    # Third layer
+    l3_0_pre = l2_0 * l2_1
+    new_c = l3_0_pre  # create new cell
+    l3_0 = l3_0_pre
+    l3_1 = tanh(l2_2 + l2_3)
+
+    # Final layer
+    new_m = tanh(l3_0 * l3_1)
+
+    # Projection layer if specified
+    if self._num_proj is not None:
+      concat_w_proj = vs.get_variable(
+          "projection_weights", [self._num_units, self._num_proj],
           dtype)
+      new_m = math_ops.matmul(new_m, concat_w_proj)
 
-      m_matrix = math_ops.matmul(m_prev, concat_w_m)
-      inputs_matrix = math_ops.matmul(inputs, concat_w_inputs)
-
-      if self._use_biases:
-        b = vs.get_variable(
-            "bias",
-            shape=[8 * self._num_units],
-            initializer=init_ops.zeros_initializer(),
-            dtype=dtype)
-        m_matrix = nn_ops.bias_add(m_matrix, b)
-
-      # The NAS cell branches into 8 different splits for both the hiddenstate
-      # and the input
-      m_matrix_splits = array_ops.split(axis=1, num_or_size_splits=8,
-                                        value=m_matrix)
-      inputs_matrix_splits = array_ops.split(axis=1, num_or_size_splits=8,
-                                             value=inputs_matrix)
-
-      # First layer
-      layer1_0 = sigmoid(inputs_matrix_splits[0] + m_matrix_splits[0])
-      layer1_1 = relu(inputs_matrix_splits[1] + m_matrix_splits[1])
-      layer1_2 = sigmoid(inputs_matrix_splits[2] + m_matrix_splits[2])
-      layer1_3 = relu(inputs_matrix_splits[3] * m_matrix_splits[3])
-      layer1_4 = tanh(inputs_matrix_splits[4] + m_matrix_splits[4])
-      layer1_5 = sigmoid(inputs_matrix_splits[5] + m_matrix_splits[5])
-      layer1_6 = tanh(inputs_matrix_splits[6] + m_matrix_splits[6])
-      layer1_7 = sigmoid(inputs_matrix_splits[7] + m_matrix_splits[7])
-
-      # Second layer
-      l2_0 = tanh(layer1_0 * layer1_1)
-      l2_1 = tanh(layer1_2 + layer1_3)
-      l2_2 = tanh(layer1_4 * layer1_5)
-      l2_3 = sigmoid(layer1_6 + layer1_7)
-
-      # Inject the cell
-      l2_0 = tanh(l2_0 + c_prev)
-
-      # Third layer
-      l3_0_pre = l2_0 * l2_1
-      new_c = l3_0_pre  # create new cell
-      l3_0 = l3_0_pre
-      l3_1 = tanh(l2_2 + l2_3)
-
-      # Final layer
-      new_m = tanh(l3_0 * l3_1)
-
-      # Projection layer if specified
-      if self._num_proj is not None:
-        concat_w_proj = vs.get_variable(
-            "projection_weights", [self._num_units, self._num_proj],
-            dtype)
-        new_m = math_ops.matmul(new_m, concat_w_proj)
-
-      new_state = core_rnn_cell.LSTMStateTuple(new_c, new_m)
-      return new_m, new_state
+    new_state = core_rnn_cell.LSTMStateTuple(new_c, new_m)
+    return new_m, new_state
 
 
 class UGRNNCell(core_rnn_cell.RNNCell):
@@ -1467,6 +1534,7 @@ class UGRNNCell(core_rnn_cell.RNNCell):
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(UGRNNCell, self).__init__(_reuse=reuse)
     self._num_units = num_units
     self._initializer = initializer
     self._forget_bias = forget_bias
@@ -1481,13 +1549,12 @@ class UGRNNCell(core_rnn_cell.RNNCell):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run one step of UGRNN.
 
     Args:
       inputs: input Tensor, 2D, batch x input size.
       state: state Tensor, 2D, batch x num units.
-      scope: VariableScope for the created subgraph; defaults to "ugrnn_cell".
 
     Returns:
       new_output: batch x num units, Tensor representing the output of the UGRNN
@@ -1506,8 +1573,8 @@ class UGRNNCell(core_rnn_cell.RNNCell):
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
-    with _checked_scope(self, scope or "ugrnn_cell",
-                        initializer=self._initializer, reuse=self._reuse):
+    with vs.variable_scope(vs.get_variable_scope(),
+                           initializer=self._initializer):
       cell_inputs = array_ops.concat([inputs, state], 1)
       rnn_matrix = _linear(cell_inputs, 2 * self._num_units, True)
 
@@ -1567,6 +1634,7 @@ class IntersectionRNNCell(core_rnn_cell.RNNCell):
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(IntersectionRNNCell, self).__init__(_reuse=reuse)
     self._num_units = num_units
     self._initializer = initializer
     self._forget_bias = forget_bias
@@ -1582,14 +1650,12 @@ class IntersectionRNNCell(core_rnn_cell.RNNCell):
   def output_size(self):
     return self._num_units
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Run one step of the Intersection RNN.
 
     Args:
       inputs: input Tensor, 2D, batch x input size.
       state: state Tensor, 2D, batch x num units.
-      scope: VariableScope for the created subgraph; defaults to
-        "intersection_rnn_cell"
 
     Returns:
       new_y: batch x num units, Tensor representing the output of the +RNN
@@ -1610,8 +1676,8 @@ class IntersectionRNNCell(core_rnn_cell.RNNCell):
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
-    with _checked_scope(self, scope or "intersection_rnn_cell",
-                        initializer=self._initializer, reuse=self._reuse):
+    with vs.variable_scope(vs.get_variable_scope(),
+                           initializer=self._initializer):
       # read-in projections (should be used for first layer in deep +RNN
       # to transform size of inputs from I --> N)
       if input_size.value != self._num_units:
@@ -1683,7 +1749,7 @@ class CompiledWrapper(core_rnn_cell.RNNCell):
         return not _REGISTERED_OPS[node_def.op].is_stateful
 
     with jit.experimental_jit_scope(compile_ops=compile_ops):
-      return self._cell(inputs, state, scope=scope)
+      return self._cell(inputs, state, scope)
 
 
 def _random_exp_initializer(minval,
@@ -1743,16 +1809,17 @@ class PhasedLSTMCell(core_rnn_cell.RNNCell):
           period during which the gates are open.
       trainable_ratio_on: bool, weather ratio_on is trainable.
       period_init_min: float or scalar float Tensor. With value > 0.
-          Minimum value of the initalized period.
+          Minimum value of the initialized period.
           The period values are initialized by drawing from the distribution:
           e^U(log(period_init_min), log(period_init_max))
           Where U(.,.) is the uniform distribution.
       period_init_max: float or scalar float Tensor.
-          With value > period_init_min. Maximum value of the initalized period.
+          With value > period_init_min. Maximum value of the initialized period.
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope. If not `True`, and the existing scope already has
         the given variables, an error is raised.
     """
+    super(PhasedLSTMCell, self).__init__(_reuse=reuse)
     self._num_units = num_units
     self._use_peepholes = use_peepholes
     self._leak = leak
@@ -1782,7 +1849,7 @@ class PhasedLSTMCell(core_rnn_cell.RNNCell):
     cycle_ratio = self._mod(shifted_time, period_casted) / period_casted
     return math_ops.cast(cycle_ratio, dtype=dtypes.float32)
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Phased LSTM Cell.
 
     Args:
@@ -1792,7 +1859,6 @@ class PhasedLSTMCell(core_rnn_cell.RNNCell):
          The second Tensor has shape [batch, features_size], and type float32.
          It stores the features.
       state: core_rnn_cell.LSTMStateTuple, state from previous timestep.
-      scope: string, id of the variable scope.
 
     Returns:
       A tuple containing:
@@ -1801,61 +1867,235 @@ class PhasedLSTMCell(core_rnn_cell.RNNCell):
       - A core_rnn_cell.LSTMStateTuple, containing 2 Tensors of float32, shape
         [batch_size, num_units], representing the new state and the output.
     """
-    with _checked_scope(self, scope or "phased_lstm_cell", reuse=self._reuse):
-      (c_prev, h_prev) = state
-      (time, x) = inputs
+    (c_prev, h_prev) = state
+    (time, x) = inputs
 
-      in_mask_gates = [x, h_prev]
-      if self._use_peepholes:
-        in_mask_gates.append(c_prev)
+    in_mask_gates = [x, h_prev]
+    if self._use_peepholes:
+      in_mask_gates.append(c_prev)
 
-      with vs.variable_scope("mask_gates"):
-        mask_gates = math_ops.sigmoid(
-            _linear(in_mask_gates, 2 * self._num_units, True))
-        [input_gate, forget_gate] = array_ops.split(
-            axis=1, num_or_size_splits=2, value=mask_gates)
+    with vs.variable_scope("mask_gates"):
+      mask_gates = math_ops.sigmoid(
+          _linear(in_mask_gates, 2 * self._num_units, True))
+      [input_gate, forget_gate] = array_ops.split(
+          axis=1, num_or_size_splits=2, value=mask_gates)
 
-      with vs.variable_scope("new_input"):
-        new_input = math_ops.tanh(
-            _linear([x, h_prev], self._num_units, True))
+    with vs.variable_scope("new_input"):
+      new_input = math_ops.tanh(
+          _linear([x, h_prev], self._num_units, True))
 
-      new_c = (c_prev * forget_gate + input_gate * new_input)
+    new_c = (c_prev * forget_gate + input_gate * new_input)
 
-      in_out_gate = [x, h_prev]
-      if self._use_peepholes:
-        in_out_gate.append(new_c)
+    in_out_gate = [x, h_prev]
+    if self._use_peepholes:
+      in_out_gate.append(new_c)
+
+    with vs.variable_scope("output_gate"):
+      output_gate = math_ops.sigmoid(
+          _linear(in_out_gate, self._num_units, True))
 
-      with vs.variable_scope("output_gate"):
-        output_gate = math_ops.sigmoid(
-            _linear(in_out_gate, self._num_units, True))
+    new_h = math_ops.tanh(new_c) * output_gate
 
-      new_h = math_ops.tanh(new_c) * output_gate
+    period = vs.get_variable(
+        "period", [self._num_units],
+        initializer=_random_exp_initializer(
+            self._period_init_min, self._period_init_max))
+    phase = vs.get_variable(
+        "phase", [self._num_units],
+        initializer=init_ops.random_uniform_initializer(
+            0., period.initial_value))
+    ratio_on = vs.get_variable(
+        "ratio_on", [self._num_units],
+        initializer=init_ops.constant_initializer(self._ratio_on),
+        trainable=self._trainable_ratio_on)
 
-      period = vs.get_variable(
-          "period", [self._num_units],
-          initializer=_random_exp_initializer(
-              self._period_init_min, self._period_init_max))
-      phase = vs.get_variable(
-          "phase", [self._num_units],
-          initializer=init_ops.random_uniform_initializer(
-              0., period.initial_value))
-      ratio_on = vs.get_variable(
-          "ratio_on", [self._num_units],
-          initializer=init_ops.constant_initializer(self._ratio_on),
-          trainable=self._trainable_ratio_on)
+    cycle_ratio = self._get_cycle_ratio(time, phase, period)
 
-      cycle_ratio = self._get_cycle_ratio(time, phase, period)
+    k_up = 2 * cycle_ratio / ratio_on
+    k_down = 2 - k_up
+    k_closed = self._leak * cycle_ratio
 
-      k_up = 2 * cycle_ratio / ratio_on
-      k_down = 2 - k_up
-      k_closed = self._leak * cycle_ratio
+    k = array_ops.where(cycle_ratio < ratio_on, k_down, k_closed)
+    k = array_ops.where(cycle_ratio < 0.5 * ratio_on, k_up, k)
 
-      k = array_ops.where(cycle_ratio < ratio_on, k_down, k_closed)
-      k = array_ops.where(cycle_ratio < 0.5 * ratio_on, k_up, k)
+    new_c = k * new_c + (1 - k) * c_prev
+    new_h = k * new_h + (1 - k) * h_prev
 
-      new_c = k * new_c + (1 - k) * c_prev
-      new_h = k * new_h + (1 - k) * h_prev
+    new_state = core_rnn_cell.LSTMStateTuple(new_c, new_h)
 
-      new_state = core_rnn_cell.LSTMStateTuple(new_c, new_h)
+    return new_h, new_state
 
-      return new_h, new_state
+
+class GLSTMCell(core_rnn_cell.RNNCell):
+  """Group LSTM cell (G-LSTM).
+
+  The implementation is based on:
+
+    https://arxiv.org/abs/1703.10722
+
+  O. Kuchaiev and B. Ginsburg
+  "Factorization Tricks for LSTM Networks", ICLR 2017 workshop.
+  """
+
+  def __init__(self, num_units, initializer=None, num_proj=None,
+               number_of_groups=1, forget_bias=1.0, activation=math_ops.tanh,
+               reuse=None):
+    """Initialize the parameters of G-LSTM cell.
+
+    Args:
+      num_units: int, The number of units in the G-LSTM cell
+      initializer: (optional) The initializer to use for the weight and
+        projection matrices.
+      num_proj: (optional) int, The output dimensionality for the projection
+        matrices.  If None, no projection is performed.
+      number_of_groups: (optional) int, number of groups to use.
+        If `number_of_groups` is 1, then it should be equivalent to LSTM cell
+      forget_bias: Biases of the forget gate are initialized by default to 1
+        in order to reduce the scale of forgetting at the beginning of
+        the training.
+      activation: Activation function of the inner states.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already
+        has the given variables, an error is raised.
+
+    Raises:
+      ValueError: If `num_units` or `num_proj` is not divisible by 
+        `number_of_groups`.
+    """
+    super(GLSTMCell, self).__init__(_reuse=reuse)
+    self._num_units = num_units
+    self._initializer = initializer
+    self._num_proj = num_proj
+    self._forget_bias = forget_bias
+    self._activation = activation
+    self._number_of_groups = number_of_groups
+
+    if self._num_units % self._number_of_groups != 0:
+      raise ValueError("num_units must be divisible by number_of_groups")
+    if self._num_proj:
+      if self._num_proj % self._number_of_groups != 0:
+        raise ValueError("num_proj must be divisible by number_of_groups")
+      self._group_shape = [int(self._num_proj / self._number_of_groups),
+                           int(self._num_units / self._number_of_groups)]
+    else:
+      self._group_shape = [int(self._num_units / self._number_of_groups),
+                           int(self._num_units / self._number_of_groups)]
+
+    if num_proj:
+      self._state_size = core_rnn_cell.LSTMStateTuple(num_units, num_proj)
+      self._output_size = num_proj
+    else:
+      self._state_size = core_rnn_cell.LSTMStateTuple(num_units, num_units)
+      self._output_size = num_units
+
+  @property
+  def state_size(self):
+    return self._state_size
+
+  @property
+  def output_size(self):
+    return self._output_size
+
+  def _get_input_for_group(self, inputs, group_id, group_size):
+    """Slices inputs into groups to prepare for processing by cell's groups
+
+    Args:
+      inputs: cell input or it's previous state,
+              a Tensor, 2D, [batch x num_units]
+      group_id: group id, a Scalar, for which to prepare input
+      group_size: size of the group
+
+    Returns:
+      subset of inputs corresponding to group "group_id",
+      a Tensor, 2D, [batch x num_units/number_of_groups]
+    """
+    return array_ops.slice(input_=inputs,
+                           begin=[0, group_id * group_size],
+                           size=[self._batch_size, group_size],
+                           name=("GLSTM_group%d_input_generation" % group_id))
+
+  def call(self, inputs, state):
+    """Run one step of G-LSTM.
+
+    Args:
+      inputs: input Tensor, 2D, [batch x num_units].
+      state: this must be a tuple of state Tensors, both `2-D`,
+      with column sizes `c_state` and `m_state`.
+
+    Returns:
+      A tuple containing:
+
+      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
+        G-LSTM after reading `inputs` when previous state was `state`.
+        Here output_dim is:
+           num_proj if num_proj was set,
+           num_units otherwise.
+      - LSTMStateTuple representing the new state of G-LSTM  cell
+        after reading `inputs` when the previous state was `state`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    (c_prev, m_prev) = state
+
+    self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
+    dtype = inputs.dtype
+    scope = vs.get_variable_scope()
+    with vs.variable_scope(scope, initializer=self._initializer):
+      i_parts = []
+      j_parts = []
+      f_parts = []
+      o_parts = []
+
+      for group_id in range(self._number_of_groups):
+        with vs.variable_scope("group%d" % group_id):
+          x_g_id = array_ops.concat(
+            [self._get_input_for_group(inputs, group_id,
+                                       self._group_shape[0]),
+             self._get_input_for_group(m_prev, group_id,
+                                       self._group_shape[0])], axis=1)
+          R_k = _linear(x_g_id, 4 * self._group_shape[1], bias=False)
+          i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)
+
+        i_parts.append(i_k)
+        j_parts.append(j_k)
+        f_parts.append(f_k)
+        o_parts.append(o_k)
+
+      bi = vs.get_variable(name="bias_i",
+                           shape=[self._num_units],
+                           dtype=dtype,
+                           initializer=
+                           init_ops.constant_initializer(0.0, dtype=dtype))
+      bj = vs.get_variable(name="bias_j",
+                           shape=[self._num_units],
+                           dtype=dtype,
+                           initializer=
+                           init_ops.constant_initializer(0.0, dtype=dtype))
+      bf = vs.get_variable(name="bias_f",
+                           shape=[self._num_units],
+                           dtype=dtype,
+                           initializer=
+                           init_ops.constant_initializer(0.0, dtype=dtype))
+      bo = vs.get_variable(name="bias_o",
+                           shape=[self._num_units],
+                           dtype=dtype,
+                           initializer=
+                           init_ops.constant_initializer(0.0, dtype=dtype))
+
+      i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi)
+      j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj)
+      f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf)
+      o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo)
+
+    c = (math_ops.sigmoid(f + self._forget_bias) * c_prev +
+         math_ops.sigmoid(i) * math_ops.tanh(j))
+    m = math_ops.sigmoid(o) * self._activation(c)
+
+    if self._num_proj is not None:
+      with vs.variable_scope("projection"):
+        m = _linear(m, self._num_proj, bias=False)
+
+    new_state = core_rnn_cell.LSTMStateTuple(c, m)
+    return m, new_state
diff --git a/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py b/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9bb3bcccdef96485b309c365886516f98270e1a
--- /dev/null
+++ b/tensorflow/contrib/rnn/python/tools/checkpoint_convert.py
@@ -0,0 +1,247 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Convert checkpoints using RNNCells to new name convention.
+
+Usage:
+
+  python checkpoint_convert.py [--write_v1_checkpoint] \
+      '/path/to/checkpoint' '/path/to/new_checkpoint'
+
+For example, if there is a V2 checkpoint to be converted and the files include:
+  /tmp/my_checkpoint/model.ckpt.data-00000-of-00001
+  /tmp/my_checkpoint/model.ckpt.index
+  /tmp/my_checkpoint/model.ckpt.meta
+
+use the following command:
+  mkdir /tmp/my_converted_checkpoint &&
+  python checkpoint_convert.py \
+      /tmp/my_checkpoint/model.ckpt /tmp/my_converted_checkpoint/model.ckpt
+
+This will generate three converted checkpoint files corresponding to the three
+old ones in the new directory:
+  /tmp/my_converted_checkpoint/model.ckpt.data-00000-of-00001
+  /tmp/my_converted_checkpoint/model.ckpt.index
+  /tmp/my_converted_checkpoint/model.ckpt.meta
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import collections
+import re
+import sys
+
+from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import app
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import saver as saver_lib
+
+_RNN_NAME_REPLACEMENTS = collections.OrderedDict([
+    ############################################################################
+    # contrib/rnn/python/ops/core_rnn_cell_impl.py
+    # BasicRNNCell
+    ('basic_rnn_cell/weights', 'basic_rnn_cell/kernel'),
+    ('basic_rnn_cell/biases', 'basic_rnn_cell/bias'),
+    # GRUCell
+    ('gru_cell/weights', 'gru_cell/kernel'),
+    ('gru_cell/biases', 'gru_cell/bias'),
+    ('gru_cell/gates/weights', 'gru_cell/gates/kernel'),
+    ('gru_cell/gates/biases', 'gru_cell/gates/bias'),
+    ('gru_cell/candidate/weights', 'gru_cell/candidate/kernel'),
+    ('gru_cell/candidate/biases', 'gru_cell/candidate/bias'),
+    # BasicLSTMCell
+    ('basic_lstm_cell/weights', 'basic_lstm_cell/kernel'),
+    ('basic_lstm_cell/biases', 'basic_lstm_cell/bias'),
+    # LSTMCell
+    ('lstm_cell/weights', 'lstm_cell/kernel'),
+    ('lstm_cell/biases', 'lstm_cell/bias'),
+    ('lstm_cell/projection/weights', 'lstm_cell/projection/kernel'),
+    ('lstm_cell/projection/biases', 'lstm_cell/projection/bias'),
+    # OutputProjectionWrapper
+    ('output_projection_wrapper/weights', 'output_projection_wrapper/kernel'),
+    ('output_projection_wrapper/biases', 'output_projection_wrapper/bias'),
+    # InputProjectionWrapper
+    ('input_projection_wrapper/weights', 'input_projection_wrapper/kernel'),
+    ('input_projection_wrapper/biases', 'input_projection_wrapper/bias'),
+    ############################################################################
+    # contrib/rnn/python/ops/lstm_ops.py
+    # LSTMBlockFusedCell ??
+    ('lstm_block_wrapper/weights', 'lstm_block_wrapper/kernel'),
+    ('lstm_block_wrapper/biases', 'lstm_block_wrapper/bias'),
+    ############################################################################
+    # contrib/rnn/python/ops/rnn_cell.py
+    # LayerNormBasicLSTMCell
+    ('layer_norm_basic_lstm_cell/weights', 'layer_norm_basic_lstm_cell/kernel'),
+    ('layer_norm_basic_lstm_cell/biases', 'layer_norm_basic_lstm_cell/bias'),
+    # UGRNNCell, not found in g3, but still need it?
+    ('ugrnn_cell/weights', 'ugrnn_cell/kernel'),
+    ('ugrnn_cell/biases', 'ugrnn_cell/bias'),
+    # NASCell
+    ('nas_rnn/weights', 'nas_rnn/kernel'),
+    ('nas_rnn/recurrent_weights', 'nas_rnn/recurrent_kernel'),
+    # IntersectionRNNCell
+    ('intersection_rnn_cell/weights', 'intersection_rnn_cell/kernel'),
+    ('intersection_rnn_cell/biases', 'intersection_rnn_cell/bias'),
+    ('intersection_rnn_cell/in_projection/weights',
+     'intersection_rnn_cell/in_projection/kernel'),
+    ('intersection_rnn_cell/in_projection/biases',
+     'intersection_rnn_cell/in_projection/bias'),
+    # PhasedLSTMCell
+    ('phased_lstm_cell/mask_gates/weights',
+     'phased_lstm_cell/mask_gates/kernel'),
+    ('phased_lstm_cell/mask_gates/biases', 'phased_lstm_cell/mask_gates/bias'),
+    ('phased_lstm_cell/new_input/weights', 'phased_lstm_cell/new_input/kernel'),
+    ('phased_lstm_cell/new_input/biases', 'phased_lstm_cell/new_input/bias'),
+    ('phased_lstm_cell/output_gate/weights',
+     'phased_lstm_cell/output_gate/kernel'),
+    ('phased_lstm_cell/output_gate/biases',
+     'phased_lstm_cell/output_gate/bias'),
+    # AttentionCellWrapper
+    ('attention_cell_wrapper/weights', 'attention_cell_wrapper/kernel'),
+    ('attention_cell_wrapper/biases', 'attention_cell_wrapper/bias'),
+    ('attention_cell_wrapper/attn_output_projection/weights',
+     'attention_cell_wrapper/attn_output_projection/kernel'),
+    ('attention_cell_wrapper/attn_output_projection/biases',
+     'attention_cell_wrapper/attn_output_projection/bias'),
+    ('attention_cell_wrapper/attention/weights',
+     'attention_cell_wrapper/attention/kernel'),
+    ('attention_cell_wrapper/attention/biases',
+     'attention_cell_wrapper/attention/bias'),
+])
+
+_RNN_SHARDED_NAME_REPLACEMENTS = collections.OrderedDict([
+    ('LSTMCell/W_', 'lstm_cell/weights/part_'),
+    ('BasicLSTMCell/Linear/Matrix_', 'basic_lstm_cell/weights/part_'),
+    ('GRUCell/W_', 'gru_cell/weights/part_'),
+    ('MultiRNNCell/Cell', 'multi_rnn_cell/cell_'),
+])
+
+
+def _rnn_name_replacement(var_name):
+  for pattern in _RNN_NAME_REPLACEMENTS:
+    if pattern in var_name:
+      old_var_name = var_name
+      var_name = var_name.replace(pattern, _RNN_NAME_REPLACEMENTS[pattern])
+      logging.info('Converted: %s --> %s' % (old_var_name, var_name))
+      break
+  return var_name
+
+
+def _rnn_name_replacement_sharded(var_name):
+  for pattern in _RNN_SHARDED_NAME_REPLACEMENTS:
+    if pattern in var_name:
+      old_var_name = var_name
+      var_name = var_name.replace(pattern,
+                                  _RNN_SHARDED_NAME_REPLACEMENTS[pattern])
+      logging.info('Converted: %s --> %s' % (old_var_name, var_name))
+  return var_name
+
+
+def _split_sharded_vars(name_shape_map):
+  """Split shareded variables.
+
+  Args:
+    name_shape_map: A dict from variable name to variable shape.
+
+  Returns:
+    not_sharded: Names of the non-sharded variables.
+    sharded: Names of the sharded varibales.
+  """
+  sharded = []
+  not_sharded = []
+  for name in name_shape_map:
+    if re.match(name, '_[0-9]+$'):
+      if re.sub('_[0-9]+$', '_1', name) in name_shape_map:
+        sharded.append(name)
+      else:
+        not_sharded.append(name)
+    else:
+      not_sharded.append(name)
+  return not_sharded, sharded
+
+
+def convert_names(checkpoint_from_path,
+                  checkpoint_to_path,
+                  write_v1_checkpoint=False):
+  """Migrates the names of variables within a checkpoint.
+
+  Args:
+    checkpoint_from_path: Path to source checkpoint to be read in.
+    checkpoint_to_path: Path to checkpoint to be written out.
+    write_v1_checkpoint: Whether the output checkpoint will be in V1 format.
+
+  Returns:
+    A dictionary that maps the new variable names to the Variable objects.
+    A dictionary that maps the old variable names to the new variable names.
+  """
+  with ops.Graph().as_default():
+    logging.info('Reading checkpoint_from_path %s' % checkpoint_from_path)
+    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_from_path)
+    name_shape_map = reader.get_variable_to_shape_map()
+    not_sharded, sharded = _split_sharded_vars(name_shape_map)
+    new_variable_map = {}
+    conversion_map = {}
+    for var_name in not_sharded:
+      new_var_name = _rnn_name_replacement(var_name)
+      tensor = reader.get_tensor(var_name)
+      var = variables.Variable(tensor, name=var_name)
+      new_variable_map[new_var_name] = var
+      if new_var_name != var_name:
+        conversion_map[var_name] = new_var_name
+    for var_name in sharded:
+      new_var_name = _rnn_name_replacement_sharded(var_name)
+      var = variables.Variable(tensor, name=var_name)
+      new_variable_map[new_var_name] = var
+      if new_var_name != var_name:
+        conversion_map[var_name] = new_var_name
+
+    write_version = (saver_pb2.SaverDef.V1
+                     if write_v1_checkpoint else saver_pb2.SaverDef.V2)
+    saver = saver_lib.Saver(new_variable_map, write_version=write_version)
+
+    with session.Session() as sess:
+      sess.run(variables.global_variables_initializer())
+      logging.info('Writing checkpoint_to_path %s' % checkpoint_to_path)
+      saver.save(sess, checkpoint_to_path)
+
+  logging.info('Summary:')
+  logging.info('  Converted %d variable name(s).' % len(new_variable_map))
+  return new_variable_map, conversion_map
+
+
+def main(_):
+  convert_names(
+      FLAGS.checkpoint_from_path,
+      FLAGS.checkpoint_to_path,
+      write_v1_checkpoint=FLAGS.write_v1_checkpoint)
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.register('type', 'bool', lambda v: v.lower() == 'true')
+  parser.add_argument('checkpoint_from_path', type=str,
+                      help='Path to source checkpoint to be read in.')
+  parser.add_argument('checkpoint_to_path', type=str,
+                      help='Path to checkpoint to be written out.')
+  parser.add_argument('--write_v1_checkpoint', action='store_true',
+                      help='Write v1 checkpoint')
+  FLAGS, unparsed = parser.parse_known_args()
+
+  app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py b/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2fc2fa80eacc853c75e8a6019976c2583edc0f5
--- /dev/null
+++ b/tensorflow/contrib/rnn/python/tools/checkpoint_convert_test.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for checkpoint converter."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import os
+import tempfile
+
+from tensorflow.contrib.rnn.python.tools import checkpoint_convert
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
+
+
+class CheckpointConvertTest(test.TestCase):
+
+  def setUp(self):
+    self._old_ckpt_path = tempfile.mktemp()
+    self._new_ckpt_path = tempfile.mktemp()
+    ops.reset_default_graph()
+
+  def tearDown(self):
+    for file_name in glob.glob(self._old_ckpt_path + "*"):
+      os.remove(file_name)
+    for file_name in glob.glob(self._new_ckpt_path + "*"):
+      os.remove(file_name)
+
+  def testReplacementDictsContainUniqueAndNonEmptyVariableNames(self):
+    for old_name in checkpoint_convert._RNN_NAME_REPLACEMENTS:
+      new_name = checkpoint_convert._RNN_NAME_REPLACEMENTS[old_name]
+      self.assertTrue(old_name)
+      self.assertTrue(new_name)
+      self.assertNotEqual(old_name, new_name)
+    for old_name in checkpoint_convert._RNN_SHARDED_NAME_REPLACEMENTS:
+      new_name = checkpoint_convert._RNN_SHARDED_NAME_REPLACEMENTS[old_name]
+      self.assertTrue(old_name)
+      self.assertTrue(new_name)
+      self.assertNotEqual(old_name, new_name)
+
+  def testConversionFromV2WithConvertedVariableNamesSucceeds(self):
+    variables.Variable(10.0, name="a")
+    for old_name in checkpoint_convert._RNN_NAME_REPLACEMENTS:
+      variables.Variable(20.0, name=old_name)
+    with session.Session() as sess:
+      saver = saver_lib.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, self._old_ckpt_path)
+
+    new_var_map, conversion_map = checkpoint_convert.convert_names(
+        self._old_ckpt_path, self._new_ckpt_path)
+    self.assertTrue(glob.glob(self._new_ckpt_path + "*"))
+    self.assertItemsEqual(
+        ["a"] + list(checkpoint_convert._RNN_NAME_REPLACEMENTS.values()),
+        new_var_map.keys())
+    self.assertEqual(checkpoint_convert._RNN_NAME_REPLACEMENTS, conversion_map)
+
+  def testConversionFromV2WithoutConvertedVariableNamesSucceeds(self):
+    variables.Variable(10.0, name="a")
+    with session.Session() as sess:
+      saver = saver_lib.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, self._old_ckpt_path)
+
+    new_var_map, conversion_map = checkpoint_convert.convert_names(
+        self._old_ckpt_path, self._new_ckpt_path)
+    self.assertItemsEqual(["a"], new_var_map.keys())
+    self.assertFalse(conversion_map)
+
+  def testConversionToV1Succeeds(self):
+    variables.Variable(10.0, name="a")
+    variables.Variable(
+        20.0, name=list(checkpoint_convert._RNN_NAME_REPLACEMENTS.keys())[-1])
+
+    with session.Session() as sess:
+      saver = saver_lib.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, self._old_ckpt_path)
+
+    new_var_map, conversion_map = checkpoint_convert.convert_names(
+        self._old_ckpt_path, self._new_ckpt_path, write_v1_checkpoint=True)
+    self.assertItemsEqual(
+        ["a", list(checkpoint_convert._RNN_NAME_REPLACEMENTS.values())[-1]],
+        new_var_map.keys())
+    self.assertEqual(
+        {list(checkpoint_convert._RNN_NAME_REPLACEMENTS.keys())[-1]:
+         list(checkpoint_convert._RNN_NAME_REPLACEMENTS.values())[-1]},
+        conversion_map)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/BUILD b/tensorflow/contrib/saved_model/cc/saved_model/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f3d98cfbbe4976a8b6ec053fd3c575ef764e09ca
--- /dev/null
+++ b/tensorflow/contrib/saved_model/cc/saved_model/BUILD
@@ -0,0 +1,55 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Description:
+# SavedModel contrib libraries for C++.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "signature_def_utils",
+    srcs = ["signature_def_utils.cc"],
+    hdrs = ["signature_def_utils.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_test(
+    name = "signature_def_utils_test",
+    srcs = ["signature_def_utils_test.cc"],
+    deps = [
+        ":signature_def_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["*"]),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.cc b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a45908d272618c725744a5be68c10707bfd610fc
--- /dev/null
+++ b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.cc
@@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+namespace {
+template <class T>
+Status FindInProtobufMap(StringPiece description,
+                         const protobuf::Map<string, T>& map, const string& key,
+                         const T** value) {
+  const auto it = map.find(key);
+  if (it == map.end()) {
+    return errors::NotFound("Could not find ", description, " for key: ", key);
+  }
+  *value = &it->second;
+  return Status::OK();
+}
+}  // namespace
+
+Status FindSignatureDefByKey(const MetaGraphDef& meta_graph_def,
+                             const string& signature_def_key,
+                             const SignatureDef** signature_def) {
+  return FindInProtobufMap("SignatureDef", meta_graph_def.signature_def(),
+                           signature_def_key, signature_def);
+}
+
+Status FindInputTensorInfoByKey(const SignatureDef& signature_def,
+                                const string& tensor_info_key,
+                                const TensorInfo** tensor_info) {
+  return FindInProtobufMap("input TensorInfo", signature_def.inputs(),
+                           tensor_info_key, tensor_info);
+}
+
+Status FindOutputTensorInfoByKey(const SignatureDef& signature_def,
+                                 const string& tensor_info_key,
+                                 const TensorInfo** tensor_info) {
+  return FindInProtobufMap("output TensorInfo", signature_def.outputs(),
+                           tensor_info_key, tensor_info);
+}
+
+Status FindInputTensorNameByKey(const SignatureDef& signature_def,
+                                const string& tensor_info_key, string* name) {
+  const TensorInfo* tensor_info;
+  TF_RETURN_IF_ERROR(
+      FindInputTensorInfoByKey(signature_def, tensor_info_key, &tensor_info));
+  *name = tensor_info->name();
+  return Status::OK();
+}
+
+Status FindOutputTensorNameByKey(const SignatureDef& signature_def,
+                                 const string& tensor_info_key, string* name) {
+  const TensorInfo* tensor_info;
+  TF_RETURN_IF_ERROR(
+      FindOutputTensorInfoByKey(signature_def, tensor_info_key, &tensor_info));
+  *name = tensor_info->name();
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0df224bc8cffcb485db38dea270600c71070dff
--- /dev/null
+++ b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h
@@ -0,0 +1,69 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helpers for working with the SignatureDefs of TensorFlow SavedModels.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+
+namespace tensorflow {
+
+// Finds the entry in meta_graph_def.signature_def with the given key, or
+// returns NotFound and leaves *signature_def unchanged. NOTE: The output
+// SignatureDef* points into meta_graph_def and may be invalidated by changes
+// to that protocol buffer, as usual.
+Status FindSignatureDefByKey(const MetaGraphDef& meta_graph_def,
+                             const string& signature_def_key,
+                             const SignatureDef** signature_def);
+
+// Finds the entry in signature_def.inputs with the given key, or returns
+// NotFound and leaves *tensor_info unchanged. NOTE: The output TensorInfo*
+// points into signature_def and may be invalidated by changes to that protocol
+// buffer, as usual.
+Status FindInputTensorInfoByKey(const SignatureDef& signature_def,
+                                const string& tensor_info_key,
+                                const TensorInfo** tensor_info);
+
+// Finds the entry in signature_def.outputs with the given key, or returns
+// NotFound and leaves *tensor_info unchanged. NOTE: The output TensorInfo*
+// points into signature_def and may be invalidated by changes to that protocol
+// buffer, as usual.
+Status FindOutputTensorInfoByKey(const SignatureDef& signature_def,
+                                 const string& tensor_info_key,
+                                 const TensorInfo** tensor_info);
+
+// Finds the entry in signature_def.inputs with the given key and copies out
+// the name of this Tensor in the graph, or returns NotFound and leaves *name
+// unchanged.
+Status FindInputTensorNameByKey(const SignatureDef& signature_def,
+                                const string& tensor_info_key, string* name);
+
+// Finds the entry in signature_def.outputs with the given key and copies out
+// the name of this Tensor in the graph, or returns NotFound and leaves *name
+// unchanged.
+Status FindOutputTensorNameByKey(const SignatureDef& signature_def,
+                                 const string& tensor_info_key, string* name);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SAVED_MODEL_CC_SAVED_MODEL_SIGNATURE_DEF_UTILS_H_
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils_test.cc b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a063e9569602efc88e6064b452bd893ed4ae0614
--- /dev/null
+++ b/tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/saved_model/cc/saved_model/signature_def_utils.h"
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class SignatureDefUtilsTest : public ::testing::Test {
+ protected:
+  MetaGraphDef MakeSampleMetaGraphDef() {
+    MetaGraphDef result;
+    (*result.mutable_signature_def())["blah"].set_method_name("foo");
+    (*result.mutable_signature_def())[kSignatureKey] = MakeSampleSignatureDef();
+    (*result.mutable_signature_def())["gnarl"].set_method_name("blah");
+    return result;
+  }
+
+  SignatureDef MakeSampleSignatureDef() {
+    SignatureDef result;
+    result.set_method_name(kMethodName);
+    (*result.mutable_inputs())[kInput1Key].set_name(kInput1Name);
+    (*result.mutable_inputs())[kInput2Key].set_name(kInput2Name);
+    (*result.mutable_outputs())[kOutput1Key].set_name(kOutput1Name);
+    (*result.mutable_outputs())[kOutput2Key].set_name(kOutput2Name);
+    return result;
+  }
+
+  const string kSignatureKey = "my_signature";
+  const string kMethodName = "my_method";
+  const string kInput1Key = "input_one_key";
+  const string kInput1Name = "input_one";
+  const string kInput2Key = "input_two_key";
+  const string kInput2Name = "input_two";
+  const string kOutput1Key = "output_one_key";
+  const string kOutput1Name = "output_one";
+  const string kOutput2Key = "output_two_key";
+  const string kOutput2Name = "output_two";
+};
+
+TEST_F(SignatureDefUtilsTest, FindSignatureDefByKey) {
+  const MetaGraphDef meta_graph_def = MakeSampleMetaGraphDef();
+  const SignatureDef* signature_def;
+  // Succeeds for an existing signature.
+  TF_ASSERT_OK(
+      FindSignatureDefByKey(meta_graph_def, kSignatureKey, &signature_def));
+  EXPECT_EQ(kMethodName, signature_def->method_name());
+  // Fails for a missing signature.
+  EXPECT_FALSE(
+      FindSignatureDefByKey(meta_graph_def, "nonexistent", &signature_def)
+          .ok());
+}
+
+TEST_F(SignatureDefUtilsTest, FindInputTensorNameByKey) {
+  const SignatureDef signature_def = MakeSampleSignatureDef();
+  string name;
+  // Succeeds for an existing input.
+  TF_ASSERT_OK(FindInputTensorNameByKey(signature_def, kInput2Key, &name));
+  EXPECT_EQ(kInput2Name, name);
+  // Fails for a missing input.
+  EXPECT_FALSE(
+      FindInputTensorNameByKey(signature_def, "nonexistent", &name).ok());
+}
+
+TEST_F(SignatureDefUtilsTest, FindOutputTensorNameByKey) {
+  const SignatureDef signature_def = MakeSampleSignatureDef();
+  string name;
+  // Succeeds for an existing output.
+  TF_ASSERT_OK(FindOutputTensorNameByKey(signature_def, kOutput2Key, &name));
+  EXPECT_EQ(kOutput2Name, name);
+  // Fails for a missing output.
+  EXPECT_FALSE(
+      FindOutputTensorNameByKey(signature_def, "nonexistent", &name).ok());
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index 652bbba85ef1ac72ad30e714fe90cd903e7bf200..f1e39a137322711efacda02abd3c13f528981bc1 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -8,12 +8,28 @@ exports_files(["LICENSE"])
 package(default_visibility = ["//tensorflow:__subpackages__"])
 
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_kernel_library",
+    "tf_gen_op_wrapper_py",
+)
 
-py_library(
+tf_custom_op_py_library(
     name = "seq2seq_py",
     srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+    dso = [
+        ":python/ops/_beam_search_ops.so",
+    ],
+    kernels = [
+        ":beam_search_ops_kernels",
+        ":beam_search_ops_op_lib",
+    ],
     srcs_version = "PY2AND3",
     deps = [
+        ":beam_search_ops",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/rnn:rnn_py",
@@ -21,11 +37,52 @@ py_library(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:rnn",
         "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
+        "//third_party/py/numpy",
+    ],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_beam_search_ops.so",
+    srcs = [
+        "kernels/beam_search_ops.cc",
+        "kernels/beam_search_ops.h",
+        "ops/beam_search_ops.cc",
+    ],
+    gpu_srcs = [
+        "kernels/beam_search_ops_gpu.cu.cc",
+        "kernels/beam_search_ops.h",
+    ],
+    deps = [
+        "//tensorflow/core/kernels:eigen_helpers",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "beam_search_ops",
+    deps = [":beam_search_ops_op_lib"],
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "beam_search_ops",
+    ],
+)
+
+tf_kernel_library(
+    name = "beam_search_ops_kernels",
+    prefix = "kernels/beam_search_ops",
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:eigen_helpers",
+        "//third_party/eigen3",
     ],
 )
 
@@ -67,6 +124,20 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "beam_search_ops_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/beam_search_ops_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "decoder_test",
     size = "medium",
@@ -87,6 +158,27 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "beam_search_decoder_test",
+    size = "small",
+    srcs = ["python/kernel_tests/beam_search_decoder_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "attention_wrapper_test",
     size = "medium",
diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py
index 277434c16069ab85bcfb34744d9301c87074b58c..dc159b93a3781cb2cf90eb99a0a9d9e1aecf573b 100644
--- a/tensorflow/contrib/seq2seq/__init__.py
+++ b/tensorflow/contrib/seq2seq/__init__.py
@@ -16,27 +16,6 @@
 """Ops for building neural network seq2seq decoders and losses.
 
 See the @{$python/contrib.seq2seq} guide.
-
-@@Decoder
-@@dynamic_decode
-
-@@BasicDecoderOutput
-@@BasicDecoder
-
-@@Helper
-@@CustomHelper
-@@GreedyEmbeddingHelper
-@@ScheduledEmbeddingTrainingHelper
-@@ScheduledOutputTrainingHelper
-@@TrainingHelper
-
-@@BahdanauAttention
-@@LuongAttention
-
-@@hardmax
-
-@@AttentionWrapperState
-@@AttentionWrapper
 """
 
 from __future__ import absolute_import
@@ -46,13 +25,38 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import,line-too-long
 from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import *
 from tensorflow.contrib.seq2seq.python.ops.basic_decoder import *
+from tensorflow.contrib.seq2seq.python.ops.beam_search_decoder import *
+from tensorflow.contrib.seq2seq.python.ops.beam_search_ops import *
 from tensorflow.contrib.seq2seq.python.ops.decoder import *
 from tensorflow.contrib.seq2seq.python.ops.helper import *
 from tensorflow.contrib.seq2seq.python.ops.loss import *
+from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,widcard-import,line-too-long
 
-from tensorflow.python.util.all_util import remove_undocumented
+_allowed_symbols = [
+    "sequence_loss",
+    "Decoder",
+    "dynamic_decode",
+    "BasicDecoder",
+    "BasicDecoderOutput",
+    "BeamSearchDecoder",
+    "BeamSearchDecoderOutput",
+    "BeamSearchDecoderState",
+    "Helper",
+    "CustomHelper",
+    "FinalBeamSearchDecoderOutput",
+    "gather_tree",
+    "GreedyEmbeddingHelper",
+    "ScheduledEmbeddingTrainingHelper",
+    "ScheduledOutputTrainingHelper",
+    "TrainingHelper",
+    "BahdanauAttention",
+    "LuongAttention",
+    "hardmax",
+    "AttentionWrapperState",
+    "AttentionWrapper",
+    "AttentionMechanism",
+    "tile_batch"]
 
-_allowed_symbols = ["sequence_loss"]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec493b84635aa6c92680685caded321b24f7f636
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc
@@ -0,0 +1,174 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
+
+#include <memory>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class GatherTreeOp : public OpKernel {
+ public:
+  explicit GatherTreeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Device& device = ctx->eigen_device<Device>();
+    const Tensor& step_ids = ctx->input(0);
+    const Tensor& parent_ids = ctx->input(1);
+    const Tensor& sequence_length = ctx->input(2);
+    const TensorShape& step_ids_shape = step_ids.shape();
+    OP_REQUIRES(
+        ctx, step_ids_shape.dims() == 3,
+        errors::InvalidArgument("step_ids must be a 3-tensor, saw shape: ",
+                                step_ids_shape.DebugString()));
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsMatrix(sequence_length.shape()),
+        errors::InvalidArgument("sequence_length must be a matrix, saw shape: ",
+                                sequence_length.shape().DebugString()));
+    OP_REQUIRES(ctx, sequence_length.dim_size(0) == step_ids_shape.dim_size(1),
+                errors::InvalidArgument(
+                    "Inconsistent batch sizes: sequence_length.shape[0] (",
+                    sequence_length.dim_size(0), ") != ", "step_ids.shape[1] (",
+                    step_ids_shape.dim_size(1), ")"));
+    OP_REQUIRES(ctx, sequence_length.dim_size(1) == step_ids_shape.dim_size(2),
+                errors::InvalidArgument(
+                    "Inconsistent batch sizes: sequence_length.shape[1] (",
+                    sequence_length.dim_size(1), ") != ", "step_ids.shape[2] (",
+                    step_ids_shape.dim_size(2), ")"));
+    OP_REQUIRES(
+        ctx, step_ids_shape == parent_ids.shape(),
+        errors::InvalidArgument(
+            "step_ids.shape must match parent_ids.shape.  but shapes are: ",
+            step_ids_shape.DebugString(), " and ",
+            parent_ids.shape().DebugString()));
+    Tensor* beams;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, step_ids_shape, &beams));
+    typename TTypes<T, 3>::ConstTensor step_ids_t = step_ids.tensor<T, 3>();
+    typename TTypes<T, 3>::ConstTensor parent_ids_t = parent_ids.tensor<T, 3>();
+    typename TTypes<T>::ConstMatrix seq_len_t = sequence_length.matrix<T>();
+    typename TTypes<T, 3>::Tensor beams_t = beams->tensor<T, 3>();
+    functor::GatherTree<Device, T>()(ctx, device, step_ids_t, parent_ids_t,
+                                     seq_len_t, beams_t);
+  }
+};
+
+#define REGISTER_KERNEL(T)                                          \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("GatherTree").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      GatherTreeOp<CPUDevice, T>);
+REGISTER_KERNEL(int32);
+#undef REGISTER_KERNEL
+
+namespace functor {
+
+// CPU specialization
+template <>
+struct GatherTree<CPUDevice, int32> {
+  void operator()(OpKernelContext* ctx, const CPUDevice& d,
+                  typename TTypes<int32, 3>::ConstTensor step_ids,
+                  typename TTypes<int32, 3>::ConstTensor parent_ids,
+                  typename TTypes<int32>::ConstMatrix sequence_length,
+                  typename TTypes<int32, 3>::Tensor beams) {
+    const int64 max_time = parent_ids.dimension(0);
+    const int64 batch_size = parent_ids.dimension(1);
+    const int64 beam_width = parent_ids.dimension(2);
+    beams.setConstant(-1);
+
+    auto DoWork = [&, ctx](int start_batch_beam, int limit_batch_beam) {
+      for (int32 i = start_batch_beam; i < limit_batch_beam; ++i) {
+        const int32 batch = i / beam_width;
+        const int32 beam = i % beam_width;
+        int32 seq_len_b = sequence_length(batch, beam);
+        if (seq_len_b == 0) {
+          continue;
+        }
+        beams(seq_len_b - 1, batch, beam) =
+            step_ids(seq_len_b - 1, batch, beam);
+        int32 parent = parent_ids(seq_len_b - 1, batch, beam);
+        for (int32 level = seq_len_b - 2; level >= 0; --level) {
+          if (parent < 0 || parent > beam_width) {
+            ctx->SetStatus(
+                errors::InvalidArgument("Saw invalid parent id ", parent,
+                                        " at (batch, time, beam) == (", batch,
+                                        ", ", level, ", ", beam, ")"));
+            return;
+          }
+          beams(level, batch, beam) = step_ids(level, batch, parent);
+          parent = parent_ids(level, batch, parent);
+        }
+      }
+    };
+    // Guesstimate of cost; ~5 lookup/store/compare per inner beam
+    // traversal time step.
+    const int64 batch_beam_cost =
+        Eigen::TensorOpCost::DivCost<int32>() +
+        6 * Eigen::TensorOpCost::AddCost<int32>() +
+        max_time * (5 * Eigen::TensorOpCost::AddCost<int32>());
+    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers,
+          batch_size * beam_width, batch_beam_cost, DoWork);
+  }
+};
+
+}  // namespace functor
+
+#if GOOGLE_CUDA
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                            \
+  template <>                                          \
+  void GatherTree<GPUDevice, T>::operator()(           \
+      OpKernelContext* ctx, const GPUDevice& d,        \
+      typename TTypes<T, 3>::ConstTensor step_ids,     \
+      typename TTypes<T, 3>::ConstTensor parent_ids,   \
+      typename TTypes<T>::ConstMatrix sequence_length, \
+      typename TTypes<T, 3>::Tensor beams);            \
+  extern template struct GatherTree<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(int32);
+#undef DECLARE_GPU_SPEC
+}  // end namespace functor
+
+#define REGISTER_GPU_KERNEL(T)                                      \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("GatherTree").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      GatherTreeOp<GPUDevice, T>);
+
+REGISTER_GPU_KERNEL(int32);
+#undef REGISTER_GPU_KERNEL
+#endif  // GOOGLE_CUDA
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..124d07264e75ac4ce7739dd3291abdabbb40a58f
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/eigen_activations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+class OpKernelContext;
+
+namespace functor {
+
+template <typename Device, typename T>
+struct GatherTree {
+  void operator()(OpKernelContext* ctx, const Device& d,
+                  typename TTypes<T, 3>::ConstTensor step_ids,
+                  typename TTypes<T, 3>::ConstTensor parent_ids,
+                  typename TTypes<T>::ConstMatrix sequence_length,
+                  typename TTypes<T, 3>::Tensor beams);
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_SEQ2SEQ_KERNELS_BEAM_SEARCH_OPS_H_
diff --git a/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc b/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e3c0d0bfa9854e71c2cf3feb3de78637340df09c
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc
@@ -0,0 +1,88 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename T>
+__global__ void GatherTreeOpKernel(const int32 batch_size, const int32 max_time,
+                                   const int32 beam_width, const T* step_ids,
+                                   const T* parent_ids,
+                                   const T* sequence_length, T* beams) {
+  CUDA_1D_KERNEL_LOOP(i, batch_size * beam_width) {
+    const int32 batch = i / beam_width;
+    const int32 beam = i % beam_width;
+    const int32 seq_len_b = ldg(sequence_length + batch * beam_width + beam);
+#define GET_IX(time_ix, beam_ix) \
+  (batch_size * beam_width * (time_ix) + beam_width * batch + (beam_ix))
+    const int32 initial_beam_ix = GET_IX(seq_len_b - 1, beam);
+    beams[initial_beam_ix] = ldg(step_ids + initial_beam_ix);
+    int32 parent = ldg(parent_ids + initial_beam_ix);
+    for (int32 level = seq_len_b - 2; level >= 0; --level) {
+      const int32 level_beam_ix = GET_IX(level, beam);
+      const int32 level_parent_ix = GET_IX(level, parent);
+      if (parent < 0 || parent > beam_width) {
+        beams[level_beam_ix] = -1;
+        parent = -1;
+      } else {
+        beams[level_beam_ix] = ldg(step_ids + level_parent_ix);
+        parent = ldg(parent_ids + level_parent_ix);
+      }
+    }
+#undef GET_IX
+  }
+}
+
+template <typename T>
+struct GatherTree<GPUDevice, T> {
+  void operator()(OpKernelContext* ctx, const GPUDevice& d,
+                  typename TTypes<T, 3>::ConstTensor step_ids,
+                  typename TTypes<T, 3>::ConstTensor parent_ids,
+                  typename TTypes<T>::ConstMatrix sequence_length,
+                  typename TTypes<T, 3>::Tensor beams) {
+    const int32 max_time = parent_ids.dimension(0);
+    const int32 batch_size = parent_ids.dimension(1);
+    const int32 beam_width = parent_ids.dimension(2);
+    // First kernel launch to zero things out
+    beams.device(d) = beams.constant(T(-1));
+
+    CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * beam_width, d);
+    // clang-format off
+    GatherTreeOpKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            batch_size, max_time, beam_width,
+            step_ids.data(), parent_ids.data(), sequence_length.data(),
+            beams.data());
+    // clang-format on
+  }
+};
+
+#define DEFINE_GPU_SPECS(T) template struct GatherTree<GPUDevice, T>;
+
+DEFINE_GPU_SPECS(int32);
+#undef DEFINE_GPU_SPECS
+
+}  // end namespace functor
+}  // end namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc b/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c445cd4606381ed56d91000bc5e42d874ca0c5c
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc
@@ -0,0 +1,68 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("GatherTree")
+    .Input("step_ids: T")
+    .Input("parent_ids: T")
+    .Input("sequence_length: T")
+    .Output("beams: T")
+    .Attr("T: {int32}")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle step_ids, parent_ids, sequence_length;
+
+      // step_ids, parent_ids, and output are all shaped:
+      //   [max_time, batch_size, beam_width].
+      // sequence_length is shaped [batch_size, beam_width].
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &step_ids));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &parent_ids));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &sequence_length));
+
+      DimensionHandle batch_size = c->Dim(step_ids, 1);
+      DimensionHandle beam_width = c->Dim(step_ids, 2);
+
+      TF_RETURN_IF_ERROR(c->Merge(step_ids, parent_ids, &step_ids));
+      TF_RETURN_IF_ERROR(
+          c->Merge(batch_size, c->Dim(sequence_length, 0), &batch_size));
+      TF_RETURN_IF_ERROR(
+          c->Merge(beam_width, c->Dim(sequence_length, 1), &beam_width));
+
+      c->set_output(0, step_ids);
+      return tensorflow::Status::OK();
+    })
+    .Doc(R"doc(
+Calculates the full beams from the per-step ids and parent beam ids.
+
+This op implements the following mathematical equations:
+
+```python
+TODO(ebrevdo): fill in
+```
+
+step_ids: `[max_time, batch_size, beam_width]`.
+parent_ids: `[max_time, batch_size, beam_width]`.
+sequence_length: `[batch_size, beam_width]`.
+beams: `[max_time, batch_size, beam_width]`.
+)doc");
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 9c3015ff2504fd63a16ad5b680323e52652a84d1..b8b420e10a7e3d6c5cb5d4d78aecbf754ab638fc 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 # pylint: enable=unused-import
 
-import sys
+import collections
 import functools
 
 import numpy as np
@@ -46,23 +46,44 @@ BasicDecoderOutput = basic_decoder.BasicDecoderOutput  # pylint: disable=invalid
 float32 = np.float32
 int32 = np.int32
 array = np.array
+dtype = np.dtype
+
+
+class ResultSummary(
+    collections.namedtuple('ResultSummary', ('shape', 'dtype', 'mean'))):
+  pass
+
+
+def get_result_summary(x):
+  if isinstance(x, np.ndarray):
+    return ResultSummary(x.shape, x.dtype, x.mean())
+  return x
 
 
 class AttentionWrapperTest(test.TestCase):
 
-  def assertAllClose(self, *args, **kwargs):
-    kwargs["atol"] = 1e-4  # For GPU tests
-    kwargs["rtol"] = 1e-4  # For GPU tests
-    return super(AttentionWrapperTest, self).assertAllClose(
-        *args, **kwargs)
+  def assertAllCloseOrEqual(self, x, y, **kwargs):
+    if isinstance(x, np.ndarray) or isinstance(x, float):
+      return super(AttentionWrapperTest, self).assertAllClose(x, y, **kwargs)
+    else:
+      self.assertAllEqual(x, y, **kwargs)
+
+  def testAttentionWrapperState(self):
+    num_fields = len(wrapper.AttentionWrapperState._fields)  # pylint: disable=protected-access
+    state = wrapper.AttentionWrapperState(*([None] * num_fields))
+    new_state = state.clone(time=1)
+    self.assertEqual(state.time, None)
+    self.assertEqual(new_state.time, 1)
 
   def _testWithAttention(self,
                          create_attention_mechanism,
                          expected_final_output,
                          expected_final_state,
                          attention_mechanism_depth=3,
-                         attention_history=False,
-                         name=""):
+                         alignment_history=False,
+                         expected_final_alignment_history=None,
+                         attention_layer_size=6,
+                         name=''):
     encoder_sequence_length = [3, 2, 3, 1, 0]
     decoder_sequence_length = [2, 0, 1, 2, 3]
     batch_size = 5
@@ -71,7 +92,11 @@ class AttentionWrapperTest(test.TestCase):
     input_depth = 7
     encoder_output_depth = 10
     cell_depth = 9
-    attention_depth = 6
+
+    if attention_layer_size is not None:
+      attention_depth = attention_layer_size
+    else:
+      attention_depth = encoder_output_depth
 
     decoder_inputs = np.random.randn(batch_size, decoder_max_time,
                                      input_depth).astype(np.float32)
@@ -83,14 +108,16 @@ class AttentionWrapperTest(test.TestCase):
         memory=encoder_outputs,
         memory_sequence_length=encoder_sequence_length)
 
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       with vs.variable_scope(
-          "root",
+          'root',
           initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
         cell = core_rnn_cell.LSTMCell(cell_depth)
         cell = wrapper.AttentionWrapper(
-            cell, attention_mechanism, attention_size=attention_depth,
-            attention_history=attention_history)
+            cell,
+            attention_mechanism,
+            attention_layer_size=attention_layer_size,
+            alignment_history=alignment_history)
         helper = helper_py.TrainingHelper(decoder_inputs,
                                           decoder_sequence_length)
         my_decoder = basic_decoder.BasicDecoder(
@@ -99,7 +126,7 @@ class AttentionWrapperTest(test.TestCase):
             initial_state=cell.zero_state(
                 dtype=dtypes.float32, batch_size=batch_size))
 
-        final_outputs, final_state = decoder.dynamic_decode(my_decoder)
+        final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
 
       self.assertTrue(
           isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
@@ -120,569 +147,185 @@ class AttentionWrapperTest(test.TestCase):
       self.assertEqual((batch_size, cell_depth),
                        tuple(final_state.cell_state.h.get_shape().as_list()))
 
-      if attention_history:
-        state_attention_history = final_state.attention_history.stack()
+      if alignment_history:
+        state_alignment_history = final_state.alignment_history.stack()
         # Remove the history from final_state for purposes of the
         # remainder of the tests.
-        final_state = final_state._replace(attention_history=())  # pylint: disable=protected-access
-        self.assertEqual((None, batch_size, attention_depth),
-                         tuple(state_attention_history.get_shape().as_list()))
+        final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
+        self.assertEqual((None, batch_size, encoder_max_time),
+                         tuple(state_alignment_history.get_shape().as_list()))
       else:
-        state_attention_history = ()
+        state_alignment_history = ()
 
       sess.run(variables.global_variables_initializer())
       sess_results = sess.run({
-          "final_outputs": final_outputs,
-          "final_state": final_state,
-          "state_attention_history": state_attention_history,
+          'final_outputs': final_outputs,
+          'final_state': final_state,
+          'state_alignment_history': state_alignment_history,
       })
 
-      print("Copy/paste (%s)\nexpected_final_output = " % name,
-            sess_results["final_outputs"])
-      sys.stdout.flush()
-      print("Copy/paste (%s)\nexpected_final_state = " % name,
-            sess_results["final_state"])
-      sys.stdout.flush()
-      nest.map_structure(self.assertAllClose, expected_final_output,
-                         sess_results["final_outputs"])
-      nest.map_structure(self.assertAllClose, expected_final_state,
-                         sess_results["final_state"])
-      if attention_history:  # by default, the wrapper emits attention as output
-        self.assertAllClose(
+      final_output_info = nest.map_structure(get_result_summary,
+                                             sess_results['final_outputs'])
+      final_state_info = nest.map_structure(get_result_summary,
+                                            sess_results['final_state'])
+      print('Copy/paste:\nexpected_final_output = %s' % str(final_output_info))
+      print('expected_final_state = %s' % str(final_state_info))
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
+                         final_output_info)
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_state,
+                         final_state_info)
+      if alignment_history:  # by default, the wrapper emits attention as output
+        final_alignment_history_info = nest.map_structure(
+            get_result_summary, sess_results['state_alignment_history'])
+        print('expected_final_alignment_history = %s' %
+              str(final_alignment_history_info))
+        nest.map_structure(
+            self.assertAllCloseOrEqual,
             # outputs are batch major but the stacked TensorArray is time major
-            sess_results["state_attention_history"],
-            np.transpose(sess_results["final_outputs"].rnn_output,
-                         (1, 0, 2)))
+            expected_final_alignment_history,
+            final_alignment_history_info)
 
   def testBahdanauNotNormalized(self):
     create_attention_mechanism = wrapper.BahdanauAttention
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                1.89980457e-03, 1.89681584e-03, 2.05339328e-03, -3.83376027e-03,
-                -4.31808922e-03, -6.45466987e-03
-            ], [
-                2.27232254e-03, 2.02509761e-03, 2.01666891e-03, -3.87230632e-03,
-                -3.47119337e-03, -6.15991233e-03
-            ], [
-                1.87640532e-03, 2.07374478e-03, 2.30582547e-03, -3.64564802e-03,
-                -3.75995948e-03, -6.28685066e-03
-            ]], [[
-                4.89835022e-03, -1.94158917e-03, 3.32316267e-03,
-                -2.82446202e-03, 3.63192149e-03, -4.80734091e-03
-            ], [
-                5.14256489e-03, -2.00877781e-03, 3.49807227e-03,
-                -2.86567654e-03, 3.14202951e-03, -5.32575324e-03
-            ], [
-                5.21511910e-03, -2.18198029e-03, 3.56219849e-03,
-                -2.88951304e-03, 3.20866983e-03, -5.21918852e-03
-            ]], [[
-                -1.34951377e-03, -9.68646549e-04, -2.11444520e-03,
-                -1.85243192e-03, -5.27541339e-03, -9.10969637e-03
-            ], [
-                -1.36390887e-03, -1.01293903e-03, -1.96592091e-03,
-                -1.80044665e-03, -5.62618347e-03, -9.36636236e-03
-            ], [
-                -1.13357347e-03, -7.37126335e-04, -1.99582824e-03,
-                -1.88097963e-03, -5.03196474e-03, -9.34652984e-03
-            ]], [[
-                1.52963377e-03, -3.97205260e-03, -9.64675564e-04,
-                8.51404853e-04, -1.29804458e-03, 6.56467676e-03
-            ], [
-                1.22557906e-03, -4.56343032e-03, -1.08188344e-03,
-                8.27252632e-04, -2.10058759e-03, 6.43082103e-03
-            ], [
-                9.93478228e-04, -4.37378604e-03, -1.41531695e-03,
-                6.44775166e-04, -2.16480484e-03, 6.68286439e-03
-            ]], [[
-                -3.78854020e-04, 5.62231544e-05, 1.06837302e-04, 1.87137164e-04,
-                -1.56512906e-04, 9.63474595e-05
-            ], [
-                -1.04306288e-04, -1.37411975e-04, 2.82689070e-05,
-                6.56487318e-05, -1.48634164e-04, -1.84347919e-05
-            ], [
-                1.24452345e-04, 2.20821079e-04, 4.07114130e-04, 2.18028668e-04,
-                2.73401442e-04, -2.69805576e-04
-            ]]],
-            dtype=float32),
-        sample_id=array(
-            [[2, 0, 2], [0, 0, 0], [1, 1, 1], [5, 5, 5], [3, 3, 2]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00083043973),
+        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -2.18963176e-02, -8.04424379e-03, -1.48289464e-03,
-                    1.61068402e-02, -1.37983467e-02, -7.57976994e-03,
-                    -8.28560349e-03, -1.18737305e-02, 1.78835373e-02
-                ], [
-                    1.74205080e-02, -1.41929444e-02, -3.88092734e-03,
-                    3.19708064e-02, -3.54689620e-02, -2.14698724e-02,
-                    -6.21716119e-03, -1.69295724e-03, -1.94495302e-02
-                ], [
-                    -1.14528481e-02, 8.77819210e-03, -1.62970200e-02,
-                    -1.39963552e-02, 1.34831406e-02, -1.04494914e-02,
-                    6.16127765e-03, -9.41022579e-03, -6.57590060e-03
-                ], [
-                    -4.74753827e-02, -1.19123599e-02, -7.40140676e-05,
-                    4.10552323e-02, -1.36711076e-03, 2.11795457e-02,
-                    -2.80460119e-02, -5.44509329e-02, -2.91906092e-02
-                ], [
-                    2.25644894e-02, -1.40382675e-03, 1.92396250e-02,
-                    5.49034867e-03, -1.27930511e-02, -3.15603940e-03,
-                    -5.05525898e-03, 2.19191350e-02, 1.62497871e-02
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -1.09840557e-02, -3.97477299e-03, -7.54582870e-04,
-                    7.91188516e-03, -7.02184858e-03, -3.80711886e-03,
-                    -4.22059745e-03, -6.05464494e-03, 8.92061181e-03
-                ], [
-                    8.68131686e-03, -7.16938032e-03, -1.88384682e-03,
-                    1.62678920e-02, -1.76827926e-02, -1.06622791e-02,
-                    -3.07528162e-03, -8.45885137e-04, -9.99388192e-03
-                ], [
-                    -5.71205560e-03, 4.50050412e-03, -8.07640795e-03,
-                    -6.94844872e-03, 6.75682165e-03, -5.12113515e-03,
-                    3.06208082e-03, -4.61743120e-03, -3.23931244e-03
-                ], [
-                    -2.37231534e-02, -5.88526297e-03, -3.72226204e-05,
-                    2.01789513e-02, -6.75848918e-04, 1.06686354e-02,
-                    -1.42624676e-02, -2.69628745e-02, -1.45034352e-02
-                ], [
-                    1.12585640e-02, -6.92534202e-04, 9.88917705e-03,
-                    2.75237625e-03, -6.56115822e-03, -1.57997780e-03,
-                    -2.54477374e-03, 1.11598391e-02, 7.94144534e-03
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                0.00187641, 0.00207374, 0.00230583, -0.00364565, -0.00375996,
-                -0.00628685
-            ], [
-                0.00521512, -0.00218198, 0.0035622, -0.00288951, 0.00320867,
-                -0.00521919
-            ], [
-                -0.00113357, -0.00073713, -0.00199583, -0.00188098, -0.00503196,
-                -0.00934653
-            ], [
-                0.00099348, -0.00437379, -0.00141532, 0.00064478, -0.0021648,
-                0.00668286
-            ], [
-                0.00012445, 0.00022082, 0.00040711, 0.00021803, 0.0002734,
-                -0.00026981
-            ]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039763632),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019849765)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00081052497),
         time=3,
-        attention_history=())
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.12500001)
 
     self._testWithAttention(
         create_attention_mechanism,
         expected_final_output,
         expected_final_state,
-        attention_history=True,
-        name="testBahdanauNotNormalized")
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        name='testBahdanauNotNormalized')
 
   def testBahdanauNormalized(self):
     create_attention_mechanism = functools.partial(
         wrapper.BahdanauAttention, normalize=True)
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                6.64783875e-03, 2.94425711e-03, 5.26542449e-03, -2.64955591e-03,
-                -7.95925129e-03, -5.02286293e-03
-            ], [
-                7.01954123e-03, 3.07301106e-03, 5.22849336e-03, -2.68844375e-03,
-                -7.11239874e-03, -4.72904276e-03
-            ], [
-                6.62360899e-03, 3.12234787e-03, 5.51807694e-03, -2.46222341e-03,
-                -7.40198931e-03, -4.85701021e-03
-            ]], [[
-                7.37589924e-03, -1.02620223e-03, 3.61374952e-03,
-                -5.74620720e-03, 5.05625410e-03, -7.45209027e-03
-            ], [
-                7.61946291e-03, -1.09287468e-03, 3.78817180e-03,
-                -5.78709645e-03, 4.56611114e-03, -7.96987582e-03
-            ], [
-                7.69207766e-03, -1.26582675e-03, 3.85218812e-03,
-                -5.81111759e-03, 4.63287206e-03, -7.86337163e-03
-            ]], [[
-                -2.69413739e-03, 3.47183552e-04, -1.82145904e-03,
-                -1.39805069e-03, -8.05486552e-03, -1.08372131e-02
-            ], [
-                -2.70848931e-03, 3.03293345e-04, -1.67230750e-03,
-                -1.34555507e-03, -8.40565283e-03, -1.10935047e-02
-            ], [
-                -2.47822329e-03, 5.79408603e-04, -1.70188327e-03,
-                -1.42583530e-03, -7.81180616e-03, -1.10740755e-02
-            ]], [[
-                1.48582947e-03, -3.88786104e-03, -9.39912978e-04,
-                8.36255029e-04, -1.28223014e-03, 6.40908210e-03
-            ], [
-                1.18177081e-03, -4.47923271e-03, -1.05711201e-03,
-                8.12121783e-04, -2.08477327e-03, 6.27523474e-03
-            ], [
-                9.49664740e-04, -4.28957958e-03, -1.39053771e-03,
-                6.29657647e-04, -2.14899099e-03, 6.52727811e-03
-            ]], [[
-                -3.78854020e-04, 5.62231544e-05, 1.06837302e-04, 1.87137164e-04,
-                -1.56512906e-04, 9.63474595e-05
-            ], [
-                -1.04306288e-04, -1.37411975e-04, 2.82689070e-05,
-                6.56487318e-05, -1.48634164e-04, -1.84347919e-05
-            ], [
-                1.24452345e-04, 2.20821079e-04, 4.07114130e-04, 2.18028668e-04,
-                2.73401442e-04, -2.69805576e-04
-            ]]],
-            dtype=float32),
-        sample_id=array(
-            [[0, 0, 0], [0, 0, 0], [1, 1, 1], [5, 5, 5], [3, 3, 2]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00040482997),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=1.8666666666666667))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -2.19389871e-02, -7.93421268e-03, -1.45148858e-03,
-                    1.61569901e-02, -1.38310911e-02, -7.59426132e-03,
-                    -8.35836027e-03, -1.18763093e-02, 1.78797375e-02
-                ], [
-                    1.74194798e-02, -1.41677596e-02, -3.89095861e-03,
-                    3.19508761e-02, -3.54519747e-02, -2.15105712e-02,
-                    -6.20894879e-03, -1.72719418e-03, -1.94605980e-02
-                ], [
-                    -1.14357909e-02, 8.76635592e-03, -1.62690803e-02,
-                    -1.39883338e-02, 1.34323873e-02, -1.04959216e-02,
-                    6.09614328e-03, -9.38197412e-03, -6.57159975e-03
-                ], [
-                    -4.74738739e-02, -1.19136795e-02, -7.36564398e-05,
-                    4.10547666e-02, -1.36771239e-03, 2.11771261e-02,
-                    -2.80481018e-02, -5.44515178e-02, -2.91903559e-02
-                ], [
-                    2.25644894e-02, -1.40382675e-03, 1.92396250e-02,
-                    5.49034867e-03, -1.27930511e-02, -3.15603940e-03,
-                    -5.05525898e-03, 2.19191350e-02, 1.62497871e-02
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -1.10049099e-02, -3.92028037e-03, -7.38571223e-04,
-                    7.93652050e-03, -7.03821564e-03, -3.81436548e-03,
-                    -4.25778655e-03, -6.05606195e-03, 8.91851448e-03
-                ], [
-                    8.68070032e-03, -7.15647917e-03, -1.88874488e-03,
-                    1.62575077e-02, -1.76745858e-02, -1.06826536e-02,
-                    -3.07105901e-03, -8.63034453e-04, -9.99918394e-03
-                ], [
-                    -5.70359221e-03, 4.49446775e-03, -8.06238409e-03,
-                    -6.94446685e-03, 6.73149945e-03, -5.14409645e-03,
-                    3.02969781e-03, -4.60351165e-03, -3.23720207e-03
-                ], [
-                    -2.37224046e-02, -5.88591257e-03, -3.70427515e-05,
-                    2.01787166e-02, -6.76146999e-04, 1.06674293e-02,
-                    -1.42635051e-02, -2.69631781e-02, -1.45033030e-02
-                ], [
-                    1.12585640e-02, -6.92534202e-04, 9.88917705e-03,
-                    2.75237625e-03, -6.56115822e-03, -1.57997780e-03,
-                    -2.54477374e-03, 1.11598391e-02, 7.94144534e-03
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                0.00662361, 0.00312235, 0.00551808, -0.00246222, -0.00740199,
-                -0.00485701
-            ], [
-                0.00769208, -0.00126583, 0.00385219, -0.00581112, 0.00463287,
-                -0.00786337
-            ], [
-                -0.00247822, 0.00057941, -0.00170188, -0.00142584, -0.00781181,
-                -0.01107408
-            ], [
-                0.00094966, -0.00428958, -0.00139054, 0.00062966, -0.00214899,
-                0.00652728
-            ], [
-                0.00012445, 0.00022082, 0.00040711, 0.00021803, 0.0002734,
-                -0.00026981
-            ]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039785588),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019861322)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00038488387),
         time=3,
-        attention_history=())
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        alignment_history=())
 
     self._testWithAttention(
         create_attention_mechanism,
         expected_final_output,
         expected_final_state,
-        name="testBahdanauNormalized")
+        name='testBahdanauNormalized')
 
   def testLuongNotNormalized(self):
     create_attention_mechanism = wrapper.LuongAttention
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                1.74749165e-03, 1.95862399e-03, 2.12293095e-03, -3.75889172e-03,
-                -4.39571124e-03, -6.32379763e-03
-            ], [
-                2.33045570e-03, 1.99094601e-03, 1.98377599e-03, -3.87950847e-03,
-                -3.42792575e-03, -6.17497414e-03
-            ], [
-                1.65032526e-03, 1.96972815e-03, 2.03462853e-03, -3.82007333e-03,
-                -3.46369296e-03, -6.54224353e-03
-            ]], [[
-                4.77780215e-03, -1.98677275e-03, 3.30950436e-03,
-                -2.68179504e-03, 3.56271653e-03, -4.67860466e-03
-            ], [
-                5.13039157e-03, -2.02797214e-03, 3.50760575e-03,
-                -2.83981953e-03, 3.13726603e-03, -5.31156827e-03
-            ], [
-                5.17205056e-03, -2.16446724e-03, 3.53219034e-03,
-                -2.86490913e-03, 3.17879021e-03, -5.17592067e-03
-            ]], [[
-                -1.38538703e-03, -6.40910701e-04, -2.02864106e-03,
-                -1.79018872e-03, -5.18789608e-03, -8.95875692e-03
-            ], [
-                -1.38620089e-03, -7.92010222e-04, -1.91070826e-03,
-                -1.76206254e-03, -5.56525169e-03, -9.27332044e-03
-            ], [
-                -1.11966045e-03, -6.07630936e-04, -1.96643686e-03,
-                -1.86803937e-03, -4.93048411e-03, -9.25842486e-03
-            ]], [[
-                1.50820788e-03, -3.93087184e-03, -9.52563598e-04,
-                8.43994785e-04, -1.29030924e-03, 6.48857141e-03
-            ], [
-                1.17029145e-03, -4.45716921e-03, -1.05062663e-03,
-                8.08141369e-04, -2.08062865e-03, 6.23444980e-03
-            ], [
-                9.67921398e-04, -4.32466762e-03, -1.40085898e-03,
-                6.35969569e-04, -2.15558149e-03, 6.59212377e-03
-            ]], [[
-                -3.78854020e-04, 5.62231544e-05, 1.06837302e-04, 1.87137164e-04,
-                -1.56512906e-04, 9.63474595e-05
-            ], [
-                -1.04306288e-04, -1.37411975e-04, 2.82689070e-05,
-                6.56487318e-05, -1.48634164e-04, -1.84347919e-05
-            ], [
-                1.24452345e-04, 2.20821079e-04, 4.07114130e-04, 2.18028668e-04,
-                2.73401442e-04, -2.69805576e-04
-            ]]],
-            dtype=float32),
-        sample_id=array(
-            [[2, 0, 2], [0, 0, 0], [1, 1, 1], [5, 5, 5], [3, 3, 2]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338),
+        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -2.18960866e-02, -8.04429129e-03, -1.48267671e-03,
-                    1.61071159e-02, -1.37981661e-02, -7.57933082e-03,
-                    -8.28570686e-03, -1.18733812e-02, 1.78834442e-02
-                ], [
-                    1.74204130e-02, -1.41935758e-02, -3.88074201e-03,
-                    3.19713727e-02, -3.54694910e-02, -2.14688145e-02,
-                    -6.21731905e-03, -1.69229065e-03, -1.94492843e-02
-                ], [
-                    -1.14494488e-02, 8.77974741e-03, -1.62960067e-02,
-                    -1.39961652e-02, 1.34879015e-02, -1.04502086e-02,
-                    6.15879148e-03, -9.40956455e-03, -6.57592434e-03
-                ], [
-                    -4.74739634e-02, -1.19136050e-02, -7.36759976e-05,
-                    4.10547927e-02, -1.36767328e-03, 2.11772677e-02,
-                    -2.80479677e-02, -5.44514805e-02, -2.91903690e-02
-                ], [
-                    2.25644894e-02, -1.40382675e-03, 1.92396250e-02,
-                    5.49034867e-03, -1.27930511e-02, -3.15603940e-03,
-                    -5.05525898e-03, 2.19191350e-02, 1.62497871e-02
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -1.09839402e-02, -3.97479767e-03, -7.54472159e-04,
-                    7.91201927e-03, -7.02175125e-03, -3.80689627e-03,
-                    -4.22065007e-03, -6.05447078e-03, 8.92056432e-03
-                ], [
-                    8.68127123e-03, -7.16970162e-03, -1.88375649e-03,
-                    1.62681788e-02, -1.76830534e-02, -1.06617520e-02,
-                    -3.07536125e-03, -8.45551898e-04, -9.99375992e-03
-                ], [
-                    -5.71034756e-03, 4.50129062e-03, -8.07590690e-03,
-                    -6.94835978e-03, 6.75921654e-03, -5.12148207e-03,
-                    3.06083867e-03, -4.61710012e-03, -3.23932176e-03
-                ], [
-                    -2.37224493e-02, -5.88587578e-03, -3.70525813e-05,
-                    2.01787278e-02, -6.76127791e-04, 1.06675029e-02,
-                    -1.42634306e-02, -2.69631632e-02, -1.45033058e-02
-                ], [
-                    1.12585640e-02, -6.92534202e-04, 9.88917705e-03,
-                    2.75237625e-03, -6.56115822e-03, -1.57997780e-03,
-                    -2.54477374e-03, 1.11598391e-02, 7.94144534e-03
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                0.00165033, 0.00196973, 0.00203463, -0.00382007, -0.00346369,
-                -0.00654224
-            ], [
-                0.00517205, -0.00216447, 0.00353219, -0.00286491, 0.00317879,
-                -0.00517592
-            ], [
-                -0.00111966, -0.00060763, -0.00196644, -0.00186804, -0.00493048,
-                -0.00925842
-            ], [
-                0.00096792, -0.00432467, -0.00140086, 0.00063597, -0.00215558,
-                0.00659212
-            ], [
-                0.00012445, 0.00022082, 0.00040711, 0.00021803, 0.0002734,
-                -0.00026981
-            ]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603),
         time=3,
-        attention_history=())
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        alignment_history=())
 
     self._testWithAttention(
         create_attention_mechanism,
         expected_final_output,
         expected_final_state,
         attention_mechanism_depth=9,
-        name="testLuongNotNormalized")
+        name='testLuongNotNormalized')
 
   def testLuongScaled(self):
     create_attention_mechanism = functools.partial(
         wrapper.LuongAttention, scale=True)
 
     expected_final_output = BasicDecoderOutput(
-        rnn_output=array(
-            [[[
-                1.74749165e-03, 1.95862399e-03, 2.12293095e-03, -3.75889172e-03,
-                -4.39571124e-03, -6.32379763e-03
-            ], [
-                2.33045570e-03, 1.99094601e-03, 1.98377599e-03, -3.87950847e-03,
-                -3.42792575e-03, -6.17497414e-03
-            ], [
-                1.65032526e-03, 1.96972815e-03, 2.03462853e-03, -3.82007333e-03,
-                -3.46369296e-03, -6.54224353e-03
-            ]], [[
-                4.77780215e-03, -1.98677275e-03, 3.30950436e-03,
-                -2.68179504e-03, 3.56271653e-03, -4.67860466e-03
-            ], [
-                5.13039157e-03, -2.02797214e-03, 3.50760575e-03,
-                -2.83981953e-03, 3.13726603e-03, -5.31156827e-03
-            ], [
-                5.17205056e-03, -2.16446724e-03, 3.53219034e-03,
-                -2.86490913e-03, 3.17879021e-03, -5.17592067e-03
-            ]], [[
-                -1.38538703e-03, -6.40910701e-04, -2.02864106e-03,
-                -1.79018872e-03, -5.18789608e-03, -8.95875692e-03
-            ], [
-                -1.38620089e-03, -7.92010222e-04, -1.91070826e-03,
-                -1.76206254e-03, -5.56525169e-03, -9.27332044e-03
-            ], [
-                -1.11966045e-03, -6.07630936e-04, -1.96643686e-03,
-                -1.86803937e-03, -4.93048411e-03, -9.25842486e-03
-            ]], [[
-                1.50820788e-03, -3.93087184e-03, -9.52563598e-04,
-                8.43994785e-04, -1.29030924e-03, 6.48857141e-03
-            ], [
-                1.17029145e-03, -4.45716921e-03, -1.05062663e-03,
-                8.08141369e-04, -2.08062865e-03, 6.23444980e-03
-            ], [
-                9.67921398e-04, -4.32466762e-03, -1.40085898e-03,
-                6.35969569e-04, -2.15558149e-03, 6.59212377e-03
-            ]], [[
-                -3.78854020e-04, 5.62231544e-05, 1.06837302e-04, 1.87137164e-04,
-                -1.56512906e-04, 9.63474595e-05
-            ], [
-                -1.04306288e-04, -1.37411975e-04, 2.82689070e-05,
-                6.56487318e-05, -1.48634164e-04, -1.84347919e-05
-            ], [
-                1.24452345e-04, 2.20821079e-04, 4.07114130e-04, 2.18028668e-04,
-                2.73401442e-04, -2.69805576e-04
-            ]]],
-            dtype=float32),
-        sample_id=array(
-            [[2, 0, 2], [0, 0, 0], [1, 1, 1], [5, 5, 5], [3, 3, 2]],
-            dtype=int32))
-
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00084602338),
+        sample_id=ResultSummary(shape=(5, 3), dtype=dtype('int32'), mean=2.0))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
-            c=array(
-                [[
-                    -2.18960866e-02, -8.04429129e-03, -1.48267671e-03,
-                    1.61071159e-02, -1.37981661e-02, -7.57933082e-03,
-                    -8.28570686e-03, -1.18733812e-02, 1.78834442e-02
-                ], [
-                    1.74204130e-02, -1.41935758e-02, -3.88074201e-03,
-                    3.19713727e-02, -3.54694910e-02, -2.14688145e-02,
-                    -6.21731905e-03, -1.69229065e-03, -1.94492843e-02
-                ], [
-                    -1.14494488e-02, 8.77974741e-03, -1.62960067e-02,
-                    -1.39961652e-02, 1.34879015e-02, -1.04502086e-02,
-                    6.15879148e-03, -9.40956455e-03, -6.57592434e-03
-                ], [
-                    -4.74739634e-02, -1.19136050e-02, -7.36759976e-05,
-                    4.10547927e-02, -1.36767328e-03, 2.11772677e-02,
-                    -2.80479677e-02, -5.44514805e-02, -2.91903690e-02
-                ], [
-                    2.25644894e-02, -1.40382675e-03, 1.92396250e-02,
-                    5.49034867e-03, -1.27930511e-02, -3.15603940e-03,
-                    -5.05525898e-03, 2.19191350e-02, 1.62497871e-02
-                ]],
-                dtype=float32),
-            h=array(
-                [[
-                    -1.09839402e-02, -3.97479767e-03, -7.54472159e-04,
-                    7.91201927e-03, -7.02175125e-03, -3.80689627e-03,
-                    -4.22065007e-03, -6.05447078e-03, 8.92056432e-03
-                ], [
-                    8.68127123e-03, -7.16970162e-03, -1.88375649e-03,
-                    1.62681788e-02, -1.76830534e-02, -1.06617520e-02,
-                    -3.07536125e-03, -8.45551898e-04, -9.99375992e-03
-                ], [
-                    -5.71034756e-03, 4.50129062e-03, -8.07590690e-03,
-                    -6.94835978e-03, 6.75921654e-03, -5.12148207e-03,
-                    3.06083867e-03, -4.61710012e-03, -3.23932176e-03
-                ], [
-                    -2.37224493e-02, -5.88587578e-03, -3.70525813e-05,
-                    2.01787278e-02, -6.76127791e-04, 1.06675029e-02,
-                    -1.42634306e-02, -2.69631632e-02, -1.45033058e-02
-                ], [
-                    1.12585640e-02, -6.92534202e-04, 9.88917705e-03,
-                    2.75237625e-03, -6.56115822e-03, -1.57997780e-03,
-                    -2.54477374e-03, 1.11598391e-02, 7.94144534e-03
-                ]],
-                dtype=float32)),
-        attention=array(
-            [[
-                0.00165033, 0.00196973, 0.00203463, -0.00382007, -0.00346369,
-                -0.00654224
-            ], [
-                0.00517205, -0.00216447, 0.00353219, -0.00286491, 0.00317879,
-                -0.00517592
-            ], [
-                -0.00111966, -0.00060763, -0.00196644, -0.00186804, -0.00493048,
-                -0.00925842
-            ], [
-                0.00096792, -0.00432467, -0.00140086, 0.00063597, -0.00215558,
-                0.00659212
-            ], [
-                0.00012445, 0.00022082, 0.00040711, 0.00021803, 0.0002734,
-                -0.00026981
-            ]],
-            dtype=float32),
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0039764317),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0019850098)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=dtype('float32'), mean=-0.00080144603),
         time=3,
-        attention_history=())
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        alignment_history=())
 
     self._testWithAttention(
         create_attention_mechanism,
         expected_final_output,
         expected_final_state,
         attention_mechanism_depth=9,
-        name="testLuongScaled")
+        name='testLuongScaled')
+
+  def testNotUseAttentionLayer(self):
+    create_attention_mechanism = wrapper.BahdanauAttention
+
+    expected_final_output = BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 10), dtype=dtype('float32'), mean=0.019546926),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=dtype('int32'), mean=2.7999999999999998))
+    expected_final_state = AttentionWrapperState(
+        cell_state=LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.0041728448),
+            h=ResultSummary(
+                shape=(5, 9), dtype=dtype('float32'), mean=-0.002085865)),
+        attention=ResultSummary(
+            shape=(5, 10), dtype=dtype('float32'), mean=0.019546915),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=dtype('float32'), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_layer_size=None,
+        name='testNotUseAttentionLayer')
 
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index 276801ba7c8d4158c558dfac961b17d99e131f73..8fc4ecfc82a14b9b4218a8818485ccbdc5274555 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -43,7 +43,7 @@ class BasicDecoderTest(test.TestCase):
     cell_depth = 10
     output_layer_depth = 3
 
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       inputs = np.random.randn(batch_size, max_time,
                                input_depth).astype(np.float32)
       cell = core_rnn_cell.LSTMCell(cell_depth)
@@ -124,10 +124,10 @@ class BasicDecoderTest(test.TestCase):
     vocabulary_size = 7
     cell_depth = vocabulary_size  # cell's logits must match vocabulary size
     input_depth = 10
-    start_tokens = [0] * batch_size
+    start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
     end_token = 1
 
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       embeddings = np.random.randn(vocabulary_size,
                                    input_depth).astype(np.float32)
       cell = core_rnn_cell.LSTMCell(vocabulary_size)
@@ -196,7 +196,7 @@ class BasicDecoderTest(test.TestCase):
     input_depth = 7
     vocabulary_size = 10
 
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       inputs = np.random.randn(
           batch_size, max_time, input_depth).astype(np.float32)
       embeddings = np.random.randn(
@@ -290,7 +290,7 @@ class BasicDecoderTest(test.TestCase):
     else:
       auxiliary_inputs = None
 
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       inputs = np.random.randn(batch_size, max_time,
                                input_depth).astype(np.float32)
       cell = core_rnn_cell.LSTMCell(cell_depth)
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb0cb4f8c351612c24bfc6c52c6f532f4c157d61
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -0,0 +1,321 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.seq2seq.beam_search_decoder."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import numpy as np
+
+from tensorflow.contrib.rnn import core_rnn_cell
+from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
+from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
+from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
+from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as layers_core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+# pylint: enable=g-import-not-at-top
+
+
+class TestGatherTree(test.TestCase):
+  """Tests the gather_tree function."""
+
+  def test_gather_tree(self):
+    # (max_time = 3, batch_size = 2, beam_width = 3)
+
+    # create (batch_size, max_time, beam_width) matrix and transpose it
+    predicted_ids = np.array(
+        [[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+         [[2, 3, 4], [5, 6, 7], [8, 9, 10]]],
+        dtype=np.int32).transpose([1, 0, 2])
+    parent_ids = np.array(
+        [[[0, 0, 0], [0, 1, 1], [2, 1, 2]],
+         [[0, 0, 0], [1, 2, 0], [2, 1, 1]]],
+        dtype=np.int32).transpose([1, 0, 2])
+
+    # sequence_lengths is shaped (batch_size = 2, beam_width = 3)
+    sequence_lengths = [[3, 3, 3], [3, 3, 3]]
+
+    expected_result = np.array(
+        [[[2, 2, 2], [6, 5, 6], [7, 8, 9]],
+         [[2, 4, 4], [7, 6, 6], [8, 9, 10]]]).transpose([1, 0, 2])
+
+    res = beam_search_ops.gather_tree(
+        predicted_ids, parent_ids, sequence_lengths)
+
+    with self.test_session() as sess:
+      res_ = sess.run(res)
+
+    self.assertAllEqual(expected_result, res_)
+
+
+class TestEosMasking(test.TestCase):
+  """Tests EOS masking used in beam search."""
+
+  def test_eos_masking(self):
+    probs = constant_op.constant([
+        [[-.2, -.2, -.2, -.2, -.2], [-.3, -.3, -.3, 3, 0], [5, 6, 0, 0, 0]],
+        [[-.2, -.2, -.2, -.2, 0], [-.3, -.3, -.1, 3, 0], [5, 6, 3, 0, 0]],
+    ])
+
+    eos_token = 0
+    previously_finished = constant_op.constant(
+        [[0, 1, 0], [0, 1, 1]], dtype=dtypes.float32)
+    masked = beam_search_decoder._mask_probs(probs, eos_token,
+                                             previously_finished)
+
+    with self.test_session() as sess:
+      probs = sess.run(probs)
+      masked = sess.run(masked)
+
+      self.assertAllEqual(probs[0][0], masked[0][0])
+      self.assertAllEqual(probs[0][2], masked[0][2])
+      self.assertAllEqual(probs[1][0], masked[1][0])
+
+      self.assertEqual(masked[0][1][0], 0)
+      self.assertEqual(masked[1][1][0], 0)
+      self.assertEqual(masked[1][2][0], 0)
+
+      for i in range(1, 5):
+        self.assertAllClose(masked[0][1][i], np.finfo('float32').min)
+        self.assertAllClose(masked[1][1][i], np.finfo('float32').min)
+        self.assertAllClose(masked[1][2][i], np.finfo('float32').min)
+
+
+class TestBeamStep(test.TestCase):
+  """Tests a single step of beam search."""
+
+  def setUp(self):
+    super(TestBeamStep, self).setUp()
+    self.batch_size = 2
+    self.beam_width = 3
+    self.vocab_size = 5
+    self.end_token = 0
+    self.length_penalty_weight = 0.6
+
+  def test_step(self):
+    dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
+    beam_state = beam_search_decoder.BeamSearchDecoderState(
+        cell_state=dummy_cell_state,
+        log_probs=nn_ops.log_softmax(
+            array_ops.ones([self.batch_size, self.beam_width])),
+        lengths=constant_op.constant(
+            2, shape=[self.batch_size, self.beam_width], dtype=dtypes.int32),
+        finished=array_ops.zeros(
+            [self.batch_size, self.beam_width], dtype=dtypes.bool))
+
+    logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
+                      0.0001)
+    logits_[0, 0, 2] = 1.9
+    logits_[0, 0, 3] = 2.1
+    logits_[0, 1, 3] = 3.1
+    logits_[0, 1, 4] = 0.9
+    logits_[1, 0, 1] = 0.5
+    logits_[1, 1, 2] = 2.7
+    logits_[1, 2, 2] = 10.0
+    logits_[1, 2, 3] = 0.2
+    logits = ops.convert_to_tensor(logits_, dtype=dtypes.float32)
+    log_probs = nn_ops.log_softmax(logits)
+
+    outputs, next_beam_state = beam_search_decoder._beam_search_step(
+        time=2,
+        logits=logits,
+        beam_state=beam_state,
+        batch_size=ops.convert_to_tensor(self.batch_size),
+        beam_width=self.beam_width,
+        end_token=self.end_token,
+        length_penalty_weight=self.length_penalty_weight)
+
+    with self.test_session() as sess:
+      outputs_, next_state_, state_, log_probs_ = sess.run(
+          [outputs, next_beam_state, beam_state, log_probs])
+
+    self.assertAllEqual(outputs_.predicted_ids, [[3, 3, 2], [2, 2, 1]])
+    self.assertAllEqual(outputs_.parent_ids, [[1, 0, 0], [2, 1, 0]])
+    self.assertAllEqual(next_state_.lengths, [[3, 3, 3], [3, 3, 3]])
+    self.assertAllEqual(next_state_.finished, [[False, False, False],
+                                               [False, False, False]])
+
+    expected_log_probs = []
+    expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
+    expected_log_probs.append(state_.log_probs[1][[2, 1, 0]])  # 0 --> 1
+    expected_log_probs[0][0] += log_probs_[0, 1, 3]
+    expected_log_probs[0][1] += log_probs_[0, 0, 3]
+    expected_log_probs[0][2] += log_probs_[0, 0, 2]
+    expected_log_probs[1][0] += log_probs_[1, 2, 2]
+    expected_log_probs[1][1] += log_probs_[1, 1, 2]
+    expected_log_probs[1][2] += log_probs_[1, 0, 1]
+    self.assertAllEqual(next_state_.log_probs, expected_log_probs)
+
+  def test_step_with_eos(self):
+    dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
+    beam_state = beam_search_decoder.BeamSearchDecoderState(
+        cell_state=dummy_cell_state,
+        log_probs=nn_ops.log_softmax(
+            array_ops.ones([self.batch_size, self.beam_width])),
+        lengths=ops.convert_to_tensor(
+            [[2, 1, 2], [2, 2, 1]], dtype=dtypes.int32),
+        finished=ops.convert_to_tensor(
+            [[False, True, False], [False, False, True]], dtype=dtypes.bool))
+
+    logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
+                      0.0001)
+    logits_[0, 0, 2] = 1.9
+    logits_[0, 0, 3] = 2.1
+    logits_[0, 1, 3] = 3.1
+    logits_[0, 1, 4] = 0.9
+    logits_[1, 0, 1] = 0.5
+    logits_[1, 1, 2] = 5.7  # why does this not work when it's 2.7?
+    logits_[1, 2, 2] = 1.0
+    logits_[1, 2, 3] = 0.2
+    logits = ops.convert_to_tensor(logits_, dtype=dtypes.float32)
+    log_probs = nn_ops.log_softmax(logits)
+
+    outputs, next_beam_state = beam_search_decoder._beam_search_step(
+        time=2,
+        logits=logits,
+        beam_state=beam_state,
+        batch_size=ops.convert_to_tensor(self.batch_size),
+        beam_width=self.beam_width,
+        end_token=self.end_token,
+        length_penalty_weight=self.length_penalty_weight)
+
+    with self.test_session() as sess:
+      outputs_, next_state_, state_, log_probs_ = sess.run(
+          [outputs, next_beam_state, beam_state, log_probs])
+
+    self.assertAllEqual(outputs_.parent_ids, [[1, 0, 0], [1, 2, 0]])
+    self.assertAllEqual(outputs_.predicted_ids, [[0, 3, 2], [2, 0, 1]])
+    self.assertAllEqual(next_state_.lengths, [[1, 3, 3], [3, 1, 3]])
+    self.assertAllEqual(next_state_.finished, [[True, False, False],
+                                               [False, True, False]])
+
+    expected_log_probs = []
+    expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
+    expected_log_probs.append(state_.log_probs[1][[1, 2, 0]])
+    expected_log_probs[0][1] += log_probs_[0, 0, 3]
+    expected_log_probs[0][2] += log_probs_[0, 0, 2]
+    expected_log_probs[1][0] += log_probs_[1, 1, 2]
+    expected_log_probs[1][2] += log_probs_[1, 0, 1]
+    self.assertAllEqual(next_state_.log_probs, expected_log_probs)
+
+
+class BeamSearchDecoderTest(test.TestCase):
+
+  def _testDynamicDecodeRNN(self, time_major, has_attention):
+    encoder_sequence_length = [3, 2, 3, 1, 0]
+    decoder_sequence_length = [2, 0, 1, 2, 3]
+    batch_size = 5
+    decoder_max_time = 4
+    input_depth = 7
+    cell_depth = 9
+    attention_depth = 6
+    vocab_size = 20
+    end_token = vocab_size - 1
+    start_token = 0
+    embedding_dim = 50
+    max_out = max(decoder_sequence_length)
+    output_layer = layers_core.Dense(vocab_size, use_bias=True, activation=None)
+    beam_width = 3
+
+    with self.test_session() as sess:
+      embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
+      cell = core_rnn_cell.LSTMCell(cell_depth)
+      if has_attention:
+        inputs = np.random.randn(batch_size, decoder_max_time,
+                                 input_depth).astype(np.float32)
+        tiled_inputs = beam_search_decoder.tile_batch(
+            inputs, multiplier=beam_width)
+        tiled_sequence_length = beam_search_decoder.tile_batch(
+            encoder_sequence_length, multiplier=beam_width)
+        attention_mechanism = attention_wrapper.BahdanauAttention(
+            num_units=attention_depth,
+            memory=tiled_inputs,
+            memory_sequence_length=tiled_sequence_length)
+        cell = attention_wrapper.AttentionWrapper(
+            cell=cell,
+            attention_mechanism=attention_mechanism,
+            attention_layer_size=attention_depth,
+            alignment_history=False)
+      cell_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size * beam_width)
+      bsd = beam_search_decoder.BeamSearchDecoder(
+          cell=cell,
+          embedding=embedding,
+          start_tokens=batch_size * [start_token],
+          end_token=end_token,
+          initial_state=cell_state,
+          beam_width=beam_width,
+          output_layer=output_layer,
+          length_penalty_weight=0.0)
+
+      final_outputs, final_state, final_sequence_lengths = (
+          decoder.dynamic_decode(
+              bsd, output_time_major=time_major, maximum_iterations=max_out))
+
+      def _t(shape):
+        if time_major:
+          return (shape[1], shape[0]) + shape[2:]
+        return shape
+
+      self.assertTrue(
+          isinstance(final_outputs,
+                     beam_search_decoder.FinalBeamSearchDecoderOutput))
+      self.assertTrue(
+          isinstance(final_state, beam_search_decoder.BeamSearchDecoderState))
+
+      beam_search_decoder_output = final_outputs.beam_search_decoder_output
+      self.assertEqual(
+          _t((batch_size, None, beam_width)),
+          tuple(beam_search_decoder_output.scores.get_shape().as_list()))
+      self.assertEqual(
+          _t((batch_size, None, beam_width)),
+          tuple(final_outputs.predicted_ids.get_shape().as_list()))
+
+      sess.run(variables.global_variables_initializer())
+      sess_results = sess.run({
+          'final_outputs': final_outputs,
+          'final_state': final_state,
+          'final_sequence_lengths': final_sequence_lengths
+      })
+
+      max_sequence_length = np.max(sess_results['final_sequence_lengths'])
+
+      # A smoke test
+      self.assertEqual(
+          _t((batch_size, max_sequence_length, beam_width)),
+          sess_results['final_outputs'].beam_search_decoder_output.scores.shape)
+      self.assertEqual(
+          _t((batch_size, max_sequence_length, beam_width)), sess_results[
+              'final_outputs'].beam_search_decoder_output.predicted_ids.shape)
+
+  def testDynamicDecodeRNNBatchMajorNoAttention(self):
+    self._testDynamicDecodeRNN(time_major=False, has_attention=False)
+
+  def testDynamicDecodeRNNBatchMajorYesAttention(self):
+    self._testDynamicDecodeRNN(time_major=False, has_attention=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..491d87f62d807c5ef5306b2e55eec7ebae889be6
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@@ -0,0 +1,151 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.seq2seq.beam_search_ops."""
+# pylint: disable=unused-import,g-bad-import-order
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# pylint: enable=unused-import
+
+import numpy as np
+
+from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+def _transpose_batch_time(x):
+  return np.transpose(x, [1, 0, 2]).astype(np.int32)
+
+
+class GatherTreeTest(test.TestCase):
+
+  def testGatherTreeOne(self):
+    # (max_time = 4, batch_size = 1, beams = 3)
+    step_ids = _transpose_batch_time(
+        [[[1, 2, 3], [4, 5, 6], [7, 8, 9], [-1, -1, -1]]])
+    parent_ids = _transpose_batch_time(
+        [[[0, 0, 0], [0, 1, 1], [2, 1, 2], [-1, -1, -1]]])
+    sequence_length = [[3, 3, 3]]
+    expected_result = _transpose_batch_time(
+        [[[2, 2, 2], [6, 5, 6], [7, 8, 9], [-1, -1, -1]]])
+    beams = beam_search_ops.gather_tree(
+        step_ids=step_ids, parent_ids=parent_ids,
+        sequence_length=sequence_length)
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual(expected_result, beams.eval())
+
+  def testBadParentValuesOnCPU(self):
+    # (batch_size = 1, max_time = 4, beams = 3)
+    # bad parent in beam 1 time 1
+    step_ids = _transpose_batch_time(
+        [[[1, 2, 3], [4, 5, 6], [7, 8, 9], [-1, -1, -1]]])
+    parent_ids = _transpose_batch_time(
+        [[[0, 0, 0], [0, -1, 1], [2, 1, 2], [-1, -1, -1]]])
+    sequence_length = [[3, 3, 3]]
+    with ops.device("/cpu:0"):
+      beams = beam_search_ops.gather_tree(
+          step_ids=step_ids, parent_ids=parent_ids,
+          sequence_length=sequence_length)
+    with self.test_session():
+      with self.assertRaisesOpError(
+          r"parent id -1 at \(batch, time, beam\) == \(0, 0, 1\)"):
+        _ = beams.eval()
+
+  def testBadParentValuesOnGPU(self):
+    if not test.is_gpu_available():
+      return
+    # (max_time = 4, batch_size = 1, beams = 3)
+    # bad parent in beam 1 time 1; appears as a negative index at time 0
+    step_ids = _transpose_batch_time(
+        [[[1, 2, 3], [4, 5, 6], [7, 8, 9], [-1, -1, -1]]])
+    parent_ids = _transpose_batch_time(
+        [[[0, 0, 0], [0, -1, 1], [2, 1, 2], [-1, -1, -1]]])
+    sequence_length = [[3, 3, 3]]
+    expected_result = _transpose_batch_time(
+        [[[2, -1, 2], [6, 5, 6], [7, 8, 9], [-1, -1, -1]]])
+    with ops.device("/gpu:0"):
+      beams = beam_search_ops.gather_tree(
+          step_ids=step_ids, parent_ids=parent_ids,
+          sequence_length=sequence_length)
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual(expected_result, beams.eval())
+
+  def testGatherTreeBatch(self):
+    # sequence_length is [batch_size, beam_width] = [4, 5]
+    sequence_length = [[0] * 5, [1] * 5, [2] * 5, [3] * 5]
+
+    with self.test_session(use_gpu=True):
+      # (max_time = 4, batch_size = 4, beam_width = 5)
+      step_ids = _transpose_batch_time(
+          [[[3, 4, 0, 4, 0],
+            [4, 2, 0, 3, 1],
+            [1, 1, 3, 2, 2],
+            [3, 1, 2, 3, 4]],
+           [[3, 4, 0, 4, 0],
+            [4, 2, 0, 3, 1],
+            [1, 1, 3, 2, 2],
+            [3, 1, 2, 3, 4]],
+           [[1, 2, 3, 4, 2],
+            [2, 1, 1, 3, 2],
+            [3, 0, 1, 0, 0],
+            [3, 4, 0, 2, 4]],
+           [[0, 2, 2, 3, 1],
+            [3, 2, 2, 2, 3],
+            [3, 4, 3, 0, 3],
+            [1, 2, 2, 2, 4]]])
+      parent_ids = _transpose_batch_time(
+          [[[4, 2, 4, 3, 4],
+            [3, 4, 0, 2, 0],
+            [3, 1, 3, 2, 2],
+            [0, 2, 1, 4, 2]],
+           [[4, 2, 4, 3, 4],
+            [3, 4, 0, 2, 0],
+            [3, 1, 3, 2, 2],
+            [0, 2, 1, 4, 2]],
+           [[3, 0, 0, 4, 0],
+            [1, 2, 4, 2, 2],
+            [4, 4, 0, 3, 0],
+            [2, 4, 4, 3, 0]],
+           [[3, 1, 4, 1, 3],
+            [3, 2, 4, 0, 4],
+            [1, 0, 1, 4, 2],
+            [0, 3, 2, 0, 1]]])
+      expected_beams = _transpose_batch_time(
+          [[[-1, -1, -1, -1, -1],
+            [-1, -1, -1, -1, -1],
+            [-1, -1, -1, -1, -1],
+            [-1, -1, -1, -1, -1]],
+           [[3, 4, 0, 4, 0],
+            [-1, -1, -1, -1, -1],
+            [-1, -1, -1, -1, -1],
+            [-1, -1, -1, -1, -1]],
+           [[2, 3, 2, 3, 3],
+            [2, 1, 1, 3, 2],
+            [-1, -1, -1, -1, -1],
+            [-1, -1, -1, -1, -1]],
+           [[2, 3, 2, 1, 1],
+            [2, 3, 2, 3, 2],
+            [3, 4, 3, 0, 3],
+            [-1, -1, -1, -1, -1]]])
+
+      beams = beam_search_ops.gather_tree(
+          step_ids=step_ids, parent_ids=parent_ids,
+          sequence_length=sequence_length)
+      self.assertAllEqual(expected_beams, beams.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
index 00854ed8b74414169fb213844567bba74b10849e..96dc7b4beee45f20bb7bae22baf2189c49814cc2 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
@@ -44,7 +44,7 @@ class DynamicDecodeRNNTest(test.TestCase):
     cell_depth = 10
     max_out = max(sequence_length)
 
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       if time_major:
         inputs = np.random.randn(max_time, batch_size,
                                  input_depth).astype(np.float32)
@@ -60,9 +60,9 @@ class DynamicDecodeRNNTest(test.TestCase):
           initial_state=cell.zero_state(
               dtype=dtypes.float32, batch_size=batch_size))
 
-      final_outputs, final_state = decoder.dynamic_decode(
-          my_decoder, output_time_major=time_major,
-          maximum_iterations=maximum_iterations)
+      final_outputs, final_state, final_sequence_length = (
+          decoder.dynamic_decode(my_decoder, output_time_major=time_major,
+                                 maximum_iterations=maximum_iterations))
 
       def _t(shape):
         if time_major:
@@ -73,6 +73,9 @@ class DynamicDecodeRNNTest(test.TestCase):
           isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
       self.assertTrue(isinstance(final_state, core_rnn_cell.LSTMStateTuple))
 
+      self.assertEqual(
+          (batch_size,),
+          tuple(final_sequence_length.get_shape().as_list()))
       self.assertEqual(
           _t((batch_size, None, cell_depth)),
           tuple(final_outputs.rnn_output.get_shape().as_list()))
@@ -83,7 +86,8 @@ class DynamicDecodeRNNTest(test.TestCase):
       sess.run(variables.global_variables_initializer())
       sess_results = sess.run({
           "final_outputs": final_outputs,
-          "final_state": final_state
+          "final_state": final_state,
+          "final_sequence_length": final_sequence_length,
       })
 
       # Mostly a smoke test
@@ -118,7 +122,7 @@ class DynamicDecodeRNNTest(test.TestCase):
     cell_depth = 10
     max_out = max(sequence_length)
 
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       inputs = np.random.randn(batch_size, max_time,
                                input_depth).astype(np.float32)
 
@@ -131,7 +135,7 @@ class DynamicDecodeRNNTest(test.TestCase):
       # Match the variable scope of dynamic_rnn below so we end up
       # using the same variables
       with vs.variable_scope("root") as scope:
-        final_decoder_outputs, final_decoder_state = decoder.dynamic_decode(
+        final_decoder_outputs, final_decoder_state, _ = decoder.dynamic_decode(
             my_decoder,
             # impute_finished=True ensures outputs and final state
             # match those of dynamic_rnn called with sequence_length not None
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
index 8c6d85d1061495f0c578573289f944c4e890bf71..35c601a4bcf795ab951218851a3699b3288a69b1 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
@@ -34,7 +34,7 @@ from tensorflow.python.platform import test
 class LossTest(test.TestCase):
 
   def testSequenceLoss(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         batch_size = 2
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index ac79dbecefcc340b3eb126e520ec12e502d325cf..33ee19605d256b5a35019b9fe4205e322d10e232 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A powerful dynamic attention wrapper object.
-"""
+"""A powerful dynamic attention wrapper object."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -29,6 +28,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base as layers_base
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -39,6 +39,7 @@ from tensorflow.python.util import nest
 
 
 __all__ = [
+    "AttentionMechanism",
     "AttentionWrapper",
     "AttentionWrapperState",
     "LuongAttention",
@@ -73,6 +74,9 @@ def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
   """
   memory = nest.map_structure(
       lambda m: ops.convert_to_tensor(m, name="memory"), memory)
+  if memory_sequence_length is not None:
+    memory_sequence_length = ops.convert_to_tensor(
+        memory_sequence_length, name="memory_sequence_length")
   if check_inner_dims_defined:
     def _check_dims(m):
       if not m.get_shape()[2:].is_fully_defined():
@@ -86,15 +90,24 @@ def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
         memory_sequence_length,
         maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
         dtype=nest.flatten(memory)[0].dtype)
+    seq_len_batch_size = (
+        memory_sequence_length.shape[0].value
+        or array_ops.shape(memory_sequence_length)[0])
   def _maybe_mask(m, seq_len_mask):
     rank = m.get_shape().ndims
     rank = rank if rank is not None else array_ops.rank(m)
     extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
+    m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
     if memory_sequence_length is not None:
-      seq_len_mask = array_ops.reshape(
-          seq_len_mask,
-          array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
-      return m * seq_len_mask
+      message = ("memory_sequence_length and memory tensor batch sizes do not "
+                 "match.")
+      with ops.control_dependencies([
+          check_ops.assert_equal(
+              seq_len_batch_size, m_batch_size, message=message)]):
+        seq_len_mask = array_ops.reshape(
+            seq_len_mask,
+            array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
+        return m * seq_len_mask
     else:
       return m
   return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
@@ -108,8 +121,14 @@ class _BaseAttentionMechanism(AttentionMechanism):
     2. Preprocessing and storing the memory.
   """
 
-  def __init__(self, query_layer, memory, memory_sequence_length=None,
-               memory_layer=None, check_inner_dims_defined=True, name=None):
+  def __init__(self,
+               query_layer,
+               memory,
+               probability_fn,
+               memory_sequence_length=None,
+               memory_layer=None,
+               check_inner_dims_defined=True,
+               name=None):
     """Construct base AttentionMechanism class.
 
     Args:
@@ -118,6 +137,9 @@ class _BaseAttentionMechanism(AttentionMechanism):
         provided, the shape of `query` must match that of `memory_layer`.
       memory: The memory to query; usually the output of an RNN encoder.  This
         tensor should be shaped `[batch_size, max_time, ...]`.
+      probability_fn: A `callable`.  Converts the score and previous alignments
+        to probabilities. Its signature should be:
+        `probabilities = probability_fn(score, previous_alignments)`.
       memory_sequence_length (optional): Sequence lengths for the batch entries
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
@@ -131,15 +153,19 @@ class _BaseAttentionMechanism(AttentionMechanism):
       name: Name to use when creating ops.
     """
     if (query_layer is not None
-        and not isinstance(query_layer, layers_base._Layer)):  # pylint: disable=protected-access
+        and not isinstance(query_layer, layers_base.Layer)):
       raise TypeError(
           "query_layer is not a Layer: %s" % type(query_layer).__name__)
     if (memory_layer is not None
-        and not isinstance(memory_layer, layers_base._Layer)):  # pylint: disable=protected-access
+        and not isinstance(memory_layer, layers_base.Layer)):
       raise TypeError(
           "memory_layer is not a Layer: %s" % type(memory_layer).__name__)
     self._query_layer = query_layer
     self._memory_layer = memory_layer
+    if not callable(probability_fn):
+      raise TypeError("probability_fn must be callable, saw type: %s" %
+                      type(probability_fn).__name__)
+    self._probability_fn = probability_fn
     with ops.name_scope(
         name, "BaseAttentionMechanismInit", nest.flatten(memory)):
       self._values = _prepare_memory(
@@ -148,6 +174,10 @@ class _BaseAttentionMechanism(AttentionMechanism):
       self._keys = (
           self.memory_layer(self._values) if self.memory_layer  # pylint: disable=not-callable
           else self._values)
+      self._batch_size = (
+          self._keys.shape[0].value or array_ops.shape(self._keys)[0])
+      self._alignments_size = (self._keys.shape[1].value or
+                               array_ops.shape(self._keys)[1])
 
   @property
   def memory_layer(self):
@@ -165,6 +195,33 @@ class _BaseAttentionMechanism(AttentionMechanism):
   def keys(self):
     return self._keys
 
+  @property
+  def batch_size(self):
+    return self._batch_size
+
+  @property
+  def alignments_size(self):
+    return self._alignments_size
+
+  def initial_alignments(self, batch_size, dtype):
+    """Creates the initial alignment values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return a tensor of all zeros.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A `dtype` tensor shaped `[batch_size, alignments_size]`
+      (`alignments_size` is the values' `max_time`).
+    """
+    max_time = self._alignments_size
+    return _zero_state_tensors(max_time, batch_size, dtype)
+
 
 class LuongAttention(_BaseAttentionMechanism):
   """Implements Luong-style (multiplicative) attention scoring.
@@ -188,6 +245,7 @@ class LuongAttention(_BaseAttentionMechanism):
                memory,
                memory_sequence_length=None,
                scale=False,
+               probability_fn=None,
                name="LuongAttention"):
     """Construct the AttentionMechanism mechanism.
 
@@ -199,31 +257,43 @@ class LuongAttention(_BaseAttentionMechanism):
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
       scale: Python boolean.  Whether to scale the energy term.
+      probability_fn: (optional) A `callable`.  Converts the score to
+        probabilities.  The default is @{tf.nn.softmax}. Other options include
+        @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
+        Its signature should be: `probabilities = probability_fn(score)`.
       name: Name to use when creating ops.
     """
     # For LuongAttention, we only transform the memory layer; thus
     # num_units **must** match expected the query depth.
+    if probability_fn is None:
+      probability_fn = nn_ops.softmax
+    wrapped_probability_fn = lambda score, _: probability_fn(score)
     super(LuongAttention, self).__init__(
         query_layer=None,
         memory_layer=layers_core.Dense(
             num_units, name="memory_layer", use_bias=False),
         memory=memory,
+        probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
         name=name)
     self._num_units = num_units
     self._scale = scale
     self._name = name
 
-  def __call__(self, query):
+  def __call__(self, query, previous_alignments):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
+      previous_alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
 
     Returns:
-      score: Tensor of dtype matching `self.values` and shape
-        `[batch_size, max_time]` (`max_time` is memory's `max_time`).
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
 
     Raises:
       ValueError: If `key` and `query` depths do not match.
@@ -261,7 +331,8 @@ class LuongAttention(_BaseAttentionMechanism):
             "attention_g", dtype=dtype, initializer=1.)
         score = g * score
 
-    return score
+    alignments = self._probability_fn(score, previous_alignments)
+    return alignments
 
 
 class BahdanauAttention(_BaseAttentionMechanism):
@@ -291,6 +362,7 @@ class BahdanauAttention(_BaseAttentionMechanism):
                memory,
                memory_sequence_length=None,
                normalize=False,
+               probability_fn=None,
                name="BahdanauAttention"):
     """Construct the Attention mechanism.
 
@@ -302,36 +374,49 @@ class BahdanauAttention(_BaseAttentionMechanism):
         in memory.  If provided, the memory tensor rows are masked with zeros
         for values past the respective sequence lengths.
       normalize: Python boolean.  Whether to normalize the energy term.
+      probability_fn: (optional) A `callable`.  Converts the score to
+        probabilities.  The default is @{tf.nn.softmax}. Other options include
+        @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
+        Its signature should be: `probabilities = probability_fn(score)`.
       name: Name to use when creating ops.
     """
+    if probability_fn is None:
+      probability_fn = nn_ops.softmax
+    wrapped_probability_fn = lambda score, _: probability_fn(score)
     super(BahdanauAttention, self).__init__(
         query_layer=layers_core.Dense(
             num_units, name="query_layer", use_bias=False),
         memory_layer=layers_core.Dense(
             num_units, name="memory_layer", use_bias=False),
         memory=memory,
+        probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
         name=name)
     self._num_units = num_units
     self._normalize = normalize
     self._name = name
 
-  def __call__(self, query):
+  def __call__(self, query, previous_alignments):
     """Score the query based on the keys and values.
 
     Args:
       query: Tensor of dtype matching `self.values` and shape
         `[batch_size, query_depth]`.
+      previous_alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
 
     Returns:
-      score: Tensor of dtype matching `self.values` and shape
-        `[batch_size, max_time]` (`max_time` is memory's `max_time`).
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
     """
     with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
       dtype = processed_query.dtype
       # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
       processed_query = array_ops.expand_dims(processed_query, 1)
+      keys = self._keys
       v = variable_scope.get_variable(
           "attention_v", [self._num_units], dtype=dtype)
       if self._normalize:
@@ -347,29 +432,51 @@ class BahdanauAttention(_BaseAttentionMechanism):
         normed_v = g * v * math_ops.rsqrt(
             math_ops.reduce_sum(math_ops.square(v)))
         score = math_ops.reduce_sum(
-            normed_v * math_ops.tanh(self.keys + processed_query + b), [2])
+            normed_v * math_ops.tanh(keys + processed_query + b), [2])
       else:
-        score = math_ops.reduce_sum(
-            v * math_ops.tanh(self.keys + processed_query), [2])
+        score = math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query),
+                                    [2])
 
-    return score
+    alignments = self._probability_fn(score, previous_alignments)
+    return alignments
 
 
 class AttentionWrapperState(
-    collections.namedtuple(
-        "AttentionWrapperState", (
-            "cell_state", "attention", "time", "attention_history"))):
+    collections.namedtuple("AttentionWrapperState",
+                           ("cell_state", "attention", "time", "alignments",
+                            "alignment_history"))):
   """`namedtuple` storing the state of a `AttentionWrapper`.
 
   Contains:
 
-    - `cell_state`: The state of the wrapped `RNNCell`.
+    - `cell_state`: The state of the wrapped `RNNCell` at the previous time
+      step.
     - `attention`: The attention emitted at the previous time step.
     - `time`: int32 scalar containing the current time step.
-    - `attention_history`: (if enabled) a `TensorArray` containing attention
+    - `alignments`: The alignment emitted at the previous time step.
+    - `alignment_history`: (if enabled) a `TensorArray` containing alignment
        matrices from all time steps.  Call `stack()` to convert to a `Tensor`.
   """
-  pass
+
+  def clone(self, **kwargs):
+    """Clone this object, overriding components provided by kwargs.
+
+    Example:
+
+    ```python
+    initial_state = attention_wrapper.zero_state(dtype=..., batch_size=...)
+    initial_state = initial_state.clone(cell_state=encoder_state)
+    ```
+
+    Args:
+      **kwargs: Any properties of the state object to replace in the returned
+        `AttentionWrapperState`.
+
+    Returns:
+      A new `AttentionWrapperState` whose properties are the same as
+      this one, except any overridden properties as provided in `kwargs`.
+    """
+    return super(AttentionWrapperState, self)._replace(**kwargs)
 
 
 def hardmax(logits, name=None):
@@ -400,27 +507,26 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
   def __init__(self,
                cell,
                attention_mechanism,
-               attention_size,
-               attention_history=False,
+               attention_layer_size=None,
+               alignment_history=False,
                cell_input_fn=None,
-               probability_fn=None,
                output_attention=True,
+               initial_cell_state=None,
                name=None):
     """Construct the `AttentionWrapper`.
 
     Args:
       cell: An instance of `RNNCell`.
       attention_mechanism: An instance of `AttentionMechanism`.
-      attention_size: Python integer, the depth of the attention (output)
-        tensor.
-      attention_history: Python boolean, whether to store attention history
+      attention_layer_size: Python integer, the depth of the attention (output)
+        layer. If None (default), use the context as attention at each time
+        step. Otherwise, feed the context and cell output into the attention
+        layer to generate attention at each time step.
+      alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
       cell_input_fn: (optional) A `callable`.  The default is:
         `lambda inputs, attention: array_ops.concat([inputs, attention], -1)`.
-      probability_fn: (optional) A `callable`.  Converts the score to
-        probabilities.  The default is @{tf.nn.softmax}. Other options include
-        @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}.
       output_attention: Python bool.  If `True` (default), the output at each
         time step is the attention value.  This is the behavior of Luong-style
         attention mechanisms.  If `False`, the output at each time step is
@@ -429,9 +535,15 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
         propagated to the next time step via the state and is used there.
         This flag only controls whether the attention mechanism is propagated
         up to the next cell in an RNN stack or to the top RNN output.
+      initial_cell_state: The initial state value to use for the cell when
+        the user calls `zero_state()`.  Note that if this value is provided
+        now, and the user uses a `batch_size` argument of `zero_state` which
+        does not match the batch size of `initial_cell_state`, proper
+        behavior is not guaranteed.
       name: Name to use when creating ops.
     """
-    if not isinstance(cell, core_rnn_cell.RNNCell):
+    super(AttentionWrapper, self).__init__(name=name)
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError(
           "cell must be an RNNCell, saw type: %s" % type(cell).__name__)
     if not isinstance(attention_mechanism, AttentionMechanism):
@@ -446,22 +558,42 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
         raise TypeError(
             "cell_input_fn must be callable, saw type: %s"
             % type(cell_input_fn).__name__)
-    if probability_fn is None:
-      probability_fn = nn_ops.softmax
+
+    if attention_layer_size is not None:
+      self._attention_layer = layers_core.Dense(
+          attention_layer_size, name="attention_layer", use_bias=False)
+      self._attention_size = attention_layer_size
     else:
-      if not callable(cell_input_fn):
-        raise TypeError(
-            "probability_fn must be callable, saw type: %s"
-            % type(probability_fn).__name__)
+      self._attention_layer = None
+      self._attention_size = attention_mechanism.values.get_shape()[-1].value
+
     self._cell = cell
     self._attention_mechanism = attention_mechanism
-    self._attention_size = attention_size
-    self._attention_layer = layers_core.Dense(
-        attention_size, name="attention_layer", use_bias=False)
     self._cell_input_fn = cell_input_fn
-    self._probability_fn = probability_fn
     self._output_attention = output_attention
-    self._attention_history = attention_history
+    self._alignment_history = alignment_history
+    with ops.name_scope(name, "AttentionWrapperInit"):
+      if initial_cell_state is None:
+        self._initial_cell_state = None
+      else:
+        final_state_tensor = nest.flatten(initial_cell_state)[-1]
+        state_batch_size = (
+            final_state_tensor.shape[0].value
+            or array_ops.shape(final_state_tensor)[0])
+        error_message = (
+            "When constructing AttentionWrapper %s: " % self._base_name +
+            "Non-matching batch sizes between the memory "
+            "(encoder output) and initial_cell_state.  Are you using "
+            "the BeamSearchDecoder?  You may need to tile your initial state "
+            "via the tf.contrib.seq2seq.tile_batch function with argument "
+            "multiple=beam_width.")
+        with ops.control_dependencies(
+            [check_ops.assert_equal(state_batch_size,
+                                    self._attention_mechanism.batch_size,
+                                    message=error_message)]):
+          self._initial_cell_state = nest.map_structure(
+              lambda s: array_ops.identity(s, name="check_initial_cell_state"),
+              initial_cell_state)
 
   @property
   def output_size(self):
@@ -476,23 +608,45 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
         cell_state=self._cell.state_size,
         time=tensor_shape.TensorShape([]),
         attention=self._attention_size,
-        attention_history=())  # attention_history is sometimes a TensorArray
+        alignments=self._attention_mechanism.alignments_size,
+        alignment_history=())  # alignment_history is sometimes a TensorArray
 
   def zero_state(self, batch_size, dtype):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
-      if self._attention_history:
-        attention_history = tensor_array_ops.TensorArray(
+      if self._initial_cell_state is not None:
+        cell_state = self._initial_cell_state
+      else:
+        cell_state = self._cell.zero_state(batch_size, dtype)
+      error_message = (
+          "When calling zero_state of AttentionWrapper %s: " % self._base_name +
+          "Non-matching batch sizes between the memory "
+          "(encoder output) and the requested batch size.  Are you using "
+          "the BeamSearchDecoder?  If so, make sure your encoder output has "
+          "been tiled to beam_width via tf.contrib.seq2seq.tile_batch, and "
+          "the batch_size= argument passed to zero_state is "
+          "batch_size * beam_width.")
+      with ops.control_dependencies(
+          [check_ops.assert_equal(batch_size,
+                                  self._attention_mechanism.batch_size,
+                                  message=error_message)]):
+        cell_state = nest.map_structure(
+            lambda s: array_ops.identity(s, name="checked_cell_state"),
+            cell_state)
+      if self._alignment_history:
+        alignment_history = tensor_array_ops.TensorArray(
             dtype=dtype, size=0, dynamic_size=True)
       else:
-        attention_history = ()
+        alignment_history = ()
       return AttentionWrapperState(
-          cell_state=self._cell.zero_state(batch_size, dtype),
+          cell_state=cell_state,
           time=array_ops.zeros([], dtype=dtypes.int32),
-          attention=_zero_state_tensors(
-              self._attention_size, batch_size, dtype),
-          attention_history=attention_history)
+          attention=_zero_state_tensors(self._attention_size, batch_size,
+                                        dtype),
+          alignments=self._attention_mechanism.initial_alignments(
+              batch_size, dtype),
+          alignment_history=alignment_history)
 
-  def __call__(self, inputs, state, scope=None):
+  def call(self, inputs, state):
     """Perform a step of attention-wrapped RNN.
 
     - Step 1: Mix the `inputs` and previous step's `attention` output via
@@ -511,7 +665,6 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
       inputs: (Possibly nested tuple of) Tensor, the input at this time step.
       state: An instance of `AttentionWrapperState` containing
         tensors from the previous time step.
-      scope: Must be `None`.
 
     Returns:
       A tuple `(attention_or_cell_output, next_state)`, where:
@@ -519,51 +672,65 @@ class AttentionWrapper(core_rnn_cell.RNNCell):
       - `attention_or_cell_output` depending on `output_attention`.
       - `next_state` is an instance of `DynamicAttentionWrapperState`
          containing the state calculated at this time step.
-
-    Raises:
-      NotImplementedError: if `scope` is not `None`.
     """
-    if scope is not None:
-      raise NotImplementedError("scope not None is not supported")
-
-    with variable_scope.variable_scope("attention"):
-      # Step 1: Calculate the true inputs to the cell based on the
-      # previous attention value.
-      cell_inputs = self._cell_input_fn(inputs, state.attention)
-      cell_state = state.cell_state
-
-      cell_output, next_cell_state = self._cell(cell_inputs, cell_state)
-
-      score = self._attention_mechanism(cell_output)
-      alignments = self._probability_fn(score)
-
-      # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
-      alignments = array_ops.expand_dims(alignments, 1)
-      # Context is the inner product of alignments and values along the
-      # memory time dimension.
-      # alignments shape is
-      #   [batch_size, 1, memory_time]
-      # attention_mechanism.values shape is
-      #   [batch_size, memory_time, attention_mechanism.num_units]
-      # the batched matmul is over memory_time, so the output shape is
-      #   [batch_size, 1, attention_mechanism.num_units].
-      # we then squeeze out the singleton dim.
-      context = math_ops.matmul(alignments, self._attention_mechanism.values)
-      context = array_ops.squeeze(context, [1])
-
+    # Step 1: Calculate the true inputs to the cell based on the
+    # previous attention value.
+    cell_inputs = self._cell_input_fn(inputs, state.attention)
+    cell_state = state.cell_state
+    cell_output, next_cell_state = self._cell(cell_inputs, cell_state)
+
+    cell_batch_size = (
+        cell_output.shape[0].value or array_ops.shape(cell_output)[0])
+    error_message = (
+        "When applying AttentionWrapper %s: " % self.name +
+        "Non-matching batch sizes between the memory "
+        "(encoder output) and the query (decoder output).  Are you using "
+        "the BeamSearchDecoder?  You may need to tile your memory input via "
+        "the tf.contrib.seq2seq.tile_batch function with argument "
+        "multiple=beam_width.")
+    with ops.control_dependencies(
+        [check_ops.assert_equal(cell_batch_size,
+                                self._attention_mechanism.batch_size,
+                                message=error_message)]):
+      cell_output = array_ops.identity(
+          cell_output, name="checked_cell_output")
+
+    alignments = self._attention_mechanism(
+        cell_output, previous_alignments=state.alignments)
+
+    # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
+    expanded_alignments = array_ops.expand_dims(alignments, 1)
+    # Context is the inner product of alignments and values along the
+    # memory time dimension.
+    # alignments shape is
+    #   [batch_size, 1, memory_time]
+    # attention_mechanism.values shape is
+    #   [batch_size, memory_time, attention_mechanism.num_units]
+    # the batched matmul is over memory_time, so the output shape is
+    #   [batch_size, 1, attention_mechanism.num_units].
+    # we then squeeze out the singleton dim.
+    attention_mechanism_values = self._attention_mechanism.values
+    context = math_ops.matmul(expanded_alignments, attention_mechanism_values)
+    context = array_ops.squeeze(context, [1])
+
+    if self._attention_layer is not None:
       attention = self._attention_layer(
           array_ops.concat([cell_output, context], 1))
+    else:
+      attention = context
 
-      if self._attention_history:
-        attention_history = state.attention_history.write(state.time, attention)
-      else:
-        attention_history = ()
-
-      next_state = AttentionWrapperState(
-          time=state.time + 1,
-          cell_state=next_cell_state,
-          attention=attention,
-          attention_history=attention_history)
+    if self._alignment_history:
+      alignment_history = state.alignment_history.write(
+          state.time, alignments)
+    else:
+      alignment_history = ()
+
+    next_state = AttentionWrapperState(
+        time=state.time + 1,
+        cell_state=next_cell_state,
+        attention=attention,
+        alignments=alignments,
+        alignment_history=alignment_history)
 
     if self._output_attention:
       return attention, next_state
diff --git a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
index d19e2b0d5e469b484a16e9290a1cb09684c16638..8ae175b6b59a88b0516326967dd03c419957545e 100644
--- a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
@@ -21,13 +21,13 @@ from __future__ import print_function
 
 import collections
 
-from tensorflow.contrib.rnn import core_rnn_cell
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base as layers_base
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.util import nest
 
 
@@ -52,21 +52,20 @@ class BasicDecoder(decoder.Decoder):
       cell: An `RNNCell` instance.
       helper: A `Helper` instance.
       initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
+        The initial state of the RNNCell.
       output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
         `tf.layers.Dense`.  Optional layer to apply to the RNN output prior
         to storing the result or sampling.
 
     Raises:
-      TypeError: if `cell` is not an instance of `RNNCell`, `helper`
-        is not an instance of `Helper`, or `output_layer` is not an instance
-        of `tf.layers.Layer`.
+      TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
     """
-    if not isinstance(cell, core_rnn_cell.RNNCell):
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
       raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
     if not isinstance(helper, helper_py.Helper):
       raise TypeError("helper must be a Helper, received: %s" % type(helper))
     if (output_layer is not None
-        and not isinstance(output_layer, layers_base._Layer)):  # pylint: disable=protected-access
+        and not isinstance(output_layer, layers_base.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e8455d89e97b4a01f4c190f9889800c05d8738a
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -0,0 +1,648 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A decoder that performs beam search."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
+from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.layers import base as layers_base
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.util import nest
+
+
+__all__ = [
+    "BeamSearchDecoderOutput",
+    "BeamSearchDecoderState",
+    "BeamSearchDecoder",
+    "FinalBeamSearchDecoderOutput",
+    "tile_batch",
+]
+
+
+class BeamSearchDecoderState(
+    collections.namedtuple("BeamSearchDecoderState", ("cell_state", "log_probs",
+                                                      "finished", "lengths"))):
+  pass
+
+
+class BeamSearchDecoderOutput(
+    collections.namedtuple("BeamSearchDecoderOutput",
+                           ("scores", "predicted_ids", "parent_ids"))):
+  pass
+
+
+class FinalBeamSearchDecoderOutput(
+    collections.namedtuple("FinalBeamDecoderOutput",
+                           ["predicted_ids", "beam_search_decoder_output"])):
+  """Final outputs returned by the beam search after all decoding is finished.
+
+  Args:
+    predicted_ids: The final prediction. A tensor of shape
+      `[T, batch_size, beam_width]`.
+    beam_search_output: An instance of `BeamSearchDecoderOutput` that describes
+      the state of the beam search.
+  """
+  pass
+
+
+def tile_batch(t, multiplier, name=None):
+  """Tile the batch dimension of tensor t.
+
+  This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of
+  minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape
+  `[batch_size * multiplier, s0, s1, ...]` composed of minibatch entries
+  `t[0], t[0], ..., t[1], t[1], ...` where each minibatch entry is repeated
+  `multiplier` times.
+
+  Args:
+    t: `Tensor` shaped `[batch_size, ...]`.
+    multiplier: Python int.
+    name: Name scope for any created operations.
+
+  Returns:
+    A `Tensor` shaped `[batch_size * multiplier, ...]`.
+
+  Raises:
+    ValueError: if `t` does not have a statically known rank or it's < 1.
+  """
+  with ops.name_scope(name, "tile_batch", [t, multiplier]):
+    t = ops.convert_to_tensor(t, name="t")
+    shape_t = array_ops.shape(t)
+    if t.shape.ndims is None or t.shape.ndims < 1:
+      raise ValueError("t must have statically known rank")
+    tiling = [1] * (t.shape.ndims + 1)
+    tiling[1] = multiplier
+    tiled_static_batch_size = (
+        t.shape[0].value * multiplier if t.shape[0].value is not None else None)
+    tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling)
+    tiled = array_ops.reshape(
+        tiled, array_ops.concat(([shape_t[0] * multiplier], shape_t[1:]), 0))
+    tiled.set_shape(
+        tensor_shape.TensorShape(
+            [tiled_static_batch_size]).concatenate(t.shape[1:]))
+    return tiled
+
+
+class BeamSearchDecoder(decoder.Decoder):
+  """BeamSearch sampling decoder."""
+
+  def __init__(self,
+               cell,
+               embedding,
+               start_tokens,
+               end_token,
+               initial_state,
+               beam_width,
+               output_layer=None,
+               length_penalty_weight=0.0):
+    """Initialize BeamSearchDecoder.
+
+    Args:
+      cell: An `RNNCell` instance.
+      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
+        or the `params` argument for `embedding_lookup`.
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
+      beam_width:  Python integer, the number of beams.
+      output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
+        `tf.layers.Dense`.  Optional layer to apply to the RNN output prior
+        to storing the result or sampling.
+      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+
+    Raises:
+      TypeError: if `cell` is not an instance of `RNNCell`,
+        or `output_layer` is not an instance of `tf.layers.Layer`.
+      ValueError: If `start_tokens` is not a vector or
+        `end_token` is not a scalar.
+    """
+    if not rnn_cell_impl._like_rnncell(cell):  # pylint: disable=protected-access
+      raise TypeError("cell must be an RNNCell, received: %s" % type(cell))
+    if (output_layer is not None
+        and not isinstance(output_layer, layers_base.Layer)):
+      raise TypeError(
+          "output_layer must be a Layer, received: %s" % type(output_layer))
+    self._cell = cell
+    self._output_layer = output_layer
+
+    if callable(embedding):
+      self._embedding_fn = embedding
+    else:
+      self._embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self._start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    if self._start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self._end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+
+    self._batch_size = array_ops.size(start_tokens)
+    self._beam_width = beam_width
+    self._length_penalty_weight = length_penalty_weight
+    self._initial_cell_state = nest.map_structure(
+        self._maybe_split_batch_beams,
+        initial_state, self._cell.state_size)
+    self._start_tokens = array_ops.tile(
+        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
+    self._start_inputs = self._embedding_fn(self._start_tokens)
+    self._finished = array_ops.zeros(
+        [self._batch_size, self._beam_width], dtype=dtypes.bool)
+
+  @property
+  def batch_size(self):
+    return self._batch_size
+
+  def _rnn_output_size(self):
+    size = self._cell.output_size
+    if self._output_layer is None:
+      return size
+    else:
+      # To use layer's compute_output_shape, we need to convert the
+      # RNNCell's output_size entries into shapes with an unknown
+      # batch size.  We then pass this through the layer's
+      # compute_output_shape and read off all but the first (batch)
+      # dimensions to get the output size of the rnn with the layer
+      # applied to the top.
+      output_shape_with_unknown_batch = nest.map_structure(
+          lambda s: tensor_shape.TensorShape([None]).concatenate(s),
+          size)
+      layer_output_shape = self._output_layer._compute_output_shape(  # pylint: disable=protected-access
+          output_shape_with_unknown_batch)
+      return nest.map_structure(lambda s: s[1:], layer_output_shape)
+
+  @property
+  def output_size(self):
+    # Return the cell output and the id
+    return BeamSearchDecoderOutput(
+        scores=tensor_shape.TensorShape([self._beam_width]),
+        predicted_ids=tensor_shape.TensorShape([self._beam_width]),
+        parent_ids=tensor_shape.TensorShape([self._beam_width]))
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and int32 (the id)
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    return BeamSearchDecoderOutput(
+        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        predicted_ids=dtypes.int32,
+        parent_ids=dtypes.int32)
+
+  def initialize(self, name=None):
+    """Initialize the decoder.
+
+    Args:
+      name: Name scope for any created operations.
+
+    Returns:
+      `(finished, start_inputs, initial_state)`.
+    """
+    finished, start_inputs = self._finished, self._start_inputs
+
+    initial_state = BeamSearchDecoderState(
+        cell_state=self._initial_cell_state,
+        log_probs=array_ops.zeros(
+            [self._batch_size, self._beam_width],
+            dtype=nest.flatten(self._initial_cell_state)[0].dtype),
+        finished=finished,
+        lengths=array_ops.zeros(
+            [self._batch_size, self._beam_width], dtype=dtypes.int32))
+
+    return (finished, start_inputs, initial_state)
+
+  def finalize(self, outputs, final_state, sequence_lengths):
+    """Finalize and return the predicted_ids.
+
+    Args:
+      outputs: An instance of BeamSearchDecoderOutput.
+      final_state: An instance of BeamSearchDecoderState. Passed through to the
+        output.
+      sequence_lengths: An `int32` tensor shaped `[batch_size, beam_width]`.
+        The sequence lengths determined for each beam during decode.
+
+    Returns:
+      outputs: An instance of FinalBeamSearchDecoderOutput where the
+        predicted_ids are the result of calling _gather_tree.
+      final_state: The same input instance of BeamSearchDecoderState.
+    """
+    predicted_ids = beam_search_ops.gather_tree(
+        outputs.predicted_ids, outputs.parent_ids,
+        sequence_length=sequence_lengths)
+    outputs = FinalBeamSearchDecoderOutput(
+        beam_search_decoder_output=outputs, predicted_ids=predicted_ids)
+    return outputs, final_state
+
+  def _merge_batch_beams(self, t, s=None):
+    """Merges the tensor from a batch of beams into a batch by beams.
+
+    More exactly, t is a tensor of dimension [batch_size, beam_width, s]. We
+    reshape this into [batch_size*beam_width, s]
+
+    Args:
+      t: Tensor of dimension [batch_size, beam_width, s]
+      s: (Possibly known) depth shape.
+
+    Returns:
+      A reshaped version of t with dimension [batch_size * beam_width, s].
+    """
+    if isinstance(s, ops.Tensor):
+      s = tensor_util.constant_value_as_shape(s)
+    else:
+      s = tensor_shape.TensorShape(s)
+    t_shape = array_ops.shape(t)
+    static_batch_size = tensor_util.constant_value(self._batch_size)
+    batch_size_beam_width = (
+        None if static_batch_size is None
+        else static_batch_size * self._beam_width)
+    reshaped_t = array_ops.reshape(
+        t, array_ops.concat(
+            ([self._batch_size * self._beam_width], t_shape[2:]), 0))
+    reshaped_t.set_shape(
+        (tensor_shape.TensorShape([batch_size_beam_width]).concatenate(s)))
+    return reshaped_t
+
+  def _split_batch_beams(self, t, s=None):
+    """Splits the tensor from a batch by beams into a batch of beams.
+
+    More exactly, t is a tensor of dimension [batch_size*beam_width, s]. We
+    reshape this into [batch_size, beam_width, s]
+
+    Args:
+      t: Tensor of dimension [batch_size*beam_width, s].
+      s: (Possibly known) depth shape.
+
+    Returns:
+      A reshaped version of t with dimension [batch_size, beam_width, s].
+
+    Raises:
+      ValueError: If, after reshaping, the new tensor is not shaped
+        `[batch_size, beam_width, s]` (assuming batch_size and beam_width
+        are known statically).
+    """
+    if isinstance(s, ops.Tensor):
+      s = tensor_util.constant_value_as_shape(s)
+    else:
+      s = tensor_shape.TensorShape(s)
+    t_shape = array_ops.shape(t)
+    reshaped_t = array_ops.reshape(
+        t, array_ops.concat(
+            ([self._batch_size, self._beam_width], t_shape[1:]), 0))
+    static_batch_size = tensor_util.constant_value(self._batch_size)
+    expected_reshaped_shape = tensor_shape.TensorShape(
+        [static_batch_size, self._beam_width]).concatenate(s)
+    if not reshaped_t.shape.is_compatible_with(expected_reshaped_shape):
+      raise ValueError("Unexpected behavior when reshaping between beam width "
+                       "and batch size.  The reshaped tensor has shape: %s.  "
+                       "We expected it to have shape "
+                       "(batch_size, beam_width, depth) == %s.  Perhaps you "
+                       "forgot to create a zero_state with "
+                       "batch_size=encoder_batch_size * beam_width?"
+                       % (reshaped_t.shape, expected_reshaped_shape))
+    reshaped_t.set_shape(expected_reshaped_shape)
+    return reshaped_t
+
+  def _maybe_split_batch_beams(self, t, s):
+    """Maybe splits the tensor from a batch by beams into a batch of beams.
+
+    We do this so that we can use nest and not run into problems with shapes.
+
+    Args:
+      t: Tensor of dimension [batch_size*beam_width, s]
+      s: Tensor, Python int, or TensorShape.
+
+    Returns:
+      Either a reshaped version of t with dimension
+      [batch_size, beam_width, s] if t's first dimension is of size
+      batch_size*beam_width or t if not.
+
+    Raises:
+      TypeError: If t is an instance of TensorArray.
+      ValueError: If the rank of t is not statically known.
+    """
+    if isinstance(t, tensor_array_ops.TensorArray):
+      raise TypeError(
+          "TensorArray state is not supported by BeamSearchDecoder: %s"
+          % t.name)
+    if t.shape.ndims is None:
+      raise ValueError(
+          "Expected tensor (%s) to have known rank, but ndims == None." % t)
+    if t.shape.ndims >= 1:
+      return self._split_batch_beams(t, s)
+    else:
+      return t
+
+  def _maybe_merge_batch_beams(self, t, s):
+    """Splits the tensor from a batch by beams into a batch of beams.
+
+    More exactly, t is a tensor of dimension [batch_size*beam_width, s]. We
+    reshape this into [batch_size, beam_width, s]
+
+    Args:
+      t: Tensor of dimension [batch_size*beam_width, s]
+      s: Tensor, Python int, or TensorShape.
+
+    Returns:
+      A reshaped version of t with dimension [batch_size, beam_width, s].
+
+    Raises:
+      TypeError: If t is an instance of TensorArray.
+      ValueError:  If the rank of t is not statically known.
+    """
+    if isinstance(t, tensor_array_ops.TensorArray):
+      raise TypeError(
+          "TensorArray state is not supported by BeamSearchDecoder: %s"
+          % t.name)
+    if t.shape.ndims is None:
+      raise ValueError(
+          "Expected tensor (%s) to have known rank, but ndims == None." % t)
+    if t.shape.ndims >= 2:
+      return self._merge_batch_beams(t, s)
+    else:
+      return t
+
+  def step(self, time, inputs, state, name=None):
+    """Perform a decoding step.
+
+    Args:
+      time: scalar `int32` tensor.
+      inputs: A (structure of) input tensors.
+      state: A (structure of) state tensors and TensorArrays.
+      name: Name scope for any created operations.
+
+    Returns:
+      `(outputs, next_state, next_inputs, finished)`.
+    """
+    batch_size = self._batch_size
+    beam_width = self._beam_width
+    end_token = self._end_token
+    length_penalty_weight = self._length_penalty_weight
+
+    with ops.name_scope(name, "BeamSearchDecoderStep", (time, inputs, state)):
+      cell_state = state.cell_state
+      inputs = nest.map_structure(
+          lambda inp: self._merge_batch_beams(inp, s=inp.shape[2:]), inputs)
+      cell_state = nest.map_structure(
+          self._maybe_merge_batch_beams,
+          cell_state, self._cell.state_size)
+      cell_outputs, next_cell_state = self._cell(inputs, cell_state)
+
+      cell_outputs = nest.map_structure(
+          lambda out: self._split_batch_beams(out, out.shape[1:]), cell_outputs)
+      next_cell_state = nest.map_structure(
+          self._maybe_split_batch_beams,
+          next_cell_state, self._cell.state_size)
+
+      if self._output_layer is not None:
+        cell_outputs = self._output_layer(cell_outputs)
+
+      beam_search_output, beam_search_state = _beam_search_step(
+          time=time,
+          logits=cell_outputs,
+          beam_state=state,
+          batch_size=batch_size,
+          beam_width=beam_width,
+          end_token=end_token,
+          length_penalty_weight=length_penalty_weight)
+      finished = beam_search_state.finished
+      sample_ids = beam_search_output.predicted_ids
+      next_inputs = control_flow_ops.cond(
+          math_ops.reduce_all(finished), lambda: self._start_inputs,
+          lambda: self._embedding_fn(sample_ids))
+
+    return (beam_search_output, beam_search_state, next_inputs, finished)
+
+
+def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
+                      end_token, length_penalty_weight):
+  """Performs a single step of Beam Search Decoding.
+
+  Args:
+    time: Beam search time step, should start at 0. At time 0 we assume
+      that all beams are equal and consider only the first beam for
+      continuations.
+    logits: Logits at the current time step. A tensor of shape
+      `[batch_size, beam_width, vocab_size]`
+    beam_state: Current state of the beam search.
+      An instance of `BeamSearchDecoderState`.
+    batch_size: The batch size for this input.
+    beam_width: Python int.  The size of the beams.
+    end_token: The int32 end token.
+    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+
+  Returns:
+    A new beam state.
+  """
+  static_batch_size = tensor_util.constant_value(batch_size)
+
+  # Calculate the current lengths of the predictions
+  prediction_lengths = beam_state.lengths
+  previously_finished = beam_state.finished
+
+  # Calculate the total log probs for the new hypotheses
+  # Final Shape: [batch_size, beam_width, vocab_size]
+  step_log_probs = nn_ops.log_softmax(logits)
+  step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
+  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs
+
+  # Calculate the continuation lengths by adding to all continuing beams.
+  vocab_size = logits.shape[-1].value
+  lengths_to_add = array_ops.one_hot(
+      indices=array_ops.tile(
+          array_ops.reshape(end_token, [1, 1]), [batch_size, beam_width]),
+      depth=vocab_size,
+      on_value=0,
+      off_value=1)
+  add_mask = (1 - math_ops.to_int32(previously_finished))
+  lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
+  new_prediction_lengths = (
+      lengths_to_add + array_ops.expand_dims(prediction_lengths, 2))
+
+  # Calculate the scores for each beam
+  scores = _get_scores(
+      log_probs=total_probs,
+      sequence_lengths=new_prediction_lengths,
+      length_penalty_weight=length_penalty_weight)
+
+  time = ops.convert_to_tensor(time, name="time")
+  # During the first time step we only consider the initial beam
+  scores_shape = array_ops.shape(scores)
+  scores_flat = control_flow_ops.cond(
+      time > 0,
+      lambda: array_ops.reshape(scores, [batch_size, -1]),
+      lambda: scores[:, 0])
+  num_available_beam = control_flow_ops.cond(
+      time > 0, lambda: math_ops.reduce_prod(scores_shape[1:]),
+      lambda: math_ops.reduce_prod(scores_shape[2:]))
+
+  # Pick the next beams according to the specified successors function
+  next_beam_size = math_ops.minimum(
+      ops.convert_to_tensor(beam_width, dtype=dtypes.int32, name="beam_width"),
+      num_available_beam)
+  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
+  next_beam_scores.set_shape([static_batch_size, beam_width])
+  word_indices.set_shape([static_batch_size, beam_width])
+
+  # Pick out the probs, beam_ids, and states according to the chosen predictions
+  next_beam_probs = _tensor_gather_helper(
+      gather_indices=word_indices,
+      gather_from=total_probs,
+      range_input=batch_size,
+      range_size=beam_width * vocab_size,
+      final_shape=[static_batch_size, beam_width])
+
+  next_word_ids = math_ops.to_int32(word_indices % vocab_size)
+  next_beam_ids = math_ops.to_int32(word_indices / vocab_size)
+
+  # Append new ids to current predictions
+  previously_finished = _tensor_gather_helper(
+      gather_indices=next_beam_ids,
+      gather_from=previously_finished,
+      range_input=batch_size,
+      range_size=beam_width,
+      final_shape=[static_batch_size, beam_width])
+  next_finished = math_ops.logical_or(previously_finished,
+                                      math_ops.equal(next_word_ids, end_token))
+
+  # Calculate the length of the next predictions.
+  # 1. Finished beams remain unchanged
+  # 2. Beams that are now finished (EOS predicted) remain unchanged
+  # 3. Beams that are not yet finished have their length increased by 1
+  lengths_to_add = math_ops.to_int32(
+      math_ops.not_equal(next_word_ids, end_token))
+  lengths_to_add = (1 - math_ops.to_int32(next_finished)) * lengths_to_add
+  next_prediction_len = _tensor_gather_helper(
+      gather_indices=next_beam_ids,
+      gather_from=beam_state.lengths,
+      range_input=batch_size,
+      range_size=beam_width,
+      final_shape=[static_batch_size, beam_width])
+  next_prediction_len += lengths_to_add
+
+  next_state = BeamSearchDecoderState(
+      cell_state=beam_state.cell_state,
+      log_probs=next_beam_probs,
+      lengths=next_prediction_len,
+      finished=next_finished)
+
+  output = BeamSearchDecoderOutput(
+      scores=next_beam_scores,
+      predicted_ids=next_word_ids,
+      parent_ids=next_beam_ids)
+
+  return output, next_state
+
+
+def _get_scores(log_probs, sequence_lengths, length_penalty_weight):
+  """Calculates scores for beam search hypotheses.
+
+  Args:
+    log_probs: The log probabilities with shape
+      `[batch_size, beam_width, vocab_size]`.
+    sequence_lengths: The array of sequence lengths.
+    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+
+  Returns:
+    The scores normalized by the length_penalty.
+  """
+  length_penality_ = _length_penalty(
+      sequence_lengths=sequence_lengths, penalty_factor=length_penalty_weight)
+  return log_probs / length_penality_
+
+
+def _length_penalty(sequence_lengths, penalty_factor):
+  """Calculates the length penalty. See https://arxiv.org/abs/1609.08144.
+
+  Args:
+    sequence_lengths: The sequence length of all hypotheses, a tensor
+      of shape [beam_size, vocab_size].
+    penalty_factor: A scalar that weights the length penalty.
+
+  Returns:
+    The length penalty factor, a tensor fo shape [beam_size].
+  """
+  penalty_factor = ops.convert_to_tensor(penalty_factor, name="penalty_factor")
+  penalty_factor.set_shape(())  # penalty should be a scalar.
+  static_penalty = tensor_util.constant_value(penalty_factor)
+  if static_penalty is not None and static_penalty == 0:
+    return 1.0
+  return math_ops.div((5. + math_ops.to_float(sequence_lengths))
+                      **penalty_factor, (5. + 1.)**penalty_factor)
+
+
+def _mask_probs(probs, eos_token, finished):
+  """Masks log probabilities.
+
+  The result is that finished beams allocate all probability mass to eos and
+  unfinished beams remain unchanged.
+
+  Args:
+    probs: Log probabiltiies of shape `[batch_size, beam_width, vocab_size]`
+    eos_token: An int32 id corresponding to the EOS token to allocate
+      probability to.
+    finished: A boolean tensor of shape `[batch_size, beam_width]` that
+      specifies which
+      elements in the beam are finished already.
+
+  Returns:
+    A tensor of shape `[batch_size, beam_width, vocab_size]`, where unfinished
+    beams stay unchanged and finished beams are replaced with a tensor with all
+    probability on the EOS token.
+  """
+  vocab_size = array_ops.shape(probs)[2]
+  finished_mask = array_ops.expand_dims(
+      math_ops.to_float(1. - math_ops.to_float(finished)), 2)
+  # These examples are not finished and we leave them
+  non_finished_examples = finished_mask * probs
+  # All finished examples are replaced with a vector that has all
+  # probability on EOS
+  finished_row = array_ops.one_hot(
+      eos_token,
+      vocab_size,
+      dtype=probs.dtype,
+      on_value=0.,
+      off_value=probs.dtype.min)
+  finished_examples = (1. - finished_mask) * finished_row
+  return finished_examples + non_finished_examples
+
+
+def _tensor_gather_helper(gather_indices, gather_from, range_input, range_size,
+                          final_shape):
+  range_ = array_ops.expand_dims(math_ops.range(range_input) * range_size, 1)
+  gather_indices = array_ops.reshape(gather_indices + range_, [-1])
+  output = array_ops.gather(
+      array_ops.reshape(gather_from, [-1]), gather_indices)
+  output = array_ops.reshape(output, final_shape)
+  output.set_shape(final_shape)
+  return output
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_ops.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d9fcc0c90ab8457ca73dfa09b3239a0fcc096f3
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_ops.py
@@ -0,0 +1,27 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Beam Search helper ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.seq2seq.ops import gen_beam_search_ops
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_beam_search_ops_so = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_beam_search_ops.so"))
+
+gather_tree = gen_beam_search_ops.gather_tree
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index 1d2674af30613d385ea79c6700f343b1a428a05f..4795dfb8c91bf83dc8642a9cb760043e75143a5d 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -12,15 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Seq2seq layer operations for use in neural networks.
-"""
+"""Seq2seq layer operations for use in neural networks."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import abc
-
 import six
 
 from tensorflow.python.framework import constant_op
@@ -31,50 +29,36 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 
-__all__ = ["Decoder", "dynamic_decode"]
-
 
-def _transpose_batch_time(x):
-  """Transpose the batch and time dimensions of a Tensor.
+__all__ = ["Decoder", "dynamic_decode"]
 
-  Retains as much of the static shape information as possible.
 
-  Args:
-    x: A tensor of rank 2 or higher.
-
-  Returns:
-    x transposed along the first two dimensions.
-
-  Raises:
-    ValueError: if `x` is rank 1 or lower.
-  """
-  x_static_shape = x.get_shape()
-  if x_static_shape.ndims is not None and x_static_shape.ndims < 2:
-    raise ValueError(
-        "Expected input tensor %s to have rank at least 2, but saw shape: %s" %
-        (x, x_static_shape))
-  x_rank = array_ops.rank(x)
-  x_t = array_ops.transpose(
-      x, array_ops.concat(
-          ([1, 0], math_ops.range(2, x_rank)), axis=0))
-  x_t.set_shape(
-      tensor_shape.TensorShape([
-          x_static_shape[1].value, x_static_shape[0].value
-      ]).concatenate(x_static_shape[2:]))
-  return x_t
+_transpose_batch_time = rnn._transpose_batch_time  # pylint: disable=protected-access
 
 
 @six.add_metaclass(abc.ABCMeta)
 class Decoder(object):
-  """An RNN Decoder abstract interface object."""
+  """An RNN Decoder abstract interface object.
+
+  Concepts used by this interface:
+  - `inputs`: (structure of) tensors and TensorArrays that is passed as input to
+    the RNNCell composing the decoder, at each time step.
+  - `state`: (structure of) tensors and TensorArrays that is passed to the
+    RNNCell instance as the state.
+  - `finished`: boolean tensor telling whether each sequence in the batch is
+    finished.
+  - `outputs`: Instance of BasicDecoderOutput. Result of the decoding, at each
+    time step.
+  """
 
   @property
   def batch_size(self):
-    """The batch size of the inputs returned by `sample`."""
+    """The batch size of input values."""
     raise NotImplementedError
 
   @property
@@ -91,11 +75,14 @@ class Decoder(object):
   def initialize(self, name=None):
     """Called before any decoding iterations.
 
+    This methods must compute initial input values and initial state.
+
     Args:
       name: Name scope for any created operations.
 
     Returns:
-      `(finished, first_inputs, initial_state)`.
+      `(finished, initial_inputs, initial_state)`: initial values of
+      'finished' flags, inputs and state.
     """
     raise NotImplementedError
 
@@ -104,16 +91,25 @@ class Decoder(object):
     """Called per step of decoding (but only once for dynamic decoding).
 
     Args:
-      time: Scalar `int32` tensor.
-      inputs: Input (possibly nested tuple of) tensor[s] for this time step.
-      state: State (possibly nested tuple of) tensor[s] from previous time step.
+      time: Scalar `int32` tensor. Current step number.
+      inputs: RNNCell input (possibly nested tuple of) tensor[s] for this time
+        step.
+      state: RNNCell state (possibly nested tuple of) tensor[s] from previous
+        time step.
       name: Name scope for any created operations.
 
     Returns:
-      `(outputs, next_state, next_inputs, finished)`.
+      `(outputs, next_state, next_inputs, finished)`: `outputs` is an instance
+      of BasicDecoderOutput, `next_state` is a (structure of) state tensors and
+      TensorArrays, `next_inputs` is the tensor that should be used as input for
+      the next step, `finished` is a boolean tensor telling whether the sequence
+      is complete, for each sequence in the batch.
     """
     raise NotImplementedError
 
+  def finalize(self, outputs, final_state, sequence_lengths):
+    raise NotImplementedError
+
 
 def _create_zero_outputs(size, dtype, batch_size):
   """Create a zero outputs Tensor structure."""
@@ -140,6 +136,8 @@ def dynamic_decode(decoder,
                    scope=None):
   """Perform dynamic decoding with `decoder`.
 
+  Calls initialize() once and step() repeatedly on the Decoder object.
+
   Args:
     decoder: A `Decoder` instance.
     output_time_major: Python boolean.  Default: `False` (batch major).  If
@@ -159,17 +157,17 @@ def dynamic_decode(decoder,
     scope: Optional variable scope to use.
 
   Returns:
-    `(final_outputs, final_state)`.
+    `(final_outputs, final_state, final_sequence_lengths)`.
 
   Raises:
     TypeError: if `decoder` is not an instance of `Decoder`.
-    ValueError: if maximum_iterations is provided but is not a scalar.
+    ValueError: if `maximum_iterations` is provided but is not a scalar.
   """
   if not isinstance(decoder, Decoder):
     raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
                     type(decoder))
 
-  with variable_scope.variable_scope(scope or "decoder") as varscope:
+  with variable_scope.variable_scope(scope, "decoder") as varscope:
     # Properly cache variable values inside the while_loop
     if varscope.caching_device is None:
       varscope.set_caching_device(lambda op: op.device)
@@ -189,6 +187,8 @@ def dynamic_decode(decoder,
     if maximum_iterations is not None:
       initial_finished = math_ops.logical_or(
           initial_finished, 0 >= maximum_iterations)
+    initial_sequence_lengths = array_ops.zeros_like(
+        initial_finished, dtype=dtypes.int32)
     initial_time = constant_op.constant(0, dtype=dtypes.int32)
 
     def _shape(batch_size, from_shape):
@@ -211,10 +211,10 @@ def dynamic_decode(decoder,
                                             decoder.output_dtype)
 
     def condition(unused_time, unused_outputs_ta, unused_state, unused_inputs,
-                  finished):
+                  finished, unused_sequence_lengths):
       return math_ops.logical_not(math_ops.reduce_all(finished))
 
-    def body(time, outputs_ta, state, inputs, finished):
+    def body(time, outputs_ta, state, inputs, finished, sequence_lengths):
       """Internal while_loop body.
 
       Args:
@@ -222,10 +222,13 @@ def dynamic_decode(decoder,
         outputs_ta: structure of TensorArray.
         state: (structure of) state tensors and TensorArrays.
         inputs: (structure of) input tensors.
-        finished: 1-D bool tensor.
+        finished: bool tensor (keeping track of what's finished).
+        sequence_lengths: int32 tensor (keeping track of time of finish).
 
       Returns:
-        `(time + 1, outputs_ta, next_state, next_inputs, next_finished)`.
+        `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
+          next_sequence_lengths)`.
+        ```
       """
       (next_outputs, decoder_state, next_inputs,
        decoder_finished) = decoder.step(time, inputs, state)
@@ -233,6 +236,10 @@ def dynamic_decode(decoder,
       if maximum_iterations is not None:
         next_finished = math_ops.logical_or(
             next_finished, time + 1 >= maximum_iterations)
+      next_sequence_lengths = array_ops.where(
+          math_ops.logical_and(math_ops.logical_not(finished), next_finished),
+          array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
+          sequence_lengths)
 
       nest.assert_same_structure(state, decoder_state)
       nest.assert_same_structure(outputs_ta, next_outputs)
@@ -265,23 +272,32 @@ def dynamic_decode(decoder,
 
       outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out),
                                       outputs_ta, emit)
-      return (time + 1, outputs_ta, next_state, next_inputs, next_finished)
+      return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
+              next_sequence_lengths)
 
     res = control_flow_ops.while_loop(
         condition,
         body,
         loop_vars=[
             initial_time, initial_outputs_ta, initial_state, initial_inputs,
-            initial_finished
+            initial_finished, initial_sequence_lengths,
         ],
         parallel_iterations=parallel_iterations,
         swap_memory=swap_memory)
 
     final_outputs_ta = res[1]
     final_state = res[2]
+    final_sequence_lengths = res[5]
 
     final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)
+
+    try:
+      final_outputs, final_state = decoder.finalize(
+          final_outputs, final_state, final_sequence_lengths)
+    except NotImplementedError:
+      pass
+
     if not output_time_major:
       final_outputs = nest.map_structure(_transpose_batch_time, final_outputs)
 
-  return final_outputs, final_state
+  return final_outputs, final_state, final_sequence_lengths
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index 258e74b819472b9871ac90cfc3c7dd0083eb41d1..bdd7d7ca73e2cecc777ff610a9ff89c97990ebe4 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -23,8 +23,6 @@ import abc
 
 import six
 
-from tensorflow.contrib.distributions.python.ops import bernoulli
-from tensorflow.contrib.distributions.python.ops import categorical
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -35,6 +33,8 @@ from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops.distributions import bernoulli
+from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.util import nest
 
 __all__ = [
@@ -57,11 +57,17 @@ def _unstack_ta(inp):
 
 @six.add_metaclass(abc.ABCMeta)
 class Helper(object):
-  """Helper interface.  Helper instances are used by SamplingDecoder."""
+  """Interface for implementing sampling in seq2seq decoders.
+
+  Helper instances are used by `BasicDecoder`.
+  """
 
   @abc.abstractproperty
   def batch_size(self):
-    """Returns a scalar int32 tensor."""
+    """Batch size of tensor returned by `sample`.
+
+    Returns a scalar int32 tensor.
+    """
     raise NotImplementedError("batch_size has not been implemented")
 
   @abc.abstractmethod
@@ -357,7 +363,7 @@ class ScheduledOutputTrainingHelper(TrainingHelper):
       self._seed = seed
 
       if (next_input_layer is not None and not isinstance(next_input_layer,
-                                                          layers_base._Layer)):  # pylint: disable=protected-access
+                                                          layers_base.Layer)):
         raise TypeError("next_input_layer must be a Layer, received: %s" %
                         type(next_input_layer))
       self._next_input_layer = next_input_layer
@@ -431,7 +437,7 @@ class ScheduledOutputTrainingHelper(TrainingHelper):
                                        shape=base_shape))
 
       all_finished = math_ops.reduce_all(finished)
-      no_samples = math_ops.equal(array_ops.shape(sample_ids)[0], 0)
+      no_samples = math_ops.logical_not(math_ops.reduce_any(sample_ids))
       next_inputs = control_flow_ops.cond(
           math_ops.logical_or(all_finished, no_samples),
           lambda: base_next_inputs, maybe_sample)
@@ -450,12 +456,14 @@ class GreedyEmbeddingHelper(Helper):
 
     Args:
       embedding: A callable that takes a vector tensor of `ids` (argmax ids),
-        or the `params` argument for `embedding_lookup`.
+        or the `params` argument for `embedding_lookup`. The returned tensor
+        will be passed to the decoder input.
       start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
       end_token: `int32` scalar, the token that marks end of decoding.
 
     Raises:
-      ValueError: if `sequence_length` is not a 1D tensor.
+      ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a
+        scalar.
     """
     if callable(embedding):
       self._embedding_fn = embedding
diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py
index 7e67c5f8a40dff2e5085df564e140b22538d4b89..39a6d2f58b140706a94d83273d3327edd1891368 100644
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Seq2seq loss operations for use in sequence models.
 """
 
@@ -28,16 +27,21 @@ from tensorflow.python.ops import nn_ops
 __all__ = ["sequence_loss"]
 
 
-def sequence_loss(logits, targets, weights,
-                  average_across_timesteps=True, average_across_batch=True,
-                  softmax_loss_function=None, name=None):
-  """Weighted cross-entropy loss for a sequence of logits. Depending on the
-  values of `average_across_timesteps` and `average_across_batch`, the return
-  Tensor will have rank 0, 1, or 2 as these arguments reduce the cross-entropy
-  at each target, which has shape `[batch_size, sequence_length]`, over their
-  respective dimensions. For example, if `average_across_timesteps` is `True`
-  and `average_across_batch` is `False`, then the return Tensor will have shape
-  `[batch_size]`.
+def sequence_loss(logits,
+                  targets,
+                  weights,
+                  average_across_timesteps=True,
+                  average_across_batch=True,
+                  softmax_loss_function=None,
+                  name=None):
+  """Weighted cross-entropy loss for a sequence of logits.
+
+  Depending on the values of `average_across_timesteps` and
+  `average_across_batch`, the return Tensor will have rank 0, 1, or 2 as these
+  arguments reduce the cross-entropy at each target, which has shape
+  `[batch_size, sequence_length]`, over their respective dimensions. For
+  example, if `average_across_timesteps` is `True` and `average_across_batch`
+  is `False`, then the return Tensor will have shape `[batch_size]`.
 
   Args:
     logits: A Tensor of shape
diff --git a/tensorflow/contrib/session_bundle/README.md b/tensorflow/contrib/session_bundle/README.md
index 6df63cba807b5a121481aa8f7ee1e391c9b57b7c..5bcc8fab70f8f492f687fa37b022ee324429f530 100644
--- a/tensorflow/contrib/session_bundle/README.md
+++ b/tensorflow/contrib/session_bundle/README.md
@@ -1,5 +1,8 @@
 # TensorFlow Inference Model Format
 
+WARNING: SessionBundle has been deprecated. Please use
+[SavedModel](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md) instead.
+
 [TOC]
 
 ## Overview
diff --git a/tensorflow/contrib/session_bundle/example/export_half_plus_two.py b/tensorflow/contrib/session_bundle/example/export_half_plus_two.py
index 08ca47058c8e563befcd6ef1f924fee242265e44..4a56509e596d1308a4e07a31965e44d03d26aa3d 100644
--- a/tensorflow/contrib/session_bundle/example/export_half_plus_two.py
+++ b/tensorflow/contrib/session_bundle/example/export_half_plus_two.py
@@ -97,7 +97,7 @@ def Export(export_dir, use_checkpoint_v2):
     }
 
     # Create two filename assets and corresponding tensors.
-    # TODO(b/26254158) Consider adding validation of file existance as well as
+    # TODO(b/26254158) Consider adding validation of file existence as well as
     # hashes (e.g. sha1) for consistency.
     original_filename1 = tf.constant("hello1.txt")
     tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, original_filename1)
diff --git a/tensorflow/contrib/session_bundle/gc_test.py b/tensorflow/contrib/session_bundle/gc_test.py
index 1a8ee93cca4a25b2878175beaa99be533826ecf6..8faf3ef3d4cd7ee0096265283070e25d06782254 100644
--- a/tensorflow/contrib/session_bundle/gc_test.py
+++ b/tensorflow/contrib/session_bundle/gc_test.py
@@ -29,10 +29,6 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 
-def tearDownModule():
-  gfile.DeleteRecursively(test.get_temp_dir())
-
-
 class GcTest(test_util.TensorFlowTestCase):
 
   def testLargestExportVersions(self):
diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.cc b/tensorflow/contrib/session_bundle/session_bundle_test.cc
index fc80b9bec796547496bbcceab34a806058352d5a..ad6264d5c8aa159e579092da0443d83438452b21 100644
--- a/tensorflow/contrib/session_bundle/session_bundle_test.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle_test.cc
@@ -275,7 +275,7 @@ class SessionBundleTest : public ::testing::Test {
   }
   // SetupExport that allows for the variables and meta_graph_def filenames
   // to be overridden.
-  string SetupExport(MetaGraphDefTwiddler twiddler,
+  string SetupExport(const MetaGraphDefTwiddler& twiddler,
                      const string& variables_filename,
                      const string& meta_graph_def_filename) {
     // Construct a unique path name based on the test name.
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5b65a6ae05ed98eb0ac5218c804eca37ea4743e6
--- /dev/null
+++ b/tensorflow/contrib/signal/BUILD
@@ -0,0 +1,46 @@
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+
+py_library(
+    name = "signal_py",
+    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+cuda_py_tests(
+    name = "shape_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/shape_ops_test.py"],
+    additional_deps = [
+        ":signal_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/identity.py b/tensorflow/contrib/signal/__init__.py
similarity index 75%
rename from tensorflow/contrib/distributions/python/ops/bijectors/identity.py
rename to tensorflow/contrib/signal/__init__.py
index 749dd268f98afafefd15c0a417c6ae49a62d124d..9f906dd28e8dc9130d87f4cd4a126e033fa66293 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/identity.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -12,18 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Identity bijector."""
+"""##Signal ops.
+
+@@frames
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.identity_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ["Identity"]
+from tensorflow.contrib.signal.python.ops.shape_ops import frames
 
-remove_undocumented(__name__, _allowed_symbols)
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/signal/python/__init__.py b/tensorflow/contrib/signal/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e672d1146c53a813613c9076c0cb6056f7081441
--- /dev/null
+++ b/tensorflow/contrib/signal/python/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e07942875fdf3d0266824cf546a2a9dda94b1877
--- /dev/null
+++ b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
@@ -0,0 +1,68 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for shape_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.signal.python.ops import shape_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class FramesTest(test.TestCase):
+
+  def test_mapping_of_indices_without_padding(self):
+    with self.test_session():
+      tensor = constant_op.constant(np.arange(9152), dtypes.int32)
+      tensor = array_ops.expand_dims(tensor, 0)
+
+      result = shape_ops.frames(tensor, 512, 180)
+      result = result.eval()
+
+      expected = np.tile(np.arange(512), (49, 1))
+      expected += np.tile(np.arange(49) * 180, (512, 1)).T
+
+      expected = np.expand_dims(expected, axis=0)
+      expected = np.array(expected, dtype=np.int32)
+
+      self.assertAllEqual(expected, result)
+
+  def test_mapping_of_indices_with_padding(self):
+    with self.test_session():
+      tensor = constant_op.constant(np.arange(10000), dtypes.int32)
+      tensor = array_ops.expand_dims(tensor, 0)
+
+      result = shape_ops.frames(tensor, 512, 192)
+      result = result.eval()
+
+      expected = np.tile(np.arange(512), (51, 1))
+      expected += np.tile(np.arange(51) * 192, (512, 1)).T
+
+      expected[expected >= 10000] = 0
+
+      expected = np.expand_dims(expected, axis=0)
+      expected = np.array(expected, dtype=np.int32)
+
+      self.assertAllEqual(expected, result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/signal/python/ops/__init__.py b/tensorflow/contrib/signal/python/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e672d1146c53a813613c9076c0cb6056f7081441
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/signal/python/ops/shape_ops.py b/tensorflow/contrib/signal/python/ops/shape_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4914f19be75398d50dc47fad0e8d7ab42e7d44aa
--- /dev/null
+++ b/tensorflow/contrib/signal/python/ops/shape_ops.py
@@ -0,0 +1,87 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""General shape ops for frames."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def frames(signal, frame_length, frame_step, name=None):
+  """Frame a signal into overlapping frames.
+
+  May be used in front of spectral functions.
+
+  For example:
+
+  ```python
+  pcm = tf.placeholder(tf.float32, [None, 9152])
+  frames = tf.contrib.signal.frames(pcm, 512, 180)
+  magspec = tf.abs(tf.spectral.rfft(frames, [512]))
+  image = tf.expand_dims(magspec, 3)
+  ```
+
+  Args:
+    signal: A `Tensor` of shape `[batch_size, signal_length]`.
+    frame_length: An `int32` or `int64` `Tensor`. The length of each frame.
+    frame_step: An `int32` or `int64` `Tensor`. The step between frames.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of frames with shape `[batch_size, num_frames, frame_length]`.
+
+  Raises:
+    ValueError: if signal does not have rank 2.
+  """
+  with ops.name_scope(name, "frames", [signal, frame_length, frame_step]):
+    signal = ops.convert_to_tensor(signal, name="signal")
+    frame_length = ops.convert_to_tensor(frame_length, name="frame_length")
+    frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
+
+    signal_rank = signal.shape.ndims
+
+    if signal_rank != 2:
+      raise ValueError("expected signal to have rank 2 but was " + signal_rank)
+
+    signal_length = array_ops.shape(signal)[1]
+
+    num_frames = math_ops.ceil((signal_length - frame_length) / frame_step)
+    num_frames = 1 + math_ops.cast(num_frames, dtypes.int32)
+
+    pad_length = (num_frames - 1) * frame_step + frame_length
+    pad_signal = array_ops.pad(signal, [[0, 0], [0,
+                                                 pad_length - signal_length]])
+
+    indices_frame = array_ops.expand_dims(math_ops.range(frame_length), 0)
+    indices_frames = array_ops.tile(indices_frame, [num_frames, 1])
+
+    indices_step = array_ops.expand_dims(
+        math_ops.range(num_frames) * frame_step, 1)
+    indices_steps = array_ops.tile(indices_step, [1, frame_length])
+
+    indices = indices_frames + indices_steps
+
+    # TODO(androbin): remove `transpose` when `gather` gets `axis` support
+    pad_signal = array_ops.transpose(pad_signal)
+    signal_frames = array_ops.gather(pad_signal, indices)
+    signal_frames = array_ops.transpose(signal_frames, perm=[2, 0, 1])
+
+    return signal_frames
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 94b0263ae8693fb19a609f9957204d5404ca3849..d37c632be7f1f911e62e35df3b6af3820201ee51 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -109,7 +109,7 @@ weights = slim.variable('weights',
 Note that in native TensorFlow, there are two types of variables: regular
 variables and local (transient) variables. The vast majority of variables are
 regular variables: once created, they can be saved to disk using a
-[saver](https://www.tensorflow.org/versions/r0.11/api_docs/python/state_ops.html#Saver).
+[saver](https://www.tensorflow.org/api_docs/python/tf/train/Saver).
 Local variables are those variables that only exist for the duration of a
 session and are not saved to disk.
 
@@ -289,10 +289,10 @@ slim.stack(x, slim.conv2d, [(32, [3, 3]), (32, [1, 1]), (64, [3, 3]), (64, [1, 1
 ### Scopes
 
 In addition to the types of scope mechanisms in TensorFlow
-([name_scope](https://www.tensorflow.org/api_docs/python/framework.html#name_scope),
-[variable_scope](https://www.tensorflow.org/api_docs/python/state_layers.html#variable_scope),
+([name_scope](https://www.tensorflow.org/api_docs/python/tf/name_scope),
+[variable_scope](https://www.tensorflow.org/api_docs/python/tf/variable_scope),
 TF-Slim adds a new scoping mechanism called
-[arg_scope](https://www.tensorflow.org/code/tensorflow/contrib/framework/python/ops/arg_scope.py).
+[arg_scope](https://www.tensorflow.org/api_docs/python/tf/contrib/framework/arg_scope),
 This new scope allows a user to specify one or more operations and a set of
 arguments which will be passed to each of the operations defined in the
 `arg_scope`. This functionality is best illustrated by example. Consider the
@@ -352,7 +352,7 @@ we can both ensure that each layer uses the same values and simplify the code:
 ```
 
 As the example illustrates, the use of arg_scope makes the code cleaner,
-simpler and easier to maintain. Notice that while argument values are specifed
+simpler and easier to maintain. Notice that while argument values are specified
 in the arg_scope, they can be overwritten locally. In particular, while
 the padding argument has been set to 'SAME', the second convolution overrides
 it with the value of 'VALID'.
@@ -447,7 +447,7 @@ vgg = tf.contrib.slim.nets.vgg
 images, labels = ...
 
 # Create the model.
-predictions = vgg.vgg16(images)
+predictions, _ = vgg.vgg_16(images)
 
 # Define the loss functions and get the total loss.
 loss = slim.losses.softmax_cross_entropy(predictions, labels)
diff --git a/tensorflow/contrib/slim/python/slim/data/README.md b/tensorflow/contrib/slim/python/slim/data/README.md
index 858c69499023311bec37b20b68d5015d25663bef..fe15a10b99dcac384268986d012bafd70b3d360d 100644
--- a/tensorflow/contrib/slim/python/slim/data/README.md
+++ b/tensorflow/contrib/slim/python/slim/data/README.md
@@ -71,27 +71,27 @@ for item in data_decoder.list_items():
   print(item)
 ```
 
-## Example: TFExampleDataDecoder
+## Example: TFExampleDecoder
 
 The
-[tfexample_data_decoder.py](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/tfexample_data_decoder.py)
+[tfexample_decoder.py](https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py)
 is a data decoder which decodes serialized `TFExample` protocol buffers. A
 `TFExample` protocol buffer is a map from keys (strings) to either a
 `tf.FixedLenFeature` or `tf.VarLenFeature`. Consequently, to decode a
 `TFExample`, one must provide a mapping from one or more `TFExample` fields
-to each of the `items` that the `tfexample_data_decoder` can provide. For
+to each of the `items` that the `tfexample_decoder` can provide. For
 example, a dataset of `TFExamples` might store images in various formats and
 each `TFExample` might contain an `encoding` key and a `format` key which can
 be used to decode the image using the appropriate decoder (jpg, png, etc).
 
-To make this possible, the `tfexample_data_decoder` is constructed by specifying
+To make this possible, the `tfexample_decoder` is constructed by specifying
 the a map of `TFExample` keys to either `tf.FixedLenFeature` or
 `tf.VarLenFeature` as well as a set of `ItemHandlers`. An `ItemHandler`
 provides a mapping from `TFExample` keys to the item being provided. Because a
-`tfexample_data_decoder` might return multiple `items`, one often constructs a
-`tfexample_data_decoder` using multiple `ItemHandlers`.
+`tfexample_decoder` might return multiple `items`, one often constructs a
+`tfexample_decoder` using multiple `ItemHandlers`.
 
-`tfexample_data_decoder` provides some predefined `ItemHandlers` which take care
+`tfexample_decoder` provides some predefined `ItemHandlers` which take care
 of the common cases of mapping `TFExamples` to images, `Tensors` and
 `SparseTensors`. For example, the following specification might be
 used to decode a dataset of images:
diff --git a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
index f1b425aab7300f528fe78485a5687b095da74cac..82c6b5a619662ba5cbaba1b3a238045a8d9a2cd2 100644
--- a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
+++ b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
@@ -33,7 +33,7 @@ To read data using multiple readers simultaneous with shuffling:
       shuffle=True)
   images, labels = pascal_voc_data_provider.get(['images', 'labels'])
 
-Equivalently, one may request different fields of the same sample seperately:
+Equivalently, one may request different fields of the same sample separately:
 
   [images] = pascal_voc_data_provider.get(['images'])
   [labels] = pascal_voc_data_provider.get(['labels'])
@@ -59,7 +59,8 @@ class DatasetDataProvider(data_provider.DataProvider):
                common_queue_capacity=256,
                common_queue_min=128,
                record_key='record_key',
-               seed=None):
+               seed=None,
+               scope=None):
     """Creates a DatasetDataProvider.
 
     Args:
@@ -76,6 +77,7 @@ class DatasetDataProvider(data_provider.DataProvider):
       record_key: The item name to use for the dataset record keys in the
         provided tensors.
       seed: The seed to use if shuffling.
+      scope: Optional name scope for the ops.
     Raises:
       ValueError: If `record_key` matches one of the items in the dataset.
     """
@@ -88,7 +90,8 @@ class DatasetDataProvider(data_provider.DataProvider):
         shuffle=shuffle,
         capacity=common_queue_capacity,
         min_after_dequeue=common_queue_min,
-        seed=seed)
+        seed=seed,
+        scope=scope)
 
     items = dataset.decoder.list_items()
     tensors = dataset.decoder.decode(data, items)
diff --git a/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py b/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
index ea25fe8fd37bd0e7b31d555ce15d956ae916d606..37e9c4754ca62fc02f9146632943a50c33f9423d 100644
--- a/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
+++ b/tensorflow/contrib/slim/python/slim/data/prefetch_queue.py
@@ -25,9 +25,15 @@ from tensorflow.python.summary import summary
 from tensorflow.python.training import queue_runner
 
 
+def _which_queue(dynamic_pad):
+  return (data_flow_ops.PaddingFIFOQueue if dynamic_pad
+          else data_flow_ops.FIFOQueue)
+
+
 def prefetch_queue(tensors,
                    capacity=8,
                    num_threads=1,
+                   dynamic_pad=False,
                    shared_name=None,
                    name=None):
   """Creates a queue to prefetech tensors from `tensors`.
@@ -50,6 +56,7 @@ def prefetch_queue(tensors,
     tensors: A list or dictionary of `Tensors` to enqueue in the buffer.
     capacity: An integer. The maximum number of elements in the queue.
     num_threads: An integer.  Number of threads running the enqueue op.
+    dynamic_pad: Boolean.  Whether to allow variable dimensions in input shapes.
     shared_name: (optional). If set, this queue will be shared under the given
       name across multiple sessions.
     name: (Optional) A name for the operations.
@@ -70,7 +77,7 @@ def prefetch_queue(tensors,
   with ops.name_scope(name, "prefetch_queue", tensor_list) as name:
     dtypes = [t.dtype for t in tensor_list]
     shapes = [t.get_shape() for t in tensor_list]
-    queue = data_flow_ops.FIFOQueue(
+    queue = _which_queue(dynamic_pad)(
         capacity=capacity,
         dtypes=dtypes,
         shapes=shapes,
diff --git a/tensorflow/contrib/slim/python/slim/data/prefetch_queue_test.py b/tensorflow/contrib/slim/python/slim/data/prefetch_queue_test.py
index 0a3a9e700bd66e3cfae5f4ad31ebdbd7544f5870..6c3e57c47deefd3ed2c5c6a27fd6d07c293b2ad2 100644
--- a/tensorflow/contrib/slim/python/slim/data/prefetch_queue_test.py
+++ b/tensorflow/contrib/slim/python/slim/data/prefetch_queue_test.py
@@ -25,6 +25,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -153,6 +155,63 @@ class PrefetchQueueTest(test.TestCase):
       for thread in threads:
         thread.join()
 
+  def testDynamicPad_failure(self):
+    with ops.Graph().as_default():
+      variable_tensor = array_ops.placeholder(dtypes.int32, shape=[None, 3])
+      with self.assertRaisesRegexp(ValueError, 'shapes must be fully defined'):
+        prefetch_queue.prefetch_queue([variable_tensor])
+
+  def testDynamicPad(self):
+    with self.test_session() as sess:
+      # Create 3 tensors of variable but compatible shapes.
+      var_shape = [None, 2]
+      p1 = constant_op.constant([[1, 2], [3, 4]])
+      p1.set_shape(var_shape)
+      p2 = constant_op.constant([[5, 6], [7, 8], [9, 10]])
+      p2.set_shape(var_shape)
+      p3 = constant_op.constant([[11, 12]])
+      p3.set_shape(var_shape)
+      batch = [p1, p2, p3]
+      batch_size = len(batch)
+
+      zero64 = constant_op.constant(0, dtype=dtypes.int64)
+      examples = variables.Variable(zero64)
+      counter = examples.count_up_to(batch_size)
+
+      # Create a PaddingFIFOQueue to enqueue these tensors.
+      q = data_flow_ops.PaddingFIFOQueue(
+          capacity=10, dtypes=[dtypes.int32], shapes=[var_shape])
+      for tensor in [p1, p2, p3]:
+        q.enqueue([tensor]).run()
+
+      # Dequeue from the queue and batch them using batch().
+      batches = input_lib.batch([q.dequeue(), counter], batch_size=batch_size,
+                                num_threads=1, dynamic_pad=True)
+      self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())
+
+      # Finally, assemble them into prefetch_queue with dynamic_pad.
+      batcher = prefetch_queue.prefetch_queue(batches, dynamic_pad=True)
+      batches = batcher.dequeue()
+      self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())
+
+      variables.global_variables_initializer().run()
+      threads = queue_runner_impl.start_queue_runners()
+
+      values, _ = sess.run(batches)
+      # We enqueued 3 tensors of [None, 2] shapes, so using dynamic_pad
+      # they should be padded to the fixed size [3, 3, 2], where 3
+      # is the maximum length of the batch.
+      self.assertTrue(np.array_equal(
+          np.array([[[1, 2], [3, 4], [0, 0]],
+                    [[5, 6], [7, 8], [9, 10]],
+                    [[11, 12], [0, 0], [0, 0]]]),
+          values))
+
+      with self.assertRaises(errors_impl.OutOfRangeError):
+        sess.run(batches)
+      for thread in threads:
+        thread.join()
+
   def testDictConstruction(self):
     with ops.Graph().as_default():
       batches = {
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index 1e24f84b22c3012ff9bbf9b0400eb3ce9248b131..f0e028cd778865267340373cc72c1097488e4bcd 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
@@ -274,7 +275,8 @@ class Image(ItemHandler):
                format_key=None,
                shape=None,
                channels=3,
-               dtype=dtypes.uint8):
+               dtype=dtypes.uint8,
+               repeated=False):
     """Initializes the image.
 
     Args:
@@ -289,9 +291,10 @@ class Image(ItemHandler):
       channels: the number of channels in the image.
       dtype: images will be decoded at this bit depth. Different formats
         support different bit depths.
-          See tf.image.decode_png,
+          See tf.image.decode_image,
               tf.decode_raw,
-              tf.image.decode_jpeg: only supports tf.uint8
+      repeated: if False, decodes a single image. If True, decodes a
+        variable number of image strings from a 1D tensor of strings.
     """
     if not image_key:
       image_key = 'image/encoded'
@@ -304,61 +307,47 @@ class Image(ItemHandler):
     self._shape = shape
     self._channels = channels
     self._dtype = dtype
+    self._repeated = repeated
 
   def tensors_to_item(self, keys_to_tensors):
     """See base class."""
     image_buffer = keys_to_tensors[self._image_key]
     image_format = keys_to_tensors[self._format_key]
 
-    return self._decode(image_buffer, image_format)
+    if self._repeated:
+      return functional_ops.map_fn(lambda x: self._decode(x, image_format),
+                                   image_buffer, dtype=self._dtype)
+    else:
+      return self._decode(image_buffer, image_format)
 
   def _decode(self, image_buffer, image_format):
     """Decodes the image buffer.
 
     Args:
       image_buffer: The tensor representing the encoded image tensor.
-      image_format: The image format for the image in `image_buffer`.
+      image_format: The image format for the image in `image_buffer`. If image
+        format is `raw`, all images are expected to be in this format, otherwise
+        this op can decode a mix of `jpg` and `png` formats.
 
     Returns:
       A tensor that represents decoded image of self._shape, or
       (?, ?, self._channels) if self._shape is not specified.
     """
-
-    def decode_png():
-      return image_ops.decode_png(
-          image_buffer, self._channels, dtype=self._dtype)
+    def decode_image():
+      """Decodes a png or jpg based on the headers."""
+      return image_ops.decode_image(image_buffer, self._channels)
 
     def decode_raw():
+      """Decodes a raw image."""
       return parsing_ops.decode_raw(image_buffer, out_type=self._dtype)
 
-    def decode_jpg():
-      if self._dtype != dtypes.uint8:
-        raise ValueError(
-            'jpeg decoder can only be used to decode to tf.uint8 but %s was '
-            'requested for a jpeg image.' % self._dtype)
-      return image_ops.decode_jpeg(image_buffer, self._channels)
-
-    # For RGBA images JPEG is not a valid decoder option.
-    if self._channels > 3:
-      pred_fn_pairs = {
-          math_ops.logical_or(
-              math_ops.equal(image_format, 'raw'),
-              math_ops.equal(image_format, 'RAW')): decode_raw,
-      }
-      default_decoder = decode_png
-    else:
-      pred_fn_pairs = {
-          math_ops.logical_or(
-              math_ops.equal(image_format, 'png'),
-              math_ops.equal(image_format, 'PNG')): decode_png,
-          math_ops.logical_or(
-              math_ops.equal(image_format, 'raw'),
-              math_ops.equal(image_format, 'RAW')): decode_raw,
-      }
-      default_decoder = decode_jpg
-
+    pred_fn_pairs = {
+        math_ops.logical_or(
+            math_ops.equal(image_format, 'raw'),
+            math_ops.equal(image_format, 'RAW')): decode_raw,
+    }
     image = control_flow_ops.case(
-        pred_fn_pairs, default=default_decoder, exclusive=True)
+        pred_fn_pairs, default=decode_image, exclusive=True)
 
     image.set_shape([None, None, self._channels])
     if self._shape is not None:
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
index dd3c6a39a244e66b9a7860e2b6702969a7de7038..506f4bd8777dd4229f8e76f424b87f899608e386 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder_test.py
@@ -228,9 +228,7 @@ class TFExampleDecoderTest(test.TestCase):
     image_shape = (2, 3, 3)
     unused_image, serialized_example = self.GenerateImage(
         image_format='jpeg', image_shape=image_shape)
-    expected_regex = ('jpeg decoder can only be used to decode to tf.uint8 but '
-                      '.* was requested for a jpeg image.')
-    with self.assertRaisesRegexp(ValueError, expected_regex):
+    with self.assertRaises(TypeError):
       unused_decoded_image = self.RunDecodeExample(
           serialized_example,
           tfexample_decoder.Image(dtype=dtypes.uint16),
@@ -730,6 +728,43 @@ class TFExampleDecoderTest(test.TestCase):
 
     self.assertAllClose(np_bboxes, bboxes)
 
+  def testDecodeExampleWithRepeatedImages(self):
+    image_shape = (2, 3, 3)
+    image_format = 'png'
+    image, _ = self.GenerateImage(
+        image_format=image_format, image_shape=image_shape)
+    tf_encoded = self._Encoder(image, image_format)
+    with self.test_session():
+      tf_string = tf_encoded.eval()
+
+    example = example_pb2.Example(features=feature_pb2.Features(feature={
+        'image/encoded': feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+            value=[tf_string, tf_string])),
+        'image/format': self._StringFeature(image_format),
+    }))
+    serialized_example = example.SerializeToString()
+
+    with self.test_session():
+      serialized_example = array_ops.reshape(serialized_example, shape=[])
+
+      decoder = tfexample_decoder.TFExampleDecoder(
+          keys_to_features={
+              'image/encoded':
+                  parsing_ops.FixedLenFeature(
+                      (2,), dtypes.string),
+              'image/format':
+                  parsing_ops.FixedLenFeature(
+                      (), dtypes.string, default_value=image_format),
+          },
+          items_to_handlers={'image': tfexample_decoder.Image(repeated=True)})
+      [tf_image] = decoder.decode(serialized_example, ['image'])
+
+      output_image = tf_image.eval()
+
+      self.assertEqual(output_image.shape, (2, 2, 3, 3))
+      self.assertAllEqual(np.squeeze(output_image[0, :, :, :]), image)
+      self.assertAllEqual(np.squeeze(output_image[1, :, :, :]), image)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index 85616be410b1e391827e80fd19237b01614eb5b7..15c9f3d3f4488818cf2690c240af676bfd1f5128 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -224,7 +224,8 @@ def evaluation_loop(master,
                     eval_interval_secs=60,
                     max_number_of_evaluations=None,
                     session_config=None,
-                    timeout=None):
+                    timeout=None,
+                    hooks=None):
   """Runs TF-Slim's Evaluation Loop.
 
   Args:
@@ -255,6 +256,8 @@ def evaluation_loop(master,
       configure the `Session`. If left as `None`, the default will be used.
     timeout: The maximum amount of time to wait between checkpoints. If left as
       `None`, then the process will wait indefinitely.
+    hooks: A list of additional SessionRunHook objects to pass during
+      repeated evaluations.
 
   Returns:
     The value of `final_op` or `None` if `final_op` is `None`.
@@ -262,12 +265,16 @@ def evaluation_loop(master,
   if summary_op == _USE_DEFAULT:
     summary_op = summary.merge_all()
 
-  hooks = [evaluation.StopAfterNEvalsHook(num_evals),]
+  all_hooks = [evaluation.StopAfterNEvalsHook(num_evals),]
 
   if summary_op is not None:
-    hooks.append(evaluation.SummaryAtEndHook(
+    all_hooks.append(evaluation.SummaryAtEndHook(
         log_dir=logdir, summary_op=summary_op, feed_dict=summary_op_feed_dict))
 
+  if hooks is not None:
+    # Add custom hooks if provided.
+    all_hooks.extend(hooks)
+
   saver = None
   if variables_to_restore is not None:
     saver = tf_saver.Saver(variables_to_restore)
@@ -283,7 +290,7 @@ def evaluation_loop(master,
       final_ops=final_op,
       final_ops_feed_dict=final_op_feed_dict,
       eval_interval_secs=eval_interval_secs,
-      hooks=hooks,
+      hooks=all_hooks,
       config=session_config,
       max_number_of_evaluations=max_number_of_evaluations,
       timeout=timeout)
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 18c97d75e583ed87e712420c885d521fe1429e63..d9e0f54b724d3b44db158c6d57e7220d28cf7b8a 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -41,6 +41,8 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary import summary_iterator
 from tensorflow.python.training import input
 from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import session_run_hook
+
 
 FLAGS = flags.FLAGS
 
@@ -100,6 +102,22 @@ class EvaluationTest(test.TestCase):
       init_op.run()
       saver.save(sess, os.path.join(chkpt_dir, 'chkpt'))
 
+    class Object(object):
+
+      def __init__(self):
+        self.hook_was_run = False
+
+    obj = Object()
+
+    # Create a custom session run hook.
+    class CustomHook(session_run_hook.SessionRunHook):
+
+      def __init__(self, obj):
+        self.obj = obj
+
+      def end(self, session):
+        self.obj.hook_was_run = True
+
     # Now, run the evaluation loop:
     accuracy_value = evaluation.evaluation_loop(
         '',
@@ -107,9 +125,13 @@ class EvaluationTest(test.TestCase):
         logdir,
         eval_op=update_op,
         final_op=value_op,
+        hooks=[CustomHook(obj)],
         max_number_of_evaluations=1)
     self.assertAlmostEqual(accuracy_value, self._expected_accuracy)
 
+    # Validate that custom hook ran.
+    self.assertTrue(obj.hook_was_run)
+
   def _create_names_to_metrics(self, predictions, labels):
     accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels)
     accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1,
diff --git a/tensorflow/contrib/slim/python/slim/learning.py b/tensorflow/contrib/slim/python/slim/learning.py
index 814ce51100cea18ffc21564a2e22a1d489ccaf61..f7dddc46c365262ed30cec8e4ece694c47705cf4 100644
--- a/tensorflow/contrib/slim/python/slim/learning.py
+++ b/tensorflow/contrib/slim/python/slim/learning.py
@@ -261,7 +261,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
@@ -502,7 +502,7 @@ def train_step(sess, train_op, global_step, train_step_kwargs):
 
   if 'should_log' in train_step_kwargs:
     if sess.run(train_step_kwargs['should_log']):
-      logging.info('global step %d: loss = %.4f (%.2f sec/step)',
+      logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                    np_global_step, total_loss, time_elapsed)
 
   # TODO(nsilberman): figure out why we can't put this into sess.run. The
@@ -578,8 +578,10 @@ def train(train_op,
       replica during replica training.
     global_step: The `Tensor` representing the global step. If left as `None`,
       then slim.variables.get_or_create_global_step() is used.
-    number_of_steps: The max number of gradient steps to take during training.
-      If the value is left as None, training proceeds indefinitely.
+    number_of_steps: The max number of gradient steps to take during training,
+      as measured by 'global_step': training will stop if global_step is
+      greater than 'number_of_steps'. If the value is left as None, training
+      proceeds indefinitely.
     init_op: The initialization operation. If left to its default value, then
       the session is initialized by calling `tf.global_variables_initializer()`.
     init_feed_dict: A feed dictionary to use when executing the `init_op`.
@@ -655,7 +657,7 @@ def train(train_op,
       if local_init_op == _USE_DEFAULT:
         local_init_op = control_flow_ops.group(
             tf_variables.local_variables_initializer(),
-            data_flow_ops.tables_initializer())
+            lookup_ops.tables_initializer())
 
       if sync_optimizer is not None and isinstance(
           sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
@@ -694,8 +696,9 @@ def train(train_op,
         else:
           should_stop_op = constant_op.constant(False)
         train_step_kwargs['should_stop'] = should_stop_op
-        train_step_kwargs['should_log'] = math_ops.equal(
-            math_ops.mod(global_step, log_every_n_steps), 0)
+        if log_every_n_steps > 0:
+          train_step_kwargs['should_log'] = math_ops.equal(
+              math_ops.mod(global_step, log_every_n_steps), 0)
         if is_chief and trace_every_n_steps is not None:
           train_step_kwargs['should_trace'] = math_ops.equal(
               math_ops.mod(global_step, trace_every_n_steps), 0)
@@ -735,7 +738,7 @@ def train(train_op,
           _wait_for_step(sess, global_step,
                          min(startup_delay_steps, number_of_steps or
                              sys.maxint))
-        sv.start_queue_runners(sess)
+        threads = sv.start_queue_runners(sess)
         logging.info('Starting Queues.')
         if is_chief and sync_optimizer is not None:
           sv.start_queue_runners(sess, [chief_queue_runner])
@@ -746,6 +749,7 @@ def train(train_op,
                 sess, train_op, global_step, train_step_kwargs)
             if should_stop:
               logging.info('Stopping Training.')
+              sv.request_stop()
               break
         except errors.OutOfRangeError:
           # OutOfRangeError is thrown when epoch limit per
@@ -754,6 +758,7 @@ def train(train_op,
         if logdir and sv.is_chief:
           logging.info('Finished training! Saving model to disk.')
           sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
+          sv.stop(threads, close_summary_writer=True)
 
     except errors.AbortedError:
       # Always re-run on AbortedError as it indicates a restart of one of the
diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py
index cf3a878450d776e9d0c94892d63dbb9f2f803200..83d45f6f5adaccfca0a04629172ee803bab10ba7 100644
--- a/tensorflow/contrib/slim/python/slim/learning_test.py
+++ b/tensorflow/contrib/slim/python/slim/learning_test.py
@@ -840,7 +840,7 @@ class TrainTest(test.TestCase):
         # Initialize the variables.
         sess.run(variables_lib.global_variables_initializer())
 
-        # Get the intial weights and biases values.
+        # Get the initial weights and biases values.
         weights_values, biases_values = sess.run([weights, biases])
         self.assertGreater(np.linalg.norm(weights_values), 0)
         self.assertAlmostEqual(np.linalg.norm(biases_values), 0)
diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD
index 8077818216a7455f3f00b4b84294a2ef4752e047..737bbbe57b2ecb1fb56052a7a10ca92fa19415f9 100644
--- a/tensorflow/contrib/slim/python/slim/nets/BUILD
+++ b/tensorflow/contrib/slim/python/slim/nets/BUILD
@@ -286,6 +286,26 @@ py_test(
     ],
 )
 
+py_test(
+    name = "resnet_is_training_test",
+    size = "medium",
+    srcs = ["resnet_is_training_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":resnet_utils",
+        ":resnet_v1",
+        ":resnet_v2",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "vgg",
     srcs = ["vgg.py"],
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_is_training_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_is_training_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a165577b699f757057aa10cc14bc1d48c02343a
--- /dev/null
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_is_training_test.py
@@ -0,0 +1,154 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Specifying is_training in resnet_arg_scope is being deprecated.
+
+Test that everything behaves as expected in the meantime.
+
+Note: This test modifies the layers.batch_norm function.
+Other tests that use layers.batch_norm may not work if added to this file.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import layers
+from tensorflow.contrib.framework.python.ops import add_arg_scope
+from tensorflow.contrib.framework.python.ops import arg_scope
+from tensorflow.contrib.slim.python.slim.nets import resnet_utils
+from tensorflow.contrib.slim.python.slim.nets import resnet_v1
+from tensorflow.contrib.slim.python.slim.nets import resnet_v2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def create_test_input(batch, height, width, channels):
+  """Create test input tensor."""
+  if None in [batch, height, width, channels]:
+    return array_ops.placeholder(dtypes.float32, (batch, height, width,
+                                                  channels))
+  else:
+    return math_ops.to_float(
+        np.tile(
+            np.reshape(
+                np.reshape(np.arange(height), [height, 1]) +
+                np.reshape(np.arange(width), [1, width]),
+                [1, height, width, 1]),
+            [batch, 1, 1, channels]))
+
+
+class ResnetIsTrainingTest(test.TestCase):
+
+  def _testDeprecatingIsTraining(self, network_fn):
+    batch_norm_fn = layers.batch_norm
+
+    @add_arg_scope
+    def batch_norm_expect_is_training(*args, **kwargs):
+      assert kwargs['is_training']
+      return batch_norm_fn(*args, **kwargs)
+
+    @add_arg_scope
+    def batch_norm_expect_is_not_training(*args, **kwargs):
+      assert not kwargs['is_training']
+      return batch_norm_fn(*args, **kwargs)
+
+    global_pool = True
+    num_classes = 10
+    inputs = create_test_input(2, 224, 224, 3)
+
+    # Default argument for resnet_arg_scope
+    layers.batch_norm = batch_norm_expect_is_training
+    with arg_scope(resnet_utils.resnet_arg_scope()):
+      network_fn(inputs, num_classes, global_pool=global_pool, scope='resnet1')
+
+    layers.batch_norm = batch_norm_expect_is_training
+    with arg_scope(resnet_utils.resnet_arg_scope()):
+      network_fn(
+          inputs,
+          num_classes,
+          is_training=True,
+          global_pool=global_pool,
+          scope='resnet2')
+
+    layers.batch_norm = batch_norm_expect_is_not_training
+    with arg_scope(resnet_utils.resnet_arg_scope()):
+      network_fn(
+          inputs,
+          num_classes,
+          is_training=False,
+          global_pool=global_pool,
+          scope='resnet3')
+
+    # resnet_arg_scope with is_training set to True (deprecated)
+    layers.batch_norm = batch_norm_expect_is_training
+    with arg_scope(resnet_utils.resnet_arg_scope(is_training=True)):
+      network_fn(inputs, num_classes, global_pool=global_pool, scope='resnet4')
+
+    layers.batch_norm = batch_norm_expect_is_training
+    with arg_scope(resnet_utils.resnet_arg_scope(is_training=True)):
+      network_fn(
+          inputs,
+          num_classes,
+          is_training=True,
+          global_pool=global_pool,
+          scope='resnet5')
+
+    layers.batch_norm = batch_norm_expect_is_not_training
+    with arg_scope(resnet_utils.resnet_arg_scope(is_training=True)):
+      network_fn(
+          inputs,
+          num_classes,
+          is_training=False,
+          global_pool=global_pool,
+          scope='resnet6')
+
+    # resnet_arg_scope with is_training set to False (deprecated)
+    layers.batch_norm = batch_norm_expect_is_not_training
+    with arg_scope(resnet_utils.resnet_arg_scope(is_training=False)):
+      network_fn(inputs, num_classes, global_pool=global_pool, scope='resnet7')
+
+    layers.batch_norm = batch_norm_expect_is_training
+    with arg_scope(resnet_utils.resnet_arg_scope(is_training=False)):
+      network_fn(
+          inputs,
+          num_classes,
+          is_training=True,
+          global_pool=global_pool,
+          scope='resnet8')
+
+    layers.batch_norm = batch_norm_expect_is_not_training
+    with arg_scope(resnet_utils.resnet_arg_scope(is_training=False)):
+      network_fn(
+          inputs,
+          num_classes,
+          is_training=False,
+          global_pool=global_pool,
+          scope='resnet9')
+
+    layers.batch_norm = batch_norm_fn
+
+  def testDeprecatingIsTrainingResnetV1(self):
+    self._testDeprecatingIsTraining(resnet_v1.resnet_v1_50)
+
+  def testDeprecatingIsTrainingResnetV2(self):
+    self._testDeprecatingIsTraining(resnet_v2.resnet_v2_50)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_utils.py b/tensorflow/contrib/slim/python/slim/nets/resnet_utils.py
index 89d27438e560902f47a398b7255fd738e498ff4c..58614a998abc2a983c4cd8df934cb30090c6443f 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_utils.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_utils.py
@@ -41,6 +41,7 @@ from __future__ import print_function
 import collections
 
 from tensorflow.contrib import layers as layers_lib
+from tensorflow.contrib.framework import deprecated_args
 from tensorflow.contrib.framework.python.ops import add_arg_scope
 from tensorflow.contrib.framework.python.ops import arg_scope
 from tensorflow.contrib.layers.python.layers import initializers
@@ -222,6 +223,10 @@ def stack_blocks_dense(net,
   return net
 
 
+@deprecated_args(
+    '2017-08-01',
+    'Pass is_training directly to the network instead of the arg_scope.',
+    'is_training')
 def resnet_arg_scope(is_training=True,
                      weight_decay=0.0001,
                      batch_norm_decay=0.997,
@@ -236,7 +241,7 @@ def resnet_arg_scope(is_training=True,
 
   Args:
     is_training: Whether or not we are training the parameters in the batch
-      normalization layers of the model.
+      normalization layers of the model. (deprecated)
     weight_decay: The weight decay to use for regularizing the model.
     batch_norm_decay: The moving average decay when estimating layer activation
       statistics in batch normalization.
@@ -261,8 +266,7 @@ def resnet_arg_scope(is_training=True,
       weights_regularizer=regularizers.l2_regularizer(weight_decay),
       weights_initializer=initializers.variance_scaling_initializer(),
       activation_fn=nn_ops.relu,
-      normalizer_fn=layers.batch_norm,
-      normalizer_params=batch_norm_params):
+      normalizer_fn=layers.batch_norm):
     with arg_scope([layers.batch_norm], **batch_norm_params):
       # The following implies padding='SAME' for pool1, which makes feature
       # alignment easier for dense prediction tasks. This is also used in
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
index fe13ce1b0ed663bf41c69366c2d642780c532e85..90f93d46e34b7554353d74529360d8e9a8ff5d06 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1.py
@@ -40,15 +40,16 @@ Typical use:
 ResNet-101 for image classification into 1000 classes:
 
    # inputs has shape [batch, 224, 224, 3]
-   with slim.arg_scope(resnet_v1.resnet_arg_scope(is_training)):
-      net, end_points = resnet_v1.resnet_v1_101(inputs, 1000)
+   with slim.arg_scope(resnet_v1.resnet_arg_scope()):
+      net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False)
 
 ResNet-101 for semantic segmentation into 21 classes:
 
    # inputs has shape [batch, 513, 513, 3]
-   with slim.arg_scope(resnet_v1.resnet_arg_scope(is_training)):
+   with slim.arg_scope(resnet_v1.resnet_arg_scope()):
       net, end_points = resnet_v1.resnet_v1_101(inputs,
                                                 21,
+                                                is_training=False,
                                                 global_pool=False,
                                                 output_stride=16)
 """
@@ -127,6 +128,7 @@ def bottleneck(inputs,
 def resnet_v1(inputs,
               blocks,
               num_classes=None,
+              is_training=None,
               global_pool=True,
               output_stride=None,
               include_root_block=True,
@@ -161,6 +163,8 @@ def resnet_v1(inputs,
       is a resnet_utils.Block object describing the units in the block.
     num_classes: Number of predicted classes for classification tasks. If None
       we return the features before the logit layer.
+    is_training: whether is training or not. If None, the value inherited from
+      the resnet_arg_scope is used. Specifying value None is deprecated.
     global_pool: If True, we perform global average pooling before computing the
       logits. Set to True for image classification, False for dense prediction.
     output_stride: If None, then the output will be computed at the nominal
@@ -192,30 +196,36 @@ def resnet_v1(inputs,
     with arg_scope(
         [layers.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
         outputs_collections=end_points_collection):
-      net = inputs
-      if include_root_block:
-        if output_stride is not None:
-          if output_stride % 4 != 0:
-            raise ValueError('The output_stride needs to be a multiple of 4.')
-          output_stride /= 4
-        net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
-        net = layers_lib.max_pool2d(net, [3, 3], stride=2, scope='pool1')
-      net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
-      if global_pool:
-        # Global average pooling.
-        net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
-      if num_classes is not None:
-        net = layers.conv2d(
-            net,
-            num_classes, [1, 1],
-            activation_fn=None,
-            normalizer_fn=None,
-            scope='logits')
-      # Convert end_points_collection into a dictionary of end_points.
-      end_points = utils.convert_collection_to_dict(end_points_collection)
-      if num_classes is not None:
-        end_points['predictions'] = layers_lib.softmax(net, scope='predictions')
-      return net, end_points
+      if is_training is not None:
+        bn_scope = arg_scope([layers.batch_norm], is_training=is_training)
+      else:
+        bn_scope = arg_scope([])
+      with bn_scope:
+        net = inputs
+        if include_root_block:
+          if output_stride is not None:
+            if output_stride % 4 != 0:
+              raise ValueError('The output_stride needs to be a multiple of 4.')
+            output_stride /= 4
+          net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
+          net = layers_lib.max_pool2d(net, [3, 3], stride=2, scope='pool1')
+        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
+        if global_pool:
+          # Global average pooling.
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+        if num_classes is not None:
+          net = layers.conv2d(
+              net,
+              num_classes, [1, 1],
+              activation_fn=None,
+              normalizer_fn=None,
+              scope='logits')
+        # Convert end_points_collection into a dictionary of end_points.
+        end_points = utils.convert_collection_to_dict(end_points_collection)
+        if num_classes is not None:
+          end_points['predictions'] = layers_lib.softmax(
+              net, scope='predictions')
+        return net, end_points
 resnet_v1.default_image_size = 224
 
 
@@ -245,6 +255,7 @@ def resnet_v1_block(scope, base_depth, num_units, stride):
 
 def resnet_v1_50(inputs,
                  num_classes=None,
+                 is_training=None,
                  global_pool=True,
                  output_stride=None,
                  reuse=None,
@@ -260,6 +271,7 @@ def resnet_v1_50(inputs,
       inputs,
       blocks,
       num_classes,
+      is_training,
       global_pool,
       output_stride,
       include_root_block=True,
@@ -269,6 +281,7 @@ def resnet_v1_50(inputs,
 
 def resnet_v1_101(inputs,
                   num_classes=None,
+                  is_training=None,
                   global_pool=True,
                   output_stride=None,
                   reuse=None,
@@ -284,6 +297,7 @@ def resnet_v1_101(inputs,
       inputs,
       blocks,
       num_classes,
+      is_training,
       global_pool,
       output_stride,
       include_root_block=True,
@@ -293,6 +307,7 @@ def resnet_v1_101(inputs,
 
 def resnet_v1_152(inputs,
                   num_classes=None,
+                  is_training=None,
                   global_pool=True,
                   output_stride=None,
                   reuse=None,
@@ -308,6 +323,7 @@ def resnet_v1_152(inputs,
       inputs,
       blocks,
       num_classes,
+      is_training,
       global_pool,
       output_stride,
       include_root_block=True,
@@ -317,6 +333,7 @@ def resnet_v1_152(inputs,
 
 def resnet_v1_200(inputs,
                   num_classes=None,
+                  is_training=None,
                   global_pool=True,
                   output_stride=None,
                   reuse=None,
@@ -332,6 +349,7 @@ def resnet_v1_200(inputs,
       inputs,
       blocks,
       num_classes,
+      is_training,
       global_pool,
       output_stride,
       include_root_block=True,
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
index dffd29f92077656a00af9422f359a4a9df70810e..d510337fef0762e086aee7341d4739393ee165f8 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
@@ -219,28 +219,29 @@ class ResnetUtilsTest(test.TestCase):
     # Test both odd and even input dimensions.
     height = 30
     width = 31
-    with arg_scope(resnet_utils.resnet_arg_scope(is_training=False)):
-      for output_stride in [1, 2, 4, 8, None]:
-        with ops.Graph().as_default():
-          with self.test_session() as sess:
-            random_seed.set_random_seed(0)
-            inputs = create_test_input(1, height, width, 3)
-            # Dense feature extraction followed by subsampling.
-            output = resnet_utils.stack_blocks_dense(inputs, blocks,
-                                                     output_stride)
-            if output_stride is None:
-              factor = 1
-            else:
-              factor = nominal_stride // output_stride
-
-            output = resnet_utils.subsample(output, factor)
-            # Make the two networks use the same weights.
-            variable_scope.get_variable_scope().reuse_variables()
-            # Feature extraction at the nominal network rate.
-            expected = self._stack_blocks_nondense(inputs, blocks)
-            sess.run(variables.global_variables_initializer())
-            output, expected = sess.run([output, expected])
-            self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4)
+    with arg_scope(resnet_utils.resnet_arg_scope()):
+      with arg_scope([layers.batch_norm], is_training=False):
+        for output_stride in [1, 2, 4, 8, None]:
+          with ops.Graph().as_default():
+            with self.test_session() as sess:
+              random_seed.set_random_seed(0)
+              inputs = create_test_input(1, height, width, 3)
+              # Dense feature extraction followed by subsampling.
+              output = resnet_utils.stack_blocks_dense(inputs, blocks,
+                                                       output_stride)
+              if output_stride is None:
+                factor = 1
+              else:
+                factor = nominal_stride // output_stride
+
+              output = resnet_utils.subsample(output, factor)
+              # Make the two networks use the same weights.
+              variable_scope.get_variable_scope().reuse_variables()
+              # Feature extraction at the nominal network rate.
+              expected = self._stack_blocks_nondense(inputs, blocks)
+              sess.run(variables.global_variables_initializer())
+              output, expected = sess.run([output, expected])
+              self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4)
 
 
 class ResnetCompleteNetworkTest(test.TestCase):
@@ -249,6 +250,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
   def _resnet_small(self,
                     inputs,
                     num_classes=None,
+                    is_training=None,
                     global_pool=True,
                     output_stride=None,
                     include_root_block=True,
@@ -262,8 +264,9 @@ class ResnetCompleteNetworkTest(test.TestCase):
         block('block3', base_depth=4, num_units=3, stride=2),
         block('block4', base_depth=8, num_units=2, stride=1),
     ]
-    return resnet_v1.resnet_v1(inputs, blocks, num_classes, global_pool,
-                               output_stride, include_root_block, reuse, scope)
+    return resnet_v1.resnet_v1(inputs, blocks, num_classes, is_training,
+                               global_pool, output_stride, include_root_block,
+                               reuse, scope)
 
   def testClassificationEndPoints(self):
     global_pool = True
@@ -271,7 +274,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(2, 224, 224, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       logits, end_points = self._resnet_small(
-          inputs, num_classes, global_pool, scope='resnet')
+          inputs, num_classes, global_pool=global_pool, scope='resnet')
     self.assertTrue(logits.op.name.startswith('resnet/logits'))
     self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
     self.assertTrue('predictions' in end_points)
@@ -284,7 +287,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(2, 224, 224, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       _, end_points = self._resnet_small(
-          inputs, num_classes, global_pool, scope='resnet')
+          inputs, num_classes, global_pool=global_pool, scope='resnet')
       endpoint_to_shape = {
           'resnet/block1': [2, 28, 28, 4],
           'resnet/block2': [2, 14, 14, 8],
@@ -301,7 +304,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(2, 321, 321, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       _, end_points = self._resnet_small(
-          inputs, num_classes, global_pool, scope='resnet')
+          inputs, num_classes, global_pool=global_pool, scope='resnet')
       endpoint_to_shape = {
           'resnet/block1': [2, 41, 41, 4],
           'resnet/block2': [2, 21, 21, 8],
@@ -320,7 +323,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
       _, end_points = self._resnet_small(
           inputs,
           num_classes,
-          global_pool,
+          global_pool=global_pool,
           include_root_block=False,
           scope='resnet')
       endpoint_to_shape = {
@@ -342,7 +345,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
       _, end_points = self._resnet_small(
           inputs,
           num_classes,
-          global_pool,
+          global_pool=global_pool,
           output_stride=output_stride,
           scope='resnet')
       endpoint_to_shape = {
@@ -359,14 +362,18 @@ class ResnetCompleteNetworkTest(test.TestCase):
     """Verify dense feature extraction with atrous convolution."""
     nominal_stride = 32
     for output_stride in [4, 8, 16, 32, None]:
-      with arg_scope(resnet_utils.resnet_arg_scope(is_training=False)):
+      with arg_scope(resnet_utils.resnet_arg_scope()):
         with ops.Graph().as_default():
           with self.test_session() as sess:
             random_seed.set_random_seed(0)
             inputs = create_test_input(2, 81, 81, 3)
             # Dense feature extraction followed by subsampling.
             output, _ = self._resnet_small(
-                inputs, None, global_pool=False, output_stride=output_stride)
+                inputs,
+                None,
+                is_training=False,
+                global_pool=False,
+                output_stride=output_stride)
             if output_stride is None:
               factor = 1
             else:
@@ -375,7 +382,8 @@ class ResnetCompleteNetworkTest(test.TestCase):
             # Make the two networks use the same weights.
             variable_scope.get_variable_scope().reuse_variables()
             # Feature extraction at the nominal network rate.
-            expected, _ = self._resnet_small(inputs, None, global_pool=False)
+            expected, _ = self._resnet_small(
+                inputs, None, is_training=False, global_pool=False)
             sess.run(variables.global_variables_initializer())
             self.assertAllClose(
                 output.eval(), expected.eval(), atol=1e-4, rtol=1e-4)
@@ -388,7 +396,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(None, height, width, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       logits, _ = self._resnet_small(
-          inputs, num_classes, global_pool, scope='resnet')
+          inputs, num_classes, global_pool=global_pool, scope='resnet')
     self.assertTrue(logits.op.name.startswith('resnet/logits'))
     self.assertListEqual(logits.get_shape().as_list(),
                          [None, 1, 1, num_classes])
@@ -404,7 +412,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     global_pool = False
     inputs = create_test_input(batch, None, None, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
-      output, _ = self._resnet_small(inputs, None, global_pool)
+      output, _ = self._resnet_small(inputs, None, global_pool=global_pool)
     self.assertListEqual(output.get_shape().as_list(), [batch, None, None, 32])
     images = create_test_input(batch, height, width, 3)
     with self.test_session() as sess:
@@ -420,7 +428,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(batch, None, None, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       output, _ = self._resnet_small(
-          inputs, None, global_pool, output_stride=output_stride)
+          inputs, None, global_pool=global_pool, output_stride=output_stride)
     self.assertListEqual(output.get_shape().as_list(), [batch, None, None, 32])
     images = create_test_input(batch, height, width, 3)
     with self.test_session() as sess:
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
index 7e6fe5dfc255fb5925b8f30df4318a509bb0060d..63e8f1ff356dfcf0427d5170a03faa47ee06298c 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2.py
@@ -25,8 +25,6 @@ introduced by:
 
 The key difference of the full preactivation 'v2' variant compared to the
 'v1' variant in [1] is the use of batch normalization before every weight layer.
-Another difference is that 'v2' ResNets do not include an activation function in
-the main pathway. Also see [2; Fig. 4e].
 
 Typical use:
 
@@ -36,15 +34,16 @@ Typical use:
 ResNet-101 for image classification into 1000 classes:
 
    # inputs has shape [batch, 224, 224, 3]
-   with slim.arg_scope(resnet_v2.resnet_arg_scope(is_training)):
-      net, end_points = resnet_v2.resnet_v2_101(inputs, 1000)
+   with slim.arg_scope(resnet_v2.resnet_arg_scope()):
+      net, end_points = resnet_v2.resnet_v2_101(inputs, 1000, is_training=False)
 
 ResNet-101 for semantic segmentation into 21 classes:
 
    # inputs has shape [batch, 513, 513, 3]
-   with slim.arg_scope(resnet_v2.resnet_arg_scope(is_training)):
+   with slim.arg_scope(resnet_v2.resnet_arg_scope()):
       net, end_points = resnet_v2.resnet_v2_101(inputs,
                                                 21,
+                                                is_training=False,
                                                 global_pool=False,
                                                 output_stride=16)
 """
@@ -63,6 +62,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variable_scope
 
+resnet_arg_scope = resnet_utils.resnet_arg_scope
+
 
 @add_arg_scope
 def bottleneck(inputs,
@@ -129,6 +130,7 @@ def bottleneck(inputs,
 def resnet_v2(inputs,
               blocks,
               num_classes=None,
+              is_training=None,
               global_pool=True,
               output_stride=None,
               include_root_block=True,
@@ -163,6 +165,8 @@ def resnet_v2(inputs,
       is a resnet_utils.Block object describing the units in the block.
     num_classes: Number of predicted classes for classification tasks. If None
       we return the features before the logit layer.
+    is_training: whether is training or not. If None, the value inherited from
+      the resnet_arg_scope is used. Specifying value None is deprecated.
     global_pool: If True, we perform global average pooling before computing the
       logits. Set to True for image classification, False for dense prediction.
     output_stride: If None, then the output will be computed at the nominal
@@ -196,38 +200,45 @@ def resnet_v2(inputs,
     with arg_scope(
         [layers_lib.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
         outputs_collections=end_points_collection):
-      net = inputs
-      if include_root_block:
-        if output_stride is not None:
-          if output_stride % 4 != 0:
-            raise ValueError('The output_stride needs to be a multiple of 4.')
-          output_stride /= 4
-        # We do not include batch normalization or activation functions in conv1
-        # because the first ResNet unit will perform these. Cf. Appendix of [2].
-        with arg_scope(
-            [layers_lib.conv2d], activation_fn=None, normalizer_fn=None):
-          net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
-        net = layers.max_pool2d(net, [3, 3], stride=2, scope='pool1')
-      net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
-      # This is needed because the pre-activation variant does not have batch
-      # normalization or activation functions in the residual unit output. See
-      # Appendix of [2].
-      net = layers.batch_norm(net, activation_fn=nn_ops.relu, scope='postnorm')
-      if global_pool:
-        # Global average pooling.
-        net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
-      if num_classes is not None:
-        net = layers_lib.conv2d(
-            net,
-            num_classes, [1, 1],
-            activation_fn=None,
-            normalizer_fn=None,
-            scope='logits')
-      # Convert end_points_collection into a dictionary of end_points.
-      end_points = utils.convert_collection_to_dict(end_points_collection)
-      if num_classes is not None:
-        end_points['predictions'] = layers.softmax(net, scope='predictions')
-      return net, end_points
+      if is_training is not None:
+        bn_scope = arg_scope([layers.batch_norm], is_training=is_training)
+      else:
+        bn_scope = arg_scope([])
+      with bn_scope:
+        net = inputs
+        if include_root_block:
+          if output_stride is not None:
+            if output_stride % 4 != 0:
+              raise ValueError('The output_stride needs to be a multiple of 4.')
+            output_stride /= 4
+          # We do not include batch normalization or activation functions in
+          # conv1 because the first ResNet unit will perform these. Cf.
+          # Appendix of [2].
+          with arg_scope(
+              [layers_lib.conv2d], activation_fn=None, normalizer_fn=None):
+            net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
+          net = layers.max_pool2d(net, [3, 3], stride=2, scope='pool1')
+        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
+        # This is needed because the pre-activation variant does not have batch
+        # normalization or activation functions in the residual unit output. See
+        # Appendix of [2].
+        net = layers.batch_norm(
+            net, activation_fn=nn_ops.relu, scope='postnorm')
+        if global_pool:
+          # Global average pooling.
+          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
+        if num_classes is not None:
+          net = layers_lib.conv2d(
+              net,
+              num_classes, [1, 1],
+              activation_fn=None,
+              normalizer_fn=None,
+              scope='logits')
+        # Convert end_points_collection into a dictionary of end_points.
+        end_points = utils.convert_collection_to_dict(end_points_collection)
+        if num_classes is not None:
+          end_points['predictions'] = layers.softmax(net, scope='predictions')
+        return net, end_points
 resnet_v2.default_image_size = 224
 
 
@@ -257,6 +268,7 @@ def resnet_v2_block(scope, base_depth, num_units, stride):
 
 def resnet_v2_50(inputs,
                  num_classes=None,
+                 is_training=None,
                  global_pool=True,
                  output_stride=None,
                  reuse=None,
@@ -272,6 +284,7 @@ def resnet_v2_50(inputs,
       inputs,
       blocks,
       num_classes,
+      is_training,
       global_pool,
       output_stride,
       include_root_block=True,
@@ -282,6 +295,7 @@ def resnet_v2_50(inputs,
 def resnet_v2_101(inputs,
                   num_classes=None,
                   global_pool=True,
+                  is_training=None,
                   output_stride=None,
                   reuse=None,
                   scope='resnet_v2_101'):
@@ -296,6 +310,7 @@ def resnet_v2_101(inputs,
       inputs,
       blocks,
       num_classes,
+      is_training,
       global_pool,
       output_stride,
       include_root_block=True,
@@ -305,6 +320,7 @@ def resnet_v2_101(inputs,
 
 def resnet_v2_152(inputs,
                   num_classes=None,
+                  is_training=None,
                   global_pool=True,
                   output_stride=None,
                   reuse=None,
@@ -320,6 +336,7 @@ def resnet_v2_152(inputs,
       inputs,
       blocks,
       num_classes,
+      is_training,
       global_pool,
       output_stride,
       include_root_block=True,
@@ -329,6 +346,7 @@ def resnet_v2_152(inputs,
 
 def resnet_v2_200(inputs,
                   num_classes=None,
+                  is_training=None,
                   global_pool=True,
                   output_stride=None,
                   reuse=None,
@@ -344,6 +362,7 @@ def resnet_v2_200(inputs,
       inputs,
       blocks,
       num_classes,
+      is_training,
       global_pool,
       output_stride,
       include_root_block=True,
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
index 1c09bcbb5a885bcbbb8d4809e19a67db1c3caccd..c4f3b071fd940d2c3d7c80fa3041b0426e336ab0 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v2_test.py
@@ -223,28 +223,29 @@ class ResnetUtilsTest(test.TestCase):
     # Test both odd and even input dimensions.
     height = 30
     width = 31
-    with arg_scope(resnet_utils.resnet_arg_scope(is_training=False)):
-      for output_stride in [1, 2, 4, 8, None]:
-        with ops.Graph().as_default():
-          with self.test_session() as sess:
-            random_seed.set_random_seed(0)
-            inputs = create_test_input(1, height, width, 3)
-            # Dense feature extraction followed by subsampling.
-            output = resnet_utils.stack_blocks_dense(inputs, blocks,
-                                                     output_stride)
-            if output_stride is None:
-              factor = 1
-            else:
-              factor = nominal_stride // output_stride
-
-            output = resnet_utils.subsample(output, factor)
-            # Make the two networks use the same weights.
-            variable_scope.get_variable_scope().reuse_variables()
-            # Feature extraction at the nominal network rate.
-            expected = self._stack_blocks_nondense(inputs, blocks)
-            sess.run(variables.global_variables_initializer())
-            output, expected = sess.run([output, expected])
-            self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4)
+    with arg_scope(resnet_utils.resnet_arg_scope()):
+      with arg_scope([layers.batch_norm], is_training=False):
+        for output_stride in [1, 2, 4, 8, None]:
+          with ops.Graph().as_default():
+            with self.test_session() as sess:
+              random_seed.set_random_seed(0)
+              inputs = create_test_input(1, height, width, 3)
+              # Dense feature extraction followed by subsampling.
+              output = resnet_utils.stack_blocks_dense(inputs, blocks,
+                                                       output_stride)
+              if output_stride is None:
+                factor = 1
+              else:
+                factor = nominal_stride // output_stride
+
+              output = resnet_utils.subsample(output, factor)
+              # Make the two networks use the same weights.
+              variable_scope.get_variable_scope().reuse_variables()
+              # Feature extraction at the nominal network rate.
+              expected = self._stack_blocks_nondense(inputs, blocks)
+              sess.run(variables.global_variables_initializer())
+              output, expected = sess.run([output, expected])
+              self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4)
 
 
 class ResnetCompleteNetworkTest(test.TestCase):
@@ -253,6 +254,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
   def _resnet_small(self,
                     inputs,
                     num_classes=None,
+                    is_training=None,
                     global_pool=True,
                     output_stride=None,
                     include_root_block=True,
@@ -266,8 +268,9 @@ class ResnetCompleteNetworkTest(test.TestCase):
         block('block3', base_depth=4, num_units=3, stride=2),
         block('block4', base_depth=8, num_units=2, stride=1),
     ]
-    return resnet_v2.resnet_v2(inputs, blocks, num_classes, global_pool,
-                               output_stride, include_root_block, reuse, scope)
+    return resnet_v2.resnet_v2(inputs, blocks, num_classes, is_training,
+                               global_pool, output_stride, include_root_block,
+                               reuse, scope)
 
   def testClassificationEndPoints(self):
     global_pool = True
@@ -275,7 +278,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(2, 224, 224, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       logits, end_points = self._resnet_small(
-          inputs, num_classes, global_pool, scope='resnet')
+          inputs, num_classes, global_pool=global_pool, scope='resnet')
     self.assertTrue(logits.op.name.startswith('resnet/logits'))
     self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes])
     self.assertTrue('predictions' in end_points)
@@ -288,7 +291,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(2, 224, 224, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       _, end_points = self._resnet_small(
-          inputs, num_classes, global_pool, scope='resnet')
+          inputs, num_classes, global_pool=global_pool, scope='resnet')
       endpoint_to_shape = {
           'resnet/block1': [2, 28, 28, 4],
           'resnet/block2': [2, 14, 14, 8],
@@ -305,7 +308,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(2, 321, 321, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       _, end_points = self._resnet_small(
-          inputs, num_classes, global_pool, scope='resnet')
+          inputs, num_classes, global_pool=global_pool, scope='resnet')
       endpoint_to_shape = {
           'resnet/block1': [2, 41, 41, 4],
           'resnet/block2': [2, 21, 21, 8],
@@ -324,7 +327,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
       _, end_points = self._resnet_small(
           inputs,
           num_classes,
-          global_pool,
+          global_pool=global_pool,
           include_root_block=False,
           scope='resnet')
       endpoint_to_shape = {
@@ -346,7 +349,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
       _, end_points = self._resnet_small(
           inputs,
           num_classes,
-          global_pool,
+          global_pool=global_pool,
           output_stride=output_stride,
           scope='resnet')
       endpoint_to_shape = {
@@ -363,14 +366,18 @@ class ResnetCompleteNetworkTest(test.TestCase):
     """Verify dense feature extraction with atrous convolution."""
     nominal_stride = 32
     for output_stride in [4, 8, 16, 32, None]:
-      with arg_scope(resnet_utils.resnet_arg_scope(is_training=False)):
+      with arg_scope(resnet_utils.resnet_arg_scope()):
         with ops.Graph().as_default():
           with self.test_session() as sess:
             random_seed.set_random_seed(0)
             inputs = create_test_input(2, 81, 81, 3)
             # Dense feature extraction followed by subsampling.
             output, _ = self._resnet_small(
-                inputs, None, global_pool=False, output_stride=output_stride)
+                inputs,
+                None,
+                is_training=False,
+                global_pool=False,
+                output_stride=output_stride)
             if output_stride is None:
               factor = 1
             else:
@@ -379,7 +386,8 @@ class ResnetCompleteNetworkTest(test.TestCase):
             # Make the two networks use the same weights.
             variable_scope.get_variable_scope().reuse_variables()
             # Feature extraction at the nominal network rate.
-            expected, _ = self._resnet_small(inputs, None, global_pool=False)
+            expected, _ = self._resnet_small(
+                inputs, None, is_training=False, global_pool=False)
             sess.run(variables.global_variables_initializer())
             self.assertAllClose(
                 output.eval(), expected.eval(), atol=1e-4, rtol=1e-4)
@@ -392,7 +400,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(None, height, width, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       logits, _ = self._resnet_small(
-          inputs, num_classes, global_pool, scope='resnet')
+          inputs, num_classes, global_pool=global_pool, scope='resnet')
     self.assertTrue(logits.op.name.startswith('resnet/logits'))
     self.assertListEqual(logits.get_shape().as_list(),
                          [None, 1, 1, num_classes])
@@ -408,7 +416,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     global_pool = False
     inputs = create_test_input(batch, None, None, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
-      output, _ = self._resnet_small(inputs, None, global_pool)
+      output, _ = self._resnet_small(inputs, None, global_pool=global_pool)
     self.assertListEqual(output.get_shape().as_list(), [batch, None, None, 32])
     images = create_test_input(batch, height, width, 3)
     with self.test_session() as sess:
@@ -424,7 +432,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
     inputs = create_test_input(batch, None, None, 3)
     with arg_scope(resnet_utils.resnet_arg_scope()):
       output, _ = self._resnet_small(
-          inputs, None, global_pool, output_stride=output_stride)
+          inputs, None, global_pool=global_pool, output_stride=output_stride)
     self.assertListEqual(output.get_shape().as_list(), [batch, None, None, 32])
     images = create_test_input(batch, height, width, 3)
     with self.test_session() as sess:
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
index 89dbcd96f8640f470293c271250b0d44d2aabf7c..c8b4e472c99e0bf081a7222a7976b1fbbb680825 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
@@ -159,7 +159,7 @@ class SparsemaxLossTest(test.TestCase):
     self.assertShapeEqual(q, tf_sparsemax_op)
 
   def _test_gradient_against_estimate(self, dtype, random, use_gpu):
-    """check sparsemax-loss Rop, aginst estimated-loss Rop"""
+    """check sparsemax-loss Rop, against estimated-loss Rop"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
     q = np.zeros((test_obs, 10)).astype(dtype)
     q[np.arange(0, test_obs), np.random.randint(0, 10, size=test_obs)] = 1
@@ -178,7 +178,7 @@ class SparsemaxLossTest(test.TestCase):
     self.assertLess(err, 1e-4)
 
   def _test_gradient_against_numpy(self, dtype, random, use_gpu):
-    """check sparsemax-loss Rop, aginst numpy Rop"""
+    """check sparsemax-loss Rop, against numpy Rop"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10))
     q = np.zeros((test_obs, 10))
     q[np.arange(0, test_obs), np.random.randint(0, 10, size=test_obs)] = 1
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
index eafac1b9ae778ece44e36722cd85d28ed0b0c8d5..82d36ee9cb21fb822e6df0c3632c49a4fd616825 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
@@ -188,7 +188,7 @@ class SparsemaxTest(test.TestCase):
     self.assertShapeEqual(z, tf_sparsemax_op)
 
   def _test_gradient_against_estimate(self, dtype, random, use_gpu):
-    """check sparsemax Rop, aginst estimated Rop"""
+    """check sparsemax Rop, against estimated Rop"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
 
     logits = array_ops.placeholder(dtype, name='z')
@@ -204,7 +204,7 @@ class SparsemaxTest(test.TestCase):
     self.assertLess(err, 1e-4)
 
   def _test_gradient_against_numpy(self, dtype, random, use_gpu):
-    """check sparsemax Rop, aginst numpy Rop"""
+    """check sparsemax Rop, against numpy Rop"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10)).astype(dtype)
 
     logits = constant_op.constant(z, name='z')
diff --git a/tensorflow/contrib/specs/python/specs.py b/tensorflow/contrib/specs/python/specs.py
index a9fba442db5d560584a3ef5f1468ff63bc5e915e..d5223b9b551bc5cb1e007c97820b9425ea13334b 100644
--- a/tensorflow/contrib/specs/python/specs.py
+++ b/tensorflow/contrib/specs/python/specs.py
@@ -19,13 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
-import inspect
-
 from six import exec_
 from tensorflow.contrib.specs.python import params_ops
 from tensorflow.contrib.specs.python import specs_lib
 from tensorflow.contrib.specs.python import specs_ops
+from tensorflow.python.util import tf_inspect
 
 
 def eval_params(params, environment=None):
@@ -44,7 +42,8 @@ def eval_params(params, environment=None):
   """
   specs_lib.check_keywords(params)
   bindings = {}
-  if environment: bindings.update(environment)
+  if environment:
+    bindings.update(environment)
   exec_(params, vars(params_ops), bindings)  # pylint: disable=exec-used
   return bindings
 
@@ -71,7 +70,8 @@ def eval_spec(spec, environment=None):
   """
   specs_lib.check_keywords(spec)
   bindings = {}
-  if environment: bindings.update(environment)
+  if environment:
+    bindings.update(environment)
   exec_(spec, vars(specs_ops), bindings)  # pylint: disable=exec-used
   return bindings
 
@@ -141,7 +141,7 @@ class LocalImport(object):
     self.names = names
 
   def __enter__(self):
-    self.frame = inspect.currentframe()
+    self.frame = tf_inspect.currentframe()
     bindings = self.frame.f_back.f_globals
     self.old = {k: bindings.get(k, None) for k in self.names.keys()}
     bindings.update(self.names)
@@ -151,7 +151,9 @@ class LocalImport(object):
     bindings = self.frame.f_back.f_globals
     bindings.update(self.old)
     for k, v in self.old.items():
-      if v is None: del bindings[k]
+      if v is None:
+        del bindings[k]
     del self.frame
 
+
 ops = LocalImport(specs_ops)
diff --git a/tensorflow/tensorboard/components/vz_projector/BUILD b/tensorflow/contrib/staging/BUILD
similarity index 64%
rename from tensorflow/tensorboard/components/vz_projector/BUILD
rename to tensorflow/contrib/staging/BUILD
index 8c222be10e919a047216dc906aeec59a0ef2973a..8ffc96c34696abd5a0f0c28a5071e282700511da 100644
--- a/tensorflow/tensorboard/components/vz_projector/BUILD
+++ b/tensorflow/contrib/staging/BUILD
@@ -1,6 +1,6 @@
-# Description:
-# Package for the Embedding Projector component.
-package(default_visibility = ["//tensorflow:internal"])
+package(default_visibility = [
+    "//visibility:public",
+])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -17,3 +17,9 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+py_library(
+    name = "staging",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+)
diff --git a/tensorflow/contrib/staging/__init__.py b/tensorflow/contrib/staging/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e58ac31918ae333120d210104948f1f781dbc580
--- /dev/null
+++ b/tensorflow/contrib/staging/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""contrib module containing StagingArea."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.data_flow_ops import StagingArea
diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1d9c1ffa50d767fa5bd1235fe1cc453681634f96
--- /dev/null
+++ b/tensorflow/contrib/stateless/BUILD
@@ -0,0 +1,50 @@
+# Stateless random ops
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+tf_gen_op_wrapper_py(
+    name = "stateless_random_ops",
+    out = "gen_stateless_random_ops.py",  # cmake chokes without this
+    deps = ["//tensorflow/core:stateless_random_ops_op_lib"],
+)
+
+py_library(
+    name = "stateless",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":stateless_random_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "stateless_random_ops_test",
+    srcs = ["python/kernel_tests/stateless_random_ops_test.py"],
+    additional_deps = [
+        ":stateless",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..82e5d36ce44cbc9dc1133867f0396f6c2f0f9855
--- /dev/null
+++ b/tensorflow/contrib/stateless/__init__.py
@@ -0,0 +1,38 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stateless random ops which take seed as a tensor input.
+
+Instead of taking `seed` as an attr which initializes a mutable state within
+the op, these random ops take `seed` as an input, and the random numbers are
+a deterministic function of `shape` and `seed`.
+
+WARNING: These ops are in contrib, and are not stable.  They should be
+consistent across multiple runs on the same hardware, but only for the same
+version of the code.
+
+@@stateless_random_uniform
+@@stateless_random_normal
+@@stateless_truncated_normal
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.contrib.stateless.gen_stateless_random_ops import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a36bdc2f9558220fa6cc47d5bb95d6e49a480f7
--- /dev/null
+++ b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
@@ -0,0 +1,84 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateless random ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.contrib import stateless
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+CASES = [(stateless.stateless_random_uniform, random_ops.random_uniform),
+         (stateless.stateless_random_normal, random_ops.random_normal),
+         (stateless.stateless_truncated_normal, random_ops.truncated_normal)]
+
+
+def invert_philox(key, value):
+  """Invert the Philox bijection."""
+  key = np.array(key, dtype=np.uint32)
+  value = np.array(value, dtype=np.uint32)
+  step = np.array([0x9E3779B9, 0xBB67AE85], dtype=np.uint32)
+  for n in range(10)[::-1]:
+    key0, key1 = key + n * step
+    v0 = value[3] * 0x991a7cdb & 0xffffffff
+    v2 = value[1] * 0x6d7cae67 & 0xffffffff
+    hi0 = v0 * 0xD2511F53 >> 32
+    hi1 = v2 * 0xCD9E8D57 >> 32
+    v1 = hi1 ^ value[0] ^ key0
+    v3 = hi0 ^ value[2] ^ key1
+    value = v0, v1, v2, v3
+  return np.array(value)
+
+
+class StatelessOpsTest(test.TestCase):
+
+  def testMatchStateful(self):
+    # Stateless ops should be the same as stateful ops on the first call
+    # after seed scrambling.
+    key = 0x3ec8f720, 0x02461e29
+    for seed in (7, 17), (11, 5), (2, 3):
+      preseed = invert_philox(key, (seed[0], 0, seed[1], 0)).astype(np.uint64)
+      preseed = preseed[::2] | preseed[1::2] << 32
+      random_seed.set_random_seed(seed[0])
+      with self.test_session(use_gpu=True):
+        for stateless_op, stateful_op in CASES:
+          for shape in (), (3,), (2, 5):
+            stateful = stateful_op(shape, seed=seed[1])
+            pure = stateless_op(shape, seed=preseed)
+            self.assertAllEqual(stateful.eval(), pure.eval())
+
+  def testDeterminism(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    with self.test_session(use_gpu=True):
+      seed_t = array_ops.placeholder(dtypes.int64, shape=[2])
+      seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+      for stateless_op, _ in CASES:
+        for shape in (), (3,), (2, 5):
+          pure = stateless_op(shape, seed=seed_t)
+          values = [(seed, pure.eval(feed_dict={seed_t: seed}))
+                    for seed in seeds]
+          for s0, v0 in values:
+            for s1, v1 in values:
+              self.assertEqual(s0 == s1, np.all(v0 == v1))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index c99f9b7c12923a3d034cb013af2c11a1375012c4..17269863542a38724d6fc9d7f9958aa563370ea9 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -27,8 +27,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 
-INFERENCE_PROB_NAME = prediction_key.PredictionKey.CLASSES
-INFERENCE_PRED_NAME = prediction_key.PredictionKey.PROBABILITIES
+INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES
+INFERENCE_PRED_NAME = prediction_key.PredictionKey.CLASSES
 
 FEATURE_IMPORTANCE_NAME = 'global_feature_importance'
 
diff --git a/tensorflow/contrib/tensor_forest/client/random_forest.py b/tensorflow/contrib/tensor_forest/client/random_forest.py
index 6cbe7b6d49a3c7784364ccf5949d2f6961ec8d7a..0da1f78755456e01397e6fdaca46d9ecf43f1eed 100644
--- a/tensorflow/contrib/tensor_forest/client/random_forest.py
+++ b/tensorflow/contrib/tensor_forest/client/random_forest.py
@@ -19,8 +19,10 @@ from __future__ import print_function
 
 from tensorflow.contrib import framework as contrib_framework
 
+from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
+from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 
 from tensorflow.contrib.tensor_forest.client import eval_metrics
 from tensorflow.contrib.tensor_forest.python import tensor_forest
@@ -31,6 +33,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import monitored_session
 from tensorflow.python.training import session_run_hook
 
 
@@ -77,7 +81,7 @@ class TensorForestLossHook(session_run_hook.SessionRunHook):
     current_loss = run_values.results['current_loss']
     current_step = run_values.results['global_step']
     self.steps += 1
-    # Gaurd against the global step going backwards, which might happen
+    # Guard against the global step going backwards, which might happen
     # if we recover from something.
     if self.last_step == -1 or self.last_step > current_step:
       logging.info('TensorForestLossHook resetting last_step.')
@@ -95,14 +99,32 @@ class TensorForestLossHook(session_run_hook.SessionRunHook):
       run_context.request_stop()
 
 
+class EveryCheckpointPreSaveListener(
+    basic_session_run_hooks.CheckpointSaverListener):
+  """Runs a given op before each checkpoint save."""
+
+  def __init__(self, op):
+    """Initializes the object.
+
+    Args:
+      op: An op to run before each checkpoint save.
+    """
+    self._op = op
+
+  def before_save(self, session, global_step_value):
+    session.run(self._op)
+
+
 def get_model_fn(params,
                  graph_builder_class,
                  device_assigner,
                  weights_name=None,
+                 keys_name=None,
                  early_stopping_rounds=100,
                  num_trainers=1,
                  trainer_id=0,
                  report_feature_importances=False,
+                 model_dir=None,
                  local_eval=False):
   """Return a model function given a way to construct a graph builder."""
   def _model_fn(features, labels, mode):
@@ -111,6 +133,10 @@ def get_model_fn(params,
     if weights_name and weights_name in features:
       weights = features.pop(weights_name)
 
+    keys = None
+    if keys_name and keys_name in features:
+      keys = features.pop(keys_name)
+
     # If we're doing eval, optionally ignore device_assigner.
     # Also ignore device assigner if we're exporting (mode == INFER)
     dev_assn = device_assigner
@@ -121,23 +147,42 @@ def get_model_fn(params,
     graph_builder = graph_builder_class(params,
                                         device_assigner=dev_assn)
     inference = {}
+    output_alternatives = None
     if (mode == model_fn_lib.ModeKeys.EVAL or
         mode == model_fn_lib.ModeKeys.INFER):
       inference[eval_metrics.INFERENCE_PROB_NAME] = (
           graph_builder.inference_graph(features))
 
-      if not params.regression:
+      if params.regression:
+        predictions = {
+            None: inference[eval_metrics.INFERENCE_PROB_NAME]}
+        output_alternatives = {
+            None: (constants.ProblemType.LINEAR_REGRESSION, predictions)}
+      else:
         inference[eval_metrics.INFERENCE_PRED_NAME] = math_ops.argmax(
             inference[eval_metrics.INFERENCE_PROB_NAME], 1)
 
+        predictions = {
+            prediction_key.PredictionKey.PROBABILITIES:
+                inference[eval_metrics.INFERENCE_PROB_NAME],
+            prediction_key.PredictionKey.CLASSES:
+                inference[eval_metrics.INFERENCE_PRED_NAME]}
+        output_alternatives = {
+            None: (constants.ProblemType.CLASSIFICATION, predictions)}
+
       if report_feature_importances:
         inference[eval_metrics.FEATURE_IMPORTANCE_NAME] = (
             graph_builder.feature_importances())
 
+      if keys is not None:
+        inference[keys_name] = keys
+
     # labels might be None if we're doing prediction (which brings up the
     # question of why we force everything to adhere to a single model_fn).
     loss_deps = []
     training_graph = None
+    training_hooks = []
+    scaffold = None
     if labels is not None and mode == model_fn_lib.ModeKeys.TRAIN:
       training_graph = control_flow_ops.group(
           graph_builder.training_graph(
@@ -146,6 +191,15 @@ def get_model_fn(params,
               trainer_id=trainer_id),
           state_ops.assign_add(contrib_framework.get_global_step(), 1))
       loss_deps.append(training_graph)
+      if hasattr(graph_builder, 'finalize_training'):
+        finalize_listener = EveryCheckpointPreSaveListener(
+            graph_builder.finalize_training())
+        scaffold = monitored_session.Scaffold()
+        training_hooks.append(
+            basic_session_run_hooks.CheckpointSaverHook(
+                model_dir, save_secs=600, save_steps=None,
+                scaffold=scaffold,
+                listeners=[finalize_listener]))
 
     training_loss = None
     if (mode == model_fn_lib.ModeKeys.EVAL or
@@ -158,7 +212,6 @@ def get_model_fn(params,
     if weights is not None:
       features[weights_name] = weights
 
-    training_hooks = []
     if early_stopping_rounds:
       training_hooks.append(TensorForestLossHook(early_stopping_rounds))
 
@@ -167,7 +220,10 @@ def get_model_fn(params,
         predictions=inference,
         loss=training_loss,
         train_op=training_graph,
-        training_hooks=training_hooks)
+        training_hooks=training_hooks,
+        scaffold=scaffold,
+        output_alternatives=output_alternatives)
+
   return _model_fn
 
 
@@ -205,7 +261,7 @@ class TensorForestEstimator(estimator.Estimator):
 
   def __init__(self, params, device_assigner=None, model_dir=None,
                graph_builder_class=tensor_forest.RandomForestGraphs,
-               config=None, weights_name=None,
+               config=None, weights_name=None, keys_name=None,
                feature_engineering_fn=None,
                early_stopping_rounds=100,
                num_trainers=1, trainer_id=0,
@@ -229,6 +285,9 @@ class TensorForestEstimator(estimator.Estimator):
       weights_name: A string defining feature column name representing
         weights. Will be multiplied by the loss of the example. Used to
         downweight or boost examples during training.
+      keys_name: A string naming one of the features to strip out and
+        pass through into the inference/eval results dict.  Useful for
+        associating specific examples with their prediction.
       feature_engineering_fn: Feature engineering function. Takes features and
         labels which are the output of `input_fn` and returns features and
         labels which will be fed into the model.
@@ -253,10 +312,12 @@ class TensorForestEstimator(estimator.Estimator):
             graph_builder_class,
             device_assigner,
             weights_name=weights_name,
+            keys_name=keys_name,
             early_stopping_rounds=early_stopping_rounds,
             num_trainers=num_trainers,
             trainer_id=trainer_id,
             report_feature_importances=report_feature_importances,
+            model_dir=model_dir,
             local_eval=local_eval),
         model_dir=model_dir,
         config=config,
diff --git a/tensorflow/contrib/tensor_forest/kernels/sample_inputs_op.cc b/tensorflow/contrib/tensor_forest/kernels/sample_inputs_op.cc
index e5594f89878b2f2a5380796611e785a4a90d5721..41f99b09024428461073b1562d8bf3a58bc66232 100644
--- a/tensorflow/contrib/tensor_forest/kernels/sample_inputs_op.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/sample_inputs_op.cc
@@ -64,64 +64,6 @@ class SampleInputs : public OpKernel {
     input_spec_.ParseFromString(serialized_proto);
   }
 
-  // Returns the number of sparse values for example input_index.
-  // Also returns the index where those features start in sparse_input_start
-  // if any were found.
-  int32 GetNumSparseFeatures(const Tensor& sparse_input_indices,
-                             int32 input_index, int64* sparse_input_start) {
-    // Binary search for input_index.
-    // TODO(gilberth): Consider using std::lower_bound, std::upper_bound
-    // for a simpler but possibly slower solution, or searching for
-    // input_start and input_end simultaneously.
-    const auto indices = sparse_input_indices.matrix<int64>();
-    const int64 num_total = sparse_input_indices.shape().dim_size(0);
-    int64 index;
-    int64 low = 0;
-    int64 high = num_total;
-
-    while (true) {
-      if (low == high) {
-        return 0;
-      }
-      index = low + (high - low) / 2;
-      const int64 feature_index = indices(index, 0);
-      if (feature_index == input_index) {
-        // found it.
-        break;
-      } else if (feature_index < input_index) {
-        // Correct for the implicit floor in the index assignment.
-        if (low == index) {
-          return 0;
-        }
-        low = index;
-      } else {
-        high = index;
-      }
-    }
-
-    // Scan for the start and end of the input_index range.
-    int64 input_start = index;
-    int64 val = indices(input_start, 0);
-    while (val == input_index) {
-      --input_start;
-      if (input_start < 0) {
-        break;
-      }
-      val = indices(input_start, 0);
-    }
-    *sparse_input_start = input_start + 1;
-    int32 input_end = index;
-    val = indices(input_end, 0);
-    while (val == input_index) {
-      ++input_end;
-      if (input_end >= num_total) {
-        break;
-      }
-      val = indices(input_end, 0);
-    }
-    return input_end - input_start - 1;
-  }
-
   // increment_input implements a "++" operation for the situation when
   // you want to do something n times on an underlying iterator.
   // In an ideal world, this would be a built-in iterator adaptor.
@@ -333,8 +275,8 @@ class SampleInputs : public OpKernel {
           int64 sparse_input_start;
           int32 num_total_features = input_spec_.dense_features_size();
           if (sparse_input) {
-            num_total_features += GetNumSparseFeatures(
-                sparse_input_indices, *it, &sparse_input_start);
+            num_total_features += tensorforest::GetNumSparseFeatures(
+                sparse_input_indices.matrix<int64>(), *it, &sparse_input_start);
           }
           if (num_total_features == 0) {
             LOG(WARNING) << "num total features is zero.";
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
index 3692b89f79d37114f1192167bc0675110c44d96e..fde0e87c9e3e0a4a87760d8b7034dd4ef4564d98 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.cc
@@ -43,8 +43,8 @@ DataColumnTypes FindSparseFeatureSpec(
   return static_cast<DataColumnTypes>(spec.sparse(column_num).original_type());
 }
 
-void GetTwoBest(int max, std::function<float(int)> score_fn, float* best_score,
-                int* best_index, float* second_best_score,
+void GetTwoBest(int max, const std::function<float(int)>& score_fn,
+                float* best_score, int* best_index, float* second_best_score,
                 int* second_best_index) {
   *best_index = -1;
   *second_best_index = -1;
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index 8c4117c189994ea0cbcf4f9b74930702247fc15a..c75f36fdee81a707c21df40927fa3c457b24737a 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -197,6 +197,66 @@ float FindSparseValue(
   return 0.0;
 }
 
+// Returns the number of sparse values for example input_index.
+// Also returns the index where those features start in sparse_input_start
+// if any were found.
+// Assumes that the first column in indices is ordered.
+template <typename T1>
+int32 GetNumSparseFeatures(const T1& indices, int32 input_index,
+                           int64* sparse_input_start) {
+  // Binary search for input_index.
+  // TODO(gilberth): Consider using std::lower_bound, std::upper_bound
+  // for a simpler but possibly slower solution, or searching for
+  // input_start and input_end simultaneously.
+  const int64 num_total = indices.dimension(0);
+  int64 index;
+  int64 low = 0;
+  int64 high = num_total;
+  *sparse_input_start = -1;  // Easy error checking.
+
+  while (true) {
+    if (low == high) {
+      return 0;
+    }
+    index = low + (high - low) / 2;
+    const int64 feature_index = indices(index, 0);
+    if (feature_index == input_index) {
+      // found it.
+      break;
+    } else if (feature_index < input_index) {
+      // Correct for the implicit floor in the index assignment.
+      if (low == index) {
+        return 0;
+      }
+      low = index;
+    } else {
+      high = index;
+    }
+  }
+
+  // Scan for the start and end of the input_index range.
+  int64 input_start = index;
+  int64 val = indices(input_start, 0);
+  while (val == input_index) {
+    --input_start;
+    if (input_start < 0) {
+      break;
+    }
+    val = indices(input_start, 0);
+  }
+  *sparse_input_start = input_start + 1;
+  int32 input_end = index;
+  val = indices(input_end, 0);
+  while (val == input_index) {
+    ++input_end;
+    if (input_end >= num_total) {
+      break;
+    }
+    val = indices(input_end, 0);
+  }
+  return input_end - input_start - 1;
+}
+
 // Returns left/right decision between the input value and the threshold bias.
 // For floating point types, the decision is value > bias, but for
 // categorical data, it is value != bias.
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index 06f8c9e18f7022d11c0b0ec774937f45a278304d..db6b3131383bfa23b0ff3990632c084cfb9abd8b 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -43,9 +43,9 @@ py_library(
     srcs = ["plugins/projector/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":protos_all_py",
         "//tensorflow/python:lib",
         "//tensorflow/tensorboard/plugins/projector:projector_plugin",
+        "//tensorflow/tensorboard/plugins/projector:protos_all_py",
     ],
 )
 
@@ -56,10 +56,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":projector",
-        ":protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
+        "//tensorflow/tensorboard/plugins/projector:protos_all_py",
     ],
 )
 
diff --git a/tensorflow/contrib/tensorboard/plugins/projector/__init__.py b/tensorflow/contrib/tensorboard/plugins/projector/__init__.py
index c11f5d065c29d51158d78328e7f1a457c5cef4e7..0c630de7f53a9cab83c90b48233ac368a037f8e1 100644
--- a/tensorflow/contrib/tensorboard/plugins/projector/__init__.py
+++ b/tensorflow/contrib/tensorboard/plugins/projector/__init__.py
@@ -28,11 +28,10 @@ from __future__ import print_function
 import os
 
 from google.protobuf import text_format
-from tensorflow.contrib.tensorboard.plugins.projector.projector_config_pb2 import EmbeddingInfo
-from tensorflow.contrib.tensorboard.plugins.projector.projector_config_pb2 import ProjectorConfig
 from tensorflow.python.lib.io import file_io
 from tensorflow.tensorboard.plugins.projector import projector_plugin
 # pylint: disable=wildcard-import
+from tensorflow.tensorboard.plugins.projector.projector_config_pb2 import *
 from tensorflow.tensorboard.plugins.projector.projector_plugin import *
 # pylint: enable=wildcard-import
 
@@ -41,7 +40,7 @@ def visualize_embeddings(summary_writer, config):
   """Stores a config file used by the embedding projector.
 
   Args:
-    summary_writer: The summary writer used for writting events.
+    summary_writer: The summary writer used for writing events.
     config: `tf.contrib.tensorboard.plugins.projector.ProjectorConfig`
       proto that holds the configuration for the projector such as paths to
       checkpoint files and metadata files for the embeddings. If
diff --git a/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py b/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
index 96e084fa73639f7ab7507fd55920ca461b3be7cd..5f86f57a1c6213f4fb1e15bb2a37d33a7b21b564 100644
--- a/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
+++ b/tensorflow/contrib/tensorboard/plugins/projector/projector_api_test.py
@@ -24,10 +24,10 @@ import shutil
 from google.protobuf import text_format
 
 from tensorflow.contrib.tensorboard.plugins import projector
-from tensorflow.contrib.tensorboard.plugins.projector import projector_config_pb2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer as writer_lib
+from tensorflow.tensorboard.plugins.projector import projector_config_pb2
 
 
 class ProjectorApiTest(test.TestCase):
@@ -46,7 +46,7 @@ class ProjectorApiTest(test.TestCase):
     writer = writer_lib.FileWriter(temp_dir)
     projector.visualize_embeddings(writer, config)
 
-    # Read the configuratin from disk and make sure it matches the original.
+    # Read the configurations from disk and make sure it matches the original.
     with gfile.GFile(os.path.join(temp_dir, 'projector_config.pbtxt')) as f:
       config2 = projector_config_pb2.ProjectorConfig()
       text_format.Parse(f.read(), config2)
diff --git a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
index 02f2d7fae8612b237fbaf86da0282e565261446d..f2065c666255984c8ab770fc10f682b1eabad095 100644
--- a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
+++ b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
@@ -127,3 +127,6 @@ class FakeSummaryWriter(object):
 
   def reopen(self):
     pass
+
+  def close(self):
+    pass
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index c7ff4a2921eb7ca834d9df8f695b3f7b6f0a69ba..5bfa0247a51c22ea8387f62d416fbf76ea4d38fb 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -2,75 +2,25 @@
 
 # Full Docment in tensorflow/tools/tfprof/README.md
 
-Author: Xin Pan (xpan@google.com, github: panyx0718)
+Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
 
 Consultants: Jon Shlens, Pete Warden
 
 ###Major Features
 
 1.  Measure model parameters, float operations, tensor shapes.
-2.  Measure op execution times, requested memory size and device placement.
+2.  Profile op execution times, requested memory size and device placement.
 3.  Inspect checkpoint tensors' shapes and their values.
-4.  Explore model based on name scope or graph structure.
-5.  Selectively grouping/filtering/accounting/ordering ops.
+4.  Selectively group, filter, account and order ops.
 
-tfprof can be used as Python API, Interactive CLI and One-shot Script.
+####tfprof supports 3 views to organize TensorFlow model profiles
 
-## Python API Tutorials
+    *  code view: Stats are associated your Python codes and organized as call stacks.
+    *  scope view: Stats are organized as name scope hierarchies.
+    *  graph view: Stats are organized as Tensorflow Op graph.
 
-tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
+####For each view, there are 3 ways to display outputs:
 
-### Examine the shapes and sizes of all trainiable Variables.
-```python
-# Print trainable variable parameter statistics to stdout.
-param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.
-        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
-
-# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
-# of each graph node in tree scructure. Let's print the root below.
-sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
-```
-
-### Examine the number of floating point operations
-``` python
-# Print to stdout an analysis of the number of floating point operations in the
-# model broken down by individual operations.
-#
-# Note: Only Ops with RegisterStatistics('flops') defined have flop stats. It
-# also requires complete shape information. It is common that shape is unknown
-# statically. To complete the shape, provide run-time shape information with
-# tf.RunMetadata to the API (See next example on how to provide RunMetadata).
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
-```
-
-### Examine the timing and memory usage
-You will first need to run the following set up in your model in order to
-compute the memory and timing statistics.
-
-```python
-# Generate the meta information for the model that contains the memory usage
-# and timing information.
-run_metadata = tf.RunMetadata()
-with tf.Session() as sess:
-  _ = sess.run(train_op,
-               options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
-               run_metadata=run_metadata)
-```
-
-Finally, you may run `print_model_analysis` to explore the timing and memory
-demands of the model.
-
-``` python
-# Print to stdout an analysis of the memory usage and the timing information
-# from running the graph broken down by operations.
-tf.contrib.tfprof.model_analyzer.print_model_analysis(
-    tf.get_default_graph(),
-    run_meta=run_metadata,
-    tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
-```
-
-Users can change ```tfprof_options``` to fully leverage tfprof's power.
+    *  stdout: Results are written to stdout.
+    *  timeline: Visualized in chrome browser as time series.
+    *  file: Results are dumped to file.
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
index 9c3b10b22c554e76b856d32cd72bbf4681542227..c96f6719e7ed4db2cf24d2600bf2134a6529bcd2 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/BUILD
@@ -21,16 +21,34 @@ py_test(
     name = "model_analyzer_test",
     srcs = ["model_analyzer_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
         ":model_analyzer",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
+        ":model_analyzer_testlib",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_library(
+    name = "model_analyzer_testlib",
+    srcs = ["model_analyzer_testlib.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":model_analyzer",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
@@ -97,6 +115,29 @@ py_test(
     ],
 )
 
+py_library(
+    name = "pprof_profiler",
+    srcs = ["pprof_profiler.py"],
+    srcs_version = "PY2AND3",
+    deps = ["@com_google_pprof//:pprof_proto_py"],
+)
+
+py_test(
+    name = "pprof_profiler_test",
+    srcs = ["pprof_profiler_test.py"],
+    main = "pprof_profiler_test.py",
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],  # TODO(annarev): get it working with pip.
+    deps = [
+        ":pprof_profiler",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@com_google_pprof//:pprof_proto_py",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
index cc94fd65b53a73113b389b5fea75ade90f00d368..17dff69edd633482325171898a016710b58d8731 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer.py
@@ -45,7 +45,7 @@ TRAINABLE_VARS_PARAMS_STAT_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['params'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -65,7 +65,7 @@ FLOAT_OPS_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['float_ops'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -87,7 +87,7 @@ PRINT_PARAMS_ON_DEVICE = {
     'hide_name_regexes': [],
     'account_displayed_op_only': False,
     'select': ['device', 'params'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -107,7 +107,7 @@ PRINT_ALL_TIMING_MEMORY = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['micros', 'bytes'],
-    'viz': False,
+    'output': 'stdout',
     'dump_to_file': ''
 }
 
@@ -123,7 +123,7 @@ def print_model_analysis(graph,
   """Print model statistics.
 
     Prints the model statistics to stdout. Also returns the results
-    in a TFProfNode proto. See go/tfprof or run tfprof tool:
+    in a TFGraphNodeProto proto. See go/tfprof or run tfprof tool:
     'bazel run third_party/tensorflow/tools/tfprof help'
 
     Examples:
@@ -142,15 +142,19 @@ def print_model_analysis(graph,
               'micros' and 'bytes'.
     op_log: tensorflow::tfprof::OpLog proto. users can use this proto to
             group together ops and use a op_type to select the group.
-    tfprof_cmd: string. Either 'scope' or 'graph'. 'scope' view organize
-                ops using their name scopes. 'graph' view organize ops using
-                their graph inputs.
+    tfprof_cmd: string. Either 'scope', 'graph', 'code'.
+                'scope' view organize outputs using ops' name scope.
+                'graph' view organize outputs using op's inputs/outputs.
+                'code' view organize outputs using Python call stack.
     tfprof_options: See 'tfprof help' for details.
   Returns:
-    TFProfNode proto. Side effect: a formatted output to stdout.
+    If tfprof_cmd is 'scope' or 'graph', returns TFGraphNodeProto proto.
+    If tfprof_cmd is 'code', returns TFCodeNodeProto proto.
+    Side effect: a formatted output to stdout.
   """
   # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(graph, op_log, run_meta)
+  op_log = tfprof_logger._merge_default_with_oplog(
+      graph, op_log, run_meta, add_trace=tfprof_cmd == 'code')
   # pylint: enable=protected-access
   opts = tfprof_options_pb2.OptionsProto()
   opts.max_depth = tfprof_options['max_depth']
@@ -174,15 +178,28 @@ def print_model_analysis(graph,
   opts.account_displayed_op_only = tfprof_options['account_displayed_op_only']
   for p in tfprof_options['select']:
     opts.select.append(p)
-  opts.viz = tfprof_options['viz']
+  opts.output = tfprof_options['output']
   opts.dump_to_file = tfprof_options['dump_to_file']
 
   run_meta_str = run_meta.SerializeToString() if run_meta else b''
-  op_log_str = op_log.SerializeToString() if op_log else b''
 
-  tfprof_node = tfprof_output_pb2.TFProfNode()
-  tfprof_node.ParseFromString(
-      print_mdl.PrintModelAnalysis(
-          graph.as_graph_def().SerializeToString(), run_meta_str, op_log_str,
-          tfprof_cmd.encode('utf-8'), opts.SerializeToString()))
+  if tfprof_cmd == 'code':
+    tfprof_node = tfprof_output_pb2.TFCodeNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.PrintModelAnalysis(
+            graph.as_graph_def().SerializeToString(),
+            run_meta_str,
+            op_log.SerializeToString(),
+            tfprof_cmd.encode('utf-8'),
+            opts.SerializeToString()))
+  else:
+    tfprof_node = tfprof_output_pb2.TFGraphNodeProto()
+    tfprof_node.ParseFromString(
+        print_mdl.PrintModelAnalysis(
+            graph.as_graph_def().SerializeToString(),
+            run_meta_str,
+            op_log.SerializeToString(),
+            tfprof_cmd.encode('utf-8'),
+            opts.SerializeToString()))
+
   return tfprof_node
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
index 66b9267cbec03568e581d1f846bc6a3f8e4ae2fb..afd8563e78d434710df85176c73c2bb938963669 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_test.py
@@ -18,52 +18,31 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 # XXX: this depends on pywrap_tensorflow and must come later
 from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer
+from tensorflow.contrib.tfprof.python.tools.tfprof import model_analyzer_testlib as lib
 
 
 class PrintModelAnalysisTest(test.TestCase):
 
-  def _BuildSmallModel(self):
-    image = array_ops.zeros([2, 6, 6, 3])
-    _ = variable_scope.get_variable(
-        'ScalarW', [],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    kernel = variable_scope.get_variable(
-        'DW', [3, 3, 3, 6],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
-    kernel = variable_scope.get_variable(
-        'DW2', [2, 2, 6, 12],
-        dtypes.float32,
-        initializer=init_ops.random_normal_initializer(stddev=0.001))
-    x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
-    return x
-
   def testDumpToFile(self):
+    ops.reset_default_graph()
     opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
 
     with session.Session() as sess, ops.device('/cpu:0'):
-      _ = self._BuildSmallModel()
+      _ = lib.BuildSmallModel()
       model_analyzer.print_model_analysis(sess.graph, tfprof_options=opts)
 
-      with gfile.Open(opts['dump_to_file'], 'r') as f:
+      with gfile.Open(outfile, 'r') as f:
         self.assertEqual(u'_TFProfRoot (--/451 params)\n'
                          '  DW (3x3x3x6, 162/162 params)\n'
                          '  DW2 (2x2x6x12, 288/288 params)\n'
@@ -71,15 +50,17 @@ class PrintModelAnalysisTest(test.TestCase):
                          f.read())
 
   def testSelectEverything(self):
+    ops.reset_default_graph()
     opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
-    opts['dump_to_file'] = os.path.join(test.get_temp_dir(), 'dump')
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
     opts['account_type_regexes'] = ['.*']
     opts['select'] = [
         'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device', 'op_types'
     ]
 
     with session.Session() as sess, ops.device('/cpu:0'):
-      x = self._BuildSmallModel()
+      x = lib.BuildSmallModel()
 
       sess.run(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
@@ -91,13 +72,121 @@ class PrintModelAnalysisTest(test.TestCase):
       model_analyzer.print_model_analysis(
           sess.graph, run_meta, tfprof_options=opts)
 
-      with gfile.Open(opts['dump_to_file'], 'r') as f:
+      with gfile.Open(outfile, 'r') as f:
+        # pylint: disable=line-too-long
+        self.assertEqual(
+            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB, _kTFScopeParent)\n  Conv2D (0/0 params, 5.83k/5.83k flops, 432B/432B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, 384B/384B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 648B/1.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    DW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW/read (0/0 params, 0/0 flops, 648B/648B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/2.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW2/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    DW2/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW2/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW2/read (0/0 params, 0/0 flops, 1.15KB/1.15KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, VariableV2|_trainable_variables)\n    ScalarW/Assign (0/0 params, 0/0 flops, 0B/0B, Assign)\n    ScalarW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      ScalarW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    ScalarW/read (0/0 params, 0/0 flops, 0B/0B, Identity)\n  init (0/0 params, 0/0 flops, 0B/0B, NoOp)\n  zeros (0/0 params, 0/0 flops, 864B/864B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const)\n',
+            f.read())
+        # pylint: enable=line-too-long
+
+  def testSimpleCodeView(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
+    opts['account_type_regexes'] = ['.*']
+    opts['show_name_regexes'] = ['.*model_analyzer_testlib.*']
+    opts['account_displayed_op_only'] = False
+    # TODO(xpan): Test 'micros'. Since the execution time changes each run,
+    # it's a bit difficult to test it now.
+    opts['select'] = [
+        'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device',
+    ]
+
+    with session.Session() as sess, ops.device('/cpu:0'):
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      with gfile.Open(outfile, 'r') as f:
         # pylint: disable=line-too-long
         self.assertEqual(
-            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB, _kTFScopeParent)\n  Conv2D (0/0 params, 5.83k/5.83k flops, 432B/432B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  Conv2D_1 (0/0 params, 4.61k/4.61k flops, 384B/384B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Conv2D)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 648B/1.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    DW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW/read (0/0 params, 0/0 flops, 648B/648B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.15KB/2.30KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|VariableV2|_trainable_variables)\n    DW2/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    DW2/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      DW2/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        DW2/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        DW2/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        DW2/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        DW2/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    DW2/read (0/0 params, 0/0 flops, 1.15KB/1.15KB, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Identity)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|VariableV2|_trainable_variables)\n    ScalarW/Assign (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Assign)\n    ScalarW/Initializer (0/0 params, 0/0 flops, 0B/0B, _kTFScopeParent)\n      ScalarW/Initializer/random_normal (0/0 params, 0/0 flops, 0B/0B, Add)\n        ScalarW/Initializer/random_normal/RandomStandardNormal (0/0 params, 0/0 flops, 0B/0B, RandomStandardNormal)\n        ScalarW/Initializer/random_normal/mean (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/mul (0/0 params, 0/0 flops, 0B/0B, Mul)\n        ScalarW/Initializer/random_normal/shape (0/0 params, 0/0 flops, 0B/0B, Const)\n        ScalarW/Initializer/random_normal/stddev (0/0 params, 0/0 flops, 0B/0B, Const)\n    ScalarW/read (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|Identity)\n  init (0/0 params, 0/0 flops, 0B/0B, /device:CPU:0, /device:CPU:0|NoOp)\n  zeros (0/0 params, 0/0 flops, 864B/864B, /job:localhost/replica:0/task:0/cpu:0, /job:localhost/replica:0/task:0/cpu:0|Const)\n',
+            '_TFProfRoot (0/451 params, 0/10.44k flops, 0B/5.28KB)\n  model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops, 0B/864B)\n  model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/1 params, 0/0 flops, 0B/0B)\n  model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/162 params, 0/0 flops, 0B/1.30KB)\n  model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/5.83k flops, 0B/432B)\n  model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/288 params, 0/0 flops, 0B/2.30KB)\n  model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/4.61k flops, 0B/384B)\n',
             f.read())
         # pylint: enable=line-too-long
 
+  def testComplexCodeView(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts['output'] = 'file:outfile=' + outfile
+    opts['account_type_regexes'] = ['.*']
+    opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*']
+    opts['account_displayed_op_only'] = False
+    opts['select'] = ['params', 'float_ops']
+
+    with session.Session() as sess, ops.device('/cpu:0'):
+      x = lib.BuildFullModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      # pylint: disable=line-too-long
+      with gfile.Open(outfile, 'r') as f:
+        self.assertEqual(
+            '_TFProfRoot (0/2.84k params, 0/54.08k flops)\n  model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_... (0/1.80k params, 0/41.76k flops)\n    model_analyzer_testlib.py:33:BuildSmallModel:image = array_ops... (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:37:BuildSmallModel:initializer=init_... (0/4 params, 0/0 flops)\n    model_analyzer_testlib.py:41:BuildSmallModel:initializer=init_... (0/648 params, 0/0 flops)\n    model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0/1.15k params, 0/0 flops)\n    model_analyzer_testlib.py:47:BuildSmallModel:x = nn_ops.conv2d... (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c... (0/1.04k params, 0/4.13k flops)\n  model_analyzer_testlib.py:62:BuildFullModel:target = array_op... (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_... (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min... (0/0 params, 0/8.19k flops)\n',
+            f.read())
+
+      self.assertLess(0, tfprof_node.total_exec_micros)
+      self.assertEqual(2844, tfprof_node.total_parameters)
+      self.assertEqual(54080, tfprof_node.total_float_ops)
+      self.assertEqual(5, len(tfprof_node.children))
+      self.assertEqual('_TFProfRoot', tfprof_node.name)
+      self.assertEqual('model_analyzer_testlib.py:56:BuildFullModel:seq.append(array_...',
+                       tfprof_node.children[0].name)
+      self.assertEqual('model_analyzer_testlib.py:60:BuildFullModel:cell, array_ops.c...',
+                       tfprof_node.children[1].name)
+      self.assertEqual('model_analyzer_testlib.py:62:BuildFullModel:target = array_op...',
+                       tfprof_node.children[2].name)
+      self.assertEqual('model_analyzer_testlib.py:63:BuildFullModel:loss = nn_ops.l2_...',
+                       tfprof_node.children[3].name)
+      self.assertEqual('model_analyzer_testlib.py:65:BuildFullModel:return sgd_op.min...',
+                       tfprof_node.children[4].name)
+      # pylint: enable=line-too-long
+
+  def testCodeViewLeafGraphNode(self):
+    ops.reset_default_graph()
+    opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
+    opts['account_type_regexes'] = ['.*']
+    opts['account_displayed_op_only'] = False
+    opts['select'] = [
+        'bytes', 'params', 'float_ops', 'num_hidden_ops', 'device'
+    ]
+
+    with session.Session() as sess, ops.device('/cpu:0'):
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      tfprof_node = model_analyzer.print_model_analysis(
+          sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
+
+      leaf = tfprof_node
+      while leaf.children:
+        self.assertEqual(0, len(leaf.graph_nodes))
+        leaf = leaf.children[0]
+      self.assertEqual(1, len(leaf.graph_nodes))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_testlib.py b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_testlib.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed26f001c2e74eee5c2efca5a2356b08a94463ae
--- /dev/null
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/model_analyzer_testlib.py
@@ -0,0 +1,67 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A test lib that defines some models."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rnn.python.ops.core_rnn_cell import BasicRNNCell
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import gradient_descent
+
+
+def BuildSmallModel():
+  """Build a small forward conv model."""
+  image = array_ops.zeros([2, 6, 6, 3])
+  _ = variable_scope.get_variable(
+      'ScalarW', [],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  kernel = variable_scope.get_variable(
+      'DW', [3, 3, 3, 6],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  x = nn_ops.conv2d(image, kernel, [1, 2, 2, 1], padding='SAME')
+  kernel = variable_scope.get_variable(
+      'DW2', [2, 2, 6, 12],
+      dtypes.float32,
+      initializer=init_ops.random_normal_initializer(stddev=0.001))
+  x = nn_ops.conv2d(x, kernel, [1, 2, 2, 1], padding='SAME')
+  return x
+
+
+def BuildFullModel():
+  """Build the full model with conv,rnn,opt."""
+  seq = []
+  for i in range(4):
+    with variable_scope.variable_scope('inp_%d' % i):
+      seq.append(array_ops.reshape(BuildSmallModel(), [2, 1, -1]))
+
+  cell = BasicRNNCell(16, 48)
+  out = rnn.dynamic_rnn(
+      cell, array_ops.concat(seq, axis=1), dtype=dtypes.float32)[0]
+
+  target = array_ops.ones_like(out)
+  loss = nn_ops.l2_loss(math_ops.reduce_mean(target - out))
+  sgd_op = gradient_descent.GradientDescentOptimizer(1e-2)
+  return sgd_op.minimize(loss)
+
+
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler.py b/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57e45748d2c9503d8a26c4e3e23477c28146f46
--- /dev/null
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler.py
@@ -0,0 +1,445 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Profiler for TensorFlow models that outputs data in pprof format.
+
+See https://github.com/google/pprof/blob/master/proto/profile.proto for pprof
+profile format.
+The following needs to be set for profiler to work:
+  * trace_level needs to be set to FULL_TRACE
+  * run_metadata object should be passed in to session.run call
+
+Sample usage:
+  options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
+  run_metadata = tf.RunMetadata()
+
+  with tf.Session as sess:
+    ...
+    sess.run(computation, run_metadata=run_metadata, options=options)
+  pprof_profiler.profile(sess.graph, run_metadata, output_dir)
+
+
+  The code above would output a pprof profile to separate output_dir/.*.pb.gz
+  file for each device. These files can be passed to pprof for formatting.
+  For e.g.:
+     pprof -png --nodecount=100 --sample_index=1 output_dir/profile_output.pb.gz
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import defaultdict
+from collections import namedtuple
+import gzip
+import os
+import string
+import sys
+import time
+
+from proto import profile_pb2
+
+
+if sys.version_info < (3,):
+  maketrans = string.maketrans
+else:
+  maketrans = str.maketrans
+
+
+ProfileDatum = namedtuple('ProfileDatum', [
+    'node_exec_stats', 'op_type', 'traceback'])
+
+
+class StringTable(object):
+  """Keeps track of strings to add to string_table in pprof proto."""
+
+  def __init__(self):
+    # Pprof requires first entry in string_table to be ''.
+    self._string_table = ['']
+    self._string_to_index = {'': 0}
+
+  def index_of(self, value_str):
+    """Get index of value_str in the string table.
+
+    If value_str is not in the string table, we will add it at the end
+    and then return the new index.
+    Args:
+      value_str: (string) Value to lookup/add in/to the string table.
+
+    Returns:
+      Index of value_str in the string table.
+    """
+    if value_str is None:
+      value_str = ''
+    if value_str in self._string_to_index:
+      return self._string_to_index[value_str]
+    index = len(self._string_table)
+    self._string_table.append(value_str)
+    self._string_to_index[value_str] = index
+    return index
+
+  def next_index(self):
+    """Gets index that would be assigned to the next added string.
+
+    Returns:
+      Index of the next string if it was added.
+    """
+    return len(self._string_table)
+
+  def string_table(self):
+    """Returns a list of strings to store in pprof's string_table."""
+    return self._string_table
+
+
+class Functions(object):
+  """Keeps track of `Function` protos for pprof profile."""
+
+  def __init__(self, string_table):
+    """Constructor.
+
+    Args:
+      string_table: A `StringTable` object.
+    """
+    self._string_table = string_table
+    # Maps tuples in the form (file_path, function_name, start_line_number)
+    # to `Function` protos.
+    self._function_key_to_function = {}
+
+  def index_of(self, file_path, function_name, function_start_line):
+    """Returns index of the function, adding the function if needed.
+
+    Args:
+      file_path: (string) Path to file where the function is defined.
+      function_name: (string) Function name.
+      function_start_line: (integer) Start line number of function definition.
+
+    Returns:
+      Function index.
+    """
+    function_key = (file_path, function_name, function_start_line)
+    if function_key in self._function_key_to_function:
+      return self._function_key_to_function[function_key].id
+    else:
+      # Function indexes should start from 1
+      function_index = len(self._function_key_to_function) + 1
+      function = profile_pb2.Function()
+      function.id = function_index
+      function.name = self._string_table.index_of(function_name)
+      function.filename = self._string_table.index_of(file_path)
+      function.start_line = function_start_line
+      self._function_key_to_function[function_key] = function
+      return function_index
+
+  def function_protos(self):
+    """Returns list of `profile_pb2.Function` protos."""
+    return self._function_key_to_function.values()
+
+
+class Locations(object):
+  """Keeps track of `Location` protos for pprof profile.
+
+  `Locations` store information about function call locations.
+  """
+
+  def __init__(self, functions):
+    """Constructor.
+
+    Args:
+      functions: A `Functions` object.
+    """
+    self._functions = functions
+    # Maps tuples in the form (file_path, called_function_name, line_number)
+    # to `Location` protos.
+    self._location_key_to_location = {}
+
+  def index_of(
+      self, file_path, line_number, called_function_name, called_file_path,
+      called_function_start_line):
+    """Returns index of the location, adding the location if needed.
+
+    Args:
+      file_path: (string) Path to file that makes the call.
+      line_number: (integer) Call line number.
+      called_function_name: (string) Function name of the function called at
+        `file_path` and `line_number`.
+      called_file_path: (string) Path to file where the called function is
+        defined.
+      called_function_start_line: (integer) Start line number of called
+        function definition in `called_file_path` file.
+
+    Returns:
+      Index of location.
+    """
+    location_key = (file_path, called_function_name, line_number)
+    if location_key in self._location_key_to_location:
+      location = self._location_key_to_location[location_key]
+      return location.id
+    else:
+      # Location indexes should start from 1
+      location_index = len(self._location_key_to_location) + 1
+      location = profile_pb2.Location()
+      location.id = location_index
+      self._location_key_to_location[location_key] = location
+
+      line = location.line.add()
+      line.function_id = self._functions.index_of(
+          called_file_path, called_function_name, called_function_start_line)
+      line.line = line_number
+      return location_index
+
+  def location_protos(self):
+    """Returns list of `profile_pb2.Location` protos."""
+    return self._location_key_to_location.values()
+
+
+class Samples(object):
+  """Keeps track of `Sample` protos for pprof profile.
+
+  Samples store the following statistics in order:
+  count, all_time, op_time
+  """
+
+  def __init__(self, string_table):
+    """Constructor.
+
+    Args:
+      string_table: A `StringTable` object.
+    """
+    self._string_table = string_table
+    # TODO(annarev): figure out if location is unique for each node name.
+    # If not, also key this dictionary based on location ids.
+    self._node_name_to_sample = {}
+
+  def add(self, datum, location_ids):
+    """Adds a sample data point.
+
+    Args:
+      datum: `ProfileDatum` to add a sample for.
+      location_ids: List of numberic location ids for this
+        sample.
+    """
+    node_name = datum.node_exec_stats.node_name
+    if node_name in self._node_name_to_sample:
+      sample = self._node_name_to_sample[node_name]
+      sample.location_id.extend(location_ids)
+    else:
+      sample = profile_pb2.Sample()
+      # Sample stores 3 values: count, all_time, op_time
+      sample.value.extend([0, 0, 0])
+
+      label = sample.label.add()
+      label.key = self._string_table.index_of('node_name')
+      label.str = self._string_table.index_of(node_name)
+      label = sample.label.add()
+      label.key = self._string_table.index_of('op_type')
+      label.str = self._string_table.index_of(datum.op_type)
+      self._node_name_to_sample[node_name] = sample
+    sample.value[0] += 1
+    sample.value[1] += datum.node_exec_stats.all_end_rel_micros
+    sample.value[2] += (
+        datum.node_exec_stats.op_end_rel_micros -
+        datum.node_exec_stats.op_start_rel_micros)
+
+  def get_sample_protos(self):
+    """Returns list of `Sample` protos for pprof profile."""
+    return self._node_name_to_sample.values()
+
+
+class PprofProfiler(object):
+  """Creates profiles in pprof format."""
+
+  def __init__(self, graph, run_metadata):
+    """Constructor.
+
+    Args:
+      graph: A `Graph` instance.
+      run_metadata: A list of `RunMetadata` objects.
+    """
+    self._graph = graph
+    self._run_metadata = run_metadata
+    self._string_table = StringTable()
+    self._functions = Functions(self._string_table)
+    self._locations = Locations(self._functions)
+
+  def profile(self):
+    """Generates pprof profiles.
+
+    Returns:
+      Dictionary mapping from device name to proto in `profile_pb2.Profile`
+      format.
+    """
+    profiles = {}
+    data_generator_func = self._get_profile_data_generator()
+    for device_index, device_stats in enumerate(
+        self._run_metadata.step_stats.dev_stats):
+      # Create profile
+      pprof_proto = self._get_pprof_proto(data_generator_func(device_stats))
+      if not pprof_proto.sample:
+        print(
+            'Not enough data to create profile for device %s. Did you pass '
+            'RunMetadata to session.run call?' % device_stats.device)
+        continue
+      # Add device name comment
+      device_count = len(self._run_metadata.step_stats.dev_stats)
+      device_description = (
+          'Device %d of %d: %s' %
+          (device_index + 1, device_count, device_stats.device))
+      device_description_str_index = self._string_table.next_index()
+      pprof_proto.string_table.append(device_description)
+      pprof_proto.comment.append(device_description_str_index)
+      profiles[device_stats.device] = pprof_proto
+    return profiles
+
+  def _get_pprof_proto(self, profile_datum_generator):
+    """Returns profile data in pprof proto format.
+
+    Args:
+      profile_datum_generator: Generator outputting `ProfileDatum` objects.
+
+    Returns:
+      A proto in pprof format.
+    """
+    pprof_profile = profile_pb2.Profile()
+    samples = Samples(self._string_table)
+
+    for datum in profile_datum_generator:
+      if not datum.traceback:
+        continue
+
+      stack_frame = datum.traceback[-1]
+      after_apply_op = False
+      location_ids = []
+
+      # We add locations from stack trace in bottom-up order.
+      for stack_frame_index in reversed(range(len(datum.traceback) - 1)):
+        prev_stack_frame = stack_frame
+        stack_frame = datum.traceback[stack_frame_index]
+
+        # Call at current frame calls function at previous frame.
+        prev_file_path = prev_stack_frame[0]
+        prev_function = prev_stack_frame[2]
+        prev_function_start_line = prev_stack_frame[4]
+        curr_file_path = stack_frame[0]
+        curr_line_number = stack_frame[1]
+
+        # Skip all calls up to apply_op since they are the same for all ops.
+        if not after_apply_op:
+          if prev_function == 'apply_op':
+            after_apply_op = True
+          continue
+        location_index = self._locations.index_of(
+            curr_file_path, curr_line_number,
+            prev_function, prev_file_path, prev_function_start_line)
+        location_ids.append(location_index)
+      samples.add(datum, location_ids)
+
+    sample_type_description = 'count'
+    sample_type = pprof_profile.sample_type.add()
+    sample_type.type = self._string_table.index_of(sample_type_description)
+    sample_type.unit = self._string_table.index_of('count')
+    sample_type_description = 'all_time'
+    sample_type = pprof_profile.sample_type.add()
+    sample_type.type = self._string_table.index_of(sample_type_description)
+    sample_type.unit = self._string_table.index_of('nanoseconds')
+    sample_type_description = 'op_time'
+    sample_type = pprof_profile.sample_type.add()
+    sample_type.type = self._string_table.index_of(sample_type_description)
+    sample_type.unit = self._string_table.index_of('nanoseconds')
+
+    pprof_profile.string_table.extend(self._string_table.string_table())
+    pprof_profile.sample.extend(samples.get_sample_protos())
+    pprof_profile.function.extend(self._functions.function_protos())
+    pprof_profile.location.extend(self._locations.location_protos())
+    return pprof_profile
+
+  def _get_profile_data_generator(self):
+    """Get function that generates `ProfileDatum` objects.
+
+    Returns:
+      A function that generates `ProfileDatum` objects.
+    """
+    node_to_traceback = defaultdict(list)
+    node_to_op_type = defaultdict(str)
+    for op in self._graph.get_operations():
+      node_to_traceback[op.name] = op.traceback_with_start_lines
+      node_to_op_type[op.name] = op.type
+
+    def profile_data_generator(device_step_stats):
+      for node_stats in device_step_stats.node_stats:
+        if node_stats.node_name == '_SOURCE' or node_stats.node_name == '_SINK':
+          continue
+        yield ProfileDatum(
+            node_stats,
+            node_to_op_type[node_stats.node_name],
+            node_to_traceback[node_stats.node_name])
+
+    return profile_data_generator
+
+
+def get_profiles(graph, run_metadata):
+  """Generate profiles in pprof format.
+
+  See https://github.com/google/pprof/blob/master/proto/profile.proto
+  for pprof proto format.
+
+  Args:
+    graph: A `Graph` object.
+    run_metadata: A `RunMetadata` proto.
+
+  Returns:
+    A dictionary mapping from device name to pprof proto for that device.
+  """
+  return PprofProfiler(graph, run_metadata).profile()
+
+
+def profile(graph, run_metadata, output_dir=None):
+  """Generate profiles in pprof format.
+
+  See https://github.com/google/pprof/blob/master/proto/profile.proto
+  for pprof proto format.
+
+  Args:
+    graph: A `Graph` object.
+    run_metadata: A `RunMetadata` proto.
+    output_dir: (string) Directory to output pprof profile to.
+      Profile files for each device will be stored in compressed
+      serialized proto format. If output_dir is None, profile protos
+      will be printed to stdout instead.
+
+  Returns:
+    List of output files created by this profile call.
+    (Note: this list will be empty if output_dir is None)
+  """
+  profiles = get_profiles(graph, run_metadata)
+  output_file_template = None
+  if output_dir:
+    if not os.path.isdir(output_dir):
+      os.makedirs(output_dir)
+    time_suffix = time.strftime('%Y%m%d%H%M%S')
+    output_file_template = os.path.join(
+        output_dir, '%s_' + time_suffix + '.pb.gz')
+
+  profile_files = []
+  for device, pprof_proto in profiles.items():
+    if output_file_template is None:
+      print('No output directory specified, printing to stdout instead.')
+      print(pprof_proto)
+    else:
+      device_name = str(device).strip('/').translate(
+          maketrans('/:', '__'))
+      profile_file = output_file_template % device_name
+      profile_files.append(profile_file)
+      with gzip.open(profile_file, 'w') as output_file:
+        print('Writing profile to %s...' % profile_file)
+        output_file.write(pprof_proto.SerializeToString())
+  return profile_files
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6487adf99204d7d2f22f47e937a6921c2a54e220
--- /dev/null
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py
@@ -0,0 +1,164 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for pprof_profiler."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+
+from proto import profile_pb2
+from tensorflow.contrib.tfprof.python.tools.tfprof import pprof_profiler
+from tensorflow.core.framework import step_stats_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class PprofProfilerTest(test.TestCase):
+
+  def testDataEmpty(self):
+    output_dir = test.get_temp_dir()
+    run_metadata = config_pb2.RunMetadata()
+    graph = test.mock.MagicMock()
+    graph.get_operations.return_value = []
+
+    profiles = pprof_profiler.get_profiles(graph, run_metadata)
+    self.assertEquals(0, len(profiles))
+    profile_files = pprof_profiler.profile(
+        graph, run_metadata, output_dir)
+    self.assertEquals(0, len(profile_files))
+
+  def testRunMetadataEmpty(self):
+    output_dir = test.get_temp_dir()
+    run_metadata = config_pb2.RunMetadata()
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = 'Add/123'
+    op1.traceback = [('a/b/file1', 10, 'some_var')]
+    op1.type = 'add'
+    graph.get_operations.return_value = [op1]
+
+    profiles = pprof_profiler.get_profiles(graph, run_metadata)
+    self.assertEquals(0, len(profiles))
+    profile_files = pprof_profiler.profile(
+        graph, run_metadata, output_dir)
+    self.assertEquals(0, len(profile_files))
+
+  def testValidProfile(self):
+    output_dir = test.get_temp_dir()
+    run_metadata = config_pb2.RunMetadata()
+
+    node1 = step_stats_pb2.NodeExecStats(
+        node_name='Add/123',
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=4)
+
+    run_metadata = config_pb2.RunMetadata()
+    device1 = run_metadata.step_stats.dev_stats.add()
+    device1.device = 'deviceA'
+    device1.node_stats.extend([node1])
+
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = 'Add/123'
+    op1.traceback = [
+        ('a/b/file1', 10, 'apply_op', 'abc'), ('a/c/file2', 12, 'my_op', 'def')]
+    op1.type = 'add'
+    graph.get_operations.return_value = [op1]
+
+    expected_proto = """sample_type {
+  type: 5
+  unit: 5
+}
+sample_type {
+  type: 6
+  unit: 7
+}
+sample_type {
+  type: 8
+  unit: 7
+}
+sample {
+  value: 1
+  value: 4
+  value: 2
+  label {
+    key: 1
+    str: 2
+  }
+  label {
+    key: 3
+    str: 4
+  }
+}
+string_table: ""
+string_table: "node_name"
+string_table: "Add/123"
+string_table: "op_type"
+string_table: "add"
+string_table: "count"
+string_table: "all_time"
+string_table: "nanoseconds"
+string_table: "op_time"
+string_table: "Device 1 of 1: deviceA"
+comment: 9
+"""
+    # Test with protos
+    profiles = pprof_profiler.get_profiles(graph, run_metadata)
+    self.assertEquals(1, len(profiles))
+    self.assertTrue('deviceA' in profiles)
+    self.assertEquals(expected_proto, str(profiles['deviceA']))
+    # Test with files
+    profile_files = pprof_profiler.profile(
+        graph, run_metadata, output_dir)
+    self.assertEquals(1, len(profile_files))
+    with gzip.open(profile_files[0]) as profile_file:
+      profile_contents = profile_file.read()
+      profile = profile_pb2.Profile()
+      profile.ParseFromString(profile_contents)
+      self.assertEquals(expected_proto, str(profile))
+
+  def testProfileWithWhileLoop(self):
+    options = config_pb2.RunOptions()
+    options.trace_level = config_pb2.RunOptions.FULL_TRACE
+    run_metadata = config_pb2.RunMetadata()
+
+    num_iters = 5
+    with self.test_session() as sess:
+      i = constant_op.constant(0)
+      c = lambda i: math_ops.less(i, num_iters)
+      b = lambda i: math_ops.add(i, 1)
+      r = control_flow_ops.while_loop(c, b, [i])
+      sess.run(r, options=options, run_metadata=run_metadata)
+      profiles = pprof_profiler.get_profiles(sess.graph, run_metadata)
+      self.assertEquals(1, len(profiles))
+      profile = next(iter(profiles.values()))
+      add_samples = []  # Samples for the while/Add node
+      for sample in profile.sample:
+        if profile.string_table[sample.label[0].str] == 'while/Add':
+          add_samples.append(sample)
+      # Values for same nodes are aggregated.
+      self.assertEquals(1, len(add_samples))
+      # Value of "count" should be equal to number of iterations.
+      self.assertEquals(num_iters, add_samples[0].value[0])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
index f0ac36c66a11b0b985a0b91817795419990ef119..c3e9fc9cc099f144f81235a944221fa05b6b398c 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/print_model_analysis_test.py
@@ -51,7 +51,7 @@ TEST_OPTIONS = {
     'hide_name_regexes': [],
     'account_displayed_op_only': True,
     'select': ['params'],
-    'viz': False
+    'output': 'stdout',
 }
 
 # pylint: enable=bad-whitespace
@@ -92,16 +92,17 @@ class PrintModelAnalysisTest(test.TestCase):
     opts.account_displayed_op_only = TEST_OPTIONS['account_displayed_op_only']
     for p in TEST_OPTIONS['select']:
       opts.select.append(p)
-    opts.viz = TEST_OPTIONS['viz']
+    opts.output = TEST_OPTIONS['output']
 
     with session.Session() as sess, ops.device('/cpu:0'):
       _ = self._BuildSmallModel()
-      tfprof_pb = tfprof_output_pb2.TFProfNode()
+      tfprof_pb = tfprof_output_pb2.TFGraphNodeProto()
       tfprof_pb.ParseFromString(
-          print_mdl.PrintModelAnalysis(sess.graph.as_graph_def(
-          ).SerializeToString(), b'', b'', b'scope', opts.SerializeToString()))
+          print_mdl.PrintModelAnalysis(
+              sess.graph.as_graph_def().SerializeToString(),
+              b'', b'', b'scope', opts.SerializeToString()))
 
-      expected_pb = tfprof_output_pb2.TFProfNode()
+      expected_pb = tfprof_output_pb2.TFGraphNodeProto()
       text_format.Merge(r"""name: "_TFProfRoot"
       exec_micros: 0
       requested_bytes: 0
@@ -115,7 +116,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -127,7 +127,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 648
-      device: "/device:CPU:0"
       children {
       name: "DW/Assign"
       exec_micros: 0
@@ -135,7 +134,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -216,7 +214,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
@@ -230,7 +227,6 @@ class PrintModelAnalysisTest(test.TestCase):
       total_exec_micros: 0
       total_requested_bytes: 0
       total_parameters: 0
-      device: "/device:CPU:0"
       float_ops: 0
       total_float_ops: 0
       }
diff --git a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
index e8cf84b6c7703078c88bf369aa6f5aedae68243a..e6d504d5165d4608033f2de7ef386e662912e451 100644
--- a/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
+++ b/tensorflow/contrib/tfprof/python/tools/tfprof/tfprof_logger.py
@@ -62,12 +62,13 @@ def _fill_missing_graph_shape(graph, run_meta):
   return graph
 
 
-def _get_logged_ops(graph, run_meta=None):
+def _get_logged_ops(graph, run_meta=None, add_trace=True):
   """Extract trainable model parameters and FLOPs for ops from a Graph.
 
   Args:
     graph: tf.Graph.
     run_meta: RunMetadata proto used to complete shape information.
+    add_trace: Whether to add op trace information.
   Returns:
     logged_ops: dict mapping from op_name to OpLogEntry.
   """
@@ -76,21 +77,32 @@ def _get_logged_ops(graph, run_meta=None):
 
   op_missing_shape = 0
   logged_ops = {}
-  graph_def = graph.as_graph_def()
-  for node in graph_def.node:
+  for op in graph.get_operations():
     try:
-      stats = ops.get_stats_for_node_def(graph, node, REGISTERED_FLOP_STATS)
+      stats = ops.get_stats_for_node_def(
+          graph, op.node_def, REGISTERED_FLOP_STATS)
     except ValueError:
       # Catch Exception When shape is incomplete. Skip it.
       op_missing_shape += 1
       stats = None
 
-    if not stats or not stats.value:
-      continue
-    if node.name not in logged_ops:
-      entry = tfprof_log_pb2.OpLogEntry()
-      entry.name = node.name
+    entry = tfprof_log_pb2.OpLogEntry()
+    entry.name = op.name
+    add_entry = False
+    if stats and stats.value:
       entry.float_ops = int(stats.value)
+      add_entry = True
+
+    if add_trace:
+      for tb in op.traceback:
+        trace = entry.code_def.traces.add()
+        trace.file = tb[0] if tb[0] else 'none'
+        trace.lineno = tb[1] if tb[1] else -1
+        trace.function = tb[2] if tb[2] else 'none'
+        trace.line = tb[3] if tb[3] else 'none'
+      add_entry = True
+
+    if add_entry:
       logged_ops[entry.name] = entry
 
   for v in graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES):
@@ -108,18 +120,20 @@ def _get_logged_ops(graph, run_meta=None):
   return logged_ops
 
 
-def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
+def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
+                              add_trace=True):
   """Merge the tfprof default extra info with caller's op_log.
 
   Args:
     graph: tf.Graph.
     op_log: OpLog proto.
     run_meta: RunMetadata proto used to complete shape information.
+    add_trace: Whether to add op trace information.
   Returns:
     tmp_op_log: Merged OpLog proto.
   """
   tmp_op_log = tfprof_log_pb2.OpLog()
-  logged_ops = _get_logged_ops(graph, run_meta)
+  logged_ops = _get_logged_ops(graph, run_meta, add_trace=add_trace)
   if not op_log:
     tmp_op_log.log_entries.extend(logged_ops.values())
   else:
@@ -131,13 +145,15 @@ def _merge_default_with_oplog(graph, op_log=None, run_meta=None):
         all_ops[op_name].types.extend(entry.types)
         if entry.float_ops > 0 and all_ops[op_name].float_ops == 0:
           all_ops[op_name].float_ops = entry.float_ops
+        if entry.code_def.traces and not all_ops[op_name].code_def.traces:
+          all_ops[op_name].code_def.MergeFrom(entry.code_def)
       else:
         all_ops[op_name] = entry
     tmp_op_log.log_entries.extend(all_ops.values())
   return tmp_op_log
 
 
-def write_op_log(graph, log_dir, op_log=None, run_meta=None):
+def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
     The API also assigns ops in tf.trainable_variables() an op type called
@@ -154,8 +170,9 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None):
         one is created.
     run_meta: (Optional) RunMetadata proto that helps flops computation using
         run time shape information.
+    add_trace: Whether to add op trace information. Used to support "code" view.
   """
-  op_log = _merge_default_with_oplog(graph, op_log, run_meta)
+  op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)
 
   with gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
     log.write(op_log.SerializeToString())
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index a781f0cbfc89b18fd2b532c39ca8c2d357850368..4f869584d021fe0a3dddce992d0674d0b0cf128d 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -23,7 +23,9 @@ py_library(
         "python/training/evaluation.py",
         "python/training/failure_tolerator.py",
         "python/training/feeder.py",
+        "python/training/feeding_queue_runner.py",
         "python/training/hparam.py",
+        "python/training/python_input.py",
         "python/training/resample.py",
         "python/training/sampling_ops.py",
         "python/training/sequence_queueing_state_saver.py",
@@ -46,8 +48,10 @@ py_library(
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:summary",
@@ -56,6 +60,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -243,6 +248,26 @@ py_test(
     ],
 )
 
+py_test(
+    name = "python_input_test",
+    size = "medium",
+    srcs = ["python/training/python_input_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["manual"],
+    deps = [
+        ":training_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "evaluation_test",
     size = "small",
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index be097fd9fca822b72a00c006c2d0a23d7c9257dc..36bff530f3bbbb7924a72838d08ece758737fb5b 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -35,6 +35,7 @@ See @{$python/contrib.training} guide.
 @@HParams
 @@HParamDef
 @@parse_values
+@@python_input
 """
 
 from __future__ import absolute_import
@@ -53,7 +54,9 @@ from tensorflow.contrib.training.python.training.evaluation import SummaryAtEndH
 from tensorflow.contrib.training.python.training.evaluation import wait_for_new_checkpoint
 from tensorflow.contrib.training.python.training.failure_tolerator import *
 from tensorflow.contrib.training.python.training.feeder import *
+from tensorflow.contrib.training.python.training.feeding_queue_runner import FeedingQueueRunner
 from tensorflow.contrib.training.python.training.hparam import *
+from tensorflow.contrib.training.python.training.python_input import python_input
 from tensorflow.contrib.training.python.training.resample import *
 from tensorflow.contrib.training.python.training.sampling_ops import *
 from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import *
@@ -71,8 +74,8 @@ from tensorflow.python.util.all_util import remove_undocumented
 # whitelisted here or in the module docstring above.
 _allowed_symbols = [
     'checkpoints_iterator', 'evaluate_once', 'evaluate_repeatedly',
-    'get_or_create_eval_step', 'StopAfterNEvalsHook', 'SummaryAtEndHook',
-    'wait_for_new_checkpoint', 'add_gradients_summaries', 'clip_gradient_norms',
-    'create_train_op', 'multiply_gradients', 'train']
+    'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook',
+    'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries',
+    'clip_gradient_norms', 'create_train_op', 'multiply_gradients', 'train']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/training/python/training/bucket_ops.py b/tensorflow/contrib/training/python/training/bucket_ops.py
index 7c50f43b79276234aefdf1ba88667ecd302fadff..7e293da5511fcfc369eb9cb4fe9c68530619a9d1 100644
--- a/tensorflow/contrib/training/python/training/bucket_ops.py
+++ b/tensorflow/contrib/training/python/training/bucket_ops.py
@@ -251,10 +251,16 @@ def bucket(tensors,
     else:
       which_dequeue = lambda q: q.dequeue_many
 
+    def make_list(t):
+      if isinstance(t, (list, tuple)):
+        return t
+      else:
+        return [t]
+
     enqueues_to_top = [
         top_queue.enqueue(
-            [constant_op.constant(i)] + which_dequeue(q)(
-                bs, name="read_bucket_%d" % i),
+            [constant_op.constant(i)] + make_list(which_dequeue(q)(
+                bs, name="read_bucket_%d" % i)),
             name="enqueue_from_bucket_%d" % i)
         for i, (q, bs) in enumerate(zip(bucket_queues, batch_size))
     ]
@@ -282,6 +288,8 @@ def bucket(tensors,
     dequeued = top_queue.dequeue(name="dequeue_top")
     which_bucket_dequeued = dequeued[0]
     dequeued = dequeued[1:]
+    if len(dequeued) == 1:
+      dequeued = dequeued[0]
     dequeued = _restore_sparse_tensors(dequeued, sparse_info)
     return (which_bucket_dequeued, _as_original_type(tensors, dequeued))
 
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index a35c31dc5544d3d6a6b7e9e9e79f6f967e22d640..24b733dd29cf0228ec9c5c87a8721fbfb3929574 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -254,7 +254,7 @@ def checkpoints_iterator(checkpoint_dir,
         logging.info('Timed-out waiting for a checkpoint.')
         return
       if timeout_fn():
-        # The timeout_fn indicated that we are truely done.
+        # The timeout_fn indicated that we are truly done.
         return
       else:
         # The timeout_fn indicated that more checkpoints may come.
@@ -290,19 +290,21 @@ class SummaryAtEndHook(session_run_hook.SessionRunHook):
       ValueError: If both `log_dir` and `summary_writer` are `None`.
     """
     self._summary_op = summary_op
+    self._replace_summary_op = summary_op is None
     self._feed_dict = feed_dict
     self._summary_writer = summary_writer
     self._log_dir = log_dir
-    self._summary_writer = summary_writer
     if self._log_dir is None and self._summary_writer is None:
       raise ValueError('One of log_dir or summary_writer should be used.')
-    self._global_step = variables.get_or_create_global_step()
 
   def begin(self):
+    if self._replace_summary_op:
+      self._summary_op = summary.merge_all()
+    self._global_step = variables.get_or_create_global_step()
+
+  def after_create_session(self, session, coord):
     if self._summary_writer is None and self._log_dir:
       self._summary_writer = summary.FileWriterCache.get(self._log_dir)
-    if self._summary_op is None:
-      self._summary_op = summary.merge_all()
 
   def end(self, session):
     global_step = training_util.global_step(session, self._global_step)
@@ -368,7 +370,7 @@ def evaluate_repeatedly(checkpoint_dir,
 
   One may also consider using a `tf.contrib.training.SummaryAtEndHook` to record
   summaries after the `eval_ops` have run. If `eval_ops` is `None`, the
-  summaries run immedietly after the model checkpoint has been restored.
+  summaries run immediately after the model checkpoint has been restored.
 
   Note that `evaluate_once` creates a local variable used to track the number of
   evaluations run via `tf.contrib.training.get_or_create_eval_step`.
@@ -453,7 +455,8 @@ def evaluate_repeatedly(checkpoint_dir,
           '%Y-%m-%d-%H:%M:%S', time.gmtime()))
     num_evaluations += 1
 
-    if max_number_of_evaluations is not None and num_evaluations >= max_number_of_evaluations:
+    if (max_number_of_evaluations is not None and
+        num_evaluations >= max_number_of_evaluations):
       return final_ops_hook.final_ops_values
 
   return final_ops_hook.final_ops_values
diff --git a/tensorflow/contrib/training/python/training/feeder.py b/tensorflow/contrib/training/python/training/feeder.py
index a7f43cc07e9e48748c0aef46f31639f28382d8f0..a5cd7c5c947efff9154f9752d9bcf01e38a382a2 100644
--- a/tensorflow/contrib/training/python/training/feeder.py
+++ b/tensorflow/contrib/training/python/training/feeder.py
@@ -18,7 +18,7 @@
 
 This helper handles the plumbing in order to set up a feeder task to
 push generated inputs to a pool of remote consumers; or to run an
-identical feeding mechanism in a seperate thread in the same process.
+identical feeding mechanism in a separate thread in the same process.
 
 Example usage for distributed feeding:
 
@@ -331,7 +331,7 @@ class Feeder(object):
     they never close their queue. Second, they are added to the
     `Feeder.REMOTE_QUEUE_RUNNERS` collection, rather than
     `ops.GraphKeys.QUEUE_RUNNERS`, so they can be started/stopped
-    seperately.
+    separately.
 
     Args:
       queue: The queue.
diff --git a/tensorflow/contrib/training/python/training/feeder_test.py b/tensorflow/contrib/training/python/training/feeder_test.py
index 4d5cf9eff26041a26cc6dbb2ee02692e281df021..f3a2fee0463f25a18418e01c6240196326ef4965 100644
--- a/tensorflow/contrib/training/python/training/feeder_test.py
+++ b/tensorflow/contrib/training/python/training/feeder_test.py
@@ -156,7 +156,7 @@ class FeederTest(test.TestCase):
     coord.join()
 
   def testFeederSeparateThread(self):
-    # Start a feeder on a seperate thread, but with a shared local queue
+    # Start a feeder on a separate thread, but with a shared local queue
     servers = self._create_local_cluster(worker=1)
     coord = coordinator.Coordinator()
     feed_thread = FeederThread(self, coord, servers, 'worker', 0)
diff --git a/tensorflow/contrib/training/python/training/feeding_queue_runner.py b/tensorflow/contrib/training/python/training/feeding_queue_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..d055555b01028106acbe8198aa858d89b06abf02
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/feeding_queue_runner.py
@@ -0,0 +1,24 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A `QueueRunner` that takes a feed function as an argument."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.estimator.inputs.queues.feeding_queue_runner import _FeedingQueueRunner as FeedingQueueRunner
+# pylint: enable=unused-import
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 987d2a0b70b6453a6a29f1904d13177e210e083e..c19a36eabcf7590c55bebde76bf6f50c1fd418e0 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -84,9 +84,12 @@ def parse_values(values, type_map):
     name = m_dict['name']
     if name not in type_map:
       raise ValueError('Unknown hyperparameter type for %s' % name)
-    def parse_fail():
-      raise ValueError('Could not parse hparam %s in %s' % (name, values))
-    if type_map[name] == bool:
+    type_ = type_map[name]
+    def parse_fail(value):
+      raise ValueError(
+          'Could not parse hparam \'%s\' of type \'%s\' with value \'%s\' in %s'
+          % (name, type_.__name__, value, values))
+    if type_ == bool:
       def parse_bool(value):
         if value == 'true':
           return True
@@ -95,24 +98,24 @@ def parse_values(values, type_map):
         else:
           try:
             return bool(int(value))
-          except ValueError:
-            parse_fail()
+          except (ValueError, TypeError):
+            parse_fail(value)
       parse = parse_bool
     else:
-      parse = type_map[name]
+      parse = type_
     if m_dict['val'] is not None:
       try:
         ret[name] = parse(m_dict['val'])
-      except ValueError:
-        parse_fail()
+      except (ValueError, TypeError):
+        parse_fail(m_dict['val'])
     elif m_dict['vals'] is not None:
       elements = filter(None, re.split('[ ,]', m_dict['vals']))
       try:
         ret[name] = [parse(e) for e in elements]
-      except ValueError:
-        parse_fail()
+      except (ValueError, TypeError):
+        parse_fail(m_dict['vals'])
     else:
-      parse_fail()
+      parse_fail('')
   return ret
 
 
@@ -161,7 +164,7 @@ class HParams(object):
   import argparse
   parser = argparse.ArgumentParser(description='Train my model.')
   parser.add_argument('--hparams', type=str,
-                      help='Comma seperated list of "name=value" pairs.')
+                      help='Comma separated list of "name=value" pairs.')
   args = parser.parse_args()
   ...
   def my_program():
@@ -419,7 +422,7 @@ class HParams(object):
     elif issubclass(param_type, float):
       typename = 'float'
     else:
-      raise ValueError('Unsupported paramter type: %s' % str(param_type))
+      raise ValueError('Unsupported parameter type: %s' % str(param_type))
 
     suffix = 'list' if is_list else 'value'
     return '_'.join([typename, suffix])
diff --git a/tensorflow/contrib/training/python/training/python_input.py b/tensorflow/contrib/training/python/training/python_input.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f5420a98a1afd3b417b302be04ec6f6747445cd
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/python_input.py
@@ -0,0 +1,178 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for asynchronously reading data from python into queues.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import script_ops
+
+
+def _process_yielded_dict(feature_values, keys, features, dtypes, shapes):
+  """Read feature_values from the generator and emit a proper output dict."""
+  if not isinstance(feature_values, dict):
+    raise TypeError("generator must return dict, saw: %s" % feature_values)
+
+  processed_values = {}
+  for pk in keys:
+    if feature_values.get(pk, None) is not None:
+      processed_values[pk] = np.asarray(
+          feature_values[pk], dtype=dtypes[pk].as_numpy_dtype)
+      check_shape = tensor_shape.TensorShape(processed_values[pk].shape)
+      if not shapes[pk].is_compatible_with(check_shape):
+        raise ValueError(
+            "Feature '%s' has shape %s that is incompatible with declared "
+            "shape: %s" % (pk, shapes[pk], check_shape))
+      continue
+    if isinstance(features[pk], parsing_ops.FixedLenFeature):
+      if features[pk].default_value is not None:
+        processed_values[pk] = np.asarray(
+            features[pk].default_value, dtype=dtypes[pk].as_numpy_dtype)
+    elif isinstance(features[pk], parsing_ops.FixedLenSequenceFeature):
+      processed_values[pk] = np.empty(
+          [0] + features[pk].shape.aslist(), dtype=dtypes[pk].as_numpy_dtype)
+    else:
+      raise ValueError(
+          "Expected generator to return key '%s' with non-empty value" % pk)
+
+  return processed_values
+
+
+def python_input(generator, features, name=None):
+  """Easily feed data from a python generator into TensorFlow queues.
+
+  Example usage:
+
+  ```python
+  def generator():
+    for i in range(3):
+      yield {"value": i}
+
+  features = {
+    "value": tf.FixedLenFeature(shape=[], dtype=dtypes.int32)
+  }
+
+  tensor_dict = tf.contrib.training.python_input(generator, features)
+  batched_dict = tf.train.batch(
+    tensor_dict, batch_size=2, allow_smaller_final_batch=True)
+
+  s = tf.Session()
+  tf.train.start_queue_runners()
+
+  batch1 = s.run(batched_dict)  # returns {"value": np.array([0, 1])}
+  batch2 = s.run(batched_dict)  # returns {"value": np.array([2])}
+  s.run(batched_dict)  # error: Queue is closed (generator finished at i==3)
+  ```
+
+  Args:
+    generator: A python generator that takes no arguments, and yields dicts
+      containing a single minibatch entry one at a time.
+    features: A python `dict` mapping keys expected from the generator to
+      instances of `tf.FixedLenFeature`, or `tf.FixedLenSequenceFeature`.
+    name: (Optional) A name for the operations.
+
+  Returns:
+    A dict mapping keys of the `features` dict to `Tensor` objects.
+    These `Tensor` objects are outputs of a queue that is fed by `generator`.
+
+  Raises:
+    TypeError: If generator is not callable or features is not a dict.
+    TypeError: If any of features' values are not a Feature object.
+    NotImplementedError: If any of features' values are instances of
+      `SparseFeature` or `VarLenFeature`  (these are not currently supported).
+    ValueError: If any FixedLenSequenceFeatures contain a default value
+      (this field is not supported).
+    ValueError: if any FixedLenSequenceFeatures have allow_missing=False
+      (this field is not supported).
+  """
+  if not callable(generator):
+    raise TypeError("generator must be callable, saw: %s" % generator)
+  if not isinstance(features, dict):
+    raise TypeError("features must be a dict, saw: %s"
+                    % type(features).__name__)
+
+  with ops.name_scope(name, "python_input"):
+    shapes = {}
+    dtypes = {}
+    for k, v in features.items():
+      if isinstance(v, parsing_ops.FixedLenFeature):
+        if v.default_value is not None:
+          value = ops.convert_to_tensor(v.default_value, dtype=v.dtype, name=k)
+          shapes[k] = value.shape
+          dtypes[k] = value.dtype
+        else:
+          tensor_shape.TensorShape(v.shape).assert_is_fully_defined()
+          shapes[k] = tensor_shape.TensorShape(v.shape)
+          dtypes[k] = v.dtype
+      elif isinstance(v, parsing_ops.VarLenFeature):
+        raise NotImplementedError("VarLenFeature not supported")
+      elif isinstance(v, parsing_ops.SparseFeature):
+        raise NotImplementedError("SparseFeature not supported")
+      elif isinstance(v, parsing_ops.FixedLenSequenceFeature):
+        if v.default_value is not None:
+          raise ValueError("FixedLenSequenceFeature with default value not "
+                           "supported")
+        if not v.allow_missing:
+          raise ValueError("FixedLenSequenceFeature with allow_missing=False "
+                           "not supported")
+        tensor_shape.TensorShape(v.shape).assert_is_fully_defined()
+        shapes[k] = tensor_shape.TensorShape([None]).concatenate(v.shape)
+        dtypes[k] = v.dtype
+      else:
+        raise TypeError(
+            "Expected value for features key '%s' to be one of "
+            "FixedLenFeature, VarLenFeature, SparseFeature, or "
+            "FixedLenSequenceFeature.  Got: %s" % (k, v))
+
+    keys = list(shapes.keys())
+    dtypes_list = [dtypes[pk] for pk in keys]
+
+    counter = [0]
+    lock = threading.Lock()
+    iterator = iter(generator())
+
+    def generator_iter():
+      """Iterate through generator output and return np.arrays to py_func."""
+      with lock:
+        try:
+          feature_values = next(iterator)
+          counter[0] += 1
+        except StopIteration as e:
+          raise StopIteration("Iteration finished.  Processed %d entries (%s)"
+                              % (counter[0], e))
+
+      processed_dict = _process_yielded_dict(
+          feature_values, keys, features, dtypes, shapes)
+      return [processed_dict[pk] for pk in keys]
+
+    generator_pyfunc_values = script_ops.py_func(
+        generator_iter, inp=[], Tout=dtypes_list, stateful=True)
+
+    pyfunc_input = {k: v for (k, v) in zip(keys, generator_pyfunc_values)}
+    for k, v in shapes.items():
+      pyfunc_input[k].set_shape(v)
+
+  return pyfunc_input
+
+
+__all__ = ["python_input"]
diff --git a/tensorflow/contrib/training/python/training/python_input_test.py b/tensorflow/contrib/training/python/training/python_input_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..afd0f38c2cd3b2ae915f1e860a277018aaeb9cfd
--- /dev/null
+++ b/tensorflow/contrib/training/python/training/python_input_test.py
@@ -0,0 +1,191 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.contrib.training.python_input."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from tensorflow.contrib.training.python.training import bucket_ops
+from tensorflow.contrib.training.python.training import python_input
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import input as core_input
+from tensorflow.python.training import queue_runner_impl
+
+
+class PythonInputTest(test.TestCase):
+
+  def testGenerator(self):
+    def simple_generator():
+      for i in range(2):
+        yield {"value": i, "ignored": 3}
+
+    simple_features = {
+        "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32)
+    }
+    tensors = python_input.python_input(simple_generator, simple_features)
+    self.assertEqual(["value"], tensors.keys())
+    self.assertEqual(dtypes.int32, tensors["value"].dtype)
+    self.assertEqual((), tensors["value"].shape)
+
+    with self.test_session() as sess:
+      self.assertEqual({"value": 0}, sess.run(tensors))
+      self.assertEqual({"value": 1}, sess.run(tensors))
+      with self.assertRaisesOpError("Iteration finished"):
+        sess.run(tensors)
+
+  def testInvalidGenerator(self):
+    generator1 = lambda: iter([{"value": "a"}])
+    int_features = {
+        "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32)
+    }
+    tensors1 = python_input.python_input(generator1, int_features)
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError("invalid literal"):
+        # Can't convert a string to an integer
+        sess.run(tensors1)
+
+    generator2 = lambda: iter([None])
+    tensors2 = python_input.python_input(generator2, int_features)
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError("generator must return dict"):
+        sess.run(tensors2)
+
+    generator3 = lambda: iter([{"value": [1, 2]}])
+    tensors3 = python_input.python_input(generator3, int_features)
+
+    with self.test_session() as sess:
+      with self.assertRaisesOpError("incompatible with declared shape"):
+        sess.run(tensors3)
+
+  def testGeneratorWorksWithBatching(self):
+    def simple_generator():
+      for i in range(5):
+        yield {"value": i, "ignored": 3}
+
+    simple_features = {
+        "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32)
+    }
+    tensors = python_input.python_input(simple_generator, simple_features)
+
+    # Request batches of size 4 at a time, the final batch may be smaller.
+    batched_tensors = core_input.batch(tensors, batch_size=4,
+                                       allow_smaller_final_batch=True)
+
+    self.assertEqual(["value"], batched_tensors.keys())
+    self.assertEqual(dtypes.int32, batched_tensors["value"].dtype)
+    self.assertEqual([None], batched_tensors["value"].shape.as_list())
+
+    with self.test_session() as sess:
+      # The generator emits 5 items total.  The first 4 are returned in
+      # the first session run; the final one is returned in the
+      # second.  This works because allow_smaller_final_batch=True.
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
+      r1 = sess.run(batched_tensors)
+      r2 = sess.run(batched_tensors)
+      self.assertAllEqual([0, 1, 2, 3], r1["value"])
+      self.assertEqual([4], r2["value"])
+      with self.assertRaisesOpError("Iteration finished"):
+        sess.run(tensors)
+      coord.request_stop()
+      for thread in threads:
+        thread.join()
+
+  def testGeneratorWorksWithManyBatchingThreads(self):
+    def simple_generator():
+      for i in range(5000):
+        yield {"value": i, "ignored": 3}
+
+    simple_features = {
+        "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32)
+    }
+    tensors = python_input.python_input(simple_generator, simple_features)
+
+    # Request batches of size 20 at a time, the final batch may be smaller.
+    _, batched_tensors = bucket_ops.bucket(
+        tensors, which_bucket=tensors["value"] % 5,
+        batch_size=20, num_buckets=5, num_threads=7, capacity=17,
+        allow_smaller_final_batch=True)
+
+    self.assertEqual(["value"], batched_tensors.keys())
+    self.assertEqual(dtypes.int32, batched_tensors["value"].dtype)
+    self.assertEqual([None], batched_tensors["value"].shape.as_list())
+
+    with self.test_session() as sess:
+      # The generator emits 5 items total.  The first 4 are returned in
+      # the first session run; the final one is returned in the
+      # second.  This works because allow_smaller_final_batch=True.
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
+      results = []
+      while True:
+        try:
+          r = sess.run(batched_tensors)
+          results.extend(r["value"].tolist())
+        except errors.OutOfRangeError:
+          break
+      coord.request_stop()
+      for thread in threads:
+        thread.join()
+    self.assertEqual(sorted(results),
+                     list(range(5000)))
+
+  def testVaryingFieldsInGenerator(self):
+    def simple_generator():
+      for i in range(2):
+        yield {"value": i,
+               "seqlen_value": np.ones((i, 1))}
+
+    simple_features = {
+        "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32),
+        "seqlen_value": parsing_ops.FixedLenSequenceFeature(
+            shape=[1], dtype=dtypes.float32, allow_missing=True),
+        "empty_value": parsing_ops.FixedLenFeature(
+            default_value=[-1, -2], dtype=dtypes.int32, shape=[2])
+    }
+    tensors = python_input.python_input(simple_generator, simple_features)
+    self.assertEqual(
+        set(["value", "seqlen_value", "empty_value"]), set(tensors.keys()))
+    self.assertEqual(dtypes.int32, tensors["value"].dtype)
+    self.assertEqual((), tensors["value"].shape)
+    self.assertEqual(dtypes.float32, tensors["seqlen_value"].dtype)
+    self.assertEqual([None, 1], tensors["seqlen_value"].shape.as_list())
+    self.assertEqual(dtypes.int32, tensors["empty_value"].dtype)
+    self.assertEqual([2], tensors["empty_value"].shape)
+
+    with self.test_session() as sess:
+      r1 = sess.run(tensors)
+      self.assertAllEqual(0, r1["value"])
+      self.assertAllEqual(np.ones((0, 1)), r1["seqlen_value"])
+      self.assertAllEqual([-1, -2], r1["empty_value"])
+
+      r2 = sess.run(tensors)
+      self.assertAllEqual(1, r2["value"])
+      self.assertAllEqual([[1]], r2["seqlen_value"])
+      self.assertAllEqual([-1, -2], r2["empty_value"])
+
+      with self.assertRaisesOpError("Iteration finished"):
+        sess.run(tensors)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py
index bf1d2c8cad3fe73f436d4bf6c17c6bffff47746b..410c0a9a6b9bcd1bdbd6475edfb2f4fe52e7cc1d 100644
--- a/tensorflow/contrib/training/python/training/sampling_ops.py
+++ b/tensorflow/contrib/training/python/training/sampling_ops.py
@@ -26,7 +26,6 @@ from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
 from tensorflow.python.training import input as input_ops
 
 __all__ = [
@@ -264,7 +263,7 @@ def _estimate_data_distribution(labels, num_classes, smoothing_constant=10):
   # slower convergence.
   if smoothing_constant <= 0:
     raise ValueError('smoothing_constant must be nonzero.')
-  num_examples_per_class_seen = variables.Variable(
+  num_examples_per_class_seen = variable_scope.variable(
       initial_value=[smoothing_constant] * num_classes,
       trainable=False,
       name='class_count',
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 7b85f998ea8e233a4ed24c60f4ccf41b1b9adf6d..9312070e52b6c43a413f5e36df773035651f5868 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -344,7 +344,7 @@ def _prepare_sequence_inputs(inputs, states):
   key = _check_rank(inputs.key, 0)
 
   if length.dtype != dtypes.int32:
-    raise TypeError("length dtype must be int32, but recieved: %s" %
+    raise TypeError("length dtype must be int32, but received: %s" %
                     length.dtype)
   if key.dtype != dtypes.string:
     raise TypeError("key dtype must be string, but received: %s" % key.dtype)
@@ -1443,6 +1443,7 @@ def batch_sequences_with_states(input_key,
       input_length = input_length if input_length is not None else length
     elif input_sequences:
       # Assert that value_length is a multiple of num_unroll.
+      checked_input_sequences = {}
       for key, value in input_sequences.items():
         if (isinstance(value, sparse_tensor.SparseTensor) or
             isinstance(value, sparse_tensor.SparseTensorValue)):
@@ -1460,11 +1461,13 @@ def batch_sequences_with_states(input_key,
                           ", but saw value: ",
                           string_ops.as_string(value_length),
                           ". Consider setting pad=True."])])]):
-            input_sequences[key] = sparse_tensor.SparseTensor(
-                indices=value.indices,
+            checked_input_sequences[key] = sparse_tensor.SparseTensor(
+                indices=array_ops.identity(
+                    value.indices, name="multiple_of_checked"),
                 values=array_ops.identity(
                     value.values, name="multiple_of_checked"),
-                dense_shape=value.dense_shape)
+                dense_shape=array_ops.identity(
+                    value.dense_shape, name="multiple_of_checked"))
         else:
           if not isinstance(value, ops.Tensor):
             try:
@@ -1490,9 +1493,9 @@ def batch_sequences_with_states(input_key,
                       ])
                   ])
           ]):
-            input_sequences[key] = array_ops.identity(
+            checked_input_sequences[key] = array_ops.identity(
                 value, name="multiple_of_checked")
-
+      input_sequences = checked_input_sequences
     # Move SparseTensors in context into input_sequences.
     _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll)
     # Deconstruct SparseTensors in sequence into a dense Tensor before inputting
@@ -1670,7 +1673,7 @@ def _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll):
     shape = array_ops.concat(
         [array_ops.expand_dims(value_length, 0), sp_tensor.dense_shape], 0)
 
-    # Construct new indices by mutliplying old ones and prepending [0, n).
+    # Construct new indices by multiplying old ones and prepending [0, n).
     # First multiply indices n times along a newly created 0-dimension.
     multiplied_indices = array_ops.tile(
         array_ops.expand_dims(sp_tensor.indices, 0),
@@ -1691,7 +1694,6 @@ def _move_sparse_tensor_out_context(input_context, input_sequences, num_unroll):
     ind = array_ops.expand_dims(ind, 1)
     ind = array_ops.expand_dims(ind, 2)
     ind = array_ops.tile(ind, [1, dim0, 1])
-    array_ops.reshape(ind, array_ops.stack([n, dim0, 1]))
 
     # Concatenate both and reshape.
     indices = array_ops.concat([ind, multiplied_indices], 2)
diff --git a/tensorflow/contrib/training/python/training/training_test.py b/tensorflow/contrib/training/python/training/training_test.py
index e7c8fcd2a09e8579da0cc8e15db75c14394c18a2..0af79cf2e3613eabfa64991ee94809974d777c33 100644
--- a/tensorflow/contrib/training/python/training/training_test.py
+++ b/tensorflow/contrib/training/python/training/training_test.py
@@ -508,7 +508,7 @@ class TrainTest(test.TestCase):
         # Initialize the variables.
         session.run(variables_lib2.global_variables_initializer())
 
-        # Get the intial weights and biases values.
+        # Get the initial weights and biases values.
         weights_values, biases_values = session.run([weights, biases])
         self.assertGreater(np.linalg.norm(weights_values), 0)
         self.assertAlmostEqual(np.linalg.norm(biases_values), 0)
diff --git a/tensorflow/contrib/training/python/training/tuner.py b/tensorflow/contrib/training/python/training/tuner.py
index b724c84c54a4ee5d682a9f9e8bd21d00fd176609..8843632619f0881f888ca76c9de484f081786b19 100644
--- a/tensorflow/contrib/training/python/training/tuner.py
+++ b/tensorflow/contrib/training/python/training/tuner.py
@@ -29,11 +29,11 @@ class Tuner(object):
 
   Example:
   ```
-    def _create_my_experiment(config, hparams):
+    def _create_my_experiment(run_config, hparams):
       hidden_units = [hparams.unit_per_layer] * hparams.num_hidden_layers
 
       return tf.contrib.learn.Experiment(
-          estimator=DNNClassifier(config=config, hidden_units=hidden_units),
+          estimator=DNNClassifier(config=run_config, hidden_units=hidden_units),
           train_input_fn=my_train_input,
           eval_input_fn=my_eval_input)
 
@@ -79,7 +79,7 @@ class Tuner(object):
 
     Args:
       experiment_fn: A function that creates an `Experiment`. It should accept
-        an argument `config` which should be used to create the `Estimator`
+        an argument `run_config` which should be used to create the `Estimator`
         (passed as `config` to its constructor), and an argument `hparams`,
         which should be used for hyper-parameters tuning. It must return an
         `Experiment`.
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e747fa4c9e481064226a2f58356d1d4ade4a740d
--- /dev/null
+++ b/tensorflow/contrib/verbs/BUILD
@@ -0,0 +1,168 @@
+# Description:
+#   Verbs RDMA communication interfaces and implementations for TensorFlow.
+
+package(default_visibility = [
+    "//tensorflow:__subpackages__",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+# For platform specific build config
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_cc",
+)
+
+tf_proto_library_cc(
+    name = "verbs_service_proto",
+    srcs = ["verbs_service.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+cc_library(
+    name = "verbs_util",
+    srcs = ["verbs_util.cc"],
+    hdrs = ["verbs_util.h"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "grpc_verbs_service",
+    srcs = ["grpc_verbs_service.cc"],
+    hdrs = ["grpc_verbs_service.h"],
+    deps = [
+        ":grpc_verbs_service_impl",
+        ":rdma_mgr",
+        ":verbs_service_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime:session_mgr",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime/rpc:async_service_interface",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_call",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "@grpc//:grpc++_unsecure",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "grpc_verbs_service_impl",
+    srcs = ["grpc_verbs_service_impl.cc"],
+    hdrs = ["grpc_verbs_service_impl.h"],
+    deps = [
+        ":verbs_service_proto_cc",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+cc_library(
+    name = "grpc_verbs_client",
+    srcs = ["grpc_verbs_client.cc"],
+    hdrs = ["grpc_verbs_client.h"],
+    deps = [
+        ":grpc_verbs_service_impl",
+        ":verbs_service_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime:call_options",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "rdma_rendezvous_mgr",
+    srcs = ["rdma_rendezvous_mgr.cc"],
+    hdrs = ["rdma_rendezvous_mgr.h"],
+    deps = [
+        ":rdma_mgr",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:worker_env",
+    ],
+)
+
+cc_library(
+    name = "rdma_mgr",
+    srcs = ["rdma_mgr.cc"],
+    hdrs = ["rdma_mgr.h"],
+    deps = [
+        ":grpc_verbs_client",
+        ":rdma",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
+    ],
+)
+
+cc_library(
+    name = "rdma",
+    srcs = ["rdma.cc"],
+    hdrs = ["rdma.h"],
+    linkopts = select({
+        "//tensorflow:with_verbs_support": ["-libverbs"],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":verbs_util",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime:rendezvous_mgr_interface",
+        "//tensorflow/core/distributed_runtime:session_mgr",
+        "//tensorflow/core/distributed_runtime:worker_env",
+    ],
+)
+
+cc_library(
+    name = "verbs_server_lib",
+    srcs = ["verbs_server_lib.cc"],
+    hdrs = ["verbs_server_lib.h"],
+    linkstatic = 1,  # Seems to be needed since alwayslink is broken in bazel
+    deps = [
+        ":grpc_verbs_service",
+        ":rdma_mgr",
+        ":rdma_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..da5f2b0223bc6698e750ebbc3307d70ee1535478
--- /dev/null
+++ b/tensorflow/contrib/verbs/README.md
@@ -0,0 +1,77 @@
+## How to compile and use RDMA-enabled TensorFlow
+1. Follow the regular TF compilation instructions. During configure step, if you want ibverbs based RDMA support, answer yes to this question:
+
+    ```Do you wish to build TensorFlow with VERBS-RDMA support [y/N]```
+
+2. To turn on RDMA connection, add the protocol "grpc+verbs" in server definition:
+
+    ```server = tf.train.Server(cluster, job_name="local", task_index=0, protocol='grpc+verbs') # default protocol is 'grpc'```
+
+## Overview
+The design is based on TensorFlow r1.0. An RDMA path is added between servers for tensor transfer (weights, gradients, etc). The existing GRPC path remains and is responsible for "administrative" tasks, such as setting up the RDMA path, exchanging computation graphs, etc.
+
+During the server setup, an RDMA manager is created to manage low-level RDMA components such as RDMA channel and RDMA adapter, an RDMA rendezvous manager is created to oversee send/recv operations between servers. Following the distributed TensorFlow design philosophy, the send operation is passive, i.e. merely placing a tensor in the local out-going table. It is the receive operation that actually initiates the tensor transfer.
+
+TensorFlow dynamically allocates memory for tensors that are to be sent or received. This causes difficulty for RDMA operations where pinned memory is required. Two remedies are possible, either the memory is pinned, transfer, then unpinned for each and every tensor to be transferred, or a buffer is pre-allocated and pinned for each tensor. The former incurs significant operation overhead since pinning and unpinning memory for each dynamically generated tensor is slow. The latter incurs large memory overhead and extra copying from the tensor to its pinned buffer, but may still be faster than the former. The second approach is adopted in this design. Each RDMA channel, representing a RDMA connection to a peer, contains a table of pinned buffers for all the seen tensors that requires transfer. It is assumed that the tensor size rarely changes across different steps. So only one buffer is created for the same tensor across all the steps. In the rare case when the tensor size does increases, the old buffer is discarded and new buffer of larger size is created and pinned.
+
+When a tensor is prepared for transfer, it is first converted to TensorProto, then the proto is serialized to byte array and copied to the pinned buffer. The content of the buffer is transferred to the remote node via RDMA write. On the remote side, the process is reversed. This is illustrated in the diagram below. The conversion of TensorProto is introduced to simplify transfer of string-tensors. Also since the TensorProto lives in host memory, even if the origin tensor lives in the device, the pinned buffers are all allocated in the host memory.
+![TensorFlow RDMA path](./design_diagram.png)
+
+The following improvements can be made in the future. First, conversion to TensorProto and serialization can be avoided for numeric (float/int) tensors since their internal buffer can be access directly as byte array. Second, the pinned buffer may be allocated on device if the tensor is located in the device. This avoids extra device-to-host copy at the expense of extra device memory consumption.
+## Design details
+
+### RDMA components
+
+* **RDMA adapter:** The base for RDMA communications. It may contain multiple channels and buffers.  It is responsible for handling various incoming RDMA messages.
+* **RDMA channel:** Responsible for RDMA connection to a particular node. It manages multiple buffers. A channel has a callback table which stores all the callbacks for the requested tensors.
+* **RDMA buffer:** Responsible for sending or receiving data. It has a fixed size memory to store the data. It has a queue to store the pending jobs. There are three types of buffers, message buffer, ACK buffer and tensor buffer. A channel has two message buffers, two ack buffers and many tensor buffers.
+* **RDMA manager:** Manages the adapter and channels, including channel creation, channel setup via GRPC service, channel lookup, etc.
+* **RDMA rendezvous manager:** manages multiple rdma rendezvous. 
+* **RDMA rendezvous:** a derived class of BaseRemoteRendezvous. This class is the back end for "send" and "recv" ops. When the sendrecv_op wants to send or receive a tensor, it calls the rendezvous' "send" and "recv" functions respectively. Rendezvous are identified by "step_id", a random number, so that tensors for different iterations don't get mixed up.
+
+### The SEND operation
+
+In TensorFlow, when rendezvous sends a tensor, it merely puts a tensor in a local table in the corresponding rendezvous. If the tensor has been requested, a callback exists in the table. "send" will activate the callback, which tries to send the tensor across the node.
+
+
+### The RECV operation
+
+When a tensor is requested, rendezvous' recv function is called. The function first places a callback in the channel's callback table, which will be activated once the tensor is sent from the source. In the next step, a message is sent to notify the source of the requested tensor. Once the source receives the message, it will check locally for the tensor, if not found, a callback is placed in the table, otherwise, the tensor id will be placed at corresponding RDMA buffer's job queue for future transmission. When a tensor is scheduled to be transmitted, the RDMA buffer needs to have the memory allocated and initialized (registered with the remote buffer info). If the memory is not ready, the transmission is deferred, a message is sent to the destination to establish the memory first. The other case a transmission can be deferred is when the buffer is still being used by an on-going transmission.
+
+### Three types of RDMA buffers
+
+* **Message buffer:** responsible for sending message only.
+* **Ack buffer:** once a message is sent, the recipient needs to send an ack via the ack buffer to free up the message buffer. An ack buffer is exclusively for its coupled message buffer.
+* **Tensor buffer:** responsible for sending tensors. The recipient needs to send back a message to free up the sending buffer.
+
+### RDMA packet format
+
+|type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|data_type|tensor_shape|tensor_bytes|tensor_buffer|
+
+### Six types of RDMA messages
+* RDMA_MESSAGE_ACK
+* RDMA_MESSAGE_BUFFER_IDLE
+* RDMA_MESSAGE_BUFFER_REQUEST
+* RDMA_MESSAGE_BUFFER_RESPONSE
+* RDMA_MESSAGE_TENSOR_REQUEST
+* RDMA_MESSAGE_TENSOR_WRITE
+
+### Actions upon receiving RDMA messages
+* RDMA_MESSAGE_ACK
+  * sender: mark local ack buffer idle.
+  * receiver: mark remote message buffer idle, send next item.
+* RDMA_MESSAGE_BUFFER_IDLE
+  * sender: mark local message buffer idle, send next item.
+  * receiver: send ack, set remote tensor buffer idle, send next item.
+* RDMA_MESSAGE_BUFFER_REQUEST
+  * sender: mark local message buffer idle, send next item.
+  * receiver: send ack, find or create tensor buffer, send BUFFER_RESPONSE.
+* RDMA_MESSAGE_BUFFER_RESPONSE
+  * sender: mark local message buffer idle, send next item.
+  * receiver: send ack, set remote buffer info, set local and remote buffer idle, send next item.
+* RDMA_MESSAGE_TENSOR_REQUEST
+  * sender: mark local message buffer idle, send next item.
+  * receiver: send ack, find or create tensor buffer, enqueue tensor id, send next item.
+* RDMA_MESSAGE_TENSOR_WRITE
+  * sender: mark local message buffer idle, send next item.
+  * receiver: run callback.
diff --git a/tensorflow/contrib/verbs/design_diagram.png b/tensorflow/contrib/verbs/design_diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..f0ad27455fa72bbdd8018bd3977378d2aee468e7
Binary files /dev/null and b/tensorflow/contrib/verbs/design_diagram.png differ
diff --git a/tensorflow/contrib/verbs/grpc_verbs_client.cc b/tensorflow/contrib/verbs/grpc_verbs_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..608a9140d3dc55b268c41558b21f7c1194a5aa6b
--- /dev/null
+++ b/tensorflow/contrib/verbs/grpc_verbs_client.cc
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/verbs/grpc_verbs_client.h"
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+Status GrpcVerbsClient::GetRemoteAddress(CallOptions* call_options,
+                                         const GetRemoteAddressRequest* request,
+                                         GetRemoteAddressResponse* response) {
+  ::grpc::ClientContext ctx;
+  ctx.set_fail_fast(false);
+  SetDeadline(&ctx, call_options->GetTimeout());
+  return FromGrpcStatus(stub_->GetRemoteAddress(&ctx, *request, response));
+}
+
+Status GrpcVerbsClient::GetRemoteAddress(const GetRemoteAddressRequest* request,
+                                         GetRemoteAddressResponse* response) {
+  CallOptions call_options;
+  call_options.SetTimeout(-1);  // no time out
+  return GetRemoteAddress(&call_options, request, response);
+}
+
+void GrpcVerbsClient::SetDeadline(::grpc::ClientContext* ctx,
+                                  int64 time_in_ms) {
+  if (time_in_ms > 0) {
+    ctx->set_deadline(gpr_time_from_millis(time_in_ms, GPR_TIMESPAN));
+  }
+}
+
+}  // namespace tensorflow
\ No newline at end of file
diff --git a/tensorflow/contrib/verbs/grpc_verbs_client.h b/tensorflow/contrib/verbs/grpc_verbs_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..358977f92543e1a38b594cf45cdbff34f89277be
--- /dev/null
+++ b/tensorflow/contrib/verbs/grpc_verbs_client.h
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
+
+#include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h"
+#include "tensorflow/contrib/verbs/verbs_service.pb.h"
+#include "tensorflow/core/distributed_runtime/call_options.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// GrpcVerbsClient is a client that uses gRPC to talk to the Verbs service.
+class GrpcVerbsClient {
+ public:
+  explicit GrpcVerbsClient(SharedGrpcChannelPtr client_channel)
+      : stub_(grpc::VerbsService::NewStub(client_channel)) {}
+  ~GrpcVerbsClient() {}
+
+  Status GetRemoteAddress(CallOptions* call_options,
+                          const GetRemoteAddressRequest* request,
+                          GetRemoteAddressResponse* response);
+  Status GetRemoteAddress(const GetRemoteAddressRequest* request,
+                          GetRemoteAddressResponse* response);
+
+ private:
+  std::unique_ptr<grpc::VerbsService::Stub> stub_;
+
+  void SetDeadline(::grpc::ClientContext* ctx, int64 time_in_ms);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GrpcVerbsClient);
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_CLIENT_H_
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.cc b/tensorflow/contrib/verbs/grpc_verbs_service.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f2af6b79fba6a480afbfe88fcbefcbf8a6670ce6
--- /dev/null
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.cc
@@ -0,0 +1,169 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include "grpc++/alarm.h"
+#include "grpc++/grpc++.h"
+#include "grpc++/server_builder.h"
+
+#include "tensorflow/contrib/verbs/grpc_verbs_service.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/distributed_runtime/session_mgr.h"
+
+namespace tensorflow {
+
+GrpcVerbsService::GrpcVerbsService(const WorkerEnv* worker_env,
+                                   ::grpc::ServerBuilder* builder)
+    : is_shutdown_(false), worker_env_(worker_env) {
+  builder->RegisterService(&verbs_service_);
+  cq_ = builder->AddCompletionQueue().release();
+}
+
+GrpcVerbsService::~GrpcVerbsService() {
+  delete shutdown_alarm_;
+  delete cq_;
+}
+
+void GrpcVerbsService::Shutdown() {
+  bool did_shutdown = false;
+  {
+    mutex_lock l(shutdown_mu_);
+    if (!is_shutdown_) {
+      LOG(INFO) << "Shutting down GrpcWorkerService.";
+      is_shutdown_ = true;
+      did_shutdown = true;
+    }
+  }
+  if (did_shutdown) {
+    shutdown_alarm_ =
+        new ::grpc::Alarm(cq_, gpr_now(GPR_CLOCK_MONOTONIC), nullptr);
+  }
+}
+
+// This macro creates a new request for the given RPC method name
+// (e.g., `ENQUEUE_REQUEST(GetRemoteAddress, false);`), and enqueues it on
+// `this->cq_`.
+//
+// This macro is invoked one or more times for each RPC method to
+// ensure that there are sufficient completion queue entries to
+// handle incoming requests without blocking.
+//
+// The implementation of the request handler for each RPC method
+// must ensure that it calls ENQUEUE_REQUEST() for that RPC method,
+// to keep accepting new requests.
+#define ENQUEUE_REQUEST(method, supports_cancel)                             \
+  do {                                                                       \
+    mutex_lock l(shutdown_mu_);                                              \
+    if (!is_shutdown_) {                                                     \
+      Call<GrpcVerbsService, grpc::VerbsService::AsyncService,               \
+           method##Request, method##Response>::                              \
+          EnqueueRequest(&verbs_service_, cq_,                               \
+                         &grpc::VerbsService::AsyncService::Request##method, \
+                         &GrpcVerbsService::method##Handler,                 \
+                         (supports_cancel));                                 \
+    }                                                                        \
+  } while (0)
+
+// This method blocks forever handling requests from the completion queue.
+void GrpcVerbsService::HandleRPCsLoop() {
+  for (int i = 0; i < 10; ++i) {
+    ENQUEUE_REQUEST(GetRemoteAddress, false);
+  }
+
+  void* tag;
+  bool ok;
+
+  while (cq_->Next(&tag, &ok)) {
+    UntypedCall<GrpcVerbsService>::Tag* callback_tag =
+        static_cast<UntypedCall<GrpcVerbsService>::Tag*>(tag);
+    if (callback_tag) {
+      callback_tag->OnCompleted(this, ok);
+    } else {
+      cq_->Shutdown();
+    }
+  }
+}
+
+void GrpcVerbsService::GetRemoteAddressHandler(
+    WorkerCall<GetRemoteAddressRequest, GetRemoteAddressResponse>* call) {
+  Status s = GetRemoteAddressSync(&call->request, &call->response);
+  call->SendResponse(ToGrpcStatus(s));
+  ENQUEUE_REQUEST(GetRemoteAddress, false);
+}
+
+// synchronous method
+Status GrpcVerbsService::GetRemoteAddressSync(
+    const GetRemoteAddressRequest* request,
+    GetRemoteAddressResponse* response) {
+  // analyzing request
+  // the channel setting part is redundant.
+  const string remote_host_name = request->host_name();
+  RdmaChannel* rc = rdma_mgr_->FindChannel(remote_host_name);
+  CHECK(rc);
+  RdmaAddress ra;
+  ra.lid = request->channel().lid();
+  ra.qpn = request->channel().qpn();
+  ra.psn = request->channel().psn();
+  ra.snp = request->channel().snp();
+  ra.iid = request->channel().iid();
+  rc->SetRemoteAddress(ra, false);
+  rc->Connect();
+  int i = 0;
+  int idx[] = {1, 0, 3, 2};
+  std::vector<RdmaBuffer*> mb(rc->message_buffers());
+  CHECK_EQ(request->mr_size(), 4);
+  for (const auto& mr : request->mr()) {
+    // the connections are crossed, i.e.
+    // local tx_message_buffer <---> remote rx_message_buffer_
+    // local rx_message_buffer <---> remote tx_message_buffer_
+    // local tx_ack_buffer <---> remote rx_ack_buffer_
+    // local rx_ack_buffer <---> remote tx_ack_buffer_
+    // hence idx[] = {1, 0, 3, 2}.
+    RdmaBuffer* rb = mb[idx[i]];
+    RemoteMR rmr;
+    rmr.remote_addr = mr.remote_addr();
+    rmr.rkey = mr.rkey();
+    rb->SetRemoteMR(rmr, false);
+    i++;
+  }
+  CHECK(i == RdmaChannel::kNumMessageBuffers);
+
+  // setting up response
+  response->set_host_name(
+      worker_env_->session_mgr->LegacySession()->worker_name);
+  Channel* channel_info = response->mutable_channel();
+  channel_info->set_lid(rc->self().lid);
+  channel_info->set_qpn(rc->self().qpn);
+  channel_info->set_psn(rc->self().psn);
+  channel_info->set_snp(rc->self().snp);
+  channel_info->set_iid(rc->self().iid);
+  for (int i = 0; i < RdmaChannel::kNumMessageBuffers; i++) {
+    MemoryRegion* mr = response->add_mr();
+    mr->set_remote_addr(reinterpret_cast<uint64>(mb[i]->buffer()));
+    mr->set_rkey(mb[i]->self()->rkey);
+  }
+  return Status::OK();
+}
+
+// Create a GrpcVerbsService, then assign it to a given handle.
+void SetNewVerbsService(GrpcVerbsService** handle, const WorkerEnv* worker_env,
+                        ::grpc::ServerBuilder* builder) {
+  *handle = new GrpcVerbsService(worker_env, builder);
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.h b/tensorflow/contrib/verbs/grpc_verbs_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa509602b51e7749547f1ff8eb5193acd1a3ec65
--- /dev/null
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.h
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h"
+#include "tensorflow/contrib/verbs/rdma_mgr.h"
+#include "tensorflow/contrib/verbs/verbs_service.pb.h"
+#include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
+#include "tensorflow/core/lib/core/refcount.h"
+
+namespace grpc {
+class ServerBuilder;
+class ServerCompletionQueue;
+class Alarm;
+}  // namespace grpc
+
+namespace tensorflow {
+
+class GrpcVerbsService : public AsyncServiceInterface {
+ public:
+  GrpcVerbsService(const WorkerEnv* worker_env, ::grpc::ServerBuilder* builder);
+  ~GrpcVerbsService();
+  void HandleRPCsLoop() override;
+  void Shutdown() override;
+  void SetRdmaMgr(RdmaMgr* rdma_mgr) { rdma_mgr_ = rdma_mgr; }
+
+ private:
+  template <class RequestMessage, class ResponseMessage>
+  using WorkerCall = Call<GrpcVerbsService, grpc::VerbsService::AsyncService,
+                          RequestMessage, ResponseMessage>;
+  void GetRemoteAddressHandler(
+      WorkerCall<GetRemoteAddressRequest, GetRemoteAddressResponse>* call);
+  Status GetRemoteAddressSync(const GetRemoteAddressRequest* request,
+                              GetRemoteAddressResponse* response);
+
+  ::grpc::ServerCompletionQueue* cq_;
+  grpc::VerbsService::AsyncService verbs_service_;
+  mutex shutdown_mu_;
+  bool is_shutdown_ GUARDED_BY(shutdown_mu_);
+  ::grpc::Alarm* shutdown_alarm_;
+  // not owned
+  RdmaMgr* rdma_mgr_;
+  const WorkerEnv* const worker_env_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GrpcVerbsService);
+};
+
+// Create a GrpcVerbsService, then assign it to a given handle.
+void SetNewVerbsService(GrpcVerbsService** handle, const WorkerEnv* worker_env,
+                        ::grpc::ServerBuilder* builder);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_GRPC_VERBS_SERVICE_H_
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc b/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0ba78dbfd537f6ae56627c42d5d302a5fbfbd36
--- /dev/null
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
@@ -0,0 +1,68 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h"
+
+#include "grpc++/impl/codegen/async_stream.h"
+#include "grpc++/impl/codegen/async_unary_call.h"
+#include "grpc++/impl/codegen/channel_interface.h"
+#include "grpc++/impl/codegen/client_unary_call.h"
+#include "grpc++/impl/codegen/method_handler_impl.h"
+#include "grpc++/impl/codegen/rpc_service_method.h"
+#include "grpc++/impl/codegen/service_type.h"
+#include "grpc++/impl/codegen/sync_stream.h"
+
+namespace tensorflow {
+
+namespace grpc {
+
+static const char* grpcVerbsService_method_names[] = {
+    "/tensorflow.VerbsService/GetRemoteAddress",
+};
+
+std::unique_ptr<VerbsService::Stub> VerbsService::NewStub(
+    const std::shared_ptr< ::grpc::ChannelInterface>& channel,
+    const ::grpc::StubOptions& options) {
+  std::unique_ptr<VerbsService::Stub> stub(new VerbsService::Stub(channel));
+  return stub;
+}
+
+VerbsService::Stub::Stub(
+    const std::shared_ptr< ::grpc::ChannelInterface>& channel)
+    : channel_(channel),
+      rpcmethod_GetRemoteAddress_(grpcVerbsService_method_names[0],
+                                  ::grpc::RpcMethod::NORMAL_RPC, channel) {}
+
+::grpc::Status VerbsService::Stub::GetRemoteAddress(
+    ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
+    GetRemoteAddressResponse* response) {
+  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_GetRemoteAddress_,
+                                   context, request, response);
+}
+
+VerbsService::AsyncService::AsyncService() {
+  for (int i = 0; i < 1; ++i) {
+    AddMethod(new ::grpc::RpcServiceMethod(grpcVerbsService_method_names[i],
+                                           ::grpc::RpcMethod::NORMAL_RPC,
+                                           nullptr));
+    ::grpc::Service::MarkMethodAsync(i);
+  }
+}
+
+VerbsService::AsyncService::~AsyncService() {}
+
+}  // namespace grpc
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7ea774b661e70a1cd63d844f70f77b9c5bd10a2
--- /dev/null
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
@@ -0,0 +1,89 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
+
+#include "grpc++/impl/codegen/async_stream.h"
+#include "grpc++/impl/codegen/async_unary_call.h"
+#include "grpc++/impl/codegen/proto_utils.h"
+#include "grpc++/impl/codegen/rpc_method.h"
+#include "grpc++/impl/codegen/service_type.h"
+#include "grpc++/impl/codegen/status.h"
+#include "grpc++/impl/codegen/stub_options.h"
+#include "grpc++/impl/codegen/sync_stream.h"
+
+#include "tensorflow/contrib/verbs/verbs_service.pb.h"
+
+namespace grpc {
+class CompletionQueue;
+class Channel;
+class RpcService;
+class ServerCompletionQueue;
+class ServerContext;
+}  // namespace grpc
+
+namespace tensorflow {
+
+namespace grpc {
+
+// Implementation of `tensorflow.VerbsService`, based on the
+// definition in "//tensorflow/contrib/verbs/verbs_service.proto",
+// and the gRPC generated stub and service classes.
+// See the proto file for the definition of methods and messages.
+class VerbsService GRPC_FINAL {
+ public:
+  class StubInterface {
+   public:
+    virtual ~StubInterface() {}
+    virtual ::grpc::Status GetRemoteAddress(
+        ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
+        GetRemoteAddressResponse* response) = 0;
+  };
+  class Stub GRPC_FINAL : public StubInterface {
+   public:
+    Stub(const std::shared_ptr< ::grpc::ChannelInterface>& channel);
+    ::grpc::Status GetRemoteAddress(
+        ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
+        GetRemoteAddressResponse* response) GRPC_OVERRIDE;
+
+   private:
+    std::shared_ptr< ::grpc::ChannelInterface> channel_;
+    const ::grpc::RpcMethod rpcmethod_GetRemoteAddress_;
+  };
+  static std::unique_ptr<Stub> NewStub(
+      const std::shared_ptr< ::grpc::ChannelInterface>& channel,
+      const ::grpc::StubOptions& options = ::grpc::StubOptions());
+
+  class AsyncService : public ::grpc::Service {
+   public:
+    AsyncService();
+    virtual ~AsyncService();
+    void RequestGetRemoteAddress(
+        ::grpc::ServerContext* context, GetRemoteAddressRequest* request,
+        ::grpc::ServerAsyncResponseWriter<GetRemoteAddressResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(0, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+  };
+};
+
+}  // namespace grpc
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc687be0abb612825a4e1347fda0456c14a91d00
--- /dev/null
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -0,0 +1,888 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include "tensorflow/contrib/verbs/rdma.h"
+#include <cstdlib>
+#include "tensorflow/contrib/verbs/verbs_util.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
+#include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/framework/rendezvous.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+namespace {
+// hash name to 32-bit integer
+uint32_t NameHash(const string& name) {
+  return Hash32(name.data(), name.size(), 0x1234ABCD);
+}
+
+// convenience function for printing message
+string MessageTypeToString(RdmaMessageType rmt) {
+  switch (rmt) {
+    case RDMA_MESSAGE_ACK:
+      return "RDMA_MESSAGE_ACK";
+      break;
+    case RDMA_MESSAGE_BUFFER_IDLE:
+      return "RDMA_MESSAGE_BUFFER_IDLE";
+      break;
+    case RDMA_MESSAGE_BUFFER_REQUEST:
+      return "RDMA_MESSAGE_BUFFER_REQUEST";
+      break;
+    case RDMA_MESSAGE_BUFFER_RESPONSE:
+      return "RDMA_MESSAGE_BUFFER_RESPONSE";
+      break;
+    case RDMA_MESSAGE_TENSOR_REQUEST:
+      return "RDMA_MESSAGE_TENSOR_REQUEST";
+      break;
+    case RDMA_MESSAGE_TENSOR_WRITE:
+      return "RDMA_MESSAGE_TENSOR_WRITE";
+      break;
+    default:
+      return "UNKNOWN MESSAGE";
+  }
+}
+}  // namespace
+
+ibv_context* open_default_device() {
+  ibv_device** dev_list;
+  ibv_device* ib_dev;
+  dev_list = ibv_get_device_list(NULL);
+  CHECK(dev_list) << "No InfiniBand device found";
+  ib_dev = dev_list[0];
+  CHECK(ib_dev) << "No InfiniBand device found";
+  ibv_context* context = ibv_open_device(ib_dev);
+  CHECK(context) << "Open context failed for " << ibv_get_device_name(ib_dev);
+  return context;
+}
+
+ibv_pd* alloc_protection_domain(ibv_context* context) {
+  ibv_pd* pd = ibv_alloc_pd(context);
+  CHECK(pd) << "Failed to allocate protection domain";
+  return pd;
+}
+
+RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
+    : context_(open_default_device()),
+      pd_(alloc_protection_domain(context_)),
+      worker_env_(worker_env) {
+  event_channel_ = ibv_create_comp_channel(context_);
+  CHECK(event_channel_) << "Failed to create completion channel";
+  cq_ = ibv_create_cq(context_, MAX_CONCURRENT_WRITES * 2, NULL, event_channel_,
+                      0);
+  CHECK(cq_) << "Failed to create completion queue";
+  CHECK(!ibv_req_notify_cq(cq_, 0)) << "Failed to request CQ notification";
+  polling_thread_.reset(Env::Default()->StartThread(
+      ThreadOptions(), "RdmaAdapterCQThread", [this] { Process_CQ(); }));
+  VLOG(2) << "Start RdmaAdapter: " << name();
+}
+
+RdmaAdapter::~RdmaAdapter() {
+  polling_thread_.reset();
+  CHECK(!ibv_destroy_cq(cq_)) << "Failed to destroy CQ";
+  CHECK(!ibv_destroy_comp_channel(event_channel_))
+      << "Failed to destroy channel";
+  CHECK(!ibv_dealloc_pd(pd_)) << "Failed to deallocate PD";
+  CHECK(!ibv_close_device(context_)) << "Failed to release context";
+}
+
+string RdmaAdapter::name() const { return string(context_->device->name); }
+
+// Function to process incoming messages
+// There are two types of messages:
+// 1. IBV_WC_RECV_RDMA_WITH_IMM (receive)
+// 2. IBV_WC_RDMA_WRITE (send))
+void RdmaAdapter::Process_CQ() {
+  while (true) {
+    ibv_cq* cq;
+    void* cq_context;
+    CHECK(!ibv_get_cq_event(event_channel_, &cq, &cq_context));
+    CHECK(cq == cq_);
+    ibv_ack_cq_events(cq, 1);
+    CHECK(!ibv_req_notify_cq(cq_, 0));
+
+    int ne =
+        ibv_poll_cq(cq_, MAX_CONCURRENT_WRITES * 2, static_cast<ibv_wc*>(wc_));
+    CHECK_GE(ne, 0);
+    for (int i = 0; i < ne; ++i) {
+      CHECK(wc_[i].status == IBV_WC_SUCCESS)
+          << "Failed status \n"
+          << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
+          << static_cast<int>(wc_[i].wr_id) << " " << wc_[i].vendor_err;
+      if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+        RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_[i].wr_id);
+        // put back a recv wr.
+        rc->Recv();
+        // imm_data is the index of RX buffer in the buffer table.
+        uint32_t imm_data = wc_[i].imm_data;
+        RdmaBuffer* rb = rc->FindBuffer(imm_data);
+        RdmaMessage rm;
+        RdmaMessage::ParseMessage(rm, rb->buffer_);
+        VLOG(2) << "recv RDMA message: " << MessageTypeToString(rm.type_);
+
+        if (rm.type_ == RDMA_MESSAGE_ACK) {
+          // receive an ack to a message
+          rb = rc->tx_message_buffer_;
+          rb->SetBufferStatus(remote, idle);
+          rb->SendNextItem();
+        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST) {
+          // received a request-for-tensor message
+          // send ack to release remote tx message buffer
+          RdmaBuffer* ab = rc->tx_ack_buffer_;
+          ab->SendNextItem();
+          // find or create buffer
+          RdmaBuffer* tb = rc->FindOrCreateBuffer(rm.name_);
+          string key_with_step_id =
+              VerbsUtil::AppendStepidToKey(rm.name_, rm.step_id_);
+          tb->EnqueueItem(key_with_step_id);
+          // send the next tensor
+          worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); });
+        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_IDLE) {
+          // receive tensor-buffer-ready message
+          // send ack to release remote tx message buffer
+          RdmaBuffer* ab = rc->tx_ack_buffer_;
+          ab->SendNextItem();
+          // find buffer
+          RdmaBuffer* tb = rc->FindBuffer(rm.name_);
+          tb->SetBufferStatus(remote, idle);
+          worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); });
+        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) {
+          // remote host requests to create a tensor buffer;
+          // send ack to release remote tx message buffer
+          RdmaBuffer* ab = rc->tx_ack_buffer_;
+          ab->SendNextItem();
+          // find or create the buffer
+          RdmaBuffer* tb = rc->FindOrCreateBuffer(rm.name_, TENSOR);
+          RemoteMR rmr;
+          rmr.remote_addr = rm.remote_addr_;
+          rmr.rkey = rm.rkey_;
+          tb->SetRemoteMR(rmr, true);
+          tb->CreateCPUBuffer(rm.buffer_size_);
+          // create RDMA_MESSAGE_BUFFER_RESPONSE message
+          RdmaMessage br;
+          br.type_ = RDMA_MESSAGE_BUFFER_RESPONSE;
+          br.name_size_ = rm.name_.size();
+          br.name_ = rm.name_;
+          br.buffer_size_ = rm.buffer_size_;
+          br.remote_addr_ = reinterpret_cast<uint64_t>(tb->buffer_);
+          br.rkey_ = tb->self_->rkey;
+          string message = RdmaMessage::CreateMessage(br);
+          RdmaBuffer* mb = rc->tx_message_buffer_;
+          mb->EnqueueItem(message);
+          mb->SendNextItem();
+        } else if (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE) {
+          // remote creates a buffer and responds
+          // send ack to release remote tx message buffer
+          RdmaBuffer* ab = rc->tx_ack_buffer_;
+          ab->SendNextItem();
+          // find buffer
+          RdmaBuffer* tb = rc->FindBuffer(rm.name_);
+          CHECK(rm.buffer_size_ == tb->size_)
+              << "rm.buffer_size = " << rm.buffer_size_
+              << "tb->size_ = " << tb->size_ << "rm.name_ = " << rm.name_;
+          RemoteMR rmr;
+          rmr.remote_addr = rm.remote_addr_;
+          rmr.rkey = rm.rkey_;
+          tb->SetRemoteMR(rmr, true);
+          tb->SetBufferStatus(local, idle);
+          tb->SetBufferStatus(remote, idle);
+          worker_env_->compute_pool->Schedule([tb]() { tb->SendNextItem(); });
+        } else if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
+          // tensor RDMA write completed
+          worker_env_->compute_pool->Schedule([rm, rc]() {
+            string key_with_step_id =
+                VerbsUtil::AppendStepidToKey(rm.name_, rm.step_id_);
+            rc->RunRecvCallback(key_with_step_id);
+          });
+        }
+      } else if (wc_[i].opcode == IBV_WC_RDMA_WRITE) {
+        RdmaBuffer* rb = reinterpret_cast<RdmaBuffer*>(wc_[i].wr_id);
+        rb->SetBufferStatus(local, idle);
+        RdmaMessage rm;
+        RdmaMessage::ParseMessage(rm, rb->buffer_);
+        VLOG(2) << "sent RDMA message: " << MessageTypeToString(rm.type_);
+        if (rm.type_ != RDMA_MESSAGE_ACK) {
+          worker_env_->compute_pool->Schedule([rb]() { rb->SendNextItem(); });
+        }
+      }
+    }
+  }
+}
+
+RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
+                         const string remote_name)
+    : adapter_(adapter), local_name_(local_name), remote_name_(remote_name) {
+  // Create queue pair
+  {
+    struct ibv_qp_init_attr attr;
+    memset(&attr, 0, sizeof(ibv_qp_init_attr));
+    attr.send_cq = adapter_->cq_;
+    attr.recv_cq = adapter_->cq_;
+    attr.cap.max_send_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
+    attr.cap.max_recv_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
+    attr.cap.max_send_sge = 1;
+    attr.cap.max_recv_sge = 1;
+    attr.qp_type = IBV_QPT_RC;
+
+    qp_ = ibv_create_qp(adapter_->pd_, &attr);
+    CHECK(qp_) << "Failed to create queue pair";
+  }
+
+  // Init queue pair
+  {
+    struct ibv_qp_attr attr;
+    memset(&attr, 0, sizeof(ibv_qp_attr));
+    attr.qp_state = IBV_QPS_INIT;
+    attr.pkey_index = 0;
+    attr.port_num = 1;
+    attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
+    int mask =
+        IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;
+    CHECK(!ibv_modify_qp(qp_, &attr, mask)) << "Failed to set QP to INIT";
+  }
+
+  // Local address
+  {
+    struct ibv_port_attr attr;
+    CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &attr))
+        << "Query port";
+    self_.lid = attr.lid;
+    self_.qpn = qp_->qp_num;
+    self_.psn = static_cast<uint32_t>(random::New64()) & 0xffffff;
+    union ibv_gid gid;
+    CHECK(!ibv_query_gid(adapter_->context_, (uint8_t)1, 0, &gid))
+        << "Query gid";
+    self_.snp = gid.global.subnet_prefix;
+    self_.iid = gid.global.interface_id;
+  }
+
+  // create message and ack buffers, then initialize the tables.
+  {
+    const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer",
+                                   "tx_ack_buffer", "rx_ack_buffer"};
+    tx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[0]);
+    rx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[1]);
+    tx_ack_buffer_ = new RdmaAckBuffer(this, buffer_names[2]);
+    rx_ack_buffer_ = new RdmaAckBuffer(this, buffer_names[3]);
+    message_buffers_.reserve(kNumMessageBuffers);
+    message_buffers_.push_back(tx_message_buffer_);
+    message_buffers_.push_back(rx_message_buffer_);
+    message_buffers_.push_back(tx_ack_buffer_);
+    message_buffers_.push_back(rx_ack_buffer_);
+    // create buffer on host
+    tx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize);
+    rx_message_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaMessageBufferSize);
+    tx_ack_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaAckBufferSize);
+    rx_ack_buffer_->CreateCPUBuffer(RdmaMessage::kRdmaAckBufferSize);
+    // bt_mu_.lock() is not used in constructor.
+    for (int i = 0; i < kNumMessageBuffers; i++) {
+      uint32_t index = NameHash(buffer_names[i]);
+      buffer_table_.insert({index, message_buffers_[i]});
+      buffer_index_name_table_.insert({index, buffer_names[i]});
+      buffer_name_index_table_.insert({buffer_names[i], index});
+    }
+
+    // Initiate recv
+    for (int i = 0; i < 100; i++) {
+      Recv();
+    }
+  }
+}
+
+RdmaChannel::~RdmaChannel() {
+  CHECK(!ibv_destroy_qp(qp_)) << "Failed to destroy QP";
+  delete tx_message_buffer_;
+  delete rx_message_buffer_;
+  delete tx_ack_buffer_;
+  delete rx_ack_buffer_;
+}
+
+void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) {
+  mutex_lock lock{mu_};
+  if ((override) || (!remote_set_)) {
+    remote_.lid = ra.lid;
+    remote_.qpn = ra.qpn;
+    remote_.psn = ra.psn;
+    remote_.snp = ra.snp;
+    remote_.iid = ra.iid;
+    remote_set_ = true;
+  } else {
+    CHECK(remote_.lid == ra.lid);
+    CHECK(remote_.qpn == ra.qpn);
+    CHECK(remote_.psn == ra.psn);
+    CHECK(remote_.snp == ra.snp);
+    CHECK(remote_.iid == ra.iid);
+  }
+}
+
+// Adding tokens to the completion queue
+// Tokens are needed to process future messages.
+void RdmaChannel::Recv() {
+  struct ibv_recv_wr wr;
+  memset(&wr, 0, sizeof(wr));
+  wr.wr_id = (uint64_t)this;
+  struct ibv_recv_wr* bad_wr;
+  CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)) << "Failed to post recv";
+}
+
+// Lookup 32-bit buffer index from buffer name
+// Args:
+//   buffer_name: name of the buffer
+// Returns:
+//   32-bit index
+uint32_t RdmaChannel::LookupBufferIndex(const string& buffer_name) {
+  mutex_lock lock{bt_mu_};
+  BufferNameIndexTable::iterator iter =
+      buffer_name_index_table_.find(buffer_name);
+  CHECK(iter != buffer_name_index_table_.end());
+  return iter->second;
+}
+
+// Find a buffer by its 32-bit index
+// Args:
+//   index: 32-bit hash code of the tensor buffer name
+// Returns:
+//   name of the tensor buffer
+RdmaBuffer* RdmaChannel::FindBuffer(const uint32_t index) {
+  mutex_lock lock{bt_mu_};
+  BufferTable::iterator iter = buffer_table_.find(index);
+  CHECK(iter != buffer_table_.end());
+  return iter->second;
+}
+
+// Find a buffer by its name
+// Args:
+//   name: name of the buffer
+// Returns:
+//   the named rdma buffer
+RdmaBuffer* RdmaChannel::FindBuffer(const string& name) {
+  uint32_t index = LookupBufferIndex(name);
+  return FindBuffer(index);
+}
+
+// Find a buffer if it exists, otherwise create one.
+// The memory inside the created buffer is not allocated.
+// Args:
+//   name: the name of the buffer
+//   buffer_type: TENSOR, MESSAGE or ACK.
+// Returns:
+//   the named buffer
+RdmaBuffer* RdmaChannel::FindOrCreateBuffer(const string& name,
+                                            BufferType buffer_type) {
+  mutex_lock lock{bt_mu_};
+  RdmaBuffer* rb;
+  // find index
+  BufferNameIndexTable::iterator iter = buffer_name_index_table_.find(name);
+  if (iter != buffer_name_index_table_.end()) {
+    uint32_t index = iter->second;
+    // find buffer
+    BufferTable::iterator iter = buffer_table_.find(index);
+    CHECK(iter != buffer_table_.end());
+    rb = iter->second;
+  } else {
+    uint32_t index = NameHash(name);
+    if (buffer_type == TENSOR) {
+      rb = new RdmaTensorBuffer(this, name);
+    } else if (buffer_type == MESSAGE) {
+      rb = new RdmaMessageBuffer(this, name);
+    } else if (buffer_type == ACK) {
+      rb = new RdmaAckBuffer(this, name);
+    }
+    buffer_name_index_table_.insert({name, index});
+    buffer_index_name_table_.insert({index, name});
+    buffer_table_.insert({index, rb});
+  }
+  CHECK(rb);
+  return rb;
+}
+
+// Insert callback to the callback_table.
+// The callback is activated when the corresponding tensor is received.
+// Arg:
+//   key: the name of the tensor
+//   recv_done: the callback associated with the tensor.
+// Returns:
+//   None
+void RdmaChannel::InsertRecvCallback(const string& key,
+                                     std::function<void()> recv_done) {
+  mutex_lock lock{ct_mu_};
+  callback_table_.insert({key, recv_done});
+}
+
+// Remove callback from the callback_table.
+// Arg:
+//   key: the name of the tensor
+// Returns:
+//   None
+void RdmaChannel::RemoveRecvCallback(const string& key) {
+  mutex_lock lock{ct_mu_};
+  callback_table_.erase(key);
+}
+
+// Run named callback in the callback_table.
+// Arg:
+//   key: the name of the tensor
+// Returns:
+//   None
+void RdmaChannel::RunRecvCallback(const string& key) {
+  std::function<void()> recv_done;
+  {
+    mutex_lock lock{ct_mu_};
+    CallbackTable::iterator iter = callback_table_.find(key);
+    CHECK(iter != callback_table_.end());
+    recv_done = iter->second;
+  }
+  recv_done();
+}
+
+void RdmaChannel::Connect() {
+  {
+    mutex_lock lock{mu_};
+    CHECK(remote_set_) << "remote channel is not set";
+  }
+  Connect(remote_);
+}
+
+// Setup channel to a remote node
+// Args:
+//   remoteAddr: the rdma address of a remote channel.
+// Returns:
+//   None
+void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
+  mutex_lock lock{mu_};
+  if (!connected_) {
+    struct ibv_qp_attr attr;
+    memset(&attr, 0, sizeof(ibv_qp_attr));
+    attr.qp_state = IBV_QPS_RTR;
+    struct ibv_port_attr port_attr;
+    CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &port_attr))
+        << "Query port failed";
+    // This assumes both QP's ports are configured with the same MTU
+    attr.path_mtu = port_attr.active_mtu;
+    attr.dest_qp_num = remoteAddr.qpn;
+    attr.rq_psn = remoteAddr.psn;
+    attr.max_dest_rd_atomic = 1;
+    attr.min_rnr_timer = 12;
+    attr.ah_attr.is_global = 1;
+    attr.ah_attr.grh.dgid.global.subnet_prefix = remoteAddr.snp;
+    attr.ah_attr.grh.dgid.global.interface_id = remoteAddr.iid;
+    attr.ah_attr.grh.flow_label = 0;
+    attr.ah_attr.grh.hop_limit = 255;
+    attr.ah_attr.dlid = remoteAddr.lid;
+    attr.ah_attr.sl = 0;
+    attr.ah_attr.src_path_bits = 0;
+    attr.ah_attr.port_num = 1;
+
+    int r;
+    CHECK(!(r = ibv_modify_qp(qp_, &attr,
+                              IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
+                                  IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
+                                  IBV_QP_MAX_DEST_RD_ATOMIC |
+                                  IBV_QP_MIN_RNR_TIMER)))
+        << "QP to Ready to Receive " << r;
+
+    memset(&attr, 0, sizeof(ibv_qp_attr));
+    attr.qp_state = IBV_QPS_RTS;
+    attr.sq_psn = self_.psn;
+    attr.timeout = 14;
+    attr.retry_cnt = 7;
+    attr.rnr_retry = 7; /* infinite */
+    attr.max_rd_atomic = 1;
+
+    CHECK(!(r = ibv_modify_qp(qp_, &attr,
+                              IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
+                                  IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
+                                  IBV_QP_MAX_QP_RD_ATOMIC)))
+        << "QP to Ready to Send " << r;
+
+    connected_ = true;
+  } else {
+    LOG(INFO) << "channel already connected";
+  }
+}
+
+RdmaBuffer::RdmaBuffer(RdmaChannel* channel, string name)
+    : channel_(channel), name_(name) {}
+
+RdmaBuffer::~RdmaBuffer() {
+  CHECK(!ibv_dereg_mr(self_)) << "ibv_dereg_mr failed";
+  FreeBuffer();
+}
+
+void RdmaBuffer::FreeBuffer() {
+  if ((buffer_ != nullptr) && buffer_on_host_) {
+    free(buffer_);
+  }
+  // TODO
+  // release buffer if it is on device.
+  // We don't support RDMABuffer on device at this moment.
+}
+
+// Allocate CPU memory for the Rdma buffer
+// Args:
+//   size: to-be-allocated memory size
+//   lock: whether or not mutex_lock the process to protect concurrency.
+// Returns:
+//   None
+void RdmaBuffer::CreateCPUBuffer(size_t size, bool lock) {
+  CHECK(size > 0);
+  if (lock) {
+    mu_.lock();
+  }
+  if (local_status_ != none) {
+    // delete existing buffer
+    CHECK(!ibv_dereg_mr(self_)) << "ibv_dereg_mr failed";
+    FreeBuffer();
+  }
+  size_ = size;
+  buffer_ = malloc(size_);
+  self_ = ibv_reg_mr(channel_->adapter_->pd_, buffer_, size_,
+                     IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+  CHECK(self_) << "Failed to register memory region";
+  buffer_on_host_ = true;
+  local_status_ = idle;
+  if (lock) {
+    mu_.unlock();
+  }
+}
+
+// Set address of remote memory region
+// Args:
+//   rmr: address of remote memory region
+//   override: whether override existing information
+// Returns:
+//   None
+void RdmaBuffer::SetRemoteMR(RemoteMR rmr, bool override) {
+  mutex_lock lock{mu_};
+  if ((override) || (remote_status_ == none)) {
+    remote_.remote_addr = rmr.remote_addr;
+    remote_.rkey = rmr.rkey;
+    remote_status_ = idle;
+  } else {
+    CHECK(remote_.remote_addr == rmr.remote_addr);
+    CHECK(remote_.rkey == rmr.rkey);
+  }
+}
+
+// Put a task in the buffer's job queue
+void RdmaBuffer::EnqueueItem(string item) {
+  mutex_lock lock{mu_};
+  queue_.push(item);
+}
+
+// Rdma-Write the content of the buffer
+void RdmaBuffer::Write(uint32_t imm_data, size_t buffer_size) {
+  struct ibv_sge list;
+  list.addr = (uint64_t)buffer_;
+  list.length = buffer_size;
+  list.lkey = self_->lkey;
+
+  struct ibv_send_wr wr;
+  memset(&wr, 0, sizeof(wr));
+  wr.wr_id = (uint64_t)this;
+  wr.sg_list = &list;
+  wr.num_sge = 1;
+  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+  wr.send_flags = IBV_SEND_SIGNALED;
+  wr.imm_data = imm_data;
+  wr.wr.rdma.remote_addr = (uint64_t)remote_.remote_addr;
+  wr.wr.rdma.rkey = remote_.rkey;
+
+  struct ibv_send_wr* bad_wr;
+  CHECK(!ibv_post_send(channel_->qp_, &wr, &bad_wr)) << "Failed to post send";
+}
+
+RdmaAckBuffer::RdmaAckBuffer(RdmaChannel* channel, string name)
+    : RdmaBuffer(channel, name) {}
+
+RdmaMessageBuffer::RdmaMessageBuffer(RdmaChannel* channel, string name)
+    : RdmaBuffer(channel, name) {}
+
+RdmaTensorBuffer::RdmaTensorBuffer(RdmaChannel* channel, string name)
+    : RdmaBuffer(channel, name) {}
+
+// Send the next ack from the buffer's job queue.
+void RdmaAckBuffer::SendNextItem() {
+  uint32_t imm_data = LookupBufferIndex("rx_ack_buffer");
+  RdmaMessage rm;
+  rm.name_ = "rx_ack_buffer";
+  rm.type_ = RDMA_MESSAGE_ACK;
+  rm.name_size_ = rm.name_.size();
+  string message = RdmaMessage::CreateMessage(rm);
+  memcpy(buffer_, message.data(), message.size());
+  Write(imm_data, message.size());
+}
+
+// Send the next message from the buffer's job queue.
+void RdmaMessageBuffer::SendNextItem() {
+  uint32_t imm_data = LookupBufferIndex("rx_message_buffer");
+  mu_.lock();
+  if (!queue_.empty() && (local_status_ == idle) && (remote_status_ == idle)) {
+    local_status_ = busy;
+    remote_status_ = busy;
+    string message = queue_.front();
+    queue_.pop();
+    // local/remote_status_ won't be set back to idle
+    // unitl Write() is successful
+    mu_.unlock();
+    memcpy(buffer_, message.data(), message.size());
+    Write(imm_data, message.size());
+  } else {
+    mu_.unlock();
+  }
+}
+
+// Send the next tensor from the buffer's job queue.
+void RdmaTensorBuffer::SendNextItem() {
+  // get the key
+  string key_with_step_id = "";
+  {
+    mutex_lock lock{mu_};
+    if (!queue_.empty()) {
+      key_with_step_id = queue_.front();
+      queue_.pop();
+    }
+  }
+  // send the tensor if a key is acquired.
+  if (key_with_step_id != "") {
+    VLOG(2) << "try to send tensor: " << key_with_step_id;
+    string key;
+    int64 step_id;
+    VerbsUtil::GetKeyAndStepId(key_with_step_id, key, step_id);
+    CHECK(key.compare(name_) == 0);
+    Rendezvous::ParsedKey parsed;
+    Rendezvous::ParseKey(key, &parsed);
+    Rendezvous::DoneCallback cb = [this, key_with_step_id, key, step_id,
+                                   parsed](const Status& status,
+                                           const Rendezvous::Args& send_args,
+                                           const Rendezvous::Args& recv_args,
+                                           const Tensor& in, bool is_dead) {
+      CHECK(status.ok()) << "RecvLocalAsync was not ok, key" << key_with_step_id
+                         << " error message: " << status.error_message();
+      size_t buffer_size = RdmaMessage::kMessageTotalBytes;
+      size_t tensor_bytes = 0;
+      TensorProto proto;
+      // Figures out which device the tensor is hosted on.
+      Device* src_dev = nullptr;
+      Status s = channel_->adapter_->worker_env_->device_mgr->LookupDevice(
+          parsed.src_device, &src_dev);
+      CHECK(s.ok()) << "src device not found";
+      // Does the device have the right incarnation number we expect?
+      CHECK(src_dev->attributes().incarnation() == parsed.src_incarnation)
+          << "RecvTensor expects a different device incarnation: "
+          << parsed.src_incarnation << " vs. "
+          << src_dev->attributes().incarnation()
+          << ". Your worker job was probably restarted. Check your "
+          << "worker job for the reason why it was restarted.";
+      Device* dst_dev = nullptr;
+      // destination is on CPU.
+      s = channel_->adapter_->worker_env_->device_mgr->LookupDevice("CPU:0",
+                                                                    &dst_dev);
+      CHECK(s.ok()) << "dst device not found";
+      AllocatorAttributes dst_alloc_attr;
+      dst_alloc_attr.set_on_host(true);
+      // string tensor needs to be serialized
+      if (src_dev->tensorflow_gpu_device_info() &&
+          (!send_args.alloc_attrs.on_host())) {
+        CHECK(send_args.device_context)
+            << "send dev name: " << src_dev->name()
+            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+        // "val" is on a GPU. Uses GPUUtil to fill the proto.
+        s = VerbsUtil::SetProtoFromGPUSync(
+            in, src_dev, send_args.device_context, &proto, is_dead);
+        CHECK(s.ok()) << "set proto from gpu sync";
+      } else {
+        // tensor is in CPU memory.
+        in.AsProtoTensorContent(&proto);
+      }
+      tensor_bytes = proto.ByteSize();
+      // maybe some margin for string tensor?
+      buffer_size += tensor_bytes;
+      // prepare message
+      RdmaMessage rm;
+      rm.name_size_ = key.size();
+      rm.name_ = key;
+      rm.tensor_shape_ = in.shape();
+      rm.data_type_ = in.dtype();
+      rm.step_id_ = step_id;
+      rm.is_dead_ = is_dead;
+      rm.tensor_bytes_ = tensor_bytes;
+      rm.buffer_size_ = buffer_size;
+      mu_.lock();
+      if (local_status_ == none ||
+          (buffer_size > size_ && local_status_ == idle &&
+           remote_status_ == idle)) {
+        if ((local_status_ != none) && (buffer_size > size_)) {
+          CHECK(rm.data_type_ == DT_STRING)
+              << "Only string tensor allows to change size";
+        }
+        CreateCPUBuffer(buffer_size, false);
+        mu_.unlock();
+        // put back the key since it is not sent;
+        EnqueueItem(key_with_step_id);
+        // ask the remote to create the same buffer
+        rm.type_ = RDMA_MESSAGE_BUFFER_REQUEST;
+        rm.remote_addr_ = reinterpret_cast<uint64_t>(buffer_);
+        rm.rkey_ = self_->rkey;
+        string message = RdmaMessage::CreateMessage(rm);
+        channel_->tx_message_buffer_->EnqueueItem(message);
+        channel_->tx_message_buffer_->SendNextItem();
+      } else if ((local_status_ == idle) && (remote_status_ == idle)) {
+        // both buffers are ready, send the tensor
+        local_status_ = busy;
+        remote_status_ = busy;
+        // local/remote_status_ won't be set back to idle
+        // unitl Write() is successful
+        mu_.unlock();
+        CHECK((buffer_size == size_ && rm.data_type_ != DT_STRING) ||
+              (buffer_size <= size_ && rm.data_type_ == DT_STRING))
+            << "tensor and buffer size do not agree!"
+            << " buffer_size = " << size_
+            << " requested tensor size = " << buffer_size << in.DebugString();
+        uint32_t imm_data = LookupBufferIndex(key);
+        rm.type_ = RDMA_MESSAGE_TENSOR_WRITE;
+        string message = RdmaMessage::CreateMessage(rm);
+        memcpy(buffer_, message.data(), message.size());
+        if (!is_dead) {
+          // copy the tensor buffer content
+          void* output =
+              static_cast<void*>(static_cast<char*>(buffer_) +
+                                 RdmaMessage::kTensorBufferStartIndex);
+          CHECK(tensor_bytes + RdmaMessage::kTensorBufferStartIndex <= size_);
+          proto.SerializeToArray(output, tensor_bytes);
+        } else {
+          buffer_size = RdmaMessage::kMessageTotalBytes;
+        }
+        Write(imm_data, buffer_size);
+      } else {
+        mu_.unlock();
+        // put back the key since it is not sent;
+        EnqueueItem(key_with_step_id);
+      }
+    };
+    channel_->adapter_->worker_env_->rendezvous_mgr->RecvLocalAsync(step_id,
+                                                                    parsed, cb);
+  }
+}
+
+// Create a RdmaMessage according to the pre-defined format
+// Args:
+//   rm: the message structure
+// Returns:
+//   message in string format
+string RdmaMessage::CreateMessage(const RdmaMessage& rm) {
+  // Rdma Message format
+  // type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|...
+  //   1B|    2B   | 512|  8B   |    8B     |       8B  | 4B |    1B |...
+  // ...|data_type|tensor_shape|tensor_bytes|tensor_buffer
+  // ...|   XB    |    XB      |    8B      |...
+  //
+  // ACK:             type|13|"rx_ack_buffer"
+  // TENSOR_REQUEST:  type|name_size|tensor_name|step_id
+  // TENSOR_WRITE:    type|name_size|tensor_name|step_id|...|is_dead
+  //                 |data_type|tensor_shape|tensor_bytes
+  // BUFFER_IDLE:     type|name_size|buffer_name
+  // BUFFER_REQUEST:
+  // type|name_size|buffer_name|...|buffer_size|remote_addr|rkey|
+  // BUFFER_RESPONSE:
+  // type|name_size|buffer_name|...|buffer_size|remote_addr|rkey|
+  char message[kMessageTotalBytes];
+  // type
+  message[kTypeStartIndex] = static_cast<char>(rm.type_) & 0xff;
+  // size of name
+  memcpy(&message[kNameSizeStartIndex], &rm.name_size_, sizeof(rm.name_size_));
+  // name
+  memcpy(&message[kNameStartIndex], rm.name_.data(), rm.name_.size());
+  // buffer_size, remote_addr, rkey
+  if ((rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE)) {
+    memcpy(&message[kBufferSizeStartIndex], &rm.buffer_size_,
+           sizeof(rm.buffer_size_));
+    memcpy(&message[kRemoteAddrStartIndex], &rm.remote_addr_,
+           sizeof(rm.remote_addr_));
+    memcpy(&message[kRkeyStartIndex], &rm.rkey_, sizeof(rm.rkey_));
+  }
+  // step_id
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST)) {
+    memcpy(&message[kStepIdStartIndex], &rm.step_id_, sizeof(rm.step_id_));
+  }
+  // is_dead, data_type, tensor_shape, tensor_bytes
+  if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
+    memcpy(&message[kIsDeadStartIndex], &rm.is_dead_, sizeof(rm.is_dead_));
+
+    memcpy(&message[kDataTypeStartIndex], &rm.data_type_,
+           sizeof(rm.data_type_));
+    memcpy(&message[kTensorShapeStartIndex], &rm.tensor_shape_,
+           sizeof(rm.tensor_shape_));
+    memcpy(&message[kTensorBytesStartIndex], &rm.tensor_bytes_,
+           sizeof(rm.tensor_bytes_));
+  }
+  return string(message, kMessageTotalBytes);
+}
+
+// Parse a RdmaMessage according to the pre-defined format
+// Args:
+//   rm: the message structure where the parsed message will be saved
+//   buffer: the place where the raw message is stored
+// Returns:
+//   None
+void RdmaMessage::ParseMessage(RdmaMessage& rm, void* buffer) {
+  char* message = static_cast<char*>(buffer);
+  // type
+  rm.type_ = static_cast<RdmaMessageType>(message[kTypeStartIndex]);
+  // name_size_
+  memcpy(&rm.name_size_, &message[kNameSizeStartIndex], sizeof(rm.name_size_));
+  // name
+  rm.name_ = string(&message[kNameStartIndex], rm.name_size_);
+  // buffer_size, remote_addr, rkey
+  if ((rm.type_ == RDMA_MESSAGE_BUFFER_REQUEST) ||
+      (rm.type_ == RDMA_MESSAGE_BUFFER_RESPONSE)) {
+    memcpy(&rm.buffer_size_, &message[kBufferSizeStartIndex],
+           sizeof(rm.buffer_size_));
+    memcpy(&rm.remote_addr_, &message[kRemoteAddrStartIndex],
+           sizeof(rm.remote_addr_));
+    memcpy(&rm.rkey_, &message[kRkeyStartIndex], sizeof(rm.rkey_));
+  }
+  // step_id
+  if ((rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) ||
+      (rm.type_ == RDMA_MESSAGE_TENSOR_REQUEST)) {
+    memcpy(&rm.step_id_, &message[kStepIdStartIndex], sizeof(rm.step_id_));
+  }
+  // data_type, tensor_bytes, tensor_shape, is_dead
+  if (rm.type_ == RDMA_MESSAGE_TENSOR_WRITE) {
+    memcpy(&rm.is_dead_, &message[kIsDeadStartIndex], sizeof(rm.is_dead_));
+    memcpy(&rm.data_type_, &message[kDataTypeStartIndex],
+           sizeof(rm.data_type_));
+    memcpy(&rm.tensor_shape_, &message[kTensorShapeStartIndex],
+           sizeof(rm.tensor_shape_));
+    memcpy(&rm.tensor_bytes_, &message[kTensorBytesStartIndex],
+           sizeof(rm.tensor_bytes_));
+  }
+}
+
+}  // end namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
new file mode 100644
index 0000000000000000000000000000000000000000..10cbbe58d9a81cbb0cf287922c28219fc4b06f4f
--- /dev/null
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -0,0 +1,279 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include <infiniband/verbs.h>
+#include <cstring>  // for memset
+#include <functional>
+#include <memory>  // for shared_ptr
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// structure to save the address of remote channels.
+struct RdmaAddress {
+  uint32_t lid;
+  uint32_t qpn;
+  uint32_t psn;
+  uint64_t snp;
+  uint64_t iid;
+};
+// structure to save information for remote memory regions.
+struct RemoteMR {
+  uint64_t remote_addr;
+  uint32_t rkey;
+};
+enum BufferStatus { none, idle, busy };
+enum Location { local, remote };
+enum BufferType { ACK, MESSAGE, TENSOR };
+enum RdmaMessageType {
+  RDMA_MESSAGE_ACK,
+  RDMA_MESSAGE_BUFFER_IDLE,
+  RDMA_MESSAGE_BUFFER_REQUEST,
+  RDMA_MESSAGE_BUFFER_RESPONSE,
+  RDMA_MESSAGE_TENSOR_REQUEST,
+  RDMA_MESSAGE_TENSOR_WRITE
+};
+class RdmaBuffer;
+// Class that represents the Rdma Adapter.
+// Responsible for creation of the completion queue, and handling
+// of work completions.
+class RdmaAdapter {
+  friend class RdmaChannel;
+  friend class RdmaBuffer;
+  friend class RdmaAckBuffer;
+  friend class RdmaMessageBuffer;
+  friend class RdmaTensorBuffer;
+  friend class RdmaMgr;
+  friend class RdmaRemoteRendezvous;
+
+ public:
+  RdmaAdapter(const WorkerEnv* worker_env);
+  ~RdmaAdapter();
+  // Adapter name, e.g. mlx5_0.
+  string name() const;
+  void Process_CQ();
+
+ protected:
+  static const int MAX_CONCURRENT_WRITES = 1000;
+  ibv_context* context_;
+  // ibverbs protection domain
+  ibv_pd* pd_;
+  // Completion event channel, to wait for work completions
+  ibv_comp_channel* event_channel_;
+  // Completion queue, to poll on work completions
+  ibv_cq* cq_;
+  // Pre-allocated work completions array used for polling
+  ibv_wc wc_[MAX_CONCURRENT_WRITES * 2];
+  // worker env for thread
+  const WorkerEnv* worker_env_;
+  // thread for cq.
+  std::unique_ptr<Thread> polling_thread_;
+};
+
+// Class that represents a connection to a remote Rdma peer.
+// Responsible for connecting queue pairs.
+class RdmaChannel {
+  friend class RdmaAdapter;
+  friend class RdmaBuffer;
+  friend class RdmaAckBuffer;
+  friend class RdmaMessageBuffer;
+  friend class RdmaTensorBuffer;
+  friend class RdmaMgr;
+  friend class RdmaRemoteRendezvous;
+
+ public:
+  explicit RdmaChannel(const RdmaAdapter* adapter, const string local_name,
+                       const string remote_name_);
+  ~RdmaChannel();
+  inline const RdmaAddress& self() { return self_; }
+  RdmaAddress address() const;
+  inline const std::vector<RdmaBuffer*>& message_buffers() const {
+    return message_buffers_;
+  }
+  void Connect(const RdmaAddress& remoteAddr);
+  void Connect();
+  void Recv();
+  RdmaBuffer* FindBuffer(const uint32_t index);
+  RdmaBuffer* FindBuffer(const string& name);
+  RdmaBuffer* FindOrCreateBuffer(const string& name,
+                                 BufferType buffer_type = TENSOR);
+  uint32_t LookupBufferIndex(const string& buffer_name);
+  void SetRemoteAddress(const RdmaAddress& ra, bool override);
+  void InsertRecvCallback(const string& key, std::function<void()> recv_done);
+  void RemoveRecvCallback(const string& key);
+  void RunRecvCallback(const string& key);
+  static const int kNumMessageBuffers = 4;
+
+ protected:
+  const RdmaAdapter* adapter_;
+  RdmaAddress self_;
+  string local_name_;
+  string remote_name_;
+  ibv_qp* qp_;
+  mutex mu_;
+  bool connected_ GUARDED_BY(bt_mu_) = false;
+  RdmaAddress remote_ GUARDED_BY(bt_mu_);
+  bool remote_set_ GUARDED_BY(bt_mu_) = false;
+  mutex ct_mu_;
+  typedef std::unordered_map<string, std::function<void()> > CallbackTable;
+  CallbackTable callback_table_ GUARDED_BY(ct_mu_);
+  mutex bt_mu_;
+  typedef std::unordered_map<unsigned int, RdmaBuffer*> BufferTable;
+  BufferTable buffer_table_ GUARDED_BY(bt_mu_);
+  typedef std::unordered_map<uint32_t, string> BufferIndexNameTable;
+  BufferIndexNameTable buffer_index_name_table_ GUARDED_BY(bt_mu_);
+  typedef std::unordered_map<string, uint32_t> BufferNameIndexTable;
+  BufferNameIndexTable buffer_name_index_table_ GUARDED_BY(bt_mu_);
+  RdmaBuffer* tx_message_buffer_;
+  RdmaBuffer* rx_message_buffer_;
+  RdmaBuffer* tx_ack_buffer_;
+  RdmaBuffer* rx_ack_buffer_;
+  std::vector<RdmaBuffer*> message_buffers_;
+};
+
+// Class that represents a buffer for Rdma writes and reads.
+class RdmaBuffer {
+  friend class RdmaChannel;
+  friend class RdmaAdapter;
+  friend class RdmaMgr;
+  friend class RdmaRemoteRendezvous;
+
+ public:
+  explicit RdmaBuffer(RdmaChannel* channel, string name);
+  virtual ~RdmaBuffer();
+
+  inline void* buffer() const { return buffer_; }
+  inline ibv_mr* self() const { return self_; }
+  inline void SetBufferStatus(Location loc, BufferStatus status) {
+    mu_.lock();
+    if (loc == local) {
+      local_status_ = status;
+    } else {
+      remote_status_ = status;
+    }
+    mu_.unlock();
+  }
+  void FreeBuffer();
+  void EnqueueItem(string Item);
+  virtual void SendNextItem(){};
+  void CreateCPUBuffer(size_t size, bool lock = true);
+  void SetRemoteMR(RemoteMR rmi, bool override);
+  uint32_t LookupBufferIndex(const string& buffer_name) {
+    return const_cast<RdmaChannel*>(channel_)->LookupBufferIndex(buffer_name);
+  }
+  void Write(uint32_t imm_data, size_t buffer_size);
+
+ protected:
+  const RdmaChannel* channel_;
+  void* buffer_ = nullptr;
+  bool buffer_on_host_ = true;
+  size_t size_ = 0;
+  const string name_;
+  ibv_mr* self_ = nullptr;
+  mutex mu_;
+  RemoteMR remote_;
+  std::queue<string> queue_ GUARDED_BY(mu_);
+  BufferStatus local_status_ GUARDED_BY(mu_) = none;
+  BufferStatus remote_status_ GUARDED_BY(mu_) = none;
+};
+
+class RdmaAckBuffer : public RdmaBuffer {
+ public:
+  explicit RdmaAckBuffer(RdmaChannel* channel, string name);
+  virtual ~RdmaAckBuffer() override {}
+  void SendNextItem() override;
+};
+
+class RdmaMessageBuffer : public RdmaBuffer {
+  friend class RdmaChannel;
+  friend class RdmaAapater;
+
+ public:
+  explicit RdmaMessageBuffer(RdmaChannel* channel, string name);
+  virtual ~RdmaMessageBuffer() override {}
+  void SendNextItem() override;
+};
+
+class RdmaTensorBuffer : public RdmaBuffer {
+ public:
+  explicit RdmaTensorBuffer(RdmaChannel* channel, string name);
+  virtual ~RdmaTensorBuffer() override {}
+  void SendNextItem() override;
+};
+
+struct RdmaMessage {
+  RdmaMessageType type_;
+  uint16_t name_size_;
+  string name_;
+  int64 step_id_;
+  uint64_t buffer_size_;
+  uint64_t remote_addr_;
+  uint32_t rkey_;
+  bool is_dead_;
+  DataType data_type_;
+  TensorShape tensor_shape_;
+  size_t tensor_bytes_;
+
+  // type|name_size|name|step_id|buffer_size|remote_addr|rkey|is_dead|...
+  //   1B|    2B   | 512|  8B   |    8B     |       8B  | 4B |    1B |...
+  // ...|data_type|tensor_shape|tensor_bytes|tensor_buffer
+  // ...|   XB    |    XB      |    8B      |...
+  //
+  static const size_t kNameCapacity = 512;
+  static const size_t kTypeStartIndex = 0;
+  static const size_t kNameSizeStartIndex = kTypeStartIndex + sizeof(type_);
+  static const size_t kNameStartIndex =
+      kNameSizeStartIndex + sizeof(name_size_);
+  static const size_t kStepIdStartIndex = kNameStartIndex + kNameCapacity;
+  static const size_t kBufferSizeStartIndex =
+      kStepIdStartIndex + sizeof(step_id_);
+  static const size_t kRemoteAddrStartIndex =
+      kBufferSizeStartIndex + sizeof(buffer_size_);
+  static const size_t kRkeyStartIndex =
+      kRemoteAddrStartIndex + sizeof(remote_addr_);
+  static const size_t kIsDeadStartIndex = kRkeyStartIndex + sizeof(rkey_);
+  static const size_t kDataTypeStartIndex =
+      kIsDeadStartIndex + sizeof(is_dead_);
+  static const size_t kTensorShapeStartIndex =
+      kDataTypeStartIndex + sizeof(data_type_);
+  static const size_t kTensorBytesStartIndex =
+      kTensorShapeStartIndex + sizeof(TensorShape);
+  static const size_t kTensorBufferStartIndex =
+      kTensorBytesStartIndex + sizeof(tensor_bytes_);
+  static const size_t kMessageTotalBytes = kTensorBufferStartIndex;
+  static const size_t kRdmaMessageBufferSize = kMessageTotalBytes;
+  static const size_t kRdmaAckBufferSize = kMessageTotalBytes;
+  static string CreateMessage(const RdmaMessage& rm);
+  static void ParseMessage(RdmaMessage& rm, void* buffer);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_H_
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..09b878843f52c910f78f3769522d1fa80319c7d7
--- /dev/null
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -0,0 +1,137 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include "tensorflow/contrib/verbs/rdma_mgr.h"
+#include <vector>
+#include "tensorflow/contrib/verbs/grpc_verbs_client.h"
+#include "tensorflow/contrib/verbs/verbs_service.pb.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
+#include "tensorflow/core/distributed_runtime/session_mgr.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+RdmaMgr::RdmaMgr(const WorkerEnv* const worker_env,
+                 GrpcChannelCache* const channel_cache)
+    : worker_env_(worker_env), channel_cache_(channel_cache) {
+  rdma_adapter_ = new RdmaAdapter(worker_env_);
+  // hardcoded to default session (legacy_session_)
+  // TODO: use WorkerSessionForSession
+  // need to pass in session handle
+  local_worker_ = worker_env_->session_mgr->LegacySession()->worker_name;
+  std::vector<string> workers;
+  worker_env_->session_mgr->LegacySession()->worker_cache->ListWorkers(
+      &workers);
+  num_remote_workers_ = workers.size() - 1;
+  VLOG(2) << "rmda_mgr on local worker: " << local_worker_;
+  for (size_t i = 0; i < workers.size(); i++) {
+    if (local_worker_.compare(workers[i]) != 0) {
+      channel_table_.insert(
+          {workers[i],
+           new RdmaChannel(rdma_adapter_, local_worker_, workers[i])});
+    }
+  }
+}
+
+// Setup Rdma channels between peers.
+// This is done at the beginning of the server setup.
+
+void RdmaMgr::SetupChannels() {
+  for (const auto& p : channel_table_) {
+    string worker_name = p.first;
+    LOG(INFO) << "connecting to remote node " << worker_name;
+    RdmaChannel* rc = p.second;
+    GetRemoteAddressRequest req;
+    GetRemoteAddressResponse resp;
+    // get the channel cache
+    SharedGrpcChannelPtr client_channel =
+        channel_cache_->FindWorkerChannel(worker_name);
+    GrpcVerbsClient* client = new GrpcVerbsClient(client_channel);
+    CHECK(client != nullptr) << "No worker known as " << worker_name;
+
+    // setting up request
+    req.set_host_name(local_worker_);
+    Channel* channel_info = req.mutable_channel();
+    channel_info->set_lid(rc->self_.lid);
+    channel_info->set_qpn(rc->self_.qpn);
+    channel_info->set_psn(rc->self_.psn);
+    channel_info->set_snp(rc->self_.snp);
+    channel_info->set_iid(rc->self_.iid);
+    for (int i = 0; i < RdmaChannel::kNumMessageBuffers; i++) {
+      MemoryRegion* mr = req.add_mr();
+      mr->set_remote_addr(
+          reinterpret_cast<uint64_t>(rc->message_buffers_[i]->buffer_));
+      mr->set_rkey(rc->message_buffers_[i]->self_->rkey);
+    }
+    // synchronous call
+    Status s = client->GetRemoteAddress(&req, &resp);
+    // save obtained remote addresses
+    // connect to the remote channel
+    if (s.ok()) {
+      CHECK(worker_name.compare(resp.host_name()) == 0);
+      RdmaAddress ra;
+      ra.lid = resp.channel().lid();
+      ra.qpn = resp.channel().qpn();
+      ra.psn = resp.channel().psn();
+      ra.snp = resp.channel().snp();
+      ra.iid = resp.channel().iid();
+      rc->SetRemoteAddress(ra, false);
+      rc->Connect();
+      int i = 0;
+      int idx[] = {1, 0, 3, 2};
+      for (const auto& mr : resp.mr()) {
+        // the connections are crossed, i.e.
+        // local tx_message_buffer <---> remote rx_message_buffer_
+        // local rx_message_buffer <---> remote tx_message_buffer_
+        // local tx_ack_buffer <---> remote rx_ack_buffer_
+        // local rx_ack_buffer <---> remote tx_ack_buffer_
+        // hence idx[] = {1, 0, 3, 2}.
+        RdmaBuffer* rb = rc->message_buffers_[idx[i]];
+        RemoteMR rmr;
+        rmr.remote_addr = mr.remote_addr();
+        rmr.rkey = mr.rkey();
+        rb->SetRemoteMR(rmr, false);
+        i++;
+      }
+      CHECK(i == RdmaChannel::kNumMessageBuffers);
+    } else {
+      LOG(ERROR) << s.error_message();
+    }
+    delete client;
+  }
+}
+
+RdmaMgr::~RdmaMgr() {
+  for (const auto& p : channel_table_) delete p.second;
+  channel_table_.clear();
+  delete rdma_adapter_;
+}
+
+// Find a channel via the given name.
+// Args:
+//   name: peer name, e.g. worker1
+// Returns
+//   channel object that is connected to the named peer.
+RdmaChannel* RdmaMgr::FindChannel(const string& name) {
+  ChannelTable::iterator iter = channel_table_.find(name);
+  CHECK(iter != channel_table_.end());
+  return iter->second;
+}
+
+}  // end namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..b156f64096c113bb0ac3780b0f64fd1e6bd7cb89
--- /dev/null
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include <string>
+#include <unordered_map>
+
+#include "tensorflow/contrib/verbs/rdma.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+
+namespace tensorflow {
+
+class RdmaMgr {
+ public:
+  explicit RdmaMgr(const WorkerEnv* const worker_env,
+                   GrpcChannelCache* const channel_cache);
+  ~RdmaMgr();
+  RdmaChannel* FindChannel(const string& key);
+  void SetupChannels();
+  const string& local_worker() { return local_worker_; }
+
+ private:
+  string local_worker_;
+  size_t num_remote_workers_;
+  const WorkerEnv* const worker_env_;
+  GrpcChannelCache* const channel_cache_;
+  RdmaAdapter* rdma_adapter_;
+  typedef std::unordered_map<string, RdmaChannel*> ChannelTable;
+  ChannelTable channel_table_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RdmaMgr);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_MGR_H_
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5871400f26aecba3db4fb4ce687c5891c1720df3
--- /dev/null
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -0,0 +1,144 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include "tensorflow/contrib/verbs/rdma_rendezvous_mgr.h"
+#include <unordered_set>
+#include "tensorflow/contrib/verbs/verbs_util.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+
+class RdmaRemoteRendezvous : public BaseRemoteRendezvous {
+ public:
+  RdmaRemoteRendezvous(const WorkerEnv* env, int64 step_id, RdmaMgr* rdma_mgr)
+      : BaseRemoteRendezvous(env, step_id, true), rdma_mgr_(rdma_mgr) {}
+
+ protected:
+  void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
+                           const Rendezvous::Args& args,
+                           DoneCallback done) override;
+
+ private:
+  ~RdmaRemoteRendezvous() override {}
+  RdmaMgr* rdma_mgr_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RdmaRemoteRendezvous);
+};
+
+void RdmaRemoteRendezvous::RecvFromRemoteAsync(
+    const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args,
+    DoneCallback done) {
+  Status s;
+  // parse src_name and dst_name
+  string src_name, dst_name, unused;
+  if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &src_name,
+                                        &unused)) {
+    s = errors::Internal("Could not parse src name.");
+  }
+  CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
+  if (!s.ok()) {
+    done(s, Args(), recv_args, Tensor{}, false);
+    return;
+  }
+  if (!DeviceNameUtils::SplitDeviceName(parsed.dst_device, &dst_name,
+                                        &unused)) {
+    s = errors::Internal("Could not parse dst name.");
+  }
+  CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
+  if (!s.ok()) {
+    done(s, Args(), recv_args, Tensor{}, false);
+    return;
+  }
+  CHECK(dst_name.compare(rdma_mgr_->local_worker()) == 0);
+  RdmaChannel* rc = rdma_mgr_->FindChannel(src_name);
+  string key(std::move(parsed.FullKey().ToString()));
+  string key_with_step_id = VerbsUtil::AppendStepidToKey(key, step_id_);
+  // insert callback
+  rc->InsertRecvCallback(key_with_step_id, [this, key, key_with_step_id, rc,
+                                            recv_args, parsed, done]() {
+    Status s;
+    Device* src_dev;
+    s = env_->device_mgr->LookupDevice("CPU:0", &src_dev);
+    CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
+    if (!s.ok()) {
+      done(s, Args(), recv_args, Tensor(), true);
+      return;
+    }
+    Device* dst_dev;
+    s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
+    CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
+    if (!s.ok()) {
+      done(s, Args(), recv_args, Tensor(), true);
+      return;
+    }
+    RdmaBuffer* rb = rc->FindBuffer(key);
+    RdmaMessage rm;
+    CHECK(rb->size_ >= RdmaMessage::kMessageTotalBytes);
+    RdmaMessage::ParseMessage(rm, rb->buffer_);
+    CHECK(rm.type_ == RDMA_MESSAGE_TENSOR_WRITE);
+    Tensor val;
+    if (!rm.is_dead_) {
+      void* input = static_cast<char*>(rb->buffer_) +
+                    RdmaMessage::kTensorBufferStartIndex;
+      TensorProto proto;
+      CHECK(rm.tensor_bytes_ + RdmaMessage::kTensorBufferStartIndex <=
+            rb->size_);
+      CHECK(ParseProtoUnlimited(&proto, input, rm.tensor_bytes_))
+          << "fail to parse proto from array";
+      s = dst_dev->MakeTensorFromProto(proto, recv_args.alloc_attrs, &val);
+    }
+
+    rc->RemoveRecvCallback(key_with_step_id);
+    // create message
+    RdmaMessage br;
+    br.type_ = RDMA_MESSAGE_BUFFER_IDLE;
+    br.name_size_ = key.size();
+    br.name_ = key;
+    string message = RdmaMessage::CreateMessage(br);
+    RdmaBuffer* tb = rc->tx_message_buffer_;
+    tb->EnqueueItem(message);
+    tb->SendNextItem();
+    done(s, Args(), recv_args, val, rm.is_dead_);
+  });
+  // append key to message queue
+  RdmaBuffer* rb = rc->tx_message_buffer_;
+  RdmaMessage rm;
+  rm.type_ = RDMA_MESSAGE_TENSOR_REQUEST;
+  rm.name_size_ = key.size();
+  rm.name_ = key;
+  rm.step_id_ = step_id_;
+  string message = RdmaMessage::CreateMessage(rm);
+  rb->EnqueueItem(message);
+  rb->SendNextItem();
+}
+
+RdmaRendezvousMgr::RdmaRendezvousMgr(const WorkerEnv* env)
+    : BaseRendezvousMgr(env) {}
+
+BaseRemoteRendezvous* RdmaRendezvousMgr::Create(int64 step_id,
+                                                const WorkerEnv* worker_env) {
+  return new RdmaRemoteRendezvous(worker_env, step_id, rdma_mgr_);
+}
+
+}  // end namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..2dedd6c48f96a6ecf2b69c757f525ac1bfd6f2d0
--- /dev/null
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.h
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include "tensorflow/contrib/verbs/rdma_mgr.h"
+#include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+// RendezvousMgr keeps track of a set of local rendezvous instances.
+// All tensors sent by this worker are buffered in a RendezvousMgr
+// until the tensor is received.  Each global unique "step_id"
+// corresponds to one local rendezvous instance managed by a
+// RendezvousMgr.
+//
+// E.g.,
+//   Rendezvous* rendez = worker_env->rendezvous_mgr->Find(0x8935);
+//   fork execution of an graph executor using "rendez"  on thread 1;
+//   fork execution of another graph executor using "rendez" on thread 2;
+//   ...
+//   join threads 1 and 2;
+//
+// In the example above, execution in thread 1 and 2 communicates with
+// each other by send/recv operations through the "rend".
+//
+// Tensors sent and recved through rendezvous managed by this
+// RendezvousMgr must have keys generated by Rendezvous::CreateKey.
+class RdmaRendezvousMgr : public BaseRendezvousMgr {
+ public:
+  explicit RdmaRendezvousMgr(const WorkerEnv* env);
+  void SetRdmaMgr(RdmaMgr* rdma_mgr) { rdma_mgr_ = rdma_mgr; }
+
+ protected:
+  BaseRemoteRendezvous* Create(int64 step_id,
+                               const WorkerEnv* worker_env) override;
+
+ private:
+  RdmaMgr* rdma_mgr_;
+  TF_DISALLOW_COPY_AND_ASSIGN(RdmaRendezvousMgr);
+};
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_RDMA_RENDEZVOUS_MGR_H_
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3597249354491186d0f654207b93c5e42559348
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -0,0 +1,166 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include "tensorflow/contrib/verbs/verbs_server_lib.h"
+
+#include "tensorflow/contrib/verbs/rdma_mgr.h"
+#include "tensorflow/contrib/verbs/rdma_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+namespace {
+// static utility function
+RendezvousMgrInterface* NewRdmaRendezvousMgr(const WorkerEnv* env) {
+  return new RdmaRendezvousMgr(env);
+}
+
+}  // namespace
+
+VerbsServer::VerbsServer(const ServerDef& server_def, Env* env)
+    : GrpcServer(server_def, env), verbs_state_(DISCONNECTED) {}
+
+VerbsServer::~VerbsServer() {
+  TF_CHECK_OK(Stop());
+  TF_CHECK_OK(Join());
+  delete rdma_mgr_;
+  delete verbs_service_;
+  delete channel_cache_;
+}
+
+Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
+                                        GrpcChannelCache** channel_cache) {
+  string name_prefix =
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
+                      "/task:", server_def.task_index());
+
+  GrpcChannelSpec channel_spec;
+  TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
+
+  *channel_cache =
+      NewGrpcChannelCache(channel_spec, GetChannelCreationFunction());
+
+  const string host_port = (*channel_cache)->TranslateTask(name_prefix);
+  int requested_port;
+
+  if (!strings::safe_strto32(str_util::Split(host_port, ':')[1],
+                             &requested_port)) {
+    return errors::Internal("Could not parse port for local server from \"",
+                            (*channel_cache)->TranslateTask(name_prefix),
+                            "\".");
+  }
+  if (requested_port != bound_port()) {
+    return errors::InvalidArgument("Requested port ", requested_port,
+                                   " differs from expected port ",
+                                   bound_port());
+  }
+
+  return Status::OK();
+}
+
+Status VerbsServer::Init(ServiceInitFunction service_func,
+                         RendezvousMgrCreationFunction rendezvous_mgr_func) {
+  Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
+  {
+    mutex_lock l(mu_);
+    CHECK_EQ(verbs_state_, DISCONNECTED);
+    CHECK(ChannelCacheFactory(server_def(), &channel_cache_).ok());
+    rdma_mgr_ = new RdmaMgr(worker_env(), channel_cache_);
+    // set rdma_mgr for verbs_service and rdma_rendezvous_mgr
+    verbs_service_->SetRdmaMgr(rdma_mgr_);
+    dynamic_cast<RdmaRendezvousMgr*>(worker_env()->rendezvous_mgr)
+        ->SetRdmaMgr(rdma_mgr_);
+  }
+  return s;
+}
+
+Status VerbsServer::Start() {
+  Status s = GrpcServer::Start();
+  {
+    mutex_lock l(mu_);
+    if (verbs_state_ == DISCONNECTED) {
+      // verbs_thread needs to be initiated
+      // before rdma_mgr sets up the rdma channels.
+      verbs_thread_.reset(worker_env()->env->StartThread(
+          ThreadOptions(), "TF_verbs_service",
+          [this] { verbs_service_->HandleRPCsLoop(); }));
+      rdma_mgr_->SetupChannels();
+      verbs_state_ = CONNECTED;
+    }
+  }
+  return s;
+}
+
+Status VerbsServer::Join() {
+  Status s = GrpcServer::Join();
+  {
+    mutex_lock l(mu_);
+    if (verbs_state_ == CONNECTED) {
+      verbs_state_ = DISCONNECTED;
+      verbs_thread_.reset();
+    }
+  }
+  return s;
+}
+
+/* static */
+Status VerbsServer::Create(const ServerDef& server_def, Env* env,
+                           std::unique_ptr<ServerInterface>* out_server) {
+  std::unique_ptr<VerbsServer> ret(new VerbsServer(server_def, Env::Default()));
+  ServiceInitFunction service_func = [&ret](const WorkerEnv* worker_env,
+                                            ::grpc::ServerBuilder* builder) {
+    return SetNewVerbsService(&ret->verbs_service_, worker_env, builder);
+  };
+  TF_RETURN_IF_ERROR(ret->Init(service_func, NewRdmaRendezvousMgr));
+  *out_server = std::move(ret);
+  return Status::OK();
+}
+
+namespace {
+
+class VerbsServerFactory : public ServerFactory {
+ public:
+  bool AcceptsOptions(const ServerDef& server_def) override {
+    return server_def.protocol() == "grpc+verbs";
+  }
+
+  Status NewServer(const ServerDef& server_def,
+                   std::unique_ptr<ServerInterface>* out_server) override {
+    return VerbsServer::Create(server_def, Env::Default(), out_server);
+  }
+};
+
+// Registers a `ServerFactory` for `VerbsServer` instances.
+class VerbsServerRegistrar {
+ public:
+  VerbsServerRegistrar() {
+    gpr_allocation_functions alloc_fns;
+    alloc_fns.malloc_fn = port::Malloc;
+    alloc_fns.realloc_fn = port::Realloc;
+    alloc_fns.free_fn = port::Free;
+    gpr_set_allocation_functions(alloc_fns);
+    ServerFactory::Register("VERBS_SERVER", new VerbsServerFactory());
+  }
+};
+static VerbsServerRegistrar registrar;
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.h b/tensorflow/contrib/verbs/verbs_server_lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..855380129f21bd8162cdf28a4d88c098db7ddc55
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_server_lib.h
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
+
+#ifdef TENSORFLOW_USE_VERBS
+
+#include "tensorflow/contrib/verbs/grpc_verbs_service.h"
+#include "tensorflow/contrib/verbs/rdma_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
+
+namespace tensorflow {
+
+class VerbsServer : public GrpcServer {
+ protected:
+  VerbsServer(const ServerDef& server_def, Env* env);
+
+ public:
+  static Status Create(const ServerDef& server_def, Env* env,
+                       std::unique_ptr<ServerInterface>* out_server);
+
+  // Destruction is only supported in the factory method. Clean
+  // shutdown is not currently implemented for this server type.
+  virtual ~VerbsServer() override;
+
+  // Implementations of ServerInterface methods.
+  Status Start() override;
+  Status Join() override;
+
+ protected:
+  Status Init(ServiceInitFunction service_func,
+              RendezvousMgrCreationFunction rendezvous_mgr_func);
+  Status ChannelCacheFactory(const ServerDef& server_def,
+                             GrpcChannelCache** channel_cache);
+
+ private:
+  RdmaMgr* rdma_mgr_;
+
+  // Guards state transitions.
+  mutex mu_;
+
+  enum State { DISCONNECTED, CONNECTED };
+  State verbs_state_ GUARDED_BY(mu_);
+
+  GrpcVerbsService* verbs_service_ = nullptr;
+  std::unique_ptr<Thread> verbs_thread_ GUARDED_BY(mu_);
+  GrpcChannelCache* channel_cache_ = nullptr;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_USE_VERBS
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_VERBS_VERBS_SERVER_LIB_H_
diff --git a/tensorflow/contrib/verbs/verbs_service.proto b/tensorflow/contrib/verbs/verbs_service.proto
new file mode 100644
index 0000000000000000000000000000000000000000..0df1fed4b9de81d7d99be3de9fba4be8b88ad404
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_service.proto
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorflow;
+option java_outer_classname = "VerbsServiceProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.contrib.verbs";
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// GRPC Helper messages used to exchange RDMA information.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message Channel {
+  int32 lid = 1;
+  int32 qpn = 2;
+  int32 psn = 3;
+  uint64 snp = 4;
+  uint64 iid = 5;
+}
+
+message MemoryRegion {
+  uint64 remote_addr = 1;
+  uint32 rkey = 2;
+}
+message GetRemoteAddressRequest {
+  string host_name = 1;
+  Channel channel = 2;
+  repeated MemoryRegion mr = 3;
+}
+
+message GetRemoteAddressResponse {
+  string host_name = 1;
+  Channel channel = 2;
+  repeated MemoryRegion mr = 3;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// VerbsService
+//
+////////////////////////////////////////////////////////////////////////////////
+
+service VerbsService {
+  rpc GetRemoteAddress(GetRemoteAddressRequest)
+      returns (GetRemoteAddressResponse);
+}
diff --git a/tensorflow/contrib/verbs/verbs_util.cc b/tensorflow/contrib/verbs/verbs_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3350f7958ce0a2a740332c765e4d566982ee1cf
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_util.cc
@@ -0,0 +1,61 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/verbs/verbs_util.h"
+
+#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+namespace tensorflow {
+
+// static sync wrapper:
+Status VerbsUtil::SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
+                                      const DeviceContext* device_context,
+                                      TensorProto* proto, bool is_dead) {
+  Notification n;
+  Status status;
+  GPUUtil::SetProtoFromGPU(tensor, dev, device_context, proto, is_dead,
+                           [&n, &status](const Status& s) {
+                             status = s;
+                             n.Notify();
+                           });
+  n.WaitForNotification();
+  return status;
+}
+
+// static
+string VerbsUtil::AppendStepidToKey(const string& key, int64 step_id) {
+  return strings::StrCat(key, ";", step_id);
+}
+
+// static
+void VerbsUtil::GetKeyAndStepId(const string& key_with_step_id, string& key,
+                                int64& step_id) {
+  StringPiece s(key_with_step_id);
+  // a key (with step_id) has exact 6 parts if split by ";"
+  // part 1: src_device;
+  // part 2: src_incarnation;
+  // part 3: dst_device;
+  // part 4: name;
+  // part 5: frame_iter.frame_id:frame_iter.iter_id
+  // part 6: step_id
+  std::vector<string> parts = str_util::Split(s, ';');
+  CHECK(parts.size() == 6) << "Key with step_id must have 6 parts";
+  strings::safe_strto64(parts[5], &step_id);
+  parts.pop_back();                        // remove step_id
+  key.assign(str_util::Join(parts, ";"));  // stitch them together
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/verbs/verbs_util.h b/tensorflow/contrib/verbs/verbs_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbc01adae494da761ced327e2b860d2ee383925f
--- /dev/null
+++ b/tensorflow/contrib/verbs/verbs_util.h
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_RDMA_UTIL_H_
+#define TENSORFLOW_CONTRIB_RDMA_UTIL_H_
+
+#include <string>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class TensorProto;
+
+class VerbsUtil {
+ public:
+  // synchronous wrapper of SetProtoFromGPU
+  static Status SetProtoFromGPUSync(const Tensor& tensor, Device* dev,
+                                    const DeviceContext* device_context,
+                                    TensorProto* proto, bool is_dead);
+  static string AppendStepidToKey(const string& key, int64 step_id);
+  static void GetKeyAndStepId(const string& key_with_step_id, string& key,
+                              int64& step_id);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_RDMA_UTIL_H_
diff --git a/tensorflow/contrib/xla_tf_graph/BUILD b/tensorflow/contrib/xla_tf_graph/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..865df73699bdd242f085eaddf6775cff0073aa41
--- /dev/null
+++ b/tensorflow/contrib/xla_tf_graph/BUILD
@@ -0,0 +1,63 @@
+# Description:
+#   contains parts of TensorFlow that are experimental or unstable and which are not supported.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+)
+
+cc_library(
+    name = "xla_tf_graph_util",
+    srcs = [
+        "xla_tf_graph_util.cc",
+    ],
+    hdrs = [
+        "xla_tf_graph_util.h",
+    ],
+    deps = [
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla/client",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_test(
+    name = "xla_tf_graph_util_test",
+    srcs = ["xla_tf_graph_util_test.cc"],
+    linkstatic = 1,
+    deps = [
+        ":xla_tf_graph_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/compiler/jit:xla_cpu_jit",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/kernels:cwise_op",
+    ],
+)
diff --git a/tensorflow/contrib/xla_tf_graph/README.md b/tensorflow/contrib/xla_tf_graph/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a374189e813107bcf3fe71032d4baf16b3d164a2
--- /dev/null
+++ b/tensorflow/contrib/xla_tf_graph/README.md
@@ -0,0 +1,8 @@
+# Xla Tf Graph
+
+## Description
+
+This module contains utilities to treat xla representation as tf graph to support mobile SOC experiments and leverage tf tools.
+
+Maintainers:
+- Satoshi Kataoka (satok@google.com, github.com/satok16)
diff --git a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.cc b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..302aa6457ab08a30bca9c28a5f162331111c4b77
--- /dev/null
+++ b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.cc
@@ -0,0 +1,247 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.h"
+
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace xla_tf_graph {
+
+namespace {
+
+constexpr const char* const GRAPH_NAME = "xla_tf_graph";
+constexpr const char* const NODE_NAME_PREFIX = "xla";
+
+Status ConvertPrimitiveTypeToDataType(const xla::PrimitiveType p_type,
+                                      DataType* d_type) {
+  switch (p_type) {
+    case xla::PRED:
+      *d_type = DT_BOOL;
+      return Status::OK();
+    case xla::S8:
+      *d_type = DT_INT8;
+      return Status::OK();
+    case xla::S16:
+      *d_type = DT_INT16;
+      return Status::OK();
+    case xla::S32:
+      *d_type = DT_INT32;
+      return Status::OK();
+    case xla::S64:
+      *d_type = DT_INT64;
+      return Status::OK();
+    case xla::U8:
+      *d_type = DT_UINT8;
+      return Status::OK();
+    case xla::U16:
+      *d_type = DT_UINT16;
+      return Status::OK();
+    case xla::F16:
+      *d_type = DT_HALF;
+      return Status::OK();
+    case xla::F32:
+      *d_type = DT_FLOAT;
+      return Status::OK();
+    case xla::F64:
+      *d_type = DT_DOUBLE;
+      return Status::OK();
+    default:
+      return errors::InvalidArgument(
+          "Unsupported PrimitiveType in ConvertPrimitiveTypeToDataType ",
+          xla::PrimitiveType_Name(p_type));
+  }
+}
+
+Status ConvertXlaShapeToTensorShapeType(const xla::Shape& xla_shape,
+                                        std::vector<TensorShape>* tensor_shapes,
+                                        std::vector<DataType>* data_types) {
+  switch (xla_shape.element_type()) {
+    case xla::TUPLE: {
+      for (const xla::Shape& element_shape : xla_shape.tuple_shapes()) {
+        if (element_shape.element_type() == xla::TUPLE) {
+          return errors::InvalidArgument("Nested tuple is not allowed.");
+        }
+        TF_RETURN_IF_ERROR(ConvertXlaShapeToTensorShapeType(
+            element_shape, tensor_shapes, data_types));
+      }
+      return Status::OK();
+    }
+    case xla::PRED:
+    case xla::S8:
+    case xla::S16:
+    case xla::S32:
+    case xla::S64:
+    case xla::U8:
+    case xla::U16:
+    case xla::U32:
+    case xla::U64:
+    case xla::F16:
+    case xla::F32:
+    case xla::F64: {
+      TensorShape shape;
+      DataType type;
+      TF_RETURN_IF_ERROR(
+          ConvertPrimitiveTypeToDataType(xla_shape.element_type(), &type));
+      for (const int64& dim : xla_shape.dimensions()) {
+        shape.AddDim(dim);
+      }
+      tensor_shapes->emplace_back(shape);
+      data_types->emplace_back(type);
+      return Status::OK();
+    }
+    default:
+      return errors::InvalidArgument(
+          "Unsupported PrimitiveType in ConvertXlaShapeToTensorShapeType ",
+          xla::PrimitiveType_Name(xla_shape.element_type()));
+  }
+}
+
+string BuildXlaNodeName(const xla::OperationRequest& operation_request,
+                        const string& xla_op_type, const string& suffix) {
+  const string name = strings::StrCat(
+      NODE_NAME_PREFIX, "/", operation_request.output_handle().handle(), "/",
+      xla_op_type);
+  if (suffix.empty()) {
+    return name;
+  } else {
+    return strings::StrCat(name, "/", suffix);
+  }
+}
+
+string BuildXlaNodeName(const xla::OperationRequest& operation_request,
+                        const string& xla_op_type) {
+  return BuildXlaNodeName(operation_request, xla_op_type, "");
+}
+
+string BuildXlaNodeOp(const protobuf::Message& msg, const string& suffix) {
+  return strings::StrCat(msg.GetDescriptor()->name(), "/", suffix);
+}
+
+string BuildXlaNodeOp(const protobuf::Message& msg) {
+  return BuildXlaNodeOp(msg, "");
+}
+
+Status ConvertOpRequestToXlaNode(const xla::OperationRequest& operation_request,
+                                 XlaNode* xla_node) {
+  const xla::OpRequest& op_request = operation_request.request();
+  switch (op_request.op_case()) {
+    case xla::OpRequest::kBinaryOpRequest: {
+      const xla::BinaryOpRequest& op = op_request.binary_op_request();
+      xla_node->op_type =
+          BuildXlaNodeOp(op, xla::BinaryOperation_Name(op.binop()));
+      xla_node->name = BuildXlaNodeName(operation_request, xla_node->op_type);
+      xla_node->input_ids.emplace_back(std::make_tuple(op.lhs().handle(), 0));
+      xla_node->input_ids.emplace_back(std::make_tuple(op.rhs().handle(), 0));
+      for (const int64& dim : op.broadcast_dimensions()) {
+        xla_node->broadcast_dimensions.emplace_back(dim);
+      }
+      break;
+    }
+    case xla::OpRequest::kParameterRequest: {
+      const xla::ParameterRequest& op = op_request.parameter_request();
+      xla_node->op_type = BuildXlaNodeOp(op, "");
+      xla_node->name =
+          BuildXlaNodeName(operation_request, xla_node->op_type, op.name());
+      break;
+    }
+    case xla::OpRequest::kVariadicOpRequest: {
+      const xla::VariadicOpRequest& op = op_request.variadic_op_request();
+      xla_node->op_type =
+          BuildXlaNodeOp(op, xla::VariadicOperation_Name(op.varop()));
+      xla_node->name = BuildXlaNodeName(operation_request, xla_node->op_type);
+      for (const xla::ComputationDataHandle& handle : op.operands()) {
+        xla_node->input_ids.emplace_back(std::make_tuple(handle.handle(), 0));
+      }
+      break;
+    }
+    case xla::OpRequest::kGetTupleElementRequest: {
+      const xla::GetTupleElementRequest& op =
+          op_request.get_tuple_element_request();
+      xla_node->op_type = BuildXlaNodeOp(op);
+      xla_node->name = BuildXlaNodeName(operation_request, xla_node->op_type);
+      xla_node->input_ids.emplace_back(
+          std::make_tuple(op.operand().handle(), op.index()));
+      break;
+    }
+    default:
+      // TODO(satok): Implement all possible cases.
+      LOG(FATAL) << "Op request: " << op_request.op_case()
+                 << " is not supported yet.";
+      break;
+  }
+
+  CHECK(!xla_node->name.empty());
+  CHECK(!xla_node->op_type.empty());
+
+  TF_RETURN_IF_ERROR(ConvertXlaShapeToTensorShapeType(
+      operation_request.output_shape(), &xla_node->output_shapes,
+      &xla_node->output_data_types));
+  return Status::OK();
+}
+
+void SetupXlaCpuClient(std::unique_ptr<FunctionLibraryDefinition>* flib_def,
+                       std::unique_ptr<XlaCompiler>* compiler) {
+  xla::Client* client = xla::ClientLibrary::LocalClientOrDie();
+  XlaOpRegistry::RegisterCompilationKernels();
+
+  FunctionDefLibrary flib;
+  flib_def->reset(new FunctionLibraryDefinition(OpRegistry::Global(), flib));
+
+  // Setup compiler options
+  XlaCompiler::Options options;
+  DeviceType device_type(DEVICE_CPU_XLA_JIT);
+  options.device_type = &device_type;
+  options.flib_def = flib_def->get();
+  options.client = client;
+  compiler->reset(new XlaCompiler(options));
+}
+
+}  // namespace
+
+xla::StatusOr<std::unique_ptr<xla::SessionModule>>
+ConvertTfGraphToXlaSessionModule(const std::vector<XlaCompiler::Argument>& args,
+                                 std::unique_ptr<Graph> graph) {
+  CHECK(graph);
+
+  std::unique_ptr<FunctionLibraryDefinition> flib_def;
+  std::unique_ptr<XlaCompiler> compiler;
+
+  SetupXlaCpuClient(&flib_def, &compiler);
+
+  // Compile graph and build computation
+  XlaCompiler::CompilationResult result;
+  TF_CHECK_OK(compiler->CompileGraph(XlaCompiler::CompileOptions(), GRAPH_NAME,
+                                     std::move(graph), args, &result));
+
+  return result.computation->Snapshot();
+}
+
+xla::StatusOr<std::unordered_map<int64, XlaNode>>
+ConvertXlaSessionModuleToXlaNodes(const xla::SessionModule& session_module) {
+  std::unordered_map<int64, XlaNode> xla_nodes;
+  for (const auto& operation_request : session_module.entry().requests()) {
+    XlaNode xla_node;
+    TF_RETURN_IF_ERROR(
+        ConvertOpRequestToXlaNode(operation_request.second, &xla_node));
+    xla_nodes.emplace(operation_request.first, xla_node);
+  }
+  return std::move(xla_nodes);
+}
+
+}  // namespace xla_tf_graph
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.h b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..e635290851f7e5d078d98d845e7488fc3cd94049
--- /dev/null
+++ b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.h
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_XLA_TF_GRAPH_XLA_TF_GRAPH_UTIL_H_
+#define TENSORFLOW_CONTRIB_XLA_TF_GRAPH_XLA_TF_GRAPH_UTIL_H_
+
+#include <unordered_map>
+
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace xla_tf_graph {
+
+// A set of utilities to handle xla computation requests.
+// These utilities help developers leverage existing tools to work with
+// xla computations, also provide a way to support TensorFlow ops by
+// implementing xla computations so that they can do experiments on their
+// specialized environments.
+
+// A structure to represent typed attributes of TensorFlow graph node.
+// This structure contains op specific attributes as members so that
+// we can treat them explicitly.
+struct XlaNode {
+  // Unique node name
+  string name;
+  // Op type of xla computation
+  string op_type;
+  // List of pair of unique id and port of input node.
+  // We store this value instead
+  // of node name in order not to wait for all XlaNodes to be constructed.
+  std::vector<std::tuple<int64, int>> input_ids;
+  // Oputput shapes
+  std::vector<TensorShape> output_shapes;
+  // Output data types
+  std::vector<DataType> output_data_types;
+
+  //---------------------------
+  // Op specific attributes
+  // #xla::OpRequest::kBinaryOpRequest
+  std::vector<int64> broadcast_dimensions;
+};
+
+// Convert a tf graph to a xla session module
+xla::StatusOr<std::unique_ptr<xla::SessionModule>>
+ConvertTfGraphToXlaSessionModule(const std::vector<XlaCompiler::Argument>& args,
+                                 std::unique_ptr<Graph> graph);
+
+// Convert a xla session module to a map to XlaNode from unique id
+xla::StatusOr<std::unordered_map<int64, XlaNode>>
+ConvertXlaSessionModuleToXlaNodes(const xla::SessionModule& session_module);
+
+}  // namespace xla_tf_graph
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_XLA_TF_GRAPH_XLA_TF_GRAPH_UTIL_H_
diff --git a/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..91aa9c7f439470661acae0d8f8c82a771b68d88c
--- /dev/null
+++ b/tensorflow/contrib/xla_tf_graph/xla_tf_graph_util_test.cc
@@ -0,0 +1,131 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/xla_tf_graph/xla_tf_graph_util.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace xla_tf_graph {
+
+static std::unique_ptr<Graph> BuildAddGraph() {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Arg(scope.WithOpName("B"), DT_INT32, 1);
+  // See tf2xla/kernels/binary_ops.cc
+  auto c = ops::Add(scope.WithOpName("C"), a, b);
+  auto d = ops::_Retval(scope.WithOpName("D"), c, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_CHECK_OK(scope.ToGraph(graph.get()));
+  return graph;
+}
+
+static std::vector<XlaCompiler::Argument> BuildAddGraphArguments() {
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  // Difference of dimension will add extra broadcast_dimensions.
+  // broadcast_dimension generates an additional HloInstruction
+  // in user_computation.cc
+  args[0].shape = TensorShape({2, 2});
+  args[1].kind = XlaCompiler::Argument::kParameter;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2});
+  return args;
+}
+
+// CAVEAT: Debug purpose only.
+// This function dumps a protobuf string format of HloModule.
+static void DumpHloGraphForDebug(const std::vector<XlaCompiler::Argument>& args,
+                                 std::unique_ptr<Graph> graph) {
+  std::unique_ptr<FunctionLibraryDefinition> flib_def;
+  std::unique_ptr<FunctionLibraryRuntime> flr;
+  std::unique_ptr<XlaCompiler> compiler;
+
+  xla::Client* client = xla::ClientLibrary::LocalClientOrDie();
+  XlaOpRegistry::RegisterCompilationKernels();
+
+  FunctionDefLibrary flib;
+  flib_def.reset(new FunctionLibraryDefinition(OpRegistry::Global(), flib));
+
+  // Compiles the graph.
+  XlaCompiler::Options options;
+  DeviceType device_type("XLA_CPU_JIT");
+  options.device_type = &device_type;
+  options.client = client;
+  options.flib_def = flib_def.get();
+  compiler.reset(new XlaCompiler(options));
+
+  // Compile graph
+  XlaCompiler::CompilationResult result;
+  TF_CHECK_OK(compiler->CompileGraph(XlaCompiler::CompileOptions(), "dump",
+                                     std::move(graph), args, &result));
+
+  // Convert to hlo
+  xla::Computation& computation = *result.computation;
+
+  xla::Service* service(
+      static_cast<xla::Service*>(xla::ClientLibrary::GetXlaService(
+          static_cast<xla::LocalClient*>(client)->platform())));
+  const xla::ComputationTracker& computation_tracker =
+      service->computation_tracker();
+
+  auto user_computation_status =
+      computation_tracker.Resolve(computation.handle());
+  TF_CHECK_OK(user_computation_status.status());
+  auto user_computation = user_computation_status.ConsumeValueOrDie();
+  xla::VersionedComputationHandle versioned_handle =
+      user_computation->GetVersionedHandle();
+  std::unique_ptr<xla::HloModule> hlo_module = std::move(
+      computation_tracker.BuildHloModule(versioned_handle, /*config=*/nullptr)
+          .ValueOrDie());
+  VLOG(1) << "--- DUMP HLO ---";
+  VLOG(1) << hlo_module->ToString();
+}
+
+TEST(XlaTfGraphUtil, ConvertTfGraphToSessionModule) {
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args = BuildAddGraphArguments();
+  std::unique_ptr<Graph> graph = BuildAddGraph();
+
+  TF_ASSIGN_OR_ASSERT_OK(
+      std::unique_ptr<xla::SessionModule> session_module,
+      ConvertTfGraphToXlaSessionModule(args, std::move(graph)));
+
+  ASSERT_EQ(5, session_module->entry().requests_size());
+
+  VLOG(1) << "--- DUMP ---";
+  VLOG(1) << session_module->DebugString();
+  DumpHloGraphForDebug(args, BuildAddGraph());
+}
+
+TEST(XlaTfGraphUtil, ConvertXlaSessionModuleToXlaNodes) {
+  std::vector<XlaCompiler::Argument> args = BuildAddGraphArguments();
+  std::unique_ptr<Graph> graph = BuildAddGraph();
+  TF_ASSIGN_OR_ASSERT_OK(
+      std::unique_ptr<xla::SessionModule> session_module,
+      ConvertTfGraphToXlaSessionModule(args, std::move(graph)));
+  TF_ASSIGN_OR_ASSERT_OK(auto xla_nodes,
+                         ConvertXlaSessionModuleToXlaNodes(*session_module));
+  EXPECT_EQ(session_module->entry().requests_size(), xla_nodes.size());
+}
+
+}  // namespace xla_tf_graph
+}  // namespace tensorflow
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index ba761cd7c6fc12f9d8cd9c48a16e4f61995f8eda..6a0e0d44a4f228b6ccf785ed3d1093bfb2c74b5e 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -108,6 +108,7 @@ load(
     "tf_additional_cloud_op_deps",
     "tf_additional_cloud_kernel_deps",
     "tf_lib_proto_parsing_deps",
+    "tf_additional_verbs_lib_defines",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -153,7 +154,9 @@ CORE_PROTO_SRCS = [
     "framework/versions.proto",
     "lib/core/error_codes.proto",
     "protobuf/config.proto",
+    "protobuf/cluster.proto",
     "protobuf/debug.proto",
+    "protobuf/device_properties.proto",
     "protobuf/queue_runner.proto",
     "protobuf/rewriter_config.proto",
     "protobuf/tensor_bundle.proto",
@@ -272,6 +275,7 @@ cc_library(
         "lib/monitoring/sampler.h",
         "lib/random/distribution_sampler.h",
         "lib/random/philox_random.h",
+        "lib/random/random_distributions.h",
         "lib/random/simple_philox.h",
         "lib/strings/numbers.h",
         "lib/strings/str_util.h",
@@ -383,6 +387,7 @@ tf_cuda_library(
         "util/bcast.h",
         "util/cuda_kernel_helper.h",
         "util/device_name_utils.h",
+        "util/env_var.h",
         "util/events_writer.h",
         "util/example_proto_fast_parsing.h",
         "util/example_proto_helper.h",
@@ -494,16 +499,17 @@ cc_library(
 tf_gen_op_libs(
     op_lib_names = [
         "array_ops",
-        "audio_ops",
         "candidate_sampling_ops",
         "control_flow_ops",
         "ctc_ops",
         "data_flow_ops",
+        "dataset_ops",
         "function_ops",
         "functional_ops",
         "image_ops",
         "io_ops",
         "linalg_ops",
+        "lookup_ops",
         "logging_ops",
         "math_ops",
         "nn_ops",
@@ -519,11 +525,19 @@ tf_gen_op_libs(
         "sparse_ops",
         "spectral_ops",
         "state_ops",
+        "stateless_random_ops",
         "string_ops",
         "training_ops",
     ],
 )
 
+tf_gen_op_libs(
+    op_lib_names = [
+        "audio_ops",
+    ],
+    deps = [":lib"],
+)
+
 cc_library(
     name = "debug_ops_op_lib",
     srcs = ["ops/debug_ops.cc"],
@@ -567,11 +581,13 @@ cc_library(
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
         ":data_flow_ops_op_lib",
+        ":dataset_ops_op_lib",
         ":function_ops_op_lib",
         ":functional_ops_op_lib",
         ":image_ops_op_lib",
         ":io_ops_op_lib",
         ":linalg_ops_op_lib",
+        ":lookup_ops_op_lib",
         ":logging_ops_op_lib",
         ":math_ops_op_lib",
         ":nn_ops_op_lib",
@@ -585,6 +601,7 @@ cc_library(
         ":sparse_ops_op_lib",
         ":spectral_ops_op_lib",
         ":state_ops_op_lib",
+        ":stateless_random_ops_op_lib",
         ":string_ops_op_lib",
         ":training_ops_op_lib",
         ":user_ops_op_lib",
@@ -686,16 +703,19 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core/kernels:array",
+        "//tensorflow/core/kernels:audio",
         "//tensorflow/core/kernels:bincount_op",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:ctc_ops",
         "//tensorflow/core/kernels:data_flow",
+        "//tensorflow/core/kernels:dataset_ops",
         "//tensorflow/core/kernels:fake_quant_ops",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:image",
         "//tensorflow/core/kernels:io",
         "//tensorflow/core/kernels:linalg",
+        "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
         "//tensorflow/core/kernels:math",
         "//tensorflow/core/kernels:multinomial_op",
@@ -711,6 +731,7 @@ cc_library(
         "//tensorflow/core/kernels:set_kernels",
         "//tensorflow/core/kernels:sparse",
         "//tensorflow/core/kernels:state",
+        "//tensorflow/core/kernels:stateless_random_ops",
         "//tensorflow/core/kernels:string",
         "//tensorflow/core/kernels:training_ops",
         "//tensorflow/core/kernels:word2vec_kernels",
@@ -719,10 +740,16 @@ cc_library(
         "//tensorflow/core/kernels:array_not_windows",
         "//tensorflow/core/kernels:math_not_windows",
         "//tensorflow/core/kernels:quantized_ops",
+        "//tensorflow/core/kernels/neon:neon_depthwise_conv_op",
     ]) + if_mkl([
+        "//tensorflow/core/kernels:mkl_concat_op",
         "//tensorflow/core/kernels:mkl_conv_op",
+        "//tensorflow/core/kernels:mkl_fused_batch_norm_op",
+        "//tensorflow/core/kernels:mkl_identity_op",
+        "//tensorflow/core/kernels:mkl_lrn_op",
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
+        "//tensorflow/core/kernels:mkl_reshape_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
     ]),
 )
@@ -824,14 +851,13 @@ filegroup(
 
 # Core sources for Android builds.
 filegroup(
-    name = "android_srcs",
+    name = "mobile_srcs",
     srcs = [
         ":proto_text_srcs_all",
         "//tensorflow/core/kernels:android_srcs",
         "//tensorflow/core/platform/default/build_config:android_srcs",
         "//tensorflow/core/util/ctc:android_srcs",
         "//tensorflow/core/util/tensor_bundle:android_srcs",
-        "//tensorflow/core/grappler:android_srcs",
         "common_runtime/gpu/gpu_tracer.cc",
         "common_runtime/gpu/gpu_tracer.h",
     ] + glob(
@@ -858,7 +884,6 @@ filegroup(
             "**/*main.cc",
             "debug/**/*",
             "framework/op_gen_*",
-            "framework/reader_base.*",
             "graph/dot.*",
             "lib/jpeg/**/*",
             "lib/png/**/*",
@@ -909,6 +934,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":protos_cc",
+        ":reader_base_proto_cc",
         "//third_party/eigen3",
     ],
     alwayslink = 1,
@@ -997,6 +1023,27 @@ cc_library(
     alwayslink = 1,
 )
 
+# Android library for use with the SELECTIVE_REGISTRATION feature with
+# no proto_rtti.
+cc_library(
+    name = "android_tensorflow_lib_selective_registration_nortti",
+    srcs = if_android(["//tensorflow/core:android_srcs"]),
+    copts = tf_copts() + tf_opts_nortti_if_android() + [
+        "-Os",
+        "-DSUPPORT_SELECTIVE_REGISTRATION",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":protos_cc",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
 filegroup(
     name = "android_op_registrations_and_gradients",
     srcs = glob(
@@ -1260,10 +1307,15 @@ cc_library(
         "platform/tracing.h",
     ],
     copts = tf_copts(),
-    defines = tf_additional_lib_defines() + ["SNAPPY"],
+    defines = tf_additional_lib_defines() + [
+        "SNAPPY",
+    ] + tf_additional_verbs_lib_defines(),
     linkopts = select({
         "//tensorflow:freebsd": [],
-        "//conditions:default": ["-ldl"],
+        "//conditions:default": [
+            "-ldl",
+            "-lpthread",
+        ],
     }),
     deps = tf_additional_lib_deps() + [
         ":lib_hash_crc32c_accelerate_internal",
@@ -1360,6 +1412,11 @@ tf_cuda_library(
             "framework/**/*.cc",
             "util/**/*.h",
             "util/**/*.cc",
+        ] + [
+            "graph/edgeset.h",
+            "graph/edgeset.cc",
+            "graph/graph.h",
+            "graph/graph.cc",
         ],
         exclude = [
             "**/*test*",
@@ -1464,42 +1521,151 @@ cc_library(
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
+CORE_CPU_BASE_HDRS = [
+    "common_runtime/device.h",
+    "common_runtime/graph_runner.h",
+    "common_runtime/shape_refiner.h",
+    "framework/versions.h",
+    "graph/algorithm.h",
+    "graph/colors.h",
+    "graph/control_flow.h",
+    "graph/costmodel.h",
+    "graph/default_device.h",
+    "graph/edgeset.h",
+    "graph/graph.h",
+    "graph/graph_constructor.h",
+    "graph/graph_def_builder.h",
+    "graph/graph_partition.h",
+    "graph/mkl_layout_pass.h",
+    "graph/mkl_tfconversion_pass.h",
+    "graph/node_builder.h",
+    "graph/optimizer_cse.h",
+    "graph/subgraph.h",
+    "graph/tensor_id.h",
+    "graph/testlib.h",
+    "graph/types.h",
+    "graph/validate.h",
+]
+
+tf_cuda_library(
+    name = "core_cpu_base",
+    srcs = [
+        "common_runtime/shape_refiner.cc",
+        "common_runtime/shape_refiner.h",
+        "framework/versions.h",
+        "graph/algorithm.cc",
+        "graph/colors.cc",
+        "graph/control_flow.cc",
+        "graph/costmodel.cc",
+        "graph/graph_constructor.cc",
+        "graph/graph_def_builder.cc",
+        "graph/graph_partition.cc",
+        "graph/node_builder.cc",
+        "graph/optimizer_cse.cc",
+        "graph/subgraph.cc",
+        "graph/tensor_id.cc",
+        "graph/validate.cc",
+        "public/session.h",
+        "public/session_options.h",
+        "public/version.h",
+    ],
+    hdrs = CORE_CPU_BASE_HDRS,
+    copts = tf_copts(),
+    deps = [
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":proto_text",
+        ":protos_all_cc",
+        "//tensorflow/core/kernels:required",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
 tf_cuda_library(
     name = "core_cpu_internal",
-    srcs = glob(
-        [
-            "client/**/*.cc",
-            "common_runtime/*.h",
-            "common_runtime/*.cc",
-            "framework/versions.h",
-            "graph/**/*.h",
-            "graph/**/*.cc",
-            "public/session.h",
-            "public/session_options.h",
-            "public/version.h",
-        ],
-        exclude = [
-            "**/*test*",
-            "**/*main.cc",
-            "common_runtime/direct_session.cc",
-            "common_runtime/direct_session.h",
-            "common_runtime/gpu_device_context.h",
-        ],
-    ),
-    hdrs = glob(
-        [
-            "common_runtime/*.h",
-            "framework/versions.h",
-            "graph/**/*.h",
-        ],
-        exclude = [
-            "**/*test*",
-            "common_runtime/direct_session.h",
-            "common_runtime/gpu_device_context.h",
-        ],
-    ),
+    srcs = [
+        "common_runtime/allocator_retry.cc",
+        "common_runtime/bfc_allocator.cc",
+        "common_runtime/build_graph_options.cc",
+        "common_runtime/constant_folding.cc",
+        "common_runtime/copy_tensor.cc",
+        "common_runtime/costmodel_manager.cc",
+        "common_runtime/debugger_state_interface.cc",
+        "common_runtime/device.cc",
+        "common_runtime/device_factory.cc",
+        "common_runtime/device_mgr.cc",
+        "common_runtime/device_set.cc",
+        "common_runtime/executor.cc",
+        "common_runtime/function.cc",
+        "common_runtime/graph_optimizer.cc",
+        "common_runtime/graph_runner.cc",
+        "common_runtime/local_device.cc",
+        "common_runtime/memory_types.cc",
+        "common_runtime/optimization_registry.cc",
+        "common_runtime/parallel_concat_optimizer.cc",
+        "common_runtime/process_util.cc",
+        "common_runtime/renamed_device.cc",
+        "common_runtime/rendezvous_mgr.cc",
+        "common_runtime/resource_variable_read_optimizer.cc",
+        "common_runtime/session.cc",
+        "common_runtime/session_factory.cc",
+        "common_runtime/session_options.cc",
+        "common_runtime/session_state.cc",
+        "common_runtime/simple_graph_execution_state.cc",
+        "common_runtime/simple_placer.cc",
+        "common_runtime/stats_publisher_interface.cc",
+        "common_runtime/step_stats_collector.cc",
+        "common_runtime/threadpool_device.cc",
+        "common_runtime/threadpool_device_factory.cc",
+        "graph/gradients.cc",
+        "graph/mkl_layout_pass.cc",
+        "graph/mkl_tfconversion_pass.cc",
+        "graph/quantize_training.cc",
+        "public/session.h",
+        "public/session_options.h",
+        "public/version.h",
+    ],
+    hdrs = CORE_CPU_BASE_HDRS + [
+        "common_runtime/allocator_retry.h",
+        "common_runtime/bfc_allocator.h",
+        "common_runtime/build_graph_options.h",
+        "common_runtime/constant_folding.h",
+        "common_runtime/copy_tensor.h",
+        "common_runtime/costmodel_manager.h",
+        "common_runtime/debugger_state_interface.h",
+        "common_runtime/device_factory.h",
+        "common_runtime/device_mgr.h",
+        "common_runtime/device_set.h",
+        "common_runtime/dma_helper.h",
+        "common_runtime/eigen_thread_pool.h",
+        "common_runtime/executor.h",
+        "common_runtime/function.h",
+        "common_runtime/graph_optimizer.h",
+        "common_runtime/local_device.h",
+        "common_runtime/memory_types.h",
+        "common_runtime/mkl_cpu_allocator.h",
+        "common_runtime/optimization_registry.h",
+        "common_runtime/pending_counts.h",
+        "common_runtime/process_util.h",
+        "common_runtime/profile_handler.h",
+        "common_runtime/renamed_device.h",
+        "common_runtime/rendezvous_mgr.h",
+        "common_runtime/session_factory.h",
+        "common_runtime/simple_graph_execution_state.h",
+        "common_runtime/simple_placer.h",
+        "common_runtime/stats_publisher_interface.h",
+        "common_runtime/step_stats_collector.h",
+        "common_runtime/threadpool_device.h",
+        "common_runtime/visitable_allocator.h",
+        "graph/gradients.h",
+        "graph/quantize_training.h",
+    ],
     copts = tf_copts(),
     deps = [
+               ":core_cpu_base",
                ":framework",
                ":framework_internal",
                ":function_ops_op_lib",
@@ -1510,6 +1676,8 @@ tf_cuda_library(
                ":proto_text",
                ":protos_all_cc",
                "//tensorflow/core/grappler:grappler_item",
+               "//tensorflow/core/grappler/clusters:utils",
+               "//tensorflow/core/grappler/clusters:virtual_cluster",
                "//tensorflow/core/grappler/optimizers:meta_optimizer",
                "//third_party/eigen3",
                "//tensorflow/core/kernels:required",
@@ -1535,7 +1703,10 @@ cc_library(
 tf_cuda_library(
     name = "direct_session_internal",
     srcs = ["common_runtime/direct_session.cc"],
-    hdrs = ["common_runtime/direct_session.h"],
+    hdrs = [
+        "common_runtime/direct_session.h",
+        "util/env_var.h",
+    ],
     copts = tf_copts(),
     cuda_deps = [
         ":gpu_tracer",
@@ -1548,6 +1719,8 @@ tf_cuda_library(
         ":lib_internal",
         ":proto_text",
         ":protos_all_cc",
+        "//tensorflow/core/debug:debug_graph_utils",
+        "//tensorflow/core/kernels:function_ops",
     ],
     alwayslink = 1,
 )
@@ -2073,7 +2246,6 @@ tf_cc_test_mkl(
     size = "small",
     srcs = [
         "graph/mkl_layout_pass_test.cc",
-        "graph/mkl_optimizer_merge_test.cc",
         "graph/mkl_tfconversion_pass_test.cc",
     ],
     linkstatic = tf_kernel_tests_linkstatic(),
@@ -2094,9 +2266,14 @@ tf_cc_test_mkl(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:scope",
         "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/core/kernels:mkl_concat_op",
         "//tensorflow/core/kernels:mkl_conv_op",
+        "//tensorflow/core/kernels:mkl_fused_batch_norm_op",
+        "//tensorflow/core/kernels:mkl_identity_op",
+        "//tensorflow/core/kernels:mkl_lrn_op",
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
+        "//tensorflow/core/kernels:mkl_reshape_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
         "//tensorflow/core/kernels:ops_util",
         "//third_party/eigen3",
@@ -2193,9 +2370,12 @@ tf_cc_test(
         ":test_main",
         ":testlib",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:sendrecv_ops",
         "//tensorflow/core/kernels:bcast_ops",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:concat_op",
+        "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:immutable_constant_op",
         "//tensorflow/core/kernels:matmul_op",
@@ -2387,6 +2567,9 @@ tf_cc_test(
         ":test_main",
         ":testlib",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
@@ -2691,9 +2874,12 @@ filegroup(
         # -- hand-edited variant: stops after a restart marker
         "lib/jpeg/testdata/corrupt34_4.jpg",
         # GIF data
+        "lib/gif/testdata/lena.gif",
         "lib/gif/testdata/scan.gif",
         # GIF data with optimization
         "lib/gif/testdata/optimized.gif",
+        # BMP data
+        "lib/bmp/testdata/lena.bmp",
     ],
     visibility = ["//visibility:public"],
 )
@@ -2731,3 +2917,9 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+alias(
+    name = "android_srcs",
+    actual = ":mobile_srcs",
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index b18209cb605b7761e97b05e70f50c9700b3c769a..2cf668400e6e3871a2852c6caf9cc49d90e9c4f8 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -453,8 +453,8 @@ void BFCAllocator::RemoveFreeChunkIterFromBin(
 void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
   Chunk* c = ChunkFromHandle(h);
   CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
-  int count = BinFromIndex(c->bin_num)->free_chunks.erase(h);
-  CHECK(count > 0) << "Could not find chunk in bin";
+  CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0)
+      << "Could not find chunk in bin";
   c->bin_num = kInvalidBinNum;
 }
 
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 0b528cb0c270c5d3e1fb506968aa419a319c9a6c..b74c161dcec1e7f9b0cc9ae00bfdd4cbbdba0016 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -78,7 +78,7 @@ class BFCAllocator : public VisitableAllocator {
 
   // A ChunkHandle is an index into the chunks_ vector in BFCAllocator
   // kInvalidChunkHandle means an invalid chunk
-  typedef int ChunkHandle;
+  typedef size_t ChunkHandle;
   static const int kInvalidChunkHandle = -1;
 
   typedef int BinNum;
diff --git a/tensorflow/core/common_runtime/build_graph_options.h b/tensorflow/core/common_runtime/build_graph_options.h
index c6d4bdad9c1e07fb393964804317ca54c35bd681..5f0e8f170b9e9b0c6a3094e475fcc3bbf47756ea 100644
--- a/tensorflow/core/common_runtime/build_graph_options.h
+++ b/tensorflow/core/common_runtime/build_graph_options.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/debug.pb.h"
 
 namespace tensorflow {
 
@@ -30,6 +31,13 @@ struct BuildGraphOptions {
   // the former via "ref" fetch_endpoints.
   std::vector<string> target_nodes;
 
+  // If `true`, uses Arg/Retval to implement feeds/fetches; otherwise
+  // uses Recv/Send to implement feeds/fetches.
+  // TODO(mrry): Remove this when the distributed runtime supports Arg/Retval.
+  bool use_function_convention = false;
+
+  DebugOptions debug_options;
+
   string DebugString() const;
 };
 
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 5db49aa498c5807ca4630648c98d6e4555f50b84..914683d9fa37495c0346b0c5de7743985f5459e5 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -34,6 +34,8 @@ limitations under the License.
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -42,7 +44,10 @@ namespace tensorflow {
 namespace {
 
 bool IsConstantFoldable(const Node* n,
-                        std::function<bool(const Node*)> consider) {
+                        const std::function<bool(const Node*)>& consider) {
+  if (n->IsConstant()) {
+    return true;
+  }
   if (n->op_def().is_stateful()) {
     return false;
   }
@@ -77,50 +82,65 @@ bool IsConstantFoldable(const Node* n,
   return true;
 }
 
-// Returns the constant foldable nodes in `nodes_result` in data flow order.
-void FindConstantFoldableNodes(const Graph* graph,
-                               const FunctionLibraryDefinition* flib_def,
-                               ConstantFoldingOptions opts,
-                               std::vector<Node*>* nodes_result) {
-  std::set<const Node*> node_set;
-  std::vector<Node*>& nodes = *nodes_result;
+// Returns the constant foldable nodes in `nodes` in topological order.
+// Populates `constant_control_deps` with the non-constant control dependencies
+// of each constant node.
+void FindConstantFoldableNodes(
+    const Graph* graph, ConstantFoldingOptions opts, std::vector<Node*>* nodes,
+    std::unordered_map<const Node*, gtl::FlatSet<Node*>>*
+        constant_control_deps) {
   bool internal_node_inserted = false;
   // Walk the nodes in data flow order
-  ReverseDFS(*graph, nullptr, [&nodes, &node_set, &internal_node_inserted, opts,
-                               flib_def](Node* n) {
-    if (n->IsConstant()) {
-      // Constants with no control inputs (except from _SOURCE node)
-      // are definitely constant foldable.
-      if (n->in_edges().size() == 0 ||
-          (n->in_edges().size() == 1 &&
-           (*n->in_edges().begin())->src()->IsSource())) {
-        node_set.insert(n);
-        nodes.push_back(n);
-      }
-    } else if (IsConstantFoldable(n, opts.consider)) {
-      // Check whether the set of this node's in_nodes is completely
-      // included in the set of constant foldable nodes. If true,
-      // then this node is also constant foldable.
-      bool all_parents_constant = true;
-      for (const Node* parent : n->in_nodes()) {
-        if (node_set.count(parent) == 0 && !parent->IsSource()) {
-          all_parents_constant = false;
-          break;
+  ReverseDFS(
+      *graph, nullptr,
+      [nodes, constant_control_deps, &internal_node_inserted, opts](Node* n) {
+        if (IsConstantFoldable(n, opts.consider)) {
+          // A node is constant provided all of its non-control
+          // incoming Tensors come from constant nodes.
+          //
+          // We allow control dependencies from non-constant nodes to constant
+          // nodes, but to preserve the graph structure we must transfer the
+          // control dependency onto any constant replacement.
+          bool all_parents_constant = true;
+          for (const Edge* in : n->in_edges()) {
+            // Allows non-constant -> constant control edges.
+            if (!in->IsControlEdge() &&
+                constant_control_deps->count(in->src()) == 0) {
+              all_parents_constant = false;
+              break;
+            }
+          }
+          if (all_parents_constant) {
+            gtl::FlatSet<Node*>& control_deps = (*constant_control_deps)[n];
+            for (const Edge* e : n->in_edges()) {
+              if (constant_control_deps->count(e->src()) == 0) {
+                if (!e->src()->IsSource()) {
+                  control_deps.insert(e->src());
+                }
+              } else {
+                // If the parent is constant, add all of its transitive control
+                // deps.
+                const gtl::FlatSet<Node*>& parent_deps =
+                    (*constant_control_deps)[e->src()];
+                control_deps.insert(parent_deps.begin(), parent_deps.end());
+              }
+            }
+            nodes->push_back(n);
+            if (!n->IsConstant()) {
+              internal_node_inserted = true;
+            }
+          }
         }
-      }
-      if (all_parents_constant) {
-        node_set.insert(n);
-        nodes.push_back(n);
-        internal_node_inserted = true;
-      }
-    }
-  });
+      });
   // If we have inserted just leaf level nodes, then there is nothing to fold.
   if (!internal_node_inserted) {
-    nodes.clear();
+    nodes->clear();
+    constant_control_deps->clear();
   }
 }
 
+typedef std::pair<Node*, int> NodeAndOutput;
+
 // Given the constant foldable nodes in 'nodes', returns a new graph 'g'. 'g'
 // will contain copies of the nodes in 'nodes'. In addition, if there is an edge
 // going from a node 'n' in 'nodes' to another node in 'orig_graph' but not in
@@ -131,23 +151,21 @@ Graph* GetConstantGraph(const Graph* orig_graph,
                         std::map<NodeAndOutput, Node*>* tensors_to_fetch) {
   Graph* constant_graph = new Graph(orig_graph->op_registry());
   std::unordered_map<Node*, Node*> node_map;
-  std::set<Node*> already_added;
-  already_added.insert(constant_graph->source_node());
-  already_added.insert(constant_graph->sink_node());
   node_map[orig_graph->source_node()] = constant_graph->source_node();
   node_map[orig_graph->sink_node()] = constant_graph->sink_node();
   for (Node* n : nodes) {
     Node* added = constant_graph->CopyNode(n);
     node_map[n] = added;
-    already_added.insert(added);
     for (const Edge* in_edge : n->in_edges()) {
-      Node* in = in_edge->src();
-      CHECK_GT(node_map.count(in), size_t{0}) << n->DebugString() << " <-"
-                                              << in->DebugString();
-      CHECK_GT(already_added.count(node_map[in]), size_t{0})
-          << in->DebugString();
-      constant_graph->AddEdge(node_map[in], in_edge->src_output(), added,
-                              in_edge->dst_input());
+      // Don't copy control edges to the constant graph.
+      if (!in_edge->IsControlEdge()) {
+        Node* in = in_edge->src();
+        auto it = node_map.find(in);
+        CHECK(it != node_map.end())
+            << n->DebugString() << " <-" << in->DebugString();
+        constant_graph->AddEdge(it->second, in_edge->src_output(), added,
+                                in_edge->dst_input());
+      }
     }
   }
 
@@ -169,10 +187,15 @@ int64 UniqueConstantId() {
   return id.fetch_add(1);
 }
 
-}  // namespace
-
+// Replaces the identified Tensor in 'graph' by a 'Const' node with
+// the value supplied in 'constant'. 'partition_device', if non-null
+// is the device where the graph executes. Returns true if the
+// replacement was successful, false otherwise.
+// 'control_deps' is the set of nodes that should be control predecessors of the
+// new constant node.
 bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
-                               NodeAndOutput tensor, const Tensor& constant) {
+                               NodeAndOutput tensor, const Tensor& constant,
+                               const gtl::FlatSet<Node*>& control_deps) {
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
   // 1) If the destination tensor is not an int32 tensor, and has HOST_MEMORY
@@ -236,8 +259,8 @@ bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
     return false;
   }
 
-  VLOG(1) << "Replacing " << tensor.first->DebugString()
-          << " :: " << tensor.second << " with a constant";
+  VLOG(1) << "Replacing " << tensor.first->name() << " :: " << tensor.second
+          << " with a constant";
 
   if (!NodeBuilder(builder).Finalize(graph, &constant_node).ok()) {
     return false;
@@ -246,35 +269,30 @@ bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
     graph->AddEdge(constant_node, 0, edge->dst(), edge->dst_input());
     graph->RemoveEdge(edge);
   }
-  graph->AddEdge(graph->source_node(), -1, constant_node, -1);
+  if (control_deps.empty()) {
+    graph->AddControlEdge(graph->source_node(), constant_node);
+  } else {
+    for (Node* node : control_deps) {
+      graph->AddControlEdge(node, constant_node);
+    }
+  }
   if (partition_device) {
     constant_node->set_assigned_device_name(partition_device->name());
   }
   return true;
 }
 
-bool DoConstantFolding(const ConstantFoldingOptions& opts,
-                       FunctionLibraryRuntime* function_library, Env* env,
-                       Device* partition_device, Graph* graph) {
-  bool was_mutated;
-  Status unused_status = DoConstantFoldingWithStatus(
-      opts, function_library, env, partition_device, graph, &was_mutated);
-  return was_mutated;
-}
+}  // namespace
 
-Status DoConstantFoldingWithStatus(const ConstantFoldingOptions& opts,
-                                   FunctionLibraryRuntime* function_library,
-                                   Env* env, Device* partition_device,
-                                   Graph* graph, bool* was_mutated) {
+Status ConstantFold(const ConstantFoldingOptions& opts,
+                    FunctionLibraryRuntime* function_library, Env* env,
+                    Device* partition_device, Graph* graph, bool* was_mutated) {
   DumpGraph("Before", graph);
 
-  const FunctionLibraryDefinition* flib_def = nullptr;
-  if (function_library) {
-    flib_def = function_library->GetFunctionLibraryDefinition();
-  }
-
   std::vector<Node*> constant_foldable_nodes;
-  FindConstantFoldableNodes(graph, flib_def, opts, &constant_foldable_nodes);
+  std::unordered_map<const Node*, gtl::FlatSet<Node*>> constant_control_deps;
+  FindConstantFoldableNodes(graph, opts, &constant_foldable_nodes,
+                            &constant_control_deps);
   if (constant_foldable_nodes.empty()) {
     VLOG(1) << "No constant foldable nodes found";
     *was_mutated = false;
@@ -304,10 +322,18 @@ Status DoConstantFoldingWithStatus(const ConstantFoldingOptions& opts,
     tensors_to_replace.push_back({n.second, n.first.second});
   }
 
+  auto graph_runner = std::unique_ptr<GraphRunner>(new GraphRunner(env));
   // Evaluate the constant foldable nodes.
   std::vector<Tensor> outputs;
-  Status s = GraphRunner::Run(constant_graph.get(), function_library, env,
-                              {} /* inputs*/, tensors_to_fetch_names, &outputs);
+  auto delete_tensors = gtl::MakeCleanup([&graph_runner, &outputs] {
+    // Output tensors need to be cleared before the GraphRunner is deleted.
+    outputs.clear();
+    graph_runner.reset(nullptr);
+  });
+
+  Status s =
+      graph_runner->Run(constant_graph.get(), function_library, {} /* inputs*/,
+                        tensors_to_fetch_names, &outputs);
   if (!s.ok()) {
     VLOG(1) << "Could not fetch constants: " << s;
     *was_mutated = false;
@@ -319,8 +345,11 @@ Status DoConstantFoldingWithStatus(const ConstantFoldingOptions& opts,
   // original graph with those constants.
   int32 num_nodes_replaced = 0;
   for (size_t c = 0; c < outputs.size(); ++c) {
+    const gtl::FlatSet<Node*>& control_deps =
+        constant_control_deps[tensors_to_replace[c].first];
     if (ReplaceTensorWithConstant(graph, partition_device,
-                                  tensors_to_replace[c], outputs[c])) {
+                                  tensors_to_replace[c], outputs[c],
+                                  control_deps)) {
       ++num_nodes_replaced;
     }
   }
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index 9e3479e50b094be7d3829f78629319c1e2a422ac..93289b875f5266558baecf1df3308c6430e04a9a 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,12 +17,20 @@ limitations under the License.
 #define TENSORFLOW_COMMON_RUNTIME_CONSTANT_FOLDING_H_
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 
+// Options specific to constant folding optimizations.
+struct ConstantFoldingOptions {
+  // If "consider" is not a nullptr, then only constant fold a node "n" if
+  // consider(n) returns true.
+  std::function<bool(const Node*)> consider = nullptr;
+};
+
 // Perform constant folding optimization on "graph".
 // Looks for nodes in "graph" that can be completely evaluated statically, i.e.,
 // that are only dependent on constants. Evaluates those nodes on a CPU device
@@ -32,25 +40,9 @@ namespace tensorflow {
 // Sets `was_mutated` to true if and only if "graph" has been mutated.
 // The status is only set to a non-OK state if an unexpected error is hit
 // running the graph.
-Status DoConstantFoldingWithStatus(const ConstantFoldingOptions& opts,
-                                   FunctionLibraryRuntime* function_library,
-                                   Env* env, Device* partition_device,
-                                   Graph* graph, bool* was_mutated);
-
-// Version of the function that doesn't return a Status, for backwards
-// compatibility.
-bool DoConstantFolding(const ConstantFoldingOptions& opts,
-                       FunctionLibraryRuntime* function_library, Env* env,
-                       Device* partition_device, Graph* graph);
-
-typedef std::pair<Node*, int> NodeAndOutput;
-
-// Replaces the identified Tensor in 'graph' by a 'Const' node with
-// the value supplied in 'constant'. 'partition_device', if non-null
-// is the device where the graph executes. Returns true if the
-// replacement was successful, false otherwise.
-bool ReplaceTensorWithConstant(Graph* graph, Device* partition_device,
-                               NodeAndOutput tensor, const Tensor& constant);
+Status ConstantFold(const ConstantFoldingOptions& opts,
+                    FunctionLibraryRuntime* function_library, Env* env,
+                    Device* partition_device, Graph* graph, bool* was_mutated);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 3a9bdfe14419945a1ded24783015cb54d64d28b7..4a8560960ed522995f78d56f6ab092cbcb65d9a9 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/constant_folding.h"
 
+#include "tensorflow/cc/ops/array_ops_internal.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -30,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/node_builder.h"
-#include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -42,27 +43,14 @@ namespace {
 
 class ConstantFoldingTest : public ::testing::Test {
  protected:
-  ConstantFoldingTest() { Reset(); }
-  void Reset() { g_.reset(new Graph(OpRegistry::Global())); }
-
-  template <typename T>
-  Node* Constant(gtl::ArraySlice<T> values, TensorShape shape) {
-    return test::graph::Constant(g_.get(), test::AsTensor(values, shape));
-  }
-
-  template <typename T>
-  Node* Constant(T v) {
-    return test::graph::Constant(g_.get(), test::AsScalar(v));
-  }
-
   template <typename T>
   void ExpectNodeClose(const Node* n, gtl::ArraySlice<T> values,
                        TensorShape shape) {
     EXPECT_TRUE(n->IsConstant());
     const TensorProto* tensor_proto;
-    TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor_proto));
+    TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor_proto));
     DataType dtype;
-    TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype));
+    TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype));
     Tensor t(dtype);
     EXPECT_TRUE(t.FromProto(*tensor_proto));
     test::ExpectClose(t, test::AsTensor(values, shape));
@@ -73,46 +61,57 @@ class ConstantFoldingTest : public ::testing::Test {
                        TensorShape shape) {
     EXPECT_TRUE(n->IsConstant());
     const TensorProto* tensor_proto;
-    TF_EXPECT_OK(GetNodeAttr(n->def(), "value", &tensor_proto));
+    TF_EXPECT_OK(GetNodeAttr(n->attrs(), "value", &tensor_proto));
     DataType dtype;
-    TF_EXPECT_OK(GetNodeAttr(n->def(), "dtype", &dtype));
+    TF_EXPECT_OK(GetNodeAttr(n->attrs(), "dtype", &dtype));
     Tensor t(dtype);
     EXPECT_TRUE(t.FromProto(*tensor_proto));
     test::ExpectTensorEqual<T>(t, test::AsTensor(values, shape));
   }
 
-// Construct the following graph
-/*
-      s1  s2
-      |    |
-      m1   m2
-      / \ / \
-     a   b   c
-*/
-#define SIMPLE_GRAPH                                                  \
-  Reset();                                                            \
-  Graph* g = g_.get();                                                \
-  Node* a = Constant<float>({1.0, 0.0, 0.0, 1.0}, {2, 2});            \
-  Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});            \
-  Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});            \
-  g->AddControlEdge(g->source_node(), a);                             \
-  g->AddControlEdge(g->source_node(), b);                             \
-  g->AddControlEdge(g->source_node(), c);                             \
-  Node* m1 = test::graph::Matmul(g, a, b, false, false);              \
-  Node* s1 = test::graph::Send(g, m1, "m1", "sender", 0, "receiver"); \
-  Node* m2 = test::graph::Matmul(g, b, c, false, false);              \
-  Node* s2 = test::graph::Send(g, m2, "m2", "sender", 0, "receiver"); \
-  g->AddControlEdge(s1, g->sink_node());                              \
-  g->AddControlEdge(s2, g->sink_node());
-
-  std::unique_ptr<Graph> g_;
+  // Builds a map from node name to Node* for `graph`.
+  std::unordered_map<string, Node*> NodeNameIndex(const Graph& graph) {
+    std::unordered_map<string, Node*> index;
+    for (Node* node : graph.nodes()) {
+      index[node->name()] = node;
+    }
+    return index;
+  }
+
+  // Constructs the following graph.
+  /*
+        s1  s2
+        |    |
+        m1   m2
+        / \ / \
+       a   b   c
+  */
+  void BuildSimpleGraph(Scope* scope) {
+    Scope& s = *scope;
+    auto a = ops::Const<float>(s, {1.0, 0.0, 0.0, 1.0}, {2, 2});
+    auto b = ops::Const<float>(s, {1.0, 2.0, 3.0, 4.0}, {2, 2});
+    auto c = ops::Const<float>(s, {0.0, 1.0, 1.0, 0.0}, {2, 2});
+    auto m1 = ops::MatMul(s, a, b);
+    auto s1 = ops::_Send(s.WithOpName("s1"), m1, "m1", "sender", 0, "receiver");
+    auto m2 = ops::MatMul(s.WithOpName("m2"), b, c);
+    auto s2 = ops::_Send(s.WithOpName("s2"), m2, "m2", "sender", 0, "receiver");
+  }
 };
 
 TEST_F(ConstantFoldingTest, Basic) {
-  SIMPLE_GRAPH;
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr,
-                                Env::Default(), nullptr, g));
+  Scope s = Scope::NewRootScope();
+  BuildSimpleGraph(&s);
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(s.ToGraph(&g));
 
+  bool was_mutated;
+  TF_ASSERT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* s1 = index.at("s1");
+  Node* s2 = index.at("s2");
   // Nodes s1 and s2 now should now have a constant input
   EXPECT_EQ(1, s1->num_inputs());
   ExpectNodeClose<float>(*(s1->in_nodes().begin()), {1.0, 2.0, 3.0, 4.0},
@@ -123,11 +122,23 @@ TEST_F(ConstantFoldingTest, Basic) {
 }
 
 TEST_F(ConstantFoldingTest, ConsiderFunction) {
-  SIMPLE_GRAPH;
+  Scope s = Scope::NewRootScope();
+  BuildSimpleGraph(&s);
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(s.ToGraph(&g));
+
   ConstantFoldingOptions opts;
   // Do not allow constant folding of m2
-  opts.consider = [m2](const Node* n) { return m2 != n; };
-  EXPECT_TRUE(DoConstantFolding(opts, nullptr, Env::Default(), nullptr, g));
+  opts.consider = [](const Node* n) { return "m2" != n->name(); };
+  bool was_mutated;
+  TF_ASSERT_OK(
+      ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* s1 = index.at("s1");
+  Node* s2 = index.at("s2");
+  Node* m2 = index.at("m2");
 
   // Node s1 now should now have a constant input
   EXPECT_EQ(1, s1->num_inputs());
@@ -139,40 +150,52 @@ TEST_F(ConstantFoldingTest, ConsiderFunction) {
 }
 
 TEST_F(ConstantFoldingTest, TestNoReplaceAnotherConstant) {
-  SIMPLE_GRAPH;
-  Node* d = Constant<float>({1.0, 0.0, 0.0, 1.0}, {2, 2});
-  g->AddControlEdge(g->source_node(), d);
-  Node* s3 = test::graph::Send(g, d, "d", "sender", 0, "receiver");
-  g->AddControlEdge(s3, g->sink_node());
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr,
-                                Env::Default(), nullptr, g));
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    BuildSimpleGraph(&s);
+    auto d = ops::Const<float>(s.WithOpName("d"), {1.0, 0.0, 0.0, 1.0}, {2, 2});
+    auto s3 = ops::_Send(s.WithOpName("s3"), d, "d", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+
+  bool was_mutated;
+  TF_ASSERT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* d = index.at("d");
+  Node* s3 = index.at("s3");
 
   // Nodes s3 should still have d as input
   EXPECT_EQ(1, s3->num_inputs());
   EXPECT_EQ(*(s3->in_nodes().begin()), d);
 }
 
-#undef SIMPLE_GRAPH
-
 TEST_F(ConstantFoldingTest, TwoOutputs) {
-  Reset();
-  Graph* g = g_.get();
-  Node* s0 = Constant<int>({1}, {1});
-  Node* s1 = Constant<int>({2, 2}, {2});
-  g->AddControlEdge(g->source_node(), s0);
-  g->AddControlEdge(g->source_node(), s1);
-  Node* b = test::graph::BroadcastGradientArgs(g, s0, s1);
-  Node* b0 = test::graph::Send(g, test::graph::Identity(g, b, 0),
-                               strings::StrCat(b->name(), "0"), "sender", 0,
-                               "receiver");
-  Node* b1 = test::graph::Send(g, test::graph::Identity(g, b, 1),
-                               strings::StrCat(b->name(), "1"), "sender", 0,
-                               "receiver");
-  g->AddControlEdge(b0, g->sink_node());
-  g->AddControlEdge(b1, g->sink_node());
-
-  EXPECT_TRUE(DoConstantFolding(ConstantFoldingOptions{}, nullptr,
-                                Env::Default(), nullptr, g));
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto s0 = ops::Const<int>(s, {1}, {1});
+    auto s1 = ops::Const<int>(s, {2, 2}, {2});
+    auto b = ops::internal::BroadcastGradientArgs(s, s0, s1);
+    auto b0 = ops::_Send(s.WithOpName("b0"), ops::Identity(s, b.r0), "b0",
+                         "sender", 0, "receiver");
+    auto b1 = ops::_Send(s.WithOpName("b1"), ops::Identity(s, b.r1), "b1",
+                         "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+
+  bool was_mutated;
+  TF_ASSERT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* b0 = index.at("b0");
+  Node* b1 = index.at("b1");
+
   EXPECT_EQ(1, b0->num_inputs());
   ExpectNodeEqual<int>(*(b0->in_nodes().begin()), {0, 1}, {2});
   EXPECT_EQ(1, b1->num_inputs());
@@ -180,126 +203,164 @@ TEST_F(ConstantFoldingTest, TwoOutputs) {
 }
 
 TEST_F(ConstantFoldingTest, TwoOutputsFoldOneOutput) {
-  Reset();
-  Graph* g = g_.get();
-  Node* s0 = Constant<int>({1}, {1});
-  Node* s1 = Constant<int>({2, 2}, {2});
-  g->AddControlEdge(g->source_node(), s0);
-  g->AddControlEdge(g->source_node(), s1);
-  Node* b = test::graph::BroadcastGradientArgs(g, s0, s1);
-  Node* b0 = test::graph::Send(g, test::graph::Identity(g, b, 0),
-                               strings::StrCat(b->name(), "0"), "sender", 0,
-                               "receiver");
-  Node* b1_ident = test::graph::Identity(g, b, 1);
-  Node* b1 = test::graph::Send(g, b1_ident, strings::StrCat(b->name(), "1"),
-                               "sender", 0, "receiver");
-  g->AddControlEdge(b0, g->sink_node());
-  g->AddControlEdge(b1, g->sink_node());
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto s0 = ops::Const<int>(s, {1}, {1});
+    auto s1 = ops::Const<int>(s, {2, 2}, {2});
+    auto b = ops::internal::BroadcastGradientArgs(s, s0, s1);
+    auto b0 = ops::_Send(s.WithOpName("b0"), ops::Identity(s, b.r0), "b0",
+                         "sender", 0, "receiver");
+    auto b1_ident = ops::Identity(s.WithOpName("b1_ident"), b.r1);
+    auto b1 =
+        ops::_Send(s.WithOpName("b1"), b1_ident, "b1", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
 
   ConstantFoldingOptions opts;
-  opts.consider = [b1_ident](const Node* n) { return b1_ident != n; };
-  EXPECT_TRUE(DoConstantFolding(opts, nullptr, Env::Default(), nullptr, g));
+  opts.consider = [](const Node* n) { return "b1_ident" != n->name(); };
+  bool was_mutated;
+  TF_ASSERT_OK(
+      ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* b0 = index.at("b0");
+  Node* b1 = index.at("b1");
+  Node* b1_ident = index.at("b1_ident");
+
   // 0th output of b should have been folded.
-  EXPECT_EQ(1, b0->num_inputs());
+  ASSERT_EQ(1, b0->num_inputs());
   ExpectNodeEqual<int>(*(b0->in_nodes().begin()), {0, 1}, {2});
   // 1st output of b should still be b1_ident. However, b1_ident's input must
   // have been replaced with a constant.
-  EXPECT_EQ(1, b1->num_inputs());
+  ASSERT_EQ(1, b1->num_inputs());
   EXPECT_EQ(*(b1->in_nodes().begin()), b1_ident);
 
-  EXPECT_EQ(1, b1_ident->num_inputs());
+  ASSERT_EQ(1, b1_ident->num_inputs());
   ExpectNodeEqual<int>(*(b1_ident->in_nodes().begin()), {}, {0});
 }
 
 TEST_F(ConstantFoldingTest, TestNoReplaceLargeConstant) {
-  Reset();
-  Graph* g = g_.get();
-  Node* s0 =
-      Constant<int>(std::vector<int>(5 * 1024 * 256, 0), {5 * 1024 * 256});
-  Node* s1 = Constant<int>(std::vector<int>(5 * 1024 * 256 + 1, 0),
-                           {5 * 1024 * 256 + 1});
-  Node* concat_dim = Constant<int>(0);
-  g->AddControlEdge(g->source_node(), s0);
-  g->AddControlEdge(g->source_node(), s1);
-  // Concat s0 and s1. The resulting tensor would be of size 10M + 1 bytes
-  Node* concat = test::graph::Concat(g, concat_dim, {s0, s1});
-  Node* concat_send =
-      test::graph::Send(g, concat, "concat_send", "sender", 0, "receiver");
-  g->AddControlEdge(concat_send, g->sink_node());
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto s0 = ops::Const<int>(s, 0, {5 * 1024 * 256});
+    auto s1 = ops::Const<int>(s, 0, {5 * 1024 * 256 + 1});
+    auto concat_dim = ops::Const<int>(s, 0);
+    auto concat = ops::Concat(s, {s0, s1}, concat_dim);
+    auto concat_send = ops::_Send(s.WithOpName("concat_send"), concat,
+                                  "concat_send", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
 
   // The above concat should not have been constant folded.
   bool was_mutated;
-  Status status =
-      DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                  Env::Default(), nullptr, g, &was_mutated);
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
   EXPECT_FALSE(was_mutated);
-  TF_EXPECT_OK(status);
 }
 
 TEST_F(ConstantFoldingTest, TestNoReplaceFunctionCall) {
-  FunctionDefLibrary fdef_lib;
-  *fdef_lib.add_function() = test::function::XTimesTwo();
-
-  FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib);
-  g_.reset(new Graph(&flib_def));
-
-  Graph* g = g_.get();
-  Node* s =
-      Constant<int>(std::vector<int>(5 * 1024 * 256, 0), {5 * 1024 * 256});
-  g->AddControlEdge(g->source_node(), s);
-
-  NodeDef def;
-  TF_ASSERT_OK(NodeDefBuilder("times_two", "XTimesTwo", g->op_registry())
-                   .Input(s->name(), 0, DT_INT32)
-                   .Finalize(&def));
-  Status status;
-  Node* times_two = g->AddNode(def, &status);
-  TF_ASSERT_OK(status);
-
-  Node* times_two_send = test::graph::Send(g, times_two, "times_two_send",
-                                           "sender", 0, "receiver");
-  g->AddControlEdge(times_two_send, g->sink_node());
+  FunctionDefLibrary flib;
+  *flib.add_function() = test::function::XTimesTwo();
+
+  FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib);
+  Graph g(flib_def);
+  {
+    Scope s = Scope::NewRootScope();
+    auto c = ops::Const<int32>(s.WithOpName("c"), {1}, {1});
+    TF_EXPECT_OK(s.graph()->AddFunctionLibrary(flib));
+
+    // TODO(phawkins): there is no way to make a function call using the C++
+    // graph builder API.
+    NodeDef def;
+    TF_ASSERT_OK(
+        NodeDefBuilder("times_two", "XTimesTwo", s.graph()->op_registry())
+            .Input(c.name(), 0, DT_INT32)
+            .Finalize(&def));
+    Status status;
+    Node* times_two = s.graph()->AddNode(def, &status);
+    TF_ASSERT_OK(status);
+    s.graph()->AddEdge(c.node(), 0, times_two, 0);
+
+    auto times_two_send =
+        ops::_Send(s.WithOpName("times_two_send"), Output(times_two),
+                   "times_two_send", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
 
   // The above function call should not have been constant folded.
   bool was_mutated;
-  status =
-      DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                  Env::Default(), nullptr, g, &was_mutated);
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
   EXPECT_FALSE(was_mutated);
-  EXPECT_TRUE(status.ok());
-
-  g_ = nullptr;
 }
 
 REGISTER_OP("ConstantFoldingTestOp").Input("a: int64").Output("b: int64");
 
 TEST_F(ConstantFoldingTest, TestNoReplaceNonCPUOp) {
-  Graph* g = g_.get();
-
-  Node* aconst = Constant<int64>(std::vector<int64>(5, 0), {5});
-  g->AddControlEdge(g->source_node(), aconst);
-
-  NodeDef def;
-  TF_ASSERT_OK(
-      NodeDefBuilder("testop", "ConstantFoldingTestOp", g->op_registry())
-          .Input(aconst->name(), 0, DT_INT64)
-          .Finalize(&def));
-  Status status;
-  Node* non_cpu = g->AddNode(def, &status);
-  TF_ASSERT_OK(status);
-  g->AddEdge(aconst, 0, non_cpu, 0);
-
-  Node* non_cpu_send =
-      test::graph::Send(g, non_cpu, "non_cpu_send", "sender", 0, "receiver");
-  g->AddControlEdge(non_cpu_send, g->sink_node());
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto aconst = ops::Const<int64>(s, 0, {5});
+
+    NodeDef def;
+    TF_ASSERT_OK(NodeDefBuilder("testop", "ConstantFoldingTestOp")
+                     .Input(aconst.name(), 0, DT_INT64)
+                     .Finalize(&def));
+    Status status;
+    Node* non_cpu = s.graph()->AddNode(def, &status);
+    TF_ASSERT_OK(status);
+
+    auto non_cpu_send =
+        ops::_Send(s.WithOpName("non_cpu_send"), Output(non_cpu),
+                   "non_cpu_send", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
 
   // The non-CPU op should not have been constant folded.
   bool was_mutated;
-  status =
-      DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                  Env::Default(), nullptr, g, &was_mutated);
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
   EXPECT_FALSE(was_mutated);
-  EXPECT_TRUE(status.ok());
+}
+
+TEST_F(ConstantFoldingTest, ControlDependencies) {
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope();
+    auto c0 = ops::Const<int>(s, 1);
+    auto recv1 = ops::_Recv(s.WithOpName("recv1"), DT_FLOAT, "recv1", "sender",
+                            0, "receiver");
+    auto c1 = ops::Const<int>(s.WithControlDependencies(recv1), 2);
+    auto recv2 = ops::_Recv(s.WithOpName("recv2"), DT_FLOAT, "recv2", "sender",
+                            0, "receiver");
+    auto c2 = ops::Const<int>(s.WithControlDependencies(recv2), 3);
+    auto add = ops::Add(s.WithControlDependencies(c2), c0, c1);
+    auto send =
+        ops::_Send(s.WithOpName("send"), add, "send", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+  bool was_mutated;
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, Env::Default(),
+                            nullptr, &g, &was_mutated));
+  EXPECT_TRUE(was_mutated);
+
+  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  Node* recv1 = index.at("recv1");
+  Node* recv2 = index.at("recv2");
+  Node* send = index.at("send");
+
+  ASSERT_EQ(1, send->num_inputs());
+  Node* p = *(send->in_nodes().begin());
+  ExpectNodeEqual<int>(p, {3}, {});
+
+  ASSERT_EQ(2, p->in_edges().size());
+  for (const Edge* e : p->in_edges()) {
+    EXPECT_TRUE(e->IsControlEdge());
+    EXPECT_TRUE(e->src() == recv1 || e->src() == recv2) << e->src()->name();
+  }
 }
 
 namespace {
@@ -365,8 +426,7 @@ class TestTFEnvironment : public ::tensorflow::EnvWrapper {
 }  // namespace
 
 TEST_F(ConstantFoldingTest, TestImmutableConst) {
-  Reset();
-  Graph* g = g_.get();
+  Graph g(OpRegistry::Global());
   Scope root = Scope::NewRootScope();
 
   auto a = ops::ImmutableConst(root, DT_DOUBLE, {2, 2}, kTestMemRegionName);
@@ -374,18 +434,16 @@ TEST_F(ConstantFoldingTest, TestImmutableConst) {
   auto c = ops::RandomGamma(root, {2, 2}, 2.0);
   auto result1 = ops::MatMul(root, a, b);
   auto result2 = ops::MatMul(root, result1, c);
-  TF_ASSERT_OK(root.ToGraph(g));
+  TF_ASSERT_OK(root.ToGraph(&g));
   TestTFEnvironment test_env;
   bool was_mutated;
-  Status status =
-      DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                  Env::Default(), nullptr, g, &was_mutated);
+  Status status = ConstantFold(ConstantFoldingOptions{}, nullptr,
+                               Env::Default(), nullptr, &g, &was_mutated);
   EXPECT_FALSE(was_mutated);
   EXPECT_FALSE(status.ok());
-  status = DoConstantFoldingWithStatus(ConstantFoldingOptions{}, nullptr,
-                                       &test_env, nullptr, g, &was_mutated);
+  TF_EXPECT_OK(ConstantFold(ConstantFoldingOptions{}, nullptr, &test_env,
+                            nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
-  TF_EXPECT_OK(status);
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc
index b25131b07b5276e6f718458d1c287100f878c768..ffd37faca42d6ab2e43cf4eec041ac46f8c45af5 100644
--- a/tensorflow/core/common_runtime/copy_tensor.cc
+++ b/tensorflow/core/common_runtime/copy_tensor.cc
@@ -71,7 +71,8 @@ void CopyTensor::ViaDMA(StringPiece edge_name, DeviceContext* send_dev_context,
       if (ri.sender_device_type == src_device_type &&
           ri.receiver_device_type == dst_device_type) {
         ri.copy_function(send_dev_context, recv_dev_context, src, dst,
-                         src_alloc_attr, dst_alloc_attr, input, output, done);
+                         src_alloc_attr, dst_alloc_attr, input, output,
+                         std::move(done));
         return;
       }
     }
diff --git a/tensorflow/core/common_runtime/debugger_state_interface.cc b/tensorflow/core/common_runtime/debugger_state_interface.cc
index 2e2fbcd7f402a001c25d0453e2d57ba0478e2c09..c1a92f9a2214131565a5a0a930781702147658bf 100644
--- a/tensorflow/core/common_runtime/debugger_state_interface.cc
+++ b/tensorflow/core/common_runtime/debugger_state_interface.cc
@@ -15,10 +15,43 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
 
+#include "tensorflow/core/lib/core/errors.h"
+
 namespace tensorflow {
 
+// static
 DebuggerStateFactory* DebuggerStateRegistry::factory_ = nullptr;
 
+// static
+DebugGraphDecoratorFactory* DebugGraphDecoratorRegistry::factory_ = nullptr;
+
+const string SummarizeDebugTensorWatches(
+    const protobuf::RepeatedPtrField<DebugTensorWatch>& watches) {
+  std::ostringstream oss;
+
+  for (const DebugTensorWatch& watch : watches) {
+    string tensor_name =
+        strings::StrCat(watch.node_name(), ":", watch.output_slot());
+    if (watch.tolerate_debug_op_creation_failures()) {
+      oss << "(TOL)";  // Shorthand for "tolerate".
+    }
+    oss << tensor_name << "|";
+
+    for (const string& debug_op : watch.debug_ops()) {
+      oss << debug_op << ",";
+    }
+
+    oss << "@";
+    for (const string& debug_url : watch.debug_urls()) {
+      oss << debug_url << ",";
+    }
+
+    oss << ";";
+  }
+
+  return oss.str();
+}
+
 // static
 void DebuggerStateRegistry::RegisterFactory(
     const DebuggerStateFactory& factory) {
@@ -27,11 +60,38 @@ void DebuggerStateRegistry::RegisterFactory(
 }
 
 // static
-std::unique_ptr<DebuggerStateInterface> DebuggerStateRegistry::CreateState(
-    const DebugOptions& debug_options) {
-  return (factory_ == nullptr || *factory_ == nullptr)
-             ? nullptr
-             : (*factory_)(debug_options);
+Status DebuggerStateRegistry::CreateState(
+    const DebugOptions& debug_options,
+    std::unique_ptr<DebuggerStateInterface>* state) {
+  if (factory_ == nullptr || *factory_ == nullptr) {
+    return errors::Internal(
+        "Creation of debugger state failed. "
+        "It appears that TFDBG is not linked in this TensorFlow build.");
+  } else {
+    *state = (*factory_)(debug_options);
+    return Status::OK();
+  }
+}
+
+// static
+void DebugGraphDecoratorRegistry::RegisterFactory(
+    const DebugGraphDecoratorFactory& factory) {
+  delete factory_;
+  factory_ = new DebugGraphDecoratorFactory(factory);
+}
+
+// static
+Status DebugGraphDecoratorRegistry::CreateDecorator(
+    const DebugOptions& options,
+    std::unique_ptr<DebugGraphDecoratorInterface>* decorator) {
+  if (factory_ == nullptr || *factory_ == nullptr) {
+    return errors::Internal(
+        "Creation of graph decorator failed. "
+        "It appears that TFDBG is not linked in this TensorFlow build.");
+  } else {
+    *decorator = (*factory_)(options);
+    return Status::OK();
+  }
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/debugger_state_interface.h b/tensorflow/core/common_runtime/debugger_state_interface.h
index fb72f9fa3ea0bfc86500691f345c1f3343c8b884..5841107be0b4c59cb89a21568dd172e72e39e0cc 100644
--- a/tensorflow/core/common_runtime/debugger_state_interface.h
+++ b/tensorflow/core/common_runtime/debugger_state_interface.h
@@ -18,28 +18,24 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/protobuf/debug.pb.h"
 
 namespace tensorflow {
 
-class DebugOptions;  // Defined in core/protobuf/debug.h.
-class Device;
-class Graph;
+// Returns a summary string for the list of debug tensor watches.
+const string SummarizeDebugTensorWatches(
+    const protobuf::RepeatedPtrField<DebugTensorWatch>& watches);
 
 // An abstract interface for storing and retrieving debugging information.
 class DebuggerStateInterface {
  public:
   virtual ~DebuggerStateInterface() {}
 
-  // Returns a summary string for RepeatedPtrFields of DebugTensorWatches.
-  virtual const string SummarizeDebugTensorWatches() = 0;
-
-  // Insert special-purpose debug nodes to graph and dump the graph for
-  // record. See the documentation of DebugNodeInserter::InsertNodes() for
-  // details.
-  virtual Status DecorateGraphForDebug(Graph* graph, Device* device) = 0;
-
   // Publish metadata about the debugged Session::Run() call.
   //
   // Args:
@@ -59,6 +55,20 @@ class DebuggerStateInterface {
       const std::vector<string>& target_nodes) = 0;
 };
 
+class DebugGraphDecoratorInterface {
+ public:
+  virtual ~DebugGraphDecoratorInterface() {}
+
+  // Insert special-purpose debug nodes to graph and dump the graph for
+  // record. See the documentation of DebugNodeInserter::InsertNodes() for
+  // details.
+  virtual Status DecorateGraph(Graph* graph, Device* device) = 0;
+
+  // Publish Graph to debug URLs.
+  virtual Status PublishGraph(const Graph& graph,
+                              const string& device_name) = 0;
+};
+
 typedef std::function<std::unique_ptr<DebuggerStateInterface>(
     const DebugOptions& options)>
     DebuggerStateFactory;
@@ -74,11 +84,12 @@ class DebuggerStateRegistry {
   // implementation based on DebugOptions.
   static void RegisterFactory(const DebuggerStateFactory& factory);
 
-  // If RegisterFactory() has been called, creates and returns a concrete
+  // If RegisterFactory() has been called, creates and supplies a concrete
   // DebuggerStateInterface implementation using the registered factory,
-  // owned by the caller.  Otherwise returns nullptr.
-  static std::unique_ptr<DebuggerStateInterface> CreateState(
-      const DebugOptions& debug_options);
+  // owned by the caller and return an OK Status. Otherwise returns an error
+  // Status.
+  static Status CreateState(const DebugOptions& debug_options,
+                            std::unique_ptr<DebuggerStateInterface>* state);
 
  private:
   static DebuggerStateFactory* factory_;
@@ -86,6 +97,24 @@ class DebuggerStateRegistry {
   TF_DISALLOW_COPY_AND_ASSIGN(DebuggerStateRegistry);
 };
 
+typedef std::function<std::unique_ptr<DebugGraphDecoratorInterface>(
+    const DebugOptions& options)>
+    DebugGraphDecoratorFactory;
+
+class DebugGraphDecoratorRegistry {
+ public:
+  static void RegisterFactory(const DebugGraphDecoratorFactory& factory);
+
+  static Status CreateDecorator(
+      const DebugOptions& options,
+      std::unique_ptr<DebugGraphDecoratorInterface>* decorator);
+
+ private:
+  static DebugGraphDecoratorFactory* factory_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(DebugGraphDecoratorRegistry);
+};
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_COMMON_RUNTIME_DEBUGGER_STATE_INTERFACE_H_
diff --git a/tensorflow/core/common_runtime/device.cc b/tensorflow/core/common_runtime/device.cc
index 78649afeb93aa7cb2231c35b9e69651b08ac6fb2..aa8a2d989bf5479254bb4b6fc5bdfb32e17c7325 100644
--- a/tensorflow/core/common_runtime/device.cc
+++ b/tensorflow/core/common_runtime/device.cc
@@ -23,8 +23,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-Device::Device(Env* env, const DeviceAttributes& device_attributes,
-               Allocator* device_allocator)
+Device::Device(Env* env, const DeviceAttributes& device_attributes)
     : DeviceBase(env), device_attributes_(device_attributes) {
   CHECK(DeviceNameUtils::ParseFullName(name(), &parsed_name_))
       << "Invalid device name: " << name();
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 07c6bdd6831923c02176206120df276fc180c985..c0e58f143e350ea9300c38b00adee9d423bdd64f 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -53,8 +53,7 @@ namespace tensorflow {
 
 class Device : public DeviceBase {
  public:
-  Device(Env* env, const DeviceAttributes& device_attributes,
-         Allocator* device_allocator);
+  Device(Env* env, const DeviceAttributes& device_attributes);
   ~Device() override;
 
   // Full name of this device (see top comment).
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index 820c4370e219542a719f8e3825a4e43d810f4826..31f12d4833793ef80646bd8936b50d4f6e812af1 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -29,10 +29,18 @@ DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
   for (Device* d : devices) {
     devices_.push_back(d);
 
-    // Register under both the full name and the local name.
+    // Register under the (1) full name, (2) canonical name, and (3) local name.
     string full_name = d->name();
     device_map_[CopyToBackingStore(full_name)] = d;
 
+    DeviceNameUtils::ParsedName parsed_name = d->parsed_name();
+    if (parsed_name.has_job && parsed_name.has_replica &&
+        parsed_name.has_task && parsed_name.has_type && parsed_name.has_id) {
+      string canonical_name = DeviceNameUtils::FullName(
+          parsed_name.job, parsed_name.replica, parsed_name.task,
+          parsed_name.type, parsed_name.id);
+      device_map_[CopyToBackingStore(canonical_name)] = d;
+    }
     string lname = DeviceNameUtils::LocalName(d->name());
     device_map_[CopyToBackingStore(lname)] = d;
     device_type_counts_[d->device_type()]++;
@@ -40,11 +48,12 @@ DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
 }
 
 DeviceMgr::~DeviceMgr() {
-  for (auto p : devices_) delete p;
+  // TODO(b/37437134): Remove destructor after converting to std::unique_ptr.
+  for (Device* p : devices_) delete p;
 }
 
 StringPiece DeviceMgr::CopyToBackingStore(StringPiece s) {
-  int n = s.size();
+  size_t n = s.size();
   char* space = name_backing_store_.Alloc(n);
   memcpy(space, s.data(), n);
   return StringPiece(space, n);
@@ -85,6 +94,12 @@ Status DeviceMgr::LookupDevice(StringPiece name, Device** device) const {
   Status s;
   auto iter = device_map_.find(name);
   if (iter == device_map_.end()) {
+    std::vector<StringPiece> device_names;
+    for (auto&& itr : device_map_) {
+      device_names.push_back(itr.first);
+    }
+    LOG(WARNING) << "Unknown device: " << name
+                 << " all devices: " << str_util::Join(device_names, ", ");
     return errors::InvalidArgument(name, " unknown device.");
   }
   *device = iter->second;
diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h
index bb1ed726408b5d778517d8b76c224d2a070c69a3..d16681ac59d3bc34a54f63b8b55f372c661591b4 100644
--- a/tensorflow/core/common_runtime/device_mgr.h
+++ b/tensorflow/core/common_runtime/device_mgr.h
@@ -36,6 +36,7 @@ class DeviceMgr {
  public:
   // Takes ownership of each device in 'devices'.
   // TODO(zhifengc): Other initialization information.
+  // TODO(b/37437134): Use std::unique_ptr's to track ownership.
   explicit DeviceMgr(const std::vector<Device*>& devices);
   ~DeviceMgr();
 
@@ -61,6 +62,7 @@ class DeviceMgr {
   int NumDeviceType(const string& type) const;
 
  private:
+  // TODO(b/37437134): Use std::unique_ptr's to track ownership.
   typedef gtl::InlinedVector<Device*, 8> DeviceVec;
   DeviceVec devices_;
 
diff --git a/tensorflow/core/common_runtime/device_set.h b/tensorflow/core/common_runtime/device_set.h
index b0540dfa95b3e3d34c5eef770236bdde695a7cd5..4cd56e583c09f70cd375e775eb2db9071871311f 100644
--- a/tensorflow/core/common_runtime/device_set.h
+++ b/tensorflow/core/common_runtime/device_set.h
@@ -39,7 +39,10 @@ class DeviceSet {
 
   // Set the device designated as the "client".  This device
   // must also be registered via AddDevice().
-  void set_client_device(Device* device) { client_device_ = device; }
+  void set_client_device(Device* device) {
+    DCHECK(client_device_ == nullptr);
+    client_device_ = device;
+  }
 
   // Returns a pointer to the device designated as the "client".
   Device* client_device() const { return client_device_; }
diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index ff20ee94a7de317bbc04470de3d2f2adbc8747ac..0507076c8c3734083ac0ef7ffea0edebf180ad1a 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -27,8 +27,7 @@ namespace {
 static Device* Dev(const char* type, const char* name) {
   class FakeDevice : public Device {
    public:
-    explicit FakeDevice(const DeviceAttributes& attr)
-        : Device(nullptr, attr, nullptr) {}
+    explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
     Status Sync() override { return Status::OK(); }
     Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; }
   };
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index c4b2b6c12a53c1ee10ba1f483e09b9ea1643b5a4..6a948b4ca76b5b175aed177b4a5bed4e4eb118ad 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -57,6 +57,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/env_var.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_tracer.h"
@@ -242,6 +243,13 @@ DirectSession::DirectSession(const SessionOptions& options,
     thread_pools_.push_back(GlobalThreadPool(options));
     owns_thread_pools_ = false;
   }
+  // The default value of sync_on_finish will be flipped soon and this
+  // environment variable will be removed as well.
+  Status status =
+      ReadBoolFromEnvVar("TF_SYNC_ON_FINISH", true, &sync_on_finish_);
+  if (!status.ok()) {
+    LOG(ERROR) << status.error_message();
+  }
   // NOTE(mrry): We do not need to use a unique string for the session
   // handle, because DirectSession owns its devices. This may change
   // in future versions.
@@ -353,7 +361,6 @@ Status DirectSession::ExtendLocked(const GraphDef& graph) {
   return Status::OK();
 }
 
-// TODO(yuanbyu): Simplify by treating Run() as "PRunSetup(); PRun()".
 Status DirectSession::Run(const NamedTensorList& inputs,
                           const std::vector<string>& output_names,
                           const std::vector<string>& target_nodes,
@@ -363,6 +370,31 @@ Status DirectSession::Run(const NamedTensorList& inputs,
              &run_metadata);
 }
 
+Status DirectSession::CreateDebuggerState(
+    const DebugOptions& debug_options, int64 session_run_count,
+    int64 executor_step_count, const std::vector<string>& input_names,
+    const std::vector<string>& output_names,
+    const std::vector<string>& target_names,
+    std::unique_ptr<DebuggerStateInterface>* debugger_state) {
+  TF_RETURN_IF_ERROR(
+      DebuggerStateRegistry::CreateState(debug_options, debugger_state));
+  TF_RETURN_IF_ERROR(debugger_state->get()->PublishDebugMetadata(
+      debug_options.global_step(), session_run_count, executor_step_count,
+      input_names, output_names, target_names));
+  return Status::OK();
+}
+
+Status DirectSession::DecorateAndPublishGraphForDebug(
+    const DebugOptions& debug_options, Graph* graph, Device* device) {
+  std::unique_ptr<DebugGraphDecoratorInterface> decorator;
+  TF_RETURN_IF_ERROR(
+      DebugGraphDecoratorRegistry::CreateDecorator(debug_options, &decorator));
+
+  TF_RETURN_IF_ERROR(decorator->DecorateGraph(graph, device));
+  TF_RETURN_IF_ERROR(decorator->PublishGraph(*graph, device->name()));
+  return Status::OK();
+}
+
 Status DirectSession::Run(const RunOptions& run_options,
                           const NamedTensorList& inputs,
                           const std::vector<string>& output_names,
@@ -395,39 +427,54 @@ Status DirectSession::Run(const RunOptions& run_options,
 
   // Check if we already have an executor for these arguments.
   ExecutorsAndKeys* executors_and_keys;
-  RunStateArgs run_state_args;
+  RunStateArgs run_state_args(run_options.debug_options());
 
   Executor::Args args;
   args.step_id = step_id_counter_.fetch_add(1);
 
-  // EXPERIMENTAL: Options that allow the client to insert nodes into partition
-  // graphs for debugging.
-  if (!run_options.debug_options().debug_tensor_watch_opts().empty()) {
-    run_state_args.debugger_state =
-        DebuggerStateRegistry::CreateState(run_options.debug_options());
-  }
-
   TF_RETURN_IF_ERROR(
       GetOrCreateExecutors(pool, input_tensor_names, output_names, target_nodes,
                            &executors_and_keys, &run_state_args));
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
-  if (run_state_args.debugger_state) {
-    TF_RETURN_IF_ERROR(run_state_args.debugger_state->PublishDebugMetadata(
-        run_options.debug_options().global_step(), args.step_id,
-        executor_step_count, input_tensor_names, output_names, target_nodes));
+  std::unique_ptr<DebuggerStateInterface> debugger_state;
+  if (!run_options.debug_options().debug_tensor_watch_opts().empty()) {
+    TF_RETURN_IF_ERROR(CreateDebuggerState(
+        run_options.debug_options(), args.step_id, executor_step_count,
+        input_tensor_names, output_names, target_nodes, &debugger_state));
+  }
+
+  // Configure a call frame for the step, which we use to feed and
+  // fetch values to and from the executors.
+  FunctionCallFrame call_frame(executors_and_keys->input_types,
+                               executors_and_keys->output_types);
+  gtl::InlinedVector<Tensor, 4> feed_args(inputs.size());
+  for (const auto& it : inputs) {
+    if (it.second.dtype() == DT_RESOURCE) {
+      Tensor tensor_from_handle;
+      TF_RETURN_IF_ERROR(
+          ResourceHandleToInputTensor(it.second, &tensor_from_handle));
+      feed_args[executors_and_keys->input_name_to_index[it.first]] =
+          tensor_from_handle;
+    } else {
+      feed_args[executors_and_keys->input_name_to_index[it.first]] = it.second;
+    }
+  }
+  Status s = call_frame.SetArgs(feed_args);
+  if (errors::IsInternal(s)) {
+    return errors::InvalidArgument(s.error_message());
+  } else if (!s.ok()) {
+    return s;
   }
 
   // Create a run state and start execution.
   RunState run_state(args.step_id, &devices_);
   run_state.rendez = new IntraProcessRendezvous(device_mgr_.get());
   CancellationManager step_cancellation_manager;
-
-  // Send inputs.
-  TF_RETURN_IF_ERROR(SendInputs(inputs, executors_and_keys, run_state.rendez));
+  args.call_frame = &call_frame;
 
   // Start parallel Executors.
-  const int num_executors = executors_and_keys->items.size();
+  const size_t num_executors = executors_and_keys->items.size();
   ExecutorBarrier* barrier = new ExecutorBarrier(
       num_executors, run_state.rendez, [&run_state](const Status& ret) {
         {
@@ -448,7 +495,7 @@ Status DirectSession::Run(const RunOptions& run_options,
   if (LogMemory::IsEnabled()) {
     LogMemory::RecordStep(args.step_id, run_state_args.handle);
   }
-  args.sync_on_finish = true;
+  args.sync_on_finish = sync_on_finish_;
 
   const bool do_trace = (run_options.trace_level() > RunOptions::NO_TRACE);
 
@@ -458,7 +505,7 @@ Status DirectSession::Run(const RunOptions& run_options,
         options_.config.graph_options().build_cost_model();
     const int64 build_cost_model_after =
         options_.config.graph_options().build_cost_model_after();
-    int measure_step_count = executor_step_count - build_cost_model_after;
+    int64 measure_step_count = executor_step_count - build_cost_model_after;
     if (measure_step_count >= 0) {
       update_cost_model =
           ((measure_step_count + 1) % build_cost_model_every == 0);
@@ -527,8 +574,22 @@ Status DirectSession::Run(const RunOptions& run_options,
   }
 
   // Receive outputs.
-  TF_RETURN_IF_ERROR(
-      RecvOutputs(output_names, executors_and_keys, &run_state, outputs));
+  if (outputs) {
+    std::vector<Tensor> sorted_outputs;
+    Status s = call_frame.ConsumeRetvals(&sorted_outputs);
+    if (errors::IsInternal(s)) {
+      return errors::InvalidArgument(s.error_message());
+    } else if (!s.ok()) {
+      return s;
+    }
+    outputs->clear();
+    outputs->reserve(sorted_outputs.size());
+    for (const string& output_name : output_names) {
+      outputs->emplace_back(
+          std::move(sorted_outputs[executors_and_keys
+                                       ->output_name_to_index[output_name]]));
+    }
+  }
 
   // Save the output tensors of this run we choose to keep.
   TF_RETURN_IF_ERROR(
@@ -587,7 +648,9 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
 
   // Check if we already have an executor for these arguments.
   ExecutorsAndKeys* executors_and_keys;
-  RunStateArgs run_state_args;
+  // TODO(cais): TFDBG support for partial runs.
+  DebugOptions debug_options;
+  RunStateArgs run_state_args(debug_options);
   run_state_args.is_partial_run = true;
   TF_RETURN_IF_ERROR(GetOrCreateExecutors(pool, input_names, output_names,
                                           target_nodes, &executors_and_keys,
@@ -611,7 +674,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
   }
 
   // Start parallel Executors.
-  const int num_executors = executors_and_keys->items.size();
+  const size_t num_executors = executors_and_keys->items.size();
   ExecutorBarrier* barrier = new ExecutorBarrier(
       num_executors, run_state->rendez, [run_state](const Status& ret) {
         if (!ret.ok()) {
@@ -632,7 +695,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
   if (LogMemory::IsEnabled()) {
     LogMemory::RecordStep(args.step_id, run_state_args.handle);
   }
-  args.sync_on_finish = true;
+  args.sync_on_finish = sync_on_finish_;
 
   if (options_.config.graph_options().build_cost_model()) {
     run_state->collector.reset(new StepStatsCollector(nullptr));
@@ -676,16 +739,23 @@ Status DirectSession::PRun(const string& handle, const NamedTensorList& inputs,
     for (const auto& input : inputs) {
       auto it = run_state->pending_inputs.find(input.first);
       if (it == run_state->pending_inputs.end()) {
+        return errors::InvalidArgument(
+            "The feed ", input.first,
+            " was not specified in partial_run_setup.");
+      } else if (it->second) {
         return errors::InvalidArgument("The feed ", input.first,
-                                       " had already been fed.");
+                                       " has already been fed.");
       }
     }
     // Check that this is a new set of fetches that are still pending.
     for (const auto& output : output_names) {
       auto it = run_state->pending_outputs.find(output);
       if (it == run_state->pending_outputs.end()) {
+        return errors::InvalidArgument(
+            "The fetch ", output, " was not specified in partial_run_setup.");
+      } else if (it->second) {
         return errors::InvalidArgument("The fetch ", output,
-                                       " had already been fetched.");
+                                       " has already been fetched.");
       }
     }
   }
@@ -696,11 +766,11 @@ Status DirectSession::PRun(const string& handle, const NamedTensorList& inputs,
       CheckFetch(inputs, output_names, executors_and_keys, run_state));
 
   // Send inputs.
-  Status s = SendInputs(inputs, executors_and_keys, run_state->rendez);
+  Status s = SendPRunInputs(inputs, executors_and_keys, run_state->rendez);
 
   // Receive outputs.
   if (s.ok()) {
-    s = RecvOutputs(output_names, executors_and_keys, run_state, outputs);
+    s = RecvPRunOutputs(output_names, executors_and_keys, run_state, outputs);
   }
 
   // Save the output tensors of this run we choose to keep.
@@ -720,14 +790,15 @@ Status DirectSession::PRun(const string& handle, const NamedTensorList& inputs,
                        << run_state->status;
         }
       }
-      for (const auto& it : inputs) {
-        run_state->pending_inputs.erase(it.first);
+      for (const auto& input : inputs) {
+        auto it = run_state->pending_inputs.find(input.first);
+        it->second = true;
       }
       for (const auto& name : output_names) {
-        run_state->pending_outputs.erase(name);
+        auto it = run_state->pending_outputs.find(name);
+        it->second = true;
       }
-      done = (run_state->pending_inputs.size() == 0 &&
-              run_state->pending_outputs.size() == 0);
+      done = run_state->PendingDone();
     }
     if (done) {
       WaitForNotification(run_state, cancellation_manager_,
@@ -749,7 +820,8 @@ Status DirectSession::ResourceHandleToInputTensor(const Tensor& resource_tensor,
 
   ResourceHandle resource_handle = resource_tensor.scalar<ResourceHandle>()();
 
-  if (resource_handle.hash_code() == MakeTypeIndex<Tensor>().hash_code()) {
+  if (resource_handle.container() ==
+      SessionState::kTensorHandleResourceTypeName) {
     return session_state_.GetTensor(resource_handle.name(), retrieved_tensor);
   } else {
     return errors::InvalidArgument(strings::StrCat(
@@ -759,16 +831,17 @@ Status DirectSession::ResourceHandleToInputTensor(const Tensor& resource_tensor,
   }
 }
 
-Status DirectSession::SendInputs(const NamedTensorList& inputs,
-                                 const ExecutorsAndKeys* executors_and_keys,
-                                 IntraProcessRendezvous* rendez) {
+Status DirectSession::SendPRunInputs(const NamedTensorList& inputs,
+                                     const ExecutorsAndKeys* executors_and_keys,
+                                     IntraProcessRendezvous* rendez) {
   Status s;
   Rendezvous::ParsedKey parsed;
   // Insert the input tensors into the local rendezvous by their
   // rendezvous key.
   for (const auto& input : inputs) {
-    auto it = executors_and_keys->input_keys.find(input.first);
-    if (it == executors_and_keys->input_keys.end()) {
+    auto it =
+        executors_and_keys->input_name_to_rendezvous_key.find(input.first);
+    if (it == executors_and_keys->input_name_to_rendezvous_key.end()) {
       return errors::Internal("'", input.first, "' is not a pre-defined feed.");
     }
     const string& input_key = it->second;
@@ -797,10 +870,10 @@ Status DirectSession::SendInputs(const NamedTensorList& inputs,
   return Status::OK();
 }
 
-Status DirectSession::RecvOutputs(const std::vector<string>& output_names,
-                                  const ExecutorsAndKeys* executors_and_keys,
-                                  RunState* run_state,
-                                  std::vector<Tensor>* outputs) {
+Status DirectSession::RecvPRunOutputs(
+    const std::vector<string>& output_names,
+    const ExecutorsAndKeys* executors_and_keys, RunState* run_state,
+    std::vector<Tensor>* outputs) {
   Status s;
   if (!output_names.empty()) {
     outputs->resize(output_names.size());
@@ -811,8 +884,9 @@ Status DirectSession::RecvOutputs(const std::vector<string>& output_names,
   for (size_t output_offset = 0; output_offset < output_names.size();
        ++output_offset) {
     const string& output_name = output_names[output_offset];
-    auto it = executors_and_keys->output_keys.find(output_name);
-    if (it == executors_and_keys->output_keys.end()) {
+    auto it =
+        executors_and_keys->output_name_to_rendezvous_key.find(output_name);
+    if (it == executors_and_keys->output_name_to_rendezvous_key.end()) {
       return errors::Internal("'", output_name,
                               "' is not a pre-defined fetch.");
     }
@@ -853,11 +927,13 @@ Status DirectSession::CheckFetch(const NamedTensorList& feeds,
   std::unordered_set<TensorId, TensorId::Hasher> pending_feeds;
   {
     mutex_lock l(executor_lock_);
-    for (const string& feed : run_state->pending_inputs) {
-      TensorId id(ParseTensorName(feed));
+    for (const auto& input : run_state->pending_inputs) {
+      // Skip if the feed has already been fed.
+      if (input.second) continue;
+      TensorId id(ParseTensorName(input.first));
       auto it = name_to_node->find(id.first);
       if (it == name_to_node->end()) {
-        return errors::NotFound("Feed ", feed, ": not found");
+        return errors::NotFound("Feed ", input.first, ": not found");
       }
       pending_feeds.insert(id);
     }
@@ -905,14 +981,15 @@ Status DirectSession::GetOrCreateExecutors(
     thread::ThreadPool* pool, gtl::ArraySlice<string> inputs,
     gtl::ArraySlice<string> outputs, gtl::ArraySlice<string> target_nodes,
     ExecutorsAndKeys** executors_and_keys, RunStateArgs* run_state_args) {
-  string debug_tensor_watches_summary;
   int64 handle_name_counter_value = -1;
   if (LogMemory::IsEnabled() || run_state_args->is_partial_run) {
     handle_name_counter_value = handle_name_counter_.fetch_add(1);
   }
-  if (run_state_args->debugger_state) {
-    debug_tensor_watches_summary =
-        run_state_args->debugger_state->SummarizeDebugTensorWatches();
+
+  string debug_tensor_watches_summary;
+  if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) {
+    debug_tensor_watches_summary = SummarizeDebugTensorWatches(
+        run_state_args->debug_options.debug_tensor_watch_opts());
   }
 
   // Fast lookup path, no sorting.
@@ -976,14 +1053,19 @@ Status DirectSession::GetOrCreateExecutors(
   options.feed_endpoints = inputs_sorted;
   options.fetch_endpoints = outputs_sorted;
   options.target_nodes = tn_sorted;
+  options.use_function_convention = !run_state_args->is_partial_run;
+  if (!run_state_args->debug_options.debug_tensor_watch_opts().empty()) {
+    options.debug_options = run_state_args->debug_options;
+  }
 
   std::shared_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
 
   // The executor_lock_ is intentionally released while executor is
   // being created.
   std::unordered_map<string, std::unique_ptr<Graph>> graphs;
-  TF_RETURN_IF_ERROR(
-      CreateGraphs(options, &graphs, &ek->flib_def, run_state_args));
+  TF_RETURN_IF_ERROR(CreateGraphs(options, &graphs, &ek->flib_def,
+                                  run_state_args, &ek->input_types,
+                                  &ek->output_types));
 
   if (run_state_args->is_partial_run) {
     ek->graph = std::move(run_state_args->graph);
@@ -1050,10 +1132,10 @@ Status DirectSession::GetOrCreateExecutors(
 
     optimizer.Optimize(lib, options_.env, device, &iter->second);
 
-    // EXPERIMENTAL: tfdbg inserts debug nodes (i.e., probes) to the graph
-    if (run_state_args->debugger_state) {
-      TF_RETURN_IF_ERROR(run_state_args->debugger_state->DecorateGraphForDebug(
-          partition_graph.get(), params.device));
+    // EXPERIMENTAL: tfdbg inserts debug nodes in the graph.
+    if (!options.debug_options.debug_tensor_watch_opts().empty()) {
+      TF_RETURN_IF_ERROR(DecorateAndPublishGraphForDebug(
+          options.debug_options, partition_graph.get(), params.device));
     }
 
     TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device->device_type()),
@@ -1068,17 +1150,37 @@ Status DirectSession::GetOrCreateExecutors(
     item->executor.reset(executor);
   }
 
-  // Compute the rendezvous keys to avoid recomputing them every time.
-  //
-  // We always use the first device as the device name portion of the
-  // key, even if we're feeding another graph.
-  for (const string& input : inputs) {
-    ek->input_keys[input] = GetRendezvousKey(
-        input, device_set_.client_device()->attributes(), FrameAndIter(0, 0));
-  }
-  for (const string& output : outputs) {
-    ek->output_keys[output] = GetRendezvousKey(
-        output, device_set_.client_device()->attributes(), FrameAndIter(0, 0));
+  // Cache the mapping from input/output names to graph elements to
+  // avoid recomputing it every time.
+  if (!run_state_args->is_partial_run) {
+    // For regular `Run()`, we use the function calling convention, and so
+    // maintain a mapping from input/output names to
+    // argument/return-value ordinal index.
+    for (size_t i = 0; i < inputs_sorted.size(); ++i) {
+      const string& input = inputs_sorted[i];
+      ek->input_name_to_index[input] = i;
+    }
+    for (size_t i = 0; i < outputs_sorted.size(); ++i) {
+      const string& output = outputs_sorted[i];
+      ek->output_name_to_index[output] = i;
+    }
+  } else {
+    // For `PRun()`, we use the rendezvous calling convention, and so
+    // maintain a mapping from input/output names to rendezvous keys.
+    //
+    // We always use the first device as the device name portion of the
+    // key, even if we're feeding another graph.
+    for (size_t i = 0; i < inputs_sorted.size(); ++i) {
+      const string& input = inputs_sorted[i];
+      ek->input_name_to_rendezvous_key[input] = GetRendezvousKey(
+          input, device_set_.client_device()->attributes(), FrameAndIter(0, 0));
+    }
+    for (size_t i = 0; i < outputs_sorted.size(); ++i) {
+      const string& output = outputs_sorted[i];
+      ek->output_name_to_rendezvous_key[output] =
+          GetRendezvousKey(output, device_set_.client_device()->attributes(),
+                           FrameAndIter(0, 0));
+    }
   }
 
   // Reacquire the lock, try to insert into the map.
@@ -1099,7 +1201,8 @@ Status DirectSession::CreateGraphs(
     const BuildGraphOptions& subgraph_options,
     std::unordered_map<string, std::unique_ptr<Graph>>* outputs,
     std::unique_ptr<FunctionLibraryDefinition>* flib_def,
-    RunStateArgs* run_state_args) {
+    RunStateArgs* run_state_args, DataTypeVector* input_types,
+    DataTypeVector* output_types) {
   mutex_lock l(graph_def_lock_);
   std::unique_ptr<SimpleClientGraph> client_graph;
 
@@ -1124,6 +1227,23 @@ Status DirectSession::CreateGraphs(
         execution_state->BuildGraph(subgraph_options, &client_graph));
   }
 
+  if (subgraph_options.feed_endpoints.size() !=
+      client_graph->feed_types.size()) {
+    return errors::Internal(
+        "Graph pruning failed: requested number of feed endpoints = ",
+        subgraph_options.feed_endpoints.size(),
+        " versus number of pruned feed endpoints = ",
+        client_graph->feed_types.size());
+  }
+  if (subgraph_options.fetch_endpoints.size() !=
+      client_graph->fetch_types.size()) {
+    return errors::Internal(
+        "Graph pruning failed: requested number of fetch endpoints = ",
+        subgraph_options.fetch_endpoints.size(),
+        " versus number of pruned fetch endpoints = ",
+        client_graph->fetch_types.size());
+  }
+
   auto current_stateful_placements = execution_state->GetStatefulPlacements();
   // Update our current state based on the execution_state's
   // placements.  If there are any mismatches for a node,
@@ -1229,6 +1349,8 @@ Status DirectSession::CreateGraphs(
     }
   }
   *flib_def = std::move(client_graph->flib_def);
+  std::swap(*input_types, client_graph->feed_types);
+  std::swap(*output_types, client_graph->fetch_types);
   return s;
 }
 
@@ -1262,10 +1384,10 @@ DirectSession::RunState::RunState(
       }) {
   // Initially all the feeds and fetches are pending.
   for (auto& name : pending_input_names) {
-    pending_inputs.emplace(name);
+    pending_inputs[name] = false;
   }
   for (auto& name : pending_output_names) {
-    pending_outputs.emplace(name);
+    pending_outputs[name] = false;
   }
 }
 
@@ -1283,6 +1405,16 @@ DirectSession::RunState::~RunState() {
   }
 }
 
+bool DirectSession::RunState::PendingDone() const {
+  for (const auto& it : pending_inputs) {
+    if (!it.second) return false;
+  }
+  for (const auto& it : pending_outputs) {
+    if (!it.second) return false;
+  }
+  return true;
+}
+
 void DirectSession::WaitForNotification(RunState* run_state,
                                         CancellationManager* cm,
                                         int64 timeout_in_ms) {
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 1495648631ebfc11c905d754eb2967de40b2d57d..cc298b3e57dbd662046c7e63de9b9a9d9ae1dcf7 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -132,8 +132,13 @@ class DirectSession : public Session {
     NameNodeMap name_to_node;
     std::unique_ptr<FunctionLibraryDefinition> flib_def;
     std::vector<PerPartitionExecutorsAndLib> items;
-    std::unordered_map<string, string> input_keys;
-    std::unordered_map<string, string> output_keys;
+    std::unordered_map<string, size_t> input_name_to_index;
+    std::unordered_map<string, string> input_name_to_rendezvous_key;
+    std::unordered_map<string, size_t> output_name_to_index;
+    std::unordered_map<string, string> output_name_to_rendezvous_key;
+
+    DataTypeVector input_types;
+    DataTypeVector output_types;
   };
 
   // For each live partial execution, the session maintains a RunState.
@@ -146,8 +151,8 @@ class DirectSession : public Session {
     IntraProcessRendezvous* rendez = nullptr;
     std::unique_ptr<StepStatsCollector> collector;
     Notification executors_done;
-    std::unordered_set<string> pending_inputs;
-    std::unordered_set<string> pending_outputs;
+    std::unordered_map<string, bool> pending_inputs;   // true if fed
+    std::unordered_map<string, bool> pending_outputs;  // true if fetched
     TensorStore tensor_store;
     ScopedStepContainer step_container;
 
@@ -157,14 +162,19 @@ class DirectSession : public Session {
              const std::vector<string>& pending_output_names, int64 step_id,
              const std::vector<Device*>* devices);
 
+    // Returns true if all pending inputs and outputs have been completed.
+    bool PendingDone() const;
+
     ~RunState();
   };
 
   struct RunStateArgs {
+    RunStateArgs(const DebugOptions& options) : debug_options(options) {}
+
     bool is_partial_run = false;
     string handle;
     std::unique_ptr<Graph> graph;
-    std::unique_ptr<DebuggerStateInterface> debugger_state;
+    const DebugOptions& debug_options;
   };
 
   // Initializes the base execution state given the 'graph',
@@ -187,7 +197,8 @@ class DirectSession : public Session {
       const BuildGraphOptions& options,
       std::unordered_map<string, std::unique_ptr<Graph>>* outputs,
       std::unique_ptr<FunctionLibraryDefinition>* flib_def,
-      RunStateArgs* run_state_args);
+      RunStateArgs* run_state_args, DataTypeVector* input_types,
+      DataTypeVector* output_types);
 
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
       EXCLUSIVE_LOCKS_REQUIRED(graph_def_lock_);
@@ -196,17 +207,17 @@ class DirectSession : public Session {
       const Tensor& resource_tensor, Tensor* retrieved_tensor);
 
   // Feeds more inputs to the executors, triggering further execution.
-  ::tensorflow::Status SendInputs(
+  ::tensorflow::Status SendPRunInputs(
       const std::vector<std::pair<string, Tensor>>& inputs,
       const ExecutorsAndKeys* executors_and_keys,
       IntraProcessRendezvous* rendez);
 
   // Fetches more outputs from the executors. It waits until the output
   // tensors are computed.
-  ::tensorflow::Status RecvOutputs(const std::vector<string>& output_names,
-                                   const ExecutorsAndKeys* executors_and_keys,
-                                   RunState* run_state,
-                                   std::vector<Tensor>* outputs);
+  ::tensorflow::Status RecvPRunOutputs(
+      const std::vector<string>& output_names,
+      const ExecutorsAndKeys* executors_and_keys, RunState* run_state,
+      std::vector<Tensor>* outputs);
 
   // Check if the specified fetches can be computed from the feeds
   // that we have already provided.
@@ -230,6 +241,16 @@ class DirectSession : public Session {
     return ::tensorflow::Status::OK();
   }
 
+  ::tensorflow::Status CreateDebuggerState(
+      const DebugOptions& debug_options, int64 session_run_count,
+      int64 executor_step_count, const std::vector<string>& input_names,
+      const std::vector<string>& output_names,
+      const std::vector<string>& target_names,
+      std::unique_ptr<DebuggerStateInterface>* debugger_state);
+
+  ::tensorflow::Status DecorateAndPublishGraphForDebug(
+      const DebugOptions& debug_options, Graph* graph, Device* device);
+
   const SessionOptions options_;
 
   // Device structures.
@@ -247,6 +268,8 @@ class DirectSession : public Session {
   std::vector<thread::ThreadPool*> thread_pools_;
   bool owns_thread_pools_ = false;
 
+  // If true, blocks until device has finished all queued operations in a step.
+  bool sync_on_finish_ = true;
   // Schedules 'c' for execution on pool.
   void SchedClosure(thread::ThreadPool* pool, std::function<void()> c);
 
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index c8b8a09b8e86deacf161c4867ed6fcaf277df118..f8deaaf222927939326a484b0c82e7c21999af9f 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -1178,7 +1178,7 @@ void FeedFetchBenchmarkHelper(int num_feeds, int iters) {
     // monitor this overhead where possible, but that is not the
     // object of study in this benchmark.
     Node* placeholder;
-    TF_CHECK_OK(NodeBuilder(g.NewName("Placeholder"), "PlaceholderV2")
+    TF_CHECK_OK(NodeBuilder(g.NewName("Placeholder"), "Placeholder")
                     .Attr("shape", TensorShape())
                     .Attr("dtype", DT_FLOAT)
                     .Device("/cpu:0")
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index eaa54c2c48a56140747da6cca04e61d4798282de..9e18547af5cf9d8107585d33a391ec21decd9adf 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -92,31 +92,28 @@ bool SetTimelineLabel(const Node* node, NodeExecStats* node_stats) {
       }
     }
   }
-  const NodeDef& def = node->def();
-  string text = "";
+  const AttrSlice attrs = node->attrs();
+  string text;
   if (IsSend(node)) {
     string tensor_name;
-    TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name));
+    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
     string recv_device;
-    TF_CHECK_OK(GetNodeAttr(def, "recv_device", &recv_device));
-    text = strings::StrCat(memory, def.name(), " = ", def.op(), "(",
-                           tensor_name, " @", recv_device);
+    TF_CHECK_OK(GetNodeAttr(attrs, "recv_device", &recv_device));
+    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
+                           "(", tensor_name, " @", recv_device);
     is_transfer_node = true;
   } else if (IsRecv(node)) {
     string tensor_name;
-    TF_CHECK_OK(GetNodeAttr(def, "tensor_name", &tensor_name));
+    TF_CHECK_OK(GetNodeAttr(attrs, "tensor_name", &tensor_name));
     string send_device;
-    TF_CHECK_OK(GetNodeAttr(def, "send_device", &send_device));
-    text = strings::StrCat(memory, def.name(), " = ", def.op(), "(",
-                           tensor_name, " @", send_device);
+    TF_CHECK_OK(GetNodeAttr(attrs, "send_device", &send_device));
+    text = strings::StrCat(memory, node->name(), " = ", node->type_string(),
+                           "(", tensor_name, " @", send_device);
     is_transfer_node = true;
   } else {
-    text = strings::StrCat(
-        memory, def.name(), " = ", def.op(), "(",
-        str_util::Join(
-            std::vector<StringPiece>(def.input().begin(), def.input().end()),
-            ", "),
-        ")");
+    text =
+        strings::StrCat(memory, node->name(), " = ", node->type_string(), "(",
+                        str_util::Join(node->requested_inputs(), ", "), ")");
   }
   node_stats->set_timeline_label(text);
   return is_transfer_node;
@@ -232,7 +229,7 @@ struct NodeItem {
   int input_start = 0;
 
   // Number of output edges.
-  int num_output_edges;
+  size_t num_output_edges;
 
   PendingCounts::Handle pending_id;
 
@@ -307,7 +304,7 @@ class GraphView {
   void Initialize(const Graph* g);
   Status SetAllocAttrs(const Graph* g, const Device* device);
 
-  NodeItem* node(int id) const {
+  NodeItem* node(size_t id) const {
     DCHECK_GE(id, 0);
     DCHECK_LT(id, num_nodes_);
     uint32 offset = node_offsets_[id];
@@ -454,7 +451,7 @@ GraphView::~GraphView() {
 }
 
 size_t GraphView::NodeItemBytes(const Node* n) {
-  const int num_output_edges = n->out_edges().size();
+  const size_t num_output_edges = n->out_edges().size();
   const int num_inputs = n->num_inputs();
   const int num_outputs = n->num_outputs();
 
@@ -500,11 +497,11 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
   // pointers). Casting to int64 is needed on 32bit CPU to avoid comparing
   // values as "int" vs "size_t" in CHECK_LE.
   CHECK_LE(static_cast<int64>(ptr - space_), kuint32max);
-  const uint32 offset = ptr - space_;
+  const uint32 offset = static_cast<uint32>(ptr - space_);
   node_offsets_[id] = offset;
   ptr += bytes;
 
-  const int num_output_edges = n->out_edges().size();
+  const size_t num_output_edges = n->out_edges().size();
   const int num_inputs = n->num_inputs();
   const int num_outputs = n->num_outputs();
 
@@ -522,7 +519,7 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
   EdgeInfo* dst_edge = item->output_edge_base();
   for (auto e : n->out_edges()) {
     dst_edge->dst_id = e->dst()->id();
-    CHECK_LE(e->src_output(), ((int32)0x3FFFFFFF));  // Must fit in 31 bits
+    CHECK_LE(e->src_output(), 0x3FFFFFFF);  // Must fit in 31 bits
     dst_edge->output_slot = e->src_output();
     dst_edge->is_last = false;
     const int output_slot = dst_edge->output_slot;
@@ -580,9 +577,10 @@ void GraphView::Initialize(const Graph* g) {
   CHECK_EQ(ptr, space_ + total_bytes);
 }
 
-void GetMaxPendingCounts(const Node* n, int* max_pending, int* max_dead_count) {
-  const int num_in_edges = n->in_edges().size();
-  int initial_count;
+void GetMaxPendingCounts(const Node* n, size_t* max_pending,
+                         size_t* max_dead_count) {
+  const size_t num_in_edges = n->in_edges().size();
+  size_t initial_count;
   if (IsMerge(n)) {
     // merge waits all control inputs so we initialize the pending
     // count to be the number of control edges.
@@ -626,8 +624,7 @@ Status ExecutorImpl::Initialize() {
     FrameInfo* frame_info = EnsureFrameInfo(frame_name);
 
     // See if this node is a root node, and if so, add to root_nodes_.
-    const int num_in_edges = n->in_edges().size();
-    if (num_in_edges == 0) {
+    if (n->in_edges().empty()) {
       root_nodes_.push_back(n);
     }
 
@@ -640,7 +637,7 @@ Status ExecutorImpl::Initialize() {
     Status s = params_.create_kernel(n->def(), &item->kernel);
     if (!s.ok()) {
       item->kernel = nullptr;
-      s = AttachDef(s, n->def());
+      s = AttachDef(s, *n);
       LOG(ERROR) << "Executor failed to create kernel. " << s;
       return s;
     }
@@ -659,7 +656,7 @@ Status ExecutorImpl::Initialize() {
     // pending counts data structure, and allocate a handle in
     // that frame's pending counts data structure that has enough
     // space to store these maximal count values.
-    int max_pending, max_dead;
+    size_t max_pending, max_dead;
     GetMaxPendingCounts(n, &max_pending, &max_dead);
     item->pending_id =
         frame_info->pending_counts_layout.CreateHandle(max_pending, max_dead);
@@ -668,7 +665,7 @@ Status ExecutorImpl::Initialize() {
     frame_info->nodes->push_back(n);
     if (IsEnter(n)) {
       string enter_name;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "frame_name", &enter_name));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &enter_name));
       EnsureFrameInfo(enter_name)->input_count++;
     }
   }
@@ -723,7 +720,7 @@ Status InferAllocAttr(const Node* n, const Node* dst,
   // so these two cases are not mutually exclusive.
   if (IsRecv(n)) {
     string src_name;
-    s = GetNodeAttr(n->def(), "send_device", &src_name);
+    s = GetNodeAttr(n->attrs(), "send_device", &src_name);
     if (!s.ok()) return s;
     DeviceNameUtils::ParsedName parsed_src_name;
     if (!DeviceNameUtils::ParseFullName(src_name, &parsed_src_name)) {
@@ -748,7 +745,7 @@ Status InferAllocAttr(const Node* n, const Node* dst,
   }
   if (IsSend(dst)) {
     string dst_name;
-    s = GetNodeAttr(dst->def(), "recv_device", &dst_name);
+    s = GetNodeAttr(dst->attrs(), "recv_device", &dst_name);
     if (!s.ok()) return s;
     DeviceNameUtils::ParsedName parsed_dst_name;
     if (!DeviceNameUtils::ParseFullName(dst_name, &parsed_dst_name)) {
@@ -896,7 +893,7 @@ class ExecutorState {
     Entry* input_tensors;
 
     // The number of outstanding ops for each iteration.
-    int outstanding_ops;
+    size_t outstanding_ops;
 
     // The number of outstanding frames for each iteration.
     int outstanding_frame_count;
@@ -1037,13 +1034,13 @@ class ExecutorState {
 
     inline IterationState* GetIteration(int64 iter)
         EXCLUSIVE_LOCKS_REQUIRED(mu) {
-      int index = iter % iterations.size();
+      size_t index = iter % iterations.size();
       return iterations[index];
     }
 
     inline void SetIteration(int64 iter, IterationState* state)
         EXCLUSIVE_LOCKS_REQUIRED(mu) {
-      int index = iter % iterations.size();
+      size_t index = iter % iterations.size();
       DCHECK(state == nullptr || iterations[index] == nullptr);
       iterations[index] = state;
     }
@@ -1213,7 +1210,8 @@ class ExecutorState {
       GUARDED_BY(mu_);
 
   // The unique name of a frame.
-  inline string MakeFrameName(FrameState* frame, int64 iter_id, string name) {
+  inline string MakeFrameName(FrameState* frame, int64 iter_id,
+                              const string& name) {
     return strings::StrCat(frame->frame_name, ";", iter_id, ";", name);
   }
 
@@ -1360,7 +1358,7 @@ Status ExecutorImpl::BuildControlFlowInfo(const Graph* g,
     if (IsEnter(curr_node)) {
       // Enter a child frame.
       TF_RETURN_IF_ERROR(
-          GetNodeAttr(curr_node->def(), "frame_name", &frame_name));
+          GetNodeAttr(curr_node->attrs(), "frame_name", &frame_name));
       parent = curr_node;
     } else if (IsExit(curr_node)) {
       // Exit to the parent frame.
@@ -1404,7 +1402,7 @@ void ExecutorImpl::InitializePending(const Graph* graph,
   for (const Node* n : graph->nodes()) {
     const int id = n->id();
     const string& name = cf_info.frame_names[id];
-    int max_pending, max_dead;
+    size_t max_pending, max_dead;
     GetMaxPendingCounts(n, &max_pending, &max_dead);
     const NodeItem* item = gview_.node(id);
     PendingCounts* counts = EnsureFrameInfo(name)->pending_counts;
@@ -1434,7 +1432,7 @@ void ExecutorState::RunAsync(Executor::DoneCallback done) {
   } else {
     num_outstanding_ops_ = ready.size();
     root_frame_->iterations[0]->outstanding_ops = ready.size();
-    done_cb_ = done;
+    done_cb_ = std::move(done);
     // Schedule to run all the ready ops in thread pool.
     ScheduleReady(ready, nullptr);
   }
@@ -1554,8 +1552,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
 
     if (vlog_) {
       VLOG(1) << "Process node: " << id << " step " << params.step_id << " "
-              << SummarizeNodeDef(node->def())
-              << " is dead: " << tagged_node.is_dead;
+              << SummarizeNode(*node) << " is dead: " << tagged_node.is_dead;
     }
 
     Entry* input_tensors = GetInputTensors(input_frame, input_iter);
@@ -1609,7 +1606,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
 
           if (vlog_) {
             VLOG(2) << this << " Async kernel done: "
-                    << SummarizeNodeDef(state->item->node->def());
+                    << SummarizeNode(*state->item->node);
           }
           if (stats) nodestats::SetOpEnd(stats);
           EntryVector outputs;
@@ -1810,7 +1807,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
       // tensor value at i-th output.
       if (!IsSwitch(node) && !IsRecv(node)) {
         s.Update(errors::Internal("Missing ", i, "-th output from ",
-                                  SummarizeNodeDef(node->def())));
+                                  SummarizeNode(*node)));
       }
     } else {
       Entry* out = &((*outputs)[i]);
@@ -1877,7 +1874,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
                                   DataTypeString(dtype),
                                   " does not match declared output type ",
                                   DataTypeString(item.output_type(i)),
-                                  " for node ", SummarizeNodeDef(node->def())));
+                                  " for node ", SummarizeNode(*node)));
       }
     }
     if (!val.is_ref()) {
@@ -1914,7 +1911,7 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node,
         &impl_->gview_, input_iter, ready);
   } else if (item->is_enter) {
     bool is_constant;
-    Status s = GetNodeAttr(node->def(), "is_constant", &is_constant);
+    Status s = GetNodeAttr(node->attrs(), "is_constant", &is_constant);
     DCHECK(s.ok()) << s;
     FindOrCreateChildFrame(input_frame, input_iter, node, &output_frame);
     output_iter = 0;
@@ -2027,7 +2024,7 @@ bool ExecutorState::NodeDone(const Status& s, const Node* node,
   }
 
   bool completed = false;
-  int ready_size = ready.size();
+  size_t ready_size = ready.size();
   if (ready_size == 0 || !s.ok()) {
     completed = (num_outstanding_ops_.fetch_sub(1) == 1);
   } else if (ready_size > 1) {
@@ -2240,7 +2237,7 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter,
                                            FrameState** child) {
   // Get the child frame name.
   string enter_name;
-  Status s = GetNodeAttr(node->def(), "frame_name", &enter_name);
+  Status s = GetNodeAttr(node->attrs(), "frame_name", &enter_name);
   DCHECK(s.ok()) << s;
   const string child_name = MakeFrameName(frame, iter, enter_name);
 
@@ -2258,7 +2255,7 @@ void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter,
   if (vlog_) VLOG(2) << "Create frame: " << child_name;
 
   int parallel_iters;
-  s = GetNodeAttr(node->def(), "parallel_iterations", &parallel_iters);
+  s = GetNodeAttr(node->attrs(), "parallel_iterations", &parallel_iters);
   DCHECK(s.ok()) << s;
   FrameState* temp = new FrameState(impl_, parallel_iters);
   temp->frame_name = child_name;
@@ -2375,10 +2372,10 @@ void ExecutorState::FrameState::ActivateNodes(const NodeItem* item,
                                               TaggedNodeSeq* ready) {
   const GraphView& gview = executor->gview_;
   IterationState* iter_state = GetIteration(iter);
-  const int num_output_edges = item->num_output_edges;
+  const size_t num_output_edges = item->num_output_edges;
   const EdgeInfo* edges = item->output_edge_list();
   Entry* input_tensors = iter_state->input_tensors;
-  for (int out_index = 0; out_index < num_output_edges; out_index++) {
+  for (size_t out_index = 0; out_index < num_output_edges; out_index++) {
     const EdgeInfo& e = edges[out_index];
     const int dst_id = e.dst_id;
     const NodeItem* dst_item = gview.node(dst_id);
@@ -2560,7 +2557,7 @@ bool ExecutorState::FrameState::CleanupIterations(const GraphView* gview,
 }
 
 void ExecutorImpl::RunAsync(const Args& args, DoneCallback done) {
-  (new ExecutorState(args, this))->RunAsync(done);
+  (new ExecutorState(args, this))->RunAsync(std::move(done));
 }
 
 }  // end namespace
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 239c9666e339114793fd6ea108ab6ffbec832185..e09dc4e34630fc0ab22615b7204bd0ec2d117d35 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -74,8 +74,8 @@ class Executor {
   //
   // RunAsync() uses "cancellation_manager", if not nullptr, to
   // register callbacks that should be called if the graph computation
-  // is cancelled. Note that the callbacks merely unblock any
-  // long-running computation, and a cancelled step will terminate by
+  // is canceled. Note that the callbacks merely unblock any
+  // long-running computation, and a canceled step will terminate by
   // returning/calling the DoneCallback as usual.
   //
   // RunAsync() dispatches closures to "runner". Typically, "runner"
@@ -162,7 +162,7 @@ class ExecutorBarrier {
   //
   // 'done' is called after the last executor completes, and
   // ExecutorBarrier is deleted.
-  ExecutorBarrier(int num, Rendezvous* r, StatusCallback done)
+  ExecutorBarrier(size_t num, Rendezvous* r, StatusCallback done)
       : rendez_(r), done_cb_(done), pending_(num) {}
 
   ~ExecutorBarrier() {}
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index cb7e1a40ceba433e7ffa2786f569206eb4b2594a..407c20bbf2c970a8fc56280321937177eda5bf60 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -150,8 +150,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   ~FunctionLibraryRuntimeImpl() override;
 
-  Status Instantiate(const string& function_name,
-                     const InstantiateAttrValueMap& attrs,
+  Status Instantiate(const string& function_name, AttrSlice attrs,
                      Handle* handle) override;
 
   const FunctionBody* GetFunctionBody(Handle handle) override;
@@ -208,8 +207,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   };
   std::vector<Item*> items_;
 
-  Status FunctionDefToBody(const FunctionDef& fdef,
-                           const InstantiateAttrValueMap& attrs,
+  Status FunctionDefToBody(const FunctionDef& fdef, AttrSlice attrs,
                            FunctionBody** fbody);
   Status CreateItem(Handle handle, Item** item);
   Status GetOrCreateItem(Handle handle, Item** item);
@@ -274,8 +272,9 @@ class CallOp : public AsyncOpKernel {
                if (!status.ok()) {
                  ctx->SetStatus(status);
                } else {
-                 CHECK_EQ(rets->size(), ctx->num_outputs());
-                 for (size_t i = 0; i < rets->size(); ++i) {
+                 const int ret_size = static_cast<int>(rets->size());
+                 CHECK_EQ(ret_size, ctx->num_outputs());
+                 for (int i = 0; i < ret_size; ++i) {
                    ctx->set_output(i, (*rets)[i]);
                  }
                }
@@ -323,7 +322,7 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
   // Try to instantiate this function for the func/attr. Maybe its
   // cached already.
   Handle handle;
-  TF_RETURN_IF_ERROR(Instantiate(ndef.op(), ndef.attr(), &handle));
+  TF_RETURN_IF_ERROR(Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
 
   const FunctionBody* fbody = GetFunctionBody(handle);
   CHECK_NOTNULL(fbody);
@@ -354,24 +353,10 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
   return s;
 }
 
-Status FunctionLibraryRuntimeImpl::FunctionDefToBody(
-    const FunctionDef& fdef, const InstantiateAttrValueMap& attrs,
-    FunctionBody** fbody) {
-  // Instantiates the function template into a graph def.
-  InstantiationResult result;
-  TF_RETURN_IF_ERROR(InstantiateFunction(fdef, attrs, get_func_sig_, &result));
-
-  Graph* graph = new Graph(lib_def_);
-  GraphConstructorOptions opts;
-  opts.allow_internal_ops = true;
-  opts.expect_device_spec = false;
-  Status s = ConvertGraphDefToGraph(opts, result.gdef, graph);
-  if (!s.ok()) {
-    delete graph;
-  } else {
-    *fbody = new FunctionBody(fdef, result.arg_types, result.ret_types, graph);
-  }
-  return s;
+Status FunctionLibraryRuntimeImpl::FunctionDefToBody(const FunctionDef& fdef,
+                                                     AttrSlice attrs,
+                                                     FunctionBody** fbody) {
+  return FunctionDefToBodyHelper(fdef, attrs, lib_def_, get_func_sig_, fbody);
 }
 
 Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
@@ -389,11 +374,13 @@ Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
     // TODO(josh11b): Should filter out the attrs from func that aren't used
     // by the gradient function.
     TF_RETURN_IF_ERROR(creator(AttrSlice(&func.attr()), &grad_fdef));
-    TF_RETURN_IF_ERROR(FunctionDefToBody(grad_fdef, func.attr(), g_body));
+    TF_RETURN_IF_ERROR(
+        FunctionDefToBody(grad_fdef, AttrSlice(&func.attr()), g_body));
   } else {
     // f is a user-defined function.
     Handle f_handle;
-    TF_RETURN_IF_ERROR(Instantiate(func.name(), func.attr(), &f_handle));
+    TF_RETURN_IF_ERROR(
+        Instantiate(func.name(), AttrSlice(&func.attr()), &f_handle));
     const FunctionBody* f_body = GetFunctionBody(f_handle);
     CHECK_NOTNULL(f_body);
     *g_body = SymbolicGradient(*f_body);
@@ -401,9 +388,9 @@ Status FunctionLibraryRuntimeImpl::InstantiateSymbolicGradient(
   return Status::OK();
 }
 
-Status FunctionLibraryRuntimeImpl::Instantiate(
-    const string& function_name, const InstantiateAttrValueMap& attrs,
-    Handle* handle) {
+Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name,
+                                               AttrSlice attrs,
+                                               Handle* handle) {
   const string key = Canonicalize(function_name, attrs);
   {
     mutex_lock l(mu_);
@@ -416,7 +403,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
   Status s;
   FunctionBody* fbody = nullptr;
   if (function_name == kGradientOp) {
-    const AttrValue* f = gtl::FindOrNull(attrs, kFuncAttr);
+    const AttrValue* f = attrs.Find(kFuncAttr);
     if (f == nullptr) {
       return errors::InvalidArgument("SymbolicGradient is missing attr: f");
     }
@@ -426,7 +413,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
     }
     const string grad = lib_def_->FindGradient(func.name());
     if (!grad.empty()) {
-      return Instantiate(grad, func.attr(), handle);
+      return Instantiate(grad, AttrSlice(&func.attr()), handle);
     }
     TF_RETURN_IF_ERROR(InstantiateSymbolicGradient(func, &fbody));
   } else {
@@ -455,7 +442,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
 void DumpGraph(StringPiece label, const Graph* g) {
   // TODO(zhifengc): Change Graph to record #nodes.
   VLOG(1) << "Graph " << label << " #nodes " << g->num_nodes() << " #edges "
-          << g->edges().size();
+          << g->num_edges();
   if (VLOG_IS_ON(2)) {
     for (const auto& line : str_util::Split(DebugString(g), '\n')) {
       VLOG(2) << "|| " << line;
@@ -603,7 +590,7 @@ struct CustomCreatorSingleton {
 
   void Set(CustomKernelCreator cb) {
     mutex_lock l(mu);
-    custom_creator = cb;
+    custom_creator = std::move(cb);
   }
 
   CustomKernelCreator Get() {
@@ -620,7 +607,7 @@ CustomCreatorSingleton* GetCustomCreatorSingleton() {
 }  // end namespace
 
 void RegisterDefaultCustomKernelCreator(CustomKernelCreator cb) {
-  GetCustomCreatorSingleton()->Set(cb);
+  GetCustomCreatorSingleton()->Set(std::move(cb));
 }
 
 FunctionLibraryRuntime* NewFunctionLibraryRuntime(
@@ -630,7 +617,7 @@ FunctionLibraryRuntime* NewFunctionLibraryRuntime(
     CustomKernelCreator custom_kernel_creator) {
   return new FunctionLibraryRuntimeImpl(dmgr, env, device, graph_def_version,
                                         lib_def, optimizer_options,
-                                        custom_kernel_creator);
+                                        std::move(custom_kernel_creator));
 }
 
 FunctionLibraryRuntime* NewFunctionLibraryRuntime(
@@ -828,14 +815,31 @@ static bool ValidateInlining(const Node* node, const FunctionBody* fbody) {
 // Given a "caller" in "graph", which is a function call of a function
 // to "fbody". Replaces the "caller" with fbody->graph and connects
 // edges properly.
-static void InlineFunctionBody(Graph* g, Node* caller,
-                               const FunctionBody* fbody) {
+void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
+                        Node* caller, const FunctionBody* fbody) {
   if (!ValidateInlining(caller, fbody)) {
     LOG(WARNING) << "Inlining mismatch: " << caller->DebugString() << " vs. "
                  << DebugString(fbody->graph);
     return;
   }
 
+  // Input edges. For data edges coming into "caller", we first compute the
+  // <src>:<src_output> for the i-th input in "inputs".
+  // If "caller" has any input control dependencies, we add a NoOp
+  // node "input_control_node", which depends on "caller"'s control inputs.
+  std::vector<Endpoint> inputs(caller->num_inputs());
+  Node* input_control_node = nullptr;
+  for (const Edge* e : caller->in_edges()) {
+    if (e->IsControlEdge()) {
+      if (input_control_node == nullptr) {
+        input_control_node = AddNoOp(g);
+      }
+      g->AddControlEdge(e->src(), input_control_node);
+    } else {
+      inputs[e->dst_input()] = {e->src(), e->src_output()};
+    }
+  }
+
   // Duplicate fbody->graph into 'g'.  First, we copy the nodes of
   // fbody->graph into 'g' except the source and sink nodes.  We copy
   // edges among nodes in 'fbody->graph'.
@@ -849,8 +853,35 @@ static void InlineFunctionBody(Graph* g, Node* caller,
     CHECK(n->IsOp());
     NodeDef ndef = n->def();
     ndef.set_name(strings::StrCat(caller->name(), "/", ndef.name()));
-    node_map[n->id()] = g->AddNode(ndef, &s);
+    Node* clone = g->AddNode(ndef, &s);
     TF_CHECK_OK(s);
+    node_map[n->id()] = clone;
+
+    // If there is an input control node, and one of:
+    // a) the node has no data or control inputs, or
+    // b) the node is a function call or SymbolicGradient,
+    // then add a control edge from the input control node to the clone.
+    //
+    // We must not execute any nodes if the original function call would not
+    // have executed. This is especially critical when the function call is
+    // inside a control-flow construct like tf.cond(). Case (a) ensures that
+    // such nodes do not run.
+    //
+    // The purpose of case (b) is to ensure that instances of case (a) created
+    // by further inlining steps also receive the control dependency.
+    if (input_control_node) {
+      bool has_inputs = false;
+      for (const Edge* e : n->in_edges()) {
+        if (!e->src()->IsSource()) {
+          has_inputs = true;
+          break;
+        }
+      }
+      if (!has_inputs || flib_def.Find(clone->type_string()) != nullptr ||
+          clone->type_string() == "SymbolicGradient") {
+        g->AddControlEdge(input_control_node, clone);
+      }
+    }
   }
   for (const Edge* e : fbody->graph->edges()) {
     if (e->src()->IsSource() || e->src()->IsSink() || e->dst()->IsSource() ||
@@ -864,29 +895,12 @@ static void InlineFunctionBody(Graph* g, Node* caller,
 
   // Connect input edges.
   //
-  // For data edges coming into "caller", we first compute the
-  // <src>:<src_output> for the i-th input in "inputs". We create one
-  // Identity node for each input. Then, we connect inputs[i] to to
-  // the i-th identity node added. The nodes that previously connects
-  // to the j-th output of i-th arg node are reconnected to th i-th
+  // We create one Identity node for each input. Then, we connect inputs[i] to
+  // the i-th identity node added. The nodes that previously connected
+  // to the j-th output of i-th arg node are reconnected to the i-th
   // identity node.
   //
-  // If "caller" has any input control dependencies, we add a NoOp
-  // node "input_control_node". This "input_control_node" depends on
-  // what "caller" depends on, and the added identity nodes depend on
-  // "input_control_node".
-  std::vector<Endpoint> inputs(caller->num_inputs());
-  Node* input_control_node = nullptr;
-  for (const Edge* e : caller->in_edges()) {
-    if (e->IsControlEdge()) {
-      if (input_control_node == nullptr) {
-        input_control_node = AddNoOp(g);
-      }
-      g->AddControlEdge(e->src(), input_control_node);
-    } else {
-      inputs[e->dst_input()] = {e->src(), e->src_output()};
-    }
-  }
+  // The added identity nodes depend on "input_control_node".
   for (std::size_t i = 0; i < fbody->arg_nodes.size(); ++i) {
     Node* arg = node_map[fbody->arg_nodes[i]->id()];
     Node* n = AddIdentity(g, inputs[i]);
@@ -960,13 +974,12 @@ bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
   for (Node* node : graph->nodes()) {
     VLOG(3) << "Expanding " << node->DebugString();
     bool noinline;
-    if (fld->GetAttr(node->def(), kNoInlineAttr, &noinline).ok() && noinline) {
+    if (fld->GetAttr(*node, kNoInlineAttr, &noinline).ok() && noinline) {
       VLOG(3) << "noinline: " << node->DebugString();
       continue;
     }
     FunctionLibraryRuntime::Handle handle;
-    Status s =
-        lib->Instantiate(node->type_string(), node->def().attr(), &handle);
+    Status s = lib->Instantiate(node->type_string(), node->attrs(), &handle);
     if (!s.ok()) {
       // Either "node" is a primitive op, or the instantiation failed.
       if (errors::IsNotFound(s)) {
@@ -981,7 +994,7 @@ bool ExpandInlineFunctions(FunctionLibraryRuntime* lib, Graph* graph) {
     candidates.push_back({node, fbody});
   }
   for (const auto& p : candidates) {
-    InlineFunctionBody(graph, p.first, p.second);
+    InlineFunctionBody(*fld, graph, p.first, p.second);
   }
   return !candidates.empty();
 }
@@ -1000,25 +1013,19 @@ string NewName(const Node* n, bool pretty) {
 void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
   // We visit nodes in forward topological sort order, which is a
   // possible execution order of the graph.
-  std::vector<int> pending(g->num_node_ids());
-  std::deque<const Node*> ready;
-  for (const Node* n : g->nodes()) {
-    pending[n->id()] = n->in_edges().size();
-    if (pending[n->id()] == 0) ready.push_back(n);
-  }
   gtl::InlinedVector<const Edge*, 4> inputs;
   gdef->Clear();
   gdef->mutable_versions()->CopyFrom(g->versions());
-  while (!ready.empty()) {
-    const Node* n = ready.front();
-    ready.pop_front();
-    for (const Edge* e : n->out_edges()) {
-      const Node* next = e->dst();
-      if (--pending[next->id()] == 0) {
-        ready.push_back(next);
-      }
+
+  std::vector<Node*> start_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->out_edges().empty()) {
+      start_nodes.push_back(n);
     }
-    if (!n->IsOp()) continue;
+  }
+
+  ReverseDFSFrom(*g, start_nodes, nullptr, [gdef, pretty, &inputs](Node* n) {
+    if (!n->IsOp()) return;
     NodeDef* ndef = gdef->add_node();
     ndef->set_name(NewName(n, pretty));
     ndef->set_op(n->type_string());
@@ -1053,7 +1060,7 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
         ndef->add_input(strings::StrCat(srcname, ":", e->src_output()));
       }
     }
-  }
+  });
 }
 
 string DebugString(const Graph* g) {
@@ -1080,7 +1087,7 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
       continue;
     }
     int index;
-    TF_CHECK_OK(GetNodeAttr(n->def(), "index", &index));
+    TF_CHECK_OK(GetNodeAttr(n->attrs(), "index", &index));
     CHECK_LE(0, index);
     CHECK_LT(index, node_vec->size());
     (*node_vec)[index] = n;
@@ -1154,7 +1161,7 @@ FunctionBody* SymbolicGradientHelper::Compute() {
 
   Graph* g = gbody_->graph;
 
-  const int num_y = gbody_->ret_nodes.size();
+  const int num_y = static_cast<int>(gbody_->ret_nodes.size());
 
   // Populate 'y_node_outputs_' with node function body outputs.
   // Populate 'y_grad_nodes' with initial gradient nodes for each return node of
@@ -1169,7 +1176,7 @@ FunctionBody* SymbolicGradientHelper::Compute() {
     y_node_outputs.push_back({y, 0});
     DCHECK_EQ(y->type_string(), kRetOp);
     const DataType dtype = y->input_type(0);
-    const int index = gbody_->arg_nodes.size();
+    const int index = static_cast<int>(gbody_->arg_nodes.size());
     Node* dy = AddArg(g, dtype, index);
     gbody_->arg_types.push_back(dtype);
     gbody_->arg_nodes.push_back(dy);
@@ -1177,7 +1184,7 @@ FunctionBody* SymbolicGradientHelper::Compute() {
   }
 
   // Populate 'x_nodes' with function args (excluding 'y_grad_node_outputs').
-  const int num_x = fbody_->arg_nodes.size();
+  const size_t num_x = fbody_->arg_nodes.size();
   std::vector<NodeOut> x_node_outputs;
   x_node_outputs.reserve(num_x);
   for (size_t i = 0; i < fbody_->arg_nodes.size(); ++i) {
@@ -1200,7 +1207,8 @@ FunctionBody* SymbolicGradientHelper::Compute() {
   gbody_->ret_nodes.clear();
   // Add new return nodes to the function gradient body for each node
   // in 'x_grad_nodes'.
-  for (size_t i = 0; i < fbody_->arg_types.size(); ++i) {
+  const int arg_types_size = static_cast<int>(fbody_->arg_types.size());
+  for (int i = 0; i < arg_types_size; ++i) {
     Endpoint grad = {x_grad_node_outputs[i].node, x_grad_node_outputs[i].index};
     Node* ret = AddRet(g, grad, i);
     gbody_->ret_nodes.push_back(ret);
@@ -1215,4 +1223,26 @@ FunctionBody* SymbolicGradient(const FunctionBody& f) {
   return SymbolicGradientHelper(f).Compute();
 }
 
+Status FunctionDefToBodyHelper(
+    const FunctionDef& fdef, const AttrSlice& attrs,
+    const FunctionLibraryDefinition* const lib_def,
+    const std::function<Status(const string&, const OpDef**)>& get_func_sig,
+    FunctionBody** fbody) {
+  // Instantiates the function template into a graph def.
+  InstantiationResult result;
+  TF_RETURN_IF_ERROR(InstantiateFunction(fdef, attrs, get_func_sig, &result));
+
+  Graph* graph = new Graph(lib_def);
+  GraphConstructorOptions opts;
+  opts.allow_internal_ops = true;
+  opts.expect_device_spec = false;
+  Status s = ConvertGraphDefToGraph(opts, result.gdef, graph);
+  if (!s.ok()) {
+    delete graph;
+  } else {
+    *fbody = new FunctionBody(fdef, result.arg_types, result.ret_types, graph);
+  }
+  return s;
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index 7cfe694673510b482c17479e5dcc47ea9cfd3cbc..f2244f8376056da10da250a39d61336c461b34f1 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -147,6 +147,19 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty = false);
 // TODO(zhifengc): Asks math expert to say the comment again.
 FunctionBody* SymbolicGradient(const FunctionBody& f);
 
+// Given a "caller" in graph "g", which is a function call of a function
+// to "fbody". Replaces the "caller" with fbody->graph and connects
+// edges properly.
+void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
+                        Node* caller, const FunctionBody* fbody);
+
+// Instantiates FunctionDef into a graph. Set *fbody to point to the
+// FunctionBody that holds the instantiated FunctionDef.
+Status FunctionDefToBodyHelper(
+    const FunctionDef& fdef, const AttrSlice& attrs,
+    const FunctionLibraryDefinition* const lib_def,
+    const std::function<Status(const string&, const OpDef**)>& get_func_sig,
+    FunctionBody** fbody);
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_COMMON_RUNTIME_FUNCTION_H_
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index f86a8ed5dc06101f7027f7a6502ceddcbe50ba6f..e27fc3898dc9c16482ea6c45edf7c06090bf79f2 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -17,6 +17,10 @@ limitations under the License.
 
 #include <atomic>
 
+#include "tensorflow/cc/ops/array_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/executor.h"
@@ -28,12 +32,15 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
+namespace {
 
 typedef FunctionDefHelper FDH;
 
@@ -44,7 +51,7 @@ Status GetOpSig(const string& op, const OpDef** sig) {
 void FunctionTestSchedClosure(std::function<void()> fn) {
   static thread::ThreadPool* w =
       new thread::ThreadPool(Env::Default(), "Test", 8);
-  w->Schedule(fn);
+  w->Schedule(std::move(fn));
 }
 
 void HasError(const Status& s, const string& substr) {
@@ -52,19 +59,30 @@ void HasError(const Status& s, const string& substr) {
       << s << ", expected substring " << substr;
 }
 
+// A helper class to make AttrSlice from initializer lists
+class Attrs {
+ public:
+  Attrs(const std::initializer_list<  // NOLINT(runtime/explicit)
+        std::pair<string, FunctionDefHelper::AttrValueWrapper>>& attrs) {
+    for (const auto& aval : attrs) {
+      map_.insert({aval.first, aval.second.proto});
+    }
+  }
+
+  operator AttrSlice() { return AttrSlice(&map_); }  // NOLINT(runtime/explicit)
+
+ private:
+  AttrValueMap map_;
+};
+
 class FunctionTest : public ::testing::Test {
  protected:
   FunctionTest()
       : device_(DeviceFactory::NewDevice("CPU", {},
                                          "/job:localhost/replica:0/task:0")) {}
 
-  ~FunctionTest() override {
-    delete exec_;
-    delete device_;
-  }
-
-  void Create(const FunctionDef& fdef, InstantiateAttrValueSlice attrs) {
-    delete exec_;
+  void Create(const FunctionDef& fdef, Attrs attrs) {
+    exec_ = nullptr;
     InstantiationResult result;
     TF_CHECK_OK(InstantiateFunction(fdef, attrs, GetOpSig, &result));
 
@@ -79,15 +97,18 @@ class FunctionTest : public ::testing::Test {
 
     const int version = g->versions().producer();
     LocalExecutorParams params;
-    params.device = device_;
+    params.device = device_.get();
     params.create_kernel = [this, version](const NodeDef& ndef,
                                            OpKernel** kernel) {
-      return CreateNonCachedKernel(device_, nullptr, ndef, version, kernel);
+      return CreateNonCachedKernel(device_.get(), nullptr, ndef, version,
+                                   kernel);
     };
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
     };
-    TF_CHECK_OK(NewLocalExecutor(params, g, &exec_));
+    Executor* exec;
+    TF_CHECK_OK(NewLocalExecutor(params, g, &exec));
+    exec_.reset(exec);
   }
 
   void Run(const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
@@ -105,8 +126,8 @@ class FunctionTest : public ::testing::Test {
     }
   }
 
-  Device* device_ = nullptr;
-  Executor* exec_ = nullptr;
+  std::unique_ptr<Device> device_;
+  std::unique_ptr<Executor> exec_;
   DataTypeVector arg_types_;
   DataTypeVector ret_types_;
 };
@@ -136,25 +157,19 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       : device_(DeviceFactory::NewDevice("CPU", {},
                                          "/job:localhost/replica:0/task:0")) {}
 
-  ~FunctionLibraryRuntimeTest() override {
-    delete lib_;
-    delete lib_def_;
-    delete device_;
-  }
-
   void Init(const std::vector<FunctionDef>& flib) {
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
-    delete lib_def_;
-    lib_def_ = new FunctionLibraryDefinition(OpRegistry::Global(), proto);
-    delete lib_;
+    lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
-    lib_ = NewFunctionLibraryRuntime(nullptr, Env::Default(), device_,
-                                     TF_GRAPH_DEF_VERSION, lib_def_, opts);
+    lib_.reset(NewFunctionLibraryRuntime(nullptr, Env::Default(), device_.get(),
+                                         TF_GRAPH_DEF_VERSION, lib_def_.get(),
+                                         opts));
+    fdef_lib_ = lib_def_->ToProto();
   }
 
-  Status Run(const string& name, InstantiateAttrValueSlice attrs,
-             const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
+  Status Run(const string& name, Attrs attrs, const std::vector<Tensor>& args,
+             std::vector<Tensor*> rets) {
     FunctionLibraryRuntime::Handle handle;
     Status status = lib_->Instantiate(name, attrs, &handle);
     if (!status.ok()) {
@@ -190,7 +205,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     return Status::OK();
   }
 
-  Graph* GetFuncBody(const string& name, InstantiateAttrValueSlice attrs) {
+  std::unique_ptr<Graph> GetFuncBody(const string& name, Attrs attrs) {
     FunctionLibraryRuntime::Handle handle;
     Status status = lib_->Instantiate(name, attrs, &handle);
     if (!status.ok()) {
@@ -199,12 +214,12 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     }
     const FunctionBody* fbody = lib_->GetFunctionBody(handle);
     CHECK_NOTNULL(fbody);
-    Graph* ret = new Graph(lib_def_);
-    CopyGraph(*fbody->graph, ret);
+    std::unique_ptr<Graph> ret(new Graph(lib_def_.get()));
+    CopyGraph(*fbody->graph, ret.get());
     return ret;
   }
 
-  Graph* GetGradBody(const string& func, InstantiateAttrValueSlice attrs) {
+  std::unique_ptr<Graph> GetGradBody(const string& func, Attrs attrs) {
     FunctionLibraryRuntime::Handle handle;
     Status status = lib_->Instantiate(func, attrs, &handle);
     if (!status.ok()) {
@@ -213,17 +228,17 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     }
     const FunctionBody* fbody = lib_->GetFunctionBody(handle);
     CHECK_NOTNULL(fbody);
-    FunctionBody* gbody = SymbolicGradient(*fbody);
+    std::unique_ptr<FunctionBody> gbody(SymbolicGradient(*fbody));
     CHECK_NOTNULL(gbody);
-    Graph* ret = new Graph(lib_def_);
-    CopyGraph(*gbody->graph, ret);
-    delete gbody;
+    std::unique_ptr<Graph> ret(new Graph(lib_def_.get()));
+    CopyGraph(*gbody->graph, ret.get());
     return ret;
   }
 
-  Device* device_ = nullptr;
-  FunctionLibraryDefinition* lib_def_ = nullptr;
-  FunctionLibraryRuntime* lib_ = nullptr;
+  std::unique_ptr<Device> device_;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<FunctionLibraryRuntime> lib_;
+  FunctionDefLibrary fdef_lib_;
 };
 
 TEST_F(FunctionLibraryRuntimeTest, IsStateful) {
@@ -254,113 +269,258 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesN) {
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({16, 32, 48, 64}));
 }
 
+// Adds a function call to 'scope.
+// TODO(phawkins): replace with C++ API for calling functions, when that exists.
+Output Call(Scope* scope, const string& op_name, const string& fn_name,
+            gtl::ArraySlice<Input> inputs) {
+  NodeDef def;
+  NodeDefBuilder builder(op_name, fn_name, scope->graph()->op_registry());
+  for (const Input& input : inputs) {
+    builder.Input(input.node()->name(), input.index(),
+                  input.node()->output_type(input.index()));
+  }
+  TF_CHECK_OK(builder.Finalize(&def));
+  Status status;
+  Node* n = scope->graph()->AddNode(def, &status);
+  TF_CHECK_OK(status);
+  for (int i = 0; i < inputs.size(); ++i) {
+    scope->graph()->AddEdge(inputs[i].node(), inputs[i].index(), n, i);
+  }
+  return Output(n);
+}
+
 TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  Graph* g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
+  std::unique_ptr<Graph> g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
   ASSERT_TRUE(g != nullptr);
-  const char* e0 = R"P(
-(n2:float) -> (n4:float) {
-  n3 = XTimesFour[T=float](n2)
-  n4 = XTimesFour[T=float](n3)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
-
-  ExpandInlineFunctions(lib_, g);
-  const char* e1 = R"P(
-(n2:float) -> (n17:float) {
-  n10 = Identity[T=float](n2)
-  n7 = XTimesTwo[T=float](n10)
-  n8 = XTimesTwo[T=float](n7)
-  n11 = Identity[T=float](n8)
-  n16 = Identity[T=float](n11)
-  n13 = XTimesTwo[T=float](n16)
-  n14 = XTimesTwo[T=float](n13)
-  n17 = Identity[T=float](n14)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g));
-
-  ExpandInlineFunctions(lib_, g);
-  const char* e2 = R"P(
-(n2:float) -> (n17:float) {
-  n18 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n25 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n32 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n39 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n19 = Cast[DstT=float, SrcT=int64](n18)
-  n26 = Cast[DstT=float, SrcT=int64](n25)
-  n33 = Cast[DstT=float, SrcT=int64](n32)
-  n40 = Cast[DstT=float, SrcT=int64](n39)
-  n10 = Identity[T=float](n2)
-  n23 = Identity[T=float](n10)
-  n21 = Mul[T=float](n23, n19)
-  n24 = Identity[T=float](n21)
-  n30 = Identity[T=float](n24)
-  n28 = Mul[T=float](n30, n26)
-  n31 = Identity[T=float](n28)
-  n11 = Identity[T=float](n31)
-  n16 = Identity[T=float](n11)
-  n37 = Identity[T=float](n16)
-  n35 = Mul[T=float](n37, n33)
-  n38 = Identity[T=float](n35)
-  n44 = Identity[T=float](n38)
-  n42 = Mul[T=float](n44, n40)
-  n45 = Identity[T=float](n42)
-  n17 = Identity[T=float](n45)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g));
+
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto arg = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto a = Call(&s, "x4", "XTimesFour", {arg});
+    auto b = Call(&s, "y", "XTimesFour", {a});
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), b, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto x4_x2 = Call(&s, "x4/x2", "XTimesTwo", {func0});
+    auto x4_y = Call(&s, "x4/y", "XTimesTwo", {x4_x2});
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), x4_y);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1);
+    auto y_x2 = Call(&s, "y/x2", "XTimesTwo", {func2});
+    auto y_y = Call(&s, "y/y", "XTimesTwo", {y_x2});
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), y_y);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  GraphDef e2;
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_two = ops::Const<int64>(s.WithOpName("x4/x2/two"), 2LL);
+    auto x4_y_two = ops::Const<int64>(s.WithOpName("x4/y/two"), 2LL);
+    auto y_x2_two = ops::Const<int64>(s.WithOpName("y/x2/two"), 2LL);
+    auto y_y_two = ops::Const<int64>(s.WithOpName("y/y/two"), 2LL);
+    auto x4_x2_scale =
+        ops::Cast(s.WithOpName("x4/x2/scale"), x4_x2_two, DT_FLOAT);
+    auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT);
+    auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT);
+    auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto func4 = ops::Identity(s.WithOpName("Func/_4"), func0);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), func4, x4_x2_scale);
+    auto func5 = ops::Identity(s.WithOpName("Func/_5"), x4_x2_y);
+    auto func6 = ops::Identity(s.WithOpName("Func/_6"), func5);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), func6, x4_y_scale);
+    auto func7 = ops::Identity(s.WithOpName("Func/_7"), x4_y_y);
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), func7);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1);
+    auto func8 = ops::Identity(s.WithOpName("Func/_8"), func2);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), func8, y_x2_scale);
+    auto func9 = ops::Identity(s.WithOpName("Func/_9"), y_x2_y);
+    auto func10 = ops::Identity(s.WithOpName("Func/_10"), func9);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), func10, y_y_scale);
+    auto func11 = ops::Identity(s.WithOpName("Func/_11"), y_y_y);
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), func11);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&e2));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(e2, actual);
+  }
 
   // No further inlining.
-  ExpandInlineFunctions(lib_, g);
-  EXPECT_EQ(e2, DebugString(g));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(e2, actual);
+  }
 
   // Get rid of redundant Identity nodes.
-  RemoveIdentityNodes(g);
-  const char* e3 = R"P(
-(n2:float) -> (n42:float) {
-  n18 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n25 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n32 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n39 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n19 = Cast[DstT=float, SrcT=int64](n18)
-  n26 = Cast[DstT=float, SrcT=int64](n25)
-  n33 = Cast[DstT=float, SrcT=int64](n32)
-  n40 = Cast[DstT=float, SrcT=int64](n39)
-  n21 = Mul[T=float](n2, n19)
-  n28 = Mul[T=float](n21, n26)
-  n35 = Mul[T=float](n28, n33)
-  n42 = Mul[T=float](n35, n40)
+  RemoveIdentityNodes(g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_two = ops::Const<int64>(s.WithOpName("x4/x2/two"), 2LL);
+    auto x4_y_two = ops::Const<int64>(s.WithOpName("x4/y/two"), 2LL);
+    auto y_x2_two = ops::Const<int64>(s.WithOpName("y/x2/two"), 2LL);
+    auto y_y_two = ops::Const<int64>(s.WithOpName("y/y/two"), 2LL);
+    auto x4_x2_scale =
+        ops::Cast(s.WithOpName("x4/x2/scale"), x4_x2_two, DT_FLOAT);
+    auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT);
+    auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT);
+    auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), x4_x2_y, x4_y_scale);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), x4_y_y, y_x2_scale);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), y_x2_y, y_y_scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y_y_y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
-)P";
-  EXPECT_EQ(e3, DebugString(g));
-  delete g;
+
+// Verifies that control dependencies on the caller are added as control
+// dependencies on any function calls created by inlining.
+TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctionsWithControlDeps) {
+  Init({test::function::XTimesTwo(), test::function::XTimesFour()});
+
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto b = Call(&s, "b", "XTimesFour", {a});
+    s.graph()->AddControlEdge(c.operation.node(), b.node());
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), b, 0);
+    TF_ASSERT_OK(s.ToGraph(g.get()));
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto func0 =
+        ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
+    auto b_x2 = Call(&s, "b/x2", "XTimesTwo", {func1});
+    s.graph()->AddControlEdge(func0.operation.node(), b_x2.node());
+    auto b_y = Call(&s, "b/y", "XTimesTwo", {b_x2});
+    s.graph()->AddControlEdge(func0.operation.node(), b_y.node());
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), b_y);
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_));
+    auto a = ops::_Arg(s.WithOpName("a"), DT_FLOAT, 0);
+    auto c = ops::NoOp(s.WithOpName("c"));
+    auto func0 =
+        ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies({c}));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), a);
+
+    auto func3 =
+        ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies({func0}));
+    auto func4 = ops::Identity(
+        s.WithOpName("Func/_4").WithControlDependencies({func3}), func1);
+    auto b_x2_two = ops::Const(
+        s.WithOpName("b/x2/two").WithControlDependencies({func3}), 2LL);
+    auto b_x2_scale = ops::Cast(s.WithOpName("b/x2/scale"), b_x2_two, DT_FLOAT);
+    auto b_x2_y = ops::Mul(s.WithOpName("b/x2/y"), func4, b_x2_scale);
+    auto func5 = ops::Identity(s.WithOpName("Func/_5"), b_x2_y);
+
+    auto func6 =
+        ops::NoOp(s.WithOpName("Func/_6").WithControlDependencies({func0}));
+    auto func7 = ops::Identity(
+        s.WithOpName("Func/_7").WithControlDependencies({func6}), func5);
+    auto b_y_two = ops::Const(
+        s.WithOpName("b/y/two").WithControlDependencies({func6}), 2LL);
+    auto b_y_scale = ops::Cast(s.WithOpName("b/y/scale"), b_y_two, DT_FLOAT);
+    auto b_y_y = ops::Mul(s.WithOpName("b/y/y"), func7, b_y_scale);
+    auto func8 = ops::Identity(s.WithOpName("Func/_8"), b_y_y);
+
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), func8);
+    auto ret = ops::_Retval(s.WithOpName("b_RetVal"), func2, 0);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  std::unique_ptr<Graph> g(GetFuncBody("XTimes16", {{"T", DT_FLOAT}}));
+  std::unique_ptr<Graph> g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}});
   ASSERT_TRUE(g != nullptr);
-  ExpandInlineFunctions(lib_, g.get());
-  OptimizeGraph(lib_, &g);
-  const char* e0 = R"P(
-(n2:float) -> (n7:float) {
-  n8 = Const[dtype=float, value=Tensor<type: float shape: [] values: 2>]()
-  n4 = Mul[T=float](n2, n8)
-  n5 = Mul[T=float](n4, n8)
-  n6 = Mul[T=float](n5, n8)
-  n7 = Mul[T=float](n6, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  OptimizeGraph(lib_.get(), &g);
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto x4_x2_scale = ops::Const<float>(
+        s.WithOpName("x4/x2/scale/_12__cf__2")
+            .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+        2.0f);
+    auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
+    auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), x4_x2_y, x4_x2_scale);
+    auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), x4_y_y, x4_x2_scale);
+    auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), y_x2_y, x4_x2_scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y_y_y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) {
   auto func = FDH::Create(  // Creates a FunctionDef using NodeDefs
-      // Name
+                            // Name
       "ManySwapsNodeDef",
       // Input
       {"x: float", "y: float"},
@@ -379,9 +539,9 @@ TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) {
       // Return
       {{"o", "g:output"}});
   Init({test::function::Swap(), func});
-  std::unique_ptr<Graph> g(GetFuncBody("ManySwapsNodeDef", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("ManySwapsNodeDef", {});
   ASSERT_TRUE(g != nullptr);
-  OptimizeGraph(lib_, &g);
+  OptimizeGraph(lib_.get(), &g);
   const char* e0 = R"P(
 (n3:float, n2:float) -> (n3:float) {
 }
@@ -412,24 +572,35 @@ TEST_F(FunctionLibraryRuntimeTest, ControlDeps) {
        {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}},
       {{"o", "o:z:0"}});
   Init({test::function::Swap(), func});
-  std::unique_ptr<Graph> g(GetFuncBody("ManySwapsFirst", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("ManySwapsFirst", {});
   ASSERT_TRUE(g != nullptr);
-  OptimizeGraph(lib_, &g);
+  OptimizeGraph(lib_.get(), &g);
 
-  // NOTE: We can remove n8, n9, n10, n11 with a control edge n8->n5.
+  // NOTE: We can remove func0, func1, func2, func9 with a control edge n8->n5.
   // But we don't have a pass doing that.
-  const char* e0 = R"P(
-(n3:float, n2:float) -> (n6:float) {
-  n4 = Mul[T=float](n3, n3)
-  n8 = NoOp() @ n4
-  n9 = Identity[T=float](n3) @ n8
-  n10 = Identity[T=float](n2) @ n8
-  n11 = NoOp() @ n10, n9
-  n5 = Mul[T=float](n2, n2) @ n11
-  n6 = Add[T=float](n4, n5)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto x2 = ops::Mul(s.WithOpName("x2"), x, x);
+    auto func0 = ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies(x2));
+    auto func1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func0}), x);
+    auto func2 = ops::Identity(
+        s.WithOpName("Func/_2").WithControlDependencies({func0}), y);
+    auto func9 = ops::NoOp(s.WithOpName("Func/_9").WithControlDependencies(
+        {func1.output.op(), func2.output.op()}));
+    auto y2 =
+        ops::Mul(s.WithOpName("y2").WithControlDependencies({func9}), y, y);
+    auto o = ops::Add(s.WithOpName("o"), x2, y2);
+    auto ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Error_NotFound) {
@@ -459,13 +630,14 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) {
 
   // Instantiating "XTimesTwo" should fail.
   FunctionLibraryRuntime::Handle handle;
-  HasError(lib_->Instantiate("XTimesTwo", {{"T", DT_FLOAT}}, &handle),
+  HasError(lib_->Instantiate("XTimesTwo", Attrs({{"T", DT_FLOAT}}), &handle),
            "Not found: type attr not found");
 
   // But XTimesFour and XTimes16 instantiation should succeed. Only
   // when they run, they fail because XTimesTwo is bad.
-  TF_CHECK_OK(lib_->Instantiate("XTimesFour", {{"T", DT_FLOAT}}, &handle));
-  TF_CHECK_OK(lib_->Instantiate("XTimes16", {{"T", DT_FLOAT}}, &handle));
+  TF_CHECK_OK(
+      lib_->Instantiate("XTimesFour", Attrs({{"T", DT_FLOAT}}), &handle));
+  TF_CHECK_OK(lib_->Instantiate("XTimes16", Attrs({{"T", DT_FLOAT}}), &handle));
 
   auto x = test::AsTensor<float>({1, 2, 3, 4});
   Tensor y;
@@ -476,84 +648,136 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) {
 TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
-  auto f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}});
-  const char* e0 = R"P(
-(n4:float) -> (n5:float) {
-  n2 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n3 = Cast[DstT=float, SrcT=int64](n2)
-  n5 = Mul[T=float](n4, n3)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(f));
-  delete f;
-  std::unique_ptr<Graph> g(GetGradBody("XTimesTwo", {{"T", DT_FLOAT}}));
-  const char* e1 = R"P(
-(n4:float, n6:float) -> (n7:float) {
-  n2 = Const[dtype=int64, value=Tensor<type: int64 shape: [] values: 2>]()
-  n3 = Cast[DstT=float, SrcT=int64](n2)
-  n5 = Mul[T=float](n4, n3)
-  n7 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Mul[T=float]](n4, n3, n6)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g.get()));
-
-  OptimizeGraph(lib_, &g);
-  const char* e2 = R"P(
-(n2:float, n3:float) -> (n9:float) {
-  n11 = Const[dtype=int32, value=Tensor<type: int32 shape: [0] values: >]()
-  n10 = Const[dtype=float, value=Tensor<type: float shape: [] values: 2>]()
-  n6 = Shape[T=float, out_type=int32](n2)
-  n5 = Mul[T=float](n3, n10)
-  n7 = BroadcastGradientArgs[T=int32](n6, n11)
-  n8 = Sum[T=float, Tidx=int32, keep_dims=false](n5, n7)
-  n9 = Reshape[T=float, Tshape=int32](n8, n6)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g.get()));
+  std::unique_ptr<Graph> f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto two = ops::Const(s.WithOpName("two"), 2LL);
+    auto scale = ops::Cast(s.WithOpName("scale"), two, DT_FLOAT);
+    auto y = ops::Mul(s.WithOpName("y"), x, scale);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    f->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  std::unique_ptr<Graph> g = GetGradBody("XTimesTwo", {{"T", DT_FLOAT}});
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
+    auto two = ops::Const(s.WithOpName("two"), 2LL);
+    auto scale = ops::Cast(s.WithOpName("scale"), two, DT_FLOAT);
+    auto y = ops::Mul(s.WithOpName("y"), x, scale);
+    NameAttrList fn;
+    fn.set_name("Mul");
+    (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
+    auto func1 = ops::SymbolicGradient(
+        s.WithOpName("Func/_1"), std::initializer_list<Input>{x, scale, func0},
+        {DT_FLOAT, DT_FLOAT}, fn);
+    auto func2 = ops::_Retval(s.WithOpName("Func/_2"), func1[0], 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  OptimizeGraph(lib_.get(), &g);
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
+    auto scale =
+        ops::Const(s.WithOpName("scale/_5__cf__6")
+                       .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+                   2.0f);
+    auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
+    auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
+    auto const0 =
+        ops::Const(s.WithOpName("Func/_1/sy/_6__cf__7")
+                       .WithDevice("/job:localhost/replica:0/task:0/cpu:0"),
+                   0, {0});
+    auto func1_rx = ops::internal::BroadcastGradientArgs(
+        s.WithOpName("Func/_1/rx"), func1_sx, const0);
+    auto func1_sum_gx =
+        ops::Sum(s.WithOpName("Func/_1/sum_gx"), func1_gx, func1_rx.r0);
+    auto func1_dx =
+        ops::Reshape(s.WithOpName("Func/_1/dx"), func1_sum_gx, func1_sx);
+    auto func2 = ops::_Retval(s.WithOpName("Func/_2"), func1_dx, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_Add) {
   Init({});
   auto T = DT_FLOAT;
-  auto g = GetFuncBody("SymbolicGradient",
-                       {{"f", FDH::FunctionRef("Add", {{"T", T}})}});
-  const char* e0 = R"P(
-(n7:float, n5:float, n2:float) -> (n14:float, n11:float) {
-  n3 = Identity[T=float](n2)
-  n4 = Identity[T=float](n2)
-  n6 = Shape[T=float, out_type=int32](n5)
-  n8 = Shape[T=float, out_type=int32](n7)
-  n9 = BroadcastGradientArgs[T=int32](n8, n6)
-  n10 = Sum[T=float, Tidx=int32, keep_dims=false](n3, n9:1)
-  n13 = Sum[T=float, Tidx=int32, keep_dims=false](n4, n9)
-  n11 = Reshape[T=float, Tshape=int32](n10, n6)
-  n14 = Reshape[T=float, Tshape=int32](n13, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
-  delete g;
+  std::unique_ptr<Graph> g = GetFuncBody(
+      "SymbolicGradient", {{"f", FDH::FunctionRef("Add", {{"T", T}})}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::_Arg(s.WithOpName("dz"), DT_FLOAT, 2);
+    auto gx = ops::Identity(s.WithOpName("gx"), dz);
+    auto gy = ops::Identity(s.WithOpName("gy"), dz);
+    auto sx = ops::Shape(s.WithOpName("sx"), x);
+    auto sy = ops::Shape(s.WithOpName("sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(s.WithOpName("rx"), sx, sy);
+    auto sum_gx = ops::Sum(s.WithOpName("sum_gx"), gx, rx.r0);
+    auto sum_gy = ops::Sum(s.WithOpName("sum_gy"), gy, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("dy"), sum_gy, sy);
+    auto dx_ret = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_ret = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_Mul) {
   Init({});
   auto T = DT_FLOAT;
-  auto g = GetFuncBody("SymbolicGradient",
-                       {{"f", FDH::FunctionRef("Mul", {{"T", T}})}});
-  const char* e0 = R"P(
-(n6:float, n3:float, n2:float) -> (n14:float, n11:float) {
-  n4 = Mul[T=float](n2, n3)
-  n5 = Shape[T=float, out_type=int32](n3)
-  n7 = Mul[T=float](n6, n2)
-  n8 = Shape[T=float, out_type=int32](n6)
-  n9 = BroadcastGradientArgs[T=int32](n8, n5)
-  n10 = Sum[T=float, Tidx=int32, keep_dims=false](n7, n9:1)
-  n13 = Sum[T=float, Tidx=int32, keep_dims=false](n4, n9)
-  n11 = Reshape[T=float, Tshape=int32](n10, n5)
-  n14 = Reshape[T=float, Tshape=int32](n13, n8)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g));
-  delete g;
+  std::unique_ptr<Graph> g = GetFuncBody(
+      "SymbolicGradient", {{"f", FDH::FunctionRef("Mul", {{"T", T}})}});
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::_Arg(s.WithOpName("dz"), DT_FLOAT, 2);
+    auto gx = ops::Mul(s.WithOpName("gx"), dz, y);
+    auto sx = ops::Shape(s.WithOpName("sx"), x);
+    auto gy = ops::Mul(s.WithOpName("gy"), x, dz);
+    auto sy = ops::Shape(s.WithOpName("sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(s.WithOpName("rx"), sx, sy);
+    auto sum_gx = ops::Sum(s.WithOpName("sum_gx"), gx, rx.r0);
+    auto sum_gy = ops::Sum(s.WithOpName("sum_gy"), gy, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("dy"), sum_gy, sy);
+    auto dx_ret = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_ret = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
@@ -570,107 +794,169 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
                           });
 
   // TestGrad = Test'(x, y)
-  auto grad =
-      FDH::Define("TestGrad", {"x:float", "y:float"}, {"dx:float", "dy:float"},
-                  {}, {FDH::Const<float>("dz", 1),
-                       {{"grad0", "grad1"},
-                        "SymbolicGradient",
-                        {"x", "y", "dz"},
-                        {
-                            {"f", FDH::FunctionRef("Test")},
-                            {"Tin", DataTypeSlice{T, T, T}},
-                            {"Tout", DataTypeSlice{T, T}},
-                        }},
-                       {{"dx"}, "Identity", {"grad0"}, {{"T", DT_FLOAT}}},
-                       {{"dy"}, "Identity", {"grad1"}, {{"T", DT_FLOAT}}}});
+  auto grad = FDH::Define("TestGrad", {"x:float", "y:float"},
+                          {"dx:float", "dy:float"}, {},
+                          {FDH::Const<float>("dz", 1),
+                           {{"grad0", "grad1"},
+                            "SymbolicGradient",
+                            {"x", "y", "dz"},
+                            {
+                                {"f", FDH::FunctionRef("Test")},
+                                {"Tin", DataTypeSlice{T, T, T}},
+                                {"Tout", DataTypeSlice{T, T}},
+                            }},
+                           {{"dx"}, "Identity", {"grad0"}, {{"T", DT_FLOAT}}},
+                           {{"dy"}, "Identity", {"grad1"}, {{"T", DT_FLOAT}}}});
 
   Init({test, grad});
 
-  std::unique_ptr<Graph> g(GetFuncBody("TestGrad", {}));
+  std::unique_ptr<Graph> g = GetFuncBody("TestGrad", {});
   ASSERT_TRUE(g != nullptr);
-  const char* e0 = R"P(
-(n4:float, n3:float) -> (n8:float, n6:float) {
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n5 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Test](n4, n3, n2)
-  n6 = Identity[T=float](n5:1)
-  n8 = Identity[T=float](n5)
-}
-)P";
-  EXPECT_EQ(e0, DebugString(g.get()));
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    NameAttrList fn;
+    fn.set_name("Test");
+    auto grad0 = ops::SymbolicGradient(s.WithOpName("grad0"),
+                                       std::initializer_list<Input>{x, y, dz},
+                                       {DT_FLOAT, DT_FLOAT}, fn);
+    auto dx = ops::Identity(s.WithOpName("dx"), grad0[0]);
+    auto dy = ops::Identity(s.WithOpName("dy"), grad0[1]);
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 
-  ExpandInlineFunctions(lib_, g.get());
-  const char* e1 = R"P(
-(n4:float, n3:float) -> (n8:float, n6:float) {
-  n10 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n11 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n26 = Identity[T=float](n2)
-  n25 = Identity[T=float](n3)
-  n24 = Identity[T=float](n4)
-  n14 = Add[T=float](n24, n25)
-  n15 = Rank[T=float](n14)
-  n16 = Range[Tidx=int32](n11, n15, n10)
-  n20 = ZerosLike[T=int32](n15)
-  n17 = Sum[T=float, Tidx=int32, keep_dims=false](n14, n16)
-  n19 = SymbolicGradient[Tin={float, int32, float}, Tout={float, int32}, f=Sum[T=float, Tidx=int32, keep_dims=false]](n14, n16, n26)
-  n21 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Add[T=float]](n24, n25, n19)
-  n28 = Identity[T=float](n21:1)
-  n27 = Identity[T=float](n21)
-  n6 = Identity[T=float](n28)
-  n8 = Identity[T=float](n27)
-}
-)P";
-  EXPECT_EQ(e1, DebugString(g.get()));
-
-  OptimizeGraph(lib_, &g);
-  const char* e2 = R"P(
-(n4:float, n3:float) -> (n25:float, n23:float) {
-  n2 = Const[dtype=float, value=Tensor<type: float shape: [] values: 1>]()
-  n8 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n7 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n19 = Shape[T=float, out_type=int32](n3)
-  n9 = Add[T=float](n4, n3)
-  n20 = Shape[T=float, out_type=int32](n4)
-  n10 = Rank[T=float](n9)
-  n14 = Shape[T=float, out_type=int32](n9)
-  n21 = BroadcastGradientArgs[T=int32](n20, n19)
-  n11 = Range[Tidx=int32](n8, n10, n7)
-  n12 = Shape[T=int32, out_type=int32](n11)
-  n13 = Fill[T=int32](n12, n7)
-  n15 = DynamicStitch[N=2, T=int32](n11, n11, n14, n13)
-  n16 = Reshape[T=float, Tshape=int32](n2, n15)
-  n17 = Div[T=int32](n14, n15)
-  n18 = Tile[T=float, Tmultiples=int32](n16, n17)
-  n24 = Sum[T=float, Tidx=int32, keep_dims=false](n18, n21)
-  n22 = Sum[T=float, Tidx=int32, keep_dims=false](n18, n21:1)
-  n25 = Reshape[T=float, Tshape=int32](n24, n20)
-  n23 = Reshape[T=float, Tshape=int32](n22, n19)
-}
-)P";
-  EXPECT_EQ(e2, DebugString(g.get()));
+  ExpandInlineFunctions(lib_.get(), g.get());
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    auto grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0);
+    auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1);
+    auto func0 = ops::Identity(s.WithOpName("Func/_0"), x);
+    auto func1 = ops::Identity(s.WithOpName("Func/_1"), y);
+    auto func2 = ops::Identity(s.WithOpName("Func/_2"), dz);
+    auto grad0_z = ops::Add(s.WithOpName("grad0/z"), func0, func1);
+    auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z);
+    auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero,
+                                    grad0_r, grad0_one);
+    auto grad0_l = ops::Sum(s.WithOpName("grad0/l"), grad0_z, grad0_indices);
+
+    NameAttrList sum;
+    sum.set_name("Sum");
+    (*sum.mutable_attr())["T"].set_type(DT_FLOAT);
+    (*sum.mutable_attr())["Tidx"].set_type(DT_INT32);
+    (*sum.mutable_attr())["keep_dims"].set_b(false);
+    auto grad0_func1 = ops::SymbolicGradient(
+        s.WithOpName("grad0/Func/_1"),
+        std::initializer_list<Input>{grad0_z, grad0_indices, func2},
+        {DT_FLOAT, DT_INT32}, sum);
+
+    auto grad0_func2 = ops::ZerosLike(s.WithOpName("grad0/Func/_2"), grad0_r);
+
+    NameAttrList add;
+    add.set_name("Add");
+    (*add.mutable_attr())["T"].set_type(DT_FLOAT);
+    auto grad0_func3 = ops::SymbolicGradient(
+        s.WithOpName("grad0/Func/_3"),
+        std::initializer_list<Input>{func0, func1, grad0_func1[0]},
+        {DT_FLOAT, DT_FLOAT}, add);
+
+    auto func3 = ops::Identity(s.WithOpName("Func/_3"), grad0_func3[0]);
+    auto func4 = ops::Identity(s.WithOpName("Func/_4"), grad0_func3[1]);
+    auto dx = ops::Identity(s.WithOpName("dx"), func3);
+    auto dy = ops::Identity(s.WithOpName("dy"), func4);
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
+
+  OptimizeGraph(lib_.get(), &g);
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto dz = ops::Const(s.WithOpName("dz"), 1.0f);
+    auto grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0);
+    auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1);
+    auto grad0_z = ops::Add(s.WithOpName("grad0/z"), x, y);
+    auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z);
+    auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero,
+                                    grad0_r, grad0_one);
+    auto i_shape =
+        ops::Shape(s.WithOpName("grad0/Func/_1/i_shape"), grad0_indices);
+    auto stitch_val = ops::Fill(s.WithOpName("grad0/Func/_1/stitch_val1"),
+                                i_shape, grad0_one);
+    auto x_shape = ops::Shape(s.WithOpName("grad0/Func/_1/x_shape"), grad0_z);
+    auto y_shape = ops::DynamicStitch(
+        s.WithOpName("grad0/Func/_1/y_shape"),
+        std::initializer_list<Input>{grad0_indices, grad0_indices},
+        std::initializer_list<Input>{x_shape, stitch_val});
+    auto dy_reshaped =
+        ops::Reshape(s.WithOpName("grad0/Func/_1/dy_reshaped"), dz, y_shape);
+    auto tile_scaling =
+        ops::Div(s.WithOpName("grad0/Func/_1/tile_scaling"), x_shape, y_shape);
+    auto func1_dx =
+        ops::Tile(s.WithOpName("grad0/Func/_1/dx"), dy_reshaped, tile_scaling);
+
+    auto sx = ops::Shape(s.WithOpName("grad0/Func/_3/sx"), x);
+    auto sy = ops::Shape(s.WithOpName("grad0/Func/_3/sy"), y);
+    auto rx = ops::internal::BroadcastGradientArgs(
+        s.WithOpName("grad0/Func/_3/rx"), sx, sy);
+    auto sum_gx =
+        ops::Sum(s.WithOpName("grad0/Func/_3/sum_gx"), func1_dx, rx.r0);
+    auto sum_gy =
+        ops::Sum(s.WithOpName("grad0/Func/_3/sum_gy"), func1_dx, rx.r1);
+    auto dx = ops::Reshape(s.WithOpName("grad0/Func/_3/dx"), sum_gx, sx);
+    auto dy = ops::Reshape(s.WithOpName("grad0/Func/_3/dy"), sum_gy, sy);
+
+    auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1);
+
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+
+    GraphDef actual;
+    g->ToGraphDef(&actual);
+    TF_EXPECT_GRAPH_EQ(expected, actual);
+  }
 }
 
 namespace {
 
 bool DoNothing(Graph* g) { return false; }
 
-string Optimize(std::function<bool(Graph* g)> pass, const FunctionDef& fdef) {
+GraphDef Optimize(const std::function<bool(Graph* g)>& pass,
+                  const FunctionDef& fdef) {
   InstantiationResult result;
-  InstantiateAttrValueMap empty;
-  TF_CHECK_OK(InstantiateFunction(fdef, empty, GetOpSig, &result));
-  Graph* g = new Graph(OpRegistry::Global());
+  TF_CHECK_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
   GraphConstructorOptions opts;
   opts.allow_internal_ops = true;
   opts.expect_device_spec = false;
-  TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g));
-  pass(g);
-  Graph* g1 = new Graph(OpRegistry::Global());
-  CopyGraph(*g, g1);
-  delete g;
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g.get()));
+  pass(g.get());
+  std::unique_ptr<Graph> g1(new Graph(OpRegistry::Global()));
+  CopyGraph(*g, g1.get());
+  g = nullptr;
   GraphDef gdef;
   g1->ToGraphDef(&gdef);
-  delete g1;
-  return DebugString(gdef);
+  return gdef;
 }
 
 }  // end namespace
@@ -699,21 +985,25 @@ TEST(OptimizationTest, RemoveDeadNodes) {
        {{"keep_me"}, "RandomUniform", {"o"}, {{"T", T}, {"dtype", DT_FLOAT}}},
        // y = Add<T>(a, o)
        {{"y"}, "Add", {"a", "o"}, {{"T", T}}}});
-  const char* e0 = R"S(
-(n0:int32) -> (n7:int32) {
-  n2 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n6 = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](n2)
-  n3 = Add[T=int32](n2, n2)
-  n1 = Square[T=int32](n0)
-  n7 = Add[T=int32](n1, n2)
-  n4 = Mul[T=int32](n1, n3)
-  n5 = Mul[T=int32](n3, n4)
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
+
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto keep_me = ops::RandomUniform(s.WithOpName("keep_me"), {o}, DT_FLOAT);
+    auto x1 = ops::Add(s.WithOpName("x1"), o, o);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto x2 = ops::Mul(s.WithOpName("x2"), a, x1);
+    auto x3 = ops::Mul(s.WithOpName("x3"), x1, x2);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
 
   // TODO(zhifengc): Comes up another test case.
-  EXPECT_EQ(Optimize(::tensorflow::RemoveDeadNodes, func), e0);
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(::tensorflow::RemoveDeadNodes, func));
 }
 
 TEST(OptimizationTest, RemoveIdentityNodes_Ref) {
@@ -734,23 +1024,19 @@ TEST(OptimizationTest, RemoveIdentityNodes_Ref) {
        {{"v_read"}, "Identity", {"v"}, {{"T", T}}},
        // returns v + v
        {{"ret"}, "Add", {"v_read", "v_read"}, {{"T", T}}}});
-  const char* e0 = R"S(
-() -> (n2:float) {
-  n0 = VariableV2[container="", dtype=float, shape=[], shared_name=""]()
-  n1 = Identity[T=float](n0)
-  n2 = Add[T=float](n1, n1)
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
-
-  const char* e1 = R"S(
-() -> (n2:float) {
-  n0 = VariableV2[container="", dtype=float, shape=[], shared_name=""]()
-  n1 = Identity[T=float](n0)
-  n2 = Add[T=float](n1, n1)
-}
-)S";
-  EXPECT_EQ(Optimize(::tensorflow::RemoveIdentityNodes, func), e1);
+
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto v = ops::Variable(s.WithOpName("v"), PartialTensorShape({}), DT_FLOAT);
+    auto v_read = ops::Identity(s.WithOpName("v_read"), v);
+    auto ret = ops::Add(s.WithOpName("ret"), v_read, v_read);
+    auto ret_retval = ops::_Retval(s.WithOpName("ret_RetVal"), ret, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  TF_EXPECT_GRAPH_EQ(expected,
+                     Optimize(::tensorflow::RemoveIdentityNodes, func));
 }
 
 TEST(OptimizationTest, RemoveIdentityNodes) {
@@ -781,28 +1067,38 @@ TEST(OptimizationTest, RemoveIdentityNodes) {
         {"x3"}},
        // y = Add<T>(a, o)
        {{"y"}, "Add", {"a", "o"}, {{"T", T}}}});
-  const char* e0 = R"S(
-(n0:int32) -> (n7:int32) {
-  n2 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n1 = Square[T=int32](n0)
-  n7 = Add[T=int32](n1, n2)
-  n3 = Identity[T=int32](n1)
-  n4 = Identity[T=int32](n3)
-  n5 = Identity[T=int32](n4)
-  n6 = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](n2) @ n5
-}
-)S";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
-
-  const char* e1 = R"S(
-(n0:int32) -> (n7:int32) {
-  n2 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 1>]()
-  n1 = Square[T=int32](n0)
-  n7 = Add[T=int32](n1, n2)
-  n6 = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](n2) @ n1
-}
-)S";
-  EXPECT_EQ(Optimize(::tensorflow::RemoveIdentityNodes, func), e1);
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto x1 = ops::Identity(s.WithOpName("x1"), a);
+    auto x2 = ops::Identity(s.WithOpName("x2"), x1);
+    auto x3 = ops::Identity(s.WithOpName("x3"), x2);
+    auto keep_me = ops::RandomUniform(
+        s.WithOpName("keep_me").WithControlDependencies(x3), {o}, DT_FLOAT);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
+
+  {
+    Scope s = Scope::NewRootScope();
+    auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0);
+    auto o = ops::Const(s.WithOpName("o"), 1);
+    auto a = ops::Square(s.WithOpName("a"), x);
+    auto y = ops::Add(s.WithOpName("y"), a, o);
+    auto keep_me = ops::RandomUniform(
+        s.WithOpName("keep_me").WithControlDependencies(a), {o}, DT_FLOAT);
+    auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected,
+                       Optimize(::tensorflow::RemoveIdentityNodes, func));
+  }
 }
 
 TEST(OptimizationTest, RemoveListArrayConverter) {
@@ -839,49 +1135,63 @@ TEST(OptimizationTest, RemoveListArrayConverter) {
       // Return values
       {{"o", "o:sum"}});
 
-  const char* e0 = R"P(
-(n0:float) -> (n7:float) {
-  n1 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n2 = Split[T=float, num_split=4](n1, n0)
-  n3 = _ArrayToList[N=4, T=float, out_types={float, float, float, float}](n2, n2:1, n2:2, n2:3)
-  n5 = Mul[T=float](n3:2, n3:3)
-  n4 = Mul[T=float](n3, n3:1)
-  n6 = _ListToArray[N=2, T=float, Tin={float, float}](n4, n5)
-  n7 = AddN[N=2, T=float](n6, n6:1)
-}
-)P";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
-
-  const char* e1 = R"P(
-(n0:float) -> (n7:float) {
-  n1 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n2 = Split[T=float, num_split=4](n1, n0)
-  n5 = Mul[T=float](Func/_2, Func/_3)
-  n4 = Mul[T=float](Func/_0, Func/_1)
-  n7 = AddN[N=2, T=float](Func/_4, Func/_5)
-  Func/_0 = Identity[T=float](n2)
-  Func/_1 = Identity[T=float](n2:1)
-  Func/_2 = Identity[T=float](n2:2)
-  Func/_3 = Identity[T=float](n2:3)
-  Func/_4 = Identity[T=float](n4)
-  Func/_5 = Identity[T=float](n5)
-}
-)P";
-  EXPECT_EQ(Optimize(RemoveListArrayConverter, func), e1);
-
-  const char* e2 = R"P(
-(n0:float) -> (n7:float) {
-  n1 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n2 = Split[T=float, num_split=4](n1, n0)
-  n5 = Mul[T=float](n2:2, n2:3)
-  n4 = Mul[T=float](n2, n2:1)
-  n7 = AddN[N=2, T=float](n4, n5)
-}
-)P";
-  auto remove_listarray_and_identity = [](Graph* g) {
-    return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
-  };
-  EXPECT_EQ(Optimize(remove_listarray_and_identity, func), e2);
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto a = ops::_ArrayToList(scope.WithOpName("a"), s.output,
+                               {DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT});
+    auto r = ops::Mul(scope.WithOpName("r"), a[2], a[3]);
+    auto l = ops::Mul(scope.WithOpName("l"), a[0], a[1]);
+    auto x = ops::_ListToArray(scope.WithOpName("x"),
+                               std::initializer_list<Input>{l, r}, DT_FLOAT, 2);
+    auto o = ops::AddN(scope.WithOpName("o"), x.output);
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
+
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto func_0 = ops::Identity(scope.WithOpName("Func/_0"), s[0]);
+    auto func_1 = ops::Identity(scope.WithOpName("Func/_1"), s[1]);
+    auto func_2 = ops::Identity(scope.WithOpName("Func/_2"), s[2]);
+    auto func_3 = ops::Identity(scope.WithOpName("Func/_3"), s[3]);
+    auto r = ops::Mul(scope.WithOpName("r"), func_2, func_3);
+    auto l = ops::Mul(scope.WithOpName("l"), func_0, func_1);
+    auto func_4 = ops::Identity(scope.WithOpName("Func/_4"), l);
+    auto func_5 = ops::Identity(scope.WithOpName("Func/_5"), r);
+    auto o = ops::AddN(scope.WithOpName("o"),
+                       std::initializer_list<Input>{func_4, func_5});
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(RemoveListArrayConverter, func));
+  }
+
+  {
+    Scope scope = Scope::NewRootScope();
+    auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0);
+    auto zero = ops::Const(scope.WithOpName("zero"), 0);
+    auto s = ops::Split(scope.WithOpName("s"), zero, i, 4);
+    auto r = ops::Mul(scope.WithOpName("r"), s[2], s[3]);
+    auto l = ops::Mul(scope.WithOpName("l"), s[0], s[1]);
+    auto o =
+        ops::AddN(scope.WithOpName("o"), std::initializer_list<Input>{l, r});
+    auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+
+    auto remove_listarray_and_identity = [](Graph* g) {
+      return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
+    };
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(remove_listarray_and_identity, func));
+  }
 }
 
 TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) {
@@ -910,33 +1220,48 @@ TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) {
         {"x"}}},
       {{"o", "o:sum"}});
 
-  const char* e0 = R"P(
-(n0:float) -> (n3:float) {
-  n1 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n2 = _ListToArray[N=2, T=float, Tin={float, float}](n0, n0) @ n1
-  n3 = AddN[N=2, T=float](n2, n2:1) @ n2
-}
-)P";
-  EXPECT_EQ(Optimize(DoNothing, func), e0);
-
-  const char* e1 = R"P(
-(n0:float) -> (n3:float) {
-  n1 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n3 = AddN[N=2, T=float](Func/_0, Func/_1) @ Func/_3
-  Func/_0 = Identity[T=float](n0) @ Func/_2
-  Func/_1 = Identity[T=float](n0) @ Func/_2
-  Func/_2 = NoOp() @ n1
-  Func/_3 = NoOp() @ Func/_0, Func/_1
-}
-)P";
-  EXPECT_EQ(Optimize(RemoveListArrayConverter, func), e1);
+  {
+    Scope s = Scope::NewRootScope();
+    auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0);
+    auto dummy = ops::Const(s.WithOpName("dummy"), 0);
+    auto x = ops::_ListToArray(s.WithOpName("x").WithControlDependencies(dummy),
+                               std::initializer_list<Input>{i, i}, DT_FLOAT, 2);
+    auto o =
+        ops::AddN(s.WithOpName("o").WithControlDependencies({x.output[0].op()}),
+                  x.output);
+    auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    GraphDef expected;
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+    TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func));
+  }
+
+  GraphDef expected;
+  {
+    Scope s = Scope::NewRootScope();
+    auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0);
+    auto dummy = ops::Const(s.WithOpName("dummy"), 0);
+    auto func_2 =
+        ops::NoOp(s.WithOpName("Func/_2").WithControlDependencies(dummy));
+    auto func_0 = ops::Identity(
+        s.WithOpName("Func/_0").WithControlDependencies({func_2}), i);
+    auto func_1 = ops::Identity(
+        s.WithOpName("Func/_1").WithControlDependencies({func_2}), i);
+    auto func_3 = ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies(
+        {func_0.output.op(), func_1.output.op()}));
+    auto o = ops::AddN(s.WithOpName("o").WithControlDependencies({func_3}),
+                       std::initializer_list<Input>{func_0, func_1});
+    auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0);
+    TF_ASSERT_OK(s.ToGraphDef(&expected));
+  }
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(RemoveListArrayConverter, func));
 
   auto remove_listarray_and_identity = [](Graph* g) {
     return RemoveListArrayConverter(g) && RemoveIdentityNodes(g);
   };
   // NOTE: We are not removing Identity nodes with any control
   // dependencies yet.
-  EXPECT_EQ(Optimize(remove_listarray_and_identity, func), e1);
+  TF_EXPECT_GRAPH_EQ(expected, Optimize(remove_listarray_and_identity, func));
 }
 
+}  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index a863b69cef0579fa3277f1291a80dcd578217271..e2ad18f33bdc452279aaec48898744b9bc151ceb 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/types.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -74,7 +75,8 @@ namespace tensorflow {
 
 class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
  public:
-  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
+  EigenCudaStreamDevice()
+      : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
     Eigen::initializeDeviceProp();
   }
   ~EigenCudaStreamDevice() {}
@@ -84,6 +86,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
       operation_ = context->op_kernel().name() + "/EigenAllocator";
       step_id_ = context->step_id();
     }
+    context_ = context;
     scratch_ = scratch;
     semaphore_ =
         reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
@@ -100,8 +103,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
   void* allocate(size_t num_bytes) const override {
     void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
     if (ret == nullptr) {
-      LOG(FATAL) << "EigenAllocator for GPU ran out of memory when allocating "
-                 << num_bytes << ". See error logs for more detailed info.";
+      if (context_) {
+        context_->SetStatus(errors::ResourceExhausted(
+            strings::StrCat("Ran out of GPU memory when allocating ", num_bytes,
+                            " bytes for ", operation_)));
+      } else {
+        LOG(FATAL)
+            << "EigenAllocator for GPU ran out of memory when allocating "
+            << num_bytes << ". See error logs for more detailed info.";
+      }
     }
     if (LogMemory::IsEnabled()) {
       LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
@@ -159,6 +169,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
   ::tensorflow::Allocator* allocator_;  // Not owned.
   mutable char* scratch_;
   mutable unsigned int* semaphore_;
+  OpKernelContext* context_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
 };
@@ -168,10 +179,9 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                              int gpu_id, const string& physical_device_desc,
                              Allocator* gpu_allocator, Allocator* cpu_allocator,
                              bool sync_every_op, int32 max_streams)
-    : LocalDevice(options,
-                  Device::BuildDeviceAttributes(name, DEVICE_GPU, memory_limit,
-                                                locality, physical_device_desc),
-                  gpu_allocator),
+    : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
+                                                         memory_limit, locality,
+                                                         physical_device_desc)),
       gpu_allocator_(gpu_allocator),
       cpu_allocator_(cpu_allocator),
       gpu_id_(gpu_id),
@@ -274,14 +284,14 @@ Status BaseGPUDevice::FillContextMap(const Graph* graph,
                                      DeviceContextMap* device_context_map) {
   VLOG(2) << "FillContextMap";
 
-  const auto num_streams = streams_.size();
+  const size_t num_streams = streams_.size();
   // Special case for single stream.
   if (num_streams == 1) {
     return Status::OK();
   }
   const int64 before = Env::Default()->NowMicros();
   gpu_stream_util::AssignStreamsOpts opts;
-  opts.max_streams = num_streams;
+  opts.max_streams = static_cast<int32>(num_streams);
   std::unordered_map<int, int> node_to_stream_id;
   TF_RETURN_IF_ERROR(
       gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
@@ -426,8 +436,10 @@ void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
           << op_kernel->def().op() << " on GPU" << gpu_id_ << " stream["
           << stream_id << "]";
 
-  port::Tracing::TraceMe activity(
-      strings::StrCat(op_kernel->name(), ":", op_kernel->type_string()));
+  // When TraceMe profiling is off (which is the default), the
+  // following TraceMe constructor is simply a conditional test of
+  // false value. Measurements show that its overhead is negligible.
+  port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string());
   op_kernel->ComputeAsync(context, done);
 }
 
@@ -452,6 +464,14 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                               DataTypeString(parsed.dtype()), " tensor");
     }
     Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
+
+    // If the tensor is not initialized, we likely ran out of memory.
+    if (!copy.IsInitialized()) {
+      return errors::ResourceExhausted(
+          "OOM when allocating tensor of shape ", parsed.shape().DebugString(),
+          " and type ", DataTypeString(parsed.dtype()));
+    }
+
     port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
     Notification n;
     device_contexts_[0]->CopyCPUTensorToDevice(&parsed, this, &copy,
@@ -519,7 +539,7 @@ void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
 Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                            const string& name_prefix,
                                            std::vector<Device*>* devices) {
-  int n = INT_MAX;
+  size_t n = INT_MAX;
   auto iter = options.config.device_count().find("GPU");
   if (iter != options.config.device_count().end()) {
     n = iter->second;
@@ -547,15 +567,14 @@ int64 MinSystemMemory(int64 available_memory) {
   // We use the following heuristic for now:
   //
   // If the available_memory is < 2GiB, we allocate 200MiB to system memory.
-  // Otherwise, allocate 300MiB to system memory.
+  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
   //
-  // In the future we could be more sophisticated by using a table of
-  // devices.
+  // In the future we could be more sophisticated by using a table of devices.
   if (available_memory < (1LL << 31)) {
     // 200MiB
     return 209715200LL;
   } else {
-    // max(300 MiB, 0.95 * available_memory)
+    // max(300 MiB, 0.05 * available_memory)
     return std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
   }
 }
@@ -971,7 +990,7 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
       continue;
     }
 
-    int new_id = ids->size();
+    size_t new_id = ids->size();
     ids->push_back(visible_gpu_id);
 
     LOG(INFO) << "Creating TensorFlow device (/gpu:" << new_id << ") -> "
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index 94143a55d5e2111973955d5f00ac1bc71bd34d4a..1e7a2b35bebf22e5ef3f8b0af200f030dd2df799 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -31,12 +31,17 @@ class GPUDevice : public BaseGPUDevice {
             Allocator* cpu_allocator)
       : BaseGPUDevice(options, name, memory_limit, locality, gpu_id,
                       physical_device_desc, gpu_allocator, cpu_allocator,
-                      false /* sync every op */, 1 /* max_streams */) {}
+                      false /* sync every op */, 1 /* max_streams */) {
+    if (options.config.has_gpu_options()) {
+      force_gpu_compatible_ =
+          options.config.gpu_options().force_gpu_compatible();
+    }
+  }
 
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     if (attr.on_host()) {
-      ProcessState* ps = ProcessState::singleton();
-      if (attr.gpu_compatible()) {
+      if (attr.gpu_compatible() || force_gpu_compatible_) {
+        ProcessState* ps = ProcessState::singleton();
         return ps->GetCUDAHostAllocator(0);
       } else {
         return cpu_allocator_;
@@ -45,6 +50,9 @@ class GPUDevice : public BaseGPUDevice {
       return gpu_allocator_;
     }
   }
+
+ private:
+  bool force_gpu_compatible_ = false;
 };
 
 class GPUDeviceFactory : public BaseGPUDeviceFactory {
@@ -71,18 +79,26 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
   GPUCompatibleCPUDevice(const SessionOptions& options, const string& name,
                          Bytes memory_limit, const DeviceLocality& locality,
                          Allocator* allocator)
-      : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {}
+      : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {
+    if (options.config.has_gpu_options()) {
+      force_gpu_compatible_ =
+          options.config.gpu_options().force_gpu_compatible();
+    }
+  }
   ~GPUCompatibleCPUDevice() override {}
 
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     ProcessState* ps = ProcessState::singleton();
-    if (attr.gpu_compatible()) {
+    if (attr.gpu_compatible() || force_gpu_compatible_) {
       return ps->GetCUDAHostAllocator(0);
     } else {
       // Call the parent's implementation.
       return ThreadPoolDevice::GetAllocator(attr);
     }
   }
+
+ private:
+  bool force_gpu_compatible_ = false;
 };
 
 // The associated factory.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index e4cd79bc7f00f110affbfbd95d2e798cbbc4aca7..8226cc035c8808c54c3fc24a251f4012564c22e0 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -37,12 +37,12 @@ class TEST_EventMgrHelper {
     StopPollingLoop();
   }
 
-  int queue_size() {
+  size_t queue_size() {
     mutex_lock l(em_->mu_);
     return em_->used_events_.size();
   }
 
-  int free_size() {
+  size_t free_size() {
     mutex_lock l(em_->mu_);
     return em_->free_events_.size();
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc
index eae917a439550d1527f09e542cfbb334d6f54641..de715d140a1a58470e4ea6b6b7e6c63412706f63 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc
@@ -82,7 +82,7 @@ Status AssignStreams(const Graph* graph, const AssignStreamsOpts& opts,
     // Determine a suitable stream to use.
     int stream_id = highest_stream_id + 1;
     for (const Edge* e : n->in_edges()) {
-      const int fanout = e->src()->out_edges().size();
+      const size_t fanout = e->src()->out_edges().size();
       if (fanout == 1) {
         stream_id = (*node_to_stream_id)[e->src()->id()];
         break;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
index e3bd7d7c986a45698d1c4a2d76ab4ef96cead9a1..86252a6dc309b10870ae64f8b9d0884f8663893f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_tracer.cc
@@ -568,7 +568,6 @@ Status GPUTracerImpl::Collect(StepStatsCollector *collector) {
   const int id = 0;
   const string stream_device = strings::StrCat(prefix, "/gpu:", id, "/stream:");
   const string memcpy_device = strings::StrCat(prefix, "/gpu:", id, "/memcpy");
-  const string sync_device = strings::StrCat(prefix, "/gpu:", id, "/sync");
 
   mutex_lock l2(trace_mu_);
   for (const auto &rec : kernel_records_) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc b/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc
index b1be278ab4ff57b8163e3f0a8e056278a097215c..aaa25ad345e4eea508c163f738189e0215968db7 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_tracer_test.cc
@@ -82,7 +82,7 @@ class GPUTracerTest : public ::testing::Test {
   }
 
  protected:
-  void ExpectFailure(Status status, error::Code code) {
+  void ExpectFailure(const Status& status, error::Code code) {
     EXPECT_FALSE(status.ok());
     if (!status.ok()) {
       LOG(INFO) << "Status message: " << status.error_message();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 522db80d7fac25868785a9b3a77696964217245d..71f82ec9a1bc0d13cb72c63f08d0c6cb9c125f38 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -227,7 +227,7 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
     }
     // Since we want to use the memory from recv_stream in the
     // send_device_to_device_stream, add a dependency to make sure the memory is
-    // truely free.
+    // truly free.
     // TODO(zhengxq): remove this dependency when we switch to a better way
     // to make sure the memory is free.
     send_device_to_device_stream->ThenWaitFor(recv_stream);
@@ -322,7 +322,7 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
     done(errors::Internal("No send gpu copy-out-stream is available."));
     return;
   }
-  // Wait for the recv-stream to make sure the buffer is truely available.
+  // Wait for the recv-stream to make sure the buffer is truly available.
   recv_host_to_device_stream->ThenWaitFor(recv_stream);
 
   const int64 total_bytes = cpu_tensor->TotalBytes();
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index f9975ef0a08d188a75a27da399d2f3142d221c1e..0e21e37fd3e720135b0be64971b9138314a8b04b 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -159,9 +159,36 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
   numa_node = 0;
   mutex_lock lock(mu_);
   while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
-    Allocator* allocator =
-        new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/,
-                          new BasicCPUAllocator(), new NoopRounder, "cpu_pool");
+    bool use_bfc_allocator = false;
+    // TODO(reedwm): Switch default to BGFAllocator if it's at least as fast and
+    // efficient.
+    Status status = ReadBoolFromEnvVar("TF_CPU_ALLOCATOR_USE_BFC", false,
+                                       &use_bfc_allocator);
+    if (!status.ok()) {
+      LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
+    }
+    Allocator* allocator;
+    if (use_bfc_allocator) {
+      // TODO(reedwm): evaluate whether 64GB by default is the best choice.
+      int64 cpu_mem_limit_in_mb = -1;
+      Status status = ReadInt64FromEnvVar("TF_CPU_BFC_MEM_LIMIT_IN_MB",
+                                          1LL << 16 /*64GB max by default*/,
+                                          &cpu_mem_limit_in_mb);
+      if (!status.ok()) {
+        LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
+      }
+      int64 cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20);
+      allocator = new BFCAllocator(new BasicCPUAllocator(), cpu_mem_limit,
+                                   true /*allow_growth*/,
+                                   "bfc_cpu_allocator_for_gpu" /*name*/);
+      VLOG(2) << "Using BFCAllocator with memory limit of "
+              << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
+    } else {
+      allocator = new PoolAllocator(
+          100 /*pool_size_limit*/, true /*auto_resize*/,
+          new BasicCPUAllocator(), new NoopRounder, "cpu_pool");
+      VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator";
+    }
     if (LogMemory::IsEnabled()) {
       // Wrap the allocator to track allocation ids for better logging
       // at the cost of performance.
@@ -191,7 +218,7 @@ Allocator* ProcessState::GetCUDAHostAllocator(int numa_node) {
   // example, process_state could maybe save the first stream executor
   // it knows is valid.
   gpu::StreamExecutor* se = nullptr;
-  for (size_t i = 0; i < gpu_allocators_.size(); ++i) {
+  for (int i = 0; i < static_cast<int>(gpu_allocators_.size()); ++i) {
     if (gpu_allocators_[i] != nullptr) {
       se = GPUMachineManager()->ExecutorForDevice(i).ValueOrDie();
       break;
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 68496cb729249f82106c0b114bd5ef8f6d643ba7..edfecfae06e0bd02ce6b241b11786fb13c61a067 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/constant_folding.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/optimizer_cse.h"
 
@@ -56,7 +57,10 @@ void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Env* env,
 
     if (opts_.do_constant_folding()) {
       ConstantFoldingOptions cf_opts;
-      if (DoConstantFolding(cf_opts, runtime, env, device, g)) {
+      bool was_mutated;
+      ConstantFold(cf_opts, runtime, env, device, g, &was_mutated)
+          .IgnoreError();
+      if (was_mutated) {
         RemoveDeadNodes(g);
         DumpGraph("ConstFolding", g);
         changed = true;
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index c93ff1cdde8d9b38271dbdd5f6e8529b3da3df4b..74b2252c7c6a4530cce3ecf59294d1f2b8798933 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
@@ -95,22 +96,24 @@ class SimpleRendezvous : public Rendezvous {
 
 }  // namespace
 
-// static
+GraphRunner::GraphRunner(Env* env) : cpu_device_(GetCPUDevice(env)) {}
+
+GraphRunner::~GraphRunner() {}
+
 Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
-                        Env* env, const NamedTensorList& inputs,
+                        const NamedTensorList& inputs,
                         const std::vector<string>& output_names,
                         std::vector<Tensor>* outputs) {
+  if (cpu_device_ == nullptr) {
+    return errors::NotFound("Cannot find a device for GraphRunner.");
+  }
+
   // TODO(vrv): Instead of copying the entire graph, consider modifying
   // the existing graph, and then removing those removed edges.
   // prior to returning.
   std::unique_ptr<Graph> graph_to_run(new Graph(graph->op_registry()));
   CopyGraph(*graph, graph_to_run.get());
 
-  std::unique_ptr<Device> device = GetCPUDevice(env);
-  if (!device) {
-    return errors::NotFound("Cannot find a device for GraphRunner.");
-  }
-
   SimpleRendezvous* rendez = new SimpleRendezvous;
   core::ScopedUnref rendez_unref(rendez);
 
@@ -128,9 +131,11 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
   }
 
   // Call RewriteGraphForExecution
+  subgraph::RewriteGraphMetadata metadata;
   TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
       graph_to_run.get(), input_names, output_names, {} /* target nodes */,
-      device->attributes()));
+      cpu_device_->attributes(), false /* use_function_convention */,
+      &metadata));
 
   // Create the local executor and the Rendezvous for fetching back the
   // constants.
@@ -143,10 +148,11 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
   Graph* g = graph_to_run.release();
 
   LocalExecutorParams params;
-  params.device = device.get();
+  // The ownership of the output tensors are bound to this device's lifetime.
+  params.device = cpu_device_.get();
   params.function_library = function_library;
-  params.create_kernel = [&device, g](const NodeDef& ndef, OpKernel** kernel) {
-    return CreateNonCachedKernel(device.get(), nullptr, ndef,
+  params.create_kernel = [this, g](const NodeDef& ndef, OpKernel** kernel) {
+    return CreateNonCachedKernel(cpu_device_.get(), nullptr, ndef,
                                  g->versions().producer(), kernel);
   };
   params.delete_kernel = [](OpKernel* kernel) { delete kernel; };
@@ -173,8 +179,13 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library,
     Rendezvous::ParsedKey parsed;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(output_key, &parsed));
     bool is_dead;
+    Tensor output_tensor;
     TF_RETURN_IF_ERROR(
-        rendez->Recv(parsed, Rendezvous::Args(), &(*outputs)[i], &is_dead));
+        rendez->Recv(parsed, Rendezvous::Args(), &output_tensor, &is_dead));
+    // Does a deep copy so that ownership of the tensor isn't tied to the
+    // allocator of the cpu device we created above. The allocator could be
+    // deleted along with the device.
+    (*outputs)[i] = tensor::DeepCopy(output_tensor);
   }
 
   return Status::OK();
diff --git a/tensorflow/core/common_runtime/graph_runner.h b/tensorflow/core/common_runtime/graph_runner.h
index e078c7ffc8c90b6dedddb3284fa7c73cc7906663..1e4ae7722794ca527bcea023d992d92839ee46c9 100644
--- a/tensorflow/core/common_runtime/graph_runner.h
+++ b/tensorflow/core/common_runtime/graph_runner.h
@@ -20,16 +20,12 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 
@@ -44,16 +40,26 @@ namespace tensorflow {
 // to be particularly lightweight, fast, or efficient.
 class GraphRunner {
  public:
+  // REQUIRES: `env` is not nullptr.
+  GraphRunner(Env* env);
+  ~GraphRunner();
+
   // Function semantics for `inputs`, `output_names` and `outputs`
   // matches those from Session::Run().
   //
+  // NOTE: The output tensors share lifetime with the GraphRunner, and could
+  // be destroyed once the GraphRunner is destroyed.
+  //
   // REQUIRES: `graph`, `env`, and `outputs` are not nullptr.
   // `function_library` may be nullptr.
   typedef std::vector<std::pair<string, Tensor>> NamedTensorList;
-  static Status Run(Graph* graph, FunctionLibraryRuntime* function_library,
-                    Env* env, const NamedTensorList& inputs,
-                    const std::vector<string>& output_names,
-                    std::vector<Tensor>* outputs);
+  Status Run(Graph* graph, FunctionLibraryRuntime* function_library,
+             const NamedTensorList& inputs,
+             const std::vector<string>& output_names,
+             std::vector<Tensor>* outputs);
+
+ private:
+  std::unique_ptr<Device> cpu_device_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/graph_runner_test.cc b/tensorflow/core/common_runtime/graph_runner_test.cc
index 5918ba9a22d2e222e8d7948eec521f926f734034..e969ee8df77c47c7e0d740f4319d98c8a642ea9a 100644
--- a/tensorflow/core/common_runtime/graph_runner_test.cc
+++ b/tensorflow/core/common_runtime/graph_runner_test.cc
@@ -46,20 +46,48 @@ using test::internal::ExpectEqual;
 TEST(GraphRunnerTest, SingleConst) {
   Scope root = Scope::NewRootScope();
   auto c = ops::Const(root, 42.0f);
+  GraphRunner graph_runner(Env::Default());
   std::vector<Tensor> outputs;
-  Status s = GraphRunner::Run(root.graph(), nullptr, Env::Default(), {},
-                              {c.name()}, &outputs);
+  Status s = graph_runner.Run(root.graph(), nullptr, {}, {c.name()}, &outputs);
   TF_ASSERT_OK(s);
   ExpectEqual(42.0f, outputs[0].scalar<float>()());
 }
 
+// If not using DeepCopy, and the allocator is deleted with the cpu-device,
+// this test will seg-fault.
+TEST(GraphRunnerTest, DeepCopy) {
+  Scope root = Scope::NewRootScope();
+  auto p1 = ops::Placeholder(root.WithOpName("p1"), DT_FLOAT);
+  auto p2 = ops::Placeholder(root.WithOpName("p2"), DT_FLOAT);
+  auto add = ops::Add(root.WithOpName("add"), p1, p2);
+
+  Tensor p1_data(DT_FLOAT, TensorShape({}));
+  Tensor p2_data(DT_FLOAT, TensorShape({}));
+  p1_data.scalar<float>()() = 1.0f;
+  p2_data.scalar<float>()() = 2.0f;
+  std::vector<std::pair<string, Tensor>> inputs = {{"p1:0", p1_data},
+                                                   {"p2:0", p2_data}};
+
+  // Create and destroy the GraphRunner, and ensure that the outputs are
+  // consumable beyond the lifetime of GraphRunner.
+  std::vector<Tensor> outputs;
+  {
+    GraphRunner graph_runner(Env::Default());
+    Status s =
+        graph_runner.Run(root.graph(), nullptr, inputs, {"add:0"}, &outputs);
+    TF_ASSERT_OK(s);
+  }
+  ExpectEqual(3.0f, outputs[0].scalar<float>()());
+}
+
 TEST(GraphRunnerTest, MultiFetchConst) {
   Scope root = Scope::NewRootScope();
   auto c = ops::Const(root, 42.0f);
   auto pi = ops::Const(root, 3.14f);
+  GraphRunner graph_runner(Env::Default());
   std::vector<Tensor> outputs;
-  Status s = GraphRunner::Run(root.graph(), nullptr, Env::Default(), {},
-                              {c.name(), pi.name()}, &outputs);
+  Status s = graph_runner.Run(root.graph(), nullptr, {}, {c.name(), pi.name()},
+                              &outputs);
   TF_ASSERT_OK(s);
   ExpectEqual(42.0f, outputs[0].scalar<float>()());
   ExpectEqual(3.14f, outputs[1].scalar<float>()());
@@ -78,9 +106,10 @@ TEST(GraphRunnerTest, FeedAndFetch) {
   std::vector<std::pair<string, Tensor>> inputs = {{"p1:0", p1_data},
                                                    {"p2:0", p2_data}};
 
+  GraphRunner graph_runner(Env::Default());
   std::vector<Tensor> outputs;
-  Status s = GraphRunner::Run(root.graph(), nullptr, Env::Default(), inputs,
-                              {"add:0"}, &outputs);
+  Status s =
+      graph_runner.Run(root.graph(), nullptr, inputs, {"add:0"}, &outputs);
   TF_ASSERT_OK(s);
   ExpectEqual(3.0f, outputs[0].scalar<float>()());
 }
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 72bc37d43507ce52d1ed62d61c8cbf7b139752c1..4e14e6fe1a6204dd2e2dc63d28e5e1ca1de9c4d2 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -103,13 +103,13 @@ void Benchmark::Run(int iters) { RunWithArgs({}, {}, iters); }
 
 string GetRendezvousKey(const Node* node) {
   string send_device;
-  TF_CHECK_OK(GetNodeAttr(node->def(), "send_device", &send_device));
+  TF_CHECK_OK(GetNodeAttr(node->attrs(), "send_device", &send_device));
   string recv_device;
-  TF_CHECK_OK(GetNodeAttr(node->def(), "recv_device", &recv_device));
+  TF_CHECK_OK(GetNodeAttr(node->attrs(), "recv_device", &recv_device));
   string tensor_name;
-  TF_CHECK_OK(GetNodeAttr(node->def(), "tensor_name", &tensor_name));
+  TF_CHECK_OK(GetNodeAttr(node->attrs(), "tensor_name", &tensor_name));
   uint64 send_device_incarnation;
-  TF_CHECK_OK(GetNodeAttr(node->def(), "send_device_incarnation",
+  TF_CHECK_OK(GetNodeAttr(node->attrs(), "send_device_incarnation",
                           reinterpret_cast<int64*>(&send_device_incarnation)));
   return Rendezvous::CreateKey(send_device, send_device_incarnation,
                                recv_device, tensor_name, FrameAndIter(0, 0));
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index 0a6342ed736d285a80db2ebef8fcf5e541000b6a..3f7c9f68dba6aa9a60edd145be064a626ff7a5bb 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -60,10 +60,8 @@ struct LocalDevice::EigenThreadPoolInfo {
 };
 
 LocalDevice::LocalDevice(const SessionOptions& options,
-                         const DeviceAttributes& attributes,
-                         Allocator* device_allocator)
-    : Device(options.env, attributes, device_allocator),
-      owned_tp_info_(nullptr) {
+                         const DeviceAttributes& attributes)
+    : Device(options.env, attributes), owned_tp_info_(nullptr) {
   // If we're running on the CPU, log warnings if we're not compiled using the
   // best flags for performance.
   port::WarnAboutUnusedCPUFeatures();
diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h
index d1c27c6248143063e0204f98569da9f1b71042c5..84a4f66db4a2e749d78e97758739f95f5bddb14e 100644
--- a/tensorflow/core/common_runtime/local_device.h
+++ b/tensorflow/core/common_runtime/local_device.h
@@ -33,8 +33,8 @@ struct SessionOptions;
 // GPUDevice into more 'process-wide' abstractions.
 class LocalDevice : public Device {
  public:
-  LocalDevice(const SessionOptions& options, const DeviceAttributes& attributes,
-              Allocator* device_allocator);
+  LocalDevice(const SessionOptions& options,
+              const DeviceAttributes& attributes);
   ~LocalDevice() override;
 
  private:
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index 80c483e70b0592041d510aeccbd799e81fd2e5c7..db053dd2fa0724f4377f20fe1616fcb31f3478cb 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/memory_types.h"
 
+#include <utility>
+
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
@@ -43,8 +45,8 @@ struct EndpointEq {
 };
 
 static Status ProcessMemoryTypes(
-    DeviceType device_type, const Graph* g,
-    std::function<Status(const Edge*, MemoryType, MemoryType)> fn) {
+    const DeviceType& device_type, const Graph* g,
+    const std::function<Status(const Edge*, MemoryType, MemoryType)>& fn) {
   if (device_type != DEVICE_GPU) {
     // On non-GPU devices, HOST_MEMORY and DEVICE_MEMORY are always
     // compatible.
@@ -88,17 +90,18 @@ static Status ProcessMemoryTypes(
   return Status::OK();
 }
 
-Status ValidateMemoryTypes(DeviceType device_type, const Graph* g) {
-  return ProcessMemoryTypes(device_type, g, [g](const Edge* e, MemoryType sm,
-                                                MemoryType dm) {
-    if (sm == dm) {
-      return Status::OK();
-    }
-    return errors::Internal(
-        "Memory type mismatch (", sm, " ", dm, ") between :", e->src()->id(),
-        ":", e->src_output(), " and ", e->dst()->id(), ":", e->dst_input(),
-        " : from ", e->src()->DebugString(), " to ", e->dst()->DebugString());
-  });
+Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g) {
+  return ProcessMemoryTypes(
+      device_type, g, [g](const Edge* e, MemoryType sm, MemoryType dm) {
+        if (sm == dm) {
+          return Status::OK();
+        }
+        return errors::Internal(
+            "Memory type mismatch (", sm, " ", dm,
+            ") between :", e->src()->id(), ":", e->src_output(), " and ",
+            e->dst()->id(), ":", e->dst_input(), " : from ",
+            e->src()->DebugString(), " to ", e->dst()->DebugString());
+      });
 }
 
 static Node* Send(Graph* g, const string& device_name, bool host,
@@ -132,8 +135,8 @@ static Node* Recv(Graph* g, const string& device_name, bool host,
   return ret;
 }
 
-Status EnsureMemoryTypes(DeviceType device_type, const string& device_name,
-                         Graph* g) {
+Status EnsureMemoryTypes(const DeviceType& device_type,
+                         const string& device_name, Graph* g) {
   struct Item {
     const Edge* edge;
     MemoryType sm;
@@ -185,7 +188,7 @@ Status EnsureMemoryTypes(DeviceType device_type, const string& device_name,
   return ValidateMemoryTypes(device_type, g);
 }
 
-Status MemoryTypeForOutput(DeviceType device_type, const Graph* g,
+Status MemoryTypeForOutput(const DeviceType& device_type, const Graph* g,
                            const Node* n, int index, MemoryType* memory_type) {
   MemoryTypeVector inp_mvec;
   MemoryTypeVector out_mvec;
diff --git a/tensorflow/core/common_runtime/memory_types.h b/tensorflow/core/common_runtime/memory_types.h
index ccbb8cffb17d99802df85fb502c5ac7c6ca604d4..fa0a7595f32ac8bb43010dcd3a407825ef79f618 100644
--- a/tensorflow/core/common_runtime/memory_types.h
+++ b/tensorflow/core/common_runtime/memory_types.h
@@ -24,7 +24,7 @@ namespace tensorflow {
 
 // Returns an error iff *g running on a single device of 'device_type'
 // has memory type mismatch for any edge's source and destination.
-Status ValidateMemoryTypes(DeviceType device_type, const Graph* g);
+Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g);
 
 // Updates '*g' so that every edge's source and destination has
 // compatible memory types by inserting proper HostSend/Recv and
@@ -35,12 +35,12 @@ Status ValidateMemoryTypes(DeviceType device_type, const Graph* g);
 // Returns OK if '*g' is updated properly (ValidateMemoryTypes(g) must
 // be OK). Otherwise, returns an error and '*g' may be in an
 // invalidate state and the caller should discard it.
-Status EnsureMemoryTypes(DeviceType device_type, const string& device_name,
-                         Graph* g);
+Status EnsureMemoryTypes(const DeviceType& device_type,
+                         const string& device_name, Graph* g);
 
 // Get the memory type for 'index'th output of node 'n' in graph 'g', when
 // running on 'device_type'.
-Status MemoryTypeForOutput(DeviceType device_type, const Graph* g,
+Status MemoryTypeForOutput(const DeviceType& device_type, const Graph* g,
                            const Node* n, int index, MemoryType* memory_type);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
index ffbfbc74f16f5f0fb74c5a32ca23e57c84857022..bbd38a2e0775857f0a1b652c8814ee6d8e3b0821 100644
--- a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
+++ b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
@@ -49,11 +49,11 @@ class ParallelConcatRemovePass : public GraphOptimizationPass {
       }
     }
     for (Node* n : matches) {
-      AttrSlice n_attrs(n->def());
+      AttrSlice n_attrs = n->attrs();
       auto base_make_node = [n, g, &n_attrs](const string& op,
                                              const string& name) {
         NodeBuilder node_builder(name, op);
-        node_builder.Device(n->def().device());
+        node_builder.Device(n->requested_device());
         string colo;
         if (GetNodeAttr(n_attrs, "_class", &colo).ok()) {
           node_builder.Attr("_class", colo);
diff --git a/tensorflow/core/common_runtime/pending_counts.h b/tensorflow/core/common_runtime/pending_counts.h
index f0c79ad601c36c42ac8ae606e7b792ccae3b4313..198eb896afc76d12090d49a12276764fbfe25c73 100644
--- a/tensorflow/core/common_runtime/pending_counts.h
+++ b/tensorflow/core/common_runtime/pending_counts.h
@@ -69,7 +69,7 @@ class PendingCounts {
   // to retrieve the count data for this node.
   class Layout {
    public:
-    Handle CreateHandle(int max_pending_count, int max_dead_count);
+    Handle CreateHandle(size_t max_pending_count, size_t max_dead_count);
 
    private:
     friend class PendingCounts;
@@ -91,7 +91,7 @@ class PendingCounts {
 
   ~PendingCounts() { delete[] bytes_; }
 
-  void set_initial_count(Handle h, int pending_count) {
+  void set_initial_count(Handle h, size_t pending_count) {
     if (h.is_large_) {
       LargeCounts* c = Large(h);
       c->pending = pending_count;
@@ -306,7 +306,7 @@ class PendingCounts {
 };
 
 inline PendingCounts::Handle PendingCounts::Layout::CreateHandle(
-    int max_pending_count, int max_dead_count) {
+    size_t max_pending_count, size_t max_dead_count) {
   Handle result;
   if ((max_pending_count > kMaxCountForPackedCounts) ||
       (max_dead_count > kMaxCountForPackedCounts)) {
diff --git a/tensorflow/core/common_runtime/renamed_device.cc b/tensorflow/core/common_runtime/renamed_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fa9713735edd05c36e1787be0e8c89e69c043fb2
--- /dev/null
+++ b/tensorflow/core/common_runtime/renamed_device.cc
@@ -0,0 +1,54 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/renamed_device.h"
+
+namespace tensorflow {
+
+// TODO(saeta): Convert to returning a std::unique_ptr?
+/* static */
+Device* RenamedDevice::NewRenamedDevice(const string& new_base,
+                                        Device* underlying,
+                                        bool owns_underlying) {
+  DeviceNameUtils::ParsedName parsed_name;
+  CHECK(DeviceNameUtils::ParseFullName(new_base, &parsed_name));
+  DeviceNameUtils::ParsedName underlying_parsed_name =
+      underlying->parsed_name();
+  CHECK(underlying_parsed_name.has_type);
+  CHECK(underlying_parsed_name.has_id);
+  parsed_name.type = underlying_parsed_name.type;
+  parsed_name.id = underlying_parsed_name.id;
+  string name = DeviceNameUtils::FullName(parsed_name.job, parsed_name.replica,
+                                          parsed_name.task, parsed_name.type,
+                                          parsed_name.id);
+  DeviceAttributes attributes(underlying->attributes());
+  attributes.set_name(name);
+  return new RenamedDevice(underlying, attributes, owns_underlying);
+}
+
+RenamedDevice::RenamedDevice(Device* underlying,
+                             const DeviceAttributes& attributes,
+                             bool owns_underlying)
+    : Device(underlying->env(), attributes),
+      underlying_(underlying),
+      owns_underlying_(owns_underlying) {}
+
+RenamedDevice::~RenamedDevice() {
+  if (owns_underlying_) {
+    delete underlying_;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..0158e18cedc3b9b136258085641492c94de9e612
--- /dev/null
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -0,0 +1,119 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+
+// Wraps a device with a new name, delegating work to the wrapped device.
+//
+// This class is used to wrap local devices when using clusterspec propagation
+// where the name of a particular device may change in the context of a given
+// session.
+class RenamedDevice : public Device {
+ public:
+  static Device* NewRenamedDevice(const string& new_base, Device* underlying,
+                                  bool owns_underlying);
+  ~RenamedDevice() override;
+
+  // Below are virtual methods defined on DeviceBase
+  bool RequiresRecordingAccessedTensors() const override {
+    return underlying_->RequiresRecordingAccessedTensors();
+  }
+
+  const CpuWorkerThreads* tensorflow_cpu_worker_threads() const override {
+    return underlying_->tensorflow_cpu_worker_threads();
+  }
+
+  const GpuDeviceInfo* tensorflow_gpu_device_info() const override {
+    return underlying_->tensorflow_gpu_device_info();
+  }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override {
+    return underlying_->GetAllocator(attr);
+  }
+
+  Allocator* GetStepAllocator(AllocatorAttributes attr,
+                              ResourceMgr* step_resource_manager) override {
+    return underlying_->GetStepAllocator(attr, step_resource_manager);
+  }
+
+  const Eigen::ThreadPoolDevice* eigen_cpu_device() override {
+    return underlying_->eigen_cpu_device();
+  }
+
+#ifdef TENSORFLOW_USE_SYCL
+  const Eigen::SyclDevice* eigen_sycl_device() const override {
+    return underlying_->eigen_sycl_device();
+  }
+#endif
+
+  PerOpGpuDevice* MakeGpuDevice() override {
+    return underlying_->MakeGpuDevice();
+  }
+
+  void ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
+                             DeviceContext* dc, Allocator* allocator) override {
+    underlying_->ReinitializeGpuDevice(context, device, dc, allocator);
+  }
+
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                             const AllocatorAttributes alloc_attrs,
+                             Tensor* tensor) override {
+    return underlying_->MakeTensorFromProto(tensor_proto, alloc_attrs, tensor);
+  }
+
+  // Below are virtual methods defined on Device
+
+  void Compute(OpKernel* op_kernel, OpKernelContext* context) override {
+    underlying_->Compute(op_kernel, context);
+  }
+
+  void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
+                    AsyncOpKernel::DoneCallback done) override {
+    underlying_->ComputeAsync(op_kernel, context, std::move(done));
+  }
+
+  void ConsumeListOfAccessedTensors(
+      DeviceContext* context, const TensorReferenceVector& tensors) override {
+    underlying_->ConsumeListOfAccessedTensors(context, tensors);
+  }
+
+  Status Sync() override { return underlying_->Sync(); }
+
+  Status MaybeRewriteGraph(const FunctionDefLibrary& library,
+                           std::unique_ptr<Graph>* graph) override {
+    return underlying_->MaybeRewriteGraph(library, graph);
+  }
+
+  Status FillContextMap(const Graph* graph,
+                        DeviceContextMap* device_context_map) override {
+    return underlying_->FillContextMap(graph, device_context_map);
+  }
+
+ private:
+  RenamedDevice(Device* underlying, const DeviceAttributes& attributes,
+                bool owns_underlying);
+  Device* const underlying_;
+  const bool owns_underlying_;
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_COMMON_RUNTIME_RENAMED_DEVICE_H_
diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc
index 285ac7540c84e3c5f3a8472850105aa9c7d8d1eb..2a2b10c0cff27194b19a718887b89a1bc72e3a02 100644
--- a/tensorflow/core/common_runtime/rendezvous_mgr.cc
+++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc
@@ -106,7 +106,7 @@ void IntraProcessRendezvous::SameWorkerRecvDone(
   CopyTensor::ViaDMA(parsed.edge_name, send_args.device_context,
                      recv_args.device_context, src_device, dst_device,
                      send_args.alloc_attrs, recv_args.alloc_attrs, &in, out,
-                     done);
+                     std::move(done));
 }
 
 void IntraProcessRendezvous::RecvAsync(const ParsedKey& parsed,
@@ -132,7 +132,8 @@ void IntraProcessRendezvous::RecvAsync(const ParsedKey& parsed,
     };
 
     if (status.ok() && in.IsInitialized()) {
-      SameWorkerRecvDone(parsed, send_args, recv_args, in, out, final_callback);
+      SameWorkerRecvDone(parsed, send_args, recv_args, in, out,
+                         std::move(final_callback));
     } else {
       final_callback(status);
     }
diff --git a/tensorflow/core/common_runtime/resource_variable_read_optimizer.cc b/tensorflow/core/common_runtime/resource_variable_read_optimizer.cc
index 85a29e11e2338a888c1b51d342e00c6f846b3f6d..b40924ef3a8618f1b132a243136653005eb6a93c 100644
--- a/tensorflow/core/common_runtime/resource_variable_read_optimizer.cc
+++ b/tensorflow/core/common_runtime/resource_variable_read_optimizer.cc
@@ -21,9 +21,9 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-// Replaces ReadVariableOp nodes which are only used by Sends and sinks with
-// _UnsafeReadVariable nodes, as this transforamtion is safe and will improve
-// performance.
+// Replaces ReadVariableOp nodes which are only used by Sends, sinks,
+// and function Retvals with _UnsafeReadVariable nodes, as this
+// transformation is safe and will improve performance.
 class ResourceVariableReadPass : public GraphOptimizationPass {
  public:
   Status Run(const GraphOptimizationPassOptions& options) override {
@@ -43,7 +43,8 @@ class ResourceVariableReadPass : public GraphOptimizationPass {
       if (n->type_string() == "ReadVariableOp") {
         bool skip = false;
         for (const Edge* e : n->out_edges()) {
-          if (!e->dst()->IsSend() && e->dst()->name() != "_SINK") {
+          if (!e->dst()->IsSend() && e->dst()->type_string() != "_Retval" &&
+              e->dst()->name() != "_SINK") {
             skip = true;
           }
         }
@@ -54,7 +55,7 @@ class ResourceVariableReadPass : public GraphOptimizationPass {
     }
     for (Node* read : matches) {
       DataType dtype;
-      TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(read->def()), "dtype", &dtype));
+      TF_RETURN_IF_ERROR(GetNodeAttr(read->attrs(), "dtype", &dtype));
       std::vector<Node*> in_control_edges;
       std::vector<std::pair<Node*, int>> in_edges;
       for (const Edge* edge : read->in_edges()) {
diff --git a/tensorflow/core/common_runtime/session_factory.h b/tensorflow/core/common_runtime/session_factory.h
index 2a1632e0359558dba41f3cc3ca021d808073b59f..df3198a70dde5104b0309195831f8b4c13c9654b 100644
--- a/tensorflow/core/common_runtime/session_factory.h
+++ b/tensorflow/core/common_runtime/session_factory.h
@@ -47,7 +47,7 @@ class SessionFactory {
   // Old sessions may continue to have side-effects on resources not in
   // containers listed in "containers", and thus may affect future
   // sessions' results in ways that are hard to predict.  Thus, if well-defined
-  // behaviour is desired, is it recommended that all containers be listed in
+  // behavior is desired, is it recommended that all containers be listed in
   // "containers".
   //
   // If the "containers" vector is empty, the default container is assumed.
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 2f65abde0af95ebafcbf7a59f89d37f64da66283..876f34b99118d2793acca12536fd2c2c6a0b328e 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
@@ -33,7 +33,15 @@ using shape_inference::ShapeHandle;
 
 ShapeRefiner::ShapeRefiner(int graph_def_version,
                            const OpRegistryInterface* ops)
-    : graph_def_version_(graph_def_version), ops_registry_(ops) {}
+    : graph_def_version_(graph_def_version),
+      ops_registry_(ops),
+      graph_runner_(Env::Default()) {}
+
+ShapeRefiner::~ShapeRefiner() {
+  // The lifetime of the tensors are bound to the GraphRunner, so the tensors
+  // should be deleted before it.
+  const_tensor_map_.clear();
+}
 
 Status ShapeRefiner::AddNode(const Node* node) {
   // For each 'input' of this node, fetch the corresponding shape
@@ -72,17 +80,15 @@ Status ShapeRefiner::AddNode(const Node* node) {
   // Get the shape function for this node
   const OpRegistrationData* op_reg_data;
   TF_RETURN_IF_ERROR(ops_registry_->LookUp(node->type_string(), &op_reg_data));
-  if (op_reg_data->shape_inference_fn == nullptr) {
+  if (op_reg_data->shape_inference_fn == nullptr &&
+      require_shape_inference_fns_) {
     return errors::InvalidArgument(
         "No shape inference function exists for op '", node->type_string(),
         "', did you forget to define it?");
   }
 
   // This needs to be filled in with real data in a second pass.
-  std::vector<const Tensor*> input_tensors(node->num_inputs());
-  std::vector<Tensor> real_tensors(node->num_inputs());
-  std::vector<bool> attempted_materialization(node->num_inputs());
-  std::vector<bool> attempted_tensor_as_shape_conversion(node->num_inputs());
+  std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
   std::vector<ShapeHandle> input_tensors_as_shapes;
 
   // Create the inference context for this node with the existing input shapes.
@@ -95,70 +101,7 @@ Status ShapeRefiner::AddNode(const Node* node) {
   }
 
   // Run the shape inference function, and return if there was an error.
-  TF_RETURN_IF_ERROR(c->Run(op_reg_data->shape_inference_fn));
-
-  // We must run the shape function repeatedly, in case users write
-  // shape functions where they only conditionally call input_tensor()
-  // based on the values of another input tensor.
-  bool rerun_shape_fn;
-  do {
-    // If the result of running shape inference would have benefitted
-    // from knowing the values of input tensors, try to materialize
-    // the results of those tensors, and then run the shape inference
-    // function again using those known tensors.
-    rerun_shape_fn = false;
-
-    // NOTE: It is possible to batch the extraction and
-    // materialization of inputs, instead of materializing one input
-    // at a time like we do below.  If input-at-a-time computation
-    // becomes a bottleneck, we could separate ExtractConstantSubgraph
-    // into two functions: one that returns true if an input is
-    // derivable from constants, and another function that extracts
-    // the subgraph for multiple target nodes and executes the whole
-    // subgraph once.
-
-    for (int i = 0; i < c->num_inputs(); ++i) {
-      if (!c->requested_input_tensor(i)) {
-        continue;
-      }
-      // Check if we have not already filled in the requested input,
-      // and if not, try to materialize the tensors.
-      if (!attempted_materialization[i]) {
-        attempted_materialization[i] = true;
-
-        Tensor result;
-        bool evaluated = false;
-        TF_RETURN_IF_ERROR(
-            EvaluateConstantTensorForEdge(node, i, &evaluated, &result));
-        if (evaluated) {
-          real_tensors[i] = result;
-          input_tensors[i] = &real_tensors[i];
-          // We have more concrete information about a shape,
-          // so re-run shape inference.
-          rerun_shape_fn = true;
-        }
-      }
-      if (c->requested_input_tensor_as_partial_shape(i) &&
-          !attempted_tensor_as_shape_conversion[i]) {
-        attempted_tensor_as_shape_conversion[i] = true;
-        if (i >= input_tensors_as_shapes.size()) {
-          input_tensors_as_shapes.resize(i + 1);
-        }
-        ShapeHandle s;
-        TF_RETURN_IF_ERROR(ConstantPartialShape(c.get(), node, i, &s));
-        input_tensors_as_shapes[i] = s;
-        rerun_shape_fn = true;
-      }
-    }
-
-    if (rerun_shape_fn) {
-      // We have more information about the shapes on this pass,
-      // so re-run shape inference.
-      c->set_input_tensors(input_tensors);
-      c->set_input_tensors_as_shapes(input_tensors_as_shapes);
-      TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(c.get()));
-    }
-  } while (rerun_shape_fn);
+  TF_RETURN_IF_ERROR(RunShapeFn(node, op_reg_data, c.get()));
 
   // Store the resulting InferenceContext object in the map.
   node_to_context_[node].swap(c);
@@ -194,6 +137,74 @@ Status ShapeRefiner::SetShape(const Node* node, int output_port,
   return Status::OK();
 }
 
+Status ShapeRefiner::UpdateNode(const Node* node, bool* refined) {
+  auto it = node_to_context_.find(node);
+  if (it == node_to_context_.end()) {
+    *refined = true;
+    return AddNode(node);
+  }
+  InferenceContext* node_context = it->second.get();
+
+  // Give up if the context wasn't successfully built by the AddNode() method.
+  TF_RETURN_IF_ERROR(node_context->construction_status());
+
+  // Check if the shapes of the nodes in the fan-in of this node have changed,
+  // and if they have update the node input shapes.
+  for (const Edge* e : node->in_edges()) {
+    if (e->IsControlEdge()) continue;
+
+    Node* input = e->src();
+    auto iter = node_to_context_.find(input);
+    if (iter == node_to_context_.end()) {
+      return errors::FailedPrecondition(
+          "Input ", e->dst_input(), " ('", input->name(), "') for '",
+          node->name(), "' was not previously added to ShapeRefiner.");
+    }
+
+    InferenceContext* c = iter->second.get();
+    DCHECK_GE(e->dst_input(), 0);
+    if (node_context->MergeInput(e->dst_input(), c->output(e->src_output()))) {
+      *refined = true;
+    }
+
+    // Also propagate handle shape and dtype of edges which are carrying
+    // resource handles.
+    if (e->src()->output_type(e->src_output()) == DT_RESOURCE) {
+      if (node_context->set_input_handle_dtype(
+              e->dst_input(), c->output_handle_dtype(e->src_output()))) {
+        *refined = true;
+      }
+      if (node_context->MergeInputHandleShape(
+              e->dst_input(), c->output_handle_shape(e->src_output()))) {
+        *refined = true;
+      }
+    }
+  }
+
+  if (!*refined) {
+    // No input shape has changed, we're done
+    return Status::OK();
+  }
+
+  // Get and run the shape function for this node to update the shapes of the
+  // outputs.
+  const OpRegistrationData* op_reg_data;
+  TF_RETURN_IF_ERROR(ops_registry_->LookUp(node->type_string(), &op_reg_data));
+  if (op_reg_data->shape_inference_fn == nullptr &&
+      require_shape_inference_fns_) {
+    return errors::InvalidArgument(
+        "No shape inference function exists for op '", node->type_string(),
+        "', did you forget to define it?");
+  }
+
+  if (!op_reg_data->shape_inference_fn) {
+    // There is nothing more we can infer
+    return Status::OK();
+  }
+
+  return RunShapeFn(node, op_reg_data, node_context);
+}
+
 Status ShapeRefiner::EvaluateConstantTensorForEdge(const Node* node,
                                                    int dst_idx, bool* evaluated,
                                                    Tensor* result) {
@@ -223,9 +234,8 @@ Status ShapeRefiner::EvaluateConstantTensorForEdge(const Node* node,
   std::vector<Tensor> outputs;
   // NOTE; we should pass in a function library runtime if we want
   // to support constant-expression evaluation on functions.
-  Status s = GraphRunner::Run(&subgraph, nullptr /* function_library */,
-                              Env::Default(), const_inputs,
-                              {output_tensor_name}, &outputs);
+  Status s = graph_runner_.Run(&subgraph, nullptr /* function_library */,
+                               const_inputs, {output_tensor_name}, &outputs);
 
   // If all kernels in the constant graph are not registered
   // in the process, GraphRunner::Run may fail, in which case
@@ -293,6 +303,13 @@ Status ShapeRefiner::ExtractConstantSubgraph(
       return Status::OK();
     }
 
+    // Don't constant fold enter/exit currently either, as it's easy to end
+    // up with a partial frame.
+    if (IsEnter(current_node) || IsExit(current_node)) {
+      *is_constant_graph = false;
+      return Status::OK();
+    }
+
     // If there is nothing more to recurse down, see if
     // the generator node is a constant.
     if (current_node->num_inputs() == 0) {
@@ -440,4 +457,93 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
   return Status::OK();
 }
 
+Status ShapeRefiner::RunShapeFn(const Node* node,
+                                const OpRegistrationData* op_reg_data,
+                                shape_inference::InferenceContext* c) {
+  // This will be filled in with real data in a second pass.
+  std::vector<const Tensor*> input_tensors(node->num_inputs(), nullptr);
+  std::vector<Tensor> real_tensors(node->num_inputs());
+  std::vector<bool> attempted_materialization(node->num_inputs());
+  std::vector<bool> attempted_tensor_as_shape_conversion(node->num_inputs());
+  std::vector<ShapeHandle> input_tensors_as_shapes;
+
+  // Run the shape inference function, and return if there was an error.
+  c->set_input_tensors(input_tensors);
+  c->set_input_tensors_as_shapes(input_tensors_as_shapes);
+  if (op_reg_data->shape_inference_fn) {
+    TF_RETURN_IF_ERROR(c->Run(op_reg_data->shape_inference_fn));
+  } else {
+    TF_RETURN_IF_ERROR(c->Run(shape_inference::UnknownShape));
+  }
+
+  // We must run the shape function repeatedly, in case users write
+  // shape functions where they only conditionally call input_tensor()
+  // based on the values of another input tensor.
+  bool rerun_shape_fn;
+  do {
+    // If the result of running shape inference would have benefitted
+    // from knowing the values of input tensors, try to materialize
+    // the results of those tensors, and then run the shape inference
+    // function again using those known tensors.
+    rerun_shape_fn = false;
+
+    // NOTE: It is possible to batch the extraction and
+    // materialization of inputs, instead of materializing one input
+    // at a time like we do below.  If input-at-a-time computation
+    // becomes a bottleneck, we could separate ExtractConstantSubgraph
+    // into two functions: one that returns true if an input is
+    // derivable from constants, and another function that extracts
+    // the subgraph for multiple target nodes and executes the whole
+    // subgraph once.
+
+    for (int i = 0; i < c->num_inputs(); ++i) {
+      if (!c->requested_input_tensor(i)) {
+        continue;
+      }
+      // Check if we have not already filled in the requested input,
+      // and if not, try to materialize the tensors.
+      if (!attempted_materialization[i]) {
+        attempted_materialization[i] = true;
+
+        Tensor result;
+        bool evaluated = false;
+        TF_RETURN_IF_ERROR(
+            EvaluateConstantTensorForEdge(node, i, &evaluated, &result));
+        if (evaluated) {
+          real_tensors[i] = result;
+          input_tensors[i] = &real_tensors[i];
+          // We have more concrete information about a shape,
+          // so re-run shape inference.
+          rerun_shape_fn = true;
+        }
+      }
+      if (c->requested_input_tensor_as_partial_shape(i) &&
+          !attempted_tensor_as_shape_conversion[i]) {
+        attempted_tensor_as_shape_conversion[i] = true;
+        if (i >= input_tensors_as_shapes.size()) {
+          input_tensors_as_shapes.resize(i + 1);
+        }
+        ShapeHandle s;
+        TF_RETURN_IF_ERROR(ConstantPartialShape(c, node, i, &s));
+        input_tensors_as_shapes[i] = s;
+        rerun_shape_fn = true;
+      }
+    }
+
+    if (rerun_shape_fn) {
+      // We have more information about the shapes on this pass,
+      // so re-run shape inference.
+      c->set_input_tensors(input_tensors);
+      c->set_input_tensors_as_shapes(input_tensors_as_shapes);
+      if (op_reg_data->shape_inference_fn) {
+        TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(c));
+      } else {
+        TF_RETURN_IF_ERROR(shape_inference::UnknownShape(c));
+      }
+    }
+  } while (rerun_shape_fn);
+
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index b8d69fc05b8e423ffb7d85a6aa9b7aaceb29646f..75eb81c346f4f6087a12383a0111d4fd38eee9dc 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -32,6 +33,7 @@ namespace tensorflow {
 class ShapeRefiner {
  public:
   ShapeRefiner(int graph_def_version, const OpRegistryInterface* ops);
+  ~ShapeRefiner();
 
   // Performs validation of 'node' and runs 'node's shape function,
   // storing its shape outputs.
@@ -53,6 +55,11 @@ class ShapeRefiner {
   Status SetShape(const Node* node, int output_port,
                   shape_inference::ShapeHandle shape);
 
+  // Update the input shapes of node in case the shapes of the fan-ins of 'node'
+  // have themselves been modified (For example, in case of incremental shape
+  // refinement). Sets refined to true if any of the node shape has changed.
+  Status UpdateNode(const Node* node, bool* refined);
+
   // Returns the InferenceContext for 'node', if present.
   shape_inference::InferenceContext* GetContext(const Node* node) const {
     auto it = node_to_context_.find(node);
@@ -62,6 +69,14 @@ class ShapeRefiner {
     return it->second.get();
   }
 
+  // Getters and setters for graph_def_version_.
+  int32 graph_def_version() const { return graph_def_version_; }
+  void set_graph_def_version(int32 version) { graph_def_version_ = version; }
+
+  void set_require_shape_inference_fns(bool require_shape_inference_fns) {
+    require_shape_inference_fns_ = require_shape_inference_fns;
+  }
+
  private:
   // Extracts the subgraph ending at 'node' that is statically
   // computable and inserts into 'out_graph'. If statically computable,
@@ -98,9 +113,16 @@ class ShapeRefiner {
                               const Node* node, int dst_idx,
                               shape_inference::ShapeHandle* result);
 
-  const int graph_def_version_;
+  Status RunShapeFn(const Node* node, const OpRegistrationData* op_reg_data,
+                    shape_inference::InferenceContext* c);
+
+  int32 graph_def_version_;
   const OpRegistryInterface* const ops_registry_;
 
+  // The lifetime of the tensors are bound to the runner, so it should be the
+  // deleted after the tensors.
+  GraphRunner graph_runner_;
+
   // Stores a map from a node to its InferenceContext.
   //
   // Owns values.
@@ -118,6 +140,9 @@ class ShapeRefiner {
   // Only tensors less than 1KiB are currently stored in the cache.
   static constexpr int64 kMaxTensorSize = 1024;
   std::unordered_map<string, Tensor> const_tensor_map_;
+
+  bool require_shape_inference_fns_ = true;
+
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeRefiner);
 };
 
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index 05274ff311233b3eaf5ae86dd386298b89f7cd08..b8df6dd4f6203624eab6f14f89f9845f148be99e 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -126,6 +126,27 @@ TEST(ShapeRefinerTest, SetShape) {
   ASSERT_FALSE(m.SetShape(a.node(), 0, h).ok());
 }
 
+namespace {
+
+// An op with no shape function.
+REGISTER_OP("TestOpWithNoShapeFn").Input("a: int32").Output("o: int32");
+
+}  // namespace
+
+TEST(ShapeRefinerTest, MissingShapeInferenceFns) {
+  Scope root = Scope::NewRootScope();
+  auto a = ops::Const(root, 42);
+  Node* b;
+  TF_ASSERT_OK(NodeBuilder("b", "TestOpWithNoShapeFn")
+                   .Input(a.node())
+                   .Finalize(root.graph(), &b));
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  TF_ASSERT_OK(m.AddNode(a.node()));
+  EXPECT_FALSE(m.AddNode(b).ok());
+  m.set_require_shape_inference_fns(false);
+  TF_EXPECT_OK(m.AddNode(b));
+}
+
 TEST(ShapeRefinerTest, PropagateConstants) {
   // Reduction dimension is a variable, so we don't know its value.
   // So the output shape value is unknown (though its rank is known).
@@ -747,5 +768,38 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) {
             m.AddNode(result).error_message());
 }
 
+TEST(ShapeRefinerTest, IncrementalUpdates) {
+  Scope root = Scope::NewRootScope();
+  Graph* g = root.graph();
+  Node* queue;
+  TF_CHECK_OK(NodeBuilder("queue", "FIFOQueueV2")
+                  .Attr("component_types", {DT_FLOAT})
+                  .Finalize(g, &queue));
+  Node* dequeue;
+  TF_CHECK_OK(NodeBuilder("dequeue", "QueueDequeueV2")
+                  .Attr("component_types", {DT_FLOAT})
+                  .Input(queue)
+                  .Finalize(g, &dequeue));
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  TF_ASSERT_OK(m.AddNode(queue));
+  TF_ASSERT_OK(m.AddNode(dequeue));
+
+  // At this point, the shapes of the dequeued tensor are unknown.
+  shape_inference::InferenceContext* ctx = m.GetContext(dequeue);
+  EXPECT_EQ("?", ctx->DebugString(ctx->output(0)));
+
+  // Inject a shape, and incrementally propagate it to the dequeue op.
+  ctx = m.GetContext(queue);
+  shape_inference::ShapeHandle shp = ctx->MakeShape({3, 7});
+  ctx->set_output_handle_shape(0, shp);
+  ctx->set_output_handle_dtype(0, DT_FLOAT);
+
+  bool refined = false;
+  TF_ASSERT_OK(m.UpdateNode(dequeue, &refined));
+  EXPECT_TRUE(refined);
+  ctx = m.GetContext(dequeue);
+  EXPECT_EQ("[3,7]", ctx->DebugString(ctx->output(0)));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/simple_graph_execution_state.cc b/tensorflow/core/common_runtime/simple_graph_execution_state.cc
index dd0896e1ddade09a19365c2f52dfc13ee2e76418..b291a6f9948a889cf59d4616fd199bef80a77240 100644
--- a/tensorflow/core/common_runtime/simple_graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/simple_graph_execution_state.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/graph/validate.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -39,6 +37,13 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/util.h"
 
+#ifndef IS_MOBILE_PLATFORM
+#include "tensorflow/core/grappler/clusters/utils.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#endif  // IS_MOBILE_PLATFORM
+
 namespace tensorflow {
 
 SimpleGraphExecutionState::SimpleGraphExecutionState(
@@ -231,11 +236,14 @@ Status SimpleGraphExecutionState::InitBaseGraph(
     const BuildGraphOptions& options) {
   const GraphDef* graph_def = &original_graph_def_;
 
+#ifndef IS_MOBILE_PLATFORM
   GraphDef optimized_graph;
+
   const RewriterConfig& rewrite_options =
       session_options_->config.graph_options().rewrite_options();
+
   if (grappler::MetaOptimizerEnabled(rewrite_options)) {
-    // Adding this functionalty in steps. The first step is to make sure
+    // Adding this functionality in steps. The first step is to make sure
     // we don't break dependencies. The second step will be to turn the
     // functionality on by default.
     grappler::GrapplerItem item;
@@ -267,14 +275,22 @@ Status SimpleGraphExecutionState::InitBaseGraph(
     }
 
     if (s.ok()) {
-      s = grappler::RunMetaOptimizer(item, rewrite_options, &optimized_graph);
+      std::unordered_map<string, DeviceProperties> device_map;
+      for (const auto& device : device_set_->devices()) {
+        device_map[device->name()] =
+            grappler::GetDeviceInfo(device->parsed_name());
+      }
+      grappler::VirtualCluster cluster(device_map);
+      s = grappler::RunMetaOptimizer(item, rewrite_options, &cluster,
+                                     &optimized_graph);
     }
     if (s.ok()) {
       graph_def = &optimized_graph;
     }
   }
+#endif  // IS_MOBILE_PLATFORM
 
-  std::unique_ptr<Graph> new_graph(new Graph(flib_def_.get()));
+  std::unique_ptr<Graph> new_graph(new Graph(OpRegistry::Global()));
   GraphConstructorOptions opts;
   TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, *graph_def, new_graph.get()));
   for (const Node* n : new_graph->nodes()) {
@@ -284,9 +300,11 @@ Status SimpleGraphExecutionState::InitBaseGraph(
   if (session_options_ &&
       session_options_->config.graph_options().place_pruned_graph()) {
     // Rewrite the graph before placement.
+    rewrite_metadata_.reset(new subgraph::RewriteGraphMetadata);
     TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
         new_graph.get(), options.feed_endpoints, options.fetch_endpoints,
-        options.target_nodes, device_set_->client_device()->attributes()));
+        options.target_nodes, device_set_->client_device()->attributes(),
+        options.use_function_convention, rewrite_metadata_.get()));
   }
 
   // Save stateful placements before placing.
@@ -333,15 +351,26 @@ Status SimpleGraphExecutionState::BuildGraph(
   std::unique_ptr<Graph> ng(new Graph(flib_def_.get()));
   CopyGraph(*graph_, ng.get());
 
+  subgraph::RewriteGraphMetadata rewrite_metadata;
   if (session_options_ == nullptr ||
       !session_options_->config.graph_options().place_pruned_graph()) {
     // Extract the subset of the graph that needs to be run, adding feed/fetch
     // ops as needed.
     TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
         ng.get(), options.feed_endpoints, options.fetch_endpoints,
-        options.target_nodes, device_set_->client_device()->attributes()));
+        options.target_nodes, device_set_->client_device()->attributes(),
+        options.use_function_convention, &rewrite_metadata));
+  } else {
+    // This SimpleGraphExecutionState represents a graph that was
+    // pruned when this was constructed, so we copy the metadata from
+    // a member variable.
+    CHECK(rewrite_metadata_);
+    rewrite_metadata = *rewrite_metadata_;
   }
 
+  CHECK_EQ(options.feed_endpoints.size(), rewrite_metadata.feed_types.size());
+  CHECK_EQ(options.fetch_endpoints.size(), rewrite_metadata.fetch_types.size());
+
   // Make a fresh copy of the function library for the client graph.
   std::unique_ptr<FunctionLibraryDefinition> flib(
       new FunctionLibraryDefinition(*flib_def_));
@@ -363,7 +392,8 @@ Status SimpleGraphExecutionState::BuildGraph(
   // since the local CostModel used to record its stats is sized by
   // the largest node id.
   std::unique_ptr<SimpleClientGraph> dense_copy(
-      new SimpleClientGraph(std::move(flib)));
+      new SimpleClientGraph(std::move(flib), rewrite_metadata.feed_types,
+                            rewrite_metadata.fetch_types));
   CopyGraph(*ng, &dense_copy->graph);
 
   // TODO(vrv): We should check invariants of the graph here.
diff --git a/tensorflow/core/common_runtime/simple_graph_execution_state.h b/tensorflow/core/common_runtime/simple_graph_execution_state.h
index 3b6ce23c754182a4aaebba59d471adc506f1b126..00b5509fd78209727adaeeb4eea3275a5616077c 100644
--- a/tensorflow/core/common_runtime/simple_graph_execution_state.h
+++ b/tensorflow/core/common_runtime/simple_graph_execution_state.h
@@ -39,6 +39,10 @@ struct SessionOptions;
 class StepStats;
 class Timeline;
 
+namespace subgraph {
+struct RewriteGraphMetadata;
+}
+
 struct SimpleGraphExecutionStateOptions {
   const DeviceSet* device_set = nullptr;
   const SessionOptions* session_options = nullptr;
@@ -50,13 +54,19 @@ struct SimpleGraphExecutionStateOptions {
 // A SimpleClientGraph is simply a sub-graph of the full graph as induced by
 // BuildGraphOptions.
 struct SimpleClientGraph {
-  explicit SimpleClientGraph(std::unique_ptr<FunctionLibraryDefinition> flib)
-      : flib_def(std::move(flib)), graph(flib_def.get()) {}
+  explicit SimpleClientGraph(std::unique_ptr<FunctionLibraryDefinition> flib,
+                             DataTypeVector feed_types,
+                             DataTypeVector fetch_types)
+      : flib_def(std::move(flib)),
+        graph(flib_def.get()),
+        feed_types(std::move(feed_types)),
+        fetch_types(std::move(fetch_types)) {}
   // Each client-graph gets its own function library since optimization passes
   // post rewrite for execution might want to introduce new functions.
   std::unique_ptr<FunctionLibraryDefinition> flib_def;
   Graph graph;
-  int32 placement_version;
+  DataTypeVector feed_types;
+  DataTypeVector fetch_types;
 };
 
 // SimpleGraphExecutionState is responsible for generating an
@@ -190,6 +200,10 @@ class SimpleGraphExecutionState {
   // and may be updated by a graph optimization pass.
   std::unique_ptr<FunctionLibraryDefinition> flib_def_;
 
+  // `rewrite_metadata_` is only set for SimpleGraphExecutionState
+  // objects created by `MakeForPrunedGraph()`.
+  std::unique_ptr<subgraph::RewriteGraphMetadata> rewrite_metadata_;
+
   // The dataflow graph owned by this object.
   Graph* graph_;
 
diff --git a/tensorflow/core/common_runtime/simple_placer.cc b/tensorflow/core/common_runtime/simple_placer.cc
index f6e6bf069257031c84fe2106340d9375652b98a6..73f49706b442a2d3dcda3e3f7c75a3c6b302acbb 100644
--- a/tensorflow/core/common_runtime/simple_placer.cc
+++ b/tensorflow/core/common_runtime/simple_placer.cc
@@ -34,6 +34,11 @@ namespace tensorflow {
 
 namespace {
 
+// We hoist the conversion from C-style string literal to StringPiece here,
+// so that we can avoid the many repeated calls to strlen().
+const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
+const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
+
 // Returns a list of devices sorted by preferred type and then name
 // from 'devices' whose type is in 'supported_device_types'.  This
 // function searches the device types in 'supported_device_types' and
@@ -71,24 +76,26 @@ void ColocationGroups(const Node& node,
   std::vector<string> class_specs;
   // TODO(vrv): We should consider adding a GetNodeAttr that returns a
   // StringPiece, to avoid a copy.
-  Status s = GetNodeAttr(node.def(), kColocationAttrName, &class_specs);
-  if (!s.ok()) {
+  if (!GetNodeAttrSimple(node.attrs(), kColocationAttrNameStringPiece,
+                         &class_specs)) {
     // No attribute value is equivalent to the empty colocation_group.
-    *colocation_groups = {strings::StrCat(kColocationGroupPrefix, node.name())};
+    *colocation_groups = {
+        strings::StrCat(kColocationGroupPrefixStringPiece, node.name())};
     return;
   }
 
   bool found_spec = false;
   for (const string& class_spec : class_specs) {
     StringPiece spec(class_spec);
-    if (spec.Consume(kColocationGroupPrefix)) {
+    if (spec.Consume(kColocationGroupPrefixStringPiece)) {
       found_spec = true;
       colocation_groups->emplace_back(class_spec);
     }
   }
 
   if (!found_spec) {
-    *colocation_groups = {strings::StrCat(kColocationGroupPrefix, node.name())};
+    *colocation_groups = {
+        strings::StrCat(kColocationGroupPrefixStringPiece, node.name())};
   }
 }
 
@@ -322,7 +329,7 @@ class ColocationGraph {
         AddDebugInfo(node_root, &debug_info);
 
         DeviceNameUtils::ParsedName specified_device_name;
-        if (DeviceNameUtils::ParseFullName(node->def().device(),
+        if (DeviceNameUtils::ParseFullName(node->requested_device(),
                                            &specified_device_name) &&
             specified_device_name == members_[node_root].device_name) {
           // The specified device and merged set device match, and
@@ -341,28 +348,27 @@ class ColocationGraph {
             std::sort(device_names.begin(), device_names.end());
 
             return errors::InvalidArgument(
-                "Could not satisfy explicit device specification '",
-                node->def().device(),
-                "' because no devices matching that specification "
-                "are registered in this process; available devices: ",
-                str_util::Join(device_names, ", "), debug_info);
+                "Operation was explicitly assigned to ",
+                node->requested_device(), " but available devices are [ ",
+                str_util::Join(device_names, ", "), " ]. Make sure ",
+                "the device specification refers to a valid device.");
           } else if (specified_device_name.has_type) {
             return errors::InvalidArgument(
                 "Could not satisfy explicit device specification '",
-                node->def().device(), "' because no supported kernel for ",
+                node->requested_device(), "' because no supported kernel for ",
                 specified_device_name.type, " devices is available.",
                 debug_info);
           } else {
             return errors::InvalidArgument(
                 "Could not satisfy explicit device specification '",
-                node->def().device(), debug_info);
+                node->requested_device(), debug_info);
           }
         } else {
           // The specified device may be a valid device but the
           // merged set device is different, so print both.
           return errors::InvalidArgument(
               "Could not satisfy explicit device specification '",
-              node->def().device(),
+              node->requested_device(),
               "' because the node was colocated with a group of nodes that "
               "required incompatible device '",
               DeviceNameUtils::ParsedNameToString(
@@ -507,7 +513,7 @@ class ColocationGraph {
       return errors::Internal("Assigned device '", node.assigned_device_name(),
                               "' does not have registered OpKernel support "
                               "for ",
-                              node.def().op());
+                              node.type_string());
     } else {
       // This node has not yet been assigned to a device, so we
       // calculate any constraints due to the set of registered
@@ -521,25 +527,25 @@ class ColocationGraph {
           registered_device_types.insert(d->device_type());
         }
         return errors::InvalidArgument(
-            "No OpKernel was registered to support Op '", node.def().op(),
+            "No OpKernel was registered to support Op '", node.type_string(),
             "' with these attrs.  Registered devices: [",
             str_util::Join(registered_device_types, ","),
             "], Registered kernels:\n",
-            KernelsRegisteredForOp(node.def().op()));
+            KernelsRegisteredForOp(node.type_string()));
       }
 
       // If the NodeDef contains a device, then we interpret it as a
       // (partial) device specification.
-      if (!node.def().device().empty()) {
+      if (!node.requested_device().empty()) {
         // The user has specified a device in the NodeDef, try to find a
         // valid device matching their specification in the set of
         // devices.
         // NOTE: The full name may specify a device that is not in
         // n.supported_device_types(), but we check that in AssignDevice().
-        if (!DeviceNameUtils::ParseFullName(node.def().device(),
+        if (!DeviceNameUtils::ParseFullName(node.requested_device(),
                                             &member->device_name)) {
           return errors::InvalidArgument("Malformed device specification '",
-                                         node.def().device(), "'");
+                                         node.requested_device(), "'");
         }
       }
     }
@@ -638,7 +644,7 @@ Status SimplePlacer::Run() {
       continue;
     }
     status = colocation_graph.AddNode(*node);
-    if (!status.ok()) return AttachDef(status, node->def());
+    if (!status.ok()) return AttachDef(status, *node);
   }
 
   // 2. Enumerate the constraint edges, and use them to update the disjoint
@@ -654,7 +660,7 @@ Status SimplePlacer::Run() {
       if (!edge->IsControlEdge() &&
           (IsRefType(node->input_type(edge->dst_input())) ||
            node->input_type(edge->dst_input()) == DT_RESOURCE)) {
-        // If both the source node and this node have paritally
+        // If both the source node and this node have partially
         // specified a device, then 'node's device should be
         // cleared: the reference edge forces 'node' to be on the
         // same device as the source node.
@@ -701,7 +707,7 @@ Status SimplePlacer::Run() {
                                "be on the same device), but the two nodes "
                                "were assigned two different devices: ",
                                status.error_message()),
-                           node->def());
+                           *node);
         }
       }
     }
@@ -741,9 +747,9 @@ Status SimplePlacer::Run() {
     status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
-          errors::InvalidArgument("Cannot assign a device to node '",
+          errors::InvalidArgument("Cannot assign a device for operation '",
                                   node->name(), "': ", status.error_message()),
-          node->def());
+          *node);
     }
 
     // Returns the first device in sorted devices list so we will always
@@ -783,9 +789,9 @@ Status SimplePlacer::Run() {
     status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
-          errors::InvalidArgument("Cannot assign a device to node '",
+          errors::InvalidArgument("Cannot assign a device for operation '",
                                   node->name(), "': ", status.error_message()),
-          node->def());
+          *node);
     }
 
     string assigned_device = devices[0]->name();
@@ -801,7 +807,7 @@ Status SimplePlacer::Run() {
             return e->dst()->assigned_device_name() == output_device_name;
           });
 
-      if (consumers_on_same_device && 
+      if (consumers_on_same_device &&
           CanAssignToDevice(output_device_name, devices)) {
         assigned_device = output_device_name;
       }
diff --git a/tensorflow/core/common_runtime/simple_placer_test.cc b/tensorflow/core/common_runtime/simple_placer_test.cc
index c73ed041ed19aa34325c1c35c0c1647f86c87179..69ed58b33c10e860dcfc16e488115fc5cec47aac 100644
--- a/tensorflow/core/common_runtime/simple_placer_test.cc
+++ b/tensorflow/core/common_runtime/simple_placer_test.cc
@@ -66,7 +66,7 @@ class DummyOp : public OpKernel {
 class FakeDevice : public Device {
  private:
   explicit FakeDevice(const DeviceAttributes& device_attributes)
-      : Device(nullptr, device_attributes, nullptr) {}
+      : Device(nullptr, device_attributes) {}
 
  public:
   Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); }
@@ -237,7 +237,7 @@ class SimplePlacerTest : public ::testing::Test {
 
   Status ReferenceTestHelper(const string& variable_op_type,
                              const string& assign_op_type,
-                             DeviceType expected_device_type);
+                             const DeviceType& expected_device_type);
 };
 
 #define EXPECT_COLOCATED(g, name_a, name_b)                         \
@@ -500,9 +500,9 @@ TEST_F(SimplePlacerTest, TestAssignedGpuDeviceToCpuDevice) {
 // Build a graph containing a Variable op of "variable_op_type" and an
 // Assign op of "assign_op_type", and expect all of the ops to be
 // placed on a device of type "expected_device_type".
-Status SimplePlacerTest::ReferenceTestHelper(const string& variable_op_type,
-                                             const string& assign_op_type,
-                                             DeviceType expected_device_type) {
+Status SimplePlacerTest::ReferenceTestHelper(
+    const string& variable_op_type, const string& assign_op_type,
+    const DeviceType& expected_device_type) {
   Graph g(OpRegistry::Global());
   {  // Scope for temporary variables used to construct g.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -939,10 +939,7 @@ TEST_F(SimplePlacerTest, TestUnknownDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "Could not satisfy explicit device specification '/job:foo'"));
+  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
 }
 
 // Test that placement fails when the combination of partial
@@ -957,10 +954,7 @@ TEST_F(SimplePlacerTest, TestUnknownMergedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "Could not satisfy explicit device specification '/job:foo'"));
+  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
 }
 
 // Test that placement fails when the previously-assigned device for a
@@ -1107,10 +1101,7 @@ TEST_F(SimplePlacerTest, TestNonexistentGpuNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Could not satisfy explicit "
-                            "device specification "
-                            "'/device:fakegpu:11'"));
+  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakegpu:11"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1127,10 +1118,7 @@ TEST_F(SimplePlacerTest, TestUnsupportedDeviceNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Could not satisfy explicit "
-                            "device specification "
-                            "'/device:fakecpu:0'"));
+  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakecpu:0"));
   EXPECT_TRUE(
       StringPiece(s.error_message())
           .contains("no supported kernel for fakecpu devices is available"));
@@ -1151,12 +1139,9 @@ TEST_F(SimplePlacerTest, TestNonExistentDevice) {
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Could not satisfy explicit device specification "
-                    "'/job:foo/replica:17' "
-                    "because no devices matching that specification are "
-                    "registered in this process"));
+  EXPECT_TRUE(StringPiece(s.error_message())
+                  .contains("was explicitly assigned to /job:foo/replica:17 "
+                            "but available devices"));
 }
 
 TEST_F(SimplePlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index ca6ba7970f0352ee4a307ecdd20829b20dae551a..f5f8aab694698dab6151fcce7ed5c28da43ec36a 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -39,13 +39,16 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options,
                                    const DeviceLocality& locality,
                                    Allocator* allocator)
     : LocalDevice(options, Device::BuildDeviceAttributes(
-                               name, DEVICE_CPU, memory_limit, locality),
-                  allocator),
+                               name, DEVICE_CPU, memory_limit, locality)),
       allocator_(allocator) {}
 
 ThreadPoolDevice::~ThreadPoolDevice() {}
 
 void ThreadPoolDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
+  // When TraceMe profiling is off (which is the default), the
+  // following TraceMe constructor is simply a conditional test of
+  // false value. Measurements show that its overhead is negligible.
+  port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string());
   if (port::Tracing::IsActive()) {
     // TODO(pbar) We really need a useful identifier of the graph node.
     const uint64 id = Hash64(op_kernel->name());
diff --git a/tensorflow/core/common_runtime/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
index c83e4a4e3a1913c04257ee6036d447a9833b0d2f..8edf922d11ee1662b78771bfdc7c38e0144aee19 100644
--- a/tensorflow/core/common_runtime/visitable_allocator.h
+++ b/tensorflow/core/common_runtime/visitable_allocator.h
@@ -44,7 +44,7 @@ class VisitableAllocator : public Allocator {
 };
 
 // Needed for cases when a VisitableAllocator gets wrapped for tracking.
-// Multiple-inheritance is considered acceptible in this case because
+// Multiple-inheritance is considered acceptable in this case because
 // VisitableAllocator is a pure virtual interface and only TrackingAllocator
 // has default implementation.
 class TrackingVisitableAllocator : public TrackingAllocator,
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 65cbd14f14bfa5ad4c4387e7e1b2e63430a0f7f7..2fc49d4412e9cf75dfa7a1e6c4c57cc0cade874a 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -58,7 +58,7 @@ cc_library(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
-        ":debug_graph_utils",
+        ":debugger_state_impl",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:debug_ops_op_lib",
     ],
@@ -85,6 +85,19 @@ tf_cuda_library(
     alwayslink = 1,
 )
 
+tf_cuda_library(
+    name = "debugger_state_impl",
+    srcs = ["debugger_state_impl.cc"],
+    hdrs = ["debugger_state_impl.h"],
+    copts = tf_copts(),
+    linkstatic = 1,
+    deps = [
+        ":debug_graph_utils",
+        ":debug_io_utils",
+    ],
+    alwayslink = 1,
+)
+
 tf_cuda_library(
     name = "debug_graph_utils",
     srcs = ["debug_graph_utils.cc"],
@@ -92,7 +105,6 @@ tf_cuda_library(
     copts = tf_copts(),
     linkstatic = 1,
     deps = [
-        ":debug_io_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -135,24 +147,10 @@ tf_cuda_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "@grpc//:grpc++_unsecure",
-    ],
-    alwayslink = 1,
-)
-
-cc_binary(
-    name = "debug_test_server_main",
-    srcs = [
-        "debug_test_server_main.cc",
-    ],
-    deps = [
-        ":debug_grpc_testlib",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "@grpc//:grpc++_unsecure",
     ],
+    alwayslink = 1,
 )
 
 # TODO(cais): Fix flakiness on GPU and change this back to a tf_cc_test_gpu.
@@ -224,6 +222,31 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "grpc_session_debug_test",
+    size = "medium",
+    srcs = ["grpc_session_debug_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = ["nomac"],  # b/38276817
+    deps = [
+        ":debug_grpc_testlib",
+        ":debug_io_utils",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:master_proto_cc",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_session",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_testlib",
+        "//tensorflow/core/kernels:constant_op",
+        "//tensorflow/core/kernels:matmul_op",
+    ],
+)
+
 # TODO(cais): Add the following back in when tfdbg is supported on Android.
 # filegroup(
 #     name = "android_srcs",
diff --git a/tensorflow/core/debug/debug.cc b/tensorflow/core/debug/debug.cc
index c293b285c353cacedbf6264715d871ba553db59f..1aedfc2710e2024fa86abcaf9d33712bd516c847 100644
--- a/tensorflow/core/debug/debug.cc
+++ b/tensorflow/core/debug/debug.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
-#include "tensorflow/core/debug/debug_graph_utils.h"
+#include "tensorflow/core/debug/debugger_state_impl.h"
 
 namespace tensorflow {
 namespace {
@@ -30,10 +30,18 @@ class DebuggerStateRegistration {
     return std::unique_ptr<DebuggerStateInterface>(new DebuggerState(options));
   }
 
+  static std::unique_ptr<DebugGraphDecoratorInterface>
+  CreateDebugGraphDecorator(const DebugOptions& options) {
+    return std::unique_ptr<DebugGraphDecoratorInterface>(
+        new DebugGraphDecorator(options));
+  }
+
   DebuggerStateRegistration() {
     DebuggerStateRegistry::RegisterFactory(CreateDebuggerState);
+    DebugGraphDecoratorRegistry::RegisterFactory(CreateDebugGraphDecorator);
   }
 };
+
 static DebuggerStateRegistration register_debugger_state_implementation;
 
 }  // end namespace
diff --git a/tensorflow/core/debug/debug_gateway.cc b/tensorflow/core/debug/debug_gateway.cc
index 24b9dd799aa168e5f1a7e29cd793ce6c16579f46..1031ea843ed7874e2490714714cc2ce6abe09a66 100644
--- a/tensorflow/core/debug/debug_gateway.cc
+++ b/tensorflow/core/debug/debug_gateway.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/debug/debug_gateway.h"
 
+#include <utility>
+
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/session_factory.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -56,11 +58,11 @@ DebugGateway::~DebugGateway() {
 }
 
 void DebugGateway::SetNodeCompletionCallback(NodeCompletionCallback callback) {
-  comp_cb_ = callback;
+  comp_cb_ = std::move(callback);
 }
 
 void DebugGateway::SetNodeValueCallback(NodeValueCallback callback) {
-  val_cb_ = callback;
+  val_cb_ = std::move(callback);
 }
 
 void DebugGateway::CopyTensor(const string& node_name, const int output_slot,
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index d6f656c5ca3f38e7c80c3249a7a11b186cbd8f82..2911205db2ca43a678df303dd45997b02e17d6c8 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_gateway.h"
 
 #include <algorithm>
+#include <cstdlib>
 #include <unordered_map>
 
 #include "tensorflow/core/debug/debug_graph_utils.h"
@@ -758,6 +759,10 @@ TEST_F(SessionDebugVariableTest, VariableAssignWithDebugOps) {
   tensor_watch_opts->add_debug_ops(debug_identity);
   tensor_watch_opts->add_debug_ops(debug_nan_count);
 
+  char tempdir_template[] = "/tmp/tfdbg_XXXXXX";
+  string temp_dir(mkdtemp(tempdir_template));
+  tensor_watch_opts->add_debug_urls(strings::StrCat("file://", temp_dir));
+
   // Expected name of the inserted debug node
   string debug_identity_node_name = DebugNodeInserter::GetDebugNodeName(
       strings::StrCat(var_node_name_, ":", 0), 0, debug_identity);
diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc
index 4452f140a4e0b5d5be41c26e7d9e6bd56f39ee4b..f8f3d2ae506064aef109c344f726609f295c94c4 100644
--- a/tensorflow/core/debug/debug_graph_utils.cc
+++ b/tensorflow/core/debug/debug_graph_utils.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_graph_utils.h"
 
 #include "tensorflow/core/common_runtime/memory_types.h"
-#include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -26,73 +25,32 @@ limitations under the License.
 
 namespace tensorflow {
 
-DebuggerState::DebuggerState(const DebugOptions& debug_options)
-    : watches(debug_options.debug_tensor_watch_opts()), debug_urls_() {
-  for (const DebugTensorWatch& watch : watches) {
-    for (const string& url : watch.debug_urls()) {
-      debug_urls_.insert(url);
-    }
-  }
-}
-
-DebuggerState::~DebuggerState() {
-  for (const string& debug_url : debug_urls_) {
-    DebugIO::CloseDebugURL(debug_url).IgnoreError();
-  }
-}
-
-const string DebuggerState::SummarizeDebugTensorWatches() {
-  std::ostringstream oss;
-
-  for (const DebugTensorWatch& watch : watches) {
-    string tensor_name =
-        strings::StrCat(watch.node_name(), ":", watch.output_slot());
-    if (watch.tolerate_debug_op_creation_failures()) {
-      oss << "(TOL)";  // Shorthand for "tolerate".
-    }
-    oss << tensor_name << "|";
-
-    for (const string& debug_op : watch.debug_ops()) {
-      oss << debug_op << ",";
-    }
-
-    oss << "@";
-    for (const string& debug_url : watch.debug_urls()) {
-      oss << debug_url << ",";
-    }
-
-    oss << ";";
-  }
-
-  return oss.str();
-}
-
-Status DebuggerState::DecorateGraphForDebug(Graph* graph, Device* device) {
-  Status status;
-
-  DebugNodeInserter::DeparallelizeWhileLoops(graph, device);
-  status.Update(DebugNodeInserter::InsertNodes(watches, graph, device));
-  if (status.ok()) {
-    status.Update(DebugIO::PublishGraph(*graph, debug_urls_));
+namespace {
+
+// TODO(cais): Switch to safe_strtob when available.
+Status ParseBoolString(const string& bool_str, bool* bool_val) {
+  const string lower_bool_str = str_util::Lowercase(bool_str);
+  if (lower_bool_str == "false" || lower_bool_str == "f" ||
+      lower_bool_str == "0") {
+    *bool_val = false;
+  } else if (lower_bool_str == "true" || lower_bool_str == "t" ||
+             lower_bool_str == "1") {
+    *bool_val = true;
+  } else {
+    return errors::InvalidArgument("Invalid string for bool value: ", bool_str);
   }
-
-  return status;
+  return Status::OK();
 }
 
-Status DebuggerState::PublishDebugMetadata(
-    const int64 global_step, const int64 session_run_count,
-    const int64 executor_step_count, const std::vector<string>& input_names,
-    const std::vector<string>& output_names,
-    const std::vector<string>& target_nodes) {
-  return DebugIO::PublishDebugMetadata(global_step, session_run_count,
-                                       executor_step_count, input_names,
-                                       output_names, target_nodes, debug_urls_);
-}
+}  // namespace
 
 // static
 Status DebugNodeInserter::InsertNodes(
     const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph,
     Device* device) {
+  // TODO(cais): This method is getting too large in size.
+  // Refactor it with helpers.
+
   if (watches.empty()) {
     // Nothing to do: Return OK right away.
     return Status::OK();
@@ -191,7 +149,8 @@ Status DebugNodeInserter::InsertNodes(
       Node* copy_node;
       Status copy_s = CreateCopyNode(
           graph, device_type, memory_type == HOST_MEMORY, src_node->name(),
-          src_output_slot, src_dt, tensor_name, &copy_node);
+          src_output_slot, src_dt, tensor_name, tensor_watches[tensor_name],
+          tensor_watch_urls[tensor_name], &copy_node);
       if (!copy_s.ok()) {
         return Status(
             error::FAILED_PRECONDITION,
@@ -264,19 +223,16 @@ Status DebugNodeInserter::InsertNodes(
 void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) {
   for (Node* node : graph->nodes()) {
     if (node->IsEnter()) {
-      for (const auto& attr : node->def().attr()) {
-        if (attr.first == "parallel_iterations") {
-          if (attr.second.i() > 1) {
-            LOG(INFO) << "For debugging, tfdbg is changing the "
-                      << "parallel_iterations attribute of the Enter/RefEnter "
-                      << "node \"" << node->name() << "\" on device \""
-                      << device->name() << "\" from " << attr.second.i()
-                      << " to 1. (This does not affect subsequent non-debug "
-                      << "runs.)";
-            node->AddAttr<int64>("parallel_iterations", 1);
-          }
-          break;
-        }
+      const AttrValue* parallel_iterations =
+          node->attrs().Find("parallel_iterations");
+      if (parallel_iterations && parallel_iterations->i() > 1) {
+        LOG(INFO) << "For debugging, tfdbg is changing the "
+                  << "parallel_iterations attribute of the Enter/RefEnter "
+                  << "node \"" << node->name() << "\" on device \""
+                  << device->name() << "\" from " << parallel_iterations->i()
+                  << " to 1. (This does not affect subsequent non-debug "
+                  << "runs.)";
+        node->AddAttr<int64>("parallel_iterations", 1);
       }
     }
   }
@@ -305,15 +261,40 @@ const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name,
 Status DebugNodeInserter::CreateCopyNode(
     Graph* graph, const DeviceType device_type, const bool is_host_memory,
     const string& src_node_name, const int src_output, const DataType src_dt,
-    const string& tensor_name, Node** copy_node) {
+    const string& tensor_name, const std::vector<string>& debug_ops,
+    const std::vector<string>& debug_urls, Node** copy_node) {
+  const string kGatedGrpcAttributeKey = "gated_grpc";
+
   NodeDef node_def;
   const KernelDef* kdef;
 
   const string copy_op_name = is_host_memory ? "CopyHost" : "Copy";
   const string copy_node_name = GetCopyNodeName(src_node_name, src_output);
 
+  // Cross debug_ops and debug_urls to get the list of debug ops and watches.
+  std::vector<string> debug_ops_spec;
+  for (const string& debug_op : debug_ops) {
+    for (const string& debug_url : debug_urls) {
+      string debug_op_name_proper;
+      std::unordered_map<string, string> custom_attributes;
+      TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper,
+                                          &custom_attributes));
+
+      bool gated_grpc_value = false;
+      if (custom_attributes.find(kGatedGrpcAttributeKey) !=
+          custom_attributes.end()) {
+        TF_RETURN_IF_ERROR(ParseBoolString(
+            custom_attributes[kGatedGrpcAttributeKey], &gated_grpc_value));
+      }
+      debug_ops_spec.push_back(strings::StrCat(debug_op_name_proper, ";",
+                                               debug_url, ";",
+                                               gated_grpc_value ? "1" : "0"));
+    }
+  }
+
   auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
-                     .Input(src_node_name, src_output, src_dt);
+                     .Input(src_node_name, src_output, src_dt)
+                     .Attr("debug_ops_spec", std::move(debug_ops_spec));
 
   if (!builder.Finalize(&node_def).ok()) {
     return Status(
@@ -422,16 +403,13 @@ Status DebugNodeInserter::SetDebugNodeAttributes(
         }
         debug_node->AddAttr<int>(attr.name(), int_value);
       } else if (attr.type() == "bool") {
-        string bool_str = str_util::Lowercase(attr_value);
-        if (bool_str == "false" || bool_str == "f" || bool_str == "0") {
-          debug_node->AddAttr<bool>(attr.name(), false);
-        } else if (bool_str == "true" || bool_str == "t" || bool_str == "1") {
-          debug_node->AddAttr<bool>(attr.name(), true);
-        } else {
+        bool bool_value;
+        if (!ParseBoolString(attr_value, &bool_value).ok()) {
           return errors::InvalidArgument(
               "Invalid value string for bool-type attribute ", attr.name(),
               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
         }
+        debug_node->AddAttr<bool>(attr.name(), bool_value);
       } else {
         return errors::InvalidArgument(
             "Unsupported type of custom attribute for debug ops: ",
diff --git a/tensorflow/core/debug/debug_graph_utils.h b/tensorflow/core/debug/debug_graph_utils.h
index 23bf0afc1a7edebf91c257d3067569d54982c605..fa8b33b98ab03b4c30b574962306844d7ee945e7 100644
--- a/tensorflow/core/debug/debug_graph_utils.h
+++ b/tensorflow/core/debug/debug_graph_utils.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_DEBUG_NODE_INSERTER_H_
 
 #include <unordered_map>
-#include <unordered_set>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
@@ -29,35 +28,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-class DebuggerState : public DebuggerStateInterface {
- public:
-  DebuggerState(const DebugOptions& debug_options);
-  virtual ~DebuggerState();
-
-  // Returns a summary string for RepeatedPtrFields of DebugTensorWatches.
-  const string SummarizeDebugTensorWatches() override;
-
-  // Insert special-purpose debug nodes to graph. See the documentation of
-  // DebugNodeInserter::InsertNodes() for details.
-  Status DecorateGraphForDebug(Graph* graph, Device* device) override;
-
-  const protobuf::RepeatedPtrField<DebugTensorWatch>& watches;
-
-  // Publish metadata about the debugged Session::Run() call.
-  //
-  // See the doc string of DebuggerStateInterface::PublishDebugMetadata() for
-  // details.
-  Status PublishDebugMetadata(const int64 global_step,
-                              const int64 session_run_count,
-                              const int64 executor_step_count,
-                              const std::vector<string>& input_names,
-                              const std::vector<string>& output_names,
-                              const std::vector<string>& target_names);
-
- private:
-  std::unordered_set<string> debug_urls_;
-};
-
 class DebugNodeInserter {
  public:
   // EXPERIMENTAL: Insert special debug ops (e.g., DebugIdentity) to graph for
@@ -121,11 +91,11 @@ class DebugNodeInserter {
                                        const string& debug_op_name);
 
  private:
-  static Status CreateCopyNode(Graph* graph, const DeviceType device_type,
-                               const bool is_host_memory,
-                               const string& src_node_name,
-                               const int src_output, const DataType src_dt,
-                               const string& tensor_name, Node** copy_node);
+  static Status CreateCopyNode(
+      Graph* graph, const DeviceType device_type, const bool is_host_memory,
+      const string& src_node_name, const int src_output, const DataType src_dt,
+      const string& tensor_name, const std::vector<string>& debug_ops,
+      const std::vector<string>& debug_urls, Node** copy_node);
 
   // Parse the debug_op_name string to extract proper op name and attributes.
   // debug_op_name can be the proper op name only, e.g., "DebugNumericSummary".
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index 15ad08199671b6712d5fb8000cdf834d8b9e95b9..d9fab87aed1a8fe24bbf6237374afe7fa1a26282 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/debug/debug_graph_utils.h"
 #include "tensorflow/core/debug/debug_io_utils.h"
+#include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
@@ -33,63 +34,113 @@ namespace test {
   Event event;
 
   while (stream->Read(&event)) {
-    const Summary::Value& val = event.summary().value(0);
-
-    std::vector<string> name_items =
-        tensorflow::str_util::Split(val.node_name(), ':');
-
-    const string node_name = name_items[0];
-    int32 output_slot = 0;
-    tensorflow::strings::safe_strto32(name_items[1], &output_slot);
-    const string debug_op = name_items[2];
-
-    const TensorProto& tensor_proto = val.tensor();
-    Tensor tensor(tensor_proto.dtype());
-    if (!tensor.FromProto(tensor_proto)) {
-      return ::grpc::Status::CANCELLED;
+    if (event.has_log_message()) {
+      debug_metadata_strings.push_back(event.log_message().message());
+    } else if (!event.graph_def().empty()) {
+      encoded_graph_defs.push_back(event.graph_def());
+    } else if (event.has_summary()) {
+      const Summary::Value& val = event.summary().value(0);
+
+      std::vector<string> name_items =
+          tensorflow::str_util::Split(val.node_name(), ':');
+
+      const string node_name = name_items[0];
+      int32 output_slot = 0;
+      tensorflow::strings::safe_strto32(name_items[1], &output_slot);
+      const string debug_op = name_items[2];
+
+      const TensorProto& tensor_proto = val.tensor();
+      Tensor tensor(tensor_proto.dtype());
+      if (!tensor.FromProto(tensor_proto)) {
+        return ::grpc::Status::CANCELLED;
+      }
+
+      node_names.push_back(node_name);
+      output_slots.push_back(output_slot);
+      debug_ops.push_back(debug_op);
+      debug_tensors.push_back(tensor);
     }
+  }
 
-    string dump_path;
-    DebugFileIO::DumpTensorToDir(node_name, output_slot, debug_op, tensor,
-                                 event.wall_time(), dump_root, &dump_path)
-        .IgnoreError();
+  {
+    mutex_lock l(changes_mu_);
+    for (size_t i = 0; i < changes_to_enable_.size(); ++i) {
+      EventReply event_reply;
+      EventReply::DebugOpStateChange* change =
+          event_reply.add_debug_op_state_changes();
+      change->set_change(changes_to_enable_[i]
+                             ? EventReply::DebugOpStateChange::ENABLE
+                             : EventReply::DebugOpStateChange::DISABLE);
+      change->set_node_name(changes_node_names_[i]);
+      change->set_output_slot(changes_output_slots_[i]);
+      change->set_debug_op(changes_debug_ops_[i]);
+      stream->Write(event_reply);
+    }
+    changes_to_enable_.clear();
+    changes_node_names_.clear();
+    changes_output_slots_.clear();
+    changes_debug_ops_.clear();
   }
 
   return ::grpc::Status::OK;
 }
 
-GrpcTestServerClientPair::GrpcTestServerClientPair(const int server_port)
-    : server_port(server_port) {
-  const int kTensorSize = 2;
-  prep_tensor_.reset(
-      new Tensor(DT_FLOAT, TensorShape({kTensorSize, kTensorSize})));
-  for (int i = 0; i < kTensorSize * kTensorSize; ++i) {
-    prep_tensor_->flat<float>()(i) = static_cast<float>(i);
-  }
+void TestEventListenerImpl::ClearReceivedDebugData() {
+  debug_metadata_strings.clear();
+  encoded_graph_defs.clear();
+  node_names.clear();
+  output_slots.clear();
+  debug_ops.clear();
+  debug_tensors.clear();
+}
+
+void TestEventListenerImpl::RequestDebugOpStateChangeAtNextStream(
+    bool to_enable, const string& node_name, const int32 output_slot,
+    const string& debug_op) {
+  mutex_lock l(changes_mu_);
 
-  // Obtain server's gRPC url.
-  test_server_url = strings::StrCat("grpc://localhost:", server_port);
+  changes_to_enable_.push_back(to_enable);
+  changes_node_names_.push_back(node_name);
+  changes_output_slots_.push_back(output_slot);
+  changes_debug_ops_.push_back(debug_op);
+}
+
+void TestEventListenerImpl::RunServer(const int server_port) {
+  ::grpc::ServerBuilder builder;
+  builder.AddListeningPort(strings::StrCat("localhost:", server_port),
+                           ::grpc::InsecureServerCredentials());
+  builder.RegisterService(this);
+  std::unique_ptr<::grpc::Server> server = builder.BuildAndStart();
+
+  while (!stop_requested_.load()) {
+    Env::Default()->SleepForMicroseconds(200 * 1000);
+  }
+  server->Shutdown();
+  stopped_.store(true);
+}
 
-  // Obtain dump directory for the stream server.
-  string tmp_dir = port::Tracing::LogDir();
-  dump_root =
-      io::JoinPath(tmp_dir, strings::StrCat("tfdbg_dump_port", server_port, "_",
-                                            Env::Default()->NowMicros()));
+void TestEventListenerImpl::StopServer() {
+  stop_requested_.store(true);
+  while (!stopped_.load()) {
+  }
 }
 
-bool GrpcTestServerClientPair::PollTillFirstRequestSucceeds() {
-  const std::vector<string> urls({test_server_url});
-  int n_attempts = 0;
+bool PollTillFirstRequestSucceeds(const string& server_url,
+                                  const size_t max_attempts) {
+  const int kSleepDurationMicros = 100 * 1000;
+  size_t n_attempts = 0;
   bool success = false;
 
   // Try a number of times to send the Event proto to the server, as it may
   // take the server a few seconds to start up and become responsive.
-  while (n_attempts++ < kMaxAttempts) {
-    const uint64 wall_time = Env::Default()->NowMicros();
+  Tensor prep_tensor(DT_FLOAT, TensorShape({1, 1}));
+  prep_tensor.flat<float>()(0) = 42.0f;
 
+  while (n_attempts++ < max_attempts) {
+    const uint64 wall_time = Env::Default()->NowMicros();
     Status publish_s = DebugIO::PublishDebugTensor(
-        "prep_node:0", "DebugIdentity", *prep_tensor_, wall_time, urls);
-    Status close_s = DebugIO::CloseDebugURL(test_server_url);
+        "prep_node:0", "DebugIdentity", prep_tensor, wall_time, {server_url});
+    Status close_s = DebugIO::CloseDebugURL(server_url);
 
     if (publish_s.ok() && close_s.ok()) {
       success = true;
diff --git a/tensorflow/core/debug/debug_grpc_testlib.h b/tensorflow/core/debug/debug_grpc_testlib.h
index 0e7a78e38570daa0e4fc9d8d09b5d2da101baacd..c2b96e78c5648a1c32f341ffbf6829c85a46f88a 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.h
+++ b/tensorflow/core/debug/debug_grpc_testlib.h
@@ -16,9 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_DEBUG_GRPC_TESTLIB_H_
 #define TENSORFLOW_DEBUG_GRPC_TESTLIB_H_
 
+#include <atomic>
+
 #include "grpc++/grpc++.h"
 #include "tensorflow/core/debug/debug_service.grpc.pb.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 
@@ -26,41 +29,54 @@ namespace test {
 
 class TestEventListenerImpl final : public EventListener::Service {
  public:
-  TestEventListenerImpl(const string& dump_root) : dump_root(dump_root) {}
+  TestEventListenerImpl() : stop_requested_(false), stopped_(false) {}
+
+  void RunServer(const int server_port);
+  void StopServer();
 
   ::grpc::Status SendEvents(
       ::grpc::ServerContext* context,
       ::grpc::ServerReaderWriter< ::tensorflow::EventReply,
                                   ::tensorflow::Event>* stream);
 
-  string dump_root;
-};
-
-class GrpcTestServerClientPair {
- public:
-  GrpcTestServerClientPair(const int server_port);
-  virtual ~GrpcTestServerClientPair() {}
-
-  // Keep sending requests to the test server until the first success.
-  // This is necessary because the server may take a certain amount of time
-  // to start up and become responsive.
-  //
-  // Returns: A boolean indicating whether a successful response is obtained
-  //   within the limit of maximum number of attempts.
-  bool PollTillFirstRequestSucceeds();
+  // Clear debug data (e.g., Tensors) received so far.
+  void ClearReceivedDebugData();
 
-  string dump_root;
+  void RequestDebugOpStateChangeAtNextStream(bool to_enable,
+                                             const string& node_name,
+                                             const int32 output_slot,
+                                             const string& debug_op);
 
-  int server_port;
-  string test_server_url;
+  std::vector<string> debug_metadata_strings;
+  std::vector<string> encoded_graph_defs;
+  std::vector<string> node_names;
+  std::vector<int32> output_slots;
+  std::vector<string> debug_ops;
+  std::vector<Tensor> debug_tensors;
 
  private:
-  std::unique_ptr<Tensor> prep_tensor_;
+  std::atomic_bool stop_requested_;
+  std::atomic_bool stopped_;
 
-  const int kMaxAttempts = 100;
-  const int kSleepDurationMicros = 100 * 1000;
+  std::vector<bool> changes_to_enable_ GUARDED_BY(changes_mu_);
+  std::vector<string> changes_node_names_ GUARDED_BY(changes_mu_);
+  std::vector<int32> changes_output_slots_ GUARDED_BY(changes_mu_);
+  std::vector<string> changes_debug_ops_ GUARDED_BY(changes_mu_);
+
+  mutex changes_mu_;
 };
 
+// Poll a gRPC debug server by sending a small tensor repeatedly till success.
+//
+// Args:
+//   server_url: gRPC URL of the server to poll, e.g., "grpc://foo:3333".
+//   max_attempts: Maximum number of attempts.
+//
+// Returns:
+//   Whether the polling succeeded within max_attempts.
+bool PollTillFirstRequestSucceeds(const string& server_url,
+                                  const size_t max_attempts);
+
 }  // namespace test
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index da68d1c24797dd686daf7ec5c5473cfeb262f090..43bb3039b51cab6688670d975342008c8e82e377 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include <vector>
 
+#if defined(PLATFORM_GOOGLE)
 #include "grpc++/create_channel.h"
+#endif
 
 #if defined(PLATFORM_WINDOWS)
 // winsock2.h is used in grpc, so Ws2_32.lib is needed
@@ -25,6 +27,7 @@ limitations under the License.
 #endif
 
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -84,6 +87,37 @@ string AppendTimestampToFilePath(const string& in, const uint64 timestamp) {
   return out;
 }
 
+#if defined(PLATFORM_GOOGLE)
+Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
+                                      const string& device_name,
+                                      const int64 wall_time,
+                                      const string& debug_url) {
+  static const size_t kChunkSizeLimitBytes = 4000 * 1024;
+  const uint64 hash = ::tensorflow::Hash64(encoded_graph_def);
+  const size_t total_length = encoded_graph_def.size();
+  const size_t num_chunks = static_cast<size_t>(
+      std::ceil(static_cast<float>(total_length) / kChunkSizeLimitBytes));
+  for (size_t i = 0; i < num_chunks; ++i) {
+    const size_t pos = i * kChunkSizeLimitBytes;
+    const size_t len =
+        (i == num_chunks - 1) ? (total_length - pos) : kChunkSizeLimitBytes;
+    Event event;
+    event.set_wall_time(static_cast<double>(wall_time));
+    // Prefix the chunk with
+    //   <hash64>,<device_name>,<wall_time>|<index>|<num_chunks>|.
+    event.set_graph_def(strings::StrCat(hash, ",", device_name, ",", wall_time,
+                                        "|", i, "|", num_chunks, "|",
+                                        encoded_graph_def.substr(pos, len)));
+    if (!DebugGrpcIO::SendEventProtoThroughGrpcStream(event, debug_url).ok()) {
+      return errors::FailedPrecondition(
+          "Failed to send chunk ", i, " of ", num_chunks,
+          " of encoded GraphDef of size ", encoded_graph_def.size(), " bytes");
+    }
+  }
+  return Status::OK();
+}
+#endif
+
 }  // namespace
 
 Status ReadEventFromFile(const string& dump_file_path, Event* event) {
@@ -210,7 +244,8 @@ Status DebugIO::PublishDebugMetadata(
 Status DebugIO::PublishDebugTensor(const string& tensor_name,
                                    const string& debug_op, const Tensor& tensor,
                                    const uint64 wall_time_us,
-                                   const gtl::ArraySlice<string>& debug_urls) {
+                                   const gtl::ArraySlice<string>& debug_urls,
+                                   const bool gated_grpc) {
   // Split the tensor_name into node name and output slot index.
   std::vector<string> name_items = str_util::Split(tensor_name, ':');
   string node_name;
@@ -230,7 +265,7 @@ Status DebugIO::PublishDebugTensor(const string& tensor_name,
         strings::StrCat("Failed to parse tensor name: \"", tensor_name, "\""));
   }
 
-  int num_failed_urls = 0;
+  int32 num_failed_urls = 0;
   std::vector<Status> fail_statuses;
   for (const string& url : debug_urls) {
     if (str_util::Lowercase(url).find(kFileURLScheme) == 0) {
@@ -246,7 +281,8 @@ Status DebugIO::PublishDebugTensor(const string& tensor_name,
     } else if (str_util::Lowercase(url).find(kGrpcURLScheme) == 0) {
 #if defined(PLATFORM_GOOGLE)
       Status s = DebugGrpcIO::SendTensorThroughGrpcStream(
-          node_name, output_slot, debug_op, tensor, wall_time_us, url);
+          node_name, output_slot, debug_op, tensor, wall_time_us, url,
+          gated_grpc);
 
       if (!s.ok()) {
         num_failed_urls++;
@@ -277,7 +313,16 @@ Status DebugIO::PublishDebugTensor(const string& tensor_name,
 }
 
 // static
-Status DebugIO::PublishGraph(const Graph& graph,
+Status DebugIO::PublishDebugTensor(const string& tensor_name,
+                                   const string& debug_op, const Tensor& tensor,
+                                   const uint64 wall_time_us,
+                                   const gtl::ArraySlice<string>& debug_urls) {
+  return PublishDebugTensor(tensor_name, debug_op, tensor, wall_time_us,
+                            debug_urls, false);
+}
+
+// static
+Status DebugIO::PublishGraph(const Graph& graph, const string& device_name,
                              const std::unordered_set<string>& debug_urls) {
   GraphDef graph_def;
   graph.ToGraphDef(&graph_def);
@@ -294,14 +339,16 @@ Status DebugIO::PublishGraph(const Graph& graph,
   for (const string& debug_url : debug_urls) {
     if (debug_url.find(kFileURLScheme) == 0) {
       const string dump_root_dir = debug_url.substr(strlen(kFileURLScheme));
+      // TODO(cais): (b/38325442) Serialize the GraphDef to a directory that
+      // reflects the device name.
       const string file_name = strings::StrCat("_tfdbg_graph_", now_micros);
 
       status.Update(
           DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name));
     } else if (debug_url.find(kGrpcURLScheme) == 0) {
 #if defined(PLATFORM_GOOGLE)
-      status.Update(
-          DebugGrpcIO::SendEventProtoThroughGrpcStream(event, debug_url));
+      status.Update(PublishEncodedGraphDefInChunks(buf, device_name, now_micros,
+                                                   debug_url));
 #else
       GRPC_OSS_UNIMPLEMENTED_ERROR;
 #endif
@@ -311,6 +358,60 @@ Status DebugIO::PublishGraph(const Graph& graph,
   return status;
 }
 
+// static
+bool DebugIO::IsCopyNodeGateOpen(
+    const std::vector<DebugWatchAndURLSpec>& specs) {
+#if defined(PLATFORM_GOOGLE)
+  for (const DebugWatchAndURLSpec& spec : specs) {
+    if (!spec.gated_grpc || spec.url.compare(0, strlen(DebugIO::kGrpcURLScheme),
+                                             DebugIO::kGrpcURLScheme)) {
+      return true;
+    } else {
+      if (DebugGrpcIO::IsGateOpen(spec.watch_key, spec.url)) {
+        return true;
+      }
+    }
+  }
+  return false;
+#else
+  return true;
+#endif
+}
+
+// static
+bool DebugIO::IsDebugNodeGateOpen(const string& watch_key,
+                                  const std::vector<string>& debug_urls) {
+#if defined(PLATFORM_GOOGLE)
+  for (const string& debug_url : debug_urls) {
+    if (debug_url.compare(0, strlen(DebugIO::kGrpcURLScheme),
+                          DebugIO::kGrpcURLScheme)) {
+      return true;
+    } else {
+      if (DebugGrpcIO::IsGateOpen(watch_key, debug_url)) {
+        return true;
+      }
+    }
+  }
+  return false;
+#else
+  return true;
+#endif
+}
+
+// static
+bool DebugIO::IsDebugURLGateOpen(const string& watch_key,
+                                 const string& debug_url) {
+#if defined(PLATFORM_GOOGLE)
+  if (debug_url.find(kGrpcURLScheme) != 0) {
+    return true;
+  } else {
+    return DebugGrpcIO::IsGateOpen(watch_key, debug_url);
+  }
+#else
+  return true;
+#endif
+}
+
 // static
 Status DebugIO::CloseDebugURL(const string& debug_url) {
   if (debug_url.find(DebugIO::kGrpcURLScheme) == 0) {
@@ -430,17 +531,26 @@ Status DebugFileIO::RecursiveCreateDir(Env* env, const string& dir) {
 
 #if defined(PLATFORM_GOOGLE)
 DebugGrpcChannel::DebugGrpcChannel(const string& server_stream_addr)
-    : ctx_(),
-      channel_(::grpc::CreateCustomChannel(server_stream_addr,
-                                           ::grpc::InsecureChannelCredentials(),
-                                           ::grpc::ChannelArguments())),
-      stub_(EventListener::NewStub(channel_)),
-      reader_writer_(stub_->SendEvents(&ctx_)),
-      mu_() {}
-// TODO(cais): Set GRPC_ARG_MAX_MESSAGE_LENGTH to max if necessary.
-
-bool DebugGrpcChannel::is_channel_ready() {
-  return channel_->GetState(false) == GRPC_CHANNEL_READY;
+    : server_stream_addr_(server_stream_addr),
+      url_(strings::StrCat(DebugIO::kGrpcURLScheme, server_stream_addr)) {}
+
+Status DebugGrpcChannel::Connect(const int64 timeout_micros) {
+  ::grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+  // Avoid problems where default reconnect backoff is too long (e.g., 20 s).
+  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  channel_ = ::grpc::CreateCustomChannel(
+      server_stream_addr_, ::grpc::InsecureChannelCredentials(), args);
+  if (!channel_->WaitForConnected(
+          gpr_time_add(gpr_now(GPR_CLOCK_REALTIME),
+                       gpr_time_from_micros(timeout_micros, GPR_TIMESPAN)))) {
+    return errors::FailedPrecondition(
+        "Failed to connect to gRPC channel at ", server_stream_addr_,
+        " within a timeout of ", timeout_micros / 1e6, " s.");
+  }
+  stub_ = EventListener::NewStub(channel_);
+  reader_writer_ = stub_->SendEvents(&ctx_);
+  return Status::OK();
 }
 
 bool DebugGrpcChannel::WriteEvent(const Event& event) {
@@ -449,10 +559,29 @@ bool DebugGrpcChannel::WriteEvent(const Event& event) {
   return reader_writer_->Write(event);
 }
 
-Status DebugGrpcChannel::Close() {
+Status DebugGrpcChannel::ReceiveServerRepliesAndClose() {
   mutex_lock l(mu_);
 
   reader_writer_->WritesDone();
+
+  // Read all EventReply messages (if any) from the server.
+  EventReply event_reply;
+  while (reader_writer_->Read(&event_reply)) {
+    for (const EventReply::DebugOpStateChange& debug_op_state_change :
+         event_reply.debug_op_state_changes()) {
+      string watch_key = strings::StrCat(debug_op_state_change.node_name(), ":",
+                                         debug_op_state_change.output_slot(),
+                                         ":", debug_op_state_change.debug_op());
+      if (debug_op_state_change.change() ==
+          EventReply::DebugOpStateChange::ENABLE) {
+        DebugGrpcIO::EnableWatchKey(url_, watch_key);
+      } else if (debug_op_state_change.change() ==
+                 EventReply::DebugOpStateChange::DISABLE) {
+        DebugGrpcIO::DisableWatchKey(url_, watch_key);
+      }
+    }
+  }
+
   if (reader_writer_->Finish().ok()) {
     return Status::OK();
   } else {
@@ -463,22 +592,35 @@ Status DebugGrpcChannel::Close() {
 
 // static
 mutex DebugGrpcIO::streams_mu;
-std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>
-    DebugGrpcIO::stream_channels;
 
 // static
-Status DebugGrpcIO::SendTensorThroughGrpcStream(const string& node_name,
-                                                const int32 output_slot,
-                                                const string& debug_op,
-                                                const Tensor& tensor,
-                                                const uint64 wall_time_us,
-                                                const string& grpc_stream_url) {
-  const string tensor_name = strings::StrCat(node_name, ":", output_slot);
+int64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000;
+// TODO(cais): Make this configurable?
 
-  // Prepare tensor Event data to be sent.
-  Event event = WrapTensorAsEvent(tensor_name, debug_op, tensor, wall_time_us);
+// static
+std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
+DebugGrpcIO::GetStreamChannels() {
+  static std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
+      stream_channels =
+          new std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>();
+  return stream_channels;
+}
 
-  return SendEventProtoThroughGrpcStream(event, grpc_stream_url);
+// static
+Status DebugGrpcIO::SendTensorThroughGrpcStream(
+    const string& node_name, const int32 output_slot, const string& debug_op,
+    const Tensor& tensor, const uint64 wall_time_us,
+    const string& grpc_stream_url, const bool gated) {
+  if (gated &&
+      !IsGateOpen(strings::StrCat(node_name, ":", output_slot, ":", debug_op),
+                  grpc_stream_url)) {
+    return Status::OK();
+  } else {
+    const string tensor_name = strings::StrCat(node_name, ":", output_slot);
+    return SendEventProtoThroughGrpcStream(
+        WrapTensorAsEvent(tensor_name, debug_op, tensor, wall_time_us),
+        grpc_stream_url);
+  }
 }
 
 // static
@@ -492,45 +634,114 @@ Status DebugGrpcIO::SendEventProtoThroughGrpcStream(
   std::shared_ptr<DebugGrpcChannel> debug_grpc_channel;
   {
     mutex_lock l(streams_mu);
-    if (stream_channels.find(grpc_stream_url) == stream_channels.end()) {
+    std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
+        stream_channels = GetStreamChannels();
+    if (stream_channels->find(grpc_stream_url) == stream_channels->end()) {
       debug_grpc_channel.reset(new DebugGrpcChannel(server_stream_addr));
+      TF_RETURN_IF_ERROR(
+          debug_grpc_channel->Connect(channel_connection_timeout_micros));
 
-      if (!debug_grpc_channel->is_channel_ready()) {
-        return errors::FailedPrecondition(
-            strings::StrCat("Channel at the following gRPC stream URL is ",
-                            "not ready: ", grpc_stream_url));
-      }
-
-      stream_channels[grpc_stream_url] = debug_grpc_channel;
+      (*stream_channels)[grpc_stream_url] = debug_grpc_channel;
+      CreateEmptyEnabledSet(grpc_stream_url);
     } else {
-      debug_grpc_channel = stream_channels[grpc_stream_url];
+      debug_grpc_channel = (*stream_channels)[grpc_stream_url];
     }
   }
 
   bool write_ok = debug_grpc_channel->WriteEvent(event_proto);
   if (!write_ok) {
     return errors::Cancelled(strings::StrCat("Write event to stream URL ",
-                                             grpc_stream_url, "failed."));
+                                             grpc_stream_url, " failed."));
   }
 
   return Status::OK();
 }
 
+// static
+bool DebugGrpcIO::IsGateOpen(const string& watch_key,
+                             const string& grpc_debug_url) {
+  std::unordered_map<string, std::unordered_set<string>>* enabled_watch_keys =
+      GetEnabledWatchKeys();
+  if (enabled_watch_keys->find(grpc_debug_url) == enabled_watch_keys->end()) {
+    return false;
+  } else {
+    const auto& url_enabled = (*enabled_watch_keys)[grpc_debug_url];
+    return url_enabled.find(watch_key) != url_enabled.end();
+  }
+}
+
+// static
 Status DebugGrpcIO::CloseGrpcStream(const string& grpc_stream_url) {
   mutex_lock l(streams_mu);
 
-  if (stream_channels.find(grpc_stream_url) != stream_channels.end()) {
+  std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
+      stream_channels = GetStreamChannels();
+  if (stream_channels->find(grpc_stream_url) != stream_channels->end()) {
     // Stream of the specified address exists. Close it and remove it from
     // record.
     Status s;
-    s = stream_channels[grpc_stream_url]->Close();
-    stream_channels.erase(grpc_stream_url);
+    s = (*stream_channels)[grpc_stream_url]->ReceiveServerRepliesAndClose();
+    (*stream_channels).erase(grpc_stream_url);
     return s;
   } else {
     // Stream of the specified address does not exist. No action.
     return Status::OK();
   }
 }
+
+// static
+std::unordered_map<string, std::unordered_set<string>>*
+DebugGrpcIO::GetEnabledWatchKeys() {
+  static std::unordered_map<string, std::unordered_set<string>>*
+      enabled_watch_keys =
+          new std::unordered_map<string, std::unordered_set<string>>();
+  return enabled_watch_keys;
+}
+
+// static
+void DebugGrpcIO::EnableWatchKey(const string& grpc_debug_url,
+                                 const string& watch_key) {
+  std::unordered_map<string, std::unordered_set<string>>* enabled_watch_keys =
+      GetEnabledWatchKeys();
+  if (enabled_watch_keys->find(grpc_debug_url) == enabled_watch_keys->end()) {
+    CreateEmptyEnabledSet(grpc_debug_url);
+  }
+  (*enabled_watch_keys)[grpc_debug_url].insert(watch_key);
+}
+
+// static
+void DebugGrpcIO::DisableWatchKey(const string& grpc_debug_url,
+                                  const string& watch_key) {
+  std::unordered_map<string, std::unordered_set<string>>* enabled_watch_keys =
+      GetEnabledWatchKeys();
+  if (enabled_watch_keys->find(grpc_debug_url) == enabled_watch_keys->end()) {
+    LOG(WARNING) << "Attempt to disable a watch key for an unregistered gRPC "
+                 << "debug URL: " << grpc_debug_url;
+  } else {
+    std::unordered_set<string>& url_enabled =
+        (*enabled_watch_keys)[grpc_debug_url];
+    if (url_enabled.find(watch_key) == url_enabled.end()) {
+      LOG(WARNING) << "Attempt to disable a watch key that is not currently "
+                   << "enabled at " << grpc_debug_url << ": " << watch_key;
+    } else {
+      url_enabled.erase(watch_key);
+    }
+  }
+}
+
+// static
+void DebugGrpcIO::ClearEnabledWatchKeys() { GetEnabledWatchKeys()->clear(); }
+
+// static
+void DebugGrpcIO::CreateEmptyEnabledSet(const string& grpc_debug_url) {
+  std::unordered_map<string, std::unordered_set<string>>* enabled_watch_keys =
+      GetEnabledWatchKeys();
+  if (enabled_watch_keys->find(grpc_debug_url) == enabled_watch_keys->end()) {
+    std::unordered_set<string> empty_watch_keys;
+    (*enabled_watch_keys)[grpc_debug_url] = empty_watch_keys;
+  }
+}
+
 #endif  // #if defined(PLATFORM_GOOGLE)
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index cd4462b6aefc6168044bd808108a7b1bb376188b..e6118cb15da3a61d10d1a08e460878d785a6a548 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -30,6 +30,16 @@ namespace tensorflow {
 
 Status ReadEventFromFile(const string& dump_file_path, Event* event);
 
+struct DebugWatchAndURLSpec {
+  DebugWatchAndURLSpec(const string& watch_key, const string& url,
+                       const bool gated_grpc)
+      : watch_key(watch_key), url(url), gated_grpc(gated_grpc) {}
+
+  const string watch_key;
+  const string url;
+  const bool gated_grpc;
+};
+
 class DebugIO {
  public:
   static Status PublishDebugMetadata(
@@ -51,6 +61,14 @@ class DebugIO {
   //   wall_time_us: Time stamp for the Tensor. Unit: microseconds (us).
   //   debug_urls: An array of debug target URLs, e.g.,
   //     "file:///foo/tfdbg_dump", "grpc://localhost:11011"
+  //   gated_grpc: Whether this call is subject to gRPC gating.
+  static Status PublishDebugTensor(const string& tensor_name,
+                                   const string& debug_op, const Tensor& tensor,
+                                   const uint64 wall_time_us,
+                                   const gtl::ArraySlice<string>& debug_urls,
+                                   const bool gated_grpc);
+
+  // Convenience overload of the method above for no gated_grpc by default.
   static Status PublishDebugTensor(const string& tensor_name,
                                    const string& debug_op, const Tensor& tensor,
                                    const uint64 wall_time_us,
@@ -61,9 +79,54 @@ class DebugIO {
   // Args:
   //   graph: The graph to be published.
   //   debug_urls: The set of debug URLs to publish the graph to.
-  static Status PublishGraph(const Graph& graph,
+  static Status PublishGraph(const Graph& graph, const string& device_name,
                              const std::unordered_set<string>& debug_urls);
 
+  // Determine whether a copy node needs to perform deep-copy of input tensor.
+  //
+  // The input arguments contain sufficient information about the attached
+  // downstream debug ops for this method to determine whether all the said
+  // ops are disabled given the current status of the gRPC gating.
+  //
+  // Args:
+  //   specs: A vector of DebugWatchAndURLSpec carrying information about the
+  //     debug ops attached to the Copy node, their debug URLs and whether
+  //     they have the attribute value gated_grpc == True.
+  //
+  // Returns:
+  //   Whether any of the attached downstream debug ops is enabled given the
+  //   current status of the gRPC gating.
+  static bool IsCopyNodeGateOpen(
+      const std::vector<DebugWatchAndURLSpec>& specs);
+
+  // Determine whether a debug node needs to proceed given the current gRPC
+  // gating status.
+  //
+  // Args:
+  //   watch_key: debug tensor watch key, in the format of
+  //     tensor_name:debug_op, e.g., "Weights:0:DebugIdentity".
+  //   debug_urls: the debug URLs of the debug node.
+  //
+  // Returns:
+  //   Whether this debug op should proceed.
+  static bool IsDebugNodeGateOpen(const string& watch_key,
+                                  const std::vector<string>& debug_urls);
+
+  // Determine whether debug information should be sent through a grpc://
+  // debug URL given the current gRPC gating status.
+  //
+  // Args:
+  //   watch_key: debug tensor watch key, in the format of
+  //     tensor_name:debug_op, e.g., "Weights:0:DebugIdentity".
+  //   debug_url: the debug URL, e.g., "grpc://localhost:3333",
+  //     "file:///tmp/tfdbg_1".
+  //
+  // Returns:
+  //   Whether the sending of debug data to the debug_url should
+  //     proceed.
+  static bool IsDebugURLGateOpen(const string& watch_key,
+                                 const string& debug_url);
+
   static Status CloseDebugURL(const string& debug_url);
 
   static const char* const kFileURLScheme;
@@ -152,8 +215,16 @@ class DebugGrpcChannel {
 
   virtual ~DebugGrpcChannel() {}
 
-  // Query whether the gRPC channel is ready for use.
-  bool is_channel_ready();
+  // Attempt to establish connection with server.
+  //
+  // Args:
+  //   timeout_micros: Timeout (in microseconds) for the attempt to establish
+  //     the connection.
+  //
+  // Returns:
+  //   OK Status iff connection is successfully established before timeout,
+  //   otherwise return an error Status.
+  Status Connect(const int64 timeout_micros);
 
   // Write an Event proto to the debug gRPC stream.
   //
@@ -166,10 +237,13 @@ class DebugGrpcChannel {
   //   True iff the write is successful.
   bool WriteEvent(const Event& event);
 
-  // Close the stream and the channel.
-  Status Close();
+  // Receive EventReplies from server (if any) and close the stream and the
+  // channel.
+  Status ReceiveServerRepliesAndClose();
 
  private:
+  string server_stream_addr_;
+  string url_;
   ::grpc::ClientContext ctx_;
   std::shared_ptr<::grpc::Channel> channel_;
   std::unique_ptr<EventListener::Stub> stub_;
@@ -182,12 +256,10 @@ class DebugGrpcChannel {
 class DebugGrpcIO {
  public:
   // Send a tensor through a debug gRPC stream.
-  static Status SendTensorThroughGrpcStream(const string& node_name,
-                                            const int32 output_slot,
-                                            const string& debug_op,
-                                            const Tensor& tensor,
-                                            const uint64 wall_time_us,
-                                            const string& grpc_stream_url);
+  static Status SendTensorThroughGrpcStream(
+      const string& node_name, const int32 output_slot, const string& debug_op,
+      const Tensor& tensor, const uint64 wall_time_us,
+      const string& grpc_stream_url, const bool gated);
 
   // Send an Event proto through a debug gRPC stream.
   // Thread-safety: Safe with respect to other calls to the same method and
@@ -195,15 +267,54 @@ class DebugGrpcIO {
   static Status SendEventProtoThroughGrpcStream(const Event& event_proto,
                                                 const string& grpc_stream_url);
 
+  // Check whether a debug watch key is allowed to send data to a given grpc://
+  // debug URL given the current gating status.
+  //
+  // Args:
+  //   watch_key: debug tensor watch key, in the format of
+  //     tensor_name:debug_op, e.g., "Weights:0:DebugIdentity".
+  //   grpc_debug_url: the debug URL, e.g., "grpc://localhost:3333",
+  //
+  // Returns:
+  //   Whether the sending of debug data to grpc_debug_url should
+  //     proceed.
+  static bool IsGateOpen(const string& watch_key, const string& grpc_debug_url);
+
   // Close a gRPC stream to the given address, if it exists.
   // Thread-safety: Safe with respect to other calls to the same method and
   // calls to SendTensorThroughGrpcStream().
   static Status CloseGrpcStream(const string& grpc_stream_url);
 
+  // Enable a debug watch key at a grpc:// debug URL.
+  static void EnableWatchKey(const string& grpc_debug_url,
+                             const string& watch_key);
+
+  // Disable a debug watch key at a grpc:// debug URL.
+  static void DisableWatchKey(const string& grpc_debug_url,
+                              const string& watch_key);
+
  private:
+  // Returns a global map from grpc debug URLs to the corresponding
+  // DebugGrpcChannels.
+  static std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>*
+  GetStreamChannels();
+
+  // Returns a global map from grpc debug URLs to the enabled gated debug nodes.
+  // The keys are grpc:// URLs of the debug servers, e.g., "grpc://foo:3333".
+  // Each value element of the value has the format
+  // <node_name>:<output_slot>:<debug_op>", e.g.,
+  // "Weights_1:0:DebugNumericSummary".
+  static std::unordered_map<string, std::unordered_set<string>>*
+  GetEnabledWatchKeys();
+
+  static void ClearEnabledWatchKeys();
+  static void CreateEmptyEnabledSet(const string& grpc_debug_url);
+
   static mutex streams_mu;
-  static std::unordered_map<string, std::shared_ptr<DebugGrpcChannel>>
-      stream_channels GUARDED_BY(streams_mu);
+  static int64 channel_connection_timeout_micros;
+
+  friend class GrpcDebugTest;
+  friend class DebugNumericSummaryOpTest;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/debug/debug_service.proto b/tensorflow/core/debug/debug_service.proto
index 967c99c108f9c46b5f53b78f9c78cd52e1f0a5d5..63d6668292ae752a6bca898abf25cc33a1872332 100644
--- a/tensorflow/core/debug/debug_service.proto
+++ b/tensorflow/core/debug/debug_service.proto
@@ -20,9 +20,22 @@ package tensorflow;
 import "tensorflow/core/util/event.proto";
 
 // Reply message from EventListener to the client, i.e., to the source of the
-// Event protocal buffers, e.g., debug ops inserted by a debugged runtime to a
+// Event protocol buffers, e.g., debug ops inserted by a debugged runtime to a
 // TensorFlow graph being executed.
 message EventReply {
+  message DebugOpStateChange {
+    enum Change {
+      DISABLE = 0;
+      ENABLE = 1;
+    }
+
+    Change change = 1;
+    string node_name = 2;
+    int32 output_slot = 3;
+    string debug_op = 4;
+  }
+
+  repeated DebugOpStateChange debug_op_state_changes = 1;
 }
 
 // EventListener: Receives Event protos, e.g., from debugged TensorFlow
diff --git a/tensorflow/core/debug/debug_test_server_main.cc b/tensorflow/core/debug/debug_test_server_main.cc
deleted file mode 100644
index 578563be0931b8dea8482fac65605300f0a9b71f..0000000000000000000000000000000000000000
--- a/tensorflow/core/debug/debug_test_server_main.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "grpc++/grpc++.h"
-#include "tensorflow/core/debug/debug_grpc_testlib.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-
-// Usage: debug_test_server_main <port> <dump_root>
-int main(int argc, char* argv[]) {
-  if (argc != 3) {
-    std::cerr << "Usage: debug_test_server_main <port> <dump_root>"
-              << std::endl;
-    return 1;
-  }
-
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  int port = 0;
-  tensorflow::strings::safe_strto32(argv[1], &port);
-  std::string test_server_addr = tensorflow::strings::StrCat("0.0.0.0:", port);
-
-  tensorflow::test::TestEventListenerImpl debug_test_server(argv[2]);
-
-  ::grpc::ServerBuilder builder;
-  builder.AddListeningPort(test_server_addr,
-                           ::grpc::InsecureServerCredentials());
-  builder.RegisterService(&debug_test_server);
-  std::unique_ptr<::grpc::Server> test_server = builder.BuildAndStart();
-
-  test_server->Wait();
-
-  return 0;
-}
diff --git a/tensorflow/core/debug/debugger_state_impl.cc b/tensorflow/core/debug/debugger_state_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88e3c882e6611c0cfb91417794b4fd4a64f6e138
--- /dev/null
+++ b/tensorflow/core/debug/debugger_state_impl.cc
@@ -0,0 +1,67 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/debug/debugger_state_impl.h"
+
+#include "tensorflow/core/debug/debug_graph_utils.h"
+#include "tensorflow/core/debug/debug_io_utils.h"
+
+namespace tensorflow {
+
+DebuggerState::DebuggerState(const DebugOptions& debug_options) {
+  for (const DebugTensorWatch& watch :
+       debug_options.debug_tensor_watch_opts()) {
+    for (const string& url : watch.debug_urls()) {
+      debug_urls_.insert(url);
+    }
+  }
+}
+
+DebuggerState::~DebuggerState() {
+  for (const string& debug_url : debug_urls_) {
+    DebugIO::CloseDebugURL(debug_url).IgnoreError();
+  }
+}
+
+Status DebuggerState::PublishDebugMetadata(
+    const int64 global_step, const int64 session_run_count,
+    const int64 executor_step_count, const std::vector<string>& input_names,
+    const std::vector<string>& output_names,
+    const std::vector<string>& target_names) {
+  return DebugIO::PublishDebugMetadata(global_step, session_run_count,
+                                       executor_step_count, input_names,
+                                       output_names, target_names, debug_urls_);
+}
+
+Status DebugGraphDecorator::DecorateGraph(Graph* graph, Device* device) {
+  DebugNodeInserter::DeparallelizeWhileLoops(graph, device);
+  return DebugNodeInserter::InsertNodes(
+      debug_options_.debug_tensor_watch_opts(), graph, device);
+}
+
+Status DebugGraphDecorator::PublishGraph(const Graph& graph,
+                                         const string& device_name) {
+  std::unordered_set<string> debug_urls;
+  for (const DebugTensorWatch& watch :
+       debug_options_.debug_tensor_watch_opts()) {
+    for (const string& url : watch.debug_urls()) {
+      debug_urls.insert(url);
+    }
+  }
+
+  return DebugIO::PublishGraph(graph, device_name, debug_urls);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/debug/debugger_state_impl.h b/tensorflow/core/debug/debugger_state_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..52e2663d0837c67d4cd60b24a3b8db32aeb04daa
--- /dev/null
+++ b/tensorflow/core/debug/debugger_state_impl.h
@@ -0,0 +1,61 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_DEBUGGER_STATE_IMPL_H_
+#define TENSORFLOW_DEBUGGER_STATE_IMPL_H_
+
+#include "tensorflow/core/common_runtime/debugger_state_interface.h"
+
+#include <unordered_set>
+#include <vector>
+
+namespace tensorflow {
+
+class DebuggerState : public DebuggerStateInterface {
+ public:
+  DebuggerState(const DebugOptions& debug_options);
+  virtual ~DebuggerState();
+
+  // Publish metadata about the debugged Session::Run() call.
+  //
+  // See the doc string of DebuggerStateInterface::PublishDebugMetadata() for
+  // details.
+  Status PublishDebugMetadata(const int64 global_step,
+                              const int64 session_run_count,
+                              const int64 executor_step_count,
+                              const std::vector<string>& input_names,
+                              const std::vector<string>& output_names,
+                              const std::vector<string>& target_names) override;
+
+ private:
+  std::unordered_set<string> debug_urls_;
+};
+
+class DebugGraphDecorator : public DebugGraphDecoratorInterface {
+ public:
+  DebugGraphDecorator(const DebugOptions& debug_options)
+      : debug_options_(debug_options) {}
+  virtual ~DebugGraphDecorator() {}
+
+  Status DecorateGraph(Graph* graph, Device* device) override;
+  Status PublishGraph(const Graph& graph, const string& device_name) override;
+
+ private:
+  DebugOptions debug_options_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_DEBUGGER_STATE_IMPL_H_
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c68729410cea69d36015bb86ee7558af1e3d520
--- /dev/null
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -0,0 +1,288 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_session.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/debug/debug_io_utils.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/debug.pb.h"
+#include "tensorflow/core/protobuf/master.pb.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+static SessionOptions Devices(int num_cpus, int num_gpus) {
+  SessionOptions result;
+  (*result.config.mutable_device_count())["CPU"] = num_cpus;
+  (*result.config.mutable_device_count())["GPU"] = num_gpus;
+  return result;
+}
+
+void CreateGraphDef(GraphDef* graph_def, string node_names[3]) {
+  Graph graph(OpRegistry::Global());
+
+  Tensor a_tensor(DT_FLOAT, TensorShape({1, 2}));
+  test::FillValues<float>(&a_tensor, {1.0, 2.0});
+  Node* a = test::graph::Constant(&graph, a_tensor);
+  node_names[0] = a->name();
+
+  Tensor b_tensor(DT_FLOAT, TensorShape({2, 1}));
+  test::FillValues<float>(&b_tensor, {2.0, 1.0});
+  Node* b = test::graph::Constant(&graph, b_tensor);
+  node_names[1] = b->name();
+
+  // c = a * b
+  Node* c = test::graph::Matmul(&graph, a, b, false, false);
+  node_names[2] = c->name();
+
+  test::graph::ToGraphDef(&graph, graph_def);
+}
+
+// Asserts that "val" is a single float tensor. The only float is
+// "expected_val".
+static void IsSingleFloatValue(const Tensor& val, float expected_val) {
+  ASSERT_EQ(val.dtype(), DT_FLOAT);
+  ASSERT_EQ(val.NumElements(), 1);
+  ASSERT_EQ(val.flat<float>()(0), expected_val);
+}
+
+static SessionOptions Options(const string& target, int placement_period) {
+  SessionOptions options;
+  // NOTE(mrry): GrpcSession requires a grpc:// scheme prefix in the target
+  // string.
+  options.target = strings::StrCat("grpc://", target);
+  options.config.set_placement_period(placement_period);
+  options.config.mutable_graph_options()
+      ->mutable_optimizer_options()
+      ->set_opt_level(OptimizerOptions::L0);
+  return options;
+}
+
+static Session* NewRemote(const SessionOptions& options) {
+  return CHECK_NOTNULL(NewSession(options));
+}
+
+class GrpcSessionDebugTest : public ::testing::Test {
+ protected:
+  void SetUp() override { CreateDumpDir(); }
+
+  void TearDown() override { DeleteDumpDir(); }
+
+  void DeleteDumpDir() {
+    if (Env::Default()->IsDirectory(dump_dir_).ok()) {
+      int64 undeleted_files = 0;
+      int64 undeleted_dirs = 0;
+      ASSERT_TRUE(
+          Env::Default()
+              ->DeleteRecursively(dump_dir_, &undeleted_files, &undeleted_dirs)
+              .ok());
+      ASSERT_EQ(0, undeleted_files);
+      ASSERT_EQ(0, undeleted_dirs);
+    }
+  }
+
+  const string GetDebugURL() { return debug_url_; }
+
+  void LoadTensorDumps(const string& subdir, std::vector<Tensor>* tensors) {
+    const string dirpath = io::JoinPath(dump_dir_, subdir);
+    if (!(Env::Default()->IsDirectory(dirpath).ok())) {
+      return;
+    }
+
+    std::vector<string> filenames;
+    TF_ASSERT_OK(Env::Default()->GetChildren(dirpath, &filenames));
+
+    for (const string& filename : filenames) {
+      Event event;
+      TF_ASSERT_OK(ReadEventFromFile(io::JoinPath(dirpath, filename), &event));
+      if (event.summary().value().size() == 1) {
+        Tensor tensor;
+        ASSERT_TRUE(tensor.FromProto(event.summary().value(0).tensor()));
+        tensors->push_back(tensor);
+      }
+    }
+  }
+
+ private:
+  void CreateDumpDir() {
+    char dir_template[] = "/tmp/tfdbg_grpc_sessions_XXXXXX";
+    dump_dir_ = mkdtemp(dir_template);
+    debug_url_ = strings::StrCat("file://", dump_dir_);
+  }
+
+  string dump_dir_;
+  string debug_url_;
+};
+
+TEST_F(GrpcSessionDebugTest, FileDebugURL) {
+  GraphDef graph;
+  string node_names[3];
+  CreateGraphDef(&graph, node_names);
+
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1)));
+  ASSERT_TRUE(session != nullptr);
+  TF_CHECK_OK(session->Create(graph));
+
+  // Iteration 0: No watch.
+  // Iterations 1 and 2: Watch one Tensor.
+  // Iterations 3 and 4: Watch two Tensors.
+  // Iteration 5: No watch.
+  for (size_t i = 0; i < 6; ++i) {
+    RunOptions options;
+    if (i >= 1 && i < 5) {
+      DebugOptions* debug_options = options.mutable_debug_options();
+      DebugTensorWatch* watch = debug_options->add_debug_tensor_watch_opts();
+      watch->set_node_name(node_names[0]);
+      watch->set_output_slot(0);
+      watch->add_debug_ops("DebugIdentity");
+      watch->add_debug_urls(GetDebugURL());
+
+      if (i >= 3) {
+        watch = debug_options->add_debug_tensor_watch_opts();
+        watch->set_node_name(node_names[1]);
+        watch->set_output_slot(0);
+        watch->add_debug_ops("DebugIdentity");
+        watch->add_debug_urls(GetDebugURL());
+      }
+    }
+
+    RunMetadata metadata;
+    std::vector<Tensor> outputs;
+    TF_CHECK_OK(
+        session->Run(options, {}, {node_names[2]}, {}, &outputs, &metadata));
+    ASSERT_EQ(1, outputs.size());
+    IsSingleFloatValue(outputs[0], 4.0);
+
+    std::vector<Tensor> dumped_tensors;
+    LoadTensorDumps("n", &dumped_tensors);
+
+    if (i == 0 || i == 5) {
+      ASSERT_EQ(0, dumped_tensors.size());
+    } else {
+      if (i == 1 || i == 2) {
+        ASSERT_EQ(1, dumped_tensors.size());
+        ASSERT_EQ(TensorShape({1, 2}), dumped_tensors[0].shape());
+        ASSERT_EQ(1.0, dumped_tensors[0].flat<float>()(0));
+        ASSERT_EQ(2.0, dumped_tensors[0].flat<float>()(1));
+      } else {
+        ASSERT_EQ(2, dumped_tensors.size());
+      }
+      DeleteDumpDir();
+    }
+  }
+  TF_CHECK_OK(session->Close());
+}
+
+void SetDevice(GraphDef* graph, const string& name, const string& dev) {
+  for (size_t i = 0; i < graph->node_size(); ++i) {
+    if (graph->node(i).name() == name) {
+      graph->mutable_node(i)->set_device(dev);
+      return;
+    }
+  }
+  LOG(FATAL) << "Name '" << name << "' not found.";
+}
+
+TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 1), 2, &cluster));
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1000)));
+  ASSERT_TRUE(session != nullptr);
+
+  // b = a
+  Graph graph(OpRegistry::Global());
+  Tensor a_tensor(DT_STRING, TensorShape({2, 2}));
+  for (size_t i = 0; i < 4; ++i) {
+    a_tensor.flat<string>()(i) = "hello, world";
+  }
+  Node* a = test::graph::Constant(&graph, a_tensor);
+  Node* b = test::graph::Identity(&graph, a);
+
+  GraphDef def;
+  test::graph::ToGraphDef(&graph, &def);
+
+  // In this test, we force each node (a, b) on every possible device.
+  // We test all possible cases.
+  for (const auto& a_dev : cluster->devices()) {
+    for (const auto& b_dev : cluster->devices()) {
+      LOG(INFO) << "a: " << a_dev.name() << " b: " << b_dev.name();
+      SetDevice(&def, a->name(), a_dev.name());
+      SetDevice(&def, b->name(), b_dev.name());
+
+      Status s = session->Create(def);
+      if (s.ok()) {
+        std::vector<Tensor> outputs;
+
+        RunOptions options;
+        DebugOptions* debug_options = options.mutable_debug_options();
+        DebugTensorWatch* watch = debug_options->add_debug_tensor_watch_opts();
+        watch->set_node_name(a->name());
+        watch->set_output_slot(0);
+        watch->add_debug_ops("DebugIdentity");
+        watch->add_debug_urls(GetDebugURL());
+
+        RunMetadata metadata;
+        TF_CHECK_OK(
+            session->Run(options, {}, {b->name()}, {}, &outputs, &metadata));
+        ASSERT_EQ(1, outputs.size());
+        ASSERT_EQ(outputs[0].dtype(), DT_STRING);
+        ASSERT_EQ(outputs[0].NumElements(), 4);
+        for (size_t i = 0; i < outputs[0].NumElements(); ++i) {
+          EXPECT_EQ(outputs[0].flat<string>()(i), "hello, world");
+        }
+        TF_CHECK_OK(session->Close());
+
+        std::vector<Tensor> dumped_tensors;
+        LoadTensorDumps("n", &dumped_tensors);
+        ASSERT_EQ(1, dumped_tensors.size());
+        ASSERT_EQ(TensorShape({2, 2}), dumped_tensors[0].shape());
+        for (size_t i = 0; i < 4; ++i) {
+          ASSERT_EQ("hello, world", dumped_tensors[0].flat<string>()(i));
+        }
+
+        DeleteDumpDir();
+      } else {
+        LOG(ERROR) << "Error: " << s;
+        ASSERT_TRUE((a_dev.device_type() == DEVICE_GPU) ||
+                    (b_dev.device_type() == DEVICE_GPU));
+        ASSERT_FALSE(s.ok());
+      }
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 0c2d2b5d5d966be6beb9eb367386a5dcc26cfcf5..e8aabf72dcb1259343442b483ed8b4956d822935 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -44,6 +44,28 @@ package(default_visibility = [
     "//tensorflow:internal",
 ])
 
+cc_library(
+    name = "partial_run_mgr",
+    srcs = ["partial_run_mgr.cc"],
+    hdrs = ["partial_run_mgr.h"],
+    deps = [
+        ":worker_interface",
+        "//tensorflow/core:framework",
+    ],
+)
+
+cc_test(
+    name = "partial_run_mgr_test",
+    size = "small",
+    srcs = ["partial_run_mgr_test.cc"],
+    deps = [
+        ":partial_run_mgr",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "message_wrappers",
     srcs = ["message_wrappers.cc"],
@@ -77,7 +99,6 @@ cc_library(
     ],
     deps = [
         ":graph_mgr",
-        ":rendezvous_mgr_interface",
         ":worker_cache",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
@@ -92,9 +113,9 @@ cc_library(
     deps = [
         ":graph_mgr",
         ":worker_session",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
     ],
 )
 
@@ -142,6 +163,7 @@ cc_library(
     ],
     deps = [
         ":graph_mgr",
+        ":partial_run_mgr",
         ":rendezvous_mgr_interface",
         ":session_mgr",
         ":worker_interface",
@@ -237,6 +259,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
     ],
 )
@@ -259,6 +282,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/debug:debug_graph_utils",
     ],
 )
 
@@ -329,6 +353,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
+        "//tensorflow/core/debug",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/README.md b/tensorflow/core/distributed_runtime/README.md
index ab1771e29426605eb225e7a1d5df32f82c806649..d22cd2a45bc68ee8ff5015327f5e56c24879b8f9 100644
--- a/tensorflow/core/distributed_runtime/README.md
+++ b/tensorflow/core/distributed_runtime/README.md
@@ -5,6 +5,4 @@ distributed TensorFlow runtime, using [gRPC](http://grpc.io) for inter-process
 communication.
 
 To learn how to use the distributed runtime to create a TensorFlow cluster,
-see the "Distributed TensorFlow" How To, which is available [in this
-repository](../../g3doc/how_tos/distributed/index.md), and will be available
-on the TensorFlow website after the next version is released.
+see the [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) How-To.
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index 5863727f19b6f5410c10f699d84389ce95d221f1..e68aea46ecd436d557d8394c3544684965a81878 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -35,9 +35,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-BaseRendezvousMgr::BaseRendezvousMgr(const WorkerEnv* worker_env,
-                                     const string& worker_name)
-    : worker_env_(worker_env), worker_name_(worker_name) {}
+BaseRendezvousMgr::BaseRendezvousMgr(const WorkerEnv* worker_env)
+    : worker_env_(worker_env) {}
 
 BaseRendezvousMgr::~BaseRendezvousMgr() {
   for (auto& p : table_) {
@@ -47,7 +46,7 @@ BaseRendezvousMgr::~BaseRendezvousMgr() {
   }
 }
 
-Rendezvous* BaseRendezvousMgr::Find(int64 step_id) {
+RemoteRendezvous* BaseRendezvousMgr::Find(int64 step_id) {
   return FindOrCreate(step_id);
 }
 
@@ -55,7 +54,7 @@ BaseRemoteRendezvous* BaseRendezvousMgr::FindOrCreate(int64 step_id) {
   mutex_lock l(mu_);
   Table::iterator iter = table_.find(step_id);
   if (iter == table_.end()) {
-    auto rr = Create(step_id, worker_env_, worker_name_);
+    auto rr = Create(step_id, worker_env_);
     iter = table_.insert({step_id, rr}).first;
   }
   iter->second->Ref();
@@ -128,14 +127,12 @@ void BaseRendezvousMgr::CleanupAll() {
   }
 }
 
-BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env,
-                                           const string& worker_name,
-                                           int64 step_id,
+BaseRemoteRendezvous::BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id,
                                            bool tolerate_dup_recv)
     : env_(env),
-      worker_name_(worker_name),
       step_id_(step_id),
-      local_(NewLocalRendezvous(tolerate_dup_recv)) {}
+      local_(NewLocalRendezvous(tolerate_dup_recv)),
+      session_(nullptr) {}
 
 BaseRemoteRendezvous::~BaseRemoteRendezvous() {
   CHECK(active_.empty());
@@ -150,6 +147,41 @@ static bool IsLocalDevice(const string& worker_name,
   return device_name.starts_with(worker_name);
 }
 
+Status BaseRemoteRendezvous::Initialize(WorkerSession* session) {
+  CHECK_NE(session, nullptr) << "session must not be null!";
+  std::vector<DeferredCall> deferred_calls;
+  {
+    mutex_lock l(mu_);
+    if (session_ != nullptr) {
+      if (session_->worker_name == session->worker_name) {
+        LOG(INFO) << "Skipping rendezvous re-initialization.";
+        return Status::OK();
+      }
+      Status s = errors::Internal(
+          "Double init! Worker names would have changed from: ",
+          session_->worker_name, " -> ", session->worker_name);
+      LOG(WARNING) << s;
+      return s;
+    }
+    session_ = session;
+    std::swap(deferred_calls, deferred_calls_);
+  }
+  for (DeferredCall& call : deferred_calls) {
+    RecvLocalAsyncInternal(call.parsed, std::move(call.done));
+  }
+  return Status::OK();
+}
+
+WorkerSession* BaseRemoteRendezvous::session() {
+  mutex_lock l(mu_);
+  return session_;
+}
+
+bool BaseRemoteRendezvous::is_initialized() {
+  mutex_lock l(mu_);
+  return is_initialized_locked();
+}
+
 Status BaseRemoteRendezvous::Send(const Rendezvous::ParsedKey& parsed,
                                   const Rendezvous::Args& args,
                                   const Tensor& val, const bool is_dead) {
@@ -157,10 +189,12 @@ Status BaseRemoteRendezvous::Send(const Rendezvous::ParsedKey& parsed,
   {
     mutex_lock l(mu_);
     if (!status_.ok()) return status_;
-  }
-  if (!IsLocalDevice(worker_name_, parsed.src_device)) {
-    return errors::InvalidArgument("Invalid rendezvous key (src): ",
-                                   parsed.FullKey(), " @ ", worker_name_);
+    DCHECK(is_initialized_locked());
+    if (!IsLocalDevice(session_->worker_name, parsed.src_device)) {
+      return errors::InvalidArgument(
+          "Invalid rendezvous key (src): ", parsed.FullKey(), " @ ",
+          session_->worker_name);
+    }
   }
   // Buffers "val" and "device_context" in local_.
   return local_->Send(parsed, args, val, is_dead);
@@ -168,17 +202,24 @@ Status BaseRemoteRendezvous::Send(const Rendezvous::ParsedKey& parsed,
 
 Status BaseRemoteRendezvous::ValidateDevices(const ParsedKey& parsed,
                                              bool is_src) {
+  // Cache session pointer to avoid repeatedly taking & releasing the lock
+  // (e.g. calling session())
+  WorkerSession* sess = nullptr;
   {
     mutex_lock l(mu_);
     if (!status_.ok()) return status_;
+    if (!is_initialized_locked()) {
+      return errors::Internal("ValidateDevices called before initialization.");
+    }
+    sess = session_;
   }
-  if (is_src && !IsLocalDevice(worker_name_, parsed.src_device)) {
+  if (is_src && !IsLocalDevice(sess->worker_name, parsed.src_device)) {
     return errors::InvalidArgument("Invalid rendezvous key (src): ",
-                                   parsed.FullKey(), " @ ", worker_name_);
+                                   parsed.FullKey(), " @ ", sess->worker_name);
   }
-  if (!is_src && !IsLocalDevice(worker_name_, parsed.dst_device)) {
+  if (!is_src && !IsLocalDevice(sess->worker_name, parsed.dst_device)) {
     return errors::InvalidArgument("Invalid rendezvous key (dst): ",
-                                   parsed.FullKey(), " @ ", worker_name_);
+                                   parsed.FullKey(), " @ ", sess->worker_name);
   }
   return Status::OK();
 }
@@ -244,6 +285,7 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
                                      const Rendezvous::Args& recv_args,
                                      DoneCallback done) {
   VLOG(1) << "RemoteRendezvous Recv " << this << " " << parsed.FullKey();
+  CHECK(is_initialized()) << "RecvAsync called when uninitialized.";
   Status s = ValidateDevices(parsed, false /*!is_src*/);
   if (!s.ok()) {
     done(s, Args(), recv_args, Tensor(), false);
@@ -280,6 +322,26 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
 
 void BaseRemoteRendezvous::RecvLocalAsync(const ParsedKey& parsed,
                                           DoneCallback done) {
+  {
+    mutex_lock l(mu_);
+    if (!is_initialized_locked()) {
+      // RecvLocalAsync can be called (due to an incoming RecvTensor RPC from a
+      // remote worker) before the RunStep (or PartialRunStep) RPC from the
+      // master arrives. RecvLocalAsync thus buffers the arguments until after
+      // the RemoteRendezvous is Initialize()'d, when it completes the
+      // rendezvous logic. At some point after Initialize() is called, a Tensor
+      // is produced locally that will then be sent in response to the incoming
+      // RPC.
+      DeferredCall call(parsed, std::move(done));
+      deferred_calls_.push_back(call);
+      return;
+    }
+  }
+  RecvLocalAsyncInternal(parsed, std::move(done));
+}
+
+void BaseRemoteRendezvous::RecvLocalAsyncInternal(const ParsedKey& parsed,
+                                                  DoneCallback done) {
   Status s = ValidateDevices(parsed, true /* is_src */);
   if (!s.ok()) {
     done(s, Args(), Args(), Tensor(), false);
@@ -318,4 +380,8 @@ void BaseRemoteRendezvous::DeregisterCall(BaseRecvTensorCall* call) {
   active_.erase(call);
 }
 
+BaseRemoteRendezvous::DeferredCall::DeferredCall(const ParsedKey& parsed,
+                                                 DoneCallback done)
+    : parsed(parsed), done(std::move(done)) {}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
index 447a75913d64bb58aea51de0c571cab4897dc448..b252f45fe96354f8e2a91a5aa3a05f1a937e3939 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
@@ -59,15 +59,17 @@ class BaseRecvTensorCall;
 // RendezvousMgr must have keys generated by Rendezvous::CreateKey().
 class BaseRendezvousMgr : public RendezvousMgrInterface {
  public:
-  explicit BaseRendezvousMgr(const WorkerEnv* worker_env,
-                             const string& worker_name);
+  explicit BaseRendezvousMgr(const WorkerEnv* worker_env);
 
   ~BaseRendezvousMgr() override;
 
   // Returns Rendezvous supporting send and recv among workers in the
   // "step_id".  The caller takes ownership of one reference on the
   // returned Rendezvous instance.
-  Rendezvous* Find(int64 step_id) override;
+  //
+  // Note: the caller must guarantee to eventually call Initialize on the
+  // returned RemoteRendezvous
+  RemoteRendezvous* Find(int64 step_id) override;
 
   // Finds the local rendezvous instance for the "step_id".  Runs
   // "done" when the tensor for "key" is produced or an error occurs.
@@ -91,8 +93,7 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
 
  protected:
   virtual BaseRemoteRendezvous* Create(int64 step_id,
-                                       const WorkerEnv* worker_env,
-                                       const string& worker_name) = 0;
+                                       const WorkerEnv* worker_env) = 0;
 
  private:
   // Maps step_id to rendezvous.
@@ -100,7 +101,6 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
 
   // Not owned.
   const WorkerEnv* const worker_env_;
-  const string worker_name_;
 
   mutex mu_;
   Table table_ GUARDED_BY(mu_);
@@ -116,10 +116,13 @@ class BaseRendezvousMgr : public RendezvousMgrInterface {
 // Buffering of Tensor values is delegated to a "local" Rendezvous
 // obtained from NewLocalRendezvous().  This class just adds
 // functionality to coordinate with remote workers.
-class BaseRemoteRendezvous : public Rendezvous {
+class BaseRemoteRendezvous : public RemoteRendezvous {
  public:
-  BaseRemoteRendezvous(const WorkerEnv* env, const string& worker_name,
-                       int64 step_id, bool tolerate_dup_recv);
+  BaseRemoteRendezvous(const WorkerEnv* env, int64 step_id,
+                       bool tolerate_dup_recv);
+
+  // Upgrades the BaseRemoteRendezvous to full initialization.
+  Status Initialize(WorkerSession* session) override;
 
   // Forwards to local_, where the Tensor "val" will be buffered and
   // any waiting callback stored.
@@ -163,10 +166,13 @@ class BaseRemoteRendezvous : public Rendezvous {
   // Removes "call" from active_ if "call" is in active_.
   void DeregisterCall(BaseRecvTensorCall* call);
 
+  WorkerSession* session();
+
+  bool is_initialized();
+
   ~BaseRemoteRendezvous() override;
 
   const WorkerEnv* const env_;  // Not owned.
-  const string worker_name_;
   const int64 step_id_;
 
  private:
@@ -176,10 +182,24 @@ class BaseRemoteRendezvous : public Rendezvous {
 
   // Status given by StartAbort() if any.
   Status status_ GUARDED_BY(mu_);
+  WorkerSession* session_ GUARDED_BY(mu_);  // Not owned.
+
+  // Data structures to handle calls when partially initialized.
+  struct DeferredCall {
+    const ParsedKey parsed;
+    DoneCallback done;
+
+    DeferredCall(const ParsedKey& parsed, DoneCallback done);
+  };
+  std::vector<DeferredCall> deferred_calls_ GUARDED_BY(mu_);
 
   // Active outstanding RecvTensor calls.
   gtl::FlatSet<BaseRecvTensorCall*> active_ GUARDED_BY(mu_);
 
+  bool is_initialized_locked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return session_ != nullptr;
+  }
+
   // If "is_src" is true, checks that the rendezvous key "parsed"'s
   // source is in this process. If "is_src" is false, checks that the
   // rendezvous key "parsed"'s destination is in this process.
@@ -194,6 +214,9 @@ class BaseRemoteRendezvous : public Rendezvous {
                           const Rendezvous::Args& out_args, const Tensor& in,
                           Tensor* out, StatusCallback done);
 
+  // Must be called only if fully initialized.
+  void RecvLocalAsyncInternal(const ParsedKey& parsed, DoneCallback done);
+
   TF_DISALLOW_COPY_AND_ASSIGN(BaseRemoteRendezvous);
 };
 
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 171e75c0d0a68f56fd6d54e64135d5e1e7901443..f4bf9dcd3b92f85af161694ceab9377851d2834a 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/constant_folding.h"
+#include "tensorflow/core/common_runtime/debugger_state_interface.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -41,13 +42,19 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
+#include "tensorflow/core/util/env_var.h"
 
 namespace tensorflow {
 
-GraphMgr::GraphMgr(const WorkerEnv* worker_env,
-                   RendezvousMgrInterface* rendezvous_mgr)
-    : worker_env_(worker_env), rendezvous_mgr_(rendezvous_mgr), table_(5) {
-  CHECK(rendezvous_mgr) << "Rendezvous mgr was null";
+GraphMgr::GraphMgr(const WorkerEnv* worker_env, DeviceMgr* device_mgr)
+    : worker_env_(worker_env), device_mgr_(device_mgr), table_(5) {
+  // The default value of sync_on_finish will be flipped soon and this
+  // environment variable will be removed as well.
+  Status status =
+      ReadBoolFromEnvVar("TF_SYNC_ON_FINISH", true, &sync_on_finish_);
+  if (!status.ok()) {
+    LOG(ERROR) << status.error_message();
+  }
 }
 
 GraphMgr::~GraphMgr() {
@@ -86,6 +93,16 @@ static Status ValidateGraphDefForDevices(const GraphDef& gdef) {
   return Status::OK();
 }
 
+Status GraphMgr::DecorateAndPublishGraphForDebug(
+    const DebugOptions& debug_options, Graph* graph, Device* device) {
+  std::unique_ptr<DebugGraphDecoratorInterface> decorator;
+  TF_RETURN_IF_ERROR(
+      DebugGraphDecoratorRegistry::CreateDecorator(debug_options, &decorator));
+  TF_RETURN_IF_ERROR(decorator->DecorateGraph(graph, device));
+  TF_RETURN_IF_ERROR(decorator->PublishGraph(*graph, device->name()));
+  return Status::OK();
+}
+
 // Creates executors given a graph definition "gdef" of a "session".
 // If a node in "gdef" is shared by other graphs in "session", the
 // same op kernel is reused. E.g., typically a params node is shared
@@ -98,7 +115,8 @@ static Status ValidateGraphDefForDevices(const GraphDef& gdef) {
 // "executors" are filled with one executor per device if success and
 // the caller takes the ownership of returned executors.
 Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
-                          const GraphOptions& graph_options, Item* item) {
+                          const GraphOptions& graph_options,
+                          const DebugOptions& debug_options, Item* item) {
   item->session = session;
   item->lib_def =
       new FunctionLibraryDefinition(OpRegistry::Global(), gdef.library());
@@ -112,7 +130,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
   }
 
   // Constructs the graph out of "gdef".
-  Graph graph(item->lib_def);
+  Graph graph(OpRegistry::Global());
   GraphConstructorOptions opts;
   opts.allow_internal_ops = true;
   opts.expect_device_spec = true;
@@ -128,7 +146,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
   };
   popts.get_incarnation = [this](const string& name) -> int64 {
     Device* device = nullptr;
-    Status s = worker_env_->device_mgr->LookupDevice(name, &device);
+    Status s = device_mgr_->LookupDevice(name, &device);
     if (s.ok()) {
       return device->attributes().incarnation();
     } else {
@@ -144,7 +162,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
 
   std::unordered_map<string, std::unique_ptr<Graph>> partition_graphs;
   for (const auto& partition : partitions) {
-    std::unique_ptr<Graph> device_graph(new Graph(item->lib_def));
+    std::unique_ptr<Graph> device_graph(new Graph(OpRegistry::Global()));
     GraphConstructorOptions device_opts;
     // There are internal operations (e.g., send/recv) that we now allow.
     device_opts.allow_internal_ops = true;
@@ -173,8 +191,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
     ExecutionUnit* unit = &(item->units.back());
 
     // Find the device.
-    Status s =
-        worker_env_->device_mgr->LookupDevice(device_name, &unit->device);
+    Status s = device_mgr_->LookupDevice(device_name, &unit->device);
     if (!s.ok()) {
       // Remove the empty unit from the item as the item destructor wants all
       // units to have valid devices.
@@ -194,7 +211,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
 
     // Function library runtime.
     unit->lib = NewFunctionLibraryRuntime(
-        worker_env_->device_mgr, worker_env_->env, unit->device,
+        device_mgr_, worker_env_->env, unit->device,
         subgraph->versions().producer(), item->lib_def,
         graph_options.optimizer_options());
 
@@ -224,6 +241,13 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
     };
 
     optimizer.Optimize(lib, worker_env_->env, params.device, &subgraph);
+
+    // EXPERIMENTAL: tfdbg inserts debug nodes (i.e., probes) to the graph.
+    if (!debug_options.debug_tensor_watch_opts().empty()) {
+      TF_RETURN_IF_ERROR(DecorateAndPublishGraphForDebug(
+          debug_options, subgraph.get(), params.device));
+    }
+
     TF_RETURN_IF_ERROR(
         EnsureMemoryTypes(DeviceType(unit->device->device_type()),
                           unit->device->name(), subgraph.get()));
@@ -239,9 +263,10 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
 }
 
 Status GraphMgr::Register(const string& session, const GraphDef& gdef,
-                          const GraphOptions& graph_options, string* handle) {
+                          const GraphOptions& graph_options,
+                          const DebugOptions& debug_options, string* handle) {
   Item* item = new Item;
-  Status s = InitItem(session, gdef, graph_options, item);
+  Status s = InitItem(session, gdef, graph_options, debug_options, item);
   if (!s.ok()) {
     item->Unref();
     return s;
@@ -375,9 +400,7 @@ void GraphMgr::RecvOutputsFromRendezvousAsync(Rendezvous* rendezvous,
             }
           }
           call_state->mu.lock();
-          if (status.ok()) {
-            call_state->shared_status = status;
-          }
+          call_state->shared_status.Update(status);
           call_state->done_counter--;
           // If we are the last async call to return, call the done callback.
           if (call_state->done_counter == 0) {
@@ -393,14 +416,14 @@ void GraphMgr::RecvOutputsFromRendezvousAsync(Rendezvous* rendezvous,
 }
 
 Status GraphMgr::SendInputs(const int64 step_id, const NamedTensors& in) {
-  Rendezvous* rendezvous = rendezvous_mgr_->Find(step_id);
+  Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = SendInputsToRendezvous(rendezvous, in);
   rendezvous->Unref();
   return s;
 }
 
 Status GraphMgr::RecvOutputs(const int64 step_id, NamedTensors* out) {
-  Rendezvous* rendezvous = rendezvous_mgr_->Find(step_id);
+  Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = RecvOutputsFromRendezvous(rendezvous, out);
   rendezvous->Unref();
   return s;
@@ -408,7 +431,7 @@ Status GraphMgr::RecvOutputs(const int64 step_id, NamedTensors* out) {
 
 void GraphMgr::RecvOutputsAsync(const int64 step_id, NamedTensors* out,
                                 StatusCallback done) {
-  Rendezvous* rendezvous = rendezvous_mgr_->Find(step_id);
+  Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   RecvOutputsFromRendezvousAsync(rendezvous, out,
                                  [done, rendezvous](const Status s) {
                                    rendezvous->Unref();
@@ -417,7 +440,8 @@ void GraphMgr::RecvOutputsAsync(const int64 step_id, NamedTensors* out,
 }
 
 void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
-                            const ExecutorOpts& opts,
+                            WorkerSession* session,
+                            const ExecutorOpts& /*opts*/,
                             StepStatsCollector* collector,
                             CostGraphDef* cost_graph,
                             CancellationManager* cancellation_manager,
@@ -438,10 +462,14 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
     return;
   }
 
-  Rendezvous* rendezvous = rendezvous_mgr_->Find(step_id);
+  RemoteRendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
+  Status s = rendezvous->Initialize(session);
 
   // Sends values specified by the caller.
-  Status s = SendInputsToRendezvous(rendezvous, in);
+  if (s.ok()) {
+    s = SendInputsToRendezvous(rendezvous, in);
+  }
+
   if (!s.ok()) {
     done(s);
     item->Unref();
@@ -466,10 +494,9 @@ void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
                                       StatusCallback done) {
   const int num_units = item->units.size();
   CHECK_GE(num_units, 1);
-  ScopedStepContainer* step_container =
-      new ScopedStepContainer(step_id, [this](const string& name) {
-        worker_env_->device_mgr->ClearContainers({name});
-      });
+  ScopedStepContainer* step_container = new ScopedStepContainer(
+      step_id,
+      [this](const string& name) { device_mgr_->ClearContainers({name}); });
   // NOTE: Transfer one ref of rendezvous and item.
   ExecutorBarrier* barrier =
       new ExecutorBarrier(num_units, rendezvous,
@@ -488,7 +515,7 @@ void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
   args.cancellation_manager = cancellation_manager;
   args.stats_collector = collector;
   args.step_container = step_container;
-  args.sync_on_finish = true;
+  args.sync_on_finish = sync_on_finish_;
   if (LogMemory::IsEnabled()) {
     LogMemory::RecordStep(args.step_id, handle);
   }
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index 4477a2764bec7dcd586e49159a40def1cf288fd0..4ee3711d02861cb7e22f4ede21964658079be711 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -30,12 +30,15 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/protobuf/debug.pb.h"
 
 namespace tensorflow {
 
 class ExecutorOpts;
 class StepStatsCollector;
 class RendezvousMgrInterface;
+class DeviceMgr;
+struct WorkerSession;
 
 // GraphMgr keeps track of a set of graphs that are registered with a
 // TensorFlow worker. Each registered graph is identified by a handle
@@ -61,13 +64,13 @@ class RendezvousMgrInterface;
 //   EXPECT_EQ(out["c"], Tensor({4, 6}));
 class GraphMgr {
  public:
-  explicit GraphMgr(const WorkerEnv* worker_env,
-                    RendezvousMgrInterface* rendezvous_mgr);
+  explicit GraphMgr(const WorkerEnv* worker_env, DeviceMgr* device_mgr);
   ~GraphMgr();
 
   // Registers a graph. Fills in "handle"
   Status Register(const string& session, const GraphDef& gdef,
-                  const GraphOptions& graph_options, string* handle);
+                  const GraphOptions& graph_options,
+                  const DebugOptions& debug_options, string* handle);
 
   // Executes one step of a registered graph "handle".
   //
@@ -76,8 +79,8 @@ class GraphMgr {
   typedef std::map<string, Tensor> NamedTensors;
   typedef std::function<void(const Status&)> StatusCallback;
   void ExecuteAsync(const string& handle, const int64 step_id,
-                    const ExecutorOpts& opts, StepStatsCollector* collector,
-                    CostGraphDef* cost_graph,
+                    WorkerSession* session, const ExecutorOpts& opts,
+                    StepStatsCollector* collector, CostGraphDef* cost_graph,
                     CancellationManager* cancellation_manager,
                     const NamedTensors& in, StatusCallback done);
 
@@ -105,9 +108,9 @@ class GraphMgr {
   };
 
   struct Item : public core::RefCounted {
-    // TOOD(zhifengc): Keeps a copy of the original graph if the need arises.
-    // TOOD(zhifengc): Stats, updated by multiple runs potentially.
-    // TOOD(zhifengc): Dup-detection. Ensure step_id only run once.
+    // TODO(zhifengc): Keeps a copy of the original graph if the need arises.
+    // TODO(zhifengc): Stats, updated by multiple runs potentially.
+    // TODO(zhifengc): Dup-detection. Ensure step_id only run once.
     ~Item() override;
 
     // Session handle.
@@ -123,13 +126,13 @@ class GraphMgr {
     // has a root executor which may call into the runtime library.
     std::vector<ExecutionUnit> units;
 
-    // Used to deresgister a cost model when cost model is requried in graph
+    // Used to deresgister a cost model when cost model is required in graph
     // manager.
     GraphMgr* graph_mgr;
   };
 
   const WorkerEnv* worker_env_;             // Not owned.
-  RendezvousMgrInterface* rendezvous_mgr_;  // Not owned.
+  DeviceMgr* device_mgr_;
 
   CostModelManager cost_model_manager_;
 
@@ -137,6 +140,9 @@ class GraphMgr {
   mutex mu_;
   int64 next_id_ GUARDED_BY(mu_) = 0;
 
+  // If true, blocks until device has finished all queued operations in a step.
+  bool sync_on_finish_ = true;
+
   // Table mapping graph handles to registered graphs.
   //
   // TODO(zhifengc): If the client does not call Deregister, we'll
@@ -151,7 +157,7 @@ class GraphMgr {
                               CancellationManager* cancellation_manager,
                               StatusCallback done);
 
-  // Don't attempt to process cost models unless explicitely requested for at
+  // Don't attempt to process cost models unless explicitly requested for at
   // least one of the items.
   bool skip_cost_models_ = true;
 
@@ -164,7 +170,11 @@ class GraphMgr {
                                       const StatusCallback& done);
 
   Status InitItem(const string& session, const GraphDef& gdef,
-                  const GraphOptions& graph_options, Item* item);
+                  const GraphOptions& graph_options,
+                  const DebugOptions& debug_options, Item* item);
+
+  Status DecorateAndPublishGraphForDebug(const DebugOptions& debug_options,
+                                         Graph* graph, Device* device);
 
   TF_DISALLOW_COPY_AND_ASSIGN(GraphMgr);
 };
diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc
index 1bb706e930b6356cb90e82d038dbffe514838579..c7ba7abeaffc654b24adfcc320ed45990cf5bc77 100644
--- a/tensorflow/core/distributed_runtime/local_master.cc
+++ b/tensorflow/core/distributed_runtime/local_master.cc
@@ -23,9 +23,12 @@ limitations under the License.
 namespace tensorflow {
 
 namespace {
-
-Status WaitForNotification(CallOptions* call_options, Notification* n) {
+Status WaitForNotification(CallOptions* call_options,
+                           const int64 default_timeout_in_ms, Notification* n) {
   int64 timeout_in_ms = call_options->GetTimeout();
+  if (timeout_in_ms == 0) {
+    timeout_in_ms = default_timeout_in_ms;
+  }
   if (timeout_in_ms > 0) {
     int64 timeout_in_us = timeout_in_ms * 1000;
     bool notified = WaitForNotificationWithTimeout(n, timeout_in_us);
@@ -41,9 +44,11 @@ Status WaitForNotification(CallOptions* call_options, Notification* n) {
   }
   return Status::OK();
 }
-}
+}  // namespace
 
-LocalMaster::LocalMaster(Master* master_impl) : master_impl_(master_impl) {}
+LocalMaster::LocalMaster(Master* master_impl, const int64 default_timeout_in_ms)
+    : master_impl_(master_impl),
+      default_timeout_in_ms_(default_timeout_in_ms) {}
 
 Status LocalMaster::CreateSession(CallOptions* call_options,
                                   const CreateSessionRequest* request,
@@ -54,7 +59,8 @@ Status LocalMaster::CreateSession(CallOptions* call_options,
     ret.Update(s);
     n.Notify();
   });
-  TF_RETURN_IF_ERROR(WaitForNotification(call_options, &n));
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
   return ret;
 }
 
@@ -67,7 +73,8 @@ Status LocalMaster::ExtendSession(CallOptions* call_options,
     ret.Update(s);
     n.Notify();
   });
-  TF_RETURN_IF_ERROR(WaitForNotification(call_options, &n));
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
   return ret;
 }
 
@@ -80,7 +87,8 @@ Status LocalMaster::PartialRunSetup(CallOptions* call_options,
     ret.Update(s);
     n.Notify();
   });
-  TF_RETURN_IF_ERROR(WaitForNotification(call_options, &n));
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
   return ret;
 }
 
@@ -94,7 +102,8 @@ Status LocalMaster::RunStep(CallOptions* call_options,
                           ret.Update(s);
                           n.Notify();
                         });
-  TF_RETURN_IF_ERROR(WaitForNotification(call_options, &n));
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
   return ret;
 }
 
@@ -115,7 +124,8 @@ Status LocalMaster::CloseSession(CallOptions* call_options,
     ret.Update(s);
     n.Notify();
   });
-  TF_RETURN_IF_ERROR(WaitForNotification(call_options, &n));
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
   return ret;
 }
 
@@ -128,7 +138,8 @@ Status LocalMaster::ListDevices(CallOptions* call_options,
     ret.Update(s);
     n.Notify();
   });
-  TF_RETURN_IF_ERROR(WaitForNotification(call_options, &n));
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
   return ret;
 }
 
@@ -141,7 +152,8 @@ Status LocalMaster::Reset(CallOptions* call_options,
     ret.Update(s);
     n.Notify();
   });
-  TF_RETURN_IF_ERROR(WaitForNotification(call_options, &n));
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
   return ret;
 }
 
@@ -151,7 +163,15 @@ mutex* get_local_master_registry_lock() {
   return &local_master_registry_lock;
 }
 
-typedef std::unordered_map<string, Master*> LocalMasterRegistry;
+struct MasterInfo {
+  Master* master;
+  const int64 default_timeout_in_ms;
+
+  MasterInfo(Master* master, const int64 default_timeout_in_ms)
+      : master(master), default_timeout_in_ms(default_timeout_in_ms) {}
+};
+
+typedef std::unordered_map<string, MasterInfo> LocalMasterRegistry;
 LocalMasterRegistry* local_master_registry() {
   static LocalMasterRegistry* local_master_registry_ = new LocalMasterRegistry;
   return local_master_registry_;
@@ -159,9 +179,11 @@ LocalMasterRegistry* local_master_registry() {
 }  // namespace
 
 /* static */
-void LocalMaster::Register(const string& target, Master* master) {
+void LocalMaster::Register(const string& target, Master* master,
+                           int64 default_timeout_in_ms) {
   mutex_lock l(*get_local_master_registry_lock());
-  local_master_registry()->insert({target, master});
+  local_master_registry()->insert(
+      {target, MasterInfo(master, default_timeout_in_ms)});
 }
 
 /* static */
@@ -170,7 +192,8 @@ std::unique_ptr<LocalMaster> LocalMaster::Lookup(const string& target) {
   mutex_lock l(*get_local_master_registry_lock());
   auto iter = local_master_registry()->find(target);
   if (iter != local_master_registry()->end()) {
-    ret.reset(new LocalMaster(iter->second));
+    ret.reset(new LocalMaster(iter->second.master,
+                              iter->second.default_timeout_in_ms));
   }
   return ret;
 }
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index 33b81c33c0d481325fb92a53d20761807d3e1d65..5fc21d3a1e25faa5f6478914c69a3d513b50530c 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_REGISTRY_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_REGISTRY_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
 
 #include <memory>
 
@@ -38,7 +38,7 @@ class Master;
 // for cancellation.
 class LocalMaster : public MasterInterface {
  public:
-  ~LocalMaster(){};
+  ~LocalMaster() {}
 
   Status CreateSession(CallOptions* call_options,
                        const CreateSessionRequest* request,
@@ -78,7 +78,8 @@ class LocalMaster : public MasterInterface {
   // any LocalMaster objects that may wrap this master. There is no
   // corresponding deregister method, since clean server shutdown is
   // not currently implemented for any server type.
-  static void Register(const string& target, Master* master);
+  static void Register(const string& target, Master* master,
+                       int64 default_timeout_in_ms);
 
   // Returns a pointer to the local master associated with the given
   // `target`, or nullptr if none exists.
@@ -86,14 +87,15 @@ class LocalMaster : public MasterInterface {
 
  private:
   Master* master_impl_;  // Not owned.
+  const int64 default_timeout_in_ms_;
 
   // See `LocalMaster::Lookup` for the factory function that creates
   // objects of this type.
-  LocalMaster(Master* master_impl);
+  LocalMaster(Master* master_impl, const int64 default_timeout_in_ms);
 
   TF_DISALLOW_COPY_AND_ASSIGN(LocalMaster);
 };
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_REGISTRY_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_LOCAL_MASTER_H_
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index facbfbb2643f6104cae7663c9054992c1ff35605..81d938bd4f07a8527097cc70211f3ec6a34ffb7e 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -25,7 +25,7 @@ limitations under the License.
 // A Master discovers remote devices on-demand and keeps track of
 // statistics of those remote devices.
 //
-// Each session analyses the graph, places nodes across available
+// Each session analyzes the graph, places nodes across available
 // devices, and ultimately drives the graph computation by initiating
 // RunGraph on the workers.
 
@@ -34,6 +34,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/distributed_runtime/remote_device.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
@@ -48,12 +49,17 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 
+namespace {
+const char* const kGrpcProtocol = "grpc://";
+}  // namespace
+
 Master::Master(MasterEnv* env, double session_gc_seconds)
     : env_(env),
       last_1000_steps_(1000),
@@ -198,7 +204,7 @@ class DeviceFinder {
     while (num_pending_ != 0) {
       pending_zero_.wait_for(l, std::chrono::milliseconds(kLoggingPeriodMs));
       if (num_pending_ != 0) {
-        for (int i = 0; i < targets_.size(); ++i) {
+        for (size_t i = 0; i < targets_.size(); ++i) {
           if (!seen_targets_[i]) {
             LOG(INFO)
                 << "CreateSession still waiting for response from worker: "
@@ -290,25 +296,122 @@ void Master::CreateSession(const CreateSessionRequest* req,
                            CreateSessionResponse* resp, MyClosure done) {
   SchedClosure([this, req, resp, done]() {
     Status status;
+    WorkerCacheFactoryOptions worker_cache_factory_options;
+    string grpc_protocol("grpc");
+    worker_cache_factory_options.protocol = &grpc_protocol;
     auto call_done = gtl::MakeCleanup([&status, &done] { done(status); });
     status = ValidateExternalGraphDefSyntax(req->graph_def());
     if (!status.ok()) return;
-    // Ping all the workers and build the list of devices that the
-    // session will use.
+
+    // The following 4 variables are set differently, depending on whether this
+    // session uses a client-provided clusterspec or not.
+    WorkerCacheInterface* worker_cache = nullptr;
+    // Note: worker_cache_ptr will be null except if this session is using a
+    // client-supplied ClusterDef (ClusterSpec propagation).
+    std::unique_ptr<WorkerCacheInterface> worker_cache_ptr;
+    std::unique_ptr<DeviceSet> device_set;
     // TODO(saeta): Convert to std::make_unique when available.
     std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devices(
         new std::vector<std::unique_ptr<Device>>());
-    status = DeviceFinder::GetRemoteDevices(req->config().device_filters(),
-                                            env_, env_->worker_cache,
-                                            remote_devices.get());
-    if (!status.ok()) return;
+
+    if (req->config().has_cluster_def()) {
+      worker_cache_factory_options.cluster_def = &req->config().cluster_def();
+
+      // Set the server_def's job_name and task_index fields.
+      string normalized_string;
+      string grpc_protocol(kGrpcProtocol);
+      if (req->target().compare(0, grpc_protocol.length(), grpc_protocol) ==
+          0) {
+        normalized_string =
+            req->target().substr(grpc_protocol.length(), string::npos);
+      } else {
+        normalized_string = req->target();
+      }
+      for (auto&& job : req->config().cluster_def().job()) {
+        for (auto&& task : job.tasks()) {
+          if (task.second == normalized_string) {
+            if (worker_cache_factory_options.job_name != nullptr) {
+              status = errors::InvalidArgument(
+                  "Found multiple matching tasks that correspond to "
+                  "to the master. Master target: '",
+                  req->target(), "'. ClusterDef: ",
+                  req->config().cluster_def().ShortDebugString());
+              LOG(ERROR) << status;
+              return;
+            }
+            if (env_->local_devices[0]->parsed_name().job == job.name() &&
+                env_->local_devices[0]->parsed_name().task == task.first) {
+              // TODO(b/37868888): Remove this limitation when resolved
+              status = errors::InvalidArgument(
+                  "The ClusterSpec names the job and task index to be the same "
+                  "names that were provided when the server booted. This is "
+                  "currently not allowed. Job: ",
+                  job.name(), ", task index: ", task.first);
+              return;
+            }
+            worker_cache_factory_options.job_name = &job.name();
+            worker_cache_factory_options.task_index = task.first;
+          }
+        }
+      }
+
+      // Create the worker cache from the computed server_def.
+      status = env_->worker_cache_factory(worker_cache_factory_options,
+                                          &worker_cache);
+      if (!status.ok()) return;
+      worker_cache_ptr = std::unique_ptr<WorkerCacheInterface>(worker_cache);
+      // Ping all the workers and build the list of devices that the
+      // session will use.
+      status =
+          DeviceFinder::GetRemoteDevices(req->config().device_filters(), env_,
+                                         worker_cache, remote_devices.get());
+      if (!status.ok()) return;
+      device_set.reset(new DeviceSet);
+      for (auto&& d : *remote_devices) {
+        device_set->AddDevice(d.get());
+        DeviceNameUtils::ParsedName name = d->parsed_name();
+        if (name.job == *worker_cache_factory_options.job_name &&
+            name.task == worker_cache_factory_options.task_index &&
+            name.type == "CPU") {
+          device_set->set_client_device(d.get());
+        }
+      }
+    } else {
+      worker_cache = env_->worker_cache;
+      // Ping all the workers and build the list of devices that the
+      // session will use.
+      status =
+          DeviceFinder::GetRemoteDevices(req->config().device_filters(), env_,
+                                         worker_cache, remote_devices.get());
+      if (!status.ok()) return;
+      device_set.reset(new DeviceSet);
+      for (auto&& d : *remote_devices) {
+        device_set->AddDevice(d.get());
+      }
+      int num_local_devices = 0;
+      for (Device* d : env_->local_devices) {
+        device_set->AddDevice(d);
+        if (num_local_devices == 0) {
+          // Uses the first local device as the client device.
+          device_set->set_client_device(d);
+        }
+        num_local_devices++;
+      }
+    }
+
+    CHECK(device_set->client_device());
+
     SessionOptions options;
     options.config = req->config();
-    MasterSession* session =
-        env_->master_session_factory(options, env_, std::move(remote_devices));
+
+    MasterSession* session = env_->master_session_factory(
+        options, env_, std::move(remote_devices), std::move(worker_cache_ptr),
+        std::move(device_set));
+
     GraphDef* gdef =
         const_cast<CreateSessionRequest*>(req)->mutable_graph_def();
-    status = session->Create(gdef);
+
+    status = session->Create(gdef, worker_cache_factory_options);
     if (!status.ok()) {
       session->Close().IgnoreError();
       session->Unref();
diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h
index 01218fb1556e8453f87fb448fe5a3a11c6dc7d22..bb548adda1586a65f1914f322ce800ebb84a474f 100644
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@@ -19,17 +19,41 @@ limitations under the License.
 #include <functional>
 #include <vector>
 
-#include "tensorflow/core/distributed_runtime/master_session.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 
 class Device;
+class DeviceSet;
 class Env;
 class MasterSession;
 class OpRegistryInterface;
 class WorkerCacheInterface;
 
+// Options passed to the worker_cache_factory function.
+struct WorkerCacheFactoryOptions {
+  const ClusterDef* cluster_def = nullptr;
+  const string* job_name = nullptr;
+  int task_index;
+  const string* protocol = nullptr;
+
+  WorkerCacheFactoryOptions() {}
+
+  // Construct from a ServerDef proto.
+  //
+  // Note: server_def must outlive WorkerCacheFactoryOptions!
+  WorkerCacheFactoryOptions(const ServerDef& server_def) {
+    if (server_def.has_cluster() && !server_def.job_name().empty()) {
+      cluster_def = &server_def.cluster();
+      job_name = &server_def.job_name();
+      task_index = server_def.task_index();
+      protocol = &server_def.protocol();
+    }
+  }
+};
+
 // The master environment class, which holds a bag of pointers to
 // per-master state.
 //
@@ -56,9 +80,15 @@ struct MasterEnv {
   // `MasterSession`, which may not be null. Ownership of the
   // `MasterEnv*` is retained by the caller.
   std::function<MasterSession*(
-      const SessionOptions&, MasterEnv*,
-      std::unique_ptr<std::vector<std::unique_ptr<Device>>>)>
+      SessionOptions, MasterEnv*,
+      std::unique_ptr<std::vector<std::unique_ptr<Device>>>,
+      std::unique_ptr<WorkerCacheInterface>,
+      std::unique_ptr<DeviceSet> device_set)>
       master_session_factory;
+
+  std::function<Status(const WorkerCacheFactoryOptions&,
+                       WorkerCacheInterface**)>
+      worker_cache_factory;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 870df353cb6686b6bfd054d06973b3647c1a4b48..dddff4dce486b6fa6d6dea29620d7802badcdf3a 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/profile_handler.h"
 #include "tensorflow/core/common_runtime/stats_publisher_interface.h"
+#include "tensorflow/core/debug/debug_graph_utils.h"
 #include "tensorflow/core/distributed_runtime/scheduler.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
@@ -35,11 +36,13 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -60,38 +63,23 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   ReffedClientGraph(const string& handle, const BuildGraphOptions& bopts,
                     std::unique_ptr<SimpleClientGraph> cg,
                     const SessionOptions& session_opts,
-                    StatsPublisherFactory stats_publisher_factory,
+                    const StatsPublisherFactory& stats_publisher_factory,
                     SimpleGraphExecutionState* execution_state, bool is_partial,
                     WorkerCacheInterface* worker_cache)
       : session_handle_(handle),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
+        debug_opts_(bopts.debug_options),
         worker_cache_(worker_cache) {
     VLOG(1) << "Created ReffedClientGraph for node with "
             << client_graph_->graph.num_node_ids();
 
     stats_publisher_ = stats_publisher_factory(handle, bopts, session_opts);
 
-    // If this is a partial run we need to initialize a name to node map for
-    // testing that fetches are reachable.
-    if (is_partial) {
-      std::unordered_set<StringPiece, StringPiece::Hasher> names;
-      for (const string& input : bopts.feed_endpoints) {
-        TensorId id(ParseTensorName(input));
-        names.emplace(id.first);
-      }
-      for (const string& output : bopts.fetch_endpoints) {
-        TensorId id(ParseTensorName(output));
-        names.emplace(id.first);
-      }
-      // We use the graph from the execution_state because we want the graph
-      // nodes before they are rewritten replaced by the rewriter.
-      for (Node* n : execution_state->full_graph()->nodes()) {
-        if (names.count(n->name()) > 0) {
-          name_to_node_.insert({n->name(), n});
-        }
-      }
+    // Initialize a name to node map for testing that fetches are reachable.
+    for (Node* n : execution_state->full_graph()->nodes()) {
+      name_to_node_.insert({n->name(), n});
     }
   }
 
@@ -146,8 +134,9 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
       for (auto& p : partitions_) {
         LoggingResponse* resp = new LoggingResponse;
         p.worker->LoggingAsync(
-            &req, resp, [step_id, ss, resp, &scoped_mu, &waiting_for,
-                         &all_done](const Status& s) {
+            &req, resp,
+            [step_id, ss, resp, &scoped_mu, &waiting_for,
+             &all_done](const Status& s) {
               {
                 mutex_lock l(scoped_mu);
                 if (s.ok()) {
@@ -175,14 +164,12 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   // Partitions the graph into subgraphs and registers them on
   // workers.
   Status RegisterPartitions(const PartitionOptions& popts,
-                            const FunctionDefLibrary& func_def_lib);
+                            const FunctionLibraryDefinition& flib_def);
 
   // Runs one step of all partitions.
   Status RunPartitions(const MasterEnv* env, int64 step_id,
-                       int64 execution_count,
-                       SimpleGraphExecutionState* execution_state,
-                       PerStepState* pss, CallOptions* opts,
-                       const RunStepRequestWrapper& req,
+                       int64 execution_count, PerStepState* pss,
+                       CallOptions* opts, const RunStepRequestWrapper& req,
                        MutableRunStepResponseWrapper* resp,
                        CancellationManager* cm, const bool is_last_partial_run);
 
@@ -192,18 +179,16 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   // Post-processing of any runtime statistics gathered during execution.
   void ProcessStats(int64 step_id, PerStepState* pss,
-                    SimpleGraphExecutionState* execution_state,
                     ProfileHandler* ph, const RunOptions& options,
                     RunMetadata* resp);
   void ProcessDeviceStats(ProfileHandler* ph,
-                          const SimpleGraphExecutionState* execution_state,
                           const DeviceStepStats& ds, bool is_rpc);
   // Checks that the requested fetches can be computed from the provided feeds.
   Status CheckFetches(const RunStepRequestWrapper& req,
                       const RunState* run_state,
                       SimpleGraphExecutionState* execution_state);
 
-  string DetailText(const NodeDef& def, const NodeExecStats& ns) {
+  string DetailText(const Node& node, const NodeExecStats& ns) {
     int64 tot = 0;
     for (auto& no : ns.output()) {
       tot += no.tensor_description().allocation_description().requested_bytes();
@@ -212,12 +197,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     if (tot >= 0.1 * 1048576.0) {
       bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
     }
-    return strings::StrCat(
-        bytes, def.name(), " = ", def.op(), "(",
-        str_util::Join(
-            std::vector<StringPiece>(def.input().begin(), def.input().end()),
-            ", "),
-        ")");
+    return strings::StrCat(bytes, node.name(), " = ", node.type_string(), "(",
+                           str_util::Join(node.requested_inputs(), ", "), ")");
   }
 
  private:
@@ -225,6 +206,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   const std::unique_ptr<SimpleClientGraph> client_graph_;
   const SessionOptions session_opts_;
   const bool is_partial_;
+  const DebugOptions& debug_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   std::unordered_map<StringPiece, Node*, StringPiece::Hasher> name_to_node_;
 
@@ -289,7 +271,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 };
 
 Status MasterSession::ReffedClientGraph::RegisterPartitions(
-    const PartitionOptions& popts, const FunctionDefLibrary& func_def_lib) {
+    const PartitionOptions& popts, const FunctionLibraryDefinition& flib_def) {
   {  // Ensure register once.
     mu_.lock();
     if (!init_started_) {
@@ -297,17 +279,20 @@ Status MasterSession::ReffedClientGraph::RegisterPartitions(
       mu_.unlock();
       std::unordered_map<string, GraphDef> graph_defs;
       Status s = DoBuildPartitions(popts, &graph_defs);
-      // NOTE(mrry): The pointers in `graph_defs_for_publishing` do not remain
-      // valid after the call to DoRegisterPartitions begins, so
-      // `stats_publisher_` must make a copy if it wants to retain the
-      // GraphDef objects.
-      std::vector<const GraphDef*> graph_defs_for_publishing;
-      graph_defs_for_publishing.reserve(partitions_.size());
-      for (const auto& name_def : graph_defs) {
-        graph_defs_for_publishing.push_back(&name_def.second);
+      if (s.ok()) {
+        // NOTE(mrry): The pointers in `graph_defs_for_publishing` do not remain
+        // valid after the call to DoRegisterPartitions begins, so
+        // `stats_publisher_` must make a copy if it wants to retain the
+        // GraphDef objects.
+        std::vector<const GraphDef*> graph_defs_for_publishing;
+        graph_defs_for_publishing.reserve(partitions_.size());
+        for (const auto& name_def : graph_defs) {
+          graph_defs_for_publishing.push_back(&name_def.second);
+        }
+        stats_publisher_->PublishGraphProto(graph_defs_for_publishing);
+        s = DoRegisterPartitions(popts, flib_def.ToProto(),
+                                 std::move(graph_defs));
       }
-      stats_publisher_->PublishGraphProto(graph_defs_for_publishing);
-      s = DoRegisterPartitions(popts, func_def_lib, std::move(graph_defs));
       mu_.lock();
       init_result_ = s;
       init_done_.Notify();
@@ -382,7 +367,6 @@ Status MasterSession::ReffedClientGraph::DoBuildPartitions(
   }
 
   // Partition the graph.
-  Status s;
   return Partition(popts, &client_graph_->graph, out_partitions);
 }
 
@@ -424,6 +408,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     // For simplicity, we ship the library completely to every worker.
     *c->req.mutable_graph_def()->mutable_library() = func_def_lib;
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
+    *c->req.mutable_debug_options() = debug_opts_;
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -440,19 +425,6 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
   return s;
 }
 
-static bool CopyIfNeeded(TensorProto* in, TensorProto* out) {
-  if (in->tensor_content().empty()) {
-    // If the tensor is not encoded in tensor_content or contains 0
-    // elements, we can return it to the client directly.
-    out->Swap(in);
-  } else {
-    Tensor t(in->dtype());
-    if (!t.FromProto(cpu_allocator(), *in)) return false;
-    t.AsProtoTensorContent(out);
-  }
-  return true;
-}
-
 // Helper class to manage "num" parallel RunGraph calls.
 class RunManyGraphs {
  public:
@@ -511,8 +483,7 @@ class RunManyGraphs {
 
 Status MasterSession::ReffedClientGraph::RunPartitions(
     const MasterEnv* env, int64 step_id, int64 execution_count,
-    SimpleGraphExecutionState* execution_state, PerStepState* pss,
-    CallOptions* call_opts, const RunStepRequestWrapper& req,
+    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
     MutableRunStepResponseWrapper* resp, CancellationManager* cm,
     const bool is_last_partial_run) {
   VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
@@ -555,6 +526,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
       c->req->set_is_partial(is_partial_);
       c->req->set_is_last_partial_run(is_last_partial_run);
     }
+    c->req->set_session_handle(session_handle_);
     c->req->set_graph_handle(part.graph_handle);
     c->req->set_step_id(step_id);
     *c->req->mutable_exec_opts() = exec_opts;
@@ -565,7 +537,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
     // We keep these as separate paths for now, to ensure we aren't
     // inadvertently slowing down the normal run path.
     if (is_partial_) {
-      for (int i = 0; i < req.num_feeds(); ++i) {
+      for (size_t i = 0; i < req.num_feeds(); ++i) {
         const string& name = req.feed_name(i);
         auto iter = part.feed_key.find(name);
         if (iter == part.feed_key.end()) {
@@ -577,7 +549,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
         if (feeds_iter == feeds.end()) {
           return errors::InvalidArgument("No feed is provided for feed=", name,
                                          ", key=", key);
-        } else if (feeds_iter->second != i) {
+        } else if (feeds_iter->second != static_cast<size_t>(i)) {
           return errors::Internal("Cannot find feed named \"", name,
                                   " in request.");
         }
@@ -585,7 +557,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
       }
       // TODO(suharshs): Make a map from feed to fetch_key to make this faster.
       // For now, we just iterate through partitions to find the matching key.
-      for (int i = 0; i < req.num_fetches(); ++i) {
+      for (int i = 0; static_cast<size_t>(i) < req.num_fetches(); ++i) {
         const string& req_fetch = req.fetch_name(i);
         for (const auto& key_fetch : part.key_fetch) {
           if (key_fetch.second == req_fetch) {
@@ -736,10 +708,11 @@ void MasterSession::ReffedClientGraph::CleanupPartitionsAsync(
   }
 }
 
-void MasterSession::ReffedClientGraph::ProcessStats(
-    int64 step_id, PerStepState* pss,
-    SimpleGraphExecutionState* execution_state, ProfileHandler* ph,
-    const RunOptions& options, RunMetadata* resp) {
+void MasterSession::ReffedClientGraph::ProcessStats(int64 step_id,
+                                                    PerStepState* pss,
+                                                    ProfileHandler* ph,
+                                                    const RunOptions& options,
+                                                    RunMetadata* resp) {
   if (!pss->collect_costs && !pss->collect_timeline) return;
 
   // Out-of-band logging data is collected now, during post-processing.
@@ -751,13 +724,13 @@ void MasterSession::ReffedClientGraph::ProcessStats(
     const StepStats& ss = pss->step_stats[i];
     if (ph) {
       for (const auto& ds : ss.dev_stats()) {
-        ProcessDeviceStats(ph, execution_state, ds, false /*is_rpc*/);
+        ProcessDeviceStats(ph, ds, false /*is_rpc*/);
       }
     }
   }
   if (ph) {
     for (const auto& ds : pss->rpc_stats.dev_stats()) {
-      ProcessDeviceStats(ph, execution_state, ds, true /*is_rpc*/);
+      ProcessDeviceStats(ph, ds, true /*is_rpc*/);
     }
     ph->StepDone(pss->start_micros, pss->end_micros,
                  Microseconds(0) /*cleanup_time*/, 0 /*total_runops*/,
@@ -781,8 +754,7 @@ void MasterSession::ReffedClientGraph::ProcessStats(
 }
 
 void MasterSession::ReffedClientGraph::ProcessDeviceStats(
-    ProfileHandler* ph, const SimpleGraphExecutionState* execution_state,
-    const DeviceStepStats& ds, bool is_rpc) {
+    ProfileHandler* ph, const DeviceStepStats& ds, bool is_rpc) {
   const string& dev_name = ds.device();
   VLOG(1) << "Device " << dev_name << " reports stats for "
           << ds.node_stats_size() << " nodes";
@@ -793,7 +765,7 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
       ph->RecordOneOp(dev_name, ns, true /*is_copy*/, "", ns.node_name(),
                       ns.timeline_label());
     } else {
-      const Node* node = execution_state->get_node_by_name(ns.node_name());
+      const Node* node = name_to_node_[ns.node_name()];
       const bool found_node_in_graph = node != nullptr;
       if (!found_node_in_graph && ns.timeline_label().empty()) {
         // The counter incrementing is not thread-safe. But we don't really
@@ -814,7 +786,7 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
       if (!ns.timeline_label().empty()) {
         details = ns.timeline_label();
       } else if (found_node_in_graph) {
-        details = DetailText(node->def(), ns);
+        details = DetailText(*node, ns);
       } else {
         // Leave details string empty
       }
@@ -828,16 +800,20 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
 // TODO(suharsh,mrry): Build a map from fetch target to set of feeds it depends
 // on once at setup time to prevent us from computing the dependencies
 // everytime.
+// TODO(suharshs,mrry): Consider removing the need for execution_state to reduce
+// contention.
 Status MasterSession::ReffedClientGraph::CheckFetches(
     const RunStepRequestWrapper& req, const RunState* run_state,
     SimpleGraphExecutionState* execution_state) {
   // Build the set of pending feeds that we haven't seen.
   std::unordered_set<TensorId, TensorId::Hasher> pending_feeds;
-  for (const string& feed : run_state->pending_inputs) {
-    TensorId id(ParseTensorName(feed));
+  for (const auto& input : run_state->pending_inputs) {
+    // Skip if already fed.
+    if (input.second) continue;
+    TensorId id(ParseTensorName(input.first));
     auto it = name_to_node_.find(id.first);
     if (it == name_to_node_.end()) {
-      return errors::NotFound("Feed ", feed, ": not found");
+      return errors::NotFound("Feed ", input.first, ": not found");
     }
     pending_feeds.insert(id);
   }
@@ -894,6 +870,7 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() {
     // The graph handle may be empty if we failed during partition registration.
     if (!part.graph_handle.empty()) {
       Call* c = new Call;
+      c->req.set_session_handle(session_handle_);
       c->req.set_graph_handle(part.graph_handle);
       // NOTE(mrry): We must capture `worker_cache_` since `this`
       // could be deleted before the callback is called.
@@ -927,6 +904,10 @@ void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
     opts->target_nodes.push_back(req.target_name(i));
   }
 
+  if (!req.options().debug_options().debug_tensor_watch_opts().empty()) {
+    opts->debug_options = req.options().debug_options();
+  }
+
   std::sort(opts->feed_endpoints.begin(), opts->feed_endpoints.end());
   std::sort(opts->target_nodes.begin(), opts->target_nodes.end());
   std::sort(opts->fetch_endpoints.begin(), opts->fetch_endpoints.end());
@@ -944,6 +925,8 @@ void BuildBuildGraphOptions(const PartialRunSetupRequest& req,
     opts->target_nodes.push_back(target);
   }
 
+  // TODO(cais): Add TFDBG support to partial runs.
+
   std::sort(opts->feed_endpoints.begin(), opts->feed_endpoints.end());
   std::sort(opts->target_nodes.begin(), opts->target_nodes.end());
   std::sort(opts->fetch_endpoints.begin(), opts->fetch_endpoints.end());
@@ -960,6 +943,13 @@ uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) {
   for (const string& name : opts.fetch_endpoints) {
     h = Hash64(name.c_str(), name.size(), h);
   }
+
+  if (!opts.debug_options.debug_tensor_watch_opts().empty()) {
+    const string watch_summary = SummarizeDebugTensorWatches(
+        opts.debug_options.debug_tensor_watch_opts());
+    h = Hash64(watch_summary.c_str(), watch_summary.size(), h);
+  }
+
   return h;
 }
 
@@ -983,31 +973,25 @@ string BuildGraphOptionsString(const BuildGraphOptions& opts) {
 MasterSession::MasterSession(
     const SessionOptions& opt, const MasterEnv* env,
     std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
+    std::unique_ptr<WorkerCacheInterface> worker_cache,
+    std::unique_ptr<DeviceSet> device_set,
     StatsPublisherFactory stats_publisher_factory)
     : session_opts_(opt),
       env_(env),
       handle_(strings::FpToString(random::New64())),
       remote_devs_(std::move(remote_devs)),
+      worker_cache_(std::move(worker_cache)),
+      devices_(std::move(device_set)),
       stats_publisher_factory_(std::move(stats_publisher_factory)),
       graph_version_(0),
       run_graphs_(5),
       partial_run_graphs_(5) {
   UpdateLastAccessTime();
+  CHECK(devices_) << "device_set was null!";
 
   VLOG(1) << "Session " << handle_ << " #local " << env->local_devices.size()
           << " #remote " << remote_devs_->size();
-  for (auto&& d : *remote_devs_) {
-    devices_.AddDevice(d.get());
-  }
-  int num_local_devices = 0;
-  for (Device* d : env->local_devices) {
-    devices_.AddDevice(d);
-    if (num_local_devices == 0) {
-      // Uses the first local device as the client device.
-      devices_.set_client_device(d);
-    }
-    num_local_devices++;
-  }
+
   LOG(INFO) << "Start master session " << handle_
             << " with config: " << std::endl
             << session_opts_.config.DebugString();
@@ -1022,7 +1006,8 @@ void MasterSession::UpdateLastAccessTime() {
   last_access_time_usec_.store(Env::Default()->NowMicros());
 }
 
-Status MasterSession::Create(GraphDef* graph_def) {
+Status MasterSession::Create(GraphDef* graph_def,
+                             const WorkerCacheFactoryOptions& options) {
   if (session_opts_.config.graph_options().place_pruned_graph()) {
     // TODO(b/29900832): Fix this or remove the option.
     LOG(WARNING) << "Distributed session does not support the "
@@ -1030,14 +1015,93 @@ Status MasterSession::Create(GraphDef* graph_def) {
     session_opts_.config.mutable_graph_options()->set_place_pruned_graph(false);
   }
 
-  SimpleGraphExecutionStateOptions options;
-  options.device_set = &devices_;
-  options.session_options = &session_opts_;
-  TF_RETURN_IF_ERROR(SimpleGraphExecutionState::MakeForBaseGraph(
-      graph_def, options, &execution_state_));
+  SimpleGraphExecutionStateOptions execution_options;
+  execution_options.device_set = devices_.get();
+  execution_options.session_options = &session_opts_;
+  {
+    mutex_lock l(mu_);
+    TF_RETURN_IF_ERROR(SimpleGraphExecutionState::MakeForBaseGraph(
+        graph_def, execution_options, &execution_state_));
+  }
+  if (options.cluster_def != nullptr) {
+    return CreateWorkerSessions(options);
+  }
   return Status::OK();
 }
 
+Status MasterSession::CreateWorkerSessions(
+    const WorkerCacheFactoryOptions& options) {
+  CHECK(worker_cache_) << "CreateWorkerSessions should be called only with "
+                       << "dynamic cluster membership.";
+  std::vector<string> worker_names;
+  worker_cache_->ListWorkers(&worker_names);
+
+  struct WorkerGroup {
+    // The worker name. (Not owned.)
+    const string* name;
+
+    // The worker referenced by name. (Not owned.)
+    WorkerInterface* worker = nullptr;
+
+    // Request and responses used for a given worker.
+    CreateWorkerSessionRequest request;
+    CreateWorkerSessionResponse response;
+    Status status = Status::OK();
+  };
+  BlockingCounter done(worker_names.size());
+  std::vector<WorkerGroup> workers(worker_names.size());
+
+  // Release the workers.
+  auto cleanup = gtl::MakeCleanup([this, &workers] {
+    for (auto&& worker_group : workers) {
+      if (worker_group.worker != nullptr) {
+        worker_cache_->ReleaseWorker(*worker_group.name, worker_group.worker);
+      }
+    }
+  });
+
+  Status status = Status::OK();
+  // Create all the workers & kick off the computations.
+  for (size_t i = 0; i < worker_names.size(); ++i) {
+    workers[i].name = &worker_names[i];
+    workers[i].worker = worker_cache_->CreateWorker(worker_names[i]);
+    workers[i].request.set_session_handle(handle_);
+    *workers[i].request.mutable_server_def()->mutable_cluster() =
+        *options.cluster_def;
+    workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+
+    DeviceNameUtils::ParsedName name;
+    if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) {
+      status = errors::Internal("Could not parse name ", worker_names[i]);
+      LOG(WARNING) << status;
+      return status;
+    }
+    if (!name.has_job || !name.has_task) {
+      status = errors::Internal("Incomplete worker name ", worker_names[i]);
+      LOG(WARNING) << status;
+      return status;
+    }
+
+    workers[i].request.mutable_server_def()->set_job_name(name.job);
+    workers[i].request.mutable_server_def()->set_task_index(name.task);
+  }
+
+  for (size_t i = 0; i < worker_names.size(); ++i) {
+    auto cb = [i, &workers, &done](const Status& s) {
+      workers[i].status = s;
+      done.DecrementCount();
+    };
+    workers[i].worker->CreateWorkerSessionAsync(&workers[i].request,
+                                                &workers[i].response, cb);
+  }
+
+  done.Wait();
+  for (size_t i = 0; i < workers.size(); ++i) {
+    status.Update(workers[i].status);
+  }
+  return status;
+}
+
 Status MasterSession::Extend(const ExtendSessionRequest* req,
                              ExtendSessionResponse* resp) {
   UpdateLastAccessTime();
@@ -1048,15 +1112,6 @@ Status MasterSession::Extend(const ExtendSessionRequest* req,
       return errors::FailedPrecondition("Session is closed.");
     }
 
-    // TODO(mrry): Redesign the locking with reader/writer locks to prevent
-    //   starvation due to concurrent steps being issued. This is not
-    //   immediately important because we expect Extend to be used in
-    //   development/interactive exploration, and not during high-throughput
-    //   training.
-    while (num_running_ != 0) {
-      num_running_is_zero_.wait(l);
-    }
-
     if (graph_version_ != req->current_graph_version()) {
       return errors::Aborted("Current version is ", graph_version_,
                              " but caller expected ",
@@ -1076,6 +1131,13 @@ Status MasterSession::Extend(const ExtendSessionRequest* req,
   return Status::OK();
 }
 
+WorkerCacheInterface* MasterSession::get_worker_cache() const {
+  if (worker_cache_) {
+    return worker_cache_.get();
+  }
+  return env_->worker_cache;
+}
+
 Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
                                 ReffedClientGraph** rcg, bool is_partial) {
   const uint64 hash = HashBuildGraphOptions(opts);
@@ -1099,10 +1161,11 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
               << "\n";
       std::unique_ptr<SimpleClientGraph> client_graph;
       TF_RETURN_IF_ERROR(execution_state_->BuildGraph(opts, &client_graph));
+      WorkerCacheInterface* worker_cache = get_worker_cache();
       auto entry = new ReffedClientGraph(
           handle_, opts, std::move(client_graph), session_opts_,
           stats_publisher_factory_, execution_state_.get(), is_partial,
-          env_->worker_cache);
+          worker_cache);
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
@@ -1177,6 +1240,8 @@ Status MasterSession::Run(CallOptions* opts, const RunStepRequestWrapper& req,
       return errors::FailedPrecondition("Session is closed.");
     }
     ++num_running_;
+    // Note: all code paths must eventually call MarkRunCompletion()
+    // in order to appropriate decrement the num_running_ counter.
   }
   Status status;
   if (!req.partial_run_handle().empty()) {
@@ -1184,16 +1249,18 @@ Status MasterSession::Run(CallOptions* opts, const RunStepRequestWrapper& req,
   } else {
     status = DoRunWithLocalExecution(opts, req, resp);
   }
-  {
-    mutex_lock l(mu_);
-    --num_running_;
-    if (num_running_ == 0) {
-      num_running_is_zero_.notify_all();
-    }
-  }
   return status;
 }
 
+// Decrements num_running_ and broadcasts if num_running_ is zero.
+void MasterSession::MarkRunCompletion() {
+  mutex_lock l(mu_);
+  --num_running_;
+  if (num_running_ == 0) {
+    num_running_is_zero_.notify_all();
+  }
+}
+
 Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
   // Registers subgraphs if haven't done so.
   PartitionOptions popts;
@@ -1203,7 +1270,7 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
     return strings::StrCat(prefix, "_S", next_node_id_++);
   };
   popts.get_incarnation = [this](const string& name) -> int64 {
-    Device* d = devices_.FindDeviceByName(name);
+    Device* d = devices_->FindDeviceByName(name);
     if (d == nullptr) {
       return PartitionOptions::kIllegalIncarnation;
     } else {
@@ -1230,7 +1297,7 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
   }
 
   TF_RETURN_IF_ERROR(
-      rcg->RegisterPartitions(popts, rcg->client_graph()->flib_def->ToProto()));
+      rcg->RegisterPartitions(popts, *rcg->client_graph()->flib_def));
 
   return Status::OK();
 }
@@ -1238,6 +1305,7 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
 Status MasterSession::DoPartialRun(CallOptions* opts,
                                    const RunStepRequestWrapper& req,
                                    MutableRunStepResponseWrapper* resp) {
+  auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
   const string& prun_handle = req.partial_run_handle();
   RunState* run_state = nullptr;
   {
@@ -1283,10 +1351,14 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
 
   // Make sure that this is a new set of feeds that are still pending.
   for (size_t i = 0; i < req.num_feeds(); ++i) {
-    auto it = run_state->pending_inputs.find(req.feed_name(i));
+    const string& feed = req.feed_name(i);
+    auto it = run_state->pending_inputs.find(feed);
     if (it == run_state->pending_inputs.end()) {
-      return errors::InvalidArgument("The feed ", req.feed_name(i),
-                                     " had already been fed.");
+      return errors::InvalidArgument(
+          "The feed ", feed, " was not specified in partial_run_setup.");
+    } else if (it->second) {
+      return errors::InvalidArgument("The feed ", feed,
+                                     " has already been fed.");
     }
   }
   // Check that this is a new set of fetches that are still pending.
@@ -1294,29 +1366,35 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     const string& fetch = req.fetch_name(i);
     auto it = run_state->pending_outputs.find(fetch);
     if (it == run_state->pending_outputs.end()) {
+      return errors::InvalidArgument(
+          "The fetch ", fetch, " was not specified in partial_run_setup.");
+    } else if (it->second) {
       return errors::InvalidArgument("The fetch ", fetch,
-                                     " had already been fetched.");
+                                     " has already been fetched.");
     }
   }
 
   // Ensure that the requested fetches can be computed from the provided feeds.
-  TF_RETURN_IF_ERROR(
-      run_state->rcg->CheckFetches(req, run_state, execution_state_.get()));
+  {
+    mutex_lock l(mu_);
+    TF_RETURN_IF_ERROR(
+        run_state->rcg->CheckFetches(req, run_state, execution_state_.get()));
+  }
 
   // Determine if this partial run satisfies all the pending inputs and ouputs.
   for (size_t i = 0; i < req.num_feeds(); ++i) {
-    run_state->pending_inputs.erase(req.feed_name(i));
+    auto it = run_state->pending_inputs.find(req.feed_name(i));
+    it->second = true;
   }
   for (size_t i = 0; i < req.num_fetches(); ++i) {
-    run_state->pending_outputs.erase(req.fetch_name(i));
+    auto it = run_state->pending_outputs.find(req.fetch_name(i));
+    it->second = true;
   }
-  bool is_last_partial_run =
-      (run_state->pending_inputs.empty() && run_state->pending_outputs.empty());
+  bool is_last_partial_run = run_state->PendingDone();
 
   Status s = run_state->rcg->RunPartitions(
-      env_, run_state->step_id, run_state->count, execution_state_.get(),
-      &run_state->pss, opts, req, resp, &cancellation_manager_,
-      is_last_partial_run);
+      env_, run_state->step_id, run_state->count, &run_state->pss, opts, req,
+      resp, &cancellation_manager_, is_last_partial_run);
 
   // Delete the run state if there is an error or all fetches are done.
   if (!s.ok() || is_last_partial_run) {
@@ -1324,15 +1402,16 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     run_state->pss.end_micros = Env::Default()->NowMicros();
     // Schedule post-processing and cleanup to be done asynchronously.
     rcg->Ref();
-    rcg->ProcessStats(run_state->step_id, &run_state->pss,
-                      execution_state_.get(), run_state->ph.get(),
+    rcg->ProcessStats(run_state->step_id, &run_state->pss, run_state->ph.get(),
                       req.options(), resp->mutable_metadata());
+    cleanup.release();  // MarkRunCompletion called in done closure.
     rcg->CleanupPartitionsAsync(
         run_state->step_id, [this, rcg, prun_handle](const Status& s) {
           if (!s.ok()) {
             LOG(ERROR) << "Cleanup partition error: " << s;
           }
           rcg->Unref();
+          MarkRunCompletion();
         });
     mutex_lock l(mu_);
     partial_runs_.erase(prun_handle);
@@ -1340,13 +1419,44 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
   return s;
 }
 
+Status MasterSession::CreateDebuggerState(
+    const DebugOptions& debug_options, const RunStepRequestWrapper& req,
+    int64 rcg_execution_count,
+    std::unique_ptr<DebuggerStateInterface>* debugger_state) {
+  TF_RETURN_IF_ERROR(
+      DebuggerStateRegistry::CreateState(debug_options, debugger_state));
+
+  std::vector<string> input_names;
+  for (size_t i = 0; i < req.num_feeds(); ++i) {
+    input_names.push_back(req.feed_name(i));
+  }
+  std::vector<string> output_names;
+  for (size_t i = 0; i < req.num_fetches(); ++i) {
+    output_names.push_back(req.fetch_name(i));
+  }
+  std::vector<string> target_names;
+  for (size_t i = 0; i < req.num_targets(); ++i) {
+    target_names.push_back(req.target_name(i));
+  }
+
+  // TODO(cais): We currently use -1 as a dummy value for session run count.
+  // While this counter value is straightforward to define and obtain for
+  // DirectSessions, it is less so for non-direct Sessions. Devise a better
+  // way to get its value when the need arises.
+  TF_RETURN_IF_ERROR(debugger_state->get()->PublishDebugMetadata(
+      debug_options.global_step(), -1, rcg_execution_count, input_names,
+      output_names, target_names));
+
+  return Status::OK();
+}
+
 Status MasterSession::DoRunWithLocalExecution(
     CallOptions* opts, const RunStepRequestWrapper& req,
     MutableRunStepResponseWrapper* resp) {
-  VLOG(2) << "DoRunWithLocalExecution "
-          << "req: " << req.DebugString();
+  VLOG(2) << "DoRunWithLocalExecution req: " << req.DebugString();
   PerStepState pss;
   pss.start_micros = Env::Default()->NowMicros();
+  auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
 
   // Prepare.
   BuildGraphOptions bgopts;
@@ -1358,6 +1468,13 @@ Status MasterSession::DoRunWithLocalExecution(
   // Unref "rcg" when out of scope.
   core::ScopedUnref unref(rcg);
 
+  std::unique_ptr<DebuggerStateInterface> debugger_state;
+  const DebugOptions& debug_options = req.options().debug_options();
+
+  if (!debug_options.debug_tensor_watch_opts().empty()) {
+    TF_RETURN_IF_ERROR(
+        CreateDebuggerState(debug_options, req, count, &debugger_state));
+  }
   TF_RETURN_IF_ERROR(BuildAndRegisterPartitions(rcg));
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
@@ -1384,15 +1501,14 @@ Status MasterSession::DoRunWithLocalExecution(
     pss.collect_rpcs = ph->should_collect_rpcs();
   }
 
-  Status s =
-      rcg->RunPartitions(env_, step_id, count, execution_state_.get(), &pss,
-                         opts, req, resp, &cancellation_manager_, false);
+  Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
+                                &cancellation_manager_, false);
   if (s.ok()) {
     pss.end_micros = Env::Default()->NowMicros();
 
     // Schedule post-processing and cleanup to be done asynchronously.
-    rcg->ProcessStats(step_id, &pss, execution_state_.get(), ph.get(),
-                      req.options(), resp->mutable_metadata());
+    rcg->ProcessStats(step_id, &pss, ph.get(), req.options(),
+                      resp->mutable_metadata());
   } else if (errors::IsCancelled(s)) {
     mutex_lock l(mu_);
     if (closed_) {
@@ -1407,11 +1523,13 @@ Status MasterSession::DoRunWithLocalExecution(
     }
   }
   rcg->Ref();
-  rcg->CleanupPartitionsAsync(step_id, [rcg](const Status& s) {
+  cleanup.release();  // MarkRunCompletion called in done closure.
+  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
     if (!s.ok()) {
       LOG(ERROR) << "Cleanup partition error: " << s;
     }
     rcg->Unref();
+    MarkRunCompletion();
   });
   return s;
 }
@@ -1452,10 +1570,10 @@ MasterSession::RunState::RunState(const std::vector<string>& input_names,
     : rcg(rcg), step_id(step_id), count(count) {
   // Initially all the feeds and fetches are pending.
   for (auto& name : input_names) {
-    pending_inputs.emplace(name);
+    pending_inputs[name] = false;
   }
   for (auto& name : output_names) {
-    pending_outputs.emplace(name);
+    pending_outputs[name] = false;
   }
 }
 
@@ -1463,4 +1581,14 @@ MasterSession::RunState::~RunState() {
   if (rcg) rcg->Unref();
 }
 
+bool MasterSession::RunState::PendingDone() const {
+  for (const auto& it : pending_inputs) {
+    if (!it.second) return false;
+  }
+  for (const auto& it : pending_outputs) {
+    if (!it.second) return false;
+  }
+  return true;
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index 8e0460bd14bbca1a3198114827ead16c73fbbae8..3acc5bc5f0ae79a6bcdccd45bb20d14d5f49451f 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -19,12 +19,14 @@ limitations under the License.
 #include <atomic>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/debugger_state_interface.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/simple_graph_execution_state.h"
 #include "tensorflow/core/common_runtime/stats_publisher_interface.h"
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/message_wrappers.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/master.pb.h"
@@ -48,13 +50,15 @@ class MasterSession : public core::RefCounted {
   MasterSession(
       const SessionOptions& options, const MasterEnv* env,
       std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
+      std::unique_ptr<WorkerCacheInterface> worker_cache,
+      std::unique_ptr<DeviceSet> device_set,
       StatsPublisherFactory stats_publisher_factory);
 
   // Initialize the MasterSession for "def".  Must be called before Extend(),
   // Run(), or Close().
   //
   // After this method returns, `def` will no longer be valid.
-  Status Create(GraphDef* def);
+  Status Create(GraphDef* def, const WorkerCacheFactoryOptions& options);
 
   // Returns the session handle.
   const string& handle() const { return handle_; }
@@ -106,8 +110,14 @@ class MasterSession : public core::RefCounted {
 
   std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs_;
 
+  // The optional session-specific worker cluster.
+  // TODO(saeta): Convert to std::optional when available.
+  std::unique_ptr<WorkerCacheInterface> worker_cache_;
+  // Retrieves either worker_cache_ or the env_->worker_cache as appropriate.
+  WorkerCacheInterface* get_worker_cache() const;
+
   // The device set used by this session.
-  DeviceSet devices_;
+  std::unique_ptr<DeviceSet> devices_;
 
   StatsPublisherFactory stats_publisher_factory_;
 
@@ -116,7 +126,7 @@ class MasterSession : public core::RefCounted {
   std::atomic<int64> partial_run_handle_counter_ = {0};
 
   mutex mu_;
-  std::unique_ptr<SimpleGraphExecutionState> execution_state_;
+  std::unique_ptr<SimpleGraphExecutionState> execution_state_ GUARDED_BY(mu_);
   int64 graph_version_;
 
   // We keep a map from a signature of a run request to the
@@ -141,8 +151,8 @@ class MasterSession : public core::RefCounted {
   };
 
   struct RunState {
-    std::unordered_set<string> pending_inputs;
-    std::unordered_set<string> pending_outputs;
+    std::unordered_map<string, bool> pending_inputs;   // true if fed
+    std::unordered_map<string, bool> pending_outputs;  // true if fetched
     ReffedClientGraph* rcg = nullptr;
     uint64 step_id;
     int64 count = 0;
@@ -154,6 +164,8 @@ class MasterSession : public core::RefCounted {
              const std::vector<string>& output_names, ReffedClientGraph* rcg,
              const uint64 step_id, const int64 count);
 
+    bool PendingDone() const;
+
     ~RunState();
   };
   std::unordered_map<string, std::unique_ptr<RunState>> partial_runs_
@@ -178,6 +190,13 @@ class MasterSession : public core::RefCounted {
   // Private dtor. The client must call Close().
   virtual ~MasterSession();
 
+  // Creates sessions on all workers.
+  //
+  // If this session is operating using the new ClusterSpec propagation behavior
+  // call this method in order to propagate the cluster membership to all
+  // workers.
+  Status CreateWorkerSessions(const WorkerCacheFactoryOptions& server_def);
+
   Status StartStep(const BuildGraphOptions& opts, int64* count,
                    ReffedClientGraph** graph, bool is_partial);
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
@@ -187,10 +206,16 @@ class MasterSession : public core::RefCounted {
                                  MutableRunStepResponseWrapper* resp);
   Status DoPartialRun(CallOptions* opts, const RunStepRequestWrapper& req,
                       MutableRunStepResponseWrapper* resp);
+  void MarkRunCompletion();
   void UpdateLastAccessTime();
 
   Status BuildAndRegisterPartitions(ReffedClientGraph* rcg);
 
+  Status CreateDebuggerState(
+      const DebugOptions& debug_options, const RunStepRequestWrapper& req,
+      int64 rcg_execution_count,
+      std::unique_ptr<DebuggerStateInterface>* debugger_state);
+
   TF_DISALLOW_COPY_AND_ASSIGN(MasterSession);
 };
 
diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc
index 77c1537b54792c29414415fcc40b8f7791acb7d8..121c58762f10a87fea059ce43b190f70e49e1f64 100644
--- a/tensorflow/core/distributed_runtime/master_test.cc
+++ b/tensorflow/core/distributed_runtime/master_test.cc
@@ -49,8 +49,9 @@ class MasterTest : public ::testing::Test {
     (*options.config.mutable_device_count())["CPU"] = 1;
     (*options.config.mutable_device_count())["GPU"] = 0;
     TF_CHECK_OK(test::TestCluster::MakeTestCluster(options, 2, &cluster_));
-    master_ = grpc::MasterService::NewStub(
-        NewHostPortGrpcChannel(cluster_->targets()[0]));
+    SharedGrpcChannelPtr channel_ptr;
+    TF_CHECK_OK(NewHostPortGrpcChannel(cluster_->targets()[0], &channel_ptr));
+    master_ = grpc::MasterService::NewStub(channel_ptr);
   }
 
   std::unique_ptr<test::TestCluster> cluster_;
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 7b58feb93cc0c39badb03cd20bcdb0d7811ee0f0..f3bab589a19f44cd976b9acf2f1fa3eba8cae8ee 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -252,6 +252,14 @@ string ProtoRunStepRequest::DebugString() const {
 
 const RunStepRequest& ProtoRunStepRequest::ToProto() const { return *request_; }
 
+const string& InMemoryRunGraphRequest::session_handle() const {
+  return session_handle_;
+}
+
+void InMemoryRunGraphRequest::set_session_handle(const string& handle) {
+  session_handle_ = handle;
+}
+
 const string& InMemoryRunGraphRequest::graph_handle() const {
   return graph_handle_;
 }
@@ -320,6 +328,7 @@ void InMemoryRunGraphRequest::set_is_last_partial_run(
 const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   if (!proto_version_) {
     proto_version_.reset(new RunGraphRequest);
+    proto_version_->set_session_handle(session_handle());
     proto_version_->set_graph_handle(graph_handle());
     proto_version_->set_step_id(step_id());
     *proto_version_->mutable_exec_opts() = exec_opts();
@@ -337,6 +346,14 @@ const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   return *proto_version_;
 }
 
+const string& MutableProtoRunGraphRequest::session_handle() const {
+  return request_.session_handle();
+}
+
+void MutableProtoRunGraphRequest::set_session_handle(const string& handle) {
+  request_.set_session_handle(handle);
+}
+
 const string& MutableProtoRunGraphRequest::graph_handle() const {
   return request_.graph_handle();
 }
@@ -423,6 +440,10 @@ const RunGraphRequest& MutableProtoRunGraphRequest::ToProto() const {
 ProtoRunGraphRequest::ProtoRunGraphRequest(const RunGraphRequest* request)
     : request_(request) {}
 
+const string& ProtoRunGraphRequest::session_handle() const {
+  return request_->session_handle();
+}
+
 const string& ProtoRunGraphRequest::graph_handle() const {
   return request_->graph_handle();
 }
@@ -495,6 +516,7 @@ CostGraphDef* InMemoryRunGraphResponse::mutable_cost_graph() {
 
 RunGraphResponse* InMemoryRunGraphResponse::get_proto() {
   LOG(FATAL) << "Cannot get a mutable protobuf for an InMemoryRunGraphResponse";
+  return NULL;
 }
 
 size_t OwnedProtoRunGraphResponse::num_recvs() const {
@@ -613,6 +635,7 @@ RunMetadata* InMemoryRunStepResponse::mutable_metadata() { return &metadata_; }
 
 RunStepResponse* InMemoryRunStepResponse::get_proto() {
   LOG(FATAL) << "Cannot get a mutable protobuf for an InMemoryRunStepResponse";
+  return NULL;
 }
 
 size_t OwnedProtoRunStepResponse::num_tensors() const {
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 02516eabb4a990a53563d63a4c297fb958b482e8..795a6add0e794ccaa902195828c75cf653565eb9 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -223,6 +223,10 @@ class RunGraphRequestWrapper {
  public:
   virtual ~RunGraphRequestWrapper() {}
 
+  // The session handle used to register the graph. If empty, a single global
+  // namespace is used.
+  virtual const string& session_handle() const = 0;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   virtual const string& graph_handle() const = 0;
@@ -262,6 +266,7 @@ class RunGraphRequestWrapper {
 // See `RunGraphRequestWrapper` above for a description of the fields.
 class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
  public:
+  virtual void set_session_handle(const string& handle) = 0;
   virtual void set_graph_handle(const string& handle) = 0;
   virtual void set_step_id(int64 step_id) = 0;
   virtual ExecutorOpts* mutable_exec_opts() = 0;
@@ -280,6 +285,7 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
 class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
+  const string& session_handle() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
@@ -293,6 +299,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   const RunGraphRequest& ToProto() const override;
 
   // MutableRunGraphRequestWrapper methods.
+  void set_session_handle(const string& handle) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
@@ -304,6 +311,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   void set_is_last_partial_run(bool is_last_partial_run) override;
 
  private:
+  string session_handle_;
   string graph_handle_;
   int64 step_id_;
   ExecutorOpts exec_opts_;
@@ -325,6 +333,7 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
+  const string& session_handle() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
@@ -338,6 +347,7 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
   const RunGraphRequest& ToProto() const override;
 
   // MutableRunGraphRequestWrapper methods.
+  void set_session_handle(const string& handle) override;
   void set_graph_handle(const string& handle) override;
   void set_step_id(int64 step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
@@ -357,6 +367,7 @@ class ProtoRunGraphRequest : public RunGraphRequestWrapper {
   ProtoRunGraphRequest(const RunGraphRequest* request);
 
   // RunGraphRequestWrapper methods.
+  const string& session_handle() const override;
   const string& graph_handle() const override;
   int64 step_id() const override;
   const ExecutorOpts& exec_opts() const override;
diff --git a/tensorflow/core/distributed_runtime/partial_run_mgr.cc b/tensorflow/core/distributed_runtime/partial_run_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0dbabf9a212eb87c36320bf29383979d965f1a7
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/partial_run_mgr.cc
@@ -0,0 +1,96 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/partial_run_mgr.h"
+
+namespace tensorflow {
+
+namespace {
+// TODO(suharshs): Move this to a common location to allow other part of the
+// repo to use it.
+template <typename T, typename... Args>
+std::unique_ptr<T> MakeUnique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+}  // namespace
+
+bool PartialRunMgr::FindOrCreate(int step_id,
+                                 CancellationManager** cancellation_manager) {
+  mutex_lock l(mu_);
+  auto it = step_id_to_partial_run_.find(step_id);
+  if (it != step_id_to_partial_run_.end()) {
+    *cancellation_manager = it->second->cancellation_manager.get();
+    return false;
+  }
+
+  std::unique_ptr<PartialRunState> partial_run = MakeUnique<PartialRunState>();
+  partial_run->cancellation_manager = MakeUnique<CancellationManager>();
+  *cancellation_manager = partial_run->cancellation_manager.get();
+  step_id_to_partial_run_[step_id] = std::move(partial_run);
+  return true;
+}
+
+void PartialRunMgr::ExecutorDone(int step_id, const Status& executor_status) {
+  StatusCallback done;
+  Status callback_status;
+  {
+    mutex_lock l(mu_);
+    auto run_it = step_id_to_partial_run_.find(step_id);
+    if (run_it == step_id_to_partial_run_.end()) {
+      return;
+    }
+    // If we found the partial_run, we call the final callback, if it
+    // exists.
+    // It is guaranteed that run_it->second->final_callback is left empty
+    // after the std::move call.
+    done = std::move(run_it->second->final_callback);
+    if (!executor_status.ok()) {
+      run_it->second->final_status = executor_status;
+    }
+    callback_status = run_it->second->final_status;
+    run_it->second->executor_done = true;
+  }
+  if (done != nullptr) {
+    done(callback_status);
+    mutex_lock l(mu_);
+    step_id_to_partial_run_.erase(step_id);
+  }
+}
+
+void PartialRunMgr::PartialRunDone(int step_id, StatusCallback done,
+                                   const Status& status) {
+  Status callback_status;
+  {
+    mutex_lock l(mu_);
+    auto run_it = step_id_to_partial_run_.find(step_id);
+    if (run_it == step_id_to_partial_run_.end()) {
+      return;
+    }
+    run_it->second->final_status.Update(status);
+    if (!run_it->second->executor_done) {
+      // If we found the partial_run, we set the final callback to call only
+      // when the executor is completely done.
+      run_it->second->final_callback = std::move(done);
+      return;
+    }
+    callback_status = run_it->second->final_status;
+  }
+  // Otherwise we call the callback immediately.
+  done(callback_status);
+  mutex_lock l(mu_);
+  step_id_to_partial_run_.erase(step_id);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/partial_run_mgr.h b/tensorflow/core/distributed_runtime/partial_run_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..af56e723a9a7e6710b06943c3806ca3690667810
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/partial_run_mgr.h
@@ -0,0 +1,87 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
+
+#include <unordered_map>
+
+#include "tensorflow/core/distributed_runtime/worker_interface.h"
+#include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// PartialRunMgr keeps track of pending partial run requests, and ensures that
+// the partial run is only marked complete when the corresponding executor is
+// run to completion.
+//
+// In tensorflow workers, the executor runs operations asynchronously until
+// specified fetches (operations that return tensors) or targets (operations
+// that don't return tensors) are reached. A PartialRun has two components: a
+// setup which specifies all desired fetches and targets, and run calls that
+// specify fetch values (from the setup calls) to retrieve.
+// On the last partial run call, it is possible to satisfy the
+// required fetches before the executor has completed running the graph to all
+// the desired targets.
+// PartialRunMgr is used to ensure that we don't complete and return the final
+// partial run call to the user until both the partial run and executor have
+// completed.
+//
+// PartialRunMgr is thread-safe.
+class PartialRunMgr {
+ public:
+  // Find or create the CancellationManager associated with step_id.
+  // The PartialRunMgr owns the cancellation_manager.
+  // Returns true if a new CancellationManager was created
+  // (i.e this is a new partial run).
+  bool FindOrCreate(int step_id, CancellationManager** cancellation_manager);
+
+  // Calls the final callback if the PartialRunRequest has already completed.
+  // Otherwise stores the executor_status to be propagated when the
+  // PartialRunRequest completes (PartialRunDone has been called).
+  void ExecutorDone(int step_id, const Status& executor_status);
+
+  // Calls done if the executor has already completed (ExecutorDone has been
+  // called). Otherwise, stores the status and done callback, calling them when
+  // ExecutorDone is called. The callback will either be called by the calling
+  // thread of either PartialRunDone or ExecutorDone.
+  // If executor_status in ExecutorDone is not OK, it takes precedence over
+  // status and is passed to the done callback.
+  void PartialRunDone(int step_id, StatusCallback done, const Status& status);
+
+ private:
+  // PartialRunState stores state associated with a pending partial run request.
+  // This is protected by the mutex in PartialRunMgr.
+  struct PartialRunState {
+    std::unique_ptr<CancellationManager> cancellation_manager;
+
+    bool executor_done = false;
+    StatusCallback final_callback = nullptr;
+    Status final_status;
+  };
+
+  mutex mu_;
+
+  std::unordered_map<int, std::unique_ptr<PartialRunState>>
+      step_id_to_partial_run_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_PARTIAL_RUN_MGR_H_
diff --git a/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc b/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f7c0cb3cae7c97fac4b4c335a617687f31bd3b5
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc
@@ -0,0 +1,151 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/partial_run_mgr.h"
+
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(PartialRunMgrFindOrCreate, Create) {
+  // Basic test of PartialRunMgr CancellationManager creation.
+  PartialRunMgr partial_run_mgr;
+  int step_id = 1;
+  CancellationManager* cancellation_manager;
+  partial_run_mgr.FindOrCreate(step_id, &cancellation_manager);
+  EXPECT_TRUE(cancellation_manager != nullptr);
+}
+
+TEST(PartialRunMgrFindOrCreate, Find) {
+  // Basic test of PartialRunMgr CancellationManager find.
+  PartialRunMgr partial_run_mgr;
+  int step_id = 1;
+  CancellationManager* cancellation_manager;
+  partial_run_mgr.FindOrCreate(step_id, &cancellation_manager);
+  // Looking for the same step should return the same cancellation_manager.
+  CancellationManager* found_cancellation_manager;
+  partial_run_mgr.FindOrCreate(step_id, &found_cancellation_manager);
+  EXPECT_EQ(cancellation_manager, found_cancellation_manager);
+}
+
+TEST(PartialRunMgrFindOrCreate, NewCreate) {
+  // Test that PartialRunMgr creates a new CancellationManager for new steps.
+  PartialRunMgr partial_run_mgr;
+  int step_id = 1;
+  CancellationManager* cancellation_manager;
+  partial_run_mgr.FindOrCreate(step_id, &cancellation_manager);
+  // FindOrCreate on a new step should return a new cancellation_manager.
+  int new_step_id = 2;
+  CancellationManager* new_cancellation_manager;
+  partial_run_mgr.FindOrCreate(new_step_id, &new_cancellation_manager);
+  EXPECT_NE(cancellation_manager, new_cancellation_manager);
+}
+
+TEST(PartialRunMgr, PartialRunRemoved) {
+  // Test that PartialRunMgr ensures that the PartialRun is deleted after
+  // ExecutorDone and PartialRunDone are called.
+  PartialRunMgr partial_run_mgr;
+  int step_id = 1;
+  CancellationManager* cancellation_manager;
+  partial_run_mgr.FindOrCreate(step_id, &cancellation_manager);
+
+  int called = 0;
+  partial_run_mgr.PartialRunDone(
+      step_id, [&called](Status status) { called++; }, Status::OK());
+  partial_run_mgr.ExecutorDone(step_id, Status::OK());
+
+  // Calling ExecutorDone and PartialRunDone on the step_id should still only
+  // result in the callback being called once.
+  // This proves that the original PartialRun has been removed.
+  partial_run_mgr.PartialRunDone(
+      step_id, [&called](Status status) { called++; }, Status::OK());
+  partial_run_mgr.ExecutorDone(step_id, Status::OK());
+  EXPECT_EQ(1, called);
+}
+
+struct StatusTestParam {
+  Status executor_status;
+  Status partial_run_status;
+  Status expected_status;
+};
+
+class StatusPropagationTest : public ::testing::TestWithParam<StatusTestParam> {
+ protected:
+  PartialRunMgr partial_run_mgr_;
+
+  // State to help keep track of when the callback is called.
+  Notification invoked_;
+  Status status_;
+
+  void set_status(const Status& status) {
+    status_ = status;
+    invoked_.Notify();
+  }
+
+  // Blocks until status is set.
+  Status status() {
+    invoked_.WaitForNotification();
+    return status_;
+  }
+};
+
+TEST_P(StatusPropagationTest, ExecutorDoneFirst) {
+  // Tests error propagation when ExecutorDone is called first.
+  StatusTestParam param = GetParam();
+  int step_id = 1;
+
+  CancellationManager* cancellation_manager;
+  partial_run_mgr_.FindOrCreate(step_id, &cancellation_manager);
+
+  partial_run_mgr_.ExecutorDone(step_id, param.executor_status);
+  partial_run_mgr_.PartialRunDone(step_id,
+                                  [this](Status status) { set_status(status); },
+                                  param.partial_run_status);
+
+  EXPECT_EQ(status(), param.expected_status);
+}
+
+TEST_P(StatusPropagationTest, PartialRunDoneFirst) {
+  // Tests error propagation when PartialRunDone is called first.
+  StatusTestParam param = GetParam();
+  int step_id = 1;
+
+  CancellationManager* cancellation_manager;
+  partial_run_mgr_.FindOrCreate(step_id, &cancellation_manager);
+
+  partial_run_mgr_.PartialRunDone(step_id,
+                                  [this](Status status) { set_status(status); },
+                                  param.partial_run_status);
+  partial_run_mgr_.ExecutorDone(step_id, param.executor_status);
+
+  EXPECT_EQ(status(), param.expected_status);
+}
+
+// Instantiate tests for all error orderings, for both call orders of
+// ExecutorDone and PartialRunDone.
+Status ExecutorError() { return errors::Internal("executor error"); }
+Status PartialRunError() { return errors::Internal("partial run error"); }
+INSTANTIATE_TEST_CASE_P(
+    PartialRunMgr, StatusPropagationTest,
+    ::testing::Values(
+        StatusTestParam{Status::OK(), Status::OK(), Status::OK()},
+        StatusTestParam{ExecutorError(), Status::OK(), ExecutorError()},
+        StatusTestParam{Status::OK(), PartialRunError(), PartialRunError()},
+        StatusTestParam{ExecutorError(), PartialRunError(), ExecutorError()}));
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/remote_device.cc b/tensorflow/core/distributed_runtime/remote_device.cc
index 9632e9c439879b3258e3899141b3da5f5c83c07c..91c1fb99fef91c9fd484ddf5fa68476f5d54d523 100644
--- a/tensorflow/core/distributed_runtime/remote_device.cc
+++ b/tensorflow/core/distributed_runtime/remote_device.cc
@@ -16,11 +16,13 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/remote_device.h"
 
 #include <vector>
+
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
@@ -43,8 +45,7 @@ string GetLocalDeviceName(StringPiece fullname) {
 class RemoteDevice : public Device {
  public:
   RemoteDevice(Env* env, const DeviceAttributes& da)
-      : Device(env, da, nullptr),
-        local_dev_name_(GetLocalDeviceName(da.name())) {}
+      : Device(env, da), local_dev_name_(GetLocalDeviceName(da.name())) {}
 
   Status Sync() override { return Status::OK(); }
   Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
@@ -68,18 +69,50 @@ void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache,
     GetStatusResponse resp;
   };
   Call* call = new Call;
-  auto cb = [env, worker_cache, worker_name, done, wi, call](const Status& s) {
+  auto cb = [env, worker_cache, worker_name, done, wi,
+             call](const Status& status) {
+    Status s = status;
     std::vector<Device*> remote_devices;
+    auto cleanup = gtl::MakeCleanup(
+        [&worker_cache, &worker_name, &wi, &done, &remote_devices, &s, call] {
+          worker_cache->ReleaseWorker(worker_name, wi);
+          done(s, &remote_devices);
+          delete call;
+        });
     if (s.ok()) {
+      DeviceNameUtils::ParsedName worker_name_parsed;
+      if (!DeviceNameUtils::ParseFullName(worker_name, &worker_name_parsed) ||
+          !worker_name_parsed.has_job || !worker_name_parsed.has_replica ||
+          !worker_name_parsed.has_task) {
+        s = errors::InvalidArgument("Could not parse worker name: ",
+                                    worker_name);
+        LOG(WARNING) << s;
+        return;
+      }
       remote_devices.reserve(call->resp.device_attributes_size());
       for (const DeviceAttributes& da : call->resp.device_attributes()) {
-        auto d = new RemoteDevice(env, da);
-        remote_devices.push_back(d);
+        DeviceNameUtils::ParsedName device_name_parsed;
+        CHECK(DeviceNameUtils::ParseFullName(da.name(), &device_name_parsed))
+            << "Device attribute name '" << da.name() << "' could not be "
+            << "parsed. Device Attribute: " << da.DebugString();
+        // Preserve the exact name, if possible.
+        // TODO(b/37868888): Simplify when legacy device name formats removed.
+        if (device_name_parsed.job == worker_name_parsed.job &&
+            device_name_parsed.replica == worker_name_parsed.replica &&
+            device_name_parsed.task == worker_name_parsed.task) {
+          auto d = new RemoteDevice(env, da);
+          remote_devices.push_back(d);
+        } else {
+          DeviceAttributes da_rewritten = da;
+          da_rewritten.set_name(DeviceNameUtils::FullName(
+              worker_name_parsed.job, worker_name_parsed.replica,
+              worker_name_parsed.task, device_name_parsed.type,
+              device_name_parsed.id));
+          auto d = new RemoteDevice(env, da_rewritten);
+          remote_devices.push_back(d);
+        }
       }
     }
-    worker_cache->ReleaseWorker(worker_name, wi);
-    done(s, &remote_devices);
-    delete call;
   };
   wi->GetStatusAsync(&call->req, &call->resp, cb);
 }
diff --git a/tensorflow/core/distributed_runtime/remote_device_test.cc b/tensorflow/core/distributed_runtime/remote_device_test.cc
index 20d8d19e3c19f73972014441f687dfedbf6335ea..fd8f8c0f35e9cd3bdee27c5e2b79e5bb5efb3edd 100644
--- a/tensorflow/core/distributed_runtime/remote_device_test.cc
+++ b/tensorflow/core/distributed_runtime/remote_device_test.cc
@@ -47,8 +47,10 @@ class RemoteDeviceTest : public ::testing::Test {
     const string& hostport = cluster_->targets()[0];
     GrpcChannelSpec spec;
     TF_CHECK_OK(spec.AddHostPortsJob("localhost", {hostport}));
+    ChannelCreationFunction channel_func =
+        ConvertToChannelCreationFunction(NewHostPortGrpcChannel);
     worker_cache_.reset(
-        NewGrpcWorkerCache(NewGrpcChannelCache(spec, NewHostPortGrpcChannel)));
+        NewGrpcWorkerCache(NewGrpcChannelCache(spec, channel_func)));
     remote_name_ = "/job:localhost/replica:0/task:0";
     wi_ = worker_cache_->CreateWorker(remote_name_);
   }
diff --git a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
index 04c1fc248ef2a1c76319093fa1f147c139b711dc..43267d4362fac45624962229753ceb766c88eb95 100644
--- a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
+++ b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
@@ -25,6 +25,23 @@ limitations under the License.
 
 namespace tensorflow {
 
+struct WorkerSession;
+
+// RemoteRendezvous follow a 2-part initialization. First the objects are
+// constructed. Eventually, they will be initialized. Clients of the
+// RendezvousMgrInterface must guarantee to call Initialize on the returned
+// RemoteRendezvous eventually.
+//
+// Partially initialized RemoteRendezvous must respect the Rendezvous interface
+// (i.e. Send() must never block), however implementations are not expected to
+// actually perform the underlying operations until after the RemoteRendezvous
+// has been Initialize'd.
+class RemoteRendezvous : public Rendezvous {
+ public:
+  // Fully construct the RemoteRendezvous.
+  virtual Status Initialize(WorkerSession* session) = 0;
+};
+
 // RendezvousMgr keeps track of a set of local rendezvous instances.
 // All tensors sent by this worker are buffered in a RendezvousMgr
 // until the tensor is received.  Each global unique "step_id"
@@ -51,7 +68,10 @@ class RendezvousMgrInterface {
   // Returns Rendezvous supporting send and recv among workers in the
   // "step_id".  The caller takes ownership of one reference on the
   // returned Rendezvous instance.
-  virtual Rendezvous* Find(int64 step_id) = 0;
+  //
+  // Note: the caller must guarantee to eventually call Initialize on the
+  // returned RemoteRendezvous
+  virtual RemoteRendezvous* Find(int64 step_id) = 0;
 
   // Finds the local rendezvous instance for the "step_id".  Runs
   // "done" when the tensor for "key" is produced or an error occurs.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_call.h b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
index 35f849c7a5ed35ac3aa4fae6c0e1531292454d12..e85b8ccbd39ac213406903397be5f064600c6cef 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_call.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CALL_H_
 
+#include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/platform/macros.h"
 
 #include "grpc++/grpc++.h"
@@ -88,7 +89,7 @@ class UntypedCall : public core::RefCounted {
   virtual void RequestReceived(Service* service, bool ok) = 0;
 
   // This method will be called either (i) when the server is notified
-  // that the request has been cancelled, or (ii) when the request completes
+  // that the request has been canceled, or (ii) when the request completes
   // normally. The implementation should distinguish these cases by querying
   // the `grpc::ServerContext` associated with the request.
   virtual void RequestCancelled(Service* service, bool ok) = 0;
@@ -174,7 +175,7 @@ class Call : public UntypedCall<Service> {
   }
 
   // Registers `callback` as the function that should be called if and when this
-  // call is cancelled by the client.
+  // call is canceled by the client.
   void SetCancelCallback(std::function<void()> callback) {
     mutex_lock l(mu_);
     cancel_callback_ = std::move(callback);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 73b26cc43b1d6928bee75b5c00209fb88770c9c1..bcd2c71f841f783ea19c5565de9b71729bc112a1 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -42,20 +42,6 @@ string MakeAddress(const string& job, int task) {
   return strings::StrCat("/job:", job, "/replica:0/task:", task);
 }
 
-}  // namespace
-
-SharedGrpcChannelPtr NewHostPortGrpcChannel(const string& target) {
-  // TODO(mrry): Implement secure channels.
-  ::grpc::ChannelArguments args;
-  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
-  // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
-  // on connection failure, which makes our tests time out.
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
-  return ::grpc::CreateCustomChannel(
-      target, ::grpc::InsecureChannelCredentials(), args);
-}
-
-namespace {
 Status ValidateHostPortPair(const string& host_port) {
   uint32 port;
   std::vector<string> parts = str_util::Split(host_port, ':');
@@ -69,6 +55,35 @@ Status ValidateHostPortPair(const string& host_port) {
 }
 }  // namespace
 
+Status NewHostPortGrpcChannel(const string& target,
+                              SharedGrpcChannelPtr* channel_pointer) {
+  // Minimally ensure that the target is valid
+  TF_RETURN_IF_ERROR(ValidateHostPortPair(target));
+
+  // TODO(mrry): Implement secure channels.
+  ::grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+  // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
+  // on connection failure, which makes our tests time out.
+  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  *channel_pointer = ::grpc::CreateCustomChannel(
+      target, ::grpc::InsecureChannelCredentials(), args);
+  return Status::OK();
+}
+
+ChannelCreationFunction ConvertToChannelCreationFunction(
+    const std::function<Status(string, SharedGrpcChannelPtr*)>&
+        new_channel_func_ptr) {
+  return [new_channel_func_ptr](const string& target) -> SharedGrpcChannelPtr {
+    SharedGrpcChannelPtr channel_ptr;
+    if (new_channel_func_ptr(target, &channel_ptr).ok()) {
+      return channel_ptr;
+    } else {
+      return nullptr;
+    }
+  };
+}
+
 Status GrpcChannelSpec::AddHostPortsJob(const string& job_id,
                                         const std::vector<string>& host_ports) {
   std::map<int, string> host_ports_map;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
index 8d97523a5b9360d8224af487eb1178841e751af1..c662cde9be8998b8303b345403620ca920f3ca92 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
@@ -84,7 +84,12 @@ GrpcChannelCache* NewGrpcChannelCache(const GrpcChannelSpec& channel_spec,
 
 // Below here are internal-only functions.
 
-SharedGrpcChannelPtr NewHostPortGrpcChannel(const string& target);
+ChannelCreationFunction ConvertToChannelCreationFunction(
+    const std::function<Status(string, SharedGrpcChannelPtr*)>&
+        new_channel_func_ptr);
+
+Status NewHostPortGrpcChannel(const string& target,
+                              SharedGrpcChannelPtr* channel_pointer);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
index 1d09b2ffb2e54c18dc89db5ef5212c4e7f3f5967..c975563a21fac18c48189d352bb9e21ef1712819 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
@@ -59,8 +59,10 @@ TEST(GrpcChannelTest, HostPorts) {
   GrpcChannelSpec spec;
   TF_EXPECT_OK(spec.AddHostPortsJob(
       "mnist", {"a:1", "b:2", "c:3", "d:4", "e:5", "f:6"}));
-  std::unique_ptr<GrpcChannelCache> cc(
-      NewGrpcChannelCache(spec, NewHostPortGrpcChannel));
+  ChannelCreationFunction channel_func =
+      ConvertToChannelCreationFunction(NewHostPortGrpcChannel);
+  std::unique_ptr<GrpcChannelCache> cc(NewGrpcChannelCache(spec, channel_func));
+
   EXPECT_EQ(nullptr, cc->FindWorkerChannel("invalid_target"));
   EXPECT_EQ(nullptr, cc->FindWorkerChannel("/job:other/replica:0/task:0"));
   EXPECT_EQ(nullptr, cc->FindWorkerChannel("/job:mnist/replica:0/task:6"));
@@ -100,8 +102,10 @@ TEST(GrpcChannelTest, SparseHostPorts) {
   GrpcChannelSpec spec;
   TF_EXPECT_OK(
       spec.AddHostPortsJob("mnist", {{0, "a:1"}, {3, "d:4"}, {4, "e:5"}}));
-  std::unique_ptr<GrpcChannelCache> cc(
-      NewGrpcChannelCache(spec, NewHostPortGrpcChannel));
+  ChannelCreationFunction channel_func =
+      ConvertToChannelCreationFunction(NewHostPortGrpcChannel);
+  std::unique_ptr<GrpcChannelCache> cc(NewGrpcChannelCache(spec, channel_func));
+
   EXPECT_EQ(nullptr, cc->FindWorkerChannel("invalid_target"));
   EXPECT_EQ(nullptr, cc->FindWorkerChannel("/job:other/replica:0/task:0"));
   EXPECT_EQ(nullptr, cc->FindWorkerChannel("/job:mnist/replica:0/task:1"));
@@ -140,4 +144,16 @@ TEST(GrpcChannelTest, SparseHostPorts) {
             workers);
 }
 
+TEST(GrpcChannelTest, NewHostPortGrpcChannelValidation) {
+  SharedGrpcChannelPtr mock_ptr;
+
+  EXPECT_TRUE(NewHostPortGrpcChannel("127.0.0.1:2222", &mock_ptr).ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("example.com:2222", &mock_ptr).ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("fqdn.example.com.:2222", &mock_ptr).ok());
+
+  EXPECT_FALSE(NewHostPortGrpcChannel("example.com/abc:2222", &mock_ptr).ok());
+  EXPECT_FALSE(NewHostPortGrpcChannel("127.0.0.1:2222/", &mock_ptr).ok());
+  EXPECT_FALSE(NewHostPortGrpcChannel("example.com/abc:", &mock_ptr).ok());
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index c8b3746d180cd39964aed1a3eed568266187d550..fae1c5227b2773c22cd2ee40094991cba39cc898 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -25,7 +25,7 @@ limitations under the License.
 // A GrpcMasterService discovers remote devices in the background and
 // keeps track of statistics of those remote devices.
 //
-// Each session analyses the graph, places nodes across available
+// Each session analyzes the graph, places nodes across available
 // devices, and ultimately drives the graph computation by initiating
 // RunGraph on workers.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_master_service.h"
@@ -46,8 +46,11 @@ namespace tensorflow {
 
 class GrpcMasterService : public AsyncServiceInterface {
  public:
-  GrpcMasterService(Master* master, ::grpc::ServerBuilder* builder)
-      : master_impl_(master), is_shutdown_(false) {
+  GrpcMasterService(Master* master, int64 default_timeout_in_ms,
+                    ::grpc::ServerBuilder* builder)
+      : master_impl_(master),
+        default_timeout_in_ms_(default_timeout_in_ms),
+        is_shutdown_(false) {
     builder->RegisterService(&master_service_);
     cq_ = builder->AddCompletionQueue();
   }
@@ -127,6 +130,7 @@ class GrpcMasterService : public AsyncServiceInterface {
 
  private:
   Master* master_impl_ = nullptr;  // Not owned.
+  const int64 default_timeout_in_ms_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   grpc::MasterService::AsyncService master_service_;
 
@@ -171,6 +175,11 @@ class GrpcMasterService : public AsyncServiceInterface {
   // RPC handler for running one step in a session.
   void RunStepHandler(MasterCall<RunStepRequest, RunStepResponse>* call) {
     CallOptions* call_opts = new CallOptions;
+    if (call->request.options().timeout_in_ms() > 0) {
+      call_opts->SetTimeout(call->request.options().timeout_in_ms());
+    } else {
+      call_opts->SetTimeout(default_timeout_in_ms_);
+    }
     RunStepRequestWrapper* wrapped_request =
         new ProtoRunStepRequest(&call->request);
     MutableRunStepResponseWrapper* wrapped_response =
@@ -221,8 +230,9 @@ class GrpcMasterService : public AsyncServiceInterface {
 };
 
 AsyncServiceInterface* NewGrpcMasterService(Master* master,
+                                            int64 default_timeout_in_ms,
                                             ::grpc::ServerBuilder* builder) {
-  return new GrpcMasterService(master, builder);
+  return new GrpcMasterService(master, default_timeout_in_ms, builder);
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
index 77ae400bd57a4dee4f2efa5b4b3b0d5bedb02314..8770dcc3ac9bf7f0b6c7544a34ccb6d6fa5966b5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_H_
 
 #include <memory>
+#include "tensorflow/core/platform/types.h"
 
 namespace grpc {
 class ServerBuilder;
@@ -28,6 +29,7 @@ class AsyncServiceInterface;
 class Master;
 
 AsyncServiceInterface* NewGrpcMasterService(Master* master,
+                                            int64 default_timeout_in_ms,
                                             ::grpc::ServerBuilder* builder);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index c3b76ed31bcf841b90b3afa8e215db36a40b2c5b..bf72d9a7fcdb5e027be968e94c85970b6b127c14 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h"
 
+#include <utility>
+
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/master_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h"
@@ -29,7 +31,7 @@ namespace tensorflow {
 // that uses gRPC to talk to the Master service.
 class GrpcRemoteMaster : public MasterInterface {
  public:
-  explicit GrpcRemoteMaster(SharedGrpcChannelPtr client_channel)
+  explicit GrpcRemoteMaster(const SharedGrpcChannelPtr& client_channel)
       : stub_(grpc::MasterService::NewStub(client_channel)) {}
 
   ~GrpcRemoteMaster() override {}
@@ -106,7 +108,7 @@ class GrpcRemoteMaster : public MasterInterface {
   }
 };
 
-MasterInterface* NewGrpcMaster(SharedGrpcChannelPtr channel) {
+MasterInterface* NewGrpcMaster(const SharedGrpcChannelPtr& channel) {
   return new GrpcRemoteMaster(channel);
 }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
index 881a6b10e30eb570d8fe3ae790873bf9c87e37e7..d661caaa6029dc29c9eb8983c009f232fb2b3cbf 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 // Returns a MasterInterface wrapped around the gRPC channel `channel`.
-MasterInterface* NewGrpcMaster(SharedGrpcChannelPtr channel);
+MasterInterface* NewGrpcMaster(const SharedGrpcChannelPtr& channel);
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_REMOTE_MASTER_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 36626e1a33f03e17a7e34b63993f25eece2b647c..2b1a47a93f906c3341c535105ad97578b45d209c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h"
 
+#include <utility>
+
 #include "grpc++/grpc++.h"
 
 #include "tensorflow/core/common_runtime/process_util.h"
@@ -37,7 +39,7 @@ class GrpcRemoteWorker : public WorkerInterface {
   explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
                             ::grpc::CompletionQueue* completion_queue,
                             WorkerCacheLogger* logger)
-      : channel_(channel),
+      : channel_(std::move(channel)),
         cq_(completion_queue),
         getstatus_(Method(GrpcWorkerMethod::kGetStatus)),
         createworkersession_(Method(GrpcWorkerMethod::kCreateWorkerSession)),
@@ -272,7 +274,7 @@ class GrpcRemoteWorker : public WorkerInterface {
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
                                      WorkerCacheLogger* logger) {
-  return new GrpcRemoteWorker(channel, completion_queue, logger);
+  return new GrpcRemoteWorker(std::move(channel), completion_queue, logger);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index ff1df585df1be03beaab42841ca462da44ab4f5b..3867dd1f4d025ac2ae4529aae48afb6aedd36a1f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -62,6 +62,11 @@ class NoReusePortOption : public ::grpc::ServerBuilderOption {
                          plugins) override {}
 };
 
+// static utility function
+RendezvousMgrInterface* NewRpcRendezvousMgr(const WorkerEnv* env) {
+  return new RpcRendezvousMgr(env);
+}
+
 }  // namespace
 
 GrpcServer::GrpcServer(const ServerDef& server_def, Env* env)
@@ -77,6 +82,9 @@ GrpcServer::~GrpcServer() {
   // TODO(mrry): Refactor the *Env classes so that it is less fiddly
   // to destroy them.
 
+  // Shut down all outstanding rendezvous.
+  delete worker_env_.rendezvous_mgr;
+
   // We must delete graph_mgr before device_mgr, due to shared
   // ownership of OpKernels in the executors. (The graph_mgr will
   // free all stateless OpKernels, and pass over borrowed stateful
@@ -84,8 +92,10 @@ GrpcServer::~GrpcServer() {
   // OpSegments.)
   if (worker_env_.session_mgr != nullptr) {
     delete worker_env_.session_mgr;  // Deletes graph_mgr's.
+  } else {
+    // Note: session_mgr's legacy_session_ deletes device_mgr now.
+    delete worker_env_.device_mgr;
   }
-  delete worker_env_.device_mgr;
 
   // Do not delete (as these are not owned by the server):
   // - master_env_.env
@@ -93,14 +103,17 @@ GrpcServer::~GrpcServer() {
   // - worker_env_.compute_pool
 }
 
-Status GrpcServer::Init() {
+Status GrpcServer::Init(
+    ServiceInitFunction service_func,
+    const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
   mutex_lock l(mu_);
   CHECK_EQ(state_, NEW);
   master_env_.env = env_;
   worker_env_.env = env_;
 
   SessionOptions sess_opts;
-  sess_opts.config = server_def_.default_session_config();
+  ConfigProto config = server_def_.default_session_config();
+  sess_opts.config = config;
 
   // Configure shared devices between master and worker.
   string name_prefix =
@@ -108,7 +121,11 @@ Status GrpcServer::Init() {
                       "/task:", server_def_.task_index());
   TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(sess_opts, name_prefix,
                                                &master_env_.local_devices));
-  worker_env_.device_mgr = new DeviceMgr(master_env_.local_devices);
+  worker_env_.local_devices = master_env_.local_devices;
+  worker_env_.device_mgr = new DeviceMgr(worker_env_.local_devices);
+  worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
+                                   ? new RpcRendezvousMgr(&worker_env_)
+                                   : rendezvous_mgr_func(&worker_env_);
   string unused;
   string default_worker_name;
   if (!DeviceNameUtils::SplitDeviceName(master_env_.local_devices[0]->name(),
@@ -164,10 +181,15 @@ Status GrpcServer::Init() {
   builder.SetOption(
       std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
   master_impl_ = CreateMaster(&master_env_);
-  master_service_ = NewGrpcMasterService(master_impl_.get(), &builder);
+  master_service_ = NewGrpcMasterService(
+      master_impl_.get(), config.operation_timeout_in_ms(), &builder);
   worker_impl_ = NewGrpcWorker(&worker_env_);
   worker_service_ =
       NewGrpcWorkerService(worker_impl_.get(), &builder).release();
+  // extra service:
+  if (service_func != nullptr) {
+    service_func(&worker_env_, &builder);
+  }
   server_ = builder.BuildAndStart();
 
   if (!server_) {
@@ -175,18 +197,18 @@ Status GrpcServer::Init() {
   }
 
   WorkerCacheInterface* worker_cache;
-  TF_RETURN_IF_ERROR(WorkerCacheFactory(server_def_, &worker_cache));
+  WorkerCacheFactoryOptions worker_cache_factory_options(server_def_);
+  TF_RETURN_IF_ERROR(
+      WorkerCacheFactory(worker_cache_factory_options, &worker_cache));
   CHECK_NE(nullptr, worker_cache);
 
   // Set up worker environment.
-  std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr(
-      new RpcRendezvousMgr(&worker_env_, name_prefix, worker_cache));
   worker_env_.session_mgr = new SessionMgr(
       &worker_env_, SessionMgr::WorkerNameFromServerDef(server_def_),
       std::unique_ptr<WorkerCacheInterface>(worker_cache),
-      std::move(rendezvous_mgr),
       [this](const ServerDef& server_def, WorkerCacheInterface** worker_cache) {
-        return WorkerCacheFactory(server_def, worker_cache);
+        WorkerCacheFactoryOptions options(server_def);
+        return WorkerCacheFactory(options, worker_cache);
       });
   worker_env_.compute_pool = ComputePool(sess_opts);
 
@@ -194,21 +216,34 @@ Status GrpcServer::Init() {
   master_env_.ops = OpRegistry::Global();
   master_env_.worker_cache = worker_cache;
   master_env_.master_session_factory =
-      [](const SessionOptions& options, const MasterEnv* env,
-         std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs) {
+      [config](
+          SessionOptions options, const MasterEnv* env,
+          std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
+          std::unique_ptr<WorkerCacheInterface> worker_cache,
+          std::unique_ptr<DeviceSet> device_set) {
+        options.config.MergeFrom(config);
         return new MasterSession(options, env, std::move(remote_devs),
+                                 std::move(worker_cache), std::move(device_set),
                                  CreateNoOpStatsPublisher);
       };
+  master_env_.worker_cache_factory =
+      [this](const WorkerCacheFactoryOptions& options,
+             WorkerCacheInterface** worker_cache) {
+        return WorkerCacheFactory(options, worker_cache);
+      };
 
   // Provide direct access to the master from in-process clients.
-  LocalMaster::Register(target(), master_impl_.get());
+  LocalMaster::Register(target(), master_impl_.get(),
+                        config.operation_timeout_in_ms());
 
   return Status::OK();
 }
 
-Status GrpcServer::ParseChannelSpec(const ServerDef& server_def,
+Status GrpcServer::Init() { return Init(nullptr, nullptr); }
+
+Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                                     GrpcChannelSpec* channel_spec) {
-  for (const auto& job : server_def.cluster().job()) {
+  for (const auto& job : options.cluster_def->job()) {
     std::map<int, string> host_ports;
     for (const auto& task : job.tasks()) {
       string& host_port = host_ports[task.first];
@@ -218,8 +253,7 @@ Status GrpcServer::ParseChannelSpec(const ServerDef& server_def,
                                        task.first, "\": ", host_port, " and ",
                                        task.second);
       }
-      if (job.name() == server_def.job_name() &&
-          task.first == server_def.task_index()) {
+      if (job.name() == *options.job_name && task.first == options.task_index) {
         host_port = strings::StrCat("localhost:", bound_port_);
       } else {
         host_port = task.second;
@@ -230,19 +264,29 @@ Status GrpcServer::ParseChannelSpec(const ServerDef& server_def,
   return Status::OK();
 }
 
-Status GrpcServer::WorkerCacheFactory(const ServerDef& server_def,
+Status GrpcServer::WorkerCacheFactory(const WorkerCacheFactoryOptions& options,
                                       WorkerCacheInterface** worker_cache) {
-  string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
-                      "/task:", server_def.task_index());
+  if (options.job_name == nullptr || options.job_name->empty()) {
+    Status s = errors::InvalidArgument(
+        "The master (current machine) is not included in the provided "
+        "cluster_def. ",
+        options.cluster_def->DebugString());
+    LOG(WARNING) << s;
+    return s;
+  }
 
   GrpcChannelSpec channel_spec;
-  TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
+  TF_RETURN_IF_ERROR(ParseChannelSpec(options, &channel_spec));
+
+  std::unique_ptr<GrpcChannelCache> channel_cache(
+      NewGrpcChannelCache(channel_spec, GetChannelCreationFunction()));
+
+  string name_prefix = strings::StrCat("/job:", *options.job_name, "/replica:0",
+                                       "/task:", options.task_index);
 
-  std::unique_ptr<GrpcChannelCache> channel_cache(NewGrpcChannelCache(
-      channel_spec, GetChannelCreationFunction(server_def)));
   const string host_port = channel_cache->TranslateTask(name_prefix);
   int requested_port;
+
   if (!strings::safe_strto32(str_util::Split(host_port, ':')[1],
                              &requested_port)) {
     return errors::Internal("Could not parse port for local server from \"",
@@ -325,9 +369,10 @@ std::shared_ptr<::grpc::ServerCredentials> GrpcServer::GetServerCredentials(
   return ::grpc::InsecureServerCredentials();
 }
 
-ChannelCreationFunction GrpcServer::GetChannelCreationFunction(
-    const ServerDef& server_def) const {
-  return NewHostPortGrpcChannel;
+ChannelCreationFunction GrpcServer::GetChannelCreationFunction() const {
+  // We can do this because SparseGrpcChannelCache is robust to nullptr being
+  // returned by the channel creation function
+  return ConvertToChannelCreationFunction(NewHostPortGrpcChannel);
 }
 
 std::unique_ptr<Master> GrpcServer::CreateMaster(MasterEnv* master_env) {
@@ -339,7 +384,8 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
                           std::unique_ptr<ServerInterface>* out_server) {
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
-  TF_RETURN_IF_ERROR(ret->Init());
+  ServiceInitFunction service_func = nullptr;
+  TF_RETURN_IF_ERROR(ret->Init(service_func, NewRpcRendezvousMgr));
   *out_server = std::move(ret);
   return Status::OK();
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index c6ba26010411a8a6542fcf1b9da3bf44fdcef32c..7b54bb84c88fe0f6669a5fd63744722c0d5231b7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -36,6 +36,15 @@ namespace tensorflow {
 class GrpcWorker;
 class Master;
 
+// function that creates a RendezvousMgr.
+typedef std::function<RendezvousMgrInterface*(const WorkerEnv*)>
+    RendezvousMgrCreationFunction;
+
+// function that registers a service to the server. The service needs to
+// be registered before builder.BuildAndStart().
+typedef std::function<void(const WorkerEnv*, ::grpc::ServerBuilder*)>
+    ServiceInitFunction;
+
 class GrpcServer : public ServerInterface {
  protected:
   GrpcServer(const ServerDef& server_def, Env* env);
@@ -55,29 +64,35 @@ class GrpcServer : public ServerInterface {
   const string target() const override;
 
  protected:
+  Status Init(ServiceInitFunction service_func,
+              const RendezvousMgrCreationFunction& rendezvous_mgr_func);
+
   Status Init();
 
   // A subclass can override this method to support secure credentials.
   virtual std::shared_ptr<::grpc::ServerCredentials> GetServerCredentials(
       const ServerDef& server_def) const;
 
-  virtual ChannelCreationFunction GetChannelCreationFunction(
-      const ServerDef& server_def) const;
+  virtual ChannelCreationFunction GetChannelCreationFunction() const;
 
   virtual std::unique_ptr<Master> CreateMaster(MasterEnv* master_env);
 
   // Creates a WorkerCacheInterface for a session.
-  Status WorkerCacheFactory(const ServerDef& server_def,
+  Status WorkerCacheFactory(const WorkerCacheFactoryOptions& options,
                             WorkerCacheInterface** worker_cache);
 
-  // Parses a ServerDef into a GrpcChannelSpec.
-  Status ParseChannelSpec(const ServerDef& server_def,
+  // Parses a WorkerCacheFactoryOptions into a GrpcChannelSpec.
+  Status ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                           GrpcChannelSpec* channel_spec);
 
   // Returns the port to which this server is bound.
   // This method may only be called after `this->Init()` returns successfully.
   int bound_port() const { return bound_port_; }
 
+  WorkerEnv* worker_env() { return &worker_env_; }
+
+  const ServerDef& server_def() const { return server_def_; }
+
  private:
   // The overall server configuration.
   const ServerDef server_def_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 4e2f5de21397a066b2b8b651d17ffabca1382d70..38d59d5bb59978be6160dd9dcdf9225fd2588d3f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -43,7 +43,7 @@ const size_t kSchemePrefixLength = strlen(kSchemePrefix);
 /* static */
 Status GrpcSession::Create(const SessionOptions& options,
                            std::unique_ptr<GrpcSession>* out_session) {
-  std::unique_ptr<GrpcSession> ret(new GrpcSession(options));
+  std::unique_ptr<GrpcSession> session(new GrpcSession(options));
   std::unique_ptr<MasterInterface> master;
   // For testing, we enable the client to disable the use of the local
   // master registry, so that the RPC stack is exercised.
@@ -51,12 +51,13 @@ Status GrpcSession::Create(const SessionOptions& options,
     master = LocalMaster::Lookup(options.target);
   }
   if (!master) {
-    SharedGrpcChannelPtr master_channel =
-        NewHostPortGrpcChannel(options.target.substr(kSchemePrefixLength));
+    SharedGrpcChannelPtr master_channel;
+    TF_RETURN_IF_ERROR(NewHostPortGrpcChannel(
+        options.target.substr(kSchemePrefixLength), &master_channel));
     master.reset(NewGrpcMaster(master_channel));
   }
-  ret->SetRemoteMaster(std::move(master));
-  *out_session = std::move(ret);
+  session->SetRemoteMaster(std::move(master));
+  *out_session = std::move(session);
   return Status::OK();
 }
 
@@ -101,6 +102,7 @@ Status GrpcSession::CreateImpl(CallOptions* call_options,
   CreateSessionRequest req;
   *req.mutable_config() = options_.config;
   *req.mutable_graph_def() = graph;
+  req.set_target(options_.target);
   ReEncodeConsts(req.mutable_graph_def());
   CreateSessionResponse resp;
   Status s = master_->CreateSession(call_options, &req, &resp);
@@ -176,6 +178,11 @@ Status GrpcSession::RunHelper(
 
   *req->mutable_options() = run_options;
 
+  if (run_options.timeout_in_ms() == 0) {
+    req->mutable_options()->set_timeout_in_ms(
+        options_.config.operation_timeout_in_ms());
+  }
+
   if (!prun_handle.empty()) {
     req->set_partial_run_handle(prun_handle);
   }
@@ -196,7 +203,7 @@ Status GrpcSession::RunHelper(
   }
 
   CallOptions call_options;
-  call_options.SetTimeout(run_options.timeout_in_ms());
+  call_options.SetTimeout(req->options().timeout_in_ms());
   TF_RETURN_IF_ERROR(RunProto(&call_options, req.get(), resp.get()));
 
   if (!output_tensor_names.empty()) {
@@ -344,8 +351,9 @@ void GrpcSession::SetRemoteMaster(std::unique_ptr<MasterInterface> master) {
 // Static method.
 Status GrpcSession::Reset(const SessionOptions& options,
                           const std::vector<string>& containers) {
-  SharedGrpcChannelPtr master_channel =
-      NewHostPortGrpcChannel(options.target.substr(kSchemePrefixLength));
+  SharedGrpcChannelPtr master_channel;
+  TF_RETURN_IF_ERROR(NewHostPortGrpcChannel(
+      options.target.substr(kSchemePrefixLength), &master_channel));
   auto master = NewGrpcMaster(master_channel);
   ResetRequest req;
   for (const auto& c : containers) req.add_container(c);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index a6e9ce5e09e3b8d13754a921762018d6dfe6d574..eeb3e02966adc8588c6e0e3a548d990b0147a8a1 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -517,7 +517,7 @@ TEST(GrpcSessionTest, Error) {
     //
     // Subgraph for "b" sleeps at the node "b_delay". When the sleep
     // finishes, the subgraph "b" will continue execution till it
-    // notices that it is cancelled. Meanwhile, subgraph's executor
+    // notices that it is canceled. Meanwhile, subgraph's executor
     // and its related state (registered ops) should still be alive.
     auto b = test::graph::Constant(&g, Tensor());
     b->set_assigned_device_name(dev_b);
@@ -814,7 +814,7 @@ TEST(SessionTest, ExtendValidation) {
 // Tests that Create() with "operation_timeout_in_ms" set times out.
 TEST(SessionTest, CreateTimeoutWithSessionOptions) {
   // Creates a RemoteSession with "operation_timeout_in_ms" set to 100.
-  SessionOptions options = Options("example.org", 1);
+  SessionOptions options = Options("example.org:2222", 1);
   options.config.set_operation_timeout_in_ms(100);
   std::unique_ptr<Session> session(NewRemote(options));
 
@@ -832,7 +832,7 @@ TEST(SessionTest, CreateTimeoutWithSessionOptions) {
 
 // Tests that Create() with "timeout_in_ms" in RunOptions set times out.
 TEST(SessionTest, CreateTimeoutWithRunOptions) {
-  SessionOptions options = Options("example.org", 1);
+  SessionOptions options = Options("example.org:2222", 1);
   std::unique_ptr<Session> session(NewRemote(options));
 
   // Creates a long running op.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
index c6260afa20e7b0a91597fb35e071b9d12a7ed404..90e311a493079526c10c12d44cbeac609bfa6847 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h"
 #include "grpc++/support/byte_buffer.h"
 #include "grpc++/support/slice.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_reference.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -27,10 +28,9 @@ namespace tensorflow {
 namespace grpc {
 
 static void do_nothing(void* raw) {}
-static void unref_tensorreference(void* raw) {
-  TensorReference* ref = static_cast<TensorReference*>(raw);
-  ref->Unref();
-  delete ref;
+static void unref_tensorbuffer(void* raw) {
+  TensorBuffer* buf = static_cast<TensorBuffer*>(raw);
+  buf->Unref();
 }
 
 void EncodeRecvTensorResponseToByteBuffer(const RecvTensorResponse& proto,
@@ -166,7 +166,7 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
         (e_skeleton.size() +
          VarLengthEncodingSize(TensorProto::kTensorContentFieldNumber,
                                tdata.size()));
-    string header;  // All of RecvTensorRequest except the tensor() field
+    string header;  // All of RecvTensorResponse except the tensor() field
     response.AppendToString(&header);
 
     size_t expected_size =
@@ -219,8 +219,8 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
 
     if (tensor_data_is_large) {
       // Encode the actual tensor data by pointing to the backing store,
-      // and add a special zero-length slice that is really a TensorReference
-      // object that we will destroy when we are done.
+      // and add a special zero-length slice that is really a TensorBuffer
+      // reference that we will unref when we are done.
       //
       // TODO(jeff): Note that this approach relies on the fact that
       // slices are destroyed in the order in which they are added to
@@ -241,17 +241,15 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
 
       // (E) Encode tensor data, but by sharing backing store
 
-      // TODO(jeff,sanjay): It'd be nice to avoid this TensorReference
-      // allocation, and instead get our hands on the underlying
-      // TensorBuffer object and just directly ref it here and unref
-      // it in unref_tensorreference.
-      TensorReference* ref = new TensorReference(val);
+      const TensorBuffer* buf = DMAHelper::buffer(&val);
+      buf->Ref();
       gpr_slice s1 = gpr_slice_new(
           const_cast<void*>(static_cast<const void*>(tdata.data())),
           tdata.size(), do_nothing);
       slices[1] = ::grpc::Slice(s1, ::grpc::Slice::STEAL_REF);
 
-      gpr_slice s2 = gpr_slice_new(ref, 0, unref_tensorreference);
+      gpr_slice s2 =
+          gpr_slice_new(const_cast<TensorBuffer*>(buf), 0, unref_tensorbuffer);
       slices[2] = ::grpc::Slice(s2, ::grpc::Slice::STEAL_REF);
       num_slices += 2;
     }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index c11266587d8cba97b742818c85d2ae54f7e32f26..873ef8588f4ffee07df9f8e33a4d6fd8884f36a8 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -113,6 +113,7 @@ class GrpcWorkerService : public AsyncServiceInterface {
     // completes, and we may decide to bound some of the request
     // types.
     ENQUEUE_REQUEST(GetStatus, false);
+    ENQUEUE_REQUEST(CreateWorkerSession, false);
     ENQUEUE_REQUEST(CleanupAll, false);
     ENQUEUE_REQUEST(RegisterGraph, false);
     ENQUEUE_REQUEST(DeregisterGraph, false);
@@ -181,6 +182,16 @@ class GrpcWorkerService : public AsyncServiceInterface {
     ENQUEUE_REQUEST(GetStatus, false);
   }
 
+  void CreateWorkerSessionHandler(
+      WorkerCall<CreateWorkerSessionRequest, CreateWorkerSessionResponse>*
+          call) {
+    Schedule([this, call]() {
+      Status s = worker_->CreateWorkerSession(&call->request, &call->response);
+      call->SendResponse(ToGrpcStatus(s));
+    });
+    ENQUEUE_REQUEST(CreateWorkerSession, false);
+  }
+
   void CleanupAllHandler(
       WorkerCall<CleanupAllRequest, CleanupAllResponse>* call) {
     Schedule([this, call]() {
@@ -298,7 +309,6 @@ void GrpcWorker::RecvTensorAsync(CallOptions* opts,
                                  ::grpc::ByteBuffer* response,
                                  StatusCallback done) {
   const int64 step_id = request->step_id();
-  WorkerSession* session = env_->session_mgr->WorkerSessionForStepId(step_id);
   const string& key = request->rendezvous_key();
   TRACEPRINTF("RecvTensor: %lld %s", step_id, key.c_str());
   Rendezvous::ParsedKey parsed;
@@ -317,7 +327,7 @@ void GrpcWorker::RecvTensorAsync(CallOptions* opts,
   // of execution of the callback lambda body below, an RPC
   // cancellation should abort the rendezvous.
   opts->SetCancelCallback([this, step_id]() { AbortStep(step_id); });
-  session->rendezvous_mgr->RecvLocalAsync(
+  env_->rendezvous_mgr->RecvLocalAsync(
       step_id, parsed,
       [opts, response, done, src_dev](const Status& status,
                                       const Rendezvous::Args& send_args,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
index 7854949033f1cc79030af0e09dcb77b9815365bd..80a2f89337c6914dd871c4df346016d70d0f4093 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
@@ -49,6 +49,9 @@ const char* GrpcWorkerMethodName(GrpcWorkerMethod id) {
     case GrpcWorkerMethod::kTracing:
       return "/tensorflow.WorkerService/Tracing";
   }
+  // Shouldn't be reached.
+  LOG(FATAL) << "Invalid id: this line shouldn't be reached.";
+  return "invalid id";
 }
 
 namespace grpc {
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 7518a289fdb1855900a5a3bb594a0135b3e959cc..8265100061e4cb0a1a3ea1da96abb5b563f010c8 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -38,9 +38,8 @@ namespace {
 
 class RpcRemoteRendezvous : public BaseRemoteRendezvous {
  public:
-  RpcRemoteRendezvous(const WorkerEnv* env, const string& worker_name,
-                      WorkerCacheInterface* cache, int64 step_id)
-      : BaseRemoteRendezvous(env, worker_name, step_id, false), cache_(cache) {}
+  RpcRemoteRendezvous(const WorkerEnv* env, int64 step_id)
+      : BaseRemoteRendezvous(env, step_id, false) {}
 
  protected:
   void RecvFromRemoteAsync(const Rendezvous::ParsedKey& parsed,
@@ -50,7 +49,6 @@ class RpcRemoteRendezvous : public BaseRemoteRendezvous {
  private:
   ~RpcRemoteRendezvous() override {}
 
-  WorkerCacheInterface* const cache_;  // Not owned.
   TF_DISALLOW_COPY_AND_ASSIGN(RpcRemoteRendezvous);
 };
 
@@ -204,75 +202,10 @@ static RpcRecvTensorFreeList* get_call_freelist() {
   return call_freelist;
 }
 
-// A private cache that wraps worker_cache and allows reuse of
-// WorkerInterface objects.
-class WorkerFreeListCache : public WorkerCacheInterface {
- public:
-  explicit WorkerFreeListCache(WorkerCacheInterface* w) : wrapped_(w) {}
-
-  ~WorkerFreeListCache() {
-    for (auto p : workers_) {
-      wrapped_->ReleaseWorker(p.first, p.second.worker);
-    }
-  }
-
-  void ListWorkers(std::vector<string>* workers) const override {
-    wrapped_->ListWorkers(workers);
-  }
-
-  WorkerInterface* CreateWorker(const string& target) override {
-    mutex_lock l(mu_);
-    auto p = workers_.find(target);
-    if (p != workers_.end()) {
-      return p->second.worker;
-    }
-    WorkerState state;
-    state.worker = wrapped_->CreateWorker(target);
-    if (state.worker != nullptr) {
-      workers_.insert(std::make_pair(target, state));
-    }
-    return state.worker;
-  }
-
-  void ReleaseWorker(const string& target, WorkerInterface* worker) override {
-    // TODO(jeff,sanjay): Should decrement ref-count when we implement eviction.
-  }
-
-  bool GetDeviceLocalityNonBlocking(const string& device,
-                                    DeviceLocality* locality) override {
-    return wrapped_->GetDeviceLocalityNonBlocking(device, locality);
-  }
-
-  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
-                              StatusCallback done) override {
-    wrapped_->GetDeviceLocalityAsync(device, locality, done);
-  }
-
-  void SetLogging(bool active) override { wrapped_->SetLogging(active); }
-
-  void ClearLogs() override { wrapped_->ClearLogs(); }
-
-  bool RetrieveLogs(int64 step_id, StepStats* ss) override {
-    return wrapped_->RetrieveLogs(step_id, ss);
-  }
-
- private:
-  WorkerCacheInterface* wrapped_;
-
-  // Information kept per created WorkerInterface.
-  struct WorkerState {
-    WorkerInterface* worker;
-    // TODO(jeff,sanjay): Add reference count if we support eviction.
-  };
-
-  // TODO(jeff,sanjay): Eviction when the map becomes too big.
-  mutex mu_;
-  std::unordered_map<string, WorkerState> workers_ GUARDED_BY(mu_);
-};
-
 void RpcRemoteRendezvous::RecvFromRemoteAsync(
     const Rendezvous::ParsedKey& parsed, const Rendezvous::Args& recv_args,
     DoneCallback done) {
+  CHECK(is_initialized());
   Status s;
 
   // Prepare a RecvTensor call that can handle being aborted.
@@ -284,17 +217,21 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     s = errors::Internal(parsed.src_device,
                          " is invalid remote source device.");
   }
-  WorkerInterface* rwi = cache_->CreateWorker(call->src_worker_);
+  WorkerSession* sess = session();
+  WorkerInterface* rwi = sess->worker_cache->CreateWorker(call->src_worker_);
   if (s.ok() && rwi == nullptr) {
     s = errors::Internal("No worker known as ", call->src_worker_);
   }
 
   Device* dst_device;
   if (s.ok()) {
-    s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
+    s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device);
   }
   if (!s.ok()) {
-    get_call_freelist()->Release(call, cache_);
+    if (rwi != nullptr) {
+      sess->worker_cache->ReleaseWorker(call->src_worker_, rwi);
+    }
+    get_call_freelist()->Release(call, sess->worker_cache.get());
     done(s, Args(), recv_args, Tensor{}, false);
     return;
   }
@@ -314,26 +251,21 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     // current status should be bad.
     Status s = call->status();
     call->done()(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
-    cache_->ReleaseWorker(call->src_worker_, call->wi_);
+    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
     call->wi_ = nullptr;
-    get_call_freelist()->Release(call, cache_);
+    get_call_freelist()->Release(call, session()->worker_cache.get());
     Unref();
   });
 }
 
 }  // namespace
 
-RpcRendezvousMgr::RpcRendezvousMgr(const WorkerEnv* env,
-                                   const string& worker_name,
-                                   WorkerCacheInterface* worker_cache)
-    : BaseRendezvousMgr(env, worker_name),
-      cache_(new WorkerFreeListCache(worker_cache)) {}
+RpcRendezvousMgr::RpcRendezvousMgr(const WorkerEnv* env)
+    : BaseRendezvousMgr(env) {}
 
 BaseRemoteRendezvous* RpcRendezvousMgr::Create(int64 step_id,
-                                               const WorkerEnv* worker_env,
-                                               const string& worker_name) {
-  return new RpcRemoteRendezvous(worker_env, worker_name, cache_.get(),
-                                 step_id);
+                                               const WorkerEnv* worker_env) {
+  return new RpcRemoteRendezvous(worker_env, step_id);
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
index 75dc62d98fd635cdc797c593a2cd848e5319da57..34c48a79177618679b99ba2b2476b05b3954bffd 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h
@@ -17,13 +17,13 @@ limitations under the License.
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_RPC_RENDEZVOUS_MGR_H_
 
 #include "tensorflow/core/distributed_runtime/base_rendezvous_mgr.h"
-#include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
-#include "tensorflow/core/distributed_runtime/worker_session.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
 
+class DeviceMgr;
+
 // RendezvousMgr keeps track of a set of local rendezvous instances.
 // All tensors sent by this worker are buffered in a RendezvousMgr
 // until the tensor is received.  Each global unique "step_id"
@@ -44,17 +44,12 @@ namespace tensorflow {
 // RendezvousMgr must have keys generated by Rendezvous::CreateKey.
 class RpcRendezvousMgr : public BaseRendezvousMgr {
  public:
-  explicit RpcRendezvousMgr(const WorkerEnv* env, const string& worker_name,
-                            WorkerCacheInterface* worker_cache);
+  explicit RpcRendezvousMgr(const WorkerEnv* env);
 
  protected:
-  BaseRemoteRendezvous* Create(int64 step_id, const WorkerEnv* worker_env,
-                               const string& session_name) override;
+  BaseRemoteRendezvous* Create(int64 step_id, const WorkerEnv* worker_env);
 
  private:
-  // Private cache_ that allows us to reuse WorkerInterface objects.
-  std::unique_ptr<WorkerCacheInterface> cache_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(RpcRendezvousMgr);
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
index 9b778eab3a593fe219602ba5d4cf0b04565f6ce4..2d0d76623d4e9b83d101b362b7a2316bc7a8084f 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc
@@ -68,9 +68,9 @@ class RpcRendezvousMgrTest : public ::testing::Test {
       : cache_(new DummyWorkerCache),
         worker_session_("/job:mnist/replica:1/task:2",
                         std::unique_ptr<WorkerCacheInterface>(cache_),
-                        std::unique_ptr<RendezvousMgrInterface>(),
+                        std::unique_ptr<DeviceMgr>(),
                         std::unique_ptr<GraphMgr>()),
-        rmgr_(&env, worker_session_.worker_name, cache_) {
+        rmgr_(&env) {
     env.env = Env::Default();
   }
 
@@ -87,7 +87,8 @@ TEST_F(RpcRendezvousMgrTest, LocalSendRecv) {
       "/job:mnist/replica:1/task:2/cpu:0", 7890,
       "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
   {
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     core::ScopedUnref unref(rendez);
     Rendezvous::Args args;
     TF_ASSERT_OK(rendez->Send(key, args, V("peach"), false));
@@ -107,7 +108,7 @@ TEST_F(RpcRendezvousMgrTest, LocalAbort) {
       "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
   {  // Explicit Abort().
     const int64 step_id = 123;
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
     core::ScopedUnref unref(rendez);
     SchedClosure([this, rendez]() {
       env.env->SleepForMicroseconds(100 * 1000);
@@ -116,11 +117,12 @@ TEST_F(RpcRendezvousMgrTest, LocalAbort) {
     Tensor val(DT_STRING);
     bool val_dead = false;
     Rendezvous::Args args;
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     EXPECT_TRUE(errors::IsAborted(rendez->Recv(key, args, &val, &val_dead)));
   }
   {  // Cleanup causes Abort().
     const int64 step_id = 321;
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
     core::ScopedUnref unref(rendez);
     SchedClosure([this, step_id]() {
       env.env->SleepForMicroseconds(100 * 1000);
@@ -129,6 +131,7 @@ TEST_F(RpcRendezvousMgrTest, LocalAbort) {
     Tensor val(DT_STRING);
     bool val_dead = false;
     Rendezvous::Args args;
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     EXPECT_TRUE(errors::IsAborted(rendez->Recv(key, args, &val, &val_dead)));
   }
 }
@@ -139,7 +142,8 @@ TEST_F(RpcRendezvousMgrTest, CleanupAll) {
       "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
   {
     const int64 step_id = 123;
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     core::ScopedUnref unref(rendez);
     Rendezvous::Args args;
     TF_ASSERT_OK(rendez->Send(key, args, V("peach"), false));
@@ -168,10 +172,11 @@ TEST_F(RpcRendezvousMgrTest, TransferDummyDeviceContext) {
       "/job:mnist/replica:1/task:2/cpu:0", 7890,
       "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0)));
   {
-    Rendezvous* rendez = rmgr_.Find(step_id);
+    RemoteRendezvous* rendez = rmgr_.Find(step_id);
     core::ScopedUnref unref(rendez);
     Rendezvous::Args args;
     args.device_context = dc;
+    TF_ASSERT_OK(rendez->Initialize(&worker_session_));
     TF_ASSERT_OK(rendez->Send(key, args, V("peach"), false));
   }
   {
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index e2be62f816c35a97c669e85310a1845202bf7e7c..22551d54821b0ef34f4e535ee5923d6d695cfdc1 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -17,8 +17,9 @@ limitations under the License.
 
 #include <utility>
 
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
-#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -26,23 +27,12 @@ namespace tensorflow {
 SessionMgr::SessionMgr(
     WorkerEnv* worker_env, const string& default_worker_name,
     std::unique_ptr<WorkerCacheInterface> default_worker_cache,
-    std::unique_ptr<RendezvousMgrInterface> default_rendezvous_mgr,
-    WorkerCacheFactory worker_cache_factory)
-    : SessionMgr(
-          worker_env, default_worker_name, std::move(default_worker_cache),
-          default_rendezvous_mgr.release(), std::move(worker_cache_factory)) {}
-
-SessionMgr::SessionMgr(
-    WorkerEnv* worker_env, const string& default_worker_name,
-    std::unique_ptr<WorkerCacheInterface> default_worker_cache,
-    RendezvousMgrInterface* default_rendezvous_mgr,
     WorkerCacheFactory worker_cache_factory)
     : worker_env_(worker_env),
-      legacy_session_(
-          default_worker_name, std::move(default_worker_cache),
-          std::unique_ptr<RendezvousMgrInterface>(default_rendezvous_mgr),
-          std::unique_ptr<GraphMgr>(
-              new GraphMgr(worker_env, default_rendezvous_mgr))),
+      legacy_session_(default_worker_name, std::move(default_worker_cache),
+                      std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
+                      std::unique_ptr<GraphMgr>(
+                          new GraphMgr(worker_env, worker_env->device_mgr))),
       worker_cache_factory_(std::move(worker_cache_factory)) {}
 
 string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
@@ -53,20 +43,28 @@ string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
 Status SessionMgr::CreateSession(const string& session,
                                  const ServerDef& server_def) {
   mutex_lock l(mu_);
+  if (session.empty()) {
+    return errors::InvalidArgument("Session must be non-empty.");
+  }
+
   const string worker_name = WorkerNameFromServerDef(server_def);
 
   WorkerCacheInterface* worker_cache = nullptr;
   TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
 
-  std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr(
-      new RpcRendezvousMgr(worker_env_, worker_name, worker_cache));
+  std::vector<Device*> renamed_devices;
+  for (Device* d : worker_env_->local_devices) {
+    renamed_devices.push_back(
+        RenamedDevice::NewRenamedDevice(worker_name, d, false));
+  }
+  std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr(renamed_devices));
 
   std::unique_ptr<GraphMgr> graph_mgr(
-      new GraphMgr(worker_env_, rendezvous_mgr.get()));
+      new GraphMgr(worker_env_, device_mgr.get()));
 
   std::unique_ptr<WorkerSession> worker_session(new WorkerSession(
       worker_name, std::unique_ptr<WorkerCacheInterface>(worker_cache),
-      std::move(rendezvous_mgr), std::move(graph_mgr)));
+      std::move(device_mgr), std::move(graph_mgr)));
 
   sessions_.insert(std::make_pair(session, std::move(worker_session)));
   return Status::OK();
@@ -78,22 +76,6 @@ Status SessionMgr::DeleteSession(const string& session) {
   if (it != sessions_.end()) {
     sessions_.erase(it);
   }
-  std::set<string> graph_handles;
-  for (auto graph_handle_it = sessions_by_graph_handle_.begin();
-       graph_handle_it != sessions_by_graph_handle_.end(); ++graph_handle_it) {
-    if (graph_handle_it->second == session) {
-      graph_handles.insert(graph_handle_it->first);
-      graph_handle_it = sessions_by_graph_handle_.erase(graph_handle_it);
-      if (graph_handle_it == sessions_by_graph_handle_.end()) break;
-    }
-  }
-  for (auto step_id_it = graphs_by_step_id_.begin();
-       step_id_it != graphs_by_step_id_.end(); ++step_id_it) {
-    if (graph_handles.find(step_id_it->second) != graph_handles.end()) {
-      step_id_it = graphs_by_step_id_.erase(step_id_it);
-      if (step_id_it == graphs_by_step_id_.end()) break;
-    }
-  }
   return Status::OK();
 }
 
@@ -114,58 +96,4 @@ WorkerSession* SessionMgr::WorkerSessionForSession(const string& session) {
 
 WorkerSession* SessionMgr::LegacySession() { return &legacy_session_; }
 
-WorkerSession* SessionMgr::WorkerSessionForGraphHandleUnlocked(
-    const string& graph_handle) {
-  auto it = sessions_by_graph_handle_.find(graph_handle);
-  if (it == sessions_by_graph_handle_.end()) {
-    return &legacy_session_;
-  } else {
-    return WorkerSessionForSessionUnlocked(it->second);
-  }
-}
-
-WorkerSession* SessionMgr::WorkerSessionForGraphHandle(
-    const string& graph_handle) {
-  mutex_lock l(mu_);
-  return WorkerSessionForGraphHandleUnlocked(graph_handle);
-}
-
-WorkerSession* SessionMgr::WorkerSessionForStepId(const int64 step_id) {
-  mutex_lock l(mu_);
-  auto it = graphs_by_step_id_.find(step_id);
-  if (it == graphs_by_step_id_.end()) {
-    return &legacy_session_;
-  } else {
-    return WorkerSessionForGraphHandleUnlocked(it->second);
-  }
-}
-
-void SessionMgr::AssociateGraphWithSession(const string& session,
-                                           const string& graph_handle) {
-  mutex_lock l(mu_);
-  sessions_by_graph_handle_[graph_handle] = session;
-}
-
-void SessionMgr::DisassociateGraphFromSession(const string& graph_handle) {
-  mutex_lock l(mu_);
-  auto it = sessions_by_graph_handle_.find(graph_handle);
-  if (it != sessions_by_graph_handle_.end()) {
-    sessions_by_graph_handle_.erase(it);
-  }
-}
-
-void SessionMgr::AssociateStepIdWithGraph(const string& graph_handle,
-                                          const int64 step_id) {
-  mutex_lock l(mu_);
-  graphs_by_step_id_[step_id] = graph_handle;
-}
-
-void SessionMgr::DisassociateStepIdFromGraph(const int64 step_id) {
-  mutex_lock l(mu_);
-  auto it = graphs_by_step_id_.find(step_id);
-  if (it != graphs_by_step_id_.end()) {
-    graphs_by_step_id_.erase(it);
-  }
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index 455b5c8d9d9e07f1743dc3f7695e3e70cbc078f0..c44bca7b7a407957b1a36d7659f2b35ea0b30d07 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -30,6 +30,8 @@ struct WorkerEnv;
 
 // SessionMgr keeps track of information related to a given session.
 //
+// SessionMgr runs on the workers.
+//
 // SessionMgr is threadsafe.
 class SessionMgr {
  public:
@@ -39,7 +41,6 @@ class SessionMgr {
   explicit SessionMgr(
       WorkerEnv* worker_env, const string& default_worker_name,
       std::unique_ptr<WorkerCacheInterface> default_worker_cache,
-      std::unique_ptr<RendezvousMgrInterface> default_rendezvous_mgr,
       WorkerCacheFactory worker_cache_factory);
   ~SessionMgr() {}
 
@@ -50,49 +51,36 @@ class SessionMgr {
   WorkerSession* WorkerSessionForSession(const string& session);
   WorkerSession* LegacySession();
 
-  // Locates the worker session for a given graph handle
-  WorkerSession* WorkerSessionForGraphHandle(const string& graph_handle);
-  void AssociateGraphWithSession(const string& session,
-                                 const string& graph_handle);
-  void DisassociateGraphFromSession(const string& graph_handle);
-
-  // Locates a worker session for a given step id
-  WorkerSession* WorkerSessionForStepId(const int64 step_id);
-  void AssociateStepIdWithGraph(const string& graph_handle,
-                                const int64 step_id);
-  void DisassociateStepIdFromGraph(const int64 step_id);
-
   Status DeleteSession(const string& session);
 
   static string WorkerNameFromServerDef(const ServerDef& server_def);
 
  private:
-  // Private constructor to work around std::unique_ptr ownership issues.
-  explicit SessionMgr(
-      WorkerEnv* worker_env, const string& default_worker_name,
-      std::unique_ptr<WorkerCacheInterface> default_worker_cache,
-      RendezvousMgrInterface* default_rendezvous_mgr,
-      WorkerCacheFactory worker_cache_factory);
-
   const WorkerEnv* const worker_env_;  // Not owned.
+
+  // A note about destruction:
+  // We must delete graph_mgr before device_mgr, due to shared
+  // ownership of OpKernels in the executors. (The graph_mgr will
+  // free all stateless OpKernels, and pass over borrowed stateful
+  // OpKernels, which are also held in their respective devices'
+  // OpSegments.)
+  //
+  // legacy_session_ owns the worker_env_.device_mgr, and so we must ensure
+  // that sessions_'s WorkerSessions are deleted (which do not own the
+  // underlying devices, but instead own RenamedDevices) before
+  // legacy_session_ is deleted. Further, we must ensure that WorkerSession's
+  // device_mgr is deleted after WorkerSession's graph_mgr.
+
   WorkerSession legacy_session_;
 
   const WorkerCacheFactory worker_cache_factory_;
 
   WorkerSession* WorkerSessionForSessionUnlocked(const string& session)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
-  WorkerSession* WorkerSessionForGraphHandleUnlocked(const string& graph_handle)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   // A map from session identifier to internal session structure.
   std::map<string, std::unique_ptr<WorkerSession>> sessions_ GUARDED_BY(mu_);
-
-  // A map from graph handles to the session that they belong to.
-  std::map<string, string> sessions_by_graph_handle_ GUARDED_BY(mu_);
-
-  // A map from globally-unique step id's to the corresponding graph handles.
-  std::map<int64, string> graphs_by_step_id_ GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index d3f3fa83958750f57ccfd61aef5b2f516c582b1d..7132f123a5943d0680743f3cc3bc17470f49d65d 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -27,8 +27,6 @@ class SessionMgrTest : public ::testing::Test {
   SessionMgrTest()
       : mgr_(&env_, "/job:mnist/replica:0/task:0",
              std::unique_ptr<WorkerCacheInterface>(),
-             std::unique_ptr<RendezvousMgrInterface>(new RpcRendezvousMgr(
-                 &env_, "/job:mnist/replica:0/task:0", nullptr)),
              factory_),
         legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {}
 
@@ -48,90 +46,19 @@ TEST_F(SessionMgrTest, CreateSessionSimple) {
   TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
   WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
   EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
-
-  TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
-}
-
-TEST_F(SessionMgrTest, AssociateGraphWithSession) {
-  ServerDef server_def;
-  string session_handle = "test_session_handle";
-  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
-  WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
-  ASSERT_NE(nullptr, session) << "Session for " << session_handle << "was null";
-
-  string graph_handle = "test_graph_handle";
-  mgr_.AssociateGraphWithSession(session_handle, graph_handle);
-  WorkerSession* graph_session = mgr_.WorkerSessionForGraphHandle(graph_handle);
-  ASSERT_EQ(session, graph_session);
-
+  EXPECT_NE(mgr_.LegacySession(), session);
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
-TEST_F(SessionMgrTest, AssociateStepWithGraph) {
+TEST_F(SessionMgrTest, LegacySession) {
   ServerDef server_def;
-  string session_handle = "test_session_handle";
-  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
+  string session_handle = "";
   WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
-  ASSERT_NE(nullptr, session) << "Session for " << session_handle << "was null";
-
-  string graph_handle = "test_graph_handle";
-  mgr_.AssociateGraphWithSession(session_handle, graph_handle);
-  WorkerSession* graph_session = mgr_.WorkerSessionForGraphHandle(graph_handle);
-  ASSERT_EQ(session, graph_session);
-
-  int64 step_id = 1234567890L;
-  mgr_.AssociateStepIdWithGraph(graph_handle, step_id);
-  WorkerSession* step_session = mgr_.WorkerSessionForStepId(step_id);
-  ASSERT_EQ(session, step_session);
-  ASSERT_EQ(graph_session, step_session);
+  EXPECT_EQ(mgr_.LegacySession(), session);
 
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
-TEST_F(SessionMgrTest, AssociateGraphWithSession_MissingSession) {
-  string session_handle = "test_session_handle";
-  string graph_handle = "test_graph_handle";
-  mgr_.AssociateGraphWithSession(session_handle, graph_handle);
-  WorkerSession* graph_session = mgr_.WorkerSessionForGraphHandle(graph_handle);
-  ASSERT_EQ(legacy_session_, graph_session);
-}
-
-TEST_F(SessionMgrTest, AssociateStepWithGraph_MissingGraph) {
-  ServerDef server_def;
-  string session_handle = "test_session_handle";
-  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
-  WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
-  ASSERT_NE(nullptr, session) << "Session for " << session_handle << "was null";
-
-  string graph_handle = "test_graph_handle";
-  int64 step_id = 1234567890L;
-  mgr_.AssociateStepIdWithGraph(graph_handle, step_id);
-  WorkerSession* step_session = mgr_.WorkerSessionForStepId(step_id);
-  ASSERT_EQ(legacy_session_, step_session);
-}
-
-TEST_F(SessionMgrTest, AssociateStepWithGraph_MissingSession) {
-  string session_handle = "test_session_handle";
-  string graph_handle = "test_graph_handle";
-  mgr_.AssociateGraphWithSession(session_handle, graph_handle);
-  WorkerSession* graph_session = mgr_.WorkerSessionForGraphHandle(graph_handle);
-  ASSERT_EQ(legacy_session_, graph_session);
-
-  int64 step_id = 1234567890L;
-  mgr_.AssociateStepIdWithGraph(graph_handle, step_id);
-  WorkerSession* step_session = mgr_.WorkerSessionForStepId(step_id);
-  ASSERT_EQ(legacy_session_, step_session);
-}
-
-TEST_F(SessionMgrTest, AssociateStepWithGraph_MissingSessionAndGraph) {
-  string session_handle = "test_session_handle";
-  string graph_handle = "test_graph_handle";
-  int64 step_id = 1234567890L;
-  mgr_.AssociateStepIdWithGraph(graph_handle, step_id);
-  WorkerSession* step_session = mgr_.WorkerSessionForStepId(step_id);
-  ASSERT_EQ(legacy_session_, step_session);
-}
-
 TEST_F(SessionMgrTest, WorkerNameFromServerDef) {
   ServerDef server_def;
   server_def.set_job_name("worker");
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 16be15fe662bd7bb3b7c6c930194ee0b5a7924c2..32ea0cfaa484d726c9a5e409cf237f45b9a5cc6b 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -55,11 +55,7 @@ void Worker::RegisterGraphAsync(const RegisterGraphRequest* request,
       env_->session_mgr->WorkerSessionForSession(request->session_handle());
   Status s = session->graph_mgr->Register(
       request->session_handle(), request->graph_def(), request->graph_options(),
-      response->mutable_graph_handle());
-  if (s.ok()) {
-    env_->session_mgr->AssociateGraphWithSession(request->session_handle(),
-                                                 response->graph_handle());
-  }
+      request->debug_options(), response->mutable_graph_handle());
   done(s);
 }
 
@@ -67,84 +63,14 @@ void Worker::DeregisterGraphAsync(const DeregisterGraphRequest* request,
                                   DeregisterGraphResponse* response,
                                   StatusCallback done) {
   WorkerSession* session =
-      env_->session_mgr->WorkerSessionForGraphHandle(request->graph_handle());
+      env_->session_mgr->WorkerSessionForSession(request->session_handle());
   Status s = session->graph_mgr->Deregister(request->graph_handle());
-  env_->session_mgr->DisassociateGraphFromSession(request->graph_handle());
-
-  done(s);
-}
-
-Worker::PartialRunState* Worker::FindPartialRun(const string& graph_handle,
-                                                int step_id) {
-  const std::pair<string, int> k(graph_handle, step_id);
-  Worker::PartialRunState* prun_state = nullptr;
-  mutex_lock l(mu_);
-  auto it = partial_runs_.find(k);
-  if (it != partial_runs_.end()) {
-    prun_state = it->second.get();
-  }
-  return prun_state;
-}
-
-void Worker::InsertPartialRunLocked(const string& graph_handle, int step_id,
-                                    Worker::PartialRunState* partial_run_state)
-    EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-  const std::pair<string, int> k(graph_handle, step_id);
-  partial_runs_.emplace(std::make_pair(
-      k, std::unique_ptr<Worker::PartialRunState>(partial_run_state)));
-}
-
-void Worker::RemovePartialRun(const string& graph_handle, int step_id) {
-  const std::pair<string, int> k(graph_handle, step_id);
-  mutex_lock l(mu_);
-  partial_runs_.erase(partial_runs_.find(k));
-}
-
-void Worker::MaybeCallFinalCallback(const string& graph_handle, int step_id,
-                                    const Status& executor_status) {
-  const std::pair<string, int> k(graph_handle, step_id);
-  StatusCallback done;
-  Status s;
-  {
-    mutex_lock l(mu_);
-    auto it = partial_runs_.find(k);
-    if (it != partial_runs_.end()) {
-      // If we found the partial_run, we call the final callback, if it
-      // exists.
-      std::swap(done, it->second->final_callback);
-      s = it->second->final_status;
-      it->second->executor_done = true;
-    }
-  }
-  if (done != nullptr) {
-    if (s.ok()) {
-      s = executor_status;
-    }
-    done(s);
-  }
-}
 
-void Worker::SetOrCallFinalCallback(const string& graph_handle, int step_id,
-                                    StatusCallback done, const Status& s) {
-  const std::pair<string, int> k(graph_handle, step_id);
-  {
-    mutex_lock l(mu_);
-    auto it = partial_runs_.find(k);
-    if (!it->second->executor_done) {
-      // If we found the partial_run, we set the final callback to call only
-      // when the executor is completely done.
-      it->second->final_callback = std::move(done);
-      it->second->final_status = s;
-      return;
-    }
-  }
-  // Otherwise we call the callback immediately.
   done(s);
 }
 
 void Worker::AbortStep(int64 step_id) {
-  WorkerSession* session = env_->session_mgr->WorkerSessionForStepId(step_id);
-  Rendezvous* rendez = session->rendezvous_mgr->Find(step_id);
+  Rendezvous* rendez = env_->rendezvous_mgr->Find(step_id);
   SchedNonBlockingClosureAfter(1000000, [rendez, step_id]() {
     // Delay a bit before aborting the step. This way, the root
     // cause may return first back to the client instead of this
@@ -195,8 +121,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
   const int64 step_id = request->step_id();
   TRACEPRINTF("RunGraph: %lld", step_id);
   WorkerSession* session =
-      env_->session_mgr->WorkerSessionForGraphHandle(request->graph_handle());
-  env_->session_mgr->AssociateStepIdWithGraph(request->graph_handle(), step_id);
+      env_->session_mgr->WorkerSessionForSession(request->session_handle());
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
   Status s = PrepareRunGraph(request, &in, out);
@@ -233,8 +158,8 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
   }
   CostGraphDef* cost_graph = response->mutable_cost_graph();
   session->graph_mgr->ExecuteAsync(
-      request->graph_handle(), step_id, request->exec_opts(), collector,
-      cost_graph, cm, in,
+      request->graph_handle(), step_id, session, request->exec_opts(),
+      collector, cost_graph, cm, in,
       [this, step_id, response, session, cm, out, token, collector, opts,
        done](Status s) {
         if (s.ok()) {
@@ -269,8 +194,8 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
   const string& graph_handle = request->graph_handle();
   TRACEPRINTF("PartialRunGraph: %lld", step_id);
   WorkerSession* session =
-      env_->session_mgr->WorkerSessionForGraphHandle(graph_handle);
-  env_->session_mgr->AssociateStepIdWithGraph(graph_handle, step_id);
+      env_->session_mgr->WorkerSessionForSession(request->session_handle());
+
   GraphMgr::NamedTensors in;
   GraphMgr::NamedTensors* out = new GraphMgr::NamedTensors;
   Status s = PrepareRunGraph(request, &in, out);
@@ -284,18 +209,8 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
     return;
   }
 
-  PartialRunState* partial_run_state = FindPartialRun(graph_handle, step_id);
-
   CancellationManager* cm = nullptr;
-  // If this is a new partial run call we need to create a new cancellation
-  // manager.
-  // Otherwise we use the cancellation manager stored in the found partial
-  // run state.
-  if (partial_run_state == nullptr) {
-    cm = new CancellationManager;
-  } else {
-    cm = partial_run_state->cancellation_manager;
-  }
+  bool is_new_partial_run = partial_run_mgr_.FindOrCreate(step_id, &cm);
 
   // Before we start doing anything, we set the RPC cancellation.
   opts->SetCancelCallback([this, cm, step_id]() {
@@ -305,27 +220,23 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
 
   // If this is a new partial run request, the request will need to start the
   // executors.
-  if (partial_run_state == nullptr) {
+  if (is_new_partial_run) {
     CancellationToken token;
     {
       mutex_lock l(mu_);
-      // Insert the new partial run into the partial_runs_ map.
-      partial_run_state = new PartialRunState(cm);
-      InsertPartialRunLocked(graph_handle, step_id, partial_run_state);
       token = cancellation_manager_->get_cancellation_token();
       cancellation_manager_->RegisterCallback(token,
                                               [cm]() { cm->StartCancel(); });
     }
     session->graph_mgr->ExecuteAsync(
-        graph_handle, step_id, request->exec_opts(), nullptr /* collector */,
-        nullptr /* cost_graph */, cm, in,
-        [this, token, graph_handle, step_id, cm](Status s) {
+        graph_handle, step_id, session, request->exec_opts(),
+        nullptr /* collector */, nullptr /* cost_graph */, cm, in,
+        [this, token, step_id, cm](Status s) {
           {
             mutex_lock l(mu_);
             cancellation_manager_->DeregisterCallback(token);
           }
-          MaybeCallFinalCallback(graph_handle, step_id, s);
-          delete cm;
+          partial_run_mgr_.ExecutorDone(step_id, s);
         });
   } else {
     // Send the partial run's new inputs.
@@ -337,8 +248,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
   }
 
   session->graph_mgr->RecvOutputsAsync(
-      step_id, out,
-      [this, out, request, response, graph_handle, step_id, finish](Status s) {
+      step_id, out, [this, out, request, response, step_id, finish](Status s) {
         if (s.ok()) {
           // Construct and return the resp.
           for (const auto& p : *out) {
@@ -348,15 +258,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
           }
         }
         if (request->is_last_partial_run()) {
-          SetOrCallFinalCallback(
-              graph_handle, step_id,
-              [this, graph_handle, step_id, finish](const Status& s) {
-                finish(s);
-                // We must wait to remove the partial_run_state until both the
-                // executor and the RecvAsync are complete.
-                RemovePartialRun(graph_handle, step_id);
-              },
-              s);
+          partial_run_mgr_.PartialRunDone(step_id, std::move(finish), s);
         } else {
           finish(s);
         }
@@ -367,8 +269,7 @@ void Worker::CleanupGraphAsync(const CleanupGraphRequest* request,
                                CleanupGraphResponse* response,
                                StatusCallback done) {
   const int64 step_id = request->step_id();
-  WorkerSession* session = env_->session_mgr->WorkerSessionForStepId(step_id);
-  session->rendezvous_mgr->Cleanup(step_id);
+  env_->rendezvous_mgr->Cleanup(step_id);
   done(Status::OK());
 }
 
@@ -396,8 +297,8 @@ void Worker::TracingAsync(const TracingRequest* request,
 Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
                                  Device** src_dev) {
   // Figures out which device the tensor is hosted on.
-  TF_RETURN_IF_ERROR(
-      env_->device_mgr->LookupDevice(parsed.src_device, src_dev));
+  string local_name = DeviceNameUtils::LocalName(parsed.src_device);
+  TF_RETURN_IF_ERROR(env_->device_mgr->LookupDevice(local_name, src_dev));
 
   // Does the device have the right incarnation number we expect?
   if ((*src_dev)->attributes().incarnation() != parsed.src_incarnation) {
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index 290fc6de952e2e3d3111a492ab357241d05773de..07300338c3871f2d85ae5a50595f1996bcc77f67 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
+#include "tensorflow/core/distributed_runtime/partial_run_mgr.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 
@@ -93,43 +94,11 @@ class Worker : public WorkerInterface {
   void AbortStep(int64);
 
  private:
+  PartialRunMgr partial_run_mgr_;
+
   mutex mu_;
   CancellationManager* cancellation_manager_ GUARDED_BY(mu_);
 
-  struct PartialRunState {
-    CancellationManager* cancellation_manager;
-
-    bool executor_done = false;
-    StatusCallback final_callback = nullptr;
-    Status final_status;
-
-    explicit PartialRunState(CancellationManager* cm)
-        : cancellation_manager(cm) {}
-  };
-  struct PairHash {
-    std::size_t operator()(std::pair<string, int> const& p) const {
-      return Hash64Combine(std::hash<string>()(p.first),
-                           std::hash<int>()(p.second));
-    }
-  };
-  std::unordered_map<std::pair<string, int>, std::unique_ptr<PartialRunState>,
-                     PairHash>
-      partial_runs_ GUARDED_BY(mu_);
-
-  PartialRunState* FindPartialRun(const string& graph_handle, int step_id);
-
-  void InsertPartialRunLocked(const string& graph_handle, int step_id,
-                              PartialRunState* partial_run_state)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  void RemovePartialRun(const string& graph_handle, int step_id);
-
-  void MaybeCallFinalCallback(const string& graph_handle, int step_id,
-                              const Status& executor_status);
-
-  void SetOrCallFinalCallback(const string& graph_handle, int step_id,
-                              StatusCallback done, const Status& s);
-
   Status PrepareRunGraph(RunGraphRequestWrapper* req,
                          GraphMgr::NamedTensors* in,
                          GraphMgr::NamedTensors* out);
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 6d68c82fd19257eabdd127672435ec8f583035e7..9e16ffa95745cfdac9bd0f5ac68b92a7b485b632 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -35,7 +35,7 @@ void WorkerCacheLogger::SetLogging(bool v) {
     ++want_logging_count_;
   } else {
     --want_logging_count_;
-    // If RPCs get cancelled, it may be possible for the count
+    // If RPCs get canceled, it may be possible for the count
     // to go negative.  This should not be a fatal error, since
     // logging is non-critical.
     if (want_logging_count_ < 0) want_logging_count_ = 0;
diff --git a/tensorflow/core/distributed_runtime/worker_env.h b/tensorflow/core/distributed_runtime/worker_env.h
index 24fb5948a710df68e45a253dc9d614de43b4a889..f09bea328fd99426d07a853791df46cf579d93fd 100644
--- a/tensorflow/core/distributed_runtime/worker_env.h
+++ b/tensorflow/core/distributed_runtime/worker_env.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_ENV_H_
 
+#include <vector>
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -24,8 +25,10 @@ namespace thread {
 class ThreadPool;
 }  // namespace thread
 
+class Device;
 class DeviceMgr;
 class Env;
+class RendezvousMgrInterface;
 class SessionMgr;
 
 // The worker environment class, which holds a bag of pointers to
@@ -38,10 +41,18 @@ struct WorkerEnv {
   // session_mgr encapsulates state for each session.
   SessionMgr* session_mgr = nullptr;
 
+  // The local devices of this worker. Devices are owned by the device_mgr.
+  //
+  // REQUIRES: !local_devices.empty().
+  std::vector<Device*> local_devices;
+
   // device_mgr manages local devices (cpu and gpu). The WorkerService
   // is the network interface for managed devices.
   DeviceMgr* device_mgr = nullptr;
 
+  // A set of rendezvous keyed by step ids.
+  RendezvousMgrInterface* rendezvous_mgr = nullptr;
+
   // A pool of threads for scheduling compute work.
   thread::ThreadPool* compute_pool = nullptr;
 };
diff --git a/tensorflow/core/distributed_runtime/worker_interface.h b/tensorflow/core/distributed_runtime/worker_interface.h
index 508bc7f46803d61c1e915b5e0167f773403f92fb..c9db28ec67f86d469c16427aa9343a2a1d36c0e7 100644
--- a/tensorflow/core/distributed_runtime/worker_interface.h
+++ b/tensorflow/core/distributed_runtime/worker_interface.h
@@ -113,6 +113,11 @@ class WorkerInterface {
     return CallAndWait(&ME::GetStatusAsync, request, response);
   }
 
+  Status CreateWorkerSession(const CreateWorkerSessionRequest* request,
+                             CreateWorkerSessionResponse* response) {
+    return CallAndWait(&ME::CreateWorkerSessionAsync, request, response);
+  }
+
   Status RegisterGraph(const RegisterGraphRequest* request,
                        RegisterGraphResponse* response) {
     return CallAndWait(&ME::RegisterGraphAsync, request, response);
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index 8298e169595f0f4b4c89641c661c3a7882d97616..8691450e9bc42fe6ddae30e74c2b81ed85cab273 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -17,14 +17,84 @@ limitations under the License.
 
 namespace tensorflow {
 
-WorkerSession::WorkerSession(
-    const string& worker_name,
-    std::unique_ptr<WorkerCacheInterface> worker_cache,
-    std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr,
-    std::unique_ptr<GraphMgr> graph_mgr)
+namespace {
+
+// A private cache that wraps worker_cache and allows reuse of
+// WorkerInterface objects.
+class WorkerFreeListCache : public WorkerCacheInterface {
+ public:
+  explicit WorkerFreeListCache(std::unique_ptr<WorkerCacheInterface> w)
+      : wrapped_(std::move(w)) {}
+
+  ~WorkerFreeListCache() final {
+    for (auto p : workers_) {
+      wrapped_->ReleaseWorker(p.first, p.second.worker);
+    }
+  }
+
+  void ListWorkers(std::vector<string>* workers) const override {
+    wrapped_->ListWorkers(workers);
+  }
+
+  WorkerInterface* CreateWorker(const string& target) override {
+    mutex_lock l(mu_);
+    auto p = workers_.find(target);
+    if (p != workers_.end()) {
+      return p->second.worker;
+    }
+    WorkerState state;
+    state.worker = wrapped_->CreateWorker(target);
+    if (state.worker != nullptr) {
+      workers_.insert(std::make_pair(target, state));
+    }
+    return state.worker;
+  }
+
+  void ReleaseWorker(const string& target, WorkerInterface* worker) override {
+    // TODO(jeff,sanjay): Should decrement ref-count when we implement eviction.
+  }
+
+  bool GetDeviceLocalityNonBlocking(const string& device,
+                                    DeviceLocality* locality) override {
+    return wrapped_->GetDeviceLocalityNonBlocking(device, locality);
+  }
+
+  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+                              StatusCallback done) override {
+    wrapped_->GetDeviceLocalityAsync(device, locality, done);
+  }
+
+  void SetLogging(bool active) override { wrapped_->SetLogging(active); }
+
+  void ClearLogs() override { wrapped_->ClearLogs(); }
+
+  bool RetrieveLogs(int64 step_id, StepStats* ss) override {
+    return wrapped_->RetrieveLogs(step_id, ss);
+  }
+
+ private:
+  std::unique_ptr<WorkerCacheInterface> wrapped_;
+
+  // Information kept per created WorkerInterface.
+  struct WorkerState {
+    WorkerInterface* worker;
+    // TODO(jeff,sanjay): Add reference count if we support eviction.
+  };
+
+  // TODO(jeff,sanjay): Eviction when the map becomes too big.
+  mutex mu_;
+  std::unordered_map<string, WorkerState> workers_ GUARDED_BY(mu_);
+};
+
+}  // namespace
+
+WorkerSession::WorkerSession(const string& worker_name,
+                             std::unique_ptr<WorkerCacheInterface> worker_cache,
+                             std::unique_ptr<DeviceMgr> device_mgr,
+                             std::unique_ptr<GraphMgr> graph_mgr)
     : worker_name(worker_name),
-      worker_cache(std::move(worker_cache)),
-      rendezvous_mgr(std::move(rendezvous_mgr)),
+      worker_cache(new WorkerFreeListCache(std::move(worker_cache))),
+      device_mgr(std::move(device_mgr)),
       graph_mgr(std::move(graph_mgr)) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index e6ebe88329822569b866a38dc2c79fa11aac105a..77cf4de8f7455f1f5b9553890922e2c310018b6b 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -18,14 +18,13 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
-#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 
 namespace tensorflow {
 
 class GraphMgr;
-class RendezvousMgrInterface;
 class WorkerCacheInterface;
 
 // WorkerSession encapsulates all of the state relating to a given session.
@@ -36,17 +35,20 @@ struct WorkerSession {
   // Object from which WorkerInterface instances can be obtained.
   const std::unique_ptr<WorkerCacheInterface> worker_cache;
 
-  // A set of rendezvous keyed by step ids.
-  const std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr;
+  // Collection of local devices. These devices are typically RenamedDevices
+  // in all except the SessionMgr.legacy_session_. legacy_session_.device_mgr
+  // == worker_env_.device_mgr, which holds the true devices.
+  const std::unique_ptr<DeviceMgr> device_mgr;
 
   // graph_mgr keeps track of the registered graphs of this session.
   //
   // Note: graph_mgr must be deleted before rendezvous_mgr!
+  // Note: graph_mgr must be deleted before device_mgr!
   const std::unique_ptr<GraphMgr> graph_mgr;
 
   WorkerSession(const string& worker_name,
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
-                std::unique_ptr<RendezvousMgrInterface> rendezvous_mgr,
+                std::unique_ptr<DeviceMgr> device_mgr,
                 std::unique_ptr<GraphMgr> graph_mgr);
 };
 
diff --git a/tensorflow/core/framework/allocator_registry.cc b/tensorflow/core/framework/allocator_registry.cc
index 946050687d47797efea58906f1fac6b2a27caf42..486be39ae31c487560efebc79e0fbab90ddca9db 100644
--- a/tensorflow/core/framework/allocator_registry.cc
+++ b/tensorflow/core/framework/allocator_registry.cc
@@ -26,22 +26,37 @@ AllocatorRegistry* AllocatorRegistry::Global() {
   return global_allocator_registry;
 }
 
-bool AllocatorRegistry::CheckForDuplicates(const string& name, int priority) {
+Allocator* AllocatorRegistry::GetRegisteredAllocator(const string& name,
+                                                     int priority) {
   for (auto entry : allocators_) {
     if (!name.compare(entry.name) && priority == entry.priority) {
-      return true;
+      return entry.allocator;
     }
   }
-  return false;
+  return nullptr;
 }
 
 void AllocatorRegistry::Register(const string& name, int priority,
                                  Allocator* allocator) {
   CHECK(!name.empty()) << "Need a valid name for Allocator";
   CHECK_GE(priority, 0) << "Priority needs to be non-negative";
-  CHECK(!CheckForDuplicates(name, priority))
-      << "Allocator with name: [" << name << "] and priority: [" << priority
-      << "] already registered";
+
+  Allocator* existing = GetRegisteredAllocator(name, priority);
+  if (existing != nullptr) {
+    // A duplicate is if the registration name and priority match
+    // but the Allocator::Name()'s don't match.
+    CHECK_EQ(existing->Name(), allocator->Name())
+        << "Allocator with name: [" << name << "], type [" << existing->Name()
+        << "], priority: [" << priority
+        << "] already registered.  Choose a different name to register "
+        << "an allocator of type " << allocator->Name();
+
+    // The allocator names match, so we can just return.
+    // It should be safe to delete the allocator since the caller
+    // gives up ownership of it.
+    delete allocator;
+    return;
+  }
 
   AllocatorRegistryEntry tmp_entry;
   tmp_entry.name = name;
diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h
index c419366ae1aa6f35cf98c351844d930bf1b49728..b26e79ac3b01c7b3fe5099f626c4e35862586282 100644
--- a/tensorflow/core/framework/allocator_registry.h
+++ b/tensorflow/core/framework/allocator_registry.h
@@ -27,7 +27,8 @@ namespace tensorflow {
 // A global AllocatorRegistry is used to hold allocators for CPU backends
 class AllocatorRegistry {
  public:
-  // Add an allocator to the registry.
+  // Add an allocator to the registry.  Caller releases ownership of
+  // 'allocator'.
   void Register(const string& name, int priority, Allocator* allocator);
 
   // Return allocator with highest priority
@@ -44,7 +45,9 @@ class AllocatorRegistry {
     Allocator* allocator;  // not owned
   } AllocatorRegistryEntry;
 
-  bool CheckForDuplicates(const string& name, int priority);
+  // Returns the Allocator registered for 'name' and 'priority',
+  // or 'nullptr' if not found.
+  Allocator* GetRegisteredAllocator(const string& name, int priority);
 
   std::vector<AllocatorRegistryEntry> allocators_;
   Allocator* m_curr_allocator_;  // not owned
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index 55e009964012f28296485ea9d97494b46663a314..b18ce3decc0268c11e16812a22ecf98275320946 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -36,8 +36,8 @@ string SummarizeString(const string& str) {
 string SummarizeTensor(const TensorProto& tensor_proto) {
   Tensor t;
   if (!t.FromProto(tensor_proto)) {
-    return strings::StrCat("<Invalid TensorProto: ",
-                           ProtoShortDebugString(tensor_proto), ">");
+    return strings::StrCat(
+        "<Invalid TensorProto: ", ProtoShortDebugString(tensor_proto), ">");
   }
   return t.DebugString();
 }
@@ -48,7 +48,7 @@ string SummarizeFunc(const NameAttrList& func) {
     entries.push_back(
         strings::StrCat(p.first, "=", SummarizeAttrValue(p.second)));
   }
-  sort(entries.begin(), entries.end());
+  std::sort(entries.begin(), entries.end());
   return strings::StrCat(func.name(), "[", str_util::Join(entries, ", "), "]");
 }
 
@@ -290,12 +290,12 @@ bool ParseAttrValue(StringPiece type, StringPiece text, AttrValue* out) {
 #define DEFINE_SET_ATTR_VALUE_ONE(ARG_TYPE, FIELD) \
   void SetAttrValue(ARG_TYPE value, AttrValue* out) { out->set_##FIELD(value); }
 
-#define DEFINE_SET_ATTR_VALUE_LIST(ARG_TYPE, FIELD)              \
-  void SetAttrValue(ARG_TYPE value, AttrValue* out) {            \
-    out->mutable_list(); /* create list() even if value empty */ \
-    for (const auto& v : value) {                                \
-      out->mutable_list()->add_##FIELD(v);                       \
-    }                                                            \
+#define DEFINE_SET_ATTR_VALUE_LIST(ARG_TYPE, FIELD)                       \
+  void SetAttrValue(ARG_TYPE value, AttrValue* out) {                     \
+    out->mutable_list()->Clear(); /* create list() even if value empty */ \
+    for (const auto& v : value) {                                         \
+      out->mutable_list()->add_##FIELD(v);                                \
+    }                                                                     \
   }
 
 #define DEFINE_SET_ATTR_VALUE_BOTH(ARG_TYPE, FIELD) \
@@ -319,7 +319,7 @@ void SetAttrValue(StringPiece value, AttrValue* out) {
 }
 
 void SetAttrValue(const gtl::ArraySlice<StringPiece> value, AttrValue* out) {
-  out->mutable_list();  // Create list() even if value empty.
+  out->mutable_list()->Clear();  // Create list() even if value empty.
   for (const auto& v : value) {
     out->mutable_list()->add_s(v.data(), v.size());
   }
@@ -338,14 +338,14 @@ void SetAttrValue(const PartialTensorShape& value, AttrValue* out) {
 }
 
 void SetAttrValue(const gtl::ArraySlice<TensorShape> value, AttrValue* out) {
-  out->mutable_list();  // Create list() even if value empty.
+  out->mutable_list()->Clear();  // Create list() even if value empty.
   for (const auto& v : value) {
     v.AsProto(out->mutable_list()->add_shape());
   }
 }
 
 void SetAttrValue(gtl::ArraySlice<TensorShapeProto> value, AttrValue* out) {
-  out->mutable_list();  // Create list() even if value empty.
+  out->mutable_list()->Clear();  // Create list() even if value empty.
   for (const auto& v : value) {
     *out->mutable_list()->add_shape() = v;
   }
@@ -353,7 +353,7 @@ void SetAttrValue(gtl::ArraySlice<TensorShapeProto> value, AttrValue* out) {
 
 void SetAttrValue(const gtl::ArraySlice<PartialTensorShape> value,
                   AttrValue* out) {
-  out->mutable_list();  // Create list() even if value empty.
+  out->mutable_list()->Clear();  // Create list() even if value empty.
   for (const auto& v : value) {
     v.AsProto(out->mutable_list()->add_shape());
   }
@@ -368,7 +368,7 @@ void SetAttrValue(const Tensor& value, AttrValue* out) {
 }
 
 void SetAttrValue(const gtl::ArraySlice<Tensor> value, AttrValue* out) {
-  out->mutable_list();  // Create list() even if value empty.
+  out->mutable_list()->Clear();  // Create list() even if value empty.
   for (const auto& v : value) {
     if (v.NumElements() > 1) {
       v.AsProtoTensorContent(out->mutable_list()->add_tensor());
@@ -383,7 +383,7 @@ void SetAttrValue(const TensorProto& value, AttrValue* out) {
 }
 
 void SetAttrValue(const gtl::ArraySlice<TensorProto> value, AttrValue* out) {
-  out->mutable_list();  // Create list() even if value empty.
+  out->mutable_list()->Clear();  // Create list() even if value empty.
   for (const auto& v : value) {
     *out->mutable_list()->add_tensor() = v;
   }
@@ -394,22 +394,39 @@ void SetAttrValue(const NameAttrList& value, AttrValue* out) {
 }
 
 void SetAttrValue(gtl::ArraySlice<NameAttrList> value, AttrValue* out) {
-  out->mutable_list();  // Create list() even if value empty.
+  out->mutable_list()->Clear();  // Create list() even if value empty.
   for (const auto& v : value) {
     *out->mutable_list()->add_func() = v;
   }
 }
 
+// Wrapper around protocol buffer serialization that requests deterministic
+// serialization, in particular for Map fields, which serialize in a random
+// order by default. Returns true on success.
+template <typename T>
+static bool DeterministicSerialization(const T& t, string* result) {
+  const int size = t.ByteSize();
+  *result = string(size, '\0');
+  ::tensorflow::protobuf::io::ArrayOutputStream array_stream(&(*result)[0],
+                                                             size);
+  ::tensorflow::protobuf::io::CodedOutputStream output_stream(&array_stream);
+  output_stream.SetSerializationDeterministic(true);
+  t.SerializeWithCachedSizes(&output_stream);
+  return !output_stream.HadError() && size == output_stream.ByteCount();
+}
+
 bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b) {
   string a_str, b_str;
-  a.SerializeToString(&a_str);
-  b.SerializeToString(&b_str);
+  DeterministicSerialization(a, &a_str);
+  DeterministicSerialization(b, &b_str);
   // Note: it should be safe to compare proto serializations of the attr
   // values since at most one field should be set in each (indeed, it
   // must be the same field if they are to compare equal).
   // Exception: there are multiple equivalent representations of
   // TensorProtos.  So a return value of true implies a == b, but not the
   // converse.
+  // TODO(phawkins): this is incorrect for NameAttrList attributes that may
+  // contain nested AttrValue maps.
   return a_str == b_str;
 }
 
diff --git a/tensorflow/core/framework/cancellation.h b/tensorflow/core/framework/cancellation.h
index 4cc3f9235312f02a043da4589dc1d2f1713bfd5d..651c054fe8b69f6458d090d208385dc1006783af 100644
--- a/tensorflow/core/framework/cancellation.h
+++ b/tensorflow/core/framework/cancellation.h
@@ -36,7 +36,7 @@ namespace tensorflow {
 // CancellationManager::get_cancellation_token.
 typedef int64 CancellationToken;
 
-// A callback that is invoked when a step is cancelled.
+// A callback that is invoked when a step is canceled.
 //
 // NOTE(mrry): See caveats about CancelCallback implementations in the
 // comment for CancellationManager::RegisterCallback.
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 4c87a453e2a16d56167661af4053d44e337fee55..d5e6e293d6d8ea90746ad20ee55fb1c4022bd223 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -873,6 +873,13 @@ Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+Status RandomShape(shape_inference::InferenceContext* c) {
+  shape_inference::ShapeHandle out;
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+  c->set_output(0, out);
+  return Status::OK();
+}
+
 Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
                             ShapeHandle values_shape, ShapeHandle shape_shape) {
   // Validate ranks.
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 73509fb7fbaa4eeb6d6861d1de5a8bd8aa5c6e3e..dc99e48adb97abc6948e34da7d5bb239d208756e 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -199,6 +199,9 @@ Status ConcatV2Shape(shape_inference::InferenceContext* c);
 // Tested by ops/math_ops_test.cc.
 Status BroadcastBinaryOpShapeFn(InferenceContext* c);
 
+// Shape function for random operations.
+Status RandomShape(shape_inference::InferenceContext* c);
+
 // Validates the 3 component tensors of a sparse tensor have the proper
 // shapes. This mimics SparseTensor.__init__ in python/framework/ops.py.
 Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 8894671fdf3a22098cb2e6eca23d1adeb38a5f18..27fe28fe60a9bd020f9db16c49506741336c9863 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -115,7 +115,7 @@ class DeviceBase {
     cpu_worker_threads_ = t;
   }
 
-  const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
+  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
     CHECK(cpu_worker_threads_ != nullptr);
     return cpu_worker_threads_;
   }
@@ -140,7 +140,7 @@ class DeviceBase {
     gpu_device_info_ = g;
   }
 
-  const GpuDeviceInfo* tensorflow_gpu_device_info() const {
+  virtual const GpuDeviceInfo* tensorflow_gpu_device_info() const {
     return gpu_device_info_;
   }
 
@@ -170,13 +170,13 @@ class DeviceBase {
     return GetAllocator(attr);
   }
 
-  const Eigen::ThreadPoolDevice* eigen_cpu_device() {
+  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device() {
     CHECK(eigen_cpu_device_ != nullptr);
     return eigen_cpu_device_;
   }
 
 #ifdef TENSORFLOW_USE_SYCL
-  const Eigen::SyclDevice* eigen_sycl_device() const {
+  virtual const Eigen::SyclDevice* eigen_sycl_device() const {
     CHECK(eigen_sycl_device_ != nullptr);
     return eigen_sycl_device_;
   }
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index e1603330ebab013e2121f55a3c44686c801d37bc..186095201d1efb4898595e15ec2145ee37fa9f07 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -16,14 +16,17 @@ limitations under the License.
 #include "tensorflow/core/framework/function.h"
 
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/framework/function.pb_text.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
 
@@ -42,12 +45,11 @@ namespace {
 // Otherwise (arg_def is a simple type T), *is_type_list is set to
 // false, and *dtypes is set to a single element vector, whose only
 // element is T.
-Status ArgNumType(const InstantiateAttrValueMap& attrs,
-                  const OpDef::ArgDef& arg_def, bool* is_type_list,
-                  DataTypeVector* dtypes) {
+Status ArgNumType(AttrSlice attrs, const OpDef::ArgDef& arg_def,
+                  bool* is_type_list, DataTypeVector* dtypes) {
   dtypes->clear();
   if (!arg_def.type_list_attr().empty()) {
-    const AttrValue* v = gtl::FindOrNull(attrs, arg_def.type_list_attr());
+    const AttrValue* v = attrs.Find(arg_def.type_list_attr());
     if (v == nullptr) {
       return errors::NotFound("type attr not found: ",
                               arg_def.type_list_attr());
@@ -62,7 +64,7 @@ Status ArgNumType(const InstantiateAttrValueMap& attrs,
   *is_type_list = false;
   int num = 1;
   if (!arg_def.number_attr().empty()) {
-    const AttrValue* v = gtl::FindOrNull(attrs, arg_def.number_attr());
+    const AttrValue* v = attrs.Find(arg_def.number_attr());
     if (v == nullptr) {
       return errors::NotFound("type attr not found: ", arg_def.type_attr());
     }
@@ -75,7 +77,7 @@ Status ArgNumType(const InstantiateAttrValueMap& attrs,
   } else if (arg_def.type_attr().empty()) {
     dtype = DT_INVALID;
   } else {
-    const AttrValue* v = gtl::FindOrNull(attrs, arg_def.type_attr());
+    const AttrValue* v = attrs.Find(arg_def.type_attr());
     if (v == nullptr) {
       return errors::NotFound("type attr not found: ", arg_def.type_attr());
     }
@@ -85,35 +87,22 @@ Status ArgNumType(const InstantiateAttrValueMap& attrs,
   return Status::OK();
 }
 
-string Name(int node_index) { return strings::StrCat("n", node_index); }
-
-string Name(int node_index, int output_index) {
-  if (output_index == 0) {
-    return Name(node_index);
-  } else {
-    return strings::StrCat("n", node_index, ":", output_index);
-  }
-}
-
-string Dep(int node_index) { return strings::StrCat("^", Name(node_index)); }
-
 template <typename T>
 void AddAttr(const string& name, const T& val, NodeDef* ndef) {
   SetAttrValue(val, &((*ndef->mutable_attr())[name]));
 }
 
-Status ValidateSignatureWithAttrs(const OpDef& sig,
-                                  const InstantiateAttrValueMap& attr_values) {
+Status ValidateSignatureWithAttrs(const OpDef& sig, AttrSlice attr_values) {
   // attr_values should specify all attrs defined in fdef.
   for (const auto& a : sig.attr()) {
-    auto const iter = attr_values.find(a.name());
-    if (iter == attr_values.end()) {
+    const AttrValue* v = attr_values.Find(a.name());
+    if (!v) {
       return errors::NotFound("Attr ", a.name(), " is not found from ",
                               SummarizeOpDef(sig));
     }
-    Status status = AttrValueHasType(iter->second, a.type());
+    Status status = AttrValueHasType(*v, a.type());
     if (!status.ok()) {
-      errors::AppendToMessage(&status, "for attr '", iter->first, "'");
+      errors::AppendToMessage(&status, "for attr '", a.name(), "'");
       return status;
     }
   }
@@ -144,230 +133,305 @@ Status ValidateSignatureWithAttrs(const OpDef& sig,
   return Status::OK();
 }
 
-// We build a small index for all names that can be used as a node's
-// input arguments.
-//
-// If is_func_arg is true, the name is a function's argument.  In
-// this case, the produced graph def has gdef.node[nid ... nid +
-// dtype.size()).
-//
-// Otherwise, the name is a function body's node return value.  In
-// this case, the produced graph def has one node gdef.node[nid] and
-// the node's output index [idx ... idx + num) corresponds to the
-// named outputs.
-//
-// In all cases, "dtype" specifies the data type.
-struct NameInfoItem {
-  bool is_func_arg;
-  int nid;
-  int idx;
-  bool is_type_list;
-  DataTypeVector dtypes;
-};
-typedef std::unordered_map<string, NameInfoItem> NameInfoIndex;
-
-Status AddArgName(NameInfoIndex* name_info, const string& arg,
-                  const NameInfoItem& item) {
-  if (!name_info->insert({arg, item}).second) {
-    return errors::InvalidArgument("Duplicated arg name: ", arg);
-  }
-  return Status::OK();
-}
-
-Status BuildInputArgIndex(const OpDef::ArgDef& arg_def,
-                          const InstantiateAttrValueMap& attr_values,
-                          NameInfoIndex* name_info,
-                          InstantiationResult* result) {
-  bool is_type_list;
-  DataTypeVector dtypes;
-  TF_RETURN_IF_ERROR(ArgNumType(attr_values, arg_def, &is_type_list, &dtypes));
-  CHECK_GE(dtypes.size(), size_t{1});
-  GraphDef* gdef = &result->gdef;
-  int arg_index = gdef->node_size();
-  TF_RETURN_IF_ERROR(AddArgName(name_info, arg_def.name(),
-                                {true, arg_index, 0, is_type_list, dtypes}));
-  // Creates dtypes.size() nodes in the gdef.
-  for (size_t i = 0; i < dtypes.size(); ++i) {
-    TF_RETURN_IF_ERROR(AddArgName(name_info,
-                                  strings::StrCat(arg_def.name(), ":", i),
-                                  {true, arg_index, 0, false, {dtypes[i]}}));
-    DCHECK_EQ(arg_index, gdef->node_size());
-    NodeDef* gnode = gdef->add_node();
-    gnode->set_name(Name(arg_index));
-    gnode->set_op("_Arg");
-    AddAttr("T", dtypes[i], gnode);
-    AddAttr("index", arg_index, gnode);
-    result->arg_types.push_back(dtypes[i]);
-    ++arg_index;
-  }
-  return Status::OK();
-}
-
-Status AddRetName(NameInfoIndex* name_info, const string& ret,
-                  const NameInfoItem& item) {
-  if (!name_info->insert({ret, item}).second) {
-    return errors::InvalidArgument("Duplicated ret name: ", ret);
+// A helper class for instantiating functions. This contains shared information
+// like the resulting graph and node name index.
+class FunctionInstantiationHelper {
+ public:
+  FunctionInstantiationHelper(GetFunctionSignature get_function,
+                              InstantiationResult* result)
+      : get_function_(std ::move(get_function)), result_(*result) {
+    result_.gdef.Clear();
+  }
+
+  // Builds index for nodes that can be used as node's input arguments.
+  Status BuildInputArgIndex(const OpDef::ArgDef& arg_def,
+                            AttrSlice attr_values) {
+    bool is_type_list;
+    DataTypeVector dtypes;
+    TF_RETURN_IF_ERROR(
+        ArgNumType(attr_values, arg_def, &is_type_list, &dtypes));
+    CHECK_GE(dtypes.size(), size_t{1});
+    GraphDef* gdef = &result_.gdef;
+    int arg_index = gdef->node_size();
+    TF_RETURN_IF_ERROR(
+        AddItem(arg_def.name(), {true, arg_index, 0, is_type_list, dtypes}));
+    // Creates dtypes.size() nodes in the gdef.
+    for (size_t i = 0; i < dtypes.size(); ++i) {
+      TF_RETURN_IF_ERROR(AddItem(strings::StrCat(arg_def.name(), ":", i),
+                                 {true, arg_index, 0, false, {dtypes[i]}}));
+      DCHECK_EQ(arg_index, gdef->node_size());
+      string name = arg_def.name();
+      if (dtypes.size() > 1) {
+        strings::StrAppend(&name, "_", i);
+      }
+      NodeDef* gnode = AddNode(name);
+      gnode->set_op("_Arg");
+      AddAttr("T", dtypes[i], gnode);
+      AddAttr("index", arg_index, gnode);
+      result_.arg_types.push_back(dtypes[i]);
+      ++arg_index;
+    }
+    return Status::OK();
   }
-  return Status::OK();
-}
 
-Status BuildNodeOutputIndex(const NodeDef& node,
-                            const InstantiateAttrValueMap& attrs,
-                            GetFunctionSignature get_function,
-                            const int arg_index, NameInfoIndex* name_info) {
-  const OpDef* node_sig = nullptr;
-  TF_RETURN_IF_ERROR(get_function(node.op(), &node_sig));
-  if (node_sig->output_arg_size() == 0) {
-    return AddRetName(name_info, node.name(), {false, arg_index, 0, false, {}});
-  }
-  const int num_retval = node_sig->output_arg_size();
-  int start = 0;
-  bool is_type_list;
-  DataTypeVector dtypes;
-  for (int i = 0; i < num_retval; ++i) {
-    TF_RETURN_IF_ERROR(
-        ArgNumType(attrs, node_sig->output_arg(i), &is_type_list, &dtypes));
-    // Note that we rely on the backwards-compatibility test enforcing
-    // that output_arg(*).name() doesn't change here.
-    const string base_name =
-        strings::StrCat(node.name(), ":", node_sig->output_arg(i).name());
-    TF_RETURN_IF_ERROR(AddRetName(
-        name_info, base_name, {false, arg_index, start, is_type_list, dtypes}));
-    for (int j = 0; j < static_cast<int>(dtypes.size()); ++j) {
+  Status BuildNodeOutputIndex(const NodeDef& node, AttrSlice attrs,
+                              const int arg_index) {
+    const OpDef* node_sig = nullptr;
+    TF_RETURN_IF_ERROR(get_function_(node.op(), &node_sig));
+    if (node_sig->output_arg_size() == 0) {
+      return AddItem(node.name(), {false, arg_index, 0, false, {}});
+    }
+    const int num_retval = node_sig->output_arg_size();
+    int start = 0;
+    bool is_type_list;
+    DataTypeVector dtypes;
+    for (int i = 0; i < num_retval; ++i) {
       TF_RETURN_IF_ERROR(
-          AddRetName(name_info, strings::StrCat(base_name, ":", j),
-                     {false, arg_index, start + j, false, {dtypes[j]}}));
+          ArgNumType(attrs, node_sig->output_arg(i), &is_type_list, &dtypes));
+      // Note that we rely on the backwards-compatibility test enforcing
+      // that output_arg(*).name() doesn't change here.
+      const string base_name =
+          strings::StrCat(node.name(), ":", node_sig->output_arg(i).name());
+      TF_RETURN_IF_ERROR(
+          AddItem(base_name, {false, arg_index, start, is_type_list, dtypes}));
+      for (int j = 0; j < static_cast<int>(dtypes.size()); ++j) {
+        TF_RETURN_IF_ERROR(
+            AddItem(strings::StrCat(base_name, ":", j),
+                    {false, arg_index, start + j, false, {dtypes[j]}}));
+      }
+      start += dtypes.size();
     }
-    start += dtypes.size();
+    return Status::OK();
   }
-  return Status::OK();
-}
 
-Status InstantiateNode(const NodeDef& fnode,
-                       const InstantiateAttrValueMap& attrs,
-                       GetFunctionSignature get_function,
-                       const NameInfoIndex& name_info, GraphDef* gdef) {
-  const OpDef* fnode_sig = nullptr;
-  TF_CHECK_OK(get_function(fnode.op(), &fnode_sig));
-  NodeDef* gnode = gdef->add_node();
-  gnode->set_name(Name(gdef->node_size() - 1));
-  gnode->set_op(fnode.op());
-  gnode->set_device(fnode.device());
-
-  // Input
-  const int num_args = fnode_sig->input_arg_size();
-  bool is_type_list;  // ignored
-  DataTypeVector dtypes;
-  int fnode_arg_index = 0;
-  for (int i = 0; i < num_args; ++i) {
-    TF_RETURN_IF_ERROR(
-        ArgNumType(attrs, fnode_sig->input_arg(i), &is_type_list, &dtypes));
-    // Consume inputs (indexed by fnode_arg_index) until we have
-    // matched each element of dtypes (indexed by j).
-    for (size_t j = 0; j < dtypes.size(); ++fnode_arg_index) {
-      if (fnode_arg_index >= fnode.input_size()) {
-        // Should never happen if we computed dtypes correctly.
-        return errors::InvalidArgument("Attempt to access beyond input size: ",
-                                       fnode_arg_index, " >= ",
-                                       fnode.input_size());
-      }
-      // Look up the next input.
-      const string& input_name = fnode.input(fnode_arg_index);
-      const NameInfoItem* item = gtl::FindOrNull(name_info, input_name);
-      if (item == nullptr) {
-        return errors::InvalidArgument("input ", input_name, " is not found: ",
-                                       SummarizeNodeDef(fnode));
-      }
-      if (item->dtypes.size() > dtypes.size() - j) {
-        return errors::InvalidArgument("Input ", input_name, " too long for ",
-                                       fnode_sig->input_arg(i).name());
-      }
-      // Match up all the elements of this input (indexed by k) with
-      // elements of dtypes (advancing j).
-      for (int k = 0; k < item->dtypes.size(); ++k, ++j) {
-        if (item->dtypes[k] != dtypes[j]) {
+  Status InstantiateNode(const NodeDef& fnode, AttrSlice attrs) {
+    const OpDef* fnode_sig = nullptr;
+    TF_CHECK_OK(get_function_(fnode.op(), &fnode_sig));
+    NodeDef* gnode = AddNode(fnode.name());
+    gnode->set_op(fnode.op());
+    gnode->set_device(fnode.device());
+    int gnode_idx = nodes_.size() - 1;
+
+    // Input
+    const int num_args = fnode_sig->input_arg_size();
+    bool is_type_list;  // ignored
+    DataTypeVector dtypes;
+    int fnode_arg_index = 0;
+    for (int i = 0; i < num_args; ++i) {
+      TF_RETURN_IF_ERROR(
+          ArgNumType(attrs, fnode_sig->input_arg(i), &is_type_list, &dtypes));
+      // Consume inputs (indexed by fnode_arg_index) until we have
+      // matched each element of dtypes (indexed by j).
+      for (size_t j = 0; j < dtypes.size(); ++fnode_arg_index) {
+        if (fnode_arg_index >= fnode.input_size()) {
+          // Should never happen if we computed dtypes correctly.
+          return errors::InvalidArgument(
+              "Attempt to access beyond input size: ", fnode_arg_index,
+              " >= ", fnode.input_size());
+        }
+        // Look up the next input.
+        const string& input_name = fnode.input(fnode_arg_index);
+        const auto* item = GetItemOrNull(input_name);
+        if (item == nullptr) {
           return errors::InvalidArgument(
-              "input ", fnode_sig->input_arg(i).name(), "[", j,
-              "] expected type ", DataTypeString(dtypes[j]), " != ",
-              DataTypeString(item->dtypes[k]), ", the type of ", input_name,
-              "[", k, "]");
+              "input ", input_name, " is not found: ", SummarizeNodeDef(fnode));
+        }
+        if (item->dtypes.size() > dtypes.size() - j) {
+          return errors::InvalidArgument("Input ", input_name, " too long for ",
+                                         fnode_sig->input_arg(i).name());
         }
-        if (item->is_func_arg) {
-          gnode->add_input(Name(item->nid + k));
-        } else {
-          gnode->add_input(Name(item->nid, item->idx + k));
+        // Match up all the elements of this input (indexed by k) with
+        // elements of dtypes (advancing j).
+        for (int k = 0; k < item->dtypes.size(); ++k, ++j) {
+          if (item->dtypes[k] != dtypes[j]) {
+            return errors::InvalidArgument(
+                "input ", fnode_sig->input_arg(i).name(), "[", j,
+                "] expected type ", DataTypeString(dtypes[j]),
+                " != ", DataTypeString(item->dtypes[k]), ", the type of ",
+                input_name, "[", k, "]");
+          }
+          if (item->is_func_arg) {
+            AddInput(gnode_idx, item->nid + k, 0);
+          } else {
+            AddInput(gnode_idx, item->nid, item->idx + k);
+          }
         }
       }
     }
+
+    // Control deps.
+    for (int i = fnode_arg_index; i < fnode.input_size(); ++i) {
+      const string& input = fnode.input(i);
+      if (input.empty() || input[0] != '^') {
+        return errors::InvalidArgument("Expected input[", i, "] == '", input,
+                                       "' to be a control input.");
+      }
+      int nid = -1;
+      const string node_name = input.substr(1);
+      const string node_colon = node_name + ":";
+      for (const auto& p : index_) {
+        if (p.first == node_name ||
+            tensorflow::StringPiece(p.first).starts_with(node_colon)) {
+          nid = p.second.nid;
+          break;
+        }
+      }
+      if (nid == -1) {
+        return errors::InvalidArgument("input[", i, "] == '", input,
+                                       "', is not found.");
+      }
+      AddDep(gnode_idx, nid);
+    }
+
+    // Attrs.
+    for (const auto& p : attrs) {
+      (*gnode->mutable_attr())[p.first] = p.second;
+    }
+
+    return Status::OK();
   }
 
-  // Control deps.
-  for (int i = fnode_arg_index; i < fnode.input_size(); ++i) {
-    const string& input = fnode.input(i);
-    if (input.empty() || input[0] != '^') {
-      return errors::InvalidArgument("Expected input[", i, "] == '", input,
-                                     "' to be a control input.");
+  Status AddReturnNode(
+      const OpDef::ArgDef& ret_def, AttrSlice attrs,
+      const ::tensorflow::protobuf::Map<string, string>& ret_map,
+      int* ret_index) {
+    auto ret_iter = ret_map.find(ret_def.name());
+    if (ret_iter == ret_map.end()) {
+      return errors::InvalidArgument("Return ", ret_def.name(), " missing.");
+    }
+    bool is_type_list;
+    DataTypeVector dtypes;
+    TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes));
+    CHECK_GE(dtypes.size(), size_t{1});
+    const auto* item = GetItemOrNull(ret_iter->second);
+    if (item == nullptr) {
+      return errors::InvalidArgument("Return ", ret_def.name(), " -> ",
+                                     ret_iter->second, " is not found.");
     }
-    int nid = -1;
-    const string node_name = input.substr(1);
-    const string node_colon = node_name + ":";
-    for (const auto& p : name_info) {
-      if (p.first == node_name ||
-          tensorflow::StringPiece(p.first).starts_with(node_colon)) {
-        nid = p.second.nid;
-        break;
+    if (dtypes != item->dtypes) {
+      return errors::InvalidArgument("Invalid ret types ", ret_def.name(),
+                                     " : ", DataTypeVectorString(dtypes),
+                                     " vs. ",
+                                     DataTypeVectorString(item->dtypes));
+    }
+    for (size_t i = 0; i < dtypes.size(); ++i) {
+      string name = strings::StrCat(ret_def.name(), "_RetVal");
+      if (dtypes.size() > 1) {
+        strings::StrAppend(&name, "_", i);
       }
+      NodeDef* gnode = AddNode(name);
+      gnode->set_op("_Retval");
+      AddInput(nodes_.size() - 1, item->nid, item->idx + i);
+      AddAttr("T", dtypes[i], gnode);
+      AddAttr("index", (*ret_index)++, gnode);
+      result_.ret_types.push_back(dtypes[i]);
     }
-    if (nid == -1) {
-      return errors::InvalidArgument("input[", i, "] == '", input,
-                                     "', is not found.");
+    return Status::OK();
+  }
+
+  // Adds the actual node inputs to the result graph by converting indexes to
+  // the node names.
+  void AddNodeInputs() {
+    for (int i = 0; i < result_.gdef.node_size(); i++) {
+      NodeInfo& node_info = nodes_[i];
+      for (const auto& p : node_info.data_inputs) {
+        result_.gdef.mutable_node(i)->add_input(Name(p.first, p.second));
+      }
+      for (int index : node_info.control_inputs) {
+        result_.gdef.mutable_node(i)->add_input(Dep(index));
+      }
     }
-    gnode->add_input(Dep(nid));
   }
 
-  // Attrs.
-  for (const auto& p : attrs) {
-    (*gnode->mutable_attr())[p.first] = p.second;
+ private:
+  // This is used to build a small index for all names that can be used as a
+  // node's input arguments.
+  //
+  // If is_func_arg is true, the name is a function's argument.  In
+  // this case, the produced graph def has gdef.node[nid ... nid +
+  // dtype.size()).
+  //
+  // Otherwise, the name is a function body's node return value.  In
+  // this case, the produced graph def has one node gdef.node[nid] and
+  // the node's output index [idx ... idx + num) corresponds to the
+  // named outputs.
+  //
+  // In all cases, "dtype" specifies the data type.
+  struct NameInfoItem {
+    bool is_func_arg;
+    int nid;
+    int idx;
+    bool is_type_list;
+    DataTypeVector dtypes;
+  };
+
+  // Adds an item into the input name index.
+  Status AddItem(const string& name, const NameInfoItem& item) {
+    if (!index_.insert({name, item}).second) {
+      return errors::InvalidArgument(
+          strings::StrCat("Duplicated ", item.is_func_arg ? "arg" : "ret",
+                          " name: "),
+          name);
+    }
+    return Status::OK();
   }
 
-  return Status::OK();
-}
+  const NameInfoItem* GetItemOrNull(const string& name) const {
+    return gtl::FindOrNull(index_, name);
+  }
 
-Status AddReturnNode(const OpDef::ArgDef& ret_def,
-                     const InstantiateAttrValueMap& attrs,
-                     const ::tensorflow::protobuf::Map<string, string>& ret_map,
-                     const NameInfoIndex& name_info, int* ret_index,
-                     InstantiationResult* result) {
-  auto ret_iter = ret_map.find(ret_def.name());
-  if (ret_iter == ret_map.end()) {
-    return errors::InvalidArgument("Return ", ret_def.name(), " missing.");
-  }
-  bool is_type_list;
-  DataTypeVector dtypes;
-  TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes));
-  CHECK_GE(dtypes.size(), size_t{1});
-  const NameInfoItem* item = gtl::FindOrNull(name_info, ret_iter->second);
-  if (item == nullptr) {
-    return errors::InvalidArgument("Return ", ret_def.name(), " -> ",
-                                   ret_iter->second, " is not found.");
-  }
-  if (dtypes != item->dtypes) {
-    return errors::InvalidArgument("Invalid ret types ", ret_def.name(), " : ",
-                                   DataTypeVectorString(dtypes), " vs. ",
-                                   DataTypeVectorString(item->dtypes));
-  }
-  GraphDef* gdef = &result->gdef;
-  for (size_t i = 0; i < dtypes.size(); ++i) {
-    NodeDef* gnode = gdef->add_node();
-    gnode->set_name(Name(gdef->node_size() - 1));
-    gnode->set_op("_Retval");
-    gnode->add_input(Name(item->nid, item->idx + i));
-    AddAttr("T", dtypes[i], gnode);
-    AddAttr("index", (*ret_index)++, gnode);
-    result->ret_types.push_back(dtypes[i]);
+  string Dep(int node_index) const {
+    return strings::StrCat("^", Name(node_index));
+  }
+
+  string Name(int node_index) const {
+    CHECK_LT(node_index, nodes_.size());
+    return nodes_[node_index].name;
+  }
+
+  string Name(int node_index, int output_index) const {
+    if (output_index == 0) {
+      return Name(node_index);
+    } else {
+      return strings::StrCat(Name(node_index), ":", output_index);
+    }
+  }
+
+  NodeDef* AddNode(const string& name) {
+    NodeDef* gnode = result_.gdef.add_node();
+    gnode->set_name(name);
+    nodes_.push_back({name, {}, {}});
+    CHECK_EQ(result_.gdef.node_size(), nodes_.size());
+    return gnode;
   }
-  return Status::OK();
-}
+
+  void AddInput(int node_index, int output_node, int output_index) {
+    CHECK_LT(node_index, nodes_.size());
+    nodes_[node_index].data_inputs.push_back(
+        std::make_pair(output_node, output_index));
+  }
+
+  void AddDep(int node_index, int dep_index) {
+    CHECK_LT(node_index, nodes_.size());
+    nodes_[node_index].control_inputs.push_back(dep_index);
+  }
+
+  GetFunctionSignature get_function_;
+  InstantiationResult& result_;
+  // A small index for all names that can be used as a node's input arguments.
+  std::unordered_map<string, NameInfoItem> index_;
+  // This contains information about a node in the new graph including the node
+  // names and input nodes' indexes.
+  struct NodeInfo {
+    string name;
+    // Data inputs where <n, k> means arg k of node n.
+    std::vector<std::pair<int, int>> data_inputs;
+    // Control inputs (dependencies).
+    std::vector<int> control_inputs;
+  };
+  // nodes_[i] is the information about result_.gdef.node(i).
+  std::vector<NodeInfo> nodes_;
+};
 
 // Various helpers Print(proto) to print relevant protos to ascii.
 string Print(const OpDef::ArgDef& arg) {
@@ -407,7 +471,7 @@ string Print(const AttrValue& attr_value) {
     for (auto p : attr_value.func().attr()) {
       entries.push_back(strings::StrCat(p.first, "=", Print(p.second)));
     }
-    sort(entries.begin(), entries.end());
+    std::sort(entries.begin(), entries.end());
     return strings::StrCat(attr_value.func().name(), "[",
                            str_util::Join(entries, ", "), "]");
   }
@@ -423,7 +487,7 @@ string Print(const NodeDef& n) {
     for (auto& a : n.attr()) {
       entries.push_back(strings::StrCat(a.first, "=", Print(a.second)));
     }
-    sort(entries.begin(), entries.end());
+    std::sort(entries.begin(), entries.end());
     strings::StrAppend(&out, "[", str_util::Join(entries, ", "), "]");
   }
   strings::StrAppend(&out, "(");
@@ -501,8 +565,8 @@ string Print(const GraphDef& gdef) {
     TF_CHECK_OK(GetNodeAttr(*y, "index", &yi));
     return xi < yi;
   };
-  sort(arg.begin(), arg.end(), comp);
-  sort(ret.begin(), ret.end(), comp);
+  std::sort(arg.begin(), arg.end(), comp);
+  std::sort(ret.begin(), ret.end(), comp);
   string out;
   strings::StrAppend(&out, "\n(");
   auto get_type = [](const NodeDef& n) {
@@ -516,14 +580,14 @@ string Print(const GraphDef& gdef) {
   for (size_t i = 0; i < arg.size(); ++i) {
     const NodeDef* n = arg[i];
     if (i > 0) strings::StrAppend(&out, ", ");
-    CHECK_EQ(2, n->attr_size());
+    CHECK_GE(n->attr_size(), 2);
     strings::StrAppend(&out, n->name(), ":", get_type(*n));
   }
   strings::StrAppend(&out, ") -> (");
   for (size_t i = 0; i < ret.size(); ++i) {
     const NodeDef* n = ret[i];
     if (i > 0) strings::StrAppend(&out, ", ");
-    CHECK_EQ(2, n->attr_size());
+    CHECK_LE(2, n->attr_size());
     CHECK_EQ(1, n->input_size());
     strings::StrAppend(&out, n->input(0), ":", get_type(*n));
   }
@@ -535,8 +599,9 @@ string Print(const GraphDef& gdef) {
   return out;
 }
 
-Status AddDefaultAttrs(const string& op, GetFunctionSignature get_function,
-                       InstantiateAttrValueMap* attrs) {
+Status AddDefaultAttrs(const string& op,
+                       const GetFunctionSignature& get_function,
+                       AttrValueMap* attrs) {
   const OpDef* op_def = nullptr;
   TF_RETURN_IF_ERROR(get_function(op, &op_def));
   AttrSlice attr_slice(attrs);
@@ -552,41 +617,35 @@ Status AddDefaultAttrs(const string& op, GetFunctionSignature get_function,
 
 }  // end namespace
 
-Status InstantiateFunction(const FunctionDef& fdef,
-                           const InstantiateAttrValueMap& attr_values,
+Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
                            GetFunctionSignature get_function,
                            InstantiationResult* result) {
   VLOG(3) << "Instantiation Function: " << Print(fdef);
 
   const OpDef& sig = fdef.signature();
-  GraphDef* gdef = &result->gdef;
-  gdef->Clear();
-
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
 
-  NameInfoIndex name_info;
+  FunctionInstantiationHelper helper(get_function, result);
   Status s;
   for (const OpDef::ArgDef& arg_def : sig.input_arg()) {
-    s = BuildInputArgIndex(arg_def, attr_values, &name_info, result);
+    s = helper.BuildInputArgIndex(arg_def, attr_values);
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In ", Print(arg_def));
       return s;
     }
   }
 
-  auto substitute = [&attr_values](const string& name, AttrValue* val) {
-    auto iter = attr_values.find(name);
-    if (iter == attr_values.end()) {
-      return false;
-    } else {
-      *val = iter->second;
+  auto substitute = [attr_values](StringPiece name, AttrValue* val) {
+    if (const AttrValue* v = attr_values.Find(name)) {
+      *val = *v;
       return true;
     }
+    return false;
   };
 
   // Makes a copy of all attrs in fdef and substitutes placeholders.
   // After this step, every attr is bound to a concrete value.
-  std::vector<InstantiateAttrValueMap> node_attrs;
+  std::vector<AttrValueMap> node_attrs;
   node_attrs.resize(fdef.node_def_size());
   for (int i = 0; i < fdef.node_def_size(); ++i) {
     for (auto attr : fdef.node_def(i).attr()) {
@@ -603,8 +662,8 @@ Status InstantiateFunction(const FunctionDef& fdef,
   }
 
   for (int i = 0; i < fdef.node_def_size(); ++i) {
-    s = BuildNodeOutputIndex(fdef.node_def(i), node_attrs[i], get_function,
-                             gdef->node_size() + i, &name_info);
+    s = helper.BuildNodeOutputIndex(fdef.node_def(i), AttrSlice(&node_attrs[i]),
+                                    result->gdef.node_size() + i);
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i)));
       return s;
@@ -612,8 +671,7 @@ Status InstantiateFunction(const FunctionDef& fdef,
   }
   // Emits one gdef.node for each fdef.node_def.
   for (int i = 0; i < fdef.node_def_size(); ++i) {
-    s = InstantiateNode(fdef.node_def(i), node_attrs[i], get_function,
-                        name_info, gdef);
+    s = helper.InstantiateNode(fdef.node_def(i), AttrSlice(&node_attrs[i]));
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i)));
       return s;
@@ -623,14 +681,16 @@ Status InstantiateFunction(const FunctionDef& fdef,
   // Emits nodes for the function's return values.
   int ret_index = 0;
   for (const OpDef::ArgDef& ret_def : sig.output_arg()) {
-    s = AddReturnNode(ret_def, attr_values, fdef.ret(), name_info, &ret_index,
-                      result);
+    s = helper.AddReturnNode(ret_def, attr_values, fdef.ret(), &ret_index);
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In function output ", Print(ret_def));
       return s;
     }
   }
 
+  // Adds the actual node inputs using the input indexes.
+  helper.AddNodeInputs();
+
   return Status::OK();
 }
 
@@ -652,14 +712,43 @@ string DebugStringWhole(const GraphDef& gdef) {
   return ret;
 }
 
-string Canonicalize(const string& funcname,
-                    const InstantiateAttrValueMap& attrs) {
+bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) {
+  // NOTE(skyewm): Using MessageDifferencer would be better here, but that is
+  // currently not included in tensorflow/core/platform/default/protobuf.h, so
+  // play fast and loose here.  I don't see anything in OpDef that should allow
+  // multiple equivalent string serializations, with the exception of
+  // AttrValues, which can vary for tensor values (see AreAttrValuesEqual()
+  // comments).
+  string sig1, sig2;
+  f1.signature().SerializeToString(&sig1);
+  f2.signature().SerializeToString(&sig2);
+  if (sig1 != sig2) return false;
+
+  if (f1.attr().size() != f2.attr().size()) return false;
+  for (auto iter1 : f1.attr()) {
+    auto iter2 = f2.attr().find(iter1.first);
+    if (iter2 == f2.attr().end()) return false;
+    if (!AreAttrValuesEqual(iter1.second, iter2->second)) return false;
+  }
+
+  if (!EqualRepeatedNodeDef(f1.node_def(), f2.node_def(), nullptr)) {
+    return false;
+  }
+
+  std::map<string, string> ret1(f1.ret().begin(), f1.ret().end());
+  std::map<string, string> ret2(f2.ret().begin(), f2.ret().end());
+  if (ret1 != ret2) return false;
+
+  return true;
+}
+
+string Canonicalize(const string& funcname, AttrSlice attrs) {
   std::vector<string> entries;
   entries.reserve(attrs.size());
   for (auto p : attrs) {
     entries.push_back(strings::StrCat(p.first, "=", Print(p.second)));
   }
-  sort(entries.begin(), entries.end());
+  std::sort(entries.begin(), entries.end());
   return strings::StrCat(funcname, "[", str_util::Join(entries, ","), "]");
 }
 
@@ -695,7 +784,7 @@ Status FunctionCallFrame::GetRetvals(std::vector<Tensor>* rets) const {
   rets->clear();
   rets->reserve(rets_.size());
   for (size_t i = 0; i < rets_.size(); ++i) {
-    auto item = rets_[i];
+    const auto& item = rets_[i];
     if (item.has_val) {
       rets->push_back(item.val);
     } else {
@@ -705,6 +794,19 @@ Status FunctionCallFrame::GetRetvals(std::vector<Tensor>* rets) const {
   return Status::OK();
 }
 
+Status FunctionCallFrame::ConsumeRetvals(std::vector<Tensor>* rets) {
+  rets->clear();
+  rets->reserve(rets_.size());
+  for (size_t i = 0; i < rets_.size(); ++i) {
+    if (rets_[i].has_val) {
+      rets->emplace_back(std::move(rets_[i].val));
+    } else {
+      return errors::Internal("Retval[", i, "] does not have value");
+    }
+  }
+  return Status::OK();
+}
+
 Status FunctionCallFrame::GetArg(int index, Tensor* val) const {
   if (index < 0 || static_cast<size_t>(index) >= args_.size()) {
     return errors::InvalidArgument("GetArg ", index, " is not within [0, ",
@@ -775,6 +877,12 @@ Status FunctionLibraryDefinition::AddFunctionDef(const FunctionDef& fdef) {
                                    fdef.signature().name(),
                                    " already exists in function library.");
   }
+  const OpDef* op_def;
+  if (default_registry_->LookUpOpDef(fdef.signature().name(), &op_def).ok()) {
+    return errors::InvalidArgument(
+        "Cannot add function '", fdef.signature().name(),
+        "' because an op with the same name already exists.");
+  }
   ptr.reset(new FunctionDefAndOpRegistration(fdef));
   return Status::OK();
 }
@@ -802,6 +910,17 @@ Status FunctionLibraryDefinition::AddLibrary(
   return Status::OK();
 }
 
+Status FunctionLibraryDefinition::AddLibrary(
+    const FunctionDefLibrary& lib_def) {
+  for (const FunctionDef& fdef : lib_def.function()) {
+    TF_RETURN_IF_ERROR(AddFunctionDef(fdef));
+  }
+  for (const GradientDef& grad : lib_def.gradient()) {
+    TF_RETURN_IF_ERROR(AddGradientDef(grad));
+  }
+  return Status::OK();
+}
+
 string FunctionLibraryDefinition::FindGradient(const string& func) const {
   return gtl::FindWithDefault(func_grad_, func, "");
 }
@@ -827,8 +946,7 @@ const FunctionDef* FunctionLibraryDefinition::GetAttrImpl(
   // If ndef is SymbolicGradient[f=Foo], we use Foo's gradient or
   // Foo's attributes.
   const NameAttrList* forward_func_attrs;
-  if (!GetNodeAttr(AttrSlice(&ndef.attr()), kFuncAttr, &forward_func_attrs)
-           .ok()) {
+  if (!GetNodeAttr(ndef, kFuncAttr, &forward_func_attrs).ok()) {
     return nullptr;
   }
   const string& func_name = forward_func_attrs->name();
@@ -855,34 +973,30 @@ FunctionDefLibrary FunctionLibraryDefinition::ToProto() const {
   return lib;
 }
 
-Status InstantiateFunction(const FunctionDef& fdef,
-                           InstantiateAttrValueSlice attr_values,
-                           GetFunctionSignature get_function,
-                           InstantiationResult* result) {
-  InstantiateAttrValueMap m;
-  for (const auto& aval : attr_values) {
-    m.insert({aval.first, aval.second.proto});
+template <typename T>
+Status FunctionLibraryDefinition::GetAttr(const NodeDef& ndef,
+                                          const string& attr, T* value) const {
+  const FunctionDef* fdef = GetAttrImpl(ndef);
+  if (fdef && GetNodeAttr(AttrSlice(&fdef->attr()), attr, value).ok()) {
+    return Status::OK();
   }
-  return InstantiateFunction(fdef, m, get_function, result);
+  return errors::InvalidArgument("Attr ", attr, " is not defined.");
 }
 
-string Canonicalize(const string& funcname, InstantiateAttrValueSlice attrs) {
-  InstantiateAttrValueMap m;
-  for (const auto& aval : attrs) {
-    m.insert({aval.first, aval.second.proto});
-  }
-  return Canonicalize(funcname, m);
+template <typename T>
+Status FunctionLibraryDefinition::GetAttr(const Node& node, const string& attr,
+                                          T* value) const {
+  return GetAttr(node.def(), attr, value);
 }
 
-Status FunctionLibraryRuntime::Instantiate(const string& function_name,
-                                           InstantiateAttrValueSlice attrs,
-                                           Handle* handle) {
-  InstantiateAttrValueMap m;
-  for (const auto& aval : attrs) {
-    m.insert({aval.first, aval.second.proto});
-  }
-  return Instantiate(function_name, m, handle);
-}
+#define GET_ATTR(T)                                                            \
+  template Status FunctionLibraryDefinition::GetAttr(const Node&,              \
+                                                     const string&, T*) const; \
+  template Status FunctionLibraryDefinition::GetAttr(const NodeDef&,           \
+                                                     const string&, T*) const;
+GET_ATTR(string)
+GET_ATTR(bool)
+#undef GET_ATTR
 
 void FunctionDefHelper::AttrValueWrapper::InitFromString(StringPiece val) {
   if (val.size() >= 2 && val[0] == '$') {
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index e27311041fd19db73d397977e09ce86eeff15174..188c3855c6e6243b9d07c65ba9da5c2eec0bced9 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -36,6 +36,7 @@ class CancellationManager;
 class OpKernel;
 class ResourceMgr;
 class ScopedStepContainer;
+class Node;
 
 // FunctionDefHelper::Create is a convenient helper to construct a
 // FunctionDef proto.
@@ -190,11 +191,6 @@ inline FunctionDefHelper::AttrValueWrapper::AttrValueWrapper(StringPiece val) {
 // InstantiateFunction calls "get_function" to find signatures of other
 // functions and primitive ops.
 
-// Placeholders in "fdef" is substituted based on "attr_values" here.
-typedef ::tensorflow::protobuf::Map<string, AttrValue> InstantiateAttrValueMap;
-typedef gtl::ArraySlice<std::pair<string, FunctionDefHelper::AttrValueWrapper>>
-    InstantiateAttrValueSlice;
-
 // GetFunctionSignature(func name, opdef) returns OK if the func name is found
 // and opdef is filled with a pointer to the corresponding signature
 // (a OpDef proto). Otherwise, returns an error.
@@ -206,12 +202,7 @@ struct InstantiationResult {
   DataTypeVector ret_types;
   GraphDef gdef;
 };
-Status InstantiateFunction(const FunctionDef& fdef,
-                           const InstantiateAttrValueMap& attr_values,
-                           GetFunctionSignature get_function,
-                           InstantiationResult* result);
-Status InstantiateFunction(const FunctionDef& fdef,
-                           InstantiateAttrValueSlice attr_values,
+Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
                            GetFunctionSignature get_function,
                            InstantiationResult* result);
 
@@ -230,6 +221,10 @@ string DebugString(const GraphDef& instantiated_func_def);
 // its supporting functions defined in its library).
 string DebugStringWhole(const GraphDef& gdef);
 
+// Returns true if f1 == f2. Compares all fields, including descriptions. Order
+// of NodeDefs doesn't matter.
+bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2);
+
 // Returns a canonicalized string for the instantiation of the
 // function of the given "name" and attributes "attrs".
 //
@@ -237,9 +232,7 @@ string DebugStringWhole(const GraphDef& gdef);
 // space. But it may be change as the implementation
 // evolves. Therefore, it should not be persisted or compared across
 // address spaces.
-string Canonicalize(const string& funcname,
-                    const InstantiateAttrValueMap& attrs);
-string Canonicalize(const string& funcname, InstantiateAttrValueSlice attrs);
+string Canonicalize(const string& funcname, AttrSlice attrs);
 
 // Represents a function call frame. I.e., the data structure used to
 // pass arguments to a function and retrieve its results.
@@ -255,6 +248,7 @@ class FunctionCallFrame {
   // Caller methods.
   Status SetArgs(gtl::ArraySlice<Tensor> args);
   Status GetRetvals(std::vector<Tensor>* rets) const;
+  Status ConsumeRetvals(std::vector<Tensor>* rets);
 
   // Callee methods.
   Status GetArg(int index, Tensor* val) const;
@@ -303,6 +297,9 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // Adds the functions and gradients in 'other' to this function library.
   Status AddLibrary(const FunctionLibraryDefinition& other);
 
+  // Adds the functions and gradients in 'lib_def' to this function library.
+  Status AddLibrary(const FunctionDefLibrary& lib_def);
+
   // If the gradient function for 'func' is specified explicitly in
   // the library, returns the gradient function name.  Otherwise,
   // returns an empty string.
@@ -322,9 +319,16 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // Given a node def 'ndef', inspects attributes of the callee
   // function to derive the attribute 'value' for 'attr'. Returns OK
   // iff the attribute is given by the function's definition.
+  // TODO(irving): Remove; keep only the const Node& version.
   template <typename T>
   Status GetAttr(const NodeDef& ndef, const string& attr, T* value) const;
 
+  // Given a node, inspects attributes of the callee function to derive the
+  // attribute 'value' for 'attr'. Returns OK iff the attribute is given by the
+  // function's definition.
+  template <typename T>
+  Status GetAttr(const Node& node, const string& attr, T* value) const;
+
   // Returns a proto representation of the state of this function library.
   FunctionDefLibrary ToProto() const;
 
@@ -367,11 +371,8 @@ class FunctionLibraryRuntime {
   // Returns OK and fills in "handle" if the instantiation succeeds.
   // Otherwise returns an error and "handle" is undefined.
   typedef uint64 Handle;
-  virtual Status Instantiate(const string& function_name,
-                             const InstantiateAttrValueMap& attrs,
+  virtual Status Instantiate(const string& function_name, AttrSlice attrs,
                              Handle* handle) = 0;
-  Status Instantiate(const string& function_name,
-                     InstantiateAttrValueSlice attrs, Handle* handle);
 
   // Returns the function body for the instantiated function given its
   // handle 'h'. Returns nullptr if "h" is not found.
@@ -498,17 +499,15 @@ bool RegisterOp(const string& op, Creator func);
 Status GetOpGradientCreator(const string& op, Creator* creator);
 };
 
-// Implementation details.
-
-template <typename T>
-Status FunctionLibraryDefinition::GetAttr(const NodeDef& ndef,
-                                          const string& attr, T* value) const {
-  const FunctionDef* fdef = GetAttrImpl(ndef);
-  if (fdef && GetNodeAttr(AttrSlice(&fdef->attr()), attr, value).ok()) {
-    return Status::OK();
-  }
-  return errors::InvalidArgument("Attr ", attr, " is not defined.");
-}
+// Declare explicit instantiations of GetAttr
+#define GET_ATTR(T)                                          \
+  extern template Status FunctionLibraryDefinition::GetAttr( \
+      const Node&, const string&, T*) const;                 \
+  extern template Status FunctionLibraryDefinition::GetAttr( \
+      const NodeDef&, const string&, T*) const;
+GET_ATTR(string)
+GET_ATTR(bool)
+#undef GET_ATTR
 
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 414b0979978e98dcf494309b4ab3278d213f4edf..f3ad935c7877f0c2d42140467b4df5bb5f66cb45 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -29,6 +29,24 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+namespace {
+
+// A helper class to make AttrSlice from initializer lists
+class Attrs {
+ public:
+  Attrs(const std::initializer_list<  // NOLINT(runtime/explicit)
+        std::pair<string, FunctionDefHelper::AttrValueWrapper>>
+            attrs) {
+    for (const auto& aval : attrs) {
+      map_.insert({aval.first, aval.second.proto});
+    }
+  }
+
+  operator AttrSlice() { return AttrSlice(&map_); }  // NOLINT(runtime/explicit)
+
+ private:
+  AttrValueMap map_;
+};
 
 typedef FunctionDefHelper FDH;
 
@@ -46,8 +64,6 @@ y: A scalar in type T.
 
 )doc");
 
-static InstantiateAttrValueMap kNoAttrs;
-
 TEST(TFunc, SquarePlusOne) {
   auto fdef = FDH::Create(
       // Name
@@ -81,12 +97,13 @@ SquarePlusOne[T:{float, double, int32, int64}](x:T) -> (y:T) {
 
   // Instantiate one with T=float
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, {{"T", DT_FLOAT}}, GetOpSig, &result));
+  TF_ASSERT_OK(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result));
   const char* e2 = R"P(
-(n0:float) -> (n3:float) {
-  n1 = Square[T=float](n0)
-  n2 = One[T=float]()
-  n3 = Add[T=float](n1, n2)
+(x:float) -> (y:float) {
+  a = Square[T=float](x)
+  o = One[T=float]()
+  y = Add[T=float](a, o)
 }
 )P";
   EXPECT_EQ(result.arg_types, DataTypeVector({DT_FLOAT}));
@@ -126,12 +143,13 @@ ControlDep(x:int32) -> (y:int32) {
 
   // Instantiate one with T=float
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, {{"T", DT_FLOAT}}, GetOpSig, &result));
+  TF_ASSERT_OK(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result));
   const char* e2 = R"P(
-(n0:int32) -> (n3:int32) {
-  n1 = Identity[T=int32](n0)
-  n2 = NoOp() @ n1
-  n3 = Identity[T=int32](n1) @ n2
+(x:int32) -> (y:int32) {
+  a = Identity[T=int32](x)
+  o = NoOp() @ a
+  y = Identity[T=int32](a) @ o
 }
 )P";
   EXPECT_EQ(result.arg_types, DataTypeVector({DT_INT32}));
@@ -145,7 +163,7 @@ REGISTER_OP("HasDefaultType")
 
 // This verifies that a function using an op before a type attr (with
 // a default) is added, still works.  This is important for backwards
-// compatibilty.
+// compatibility.
 TEST(TFunc, MissingTypeAttr) {
   auto fdef = FDH::Create(
       // Name
@@ -171,12 +189,11 @@ BackCompat() -> (y:float) {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(
-      InstantiateFunction(fdef, InstantiateAttrValueMap{}, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   // Should get T=float from Op's default.
   const char* e2 = R"P(
-() -> (n0:float) {
-  n0 = HasDefaultType[T=float]()
+() -> (a:float) {
+  a = HasDefaultType[T=float]()
 }
 )P";
   EXPECT_EQ(result.arg_types, DataTypeVector());
@@ -209,10 +226,10 @@ NTimesT(x:float, y:float) -> (z:float) {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   const char* e2 = R"P(
-(n0:float, n1:float) -> (n2:float) {
-  n2 = AddN[N=2, T=float](n0, n1)
+(x:float, y:float) -> (a:float) {
+  a = AddN[N=2, T=float](x, y)
 }
 )P";
   EXPECT_EQ(result.arg_types, DataTypeVector({DT_FLOAT, DT_FLOAT}));
@@ -272,12 +289,12 @@ AddSquared[N:int, T:{float, double, int32, int64}](x:N*T) -> (y:T) {
 
   // Instantiate one with T=float
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, {{"N", 3}, {"T", DT_FLOAT}}, GetOpSig,
-                                   &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, Attrs({{"N", 3}, {"T", DT_FLOAT}}),
+                                   GetOpSig, &result));
   const char* e2 = R"P(
-(n0:float, n1:float, n2:float) -> (n4:float) {
-  n3 = Map[N=3, T=float, U=float, func=Square[T=float]](n0, n1, n2)
-  n4 = AddN[N=3, T=float](n3, n3:1, n3:2)
+(x_0:float, x_1:float, x_2:float) -> (y:float) {
+  a = Map[N=3, T=float, U=float, func=Square[T=float]](x_0, x_1, x_2)
+  y = AddN[N=3, T=float](a, a:1, a:2)
 }
 )P";
   EXPECT_EQ(result.arg_types, DataTypeVector({DT_FLOAT, DT_FLOAT, DT_FLOAT}));
@@ -315,14 +332,14 @@ ControlDeps(x:float) -> () {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   const char* e2 = R"P(
-(n0:float) -> () {
-  n1 = One[T=float]() @ n0
-  n2 = NoOp() @ n1
-  n3 = One[T=float]() @ n2
-  n4 = NoOp() @ n3
-  n5 = One[T=float]() @ n1, n4
+(x:float) -> () {
+  a = One[T=float]() @ x
+  u = NoOp() @ a
+  b = One[T=float]() @ u
+  v = NoOp() @ b
+  c = One[T=float]() @ a, v
 }
 )P";
   EXPECT_EQ(result.arg_types, DataTypeVector({DT_FLOAT}));
@@ -395,15 +412,15 @@ Test(i:float) -> (o:float) {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   const char* e2 = R"P(
-(n0:float) -> (n6:float) {
-  n1 = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
-  n2 = Split[T=float, num_split=4](n1, n0)
-  n3 = Mul[T=float](n2, n2:1)
-  n4 = Mul[T=float](n2:2, n2:3)
-  n5 = _ListToArray[N=2, T=float, Tin={float, float}](n3, n4)
-  n6 = AddN[N=2, T=float](n5, n5:1)
+(i:float) -> (o:float) {
+  zero = Const[dtype=int32, value=Tensor<type: int32 shape: [] values: 0>]()
+  s = Split[T=float, num_split=4](zero, i)
+  l = Mul[T=float](s, s:1)
+  r = Mul[T=float](s:2, s:3)
+  x = _ListToArray[N=2, T=float, Tin={float, float}](l, r)
+  o = AddN[N=2, T=float](x, x:1)
 }
 )P";
   EXPECT_EQ(result.arg_types, DataTypeVector({DT_FLOAT}));
@@ -467,11 +484,11 @@ MySelect(x:float) -> (z:float) {
   EXPECT_EQ(DebugString(fdef), e);
 
   InstantiationResult result;
-  TF_ASSERT_OK(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result));
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   const char* e2 = R"P(
-(n0:float) -> (n2:float) {
-  n1 = Cond[Tin={float}, cond=MyCond, else_branch=MyElse, out_types={float}, then_branch=MyThen](n0)
-  n2 = Cond[Tin={float, float}, cond=MyCond2, else_branch=MyElse2, out_types={float}, then_branch=MyThen2](n1, n1)
+(x:float) -> (z:float) {
+  y = Cond[Tin={float}, cond=MyCond, else_branch=MyElse, out_types={float}, then_branch=MyThen](x)
+  z = Cond[Tin={float, float}, cond=MyCond2, else_branch=MyElse2, out_types={float}, then_branch=MyThen2](y, y)
 }
 )P";
   EXPECT_EQ(result.arg_types, DataTypeVector({DT_FLOAT}));
@@ -488,8 +505,9 @@ TEST(InstantiateErrors, Not_Sufficient_Attrs) {
   auto fdef =
       FDH::Define("nop", {}, {}, {"T:{float, double, int32, int64}"}, {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, {{"U", DT_FLOAT}}, GetOpSig, &result),
-           "Attr T is not found from ");
+  HasError(
+      InstantiateFunction(fdef, Attrs({{"U", DT_FLOAT}}), GetOpSig, &result),
+      "Attr T is not found from ");
 }
 
 #if 0  // TODO(josh11b): Enable this test once having an extra attr is an error.
@@ -497,7 +515,7 @@ TEST(InstantiateErrors, Too_Many_Attrs) {
   auto fdef =
       FDH::Define("nop", {}, {}, {"T:{float, double, int32, int64}"}, {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, {{"T", DT_INT32}, {"U", DT_FLOAT}},
+  HasError(InstantiateFunction(fdef, Attrs({{"T", DT_INT32}, {"U", DT_FLOAT}}),
                                GetOpSig, &result),
            "Attr U is not found in ");
 }
@@ -508,7 +526,7 @@ TEST(InstantiateErrors, AttrValue_Value_Placeholder) {
       FDH::Define("nop", {}, {}, {"T:{float, double, int32, int64}"}, {});
   InstantiationResult result;
   HasError(
-      InstantiateFunction(fdef, {{"T", "$bad"}}, GetOpSig, &result),
+      InstantiateFunction(fdef, Attrs({{"T", "$bad"}}), GetOpSig, &result),
       "AttrValue had value with unexpected type 'placeholder'\n\tfor attr 'T'");
 }
 
@@ -518,14 +536,15 @@ TEST(InstantiateErrors, Unbounded_Attr) {
                               {{"a"}, "One", {}, {{"T", "$unknown"}}, {"x"}},
                           });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, {{"T", DT_FLOAT}}, GetOpSig, &result),
-           "Failed to bind all placeholders");
+  HasError(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result),
+      "Failed to bind all placeholders");
 }
 
 TEST(InstantiateErrors, DupArgs) {
   auto fdef = FDH::Define("test", {"x:float", "x:float"}, {}, {}, {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Duplicated arg name");
 }
 
@@ -536,7 +555,7 @@ TEST(InstantiateErrors, Dup_Node_Names) {
                               {{"y"}, "One", {}, {{"T", DT_FLOAT}}},
                           });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Duplicated ret name");
 }
 
@@ -547,7 +566,7 @@ TEST(InstantiateErrors, Node_Arg_Notfound) {
                           },
                           {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input z is not found");
 }
 
@@ -557,7 +576,7 @@ TEST(InstantiateErrors, Node_Arg_TypeMismatch) {
                               {{"y"}, "Add", {"x", "x"}, {{"T", DT_INT32}}},
                           });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input x[0] expected type int32 != float, the type of x[0]");
 }
 
@@ -568,7 +587,7 @@ TEST(InstantiateErrors, Node_Arg_ControlMissing) {
                       {{"y"}, "Add", {"x", "x"}, {{"T", DT_FLOAT}}, {"z"}},
                   });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input[2] == '^z', is not found.");
 }
 
@@ -579,7 +598,7 @@ TEST(InstantiateErrors, FuncRet_Missing) {
                           },
                           {});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Return y missing");
 }
 
@@ -590,7 +609,7 @@ TEST(InstantiateErrors, FuncRet_NotFound) {
                           },
                           {{"y", "z"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Return y -> z is not found");
 }
 
@@ -601,7 +620,7 @@ TEST(InstantiateErrors, FuncRet_NameMismatch) {
                           },
                           {{"z", "x:y:0"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Return y missing");
 }
 
@@ -613,7 +632,7 @@ TEST(InstantiateErrors, FuncRet_NameMismatch) {
 //                           },
 //                           {{"y", "x:y:0"}, {"z", "x:y:0"}});
 //   InstantiationResult result;
-//   HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+//   HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
 //            "ret is not found");
 // }
 
@@ -623,7 +642,7 @@ TEST(InstantiateErrors, FuncRet_TypeMismatch) {
                               {{"y"}, "One", {}, {{"T", DT_DOUBLE}}},
                           });
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Invalid ret types y : float vs. double\n\tIn function output y");
 }
 
@@ -649,7 +668,7 @@ TEST(InstantiateErrors, TypeList_Missing_Retval_Attr) {
       },
       {{"y", "y:output"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "type attr not found: out_types");
 }
 
@@ -676,7 +695,7 @@ TEST(InstantiateErrors, TypeList_Num_Retval_Mismatch) {
       },
       {{"y", "y:output"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Invalid ret types");
 }
 
@@ -703,7 +722,7 @@ TEST(InstantiateErrors, TypeList_Missing_Arg) {
       },
       {{"y", "y:output"}});
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input unknown is not found");
 }
 
@@ -724,7 +743,7 @@ TEST(InstantiateErrors, TooManyInputs) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Expected input[2] == 'x' to be a control input.");
 }
 
@@ -745,7 +764,7 @@ TEST(InstantiateErrors, TooFewInputs) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Attempt to access beyond input size: 2 >= 2");
 }
 
@@ -773,7 +792,7 @@ TEST(InstantiateErrors, TooManyInputsFromArray1) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Expected input[1] == 'y' to be a control input.");
 }
 
@@ -801,7 +820,7 @@ TEST(InstantiateErrors, TooManyInputsFromArray2) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "Input a:output too long for inputs");
 }
 
@@ -822,7 +841,7 @@ TEST(InstantiateErrors, TypeMismatch) {
       {{"z", "a:sum:0"}});
 
   InstantiationResult result;
-  HasError(InstantiateFunction(fdef, kNoAttrs, GetOpSig, &result),
+  HasError(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result),
            "input inputs[1] expected type float != int32, the type of y[0]");
 }
 
@@ -874,17 +893,17 @@ TEST(FunctionCallFrame, Float_Float_Float) {
 }
 
 TEST(Canonicalize, Basic) {
-  EXPECT_EQ(Canonicalize("MatMul", {{"T", DT_FLOAT},
-                                    {"transpose_a", false},
-                                    {"transpose_b", false}}),
+  EXPECT_EQ(Canonicalize("MatMul", Attrs({{"T", DT_FLOAT},
+                                          {"transpose_a", false},
+                                          {"transpose_b", false}})),
             "MatMul[T=float,transpose_a=false,transpose_b=false]");
-  EXPECT_EQ(Canonicalize("MatMul", {{"T", DT_FLOAT},
-                                    {"transpose_b", false},
-                                    {"transpose_a", false}}),
+  EXPECT_EQ(Canonicalize("MatMul", Attrs({{"T", DT_FLOAT},
+                                          {"transpose_b", false},
+                                          {"transpose_a", false}})),
             "MatMul[T=float,transpose_a=false,transpose_b=false]");
-  EXPECT_EQ(Canonicalize("MatMul", {{"T", DT_DOUBLE},
-                                    {"transpose_b", true},
-                                    {"transpose_a", false}}),
+  EXPECT_EQ(Canonicalize("MatMul", Attrs({{"T", DT_DOUBLE},
+                                          {"transpose_b", true},
+                                          {"transpose_a", false}})),
             "MatMul[T=double,transpose_a=false,transpose_b=true]");
 }
 
@@ -944,6 +963,15 @@ TEST(FunctionLibraryDefinitionTest, AddFunctionDef) {
   ASSERT_NE(second, nullptr);
   EXPECT_EQ(second->DebugString(),
             test::function::WXPlusB().signature().DebugString());
+
+  // Can't add function with same name as existing op
+  FunctionDef fdef = test::function::XTimesTwo();
+  fdef.mutable_signature()->set_name("Add");
+  Status s = lib_def.AddFunctionDef(fdef);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(s.error_message(),
+            "Cannot add function 'Add' because an op with the same name "
+            "already exists.");
 }
 
 TEST(FunctionLibraryDefinitionTest, AddGradientDef) {
@@ -993,7 +1021,7 @@ TEST(FunctionLibraryDefinitionTest, AddLibrary) {
   EXPECT_EQ(s.error_message(),
             "Gradient for function 'XTimesTwo' already exists.");
 
-  // No conflicing functions or gradients OK
+  // No conflicting functions or gradients OK
   proto.Clear();
   *proto.add_function() = test::function::XTimesFour();
   grad.set_function_name(test::function::XTimes16().signature().name());
@@ -1107,4 +1135,37 @@ TEST(FunctionLibraryDefinitionTest, GetAttr_Gradient) {
   EXPECT_EQ(annotation, false);  // WXPlusB has no custom gradient.
 }
 
+// TODO(skyewm): this could be more thorough
+TEST(FunctionDefsEqualTest, TestFunctionDefsEqual) {
+  // Equal functions
+  FunctionDef fdef1 = test::function::XTimesTwo();
+  FunctionDef fdef2 = test::function::XTimesTwo();
+  EXPECT_TRUE(FunctionDefsEqual(fdef1, fdef2));
+
+  // Different functions
+  fdef2 = test::function::XTimesFour();
+  EXPECT_FALSE(FunctionDefsEqual(fdef1, fdef2));
+
+  // Different signatures
+  fdef2 = test::function::XTimesTwo();
+  fdef2.mutable_signature()->mutable_input_arg(0)->set_name("foo");
+  EXPECT_FALSE(FunctionDefsEqual(fdef1, fdef2));
+
+  // Descriptions must be equal
+  fdef2 = test::function::XTimesTwo();
+  fdef2.mutable_signature()->mutable_input_arg(0)->set_description("foo");
+  EXPECT_FALSE(FunctionDefsEqual(fdef1, fdef2));
+
+  // Different NodeDefs
+  fdef2 = test::function::XTimesTwo();
+  *fdef2.add_node_def() = fdef2.node_def(0);
+  EXPECT_FALSE(FunctionDefsEqual(fdef1, fdef2));
+
+  // Different return values
+  fdef2 = test::function::XTimesTwo();
+  (*fdef2.mutable_ret())["y"] = "y:z:1";  // originally is "y:z:0"
+  EXPECT_FALSE(FunctionDefsEqual(fdef1, fdef2));
+}
+
+}  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index fb1ad0102f6b66f131f8913ab646e87f2597a161..e45f156e1e590187e7b01a19892e418d663bf13d 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -126,25 +126,33 @@ FunctionDef XTimes16() {
       {{"y", "y:y:0"}});
 }
 
-FunctionDef WXPlusB() {
-  return FDH::Define(
-      // Name
-      "WXPlusB",
-      // Args
-      {"w: T", "x: T", "b: T"},
-      // Return values
-      {"y: T"},
-      // Attr def
-      {"T: {float, double}"},
-      // Nodes
-      {{{"mm"},
-        "MatMul",
-        {"w", "x"},
-        {{"T", "$T"},
-         {"transpose_a", false},
-         {"transpose_b", false},
+FunctionDef WXPlusB(){return FDH::Define(
+    // Name
+    "WXPlusB",
+    // Args
+    {"w: T", "x: T", "b: T"},
+    // Return values
+    {"y: T"},
+    // Attr def
+    {"T: {float, double}"},
+    // Nodes
+    {
+      {{"mm"},
+       "MatMul",
+       {"w", "x"},
+       {
+           {"T", "$T"}, {"transpose_a", false}, {"transpose_b", false},
+#ifdef INTEL_MKL
+       }},
+#else
          {"_kernel", "eigen"}}},
-       {{"y"}, "Add", {"mm", "b"}, {{"T", "$T"}}}});
+#endif
+      {
+        {"y"}, "Add", {"mm", "b"}, {
+          { "T", "$T" }
+        }
+      }
+    });
 }
 
 FunctionDef Swap() {
diff --git a/tensorflow/core/framework/graph_def_util_test.cc b/tensorflow/core/framework/graph_def_util_test.cc
index 8c76a74a4a524aae7cd50ed7b6a22ed852148d86..1ac322e48e2e6a9a572d8e85b01e166fc7e36f74 100644
--- a/tensorflow/core/framework/graph_def_util_test.cc
+++ b/tensorflow/core/framework/graph_def_util_test.cc
@@ -28,7 +28,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-Status FinalizeOpDef(OpDefBuilder b, OpDef* op_def) {
+Status FinalizeOpDef(const OpDefBuilder& b, OpDef* op_def) {
   OpRegistrationData op_reg_data;
   const Status s = b.Finalize(&op_reg_data);
   *op_def = op_reg_data.op_def;
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index 14d8d91490e2e5e056f6023bbb517fd5d234f66b..c1dde1504a7cf647455c174b659bab3fb3792789 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/memory_types.h"
 
+#include <utility>
+
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -64,7 +66,7 @@ MemoryType MTypeFromDType(const DataType dtype) {
 }  // namespace
 
 Status MemoryTypesForNode(const OpRegistryInterface* op_registry,
-                          DeviceType device_type, const NodeDef& ndef,
+                          const DeviceType& device_type, const NodeDef& ndef,
                           MemoryTypeVector* inp_mtypes,
                           MemoryTypeVector* out_mtypes) {
   // Look up the Op registered for this op name.
diff --git a/tensorflow/core/framework/memory_types.h b/tensorflow/core/framework/memory_types.h
index 3d4ca7597a43b29f2f0f53e287ee4bd705edb0a7..e35e22f5907b099afa8722e291fe408cf9c96fc5 100644
--- a/tensorflow/core/framework/memory_types.h
+++ b/tensorflow/core/framework/memory_types.h
@@ -28,7 +28,7 @@ namespace tensorflow {
 // REQUIRES: * '*_memory_types' is not nullptr.
 //           * def has all attrs specified (e.g. using AddDefaultsToNodeDef()).
 Status MemoryTypesForNode(const OpRegistryInterface* op_registry,
-                          DeviceType device_type, const NodeDef& ndef,
+                          const DeviceType& device_type, const NodeDef& ndef,
                           MemoryTypeVector* input_memory_types,
                           MemoryTypeVector* output_memory_types);
 
diff --git a/tensorflow/core/framework/node_def_builder_test.cc b/tensorflow/core/framework/node_def_builder_test.cc
index 196e5e46edbe5523f23ac9c1315564874944fcd2..e836873f667a6971b2c12d44860e5436a04cb93c 100644
--- a/tensorflow/core/framework/node_def_builder_test.cc
+++ b/tensorflow/core/framework/node_def_builder_test.cc
@@ -208,9 +208,8 @@ TEST_F(NodeDefBuilderTest, OpDoesNotExist) {
       .ControlInput("y")
       .Attr("foo", 12)
       .Device("device");
-  ExpectFailure(
-      builder,
-      "Op type not registered 'Op Does Not Exist' while building NodeDef 'n'");
+  ExpectFailures(builder, {"Op type not registered 'Op Does Not Exist'",
+                           "while building NodeDef 'n'"});
 }
 
 TEST_F(NodeDefBuilderTest, Polymorphic) {
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 7e8ad507172b540cc009aa804485e2174fd0dd66..9b737e1f72d26f0c1db64553e24df65575d4b5b4 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def.pb_text.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/tensor.pb_text.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -36,18 +37,23 @@ namespace tensorflow {
 const char* const kColocationAttrName = "_class";
 const char* const kColocationGroupPrefix = "loc:@";
 
+AttrSlice::AttrSlice() : ndef_(nullptr) {
+  static const AttrValueMap* const kEmptyAttrValueMap = new AttrValueMap;
+  attrs_ = kEmptyAttrValueMap;
+}
+
 AttrSlice::AttrSlice(const NodeDef& node_def)
     : ndef_(&node_def), attrs_(&ndef_->attr()) {}
 
 AttrSlice::AttrSlice(const AttrValueMap* a) : ndef_(nullptr), attrs_(a) {}
 
-string SummarizeNodeDef(const NodeDef& node_def) {
-  string ret = strings::StrCat(node_def.name(), " = ", node_def.op(), "[");
+static string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device) {
+  string ret;
 
   // We sort the attrs so the output is deterministic.
   std::vector<string> attr_names;
-  attr_names.reserve(node_def.attr().size());
-  for (const auto& attr : node_def.attr()) {
+  attr_names.reserve(attrs.size());
+  for (const auto& attr : attrs) {
     attr_names.push_back(attr.first);
   }
   std::sort(attr_names.begin(), attr_names.end());
@@ -55,20 +61,34 @@ string SummarizeNodeDef(const NodeDef& node_def) {
   for (const string& attr_name : attr_names) {
     if (!first) strings::StrAppend(&ret, ", ");
     first = false;
-    auto iter = node_def.attr().find(attr_name);
-    strings::StrAppend(&ret, attr_name, "=", SummarizeAttrValue(iter->second));
+    strings::StrAppend(&ret, attr_name, "=",
+                       SummarizeAttrValue(*attrs.Find(attr_name)));
   }
 
   // Consider the device to be a final attr with name "_device".
-  if (!node_def.device().empty()) {
+  if (!device.empty()) {
     if (!first) strings::StrAppend(&ret, ", ");
     first = false;
-    strings::StrAppend(&ret, "_device=\"", node_def.device(), "\"");
+    strings::StrAppend(&ret, "_device=\"", device, "\"");
   }
+  return ret;
+}
+
+string AttrSlice::SummarizeNode() const {
+  return ndef_ ? SummarizeNodeDef(*ndef_)
+               : strings::StrCat(
+                     "[", SummarizeAttrsHelper(*this, StringPiece()), "]");
+}
+
+string SummarizeNode(const Node& node) { return SummarizeNodeDef(node.def()); }
+
+string SummarizeNodeDef(const NodeDef& node_def) {
+  string ret = strings::StrCat(node_def.name(), " = ", node_def.op(), "[");
+  strings::StrAppend(&ret, SummarizeAttrsHelper(node_def, node_def.device()));
   strings::StrAppend(&ret, "](");
 
   // Output inputs, including control inputs, verbatim.
-  first = true;
+  bool first = true;
   for (const string& input : node_def.input()) {
     if (!first) strings::StrAppend(&ret, ", ");
     first = false;
@@ -79,9 +99,24 @@ string SummarizeNodeDef(const NodeDef& node_def) {
 }
 
 const AttrValue* AttrSlice::Find(StringPiece attr_name) const {
-  auto iter = attrs_->find(attr_name.ToString());
-  if (iter == attrs_->end()) return nullptr;
-  return &iter->second;
+  // Currently, the collection used for NodeDef::attr() (google::protobuf::Map)
+  // requires that the keys used for lookups have type 'const string&'. Because
+  // this method takes a StringPiece, it is necessary to allocate a temporary
+  // string, copy attr_name to it, and then use that temporary string for the
+  // lookup. This causes an excessive number of short-lived allocations, and for
+  // large graphs, this can be a significant cost.
+  //
+  // Because most nodes have a small number of attributes, a simple linear scan
+  // is generally more efficient than a hashed lookup.  If google::protobuf::Map
+  // changes so that it supports efficient lookups using StringPiece instead of
+  // const string&, then this code could be changed to use attrs_->find() again.
+
+  for (const auto& attr : *attrs_) {
+    if (attr.first == attr_name) {
+      return &attr.second;
+    }
+  }
+  return nullptr;
 }
 
 Status AttrSlice::Find(StringPiece attr_name,
@@ -94,12 +129,28 @@ Status AttrSlice::Find(StringPiece attr_name,
   // Skip AttachDef for internal attrs since it is a little bit
   // expensive and it is common for them to correctly not be included
   // in a NodeDef.
-  if (!StringPiece(attr_name).starts_with("_") && ndef_) {
+  if (!attr_name.starts_with("_") && ndef_ != nullptr) {
     s = AttachDef(s, *ndef_);
   }
   return s;
 }
 
+bool AttrSlice::EqualAttrs(AttrSlice other, Scratch* scratch) const {
+  if (size() != other.size()) return false;
+
+  for (const auto& attr : *other.attrs_) {
+    auto iter = attrs_->find(attr.first);
+    if (iter == attrs_->end()) return false;
+    // TODO(irving): Comparing AttrValues by proto is slightly buggy, since
+    // TensorProto is a nonunique representation of Tensor.  This bug will go
+    // away once AttrSlice switches over to NodeInfo.
+    iter->second.SerializeToString(&scratch->a);
+    attr.second.SerializeToString(&scratch->b);
+    if (scratch->a != scratch->b) return false;
+  }
+  return true;
+}
+
 // The ... is to allow the caller to inject some value validation code.  Use
 // just ; if no additional validation code is needed.
 #define DEFINE_GET_ATTR(TYPE, FIELD, ATTR_TYPE, APPEND_OP, CAST, ...)         \
@@ -125,7 +176,41 @@ Status AttrSlice::Find(StringPiece attr_name,
     return Status::OK();                                                      \
   }
 
+#define DEFINE_GET_ATTR_SIMPLE(TYPE, FIELD, ATTR_TYPE, APPEND_OP, CAST, ...) \
+  bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,      \
+                         TYPE* value) {                                      \
+    const AttrValue* attr_value = attrs.Find(attr_name);                     \
+    if (attr_value == nullptr) {                                             \
+      return false;                                                          \
+    }                                                                        \
+    Status s = AttrValueHasType(*attr_value, ATTR_TYPE);                     \
+    if (!s.ok()) {                                                           \
+      return false;                                                          \
+    }                                                                        \
+    const auto& v = attr_value->FIELD();                                     \
+    __VA_ARGS__;                                                             \
+    *value = CAST;                                                           \
+    return true;                                                             \
+  }                                                                          \
+  bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,      \
+                         std::vector<TYPE>* value) {                         \
+    const AttrValue* attr_value = attrs.Find(attr_name);                     \
+    if (attr_value == nullptr) {                                             \
+      return false;                                                          \
+    }                                                                        \
+    Status s = AttrValueHasType(*attr_value, "list(" ATTR_TYPE ")");         \
+    if (!s.ok()) {                                                           \
+      return false;                                                          \
+    }                                                                        \
+    for (const auto& v : attr_value->list().FIELD()) {                       \
+      __VA_ARGS__;                                                           \
+      value->APPEND_OP(CAST);                                                \
+    }                                                                        \
+    return true;                                                             \
+  }
+
 DEFINE_GET_ATTR(string, s, "string", emplace_back, v, ;)
+DEFINE_GET_ATTR_SIMPLE(string, s, "string", emplace_back, v, ;)
 DEFINE_GET_ATTR(int64, i, "int", emplace_back, v, ;)
 DEFINE_GET_ATTR(int32, i, "int", emplace_back, static_cast<int32>(v),
                 if (static_cast<int64>(static_cast<int32>(v)) != v) {
@@ -156,6 +241,20 @@ DEFINE_GET_ATTR(Tensor, tensor, "tensor", emplace_back, t, Tensor t;
 
 #undef DEFINE_GET_ATTR
 
+static const string& kEmptyString = *new string();
+
+const string& GetNodeAttrString(const AttrSlice& attrs, StringPiece attr_name) {
+  const AttrValue* attr_value = attrs.Find(attr_name);
+  if (attr_value == nullptr) {
+    return kEmptyString;
+  }
+  Status s = AttrValueHasType(*attr_value, "string");
+  if (!s.ok()) {
+    return kEmptyString;
+  }
+  return attr_value->s();
+}
+
 Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name,
                    DataTypeVector* value) {
   const AttrValue* attr_value;
@@ -278,14 +377,14 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
     if (StringPiece(input).starts_with("^")) {
       seen_control = true;
       if (input.find(':') != string::npos) {
-        return errors::InvalidArgument("Control input '", input,
-                                       "' must not have ':' in NodeDef: ",
-                                       SummarizeNodeDef(node_def));
+        return errors::InvalidArgument(
+            "Control input '", input,
+            "' must not have ':' in NodeDef: ", SummarizeNodeDef(node_def));
       }
     } else if (seen_control) {
-      return errors::InvalidArgument("Non-control input '", input,
-                                     "' after control input in NodeDef: ",
-                                     SummarizeNodeDef(node_def));
+      return errors::InvalidArgument(
+          "Non-control input '", input,
+          "' after control input in NodeDef: ", SummarizeNodeDef(node_def));
     } else {
       ++num_inputs;
     }
@@ -295,8 +394,8 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   for (const auto& attr : op_def.attr()) {
     if (!gtl::InsertIfNotPresent(&op_attrs, attr.name(), &attr)) {
       return errors::InvalidArgument("OpDef has duplicate attr name '",
-                                     attr.name(), "': ",
-                                     SummarizeOpDef(op_def));
+                                     attr.name(),
+                                     "': ", SummarizeOpDef(op_def));
     }
   }
   for (const auto& attr : node_def.attr()) {
@@ -320,8 +419,9 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
           "with your GraphDef-generating binary.).");
     }
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        ValidateAttrValue(attr.second, *iter->second), "; NodeDef: ",
-        SummarizeNodeDef(node_def), "; ", SummarizeOpDef(op_def));
+        ValidateAttrValue(attr.second, *iter->second),
+        "; NodeDef: ", SummarizeNodeDef(node_def), "; ",
+        SummarizeOpDef(op_def));
     // Keep track of which attr names have (not) been found in the NodeDef.
     op_attrs.erase(iter);
   }
@@ -368,9 +468,9 @@ Status ComputeArgRange(const NodeDef& node_def, const OpDef::ArgDef& arg_def,
   } else if (!arg_def.type_attr().empty() || arg_def.type() != DT_INVALID) {
     *num = 1;
   } else {
-    return errors::InvalidArgument("Argument '", arg_def.name(),
-                                   "' incorrectly specified in op definition: ",
-                                   SummarizeOpDef(op_def));
+    return errors::InvalidArgument(
+        "Argument '", arg_def.name(),
+        "' incorrectly specified in op definition: ", SummarizeOpDef(op_def));
   }
   return Status::OK();
 }
@@ -402,6 +502,11 @@ Status NameRangesForNode(const NodeDef& node_def, const OpDef& op_def,
   return Status::OK();
 }
 
+Status NameRangesForNode(const Node& node, const OpDef& op_def,
+                         NameRangeMap* inputs, NameRangeMap* outputs) {
+  return NameRangesForNode(node.def(), op_def, inputs, outputs);
+}
+
 void AddDefaultsToNodeDef(const OpDef& op_def, NodeDef* node_def) {
   for (const auto& attr_def : op_def.attr()) {
     AttrSlice attrs(*node_def);
@@ -502,4 +607,8 @@ Status AttachDef(const Status& status, const NodeDef& node_def) {
   return ret;
 }
 
+Status AttachDef(const Status& status, const Node& node) {
+  return AttachDef(status, node.def());
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 5c4d2272682de59221002e1b863007efdff6a321..1438abdec606442246baba00cc6ca818c4cec7d5 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -29,6 +29,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+class Node;
+
 // Name of the attribute used to encode node colocation constraints.
 //
 // Nodes can be co-located on the same device. Desire for explicit co-location
@@ -39,8 +41,9 @@ extern const char* const kColocationAttrName;
 // String prefix applied to the operation name for colocation constraints.
 extern const char* const kColocationGroupPrefix;
 
-// Produce a human-readable version of a NodeDef that is more concise
+// Produce a human-readable version of a Node or NodeDef that is more concise
 // than a text-format proto.
+string SummarizeNode(const Node& node);
 string SummarizeNodeDef(const NodeDef& node_def);
 
 typedef protobuf::Map<string, AttrValue> AttrValueMap;
@@ -78,8 +81,11 @@ class AttrSlice {
  public:
   AttrSlice(const NodeDef& node_def);  // NOLINT(runtime/explicit)
 
+  AttrSlice();  // Empty
   explicit AttrSlice(const AttrValueMap* a);
 
+  int size() const { return attrs_->size(); }
+
   // Returns the attr with attr_name if found.  Otherwise, returns
   // nullptr.
   const AttrValue* Find(StringPiece attr_name) const;
@@ -88,6 +94,33 @@ class AttrSlice {
   // NotFound status.
   Status Find(StringPiece attr_name, const AttrValue** attr_value) const;
 
+  // Helper class to avoid allocations in EqualAttrs.
+  // TODO(irving): Will go away once NodeInfo is used.
+  struct Scratch {
+    string a;
+    string b;
+  };
+
+  // Check if all attrs and attr values match.  Does not take defaults into
+  // account.
+  //
+  // TODO(irving): There is a bug in this routine inherited from its
+  // OptimizerCSE::EqualAttrs precedecessor.  The same tensor attr can be
+  // represented in more than one way as an AttrValue, since TensorProto is
+  // not 1-1.  This bug will go away once I replace everything with NodeInfo,
+  // which stores a Tensor object directly.  The Scratch object will also go
+  // away.
+  bool EqualAttrs(AttrSlice other, Scratch* scratch) const;
+
+  // If this AttrSlice has an attached NodeDef, summarize it.  This is for
+  // error messages only: we intentionally do not provide direct access to the
+  // NodeDef, since it is not always there.
+  string SummarizeNode() const;
+
+  // Iteration over all attrs
+  AttrValueMap::const_iterator begin() const { return attrs_->begin(); }
+  AttrValueMap::const_iterator end() const { return attrs_->end(); }
+
  private:
   const NodeDef* ndef_;
   const AttrValueMap* attrs_;
@@ -153,6 +186,20 @@ Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name,
 Status GetNodeAttr(const AttrSlice& attrs, StringPiece attr_name,
                    std::vector<NameAttrList>* value);  // type: "list(func)"
 
+// Look up the attr with name attr_name and set *value to its value.  If no
+// attr with attr_name is found in node_def, or the attr does not have
+// a matching type, false is returned.
+bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,
+                       string* value);  // type: "string"
+bool GetNodeAttrSimple(const AttrSlice& attrs, StringPiece attr_name,
+                       std::vector<string>* value);  // type: "string"
+
+// Look up the attr with name attr_name and return a reference to its value.
+// If no attr with attr_name is found in node_def, or the attr does not have
+// a matching type, a reference to an empty string is returned.
+// REQUIRES: Must not use the returned value beyond the lifetime of node_def.
+const string& GetNodeAttrString(const AttrSlice& attrs, StringPiece attr_name);
+
 // Computes the input and output types for a specific node.
 // REQUIRES: ValidateOpDef(op_def).ok()
 Status InOutTypesForNode(const NodeDef& node_def, const OpDef& op_def,
@@ -169,9 +216,12 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def);
 // corresponding input/output index range.  For example,
 // input "foo" corresponds to input indices
 //   [ (*inputs)["foo"].first, (*inputs)["foo"].second ).
+// TODO(irving): Remove the NodeDef version; keep only the Node version.
 typedef std::unordered_map<string, std::pair<int, int>> NameRangeMap;
 Status NameRangesForNode(const NodeDef& node_def, const OpDef& op_def,
                          NameRangeMap* inputs, NameRangeMap* outputs);
+Status NameRangesForNode(const Node& node, const OpDef& op_def,
+                         NameRangeMap* inputs, NameRangeMap* outputs);
 
 // Adds default values to *node_def for unspecified attrs from op_def.
 void AddDefaultsToNodeDef(const OpDef& op_def, NodeDef* node_def);
@@ -192,6 +242,7 @@ Status ValidateExternalNodeDefSyntax(const NodeDef& node_def);
 // Returns "status" with kernel's NodeDef attached as additional text
 // in the error message.
 Status AttachDef(const Status& status, const NodeDef& node_def);
+Status AttachDef(const Status& status, const Node& node);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index 6bff192b1ec2f08b1e6f8ac0664713449526d683..5ddac6b198207d2779062a23e12851b9b037aa62 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/host_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -83,7 +84,10 @@ Status OpRegistry::LookUp(const string& op_type_name,
       first_unregistered = false;
     }
     Status status =
-        errors::NotFound("Op type not registered '", op_type_name, "'");
+        errors::NotFound("Op type not registered '", op_type_name,
+                         "' in binary running on ", port::Hostname(), ". ",
+                         "Make sure the Op and Kernel are registered in the "
+                         "binary running in this process.");
     VLOG(1) << status.ToString();
     return status;
   }
@@ -225,7 +229,10 @@ Status OpListOpRegistry::LookUp(const string& op_type_name,
   auto iter = index_.find(op_type_name);
   if (iter == index_.end()) {
     *op_reg_data = nullptr;
-    return errors::NotFound("Op type not registered '", op_type_name, "'");
+    return errors::NotFound("Op type not registered '", op_type_name,
+                            "' in binary running on ", port::Hostname(), ". ",
+                            "Make sure the Op and Kernel are registered in the "
+                            "binary running in this process.");
   }
   *op_reg_data = iter->second;
   return Status::OK();
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index f047ddb12a1519696b71d10a4fca030a32551e27..892ed9b60b413fed94180bf51094bcaa078e9f24 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -293,6 +293,18 @@ struct OpDefBuilderReceiver {
           ::tensorflow::register_op::OpDefBuilderWrapper<SHOULD_REGISTER_OP( \
               name)>(name)
 
+// The `REGISTER_SYSTEM_OP()` macro acts as `REGISTER_OP()` except
+// that the op is registered unconditionally even when selective
+// registration is used.
+#define REGISTER_SYSTEM_OP(name) \
+  REGISTER_SYSTEM_OP_UNIQ_HELPER(__COUNTER__, name)
+#define REGISTER_SYSTEM_OP_UNIQ_HELPER(ctr, name) \
+  REGISTER_SYSTEM_OP_UNIQ(ctr, name)
+#define REGISTER_SYSTEM_OP_UNIQ(ctr, name)                                \
+  static ::tensorflow::register_op::OpDefBuilderReceiver register_op##ctr \
+      TF_ATTRIBUTE_UNUSED =                                               \
+          ::tensorflow::register_op::OpDefBuilderWrapper<true>(name)
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_FRAMEWORK_OP_H_
diff --git a/tensorflow/core/framework/op_def_builder_test.cc b/tensorflow/core/framework/op_def_builder_test.cc
index a6ffd5c59618883874587fb608dd19ec9b714910..bde5bb2c397ed98c1c1ed5bf2178ecbdbc324e2e 100644
--- a/tensorflow/core/framework/op_def_builder_test.cc
+++ b/tensorflow/core/framework/op_def_builder_test.cc
@@ -73,7 +73,7 @@ class OpDefBuilderTest : public ::testing::Test {
     }
   }
 
-  void ExpectFailure(const OpDefBuilder& builder, string error) {
+  void ExpectFailure(const OpDefBuilder& builder, const string& error) {
     OpRegistrationData op_reg_data;
     Status status = builder.Finalize(&op_reg_data);
     EXPECT_FALSE(status.ok());
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 3626de58d628fb9325e4feb8dca66c3e8453b698..6c3917c6869e654cfa550fb2f0e845dc4967d803 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -26,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -89,9 +91,9 @@ OpKernel::OpKernel(OpKernelConstruction* context)
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
-                 NameRangesForNode(def_, context->op_def(), &input_name_map_,
+                 NameRangesForNode(def_, *context->op_def_, &input_name_map_,
                                    &output_name_map_));
-  OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(),
+  OP_REQUIRES_OK(context, CheckOpDeprecation(*context->op_def_,
                                              context->graph_def_version()));
 
   // Kernels executing on GPU tie very few resources on the CPU where the
@@ -125,6 +127,23 @@ Status OpKernel::OutputRange(StringPiece output_name, int* start,
   }
 }
 
+Status OpKernel::MakeShape(const Tensor& shape, TensorShape* out) const {
+  if (!IsLegacyVector(shape.shape())) {
+    return errors::InvalidArgument(
+        "shape must be a vector of {int32,int64}, got shape ",
+        shape.shape().DebugString());
+  }
+  if (shape.dtype() == DataType::DT_INT32) {
+    auto vec = shape.flat<int32>();
+    return TensorShapeUtils::MakeShape(vec.data(), vec.size(), out);
+  } else if (shape.dtype() == DataType::DT_INT64) {
+    auto vec = shape.flat<int64>();
+    return TensorShapeUtils::MakeShape(vec.data(), vec.size(), out);
+  } else {
+    return errors::InvalidArgument("shape must be a vector of {int32,int64}.");
+  }
+}
+
 void AsyncOpKernel::Compute(OpKernelContext* context) {
   Notification n;
   ComputeAsync(context, [&n]() { n.Notify(); });
@@ -639,22 +658,6 @@ Status OpKernelContext::allocate_persistent(DataType type,
       *out_tensor = out_persistent->AccessTensor(this);
     }
   }
-  if (track_allocations() && persistent.TotalBytes() > 0) {
-    // TODO(yuefengz): some allocators allocate memory even if the requested
-    // size is 0.
-    Allocator* a = get_allocator(attr);
-    if (a->TracksAllocationSizes()) {
-      int64 alloc_size =
-          a->AllocatedSize(const_cast<char*>(persistent.tensor_data().data()));
-      int64 alloc_id =
-          a->AllocationId(const_cast<char*>(persistent.tensor_data().data()));
-      if (allocate_on_host(attr)) {
-        record_host_persistent_memory_allocation(alloc_size, alloc_id);
-      } else {
-        record_device_persistent_memory_allocation(alloc_size, alloc_id);
-      }
-    }
-  }
   return s;
 }
 
@@ -806,7 +809,7 @@ static KernelRegistry* GlobalKernelRegistryTyped() {
   return reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry());
 }
 
-static string Key(StringPiece op_type, DeviceType device_type,
+static string Key(StringPiece op_type, const DeviceType& device_type,
                   StringPiece label) {
   return strings::StrCat(op_type, ":", DeviceTypeString(device_type), ":",
                          label);
@@ -840,13 +843,10 @@ bool InTypeList(DataType dt, const AttrValue& type_list) {
   return false;
 }
 
-// Returns whether the attrs in the NodeDef satisfy the constraints in
-// the kernel_def.  Returns an error if attrs in kernel_def are not
-// found, or have a mismatching type.
-Status AttrsMatch(const NodeDef& node_def, const KernelDef& kernel_def,
-                  bool* match) {
+// Returns whether the attrs satisfy the constraints in the kernel_def.  Returns
+// an error if attrs in kernel_def are not found, or have a mismatching type.
+Status AttrsMatch(AttrSlice attrs, const KernelDef& kernel_def, bool* match) {
   *match = false;
-  AttrSlice attrs(node_def);
   for (const auto& constraint : kernel_def.constraint()) {
     if (constraint.allowed_values().list().type_size() == 0) {
       return errors::Unimplemented(
@@ -870,7 +870,7 @@ Status AttrsMatch(const NodeDef& node_def, const KernelDef& kernel_def,
               "' that has value '", SummarizeAttrValue(*found),
               "' that does not have type 'type' or 'list(type)' in NodeDef "
               "'",
-              SummarizeNodeDef(node_def), "'");
+              attrs.SummarizeNode(), "'");
         }
 
         for (int t : found->list().type()) {
@@ -883,7 +883,7 @@ Status AttrsMatch(const NodeDef& node_def, const KernelDef& kernel_def,
     } else {
       return errors::InvalidArgument(
           "OpKernel '", kernel_def.op(), "' has constraint on attr '",
-          constraint.name(), "' not in NodeDef '", SummarizeNodeDef(node_def),
+          constraint.name(), "' not in NodeDef '", attrs.SummarizeNode(),
           "', KernelDef: '", ProtoShortDebugString(kernel_def), "'");
     }
   }
@@ -891,13 +891,18 @@ Status AttrsMatch(const NodeDef& node_def, const KernelDef& kernel_def,
   return Status::OK();
 }
 
-Status FindKernelRegistration(DeviceType device_type, const NodeDef& node_def,
+static const StringPiece kKernelAttr("_kernel");
+
+// TODO(irving): Replace with const Node& version below.
+Status FindKernelRegistration(const DeviceType& device_type,
+                              const NodeDef& node_def,
                               const KernelRegistration** reg,
                               bool* was_attr_mismatch) {
   *reg = nullptr;
   *was_attr_mismatch = false;
-  string label;  // Label defaults to empty if not found in NodeDef.
-  GetNodeAttr(node_def, "_kernel", &label).IgnoreError();
+  // Label defaults to empty if not found in NodeDef.
+  const string& label = GetNodeAttrString(node_def, kKernelAttr);
+
   const string key = Key(node_def.op(), device_type, label);
   auto regs = GlobalKernelRegistryTyped()->equal_range(key);
   for (auto iter = regs.first; iter != regs.second; ++iter) {
@@ -921,9 +926,17 @@ Status FindKernelRegistration(DeviceType device_type, const NodeDef& node_def,
   return Status::OK();
 }
 
+Status FindKernelRegistration(const DeviceType& device_type, const Node& node,
+                              const KernelRegistration** reg,
+                              bool* was_attr_mismatch) {
+  return FindKernelRegistration(device_type, node.def(), reg,
+                                was_attr_mismatch);
+}
+
 }  // namespace
 
-Status FindKernelDef(DeviceType device_type, const NodeDef& node_def,
+// TODO(irving): Change const NodeDef& to const Node&
+Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
                      const KernelDef** def, string* kernel_class_name) {
   const KernelRegistration* reg = nullptr;
   bool was_attr_mismatch;
@@ -1005,8 +1018,8 @@ std::unique_ptr<OpKernel> CreateOpKernel(
     DeviceType device_type, DeviceBase* device, Allocator* allocator,
     const NodeDef& node_def, int graph_def_version, Status* status) {
   OpKernel* kernel = nullptr;
-  *status = CreateOpKernel(device_type, device, allocator, nullptr, node_def,
-                           graph_def_version, &kernel);
+  *status = CreateOpKernel(std::move(device_type), device, allocator, nullptr,
+                           node_def, graph_def_version, &kernel);
   return std::unique_ptr<OpKernel>(kernel);
 }
 
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index d874b9087f1843a453469e098c4db6b84dab2579..465395d858c612e0c2eb12ae21a9558e9ca60c08 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <functional>
 
+#include <utility>
 #include <vector>
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/cancellation.h"
@@ -151,6 +152,10 @@ class OpKernel {
     return shape.dims() == 1 || (allow_legacy_scalars() && shape.dims() == 0);
   }
 
+  // Turn a shape Tensor into a TensorShape
+  // TODO(irving): Move to TensorShapeUtils once !allow_legacy_scalars
+  Status MakeShape(const Tensor& shape, TensorShape* out) const;
+
  private:
   const NodeDef def_;
   const DataTypeVector input_types_;
@@ -223,7 +228,7 @@ class OpKernelConstruction {
                        const DataTypeSlice& output_types,
                        const MemoryTypeSlice& output_memory_types,
                        int graph_def_version, Status* status)
-      : device_type_(device_type),
+      : device_type_(std::move(device_type)),
         device_(device),
         allocator_(allocator),
         def_(node_def),
@@ -273,9 +278,6 @@ class OpKernelConstruction {
   // User-supplied configuration of this operation.
   const NodeDef& def() const { return *def_; }
 
-  // Op registered for this op type.
-  const OpDef& op_def() const { return *op_def_; }
-
   // For inspecting the inputs to this operation.
   int num_inputs() const { return input_types_.size(); }
   DataType input_type(int i) const { return input_types_[i]; }
@@ -349,6 +351,10 @@ class OpKernelConstruction {
   const int graph_def_version_;
   Status* status_;
 
+  // Allow op_def_ across from OpKernel, but not from subclasses.
+  // TODO(irving): Remove protos from this header entirely.
+  friend class OpKernel;
+
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernelConstruction);
 };
 
@@ -428,6 +434,7 @@ class OpOutputList {
   OpOutputList& operator=(const OpOutputList& other) = default;
   Tensor* operator[](int i);
   bool required(int i) const;
+  DataType expected_output_dtype(int i) const;
   Status allocate(int i, const TensorShape& shape, Tensor** output);
   void set(int i, const Tensor& tensor);
   void set_ref(int i, mutex* mu, Tensor* tensor_for_ref);
@@ -1190,6 +1197,17 @@ class Name : public KernelDefBuilder {
       : KernelDefBuilder(SHOULD_REGISTER_OP(op) ? op : "_no_register") {}
 };
 
+namespace system {
+
+class Name : public KernelDefBuilder {
+ public:
+  // For system kernels, we ignore selective registration and
+  // unconditionally register the kernel.
+  explicit Name(const char* op) : KernelDefBuilder(op) {}
+};
+
+}  // namespace system
+
 }  // namespace register_kernel
 
 #define REGISTER_KERNEL_BUILDER(kernel_builder, ...) \
@@ -1212,12 +1230,32 @@ class Name : public KernelDefBuilder {
             return new __VA_ARGS__(context);                          \
           });
 
+// The `REGISTER_SYSTEM_KERNEL_BUILDER()` macro acts as
+// `REGISTER_KERNEL_BUILDER()` except that the kernel is registered
+// unconditionally even when selective registration is used.
+#define REGISTER_SYSTEM_KERNEL_BUILDER(kernel_builder, ...)               \
+  REGISTER_SYSTEM_KERNEL_BUILDER_UNIQ_HELPER(__COUNTER__, kernel_builder, \
+                                             __VA_ARGS__)
+
+#define REGISTER_SYSTEM_KERNEL_BUILDER_UNIQ_HELPER(ctr, kernel_builder, ...) \
+  REGISTER_SYSTEM_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, __VA_ARGS__)
+
+#define REGISTER_SYSTEM_KERNEL_BUILDER_UNIQ(ctr, kernel_builder, ...)    \
+  static ::tensorflow::kernel_factory::OpKernelRegistrar                 \
+      registrar__body__##ctr##__object(                                  \
+          ::tensorflow::register_kernel::system::kernel_builder.Build(), \
+          #__VA_ARGS__,                                                  \
+          [](::tensorflow::OpKernelConstruction* context)                \
+              -> ::tensorflow::OpKernel* {                               \
+            return new __VA_ARGS__(context);                             \
+          });
+
 void* GlobalKernelRegistry();
 
 // If node_def has a corresponding kernel registered on device_type,
 // returns OK and fill in the kernel def and kernel_class_name. <def> and
 // <kernel_class_name> may be null.
-Status FindKernelDef(DeviceType device_type, const NodeDef& node_def,
+Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
                      const KernelDef** def, string* kernel_class_name);
 
 // Writes a list of all registered kernels to LOG(INFO), to help users debug
@@ -1417,6 +1455,12 @@ inline bool OpOutputList::required(int i) const {
   return ctx_->output_required(start_ + i);
 }
 
+inline DataType OpOutputList::expected_output_dtype(int i) const {
+  DCHECK_GE(i, 0);
+  DCHECK_LT(i, stop_ - start_);
+  return ctx_->expected_output_dtype(start_ + i);
+}
+
 inline Status OpOutputList::allocate(int i, const TensorShape& shape,
                                      Tensor** output) {
   DCHECK_GE(i, 0);
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index 1c561899159e42269b55c81f3838325207623f03..e8e931b52e40f8440145856345c9bb7e314551d9 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 #include <memory>
+#include <utility>
 #include <vector>
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -133,8 +134,8 @@ class OpKernelTest : public ::testing::Test {
                      const DataTypeVector& outputs) {
     Status status;
     std::unique_ptr<OpKernel> op(CreateOpKernel(
-        device_type, &device_, cpu_allocator(), CreateNodeDef(op_type, inputs),
-        TF_GRAPH_DEF_VERSION, &status));
+        std::move(device_type), &device_, cpu_allocator(),
+        CreateNodeDef(op_type, inputs), TF_GRAPH_DEF_VERSION, &status));
     EXPECT_TRUE(status.ok()) << status;
     EXPECT_TRUE(op != nullptr);
     if (op != nullptr) {
@@ -148,9 +149,9 @@ class OpKernelTest : public ::testing::Test {
     NodeDef node_def;
     protobuf::TextFormat::ParseFromString(ascii_node_def, &node_def);
     Status status;
-    std::unique_ptr<OpKernel> op(CreateOpKernel(device_type, &device_,
-                                                cpu_allocator(), node_def,
-                                                TF_GRAPH_DEF_VERSION, &status));
+    std::unique_ptr<OpKernel> op(
+        CreateOpKernel(std::move(device_type), &device_, cpu_allocator(),
+                       node_def, TF_GRAPH_DEF_VERSION, &status));
     EXPECT_TRUE(op == nullptr);
     EXPECT_FALSE(status.ok());
     if (!status.ok()) {
@@ -384,7 +385,7 @@ class OpKernelBuilderTest : public ::testing::Test {
   }
 
   std::unique_ptr<OpKernel> ExpectSuccess(const string& op_type,
-                                          DeviceType device_type,
+                                          const DeviceType& device_type,
                                           const std::vector<string>& attrs,
                                           DataTypeSlice input_types = {}) {
     Status status;
@@ -423,7 +424,7 @@ class OpKernelBuilderTest : public ::testing::Test {
     return op;
   }
 
-  void ExpectFailure(const string& op_type, DeviceType device_type,
+  void ExpectFailure(const string& op_type, const DeviceType& device_type,
                      const std::vector<string>& attrs, error::Code code) {
     Status status;
     const NodeDef def = CreateNodeDef(op_type, attrs);
@@ -613,6 +614,36 @@ TEST_F(OpKernelBuilderTest, BadConstraint) {
                 error::INVALID_ARGUMENT);
 }
 
+REGISTER_OP("ListOut").Output("a: int32").Output("b: T").Attr("T: list(type)");
+REGISTER_KERNEL_BUILDER(Name("ListOut").Device(tensorflow::DEVICE_CPU),
+                        DummyKernel);
+
+TEST_F(OpKernelBuilderTest, OpOutputList) {
+  Env* env = Env::Default();
+  OpKernelContext::Params params;
+  params.record_tensor_accesses = false;
+  std::unique_ptr<DummyDevice> device(
+      new DummyDevice(env, params.record_tensor_accesses));
+  params.device = device.get();
+  Status status;
+  std::unique_ptr<OpKernel> op(CreateOpKernel(
+      DEVICE_CPU, params.device, cpu_allocator(),
+      CreateNodeDef("ListOut", {"T|list(type)|[DT_FLOAT, DT_INT32]"}),
+      TF_GRAPH_DEF_VERSION, &status));
+  EXPECT_TRUE(status.ok()) << status.ToString();
+  params.op_kernel = op.get();
+  gtl::InlinedVector<TensorValue, 4> inputs{};
+  params.inputs = &inputs;
+  std::unique_ptr<OpKernelContext> ctx(new OpKernelContext(&params));
+
+  EXPECT_EQ(DT_INT32, ctx->expected_output_dtype(0));
+  OpOutputList out_list;
+  EXPECT_FALSE(ctx->output_list("non_existent_output", &out_list).ok());
+  ASSERT_TRUE(ctx->output_list("b", &out_list).ok());
+  EXPECT_EQ(DT_FLOAT, out_list.expected_output_dtype(0));
+  EXPECT_EQ(DT_INT32, out_list.expected_output_dtype(1));
+}
+
 class GetAttrKernel : public ::tensorflow::OpKernel {
  public:
   explicit GetAttrKernel(OpKernelConstruction* context) : OpKernel(context) {
diff --git a/tensorflow/core/framework/partial_tensor_shape.cc b/tensorflow/core/framework/partial_tensor_shape.cc
index f650468c1c83ab12ac859d964fef783b04550a85..f02553b434ef9f71abeabf2a041060edd9091b11 100644
--- a/tensorflow/core/framework/partial_tensor_shape.cc
+++ b/tensorflow/core/framework/partial_tensor_shape.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -81,6 +82,17 @@ PartialTensorShape::PartialTensorShape(gtl::ArraySlice<int64> dim_sizes)
   }
 }
 
+PartialTensorShape::PartialTensorShape(const TensorShape& shape)
+    : is_unknown_(false) {
+  const int num_dims = shape.dims();
+  dim_sizes_.reserve(num_dims);
+  for (int i = 0; i < num_dims; ++i) {
+    const int64 dim = internal::SubtleMustCopy(shape.dim_size(i));
+    CHECK_GE(dim, 0);
+    dim_sizes_.push_back(dim);
+  }
+}
+
 PartialTensorShape PartialTensorShape::Concatenate(int64 size) const {
   if (is_unknown_) {
     return *this;
diff --git a/tensorflow/core/framework/partial_tensor_shape.h b/tensorflow/core/framework/partial_tensor_shape.h
index 7a70167d7c926127c22cd88f5b26e09991bba206..ab22eec356b3acf77801bf3543d13afcb3ffceeb 100644
--- a/tensorflow/core/framework/partial_tensor_shape.h
+++ b/tensorflow/core/framework/partial_tensor_shape.h
@@ -46,7 +46,9 @@ class PartialTensorShape {
       : PartialTensorShape(gtl::ArraySlice<int64>(dim_sizes)) {}
 
   /// REQUIRES: `IsValid(proto)`
-  explicit PartialTensorShape(const TensorShapeProto& proto);
+  PartialTensorShape(
+      const TensorShapeProto& proto);            // NOLINT(runtime/explicit)
+  PartialTensorShape(const TensorShape& shape);  // NOLINT(runtime/explicit)
 
   /// Returns `true` iff `proto` is a valid partial tensor shape.
   static bool IsValid(const TensorShapeProto& proto);
diff --git a/tensorflow/core/framework/reader_op_kernel.h b/tensorflow/core/framework/reader_op_kernel.h
index 502b98f13d9ecb6b4ff6dfef6b73496fded85d0b..ffd6a1a18486cc0b015c75775b40c3a1118109c0 100644
--- a/tensorflow/core/framework/reader_op_kernel.h
+++ b/tensorflow/core/framework/reader_op_kernel.h
@@ -47,7 +47,28 @@ class ReaderOpKernel : public ResourceOpKernel<ReaderInterface> {
     factory_ = factory;
   }
 
+  void Compute(OpKernelContext* context) override {
+    if (!IsCancellable()) {
+      ResourceOpKernel<ReaderInterface>::Compute(context);
+    } else {
+      // Install cancellation
+      CancellationManager* cm = context->cancellation_manager();
+      CancellationToken token = cm->get_cancellation_token();
+      bool already_cancelled =
+          !cm->RegisterCallback(token, [this]() { this->Cancel(); });
+
+      if (!already_cancelled) {
+        ResourceOpKernel<ReaderInterface>::Compute(context);
+      } else {
+        context->SetStatus(errors::Cancelled("read operation was cancelled"));
+      }
+    }
+  }
+
  private:
+  virtual bool IsCancellable() const { return false; }
+  virtual void Cancel() {}
+
   Status CreateResource(ReaderInterface** reader)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) override {
     *reader = factory_();
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 7f9fe084ba4b3f1fb9ef6052f0ba7adbdcdd913f..55860d92271899cc4d6a7f75ff94d2ac122b6081 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -24,6 +24,34 @@ limitations under the License.
 #include "tensorflow/core/platform/demangle.h"
 
 namespace tensorflow {
+ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
+                                  const string& name,
+                                  const TypeIndex& type_index) {
+  ResourceHandle result;
+  result.set_device(ctx->device()->attributes().name());
+  string actual_container;
+  if (!container.empty()) {
+    actual_container = container;
+  } else {
+    actual_container = ctx->resource_manager()->default_container();
+  }
+  result.set_container(actual_container);
+  result.set_name(name);
+  result.set_hash_code(type_index.hash_code());
+  result.set_maybe_type_name(type_index.name());
+  return result;
+}
+
+Status MakeResourceHandleToOutput(OpKernelContext* context, int output_index,
+                                  const string& container, const string& name,
+                                  const TypeIndex& type_index) {
+  Tensor* handle;
+  TF_RETURN_IF_ERROR(
+      context->allocate_output(output_index, TensorShape({}), &handle));
+  handle->scalar<ResourceHandle>()() =
+      MakeResourceHandle(context, container, name, type_index);
+  return Status::OK();
+}
 
 namespace internal {
 
@@ -246,6 +274,14 @@ ResourceHandle HandleFromInput(OpKernelContext* ctx, int input) {
   return ctx->input(input).flat<ResourceHandle>()(0);
 }
 
+Status HandleFromInput(OpKernelContext* ctx, StringPiece input,
+                       ResourceHandle* handle) {
+  const Tensor* tensor;
+  TF_RETURN_IF_ERROR(ctx->input(input, &tensor));
+  *handle = tensor->flat<ResourceHandle>()(0);
+  return Status::OK();
+}
+
 Status DeleteResource(OpKernelContext* ctx, const ResourceHandle& p) {
   TF_RETURN_IF_ERROR(internal::ValidateDevice(ctx, p));
   return ctx->resource_manager()->Delete(p);
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index fe6e09378fd1fa2dbdb52c138a54be7d8fb7b86b..0e1a5a82d3fa4b5d96dfd0bb899c12d65fa87574 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -202,15 +202,28 @@ class ResourceMgr {
 
 // Makes a resource handle with the specified type for a given container /
 // name.
+ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
+                                  const string& name,
+                                  const TypeIndex& type_index);
+
 template <typename T>
 ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
-                                  const string& name);
+                                  const string& name) {
+  return MakeResourceHandle(ctx, container, name, MakeTypeIndex<T>());
+}
+
+Status MakeResourceHandleToOutput(OpKernelContext* context, int output_index,
+                                  const string& container, const string& name,
+                                  const TypeIndex& type_index);
+
 template <typename T>
 ResourceHandle MakePerStepResourceHandle(OpKernelContext* ctx,
                                          const string& name);
 
 // Returns a resource handle from a numbered op input.
 ResourceHandle HandleFromInput(OpKernelContext* ctx, int input);
+Status HandleFromInput(OpKernelContext* ctx, StringPiece input,
+                       ResourceHandle* handle);
 
 // Create a resource pointed by a given resource handle.
 template <typename T>
@@ -421,25 +434,6 @@ Status GetResourceFromContext(OpKernelContext* ctx, const string& input_name,
   return ctx->resource_manager()->Lookup(container, shared_name, resource);
 }
 
-template <typename T>
-ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
-                                  const string& name) {
-  ResourceHandle result;
-  result.set_device(ctx->device()->attributes().name());
-  string actual_container;
-  if (!container.empty()) {
-    actual_container = container;
-  } else {
-    actual_container = ctx->resource_manager()->default_container();
-  }
-  result.set_container(actual_container);
-  result.set_name(name);
-  auto type_index = MakeTypeIndex<T>();
-  result.set_hash_code(type_index.hash_code());
-  result.set_maybe_type_name(type_index.name());
-  return result;
-}
-
 template <typename T>
 ResourceHandle MakePerStepResourceHandle(OpKernelContext* ctx,
                                          const string& name) {
diff --git a/tensorflow/core/framework/resource_op_kernel.h b/tensorflow/core/framework/resource_op_kernel.h
index de65657a9e59ac5e6496fd5d29b03ca06be9738d..813ec6eed58e975ec1dda0e1a61f01a37414a56f 100644
--- a/tensorflow/core/framework/resource_op_kernel.h
+++ b/tensorflow/core/framework/resource_op_kernel.h
@@ -95,11 +95,9 @@ class ResourceOpKernel : public OpKernel {
       resource_ = resource;
     }
     if (context->expected_output_dtype(0) == DT_RESOURCE) {
-      Tensor* handle;
-      OP_REQUIRES_OK(context,
-                     context->allocate_output(0, TensorShape({}), &handle));
-      handle->scalar<ResourceHandle>()() =
-          MakeResourceHandle<T>(context, cinfo_.container(), cinfo_.name());
+      OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                                  context, 0, cinfo_.container(), cinfo_.name(),
+                                  MakeTypeIndex<T>()));
     } else {
       context->set_output_ref(0, &mu_, handle_.AccessTensor(context));
     }
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 449d8f55f5636c825120c4e3024121d3fa6c1f36..a990dc2f04d24aea475efaa5b60b722512c1cac3 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -239,8 +239,11 @@ string InferenceContext::DebugString() const {
                          ProtoDebugString(node_def_));
 }
 
-Status InferenceContext::WithRank(ShapeHandle shape, int32 rank,
+Status InferenceContext::WithRank(ShapeHandle shape, int64 rank,
                                   ShapeHandle* out) {
+  if (rank > kint32max) {
+    return errors::InvalidArgument("Rank cannot exceed kint32max");
+  }
   const int32 existing = Rank(shape);
   if (existing == rank) {
     *out = shape;
@@ -261,8 +264,11 @@ Status InferenceContext::WithRank(ShapeHandle shape, int32 rank,
                                  existing);
 }
 
-Status InferenceContext::WithRankAtLeast(ShapeHandle shape, int32 rank,
+Status InferenceContext::WithRankAtLeast(ShapeHandle shape, int64 rank,
                                          ShapeHandle* out) {
+  if (rank > kint32max) {
+    return errors::InvalidArgument("Rank cannot exceed kint32max");
+  }
   const int32 existing = Rank(shape);
   if (existing >= rank) {
     *out = shape;
@@ -276,8 +282,11 @@ Status InferenceContext::WithRankAtLeast(ShapeHandle shape, int32 rank,
                                  " but is rank ", existing);
 }
 
-Status InferenceContext::WithRankAtMost(ShapeHandle shape, int32 rank,
+Status InferenceContext::WithRankAtMost(ShapeHandle shape, int64 rank,
                                         ShapeHandle* out) {
+  if (rank > kint32max) {
+    return errors::InvalidArgument("Rank cannot exceed kint32max");
+  }
   const int32 existing = Rank(shape);
   if (existing == kUnknownRank) {
     return ReturnUnknownShape(out);
@@ -470,12 +479,12 @@ Status InferenceContext::Concatenate(ShapeHandle s1, ShapeHandle s2,
   return ReturnCreatedShape(dims, out);
 }
 
-Status InferenceContext::ReplaceDim(ShapeHandle s, int dim_index_in,
+Status InferenceContext::ReplaceDim(ShapeHandle s, int64 dim_index_in,
                                     DimensionHandle new_dim, ShapeHandle* out) {
   if (!RankKnown(s)) {
     return ReturnUnknownShape(out);
   }
-  int dim_index = dim_index_in;
+  int64 dim_index = dim_index_in;
   if (dim_index < 0) {
     dim_index = s->dims_.size() + dim_index;
   }
@@ -510,7 +519,8 @@ ShapeHandle InferenceContext::UnknownShape() {
   return shape_manager_.UnknownShape();
 }
 
-ShapeHandle InferenceContext::UnknownShapeOfRank(int32 rank) {
+ShapeHandle InferenceContext::UnknownShapeOfRank(int64 rank) {
+  CHECK_LE(rank, kint32max) << "rank must be less than kint32max";
   std::vector<DimensionHandle> dims(rank);
   for (int32 i = 0; i < rank; ++i) {
     dims[i] = UnknownDim();
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index b7f1725c5f16449964b29529d1da9e864dbd461b..7bb091622da78981065fbc843fb8fcfb2aa0fe1d 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -182,19 +182,29 @@ class InferenceContext {
     if (!s.ok()) {
       return AttachContext(s);
     }
-#if 0
-    // TODO(cwhipkey): enable this check
 #ifndef NDEBUG
     for (int i = 0; i < num_outputs(); ++i) {
-      DCHECK(output(i).IsSet()) << i << " for " << node_def().name()
-                                << " of type " << node_def().op();
+      DCHECK(output(i).IsSet())
+          << i << " for " << node_def_.name() << " of type " << node_def_.op();
     }
 #endif  // NDEBUG
-#endif
     return s;
   }
 
-  ShapeHandle input(int idx) const { return inputs_[idx]; }
+  // Merge the stored shape of the input in position idx with the specified
+  // shape. This requires idx to be in the [0, num_inputs) range. If the merge
+  // is successful and the new shape differs from the old one, store the new
+  // shape and return true. Return false otherwise.
+  bool MergeInput(int idx, ShapeHandle shape) {
+    ShapeHandle new_shape;
+    if (!Merge(inputs_[idx], shape, &new_shape).ok() ||
+        inputs_[idx].SameHandle(new_shape)) {
+      return false;
+    }
+    inputs_[idx] = new_shape;
+    return true;
+  }
+  ShapeHandle input(int64 idx) const { return inputs_[idx]; }
   Status input(StringPiece input_name, std::vector<ShapeHandle>* output) const;
   int num_inputs() const { return inputs_.size(); }
 
@@ -235,9 +245,11 @@ class InferenceContext {
   Status output(StringPiece output_name,
                 std::vector<ShapeHandle>* output) const;
 
+  AttrSlice attrs() const { return AttrSlice(node_def_); }
+
   // idx can be negative for an offset from end of dimensions.
   // idx must be in the range [-1 * s.rank, s.rank).
-  DimensionHandle Dim(ShapeHandle s, int32 idx) {
+  DimensionHandle Dim(ShapeHandle s, int64 idx) {
     if (s->rank_ == kUnknownRank) {
       return UnknownDim();
     }
@@ -277,11 +289,11 @@ class InferenceContext {
   // the shape with asserted rank in <*out>. Otherwise return an error.
   //
   // Note that <*out> may be set to <shape>.
-  Status WithRank(ShapeHandle shape, int32 rank,
+  Status WithRank(ShapeHandle shape, int64 rank,
                   ShapeHandle* out) TF_MUST_USE_RESULT;
-  Status WithRankAtLeast(ShapeHandle shape, int32 rank,
+  Status WithRankAtLeast(ShapeHandle shape, int64 rank,
                          ShapeHandle* out) TF_MUST_USE_RESULT;
-  Status WithRankAtMost(ShapeHandle shape, int32 rank,
+  Status WithRankAtMost(ShapeHandle shape, int64 rank,
                         ShapeHandle* out) TF_MUST_USE_RESULT;
 
   // If <dim> has value <value>, or its value is unknown, returns OK and returns
@@ -332,7 +344,7 @@ class InferenceContext {
 
   // Returns in <out> the shape from replacing <s.dim[dim_index]> with
   // <new_dim>.
-  Status ReplaceDim(ShapeHandle s, int dim_index, DimensionHandle new_dim,
+  Status ReplaceDim(ShapeHandle s, int64 dim_index, DimensionHandle new_dim,
                     ShapeHandle* out) TF_MUST_USE_RESULT;
 
   // Returns a new shape with the given dims. The returned value is owned by
@@ -344,7 +356,7 @@ class InferenceContext {
   ShapeHandle UnknownShape();
 
   // Returns a shape with specified rank but unknown dims.
-  ShapeHandle UnknownShapeOfRank(int32 rank);
+  ShapeHandle UnknownShapeOfRank(int64 rank);
 
   // Returns a new shape of zero dimensions.
   ShapeHandle Scalar();
@@ -384,11 +396,6 @@ class InferenceContext {
   // the value.
   Status MakeDimForScalarInput(int idx, DimensionHandle* out);
 
-  // Returns the NodeDef. The returned reference does not outlive the
-  // InferenceContext, and it should not be used after InferenceContext is
-  // destroyed.
-  const NodeDef& node_def() { return node_def_; }
-
   // Look up the attr for the NodeDef being evaluated with name attr_name and
   // set *value to its value.  If no attr with attr_name is found in def(), or
   // the attr does not have a matching type, a non-ok status will be returned.
@@ -433,15 +440,65 @@ class InferenceContext {
   // and dtypes of tensors which can be accessed via the handle. These methods
   // propagate that information. Output handle dtypes and shapes are ignored if
   // the output tensor is not of type DT_RESOURCE.
+
+  // Merge the stored shape corresponding to the input handle in position idx
+  // with the specified shape. This requires idx to be in the [0, num_inputs)
+  // range. If the merge is successful and the new shape differs from the old
+  // one, store the new shape and return true. Return false otherwise.
+  bool MergeInputHandleShape(int idx, ShapeHandle shape) {
+    ShapeHandle new_shape;
+    if (!Merge(input_handle_shape_[idx], shape, &new_shape).ok() ||
+        input_handle_shape_[idx].SameHandle(new_shape)) {
+      return false;
+    }
+    input_handle_shape_[idx] = shape;
+    return true;
+  }
+
+  // Set the type corresponding to the resource in position idx. This requires
+  // idx to be in the [0, num_inputs) range. Returns true iff the stored type
+  // has been updated.
+  bool set_input_handle_dtype(int idx, DataType dtype) {
+    if (input_handle_dtype_[idx] != dtype) {
+      input_handle_dtype_[idx] = dtype;
+      return true;
+    }
+    return false;
+  }
   ShapeHandle input_handle_shape(int idx);
   DataType input_handle_dtype(int idx) const {
     return input_handle_dtype_[idx];
   }
+
+  // Merge the stored shape corresponding to the output handle in position idx
+  // with the specified shape. This requires idx to be in the [0, num_outputs)
+  // range. If the merge is successful and the new shape differs from the old
+  // one, store the new shape and return true. Return false otherwise.
+
+  bool MergeOutputHandleShape(int idx, ShapeHandle shape) {
+    ShapeHandle new_shape;
+    if (!Merge(output_handle_shape_[idx], shape, &new_shape).ok() ||
+        output_handle_shape_[idx].SameHandle(new_shape)) {
+      return false;
+    }
+    output_handle_shape_[idx] = shape;
+    return true;
+  }
+  // Overwrite the shape corresponding to the output handle in position idx with
+  // the specified shape.
   void set_output_handle_shape(int idx, ShapeHandle shape) {
     output_handle_shape_[idx] = shape;
   }
-  void set_output_handle_dtype(int idx, DataType dtype) {
-    output_handle_dtype_[idx] = dtype;
+
+  // Set the type corresponding to the resource in position idx. This requires
+  // idx to be in the [0, num_outputs) range. Returns true iff the stored type
+  // has been updated.
+  bool set_output_handle_dtype(int idx, DataType dtype) {
+    if (output_handle_dtype_[idx] != dtype) {
+      output_handle_dtype_[idx] = dtype;
+      return true;
+    }
+    return false;
   }
   ShapeHandle output_handle_shape(int idx) const {
     return output_handle_shape_[idx];
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index c82b506e4b939c42117b56d520484d4edaf24534..78d1fc0fc5e8bfa7235fb9b90cbaa39270219fd3 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -558,6 +558,11 @@ TEST_F(ShapeInferenceTest, MergeShape) {
   EXPECT_TRUE(SameHandle(c.Dim(s_1_u, 0), c.Dim(out, 0)));
   EXPECT_TRUE(SameHandle(c.Dim(s_u_2, 1), c.Dim(out, 1)));
 
+  auto s_u1 = c.UnknownShapeOfRank(1);
+  auto s_u2 = c.UnknownShapeOfRank(1);
+  TF_EXPECT_OK(c.Merge(s_u1, s_u2, &out));
+  EXPECT_TRUE(SameHandle(s_u1, out));
+
   // Incompatible merges give errors and set out to nullptr.
   out = s_unknown;
   EXPECT_TRUE(
diff --git a/tensorflow/core/framework/shape_inference_testutil_test.cc b/tensorflow/core/framework/shape_inference_testutil_test.cc
index b0af0e5bd91e7c16bdda05fa542e9924eac3a0df..de14c071b46f648a516c2d0e0e5129b05bc99f8d 100644
--- a/tensorflow/core/framework/shape_inference_testutil_test.cc
+++ b/tensorflow/core/framework/shape_inference_testutil_test.cc
@@ -93,10 +93,11 @@ TEST(ShapeInferenceTestutilTest, Failures) {
             RunInferShapes(op, "[1];[2];[1]", "e", fn_copy_input_0));
   EXPECT_CONTAINS(RunInferShapes(op, "[1];[2];[1]", "[1];[2]", fn_copy_input_0),
                   "wrong number of outputs");
-  EXPECT_EQ("Op type not registered 'NoSuchOp'",
-            ShapeInferenceTestutil::InferShapes(
-                ShapeInferenceTestOp("NoSuchOp"), "", "")
-                .error_message());
+  auto error_message = ShapeInferenceTestutil::InferShapes(
+                           ShapeInferenceTestOp("NoSuchOp"), "", "")
+                           .error_message();
+  EXPECT_TRUE(StringPiece(error_message)
+                  .starts_with("Op type not registered 'NoSuchOp'"));
 
   // Wrong shape error messages.
   EXPECT_CONTAINS(RunInferShapes(op, "[1];[2];[1]", "?", fn_copy_input_0),
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index ecb9810d83c9577ca89e12e30167b0d8f4c78f5b..d049da1c9d5ce16506527388d9c42086db9dcec2 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -902,42 +902,27 @@ void Tensor::FillDescription(TensorDescription* description) const {
 }
 
 gtl::InlinedVector<int64, 4> Tensor::ComputeFlatInnerDims(
-    int64 num_out_dims) const {
-  if (num_out_dims == dims()) {
-    return shape_.dim_sizes();
-  }
+    gtl::ArraySlice<int64> orig, int64 num_out_dims) {
   gtl::InlinedVector<int64, 4> out_dims(num_out_dims, 0);
-  const int64 num_elements = NumElements();
-  int64 prod_out_dims = 1;
-  for (int64 out_dim = num_out_dims - 1; out_dim > 0; --out_dim) {
-    const int64 in_dim = out_dim + (dims() - num_out_dims);
-    out_dims[out_dim] = (in_dim >= dims() || in_dim < 0) ? 1 : dim_size(in_dim);
-    prod_out_dims *= out_dims[out_dim];
-  }
-  if (prod_out_dims != 0) {
-    out_dims[0] = num_elements / prod_out_dims;
-  } else {
-    out_dims[0] = 0;
+  int64 offset = orig.size() - num_out_dims;
+  for (int64 out_dim = num_out_dims - 1; out_dim >= 0; --out_dim) {
+    const int64 in_dim = out_dim + offset;
+    out_dims[out_dim] = in_dim < 0 ? 1 : orig[in_dim];
+  }
+  for (int64 in_dim = 0; in_dim < offset; ++in_dim) {
+    out_dims[0] *= orig[in_dim];
   }
   return out_dims;
 }
 
 gtl::InlinedVector<int64, 4> Tensor::ComputeFlatOuterDims(
-    int64 num_out_dims) const {
-  if (num_out_dims == dims()) {
-    return shape_.dim_sizes();
-  }
+    gtl::ArraySlice<int64> orig, int64 num_out_dims) {
   gtl::InlinedVector<int64, 4> out_dims(num_out_dims, 0);
-  const int64 num_elements = NumElements();
-  int64 prod_out_dims = 1;
-  for (int64 out_dim = 0; out_dim < num_out_dims - 1; ++out_dim) {
-    out_dims[out_dim] = out_dim >= dims() ? 1 : dim_size(out_dim);
-    prod_out_dims *= out_dims[out_dim];
-  }
-  if (prod_out_dims != 0) {
-    out_dims[num_out_dims - 1] = num_elements / prod_out_dims;
-  } else {
-    out_dims[num_out_dims - 1] = 0;
+  for (int64 out_dim = 0; out_dim <= num_out_dims - 1; ++out_dim) {
+    out_dims[out_dim] = out_dim >= orig.size() ? 1 : orig[out_dim];
+  }
+  for (int64 in_dim = num_out_dims; in_dim < orig.size(); ++in_dim) {
+    out_dims[num_out_dims - 1] *= orig[in_dim];
   }
   return out_dims;
 }
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 103da4c1b373076d35189f9462171f3345b82e1a..5810970a38adf2786d27eab34adb94fbd77735d8 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -103,9 +103,9 @@ class Tensor {
   /// Copy constructor.
   Tensor(const Tensor& other);
 
-  /// \brief Move constructor. After this call, <other> is safely destructible and can
-  /// be assigned to, but other calls on it (e.g. shape manipulation) are not
-  /// valid.
+  /// \brief Move constructor. After this call, <other> is safely destructible
+  /// and can be assigned to, but other calls on it (e.g. shape manipulation)
+  /// are not valid.
   Tensor(Tensor&& other);
 
   ~Tensor();
@@ -304,6 +304,15 @@ class Tensor {
   template <typename T, size_t NDIMS = 2>
   typename TTypes<T, NDIMS>::Tensor flat_outer_dims();
 
+  /// Returns the data as an Eigen::Tensor with NDIMS dimensions, collapsing the
+  /// first 'begin' Tensor dimensions into the first dimension of the result and
+  /// the Tensor dimensions of the last dims() - 'begin' - NDIMS into the last
+  /// dimension of the result. If 'begin' < 0 then the the |'begin'| leading
+  /// dimensions of size 1 will be added. If 'begin' + NDIMS > dims() then
+  /// 'begin' + NDIMS - dims() trailing dimensions of size 1 will be added.
+  template <typename T, size_t NDIMS = 3>
+  typename TTypes<T, NDIMS>::Tensor flat_inner_outer_dims(int64 begin);
+
   template <typename T, size_t NDIMS>
   typename TTypes<T, NDIMS>::Tensor shaped(gtl::ArraySlice<int64> new_sizes);
 
@@ -386,6 +395,9 @@ class Tensor {
   template <typename T, size_t NDIMS = 2>
   typename TTypes<T, NDIMS>::ConstTensor flat_outer_dims() const;
 
+  template <typename T, size_t NDIMS = 3>
+  typename TTypes<T, NDIMS>::Tensor flat_inner_outer_dims(int64 begin) const;
+
   /// Render the first `max_entries` values in `*this` into a string.
   string SummarizeValue(int64 max_entries) const;
 
@@ -429,10 +441,11 @@ class Tensor {
       gtl::ArraySlice<int64> new_sizes,
       Eigen::array<Eigen::DenseIndex, NDIMS>* dims) const;
 
-  // TODO(rmlarsen): These shouldn't hardcode '4' so that it lines up with
   // TensorShape's InlineVector.
-  gtl::InlinedVector<int64, 4> ComputeFlatInnerDims(int64 num_out_dims) const;
-  gtl::InlinedVector<int64, 4> ComputeFlatOuterDims(int64 num_out_dims) const;
+  static gtl::InlinedVector<int64, 4> ComputeFlatInnerDims(
+      gtl::ArraySlice<int64> orig, int64 num_out_dims);
+  static gtl::InlinedVector<int64, 4> ComputeFlatOuterDims(
+      gtl::ArraySlice<int64> orig, int64 num_out_dims);
 
   TensorShape shape_;
   TensorBuffer* buf_;
@@ -529,7 +542,6 @@ typename TTypes<T, NDIMS>::ConstTensor Tensor::tensor() const {
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::bit_casted_tensor() {
   CHECK(IsAligned());
-  ;
   return typename TTypes<T, NDIMS>::Tensor(base<T>(),
                                            shape().AsEigenDSizes<NDIMS>());
 }
@@ -537,7 +549,6 @@ typename TTypes<T, NDIMS>::Tensor Tensor::bit_casted_tensor() {
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::bit_casted_tensor() const {
   CHECK(IsAligned());
-  ;
   return typename TTypes<T, NDIMS>::ConstTensor(base<const T>(),
                                                 shape().AsEigenDSizes<NDIMS>());
 }
@@ -568,7 +579,6 @@ template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::bit_casted_shaped(
     gtl::ArraySlice<int64> new_sizes) {
   CHECK(IsAligned());
-  ;
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape<NDIMS>(new_sizes, &dims);
   return typename TTypes<T, NDIMS>::Tensor(base<T>(), dims);
@@ -609,7 +619,6 @@ template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::bit_casted_shaped(
     gtl::ArraySlice<int64> new_sizes) const {
   CHECK(IsAligned());
-  ;
   Eigen::array<Eigen::DenseIndex, NDIMS> dims;
   FillDimsAndValidateCompatibleShape(&dims, new_sizes);
   return typename TTypes<T, NDIMS>::ConstTensor(base<T>(), dims);
@@ -638,22 +647,36 @@ typename TTypes<T>::ConstScalar Tensor::scalar() const {
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::flat_inner_dims() {
-  return shaped<T, NDIMS>(ComputeFlatInnerDims(NDIMS));
+  return shaped<T, NDIMS>(ComputeFlatInnerDims(shape_.dim_sizes(), NDIMS));
 }
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::Tensor Tensor::flat_outer_dims() {
-  return shaped<T, NDIMS>(ComputeFlatOuterDims(NDIMS));
+  return shaped<T, NDIMS>(ComputeFlatOuterDims(shape_.dim_sizes(), NDIMS));
+}
+
+template <typename T, size_t NDIMS>
+typename TTypes<T, NDIMS>::Tensor Tensor::flat_inner_outer_dims(int64 begin) {
+  gtl::InlinedVector<int64,4> flat_outer = ComputeFlatOuterDims(
+      shape_.dim_sizes(), begin + NDIMS);
+  return shaped<T, NDIMS>(ComputeFlatInnerDims(flat_outer, NDIMS));
 }
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_inner_dims() const {
-  return shaped<T, NDIMS>(ComputeFlatInnerDims(NDIMS));
+  return shaped<T, NDIMS>(ComputeFlatInnerDims(shape_.dim_sizes(), NDIMS));
 }
 
 template <typename T, size_t NDIMS>
 typename TTypes<T, NDIMS>::ConstTensor Tensor::flat_outer_dims() const {
-  return shaped<T, NDIMS>(ComputeFlatOuterDims(NDIMS));
+  return shaped<T, NDIMS>(ComputeFlatOuterDims(shape_.dim_sizes(), NDIMS));
+}
+
+template <typename T, size_t NDIMS>
+typename TTypes<T, NDIMS>::Tensor Tensor::flat_inner_outer_dims(int64 begin) const {
+  gtl::InlinedVector<int64,4> flat_outer = ComputeFlatOuterDims(
+      shape_.dim_sizes(), begin + NDIMS);
+  return shaped<T, NDIMS>(ComputeFlatInnerDims(flat_outer, NDIMS));
 }
 
 inline Tensor::Tensor(const Tensor& other)
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index c907bbb69fe418d898b8404e18582b0620c8f540..2626402ccd5b8fb563630c282a64ab26bef8afd7 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -202,11 +202,19 @@ TEST(Tensor_QInt32, Simple) {
   TestCopies<qint32>(t);
 }
 
-TEST(Tensor_Float, Reshape) {
-  Tensor t(DT_FLOAT, TensorShape({2, 3, 4, 5}));
-  EXPECT_TRUE(t.shape().IsSameSize(TensorShape({2, 3, 4, 5})));
+class TensorReshapeTest : public ::testing::Test {
+ protected:
+  Tensor t;
+  Tensor zero_t;
+
+  TensorReshapeTest()
+      : t(DT_FLOAT, TensorShape({2, 3, 4, 5})),
+        zero_t(DT_FLOAT, TensorShape({3, 0, 2, 0, 5})) {}
+
+  virtual void SetUp() {
+    EXPECT_TRUE(t.shape().IsSameSize(TensorShape({2, 3, 4, 5})));
+    EXPECT_TRUE(zero_t.shape().IsSameSize(TensorShape({3, 0, 2, 0, 5})));
 
-  {
     auto tensor = t.tensor<float, 4>();
     EXPECT_EQ(2, tensor.dimension(0));
     EXPECT_EQ(3, tensor.dimension(1));
@@ -217,6 +225,10 @@ TEST(Tensor_Float, Reshape) {
     tensor(0, 0, 0, 0) = 0.01f;
     tensor(1, 2, 3, 4) = 0.02f;
   }
+};
+
+TEST_F(TensorReshapeTest, Reshape) {
+  LOG(INFO) << "shaped";
   {
     auto shaped = t.shaped<float, 1>({120});
     EXPECT_EQ(120, shaped.dimension(0));
@@ -248,6 +260,10 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(shaped(0, 0, 0, 0), 0.01f);
     EXPECT_EQ(shaped(1, 2, 3, 4), 0.02f);
   }
+}
+
+TEST_F(TensorReshapeTest, Flat) {
+  LOG(INFO) << "flat";
   {
     auto flat = t.flat<float>();
     EXPECT_EQ(flat(0), 0.01f);
@@ -255,6 +271,10 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat(0), 0.01f);
     EXPECT_EQ(flat(119), 0.02f);
   }
+}
+
+TEST_F(TensorReshapeTest, FlatInnerDims) {
+  LOG(INFO) << "flat_inner_dims";
   {
     auto flat_inner_dims = t.flat_inner_dims<float>();
     EXPECT_EQ(24, flat_inner_dims.dimension(0));
@@ -262,13 +282,6 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat_inner_dims(0, 0), 0.01f);
     EXPECT_EQ(flat_inner_dims(23, 4), 0.02f);
   }
-  {
-    auto flat_outer_dims = t.flat_outer_dims<float>();
-    EXPECT_EQ(2, flat_outer_dims.dimension(0));
-    EXPECT_EQ(60, flat_outer_dims.dimension(1));
-    EXPECT_EQ(flat_outer_dims(0, 0), 0.01f);
-    EXPECT_EQ(flat_outer_dims(1, 59), 0.02f);
-  }
   {
     auto flat_inner_dims = t.flat_inner_dims<float, 3>();
     EXPECT_EQ(6, flat_inner_dims.dimension(0));
@@ -277,14 +290,6 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat_inner_dims(0, 0, 0), 0.01f);
     EXPECT_EQ(flat_inner_dims(5, 3, 4), 0.02f);
   }
-  {
-    auto flat_outer_dims = t.flat_outer_dims<float, 3>();
-    EXPECT_EQ(2, flat_outer_dims.dimension(0));
-    EXPECT_EQ(3, flat_outer_dims.dimension(1));
-    EXPECT_EQ(20, flat_outer_dims.dimension(2));
-    EXPECT_EQ(flat_outer_dims(0, 0, 0), 0.01f);
-    EXPECT_EQ(flat_outer_dims(1, 2, 19), 0.02f);
-  }
   {
     auto flat_inner_dims = t.flat_inner_dims<float, 5>();
     EXPECT_EQ(1, flat_inner_dims.dimension(0));
@@ -295,6 +300,44 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat_inner_dims(0, 0, 0, 0, 0), 0.01f);
     EXPECT_EQ(flat_inner_dims(0, 1, 2, 3, 4), 0.02f);
   }
+  {
+    auto flat_inner_dims = zero_t.flat_inner_dims<float>();
+    EXPECT_EQ(0, flat_inner_dims.dimension(0));
+    EXPECT_EQ(5, flat_inner_dims.dimension(1));
+  }
+  {
+    auto flat_inner_dims = zero_t.flat_inner_dims<float, 3>();
+    EXPECT_EQ(0, flat_inner_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_dims.dimension(1));
+    EXPECT_EQ(5, flat_inner_dims.dimension(2));
+  }
+  {
+    auto flat_inner_dims = zero_t.flat_inner_dims<float, 5>();
+    EXPECT_EQ(3, flat_inner_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_dims.dimension(2));
+    EXPECT_EQ(0, flat_inner_dims.dimension(3));
+    EXPECT_EQ(5, flat_inner_dims.dimension(4));
+  }
+}
+
+TEST_F(TensorReshapeTest, FlatOuterDims) {
+  LOG(INFO) << "flat_outer_dims";
+  {
+    auto flat_outer_dims = t.flat_outer_dims<float>();
+    EXPECT_EQ(2, flat_outer_dims.dimension(0));
+    EXPECT_EQ(60, flat_outer_dims.dimension(1));
+    EXPECT_EQ(flat_outer_dims(0, 0), 0.01f);
+    EXPECT_EQ(flat_outer_dims(1, 59), 0.02f);
+  }
+  {
+    auto flat_outer_dims = t.flat_outer_dims<float, 3>();
+    EXPECT_EQ(2, flat_outer_dims.dimension(0));
+    EXPECT_EQ(3, flat_outer_dims.dimension(1));
+    EXPECT_EQ(20, flat_outer_dims.dimension(2));
+    EXPECT_EQ(flat_outer_dims(0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_outer_dims(1, 2, 19), 0.02f);
+  }
   {
     auto flat_outer_dims = t.flat_outer_dims<float, 5>();
     EXPECT_EQ(2, flat_outer_dims.dimension(0));
@@ -305,8 +348,6 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(flat_outer_dims(0, 0, 0, 0, 0), 0.01f);
     EXPECT_EQ(flat_outer_dims(1, 2, 3, 4, 0), 0.02f);
   }
-
-  Tensor zero_t(DT_FLOAT, TensorShape({3, 0, 2, 0, 5}));
   {
     auto flat_outer_dims = zero_t.flat_outer_dims<float>();
     EXPECT_EQ(3, flat_outer_dims.dimension(0));
@@ -326,24 +367,132 @@ TEST(Tensor_Float, Reshape) {
     EXPECT_EQ(0, flat_outer_dims.dimension(3));
     EXPECT_EQ(5, flat_outer_dims.dimension(4));
   }
+}
+
+TEST_F(TensorReshapeTest, FlatInnerOuterDims) {
+  LOG(INFO) << "flat_inner_outer_dims";
   {
-    auto flat_inner_dims = zero_t.flat_inner_dims<float>();
-    EXPECT_EQ(0, flat_inner_dims.dimension(0));
-    EXPECT_EQ(5, flat_inner_dims.dimension(1));
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 4>(0);
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(1, 2, 3, 4), 0.02f);
   }
   {
-    auto flat_inner_dims = zero_t.flat_inner_dims<float, 3>();
-    EXPECT_EQ(0, flat_inner_dims.dimension(0));
-    EXPECT_EQ(0, flat_inner_dims.dimension(1));
-    EXPECT_EQ(5, flat_inner_dims.dimension(2));
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 6>(-2);
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(5));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 1, 2, 3, 4), 0.02f);
   }
   {
-    auto flat_inner_dims = zero_t.flat_inner_dims<float, 5>();
-    EXPECT_EQ(3, flat_inner_dims.dimension(0));
-    EXPECT_EQ(0, flat_inner_dims.dimension(1));
-    EXPECT_EQ(2, flat_inner_dims.dimension(2));
-    EXPECT_EQ(0, flat_inner_dims.dimension(3));
-    EXPECT_EQ(5, flat_inner_dims.dimension(4));
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 6>(0);
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(5));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(1, 2, 3, 4, 0, 0), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 8>(-2);
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(5));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(6));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(7));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 1, 2, 3, 4, 0, 0), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 3>(1);
+    EXPECT_EQ(6, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(5, 3, 4), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 5>(1);
+    EXPECT_EQ(6, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(4, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(5, 3, 4, 0, 0), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 3>(0);
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(20, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(1, 2, 19), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 5>(-2);
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(1, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(20, flat_inner_outer_dims.dimension(4));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 0, 0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(0, 0, 1, 2, 19), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = t.flat_inner_outer_dims<float, 2>(1);
+    EXPECT_EQ(6, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(20, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(flat_inner_outer_dims(0, 0), 0.01f);
+    EXPECT_EQ(flat_inner_outer_dims(5, 19), 0.02f);
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 2>(0);
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(1));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 3>(0);
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(2));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 5>(0);
+    EXPECT_EQ(3, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(2));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(3));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(4));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 2>(3);
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(1));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 3>(2);
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(5, flat_inner_outer_dims.dimension(2));
+  }
+  {
+    auto flat_inner_outer_dims = zero_t.flat_inner_outer_dims<float, 3>(1);
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(0));
+    EXPECT_EQ(2, flat_inner_outer_dims.dimension(1));
+    EXPECT_EQ(0, flat_inner_outer_dims.dimension(2));
   }
 }
 
@@ -810,7 +959,8 @@ TEST(Tensor, Slice_Basic) {
 
 namespace {
 template <typename T>
-Tensor MkTensor(DataType dt, TensorShape shape, std::vector<T> init_values) {
+Tensor MkTensor(DataType dt, const TensorShape& shape,
+                std::vector<T> init_values) {
   Tensor x(dt, shape);
   const int limit = x.NumElements();
   int vi = 0;
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index a374f848a1789ba90bca5bb54a437202a15c3089..dc396e468ae8ebfc357b95ff6419b20d3ac3b5ff 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -169,7 +169,9 @@ bool DataTypeFromString(StringPiece sp, DataType* dt) {
   return false;
 }
 
-string DeviceTypeString(DeviceType device_type) { return device_type.type(); }
+string DeviceTypeString(const DeviceType& device_type) {
+  return device_type.type();
+}
 
 string DataTypeSliceString(const DataTypeSlice types) {
   string out;
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index 932d788f230bccd434316146fb8d9ce69cd0eb62..0a81b1cb9f300a1734ddb7cd2e80fb1077d45d52 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -82,7 +82,7 @@ typedef gtl::InlinedVector<DeviceType, 4> DeviceTypeVector;
 
 // Convert the enums to strings for errors:
 string DataTypeString(DataType dtype);
-string DeviceTypeString(DeviceType device_type);
+string DeviceTypeString(const DeviceType& device_type);
 string DataTypeSliceString(const DataTypeSlice dtypes);
 inline string DataTypeVectorString(const DataTypeVector& dtypes) {
   return DataTypeSliceString(dtypes);
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 38f011ecaf1308f88d89a294bac456309e0614b3..3bfba3fc4ee8fd02abf3adacfbbe81f437cfc443 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -23,8 +23,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-void DFS(const Graph& g, std::function<void(Node*)> enter,
-         std::function<void(Node*)> leave) {
+void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+         const std::function<void(Node*)>& leave) {
   // Stack of work to do.
   struct Work {
     Node* node;
@@ -61,15 +61,23 @@ void DFS(const Graph& g, std::function<void(Node*)> enter,
   }
 }
 
-void ReverseDFS(const Graph& g, std::function<void(Node*)> enter,
-                std::function<void(Node*)> leave) {
+void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
+                const std::function<void(Node*)>& leave) {
+  ReverseDFSFrom(g, {g.sink_node()}, enter, leave);
+}
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                    const std::function<void(Node*)>& enter,
+                    const std::function<void(Node*)>& leave) {
   // Stack of work to do.
   struct Work {
     Node* node;
     bool leave;  // Are we entering or leaving n?
   };
-  std::vector<Work> stack;
-  stack.push_back(Work{g.sink_node(), false});
+  std::vector<Work> stack(start.size());
+  for (int i = 0; i < start.size(); ++i) {
+    stack[i] = Work{start[i], false};
+  }
 
   std::vector<bool> visited(g.num_node_ids(), false);
   while (!stack.empty()) {
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 74aace8072270f3daa6fa03dac0bad5d5c86fd62..01d36e0a12403c6fc9b3db0d2c73205d0c002197 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -21,20 +21,28 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 
 // Perform a depth-first-search on g starting at the source node.
 // If enter is not empty, calls enter(n) before visiting any children of n.
 // If leave is not empty, calls leave(n) after visiting all children of n.
-extern void DFS(const Graph& g, std::function<void(Node*)> enter,
-                std::function<void(Node*)> leave);
+extern void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+                const std::function<void(Node*)>& leave);
 
 // Perform a reverse depth-first-search on g starting at the sink node.
 // If enter is not empty, calls enter(n) before visiting any parents of n.
 // If leave is not empty, calls leave(n) after visiting all parents of n.
-extern void ReverseDFS(const Graph& g, std::function<void(Node*)> enter,
-                       std::function<void(Node*)> leave);
+extern void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
+                       const std::function<void(Node*)>& leave);
+
+// Perform a reverse depth-first-search on g starting at the 'start' nodes.
+// If enter is not empty, calls enter(n) before visiting any parents of n.
+// If leave is not empty, calls leave(n) after visiting all parents of n.
+extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                           const std::function<void(Node*)>& enter,
+                           const std::function<void(Node*)>& leave);
 
 // Stores in *order the post-order numbering of all nodes
 // in graph found via a depth first search starting at the source node.
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index 8409fb4cd0b41b3f36dc5b4ba7a3abd0478a7994..db6683d1e74512e37a40773b7642cf33eb888782 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -88,7 +88,7 @@ Status BuildControlFlowInfo(Graph* g, std::vector<ControlFlowInfo>* info) {
           out_info->frame = out;
           out_info->parent_frame = frame;
           TF_RETURN_IF_ERROR(
-              GetNodeAttr(out->def(), "frame_name", &out_info->frame_name));
+              GetNodeAttr(out->attrs(), "frame_name", &out_info->frame_name));
           if (out_info->frame_name.empty()) {
             return errors::InvalidArgument("The Enter node ", out->name(),
                                            " must have a frame name.");
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 65baf4cd8563fffa154675e79062b37415aa83dd..9066de5668076687d11cd938d4fce38dab9c248b 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -30,6 +30,45 @@ const int Graph::kControlSlot = -1;
 
 // Node
 
+#define REF_CLASS(key, value) \
+  {key, value}, { "Ref" key, value }
+
+const std::unordered_map<string, Node::NodeClass>& Node::kNodeClassTable =
+    *new std::unordered_map<string, Node::NodeClass>({
+        // Keep in same order as NodeClass values
+        REF_CLASS("Switch", NC_SWITCH),
+        REF_CLASS("Merge", NC_MERGE),
+        REF_CLASS("Enter", NC_ENTER),
+        REF_CLASS("Exit", NC_EXIT),
+        REF_CLASS("NextIteration", NC_NEXT_ITERATION),
+        {"LoopCond", NC_LOOP_COND},
+        {"ControlTrigger", NC_CONTROL_TRIGGER},
+        {"_Send", NC_SEND},
+        {"_HostSend", NC_HOST_SEND},
+        {"_Recv", NC_RECV},
+        {"_HostRecv", NC_HOST_RECV},
+        {"Const", NC_CONSTANT},
+        {"HostConst", NC_CONSTANT},
+        {"Variable", NC_VARIABLE},
+        {"VariableV2", NC_VARIABLE},
+        REF_CLASS("Identity", NC_IDENTITY),
+        {"GetSessionHandle", NC_GET_SESSION_HANDLE},
+        {"GetSessionHandleV2", NC_GET_SESSION_HANDLE},
+        {"GetSessionTensor", NC_GET_SESSION_TENSOR},
+        {"DeleteSessionTensor", NC_DELETE_SESSION_TENSOR},
+    });
+
+#undef REF_CLASS
+
+Node::NodeClass Node::GetNodeClassForOp(const string& ts) {
+  auto it = kNodeClassTable.find(ts);
+  if (it != kNodeClassTable.end()) {
+    return it->second;
+  } else {
+    return NC_OTHER;
+  }
+}
+
 string Node::DebugString() const {
   string ret = strings::StrCat("{name:'", name(), "' id:", id_);
   if (IsSource()) {
@@ -39,7 +78,7 @@ string Node::DebugString() const {
   } else {
     strings::StrAppend(&ret, " op device:");
     strings::StrAppend(&ret, "{", assigned_device_name_, "}");
-    strings::StrAppend(&ret, " def:{", SummarizeNodeDef(def()), "}}");
+    strings::StrAppend(&ret, " def:{", SummarizeNode(*this), "}}");
   }
   return ret;
 }
@@ -70,41 +109,7 @@ void Node::Initialize(int id, int cost_id, Properties* props) {
   }
   props_ = props;
   // Initialize the class_ based on the type string
-  const string& ts = this->type_string();
-  class_ = NC_UNINITIALIZED;
-
-#define SET_CLASS(enum_val, ts, str1, str2)        \
-  do {                                             \
-    if ((((ts) == (str1)) || ((ts) == (str2)))) {  \
-      /* Cannot be member of more than one class*/ \
-      CHECK(class_ == NC_UNINITIALIZED);           \
-      class_ = (enum_val);                         \
-    }                                              \
-  } while (0)
-
-  SET_CLASS(NC_SWITCH, ts, "Switch", "RefSwitch");
-  SET_CLASS(NC_MERGE, ts, "Merge", "RefMerge");
-  SET_CLASS(NC_ENTER, ts, "Enter", "RefEnter");
-  SET_CLASS(NC_EXIT, ts, "Exit", "RefExit");
-  SET_CLASS(NC_NEXT_ITERATION, ts, "NextIteration", "RefNextIteration");
-  SET_CLASS(NC_LOOP_COND, ts, "LoopCond", "");
-  SET_CLASS(NC_CONTROL_TRIGGER, ts, "ControlTrigger", "");
-  SET_CLASS(NC_SEND, ts, "_Send", "");
-  SET_CLASS(NC_HOST_SEND, ts, "_HostSend", "");
-  SET_CLASS(NC_RECV, ts, "_Recv", "");
-  SET_CLASS(NC_HOST_RECV, ts, "_HostRecv", "");
-  SET_CLASS(NC_CONSTANT, ts, "Const", "HostConst");
-  SET_CLASS(NC_VARIABLE, ts, "Variable", "");
-  SET_CLASS(NC_VARIABLE, ts, "VariableV2", "");
-  SET_CLASS(NC_IDENTITY, ts, "Identity", "RefIdentity");
-  SET_CLASS(NC_GET_SESSION_HANDLE, ts, "GetSessionHandle", "");
-  SET_CLASS(NC_GET_SESSION_HANDLE, ts, "GetSessionHandleV2", "");
-  SET_CLASS(NC_GET_SESSION_TENSOR, ts, "GetSessionTensor", "");
-  SET_CLASS(NC_DELETE_SESSION_TENSOR, ts, "DeleteSessionTensor", "");
-  if (class_ == NC_UNINITIALIZED) {
-    class_ = NC_OTHER;  // Catch all
-  }
-#undef SET_CLASS
+  class_ = GetNodeClassForOp(props->node_def_.op());
 }
 
 void Node::Clear() {
@@ -199,7 +204,7 @@ Status Node::input_edges(std::vector<const Edge*>* input_edges) const {
   return Status::OK();
 }
 
-Status Node::input_node(int idx, const Node** n) const {
+Status Node::input_node(int idx, Node** n) const {
   const Edge* e;
   TF_RETURN_IF_ERROR(input_edge(idx, &e));
   if (e == nullptr) {
@@ -210,6 +215,13 @@ Status Node::input_node(int idx, const Node** n) const {
   return Status::OK();
 }
 
+Status Node::input_node(int idx, const Node** const_n) const {
+  Node* n;
+  TF_RETURN_IF_ERROR(input_node(idx, &n));
+  *const_n = n;
+  return Status::OK();
+}
+
 // Node::Properties
 
 Node::Properties::Properties(const OpDef* op_def, const NodeDef& node_def,
@@ -292,6 +304,17 @@ Node* Graph::CopyNode(Node* node) {
   props->Ref();
   Node* copy = AllocateNode(props, node);
   copy->set_assigned_device_name(node->assigned_device_name());
+
+  // Since the OpDef of a function may be owned by the Graph that owns 'node',
+  // relookup the OpDef in the target graph. If it differs, then clone the
+  // node properties with the updated OpDef.
+  const OpDef* op_def;
+  TF_CHECK_OK(ops_.LookUpOpDef(node->type_string(), &op_def));
+  if (op_def != props->op_def_) {
+    copy->MaybeCopyOnWrite();
+    copy->props_->op_def_ = op_def;
+  }
+
   return copy;
 }
 
@@ -337,7 +360,7 @@ const Edge* Graph::AddEdge(Node* source, int x, Node* dest, int y) {
   CHECK(source->out_edges_.insert(e).second);
   CHECK(dest->in_edges_.insert(e).second);
   edges_.push_back(e);
-  edge_set_.insert(e);
+  ++num_edges_;
   return e;
 }
 
@@ -347,8 +370,8 @@ void Graph::RemoveEdge(const Edge* e) {
   CHECK_EQ(e->src_->out_edges_.erase(e), size_t{1});
   CHECK_EQ(e->dst_->in_edges_.erase(e), size_t{1});
   CHECK_EQ(e, edges_[e->id_]);
+  CHECK_GT(num_edges_, 0);
 
-  CHECK_EQ(edge_set_.erase(e), size_t{1});
   edges_[e->id_] = nullptr;
 
   Edge* del = const_cast<Edge*>(e);
@@ -358,6 +381,39 @@ void Graph::RemoveEdge(const Edge* e) {
   del->src_output_ = kControlSlot - 1;
   del->dst_input_ = kControlSlot - 1;
   free_edges_.push_back(del);
+  --num_edges_;
+}
+
+Status Graph::AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) {
+  for (const FunctionDef& fdef : fdef_lib.function()) {
+    const FunctionDef* preexisting_fdef = ops_.Find(fdef.signature().name());
+    if (preexisting_fdef != nullptr) {
+      if (!FunctionDefsEqual(*preexisting_fdef, fdef)) {
+        return errors::InvalidArgument(
+            "Cannot add function '", fdef.signature().name(),
+            "' because a different function with the same name already "
+            "exists.");
+      }
+      // Ignore duplicate FunctionDefs
+      continue;
+    }
+    TF_RETURN_IF_ERROR(ops_.AddFunctionDef(fdef));
+  }
+  for (const GradientDef& grad : fdef_lib.gradient()) {
+    string preexisting_grad_func = ops_.FindGradient(grad.function_name());
+    if (!preexisting_grad_func.empty()) {
+      if (preexisting_grad_func != grad.gradient_func()) {
+        return errors::InvalidArgument(
+            "Cannot assign gradient function '", grad.gradient_func(), "' to '",
+            grad.function_name(), "' because it already has gradient function ",
+            "'", preexisting_grad_func, "'");
+      }
+      // Ignore duplicate GradientDefs
+      continue;
+    }
+    TF_RETURN_IF_ERROR(ops_.AddGradientDef(grad));
+  }
+  return Status::OK();
 }
 
 namespace {
@@ -380,7 +436,8 @@ void Graph::ToGraphDef(GraphDef* graph_def) const {
 
 void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
   graph_def->Clear();
-  graph_def->mutable_versions()->CopyFrom(versions());
+  *graph_def->mutable_versions() = versions();
+  *graph_def->mutable_library() = ops_.ToProto();
   std::vector<const Edge*>
       inputs;  // Construct this outside the loop for speed.
   for (auto id = from_node_id; id < num_node_ids(); ++id) {
@@ -417,7 +474,7 @@ void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
     for (size_t i = 0; i < inputs.size(); ++i) {
       const Edge* edge = inputs[i];
       if (edge == nullptr) {
-        node_def->add_input(node->def().input(i));
+        node_def->add_input(node->requested_inputs()[i]);
       } else {
         const Node* src = edge->src();
         if (!src->IsOp()) continue;
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 4af4b0bb109febe4a4f4cc99440914f168c00567..8554cb2f4b7aa8b58fbc2b4218877fe88499d193 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -71,6 +71,7 @@ class Node {
   int cost_id() const { return cost_id_; }
   const string& name() const { return props_->node_def_.name(); }
   const string& type_string() const { return props_->node_def_.op(); }
+
   // def() provides the NodeDef the user supplied, but the specifics
   // of this Node may have changed due to placement, optimization, etc.
   // In particular:
@@ -80,18 +81,23 @@ class Node {
   // * def().device() is the "user's requested device" and may not match
   //   the actual assigned device, see assigned_device_name() below;
   // * def().attr() is authoritative.
+  // TODO(irving): Replace with NodeInfo.
   const NodeDef& def() const { return props_->node_def_; }
   const OpDef& op_def() const { return *props_->op_def_; }
 
   // input and output types
-  int num_inputs() const { return props_->input_types_.size(); }
-  DataType input_type(int i) const { return props_->input_types_[i]; }
+  int32 num_inputs() const { return props_->input_types_.size(); }
+  DataType input_type(int32 i) const { return props_->input_types_[i]; }
   const DataTypeVector& input_types() const { return props_->input_types_; }
 
-  int num_outputs() const { return props_->output_types_.size(); }
-  DataType output_type(int o) const { return props_->output_types_[o]; }
+  int32 num_outputs() const { return props_->output_types_.size(); }
+  DataType output_type(int32 o) const { return props_->output_types_[o]; }
   const DataTypeVector& output_types() const { return props_->output_types_; }
 
+  // The device requested by the user.  For the actual assigned device,
+  // use assigned_device_name() below.
+  const string& requested_device() const { return def().device(); }
+
   // This gives the device the runtime has assigned this node to.  If
   // you want the device the user requested, use def().device() instead.
   // TODO(josh11b): Validate that the assigned_device, if not empty:
@@ -103,6 +109,14 @@ class Node {
     assigned_device_name_ = device_name;
   }
 
+  // Read only access to attributes
+  AttrSlice attrs() const { return AttrSlice(def()); }
+
+  // Inputs requested by the NodeDef.  For the actual inputs, use in_edges.
+  const protobuf::RepeatedPtrField<string>& requested_inputs() const {
+    return def().input();
+  }
+
   // Get the neighboring nodes via edges either in or out of this node.
   gtl::iterator_range<NeighborIter> in_nodes() const;
   gtl::iterator_range<NeighborIter> out_nodes() const;
@@ -159,6 +173,7 @@ class Node {
   // Returns into '*n' the node that has an output connected to the
   // 'idx' input of this Node.
   Status input_node(int idx, const Node** n) const;
+  Status input_node(int idx, Node** n) const;
 
  private:
   friend class Graph;
@@ -220,6 +235,10 @@ class Node {
     NC_OTHER  // Not a special kind of node
   };
 
+  static const std::unordered_map<string, NodeClass>& kNodeClassTable;
+
+  static NodeClass GetNodeClassForOp(const string& ts);
+
   int id_;       // -1 until Initialize() is called
   int cost_id_;  // -1 if there is no corresponding cost accounting node
   NodeClass class_;
@@ -267,6 +286,66 @@ class Edge {
   int dst_input_;
 };
 
+// Allows for iteration of the edges of a Graph, by iterating the underlying
+// Graph.edges_ vector while skipping over null entries.
+class GraphEdgesIterable {
+ private:
+  const std::vector<Edge*>& edges_;
+
+ public:
+  explicit GraphEdgesIterable(const std::vector<Edge*>& edges)
+      : edges_(edges) {}
+
+  typedef Edge* value_type;
+
+  class const_iterator {
+   private:
+    // The underlying iterator.
+    std::vector<value_type>::const_iterator iter_;
+
+    // The end of the underlying iterator.
+    std::vector<value_type>::const_iterator end_;
+
+    // Advances iter_ until it reaches a non-null item, or reaches the end.
+    void apply_filter() {
+      while (iter_ != end_ && *iter_ == nullptr) {
+        ++iter_;
+      }
+    }
+
+   public:
+    const_iterator(std::vector<value_type>::const_iterator iter,
+                   std::vector<value_type>::const_iterator end)
+        : iter_(iter), end_(end) {
+      apply_filter();
+    }
+
+    bool operator==(const const_iterator& other) const {
+      return iter_ == other.iter_;
+    }
+
+    bool operator!=(const const_iterator& other) const {
+      return iter_ != other.iter_;
+    }
+
+    // This is the prefix increment operator (++x), which is the operator
+    // used by C++ range iteration (for (x : y) ...).  We intentionally do not
+    // provide a postfix increment operator.
+    const_iterator& operator++() {
+      ++iter_;
+      apply_filter();
+      return *this;
+    }
+
+    value_type operator*() { return *iter_; }
+  };
+
+  const_iterator begin() {
+    return const_iterator(edges_.begin(), edges_.end());
+  }
+  const_iterator end() { return const_iterator(edges_.end(), edges_.end()); }
+};
+
 // Thread compatible but not thread safe.
 class Graph {
  public:
@@ -324,6 +403,12 @@ class Graph {
   // REQUIRES: The edge must exist.
   void RemoveEdge(const Edge* edge);
 
+  // Adds the function and gradient definitions in `fdef_lib` to this graph's op
+  // registry. Ignores duplicate functions, and returns a bad status if an
+  // imported function differs from an existing function or op with the same
+  // name.
+  Status AddFunctionLibrary(const FunctionDefLibrary& fdef_lib);
+
   // The number of live nodes in the graph.
   //
   // Because nodes can be removed from the graph, num_nodes() is often
@@ -338,7 +423,7 @@ class Graph {
   // smaller than num_edge_ids(). If one needs to create an array of
   // edges indexed by edge ids, num_edge_ids() should be used as the
   // array's size.
-  int num_edges() const { return edges().size(); }
+  int num_edges() const { return num_edges_; }
 
   // Serialize the nodes starting at `from_node_id` to a GraphDef.
   void ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const;
@@ -374,7 +459,7 @@ class Graph {
 
   // Access to the set of all edges.  Example usage:
   //   for (const Edge* e : graph.edges()) { ... }
-  const EdgeSet& edges() const { return edge_set_; }
+  GraphEdgesIterable edges() const { return GraphEdgesIterable(edges_); }
 
   // The pre-defined nodes.
   enum { kSourceId = 0, kSinkId = 1 };
@@ -414,9 +499,8 @@ class Graph {
   // the edge with that id was removed from the graph.
   std::vector<Edge*> edges_;
 
-  // For ease of iteration, we currently just keep a set of all live
-  // edges.  May want to optimize by removing this copy.
-  EdgeSet edge_set_;
+  // The number of entries in edges_ that are not nullptr.
+  int num_edges_ = 0;
 
   // Allocated but free nodes and edges.
   std::vector<Node*> free_nodes_;
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 6b27e4e29454456fe5f465bc29aa12b56f76e526..70087b8fe1590f2849d949cdc233e1eae309f18d 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -423,7 +424,7 @@ Status GraphConstructor::ValidateShape(Node* node) {
   // For nodes with the _output_shapes atttribute, override the shape.
   std::vector<TensorShapeProto> shape_attrs;
   const char* kAttrName = "_output_shapes";
-  if (!GetNodeAttr(node->def(), kAttrName, &shape_attrs).ok()) {
+  if (!GetNodeAttr(node->attrs(), kAttrName, &shape_attrs).ok()) {
     // No _output_shapes attribute, the AddNode call above was sufficient.
     return Status::OK();
   }
@@ -457,7 +458,7 @@ Status GraphConstructor::ValidateShape(Node* node) {
       // functions that are not critical to correct execution but
       // would cause graphs to fail if imported after correcting.
       //
-      const string& op = node->def().op();
+      const string& op = node->type_string();
       const std::vector<string> whitelist = {
           // To be removed after 2017/03/08.
           "RandomShuffleQueue", "PaddingFIFOQueue", "FIFOQueue",
@@ -604,6 +605,10 @@ void GraphConstructor::AddPrefixToNodeDef(
 }
 
 Status GraphConstructor::Convert() {
+  // Import functions before adding nodes, since imported nodes may refer to
+  // functions
+  TF_RETURN_IF_ERROR(g_->AddFunctionLibrary(gdef_->library()));
+
   std::vector<InputInfo> inputs;
   int processed = 0;
   // Process the NodeDefs in topological order.
@@ -705,7 +710,12 @@ Status GraphConstructor::Convert() {
         TF_RETURN_IF_ERROR(MakeEdge(inputs[i].node, inputs[i].index, node, i));
       }
     }
-    TF_RETURN_IF_ERROR(ValidateShape(node));
+
+    // TODO(skyewm): remove conditional when b/35715995 ("Functions lack shape
+    // inference") is resolved.
+    if (g_->flib_def().Find(node_def->name()) == nullptr) {
+      TF_RETURN_IF_ERROR(ValidateShape(node));
+    }
 
     // Update pending_count_ for outputs.
     for (size_t i = 0; i < outputs_[o].size(); ++i) {
@@ -829,11 +839,6 @@ Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts,
 Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
                       Graph* g, ShapeRefiner* refiner,
                       std::vector<std::pair<Node*, int>>* return_tensors) {
-  ShapeRefiner default_refiner(gdef.versions().producer(), g->op_registry());
-  if (refiner == nullptr) {
-    refiner = &default_refiner;
-  }
-
   if (!opts.return_tensors.empty()) {
     if (return_tensors == nullptr) {
       return errors::InvalidArgument(
@@ -847,10 +852,36 @@ Status ImportGraphDef(const ImportGraphDefOptions& opts, const GraphDef& gdef,
           return_tensors->size(), ")");
     }
   }
-  if (gdef.library().function_size() != 0) {
-    return errors::Unimplemented(
-        "Importing GraphDefs containing functions not yet implemented");
-  }
+
+  ShapeRefiner default_refiner(gdef.versions().producer(), g->op_registry());
+  if (refiner == nullptr) {
+    refiner = &default_refiner;
+  } else {
+    // Log a warning if we are importing a GraphDef at an older
+    // producer version after already having added non-source/sink
+    // nodes to the graph in the past.
+    if (gdef.versions().producer() > 0 &&
+        gdef.versions().producer() < refiner->graph_def_version() &&
+        g->num_nodes() > 2) {
+      LOG(WARNING) << "Importing a graph with a lower producer version "
+                   << gdef.versions().producer()
+                   << " into an existing graph with producer version "
+                   << refiner->graph_def_version() << ". Shape inference will "
+                   << "have run different parts of the graph with different "
+                   << "producer versions.";
+    }
+  }
+
+  // Set the graph def version of the refiner as the min of the
+  // current value and the version from the graph we are about to
+  // import.
+  //
+  // Note: to match Run() semantics, we should re-run shape inference
+  // on the entire graph if the producer version has changed.  For now
+  // we log the warning above.
+  refiner->set_graph_def_version(
+      std::min(refiner->graph_def_version(), gdef.versions().producer()));
+
   return GraphConstructor::Construct(opts, &gdef, g, refiner, return_tensors);
 }
 
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index 4252b08e48c3eec5a791c92adc7238b1a0cfff03..54d38cac65ce12e8c12a56852277b574a781ee45 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -24,15 +24,6 @@ limitations under the License.
 namespace tensorflow {
 class ShapeRefiner;
 
-// Options specific to constant folding optimizations.
-//
-// TODO(ashankar,vrv): This should move to where constant folding is done.
-struct ConstantFoldingOptions {
-  // If "consider" is not a nullptr, then only constant fold a node "n" if
-  // consider(n) returns true.
-  std::function<bool(const Node*)> consider = nullptr;
-};
-
 // Construct a Graph *g out of a GraphDef gdef. Returns non-OK on
 // error, in which case *g is left in an incomplete state.
 //
@@ -60,7 +51,7 @@ extern Status ConvertGraphDefToGraph(const GraphConstructorOptions& opts,
 // On error, returns non-OK and leaves *g unmodified.
 //
 // "shape_refiner" can be null. It should be non-null if the caller
-// intends to add additonal nodes to the graph after the import. This
+// intends to add additional nodes to the graph after the import. This
 // allows the caller to validate shapes of those nodes (since
 // ShapeRefiner::AddNode must be called in topological order).
 //
@@ -113,8 +104,6 @@ struct ImportGraphDefOptions {
   // with ops that are not defined in the binary calling ImportGraphDef.
   // Similar to the producer_op_list argument to import_graph_def in the
   // python API.
-
-  // TODO(skyewm): Enable importing functions
 };
 
 // Each `return_tensors` entry is the requested node and output index. The index
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 7c847916d12f54dacde4dbebcbbe86907931b180..6013b2ff512e74febead624731d59fabc65c13ee 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/version.h"
 
 // TODO(josh11b): Test InitCostModel().
@@ -145,7 +146,7 @@ class GraphConstructorTest : public ::testing::Test {
       return "";
     }
     std::vector<string> value;
-    Status s = GetNodeAttr(n->def(), kColocationAttrName, &value);
+    Status s = GetNodeAttr(n->attrs(), kColocationAttrName, &value);
     if (!s.ok()) {
       return "";
     }
@@ -996,7 +997,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_DefaultAttrs) {
   }
   ASSERT_TRUE(a != nullptr);
   int value = 0;
-  s = GetNodeAttr(a->def(), "default_int", &value);
+  s = GetNodeAttr(a->attrs(), "default_int", &value);
   ASSERT_EQ(Status::OK(), s) << s << " -- " << a->def().DebugString();
   EXPECT_EQ(31415, value);
 }
@@ -1200,9 +1201,9 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMap) {
 
   // Check that t1's NodeDef is consistent with graph
   Node* t1 = FindNode("t1");
-  ASSERT_EQ(t1->def().input_size(), 2);
-  ASSERT_EQ(t1->def().input(0), "input:1");
-  ASSERT_EQ(t1->def().input(1), "input:0");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  ASSERT_EQ(t1->requested_inputs()[0], "input:1");
+  ASSERT_EQ(t1->requested_inputs()[1], "input:0");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithPrefix) {
@@ -1253,19 +1254,19 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithPrefix) {
 
   // Check that NodeDefs are consistent with graph
   Node* t1 = FindNode("import/t1");
-  ASSERT_EQ(t1->def().input_size(), 2);
-  EXPECT_EQ(t1->def().input(0), "input:0");
-  EXPECT_EQ(t1->def().input(1), "input:0");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  EXPECT_EQ(t1->requested_inputs()[0], "input:0");
+  EXPECT_EQ(t1->requested_inputs()[1], "input:0");
 
   Node* t2 = FindNode("import/t2");
-  ASSERT_EQ(t2->def().input_size(), 2);
-  EXPECT_EQ(t2->def().input(0), "import/t1:0");
-  EXPECT_EQ(t2->def().input(1), "import/t1:0");
+  ASSERT_EQ(t2->requested_inputs().size(), 2);
+  EXPECT_EQ(t2->requested_inputs()[0], "import/t1:0");
+  EXPECT_EQ(t2->requested_inputs()[1], "import/t1:0");
 
   Node* t3 = FindNode("import/t3");
-  ASSERT_EQ(t3->def().input_size(), 2);
-  EXPECT_EQ(t3->def().input(0), "import/unmapped_input:0");
-  EXPECT_EQ(t3->def().input(1), "import/unmapped_input:1");
+  ASSERT_EQ(t3->requested_inputs().size(), 2);
+  EXPECT_EQ(t3->requested_inputs()[0], "import/unmapped_input:0");
+  EXPECT_EQ(t3->requested_inputs()[1], "import/unmapped_input:1");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithControlEdges) {
@@ -1794,24 +1795,24 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ControlDeps) {
 
   // Test that node defs are consistent with graph
   Node* w1 = FindNode("import/W1");
-  ASSERT_EQ(w1->def().input_size(), 2);
-  EXPECT_EQ(w1->def().input(0), "^W1");
-  EXPECT_EQ(w1->def().input(1), "^W2");
+  ASSERT_EQ(w1->requested_inputs().size(), 2);
+  EXPECT_EQ(w1->requested_inputs()[0], "^W1");
+  EXPECT_EQ(w1->requested_inputs()[1], "^W2");
 
   Node* input = FindNode("import/input");
-  ASSERT_EQ(input->def().input_size(), 2);
-  EXPECT_EQ(input->def().input(0), "^W1");
-  EXPECT_EQ(input->def().input(1), "^W2");
+  ASSERT_EQ(input->requested_inputs().size(), 2);
+  EXPECT_EQ(input->requested_inputs()[0], "^W1");
+  EXPECT_EQ(input->requested_inputs()[1], "^W2");
 
   Node* input2 = FindNode("import/input2");
-  ASSERT_EQ(input2->def().input_size(), 2);
-  EXPECT_EQ(input2->def().input(0), "^W1");
-  EXPECT_EQ(input2->def().input(1), "^W2");
+  ASSERT_EQ(input2->requested_inputs().size(), 2);
+  EXPECT_EQ(input2->requested_inputs()[0], "^W1");
+  EXPECT_EQ(input2->requested_inputs()[1], "^W2");
 
   Node* t1 = FindNode("import/t1");
-  ASSERT_EQ(t1->def().input_size(), 2);
-  EXPECT_EQ(t1->def().input(0), "import/input:0");
-  EXPECT_EQ(t1->def().input(1), "import/input:1");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  EXPECT_EQ(t1->requested_inputs()[0], "import/input:0");
+  EXPECT_EQ(t1->requested_inputs()[1], "import/input:1");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsWithCycle) {
@@ -1855,15 +1856,15 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsWithCycle) {
 
   // Test that node defs are consistent with graph
   Node* merge = FindNode("merge");
-  ASSERT_EQ(merge->def().input_size(), 3);
-  EXPECT_EQ(merge->def().input(0), "input:0");
-  EXPECT_EQ(merge->def().input(1), "t1:0");
-  EXPECT_EQ(merge->def().input(2), "^W1");
+  ASSERT_EQ(merge->requested_inputs().size(), 3);
+  EXPECT_EQ(merge->requested_inputs()[0], "input:0");
+  EXPECT_EQ(merge->requested_inputs()[1], "t1:0");
+  EXPECT_EQ(merge->requested_inputs()[2], "^W1");
 
   Node* t1 = FindNode("t1");
-  ASSERT_EQ(t1->def().input_size(), 2);
-  EXPECT_EQ(t1->def().input(0), "merge:0");
-  EXPECT_EQ(t1->def().input(1), "merge:0");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  EXPECT_EQ(t1->requested_inputs()[0], "merge:0");
+  EXPECT_EQ(t1->requested_inputs()[1], "merge:0");
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_ControlDepsErrors) {
@@ -2008,30 +2009,196 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ErrorsDoNoChangeTheGraph) {
 #undef EXPECT_IMPORT_FAILURE
 }
 
-TEST_F(GraphConstructorTest, ImportGraphDef_ErrorFunctionDefsUnimplemented) {
-  ExpectError(
+TEST_F(GraphConstructorTest, ImportGraphDef_FunctionDefs) {
+  // Import a graph def containing a function. The graph def was generated using
+  // this python code:
+  // @function.Defun(tf.float32, tf.float32, tf.float32)
+  // def FooGrad(x, y, dz): return dz, dz
+  //
+  // @function.Defun(tf.float32, tf.float32, grad_func=FooGrad)
+  // def Foo(x, y): return x + y
+  //
+  // p1 = tf.placeholder(tf.float32)
+  // p2 = tf.placeholder(tf.float32)
+  // foo = Foo(p1, p2)
+  ImportGraphDefOptions opts;
+  ExpectOK(
       R"EOF(
-library {
-  function {
-    signature {
-      name: "Foo_cc661786"
-      input_arg {
-        name: "x"
-        type: DT_FLOAT
+      node {
+        name: "Placeholder" op: "Placeholder"
+        attr { key: "dtype" value { type: DT_FLOAT } }
+        attr { key: "shape" value { shape { } } }
       }
-      output_arg {
-        name: "x"
-        type: DT_FLOAT
+      node {
+        name: "Placeholder_1" op: "Placeholder"
+        attr { key: "dtype" value { type: DT_FLOAT } }
+        attr { key: "shape" value { shape { } } }
       }
-    }
-    ret {
-      key: "x"
-      value: "x:0"
-    }
-  }
-})EOF",
-      ImportGraphDefOptions(),
-      {"Importing GraphDefs containing functions not yet implemented"});
+      node {
+        name: "Foo_d03c39a3" op: "Foo_d03c39a3"
+        input: "Placeholder" input: "Placeholder_1"
+      }
+      library {
+        function {
+          signature {
+            name: "Foo_d03c39a3"
+            input_arg { name: "x" type: DT_FLOAT }
+            input_arg { name: "y" type: DT_FLOAT }
+            output_arg { name: "add" type: DT_FLOAT }
+          }
+          node_def {
+            name: "add" op: "Add" input: "x" input: "y"
+            attr { key: "T" value { type: DT_FLOAT } }
+          }
+          ret { key: "add" value: "add:z:0" }
+        }
+        function {
+          signature {
+            name: "FooGrad_dc60abc8"
+            input_arg { name: "x" type: DT_FLOAT }
+            input_arg { name: "y" type: DT_FLOAT }
+            input_arg { name: "dz" type: DT_FLOAT }
+            output_arg { name: "dz" type: DT_FLOAT }
+            output_arg { name: "dz_U0" type: DT_FLOAT }
+          }
+          ret { key: "dz" value: "dz:0" }
+          ret { key: "dz_U0" value: "dz:0" }
+        }
+        gradient {
+          function_name: "Foo_d03c39a3" gradient_func: "FooGrad_dc60abc8"
+        }
+      }
+      versions { producer: 21 min_consumer: 12 }
+      )EOF",
+      opts);
+
+  EXPECT_TRUE(HasNode("Placeholder"));
+  EXPECT_TRUE(HasNode("Placeholder_1"));
+  EXPECT_TRUE(HasNode("Foo_d03c39a3"));
+  // Check that Foo and FooGrad have been imported
+  const OpDef* op_def;
+  TF_ASSERT_OK(graph_.op_registry()->LookUpOpDef("Foo_d03c39a3", &op_def));
+  TF_ASSERT_OK(graph_.op_registry()->LookUpOpDef("FooGrad_dc60abc8", &op_def));
+
+  // Re-serialize and run the graph. This tests that re-serialized functions can
+  // be imported again and that imported functions can be run.
+  GraphDef gdef;
+  graph_.ToGraphDef(&gdef);
+  EXPECT_EQ(gdef.library().function_size(), 2);
+  EXPECT_EQ(gdef.library().gradient_size(), 1);
+  EXPECT_EQ(gdef.library().gradient()[0].function_name(), "Foo_d03c39a3");
+  EXPECT_EQ(gdef.library().gradient()[0].gradient_func(), "FooGrad_dc60abc8");
+
+  std::unique_ptr<Session> sess(NewSession(SessionOptions()));
+  TF_ASSERT_OK(sess->Create(gdef));
+
+  Tensor p1(DT_FLOAT, TensorShape({1}));
+  p1.scalar<float>()() = 1.0;
+  Tensor p2(DT_FLOAT, TensorShape({1}));
+  p2.scalar<float>()() = 2.0;
+  std::vector<std::pair<string, Tensor>> inputs = {{"Placeholder", p1},
+                                                   {"Placeholder_1", p2}};
+  std::vector<string> output_names = {"Foo_d03c39a3"};
+  std::vector<string> target_names;
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(sess->Run(inputs, output_names, target_names, &outputs));
+
+  ASSERT_EQ(outputs.size(), 1);
+  EXPECT_EQ(outputs[0].scalar<float>()(), 3.0);
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_NestedFunctionDefs) {
+  // Import a graph def containing a function. The graph def was generated using
+  // this python code:
+  //   @function.Defun(tf.float32, tf.float32)
+  //   def Inner(x, y): return x + y
+  //
+  //   @function.Defun(tf.float32, tf.float32)
+  //   def Outer(x, y): return Inner(x, y)
+  //
+  //   p1 = tf.placeholder(tf.float32)
+  //   p2 = tf.placeholder(tf.float32)
+  //   Outer(p1, p2)
+  ImportGraphDefOptions opts;
+  ExpectOK(
+      R"EOF(
+      node {
+        name: "Placeholder" op: "Placeholder"
+        attr { key: "dtype" value { type: DT_FLOAT } }
+        attr { key: "shape" value { shape { } } }
+      }
+      node {
+        name: "Placeholder_1" op: "Placeholder"
+        attr { key: "dtype" value { type: DT_FLOAT } }
+        attr { key: "shape" value { shape { } } }
+      }
+      node {
+        name: "Outer_966fa13d" op: "Outer_966fa13d"
+        input: "Placeholder" input: "Placeholder_1"
+      }
+      library {
+        function {
+          signature {
+            name: "Outer_966fa13d"
+            input_arg { name: "x" type: DT_FLOAT }
+            input_arg { name: "y" type: DT_FLOAT }
+            output_arg { name: "Inner_d03c39a3" type: DT_FLOAT }
+          }
+          node_def {
+            name: "Inner_d03c39a3" op: "Inner_d03c39a3" input: "x" input: "y"
+          }
+          ret { key: "Inner_d03c39a3" value: "Inner_d03c39a3:add:0" }
+        }
+        function {
+          signature {
+            name: "Inner_d03c39a3"
+            input_arg { name: "x" type: DT_FLOAT }
+            input_arg { name: "y" type: DT_FLOAT }
+            output_arg { name: "add" type: DT_FLOAT }
+          }
+          node_def {
+            name: "add" op: "Add" input: "x" input: "y"
+            attr { key: "T" value { type: DT_FLOAT } }
+          }
+          ret { key: "add" value: "add:z:0" }
+        }
+      }
+      versions { producer: 21 min_consumer: 12 }
+      )EOF",
+      opts);
+
+  EXPECT_TRUE(HasNode("Placeholder"));
+  EXPECT_TRUE(HasNode("Placeholder_1"));
+  EXPECT_TRUE(HasNode("Outer_966fa13d"));
+  // Check that Inner and Outer have been imported
+  const OpDef* op_def;
+  Status s = graph_.op_registry()->LookUpOpDef("Inner_d03c39a3", &op_def);
+  ASSERT_TRUE(s.ok()) << s.error_message();
+  s = graph_.op_registry()->LookUpOpDef("Outer_966fa13d", &op_def);
+  ASSERT_TRUE(s.ok()) << s.error_message();
+
+  // Re-serialize and run the graph. This tests that re-serialized functions can
+  // be imported again and that imported functions can be run.
+  GraphDef gdef;
+  graph_.ToGraphDef(&gdef);
+  std::unique_ptr<Session> sess(NewSession(SessionOptions()));
+  s = sess->Create(gdef);
+  ASSERT_TRUE(s.ok()) << s.error_message();
+
+  Tensor p1(DT_FLOAT, TensorShape({1}));
+  p1.scalar<float>()() = 1.0;
+  Tensor p2(DT_FLOAT, TensorShape({1}));
+  p2.scalar<float>()() = 2.0;
+  std::vector<std::pair<string, Tensor>> inputs = {{"Placeholder", p1},
+                                                   {"Placeholder_1", p2}};
+  std::vector<string> output_names = {"Outer_966fa13d"};
+  std::vector<string> target_names;
+  std::vector<Tensor> outputs;
+  s = sess->Run(inputs, output_names, target_names, &outputs);
+  ASSERT_TRUE(s.ok()) << s.error_message();
+
+  ASSERT_EQ(outputs.size(), 1);
+  EXPECT_EQ(outputs[0].scalar<float>()(), 3.0);
 }
 
 TEST_F(GraphConstructorTest, CopyGraph) {
@@ -2104,5 +2271,176 @@ TEST_F(GraphConstructorTest, GraphDefVersionMergingDuringImport) {
   EXPECT_EQ(3, graph_.versions().bad_consumers(2));
 }
 
+TEST_F(GraphConstructorTest, ImportGraphDefProvidedShapeRefinerVersions) {
+  ImportGraphDefOptions opts;
+  // A valid graph at producer version 20, but one
+  // that would not import if the graph_def_version were 21.
+  string gdef_ascii = strings::StrCat(R"EOF(
+node {
+  name: "Sum/input"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\001\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Sum/reduction_indices"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\000\000\000\000\001\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Sum"
+  op: "Sum"
+  input: "Sum/input"
+  input: "Sum/reduction_indices"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+versions {
+  producer: 20
+})EOF");
+
+  // Create a shape refiner with the latest TF_GRAPH_DEF_VERSION.
+  // Importing the graphdef with an existing refiner should
+  // make the refiner inherit the graphdef version from the
+  // passed in graphdef since it has a lower producer.
+  ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
+  ExpectOK(gdef_ascii, opts, &refiner);
+
+  // Add another node with a higher producer
+  gdef_ascii = strings::StrCat(R"EOF(
+node {
+  name: "RandomConst"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\001\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+versions {
+  producer: 21
+})EOF");
+
+  ExpectOK(gdef_ascii, opts, &refiner);
+  // Check that the refiner's graph def version is the lowest of
+  // the graph defs we have seen so far.
+  EXPECT_EQ(20, refiner.graph_def_version());
+
+  // Add another node with a lower producer
+  gdef_ascii = strings::StrCat(R"EOF(
+node {
+  name: "RandomConst2"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 1
+          }
+        }
+        tensor_content: "\001\000\000\000\002\000\000\000"
+      }
+    }
+  }
+}
+versions {
+  producer: 17
+})EOF");
+  ExpectOK(gdef_ascii, opts, &refiner);
+
+  // Check that the refiner's graph def version is the lowest of
+  // the graph defs we have seen so far.
+  EXPECT_EQ(17, refiner.graph_def_version());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index fe994024c48f825d3f3eb976fe54c458844147d1..57a2f399e0e074ff0a2c671e8d12e0090006cfd3 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -356,7 +356,7 @@ string ControlLoopName(const string& name) {
 }
 
 bool IsControlLoop(const Node* node) {
-  const string& name = node->def().name();
+  const string& name = node->name();
   return StringPiece(name).starts_with("_cloop");
 }
 
@@ -468,7 +468,7 @@ Status AddControlLoop(const PartitionOptions& opts, Graph* g, const Node* src,
   const string& device_name = edge->dst()->assigned_device_name();
   const string& frame_name = src_info.frame_name;
   int parallel_iterations;
-  status = GetNodeAttr(src_info.frame->def(), "parallel_iterations",
+  status = GetNodeAttr(src_info.frame->attrs(), "parallel_iterations",
                        &parallel_iterations);
   if (!status.ok()) return status;
 
@@ -903,11 +903,11 @@ Status Partition(const PartitionOptions& opts, Graph* g,
           send_start_time = opts.start_times[src->id()].value();
           recv_start_time = opts.start_times[dst->id()].value();
         } else {
-          status = GetNodeAttr(src->def(), "_start_time", &send_start_time);
+          status = GetNodeAttr(src->attrs(), "_start_time", &send_start_time);
           if (!status.ok()) {
             return status;
           }
-          status = GetNodeAttr(dst->def(), "_start_time", &recv_start_time);
+          status = GetNodeAttr(dst->attrs(), "_start_time", &recv_start_time);
           if (!status.ok()) {
             return status;
           }
@@ -1028,9 +1028,10 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     }
   }
 
-  // Set versions
+  // Set versions and function library
   for (auto& it : *partitions) {
     it.second.mutable_versions()->CopyFrom(g->versions());
+    *it.second.mutable_library() = g->flib_def().ToProto();
   }
 
   // Set the start times for recvs at the very end.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index aa732f1fc98067ecc9f6ba0da522cd500ab44dc3..ee545dbfbfa6bdb8365544ab16fbb72776945a9b 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/random_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -137,7 +138,8 @@ Output ConstructOp(const Scope& scope, const string& op_type,
                    const gtl::ArraySlice<Input>& inputs) {
   if (!scope.ok()) return Output();
   const string unique_name = scope.GetUniqueNameForOp(op_type);
-  auto builder = NodeBuilder(unique_name, op_type);
+  auto builder =
+      NodeBuilder(unique_name, op_type, scope.graph()->op_registry());
   for (auto const& input : inputs) {
     builder.Input(ops::NodeOut(input.node(), input.index()));
   }
@@ -188,6 +190,15 @@ class GraphPartitionTest : public ::testing::Test {
     TF_EXPECT_GRAPH_EQ(graph_def, partitions_[b]);
   }
 
+  void ExpectFunctions(const FunctionDefLibrary& library,
+                       const std::set<string>& expected_names) {
+    std::set<string> actual_names;
+    for (const FunctionDef& fdef : library.function()) {
+      actual_names.insert(fdef.signature().name());
+    }
+    EXPECT_EQ(actual_names, expected_names);
+  }
+
   Scope in_;
   GraphDef in_graph_def_;
   Scope scope_a_;
@@ -401,5 +412,27 @@ TEST_F(GraphPartitionTest, PartitionIncompleteGraph) {
   EXPECT_EQ(error::INVALID_ARGUMENT, status.code()) << status;
 }
 
+TEST_F(GraphPartitionTest, Functions) {
+  FunctionDefLibrary fdef_lib;
+  *fdef_lib.add_function() = test::function::XTimesTwo();
+  *fdef_lib.add_function() = test::function::XTimesFour();
+  TF_ASSERT_OK(in_.graph()->AddFunctionLibrary(fdef_lib));
+
+  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+  auto a1 = FloatInput(in_.WithOpName("A1"));
+  auto b1 = FloatInput(in_.WithOpName("B1"));
+  ConstructOp(in_.WithOpName("A2"), "XTimesTwo", {a1});
+  ConstructOp(in_.WithOpName("B2"), "XTimesFour", {b1});
+
+  Partition(ToGraphDef(), &partitions_);
+  EXPECT_EQ(2, partitions_.size());
+
+  // Test that partition graphs inherit function library from original graph
+  string a = "/job:a/replica:0/task:0/cpu:0";
+  string b = "/job:a/replica:0/task:0/cpu:1";
+  ExpectFunctions(partitions_[a].library(), {"XTimesTwo", "XTimesFour"});
+  ExpectFunctions(partitions_[b].library(), {"XTimesTwo", "XTimesFour"});
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index f5ed7a83e47f64648daa168f898394a83b0e1726..89784c631f002528db5b9d58dab40c68c9fcf173 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <set>
 #include <vector>
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -50,8 +51,8 @@ class GraphTest : public ::testing::Test {
   GraphTest() : graph_(OpRegistry::Global()) {}
   ~GraphTest() override {}
 
-  static void VerifyNodes(Node* node, std::vector<Node*> expected_in,
-                          std::vector<Node*> expected_out) {
+  static void VerifyNodes(Node* node, const std::vector<Node*>& expected_in,
+                          const std::vector<Node*>& expected_out) {
     std::vector<Node*> in;
     for (const Edge* e : node->in_edges()) {
       in.push_back(e->src());
@@ -317,21 +318,21 @@ TEST_F(GraphTest, AddAttr) {
   n1->AddAttr("_a", "new_attr");
 
   string attr;
-  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->def(), "_a", &attr));
+  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->attrs(), "_a", &attr));
   EXPECT_EQ("new_attr", attr);
 
   Node* n2 = graph_.CopyNode(n1);
 
   n1->AddAttr("_b", "new_attr_2");
 
-  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->def(), "_a", &attr));
+  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->attrs(), "_a", &attr));
   EXPECT_EQ("new_attr", attr);
-  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->def(), "_b", &attr));
+  EXPECT_EQ(Status::OK(), GetNodeAttr(n1->attrs(), "_b", &attr));
   EXPECT_EQ("new_attr_2", attr);
 
-  EXPECT_EQ(Status::OK(), GetNodeAttr(n2->def(), "_a", &attr));
+  EXPECT_EQ(Status::OK(), GetNodeAttr(n2->attrs(), "_a", &attr));
   EXPECT_EQ("new_attr", attr);
-  EXPECT_NE(Status::OK(), GetNodeAttr(n2->def(), "_b", &attr));
+  EXPECT_NE(Status::OK(), GetNodeAttr(n2->attrs(), "_b", &attr));
 }
 
 // Convert edge iteration results into a sorted string.
@@ -387,6 +388,60 @@ TEST_F(GraphTest, InputEdges) {
   TF_EXPECT_OK(b->input_edges(&edges));
 }
 
+TEST_F(GraphTest, AddFunctionLibrary) {
+  // Basic functionality
+  FunctionDefLibrary proto;
+  *proto.add_function() = test::function::XTimesTwo();
+  *proto.add_function() = test::function::XTimesFour();
+  TF_EXPECT_OK(graph_.AddFunctionLibrary(proto));
+  EXPECT_TRUE(graph_.flib_def().Find("XTimesTwo") != nullptr);
+  EXPECT_TRUE(graph_.flib_def().Find("XTimesFour") != nullptr);
+
+  // Duplicate functions are ignored
+  TF_EXPECT_OK(graph_.AddFunctionLibrary(proto));
+  EXPECT_TRUE(graph_.flib_def().Find("XTimesTwo") != nullptr);
+  EXPECT_TRUE(graph_.flib_def().Find("XTimesFour") != nullptr);
+
+  // Duplicate names corresponding to different functions trigger an error
+  FunctionDefLibrary error_proto = proto;
+  *error_proto.mutable_function(0)->add_node_def() =
+      error_proto.function(0).node_def(0);
+  Status s = graph_.AddFunctionLibrary(error_proto);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(s.error_message(),
+            "Cannot add function 'XTimesTwo' because a different function with "
+            "the same name already exists.");
+
+  // Function with same name as an existing op triggers an error
+  error_proto = proto;
+  error_proto.mutable_function(0)->mutable_signature()->set_name("Add");
+  s = graph_.AddFunctionLibrary(error_proto);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(s.error_message(),
+            "Cannot add function 'Add' because an op with the same name "
+            "already exists.");
+
+  // Adding a gradient function to an existing function is ok
+  GradientDef* grad = proto.add_gradient();
+  grad->set_function_name("XTimesTwo");
+  grad->set_gradient_func("Undefined");  // undefined funcs in grads are ok
+  TF_EXPECT_OK(graph_.AddFunctionLibrary(proto));
+  EXPECT_EQ(graph_.flib_def().FindGradient("XTimesTwo"), "Undefined");
+
+  // Duplicate gradients are ignored
+  TF_EXPECT_OK(graph_.AddFunctionLibrary(proto));
+  EXPECT_EQ(graph_.flib_def().FindGradient("XTimesTwo"), "Undefined");
+
+  // Conflicting gradient triggers an error
+  error_proto = proto;
+  error_proto.mutable_gradient(0)->set_gradient_func("Undefined2");
+  s = graph_.AddFunctionLibrary(error_proto);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(s.error_message(),
+            "Cannot assign gradient function 'Undefined2' to 'XTimesTwo' "
+            "because it already has gradient function 'Undefined'");
+}
+
 REGISTER_OP("Input").Output("o: float");
 REGISTER_OP("In2Out1").Input("a: float").Input("b: float").Output("o: float");
 
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 6deaa79485dd3d917584d4e3fb898c5ce1efdbe5..94741a11ffa0ca5eb00ff2e9e5834e153f25b4b4 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 #include "tensorflow/core/graph/mkl_layout_pass.h"
 #include "tensorflow/core/util/mkl_util.h"
@@ -48,7 +49,7 @@ namespace tensorflow {
 //     1) Propagating Mkl layout as an additional output tensor
 //        (we will loosely call a tensor that carries Mkl layout as Mkl tensor
 //         henceforth.) from every Mkl supported NN layer.
-//     2) Context-based rewrite: This is neded in order to optimize
+//     2) Context-based rewrite: This is needed in order to optimize
 //        gradient ops of Conv2D+AddBias. Gradient op of both the Conv2D and
 //        MatMul is BiasAddGrad, and we need to rewrite BiasAddGrad into
 //        Conv2D-specific BiasAddGrad, and MatMul-specific BiasAddGrad.
@@ -63,12 +64,12 @@ namespace tensorflow {
 //           P = BiasAdd(O, C)
 //
 // We merge them into Conv2DWithBias as:
-//           P = MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
+//           P = _MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
 //
-// Meaning of A_m, B_m and C_m is explained in B.1.
+// The meaning of A_m, B_m and C_m is explained in B.1.
 //
 // Merge rules:
-//  - Merge for Conv2D and BiasAdd happens only when output of Conv2D _only_
+//  - The merge for Conv2D and BiasAdd happens when the output of Conv2D _only_
 //    goes to BiasAdd.
 //  - Also, the intersection of attributes of both the nodes must have same
 //    values.
@@ -76,7 +77,7 @@ namespace tensorflow {
 //
 // Example of B.1 : Rewriting nodes to Mkl nodes
 // ---------------------------------------------
-// Consider Relu layer. Current definition of Relu layer looks like:
+// Consider a Relu node. Current definition of Relu node looks like:
 //
 //           O = Relu(A)
 //
@@ -87,58 +88,59 @@ namespace tensorflow {
 //
 //          O, O_m = MklRelu(A, A_m)
 //
-// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here A input is
-// same as A input of Relu; O output is same as O output of Relu. O_m is the
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here input A is
+// same as input A of Relu; output O is same as output O of Relu. O_m is the
 // additional output tensor that will be set by MklRelu, and it represents
 // Mkl tensor corresponding to O -- in other words, O_m is some kind of
 // metadata for O. A_m is additional input of Relu, and it represents metadata
 // for A - as O_m is metadata for O, A_m is metadata for A. MklRelu receives
-// this metadata from previous layer (in the graph).
+// this metadata from previous node in the graph.
 //
-// When previous layer in the graph is Mkl layer, A_m will represent a valid
-// Mkl tensor. But when previous Mkl layer is not an Mkl layer, then A_m
-// represents a dummy Mkl tensor.
+// When a previous node in the graph is an Mkl node, A_m will represent a valid
+// Mkl tensor. But when a previous node is not an Mkl node, A_m will represent
+// a dummy Mkl tensor.
 //
 // Rewriting rules:
-//  - Selection of an op for rewriting happens by registering an op with this
-//     pass. If an op is not registered, then it is not rewritten.
+//  - Selection of a node for rewriting happens by registering the op type of
+//    the node with the rewriting pass. If the op type is not registered, then
+//    all nodes of this op type will not be rewritten.
 //  - Number of inputs after rewriting:
-//      Since for every input Tensorflow tensor, the rewritten layer gets Mkl
-//      tensor, rewritten op gets 2*N inputs, where N is the number of inputs
-//      for original op.
+//      Since for every input Tensorflow tensor, the rewritten node gets Mkl
+//      tensor(s), rewritten node gets 2*N inputs, where N is the number of
+//      inputs for the original node.
 //  - Number of outputs after rewriting:
-//      Since for every output Tensorflow tensor, the rewritten layer generates
-//      Mkl tensor, rewritten op generates 2*N outputs, where N is the number
-//      of outputs of original op.
+//      Since for every output Tensorflow tensor, the rewritten node generates
+//      Mkl tensor(s), the rewritten node generates 2*N outputs, where N is the
+//      number of outputs of the original node.
 //  - Ordering of Tensorflow tensors and Mkl tensors:
-//      Since every op generates twice the number of inputs and outputs, one
-//      could imagine different ordering among Tensorflow tensors and Mkl
-//      tensors. E.g., let's assume an op 'Conv2D' takes (A, B) as input, then
-//      new op 'MklConv2D' can take (A, A_m, B, B_m) as input or it can also
-//      take (A, B, A_m, B_m) as input. Among N inputs one can get N!
-//      permutations.
+//      Since every rewritten node generates twice the number of inputs and
+//      outputs, one could imagine various orderings among Tensorflow tensors
+//      and Mkl tensors. E.g., assume an op 'Conv2D' that takes (A, B) as
+//      inputs, then the new op '_MklConv2D' can take inputs A, B, A_m and B_m
+//      in A, A_m, B, B_m order or it can also take them in A, B, A_m, B_m
+//      order. Among N inputs one can get N! permutations.
 //
-//      So the question is: which one do we follow? Currently, we follow an
-//      intuitive order where Mkl tensor follows a corresponding Tensorflow
-//      tensor immediately. In the context of above example, it will be: (A,
-//      A_m, B, B_m). We follow same ordering rule for output tensors.
-//
-// NOTE: Current rewriting approach rewrites an op to Mkl op without any
-//      conditions. But in the future, it may be possible to consider
-//      conditions such as input shapes and sizes to rewrite an op.
+//      So the question is: which order do we follow? We support 2 types of
+//      orderings: (1) interleaved, and (2) contiguous. Interleaved ordering
+//      follows an intuitive order where an Mkl tensor follows the
+//      corresponding Tensorflow tensor immediately. In the context of the
+//      above example, it will be: A, A_m, B, B_m. Note that the ordering rule
+//      applies to both the inputs and outputs. Contiguous ordering means
+//      all the Tensorflow tensors are contiguous followed by all the Mkl
+//      tensors. We use contiguous ordering as default.
 //
 // Graph rewrite algorithm:
 //      Algorithm: Graph Rewrite
-//      Input: Graph G, Names of nodes to rewrite and their new nodes
-//      Output: Modified Graph G' if nodes are modified, G otherwise.
+//      Input: Graph G, Names of the nodes to rewrite and their new names
+//      Output: Modified Graph G' if the nodes are modified, G otherwise.
 //      Start:
-//        N = Topological_Sort(G) // N is set of nodes in toposort order.
+//        N = Topological_Sort(G) // N is a set of nodes in toposort order.
 //        foreach node n in N
 //        do
-//          if (Is_MKL_Layer(n))  // Can this layer accept Mkl layout as input.
+//          if (Is_MKL_Op(n))  // Can this node accept an Mkl layout as input.
 //          then
 //            E = set of <incoming edge and its src_output slot> of n
-//            E' = {}   // new set of edges for rewritten node
+//            E' = {}   // a new set of edges for rewritten node
 //            foreach <e,s> in E
 //            do
 //              E' U {<e,s>}  // First copy edge which generates Tensorflow
@@ -146,42 +148,44 @@ namespace tensorflow {
 //              m = Source node of edge e
 //              if Is_Rewritten(m)  // Did we rewrite this node in this pass?
 //              then
-//                E' U {<m,s+1>}    // If yes, then m will generate Mkl tensor
-//                                  // as output.
+//                E' U {<m,s+1>}    // If yes, then m will generate an Mkl
+//                                  // tensor as an additional output.
 //              else
-//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate dummy
+//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate a dummy
 //                                                 // Mkl tensor.
-//                E' U {<d,0>}   // Dummy Mkl tensor has only 1 output slot.
+//                E' U {<d,0>}  // The dummy Mkl tensor has only 1 output slot.
 //              fi
 //            done
 //            n' = Build_New_Node(G,new_name,E')
-//            Mark_Rewritten(n')  // Mark new node as being rewritten.
+//            Mark_Rewritten(n')  // Mark the new node as being rewritten.
 //          fi
 //        done
 //
 //      Explanation:
-//        For graph rewrite, we visit nodes of the graph in the topological
-//        sort order. With this ordering, we visit nodes in top-to-bottom
-//        fashion. We need this order because while visiting a node we want
-//        all of its input nodes (parents) visited (and rewritten if
-//        applicable). This is because if we need to rewrite a current node
+//        For graph rewrite, we visit nodes of the input graph in the
+//        topological sort order. With this ordering, we visit nodes in the
+//        top-to-bottom fashion. We need this order because while visiting a
+//        node we want that all of its input nodes are visited and rewritten if
+//        applicable. This is because if we need to rewrite a given node
 //        then all of its input nodes need to be fixed (in other words they
-//        cannot be removed later.)
+//        cannot be deleted later.)
 //
-//        While visiting each node, we first check if it is Mkl layer. If
-//        it is, then we rewrite that node after constructing new inputs to
-//        the node. If it is not Mkl layer, then we do not rewrite the node.
+//        While visiting a node, we first check if the op type of the node is
+//        an Mkl op. If it is, then we rewrite that node after constructing
+//        new inputs to the node. If the op type of the node is not Mkl op,
+//        then we do not rewrite that node.
 //
 // Handling workspace propagation for certain ops:
 //
 //        Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require
-//        passing of workspace from their corresponding forward ops. But
-//        TensorFlow does not have a notion of workspace and as a result
-//        does not allow producing additional outputs from these forward ops.
-//        For these ops, we need to add an additional edge between forward
-//        ops and their corresponding backward ops, and this edge carries
-//        workspace tensor value and another edge carries Mkl tensor for
-//        workspace tensor.
+//        passing of a workspace from their respective forward ops. Workspace
+//        tensors provide memory for storing results of intermediate operations
+//        which are helpful in backward propagation. TensorFlow does not have
+//        a notion of a workspace and as a result does not allow producing
+//        additional outputs from these forward ops. For these ops, we need
+//        to add 2 extra edges between forward ops and their corresponding
+//        backward ops - the first extra edge carries a workspace tensor and
+//        the second one carries an Mkl tensor for the workspace tensor.
 //
 //        Example:
 //
@@ -190,59 +194,61 @@ namespace tensorflow {
 //        A = MaxPool(T)
 //        B = MaxPoolGrad(X, A, Y)
 //
-//        We will transform this graph to propagate workspace as:
+//        We will transform this graph to propagate the workspace as:
+//        (with the contiguous ordering)
 //
-//        A, A_m, W, W_m = MklMaxPool(T, T_m)
-//        B, B_m = MklMaxPoolGrad(X, X_m, A, A_m, Y, Y_m, W, W_m)
+//        A, W, A_m, W_m = MklMaxPool(T, T_m)
+//        B, B_m = MklMaxPoolGrad(X, A, Y, W, X_m, A_m, Y_m, W_m)
 //
-//        Here W is the workspace tensor. Transformed tensors with name
-//        suffix _m are Mkl tensors and this transformation has been done
+//        Here W is the workspace tensor. Transformed tensor names with the
+//        suffix _m are Mkl tensors, and this transformation has been done
 //        using the algorithm discussed earlier. The transformation for
-//        workspace only adds extra outputs (W, W_m) for forward op and
-//        connects them to corresponding backward ops.
+//        workspace propagation only adds extra outputs (W, W_m) for a forward
+//        op and connects them to the corresponding backward ops.
 //
 //        Terms:
 //
 //        Forward op name = name of the op in the forward pass
-//          where workspace originates (MaxPool in this example)
+//          where a workspace tensor originates (MaxPool in this example)
 //        Backward op name = name of the op in the backward pass that receives
-//          workspace from forward op (MaxPoolGrad in the example)
-//        Slot = Number of the output or input slot that will be
-//               used by the workspace (2 for MklMaxPool as W is 3rd
-//               output of MaxPool (0 is 1st); 6 for MklMaxPoolGrad)
+//          a workspace tensor from the forward op (MaxPoolGrad in the example)
+//        Slot = Position of the output or input slot that will be
+//               used by the workspace tensor (1 for MklMaxPool as W is the 2nd
+//               output of MaxPool (0 is 1st); 3 for MklMaxPoolGrad)
 //
 //        Question:
 //
-//        How do we associate backward op to forward op? There can be more
-//        than one op with exact same name.
+//        How do we associate a backward op to a forward op? There can be more
+//        than one op with the exact same name.
 //
-//        In this example we associate MaxPoolGrad with MaxPool. But there
+//        In this example, we associate MaxPoolGrad with MaxPool. But there
 //        could be more than one MaxPool ops. To solve this problem, we look
-//        for _direct_ edge between forward op and backward op (tensor A is
-//        flowing along this edge in the example.)
+//        for _direct_ edge between a forward op and a backward op (tensor A is
+//        flowing along this edge in the example).
 //
-//        How do we transform forward and backward op when there is no direct
-//        edge between them? In such case, we generate dummy tensors as
+//        How do we transform forward and backward ops when there is no direct
+//        edge between them? In such a case, we generate dummy tensors for
 //        workspace tensors. For the example, transformation of MaxPool will
-//        be exactly same --- it is just that MaxPool won't generate any
-//        workspace tensor. For MaxPoolGrad, transformation will also be same,
-//        but instead of connecting W and W_m with outputs of MaxPool, we will
-//        produce dummy tensors for them, and we will set workspace_enabled
-//        attribute to false.
+//        be exactly same as it would be when there is a direct edge between
+//        the forward and the backward op --- it is just that MaxPool won't
+//        generate any workspace tensor. For MaxPoolGrad, the transformation
+//        will also be same, but instead of connecting W and W_m with the
+//        outputs of MaxPool, we will produce dummy tensors for them, and we
+//        will set workspace_enabled attribute to false.
 //
 // Example of B.2 : Context-based node rewrite
 // -------------------------------------------
 // Consider BiasAddGrad op as:
 //
-//           O = MklConv2D(A, A_m, B, B_m, C, C_m)
+//           O = _MklConv2D(A, B, C, A_m, B_m, C_m)
 //           P = BiasAddGrad(O)
 //
-// Then we rewrite is as:
+// Then we rewrite it as:
 //
 //           P = Conv2DWithBiasBackpropBias(O, O_m)
 //
-// 'Distance' between input of BiasAddGrad and MklConv2D in terms of hops is
-// the context matching depth. If MklConv2DWithBias is not within the context
+// 'Distance' between input of BiasAddGrad and _MklConv2D in terms of hops is
+// the context matching depth. If _MklConv2DWithBias is not within the context
 // matching depth, then we do not rewrite BiasAddGrad.
 
 // How many hops do we search for matching node in the backward dataflow graph?
@@ -255,54 +261,118 @@ static size_t kNodeMergeContextMaxDepth = 10;
 class MklLayoutRewritePass : public GraphOptimizationPass {
  public:
   MklLayoutRewritePass() {
-    csinfo_.conv2d            = "Conv2D";
-    csinfo_.mklconv2d         = "MklConv2D";
-    csinfo_.mklconv2dwithbias = "MklConv2DWithBias";
-    csinfo_.mklconv2dwithbiasbackpropbias = "MklConv2DWithBiasBackpropBias";
-    csinfo_.biasadd           = "BiasAdd";
-    csinfo_.matmul            = "MatMul";
-    csinfo_.biasaddgrad       = "BiasAddGrad";
-    csinfo_.relu              = "Relu";
-    csinfo_.relugrad          = "ReluGrad";
-    csinfo_.maxpool           = "MaxPool";
-    csinfo_.maxpoolgrad       = "MaxPoolGrad";
-    csinfo_.avgpool           = "AvgPool";
-    csinfo_.avgpoolgrad       = "AvgPoolGrad";
-    csinfo_.conv2dgradinput   = "Conv2DBackpropInput";
-    csinfo_.conv2dgradfilter  = "Conv2DBackpropFilter";
-
-    rinfo_.push_back({csinfo_.conv2d,   csinfo_.mklconv2d,
-                      2, CopyAttrsConv2D, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.conv2dgradfilter,
-        GetMklOpName(csinfo_.conv2dgradfilter),
-                      3, CopyAttrsConv2D, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.conv2dgradinput,
-        GetMklOpName(csinfo_.conv2dgradinput),
-                      3, CopyAttrsConv2D, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.relu, GetMklOpName(csinfo_.relu),
-                      1, CopyAttrsRelu, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.maxpool, GetMklOpName(csinfo_.maxpool),
-                      1, CopyAttrsPooling, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.maxpoolgrad, GetMklOpName(csinfo_.maxpoolgrad),
-                      3, CopyAttrsPooling, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.avgpool, GetMklOpName(csinfo_.avgpool),
-                      1, CopyAttrsPooling, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.avgpoolgrad, GetMklOpName(csinfo_.avgpoolgrad),
-                      2, CopyAttrsPooling, AlwaysRewrite});
+    // NOTE: names are alphabetically sorted.
+    csinfo_.avg_pool = "AvgPool";
+    csinfo_.avg_pool_grad = "AvgPoolGrad";
+    csinfo_.bias_add = "BiasAdd";
+    csinfo_.bias_add_grad = "BiasAddGrad";
+    csinfo_.concat = "Concat";
+    csinfo_.concatv2 = "ConcatV2";
+    csinfo_.conv2d = "Conv2D";
+    csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
+    csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
+    csinfo_.fused_batch_norm = "FusedBatchNorm";
+    csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
+    csinfo_.identity = "Identity";
+    csinfo_.lrn = "LRN";
+    csinfo_.lrn_grad = "LRNGrad";
+    csinfo_.matmul = "MatMul";
+    csinfo_.max_pool = "MaxPool";
+    csinfo_.max_pool_grad = "MaxPoolGrad";
+    csinfo_.mkl_conv2d = "_MklConv2D";
+    csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
+    csinfo_.mkl_conv2d_with_bias_backprop_bias =
+                                   "_MklConv2DWithBiasBackpropBias";
+    csinfo_.relu                  = "Relu";
+    csinfo_.relu_grad             = "ReluGrad";
+    csinfo_.reshape               = "Reshape";
+    csinfo_.split                 = "Split";
+
+    // NOTE: names are alphabetically sorted.
+    rinfo_.push_back({csinfo_.avg_pool,
+                      GetMklOpName(csinfo_.avg_pool),
+                      CopyAttrsPooling, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.avg_pool_grad,
+                      GetMklOpName(csinfo_.avg_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite, nullptr});
+    // BiasAddGrad gets written into Conv2DWithBiasBackpropBias depending
+    // on if context contains Conv2D.
+    rinfo_.push_back({csinfo_.bias_add_grad,
+                      csinfo_.mkl_conv2d_with_bias_backprop_bias,
+                      CopyAttrsBiasAddGrad, ContextMatchRewrite,
+                      &biasaddgrad_conv2dwithbias_context_});
+    // BiasAddGrad gets written into BiasAddGrad depending on if context
+    // contains MatMul.
+    rinfo_.push_back({csinfo_.bias_add_grad, csinfo_.matmul,
+                      CopyAttrsBiasAddGrad, ContextMatchRewrite,
+                      &biasaddgrad_matmul_context_});
+    rinfo_.push_back({csinfo_.concat,
+                      GetMklOpName(csinfo_.concat),
+                      CopyAttrsConcat, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.concatv2,
+                      GetMklOpName(csinfo_.concatv2),
+                      CopyAttrsConcatV2, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.conv2d,
+                      GetMklOpName(csinfo_.conv2d),
+                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.conv2d_grad_filter,
+                      GetMklOpName(csinfo_.conv2d_grad_filter),
+                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.conv2d_grad_input,
+                      GetMklOpName(csinfo_.conv2d_grad_input),
+                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.fused_batch_norm,
+                      GetMklOpName(csinfo_.fused_batch_norm),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.fused_batch_norm_grad,
+                      GetMklOpName(csinfo_.fused_batch_norm_grad),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.identity,
+                      GetMklOpName(csinfo_.identity),
+                      CopyAttrsIdentity, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.lrn,
+                      GetMklOpName(csinfo_.lrn),
+                      CopyAttrsLRN, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.lrn_grad,
+                      GetMklOpName(csinfo_.lrn_grad),
+                      CopyAttrsLRN, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.max_pool,
+                      GetMklOpName(csinfo_.max_pool),
+                      CopyAttrsPooling, NonDepthBatchWisePoolRewrite, nullptr});
+    rinfo_.push_back({csinfo_.max_pool_grad,
+                      GetMklOpName(csinfo_.max_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.relu,
+                      GetMklOpName(csinfo_.relu),
+                      CopyAttrsRelu, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.relu_grad,
+                      GetMklOpName(csinfo_.relu_grad),
+                      CopyAttrsRelu, AlwaysRewrite, nullptr});
+    rinfo_.push_back({csinfo_.reshape,
+                      GetMklOpName(csinfo_.reshape),
+                      CopyAttrsReshape, AlwaysRewrite, nullptr});
 
     // Add info about which ops to add workspace edge to and the slots.
-    wsinfo_.push_back({csinfo_.maxpool, csinfo_.maxpoolgrad, 0, 1, 2, 6});
+    wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
+    wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3});
 
     // Add a rule for merging nodes
-    minfo_.push_back({csinfo_.mklconv2d, csinfo_.biasadd, 0,
-                      csinfo_.mklconv2dwithbias});
+    minfo_.push_back({csinfo_.mkl_conv2d, csinfo_.bias_add, 0,
+                      csinfo_.mkl_conv2d_with_bias});
 
     // We use maxhop of 10 based on empirical observations. Also, these are
     // maxhops in backward data-flow graph. Since input of forward nodes
     // (Conv2D) directly goes to backward nodes, we do not expect the
     // hop-distance would be more than few nodes.
-    cinfo_.push_back({csinfo_.biasaddgrad, csinfo_.mklconv2dwithbias,
-                      kNodeMergeContextMaxDepth});
+    biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
+                                   kNodeMergeContextMaxDepth};
+
+    biasaddgrad_conv2dwithbias_context_ = {csinfo_.bias_add_grad,
+                                   csinfo_.mkl_conv2d_with_bias,
+                                   kNodeMergeContextMaxDepth};
+
+    cinfo_.push_back(&biasaddgrad_matmul_context_);
+    cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
   }
 
   // Standard interface to run pass
@@ -317,93 +387,106 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // @return true, if and only if graph is mutated; false otherwise.
   bool RunPass(std::unique_ptr<Graph>* g);
 
- private:
-  /// Structure to specify name of original op, its new name after rewrite,
-  /// the number of inputs to the original op, and the function to be used
-  /// to copy attributes for the op
+  /// Structure to specify the context information used in a node rewrite rule
   typedef struct {
-    string name;   // Original name of the op in the graph
-    string newname;   // New name of op in the graph
-    int    numins;  // Number of inputs to the original op
-    // Function handler to copy attributes from old node to new node.
-    std::function<void(const Node*, NodeBuilder*)> copyattrs;
-    std::function<bool(const Node*)> rewriterule;  // Rule under which to
-                    // rewrite this node.
+    string node;     // Name of the node to be rewritten
+    string fwd;      // Name of the node in the forward pass that this node
+                     // corresponds to
+    size_t max_hop;  // Maximum number of hops the fwd is located
+                     // from this node. If the fwd is farther than max_hop
+                     // then we do not rewrite the node.
+  } ContextInfo;
+
+  /// Structure to specify the name of an original node, its new name after
+  /// rewrite, the number of inputs to the original node, the function to
+  /// be used to copy attributes for the op, and the rule (if any) which
+  /// must hold for rewriting the node
+  typedef struct {
+    string name;      // Original name of op of the node in the graph
+    string new_name;  // New name of the op of the node in the graph
+    // A function handler to copy attributes from an old node to a new node.
+    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
+    // A rule under which to rewrite this node
+    std::function<bool(const Node*, const ContextInfo* c)> rewrite_rule;
+    // ContextInfo, if any, to be used for rewrite
+    ContextInfo* context;
   } RewriteInfo;
 
-  /// Structure to specify forward op, backward op, and the slot numbers
-  /// in forward and backward op where we will add workspace edge.
+  /// Structure to specify a forward op, a backward op, and the slot numbers
+  /// in the forward and backward ops where we will add a workspace edge.
   typedef struct {
-    string fwdop;   // Name of the forward op in the graph
-    string bwdop;   // Name of the backward op in the graph
-    int fwdslot;    // Output slot in the forward op node where actual
-                    // output tensor resides
-    int bwdslot;    // Input slot in the backward op node where actual
-                    // input tensor resides
-    int wsfwdslot;  // Output slot in the forward op node where workspace
-                    // edge is added
-    int wsbwdslot;  // Input slot in the backward op node where workspace
-                    // edge is added
+    string fwd_op;    // Name of a forward op in the graph
+    string bwd_op;    // Name of a backward op in the graph
+    int fwd_slot;     // Output slot in the forward op node where actual
+                      // output tensor resides
+    int bwd_slot;     // Input slot in the backward op node where actual
+                      // input tensor resides
+    int ws_fwd_slot;  // Output slot in the forward op node where workspace
+                      // edge is added
+    int ws_bwd_slot;  // Input slot in the backward op node where workspace
+                      // edge is added
   } WorkSpaceInfo;
 
   /// Structure to specify information used in node merge
   typedef struct {
-    string pred;  // Predecessor node string
-    string succ;  // Successor node string
-    int    op;    // What operand no the predecessor node corresponds
-                  // to successor node?
-    string newnode;  // Name of the node after merge
+    string pred;      // Predecessor node string
+    string succ;      // Successor node string
+    int op;           // The operand no the predecessor node corresponds
+                      // to the successor node
+    string new_node;  // Name of the node after merge
   } MergeInfo;
 
-  /// Structure to specify the context information used in node rewrite rule
-  typedef struct {
-    string node;  // Name of the node to be rewritten
-    string fwd;  // Node name in forward pass that this node
-                 // corresponds to
-    size_t maxhop;  // Maximum number of hops the fwd is located
-                    // from this node. If fwd is farther than maxhop
-                    // then we do not rewrite the node.
-  } ContextInfo;
-
   /// Structure to store all constant strings
+  /// NOTE: names are alphabetically sorted.
   struct {
-    string relu;
-    string relugrad;
-    // Conv ops
+    string avg_pool;
+    string avg_pool_grad;
+    string bias_add;
+    string bias_add_grad;
+    string concat;
+    string concatv2;
     string conv2d;
-    string mklconv2d;
-    string conv2dgradinput;
-    string conv2dgradfilter;
-    string mklconv2dwithbias;
-    string mklconv2dwithbiasbackpropbias;
-    // Pooling ops
-    string maxpool;
-    string maxpoolgrad;
-    string avgpool;
-    string avgpoolgrad;
-    // Others
-    string biasadd;
+    string conv2d_grad_input;
+    string conv2d_grad_filter;
+    string fused_batch_norm;
+    string fused_batch_norm_grad;
+    string identity;
+    string lrn;
+    string lrn_grad;
     string matmul;
-    string biasaddgrad;
+    string max_pool;
+    string max_pool_grad;
+    string mkl_conv2d;
+    string mkl_conv2d_with_bias;
+    string mkl_conv2d_with_bias_backprop_bias;
+    string relu;
+    string relu_grad;
+    string reshape;
+    string split;
   } csinfo_;
 
+ private:
   /// Maintain info about nodes to rewrite
   std::vector<RewriteInfo> rinfo_;
 
   /// Maintain info about nodes to add workspace edge
   std::vector<WorkSpaceInfo> wsinfo_;
 
-  /// Maintain info  to be merged
+  /// Maintain info about nodes to be merged
   std::vector<MergeInfo> minfo_;
 
   /// Maintain info about nodes to rewrite
-  static std::vector<ContextInfo> cinfo_;
+  static std::vector<ContextInfo*> cinfo_;
+
+  /// Context variables used in referencing rules
+  static ContextInfo biasaddgrad_matmul_context_;
+  static ContextInfo biasaddgrad_conv2dwithbias_context_;
 
   /// Hash table to maintain nodes visited in the graph.
   std::unordered_set<const Node*> visited_nodes_;
 
  private:
-  // Predicate to check if we rewrote node 'n'
+  // Check if we rewrote node 'n'
   //
   // If we rewrote the node, then the rewritten node will produce
   // Mkl tensor as output. If we did not rewrite the node, then
@@ -418,8 +501,30 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   inline void MarkRewrittenNode(Node* n) { visited_nodes_.insert(n); }
 
   // Clear all visited nodes
-  inline void UnMarkRewrittenNodes() {
-    visited_nodes_.clear();
+  inline void UnMarkRewrittenNodes() { visited_nodes_.clear(); }
+
+  // Is OpDef::ArgDef a list type? It could be N * T or list(type).
+  // Refer to opdef.proto for details of list type.
+  inline bool ArgIsList(const OpDef::ArgDef& arg) const {
+    return !arg.type_list_attr().empty() || !arg.number_attr().empty();
+  }
+
+  // Get length of a list in 'n' if 'arg' is of list type. Refer to
+  // description of ArgIsList for definition of list type.
+  inline int GetTensorListLength(const OpDef::ArgDef& arg, Node* n) {
+    CHECK_EQ(ArgIsList(arg), true);
+    int N = 0;
+    const string attr_name = !arg.type_list_attr().empty()
+                                 ? arg.type_list_attr()
+                                 : arg.number_attr();
+    if (!arg.type_list_attr().empty()) {
+      std::vector<DataType> value;
+      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &value));
+      N = value.size();
+    } else {
+      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &N));
+    }
+    return N;
   }
 
   // Get the name of Mkl op from original TensorFlow op
@@ -427,10 +532,43 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // TODO(nhasabni) We should move this to mkl_util.h.
   inline string GetMklOpName(const string& name) const {
     // Prefix that we add to Tensorflow op name to construct Mkl op name.
-    const char* const kMklOpPrefix = "Mkl";
+    const char* const kMklOpPrefix = "_Mkl";
     return string(kMklOpPrefix) + name;
   }
 
+  // Can op represented by node 'n' run on DEVICE_CPU?
+  // Op can run on CPU with MKL if the runtime assigned device or the
+  // user requested device contains device CPU, or both are empty.
+  bool CanOpRunOnCPUDevice(const Node* n) {
+    bool result = true;
+    string reason;
+
+    // Substring that should be checked for in device name for CPU device.
+    const char* const kCPUDeviceSubStr = "cpu";
+
+    // If Op has been specifically assigned to a non-CPU device, then No.
+    if (!n->assigned_device_name().empty() &&
+        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "Op has been assigned a runtime device that is not CPU.";
+    }
+
+    // If user has specifically assigned this op to a non-CPU device, then No.
+    if (!n->def().device().empty() &&
+        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "User has assigned a device that is not CPU.";
+    }
+
+    if (result == false) {
+      VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
+              << n->type_string() << ", reason: " << reason;
+    }
+
+    // Otherwise Yes.
+    return result;
+  }
+
   // Return a node that can be merged with input node 'n'
   //
   // @return pointer to the node if we can find such a
@@ -442,7 +580,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //
   // Input nodes succ and pred may be deleted if the call to
   // this function is successful. Attempt to use the pointers
-  // after the call to function may result is undefined behaviors.
+  // after the call to function may result in undefined behaviors.
   //
   // @input g - input graph, succ - successor node, pred - predecessor node
   // @return Status::OK(), if merging is successful and supported.
@@ -455,30 +593,63 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // We check for 2 scenarios for rewrite.
   //
   // @return RewriteInfo* for the applicable rewrite rule
-  const RewriteInfo* CheckForNodeRewrite(const Node *n) const;
+  const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
 
   // Default rewrite rule to be used in scenario 1 for rewrite.
   // @return - true (since we want to always rewrite)
-  static bool AlwaysRewrite(const Node* n) { return true; }
-  // Rewrite rule that uses context-information for matching
+  static bool AlwaysRewrite(const Node* n, const ContextInfo* c = nullptr) {
+    return true;
+  }
+
+  // Check if we are performing pooling on depth or batch. If it is, then we
+  // do not rewrite MaxPool node to Mkl version.
+  // @return - true (if it is not a depth/batch wise pooling case);
+  //           false otherwise.
+  static bool NonDepthBatchWisePoolRewrite(const Node* n,
+                                           const ContextInfo* c) {
+    CHECK_NOTNULL(n);
+
+    string data_format_str;
+    TensorFormat data_format;
+    std::vector<int32> ksize, strides;
+    CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
+             true);
+    CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
+
+    // Condition that specifies non-batch-wise and non-depth-wise pooling.
+    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+        GetTensorDim(strides, data_format, 'N') == 1 &&
+        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(strides, data_format, 'C') == 1) {
+      return true;
+    }
+
+    return false;
+  }
+
+  // Rewrite rule that uses context-information for matching,
   // used in scenario 2.
   //
   // @input - Node 'n' for which to search for matching context
-  // @return - true if matching context is found; false otherwise.
-  static bool ContextMatchRewrite(const Node* n);
+  // @input - The context 'c' under which to rewrite
+  // @return - true if we can rewrite node under context 'c';
+  //           false otherwise.
+  static bool ContextMatchRewrite(const Node* n, const ContextInfo* c);
 
   // Helper function that searches the matching contextinfo for the node.
   // Implements depth-first search in the data dependence graph for the
   // gradient op in the backward direction.
   //
   // @input n - Node (gradient op) whose contextinfo is to be searched,
-  //        fwdn - pointer to node from the forward pass that this node
-  //        belongs to. fwdn cannot be NULL.
+  //        fwd_node - pointer to node from the forward pass that this node
+  //        belongs to. fwd_node cannot be NULL.
   // @return Matching contextinfo in case a match is found; null otherwise.
-  //         Also updates *fwdn with pointer to forward node that this context
-  //         matches.
+  //         Also updates *fwd_node with pointer to forward node that this
+  //         context matches.
   static const ContextInfo* SearchMatchingContext(const Node* n,
-                                                  const Node** fwdn);
+                                                  const Node** fwd_node);
 
   // Rewrites input node to a new node specified by its matching rewrite info.
   //
@@ -496,49 +667,139 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   //         Otherwise, it is not updated.
   Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
 
+  // Get nodes that will feed a list of TF tensors to the new
+  // node that we are constructing.
+  //
+  // @input g - input graph,
+  // @input inputs - inputs to old node that we are using for constructing
+  //                 new inputs,
+  // @input input_idx - the index in the 'inputs' vector pointing to the
+  //                    current input that we have processed so far
+  // @output input_idx - index will be incremented by the number of nodes
+  //                     from 'inputs' that are processed
+  // @input list_length - The expected length of list of TF tensors
+  // @output output_nodes - the list of new nodes creating TF tensors
+  //
+  // @return None
+  void GetNodesProducingTFTensorList(
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+  // Get nodes that will feed a list of Mkl tensors to the new
+  // node that we are constructing.
+  //
+  // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting
+  // @input inputs - inputs to old node that we are using for constructing
+  //                 new inputs,
+  // @input input_idx - the index in the 'inputs' vector pointing to the
+  //                    current input that we have processed so far
+  // @output input_idx - index will be incremented by the number of nodes
+  //                     from 'inputs' that are processed
+  // @input list_length - The expected length of list of Mkl tensors
+  // @output output_nodes - the list of new nodes creating Mkl tensors
+  //
+  // @return None
+  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
+    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    int* input_idx, int list_length,
+    std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+  // Get a node that will feed an Mkl tensor to the new
+  // node that we are constructing. The output node could be (1) 'n'
+  // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+  // if 'n' is not an Mkl layer.
+  //
+  // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting,
+  // @input n - Node based on which we are creating Mkl node,
+  // @input n_output_slot - the output slot of node 'n'
+  //            which is feeding to the node that we are constructing
+  // @output mkl_node - the new node that will feed Mkl tensor
+  // @output mkl_node_output_slot - the slot number of mkl_node that
+  //                                will feed the tensor
+  // @return None
+  void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
+    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+
   // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
-  // in graph 'g'. Original node is input in 'orign'.
+  // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
+  // set up in contiguous fashion. 'workspace_tensors' carry graph nodes
+  // producing workspace edges if 'are_workspace_tensors_available' is true.
+  // Otherwise, 'workspace_tensors' is empty vector.
   //
-  // For details, refer to 'Number of inputs after rewriting' section in the
+  // For details, refer to 'Ordering of inputs after rewriting' section in the
   // documentation above.
   //
   // Returns Status::OK() if setting up inputs is successful, otherwise
   // returns appropriate status code.
+  int SetUpContiguousInputs(
+      std::unique_ptr<Graph>* g,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+      NodeBuilder* nb, Node* old_node,
+      std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+      bool are_workspace_tensors_available);
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'orig_node'.
+  //
+  // For details, refer to 'Ordering of Tensorflow tensors and Mkl tensors'
+  // section in the documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
   Status SetUpInputs(std::unique_ptr<Graph>* g,
                      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-                     NodeBuilder* nb, Node* orign);
-
-  // Add workspace edge on the input or output side of Node 'orign' by using
-  // NodeBuilder 'nb' for the new node provided. If 'orign' does not dictate
-  // adding workspace edge then do not add it.
-  void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orign,
-      NodeBuilder* nb);
+                     NodeBuilder* nb, Node* orig_node);
+
+  // Add workspace edge on the input or output side of Node 'orig_node' by using
+  // NodeBuilder 'nb' for the new node provided. If 'orig_node' does not dictate
+  // adding workspace edge then do not add it. Workspace Tensorflow and Mkl
+  // tensors, if they need to be added, will be set into these tensors.
+  // If we set workspace tensors, then are_ws_tensors_added should be true.
+  void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orig_node,
+                                NodeBuilder* nb,
+                                std::vector<NodeBuilder::NodeOut>* ws_tensors,
+                                bool* are_ws_tensors_added);
 
   // Functions specific to operators to copy attributes
   // We need operator-specific function to copy attributes because the framework
   // does not provide any generic function for it.
-  static void CopyAttrsConv2D(const Node* orign, NodeBuilder* nb);
-  static void CopyAttrsBiasAddGrad(const Node* orign, NodeBuilder* nb);
-  static void CopyAttrsPooling(const Node* orign, NodeBuilder* nb);
-  static void CopyAttrsRelu(const Node* orign, NodeBuilder* nb);
+  // NOTE: names are alphabetically sorted.
+  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsIdentity(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsRelu(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
 
   // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
-  // using node for original node 'orign' and return it in '*out'.
+  // using node for original node 'orig_node' and return it in '*out'.
   // TODO(nhasabni) We should move this to mkl_util.h
   void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
-                             Node* orign);
+                             Node* orig_node);
   void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
-                             Node* orign);
+                                   Node* orig_node);
 };
 
+MklLayoutRewritePass::ContextInfo
+  MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
+MklLayoutRewritePass::ContextInfo
+  MklLayoutRewritePass::biasaddgrad_matmul_context_;
+std::vector<MklLayoutRewritePass::ContextInfo*> MklLayoutRewritePass::cinfo_;
 
-std::vector<MklLayoutRewritePass::ContextInfo> MklLayoutRewritePass::cinfo_;
-
-// We register Mkl rewrite pass for phase 1 in pre-placement group.
-// Do not change the ordering of the Mkl passes.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
-                      MklLayoutRewritePass);
-
+// We register Mkl rewrite pass for phase 1 in post partitioning group.
+// We register it here so that we get a complete picture of all users of Mkl
+// nodes. Do not change the ordering of the Mkl passes.
+const OptimizationPassRegistry::Grouping kMklLayoutRewritePassGroup =
+    OptimizationPassRegistry::POST_PARTITIONING;
+REGISTER_OPTIMIZATION(kMklLayoutRewritePassGroup, 1, MklLayoutRewritePass);
 
 //////////////////////////////////////////////////////////////////////////
 //           Helper functions for creating new node
@@ -547,7 +808,6 @@ REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1,
 static void FillInputs(const Node* n,
                        gtl::InlinedVector<Node*, 4>* control_edges,
                        gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
-  DCHECK_EQ(in->size(), n->num_inputs());
   control_edges->clear();
   for (const Edge* e : n->in_edges()) {
     if (e->IsControlEdge()) {
@@ -565,9 +825,30 @@ static void FillInputs(const Node* n,
   }
 }
 
+void MklLayoutRewritePass::GetNodesProducingTFTensorList(
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
+  CHECK_LT(*input_idx, inputs.size());
+  CHECK_GT(list_length, 0);
+  CHECK_NOTNULL(output_nodes);
+  output_nodes->reserve(list_length);
+
+  while (list_length != 0) {
+    CHECK_GT(list_length, 0);
+    CHECK_LT(*input_idx, inputs.size());
+    Node* n = inputs[*input_idx].first;
+    int slot = inputs[*input_idx].second;
+    // If input node 'n' is just producing a single tensor at
+    // output slot 'slot' then we just add that single node.
+    output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
+    (*input_idx)++;
+    list_length--;
+  }
+}
+
 // TODO(nhasabni) We should move this to mkl_util.h.
 void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
-                                                 Node** out, Node* orign) {
+                                                 Node** out, Node* orig_node) {
   // We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent
   // dummy Mkl tensor. 8 = 2*size_t.
   const DataType dt = DataTypeToEnum<uint8>::v();
@@ -579,61 +860,227 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
   TensorShape dummy_shape({8});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                 .Attr("value", proto)
-                 .Attr("dtype", dt)
-                 .Device(orign->def().device())  // We place this node on same
-                                             // device as device of original
-                                             // node.
-                 .Finalize(&**g, out));
-  (*out)->set_assigned_device_name(orign->assigned_device_name());
+               .Attr("value", proto)
+               .Attr("dtype", dt)
+               .Device(orig_node->def().device())  // We place this node on
+                                                   // the same device as the
+                                                   // device of the original
+                                                   // node.
+               .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(orig_node->input_node(0,
+                                      const_cast<const Node**>(&orig_input0)));
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+  }
+
+  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
 }
 
-Status MklLayoutRewritePass::SetUpInputs(
+void MklLayoutRewritePass::GetNodesProducingMklTensorList(
     std::unique_ptr<Graph>* g,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, NodeBuilder* nb,
-    Node* orign) {
-  std::vector<NodeBuilder::NodeOut> new_inputs;
-
-  // 1. Let's setup inputs for the new node.
-  for (int i = 0; i < inputs.size(); i++) {
-    Node* n = inputs[i].first;
-    // First let's copy original TF tensor input as it is.
-    new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second));
-
-    // Second, let's add edge to propagate Mkl tensors from input Mkl layers,
-    // or generate a dummy Mkl tensor representing not-mkl-tensor case.
-    if (IsRewrittenNode(n)) {
-      // If we have visited this node and rewritten it, then it will generate
-      // an edge that will receive Mkl tensor from a node.
-      // First, let's assert that this op is Mkl layer.
-      DataType T;
-      TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
-      // If this op has been rewritten, then its name must have been same as
-      // Mkl op.
-      CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string(), T), true);
-      // src slot number for Mkl tensor would be the one next to TF tensor
-      // slot number.
-      new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second + 1));
+    Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    int* input_idx, int list_length,
+    std::vector<NodeBuilder::NodeOut>* output_nodes) {
+  CHECK_LT(*input_idx, inputs.size());
+  CHECK_GT(list_length, 0);
+  CHECK_NOTNULL(output_nodes);
+  output_nodes->reserve(list_length);
+
+  while (list_length != 0) {
+    CHECK_GT(list_length, 0);
+    CHECK_LT(*input_idx, inputs.size());
+    Node* n = inputs[*input_idx].first;
+    int slot = inputs[*input_idx].second;
+    // If 'n' is producing a single tensor, then create a single Mkl tensor
+    // node.
+    Node* mkl_node = nullptr;
+    int mkl_node_output_slot = 0;
+    GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
+                              &mkl_node_output_slot);
+    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
+                                                mkl_node_output_slot));
+    (*input_idx)++;
+    list_length--;
+  }
+}
+
+// Get an input node that will feed Mkl tensor to the new
+// node that we are constructing. An input node could be (1) 'n'
+// if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+// if 'n' is not an Mkl layer.
+void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
+    Node* orig_node, Node* n,
+    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+  CHECK_NOTNULL(n);
+  CHECK_NOTNULL(mkl_node);
+  CHECK_NOTNULL(mkl_node_output_slot);
+  if (IsRewrittenNode(n)) {
+    // If we have visited this node and rewritten it, then it will generate
+    // an edge that will receive Mkl tensor from a node.
+    // First, let's assert that this op is Mkl layer.
+    DataType T;
+    TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
+    // If this op has been rewritten, then its name must have been same as
+    // Mkl op.
+    CHECK_EQ(mkl_op_registry::IsMklOp(n->type_string(), T), true);
+    // output slot number for Mkl tensor would be N+slot number of TensorFlow
+    // tensor, where N is total number of TensorFlow tensors.
+    *mkl_node = n;
+    *mkl_node_output_slot =
+        GetTensorMetaDataIndex(n_output_slot, n->num_outputs());
+  } else {
+    // If we have not visited the node and rewritten it, then we need
+    // to create a dummy node that will feed a dummy Mkl tensor to this node.
+    // DummyMklTensor node has no input and generates only 1 output
+    // (dummy Mkl tensor) as output slot number 0.
+    GetDummyMklTensorNode(g, mkl_node, orig_node);
+    CHECK_NOTNULL(*mkl_node);
+    *mkl_node_output_slot = 0;
+  }
+}
+
+int MklLayoutRewritePass::SetUpContiguousInputs(
+    std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+    NodeBuilder* nb, Node* old_node,
+    std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+    bool are_workspace_tensors_available) {
+  CHECK_NOTNULL(workspace_tensors);
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+
+  // Number of input slots to original op
+  // Input slots are represented by .Input() calls in REGISTER_OP.
+  int old_node_input_slots = old_node->op_def().input_arg_size();
+  // Actual number of inputs can be greater than or equal to number
+  // of Input slots because inputs of type list could be unfolded.
+  CHECK_GE(old_node_inputs.size(), old_node_input_slots);
+  int nn_slot_idx = 0;  // slot index for inputs of new node
+
+  // Let's copy all inputs (TF tensors) of original node to new node.
+  int iidx = 0;
+  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+    // An input slot could be a single tensor or a list. We need
+    // to handle this case accordingly.
+    CHECK_LT(iidx, old_node_inputs.size());
+    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+    if (ArgIsList(arg)) {
+      std::vector<NodeBuilder::NodeOut> new_node_inputs;
+      int N = GetTensorListLength(arg, old_node);
+      GetNodesProducingTFTensorList(old_node_inputs, &iidx, N,
+                                    &new_node_inputs);
+      nb->Input(new_node_inputs);
+      nn_slot_idx++;
     } else {
-      // If we have not visited the node and rewritten it, then we need
-      // to create a dummy node that will feed a non-Mkl tensor to this node.
-      // DummyMklTensor node has no input and generates only 1 output
-      // (dummy Mkl tensor) as output slot number 0.
-      Node* dmt = nullptr;
-      GetDummyMklTensorNode(g, &dmt, orign);
-      CHECK_NOTNULL(dmt);
-      new_inputs.push_back(NodeBuilder::NodeOut(dmt, 0));
+      nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second);
+      iidx++;
+      nn_slot_idx++;
     }
   }
 
-  // The total number of inputs to new node _must_ be 2 times the number
-  // of inputs to the original node: N original Tensorflow tensors and
-  // N for Mkl tensors corresponding to each Tensorflow tensors.
-  CHECK_EQ(new_inputs.size(), inputs.size() * 2);
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering then we need to add Tensorflow tensor for
+  // workspace here because Tensorflow tensor for workspace is the
+  // last tensor in the list of Tensorflow tensors.
+  if (are_workspace_tensors_available) {
+    CHECK_EQ(workspace_tensors->size(), 2);
+    // Tensorflow tensor
+    nb->Input((*workspace_tensors)[0].node, (*workspace_tensors)[0].index);
+    nn_slot_idx++;
+  }
 
-  // 2. Let's add the new inputs.
-  for (auto ni : new_inputs) {
-    nb->Input(ni.node, ni.index);
+  // Let's now setup all Mkl inputs to new node.
+  // Number of Mkl inputs must be same as number of TF inputs.
+  iidx = 0;
+  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+    // An input slot could be a single tensor or a list. We need
+    // to handle this case accordingly.
+    CHECK_LT(iidx, old_node_inputs.size());
+    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+    if (ArgIsList(arg)) {
+      std::vector<NodeBuilder::NodeOut> new_node_inputs;
+      int N = GetTensorListLength(arg, old_node);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
+                                     N, &new_node_inputs);
+      nb->Input(new_node_inputs);
+      nn_slot_idx++;
+    } else {
+      Node* mkl_node = nullptr;
+      int mkl_node_output_slot = 0;
+      GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first,
+                                old_node_inputs[iidx].second,
+                                &mkl_node, &mkl_node_output_slot);
+      nb->Input(mkl_node, mkl_node_output_slot);
+      iidx++;
+      nn_slot_idx++;
+    }
+  }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering then we need to add Mkl tensor for
+  // workspace here because Mkl tensor for workspace is the
+  // last tensor in the list of Mkl tensors.
+  if (are_workspace_tensors_available) {
+    CHECK_EQ(workspace_tensors->size(), 2);
+    // Mkl tensor
+    nb->Input((*workspace_tensors)[1].node, (*workspace_tensors)[1].index);
+    nn_slot_idx++;
+  }
+
+  return nn_slot_idx;
+}
+
+Status MklLayoutRewritePass::SetUpInputs(
+    std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+    NodeBuilder* nb, Node* old_node) {
+  // Let's check if we need to add workspace tensors for this node.
+  // We add workspace edge only for MaxPool, LRN and BatchNorm.
+  std::vector<NodeBuilder::NodeOut> workspace_tensors;
+  bool are_workspace_tensors_available = false;
+  AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
+                           &are_workspace_tensors_available);
+
+  int new_node_input_slots = 0;
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // TODO(nhasabni): implement this function just for same of completion.
+    // We do not use interleaved ordering right now.
+    return Status(
+        error::Code::UNIMPLEMENTED,
+        "Interleaved ordering of tensors is currently not supported.");
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    new_node_input_slots = SetUpContiguousInputs(
+        g, old_node_inputs, nb, old_node, &workspace_tensors,
+        are_workspace_tensors_available);
+  }
+
+  // Sanity check
+  int old_node_input_slots = old_node->op_def().input_arg_size();
+  if (!are_workspace_tensors_available) {
+    // If we are not adding workspace tensors for this op, then the total
+    // number of input slots to the new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors and
+    // N for Mkl tensors corresponding to each Tensorflow tensors.
+    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2);
+  } else {
+    // If we are adding workspace tensors for this op, then the total
+    // The total number of input slots to new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors and
+    // N for Mkl tensors corresponding to each Tensorflow tensors plus 2
+    // (for workspace Tensorflow tensor and workspace Mkl tensor).
+    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2 + 2);
   }
 
   return Status::OK();
@@ -645,7 +1092,7 @@ Status MklLayoutRewritePass::SetUpInputs(
 
 // TODO(nhasabni) We should move this to mkl_util.h.
 void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
-    std::unique_ptr<Graph>* g, Node** out, Node* orign) {
+    std::unique_ptr<Graph>* g, Node** out, Node* orig_node) {
   // We use a tensor of shape {1} and value 0 to represent
   // dummy float tensor. We need this as a dummy workspace tensor.
   // Workspace tensor has type float.
@@ -653,42 +1100,63 @@ void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
   TensorProto proto;
   proto.set_dtype(dt);
   float zero[1] = {0};
-  proto.set_tensor_content(const_cast<const void*>(
-      static_cast<void*>(&zero)), 4);
+  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+                           4);
   TensorShape dummy_shape({1});
   dummy_shape.AsProto(proto.mutable_tensor_shape());
   TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                 .Attr("value", proto)
-                 .Attr("dtype", dt)
-                 .Device(orign->def().device())  // We place this node on same
-                                             // device as device of original
-                                             // node.
-                 .Finalize(&**g, out));
-  (*out)->set_assigned_device_name(orign->assigned_device_name());
+                .Attr("value", proto)
+                .Attr("dtype", dt)
+                .Device(orig_node->def().device())  // We place this node on
+                                                    // same the device as the
+                                                    // device of the original
+                                                    // node.
+                .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(orig_node->input_node(0,
+                                      const_cast<const Node**>(&orig_input0)));
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+  }
+
+  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
 }
 
-void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g,
-    Node* orign, NodeBuilder* nb) {
-  bool workspace_edge_added = false;
+void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
+    std::unique_ptr<Graph>* g, Node* orig_node, NodeBuilder* nb,
+    std::vector<NodeBuilder::NodeOut>* ws_tensors, bool* are_ws_tensors_added) {
+  bool workspace_edge_added = false;  // Default initializer
+  CHECK_NOTNULL(are_ws_tensors_added);
+  *are_ws_tensors_added = false;  // Default initializer
+
   DataType T;
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   for (auto ws : wsinfo_) {
-    if (orign->type_string() == ws.fwdop &&
-        mkl_layer_registry::IsMklLayer(
-          GetMklOpName(orign->type_string()), T)) {
+    if (orig_node->type_string() == ws.fwd_op &&
+        mkl_op_registry::IsMklOp(GetMklOpName(orig_node->type_string()), T)) {
       // If this op is a fwd op, then we need to check if there is an
-      // edge from this node's fwdslot to bwdop's bwdslot. If there is
+      // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
       // an edge, then we just add an attribute on this node for setting
       // workspace_passed to true. We don't add actual workspace edge
       // in this node. Actual workspace edge gets added in the backward
       // op for this node.
-      for (const Edge* e : orign->out_edges()) {
-        if (e->src_output() == ws.fwdslot &&
-            e->dst()->type_string() == ws.bwdop &&
-            e->dst_input() == ws.bwdslot) {
+      for (const Edge* e : orig_node->out_edges()) {
+        if (e->src_output() == ws.fwd_slot &&
+            e->dst()->type_string() == ws.bwd_op &&
+            e->dst_input() == ws.bwd_slot) {
           nb->Attr("workspace_enabled", true);
           VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
-                  << orign->type_string();
+                  << orig_node->type_string();
           workspace_edge_added = true;
           // We found the edge that we were looking for, so break.
           break;
@@ -700,34 +1168,40 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g,
         // node.
         nb->Attr("workspace_enabled", false);
       }
-    } else if (orign->type_string() == ws.bwdop &&
-          mkl_layer_registry::IsMklLayer(
-            GetMklOpName(orign->type_string()), T)) {
+    } else if (orig_node->type_string() == ws.bwd_op &&
+               mkl_op_registry::IsMklOp(GetMklOpName(orig_node->type_string()),
+                                        T)) {
       // If this op is a bwd op, then we need to add workspace edge and
       // it's Mkl tensor edge between its corresponding fwd op and this
-      // op. Corresponding fwd op is specified in 'fwdop' field of
-      // workspace info. fwdslot and bwdslot in workspace info specify
+      // op. Corresponding fwd op is specified in 'fwd_op' field of
+      // workspace info. fwd_slot and bwd_slot in workspace info specify
       // an edge between which slots connect forward and backward op.
       // Once all these criteria match, we add a workspace edge between
-      // wsfwdslot and wsbwdslot. It's corresponding Mkl tensor is added
-      // in wsfwdslot+1 and wsbwdslot+1.
-      for (const Edge* e : orign->in_edges()) {
-        if (e->src_output() == ws.fwdslot &&
+      // ws_fwd_slot and ws_bwd_slot. Its corresponding Mkl tensor is
+      // determined by interleaved/contiguous ordering. Function
+      // DataIndexToMetaDataIndex tells us the location of Mkl tensor
+      // from the location of the Tensorflow tensor.
+      for (const Edge* e : orig_node->in_edges()) {
+        if (e->src_output() == ws.fwd_slot &&
             // We would have rewritten the forward op, so we need to use
             // GetMklOpName call to get its Mkl name.
-            e->src()->type_string() == GetMklOpName(ws.fwdop) &&
-            e->dst_input() == ws.bwdslot) {
+            e->src()->type_string() == GetMklOpName(ws.fwd_op) &&
+            e->dst_input() == ws.bwd_slot) {
           nb->Attr("workspace_enabled", true);
+          CHECK_NOTNULL(ws_tensors);
           // Add workspace edge between fwd op and bwd op.
-          nb->Input(e->src(), ws.wsfwdslot);
+          ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot));
           // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
-          nb->Input(e->src(), ws.wsfwdslot+1);
+          ws_tensors->push_back(NodeBuilder::NodeOut(
+              e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot,
+                                                 e->src()->num_outputs())));
+          *are_ws_tensors_added = true;
           // In terms of input ordering, we add these calls to add Input
           // here because workspace edge (and its Mkl tensor) is the last
           // edge in the fwdop and bwdop. So all inputs before workspace
           // tensor have been added by SetUpInputs function.
           VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
-                  << orign->type_string();
+                  << orig_node->type_string();
           workspace_edge_added = true;
           // We found the edge that we were looking for, so break.
           break;
@@ -740,17 +1214,20 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g,
       // workspace_enabled to false.
       if (!workspace_edge_added) {
         nb->Attr("workspace_enabled", false);
-        Node* dmt_ws = nullptr;  // Dummy tensor for workspace
+        Node* dmt_ws = nullptr;      // Dummy tensor for workspace
         Node* dmt_mkl_ws = nullptr;  // Dummy Mkl tensor for workspace
-        GetDummyWorkspaceTensorNode(g, &dmt_ws, orign);
-        GetDummyMklTensorNode(g, &dmt_mkl_ws, orign);
+        GetDummyWorkspaceTensorNode(g, &dmt_ws, orig_node);
+        GetDummyMklTensorNode(g, &dmt_mkl_ws, orig_node);
         CHECK_NOTNULL(dmt_ws);
         CHECK_NOTNULL(dmt_mkl_ws);
-        nb->Input(dmt_ws, 0);  // We add dummy tensor as workspace tensor.
-        nb->Input(dmt_mkl_ws, 0);  // We add dummy tensor as Mkl
-                             // tensor for workspace tensor.
+        CHECK_NOTNULL(ws_tensors);
+        // We add dummy tensor as workspace tensor.
+        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_ws, 0));
+        // We add dummy tensor as Mkl tensor for workspace tensor.
+        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_mkl_ws, 0));
+        *are_ws_tensors_added = true;
         VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
-              << orign->type_string();
+                << orig_node->type_string();
       }
     } else {
       // If this node does not match any workspace info, then we do not
@@ -763,8 +1240,8 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g,
 // Op-specific functions to copy attributes from old node to new node
 //////////////////////////////////////////////////////////////////////////
 
-void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orign,
-    NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
+                                           NodeBuilder* nb) {
   DataType T;
   string data_format;
   string padding;
@@ -772,11 +1249,12 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orign,
   bool use_cudnn_on_gpu;
 
   // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "padding", &padding));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -786,16 +1264,16 @@ void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orign,
   nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
 }
 
-void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orign,
-    NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
+                                                NodeBuilder* nb) {
   DataType T;
   string data_format;
   std::vector<int32> strides;
 
   // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -803,19 +1281,52 @@ void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orign,
   nb->Attr("data_format", data_format);
 }
 
-void MklLayoutRewritePass::CopyAttrsPooling(const Node* orign,
-    NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsIdentity(const Node* orig_node,
+                                             NodeBuilder* nb) {
+  DataType T;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  // Add attributes to new node.
+  nb->Attr("T", T);
+}
+
+void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
+                                        NodeBuilder* nb) {
+  DataType T;
+  int depth_radius;
+  float bias;
+  float alpha;
+  float beta;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "depth_radius", &depth_radius));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "bias", &bias));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "beta", &beta));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("depth_radius", depth_radius);
+  nb->Attr("bias", bias);
+  nb->Attr("alpha", alpha);
+  nb->Attr("beta", beta);
+}
+
+void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
+                                            NodeBuilder* nb) {
   DataType T;
   string data_format;
   string padding;
   std::vector<int32> ksize, strides;
 
   // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "ksize", &ksize));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "padding", &padding));
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -825,14 +1336,96 @@ void MklLayoutRewritePass::CopyAttrsPooling(const Node* orign,
   nb->Attr("data_format", data_format);
 }
 
-void MklLayoutRewritePass::CopyAttrsRelu(const Node* orign, NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsRelu(const Node* orig_node,
+                                         NodeBuilder* nb) {
+  DataType T;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+}
+
+void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  DataType Tshape;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("Tshape", Tshape);
+}
+
+void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
+                                          NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  int num_split;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_split", &num_split));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_split", num_split);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  int N;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
+                                             NodeBuilder* nb) {
   DataType T;
+  int N;
+  DataType tidx;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tidx", &tidx));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+  nb->Attr("Tidx", tidx);
+}
+
+void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
+                                                   NodeBuilder* nb) {
+  DataType T;
+  float epsilon;
+  string data_format;
+  bool is_training;
 
   // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "is_training", &is_training));
 
   // Add attributes to new node.
   nb->Attr("T", T);
+  nb->Attr("epsilon", epsilon);
+  nb->Attr("data_format", data_format);
+  nb->Attr("is_training", is_training);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -864,15 +1457,16 @@ Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
     FillInputs(a, &a_control_edges, &a_in);
 
     // Get operand op of the operator
-    Node *b = nullptr;
+    Node* b = nullptr;
     b = a_in[mi->op].first;
     if (b == nullptr || (b->type_string() != mi->pred)) {
       // NOTE: Should the first check be assert?
       continue;
     }
 
+    const int B_in = b->num_inputs();
     gtl::InlinedVector<Node*, 4> b_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
+    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(B_in);
     FillInputs(b, &b_control_edges, &b_in);
 
     // Shouldn't merge if a and b have different control edges.
@@ -887,13 +1481,13 @@ Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
   return nullptr;
 }
 
-Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g,
-                                     Node* succ, Node* pred) {
+Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
+                                       Node* pred) {
   CHECK_NOTNULL(succ);
   CHECK_NOTNULL(pred);
 
-  if (succ->type_string() == csinfo_.biasadd &&
-      pred->type_string() == csinfo_.mklconv2d) {
+  if (succ->type_string() == csinfo_.bias_add &&
+      pred->type_string() == csinfo_.mkl_conv2d) {
     // 1. Get all attributes from input nodes.
     DataType T_pred, T_succ;
     string padding;
@@ -906,15 +1500,14 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g,
     TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
     TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
     TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu",
-                            &use_cudnn_on_gnu));
+    TF_CHECK_OK(
+        GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
     // We check to ensure that data formats of both succ and pred are same.
     // We expect them to be same, so we can enforce this as assert.
     // But assert can be too strict, so we enforce this as a check.
     // If the check fails, then we do not merge two nodes.
     // We also do same check for devices.
-    if (data_format_pred != data_format_succ ||
-        T_pred != T_succ ||
+    if (data_format_pred != data_format_succ || T_pred != T_succ ||
         pred->assigned_device_name() != succ->assigned_device_name() ||
         pred->def().device() != succ->def().device()) {
       return Status(error::Code::INVALID_ARGUMENT,
@@ -940,37 +1533,53 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g,
                     "Will skip node merge optimization");
     }
 
-    for (const Edge *e : pred->out_edges()) {
+    for (const Edge* e : pred->out_edges()) {
       if (e->dst() != succ) {
         return Status(error::Code::INVALID_ARGUMENT,
-                    "Conv2D does not feed to BiasAdd."
-                    "Will skip node merge optimization");
+                      "Conv2D does not feed to BiasAdd."
+                      "Will skip node merge optimization");
       }
     }
 
     // 2. Get inputs from both the nodes.
     // Find the 2 inputs from the conv and the bias from the add Bias.
     // Get operand 0, 1 of conv2D and their Mkl tensors.
-    CHECK_EQ(pred->in_edges().size(), 4);  // MklConv2D must have 4 inputs.
+    CHECK_EQ(pred->in_edges().size(), 4);  // _MklConv2D must have 4 inputs.
     // Get operand 1 of add_bias
     // BiasAdd must have 2 inputs: Conv, bias
     CHECK_EQ(succ->in_edges().size(), 2);
-    Node* oper3_mkl    = nullptr;  // Mkl tensor corresponding to oper3
-    int oper3_mkl_slot = 0;  // For dummy MKL tensor node, output slot is 0.
-    GetDummyMklTensorNode(g, &oper3_mkl, succ);  // Get dummy Mkl tensor node
+    Node* oper3_mkl = nullptr;  // Mkl tensor corresponding to oper3
+    int oper3_mkl_slot = 0;     // For dummy MKL tensor node, output slot is 0.
+    GetDummyMklTensorNode(g, &oper3_mkl, pred);  // Get dummy Mkl tensor node
     // as BiasAdd does not have Mkl tensor as input.
     CHECK_NOTNULL(oper3_mkl);
 
     // We will use the node name of BiasAdd as the name of new node
     // Build new node. We use same name as original node, but change the op
     // name.
-    NodeBuilder nb(succ->name(), csinfo_.mklconv2dwithbias);
-    nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
-    nb.Input(pred_in[1].first, pred_in[1].second);  // Mkl for In1
-    nb.Input(pred_in[2].first, pred_in[2].second);  // In2 of Conv2D
-    nb.Input(pred_in[3].first, pred_in[3].second);  // Mkl for In2
-    nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
-    nb.Input(oper3_mkl, oper3_mkl_slot);            // Mkl for In2 of BiasAdd
+    NodeBuilder nb(succ->name(), csinfo_.mkl_conv2d_with_bias);
+    if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+      nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
+      // pred_in[1] will be Mkl tensor for In1 if we follow interleaved
+      // ordering, and it will be 2nd Tensorflow tensor for Conv2D if
+      // we follow contiguous ordering.
+      nb.Input(pred_in[1].first, pred_in[1].second);  // Mkl for In1
+      nb.Input(pred_in[2].first, pred_in[2].second);  // In2 of Conv2D
+      nb.Input(pred_in[3].first, pred_in[3].second);  // Mkl for In2
+      nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
+      nb.Input(oper3_mkl, oper3_mkl_slot);            // Mkl for In2 of BiasAdd
+    } else {
+      CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+      nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
+      // pred_in[1] will be Mkl tensor for In1 if we follow interleaved
+      // ordering, and it will be 2nd Tensorflow tensor for Conv2D if
+      // we follow contiguous ordering.
+      nb.Input(pred_in[1].first, pred_in[1].second);  // In2 of Conv2D
+      nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
+      nb.Input(pred_in[2].first, pred_in[2].second);  // Mkl for In1 of Conv2D
+      nb.Input(pred_in[3].first, pred_in[3].second);  // Mkl for In2 of Conv2D
+      nb.Input(oper3_mkl, oper3_mkl_slot);            // Mkl for In2 of BiasAdd
+    }
 
     // Copy attributes from Conv2D to Conv2DWithBias.
     CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
@@ -979,30 +1588,59 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g,
     nb.Device(succ->def().device());
 
     // Create node.
-    Node* newn;
-    nb.Finalize(&**g, &newn);
-    CHECK_NOTNULL(newn);
+    Node* new_node;
+    nb.Finalize(&**g, &new_node);
+    CHECK_NOTNULL(new_node);
 
     // Set the Mkl layer label for this op.
-    newn->AddAttr("_kernel", mkl_layer_registry::kMklLayerLabel);
+    new_node->AddAttr("_kernel", mkl_op_registry::kMklOpLabel);
+
+    // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
+    // node are already copied in BuildNode. We handle control edges now.
+    for (const Edge* e : pred->in_edges()) {
+      if (e->IsControlEdge()) {
+        CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      }
+    }
+    for (const Edge* e : succ->in_edges()) {
+      if (e->IsControlEdge()) {
+        CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+      }
+    }
 
     // Incoming edges are fixed, we will fix the outgoing edges now.
+    // First, we will fix outgoing control edges from 'pred' node.
+    // We don't need to handle outgoing data edges from 'pred' node
+    // because pred has only 1 output going to succ node (we enforced
+    // this check for merge already).
+    for (const Edge* e : pred->out_edges()) {
+      if (e->IsControlEdge()) {
+        CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      }
+    }
+
+    // Second, we will fix outgoing control and data edges from 'succ' node.
     for (const Edge* e : succ->out_edges()) {
-      (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+      if (e->IsControlEdge()) {
+        CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+      } else {
+        CHECK_NOTNULL((*g)->AddEdge(new_node, e->src_output(), e->dst(),
+                                  e->dst_input()));
+      }
     }
 
     // Copy device assigned to old node to new node.
     // It's ok to use pred or succ as we have enforced a check that
     // both have same device assigned.
-    newn->set_assigned_device_name(pred->assigned_device_name());
+    new_node->set_assigned_device_name(pred->assigned_device_name());
 
     VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
-            << ", and node: " << succ->DebugString() << ", into node:"
-            << newn->DebugString();
+            << ", and node: " << succ->DebugString()
+            << ", into node:" << new_node->DebugString();
 
     (*g)->RemoveNode(succ);
     (*g)->RemoveNode(pred);
-    MarkRewrittenNode(newn);
+    MarkRewrittenNode(new_node);
 
     return Status::OK();
   }
@@ -1015,55 +1653,68 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g,
 //           Helper functions for node rewrite
 //////////////////////////////////////////////////////////////////////////
 
-Status MklLayoutRewritePass::RewriteNode(
-    std::unique_ptr<Graph>* g, Node* orign, const RewriteInfo* ri) {
+Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
+                                         Node* orig_node,
+                                         const RewriteInfo* ri) {
   CHECK_NOTNULL(ri);
-  CHECK_NOTNULL(orign);
+  CHECK_NOTNULL(orig_node);
 
-  VLOG(1) << "MklLayoutRewritePass: Original node:" << orign->DebugString();
+  VLOG(1) << "MklLayoutRewritePass: Original node:" << orig_node->DebugString();
 
   // Check if this is scenario 2 (context-based rewrite).
   // Get the matching ContextInfo if it is.
-  const Node* fwdn = nullptr;
+  const Node* fwd_node = nullptr;
   const ContextInfo* ci = nullptr;
   bool is_context_based_rewrite = false;
-  if ((ci = SearchMatchingContext(orign, &fwdn)) != nullptr) {
-    CHECK_NOTNULL(fwdn);
+  if ((ci = SearchMatchingContext(orig_node, &fwd_node)) != nullptr) {
+    CHECK_NOTNULL(fwd_node);
     is_context_based_rewrite = true;
 
     // Sanity checks for context-based rewrite (if any)
-    if (orign->type_string() == csinfo_.biasaddgrad &&
-        ri->newname == csinfo_.mklconv2dwithbiasbackpropbias) {
+    if (orig_node->type_string() == csinfo_.bias_add_grad &&
+        ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
       DataType orig_T, ctx_T;
       string orig_data_format, ctx_data_format;
-      TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &orig_T));
-      TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &orig_data_format));
-      TF_CHECK_OK(GetNodeAttr(fwdn->def(), "T", &ctx_T));
-      TF_CHECK_OK(GetNodeAttr(fwdn->def(), "data_format", &ctx_data_format));
+      TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &orig_T));
+      TF_CHECK_OK(
+          GetNodeAttr(orig_node->def(), "data_format", &orig_data_format));
+      TF_CHECK_OK(GetNodeAttr(fwd_node->def(), "T", &ctx_T));
+      TF_CHECK_OK(
+          GetNodeAttr(fwd_node->def(), "data_format", &ctx_data_format));
 
       if (orig_data_format != ctx_data_format || orig_T != ctx_T ||
-          orign->assigned_device_name() != fwdn->assigned_device_name() ||
-          orign->def().device() != fwdn->def().device()) {
-        return Status(error::Code::INVALID_ARGUMENT,
-                    "data_format or T attribute or devices of BiasAddGrad and "
-                    "Conv2D do not match. Will skip node rewrite optimization");
+          orig_node->assigned_device_name() !=
+              fwd_node->assigned_device_name() ||
+          orig_node->def().device() != fwd_node->def().device()) {
+        return Status(
+            error::Code::INVALID_ARGUMENT,
+            "data_format or T attribute or devices of BiasAddGrad and "
+            "Conv2D do not match. Will skip node rewrite optimization");
       }
+    } else if (orig_node->type_string() == csinfo_.bias_add_grad &&
+               ri->new_name == csinfo_.matmul) {
+      // When BiasAddGrad has MatMul in context, we do not do any rewrite
+      // and leave BiasAddGrad as it is. But we check for this condition
+      // when we check for node rewrite rule. So we should not even come
+      // here for MatMul. So we will fail now.
+        return Status(
+            error::Code::INVALID_ARGUMENT,
+            "No rewrite is required for BiasAddGrad for MatMul context.");
     }
   }
 
   // Get all inputs.
-  const int num = orign->num_inputs();
-  CHECK_EQ(num, ri->numins);
+  const int num_inputs = orig_node->in_edges().size();
   gtl::InlinedVector<Node*, 4> control_edges;
-  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num);
-  FillInputs(orign, &control_edges, &inputs);
+  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
+  FillInputs(orig_node, &control_edges, &inputs);
 
   // Build new node. We use same name as original node, but change the op name.
-  NodeBuilder nb(orign->name().c_str(), ri->newname.c_str());
+  NodeBuilder nb(orig_node->name().c_str(), ri->new_name.c_str());
   // Copy user-specified device assigned to original node to new node.
-  nb.Device(orign->def().device());
+  nb.Device(orig_node->def().device());
   // Set up new inputs to the rewritten node.
-  Status s = SetUpInputs(g, inputs, &nb, orign);
+  Status s = SetUpInputs(g, inputs, &nb, orig_node);
   if (s != Status::OK()) {
     return s;
   }
@@ -1071,71 +1722,75 @@ Status MklLayoutRewritePass::RewriteNode(
   // Copy attributes from original node to new node (for scenario 1).
   // For context-based rewrite, we use context to copy the attributes.
   if (is_context_based_rewrite) {
-    if (orign->type_string() == csinfo_.biasaddgrad &&
-        ri->newname == csinfo_.mklconv2dwithbiasbackpropbias) {
-      CHECK_NOTNULL(fwdn);
-      ri->copyattrs(fwdn, &nb);
+    if (orig_node->type_string() == csinfo_.bias_add_grad &&
+        ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
+      CHECK_NOTNULL(fwd_node);
+      ri->copy_attrs(fwd_node, &nb);
     } else {
       return Status(error::Code::UNIMPLEMENTED,
-                "Unimplemented case for node rewrite optimization.");
+                    "Unimplemented case for node rewrite optimization.");
     }
   } else {
-    ri->copyattrs(const_cast<const Node*>(orign), &nb);
+    ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
   }
   // Set the Mkl layer label for this op.
-  nb.Attr("_kernel", mkl_layer_registry::kMklLayerLabel);
-
-  // Add workspace edge to this node if needed.
-  // We add workspace edge only for MaxPool, LRN and BatchNorm.
-  AddWorkSpaceEdgeIfNeeded(g, orign, &nb);
+  nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
 
   // Finalize graph and get new node.
-  Node* newn = nullptr;
-  TF_CHECK_OK(nb.Finalize(&**g, &newn));
-  CHECK_NOTNULL(newn);
-
-  // Incoming edges from 'orign' node to new 'newn' node are already copied
-  // in BuildNode. Copy outgoing edges from 'orign' node to new 'newn' node.
-  // Since the output also follows same ordering among Tensorflow tensors and
-  // Mkl tensors. We need to connect Tensorflow tensors appropriately.
-  // Specifically, nth output of original node will become 2*nth output of
-  // Mkl node. GetTensorDataIndex provides this mapping function.
-  for (const Edge* e : orign->out_edges()) {
-    // We need to handle control-edges by using their original slot number.
-    // Generally, -1 is reserved for control slot.
-    if (e->src_output() < 0) {
-      (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input());
+  Node* new_node = nullptr;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from 'orig_node' node to new 'new_node' node are
+  // already copied in BuildNode. We need to handle control edges now.
+  for (const Edge* e : orig_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+
+  // Copy outgoing edges from 'orig_node' node to new
+  // 'new_node' node, since the output also follows same ordering among
+  // Tensorflow tensors and Mkl tensors. We need to connect Tensorflow
+  // tensors appropriately. Specifically, nth output of the original node
+  // will become 2*nth output of the Mkl node for the interleaved ordering
+  // of the tensors. For the contiguous ordering of the tensors, it will be n.
+  // GetTensorDataIndex provides this mapping function.
+  for (const Edge* e : orig_node->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
     } else {
-      (*g)->AddEdge(newn, GetTensorDataIndex(e->src_output()),
-                  e->dst(), e->dst_input());
+      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
+                            e->src()->num_outputs()),
+                    e->dst(), e->dst_input()));
     }
   }
 
   // Copy the runtime device assigned from original code to new node.
-  newn->set_assigned_device_name(orign->assigned_device_name());
+  new_node->set_assigned_device_name(orig_node->assigned_device_name());
 
   // Delete original node and mark new node as rewritten.
-  (*g)->RemoveNode(orign);
-  MarkRewrittenNode(newn);
+  (*g)->RemoveNode(orig_node);
+  MarkRewrittenNode(new_node);
 
-  VLOG(1) << "MklLayoutRewritePass: New node:" << newn->DebugString();
+  VLOG(1) << "MklLayoutRewritePass: New node:" << new_node->DebugString();
   return Status::OK();
 }
 
 const MklLayoutRewritePass::ContextInfo*
 MklLayoutRewritePass::SearchMatchingContext(const Node* n,
-    const Node** fwdn) {
+                                            const Node** fwd_node) {
   CHECK_NOTNULL(n);
-  CHECK_NOTNULL(fwdn);
-  *fwdn = nullptr;
+  CHECK_NOTNULL(fwd_node);
+  *fwd_node = nullptr;
 
   // Search for matching contextinfo based on node name.
   // There could be more than one matching contextinfos.
   bool is_matching_cinfo_found = false;
   std::vector<const ContextInfo*> mci;
   for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
-    if (n->type_string() == ci->node) {
-      mci.push_back(&*ci);
+    if (n->type_string() == (*ci)->node) {
+      mci.push_back(*ci);
       is_matching_cinfo_found = true;
     }
   }
@@ -1144,8 +1799,8 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
     return nullptr;
   }
 
-  VLOG(1) << "MklLayoutRewritePass: Searching graph for: "
-          << n->type_string() << " in backwards.";
+  VLOG(1) << "MklLayoutRewritePass: Searching graph for: " << n->type_string()
+          << " in backwards.";
 
   // Now we will check for forward op name for context info in data
   // flow graph. Get the max hops we should search for the fwd node.
@@ -1164,19 +1819,18 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
     nqueue.pop();
 
     std::set<const Node*> visited_nodes;
-    curr_node  = curr_pair.first;
+    curr_node = curr_pair.first;
     curr_depth = curr_pair.second;
     CHECK_NOTNULL(curr_node);
 
     VLOG(1) << "MklLayoutRewritePass: Visiting node: "
-            << curr_node->type_string()
-            << " at depth: " << curr_depth
+            << curr_node->type_string() << " at depth: " << curr_depth
             << " for node: " << n->type_string();
 
     // If we find a match, we return immediately.
     for (const ContextInfo* ci : mci) {
       if (curr_node->type_string() == ci->fwd) {
-        *fwdn = curr_node;
+        *fwd_node = curr_node;
         return ci;
       }
     }
@@ -1186,9 +1840,9 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
     for (const Edge* e : curr_node->in_edges()) {
       // We do not visit already visited node.
       if (visited_nodes.find(e->src()) == visited_nodes.end()) {
-         // Depth of these nodes is 1 more than the depth of current node.
-         nqueue.push(std::make_pair(e->src(), curr_depth+1));
-         visited_nodes.insert(e->src());
+        // Depth of these nodes is 1 more than the depth of current node.
+        nqueue.push(std::make_pair(e->src(), curr_depth + 1));
+        visited_nodes.insert(e->src());
       }
     }
   } /* while */
@@ -1196,14 +1850,14 @@ MklLayoutRewritePass::SearchMatchingContext(const Node* n,
   return nullptr;
 }
 
-bool MklLayoutRewritePass::ContextMatchRewrite(const Node* n) {
-  const Node* fwdn = nullptr;
-  return SearchMatchingContext(n, &fwdn) != nullptr;
+bool MklLayoutRewritePass::ContextMatchRewrite(const Node* n,
+                                               const ContextInfo* c) {
+  const Node* fwd_node = nullptr;
+  return SearchMatchingContext(n, &fwd_node) == c;
 }
 
 const MklLayoutRewritePass::RewriteInfo*
-MklLayoutRewritePass::CheckForNodeRewrite(
-    const Node *n) const {
+MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   CHECK_NOTNULL(n);
 
   // First check if node along with its type is supported by MKL layer.
@@ -1214,18 +1868,30 @@ MklLayoutRewritePass::CheckForNodeRewrite(
   if (!GetNodeAttr(n->def(), "T", &T).ok()) {
     return nullptr;
   }
-  if (!mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string()), T)) {
-    return nullptr;
+
+  // BiasAddGrad is not an Mkl layer, so we make an exception for it.
+  if (n->type_string() != csinfo_.bias_add_grad) {
+    if (!mkl_op_registry::IsMklOp(GetMklOpName(n->type_string()), T)) {
+      return nullptr;
+    }
   }
 
   // We support 2 types of node rewrites:
-  // 1. Rewriting BiasAddGrad depending on its context.
+  // 1. Rewriting BiasAddGrad depending on its MklConv2DWithBias context.
   // 2. Rewriting an op to Mkl op always
   // We return true if any of these 2 conditions is met.
 
   // Find matching RewriteInfo and then check that rewrite rule applies.
   for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
-    if (n->type_string().compare(ri->name) == 0 && ri->rewriterule(n)) {
+    if (n->type_string().compare(ri->name) == 0 &&
+        ri->rewrite_rule(n, ri->context)) {
+      // If we are rewriting BiasAddGrad into BiasAddGrad for MatMul context,
+      // then we just return directly.
+      if (n->type_string() == csinfo_.bias_add_grad &&
+          ri->context->fwd == csinfo_.matmul &&
+          ri->new_name == csinfo_.bias_add_grad) {
+        return nullptr;
+      }
       return &*ri;
     }
   }
@@ -1238,8 +1904,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(
 //              Run function for the pass
 ///////////////////////////////////////////////////////////////////////////////
 
-bool MklLayoutRewritePass::RunPass(
-    std::unique_ptr<Graph>* g) {
+bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
   bool result = false;
   CHECK_NOTNULL(g);
 
@@ -1249,7 +1914,8 @@ bool MklLayoutRewritePass::RunPass(
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
 
   for (Node* n : order) {
-    if (!n->IsOp()) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
       continue;
     }
 
@@ -1265,22 +1931,21 @@ bool MklLayoutRewritePass::RunPass(
               << " layout optimization.";
 
       if (RewriteNode(g, n, ri) == Status::OK()) {
-          VLOG(1) << "MklLayoutRewritePass: rewrote node "
-                  << node_name << " with op " << op_name
-                  << " for Mkl layout optimization.";
-          result = true;
+        VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
+                << " with op " << op_name << " for Mkl layout optimization.";
+        result = true;
       }
     } else if ((predn = CheckForNodeMerge(n)) != nullptr) {
       // Otherwise, we will check if the node is to be merged.
       string n1_name = n->name();
       string n2_name = predn->name();
 
-      VLOG(1) << "MklLayoutRewritePass: Scheduled nodes "
-              << n1_name << " and " << n2_name << " for merging";
+      VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
+              << n2_name << " for merging";
 
       if (MergeNode(g, n, predn) == Status::OK()) {
-        VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name
-              << " and " << n2_name;
+        VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
+                << n2_name;
         result = true;
       }
     }
@@ -1298,18 +1963,31 @@ bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
   return MklLayoutRewritePass().RunPass(g);
 }
 
-Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
-  if (options.graph == nullptr) {
+Status MklLayoutRewritePass::Run(
+  const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr && options.partition_graphs == nullptr) {
     return Status::OK();
   }
 
-  // Get the ownership of graph
-  std::unique_ptr<Graph>* g = std::move(options.graph);
-
-  RunPass(g);
-
-  // Return the ownership of graph back
-  options.graph->reset(g->release());
+  auto process_graph = [&](std::unique_ptr<Graph>* g) {
+    // Get the ownership of a graph
+    std::unique_ptr<Graph>* ng = std::move(g);
+    RunPass(ng);
+    // Return the ownership of a graph back
+    g->reset(ng->release());
+  };
+
+  if (kMklLayoutRewritePassGroup !=
+      OptimizationPassRegistry::POST_PARTITIONING) {
+    // For any pre-partitioning phase, a graph is stored in options.graph.
+    process_graph(options.graph);
+  } else {
+    // For post partitioning phase, graphs are stored in
+    // options.partition_graphs.
+    for (auto& pg : *options.partition_graphs) {
+      process_graph(&pg.second);
+    }
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index dd7ee45a705b10d70837089f4c0842de1e25b16e..3c4a5263afd3817907ede7f14c9b433de5fce83c 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include "tensorflow/core/graph/mkl_layout_pass.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-#include <vector>
-#include <string>
 #include <algorithm>
+#include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -39,7 +39,11 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static void InitGraph(const string& s, Graph* graph) {
+const char kCPUDevice[] = "/job:a/replica:0/task:0/cpu:0";
+const char kGPUDevice[] = "/job:a/replica:0/task:0/gpu:0";
+
+static void InitGraph(const string& s, Graph* graph,
+                      const string& device = kCPUDevice) {
   GraphDef graph_def;
 
   auto parser = protobuf::TextFormat::Parser();
@@ -47,14 +51,18 @@ static void InitGraph(const string& s, Graph* graph) {
   CHECK(parser.MergeFromString(s, &graph_def)) << s;
   GraphConstructorOptions opts;
   TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
+
+  for (Node* node : graph->nodes()) {
+    node->set_assigned_device_name(device);
+  }
 }
 
 class MklLayoutPassTest : public ::testing::Test {
  public:
   MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
 
-  void InitGraph(const string& s) {
-    ::tensorflow::InitGraph(s, &graph_);
+  void InitGraph(const string& s, const string& device = kCPUDevice) {
+    ::tensorflow::InitGraph(s, &graph_, device);
     original_ = CanonicalGraphString(&graph_);
   }
 
@@ -110,9 +118,11 @@ class MklLayoutPassTest : public ::testing::Test {
 };
 
 REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
-REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("MklInput2").Output("o: uint8")
+REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
+REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2").Output("o: uint8")
                         .Output("o1: uint8").SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
@@ -134,20 +144,22 @@ TEST_F(MklLayoutPassTest, Basic) {
 
 // Test set 1: Conv2D + AddBias
 
-// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y)
+// C=_MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y) (for interleaved ordering)
+// C=_MklConv2D(A,B,M,N); E=BiasAdd(C,D); Z=Sub(E,Y) (for contiguous ordering)
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
+      " input: ['A', 'B', 'M', 'N']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -158,26 +170,29 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
       " input: ['E', 'Y']}");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
-            "M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
-            "DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1");
+            "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
+            "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->E;"
+            "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;M->E:3;"
+            "N->E:4;Y->Z:1");
 }
 
-// C=MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Sub(E,Y)
+// C=_MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Sub(E,Y) (for interleaved)
+// C=_MklConv2D(A,B,M:1,N:1); E=BiasAdd(C,D); Z=Sub(E,Y) (for contiguous)
 // Test for correct output slots selected
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput2'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput2'}"
-      "node { name: 'C' op: 'MklConv2D'"
+      "node { name: 'M' op: '_MklInput2'}"
+      "node { name: 'N' op: '_MklInput2'}"
+      "node { name: 'C' op: '_MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M:1', 'B', 'N:1']}"
+      " input: ['A', 'B', 'M:1', 'N:1']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
@@ -188,16 +203,18 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
       " input: ['E', 'Y']}");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
-            "M(MklInput2);N(MklInput2);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
-            "DMT/_0->E:5;E->Z;M:1->E:1;N:1->E:3;Y->Z:1");
+            "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
+            "M(_MklInput2);N(_MklInput2);Y(Input);Z(Sub)|A->E;"
+            "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;"
+            "M:1->E:3;N:1->E:4;Y->Z:1");
 }
 
 // C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
 // This is a case of node rewrite followed by node merge.
-// We will first rewrite Conv2D to MklConv2D, and then merge MklConv2D
-// with BiasAdd to produce MklConv2DWithBias.
+// We will first rewrite Conv2D to _MklConv2D, and then merge _MklConv2D
+// with BiasAdd to produce _MklConv2DWithBias.
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
   InitGraph(
       "node { name: 'A' op: 'Input'}"
       "node { name: 'B' op: 'Input'}"
@@ -219,70 +236,71 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
       " input: ['E', 'Y']}");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);E(MklConv2DWithBias);Y(Input);Z(Sub)|"
-            "A->E;B->E:2;D->E:4;DMT/_0->E:1;DMT/_1->E:3;DMT/_2->E:5;"
-            "E->Z;Y->Z:1");
+            "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Sub)|"
+            "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
 }
 
-// Graph contains only MklConv2D, no AddBias.
+// Graph contains only _MklConv2D, no AddBias.
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}");
+      " input: ['A', 'B', 'M', 'N']}");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|"
-             "A->C;B->C:2;M->C:1;N->C:3");
+            "A(Input);B(Input);C(_MklConv2D);M(_MklInput);N(_MklInput)|"
+            "A->C;B->C:1;M->C:2;N->C:3");
 }
 
-// MklConv2D output does not go to BiasAdd.
+// _MklConv2D output does not go to BiasAdd.
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
+      " input: ['A', 'B', 'M', 'N']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'Input'}"
       "node { name: 'F' op: 'BiasAdd'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }");  // Output of MklConv2D does not go to BiasAdd.
+      " input: ['D', 'E'] }");  // Output of _MklConv2D does not go to BiasAdd.
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
-            "M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3");
+            "A(Input);B(Input);C(_MklConv2D);D(Input);E(Input);F(BiasAdd);"
+            "M(_MklInput);N(_MklInput)|A->C;B->C:1;D->F;E->F:1;M->C:2;N->C:3");
 }
 
-// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
+// _MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
 // Merge should not be done in such case.
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
+      " input: ['A', 'B', 'M', 'N']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'Input'}"
       "node { name: 'F' op: 'BiasAdd'"
@@ -294,9 +312,9 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['C', 'E'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
-            "G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;"
-            "E->F:1;E->G:1;M->C:1;N->C:3");
+            "A(Input);B(Input);C(_MklConv2D);D(Input);E(Input);F(BiasAdd);"
+            "G(Add);M(_MklInput);N(_MklInput)|A->C;B->C:1;C->G;D->F;"
+            "E->F:1;E->G:1;M->C:2;N->C:3");
 }
 
 // data_format attribute value mismatch. Merge should not be done
@@ -304,43 +322,77 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
+      " input: ['A', 'B', 'M', 'N']}"
       "node { name: 'D' op: 'Input'}"
       "node { name: 'E' op: 'BiasAdd'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NHCW' } }"
       " input: ['C', 'D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);"
-            "N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3");
+            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);M(_MklInput);"
+            "N(_MklInput)|A->C;B->C:1;C->E;D->E:1;M->C:2;N->C:3");
+}
+
+// Test set 2: _MklConv2D..BiasAddGrad -> _MklConv2DWithBiasBackpropBias
+// rewrite tests
+
+// D=_MklConv2D(A,M,B,N,C,O); E=Sub(D,A); F=BiasAddGrad(E)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
+            "E(Sub);F(_MklConv2DWithBiasBackpropBias);M(_MklInput);"
+            "N(_MklInput);O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;"
+            "DMT/_0->F:1;E->F;E:control->DMT/_0:control;M->D:3;N->D:4;"
+            "O->D:5");
 }
 
-// No MklConv2D in context, but Conv2D in context.
-// Only Conv2D would be rewritten to MklConv2D, but no rewrite
-// for BiasAddGrad should happen.
-// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D)
+// No _MklConv2DWithBias in context, but _MklConv2D in context.
+// No rewrite for BiasAddGrad should happen.
+// C=_MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D) (for interleaved)
+// C=_MklConv2D(A,B,M,N); D=Sub(C,A); E=BiasAddGrad(D) (for contiguous)
 TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
+      " input: ['A', 'B', 'M', 'N']}"
       "node { name: 'D' op: 'Sub'"
       " attr {key: 'T'                 value { type: DT_FLOAT } }"
       " input: ['C', 'A']}"
@@ -349,9 +401,9 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
       " input: ['D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MklConv2D);D(Sub);E(BiasAddGrad);"
-            "M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;"
-            "M->C:1;N->C:3");
+            "A(Input);B(Input);C(_MklConv2D);D(Sub);E(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
+            "M->C:2;N->C:3");
 }
 
 // No Conv2D in the context for BiasAddGrad. No rewrite should happen.
@@ -372,7 +424,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) {
       " input: ['D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
+            "A->C;A->D:1;B->C:1;C->D;D->E");
 }
 
 // No Conv2D in the context for BiasAddGrad, but MatMul in context.
@@ -396,7 +448,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
       " input: ['D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
+            "A->C;A->D:1;B->C:1;C->D;D->E");
 }
 
 // Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
@@ -419,7 +471,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
       " input: ['D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
+            "A->C;A->D:1;B->C:1;C->D;D->E");
 }
 
 // No MatMul in the context for BiasAddGrad. No rewrite should happen.
@@ -440,7 +492,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) {
       " input: ['D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
             "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
+            "A->C;A->D:1;B->C:1;C->D;D->E");
 }
 
 /////////////////////////////////////////////////////////////////////
@@ -463,8 +515,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
       "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['B', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MklConv2D);D(Mul);DMT/_0(Const);DMT/_1(Const)|"
-            "A->C;B->C:2;B->D;C->D:1;DMT/_0->C:1;DMT/_1->C:3");
+            "A(Input);B(Input);C(_MklConv2D);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
 }
 
 // 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
@@ -490,9 +544,11 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
       "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MklConv2D);D(MklConv2D);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;B->C:2;C->D:2;C->E;"
-            "C:1->D:3;D->E:1;DMT/_0->C:1;DMT/_1->C:3;DMT/_2->D:1");
+            "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
+            "C:1->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
 }
 
 // Conv2D with INT32 which is not supported by Mkl
@@ -514,81 +570,1035 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
             "A->C;B->C:1;B->D;C->D:1");
 }
 
-/////////////////////////////////////////////////////////////////////
-//  Unit tests related to rewriting node for workspace edges
-/////////////////////////////////////////////////////////////////////
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Mul)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
 
-/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Mul)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Concat with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcat);I(Mul)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+            "DMT/_4->H:3;E->H:1;E:1->H:4;F->H:2;F:1->H:5;G->H;"
+            "G:control->DMT/_4:control;H->I:1");
+}
+
+// Concat with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);"
+            "H(_MklConcat);I(Mul)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:1->H:4;F->H:2;"
+            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
+}
+
+// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->D:2;B->D;B:1->D:1;"
+            "B:control->DMT/_0:control;B:control->DMT/_1:control;"
+            "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// ConcatV2 with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['E', 'F', 'G']}"
+      "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Mul)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
+            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+            "DMT/_4->H:5;E->H;E:1->H:3;E:control->DMT/_4:control;F->H:1;"
+            "F:1->H:4;G->H:2;H->I:1");
+}
+
+// ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['E', 'F', 'G']}"
+      "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Mul);G(Const);"
+            "H(_MklConcatV2);I(Mul)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:1->H:3;"
+            "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
+            "G->H:2;H->I:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(Mul);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklReluGrad);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+            "DMT/_1->C:2");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
       " attr { key: 'T'            value { type: DT_FLOAT } }"
       " attr { key: 'data_format'  value { s: 'NCHW' } }"
       " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
       " attr { key: 'padding'      value { s: 'VALID' } }"
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
       " input: ['A'] }"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'MaxPoolGrad'"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(Mul);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Int32Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'AvgPoolGrad' "
       " attr { key: 'T'            value { type: DT_FLOAT } }"
       " attr { key: 'data_format'  value { s: 'NCHW' } }"
       " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
       " attr { key: 'padding'      value { s: 'VALID' } }"
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['C', 'B', 'D'] }"
-      "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'E'] }");
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(MklMaxPoolGrad);F(Mul)|"
-            "A->B;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;C->E;C->F;D->E:4;"
-            "DMT/_0->B:1;DMT/_1->E:1;DMT/_2->E:5;E->F:1");
+            "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
 }
 
-// Test MaxPool>MaxPoolGrad replacement when only one of them is present.
-// In this case, we will rewrite MaxPool node but workspace edges will not
-// be present.
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
+      "node { name: 'I' op: 'Int32Input'}"
+      "node { name: 'B' op: 'AvgPool'"
       " attr { key: 'T'            value { type: DT_FLOAT } }"
       " attr { key: 'data_format'  value { s: 'NCHW' } }"
       " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
       " attr { key: 'padding'      value { s: 'VALID' } }"
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
       " input: ['A'] }"
-      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['I', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MklMaxPool);C(Mul);DMT/_0(Const)|"
-            "A->B;A->C;B->C:1;DMT/_0->B:1");
+            "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Mul);DMT/_0(Const);"
+            "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
+            "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
+            "I:control->DMT/_1:control");
 }
 
-// Test MaxPool->MaxPoolGrad replacement when only one of them is present.
-// In this case, we will rewrite MaxPoolGrad and for workspace tensor and
-// its Mkl part, we will generate dummy tensor.
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
       "node { name: 'B' op: 'Input'}"
       "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'MaxPoolGrad'"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNormGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNormGrad);G(Mul)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNorm);G(Mul)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to rewriting node for workspace edges
+/////////////////////////////////////////////////////////////////////
+
+/* Test LRN->MaxPool->MaxPoolGrad->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'MaxPool'"
       " attr { key: 'T'            value { type: DT_FLOAT } }"
       " attr { key: 'data_format'  value { s: 'NCHW' } }"
       " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
       " attr { key: 'padding'      value { s: 'VALID' } }"
       " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A', 'B', 'C'] }"
-      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
+      " input: ['B'] }"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['B', 'C', 'D'] }"
+      "node { name: 'F' op: 'Input'}"
+      "node { name: 'G' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['E', 'F', 'B'] }"
+      "node { name: 'H' op: 'Input'}"
+      "node { name: 'I' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['H', 'G'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(MklMaxPoolGrad);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
-            "A->D;A->E;B->D:2;C->D:4;D->E:1;DMT/_0->D:1;DMT/_1->D:3;"
-            "DMT/_2->D:5;DMT/_3->D:6;DMT/_4->D:7");
+      "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
+      "I(Mul)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
+      "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
+      "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
+      "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
+}
+
+/* Test LRN->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, LRN_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'D', 'B'] }"
+      "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklLRNGrad);F(Mul)|"
+            "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
+}
+
+/* Test LRN->LRNGrad replacement when only one of them is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Mul);DMT/_0(Const)|"
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+/* Test LRN->LRNGrad replacement when only one of them is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+/* Test LRN->LRNGrad negative case, where single LRN feeds
+   2 LRNGrad nodes at different slots. */
+TEST_F(MklLayoutPassTest, LRN_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'D', 'B'] }"
+      "node { name: 'F' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'B', 'D'] }"
+      "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['E', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
+            "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Mul)|A->B;"
+            "A:control->DMT/_0:control;B->E:2;"
+            "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
+            "C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "C:control->DMT/_3:control;C:control->DMT/_4:control;"
+            "C:control->DMT/_5:control;C:control->DMT/_6:control;"
+            "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
+            "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
+}
+
+/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['C', 'B', 'D'] }"
+      "node { name: 'F' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Mul)|"
+            "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
+}
+
+// Test MaxPool>MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPool node but workspace edges will not
+// be present.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklMaxPool);C(Mul);DMT/_0(Const)|"
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+// Test MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPoolGrad and for workspace tensor and
+// its Mkl part, we will generate dummy tensor.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Mul)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:2, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:2, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:2} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+// Single Conv2D Op on GPU device
+// No rewrite should happen
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Conv2D);D(Mul)|A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Sub);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
+            "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
+            "M->D:3;N->D:4;O->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Mul)|"
+            "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Relu);C(Mul)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(ReluGrad);D(Mul)|A->C;A->D;B->C:1;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(AvgPool);C(Mul)|A->B;A->C;B->C:1");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(Concat);E(Mul)|A->D;"
+            "B->D:1;B:1->D:2;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(ConcatV2);E(Mul)|"
+            "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);E(Input);"
+            "F(FusedBatchNorm);G(Mul)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Sub'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
+            "M(_MklInput);N(_MklInput);Y(Input);Z(Sub)|A->C;"
+            "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
 }
 
 /////////////////////////////////////////////////////////////////////
diff --git a/tensorflow/core/graph/mkl_optimizer_merge.cc b/tensorflow/core/graph/mkl_optimizer_merge.cc
deleted file mode 100644
index a171a27d8f5fdf0d0619fcdd16bb35879eac9a39..0000000000000000000000000000000000000000
--- a/tensorflow/core/graph/mkl_optimizer_merge.cc
+++ /dev/null
@@ -1,651 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef INTEL_MKL
-// This module implements node merging optimization on the graph.
-// We process the nodes in the graph in reverse postorder
-// (i.e. inputs before their downstream dependencies).
-//
-#include <memory>
-#include <queue>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/core/graph/mkl_optimizer_merge.h"
-
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/framework/node_def_util.h"
-#include "tensorflow/core/graph/algorithm.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/node_builder.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-// How many hops do we search for matching node in the backward dataflow graph?
-// We use maxhop of 10 based on empirical observations. Also, these are
-// maxhops in backward data-flow graph. Since input of forward nodes (Conv2D)
-// directly goes to backward nodes, we do not expect the hop-distance
-// would be more than few nodes.
-static size_t kNodeMergeContextMaxDepth = 10;
-
-// This optimization pass performs two tasks: merge
-// nodes in the forward pass, and rewrite the gradient ops
-// corresponding to merged forward ops.
-//
-// Merging nodes in the graph: Currently, it merges Conv2D+AddBias together.
-//
-// Rewriting nodes in the graph: This is neded in order to optimize
-// gradient ops of Conv2D+AddBias. Gradient op of both the Conv2D and
-// MatMul is BiasAddGrad, and we need to rewrite BiasAddGrad into
-// Conv2D-specific BiasAddGrad, and MatMul-specific BiasAddGrad.
-// This is context-specific optimization, where the context is the
-// forward operator that the BiasAddGrad corresponds to.
-class NodeMergeRewritePass : public GraphOptimizationPass {
- public:
-  NodeMergeRewritePass() {
-    csinfo_.conv2d = "MklConv2D";
-    csinfo_.conv2dwithbias = "MklConv2DWithBias";
-    csinfo_.conv2dwithbiasbackpropbias = "Conv2DWithBiasBackpropBias";
-    csinfo_.biasadd = "BiasAdd";
-    csinfo_.matmul = "MatMul";
-    csinfo_.biasaddgrad = "BiasAddGrad";
-
-    minfo_.push_back(
-        {csinfo_.conv2d, csinfo_.biasadd, 0, csinfo_.conv2dwithbias});
-
-// We use maxhop of 10 based on emperical observations. Also, these are
-// maxhops in backward data-flow graph. Since input of forward nodes
-// (Conv2D) directly goes to backward nodes, we do not expect the
-// hop-distance would be more than few nodes.
-// TODO(nhasabni) Temporarily disabling rewrite of BiasAddGrad.
-// Will enable it once we support Conv2DWithBiasBackpropBias op.
-#if 0
-    rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias,
-                  {csinfo_.conv2dwithbias, kNodeMergeContextMaxDepth}});
-    rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias,
-                  {csinfo_.conv2d, kNodeMergeContextMaxDepth}});
-    // For now, we are rewriting BiasAddGrad to BiasAddGrad for MatMul. This is
-    // because we do not have a separate Op for MatMulwithBias.
-    rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.biasaddgrad,
-                      {csinfo_.matmul, kNodeMergeContextMaxDepth}});
-#endif
-  }
-
-  // Standard interface to run optimization pass
-  Status Run(const GraphOptimizationPassOptions& options);
-
-  // Helper function which does most of heavy lifting for node merge
-  //
-  // Extracts common functionality between Run public interface and
-  // test interface.
-  //
-  // @return true, if and only if graph is mutated; false otherwise.
-  bool RunPass(std::unique_ptr<Graph>* g);
-
- private:
-  /// Structure to specify information used in node merge
-  typedef struct {
-    string pred;     // Predecessor node string
-    string succ;     // Successor node string
-    int op;          // What operand no the predecessor node corresponds
-                     // to successor node?
-    string newnode;  // Name of the node after merge
-  } MergeInfo;
-
-  /// Structure to specify information used in node rewrite
-  typedef struct {
-    string node;     // Name of the node to be rewritten
-    string rewrite;  // New name of the node after rewrite
-    typedef struct {
-      string fwd;     // Node name in forward pass that this node
-                      // corresponds to
-      size_t maxhop;  // Maximum number of hops the mfwd_ is located
-                      // from this node. If mfwd_ is farther than mmaxhop_
-                      // then we do not rewrite the node.
-    } ContextInfo;
-    ContextInfo cinfo;  // Context for rewrite
-  } RewriteInfo;
-
-  /// Structure to store all constant strings
-  typedef struct {
-    string conv2d;
-    string conv2dwithbias;
-    string conv2dwithbiasbackpropbias;
-    string biasadd;
-    string matmul;
-    string biasaddgrad;
-  } ConstStringInfo;
-
-  ConstStringInfo csinfo_;
-  std::vector<MergeInfo> minfo_;
-  std::vector<RewriteInfo> rinfo_;
-
- private:
-  // Return a node that can be merged with input node
-  //
-  // @return pointer to the node if we can find such a
-  // node. Otherwise, it returns nullptr.
-  Node* FindNodeForMerge(const Node* a) const;
-
-  // Merge predecessor node with its successor.
-  // Currently, we merge Conv2D with AddBias only.
-  //
-  // Input nodes succ and pred may be deleted if the call to
-  // this function is successful. Attempt to use the pointers
-  // after the call to function may result is undefined behaviors.
-  //
-  // @input g - input graph, succ - successor node, pred - predecessor node
-  // @return Status::OK(), if merging is successful and supported.
-  //         Returns appropriate Status error code otherwise.
-  //         Graph is updated in case nodes are merged. Otherwise, it is
-  //         not updated.
-  Status MergeNode(std::unique_ptr<Graph>* g, Node* succ, Node* pred);
-
-  // Is input node (n) a candidate for rewrite?
-  //
-  // @return true, if it can be rewritten; false, otherwise.
-  bool IsApplicableRewriteNode(const Node* n) const;
-
-  // Rewrites input node to a new node specified by its matching rewrite info.
-  //
-  // Method first searches matching rewrite info for input node and then
-  // uses that info to rewrite.
-  //
-  // Input node may be deleted in case of rewrite. Attempt to use the node
-  // after the call can result in undefined behaviors.
-  //
-  // @input  g - input graph, n - Node to be rewritten
-  // @return Status::OK(), if the input node is rewritten;
-  //         Returns appropriate Status error code otherwise.
-  //         Graph is updated in case the input node is rewritten.
-  //         Otherwise, it is not updated.
-  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n);
-
-  // Helper function that searches the matching rewriteinfo for the node.
-  // Implements depth-first search in the data dependence graph for the
-  // gradient op in backward direction.
-  //
-  // @input n - Node (gradient op) whose rewriteinfo is to be searched,
-  //        fwdn - pointer to node from the forward pass that this node
-  //        belongs to
-  // @return Matching rewriteinfo in case a match is found; null otherwise.
-  const RewriteInfo* FindMatchingRewriteInfo(const Node* n,
-                                             const Node** fwdn) const;
-
-  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
-  // and return it in '*out'.
-  // TODO(nhasabni) We should move this to mkl_util.h
-  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out);
-};
-
-// We register merge optimizer for phase 2 in pre-placement group.
-// Do not change the ordering of the Mkl passes.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 2,
-                      NodeMergeRewritePass);
-
-static void FillInputs(const Node* n,
-                       gtl::InlinedVector<Node*, 4>* control_edges,
-                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
-  DCHECK_EQ(in->size(), n->num_inputs());
-  control_edges->clear();
-  for (const Edge* e : n->in_edges()) {
-    if (e->IsControlEdge()) {
-      control_edges->push_back(e->src());
-    } else {
-      (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
-    }
-  }
-  std::sort(control_edges->begin(), control_edges->end());
-  if (n->op_def().is_commutative()) {
-    // For commutative inputs, we sort the input by the input Node*
-    // to get a canonical ordering (so that add(a,b) and add(b, a) will
-    // hash to the same value if is_commutative is true for 'add').
-    std::sort(in->begin(), in->end());
-  }
-}
-
-Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const {
-  // Search for all matching mergeinfo.
-  // We allow more than one match for extensibility.
-  std::vector<const MergeInfo*> matching_mi;
-  for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
-    if (a->type_string() == mi->succ) {
-      matching_mi.push_back(&*mi);
-    }
-  }
-
-  for (const MergeInfo* mi : matching_mi) {
-    const int N_in = a->num_inputs();
-    if (mi->op >= N_in) {
-      continue;
-    }
-
-    // Get the control edges and input of node
-    gtl::InlinedVector<Node*, 4> a_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
-    FillInputs(a, &a_control_edges, &a_in);
-
-    // Get operand op of the operator
-    Node* b = nullptr;
-    b = a_in[mi->op].first;
-    if (b == nullptr || (b->type_string() != mi->pred)) {
-      // NOTE: Should the first check be assert?
-      continue;
-    }
-
-    gtl::InlinedVector<Node*, 4> b_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(N_in);
-    FillInputs(b, &b_control_edges, &b_in);
-
-    // Shouldn't merge if a and b have different control edges.
-    if (a_control_edges != b_control_edges) {
-      continue;
-    } else {
-      // We found a match.
-      return b;
-    }
-  }
-
-  return nullptr;
-}
-
-void NodeMergeRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
-                                                 Node** out) {
-  const DataType dt = DataTypeToEnum<uint8>::v();
-  TensorProto proto;
-  proto.set_dtype(dt);
-  uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
-                           8);
-  TensorShape dummy_shape({8});
-  dummy_shape.AsProto(proto.mutable_tensor_shape());
-  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                  .Attr("value", proto)
-                  .Attr("dtype", dt)
-                  .Finalize(&**g, out));
-}
-
-Status NodeMergeRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
-                                       Node* pred) {
-  CHECK_NOTNULL(succ);
-  CHECK_NOTNULL(pred);
-
-  if (succ->type_string() == csinfo_.biasadd &&
-      pred->type_string() == csinfo_.conv2d) {
-    // 1. Get all attributes from input nodes.
-    DataType T_pred, T_succ;
-    string padding;
-    std::vector<int32> strides;
-    string data_format_pred, data_format_succ;
-    bool use_cudnn_on_gnu;
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
-    TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
-    TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-    TF_CHECK_OK(
-        GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
-    // We check to ensure that data formats of both succ and pred are same.
-    // We expect them to be same, so we can enforce this as assert.
-    // But assert can be too strict, so we enforce this as a check.
-    // If the check fails, then we do not merge two nodes.
-    // We also do same check for devices.
-    if (data_format_pred != data_format_succ || T_pred != T_succ ||
-        pred->assigned_device_name() != succ->assigned_device_name() ||
-        pred->def().device() != succ->def().device()) {
-      return Status(error::Code::INVALID_ARGUMENT,
-                    "data_format or T attribute or devices of Conv2D and "
-                    "BiasAdd do not match. Will skip node merge optimization");
-    }
-
-    // 2. Get inputs from both the nodes.
-    // Find the 2 inputs from the conv and the bias from the add Bias.
-    Node* oper1 = nullptr;
-    Node* oper1_mkl = nullptr;  // Mkl tensor corresponding to oper1
-    Node* oper2 = nullptr;
-    Node* oper2_mkl = nullptr;  // Mkl tensor corresponding to oper2
-    Node* oper3 = nullptr;
-    Node* oper3_mkl = nullptr;  // Mkl tensor corresponding to oper3
-
-    const int succ_num = succ->num_inputs();
-    gtl::InlinedVector<Node*, 4> succ_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
-    FillInputs(succ, &succ_control_edges, &succ_in);
-
-    const int pred_num = pred->num_inputs();
-    gtl::InlinedVector<Node*, 4> pred_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
-    FillInputs(pred, &pred_control_edges, &pred_in);
-
-    // We need to ensure that there is only 1 edge between Conv2D and AddBias.
-    // Otherwise, merging is semantically incorrect.
-    if (pred->out_edges().size() != 1) {
-      return Status(error::Code::INVALID_ARGUMENT,
-                    "Conv2D has multiple outputs."
-                    "Will skip node merge optimization");
-    }
-
-    for (const Edge* e : pred->out_edges()) {
-      if (e->dst() != succ) {
-        return Status(error::Code::INVALID_ARGUMENT,
-                      "Conv2D does not feed to BiasAdd."
-                      "Will skip node merge optimization");
-      }
-    }
-
-    // Get operand 0, 1 of conv2D and their Mkl tensors.
-    CHECK_EQ(pred->in_edges().size(), 4);  // MklConv2D must have 4 inputs.
-    oper1 = pred_in[0].first;
-    oper1_mkl = pred_in[1].first;
-    oper2 = pred_in[2].first;
-    oper2_mkl = pred_in[3].first;
-    // Get operand 1 of add_bias
-    // BiasAdd must have 2 inputs: Conv, bias
-    CHECK_EQ(succ->in_edges().size(), 2);
-    oper3 = succ_in[1].first;
-    GetDummyMklTensorNode(g, &oper3_mkl);  // Get dummy Mkl tensor node
-    // as BiasAdd does not have Mkl tensor as input.
-    CHECK_NOTNULL(oper3_mkl);
-
-    Node* ret;
-    // We will use the node name of BiasAdd as the name of new node
-    TF_CHECK_OK(NodeBuilder(succ->name(), csinfo_.conv2dwithbias)
-                    .Input(oper1)
-                    .Input(oper1_mkl)
-                    .Input(oper2)
-                    .Input(oper2_mkl)
-                    .Input(oper3)
-                    .Input(oper3_mkl)
-                    .Attr("T", T_pred)
-                    .Attr("strides", strides)
-                    .Attr("padding", padding)
-                    .Attr("data_format", data_format_pred)
-                    .Attr("use_cudnn_on_gpu", use_cudnn_on_gnu)
-                    .Device(succ->def().device())
-                    .Finalize(&**g, &ret));
-    CHECK_NOTNULL(ret);
-
-    // Incoming edges are fixed, we will fix the outgoing edges now.
-    for (const Edge* e : succ->out_edges()) {
-      (*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input());
-    }
-
-    // Copy device assigned to old node to new node.
-    // It's ok to use pred or succ as we have enforced a check that
-    // both have same device assigned.
-    ret->set_assigned_device_name(pred->assigned_device_name());
-
-    VLOG(1) << "NodeMergeRewritePass: Merged old node:" << pred->DebugString()
-            << ", and node: " << succ->DebugString()
-            << ", into node:" << ret->DebugString();
-
-    (*g)->RemoveNode(succ);
-    (*g)->RemoveNode(pred);
-
-    return Status::OK();
-  }
-
-  return Status(error::Code::UNIMPLEMENTED,
-                "Unimplemented case for node merge optimization.");
-}
-
-Status NodeMergeRewritePass::RewriteNode(std::unique_ptr<Graph>* g, Node* n) {
-  CHECK_NOTNULL(n);
-
-  // Get the matching rewriteinfo for the node
-  const Node* fwdn = nullptr;
-  const RewriteInfo* ri = FindMatchingRewriteInfo(n, &fwdn);
-  if (ri == nullptr || fwdn == nullptr) {
-    VLOG(2) << "NodeMergeRewritePass: Rewriteinfo not found for: "
-            << n->type_string();
-    return Status(error::Code::INVALID_ARGUMENT,
-                  "Rewrite info not found for the node."
-                  "Will skip node rewrite optimization");
-  }
-
-  VLOG(1) << "NodeMergeRewritePass: Rewrite called for: " << n->type_string();
-
-  if (n->type_string() == csinfo_.biasaddgrad &&
-      ri->node == csinfo_.biasaddgrad &&
-      (ri->rewrite == csinfo_.conv2dwithbiasbackpropbias ||
-       ri->rewrite == csinfo_.biasaddgrad)) {
-    DataType T;
-    string data_format;
-    TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T));
-    TF_CHECK_OK(GetNodeAttr(n->def(), "data_format", &data_format));
-
-    int n_num = n->num_inputs();  // this must be 1.
-    CHECK_EQ(n_num, 1);
-
-    gtl::InlinedVector<Node*, 4> n_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> n_in(n_num);
-    FillInputs(n, &n_control_edges, &n_in);
-
-    Node *ret = nullptr, *op = n_in[0].first;
-
-    if (ri->rewrite == csinfo_.conv2dwithbiasbackpropbias) {
-      // Get strides info from Conv2D (node in the forward pass that this
-      // node corresponds to).
-      std::vector<int32> strides;
-      TF_CHECK_OK(GetNodeAttr(fwdn->def(), "strides", &strides));
-
-      // We use same name as original node name as there may be fetchoutputs
-      // associated with it.
-      TF_CHECK_OK(NodeBuilder(n->name(), ri->rewrite)
-                      .Input(op)
-                      .Attr("T", T)
-                      .Attr("data_format", data_format)
-                      .Attr("strides", strides)
-                      .Device(n->def().device())
-                      .Finalize(&**g, &ret));
-    } else {
-      CHECK_EQ(ri->rewrite, csinfo_.biasaddgrad);
-      TF_CHECK_OK(NodeBuilder(n->name(), ri->rewrite)
-                      .Input(op)
-                      .Attr("T", T)
-                      .Attr("data_format", data_format)
-                      .Device(n->def().device())
-                      .Finalize(&**g, &ret));
-    }
-
-    CHECK_NOTNULL(ret);
-
-    // Incoming edges are fixed, we will fix the outgoing edges now.
-    for (const Edge* e : n->out_edges()) {
-      (*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input());
-    }
-
-    // Copy device assigned to old node to new node.
-    ret->set_assigned_device_name(n->assigned_device_name());
-
-    VLOG(1) << "MKLOptimizerMergePass: Rewrote old node:" << n->DebugString()
-            << ", into node:" << ret->DebugString();
-    (*g)->RemoveNode(n);
-
-    return Status::OK();
-  }
-
-  return Status(error::Code::UNIMPLEMENTED,
-                "Unimplemented case for node rewrite optimization.");
-}
-
-const NodeMergeRewritePass::RewriteInfo*
-NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n,
-                                              const Node** fwdn) const {
-  CHECK_NOTNULL(n);
-  CHECK_NOTNULL(fwdn);
-  *fwdn = nullptr;
-
-  // Search for matching rewriteinfo based on node name.
-  // There could be more than one matching rewriteinfos.
-  std::vector<const RewriteInfo*> matching_ri;
-  for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
-    if (n->type_string() == ri->node) {
-      matching_ri.push_back(&*ri);
-    }
-  }
-
-  VLOG(1) << "NodeMergeRewritePass: Searching graph for: " << n->type_string()
-          << " in backwards.";
-
-  // Now we will check for forward op name for rewrite info in data
-  // flow graph. Get the max hops we should search for the fwd node
-  // We are now going to search (breadth-first) backwards in data
-  // dependence graph (for up to max hops) from n for the node
-  // specified in fwd.
-  // queue to maintain nodes to be visited and depth info for
-  // breadth-first search
-  std::queue<std::pair<const Node*, int>> nqueue;
-  const Node* curr_node = n;
-  size_t curr_depth = 0;
-  nqueue.push(std::make_pair(curr_node, curr_depth));
-
-  while (curr_depth < kNodeMergeContextMaxDepth && !nqueue.empty()) {
-    std::pair<const Node*, int> curr_pair = nqueue.front();
-    nqueue.pop();
-
-    std::set<const Node*> visited_nodes;
-    curr_node = curr_pair.first;
-    curr_depth = curr_pair.second;
-    CHECK_NOTNULL(curr_node);
-
-    VLOG(1) << "NodeMergeRewritePass: Visiting node: "
-            << curr_node->type_string() << " at depth: " << curr_depth
-            << " for node: " << n->type_string();
-
-    // If we find a match, we return immediately with the matching rewrite
-    // info.
-    for (const RewriteInfo* ri : matching_ri) {
-      if (curr_node->type_string() == ri->cinfo.fwd) {
-        *fwdn = curr_node;
-        return ri;
-      }
-    }
-
-    // Else we explore backward edges from current node.
-    // Add the source nodes of all incoming edges of the node to the queue.
-    for (const Edge* e : curr_node->in_edges()) {
-      // We do not visit already visited node.
-      if (visited_nodes.find(e->src()) == visited_nodes.end()) {
-        // Depth of these nodes is 1 more than the depth of current node.
-        nqueue.push(std::make_pair(e->src(), curr_depth + 1));
-        visited_nodes.insert(e->src());
-      }
-    }
-  } /* while */
-
-  return nullptr;
-}
-
-bool NodeMergeRewritePass::IsApplicableRewriteNode(const Node* n) const {
-  CHECK_NOTNULL(n);
-
-  // Search for matching rewriteinfo
-  // Even if we find one match, we return true.
-  bool match_found = false;
-  for (const RewriteInfo& ri : rinfo_) {
-    if (n->type_string() == ri.node) {
-      match_found = true;
-      break;
-    }
-  }
-
-  return match_found;
-}
-
-bool NodeMergeRewritePass::RunPass(std::unique_ptr<Graph>* g) {
-  bool result = false;
-  CHECK_NOTNULL(g);
-
-  DumpGraph("Before OptimizeMerge", &**g);
-
-  std::vector<Node*> order;
-  GetReversePostOrder(**g, &order);
-  std::vector<std::pair<Node*, Node*>> nodes_to_be_merged;
-  std::vector<Node*> nodes_to_be_rewritten;
-
-  for (Node* n : order) {
-    if (!n->IsOp()) continue;
-    Node* n1 = nullptr;
-    if ((n1 = FindNodeForMerge(n)) != nullptr) {
-      VLOG(1) << "NodeMergeRewritePass: Scheduled nodes " << n->name()
-              << " and " << n1->name() << " for merging";
-      nodes_to_be_merged.push_back(std::make_pair(n, n1));
-    } else if (IsApplicableRewriteNode(n)) {
-      VLOG(1) << "NodeMergeRewritePass: Scheduled node " << n->name()
-              << " for rewrite";
-      nodes_to_be_rewritten.push_back(n);
-    }
-  }
-
-  for (std::pair<Node*, Node*> i : nodes_to_be_merged) {
-    // Even if MergeNode merges single pair of nodes, we
-    // need to return true.
-    string n1_name = i.first->name();
-    string n2_name = i.second->name();
-    if (MergeNode(g, i.first, i.second) == Status::OK()) {
-      VLOG(1) << "NodeMergeRewritePass: Merged nodes " << n1_name << " and "
-              << n2_name;
-      result = true;
-    }
-  }
-
-  DumpGraph("After OptimizeMerge(nodemerge)", &**g);
-
-  for (Node* i : nodes_to_be_rewritten) {
-    string name = i->name();
-    if (RewriteNode(g, i) == Status::OK()) {
-      VLOG(1) << "NodeMergeRewritePass: Rewrite node: " << name
-              << " successful.";
-      result = true;
-    }
-  }
-
-  DumpGraph("After OptimizeMerge(noderewrite)", &**g);
-
-  return result;
-}
-
-bool OptimizeNodeMerge(std::unique_ptr<Graph>* g) {
-  return NodeMergeRewritePass().RunPass(g);
-}
-
-Status NodeMergeRewritePass::Run(const GraphOptimizationPassOptions& options) {
-  if (options.graph == nullptr) {
-    return Status::OK();
-  }
-
-  // Get the ownership of graph
-  std::unique_ptr<Graph>* g = std::move(options.graph);
-
-  RunPass(g);
-
-  // Return the ownership of graph back
-  options.graph->reset(g->release());
-
-  return Status::OK();
-}
-
-}  // namespace tensorflow
-
-#endif
diff --git a/tensorflow/core/graph/mkl_optimizer_merge_test.cc b/tensorflow/core/graph/mkl_optimizer_merge_test.cc
deleted file mode 100644
index f752721d6e070d334f314546ee9f86d5b523e02d..0000000000000000000000000000000000000000
--- a/tensorflow/core/graph/mkl_optimizer_merge_test.cc
+++ /dev/null
@@ -1,470 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifdef INTEL_MKL
-
-#include "tensorflow/core/graph/mkl_optimizer_merge.h"
-
-#include <vector>
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/testlib.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/random/simple_philox.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/test_benchmark.h"
-
-namespace tensorflow {
-namespace {
-
-class OptimizerMergeTest : public ::testing::Test {
- public:
-  OptimizerMergeTest() : graph_(OpRegistry::Global()) {}
-
-  static void InitGraph(const string& s, Graph* graph) {
-    GraphDef graph_def;
-
-    auto parser = protobuf::TextFormat::Parser();
-    CHECK(parser.MergeFromString(s, &graph_def)) << s;
-    GraphConstructorOptions opts;
-    TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
-  }
-
-  void InitGraph(const string& s) {
-    InitGraph(s, &graph_);
-    original_ = CanonicalGraphString(&graph_);
-  }
-
-  static bool IncludeNode(const Node* n) { return n->IsOp(); }
-
-  static string EdgeId(const Node* n, int index) {
-    if (index == 0) {
-      return n->name();
-    } else if (index == Graph::kControlSlot) {
-      return strings::StrCat(n->name(), ":control");
-    } else {
-      return strings::StrCat(n->name(), ":", index);
-    }
-  }
-
-  string CanonicalGraphString(Graph* g) {
-    std::vector<string> nodes;
-    std::vector<string> edges;
-    for (const Node* n : g->nodes()) {
-      if (IncludeNode(n)) {
-        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
-      }
-    }
-    for (const Edge* e : g->edges()) {
-      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
-        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
-                                        EdgeId(e->dst(), e->dst_input())));
-      }
-    }
-    // Canonicalize
-    std::sort(nodes.begin(), nodes.end());
-    std::sort(edges.begin(), edges.end());
-    return strings::StrCat(str_util::Join(nodes, ";"), "|",
-                           str_util::Join(edges, ";"));
-  }
-
-  string DoNodeMerge() {
-    string before = CanonicalGraphString(&graph_);
-    LOG(ERROR) << "Before node merge optimize: " << before;
-
-    std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
-    OptimizeNodeMerge(ug);
-
-    string result = CanonicalGraphString(&graph_);
-    LOG(ERROR) << "After node merge optimize:  " << result;
-    return result;
-  }
-
-  const string& OriginalGraph() const { return original_; }
-
-  Graph graph_;
-  string original_;
-};
-
-REGISTER_OP("Input").Output("o: float").SetIsStateful();
-REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
-
-TEST_F(OptimizerMergeTest, Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Mul);D(Mul)|"
-            "A->C;A->D;B->C:1;B->D:1");
-}
-
-// Test set 1: Conv2D + AddBias
-
-// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y)
-TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);"
-            "M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;"
-            "DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1");
-}
-
-// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
-// We do not merge in this case as op is Conv2D and not MklConv2D.
-TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoMklConv2D) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|"
-            "A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1");
-}
-
-// Graph contains only MklConv2D, no AddBias.
-TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoAddBias) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|"
-            "A->C;B->C:2;M->C:1;N->C:3");
-}
-
-// MklConv2D output does not go to BiasAdd.
-TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }");  // Output of MklConv2D does not go to BiasAdd.
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
-            "M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3");
-}
-
-// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add).
-// Merge should not be done in such case.
-TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }"  // Conv2D has two outputs.
-                              // No merge should happen.
-      "node { name: 'G' op: 'Add'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['C', 'E'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);"
-            "G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;"
-            "E->F:1;E->G:1;M->C:1;N->C:3");
-}
-
-// data_format attribute value mismatch. Merge should not be done
-// in such case.
-TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_AttrMismatch) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NHCW' } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);"
-            "N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3");
-}
-
-#if 0
-// This test set is disabled temporarily as we do not enable node rewrite.
-// This test set will be enabled when we support Mkl-specific kernels for
-// backward bias.
-//
-// Test set 2: MklConv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias
-// rewrite tests
-
-// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D)
-TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
-      "node { name: 'D' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(MklConv2D);D(Sub);E(Conv2DWithBiasBackpropBias);"
-            "M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;M->C:1;N->C:3");
-}
-
-// No MklConv2D in context, but Conv2D in context. No rewrite should happen.
-// C=Conv2D(A,B); D=Sub(C,A); E=BiasAddGrad(D)
-TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoMklConv2D) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Conv2D);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// No Conv2D in the context for BiasAddGrad. No rewrite should happen.
-// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
-TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Add'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// No Conv2D in the context for BiasAddGrad, but MatMul in context.
-// Rewrite should happen, but name of BiasAddGrad does not change.
-// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
-TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'MatMul'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'transpose_a'      value { b: false } }"
-      " attr { key: 'transpose_b'      value { b: false } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
-// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D)
-TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'MatMul'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'transpose_a'      value { b: false } }"
-      " attr { key: 'transpose_b'      value { b: false } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(MatMul);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// No MatMul in the context for BiasAddGrad. No rewrite should happen.
-// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D)
-TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Add'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoNodeMerge(),
-            "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|"
-             "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-#endif
-
-static void BM_NodeMerge(int iters, int op_nodes) {
-  testing::StopTiming();
-  string s;
-  for (int in = 0; in < 10; in++) {
-    s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
-  }
-  random::PhiloxRandom philox(301, 17);
-  random::SimplePhilox rnd(&philox);
-  for (int op = 0; op < op_nodes; op++) {
-    s += strings::Printf(
-        "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
-        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
-        op, rnd.Uniform(10), rnd.Uniform(10));
-  }
-
-  bool first = true;
-  while (iters > 0) {
-    Graph* graph = new Graph(OpRegistry::Global());
-    OptimizerMergeTest::InitGraph(s, graph);
-    int N = graph->num_node_ids();
-    if (first) {
-      testing::SetLabel(strings::StrCat("Per graph node.  Nodes: ", N));
-      first = false;
-    }
-    {
-      testing::StartTiming();
-      std::unique_ptr<Graph> ug(graph);
-      OptimizeNodeMerge(&ug);
-      testing::StopTiming();
-    }
-    iters -= N;  // Our benchmark units are individual graph nodes,
-                 // not whole graphs
-    // delete graph;
-  }
-}
-BENCHMARK(BM_NodeMerge)->Arg(1000)->Arg(10000);
-
-}  // namespace
-}  // namespace tensorflow
-
-#endif /* INTEL_MKL */
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 2097d432be7449d117e2e0fdb2fa8ab7d4ab446f..590b3d030fa212ec4f510ef35fb7a425f2aa2f9e 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -40,16 +40,16 @@ namespace tensorflow {
 
 // This pass inserts Mkl to Tf tensor conversion nodes (represented by C)
 // in the graph in between A and B, where A and B match any one
-// of the following
-// cases:
-//  1) A = layer/Op that generates output in Mkl format and,
-//     B = layer/Op that does not accept input in Mkl format and,
+// of the following cases:
+//
+//  1) A = a node that generates output in the Mkl format and,
+//     B = a node that does not accept input in the Mkl format and,
 //     A -> B (there is a direct edge between A and B, then
 //     We will insert C such that A->C->B.
 //
-//  2) A = layer/Op that generates output in Mkl format and,
-//     B = NULL (in other words, A is the last layer in the graph), then
-//     We will insert C such that A->C->B. (C will be the last layer.)
+//  2) A = a node that generates output in the Mkl format and,
+//     B = NULL (in other words, A is the last node in the graph), then
+//     We will insert C such that A->C->B. (C will be the last node.)
 //
 //  Note that case 1 applies to all outputs of A that are input to B.
 //  In other words, the conversions will be required for every output
@@ -59,9 +59,9 @@ namespace tensorflow {
 //  do the conversion for A1 and A2 only. We do not need to do any conversion
 //  for A3.
 //
-// This pass relies on layers registering themselves about their Mkl compliant.
-// Mkl compliant layer can accept inputs in Mkl format, and produce output in
-// Mkl format. Non-compliant layer accepts inputs and outputs in
+// This pass relies on ops registering themselves about their Mkl compliance.
+// An Mkl-compliant op can accept inputs in the Mkl format, and produce outputs
+// in the Mkl format. Non-compliant ops accept inputs and outputs in the
 // TensorFlow format.
 //
 class MklToTfConversionPass : public GraphOptimizationPass {
@@ -84,7 +84,7 @@ class MklToTfConversionPass : public GraphOptimizationPass {
   // @input T Datatype to use for checking input op
   // @return true if op is Mkl supported; false, otherwise.
   inline bool IsMklSupportedOp(const string& op_name, DataType T) const {
-    return mkl_layer_registry::IsMklLayer(op_name, T);
+    return mkl_op_registry::IsMklOp(op_name, T);
   }
 
   // Insert layout conversion node on the edge pointed by 'e' from graph 'g'.
@@ -98,12 +98,13 @@ class MklToTfConversionPass : public GraphOptimizationPass {
   Status InsertConversionNodeOnEdge(std::unique_ptr<Graph>* g, Edge*);
 };
 
-// We register MklToTf insertion for phase 1 in post-partition grouping.
-// We register this pass after partitioning so that we get a complete
-// picture of inputs and outputs of the nodes in the graphs.
+// We register MklToTf insertion for phase 2 in post-partition grouping
+// because we register MklLayoutRewritePass for phase 1 in post-partition
+// grouping. We register this pass after partitioning so that we get a
+// complete picture of inputs and outputs of the nodes in the graphs.
 const OptimizationPassRegistry::Grouping kMklTfConvPassGroup =
     OptimizationPassRegistry::POST_PARTITIONING;
-REGISTER_OPTIMIZATION(kMklTfConvPassGroup, 1, MklToTfConversionPass);
+REGISTER_OPTIMIZATION(kMklTfConvPassGroup, 2, MklToTfConversionPass);
 
 Status MklToTfConversionPass::InsertConversionNodeOnEdge(
     std::unique_ptr<Graph>* g, Edge* e) {
@@ -121,22 +122,26 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
   string data_format;
 
   TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
-  TF_CHECK_OK(GetNodeAttr(dst->def(), "T", &dst_datatype));
-  if (src_datatype != dst_datatype) {
-    string err_msg = "T attribute of " + src->name() + " and " + dst->name() +
-                     " do not match. Will not insert" +
+  bool dst_dtype_found = GetNodeAttr(dst->def(), "T", &dst_datatype) ==
+                          Status::OK();
+  // We compare source and destination datatypes only when both are found.
+  if (dst_dtype_found && (src_datatype != dst_datatype)) {
+    string err_msg = "T attribute of " + src->name() + " and " +
+                      dst->name() + " do not match. Will not insert" +
                      " MklToTf node in such case.";
     return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
   }
 
-  // Lets build the conversion node and specify src as input.
+  // Build the conversion node and specify src as input.
   TF_CHECK_OK(
-      NodeBuilder((*g)->NewName("Mkl2Tf"), "MklToTf")
+      NodeBuilder((*g)->NewName("Mkl2Tf"), "_MklToTf")
           .Input(src, e->src_output())
-          .Input(src, e->src_output() + 1)  // Mkl tensor immediately
-                                            // follows Tf tensor.
-          .Device(src->def().device())      // We want to get conversion node
-                                            // on same device as source node.
+          .Input(src, DataIndexToMetaDataIndex(
+                          e->src_output(),
+                          src->num_outputs()))  // Get an Mkl tensor slot
+                                                // from the Tf tensor slot.
+          .Device(src->def().device())  // We want to get conversion node
+                                        // on same device as source node.
           .Attr("T", src_datatype)
           .Finalize(&**g, &conversion_node));
 
@@ -149,8 +154,8 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
   // We want conversion node to be on the same device as the source node.
   conversion_node->set_assigned_device_name(src->assigned_device_name());
 
-  // Set the Mkl layer label for this op.
-  conversion_node->AddAttr("_kernel", mkl_layer_registry::kMklLayerLabel);
+  // Set the Mkl op label for this op.
+  conversion_node->AddAttr("_kernel", mkl_op_registry::kMklOpLabel);
 
   // Now that we have added edge from src->conversion_node, let's add edge from
   // output of conversion_node to the dest node. Since conversion_node
@@ -173,11 +178,11 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("Before MklToTfConversionPass", &**g);
 
-  // Since we are looking for mkl-supported op node immediately
-  // followed by non-mkl op node, we will just iterate over edge
+  // Since we are looking for an Mkl-supported op node immediately
+  // followed by a non-Mkl op node, we will just iterate over edge
   // set of the graph.
-  // vector to maintain candiadate edges whose source and destination
-  // are candidate for inserting conversion node
+  // edge set whose source and destination are candidates for
+  // inserting conversion node
   std::vector<Edge*> candidate_edges;
 
   for (const Edge* e : (*g)->edges()) {
@@ -190,9 +195,9 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
     }
 
     // We skip adding MklToTf on an edge between X->MklToTf or
-    // MklToTf->X, where X is any layer.
-    if (src->type_string().compare("MklToTf") == 0 ||
-        dst->type_string().compare("MklToTf") == 0) {
+    // MklToTf->X, where X is any node.
+    if (src->type_string().compare("_MklToTf") == 0 ||
+        dst->type_string().compare("_MklToTf") == 0) {
       continue;
     }
 
@@ -200,19 +205,19 @@ bool MklToTfConversionPass::RunPass(std::unique_ptr<Graph>* g) {
             << src->type_string() << " and " << dst->type_string();
 
     // Let's get source and destination data type.
-    DataType src_datatype = DT_INVALID;
-    if (GetNodeAttr(src->def(), "T", &src_datatype) != Status::OK()) {
-      continue;
-    }
     // We cannot check datatype on destination node because destination node
     // may not be Mkl node.
-    DataType dst_datatype = DT_INVALID;
-    GetNodeAttr(dst->def(), "T", &dst_datatype);
+    DataType src_datatype;
+    DataType dst_datatype;
+    bool src_is_mkl_op = (GetNodeAttr(src->def(), "T", &src_datatype) ==
+                            Status::OK() &&
+                          IsMklSupportedOp(src->type_string(), src_datatype));
+    bool dst_is_mkl_op = (GetNodeAttr(dst->def(), "T", &dst_datatype) ==
+                            Status::OK() &&
+                          IsMklSupportedOp(dst->type_string(), dst_datatype));
 
     // Check if src with is Mkl-compliant, while dst is not Mkl-compliant.
-
-    if (IsMklSupportedOp(src->type_string(), src_datatype) &&
-       !IsMklSupportedOp(dst->type_string(), dst_datatype)) {
+    if (src_is_mkl_op && !dst_is_mkl_op) {
       VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name()
               << " and " << dst->name() << " for inserting conversion nodes";
       candidate_edges.push_back(const_cast<Edge*>(e));
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
index 4e211980d7fec757a5864ecb81cf02ec1de04f44..90bef111648452f823a669cab3c063377ed7bdef 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
@@ -16,10 +16,11 @@ limitations under the License.
 #ifdef INTEL_MKL
 
 #include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+#include "tensorflow/core/util/mkl_util.h"
 
-#include <vector>
-#include <string>
 #include <algorithm>
+#include <string>
+#include <vector>
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -109,7 +110,7 @@ class MklToTfConversionPass : public ::testing::Test {
 
 REGISTER_OP("Input").Output("o: float").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
-REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful();
+REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
 
 TEST_F(MklToTfConversionPass, Basic) {
   InitGraph(
@@ -125,58 +126,116 @@ TEST_F(MklToTfConversionPass, Basic) {
 }
 
 // MklConv2D followed by Non-Mkl layer
-// C=MklConv2D(A,M,B,N); E=Sub(C,D)
+// C=MklConv2D(A,M,B,N); E=Sub(C,D) (for interleaved ordering)
+// C=MklConv2D(A,B,M,N); E=Sub(C,D) (for contiguous ordering)
 TEST_F(MklToTfConversionPass, Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'D']}");
-  EXPECT_EQ(DoRunMklToTfConversionPass(),
-            "A(Input);B(Input);C(MklConv2D);D(Input);E(Sub);M(MklInput);"
-            "Mkl2Tf/_0(MklToTf);N(MklInput)|A->C;B->C:2;C->Mkl2Tf/_0;"
-            "C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    InitGraph(
+        "node { name: 'A' op: 'Input'}"
+        "node { name: 'M' op: '_MklInput'}"
+        "node { name: 'B' op: 'Input'}"
+        "node { name: 'N' op: '_MklInput'}"
+        "node { name: 'C' op: '_MklConv2D'"
+        " attr { key: 'T'                value { type: DT_FLOAT } }"
+        " attr { key: 'data_format'      value { s: 'NCHW' } }"
+        " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+        " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } "
+        "}"
+        " attr { key: 'padding'          value { s: 'SAME' } }"
+        " input: ['A', 'M', 'B', 'N']}"
+        "node { name: 'D' op: 'Input'}"
+        "node { name: 'E' op: 'Sub'"
+        " attr {key: 'T'                 value { type: DT_FLOAT } }"
+        " input: ['C', 'D']}");
+    EXPECT_EQ(DoRunMklToTfConversionPass(),
+              "A(Input);B(Input);C(_MklConv2D);D(Input);E(Sub);M(_MklInput);"
+              "Mkl2Tf/_0(_MklToTf);N(_MklInput)|A->C;B->C:2;C->Mkl2Tf/_0;"
+              "C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3");
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    InitGraph(
+        "node { name: 'A' op: 'Input'}"
+        "node { name: 'B' op: 'Input'}"
+        "node { name: 'M' op: '_MklInput'}"
+        "node { name: 'N' op: '_MklInput'}"
+        "node { name: 'C' op: '_MklConv2D'"
+        " attr { key: 'T'                value { type: DT_FLOAT } }"
+        " attr { key: 'data_format'      value { s: 'NCHW' } }"
+        " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+        " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } "
+        "}"
+        " attr { key: 'padding'          value { s: 'SAME' } }"
+        " input: ['A', 'B', 'M', 'N']}"
+        "node { name: 'D' op: 'Input'}"
+        "node { name: 'E' op: 'Sub'"
+        " attr {key: 'T'                 value { type: DT_FLOAT } }"
+        " input: ['C', 'D']}");
+    EXPECT_EQ(DoRunMklToTfConversionPass(),
+              "A(Input);B(Input);C(_MklConv2D);D(Input);E(Sub);M(_MklInput);"
+              "Mkl2Tf/_0(_MklToTf);N(_MklInput)|A->C;B->C:1;C->Mkl2Tf/_0;"
+              "C:1->Mkl2Tf/_0:1;D->E:1;M->C:2;Mkl2Tf/_0->E;N->C:3");
+  }
 }
 
 // MklConv2D followed by MklToTf op followed by Non-Mkl layer.
-// C=MklConv2D(A,M,B,N); D=MklToTf(C:0, C:1) F=Sub(D,E)
+// C=MklConv2D(A,M,B,N); D=MklToTf(C:0, C:1) F=Sub(D,E) (for interleaved)
+// C=MklConv2D(A,B,M,N); D=MklToTf(C:0, C:1) F=Sub(D,E) (for contiguous)
 // MklToTf node should not be inserted again.
 TEST_F(MklToTfConversionPass, Negative_DoubleInsert) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'M' op: 'MklInput'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'N' op: 'MklInput'}"
-      "node { name: 'C' op: 'MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'M', 'B', 'N']}"
-      "node { name: 'D' op: 'MklToTf'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C:0', 'C:1']}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'Sub'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'E']}");
-  EXPECT_EQ(DoRunMklToTfConversionPass(),
-            "A(Input);B(Input);C(MklConv2D);D(MklToTf);E(Input);"
-            "F(Sub);M(MklInput);N(MklInput)|"
-            "A->C;B->C:2;C->D;C:1->D:1;D->F;E->F:1;M->C:1;N->C:3");
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    InitGraph(
+        "node { name: 'A' op: 'Input'}"
+        "node { name: 'M' op: '_MklInput'}"
+        "node { name: 'B' op: 'Input'}"
+        "node { name: 'N' op: '_MklInput'}"
+        "node { name: 'C' op: '_MklConv2D'"
+        " attr { key: 'T'                value { type: DT_FLOAT } }"
+        " attr { key: 'data_format'      value { s: 'NCHW' } }"
+        " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+        " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } "
+        "}"
+        " attr { key: 'padding'          value { s: 'SAME' } }"
+        " input: ['A', 'M', 'B', 'N']}"
+        "node { name: 'D' op: '_MklToTf'"
+        " attr { key: 'T'                value { type: DT_FLOAT } }"
+        " attr { key: 'data_format'      value { s: 'NCHW' } }"
+        " input: ['C:0', 'C:1']}"
+        "node { name: 'E' op: 'Input'}"
+        "node { name: 'F' op: 'Sub'"
+        " attr {key: 'T'                 value { type: DT_FLOAT } }"
+        " input: ['D', 'E']}");
+    EXPECT_EQ(DoRunMklToTfConversionPass(),
+              "A(Input);B(Input);C(_MklConv2D);D(_MklToTf);E(Input);"
+              "F(Sub);M(_MklInput);N(_MklInput)|"
+              "A->C;B->C:2;C->D;C:1->D:1;D->F;E->F:1;M->C:1;N->C:3");
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    InitGraph(
+        "node { name: 'A' op: 'Input'}"
+        "node { name: 'B' op: 'Input'}"
+        "node { name: 'M' op: '_MklInput'}"
+        "node { name: 'N' op: '_MklInput'}"
+        "node { name: 'C' op: '_MklConv2D'"
+        " attr { key: 'T'                value { type: DT_FLOAT } }"
+        " attr { key: 'data_format'      value { s: 'NCHW' } }"
+        " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+        " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } "
+        "}"
+        " attr { key: 'padding'          value { s: 'SAME' } }"
+        " input: ['A', 'B', 'M', 'N']}"
+        "node { name: 'D' op: '_MklToTf'"
+        " attr { key: 'T'                value { type: DT_FLOAT } }"
+        " attr { key: 'data_format'      value { s: 'NCHW' } }"
+        " input: ['C:0', 'C:1']}"
+        "node { name: 'E' op: 'Input'}"
+        "node { name: 'F' op: 'Sub'"
+        " attr {key: 'T'                 value { type: DT_FLOAT } }"
+        " input: ['D', 'E']}");
+    EXPECT_EQ(DoRunMklToTfConversionPass(),
+              "A(Input);B(Input);C(_MklConv2D);D(_MklToTf);E(Input);"
+              "F(Sub);M(_MklInput);N(_MklInput)|"
+              "A->C;B->C:1;C->D;C:1->D:1;D->F;E->F:1;M->C:2;N->C:3");
+  }
 }
 
 // C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y);
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index 46e54c9eabe9cbe624799fabf40db64d5f03d311..500ac129e8b5d00ca2e392049bb6bb1ab138115f 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -21,14 +21,14 @@ limitations under the License.
 
 namespace tensorflow {
 
-NodeBuilder::NodeOut::NodeOut(Node* n, int i)  // NOLINT(runtime/explicit)
+NodeBuilder::NodeOut::NodeOut(Node* n, int32 i)  // NOLINT(runtime/explicit)
     : node(n),
       error(false),
       name(node != nullptr ? node->name() : (error = true, "")),
       index(i),
       dt(SafeGetOutput(node, i, &error)) {}
 
-NodeBuilder::NodeOut::NodeOut(StringPiece n, int i, DataType t)
+NodeBuilder::NodeOut::NodeOut(StringPiece n, int32 i, DataType t)
     : node(nullptr), error(false), name(n.ToString()), index(i), dt(t) {}
 
 NodeBuilder::NodeOut::NodeOut()
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index 2684e482865139c97afa7ba53cb67551affe279e..86647a49c12085b6850a0e6d2622ef1bb58c513d 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -49,13 +49,13 @@ class NodeBuilder {
   // ArraySlice.
   struct NodeOut {
     // For referencing an existing Node.
-    NodeOut(Node* n, int i = 0);
+    NodeOut(Node* n, int32 i = 0);
 
     // For referencing Nodes not in the graph being built. It is
     // useful when preparing a graph for ExtendSession or creating a
     // back edge to a node that hasn't been added to the graph yet,
     // but will be.
-    NodeOut(StringPiece name, int i, DataType t);
+    NodeOut(StringPiece name, int32 i, DataType t);
 
     // Default constructor for std::vector<NodeOut>.
     NodeOut();
@@ -67,7 +67,7 @@ class NodeBuilder {
     // * an out-of-range index was passed to the NodeOut constructor.
     bool error;
     string name;
-    int index;
+    int32 index;
     DataType dt;
   };
 
diff --git a/tensorflow/core/graph/optimizer_cse.cc b/tensorflow/core/graph/optimizer_cse.cc
index 59dff60ea3bafeb7f747b0e5424f448565e7bc85..a22a9b3fa31ff45fa7372e9270ac4ef8968b8f66 100644
--- a/tensorflow/core/graph/optimizer_cse.cc
+++ b/tensorflow/core/graph/optimizer_cse.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/graph/optimizer_cse.h"
 
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/graph/algorithm.h"
@@ -52,14 +53,12 @@ class OptimizerCSE {
  public:
   explicit OptimizerCSE(Graph* g) : g_(g) {}
 
-  bool Optimize(std::function<bool(const Node*)> consider_fn);
+  bool Optimize(const std::function<bool(const Node*)>& consider_fn);
 
  private:
-  struct Scratch;
-
   static size_t NodeHash(const Node* n);
-  static bool Equivalent(const Node* a, const Node* b, Scratch* s);
-  static bool EqualAttrs(const Node* a, const Node* b, Scratch* s);
+  static bool Equivalent(const Node* a, const Node* b,
+                         AttrSlice::Scratch* scratch);
 
   Graph* g_;
 };
@@ -109,7 +108,7 @@ size_t OptimizerCSE::NodeHash(const Node* n) {
   // Hash the attrs.  For example, this makes sure different constants
   // end up in different hash buckets.
   string tmp;
-  for (const auto& attr : n->def().attr()) {
+  for (const auto& attr : n->attrs()) {
     tmp = attr.first;
     attr.second.AppendToString(&tmp);
     // Add hashes of attrs, so the order of attrs doesn't matter.
@@ -121,28 +120,6 @@ size_t OptimizerCSE::NodeHash(const Node* n) {
   return h;
 }
 
-struct OptimizerCSE::Scratch {
-  // For EqualAttrs():
-  string a;
-  string b;
-};
-
-bool OptimizerCSE::EqualAttrs(const Node* a, const Node* b, Scratch* scratch) {
-  if (a->def().attr_size() != b->def().attr_size()) return false;
-
-  for (const auto& attr : b->def().attr()) {
-    auto iter = a->def().attr().find(attr.first);
-    if (iter == a->def().attr().end()) return false;
-    // Note: it should be safe to compare proto serializations of the attr
-    // values since at most one field should be set in each (indeed, it
-    // should be the same field).
-    iter->second.SerializeToString(&scratch->a);
-    attr.second.SerializeToString(&scratch->b);
-    if (scratch->a != scratch->b) return false;
-  }
-  return true;
-}
-
 static bool HasRefInput(const Node* n) {
   for (auto dt : n->input_types()) {
     if (IsRefType(dt)) return true;
@@ -150,7 +127,8 @@ static bool HasRefInput(const Node* n) {
   return false;
 }
 
-bool OptimizerCSE::Equivalent(const Node* a, const Node* b, Scratch* scratch) {
+bool OptimizerCSE::Equivalent(const Node* a, const Node* b,
+                              AttrSlice::Scratch* scratch) {
   // Different op names are different
   if (a->type_string() != b->type_string()) return false;
 
@@ -163,7 +141,7 @@ bool OptimizerCSE::Equivalent(const Node* a, const Node* b, Scratch* scratch) {
 
   // Compare attrs.  Note that equal attrs implies equal input and
   // output types.
-  if (!EqualAttrs(a, b, scratch)) return false;
+  if (!a->attrs().EqualAttrs(b->attrs(), scratch)) return false;
 
   // Compare input sources
   if (a->num_inputs() != b->num_inputs()) return false;
@@ -180,7 +158,8 @@ bool OptimizerCSE::Equivalent(const Node* a, const Node* b, Scratch* scratch) {
   return true;
 }
 
-bool OptimizerCSE::Optimize(std::function<bool(const Node*)> consider_fn) {
+bool OptimizerCSE::Optimize(
+    const std::function<bool(const Node*)>& consider_fn) {
   // This very simple implementation works if the whole graph is one
   // giant basic block (because we just traverse nodes in a
   // topological order). This simple implementation works well
@@ -204,7 +183,7 @@ bool OptimizerCSE::Optimize(std::function<bool(const Node*)> consider_fn) {
   // Scratch space for Equivalent calls.  Allocated here and passed in to
   // Equivalent to avoid allocation inside the loop below.
   bool changed = false;
-  Scratch scratch;
+  AttrSlice::Scratch scratch;
   for (Node* n : order) {
     if (!n->IsOp()) continue;
 
@@ -232,7 +211,8 @@ bool OptimizerCSE::Optimize(std::function<bool(const Node*)> consider_fn) {
   return changed;
 }
 
-bool OptimizeCSE(Graph* g, std::function<bool(const Node*)> consider_fn) {
+bool OptimizeCSE(Graph* g,
+                 const std::function<bool(const Node*)>& consider_fn) {
   OptimizerCSE opt(g);
   return opt.Optimize(consider_fn);
 }
diff --git a/tensorflow/core/graph/optimizer_cse.h b/tensorflow/core/graph/optimizer_cse.h
index 24ec5658d86dab147d60ba9095138b7585ec4cb6..b8f3230c70c314f15cc2179c98d727902ef1ab9d 100644
--- a/tensorflow/core/graph/optimizer_cse.h
+++ b/tensorflow/core/graph/optimizer_cse.h
@@ -29,7 +29,8 @@ namespace tensorflow {
 // during the common subexpression elimination.
 //
 // Returns true if and only if 'g' is mutated.
-extern bool OptimizeCSE(Graph* g, std::function<bool(const Node*)> consider_fn);
+extern bool OptimizeCSE(Graph* g,
+                        const std::function<bool(const Node*)>& consider_fn);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/optimizer_cse_test.cc b/tensorflow/core/graph/optimizer_cse_test.cc
index 1091af4e451d5d3481dc2ad2422483ac3b3791f8..94250240eb746a49be8f8a37e73b793e37e1832c 100644
--- a/tensorflow/core/graph/optimizer_cse_test.cc
+++ b/tensorflow/core/graph/optimizer_cse_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/graph/optimizer_cse.h"
 
+#include <utility>
 #include <vector>
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index 63294c695e51a457af71e693832ebe04e30474bc..a0c3fbe2aa407f89dcf7c2caad6bae90dfa648e6 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -192,9 +192,9 @@ Status ConnectVariablesToSaveOp(Graph* graph, Node* save_op,
   Tensor tensor_names;
   Tensor shape_and_slices;
   TF_RETURN_IF_ERROR(
-      GetNodeAttr(AttrSlice(tensor_names_op->def()), "value", &tensor_names));
-  TF_RETURN_IF_ERROR(GetNodeAttr(AttrSlice(shape_and_slices_op->def()), "value",
-                                 &shape_and_slices));
+      GetNodeAttr(tensor_names_op->attrs(), "value", &tensor_names));
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(shape_and_slices_op->attrs(), "value", &shape_and_slices));
 
   int tn_size = tensor_names.NumElements();
   int var_size = added_variables.size();
@@ -226,9 +226,7 @@ Status ConnectVariablesToSaveOp(Graph* graph, Node* save_op,
   }
   save_op_builder = save_op_builder.Input(var_nodeouts);
 
-  // Clear the old attr for the two constants and add the new ones.
-  tensor_names_op->ClearAttr("value");
-  shape_and_slices_op->ClearAttr("value");
+  // Update the attrs.
   tensor_names_op->AddAttr("value", new_tensor_names);
   shape_and_slices_op->AddAttr("value", new_shape_and_slices);
 
@@ -247,7 +245,7 @@ Status ConnectVariablesToSaveOp(Graph* graph, Node* save_op,
 // Add a restore subgraph for each variable and connect to the restore_all op.
 // For each variable we add the following subgraph:
 //           Assign----restore_all
-//          /      \
+//          |      |
 //   RestoreV2    Variable
 Status AddRestoreVariableSubgraphs(Graph* graph, Node* save_op,
                                    const std::vector<const Edge*>& in_edges,
@@ -311,7 +309,7 @@ Status AddRestoreVariableSubgraphs(Graph* graph, Node* save_op,
 // Adds new variables to save and restore ops matching the Save and Restore
 // graphs created in tensorflow/python/training/saver.py.
 Status AddSaveAndRestore(Graph* graph, const std::vector<Node*>& variables) {
-  Node* save_op;
+  Node* save_op = nullptr;
   std::vector<const Edge*> in_edges;
   bool found = false;
   TF_RETURN_IF_ERROR(FindSaveOp(graph, &save_op, &in_edges, &found));
@@ -526,31 +524,42 @@ Status MakeInputMinMax(Graph* graph, const string& name_prefix,
   return Status::OK();
 }
 
-// Adds a QuantizeAndDequantizeV2Op (and required input nodes) based on edge.
+// Adds a QuantizeAndDequantizeV2 or FakeQuantizeWithMinMaxVars op
+// (and required input nodes) based on edge.
 // The result is stored in convert_node.
-Status MakeQuantizeAndDequantizeV2(Graph* graph, const string& name_prefix,
-                                   const EdgeToConvert& edge,
-                                   std::vector<Node*>* added_variables,
-                                   Node** convert_node) {
+Status MakeQuantizeOp(Graph* graph, const string& name_prefix,
+                      const string& quant_op_type, const EdgeToConvert& edge,
+                      std::vector<Node*>* added_variables,
+                      Node** convert_node) {
   Node* input_min;
   Node* input_max;
   TF_RETURN_IF_ERROR(MakeInputMinMax(graph, name_prefix, edge, added_variables,
                                      &input_min, &input_max));
-
-  string quant_name = strings::StrCat(name_prefix, "/QuantizeAndDequantizeV2");
-  TF_RETURN_IF_ERROR(NodeBuilder(quant_name, "QuantizeAndDequantizeV2")
-                         .Input(edge.edge->src())
-                         .Input(input_min)
-                         .Input(input_max)
-                         .Attr("signed_input", edge.signed_input)
-                         .Attr("num_bits", edge.num_bits)
-                         .Attr("range_given", true)
-                         .Finalize(graph, convert_node));
+  string quant_name = strings::StrCat(name_prefix, "/", quant_op_type);
+  if (quant_op_type == "QuantizeAndDequantizeV2") {
+    TF_RETURN_IF_ERROR(NodeBuilder(quant_name, quant_op_type)
+                           .Input(edge.edge->src())
+                           .Input(input_min)
+                           .Input(input_max)
+                           .Attr("signed_input", edge.signed_input)
+                           .Attr("num_bits", edge.num_bits)
+                           .Attr("range_given", true)
+                           .Finalize(graph, convert_node));
+  } else if (quant_op_type == "FakeQuantWithMinMaxVars") {
+    TF_RETURN_IF_ERROR(NodeBuilder(quant_name, quant_op_type)
+                           .Input(edge.edge->src())
+                           .Input(input_min)
+                           .Input(input_max)
+                           .Attr("num_bits", edge.num_bits)
+                           .Finalize(graph, convert_node));
+  } else {
+    return errors::InvalidArgument("Unknown quant op type: ", quant_op_type);
+  }
   return Status::OK();
 }
 
 // Insert conversion op, connect it to the graph and remove the old edge.
-Status ProcessTargetEdges(Graph* graph,
+Status ProcessTargetEdges(Graph* graph, const string& quant_op_type,
                           const std::vector<EdgeToConvert>& target_edges) {
   // Remember previously converted ops to avoid duplicated conversion on the
   // same input.
@@ -562,8 +571,8 @@ Status ProcessTargetEdges(Graph* graph,
 
     auto iter = name_index.find(name_prefix);
     if (iter == name_index.end()) {
-      TF_RETURN_IF_ERROR(MakeQuantizeAndDequantizeV2(
-          graph, name_prefix, edge, &added_variables, &convert_node));
+      TF_RETURN_IF_ERROR(MakeQuantizeOp(graph, name_prefix, quant_op_type, edge,
+                                        &added_variables, &convert_node));
       name_index[name_prefix] = convert_node;
     } else {
       convert_node = iter->second;
@@ -580,7 +589,8 @@ Status ProcessTargetEdges(Graph* graph,
 
 }  // namespace
 
-Status DoQuantizeTraining(int32 num_bits, Graph* graph) {
+Status DoQuantizeTraining(int32 num_bits, const string& quant_op_type,
+                          Graph* graph) {
   if (graph == nullptr) {
     return errors::InvalidArgument("Cannot accept empty graph pointer.");
   }
@@ -638,13 +648,14 @@ Status DoQuantizeTraining(int32 num_bits, Graph* graph) {
     }
   }
 
-  TF_RETURN_IF_ERROR(ProcessTargetEdges(graph, target_edges));
+  TF_RETURN_IF_ERROR(ProcessTargetEdges(graph, quant_op_type, target_edges));
 
   return Status::OK();
 }
 
 Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
                                               int32 num_bits,
+                                              const string& quant_op_type,
                                               string* result_graph) {
   // First create the graph from the GraphDef.
   Graph graph(OpRegistry::Global());
@@ -656,7 +667,7 @@ Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
   TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, input_graphdef, &graph));
 
   // Call the rewriter on the graph.
-  TF_RETURN_IF_ERROR(DoQuantizeTraining(num_bits, &graph));
+  TF_RETURN_IF_ERROR(DoQuantizeTraining(num_bits, quant_op_type, &graph));
 
   // Convert the result graph back to a GraphDef.
   GraphDef output_graphdef;
diff --git a/tensorflow/core/graph/quantize_training.h b/tensorflow/core/graph/quantize_training.h
index 66db0c5bf4b6f1546f38f23ebf26d79d83bbb3da..2c1a7e6ae3618904ef37b5ec0ed38c61c6180455 100644
--- a/tensorflow/core/graph/quantize_training.h
+++ b/tensorflow/core/graph/quantize_training.h
@@ -24,6 +24,10 @@ namespace tensorflow {
 // the model can learn to deal with such loss and achieve better accuracy when
 // it is quantized later for inference.
 // Note that the num_bits should be in [1, 63] and 'g' must be not null.
+// quant_op_type specifies which quantization op should be used.
+// Current ops supported:
+// - QuantizeAndDequantizeV2.
+// - FakeQuantWithMinMaxVars.
 //
 // On success, returns OK.
 //
@@ -31,12 +35,14 @@ namespace tensorflow {
 //    - num_bits out of range.
 //    - g is null.
 //    - More than 1 unknown ops encountered.
-Status DoQuantizeTraining(int32 num_bits, Graph* g);
+Status DoQuantizeTraining(int32 num_bits, const string& quant_op_type,
+                          Graph* g);
 
 // Converts a input GraphDef and returns a rewritten GraphDef with the
 // quantized training.
 Status DoQuantizeTrainingOnSerializedGraphDef(const string& input_graph,
                                               int32 num_bits,
+                                              const string& quant_op_type,
                                               string* result_graph);
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc
index 867dd8161b02ab15b8551bcc2be336fea744c5cb..d817d980de90aad7df91eecbf92de50c3dd1b243 100644
--- a/tensorflow/core/graph/quantize_training_test.cc
+++ b/tensorflow/core/graph/quantize_training_test.cc
@@ -103,7 +103,7 @@ TEST_F(QuantizeTrainingTest, SignedInput) {
       a       b
   */
   const int num_bits = 8;
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", g));
 
   EXPECT_EQ(63, g->num_nodes());
 
@@ -112,17 +112,15 @@ TEST_F(QuantizeTrainingTest, SignedInput) {
   TF_ASSERT_OK(
       FindNode(g, strings::StrCat(identity->name(), "/QuantizeAndDequantizeV2"),
                &identity_q_node));
-  NodeDef identity_q = identity_q_node->def();
   ASSERT_EQ("true",
-            SummarizeAttrValue(identity_q.attr().find("signed_input")->second));
+            SummarizeAttrValue(*identity_q_node->attrs().Find("signed_input")));
   // Quantize_and_dequantize node for relu should have signed_input==false.
   Node* relu_q_node;
   TF_ASSERT_OK(
       FindNode(g, strings::StrCat(relu->name(), "/QuantizeAndDequantizeV2"),
                &relu_q_node));
-  NodeDef relu_q = relu_q_node->def();
   ASSERT_EQ("false",
-            SummarizeAttrValue(relu_q.attr().find("signed_input")->second));
+            SummarizeAttrValue(*relu_q_node->attrs().Find("signed_input")));
 }
 
 TEST_F(QuantizeTrainingTest, RangeGivenTrue) {
@@ -156,7 +154,7 @@ TEST_F(QuantizeTrainingTest, RangeGivenTrue) {
       a       b
   */
   const int num_bits = 8;
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", g));
 
   EXPECT_EQ(38, g->num_nodes());
 
@@ -165,20 +163,18 @@ TEST_F(QuantizeTrainingTest, RangeGivenTrue) {
   TF_ASSERT_OK(
       FindNode(g, strings::StrCat(relu6->name(), "/QuantizeAndDequantizeV2"),
                &relu6_q_node));
-  NodeDef identity_q = relu6_q_node->def();
   ASSERT_EQ("true",
-            SummarizeAttrValue(identity_q.attr().find("range_given")->second));
+            SummarizeAttrValue(*relu6_q_node->attrs().Find("range_given")));
   // Quantize_and_dequantize node for relu should have range_given==true.
   Node* relu_q_node;
   TF_ASSERT_OK(
       FindNode(g, strings::StrCat(relu->name(), "/QuantizeAndDequantizeV2"),
                &relu_q_node));
-  NodeDef relu_q = relu_q_node->def();
   ASSERT_EQ("true",
-            SummarizeAttrValue(relu_q.attr().find("range_given")->second));
+            SummarizeAttrValue(*relu_q_node->attrs().Find("range_given")));
 }
 
-TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
+TEST_F(QuantizeTrainingTest, WithBackwardNodes_QuantizeAndDequantize) {
   // Construct a graph with an additional backward Matmul.
   Reset();
   Graph* g = g_.get();
@@ -211,11 +207,11 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
   g->AddControlEdge(backward_m, g->sink_node());
 
   int num_bits = 8;
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", g));
 
   EXPECT_EQ(95, g->num_nodes());
 
-  // Ensure that we the backwards matmul input was not quantized.
+  // Ensure that the backwards matmul input was not quantized.
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/QuantizeAndDequantizeV2"),
                       &found_node);
@@ -232,6 +228,60 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes) {
       g, strings::StrCat(c->name(), "/QuantizeAndDequantizeV2"), &found_node));
 }
 
+TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
+  // Construct a graph with an additional backward Matmul.
+  Reset();
+  Graph* g = g_.get();
+  Node* a = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* b = Constant<float>({1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Node* c = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
+  // We will use node d as input to the backwards matmul to ensure that it
+  // isn't quantized.
+  Node* d = Constant<float>({0.0, 1.0, 1.0, 0.0}, {2, 2});
+  g->AddControlEdge(g->source_node(), a);
+  g->AddControlEdge(g->source_node(), b);
+  g->AddControlEdge(g->source_node(), c);
+  g->AddControlEdge(g->source_node(), d);
+  Node* relu = test::graph::Relu(g, a);
+  Node* identity = test::graph::Identity(g, b);
+  Node* m1 = test::graph::Matmul(g, relu, identity, false, false);
+  Node* m2 = test::graph::Matmul(g, identity, c, false, false);
+  g->AddControlEdge(m1, g->sink_node());
+  g->AddControlEdge(m2, g->sink_node());
+
+  // Add a Matmul node with name starting with "gradients". We will check that
+  // its input d was not quantized.
+  Node* backward_m;
+  TF_ASSERT_OK(NodeBuilder(g->NewName("gradients/n"), "MatMul")
+                   .Input(d)
+                   .Input(m2)
+                   .Attr("transpose_a", true)
+                   .Attr("transpose_b", false)
+                   .Finalize(g, &backward_m));
+  g->AddControlEdge(backward_m, g->sink_node());
+
+  int num_bits = 8;
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "FakeQuantWithMinMaxVars", g));
+
+  EXPECT_EQ(95, g->num_nodes());
+
+  // Ensure that the backwards matmul input was not quantized.
+  Node* found_node;
+  Status s = FindNode(g, strings::StrCat(d->name(), "/FakeQuantWithMinMaxVars"),
+                      &found_node);
+  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+
+  // Ensure that m1 and m2's inputs were quantized.
+  TF_ASSERT_OK(
+      FindNode(g, strings::StrCat(relu->name(), "/FakeQuantWithMinMaxVars"),
+               &found_node));
+  TF_ASSERT_OK(
+      FindNode(g, strings::StrCat(identity->name(), "/FakeQuantWithMinMaxVars"),
+               &found_node));
+  TF_ASSERT_OK(FindNode(
+      g, strings::StrCat(c->name(), "/FakeQuantWithMinMaxVars"), &found_node));
+}
+
 TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
   // Construct a simple graph with 5 nodes.
   Reset();
@@ -254,8 +304,8 @@ TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
   input_graph.SerializeToString(&input_string);
 
   string result_string;
-  TF_ASSERT_OK(DoQuantizeTrainingOnSerializedGraphDef(input_string, num_bits,
-                                                      &result_string));
+  TF_ASSERT_OK(DoQuantizeTrainingOnSerializedGraphDef(
+      input_string, num_bits, "QuantizeAndDequantizeV2", &result_string));
 
   GraphDef result_graphdef;
   EXPECT_TRUE(ParseProtoUnlimited(&result_graphdef, result_string));
@@ -265,11 +315,105 @@ TEST_F(QuantizeTrainingTest, QuantizeGraphDef) {
   GraphConstructorOptions opts;
   Graph result_graph(OpRegistry::Global());
   TF_ASSERT_OK(ConvertGraphDefToGraph(opts, result_graphdef, &result_graph));
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, graph));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", graph));
   EXPECT_EQ(graph->num_nodes(), result_graph.num_nodes());
 }
 
-TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange) {
+TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange_QuantizeAndDequantize) {
+  // Construct the following graph
+  // Relu has an unknown range, so we will check if the EMA correctly estimates
+  // the range.
+  /*
+           m1
+        /      \
+      Relu    Relu6
+        |       |
+        a       c
+  */
+  Reset();
+  Graph* g = g_.get();
+  Node* a;
+  TF_ASSERT_OK(Placeholder(g, "a", {2, 2}, &a));
+  Node* c = Constant<float>({2.0, 3.0, 4.0, 5.0}, {2, 2});
+  g->AddControlEdge(g->source_node(), a);
+  g->AddControlEdge(g->source_node(), c);
+  Node* relu = test::graph::Relu(g, a);
+  Node* relu6 = test::graph::Relu6(g, c);
+  Node* m1 = test::graph::Matmul(g, relu, relu6, false, false);
+  g->AddControlEdge(m1, g->sink_node());
+
+  // This is rewritten into the following subgraph, where Q_a and Q_c are
+  // quantize and dequantize subgraphs.
+  // Since relu's range is unknown, we check that the exponential moving average
+  // works correctly.
+  /*
+         m1
+      /      \
+     Q_a     Q_c
+      |       |
+    Relu     Relu6
+      |       |
+      a       c
+  */
+  const int num_bits = 8;
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "QuantizeAndDequantizeV2", g));
+
+  SessionOptions options;
+  Session* sess;
+  TF_ASSERT_OK(NewSession(options, &sess));
+  GraphDef gdef;
+  g->ToGraphDef(&gdef);
+  TF_ASSERT_OK(sess->Create(gdef));
+
+  // The min and max values of the relu6 quantization should be constant values
+  // of 0 and 6.
+  string min_const_name = strings::StrCat(relu6->name(), "/InputMin");
+  string max_const_name = strings::StrCat(relu6->name(), "/InputMax");
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(sess->Run({}, {min_const_name, max_const_name}, {}, &outputs));
+  EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
+  EXPECT_EQ(outputs[1].flat<float>()(0), 6.0);
+
+  Tensor a1(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a1, {0.0, 1.0, 2.0, 3.0});
+  Tensor a2(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a2, {1.0, 2.0, 3.0, 4.0});
+
+  TF_ASSERT_OK(sess->Run({{"a", a1}}, {m1->name()}, {}, &outputs));
+
+  // The value of the min and max should be set to the min and max of a1 since
+  // this is the first run that initializes the EMA variables.
+  string min_var_name = strings::StrCat(relu->name(), "/Min/Variable");
+  string max_var_name = strings::StrCat(relu->name(), "/Max/Variable");
+  TF_ASSERT_OK(sess->Run({}, {min_var_name, max_var_name}, {}, &outputs));
+  EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
+  EXPECT_EQ(outputs[1].flat<float>()(0), 3.0);
+
+  // The relu6 quantization range should remain unchanged.
+  TF_ASSERT_OK(sess->Run({}, {min_const_name, max_const_name}, {}, &outputs));
+  EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
+  EXPECT_EQ(outputs[1].flat<float>()(0), 6.0);
+
+  // Now when we run with new inputs, we should get a moving average for the min
+  // and max variables. They should be equal to:
+  // min_var = old_min_var * decay + min(a2) * (1 - decay)
+  // max_var = old_max_var * decay + max(a2) * (1 - decay)
+  TF_ASSERT_OK(sess->Run({{"a", a2}}, {m1->name()}, {}, &outputs));
+
+  TF_ASSERT_OK(sess->Run({}, {min_var_name, max_var_name}, {}, &outputs));
+  const float decay = 0.999;
+  const float expected_min = 0.0 * decay + 1.0 * (1.0 - decay);
+  const float expected_max = 3.0 * decay + 4.0 * (1.0 - decay);
+  EXPECT_NEAR(outputs[0].flat<float>()(0), expected_min, 1e-4);
+  EXPECT_NEAR(outputs[1].flat<float>()(0), expected_max, 1e-4);
+
+  // The relu6 quantization range should remain unchanged.
+  TF_ASSERT_OK(sess->Run({}, {min_const_name, max_const_name}, {}, &outputs));
+  EXPECT_EQ(outputs[0].flat<float>()(0), 0.0);
+  EXPECT_EQ(outputs[1].flat<float>()(0), 6.0);
+}
+
+TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange_FakeQuant) {
   // Construct the following graph
   // Relu has an unknown range, so we will check if the EMA correctly estimates
   // the range.
@@ -306,7 +450,7 @@ TEST_F(QuantizeTrainingTest, FixedRangeAndEMARange) {
       a       c
   */
   const int num_bits = 8;
-  TF_ASSERT_OK(DoQuantizeTraining(num_bits, g));
+  TF_ASSERT_OK(DoQuantizeTraining(num_bits, "FakeQuantWithMinMaxVars", g));
 
   SessionOptions options;
   Session* sess;
diff --git a/tensorflow/core/graph/subgraph.cc b/tensorflow/core/graph/subgraph.cc
index ff46abd439d60cd8d10242be83ddbb1256fa2b38..2a08bf8ca019185cf9d13dd83ca1880c3741090f 100644
--- a/tensorflow/core/graph/subgraph.cc
+++ b/tensorflow/core/graph/subgraph.cc
@@ -55,8 +55,13 @@ namespace {
 // state).
 static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
                          const gtl::ArraySlice<string>& fed_outputs,
-                         subgraph::NameIndex* name_index) {
-  for (const string& t : fed_outputs) {
+                         bool use_function_convention,
+                         subgraph::NameIndex* name_index,
+                         DataTypeVector* out_feed_types) {
+  out_feed_types->clear();
+  out_feed_types->reserve(fed_outputs.size());
+  for (size_t i = 0; i < fed_outputs.size(); ++i) {
+    const string& t = fed_outputs[i];
     TensorId id(ParseTensorName(t));
 
     auto iter = name_index->find(id.first);
@@ -71,32 +76,32 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
     }
 
     Node* recv_node;
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(strings::StrCat("_recv_", id.first, "_", id.second),
-                    "_Recv")
-            .Attr("tensor_type", BaseType(n->output_type(id.second)))
-            .Attr("tensor_name", t)
-            .Attr("send_device", device_info.name())
-            .Attr("recv_device", device_info.name())
-            .Attr("send_device_incarnation",
-                  static_cast<int64>(device_info.incarnation()))
-            .Attr("client_terminated", true)
-            .Finalize(g, &recv_node));
-    recv_node->set_assigned_device_name(device_info.name());
 
-    // Copy the _output_shapes from the original node to the feed node,
-    // if any.
-    std::vector<PartialTensorShape> output_shapes;
-    if (GetNodeAttr(n->def(), "_output_shapes", &output_shapes).ok()) {
-      if (n->num_outputs() != output_shapes.size()) {
-        return errors::InvalidArgument(
-            "FeedInputs: ", t,
-            ": size of _output_shapes attribute does not "
-            "match the number of node outputs");
-      }
-      std::vector<PartialTensorShape> feed_shapes = {output_shapes[id.second]};
-      recv_node->AddAttr("_output_shapes", feed_shapes);
+    if (!use_function_convention) {
+      TF_RETURN_IF_ERROR(
+          NodeBuilder(strings::StrCat("_recv_", id.first, "_", id.second),
+                      "_Recv")
+              .Attr("tensor_type", BaseType(n->output_type(id.second)))
+              .Attr("tensor_name", t)
+              .Attr("send_device", device_info.name())
+              .Attr("recv_device", device_info.name())
+              .Attr("send_device_incarnation",
+                    static_cast<int64>(device_info.incarnation()))
+              .Attr("client_terminated", true)
+              .Finalize(g, &recv_node));
+    } else {
+      // NOTE(mrry): We must include the index as part of the node
+      // name, because _Arg is a "stateful" kernel and therefore
+      // its name must uniquely identify a kernel instance across all
+      // graphs in the same session.
+      TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("_arg_", id.first, "_",
+                                                     id.second, "_", i),
+                                     "_Arg")
+                             .Attr("T", BaseType(n->output_type(id.second)))
+                             .Attr("index", static_cast<int32>(i))
+                             .Finalize(g, &recv_node));
     }
+    recv_node->set_assigned_device_name(device_info.name());
 
     // Update name_index
     (*name_index)[recv_node->name()] = recv_node;
@@ -110,8 +115,8 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
       if (e->src_output() == id.second) {
         to_remove.emplace_back(e);
       } else if (e->src_output() == Graph::kControlSlot &&
-                 (n->def().op() == "Placeholder" ||
-                  n->def().op() == "PlaceholderV2")) {
+                 (n->type_string() == "Placeholder" ||
+                  n->type_string() == "PlaceholderV2")) {
         // When feeding a Placeholder node, any outgoing control edges
         // will be replaced with a control edge from the replacement
         // recv_node.
@@ -130,6 +135,7 @@ static Status FeedInputs(Graph* g, const DeviceAttributes& device_info,
       }
       g->RemoveEdge(e);
     }
+    out_feed_types->push_back(BaseType(n->output_type(id.second)));
   }
   return Status::OK();
 }
@@ -143,10 +149,7 @@ static bool AddNodeToTargets(const string& node_or_tensor_name,
     return false;
   }
   const Node* n = iter->second;
-  if (n->name() != node_or_tensor_name) {
-    return false;
-  }
-
+  CHECK_EQ(n->name(), id.first);
   targets->insert(n);
   return true;
 }
@@ -184,9 +187,14 @@ namespace subgraph {
 
 Status FetchOutputs(Graph* g, const DeviceAttributes& device_info,
                     const gtl::ArraySlice<string>& fetch_outputs,
-                    NameIndex* name_index, std::vector<Node*>* fetch_nodes) {
-  fetch_nodes->clear();
-  for (const string& t : fetch_outputs) {
+                    bool use_function_convention, NameIndex* name_index,
+                    std::vector<Node*>* out_fetch_nodes,
+                    DataTypeVector* out_fetch_types) {
+  out_fetch_nodes->clear();
+  out_fetch_nodes->reserve(fetch_outputs.size());
+  for (size_t i = 0; i < fetch_outputs.size(); ++i) {
+    const string& t = fetch_outputs[i];
+
     // Parse t into node_name and output_index.
     TensorId id(ParseTensorName(t));
 
@@ -216,25 +224,39 @@ Status FetchOutputs(Graph* g, const DeviceAttributes& device_info,
 
     // Create the fetch Node and connect it up
     Node* send_node;
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(strings::StrCat("_send_", id.first, "_", id.second),
-                    "_Send")
-            .Input(n, id.second)
-            .Attr("tensor_name", t)
-            .Attr("send_device", device_info.name())
-            .Attr("recv_device", device_info.name())
-            .Attr("send_device_incarnation",
-                  static_cast<int64>(device_info.incarnation()))
-            .Attr("client_terminated", true)
-            .Finalize(g, &send_node));
+    if (!use_function_convention) {
+      TF_RETURN_IF_ERROR(
+          NodeBuilder(strings::StrCat("_send_", id.first, "_", id.second),
+                      "_Send")
+              .Input(n, id.second)
+              .Attr("tensor_name", t)
+              .Attr("send_device", device_info.name())
+              .Attr("recv_device", device_info.name())
+              .Attr("send_device_incarnation",
+                    static_cast<int64>(device_info.incarnation()))
+              .Attr("client_terminated", true)
+              .Finalize(g, &send_node));
+    } else {
+      // NOTE(mrry): We must include the index as part of the node
+      // name, because _Retval is a "stateful" kernel and therefore
+      // its name must uniquely identify a kernel instance across all
+      // graphs in the same session.
+      TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("_retval_", id.first, "_",
+                                                     id.second, "_", i),
+                                     "_Retval")
+                             .Input(n, id.second)
+                             .Attr("T", BaseType(n->output_type(id.second)))
+                             .Attr("index", static_cast<int32>(i))
+                             .Finalize(g, &send_node));
+    }
     send_node->set_assigned_device_name(device_info.name());
-    VLOG(1) << "Created fetch node: " << SummarizeNodeDef(send_node->def());
 
     // Update the index.
     (*name_index)[send_node->name()] = send_node;
 
     g->AddControlEdge(send_node, g->sink_node());
-    fetch_nodes->push_back(send_node);
+    out_fetch_nodes->push_back(send_node);
+    out_fetch_types->push_back(BaseType(n->output_type(id.second)));
   }
 
   return Status::OK();
@@ -244,7 +266,8 @@ Status RewriteGraphForExecution(
     Graph* g, const gtl::ArraySlice<string>& fed_outputs,
     const gtl::ArraySlice<string>& fetch_outputs,
     const gtl::ArraySlice<string>& target_node_names,
-    const DeviceAttributes& device_info) {
+    const DeviceAttributes& device_info, bool use_function_convention,
+    RewriteGraphMetadata* out_metadata) {
   if (fetch_outputs.empty() && target_node_names.empty()) {
     return errors::InvalidArgument(
         "Must specify at least one target to fetch or execute.");
@@ -277,18 +300,21 @@ Status RewriteGraphForExecution(
   // currently listed in "fetch_nodes".  We pass "name_index" so the index is
   // kept up to date.
   if (!fed_outputs.empty()) {
-    TF_RETURN_IF_ERROR(FeedInputs(g, device_info, fed_outputs, &name_index));
+    TF_RETURN_IF_ERROR(FeedInputs(g, device_info, fed_outputs,
+                                  use_function_convention, &name_index,
+                                  &out_metadata->feed_types));
   }
 
   // Add the fetch nodes, also updating "name_index".
   std::vector<Node*> fetch_nodes;
   if (!fetch_outputs.empty()) {
-    TF_RETURN_IF_ERROR(
-        FetchOutputs(g, device_info, fetch_outputs, &name_index, &fetch_nodes));
+    TF_RETURN_IF_ERROR(FetchOutputs(g, device_info, fetch_outputs,
+                                    use_function_convention, &name_index,
+                                    &fetch_nodes, &out_metadata->fetch_types));
   }
 
   // Prune the graph to only compute what is needed for the fetch nodes and the
-  // targets nodes.
+  // target nodes.
   if (!fetch_nodes.empty() || !target_node_names.empty()) {
     TF_RETURN_IF_ERROR(
         PruneForTargets(g, name_index, fetch_nodes, target_node_names));
diff --git a/tensorflow/core/graph/subgraph.h b/tensorflow/core/graph/subgraph.h
index d94d983d000e0b6ebfeaa4cc540f0bc82df341de..8ccc27914bce325469b0e73deacf6a3c44a55246 100644
--- a/tensorflow/core/graph/subgraph.h
+++ b/tensorflow/core/graph/subgraph.h
@@ -26,6 +26,18 @@ limitations under the License.
 namespace tensorflow {
 namespace subgraph {
 
+// Information about a graph rewritten by `RewriteGraphForExecution()`.
+struct RewriteGraphMetadata {
+  // The element type of each tensor fed to this subgraph. The order
+  // of types corresponds to the order of tensor names in
+  // `fed_outputs` when calling `RewriteGraphForExecution()`.
+  DataTypeVector feed_types;
+  // The element type of each tensor fetched from this subgraph. The
+  // order of types corresponds to the order of tensor names in
+  // `fetch_outputs` when calling `RewriteGraphForExecution()`.
+  DataTypeVector fetch_types;
+};
+
 // Rewrite the graph structure of "*g" to deal with feeding node
 // outputs, fetching node outputs, and only running a subset of the
 // graph.  "fed_outputs" and "fetch_outputs" are both lists of
@@ -56,7 +68,8 @@ Status RewriteGraphForExecution(
     Graph* g, const gtl::ArraySlice<string>& fed_outputs,
     const gtl::ArraySlice<string>& fetch_outputs,
     const gtl::ArraySlice<string>& target_node_names,
-    const DeviceAttributes& device_info);
+    const DeviceAttributes& device_info, bool use_function_convention,
+    RewriteGraphMetadata* out_metadata);
 
 typedef std::unordered_map<StringPiece, Node*, StringPiece::Hasher> NameIndex;
 
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index ee4960121f533b9227e46be796ecbf413cfbceaa..fde1ea17437e86d01054a1b153055170bda51e8b 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -81,7 +81,7 @@ class SubgraphTest : public ::testing::Test {
     for (const string& s : expected_nodes) {
       Node* n = FindNode(s);
       EXPECT_TRUE(n != nullptr) << s;
-      if (n->def().op() == "_Send" || n->def().op() == "_Recv") {
+      if (n->type_string() == "_Send" || n->type_string() == "_Recv") {
         EXPECT_EQ(device_info_.name(), n->assigned_device_name()) << s;
       }
     }
@@ -104,7 +104,8 @@ class SubgraphTest : public ::testing::Test {
   }
 
   string Subgraph(const string& fed_str, const string& fetch_str,
-                  const string& targets_str) {
+                  const string& targets_str,
+                  bool use_function_convention = false) {
     Graph* subgraph = new Graph(OpRegistry::Global());
     CopyGraph(*g_, subgraph);
     std::vector<string> fed =
@@ -114,13 +115,18 @@ class SubgraphTest : public ::testing::Test {
     std::vector<string> targets =
         str_util::Split(targets_str, ',', str_util::SkipEmpty());
 
-    Status s = subgraph::RewriteGraphForExecution(subgraph, fed, fetch, targets,
-                                                  device_info_);
+    subgraph::RewriteGraphMetadata metadata;
+    Status s = subgraph::RewriteGraphForExecution(
+        subgraph, fed, fetch, targets, device_info_, use_function_convention,
+        &metadata);
     if (!s.ok()) {
       delete subgraph;
       return s.ToString();
     }
 
+    EXPECT_EQ(fed.size(), metadata.feed_types.size());
+    EXPECT_EQ(fetch.size(), metadata.fetch_types.size());
+
     // Replace the graph with the subgraph for the rest of the display program
     g_.reset(subgraph);
     return "OK";
@@ -178,6 +184,20 @@ TEST_F(SubgraphTest, FedOutputs1) {
   ExpectNodes("W1,W2,_recv_input_1,t1,t2");
 }
 
+TEST_F(SubgraphTest, FedOutputs1_FunctionConvention) {
+  ExpectOK(
+      "node { name: 'W1' op: 'TestParams' }"
+      "node { name: 'W2' op: 'TestParams' }"
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'W1', 'input:1' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'W2', 't1' ] }"
+      "node { name: 't3_a' op: 'TestRelu' input: 't2' }"
+      "node { name: 't3_b' op: 'TestRelu' input: 't2' }");
+  EXPECT_EQ("OK",
+            Subgraph("input:1", "", "t2", true /* use_function_convention */));
+  ExpectNodes("W1,W2,_arg_input_1_0,t1,t2");
+}
+
 TEST_F(SubgraphTest, FedRefNode) {
   ExpectOK(
       "node { name: 'W1' op: 'TestParams' }"
@@ -189,7 +209,19 @@ TEST_F(SubgraphTest, FedRefNode) {
   EXPECT_FALSE(IsRefType(CHECK_NOTNULL(n)->output_type(0)));
 }
 
-TEST_F(SubgraphTest, FedOutputs2) {
+TEST_F(SubgraphTest, FedRefNode_FunctionConvention) {
+  ExpectOK(
+      "node { name: 'W1' op: 'TestParams' }"
+      "node { name: 'W2' op: 'TestParams' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'W2', 'W1' ] }");
+  EXPECT_EQ("OK",
+            Subgraph("W1:0", "", "t1", true /* use_function_convention */));
+  ExpectNodes("_arg_W1_0_0,W2,t1");
+  Node* n = FindNode("_arg_W1_0_0");
+  EXPECT_FALSE(IsRefType(CHECK_NOTNULL(n)->output_type(0)));
+}
+
+TEST_F(SubgraphTest, FedOutputs2_FunctionConvention) {
   ExpectOK(
       "node { name: 'W1' op: 'TestParams' }"
       "node { name: 'W2' op: 'TestParams' }"
@@ -200,8 +232,9 @@ TEST_F(SubgraphTest, FedOutputs2) {
       "node { name: 't3_b' op: 'TestRelu' input: 't2' }");
   // We feed input:1, but nothing connects to it, so the _recv(input:1)
   // node also disappears.
-  EXPECT_EQ("OK", Subgraph("input:1,t1,W2", "", "t2"));
-  ExpectNodes("_recv_t1_0,_recv_W2_0,t2");
+  EXPECT_EQ("OK", Subgraph("input:1,t1,W2", "", "t2",
+                           true /* use_function_convention */));
+  ExpectNodes("_arg_t1_0_1,_arg_W2_0_2,t2");
 }
 
 TEST_F(SubgraphTest, FetchOutputs1) {
@@ -218,6 +251,22 @@ TEST_F(SubgraphTest, FetchOutputs1) {
       "W1,W2,input,t1,t2,_send_W2_0,_send_input_1,_send_t1_0,_send_t2_0");
 }
 
+TEST_F(SubgraphTest, FetchOutputs1_FunctionConvention) {
+  ExpectOK(
+      "node { name: 'W1' op: 'TestParams' }"
+      "node { name: 'W2' op: 'TestParams' }"
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'W1', 'input:1' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'W2', 't1' ] }"
+      "node { name: 't3_a' op: 'TestRelu' input: 't2' }"
+      "node { name: 't3_b' op: 'TestRelu' input: 't2' }");
+  EXPECT_EQ("OK", Subgraph("", "W2,input:1,t1,t2", "t2",
+                           true /* use_function_convention */));
+  ExpectNodes(
+      "W1,W2,input,t1,t2,_retval_W2_0_0,_retval_input_1_1,_retval_t1_0_2,_"
+      "retval_t2_0_3");
+}
+
 TEST_F(SubgraphTest, FetchOutputs2) {
   ExpectOK(
       "node { name: 'W1' op: 'TestParams' }"
@@ -231,6 +280,20 @@ TEST_F(SubgraphTest, FetchOutputs2) {
   ExpectNodes("W1,W2,input,t1,t2,t3_a,_send_t3_a_0");
 }
 
+TEST_F(SubgraphTest, FetchOutputs2_FunctionConvention) {
+  ExpectOK(
+      "node { name: 'W1' op: 'TestParams' }"
+      "node { name: 'W2' op: 'TestParams' }"
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'W1', 'input:1' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'W2', 't1' ] }"
+      "node { name: 't3_a' op: 'TestRelu' input: 't2' }"
+      "node { name: 't3_b' op: 'TestRelu' input: 't2' }");
+  EXPECT_EQ("OK",
+            Subgraph("", "t3_a", "t2", true /* use_function_convention */));
+  ExpectNodes("W1,W2,input,t1,t2,t3_a,_retval_t3_a_0_0");
+}
+
 TEST_F(SubgraphTest, ChainOfFools) {
   ExpectOK(
       "node { name: 'a' op: 'TestParams' }"
@@ -275,47 +338,11 @@ TEST_F(SubgraphTest, Errors) {
   EXPECT_TRUE(HasSubstr(Subgraph("", "", ""), "at least one target"));
 }
 
-TEST_F(SubgraphTest, FedOutputsPreservesOutputShapes) {
-  ExpectOK(
-      R"proto(
-        node { name: 'W1' op: 'TestParams' }
-        node { name: 'W2' op: 'TestParams' }
-        node {
-          name: 'input'
-          op: 'TestInput'
-          attr {
-            key: '_output_shapes'
-            value {
-              list {
-                shape { unknown_rank: true }
-                shape { dim { size: 23 } }
-              }
-            }
-          }
-        }
-        node { name: 't1' op: 'TestMul' input: [ 'W1', 'input:1' ] }
-        node { name: 't2' op: 'TestMul' input: [ 'W2', 't1' ] }
-        node { name: 't3_a' op: 'TestRelu' input: 't2' }
-        node { name: 't3_b' op: 'TestRelu' input: 't2' }
-      )proto");
-  EXPECT_EQ("OK", Subgraph("input:1", "", "t2"));
-  ExpectNodes("W1,W2,_recv_input_1,t1,t2");
-
-  for (Node* node : graph()->nodes()) {
-    if (node->name() == "_recv_input_1") {
-      std::vector<PartialTensorShape> shapes;
-      TF_ASSERT_OK(GetNodeAttr(node->def(), "_output_shapes", &shapes));
-      ASSERT_EQ(1, shapes.size());
-      EXPECT_TRUE(PartialTensorShape({23}).IsIdenticalTo(shapes[0]));
-      break;
-    }
-  }
-}
-
 REGISTER_OP("In").Output("o: float");
 REGISTER_OP("Op").Input("i: float").Output("o: float");
 
-static void BM_Subgraph(int iters, int num_nodes) {
+static void BM_SubgraphHelper(int iters, int num_nodes,
+                              bool use_function_convention) {
   DeviceAttributes device_info;
   device_info.set_name("/job:a/replica:0/task:0/cpu:0");
   device_info.set_device_type(DeviceType(DEVICE_CPU).type());
@@ -347,12 +374,26 @@ static void BM_Subgraph(int iters, int num_nodes) {
   while (--iters > 0) {
     Graph* subgraph = new Graph(OpRegistry::Global());
     CopyGraph(g, subgraph);
-    TF_CHECK_OK(subgraph::RewriteGraphForExecution(subgraph, fed, fetch,
-                                                   targets, device_info));
+    subgraph::RewriteGraphMetadata metadata;
+    TF_CHECK_OK(subgraph::RewriteGraphForExecution(
+        subgraph, fed, fetch, targets, device_info, use_function_convention,
+        &metadata));
     delete subgraph;
   }
 }
+
+static void BM_Subgraph(int iters, int num_nodes) {
+  BM_SubgraphHelper(iters, num_nodes, false /* use_function_convention */);
+}
+static void BM_SubgraphFunctionConvention(int iters, int num_nodes) {
+  BM_SubgraphHelper(iters, num_nodes, true /* use_function_convention */);
+}
 BENCHMARK(BM_Subgraph)->Arg(100)->Arg(1000)->Arg(10000)->Arg(100000);
+BENCHMARK(BM_SubgraphFunctionConvention)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000)
+    ->Arg(100000);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index f0ab5520f11fbc1e7c7d948ef4f72703716a7519..c495b2181207a0520bab8f33cdc28cd723b1ef40 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -416,24 +416,6 @@ Node* Cast(Graph* g, Node* in, DataType dst) {
   return ret;
 }
 
-Node* BroadcastArgs(Graph* g, Node* s0, Node* s1) {
-  Node* ret;
-  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BroadcastArgs")
-                  .Input(s0)
-                  .Input(s1)
-                  .Finalize(g, &ret));
-  return ret;
-}
-
-Node* BroadcastGradientArgs(Graph* g, Node* s0, Node* s1) {
-  Node* ret;
-  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BroadcastGradientArgs")
-                  .Input(s0)
-                  .Input(s1)
-                  .Finalize(g, &ret));
-  return ret;
-}
-
 Node* Gather(Graph* g, Node* in0, Node* in1) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Gather")
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index d508f65ada5bef3392d6e002833ca734f7fa1160..48250fef0fa44ee9fe25d7751c067d3c1257d4b7 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -174,12 +174,6 @@ Node* Cast(Graph* g, Node* in, DataType dst);
 // Perform gather op on params "in0" with indices "in1".
 Node* Gather(Graph* g, Node* in0, Node* in1);
 
-// Computes broadcasted shape from the given input shapes.
-Node* BroadcastArgs(Graph* g, Node* s0, Node* s1);
-
-// Computes the args needed broadcast gradient function.
-Node* BroadcastGradientArgs(Graph* g, Node* s0, Node* s1);
-
 // Gets a tensor stored in the session state.
 Node* GetSessionTensor(Graph* g, Node* in);
 
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index c42eebae5387e48a857ccfe6f72939677476a3a5..73016f5fb5eef4485ad084bc3ded61adff832ba3 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -14,19 +14,14 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "android_srcs",
-    srcs = [
-        "devices.cc",
-        "devices.h",
-        "grappler_item.h",
-        "utils.cc",
-        "utils.h",
-        "//tensorflow/core/grappler/clusters:android_srcs",
-        "//tensorflow/core/grappler/inputs:android_srcs",
-        "//tensorflow/core/grappler/optimizers:android_srcs",
+cc_library(
+    name = "op_types",
+    srcs = ["op_types.cc"],
+    hdrs = ["op_types.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
     ],
-    visibility = ["//tensorflow:__subpackages__"],
 )
 
 cc_library(
@@ -73,6 +68,7 @@ cc_library(
     hdrs = ["grappler_item.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":op_types",
         ":utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
@@ -88,7 +84,11 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":grappler_item",
+        ":op_types",
         ":utils",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index bd02c1e75df1e40b64b1f6a454cb9acbf4b0dac0..556eb1dbc5ceaaab390d91c805d5c7a7f1203b6a 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -12,14 +12,30 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "android_srcs",
-    srcs = glob(
-        [
-            "cluster.*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+config_setting(
+    name = "xsmm",
+    licenses = ["notice"],
+    values = {
+        "define": "tensorflow_xsmm=1",
+    },
+)
+
+cc_library(
+    name = "utils",
+    srcs = ["utils.cc"],
+    hdrs = [
+        "utils.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//third_party/eigen3",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ] + select({
+        ":xsmm": ["@libxsmm_archive//:xsmm_avx"],
+        "//conditions:default": [],
+    }),
 )
 
 cc_library(
@@ -38,6 +54,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "virtual_cluster",
+    srcs = ["virtual_cluster.cc"],
+    hdrs = [
+        "virtual_cluster.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cluster",
+        ":utils",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "single_machine",
     srcs = ["single_machine.cc"],
@@ -47,11 +77,13 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":cluster",
+        ":utils",
         "//tensorflow/cc:coordinator",
         "//tensorflow/cc:queue_runner",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:ops_util",
     ],
@@ -63,12 +95,14 @@ cc_test(
     args = ["--heap_check=local"],  # The GPU tracer leaks memory
     deps = [
         ":single_machine",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index c93911c902e19b937e8c5080c2edfc9331d13b31..b2a326b3b0d03bd836bc23da2107301fedc1f5a1 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -56,5 +56,15 @@ void Cluster::DisableDetailedStats(bool disable) {
   }
 }
 
+const std::vector<string> Cluster::GetDeviceNames() const {
+  std::vector<string> device_names;
+  device_names.reserve(devices_.size());
+  for (const auto& device : devices_) {
+    device_names.push_back(device.first);
+  }
+  std::sort(device_names.begin(), device_names.end());
+  return device_names;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/cluster.h b/tensorflow/core/grappler/clusters/cluster.h
index 45821db1ee8378d84a5a8aa40abf2195fa78e1b0..017cbb13c8ce899269928edbc8b524fd3d825ef3 100644
--- a/tensorflow/core/grappler/clusters/cluster.h
+++ b/tensorflow/core/grappler/clusters/cluster.h
@@ -17,13 +17,14 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_CLUSTERS_CLUSTER_H_
 
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
-#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -47,6 +48,12 @@ class Cluster {
   // of the requested resources are available.
   virtual Status Provision() = 0;
 
+  // Attempts to shutdown the cluster.
+  // Returns OK iff there are no pending calls to the Run() method and all the
+  // resources used by the cluster could be released. Returns an error
+  // otherwise.
+  virtual Status Shutdown() { return Status::OK(); }
+
   // Whether soft placement is allowed. If allow_soft_placement is true,
   // an op will be placed on CPU if there's no GPU implementation for the OP
   // or if no GPU devices are known or registered or if we need to co-locate
@@ -57,23 +64,20 @@ class Cluster {
   // before Provision().
   void SetNumWarmupSteps(int num_steps);
 
-  // Disable the collection of detailed statistics.
+  // Disable the collection of detailed statistics. Must be called
+  // before Provision().
   void DisableDetailedStats(bool disable);
 
   // Return the list of TensorFlow devices that are available to execute a
   // graph. This is empty until provision() is called.
-  const std::vector<DeviceAttributes>& GetDevices() const { return devices_; }
-
-  // Convenience method that returns the set of device names.
-  const std::vector<string> GetDeviceNames() const {
-    std::vector<string> device_names;
-    device_names.reserve(devices_.size());
-    for (const auto& device : devices_) {
-      device_names.push_back(device.name());
-    }
-    return device_names;
+  const std::unordered_map<string, DeviceProperties>& GetDevices() const {
+    return devices_;
   }
 
+  // Convenience method that returns the set of device names. These names are
+  // sorted alphabetically.
+  const std::vector<string> GetDeviceNames() const;
+
   // Prepare the session to run the specified grappler item. This include
   // initializing all the model variables.
   virtual Status Initialize(const GrapplerItem& item) = 0;
@@ -85,7 +89,7 @@ class Cluster {
                      RunMetadata* metadata) = 0;
 
  protected:
-  std::vector<DeviceAttributes> devices_;
+  std::unordered_map<string, DeviceProperties> devices_;
   const int timeout_s_;
   SessionOptions options_;
   RunOptions run_options_;
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 6296c71a6c20b7b71bfdce97006a0a1a56b84d07..22ccf5208c1e0a8799907a6b86f93fd4a1ca44b9 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -18,11 +18,14 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/cc/training/queue_runner.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
@@ -31,6 +34,7 @@ namespace grappler {
 SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
     : Cluster(timeout_s),
       num_gpus_(num_gpus),
+      expected_init_time_s_(0),
       closing_(false) {
   thread_pool_.reset(new thread::ThreadPool(
       Env::Default(), SanitizeThreadSuffix("single_machine"), 2));
@@ -41,7 +45,10 @@ SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
   }
   CHECK_GE(num_cpu_cores, 1);
   options_.config.set_intra_op_parallelism_threads(num_cpu_cores);
-  options_.config.set_inter_op_parallelism_threads(num_cpu_cores);
+  // Create a session specific thread pool to ensure the threads are reset when
+  // the session is reset.
+  options_.config.add_session_inter_op_thread_pool()->set_num_threads(
+      num_cpu_cores);
   if (timeout_s > 0) {
     options_.config.set_operation_timeout_in_ms(timeout_s * 1000);
   }
@@ -53,6 +60,8 @@ SingleMachine::~SingleMachine() {
   // Reset the thread-pool so that there are no outstanding Session::Run(...)s
   // when we delete the session.
   thread_pool_.reset();
+
+  Reset(options_, {}).IgnoreError();
 }
 
 Status SingleMachine::Provision() {
@@ -61,16 +70,12 @@ Status SingleMachine::Provision() {
     return status;
   }
 
-  DeviceAttributes attr;
-  attr.set_name("/job:localhost/replica:0/task:0/cpu:0");
-  attr.set_device_type("CPU");
-  devices_.push_back(attr);
+  DeviceProperties attr = GetLocalCPUInfo();
+  devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
 
   for (int i = 0; i < num_gpus_; ++i) {
-    DeviceAttributes attr;
-    attr.set_name(strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i));
-    attr.set_device_type("GPU");
-    devices_.push_back(attr);
+    devices_[strings::StrCat("/job:localhost/replica:0/task:0/gpu:", i)] =
+        GetLocalGPUInfo(i);
   }
   return Status::OK();
 }
@@ -79,6 +84,7 @@ Status SingleMachine::Initialize(const GrapplerItem& item) {
   mutex_lock l(this->last_graph_mu_);
   if (last_graph_ != &item.graph || last_graph_id_ != item.id) {
     init_ops_ = item.init_ops;
+    expected_init_time_s_ = item.expected_init_time;
     last_graph_ = nullptr;
     queue_runner_defs_ = item.queue_runners;
     last_graph_id_ = item.id;
@@ -86,6 +92,31 @@ Status SingleMachine::Initialize(const GrapplerItem& item) {
   return Status::OK();
 }
 
+Status SingleMachine::Shutdown() {
+  TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
+
+  // Delete the threadpool: this ensures that all the pending closures complete
+  // before we return. Note that if that if TF deadlocked on us, the closures
+  // will never complete, and the call to thread_pool_.reset() will never
+  // return: therefore we need to delete the threadpool with the background
+  // thread. That thread itself will also never complete, so the user should
+  // abort the process to avoid leaking too many resources.
+  auto n = std::make_shared<Notification>();
+  Env::Default()->SchedClosure([this, n]() {
+    thread_pool_.reset();
+    n->Notify();
+  });
+  int64 timeout_us = 1000000ll * timeout_s_;
+  const bool notified = WaitForNotificationWithTimeout(n.get(), timeout_us);
+  if (!notified) {
+    // Let the caller know that we can't shutdown the session properly since
+    // there are calls to Session::Run() still running.
+    return errors::Unavailable("The session is still running graphs after ",
+                               timeout_s_, " seconds");
+  }
+  return Status::OK();
+}
+
 Status SingleMachine::Run(const GraphDef& graph_def,
                           const std::vector<std::pair<string, Tensor>>& feed,
                           const std::vector<string>& fetch,
@@ -97,20 +128,23 @@ Status SingleMachine::Run(const GraphDef& graph_def,
       TF_RETURN_IF_ERROR(session_->Create(graph_def));
       if (!init_ops_.empty()) {
         init_metadata_ = RunMetadata();
-        TF_RETURN_IF_ERROR(RunWithTimeout({}, init_ops_, &init_metadata_));
+        int64 timeout_s = timeout_s_ + expected_init_time_s_;
+        TF_RETURN_IF_ERROR(
+            RunWithTimeout({}, init_ops_, &init_metadata_, timeout_s));
         // The compute cost for init ops is likely to be pessimistic since init
         // ops are run only once before warmup. Therefore we only keep their
         // memory costs.
         for (auto node : *init_metadata_.mutable_cost_graph()->mutable_node()) {
           node.clear_compute_cost();
         }
-        metadata->MergeFrom(init_metadata_);
+        // Also clear the timeline to save memory
+        init_metadata_.clear_step_stats();
       }
       for (int i = 0; i < queue_runner_defs_.size(); ++i) {
         std::unique_ptr<QueueRunner> queue_runner;
         TF_RETURN_IF_ERROR(QueueRunner::New(queue_runner_defs_[i],
                                             coordinator_.get(), &queue_runner));
-        TF_RETURN_IF_ERROR(queue_runner->StartAndCollectRunMetadata(
+        TF_RETURN_IF_ERROR(queue_runner->StartAndCollectCostGraph(
             session_.get(), &run_options_));
         TF_RETURN_IF_ERROR(
             coordinator_->RegisterRunner(std::move(queue_runner)));
@@ -127,33 +161,46 @@ Status SingleMachine::Run(const GraphDef& graph_def,
     }
   }
 
-  TF_RETURN_IF_ERROR(RunWithTimeout(feed, fetch, metadata));
   if (metadata) {
-    return coordinator_->ExportCostGraph(metadata->mutable_cost_graph());
+    TF_RETURN_IF_ERROR(RunWithTimeout(feed, fetch, metadata));
+    // Merge the costs of the initialization and the queue runners.
+    CostGraphDef queue_costs;
+    TF_RETURN_IF_ERROR(coordinator_->ExportCostGraph(&queue_costs));
+    MergeCosts(metadata->mutable_cost_graph(), init_metadata_.cost_graph(),
+               queue_costs);
   } else {
-    return Status::OK();
+    return RunWithTimeout(feed, fetch, nullptr);
   }
+  return Status::OK();
 }
 
 Status SingleMachine::RunWithTimeout(
     const std::vector<std::pair<string, Tensor>>& feed,
     const std::vector<string>& fetch, RunMetadata* run_metadata) {
+  return RunWithTimeout(feed, fetch, run_metadata, timeout_s_);
+}
+
+Status SingleMachine::RunWithTimeout(
+    const std::vector<std::pair<string, Tensor>>& feed,
+    const std::vector<string>& fetch, RunMetadata* run_metadata,
+    int64 timeout_s) {
   // We shouldn't be running or closing the session at this point.
   {
     mutex_lock l(close_mu_);
     CHECK(!closing_);
   }
+
   auto status = std::make_shared<Status>();
   auto local_metadata = std::make_shared<RunMetadata>();
   const bool executed_in_time = ExecuteWithTimeout(
-      [this, status, local_metadata, &feed, &fetch]() {
+      [this, status, local_metadata, feed, fetch]() {
         *status = session_->Run(run_options_, feed, {}, fetch, nullptr,
                                 local_metadata.get());
       },
-      timeout_s_ * 1000, thread_pool_.get());
+      timeout_s * 1000, thread_pool_.get());
   if (!executed_in_time) {
-    return errors::DeadlineExceeded("Failed to run the graph after ",
-                                    timeout_s_, " seconds, aborting");
+    return errors::DeadlineExceeded("Failed to run the graph after ", timeout_s,
+                                    " seconds, aborting");
   } else if (run_metadata && status->ok()) {
     *run_metadata = *local_metadata;
   }
@@ -210,11 +257,7 @@ Status SingleMachine::ResetSession() {
     LOG(INFO) << "Cleaning up previous session";
 
     // Make sure the session is properly closed
-    TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
-
-    // Flush all the pending closures (if any).
-    thread_pool_.reset(new thread::ThreadPool(
-        Env::Default(), SanitizeThreadSuffix("single_machine"), 2));
+    TF_RETURN_IF_ERROR(Shutdown());
 
     // We need to Reset the session to ensure that all the variables are
     // deleted. But first we need to delete the session since Reset()
@@ -225,6 +268,10 @@ Status SingleMachine::ResetSession() {
 
   LOG(INFO) << "Starting new session";
 
+  // Create a new threadpool
+  thread_pool_.reset(new thread::ThreadPool(
+      Env::Default(), SanitizeThreadSuffix("single_machine"), 2));
+
   session_.reset(NewSession(options_));
   CHECK(session_ != nullptr);
 
@@ -233,5 +280,36 @@ Status SingleMachine::ResetSession() {
   return Status::OK();
 }
 
+void SingleMachine::MergeCosts(CostGraphDef* graph_costs,
+                               const CostGraphDef& init_costs,
+                               const CostGraphDef& queue_costs) {
+  graph_costs->mutable_node()->Reserve(graph_costs->node_size() +
+                                       init_costs.node_size() +
+                                       queue_costs.node_size());
+  std::unordered_set<string> nodes_seen;
+  for (const auto& node : graph_costs->node()) {
+    nodes_seen.insert(node.name());
+  }
+
+  // The costs obtained by running the main graph could be more stable than
+  // the one we get from the queue runners since the queue runners run
+  // asynchronously.
+  for (const auto& node : queue_costs.node()) {
+    if (nodes_seen.find(node.name()) != nodes_seen.end()) {
+      continue;
+    }
+    graph_costs->add_node()->MergeFrom(node);
+  }
+
+  // Don't overwrite the costs with that generated during initialization since
+  // these are possibly outdated.
+  for (const auto& node : init_costs.node()) {
+    if (nodes_seen.find(node.name()) != nodes_seen.end()) {
+      continue;
+    }
+    graph_costs->add_node()->MergeFrom(node);
+  }
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index b739b39f2cf8423ad7ff8a3af4c6577e375e517b..d3efbe3c614580d0502874412697cd5719e28be5 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -33,6 +33,8 @@ class SingleMachine : public Cluster {
   ~SingleMachine() override;
 
   Status Provision() override;
+  Status Shutdown() override;
+
   Status Initialize(const GrapplerItem& item) override;
   Status Run(const GraphDef& item,
              const std::vector<std::pair<string, Tensor>>& feed,
@@ -42,8 +44,13 @@ class SingleMachine : public Cluster {
   Status RunWithTimeout(const std::vector<std::pair<string, Tensor>>& feed,
                         const std::vector<string>& fetch,
                         RunMetadata* run_metadata);
+  Status RunWithTimeout(const std::vector<std::pair<string, Tensor>>& feed,
+                        const std::vector<string>& fetch,
+                        RunMetadata* run_metadata, int64 timeout_s);
   Status ResetSession();
   Status CloseSession(bool use_timeout);
+  void MergeCosts(CostGraphDef* graph_costs, const CostGraphDef& init_costs,
+                  const CostGraphDef& queue_costs);
 
   const int num_gpus_;
   std::unique_ptr<Session> session_;
@@ -52,6 +59,7 @@ class SingleMachine : public Cluster {
   mutex last_graph_mu_;
   const GraphDef* last_graph_ GUARDED_BY(last_graph_mu_) = nullptr;
   std::vector<string> init_ops_;
+  int64 expected_init_time_s_;
   std::unique_ptr<Coordinator> coordinator_;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
 
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index 3b39e5be614a91884107f73214270af25d1effc7..27813c0e41d89fc1802ad61cc53063a853cb8530 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/clusters/single_machine.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -28,8 +31,10 @@ namespace {
 class SingleMachineTest : public ::testing::Test {
  public:
   void SetUp() override {
-    // Provision a single machine with 3 cpu cores
-    cluster_.reset(new SingleMachine(5 * 60, 3, 0));
+    // Provision a single machine with 3 cpu cores, and a short timeout of 5
+    // seconds: since there isn't much work to process a test graph that should
+    // be plenty.
+    cluster_.reset(new SingleMachine(5, 3, 0));
     TF_CHECK_OK(cluster_->Provision());
   }
 
@@ -129,6 +134,168 @@ TEST_F(SingleMachineTest, MultipleItems) {
   }
 }
 
+TEST_F(SingleMachineTest, TimeOuts) {
+  // Create a graph that will block forever: Just try to dequeue data from a
+  // queue that is never fed.
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto q = ops::FIFOQueue(root.WithOpName("queue"), {DataType::DT_INT32});
+  auto dequeue =
+      ops::QueueDequeue(root.WithOpName("dequeue"), q, {DataType::DT_INT32});
+
+  GrapplerItem item;
+  TF_CHECK_OK(root.ToGraphDef(&item.graph));
+  item.fetch.push_back("dequeue");
+
+  TF_CHECK_OK(cluster_->Initialize(item));
+  RunMetadata metadata;
+  Status s1 = cluster_->Run(item.graph, item.feed, item.fetch, &metadata);
+  EXPECT_TRUE(errors::IsDeadlineExceeded(s1));
+  Status s2 = cluster_->Run(item.graph, item.feed, item.fetch, &metadata);
+  EXPECT_TRUE(errors::IsDeadlineExceeded(s2));
+}
+
+TEST_F(SingleMachineTest, InitializationMemory) {
+  // Build a variable and its initialization graph.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  int batch_size = 10;
+  Output x =
+      ops::RandomNormal(s.WithOpName("x"), {batch_size, 1}, DataType::DT_FLOAT);
+  Output v = ops::Variable(s.WithOpName("v"), TensorShape({batch_size, 1}),
+                           DataType::DT_FLOAT);
+  Output init = ops::Assign(s.WithOpName("init"), v, x);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.init_ops.push_back(init.name());
+  item.fetch.push_back(v.name());
+
+  TF_CHECK_OK(cluster_->Initialize(item));
+  RunMetadata metadata;
+  TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
+
+  // Check that the initialization op is present in the cost model.
+  bool found = false;
+  for (const auto& node : metadata.cost_graph().node()) {
+    found |= (node.name() == NodeName(init.name()));
+  }
+  EXPECT_TRUE(found);
+}
+
+namespace {
+template <class T>
+inline void SetNodeAttr(const string& key, const T& value, NodeDef* node) {
+  AttrValue attr_value;
+  SetAttrValue(value, &attr_value);
+  auto* attr_map = node->mutable_attr();
+  (*attr_map)[key] = attr_value;
+}
+template <>
+inline void SetNodeAttr(const string& key, const Tensor& tensor,
+                        NodeDef* node) {
+  TensorProto tensor_proto;
+  tensor.AsProtoTensorContent(&tensor_proto);
+  SetNodeAttr(key, tensor_proto, node);
+}
+
+}  // namespace
+
+TEST_F(SingleMachineTest, PersistentMemory) {
+  // Build a hashtable and its initialization graph.
+  GrapplerItem item;
+  const DataType key_dtype = DT_INT64;
+  const DataType data_dtype = DT_INT64;
+
+  NodeDef* hashtable_node = item.graph.add_node();
+  hashtable_node->set_op("HashTable");
+  hashtable_node->set_name("hash_table");
+  SetNodeAttr("key_dtype", key_dtype, hashtable_node);
+  SetNodeAttr("value_dtype", data_dtype, hashtable_node);
+
+  // Initial hashtable keys and values
+  NodeDef* keys_node = item.graph.add_node();
+  keys_node->set_op("Const");
+  keys_node->set_name("table_keys");
+  SetNodeAttr("dtype", key_dtype, keys_node);
+  Tensor keys(key_dtype, TensorShape{2});
+  keys.vec<int64>()(0) = 123;
+  keys.vec<int64>()(1) = 321;
+  SetNodeAttr("value", keys, keys_node);
+
+  NodeDef* values_node = item.graph.add_node();
+  values_node->set_op("Const");
+  values_node->set_name("table_values");
+  SetNodeAttr("dtype", data_dtype, values_node);
+  Tensor values(data_dtype, TensorShape{2});
+  values.vec<int64>()(0) = 789;
+  values.vec<int64>()(1) = 987;
+  SetNodeAttr("value", values, values_node);
+
+  // InitializeTable node
+  NodeDef* init_table_node = item.graph.add_node();
+  init_table_node->set_op("InitializeTable");
+  init_table_node->set_name("initialize_table");
+  SetNodeAttr("Tkey", key_dtype, init_table_node);
+  SetNodeAttr("Tval", data_dtype, init_table_node);
+  *init_table_node->add_input() = "hash_table";
+  *init_table_node->add_input() = "table_keys";
+  *init_table_node->add_input() = "table_values";
+  item.init_ops.push_back(init_table_node->name());
+
+  // Key to lookup
+  NodeDef* query_node = item.graph.add_node();
+  query_node->set_op("Const");
+  query_node->set_name("query");
+  SetNodeAttr("dtype", key_dtype, query_node);
+  Tensor query(key_dtype, TensorShape({}));
+  query.flat<int64>()(0) = 0;
+  SetNodeAttr("value", query, query_node);
+
+  // Default return value of hashtable lookup
+  NodeDef* default_value_node = item.graph.add_node();
+  default_value_node->set_op("Const");
+  default_value_node->set_name("default_table_value");
+  SetNodeAttr("dtype", data_dtype, default_value_node);
+  Tensor dflt(data_dtype, TensorShape({}));
+  dflt.flat<int64>()(0) = 456;
+  SetNodeAttr("value", dflt, default_value_node);
+
+  // HashTable lookup node
+  NodeDef* lookup_node = item.graph.add_node();
+  lookup_node->set_op("LookupTableFind");
+  lookup_node->set_name("table_lookup");
+  SetNodeAttr("Tin", key_dtype, lookup_node);
+  SetNodeAttr("Tout", data_dtype, lookup_node);
+  *lookup_node->add_input() = "hash_table";
+  *lookup_node->add_input() = "query";
+  *lookup_node->add_input() = "default_table_value";
+  item.fetch.push_back(lookup_node->name());
+
+  // Run the graph
+  TF_CHECK_OK(cluster_->Initialize(item));
+  RunMetadata metadata;
+  TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
+
+  // Check the cost model.
+  bool found_table_init = false;
+  bool found_hashtable = false;
+  for (const auto& node : metadata.cost_graph().node()) {
+    if (node.name() == "hash_table") {
+      found_hashtable = true;
+      // Persistent memory usage should be 0 since it's recorded as part of the
+      // initialize_table op.
+      EXPECT_EQ(0, node.host_persistent_memory_size());
+      EXPECT_EQ(0, node.device_persistent_memory_size());
+    } else if (node.name() == "initialize_table") {
+      found_table_init = true;
+      // Persistent memory should hold 2 keys and 2 values.
+      EXPECT_LE(4 * sizeof(int64), node.host_persistent_memory_size());
+      EXPECT_EQ(0, node.device_persistent_memory_size());
+    }
+  }
+  EXPECT_TRUE(found_table_init);
+  EXPECT_TRUE(found_hashtable);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..592e4b789d0dcb7369e2f0c6db447eb9daa92870
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/clusters/utils.h"
+
+#include "third_party/eigen3/Eigen/Core"
+
+#if GOOGLE_CUDA
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#include "cuda/include/cudnn.h"
+#endif
+
+#ifdef EIGEN_USE_LIBXSMM
+#include "include/libxsmm.h"
+#endif
+
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
+
+namespace tensorflow {
+namespace grappler {
+
+DeviceProperties GetLocalCPUInfo() {
+  DeviceProperties device;
+  device.set_type("CPU");
+
+  device.set_vendor(port::CPUVendorIDString());
+  // Combine cpu family and model into the model string.
+  device.set_model(
+      strings::StrCat((port::CPUFamily() << 4) + port::CPUModelNum()));
+  device.set_frequency(port::NominalCPUFrequency() * 1e-6);
+  device.set_num_cores(port::NumSchedulableCPUs());
+  device.set_l1_cache_size(Eigen::l1CacheSize());
+  device.set_l2_cache_size(Eigen::l2CacheSize());
+  device.set_l3_cache_size(Eigen::l3CacheSize());
+
+  (*device.mutable_environment())["cpu_instruction_set"] =
+      Eigen::SimdInstructionSetsInUse();
+
+  (*device.mutable_environment())["eigen"] = strings::StrCat(
+      EIGEN_WORLD_VERSION, ".", EIGEN_MAJOR_VERSION, ".", EIGEN_MINOR_VERSION);
+#ifdef EIGEN_USE_LIBXSMM
+  (*device.mutable_environment())["libxsmm"] = LIBXSMM_VERSION;
+#endif
+
+  return device;
+}
+
+DeviceProperties GetLocalGPUInfo(int gpu_id) {
+  DeviceProperties device;
+  device.set_type("GPU");
+
+#if GOOGLE_CUDA
+  cudaDeviceProp properties;
+  cudaError_t error = cudaGetDeviceProperties(&properties, gpu_id);
+  if (error == cudaSuccess) {
+    device.set_vendor("NVidia");
+    device.set_model(properties.name);
+    device.set_frequency(properties.clockRate * 1e-3);
+    device.set_num_cores(properties.multiProcessorCount);
+    device.set_num_registers(properties.regsPerMultiprocessor);
+    // For compute capability less than 5, l1 cache size is configurable to
+    // either 16 KB or 48 KB. We use the initial configuration 16 KB here. For
+    // compute capability larger or equal to 5, l1 cache (unified with texture
+    // cache) size is 24 KB. This number may need to be updated for future
+    // compute capabilities.
+    device.set_l1_cache_size((properties.major < 5) ? 16 * 1024 : 24 * 1024);
+    device.set_l2_cache_size(properties.l2CacheSize);
+    device.set_l3_cache_size(0);
+    device.set_shared_memory_size_per_multiprocessor(
+        properties.sharedMemPerMultiprocessor);
+    device.set_memory_size(properties.totalGlobalMem);
+    // 8 is the number of bits per byte. 2 is accounted for
+    // double data rate (DDR).
+    device.set_bandwidth(properties.memoryBusWidth / 8 *
+                         properties.memoryClockRate * 2);
+  }
+
+  (*device.mutable_environment())["architecture"] =
+      strings::StrCat(properties.major, ".", properties.minor);
+  (*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
+  (*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION);
+#endif
+
+  return device;
+}
+
+DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device) {
+  if (device.type == "CPU") {
+    return GetLocalCPUInfo();
+  } else if (device.type == "GPU") {
+    if (device.has_id) {
+      return GetLocalGPUInfo(device.id);
+    } else {
+      return GetLocalGPUInfo(0);
+    }
+  }
+  DeviceProperties result;
+  result.set_type("UNKNOWN");
+  return result;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/utils.h b/tensorflow/core/grappler/clusters/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..191942040a1fdd276bb50f799ce314389c2cb0fe
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/utils.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
+#define TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
+
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Returns the DeviceProperties of the CPU on which grappler is running.
+DeviceProperties GetLocalCPUInfo();
+
+// Returns the DeviceProperties for the specified GPU attached to the server on
+// which grappler is running.
+DeviceProperties GetLocalGPUInfo(int gpu_id);
+
+// Returns the DeviceProperties of the specified device
+DeviceProperties GetDeviceInfo(const DeviceNameUtils::ParsedName& device);
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_UTILS_H_
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ca4c03dbb6dd7c2c578b0d86de2ecbe16f8e652
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+
+namespace tensorflow {
+namespace grappler {
+
+VirtualCluster::VirtualCluster(
+    const std::unordered_map<string, DeviceProperties>& devices)
+    : Cluster(0) {
+  devices_ = devices;
+}
+
+VirtualCluster::~VirtualCluster() {}
+
+Status VirtualCluster::Provision() { return Status::OK(); }
+
+Status VirtualCluster::Initialize(const GrapplerItem& item) {
+  return Status::OK();
+}
+
+Status VirtualCluster::Run(const GraphDef& item,
+                           const std::vector<std::pair<string, Tensor>>& feed,
+                           const std::vector<string>& fetch,
+                           RunMetadata* metadata) {
+  return Status::OK();
+
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.h b/tensorflow/core/grappler/clusters/virtual_cluster.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd8436a9870e97457b67474870ad6b46215cf9ee
--- /dev/null
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
+#define TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
+
+#include <unordered_map>
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Create a simple cluster that lists the devices (and their properties)
+// available in a TensorFlow session. This cluster doesn't allow running an
+// actual graph. It is useful however when used in conjusction with costs models
+// that aren't based on the execution of the graph.
+class VirtualCluster : public Cluster {
+ public:
+  VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices);
+
+  ~VirtualCluster() override;
+
+  Status Provision() override;
+  Status Initialize(const GrapplerItem& item) override;
+  Status Run(const GraphDef& item,
+             const std::vector<std::pair<string, Tensor>>& feed,
+             const std::vector<string>& fetch, RunMetadata* metadata) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_CLUSTERS_VIRTUAL_CLUSTER_H_
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index d078d9af09e72bda2e64877bf0179982670e5e4f..adaf6cab05a8769035eed6f2729f9fe5b55624dd 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -25,7 +25,9 @@ tf_proto_library(
     name = "op_performance_data",
     srcs = ["op_performance_data.proto"],
     cc_api_version = 2,
-    protodeps = ["//tensorflow/core:protos_all"],
+    protodeps = [
+        "//tensorflow/core:protos_all",
+    ],
     visibility = ["//visibility:public"],
 )
 
@@ -37,7 +39,7 @@ cc_library(
     deps = [
         ":op_performance_data_cc",
         ":utils",
-        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/clusters:cluster",
@@ -50,11 +52,13 @@ cc_test(
     args = ["--heap_check=local"],  # The GPU tracer leaks memory
     deps = [
         ":graph_properties",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:grappler_item_builder",
         "//tensorflow/core/grappler/clusters:single_machine",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
@@ -88,17 +92,37 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "robust_stats",
+    srcs = ["robust_stats.cc"],
+    hdrs = ["robust_stats.h"],
+    visibility = ["//visibility:public"],
+)
+
+cc_test(
+    name = "robust_stats_test",
+    srcs = ["robust_stats_test.cc"],
+    deps = [
+        ":robust_stats",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "utils",
     srcs = ["utils.cc"],
     hdrs = ["utils.h"],
+    defines = if_cuda(["GOOGLE_CUDA=1"]),
     visibility = ["//visibility:public"],
     deps = [
         ":op_performance_data_cc",
         "//third_party/eigen3",
-        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core/grappler/clusters:utils",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
     ] + if_cuda([
         "//tensorflow/core:cuda",
@@ -114,3 +138,98 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
+
+cc_library(
+    name = "virtual_placer",
+    srcs = ["virtual_placer.cc"],
+    hdrs = ["virtual_placer.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler/clusters:cluster",
+    ],
+)
+
+cc_library(
+    name = "virtual_scheduler",
+    srcs = ["virtual_scheduler.cc"],
+    hdrs = ["virtual_scheduler.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+    ],
+)
+
+cc_library(
+    name = "measuring_cost_estimator",
+    srcs = ["measuring_cost_estimator.cc"],
+    hdrs = ["measuring_cost_estimator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":robust_stats",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
+
+cc_library(
+    name = "op_level_cost_estimator",
+    srcs = ["op_level_cost_estimator.cc"],
+    hdrs = ["op_level_cost_estimator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cost_estimator",
+        ":op_performance_data_cc",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/grappler/clusters:utils",
+    ],
+)
+
+cc_test(
+    name = "op_level_cost_estimator_test",
+    srcs = ["op_level_cost_estimator_test.cc"],
+    deps = [
+        ":op_level_cost_estimator",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "analytical_cost_estimator",
+    srcs = ["analytical_cost_estimator.cc"],
+    hdrs = ["analytical_cost_estimator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cost_estimator",
+        ":graph_properties",
+        ":op_level_cost_estimator",
+        ":op_performance_data_cc",
+        ":utils",
+        ":virtual_placer",
+        ":virtual_scheduler",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
+)
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6eba75ee60454cf12c43ee9891535be9896da1f
--- /dev/null
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -0,0 +1,127 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/analytical_cost_estimator.h"
+
+#include <limits>
+#include <unordered_map>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/graph/types.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
+#include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+AnalyticalCostEstimator::AnalyticalCostEstimator(Cluster* cluster,
+                                                 bool use_static_shapes)
+    : cluster_(cluster), use_static_shapes_(use_static_shapes) {}
+
+Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
+  item_ = item;
+  return Status::OK();
+}
+
+Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
+                                             CostGraphDef* cost_graph,
+                                             Costs* costs) const {
+  GrapplerItem item = item_;
+  item.graph = optimized_graph;
+  GraphProperties properties(item);
+  Status status;
+  if (use_static_shapes_) {
+    status = properties.InferStatically();
+  } else {
+    status = properties.InferDynamically(cluster_);
+  }
+
+  if (!status.ok()) {
+    costs->execution_time = Costs::Duration::max();
+    return status;
+  }
+
+  std::unordered_map<string, CostGraphDef::Node*> name_to_cost;
+  if (cost_graph) {
+    for (auto& node : *cost_graph->mutable_node()) {
+      name_to_cost[node.name()] = &node;
+    }
+  }
+  std::vector<string> inaccurate_nodes;
+  VirtualScheduler scheduler(optimized_graph, item_.fetch);
+  VirtualPlacer placer(cluster_);
+  Costs node_costs;
+  do {
+    const NodeDef* node = scheduler.GetCurrNode();
+    std::vector<OpInfo::TensorProperties> inputs =
+        properties.GetInputProperties(node->name());
+
+    DeviceProperties device = placer.get_device(*node);
+    OpInfo op_info;
+    op_info.set_op(node->op());
+    *op_info.mutable_attr() = node->attr();
+    for (auto& input : inputs) {
+      op_info.add_inputs()->Swap(&input);
+    }
+    op_info.mutable_device()->Swap(&device);
+
+    node_costs = node_estimator_.PredictCosts(op_info);
+    if (node_costs.inaccurate) {
+      inaccurate_nodes.push_back(node->name());
+    }
+    if (cost_graph) {
+      auto it = name_to_cost.find(node->name());
+      CostGraphDef::Node* cost_node;
+      if (it != name_to_cost.end()) {
+        cost_node = it->second;
+      } else {
+        cost_node = cost_graph->add_node();
+        cost_node->set_name(node->name());
+      }
+      string device_name = properties.GetDeviceName(node->name());
+      cost_node->set_device(device_name);
+      cost_node->set_compute_cost(
+          node_costs.execution_time.asMicroSeconds().count());
+      cost_node->set_compute_time(
+          node_costs.compute_time.asMicroSeconds().count());
+      cost_node->set_memory_time(
+          node_costs.memory_time.asMicroSeconds().count());
+      std::vector<OpInfo::TensorProperties> outputs =
+          properties.GetOutputProperties(node->name());
+      for (const auto& output : outputs) {
+        auto output_info = cost_node->add_output_info();
+        output_info->set_dtype(output.dtype());
+        auto shape = output_info->mutable_shape();
+        *shape = output.shape();
+      }
+    }
+  } while (scheduler.MarkCurrNodeExecuted(node_costs));
+
+  *costs = scheduler.Summary();
+  VLOG(1) << inaccurate_nodes.size() << " out of "
+          << optimized_graph.node_size()
+          << " nodes have inaccurate time estimation";
+  for (const auto& node : inaccurate_nodes) {
+    VLOG(2) << "Node with inaccurate time estimation: " << node;
+  }
+  return Status::OK();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
new file mode 100644
index 0000000000000000000000000000000000000000..f267fac73ffedc6b056284620f08ad0cdb95b443
--- /dev/null
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+class CostGraphDef;
+class GraphDef;
+}  // namespace tensorflow
+
+namespace tensorflow {
+namespace grappler {
+
+class Cluster;
+struct GrapplerItem;
+
+// Estimate the cost of running a Grappler item based on the theoretical
+// performance of the hardware that will run the model.
+class AnalyticalCostEstimator : public CostEstimator {
+ public:
+  // Does not take ownership of cluster.
+  explicit AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes);
+  ~AnalyticalCostEstimator() override {}
+
+  // Initializes the estimator for the specified grappler item.
+  // This implementation always returns OK.
+  Status Initialize(const GrapplerItem& item) override;
+
+  // Predict the performance of each node of the optimized graph and annotate
+  // the CostGraphDef with the corresponding estimates. Also returns the
+  // expected latency for the whole graph.
+  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
+                      Costs* overall_latency) const override;
+
+ private:
+  Cluster* cluster_;  // Not owned.
+  GrapplerItem item_;
+  OpLevelCostEstimator node_estimator_;
+  bool use_static_shapes_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_ANALYTICAL_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index 093b7e29dc812c03ecdb742aea50fb689b19b024..786840384ada702650566212c0f8d4897c8fcd4c 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -42,7 +42,8 @@ struct Costs {
   struct MicroSeconds : std::chrono::microseconds {
     MicroSeconds() : std::chrono::microseconds(0) {}
     MicroSeconds(double d) : std::chrono::microseconds(static_cast<int64>(d)) {}
-    MicroSeconds(std::chrono::microseconds& d) : std::chrono::microseconds(d) {}
+    MicroSeconds(const std::chrono::microseconds& d)
+        : std::chrono::microseconds(d) {}
     MicroSeconds& operator=(const std::chrono::microseconds& d) {
       std::chrono::microseconds::operator=(d);
       return *this;
@@ -51,7 +52,8 @@ struct Costs {
   struct NanoSeconds : std::chrono::nanoseconds {
     NanoSeconds() : std::chrono::nanoseconds(0) {}
     NanoSeconds(double d) : std::chrono::nanoseconds(static_cast<int64>(d)) {}
-    NanoSeconds(std::chrono::nanoseconds& d) : std::chrono::nanoseconds(d) {}
+    NanoSeconds(const std::chrono::nanoseconds& d)
+        : std::chrono::nanoseconds(d) {}
     NanoSeconds& operator=(const std::chrono::nanoseconds& d) {
       std::chrono::nanoseconds::operator=(d);
       return *this;
@@ -90,6 +92,8 @@ struct Costs {
   int64 max_per_op_buffers;    // Sum of all buffers used by the ops.
   int64 max_per_op_streaming;  // Ignore largest input buffer, assuming it
                                // streams from main memory.
+  // If the time estimation is inaccurate.
+  bool inaccurate = false;
 };
 
 inline std::ostream& operator<<(std::ostream& os, const Costs::MicroSeconds d) {
@@ -126,7 +130,7 @@ class CostEstimator {
  public:
   virtual ~CostEstimator() {}
 
-  // Initalizes the estimator for the specified grappler item.
+  // Initializes the estimator for the specified grappler item.
   // The estimator shouldn't be used if this function returns any status other
   // that OK.
   virtual Status Initialize(const GrapplerItem& item) = 0;
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 345c7e2f21f0809614aaf5ae2a9e46b51b4a2a8b..b0e69d44edd129eaa29ac282540b7791cf377fd0 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -26,10 +29,81 @@ namespace grappler {
 Status GraphProperties::InferStatically() {
   Graph graph(OpRegistry::Global());
   ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  shape_refiner.set_require_shape_inference_fns(false);
   ImportGraphDefOptions options;
   Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
   TF_RETURN_IF_ERROR(s);
 
+  // List the resources and the nodes using them
+  std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
+  for (const Node* const node : graph.nodes()) {
+    for (int i = 0; i < node->num_inputs(); ++i) {
+      if (node->input_type(i) == DataType::DT_RESOURCE) {
+        const Node* resource;
+        TF_CHECK_OK(node->input_node(i, &resource));
+        resources[resource].insert(node);
+      }
+    }
+  }
+
+  // If we found a resource, try to propagate the shapes through it.
+  bool done = true;
+  do {
+    std::queue<const Node*> new_shapes;
+    for (const auto& resource_data : resources) {
+      const Node* qnode = resource_data.first;
+      StringPiece type(qnode->type_string());
+      if (!type.ends_with("QueueV2")) {
+        continue;
+      }
+      auto qctx = shape_refiner.GetContext(qnode);
+      if (!qctx) {
+        continue;
+      }
+      DataType queue_type = qctx->output_handle_dtype(0);
+      shape_inference::ShapeHandle queue_shp = qctx->output_handle_shape(0);
+      if (qctx->FullyDefined(queue_shp) && queue_type != DT_INVALID) {
+        continue;
+      }
+
+      for (const auto& node : resource_data.second) {
+        auto ctx = shape_refiner.GetContext(node);
+        if (!ctx) {
+          continue;
+        }
+        if (node->type_string().find("Enqueue") != std::string::npos) {
+          if (ctx->num_inputs() == 2) {
+            const DataType dtype = node->input_type(1);
+            if (queue_type == DT_INVALID) {
+              queue_type = dtype;
+            } else {
+              CHECK_EQ(queue_type, dtype);
+            }
+            shape_inference::ShapeHandle shp = ctx->input(1);
+            TF_RETURN_IF_ERROR(qctx->Merge(queue_shp, shp, &queue_shp));
+          }
+        }
+      }
+      if (qctx->set_output_handle_dtype(0, queue_type) |
+          qctx->MergeOutputHandleShape(0, queue_shp)) {
+        new_shapes.push(qnode);
+      }
+    }
+    // Propagate the shapes in the transitive fan-out of the queue.
+    done = new_shapes.empty();
+    while (!new_shapes.empty()) {
+      const Node* n = new_shapes.front();
+      new_shapes.pop();
+      for (const Node* fanout : n->out_nodes()) {
+        bool updated = false;
+        TF_RETURN_IF_ERROR(shape_refiner.UpdateNode(fanout, &updated));
+        if (updated) {
+          new_shapes.push(fanout);
+        }
+      }
+    }
+  } while (!done);
+
   for (const Node* const node : graph.nodes()) {
     VLOG(1) << "<Node> " << node->name();
     auto ctx = shape_refiner.GetContext(node);
@@ -77,8 +151,8 @@ Status GraphProperties::InferStatically() {
 
     if (!node->assigned_device_name().empty()) {
       device_names_[node->name()] = node->assigned_device_name();
-    } else if (!node->def().device().empty()) {
-      device_names_[node->name()] = node->def().device();
+    } else if (!node->requested_device().empty()) {
+      device_names_[node->name()] = node->requested_device();
     } else {
       device_names_[node->name()] = "not set";
     }
@@ -96,6 +170,7 @@ Status GraphProperties::InferDynamically(Cluster* cluster) {
       cluster->Run(item_.graph, item_.feed, item_.fetch, &metadata));
 
   std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
+  std::unordered_map<string, const NodeDef*> name_to_node;  // Empty
   for (auto& node : metadata.cost_graph().node()) {
     name_to_cost[node.name()] = &node;
 
@@ -119,7 +194,7 @@ Status GraphProperties::InferDynamically(Cluster* cluster) {
       continue;
     }
     std::vector<OpInfo::TensorProperties> inputs =
-        FindInputFeatures(node, name_to_cost);
+        FindInputFeatures(node, name_to_cost, name_to_node);
 
     input_properties_[node.name()] = inputs;
 
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 32683644fbbe7293a821605a8467aa428273a1fc..be5ae3c3646a6aec0bf177a0e7c666b9365023d1 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
@@ -129,6 +132,328 @@ TEST_F(GraphPropertiesTest, DynamicProperties) {
   }
 }
 
+TEST_F(GraphPropertiesTest, VarHandles) {
+  GrapplerItem item;
+  TF_CHECK_OK(NodeDefBuilder("Var", "VarHandleOp")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("shape", TensorShape({3, 7}))
+                  .Finalize(item.graph.add_node()));
+
+  TF_CHECK_OK(NodeDefBuilder("VarRead", "ReadVariableOp")
+                  .Attr("dtype", DT_FLOAT)
+                  .Input("Var", 0, DT_RESOURCE)
+                  .Finalize(item.graph.add_node()));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  const auto props = properties.GetOutputProperties("VarRead");
+  EXPECT_EQ(1, props.size());
+  const OpInfo::TensorProperties& prop = props[0];
+  EXPECT_EQ(DT_FLOAT, prop.dtype());
+  EXPECT_FALSE(prop.shape().unknown_rank());
+  EXPECT_EQ(2, prop.shape().dim_size());
+  EXPECT_EQ(3, prop.shape().dim(0).size());
+  EXPECT_EQ(7, prop.shape().dim(1).size());
+}
+
+TEST_F(GraphPropertiesTest, Queues) {
+  // Create a graph with known input shapes, and propagate the shapes through a
+  // couple of queues.
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+
+  auto q1 = ops::FIFOQueue(root.WithOpName("Queue1"), {DataType::DT_FLOAT});
+  Output rnd =
+      ops::RandomNormal(root.WithOpName("rnd"), {3, 7}, DataType::DT_FLOAT);
+  Output square1 = ops::Square(root.WithOpName("Square1"), rnd);
+  auto enqueue1 = ops::QueueEnqueue(root.WithOpName("Enqueue1"), q1, {square1});
+  auto dequeue1 =
+      ops::QueueDequeue(root.WithOpName("Dequeue1"), q1, {DataType::DT_FLOAT});
+
+  auto q2 =
+      ops::RandomShuffleQueue(root.WithOpName("Queue2"), {DataType::DT_FLOAT});
+  Output square2 = ops::Square(root.WithOpName("Square2"), dequeue1[0]);
+  auto enqueue2 = ops::QueueEnqueue(root.WithOpName("Enqueue2"), q2, {square2});
+  auto dequeue2 =
+      ops::QueueDequeue(root.WithOpName("Dequeue2"), q2, {DataType::DT_FLOAT});
+
+  // Create a queue that feeds itself.
+  auto q3 =
+      ops::RandomShuffleQueue(root.WithOpName("Queue3"), {DataType::DT_FLOAT});
+  auto dequeue3 =
+      ops::QueueDequeue(root.WithOpName("Dequeue3"), q3, {DataType::DT_FLOAT});
+  auto merge3 = ops::Merge(root.WithOpName("Merge3"), {dequeue3[0], square2});
+  auto enqueue3 =
+      ops::QueueEnqueue(root.WithOpName("Enqueue3"), q3, {merge3.output});
+
+  auto q4 =
+      ops::RandomShuffleQueue(root.WithOpName("Queue4"), {DataType::DT_FLOAT});
+  auto enqueue4 = ops::QueueEnqueue(root.WithOpName("Enqueue4"), q4, {square2});
+  auto enqueue4_2 =
+      ops::QueueEnqueue(root.WithOpName("Enqueue4_2"), q4, {dequeue3[0]});
+  auto dequeue4 =
+      ops::QueueDequeue(root.WithOpName("Dequeue4"), q4, {DataType::DT_FLOAT});
+
+  GrapplerItem item;
+  TF_CHECK_OK(root.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  const auto props1 = properties.GetOutputProperties("Dequeue1");
+  EXPECT_EQ(1, props1.size());
+  const OpInfo::TensorProperties& prop1 = props1[0];
+  EXPECT_EQ(DT_FLOAT, prop1.dtype());
+  EXPECT_FALSE(prop1.shape().unknown_rank());
+  EXPECT_EQ(2, prop1.shape().dim_size());
+  EXPECT_EQ(3, prop1.shape().dim(0).size());
+  EXPECT_EQ(7, prop1.shape().dim(1).size());
+
+  const auto props2 = properties.GetOutputProperties("Dequeue2");
+  EXPECT_EQ(1, props2.size());
+  const OpInfo::TensorProperties& prop2 = props2[0];
+  EXPECT_EQ(DT_FLOAT, prop2.dtype());
+  EXPECT_FALSE(prop2.shape().unknown_rank());
+  EXPECT_EQ(2, prop2.shape().dim_size());
+  EXPECT_EQ(3, prop2.shape().dim(0).size());
+  EXPECT_EQ(7, prop2.shape().dim(1).size());
+
+  // The dequeue3 op shape is unknown. The square2 op shape is known. Verify
+  // that we merge the 2 properly to determine the shape of the data coming out
+  // of the queue.
+  const auto props4 = properties.GetOutputProperties("Dequeue4");
+  EXPECT_EQ(1, props4.size());
+  const OpInfo::TensorProperties& prop4 = props4[0];
+  EXPECT_EQ(DT_FLOAT, prop4.dtype());
+  EXPECT_FALSE(prop4.shape().unknown_rank());
+  EXPECT_EQ(2, prop4.shape().dim_size());
+  EXPECT_EQ(3, prop4.shape().dim(0).size());
+  EXPECT_EQ(7, prop4.shape().dim(1).size());
+}
+
+TEST_F(GraphPropertiesTest, Loops) {
+  // Test graph produced in python using:
+  /*
+     with tf.Graph().as_default():
+       i = tf.constant(0)
+       c = lambda i: tf.less(i, 10)
+       b = lambda i: tf.add(i, 1)
+       r = tf.while_loop(c, b, [i])
+       with open('/tmp/graph.txt', 'w') as f:
+         f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/Add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/Add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/Add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+versions {
+  producer: 11
+}
+  )EOF";
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically());
+
+  const auto props = properties.GetOutputProperties("while/Exit");
+  EXPECT_EQ(1, props.size());
+  const OpInfo::TensorProperties& prop = props[0];
+  EXPECT_EQ(DT_INT32, prop.dtype());
+  EXPECT_TRUE(prop.shape().unknown_rank());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9262883b2a7bae1ade9d8fffb6680e1808d7e53b
--- /dev/null
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -0,0 +1,133 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/measuring_cost_estimator.h"
+
+#include <limits>
+
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace grappler {
+
+MeasuringCostEstimator::MeasuringCostEstimator(Cluster* cluster,
+                                               int measurement_steps,
+                                               int measurement_threads)
+    : measurement_steps_(measurement_steps),
+      measurement_threads_(measurement_threads) {
+  CHECK_GE(measurement_steps, 1);
+  if (measurement_threads > 0) {
+    thread_pool_.reset(new thread::ThreadPool(
+        Env::Default(), SanitizeThreadSuffix("measurements"),
+        measurement_threads));
+  }
+  cluster_ = cluster;
+}
+
+Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
+  feed_ = item.feed;
+  fetch_ = item.fetch;
+  return cluster_->Initialize(item);
+}
+
+Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
+                                            CostGraphDef* cost_graph,
+                                            Costs* costs) const {
+  std::vector<double> times(measurement_steps_);
+  BlockingCounter barrier(measurement_steps_);
+
+  mutex status_mu;
+  Status status;
+
+  auto measurement_fn = [&](const int step) {
+    const Costs::MicroSeconds start = Env::Default()->NowMicros();
+
+    RunMetadata metadata;
+    const Status local_status =
+        cluster_->Run(optimized_graph, feed_, fetch_, &metadata);
+    {
+      mutex_lock lock(status_mu);
+      status.Update(local_status);
+    }
+    if (step < 0) {
+      // Discard the first iteration as it triggers the warmup, and therefore
+      // takes much longer than a normal step.
+      return;
+    }
+    if (!local_status.ok()) {
+      // Discard the data if the run wasn't successful.
+      barrier.DecrementCount();
+      return;
+    }
+
+    const Costs::MicroSeconds finish = Env::Default()->NowMicros();
+    const double time = (finish - start).count() * 1e3;
+    times[step] = time;
+
+    if (cost_graph && (step + 1 == measurement_steps_)) {
+      metadata.mutable_cost_graph()->Swap(cost_graph);
+    }
+
+    barrier.DecrementCount();
+  };
+
+  // Initialize the computation and warm up TensorFlow.
+  measurement_fn(-1);
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to run start measurements: "
+               << status.error_message();
+    costs->execution_time = Costs::Duration::max();
+    return status;
+  }
+
+  // Run "measurement_steps_" and measure the time.
+  if (measurement_threads_ > 0) {
+    for (int i = 0; i < measurement_steps_; ++i) {
+      thread_pool_->Schedule([i, &measurement_fn]() { measurement_fn(i); });
+    }
+    barrier.Wait();
+  } else {
+    for (int i = 0; i < measurement_steps_ && status.ok(); ++i) {
+      measurement_fn(i);
+    }
+  }
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to measure graph performance: "
+               << status.error_message();
+    costs->execution_time = Costs::Duration::max();
+    costs->max_execution_time = Costs::Duration::max();
+    costs->min_execution_time = 0;
+    return status;
+  }
+
+  // Compute the average time of the measure steps. Use Huber statistics
+  // to filter out outliers.
+  RobustStats stats(times);
+  costs->execution_time = Costs::Duration(stats.mean());
+  costs->max_execution_time = Costs::Duration(stats.hi());
+  costs->min_execution_time = Costs::Duration(stats.lo());
+
+  return Status::OK();
+}
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.h b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b3edb4c27b325d03884624bf48d57fe09768df7
--- /dev/null
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+#define TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+class CostGraphDef;
+class GraphDef;
+}  // namespace tensorflow
+
+namespace tensorflow {
+namespace grappler {
+
+class Cluster;
+struct GrapplerItem;
+
+// Estimate the cost of running a Grappler item by actually running the
+// corresponding TensorFlow graph on the specified cluster and measuring the
+// runtimes.
+class MeasuringCostEstimator : public CostEstimator {
+ public:
+  // Run the model for measurement_steps to measure its average cost.
+  // When measurement_threads is greater than 0, use a threadpool of as many
+  // threads to run the measurements; otherwise, run them serially. Does not
+  // take ownership of cluster.
+  explicit MeasuringCostEstimator(Cluster* cluster, int measurement_steps,
+                                  int measurement_threads);
+  ~MeasuringCostEstimator() override {}
+
+  // Initializes the estimator for the specified grappler item.
+  // This implementation always returns OK.
+  Status Initialize(const GrapplerItem& item) override;
+
+  // Runs the optimized version of the graph on the cluster, measure
+  // the runtimes of each operation, and annotated the CostGraphDef
+  // with the corresponding measurements.
+  // Returns the average latency for the whole graph.
+  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
+                      Costs* overall_cost) const override;
+
+ private:
+  Cluster* cluster_;  // Not owned.
+  int measurement_steps_;
+  int measurement_threads_;
+  std::vector<std::pair<string, Tensor>> feed_;
+  std::vector<string> fetch_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5298dc756567f5c5f4631f84606b2f9d8ddd3159
--- /dev/null
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -0,0 +1,561 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/grappler/clusters/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+constexpr int kOpsPerMac = 2;
+constexpr char kConv2d[] = "Conv2D";
+constexpr char kConv2dBackPropFilter[] = "Conv2DBackpropFilter";
+constexpr char kConv2dBackPropInput[] = "Conv2DBackpropInput";
+constexpr char kMatMul[] = "MatMul";
+constexpr char kSparseMatMul[] = "SparseMatMul";
+constexpr char kIdentity[] = "Identity";
+constexpr char kNoOp[] = "NoOp";
+constexpr char kReshape[] = "Reshape";
+
+OpLevelCostEstimator::OpLevelCostEstimator() {
+  // Syntactic sugar to build and return a lambda that takes an OpInfo and
+  // returns a cost.
+  typedef Costs (OpLevelCostEstimator::*CostImpl)(const OpInfo& op_feature)
+      const;
+  auto wrap = [this](CostImpl impl) -> std::function<Costs(const OpInfo&)> {
+    return [this, impl](const OpInfo& op) { return (this->*impl)(op); };
+  };
+
+  device_cost_impl_ = {
+      {kConv2d, wrap(&OpLevelCostEstimator::PredictConv2D)},
+      {kConv2dBackPropFilter,
+       wrap(&OpLevelCostEstimator::PredictConv2DBackPropFilter)},
+      {kConv2dBackPropInput,
+       wrap(&OpLevelCostEstimator::PredictConv2DBackPropInput)},
+      {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
+      {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
+      {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)}};
+}
+
+Costs OpLevelCostEstimator::PredictCosts(const OpInfo& op_features) const {
+  auto it = device_cost_impl_.find(op_features.op());
+  if (it == device_cost_impl_.end()) {
+    VLOG(1) << "Missing implementation for op: " << op_features.op();
+    Costs costs;
+    costs = DummyExecutionTime(op_features);
+    return costs;
+  }
+
+  std::function<Costs(const OpInfo&)> estimator = it->second;
+  Costs costs = estimator(op_features);
+  VLOG(1) << "Operation " << op_features.op() << " takes "
+          << costs.execution_time.count() << " ns.";
+  return costs;
+}
+
+std::pair<double, double> OpLevelCostEstimator::GetDeviceInfo(
+    const DeviceProperties& device) const {
+  double gflops = -1;
+  double bandwidth = -1;
+  if (device.bandwidth() > 0) {
+    bandwidth = device.bandwidth() / 1e6;
+  }
+
+  if (device.type() == "CPU") {
+    DeviceProperties local_cpu;
+    if (device.num_cores() <= 0 || device.frequency() <= 0) {
+      local_cpu = GetLocalCPUInfo();
+    } else {
+      local_cpu = device;
+    }
+
+    // Check if vector instructions are available, and refine performance
+    // prediction based on this.
+    // Frequencies are stored in MHz in the DeviceProperties.
+    gflops = local_cpu.num_cores() * local_cpu.frequency() * 1e-3;
+    if (bandwidth < 0) {
+      if (local_cpu.bandwidth() > 0) {
+        bandwidth = local_cpu.bandwidth() / 1e6;
+      } else {
+        bandwidth = 32;
+      }
+    }
+  } else if (device.type() == "GPU") {
+    const DeviceProperties local_gpu = GetLocalGPUInfo(0);
+    const string architecture = local_gpu.environment().at("architecture");
+    int cores_per_multiprocessor;
+    if (architecture < "3") {
+      // Fermi
+      cores_per_multiprocessor = 32;
+    } else if (architecture < "4") {
+      // Kepler
+      cores_per_multiprocessor = 192;
+    } else if (architecture < "6") {
+      //  Maxwell
+      cores_per_multiprocessor = 128;
+    } else {
+      // Pascal.
+      cores_per_multiprocessor = 64;
+    }
+    gflops = local_gpu.num_cores() * local_gpu.frequency() * 1e-3 *
+             cores_per_multiprocessor * kOpsPerMac;
+    if (bandwidth < 0) {
+      CHECK(local_gpu.bandwidth() > 0);
+      bandwidth = local_gpu.bandwidth() / 1e6;
+    }
+  }
+
+  return std::make_pair(gflops, bandwidth);
+}
+
+Costs OpLevelCostEstimator::DummyExecutionTime(
+    const OpInfo& op_features) const {
+  Costs costs = PredictOpCountBasedCost(0, op_features);
+  costs.inaccurate = true;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictOpCountBasedCost(
+    double operations, const OpInfo& op_features) const {
+  std::pair<double, double> device_perf = GetDeviceInfo(op_features.device());
+  Costs::NanoSeconds compute_cost(operations / device_perf.first);
+  VLOG(1) << "Op:" << op_features.op() << " GOps:" << operations / 1e9
+          << " Execution Time (ns):" << compute_cost.count();
+
+  bool found_unknown_shapes = false;
+  double total_input_size =
+      CalculateInputSize(op_features, &found_unknown_shapes);
+  double total_output_size =
+      CalculateOutputSize(op_features, &found_unknown_shapes);
+  double total_io_size = total_input_size + total_output_size;
+
+  Costs::NanoSeconds memory_cost(total_io_size / device_perf.second);
+  VLOG(1) << "Op:" << op_features.op() << " Size (KB):" << (total_io_size) / 1e3
+          << " Memory Time (ns):" << memory_cost.count();
+
+  Costs costs;
+  costs.compute_time = compute_cost;
+  costs.memory_time = memory_cost;
+  costs.execution_time = compute_cost + memory_cost;
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+int64 OpLevelCostEstimator::CountConv2DOperations(
+    const OpInfo& op_features, bool* found_unknown_shapes) const {
+  return CountConv2DOperations(op_features, nullptr, found_unknown_shapes);
+}
+
+namespace {
+
+string GetDataFormat(const OpInfo& op_features) {
+  string data_format = "NHWC";  // Default format.
+  if (op_features.attr().find("data_format") != op_features.attr().end()) {
+    data_format = op_features.attr().at("data_format").s();
+  }
+  return data_format;
+}
+
+Padding GetPadding(const OpInfo& op_features) {
+  if (op_features.attr().find("padding") != op_features.attr().end() &&
+      op_features.attr().at("padding").s() == "VALID") {
+    return Padding::VALID;
+  }
+  return Padding::SAME;  // Default padding.
+}
+
+std::vector<int64> GetStrides(const OpInfo& op_features) {
+  if (op_features.attr().find("strides") != op_features.attr().end()) {
+    const auto strides = op_features.attr().at("strides").list().i();
+    return {strides[0], strides[1], strides[2], strides[3]};
+  }
+  return {1, 1, 1, 1};
+}
+
+int64 GetOutputSize(const int64 input, const int64 filter, const int64 stride,
+                    const Padding& padding) {
+  // Logic for calculating output shape is from GetWindowedOutputSizeVerbose()
+  // function in third_party/tensorflow/core/framework/common_shape_fns.cc.
+  if (padding == Padding::VALID) {
+    return (input - filter + stride) / stride;
+  } else {  // SAME.
+    return (input + stride - 1) / stride;
+  }
+}
+
+// Return a minimum shape if the shape is unknown. If known, return the original
+// shape.
+TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
+                                      int rank, bool* found_unknown_shapes) {
+  auto shape = original_shape;
+  if (shape.unknown_rank()) {
+    *found_unknown_shapes = true;
+  }
+  if (shape.unknown_rank() || shape.dim_size() == 0) {
+    TensorShapeProto::Dim dim;
+    VLOG(1) << "WARNING: Use minimum shape because the shape is unknown.";
+    // The size of each dimension is at least 1, if unknown.
+    dim.set_size(1);
+    for (int i = 0; i < rank; i++) {
+      *shape.add_dim() = dim;
+    }
+  } else {
+    CHECK_EQ(shape.dim_size(), rank);
+    for (int i = 0; i < rank; i++) {
+      if (shape.dim(i).size() == -1) {
+        *found_unknown_shapes = true;
+        VLOG(1)
+            << "WARNING: Use minimum dim size 1 because the shape is unknown.";
+        // The size of each dimension is at least 1, if unknown.
+        shape.mutable_dim(i)->set_size(1);
+      }
+    }
+  }
+  return shape;
+}
+}  // namespace
+
+// Helper to translate the positional arguments into named fields.
+OpLevelCostEstimator::ConvolutionDimensions
+OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
+    const TensorShapeProto& original_image_shape,
+    const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+    bool* found_unknown_shapes) {
+  auto image_shape =
+      MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
+  auto filter_shape =
+      MaybeGetMinimumShape(original_filter_shape, 4, found_unknown_shapes);
+
+  int x_index, y_index, channel_index;
+  const string& data_format = GetDataFormat(op_features);
+  if (data_format == "NCHW") {
+    x_index = 2;
+    y_index = 3;
+    channel_index = 1;
+  } else {
+    x_index = 1;
+    y_index = 2;
+    channel_index = 3;
+  }
+  int64 batch = image_shape.dim(0).size();
+  int64 ix = image_shape.dim(x_index).size();
+  int64 iy = image_shape.dim(y_index).size();
+  int64 iz = image_shape.dim(channel_index).size();
+  int64 kx = filter_shape.dim(0).size();
+  int64 ky = filter_shape.dim(1).size();
+  std::vector<int64> strides = GetStrides(op_features);
+  const auto padding = GetPadding(op_features);
+  int64 sx = strides[x_index];
+  int64 sy = strides[y_index];
+  int64 ox = GetOutputSize(ix, kx, sx, padding);
+  int64 oy = GetOutputSize(iy, ky, sy, padding);
+  int64 oz = filter_shape.dim(3).size();
+  // Only check equality when both sizes are known (in other words, when
+  // neither is set to a minimum dimension size of 1).
+  if (iz != 1 && filter_shape.dim(2).size() != 1) {
+    CHECK_EQ(iz, filter_shape.dim(2).size());
+  } else {
+    iz = std::max<int64>(iz, filter_shape.dim(2).size());
+  }
+  OpLevelCostEstimator::ConvolutionDimensions conv_dims = {
+      batch, ix, iy, iz, kx, ky, oz, ox, oy, sx, sy, padding};
+
+  VLOG(1) << "Batch Size:" << batch;
+  VLOG(1) << "Image Dims:" << ix << "," << iy;
+  VLOG(1) << "Input Features:" << iz;
+  VLOG(1) << "Kernel Dims:" << kx << "," << ky;
+  VLOG(1) << "Output Features:" << oz;
+  VLOG(1) << "Output Dims:" << ox << "," << oy;
+  VLOG(1) << "Strides:" << sx << "," << sy;
+  VLOG(1) << "Padding:" << (padding == Padding::VALID ? "VALID" : "SAME");
+  return conv_dims;
+}
+
+int64 OpLevelCostEstimator::CountConv2DOperations(
+    const OpInfo& op_features, ConvolutionDimensions* conv_info,
+    bool* found_unknown_shapes) const {
+  if (op_features.op() != kConv2d) {
+    LOG(ERROR) << "Invalid Operation";
+    return 0;
+  }
+  ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
+      op_features.inputs(0).shape(), op_features.inputs(1).shape(), op_features,
+      found_unknown_shapes);
+
+  int64 ops = conv_dims.batch;
+  ops *= conv_dims.ox * conv_dims.oy;
+  ops *= conv_dims.kx * conv_dims.ky;
+  ops *= conv_dims.iz * conv_dims.oz;
+  ops *= kOpsPerMac;
+  VLOG(1) << "Operations for Conv2D" << ops;
+
+  if (conv_info != nullptr) {
+    *conv_info = conv_dims;
+  }
+  return ops;
+}
+
+int64 OpLevelCostEstimator::CountMatMulOperations(
+    const OpInfo& op_features, bool* found_unknown_shapes) const {
+  return CountMatMulOperations(op_features, nullptr, found_unknown_shapes);
+}
+
+int64 OpLevelCostEstimator::CountMatMulOperations(
+    const OpInfo& op_features, MatMulDimensions* mat_mul,
+    bool* found_unknown_shapes) const {
+  double ops = 0;
+
+  // TODO(nishantpatil): Create separate estimator for Sparse Matmul
+  if ((op_features.op() != kMatMul) && (op_features.op() != kSparseMatMul)) {
+    LOG(ERROR) << "Invalid Operation";
+    return ops;
+  }
+
+  // first matrix
+  auto& a_matrix = op_features.inputs(0);
+  auto& b_matrix = op_features.inputs(1);
+
+  bool transpose_a = false;
+  bool transpose_b = false;
+
+  double m_dim, n_dim, k_dim, k_dim_b = 0;
+
+  for (const auto& item : op_features.attr()) {
+    VLOG(1) << "Key:" << item.first
+            << " Value:" << SummarizeAttrValue(item.second);
+    if (item.first == "transpose_a" && item.second.b() == true)
+      transpose_a = true;
+    if (item.first == "transpose_b" && item.second.b() == true)
+      transpose_b = true;
+  }
+  VLOG(1) << "transpose_a:" << transpose_a;
+  VLOG(1) << "transpose_b:" << transpose_b;
+  auto a_matrix_shape =
+      MaybeGetMinimumShape(a_matrix.shape(), 2, found_unknown_shapes);
+  auto b_matrix_shape =
+      MaybeGetMinimumShape(b_matrix.shape(), 2, found_unknown_shapes);
+  if (transpose_a) {
+    m_dim = a_matrix_shape.dim(1).size();
+    k_dim = a_matrix_shape.dim(0).size();
+  } else {
+    m_dim = a_matrix_shape.dim(0).size();
+    k_dim = a_matrix_shape.dim(1).size();
+  }
+  if (transpose_b) {
+    k_dim_b = b_matrix_shape.dim(1).size();
+    n_dim = b_matrix_shape.dim(0).size();
+  } else {
+    k_dim_b = b_matrix_shape.dim(0).size();
+    n_dim = b_matrix_shape.dim(1).size();
+  }
+
+  VLOG(1) << "M, N, K: " << m_dim << "," << n_dim << "," << k_dim;
+  // Only check equality when both sizes are known (in other words, when
+  // neither is set to a minimum dimension size of 1).
+  if (k_dim_b != 1 && k_dim != 1 && k_dim_b != k_dim) {
+    LOG(ERROR) << "Incompatible Matrix dimensions";
+    return ops;
+  } else {
+    // One of k_dim and k_dim_b might be 1 (mininum dimension size).
+    k_dim = std::max(k_dim, k_dim_b);
+  }
+
+  ops = m_dim * n_dim * k_dim * 2;
+  VLOG(1) << "Operations for Matmul" << ops;
+
+  if (mat_mul != nullptr) {
+    mat_mul->m = m_dim;
+    mat_mul->n = n_dim;
+    mat_mul->k = k_dim;
+  }
+  return ops;
+}
+
+// TODO(cliffy): Dedup this method and CountConv2DBackPropFilterOperations.
+int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
+    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    bool* found_unknown_shapes) const {
+  int64 ops = 0;
+
+  if (op_features.op() != kConv2dBackPropInput) {
+    LOG(ERROR) << "Invalid Operation";
+    return ops;
+  }
+
+  if (op_features.attr().find("_output_shapes") == op_features.attr().end()) {
+    // Need _output_shapes for input shape.
+    LOG(ERROR) << "No output shape in Conv2DBackPropInput op feaure.";
+    return ops;
+  }
+
+  const auto& input_shape =
+      op_features.attr().at("_output_shapes").list().shape(0);
+  ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
+      input_shape, op_features.inputs(1).shape(), op_features,
+      found_unknown_shapes);
+
+  ops = conv_dims.batch;
+  ops *= conv_dims.ox * conv_dims.oy;
+  ops *= conv_dims.kx * conv_dims.ky;
+  ops *= conv_dims.iz * conv_dims.oz;
+  ops *= kOpsPerMac;
+
+  VLOG(1) << "Operations for Conv2DBackPropInput" << ops;
+
+  if (returned_conv_dims != nullptr) {
+    *returned_conv_dims = conv_dims;
+  }
+  return ops;
+}
+
+int64 OpLevelCostEstimator::CountConv2DBackPropFilterOperations(
+    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    bool* found_unknown_shapes) const {
+  int64 ops = 0;
+  if (op_features.op() != kConv2dBackPropFilter) {
+    LOG(ERROR) << "Invalid Operation";
+    return ops;
+  }
+
+  if (op_features.attr().find("_output_shapes") == op_features.attr().end()) {
+    // Need _output_shapes for filter shape.
+    LOG(ERROR) << "No output shape in Conv2DBackPropFilter op feaure.";
+    return ops;
+  }
+
+  const auto& filter_shape =
+      op_features.attr().at("_output_shapes").list().shape(0);
+  ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
+      op_features.inputs(0).shape(), filter_shape, op_features,
+      found_unknown_shapes);
+
+  ops = conv_dims.batch;
+  ops *= conv_dims.ox * conv_dims.oy;
+  ops *= conv_dims.kx * conv_dims.ky;
+  ops *= conv_dims.iz * conv_dims.oz;
+  ops *= kOpsPerMac;
+
+  VLOG(1) << "Operations for Conv2DBackPropFilter" << ops;
+
+  if (returned_conv_dims != nullptr) {
+    *returned_conv_dims = conv_dims;
+  }
+  return ops;
+}
+
+int64 OpLevelCostEstimator::CalculateSingleInputSize(
+    const OpInfo::TensorProperties& input, bool* found_unknown_shapes) const {
+  VLOG(1) << "   with " << input.dtype() << " input of shape "
+          << input.shape().DebugString();
+  int64 input_size = 1;
+  int num_dims = std::max(1, input.shape().dim_size());
+  auto input_shape =
+      MaybeGetMinimumShape(input.shape(), num_dims, found_unknown_shapes);
+  for (const auto& dim : input_shape.dim()) {
+    input_size *= dim.size();
+  }
+  return input_size * DataTypeSize(input.dtype());
+}
+
+int64 OpLevelCostEstimator::CalculateInputSize(
+    const OpInfo& op_features, bool* found_unknown_shapes) const {
+  int64 total_input_size = 0;
+  for (auto& input : op_features.inputs()) {
+    int64 input_size = CalculateSingleInputSize(input, found_unknown_shapes);
+    total_input_size += input_size;
+    VLOG(1) << "Input Size: " << input_size
+            << " Total Input Size:" << total_input_size;
+  }
+  return total_input_size;
+}
+
+int64 OpLevelCostEstimator::CalculateOutputSize(
+    const OpInfo& op_features, bool* found_unknown_shapes) const {
+  int64 total_output_size = 0;
+  // use float as default for calculations
+  DataType dt = DT_FLOAT;
+  for (const auto& item : op_features.attr()) {
+    VLOG(1) << "Key:" << item.first
+            << " Value:" << SummarizeAttrValue(item.second);
+    if (item.first == "_output_shapes") {
+      for (const auto& original_output_shape : item.second.list().shape()) {
+        int64 output_size = 1;
+        int num_dims = std::max(1, original_output_shape.dim_size());
+        auto output_shape = MaybeGetMinimumShape(
+            original_output_shape, num_dims, found_unknown_shapes);
+        for (const auto& dim : output_shape.dim()) {
+          output_size *= dim.size();
+        }
+        output_size *= DataTypeSize(dt);
+        total_output_size += output_size;
+        VLOG(1) << "Output Size: " << output_size
+                << " Total Output Size:" << total_output_size;
+      }
+    }
+    if (item.first == "T") {
+      dt = item.second.type();
+    }
+  }
+  return total_output_size;
+}
+
+Costs OpLevelCostEstimator::PredictConv2D(const OpInfo& op_features) const {
+  bool found_unknown_shapes = false;
+  auto costs = PredictOpCountBasedCost(
+      CountConv2DOperations(op_features, &found_unknown_shapes), op_features);
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictConv2DBackPropInput(
+    const OpInfo& op_features) const {
+  bool found_unknown_shapes = false;
+  auto costs =
+      PredictOpCountBasedCost(CountConv2DBackPropInputOperations(
+                                  op_features, nullptr, &found_unknown_shapes),
+                              op_features);
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictConv2DBackPropFilter(
+    const OpInfo& op_features) const {
+  bool found_unknown_shapes = false;
+  auto costs =
+      PredictOpCountBasedCost(CountConv2DBackPropFilterOperations(
+                                  op_features, nullptr, &found_unknown_shapes),
+                              op_features);
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictMatMul(const OpInfo& op_features) const {
+  bool found_unknown_shapes = false;
+  auto costs = PredictOpCountBasedCost(
+      CountMatMulOperations(op_features, &found_unknown_shapes), op_features);
+  costs.inaccurate = found_unknown_shapes;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictNoOp(const OpInfo& op_features) const {
+  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  return Costs::ZeroCosts();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea7d3d3f69b3418b855cd047afc18527863324f8
--- /dev/null
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -0,0 +1,142 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
+
+#include <functional>
+#include <map>
+#include <string>
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class OpLevelCostEstimator {
+ public:
+  OpLevelCostEstimator();
+  virtual ~OpLevelCostEstimator() {}
+
+  Costs PredictCosts(const OpInfo& op_features) const;
+
+ protected:
+  // Returns an estimate of device performance (in billions of operations
+  // executed per second) and memory bandwidth (in GigaBytes/second) for the
+  // specified device.
+  virtual std::pair<double, double> GetDeviceInfo(
+      const DeviceProperties& device) const;
+
+  // For operations for which we haven't yet built estimates, returns a dummy
+  // value based on input size.
+  Costs DummyExecutionTime(const OpInfo& op_features) const;
+
+  // Naive cost estimate based on operations divided by device ops/sec.
+  Costs PredictOpCountBasedCost(double operations,
+                                const OpInfo& op_features) const;
+
+  // This family of routines counts the number of operations to perform the
+  // specified TensorFlow Op.
+  struct MatMulDimensions {
+    int m;
+    int n;
+    int k;
+  };
+  struct ConvolutionDimensions {
+    int64 batch;      // Batch size.
+    int64 ix;         // Input size x.
+    int64 iy;         // Input size y.
+    int64 iz;         // Input depth.
+    int64 kx;         // Kernel x.
+    int64 ky;         // Kernel y.
+    int64 oz;         // Output depth.
+    int64 ox;         // Output size x.
+    int64 oy;         // Output size y.
+    int64 sx;         // Stride x.
+    int64 sy;         // Stride y.
+    Padding padding;  // SAME or VALID.
+  };
+  int64 CountConv2DOperations(const OpInfo& op_features,
+                              bool* found_unknown_shapes) const;
+  int64 CountConv2DOperations(const OpInfo& op_features,
+                              ConvolutionDimensions* conv_info,
+                              bool* found_unknown_shapes) const;
+  int64 CountMatMulOperations(const OpInfo& op_features,
+                              bool* found_unknown_shapes) const;
+  int64 CountMatMulOperations(const OpInfo& op_features,
+                              MatMulDimensions* mat_mul,
+                              bool* found_unknown_shapes) const;
+  int64 CountConv2DBackPropInputOperations(const OpInfo& op_features,
+                                           ConvolutionDimensions* conv_info,
+                                           bool* found_unknown_shapes) const;
+  int64 CountConv2DBackPropFilterOperations(const OpInfo& op_features,
+                                            ConvolutionDimensions* conv_info,
+                                            bool* found_unknown_shapes) const;
+
+  // Calculate the total size in bytes of a single input to a TensorFlow op.
+  int64 CalculateSingleInputSize(const OpInfo::TensorProperties& input,
+                                 bool* found_unknown_shapes) const;
+
+  // Calculate the total size in bytes of the all
+  // the inputs of specified TensorFlow Op
+  int64 CalculateInputSize(const OpInfo& op_features,
+                           bool* found_unknown_shapes) const;
+
+  // Calculate the total size in bytes of the all
+  // the outputs of specified TensorFlow Op
+  int64 CalculateOutputSize(const OpInfo& op_features,
+                            bool* found_unknown_shapes) const;
+
+  // This family of routines predicts the costs to
+  // perform the specified TensorFlow Op on the
+  // device represented by a subclass. The default
+  // implementation just divides the operations to
+  // perform the op (from the "Count" routines,
+  // above) by the device peak operations per
+  // second. Override to supply a better estimate.
+  // Implementation of costs other than
+  // execution_time is optional, depending on the
+  // device.
+  Costs PredictConv2D(const OpInfo& op_features) const;
+  Costs PredictConv2DBackPropInput(const OpInfo& op_features) const;
+  Costs PredictConv2DBackPropFilter(const OpInfo& op_features) const;
+  Costs PredictMatMul(const OpInfo& op_features) const;
+  Costs PredictNoOp(const OpInfo& op_features) const;
+
+  // Utility function for safe division. Returns 0
+  // if rhs is 0 or negative.
+  static double SafeDiv(const double lhs, const double rhs) {
+    if (rhs > 0) {
+      return lhs / rhs;
+    } else {
+      return 0.0;
+    }
+  }
+
+  static ConvolutionDimensions ConvolutionDimensionsFromInputs(
+      const TensorShapeProto& original_image_shape,
+      const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+      bool* found_unknown_shapes);
+
+ protected:
+  typedef std::function<Costs(const OpInfo& op_feature)> CostImpl;
+  std::map<string, CostImpl> device_cost_impl_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffd1eac687ec70b649a5d6faa8ead5fb89d46357
--- /dev/null
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+// Wrangles the minimum number of proto fields to set up a matrix.
+void DescribeMatrix(int rows, int columns, OpInfo *op_features) {
+  auto input = op_features->add_inputs();
+  auto shape = input->mutable_shape();
+  auto shape_rows = shape->add_dim();
+  shape_rows->set_size(rows);
+  auto shape_columns = shape->add_dim();
+  shape_columns->set_size(columns);
+  input->set_dtype(DT_FLOAT);
+}
+
+// Returns an OpInfo for MatMul with the minimum set of fields set up.
+OpInfo DescribeMatMul(int m, int n, int l, int k) {
+  OpInfo op_features;
+  auto device = op_features.mutable_device();
+  device->set_type("CPU");
+  op_features.set_op("MatMul");
+
+  DescribeMatrix(m, l, &op_features);
+  DescribeMatrix(k, n, &op_features);
+  return op_features;
+}
+
+// Returns an OpInfo for MatMul with unknown input shapes.
+OpInfo DescribeMatMulUnknownShape() {
+  OpInfo op_features;
+  auto device = op_features.mutable_device();
+  device->set_type("CPU");
+  op_features.set_op("MatMul");
+
+  auto input = op_features.add_inputs();
+  auto shape = input->mutable_shape();
+  shape->set_unknown_rank(true);
+
+  input = op_features.add_inputs();
+  shape = input->mutable_shape();
+  shape->set_unknown_rank(true);
+
+  return op_features;
+}
+
+// Wrangles the minimum number of proto fields to set up a 4D Tensor for cost
+// estimation purposes.
+void DescribeTensor4D(int dim0, int dim1, int dim2, int dim3,
+                      OpInfo *op_features) {
+  auto input = op_features->add_inputs();
+  auto shape = input->mutable_shape();
+  shape->add_dim()->set_size(dim0);
+  shape->add_dim()->set_size(dim1);
+  shape->add_dim()->set_size(dim2);
+  shape->add_dim()->set_size(dim3);
+}
+
+// Returns an OpInfo for Conv2D with the minimum set of fields set up.
+OpInfo DescribeConvolution(int batch, int ix, int iy, int iz1, int iz2, int kx,
+                           int ky, int oz) {
+  OpInfo op_features;
+  auto device = op_features.mutable_device();
+  device->set_type("CPU");
+  op_features.set_op("Conv2D");
+
+  DescribeTensor4D(batch, ix, iy, iz1, &op_features);
+  DescribeTensor4D(kx, ky, iz2, oz, &op_features);
+  return op_features;
+}
+}  // namespace
+
+TEST(OpLevelCostEstimatorTest, UnknownOrPartialShape) {
+  OpLevelCostEstimator estimator;
+
+  EXPECT_EQ(false,
+            estimator.PredictCosts(DescribeMatMul(2, 4, 7, 7)).inaccurate);
+  EXPECT_EQ(true,
+            estimator.PredictCosts(DescribeMatMul(-1, 4, 7, 7)).inaccurate);
+  EXPECT_EQ(true,
+            estimator.PredictCosts(DescribeMatMul(2, 4, -1, 7)).inaccurate);
+
+  EXPECT_EQ(
+      false,
+      estimator.PredictCosts(DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256))
+          .inaccurate);
+  EXPECT_EQ(
+      true,
+      estimator.PredictCosts(DescribeConvolution(16, -1, 19, 48, 48, 5, 5, 256))
+          .inaccurate);
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_performance_data.proto b/tensorflow/core/grappler/costs/op_performance_data.proto
index 3279158f8763896967ce9cf4744e7f3e929534e4..887a714c0f77cbbd43def8d4fd5a52b546375fbf 100644
--- a/tensorflow/core/grappler/costs/op_performance_data.proto
+++ b/tensorflow/core/grappler/costs/op_performance_data.proto
@@ -18,9 +18,11 @@ syntax = "proto3";
 package tensorflow;
 option cc_enable_arenas = true;
 
+import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
 import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/protobuf/device_properties.proto";
 
 // Description of an operation as well as the parameters expected to impact its
 // performance.
@@ -31,44 +33,15 @@ message OpInfo {
   // Custom parameters impacting the behavior of the op.
   map<string, AttrValue> attr = 2;
 
-  // Input types and shapes
+  // Input types, shapes and values if known.
   message TensorProperties {
     DataType dtype = 1;
     TensorShapeProto shape = 2;
+    TensorProto value = 3;
   };
   repeated TensorProperties inputs = 3;
 
   // Device on which the operation is run.
-  message DeviceProperties {
-    // Device type (CPU, GPU, ...)
-    string type = 1;
-    // Vendor (Intel, nvidia, ...)
-    string vendor = 2;
-    // Model (Haswell, K40, ...)
-    string model = 3;
-    // Core Frequency in Mhz
-    int64 frequency = 4;
-    // Number of cores
-    int64 num_cores = 5;
-    // Version of the tools and libraries used with this device (e.g. gcc 4.9,
-    // cudnn 5.1)
-    map<string, string> environment = 6;
-    // Number of registers per core.
-    int64 num_registers = 7;
-    // L1 cache size in bytes
-    int64 l1_cache_size = 8;
-    // L2 cache size in bytes
-    int64 l2_cache_size = 9;
-    // L3 cache size in bytes
-    int64 l3_cache_size = 10;
-    // Shared memory size per multiprocessor in bytes. This field is
-    // applicable to GPUs only.
-    int64 shared_memory_size_per_multiprocessor = 11;
-    // Memory size in bytes
-    int64 memory_size = 12;
-    // Memory bandwidth in KB/s
-    int64 bandwidth = 13;
-  }
   DeviceProperties device = 4;
 }
 
diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9866bc86887e2fa1a1fcfe95e3e9673b7df1a8f3
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats.cc
@@ -0,0 +1,152 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include <algorithm>
+#include <cmath>
+
+namespace tensorflow {
+namespace grappler {
+
+// Given a sorted vector of values, calculate the median.
+// Returns 0 for an empty vector.  Does not verify sortedness.
+static double SortedMedian(const std::vector<double> &values) {
+  const int n = values.size();
+  if (n == 0) return 0.0;
+  if (n & 1) {
+    return values[n / 2];
+  } else {
+    return (values[n / 2] + values[n / 2 - 1]) / 2.0;
+  }
+}
+
+// Given a vector of values (sorted or not), calculate the median.
+static double Median(std::vector<double> &&values) {
+  const size_t n = values.size();
+  if (n == 0) return 0;
+  const auto middle = values.begin() + (n / 2);
+  // Put the middle value in its place.
+  std::nth_element(values.begin(), middle, values.end());
+  if (n & 1) {
+    return *middle;
+  }
+  // Return the average of the two elements, the max_element lower than
+  // *middle is found between begin and middle as a post-cond of
+  // nth_element.
+  const auto lower_middle = std::max_element(values.begin(), middle);
+  // Preventing overflow. We know that '*lower_middle <= *middle'.
+  // If both are on opposite sides of zero, the sum won't overflow, otherwise
+  // the difference won't overflow.
+  if (*lower_middle <= 0 && *middle >= 0) {
+    return (*lower_middle + *middle) / 2;
+  }
+  return *lower_middle + (*middle - *lower_middle) / 2;
+}
+
+// Given a set of values, calculates the scaled Median Absolute Deviation (a
+// robust approximation to the standard deviation).  This is calculated as the
+// median of the absolute deviations from the median, scaled by 1.4826.  Its
+// advantage over the standard deviation is that it is not (as) affected by
+// outlier values.  Returns a pair<median, mad>.
+static std::pair<double, double> ScaledMedianAbsoluteDeviation(
+    const std::vector<double> &sorted_values) {
+  double median = SortedMedian(sorted_values);
+
+  // Next, we calculate the absolute deviations from the median,
+  // find the median of the resulting data, and scale by 1.4826.
+  std::vector<double> deviations;
+  deviations.reserve(sorted_values.size());
+  for (double d : sorted_values) {
+    deviations.push_back(std::abs(d - median));
+  }
+  double mad = Median(std::move(deviations)) * 1.4826;
+  return std::pair<double, double>(median, mad);
+}
+
+RobustStats::RobustStats(const std::vector<double> &values)
+    : RobustStats(std::vector<double>(values)) {}
+
+RobustStats::RobustStats(std::vector<double> &&values) {
+  std::sort(values.begin(), values.end());
+  lo_ = values[0];
+  hi_ = values.back();
+  HuberMAD(values);
+}
+
+// Computes an updated mean using Huber's weighting function (values beyond
+// the margin are weighted by margin / abs(value - mean).
+double UpdateHuberMean(const std::vector<double> &sorted_values, double mean,
+                       double margin) {
+  int num_within = 0;
+  double sum = 0.0;
+
+  for (double d : sorted_values) {
+    if (d < mean - margin) {
+      sum -= margin;
+    } else if (d > mean + margin) {
+      sum += margin;
+    } else {
+      sum += d;
+      ++num_within;
+    }
+  }
+
+  // It is possible, for a set with an interquartile distance of 0, i.e., with
+  // more than half of the values at the median, to encounter the case where
+  // the Huber mean drifts slightly off the median and there are no values
+  // within the margin.  In that case, just return the old mean, and the caller
+  // will quit.
+  if (num_within > 0) {
+    return sum / num_within;
+  } else {
+    return mean;
+  }
+}
+
+// Given a list of values, this approximates the stddev using the MAD and then
+// uses it to compute a Huber robust mean (sandwich mean).  A margin of
+// c*stddev is defined around the current mean, and values are weighted by
+// margin / abs(value - mean) if outside the margin, or 1 if inside.  This
+// computes the mean iteratively, because each time it changes the margin
+// shifts a bit.  It typically settles very quickly, but it's possible for it
+// to be unstable.  We limit it to 10 iterations.
+//
+void RobustStats::HuberMAD(const std::vector<double> &sorted_values) {
+  const std::pair<double, double> median_mad =
+      ScaledMedianAbsoluteDeviation(sorted_values);
+  mean_ = median_mad.first;
+  stddev_ = median_mad.second;
+
+  // c = 1.345 is the commonly used cutoff with 95% efficiency at the normal.
+  // We're using c = 1.5 to be a little more conservative, and because that's
+  // the default in S-plus.
+  // TODO(dehnert): Specialize Stats for integral types so we don't implement
+  // methods that don't make sense.
+  const double c = 1.5;
+  const double margin = c * stddev_;
+
+  // Iterate 10 times, or until the Huber mean stabilizes.
+  // If the margin is zero, we don't want mean to drift from the median.
+  if (margin > 0.0) {
+    for (int k = 0; k < 10; ++k) {
+      double old_mean = mean_;
+      mean_ = UpdateHuberMean(sorted_values, mean_, margin);
+      if (mean_ == old_mean) break;
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/xla/port/initialize.h b/tensorflow/core/grappler/costs/robust_stats.h
similarity index 51%
rename from tensorflow/compiler/xla/port/initialize.h
rename to tensorflow/core/grappler/costs/robust_stats.h
index 13d9632f97c72296e9a335c2a10edefa9abc0e17..9d8f5bc970ad9cde6e5c31ce0df72272e35d1662 100644
--- a/tensorflow/compiler/xla/port/initialize.h
+++ b/tensorflow/core/grappler/costs/robust_stats.h
@@ -13,27 +13,30 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_
-#define TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_
+#ifndef TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
+#define TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
 
-#undef REGISTER_MODULE_INITIALIZER
-
-namespace xla {
-
-class Initializer {
+#include <vector>
+namespace tensorflow {
+namespace grappler {
+class RobustStats {
  public:
-  typedef void (*InitializerFunc)();
-  explicit Initializer(InitializerFunc func) { func(); }
-};
+  RobustStats(const std::vector<double>& values);
+  RobustStats(std::vector<double>&& values);
 
-}  // namespace xla
+  double lo() const { return lo_; }
+  double hi() const { return hi_; }
+  double mean() const { return mean_; }
 
-#define REGISTER_INITIALIZER(type, name, body)         \
-  static void google_init_##type##_##name() { body; }  \
-  xla::Initializer google_initializer_##type##_##name( \
-      google_init_##type##_##name)
+ private:
+  void HuberMAD(const std::vector<double>& values);
 
-#define REGISTER_MODULE_INITIALIZER(name, body) \
-  REGISTER_INITIALIZER(module, name, body)
+  double lo_;
+  double hi_;
+  double mean_;
+  double stddev_;
+};
+}  // namespace grappler
+}  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_XLA_PORT_INITIALIZE_H_
+#endif  // TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
diff --git a/tensorflow/core/grappler/costs/robust_stats_test.cc b/tensorflow/core/grappler/costs/robust_stats_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..924097b126d395f0d2bdb1285b49c9891d6c8c10
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class RobustStatsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    for (double d = 1.0; d <= 5.0; d += 1.0) {
+      values1_.push_back(5.0 - d);
+      values1_.push_back(5.0 + d);
+      values2_.push_back(25.0 - 2 * d);
+      values2_.push_back(25.0 + 2 * d);
+      values3_.push_back(-3.0 - d);
+      values3_.push_back(-3.0 + d);
+    }
+    values1_.push_back(5.0);  // Odd # elements, mean is 5.0
+    values3_.push_back(197.0);
+    values3_.push_back(-203.0);  // Even # elements, mean is -3.0
+  }
+
+  std::vector<double> values1_;
+  std::vector<double> values2_;
+  std::vector<double> values3_;
+};
+
+TEST_F(RobustStatsTest, Simple) {
+  RobustStats s1(values1_);
+  EXPECT_EQ(5.0, s1.mean());
+  EXPECT_EQ(0.0, s1.lo());
+  EXPECT_EQ(10.0, s1.hi());
+
+  RobustStats s2(values2_);
+  EXPECT_EQ(25.0, s2.mean());
+  EXPECT_EQ(15.0, s2.lo());
+  EXPECT_EQ(35.0, s2.hi());
+
+  RobustStats s3(values3_);
+  EXPECT_EQ(-3.0, s3.mean());
+  EXPECT_EQ(-203.0, s3.lo());
+  EXPECT_EQ(197.0, s3.hi());
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 19266208fe1a3efa9876c15e5a7bc86d402bb7e9..209eccd40d1c4272dd75a6dac6d3d0cd287f5850 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -26,49 +26,139 @@ limitations under the License.
 #include "cuda/include/cudnn.h"
 #endif
 
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/clusters/utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
+static OpInfo::TensorProperties UnknownInput() {
+  OpInfo::TensorProperties input;
+  input.set_dtype(DataType::DT_INVALID);
+  input.mutable_shape()->set_unknown_rank(true);
+  return input;
+}
+
+static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
+  std::vector<TensorProto> tensors;
+  switch (attr_value.value_case()) {
+    case AttrValue::kTensor: {
+      tensors.push_back(attr_value.tensor());
+      break;
+    }
+    case AttrValue::kList: {
+      for (const auto& tensor_proto : attr_value.list().tensor()) {
+        tensors.push_back(tensor_proto);
+      }
+      break;
+    }
+    default: {}
+  }
+  return tensors;
+}
+
+static void ExtractExtraProperties(
+    const NodeDef& node,
+    const std::unordered_map<string, const NodeDef*>& name_to_node,
+    std::vector<OpInfo::TensorProperties>* extra_inputs,
+    protobuf::Map<string, AttrValue>* attr_map) {
+  OpRegistry* op_registry = OpRegistry::Global();
+  const OpDef* op_def;
+  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
+  if (!s.ok()) {
+    op_def = nullptr;
+  }
+
+  for (int i = 0; i < node.input_size(); ++i) {
+    const string input_name = node.input(i);
+    CHECK(!input_name.empty());
+    TensorId input_tensor_id = ParseTensorName(input_name);
+    const string input_node_name = input_tensor_id.first.ToString();
+
+    auto iter = name_to_node.find(input_node_name);
+    if (iter == name_to_node.end()) continue;
+    const NodeDef* input_node = iter->second;
+
+    // The value attribute in Const input is useful for cost prediction.
+    if (input_node->op() == "Const") {
+      auto it = input_node->attr().find("value");
+      if (it == input_node->attr().end()) continue;
+
+      const AttrValue& attr_value = it->second;
+      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
+      if (tensors.empty()) continue;
+
+      const TensorProto& t = tensors[0];
+      OpInfo::TensorProperties input;
+      input.set_dtype(t.dtype());
+      *(input.mutable_shape()) = t.tensor_shape();
+      *(input.mutable_value()) = t;
+      extra_inputs->push_back(input);
+
+      // For filename input, the file size can also be useful.
+      if (op_def &&
+          op_def->input_arg(i).name().find("filename") != std::string::npos) {
+        Tensor tensor;
+        CHECK(tensor.FromProto(t));
+        const string filename = tensor.scalar<string>()();
+
+        Env* env = Env::Default();
+        FileStatistics stat;
+        Status s = env->Stat(filename, &stat);
+        if (s.ok()) {
+          AttrValue attr;
+          attr.set_i(stat.length);
+          string attr_key = strings::StrCat("input_", i, "_filesize");
+          (*attr_map)[attr_key] = attr;
+        }
+      }
+    }
+
+    // When the input is a handle (e.g. look up table handle), the information
+    // in the op itself is not sufficient to predict the op memory.
+    if (op_def &&
+        op_def->input_arg(i).name().find("handle") != std::string::npos) {
+      string new_key = strings::StrCat("parent_", i, "_op");
+      AttrValue attr;
+      attr.set_s(input_node->op());
+      (*attr_map)[new_key] = attr;
+      // TODO(yuefengz): Only parent node's op name is copied. Copy inputs
+      // and attributes when necessary.
+    }
+  }
+}
+
 std::vector<OpInfo::TensorProperties> FindInputFeatures(
     const NodeDef& node,
-    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost) {
+    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
+    const std::unordered_map<string, const NodeDef*>& name_to_node) {
   std::vector<OpInfo::TensorProperties> inputs;
   for (const auto& input_name : node.input()) {
-    // Skip control inputs. These are prefixed with the ^ character.
     CHECK(!input_name.empty());
-    if (input_name[0] == '^') {
-      continue;
-    }
+    TensorId input_tensor_id = ParseTensorName(input_name);
+    const string input_node_name = input_tensor_id.first.ToString();
+    const int output_index = input_tensor_id.second;
 
-    // Each input is "node_name:output_imdex" with "node_name" being a string
-    // name and "output_index" indicating which output tensor to use from
-    // "node_name". If "output_index" is 0 the ":0" suffix can be omitted.
-    string input_node_name;
-    int output_index = -1;
-    const size_t pos = input_name.rfind(':');
-    if (pos == string::npos) {
-      input_node_name = input_name;
-      output_index = 0;
-    } else {
-      string index = input_name.substr(pos);
-      if (strings::safe_strto32(index, &output_index)) {
-        input_node_name = input_name.substr(0, pos);
-      }
+    // Skip control inputs.
+    if (output_index == Graph::kControlSlot) {
+      continue;
     }
 
-    auto it = name_to_cost.find(input_name);
+    auto it = name_to_cost.find(input_node_name);
     if (it == name_to_cost.end() || output_index < 0) {
-      OpInfo::TensorProperties input;
-      input.set_dtype(DataType::DT_INVALID);
-      input.mutable_shape()->set_unknown_rank(true);
-      inputs.push_back(input);
+      inputs.push_back(UnknownInput());
     } else {
       const CostGraphDef::Node* input_cost = it->second;
       const CostGraphDef::Node::OutputInfo& output =
@@ -83,76 +173,44 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
   return inputs;
 }
 
-OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
+DeviceProperties GetDeviceInfo(const string& device_str) {
   DeviceNameUtils::ParsedName parsed;
-  if (DeviceNameUtils::ParseFullName(node.device(), &parsed)) {
+  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
     if (parsed.type == "GPU") {
       return GetLocalGPUInfo(parsed.id);
     } else if (parsed.type == "CPU") {
       return GetLocalCPUInfo();
     }
   }
-  OpInfo::DeviceProperties device;
+  DeviceProperties device;
   device.set_type("UNKNOWN");
   return device;
 }
 
-OpInfo::DeviceProperties GetLocalCPUInfo() {
-  OpInfo::DeviceProperties device;
-  device.set_type("CPU");
-
-  device.set_num_cores(port::NumSchedulableCPUs());
-  device.set_l1_cache_size(Eigen::l1CacheSize());
-  device.set_l2_cache_size(Eigen::l2CacheSize());
-  device.set_l3_cache_size(Eigen::l3CacheSize());
-
-  (*device.mutable_environment())["cpu_instruction_set"] =
-      Eigen::SimdInstructionSetsInUse();
-
-  (*device.mutable_environment())["eigen"] = strings::StrCat(
-      EIGEN_WORLD_VERSION, ".", EIGEN_MAJOR_VERSION, ".", EIGEN_MINOR_VERSION);
-#ifdef EIGEN_USE_LIBXSMM
-  (*device.mutable_environment())["libxsmm"] = LIBXSMM_VERSION;
-#endif
-
-  return device;
+DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
+  return GetDeviceInfo(node.device());
 }
 
-OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id) {
-  OpInfo::DeviceProperties device;
-  device.set_type("GPU");
-
-#if GOOGLE_CUDA
-  cudaDeviceProp properties;
-  cudaError_t error = cudaGetDeviceProperties(&properties, gpu_id);
-  if (error == cudaSuccess) {
-    device.set_vendor("NVidia");
-    device.set_model(properties.name);
-    device.set_frequency(properties.clockRate / 1000);
-    device.set_num_cores(properties.multiProcessorCount);
-    device.set_num_registers(properties.regsPerMultiprocessor);
-    // For compute capability less than 5, l1 cache size is configurable to
-    // either 16 KB or 48 KB. We use the initial configuration 16 KB here. For
-    // compute capability larger or equal to 5, l1 cache (unified with texture
-    // cache) size is 24 KB. This number may need to be updated for future
-    // compute capabilities.
-    device.set_l1_cache_size((properties.major < 5) ? 16 * 1024 : 24 * 1024);
-    device.set_l2_cache_size(properties.l2CacheSize);
-    device.set_l3_cache_size(0);
-    device.set_shared_memory_size_per_multiprocessor(
-        properties.sharedMemPerMultiprocessor);
-    device.set_memory_size(properties.totalGlobalMem);
-    // 8 is the number of bits per byte. 2 is accounted for
-    // double data rate (DDR).
-    device.set_bandwidth(properties.memoryBusWidth / 8 *
-                         properties.memoryClockRate * 2);
+OpInfo BuildOpInfo(
+    const NodeDef& node, const string& device_str,
+    const std::unordered_map<string, const NodeDef*>& name_to_node,
+    const std::vector<OpInfo::TensorProperties>& inputs) {
+  OpInfo op_info;
+  op_info.set_op(node.op());
+  *op_info.mutable_attr() = node.attr();
+  *op_info.mutable_device() = GetDeviceInfo(device_str);
+  for (auto& input : inputs) {
+    *op_info.add_inputs() = input;
   }
 
-  (*device.mutable_environment())["cuda"] = strings::StrCat(CUDA_VERSION);
-  (*device.mutable_environment())["cudnn"] = strings::StrCat(CUDNN_VERSION);
-#endif
+  std::vector<OpInfo::TensorProperties> extra_inputs;
+  ExtractExtraProperties(node, name_to_node, &extra_inputs,
+                         op_info.mutable_attr());
+  for (auto& input : extra_inputs) {
+    *op_info.add_inputs() = input;
+  }
 
-  return device;
+  return op_info;
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/utils.h b/tensorflow/core/grappler/costs/utils.h
index 79be9061281b534cc29542332ece420cab3027ff..bdba4e4b15618a48c277e94a36577c851c0186df 100644
--- a/tensorflow/core/grappler/costs/utils.h
+++ b/tensorflow/core/grappler/costs/utils.h
@@ -22,9 +22,11 @@ limitations under the License.
 
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/graph/types.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -32,20 +34,24 @@ namespace grappler {
 // Returns a vector of InputProperties for 'node'. The vector will contain one
 // entry for each input of 'node'.
 // For each node in the graph, the 'name_to_cost' map stores a pointer to the
-// corresponding cost graph node indexed by node name.
+// corresponding cost graph node indexed by node name. The 'name_to_node' maps a
+// node name to its node definition.
 std::vector<OpInfo::TensorProperties> FindInputFeatures(
     const NodeDef& node,
-    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost);
+    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
+    const std::unordered_map<string, const NodeDef*>& name_to_node);
 
 // Returns the DeviceProperties of the device on which 'node' runs.
-OpInfo::DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
-
-// Returns the DeviceProperties of the CPU on which grappler is running.
-OpInfo::DeviceProperties GetLocalCPUInfo();
-
-// Returns the DeviceProperties for the specified GPU attached to the server on
-// which grappler is running.
-OpInfo::DeviceProperties GetLocalGPUInfo(int gpu_id);
+DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
+DeviceProperties GetDeviceInfo(const string& device_str);
+
+// Builds the OpInfo proto for node, given all nodes in the graph, the node's
+// device and its input properties which are typically built by shape inference
+// or calling FindInputFeatures.
+OpInfo BuildOpInfo(
+    const NodeDef& node, const string& device_str,
+    const std::unordered_map<string, const NodeDef*>& name_to_node,
+    const std::vector<OpInfo::TensorProperties>& inputs);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_placer.cc b/tensorflow/core/grappler/costs/virtual_placer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eafa6789feb6e35983775bf70266d790ec992d4c
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_placer.cc
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+VirtualPlacer::VirtualPlacer(const Cluster* cluster) : has_gpu_(false) {
+  CHECK(cluster);
+  devices_ = cluster->GetDevices();
+  for (const auto& device : devices_) {
+    if (str_util::Lowercase(device.first).find("gpu") != string::npos) {
+      has_gpu_ = true;
+    }
+  }
+
+  unknown_device_.set_type("UNKNOWN");
+}
+
+const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
+  DeviceNameUtils::ParsedName parsed;
+  if (!node.device().empty()) {
+    auto it = devices_.find(node.device());
+    if (it != devices_.end()) {
+      return it->second;
+    }
+    if (DeviceNameUtils::ParseLocalName(node.device(), &parsed)) {
+      string device_name =
+          strings::StrCat("/job:localhost/replica:0/task:0/",
+                          str_util::Lowercase(parsed.type), ":", parsed.id);
+      it = devices_.find(device_name);
+      if (it != devices_.end()) {
+        return it->second;
+      }
+    }
+    return unknown_device_;
+  }
+  string device;
+  if (has_gpu_) {
+    device = "/job:localhost/replica:0/task:0/gpu:0";
+  } else {
+    device = "/job:localhost/replica:0/task:0/cpu:0";
+  }
+  auto it = devices_.find(device);
+  if (it == devices_.end()) {
+    return unknown_device_;
+  }
+  return it->second;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_placer.h b/tensorflow/core/grappler/costs/virtual_placer.h
new file mode 100644
index 0000000000000000000000000000000000000000..40cd64e37c1f1df62956accabb56971c82a65dac
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_placer.h
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
+#define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
+
+#include <unordered_map>
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+namespace tensorflow {
+class NodeDef;
+
+namespace grappler {
+class Cluster;
+
+// The virtual placer emulates the behavior of the TF placer.
+class VirtualPlacer {
+ public:
+  VirtualPlacer(const Cluster* cluster);
+
+  const DeviceProperties& get_device(const NodeDef& node) const;
+
+ private:
+  std::unordered_map<string, DeviceProperties> devices_;
+  bool has_gpu_;
+  DeviceProperties unknown_device_;
+};
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f77d7677ace0bc5b0ab885fce4643ff918a872a
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -0,0 +1,215 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+Costs CombineCosts(const Costs& left, const Costs& right) {
+  CHECK_NE(left.max_memory, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_streaming, kMemoryUnknown);
+
+  Costs result = left;
+  result.execution_time += right.execution_time;
+  if (right.max_memory != kMemoryUnknown) {
+    result.max_memory += right.max_memory;
+  }
+  if (right.max_per_op_buffers != kMemoryUnknown) {
+    result.max_per_op_buffers =
+        std::max(left.max_per_op_buffers, right.max_per_op_buffers);
+  }
+  if (right.max_per_op_streaming != kMemoryUnknown) {
+    result.max_per_op_streaming =
+        std::max(left.max_per_op_streaming, right.max_per_op_streaming);
+  }
+  VLOG(2) << "costs execution_time=" << result.execution_time.count()
+          << " max_memory=" << result.max_memory
+          << " max_per_op_buffers=" << result.max_per_op_buffers
+          << " max_per_op_streaming=" << result.max_per_op_streaming;
+  return result;
+}
+}  // namespace
+
+VirtualScheduler::VirtualScheduler(const GraphDef& graph,
+                                   const std::vector<string>& fetch_nodes)
+    : graph_costs_(Costs::ZeroCosts()),
+      // TODO(dyoon): Use a better way than FIFO.
+      ready_nodes_(new FIFOManager()) {
+  // First, get the nodes that would run to output fetch_nodes.
+  std::vector<const NodeDef*> nodes =
+      ComputeTransitiveFanin(graph, fetch_nodes);
+
+  // TODO(dyoon): this is a bit inefficient as name_to_node is already built in
+  // ComputeTransitiveFanin().
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (const auto& node : graph.node()) {
+    name_to_node[node.name()] = &node;
+  }
+
+  // Build node_map.
+  for (const auto* node : nodes) {
+    auto& node_state = GetNodeStateOrCreateIt(node);
+    // TODO(dyoon): add SendRecv considering devices and control dependency.
+    for (const string& input : node->input()) {
+      const NodeDef* in = name_to_node[NodeName(input)];
+      CHECK(in);
+      node_state.inputs.push_back(in);
+      auto& input_node_state = GetNodeStateOrCreateIt(in);
+      input_node_state.outputs.push_back(node);
+    }
+    if (node->input().empty()) {
+      node_state.time_ready =
+          Costs::Duration();  // Node without input: ready at time 0.
+      ready_nodes_->AddNode(node);
+    }
+  }
+}
+
+const NodeDef* VirtualScheduler::GetCurrNode() const {
+  return ready_nodes_->GetCurrNode();
+}
+
+NodeState& VirtualScheduler::GetNodeStateOrCreateIt(const NodeDef* node) {
+  auto it = node_map_.find(node);
+  if (it == node_map_.end()) {
+    it = node_map_.emplace(node, NodeState()).first;
+  }
+  return it->second;
+}
+
+bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
+  // Update graph_costs_ and per-op costs.
+  graph_costs_ = CombineCosts(graph_costs_, node_costs);
+  const auto* node = GetCurrNode();
+  const auto& op_name = node->op();
+
+  auto it = op_to_cost_.find(op_name);
+  if (it == op_to_cost_.end()) {
+    it = op_to_cost_.emplace(op_name, Costs::ZeroCosts()).first;
+  }
+  auto& op_cost = it->second;
+  op_cost = CombineCosts(op_cost, node_costs);
+
+  // Update node and device states.
+  auto& node_state = node_map_[node];
+  auto& device = device_[node->device()];
+  device.nodes_executed.push_back(node);
+  // Node is scheduled when the device is available AND all the inputs are
+  // ready; hence, time_scheduled is time_ready if time_ready > device curr
+  // time.
+  node_state.time_scheduled =
+      std::max(device.GetCurrTime(), node_state.time_ready);
+  // Override device curr time with the time_scheduled.
+  device.device_costs.execution_time = node_state.time_scheduled;
+  device.device_costs = CombineCosts(device.device_costs, node_costs);
+  auto curr_time = device.GetCurrTime();
+  node_state.time_finished = curr_time;
+
+  // Update device's per-op cost.
+  {
+    auto it = device.op_to_cost.find(op_name);
+    if (it == device.op_to_cost.end()) {
+      it = device.op_to_cost.emplace(op_name, Costs::ZeroCosts()).first;
+    }
+    auto& op_cost = it->second;
+    op_cost = CombineCosts(op_cost, node_costs);
+
+    VLOG(2) << "Op scheduled -- name: " << node->name()
+            << ", op: " << node->op() << ", device: " << node->device()
+            << ", ready: " << node_state.time_ready.count()
+            << ", scheduled: " << node_state.time_scheduled.count()
+            << ", finished: " << node_state.time_finished.count();
+
+    // Increment num_inputs_ready of the output nodes.
+    for (auto* output : node_state.outputs) {
+      auto& output_state = node_map_[output];
+      output_state.num_inputs_ready++;
+      if (output_state.num_inputs_ready == output_state.inputs.size()) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output);
+      }
+    }
+
+    // Increment num_outputs_executed of the input nodes.
+    for (auto* input : node_state.inputs) {
+      auto& input_state = node_map_[input];
+      input_state.num_outputs_executed++;
+      if (input_state.num_outputs_executed == input_state.outputs.size()) {
+        // All the outputs are executed; no reference to this input nodel
+        input_state.time_no_reference = curr_time;
+        // TODO(dyoon): collect device memory usage; note that this input node
+        // use device memory between time_scheduled and time_no_reference.
+      }
+    }
+  }
+
+  // Remove the current node; assume FIFO.
+  ready_nodes_->RemoveCurrNode();
+  return !ready_nodes_->Empty();  // True if not empty.
+}
+
+Costs VirtualScheduler::Summary() const {
+  // Print out basic execution summary.
+  VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
+  VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
+  VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
+  VLOG(1) << "Expected max per-op streaming buffers: "
+          << graph_costs_.max_per_op_streaming;
+
+  VLOG(1) << "Per-op execution time:";
+  for (const auto& op_cost_pair : op_to_cost_) {
+    const auto& op = op_cost_pair.first;
+    const auto& cost = op_cost_pair.second.execution_time.count();
+    if (cost) {  // Skip printing out zero-cost ops.
+      VLOG(1) << " + " << op << " : " << cost;
+    }
+  }
+
+  // Print per device summary
+  VLOG(1) << "Devices:";
+  Costs critical_path_costs = Costs::ZeroCosts();
+
+  for (const auto& device : device_) {
+    const auto& name = device.first;
+    const auto& state = device.second;
+    VLOG(1) << "Device = " << name
+            << ", num_nodes = " << state.nodes_executed.size()
+            << ", execution_time = " << state.GetCurrTime().count();
+    VLOG(1) << "Per-op execution time:";
+    for (const auto& op_cost_pair : state.op_to_cost) {
+      const auto& op = op_cost_pair.first;
+      const auto& cost = op_cost_pair.second.execution_time.count();
+      if (cost) {  // Skip printing out zero-cost ops.
+        VLOG(1) << " + " << op << " : " << cost;
+      }
+    }
+    if (critical_path_costs.execution_time <= state.GetCurrTime()) {
+      critical_path_costs = state.device_costs;
+    }
+  }
+
+  VLOG(1) << "Critical path execution time: "
+          << critical_path_costs.execution_time.count();
+  return critical_path_costs;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d437dff50ef37c13a7a210ffdd68ba5ccd57ef1
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+
+#include <list>
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+struct NodeState {
+  std::vector<const NodeDef*> inputs;
+  std::vector<const NodeDef*> outputs;
+  int num_inputs_ready;
+  int num_outputs_executed;
+  Costs::Duration time_ready;
+  Costs::Duration time_scheduled;
+  Costs::Duration time_finished;
+  Costs::Duration time_no_reference;
+
+  // Node will be ready to be executed at time_ready, scheduled at
+  // time_scheduled, and finishes execution at time_finished.
+  // Between time_scheduled and time_no_reference, the node's output tensor
+  // needs to be on the device, using up device memory.
+
+  NodeState() {
+    num_inputs_ready = 0;
+    num_outputs_executed = 0;
+    time_ready = Costs::Duration::max();
+    time_scheduled = Costs::Duration::max();
+    time_finished = Costs::Duration::max();
+    time_no_reference = Costs::Duration::max();
+  }
+};
+
+struct DeviceState {
+  std::vector<const NodeDef*> nodes_executed;
+  Costs device_costs;
+  std::map<string, Costs> op_to_cost;  // Per-op cost.
+
+  DeviceState() { device_costs = Costs::ZeroCosts(); }
+
+  Costs::Duration GetCurrTime() const { return device_costs.execution_time; }
+};
+
+// ReadyNodeManager (abstract class):
+// Keeps ready nodes and picks the best one to be scheduled.
+class ReadyNodeManager {
+ public:
+  ReadyNodeManager() {}
+  virtual ~ReadyNodeManager() {}
+  virtual void AddNode(const NodeDef* node) = 0;
+  virtual const NodeDef* GetCurrNode() const = 0;
+  virtual void RemoveCurrNode() = 0;
+  virtual bool Empty() const = 0;
+};
+
+class FIFOManager : public ReadyNodeManager {
+ public:
+  FIFOManager() : ReadyNodeManager() {}
+  ~FIFOManager() override {}
+  void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
+  const NodeDef* GetCurrNode() const override { return nodes_.front(); }
+  void RemoveCurrNode() override { nodes_.pop_front(); }
+  bool Empty() const override { return nodes_.empty(); }
+
+ private:
+  std::list<const NodeDef*> nodes_;
+};
+
+// The virtual scheduler emulates execution of nodes in a graph, considering
+// dependencies, device, etc.
+class VirtualScheduler {
+ public:
+  VirtualScheduler(const GraphDef& graph,
+                   const std::vector<string>& fetch_nodes);
+
+  const NodeDef* GetCurrNode() const;
+  bool MarkCurrNodeExecuted(const Costs& node_costs);
+
+  Costs Summary() const;
+
+ private:
+  NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
+
+  Costs graph_costs_;                   // Graph cost.
+  std::map<string, Costs> op_to_cost_;  // Per-op cost.
+  std::unique_ptr<ReadyNodeManager> ready_nodes_;
+  std::unordered_map<const NodeDef*, NodeState> node_map_;
+  std::unordered_map<string, DeviceState> device_;
+};
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index d3fc9044d3b7434335ac57a5c4ecf73c0d5d7444..b318ac22d4babe96968e94730691dc0d0664d585 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -53,6 +53,22 @@ int GetNumAvailableGPUs() {
   return num_eligible_gpus;
 }
 
+int64 AvailableGPUMemory(int gpu_id) {
+#if GOOGLE_CUDA
+  // Look up the device, to see its attributes.
+  perftools::gputools::Platform* gpu_platform = GPUMachineManager();
+  CHECK_LT(gpu_id, gpu_platform->VisibleDeviceCount());
+  perftools::gputools::StreamExecutor* se =
+      gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
+  int64 total_memory, available_memory;
+  CHECK(se->DeviceMemoryUsage(&available_memory, &total_memory));
+
+  return available_memory;
+#else
+  return 0;
+#endif
+}
+
 int GetNumAvailableLogicalCPUCores() { return port::NumSchedulableCPUs(); }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/devices.h b/tensorflow/core/grappler/devices.h
index 329e8e2e65581b3199469328eda901a19879334e..2d6c41888d92e044fbfd06104f2079562c37fcfc 100644
--- a/tensorflow/core/grappler/devices.h
+++ b/tensorflow/core/grappler/devices.h
@@ -29,6 +29,10 @@ namespace grappler {
 // than 8.
 int GetNumAvailableGPUs();
 
+// Maximum amount of gpu memory available per gpu. gpu_id must be in the range
+// [0, num_available_gpu)
+int64 AvailableGPUMemory(int gpu_id);
+
 // Get the number of logical CPU cores (aka hyperthreads) available.
 int GetNumAvailableLogicalCPUCores();
 
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 38db9eff7e8eca6155db2b73ebfe41941bf27d72..312a457abf447be6ce291563505d77a4d7b30768 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
@@ -37,7 +38,7 @@ std::vector<const NodeDef*> GrapplerItem::MainVariables() const {
   std::vector<const NodeDef*> fanin = ComputeTransitiveFanin(graph, init_ops);
   std::vector<const NodeDef*> vars;
   for (const NodeDef* node : fanin) {
-    if (node->op() == "Variable" || node->op() == "VariableV2") {
+    if (IsVariable(*node)) {
       vars.push_back(node);
     }
   }
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index cb21ae54f0b472469c60532658320c36cca2b1fe..e0709c682b003cd961d13902d1b7192f85c3f2b9 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -42,6 +42,8 @@ struct GrapplerItem {
 
   // Initialization op(s).
   std::vector<string> init_ops;
+  // Expected initialization time in seconds, or 0 if unknown
+  int64 expected_init_time = 0;
 
   // Queue runner(s) required to run the queue(s) of this model.
   std::vector<QueueRunnerDef> queue_runners;
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 7889b0e02593049c2f8f482a690517de861e78df..02eecb0ac570597113c247928166e15814e25d18 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -19,11 +19,18 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/variable.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/inputs/utils.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 
@@ -50,6 +57,99 @@ void InitializeTensor(DataType type, Tensor* tensor) {
            tensor->tensor_data().size());
   }
 }
+
+// Helper function that returns a bool indicating if there are function
+// call nodes in graph.
+bool HasFunctionInGraph(const Graph& graph) {
+  for (const Node* n : graph.nodes()) {
+    if (graph.flib_def().Find(n->type_string()) != nullptr) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Wrapper around FunctionDefToBodyHelper that creates a FunctionBody
+// for function_def.
+Status CreateFunctionBody(const FunctionLibraryDefinition& function_library,
+                          const FunctionDef& function_def,
+                          const NodeDef& node_def,
+                          FunctionBody** function_body) {
+  std::function<Status(const string&, const OpDef**)> get_function_signature =
+      [&function_library](const string& name, const OpDef** signature) {
+        return function_library.LookUpOpDef(name, signature);
+      };
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      function_def, AttrSlice(node_def), &function_library,
+      get_function_signature, function_body));
+  return Status::OK();
+}
+
+// Inlines all functions in a Graph.  Does not recursively inline, so if graph
+// contains Function A that calls Function B, calling InlineFunctions once will
+// produce a graph with A inlined but not B.  Calling InlineFunctions a second
+// time will produce a graph with both A and B inlined.
+Status InlineFunctions(Graph* graph) {
+  const FunctionLibraryDefinition& function_library = graph->flib_def();
+  std::vector<std::pair<Node*, FunctionBody*>> nodes_and_funcs_to_inline;
+  std::unordered_map<string, std::unique_ptr<FunctionBody>>
+      function_name_to_body;
+  std::function<Status(const string&, const OpDef**)> get_function_signature =
+      [&function_library](const string& name, const OpDef** signature) {
+        return function_library.LookUpOpDef(name, signature);
+      };
+
+  for (Node* node : graph->nodes()) {
+    const FunctionDef* function_def =
+        function_library.Find(node->type_string());
+    if (!function_def) {
+      // Not a function node.
+      continue;
+    }
+    FunctionBody* function_body = nullptr;
+    const string key = Canonicalize(node->def().op(), AttrSlice(node->def()));
+    if (function_name_to_body.find(key) == function_name_to_body.end()) {
+      TF_RETURN_IF_ERROR(CreateFunctionBody(function_library, *function_def,
+                                            node->def(), &function_body));
+      function_name_to_body.emplace(
+          key, std::unique_ptr<FunctionBody>(function_body));
+    }
+    function_body = function_name_to_body[key].get();
+    if (function_body) {
+      nodes_and_funcs_to_inline.emplace_back(node, function_body);
+    }
+  }
+
+  for (const auto& iter : nodes_and_funcs_to_inline) {
+    InlineFunctionBody(function_library, graph, iter.first, iter.second);
+  }
+  return Status::OK();
+}
+
+// Sets *inlined_graph to be graph with all function NodeDefs in graph inlined.
+// Recursively inlines, so if graph contains Function A that calls Function B,
+// calling InlineAllFunctions once will produce a graph with both A and B
+// inlined.
+Status InlineAllFunctions(const GraphDef& graph_def,
+                          GraphDef* inlined_graph_def) {
+  *inlined_graph_def = GraphDef::default_instance();
+  // Create a Graph from graph_def. Inlining needs to happen
+  // on a single Graph object in order to guarantee unique
+  // names of nodes created during the inlining process.
+  GraphConstructorOptions graph_ctor_opts;
+  graph_ctor_opts.allow_internal_ops = true;
+  graph_ctor_opts.expect_device_spec = false;
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
+                                             graph_def.library());
+  Graph inlined_graph(function_library);
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &inlined_graph));
+  while (HasFunctionInGraph(inlined_graph)) {
+    TF_RETURN_IF_ERROR(InlineFunctions(&inlined_graph));
+  }
+  inlined_graph.ToGraphDef(inlined_graph_def);
+  return Status::OK();
+}
 }  // namespace
 
 // static
@@ -63,6 +163,15 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   new_item->id = id;
   new_item->graph = meta_graph.graph_def();
 
+  if (cfg.inline_functions) {
+    Status s = InlineAllFunctions(meta_graph.graph_def(), &new_item->graph);
+    if (!s.ok()) {
+      LOG(ERROR) << "Unable to inline functions: " << s.error_message()
+                 << ", skipping this input.";
+      return nullptr;
+    }
+  }
+
   // Attempt to detect the fetch node(s).
   if (meta_graph.collection_def().count("train_op") > 0) {
     const CollectionDef& nodes = meta_graph.collection_def().at("train_op");
@@ -85,12 +194,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   }
 
   for (auto& node : *new_item->graph.mutable_node()) {
-    // Delete user specified placement if requested.
-    if (cfg.ignore_user_placement) {
-      node.clear_device();
-    }
-
-    if (node.op() == "Placeholder" || node.op() == "PlaceholderV2") {
+    if (IsPlaceholder(node)) {
       if (node.attr().count("dtype") == 0) {
         LOG(ERROR) << "Unknown type for placeholder " << node.name()
                    << ", skipping this input";
@@ -141,6 +245,11 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
       new_item->feed.emplace_back(node.name(), fake_input);
     }
 
+    // Delete user specified placement if requested.
+    if (cfg.ignore_user_placement) {
+      node.clear_device();
+    }
+    // Delete colocation constraints if requested.
     if (cfg.ignore_colocation) {
       auto attr = node.mutable_attr();
       auto it = attr->find("_class");
@@ -172,6 +281,11 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     if (inits.has_node_list()) {
       for (const auto& node : inits.node_list().value()) {
         new_item->init_ops.push_back(node);
+        // Tables are initialized from files, which can take a long time. Add 30
+        // minutes to the initialization time for each table to avoid timing
+        // out.
+        // TODO(bsteiner): adjust the timeout based on the file size.
+        new_item->expected_init_time += 30 * 60;
       }
     }
   }
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index 70886369949610d9fe469c7ef431a68014fd8db1..b51c826cc488c9aeebb7304caccbece2ebf55f2b 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -27,13 +27,21 @@ class MetaGraphDef;
 namespace grappler {
 
 struct ItemConfig {
+  ItemConfig()
+      : ignore_user_placement(true),
+        ignore_colocation(true),
+        placeholder_unknown_output_shape_dim(-1),
+        inline_functions(true) {}
+
   // If true, ignore all user specified node placement.
-  bool ignore_user_placement = true;
+  bool ignore_user_placement;
   // If true, ignore all user specified colocation attributes.
-  bool ignore_colocation = true;
+  bool ignore_colocation;
   // Dimension to use if a placeholder node has an _output_shapes attribute with
   // a dimension of -1.
-  int placeholder_unknown_output_shape_dim = -1;
+  int placeholder_unknown_output_shape_dim;
+  // If true, inline all functions in the graph.
+  bool inline_functions;
 };
 
 // Factory method for creating a GrapplerItem from a MetaGraphDef.
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index 8700196475d7f30346746048f6a7c1e14ecc02b2..37047b2b82ada0e6c42f32b814833b832d144c94 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -12,16 +12,6 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "android_srcs",
-    srcs = glob(
-        [
-            "utils.*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "utils",
     srcs = [
diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h
index 4c5600c816b3970f9ff851a76e6dbf2b51f1c048..434b660614b4267df7344222b449c5224db35996 100644
--- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h
+++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h
@@ -24,7 +24,7 @@ namespace tensorflow {
 namespace grappler {
 
 class Cluster;
-class GrapplerItem;
+struct GrapplerItem;
 
 class TrivialTestGraphInputYielder : public InputYielder {
  public:
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
new file mode 100644
index 0000000000000000000000000000000000000000..543c884ee8d9a45ee3effbc8927ede386e08ae55
--- /dev/null
+++ b/tensorflow/core/grappler/op_types.cc
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/op_types.h"
+
+namespace tensorflow {
+namespace grappler {
+
+bool IsConcat(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Concat" || op == "ConcatV2";
+}
+
+bool IsDequeueOp(const NodeDef& node) {
+  static const std::set<std::string> dequeue_ops = {
+      "QueueDequeueManyV2", "QueueDequeueMany", "QueueDequeueV2",
+      "QueueDequeue"};
+  return dequeue_ops.count(node.op()) > 0;
+}
+
+bool IsPlaceholder(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Placeholder" || op == "PlaceholderV2";
+}
+
+bool IsTranspose(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Transpose";
+}
+
+bool IsVariable(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
+         op == "VarHandleOp";
+}
+
+bool IsMerge(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Merge";
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce9e4a062860c3feafd85dc1f00fabddffbd1230
--- /dev/null
+++ b/tensorflow/core/grappler/op_types.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_OP_TYPES_H_
+#define TENSORFLOW_GRAPPLER_OP_TYPES_H_
+
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+bool IsConcat(const NodeDef& node);
+bool IsDequeueOp(const NodeDef& node);
+bool IsPlaceholder(const NodeDef& node);
+bool IsTranspose(const NodeDef& node);
+bool IsVariable(const NodeDef& node);
+bool IsMerge(const NodeDef& node);
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_OP_TYPES_H_
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index d09a3c4e3047d7bcc6365ead3a6eb8c489cceef1..f88b995c89fba21cb1fa2ad0381e82583a237451 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -12,16 +12,76 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-filegroup(
-    name = "android_srcs",
-    srcs = glob(
-        [
-            "*_optimizer.*",
-            "model_pruner.*",
-            "graph_rewriter.*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+cc_library(
+    name = "static_schedule",
+    srcs = ["static_schedule.cc"],
+    hdrs = [
+        "static_schedule.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/costs:op_level_cost_estimator",
+        "//tensorflow/core/grappler/costs:virtual_placer",
+    ],
+)
+
+cc_test(
+    name = "static_schedule_test",
+    srcs = ["static_schedule_test.cc"],
+    deps = [
+        ":static_schedule",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
+)
+
+cc_library(
+    name = "auto_parallel",
+    srcs = ["auto_parallel.cc"],
+    hdrs = [
+        "auto_parallel.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_optimizer",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+    ],
+)
+
+cc_test(
+    name = "auto_parallel_test",
+    srcs = ["auto_parallel_test.cc"],
+    deps = [
+        ":auto_parallel",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
 )
 
 cc_library(
@@ -117,6 +177,40 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "memory_optimizer",
+    srcs = ["memory_optimizer.cc"],
+    hdrs = [
+        "memory_optimizer.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_optimizer",
+        ":graph_rewriter",
+        ":static_schedule",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+    ],
+)
+
+cc_test(
+    name = "memory_optimizer_test",
+    srcs = ["memory_optimizer_test.cc"],
+    deps = [
+        ":memory_optimizer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+    ],
+)
+
 cc_library(
     name = "layout_optimizer",
     srcs = ["layout_optimizer.cc"],
@@ -131,11 +225,28 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
     ],
 )
 
+cc_test(
+    name = "layout_optimizer_test",
+    srcs = ["layout_optimizer_test.cc"],
+    deps = [
+        ":layout_optimizer",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
+)
+
 cc_library(
     name = "meta_optimizer",
     srcs = ["meta_optimizer.cc"],
@@ -144,11 +255,15 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":auto_parallel",
+        ":constant_folding",
         ":graph_optimizer",
         ":layout_optimizer",
+        ":memory_optimizer",
         ":model_pruner",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.cc b/tensorflow/core/grappler/optimizers/auto_parallel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4326a022f465d8e11503b7bbae61747f8b0bb21
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.cc
@@ -0,0 +1,268 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/auto_parallel.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/devices.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace grappler {
+const char kAutoParallelPrefix[] = "AutoParallel";
+
+NodeDef* AutoParallel::AddNodeDivConst() {
+  NodeDef* node = graph_.add_node();
+  node->set_name(strings::StrCat(kAutoParallelPrefix, "-Div-Const"));
+  node->set_op("Const");
+
+  AttrValue attr_data_type;
+  attr_data_type.set_type(DT_FLOAT);
+  node->mutable_attr()->insert({"dtype", attr_data_type});
+
+  AttrValue attr_tensor;
+  auto tensor = attr_tensor.mutable_tensor();
+  tensor->add_float_val(static_cast<float>(num_replicas_));
+  tensor->set_dtype(DT_FLOAT);
+  node->mutable_attr()->insert({"value", attr_tensor});
+  return node;
+}
+
+NodeDef* AutoParallel::AddNodeDiv(const string& name, const string& input_a,
+                                  const string& input_b) {
+  NodeDef* node = graph_.add_node();
+  node->set_name(strings::StrCat(kAutoParallelPrefix, "-Div-", name));
+  node->set_op("RealDiv");
+  node->add_input(input_a);
+  node->add_input(input_b);
+  AttrValue attr_type;
+  attr_type.set_type(DT_FLOAT);
+  node->mutable_attr()->insert({"T", attr_type});
+  return node;
+}
+
+NodeDef* AutoParallel::AddNodeControl(const string& name,
+                                      const std::set<string>& deps,
+                                      GraphDef* graph) {
+  NodeDef* node = graph->add_node();
+  node->set_name(name);
+  node->set_op("NoOp");
+  for (const auto& dep : deps) {
+    node->add_input(strings::StrCat("^", dep));
+  }
+  return node;
+}
+
+Status AutoParallel::Initialize(const GrapplerItem& item) {
+  num_gpus_ = GetNumAvailableGPUs();
+  LOG(INFO) << "Number of GPUs: " << num_gpus_;
+  item_ = &item;
+  graph_ = item.graph;
+  LOG(INFO) << "Original graph size: " << graph_.node_size();
+  if (item.fetch.empty()) {
+    return Status(error::INVALID_ARGUMENT, "No fetch nodes provided.");
+  }
+
+  if (item.MainVariables().empty()) {
+    return Status(error::INVALID_ARGUMENT, "No variables provided.");
+  }
+
+  for (const auto& init : item.init_ops) {
+    VLOG(1) << "Init node: " << init;
+  }
+
+  for (const auto& fetch : item.fetch) {
+    VLOG(1) << "Fetch node: " << fetch;
+  }
+
+  for (const auto& var : item.MainVariables()) {
+    VLOG(2) << "Variable: " << var->name();
+  }
+
+  const std::set<string> apply_gradients_ops = {"ApplyGradientDescent",
+                                                "ApplyProximalGradientDescent",
+                                                "ApplyAdadelta",
+                                                "ApplyAdagrad",
+                                                "ApplyProximalAdagrad",
+                                                "ApplyAdagradDA",
+                                                "ApplyFtrl",
+                                                "ApplyMomentum",
+                                                "ApplyAdam",
+                                                "ApplyRMSProp",
+                                                "ApplyCenteredRMSProp"};
+  for (int i = 0; i < graph_.node_size(); i++) {
+    all_nodes_.insert(
+        std::make_pair(graph_.node(i).name(), graph_.mutable_node(i)));
+    if (apply_gradients_ops.find(graph_.node(i).op()) !=
+        apply_gradients_ops.end()) {
+      apply_gradients_nodes_.insert(graph_.node(i).name());
+      VLOG(2) << "Apply gradients node: " << graph_.node(i).name();
+    }
+  }
+
+  auto div_const_node = AddNodeDivConst();
+  all_nodes_.insert(std::make_pair(div_const_node->name(), div_const_node));
+  std::map<string, int> gradient_pos = {{"ApplyGradientDescent", 2},
+                                        {"ApplyProximalGradientDescent", 4},
+                                        {"ApplyAdadelta", 6},
+                                        {"ApplyAdagrad", 3},
+                                        {"ApplyProximalAdagrad", 5},
+                                        {"ApplyAdagradDA", 3},
+                                        {"ApplyFtrl", 3},
+                                        {"ApplyMomentum", 3},
+                                        {"ApplyAdam", 9},
+                                        {"ApplyRMSProp", 7},
+                                        {"ApplyCenteredRMSProp", 8}};
+  for (const auto& apply_gradient_node_name : apply_gradients_nodes_) {
+    auto apply_gradients_op = all_nodes_[apply_gradient_node_name]->op();
+    auto apply_gradients_node = all_nodes_[apply_gradient_node_name];
+
+    auto div_node = AddNodeDiv(
+        apply_gradient_node_name,
+        apply_gradients_node->input(gradient_pos[apply_gradients_op]),
+        div_const_node->name());
+    all_nodes_.insert(std::make_pair(div_node->name(), div_node));
+    *apply_gradients_node->mutable_input(gradient_pos[apply_gradients_op]) =
+        div_node->name();
+  }
+  LOG(INFO) << "Graph size after adding div nodes: " << all_nodes_.size();
+
+  auto train_nodes = ComputeTransitiveFanin(graph_, item.fetch);
+  LOG(INFO) << "Number of training nodes: " << train_nodes.size();
+
+  const NodeDef* dequeue_node;
+  for (const auto& train_node : train_nodes) {
+    if (IsDequeueOp(*train_node)) {
+      dequeue_node = train_node;
+      break;
+    }
+  }
+
+  std::vector<const NodeDef*> input_nodes;
+  if (dequeue_node) {
+    LOG(INFO) << "Dequeue node: " << dequeue_node->name();
+    input_nodes = ComputeTransitiveFanin(graph_, {dequeue_node->name()});
+  }
+  LOG(INFO) << "Number of input nodes: " << input_nodes.size();
+
+  std::set<string> dont_replicate_nodes;
+  for (const auto& variable : item.MainVariables()) {
+    dont_replicate_nodes.insert(variable->name());
+  }
+  // Don't replicate all input nodes, except the dequeue node.
+  for (const auto& input_node : input_nodes) {
+    if (input_node->name() != dequeue_node->name()) {
+      dont_replicate_nodes.insert(input_node->name());
+    }
+  }
+
+  for (const auto& node : train_nodes) {
+    if (dont_replicate_nodes.find(node->name()) == dont_replicate_nodes.end()) {
+      replica_nodes_.insert(node->name());
+    }
+  }
+  LOG(INFO) << "Number of replica nodes: " << replica_nodes_.size();
+
+  for (const auto& node : all_nodes_) {
+    if (replica_nodes_.find(node.first) == replica_nodes_.end()) {
+      shared_nodes_.insert(node.first);
+    }
+  }
+  LOG(INFO) << "Number of shared nodes: " << shared_nodes_.size();
+  return Status::OK();
+}
+
+bool AutoParallel::NotSharedNode(const string& name) {
+  return shared_nodes_.find(name) == shared_nodes_.end();
+}
+
+void AutoParallel::AddSharedNodes(GraphDef* graph) {
+  string prefix = strings::StrCat(kAutoParallelPrefix, "-Replica-", 0);
+  for (const auto& node : shared_nodes_) {
+    auto new_node = graph->add_node();
+    *new_node = *all_nodes_[node];
+    for (int i = 0; i < new_node->input_size(); i++) {
+      if (NotSharedNode(NodeName(new_node->input(i)))) {
+        string new_name = AddPrefixToNodeName(new_node->input(i), prefix);
+        *new_node->mutable_input(i) = new_name;
+      }
+    }
+  }
+}
+
+void AutoParallel::AddOneReplica(GraphDef* graph, int number) {
+  string prefix = strings::StrCat(kAutoParallelPrefix, "-Replica-", number);
+  for (const auto& node : replica_nodes_) {
+    auto new_node = graph->add_node();
+    *new_node = *all_nodes_[node];
+    if (NotSharedNode(new_node->name())) {
+      new_node->set_name(AddPrefixToNodeName(new_node->name(), prefix));
+      if (num_gpus_ > 0) {
+        new_node->set_device(strings::StrCat("/gpu:", number % num_gpus_));
+      }
+      for (int i = 0; i < new_node->input_size(); i++) {
+        if (NotSharedNode(NodeName(new_node->input(i)))) {
+          string new_name = AddPrefixToNodeName(new_node->input(i), prefix);
+          *new_node->mutable_input(i) = new_name;
+        }
+      }
+    }
+  }
+}
+
+void AutoParallel::BuildGraph(GraphDef* graph) {
+  AddSharedNodes(graph);
+  for (int i = 0; i < num_replicas_; i++) {
+    AddOneReplica(graph, i);
+  }
+  std::set<string> fetches;
+  for (size_t i = 0; i < item_->fetch.size(); i++) {
+    for (int j = 0; j < num_replicas_; j++) {
+      string prefix = strings::StrCat(kAutoParallelPrefix, "-Replica-", j);
+      string fetch = AddPrefixToNodeName(item_->fetch[i], prefix);
+      fetches.insert(fetch);
+    }
+  }
+  string name_control =
+      strings::StrCat(kAutoParallelPrefix, "-Control-", "Fetch");
+  auto control = AddNodeControl(name_control, fetches, graph);
+
+  for (const auto& fetch : item_->fetch) {
+    AddNodeControl(fetch, {control->name()}, graph);
+  }
+  *(graph->mutable_library()) = item_->graph.library();
+  LOG(INFO) << "Parallelized graph size: " << graph->node_size();
+}
+
+Status AutoParallel::Optimize(Cluster* cluster, const GrapplerItem& item,
+                              GraphDef* output) {
+  TF_RETURN_IF_ERROR(Initialize(item));
+  BuildGraph(output);
+  return Status::OK();
+}
+
+void AutoParallel::Feedback(Cluster* cluster, const GrapplerItem& item,
+                            const GraphDef& optimize_output, double result) {
+  // TODO(yaozhang): Add feedback.
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel.h b/tensorflow/core/grappler/optimizers/auto_parallel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad90bbe02892276d0e3bb28eb872c950640164a2
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/auto_parallel.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
+#define TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Automatically parallelize a graph by splitting in the batch dimension.
+class AutoParallel : public GraphOptimizer {
+ public:
+  AutoParallel(int num_replicas) : num_replicas_(num_replicas) {
+    CHECK(num_replicas_ >= 2);
+  }
+  ~AutoParallel() override {}
+
+  string name() const override { return "autoparallel"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+
+ private:
+  GraphDef graph_;
+  std::map<string, NodeDef*> all_nodes_;
+  std::set<string> apply_gradients_nodes_;
+  std::set<string> replica_nodes_;
+  std::set<string> shared_nodes_;
+  const GrapplerItem* item_;
+  int num_replicas_;
+  int num_gpus_;
+  Status Initialize(const GrapplerItem& item);
+  NodeDef* AddNodeDivConst();
+  NodeDef* AddNodeDiv(const string& name, const string& input_a,
+                      const string& input_b);
+  NodeDef* AddNodeControl(const string& name, const std::set<string>& deps,
+                          GraphDef* graph);
+  bool NotSharedNode(const string& name);
+  void AddSharedNodes(GraphDef* graph);
+  void AddOneReplica(GraphDef* graph, int number);
+  void BuildGraph(GraphDef* graph);
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_AUTO_PARALLEL_H_
diff --git a/tensorflow/core/grappler/optimizers/auto_parallel_test.cc b/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d1b4a34bfce69817d327b72b3643f3d391c10ec
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/auto_parallel_test.cc
@@ -0,0 +1,125 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/auto_parallel.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class AutoParallelTest : public ::testing::Test {};
+
+TEST_F(AutoParallelTest, SimpleParallel) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output constant_a = ops::Const(s.WithOpName("constant_a"), 1.0f, {1});
+  Output constant_b = ops::Const(s.WithOpName("constant_b"), 1, {1});
+  Output var = ops::Variable(s.WithOpName("var"), {1}, DT_FLOAT);
+  Output assign = ops::Assign(s.WithOpName("assign"), {var}, {constant_a});
+  Output fifo_queue = ops::FIFOQueue(s.WithOpName("fifo_queue"), {DT_FLOAT});
+  auto dequeue = ops::QueueDequeueMany(s.WithOpName("dequeue"), {fifo_queue},
+                                       {constant_b}, {DT_FLOAT});
+  Output add = ops::AddN(s.WithOpName("add"), {constant_a, dequeue[0]});
+  Output learning_rate = ops::Const(s.WithOpName("learning_rate"), 0.01f, {1});
+  Output apply_gradient = ops::ApplyGradientDescent(
+      s.WithOpName("apply_gradient"), {var}, {learning_rate}, {add});
+
+  GrapplerItem item;
+  item.init_ops.push_back("assign");
+  item.fetch.push_back("apply_gradient");
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  AutoParallel parallel(2);
+  GraphDef output;
+  Status status = parallel.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(20, output.node_size());
+
+  const NodeDef& node_assign = output.node(0);
+  EXPECT_EQ("assign", node_assign.name());
+  EXPECT_EQ("AutoParallel-Replica-0/constant_a", node_assign.input(1));
+
+  const NodeDef& node_constant_b = output.node(1);
+  EXPECT_EQ("constant_b", node_constant_b.name());
+
+  const NodeDef& node_fifo_queue = output.node(2);
+  EXPECT_EQ("fifo_queue", node_fifo_queue.name());
+
+  const NodeDef& node_var = output.node(3);
+  EXPECT_EQ("var", node_var.name());
+
+  const NodeDef& node_div_const0 = output.node(4);
+  EXPECT_EQ("AutoParallel-Replica-0/AutoParallel-Div-Const",
+            node_div_const0.name());
+
+  const NodeDef& node_div0 = output.node(5);
+  EXPECT_EQ("AutoParallel-Replica-0/AutoParallel-Div-apply_gradient",
+            node_div0.name());
+  const NodeDef& node_add0 = output.node(6);
+  EXPECT_EQ("AutoParallel-Replica-0/add", node_add0.name());
+
+  const NodeDef& node_gradient0 = output.node(7);
+  EXPECT_EQ("AutoParallel-Replica-0/apply_gradient", node_gradient0.name());
+
+  const NodeDef& node_constant_a0 = output.node(8);
+  EXPECT_EQ("AutoParallel-Replica-0/constant_a", node_constant_a0.name());
+
+  const NodeDef& node_dequeue0 = output.node(9);
+  EXPECT_EQ("AutoParallel-Replica-0/dequeue", node_dequeue0.name());
+
+  const NodeDef& node_learning_rate0 = output.node(10);
+  EXPECT_EQ("AutoParallel-Replica-0/learning_rate", node_learning_rate0.name());
+
+  const NodeDef& node_div_const1 = output.node(11);
+  EXPECT_EQ("AutoParallel-Replica-1/AutoParallel-Div-Const",
+            node_div_const1.name());
+
+  const NodeDef& node_div1 = output.node(12);
+  EXPECT_EQ("AutoParallel-Replica-1/AutoParallel-Div-apply_gradient",
+            node_div1.name());
+
+  const NodeDef& node_add1 = output.node(13);
+  EXPECT_EQ("AutoParallel-Replica-1/add", node_add1.name());
+
+  const NodeDef& node_gradient1 = output.node(14);
+  EXPECT_EQ("AutoParallel-Replica-1/apply_gradient", node_gradient1.name());
+
+  const NodeDef& node_constant_a1 = output.node(15);
+  EXPECT_EQ("AutoParallel-Replica-1/constant_a", node_constant_a1.name());
+
+  const NodeDef& node_dequeue1 = output.node(16);
+  EXPECT_EQ("AutoParallel-Replica-1/dequeue", node_dequeue1.name());
+
+  const NodeDef& node_learning_rate1 = output.node(17);
+  EXPECT_EQ("AutoParallel-Replica-1/learning_rate", node_learning_rate1.name());
+
+  const NodeDef& node_fetch = output.node(18);
+  EXPECT_EQ("AutoParallel-Control-Fetch", node_fetch.name());
+  EXPECT_EQ("^AutoParallel-Replica-0/apply_gradient", node_fetch.input(0));
+  EXPECT_EQ("^AutoParallel-Replica-1/apply_gradient", node_fetch.input(1));
+
+  const NodeDef& node_gradient = output.node(19);
+  EXPECT_EQ("apply_gradient", node_gradient.name());
+  EXPECT_EQ("^AutoParallel-Control-Fetch", node_gradient.input(0));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 49891e2a7803f23ba390eef2469a9b79a7ef0aba..8f79c55810b6cb7ebb281468dc58e398c6d9f0d2 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -72,8 +72,7 @@ class DeviceSimple : public DeviceBase {
                              Tensor* tensor) override {
     Tensor parsed(tensor_proto.dtype());
     if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
-      return errors::InvalidArgument("Cannot parse tensor from proto: ",
-                                     tensor_proto.DebugString());
+      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
     }
     *tensor = parsed;
     return Status::OK();
@@ -130,7 +129,6 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (!status.ok()) {
     return false;
   }
-
   if (op_def->is_stateful()) {
     return false;
   }
@@ -144,6 +142,15 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     return false;
   }
 
+  // No need to (and don't) fold nodes that have no outgoing edges. Such nodes
+  // could be introduced by an earlier constant folding pass and are preserved
+  // in case users want to fetch their values; re-processing them would
+  // lead to an error of adding a duplicated node to graph.
+  auto outputs = node_map_->GetOutputs(node.name());
+  if (outputs.empty()) {
+    return false;
+  }
+
   for (const auto& input : node.input()) {
     bool is_const = IsConst(*node_map_->GetNode(input));
     if (!is_const) {
@@ -224,8 +231,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
     Status(error::INVALID_ARGUMENT, "Expected at least one output.");
   }
   for (int i = 0; i < output_tensors.size(); i++) {
-    string node_name = strings::StrCat(
-        AddPrefixToNodeName(node.name(), kConstantFoldingConst));
+    string node_name = AddPrefixToNodeName(node.name(), kConstantFoldingConst);
     if (output_tensors.size() > 1) {
       node_name = strings::StrCat(node_name, "-", i);
     }
@@ -299,6 +305,7 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
     nodes_to_preserve_.insert(NodeName(node));
   }
   device_.reset(new DeviceSimple());
+  *output = GraphDef();
   TF_RETURN_IF_ERROR(FoldGraph(output));
   LOG(INFO) << "Optimized graph size: " << output->node_size();
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index ab79e7410312bec8f39c4d311a25015734e2fce4..93e2a797466cf037f197272b01f1ef51a64fac2f 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -64,7 +64,7 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   EXPECT_EQ(5, output.node_size());
 
   const NodeDef& new_c = output.node(0);
-  EXPECT_EQ("ConstantFolding-c", new_c.name());
+  EXPECT_EQ("ConstantFolding/c", new_c.name());
   EXPECT_EQ("Const", new_c.op());
 
   const NodeDef& new_a = output.node(1);
@@ -78,7 +78,7 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
 
   const NodeDef& new_d = output.node(4);
   EXPECT_EQ("d", new_d.name());
-  EXPECT_EQ("ConstantFolding-c", new_d.input(1));
+  EXPECT_EQ("ConstantFolding/c", new_d.input(1));
 
   std::vector<string> fetch = {"a", "b", "c", "d"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
@@ -112,11 +112,11 @@ TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   EXPECT_EQ(6, output.node_size());
 
   const NodeDef& new_b_0 = output.node(0);
-  EXPECT_EQ("ConstantFolding-b-0", new_b_0.name());
+  EXPECT_EQ("ConstantFolding/b-0", new_b_0.name());
   EXPECT_EQ("Const", new_b_0.op());
 
   const NodeDef& new_b_1 = output.node(1);
-  EXPECT_EQ("ConstantFolding-b-1", new_b_1.name());
+  EXPECT_EQ("ConstantFolding/b-1", new_b_1.name());
   EXPECT_EQ("Const", new_b_1.op());
 
   const NodeDef& new_a = output.node(2);
@@ -127,11 +127,11 @@ TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
 
   const NodeDef& new_c = output.node(4);
   EXPECT_EQ("c", new_c.name());
-  EXPECT_EQ("ConstantFolding-b-0", new_c.input(0));
+  EXPECT_EQ("ConstantFolding/b-0", new_c.input(0));
 
   const NodeDef& new_d = output.node(5);
   EXPECT_EQ("d", new_d.name());
-  EXPECT_EQ("ConstantFolding-b-1", new_d.input(0));
+  EXPECT_EQ("ConstantFolding/b-1", new_d.input(0));
 
   std::vector<string> fetch = {"a", "b", "c", "d"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.cc b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
index fbb7e849ba22c662daf6fa743e2ffe14167e4fe5..d1ab5a1d9b45e7051e97f777634c852a715e5005 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.cc
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
@@ -64,5 +64,15 @@ bool GraphRewriter::DrivesControlDependency(const NodeDef& node) const {
          control_dependency_drivers_.end();
 }
 
+bool GraphRewriter::IsDrivenByControlDependency(const NodeDef& node) const {
+  for (const auto& input : node.input()) {
+    CHECK(!input.empty());
+    if (input[0] == '^') {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.h b/tensorflow/core/grappler/optimizers/graph_rewriter.h
index a9cc777809fa95a0f66908b72cda0c92c5fdd57f..adbe5a24c863876c66cefc13a009324f607bf492 100644
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.h
+++ b/tensorflow/core/grappler/optimizers/graph_rewriter.h
@@ -43,6 +43,10 @@ class GraphRewriter {
   // a control dependency edge.
   bool DrivesControlDependency(const NodeDef& node) const;
 
+  // Returns true if at least one of the incident edges is a control dependency
+  // edge.
+  bool IsDrivenByControlDependency(const NodeDef& node) const;
+
  private:
   std::unordered_map<string, const NodeDef*> nodes_;
   std::unordered_set<const NodeDef*> control_dependency_drivers_;
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index b0988b8a891ff3187e1438c557a78a27b601b2d6..e37c4a5b36afc43e90b0b0dd6d0e07e6374b6a16 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -68,8 +69,7 @@ std::set<string> GetOpsFormatAgnostic() {
                                           "Slice",
                                           "SquaredDifference",
                                           "Squeeze",
-                                          "Sub",
-                                          "Sum"};
+                                          "Sub"};
   return ops_format_agnostic;
 }
 
@@ -110,9 +110,9 @@ class NodeProcessor {
   }
 
  protected:
-  bool IsDimsN(NodeDef* node, int n) const {
-    if (node->attr().find("_output_shapes") != node->attr().end()) {
-      auto shape = node->attr().at("_output_shapes").list().shape(0);
+  bool IsDimsN(const NodeDef& node, int n) const {
+    if (node.attr().find("_output_shapes") != node.attr().end()) {
+      auto shape = node.attr().at("_output_shapes").list().shape(0);
       if (shape.dim_size() == n) {
         return true;
       }
@@ -120,7 +120,7 @@ class NodeProcessor {
     return false;
   }
 
-  bool IsDimsFour(NodeDef* node) const { return IsDimsN(node, 4); }
+  bool IsDimsFour(const NodeDef& node) const { return IsDimsN(node, 4); }
 
   bool IsNHWC() const {
     if (node_->attr().find("data_format") != node_->attr().end()) {
@@ -145,7 +145,7 @@ class NodeProcessor {
   }
 
   virtual bool ShouldProcess() const {
-    return IsNHWC() && IsDimsFour(node_) && HasOutputs();
+    return IsNHWC() && IsDimsFour(*node_) && HasOutputs();
   }
 
   void UpdateAttrDataFormat() {
@@ -268,6 +268,8 @@ class NodeProcessor {
     for (const auto& output : outputs) {
       string node_name_NCHWToNHWC = strings::StrCat(
           kTransposeNCHWToNHWC, "-", node_->name(), "-", output->name());
+      // TODO (yaozhang): handle the rare case where node A is connected to more
+      // than one input of node B.
       auto it = std::find_if(output->mutable_input()->begin(),
                              output->mutable_input()->end(),
                              [this](const string& input) {
@@ -341,7 +343,7 @@ class BiasAddGradProcessor : public NodeProcessor {
   bool ShouldProcess() const override {
     auto input = node_map_->GetNode(node_->input(0));
     if (input) {
-      if ((IsNHWC() && IsDimsFour(input)) || IsNodeNCHWToNHWC(input->name())) {
+      if ((IsNHWC() && IsDimsFour(*input)) || IsNodeNCHWToNHWC(input->name())) {
         return true;
       }
     }
@@ -351,13 +353,89 @@ class BiasAddGradProcessor : public NodeProcessor {
   Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
 };
 
-class Conv2DBackpropFilterProcessor : public NodeProcessor {
+class Conv2DProcessor : public NodeProcessor {
+ public:
+  Conv2DProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
+                  bool no_gemm)
+      : NodeProcessor(graph, node, node_map), no_gemm_(no_gemm) {}
+
+ protected:
+  bool ShouldProcess() const override {
+    return IsNHWC() && IsDimsFour(*node_) && HasOutputs() &&
+           (!IsGemmUsed() || no_gemm_);
+  }
+
+  TensorShapeProto GetShape(const string& input_name) const {
+    string node_name;
+    int output_pos;
+    node_name = ParseNodeName(input_name, &output_pos);
+    NodeDef* node = node_map_->GetNode(node_name);
+    if (node->attr().find("_output_shapes") != node->attr().end()) {
+      return node->attr().at("_output_shapes").list().shape(output_pos);
+    }
+    TensorShapeProto shape;
+    return shape;
+  }
+
+  bool IsStrideOne() const {
+    if (node_->attr().find("strides") != node_->attr().end()) {
+      auto list = node_->attr().at("strides").list();
+      return list.i(1) == 1 && list.i(2) == 1;
+    }
+    return false;
+  }
+
+  bool IsValidPadding() const {
+    if (node_->attr().find("padding") != node_->attr().end()) {
+      auto padding = node_->attr().at("padding").s();
+      return padding == "VALID";
+    }
+    return false;
+  }
+
+  // The logic inside this function is based on the internal implementation of
+  // Conv2D, Conv2DBackpropInput, and Conv2DBackpropFilter ops, and thus
+  // needs to be updated accordingly if the internal implementation changes.
+  bool IsGemmUsed(const TensorShapeProto& filter_shape,
+                  const TensorShapeProto& input_shape) const {
+    if (filter_shape.dim_size() == 4) {
+      if (filter_shape.dim(0).size() == 1 && filter_shape.dim(1).size() == 1 &&
+          IsStrideOne()) {
+        return true;
+      }
+    }
+    if (input_shape.dim_size() == 4 && filter_shape.dim_size() == 4) {
+      if (input_shape.dim(1).size() == filter_shape.dim(0).size() &&
+          input_shape.dim(2).size() == filter_shape.dim(1).size() &&
+          IsValidPadding()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  virtual bool IsGemmUsed() const {
+    auto filter_shape = GetShape(node_->input(1));
+    auto input_shape = GetShape(node_->input(0));
+    return IsGemmUsed(filter_shape, input_shape);
+  }
+
+  bool no_gemm_;
+};
+
+class Conv2DBackpropFilterProcessor : public Conv2DProcessor {
  public:
   Conv2DBackpropFilterProcessor(GraphDef* graph, NodeDef* node,
-                                NodeMap* node_map)
-      : NodeProcessor(graph, node, node_map) {}
+                                NodeMap* node_map, bool no_gemm)
+      : Conv2DProcessor(graph, node, node_map, no_gemm) {}
 
  protected:
+  bool IsGemmUsed() const override {
+    auto filter_shape = GetShape(node_->name());
+    auto input_shape = GetShape(node_->input(0));
+    return Conv2DProcessor::IsGemmUsed(filter_shape, input_shape);
+  }
+
   std::vector<int> GetInputPos() const override {
     std::vector<int> input_pos = {0, 2};
     return input_pos;
@@ -370,17 +448,24 @@ class Conv2DBackpropFilterProcessor : public NodeProcessor {
   void UpdateAttrShape() override {}
 };
 
-class Conv2DBackpropInputProcessor : public NodeProcessor {
+class Conv2DBackpropInputProcessor : public Conv2DProcessor {
  public:
   Conv2DBackpropInputProcessor(GraphDef* graph, NodeDef* node,
-                               NodeMap* node_map)
-      : NodeProcessor(graph, node, node_map) {}
+                               NodeMap* node_map, bool no_gemm)
+      : Conv2DProcessor(graph, node, node_map, no_gemm) {}
 
  protected:
+  bool IsGemmUsed() const override {
+    auto filter_shape = GetShape(node_->input(1));
+    auto input_shape = GetShape(node_->name());
+    return Conv2DProcessor::IsGemmUsed(filter_shape, input_shape);
+  }
+
   std::vector<int> GetInputPos() const override {
     std::vector<int> input_pos = {2};
     return input_pos;
   }
+
   Status CustomizedProcessing() override {
     NodeDef* node = node_map_->GetNode(node_->input(0));
     return UpdateAttrValue(node);
@@ -418,7 +503,7 @@ class AgnosticNodeProcessor : public NodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(node_) && HasOutputs() && IsNodeAfterNCHWToNHWC();
+    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC();
   }
 
   bool IsNodeAfterNCHWToNHWC() const {
@@ -467,7 +552,7 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
            (Is4DOperateWithND(4) || Is4DOperateWithScalar() ||
             Is4DOperateWithVector());
   }
@@ -484,10 +569,10 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
     auto input0 = node_map_->GetNode(node_->input(0));
     auto input1 = node_map_->GetNode(node_->input(1));
     if (input0 && input1) {
-      return (IsDimsFour(input0) || IsNodeNCHWToNHWC(input0->name())) &&
+      return (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
              ((n == 4)
-                  ? (IsDimsFour(input1) || IsNodeNCHWToNHWC(input1->name()))
-                  : IsDimsN(input1, n));
+                  ? (IsDimsFour(*input1) || IsNodeNCHWToNHWC(input1->name()))
+                  : IsDimsN(*input1, n));
     }
     return false;
   }
@@ -508,7 +593,7 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
     AttrValue attr_tensor;
     Tensor tensor(DT_INT32, TensorShape({4}));
     std::vector<int> shape = {1, num_channels, 1, 1};
-    for (int i = 0; i < shape.size(); i++) {
+    for (int i = 0; i < static_cast<int>(shape.size()); i++) {
       tensor.flat<int>()(i) = shape[i];
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
@@ -571,7 +656,7 @@ class ConcatProcessor : public AgnosticNodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
            IsAlongDimC();
   }
 
@@ -615,11 +700,9 @@ class ReluGradProcessor : public AgnosticNodeProcessor {
   }
 };
 
-// This is the older, less optimized gather-based SliceProcessor. We keep it as
-// a test case for constant propagation optimization.
-class SliceProcessorGatherBased : public AgnosticNodeProcessor {
+class SliceProcessor : public AgnosticNodeProcessor {
  public:
-  SliceProcessorGatherBased(GraphDef* graph, NodeDef* node, NodeMap* node_map)
+  SliceProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map)
       : AgnosticNodeProcessor(graph, node, node_map) {}
 
  protected:
@@ -663,9 +746,30 @@ class SliceProcessorGatherBased : public AgnosticNodeProcessor {
   }
 };
 
-class SliceProcessor : public AgnosticNodeProcessor {
+// Specialized SliceProcessor, used if the second and third input are const
+// nodes, which could be the case if a constant folding pass is applied
+// before this optimization.
+class SliceProcessorConst : public AgnosticNodeProcessor {
  public:
-  SliceProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map)
+  SliceProcessorConst(GraphDef* graph, NodeDef* node, NodeMap* node_map)
+      : AgnosticNodeProcessor(graph, node, node_map) {}
+
+ protected:
+  Status CustomizedProcessing() override {
+    // Skip the first input, which is the data to be sliced.
+    for (int i = 1; i < node_->input_size(); i++) {
+      auto shape_node = node_map_->GetNode(node_->input(i));
+      TF_RETURN_IF_ERROR(UpdateAttrValue(shape_node));
+    }
+    return Status::OK();
+  }
+};
+
+// Specialized SliceProcessor, used if the second input is ConcatOffset. An
+// example use case is in the gradient computation of Concat for InceptionV3.
+class SliceProcessorConcatOffset : public AgnosticNodeProcessor {
+ public:
+  SliceProcessorConcatOffset(GraphDef* graph, NodeDef* node, NodeMap* node_map)
       : AgnosticNodeProcessor(graph, node, node_map) {}
 
  protected:
@@ -720,7 +824,7 @@ class SqueezeProcessor : public AgnosticNodeProcessor {
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsN(node_, 2) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+    return IsDimsN(*node_, 2) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
            IsInputConvertible() && IsAlongDimHW();
   }
 
@@ -771,7 +875,7 @@ class SumProcessor : public AgnosticNodeProcessor {
   bool ShouldProcess() const override {
     auto input0 = node_map_->GetNode(node_->input(0));
     return HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           (IsDimsFour(input0) || IsNodeNCHWToNHWC(input0->name())) &&
+           (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
            IsAlongDimNHW();
   }
 
@@ -806,10 +910,21 @@ class SumProcessor : public AgnosticNodeProcessor {
   }
 };
 
+struct TuningConfig {
+  // If true, do not use the NHWC GEMM implementation. When filter size is
+  // one or filter size is equal to input image size,
+  // the NHWC implementation of Conv2D, Conv2DBackpropInput, and
+  // Conv2DBackpropFilter will use a specialized GEMM implementation, which is
+  // usually faster than the NCHW implementation. The downside is that this
+  // might result in more non-cancellable layout conversion nodes (implemented
+  // by the Tranpose op).
+  bool no_gemm;
+};
+
 class DataLayoutOptimizer {
  public:
-  explicit DataLayoutOptimizer(GraphDef* graph)
-      : graph_(graph), node_map_(graph_) {}
+  explicit DataLayoutOptimizer(GraphDef* graph, TuningConfig config)
+      : graph_(graph), node_map_(graph_), config_(config) {}
 
   Status Optimize() {
     LOG(INFO) << "Number of nodes for original graph: " << graph_->node_size();
@@ -832,7 +947,7 @@ class DataLayoutOptimizer {
     node->mutable_attr()->insert({"dtype", attr_data_type});
     AttrValue attr_tensor;
     Tensor tensor(DT_INT32, TensorShape({4}));
-    for (int i = 0; i < permutation.size(); i++) {
+    for (int i = 0; static_cast<size_t>(i) < permutation.size(); i++) {
       tensor.flat<int>()(i) = permutation[i];
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
@@ -866,7 +981,7 @@ class DataLayoutOptimizer {
     AttrValue attr_tensor;
     Tensor tensor(DT_INT32, TensorShape({3}));
     std::vector<int> axis = {0, 2, 3};
-    for (int i = 0; i < axis.size(); i++) {
+    for (int i = 0; static_cast<size_t>(i) < axis.size(); i++) {
       tensor.flat<int>()(i) = axis[i];
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
@@ -889,12 +1004,15 @@ class DataLayoutOptimizer {
         } else if (node->op().compare("BiasAddGrad") == 0) {
           node_processor.reset(
               new BiasAddGradProcessor(graph_, node, &node_map_));
-        } else if (node->op().compare("Conv2DBackpropFilter") == 0) {
+        } else if (node->op().compare("Conv2D") == 0) {
           node_processor.reset(
-              new Conv2DBackpropFilterProcessor(graph_, node, &node_map_));
+              new Conv2DProcessor(graph_, node, &node_map_, config_.no_gemm));
+        } else if (node->op().compare("Conv2DBackpropFilter") == 0) {
+          node_processor.reset(new Conv2DBackpropFilterProcessor(
+              graph_, node, &node_map_, config_.no_gemm));
         } else if (node->op().compare("Conv2DBackpropInput") == 0) {
-          node_processor.reset(
-              new Conv2DBackpropInputProcessor(graph_, node, &node_map_));
+          node_processor.reset(new Conv2DBackpropInputProcessor(
+              graph_, node, &node_map_, config_.no_gemm));
         } else if (node->op().compare("FusedBatchNormGrad") == 0) {
           node_processor.reset(
               new FusedBatchNormGradProcessor(graph_, node, &node_map_));
@@ -938,14 +1056,17 @@ class DataLayoutOptimizer {
             node_processor.reset(
                 new ReluGradProcessor(graph_, node, &node_map_));
           } else if (node->op().compare("Slice") == 0) {
-            auto maybe_concatoffset_node =
-                node_map_.GetNode(NodeName(node->input(1)));
-            if (maybe_concatoffset_node->op() == "ConcatOffset") {
+            auto input1 = node_map_.GetNode(NodeName(node->input(1)));
+            auto input2 = node_map_.GetNode(NodeName(node->input(2)));
+            if (input1->op() == "ConcatOffset") {
               node_processor.reset(
-                  new SliceProcessor(graph_, node, &node_map_));
+                  new SliceProcessorConcatOffset(graph_, node, &node_map_));
+            } else if (input1->op() == "Const" && input2->op() == "Const") {
+              node_processor.reset(
+                  new SliceProcessorConst(graph_, node, &node_map_));
             } else {
               node_processor.reset(
-                  new SliceProcessorGatherBased(graph_, node, &node_map_));
+                  new SliceProcessor(graph_, node, &node_map_));
             }
 
           } else if (node->op().compare("Squeeze") == 0) {
@@ -1003,17 +1124,46 @@ class DataLayoutOptimizer {
 
   GraphDef* graph_;
   NodeMap node_map_;
+  TuningConfig config_;
 };
 
+int GetNumTranspose(const GraphDef& graph) {
+  int number = 0;
+  for (const auto& node : graph.node()) {
+    if (IsTranspose(node)) {
+      number++;
+    }
+  }
+  LOG(INFO) << "Number of Transpose nodes: " << number;
+  return number;
+}
+
 Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
-  if (GetNumAvailableGPUs() < 1) {
+  if (num_gpus_ == 0) {
+    num_gpus_ = GetNumAvailableGPUs();
+  }
+  if (num_gpus_ < 1) {
     // LayoutOptimizer is currently only tuned for GPU.
     return Status::OK();
   }
+
   *output = item.graph;
-  DataLayoutOptimizer layout_optimizer(output);
+  TuningConfig config;
+  config.no_gemm = false;
+  DataLayoutOptimizer layout_optimizer(output, config);
   auto status = layout_optimizer.Optimize();
+
+  // This is based on an empirical observation that if the introduced Transpose
+  // nodes is more than 30, not using GEMM implementation would result in better
+  // performance.
+  if (status.ok() && GetNumTranspose(*output) > 30) {
+    *output = item.graph;
+    config.no_gemm = true;
+    DataLayoutOptimizer layout_optimizer(output, config);
+    status = layout_optimizer.Optimize();
+  }
+
   if (!status.ok()) {
     *output = item.graph;
   }
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.h b/tensorflow/core/grappler/optimizers/layout_optimizer.h
index 66dec17a35c125dca9dfe3a2c7f483e4fcd650ad..1bd6f9544b1da87fc86201aef67f151cd06c7124 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.h
@@ -29,11 +29,17 @@ class LayoutOptimizer : public GraphOptimizer {
 
   string name() const override { return "layout"; };
 
+  // This is for testing only.
+  void set_num_gpus(int num_gpus) { num_gpus_ = num_gpus; };
+
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* output) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
+
+ private:
+  int num_gpus_ = 0;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be38ca1a69e7360d5d9fa582b0492f9ea48eae14
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+void AddOutputShape(Node* node, const TensorShape& shape) {
+  std::vector<TensorShapeProto> output_shapes;
+  TensorShapeProto shape_proto;
+  shape.AsProto(&shape_proto);
+  output_shapes.push_back(shape_proto);
+  node->AddAttr("_output_shapes", output_shapes);
+}
+
+class LayoutOptimizerTest : public ::testing::Test {
+ protected:
+  Output SimpleConv(tensorflow::Scope* s, int input_size, int filter_size,
+                    const string& padding) {
+    int batch_size = 128;
+    int input_height = input_size;
+    int input_width = input_size;
+    int input_depth = 3;
+    int filter_count = 2;
+    int stride = 1;
+    TensorShape input_shape(
+        {batch_size, input_height, input_width, input_depth});
+    Tensor input_data(DT_FLOAT, input_shape);
+    test::FillIota<float>(&input_data, 1.0f);
+    Output input =
+        ops::Const(s->WithOpName("Input"), Input::Initializer(input_data));
+    AddOutputShape(input.node(), input_shape);
+
+    TensorShape filter_shape(
+        {filter_size, filter_size, input_depth, filter_count});
+    Tensor filter_data(DT_FLOAT, filter_shape);
+    test::FillIota<float>(&filter_data, 1.0f);
+    Output filter =
+        ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
+    AddOutputShape(filter.node(), filter_shape);
+
+    Output conv = ops::Conv2D(s->WithOpName("Conv2D"), input, filter,
+                              {1, stride, stride, 1}, padding);
+    AddOutputShape(conv.node(), input_shape);
+    return conv;
+  }
+};
+
+TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 1, "SAME");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_FALSE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 1, "SAME");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_FALSE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 2, "VALID");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_FALSE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 2, "SAME");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_TRUE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv(&s, 2, 3, "VALID");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  EXPECT_TRUE(
+      node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1ed7cab4abfdc5281f3906780527eb06e6f93f03
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -0,0 +1,290 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
+
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
+#include "tensorflow/core/grappler/optimizers/static_schedule.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+const char* kRecomputedNodePrefix = "Recomputed";
+
+string RecomputedOrOriginalNodeName(
+    const std::unordered_set<string>& recomputed_node_names,
+    const string& original_node_name) {
+  if (recomputed_node_names.find(original_node_name) ==
+      recomputed_node_names.end()) {
+    return original_node_name;
+  } else {
+    return AddPrefixToNodeName(original_node_name, kRecomputedNodePrefix);
+  }
+}
+
+void RecomputeSubgraph(
+    const std::vector<const NodeDef*>& recomputed_source_nodes,
+    const string& recompute_trigger_node_name,
+    const std::vector<NodeDef*>& target_nodes, GraphDef* graph) {
+  std::unordered_set<string> recomputed_node_names;
+  for (const NodeDef* to_recompute : recomputed_source_nodes) {
+    recomputed_node_names.insert(to_recompute->name());
+  }
+  // Create the recomputed sub-graph
+  for (const NodeDef* original_node : recomputed_source_nodes) {
+    NodeDef* copied_node = graph->add_node();
+    copied_node->set_name(
+        AddPrefixToNodeName(original_node->name(), kRecomputedNodePrefix));
+    copied_node->set_op(original_node->op());
+    *copied_node->mutable_attr() = original_node->attr();
+    copied_node->set_device(original_node->device());
+    for (const string& original_input_name : original_node->input()) {
+      // Set inputs which are internal to the copied subgraph to their copied
+      // versions.
+      *copied_node->add_input() = RecomputedOrOriginalNodeName(
+          recomputed_node_names, original_input_name);
+    }
+    // Set control dependencies on the recomputed nodes so that they are not run
+    // until the specified trigger runs.
+    *copied_node->add_input() =
+        strings::StrCat("^", recompute_trigger_node_name);
+  }
+  // Set the inputs of nodes in the target subgraph to the recomputed nodes
+  // where applicable.
+  for (NodeDef* target_node : target_nodes) {
+    for (string& target_input_name : *target_node->mutable_input()) {
+      target_input_name = RecomputedOrOriginalNodeName(recomputed_node_names,
+                                                       target_input_name);
+    }
+  }
+}
+
+std::pair<NodeDef*, NodeDef*> BuildSwapPair(NodeDef* node, int input_to_swap,
+                                            GraphDef* graph) {
+  string tensor_to_swap = strings::StrCat(node->name(), "_", input_to_swap);
+
+  // Force the tensor to be copied to cpu.
+  NodeDef* swap_out_node = graph->add_node();
+  swap_out_node->set_name(strings::StrCat("swap_out_", tensor_to_swap));
+  swap_out_node->set_op("Identity");
+  swap_out_node->set_device("/CPU");
+
+  // Force the tensor to be restored to the device.
+  NodeDef* swap_in_node = graph->add_node();
+  swap_in_node->set_name(strings::StrCat("swap_in_", tensor_to_swap));
+  swap_in_node->set_op("Identity");
+  *swap_in_node->add_input() = swap_out_node->name();
+
+  // Colocate the swap_in_ node with the node itself.
+  string coloc_group = strings::StrCat("loc@", tensor_to_swap);
+  (*swap_in_node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
+  (*node->mutable_attr())["_class"].mutable_list()->add_s(coloc_group);
+
+  return std::make_pair(swap_out_node, swap_in_node);
+}
+
+static int64 EstimateSize(const OpInfo::TensorProperties& t) {
+  DataType dtype = t.dtype();
+  int64 size = DataTypeSize(dtype);
+  TensorShapeProto shape = t.shape();
+  if (shape.unknown_rank()) {
+    // Can't infer the size if the rank is unknown. It has to be at least a
+    // scalar though.
+    return size;
+  }
+  // If one of the dimensions is unknown statically, assume it's at least one.
+  for (int i = 0; i < shape.dim_size(); ++i) {
+    if (shape.dim(i).size() < 0) {
+      shape.mutable_dim(i)->set_size(1);
+    }
+  }
+  int64 num_elems = TensorShape(shape).num_elements();
+  return num_elems * size;
+}
+
+struct SwapInfo {
+  std::vector<int> inputs_to_swap;
+  Costs::NanoSeconds time_to_swap = 0;
+};
+
+static const NodeDef* FindSwapTrigger(
+    const NodeDef* node, const SwapInfo& swap_info,
+    const std::unordered_map<string, const NodeDef*>& name_map,
+    const std::unordered_map<const NodeDef*, Costs::NanoSeconds>&
+        execution_times) {
+  // max_trigger_time stores the time before which the swap operation needs to
+  // be started in order to load the data back onto the accelerator without
+  // delaying the downstream computation.
+  Costs::NanoSeconds max_trigger_time(0);
+  std::set<string> possible_inputs;
+  for (int i = 0; i < node->input_size(); ++i) {
+    const string input_node_name = NodeName(node->input(i));
+    auto it1 = name_map.find(input_node_name);
+    if (it1 == name_map.end()) {
+      return nullptr;
+    }
+    const NodeDef* input_node = it1->second;
+
+    auto it2 = execution_times.find(input_node);
+    if (it2 == execution_times.end()) {
+      return nullptr;
+    }
+    max_trigger_time = std::max(max_trigger_time, it2->second);
+    possible_inputs.insert(input_node_name);
+  }
+
+  for (const int i : swap_info.inputs_to_swap) {
+    const string input_node_name = NodeName(node->input(i));
+    possible_inputs.erase(input_node_name);
+  }
+  if (possible_inputs.empty()) {
+    return nullptr;
+  }
+
+  max_trigger_time -= swap_info.time_to_swap;
+
+  std::map<Costs::NanoSeconds, const NodeDef*> candidates;
+  while (!possible_inputs.empty()) {
+    const string input_node_name = *possible_inputs.begin();
+    possible_inputs.erase(possible_inputs.begin());
+    auto it1 = name_map.find(input_node_name);
+    if (it1 == name_map.end()) {
+      return nullptr;
+    }
+    const NodeDef* input_node = it1->second;
+    // Don't jump over frames, since adding a control dependency from one frame
+    // to the next isn't supported. Don't go through branches, since we don't
+    // know whether they'll be executed or not.
+    if (input_node->op() == "NextIteration" || input_node->op() == "Switch" ||
+        input_node->op() == "Merge") {
+      continue;
+    }
+    auto it2 = execution_times.find(input_node);
+    if (it2 == execution_times.end()) {
+      return nullptr;
+    }
+    if (it2->second < max_trigger_time) {
+      candidates[it2->second] = input_node;
+    } else {
+      for (const string& fanin : input_node->input()) {
+        possible_inputs.insert(NodeName(fanin));
+      }
+    }
+  }
+
+  // Select the candidate that will execute last, since we want to swap the data
+  // back at the last minute while still allowing enough time for data to be
+  // swapped back timely to feed the downstream nodes.
+  if (!candidates.empty()) {
+    return candidates.rbegin()->second;
+  }
+  return nullptr;
+}
+
+Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* optimized_graph) {
+  *optimized_graph = item.graph;
+
+  // Figure out what needs to be swapped;
+  std::unordered_map<NodeDef*, SwapInfo> nodes_to_swap;
+  for (auto& node : *optimized_graph->mutable_node()) {
+    if (node.attr().count("_swap_to_host") != 0) {
+      SwapInfo& swap_info = nodes_to_swap[&node];
+      const AttrValue& val = node.attr().at("_swap_to_host");
+      if (val.has_list()) {
+        for (int64 input_id : val.list().i()) {
+          swap_info.inputs_to_swap.push_back(input_id);
+        }
+      } else {
+        int64 input_id = val.i();
+        swap_info.inputs_to_swap.push_back(input_id);
+      }
+    }
+  }
+  if (nodes_to_swap.empty()) {
+    // Nothing to do.
+    return Status::OK();
+  }
+
+  {
+    // Estimate the size of the data to swap for each node.
+    GraphProperties properties(item);
+    TF_RETURN_IF_ERROR(properties.InferStatically());
+    for (auto& swap : nodes_to_swap) {
+      const NodeDef* node = swap.first;
+      std::vector<OpInfo::TensorProperties> props =
+          properties.GetInputProperties(node->name());
+      SwapInfo& swap_info = swap.second;
+      int64 bytes_to_swap = 0;
+      for (int64 input_id : swap_info.inputs_to_swap) {
+        const OpInfo::TensorProperties& t = props[input_id];
+        bytes_to_swap += EstimateSize(t);
+      }
+      // Let's assume we're going to swap over PCIe running at 16 GBps.
+      swap_info.time_to_swap = bytes_to_swap / 16;
+    }
+  }
+
+  std::unordered_map<const NodeDef*, Costs::NanoSeconds> execution_times;
+  TF_RETURN_IF_ERROR(
+      EstimateEarliestExecutionTimes(item, cluster, &execution_times));
+
+  std::unordered_map<string, const NodeDef*> name_map;
+  for (const auto& node : item.graph.node()) {
+    name_map[node.name()] = &node;
+  }
+
+  for (auto& swap : nodes_to_swap) {
+    NodeDef* node = swap.first;
+    SwapInfo& swap_info = swap.second;
+
+    // Make sure the tensor isn't swapped back in right away: look for node that
+    // will execute just before we need to swap the data back, and add a control
+    // dependency from that node to the swap node.
+    const NodeDef* trigger =
+        FindSwapTrigger(node, swap_info, name_map, execution_times);
+    if (!trigger) {
+      continue;
+    }
+    // Swap all the tensors that are marked with the 'swap_to_host' attribute.
+    for (int input_id : swap_info.inputs_to_swap) {
+      std::pair<NodeDef*, NodeDef*> swap_nodes =
+          BuildSwapPair(node, input_id, optimized_graph);
+      *swap_nodes.first->add_input() = node->input(input_id);
+      *node->mutable_input(input_id) = swap_nodes.second->name();
+
+      // Add the control dependency needed to delay the execution of the swap.
+      *swap_nodes.second->add_input() = strings::StrCat("^", trigger->name());
+    }
+  }
+
+  return Status::OK();
+}
+
+void MemoryOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
+                               const GraphDef& optimized_graph, double result) {
+  // Nothing to do for MemoryOptimizer.
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.h b/tensorflow/core/grappler/optimizers/memory_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfb24c05c99c3292647833db058591839b1a1d15
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.h
@@ -0,0 +1,52 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
+#define TENSORFLOW_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
+
+#include <vector>
+
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Swap tensors in and out of device memory.
+class MemoryOptimizer : public GraphOptimizer {
+ public:
+  MemoryOptimizer() {}
+  ~MemoryOptimizer() override {}
+
+  string name() const override { return "memory_optimizer"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* pruned_graph) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& pruned_graph, double result) override;
+};
+
+// Helper function to recompute a sub-graph (recomputed_source_nodes) on a
+// trigger. Edges from recomputed_source_nodes to target_nodes are changed to
+// start from the recomputed nodes.
+void RecomputeSubgraph(
+    const std::vector<const NodeDef*>& recomputed_source_nodes,
+    const string& recompute_trigger_node_name,
+    const std::vector<NodeDef*>& target_nodes, GraphDef* graph);
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4f8e22e1d8306ac2f1499cf8031e8fc669d8855
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -0,0 +1,179 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
+
+#include <vector>
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class RecomputeSubgraphTest : public ::testing::Test {};
+
+TEST_F(RecomputeSubgraphTest, SimpleSubgraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Const(s.WithOpName("a"), 1.f, {2, 3, 4});
+  Output b = ops::AddN(s.WithOpName("b"), {a});  // Recomputed
+  Output c = ops::AddN(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d"), {c});
+  Output e = ops::AddN(s.WithOpName("e"), {d, b});
+  Output f = ops::AddN(s.WithOpName("f"), {e, a});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  EXPECT_EQ(6, item.graph.node_size());
+  NodeMap pre_transform_node_map(&item.graph);
+  std::vector<const NodeDef*> recomputed_source_nodes;
+  recomputed_source_nodes.push_back(pre_transform_node_map.GetNode(b.name()));
+  std::vector<NodeDef*> target_nodes;
+  target_nodes.push_back(pre_transform_node_map.GetNode(e.name()));
+  RecomputeSubgraph(recomputed_source_nodes, d.name(), target_nodes,
+                    &item.graph);
+  NodeMap post_transform_node_map(&item.graph);
+  EXPECT_EQ(7, item.graph.node_size());
+  NodeDef* transformed_e = post_transform_node_map.GetNode(e.name());
+  EXPECT_EQ(2, transformed_e->input_size());
+  EXPECT_EQ("d", transformed_e->input(0));
+  EXPECT_EQ("Recomputed/b", transformed_e->input(1));
+  NodeDef* recomputed_b = post_transform_node_map.GetNode("Recomputed/b");
+  EXPECT_EQ(2, recomputed_b->input_size());
+  EXPECT_EQ("a", recomputed_b->input(0));
+  EXPECT_EQ("^d", recomputed_b->input(1).substr(0, 2));
+}
+
+TEST_F(RecomputeSubgraphTest, MultiNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Const(s.WithOpName("Conv"), 1.f, {2, 3, 4});
+  Output b = ops::AddN(s.WithOpName("BN"), {a});    // Recomputed
+  Output c = ops::AddN(s.WithOpName("ReLU"), {b});  // Recomputed
+  Output d = ops::AddN(s.WithOpName("Conv1"), {c});
+
+  Output trigger = ops::Const(s.WithOpName("BN1Grad"), 0.f, {2, 3, 4});
+  Output e = ops::AddN(s.WithOpName("Conv1Grad"), {trigger, c});
+  Output f = ops::AddN(s.WithOpName("ReLUGrad"), {e, c});
+  Output g = ops::AddN(s.WithOpName("BNGrad"), {f, a});
+  Output h = ops::AddN(s.WithOpName("ConvGrad"), {g});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  EXPECT_EQ(9, item.graph.node_size());
+  NodeMap pre_transform_node_map(&item.graph);
+  std::vector<const NodeDef*> recomputed_source_nodes;
+  recomputed_source_nodes.push_back(pre_transform_node_map.GetNode(b.name()));
+  recomputed_source_nodes.push_back(pre_transform_node_map.GetNode(c.name()));
+  std::vector<NodeDef*> target_nodes;
+  target_nodes.push_back(pre_transform_node_map.GetNode(e.name()));
+  target_nodes.push_back(pre_transform_node_map.GetNode(f.name()));
+  target_nodes.push_back(pre_transform_node_map.GetNode(g.name()));
+  RecomputeSubgraph(recomputed_source_nodes, trigger.name(), target_nodes,
+                    &item.graph);
+  NodeMap post_transform_node_map(&item.graph);
+  EXPECT_EQ(11, item.graph.node_size());
+  NodeDef* transformed_e = post_transform_node_map.GetNode(e.name());
+  EXPECT_EQ(2, transformed_e->input_size());
+  EXPECT_EQ("BN1Grad", transformed_e->input(0));
+  EXPECT_EQ("Recomputed/ReLU", transformed_e->input(1));
+  NodeDef* transformed_f = post_transform_node_map.GetNode(f.name());
+  EXPECT_EQ(2, transformed_f->input_size());
+  EXPECT_EQ("Conv1Grad", transformed_f->input(0));
+  EXPECT_EQ("Recomputed/ReLU", transformed_f->input(1));
+  NodeDef* transformed_g = post_transform_node_map.GetNode(g.name());
+  EXPECT_EQ(2, transformed_g->input_size());
+  EXPECT_EQ("ReLUGrad", transformed_g->input(0));
+  EXPECT_EQ("Conv", transformed_g->input(1));
+
+  NodeDef* recomputed_b = post_transform_node_map.GetNode("Recomputed/BN");
+  EXPECT_EQ(2, recomputed_b->input_size());
+  EXPECT_EQ("Conv", recomputed_b->input(0));
+  EXPECT_EQ("^BN1Grad", recomputed_b->input(1).substr(0, 8));
+  NodeDef* recomputed_c = post_transform_node_map.GetNode("Recomputed/ReLU");
+  EXPECT_EQ(2, recomputed_c->input_size());
+  EXPECT_EQ("Recomputed/BN", recomputed_c->input(0));
+  EXPECT_EQ("^BN1Grad", recomputed_c->input(1).substr(0, 8));
+}
+
+class MemoryOptimizerTest : public ::testing::Test {
+ public:
+  static VirtualCluster CreateVirtualCluster() {
+    DeviceProperties cpu_device;
+    cpu_device.set_type("CPU");
+    cpu_device.set_frequency(1000);
+    cpu_device.set_num_cores(4);
+    cpu_device.set_bandwidth(32);
+    std::unordered_map<string, DeviceProperties> devices;
+    devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
+    return VirtualCluster(devices);
+  }
+};
+
+TEST_F(MemoryOptimizerTest, SimpleSwapping) {
+  // Build a simple graph with an op that's marked for swapping.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::AddN(s.WithOpName("b"), {a});
+  Output c = ops::AddN(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d"), {c});
+  Output e = ops::AddN(s.WithOpName("e"), {b, d});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  EXPECT_EQ(5, item.graph.node_size());
+  EXPECT_EQ(NodeName(e.name()), item.graph.node(4).name());
+  AttrValue& val =
+      (*item.graph.mutable_node(4)->mutable_attr())["_swap_to_host"];
+  val.mutable_list()->add_i(0);
+
+  VirtualCluster cluster(CreateVirtualCluster());
+
+  MemoryOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(&cluster, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(7, output.node_size());
+  const NodeDef& new_e = output.node(4);
+  EXPECT_EQ(NodeName(e.name()), new_e.name());
+
+  EXPECT_EQ(2, new_e.input_size());
+  EXPECT_EQ(NodeName(d.name()), new_e.input(1));
+  EXPECT_EQ("swap_in_e_0", new_e.input(0));
+
+  const NodeDef& swap_out = output.node(5);
+  EXPECT_EQ("swap_out_e_0", swap_out.name());
+
+  const NodeDef& swap_in = output.node(6);
+  EXPECT_EQ("swap_in_e_0", swap_in.name());
+
+  EXPECT_EQ(NodeName(b.name()), swap_out.input(0));
+  EXPECT_EQ(NodeName(swap_out.name()), swap_in.input(0));
+  EXPECT_EQ("^c", swap_in.input(1));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 44a1f5bab928ca6f6a69962ca3abd0184a1ee28a..8bb7800df4e204c420e15898bc04ac941b8fbdeb 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -14,32 +14,96 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/grappler/optimizers/auto_parallel.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
+std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
+    const string& optimizer) {
+  VLOG(1) << "Adding graph optimization pass: " << optimizer;
+  std::unique_ptr<GraphOptimizer> graph_optimizer;
+  if (optimizer == "pruning") {
+    graph_optimizer.reset(new ModelPruner());
+  }
+  if (optimizer == "constfold") {
+    graph_optimizer.reset(new ConstantFolding());
+  }
+  if (optimizer == "layout") {
+    graph_optimizer.reset(new LayoutOptimizer());
+  }
+  if (optimizer == "memory") {
+    graph_optimizer.reset(new MemoryOptimizer());
+  }
+  if (optimizer == "autoparallel") {
+    graph_optimizer.reset(
+        new AutoParallel(cfg_.auto_parallel().num_replicas()));
+  }
+  return graph_optimizer;
+}
+
 Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                GraphDef* optimized_graph) {
-  bool already_optimized = false;
-  if (!cfg_.disable_model_pruning()) {
-    already_optimized = true;
-    ModelPruner pruner;
-    TF_RETURN_IF_ERROR(pruner.Optimize(nullptr, item, optimized_graph));
+  std::vector<std::unique_ptr<GraphOptimizer>> optimizers;
+  if (cfg_.optimizers().empty()) {
+    if (!cfg_.disable_model_pruning()) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
+    }
+    if (cfg_.constant_folding()) {
+      optimizers.push_back(
+          std::unique_ptr<GraphOptimizer>(new ConstantFolding()));
+    }
+    if (cfg_.optimize_tensor_layout()) {
+      optimizers.push_back(
+          std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
+    }
+    if (cfg_.memory_optimization() > 0) {
+      optimizers.push_back(
+          std::unique_ptr<GraphOptimizer>(new MemoryOptimizer()));
+    }
+    if (cfg_.auto_parallel().enable()) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new AutoParallel(cfg_.auto_parallel().num_replicas())));
+    }
+  } else {
+    std::set<string> available_optimizers = {"pruning", "constfold", "layout",
+                                             "memory", "autoparallel"};
+    for (const auto& optimizer : cfg_.optimizers()) {
+      if (available_optimizers.find(optimizer) != available_optimizers.end()) {
+        optimizers.push_back(NewOptimizer(optimizer));
+      }
+    }
   }
-  if (cfg_.optimize_tensor_layout()) {
-    LayoutOptimizer layout_optimizer;
+
+  if (optimizers.empty()) {
+    *optimized_graph = item.graph;
+    return Status::OK();
+  }
+
+  bool already_optimized = false;
+  for (const auto& optimizer : optimizers) {
     if (!already_optimized) {
-      return layout_optimizer.Optimize(nullptr, item, optimized_graph);
+      TF_RETURN_IF_ERROR(optimizer->Optimize(cluster, item, optimized_graph));
+      already_optimized = true;
     } else {
       GrapplerItem optimized_item = item;
       optimized_item.graph = *optimized_graph;
-      return layout_optimizer.Optimize(nullptr, optimized_item,
-                                       optimized_graph);
+      TF_RETURN_IF_ERROR(
+          optimizer->Optimize(cluster, optimized_item, optimized_graph));
     }
   }
+  TopologicalSort(optimized_graph);
+  // Copy the graph version.
+  *optimized_graph->mutable_versions() = item.graph.versions();
+
   return Status::OK();
 }
 
@@ -49,13 +113,14 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
 }
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
-  return cfg.optimize_tensor_layout();
+  return cfg.optimize_tensor_layout() || cfg.constant_folding() ||
+         cfg.auto_parallel().enable() || !cfg.optimizers().empty();
 }
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
-                        GraphDef* optimized_graph) {
+                        Cluster* cluster, GraphDef* optimized_graph) {
   MetaOptimizer optimizer(cfg);
-  return optimizer.Optimize(nullptr, item, optimized_graph);
+  return optimizer.Optimize(cluster, item, optimized_graph);
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index d7ff03f5907d3647cb81a32c6fcfeb70611b4330..6b950c973d9a2db04675aeee26e5f70e0371f400 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -39,13 +39,14 @@ class MetaOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
+  std::unique_ptr<GraphOptimizer> NewOptimizer(const string& optimizer);
   RewriterConfig cfg_;
 };
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg);
 
 Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
-                        GraphDef* optimized_graph);
+                        Cluster* cluster, GraphDef* optimized_graph);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index a89831b6e6481f49f662e6e1b677ee5da4aa087c..efa216383696eb035291e7f9251ec9516fd0ebb4 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -46,29 +46,32 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (nodes_to_preserve.find(node.name()) != nodes_to_preserve.end()) {
       continue;
     }
-    // Don't remove nodes that are explicitely placed.
+    // Don't remove nodes that are explicitly placed.
     if (!node.device().empty()) {
       continue;
     }
     // Don't remove nodes that drive control dependencies.
-    if (!rewriter.DrivesControlDependency(node)) {
+    // Don't remove nodes that are driven by control dependencies either since
+    // we can't ensure (yet) that we won't increase the number of control
+    // dependency edges by deleting them (for example, removing a node driven by
+    // 10 control edges and driving 10 control edges would result in the
+    // creation of 100 edges).
+    if (!rewriter.DrivesControlDependency(node) &&
+        !rewriter.IsDrivenByControlDependency(node)) {
       nodes_to_delete.insert(&node);
     }
   }
 
   for (auto& node : item.graph.node()) {
-    if (nodes_to_delete.find(&node) != nodes_to_delete.end()) {
-      continue;
-    }
     NodeDef* new_node = pruned_graph->add_node();
     *new_node = node;
     new_node->clear_input();
     rewriter.ForwardInputs(node, nodes_to_delete, new_node);
   }
 
-  LOG(INFO) << "Pruned " << nodes_to_delete.size()
-            << " nodes from the graph. The graph now contains "
-            << pruned_graph->node_size() << " nodes.";
+  VLOG(1) << "Pruned " << nodes_to_delete.size()
+          << " nodes from the graph. The graph now contains "
+          << pruned_graph->node_size() << " nodes.";
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.h b/tensorflow/core/grappler/optimizers/model_pruner.h
index 3956d33961389dd8162ab4fd0154a6eb448d8d7e..3d76aebef433f126c660a2861c3ee4a1d18b4c6f 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.h
+++ b/tensorflow/core/grappler/optimizers/model_pruner.h
@@ -22,7 +22,7 @@ namespace tensorflow {
 namespace grappler {
 
 // Prune a model to make it more efficient:
-// * Remove unecessary operations.
+// * Remove unnecessary operations.
 // * Optimize gradient computations.
 class ModelPruner : public GraphOptimizer {
  public:
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index 47d45a6f49090f8268dd7ad2709332a15d9fddca..67954d291461084925b1ad1b44b2a1bf7dbc0f5b 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -70,16 +70,22 @@ TEST_F(ModelPrunerTest, StopGradientPruning) {
   Status status = pruner.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(3, output.node_size());
+  EXPECT_EQ(5, output.node_size());
   const NodeDef& new_a = output.node(0);
   EXPECT_EQ(NodeName(a.name()), new_a.name());
   const NodeDef& new_b = output.node(1);
   EXPECT_EQ(NodeName(b.name()), new_b.name());
-  const NodeDef& new_e = output.node(2);
+  const NodeDef& new_c = output.node(2);
+  EXPECT_EQ(NodeName(c.name()), new_c.name());
+  const NodeDef& new_d = output.node(3);
+  EXPECT_EQ(NodeName(d.name()), new_d.name());
+  const NodeDef& new_e = output.node(4);
   EXPECT_EQ(NodeName(e.name()), new_e.name());
 
   EXPECT_EQ(1, new_e.input_size());
   EXPECT_EQ(NodeName(b.name()), new_e.input(0));
+  EXPECT_EQ(1, new_d.input_size());
+  EXPECT_EQ(NodeName(b.name()), new_d.input(0));
 }
 
 TEST_F(ModelPrunerTest, IdentityPruning) {
@@ -104,18 +110,22 @@ TEST_F(ModelPrunerTest, IdentityPruning) {
   Status status = pruner.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(4, output.node_size());
+  EXPECT_EQ(5, output.node_size());
   const NodeDef& new_a = output.node(0);
   EXPECT_EQ(NodeName(a.name()), new_a.name());
   const NodeDef& new_b = output.node(1);
   EXPECT_EQ(NodeName(b.name()), new_b.name());
   const NodeDef& new_c = output.node(2);
   EXPECT_EQ(NodeName(c.name()), new_c.name());
-  const NodeDef& new_e = output.node(3);
+  const NodeDef& new_d = output.node(3);
+  EXPECT_EQ(NodeName(d.name()), new_d.name());
+  const NodeDef& new_e = output.node(4);
   EXPECT_EQ(NodeName(e.name()), new_e.name());
 
   EXPECT_EQ(1, new_e.input_size());
   EXPECT_EQ(NodeName(c.name()), new_e.input(0));
+  EXPECT_EQ(1, new_d.input_size());
+  EXPECT_EQ(NodeName(c.name()), new_d.input(0));
 }
 
 TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) {
@@ -142,14 +152,16 @@ TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) {
   Status status = pruner.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(4, output.node_size());
+  EXPECT_EQ(5, output.node_size());
   const NodeDef& new_a = output.node(0);
   EXPECT_EQ(NodeName(a.name()), new_a.name());
   const NodeDef& new_b = output.node(1);
   EXPECT_EQ(NodeName(b.name()), new_b.name());
   const NodeDef& new_c = output.node(2);
   EXPECT_EQ(NodeName(c.name()), new_c.name());
-  const NodeDef& new_e = output.node(3);
+  const NodeDef& new_d = output.node(3);
+  EXPECT_EQ(NodeName(d.name()), new_d.name());
+  const NodeDef& new_e = output.node(4);
   EXPECT_EQ(NodeName(e.name()), new_e.name());
 
   EXPECT_EQ(2, new_e.input_size());
@@ -157,7 +169,7 @@ TEST_F(ModelPrunerTest, PruningSkipsCtrlDependencies) {
   EXPECT_EQ("^c", new_e.input(1));
 }
 
-TEST_F(ModelPrunerTest, PruningForwardsCtrlDependencies) {
+TEST_F(ModelPrunerTest, PruningPerservesCtrlDependencies) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
@@ -183,20 +195,28 @@ TEST_F(ModelPrunerTest, PruningForwardsCtrlDependencies) {
   Status status = pruner.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(4, output.node_size());
+  EXPECT_EQ(6, output.node_size());
   const NodeDef& new_a = output.node(0);
   EXPECT_EQ(NodeName(a.name()), new_a.name());
   const NodeDef& new_b = output.node(1);
   EXPECT_EQ(NodeName(b.name()), new_b.name());
   const NodeDef& new_c = output.node(2);
   EXPECT_EQ(NodeName(c.name()), new_c.name());
-  const NodeDef& new_f = output.node(3);
+  const NodeDef& new_d = output.node(3);
+  EXPECT_EQ(NodeName(d.name()), new_d.name());
+  const NodeDef& new_e = output.node(4);
+  EXPECT_EQ(NodeName(e.name()), new_e.name());
+  const NodeDef& new_f = output.node(5);
   EXPECT_EQ(NodeName(f.name()), new_f.name());
 
-  EXPECT_EQ(3, new_f.input_size());
-  EXPECT_EQ(NodeName(c.name()), new_f.input(0));
-  EXPECT_EQ("^b", new_f.input(1));
-  EXPECT_EQ("^c", new_f.input(2));
+  EXPECT_EQ(1, new_f.input_size());
+  EXPECT_EQ(NodeName(e.name()), new_f.input(0));
+  EXPECT_EQ(2, new_e.input_size());
+  EXPECT_EQ(NodeName(d.name()), new_e.input(0));
+  EXPECT_EQ("^c", new_e.input(1));
+  EXPECT_EQ(2, new_d.input_size());
+  EXPECT_EQ(NodeName(c.name()), new_d.input(0));
+  EXPECT_EQ("^b", new_d.input(1));
 }
 
 TEST_F(ModelPrunerTest, PruningPerservesFetch) {
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.cc b/tensorflow/core/grappler/optimizers/static_schedule.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e31499eac66a9ecf350a2de6fc15b68662499854
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/static_schedule.cc
@@ -0,0 +1,123 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/static_schedule.h"
+#include <deque>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+namespace grappler {
+
+static Costs::NanoSeconds PredictExecutionTime(
+    const GraphProperties& properties, const OpLevelCostEstimator& estimator,
+    const VirtualPlacer& placer, const NodeDef& node) {
+  OpInfo op_features;
+  op_features.set_op(node.op());
+  *op_features.mutable_attr() = node.attr();
+
+  std::vector<OpInfo::TensorProperties> inputs =
+      properties.GetInputProperties(node.name());
+  for (auto& input : inputs) {
+    op_features.add_inputs()->Swap(&input);
+  }
+
+  DeviceProperties device = placer.get_device(node);
+  op_features.mutable_device()->Swap(&device);
+
+  Costs::NanoSeconds estimate =
+      estimator.PredictCosts(op_features).execution_time;
+
+  // Make sure our estimates are at least one nanosecond per node.
+  return std::max(estimate, Costs::NanoSeconds(1));
+}
+
+Status EstimateEarliestExecutionTimes(
+    const GrapplerItem& item, const Cluster* cluster,
+    std::unordered_map<const NodeDef*, Costs::NanoSeconds>* completion_times) {
+  std::unordered_map<string, const NodeDef*> name_map;
+  std::unordered_map<const NodeDef*, int> pending_inputs;
+  std::deque<const NodeDef*> ready_nodes;
+  for (const NodeDef& node : item.graph.node()) {
+    name_map[node.name()] = &node;
+    if (node.input_size() == 0) {
+      ready_nodes.push_back(&node);
+      (*completion_times)[&node] = 0;
+    } else if (IsMerge(node)) {
+      // Merge nodes are processed as soon as one of the input becomes
+      // available.
+      pending_inputs[&node] = 1;
+    } else {
+      pending_inputs[&node] = node.input_size();
+    }
+  }
+
+  std::unordered_map<const NodeDef*, std::vector<const NodeDef*>> fanouts;
+  for (const NodeDef& node : item.graph.node()) {
+    for (const string& input : node.input()) {
+      string node_name = NodeName(input);
+      auto it = name_map.find(node_name);
+      if (it == name_map.end()) {
+        return errors::InvalidArgument(
+            strings::StrCat("Unknown input node ", input));
+      }
+      const NodeDef* fanin = it->second;
+      fanouts[fanin].push_back(&node);
+    }
+  }
+  name_map.clear();
+
+  GraphProperties properties(item);
+  TF_RETURN_IF_ERROR(properties.InferStatically());
+  OpLevelCostEstimator estimator;
+  VirtualPlacer placer(cluster);
+
+  while (!ready_nodes.empty()) {
+    const NodeDef* node = ready_nodes.front();
+    ready_nodes.pop_front();
+
+    Costs::NanoSeconds execution_time =
+        PredictExecutionTime(properties, estimator, placer, *node);
+    Costs::NanoSeconds completion_time =
+        execution_time + (*completion_times)[node];
+    (*completion_times)[node] = completion_time;
+
+    for (const NodeDef* fanout : fanouts[node]) {
+      int pending = pending_inputs[fanout];
+      if (pending == 0) {
+        // Already processed. Avoid going through loops more than once.
+        continue;
+      } else if (pending == 1) {
+        ready_nodes.push_back(fanout);
+      }
+      pending_inputs[fanout]--;
+
+      Costs::NanoSeconds ready_time =
+          std::max(completion_time, (*completion_times)[fanout]);
+      (*completion_times)[fanout] = ready_time;
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.h b/tensorflow/core/grappler/optimizers/static_schedule.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dd82b0dab1248a1b99e952d2825acb90a13b0bb
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/static_schedule.h
@@ -0,0 +1,41 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
+
+#include <unordered_map>
+
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Compute the earliest time as which the execution of each node in the graph
+// can complete.
+// In our estimation, we ensure that each node takes at least one nanosecond to
+// execute: therefore the execution times can be used to derive a topological
+// ordering of the graph (at least as long as there is no loop in the graph).
+Status EstimateEarliestExecutionTimes(
+    const GrapplerItem& item, const Cluster* cluster,
+    std::unordered_map<const NodeDef*, Costs::NanoSeconds>* execution_times);
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_STATIC_SCHEDULE_H_
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f53feaca4c396f3927689c135cfa6fcb4d578154
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -0,0 +1,126 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/static_schedule.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class StaticScheduleTest : public ::testing::Test {
+ public:
+  VirtualCluster CreateVirtualCluster() const {
+    // Invent a CPU so that predictions remain the same from machine to machine.
+    DeviceProperties cpu_device;
+    cpu_device.set_type("CPU");
+    cpu_device.set_frequency(1000);
+    cpu_device.set_num_cores(4);
+    cpu_device.set_bandwidth(32);
+    cpu_device.set_l1_cache_size(32 * 1024);
+    cpu_device.set_l2_cache_size(256 * 1024);
+    cpu_device.set_l3_cache_size(4 * 1024 * 1024);
+    std::unordered_map<string, DeviceProperties> devices;
+    devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
+    return VirtualCluster(devices);
+  }
+};
+
+TEST_F(StaticScheduleTest, BasicGraph) {
+  // This trivial graph is so basic there's nothing to prune.
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  VirtualCluster cluster(CreateVirtualCluster());
+
+  std::unordered_map<const NodeDef*, Costs::NanoSeconds> completion_times;
+  Status status =
+      EstimateEarliestExecutionTimes(item, &cluster, &completion_times);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size(), completion_times.size());
+
+  for (auto time : completion_times) {
+    if (time.first->name() == "Const/Const") {
+      EXPECT_EQ(Costs::NanoSeconds(1), time.second);
+    } else if (time.first->name() == "x") {
+      EXPECT_EQ(Costs::NanoSeconds(250001), time.second);
+    } else if (time.first->name() == "AddN") {
+      EXPECT_EQ(Costs::NanoSeconds(1500001), time.second);
+    } else if (time.first->name() == "AddN_1") {
+      EXPECT_EQ(Costs::NanoSeconds(2750001), time.second);
+    } else if (time.first->name() == "AddN_2") {
+      EXPECT_EQ(Costs::NanoSeconds(4000001), time.second);
+    } else if (time.first->name() == "AddN_3") {
+      EXPECT_EQ(Costs::NanoSeconds(5250001), time.second);
+    } else if (time.first->name() == "y") {
+      EXPECT_EQ(Costs::NanoSeconds(6500001), time.second);
+    }
+  }
+}
+
+TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
+  // Build a simple graph with a control dependency.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::AddN(s.WithOpName("b"), {a});
+  Output c = ops::Identity(s.WithOpName("c"), b);
+  Output d = ops::Identity(s.WithOpName("d"), c);
+  Output e = ops::AddN(s.WithOpName("e"), {d});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Add a control dependency between c and e.
+  EXPECT_EQ("c", item.graph.node(2).name());
+  EXPECT_EQ("e", item.graph.node(4).name());
+  *item.graph.mutable_node(4)->add_input() = "^c";
+
+  VirtualCluster cluster(CreateVirtualCluster());
+
+  std::unordered_map<const NodeDef*, Costs::NanoSeconds> completion_times;
+  Status status =
+      EstimateEarliestExecutionTimes(item, &cluster, &completion_times);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size(), completion_times.size());
+
+  for (auto time : completion_times) {
+    if (time.first->name() == "a") {
+      EXPECT_EQ(Costs::NanoSeconds(1), time.second);
+    } else if (time.first->name() == "b") {
+      EXPECT_EQ(Costs::NanoSeconds(12500001), time.second);
+    } else if (time.first->name() == "c") {
+      EXPECT_EQ(Costs::NanoSeconds(12500002), time.second);
+    } else if (time.first->name() == "d") {
+      EXPECT_EQ(Costs::NanoSeconds(12500003), time.second);
+    } else if (time.first->name() == "e") {
+      EXPECT_EQ(Costs::NanoSeconds(25000003), time.second);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index a56961cd954d5dc34139ef50a3a7170ab7b6f576..06ef61a9613656b5239905a01a787d0b105c1c62 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -97,10 +97,10 @@ int NodePosition(const string& name) {
 string AddPrefixToNodeName(const string& name, const string& prefix) {
   if (!name.empty()) {
     if (name[0] == '^') {
-      return strings::StrCat("^", prefix, "-", name.substr(1));
+      return strings::StrCat("^", prefix, "/", name.substr(1));
     }
   }
-  return strings::StrCat(prefix, "-", name);
+  return strings::StrCat(prefix, "/", name);
 }
 
 bool ExecuteWithTimeout(std::function<void()> fn, const int64 timeout_in_ms,
@@ -116,10 +116,7 @@ bool ExecuteWithTimeout(std::function<void()> fn, const int64 timeout_in_ms,
   });
   const bool notified =
       WaitForNotificationWithTimeout(done.get(), timeout_in_ms * 1000);
-  if (!notified) {
-    return false;
-  }
-  return true;
+  return notified;
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 17b980c5b8c3b47567c49992fca57e6bf01d67f6..0fb531ef1bd2a86433b3e7c26be7eda97b37ae37 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -34,7 +34,7 @@ class NodeMap {
   NodeDef* GetNode(const string& name);
   std::set<NodeDef*> GetOutputs(const string& node_name);
   // This method doesn't record the outputs of the added node; the outputs need
-  // to be explictly added by the AddOutput method.
+  // to be explicitly added by the AddOutput method.
   void AddNode(const string& name, NodeDef* node);
   void AddOutput(const string& node, const string& output);
   void UpdateOutput(const string& node, const string& old_output,
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e1db1a8cd29633c4f8054a159e955606e58e2a10
--- /dev/null
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -0,0 +1,64 @@
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+cc_library(
+    name = "scc",
+    srcs = ["scc.cc"],
+    hdrs = ["scc.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+cc_test(
+    name = "scc_test",
+    srcs = ["scc_test.cc"],
+    deps = [
+        ":scc",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
+)
+
+cc_library(
+    name = "topological_sort",
+    srcs = ["topological_sort.cc"],
+    hdrs = ["topological_sort.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+cc_test(
+    name = "topological_sort_test",
+    srcs = ["topological_sort_test.cc"],
+    deps = [
+        ":topological_sort",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/scc.cc b/tensorflow/core/grappler/utils/scc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6568e99aa3f6c1c690d689653f0cd9fb16f82673
--- /dev/null
+++ b/tensorflow/core/grappler/utils/scc.cc
@@ -0,0 +1,176 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/scc.h"
+#include <stack>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Data structure used to store data for Tarjan's Strongly Connected
+// Components algorithm.
+struct SCCNodeData {
+  SCCNodeData()
+      : node(nullptr),
+        index(-1),
+        lowlink(-1),
+        onstack(false),
+        caller(nullptr),
+        caller_loop_location(-1) {}
+  void ResetStack(int new_index, SCCNodeData* new_caller) {
+    index = new_index;
+    lowlink = new_index;
+    onstack = true;
+    caller = new_caller;
+    caller_loop_location = 0;
+  }
+  const NodeDef* node;
+  int index;
+  int lowlink;
+  bool onstack;
+  std::vector<SCCNodeData*> children;
+  // StrongConnect "call stack" storage.
+  SCCNodeData* caller;       // Node calling StrongConnect
+  int caller_loop_location;  // Index in parent StrongConnect for loop
+};
+
+// Core DFS step of Tarjan's Strongly Connected Component algorithm
+// (implemented using iteration instead of recursion).
+void StrongConnect(SCCNodeData* v, std::stack<SCCNodeData*>* stack, int* index,
+                   std::unordered_map<const NodeDef*, int>* components,
+                   int* scc_index) {
+  // Iterative version of Tarjan's StrongConnect function.
+  // The "call stack" state is composed of a SCCNodeData's caller and
+  // caller_loop_location properties.
+  v->ResetStack(*index /* index */, nullptr /* caller */);
+  ++*index;
+  stack->push(v);
+
+  // No one put v on a StrongConnect call stack, reset caller values.
+  v->caller = nullptr;
+  v->caller_loop_location = 0;
+
+  SCCNodeData* last = v;
+  while (true) {
+    if (last->caller_loop_location < last->children.size()) {
+      // Recursive equivalent: Looping over the children of v (possibly
+      // continuing at v->caller_loop_location after having finished a
+      // recursive call.
+      SCCNodeData* w = last->children[last->caller_loop_location];
+      ++(last->caller_loop_location);  // For loop iterator increment
+      if (w->index == -1) {
+        w->ResetStack(*index /* index */, last /* caller */);
+        ++*index;
+        stack->push(w);
+        last = w;
+      } else if (w->onstack == true) {
+        last->lowlink = std::min(last->lowlink, w->index);
+      }
+    } else {
+      // At the end of v's children
+      if (last->lowlink == last->index) {
+        // v is the root of a strongly connected component
+        SCCNodeData* top;
+        while (true) {
+          top = stack->top();
+          stack->pop();
+          top->onstack = false;
+          (*components)[top->node] = *scc_index;
+          if (top == last) {
+            break;
+          }
+        }
+        ++*scc_index;
+      }
+
+      // Go up the recursive call stack
+      SCCNodeData* next_last = last->caller;
+      if (next_last == nullptr) {
+        // All nodes have been seen; finished.
+        break;
+      } else {
+        next_last->lowlink = std::min(next_last->lowlink, last->lowlink);
+        last = next_last;
+      }
+    }
+  }
+}
+
+// This is an implementation of Tarjan's Strongly Connected Components
+// DFS algorithm.  Most of the hard work is done in the function
+// StrongConnect, which is an iterative reimplementation of the
+// recursive version described here:
+//   https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
+//
+// The edges for the purpose of this algorithm are directed from input
+// to op (the reverse of the declarations of the NodeDef, which
+// contain in-edges)
+void StronglyConnectedComponents(
+    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* components,
+    int* num_components) {
+  std::stack<SCCNodeData*> stack;
+  std::unordered_map<string, SCCNodeData*> name_to_data;
+  std::vector<SCCNodeData> node_data_container;
+  node_data_container.reserve(graph.node_size());
+  std::unordered_map<const NodeDef*, SCCNodeData*> node_to_data;
+
+  for (const NodeDef& node : graph.node()) {
+    SCCNodeData node_data;
+    node_data.node = &node;
+    node_data_container.push_back(node_data);
+    name_to_data[node.name()] = &(*node_data_container.rbegin());
+    node_to_data[&node] = &(*node_data_container.rbegin());
+  }
+
+  // Create a list of top-level parents (add them to object queue)
+  // Also create a mapping from nodes to their children.
+  for (const NodeDef& node : graph.node()) {
+    for (const string& input : node.input()) {
+      name_to_data[NodeName(input)]->children.push_back(node_to_data[&node]);
+    }
+  }
+
+  components->clear();
+  *num_components = 0;
+  int index = 0;
+  for (auto& v : node_data_container) {
+    if (v.index == -1) {
+      // Node has not yet been visited.  Start a DFS at v.
+      StrongConnect(&v, &stack, &index, components, num_components);
+    }
+  }
+
+  std::vector<int> counts_per_component(*num_components, 0);
+  for (auto& component : *components) {
+    DCHECK(component.second >= 0);
+    DCHECK(component.second < *num_components);
+    counts_per_component[component.second]++;
+  }
+  for (auto& component : *components) {
+    if (counts_per_component[component.second] == 1) {
+      component.second = -1;
+      (*num_components)--;
+    }
+  }
+  (*num_components) += 1;
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/scc.h b/tensorflow/core/grappler/utils/scc.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b0577763d66817e99eb62e9f517b12bd07aea79
--- /dev/null
+++ b/tensorflow/core/grappler/utils/scc.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
+
+#include <unordered_map>
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Compute modified strongly connected components:
+// All nodes that are not part of a loop are assigned the special -1 id
+// All nodes that are part of at least one loop are assigned a positive
+// component id: if 2 nodes v and w are reachable from one another (i.e. if they
+// belong to the same scc), they'll be assigned the same id, otherwise they'll
+// be assigned distinct ids. Returns the number of distinct ids.
+void StronglyConnectedComponents(
+    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* components,
+    int* num_ids);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_SCC_H_
diff --git a/tensorflow/core/grappler/utils/scc_test.cc b/tensorflow/core/grappler/utils/scc_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3185cbe232631d70e8a79b31168ca39b53e62272
--- /dev/null
+++ b/tensorflow/core/grappler/utils/scc_test.cc
@@ -0,0 +1,410 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/scc.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class SCCTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    std::unordered_map<string, DeviceProperties> devices;
+    DeviceProperties unknown_device;
+    devices["MY_DEVICE"] = unknown_device;
+    cluster_.reset(new VirtualCluster(devices));
+    TF_CHECK_OK(cluster_->Provision());
+  }
+
+  void TearDown() override { cluster_.reset(); }
+
+ protected:
+  static NodeDef CreateNode(const string& name,
+                            gtl::ArraySlice<string> inputs) {
+    NodeDef node;
+    node.set_name(name);
+    for (const string& input : inputs) {
+      node.add_input(input);
+    }
+    return node;
+  }
+
+  std::unique_ptr<VirtualCluster> cluster_;
+};
+
+TEST_F(SCCTest, NoLoops) {
+  // Create a simple graph without any loop.
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
+                                          cluster_->GetDeviceNames());
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  std::unordered_map<const NodeDef*, int> components;
+  int num_components;
+  StronglyConnectedComponents(item.graph, &components, &num_components);
+
+  EXPECT_EQ(num_components, 1);
+  for (const auto& node : item.graph.node()) {
+    EXPECT_EQ(-1, components[&node]);
+  }
+}
+
+TEST_F(SCCTest, DisjointCycleAndPath) {
+  GraphDef graph;
+  // Create a cycle
+  *graph.add_node() = CreateNode("a", {"d"});
+  *graph.add_node() = CreateNode("b", {"a"});
+  *graph.add_node() = CreateNode("c", {"b"});
+  *graph.add_node() = CreateNode("d", {"c"});
+
+  // Add a path disjoint from cycle
+  *graph.add_node() = CreateNode("e", {});
+  *graph.add_node() = CreateNode("f", {"e"});
+  *graph.add_node() = CreateNode("g", {"f"});
+  *graph.add_node() = CreateNode("h", {"g"});
+
+  std::vector<const NodeDef*> nodes;
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (const auto& n : graph.node()) {
+    nodes.push_back(&n);
+    name_to_node[n.name()] = &n;
+  }
+
+  int num_components;
+  std::unordered_map<const NodeDef*, int> components;
+  StronglyConnectedComponents(graph, &components, &num_components);
+
+  EXPECT_EQ(num_components, 2);
+
+  for (const auto& pair : {std::make_pair("a", "b"), std::make_pair("a", "c"),
+                           std::make_pair("a", "d")}) {
+    EXPECT_EQ(components[name_to_node[pair.first]],
+              components[name_to_node[pair.second]]);
+  }
+
+  for (const auto& node : {"e", "f", "g", "h"})
+    EXPECT_EQ(-1, components[name_to_node[node]]);
+}
+}  // namespace
+
+TEST_F(SCCTest, WikipediaExample) {
+  // Graph with 4 SCCs:
+
+  // SCC1:
+  // a -> b
+  // b -> c
+  // c -> a
+
+  // d -> b
+  // d -> c
+
+  // SCC2:
+  // d -> e
+  // e -> d
+
+  // e -> f
+  // f -> c
+
+  // SCC3:
+  // f -> g
+  // g -> f
+
+  // h -> g
+  // h -> d
+
+  // SCC4:
+  // h -> h
+
+  // NodeDefs define inbound connections (inputs)
+  GraphDef graph;
+  *graph.add_node() = CreateNode("a", {"c"});
+  *graph.add_node() = CreateNode("b", {"a", "d"});
+  *graph.add_node() = CreateNode("c", {"b", "d", "f"});
+  *graph.add_node() = CreateNode("d", {"e"});
+  *graph.add_node() = CreateNode("e", {"d"});
+  *graph.add_node() = CreateNode("f", {"e", "g"});
+  *graph.add_node() = CreateNode("g", {"f", "h"});
+  *graph.add_node() = CreateNode("h", {"h"});
+
+  std::vector<const NodeDef*> nodes;
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (const auto& n : graph.node()) {
+    nodes.push_back(&n);
+    name_to_node[n.name()] = &n;
+  }
+
+  int num_components;
+  std::unordered_map<const NodeDef*, int> components;
+  StronglyConnectedComponents(graph, &components, &num_components);
+
+  EXPECT_EQ(num_components, 4);
+  for (const auto& pair :
+       {std::make_pair("a", "b"), std::make_pair("a", "c"),
+        std::make_pair("d", "e"), std::make_pair("f", "g")}) {
+    EXPECT_EQ(components[name_to_node[pair.first]],
+              components[name_to_node[pair.second]]);
+  }
+
+  for (const auto& pair :
+       {std::make_pair("a", "d"), std::make_pair("a", "f"),
+        std::make_pair("a", "h"), std::make_pair("d", "f"),
+        std::make_pair("d", "h"), std::make_pair("f", "h")}) {
+    EXPECT_NE(components[name_to_node[pair.first]],
+              components[name_to_node[pair.second]]);
+  }
+}
+
+TEST_F(SCCTest, TensorFlowLoop) {
+  // Test graph produced in python using:
+  /*
+     with tf.Graph().as_default():
+       i = tf.constant(0)
+       c = lambda i: tf.less(i, 10)
+       b = lambda i: tf.add(i, 1)
+       r = tf.while_loop(c, b, [i])
+       with open('/tmp/graph.txt', 'w') as f:
+         f.write(str(tf.get_default_graph().as_graph_def()))
+  */
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/Add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/Add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/Add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+versions {
+  producer: 11
+}
+  )EOF";
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+
+  std::unordered_map<const NodeDef*, int> components;
+  int num_components;
+  StronglyConnectedComponents(item.graph, &components, &num_components);
+
+  EXPECT_EQ(num_components, 2);
+  for (const auto& node : item.graph.node()) {
+    if (node.name() == "Const" || node.name() == "while/Enter" ||
+        node.name() == "while/Exit") {
+      // These nodes are not part of the loop, they should be assigned the id
+      // -1.
+      EXPECT_EQ(-1, components[&node]);
+    } else {
+      // These nodes are part of the loop, they should be assigned a positive
+      // id.
+      EXPECT_LE(0, components[&node]);
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
new file mode 100644
index 0000000000000000000000000000000000000000..131756fc5c2b2f7090934e791d6dfa7acf7ccfa7
--- /dev/null
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -0,0 +1,64 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include <deque>
+#include <unordered_map>
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Kahn's algorithm is implemented.
+// For details, see https://en.wikipedia.org/wiki/Topological_sorting
+void TopologicalSort(GraphDef* graph) {
+  NodeMap node_map(graph);
+  std::deque<const NodeDef*> ready_nodes;
+  std::unordered_map<const NodeDef*, int> ready_inputs;
+  for (const NodeDef& node : graph->node()) {
+    if (node.input_size() == 0) {
+      ready_nodes.push_back(&node);
+    }
+    if (node.op() == "Merge") {
+      ready_inputs[&node] = 0;
+      for (const auto& input : node.input()) {
+        if (node_map.GetNode(input)->op() == "NextIteration") {
+          ready_inputs[&node]++;
+        }
+      }
+    } else {
+      ready_inputs[&node] = 0;
+    }
+  }
+  GraphDef sorted_graph;
+  while (!ready_nodes.empty()) {
+    auto ready_node = ready_nodes.front();
+    *sorted_graph.add_node() = *ready_node;
+    for (const auto& fanout : node_map.GetOutputs(ready_node->name())) {
+      ready_inputs[fanout]++;
+      if (ready_inputs[fanout] == fanout->input_size()) {
+        ready_nodes.push_back(fanout);
+      }
+    }
+    ready_nodes.pop_front();
+  }
+  if (sorted_graph.node_size() == graph->node_size()) {
+    *graph = sorted_graph;
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4d8034ef577a0282dbce161aed8ba440bf248ab
--- /dev/null
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
+
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Sort a graph in topological order.
+void TopologicalSort(GraphDef* graph);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55f66b273496c53d9450626ee0c896e725415a48
--- /dev/null
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class TopologicalSortTest : public ::testing::Test {
+ protected:
+  static NodeDef CreateNode(const string& name,
+                            const std::vector<string>& inputs) {
+    return CreateNode(name, "", inputs);
+  }
+  static NodeDef CreateNode(const string& name, const string& op,
+                            const std::vector<string>& inputs) {
+    NodeDef node;
+    node.set_name(name);
+    if (!op.empty()) {
+      node.set_op(op);
+    }
+    for (const string& input : inputs) {
+      node.add_input(input);
+    }
+    return node;
+  }
+};
+
+TEST_F(TopologicalSortTest, NoLoop) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("2", {"5"});
+  *graph.add_node() = CreateNode("0", {"5", "4"});
+  *graph.add_node() = CreateNode("1", {"4", "3"});
+  *graph.add_node() = CreateNode("3", {"2"});
+  *graph.add_node() = CreateNode("5", {});
+  *graph.add_node() = CreateNode("4", {});
+
+  TopologicalSort(&graph);
+  std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
+TEST_F(TopologicalSortTest, WithLoop) {
+  GraphDef graph;
+  // Create a loop
+  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
+  *graph.add_node() = CreateNode("3", "Switch", {"2"});
+  *graph.add_node() = CreateNode("4", "Identity", {"3"});
+  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
+  *graph.add_node() = CreateNode("1", {});
+
+  TopologicalSort(&graph);
+  std::vector<string> order = {"1", "2", "3", "4", "5"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
+TEST_F(TopologicalSortTest, WithIllegalLoop) {
+  GraphDef graph;
+  // A loop without Merge and NextIteration is illegal and the original node
+  // order and graph will be preserved.
+  *graph.add_node() = CreateNode("2", {"1", "3"});
+  *graph.add_node() = CreateNode("3", {"2"});
+  *graph.add_node() = CreateNode("1", {});
+
+  TopologicalSort(&graph);
+  std::vector<string> order = {"2", "3", "1"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 4ef9f8e4e57ff2bce8924d2146a24e8711b08dd2..f055eb776b98d8ab66985e7ed2063a7ec4e36094 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -59,9 +59,9 @@ TEST_F(UtilsTest, NodePosition) {
 }
 
 TEST_F(UtilsTest, AddNodeNamePrefix) {
-  EXPECT_EQ("OPTIMIZED-abc", AddPrefixToNodeName("abc", "OPTIMIZED"));
-  EXPECT_EQ("^OPTIMIZED-abc", AddPrefixToNodeName("^abc", "OPTIMIZED"));
-  EXPECT_EQ("OPTIMIZED-", AddPrefixToNodeName("", "OPTIMIZED"));
+  EXPECT_EQ("OPTIMIZED/abc", AddPrefixToNodeName("abc", "OPTIMIZED"));
+  EXPECT_EQ("^OPTIMIZED/abc", AddPrefixToNodeName("^abc", "OPTIMIZED"));
+  EXPECT_EQ("OPTIMIZED/", AddPrefixToNodeName("", "OPTIMIZED"));
 }
 
 TEST_F(UtilsTest, ExecuteWithTimeout) {
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 9c47d520d96b3b74da0e0263e1c4f19fc8107410..1365634bbfec9c290a6a492b5a60d618f4b04681 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -17,7 +17,10 @@ licenses(["notice"])  # Apache 2.0
 
 package_group(
     name = "friends",
-    packages = ["//tensorflow/..."],
+    packages = [
+        "//learning/brain/contrib/...",
+        "//tensorflow/...",
+    ],
 )
 
 load(
@@ -41,6 +44,7 @@ load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
 config_setting(
     # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
@@ -104,6 +108,7 @@ tf_kernel_library(
     deps = [
         ":bounds_check",
         ":ops_util",
+        ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -283,6 +288,15 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "map_stage_op",
+    srcs = ["map_stage_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "queue_base",
     srcs = ["queue_base.cc"],
@@ -358,6 +372,18 @@ tf_kernel_library(
     alwayslink = 0,
 )
 
+cc_library(
+    name = "split_lib_hdrs",
+    hdrs = [
+        "split_lib.h",
+    ],
+    deps = [
+        ":eigen_helpers",
+        ":ops_util_hdrs",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "typed_queue",
     hdrs = ["typed_queue.h"],
@@ -366,6 +392,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "training_op_helpers",
+    srcs = ["training_op_helpers.cc"],
+    hdrs = ["training_op_helpers.h"],
+    visibility = [":friends"],
+    deps = [
+        ":variable_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
 cc_library(
     name = "bounds_check",
     hdrs = ["bounds_check.h"],
@@ -376,6 +415,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "warn_about_ints",
+    srcs = ["warn_about_ints.cc"],
+    hdrs = ["warn_about_ints.h"],
+    deps = [
+        "//tensorflow/core:framework",
+    ],
+)
+
 # Private support libraries ---------------------------------------------------
 
 cc_header_only_library(
@@ -1028,6 +1076,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/debug:debug_io_utils",
     ],
 )
 
@@ -1050,6 +1099,25 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "dequantize_op_test",
+    size = "small",
+    srcs = ["dequantize_op_test.cc"],
+    deps = [
+        ":ops_testutil",
+        ":ops_util",
+        ":quantized_ops",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "reverse_op_test",
     size = "small",
@@ -1256,6 +1324,7 @@ cc_library(
         ":fifo_queue_op",
         ":lookup_table_init_op",
         ":lookup_table_op",
+        ":map_stage_op",
         ":padding_fifo_queue_op",
         ":priority_queue_op",
         ":queue_ops",
@@ -1269,6 +1338,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "lookup",
+    deps = [
+        ":lookup_table_init_op",
+        ":lookup_table_op",
+    ],
+)
+
 DATA_FLOW_DEPS = [
     ":bounds_check",
     ":concat_lib",
@@ -1392,10 +1469,10 @@ LOOKUP_DEPS = [
     ":initializable_lookup_table",
     ":lookup_util",
     "//tensorflow/core:core_cpu",
-    "//tensorflow/core:data_flow_ops_op_lib",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
+    "//tensorflow/core:lookup_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -1558,9 +1635,8 @@ cc_library(
         ":attention_ops",
         ":colorspace_op",
         ":crop_and_resize_op",
-        ":decode_gif_op",
-        ":decode_jpeg_op",
-        ":decode_png_op",
+        ":decode_bmp_op",
+        ":decode_image_op",
         ":draw_bounding_box_op",
         ":encode_jpeg_op",
         ":encode_png_op",
@@ -1625,20 +1701,14 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "decode_jpeg_op",
-    prefix = "decode_jpeg_op",
-    deps = IMAGE_DEPS,
-)
-
-tf_kernel_library(
-    name = "decode_png_op",
-    prefix = "decode_png_op",
+    name = "decode_bmp_op",
+    prefix = "decode_bmp_op",
     deps = IMAGE_DEPS,
 )
 
 tf_kernel_library(
-    name = "decode_gif_op",
-    prefix = "decode_gif_op",
+    name = "decode_image_op",
+    prefix = "decode_image_op",
     deps = IMAGE_DEPS,
 )
 
@@ -1971,6 +2041,32 @@ cc_library(
     ],
 )
 
+tf_kernel_library(
+    name = "cuda_solvers",
+    srcs = ["cuda_solvers.cc"],
+    hdrs = ["cuda_solvers.h"],
+    gpu_srcs = [
+        "cuda_solvers.h",
+        "cuda_solvers_gpu.cu.cc",
+    ],
+    # @local_config_cuda//cuda:cusolver, //third_party/eigen3:blas,
+    # and //third_party/libf2c all contain various parts of BLAS, LAPACK,
+    # and f2c helper functions in global namespace. Tell the compiler to
+    # allow multiple definitions when linking this.
+    linkopts = select({
+        "//tensorflow:darwin": [],
+        "//conditions:default": ["-Wl,-z,muldefs"],
+    }),
+    visibility = ["//visibility:private"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+        "@local_config_cuda//cuda:cublas",
+        "@local_config_cuda//cuda:cusolver",
+    ],
+)
+
 LINALG_DEPS = [
     ":linalg_ops_common",
     "//third_party/eigen3",
@@ -1982,7 +2078,10 @@ LINALG_DEPS = [
 tf_kernel_library(
     name = "cholesky_op",
     prefix = "cholesky_op",
-    deps = LINALG_DEPS,
+    deps = if_cuda([
+        ":cuda_solvers",
+        ":matrix_band_part_op",
+    ]) + LINALG_DEPS,
 )
 
 tf_kernel_library(
@@ -2012,7 +2111,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "matrix_inverse_op",
     prefix = "matrix_inverse_op",
-    deps = LINALG_DEPS,
+    deps = if_cuda([
+        ":cuda_solvers",
+    ]) + LINALG_DEPS,
 )
 
 tf_kernel_library(
@@ -2030,7 +2131,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "matrix_triangular_solve_op",
     prefix = "matrix_triangular_solve_op",
-    deps = LINALG_DEPS,
+    deps = LINALG_DEPS + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+    ]),
 )
 
 tf_kernel_library(
@@ -2171,6 +2274,7 @@ cc_library(
         ":batch_matmul_op",
         ":betainc_op",
         ":bincount_op",
+        ":bucketize_op",
         ":cast_op",
         ":check_numerics_op",
         ":cross_op",
@@ -2208,6 +2312,12 @@ tf_kernel_library(
     deps = MATH_DEPS,
 )
 
+tf_kernel_library(
+    name = "bucketize_op",
+    prefix = "bucketize_op",
+    deps = MATH_DEPS,
+)
+
 tf_kernel_library(
     name = "cast_op",
     prefix = "cast_op",
@@ -2237,7 +2347,9 @@ tf_kernel_library(
     prefix = "fft_ops",
     deps = MATH_DEPS + [
         "//tensorflow/core:spectral_ops_op_lib",
-    ],
+    ] + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cufft_plugin",
+    ]),
 )
 
 tf_kernel_library(
@@ -2262,6 +2374,8 @@ tf_kernel_library(
         "//conditions:default": [],
     }) + if_mkl([
         "//third_party/mkl:intel_binary_blob",
+    ]) + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
     ]),
 )
 
@@ -2541,7 +2655,10 @@ tf_kernel_library(
             "@libxsmm_archive//:xsmm_avx",
         ],
         "//conditions:default": [],
-    }),
+    }) + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+        "//tensorflow/core/platform/default/build_config:cudnn_plugin",
+    ]),
 )
 
 tf_kernel_library(
@@ -2655,13 +2772,13 @@ tf_kernel_library(
 tf_kernel_library(
     name = "softplus_op",
     prefix = "softplus_op",
-    deps = NN_DEPS,
+    deps = NN_DEPS + [":warn_about_ints"],
 )
 
 tf_kernel_library(
     name = "softsign_op",
     prefix = "softsign_op",
-    deps = NN_DEPS,
+    deps = NN_DEPS + [":warn_about_ints"],
 )
 
 tf_kernel_library(
@@ -3014,6 +3131,18 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_kernel_library(
+    name = "stateless_random_ops",
+    prefix = "stateless_random_ops",
+    deps = [
+        ":bounds_check",
+        ":random_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stateless_random_ops_op_lib",
+    ],
+)
+
 cc_library(
     name = "required",
     deps = [
@@ -3048,6 +3177,7 @@ cc_library(
         ":sparse_add_grad_op",
         ":sparse_add_op",
         ":sparse_concat_op",
+        ":sparse_cross_op",
         ":sparse_dense_binary_op_shared",
         ":sparse_reduce_sum_op",
         ":sparse_reorder_op",
@@ -3092,6 +3222,12 @@ tf_kernel_library(
     deps = SPARSE_DEPS,
 )
 
+tf_kernel_library(
+    name = "sparse_cross_op",
+    prefix = "sparse_cross_op",
+    deps = SPARSE_DEPS,
+)
+
 tf_kernel_library(
     name = "sparse_reduce_sum_op",
     prefix = "sparse_reduce_sum_op",
@@ -3421,6 +3557,7 @@ tf_kernel_library(
     prefix = "training_ops",
     deps = [
         ":bounds_check",
+        ":training_op_helpers",
         ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -3537,158 +3674,393 @@ tf_kernel_library(
     ],
 )
 
-# Android libraries -----------------------------------------------------------
-
-# Changes to the Android srcs here should be replicated in
-# tensorflow/contrib/makefile/tf_op_files.txt
-# LINT.IfChange
 filegroup(
-    name = "android_srcs",
+    name = "spectrogram_test_data",
     srcs = [
-        "avgpooling_op.h",
-        "bounds_check.h",
-        "cwise_ops.h",
-        "cwise_ops_common.h",
-        "cwise_ops_gradients.h",
-        "eigen_activations.h",
-        "eigen_attention.h",
-        "eigen_backward_cuboid_convolutions.h",
-        "eigen_backward_spatial_convolutions.h",
-        "eigen_cuboid_convolution.h",
-        "eigen_pooling.h",
-        "eigen_softmax.h",
-        "eigen_spatial_convolutions.h",
-        "eigen_volume_patch.h",
-        "fifo_queue.h",
-        "maxpooling_op.h",
-        "ops_util.cc",
-        "ops_util.h",
-        "padding_fifo_queue.h",
-        "pooling_ops_common.cc",
-        "pooling_ops_common.h",
-        "queue_base.h",
-        "queue_op.h",
-        "typed_queue.h",
+        "spectrogram_test_data/short_test_segment.wav",
+        "spectrogram_test_data/short_test_segment_spectrogram.csv.bin",
+        "spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.bin",
     ],
+    visibility = ["//visibility:public"],
 )
 
-# Core kernels we want on Android. Only a subset of kernels to keep
-# base library small.
-filegroup(
-    name = "android_core_ops",
-    srcs = [
-        "aggregate_ops.cc",
-        "aggregate_ops.h",
-        "aggregate_ops_cpu.h",
-        "assign_op.h",
-        "bias_op.cc",
-        "bias_op.h",
-        "bounds_check.h",
-        "cast_op.cc",
-        "cast_op.h",
-        "cast_op_impl.h",
-        "cast_op_impl_bfloat.cc",
-        "cast_op_impl_bool.cc",
-        "cast_op_impl_complex128.cc",
-        "cast_op_impl_complex64.cc",
-        "cast_op_impl_double.cc",
-        "cast_op_impl_float.cc",
-        "cast_op_impl_half.cc",
-        "cast_op_impl_int16.cc",
-        "cast_op_impl_int32.cc",
-        "cast_op_impl_int64.cc",
-        "cast_op_impl_int8.cc",
-        "cast_op_impl_uint16.cc",
-        "cast_op_impl_uint8.cc",
-        "concat_lib.h",
-        "concat_lib_cpu.cc",
-        "concat_lib_cpu.h",
-        "concat_op.cc",
-        "constant_op.cc",
-        "constant_op.h",
-        "cwise_ops.h",
-        "cwise_ops_common.cc",
-        "cwise_ops_common.h",
-        "cwise_ops_gradients.h",
-        "dense_update_ops.cc",
-        "dense_update_ops.h",
-        "example_parsing_ops.cc",
-        "fill_functor.cc",
-        "fill_functor.h",
-        "function_ops.cc",
-        "gather_functor.h",
-        "gather_op.cc",
-        "identity_op.cc",
-        "identity_op.h",
-        "immutable_constant_op.cc",
-        "immutable_constant_op.h",
-        "matmul_op.cc",
-        "matmul_op.h",
-        "no_op.cc",
-        "no_op.h",
-        "non_max_suppression_op.cc",
-        "non_max_suppression_op.h",
-        "one_hot_op.cc",
-        "one_hot_op.h",
-        "ops_util.h",
-        "pack_op.cc",
-        "pooling_ops_common.h",
-        "reshape_op.cc",
-        "reshape_op.h",
-        "reverse_sequence_op.cc",
-        "reverse_sequence_op.h",
-        "sendrecv_ops.cc",
-        "sendrecv_ops.h",
-        "sequence_ops.cc",
-        "shape_ops.cc",
-        "shape_ops.h",
-        "slice_op.cc",
-        "slice_op.h",
-        "slice_op_cpu_impl.h",
-        "slice_op_cpu_impl_1.cc",
-        "slice_op_cpu_impl_2.cc",
-        "slice_op_cpu_impl_3.cc",
-        "slice_op_cpu_impl_4.cc",
-        "slice_op_cpu_impl_5.cc",
-        "slice_op_cpu_impl_6.cc",
-        "slice_op_cpu_impl_7.cc",
-        "softmax_op.cc",
-        "softmax_op.h",
-        "softmax_op_functor.h",
-        "split_lib.h",
-        "split_lib_cpu.cc",
-        "split_op.cc",
-        "split_v_op.cc",
-        "strided_slice_op.cc",
-        "strided_slice_op.h",
-        "strided_slice_op_impl.h",
-        "strided_slice_op_inst_0.cc",
-        "strided_slice_op_inst_1.cc",
-        "strided_slice_op_inst_2.cc",
-        "strided_slice_op_inst_3.cc",
-        "strided_slice_op_inst_4.cc",
-        "strided_slice_op_inst_5.cc",
-        "strided_slice_op_inst_6.cc",
-        "strided_slice_op_inst_7.cc",
-        "unpack_op.cc",
-        "variable_ops.cc",
-        "variable_ops.h",
+cc_library(
+    name = "spectrogram",
+    srcs = ["spectrogram.cc"],
+    hdrs = ["spectrogram.h"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/fft2d:fft2d_headers",
+        "@fft2d//:fft2d",
     ],
 )
 
-# Other kernels we may want on Android.
-#
-# The kernels can be consumed as a whole or in two groups for
-# supporting separate compilation. Note that the split into groups
-# is entirely for improving compilation time, and not for
-# organizational reasons; you should not depend on any
-# of those groups independently.
-filegroup(
-    name = "android_extended_ops",
-    srcs = [
-        ":android_extended_ops_group1",
-        ":android_extended_ops_group2",
-        ":android_quantized_ops",
+cc_library(
+    name = "spectrogram_test_utils",
+    testonly = 1,
+    srcs = ["spectrogram_test_utils.cc"],
+    hdrs = ["spectrogram_test_utils.h"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_binary(
+    name = "spectrogram_convert_test_data",
+    testonly = 1,
+    srcs = ["spectrogram_convert_test_data.cc"],
+    deps = [
+        ":spectrogram_test_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "spectrogram_test",
+    size = "medium",
+    srcs = ["spectrogram_test.cc"],
+    data = [":spectrogram_test_data"],
+    deps = [
+        ":spectrogram",
+        ":spectrogram_test_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_test_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "spectrogram_op",
+    prefix = "spectrogram_op",
+    deps = [
+        ":spectrogram",
+        "//tensorflow/core:audio_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+    alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+    name = "spectrogram_op_test",
+    size = "small",
+    srcs = ["spectrogram_op_test.cc"],
+    deps = [
+        ":ops_util",
+        ":spectrogram_op",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "mfcc_dct",
+    srcs = ["mfcc_dct.cc"],
+    hdrs = ["mfcc_dct.h"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "mfcc_dct_test",
+    size = "small",
+    srcs = ["mfcc_dct_test.cc"],
+    deps = [
+        ":mfcc_dct",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_test_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_library(
+    name = "mfcc_mel_filterbank",
+    srcs = ["mfcc_mel_filterbank.cc"],
+    hdrs = ["mfcc_mel_filterbank.h"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "mfcc_mel_filterbank_test",
+    size = "small",
+    srcs = ["mfcc_mel_filterbank_test.cc"],
+    deps = [
+        ":mfcc_mel_filterbank",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_test_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_library(
+    name = "mfcc",
+    srcs = ["mfcc.cc"],
+    hdrs = ["mfcc.h"],
+    copts = tf_copts(),
+    deps = [
+        ":mfcc_dct",
+        ":mfcc_mel_filterbank",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "mfcc_test",
+    size = "small",
+    srcs = ["mfcc_test.cc"],
+    deps = [
+        ":mfcc",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_test_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "mfcc_op",
+    prefix = "mfcc_op",
+    deps = [
+        ":mfcc",
+        "//tensorflow/core:audio_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+    alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+    name = "mfcc_op_test",
+    size = "small",
+    srcs = ["mfcc_op_test.cc"],
+    deps = [
+        ":mfcc_op",
+        ":ops_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+cc_library(
+    name = "audio",
+    deps = [
+        ":decode_wav_op",
+        ":encode_wav_op",
+        ":mfcc_op",
+        ":spectrogram_op",
+    ],
+)
+
+# Android libraries -----------------------------------------------------------
+
+# Changes to the Android srcs here should be replicated in
+# tensorflow/contrib/makefile/tf_op_files.txt
+# LINT.IfChange
+filegroup(
+    name = "mobile_srcs",
+    srcs = [
+        "avgpooling_op.h",
+        "bounds_check.h",
+        "cwise_ops.h",
+        "cwise_ops_common.h",
+        "cwise_ops_gradients.h",
+        "eigen_activations.h",
+        "eigen_attention.h",
+        "eigen_backward_cuboid_convolutions.h",
+        "eigen_backward_spatial_convolutions.h",
+        "eigen_cuboid_convolution.h",
+        "eigen_pooling.h",
+        "eigen_softmax.h",
+        "eigen_spatial_convolutions.h",
+        "eigen_volume_patch.h",
+        "fifo_queue.h",
+        "maxpooling_op.h",
+        "ops_util.cc",
+        "ops_util.h",
+        "padding_fifo_queue.h",
+        "pooling_ops_common.cc",
+        "pooling_ops_common.h",
+        "queue_base.h",
+        "queue_op.h",
+        "typed_queue.h",
+    ],
+)
+
+alias(
+    name = "android_srcs",
+    actual = ":mobile_srcs",
+)
+
+# Core kernels we want on Android. Only a subset of kernels to keep
+# base library small.
+filegroup(
+    name = "android_core_ops",
+    srcs = [
+        "aggregate_ops.cc",
+        "aggregate_ops.h",
+        "aggregate_ops_cpu.h",
+        "assign_op.h",
+        "bias_op.cc",
+        "bias_op.h",
+        "bounds_check.h",
+        "cast_op.cc",
+        "cast_op.h",
+        "cast_op_impl.h",
+        "cast_op_impl_bfloat.cc",
+        "cast_op_impl_bool.cc",
+        "cast_op_impl_complex128.cc",
+        "cast_op_impl_complex64.cc",
+        "cast_op_impl_double.cc",
+        "cast_op_impl_float.cc",
+        "cast_op_impl_half.cc",
+        "cast_op_impl_int16.cc",
+        "cast_op_impl_int32.cc",
+        "cast_op_impl_int64.cc",
+        "cast_op_impl_int8.cc",
+        "cast_op_impl_uint16.cc",
+        "cast_op_impl_uint8.cc",
+        "concat_lib.h",
+        "concat_lib_cpu.cc",
+        "concat_lib_cpu.h",
+        "concat_op.cc",
+        "constant_op.cc",
+        "constant_op.h",
+        "cwise_ops.h",
+        "cwise_ops_common.cc",
+        "cwise_ops_common.h",
+        "cwise_ops_gradients.h",
+        "dense_update_ops.cc",
+        "dense_update_ops.h",
+        "example_parsing_ops.cc",
+        "fill_functor.cc",
+        "fill_functor.h",
+        "function_ops.cc",
+        "gather_functor.h",
+        "gather_op.cc",
+        "identity_op.cc",
+        "identity_op.h",
+        "immutable_constant_op.cc",
+        "immutable_constant_op.h",
+        "matmul_op.cc",
+        "matmul_op.h",
+        "no_op.cc",
+        "no_op.h",
+        "non_max_suppression_op.cc",
+        "non_max_suppression_op.h",
+        "one_hot_op.cc",
+        "one_hot_op.h",
+        "ops_util.h",
+        "pack_op.cc",
+        "pooling_ops_common.h",
+        "reshape_op.cc",
+        "reshape_op.h",
+        "reverse_sequence_op.cc",
+        "reverse_sequence_op.h",
+        "sendrecv_ops.cc",
+        "sendrecv_ops.h",
+        "sequence_ops.cc",
+        "shape_ops.cc",
+        "shape_ops.h",
+        "slice_op.cc",
+        "slice_op.h",
+        "slice_op_cpu_impl.h",
+        "slice_op_cpu_impl_1.cc",
+        "slice_op_cpu_impl_2.cc",
+        "slice_op_cpu_impl_3.cc",
+        "slice_op_cpu_impl_4.cc",
+        "slice_op_cpu_impl_5.cc",
+        "slice_op_cpu_impl_6.cc",
+        "slice_op_cpu_impl_7.cc",
+        "softmax_op.cc",
+        "softmax_op.h",
+        "softmax_op_functor.h",
+        "split_lib.h",
+        "split_lib_cpu.cc",
+        "split_op.cc",
+        "split_v_op.cc",
+        "strided_slice_op.cc",
+        "strided_slice_op.h",
+        "strided_slice_op_impl.h",
+        "strided_slice_op_inst_0.cc",
+        "strided_slice_op_inst_1.cc",
+        "strided_slice_op_inst_2.cc",
+        "strided_slice_op_inst_3.cc",
+        "strided_slice_op_inst_4.cc",
+        "strided_slice_op_inst_5.cc",
+        "strided_slice_op_inst_6.cc",
+        "strided_slice_op_inst_7.cc",
+        "unpack_op.cc",
+        "variable_ops.cc",
+        "variable_ops.h",
+    ],
+)
+
+# Other kernels we may want on Android.
+#
+# The kernels can be consumed as a whole or in two groups for
+# supporting separate compilation. Note that the split into groups
+# is entirely for improving compilation time, and not for
+# organizational reasons; you should not depend on any
+# of those groups independently.
+filegroup(
+    name = "android_extended_ops",
+    srcs = [
+        ":android_extended_ops_group1",
+        ":android_extended_ops_group2",
+        ":android_quantized_ops",
     ],
     visibility = ["//visibility:public"],
 )
@@ -3717,6 +4089,7 @@ filegroup(
         "relu_op.h",
         "relu_op_functor.h",
         "resize_bilinear_op.h",
+        "resize_nearest_neighbor_op.h",
         "reverse_op.h",
         "save_restore_tensor.h",
         "softplus_op.h",
@@ -3726,9 +4099,12 @@ filegroup(
         "tensor_array.h",
         "tile_ops_cpu_impl.h",
         "tile_ops_impl.h",
+        "topk_op.h",
+        "training_op_helpers.h",
         "training_ops.h",
         "transpose_functor.h",
         "transpose_op.h",
+        "warn_about_ints.h",
         "where_op.h",
         "xent_op.h",
     ],
@@ -3761,6 +4137,7 @@ filegroup(
         "cwise_op_equal_to_2.cc",
         "cwise_op_exp.cc",
         "cwise_op_floor.cc",
+        "cwise_op_floor_div.cc",
         "cwise_op_greater.cc",
         "cwise_op_greater_equal.cc",
         "cwise_op_isfinite.cc",
@@ -3807,6 +4184,7 @@ filegroup(
     srcs = [
         "batchtospace_op.cc",
         "ctc_decoder_ops.cc",
+        "decode_bmp_op.cc",
         "depthtospace_op.cc",
         "dynamic_stitch_op.cc",
         "in_topk_op.cc",
@@ -3825,6 +4203,7 @@ filegroup(
         "queue_base.cc",
         "queue_ops.cc",
         "random_op.cc",
+        "reduction_ops_all.cc",
         "reduction_ops_any.cc",
         "reduction_ops_common.cc",
         "reduction_ops_max.cc",
@@ -3861,9 +4240,11 @@ filegroup(
         "tile_ops_cpu_impl_6.cc",
         "tile_ops_cpu_impl_7.cc",
         "topk_op.cc",
+        "training_op_helpers.cc",
         "training_ops.cc",
         "transpose_functor_cpu.cc",
         "transpose_op.cc",
+        "warn_about_ints.cc",
         "where_op.cc",
         "xent_op.cc",
         ":android_extended_ops_headers",
@@ -3927,19 +4308,22 @@ filegroup(
             "string_to_hash_bucket_op.*",
             "sdca_ops.*",
             "sdca_internal.*",
+            "sparse_cross_op.*",
             "text_line_reader_op.*",
             "summary_image_op.*",
+            "decode_image_op.*",
             "encode_png_op.*",
-            "decode_png_op.*",
             "encode_jpeg_op.*",
             "decode_jpeg_op.*",
             "decode_gif_op.*",
             "identity_reader_op.*",
             "remote_fused_graph_execute_op.*",
+            "remote_fused_graph_rewriter_transform.*",
             "fixed_length_record_reader_op.*",
             "whole_file_read_ops.*",
             "sample_distorted_bounding_box_op.*",
             "ctc_loss_op.*",
+            "spectrogram_convert_test_data.cc",
             # Excluded due to experimental status:
             "debug_ops.*",
             "scatter_nd_op*",
@@ -3959,6 +4343,12 @@ cc_library(
         "//conditions:default": [],
     }),
     copts = tf_copts(),
+    linkopts = select({
+        "//tensorflow:android": [
+            "-ldl",
+        ],
+        "//conditions:default": [],
+    }),
     tags = [
         "manual",
         "notap",
@@ -4415,6 +4805,7 @@ cc_library(
 
 cc_library(
     name = "remote_fused_graph_execute_op_test_utils",
+    testonly = 1,
     srcs = ["remote_fused_graph_execute_op_test_utils.cc"],
     hdrs = ["remote_fused_graph_execute_op_test_utils.h"],
     deps = [
@@ -4423,6 +4814,7 @@ cc_library(
         "//tensorflow/cc:scope",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/kernels:cwise_op",
     ],
 )
@@ -4445,6 +4837,7 @@ tf_cc_test(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -4479,6 +4872,40 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "remote_fused_graph_rewriter_transform",
+    srcs = [
+        "remote_fused_graph_rewriter_transform.cc",
+    ],
+    deps = [
+        ":remote_fused_graph_execute_utils",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:remote_fused_graph_ops",
+        "//tensorflow/core",
+        "//tensorflow/tools/graph_transforms:transform_utils",
+    ],
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "remote_fused_graph_rewriter_transform_test",
+    size = "small",
+    srcs = ["remote_fused_graph_rewriter_transform_test.cc"],
+    deps = [
+        ":remote_fused_graph_execute_op_test_utils",
+        ":remote_fused_graph_execute_utils",
+        ":remote_fused_graph_rewriter_transform",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/tools/graph_transforms:transform_utils",
+    ],
+)
+
 tf_mkl_kernel_library(
     name = "mkl_conv_op",
     prefix = "mkl_conv",
@@ -4544,6 +4971,348 @@ tf_mkl_kernel_library(
     ],
 )
 
+tf_mkl_kernel_library(
+    name = "mkl_fused_batch_norm_op",
+    srcs = ["mkl_fused_batch_norm_op.cc"],
+    deps = NN_DEPS + [
+        "//third_party/mkl:intel_binary_blob",
+    ],
+)
+
+tf_mkl_kernel_library(
+    name = "mkl_concat_op",
+    prefix = "mkl_concat_op",
+    deps = ARRAY_DEPS + [
+        "//third_party/mkl:intel_binary_blob",
+    ],
+)
+
+tf_mkl_kernel_library(
+    name = "mkl_reshape_op",
+    prefix = "mkl_reshape_op",
+    deps = ARRAY_DEPS + [
+        "//third_party/mkl:intel_binary_blob",
+    ],
+)
+
+tf_mkl_kernel_library(
+    name = "mkl_identity_op",
+    prefix = "mkl_identity_op",
+    deps = ARRAY_DEPS + [
+        "//third_party/mkl:intel_binary_blob",
+    ],
+)
+
+tf_mkl_kernel_library(
+    name = "mkl_lrn_op",
+    prefix = "mkl_lrn_op",
+    deps = NN_DEPS + [
+        "//third_party/mkl:intel_binary_blob",
+    ],
+)
+
+cc_library(
+    name = "dataset",
+    hdrs = ["dataset.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "captured_function",
+    srcs = ["captured_function.cc"],
+    hdrs = ["captured_function.h"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
+        "//tensorflow/core/kernels:variable_ops",
+    ],
+)
+
+cc_library(
+    name = "window_dataset",
+    srcs = ["window_dataset.cc"],
+    hdrs = ["window_dataset.h"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "batch_dataset_op",
+    srcs = ["batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "padded_batch_dataset_op",
+    srcs = ["padded_batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "dense_to_sparse_batch_dataset_op",
+    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "group_by_window_dataset_op",
+    srcs = ["group_by_window_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        ":window_dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "filter_dataset_op",
+    srcs = ["filter_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "map_dataset_op",
+    srcs = ["map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "parallel_map_dataset_op",
+    srcs = ["parallel_map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "flat_map_dataset_op",
+    srcs = ["flat_map_dataset_op.cc"],
+    deps = [
+        ":captured_function",
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "repeat_dataset_op",
+    srcs = ["repeat_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "take_dataset_op",
+    srcs = ["take_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "skip_dataset_op",
+    srcs = ["skip_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "range_dataset_op",
+    srcs = ["range_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "shuffle_dataset_op",
+    srcs = ["shuffle_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sparse_tensor_slice_dataset_op",
+    srcs = ["sparse_tensor_slice_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_dataset_op",
+    srcs = ["tensor_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_slice_dataset_op",
+    srcs = ["tensor_slice_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "zip_dataset_op",
+    srcs = ["zip_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "reader_dataset_ops",
+    srcs = ["reader_dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "iterator_ops",
+    srcs = ["iterator_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "dataset_ops",
+    deps = [
+        ":batch_dataset_op",
+        ":dense_to_sparse_batch_dataset_op",
+        ":filter_dataset_op",
+        ":flat_map_dataset_op",
+        ":group_by_window_dataset_op",
+        ":iterator_ops",
+        ":map_dataset_op",
+        ":padded_batch_dataset_op",
+        ":parallel_map_dataset_op",
+        ":range_dataset_op",
+        ":reader_dataset_ops",
+        ":repeat_dataset_op",
+        ":shuffle_dataset_op",
+        ":skip_dataset_op",
+        ":sparse_tensor_slice_dataset_op",
+        ":take_dataset_op",
+        ":tensor_dataset_op",
+        ":tensor_slice_dataset_op",
+        ":zip_dataset_op",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
 
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
index cbc0537b4549e566612432304c2f0b8d66cbe9f2..0aa65729de264c3792e2a0afba8dc113685ce807 100644
--- a/tensorflow/core/kernels/aggregate_ops.cc
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -161,9 +161,11 @@ TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU);
 #undef REGISTER_ADDN_CPU
 
 #if GOOGLE_CUDA
-REGISTER_ADDN(Eigen::half, GPU);
-REGISTER_ADDN(float, GPU);
-REGISTER_ADDN(double, GPU);
+#define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU)
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU);
+TF_CALL_complex64(REGISTER_ADDN_GPU);
+TF_CALL_complex128(REGISTER_ADDN_GPU);
+#undef REGISTER_ADDN_GPU
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
index 51393787acbd24aa6c61e30c3ceafc45d04f67c3..3f449be754492bf9034ee68b2ba2571b12960b6f 100644
--- a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
@@ -154,6 +154,8 @@ struct Add9Functor<GPUDevice, T> {
   template struct functor::Add9Functor<GPUDevice, type>;
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_FUNCTORS);
+TF_CALL_complex64(REGISTER_FUNCTORS);
+TF_CALL_complex128(REGISTER_FUNCTORS);
 
 #undef REGISTER_FUNCTORS
 
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 03880b98273af7aa7eb174f128c2ee869107ed32..83633a1dd98f172aab66088826282b28a8fb217b 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -88,7 +88,7 @@ class Barrier : public ResourceBase {
   template <typename T>
   void TryInsertMany(const Tensor& keys, int component_index,
                      const Tensor& values, OpKernelContext* ctx,
-                     DoneCallback callback) {
+                     const DoneCallback& callback) {
     TensorShape element_shape = values.shape();
     OP_REQUIRES_ASYNC(
         ctx, keys.NumElements() == 0 || element_shape.num_elements() > 0,
@@ -195,7 +195,8 @@ class Barrier : public ResourceBase {
   }
 
   void TryTakeMany(int num_elements, bool allow_small_batch, int64 timeout,
-                   OpKernelContext* ctx, IndicesKeysValuesCallback callback) {
+                   OpKernelContext* ctx,
+                   const IndicesKeysValuesCallback& callback) {
     int num_elements_to_deliver = num_elements;
     {
       mutex_lock lock(mu_);
@@ -247,7 +248,7 @@ class Barrier : public ResourceBase {
   }
 
   void Close(OpKernelContext* ctx, bool cancel_pending_enqueues,
-             DoneCallback callback) {
+             const DoneCallback& callback) {
     mutex_lock lock(mu_);
     // We're allowed to close twice if the first close wasn't a
     // cancel but the second one is.
@@ -399,7 +400,8 @@ class Barrier : public ResourceBase {
   }
 
   void CloseQueueLocked(OpKernelContext* ctx, bool cancel_pending_enqueues,
-                        DoneCallback callback) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                        const DoneCallback& callback)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     // CloseQueueLocked may only be called with mu_ held.
     if (!cancel_pending_enqueues && queue_closed_) {
       callback();
diff --git a/tensorflow/core/kernels/basic_ops_benchmark_test.cc b/tensorflow/core/kernels/basic_ops_benchmark_test.cc
index 54532318cec8e40f13c1e97232c12e7406834de1..5726062938bc230911d74c26865b765533d127fa 100644
--- a/tensorflow/core/kernels/basic_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/basic_ops_benchmark_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-// We focus on the single thread performance of runing ops.
+// We focus on the single thread performance of running ops.
 static SessionOptions InitOptions() {
   SessionOptions opts;
   opts.config.set_intra_op_parallelism_threads(1);
diff --git a/tensorflow/core/kernels/batch_dataset_op.cc b/tensorflow/core/kernels/batch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8289eff2a73a13a752f043f0266f0ec7c673b9e
--- /dev/null
+++ b/tensorflow/core/kernels/batch_dataset_op.cc
@@ -0,0 +1,223 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class BatchDatasetOp : public OpKernel {
+ public:
+  explicit BatchDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new BatchDatasetOp::Dataset, insert it in the step-local
+    // container, and return it as the output.
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    const Tensor* batch_size_t;
+    OP_REQUIRES_OK(ctx, ctx->input("batch_size", &batch_size_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(batch_size_t->shape()),
+                errors::InvalidArgument("batch_size must be a scalar"));
+    const int64 batch_size = batch_size_t->flat<int64>()(0);
+    OP_REQUIRES(
+        ctx, batch_size > 0,
+        errors::InvalidArgument("Batch size must be greater than zero."));
+
+    DatasetBase* dataset = new Dataset(batch_size, input);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(int64 batch_size, const DatasetBase* input)
+        : batch_size_(batch_size), input_(input) {
+      input_->Ref();
+
+      // NOTE(mrry): Currently we implement "batch up to" semantics. If
+      // we could tell statically that the input dataset is infinite,
+      // then we could always report `batch_size` as the 0th dimension.
+      const auto& input_shapes = input_->output_shapes();
+      output_shapes_.reserve(input_shapes.size());
+      for (const auto& input_shape : input_shapes) {
+        output_shapes_.emplace_back(
+            PartialTensorShape({-1}).Concatenate(input_shape));
+      }
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("BatchDatasetOp(", batch_size_, ")::Dataset");
+    }
+
+   private:
+    // Copies element into the index^th slice of parent (in the 0th dimension).
+    //
+    // TODO(mrry): Reconcile this method with the similar method in
+    // the queue implementation.
+    template <DataType DT>
+    static Status HandleElementToSlice(const Tensor& element, Tensor* parent,
+                                       int index) {
+      typedef typename EnumToDataType<DT>::Type T;
+      if (element.NumElements() !=
+          (parent->NumElements() / parent->dim_size(0))) {
+        TensorShape chip_shape = parent->shape();
+        chip_shape.RemoveDim(0);
+        return errors::Internal(
+            "HandleElementToSlice Cannot copy slice: number of elements does "
+            "not "
+            "match.  Shapes are: [element]: ",
+            element.shape().DebugString(),
+            ", [parent slice]: ", chip_shape.DebugString());
+      }
+      auto parent_as_matrix = parent->flat_outer_dims<T>();
+      parent_as_matrix.chip(index, 0) = element.flat<T>();
+      return Status::OK();
+    }
+
+    // Copies element into the index^th slice of parent (in the 0th dimension).
+    static Status CopyElementToSlice(const Tensor& element, Tensor* parent,
+                                     int64 index) {
+#define HANDLE_TYPE(DT)                                                   \
+  if (element.dtype() == DT) {                                            \
+    TF_RETURN_IF_ERROR(HandleElementToSlice<DT>(element, parent, index)); \
+    return Status::OK();                                                  \
+  }
+      HANDLE_TYPE(DT_FLOAT);
+      HANDLE_TYPE(DT_HALF);
+      HANDLE_TYPE(DT_DOUBLE);
+      HANDLE_TYPE(DT_INT32);
+      HANDLE_TYPE(DT_UINT8);
+      HANDLE_TYPE(DT_INT16);
+      HANDLE_TYPE(DT_INT8);
+      HANDLE_TYPE(DT_STRING);
+      HANDLE_TYPE(DT_COMPLEX64);
+      HANDLE_TYPE(DT_COMPLEX128);
+      HANDLE_TYPE(DT_INT64);
+      HANDLE_TYPE(DT_BOOL);
+      HANDLE_TYPE(DT_QINT8);
+      HANDLE_TYPE(DT_QUINT8);
+      HANDLE_TYPE(DT_QINT32);
+      HANDLE_TYPE(DT_QINT16);
+      HANDLE_TYPE(DT_QUINT16);
+#undef HANDLE_TYPE
+      return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
+                                   element.dtype());
+    }
+
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        // Each row of `batch_elements` is a tuple of tensors from the
+        // input iterator.
+        std::vector<std::vector<Tensor>> batch_elements;
+        batch_elements.reserve(dataset()->batch_size_);
+        {
+          mutex_lock l(mu_);
+          *end_of_sequence = false;
+          for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence;
+               ++i) {
+            std::vector<Tensor> batch_element_tuple;
+            TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
+                                                    end_of_sequence));
+            if (!*end_of_sequence) {
+              batch_elements.emplace_back(std::move(batch_element_tuple));
+            }
+          }
+        }
+
+        if (batch_elements.empty()) {
+          DCHECK(*end_of_sequence);
+          return Status::OK();
+        }
+
+        // Copy the retrieved batch elements into one output tensor
+        // per tuple component.
+        // NOTE(mrry): If the input or output sizes are statically
+        // known, we could potentially read the input values in-place
+        // into their respective slice locations. This would require a
+        // different GetNext() overload that supports zero-copy, and might
+        // make sense in an optimization pass.
+        const size_t num_tuple_components = batch_elements[0].size();
+        const int64 num_batch_elements = batch_elements.size();
+        for (size_t component_index = 0; component_index < num_tuple_components;
+             ++component_index) {
+          const Tensor& first_element = batch_elements[0][component_index];
+          TensorShape batch_component_shape({num_batch_elements});
+          batch_component_shape.AppendShape(first_element.shape());
+          Tensor batch_component(cpu_allocator(), first_element.dtype(),
+                                 batch_component_shape);
+          // Build the output tuple component by copying one slice
+          // from each input element in the batch.
+          for (size_t i = 0; i < num_batch_elements; ++i) {
+            TF_RETURN_IF_ERROR(CopyElementToSlice(
+                batch_elements[i][component_index], &batch_component, i));
+          }
+          out_tensors->emplace_back(std::move(batch_component));
+        }
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const int64 batch_size_;
+    const DatasetBase* const input_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("BatchDataset").Device(DEVICE_CPU),
+                        BatchDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batchtospace_op.cc b/tensorflow/core/kernels/batchtospace_op.cc
index b24a8340839e076ee3399065ad3ea95584710eb8..99b5d3daaa4b9ad2b962ceef986063229a887e7f 100644
--- a/tensorflow/core/kernels/batchtospace_op.cc
+++ b/tensorflow/core/kernels/batchtospace_op.cc
@@ -97,6 +97,10 @@ static void BatchToSpaceOpCompute(OpKernelContext* context,
   for (int block_dim = 0; block_dim < block_dims; ++block_dim) {
     block_shape_product *= block_shape[block_dim];
   }
+  OP_REQUIRES(
+      context, block_shape_product > 0,
+      errors::InvalidArgument("Product of block sizes must be positive, got ",
+                              block_shape_product));
 
   const int64 orig_input_batch_size = orig_input_tensor.dim_size(0);
   OP_REQUIRES(
diff --git a/tensorflow/contrib/layers/kernels/bucketization_kernel.cc b/tensorflow/core/kernels/bucketize_op.cc
similarity index 98%
rename from tensorflow/contrib/layers/kernels/bucketization_kernel.cc
rename to tensorflow/core/kernels/bucketize_op.cc
index 5cfa39de7645c982d094e012a55e5265adb26bbb..93c2d01221f3b1d36fefa7742762025b96cc5387 100644
--- a/tensorflow/contrib/layers/kernels/bucketization_kernel.cc
+++ b/tensorflow/core/kernels/bucketize_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// See docs in ../ops/math_ops.cc.
+
 #include <algorithm>
 #include <vector>
 
diff --git a/tensorflow/core/kernels/captured_function.cc b/tensorflow/core/kernels/captured_function.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f231331bb4b137c10f742bb0eff6ac414f4853d
--- /dev/null
+++ b/tensorflow/core/kernels/captured_function.cc
@@ -0,0 +1,149 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/captured_function.h"
+
+#include <utility>
+
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/lookup_interface.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/queue_interface.h"
+#include "tensorflow/core/framework/reader_interface.h"
+#include "tensorflow/core/framework/resource_handle.pb_text.h"
+#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/platform/notification.h"
+#include "tensorflow/core/public/session_options.h"
+
+
+namespace tensorflow {
+
+/* static */
+Status CapturedFunction::Create(
+    OpKernelContext* ctx, const NameAttrList* func, int graph_def_version,
+    std::vector<Tensor> captured_inputs,
+    std::unique_ptr<CapturedFunction>* out_function) {
+  // NOTE(mrry): We need to assign a name to the device, and we choose
+  // the same name as the calling context's device so that we do not
+  // need to rewrite resource handles that are found in `captured_inputs`.
+  std::unique_ptr<Device> device(new ThreadPoolDevice(
+      SessionOptions(), ctx->device()->attributes().name(), Bytes(256 << 20),
+      DeviceLocality(), cpu_allocator()));
+
+// TODO(mrry): Handle arbitrary resource types, which might require a
+// redesign (or opening up access to `ResourceMgr::DoLookup()` and
+// `ResourceMgr::DoCreate()` to this code).
+#define HANDLE_RESOURCE_TYPE(ResourceType)                                     \
+  if (input_handle.hash_code() == MakeTypeIndex<ResourceType>().hash_code()) { \
+    ResourceType* resource;                                                    \
+    Status s = LookupResource(ctx, input_handle, &resource);                   \
+    if (errors::IsNotFound(s)) {                                               \
+      return errors::FailedPrecondition(                                       \
+          "Failed to capture resource named \"", input_handle.name(),          \
+          "\" in a dataset function. You may need to initialize it "           \
+          "explicitly before initializing an iterator that uses it.");         \
+    } else if (!s.ok()) {                                                      \
+      return s;                                                                \
+    }                                                                          \
+    TF_RETURN_IF_ERROR(device->resource_manager()->Create(                     \
+        input_handle.container(), input_handle.name(), resource));             \
+    continue;                                                                  \
+  }
+
+  for (size_t i = 0; i < captured_inputs.size(); ++i) {
+    if (captured_inputs[i].dtype() == DT_RESOURCE) {
+      // Extract the resource from `ctx->resource_manager()` and
+      // insert it into `device->resource_manager()` so that it can be
+      // used when the function executes.
+      ResourceHandle input_handle =
+          captured_inputs[i].scalar<ResourceHandle>()();
+      HANDLE_RESOURCE_TYPE(lookup::LookupInterface);
+      HANDLE_RESOURCE_TYPE(QueueInterface);
+      HANDLE_RESOURCE_TYPE(Var);
+      return errors::Unimplemented(
+          "Cannot currently capture resource '",
+          ProtoDebugString(input_handle),
+          "' in a dataset function (type not supported).");
+    }
+  }
+#undef HANDLE_RESOURCE_TYPE
+
+  std::unique_ptr<FunctionLibraryDefinition> flib_def(
+      new FunctionLibraryDefinition(
+          *ctx->function_library()->GetFunctionLibraryDefinition()));
+  std::unique_ptr<FunctionLibraryRuntime> lib(NewFunctionLibraryRuntime(
+      nullptr /* device_mgr */, ctx->env(), device.get(), graph_def_version,
+      flib_def.get(), {} /* TODO(mrry): OptimizerOptions? */));
+
+  FunctionLibraryRuntime::Handle f_handle;
+  TF_RETURN_IF_ERROR(
+      lib->Instantiate(func->name(), AttrSlice(&func->attr()), &f_handle));
+
+  out_function->reset(new CapturedFunction(
+      std::move(device), std::move(flib_def), std::move(lib), f_handle,
+      std::move(captured_inputs)));
+  return Status::OK();
+}
+
+Status CapturedFunction::Run(FunctionLibraryRuntime::Options f_opts,
+                             gtl::ArraySlice<Tensor> args,
+                             std::vector<Tensor>* rets) {
+  Notification n;
+  Status s;
+  auto done_callback = [&n, &s](Status func_status) {
+    s.Update(func_status);
+    n.Notify();
+  };
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
+  CancellationManager c_mgr;
+  f_opts.cancellation_manager = &c_mgr;
+  // TODO(mrry): Implement a synchronous version of
+  // FunctionLibraryRuntime::Run() that avoids a context switch for small
+  // functions.
+  if (captured_inputs_.empty()) {
+    lib_->Run(f_opts, f_handle_, args, rets, done_callback);
+  } else {
+    std::vector<Tensor> args_with_captured;
+    args_with_captured.reserve(args.size() + captured_inputs_.size());
+    args_with_captured.insert(args_with_captured.end(), args.begin(),
+                              args.end());
+    args_with_captured.insert(args_with_captured.end(),
+                              captured_inputs_.begin(), captured_inputs_.end());
+    lib_->Run(f_opts, f_handle_, args_with_captured, rets, done_callback);
+  }
+  n.WaitForNotification();
+  return s;
+}
+
+CapturedFunction::CapturedFunction(
+    std::unique_ptr<Device> device,
+    std::unique_ptr<FunctionLibraryDefinition> flib_def,
+    std::unique_ptr<FunctionLibraryRuntime> lib,
+    FunctionLibraryRuntime::Handle f_handle,
+    std::vector<Tensor> captured_inputs)
+    : device_(std::move(device)),
+      flib_def_(std::move(flib_def)),
+      lib_(std::move(lib)),
+      f_handle_(f_handle),
+      captured_inputs_(std::move(captured_inputs)) {}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/captured_function.h b/tensorflow/core/kernels/captured_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a132cdc3a0e9d9eb7aab1e135a1e32517f052d4
--- /dev/null
+++ b/tensorflow/core/kernels/captured_function.h
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+class Device;
+class OpKernelContext;
+class ResourceMgr;
+
+// A `CapturedFunction` encapsulates a TensorFlow function and all of
+// the runtime support required to execute it.
+//
+// The `Dataset`-related classes use `CapturedFunction` to execute
+// TensorFlow functions outside a the normal `OpKernel::Compute()`
+// context.
+//
+// NOTE(mrry): Here we are taking a conservative approach to dealing with
+// ownership of the various framework and runtime objects that are needed
+// to execute functions. We copy the function library *definition* (i.e.
+// a set of FunctionDefs) out of this kernel's context's function library
+// *runtime*, then we use that together with a specially-created
+// ThreadPoolDevice to build a new FunctionLibraryRuntime for the Dataset.
+//
+// We need to do this (or refactor the ownership of framework components
+// in each of the session implementations) to make it possible to close
+// down a ParallelMapDataset::Iterator when its session is closed.
+//
+// TODO(mrry): Clean this up. Investigate whether it would be possible to
+// reuse the session's FunctionLibraryRuntime(s) or Device(s).
+class CapturedFunction {
+ public:
+  // NOTE(mrry): The `captured_inputs` are passed by value. For
+  // efficiency, you are recommended to move this argument into the call.
+  static Status Create(OpKernelContext* ctx, const NameAttrList* func,
+                       int graph_def_version,
+                       std::vector<Tensor> captured_inputs,
+                       std::unique_ptr<CapturedFunction>* out_function);
+
+  Status Run(FunctionLibraryRuntime::Options f_opts,
+             gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets);
+
+  Device* device() const { return device_.get(); }
+
+  ResourceMgr* resource_manager() const { return device_->resource_manager(); }
+
+ private:
+  CapturedFunction(std::unique_ptr<Device> device,
+                   std::unique_ptr<FunctionLibraryDefinition> flib_def,
+                   std::unique_ptr<FunctionLibraryRuntime> lib,
+                   FunctionLibraryRuntime::Handle f_handle,
+                   std::vector<Tensor> captured_inputs);
+
+  const std::unique_ptr<Device> device_;
+  const std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  const std::unique_ptr<FunctionLibraryRuntime> lib_;
+  const FunctionLibraryRuntime::Handle f_handle_;
+  const std::vector<Tensor> captured_inputs_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_KERNELS_CAPTURED_FUNCTION_H_
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 0def600ac0ccfb4b243839ee81f3f50bd06b95a3..5c24f164a41c793df6a1066496ce86b37153fadd 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -50,7 +50,7 @@ template <typename From, typename To>
 struct scalar_cast_op<std::complex<From>, To> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE To
   operator()(const std::complex<From>& a) const {
-    // Replicate numpy behaviour of returning just the real part
+    // Replicate numpy behavior of returning just the real part
     return static_cast<To>(a.real());
   }
 };
@@ -59,7 +59,7 @@ template <typename From, typename To>
 struct scalar_cast_op<From, std::complex<To>> {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<To> operator()(
       const From& a) const {
-    // Replicate numpy behaviour of setting the imaginary part to 0
+    // Replicate numpy behavior of setting the imaginary part to 0
     return std::complex<To>(static_cast<To>(a), To(0));
   }
 };
diff --git a/tensorflow/core/kernels/cholesky_op.cc b/tensorflow/core/kernels/cholesky_op.cc
index e5bf164cfaa4cd8ef0836205a12deae525149eee..10595faf4ba9dd5224f16b9565fd0e771e085b48 100644
--- a/tensorflow/core/kernels/cholesky_op.cc
+++ b/tensorflow/core/kernels/cholesky_op.cc
@@ -16,31 +16,40 @@ limitations under the License.
 // See docs in ../ops/linalg_ops.cc.
 // TODO(konstantinos): Enable complex inputs. This will require additional tests
 //                     and OP_REQUIRES.
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
 
 #include "third_party/eigen3/Eigen/Cholesky"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/linalg_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
+#if GOOGLE_CUDA
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cuda_solvers.h"
+#include "tensorflow/core/kernels/matrix_band_part_op.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#endif
+
 namespace tensorflow {
 
+static const char kErrMsg[] =
+    "Cholesky decomposition was not successful. The input might not be valid.";
+
 template <class Scalar>
 class CholeskyOp : public LinearAlgebraOp<Scalar> {
  public:
-  typedef LinearAlgebraOp<Scalar> Base;
+  INHERIT_LINALG_TYPEDEFS(Scalar);
 
   explicit CholeskyOp(OpKernelConstruction* context) : Base(context) {}
 
-  using Matrix = typename Base::Matrix;
-  using MatrixMaps = typename Base::MatrixMaps;
-  using ConstMatrixMap = typename Base::ConstMatrixMap;
-  using ConstMatrixMaps = typename Base::ConstMatrixMaps;
-
   void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
@@ -60,11 +69,111 @@ class CholeskyOp : public LinearAlgebraOp<Scalar> {
     outputs->at(0) = llt_decomposition.matrixL();
 
     OP_REQUIRES(context, llt_decomposition.info() == Eigen::Success,
-                errors::InvalidArgument("LLT decomposition was not successful. "
-                                        "The input might not be valid."));
+                errors::InvalidArgument(kErrMsg));
   }
 };
 
+#if GOOGLE_CUDA
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                                  \
+  template <>                                                                \
+  void MatrixBandPart<GPUDevice, T>::Compute(                                \
+      const GPUDevice& d, Eigen::DenseIndex num_lower,                       \
+      Eigen::DenseIndex num_upper, typename TTypes<T, 3>::ConstTensor input, \
+      typename TTypes<T, 3>::Tensor output);                                 \
+  extern template struct MatrixBandPart<GPUDevice, T>;
+
+TF_CALL_float(DECLARE_GPU_SPEC);
+TF_CALL_double(DECLARE_GPU_SPEC);
+}  // namespace functor
+
+template <class Scalar>
+class CholeskyOpGpu : public AsyncOpKernel {
+ public:
+  explicit CholeskyOpGpu(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {}
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+    const Tensor& input = context->input(0);
+    const int ndims = input.dims();
+    const int64 n = input.dim_size(ndims - 1);
+    // Validate inputs.
+    OP_REQUIRES_ASYNC(
+        context, ndims >= 2,
+        errors::InvalidArgument("Input must have rank >= 2, got ", ndims),
+        done);
+    OP_REQUIRES_ASYNC(
+        context, input.dim_size(ndims - 2) == n,
+        errors::InvalidArgument("Input matrices must be squares, got",
+                                input.dim_size(ndims - 2), " != ", n),
+        done);
+
+    // Allocate output.
+    Tensor* output;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->forward_input_or_allocate_output(
+                             {0}, 0, input.shape(), &output),
+                         done);
+
+    if (n == 0) {
+      // If X is an empty matrix (0 rows, 0 col), X * X' == X.
+      // Therefore, we return X.
+      done();
+      return;
+    }
+
+    // Copy the lower triangular part of the input matrices to the output and
+    // set the strictly upper triangular part to zero. We use a pre-existing
+    // kernel MatrixBandPart to do this for all matrices in the batch at once,
+    // before we launch each of the Cholesky factorization kernels in paralle.
+    auto input_reshaped = input.template flat_inner_dims<Scalar, 3>();
+    auto output_reshaped = output->template flat_inner_dims<Scalar, 3>();
+    functor::MatrixBandPart<GPUDevice, Scalar>::Compute(
+        context->eigen_device<GPUDevice>(), n, 0, input_reshaped,
+        output_reshaped);
+
+    // Launch a Cholesky kernel for each matrix in the batch.
+    const int64 batch_size = input_reshaped.dimension(0);
+    std::vector<DeviceLapackInfo> dev_info;
+    dev_info.emplace_back(context, batch_size, "potrf");
+    // TODO(rmlarsen): Parallelize over batches if it turns out to be
+    // an important use case.
+    CudaSolver solver(context);
+    for (int64 i = 0; i < batch_size; ++i) {
+      Scalar* output_ptr = output_reshaped.data() + i * n * n;
+      int* dev_info_ptr = dev_info.back().mutable_data() + i;
+      OP_REQUIRES_OK_ASYNC(
+          context,
+          solver.Potrf(CUBLAS_FILL_MODE_UPPER, n, output_ptr, n, dev_info_ptr),
+          done);
+    }
+
+    // Register callback to check info after kernels finish.
+    auto info_checker = [context, dev_info, done](
+                            const Status& status,
+                            const std::vector<HostLapackInfo>& /* unused */) {
+      Status full_status = status;
+      if (!full_status.ok()) {
+        full_status.Update(errors::InvalidArgument(kErrMsg));
+      }
+      OP_REQUIRES_OK_ASYNC(context, full_status, done);
+      done();
+    };
+
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        solver.CopyLapackInfoToHostAsync(dev_info, std::move(info_checker)),
+        done);
+  }
+};
+
+REGISTER_LINALG_OP_GPU("Cholesky", (CholeskyOpGpu<float>), float);
+REGISTER_LINALG_OP_GPU("Cholesky", (CholeskyOpGpu<double>), double);
+
+#endif  // GOOGLE_CUDA
+
 REGISTER_LINALG_OP("Cholesky", (CholeskyOp<float>), float);
 REGISTER_LINALG_OP("Cholesky", (CholeskyOp<double>), double);
 REGISTER_LINALG_OP("BatchCholesky", (CholeskyOp<float>), float);
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 592621c52af96c89d54b4383fadc8ead8bc2717f..cd0414ef4096a2c1e9bbb7b9d90412e5492aca28 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -113,6 +113,8 @@ void ConcatGPU(
       Tensor* output, typename TTypes<T, 2>::Tensor* output_flat);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER);
+TF_CALL_complex64(REGISTER);
+TF_CALL_complex128(REGISTER);
 REGISTER(bfloat16);
 
 #undef REGISTER
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 019d6b6ab282233c66973b42b25cf84d951d524d..3ed6241b7a746030d41f4e62cb60480587f48bea 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -198,15 +198,23 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
       int split_size, typename TTypes<T, 2>::Matrix* output);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
+TF_CALL_complex64(REGISTER_GPUCONCAT32);
+TF_CALL_complex128(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
+TF_CALL_complex64(REGISTER_GPUCONCAT64);
+TF_CALL_complex128(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
+TF_CALL_complex64(REGISTER_GPU32);
+TF_CALL_complex128(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
+TF_CALL_complex64(REGISTER_GPU64);
+TF_CALL_complex128(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 
 #undef REGISTER_GPUCONCAT32
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index 9628a7efa4b767018c5e546122fd46835bb1c084..916bbc49963915c32aecf621318382d1d7bb1ba9 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -193,6 +193,8 @@ REGISTER_CONCAT(bfloat16);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index a59277f18e2da5bd046953e49f3c7e26e9012225..20394cad432abc8b9c215af33c008d09d2b387e9 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -99,25 +99,26 @@ struct LaunchXsmmBackwardFilter {
                   typename TTypes<T, 4>::Tensor kernel,
                   typename TTypes<T, 4>::ConstTensor output_backward,
                   int input_rows, int input_cols, int row_stride,
-                  int col_stride, int pad_h, int pad_w, TensorFormat data_format) const {
+                  int col_stride, int pad_h, int pad_w,
+                  TensorFormat data_format) const {
     return false;
   }
 };
- 
+
 template <>
 struct LaunchXsmmBackwardFilter<CPUDevice, float> {
   bool operator()(OpKernelContext* context, const CPUDevice& d,
                   typename TTypes<float, 4>::ConstTensor input,
                   typename TTypes<float, 4>::Tensor filter,
-                  typename TTypes<float, 4>::ConstTensor output,
-                  int input_rows, int input_cols, int row_stride,
-                  int col_stride,int pad_h, int pad_w,  TensorFormat data_format) const {
+                  typename TTypes<float, 4>::ConstTensor output, int input_rows,
+                  int input_cols, int row_stride, int col_stride, int pad_h,
+                  int pad_w, TensorFormat data_format) const {
     auto batch = input.dimension(0);
     auto in_depth = input.dimension(3);
     auto out_depth = output.dimension(3);
     auto filter_rows = filter.dimension(0);
     auto filter_cols = filter.dimension(1);
- 
+
     auto num_threads =
         context->device()->tensorflow_cpu_worker_threads()->num_threads;
     // See libxsmm_dnn.h for this struct definition.
@@ -144,13 +145,11 @@ struct LaunchXsmmBackwardFilter<CPUDevice, float> {
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
     desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
- 
- 
+
     if (!CanUseXsmmConv2D(desc, data_format)) {
       return false;
     }
- 
- 
+
     auto input_ptr = input.data();
     auto filter_ptr = filter.data();
     auto output_ptr = output.data();
@@ -161,8 +160,6 @@ struct LaunchXsmmBackwardFilter<CPUDevice, float> {
 };
 #endif
 
-
-
 template <typename Device, class T>
 class Conv2DFastBackpropFilterOp : public OpKernel {
  public:
@@ -210,8 +207,7 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, filter_shape, &filter_backprop));
 
-    #if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
- 
+#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
     int64 pad_top, pad_bottom;
     int64 pad_left, pad_right;
     OP_REQUIRES_OK(
@@ -226,22 +222,20 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
             dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
             dims.spatial_dims[1].stride, padding_,
             &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
- 
-    if ( pad_left == pad_right && pad_top == pad_bottom ) {
- 
+
+    if (pad_left == pad_right && pad_top == pad_bottom) {
       if (LaunchXsmmBackwardFilter<Device, T>()(
-            context, context->eigen_device<Device>(),
-            input.tensor<T, 4>(),filter_backprop->tensor<T, 4>(),
-            out_backprop.tensor<T, 4>(),  dims.spatial_dims[0].input_size,
-            dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
-            (int)dims.spatial_dims[1].stride,(int)pad_top, (int)pad_left, data_format_)) {
-      return;
+              context, context->eigen_device<Device>(), input.tensor<T, 4>(),
+              filter_backprop->tensor<T, 4>(), out_backprop.tensor<T, 4>(),
+              dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
+              static_cast<int>(dims.spatial_dims[0].stride),
+              static_cast<int>(dims.spatial_dims[1].stride),
+              static_cast<int>(pad_top), static_cast<int>(pad_left),
+              data_format_)) {
+        return;
       }
     }
-    #endif
-
-
-
+#endif
 
     functor::SpatialConvolutionBackwardKernel<Device, T>()(
         context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(),
@@ -321,19 +315,20 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
             dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
             dims.spatial_dims[1].stride, padding_,
             &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
-  #if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
-    if ( pad_left == pad_right && pad_top == pad_bottom ) {
- 
+#if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
+    if (pad_left == pad_right && pad_top == pad_bottom) {
       if (LaunchXsmmBackwardFilter<Device, T>()(
-            context, context->eigen_device<Device>(),
-            input.tensor<T, 4>(),filter_backprop->tensor<T, 4>(),
-            out_backprop.tensor<T, 4>(),  dims.spatial_dims[0].input_size,
-            dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
-            (int)dims.spatial_dims[1].stride,(int)pad_top, (int)pad_left, data_format_)) {
-      return;
+              context, context->eigen_device<Device>(), input.tensor<T, 4>(),
+              filter_backprop->tensor<T, 4>(), out_backprop.tensor<T, 4>(),
+              dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
+              static_cast<int>(dims.spatial_dims[0].stride),
+              static_cast<int>(dims.spatial_dims[1].stride),
+              static_cast<int>(pad_top), static_cast<int>(pad_left),
+              data_format_)) {
+        return;
       }
     }
-  #endif
+#endif
 
     // The total dimension size of each kernel.
     const int filter_total_size = dims.spatial_dims[0].filter_size *
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 7e0912b4dbcaa5d80156c9dc094d7adc2fe0df6a..9a50431a2fa9c9c66590f923762d6e30afb16d70 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -131,7 +131,8 @@ struct LaunchXsmmBackwardInputConvolution {
                   typename TTypes<T, 4>::ConstTensor kernel,
                   typename TTypes<T, 4>::ConstTensor output_backward,
                   int input_rows, int input_cols, int row_stride,
-                  int col_stride, int pad_h, int pad_w, TensorFormat data_format) const {
+                  int col_stride, int pad_h, int pad_w,
+                  TensorFormat data_format) const {
     return false;
   }
 };
@@ -143,7 +144,8 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
                   typename TTypes<float, 4>::ConstTensor kernel,
                   typename TTypes<float, 4>::ConstTensor output_backward,
                   int input_rows, int input_cols, int row_stride,
-                  int col_stride, int pad_h, int pad_w, TensorFormat data_format) const {
+                  int col_stride, int pad_h, int pad_w,
+                  TensorFormat data_format) const {
     auto batch = input_backward.dimension(0);
     auto in_depth = input_backward.dimension(3);
     auto out_depth = output_backward.dimension(3);
@@ -251,13 +253,16 @@ class Conv2DFastBackpropInputOp : public OpKernel {
             dims.spatial_dims[1].stride, padding_,
             &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
 
-    if ( pad_left == pad_right && pad_top == pad_bottom ) {
+    if (pad_left == pad_right && pad_top == pad_bottom) {
       if (LaunchXsmmBackwardInputConvolution<Device, T>()(
-            context, context->eigen_device<Device>(),
-            in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
-            out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
-            dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
-            (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left, data_format_)) {
+              context, context->eigen_device<Device>(),
+              in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
+              out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
+              dims.spatial_dims[1].input_size,
+              static_cast<int>(dims.spatial_dims[0].stride),
+              static_cast<int>(dims.spatial_dims[1].stride),
+              static_cast<int>(pad_top), static_cast<int>(pad_left),
+              data_format_)) {
         return;
       }
     }
@@ -326,8 +331,8 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input_shape, &in_backprop));
 
-    // TODO(andydavis) Consider moving code shared with
-    // Conv2DCustomBackpropFilterOp into a shared helper function.
+// TODO(andydavis) Consider moving code shared with
+// Conv2DCustomBackpropFilterOp into a shared helper function.
 #if defined TENSORFLOW_USE_LIBXSMM && defined TENSORFLOW_USE_LIBXSMM_BACKWARD
     int64 pad_top, pad_bottom;
     int64 pad_left, pad_right;
@@ -344,13 +349,16 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
             dims.spatial_dims[1].stride, padding_,
             &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
 
-    if ( pad_left == pad_right && pad_top == pad_bottom ) {
+    if (pad_left == pad_right && pad_top == pad_bottom) {
       if (LaunchXsmmBackwardInputConvolution<Device, T>()(
-            context, context->eigen_device<Device>(),
-            in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
-            out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
-            dims.spatial_dims[1].input_size, (int)dims.spatial_dims[0].stride,
-            (int)dims.spatial_dims[1].stride, (int)pad_top, (int)pad_left, data_format_)) {
+              context, context->eigen_device<Device>(),
+              in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
+              out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
+              dims.spatial_dims[1].input_size,
+              static_cast<int>(dims.spatial_dims[0].stride),
+              static_cast<int>(dims.spatial_dims[1].stride),
+              static_cast<int>(pad_top), static_cast<int>(pad_left),
+              data_format_)) {
         return;
       }
     }
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 8076daf387bac7c29981b9c7e25a9e8df7fadd26..b3803778c8676518a96ac211b6878812a22dd79b 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -117,6 +117,12 @@ class LaunchConv2DOp<CPUDevice, T> {
               const Tensor& input, const Tensor& filter, int row_stride,
               int col_stride, const Eigen::PaddingType& padding, Tensor* output,
               TensorFormat data_format) {
+    if (data_format != FORMAT_NHWC) {
+      ctx->SetStatus(
+          errors::Unimplemented("Generic conv implementation only supports "
+                                "NHWC tensor format for now."));
+      return;
+    }
     LaunchGeneric<CPUDevice, T>::launch(ctx, input, filter, row_stride,
                                         col_stride, padding, output,
                                         data_format);
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 219e6d5e978b407946ea94c971837f04d70e5604..291ebf2298762d25e2d44aa5b82ffd495ea92c0e 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -74,8 +74,9 @@ enum SamplingMode {
 //       my_vector[current] *= 10.0f;
 //     }
 // });
-void FusedConvParallelFor(OpKernelContext* context, int64 begin, int64 end,
-                          std::function<void(int64, int64)> task_function) {
+void FusedConvParallelFor(
+    OpKernelContext* context, int64 begin, int64 end,
+    const std::function<void(int64, int64)>& task_function) {
 // On iOS, the thread management imposes a very big performance penalty, so
 // just call the function directly with no multithreading.
 #if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
@@ -712,7 +713,7 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
       const int32 before =
           paddings_matrix(d, 0);  // Pad before existing elements.
       const int32 after =
-          paddings_matrix(d, 1);  // Pad after exisitng elements.
+          paddings_matrix(d, 1);  // Pad after existing elements.
       OP_REQUIRES(context, before >= 0 && after >= 0,
                   errors::InvalidArgument("paddings must be non-negative: ",
                                           before, " ", after));
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index b122e7f0e847dc6bdfeb313b8a81bb48655aa059..cd9aa4a53efface3654dc405887d6fd82dfacf04 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -116,8 +116,9 @@ class FusedResizePadConvOpTest : public OpsTestBase {
                                int input_depth, int resize_width,
                                int resize_height, int y_padding, int x_padding,
                                int filter_size, int filter_count,
-                               bool resize_align_corners, string pad_mode,
-                               int stride, string padding) {
+                               bool resize_align_corners,
+                               const string& pad_mode, int stride,
+                               const string& padding) {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
@@ -170,8 +171,8 @@ class FusedResizePadConvOpTest : public OpsTestBase {
   void CompareFusedPadOnlyAndSeparate(int input_width, int input_height,
                                       int input_depth, int y_padding,
                                       int x_padding, int filter_size,
-                                      int filter_count, string pad_mode,
-                                      int stride, string padding) {
+                                      int filter_count, const string& pad_mode,
+                                      int stride, const string& padding) {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index caf73420ba95f1cb8afe1818250a13a9468e846a..c68a8b0bd27cd0fc1c3a5e93e1ff713ff03741e0 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/crop_and_resize_op.h"
 
+#include <functional>
+#include <string>
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,10 +29,13 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
@@ -37,41 +43,67 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+using Callback = std::function<void()>;
+
+namespace {
 
-static inline void ParseAndCheckBoxSizes(OpKernelContext* context,
-                                         const Tensor& boxes,
-                                         const Tensor& box_ind,
-                                         int* num_boxes) {
-  if (boxes.NumElements() == 0 && box_ind.NumElements() == 0) {
+static inline Status ParseAndCheckBoxSizes(const Tensor& boxes,
+                                           const Tensor& box_index,
+                                           int* num_boxes) {
+  if (boxes.NumElements() == 0 && box_index.NumElements() == 0) {
     *num_boxes = 0;
-    return;
+    return Status::OK();
   }
   // The shape of 'boxes' is [num_boxes, 4].
-  OP_REQUIRES(context, boxes.dims() == 2,
-              errors::InvalidArgument("boxes must be 2-D",
-                                      boxes.shape().DebugString()));
+  if (boxes.dims() != 2) {
+    return errors::InvalidArgument("boxes must be 2-D",
+                                   boxes.shape().DebugString());
+  }
   *num_boxes = boxes.dim_size(0);
-  OP_REQUIRES(context, boxes.dim_size(1) == 4,
-              errors::InvalidArgument("boxes must have 4 columns"));
-
-  // The shape of 'box_ind' is [num_boxes].
-  OP_REQUIRES(context, box_ind.dims() == 1,
-              errors::InvalidArgument("box_ind must be 1-D",
-                                      box_ind.shape().DebugString()));
-  OP_REQUIRES(context, box_ind.dim_size(0) == *num_boxes,
-              errors::InvalidArgument("box_ind has incompatible shape"));
+  if (boxes.dim_size(1) != 4) {
+    return errors::InvalidArgument("boxes must have 4 columns");
+  }
+  // The shape of 'box_index' is [num_boxes].
+  if (box_index.dims() != 1) {
+    return errors::InvalidArgument("box_index must be 1-D",
+                                   box_index.shape().DebugString());
+  }
+  if (box_index.dim_size(0) != *num_boxes) {
+    return errors::InvalidArgument("box_index has incompatible shape");
+  }
+  return Status::OK();
 }
 
-// Verifies that all values in box_ind are in [0, batch).
+// Conditionally calls the compute callback if all values in box_index are in
+// [0, batch_size) then calls done.
 template <typename Device>
-inline void CheckValidBoxInd(
-    OpKernelContext* context,
-    typename TTypes<int32, 1>::ConstTensor box_ind_data, int batch);
+inline void RunIfBoxIndexIsValid(
+    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, const Callback& compute, const Callback& done);
+
+// Specialization of CheckValidBoxIndex for a CPUDevice.
+template <>
+inline void RunIfBoxIndexIsValid<CPUDevice>(
+    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, const Callback& compute, const Callback& done) {
+  const int num_boxes = box_index.dimension(0);
+  for (int b = 0; b < num_boxes; ++b) {
+    OP_REQUIRES_ASYNC(
+        context, FastBoundsCheck(box_index(b), batch_size),
+        errors::OutOfRange("box_index has values outside [0, batch_size)"),
+        done);
+  }
+  compute();
+  done();
+}
+
+}  // namespace
 
 template <typename Device, typename T>
-class CropAndResizeOp : public OpKernel {
+class CropAndResizeOp : public AsyncOpKernel {
  public:
-  explicit CropAndResizeOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit CropAndResizeOp(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {
     string method;
     OP_REQUIRES_OK(context, context->GetAttr("method", &method));
     OP_REQUIRES(context, method == "bilinear",
@@ -80,69 +112,77 @@ class CropAndResizeOp : public OpKernel {
                                              &extrapolation_value_));
   }
 
-  void Compute(OpKernelContext* context) override {
-    // The shape of 'image' is [batch, image_height, image_width, channels].
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
+    // The shape of 'image' is [batch_size, image_height, image_width,
+    // channels].
     const Tensor& image = context->input(0);
-    OP_REQUIRES(context, image.dims() == 4,
-                errors::InvalidArgument("input image must be 4-D",
-                                        image.shape().DebugString()));
-
-    const int batch = image.dim_size(0);
-    const int image_height = image.dim_size(1);
-    const int image_width = image.dim_size(2);
-    const int depth = image.dim_size(3);
-    OP_REQUIRES(context, image_height > 0 && image_width > 0,
-                errors::InvalidArgument("image dimensions must be positive"));
-
     // The shape of 'boxes' is [num_boxes, 4].
     const Tensor& boxes = context->input(1);
-
-    // The shape of 'box_ind' is [num_boxes].
-    const Tensor& box_ind = context->input(2);
-
-    int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes);
-
+    // The shape of 'box_index' is [num_boxes].
+    const Tensor& box_index = context->input(2);
     // The shape of 'crop_size' is [2].
     const Tensor& crop_size = context->input(3);
 
-    OP_REQUIRES(context, crop_size.dims() == 1,
-                errors::InvalidArgument("crop_size must be 1-D",
-                                        crop_size.shape().DebugString()));
-    OP_REQUIRES(context, crop_size.dim_size(0) == 2,
-                errors::InvalidArgument("crop_size must have two elements",
-                                        crop_size.shape().DebugString()));
-
+    // Validate inputs dimensions.
+    OP_REQUIRES_ASYNC(context, image.dims() == 4,
+                      errors::InvalidArgument("input image must be 4-D",
+                                              image.shape().DebugString()),
+                      done);
+    const int batch_size = image.dim_size(0);
+    const int image_height = image.dim_size(1);
+    const int image_width = image.dim_size(2);
+    const int depth = image.dim_size(3);
+    OP_REQUIRES_ASYNC(
+        context, image_height > 0 && image_width > 0,
+        errors::InvalidArgument("image dimensions must be positive"), done);
+    int num_boxes = 0;
+    OP_REQUIRES_OK_ASYNC(
+        context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
+
+    OP_REQUIRES_ASYNC(context, crop_size.dims() == 1,
+                      errors::InvalidArgument("crop_size must be 1-D",
+                                              crop_size.shape().DebugString()),
+                      done);
+    OP_REQUIRES_ASYNC(
+        context, crop_size.dim_size(0) == 2,
+        errors::InvalidArgument("crop_size must have two elements",
+                                crop_size.shape().DebugString()),
+        done);
+
+    // Copy and validate crop sizes.
     auto crop_size_vec = crop_size.vec<int32>();
     const int crop_height = internal::SubtleMustCopy(crop_size_vec(0));
     const int crop_width = internal::SubtleMustCopy(crop_size_vec(1));
-    OP_REQUIRES(context, crop_height > 0 && crop_width > 0,
-                errors::InvalidArgument("crop dimensions must be positive"));
+    OP_REQUIRES_ASYNC(
+        context, crop_height > 0 && crop_width > 0,
+        errors::InvalidArgument("crop dimensions must be positive"), done);
 
     // Allocate output tensor.
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(
+    OP_REQUIRES_OK_ASYNC(
         context,
         context->allocate_output(
             0, TensorShape({num_boxes, crop_height, crop_width, depth}),
-            &output));
-
-    typename TTypes<T, 4>::ConstTensor image_data = image.tensor<T, 4>();
-    typename TTypes<float, 2>::ConstTensor boxes_data =
-        boxes.tensor<float, 2>();
-    typename TTypes<int32, 1>::ConstTensor box_ind_data =
-        box_ind.tensor<int32, 1>();
-    typename TTypes<float, 4>::Tensor crops_data = output->tensor<float, 4>();
-
-    CheckValidBoxInd<Device>(context, box_ind_data, batch);
-
-    bool status = functor::CropAndResize<Device, T>()(
-        context->eigen_device<Device>(), image_data, boxes_data, box_ind_data,
-        extrapolation_value_, crops_data);
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launch CropAndResizeKernel."));
-    }
+            &output),
+        done);
+
+    auto compute_callback = [this, context, output]() {
+      const Tensor& image = context->input(0);
+      const Tensor& boxes = context->input(1);
+      const Tensor& box_index = context->input(2);
+      const bool status = functor::CropAndResize<Device, T>()(
+          context->eigen_device<Device>(), image.tensor<T, 4>(),
+          boxes.tensor<float, 2>(), box_index.tensor<int32, 1>(),
+          extrapolation_value_, output->tensor<float, 4>());
+      if (!status) {
+        context->SetStatus(
+            errors::Internal("Failed launch CropAndResizeKernel."));
+      }
+    };
+
+    RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
+                                 batch_size, std::move(compute_callback),
+                                 std::move(done));
   }
 
  private:
@@ -155,10 +195,10 @@ template <typename T>
 struct CropAndResize<CPUDevice, T> {
   bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor image,
                   typename TTypes<float, 2>::ConstTensor boxes,
-                  typename TTypes<int32, 1>::ConstTensor box_ind,
+                  typename TTypes<int32, 1>::ConstTensor box_index,
                   float extrapolation_value,
                   typename TTypes<float, 4>::Tensor crops) {
-    const int batch = image.dimension(0);
+    const int batch_size = image.dimension(0);
     const int image_height = image.dimension(1);
     const int image_width = image.dimension(2);
 
@@ -173,8 +213,8 @@ struct CropAndResize<CPUDevice, T> {
       const float y2 = boxes(b, 2);
       const float x2 = boxes(b, 3);
 
-      const int32 b_in = box_ind(b);
-      if (b_in < 0 || b_in >= batch) {
+      const int32 b_in = box_index(b);
+      if (!FastBoundsCheck(b_in, batch_size)) {
         continue;
       }
 
@@ -216,12 +256,14 @@ struct CropAndResize<CPUDevice, T> {
           const float x_lerp = in_x - left_x_index;
 
           for (int d = 0; d < depth; ++d) {
-            const float top_left(image(b_in, top_y_index, left_x_index, d));
-            const float top_right(image(b_in, top_y_index, right_x_index, d));
-            const float bottom_left(
-                image(b_in, bottom_y_index, left_x_index, d));
-            const float bottom_right(
-                image(b_in, bottom_y_index, right_x_index, d));
+            const float top_left(
+                static_cast<float>(image(b_in, top_y_index, left_x_index, d)));
+            const float top_right(
+                static_cast<float>(image(b_in, top_y_index, right_x_index, d)));
+            const float bottom_left(static_cast<float>(
+                image(b_in, bottom_y_index, left_x_index, d)));
+            const float bottom_right(static_cast<float>(
+                image(b_in, bottom_y_index, right_x_index, d)));
             const float top = top_left + (top_right - top_left) * x_lerp;
             const float bottom =
                 bottom_left + (bottom_right - bottom_left) * x_lerp;
@@ -233,89 +275,94 @@ struct CropAndResize<CPUDevice, T> {
     return true;
   }
 };
+
 }  // namespace functor
 
 template <typename Device, typename T>
-class CropAndResizeGradImageOp : public OpKernel {
+class CropAndResizeGradImageOp : public AsyncOpKernel {
  public:
   explicit CropAndResizeGradImageOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+      : AsyncOpKernel(context) {
     string method;
     OP_REQUIRES_OK(context, context->GetAttr("method", &method));
     OP_REQUIRES(context, method == "bilinear",
                 errors::InvalidArgument("method must be 'bilinear'", method));
   }
 
-  void Compute(OpKernelContext* context) override {
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
     // The shape of 'grads' is [num_boxes, crop_height, crop_width, depth].
     const Tensor& grads = context->input(0);
-
-    OP_REQUIRES(context, grads.dims() == 4,
-                errors::InvalidArgument("grads image must be 4-D",
-                                        grads.shape().DebugString()));
-    const int crop_height = grads.dim_size(1);
-    const int crop_width = grads.dim_size(2);
-    OP_REQUIRES(context, crop_height > 0 && crop_width > 0,
-                errors::InvalidArgument("grads dimensions must be positive"));
-
     // The shape of 'boxes' is [num_boxes, 4].
     const Tensor& boxes = context->input(1);
-
-    // The shape of 'box_ind' is [num_boxes].
-    const Tensor& box_ind = context->input(2);
-
-    int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes);
-
-    OP_REQUIRES(
-        context, grads.dim_size(0) == num_boxes,
-        errors::InvalidArgument("boxes and grads have incompatible shape"));
-
+    // The shape of 'box_index' is [num_boxes].
+    const Tensor& box_index = context->input(2);
     // The shape of 'image_size' is [4].
     const Tensor& image_size = context->input(3);
-    OP_REQUIRES(context, image_size.dims() == 1,
-                errors::InvalidArgument("image_size must be 1-D",
-                                        image_size.shape().DebugString()));
-    OP_REQUIRES(context, image_size.dim_size(0) == 4,
-                errors::InvalidArgument("image_size must have 4 elements",
-                                        image_size.shape().DebugString()));
 
+    // Validate input shapes.
+    OP_REQUIRES_ASYNC(context, grads.dims() == 4,
+                      errors::InvalidArgument("grads image must be 4-D",
+                                              grads.shape().DebugString()),
+                      done);
+    const int crop_height = grads.dim_size(1);
+    const int crop_width = grads.dim_size(2);
+    OP_REQUIRES_ASYNC(
+        context, crop_height > 0 && crop_width > 0,
+        errors::InvalidArgument("grads dimensions must be positive"), done);
+    int num_boxes = 0;
+    OP_REQUIRES_OK_ASYNC(
+        context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
+    OP_REQUIRES_ASYNC(
+        context, grads.dim_size(0) == num_boxes,
+        errors::InvalidArgument("boxes and grads have incompatible shape"),
+        done);
+
+    OP_REQUIRES_ASYNC(context, image_size.dims() == 1,
+                      errors::InvalidArgument("image_size must be 1-D",
+                                              image_size.shape().DebugString()),
+                      done);
+    OP_REQUIRES_ASYNC(context, image_size.dim_size(0) == 4,
+                      errors::InvalidArgument("image_size must have 4 elements",
+                                              image_size.shape().DebugString()),
+                      done);
     auto image_size_vec = image_size.vec<int32>();
-    const int batch = internal::SubtleMustCopy(image_size_vec(0));
+    const int batch_size = internal::SubtleMustCopy(image_size_vec(0));
     const int image_height = internal::SubtleMustCopy(image_size_vec(1));
     const int image_width = internal::SubtleMustCopy(image_size_vec(2));
     const int depth = internal::SubtleMustCopy(image_size_vec(3));
-
-    OP_REQUIRES(context, image_height > 0 && image_width > 0,
-                errors::InvalidArgument("image dimensions must be positive"));
-    OP_REQUIRES(
+    OP_REQUIRES_ASYNC(
+        context, image_height > 0 && image_width > 0,
+        errors::InvalidArgument("image dimensions must be positive"), done);
+    OP_REQUIRES_ASYNC(
         context, grads.dim_size(3) == depth,
-        errors::InvalidArgument("image_size and grads are incompatible"));
+        errors::InvalidArgument("image_size and grads are incompatible"), done);
 
     // Allocate output tensor.
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(
-        context, context->allocate_output(
-                     0, TensorShape({batch, image_height, image_width, depth}),
-                     &output));
-
-    typename TTypes<float, 4>::ConstTensor grads_data =
-        grads.tensor<float, 4>();
-    typename TTypes<float, 2>::ConstTensor boxes_data =
-        boxes.tensor<float, 2>();
-    typename TTypes<int32, 1>::ConstTensor box_ind_data =
-        box_ind.tensor<int32, 1>();
-    typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
-
-    CheckValidBoxInd<Device>(context, box_ind_data, batch);
-
-    bool status = functor::CropAndResizeBackpropImage<Device, T>()(
-        context->eigen_device<Device>(), grads_data, boxes_data, box_ind_data,
-        output_data);
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launch CropAndResizeBackpropImageKernel."));
-    }
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->allocate_output(
+            0, TensorShape({batch_size, image_height, image_width, depth}),
+            &output),
+        done);
+
+    auto compute_callback = [context, output]() {
+      const Tensor& grads = context->input(0);
+      const Tensor& boxes = context->input(1);
+      const Tensor& box_index = context->input(2);
+      const bool status = functor::CropAndResizeBackpropImage<Device, T>()(
+          context->eigen_device<Device>(), grads.tensor<float, 4>(),
+          boxes.tensor<float, 2>(), box_index.tensor<int32, 1>(),
+          output->tensor<T, 4>());
+      if (!status) {
+        context->SetStatus(errors::Internal(
+            "Failed launch CropAndResizeBackpropImage kernel."));
+      }
+    };
+
+    RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
+                                 batch_size, std::move(compute_callback),
+                                 std::move(done));
   }
 };
 
@@ -326,9 +373,9 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
   bool operator()(const CPUDevice& d,
                   typename TTypes<float, 4>::ConstTensor grads,
                   typename TTypes<float, 2>::ConstTensor boxes,
-                  typename TTypes<int32, 1>::ConstTensor box_ind,
+                  typename TTypes<int32, 1>::ConstTensor box_index,
                   typename TTypes<T, 4>::Tensor grads_image) {
-    const int batch = grads_image.dimension(0);
+    const int batch_size = grads_image.dimension(0);
     const int image_height = grads_image.dimension(1);
     const int image_width = grads_image.dimension(2);
 
@@ -345,8 +392,8 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
       const float y2 = boxes(b, 2);
       const float x2 = boxes(b, 3);
 
-      const int32 b_in = box_ind(b);
-      if (b_in < 0 || b_in >= batch) {
+      const int32 b_in = box_index(b);
+      if (!FastBoundsCheck(b_in, batch_size)) {
         continue;
       }
 
@@ -397,83 +444,90 @@ struct CropAndResizeBackpropImage<CPUDevice, T> {
     return true;
   }
 };
+
 }  // namespace functor
 
 template <typename Device, typename T>
-class CropAndResizeGradBoxesOp : public OpKernel {
+class CropAndResizeGradBoxesOp : public AsyncOpKernel {
  public:
   explicit CropAndResizeGradBoxesOp(OpKernelConstruction* context)
-      : OpKernel(context) {
+      : AsyncOpKernel(context) {
     string method;
     OP_REQUIRES_OK(context, context->GetAttr("method", &method));
     OP_REQUIRES(context, method == "bilinear",
                 errors::InvalidArgument("method must be 'bilinear'", method));
   }
 
-  void Compute(OpKernelContext* context) override {
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
     // The shape of 'grads' is [num_boxes, crop_height, crop_width, depth].
     const Tensor& grads = context->input(0);
+    // The shape of 'boxes' is [num_boxes, 4].
+    const Tensor& boxes = context->input(2);
+    // The shape of 'box_index' is [num_boxes].
+    const Tensor& box_index = context->input(3);
+    // The shape of 'image' is [batch_size, image_height, image_width, depth].
+    const Tensor& image = context->input(1);
 
-    OP_REQUIRES(context, grads.dims() == 4,
-                errors::InvalidArgument("grads image must be 4-D",
-                                        grads.shape().DebugString()));
-
+    // Validate input shapes.
+    OP_REQUIRES_ASYNC(context, grads.dims() == 4,
+                      errors::InvalidArgument("grads image must be 4-D",
+                                              grads.shape().DebugString()),
+                      done);
     const int crop_height = grads.dim_size(1);
     const int crop_width = grads.dim_size(2);
     const int depth = grads.dim_size(3);
-    OP_REQUIRES(context, crop_height > 0 && crop_width > 0,
-                errors::InvalidArgument("grads dimensions must be positive"));
-
-    // The shape of 'image' is [batch, image_height, image_width, depth].
-    const Tensor& image = context->input(1);
-    OP_REQUIRES(context, image.dims() == 4,
-                errors::InvalidArgument("input image must be 4-D",
-                                        image.shape().DebugString()));
-
-    const int batch = image.dim_size(0);
+    OP_REQUIRES_ASYNC(
+        context, crop_height > 0 && crop_width > 0,
+        errors::InvalidArgument("grads dimensions must be positive"), done);
+
+    OP_REQUIRES_ASYNC(context, image.dims() == 4,
+                      errors::InvalidArgument("input image must be 4-D",
+                                              image.shape().DebugString()),
+                      done);
+    const int batch_size = image.dim_size(0);
     const int image_height = image.dim_size(1);
     const int image_width = image.dim_size(2);
-    OP_REQUIRES(context, image_height > 0 && image_width > 0,
-                errors::InvalidArgument("image dimensions must be positive"));
-    OP_REQUIRES(context, image.dim_size(3) == depth,
-                errors::InvalidArgument("image, grads depth differ"));
-
-    // The shape of 'boxes' is [num_boxes, 4].
-    const Tensor& boxes = context->input(2);
-
-    // The shape of 'box_ind' is [num_boxes].
-    const Tensor& box_ind = context->input(3);
+    OP_REQUIRES_ASYNC(
+        context, image_height > 0 && image_width > 0,
+        errors::InvalidArgument("image dimensions must be positive"), done);
+    OP_REQUIRES_ASYNC(context, image.dim_size(3) == depth,
+                      errors::InvalidArgument("image, grads depth differ"),
+                      done);
 
     int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes);
+    OP_REQUIRES_OK_ASYNC(
+        context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
 
-    OP_REQUIRES(
+    OP_REQUIRES_ASYNC(
         context, grads.dim_size(0) == num_boxes,
-        errors::InvalidArgument("boxes and grads have incompatible shape"));
+        errors::InvalidArgument("boxes and grads have incompatible shape"),
+        done);
 
     // Allocate output tensor.
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({num_boxes, 4}), &output));
-
-    typename TTypes<float, 4>::ConstTensor grads_data =
-        grads.tensor<float, 4>();
-    typename TTypes<T, 4>::ConstTensor image_data = image.tensor<T, 4>();
-    typename TTypes<float, 2>::ConstTensor boxes_data =
-        boxes.tensor<float, 2>();
-    typename TTypes<int32, 1>::ConstTensor box_ind_data =
-        box_ind.tensor<int32, 1>();
-    typename TTypes<float, 2>::Tensor output_data = output->tensor<float, 2>();
-
-    CheckValidBoxInd<Device>(context, box_ind_data, batch);
-
-    bool status = functor::CropAndResizeBackpropBoxes<Device, T>()(
-        context->eigen_device<Device>(), grads_data, image_data, boxes_data,
-        box_ind_data, output_data);
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launch CropAndResizeBackpropBoxesKernel."));
-    }
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->allocate_output(0, TensorShape({num_boxes, 4}), &output),
+        done);
+
+    auto compute_callback = [context, output]() {
+      const Tensor& grads = context->input(0);
+      const Tensor& image = context->input(1);
+      const Tensor& boxes = context->input(2);
+      const Tensor& box_index = context->input(3);
+      const bool status = functor::CropAndResizeBackpropBoxes<Device, T>()(
+          context->eigen_device<Device>(), grads.tensor<float, 4>(),
+          image.tensor<T, 4>(), boxes.tensor<float, 2>(),
+          box_index.tensor<int32, 1>(), output->tensor<float, 2>());
+      if (!status) {
+        context->SetStatus(errors::Internal(
+            "Failed launch CropAndResizeBackpropBoxes kernel."));
+      }
+    };
+
+    RunIfBoxIndexIsValid<Device>(context, box_index.tensor<int32, 1>(),
+                                 batch_size, std::move(compute_callback),
+                                 std::move(done));
   }
 };
 
@@ -485,9 +539,9 @@ struct CropAndResizeBackpropBoxes<CPUDevice, T> {
                   typename TTypes<float, 4>::ConstTensor grads,
                   typename TTypes<T, 4>::ConstTensor image,
                   typename TTypes<float, 2>::ConstTensor boxes,
-                  typename TTypes<int32, 1>::ConstTensor box_ind,
+                  typename TTypes<int32, 1>::ConstTensor box_index,
                   typename TTypes<float, 2>::Tensor grads_boxes) {
-    const int batch = image.dimension(0);
+    const int batch_size = image.dimension(0);
     const int image_height = image.dimension(1);
     const int image_width = image.dimension(2);
 
@@ -504,8 +558,8 @@ struct CropAndResizeBackpropBoxes<CPUDevice, T> {
       const float y2 = boxes(b, 2);
       const float x2 = boxes(b, 3);
 
-      const int32 b_in = box_ind(b);
-      if (b_in < 0 || b_in >= batch) {
+      const int32 b_in = box_index(b);
+      if (!FastBoundsCheck(b_in, batch_size)) {
         continue;
       }
 
@@ -545,12 +599,14 @@ struct CropAndResizeBackpropBoxes<CPUDevice, T> {
           const float x_lerp = in_x - left_x_index;
 
           for (int d = 0; d < depth; ++d) {
-            const float top_left(image(b_in, top_y_index, left_x_index, d));
-            const float top_right(image(b_in, top_y_index, right_x_index, d));
-            const float bottom_left(
-                image(b_in, bottom_y_index, left_x_index, d));
-            const float bottom_right(
-                image(b_in, bottom_y_index, right_x_index, d));
+            const float top_left(
+                static_cast<float>(image(b_in, top_y_index, left_x_index, d)));
+            const float top_right(
+                static_cast<float>(image(b_in, top_y_index, right_x_index, d)));
+            const float bottom_left(static_cast<float>(
+                image(b_in, bottom_y_index, left_x_index, d)));
+            const float bottom_right(static_cast<float>(
+                image(b_in, bottom_y_index, right_x_index, d)));
             // Compute the image gradient.
             float image_grad_y = (1 - x_lerp) * (bottom_left - top_left) +
                                  x_lerp * (bottom_right - top_right);
@@ -585,88 +641,123 @@ struct CropAndResizeBackpropBoxes<CPUDevice, T> {
     return true;
   }
 };
-}  // namespace functor
 
-// Specialization of CheckValidBoxInd for a CPUDevice.
-template <>
-inline void CheckValidBoxInd<CPUDevice>(
-    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_ind,
-    int batch) {
-  const int num_boxes = box_ind.dimension(0);
-  for (int b = 0; b < num_boxes; ++b) {
-    OP_REQUIRES(context, box_ind(b) >= 0 && box_ind(b) < batch,
-                errors::OutOfRange("box_ind has values outside [0, batch)"));
-  }
-}
+}  // namespace functor
 
-#define REGISTER_KERNEL(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("CropAndResize")                    \
-                              .Device(DEVICE_CPU)                  \
-                              .TypeConstraint<T>("T")              \
-                              .HostMemory("crop_size"),            \
-                          CropAndResizeOp<CPUDevice, T>);          \
-                                                                   \
-  REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradImage")           \
-                              .Device(DEVICE_CPU)                  \
-                              .TypeConstraint<T>("T")              \
-                              .HostMemory("image_size"),           \
-                          CropAndResizeGradImageOp<CPUDevice, T>); \
-                                                                   \
-  REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradBoxes")           \
-                              .Device(DEVICE_CPU)                  \
-                              .TypeConstraint<T>("T"),             \
+#define REGISTER_KERNEL(T)                                \
+  REGISTER_KERNEL_BUILDER(Name("CropAndResize")           \
+                              .Device(DEVICE_CPU)         \
+                              .TypeConstraint<T>("T")     \
+                              .HostMemory("crop_size"),   \
+                          CropAndResizeOp<CPUDevice, T>); \
+                                                          \
+  REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradBoxes")  \
+                              .Device(DEVICE_CPU)         \
+                              .TypeConstraint<T>("T"),    \
                           CropAndResizeGradBoxesOp<CPUDevice, T>);
 
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
+
+#undef REGISTER_KERNEL
+
+#define REGISTER_KERNEL(T)                               \
+  REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradImage") \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<T>("T")    \
+                              .HostMemory("image_size"), \
+                          CropAndResizeGradImageOp<CPUDevice, T>);
+
+TF_CALL_half(REGISTER_KERNEL);
 TF_CALL_float(REGISTER_KERNEL);
+TF_CALL_double(REGISTER_KERNEL);
 
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
 
-// Forward declaration of the CheckValidBoxIndHelper specialization for GPU.
+// Forward declaration of the CheckValidBoxIndexHelper specialization for GPU.
 namespace functor {
 template <>
-void CheckValidBoxIndHelper<GPUDevice>::operator()(
-    const GPUDevice& d, typename TTypes<int32, 1>::ConstTensor box_ind,
-    int batch, typename TTypes<bool, 0>::Tensor isvalid);
-extern template struct CheckValidBoxIndHelper<GPUDevice>;
+void CheckValidBoxIndexHelper<GPUDevice>::operator()(
+    const GPUDevice& d, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, typename TTypes<bool, 0>::Tensor isvalid);
+extern template struct CheckValidBoxIndexHelper<GPUDevice>;
 }  // namespace functor
 
-// Specialization of CheckValidBoxInd for a GPUDevice.
+namespace {
+
+// Specialization of CheckValidBoxIndex for a GPUDevice.
 template <>
-inline void CheckValidBoxInd<GPUDevice>(
-    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_ind,
-    int batch) {
-  const int num_boxes = box_ind.dimension(0);
+inline void RunIfBoxIndexIsValid<GPUDevice>(
+    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, const Callback& compute, const Callback& done) {
+  const int num_boxes = box_index.dimension(0);
   if (num_boxes == 0) {
+    compute();
+    done();
     return;
   }
-  Tensor isvalid_tensor;
-  OP_REQUIRES_OK(context,
-                 context->allocate_temp(DataTypeToEnum<bool>::value,
-                                        TensorShape({}), &isvalid_tensor));
 
-  typename TTypes<bool, 0>::Tensor isvalid = isvalid_tensor.tensor<bool, 0>();
+  Tensor isvalid_dev_tensor;
+  OP_REQUIRES_OK_ASYNC(
+      context,
+      context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
+                             &isvalid_dev_tensor),
+      done);
+  typename TTypes<bool, 0>::Tensor isvalid_dev =
+      isvalid_dev_tensor.tensor<bool, 0>();
 
-  functor::CheckValidBoxIndHelper<GPUDevice>()(
-      context->eigen_device<GPUDevice>(), box_ind, batch, isvalid);
+  // Run the actual box check on the device.
+  functor::CheckValidBoxIndexHelper<GPUDevice>()(
+      context->eigen_device<GPUDevice>(), box_index, batch_size, isvalid_dev);
 
+  // Copy the result back to the host.
   auto* stream = context->op_device_context()->stream();
-  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
-
-  bool isvalid_host = false;
-  perftools::gputools::DeviceMemoryBase isvalid_gpu(isvalid.data(),
-                                                    sizeof(bool));
-  stream->ThenMemcpy(&isvalid_host, isvalid_gpu, sizeof(bool));
-  stream->BlockHostUntilDone();
-
-  OP_REQUIRES(context, stream->ok(),
-              errors::Internal("cudaMemcpy from device to host failed"));
-
-  OP_REQUIRES(context, isvalid_host,
-              errors::OutOfRange("box_ind has values outside [0, batch)"));
+  OP_REQUIRES_ASYNC(context, stream,
+                    errors::Internal("No GPU stream available."), done);
+  Tensor isvalid_host_tensor;
+  // Use pinned host memory on the host to avoid unnecessary
+  // synchronization.
+  AllocatorAttributes alloc_attr;
+  alloc_attr.set_on_host(true);
+  alloc_attr.set_gpu_compatible(true);
+  OP_REQUIRES_OK_ASYNC(
+      context,
+      context->allocate_temp(DataTypeToEnum<bool>::value, TensorShape({}),
+                             &isvalid_host_tensor, alloc_attr),
+      done);
+  perftools::gputools::DeviceMemoryBase wrapped(isvalid_dev.data(),
+                                                sizeof(bool));
+  const bool status =
+      stream
+          ->ThenMemcpy(
+              isvalid_host_tensor.scalar<bool>().data() /* destination */,
+              wrapped /* source */, sizeof(bool))
+          .ok();
+  OP_REQUIRES_ASYNC(
+      context, status,
+      errors::Internal("Failed to launch copy of isvalid from device to host."),
+      done);
+
+  // We capture both temporary tensors to prevent them from being deallocated
+  // when ComputeAsync returns and before the closure runs.
+  auto wrapped_callback = [context, isvalid_host_tensor, isvalid_dev_tensor,
+                           compute, done]() {
+    const bool isvalid = isvalid_host_tensor.scalar<bool>()();
+    OP_REQUIRES_ASYNC(
+        context, isvalid,
+        errors::OutOfRange("box_index has values outside [0, batch_size)"),
+        done);
+    compute();
+    done();
+  };
+
+  context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
+      stream, wrapped_callback);
 }
 
+}  // namespace
+
 #define REGISTER_KERNEL(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("CropAndResize")                    \
                               .Device(DEVICE_GPU)                  \
@@ -685,7 +776,7 @@ inline void CheckValidBoxInd<GPUDevice>(
                               .TypeConstraint<T>("T"),             \
                           CropAndResizeGradBoxesOp<GPUDevice, T>);
 
-TF_CALL_float(REGISTER_KERNEL);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNEL);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/crop_and_resize_op.h b/tensorflow/core/kernels/crop_and_resize_op.h
index 22df1bdd56bd0ef1d610fcb684c1987e0e32ed98..460dbad22b484f7df8cd10221f183df75ecffb55 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.h
+++ b/tensorflow/core/kernels/crop_and_resize_op.h
@@ -53,12 +53,12 @@ struct CropAndResizeBackpropBoxes {
 };
 
 template <typename Device>
-struct CheckValidBoxIndHelper {
-  // Checks if all values in box_ind are in [0, batch).
+struct CheckValidBoxIndexHelper {
+  // Checks if all values in box_index are in [0, batch).
   void operator()(const Device& d,
-                  typename TTypes<int32, 1>::ConstTensor box_ind, int batch,
+                  typename TTypes<int32, 1>::ConstTensor box_index, int batch,
                   typename TTypes<bool, 0>::Tensor isvalid) {
-    isvalid.device(d) = ((box_ind >= 0) && (box_ind < batch)).all();
+    isvalid.device(d) = ((box_index >= 0) && (box_index < batch)).all();
   }
 };
 
diff --git a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
index 75146b28e66d26be70c33975a254a5dceaad4c15..c1235fda89216fb535b51170dd4967fd5eddd7f0 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
@@ -88,26 +88,26 @@ __global__ void CropAndResizeKernel(
     const int right_x_index = ceilf(in_x);
     const float x_lerp = in_x - left_x_index;
 
-    const float top_left(
+    const float top_left(static_cast<float>(
         image_ptr[((b_in * image_height + top_y_index) * image_width +
                    left_x_index) *
                       depth +
-                  d]);
-    const float top_right(
+                  d]));
+    const float top_right(static_cast<float>(
         image_ptr[((b_in * image_height + top_y_index) * image_width +
                    right_x_index) *
                       depth +
-                  d]);
-    const float bottom_left(
+                  d]));
+    const float bottom_left(static_cast<float>(
         image_ptr[((b_in * image_height + bottom_y_index) * image_width +
                    left_x_index) *
                       depth +
-                  d]);
-    const float bottom_right(
+                  d]));
+    const float bottom_right(static_cast<float>(
         image_ptr[((b_in * image_height + bottom_y_index) * image_width +
                    right_x_index) *
                       depth +
-                  d]);
+                  d]));
     const float top = top_left + (top_right - top_left) * x_lerp;
     const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
     crops_ptr[out_idx] = top + (bottom - top) * y_lerp;
@@ -258,26 +258,26 @@ __global__ void CropAndResizeBackpropBoxesKernel(
     const int right_x_index = ceilf(in_x);
     const float x_lerp = in_x - left_x_index;
 
-    const float top_left =
+    const float top_left(static_cast<float>(
         image_ptr[((b_in * image_height + top_y_index) * image_width +
                    left_x_index) *
                       depth +
-                  d];
-    const float top_right =
+                  d]));
+    const float top_right(static_cast<float>(
         image_ptr[((b_in * image_height + top_y_index) * image_width +
                    right_x_index) *
                       depth +
-                  d];
-    const float bottom_left =
+                  d]));
+    const float bottom_left(static_cast<float>(
         image_ptr[((b_in * image_height + bottom_y_index) * image_width +
                    left_x_index) *
                       depth +
-                  d];
-    const float bottom_right =
+                  d]));
+    const float bottom_right(static_cast<float>(
         image_ptr[((b_in * image_height + bottom_y_index) * image_width +
                    right_x_index) *
                       depth +
-                  d];
+                  d]));
 
     // Compute the image gradient.
     float image_grad_y = (1 - x_lerp) * (bottom_left - top_left) +
@@ -436,11 +436,11 @@ struct CropAndResizeBackpropBoxes<GPUDevice, T> {
   template struct CropAndResizeBackpropImage<GPUDevice, T>; \
   template struct CropAndResizeBackpropBoxes<GPUDevice, T>;
 
-TF_CALL_float(DEFINE_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 
 #undef DEFINE_GPU_SPECS
 
-template struct CheckValidBoxIndHelper<GPUDevice>;
+template struct CheckValidBoxIndexHelper<GPUDevice>;
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index 68e077e44dfc007f2af4e590fe551adcfe917fd1..d6139dae966812261f6d59158ba807bbdfe40283 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
@@ -31,9 +32,10 @@ namespace tensorflow {
 
 class CropAndResizeOpTest : public OpsTestBase {
  protected:
+  template <typename T>
   void MakeOp(float extrapolation_value) {
     TF_EXPECT_OK(NodeDefBuilder("crop_and_resize_op", "CropAndResize")
-                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DataTypeToEnum<T>::value))
                      .Input(FakeInput(DT_FLOAT))
                      .Input(FakeInput(DT_INT32))
                      .Input(FakeInput(DT_INT32))
@@ -43,12 +45,33 @@ class CropAndResizeOpTest : public OpsTestBase {
   }
 };
 
-TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1) {
-  MakeOp(0);
+#define REGISTER_TEST(T)                                               \
+  TEST_F(CropAndResizeOpTest, TestCropAndResize##T) {                  \
+    MakeOp<T>(0);                                                      \
+    AddInputFromArray<T>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});     \
+    AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});       \
+    AddInputFromArray<int32>(TensorShape({1}), {0});                   \
+    AddInputFromArray<int32>(TensorShape({2}), {1, 1});                \
+    TF_ASSERT_OK(RunOpKernel());                                       \
+                                                                       \
+    Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1})); \
+    test::FillValues<float>(&expected, {2.5});                         \
+    test::ExpectTensorEqual<float>(expected, *GetOutput(0));           \
+  }
+
+REGISTER_TEST(float)
+REGISTER_TEST(double)
+REGISTER_TEST(int8)
+REGISTER_TEST(uint8)
+
+#undef REGISTER_TEST
+
+TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1Uint8) {
+  MakeOp<uint8>(0);
   // Input:
   //  1, 2
   //  3, 4
-  AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
+  AddInputFromArray<uint8>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
   AddInputFromArray<int32>(TensorShape({1}), {0});
   AddInputFromArray<int32>(TensorShape({2}), {1, 1});
@@ -60,7 +83,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1) {
 }
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1Flipped) {
-  MakeOp(0);
+  MakeOp<float>(0);
   // Input:
   //  1, 2
   //  3, 4
@@ -76,7 +99,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To1x1Flipped) {
 }
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3) {
-  MakeOp(0);
+  MakeOp<float>(0);
   // Input:
   //  1, 2
   //  3, 4
@@ -97,7 +120,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3) {
 }
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3Flipped) {
-  MakeOp(0);
+  MakeOp<float>(0);
   // Input:
   //  1, 2
   //  3, 4
@@ -118,7 +141,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3Flipped) {
 }
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2) {
-  MakeOp(0);
+  MakeOp<float>(0);
   // Input:
   //  1, 2, 3
   //  4, 5, 6
@@ -143,7 +166,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2) {
 }
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2Flipped) {
-  MakeOp(0);
+  MakeOp<float>(0);
   // Input:
   //  1, 2, 3
   //  4, 5, 6
@@ -169,7 +192,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize3x3To2x2Flipped) {
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3Extrapolated) {
   const float v = -1;
-  MakeOp(v);
+  MakeOp<float>(v);
   // Input:
   //  1, 2
   //  3, 4
@@ -190,7 +213,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3Extrapolated) {
 }
 
 TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3NoCrop) {
-  MakeOp(0);
+  MakeOp<float>(0);
   // Input:
   //  1, 2
   //  3, 4
@@ -208,7 +231,7 @@ TEST_F(CropAndResizeOpTest, TestCropAndResize2x2To3x3NoCrop) {
 }
 
 TEST_F(CropAndResizeOpTest, TestInvalidInputShape) {
-  MakeOp(0);
+  MakeOp<float>(0);
   AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
   AddInputFromArray<int32>(TensorShape({1}), {0});
@@ -220,7 +243,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidInputShape) {
 }
 
 TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
-  MakeOp(0);
+  MakeOp<float>(0);
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
@@ -228,12 +251,12 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("box_ind has incompatible shape"))
+      StringPiece(s.ToString()).contains("box_index has incompatible shape"))
       << s;
 }
 
 TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
-  MakeOp(0);
+  MakeOp<float>(0);
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
   AddInputFromArray<int32>(TensorShape({1}), {1});
@@ -241,8 +264,10 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("box_ind has values outside [0, batch)"))
+                  .contains("box_index has values outside [0, batch_size)"))
       << s;
 }
 
+// TODO(zhengxq, rmlarsen): Add a benchmark.
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index 05d0169b112d1aa399fff8106d723e61dbfccf30..426382edeca7086676201cc86acd3b718c4bcb13 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -42,6 +42,8 @@ class CTCLossOp : public OpKernel {
                                      &preprocess_collapse_repeated_));
     OP_REQUIRES_OK(ctx,
                    ctx->GetAttr("ctc_merge_repeated", &ctc_merge_repeated_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("ignore_longer_outputs_than_inputs",
+                                     &ignore_longer_outputs_than_inputs_));
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -150,12 +152,15 @@ class CTCLossOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctc_loss_calculator.CalculateLoss(
                             seq_len_t, labels_t, input_list_t,
                             preprocess_collapse_repeated_, ctc_merge_repeated_,
-                            &loss_t, &gradient_list_t, &workers));
+                            ignore_longer_outputs_than_inputs_, &loss_t,
+                            &gradient_list_t, &workers));
   }
 
  private:
   bool preprocess_collapse_repeated_;
   bool ctc_merge_repeated_;
+  bool ignore_longer_outputs_than_inputs_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(CTCLossOp);
 };
 
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2d1a53af4b0cec297f94398d921a3e0c3126ac6
--- /dev/null
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -0,0 +1,336 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================
+*/
+#ifdef GOOGLE_CUDA
+#include "tensorflow/core/kernels/cuda_solvers.h"
+
+#include <chrono>
+#include <complex>
+#include <unordered_map>
+#include <vector>
+
+#include "cuda/include/cublas_v2.h"
+#include "cuda/include/cusolverDn.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+inline bool CopyHostToDevice(OpKernelContext* context, void* dst,
+                             const void* src, uint64 bytes) {
+  auto stream = context->op_device_context()->stream();
+  perftools::gputools::DeviceMemoryBase wrapped_dst(dst);
+  return stream->ThenMemcpy(&wrapped_dst, src, bytes).ok();
+}
+
+// Type traits to get CUDA complex types from std::complex<>.
+template <typename T>
+struct CUDAComplexT {
+  typedef T type;
+};
+template <>
+struct CUDAComplexT<std::complex<float>> {
+  typedef cuComplex type;
+};
+template <>
+struct CUDAComplexT<std::complex<double>> {
+  typedef cuDoubleComplex type;
+};
+// Converts pointers of std::complex<> to pointers of
+// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
+template <typename T>
+inline const typename CUDAComplexT<T>::type* CUDAComplex(const T* p) {
+  return reinterpret_cast<const typename CUDAComplexT<T>::type*>(p);
+}
+template <typename T>
+inline typename CUDAComplexT<T>::type* CUDAComplex(T* p) {
+  return reinterpret_cast<typename CUDAComplexT<T>::type*>(p);
+}
+
+// A set of initialized handles to the underlying Cuda libraries used by
+// CudaSolver. We maintain one such set of handles per unique stream.
+struct CudaSolverHandles {
+  explicit CudaSolverHandles(cudaStream_t stream) {
+    CHECK(cusolverDnCreate(&cusolver_dn_handle) == CUSOLVER_STATUS_SUCCESS)
+        << "Failed to create cuSolverDN instance.";
+    CHECK(cusolverDnSetStream(cusolver_dn_handle, stream) ==
+          CUSOLVER_STATUS_SUCCESS)
+        << "Failed to set cuSolverDN stream.";
+    CHECK(cublasCreate(&cublas_handle) == CUBLAS_STATUS_SUCCESS)
+        << "Failed to create cuBlas instance.";
+    CHECK(cublasSetStream(cublas_handle, stream) == CUBLAS_STATUS_SUCCESS)
+        << "Failed to set cuBlas stream.";
+  }
+
+  ~CudaSolverHandles() {
+    CHECK(cublasDestroy(cublas_handle) == CUBLAS_STATUS_SUCCESS)
+        << "Failed to destroy cuBlas instance.";
+    CHECK(cusolverDnDestroy(cusolver_dn_handle) == CUSOLVER_STATUS_SUCCESS)
+        << "Failed to destroy cuSolverDN instance.";
+  }
+  cublasHandle_t cublas_handle;
+  cusolverDnHandle_t cusolver_dn_handle;
+};
+
+static mutex handle_map_mutex(LINKER_INITIALIZED);
+
+using HandleMap =
+    std::unordered_map<cudaStream_t, std::unique_ptr<CudaSolverHandles>>;
+
+// Returns a singleton map used for storing initialized handles for each unique
+// cuda stream.
+HandleMap* GetHandleMapSingleton() {
+  static HandleMap* cm = new HandleMap;
+  return cm;
+}
+
+}  // namespace
+
+#define TF_RETURN_IF_CUSOLVER_ERROR(expr)                                      \
+  do {                                                                         \
+    auto status = (expr);                                                      \
+    if (TF_PREDICT_FALSE(status != CUSOLVER_STATUS_SUCCESS)) {                 \
+      return errors::Internal("cuSolverDN call failed with status =", status); \
+    }                                                                          \
+  } while (0)
+
+#define TF_RETURN_IF_CUBLAS_ERROR(expr)                                \
+  do {                                                                 \
+    auto status = (expr);                                              \
+    if (TF_PREDICT_FALSE(status != CUBLAS_STATUS_SUCCESS)) {           \
+      return errors::Internal("cuBlas call failed status = ", status); \
+    }                                                                  \
+  } while (0)
+
+CudaSolver::CudaSolver(OpKernelContext* context) : context_(context) {
+  const cudaStream_t* cu_stream_ptr = CHECK_NOTNULL(
+      reinterpret_cast<const cudaStream_t*>(context->op_device_context()
+                                                ->stream()
+                                                ->implementation()
+                                                ->CudaStreamMemberHack()));
+  cuda_stream_ = *cu_stream_ptr;
+  HandleMap* handle_map = CHECK_NOTNULL(GetHandleMapSingleton());
+  mutex_lock lock(handle_map_mutex);
+  auto it = handle_map->find(cuda_stream_);
+  if (it == handle_map->end()) {
+    LOG(INFO) << "Creating CudaSolver handles for stream " << cuda_stream_;
+    // Previously unseen Cuda stream. Initialize a set of Cuda solver library
+    // handles for it.
+    std::unique_ptr<CudaSolverHandles> new_handles(
+        new CudaSolverHandles(cuda_stream_));
+    it =
+        handle_map->insert(std::make_pair(cuda_stream_, std::move(new_handles)))
+            .first;
+  }
+  cusolver_dn_handle_ = it->second->cusolver_dn_handle;
+  cublas_handle_ = it->second->cublas_handle;
+}
+
+Status CudaSolver::CopyLapackInfoToHostAsync(
+    const std::vector<DeviceLapackInfo>& dev_lapack_infos,
+    std::function<void(const Status&, const std::vector<HostLapackInfo>&)>
+        info_checker_callback) const {
+  std::vector<HostLapackInfo> host_lapack_infos;
+  if (dev_lapack_infos.empty()) {
+    info_checker_callback(Status::OK(), std::move(host_lapack_infos));
+    return Status::OK();
+  }
+
+  // Launch memcpys to copy info back from the device to the host.
+  for (const auto& dev_lapack_info : dev_lapack_infos) {
+    bool success = true;
+    auto host_copy = dev_lapack_info.CopyToHost(&success);
+    if (!success) {
+      return errors::Internal(
+          "Failed to launch copy of dev_lapack_info to host, debug_info = ",
+          dev_lapack_info.debug_info());
+    }
+    host_lapack_infos.push_back(std::move(host_copy));
+  }
+
+  // This callback checks that all batch items in all calls were processed
+  // successfully and passes status to the info_checker_callback accordingly.
+  auto wrapped_info_checker_callback =
+      [info_checker_callback](std::vector<HostLapackInfo> host_lapack_infos) {
+        Status status;
+        for (auto host_lapack_info : host_lapack_infos) {
+          for (int i = 0; i < host_lapack_info.size() && status.ok(); ++i) {
+            const int info_value = (host_lapack_info.data())[i];
+            if (info_value != 0) {
+              status = errors::InvalidArgument(
+                  "Got info = ", info_value, " for batch index ", i,
+                  ", expected info = 0. Debug_info =",
+                  host_lapack_info.debug_info());
+            }
+          }
+          if (!status.ok()) {
+            break;
+          }
+        }
+        info_checker_callback(status, host_lapack_infos);
+      };
+  auto cb =
+      std::bind(wrapped_info_checker_callback, std::move(host_lapack_infos));
+  auto stream = context_->op_device_context()->stream();
+  context_->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
+      stream, std::move(cb));
+  return Status::OK();
+}
+
+// Macro that specializes a solver method for all 4 standard
+// numeric types.
+#define TF_CALL_LAPACK_TYPES(m) \
+  m(float, S) m(double, D) m(std::complex<float>, C) m(std::complex<double>, Z)
+
+// Macros to construct cusolverDn method names.
+#define DN_SOLVER_FN(method, lapack_prefix) cusolverDn##lapack_prefix##method
+#define DN_SOLVER_NAME(method, lapack_prefix) \
+  "cusolverDn" #lapack_prefix #method
+#define DN_BUFSIZE_FN(method, lapack_prefix) \
+  cusolverDn##lapack_prefix##method##_bufferSize
+
+// Macros to construct cublas method names.
+#define BLAS_SOLVER_FN(method, lapack_prefix) cublas##lapack_prefix##method
+#define BLAS_SOLVER_NAME(method, lapack_prefix) "cublas" #lapack_prefix #method
+
+//=============================================================================
+// Wrappers of cuSolverDN computational methods begin here.
+//
+// WARNING to implementers: The function signatures listed in the online docs
+// are sometimes inaccurate, e.g., are missing 'const' on pointers
+// to immutable arguments, while the actual headers have them as expected.
+// Check the actual declarations in the cusolver_api.h header file.
+//=============================================================================
+template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
+static inline Status PotrfImpl(BufSizeFnT bufsize, SolverFnT solver,
+                               OpKernelContext* context,
+                               cusolverDnHandle_t cusolver_dn_handle,
+                               cublasFillMode_t uplo, int n, Scalar* A, int lda,
+                               int* dev_lapack_info) {
+  /* Get amount of workspace memory required. */
+  int lwork;
+  TF_RETURN_IF_CUSOLVER_ERROR(
+      bufsize(cusolver_dn_handle, uplo, n, CUDAComplex(A), lda, &lwork));
+  /* Allocate device memory for workspace. */
+  ScratchSpace<Scalar> dev_workspace(context, lwork, /* on_host */ false);
+  /* Launch the solver kernel. */
+  TF_RETURN_IF_CUSOLVER_ERROR(solver(
+      cusolver_dn_handle, uplo, n, CUDAComplex(A), lda,
+      CUDAComplex(dev_workspace.mutable_data()), lwork, dev_lapack_info));
+  return Status::OK();
+}
+
+#define POTRF_INSTANCE(Scalar, lapack_prefix)                                \
+  template <>                                                                \
+  Status CudaSolver::Potrf<Scalar>(cublasFillMode_t uplo, int n, Scalar* A,  \
+                                   int lda, int* dev_lapack_info) const {    \
+    return PotrfImpl(DN_BUFSIZE_FN(potrf, lapack_prefix),                    \
+                     DN_SOLVER_FN(potrf, lapack_prefix), context_,           \
+                     cusolver_dn_handle_, uplo, n, A, lda, dev_lapack_info); \
+  }
+
+TF_CALL_LAPACK_TYPES(POTRF_INSTANCE);
+
+//=============================================================================
+// Wrappers of cuBlas computational methods begin here.
+//
+// WARNING to implementers: The function signatures listed in the online docs
+// are sometimes inaccurate, e.g., are missing 'const' on pointers
+// to immutable arguments, while the actual headers have them as expected.
+// Check the actual declarations in the cublas_api.h header file.
+//=============================================================================
+template <typename Scalar, typename SolverFnT>
+static inline Status GetrfBatchedImpl(
+    SolverFnT solver, OpKernelContext* context, cublasHandle_t cublas_handle,
+    int n, const Scalar* host_a_dev_ptrs[], int lda, int* dev_pivots,
+    DeviceLapackInfo* dev_lapack_info, int batch_size) {
+  using CudaScalar = typename CUDAComplexT<Scalar>::type;
+  ScratchSpace<uint8> dev_a_dev_ptrs(context, sizeof(CudaScalar*) * batch_size,
+                                     /* on_host */ false);
+  if (!CopyHostToDevice(
+          context, (void*)dev_a_dev_ptrs.mutable_data() /* dest */,
+          (const void*)host_a_dev_ptrs /* source */, dev_a_dev_ptrs.bytes())) {
+    return errors::Internal("GetrfBatched: failed to copy pointers to device");
+  }
+  TF_RETURN_IF_CUBLAS_ERROR(
+      solver(cublas_handle, n, (CudaScalar**)dev_a_dev_ptrs.mutable_data(), lda,
+             dev_pivots, dev_lapack_info->mutable_data(), batch_size));
+  return Status::OK();
+}
+
+#define GETRF_BATCHED_INSTANCE(Scalar, lapack_prefix)                          \
+  template <>                                                                  \
+  Status CudaSolver::GetrfBatched(                                             \
+      int n, const Scalar* host_a_dev_ptrs[], int lda, int* dev_pivots,        \
+      DeviceLapackInfo* dev_lapack_info, int batch_size) const {               \
+    return GetrfBatchedImpl(BLAS_SOLVER_FN(getrfBatched, lapack_prefix),       \
+                            context_, cublas_handle_, n, host_a_dev_ptrs, lda, \
+                            dev_pivots, dev_lapack_info, batch_size);          \
+  }
+
+TF_CALL_LAPACK_TYPES(GETRF_BATCHED_INSTANCE);
+
+template <typename Scalar, typename SolverFnT>
+static inline Status GetriBatchedImpl(
+    SolverFnT solver, OpKernelContext* context, cublasHandle_t cublas_handle,
+    int n, const Scalar* host_a_dev_ptrs[], int lda, const int* dev_pivots,
+    const Scalar* host_a_inv_dev_ptrs[], int ldainv,
+    DeviceLapackInfo* dev_lapack_info, int batch_size) {
+  using CudaScalar = typename CUDAComplexT<Scalar>::type;
+  ScratchSpace<uint8> dev_a_dev_ptrs(context, sizeof(CudaScalar*) * batch_size,
+                                     /* on_host */ false);
+  ScratchSpace<uint8> dev_a_inv_dev_ptrs(
+      context, sizeof(CudaScalar*) * batch_size, /* on_host */ false);
+  if (!CopyHostToDevice(
+          context, (void*)dev_a_dev_ptrs.mutable_data() /* dest */,
+          (const void*)host_a_dev_ptrs /* source */, dev_a_dev_ptrs.bytes()) ||
+      !CopyHostToDevice(context, (void*)dev_a_inv_dev_ptrs.mutable_data(),
+                        (const void*)host_a_inv_dev_ptrs,
+                        dev_a_inv_dev_ptrs.bytes())) {
+    return errors::Internal("GetriBatched: failed to copy pointers to device");
+  }
+  TF_RETURN_IF_CUBLAS_ERROR(
+      solver(cublas_handle, n, (const CudaScalar**)dev_a_dev_ptrs.data(), lda,
+             dev_pivots, (CudaScalar**)dev_a_inv_dev_ptrs.mutable_data(),
+             ldainv, dev_lapack_info->mutable_data(), batch_size));
+  return Status::OK();
+}
+
+#define GETRI_BATCHED_INSTANCE(Scalar, lapack_prefix)                          \
+  template <>                                                                  \
+  Status CudaSolver::GetriBatched(                                             \
+      int n, const Scalar* host_a_dev_ptrs[], int lda, const int* dev_pivots,  \
+      const Scalar* host_a_inv_dev_ptrs[], int ldainv,                         \
+      DeviceLapackInfo* dev_lapack_info, int batch_size) const {               \
+    return GetriBatchedImpl(BLAS_SOLVER_FN(getriBatched, lapack_prefix),       \
+                            context_, cublas_handle_, n, host_a_dev_ptrs, lda, \
+                            dev_pivots, host_a_inv_dev_ptrs, ldainv,           \
+                            dev_lapack_info, batch_size);                      \
+  }
+
+TF_CALL_LAPACK_TYPES(GETRI_BATCHED_INSTANCE);
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d1c807e66eaaf1adb5cb4272b875d7e44effdb8
--- /dev/null
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -0,0 +1,320 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================
+*/
+
+// This header declares the class CudaSolver, which contains wrappers of linear
+// algebra solvers in the cuBlas and cuSolverDN libraries for use in TensorFlow
+// kernels.
+
+#ifdef GOOGLE_CUDA
+
+#include <functional>
+#include <vector>
+
+#include "cuda/include/cublas_v2.h"
+#include "cuda/include/cusolverDn.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/stream_executor.h"
+
+namespace tensorflow {
+
+// Container of LAPACK info data (an array of int) generated on-device by
+// a CudaSolver call. One or more such objects can be passed to
+// CudaSolver::CopyLapackInfoToHostAsync() along with a callback to
+// check the LAPACK info data after the corresponding kernels
+// finish and LAPACK info has been copied from the device to the host.
+class DeviceLapackInfo;
+
+// Host-side copy of LAPACK info.
+class HostLapackInfo;
+
+// The CudaSolver class provides a simplified templated API for the dense linear
+// solvers implemented in cuSolverDN (http://docs.nvidia.com/cuda/cusolver) and
+// cuBlas (http://docs.nvidia.com/cuda/cublas/#blas-like-extension/).
+// An object of this class wraps static cuSolver and cuBlas instances,
+// and will launch Cuda kernels on the stream wrapped by the GPU device
+// in the OpKernelContext provided to the constructor.
+//
+// Notice: All the computational member functions are asynchronous and simply
+// launch one or more Cuda kernels on the Cuda stream wrapped by the CudaSolver
+// object. To check the final status of the kernels run, call
+// CopyLapackInfoToHostAsync() on the CudaSolver object to set a callback that
+// will be invoked with the status of the kernels launched thus far as
+// arguments.
+//
+// Example of an asynchronous TensorFlow kernel using CudaSolver:
+//
+// template <typename Scalar>
+// class SymmetricPositiveDefiniteSolveOpGpu : public AsyncOpKernel {
+//  public:
+//   explicit SymmetricPositiveDefiniteSolveOpGpu(OpKernelConstruction* context)
+//       : AsyncOpKernel(context) { }
+//   void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+//     // 1. Set up input and output device ptrs. See, e.g.,
+//     // matrix_inverse_op.cc for a full example.
+//     ...
+//
+//     // 2. Initialize the solver object.
+//     CudaSolver solver(context);
+//
+//     // 3. Launch the two compute kernels back to back on the stream without
+//     // synchronizing.
+//     std::vector<DeviceLapackInfo> dev_info;
+//     const int batch_size = 1;
+//     dev_info.emplace_back(context, batch_size, "potrf");
+//     // Compute the Cholesky decomposition of the input matrix.
+//     OP_REQUIRES_OK_ASYNC(context,
+//                          solver.Potrf(uplo, n, dev_matrix_ptrs, n,
+//                                       dev_info.back().mutable_data()),
+//                          done);
+//     dev_info.emplace_back(context, batch_size, "potrs");
+//     // Use the Cholesky decomposition of the input matrix to solve A X = RHS.
+//     OP_REQUIRES_OK_ASYNC(context,
+//                          solver.Potrs(uplo, n, nrhs, dev_matrix_ptrs, n,
+//                                       dev_output_ptrs, ldrhs,
+//                                       dev_info.back().mutable_data()),
+//                          done);
+//
+//     // 4. Check the status after the computation finishes and call done.
+//     // Capture dev_info so the underlying buffers don't get deallocated
+//     // before the kernels run.
+//     auto check_status = [context, done, dev_info](const Status& status,
+//       const std::vector<HostLapackInfo>& /* unused */) {
+//           // In this example we don't care about the exact cause of
+//           // death, so just check status.
+//           OP_REQUIRES_OK_ASYNC(context, status, done);
+//           done();
+//     };
+//     OP_REQUIRES_OK_ASYNC(context,
+//                          solver.CopyLapackInfoToHostAsync(
+//                            dev_info, std::move(check_status));
+//                          done);
+//   }
+// };
+
+class CudaSolver {
+ public:
+  // This object stores a pointer to context, which must outlive it.
+  explicit CudaSolver(OpKernelContext* context);
+  virtual ~CudaSolver() {}
+
+  // Launches a memcpy of solver status data specified by dev_lapack_info from
+  // device to the host, and asynchronously invokes the given callback when the
+  // copy is complete. The first Status argument to the callback will be
+  // Status::OK if all lapack infos retrieved are zero, otherwise an error status
+  // is given. The second argument contains a host-side copy of the entire set
+  // of infos retrieved, and can be used for generating detailed error messages.
+  Status CopyLapackInfoToHostAsync(
+      const std::vector<DeviceLapackInfo>& dev_lapack_info,
+      std::function<void(const Status&, const std::vector<HostLapackInfo>&)>
+          info_checker_callback) const;
+
+  // ====================================================================
+  // Wrappers for cuSolverDN and cuBlas solvers start here.
+  //
+  // Apart from capitalization of the first letter, the method names below map
+  // to those in cuSolverDN and cuBlas, which follow the naming convention in
+  // LAPACK see, e.g., http://docs.nvidia.com/cuda/cusolver/#naming-convention
+
+  // Computes the Cholesky factorization A = L * L^T for a single matrix.
+  // Returns Status::OK(), if the kernel was launched successfully. See:
+  // http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-potrf
+  template <typename Scalar>
+  Status Potrf(cublasFillMode_t uplo, int n, Scalar* dev_A, int lda,
+               int* dev_lapack_info) const;
+
+  // Computes partially pivoted LU factorizations for a batch of matrices.
+  // Returns Status::OK() if the kernel was launched successfully.See:
+  // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched
+  template <typename Scalar>
+  Status GetrfBatched(int n, const Scalar* host_a_dev_ptrs[], int lda,
+                      int* dev_pivots, DeviceLapackInfo* dev_lapack_info,
+                      int batch_size) const;
+
+  // Computes matrix inverses for a batch of matrices. Uses the outputs from
+  // GetrfBatched. Returns Status::OK() if the kernel was launched successfully.
+  // See:
+  // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getribatched
+  template <typename Scalar>
+  Status GetriBatched(int n, const Scalar* host_a_dev_ptrs[], int lda,
+                      const int* dev_pivots,
+                      const Scalar* host_a_inverse_dev_ptrs[], int ldainv,
+                      DeviceLapackInfo* dev_lapack_info, int batch_size) const;
+
+  /*
+  TODO(rmlarsen, volunteers): Implement the kernels below.
+  // Uses Cholesky factorization to solve A * X = B.
+  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-potrs
+  template <typename Scalar>
+  Status Potrs(cublasFillMode_t uplo, int n, int nrhs, const Scalar* dev_A, int
+  lda, Scalar* dev_B, int ldb, int* dev_lapack_info) const;
+
+  // LU factorization.
+  // Computes LU factorization with partial pivoting P * A = L * U.
+  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-getrf
+  template <typename Scalar>
+  Status Getrf(int m, int n, Scalar* dev_A, int lda, int* dev_pivots,
+             int* dev_lapack_info) const;
+
+  // Uses LU factorization to solve A * X = B.
+  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-getrs
+  template <typename Scalar>
+  Status Getrs(int n, int nrhs, const Scalar* dev_A, int lda, const int*
+  dev_pivots, Scalar* dev_B, int ldb, int* dev_lapack_info) const;
+
+  // QR factorization.
+  // Computes QR factorization A = Q * R.
+  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-geqrf
+  template <typename Scalar>
+  Status Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_TAU, int*
+  devInfo) const;
+
+  // Multiplies by Q.
+  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-ormqr
+  template <typename Scalar>
+  Status Ormqr(cublasSideMode_t side, cublasOperation_t trans, int m, int n, int
+  k, const Scalar* dev_a, int lda, const Scalar* dev_tau, Scalar* dev_c, int
+  ldc, int* dev_lapack_info) const;
+
+  // Generate Q.
+  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-orgqr
+  template <typename Scalar>
+  Status Orgqr(int m, int n, int k, Scalar* dev_A, int lda, const Scalar*
+  dev_tau, int* dev_lapack_info) const;
+
+  // Symmetric/Hermitian Eigen decomposition.
+  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-syevd
+  template <typename Scalar>
+  Status Syevd(cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, Scalar*
+  dev_A, int lda, Scalar* dev_W, int* dev_lapack_info) const;
+
+  // Singular value decomposition.
+  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-gesvd
+  template <typename Scalar>
+  Status Gesvd(signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A,
+             int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT,
+             int ldvt, int* dev_lapack_info);
+
+  // Batched linear solver using LU factorization from getrfBatched.
+  // See:
+  http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrsbatched
+  template <typename Scalar>
+  Status GetrsBatched(cublasOperation_t trans, int n, int nrhs,
+                    const Scalar* dev_Aarray[], int lda, const int* devIpiv,
+                    Scalar* dev_Barray[], int ldb, int* info, int batch_size)
+  const;
+  */
+
+ private:
+  OpKernelContext* context_;  // not owned.
+  cudaStream_t cuda_stream_;
+  cusolverDnHandle_t cusolver_dn_handle_;
+  cublasHandle_t cublas_handle_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CudaSolver);
+};
+
+// Helper class to allocate scratch memory and keep track of debug info.
+// Mostly a thin wrapper around Tensor.
+template <typename Scalar>
+class ScratchSpace {
+ public:
+  ScratchSpace(OpKernelContext* context, int size, bool on_host)
+      : ScratchSpace(context, size, "", on_host) {}
+
+  ScratchSpace(OpKernelContext* context, int size, const string& debug_info,
+               bool on_host)
+      : context_(context), debug_info_(debug_info), on_host_(on_host) {
+    AllocatorAttributes alloc_attr;
+    if (on_host) {
+      // Allocate pinned memory on the host to avoid unnecessary
+      // synchronization.
+      alloc_attr.set_on_host(true);
+      alloc_attr.set_gpu_compatible(true);
+    }
+    TF_CHECK_OK(context->allocate_temp(DataTypeToEnum<Scalar>::value,
+                                       TensorShape({size}), &scratch_tensor_,
+                                       alloc_attr));
+  }
+
+  virtual ~ScratchSpace() {}
+
+  Scalar* mutable_data() {
+    return scratch_tensor_.template flat<Scalar>().data();
+  }
+  const Scalar* data() const {
+    return scratch_tensor_.template flat<Scalar>().data();
+  }
+  int64 bytes() const { return scratch_tensor_.TotalBytes(); }
+  int64 size() const { return scratch_tensor_.NumElements(); }
+  const string& debug_info() const { return debug_info_; }
+
+  // Returns true if this ScratchSpace is in host memory.
+  bool on_host() const { return on_host_; }
+
+ protected:
+  OpKernelContext* context() const { return context_; }
+
+ private:
+  OpKernelContext* context_;  // not owned
+  const string debug_info_;
+  const bool on_host_;
+  Tensor scratch_tensor_;
+};
+
+class HostLapackInfo : public ScratchSpace<int> {
+ public:
+  HostLapackInfo(OpKernelContext* context, int size, const string& debug_info)
+      : ScratchSpace<int>(context, size, debug_info, /* on_host */ true){};
+};
+
+class DeviceLapackInfo : public ScratchSpace<int> {
+ public:
+  DeviceLapackInfo(OpKernelContext* context, int size, const string& debug_info)
+      : ScratchSpace<int>(context, size, debug_info, /* on_host */ false) {}
+
+  // Allocates a new scratch space on the host and launches a copy of the
+  // contents of *this to the new scratch space. Sets success to true if
+  // the copy kernel was launched successfully.
+  HostLapackInfo CopyToHost(bool* success) const {
+    CHECK(success != nullptr);
+    HostLapackInfo copy(context(), size(), debug_info());
+    auto stream = context()->op_device_context()->stream();
+    perftools::gputools::DeviceMemoryBase wrapped_src(
+        static_cast<void*>(const_cast<int*>(this->data())));
+    *success =
+        stream->ThenMemcpy(copy.mutable_data(), wrapped_src, this->bytes())
+            .ok();
+    return copy;
+  }
+};
+
+namespace functor {
+// Helper functor to transpose and conjugate all matrices in a flattened batch.
+template <typename Device, typename Scalar>
+struct AdjointBatchFunctor {
+  // We assume that the tensor sizes are correct.
+  void operator()(const Device& d,
+                  typename TTypes<Scalar, 3>::ConstTensor input,
+                  typename TTypes<Scalar, 3>::Tensor output);
+};
+}  // namespace functor
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cuda_solvers_gpu.cu.cc b/tensorflow/core/kernels/cuda_solvers_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d32f506557792926797b18e49db869ff94cdd623
--- /dev/null
+++ b/tensorflow/core/kernels/cuda_solvers_gpu.cu.cc
@@ -0,0 +1,52 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/cuda_solvers.h"
+
+#include <complex>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+// TODO(rmlarsen): Add a faster custom kernel similar to
+// SwapDimension1And2InTensor3 in tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+template <typename Scalar>
+struct AdjointBatchFunctor<GPUDevice, Scalar> {
+  void operator()(const GPUDevice& d,
+                  typename TTypes<Scalar, 3>::ConstTensor input,
+                  typename TTypes<Scalar, 3>::Tensor output) {
+    const Eigen::array<int, 3> perm({0, 2, 1});
+    To32Bit(output).device(d) = To32Bit(input).shuffle(perm).conjugate();
+  }
+};
+
+// Instantiate implementations for the 4 numeric types.
+template struct AdjointBatchFunctor<GPUDevice, float>;
+template struct AdjointBatchFunctor<GPUDevice, double>;
+template struct AdjointBatchFunctor<GPUDevice, std::complex<float>>;
+template struct AdjointBatchFunctor<GPUDevice, std::complex<double>>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.cc b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
index 93efc93957b6c4929cdcc5ec005dc01f9b9c9cfc..5939ecdf62bc32d90d53e80c1cc0fd57bca7b931 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.cc
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.cc
@@ -243,11 +243,10 @@ void DnnPooling3dGradOp<T>::Compute(
   }
 }
 
-#define DEFINE_DNN_OPS(T)                       \
-  template class DnnPooling3dOp<T>;               \
+#define DEFINE_DNN_OPS(T)           \
+  template class DnnPooling3dOp<T>; \
   template class DnnPooling3dGradOp<T>;
-TF_CALL_float(DEFINE_DNN_OPS)
-TF_CALL_half(DEFINE_DNN_OPS)
+TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
 #undef DEFINE_DNN_OPS
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/tensorboard/gulp_tasks/bower.js b/tensorflow/core/kernels/cwise_op_atan2.cc
similarity index 72%
rename from tensorflow/tensorboard/gulp_tasks/bower.js
rename to tensorflow/core/kernels/cwise_op_atan2.cc
index 7c0e515c6c956443bea0a857e7eb2f6ad7492fd3..68f67c444ef1b6ed905c8107838b2c50f542256e 100644
--- a/tensorflow/tensorboard/gulp_tasks/bower.js
+++ b/tensorflow/core/kernels/cwise_op_atan2.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-var gulp = require('gulp');
-var bower = require('gulp-bower');
+#include "tensorflow/core/kernels/cwise_ops_common.h"
 
-module.exports = function() {
-  return function() {
-    return bower();
-  }
-}
+namespace tensorflow {
+REGISTER2(BinaryOp, CPU, "Atan2", functor::atan2, float, double);
+#if GOOGLE_CUDA
+REGISTER2(BinaryOp, GPU, "Atan2", functor::atan2, float, double);
+#endif
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_equal_to_2.cc
index e7f4d3c07e295496cc52f7786e788ae0a0d53e06..57e19c7202d38bdbb363fa32980c0fd542046ee2 100644
--- a/tensorflow/core/kernels/cwise_op_equal_to_2.cc
+++ b/tensorflow/core/kernels/cwise_op_equal_to_2.cc
@@ -25,7 +25,8 @@ namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "Equal", functor::equal_to, int32, int64, complex64,
           complex128, string, bool);
 #if GOOGLE_CUDA
-REGISTER4(BinaryOp, GPU, "Equal", functor::equal_to, int8, int16, int64, bool);
+REGISTER6(BinaryOp, GPU, "Equal", functor::equal_to, int8, int16, int64,
+          complex64, complex128, bool);
 #endif  // GOOGLE_CUDA
 
 #endif  // !defined(__ANDROID_TYPES_SLIM__)
diff --git a/tensorflow/tensorboard/gulp_tasks/test.js b/tensorflow/core/kernels/cwise_op_gpu_atan2.cu.cc
similarity index 62%
rename from tensorflow/tensorboard/gulp_tasks/test.js
rename to tensorflow/core/kernels/cwise_op_gpu_atan2.cu.cc
index ffa8122c7b56d32719381bdac4798d26d908a92c..137e14ef840e0d3731d69513e87fdb48b13e53fb 100644
--- a/tensorflow/tensorboard/gulp_tasks/test.js
+++ b/tensorflow/core/kernels/cwise_op_gpu_atan2.cu.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,16 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-var gulp = require('gulp');
-var tester = require('web-component-tester').test;
+#if GOOGLE_CUDA
 
-module.exports = function(done) {
-  tester({}, function(error) {
-    if (error) {
-      // Pretty error for gulp.
-      error = new Error(error.message || error);
-      error.showStack = false;
-    }
-    done(error);
-  });
-}
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY2(atan2, float, double);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc
index 2f234f2bab22b5ca53190dc3fbcc807ea574fa0d..d74cab6edf5c868174fa7c6d9eb40bfe567e1e94 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_UNARY5(neg, Eigen::half, float, double, int32, int64);
+DEFINE_UNARY7(neg, Eigen::half, float, double, int32, int64, complex64,
+              complex128);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index c4a9b2288357c91d113c1b0a3a155333467b2bfb..eb7e3764d9d51401b6544c81ecf146b5bd8e11d1 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -43,7 +43,8 @@ REGISTER_KERNEL_BUILDER(Name("Neg")
 #endif // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER4(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64);
+REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64,
+          complex64, complex128);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
index 59424d7a28dde89dfff09c1c282e8397bf7b7bcc..7d4ecec59f1564c90c11bb05d6e96c7e1b52a60d 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
@@ -25,8 +25,8 @@ namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, int32, int64,
           complex64, complex128, string, bool);
 #if GOOGLE_CUDA
-REGISTER4(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64,
-          bool);
+REGISTER6(BinaryOp, GPU, "NotEqual", functor::not_equal_to, int8, int16, int64,
+          complex64, complex128, bool);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index fb4c649a82bc6b2163a50f24c31d8fa32d1cac02..423307fd4cd647b0f7b00c95671aee72988903b5 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -658,6 +658,22 @@ struct zeta : base<T, Eigen::internal::scalar_zeta_op<T>> {};
 template <typename T>
 struct polygamma : base<T, Eigen::internal::scalar_polygamma_op<T>> {};
 
+template <typename Scalar>
+struct scalar_atan2_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_atan2_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+  operator()(const Scalar& y, const Scalar& x) const {
+#if GOOGLE_CUDA
+    return ::atan2(y, x);
+#else
+    return std::atan2(y, x);
+#endif
+  }
+};
+
+template <typename T>
+struct atan2 : base<T, scalar_atan2_op<T>> {};
+
 template <typename T>
 struct squared_difference
     : base<T, Eigen::internal::scalar_compose_op<
diff --git a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
index b8e23e2c3576cc0b8265e4d6da3a9682487476a6..6dd108f7226ab5a64b8c074afa9ab219f045158a 100644
--- a/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
+++ b/tensorflow/core/kernels/cwise_ops_gpu_common.cu.h
@@ -133,6 +133,9 @@ struct ApproximateEqual<GPUDevice, T> {
 #define DEFINE_UNARY6(F, T0, T1, T2, T3, T4, T5) \
   DEFINE_UNARY2(F, T0, T1);                      \
   DEFINE_UNARY4(F, T2, T3, T4, T5)
+#define DEFINE_UNARY7(F, T0, T1, T2, T3, T4, T5, T6) \
+  DEFINE_UNARY2(F, T0, T1);                          \
+  DEFINE_UNARY5(F, T2, T3, T4, T5, T6)
 
 // Macros to explicitly instantiate kernels on GPU for multiple types
 // (T0, T1, etc.) for BinaryFunctor.
diff --git a/tensorflow/core/kernels/dataset.h b/tensorflow/core/kernels/dataset.h
new file mode 100644
index 0000000000000000000000000000000000000000..83ffabe224ed73560cd6ee037aa67c337f1c370a
--- /dev/null
+++ b/tensorflow/core/kernels/dataset.h
@@ -0,0 +1,167 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
+
+#include <memory>
+
+#include "tensorflow/core/framework/resource_mgr.h"
+
+namespace tensorflow {
+
+class ResourceMgr;
+
+// A cut-down version of OpKernelContext for running computations in
+// iterators. Note that we cannot simply use OpKernelContext here
+// because we might run computation in an iterator whose lifetime is
+// not nested within the lifetime of a single OpKernelContext
+// (e.g. asynchronous prefetching).
+//
+// TODO(mrry): We will probably need to support more of
+// OpKernelContext here. For example, should allocation be handled by
+// the IteratorContext?
+// TODO(mrry): We will need to fabricate step IDs for calls to ops
+// that are not nested within a particular step.
+// TODO(mrry): We're making some daring assumptions about the lifetime
+// of the FunctionLibraryRuntime and runner passed in here. Once
+// created, a FunctionLibraryRuntime should stay alive for the
+// remainder of a session, so we copy the pointer. A runner will be
+// deleted when the original step ends, but all existing runners only
+// close over session-lifetime (or longer-lived) state, so we can make
+// a copy of the function. There's nothing in the definition of either
+// class to guarantee that what we are doing is safe. We should
+// formalize the properties here.
+class IteratorContext {
+ public:
+  struct Params {
+    // Interface to operating system functionality.
+    Env* env;
+
+    // The step being executed.
+    int64 step_id = 0;
+
+    // Shared resources accessible by this iterator invocation.
+    ResourceMgr* resource_manager = nullptr;
+
+    // Function call support.
+    std::function<void(std::function<void()>)> runner = nullptr;
+  };
+
+  explicit IteratorContext(Params params) : params_(std::move(params)) {}
+
+  Env* env() const { return params_.env; }
+
+  int64 step_id() const { return params_.step_id; }
+
+  std::function<void(std::function<void()>)>* runner() {
+    return &params_.runner;
+  }
+
+  ResourceMgr* resource_manager() const { return params_.resource_manager; }
+
+ private:
+  Params params_;
+};
+
+// Represents the current position in a range of outputs, where the
+// range of outputs is typically represented by an `DatasetBase`,
+// defined below.
+class IteratorBase {
+ public:
+  virtual ~IteratorBase() {}
+
+  // Gets the next output from the range that this iterator is traversing.
+  //
+  // If at least one output remains in this iterator's range, that
+  // output will be stored in `*out_tensors` and `false` will be
+  // stored in `*end_of_sequence`.
+  //
+  // If no more outputs remain in this iterator's range, `true` will
+  // be stored in `*end_of_sequence`, and the content of
+  // `*out_tensors` will be undefined.
+  //
+  // This method is thread-safe.
+  //
+  // TODO(mrry): Define `GetNextAsync()` or `GetNextManyAsync()`, and
+  // potentially remove this method.
+  virtual Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this
+  // iterator.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this iterator.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+};
+
+// Represents a (potentially infinite) range of outputs, where each
+// output is a tuple of tensors.
+class DatasetBase : public ResourceBase {
+ public:
+  // Returns a new iterator for iterating over the range of elements in
+  // this dataset.
+  //
+  // This method may be called multiple times on the same instance,
+  // and the resulting iterators will have distinct state. Each
+  // iterator will traverse all elements in this dataset from the
+  // start.
+  //
+  // Ownership of the created iterator will be transferred to the caller.
+  virtual std::unique_ptr<IteratorBase> MakeIterator() const = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this
+  // dataset.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this dataset.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+};
+
+// Represents an iterator that is associated with a particular parent dataset.
+template <class DatasetType>
+class DatasetIterator : public IteratorBase {
+ public:
+  explicit DatasetIterator(const DatasetType* dataset) : dataset_(dataset) {
+    dataset_->Ref();
+  }
+
+  ~DatasetIterator() override { dataset_->Unref(); }
+
+  // The dataset from which this iterator was created.
+  const DatasetType* dataset() const { return dataset_; }
+
+  const DataTypeVector& output_dtypes() const override {
+    return dataset_->output_dtypes();
+  }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return dataset_->output_shapes();
+  }
+
+ private:
+  const DatasetType* const dataset_;  // Owns one reference on the
+                                      // shared dataset resource.
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_DATASET_H_
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 5437bc5a330c3c29548b0a290deffe1d1778eeb2..353b837e04b2db2d5f305b8ca038e626fba412b3 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -35,13 +35,32 @@ class CopyOp : public OpKernel {
  public:
   explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));
+
+    std::vector<string> debug_ops_spec;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
+    for (const string& debug_op_spec : debug_ops_spec) {
+      // Assume debug_op_spec has the format
+      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
+      // DebugIdentity;grpc://localhost:3333;1
+      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
+      OP_REQUIRES(
+          context, items.size() == 3,
+          errors::Internal(
+              "Unexpected number of semicolons in debug_ops_spec element: ",
+              debug_op_spec));
+      debug_op_and_url_specs_.push_back(
+          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
+                               items[1], items[2] == "1"));
+    }
   }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& src_tensor = context->input(0);
 
     if (src_tensor.IsInitialized() &&
-        DataTypeCanUseMemcpy(src_tensor.dtype())) {
+        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
+        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
       // Source tensor is initialized and is mem-copyable. Make a copy.
       Tensor* copied_tensor;
       OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
@@ -79,48 +98,101 @@ class CopyOp : public OpKernel {
 
  private:
   string tensor_name_;
+  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
 };
 
-// Identity op for debugging.
-//   Output slot 0 carries the debug signal and is always allocated on the
-//   host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
-//   the debug signal is equal to the input tensor.
-class DebugIdentityOp : public OpKernel {
+// Base class of all debug ops.
+class BaseDebugOp : public OpKernel {
  public:
-  explicit DebugIdentityOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit BaseDebugOp(const string& debug_op_name,
+                       OpKernelConstruction* context)
+      : OpKernel(context), debug_op_name_(debug_op_name) {
     OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));
     OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
+    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));
+    watch_key_ = strings::StrCat(tensor_name_, ":", debug_op_name_);
   }
 
-  void Compute(OpKernelContext* context) override {
-    if (!debug_urls_.empty()) {
-      // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-      DebugIO::PublishDebugTensor(tensor_name_, "DebugIdentity",
-                                  context->input(0),
-                                  Env::Default()->NowMicros(), debug_urls_)
-          .IgnoreError();
-    }
+  bool IsExpensive() override { return false; }
 
-    context->set_output(0, context->input(0));
+ protected:
+  // Apply gRPC gating (if gated_grpc_ attribute is true).
+  //
+  // Returns false if and only if all grpc:// debug URLs of the debug op are
+  // disabled currently (i.e., gated off), in which case the debug op will emit
+  // an empty (size {0}) tensor of undefined data type.
+  bool ApplyGrpcGating(OpKernelContext* context) {
+    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(watch_key_, debug_urls_)) {
+      // The entire node is gated off: Output an empty tensor and avoid
+      // expensive computation.
+      Tensor* output_tensor;
+      TensorShape shape({0});
+      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
+        LOG(ERROR) << "Debug node of watch key " << watch_key_
+                   << "failed to allocate empty tensor under gated-off state.";
+      }
+      return false;
+    } else {
+      return true;
+    }
   }
 
-  bool IsExpensive() override { return false; }
+  // Publish a tensor to all debug URLs of the debug op.
+  // Log an error if the publishing failed.
+  void PublishTensor(const Tensor& tensor) {
+    if (!debug_urls_.empty()) {
+      Status status = DebugIO::PublishDebugTensor(
+          tensor_name_, debug_op_name_, tensor, Env::Default()->NowMicros(),
+          debug_urls_, gated_grpc_);
+      if (!status.ok()) {
+        LOG(ERROR) << "Debug node of watch key " << watch_key_
+                   << "failed to publish debug tensor data to all URLs "
+                   << str_util::Join(debug_urls_, ", ")
+                   << ", due to: " << status.error_message();
+      }
+    }
+  }
 
  private:
+  string debug_op_name_;
   string tensor_name_;
+  string watch_key_;
   std::vector<string> debug_urls_;
+  bool gated_grpc_;
+};
+
+// Identity op for debugging.
+//   Output slot 0 carries the debug signal and is always allocated on the
+//   host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
+//   the debug signal is equal to the input tensor.
+class DebugIdentityOp : public BaseDebugOp {
+ public:
+  explicit DebugIdentityOp(OpKernelConstruction* context)
+      : BaseDebugOp("DebugIdentity", context) {}
+
+  void Compute(OpKernelContext* context) override {
+    if (!ApplyGrpcGating(context)) {
+      return;
+    }
+
+    PublishTensor(context->input(0));
+    context->set_output(0, context->input(0));
+  }
 };
 
 // NaN-counter op for debugging.
 template <typename T>
-class DebugNanCountOp : public OpKernel {
+class DebugNanCountOp : public BaseDebugOp {
  public:
-  explicit DebugNanCountOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));
-    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
-  }
+  explicit DebugNanCountOp(OpKernelConstruction* context)
+      : BaseDebugOp("DebugNanCount", context) {}
 
   void Compute(OpKernelContext* context) override {
+    if (!ApplyGrpcGating(context)) {
+      return;
+    }
+
+    Tensor* output_tensor;
     const Tensor& input = context->input(0);
 
     // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
@@ -140,34 +212,18 @@ class DebugNanCountOp : public OpKernel {
     }
 
     TensorShape shape({1});
-
-    Tensor* output_tensor;
     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
     output_tensor->vec<int64>()(0) = nan_count;
-
-    if (!debug_urls_.empty()) {
-      // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-      DebugIO::PublishDebugTensor(tensor_name_, "DebugNanCount", *output_tensor,
-                                  Env::Default()->NowMicros(), debug_urls_)
-          .IgnoreError();
-    }
+    PublishTensor(*output_tensor);
   }
-
-  bool IsExpensive() override { return false; }
-
- private:
-  string tensor_name_;
-  std::vector<string> debug_urls_;
 };
 
 // Numeric summary op for debugging.
 template <typename T>
-class DebugNumericSummaryOp : public OpKernel {
+class DebugNumericSummaryOp : public BaseDebugOp {
  public:
   explicit DebugNumericSummaryOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));
-    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
+      : BaseDebugOp("DebugNumericSummary", context) {
     OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
     OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
     OP_REQUIRES_OK(context,
@@ -175,6 +231,11 @@ class DebugNumericSummaryOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override {
+    if (!ApplyGrpcGating(context)) {
+      return;
+    }
+
+    Tensor* output_tensor;
     const Tensor& input = context->input(0);
 
     int64 is_initialized = 0;
@@ -254,8 +315,6 @@ class DebugNumericSummaryOp : public OpKernel {
     }
 
     TensorShape shape({12});
-
-    Tensor* output_tensor;
     OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
     output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
     output_tensor->vec<double>()(1) = static_cast<double>(element_count);
@@ -272,20 +331,12 @@ class DebugNumericSummaryOp : public OpKernel {
 
     bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
                 positive_inf_count == 0;
-    if (!mute && !debug_urls_.empty()) {
-      // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-      DebugIO::PublishDebugTensor(tensor_name_, "DebugNumericSummary",
-                                  *output_tensor, Env::Default()->NowMicros(),
-                                  debug_urls_)
-          .IgnoreError();
+    if (!mute) {
+      PublishTensor(*output_tensor);
     }
   }
 
-  bool IsExpensive() override { return false; }
-
  private:
-  string tensor_name_;
-  std::vector<string> debug_urls_;
   float lower_bound_;
   float upper_bound_;
   bool mute_if_healthy_;
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc
index 917d4c529904e842361445472254da909c50268c..487f045cc8d6066107dbc2919cb18db73b27aeab 100644
--- a/tensorflow/core/kernels/debug_ops_test.cc
+++ b/tensorflow/core/kernels/debug_ops_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <fstream>
 #include <vector>
 
+#include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -32,11 +33,10 @@ limitations under the License.
 #include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
-namespace {
 
 class DebugIdentityOpTest : public OpsTestBase {
  protected:
-  Status Init(DataType input_type, const std::vector<string> debug_urls) {
+  Status Init(DataType input_type, const std::vector<string>& debug_urls) {
     env_ = Env::Default();
 
     TF_CHECK_OK(NodeDefBuilder("op", "DebugIdentity")
@@ -230,6 +230,24 @@ class DebugNumericSummaryOpTest : public OpsTestBase {
                     .Finalize(node_def()));
     return InitOp();
   }
+
+  Status InitGated(DataType input_type, const std::vector<string>& debug_urls) {
+    TF_CHECK_OK(NodeDefBuilder("op", "DebugNumericSummary")
+                    .Input(FakeInput(input_type))
+                    .Attr("tensor_name", "FakeTensor:0")
+                    .Attr("gated_grpc", true)
+                    .Attr("debug_urls", debug_urls)
+                    .Finalize(node_def()));
+    return InitOp();
+  }
+
+#if defined(PLATFORM_GOOGLE)
+  void ClearEnabledWatchKeys() { DebugGrpcIO::ClearEnabledWatchKeys(); }
+
+  void CreateEmptyEnabledSet(const string& grpc_debug_url) {
+    DebugGrpcIO::CreateEmptyEnabledSet(grpc_debug_url);
+  }
+#endif
 };
 
 TEST_F(DebugNumericSummaryOpTest, Float_full_house) {
@@ -485,6 +503,35 @@ TEST_F(DebugNumericSummaryOpTest, BoolSuccess) {
   test::ExpectTensorNear<double>(expected, *GetOutput(0), 1e-8);
 }
 
+#if defined(PLATFORM_GOOGLE)
+TEST_F(DebugNumericSummaryOpTest, DisabledDueToEmptyEnabledSet) {
+  ClearEnabledWatchKeys();
+  CreateEmptyEnabledSet("grpc://server:3333");
+
+  std::vector<string> debug_urls({"grpc://server:3333"});
+  TF_ASSERT_OK(InitGated(DT_FLOAT, debug_urls));
+  AddInputFromArray<float>(TensorShape({2, 2}), {1.0, 3.0, 3.0, 7.0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected_disabled(allocator(), DT_DOUBLE, TensorShape({0}));
+  test::ExpectTensorNear<double>(expected_disabled, *GetOutput(0), 1e-8);
+}
+
+TEST_F(DebugNumericSummaryOpTest, DisabledDueToNonMatchingWatchKey) {
+  ClearEnabledWatchKeys();
+  DebugGrpcIO::EnableWatchKey("grpc://server:3333",
+                              "FakeTensor:1:DebugNumeriSummary");
+
+  std::vector<string> debug_urls({"grpc://server:3333"});
+  TF_ASSERT_OK(InitGated(DT_FLOAT, debug_urls));
+  AddInputFromArray<float>(TensorShape({2, 2}), {1.0, 3.0, 3.0, 7.0});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected_disabled(allocator(), DT_DOUBLE, TensorShape({0}));
+  test::ExpectTensorNear<double>(expected_disabled, *GetOutput(0), 1e-8);
+}
+#endif
+
 // Tests for DebugNumericSummaryOp
 class DebugNumericSummaryOpCustomLowerBoundTest : public OpsTestBase {
  protected:
@@ -572,5 +619,4 @@ TEST_F(DebugNumericSummaryOpCustomLowerUpperBoundsTest, Int32Success) {
   test::ExpectTensorNear<double>(expected, *GetOutput(0), 1e-8);
 }
 
-}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..086369a9f127143a6dfd71e10b1abffd54c8a191
--- /dev/null
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -0,0 +1,143 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+// Decode the contents of a BMP file
+class DecodeBmpOp : public OpKernel {
+ public:
+  explicit DecodeBmpOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
+    OP_REQUIRES(
+        context, channels_ == 0 || channels_ == 3 || channels_ == 4,
+        errors::InvalidArgument("channels must be 0, 3 or 4, got ", channels_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& contents = context->input(0);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+                errors::InvalidArgument("contents must be scalar, got shape ",
+                                        contents.shape().DebugString()));
+
+    // Start decoding image to get shape details
+    const StringPiece input = contents.scalar<string>()();
+
+    const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
+    const int32 header_size = internal::SubtleMustCopy(
+        *(reinterpret_cast<const int32*>(img_bytes + 10)));
+    const int32 width = internal::SubtleMustCopy(
+        *(reinterpret_cast<const int32*>(img_bytes + 18)));
+    const int32 height = internal::SubtleMustCopy(
+        *(reinterpret_cast<const int32*>(img_bytes + 22)));
+    const int32 bpp = internal::SubtleMustCopy(
+        *(reinterpret_cast<const int32*>(img_bytes + 28)));
+
+    if (channels_) {
+      OP_REQUIRES(context, (channels_ == bpp / 8),
+                  errors::InvalidArgument(
+                      "channels attribute ", channels_,
+                      " does not match bits per pixel from file ", bpp / 8));
+    } else {
+      channels_ = bpp / 8;
+    }
+
+    // Current implementation only supports 3 or 4 channel
+    // bitmaps.
+    OP_REQUIRES(context, (channels_ == 3 || channels_ == 4),
+                errors::InvalidArgument(
+                    "Number of channels must be 3 or 4, was ", channels_));
+
+    // if height is negative, data layout is top down
+    // otherwise, it's bottom up
+    bool top_down = (height < 0);
+
+    // Decode image, allocating tensor once the image size is known
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(
+                     0, TensorShape({abs(height), width, channels_}), &output));
+
+    const uint8* bmp_pixels = &img_bytes[header_size];
+
+    Decode(bmp_pixels, output->flat<uint8>().data(), width, abs(height),
+           channels_, top_down);
+  }
+
+  uint8* Decode(const uint8* input, uint8* const output, const int width,
+                const int height, const int channles, bool top_down);
+
+ private:
+  int channels_;
+};
+REGISTER_KERNEL_BUILDER(Name("DecodeBmp").Device(DEVICE_CPU), DecodeBmpOp);
+
+uint8* DecodeBmpOp::Decode(const uint8* input, uint8* const output,
+                           const int width, const int height,
+                           const int channels, bool top_down) {
+  // there may be padding bytes when the width is not a multiple of 4 bytes
+  // 8 * channels == bits per pixel
+  int row_size = (8 * channels * width + 31) / 32 * 4;
+
+  for (int i = 0; i < height; i++) {
+    int src_pos;
+    int dst_pos;
+
+    for (int j = 0; j < width; j++) {
+      if (!top_down) {
+        src_pos = ((height - 1 - i) * row_size) + j * channels;
+      } else {
+        src_pos = i * row_size + j * channels;
+      }
+
+      dst_pos = (i * width + j) * channels;
+
+      switch (channels) {
+        case 3:
+          // BGR -> RGB
+          output[dst_pos] = input[src_pos + 2];
+          output[dst_pos + 1] = input[src_pos + 1];
+          output[dst_pos + 2] = input[src_pos];
+          break;
+        case 4:
+          // BGRA -> RGBA
+          output[dst_pos] = input[src_pos + 2];
+          output[dst_pos + 1] = input[src_pos + 1];
+          output[dst_pos + 2] = input[src_pos];
+          output[dst_pos + 3] = input[src_pos + 3];
+          break;
+        default:
+          LOG(FATAL) << "Unexpected number of channels: " << channels;
+          break;
+      }
+    }
+  }
+
+  return output;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_gif_op.cc b/tensorflow/core/kernels/decode_gif_op.cc
deleted file mode 100644
index 2bc17f8a309972f55206aa55fffd215678401fad..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/decode_gif_op.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/image_ops.cc
-
-#include <memory>
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gif/gif_io.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-// Decode the contents of a GIF file
-class DecodeGifOp : public OpKernel {
- public:
-  explicit DecodeGifOp(OpKernelConstruction* context) : OpKernel(context) {}
-  void Compute(OpKernelContext* context) override {
-    const Tensor& contents = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
-                errors::InvalidArgument("contents must be scalar, got shape ",
-                                        contents.shape().DebugString()));
-
-    // Start decoding image to get shape details
-    const StringPiece input = contents.scalar<string>()();
-
-    // Decode image, allocating tensor once the image size is known
-    Tensor* output = nullptr;
-    OP_REQUIRES(
-        context,
-        gif::Decode(input.data(), input.size(),
-                    [=, &output](int num_frames, int width, int height,
-                                 int channels) -> uint8* {
-                      Status status(context->allocate_output(
-                          0, TensorShape({num_frames, height, width, channels}),
-                          &output));
-                      if (!status.ok()) {
-                        VLOG(1) << status;
-                        context->SetStatus(status);
-                        return nullptr;
-                      }
-                      return output->flat<uint8>().data();
-                    }),
-        errors::InvalidArgument("Invalid GIF data, size ", input.size()));
-  }
-};
-REGISTER_KERNEL_BUILDER(Name("DecodeGif").Device(DEVICE_CPU), DecodeGifOp);
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..76f8c225432dd7ddb36933722f3cf0c9404c48ad
--- /dev/null
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -0,0 +1,315 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/image_ops.cc
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gif/gif_io.h"
+#include "tensorflow/core/lib/jpeg/jpeg_mem.h"
+#include "tensorflow/core/lib/png/png_io.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace {
+
+enum FileFormat {
+  kUnknownFormat = 0,
+  kPngFormat = 1,
+  kJpgFormat = 2,
+  kGifFormat = 3,
+};
+
+// Classify the contents of a file based on starting bytes (the magic number).
+FileFormat ClassifyFileFormat(StringPiece data) {
+  // The 4th byte of JPEG is '\xe0' or '\xe1', so check just the first three
+  if (data.starts_with("\xff\xd8\xff")) return kJpgFormat;
+  if (data.starts_with("\x89PNG\r\n\x1a\n")) return kPngFormat;
+  if (data.starts_with("\x47\x49\x46\x38")) return kGifFormat;
+  return kUnknownFormat;
+}
+
+string FileFormatString(FileFormat magic, StringPiece data) {
+  switch (magic) {
+    case kPngFormat:
+      return "PNG";
+    case kJpgFormat:
+      return "JPEG";
+    case kGifFormat:
+      return "GIF";
+    default: {
+      if (data.empty()) return "empty file";
+      return strings::StrCat("unknown format starting with '",
+                             str_util::CEscape(data.substr(0, 16)), "'");
+    }
+  }
+}
+
+// Decode an image (either jpeg, png, or gif).  We use a single op so that
+// users don't have to care about which format they have.
+class DecodeImageOp : public OpKernel {
+ public:
+  explicit DecodeImageOp(OpKernelConstruction* context) : OpKernel(context) {
+    // Determine which op we are: jpeg, png, gif, or any
+    if (type_string() == "DecodeJpeg") {
+      format_ = kJpgFormat;
+    } else if (type_string() == "DecodePng") {
+      format_ = kPngFormat;
+    } else if (type_string() == "DecodeGif") {
+      format_ = kGifFormat;
+    } else {
+      OP_REQUIRES_OK(context,
+                     errors::InvalidArgument("Bad op type ", type_string()));
+    }
+
+    if (format_ == kGifFormat) {
+      channels_ = 3;
+    } else {
+      OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
+      OP_REQUIRES(
+          context,
+          channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
+          errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
+                                  channels_));
+    }
+    flags_.components = channels_;
+
+    // In the case of png, we support uint16 output
+    if (format_ == kPngFormat) {
+      DataType dt;
+      OP_REQUIRES_OK(context, context->GetAttr("dtype", &dt));
+      OP_REQUIRES(
+          context, dt == DataType::DT_UINT8 || dt == DataType::DT_UINT16,
+          errors::InvalidArgument("Type must be uint8 or uint16, got ", dt));
+      if (dt == DataType::DT_UINT8) {
+        channel_bits_ = 8;
+      } else {
+        channel_bits_ = 16;
+      }
+    }
+
+    // The TensorFlow-chosen default for jpeg decoding is IFAST, sacrificing
+    // image quality for speed.
+    flags_.dct_method = JDCT_IFAST;
+
+    if (format_ == kJpgFormat) {
+      OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
+      OP_REQUIRES(context,
+                  flags_.ratio == 1 || flags_.ratio == 2 || flags_.ratio == 4 ||
+                      flags_.ratio == 8,
+                  errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
+                                          flags_.ratio));
+      OP_REQUIRES_OK(context, context->GetAttr("fancy_upscaling",
+                                               &flags_.fancy_upscaling));
+      OP_REQUIRES_OK(context,
+                     context->GetAttr("try_recover_truncated",
+                                      &flags_.try_recover_truncated_jpeg));
+      OP_REQUIRES_OK(context,
+                     context->GetAttr("acceptable_fraction",
+                                      &flags_.min_acceptable_fraction));
+
+      string dct_method;
+      OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
+      OP_REQUIRES(
+          context,
+          (dct_method.empty() || dct_method == "INTEGER_FAST" ||
+           dct_method == "INTEGER_ACCURATE"),
+          errors::InvalidArgument("dct_method must be one of "
+                                  "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
+      if (dct_method == "INTEGER_FAST") {
+        flags_.dct_method = JDCT_IFAST;
+      } else if (dct_method == "INTEGER_ACCURATE") {
+        flags_.dct_method = JDCT_ISLOW;
+      }
+    }
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& contents = context->input(0);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
+                errors::InvalidArgument("contents must be scalar, got shape ",
+                                        contents.shape().DebugString()));
+
+    // Determine format
+    const StringPiece input = contents.scalar<string>()();
+    const auto magic = ClassifyFileFormat(input);
+    OP_REQUIRES(
+        context,
+        magic == kJpgFormat || magic == kPngFormat || magic == kGifFormat,
+        errors::InvalidArgument("Expected image (JPEG, PNG, or GIF), got ",
+                                FileFormatString(magic, input)));
+    OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
+                errors::InvalidArgument(
+                    FileFormatString(magic, input),
+                    " contents are too large for int: ", input.size()));
+    OP_REQUIRES(context, magic == kPngFormat || channel_bits_ == 8,
+                errors::InvalidArgument(FileFormatString(magic, input),
+                                        " does not support uint16 output"));
+
+    switch (magic) {
+      case kJpgFormat:
+        DecodeJpeg(context, input);
+        break;
+      case kPngFormat:
+        DecodePng(context, input);
+        break;
+      case kGifFormat:
+        DecodeGif(context, input);
+        break;
+      default:
+        LOG(FATAL) << "Should never get here after check above";
+        break;
+    }
+  }
+
+  void DecodeJpeg(OpKernelContext* context, StringPiece input) {
+    OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3,
+                errors::InvalidArgument(
+                    "channels must be 0, 1, or 3 for JPEG, got ", channels_));
+
+    // Decode jpeg, allocating tensor once the size is known
+    Tensor* output = nullptr;
+    OP_REQUIRES(
+        context,
+        jpeg::Uncompress(
+            input.data(), input.size(), flags_, nullptr /* nwarn */,
+            [=, &output](int width, int height, int channels) -> uint8* {
+              Status status(context->allocate_output(
+                  0,
+                  format_ == kGifFormat
+                      ? TensorShape({1, height, width, channels})
+                      : TensorShape({height, width, channels}),
+                  &output));
+              if (!status.ok()) {
+                VLOG(1) << status;
+                context->SetStatus(status);
+                return nullptr;
+              }
+              return output->flat<uint8>().data();
+            }),
+        errors::InvalidArgument("Invalid JPEG data, size ", input.size()));
+  }
+
+  void DecodePng(OpKernelContext* context, StringPiece input) {
+    // Start decoding png to get shape details
+    png::DecodeContext decode;
+    OP_REQUIRES(context,
+                png::CommonInitDecode(input, channels_, channel_bits_, &decode),
+                errors::InvalidArgument("Invalid PNG header, data size ",
+                                        input.size()));
+
+    // Verify that width and height are not too large:
+    // - verify width and height don't overflow int.
+    // - width can later be multiplied by channels_ and sizeof(uint16), so
+    //   verify single dimension is not too large.
+    // - verify when width and height are multiplied together, there are a few
+    //   bits to spare as well.
+    const int width = static_cast<int>(decode.width);
+    const int height = static_cast<int>(decode.height);
+    const int64 total_size =
+        static_cast<int64>(width) * static_cast<int64>(height);
+    if (width != static_cast<int64>(decode.width) || width <= 0 ||
+        width >= (1LL << 27) || height != static_cast<int64>(decode.height) ||
+        height <= 0 || height >= (1LL << 27) || total_size >= (1LL << 29)) {
+      png::CommonFreeDecode(&decode);
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("PNG size too large for int: ",
+                                          decode.width, " by ", decode.height));
+    }
+
+    // Allocate tensor
+    Tensor* output = nullptr;
+    const auto status = context->allocate_output(
+        0,
+        format_ == kGifFormat ? TensorShape({1, height, width, decode.channels})
+                              : TensorShape({height, width, decode.channels}),
+        &output);
+    if (!status.ok()) png::CommonFreeDecode(&decode);
+    OP_REQUIRES_OK(context, status);
+
+    if (channel_bits_ == 8) {
+      // Finish decoding png
+      OP_REQUIRES(
+          context,
+          png::CommonFinishDecode(
+              reinterpret_cast<png_bytep>(output->flat<uint8>().data()),
+              decode.channels * width * sizeof(uint8), &decode),
+          errors::InvalidArgument("Invalid PNG data, size ", input.size()));
+    } else {
+      // Finish decoding png
+      OP_REQUIRES(
+          context,
+          png::CommonFinishDecode(
+              reinterpret_cast<png_bytep>(output->flat<uint16>().data()),
+              decode.channels * width * sizeof(uint16), &decode),
+          errors::InvalidArgument("Invalid PNG data, size ", input.size()));
+    }
+  }
+
+  void DecodeGif(OpKernelContext* context, StringPiece input) {
+    OP_REQUIRES(context, channels_ == 0 || channels_ == 3,
+                errors::InvalidArgument("channels must be 0 or 3 for GIF, got ",
+                                        channels_));
+
+    // Decode GIF, allocating tensor once the size is known.
+    Tensor* output = nullptr;
+    OP_REQUIRES(
+        context,
+        gif::Decode(input.data(), input.size(),
+                    [=, &output](int num_frames, int width, int height,
+                                 int channels) -> uint8* {
+                      Status status;
+                      if (format_ == kGifFormat) {
+                        status = context->allocate_output(
+                            0,
+                            TensorShape({num_frames, height, width, channels}),
+                            &output);
+                      } else if (num_frames == 1) {
+                        status = context->allocate_output(
+                            0, TensorShape({height, width, channels}), &output);
+                      } else {
+                        status = errors::InvalidArgument(
+                            "Got ", num_frames, " frames, but animated gifs ",
+                            "can only be decoded by tf.image.decode_gif or ",
+                            "tf.image.decode_image");
+                      }
+                      if (!status.ok()) {
+                        VLOG(1) << status;
+                        context->SetStatus(status);
+                        return nullptr;
+                      }
+                      return output->flat<uint8>().data();
+                    }),
+        errors::InvalidArgument("Invalid GIF data, size ", input.size()));
+  }
+
+ private:
+  FileFormat format_;
+  int channels_;
+  int channel_bits_ = 8;
+  jpeg::UncompressFlags flags_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeJpeg").Device(DEVICE_CPU), DecodeImageOp);
+REGISTER_KERNEL_BUILDER(Name("DecodePng").Device(DEVICE_CPU), DecodeImageOp);
+REGISTER_KERNEL_BUILDER(Name("DecodeGif").Device(DEVICE_CPU), DecodeImageOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_jpeg_op.cc b/tensorflow/core/kernels/decode_jpeg_op.cc
deleted file mode 100644
index b795f3955037d934ee89e9533b476438b032f2fc..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/decode_jpeg_op.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/image_ops.cc
-
-#include <memory>
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/jpeg/jpeg_mem.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-// Decode the contents of a JPEG file
-class DecodeJpegOp : public OpKernel {
- public:
-  explicit DecodeJpegOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("channels", &flags_.components));
-    OP_REQUIRES(context, flags_.components == 0 || flags_.components == 1 ||
-                             flags_.components == 3,
-                errors::InvalidArgument("channels must be 0, 1, or 3, got ",
-                                        flags_.components));
-    OP_REQUIRES_OK(context, context->GetAttr("ratio", &flags_.ratio));
-    OP_REQUIRES(context, flags_.ratio == 1 || flags_.ratio == 2 ||
-                             flags_.ratio == 4 || flags_.ratio == 8,
-                errors::InvalidArgument("ratio must be 1, 2, 4, or 8, got ",
-                                        flags_.ratio));
-    OP_REQUIRES_OK(
-        context, context->GetAttr("fancy_upscaling", &flags_.fancy_upscaling));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("try_recover_truncated",
-                                    &flags_.try_recover_truncated_jpeg));
-    OP_REQUIRES_OK(context, context->GetAttr("acceptable_fraction",
-                                             &flags_.min_acceptable_fraction));
-
-    string dct_method;
-    OP_REQUIRES_OK(context, context->GetAttr("dct_method", &dct_method));
-    OP_REQUIRES(
-        context, (dct_method.empty() || dct_method == "INTEGER_FAST" ||
-                  dct_method == "INTEGER_ACCURATE"),
-        errors::InvalidArgument("dct_method must be one of "
-                                "{'', 'INTEGER_FAST', 'INTEGER_ACCURATE'}"));
-    if (dct_method == "INTEGER_FAST") {
-      flags_.dct_method = JDCT_IFAST;
-    } else if (dct_method == "INTEGER_ACCURATE") {
-      flags_.dct_method = JDCT_ISLOW;
-    } else {
-      // The TensorFlow-chosen default is IFAST, sacrificing decoding
-      // image quality for speed.
-      flags_.dct_method = JDCT_IFAST;
-    }
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& contents = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
-                errors::InvalidArgument("contents must be scalar, got shape ",
-                                        contents.shape().DebugString()));
-    const StringPiece input = contents.scalar<string>()();
-    OP_REQUIRES(context, input.size() <= std::numeric_limits<int>::max(),
-                errors::InvalidArgument("JPEG contents are too large for int: ",
-                                        input.size()));
-
-    // Decode image, allocating tensor once the image size is known
-    Tensor* output = NULL;
-    OP_REQUIRES(
-        context,
-        jpeg::Uncompress(
-            input.data(), input.size(), flags_, nullptr /* nwarn */,
-            [=, &output](int width, int height, int channels) -> uint8* {
-              Status status(context->allocate_output(
-                  0, TensorShape({height, width, channels}), &output));
-              if (!status.ok()) {
-                VLOG(1) << status;
-                context->SetStatus(status);
-                return nullptr;
-              }
-              return output->flat<uint8>().data();
-            }),
-        errors::InvalidArgument("Invalid JPEG data, size ", input.size()));
-  }
-
- private:
-  jpeg::UncompressFlags flags_;
-};
-REGISTER_KERNEL_BUILDER(Name("DecodeJpeg").Device(DEVICE_CPU), DecodeJpegOp);
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_png_op.cc b/tensorflow/core/kernels/decode_png_op.cc
deleted file mode 100644
index 1906ae7746c4f96e5392c77d6551e4d268304e76..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/decode_png_op.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/image_ops.cc
-
-#include <memory>
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/png/png_io.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-
-// Decode the contents of a PNG file
-class DecodePngOp : public OpKernel {
- public:
-  explicit DecodePngOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
-    OP_REQUIRES(context, channels_ == 0 || channels_ == 1 || channels_ == 3 ||
-                             channels_ == 4,
-                errors::InvalidArgument("channels must be 0, 1, 3, or 4, got ",
-                                        channels_));
-
-    DataType dt;
-    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dt));
-    OP_REQUIRES(
-        context, dt == DataType::DT_UINT8 || dt == DataType::DT_UINT16,
-        errors::InvalidArgument("Type must be UINT8 or UINT16, got ", dt));
-    if (dt == DataType::DT_UINT8) {
-      desired_channel_bits_ = 8;
-    } else {
-      desired_channel_bits_ = 16;
-    }
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& contents = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents.shape()),
-                errors::InvalidArgument("contents must be scalar, got shape ",
-                                        contents.shape().DebugString()));
-
-    // Start decoding image to get shape details
-    const StringPiece data = contents.scalar<string>()();
-    png::DecodeContext decode;
-    OP_REQUIRES(
-        context,
-        png::CommonInitDecode(data, channels_, desired_channel_bits_, &decode),
-        errors::InvalidArgument("Invalid PNG header, data size ", data.size()));
-
-    // Verify that width and height are not too large:
-    // - verify width and height don't overflow int.
-    // - width can later be multiplied by channels_ and sizeof(uint16), so
-    //   verify single dimension is not too large.
-    // - verify when width and height are multiplied together, there are a few
-    //   bits to spare as well.
-    const int width = static_cast<int>(decode.width);
-    const int height = static_cast<int>(decode.height);
-    const int64 total_size =
-        static_cast<int64>(width) * static_cast<int64>(height);
-    if (width != static_cast<int64>(decode.width) || width <= 0 ||
-        width >= (1LL << 27) || height != static_cast<int64>(decode.height) ||
-        height <= 0 || height >= (1LL << 27) || total_size >= (1LL << 29)) {
-      png::CommonFreeDecode(&decode);
-      OP_REQUIRES(context, false,
-                  errors::InvalidArgument("PNG size too large for int: ",
-                                          decode.width, " by ", decode.height));
-    }
-
-    // Allocate tensor
-    Tensor* output = nullptr;
-    const auto status = context->allocate_output(
-        0, TensorShape({height, width, decode.channels}), &output);
-    if (!status.ok()) png::CommonFreeDecode(&decode);
-    OP_REQUIRES_OK(context, status);
-
-    if (desired_channel_bits_ == 8) {
-      // Finish decoding image
-      OP_REQUIRES(
-          context,
-          png::CommonFinishDecode(
-              reinterpret_cast<png_bytep>(output->flat<uint8>().data()),
-              decode.channels * width * sizeof(uint8), &decode),
-          errors::InvalidArgument("Invalid PNG data, size ", data.size()));
-    } else {
-      // Finish decoding image
-      OP_REQUIRES(
-          context,
-          png::CommonFinishDecode(
-              reinterpret_cast<png_bytep>(output->flat<uint16>().data()),
-              decode.channels * width * sizeof(uint16), &decode),
-          errors::InvalidArgument("Invalid PNG data, size ", data.size()));
-    }
-  }
-
- private:
-  int channels_;
-  int desired_channel_bits_;
-};
-REGISTER_KERNEL_BUILDER(Name("DecodePng").Device(DEVICE_CPU), DecodePngOp);
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index 4247abcd710c316942c40e648ebe315e018b0511..da247161f9aea3f150aa0f4d2d2c2b2543a3cce4 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -50,7 +50,7 @@ class DecodeRawOp : public OpKernel {
       }
     }
     TensorShape out_shape = input.shape();
-    if (str_size == -1) {  // Empty input
+    if (str_size == -1 || str_size == 0) {  // Empty input
       out_shape.AddDim(1);
       Tensor* output_tensor = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output("output", out_shape,
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 9e6d8e42a47b64c53cb014be7c0eeece45a7cf1e..8e9b8a7e2e7be8e55deeacd4de3f77033499387f 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -26,7 +26,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-// DeepConv2D is a Conv2D implementation specialzied for deep convolutions (i.e
+// DeepConv2D is a Conv2D implementation specialized for deep convolutions (i.e
 // large 'in_depth' and 'out_depth' product. See cost models below for details).
 //
 // DeepConv2D is implemented by computing the following equation:
@@ -1069,7 +1069,7 @@ struct DeepConv2D<CPUDevice, T> {
       // Allocate temporary buffer 'buffer2', which is first used for
       // transformed input tiles, then re-used for transformed output tiles.
       // Calculate required buffer size for 'buffer2' as max required buffer
-      // between input and output tranform buffer sizes.
+      // between input and output transform buffer sizes.
       const int64 buffer2_tile_transform_size =
           tile_spatial_size * num_tiles * in_depth;
       const int64 buffer2_out_transform_size =
diff --git a/tensorflow/core/kernels/deep_conv2d.h b/tensorflow/core/kernels/deep_conv2d.h
index a9de20e7ae7c21b6e2c6d7968dac0c4ed419932e..c3f6f66dc9ba6fcf3e29c139eec0030cc7a0be57 100644
--- a/tensorflow/core/kernels/deep_conv2d.h
+++ b/tensorflow/core/kernels/deep_conv2d.h
@@ -22,7 +22,7 @@ namespace tensorflow {
 
 class OpKernelContext;
 
-// DeepConv2D is a Conv2D implementation specialzied for deep (i.e. large
+// DeepConv2D is a Conv2D implementation specialized for deep (i.e. large
 // in_depth * out_depth product) convolutions (see deep_conv2d.cc for details).
 
 // DeepConv2DTransform is an interface for implementing transforms for
diff --git a/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c36093355a53e4c9e014f85f42e65d178886487
--- /dev/null
+++ b/tensorflow/core/kernels/dense_to_sparse_batch_dataset_op.cc
@@ -0,0 +1,274 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class DenseToSparseBatchDatasetOp : public OpKernel {
+ public:
+  explicit DenseToSparseBatchDatasetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new DenseToSparseBatchDatasetOp::Dataset, insert it in the
+    // step-local container, and return it as the output.
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    OP_REQUIRES(
+        ctx, input->output_dtypes().size() == 1,
+        errors::InvalidArgument("DenseToSparseBatchDataset only supports "
+                                "inputs with a single component."));
+
+    const Tensor* batch_size_t;
+    OP_REQUIRES_OK(ctx, ctx->input("batch_size", &batch_size_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(batch_size_t->shape()),
+                errors::InvalidArgument("batch_size must be a scalar"));
+    const int64 batch_size = batch_size_t->flat<int64>()(0);
+    OP_REQUIRES(
+        ctx, batch_size > 0,
+        errors::InvalidArgument("Batch size must be greater than zero."));
+
+    const Tensor* row_shape_t;
+    OP_REQUIRES_OK(ctx, ctx->input("row_shape", &row_shape_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(row_shape_t->shape()),
+                errors::InvalidArgument("row_shape must be a vector"));
+    TensorShape row_shape;
+    for (size_t i = 0; i < row_shape_t->dim_size(0); ++i) {
+      row_shape.AddDim(row_shape_t->vec<int64>()(i));
+    }
+
+    DatasetBase* dataset = nullptr;
+
+#define HANDLE_TYPE(DT)                                                      \
+  if (input->output_dtypes()[0] == DT) {                                     \
+    dataset =                                                                \
+        new Dataset<EnumToDataType<DT>::Type>(batch_size, row_shape, input); \
+  }
+    HANDLE_TYPE(DT_FLOAT);
+    HANDLE_TYPE(DT_HALF);
+    HANDLE_TYPE(DT_DOUBLE);
+    HANDLE_TYPE(DT_INT32);
+    HANDLE_TYPE(DT_UINT8);
+    HANDLE_TYPE(DT_INT16);
+    HANDLE_TYPE(DT_INT8);
+    HANDLE_TYPE(DT_STRING);
+    HANDLE_TYPE(DT_COMPLEX64);
+    HANDLE_TYPE(DT_COMPLEX128);
+    HANDLE_TYPE(DT_INT64);
+    HANDLE_TYPE(DT_BOOL);
+    HANDLE_TYPE(DT_QINT8);
+    HANDLE_TYPE(DT_QUINT8);
+    HANDLE_TYPE(DT_QINT32);
+    HANDLE_TYPE(DT_QINT16);
+    HANDLE_TYPE(DT_QUINT16);
+#undef HANDLE_TYPE
+    OP_REQUIRES(
+        ctx, dataset != nullptr,
+        errors::Unimplemented("DenseToSparseBatchDataset unhandled data type: ",
+                              input->output_dtypes()[0]));
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  // TODO(mrry): Push the templated code down to the raw copying routine.
+  template <class T>
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(int64 batch_size, const TensorShape& row_shape,
+            const DatasetBase* input)
+        : batch_size_(batch_size), row_shape_(row_shape), input_(input) {
+      input_->Ref();
+
+      output_shapes_.reserve(3);
+      // Outputs represent a SparseTensor as (indices, values, dense_shape).
+      output_shapes_.push_back({-1, row_shape_.dims() + 1});
+      output_shapes_.push_back({-1});
+      output_shapes_.push_back({row_shape_.dims() + 1});
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* output_dtypes_ =
+          new DataTypeVector({DT_INT64, DataTypeToEnum<T>::value, DT_INT64});
+      return *output_dtypes_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("DenseToSparseBatchDatasetOp(", batch_size_,
+                             ")::Dataset");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset<T>> {
+     public:
+      explicit Iterator(const Dataset<T>* dataset)
+          : DatasetIterator<Dataset<T>>(dataset),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        // Each row of the output SparseTensor is an individual tensor
+        // from the input iterator.
+        std::vector<Tensor> batch_elements;
+        int64 total_elements = 0;
+        batch_elements.reserve(
+            DatasetIterator<Dataset<T>>::dataset()->batch_size_);
+        const TensorShape& row_shape =
+            DatasetIterator<Dataset<T>>::dataset()->row_shape_;
+        const int row_ndims = row_shape.dims();
+        {
+          mutex_lock l(mu_);
+          *end_of_sequence = false;
+          for (int i = 0;
+               i < DatasetIterator<Dataset<T>>::dataset()->batch_size_ &&
+               !*end_of_sequence;
+               ++i) {
+            std::vector<Tensor> batch_element_tuple;
+            TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
+                                                    end_of_sequence));
+            if (!*end_of_sequence) {
+              DCHECK_EQ(1, batch_element_tuple.size());
+              batch_elements.push_back(std::move(batch_element_tuple[0]));
+              total_elements += batch_element_tuple[0].NumElements();
+
+              // TODO(mrry): Investigate how to hoist this check when we
+              // have static information that renders it unnecessary.
+              if (batch_element_tuple[0].shape().dims() != row_ndims) {
+                return errors::InvalidArgument(
+                    "Input element had shape (",
+                    batch_element_tuple[0].shape().DebugString(),
+                    ") that is incompatible with the row shape (",
+                    row_shape.DebugString(), ").");
+              }
+              for (int i = 0; i < row_ndims; ++i) {
+                if (batch_element_tuple[0].shape().dim_size(i) >
+                    row_shape.dim_size(i)) {
+                  return errors::DataLoss(
+                      "Input element had shape (",
+                      batch_element_tuple[0].shape().DebugString(),
+                      ") that is larger than the row shape (",
+                      row_shape.DebugString(), ").");
+                }
+              }
+            }
+          }
+        }
+
+        if (batch_elements.empty()) {
+          DCHECK(*end_of_sequence);
+          return Status::OK();
+        }
+
+        // Determine the size of the output tensors:
+        // * indices will be [`total_elements`, `row_shape + 1`].
+        // * values will be [`total_elements`].
+        // * dense_shape will be [`row_shape + 1`].
+        Tensor indices(cpu_allocator(), DT_INT64,
+                       {total_elements, row_ndims + 1});
+        Tensor values(
+            cpu_allocator(),
+            DatasetIterator<Dataset<T>>::dataset()->output_dtypes()[1],
+            {total_elements});
+        Tensor dense_shape(cpu_allocator(), DT_INT64, {row_ndims + 1});
+        auto indices_matrix = indices.matrix<int64>();
+        auto values_flat = values.flat<T>();
+        auto dense_shape_vec = dense_shape.vec<int64>();
+
+        int64 current_position_in_values = 0;
+        for (int64 i = 0; i < batch_elements.size(); ++i) {
+          const Tensor& t = batch_elements[i];
+          const auto& t_flat = t.flat<T>();
+          // TODO(mrry): Replace with a memcpy or something more
+          // efficient. (Maybe an Eigen assign op?)
+          gtl::InlinedVector<int64, 4> strides(row_ndims);
+          if (!strides.empty()) {
+            strides[row_ndims - 1] = 1;
+            for (int64_t row_dim = strides.size() - 2; row_dim >= 0;
+                 --row_dim) {
+              strides[row_dim] =
+                  strides[row_dim + 1] * t.shape().dim_size(row_dim + 1);
+            }
+          }
+
+          for (int64 j = 0; j < t.NumElements(); ++j) {
+            values_flat(current_position_in_values) = t_flat(j);
+            indices_matrix(current_position_in_values, 0) = i;
+            int64 index = j;
+            for (size_t k = 0; k < strides.size(); ++k) {
+              indices_matrix(current_position_in_values, k + 1) =
+                  index / strides[k];
+              index %= strides[k];
+            }
+            ++current_position_in_values;
+          }
+        }
+
+        dense_shape_vec(0) = batch_elements.size();
+        for (size_t i = 0; i < row_ndims; ++i) {
+          dense_shape_vec(i + 1) = row_shape.dim_size(i);
+        }
+
+        out_tensors->push_back(std::move(indices));
+        out_tensors->push_back(std::move(values));
+        out_tensors->push_back(std::move(dense_shape));
+
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const int64 batch_size_;
+    const TensorShape row_shape_;
+    const DatasetBase* const input_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("DenseToSparseBatchDataset").Device(DEVICE_CPU),
+                        DenseToSparseBatchDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index 26d45f79d82213c602fd3be15cf66e115f922876..2e7213f95686d45a68bb6ca0c25392e4dd672c13 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -441,7 +441,9 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
       DepthwiseConv2dNativeOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_CPU_KERNEL);
+#if defined(PLATFORM_WINDOWS) && !defined(_DEBUG)
 TF_CALL_double(REGISTER_CPU_KERNEL);
+#endif
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 5377d09ec69db701a337352a63d2d00a08e36dce..051d4772449cf9da429a5a901631b992337ab68d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -24,28 +24,32 @@ limitations under the License.
 
 #if !defined(_MSC_VER)
 #define UNROLL _Pragma("unroll")
+#define NOUNROLL _Pragma("nounroll")
 #else
 #define UNROLL
+#define NOUNROLL
 #endif
 
 namespace tensorflow {
 
-namespace {
-
-typedef Eigen::GpuDevice GPUDevice;
+using Eigen::GpuDevice;
 
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NHWC format.
-template <typename T>
-__global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
-                                             const T* input, const T* filter,
-                                             T* output, int num_outputs) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(1024, 2)
+    DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
+                                 const T* filter, T* output, int num_outputs) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -114,16 +118,20 @@ __global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args,
 
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NCHW format.
-template <typename T>
-__global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args,
-                                             const T* input, const T* filter,
-                                             T* output, int num_outputs) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(1024, 2)
+    DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
+                                 const T* filter, T* output, int num_outputs) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -235,29 +243,58 @@ __global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args,
   }
 }
 
-}  // namespace
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
+                              const T* input, const T* filter, T* output,
+                              TensorFormat data_format) {
+  const int num_outputs =
+      args.batch * args.out_rows * args.out_cols * args.out_depth;
+  // The compile-time constant version runs faster with a single block.
+  const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
+                                      kKnownDepthMultiplier < 0 ||
+                                      args.out_rows * args.out_cols <= 256
+                                  ? std::numeric_limits<int>::max()
+                                  : d.getNumCudaMultiProcessors();
+  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_outputs, d,
+        DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
+                                     kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
+                                 kKnownDepthMultiplier>
+        <<<std::min(max_block_count, config.block_count),
+           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
+                                                     output, num_outputs);
+  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_outputs, d,
+        DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
+                                     kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
+                                 kKnownDepthMultiplier>
+        <<<std::min(max_block_count, config.block_count),
+           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
+                                                     output, num_outputs);
+  } else {
+    assert(false);
+  }
+}
 
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
 struct DepthwiseConv2dGPULaunch {
-  static void Run(const GPUDevice& d, const DepthwiseArgs args, const T* input,
+  static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* input,
                   const T* filter, T* output, TensorFormat data_format) {
-    // In this kernel, each thread is computing the gradients from one element
-    // in the out_backprop. Note that one element in the out_backprop can map
-    // to multiple filter elements.
-    const int num_outputs =
-        args.batch * args.out_rows * args.out_cols * args.out_depth;
-    CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d);
-    if (data_format == FORMAT_NHWC) {
-      DepthwiseConv2dGPUKernelNHWC<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, input, filter, output, num_outputs);
-    } else if (data_format == FORMAT_NCHW) {
-      DepthwiseConv2dGPUKernelNCHW<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, input, filter, output, num_outputs);
+    if (args.filter_rows == 3 && args.filter_cols == 3 &&
+        args.depth_multiplier == 1) {
+      LaunchDepthwiseConv2dGPU<T, 3, 3, 1>(d, args, input, filter, output,
+                                           data_format);
     } else {
-      assert(false);
+      LaunchDepthwiseConv2dGPU<T, -1, -1, -1>(d, args, input, filter, output,
+                                              data_format);
     }
   }
 };
@@ -266,18 +303,22 @@ template struct DepthwiseConv2dGPULaunch<float>;
 template struct DepthwiseConv2dGPULaunch<double>;
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
-template <typename T, int KNOWN_DEPTH_MULTIPLIER>
-__global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
-    const DepthwiseArgs args, const T* out_backprop, const T* filter,
-    T* in_backprop, int num_in_backprop) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args,
+                                              const T* out_backprop,
+                                              const T* filter, T* in_backprop,
+                                              int num_in_backprop) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = KNOWN_DEPTH_MULTIPLIER == -1
-                                   ? args.depth_multiplier
-                                   : KNOWN_DEPTH_MULTIPLIER;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -301,14 +342,12 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
         tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride);
     const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride);
 
-#pragma nounroll
-    for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
+    NOUNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
       const int f_r = in_r + pad_rows - out_r * stride;
       const int temp_out_backprop_offset =
           out_depth * out_cols * (out_r + out_rows * b);
       const int temp_filter_offset = filter_cols * f_r;
-#pragma nounroll
-      for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
+      NOUNROLL for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
         const int f_c = in_c + pad_cols - out_c * stride;
         int filter_offset =
             depth_multiplier * (in_d + in_depth * (f_c + temp_filter_offset));
@@ -328,8 +367,9 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC(
   }
 }
 
-template <typename T>
-__global__ void __launch_bounds__(1024)
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(640, 2)
     DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args,
                                               const T* out_backprop,
                                               const T* filter, T* in_backprop,
@@ -337,9 +377,12 @@ __global__ void __launch_bounds__(1024)
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -395,34 +438,57 @@ __global__ void __launch_bounds__(1024)
   }
 }
 
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
+                                           const DepthwiseArgs args,
+                                           const T* out_backprop,
+                                           const T* filter, T* in_backprop,
+                                           TensorFormat data_format) {
+  const int num_in_backprop =
+      args.batch * args.in_rows * args.in_cols * args.in_depth;
+  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_in_backprop, d,
+        DepthwiseConv2dBackpropInputGPUKernelNHWC<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dBackpropInputGPUKernelNHWC<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            args, out_backprop, filter, in_backprop, num_in_backprop);
+  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_in_backprop, d,
+        DepthwiseConv2dBackpropInputGPUKernelNCHW<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dBackpropInputGPUKernelNCHW<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            args, out_backprop, filter, in_backprop, num_in_backprop);
+  } else {
+    assert(false);
+  }
+}
+
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
 struct DepthwiseConv2dBackpropInputGPULaunch {
-  static void Run(const GPUDevice& d, const DepthwiseArgs args,
+  static void Run(const GpuDevice& d, const DepthwiseArgs args,
                   const T* out_backprop, const T* filter, T* in_backprop,
                   TensorFormat data_format) {
-    const int num_in_backprop =
-        args.batch * args.in_rows * args.in_cols * args.in_depth;
-
-    CudaLaunchConfig config = GetCudaLaunchConfig(num_in_backprop, d);
-    // Increase block count for when there are more warps/SM than threads/SM.
-    config.block_count *= 4;
-    if (data_format == FORMAT_NHWC) {
-      if (args.depth_multiplier == 1) {
-        DepthwiseConv2dBackpropInputGPUKernelNHWC<T, 1>
-            <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                args, out_backprop, filter, in_backprop, num_in_backprop);
+    if (args.depth_multiplier == 1) {
+      if (args.filter_rows == 3 && args.filter_cols == 3) {
+        LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3, 1>(
+            d, args, out_backprop, filter, in_backprop, data_format);
       } else {
-        DepthwiseConv2dBackpropInputGPUKernelNHWC<T, -1>
-            <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-                args, out_backprop, filter, in_backprop, num_in_backprop);
+        LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, 1>(
+            d, args, out_backprop, filter, in_backprop, data_format);
       }
-    } else if (data_format == FORMAT_NCHW) {
-      DepthwiseConv2dBackpropInputGPUKernelNCHW<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, out_backprop, filter, in_backprop, num_in_backprop);
     } else {
-      assert(false);
+      LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, -1>(
+          d, args, out_backprop, filter, in_backprop, data_format);
     }
   }
 };
@@ -431,16 +497,23 @@ template struct DepthwiseConv2dBackpropInputGPULaunch<float>;
 template struct DepthwiseConv2dBackpropInputGPULaunch<double>;
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
-template <typename T>
-__global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
-    const DepthwiseArgs args, const T* out_backprop, const T* input,
-    T* filter_backprop, int num_out_backprop) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args,
+                                               const T* out_backprop,
+                                               const T* input,
+                                               T* filter_backprop,
+                                               int num_out_backprop) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -518,16 +591,23 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC(
 }
 
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
-template <typename T>
-__global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW(
-    const DepthwiseArgs args, const T* out_backprop, const T* input,
-    T* filter_backprop, int num_out_backprop) {
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+__global__ void __launch_bounds__(640, 2)
+    DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args,
+                                               const T* out_backprop,
+                                               const T* input,
+                                               T* filter_backprop,
+                                               int num_out_backprop) {
   const int in_rows = args.in_rows;
   const int in_cols = args.in_cols;
   const int in_depth = args.in_depth;
-  const int filter_rows = args.filter_rows;
-  const int filter_cols = args.filter_cols;
-  const int depth_multiplier = args.depth_multiplier;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int depth_multiplier =
+      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
   const int stride = args.stride;
   const int pad_rows = args.pad_rows;
   const int pad_cols = args.pad_cols;
@@ -610,28 +690,53 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW(
   }
 }
 
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          int kKnownDepthMultiplier>
+void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
+                                            const DepthwiseArgs args,
+                                            const T* out_backprop,
+                                            const T* input, T* filter_backprop,
+                                            TensorFormat data_format) {
+  const int num_out_backprop =
+      args.batch * args.out_rows * args.out_cols * args.out_depth;
+  if (data_format == FORMAT_NHWC) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_out_backprop, d,
+        DepthwiseConv2dBackpropFilterGPUKernelNHWC<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dBackpropFilterGPUKernelNHWC<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            args, out_backprop, input, filter_backprop, num_out_backprop);
+  } else if (data_format == FORMAT_NCHW) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(
+        num_out_backprop, d,
+        DepthwiseConv2dBackpropFilterGPUKernelNCHW<
+            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
+        0);
+    DepthwiseConv2dBackpropFilterGPUKernelNCHW<
+        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            args, out_backprop, input, filter_backprop, num_out_backprop);
+  } else {
+    assert(false);
+  }
+}
+
 // A simple launch pad to launch the Cuda kernel for depthwise convolution.
 template <typename T>
 struct DepthwiseConv2dBackpropFilterGPULaunch {
-  static void Run(const GPUDevice& d, const DepthwiseArgs args,
+  static void Run(const GpuDevice& d, const DepthwiseArgs args,
                   const T* out_backprop, const T* input, T* filter_backprop,
                   TensorFormat data_format) {
-    // In this kernel, each thread is computing the gradients for one element in
-    // the out_backprop.
-    const int num_out_backprop =
-        args.batch * args.out_rows * args.out_cols * args.out_depth;
-    CudaLaunchConfig config = GetCudaLaunchConfig(num_out_backprop, d);
-
-    if (data_format == FORMAT_NHWC) {
-      DepthwiseConv2dBackpropFilterGPUKernelNHWC<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, out_backprop, input, filter_backprop, num_out_backprop);
-    } else if (data_format == FORMAT_NCHW) {
-      DepthwiseConv2dBackpropFilterGPUKernelNCHW<T>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              args, out_backprop, input, filter_backprop, num_out_backprop);
+    if (args.filter_rows == 3 && args.filter_cols == 3 &&
+        args.depth_multiplier == 1) {
+      LaunchDepthwiseConv2dBackpropFilterGPU<T, 3, 3, 1>(
+          d, args, out_backprop, input, filter_backprop, data_format);
     } else {
-      assert(false);
+      LaunchDepthwiseConv2dBackpropFilterGPU<T, -1, -1, -1>(
+          d, args, out_backprop, input, filter_backprop, data_format);
     }
   }
 };
diff --git a/tensorflow/core/kernels/dequantize_op.cc b/tensorflow/core/kernels/dequantize_op.cc
index c28909e03ba40f2c6982b2c48a676cf420b269fb..8efe0d1e35f19fe10ccc7323c37f51220ca9c772 100644
--- a/tensorflow/core/kernels/dequantize_op.cc
+++ b/tensorflow/core/kernels/dequantize_op.cc
@@ -69,12 +69,15 @@ class DequantizeOp : public OpKernel {
           (static_cast<float>(std::numeric_limits<T>::max()) -
            std::numeric_limits<T>::min());
 
-      // Multiply by scale factor and add min_range.
-      output->flat<float>() =
-          ((input.flat<T>().template cast<int>().template cast<float>() +
-            half_range_) *
-           scale_factor) +
-          min_range;
+      float* out_ptr = output->flat<float>().data();
+      const T* in_ptr = input.flat<T>().data();
+
+      const int64 num_elements = input.NumElements();
+      for (int i = 0; i < num_elements; ++i) {
+        out_ptr[i] =
+            ((static_cast<int>(in_ptr[i]) + half_range_) * scale_factor) +
+            min_range;
+      }
     } else if (mode_ == QUANTIZE_MODE_MIN_FIRST) {
       if (meta::IsSupportedAndEnabled() && std::is_same<T, quint8>()) {
         auto input_ui8_array = input.flat<quint8>();
diff --git a/tensorflow/core/kernels/dequantize_op_test.cc b/tensorflow/core/kernels/dequantize_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..efce810175420acbf1abf073d9c49a0ac5bcf0e0
--- /dev/null
+++ b/tensorflow/core/kernels/dequantize_op_test.cc
@@ -0,0 +1,144 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+class DequantizeOpTest : public OpsTestBase {
+ protected:
+  template <typename T>
+  void ComputeDequantizeMinCombinedUsingEigen(const Tensor& input,
+                                              float min_range, float max_range,
+                                              Tensor* output) {
+    float half_range =
+        !std::is_signed<T>::value
+            ? 0.0f
+            : (static_cast<float>(std::numeric_limits<T>::max()) -
+               std::numeric_limits<T>::min() + 1) /
+                  2.0f;
+    const float scale_factor =
+        (max_range - min_range) /
+        (static_cast<float>(std::numeric_limits<T>::max()) -
+         std::numeric_limits<T>::min());
+    output->flat<float>() =
+        ((input.flat<T>().template cast<int>().template cast<float>() +
+          half_range) *
+         scale_factor) +
+        min_range;
+  }
+
+  // Compares dequantize min vs the same using eigen. This tests that a change
+  // to not use eigen gives equivalent results to using eigen.
+  template <typename T>
+  void RunDequantizeMinCombinedTest(float min_range, float max_range) {
+    TF_ASSERT_OK(NodeDefBuilder("dequantize_op", "Dequantize")
+                     .Input(FakeInput(DataTypeToEnum<T>::v()))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("T", DataTypeToEnum<T>::v())
+                     .Attr("mode", "MIN_COMBINED")
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+
+    std::vector<T> input;
+    for (int64 i = std::numeric_limits<T>::min();
+         i < std::numeric_limits<T>::max(); ++i) {
+      input.push_back(static_cast<T>(i));
+    }
+    TensorShape shape({static_cast<int64>(input.size())});
+    AddInputFromArray<T>(shape, input);
+    AddInputFromArray<float>(TensorShape({1}), {min_range});
+    AddInputFromArray<float>(TensorShape({1}), {max_range});
+    TF_ASSERT_OK(RunOpKernel());
+    Tensor expected(allocator(), DT_FLOAT, shape);
+    ComputeDequantizeMinCombinedUsingEigen<T>(GetInput(0), min_range, max_range,
+                                              &expected);
+    test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  }
+};
+
+TEST_F(DequantizeOpTest, DequantizeMinCombinedQuint8) {
+  RunDequantizeMinCombinedTest<quint8>(0, 255.0f);
+}
+TEST_F(DequantizeOpTest, DequantizeMinCombinedQint8) {
+  RunDequantizeMinCombinedTest<qint8>(0, 255.0f);
+}
+TEST_F(DequantizeOpTest, DequantizeMinCombinedQint16) {
+  RunDequantizeMinCombinedTest<qint16>(0, 255.0f);
+}
+TEST_F(DequantizeOpTest, DequantizeMinCombinedQuint16) {
+  RunDequantizeMinCombinedTest<quint16>(0, 255.0f);
+}
+
+template <typename T>
+static void BM_DequantizeMinCombinedCpu(int iters) {
+  auto root = Scope::NewRootScope().ExitOnError();
+  const int64 num_values = 1500 * 250;
+  std::vector<T> inputs;
+  for (int i = 0; i < num_values; ++i) inputs.push_back(i);
+  ops::Dequantize(root, test::AsTensor<T>(inputs),
+                  test::AsTensor<float>({-1.5f}),
+                  test::AsTensor<float>({20.5f}),
+                  ops::Dequantize::Attrs().Mode("MIN_COMBINED"));
+  TF_CHECK_OK(root.status());
+  Graph* g = new Graph(OpRegistry::Global());
+  TF_CHECK_OK(root.ToGraph(g));
+
+  test::Benchmark("cpu", g).Run(iters);
+  testing::BytesProcessed(iters * num_values * (sizeof(float) + sizeof(T)));
+  testing::ItemsProcessed(iters);
+}
+
+static void BM_DequantizeMinCombinedCpuQuint16(int iters) {
+  BM_DequantizeMinCombinedCpu<quint16>(iters);
+}
+
+static void BM_DequantizeMinCombinedCpuQint16(int iters) {
+  BM_DequantizeMinCombinedCpu<qint16>(iters);
+}
+
+static void BM_DequantizeMinCombinedCpuQuint8(int iters) {
+  BM_DequantizeMinCombinedCpu<quint8>(iters);
+}
+
+static void BM_DequantizeMinCombinedCpuQint8(int iters) {
+  BM_DequantizeMinCombinedCpu<qint8>(iters);
+}
+
+BENCHMARK(BM_DequantizeMinCombinedCpuQuint16);
+BENCHMARK(BM_DequantizeMinCombinedCpuQint16);
+BENCHMARK(BM_DequantizeMinCombinedCpuQuint8);
+BENCHMARK(BM_DequantizeMinCombinedCpuQint8);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fake_quant_ops.cc b/tensorflow/core/kernels/fake_quant_ops.cc
index 41f9c218437f3a911843673affd7c4ae8813f4b9..c198f67bbb69b7fa38f5c5260e8210d21a289453 100644
--- a/tensorflow/core/kernels/fake_quant_ops.cc
+++ b/tensorflow/core/kernels/fake_quant_ops.cc
@@ -48,6 +48,10 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+namespace {
+bool IsNumBitsValid(int num_bits) { return num_bits >= 2 && num_bits <= 8; }
+}  // namespace
+
 // -----------------------------------------------------------------------------
 // Implementation of FakeQuantWithMinMaxArgsOp, see its documentation in
 // core/ops/array_ops.cc.
@@ -60,19 +64,25 @@ class FakeQuantWithMinMaxArgsOp
       : Base::UnaryElementWiseOp(context) {
     OP_REQUIRES_OK(context, context->GetAttr("min", &min_));
     OP_REQUIRES_OK(context, context->GetAttr("max", &max_));
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
     OP_REQUIRES(context, min_ < max_,
                 InvalidArgument("min has to be smaller than max, was: ", min_,
                                 " >= ", max_));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
   }
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     FakeQuantWithMinMaxArgsFunctor<Device> functor;
     functor(context->eigen_device<Device>(), input.flat<float>(), min_, max_,
-            output->flat<float>());
+            steps_, output->flat<float>());
   }
  private:
   float min_;
   float max_;
+  int steps_;
 };
 
 // Implementation of FakeQuantWithMinMaxArgsGradientOp, see its documentation in
@@ -88,9 +98,14 @@ class FakeQuantWithMinMaxArgsGradientOp
       : Base::BinaryElementWiseOp(context) {
     OP_REQUIRES_OK(context, context->GetAttr("min", &min_));
     OP_REQUIRES_OK(context, context->GetAttr("max", &max_));
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
     OP_REQUIRES(context, min_ < max_,
                 InvalidArgument("min has to be smaller than max, was: ", min_,
                                 " >= ", max_));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
   }
 
   template <int NDIMS>
@@ -105,11 +120,12 @@ class FakeQuantWithMinMaxArgsGradientOp
                 InvalidArgument("gradient and input must be the same size"));
     FakeQuantWithMinMaxArgsGradientFunctor<Device> functor;
     functor(context->eigen_device<Device>(), gradient.flat<float>(),
-            input.flat<float>(), min_, max_, output->flat<float>());
+            input.flat<float>(), min_, max_, steps_, output->flat<float>());
   }
  private:
   float min_;
   float max_;
+  int steps_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxArgs").Device(DEVICE_CPU),
@@ -124,20 +140,16 @@ typedef Eigen::GpuDevice GPUDevice;
 // Forward declarations for functor specializations for GPU.
 template <>
 void FakeQuantWithMinMaxArgsFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstFlat inputs,
-    const float min, const float max,
-    typename TTypes<float>::Flat outputs);
+    const GPUDevice& d, typename TTypes<float>::ConstFlat inputs, float min,
+    float max, int steps, typename TTypes<float>::Flat outputs);
 extern template struct FakeQuantWithMinMaxArgsFunctor<GPUDevice>;
 REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxArgs").Device(DEVICE_GPU),
                         FakeQuantWithMinMaxArgsOp<GPUDevice>);
 
 template <>
 void FakeQuantWithMinMaxArgsGradientFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstFlat gradients,
-    typename TTypes<float>::ConstFlat inputs,
-    const float min, const float max,
+    const GPUDevice& d, typename TTypes<float>::ConstFlat gradients,
+    typename TTypes<float>::ConstFlat inputs, float min, float max, int steps,
     typename TTypes<float>::Flat backprops);
 REGISTER_KERNEL_BUILDER(
     Name("FakeQuantWithMinMaxArgsGradient").Device(DEVICE_GPU),
@@ -152,6 +164,11 @@ class FakeQuantWithMinMaxVarsOp : public OpKernel {
  public:
   explicit FakeQuantWithMinMaxVarsOp(OpKernelConstruction* context)
       : OpKernel::OpKernel(context) {
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
 #ifndef FAKE_QUANT_NO_DEBUG
     OP_REQUIRES_OK(context,
                    context->allocate_persistent(DT_BOOL, {},
@@ -175,7 +192,7 @@ class FakeQuantWithMinMaxVarsOp : public OpKernel {
 
     FakeQuantWithMinMaxVarsFunctor<Device> functor;
     functor(context->eigen_device<Device>(), input.flat<float>(),
-            min.scalar<float>(), max.scalar<float>(),
+            min.scalar<float>(), max.scalar<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
             check_min_max->scalar<bool>(),
 #endif
@@ -183,6 +200,7 @@ class FakeQuantWithMinMaxVarsOp : public OpKernel {
   }
 
  private:
+  int steps_;
 #ifndef FAKE_QUANT_NO_DEBUG
   PersistentTensor check_min_max_handle_;
 #endif
@@ -195,6 +213,11 @@ class FakeQuantWithMinMaxVarsGradientOp : public OpKernel {
  public:
   explicit FakeQuantWithMinMaxVarsGradientOp(OpKernelConstruction* context)
       : OpKernel::OpKernel(context) {
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
 #ifndef FAKE_QUANT_NO_DEBUG
     OP_REQUIRES_OK(context,
                    context->allocate_persistent(DT_BOOL, {},
@@ -231,6 +254,7 @@ class FakeQuantWithMinMaxVarsGradientOp : public OpKernel {
     FakeQuantWithMinMaxVarsGradientFunctor<Device> functor;
     functor(context->eigen_device<Device>(), gradient.flat<float>(),
             input.flat<float>(), min.scalar<float>(), max.scalar<float>(),
+            steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
             check_min_max->scalar<bool>(),
 #endif
@@ -239,6 +263,7 @@ class FakeQuantWithMinMaxVarsGradientOp : public OpKernel {
   }
 
  private:
+  int steps_;
 #ifndef FAKE_QUANT_NO_DEBUG
   PersistentTensor check_min_max_handle_;
 #endif
@@ -253,10 +278,9 @@ REGISTER_KERNEL_BUILDER(
 #if GOOGLE_CUDA
 template <>
 void FakeQuantWithMinMaxVarsFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstFlat inputs,
+    const GPUDevice& d, typename TTypes<float>::ConstFlat inputs,
     typename TTypes<float>::ConstScalar min,
-    typename TTypes<float>::ConstScalar max,
+    typename TTypes<float>::ConstScalar max, int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -270,11 +294,10 @@ REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVars")
 
 template <>
 void FakeQuantWithMinMaxVarsGradientFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstFlat gradients,
+    const GPUDevice& d, typename TTypes<float>::ConstFlat gradients,
     typename TTypes<float>::ConstFlat inputs,
     typename TTypes<float>::ConstScalar min,
-    typename TTypes<float>::ConstScalar max,
+    typename TTypes<float>::ConstScalar max, int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -297,6 +320,11 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
  public:
   explicit FakeQuantWithMinMaxVarsPerChannelOp(OpKernelConstruction* context)
       : OpKernel::OpKernel(context) {
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
 #ifndef FAKE_QUANT_NO_DEBUG
     OP_REQUIRES_OK(context,
                    context->allocate_persistent(DT_BOOL, {},
@@ -330,7 +358,7 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
         FakeQuant4WithMinMaxVarsPerChannelFunctor<Device> functor;
         functor(context->eigen_device<Device>(), input.dim_size(0),
                 input.dim_size(1), input.dim_size(2), input.dim_size(3),
-                input.flat<float>(), min.vec<float>(), max.vec<float>(),
+                input.flat<float>(), min.vec<float>(), max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
@@ -339,9 +367,9 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
       }
       case 2: {
         FakeQuant2WithMinMaxVarsPerChannelFunctor<Device> functor;
-        functor(context->eigen_device<Device>(),
-                input.dim_size(0), input.dim_size(1),
-                input.flat<float>(), min.vec<float>(), max.vec<float>(),
+        functor(context->eigen_device<Device>(), input.dim_size(0),
+                input.dim_size(1), input.flat<float>(), min.vec<float>(),
+                max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
@@ -350,8 +378,8 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
       }
       case 1: {
         FakeQuant1WithMinMaxVarsPerChannelFunctor<Device> functor;
-        functor(context->eigen_device<Device>(),
-                input.vec<float>(), min.vec<float>(), max.vec<float>(),
+        functor(context->eigen_device<Device>(), input.vec<float>(),
+                min.vec<float>(), max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
@@ -366,6 +394,7 @@ class FakeQuantWithMinMaxVarsPerChannelOp : public OpKernel {
   }
 
  private:
+  int steps_;
 #ifndef FAKE_QUANT_NO_DEBUG
   PersistentTensor check_min_max_handle_;
 #endif
@@ -378,6 +407,11 @@ class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel {
  public:
   explicit FakeQuantWithMinMaxVarsPerChannelGradientOp(
       OpKernelConstruction* context) : OpKernel::OpKernel(context) {
+    int num_bits;
+    OP_REQUIRES_OK(context, context->GetAttr("num_bits", &num_bits));
+    OP_REQUIRES(context, IsNumBitsValid(num_bits),
+                InvalidArgument("num_bits must be between 2 and 8, inclusive"));
+    steps_ = (1 << num_bits) - 1;
 #ifndef FAKE_QUANT_NO_DEBUG
     OP_REQUIRES_OK(context,
                    context->allocate_persistent(DT_BOOL, {},
@@ -423,38 +457,36 @@ class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel {
         FakeQuant4WithMinMaxVarsPerChannelGradientFunctor<Device> functor;
         functor(context->eigen_device<Device>(), input.dim_size(0),
                 input.dim_size(1), input.dim_size(2), input.dim_size(3),
-                gradient.flat<float>(), input.flat<float>(),
-                min.vec<float>(), max.vec<float>(),
+                gradient.flat<float>(), input.flat<float>(), min.vec<float>(),
+                max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
-                grad_wrt_input->flat<float>(),
-                grad_wrt_min->vec<float>(), grad_wrt_max->vec<float>());
+                grad_wrt_input->flat<float>(), grad_wrt_min->vec<float>(),
+                grad_wrt_max->vec<float>());
         break;
       }
       case 2: {
         FakeQuant2WithMinMaxVarsPerChannelGradientFunctor<Device> functor;
-        functor(context->eigen_device<Device>(),
-                input.dim_size(0), input.dim_size(1),
-                gradient.flat<float>(), input.flat<float>(),
-                min.vec<float>(), max.vec<float>(),
+        functor(context->eigen_device<Device>(), input.dim_size(0),
+                input.dim_size(1), gradient.flat<float>(), input.flat<float>(),
+                min.vec<float>(), max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
-                grad_wrt_input->flat<float>(),
-                grad_wrt_min->vec<float>(), grad_wrt_max->vec<float>());
+                grad_wrt_input->flat<float>(), grad_wrt_min->vec<float>(),
+                grad_wrt_max->vec<float>());
         break;
       }
       case 1: {
         FakeQuant1WithMinMaxVarsPerChannelGradientFunctor<Device> functor;
-        functor(context->eigen_device<Device>(),
-                gradient.vec<float>(), input.vec<float>(),
-                min.vec<float>(), max.vec<float>(),
+        functor(context->eigen_device<Device>(), gradient.vec<float>(),
+                input.vec<float>(), min.vec<float>(), max.vec<float>(), steps_,
 #ifndef FAKE_QUANT_NO_DEBUG
                 check_min_max->scalar<bool>(),
 #endif
-                grad_wrt_input->vec<float>(),
-                grad_wrt_min->vec<float>(), grad_wrt_max->vec<float>());
+                grad_wrt_input->vec<float>(), grad_wrt_min->vec<float>(),
+                grad_wrt_max->vec<float>());
         break;
       }
       default:
@@ -465,6 +497,7 @@ class FakeQuantWithMinMaxVarsPerChannelGradientOp : public OpKernel {
   }
 
  private:
+  int steps_;
 #ifndef FAKE_QUANT_NO_DEBUG
   PersistentTensor check_min_max_handle_;
 #endif
@@ -480,10 +513,9 @@ REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsPerChannelGradient")
 #if GOOGLE_CUDA
 template <>
 void FakeQuant1WithMinMaxVarsPerChannelFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstVec inputs,
-    typename TTypes<float>::ConstVec min,
-    typename TTypes<float>::ConstVec max,
+    const GPUDevice& d, typename TTypes<float>::ConstVec inputs,
+    typename TTypes<float>::ConstVec min, typename TTypes<float>::ConstVec max,
+    int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -495,7 +527,7 @@ void FakeQuant2WithMinMaxVarsPerChannelFunctor<GPUDevice>::operator()(
     const GPUDevice& d, const Index batch_size, const Index depth,
     typename TTypes<float>::ConstFlat inputs,
     typename TTypes<float>::ConstFlat min,
-    typename TTypes<float>::ConstFlat max,
+    typename TTypes<float>::ConstFlat max, int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -508,7 +540,7 @@ void FakeQuant4WithMinMaxVarsPerChannelFunctor<GPUDevice>::operator()(
     const Index width, const Index depth,
     typename TTypes<float>::ConstFlat inputs,
     typename TTypes<float>::ConstFlat min,
-    typename TTypes<float>::ConstFlat max,
+    typename TTypes<float>::ConstFlat max, int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -523,11 +555,10 @@ REGISTER_KERNEL_BUILDER(Name("FakeQuantWithMinMaxVarsPerChannel")
 
 template <>
 void FakeQuant1WithMinMaxVarsPerChannelGradientFunctor<GPUDevice>::operator()(
-    const GPUDevice& d,
-    typename TTypes<float>::ConstVec gradients,
+    const GPUDevice& d, typename TTypes<float>::ConstVec gradients,
     typename TTypes<float>::ConstVec inputs,
-    typename TTypes<float>::ConstVec min,
-    typename TTypes<float>::ConstVec max,
+    typename TTypes<float>::ConstVec min, typename TTypes<float>::ConstVec max,
+    int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -542,8 +573,8 @@ void FakeQuant2WithMinMaxVarsPerChannelGradientFunctor<GPUDevice>::operator()(
     const GPUDevice& d, const Index batch_size, const Index depth,
     typename TTypes<float>::ConstFlat gradients,
     typename TTypes<float>::ConstFlat inputs,
-    typename TTypes<float>::ConstVec min,
-    typename TTypes<float>::ConstVec max,
+    typename TTypes<float>::ConstVec min, typename TTypes<float>::ConstVec max,
+    int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
@@ -559,8 +590,8 @@ void FakeQuant4WithMinMaxVarsPerChannelGradientFunctor<GPUDevice>::operator()(
     const Index width, const Index depth,
     typename TTypes<float>::ConstFlat gradients,
     typename TTypes<float>::ConstFlat inputs,
-    typename TTypes<float>::ConstVec min,
-    typename TTypes<float>::ConstVec max,
+    typename TTypes<float>::ConstVec min, typename TTypes<float>::ConstVec max,
+    int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
     typename TTypes<bool>::Scalar check_min_max,
 #endif
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index 242eddfb799e14e98fa4034f396f4b903cae9f7c..1aefaec691d1e42e1cd6561a5be0bb682590b5e0 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -35,31 +35,27 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float StdRound(float input) {
 
 namespace tensorflow {
 
-static constexpr int kSteps = 255;
-static constexpr float kStepsFloat = static_cast<float>(kSteps);
-
 // Gymnastics with nudged zero point is to ensure that real zero maps to
 // an integer, which is required for e.g. zero-padding in convolutional layers.
 // Returns (nudged_min, nudged_max, nudged_scale).
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void Nudge(const float min,
-                                                 const float max,
-                                                 float* nudged_min,
-                                                 float* nudged_max,
-                                                 float* scale) {
-  *scale = (max - min) / (kStepsFloat - 0.0f);
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void Nudge(
+    const float min, const float max, const int steps, float* nudged_min,
+    float* nudged_max, float* scale) {
+  const float steps_float = static_cast<float>(steps);
+  *scale = (max - min) / (steps_float - 0.0f);
   const float zero_point_from_min = 0.0f - min / *scale;
-  const uint8 nudged_zero_point = [zero_point_from_min] {
+  const uint8 nudged_zero_point = [zero_point_from_min, steps, steps_float] {
     if (zero_point_from_min < 0.0f) {
       return static_cast<uint8>(0);
-    } else if (zero_point_from_min > kStepsFloat) {
-      return static_cast<uint8>(kSteps);
-    } else {
-      return static_cast<uint8>(StdRound(zero_point_from_min));
     }
+    if (zero_point_from_min > steps_float) {
+      return static_cast<uint8>(steps);
+    }
+    return static_cast<uint8>(StdRound(zero_point_from_min));
   }();
 
   *nudged_min = (0.0f - nudged_zero_point) * (*scale);
-  *nudged_max = (kStepsFloat - nudged_zero_point) * (*scale);
+  *nudged_max = (steps_float - nudged_zero_point) * (*scale);
 }
 
 template <typename T>
@@ -80,13 +76,13 @@ using Flat = typename tensorflow::TTypes<T>::Flat;
 template <typename Device>
 struct FakeQuantWithMinMaxArgsFunctor {
   void operator()(const Device& d, ConstFlat<float> inputs, const float min,
-                  const float max, Flat<float> outputs) {
+                  const float max, const int steps, Flat<float> outputs) {
     eigen_assert(min <= 0.0f && "min should be <= 0.0");
     eigen_assert(max >= 0.0f && "max should be >= 0.0");
     eigen_assert(min < max && "min should be < max");
 
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min, max, &nudged_min, &nudged_max, &nudged_scale);
+    Nudge(min, max, steps, &nudged_min, &nudged_max, &nudged_scale);
     const float inv_nudged_scale = 1.0f / nudged_scale;
 
     auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
@@ -103,13 +99,13 @@ template <typename Device>
 struct FakeQuantWithMinMaxArgsGradientFunctor {
   void operator()(const Device& d, ConstFlat<float> gradients,
                   ConstFlat<float> inputs, const float min, const float max,
-                  Flat<float> backprops) {
+                  const int steps, Flat<float> backprops) {
     eigen_assert(min <= 0.0f && "min should be <= 0.0");
     eigen_assert(max >= 0.0f && "max should be >= 0.0");
     eigen_assert(min < max && "min should be < max");
 
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min, max, &nudged_min, &nudged_max, &nudged_scale);
+    Nudge(min, max, steps, &nudged_min, &nudged_max, &nudged_scale);
 
     auto between_nudged_min_max =
         (inputs >= nudged_min && inputs <= nudged_max)
@@ -124,6 +120,7 @@ template <typename Device>
 struct FakeQuantWithMinMaxVarsFunctor {
   void operator()(const Device& d, ConstFlat<float> inputs,
                   ConstScalar<float> min, ConstScalar<float> max,
+                  const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -138,7 +135,7 @@ struct FakeQuantWithMinMaxVarsFunctor {
 #endif
 
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min(), max(), &nudged_min, &nudged_max, &nudged_scale);
+    Nudge(min(), max(), steps, &nudged_min, &nudged_max, &nudged_scale);
     const auto nudged_scale_repl = inputs.constant(nudged_scale);
 
     const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
@@ -155,7 +152,7 @@ template <typename Device>
 struct FakeQuantWithMinMaxVarsGradientFunctor {
   void operator()(const Device& d, ConstFlat<float> gradients,
                   ConstFlat<float> inputs, ConstScalar<float> min,
-                  ConstScalar<float> max,
+                  ConstScalar<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -172,7 +169,7 @@ struct FakeQuantWithMinMaxVarsGradientFunctor {
 #endif
 
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min(), max(), &nudged_min, &nudged_max, &nudged_scale);
+    Nudge(min(), max(), steps, &nudged_min, &nudged_max, &nudged_scale);
 
     const auto between_min_max =
         (inputs >= nudged_min && inputs <= nudged_max)
@@ -200,7 +197,7 @@ using Index = typename tensorflow::TTypes<float>::ConstTensor::Index;
 template <typename Device>
 struct FakeQuant1WithMinMaxVarsPerChannelFunctor {
   void operator()(const Device& d, ConstVec<float> inputs, ConstVec<float> min,
-                  ConstVec<float> max,
+                  ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -216,7 +213,7 @@ struct FakeQuant1WithMinMaxVarsPerChannelFunctor {
 
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const float clamped =
           std::max(std::min(inputs(i), nudged_max), nudged_min);
       const float clamped_shifted = clamped - nudged_min;
@@ -233,7 +230,7 @@ template <typename Device>
 struct FakeQuant2WithMinMaxVarsPerChannelFunctor {
   void operator()(const Device& d, const Index batch_size, const Index depth,
                   ConstFlat<float> inputs, ConstVec<float> min,
-                  ConstVec<float> max,
+                  ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -251,7 +248,7 @@ struct FakeQuant2WithMinMaxVarsPerChannelFunctor {
     const auto inputs_restored = inputs.reshape(restored);
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const auto clamped =
           inputs_restored.chip<1>(i).cwiseMin(nudged_max).cwiseMax(nudged_min);
       const auto clamped_shifted = clamped - nudged_min;
@@ -269,7 +266,7 @@ template <typename Device>
 struct FakeQuant4WithMinMaxVarsPerChannelFunctor {
   void operator()(const Device& d, const Index batch_size, const Index height,
                   const Index width, const Index depth, ConstFlat<float> inputs,
-                  ConstVec<float> min, ConstVec<float> max,
+                  ConstVec<float> min, ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -287,7 +284,7 @@ struct FakeQuant4WithMinMaxVarsPerChannelFunctor {
     const auto inputs_restored = inputs.reshape(restored);
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const auto clamped =
           inputs_restored.chip<3>(i).cwiseMin(nudged_max).cwiseMax(nudged_min);
       const auto clamped_shifted = clamped - nudged_min;
@@ -308,7 +305,7 @@ template <typename Device>
 struct FakeQuant1WithMinMaxVarsPerChannelGradientFunctor {
   void operator()(const Device& d, ConstVec<float> gradients,
                   ConstVec<float> inputs, ConstVec<float> min,
-                  ConstVec<float> max,
+                  ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -325,7 +322,7 @@ struct FakeQuant1WithMinMaxVarsPerChannelGradientFunctor {
 
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
 
       const bool between_min_max =
           inputs(i) >= nudged_min && inputs(i) <= nudged_max;
@@ -346,7 +343,7 @@ template <typename Device>
 struct FakeQuant2WithMinMaxVarsPerChannelGradientFunctor {
   void operator()(const Device& d, const Index batch_size, const Index depth,
                   ConstFlat<float> gradients, ConstFlat<float> inputs,
-                  ConstVec<float> min, ConstVec<float> max,
+                  ConstVec<float> min, ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -366,7 +363,7 @@ struct FakeQuant2WithMinMaxVarsPerChannelGradientFunctor {
     const auto inputs_restored = inputs.reshape(restored);
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const auto gradients_chip = gradients_restored.chip<1>(i);
       const auto inputs_chip = inputs_restored.chip<1>(i);
 
@@ -399,7 +396,7 @@ struct FakeQuant4WithMinMaxVarsPerChannelGradientFunctor {
   void operator()(const Device& d, const Index batch_size, const Index height,
                   const Index width, const Index depth,
                   ConstFlat<float> gradients, ConstFlat<float> inputs,
-                  ConstVec<float> min, ConstVec<float> max,
+                  ConstVec<float> min, ConstVec<float> max, const int steps,
 #ifndef FAKE_QUANT_NO_DEBUG
                   Scalar<bool> check_min_max,
 #endif
@@ -419,7 +416,7 @@ struct FakeQuant4WithMinMaxVarsPerChannelGradientFunctor {
     const auto inputs_restored = inputs.reshape(restored);
     for (Index i = 0; i < min.size(); ++i) {
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), &nudged_min, &nudged_max, &nudged_scale);
+      Nudge(min(i), max(i), steps, &nudged_min, &nudged_max, &nudged_scale);
       const auto gradients_chip = gradients_restored.chip<3>(i);
       const auto inputs_chip = inputs_restored.chip<3>(i);
 
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 38ad345f0d3b346bf5c33d52d999051738df9401..2be92269655dd149e485ddd14fe7b092a94d776d 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -48,30 +48,94 @@ class QuantOpsTest : public OpsTestBase {
       inputs_.push_back({nullptr, input});
     }
   }
+
+  void RunTestFakeQuantWithMinMaxArgs(const int num_bits, const float min,
+                                      const float max, const TensorShape& shape,
+                                      const gtl::ArraySlice<float>& data,
+                                      gtl::ArraySlice<float> expected_data) {
+    TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
+                     .Input(FakeInput(DT_FLOAT))  // inputs
+                     .Attr("min", min)
+                     .Attr("max", max)
+                     .Attr("num_bits", num_bits)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    // Downstream inputs.
+    AddInputFromArray<float>(shape, data);
+
+    // Tested code.
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor* output = GetOutput(0);
+    Tensor expected(allocator(), DT_FLOAT, shape);
+    FillValues<float>(&expected, expected_data);
+    ExpectClose(expected, *output);
+  }
+
+  void RunTestFakeQuantWithMinMaxVars(const int num_bits, const float min,
+                                      const float max, const TensorShape& shape,
+                                      const gtl::ArraySlice<float>& data,
+                                      gtl::ArraySlice<float> expected_data) {
+    TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
+                     .Input(FakeInput(DT_FLOAT))  // inputs
+                     .Input(FakeInput(DT_FLOAT))  // min
+                     .Input(FakeInput(DT_FLOAT))  // max
+                     .Attr("num_bits", num_bits)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    // Downstream inputs.
+    AddInputFromArray<float>(shape, data);
+    // Min.
+    AddInputFromArray<float>(TensorShape({}), {min});
+    // Max.
+    AddInputFromArray<float>(TensorShape({}), {max});
+
+    // Tested code.
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor* output = GetOutput(0);
+    Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
+    FillValues<float>(&expected, expected_data);
+    ExpectClose(expected, *output);
+  }
+
+  void RunTestFakeQuantWithMinMaxVarsPerChannel(
+      const int num_bits, const TensorShape& minmax_shape,
+      const gtl::ArraySlice<float>& min, const gtl::ArraySlice<float>& max,
+      const TensorShape& shape, const gtl::ArraySlice<float>& data,
+      gtl::ArraySlice<float> expected_data) {
+    TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+                     .Input(FakeInput(DT_FLOAT))  // inputs
+                     .Input(FakeInput(DT_FLOAT))  // min
+                     .Input(FakeInput(DT_FLOAT))  // max
+                     .Attr("num_bits", num_bits)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    // Downstream inputs.
+    AddInputFromArray<float>(shape, data);
+    // Min.
+    AddInputFromArray<float>(minmax_shape, min);
+    // Max.
+    AddInputFromArray<float>(minmax_shape, max);
+
+    // Tested code.
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor* output = GetOutput(0);
+    Tensor expected(allocator(), DT_FLOAT, shape);
+    FillValues<float>(&expected, expected_data);
+    ExpectClose(expected, *output);
+  }
 };
 
 TEST_F(QuantOpsTest, WithArgsNoNudging) {
   // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
   // Original zero point: 40, no nudging necessary.
-  // Expected quantized values: -10.0, -10.25, ..., 53.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Attr("min", -10.0f)
-                   .Attr("max", 53.75f)
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f});
-
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f});
-  ExpectClose(expected, *output);
+  // Expected quantized values: -10.0, -9.75, ..., 53.75.
+  RunTestFakeQuantWithMinMaxArgs(
+      8, -10.0f, 53.75f, TensorShape({2, 3}),
+      {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f},
+      {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f});
 }
 
 TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0) {
@@ -79,23 +143,9 @@ TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0) {
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged range: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Attr("min", -0.1f)
-                   .Attr("max", 63.65f)
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f});
-
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+  RunTestFakeQuantWithMinMaxArgs(8, -0.1f, 63.65f, TensorShape({2, 3}),
+                                 {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f},
+                                 {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
 }
 
 TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1) {
@@ -103,23 +153,9 @@ TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1) {
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged range: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Attr("min", -0.125f)
-                   .Attr("max", 63.625f)
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f});
-
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
-
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
-  ExpectClose(expected, *output);
+  RunTestFakeQuantWithMinMaxArgs(8, -0.125f, 63.625f, TensorShape({2, 3}),
+                                 {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f},
+                                 {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
 }
 
 TEST_F(QuantOpsTest, WithArgsNudgedZeroIs255) {
@@ -127,23 +163,78 @@ TEST_F(QuantOpsTest, WithArgsNudgedZeroIs255) {
   // Scale: 1/4,  original zero point: 254.6, nudged to 255.
   // Nudged range: [-63.75; 0.0].
   // Expected quantized values: -63.75, -63.5, -63.25, ..., 0.0.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Attr("min", -63.65f)
-                   .Attr("max", 0.1f)
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-63.8f, -63.75f, -63.7f, -63.5f, 0.0f, 0.1f});
+  RunTestFakeQuantWithMinMaxArgs(
+      8, -63.65f, 0.1f, TensorShape({2, 3}),
+      {-63.8f, -63.75f, -63.7f, -63.5f, 0.0f, 0.1f},
+      {-63.75f, -63.75f, -63.75f, -63.5f, 0.0f, 0.0f});
+}
 
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
+TEST_F(QuantOpsTest, WithArgsNoNudging_4Bits) {
+  // Original quantization range: [-6 + 0 / 2, -6 + 15 / 2], scale: 1/2.
+  // Original zero point: 12, no nudging necessary.
+  // Expected quantized values: -6, -5.5, ..., 1.5.
+  RunTestFakeQuantWithMinMaxArgs(4, -6.0f, 1.5f, TensorShape({2, 3}),
+                                 {-6.1f, -6.0f, -5.9f, -5.5f, 1.5f, 1.6f},
+                                 {-6.0f, -6.0f, -6.0f, -5.5f, 1.5f, 1.5f});
+}
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {-63.75f, -63.75f, -63.75f, -63.5f, 0.0f, 0.0f});
-  ExpectClose(expected, *output);
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxArgs(4, -0.1f, 7.4f, TensorShape({2, 3}),
+                                 {-0.1f, 0.0f, 0.1f, 0.5f, 7.5f, 7.7f},
+                                 {0.0f, 0.0f, 0.0f, 0.5f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxArgs(4, -0.4f, 7.1f, TensorShape({2, 3}),
+                                 {-0.51f, -0.5f, -0.24f, 0.0f, 7.0f, 7.1f},
+                                 {-0.5f, -0.5f, 0.0f, 0.0f, 7.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs15_4Bits) {
+  // Original quantization range: [0.4 / 2 - 15 / 2, 0.4 / 2 + 0 / 2].
+  // Scale: 1/2,  original zero point: 14.6, nudged to 15.
+  // Nudged range: [-7.5; 0.0].
+  // Expected quantized values: -7.5, -7.0, ..., 0.0.
+  RunTestFakeQuantWithMinMaxArgs(4, -7.3f, 0.2f, TensorShape({2, 3}),
+                                 {-7.6f, -7.5f, -7.4f, -7.2f, 0.0f, 0.1f},
+                                 {-7.5f, -7.5f, -7.5f, -7.0f, 0.0f, 0.0f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNoNudging_2Bits) {
+  // Original quantization range: [-1 + 0 / 2, -1 + 3 / 2], scale: 1/2.
+  // Original zero point: 2, no nudging necessary.
+  // Expected quantized values: -1.0, -0.5, 0.0, 0.5.
+  RunTestFakeQuantWithMinMaxArgs(2, -1.0f, 0.5f, TensorShape({2, 3}),
+                                 {-1.1f, -1.0f, -0.9f, -0.3f, 0.1f, 1.0f},
+                                 {-1.0f, -1.0f, -1.0f, -0.5f, 0.0f, 0.5f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs0_2Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 3 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 1.5].
+  // Expected quantized values: 0.0, 0.5, 1.0, 1.5.
+  RunTestFakeQuantWithMinMaxArgs(2, -0.1f, 1.4f, TensorShape({2, 3}),
+                                 {-0.2f, 0.1f, 0.7f, 1.0f, 1.3f, 1.6f},
+                                 {0.0f, 0.0f, 0.5f, 1.0f, 1.5f, 1.5f});
+}
+
+TEST_F(QuantOpsTest, WithArgsNudgedZeroIs1_2Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 3 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 1.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, 1.0.
+  RunTestFakeQuantWithMinMaxArgs(2, -0.4f, 1.1f, TensorShape({2, 3}),
+                                 {-0.51f, -0.5f, -0.24f, 0.0f, 1.0f, 1.1f},
+                                 {-0.5f, -0.5f, 0.0f, 0.0f, 1.0f, 1.0f});
 }
 
 TEST_F(QuantOpsTest, WithArgsGradient) {
@@ -176,74 +267,130 @@ TEST_F(QuantOpsTest, WithArgsGradient) {
   ExpectClose(expected, *output);
 }
 
-TEST_F(QuantOpsTest, WithVarsNoNudging) {
-  // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
-  // Original zero point: 40, no nudging necessary.
-  // Expected quantized values: -10.0, -10.25, ..., 53.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
+TEST_F(QuantOpsTest, WithArgsGradient_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgsGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradient
                    .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Input(FakeInput(DT_FLOAT))  // min
-                   .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("min", -0.4f)
+                   .Attr("max", 7.1f)
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f});
-  // Min.
-  AddInputFromArray<float>(TensorShape({}), {-10.0f});
-  // Max.
-  AddInputFromArray<float>(TensorShape({}), {53.75f});
+                           {-0.6f, -0.5f, -0.4f, 0.0f, 7.0f, 7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
   Tensor* output = GetOutput(0);
+  auto input_flat = GetInput(0).flat<float>();
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f});
+  FillValues<float>(&expected, {0.0f, input_flat(1), input_flat(2),
+                                input_flat(3), input_flat(4), 0.0f});
   ExpectClose(expected, *output);
 }
 
+TEST_F(QuantOpsTest, WithVarsNoNudging) {
+  // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
+  // Original zero point: 40, no nudging necessary.
+  // Expected quantized values: -10.0, -10.25, ..., 53.75.
+  RunTestFakeQuantWithMinMaxVars(
+      8, -10.0f, 53.75f, TensorShape({2, 3}),
+      {-10.1f, -10.0f, -9.9f, -9.75f, 53.75f, 53.8f},
+      {-10.0f, -10.0f, -10.0f, -9.75f, 53.75f, 53.75f});
+}
+
 TEST_F(QuantOpsTest, WithVarsNudgedZeroIs0) {
   // Original quantization range: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged range: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
-                   .Input(FakeInput(DT_FLOAT))  // inputs
-                   .Input(FakeInput(DT_FLOAT))  // min
-                   .Input(FakeInput(DT_FLOAT))  // max
-                   .Finalize(node_def()));
-  TF_EXPECT_OK(InitOp());
-  // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f});
-  // Min.
-  AddInputFromArray<float>(TensorShape({}), {-0.1f});
-  // Max.
-  AddInputFromArray<float>(TensorShape({}), {63.65f});
+  RunTestFakeQuantWithMinMaxVars(8, -0.1f, 63.65f, TensorShape({2, 3}),
+                                 {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f},
+                                 {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
+}
 
-  // Tested code.
-  TF_ASSERT_OK(RunOpKernel());
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1) {
+  // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
+  // Nudged range: [-0.25; 63.5].
+  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+  RunTestFakeQuantWithMinMaxVars(8, -0.125f, 63.625f, TensorShape({2, 3}),
+                                 {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f},
+                                 {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
+}
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs255) {
+  // Original quantization range: [0.4 / 4 - 255 / 4, 0.4 / 4 + 0 / 4].
+  // Scale: 1/4,  original zero point: 254.6, nudged to 255.
+  // Nudged range: [-63.75; 0.0].
+  // Expected quantized values: -63.75, -63.5, -63.25, ..., 0.0.
+  RunTestFakeQuantWithMinMaxVars(
+      8, -63.65f, 0.1f, TensorShape({2, 3}),
+      {-63.8f, -63.75f, -63.7f, -63.5f, 0.0f, 0.1f},
+      {-63.75f, -63.75f, -63.75f, -63.5f, 0.0f, 0.0f});
 }
 
-TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1) {
+TEST_F(QuantOpsTest, WithVarsNoNudging_4Bits) {
+  // Original quantization range: [-6 + 0 / 2, -6 + 15 / 2], scale: 1/2.
+  // Original zero point: 12, no nudging necessary.
+  // Expected quantized values: -6, -5.5, ..., 1.5.
+  RunTestFakeQuantWithMinMaxVars(4, -6.0f, 1.5f, TensorShape({2, 3}),
+                                 {-6.1f, -6.0f, -5.9f, -5.5f, 1.5f, 1.6f},
+                                 {-6.0f, -6.0f, -6.0f, -5.5f, 1.5f, 1.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxVars(4, -0.1f, 7.4f, TensorShape({2, 3}),
+                                 {-0.1f, 0.0f, 0.1f, 0.5f, 7.5f, 7.7f},
+                                 {0.0f, 0.0f, 0.0f, 0.5f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxVars(4, -0.4f, 7.1f, TensorShape({2, 3}),
+                                 {-0.51f, -0.5f, -0.24f, 0.0f, 7.0f, 7.1f},
+                                 {-0.5f, -0.5f, 0.0f, 0.0f, 7.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsNudgedZeroIs15_4Bits) {
+  // Original quantization range: [0.4 / 2 - 15 / 2, 0.4 / 2 + 0 / 2].
+  // Scale: 1/2,  original zero point: 14.6, nudged to 15.
+  // Nudged range: [-7.5; 0.0].
+  // Expected quantized values: -7.5, -7.0, ..., 0.0.
+  RunTestFakeQuantWithMinMaxVars(4, -7.3f, 0.2f, TensorShape({2, 3}),
+                                 {-7.6f, -7.5f, -7.4f, -7.2f, 0.0f, 0.1f},
+                                 {-7.5f, -7.5f, -7.5f, -7.0f, 0.0f, 0.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsGradient) {
   // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged range: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f});
@@ -255,34 +402,46 @@ TEST_F(QuantOpsTest, WithVarsNudgedZeroIs1) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected,
-                    {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  auto in_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input, {0.0f, in_flat(1), in_flat(2),
+                                                in_flat(3), in_flat(4), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({}));
+  expected_bprop_wrt_min.flat<float>()(0) = in_flat(0);
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({}));
+  expected_bprop_wrt_max.flat<float>()(0) = in_flat(5);
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsGradient) {
-  // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
-  // Nudged range: [-0.25; 63.5].
-  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+TEST_F(QuantOpsTest, WithVarsGradient_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f});
+                           {-0.6f, -0.5f, -0.4f, 0.0f, 7.0f, 7.1f});
   // Min.
-  AddInputFromArray<float>(TensorShape({}), {-0.125f});
+  AddInputFromArray<float>(TensorShape({}), {-0.4f});
   // Max.
-  AddInputFromArray<float>(TensorShape({}), {63.625f});
+  AddInputFromArray<float>(TensorShape({}), {7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -290,10 +449,8 @@ TEST_F(QuantOpsTest, WithVarsGradient) {
   Tensor* output_bprop_wrt_input = GetOutput(0);
   Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
   auto in_flat = GetInput(0).flat<float>();
-  FillValues<float>(&expected_bprop_wrt_input,
-                    {0.0f, in_flat(1),
-                     in_flat(2), in_flat(3),
-                     in_flat(4), 0.0f});
+  FillValues<float>(&expected_bprop_wrt_input, {0.0f, in_flat(1), in_flat(2),
+                                                in_flat(3), in_flat(4), 0.0f});
   ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
 
   Tensor* output_bprop_wrt_min = GetOutput(1);
@@ -312,12 +469,182 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs0) {
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged ranges: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f},
+      {63.65f, 63.65f, 63.65f, 63.65f}, TensorShape({4}),
+      {-0.1f, 0.0f, 63.75f, 63.8f}, {0.0f, 0.0f, 63.75f, 63.75f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1) {
+  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
+  // Nudged ranges: [-0.25; 63.5].
+  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({4}), {-0.125f, -0.125f, -0.125f, -0.125f},
+      {63.625f, 63.625f, 63.625f, 63.625f}, TensorShape({4}),
+      {-0.26f, -0.25f, -0.24f, 63.6f}, {-0.25f, -0.25f, -0.25f, 63.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0) {
+  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
+  // Nudged ranges: [0.0; 63.75].
+  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({3}), {-0.1f, -0.1f, -0.1f}, {63.65f, 63.65f, 63.65f},
+      TensorShape({2, 3}), {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f},
+      {0.0f, 0.0f, 0.0f, 0.25f, 63.75f, 63.75f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1) {
+  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
+  // Nudged ranges: [-0.25; 63.5].
+  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({3}), {-0.125f, -0.125f, -0.125f},
+      {63.625f, 63.625f, 63.625f}, TensorShape({2, 3}),
+      {-0.26f, -0.25f, -0.24f, 0.0f, 63.5f, 63.6f},
+      {-0.25f, -0.25f, -0.25f, 0.0f, 63.5f, 63.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0) {
+  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
+  // Nudged ranges: [0.0; 63.75].
+  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f},
+      {63.65f, 63.65f, 63.65f, 63.65f}, TensorShape({1, 2, 3, 4}),
+      {-0.1f, 0.0f,   0.1f,   0.25f,  0.5f,   0.75f,
+       1.0f,  1.25f,  1.5f,   1.75f,  2.0f,   2.25f,
+
+       63.0f, 63.25f, 63.5f,  63.7f,  63.75f, 63.8f,
+       63.9f, 100.0f, 100.0f, 100.0f, 100.0f, 1000.0f},
+      {0.0f,   0.0f,   0.0f,   0.25f,  0.5f,   0.75f,
+       1.0f,   1.25f,  1.5f,   1.75f,  2.0f,   2.25f,
+
+       63.0f,  63.25f, 63.5f,  63.75f, 63.75f, 63.75f,
+       63.75f, 63.75f, 63.75f, 63.75f, 63.75f, 63.75f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1) {
+  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
+  // Nudged ranges: [-0.25; 63.5].
+  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, TensorShape({4}), {-0.125f, -0.125f, -0.125f, -0.125f},
+      {63.625f, 63.625f, 63.625f, 63.625f}, TensorShape({1, 2, 3, 4}),
+      {-0.3f,  -0.25f, -0.2f,  0.0f,   0.25f,  0.5f,
+       0.75f,  1.0f,   1.25f,  1.5f,   1.75f,  2.0f,
+
+       63.0f,  63.25f, 63.4f,  63.5f,  63.6f,  63.7f,
+       100.0f, 100.0f, 100.0f, 100.0f, 100.0f, 1000.0f},
+      {-0.25f, -0.25f, -0.25f, 0.0f,  0.25f, 0.5f,
+       0.75f,  1.0f,   1.25f,  1.5f,  1.75f, 2.0f,
+
+       63.0f,  63.25f, 63.5f,  63.5f, 63.5f, 63.5f,
+       63.5f,  63.5f,  63.5f,  63.5f, 63.5f, 63.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f},
+      {7.4f, 7.4f, 7.4f, 7.4f}, TensorShape({4}), {-0.1f, 0.0f, 7.5f, 7.6f},
+      {0.0f, 0.0f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({4}), {-0.4f, -0.4f, -0.4f, -0.4f},
+      {7.1f, 7.1f, 7.1f, 7.1f}, TensorShape({4}), {-0.51f, -0.5f, -0.24f, 7.1f},
+      {-0.5f, -0.5f, -0.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({3}), {-0.1f, -0.1f, -0.1f}, {7.4f, 7.4f, 7.4f},
+      TensorShape({2, 3}), {-0.1f, 0.0f, 0.1f, 0.5f, 7.5f, 7.6f},
+      {0.0f, 0.0f, 0.0f, 0.5f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({3}), {-0.4f, -0.4f, -0.4f}, {7.1f, 7.1f, 7.1f},
+      TensorShape({2, 3}), {-0.51f, -0.5f, -0.24f, 0.0f, 7.0f, 7.1f},
+      {-0.5f, -0.5f, -0.0f, 0.0f, 7.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f},
+      {7.4f, 7.4f, 7.4f, 7.4f}, TensorShape({1, 2, 3, 4}),
+      {-0.1f, 0.0f,   0.1f,   0.5f,   1.0f,   1.5f,
+       1.5f,  2.0f,   2.5f,   3.0f,   3.5f,   4.0f,
+
+       6.0f,  6.5f,   7.0f,   7.4f,   7.5f,   7.7f,
+       7.8f,  100.0f, 100.0f, 100.0f, 100.0f, 1000.0f},
+      {0.0f, 0.0f, 0.0f, 0.5f, 1.0f, 1.5f, 1.5f, 2.0f, 2.5f, 3.0f, 3.5f, 4.0f,
+
+       6.0f, 6.5f, 7.0f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f, 7.5f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      4, TensorShape({4}), {-0.4f, -0.4f, -0.4f, -0.4f},
+      {7.1f, 7.1f, 7.1f, 7.1f}, TensorShape({1, 2, 3, 4}),
+      {-0.6f,  -0.5f,  -0.4f,  0.0f,   0.5f,   1.0f,
+       1.5f,   2.0f,   2.5f,   3.0f,   3.5f,   4.0f,
+
+       6.0f,   6.5f,   6.9f,   7.0f,   7.1f,   7.7f,
+       100.0f, 100.0f, 100.0f, 100.0f, 100.0f, 1000.0f},
+      {-0.5f, -0.5f, -0.5f, 0.0f, 0.5f, 1.0f,
+       1.5f,  2.0f,  2.5f,  3.0f, 3.5f, 4.0f,
+
+       6.0f,  6.5f,  7.0f,  7.0f, 7.0f, 7.0f,
+       7.0f,  7.0f,  7.0f,  7.0f, 7.0f, 7.0f});
+}
+
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0) {
+  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
+  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
+  // Nudged ranges: [0.0; 63.75].
+  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({4}), {-0.1f, 0.0f, 63.75f, 63.8f});
   // Min.
@@ -328,25 +655,40 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs0) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({4}));
-  FillValues<float>(&expected, {0.0f, 0.0f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1), grad_flat(2), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, 0.0f, grad_flat(3)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1) {
   // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged ranges: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({4}));
   // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({4}), {-0.26f, -0.25f, -0.24f, 63.6f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.3f, -0.25f, 63.5f, 63.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}),
                            {-0.125f, -0.125f, -0.125f, -0.125f});
@@ -357,27 +699,41 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedZeroIs1) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({4}));
-  FillValues<float>(&expected, {-0.25f, -0.25f, -0.25f, 63.5f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1), grad_flat(2), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, 0.0f, grad_flat(3)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0) {
   // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged ranges: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.1f, 0.0f, 0.1f,
-                           0.25f, 63.75f, 63.8f});
+                           {-0.1f, 0.0f, 0.1f, 0.25f, 63.75f, 63.8f});
   // Min.
   AddInputFromArray<float>(TensorShape({3}), {-0.1f, -0.1f, -0.1f});
   // Max.
@@ -386,28 +742,42 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs0) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {0.0f, 0.0f, 0.0f,
-                                0.25f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1) {
   // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged ranges: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.26f, -0.25f, -0.24f,
-                            0.0f, 63.5f, 63.6f});
+                           {-0.3f, -0.25f, -0.2f, 0.0f, 63.5f, 63.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({3}), {-0.125f, -0.125f, -0.125f});
   // Max.
@@ -416,33 +786,46 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2NudgedZeroIs1) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
-  FillValues<float>(&expected, {-0.25f, -0.25f, -0.25f,
-                                0.0f, 63.5f, 63.5f});
-  ExpectClose(expected, *output);
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {0.0f, grad_flat(1), grad_flat(2), grad_flat(3), grad_flat(4), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({3}));
+  FillValues<float>(&expected_bprop_wrt_min, {grad_flat(0), 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({3}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, grad_flat(5)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0) {
   // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
   // Nudged ranges: [0.0; 63.75].
   // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({1, 2, 3, 4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {-0.1f, 0.0f, 0.1f, 0.25f,
-                             0.5f, 0.75f, 1.0f, 1.25f,
-                             1.5f, 1.75f, 2.0f, 2.25f,
+                           {-0.1f,  0.0f,  63.75f, 63.8f, -0.1f,  0.0f,
+                            63.75f, 63.8f, -0.1f,  0.0f,  63.75f, 63.8f,
 
-                             63.0f,  63.25f, 63.5f,   63.7f,
-                             63.75f, 63.8f,  63.9f,  100.0f,
-                            100.0f, 100.0f, 100.0f, 1000.0f});
+                            -0.1f,  0.0f,  63.75f, 63.8f, -0.1f,  0.0f,
+                            63.75f, 63.8f, -0.1f,  0.0f,  63.75f, 63.8f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f});
   // Max.
@@ -451,39 +834,58 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs0) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4}));
-  FillValues<float>(&expected,
-                    {0.0f, 0.0f,  0.0f, 0.25f,
-                     0.5f, 0.75f, 1.0f, 1.25f,
-                     1.5f, 1.75f, 2.0f, 2.25f,
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
+                                  TensorShape({1, 2, 3, 4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1),  grad_flat(2),  0.0f,
+                     0.0f, grad_flat(5),  grad_flat(6),  0.0f,
+                     0.0f, grad_flat(9),  grad_flat(10), 0.0f,
 
-                     63.0f,  63.25f, 63.5f,  63.75f,
-                     63.75f, 63.75f, 63.75f, 63.75f,
-                     63.75f, 63.75f, 63.75f, 63.75f});
-  ExpectClose(expected, *output);
+                     0.0f, grad_flat(13), grad_flat(14), 0.0f,
+                     0.0f, grad_flat(17), grad_flat(18), 0.0f,
+                     0.0f, grad_flat(21), grad_flat(22), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min,
+                    {grad_flat(0) + grad_flat(4) + grad_flat(8) +
+                         grad_flat(12) + grad_flat(16) + grad_flat(20),
+                     0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max,
+                    {0.0f, 0.0f, 0.0f,
+                     grad_flat(3) + grad_flat(7) + grad_flat(11) +
+                         grad_flat(15) + grad_flat(19) + grad_flat(23)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1) {
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs1) {
   // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
   // Nudged ranges: [-0.25; 63.5].
   // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
-  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({1, 2, 3, 4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {-0.3f, -0.25f, -0.2f,  0.0f,
-                             0.25f, 0.5f,   0.75f, 1.0f,
-                             1.25f, 1.5f,   1.75f, 2.0f,
+                           {-0.3f, -0.25f, 63.5f, 63.6f,  -0.3f, -0.25f,
+                            63.5f, 63.6f,  -0.3f, -0.25f, 63.5f, 63.6f,
 
-                             63.0f,  63.25f, 63.4f,   63.5f,
-                             63.6f,  63.7f, 100.0f,  100.0f,
-                            100.0f, 100.0f, 100.0f, 1000.0f});
+                            -0.3f, -0.25f, 63.5f, 63.6f,  -0.3f, -0.25f,
+                            63.5f, 63.6f,  -0.3f, -0.25f, 63.5f, 63.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}),
                            {-0.125f, -0.125f, -0.125f, -0.125f});
@@ -494,39 +896,58 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedZeroIs1) {
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4}));
-  FillValues<float>(&expected,
-                    {-0.25f, -0.25f, -0.25f, 0.0f,
-                      0.25f,  0.5f,   0.75f, 1.0f,
-                      1.25f,  1.5f,   1.75f, 2.0f,
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT,
+                                  TensorShape({1, 2, 3, 4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {0.0f, grad_flat(1),  grad_flat(2),  0.0f,
+                     0.0f, grad_flat(5),  grad_flat(6),  0.0f,
+                     0.0f, grad_flat(9),  grad_flat(10), 0.0f,
 
-                      63.0f, 63.25f, 63.5f, 63.5f,
-                      63.5f, 63.5f,  63.5f, 63.5f,
-                      63.5f, 63.5f,  63.5f, 63.5f});
-  ExpectClose(expected, *output);
+                     0.0f, grad_flat(13), grad_flat(14), 0.0f,
+                     0.0f, grad_flat(17), grad_flat(18), 0.0f,
+                     0.0f, grad_flat(21), grad_flat(22), 0.0f});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min,
+                    {grad_flat(0) + grad_flat(4) + grad_flat(8) +
+                         grad_flat(12) + grad_flat(16) + grad_flat(20),
+                     0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max,
+                    {0.0f, 0.0f, 0.0f,
+                     grad_flat(3) + grad_flat(7) + grad_flat(11) +
+                         grad_flat(15) + grad_flat(19) + grad_flat(23)});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0) {
-  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
-  // Nudged ranges: [0.0; 63.75].
-  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({4}));
   // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({4}), {-0.1f, 0.0f, 63.75f, 63.8f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.1f, 0.0f, 7.5f, 7.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f});
   // Max.
-  AddInputFromArray<float>(TensorShape({4}), {63.65f, 63.65f, 63.65f, 63.65f});
+  AddInputFromArray<float>(TensorShape({4}), {7.4f, 7.4f, 7.4f, 7.4f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -551,28 +972,27 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs0) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1) {
-  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
-  // Nudged ranges: [-0.25; 63.5].
-  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({4}));
   // Downstream inputs.
-  AddInputFromArray<float>(TensorShape({4}), {-0.3f, -0.25f, 63.5f, 63.6f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.6f, -0.5f, 7.0f, 7.1f});
   // Min.
-  AddInputFromArray<float>(TensorShape({4}),
-                           {-0.125f, -0.125f, -0.125f, -0.125f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.4f, -0.4f, -0.4f, -0.4f});
   // Max.
-  AddInputFromArray<float>(TensorShape({4}),
-                           {63.625f, 63.625f, 63.625f, 63.625f});
+  AddInputFromArray<float>(TensorShape({4}), {7.1f, 7.1f, 7.1f, 7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -597,28 +1017,28 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedZeroIs1) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0) {
-  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
-  // Nudged ranges: [0.0; 63.75].
-  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.1f, 0.0f, 0.1f,
-                            0.25f, 63.75f, 63.8f});
+                           {-0.1f, 0.0f, 0.1f, 0.5f, 7.5f, 7.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({3}), {-0.1f, -0.1f, -0.1f});
   // Max.
-  AddInputFromArray<float>(TensorShape({3}), {63.65f, 63.65f, 63.65f});
+  AddInputFromArray<float>(TensorShape({3}), {7.4f, 7.4f, 7.4f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -644,28 +1064,28 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs0) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1) {
-  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
-  // Nudged ranges: [-0.25; 63.5].
-  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({2, 3}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({2, 3}),
-                           {-0.3f, -0.25f, -0.2f,
-                            0.0f, 63.5f, 63.6f});
+                           {-0.6f, -0.5f, -0.4f, 0.0f, 7.0f, 7.1f});
   // Min.
-  AddInputFromArray<float>(TensorShape({3}), {-0.125f, -0.125f, -0.125f});
+  AddInputFromArray<float>(TensorShape({3}), {-0.4f, -0.4f, -0.4f});
   // Max.
-  AddInputFromArray<float>(TensorShape({3}), {63.625f, 63.625f, 63.625f});
+  AddInputFromArray<float>(TensorShape({3}), {7.1f, 7.1f, 7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -691,33 +1111,32 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim2GradientNudgedZeroIs1) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0) {
-  // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.4, nudged to 0.
-  // Nudged ranges: [0.0; 63.75].
-  // Expected quantized values: 0.0, 0.25, 0.5, ..., 63.75.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0_4Bits) {
+  // Original quantization range: [-0.2 / 2 + 0 / 2, -0.2 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.2, nudged to 0.
+  // Nudged range: [0.0; 7.5].
+  // Expected quantized values: 0.0, 0.5, ..., 7.5.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({1, 2, 3, 4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {-0.1f, 0.0f, 63.75f, 63.8f,
-                            -0.1f, 0.0f, 63.75f, 63.8f,
-                            -0.1f, 0.0f, 63.75f, 63.8f,
+                           {-0.1f, 0.0f, 7.5f,  7.6f, -0.1f, 0.0f,
+                            7.5f,  7.6f, -0.1f, 0.0f, 7.5f,  7.6f,
 
-                            -0.1f, 0.0f, 63.75f, 63.8f,
-                            -0.1f, 0.0f, 63.75f, 63.8f,
-                            -0.1f, 0.0f, 63.75f, 63.8f});
+                            -0.1f, 0.0f, 7.5f,  7.6f, -0.1f, 0.0f,
+                            7.5f,  7.6f, -0.1f, 0.0f, 7.5f,  7.6f});
   // Min.
   AddInputFromArray<float>(TensorShape({4}), {-0.1f, -0.1f, -0.1f, -0.1f});
   // Max.
-  AddInputFromArray<float>(TensorShape({4}), {63.65f, 63.65f, 63.65f, 63.65f});
+  AddInputFromArray<float>(TensorShape({4}), {7.4f, 7.4f, 7.4f, 7.4f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
@@ -754,35 +1173,32 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs0) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
-TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs1) {
-  // Original quantization ranges: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
-  // Scale: 1/4,  original zero point: 0.5, nudged to 1.
-  // Nudged ranges: [-0.25; 63.5].
-  // Expected quantized values: -0.25, 0.0, 0.25, ..., 63.5.
+TEST_F(QuantOpsTest, WithVarsPerChannelDim4GradientNudgedZeroIs1_4Bits) {
+  // Original quantization range: [-0.8 / 2 + 0 / 2, -0.8 / 2 + 15 / 2].
+  // Scale: 1/2,  original zero point: 0.8, nudged to 1.
+  // Nudged range: [-0.5; 7.0].
+  // Expected quantized values: -0.5, 0.0, 0.5, ..., 7.0.
   TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
                    .Input(FakeInput(DT_FLOAT))  // gradients
                    .Input(FakeInput(DT_FLOAT))  // inputs
                    .Input(FakeInput(DT_FLOAT))  // min
                    .Input(FakeInput(DT_FLOAT))  // max
+                   .Attr("num_bits", 4)
                    .Finalize(node_def()));
   TF_EXPECT_OK(InitOp());
   // Upstream gradients.
   AddRandomInput(TensorShape({1, 2, 3, 4}));
   // Downstream inputs.
   AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {-0.3f, -0.25f, 63.5f, 63.6f,
-                            -0.3f, -0.25f, 63.5f, 63.6f,
-                            -0.3f, -0.25f, 63.5f, 63.6f,
+                           {-0.6f, -0.5f, 7.0f,  7.1f,  -0.6f, -0.5f,
+                            7.0f,  7.1f,  -0.6f, -0.5f, 7.0f,  7.1f,
 
-                            -0.3f, -0.25f, 63.5f, 63.6f,
-                            -0.3f, -0.25f, 63.5f, 63.6f,
-                            -0.3f, -0.25f, 63.5f, 63.6f});
+                            -0.6f, -0.5f, 7.0f,  7.1f,  -0.6f, -0.5f,
+                            7.0f,  7.1f,  -0.6f, -0.5f, 7.0f,  7.1f});
   // Min.
-  AddInputFromArray<float>(TensorShape({4}),
-                           {-0.125f, -0.125f, -0.125f, -0.125f});
+  AddInputFromArray<float>(TensorShape({4}), {-0.4f, -0.4f, -0.4f, -0.4f});
   // Max.
-  AddInputFromArray<float>(TensorShape({4}),
-                           {63.625f, 63.625f, 63.625f, 63.625f});
+  AddInputFromArray<float>(TensorShape({4}), {7.1f, 7.1f, 7.1f, 7.1f});
 
   // Tested code.
   TF_ASSERT_OK(RunOpKernel());
diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc
index c4cfd514c3a73ca4e9bebfaad816e874a4710b32..21e6c694642ab5ad26e318f1e7056f031128c5d0 100644
--- a/tensorflow/core/kernels/fft_ops.cc
+++ b/tensorflow/core/kernels/fft_ops.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-// See docs in ../ops/fft_ops.cc.
+// See docs in ../ops/spectral_ops.cc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op.h"
@@ -29,22 +29,13 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
+#endif
 
 namespace tensorflow {
 
-namespace {
-// TODO(vrv/zhifengc): Refactor AsDeviceMemory() into GPUUtil.
-template <typename T>
-perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
-  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
-  perftools::gputools::DeviceMemory<T> typed(wrapped);
-  return typed;
-}
-}  // end namespace
-
-class FFTGPUBase : public OpKernel {
+class FFTBase : public OpKernel {
  public:
-  explicit FFTGPUBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  explicit FFTBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor& in = ctx->input(0);
@@ -97,9 +88,110 @@ class FFTGPUBase : public OpKernel {
   virtual bool IsForward() const = 0;
   virtual bool IsReal() const = 0;
 
- private:
+  // The function that actually computes the FFT.
+  virtual void DoFFT(OpKernelContext* ctx, const Tensor& in, uint64* fft_shape,
+                     Tensor* out) = 0;
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <bool Forward, bool _Real, int FFTRank>
+class FFTCPU : public FFTBase {
+ public:
+  using FFTBase::FFTBase;
+
+ protected:
+  int Rank() const override { return FFTRank; }
+  bool IsForward() const override { return Forward; }
+  bool IsReal() const override { return _Real; }
+
   void DoFFT(OpKernelContext* ctx, const Tensor& in, uint64* fft_shape,
-             Tensor* out) {
+             Tensor* out) override {
+    // Create the axes (which are always trailing).
+    auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank);
+    auto device = ctx->eigen_device<CPUDevice>();
+
+    if (!IsReal()) {
+      auto input = ((Tensor)in).flat_inner_dims<complex64, FFTRank + 1>();
+      // Compute the FFT using eigen.
+      auto output = out->flat_inner_dims<complex64, FFTRank + 1>();
+      output.device(device) = input.template fft < Eigen::BothParts,
+      Forward ? Eigen::FFT_FORWARD : Eigen::FFT_REVERSE > (axes);
+    } else {
+      if (IsForward()) {
+        auto input = ((Tensor)in).flat_inner_dims<float, FFTRank + 1>();
+        auto output = out->flat_inner_dims<complex64, FFTRank + 1>();
+        Eigen::DSizes<Eigen::DenseIndex, FFTRank + 1> startIndices;
+
+        // Compute the full FFT using a temporary tensor.
+        Tensor temp;
+        OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<complex64>::v(),
+                                               in.shape(), &temp));
+        auto full_fft = temp.flat_inner_dims<complex64, FFTRank + 1>();
+        full_fft.device(device) =
+            input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(axes);
+
+        // Slice away the negative frequency components.
+        output.device(device) =
+            full_fft.slice(startIndices, output.dimensions());
+      } else {
+        // TODO: reconstruct the full fft and take the inverse.
+        ctx->CtxFailureWithWarning(
+            errors::Unimplemented("IRFFT is not implemented as a CPU kernel"));
+      }
+    }
+  }
+};
+
+// Use labels to distinguish between internal and open source versions
+// of these kernels.
+#ifdef PLATFORM_GOOGLE
+#define FFT_LABEL "eigen"
+#else
+#define FFT_LABEL ""
+#endif
+
+REGISTER_KERNEL_BUILDER(Name("FFT").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<true, false, 1>);
+REGISTER_KERNEL_BUILDER(Name("IFFT").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<false, false, 1>);
+REGISTER_KERNEL_BUILDER(Name("FFT2D").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<true, false, 2>);
+REGISTER_KERNEL_BUILDER(Name("IFFT2D").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<false, false, 2>);
+REGISTER_KERNEL_BUILDER(Name("FFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<true, false, 3>);
+REGISTER_KERNEL_BUILDER(Name("IFFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<false, false, 3>);
+
+REGISTER_KERNEL_BUILDER(Name("RFFT").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<true, true, 1>);
+REGISTER_KERNEL_BUILDER(Name("RFFT2D").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<true, true, 2>);
+REGISTER_KERNEL_BUILDER(Name("RFFT3D").Device(DEVICE_CPU).Label(FFT_LABEL),
+                        FFTCPU<true, true, 3>);
+
+#undef FFT_LABEL
+
+#if GOOGLE_CUDA
+
+namespace {
+// TODO(vrv/zhifengc): Refactor AsDeviceMemory() into GPUUtil.
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory) {
+  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory));
+  perftools::gputools::DeviceMemory<T> typed(wrapped);
+  return typed;
+}
+}  // end namespace
+
+class FFTGPUBase : public FFTBase {
+ public:
+  using FFTBase::FFTBase;
+
+ protected:
+  void DoFFT(OpKernelContext* ctx, const Tensor& in, uint64* fft_shape,
+             Tensor* out) override {
     auto* stream = ctx->op_device_context()->stream();
     OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
 
@@ -238,7 +330,6 @@ REGISTER_KERNEL_BUILDER(Name("BatchFFT3D").Device(DEVICE_GPU),
                         FFTGPU<true, false, 3>);
 REGISTER_KERNEL_BUILDER(Name("BatchIFFT3D").Device(DEVICE_GPU),
                         FFTGPU<false, false, 3>);
+#endif  // GOOGLE_CUDA
 
 }  // end namespace tensorflow
-
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/filter_dataset_op.cc b/tensorflow/core/kernels/filter_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62ad921062f5885aca1e64f575b07866bb6f5947
--- /dev/null
+++ b/tensorflow/core/kernels/filter_dataset_op.cc
@@ -0,0 +1,158 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random.h"
+
+#include "tensorflow/core/kernels/captured_function.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class FilterDatasetOp : public OpKernel {
+ public:
+  explicit FilterDatasetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+    std::vector<Tensor> other_arguments;
+    other_arguments.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      other_arguments.push_back(t);
+    }
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
+                                                 std::move(other_arguments),
+                                                 &captured_func));
+
+    DatasetBase* dataset = new Dataset(input, std::move(captured_func));
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  const int graph_def_version_;
+
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(const DatasetBase* input,
+            std::unique_ptr<CapturedFunction> captured_func)
+        : input_(input), captured_func_(std::move(captured_func)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override { return "FilterDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        // NOTE(mrry): This method is thread-safe as long as
+        // `input_impl_` and `f` are thread-safe. However, if multiple
+        // threads enter this method, outputs may be observed in a
+        // non-deterministic order.
+        bool matched;
+        do {
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          if (*end_of_sequence) {
+            return Status::OK();
+          }
+
+          FunctionLibraryRuntime::Options opts;
+          // Choose a step ID that is guaranteed not to clash with any
+          // Session-generated step ID. DirectSession only generates
+          // non-negative step IDs (contiguous, starting from 0), and
+          // MasterSession generates 56-bit random step IDs whose MSB
+          // is always 0, so a negative random step ID should suffice.
+          opts.step_id = -std::abs(static_cast<int64>(random::New64()));
+          opts.runner = ctx->runner();
+          // TODO(mrry): Avoid blocking a threadpool thread. We will need to
+          // stack-rip the iterators and use async kernels.
+          Notification n;
+          Status ret;
+          std::vector<Tensor> result;
+          ret = dataset()->captured_func_->Run(opts, *out_tensors, &result);
+
+          if (!ret.ok()) {
+            return ret;
+          } else if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+                     result[0].NumElements() != 1) {
+            return errors::InvalidArgument(
+                "Filter predicate `f` must return a scalar bool.");
+          }
+          matched = result[0].scalar<bool>()();
+        } while (!matched);
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     private:
+      const std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+  };
+
+ private:
+  const NameAttrList* func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("FilterDataset").Device(DEVICE_CPU),
+                        FilterDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fixed_length_record_reader_op.cc b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
index 637a6cef95da5e534cb81321e00afea22381f873..ce7fb9c332bc263d2c560d5bfb346f5585f40b3c 100644
--- a/tensorflow/core/kernels/fixed_length_record_reader_op.cc
+++ b/tensorflow/core/kernels/fixed_length_record_reader_op.cc
@@ -28,12 +28,14 @@ namespace tensorflow {
 class FixedLengthRecordReader : public ReaderBase {
  public:
   FixedLengthRecordReader(const string& node_name, int64 header_bytes,
-                          int64 record_bytes, int64 footer_bytes, Env* env)
+                          int64 record_bytes, int64 footer_bytes,
+                          int64 hop_bytes, Env* env)
       : ReaderBase(
             strings::StrCat("FixedLengthRecordReader '", node_name, "'")),
         header_bytes_(header_bytes),
         record_bytes_(record_bytes),
         footer_bytes_(footer_bytes),
+        hop_bytes_(hop_bytes),
         env_(env),
         file_pos_limit_(-1),
         record_number_(0) {}
@@ -62,14 +64,31 @@ class FixedLengthRecordReader : public ReaderBase {
 
   Status ReadLocked(string* key, string* value, bool* produced,
                     bool* at_end) override {
-    if (input_buffer_->Tell() >= file_pos_limit_) {
+    // The condition `input_buffer_->Tell() + record_bytes_ > file_pos_limit_`
+    // is to confirm that none of record bytes is out of the range of
+    // file_pos_limit_.
+    // This is necessary for the condition `hop_bytes > 0`. For example.
+    // File: "0123456"
+    // Reader setting: `record_bytes=3`, `hop_bytes=2`, `footer_bytes=0`,
+    //     `header_bytes=0`
+    // Without this checking condition, the forth time the reader will at
+    // this position: "012345|6" and the reading operation will result in
+    // an error.
+    if (input_buffer_->Tell() >= file_pos_limit_ ||
+        input_buffer_->Tell() + record_bytes_ > file_pos_limit_) {
       *at_end = true;
       return Status::OK();
     }
+    const int64 pos_before_read = input_buffer_->Tell();
     TF_RETURN_IF_ERROR(input_buffer_->ReadNBytes(record_bytes_, value));
     *key = strings::StrCat(current_work(), ":", record_number_);
     *produced = true;
     ++record_number_;
+
+    if (hop_bytes_ > 0) {
+      input_buffer_->Seek(pos_before_read + hop_bytes_).IgnoreError();
+    }
+
     return Status::OK();
   }
 
@@ -87,6 +106,7 @@ class FixedLengthRecordReader : public ReaderBase {
   const int64 header_bytes_;
   const int64 record_bytes_;
   const int64 footer_bytes_;
+  const int64 hop_bytes_;
   Env* const env_;
   int64 file_pos_limit_;
   int64 record_number_;
@@ -98,10 +118,12 @@ class FixedLengthRecordReaderOp : public ReaderOpKernel {
  public:
   explicit FixedLengthRecordReaderOp(OpKernelConstruction* context)
       : ReaderOpKernel(context) {
-    int64 header_bytes = -1, record_bytes = -1, footer_bytes = -1;
+    int64 header_bytes = -1, record_bytes = -1, footer_bytes = -1,
+          hop_bytes = -1;
     OP_REQUIRES_OK(context, context->GetAttr("header_bytes", &header_bytes));
     OP_REQUIRES_OK(context, context->GetAttr("record_bytes", &record_bytes));
     OP_REQUIRES_OK(context, context->GetAttr("footer_bytes", &footer_bytes));
+    OP_REQUIRES_OK(context, context->GetAttr("hop_bytes", &hop_bytes));
     OP_REQUIRES(context, header_bytes >= 0,
                 errors::InvalidArgument("header_bytes must be >= 0 not ",
                                         header_bytes));
@@ -111,11 +133,15 @@ class FixedLengthRecordReaderOp : public ReaderOpKernel {
     OP_REQUIRES(context, footer_bytes >= 0,
                 errors::InvalidArgument("footer_bytes must be >= 0 not ",
                                         footer_bytes));
+    OP_REQUIRES(
+        context, hop_bytes >= 0,
+        errors::InvalidArgument("hop_bytes must be >= 0 not ", hop_bytes));
     Env* env = context->env();
-    SetReaderFactory([this, header_bytes, record_bytes, footer_bytes, env]() {
-      return new FixedLengthRecordReader(name(), header_bytes, record_bytes,
-                                         footer_bytes, env);
-    });
+    SetReaderFactory(
+        [this, header_bytes, record_bytes, footer_bytes, hop_bytes, env]() {
+          return new FixedLengthRecordReader(name(), header_bytes, record_bytes,
+                                             footer_bytes, hop_bytes, env);
+        });
   }
 };
 
diff --git a/tensorflow/core/kernels/flat_map_dataset_op.cc b/tensorflow/core/kernels/flat_map_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..68a6cf19600fa25cde6b2459a51b571acf7ce04d
--- /dev/null
+++ b/tensorflow/core/kernels/flat_map_dataset_op.cc
@@ -0,0 +1,215 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random.h"
+
+#include "tensorflow/core/kernels/captured_function.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class FlatMapDatasetOp : public OpKernel {
+ public:
+  explicit FlatMapDatasetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+    std::vector<Tensor> other_arguments;
+    other_arguments.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      other_arguments.push_back(t);
+    }
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
+                                                 std::move(other_arguments),
+                                                 &captured_func));
+
+    DatasetBase* dataset = new Dataset(input, std::move(captured_func),
+                                       output_types_, output_shapes_);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(const DatasetBase* input,
+            std::unique_ptr<CapturedFunction> captured_func,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : input_(input),
+          captured_func_(std::move(captured_func)),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "FlatMapDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          if (current_element_iterator_) {
+            // We are currently precessing a mapped element, so try to get the
+            // next subelement.
+            bool end_of_element;
+            TF_RETURN_IF_ERROR(current_element_iterator_->GetNext(
+                ctx, out_tensors, &end_of_element));
+            if (!end_of_element) {
+              // Produce the subelement as output.
+              *end_of_sequence = false;
+              return Status::OK();
+            }
+
+            // We have reached the end of the current element, so maybe move on
+            // to the next element.
+            current_element_iterator_.reset();
+          }
+
+          // Get the next element from the input dataset.
+          std::vector<Tensor> args;
+          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &args, end_of_sequence));
+          if (*end_of_sequence) {
+            return Status::OK();
+          }
+
+          FunctionLibraryRuntime::Options opts;
+          opts.runner = ctx->runner();
+          // Choose a step ID that is guaranteed not to clash with any
+          // Session-generated step ID. DirectSession only generates
+          // non-negative step IDs (contiguous, starting from 0), and
+          // MasterSession generates 56-bit random step IDs whose MSB
+          // is always 0, so a negative random step ID should suffice.
+          opts.step_id = -std::abs(static_cast<int64>(random::New64()));
+          ScopedStepContainer step_container(
+              opts.step_id, [this, ctx](const string& name) {
+                dataset()
+                    ->captured_func_->resource_manager()
+                    ->Cleanup(name)
+                    .IgnoreError();
+              });
+          opts.step_container = &step_container;
+          std::vector<Tensor> return_values;
+          TF_RETURN_IF_ERROR(
+              dataset()->captured_func_->Run(opts, args, &return_values));
+
+          if (!(return_values.size() == 1 &&
+                return_values[0].dtype() == DT_RESOURCE &&
+                TensorShapeUtils::IsScalar(return_values[0].shape()))) {
+            return errors::InvalidArgument(
+                "`f` must return a single scalar of dtype DT_RESOURCE.");
+          }
+
+          // Retrieve the dataset that was created in `f`.
+          DatasetBase* returned_dataset;
+          const ResourceHandle& dataset_resource =
+              return_values[0].scalar<ResourceHandle>()();
+
+          // NOTE(mrry): We cannot use the core `LookupResource()` or
+          // `DeleteResource()` functions, because we have an
+          // `IteratorContext*` and not an `OpKernelContext*`, so we
+          // replicate the necessary functionality here.
+          auto type_index = MakeTypeIndex<DatasetBase>();
+          if (type_index.hash_code() != dataset_resource.hash_code()) {
+            return errors::InvalidArgument(
+                "`f` must return a Dataset resource.");
+          }
+          TF_RETURN_IF_ERROR(
+              dataset()->captured_func_->resource_manager()->Lookup(
+                  dataset_resource.container(), dataset_resource.name(),
+                  &returned_dataset));
+          core::ScopedUnref unref_dataset(returned_dataset);
+
+          // Create an iterator for the dataset that was returned by
+          // `f`. This transfers ownership of the dataset to the
+          // iterator, so we can delete it from the resource manager.
+          current_element_iterator_ = returned_dataset->MakeIterator();
+          TF_RETURN_IF_ERROR(
+              dataset()
+                  ->captured_func_->resource_manager()
+                  ->Delete<DatasetBase>(dataset_resource.container(),
+                                        dataset_resource.name()));
+        } while (true);
+      }
+
+     private:
+      mutex mu_;
+      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> current_element_iterator_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  const NameAttrList* func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("FlatMapDataset").Device(DEVICE_CPU),
+                        FlatMapDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index bcbf7424b195975ea68ec651552efc7f59fd7c4f..8c3137ece9fa902c12452f262c9d647afce9d231 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -84,8 +84,8 @@ class RetvalOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
 };
 
-REGISTER_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp);
-REGISTER_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp);
+REGISTER_SYSTEM_KERNEL_BUILDER(Name("_Arg").Device(DEVICE_CPU), ArgOp);
+REGISTER_SYSTEM_KERNEL_BUILDER(Name("_Retval").Device(DEVICE_CPU), RetvalOp);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER(type)     \
@@ -186,12 +186,12 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
                         PassOn);
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                      \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
-      PassOn);                                                           \
-  REGISTER_KERNEL_BUILDER(                                               \
-      Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint<type>("T"),\
+#define REGISTER_SYCL_KERNELS(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("_ListToArray").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      PassOn);                                                            \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("_ArrayToList").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
       PassOn);
 
 REGISTER_SYCL_KERNELS(float);
@@ -211,7 +211,7 @@ REGISTER_KERNEL_BUILDER(Name("_ArrayToList")
                             .HostMemory("output")
                             .TypeConstraint<int32>("T"),
                         PassOn);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 class SymbolicGradientOp : public AsyncOpKernel {
  public:
@@ -227,7 +227,7 @@ class SymbolicGradientOp : public AsyncOpKernel {
 
     FunctionLibraryRuntime::Handle handle;
     OP_REQUIRES_OK_ASYNC(
-        ctx, lib->Instantiate(kGradientOp, def().attr(), &handle), done);
+        ctx, lib->Instantiate(kGradientOp, AttrSlice(def()), &handle), done);
 
     FunctionLibraryRuntime::Options opts;
     opts.step_id = ctx->step_id();
diff --git a/tensorflow/core/kernels/gather_functor.cc b/tensorflow/core/kernels/gather_functor.cc
index be220d5c95dde431551bdb8c772f9265ed0795da..c1d58733a2aec6dc1483071935866f3e6e4a7dd1 100644
--- a/tensorflow/core/kernels/gather_functor.cc
+++ b/tensorflow/core/kernels/gather_functor.cc
@@ -38,6 +38,8 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+TF_CALL_complex64(DECLARE_GPU_SPECS);
+TF_CALL_complex128(DECLARE_GPU_SPECS);
 
 #undef DECLARE_GPU_SPECS
 #undef DECLARE_GPU_SPECS_INDEX
diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc
index f1c10250786fed80f8452d52fea2617630dc3e9d..39b6924d74a004b4daa35a8cdfe656963d564490 100644
--- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc
@@ -32,6 +32,8 @@ typedef Eigen::GpuDevice GPUDevice;
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+TF_CALL_complex64(DEFINE_GPU_SPECS);
+TF_CALL_complex128(DEFINE_GPU_SPECS);
 
 #undef DEFINE_GPU_SPECS
 #undef DEFINE_GPU_SPECS_INDEX
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index d8182218af10afb320f341b20d6c605cce2174f8..dd25f589574cd023f3f08d9b5be689fc8de3b9d2 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -114,6 +114,8 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
+TF_CALL_complex64(REGISTER_GATHER_GPU);
+TF_CALL_complex128(REGISTER_GATHER_GPU);
 
 #undef REGISTER_GATHER_GPU
 
diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc
index c340223aa10eac0467e65d1de80f0c494c5244f1..23645dafad440d6b91ef5cc44c1d308e51733669 100644
--- a/tensorflow/core/kernels/gather_op_test.cc
+++ b/tensorflow/core/kernels/gather_op_test.cc
@@ -40,9 +40,9 @@ namespace {
 
 class GatherOpTest : public OpsTestBase {
  protected:
-  void MakeOp(DataType index_type) {
+  void MakeOp(DataType data_type, DataType index_type) {
     TF_ASSERT_OK(NodeDefBuilder("myop", "Gather")
-                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(data_type))
                      .Input(FakeInput(index_type))
                      .Finalize(node_def()));
     TF_ASSERT_OK(InitOp());
@@ -50,7 +50,7 @@ class GatherOpTest : public OpsTestBase {
 };
 
 TEST_F(GatherOpTest, ScalarIndices) {
-  MakeOp(DT_INT32);
+  MakeOp(DT_FLOAT, DT_INT32);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({5}), {0, 1, 2, 3, 4});
@@ -63,8 +63,26 @@ TEST_F(GatherOpTest, ScalarIndices) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(GatherOpTest, ScalarIndices_Complex) {
+  MakeOp(DT_COMPLEX64, DT_INT32);
+
+  // Feed and run
+  AddInputFromArray<std::complex<float>>(
+      TensorShape({5}), {std::complex<float>(0, 10), std::complex<float>(1, 11),
+                         std::complex<float>(2, 12), std::complex<float>(3, 13),
+                         std::complex<float>(4, 14)});
+  AddInputFromArray<int32>(TensorShape({}), {3});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output.
+  Tensor expected(allocator(), DT_COMPLEX64, TensorShape({}));
+  test::FillValues<std::complex<float>>(&expected,
+                                        {std::complex<float>(3, 13)});
+  test::ExpectTensorEqual<std::complex<float>>(expected, *GetOutput(0));
+}
+
 TEST_F(GatherOpTest, Simple_TwoD32) {
-  MakeOp(DT_INT32);
+  MakeOp(DT_FLOAT, DT_INT32);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({5, 3}),
@@ -79,7 +97,7 @@ TEST_F(GatherOpTest, Simple_TwoD32) {
 }
 
 TEST_F(GatherOpTest, ZeroSize_TwoD32) {
-  MakeOp(DT_INT32);
+  MakeOp(DT_FLOAT, DT_INT32);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({5, 0}), {});
@@ -92,7 +110,7 @@ TEST_F(GatherOpTest, ZeroSize_TwoD32) {
 }
 
 TEST_F(GatherOpTest, Simple_TwoD64) {
-  MakeOp(DT_INT64);
+  MakeOp(DT_FLOAT, DT_INT64);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({5, 3}),
@@ -107,7 +125,7 @@ TEST_F(GatherOpTest, Simple_TwoD64) {
 }
 
 TEST_F(GatherOpTest, HighRank) {
-  MakeOp(DT_INT32);
+  MakeOp(DT_FLOAT, DT_INT32);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({4}), {0, 1, 2, 3});
@@ -121,7 +139,7 @@ TEST_F(GatherOpTest, HighRank) {
 }
 
 TEST_F(GatherOpTest, Error_IndexOutOfRange) {
-  MakeOp(DT_INT32);
+  MakeOp(DT_FLOAT, DT_INT32);
 
   // Feed and run
   AddInputFromArray<float>(TensorShape({5, 3}),
diff --git a/tensorflow/core/kernels/group_by_window_dataset_op.cc b/tensorflow/core/kernels/group_by_window_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a58c15a0976b82cb65331377d519a0324df1c90f
--- /dev/null
+++ b/tensorflow/core/kernels/group_by_window_dataset_op.cc
@@ -0,0 +1,344 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <map>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random.h"
+
+#include "tensorflow/core/kernels/captured_function.h"
+#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/window_dataset.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+class GroupByWindowDatasetOp : public OpKernel {
+ public:
+  explicit GroupByWindowDatasetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("key_func", &key_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("reduce_func", &reduce_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    const Tensor* window_size_t;
+    OP_REQUIRES_OK(ctx, ctx->input("window_size", &window_size_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(window_size_t->shape()),
+                errors::InvalidArgument("window_size must be a scalar"));
+    const int64 window_size = window_size_t->flat<int64>()(0);
+    OP_REQUIRES(
+        ctx, window_size > 0,
+        errors::InvalidArgument("Window size must be greater than zero."));
+
+    // Get captured inputs for the key and reduce functions.
+    OpInputList key_func_other_argument_inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("key_func_other_arguments",
+                                        &key_func_other_argument_inputs));
+    std::vector<Tensor> key_func_other_arguments;
+    key_func_other_arguments.reserve(key_func_other_argument_inputs.size());
+    for (const Tensor& t : key_func_other_argument_inputs) {
+      key_func_other_arguments.push_back(t);
+    }
+    OpInputList reduce_func_other_argument_inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("reduce_func_other_arguments",
+                                        &reduce_func_other_argument_inputs));
+    std::vector<Tensor> reduce_func_other_arguments;
+    reduce_func_other_arguments.reserve(
+        reduce_func_other_argument_inputs.size());
+    for (const Tensor& t : reduce_func_other_argument_inputs) {
+      reduce_func_other_arguments.push_back(t);
+    }
+    // TODO(mrry): Refactor CapturedFunction to share the runtime
+    // state between multiple functions?
+    std::unique_ptr<CapturedFunction> captured_key_func;
+    OP_REQUIRES_OK(ctx,
+                   CapturedFunction::Create(ctx, key_func_, graph_def_version_,
+                                            std::move(key_func_other_arguments),
+                                            &captured_key_func));
+    std::unique_ptr<CapturedFunction> captured_reduce_func;
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(ctx, reduce_func_, graph_def_version_,
+                                      std::move(reduce_func_other_arguments),
+                                      &captured_reduce_func));
+
+    DatasetBase* dataset = new Dataset(
+        input, window_size, std::move(captured_key_func),
+        std::move(captured_reduce_func), output_types_, output_shapes_);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(const DatasetBase* input, int64 window_size,
+            std::unique_ptr<CapturedFunction> captured_key_func,
+            std::unique_ptr<CapturedFunction> captured_reduce_func,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : input_(input),
+          window_size_(window_size),
+          captured_key_func_(std::move(captured_key_func)),
+          captured_reduce_func_(std::move(captured_reduce_func)),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "GroupByWindowDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          if (current_group_iterator_) {
+            // We are currently processing a group, so try to get the
+            // next element.
+            bool end_of_group;
+            TF_RETURN_IF_ERROR(current_group_iterator_->GetNext(
+                ctx, out_tensors, &end_of_group));
+            if (!end_of_group) {
+              // Produce the subelement as output.
+              *end_of_sequence = false;
+              return Status::OK();
+            }
+            // We have reached the end of the current group, so maybe move on
+            // to the next group.
+            current_group_iterator_.reset();
+          }
+
+          // Iterate through the input dataset until we get a full
+          // group, or reach the end.
+          while (!end_of_input_) {
+            std::vector<Tensor> next_input_element;
+            TF_RETURN_IF_ERROR(
+                input_impl_->GetNext(ctx, &next_input_element, &end_of_input_));
+
+            if (!end_of_input_) {
+              FunctionLibraryRuntime::Options opts;
+              // Choose a step ID that is guaranteed not to clash with any
+              // Session-generated step ID. DirectSession only generates
+              // non-negative step IDs (contiguous, starting from 0), and
+              // MasterSession generates 56-bit random step IDs whose MSB is
+              // always 0, so a negative random step ID should suffice.
+              opts.step_id = -std::abs(static_cast<int64>(random::New64()));
+              opts.runner = ctx->runner();
+              ScopedStepContainer step_container(
+                  opts.step_id, [this, ctx](const string& name) {
+                    dataset()
+                        ->captured_key_func_->resource_manager()
+                        ->Cleanup(name)
+                        .IgnoreError();
+                  });
+              opts.step_container = &step_container;
+
+              // Run the key function on the input element to identify its
+              // group.
+              std::vector<Tensor> key_func_output;
+              TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Run(
+                  opts, next_input_element, &key_func_output));
+
+              if (key_func_output.size() != 1 ||
+                  key_func_output[0].dtype() != DT_INT64 ||
+                  key_func_output[0].NumElements() != 1) {
+                // TODO(mrry): Support non-int64 keys.
+                return errors::InvalidArgument(
+                    "`key_func` must return a scalar int64.");
+              }
+              const int64 key = key_func_output[0].scalar<int64>()();
+
+              std::vector<std::vector<Tensor>>& group = groups_[key];
+              group.push_back(std::move(next_input_element));
+
+              if (group.size() == dataset()->window_size_) {
+                TF_RETURN_IF_ERROR(StartFlushingGroup(ctx, key));
+                break;
+              }
+            }
+          }
+
+          if (end_of_input_) {
+            if (!groups_.empty()) {
+              // We have consumed all of the input, so flush an
+              // arbitrarily chosen group.
+              TF_RETURN_IF_ERROR(
+                  StartFlushingGroup(ctx, groups_.begin()->first));
+            }
+          }
+        } while (current_group_iterator_ || !end_of_input_);
+
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+
+     private:
+      Status StartFlushingGroup(IteratorContext* ctx, int64 key)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        FunctionLibraryRuntime::Options opts;
+        // Choose a step ID that is guaranteed not to clash with any
+        // Session-generated step ID. DirectSession only generates
+        // non-negative step IDs (contiguous, starting from 0), and
+        // MasterSession generates 56-bit random step IDs whose MSB is
+        // always 0, so a negative random step ID should suffice.
+        opts.step_id = -std::abs(static_cast<int64>(random::New64()));
+        opts.runner = ctx->runner();
+        ScopedStepContainer step_container(
+            opts.step_id, [this, ctx](const string& name) {
+              dataset()
+                  ->captured_reduce_func_->resource_manager()
+                  ->Cleanup(name)
+                  .IgnoreError();
+            });
+        opts.step_container = &step_container;
+
+        DatasetBase* group_dataset;
+        TF_RETURN_IF_ERROR(NewWindowDataset(
+            std::move(groups_[key]), dataset()->input_->output_dtypes(),
+            dataset()->input_->output_shapes(), &group_dataset));
+        groups_.erase(key);
+
+        Tensor key_arg(DT_INT64, TensorShape({}));
+        key_arg.scalar<int64>()() = key;
+
+        Tensor group_dataset_arg(DT_RESOURCE, TensorShape({}));
+
+        // NOTE(mrry): We cannot use the core `MakeResourceHandle()`,
+        // `LookupResource()` or `DeleteResource()` functions, because
+        // we have an `IteratorContext*` and not an
+        // `OpKernelContext*`, so we replicate the necessary
+        // functionality here.
+        ResourceHandle group_dataset_handle;
+        group_dataset_handle.set_device(
+            dataset()->captured_reduce_func_->device()->attributes().name());
+        group_dataset_handle.set_container(step_container.name());
+        group_dataset_handle.set_name(kWindowResourceName);
+        auto type_index = MakeTypeIndex<DatasetBase>();
+        group_dataset_handle.set_hash_code(type_index.hash_code());
+        group_dataset_handle.set_maybe_type_name(type_index.name());
+        // NOTE(mrry): Ownership of `group_dataset` transfers to
+        // `step_container` here.
+        TF_RETURN_IF_ERROR(dataset()
+                               ->captured_reduce_func_->resource_manager()
+                               ->Create<DatasetBase>(
+                                   group_dataset_handle.container(),
+                                   group_dataset_handle.name(), group_dataset));
+
+        group_dataset_arg.scalar<ResourceHandle>()() = group_dataset_handle;
+
+        std::vector<Tensor> args(
+            {std::move(key_arg), std::move(group_dataset_arg)});
+        std::vector<Tensor> return_values;
+
+        TF_RETURN_IF_ERROR(
+            dataset()->captured_reduce_func_->Run(opts, args, &return_values));
+
+        if (!(return_values.size() == 1 &&
+              return_values[0].dtype() == DT_RESOURCE &&
+              TensorShapeUtils::IsScalar(return_values[0].shape()))) {
+          return errors::InvalidArgument(
+              "`reduce_func` must return a single scalar of dtype "
+              "DT_RESOURCE.");
+        }
+
+        // Retrieve the dataset that was created in `f`.
+        DatasetBase* returned_dataset;
+        const ResourceHandle& dataset_resource =
+            return_values[0].scalar<ResourceHandle>()();
+        if (type_index.hash_code() != dataset_resource.hash_code()) {
+          return errors::InvalidArgument(
+              "`reduce_func` must return a Dataset resource.");
+        }
+        TF_RETURN_IF_ERROR(
+            dataset()->captured_reduce_func_->resource_manager()->Lookup(
+                dataset_resource.container(), dataset_resource.name(),
+                &returned_dataset));
+        core::ScopedUnref unref_returned_dataset(returned_dataset);
+
+        // Create an iterator for the dataset that was returned by
+        // `f`. This transfers ownership of the dataset to the
+        // iterator.
+        current_group_iterator_ = returned_dataset->MakeIterator();
+        return Status::OK();
+      }
+
+      const std::unique_ptr<IteratorBase> input_impl_;
+      mutex mu_;
+      // TODO(mrry): Optimize for dense key space if appropriate.
+      bool end_of_input_ GUARDED_BY(mu_) = false;
+      std::map<int64, std::vector<std::vector<Tensor>>> groups_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> current_group_iterator_ GUARDED_BY(mu_);
+    };
+
+    // A resource name for the temporary window dataset that is
+    // created as the input to the reduce function.
+    static constexpr const char* kWindowResourceName = "__window_dataset";
+
+    const DatasetBase* const input_;
+    const int64 window_size_;
+    const std::unique_ptr<CapturedFunction> captured_key_func_;
+    const std::unique_ptr<CapturedFunction> captured_reduce_func_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  const NameAttrList* key_func_;
+  const NameAttrList* reduce_func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU),
+                        GroupByWindowDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
index 4352f13309fb832717e6e4a197278ea01ea8238f..b672db2016e3efdca950e0415a62691ef2adfbab 100644
--- a/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transfer_utils.cc
@@ -107,44 +107,36 @@ GraphTransferUtils::BuildRemoteFusedGraphExecuteInfo(
         AddOutputTensorShapeTypeByTensorShapeMap(tensor_shape_map, &node_def));
   }
   CHECK(status.ok());
-  const DataType input_data_type =
-      inputs.empty() ? DT_FLOAT : inputs.at(0).second.dtype();
 
   Scope root = Scope::NewRootScope();
   std::vector<Output> output_list;
+  DataTypeVector input_types;
   for (const std::pair<string, Tensor>& input_node_info : inputs) {
     const Scope& scope = root.WithOpName(input_node_info.first);
     Node* ret;
-    const auto unique_name = scope.GetUniqueNameForOp("PlaceholderV2");
-    const DataType dt = input_node_info.second.dtype();
-    // DataType of input arguments should be same.
-    CHECK_EQ(input_data_type, dt);
-    auto builder = NodeBuilder(unique_name, "PlaceholderV2")
+    const auto unique_name = scope.GetUniqueNameForOp("Placeholder");
+    auto builder = NodeBuilder(unique_name, "Placeholder")
                        .Attr("dtype", input_node_info.second.dtype())
                        .Attr("shape", input_node_info.second.shape());
     scope.UpdateBuilder(&builder);
     scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
-    CHECK(scope.ok());
+    TF_CHECK_OK(scope.status());
     output_list.emplace_back(Output(ret, 0));
+    input_types.push_back(input_node_info.second.dtype());
   }
 
   const RemoteFusedGraphExecuteInfo execute_info =
       BuildRemoteFusedGraphExecuteInfo(*original_def, inputs, outputs,
                                        tensor_shape_map);
 
-  const std::pair<DataType, TensorShape>* tensor_shape_type =
-      RemoteFusedGraphExecuteUtils::GetTensorShapeType(tensor_shape_map,
-                                                       outputs.at(0));
-  CHECK_NE(tensor_shape_type, nullptr);
-  const DataType output_data_type = tensor_shape_type->first;
+  DataTypeVector output_types;
   // Sanity-check to confirm all output data types are same.
   for (const string& output_node_name : outputs) {
     const std::pair<DataType, TensorShape>* tst =
         RemoteFusedGraphExecuteUtils::GetTensorShapeType(tensor_shape_map,
                                                          output_node_name);
     CHECK_NE(tst, nullptr);
-    const DataType dt = tensor_shape_type->first;
-    CHECK_EQ(output_data_type, dt);
+    output_types.push_back(tst->first);
   }
 
   const Scope& scope = root.WithOpName(remote_graph_execute_name);
@@ -152,18 +144,17 @@ GraphTransferUtils::BuildRemoteFusedGraphExecuteInfo(
   auto node_out_list = ops::AsNodeOutList(scope, InputList(output_list));
   Node* node;
   const auto unique_name = scope.GetUniqueNameForOp("RemoteFusedGraphExecute");
+
   auto builder = NodeBuilder(unique_name, "RemoteFusedGraphExecute")
                      .Input(node_out_list)
-                     .Attr("M", static_cast<int64>(output_list.size()))
-                     .Attr("N", static_cast<int64>(outputs.size()))
-                     .Attr("T", input_data_type)
-                     .Attr("U", output_data_type)
-                     .Attr("serialized_graph_transfer_info",
+                     .Attr("Tinputs", input_types)
+                     .Attr("Toutputs", output_types)
+                     .Attr("serialized_remote_fused_graph_execute_info",
                            StringPiece(execute_info.SerializeAsString()));
   CHECK(scope.ok());
   scope.UpdateBuilder(&builder);
   scope.UpdateStatus(builder.Finalize(scope.graph(), &node));
-  CHECK(scope.ok());
+  CHECK(scope.ok()) << scope.status();
 
   GraphDef fusedGraphDef;
   TF_CHECK_OK(root.ToGraphDef(&fusedGraphDef));
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer.cc b/tensorflow/core/kernels/hexagon/graph_transferer.cc
index e3fc228cc708221f782a828c3ba16b62d7bef2c6..d927ef3efa08bf7f0fdb255e21b59b0620475a83 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer.cc
@@ -140,7 +140,7 @@ Status GraphTransferer::LoadGraphFromProto(
     std::vector<DataType> data_types;
     std::vector<TensorShape> shapes;
     status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-        node->def(), &data_types, &shapes);
+        node->attrs(), &data_types, &shapes);
     if (status.ok()) {
       CHECK(data_types.size() > port);
       graph_output_node_info.set_dtype(data_types.at(port));
@@ -309,8 +309,9 @@ Status GraphTransferer::RegisterNode(
     RegisterNodeWithPaddingAndStrides(ops_definitions, shape_refiner, node);
   } else if (IsNodeFlattenReshape(node, shape_refiner)) {
     RegisterFlattenNode(ops_definitions, shape_refiner, node);
-  } else if (ops_definitions.GetOpIdFor(node.type_string()) !=
+  } else if (ops_definitions.GetOpIdFor(node.type_string(), {}) !=
              IGraphTransferOpsDefinitions::INVALID_OP_ID) {
+    // TODO(satok): Set correct data type if it's given.
     RegisterGenericNode(ops_definitions, shape_refiner, node);
   } else {
     return errors::InvalidArgument(node.type_string() +
@@ -358,7 +359,7 @@ void GraphTransferer::RegisterConstantNode(const ShapeRefiner& shape_refiner,
   const_node_info.add_shape(shape_array[2]);
   const_node_info.add_shape(shape_array[3]);
   const TensorProto* proto = nullptr;
-  TF_CHECK_OK(GetNodeAttr(node.def(), "value", &proto));
+  TF_CHECK_OK(GetNodeAttr(node.attrs(), "value", &proto));
   Tensor const_tensor;
   // TODO(b/32704451): Don't just ignore this status!
   MakeTensorFromProto(*proto, &const_tensor).IgnoreError();
@@ -394,8 +395,9 @@ int GraphTransferer::RegisterConstantShape(const std::vector<int>& shape) {
 }
 
 bool GraphTransferer::HasPaddingAndStrides(const Node& node) {
-  return node.def().attr().count(PADDING_ATTR_NAME) > 0 &&
-         node.def().attr().count(STRIDES_ATTR_NAME) > 0;
+  auto attrs = node.attrs();
+  return attrs.Find(PADDING_ATTR_NAME) != nullptr &&
+         attrs.Find(STRIDES_ATTR_NAME) != nullptr;
 }
 
 bool GraphTransferer::IsNodeFlattenReshape(const Node& node,
@@ -422,7 +424,7 @@ bool GraphTransferer::IsNodeFlattenReshape(const Node& node,
   } else {
     std::vector<TensorShape> shapes;
     TF_CHECK_OK(RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-        node.def(), nullptr, &shapes));
+        node.attrs(), nullptr, &shapes));
 
     // Number of outputs should be 1 for reshape node.
     CHECK_EQ(1, shapes.size());
@@ -443,22 +445,23 @@ void GraphTransferer::RegisterNodeWithPaddingAndStrides(
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
   shape_inference::InferenceContext* context = shape_refiner.GetContext(&node);
-  CHECK_GT(node.def().attr().count(PADDING_ATTR_NAME), 0);
+  CHECK(node.attrs().Find(PADDING_ATTR_NAME));
   // TODO(satok): Use context->GetAttr(...) instead?
   Padding padding;
   TF_CHECK_OK(context->GetAttr(PADDING_ATTR_NAME, &padding));
-  CHECK_GT(node.def().attr().count(STRIDES_ATTR_NAME), 0);
+  CHECK(node.attrs().Find(STRIDES_ATTR_NAME));
   std::vector<int32> strides;
   TF_CHECK_OK(context->GetAttr(STRIDES_ATTR_NAME, &strides));
   const int stride_id = RegisterConstantShape(strides);
   std::vector<int> extra_inputs{stride_id};
-  if (node.def().attr().count(KSIZE_ATTR_NAME) > 0) {
+  if (node.attrs().Find(KSIZE_ATTR_NAME)) {
     std::vector<int32> kernel_sizes;
     TF_CHECK_OK(context->GetAttr(KSIZE_ATTR_NAME, &kernel_sizes));
     const int ksize_id = RegisterConstantShape(kernel_sizes);
     extra_inputs.insert(extra_inputs.begin(), ksize_id);
   }
-  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string());
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
       << "Op " << node.type_string() << " not found in map(id = " << op_type_id
       << ")";
@@ -477,7 +480,8 @@ void GraphTransferer::RegisterInputNode(
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
   const string op_type = node.type_string();
-  const int op_type_id = ops_definitions.GetOpIdFor(op_type);
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(op_type, {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount())
       << "Op" << node.name() << ", " << op_type << " is not supported,"
       << op_type_id;
@@ -494,7 +498,8 @@ void GraphTransferer::RegisterFlattenNode(
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
   const string op_type = IGraphTransferOpsDefinitions::FLATTEN_OP_NAME;
-  const int op_type_id = ops_definitions.GetOpIdFor(op_type);
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(op_type, {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount());
 
   AppendNodeParamsWithIoParams(
@@ -509,7 +514,8 @@ void GraphTransferer::RegisterGenericNode(
   VLOG(1) << "Register generic node: " << node.name();
   CHECK_EQ(node_name_to_id_cache_map_.count(node.name()), 1);
   const int id = node_name_to_id_cache_map_[node.name()];
-  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string());
+  // TODO(satok): Set correct data type if it's given.
+  const int op_type_id = ops_definitions.GetOpIdFor(node.type_string(), {});
   CHECK(op_type_id >= 0 && op_type_id < ops_definitions.GetTotalOpsCount());
 
   AppendNodeParamsWithIoParams(
@@ -592,7 +598,7 @@ void GraphTransferer::AppendNodeOutputParams(const ShapeRefiner& shape_refiner,
 
   std::vector<TensorShape> shapes;
   Status status = RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-      node.def(), nullptr, &shapes);
+      node.attrs(), nullptr, &shapes);
 
   for (int i = 0; i < node.num_outputs(); ++i) {
     int data_size = -1;
diff --git a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
index 142c9e128325923a9b3f57a729c856bfe428c1b6..ebd4a90330155958da4c1324f368116a2e8f48e8 100644
--- a/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
+++ b/tensorflow/core/kernels/hexagon/graph_transferer_test.cc
@@ -54,16 +54,18 @@ const RemoteFusedGraphExecuteUtils::TensorShapeMap EMPTY_OUTPUT_TENSOR_MAP;
 class TestGraphTransferOpsDefinitions : public IGraphTransferOpsDefinitions {
  public:
   int GetTotalOpsCount() const final { return OP_TYPES.size(); }
-  int GetOpIdFor(const string& op_type) const final {
-    for (int i = 0; i < OP_TYPES.size(); ++i) {
-      if (OP_TYPES[i] == op_type) {
-        return i;
-      }
+
+int GetOpIdFor(const string& op_type, const DataTypeVector&) const final {
+  for (int i = 0; i < OP_TYPES.size(); ++i) {
+    if (OP_TYPES[i] == op_type) {
+      return i;
     }
-    return -1;
   }
-  GraphTransferInfo::Destination GetTransferDestination() const final {
-    return GraphTransferInfo::NOP;
+  return -1;
+}
+
+GraphTransferInfo::Destination GetTransferDestination() const final {
+  return GraphTransferInfo::NOP;
   }
 
  private:
diff --git a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
index 4e6944e63e07f0fb392f57b0f3661f3a5a322868..518b399c37482dd7b5ad1ef333f86c6e97f75631 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc
@@ -88,7 +88,7 @@ bool HexagonControlWrapper::SetupGraph() {
     CHECK_NE(node_info, nullptr);
     node_info->set_type_name(INPUT_OP_NAME);
     node_info->set_soc_op_id(
-        HexagonOpsDefinitions::getInstance().GetOpIdFor(INPUT_OP_NAME));
+        HexagonOpsDefinitions::getInstance().GetOpIdFor(INPUT_OP_NAME, {}));
   }
 
   // Generate a new output node which is connected to graph output node
@@ -106,7 +106,7 @@ bool HexagonControlWrapper::SetupGraph() {
     new_output_node_info.set_node_id(new_output_node_id);
     new_output_node_info.set_type_name(OUTPUT_OP_NAME);
     new_output_node_info.set_soc_op_id(
-        HexagonOpsDefinitions::getInstance().GetOpIdFor(OUTPUT_OP_NAME));
+        HexagonOpsDefinitions::getInstance().GetOpIdFor(OUTPUT_OP_NAME, {}));
     new_output_node_info.set_padding_id(0 /* PADDING_NA_ID */);
     new_output_node_info.set_input_count(1);
     new_output_node_info.set_output_count(0);
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index c5d5657492a353bc3954fff8ca96996c05d657e6..54ba101501f4134a672e4cf2f87a5df558f82589 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -35,6 +35,7 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
@@ -268,8 +269,7 @@ static void RunFusedGraph(const GraphDef& fused_graph_def) {
   session_options.env = Env::Default();
   std::unique_ptr<Session> session =
       std::unique_ptr<Session>(NewSession(session_options));
-  Status status = session->Create(fused_graph_def);
-  ASSERT_TRUE(status.ok());
+  TF_ASSERT_OK(session->Create(fused_graph_def));
 
   // Setup session arguments
   RunOptions run_options;
@@ -283,9 +283,8 @@ static void RunFusedGraph(const GraphDef& fused_graph_def) {
 
   LOG(INFO) << "Run graph";
   // Run inference with all node as output
-  status = session->Run(run_options, input_tensors, output_node_names, {},
-                        &output_tensors, &run_metadata);
-  ASSERT_TRUE(status.ok());
+  TF_ASSERT_OK(session->Run(run_options, input_tensors, output_node_names, {},
+                            &output_tensors, &run_metadata));
   ASSERT_EQ(1, output_tensors.size());
   const Tensor& output_tensor = output_tensors.at(0);
   LOG(INFO) << "Output byte size = " << output_tensor.TotalBytes();
@@ -295,6 +294,83 @@ static void RunFusedGraph(const GraphDef& fused_graph_def) {
       reinterpret_cast<const float*>(output_tensor.flat<float>().data()));
 }
 
+static void CompareGraphTransferInfo(const GraphTransferInfo& gfi0,
+                                     const GraphTransferInfo& gfi1) {
+  LOG(INFO) << "(1) node count: " << gfi1.node_info_size() << ", "
+            << gfi1.const_node_info_size();
+
+  // 1. check node_info
+  ASSERT_EQ(gfi0.node_info_size(), gfi1.node_info_size());
+  for (int i = 0; i < gfi0.node_info_size(); ++i) {
+    const GraphTransferInfo::NodeInfo& ni0 = gfi0.node_info(i);
+    const GraphTransferInfo::NodeInfo& ni1 = gfi1.node_info(i);
+    EXPECT_EQ(ni0.DebugString(), ni1.DebugString());
+    EXPECT_EQ(ni0.ByteSize(), ni1.ByteSize());
+  }
+
+  // 2. check const_node_info
+  ASSERT_EQ(gfi0.const_node_info_size(), gfi1.const_node_info_size());
+  for (int i = 0; i < gfi0.const_node_info_size(); ++i) {
+    const GraphTransferInfo::ConstNodeInfo& cni0 = gfi0.const_node_info(i);
+    const GraphTransferInfo::ConstNodeInfo& cni1 = gfi1.const_node_info(i);
+    ASSERT_EQ(cni0.shape_size(), cni1.shape_size());
+    for (int j = 0; j < cni0.shape_size(); ++j) {
+      EXPECT_EQ(cni0.shape(j), cni1.shape(j));
+    }
+    EXPECT_EQ(cni0.ByteSize(), cni1.ByteSize());
+    EXPECT_EQ(cni0.DebugString(), cni1.DebugString());
+  }
+
+  // 3. check node_input_info
+  ASSERT_EQ(gfi0.node_input_info_size(), gfi1.node_input_info_size());
+  for (int i = 0; i < gfi0.node_input_info_size(); ++i) {
+    const GraphTransferInfo::NodeInputInfo& nii0 = gfi0.node_input_info(i);
+    const GraphTransferInfo::NodeInputInfo& nii1 = gfi1.node_input_info(i);
+    EXPECT_EQ(nii0.ByteSize(), nii1.ByteSize());
+    EXPECT_EQ(nii0.DebugString(), nii1.DebugString());
+  }
+
+  // 4. check node_output_info
+  ASSERT_EQ(gfi0.node_output_info_size(), gfi1.node_output_info_size());
+  for (int i = 0; i < gfi0.node_output_info_size(); ++i) {
+    const GraphTransferInfo::NodeOutputInfo& noi0 = gfi0.node_output_info(i);
+    const GraphTransferInfo::NodeOutputInfo& noi1 = gfi1.node_output_info(i);
+    ASSERT_EQ(noi0.max_byte_size_size(), noi1.max_byte_size_size());
+    for (int j = 0; j < noi0.max_byte_size_size(); ++j) {
+      EXPECT_EQ(noi0.max_byte_size(j), noi1.max_byte_size(j));
+    }
+    EXPECT_EQ(noi0.ByteSize(), noi1.ByteSize());
+    EXPECT_EQ(noi0.DebugString(), noi1.DebugString());
+  }
+
+  // 5. check graph_input_node_info
+  ASSERT_EQ(gfi0.graph_input_node_info_size(),
+            gfi1.graph_input_node_info_size());
+  for (int i = 0; i < gfi0.graph_input_node_info_size(); ++i) {
+    const GraphTransferInfo::GraphInputNodeInfo& gini0 =
+        gfi0.graph_input_node_info(i);
+    const GraphTransferInfo::GraphInputNodeInfo& gini1 =
+        gfi0.graph_input_node_info(i);
+    EXPECT_EQ(gini0.ByteSize(), gini1.ByteSize());
+    EXPECT_EQ(gini0.DebugString(), gini1.DebugString());
+  }
+
+  // 6. check graph_output_node_info
+  ASSERT_EQ(gfi0.graph_output_node_info_size(),
+            gfi1.graph_output_node_info_size());
+  for (int i = 0; i < gfi0.graph_output_node_info_size(); ++i) {
+    const GraphTransferInfo::GraphOutputNodeInfo& goni0 =
+        gfi0.graph_output_node_info(i);
+    const GraphTransferInfo::GraphOutputNodeInfo& goni1 =
+        gfi0.graph_output_node_info(i);
+    EXPECT_EQ(goni0.ByteSize(), goni1.ByteSize());
+    EXPECT_EQ(goni0.DebugString(), goni1.DebugString());
+  }
+
+  // 7. check destination
+  EXPECT_EQ(gfi0.destination(), gfi1.destination());
+}
+
 // CAVEAT: This test only runs when you specify hexagon library using
 // makefile.
 // CAVEAT: This test is disabled by default because hexagon can keep only
@@ -450,34 +526,22 @@ TEST(GraphTransferer, DISABLED_CheckShapeInferencePerformance) {
   prof1.Stop();
   prof1.DumpStatistics("Estiame shape by shape inference");
 
-  LOG(INFO) << "(1) node count: " << gfi1.node_info_size() << ", "
-            << gfi1.const_node_info_size();
-
-  ASSERT_EQ(gfi0.node_info_size(), gfi1.node_info_size());
+  CompareGraphTransferInfo(gfi0, gfi1);
 
-  ASSERT_EQ(gt0.GetGraphTransferInfo().const_node_info_size(),
-            gt1.GetGraphTransferInfo().const_node_info_size());
+  const RemoteFusedGraphExecuteInfo ei0 =
+      BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(gfi0);
+  const RemoteFusedGraphExecuteInfo ei1 =
+      BuildRemoteFusedGraphExecuteInfoWithGraphTransferInfo(gfi1);
 
-  for (int i = 0; i < gfi0.const_node_info_size(); ++i) {
-    const GraphTransferInfo::ConstNodeInfo& ni0 = gfi0.const_node_info(i);
-    const GraphTransferInfo::ConstNodeInfo& ni1 = gfi1.const_node_info(i);
-    ASSERT_EQ(ni0.shape_size(), ni1.shape_size());
-    for (int j = 0; j < ni0.shape_size(); ++j) {
-      EXPECT_EQ(ni0.shape(j), ni1.shape(j));
-    }
-  }
+  GraphTransferInfo rgfi0;
+  rgfi0.ParseFromString(ei0.serialized_executor_parameters());
+  GraphTransferInfo rgfi1;
+  rgfi1.ParseFromString(ei1.serialized_executor_parameters());
 
-  ASSERT_EQ(gfi0.node_output_info_size(), gfi1.node_output_info_size());
-  for (int i = 0; i < gfi0.node_output_info_size(); ++i) {
-    const GraphTransferInfo::NodeOutputInfo& no0 = gfi0.node_output_info(i);
-    const GraphTransferInfo::NodeOutputInfo& no1 = gfi1.node_output_info(i);
-    ASSERT_EQ(no0.max_byte_size_size(), no1.max_byte_size_size());
-    for (int j = 0; j < no0.max_byte_size_size(); ++j) {
-      EXPECT_EQ(no0.max_byte_size(j), no1.max_byte_size(j));
-    }
-  }
+  CompareGraphTransferInfo(rgfi0, rgfi1);
+  CompareGraphTransferInfo(gfi0, rgfi0);
+  CompareGraphTransferInfo(gfi1, rgfi1);
 }
-
 #endif
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
index 851d87b15bb7795ddec380a357ab7e828e2d6ce2..67f26b6db97b376351ae475b47059304ff602c5d 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.cc
@@ -15,14 +15,18 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
 
-#include <unordered_map>
-
 #include "tensorflow/core/framework/types.h"
 
+// CAVEAT: Comment-out the following macro if you want to use experimental
+// hexagon ops.
+//#define ENABLE_EXPERIMENTAL_HEXNN_OPS
+
 namespace tensorflow {
 
 // HVX internal supported ops names
-enum class SupportedOpType {
+// TODO(satok): Remove this map once hexnn lib supports an API to retrieve op id
+// from op name and data type
+enum class HexagonOpsDefinitions::SupportedOpType {
   INPUT,
   OUTPUT,
   NOP,
@@ -38,6 +42,136 @@ enum class SupportedOpType {
   PPRINT_FLOAT,
   PREFREE,
   FLATTEN,
+
+#ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
+  // With Reference
+  QUANTIZEDCONV2D_8X8TO32,
+  QUANTIZEDCONV2D_8X8TO32_REF,
+  QUANTIZEDMATMUL_8X8TO32,
+  QUANTIZEDMATMUL_8X8TO32_REF,
+  QUANTIZEDOWNANDSHRINKRANGE_32TO8,
+  QUANTIZEDOWNANDSHRINKRANGE_32TO8_REF,
+  QUANTIZEDRELU_8,
+  QUANTIZEDRELU_8_REF,
+  QUANTIZEDRELUX_8,
+  QUANTIZEDRELUX_8_REF,
+  QUANTIZEDMAXPOOL_8,
+  QUANTIZEDMAXPOOL_8_REF,
+  QUANTIZEDAVGPOOL_8,
+  QUANTIZEDAVGPOOL_8_REF,
+  QUANTIZEDCONCAT_8,
+  QUANTIZEDCONCAT_8_REF,
+  QUANTIZEDBIASADD_8P8TO32,
+  QUANTIZEDBIASADD_8P8TO32_REF,
+  MIN_F,
+  MIN_F_REF,
+  MAX_F,
+  MAX_F_REF,
+  QUANTIZE,
+  QUANTIZE_REF,
+  DEQUANTIZE,
+  DEQUANTIZE_REF,
+  SUPERNODE_8X8P8TO8,
+  SUPERNODE_8X8P8TO8_REF,
+
+  QUANTIZEDFLATTEN,
+  SOFTMAX_F,
+  CONV2D_F,
+  MATMUL_F,
+  RELU_F,
+  RELUX_F,
+  AVGPOOL_F,
+  MAXPOOL_F,
+  CONCAT_F,
+  BIASADD_F,
+  LRN_F,
+
+  VARIABLE,
+  ASSIGN,
+  RESHAPE,
+  QUANTIZED_RESHAPE,
+  TANH_F,
+  SIGMOID_F,
+  SLICE_8,
+  SLICE_F,
+  QUANTIZED_SLICE_8,
+  ADD_F,
+  MUL_F,
+  MINIMUM_F,
+  MAXIMAM_F,
+
+  REQUANTIZE_32_TO_8,
+  REQUANTIZE_32_TO_8_REF,
+  REQUANTIZATION_RANGE_32,
+  REQUANTIZATION_RANGE_32_REF,
+
+  NEG_F,
+  SUB_F,
+  ADD_N_F,
+  RANGE_INT32,
+  RANK_INT32,
+  TRANSPOSE_INT32,
+  TRANSPOSE_F,
+  INSTANCE_NORM_F,
+  QUANTIZED_INSTANCENORM_8,
+  QUANTIZED_INSTANCENORM_8_REF,
+  SUB_INT32,
+  ADD_INT32,
+  SPLIT_F,
+  DEQUANTIZE_QINT32_F,
+  PRELU_F,
+  QUANTIZED_PRELU_8,
+  SUM_F,
+  PROD_F,
+  MUL_INT32,
+  LOGICAL_AND_INT32,
+  LOGICALOR_INT32,
+  LOGICAL_XOR_INT32,
+  SPAPE_INT32,
+  PACK_INT32,
+  MIRROR_PAD_F,
+  RESIZE_NEAREST_NEIGHBOR_F,
+  STRIDED_SLICE_INT32,
+  STRIDED_SLICE_F,
+  EXPAND_DIMS_INT32,
+  EXPAND_DIMS_F,
+
+  LOG_SOFTMAX_F,
+  SPLIT_INT32,
+  QUANTIZED_SPLIT_8,
+
+  DECONV_F,
+  QUANTIZED_DECONV_8X8TO32,
+  QUANTIZED_DECONV_8X8TO32_REF,
+
+  QUANTIZED_MUL_8x8to32,
+  QUANTIZED_MUL_8x8to32_REF,
+  QUANTIZED_ADD_8p8to32,
+  QUANTIZED_ADD_8p8to32_REF,
+  QUANTIZED_SIGMOID_8,
+  QUANTIZED_SIGMOID_8_REF,
+  QUANTIZED_TANH_8,
+  QUANTIZED_TANH_8_REF,
+  QUANTIZED_SOFTMAX_8,
+  QUANTIZED_SOFTMAX_8_REF,
+  QUANTIZED_LRN_8,
+  QUANTIZED_LRN_8_REF,
+  QUANTIZED_PAD2D_FRAME_8P,
+  QUANTIZED_PAD2D_FRAME_8P_REF,
+  QUANTIZED_SUB_8P8TO32,
+  QUANTIZED_SUB_8P8TO32_REF,
+  QUANTIZED_MAXIMUM_8,
+  QUANTIZED_MAXIMUM_8_REF,
+  QUANTIZED_MINIMUM_8,
+  QUANTIZED_MINIMUM_8_REF,
+
+  PAD_F,
+  SPACE_TO_BATCH_ND_F,
+  BATCH_TO_SPACE_ND_F,
+  RESIZE_BILINEAR_F,
+  CONCAT_V2_F,
+
+#else
   // With Reference
   QUANTIZEDCONV2D_8X8TO32,
   QUANTIZEDCONV2D_8X8TO32_REF,
@@ -145,37 +279,92 @@ enum class SupportedOpType {
   DECONV_F,
   QUANTIZED_DECONV_8X8TO32,
   QUANTIZED_DECONV_8X8TO32_REF,
+#endif
 
   SUPPORTED_OP_TYPE_COUNT  // TERMINATOR. DO NOT REMOVE
 };
 
-const std::unordered_map<string, SupportedOpType> OP_NAME_TO_SOC_OP_TYPE_MAP{
-    // Custom Op name
-    {"INPUT", SupportedOpType::INPUT},
-    {"OUTPUT", SupportedOpType::OUTPUT},
-    {"NoOp", SupportedOpType::NOP},
-    {IGraphTransferOpsDefinitions::FLATTEN_OP_NAME, SupportedOpType::FLATTEN},
-    // Tensorflow op name
-    {"QuantizedConv2D", SupportedOpType::QUANTIZEDCONV2D_8X8TO32},
-    {"QuantizedMatMul", SupportedOpType::QUANTIZEDMATMUL_8X8TO32},
-    {"QuantizeDownAndShrinkRange",
-     SupportedOpType::QUANTIZEDOWNANDSHRINKRANGE_32TO8},
-    {"QuantizedRelu", SupportedOpType::QUANTIZEDRELU_8},
-    {"QuantizedReluX", SupportedOpType::QUANTIZEDRELUX_8},
-    {"QuantizedMaxPool", SupportedOpType::QUANTIZEDMAXPOOL_8},
-    {"QuantizedAvgPool", SupportedOpType::QUANTIZEDAVGPOOL_8},
-    {"QuantizedConcat", SupportedOpType::QUANTIZEDCONCAT_8},
-    {"QuantizedBiasAdd", SupportedOpType::QUANTIZEDBIASADD_8P8TO32},
-    {"Min", SupportedOpType::MIN_F},
-    {"Max", SupportedOpType::MAX_F},
-    {"QuantizeV2", SupportedOpType::QUANTIZE},
-    {"Dequantize", SupportedOpType::DEQUANTIZE},
-    {"Softmax", SupportedOpType::SOFTMAX_F},
-    {"Placeholder", SupportedOpType::NOP},
-    {"RequantizationRange", SupportedOpType::REQUANTIZATION_RANGE_32},
-    {"Requantize", SupportedOpType::REQUANTIZE_32_TO_8},
+/* static */ void HexagonOpsDefinitions::EmplaceOpType(
+    const string& op_type, const DataTypeVector& dt_vec,
+    const SupportedOpType supported_op_type,
+    std::unordered_map<string, std::vector<DataTypeToOp>>* map) {
+  if (map->count(op_type) <= 0) {
+    map->emplace(op_type, std::vector<DataTypeToOp>());
+  }
+  map->at(op_type).emplace_back(
+      std::forward_as_tuple(dt_vec, supported_op_type));
+}
+
+/* static */ std::unordered_map<
+    string, std::vector<HexagonOpsDefinitions::DataTypeToOp>>
+HexagonOpsDefinitions::BuildOpNameToSocOpTypeMap() {
+  std::unordered_map<string, std::vector<DataTypeToOp>> op_map;
+  // Custom Op name
+  EmplaceOpType("INPUT", {}, SupportedOpType::INPUT, &op_map);
+  EmplaceOpType("OUTPUT", {}, SupportedOpType::OUTPUT, &op_map);
+  EmplaceOpType("NoOp", {}, SupportedOpType::NOP, &op_map);
+  EmplaceOpType(IGraphTransferOpsDefinitions::FLATTEN_OP_NAME, {},
+                SupportedOpType::FLATTEN, &op_map);
+  // Tensorflow op name
+  // CAVEAT: Keep order of SupportedOpType
+  EmplaceOpType("Identity", {}, SupportedOpType::NOP, &op_map);
+  EmplaceOpType("Placeholder", {}, SupportedOpType::NOP, &op_map);
+  EmplaceOpType("Const", {}, SupportedOpType::OP_CONST, &op_map);
+  EmplaceOpType("QuantizedConv2D", {}, SupportedOpType::QUANTIZEDCONV2D_8X8TO32,
+                &op_map);
+  EmplaceOpType("QuantizedMatMul", {}, SupportedOpType::QUANTIZEDMATMUL_8X8TO32,
+                &op_map);
+  EmplaceOpType("QuantizeDownAndShrinkRange", {},
+                SupportedOpType::QUANTIZEDOWNANDSHRINKRANGE_32TO8, &op_map);
+  EmplaceOpType("QuantizedRelu", {}, SupportedOpType::QUANTIZEDRELU_8, &op_map);
+  EmplaceOpType("QuantizedReluX", {}, SupportedOpType::QUANTIZEDRELUX_8,
+                &op_map);
+  EmplaceOpType("QuantizedMaxPool", {}, SupportedOpType::QUANTIZEDMAXPOOL_8,
+                &op_map);
+  EmplaceOpType("QuantizedAvgPool", {}, SupportedOpType::QUANTIZEDAVGPOOL_8,
+                &op_map);
+  EmplaceOpType("QuantizedConcat", {}, SupportedOpType::QUANTIZEDCONCAT_8,
+                &op_map);
+  EmplaceOpType("QuantizedBiasAdd", {},
+                SupportedOpType::QUANTIZEDBIASADD_8P8TO32, &op_map);
+  EmplaceOpType("Min", {}, SupportedOpType::MIN_F, &op_map);
+  EmplaceOpType("Max", {}, SupportedOpType::MAX_F, &op_map);
+  EmplaceOpType("QuantizeV2", {}, SupportedOpType::QUANTIZE, &op_map);
+  EmplaceOpType("Dequantize", {}, SupportedOpType::DEQUANTIZE, &op_map);
+  EmplaceOpType("Softmax", {}, SupportedOpType::SOFTMAX_F, &op_map);
+  EmplaceOpType("Reshape", {}, SupportedOpType::RESHAPE, &op_map);
+  EmplaceOpType("QuantizedReshape", {}, SupportedOpType::QUANTIZED_RESHAPE,
+                &op_map);
+  EmplaceOpType("Sigmoid", {}, SupportedOpType::SIGMOID_F, &op_map);
+  EmplaceOpType("Slice", {}, SupportedOpType::SLICE_F, &op_map);
+  EmplaceOpType("Add", {}, SupportedOpType::ADD_F, &op_map);
+  EmplaceOpType("Mul", {}, SupportedOpType::MUL_F, &op_map);
+  EmplaceOpType("Requantize", {}, SupportedOpType::REQUANTIZE_32_TO_8, &op_map);
+  EmplaceOpType("RequantizationRange", {},
+                SupportedOpType::REQUANTIZATION_RANGE_32, &op_map);
+  EmplaceOpType("Sub", {}, SupportedOpType::SUB_F, &op_map);
+  EmplaceOpType("Pack", {}, SupportedOpType::PACK_INT32, &op_map);
+  EmplaceOpType("StridedSlice", {}, SupportedOpType::STRIDED_SLICE_F, &op_map);
+  EmplaceOpType("ExpandDims", {}, SupportedOpType::EXPAND_DIMS_F, &op_map);
+#ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
+  EmplaceOpType("QuantizedMul", {}, SupportedOpType::QUANTIZED_MUL_8x8to32,
+                &op_map);
+  EmplaceOpType("Pad", {}, SupportedOpType::PAD_F, &op_map);
+  EmplaceOpType("SpaceToBatchND", {}, SupportedOpType::SPACE_TO_BATCH_ND_F,
+                &op_map),
+      EmplaceOpType("BatchToSpaceND", {}, SupportedOpType::BATCH_TO_SPACE_ND_F,
+                    &op_map);
+  EmplaceOpType("ResizeBilinear", {}, SupportedOpType::RESIZE_BILINEAR_F,
+                &op_map);
+  EmplaceOpType("ConcatV2", {}, SupportedOpType::CONCAT_V2_F, &op_map);
+  EmplaceOpType("Conv2DBackpropInput", {}, SupportedOpType::DECONV_F, &op_map);
+#endif
+  return op_map;
 };
 
+HexagonOpsDefinitions::HexagonOpsDefinitions()
+    : op_name_to_soc_op_type_map_(BuildOpNameToSocOpTypeMap()) {}
+
 /* static */ const IGraphTransferOpsDefinitions&
 HexagonOpsDefinitions::getInstance() {
   const static HexagonOpsDefinitions instance{};
@@ -186,9 +375,21 @@ int HexagonOpsDefinitions::GetTotalOpsCount() const {
   return static_cast<int>(SupportedOpType::SUPPORTED_OP_TYPE_COUNT);
 }
 
-int HexagonOpsDefinitions::GetOpIdFor(const string& op_type) const {
-  if (OP_NAME_TO_SOC_OP_TYPE_MAP.count(op_type) > 0) {
-    return static_cast<int>(OP_NAME_TO_SOC_OP_TYPE_MAP.at(op_type));
+int HexagonOpsDefinitions::GetOpIdFor(const string& op_type,
+                                      const DataTypeVector& dt_vec) const {
+  if (op_name_to_soc_op_type_map_.count(op_type) > 0) {
+    const std::vector<DataTypeToOp>& dt_to_op_vec =
+        op_name_to_soc_op_type_map_.at(op_type);
+    CHECK(!dt_to_op_vec.empty());
+    // If argument DataType is empty, return the first entry.
+    if (dt_vec.empty()) {
+      return static_cast<int>(std::get<1>(dt_to_op_vec.front()));
+    }
+    for (const DataTypeToOp& data_type_to_op : dt_to_op_vec) {
+      if (std::get<0>(data_type_to_op) == dt_vec) {
+        return static_cast<int>(std::get<1>(data_type_to_op));
+      }
+    }
   }
   return IGraphTransferOpsDefinitions::INVALID_OP_ID;
 }
diff --git a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
index eca4e16f68d4c2eef216e540e349edf88105ed2e..bd1120e1df64ca72b2a3a95d7af91fabf693af98 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_HEXAGON_OPS_DEFINITIONS_H_
 
+#include <unordered_map>
+
 #include "i_graph_transfer_ops_definitions.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
@@ -29,11 +31,25 @@ class HexagonOpsDefinitions final : public IGraphTransferOpsDefinitions {
   static const IGraphTransferOpsDefinitions& getInstance();
 
   int GetTotalOpsCount() const final;
-  int GetOpIdFor(const string& op_type) const final;
+  int GetOpIdFor(const string& op_type, const DataTypeVector& dt) const final;
   GraphTransferInfo::Destination GetTransferDestination() const final;
 
  private:
-  HexagonOpsDefinitions() = default;
+  enum class SupportedOpType;
+  using DataTypeToOp = std::tuple<DataTypeVector, SupportedOpType>;
+
+  HexagonOpsDefinitions();
+
+  static void EmplaceOpType(
+      const string& op_type, const DataTypeVector& dt_vec,
+      const SupportedOpType supported_op_type,
+      std::unordered_map<string, std::vector<DataTypeToOp>>* map);
+
+  static std::unordered_map<string, std::vector<DataTypeToOp>>
+  BuildOpNameToSocOpTypeMap();
+
+  const std::unordered_map<string, std::vector<DataTypeToOp>>
+      op_name_to_soc_op_type_map_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HexagonOpsDefinitions);
 };
diff --git a/tensorflow/core/kernels/hexagon/hexagon_rewriter_transform.cc b/tensorflow/core/kernels/hexagon/hexagon_rewriter_transform.cc
index f2b1958105bbfa7db3f9c69e7e3cfc2c01f9d230..ee548c6887e73888c8c135f912f0b49b41eda061 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_rewriter_transform.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_rewriter_transform.cc
@@ -47,7 +47,7 @@ Status RewriteQuantizedStrippedModelForHexagon(
                "graph execute op...";
   std::vector<std::pair<string, Tensor>> inputs;
   std::vector<string> outputs;
-  for (auto i = 0; i < context.input_names.size(); ++i) {
+  for (auto i = 0; static_cast<size_t>(i) < context.input_names.size(); ++i) {
     const string& input_name = context.input_names.at(i);
 
     // Get input shape
diff --git a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
index 031b2e31cc8d24c717c794bd92fd5d3ebbd5f194..3d6f493a9c163f326d094bd05071a94965990cbe 100644
--- a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
@@ -37,7 +37,8 @@ class IGraphTransferOpsDefinitions {
   // Return total ops count supported by SOC
   virtual int GetTotalOpsCount() const = 0;
   // Return op id for given string op name
-  virtual int GetOpIdFor(const string& op_name) const = 0;
+  virtual int GetOpIdFor(const string& op_name,
+                         const DataTypeVector& dt) const = 0;
   // Return destination of transfer
   virtual GraphTransferInfo::Destination GetTransferDestination() const = 0;
 
diff --git a/tensorflow/core/kernels/hinge-loss.h b/tensorflow/core/kernels/hinge-loss.h
index 36b02fcc5d67585ef7c1b3424bee1cb07b4689af..789a7ce7a3d8ec9e5d918dd75fce8d644a3b5682 100644
--- a/tensorflow/core/kernels/hinge-loss.h
+++ b/tensorflow/core/kernels/hinge-loss.h
@@ -44,7 +44,7 @@ class HingeLossUpdater : public DualLossUpdater {
                             const double current_dual, const double wx,
                             const double weighted_example_norm) const final {
     // Intutitvely there are 3 cases:
-    // a. new optimal value of the dual variable falls withing the admissible
+    // a. new optimal value of the dual variable falls within the admissible
     // range [0, 1]. In this case we set new dual to this value.
     // b. new optimal value is < 0. Then, because of convexity, the optimal
     // valid value for new dual = 0
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index 33383d16a8665af5f7abdf1810dbdfd2f21f57e1..f088315ff538e821666aa95d9a4c4ed49f7c0b59 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// This is a helper struct to package up the input and ouput
+// This is a helper struct to package up the input and output
 // parameters of an image resizer (the height, widths, etc.).  To
 // reduce code duplication and ensure consistency across the different
 // resizers, it performs the input validation.
@@ -122,7 +122,7 @@ struct ImageResizerState {
   int64 channels;
   float height_scale;
   float width_scale;
-  Tensor* output;
+  Tensor* output = nullptr;
 
  private:
   bool align_corners_;
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..880c6a7e82411bcc0b5b602639c7fb720e6ff481
--- /dev/null
+++ b/tensorflow/core/kernels/iterator_ops.cc
@@ -0,0 +1,344 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following ops.
+
+Status VerifyTypesMatch(const DataTypeVector& expected,
+                        const DataTypeVector& received) {
+  if (expected.size() != received.size()) {
+    return errors::InvalidArgument(
+        "Number of components does not match: expected ", expected.size(),
+        " types but got ", received.size(), ".");
+  }
+  for (size_t i = 0; i < expected.size(); ++i) {
+    if (expected[i] != received[i]) {
+      return errors::InvalidArgument("Data type mismatch at component ", i,
+                                     ": expected ", DataTypeString(expected[i]),
+                                     " but got ", DataTypeString(received[i]),
+                                     ".");
+    }
+  }
+  return Status::OK();
+}
+
+Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
+                              const std::vector<PartialTensorShape>& received) {
+  if (expected.size() != received.size()) {
+    return errors::InvalidArgument(
+        "Number of components does not match: expected ", expected.size(),
+        " shapes but got ", received.size(), ".");
+  }
+  for (size_t i = 0; i < expected.size(); ++i) {
+    if (!expected[i].IsCompatibleWith(received[i])) {
+      return errors::InvalidArgument("Incompatible shapes at component ", i,
+                                     ": expected ", expected[i].DebugString(),
+                                     " but got ", received[i].DebugString(),
+                                     ".");
+    }
+  }
+
+  return Status::OK();
+}
+
+class IteratorResource : public ResourceBase {
+ public:
+  IteratorResource(const DataTypeVector& output_dtypes,
+                   const std::vector<PartialTensorShape>& output_shapes)
+      : iterator_(nullptr),
+        output_dtypes_(output_dtypes),
+        output_shapes_(output_shapes) {}
+
+  Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                 bool* end_of_sequence) {
+    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
+    if (captured_iterator) {
+      return captured_iterator->GetNext(ctx, out_tensors, end_of_sequence);
+    } else {
+      return errors::FailedPrecondition(
+          "GetNext() failed because the iterator has not been initialized. "
+          "Ensure that you have run the initializer operation for this "
+          "iterator before getting the next element.");
+    }
+  }
+
+  // Transfers ownership of iterator to this. This method is thread-safe.
+  Status set_iterator(std::unique_ptr<IteratorBase> iterator) {
+    if (iterator) {
+      TF_RETURN_IF_ERROR(
+          VerifyTypesMatch(output_dtypes_, iterator->output_dtypes()));
+      TF_RETURN_IF_ERROR(
+          VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
+    }
+    iterator_.reset(iterator.release());
+    return Status::OK();
+  }
+
+  string DebugString() override { return "Iterator resource"; }
+
+  const DataTypeVector& output_dtypes() const { return output_dtypes_; }
+
+  const std::vector<PartialTensorShape>& output_shapes() const {
+    return output_shapes_;
+  }
+
+ private:
+  std::shared_ptr<IteratorBase> iterator_;
+  const DataTypeVector output_dtypes_;
+  const std::vector<PartialTensorShape> output_shapes_;
+};
+
+// TODO(mrry): Can we simply use the template kernel here?
+class IteratorHandleOp : public ResourceOpKernel<IteratorResource> {
+ public:
+  explicit IteratorHandleOp(OpKernelConstruction* ctx)
+      : ResourceOpKernel<IteratorResource>(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ private:
+  Status CreateResource(IteratorResource** ret) override
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    *ret = new IteratorResource(output_dtypes_, output_shapes_);
+    return Status::OK();
+  }
+
+  Status VerifyResource(IteratorResource* resource) override {
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_dtypes_, resource->output_dtypes()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
+    return Status::OK();
+  }
+
+ private:
+  DataTypeVector output_dtypes_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+class MakeIteratorOp : public OpKernel {
+ public:
+  explicit MakeIteratorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* dataset;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &dataset));
+    core::ScopedUnref unref_dataset(dataset);
+    IteratorResource* iterator_resource;
+    OP_REQUIRES_OK(
+        ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &iterator_resource));
+    OP_REQUIRES_OK(ctx,
+                   iterator_resource->set_iterator(dataset->MakeIterator()));
+    iterator_resource->Unref();
+  }
+};
+
+class OneShotIteratorOp : public OpKernel {
+ public:
+  explicit OneShotIteratorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    string shared_name;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &shared_name));
+    OP_REQUIRES(ctx, shared_name.empty(),
+                errors::InvalidArgument("OneShotIteratorOp does not currently "
+                                        "support the 'shared_name' attr."));
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetAttr("dataset_factory", &dataset_factory_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  ~OneShotIteratorOp() override {
+    if (iterator_resource_ != nullptr) {
+      iterator_resource_->Unref();
+      if (!cinfo_.resource_manager()
+               ->Delete<IteratorResource>(cinfo_.container(), cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource can have been deleted by session resets.
+      }
+    }
+  }
+
+  // NOTE(mrry): This is based on `ResourceOpKernel<T>::Compute()`,
+  // but due to the fact that `ResourceOpKernel<T>::CreateResource()`
+  // does not provide access to the `OpKernelContext*` and we need this
+  // to invoke the factory function, it's not possible to implement
+  // this kernel by implementing `CreateResource()`.
+  void Compute(OpKernelContext* ctx) override {
+    mutex_lock l(mu_);
+    if (iterator_resource_ == nullptr) {
+      ResourceMgr* mgr = ctx->resource_manager();
+      OP_REQUIRES_OK(ctx, cinfo_.Init(mgr, def()));
+
+      // Create an IteratorResource that will hold the iterator for this op.
+      IteratorResource* resource;
+      OP_REQUIRES_OK(
+          ctx,
+          mgr->LookupOrCreate<IteratorResource>(
+              cinfo_.container(), cinfo_.name(), &resource,
+              [this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                *ret = new IteratorResource(output_dtypes_, output_shapes_);
+                return Status::OK();
+              }));
+      Status s = VerifyTypesMatch(output_dtypes_, resource->output_dtypes());
+      s.Update(
+          VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
+      if (TF_PREDICT_FALSE(!s.ok())) {
+        resource->Unref();
+        ctx->SetStatus(s);
+        return;
+      }
+      iterator_resource_ = resource;
+
+      // Call the dataset_factory_func_ to create a new dataset,
+      // over which this op will iterate.
+      FunctionLibraryRuntime::Handle f_handle;
+      OP_REQUIRES_OK(ctx,
+                     ctx->function_library()->Instantiate(
+                         dataset_factory_func_->name(),
+                         AttrSlice(&dataset_factory_func_->attr()), &f_handle));
+      FunctionLibraryRuntime::Options opts;
+      opts.cancellation_manager = ctx->cancellation_manager();
+      // Choose a step ID that is guaranteed not to clash with any
+      // Session-generated step ID. DirectSession only generates
+      // non-negative step IDs (contiguous, starting from 0), and
+      // MasterSession generates 56-bit random step IDs whose MSB is
+      // always 0, so a negative random step ID should suffice.
+      opts.step_id = -std::abs(static_cast<int64>(random::New64()));
+      ScopedStepContainer step_container(
+          opts.step_id, [ctx](const string& name) {
+            ctx->resource_manager()->Cleanup(name).IgnoreError();
+          });
+      opts.step_container = &step_container;
+      opts.runner = ctx->runner();
+      Notification n;
+      Status factory_status;
+      std::vector<Tensor> return_values;
+      ctx->function_library()->Run(opts, f_handle, {}, &return_values,
+                                   [&n, &factory_status](Status s) {
+                                     factory_status.Update(s);
+                                     n.Notify();
+                                   });
+      n.WaitForNotification();
+      OP_REQUIRES_OK(ctx, std::move(factory_status));
+      OP_REQUIRES(
+          ctx,
+          return_values.size() == 1 &&
+              return_values[0].dtype() == DT_RESOURCE &&
+              TensorShapeUtils::IsScalar(return_values[0].shape()),
+          errors::InvalidArgument("The `dataset_factory` function must return "
+                                  "a single scalar of dtype DT_RESOURCE."));
+
+      // Retrieve the dataset that was created in the factory function.
+      DatasetBase* dataset;
+      const ResourceHandle& dataset_resource =
+          return_values[0].flat<ResourceHandle>()(0);
+      OP_REQUIRES_OK(ctx, LookupResource(ctx, dataset_resource, &dataset));
+      core::ScopedUnref unref_dataset(dataset);
+
+      // Create an iterator for the dataset that was created in the
+      // factory function. This transfers ownership of the dataset to
+      // the iterator, so we can delete it from the resource manager.
+      OP_REQUIRES_OK(ctx,
+                     iterator_resource_->set_iterator(dataset->MakeIterator()));
+      OP_REQUIRES_OK(ctx, DeleteResource<DatasetBase>(ctx, dataset_resource));
+    }
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<IteratorResource>(
+        ctx, cinfo_.container(), cinfo_.name());
+  }
+
+ private:
+  const NameAttrList* dataset_factory_func_;
+  DataTypeVector output_dtypes_;
+  std::vector<PartialTensorShape> output_shapes_;
+
+  mutex mu_;
+  ContainerInfo cinfo_ GUARDED_BY(mu_);
+  IteratorResource* iterator_resource_ = nullptr;
+};
+
+class IteratorGetNextOp : public OpKernel {
+ public:
+  explicit IteratorGetNextOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // TODO(mrry): Convert this to an async op, because
+  // `iterator->GetNext()` could trigger long-running operations
+  // (e.g. a QueueDequeue or a remote read).
+  void Compute(OpKernelContext* ctx) override {
+    IteratorResource* iterator;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
+    core::ScopedUnref unref_iterator(iterator);
+
+    std::vector<Tensor> components;
+    bool end_of_sequence;
+
+    IteratorContext::Params params;
+    params.env = ctx->env();
+    params.step_id = ctx->step_id();
+    params.resource_manager = ctx->resource_manager();
+    params.runner = *(ctx->runner());
+    IteratorContext iter_ctx(std::move(params));
+
+    OP_REQUIRES_OK(ctx,
+                   iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
+    OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
+
+    for (int i = 0; i < components.size(); ++i) {
+      // TODO(mrry): Check that the shapes match the shape attrs.
+      ctx->set_output(i, components[i]);
+    }
+  }
+};
+
+class IteratorDisposeOp : public OpKernel {
+ public:
+  explicit IteratorDisposeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    IteratorResource* iterator;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
+    core::ScopedUnref unref_iterator(iterator);
+    OP_REQUIRES_OK(ctx, iterator->set_iterator(nullptr));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
+REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
+                        MakeIteratorOp);
+REGISTER_KERNEL_BUILDER(Name("OneShotIterator").Device(DEVICE_CPU),
+                        OneShotIteratorOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU),
+                        IteratorGetNextOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorDispose").Device(DEVICE_CPU),
+                        IteratorDisposeOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/linalg_ops_common.cc b/tensorflow/core/kernels/linalg_ops_common.cc
index dc001857d7bb755b32e8450665fd4d8173460708..36907fb5716fcde3b0efc28cc4edca543432c8f4 100644
--- a/tensorflow/core/kernels/linalg_ops_common.cc
+++ b/tensorflow/core/kernels/linalg_ops_common.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/linalg_ops_common.h"
 
+#include <utility>
+
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
@@ -106,6 +108,7 @@ void LinearAlgebraOp<Scalar>::Compute(OpKernelContext* context) {
   auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
   Shard(worker_threads.num_threads, worker_threads.workers,
         batch_shape.num_elements(), GetCostPerUnit(input_matrix_shapes), shard);
+
 }
 
 template <typename Scalar>
@@ -145,10 +148,9 @@ void LinearAlgebraOp<Scalar>::AnalyzeInputs(OpKernelContext* context,
     const int col_dimension = input_rank - 1;
     const int64 num_rows = in.dim_size(row_dimension);
     const int64 num_cols = in.dim_size(col_dimension);
-    // TODO(rmlarsen): Use emplace_back when it is added to InlinedVector. Same
-    // in several places below.
-    input_matrix_shapes->push_back(TensorShape({num_rows, num_cols}));
-    inputs->push_back(&in);
+    input_matrix_shapes->emplace_back(
+        std::initializer_list<int64>({num_rows, num_cols}));
+    inputs->emplace_back(&in);
   }
   // Have the derived class validate that the inputs are as expected.
   ValidateInputMatrixShapes(context, *input_matrix_shapes);
@@ -190,9 +192,7 @@ void LinearAlgebraOp<Scalar>::PrepareOutputs(
       // concatenated with the output_matrix_shape (if the output is not
       // scalar).
       output_tensor_shape = batch_shape;
-      for (int dim = 0; dim < output_matrix_shape.dims(); ++dim) {
-        output_tensor_shape.AddDim(output_matrix_shape.dim_size(dim));
-      }
+      output_tensor_shape.AppendShape(output_matrix_shape);
     }
     Tensor* out = nullptr;
     // See if there is an input buffer we can reuse for this output.
@@ -211,7 +211,7 @@ void LinearAlgebraOp<Scalar>::PrepareOutputs(
       OP_REQUIRES_OK(context, context->allocate_output(
                                   output_idx, output_tensor_shape, &out));
     }
-    outputs->push_back(out);
+    outputs->emplace_back(out);
   }
 }
 
@@ -224,11 +224,10 @@ void LinearAlgebraOp<Scalar>::ComputeTensorSlice(
   for (size_t i = 0; i < inputs.size(); ++i) {
     // TODO(kalakris): Handle alignment if possible. Eigen::Map is
     // unaligned by default.
-    matrix_inputs.push_back(
-        ConstMatrixMap(inputs[i]->flat<Scalar>().data() +
-                           matrix_index * input_matrix_shapes[i].num_elements(),
-                       input_matrix_shapes[i].dim_size(0),
-                       input_matrix_shapes[i].dim_size(1)));
+    matrix_inputs.emplace_back(
+        inputs[i]->flat<Scalar>().data() +
+            matrix_index * input_matrix_shapes[i].num_elements(),
+        input_matrix_shapes[i].dim_size(0), input_matrix_shapes[i].dim_size(1));
   }
 
   MatrixMaps matrix_outputs;
@@ -240,10 +239,10 @@ void LinearAlgebraOp<Scalar>::ComputeTensorSlice(
     int num_output_cols = output_matrix_shapes[i].dims() == 2
                               ? output_matrix_shapes[i].dim_size(1)
                               : 1;
-    matrix_outputs.push_back(
-        MatrixMap(outputs[i]->flat<Scalar>().data() +
-                      matrix_index * output_matrix_shapes[i].num_elements(),
-                  num_output_rows, num_output_cols));
+    matrix_outputs.emplace_back(
+        outputs[i]->flat<Scalar>().data() +
+            matrix_index * output_matrix_shapes[i].num_elements(),
+        num_output_rows, num_output_cols);
   }
   ComputeMatrix(context, matrix_inputs, &matrix_outputs);
 }
diff --git a/tensorflow/core/kernels/linalg_ops_common.h b/tensorflow/core/kernels/linalg_ops_common.h
index ab4142ac9323583e87cf864492ee51bb208bce98..1d31786728f5c4aac023d7c4ef1e347577267110 100644
--- a/tensorflow/core/kernels/linalg_ops_common.h
+++ b/tensorflow/core/kernels/linalg_ops_common.h
@@ -21,10 +21,7 @@ limitations under the License.
 // computations across different threads if necessary.
 #include <algorithm>
 
-#define EIGEN_USE_THREADS
-
 #include "third_party/eigen3/Eigen/Core"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -133,9 +130,8 @@ class LinearAlgebraOp : public OpKernel {
  private:
   using TensorInputs = gtl::InlinedVector<const Tensor*, 4>;
   using TensorOutputs = gtl::InlinedVector<Tensor*, 4>;
-
-  // This function maps slices (matrices) of the input and output tensors using
-  // Eigen::Map and calls ComputeMatrix implemented in terms of the
+  // This function maps 2-d slices (matrices) of the input and output tensors
+  // using Eigen::Map and calls ComputeMatrix implemented in terms of the
   // Eigen::MatrixBase API by the derived class.
   //
   // The 'matrix_index' parameter specifies the index of the matrix to be used
@@ -167,8 +163,8 @@ class LinearAlgebraOp : public OpKernel {
                       TensorShapes* output_matrix_shapes);
 };
 
-// Declare that LinearAlgebraOp is explicitly instantiated in
-// linalg_ops_common.cc for float and double.
+// Declare LinearAlgebraOp, which is explicitly instantiated in
+// linalg_ops_common.cc for float, double, complex64, and complex128.
 extern template class LinearAlgebraOp<float>;
 extern template class LinearAlgebraOp<double>;
 extern template class LinearAlgebraOp<complex64>;
@@ -176,8 +172,25 @@ extern template class LinearAlgebraOp<complex128>;
 
 }  // namespace tensorflow
 
-#define REGISTER_LINALG_OP(OpName, OpClass, Scalar) \
-  REGISTER_KERNEL_BUILDER(                          \
+#define INHERIT_LINALG_TYPEDEFS(Scalar)                   \
+  typedef LinearAlgebraOp<Scalar> Base;                   \
+  using Matrix = typename Base::Matrix;                   \
+  using MatrixMap = typename Base::MatrixMap;             \
+  using MatrixMaps = typename Base::MatrixMaps;           \
+  using ConstMatrixMap = typename Base::ConstMatrixMap;   \
+  using ConstMatrixMaps = typename Base::ConstMatrixMaps; \
+  using TensorShapes = typename Base::TensorShapes;
+
+#define REGISTER_LINALG_OP_CPU(OpName, OpClass, Scalar) \
+  REGISTER_KERNEL_BUILDER(                              \
       Name(OpName).Device(DEVICE_CPU).TypeConstraint<Scalar>("T"), OpClass)
 
+#define REGISTER_LINALG_OP_GPU(OpName, OpClass, Scalar) \
+  REGISTER_KERNEL_BUILDER(                              \
+      Name(OpName).Device(DEVICE_GPU).TypeConstraint<Scalar>("T"), OpClass)
+
+// Deprecated, use one of the device-specific macros above.
+#define REGISTER_LINALG_OP(OpName, OpClass, Scalar) \
+  REGISTER_LINALG_OP_CPU(OpName, OpClass, Scalar)
+
 #endif  // TENSORFLOW_KERNELS_LINALG_OPS_COMMON_H_
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
index a46e6f762b48b6fcb18c7a37d90d6ad20b36e6ad..ada6fe8d95045040191d48808f6ff11ad2435322 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.cc
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #define EIGEN_USE_THREADS
 
+#include "tensorflow/core/kernels/lookup_table_init_op.h"
+
 #include <algorithm>
 #include <memory>
 #include <string>
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/initializable_lookup_table.h"
 #include "tensorflow/core/kernels/lookup_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -304,6 +304,8 @@ class TextFileLineIterator
   TF_DISALLOW_COPY_AND_ASSIGN(TextFileLineIterator);
 };
 
+}  // namespace
+
 // Helper function to initialize an InitializableLookupTable from a text file.
 Status InitializeTableFromTextFile(const string& filename, int64 vocab_size,
                                    char delimiter, int32 key_index,
@@ -349,7 +351,6 @@ Status InitializeTableFromTextFile(const string& filename, int64 vocab_size,
   return s;
 }
 
-}  // namespace
 }  // namespace lookup
 
 // Kernel to initialize a look table given a key and value tensors.
@@ -366,7 +367,9 @@ class InitializeTableOp : public OpKernel {
                    GetInitializableLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(),
                                       table->value_dtype()};
     DataTypeVector expected_outputs = {};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
@@ -407,6 +410,8 @@ class InitializeTableOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("InitializeTable").Device(DEVICE_CPU),
                         InitializeTableOp);
+REGISTER_KERNEL_BUILDER(Name("InitializeTableV2").Device(DEVICE_CPU),
+                        InitializeTableOp);
 
 // Kernel to initialize a lookup table from a text file.
 //
@@ -432,7 +437,9 @@ class InitializeTableFromTextFileOp : public OpKernel {
                    GetInitializableLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, DT_STRING};
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, DT_STRING};
     DataTypeVector expected_outputs = {};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
 
@@ -471,5 +478,8 @@ class InitializeTableFromTextFileOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("InitializeTableFromTextFile").Device(DEVICE_CPU),
                         InitializeTableFromTextFileOp);
+REGISTER_KERNEL_BUILDER(
+    Name("InitializeTableFromTextFileV2").Device(DEVICE_CPU),
+    InitializeTableFromTextFileOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_init_op.h b/tensorflow/core/kernels/lookup_table_init_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..177a26daa8ab6cf30c5f73395d9f52f602eb5734
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_table_init_op.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_LOOKUP_TABLE_INIT_OP_H_
+#define TENSORFLOW_KERNELS_LOOKUP_TABLE_INIT_OP_H_
+
+#include "tensorflow/core/kernels/initializable_lookup_table.h"
+
+namespace tensorflow {
+namespace lookup {
+
+// Helper function to initialize an InitializableLookupTable from a text file.
+Status InitializeTableFromTextFile(const string& filename, int64 vocab_size,
+                                   char delimiter, int32 key_index,
+                                   int32 value_index, Env* env,
+                                   InitializableLookupTable* table);
+
+}  // namespace lookup
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_LOOKUP_TABLE_INIT_OP_H_
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index 41a254305473f017f07114114b953cfd7bf03b4b..11ce2a71dcb5f60f2c5274120cacb186f2076424 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -22,126 +22,12 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/initializable_lookup_table.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
 namespace tensorflow {
 namespace lookup {
-namespace {
-
-// Ensure that the compiler cannot elide a copy into a local, for
-// bounds checking on source tensors that might be updated asynchronously for
-// integral types. However non-integer variables are not allowed and therefore
-// the local copy is unnecessary.
-template <typename T>
-T SubtleMustCopyUnlessStringOrFloat(const T& value) {
-  return internal::SubtleMustCopy(value);
-}
-
-const string& SubtleMustCopyUnlessStringOrFloat(const string& value) {
-  return value;
-}
-
-const float SubtleMustCopyUnlessStringOrFloat(const float value) {
-  return value;
-}
-
-const double SubtleMustCopyUnlessStringOrFloat(const double value) {
-  return value;
-}
-
-}  // namespace
-
-// Lookup table that wraps an unordered_map, where the key and value data type
-// is specified.
-//
-// This table is recommended for any variations to key values.
-//
-// For look up, the table is required to be initialized (allocated
-// and populated). Once the table is marked as initialized it becomes read-only.
-//
-// Sample use case:
-//
-// HashTable<int64, int64> table;  // int64 -> int64.
-// table.Prepare(10); // Prepare the underlying data structure, the number of
-//                    // elements is required by interface, but not used.
-// // Populate the table, elements could be added in one or multiple calls.
-// table.Insert(key_tensor, value_tensor); // Populate the table.
-// ...
-// table.set_is_initialized();
-//
-// table.Find(in_t, &out_t, default_t)
-//
-template <class K, class V>
-class HashTable : public InitializableLookupTable {
- public:
-  HashTable(OpKernelContext* ctx, OpKernel* kernel) {}
-
-  size_t size() const override {
-    // return the size of the table only if it's initialized, otherwise 0.
-    if (!is_initialized_) {
-      return 0;
-    }
-    std::atomic_thread_fence(std::memory_order_acquire);
-    return table_ ? table_->size() : 0;
-  }
-
-  DataType key_dtype() const override { return DataTypeToEnum<K>::v(); }
-
-  DataType value_dtype() const override { return DataTypeToEnum<V>::v(); }
-
- protected:
-  Status DoPrepare(size_t unused) override {
-    if (is_initialized_) {
-      return errors::Aborted("HashTable already initialized.");
-    }
-    if (!table_) {
-      table_ = std::unique_ptr<std::unordered_map<K, V>>(
-          new std::unordered_map<K, V>());
-    }
-    return Status::OK();
-  };
-
-  Status DoInsert(const Tensor& keys, const Tensor& values) override {
-    if (!table_) {
-      return errors::FailedPrecondition("HashTable is not prepared.");
-    }
-
-    const auto key_values = keys.flat<K>();
-    const auto value_values = values.flat<V>();
-    for (int64 i = 0; i < key_values.size(); ++i) {
-      const K key = SubtleMustCopyUnlessStringOrFloat(key_values(i));
-      const V value = SubtleMustCopyUnlessStringOrFloat(value_values(i));
-      const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value);
-      if (previous_value != value) {
-        return errors::FailedPrecondition(
-            "HashTable has different value for same key. Key ", key, " has ",
-            previous_value, " and trying to add value ", value);
-      }
-    }
-    return Status::OK();
-  }
-
-  Status DoFind(const Tensor& key, Tensor* value,
-                const Tensor& default_value) override {
-    const V default_val = default_value.flat<V>()(0);
-    const auto key_values = key.flat<K>();
-    auto value_values = value->flat<V>();
-
-    for (int64 i = 0; i < key_values.size(); ++i) {
-      value_values(i) = gtl::FindWithDefault(
-          *table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
-    }
-    return Status::OK();
-  }
-
- private:
-  std::unique_ptr<std::unordered_map<K, V>> table_;
-};
 
 // Lookup table that wraps an unordered_map, where the key and value data type
 // is specified. Each individual value must be a scalar. If vector values are
@@ -738,7 +624,10 @@ class LookupTableFindOp : public OpKernel {
     OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+    // Input 0 could be a STRING_REF or a RESOURCE
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(),
                                       table->value_dtype()};
     DataTypeVector expected_outputs = {table->value_dtype()};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, expected_outputs));
@@ -761,6 +650,8 @@ class LookupTableFindOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableFind").Device(DEVICE_CPU),
                         LookupTableFindOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableFindV2").Device(DEVICE_CPU),
+                        LookupTableFindOp);
 
 // Table insert op.
 class LookupTableInsertOp : public OpKernel {
@@ -772,7 +663,9 @@ class LookupTableInsertOp : public OpKernel {
     OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(),
                                       table->value_dtype()};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {}));
 
@@ -794,6 +687,8 @@ class LookupTableInsertOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableInsert").Device(DEVICE_CPU),
                         LookupTableInsertOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableInsertV2").Device(DEVICE_CPU),
+                        LookupTableInsertOp);
 
 // Op that returns the size of the given table.
 class LookupTableSizeOp : public OpKernel {
@@ -813,6 +708,8 @@ class LookupTableSizeOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableSize").Device(DEVICE_CPU),
                         LookupTableSizeOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableSizeV2").Device(DEVICE_CPU),
+                        LookupTableSizeOp);
 
 // Op that outputs tensors of all keys and all values.
 class LookupTableExportOp : public OpKernel {
@@ -830,6 +727,8 @@ class LookupTableExportOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableExport").Device(DEVICE_CPU),
                         LookupTableExportOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableExportV2").Device(DEVICE_CPU),
+                        LookupTableExportOp);
 
 // Clear the table and insert data.
 class LookupTableImportOp : public OpKernel {
@@ -841,7 +740,9 @@ class LookupTableImportOp : public OpKernel {
     OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
     core::ScopedUnref unref_me(table);
 
-    DataTypeVector expected_inputs = {DT_STRING_REF, table->key_dtype(),
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype(),
                                       table->value_dtype()};
     OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {}));
 
@@ -863,6 +764,8 @@ class LookupTableImportOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("LookupTableImport").Device(DEVICE_CPU),
                         LookupTableImportOp);
+REGISTER_KERNEL_BUILDER(Name("LookupTableImportV2").Device(DEVICE_CPU),
+                        LookupTableImportOp);
 
 // Register the HashTable op with the currently supported key and value types.
 #define REGISTER_KERNEL(key_dtype, value_dtype)                           \
@@ -871,6 +774,13 @@ REGISTER_KERNEL_BUILDER(Name("LookupTableImport").Device(DEVICE_CPU),
           .Device(DEVICE_CPU)                                             \
           .TypeConstraint<key_dtype>("key_dtype")                         \
           .TypeConstraint<value_dtype>("value_dtype"),                    \
+      LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \
+                    value_dtype>)                                         \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("HashTableV2")                                                 \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<key_dtype>("key_dtype")                         \
+          .TypeConstraint<value_dtype>("value_dtype"),                    \
       LookupTableOp<lookup::HashTable<key_dtype, value_dtype>, key_dtype, \
                     value_dtype>)
 
@@ -892,6 +802,13 @@ REGISTER_KERNEL(string, bool);
           .Device(DEVICE_CPU)                                                  \
           .TypeConstraint<key_dtype>("key_dtype")                              \
           .TypeConstraint<value_dtype>("value_dtype"),                         \
+      LookupTableOp<lookup::MutableHashTableOfScalars<key_dtype, value_dtype>, \
+                    key_dtype, value_dtype>)                                   \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("MutableHashTableV2")                                               \
+          .Device(DEVICE_CPU)                                                  \
+          .TypeConstraint<key_dtype>("key_dtype")                              \
+          .TypeConstraint<value_dtype>("value_dtype"),                         \
       LookupTableOp<lookup::MutableHashTableOfScalars<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
@@ -910,6 +827,13 @@ REGISTER_KERNEL(int64, float);
           .Device(DEVICE_CPU)                                                  \
           .TypeConstraint<key_dtype>("key_dtype")                              \
           .TypeConstraint<value_dtype>("value_dtype"),                         \
+      LookupTableOp<lookup::MutableHashTableOfTensors<key_dtype, value_dtype>, \
+                    key_dtype, value_dtype>)                                   \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("MutableHashTableOfTensorsV2")                                      \
+          .Device(DEVICE_CPU)                                                  \
+          .TypeConstraint<key_dtype>("key_dtype")                              \
+          .TypeConstraint<value_dtype>("value_dtype"),                         \
       LookupTableOp<lookup::MutableHashTableOfTensors<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
@@ -927,6 +851,13 @@ REGISTER_KERNEL(string, bool);
           .Device(DEVICE_CPU)                                              \
           .TypeConstraint<key_dtype>("key_dtype")                          \
           .TypeConstraint<value_dtype>("value_dtype"),                     \
+      LookupTableOp<lookup::MutableDenseHashTable<key_dtype, value_dtype>, \
+                    key_dtype, value_dtype>)                               \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("MutableDenseHashTableV2")                                      \
+          .Device(DEVICE_CPU)                                              \
+          .TypeConstraint<key_dtype>("key_dtype")                          \
+          .TypeConstraint<value_dtype>("value_dtype"),                     \
       LookupTableOp<lookup::MutableDenseHashTable<key_dtype, value_dtype>, \
                     key_dtype, value_dtype>)
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 41123a300a378785f88a2986e13e667674f65f5c..ff23a09a24f3c291aaec546577ead757e3eaa422 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -21,9 +21,11 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/lookup_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
@@ -49,40 +51,52 @@ class LookupTableOp : public OpKernel {
   // ctx is not owned by this function.
   void Compute(OpKernelContext* ctx) override {
     mutex_lock l(mu_);
+
     if (!table_handle_set_) {
       OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
                                       use_node_name_sharing_));
-      auto creator = [ctx, this](lookup::LookupInterface** ret) {
-        lookup::LookupInterface* container = new Container(ctx, this);
-        if (!ctx->status().ok()) {
-          container->Unref();
-          return ctx->status();
-        }
-        if (ctx->track_allocations()) {
-          ctx->record_device_persistent_memory_allocation(
-              container->MemoryUsed());
-        }
-        *ret = container;
-        return Status::OK();
-      };
-
-      lookup::LookupInterface* table = nullptr;
-      OP_REQUIRES_OK(
-          ctx, cinfo_.resource_manager()
-                   ->template LookupOrCreate<lookup::LookupInterface>(
-                       cinfo_.container(), cinfo_.name(), &table, creator));
-      core::ScopedUnref unref_me(table);
-
-      OP_REQUIRES_OK(ctx, lookup::CheckTableDataTypes(
-                              *table, DataTypeToEnum<key_dtype>::v(),
-                              DataTypeToEnum<value_dtype>::v(), cinfo_.name()));
-
-      auto h = table_handle_.AccessTensor(ctx)->template flat<string>();
-      h(0) = cinfo_.container();
-      h(1) = cinfo_.name();
-      table_handle_set_ = true;
     }
-    ctx->set_output_ref(0, &mu_, table_handle_.AccessTensor(ctx));
+
+    auto creator = [ctx, this](lookup::LookupInterface** ret) {
+      lookup::LookupInterface* container = new Container(ctx, this);
+      if (!ctx->status().ok()) {
+        container->Unref();
+        return ctx->status();
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_host_persistent_memory_allocation(
+            container->MemoryUsed() + table_handle_.AllocatedBytes());
+      }
+      *ret = container;
+      return Status::OK();
+    };
+
+    lookup::LookupInterface* table = nullptr;
+    OP_REQUIRES_OK(ctx,
+                   cinfo_.resource_manager()
+                       ->template LookupOrCreate<lookup::LookupInterface>(
+                           cinfo_.container(), cinfo_.name(), &table, creator));
+    core::ScopedUnref unref_me(table);
+
+    OP_REQUIRES_OK(ctx, lookup::CheckTableDataTypes(
+                            *table, DataTypeToEnum<key_dtype>::v(),
+                            DataTypeToEnum<value_dtype>::v(), cinfo_.name()));
+
+    if (ctx->expected_output_dtype(0) == DT_RESOURCE) {
+      Tensor* handle;
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+      handle->scalar<ResourceHandle>()() =
+          MakeResourceHandle<lookup::LookupInterface>(ctx, cinfo_.container(),
+                                                      cinfo_.name());
+    } else {
+      if (!table_handle_set_) {
+        auto h = table_handle_.AccessTensor(ctx)->template flat<string>();
+        h(0) = cinfo_.container();
+        h(1) = cinfo_.name();
+      }
+      ctx->set_output_ref(0, &mu_, table_handle_.AccessTensor(ctx));
+    }
+    table_handle_set_ = true;
   }
 
   ~LookupTableOp() override {
@@ -104,6 +118,128 @@ class LookupTableOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(LookupTableOp);
 };
 
+namespace lookup {
+
+// Ensure that the compiler cannot elide a copy into a local, for
+// bounds checking on source tensors that might be updated asynchronously for
+// integral types. However non-integer variables are not allowed and therefore
+// the local copy is unnecessary.
+template <typename T>
+T SubtleMustCopyUnlessStringOrFloat(const T& value) {
+  return internal::SubtleMustCopy(value);
+}
+
+inline const string& SubtleMustCopyUnlessStringOrFloat(const string& value) {
+  return value;
+}
+
+inline const float SubtleMustCopyUnlessStringOrFloat(const float value) {
+  return value;
+}
+
+inline const double SubtleMustCopyUnlessStringOrFloat(const double value) {
+  return value;
+}
+
+// Lookup table that wraps an unordered_map, where the key and value data type
+// is specified.
+//
+// This table is recommended for any variations to key values.
+//
+// For look up, the table is required to be initialized (allocated
+// and populated). Once the table is marked as initialized it becomes read-only.
+//
+// Sample use case:
+//
+// HashTable<int64, int64> table;  // int64 -> int64.
+// table.Prepare(10); // Prepare the underlying data structure, the number of
+//                    // elements is required by interface, but not used.
+// // Populate the table, elements could be added in one or multiple calls.
+// table.Insert(key_tensor, value_tensor); // Populate the table.
+// ...
+// table.set_is_initialized();
+//
+// table.Find(in_t, &out_t, default_t)
+//
+template <class K, class V>
+class HashTable : public InitializableLookupTable {
+ public:
+  HashTable(OpKernelContext* ctx, OpKernel* kernel) {}
+
+  size_t size() const override {
+    // return the size of the table only if it's initialized, otherwise 0.
+    if (!is_initialized_) {
+      return 0;
+    }
+    std::atomic_thread_fence(std::memory_order_acquire);
+    return table_ ? table_->size() : 0;
+  }
+
+  DataType key_dtype() const override { return DataTypeToEnum<K>::v(); }
+
+  DataType value_dtype() const override { return DataTypeToEnum<V>::v(); }
+
+ protected:
+  Status DoPrepare(size_t unused) override {
+    if (is_initialized_) {
+      return errors::Aborted("HashTable already initialized.");
+    }
+    if (!table_) {
+      table_ = std::unique_ptr<std::unordered_map<K, V>>(
+          new std::unordered_map<K, V>());
+    }
+    return Status::OK();
+  };
+
+  Status DoInsert(const Tensor& keys, const Tensor& values) override {
+    if (!table_) {
+      return errors::FailedPrecondition("HashTable is not prepared.");
+    }
+
+    const auto key_values = keys.flat<K>();
+    const auto value_values = values.flat<V>();
+    for (int64 i = 0; i < key_values.size(); ++i) {
+      const K key = SubtleMustCopyUnlessStringOrFloat(key_values(i));
+      const V value = SubtleMustCopyUnlessStringOrFloat(value_values(i));
+      const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value);
+      if (previous_value != value) {
+        return errors::FailedPrecondition(
+            "HashTable has different value for same key. Key ", key, " has ",
+            previous_value, " and trying to add value ", value);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status DoFind(const Tensor& key, Tensor* value,
+                const Tensor& default_value) override {
+    const V default_val = default_value.flat<V>()(0);
+    const auto key_values = key.flat<K>();
+    auto value_values = value->flat<V>();
+
+    for (int64 i = 0; i < key_values.size(); ++i) {
+      value_values(i) = gtl::FindWithDefault(
+          *table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
+          default_val);
+    }
+    return Status::OK();
+  }
+
+  int64 MemoryUsed() const override {
+    if (table_) {
+      const int64 num_elements = table_->size();
+      return num_elements * (sizeof(K) + sizeof(V));
+    } else {
+      return 0;
+    }
+  }
+
+ private:
+  std::unique_ptr<std::unordered_map<K, V>> table_;
+};
+
+}  // namespace lookup
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_LOOKUP_TABLE_OP_H_
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index f87ce0e6b206cd0066208fefd3cfa9ad18128cb6..d0f269be231500700c6084437fc7783a25d77960 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -49,26 +49,48 @@ Status GetLookupTable(const string& input_name, OpKernelContext* ctx,
                       LookupInterface** table) {
   string container;
   string table_handle;
-  TF_RETURN_IF_ERROR(
-      GetTableHandle(input_name, ctx, &container, &table_handle));
-  return ctx->resource_manager()->Lookup(container, table_handle, table);
+  DataType handle_dtype;
+  TF_RETURN_IF_ERROR(ctx->input_dtype(input_name, &handle_dtype));
+  if (handle_dtype == DT_RESOURCE) {
+    ResourceHandle handle;
+    TF_RETURN_IF_ERROR(HandleFromInput(ctx, input_name, &handle));
+    return LookupResource(ctx, handle, table);
+  } else {
+    TF_RETURN_IF_ERROR(
+        GetTableHandle(input_name, ctx, &container, &table_handle));
+    return ctx->resource_manager()->Lookup(container, table_handle, table);
+  }
 }
 
 Status GetInitializableLookupTable(const string& input_name,
                                    OpKernelContext* ctx,
                                    InitializableLookupTable** table) {
-  string container;
-  string table_handle;
-  TF_RETURN_IF_ERROR(
-      GetTableHandle(input_name, ctx, &container, &table_handle));
   LookupInterface* lookup_table;
-  TF_RETURN_IF_ERROR(
-      ctx->resource_manager()->Lookup(container, table_handle, &lookup_table));
-  *table = lookup_table->GetInitializableLookupTable();
-  if (*table == nullptr) {
-    lookup_table->Unref();
-    return errors::InvalidArgument("Table ", container, " ", table_handle,
-                                   " is not initializable");
+  DataType handle_dtype;
+  TF_RETURN_IF_ERROR(ctx->input_dtype(input_name, &handle_dtype));
+  if (handle_dtype == DT_RESOURCE) {
+    ResourceHandle handle;
+    TF_RETURN_IF_ERROR(HandleFromInput(ctx, input_name, &handle));
+    TF_RETURN_IF_ERROR(LookupResource(ctx, handle, &lookup_table));
+    *table = lookup_table->GetInitializableLookupTable();
+    if (*table == nullptr) {
+      lookup_table->Unref();
+      return errors::InvalidArgument("Table ", handle.container(), " ",
+                                     handle.name(), " is not initializable");
+    }
+  } else {
+    string container;
+    string table_handle;
+    TF_RETURN_IF_ERROR(
+        GetTableHandle(input_name, ctx, &container, &table_handle));
+    TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup(container, table_handle,
+                                                       &lookup_table));
+    *table = lookup_table->GetInitializableLookupTable();
+    if (*table == nullptr) {
+      lookup_table->Unref();
+      return errors::InvalidArgument("Table ", container, " ", table_handle,
+                                     " is not initializable");
+    }
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index 3435486c9539abee6b5403253a1504438d0daff5..c905ebc84a6e9251a5e30be19b086d3fae215cad 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -79,11 +79,11 @@ struct LaunchLRN<CPUDevice, T> {
     const int rows = static_cast<int>(in.dim_size(1));
     const int cols = static_cast<int>(in.dim_size(2));
     const int depth = static_cast<int>(in.dim_size(3));
-    const int nodes = cols * rows;
 
 #if defined(IS_MOBILE_PLATFORM)
     SingleThreadedLRN(in, batch, rows, cols, depth, output);
 #else
+    const int nodes = cols * rows;
     if (depth > kSingleThreadedLRNDepthCutoff &&
         (beta_ == T(0.5) || beta_ == T(1))) {
       SingleThreadedLRN(in, batch, rows, cols, depth, output);
diff --git a/tensorflow/core/kernels/map_dataset_op.cc b/tensorflow/core/kernels/map_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..08308d85570a5ddd4453aff4e054709b07e348a7
--- /dev/null
+++ b/tensorflow/core/kernels/map_dataset_op.cc
@@ -0,0 +1,151 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random.h"
+
+#include "tensorflow/core/kernels/captured_function.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class MapDatasetOp : public OpKernel {
+ public:
+  explicit MapDatasetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+    std::vector<Tensor> other_arguments;
+    other_arguments.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      other_arguments.push_back(t);
+    }
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
+                                                 std::move(other_arguments),
+                                                 &captured_func));
+
+    DatasetBase* dataset = new Dataset(input, std::move(captured_func),
+                                       output_types_, output_shapes_);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(const DatasetBase* input,
+            std::unique_ptr<CapturedFunction> captured_func,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : input_(input),
+          captured_func_(std::move(captured_func)),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "MapDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        // NOTE(mrry): This method is thread-safe as long as
+        // `input_impl_` and `f` are thread-safe. However, if multiple
+        // threads enter this method, outputs may be observed in a
+        // non-deterministic order.
+
+        std::vector<Tensor> args;
+        TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &args, end_of_sequence));
+        if (*end_of_sequence) {
+          return Status::OK();
+        }
+
+        FunctionLibraryRuntime::Options opts;
+        // Choose a step ID that is guaranteed not to clash with any
+        // Session-generated step ID. DirectSession only generates
+        // non-negative step IDs (contiguous, starting from 0), and
+        // MasterSession generates 56-bit random step IDs whose MSB is
+        // always 0, so a negative random step ID should suffice.
+        opts.step_id = -std::abs(static_cast<int64>(random::New64()));
+        opts.runner = ctx->runner();
+        // TODO(mrry): Avoid blocking a threadpool thread. We will need to
+        // stack-rip the iterators and use async kernels.
+        return dataset()->captured_func_->Run(opts, args, out_tensors);
+      }
+
+     private:
+      const std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  const NameAttrList* func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MapDataset").Device(DEVICE_CPU), MapDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f69298e952453f37da2be07af328acabc8e1d78a
--- /dev/null
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -0,0 +1,816 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <map>
+#include <numeric>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Partial Ordering Comparator for Tensor keys containing scalar int64's
+struct KeyTensorLess {
+  bool operator()(const Tensor & lhs, const Tensor & rhs) const {
+    return std::less<int64>{}(lhs.scalar<int64>()(),
+                              rhs.scalar<int64>()());
+  }
+};
+
+// Key Equality operator for Tensor keys containing scalar int64's
+struct KeyTensorEqual {
+  bool operator()(const Tensor & lhs, const Tensor & rhs) const {
+    return std::equal_to<int64>{}(lhs.scalar<int64>()(),
+                                  rhs.scalar<int64>()());
+  }
+};
+
+// Hash for Tensor keys containing scalar int64's
+struct KeyTensorHash {
+  std::size_t operator()(const Tensor & key) const {
+    return std::hash<int64>{}(key.scalar<int64>()());
+  }
+};
+
+
+// General Template Definition
+template <bool Ordered, typename Data>
+struct MapTraits {};
+
+// Partially specialise for ordered
+template <typename Data>
+struct MapTraits<true, Data>
+{
+  typedef Tensor KeyType;
+  typedef Data DataType;
+  typedef std::map<KeyType, Data, KeyTensorLess> MapType;
+};
+
+// Partially specialise for unordered
+template <typename Data>
+struct MapTraits<false, Data>
+{
+  typedef Tensor KeyType;
+  typedef Data DataType;
+  typedef std::unordered_map<KeyType, Data,
+                            KeyTensorHash, KeyTensorEqual> MapType;
+};
+
+// Wrapper around map/unordered_map
+template <bool Ordered>
+class StagingMap : public ResourceBase
+{
+public:
+  // Public typedefs
+  typedef std::vector<Tensor> Tuple;
+  typedef gtl::optional<Tensor> OptionalTensor;
+  typedef std::vector<OptionalTensor> IncompleteTuple;
+
+  typedef MapTraits<Ordered, Tuple> MapTraits_;
+  typedef typename MapTraits_::MapType MapType;
+  typedef typename MapTraits_::KeyType KeyType;
+
+  typedef MapTraits<false, IncompleteTuple> IncompleteTraits;
+  typedef typename IncompleteTraits::MapType IncompleteType;
+
+private:
+  // Private variables
+  DataTypeVector dtypes_ GUARDED_BY(mu_);
+  std::size_t capacity_ GUARDED_BY(mu_);
+  std::size_t memory_limit_ GUARDED_BY(mu_);
+  std::size_t current_bytes_ GUARDED_BY(mu_);
+  mutex mu_;
+  condition_variable not_empty_;
+  condition_variable full_;
+  IncompleteType incomplete_ GUARDED_BY(mu_);
+  MapType map_ GUARDED_BY(mu_);
+
+private:
+  // private methods
+
+  // If map is configured for bounded capacity, notify
+  // waiting inserters that space is now available
+  void notify_inserters_if_bounded(mutex_lock & l)
+  {
+    if(has_capacity() || has_memory_limit())
+    {
+      l.unlock();
+      full_.notify_one();
+    }
+  }
+
+  // Notify any removers waiting to extract values
+  // that data is now available
+  void notify_removers(mutex_lock & l)
+  {
+      l.unlock();
+      not_empty_.notify_one();
+  }
+
+  inline bool has_capacity()
+    { return capacity_ > 0; }
+
+  inline bool has_memory_limit()
+    { return memory_limit_ > 0; }
+
+  inline bool would_exceed_memory_limit(std::size_t bytes)
+    { return bytes + current_bytes_ > memory_limit_; }
+
+  inline bool is_capacity_full()
+    { return map_.size() >= capacity_; }
+
+  // Get number of bytes in the tuple
+  inline std::size_t get_tuple_bytes(const Tuple & tuple)
+  {
+    return std::accumulate(tuple.begin(), tuple.end(), 0,
+      [](const std::size_t & lhs, const Tensor & rhs) {
+        return lhs + rhs.TotalBytes();
+    });
+  }
+
+  // Check that the index is within bounds
+  inline Status check_index(const Tensor & key, std::size_t index)
+  {
+    if(index >= dtypes_.size())
+    {
+      return Status(errors::InvalidArgument("Index '",
+        index, "' for key '", key.scalar<int64>()(),
+        "' was out of bounds '", dtypes_.size(), "'."));
+    }
+
+    return Status::OK();
+  }
+
+
+  // Check that the optional value at the specified index
+  // is uninitialized
+  inline Status check_index_uninitialized(const Tensor & key,
+                                  std::size_t index,
+                                  const IncompleteTuple & tuple)
+  {
+    if(tuple[index].has_value())
+    {
+      return Status(errors::InvalidArgument("The tensor for index '",
+        index, "' for key '", key.scalar<int64>()(),
+        "' was already initialized '", dtypes_.size(), "'."));
+    }
+
+    return Status::OK();
+  }
+
+  // Check that the indices are strictly ordered
+  inline Status check_index_ordering(const Tensor & indices)
+  {
+    auto findices = indices.flat<int>();
+
+    for(std::size_t i = 0; i < findices.dimension(0)-1; ++i)
+    {
+      if(findices(i) < findices(i+1))
+        { continue; }
+
+      return Status(errors::InvalidArgument("Indices are not "
+                                          "strictly ordered"));
+    }
+
+    return Status::OK();
+  }
+
+  // Check bytes are within memory limits memory limits
+  inline Status check_memory_limit(std::size_t bytes)
+  {
+    if(has_memory_limit() && bytes > memory_limit_) {
+      return Status(errors::ResourceExhausted("Attempted to insert "
+        "tensors with combined size of '", bytes, "' bytes into "
+        "Staging Area with a memory limit of '", memory_limit_, "'."));
+    }
+
+    return Status::OK();
+  }
+
+  // Insert incomplete data into the Barrier
+  Status put_incomplete(const KeyType & key,
+                        const Tensor & indices,
+                        Tuple *  tuple,
+                        mutex_lock &l)
+  {
+    auto findices = indices.flat<int>();
+
+    // Search for the key in our incomplete set
+    auto it = incomplete_.find(key);
+
+    // Check that the tuple fits within the memory limit
+    std::size_t tuple_bytes = get_tuple_bytes(*tuple);
+    TF_RETURN_IF_ERROR(check_memory_limit(tuple_bytes));
+
+    if(has_memory_limit())
+    {
+      full_.wait(l, [tuple_bytes, this]() {
+        // Stop waiting if we don't exceed the memory limit
+        return !would_exceed_memory_limit(tuple_bytes);
+      });
+    }
+
+    // This key isn't present in the incomplete set
+    // Create IncompleteTuple and insert
+    if(it == incomplete_.end())
+    {
+      IncompleteTuple empty(dtypes_.size());
+
+      // Initialize empty tuple with given dta
+      for(std::size_t i = 0; i < findices.dimension(0); ++i)
+      {
+        std::size_t index = findices(i);
+        TF_RETURN_IF_ERROR(check_index(key, index));
+
+        // Assign tuple at this index
+        empty[index] = std::move((*tuple)[i]);
+      }
+
+      // Insert into incomplete map
+      incomplete_.insert({key, std::move(empty)});
+
+      // Increment size
+      current_bytes_ += tuple_bytes;
+    }
+    // Found an entry in the incomplete index
+    // Update with given data and insert complete entries
+    // into the main map
+    else
+    {
+      // Reference existing incomplete tuple
+      IncompleteTuple & present = it->second;
+
+      // Assign given data
+      for(std::size_t i = 0; i < findices.dimension(0); ++i)
+      {
+        std::size_t index = findices(i);
+        TF_RETURN_IF_ERROR(check_index(key, index));
+        TF_RETURN_IF_ERROR(check_index_uninitialized(key,
+                                                    index, present));
+
+        // Assign tuple at this index
+        present[index] = std::move((*tuple)[i]);
+      }
+
+      // Increment size
+      current_bytes_ += tuple_bytes;
+
+      // Do we have values at all tuple elements?
+      bool complete = std::all_of(present.begin(), present.end(),
+        [](const OptionalTensor & v) { return v.has_value(); });
+
+      // If so, put the tuple in the actual map
+      if(complete)
+      {
+        // Create a tuple for insertion
+        Tuple new_tuple;
+
+        for(const auto & v: present)
+          { new_tuple.push_back(v.value()); }
+
+        // Remove from incomplete
+        incomplete_.erase(it);
+
+        TF_RETURN_IF_ERROR(put_complete(key, &new_tuple, l));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Does the insertion into the actual staging area
+  Status put_complete(const KeyType & key, Tuple * tuple,
+                    mutex_lock & l)
+  {
+    // Insert key and tuples into the map
+    map_.insert({key, std::move(*tuple)});
+
+    notify_removers(l);
+
+    return Status::OK();
+  }
+
+public:
+  // public methods
+  explicit StagingMap(const DataTypeVector & dtypes,
+          std::size_t capacity, std::size_t memory_limit) :
+      dtypes_(dtypes),
+      capacity_(capacity),
+      memory_limit_(memory_limit),
+      current_bytes_(0) {}
+
+  Status put(KeyType* key, const Tensor * indices,
+              Tuple* tuple)
+  {
+    mutex_lock l(mu_);
+
+    // Sanity check the indices
+    TF_RETURN_IF_ERROR(check_index_ordering(*indices));
+
+    // Handle incomplete inserts
+    if(indices->NumElements() != dtypes_.size())
+    {
+      return put_incomplete(*key, *indices, tuple, l);
+    }
+
+    std::size_t tuple_bytes = get_tuple_bytes(*tuple);
+    // Check that tuple_bytes fits within the memory limit
+    TF_RETURN_IF_ERROR(check_memory_limit(tuple_bytes));
+
+    // If map capacity is bounded wait until map is not full
+    if(has_capacity() || has_memory_limit()) {
+      full_.wait(l, [tuple_bytes, this]() {
+        // If there's a memory limit, check if there's space for insertion
+        bool memory_limit_valid = has_memory_limit() ?
+              !would_exceed_memory_limit(tuple_bytes) : true;
+        // If we're configured for capacity check if there's space for insertion
+        bool capacity_valid = has_capacity() ? !is_capacity_full() : true;
+
+        // Stop waiting upon success for both conditions
+        return memory_limit_valid && capacity_valid;
+      });
+    }
+
+    // Do the put operation
+    TF_RETURN_IF_ERROR(put_complete(*key, tuple, l));
+
+    // Update the current size
+    current_bytes_ += tuple_bytes;
+
+    return Status::OK();
+  }
+
+  Status get(const KeyType* key, Tuple* tuple)
+  {
+    mutex_lock l(mu_);
+
+    typename MapType::const_iterator it;
+
+    // Wait until the element with the requested key is present
+    not_empty_.wait(l, [&, this]() {
+      it = map_.find(*key);
+      return it != map_.end();
+    });
+
+    // Copy tensors into the tuple
+    for(const auto & tensor : it->second)
+      { tuple->push_back(tensor); }
+
+    // Update bytes in the Staging Area
+    current_bytes_ -= get_tuple_bytes(*tuple);
+
+    return Status::OK();
+  }
+
+  Status pop(const KeyType* key, Tuple* tuple)
+  {
+    mutex_lock l(mu_);
+
+    typename MapType::iterator it;
+
+    // Wait until the element with the requested key is present
+    not_empty_.wait(l, [&, this]() {
+      it = map_.find(*key);
+      return it != this->map_.end();
+    });
+
+    // Move from the entry as its erased anyway
+    *tuple = std::move(it->second);
+
+    // Remove
+    map_.erase(it);
+
+    // Update bytes in the Staging Area
+    current_bytes_ -= get_tuple_bytes(*tuple);
+
+    notify_inserters_if_bounded(l);
+
+    return Status::OK();
+  }
+
+  Status popitem(KeyType* key, Tuple* tuple)
+  {
+    mutex_lock l(mu_);
+
+    // Wait until map is not empty
+    not_empty_.wait(l, [this]() { return !this->map_.empty(); });
+
+    // Move from the first element and erase it
+    *tuple = std::move(map_.begin()->second);
+    *key = map_.begin()->first;
+    map_.erase(map_.begin());
+
+    // Update bytes in the Staging Area
+    current_bytes_ -= get_tuple_bytes(*tuple);
+
+    notify_inserters_if_bounded(l);
+
+    return Status::OK();
+  }
+
+  Status clear()
+  {
+    mutex_lock l(mu_);
+    map_.clear();
+    incomplete_.clear();
+    current_bytes_ = 0;
+
+    notify_inserters_if_bounded(l);
+
+    return Status::OK();
+  }
+
+  size_t incomplete_size()
+  {
+    mutex_lock l(mu_);
+    return incomplete_.size();
+  }
+
+  size_t size()
+  {
+    // Lock the map and return the size
+    mutex_lock l(mu_);
+    return map_.size();
+  }
+
+  string DebugString()
+  {
+    return "StagingMap";
+  }
+};
+
+template <bool Ordered>
+Status GetStagingMap(OpKernelContext* ctx,
+                    const NodeDef& ndef,
+                    StagingMap<Ordered>** map)
+{
+  auto rm = ctx->resource_manager();
+  ContainerInfo cinfo;
+
+  // Lambda for creating the Staging Area
+  auto create_fn = [&ndef](StagingMap<Ordered>** ret) -> Status
+  {
+    DataTypeVector dtypes;
+    int64 capacity;
+    int64 memory_limit;
+    TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "dtypes", &dtypes));
+    TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity));
+    TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "memory_limit", &memory_limit));
+    *ret = new StagingMap<Ordered>(dtypes, capacity, memory_limit);
+    return Status::OK();
+  };
+
+  TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */));
+  TF_RETURN_IF_ERROR(rm->LookupOrCreate<StagingMap<Ordered>>(
+                        cinfo.container(), cinfo.name(),
+                        map, create_fn));
+  return Status::OK();
+}
+
+template <bool Ordered>
+class MapStageOp : public OpKernel
+{
+ public:
+  explicit MapStageOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    StagingMap<Ordered>* map = nullptr;
+    OP_REQUIRES_OK(ctx, GetStagingMap(ctx, def(), &map));
+    core::ScopedUnref scope(map);
+    typename StagingMap<Ordered>::Tuple tuple;
+
+    const Tensor * key_tensor;
+    const Tensor * indices_tensor;
+    OpInputList values_tensor;
+
+    OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
+    OP_REQUIRES_OK(ctx, ctx->input("indices", &indices_tensor));
+    OP_REQUIRES_OK(ctx, ctx->input_list("values", &values_tensor));
+
+    // Create copy for insertion into Staging Area
+    Tensor key(*key_tensor);
+
+    // Create the tuple to store
+    for (std::size_t i = 0; i < values_tensor.size(); ++i) {
+      tuple.push_back(values_tensor[i]);
+    }
+
+    // Store the tuple in the map
+    OP_REQUIRES_OK(ctx, map->put(&key, indices_tensor, &tuple));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MapStage").Device(DEVICE_CPU),
+                      MapStageOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapStage").Device(DEVICE_CPU),
+                      MapStageOp<true>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MapStage")
+                      .HostMemory("key")
+                      .HostMemory("indices")
+                      .Device(DEVICE_GPU), MapStageOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapStage")
+                      .HostMemory("key")
+                      .HostMemory("indices")
+                      .Device(DEVICE_GPU), MapStageOp<true>);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("MapStage").HostMemory("key")
+                      .Device(DEVICE_SYCL), MapStageOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapStage").HostMemory("key")
+                      .Device(DEVICE_SYCL), MapStageOp<true>);
+
+#endif // TENSORFLOW_USE_SYCL
+
+template <bool Ordered>
+class MapUnstageOp : public OpKernel
+{
+ public:
+  explicit MapUnstageOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Using this op in such a way that it blocks forever
+  // is an error.  As such cancellation is not handled.
+  void Compute(OpKernelContext* ctx) override {
+    StagingMap<Ordered>* map = nullptr;
+    OP_REQUIRES_OK(ctx, GetStagingMap(ctx, def(), &map));
+    core::ScopedUnref scope(map);
+    typename StagingMap<Ordered>::Tuple tuple;
+
+    const Tensor * key_tensor;
+    OpInputList values_tensor;
+
+    OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
+    OP_REQUIRES_OK(ctx, map->pop(key_tensor, &tuple));
+
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
+        errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
+                                " vs. ", ctx->num_outputs()));
+    for (size_t i = 0; i < tuple.size(); ++i) {
+      ctx->set_output(i, tuple[i]);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MapUnstage").Device(DEVICE_CPU),
+                            MapUnstageOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstage").Device(DEVICE_CPU),
+                            MapUnstageOp<true>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MapUnstage").HostMemory("key")
+                            .Device(DEVICE_GPU), MapUnstageOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstage").HostMemory("key")
+                            .Device(DEVICE_GPU), MapUnstageOp<true>);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("MapUnstage").HostMemory("key")
+                            .Device(DEVICE_SYCL), MapUnstageOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstage").HostMemory("key")
+                            .Device(DEVICE_SYCL), MapUnstageOp<true>);
+#endif // TENSORFLOW_USE_SYCL
+
+template <bool Ordered>
+class MapPeekOp : public OpKernel
+{
+ public:
+  explicit MapPeekOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Using this op in such a way that it blocks forever
+  // is an error.  As such cancellation is not handled.
+  void Compute(OpKernelContext* ctx) override {
+    StagingMap<Ordered>* map = nullptr;
+    OP_REQUIRES_OK(ctx, GetStagingMap(ctx, def(), &map));
+    core::ScopedUnref scope(map);
+    typename StagingMap<Ordered>::Tuple tuple;
+
+    const Tensor * key_tensor;
+    OpInputList values_tensor;
+
+    OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
+    OP_REQUIRES_OK(ctx, map->get(key_tensor, &tuple));
+
+    OP_REQUIRES(
+        ctx, tuple.size() == (size_t)ctx->num_outputs(),
+        errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
+                                " vs. ", ctx->num_outputs()));
+    for (size_t i = 0; i < tuple.size(); ++i) {
+      ctx->set_output(i, tuple[i]);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MapPeek").Device(DEVICE_CPU),
+                      MapPeekOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapPeek").Device(DEVICE_CPU),
+                      MapPeekOp<true>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MapPeek").HostMemory("key")
+                      .Device(DEVICE_GPU), MapPeekOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapPeek").HostMemory("key")
+                      .Device(DEVICE_GPU), MapPeekOp<true>);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("MapPeek").HostMemory("key")
+                      .Device(DEVICE_SYCL), MapPeekOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapPeek").HostMemory("key")
+                      .Device(DEVICE_SYCL), MapPeekOp<true>);
+#endif // TENSORFLOW_USE_SYCL
+
+
+
+template <bool Ordered>
+class MapUnstageNoKeyOp : public OpKernel
+{
+ public:
+  explicit MapUnstageNoKeyOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Using this op in such a way that it blocks forever
+  // is an error.  As such cancellation is not handled.
+  void Compute(OpKernelContext* ctx) override {
+    StagingMap<Ordered>* map = nullptr;
+    OP_REQUIRES_OK(ctx, GetStagingMap(ctx, def(), &map));
+    core::ScopedUnref scope(map);
+
+    // Pop a random (key, value) off the map
+    typename StagingMap<Ordered>::KeyType key;
+    typename StagingMap<Ordered>::Tuple tuple;
+
+    OP_REQUIRES_OK(ctx, map->popitem(&key, &tuple));
+
+    // Allocate a key tensor and assign the key as the first output
+    ctx->set_output(0, key);
+
+    // Set the rest of the outputs to the tuple Tensors
+    OP_REQUIRES(ctx,
+      tuple.size() == (size_t)ctx->num_outputs()-1,
+      errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
+                              " vs. ", ctx->num_outputs()-1));
+    for (size_t i = 0; i < tuple.size(); ++i)
+    {
+      ctx->set_output(i+1, tuple[i]);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MapUnstageNoKey").Device(DEVICE_CPU),
+                      MapUnstageNoKeyOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstageNoKey").Device(DEVICE_CPU),
+                      MapUnstageNoKeyOp<true>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MapUnstageNoKey").HostMemory("key")
+                      .Device(DEVICE_GPU), MapUnstageNoKeyOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstageNoKey").HostMemory("key")
+                      .Device(DEVICE_GPU), MapUnstageNoKeyOp<true>);
+
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("MapUnstageNoKey").HostMemory("key")
+                      .Device(DEVICE_SYCL), MapUnstageNoKeyOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstageNoKey").HostMemory("key")
+                      .Device(DEVICE_SYCL), MapUnstageNoKeyOp<true>);
+#endif // TENSORFLOW_USE_SYCL
+
+
+template <bool Ordered>
+class MapSizeOp : public OpKernel
+{
+ public:
+  explicit MapSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override
+  {
+    StagingMap<Ordered>* map = nullptr;
+    OP_REQUIRES_OK(ctx, GetStagingMap(ctx, def(), &map));
+    core::ScopedUnref scope(map);
+
+    // Allocate size output tensor
+    Tensor * size = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}),
+                                                     &size));
+
+    // Set it to the actual size
+    size->scalar<int32>().setConstant(map->size());
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_CPU),
+                        MapSizeOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapSize").Device(DEVICE_CPU),
+                        MapSizeOp<true>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_GPU)
+                        .HostMemory("size"), MapSizeOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapSize").Device(DEVICE_GPU)
+                        .HostMemory("size"), MapSizeOp<true>);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_SYCL)
+                        .HostMemory("size"), MapSizeOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapSize").Device(DEVICE_SYCL)
+                        .HostMemory("size"), MapSizeOp<true>);
+#endif // TENSORFLOW_USE_SYCL
+
+template <bool Ordered>
+class MapIncompleteSizeOp : public OpKernel
+{
+ public:
+  explicit MapIncompleteSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override
+  {
+    StagingMap<Ordered>* map = nullptr;
+    OP_REQUIRES_OK(ctx, GetStagingMap(ctx, def(), &map));
+    core::ScopedUnref scope(map);
+
+    // Allocate size output tensor
+    Tensor * size = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}),
+                                                     &size));
+
+    // Set it to the actual size
+    size->scalar<int32>().setConstant(map->incomplete_size());
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MapIncompleteSize").Device(DEVICE_CPU),
+                        MapIncompleteSizeOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapIncompleteSize").Device(DEVICE_CPU),
+                        MapIncompleteSizeOp<true>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MapIncompleteSize").Device(DEVICE_GPU)
+                        .HostMemory("size"), MapIncompleteSizeOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapIncompleteSize").Device(DEVICE_GPU)
+                        .HostMemory("size"), MapIncompleteSizeOp<true>);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("MapIncompleteSize").Device(DEVICE_SYCL)
+                        .HostMemory("size"), MapIncompleteSizeOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapIncompleteSize").Device(DEVICE_SYCL)
+                        .HostMemory("size"), MapIncompleteSizeOp<true>);
+#endif // TENSORFLOW_USE_SYCL
+
+template <bool Ordered>
+class MapClearOp : public OpKernel
+{
+ public:
+  explicit MapClearOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override
+  {
+    StagingMap<Ordered>* map = nullptr;
+    OP_REQUIRES_OK(ctx, GetStagingMap(ctx, def(), &map));
+    core::ScopedUnref scope(map);
+
+    OP_REQUIRES_OK(ctx, map->clear());
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MapClear").Device(DEVICE_CPU),
+                        MapClearOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapClear").Device(DEVICE_CPU),
+                        MapClearOp<true>);
+
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("MapClear").Device(DEVICE_GPU),
+                        MapClearOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapClear").Device(DEVICE_GPU),
+                        MapClearOp<true>);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("MapClear").Device(DEVICE_SYCL),
+                        MapClearOp<false>);
+REGISTER_KERNEL_BUILDER(Name("OrderedMapClear").Device(DEVICE_SYCL),
+                        MapClearOp<true>);
+#endif // TENSORFLOW_USE_SYCL
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_inverse_op.cc b/tensorflow/core/kernels/matrix_inverse_op.cc
index 0572d48b3ecb01360e0ca5ab71d923ad3c7a53ee..8343201d9a1254b6025cd95d5411ea796afa83b3 100644
--- a/tensorflow/core/kernels/matrix_inverse_op.cc
+++ b/tensorflow/core/kernels/matrix_inverse_op.cc
@@ -15,6 +15,10 @@ limitations under the License.
 
 // See docs in ../ops/linalg_ops.cc.
 
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/Eigen/LU"
 #include "tensorflow/core/framework/kernel_def_builder.h"
@@ -26,22 +30,22 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
+#if GOOGLE_CUDA
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cuda_solvers.h"
+#endif
+
 namespace tensorflow {
 
 template <class Scalar>
 class MatrixInverseOp : public LinearAlgebraOp<Scalar> {
  public:
-  typedef LinearAlgebraOp<Scalar> Base;
+  INHERIT_LINALG_TYPEDEFS(Scalar);
 
   explicit MatrixInverseOp(OpKernelConstruction* context) : Base(context) {
     OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_));
   }
 
-  using Matrix = typename Base::Matrix;
-  using MatrixMaps = typename Base::MatrixMaps;
-  using ConstMatrixMap = typename Base::ConstMatrixMap;
-  using ConstMatrixMaps = typename Base::ConstMatrixMaps;
-
   void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& input = inputs[0];
@@ -77,6 +81,142 @@ class MatrixInverseOp : public LinearAlgebraOp<Scalar> {
   TF_DISALLOW_COPY_AND_ASSIGN(MatrixInverseOp);
 };
 
+#if GOOGLE_CUDA
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <class Scalar>
+class MatrixInverseOpGpu : public AsyncOpKernel {
+ public:
+  explicit MatrixInverseOpGpu(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_));
+  }
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+    const Tensor& input = context->input(0);
+    const int ndims = input.dims();
+    const int64 n = input.dim_size(ndims - 1);
+    // Validate inputs.
+    OP_REQUIRES_ASYNC(
+        context, ndims >= 2,
+        errors::InvalidArgument("Input must have rank >= 2, got ", ndims),
+        done);
+    OP_REQUIRES_ASYNC(
+        context, input.dim_size(ndims - 2) == n,
+        errors::InvalidArgument("Input matrices must be squares, got",
+                                input.dim_size(ndims - 2), " != ", n),
+        done);
+
+    // Allocate output.
+    Tensor* out;
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->forward_input_or_allocate_output({0}, 0, input.shape(), &out),
+        done);
+
+    // By definition, an empty matrix's inverse is an empty matrix.
+    if (input.NumElements() == 0) {
+      done();
+      return;
+    }
+
+    // Make a copy of the (possible adjointed) input that we will use for the
+    // factorization step.
+    Tensor input_copy;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->allocate_temp(DataTypeToEnum<Scalar>::value,
+                                                input.shape(), &input_copy),
+                         done);
+    const GPUDevice& d = context->eigen_device<GPUDevice>();
+    auto input_copy_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
+    auto input_reshaped = input.template flat_inner_dims<Scalar, 3>();
+    if (!adjoint_) {
+      d.memcpy(input_copy_reshaped.data(), input_reshaped.data(),
+               input.NumElements() * sizeof(Scalar));
+    } else {
+      functor::AdjointBatchFunctor<GPUDevice, Scalar> functor;
+      functor(d, input_reshaped, input_copy_reshaped);
+    }
+    const int64 batch_size = input_copy_reshaped.dimension(0);
+
+    // Allocate pivots on the device.
+    ScratchSpace<int> pivots(context, n * batch_size, /* on_host */ false);
+
+    // Prepare pointer arrays for cuBlas' batch interface.
+    // TODO(rmlarsen): Find a way to encode pointer arrays in pinned host memory
+    // without the ugly casting.
+    ScratchSpace<uint8> input_copy_ptrs(context, sizeof(Scalar*) * batch_size,
+                                        /* on_host */ true);
+    ScratchSpace<uint8> output_ptrs(context, sizeof(Scalar*) * batch_size,
+                                    /* on_host */ true);
+    const Scalar** input_copy_ptrs_base =
+        reinterpret_cast<const Scalar**>(input_copy_ptrs.mutable_data());
+    const Scalar** output_ptrs_base =
+        reinterpret_cast<const Scalar**>(output_ptrs.mutable_data());
+    auto output_reshaped = out->template flat_inner_dims<Scalar, 3>();
+    for (int64 i = 0; i < batch_size; ++i) {
+      input_copy_ptrs_base[i] = input_copy_reshaped.data() + i * n * n;
+      output_ptrs_base[i] = output_reshaped.data() + i * n * n;
+    }
+
+    // Launch the two solver kernels back to back without waiting.
+    // 1. Compute the partially pivoted LU factorization(s) of the
+    // matrix/matrices.
+    CudaSolver solver(context);
+    std::vector<DeviceLapackInfo> dev_info;
+    dev_info.emplace_back(context, batch_size, "getrf");
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        solver.GetrfBatched(n, input_copy_ptrs_base, n, pivots.mutable_data(),
+                            &dev_info.back(), batch_size),
+        done);
+    // 2. Compute the inverse(s).
+    dev_info.emplace_back(context, batch_size, "getri");
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        solver.GetriBatched(n, input_copy_ptrs_base, n, pivots.data(),
+                            output_ptrs_base, n, &dev_info.back(), batch_size),
+        done);
+
+    // Register callback to check info after kernels finish. Also capture the
+    // temporary Tensors/ScratchSpace so they don't get deallocated before the
+    // kernels run. TODO(rmlarsen): Use move capture once C++14 becomes
+    // available.
+    auto info_checker = [context, dev_info, input_copy, pivots, input_copy_ptrs,
+                         output_ptrs,
+                         done](const Status& status,
+                               const std::vector<HostLapackInfo>& host_infos) {
+      if (!status.ok() && errors::IsInvalidArgument(status) &&
+          !host_infos.empty()) {
+        for (int i = 0; i < host_infos[0].size(); ++i) {
+          // Match the CPU error message for singular matrices. Otherwise
+          // just print the original error message from the call itself
+          // below.
+          OP_REQUIRES_ASYNC(context, host_infos[0].data()[i] <= 0,
+                            errors::InvalidArgument("Input is not invertible."),
+                            done);
+        }
+      }
+      OP_REQUIRES_OK_ASYNC(context, status, done);
+      done();
+    };
+
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        solver.CopyLapackInfoToHostAsync(dev_info, std::move(info_checker)),
+        done);
+  }
+
+ private:
+  bool adjoint_;
+};
+
+REGISTER_LINALG_OP_GPU("MatrixInverse", (MatrixInverseOpGpu<float>), float);
+REGISTER_LINALG_OP_GPU("MatrixInverse", (MatrixInverseOpGpu<double>), double);
+
+#endif  // GOOGLE_CUDA
+
 REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<float>), float);
 REGISTER_LINALG_OP("MatrixInverse", (MatrixInverseOp<double>), double);
 REGISTER_LINALG_OP("BatchMatrixInverse", (MatrixInverseOp<float>), float);
diff --git a/tensorflow/core/kernels/matrix_triangular_solve_op.cc b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
index 32aa7a8008e01004e83d5f9b89f79c6a29f9e67f..80176721191370be8b9553a24f9e1bb825235aa8 100644
--- a/tensorflow/core/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_triangular_solve_op.cc
@@ -124,12 +124,25 @@ class MatrixTriangularSolveOp : public LinearAlgebraOp<Scalar> {
   TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOp);
 };
 
+REGISTER_LINALG_OP_CPU("MatrixTriangularSolve",
+                       (MatrixTriangularSolveOp<float>), float);
+REGISTER_LINALG_OP_CPU("MatrixTriangularSolve",
+                       (MatrixTriangularSolveOp<double>), double);
+REGISTER_LINALG_OP_CPU("BatchMatrixTriangularSolve",
+                       (MatrixTriangularSolveOp<float>), float);
+REGISTER_LINALG_OP_CPU("BatchMatrixTriangularSolve",
+                       (MatrixTriangularSolveOp<double>), double);
 
 #ifdef GOOGLE_CUDA
+
+// TODO(rmlarsen): Re-factor to
+// 1. Enable buffer forwarding from rhs->out.
+// 2. Save Memcpy when buffer forwarding is used.
+// 3. Copy entire rhs in a single Memcpy when forwarding is not used.
 template <class Scalar>
 class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
  public:
-  typedef LinearAlgebraOp<Scalar> Base;
+  INHERIT_LINALG_TYPEDEFS(Scalar);
 
   explicit MatrixTriangularSolveOpGPU(OpKernelConstruction* context)
       : Base(context), lower_(true), adjoint_(false) {
@@ -137,13 +150,6 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
     OP_REQUIRES_OK(context, context->GetAttr("adjoint", &adjoint_));
   }
 
-  using TensorShapes = typename Base::TensorShapes;
-  using Matrix = typename Base::Matrix;
-  using MatrixMap = typename Base::MatrixMap;
-  using MatrixMaps = typename Base::MatrixMaps;
-  using ConstMatrixMap = typename Base::ConstMatrixMap;
-  using ConstMatrixMaps = typename Base::ConstMatrixMaps;
-
   virtual void ValidateInputMatrixShapes(
       OpKernelContext* context,
       const TensorShapes& input_matrix_shapes) const final {
@@ -166,6 +172,8 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
                                                   : static_cast<int64>(cost);
   }
 
+  bool EnableInputForwarding() const final { return false; }
+
   void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
                      MatrixMaps* outputs) final {
     const ConstMatrixMap& matrix = inputs[0];
@@ -186,7 +194,7 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
     uint64 rhs_elems = rhs.rows() * rhs.cols();
     bool copy_status =
         stream->ThenMemcpyD2D(&out_ptr, rhs_ptr, sizeof(Scalar) * rhs_elems)
-        .ok();
+            .ok();
     if (!copy_status) {
       context->SetStatus(
           errors::Internal("Failed to copy rhs into output before solve"));
@@ -236,41 +244,16 @@ class MatrixTriangularSolveOpGPU : public LinearAlgebraOp<Scalar> {
 
   TF_DISALLOW_COPY_AND_ASSIGN(MatrixTriangularSolveOpGPU);
 };
-#endif  // GOOGLE_CUDA
 
-REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<float>),
-                   float);
-REGISTER_LINALG_OP("MatrixTriangularSolve", (MatrixTriangularSolveOp<double>),
-                   double);
-REGISTER_LINALG_OP("BatchMatrixTriangularSolve",
-                   (MatrixTriangularSolveOp<float>), float);
-REGISTER_LINALG_OP("BatchMatrixTriangularSolve",
-                   (MatrixTriangularSolveOp<double>), double);
+REGISTER_LINALG_OP_GPU("MatrixTriangularSolve",
+                       (MatrixTriangularSolveOpGPU<float>), float);
+REGISTER_LINALG_OP_GPU("MatrixTriangularSolve",
+                       (MatrixTriangularSolveOpGPU<double>), double);
+REGISTER_LINALG_OP_GPU("BatchMatrixTriangularSolve",
+                       (MatrixTriangularSolveOpGPU<float>), float);
+REGISTER_LINALG_OP_GPU("BatchMatrixTriangularSolve",
+                       (MatrixTriangularSolveOpGPU<double>), double);
 
-#ifdef GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(
-    Name("MatrixTriangularSolve")
-        .Device(DEVICE_GPU)
-        .TypeConstraint<float>("T"),
-    MatrixTriangularSolveOpGPU<float>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("MatrixTriangularSolve")
-        .Device(DEVICE_GPU)
-        .TypeConstraint<double>("T"),
-    MatrixTriangularSolveOpGPU<double>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("BatchMatrixTriangularSolve")
-        .Device(DEVICE_GPU)
-        .TypeConstraint<float>("T"),
-    MatrixTriangularSolveOpGPU<float>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("BatchMatrixTriangularSolve")
-        .Device(DEVICE_GPU)
-        .TypeConstraint<double>("T"),
-    MatrixTriangularSolveOpGPU<double>);
-#endif  //GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 3a0f19ffb0c10d93899d85b6b1e9b00adc9406d0..6cb56797bff24d6d31827edfdb5ead0d371e2af1 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -295,8 +295,8 @@ static void MaxPoolingBackwardCustomKernel(
       params.tensor_in_rows, params.tensor_in_cols, params.depth,
       params.out_height, params.out_width, params.window_rows,
       params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
-      params.pad_cols, out_backprop.flat<T>().data(),
-      output->flat<T>().data(), context->eigen_device<Eigen::GpuDevice>());
+      params.pad_cols, out_backprop.flat<T>().data(), output->flat<T>().data(),
+      context->eigen_device<Eigen::GpuDevice>());
 }
 
 template <class T>
@@ -474,8 +474,7 @@ class MaxPoolingGradGradOp : public OpKernel {
     //    tensor_out_as_matrix with the corresponding values in
     //    top_diff_as_matrix.
     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
-        int64 start, int64 limit) {
-
+                     int64 start, int64 limit) {
       const int32 depth = params.depth;
       const int32 in_rows = params.tensor_in_rows;
       const int32 in_cols = params.tensor_in_cols;
@@ -588,8 +587,8 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
         errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
 
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {2}, 0, tensor_out.shape(), &output));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, tensor_out.shape(), &output));
 
     PoolParameters params{context,  ksize_,       stride_,
                           padding_, data_format_, tensor_in.shape()};
@@ -1010,38 +1009,34 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
 // default Eigen implementation so we are using the custom kernel as the
 // default. However, you can explicitly invoke the eigen version using
 // kernel_label_map.
-#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                        \
-  REGISTER_KERNEL_BUILDER(                                       \
-      Name("MaxPool")                                            \
-          .Device(DEVICE_GPU)                                    \
-          .TypeConstraint<T>("T")                                \
-          .Label("eigen_tensor"),                                \
-      MaxPoolingOp<GPUDevice, T>);                               \
-  REGISTER_KERNEL_BUILDER(                                       \
-      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
-      MaxPoolingNoMaskOp<GPUDevice, T>);                         \
-  REGISTER_KERNEL_BUILDER(                                       \
-      Name("MaxPoolWithArgmax")                                  \
-          .Device(DEVICE_GPU)                                    \
-          .TypeConstraint<int64>("Targmax")                      \
-          .TypeConstraint<T>("T"),                               \
-      MaxPoolingWithArgmaxOp<GPUDevice, T>);                     \
-  REGISTER_KERNEL_BUILDER(                                       \
-      Name("MaxPoolGradWithArgmax")                              \
-          .Device(DEVICE_GPU)                                    \
-          .TypeConstraint<T>("T")                                \
-          .TypeConstraint<int64>("Targmax"),                     \
-      MaxPoolingGradWithArgmaxOp<GPUDevice, T>);                 \
-  REGISTER_KERNEL_BUILDER(                                       \
-      Name("MaxPoolGradGradWithArgmax")                          \
-          .Device(DEVICE_GPU)                                    \
-          .TypeConstraint<T>("T")                                \
-          .TypeConstraint<int64>("Targmax"),                     \
-      MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
+#define REGISTER_GPU_ONLY_POOL_KERNELS(T)                            \
+  REGISTER_KERNEL_BUILDER(Name("MaxPool")                            \
+                              .Device(DEVICE_GPU)                    \
+                              .TypeConstraint<T>("T")                \
+                              .Label("eigen_tensor"),                \
+                          MaxPoolingOp<GPUDevice, T>);               \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"),     \
+      MaxPoolingNoMaskOp<GPUDevice, T>);                             \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
+                              .Device(DEVICE_GPU)                    \
+                              .TypeConstraint<int64>("Targmax")      \
+                              .TypeConstraint<T>("T"),               \
+                          MaxPoolingWithArgmaxOp<GPUDevice, T>);     \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")              \
+                              .Device(DEVICE_GPU)                    \
+                              .TypeConstraint<T>("T")                \
+                              .TypeConstraint<int64>("Targmax"),     \
+                          MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
+                              .Device(DEVICE_GPU)                    \
+                              .TypeConstraint<T>("T")                \
+                              .TypeConstraint<int64>("Targmax"),     \
+                          MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS);
 #undef REGISTER_GPU_ONLY_POOL_KERNELS
 
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #undef REGISTER_MAX_POOL_KERNELS
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 5462c6401daef65aede640b7dde5f6d170a79fd6..e3a57d2f28ac1e882f79a2bd7eec3a7a27fccaae 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -70,7 +70,7 @@ __global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
     int wend = min(wstart + kernel_w, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    dtype maxval = -FLT_MAX;
+    dtype maxval = Eigen::NumTraits<dtype>::lowest();
     int maxidx = -1;
     const dtype* bottom_data_n = bottom_data + n * channels * height * width;
     for (int h = hstart; h < hend; ++h) {
@@ -312,9 +312,6 @@ __global__ void MaxPoolGradBackwardNoMaskNHWC(
 //     bottom_offset: the pre-computed per-image offset of the maxpool output.
 //         This is equal to Hout*Wout*C.
 //     bottom_diff: the gradient of the gradient w.r.t. output.
-// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
-// the kernel is run, you will need to make sure that bottom_diff is filled with
-// zero first.
 template <typename dtype>
 __global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
                                     const int64* mask, const int top_offset,
@@ -333,11 +330,11 @@ namespace functor {
 
 template <typename T>
 bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
-    const T* bottom_data, const int batch, const int height,
-    const int width, const int channels, const int pooled_height,
-    const int pooled_width, const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
-    T* top_data, int64* mask, const Eigen::GpuDevice& d) {
+    const T* bottom_data, const int batch, const int height, const int width,
+    const int channels, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, const int pad_t, const int pad_l, T* top_data,
+    int64* mask, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
 
@@ -351,21 +348,18 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
 
 template <typename T>
 bool MaxPoolBackwardNoMask<T>::operator()(
-    const T* bottom_data, const int batch,
-    const int height, const int width,
-    const int channels, const int pooled_height,
-    const int pooled_width, const int kernel_h,
-    const int kernel_w, const int stride_h,
-    const int stride_w, const int pad_t, const int pad_l,
-    const T* top_diff, T* bottom_diff,
-    const Eigen::GpuDevice& d) {
+    const T* bottom_data, const int batch, const int height, const int width,
+    const int channels, const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, const int pad_t, const int pad_l, const T* top_diff,
+    T* bottom_diff, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
-  const int bottom_size = batch * channels * height * width;
-  const int top_size = batch * channels * pooled_height * pooled_width;
 
+  const int bottom_size = batch * channels * height * width;
   SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
             kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
 
+  const int top_size = batch * channels * pooled_height * pooled_width;
   MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
                                   kThreadsPerBlock,
                               kThreadsPerBlock, 0, d.stream()>>>(
@@ -377,9 +371,8 @@ bool MaxPoolBackwardNoMask<T>::operator()(
 
 template <typename T>
 bool MaxPoolBackwardWithArgmax<T>::operator()(
-    const int output_size, const int input_size,
-    const T* top_diff, const int64* mask,
-    const int top_offset, const int bottom_offset,
+    const int output_size, const int input_size, const T* top_diff,
+    const int64* mask, const int top_offset, const int bottom_offset,
     T* bottom_diff, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index 99e2b73d0c915ecda763cdc63cb374ae47632648..d2029f5719ae84f600188781e91068e9fec71c76 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -36,38 +36,36 @@ template <typename T>
 struct MaxPoolForwardWithOptionalArgmax {
   bool operator()(const T* bottom_data, const int batch, const int height,
                   const int width, const int channels, const int pooled_height,
-                  const int pooled_width, const int kernel_h, const int kernel_w,
-                  const int stride_h, const int stride_w, const int pad_t, const int pad_l,
-                  T* top_data, int64* mask, const Eigen::GpuDevice& d);
+                  const int pooled_width, const int kernel_h,
+                  const int kernel_w, const int stride_h, const int stride_w,
+                  const int pad_t, const int pad_l, T* top_data, int64* mask,
+                  const Eigen::GpuDevice& d);
 };
 
-
 template <typename T>
 struct MaxPoolBackwardWithArgmax {
   bool operator()(const int output_size, const int input_size,
-                  const T* top_diff, const int64* mask,
-                  const int top_offset, const int bottom_offset,
-                  T* bottom_diff, const Eigen::GpuDevice& d);
+                  const T* top_diff, const int64* mask, const int top_offset,
+                  const int bottom_offset, T* bottom_diff,
+                  const Eigen::GpuDevice& d);
 };
 
 template <typename T>
 struct MaxPoolBackwardNoMask {
-  bool operator()(const T* bottom_data, const int batch,
-                  const int height, const int width,
-                  const int channels, const int pooled_height,
+  bool operator()(const T* bottom_data, const int batch, const int height,
+                  const int width, const int channels, const int pooled_height,
                   const int pooled_width, const int kernel_h,
-                  const int kernel_w, const int stride_h,
-                  const int stride_w, const int pad_t, const int pad_l,
-                  const T* top_diff, T* bottom_diff,
-                  const Eigen::GpuDevice& d);
+                  const int kernel_w, const int stride_h, const int stride_w,
+                  const int pad_t, const int pad_l, const T* top_diff,
+                  T* bottom_diff, const Eigen::GpuDevice& d);
 };
 
 template <typename T>
 struct MaxPoolGradBackwardWithArgmax {
   bool operator()(const int output_size, const int input_size,
-                  const T* top_diff, const int64* mask,
-                  const int top_offset, const int bottom_offset,
-                  T* bottom_diff, const Eigen::GpuDevice& d);
+                  const T* top_diff, const int64* mask, const int top_offset,
+                  const int bottom_offset, T* bottom_diff,
+                  const Eigen::GpuDevice& d);
 };
 
 template <typename T>
@@ -75,12 +73,10 @@ struct MaxPoolGradBackwardNoMask {
   bool operator()(TensorFormat data_format, const T* bottom_data,
                   const T* output_data, const int batch,
                   const int pooled_height, const int pooled_width,
-                  const int channels, const int height,
-                  const int width, const int kernel_h,
-                  const int kernel_w, const int stride_h,
+                  const int channels, const int height, const int width,
+                  const int kernel_h, const int kernel_w, const int stride_h,
                   const int stride_w, const int pad_t, const int pad_l,
-                  const T* top_diff, T* bottom_diff,
-                  const Eigen::GpuDevice& d);
+                  const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/meta_support.h b/tensorflow/core/kernels/meta_support.h
index 0d87baf0344a4baf1b3137028c68c83d416e2079..53aece78e87c17cac76866a84c930f3024d38cae 100644
--- a/tensorflow/core/kernels/meta_support.h
+++ b/tensorflow/core/kernels/meta_support.h
@@ -64,7 +64,7 @@ bool IsSupportedAndEnabled();
 //     sum((a_data[i, l] + offset_a) * (b_data[l, j] + offset_b)) : l in [0, k)
 //
 // If transpose_a is false the lhs operand has row major layout, otherwise
-// column major. Similarily transpose_b describes the layout of the rhs operand.
+// column major. Similarly transpose_b describes the layout of the rhs operand.
 // lda, ldb, and ldc are the strides of the lhs operand, rhs operand and the
 // result arrays.
 void QuantizedGemm(OpKernelContext* context, bool transpose_a, bool transpose_b,
diff --git a/tensorflow/core/kernels/mfcc.cc b/tensorflow/core/kernels/mfcc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2793005aa2678b4017dc7a562b8362470e43b8ed
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc.cc
@@ -0,0 +1,67 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <math.h>
+
+#include "tensorflow/core/kernels/mfcc.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+const double kDefaultUpperFrequencyLimit = 4000;
+const double kDefaultLowerFrequencyLimit = 20;
+const double kFilterbankFloor = 1e-12;
+const int kDefaultFilterbankChannelCount = 40;
+const int kDefaultDCTCoefficientCount = 13;
+
+Mfcc::Mfcc() : initialized_(false),
+               lower_frequency_limit_(kDefaultLowerFrequencyLimit),
+               upper_frequency_limit_(kDefaultUpperFrequencyLimit),
+               filterbank_channel_count_(kDefaultFilterbankChannelCount),
+               dct_coefficient_count_(kDefaultDCTCoefficientCount) { }
+
+bool Mfcc::Initialize(int input_length,
+                      double input_sample_rate) {
+  bool initialized = mel_filterbank_.Initialize(input_length,
+                                                input_sample_rate,
+                                                filterbank_channel_count_,
+                                                lower_frequency_limit_,
+                                                upper_frequency_limit_);
+  initialized &= dct_.Initialize(filterbank_channel_count_,
+                                 dct_coefficient_count_);
+  initialized_ = initialized;
+  return initialized;
+}
+
+void Mfcc::Compute(const std::vector<double>& spectrogram_frame,
+                   std::vector<double>* output) const {
+  if (!initialized_) {
+    LOG(ERROR) << "Mfcc not initialized.";
+    return;
+  }
+  std::vector<double> working;
+  mel_filterbank_.Compute(spectrogram_frame, &working);
+  for (int i = 0; i < working.size(); ++i) {
+    double val = working[i];
+    if (val < kFilterbankFloor) {
+      val = kFilterbankFloor;
+    }
+    working[i] = log(val);
+  }
+  dct_.Compute(working, output);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
new file mode 100644
index 0000000000000000000000000000000000000000..c39f10499091f0b5c6c74a3e70a812169b84c807
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc.h
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic class for computing MFCCs from spectrogram slices.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
+
+#include <vector>
+
+#include "tensorflow/core/kernels/mfcc_dct.h"
+#include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class Mfcc {
+ public:
+  Mfcc();
+  bool Initialize(int input_length,
+                  double input_sample_rate);
+
+  // Input is a single magnitude spectrogram frame. The input spectrum
+  // is filtered into bands using a triangular mel filterbank and a
+  // discrete cosine transform (DCT) of the values is taken. Output is
+  // populated with the lowest dct_coefficient_count of these values.
+  void Compute(const std::vector<double>& spectrogram_frame,
+               std::vector<double>* output) const;
+
+  void set_upper_frequency_limit(double upper_frequency_limit) {
+    CHECK(!initialized_) << "Set frequency limits before calling Initialize.";
+    upper_frequency_limit_ = upper_frequency_limit;
+  }
+
+  void set_lower_frequency_limit(double lower_frequency_limit) {
+    CHECK(!initialized_) << "Set frequency limits before calling Initialize.";
+    lower_frequency_limit_ = lower_frequency_limit;
+  }
+
+  void set_filterbank_channel_count(int filterbank_channel_count) {
+    CHECK(!initialized_) << "Set channel count before calling Initialize.";
+    filterbank_channel_count_ = filterbank_channel_count;
+  }
+
+  void set_dct_coefficient_count(int dct_coefficient_count) {
+    CHECK(!initialized_) << "Set coefficient count before calling Initialize.";
+    dct_coefficient_count_ = dct_coefficient_count;
+  }
+
+ private:
+  MfccMelFilterbank mel_filterbank_;
+  MfccDct dct_;
+  bool initialized_;
+  double lower_frequency_limit_;
+  double upper_frequency_limit_;
+  int filterbank_channel_count_;
+  int dct_coefficient_count_;
+  TF_DISALLOW_COPY_AND_ASSIGN(Mfcc);
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_H_
diff --git a/tensorflow/core/kernels/mfcc_dct.cc b/tensorflow/core/kernels/mfcc_dct.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa67a8d6499dd7156ce4c109f668d86740afca90
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_dct.cc
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/mfcc_dct.h"
+
+#include <math.h>
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+MfccDct::MfccDct() : initialized_(false) {}
+
+bool MfccDct::Initialize(int input_length, int coefficient_count) {
+  coefficient_count_ = coefficient_count;
+  input_length_ = input_length;
+
+  if (coefficient_count_ < 1) {
+    LOG(ERROR) << "Coefficient count must be positive.";
+    return false;
+  }
+
+  if (input_length < 1) {
+    LOG(ERROR) << "Input length must be positive.";
+    return false;
+  }
+
+  if (coefficient_count_ > input_length_) {
+    LOG(ERROR) << "Coefficient count must be less than or equal to "
+               << "input length.";
+    return false;
+  }
+
+  cosines_.resize(coefficient_count_);
+  double fnorm = sqrt(2.0 / input_length_);
+  // Some platforms don't have M_PI, so define a local constant here.
+  const double pi = std::atan(1) * 4;
+  double arg = pi / input_length_;
+  for (int i = 0; i < coefficient_count_; ++i) {
+    cosines_[i].resize(input_length_);
+    for (int j = 0; j < input_length_; ++j) {
+      cosines_[i][j] = fnorm * cos(i * arg * (j + 0.5));
+    }
+  }
+  initialized_ = true;
+  return true;
+}
+
+void MfccDct::Compute(const std::vector<double> &input,
+                      std::vector<double> *output) const {
+  if (!initialized_) {
+    LOG(ERROR) << "DCT not initialized.";
+    return;
+  }
+
+  output->resize(coefficient_count_);
+  int length = input.size();
+  if (length > input_length_) {
+    length = input_length_;
+  }
+
+  for (int i = 0; i < coefficient_count_; ++i) {
+    double sum = 0.0;
+    for (int j = 0; j < length; ++j) {
+      sum += cosines_[i][j] * input[j];
+    }
+    (*output)[i] = sum;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_dct.h b/tensorflow/core/kernels/mfcc_dct.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fa3c01628d7f4888e6dd2c9cb5a1ef664e42723
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_dct.h
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic minimal DCT class for MFCC speech processing.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class MfccDct {
+ public:
+  MfccDct();
+  bool Initialize(int input_length, int coefficient_count);
+  void Compute(const std::vector<double>& input,
+               std::vector<double>* output) const;
+
+ private:
+  bool initialized_;
+  int coefficient_count_;
+  int input_length_;
+  std::vector<std::vector<double> > cosines_;
+  TF_DISALLOW_COPY_AND_ASSIGN(MfccDct);
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_DCT_H_
diff --git a/tensorflow/core/kernels/mfcc_dct_test.cc b/tensorflow/core/kernels/mfcc_dct_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7526278fe9e9c324025af5bbe48eb09c57b62206
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_dct_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/mfcc_dct.h"
+
+#include <vector>
+
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+TEST(MfccDctTest, AgreesWithMatlab) {
+  // This test verifies the DCT against MATLAB's dct function.
+  MfccDct dct;
+  std::vector<double> input = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  const int kCoefficientCount = 6;
+  ASSERT_TRUE(dct.Initialize(input.size(), kCoefficientCount));
+  std::vector<double> output;
+  dct.Compute(input, &output);
+  // Note, the matlab dct function divides the first coefficient by
+  // sqrt(2), whereas we don't, so we multiply the first element of
+  // the matlab result by sqrt(2) to get the expected values below.
+  std::vector<double> expected = {12.1243556530, -4.1625617959, 0.0,
+                                  -0.4082482905, 0.0,           -0.0800788912};
+  ASSERT_EQ(output.size(), kCoefficientCount);
+  for (int i = 0; i < kCoefficientCount; ++i) {
+    EXPECT_NEAR(output[i], expected[i], 1e-10);
+  }
+}
+
+TEST(MfccDctTest, InitializeFailsOnInvalidInput) {
+  MfccDct dct1;
+  EXPECT_FALSE(dct1.Initialize(-50, 1));
+  MfccDct dct2;
+  EXPECT_FALSE(dct1.Initialize(10, -4));
+  MfccDct dct3;
+  EXPECT_FALSE(dct1.Initialize(-1, -1));
+  MfccDct dct4;
+  EXPECT_FALSE(dct1.Initialize(20, 21));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.cc b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d68c60280d992a32bc79c0fffe84bf1b48b76437
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.cc
@@ -0,0 +1,204 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This code resamples the FFT bins, and smooths then with triangle-shaped
+// weights to create a mel-frequency filter bank. For filter i centered at f_i,
+// there is a triangular weighting of the FFT bins that extends from
+// filter f_i-1 (with a value of zero at the left edge of the triangle) to f_i
+// (where the filter value is 1) to f_i+1 (where the filter values returns to
+// zero).
+
+// Note: this code fails if you ask for too many channels.  The algorithm used
+// here assumes that each FFT bin contributes to at most two channels: the
+// right side of a triangle for channel i, and the left side of the triangle
+// for channel i+1.  If you ask for so many channels that some of the
+// resulting mel triangle filters are smaller than a single FFT bin, these
+// channels may end up with no contributing FFT bins.  The resulting mel
+// spectrum output will have some channels that are always zero.
+
+#include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
+
+#include <math.h>
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
+
+bool MfccMelFilterbank::Initialize(int input_length,
+                               double input_sample_rate,
+                               int output_channel_count,
+                               double lower_frequency_limit,
+                               double upper_frequency_limit) {
+  num_channels_ = output_channel_count;
+  sample_rate_  = input_sample_rate;
+  input_length_ = input_length;
+
+  if (num_channels_ < 1) {
+    LOG(ERROR) << "Number of filterbank channels must be positive.";
+    return false;
+  }
+
+  if (sample_rate_ <= 0) {
+    LOG(ERROR) << "Sample rate must be positive.";
+    return false;
+  }
+
+  if (input_length < 2) {
+    LOG(ERROR) << "Input length must greater than 1.";
+    return false;
+  }
+
+  if (lower_frequency_limit <= 0) {
+    LOG(ERROR) << "Lower frequency limit must be positive.";
+    return false;
+  }
+
+  if (upper_frequency_limit <= lower_frequency_limit) {
+    LOG(ERROR) << "Upper frequency limit must be greater than "
+               << "lower frequency limit.";
+    return false;
+  }
+
+  // An extra center frequency is computed at the top to get the upper
+  // limit on the high side of the final triangular filter.
+  center_frequencies_.resize(num_channels_ + 1);
+  const double mel_low = FreqToMel(lower_frequency_limit);
+  const double mel_hi = FreqToMel(upper_frequency_limit);
+  const double mel_span = mel_hi - mel_low;
+  const double mel_spacing = mel_span / static_cast<double>(num_channels_ + 1);
+  for (int i = 0; i < num_channels_ + 1; ++i) {
+    center_frequencies_[i] = mel_low + (mel_spacing * (i + 1));
+  }
+
+  // Always exclude DC; emulate HTK.
+  const double hz_per_sbin = 0.5 * sample_rate_ /
+      static_cast<double>(input_length_ - 1);
+  start_index_ = static_cast<int>(1.5 + (lower_frequency_limit /
+                                           hz_per_sbin));
+  end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin);
+
+  // Maps the input spectrum bin indices to filter bank channels/indices. For
+  // each FFT bin, band_mapper tells us which channel this bin contributes to
+  // on the right side of the triangle.  Thus this bin also contributes to the
+  // left side of the next channel's triangle response.
+  band_mapper_.resize(input_length_);
+  int channel = 0;
+  for (int i = 0; i < input_length_; ++i) {
+    double melf = FreqToMel(i * hz_per_sbin);
+    if ((i < start_index_) || (i > end_index_)) {
+      band_mapper_[i] = -2;  // Indicate an unused Fourier coefficient.
+    } else {
+      while ((center_frequencies_[channel] < melf) &&
+             (channel < num_channels_)) {
+        ++channel;
+      }
+      band_mapper_[i] = channel - 1;  // Can be == -1
+    }
+  }
+
+  // Create the weighting functions to taper the band edges.  The contribution
+  // of any one FFT bin is based on its distance along the continuum between two
+  // mel-channel center frequencies.  This bin contributes weights_[i] to the
+  // current channel and 1-weights_[i] to the next channel.
+  weights_.resize(input_length_);
+  for (int i = 0; i < input_length_; ++i) {
+    channel = band_mapper_[i];
+    if ((i < start_index_) || (i > end_index_)) {
+      weights_[i] = 0.0;
+    } else {
+      if (channel >= 0) {
+        weights_[i] = (center_frequencies_[channel + 1] -
+                       FreqToMel(i * hz_per_sbin)) /
+            (center_frequencies_[channel + 1] - center_frequencies_[channel]);
+      } else {
+        weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) /
+            (center_frequencies_[0] - mel_low);
+      }
+    }
+  }
+  // Check the sum of FFT bin weights for every mel band to identify
+  // situations where the mel bands are so narrow that they don't get
+  // significant weight on enough (or any) FFT bins -- i.e., too many
+  // mel bands have been requested for the given FFT size.
+  std::vector<int> bad_channels;
+  for (int c = 0; c < num_channels_; ++c) {
+    float band_weights_sum = 0.0;
+    for (int i = 0; i < input_length_; ++i) {
+      if (band_mapper_[i] == c - 1) {
+        band_weights_sum += (1.0 - weights_[i]);
+      } else if (band_mapper_[i] == c) {
+        band_weights_sum += weights_[i];
+      }
+    }
+    // The lowest mel channels have the fewest FFT bins and the lowest
+    // weights sum.  But given that the target gain at the center frequency
+    // is 1.0, if the total sum of weights is 0.5, we're in bad shape.
+    if (band_weights_sum < 0.5) {
+      bad_channels.push_back(c);
+    }
+  }
+  if (!bad_channels.empty()) {
+    LOG(ERROR) << "Missing " << bad_channels.size() << " bands " <<
+        " starting at " << bad_channels[0] <<
+        " in mel-frequency design. " <<
+        "Perhaps too many channels or " <<
+        "not enough frequency resolution in spectrum. (" <<
+        "input_length: " << input_length <<
+        " input_sample_rate: " << input_sample_rate <<
+        " output_channel_count: " << output_channel_count <<
+        " lower_frequency_limit: " << lower_frequency_limit <<
+        " upper_frequency_limit: " << upper_frequency_limit;
+  }
+  initialized_ = true;
+  return true;
+}
+
+// Compute the mel spectrum from the squared-magnitude FFT input by taking the
+// square root, then summing FFT magnitudes under triangular integration windows
+// whose widths increase with frequency.
+void MfccMelFilterbank::Compute(const std::vector<double> &input,
+                            std::vector<double> *output) const {
+  if (!initialized_) {
+    LOG(ERROR) << "Mel Filterbank not initialized.";
+    return;
+  }
+
+  if (input.size() <= end_index_) {
+    LOG(ERROR) << "Input too short to compute filterbank";
+    return;
+  }
+
+  // Ensure output is right length and reset all values.
+  output->assign(num_channels_, 0.0);
+
+  for (int i = start_index_; i <= end_index_; i++) {  // For each FFT bin
+    double spec_val = sqrt(input[i]);
+    double weighted = spec_val * weights_[i];
+    int channel = band_mapper_[i];
+    if (channel >= 0)
+      (*output)[channel] += weighted;  // Right side of triangle, downward slope
+    channel++;
+    if (channel < num_channels_)
+      (*output)[channel] += spec_val - weighted;  // Left side of triangle
+  }
+}
+
+double MfccMelFilterbank::FreqToMel(double freq) const {
+  return 1127.0 * log(1.0 + (freq / 700.0));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
new file mode 100644
index 0000000000000000000000000000000000000000..33ea1bdb5bc3e2a2326913c99f2f6713bd82f096
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic class for applying a mel-scale filterbank to an input.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
+
+#include <vector>
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+class MfccMelFilterbank {
+ public:
+  MfccMelFilterbank();
+  bool Initialize(int input_length,  // Number of unique FFT bins fftsize/2+1.
+                  double input_sample_rate,
+                  int output_channel_count,
+                  double lower_frequency_limit,
+                  double upper_frequency_limit);
+
+  // Takes a magnitude spectrogram slice as input, computes a
+  // traingular mel filterbank and places the result in output.
+  void Compute(const std::vector<double>& input,
+               std::vector<double>* output) const;
+
+ private:
+  double FreqToMel(double freq) const;
+  bool initialized_;
+  int num_channels_;
+  double sample_rate_;
+  int input_length_;
+  std::vector<double> center_frequencies_;  // In mel, for each mel channel.
+
+  // Each FFT bin b contributes to two triangular mel channels, with
+  // proportion weights_[b] going into mel channel band_mapper_[b], and
+  // proportion (1 - weights_[b]) going into channel band_mapper_[b] + 1.
+  // Thus, weights_ contains the weighting applied to each FFT bin for the
+  // upper-half of the triangular band.
+  std::vector<double> weights_;  // Right-side weight for this fft  bin.
+
+  // FFT bin i contributes to the upper side of mel channel band_mapper_[i]
+  std::vector<int> band_mapper_;
+  int start_index_;  // Lowest FFT bin used to calculate mel spectrum.
+  int end_index_;  // Highest FFT bin used to calculate mel spectrum.
+
+  TF_DISALLOW_COPY_AND_ASSIGN(MfccMelFilterbank);
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3a7e7794038597fd3e5536598205cfdfa0c6ba5
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
+
+#include <vector>
+
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+TEST(MfccMelFilterbankTest, AgreesWithPythonGoldenValues) {
+  // This test verifies the Mel filterbank against "golden values".
+  // Golden values are from an independent Python Mel implementation.
+  MfccMelFilterbank filterbank;
+
+  std::vector<double> input;
+  const int kSampleCount = 513;
+  for (int i = 0; i < kSampleCount; ++i) {
+    input.push_back(i + 1);
+  }
+  const int kChannelCount = 20;
+  filterbank.Initialize(input.size(),
+                        22050 /* sample rate */,
+                        kChannelCount /* channels */,
+                        20.0 /*  lower frequency limit */,
+                        4000.0 /* upper frequency limit */);
+
+  std::vector<double> output;
+  filterbank.Compute(input, &output);
+
+  std::vector<double> expected = {
+      7.38894574,   10.30330648, 13.72703292,  17.24158686,  21.35253118,
+      25.77781089,  31.30624108, 37.05877236,  43.9436536,   51.80306637,
+      60.79867148,  71.14363376, 82.90910141,  96.50069158,  112.08428368,
+      129.96721968, 150.4277597, 173.74997634, 200.86037462, 231.59802942};
+
+  ASSERT_EQ(output.size(), kChannelCount);
+
+  for (int i = 0; i < kChannelCount; ++i) {
+    EXPECT_NEAR(output[i], expected[i], 1e-04);
+  }
+}
+
+TEST(MfccMelFilterbankTest, IgnoresExistingContentOfOutputVector) {
+  // Test for bug where the output vector was not cleared before
+  // accumulating next frame's weighted spectral values.
+  MfccMelFilterbank filterbank;
+
+  const int kSampleCount = 513;
+  std::vector<double> input;
+  std::vector<double> output;
+
+  filterbank.Initialize(kSampleCount,
+                        22050 /* sample rate */,
+                        20 /* channels */,
+                        20.0 /*  lower frequency limit */,
+                        4000.0 /* upper frequency limit */);
+
+
+  // First call with nonzero input value, and an empty output vector,
+  // will resize the output and fill it with the correct, nonzero outputs.
+  input.assign(kSampleCount, 1.0);
+  filterbank.Compute(input, &output);
+  for (const double value : output) {
+    EXPECT_LE(0.0, value);
+  }
+
+  // Second call with zero input should also generate zero output.  However,
+  // the output vector now is already the correct size, but full of nonzero
+  // values.  Make sure these don't affect the output.
+  input.assign(kSampleCount, 0.0);
+  filterbank.Compute(input, &output);
+  for (const double value : output) {
+    EXPECT_EQ(0.0, value);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_op.cc b/tensorflow/core/kernels/mfcc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..02643857c1f52cf2b2a36f487347ae09bf73d319
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_op.cc
@@ -0,0 +1,111 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/audio_ops.cc
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/mfcc.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Create a speech fingerpring from spectrogram data.
+class MfccOp : public OpKernel {
+ public:
+  explicit MfccOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("upper_frequency_limit",
+                                             &upper_frequency_limit_));
+    OP_REQUIRES_OK(context, context->GetAttr("lower_frequency_limit",
+                                             &lower_frequency_limit_));
+    OP_REQUIRES_OK(context, context->GetAttr("filterbank_channel_count",
+                                             &filterbank_channel_count_));
+    OP_REQUIRES_OK(context, context->GetAttr("dct_coefficient_count",
+                                             &dct_coefficient_count_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& spectrogram = context->input(0);
+    OP_REQUIRES(context, spectrogram.dims() == 3,
+                errors::InvalidArgument("spectrogram must be 3-dimensional",
+                                        spectrogram.shape().DebugString()));
+    const Tensor& sample_rate_tensor = context->input(1);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(sample_rate_tensor.shape()),
+                errors::InvalidArgument(
+                    "Input sample_rate should be a scalar tensor, got ",
+                    sample_rate_tensor.shape().DebugString(), " instead."));
+    const int32 sample_rate = sample_rate_tensor.scalar<int32>()();
+
+    const int spectrogram_channels = spectrogram.dim_size(2);
+    const int spectrogram_samples = spectrogram.dim_size(1);
+    const int audio_channels = spectrogram.dim_size(0);
+
+    Mfcc mfcc;
+    mfcc.set_upper_frequency_limit(upper_frequency_limit_);
+    mfcc.set_lower_frequency_limit(lower_frequency_limit_);
+    mfcc.set_filterbank_channel_count(filterbank_channel_count_);
+    mfcc.set_dct_coefficient_count(dct_coefficient_count_);
+    OP_REQUIRES(context, mfcc.Initialize(spectrogram_channels, sample_rate),
+                errors::InvalidArgument(
+                    "Mfcc initialization failed for channel count ",
+                    spectrogram_channels, " and sample rate ", sample_rate));
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       0,
+                       TensorShape({audio_channels, spectrogram_samples,
+                                    dct_coefficient_count_}),
+                       &output_tensor));
+
+    const float* spectrogram_flat = spectrogram.flat<float>().data();
+    float* output_flat = output_tensor->flat<float>().data();
+
+    for (int audio_channel = 0; audio_channel < audio_channels;
+         ++audio_channel) {
+      for (int spectrogram_sample = 0; spectrogram_sample < spectrogram_samples;
+           ++spectrogram_sample) {
+        const float* sample_data =
+            spectrogram_flat +
+            (audio_channel * spectrogram_samples * spectrogram_channels) +
+            (spectrogram_sample * spectrogram_channels);
+        std::vector<double> mfcc_input(sample_data,
+                                       sample_data + spectrogram_channels);
+        std::vector<double> mfcc_output;
+        mfcc.Compute(mfcc_input, &mfcc_output);
+        DCHECK_EQ(dct_coefficient_count_, mfcc_output.size());
+        float* output_data =
+            output_flat +
+            (audio_channel * spectrogram_samples * dct_coefficient_count_) +
+            (spectrogram_sample * dct_coefficient_count_);
+        for (int i = 0; i < dct_coefficient_count_; ++i) {
+          output_data[i] = mfcc_output[i];
+        }
+      }
+    }
+  }
+
+ private:
+  float upper_frequency_limit_;
+  float lower_frequency_limit_;
+  int32 filterbank_channel_count_;
+  int32 dct_coefficient_count_;
+};
+REGISTER_KERNEL_BUILDER(Name("Mfcc").Device(DEVICE_CPU), MfccOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_op_test.cc b/tensorflow/core/kernels/mfcc_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d16171d5265a43932f60afce2bc623a4dbae29cc
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_op_test.cc
@@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+using namespace ops;  // NOLINT(build/namespaces)
+
+TEST(MfccOpTest, SimpleTest) {
+  Scope root = Scope::NewRootScope();
+
+  Tensor spectrogram_tensor(DT_FLOAT, TensorShape({1, 1, 513}));
+  test::FillIota<float>(&spectrogram_tensor, 1.0f);
+
+  Output spectrogram_const_op = Const(root.WithOpName("spectrogram_const_op"),
+                                      Input::Initializer(spectrogram_tensor));
+
+  Output sample_rate_const_op =
+      Const(root.WithOpName("sample_rate_const_op"), 22050);
+
+  Mfcc mfcc_op = Mfcc(root.WithOpName("mfcc_op"), spectrogram_const_op,
+                      sample_rate_const_op);
+
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+
+  TF_EXPECT_OK(
+      session.Run(ClientSession::FeedType(), {mfcc_op.output}, &outputs));
+
+  const Tensor& mfcc_tensor = outputs[0];
+
+  EXPECT_EQ(3, mfcc_tensor.dims());
+  EXPECT_EQ(13, mfcc_tensor.dim_size(2));
+  EXPECT_EQ(1, mfcc_tensor.dim_size(1));
+  EXPECT_EQ(1, mfcc_tensor.dim_size(0));
+
+  test::ExpectTensorNear<float>(
+      mfcc_tensor,
+      test::AsTensor<float>(
+          {29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878,
+           -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029,
+           -0.0769791, -0.10806114, -0.06047613},
+          TensorShape({1, 1, 13})),
+      1e-3);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mfcc_test.cc b/tensorflow/core/kernels/mfcc_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ab726e5b9caa6a69f5106fe13644cdfa40d1b84
--- /dev/null
+++ b/tensorflow/core/kernels/mfcc_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/mfcc.h"
+
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+TEST(MfccTest, AgreesWithPythonGoldenValues) {
+  Mfcc mfcc;
+  std::vector<double> input;
+  const int kSampleCount = 513;
+  for (int i = 0; i < kSampleCount; ++i) {
+    input.push_back(i + 1);
+  }
+
+  ASSERT_TRUE(mfcc.Initialize(input.size(), 22050 /*sample rate*/));
+
+  std::vector<double> output;
+  mfcc.Compute(input, &output);
+
+  std::vector<double> expected = {29.13970072, -6.41568601, -0.61903012,
+                             -0.96778652, -0.26819878, -0.40907028,
+                             -0.15614748, -0.23203119, -0.10481487,
+                             -0.1543029,  -0.0769791,  -0.10806114,
+                             -0.06047613};
+
+  ASSERT_EQ(expected.size(), output.size());
+  for (int i = 0; i < output.size(); ++i) {
+    EXPECT_NEAR(output[i], expected[i], 1e-04);
+  }
+}
+
+TEST(MfccTest, AvoidsNansWithZeroInput) {
+  Mfcc mfcc;
+  std::vector<double> input;
+  const int kSampleCount = 513;
+  for (int i = 0; i < kSampleCount; ++i) {
+    input.push_back(0.0);
+  }
+
+  ASSERT_TRUE(mfcc.Initialize(input.size(), 22050 /*sample rate*/));
+
+  std::vector<double> output;
+  mfcc.Compute(input, &output);
+
+  int expected_size = 13;
+  ASSERT_EQ(expected_size, output.size());
+  for (const double value : output) {
+    EXPECT_FALSE(isnan(value));
+  }
+}
+
+TEST(MfccTest, SimpleInputSaneResult) {
+  Mfcc mfcc;
+  mfcc.set_lower_frequency_limit(125.0);
+  mfcc.set_upper_frequency_limit(3800.0);
+  mfcc.set_filterbank_channel_count(40);
+  mfcc.set_dct_coefficient_count(40);
+  const int kSpectrogramSize = 129;
+  std::vector<double> input(kSpectrogramSize, 0.0);
+
+  // Simulate a low-frequency sinusoid from the spectrogram.
+  const int kHotBin = 10;
+  input[kHotBin] = 1.0;
+  ASSERT_TRUE(mfcc.Initialize(input.size(), 8000));
+
+  std::vector<double> output;
+  mfcc.Compute(input, &output);
+
+  // For a single low-frequency input, output beyond c_0 should look like
+  // a slow cosine, with a slight delay.  Largest value will be c_1.
+  EXPECT_EQ(output.begin() + 1, std::max_element(output.begin(), output.end()));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index f1f6a9ce53ac9de5be5448fd538de03ecb48a35b..d90baee069c17e9b25169dcb2650681f6103f9b1 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -29,10 +29,9 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 template <typename Device, typename T>
-class MklAvgPoolingOp : public UnaryOp<T> {
+class MklAvgPoolingOp : public OpKernel {
  public:
-  explicit MklAvgPoolingOp(OpKernelConstruction* context)
-      : UnaryOp<T>(context) {
+  explicit MklAvgPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
@@ -78,6 +77,7 @@ class MklAvgPoolingOp : public UnaryOp<T> {
     Tensor mkl_tmp_input_buf_tensor_;
     mkl_context.MklCreateLayoutsAndPrimitives(context,
                                               &mkl_tmp_input_buf_tensor_);
+    OP_REQUIRES_OK(context, context->status());
 
     Tensor workspace_tensor;
     void* workspace_buf;
@@ -120,7 +120,7 @@ class MklAvgPoolingOp : public UnaryOp<T> {
                                 mkl_out_shape.GetMklLayout())) /
                             sizeof(T));
 
-    AllocateOutputSetMklshape(context, 0, &output, tensor_out_shape,
+    AllocateOutputSetMklShape(context, 0, &output, tensor_out_shape,
                               mkl_out_shape);
     mkl_context.pooling_res[dnnResourceDst] =
         static_cast<void*>(output->flat<T>().data());
@@ -138,9 +138,10 @@ class MklAvgPoolingOp : public UnaryOp<T> {
   typedef struct {
     MklPoolingOpParams params;
     MklShape input_shape;
-    dnnPrimitive_t prim_pooling_fwd, convert_input;
-    dnnLayout_t lt_user_input, lt_prim_input, lt_workspace;
-    void* input_buf;
+    dnnPrimitive_t prim_pooling_fwd = nullptr, convert_input = nullptr;
+    dnnLayout_t lt_user_input = nullptr, lt_prim_input = nullptr,
+                lt_workspace = nullptr;
+    void* input_buf = nullptr;
     void* pooling_res[dnnResourceNumber];
 
     void MklCreateLayoutsAndPrimitives(OpKernelContext* context,
@@ -243,6 +244,11 @@ class MklAvgPoolingGradOp : public OpKernel {
     pool_params.Init(context, ksize_, stride_, padding_, data_format_,
                      output_shape);
 
+    if (outbackprop_in_mkl_format == false)
+      mkl_context.params.in_dim = out_backprop.dims();
+    else
+      mkl_context.params.in_dim = mkl_context.out_backprop_shape.GetDimension();
+
     // Extract the parameters for the op from the pooling specs
     ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
 
@@ -250,6 +256,7 @@ class MklAvgPoolingGradOp : public OpKernel {
     Tensor outbackprop_buf_tensor;
     void* outbackprop_buf;
     mkl_context.MklCreateLayoutsAndPrimitives(context);
+    OP_REQUIRES_OK(context, context->status());
 
     // Check if outbackprop layout requires conversion.
     if (!dnnLayoutCompare_F32(mkl_context.lt_user_outbackprop,
@@ -304,7 +311,7 @@ class MklAvgPoolingGradOp : public OpKernel {
                                 mkl_out_shape.GetMklLayout())) /
                             sizeof(T));
 
-    AllocateOutputSetMklshape(context, 0, &output, tensor_out_shape,
+    AllocateOutputSetMklShape(context, 0, &output, tensor_out_shape,
                               mkl_out_shape);
 
     // Set output tensor.
@@ -323,10 +330,10 @@ class MklAvgPoolingGradOp : public OpKernel {
   typedef struct {
     MklPoolingOpParams params;
     MklShape out_backprop_shape;
-    dnnPrimitive_t prim_pooling_bwd, convert_outbackprop;
+    dnnPrimitive_t prim_pooling_bwd = nullptr, convert_outbackprop = nullptr;
     void* pooling_res[dnnResourceNumber];
-    dnnLayout_t lt_user_input, lt_user_outbackprop, lt_prim_outbackprop,
-        lt_workspace;
+    dnnLayout_t lt_user_input = nullptr, lt_user_outbackprop = nullptr,
+                lt_prim_outbackprop = nullptr, lt_workspace = nullptr;
 
     void MklCreateLayoutsAndPrimitives(OpKernelContext* context) {
       const Tensor& tensor_in_shape = MklGetInput(context, 0);
@@ -347,11 +354,6 @@ class MklAvgPoolingGradOp : public OpKernel {
                                             "4-dimensional"));
       } else {
         // Input in MKL format.
-        OP_REQUIRES(
-            context, out_backprop.dims() == 2,
-            errors::InvalidArgument("out_backprop in MKL format must be "
-                                    "2-dimensional"));
-
         // For avgpooling, out_backprop should have 4 dimensions.
         OP_REQUIRES(context, out_backprop_shape.GetDimension() == 4,
                     errors::InvalidArgument("out_backprop must be "
@@ -411,16 +413,16 @@ class MklAvgPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("MklAvgPool")
+REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
-                            .Label(mkl_layer_registry::kMklLayerLabel),
+                            .Label(mkl_op_registry::kMklOpLabel),
                         MklAvgPoolingOp<CPUDevice, float>);
 
-REGISTER_KERNEL_BUILDER(Name("MklAvgPoolGrad")
+REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
-                            .Label(mkl_layer_registry::kMklLayerLabel),
+                            .Label(mkl_op_registry::kMklOpLabel),
                         MklAvgPoolingGradOp<CPUDevice, float>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8a8cc74bfae08e2eeeba65e40a8e77e6ac8a1fd
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -0,0 +1,461 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <limits>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
+
+// TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable
+// reference inputs.
+// --------------------------------------------------------------------------
+//                      Eigen Concat Op
+// --------------------------------------------------------------------------
+template <typename Device, typename T, AxisArgumentName AxisArgName>
+class EigenConcatBaseOp : public OpKernel {
+ public:
+  typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+      ConstMatrixVector;
+
+  explicit EigenConcatBaseOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+  // Although, we modify Compute for this call to accept one extra param,
+  // we need to have empty Compute because Compute is pure virtual function.
+  void Compute(OpKernelContext* c) {}
+
+  void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
+    const Tensor* concat_dim_tensor;
+    const char* axis_attribute_name =
+        AxisArgName == NAME_IS_AXIS
+            ? "axis"
+            : AxisArgName == NAME_IS_CONCAT_DIM ? "concat_dim" : "<invalid>";
+    OP_REQUIRES_OK(c, c->input(axis_attribute_name, &concat_dim_tensor));
+    OP_REQUIRES(c, IsLegacyScalar(concat_dim_tensor->shape()),
+                errors::InvalidArgument(
+                    axis_attribute_name,
+                    " tensor should be a scalar integer, but got shape ",
+                    concat_dim_tensor->shape().DebugString()));
+    const int32 concat_dim =
+        internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+    // Instead of accessing values from context, we use input to Compute.
+    const int N = values.size();
+    const int input_dims = values[0].dims();
+    const TensorShape& input_shape = values[0].shape();
+
+    int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
+    OP_REQUIRES(c,
+                (0 <= axis && axis < input_dims) ||
+                    (allow_legacy_scalars() && concat_dim == 0),
+                errors::InvalidArgument(
+                    "ConcatOp : Expected concatenating dimensions in the range "
+                    "[",
+                    -input_dims, ", ", input_dims, "), but got ", concat_dim));
+    // Note that we reduce the concat of n-dimensional tensors into a two
+    // dimensional concat. Assuming the dimensions of any input/output
+    // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
+    // the dimension indicated with size y0, we flatten it to {x, y}, where y =
+    // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
+    ConstMatrixVector inputs_flat;
+    inputs_flat.reserve(N);
+    int64 inputs_flat_dim0 = 1;
+    for (int d = 0; d < axis; ++d) {
+      inputs_flat_dim0 *= input_shape.dim_size(d);
+    }
+    int64 output_concat_dim = 0;
+    const bool input_is_scalar = IsLegacyScalar(input_shape);
+    for (int i = 0; i < N; ++i) {
+      const auto in = values[i];
+      const bool in_is_scalar = IsLegacyScalar(in.shape());
+      OP_REQUIRES(
+          c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
+          errors::InvalidArgument(
+              "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", in.shape().DebugString()));
+      for (int j = 0; j < input_dims; ++j) {
+        if (j == axis) {
+          continue;
+        }
+        OP_REQUIRES(
+            c, in.dim_size(j) == input_shape.dim_size(j),
+            errors::InvalidArgument(
+                "ConcatOp : Dimensions of inputs should match: shape[0] = ",
+                input_shape.DebugString(), " vs. shape[", i,
+                "] = ", in.shape().DebugString()));
+      }
+      if (in.NumElements() > 0) {
+        int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
+      }
+      // TODO(irving): Remove check once !allow_legacy_scalars().
+      output_concat_dim += in.dims() > 0 ? in.dim_size(axis) : 1;
+    }
+
+    TensorShape output_shape(input_shape);
+    // TODO(irving): Remove rank 0 case once !allow_legacy_scalars().
+    if (output_shape.dims() == 0) {
+      output_shape.AddDim(output_concat_dim);
+    } else {
+      output_shape.set_dim(axis, output_concat_dim);
+    }
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+    if (output->NumElements() > 0) {
+      int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
+      auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+      ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+    }
+  }
+};
+
+// --------------------------------------------------------------------------
+//                      Mkl Concat Op
+// --------------------------------------------------------------------------
+
+template <typename Device, typename T, AxisArgumentName AxisArgName>
+class MklConcatOp : public OpKernel {
+ private:
+  TensorFormat data_format_;
+  EigenConcatBaseOp<Device, T, AxisArgName> eigen_concat_op_;
+
+ public:
+  typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+      ConstMatrixVector;
+
+  explicit MklConcatOp(OpKernelConstruction* c)
+      : OpKernel(c), eigen_concat_op_(c) {}
+
+  void Compute(OpKernelContext* context) override {
+    MklConcatOpContext mkl_context;
+
+    // Get input tensors.
+    OpInputList input_tensors;
+    GetMklInputList(context, "values", &input_tensors);
+    const int N = input_tensors.size();
+    // Get MKL shapes.
+    MklShapeList input_shapes(N);
+    GetMklShapeList(context, "values", &input_shapes);
+
+    // If this is Concat, then concat_dim is 0th input.
+    // If this is ConcatV2, then axis is Nth input.
+    const Tensor& concat_dim_tensor = AxisArgName == NAME_IS_CONCAT_DIM
+                                          ? MklGetInput(context, 0)
+                                          : MklGetInput(context, N);
+
+    // Sanity checks
+    OP_REQUIRES(
+        context, IsLegacyScalar(concat_dim_tensor.shape()),
+        errors::InvalidArgument(
+            "Concat dim tensor should be a scalar integer, but got shape ",
+            concat_dim_tensor.shape().DebugString()));
+    int32 concat_dim =
+        internal::SubtleMustCopy(concat_dim_tensor.scalar<int32>()());
+
+    MklShape& inpshape0 = input_shapes[0];
+
+    // Check that all tensors are Mkl, if not we call Eigen version.
+    bool invoke_eigen = false;
+    bool is_concat_dim_channel = true;
+    if (!AreAllMklTensors(input_shapes)) {
+      invoke_eigen = true;
+    }
+
+    // Check that total number of dimensions is 4, if not call Eigen.
+    if (!invoke_eigen) {
+      for (auto& s : input_shapes) {
+        if (s.GetDimension() != 4) {
+          invoke_eigen = true;
+          break;
+        }
+      }
+    }
+
+    // check that concat_dim is channel, if not call Eigen version.
+    if (!invoke_eigen) {
+      for (auto& s : input_shapes) {
+        if (!s.IsMklChannelDim(concat_dim)) {
+          invoke_eigen = true;
+          is_concat_dim_channel = false;
+          break;
+        }
+      }
+    }
+
+    if (invoke_eigen) {
+      string msg = std::string("Invoking Eigen version of Concat. Reason:") +
+                   (!is_concat_dim_channel
+                        ? std::string("Concat dimension is not channel")
+                        : std::string("Not all tensors are in Mkl layout"));
+      VLOG(1) << "_MklConcatOp: " << msg;
+      CallEigenVersion(context, input_tensors, input_shapes);
+      return;
+    }
+
+    // For MKL format, the channel is dimension number 2.
+    // So if we are concating over channel and _all_ inputs are in MKL
+    // format, then we set concat_dim to 2.
+    // Since we have reached till here, it means we are concating
+    // over channel.
+    concat_dim = MklDims::C;
+
+    // One more sanity check: check that ranks of all tensors match
+    // and that their shapes match except for concat_dim.
+    int i = 0;
+    for (auto& s : input_shapes) {
+      size_t exp_dims = inpshape0.GetDimension();
+      OP_REQUIRES(context, s.GetDimension() == exp_dims,
+                  errors::InvalidArgument(
+                      "_MklConcatOp : Ranks of all input tensors should match:"
+                      " input dimensions = ",
+                      s.GetDimension(), " vs. expected rank = ", exp_dims));
+
+      for (int d = 0; d < exp_dims; ++d) {
+        if (d == concat_dim) {
+          continue;
+        }
+
+        size_t exp_size = inpshape0.GetSizes()[d];
+        OP_REQUIRES(
+            context, exp_size == s.GetSizes()[d],
+            errors::InvalidArgument("_MklConcatOp : Dimensions of inputs"
+                                    "should match: shape[0][",
+                                    d, "]= ", exp_size, " vs. shape[", i, "][",
+                                    d, "] = ", s.GetSizes()[d]));
+      }
+      ++i;
+    }
+
+    // Use input MKL layout instead of creating new layouts.
+    int64 output_concat_dim_size = 0;
+    for (auto& s : input_shapes) {
+      output_concat_dim_size +=
+          s.GetDimension() > 0 ? s.GetSizes()[concat_dim] : 1;
+    }
+    mkl_context.MklCreateInputLayouts(context, input_shapes);
+    OP_REQUIRES_OK(context, context->status());
+
+    CHECK_EQ(dnnConcatCreate_F32(&mkl_context.prim_concat, NULL, N,
+                                 &mkl_context.lt_inputs[0]),
+             E_SUCCESS);
+
+    // Calculate output sizes and strides
+    TensorFormat data_format;
+    if (inpshape0.IsTensorInNHWCFormat()) {
+      data_format = FORMAT_NHWC;
+    } else {
+      OP_REQUIRES(
+          context, inpshape0.IsTensorInNCHWFormat(),
+          errors::InvalidArgument(
+              "_MklConcat only supports all inputs in NCHW or NHWC format "));
+      data_format = FORMAT_NCHW;
+    }
+
+    // Since all tensors are in Mkl layout, we copy sizes from input tensor.
+    mkl_context.out_sizes[MklDims::W] = inpshape0.GetSizes()[MklDims::W];
+    mkl_context.out_sizes[MklDims::H] = inpshape0.GetSizes()[MklDims::H];
+    mkl_context.out_sizes[MklDims::C] = output_concat_dim_size;
+    mkl_context.out_sizes[MklDims::N] = inpshape0.GetSizes()[MklDims::N];
+    GetStridesFromSizes(data_format, mkl_context.out_strides,
+                        mkl_context.out_sizes);
+
+    // Set output Mkl shape.
+    int64 dim = 4;
+    MklShape mkl_output_mkl_shape;
+    mkl_output_mkl_shape.SetMklTensor(true);
+    mkl_output_mkl_shape.SetMklLayout(mkl_context.prim_concat, dnnResourceDst);
+    mkl_output_mkl_shape.SetTfLayout(dim, mkl_context.out_sizes,
+                                     mkl_context.out_strides);
+    mkl_output_mkl_shape.SetTfDimOrder(dim, inpshape0.GetTfToMklDimMap());
+
+    TensorShape mkl_output_tf_shape;
+    mkl_output_tf_shape.AddDim(1);
+    mkl_output_tf_shape.AddDim(
+        dnnLayoutGetMemorySize_F32(
+            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+        sizeof(T));
+
+    Tensor* output = nullptr;
+    AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
+                              mkl_output_mkl_shape);
+
+    // Set destination resource.
+    mkl_context.concat_res[dnnResourceDst] =
+        const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
+
+    mkl_context.mkl_tmp_tensors.resize(N);
+    mkl_context.MklPrepareConcatInputs(context, input_tensors);
+    OP_REQUIRES_OK(context, context->status());
+
+    // Execute primitive.
+    CHECK_EQ(dnnExecute_F32(mkl_context.prim_concat, mkl_context.concat_res),
+             E_SUCCESS);
+
+    mkl_context.MklCleanup();
+    OP_REQUIRES_OK(context, context->status());
+  }
+
+ private:
+  typedef struct {
+    TensorFormat data_format;
+    size_t out_sizes[4];
+    size_t out_strides[4];
+    dnnPrimitive_t prim_concat;
+    void* concat_res[dnnResourceNumber];
+    std::vector<dnnLayout_t> lt_inputs;
+    std::vector<Tensor> mkl_tmp_tensors;
+
+    // Create MKL dnnLayout_t objects for tensors coming into the layer
+    // We only support case where input tensors are all in Mkl layout.
+    void MklCreateInputLayouts(OpKernelContext* context,
+                               MklShapeList& input_shapes) {
+      for (auto& is : input_shapes) {
+        CHECK_EQ(is.IsMklTensor(), true);
+        lt_inputs.push_back((dnnLayout_t)is.GetCurLayout());
+      }
+    }
+
+    void MklPrepareConcatInputs(OpKernelContext* context,
+                                OpInputList& input_tensors) {
+      CHECK_EQ(lt_inputs.size(), mkl_tmp_tensors.size());
+
+      for (int i = 0; i < lt_inputs.size(); ++i) {
+        dnnPrimitive_t mkl_prim_convert_input;
+        dnnLayout_t mkl_lt_internal_input;
+        void* mkl_buf_convert_input = nullptr;
+
+        CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+                     &mkl_lt_internal_input, prim_concat,
+                     (dnnResourceType_t)(dnnResourceMultipleSrc + i)),
+                 E_SUCCESS);
+
+        if (!dnnLayoutCompare_F32(lt_inputs[i], mkl_lt_internal_input)) {
+          CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
+                                           lt_inputs[i], mkl_lt_internal_input),
+                   E_SUCCESS);
+
+          AllocTmpBuffer(context, &mkl_tmp_tensors[i], mkl_lt_internal_input,
+                         &mkl_buf_convert_input);
+
+          CHECK_EQ(dnnConversionExecute_F32(
+                       mkl_prim_convert_input,
+                       const_cast<void*>(static_cast<const void*>(
+                           input_tensors[i].flat<T>().data())),
+                       mkl_buf_convert_input),
+                   E_SUCCESS);
+
+          concat_res[dnnResourceMultipleSrc + i] = mkl_buf_convert_input;
+          CHECK_EQ(dnnDelete_F32(mkl_prim_convert_input), E_SUCCESS);
+        } else {
+          concat_res[dnnResourceMultipleSrc + i] = const_cast<void*>(
+              static_cast<const void*>(input_tensors[i].flat<T>().data()));
+        }
+
+        CHECK_EQ(dnnLayoutDelete_F32(mkl_lt_internal_input), E_SUCCESS);
+      }
+    }
+
+    void MklCleanup() {
+      for (auto& lt : lt_inputs) {
+        lt = nullptr;
+      }
+      CHECK_EQ(dnnDelete_F32(prim_concat), E_SUCCESS);
+    }
+  } MklConcatOpContext;
+
+  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+                        const MklShapeList& input_shapes) {
+    // Before calling Eigen version, we need to convert Mkl tensors to TF.
+    // First check that the number of input tensors and the number of Mkl
+    // shapes match.
+    CHECK_EQ(values.size(), input_shapes.size());
+
+    std::vector<Tensor> converted_values;
+    for (int i = 0; i < input_shapes.size(); i++) {
+      if (input_shapes[i].IsMklTensor()) {
+        // If input tensor is Mkl, then do the conversion.
+        Tensor tmp_tensor =
+            ConvertMklToTF<T>(context, values[i], input_shapes[i]);
+        converted_values.push_back(tmp_tensor);
+      } else {
+        // If input tensor is TF already, then we do not need any conversion.
+        converted_values.push_back(values[i]);
+      }
+    }
+
+    // Call Eigen concat.
+    eigen_concat_op_.Compute(context, converted_values);
+
+    // Set dummy Mkl tensor as output Mkl tensor for this op.
+    MklShape mkl_tensor_mkl_shape;
+    mkl_tensor_mkl_shape.SetMklTensor(false);
+    mkl_tensor_mkl_shape.SetDimensions(4);
+    mkl_tensor_mkl_shape.SetTfDimOrder(4);  // Dimensions
+    Tensor* mkl_tensor = nullptr;
+    TensorShape mkl_tensor_tf_shape;
+    mkl_tensor_tf_shape.AddDim(
+        SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
+    int tf_output_index = 0;
+    context->allocate_output(
+        GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
+        mkl_tensor_tf_shape, &mkl_tensor);
+    mkl_tensor_mkl_shape.SerializeMklShape(
+        mkl_tensor->flat<uint8>().data(),
+        mkl_tensor->flat<uint8>().size() * sizeof(uint8));
+  }
+};
+
+/* Use optimized concat for float type only */
+#define REGISTER_MKL_CPU(type)                                              \
+  REGISTER_KERNEL_BUILDER(Name("_MklConcat")                                \
+                              .Device(DEVICE_CPU)                           \
+                              .TypeConstraint<type>("T")                    \
+                              .HostMemory("concat_dim")                     \
+                              .Label(mkl_op_registry::kMklOpLabel),         \
+                          MklConcatOp<CPUDevice, type, NAME_IS_CONCAT_DIM>) \
+  REGISTER_KERNEL_BUILDER(Name("_MklConcatV2")                              \
+                              .Device(DEVICE_CPU)                           \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<int32>("Tidx")                \
+                              .HostMemory("axis")                           \
+                              .Label(mkl_op_registry::kMklOpLabel),         \
+                          MklConcatOp<CPUDevice, type, NAME_IS_AXIS>)
+
+TF_CALL_float(REGISTER_MKL_CPU);
+
+#undef REGISTER_CONCAT_MKL
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index 90b9f7ba90b86be9bcba2593fa45d7dbb8080a16..d4364d31e41790241454050750ecb58d31a0e941 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -87,7 +87,7 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel {
     Tensor* bias_backprop = nullptr;
     MklShape output_mkl_shape;
     output_mkl_shape.SetMklTensor(false);
-    AllocateOutputSetMklshape(context, 0, &bias_backprop, output_shape,
+    AllocateOutputSetMklShape(context, 0, &bias_backprop, output_shape,
                               output_mkl_shape);
 
     mkl_context.in_dims = 4;
@@ -251,11 +251,11 @@ class MklConv2DCustomBackpropBiasOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(MklConv2DCustomBackpropBiasOp);
 };
 
-#define REGISTER_CPU_KERNELS(T)                                           \
-  REGISTER_KERNEL_BUILDER(Name("MklConv2DWithBiasBackpropBias")           \
-                              .Device(DEVICE_CPU)                         \
-                              .TypeConstraint<T>("T")                     \
-                              .Label(mkl_layer_registry::kMklLayerLabel), \
+#define REGISTER_CPU_KERNELS(T)                                     \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBiasBackpropBias")    \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
                           MklConv2DCustomBackpropBiasOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_CPU_KERNELS);
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 266f433e70378384a75e1de87e2af32e1ede313a..dc6b88e953a6eac204f247e8e0aa69c4a1d05314 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -217,7 +217,7 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
     mkl_context.grad_filter_shape.SetTfLayout(mkl_context.filter_dims,
                                               mkl_context.filter_sizes,
                                               mkl_context.filter_strides);
-    AllocateOutputSetMklshape(context, 0, &grad_filter, filter_shape,
+    AllocateOutputSetMklShape(context, 0, &grad_filter, filter_shape,
                               mkl_context.grad_filter_shape);
 
     // Need to set member variable for TF layout
@@ -266,8 +266,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
     int input_offsets[2];
     size_t conv_strides[2];
     MklShape input_shape, grad_filter_shape, out_backprop_shape;
-    dnnPrimitive_t prim_conv_bwdfilter, convert_bwdfilter;
-    dnnLayout_t lt_input, lt_grad_filter, lt_out_backprop;
+    dnnPrimitive_t prim_conv_bwdfilter = nullptr;
+    dnnPrimitive_t convert_bwdfilter = nullptr;
+    dnnLayout_t lt_input = nullptr;
+    dnnLayout_t lt_grad_filter = nullptr;
+    dnnLayout_t lt_out_backprop = nullptr;
     void* conv_res[dnnResourceNumber];
 
     void MklCleanup() {
@@ -408,11 +411,11 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
   TensorFormat data_format_;
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                                    \
-  REGISTER_KERNEL_BUILDER(Name("MklConv2DBackpropFilter")                 \
-                              .Device(DEVICE_CPU)                         \
-                              .TypeConstraint<T>("T")                     \
-                              .Label(mkl_layer_registry::kMklLayerLabel), \
+#define REGISTER_MKL_FILTER_KERNELS(T)                              \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
                           MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index c5ebe8024ed20d499709b9306c032a87452479dd..23827ceea50f7b0af19640a049a530ef0798536a 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -202,7 +202,7 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
     mkl_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
                              mklOutputShape.GetMklLayout())) /
                          sizeof(T));
-    AllocateOutputSetMklshape(context, 0, &in_backprop, mkl_out_shape,
+    AllocateOutputSetMklShape(context, 0, &in_backprop, mkl_out_shape,
                               mklOutputShape);
 
     mkl_context.conv_res[dnnResourceDiffSrc] =
@@ -295,7 +295,7 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
         dnnDelete_F32(mkl_convert_filter);
       } else {
         // If we do not need any layout conversion for filter, then
-        // we direclty assign input filter to resources[].
+        // we directly assign input filter to resources[].
         conv_res[dnnResourceFilter] =
             static_cast<void*>(const_cast<T*>(filter.flat<T>().data()));
       }
@@ -341,11 +341,11 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
   TensorFormat data_format;
 };
 
-#define REGISTER_MKL_CPU_KERNELS(T)                                       \
-  REGISTER_KERNEL_BUILDER(Name("MklConv2DBackpropInput")                  \
-                              .Device(DEVICE_CPU)                         \
-                              .TypeConstraint<T>("T")                     \
-                              .Label(mkl_layer_registry::kMklLayerLabel), \
+#define REGISTER_MKL_CPU_KERNELS(T)                                 \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
                           MklConv2DCustomBackpropInputOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index acd37786ff12700eda9d5756cfd179883e8cb1a2..76b9f1798ddafcde4b25d086d1445f282559a2e4 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -36,9 +36,9 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
+#include "tensorflow/core/util/mkl_util.h"
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
@@ -98,10 +98,9 @@ class MklConv2DOp : public OpKernel {
                                         filter.shape().DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(
-          context,
-          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
-          errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i),
+                                           std::numeric_limits<int>::max()),
+                  errors::InvalidArgument("filter too large"));
     }
 
     const int64 input_depth =
@@ -175,7 +174,7 @@ class MklConv2DOp : public OpKernel {
       // Nothing to do, allocate output tensor and return
       MklShape mkl_output_mkl_shape;
       mkl_output_mkl_shape.SetMklTensor(false);
-      AllocateOutputSetMklshape(context, 0, &output, input.shape(),
+      AllocateOutputSetMklShape(context, 0, &output, input.shape(),
                                 mkl_output_mkl_shape);
       return;
     }
@@ -261,7 +260,7 @@ class MklConv2DOp : public OpKernel {
         dnnLayoutGetMemorySize_F32(
             static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
         sizeof(T));
-    AllocateOutputSetMklshape(context, 0, &output, mkl_output_tf_shape,
+    AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
                               mkl_output_mkl_shape);
     mkl_context.conv_res[dnnResourceDst] =
         static_cast<void*>(output->flat<T>().data());
@@ -434,16 +433,16 @@ class MklConv2DOp : public OpKernel {
   TensorFormat data_format_;
 };
 
-#define REGISTER_MKL_CPU(T)                                               \
-  REGISTER_KERNEL_BUILDER(Name("MklConv2D")                               \
-                              .Device(DEVICE_CPU)                         \
-                              .TypeConstraint<T>("T")                     \
-                              .Label(mkl_layer_registry::kMklLayerLabel), \
-                          MklConv2DOp<CPUDevice, T, false>);              \
-  REGISTER_KERNEL_BUILDER(Name("MklConv2DWithBias")                       \
-                              .Device(DEVICE_CPU)                         \
-                              .TypeConstraint<T>("T")                     \
-                              .Label(mkl_layer_registry::kMklLayerLabel), \
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                        \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklConv2DOp<CPUDevice, T, false>);        \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
                           MklConv2DOp<CPUDevice, T, true>);
 
 TF_CALL_float(REGISTER_MKL_CPU);
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d53353680247bac3228629d2901b2ca8592d96d5
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -0,0 +1,689 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifdef INTEL_MKL
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+// TODO(inteltf) Address comments from PR 8968.
+
+namespace tensorflow {
+using CPUDevice = Eigen::ThreadPoolDevice;
+template <typename Device, typename T>
+class MklFusedBatchNormOp : public OpKernel {
+ public:
+  explicit MklFusedBatchNormOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    float epsilon;
+    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+    epsilon_ = T(epsilon);
+    string tensor_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    MklFusedBatchNormOpContext mkl_context;
+
+    const Tensor& input = MklGetInput(context, 0);
+    const Tensor& scale = MklGetInput(context, 1);
+    const Tensor& shift = MklGetInput(context, 2);
+    const Tensor& est_mean = MklGetInput(context, 3);
+    const Tensor& est_variance = MklGetInput(context, 4);
+
+    GetMklShape(context, 0, &(mkl_context.mkl_shape_input_shape));
+    bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
+    if (!input_in_mkl_format) {
+      OP_REQUIRES(context, input.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional",
+                                          input.shape().DebugString()));
+    }
+    OP_REQUIRES(context, scale.dims() == 1,
+                errors::InvalidArgument("scale must be 1-dimensional",
+                                        scale.shape().DebugString()));
+    OP_REQUIRES(context, shift.dims() == 1,
+                errors::InvalidArgument("offset must be 1-dimensional",
+                                        shift.shape().DebugString()));
+    OP_REQUIRES(context, est_mean.dims() == 1,
+                errors::InvalidArgument("estimated_mean must be 1-dimensional",
+                                        est_mean.shape().DebugString()));
+    OP_REQUIRES(
+        context, est_variance.dims() == 1,
+        errors::InvalidArgument("estimated_variance must be 1-dimensional",
+                                est_variance.shape().DebugString()));
+    if (is_training_) {
+      OP_REQUIRES(context, est_mean.dim_size(0) == 0,
+                  errors::InvalidArgument("estimated_mean empty for training",
+                                          est_mean.shape().DebugString()));
+      OP_REQUIRES(context, est_variance.dim_size(0) == 0,
+                  errors::InvalidArgument(
+                      "estimated_variance must be empty for training",
+                      est_variance.shape().DebugString()));
+    }
+
+    unsigned int flag_batch_norm =
+        is_training_ ? dnnUseScaleShift
+                     : (dnnUseInputMeanVariance | dnnUseScaleShift);
+
+    mkl_context.MklExtractParams(context, tensor_format_);
+
+    // Create layout only for input data as it is used in Op primitive.
+    mkl_context.MklCreateInputLayout(context);
+
+    // Create Op primitive.
+    CHECK_EQ(dnnBatchNormalizationCreateForward_v2_F32(
+                 &(mkl_context.mkl_prim_batchnorm), nullptr,
+                 mkl_context.mkl_lt_input, static_cast<float>(epsilon_),
+                 flag_batch_norm),
+             E_SUCCESS);
+
+    // Temporary tensors with buffers for the context inputs, if
+    // conversion to MKL-Op specific layouts are required. It is assumed here
+    // that TF's 1D tensors (scale, shift, est_mean, and est_variance) won't
+    // require any conversion.
+    // Since scale-shift is combined in MKL, a buffer is required.
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_scale_shift_buf_tensor;
+    mkl_context.MklPrepareContextInputs(context, &mkl_tmp_input_buf_tensor,
+                                        &mkl_tmp_scale_shift_buf_tensor);
+
+    // Output data in MKL layout
+    Tensor* output = nullptr;
+    TensorShape tf_shape_output;
+    MklShape mkl_shape_output;
+    mkl_shape_output.SetMklTensor(true);
+    mkl_shape_output.SetMklLayout(mkl_context.mkl_prim_batchnorm,
+                                  dnnResourceDst);
+    mkl_shape_output.SetTfLayout(mkl_context.mkl_params.in_dim,
+                                 mkl_context.mkl_params.in_sizes,
+                                 mkl_context.mkl_params.in_strides);
+    mkl_shape_output.SetTfDimOrder(mkl_context.mkl_params.in_dim,
+                                   tensor_format_);
+    tf_shape_output.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                               mkl_shape_output.GetMklLayout())) /
+                           sizeof(T));
+    AllocateOutputSetMklShape(context, 0, &output, tf_shape_output,
+                              mkl_shape_output);
+    mkl_context.mkl_res_batchnorm[dnnResourceDst] =
+        static_cast<void*>(output->flat<T>().data());
+
+    // Batch mean in TF layout
+    Tensor* batch_mean = nullptr;
+    MklShape mkl_shape_batch_mean;
+    mkl_shape_batch_mean.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 1, &batch_mean, scale.shape(),
+                              mkl_shape_batch_mean);
+    // Batch variance in TF layout
+    Tensor* batch_variance = nullptr;
+    MklShape mkl_shape_batch_variance;
+    mkl_shape_batch_variance.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 2, &batch_variance, scale.shape(),
+                              mkl_shape_batch_variance);
+    // If training mode, set dnnResourceMean and dnnResourceVariance to
+    // output tensors for batch mean and variance.
+    // Otherwise, set dnnResourceMean and dnnResourceVariance to
+    // estimated mean and variance.
+    if (is_training_)
+      mkl_context.MklSetMeanVariance(*batch_mean, *batch_variance);
+    else
+      mkl_context.MklSetMeanVariance(est_mean, est_variance);
+
+    // Now that all resources are set, it is ready for dnnExecute
+    CHECK_EQ(dnnExecute_F32(mkl_context.mkl_prim_batchnorm,
+                            mkl_context.mkl_res_batchnorm),
+             E_SUCCESS);
+
+    // Mean and variance (without Bessel's correction) saved for backward
+    // computation to serve as pre-computed mean and variance.
+    Tensor* saved_mean = nullptr;
+    MklShape mkl_shape_saved_mean;
+    mkl_shape_saved_mean.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 3, &saved_mean, scale.shape(),
+                              mkl_shape_saved_mean);
+    std::memcpy(
+        reinterpret_cast<char*>(saved_mean->flat<float>().data()),
+        reinterpret_cast<char*>(mkl_context.mkl_res_batchnorm[dnnResourceMean]),
+        scale.NumElements() * sizeof(float));
+    Tensor* saved_variance = nullptr;
+    MklShape mkl_shape_saved_variance;
+    mkl_shape_saved_variance.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 4, &saved_variance, scale.shape(),
+                              mkl_shape_saved_variance);
+    std::memcpy(reinterpret_cast<char*>(saved_variance->flat<float>().data()),
+                reinterpret_cast<char*>(
+                    mkl_context.mkl_res_batchnorm[dnnResourceVariance]),
+                scale.NumElements() * sizeof(float));
+
+    // Bessel's correction on variance, if training mode is on
+    if (is_training_) {
+      float* p_var = static_cast<float*>(batch_variance->flat<T>().data());
+      auto depth = mkl_context.mkl_params.depth;
+      size_t orig_size = mkl_context.mkl_params.in_sizes[0] *
+                         mkl_context.mkl_params.in_sizes[1] *
+                         mkl_context.mkl_params.in_sizes[3];
+      size_t adjust_size = orig_size - 1;
+      float adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
+      for (int i = 0; i < depth; i++) p_var[i] = adjust_factor * p_var[i];
+    }
+
+    mkl_context.MklCleanup();
+  }
+
+ private:
+  T epsilon_;
+  TensorFormat tensor_format_;
+  bool is_training_;
+
+  // Structure containing all info for MklOp
+  typedef struct {
+    // Parameters used for input and output layouts
+    struct MklBatchNormParams {
+      // BatchNormOp src and
+      size_t in_dim;
+      size_t in_sizes[4];
+      size_t in_strides[4];
+      size_t depth;  // Batch normalization is done for per channel.
+    } mkl_params;
+
+    MklShape mkl_shape_input_shape;
+
+    // MKL primitive and resources for BatchNormOp
+    dnnPrimitive_t mkl_prim_batchnorm = nullptr;
+    void* mkl_res_batchnorm[dnnResourceNumber];
+
+    // MKL layouts for inputs in the context
+    dnnLayout_t mkl_lt_input = nullptr;
+
+    void MklCleanup() {
+      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
+      if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input);
+      if (mkl_prim_batchnorm != nullptr) dnnDelete_F32(mkl_prim_batchnorm);
+    }
+
+    void MklExtractParams(OpKernelContext* context,
+                          const TensorFormat& tensor_format) {
+      const Tensor& input = MklGetInput(context, 0);
+      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
+      mkl_params.in_dim = input_in_mkl_format
+                              ? mkl_shape_input_shape.GetDimension()
+                              : input.dims();
+      mkl_params.in_sizes[0] = static_cast<size_t>(
+          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[0]
+                              : GetTensorDim(input, tensor_format, 'W'));
+      mkl_params.in_sizes[1] = static_cast<size_t>(
+          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[1]
+                              : GetTensorDim(input, tensor_format, 'H'));
+      mkl_params.in_sizes[2] = static_cast<size_t>(
+          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[2]
+                              : GetTensorDim(input, tensor_format, 'C'));
+      mkl_params.in_sizes[3] = static_cast<size_t>(
+          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[3]
+                              : GetTensorDim(input, tensor_format, 'N'));
+      mkl_params.depth = mkl_params.in_sizes[2];
+      GetStridesFromSizes(tensor_format, mkl_params.in_strides,
+                          mkl_params.in_sizes);
+    }
+
+    void MklCreateInputLayout(OpKernelContext* context) {
+      const Tensor& input = MklGetInput(context, 0);
+      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
+      if (input_in_mkl_format) {
+        mkl_lt_input =
+            static_cast<dnnLayout_t>(mkl_shape_input_shape.GetCurLayout());
+      } else {
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&mkl_lt_input, mkl_params.in_dim,
+                                mkl_params.in_sizes, mkl_params.in_strides),
+            E_SUCCESS);
+      }
+    }
+
+    void MklPrepareContextInputs(OpKernelContext* context,
+                                 Tensor* mkl_tmp_input_buf_tensor,
+                                 Tensor* mkl_tmp_scale_shift_buf_tensor) {
+      bool mkl_convert_input;
+      dnnPrimitive_t mkl_prim_convert_input = nullptr;
+      dnnLayout_t mkl_lt_internal_input = nullptr;
+      void* mkl_buf_converted_input = nullptr;
+      // Compare with internal layouts and convert if needed
+      const Tensor& input = MklGetInput(context, 0);
+      void* mkl_buf_input =
+          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
+                   &mkl_lt_internal_input, mkl_prim_batchnorm, dnnResourceSrc),
+               E_SUCCESS);
+      mkl_convert_input =
+          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input);
+      if (mkl_convert_input) {
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
+        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
+                       &mkl_buf_converted_input);
+        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
+                                          mkl_buf_converted_input),
+                 E_SUCCESS);
+        dnnDelete_F32(mkl_prim_convert_input);
+      }
+      dnnLayoutDelete_F32(mkl_lt_internal_input);
+      mkl_res_batchnorm[dnnResourceSrc] =
+          (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
+
+      // scale-shift layout is created from primitive. So no conversion
+      // is needed, however, a buffer has to be allocated.
+      dnnLayout_t mkl_lt_scale_shift = nullptr;
+      void* mkl_buf_scale_shift = nullptr;
+      CHECK_EQ(
+          dnnLayoutCreateFromPrimitive_F32(
+              &mkl_lt_scale_shift, mkl_prim_batchnorm, dnnResourceScaleShift),
+          E_SUCCESS);
+      AllocTmpBuffer(context, mkl_tmp_scale_shift_buf_tensor,
+                     mkl_lt_scale_shift, &mkl_buf_scale_shift);
+      // Fill the scale-shift buffer with data, presumably buffer is 2D array
+      const Tensor& scale = MklGetInput(context, 1);
+      const Tensor& shift = MklGetInput(context, 2);
+      float* buf_scale_shift = static_cast<float*>(mkl_buf_scale_shift);
+      float* buf_scale = const_cast<float*>(
+          static_cast<const float*>(scale.flat<float>().data()));
+      float* buf_shift = const_cast<float*>(
+          static_cast<const float*>(shift.flat<float>().data()));
+      auto depth = mkl_params.depth;
+      for (int i = 0; i < depth; i++) {
+        buf_scale_shift[i] = buf_scale[i];
+        buf_scale_shift[i + depth] = buf_shift[i];
+      }
+      mkl_res_batchnorm[dnnResourceScaleShift] = mkl_buf_scale_shift;
+    }
+
+    inline void MklSetMeanVariance(const Tensor& mean, const Tensor& variance) {
+      mkl_res_batchnorm[dnnResourceMean] = const_cast<void*>(
+          static_cast<const void*>(mean.flat<float>().data()));
+      mkl_res_batchnorm[dnnResourceVariance] = const_cast<void*>(
+          static_cast<const void*>(variance.flat<float>().data()));
+    }
+  } MklFusedBatchNormOpContext;
+};
+
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklFusedBatchNormOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
+
+template <typename Device, typename T>
+class MklFusedBatchNormGradOp : public OpKernel {
+ public:
+  explicit MklFusedBatchNormGradOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    float epsilon;
+    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+    epsilon_ = T(epsilon);
+    string tensor_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    MklFusedBatchNormGradOpContext mkl_context;
+
+    const Tensor& out_backprop = MklGetInput(context, 0);
+    const Tensor& input = MklGetInput(context, 1);
+    const Tensor& scale = MklGetInput(context, 2);
+    const Tensor& saved_mean = MklGetInput(context, 3);
+    const Tensor& saved_var = MklGetInput(context, 4);
+
+    // Here scale, mean, and variance are 1D and considered
+    // those having same layout in MKL and TF
+    GetMklShape(context, 0, &(mkl_context.mkl_shape_out_backprop));
+    GetMklShape(context, 1, &(mkl_context.mkl_shape_input_shape));
+
+    bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
+    bool out_backprop_in_mkl_format =
+        mkl_context.mkl_shape_out_backprop.IsMklTensor();
+    if (!out_backprop_in_mkl_format) {
+      OP_REQUIRES(context, out_backprop.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional",
+                                          out_backprop.shape().DebugString()));
+    }
+    if (!input_in_mkl_format) {
+      OP_REQUIRES(context, input.dims() == 4,
+                  errors::InvalidArgument("input must be 4-dimensional",
+                                          input.shape().DebugString()));
+    }
+    OP_REQUIRES(context, scale.dims() == 1,
+                errors::InvalidArgument("scale must be 1-dimensional",
+                                        scale.shape().DebugString()));
+    OP_REQUIRES(context, saved_mean.dims() == 1,
+                errors::InvalidArgument("saved mean must be 1-dimensional",
+                                        saved_mean.shape().DebugString()));
+    OP_REQUIRES(context, saved_var.dims() == 1,
+                errors::InvalidArgument("saved variance must be 1-dimensional",
+                                        saved_var.shape().DebugString()));
+
+    mkl_context.MklExtractParams(context, tensor_format_);
+
+    mkl_context.MklCreateInputLayout(context);
+
+    unsigned int flag_batch_norm_grad = dnnUseScaleShift;
+
+    // Create Backward Op primitive.
+    CHECK_EQ(dnnBatchNormalizationCreateBackward_v2_F32(
+                 &(mkl_context.mkl_prim_batchnorm_bwd), nullptr,
+                 mkl_context.mkl_lt_input, static_cast<float>(epsilon_),
+                 flag_batch_norm_grad),
+             E_SUCCESS);
+
+    // Temporary tensors and their buffers if conversion is required
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_outbackprop_buf_tensor,
+        mkl_tmp_scaleshift_buf_tensor;
+    mkl_context.MklPrepareContextInputs(context, &mkl_tmp_input_buf_tensor,
+                                        &mkl_tmp_outbackprop_buf_tensor,
+                                        &mkl_tmp_scaleshift_buf_tensor);
+
+    // Allocate tensor for grad w.r.t. input(x)
+    Tensor* in_backprop = nullptr;
+    TensorShape tf_shape_in_backprop;
+    MklShape mkl_shape_in_backprop;
+    mkl_shape_in_backprop.SetMklTensor(true);
+    mkl_shape_in_backprop.SetMklLayout(mkl_context.mkl_prim_batchnorm_bwd,
+                                       dnnResourceDiffSrc);
+    mkl_shape_in_backprop.SetTfLayout(mkl_context.mkl_params.in_dims,
+                                      mkl_context.mkl_params.in_sizes,
+                                      mkl_context.mkl_params.in_strides);
+    mkl_shape_in_backprop.SetTfDimOrder(mkl_context.mkl_params.in_dims,
+                                        tensor_format_);
+    tf_shape_in_backprop.AddDim(
+        dnnLayoutGetMemorySize_F32(
+            static_cast<dnnLayout_t>(mkl_shape_in_backprop.GetMklLayout())) /
+        sizeof(T));
+    AllocateOutputSetMklShape(context, 0, &in_backprop, tf_shape_in_backprop,
+                              mkl_shape_in_backprop);
+    mkl_context.mkl_res_batchnorm_bwd[dnnResourceDiffSrc] =
+        static_cast<void*>(in_backprop->flat<T>().data());
+
+    // grad_scale and grad_shift are combined together in MKL
+    // So create a single temporary buffer for those.
+    // Also set dnnResourceDiffScaleShift to the temporary buffer
+    Tensor mkl_tmp_grad_scale_shift_buf_tensor;
+    mkl_context.MklPrepareGradScaleShift(context,
+                                         &mkl_tmp_grad_scale_shift_buf_tensor);
+
+    // All dnn resources are set now, ready to execute
+    CHECK_EQ(dnnExecute_F32(mkl_context.mkl_prim_batchnorm_bwd,
+                            mkl_context.mkl_res_batchnorm_bwd),
+             E_SUCCESS);
+
+    // Now separate out scale and shift grad and copy to individual tensors
+    const TensorShape& tf_shape_scale_shift = scale.shape();
+    // Allocate tensor for grad w.r.t. scale (beta)
+    Tensor* scale_backprop = nullptr;
+    MklShape mkl_shape_scale_backprop;
+    AllocateOutputSetMklShape(context, 1, &scale_backprop, tf_shape_scale_shift,
+                              mkl_shape_scale_backprop);
+
+    // Allocate tensor for grad w.r.t. shift(gamma)
+    Tensor* shift_backprop = nullptr;
+    MklShape mkl_shape_shift_backprop;
+    AllocateOutputSetMklShape(context, 2, &shift_backprop, tf_shape_scale_shift,
+                              mkl_shape_shift_backprop);
+
+    // copy scale and shift grads to tensors
+    float* mkl_buf_scale_shift = const_cast<float*>(static_cast<const float*>(
+        mkl_tmp_grad_scale_shift_buf_tensor.flat<T>().data()));
+    float* tf_buf_scale = const_cast<float*>(
+        static_cast<const float*>(scale_backprop->flat<T>().data()));
+    float* tf_buf_shift = const_cast<float*>(
+        static_cast<const float*>(shift_backprop->flat<T>().data()));
+    auto depth = mkl_context.mkl_params.depth;
+    for (int i = 0; i < depth; i++) {
+      tf_buf_scale[i] = mkl_buf_scale_shift[i];
+      tf_buf_shift[i] = mkl_buf_scale_shift[i + depth];
+    }
+
+    // Two placeholders for estimated_mean and estimated_variance, which are
+    // used for inference and thus not needed here for gradient computation.
+    Tensor* placeholder_1 = nullptr;
+    MklShape mkl_shape_placeholder_1;
+    AllocateOutputSetMklShape(context, 3, &placeholder_1, TensorShape({}),
+                              mkl_shape_placeholder_1);
+    Tensor* placeholder_2 = nullptr;
+    MklShape mkl_shape_placeholder_2;
+    AllocateOutputSetMklShape(context, 4, &placeholder_2, TensorShape({}),
+                              mkl_shape_placeholder_2);
+
+    mkl_context.MklCleanup();
+  }
+
+ private:
+  T epsilon_;
+  TensorFormat tensor_format_;
+
+  // Structure containing all info for MklOp
+  typedef struct {
+    // Parameters used for input and output layouts
+    struct MklBatchNormParams {
+      // BatchNormOp src and
+      size_t in_dims;
+      size_t in_sizes[4];
+      size_t in_strides[4];
+      size_t depth;  // Batch normalization is done for per channel.
+    } mkl_params;
+
+    MklShape mkl_shape_out_backprop;
+    MklShape mkl_shape_input_shape;
+
+    // MKL primitive and resources for BatchNormOp
+    dnnPrimitive_t mkl_prim_batchnorm_bwd = nullptr;
+    void* mkl_res_batchnorm_bwd[dnnResourceNumber];
+
+    // MKL layouts for inputs in the context
+    dnnLayout_t mkl_lt_out_backprop = nullptr;
+    dnnLayout_t mkl_lt_input = nullptr;
+
+    void MklCleanup() {
+      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
+      bool out_backprop_in_mkl_format = mkl_shape_out_backprop.IsMklTensor();
+      if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input);
+      if (!out_backprop_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_out_backprop);
+
+      dnnDelete_F32(mkl_prim_batchnorm_bwd);
+    }
+
+    void MklExtractParams(OpKernelContext* context,
+                          const TensorFormat& tensor_format) {
+      const Tensor& input = MklGetInput(context, 1);
+      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
+      mkl_params.in_dims = input_in_mkl_format
+                               ? mkl_shape_input_shape.GetDimension()
+                               : input.dims();
+      mkl_params.in_sizes[0] = static_cast<size_t>(
+          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[0]
+                              : GetTensorDim(input, tensor_format, 'W'));
+      mkl_params.in_sizes[1] = static_cast<size_t>(
+          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[1]
+                              : GetTensorDim(input, tensor_format, 'H'));
+      mkl_params.in_sizes[2] = static_cast<size_t>(
+          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[2]
+                              : GetTensorDim(input, tensor_format, 'C'));
+      mkl_params.in_sizes[3] = static_cast<size_t>(
+          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[3]
+                              : GetTensorDim(input, tensor_format, 'N'));
+      mkl_params.depth = mkl_params.in_sizes[2];
+      GetStridesFromSizes(tensor_format, mkl_params.in_strides,
+                          mkl_params.in_sizes);
+    }
+
+    void MklCreateInputLayout(OpKernelContext* context) {
+      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
+      if (input_in_mkl_format) {
+        mkl_lt_input =
+            static_cast<dnnLayout_t>(mkl_shape_input_shape.GetCurLayout());
+      } else {
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&mkl_lt_input, mkl_params.in_dims,
+                                mkl_params.in_sizes, mkl_params.in_strides),
+            E_SUCCESS);
+      }
+
+      bool out_backprop_in_mkl_format = mkl_shape_out_backprop.IsMklTensor();
+      if (out_backprop_in_mkl_format) {
+        mkl_lt_out_backprop =
+            static_cast<dnnLayout_t>(mkl_shape_out_backprop.GetCurLayout());
+      } else {
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&mkl_lt_out_backprop, mkl_params.in_dims,
+                                mkl_params.in_sizes, mkl_params.in_strides),
+            E_SUCCESS);
+      }
+    }
+
+    void MklPrepareContextInputs(OpKernelContext* context,
+                                 Tensor* mkl_tmp_input_buf_tensor,
+                                 Tensor* mkl_tmp_outbackprop_buf_tensor,
+                                 Tensor* mkl_tmp_scaleshift_buf_tensor) {
+      bool mkl_convert_input;
+      dnnPrimitive_t mkl_prim_convert_input = nullptr;
+      dnnLayout_t mkl_lt_internal_input = nullptr;
+      void* mkl_buf_converted_input = nullptr;
+      // Compare with internal layouts and convert if needed
+      const Tensor& input = MklGetInput(context, 1);
+      void* mkl_buf_input =
+          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+      CHECK_EQ(
+          dnnLayoutCreateFromPrimitive_F32(
+              &mkl_lt_internal_input, mkl_prim_batchnorm_bwd, dnnResourceSrc),
+          E_SUCCESS);
+      mkl_convert_input =
+          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input);
+      if (mkl_convert_input) {
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input,
+                                         mkl_lt_internal_input),
+                 E_SUCCESS);
+        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
+                       &mkl_buf_converted_input);
+        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
+                                          mkl_buf_converted_input),
+                 E_SUCCESS);
+        dnnDelete_F32(mkl_prim_convert_input);
+      }
+      dnnLayoutDelete_F32(mkl_lt_internal_input);
+      mkl_res_batchnorm_bwd[dnnResourceSrc] =
+          (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
+
+      bool mkl_convert_out_backprop;
+      dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
+      dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
+      void* mkl_buf_converted_out_backprop = nullptr;
+      // Compare with internal layouts and convert if needed
+      const Tensor& out_backprop = MklGetInput(context, 0);
+      void* mkl_buf_out_backprop = const_cast<void*>(
+          static_cast<const void*>(out_backprop.flat<T>().data()));
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
+                                                mkl_prim_batchnorm_bwd,
+                                                dnnResourceDiffDst),
+               E_SUCCESS);
+      mkl_convert_out_backprop = !dnnLayoutCompare_F32(
+          mkl_lt_internal_out_backprop, mkl_lt_out_backprop);
+      if (mkl_convert_out_backprop) {
+        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
+                                         mkl_lt_out_backprop,
+                                         mkl_lt_internal_out_backprop),
+                 E_SUCCESS);
+        AllocTmpBuffer(context, mkl_tmp_outbackprop_buf_tensor,
+                       mkl_lt_internal_out_backprop,
+                       &mkl_buf_converted_out_backprop);
+        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
+                                          mkl_buf_out_backprop,
+                                          mkl_buf_converted_out_backprop),
+                 E_SUCCESS);
+        dnnDelete_F32(mkl_prim_convert_out_backprop);
+      }
+      dnnLayoutDelete_F32(mkl_lt_internal_out_backprop);
+      mkl_res_batchnorm_bwd[dnnResourceDiffDst] =
+          (mkl_convert_out_backprop) ? mkl_buf_converted_out_backprop
+                                     : mkl_buf_out_backprop;
+
+      // Set dnnResourceMean and dnnResourceVariance
+      const Tensor& saved_mean = MklGetInput(context, 3);
+      const Tensor& saved_var = MklGetInput(context, 4);
+      void* mkl_buf_saved_mean = const_cast<void*>(
+          static_cast<const void*>(saved_mean.flat<T>().data()));
+      void* mkl_buf_saved_var = const_cast<void*>(
+          static_cast<const void*>(saved_var.flat<T>().data()));
+      mkl_res_batchnorm_bwd[dnnResourceMean] = mkl_buf_saved_mean;
+      mkl_res_batchnorm_bwd[dnnResourceVariance] = mkl_buf_saved_var;
+
+      // Set dnnResourceScaleShift
+      // Note backward Op needs only current values of scale parameters,
+      // shift parameters could be garbage and won't be used
+      const Tensor& scale = MklGetInput(context, 2);
+      dnnLayout_t mkl_lt_scale_shift = nullptr;
+      void* mkl_buf_scale_shift = nullptr;
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_scale_shift,
+                                                mkl_prim_batchnorm_bwd,
+                                                dnnResourceScaleShift),
+               E_SUCCESS);
+      AllocTmpBuffer(context, mkl_tmp_scaleshift_buf_tensor, mkl_lt_scale_shift,
+                     &mkl_buf_scale_shift);
+      float* pscale =
+          const_cast<float*>(static_cast<const float*>(scale.flat<T>().data()));
+      float* pscale_shift = static_cast<float*>(mkl_buf_scale_shift);
+      auto depth = mkl_params.depth;
+      for (int i = 0; i < depth; i++) pscale_shift[i] = pscale[i];
+      mkl_res_batchnorm_bwd[dnnResourceScaleShift] = mkl_buf_scale_shift;
+      dnnLayoutDelete_F32(mkl_lt_scale_shift);
+    }
+
+    void MklPrepareGradScaleShift(OpKernelContext* context,
+                                  Tensor* mkl_tmp_grad_scale_shift_buf_tensor) {
+      dnnLayout_t mkl_lt_grad_scaleshift = nullptr;
+      void* mkl_buf_grad_scaleshift = nullptr;
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_grad_scaleshift,
+                                                mkl_prim_batchnorm_bwd,
+                                                dnnResourceDiffScaleShift),
+               E_SUCCESS);
+      AllocTmpBuffer(context, mkl_tmp_grad_scale_shift_buf_tensor,
+                     mkl_lt_grad_scaleshift, &mkl_buf_grad_scaleshift);
+      mkl_res_batchnorm_bwd[dnnResourceDiffScaleShift] =
+          mkl_buf_grad_scaleshift;
+      dnnLayoutDelete_F32(mkl_lt_grad_scaleshift);
+    }
+  } MklFusedBatchNormGradOpContext;
+};
+
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNormGrad")            \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklFusedBatchNormGradOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb7ea7e7f90546ceb23564d09c9e064b80347148
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -0,0 +1,63 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+template <typename Device, typename T>
+class MklIdentityOp : public OpKernel {
+ public:
+  explicit MklIdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    MklShape mkl_shape_input;
+    GetMklShape(context, 0, &mkl_shape_input);
+    bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
+
+    if (input_in_mkl_format) {
+      ForwarMklTensorInToOut(context, 0, 0);
+    } else {
+      FowardTfTensorInToOut(context, 0, 0);
+    }
+  }
+
+  bool IsExpensive() override { return false; }
+};
+
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklIdentity")                      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklIdentityOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..070aeff49fe220881089e8f405fa8f89916c52ac
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -0,0 +1,754 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// LRN = Local Response Normalization
+// See docs in ../ops/nn_ops.cc. This opkernel uses MKL library, create MKL
+// layout and primitives, use MKL dnn primitives to compute local
+// response normalization
+
+#ifdef INTEL_MKL
+
+#define EIGEN_USE_THREADS
+#include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#if !defined(IS_MOBILE_PLATFORM)
+#include "tensorflow/core/util/work_sharder.h"
+#endif
+
+namespace tensorflow {
+
+namespace {
+// Create a depth-by-depth band matrix with 1s along a swath of size (2 *
+// depth_radius + 1) around the diagonal.
+template <typename T>
+void GetBandMatrix(int depth, int depth_radius,
+                   Eigen::Tensor<T, 2, Eigen::RowMajor>* result) {
+  result->setZero();
+  for (int row = 0; row < depth; ++row) {
+    const int begin = std::max<int>(0, row - depth_radius);
+    const int end = std::min<int>(depth, row + depth_radius + 1);
+    Eigen::DSizes<Eigen::DenseIndex, 2> start(row, begin);
+    Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, end - begin);
+    result->slice(start, sizes).setConstant(T(1));
+  }
+}
+
+}  // namespace
+
+template <typename T>
+class MklLRNOp : public OpKernel {
+ public:
+  ~MklLRNOp() {}
+
+  explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
+    int64 depth_radius64;
+    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
+    depth_radius_ = static_cast<size_t>(depth_radius64);
+
+    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+    workspace_enabled_ = false;
+    context->GetAttr("workspace_enabled", &workspace_enabled_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    MklLRNOpContext mkl_context;
+
+    const Tensor& input = MklGetInput(context, 0);
+    GetMklShape(context, 0, &mkl_context.input_shape);
+    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
+    // Sanity checks
+    mkl_context.in_dims = input_in_mkl_format
+                              ? mkl_context.input_shape.GetDimension()
+                              : input.dims();
+    OP_REQUIRES(context, mkl_context.in_dims == 4,
+                errors::InvalidArgument("input must be 4-dimensional"));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
+        errors::InvalidArgument("argument to LRN too large"));
+
+    if (!input_in_mkl_format) {
+      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+                                    beta_, input);
+      return;
+    }
+
+    // TODO(inteltf) MKL will support depth radius not equal to 2 in the future
+    if (depth_radius_ != 2) {
+      Tensor converted_tensor =
+          ConvertMklToTF<T>(context, input, mkl_context.input_shape);
+      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+                                    beta_, converted_tensor);
+      return;
+    }
+
+    if (input_in_mkl_format) {
+      // MKL supports normalization over channel dimension only
+      if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
+          MklDims::C) {
+        mkl_context.lt_input =
+            static_cast<dnnLayout_t>(mkl_context.input_shape.GetCurLayout());
+        workspace_enabled_ = true;
+      } else {
+        Tensor converted_tensor =
+            ConvertMklToTF<T>(context, input, mkl_context.input_shape);
+        mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
+                                      beta_, converted_tensor);
+        return;
+      }
+    }
+
+    int kernel_size = 2 * depth_radius_ + 1;
+
+    CHECK_EQ(dnnLRNCreateForward_F32(
+                 &mkl_context.lrn_fwd, NULL, mkl_context.lt_input, kernel_size,
+                 static_cast<float>(alpha_ * kernel_size), beta_, bias_),
+             E_SUCCESS);
+
+    // Allocate output tensor and shape
+    Tensor* output = nullptr;
+    Tensor* workspace = nullptr;
+
+    // Convert Inputs if needed
+    Tensor mkl_tmp_input_buf_tensor;
+    mkl_context.MklPrepareLRNInputs(context, &mkl_tmp_input_buf_tensor);
+
+    // Allocate Layer Outputs
+    mkl_context.MklAllocateOutputs(context, &output, &workspace,
+                                   workspace_enabled_);
+
+    Tensor mkl_tmp_workspace_buf_tensor;
+    mkl_context.MklPrepareLRNOutputs(context, output, workspace,
+                                     &mkl_tmp_workspace_buf_tensor,
+                                     workspace_enabled_);
+
+    // Execute LRN.
+    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_fwd, mkl_context.lrn_res),
+             E_SUCCESS);
+
+    // Release MKL resources.
+    mkl_context.MklCleanup();
+  }
+
+ private:
+  typedef struct {
+    size_t in_dims;
+    size_t in_sizes[4];
+    size_t in_strides[4];
+    size_t out_sizes[4];
+    size_t out_strides[4];
+    MklShape input_shape;
+    dnnPrimitive_t lrn_fwd = nullptr;
+    dnnPrimitive_t convert_input = nullptr;
+    dnnLayout_t lt_input = nullptr;
+    dnnLayout_t lt_internal_input = nullptr;
+    dnnLayout_t lt_internal_workspace = nullptr;
+    dnnLayout_t lt_internal_output = nullptr;
+    void* lrn_res[dnnResourceNumber];
+
+    // Convert Inputs if needed
+    void MklPrepareLRNInputs(OpKernelContext* context,
+                             Tensor* mkl_tmp_input_buf_tensor) {
+      const Tensor& input = MklGetInput(context, 0);
+      void* mkl_buf_input =
+          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
+
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_fwd,
+                                                dnnResourceSrc),
+               E_SUCCESS);
+
+      void* mkl_buf_convert_input = nullptr;
+      bool mkl_convert_input = false;
+      mkl_convert_input = !dnnLayoutCompare_F32(lt_internal_input, lt_input);
+
+      if (mkl_convert_input) {
+        CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input,
+                                         lt_internal_input),
+                 E_SUCCESS);
+        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_internal_input,
+                       &mkl_buf_convert_input);
+        CHECK_EQ(dnnConversionExecute_F32(convert_input, mkl_buf_input,
+                                          mkl_buf_convert_input),
+                 E_SUCCESS);
+        dnnDelete_F32(convert_input);
+      }
+
+      lrn_res[dnnResourceSrc] =
+          (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
+    }
+
+    // Allocate Layer Outputs
+    void MklAllocateOutputs(OpKernelContext* context, Tensor** output,
+                            Tensor** workspace, bool workspace_enabled_) {
+      TensorShape mkl_output_tf_shape; /* First tensor */
+      MklShape mkl_output_mkl_shape;   /* Second tensor */
+
+      mkl_output_mkl_shape.SetMklTensor(true);
+      mkl_output_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceDst);
+      mkl_output_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
+                                       input_shape.GetStrides());
+      mkl_output_mkl_shape.SetTfDimOrder(in_dims,
+                                         input_shape.GetTfToMklDimMap());
+      mkl_output_tf_shape.AddDim(
+          dnnLayoutGetMemorySize_F32(
+              static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+          sizeof(T));
+      AllocateOutputSetMklShape(context, 0, output,
+                                mkl_output_tf_shape /* First tensor */,
+                                mkl_output_mkl_shape /* Second Tensor */);
+
+      if (workspace_enabled_) {
+        TensorShape mkl_workspace_tf_shape; /* First tensor */
+        MklShape mkl_workspace_mkl_shape;   /* Second tensor */
+        mkl_workspace_mkl_shape.SetMklTensor(false);
+        mkl_workspace_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceWorkspace);
+        // Assumes workspace has same TF layout and TF dim order as input
+        mkl_workspace_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
+                                            input_shape.GetStrides());
+        mkl_workspace_mkl_shape.SetTfDimOrder(in_dims,
+                                              input_shape.GetTfToMklDimMap());
+        mkl_workspace_tf_shape.AddDim(
+            dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                mkl_workspace_mkl_shape.GetMklLayout())) /
+            sizeof(T));
+        AllocateOutputSetMklShape(context, 1, workspace,
+                                  mkl_workspace_tf_shape /* First tensor */,
+                                  mkl_workspace_mkl_shape /* Second Tensor */);
+      }
+    }
+
+    void MklPrepareLRNOutputs(OpKernelContext* context, Tensor* output,
+                              Tensor* workspace,
+                              Tensor* mkl_tmp_workspace_buf_tensor,
+                              bool workspace_enabled_) {
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_workspace, lrn_fwd,
+                                                dnnResourceWorkspace),
+               E_SUCCESS);
+
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_output, lrn_fwd,
+                                                dnnResourceDst),
+               E_SUCCESS);
+
+      void* mkl_buf_output =
+          const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
+      lrn_res[dnnResourceDst] = mkl_buf_output;
+
+      void* mkl_buf_workspace = nullptr;
+      if (workspace_enabled_) {
+        mkl_buf_workspace = const_cast<void*>(
+            static_cast<const void*>(workspace->flat<T>().data()));
+      } else {
+        AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor,
+                       lt_internal_workspace, &mkl_buf_workspace);
+      }
+      lrn_res[dnnResourceWorkspace] = mkl_buf_workspace;
+    }
+
+    // Fallback implementation - Taken from lrn_op.cc
+    // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
+    // copy.
+    void MklDefaultToEigen(OpKernelContext* context, int depth_radius_,
+                           float bias_, float alpha_, float beta_,
+                           const Tensor& input) {
+      const int batch = static_cast<int>(input.dim_size(0));
+      const int rows = static_cast<int>(input.dim_size(1));
+      const int cols = static_cast<int>(input.dim_size(2));
+      const int depth = static_cast<int>(input.dim_size(3));
+      const int nodes = cols * rows;
+
+      auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
+      // Multiplying the input with the band matrix has the effect of reducing
+      // the
+      // correct patch along the depth.
+      Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
+      GetBandMatrix<T>(depth, depth_radius_, &multiplier);
+
+      Tensor *output, *workspace;
+      MklShape mkl_output_mkl_shape, mkl_workspace_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(false);
+      mkl_output_mkl_shape.SetDimensions(4);
+      AllocateOutputSetMklShape(context, 0, &output, input.shape(),
+                                mkl_output_mkl_shape);
+
+      mkl_workspace_mkl_shape.SetMklTensor(false);
+      mkl_workspace_mkl_shape.SetDimensions(4);
+      AllocateOutputSetMklShape(context, 1, &workspace, input.shape(),
+                                mkl_workspace_mkl_shape);
+
+      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
+      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+      auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
+      if (beta_ == T(1)) {
+        out_shaped.device(context->eigen_cpu_device()) =
+            in_shaped * tmp.inverse();
+      } else if (beta_ == T(0.5)) {
+        out_shaped.device(context->eigen_cpu_device()) =
+            in_shaped * tmp.rsqrt();
+      } else {
+        out_shaped.device(context->eigen_cpu_device()) =
+            in_shaped * (tmp.log() * -beta_).exp();
+      }
+    }
+
+    // Release MKL resources.
+    void MklCleanup() {
+      dnnDelete_F32(lrn_fwd);
+      dnnLayoutDelete_F32(lt_internal_input);
+      dnnLayoutDelete_F32(lt_internal_workspace);
+      dnnLayoutDelete_F32(lt_internal_output);
+    }
+  } MklLRNOpContext;
+
+  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
+
+  bool workspace_enabled_;
+  int depth_radius_;
+  float bias_;
+  float alpha_;
+  float beta_;
+};
+
+template <typename T>
+class MklLRNGradOp : public OpKernel {
+ public:
+  explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
+    int64 depth_radius64;
+    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("depth_radius = ", depth_radius64,
+                                " larger than int max"));
+    depth_radius_ = static_cast<int>(depth_radius64);
+    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
+    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
+    workspace_enabled_ = false;
+    context->GetAttr("workspace_enabled", &workspace_enabled_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    MklLRNGradOpContext mkl_context;
+    mkl_context.depth_radius_ = depth_radius_;
+    mkl_context.bias_ = bias_;
+    mkl_context.alpha_ = alpha_;
+    mkl_context.beta_ = beta_;
+
+    const Tensor& in_grads = MklGetInput(context, 0);
+    const Tensor& in_image = MklGetInput(context, 1);
+    const Tensor& out_image = MklGetInput(context, 2);
+
+    GetMklShape(context, 0, &mkl_context.ingrad_shape);
+    GetMklShape(context, 1, &mkl_context.inimage_shape);
+    GetMklShape(context, 2, &mkl_context.outimage_shape);
+
+    bool ingrad_in_mkl_format = mkl_context.ingrad_shape.IsMklTensor();
+    bool inimage_in_mkl_format = mkl_context.inimage_shape.IsMklTensor();
+    bool outimage_in_mkl_format = mkl_context.outimage_shape.IsMklTensor();
+
+    mkl_context.in_dims = inimage_in_mkl_format
+                              ? mkl_context.inimage_shape.GetDimension()
+                              : in_image.dims();
+    OP_REQUIRES(context, mkl_context.in_dims == 4,
+                errors::InvalidArgument("input images must be 4-dimensional"));
+
+    if (!workspace_enabled_) {
+      mkl_context.MklDefaultToEigen(context);
+      return;
+    }
+
+    if (ingrad_in_mkl_format || inimage_in_mkl_format) {
+      const MklShape* tmp_mkl_shape = (ingrad_in_mkl_format)
+                                          ? &mkl_context.ingrad_shape
+                                          : &mkl_context.inimage_shape;
+      if (tmp_mkl_shape->tf_dim_idx(mkl_context.in_dims - 1) != MklDims::C) {
+        // Fallback to eigen
+        mkl_context.MklDefaultToEigen(context);
+        return;
+      } else {  // MKL supports normalization over channel dimension only
+        for (int i = 0; i < mkl_context.in_dims; i++) {
+          mkl_context.in_sizes[i] = mkl_context.out_sizes[i] =
+              tmp_mkl_shape->GetSizes()[i];
+          mkl_context.in_strides[i] = mkl_context.out_strides[i] =
+              tmp_mkl_shape->GetStrides()[i];
+        }
+      }
+    } else {
+      // Fallback to eigen
+      mkl_context.MklDefaultToEigen(context);
+      return;
+    }
+
+    // Dimensions check for sanity purpose
+    if (ingrad_in_mkl_format) {
+      OP_REQUIRES(
+          context, mkl_context.ingrad_shape.GetDimension() == 4,
+          errors::InvalidArgument("input gradient must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, in_grads.dims() == 4,
+          errors::InvalidArgument("input gradient must be 4-dimensional"));
+    }
+
+    if (outimage_in_mkl_format) {
+      OP_REQUIRES(
+          context, mkl_context.outimage_shape.GetDimension() == 4,
+          errors::InvalidArgument("Output image must be 4-dimensional"));
+    } else {
+      OP_REQUIRES(
+          context, out_image.dims() == 4,
+          errors::InvalidArgument("Output image must be 4-dimensional"));
+    }
+
+    // Prepare mkl input layout
+    mkl_context.MklPrepareLRNInputsLayouts(context);
+    int ksize = 2 * depth_radius_ + 1;
+
+    CHECK_EQ(dnnLRNCreateBackward_F32(
+                 &mkl_context.lrn_bwd, NULL, mkl_context.lt_input,
+                 mkl_context.lt_output, ksize,
+                 static_cast<float>(alpha_ * ksize), beta_, bias_),
+             E_SUCCESS);
+
+    // Allocate output tensor and shape.
+    TensorShape mkl_output_tf_shape; /* First tensor */
+    MklShape mkl_output_mkl_shape;   /* Second tensor */
+    mkl_output_mkl_shape.SetMklTensor(true);
+    CHECK_NE(mkl_context.lrn_bwd, nullptr);
+    mkl_output_mkl_shape.SetMklLayout(mkl_context.lrn_bwd, dnnResourceDiffSrc);
+    mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims, mkl_context.out_sizes,
+                                     mkl_context.out_strides);
+    if (ingrad_in_mkl_format) {
+      mkl_output_mkl_shape.SetTfDimOrder(
+          mkl_context.in_dims, mkl_context.ingrad_shape.GetTfToMklDimMap());
+    } else {
+      mkl_output_mkl_shape.SetTfDimOrder(
+          mkl_context.in_dims, mkl_context.inimage_shape.GetTfToMklDimMap());
+    }
+    mkl_output_tf_shape.AddDim(
+        dnnLayoutGetMemorySize_F32(
+            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
+        sizeof(T));
+    Tensor* output = nullptr;
+    AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
+                              mkl_output_mkl_shape);
+
+    // Get pointers to output data.
+    void* user_output =
+        const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
+
+    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_image_buf_tensor,
+        mkl_tmp_outimage_buf_tensor;
+    // Convert Inputs if needed
+    mkl_context.MklPrepareLRNGradInput(context, &mkl_tmp_input_buf_tensor,
+                                       &mkl_tmp_image_buf_tensor,
+                                       &mkl_tmp_outimage_buf_tensor);
+
+    // We do not do any conversion for output. But we simply emit it
+    // in MKL format.
+    mkl_context.res_lrn_bwd[dnnResourceDiffSrc] = user_output;
+    // Execute LRN backward using dnnExecute
+    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_bwd, mkl_context.res_lrn_bwd),
+             E_SUCCESS);
+    // Release MKL resources.
+    mkl_context.Mklcleanup();
+  }
+
+ private:
+  typedef struct {
+    int depth_radius_;
+    float bias_;
+    float alpha_;
+    float beta_;
+    size_t in_dims;
+    size_t in_sizes[4];
+    size_t in_strides[4];
+    size_t out_sizes[4];
+    size_t out_strides[4];
+    MklShape ingrad_shape, inimage_shape, outimage_shape;
+    dnnPrimitive_t lrn_bwd = nullptr;
+    dnnPrimitive_t convert_input = nullptr;
+    dnnLayout_t lt_input = nullptr;
+    dnnLayout_t lt_output = nullptr;
+    dnnLayout_t lt_bdw_input = nullptr;
+    dnnLayout_t lt_workspace = nullptr;
+    dnnLayout_t lt_internal_input = nullptr;
+    void* res_lrn_bwd[dnnResourceNumber];
+
+    // prepare mkl input
+    void MklPrepareLRNInputsLayouts(OpKernelContext* context) {
+      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
+      if (!ingrad_in_mkl_format) {
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+                 E_SUCCESS);
+      } else {
+        lt_input = static_cast<dnnLayout_t>(ingrad_shape.GetCurLayout());
+      }
+
+      if (!inimage_in_mkl_format) {
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&lt_output, in_dims, out_sizes, out_strides),
+            E_SUCCESS);
+      } else {
+        lt_output = static_cast<dnnLayout_t>(inimage_shape.GetCurLayout());
+      }
+    }
+
+    // convert input if needed
+    void MklPrepareLRNGradInput(OpKernelContext* context,
+                                Tensor* mkl_tmp_input_buf_tensor,
+                                Tensor* mkl_tmp_image_buf_tensor,
+                                Tensor* mkl_tmp_outimage_buf_tensor) {
+      const Tensor& in_grads = MklGetInput(context, 0);
+      const Tensor& in_image = MklGetInput(context, 1);
+      const Tensor& out_image = MklGetInput(context, 2);
+      const Tensor& workspace = MklGetInput(
+          context,
+          3); /*Worskpsace is enabled, get the buffer to the workspace */
+
+      void* user_input = const_cast<void*>(
+          static_cast<const void*>(in_grads.flat<T>().data()));
+      void* user_fwd_input = const_cast<void*>(
+          static_cast<const void*>(in_image.flat<T>().data()));
+      void* user_fwd_output = const_cast<void*>(
+          static_cast<const void*>(out_image.flat<T>().data()));
+      void* workspace_buffer = const_cast<void*>(
+          static_cast<const void*>(workspace.flat<T>().data()));
+
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, lrn_bwd,
+                                                dnnResourceWorkspace),
+               E_SUCCESS);
+      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_bdw_input, lrn_bwd,
+                                                dnnResourceDiffDst),
+               E_SUCCESS);
+
+      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+      if (ingrad_in_mkl_format) {
+        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
+          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
+                         &res_lrn_bwd[dnnResourceDiffDst]);
+          ingrad_shape.GetConvertedFlatData(lt_bdw_input, user_input,
+                                            res_lrn_bwd[dnnResourceDiffDst]);
+        } else {
+          res_lrn_bwd[dnnResourceDiffDst] = user_input;
+        }
+      } else {
+        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
+          CHECK_EQ(
+              dnnConversionCreate_F32(&convert_input, lt_input, lt_bdw_input),
+              E_SUCCESS);
+
+          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
+                         &res_lrn_bwd[dnnResourceDiffDst]);
+          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_input,
+                                            res_lrn_bwd[dnnResourceDiffDst]),
+                   E_SUCCESS);
+          dnnDelete_F32(convert_input);
+        } else {
+          res_lrn_bwd[dnnResourceDiffDst] = user_input;
+        }
+      }
+
+// Although MKL documentation for LRN does not specify setting/getting
+// of dnnResourceSrc and dnnResourceDst, Caffe code sets dnnResourceSrc.
+// So we set dnnResourceSrc here. But we do not know why we are setting
+// dnnResourceDst.
+#if 0
+    // NOTE: The code below is kept just so that we know how we should handle
+    // dnnResourceSrc if the primitive layout for dnnResourceSrc was supported.
+
+    if (!dnnLayoutCompare_F32(lt_internal_input,
+         static_cast<dnnLayout_t>inimage_shape.GetCurLayout())) {
+      AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
+                     &res_lrn_bwd[dnnResourceSrc]);
+      inimage_shape.GetConvertedFlatData(lt_internal_input,
+                                           user_fwd_input,
+                                           res_lrn_bwd[dnnResourceSrc]);
+    } else {
+      res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+    }
+#endif
+
+      // Since we cannot get expected layout for dnnResourceSrc, we construct
+      // buffer using
+      // MKL format if input is in MKL format.
+      if (inimage_shape.IsMklTensor()) {
+        AllocTmpBuffer(context, mkl_tmp_image_buf_tensor,
+                       (dnnLayout_t)inimage_shape.GetCurLayout(),
+                       &res_lrn_bwd[dnnResourceSrc]);
+      } else {
+        res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
+      }
+
+      // Same comment as above.
+      if (outimage_shape.IsMklTensor()) {
+        AllocTmpBuffer(context, mkl_tmp_outimage_buf_tensor,
+                       (dnnLayout_t)outimage_shape.GetCurLayout(),
+                       &res_lrn_bwd[dnnResourceDst]);
+      } else {
+        res_lrn_bwd[dnnResourceDst] = user_fwd_output;
+      }
+
+      res_lrn_bwd[dnnResourceWorkspace] = workspace_buffer;
+    }
+
+    // Fallback implementation - Taken from lrn_op.cc
+    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
+    // copy.
+    void MklDefaultToEigen(OpKernelContext* context) {
+      // CHECK(false);
+
+      Tensor in_grads;
+      Tensor in_image;
+      Tensor out_image;
+
+      GetMklShape(context, 0, &ingrad_shape);
+      GetMklShape(context, 1, &inimage_shape);
+      GetMklShape(context, 2, &outimage_shape);
+
+      if (ingrad_shape.IsMklTensor()) {
+        in_grads =
+            ConvertMklToTF<T>(context, MklGetInput(context, 0), ingrad_shape);
+      } else {
+        in_grads = MklGetInput(context, 0);
+      }
+
+      if (inimage_shape.IsMklTensor()) {
+        in_image =
+            ConvertMklToTF<T>(context, MklGetInput(context, 1), inimage_shape);
+      } else {
+        in_image = MklGetInput(context, 1);
+      }
+
+      if (outimage_shape.IsMklTensor()) {
+        out_image =
+            ConvertMklToTF<T>(context, MklGetInput(context, 2), outimage_shape);
+      } else {
+        out_image = MklGetInput(context, 2);
+      }
+
+      const int64 batch = static_cast<int64>(in_grads.dim_size(0));
+      const int64 rows = static_cast<int64>(in_grads.dim_size(1));
+      const int64 cols = static_cast<int64>(in_grads.dim_size(2));
+      const int64 depth = static_cast<int64>(in_grads.dim_size(3));
+      const auto nodes = cols * rows;
+
+      auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
+      auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
+      auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
+
+      Tensor* output;
+      MklShape mkl_output_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(false);
+      mkl_output_mkl_shape.SetDimensions(4);
+      AllocateOutputSetMklShape(context, 0, &output, in_grads.shape(),
+                                mkl_output_mkl_shape);
+
+      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
+      out_shaped.setZero();
+      auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
+                    depth](int64 begin, int64 end) {
+        for (int64 i = begin; i < end; ++i) {
+          for (int64 j = 0; j < depth; ++j) {
+            int64 depth_begin = std::max<int64>(0, j - depth_radius_);
+            int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
+
+            T norm(0);
+            for (int64 k = depth_begin; k < depth_end; ++k) {
+              norm += in_shaped(i, k) * in_shaped(i, k);
+            }
+            norm = alpha_ * norm + bias_;
+            DCHECK_GT(norm, T(1e-6));
+            for (int64 k = depth_begin; k < depth_end; ++k) {
+              T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
+                      activations(i, j) / norm;
+              if (k == j) {
+                dyi += Eigen::numext::pow(norm, -beta_);
+              }
+              dyi *= grads_shaped(i, j);
+              const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
+                  dyi;
+            }
+          }
+        }
+      };
+      auto worker_threads =
+          *(context->device()->tensorflow_cpu_worker_threads());
+      Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
+            depth * depth, shard);
+    }
+		
+    // release mkl resources
+    void Mklcleanup() {
+      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
+      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
+      if (!ingrad_in_mkl_format) {
+        CHECK_EQ(dnnLayoutDelete_F32(lt_input), E_SUCCESS);
+      }
+
+      if (!inimage_in_mkl_format) {
+        CHECK_EQ(dnnLayoutDelete_F32(lt_output), E_SUCCESS);
+      }
+      dnnDelete_F32(lrn_bwd);
+      dnnLayoutDelete_F32(lt_bdw_input);
+      dnnLayoutDelete_F32(lt_workspace);
+    }
+  } MklLRNGradOpContext;
+
+  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
+  bool workspace_enabled_;
+  int depth_radius_;
+  float bias_;
+  float alpha_;
+  float beta_;
+};
+
+#define REGISTER_MKL_LRN_CPU(T)                                     \
+  REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklLRNOp<T>);                             \
+  REGISTER_KERNEL_BUILDER(Name("_MklLRNGrad")                       \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklLRNGradOp<T>);
+
+TF_CALL_float(REGISTER_MKL_LRN_CPU);
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc
index 3ba28c13ed555cd569dea0621aae2a170e194bf3..16143191a34ae62704691f4916ac8f30d897f1d4 100644
--- a/tensorflow/core/kernels/mkl_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_matmul_op.cc
@@ -25,11 +25,11 @@ limitations under the License.
 
 #if defined(INTEL_MKL)
 
+#include "third_party/mkl/include/mkl_cblas.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
-#include "third_party/mkl/include/mkl_cblas.h"
 
 namespace tensorflow {
 
@@ -56,11 +56,11 @@ class MklMatMulOp : public OpKernel {
     dim_pair[0].first = transpose_a_ ? 0 : 1;
     dim_pair[0].second = transpose_b_ ? 1 : 0;
 
-    OP_REQUIRES(ctx,
-                a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
-                errors::InvalidArgument("Matrix size-incompatible: In[0]: ",
-                                        a.shape().DebugString(), ", In[1]: ",
-                                        b.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
+        errors::InvalidArgument(
+            "Matrix size-incompatible: In[0]: ", a.shape().DebugString(),
+            ", In[1]: ", b.shape().DebugString()));
     int a_dim_remaining = 1 - dim_pair[0].first;
     int b_dim_remaining = 1 - dim_pair[0].second;
     TensorShape out_shape(
@@ -199,15 +199,13 @@ class MklMatMulOp : public OpKernel {
   }
 };
 
-#define REGISTER_CPU(T)                                                      \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"),              \
-      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);       \
-  REGISTER_KERNEL_BUILDER(                                                   \
-      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("MKL"), \
-      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)
+#define REGISTER_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(                                      \
+      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      MklMatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);
 
-// TODO:Consider template specialization when adding/removing additional types
+// TODO(inteltf) Consider template specialization when adding/removing
+// additional types
 TF_CALL_float(REGISTER_CPU);
 TF_CALL_double(REGISTER_CPU);
 TF_CALL_complex64(REGISTER_CPU);
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 7e3efdcc06c0c44f6eb4729427f51afb5ba93083..1e0ee258b09f40ad6849375e75b1492675a027cb 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -83,10 +83,11 @@ class MklMaxPoolingOp : public OpKernel {
     ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
 
     mkl_context.MklCreateLayoutsAndPrimitives(context);
+    OP_REQUIRES_OK(context, context->status());
 
     // Declare output tensor
     TensorShape tensor_out_shape;
-    MklShape mkl_out_shape;
+    MklShape mkl_out_shape, mkl_workspace_shape;
     mkl_out_shape.SetMklTensor(true);
     mkl_out_shape.SetMklLayout(mkl_context.prim_pooling_fwd, dnnResourceDst);
     mkl_out_shape.SetTfLayout(mkl_context.params.in_dim,
@@ -98,31 +99,22 @@ class MklMaxPoolingOp : public OpKernel {
     tensor_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
                                 mkl_out_shape.GetMklLayout())) /
                             sizeof(T));
-    AllocateOutputSetMklshape(context, 0, &output_tensor, tensor_out_shape,
+    AllocateOutputSetMklShape(context, 0, &output_tensor, tensor_out_shape,
                               mkl_out_shape);
 
-    if (!workspace_enabled_) {
-      mkl_out_shape.SetMklTensor(false);
-    }
-
     Tensor* workspace_tensor;
     void* workspace_buf = nullptr;
-    if (workspace_enabled_) {
-      TensorShape workspace_shape;
-      workspace_shape.AddDim(
-          dnnLayoutGetMemorySize_F32(
-              static_cast<dnnLayout_t>(mkl_context.lt_workspace)) /
-          sizeof(T));
-      AllocateOutputSetMklshape(context, 1, &workspace_tensor, workspace_shape,
-                                mkl_out_shape);
-      mkl_context.pooling_res[dnnResourceWorkspace] = const_cast<void*>(
-          static_cast<const void*>(workspace_tensor->flat<T>().data()));
-    } else {
-      AllocTmpBuffer(context, workspace_tensor, mkl_context.lt_workspace,
-                     &workspace_buf);
-      mkl_context.pooling_res[dnnResourceWorkspace] = workspace_buf;
-    }
 
+    TensorShape workspace_shape;
+    mkl_workspace_shape.SetMklTensor(false);
+    workspace_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                               mkl_context.lt_workspace)) /
+                           sizeof(T));
+    AllocateOutputSetMklShape(context, 1, &workspace_tensor, workspace_shape,
+                              mkl_workspace_shape);
+
+    mkl_context.pooling_res[dnnResourceWorkspace] = const_cast<void*>(
+        static_cast<const void*>(workspace_tensor->flat<T>().data()));
     mkl_context.pooling_res[dnnResourceSrc] =
         const_cast<void*>(static_cast<const void*>(tensor_in.flat<T>().data()));
     mkl_context.pooling_res[dnnResourceDst] = const_cast<void*>(
@@ -140,8 +132,8 @@ class MklMaxPoolingOp : public OpKernel {
     MklPoolingOpParams params;
     MklShape input_shape;
     void* pooling_res[dnnResourceNumber];
-    dnnPrimitive_t prim_pooling_fwd;
-    dnnLayout_t lt_user_input, lt_workspace;
+    dnnPrimitive_t prim_pooling_fwd = nullptr;
+    dnnLayout_t lt_user_input = nullptr, lt_workspace = nullptr;
 
     void MklCreateLayoutsAndPrimitives(OpKernelContext* context) {
       bool input_in_mkl_format = input_shape.IsMklTensor();
@@ -256,8 +248,13 @@ class MklMaxPoolingGradOp : public OpKernel {
     ExtractMklOpParams(context, data_format_, pool_params, &mkl_context.params);
 
     mkl_context.MklCreateLayouts(context);
+    OP_REQUIRES_OK(context, context->status());
+
     mkl_context.MklCreatePrimitives(context, workspace_enabled_);
+    OP_REQUIRES_OK(context, context->status());
+
     mkl_context.MklPrepareInputs(context, workspace_enabled_);
+    OP_REQUIRES_OK(context, context->status());
 
     // Create shape for the input back prop output
     TensorShape mkl_input_backprop;
@@ -274,16 +271,11 @@ class MklMaxPoolingGradOp : public OpKernel {
         dnnLayoutGetMemorySize_F32(
             static_cast<dnnLayout_t>(mkl_output_shape.GetMklLayout())) /
         sizeof(T));
-    AllocateOutputSetMklshape(context, 0, &output_tensor, mkl_input_backprop,
+    AllocateOutputSetMklShape(context, 0, &output_tensor, mkl_input_backprop,
                               mkl_output_shape);
     mkl_context.pooling_res[dnnResourceDiffSrc] = const_cast<void*>(
         static_cast<const void*>(output_tensor->flat<T>().data()));
 
-    int64 output_size = output_tensor->NumElements();
-    for (int64 i = 0; i < output_size; ++i) {
-      (static_cast<float*>(mkl_context.pooling_res[dnnResourceDiffSrc]))[i] = 0;
-    }
-
     CHECK_EQ(
         dnnExecute_F32(mkl_context.prim_pooling_bwd, mkl_context.pooling_res),
         E_SUCCESS);
@@ -297,12 +289,15 @@ class MklMaxPoolingGradOp : public OpKernel {
     MklShape input_shape, output_backprop_shape;
     void* pooling_resfwd[dnnResourceNumber];
     void* pooling_res[dnnResourceNumber];
-    dnnPrimitive_t prim_pooling_fwd, prim_pooling_bwd, convert_input,
-        convert_outbackprop;
-    dnnLayout_t lt_outbackprop_user, lt_outbackprop_prim, lt_input_user,
-        lt_input_prim;
+    dnnPrimitive_t prim_pooling_fwd = nullptr, prim_pooling_bwd = nullptr,
+                   convert_input = nullptr, convert_outbackprop = nullptr;
+    dnnLayout_t lt_outbackprop_user = nullptr, lt_outbackprop_prim = nullptr,
+                lt_input_user = nullptr, lt_input_prim = nullptr;
     void* input_buf;
     void* outbackprop_buf;
+    Tensor tmp_output_buf_tensor;
+    Tensor workspace_buf_tensor;
+    Tensor input_buf_tensor, outbackprop_buf_tensor;
 
     void MklCreateLayouts(OpKernelContext* context) {
       bool input_in_mkl_format = input_shape.IsMklTensor();
@@ -351,9 +346,6 @@ class MklMaxPoolingGradOp : public OpKernel {
                    &lt_outbackprop_prim, prim_pooling_bwd, dnnResourceDiffDst),
                E_SUCCESS);
 
-      // Tensors needed to create temporary buffers
-      Tensor input_buf_tensor, outbackprop_buf_tensor;
-
       if (workspace_enabled == false) {
         CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
                      &lt_input_prim, prim_pooling_fwd, dnnResourceSrc),
@@ -384,11 +376,8 @@ class MklMaxPoolingGradOp : public OpKernel {
       bool input_in_mkl_format = input_shape.IsMklTensor();
       bool outbackprop_in_mkl_format = output_backprop_shape.IsMklTensor();
 
-      void* tmp_output_buf;
-      Tensor tmp_output_buf_tensor;
-
-      void* workspace_buf;
-      Tensor workspace_buf_tensor;
+      void* tmp_output_buf = nullptr;
+      void* workspace_buf = nullptr;
 
       if (workspace_enabled == false) {
         if (convert_input != nullptr) {
@@ -488,16 +477,16 @@ class MklMaxPoolingGradOp : public OpKernel {
   bool workspace_enabled_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("MklMaxPool")
+REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
-                            .Label(mkl_layer_registry::kMklLayerLabel),
+                            .Label(mkl_op_registry::kMklOpLabel),
                         MklMaxPoolingOp<CPUDevice, float>);
 
-REGISTER_KERNEL_BUILDER(Name("MklMaxPoolGrad")
+REGISTER_KERNEL_BUILDER(Name("_MklMaxPoolGrad")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
-                            .Label(mkl_layer_registry::kMklLayerLabel),
+                            .Label(mkl_op_registry::kMklOpLabel),
                         MklMaxPoolingGradOp<CPUDevice, float>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index 3eb472d7e304ddd29518d30aa524a13d35ce3d03..65e8852cfb11a2dd78395860a7ca7b2cc550be34 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -15,152 +15,136 @@ limitations under the License.
 
 #ifdef INTEL_MKL
 #include <vector>
-#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 
 namespace tensorflow {
 
-  // Initialization for TensorFlow format
-  void MklPoolParameters::Init(OpKernelContext* context,
-                               const std::vector<int32>& ksize,
-                               const std::vector<int32>& stride,
-                               Padding padding,
-                               TensorFormat data_format,
-                               const TensorShape& tensor_in_shape) {
-    // For maxpooling, tensor_in should have 4 dimensions.
-    OP_REQUIRES(context, tensor_in_shape.dims() == 4,
-                errors::InvalidArgument("tensor_in must be 4-dimensional"));
-
-    depth = GetTensorDim(tensor_in_shape, data_format, 'C');
-    tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
-    tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
-    tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
-
-    Init(context, ksize, stride, padding, data_format);
-  }
-
-  // Initialization for MKL format
-  void MklPoolParameters::Init(OpKernelContext* context,
-                               const std::vector<int32>& ksize,
-                               const std::vector<int32>& stride,
-                               Padding padding,
-                               TensorFormat data_format,
-                               const MklShape* mklInputShape) {
-    // Get the input sizes
-    depth = mklInputShape->GetSizes()[2];
-    tensor_in_cols = mklInputShape->GetSizes()[0];
-    tensor_in_rows = mklInputShape->GetSizes()[1];
-    tensor_in_batch = mklInputShape->GetSizes()[3];
-
-    Init(context, ksize, stride, padding, data_format);
-  }
-
-  // Common Initialization for TensorFlow and MKL formats
-  void MklPoolParameters::Init(OpKernelContext* context,
-                               const std::vector<int32>& ksize,
-                               const std::vector<int32>& stride,
-                               Padding padding,
-                               TensorFormat data_format) {
-    // Get the data format
-    this->data_format = data_format;
-
-    // Get the output sizes
-    window_rows = GetTensorDim(ksize, data_format, 'H');
-    window_cols = GetTensorDim(ksize, data_format, 'W');
-    depth_window = GetTensorDim(ksize, data_format, 'C');
-
-    // Get the strides
-    row_stride = GetTensorDim(stride, data_format, 'H');
-    col_stride = GetTensorDim(stride, data_format, 'W');
-    depth_stride = GetTensorDim(stride, data_format, 'C');
-
-    // We only support 2D pooling across width/height and depthwise
-    // pooling, not a combination.
-    OP_REQUIRES(context,
-                (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
-                errors::Unimplemented(
+// Initialization for TensorFlow format
+void MklPoolParameters::Init(OpKernelContext* context,
+                             const std::vector<int32>& ksize,
+                             const std::vector<int32>& stride, Padding padding,
+                             TensorFormat data_format,
+                             const TensorShape& tensor_in_shape) {
+  // For maxpooling, tensor_in should have 4 dimensions.
+  OP_REQUIRES(context, tensor_in_shape.dims() == 4,
+              errors::InvalidArgument("tensor_in must be 4-dimensional"));
+
+  depth = GetTensorDim(tensor_in_shape, data_format, 'C');
+  tensor_in_cols = GetTensorDim(tensor_in_shape, data_format, 'W');
+  tensor_in_rows = GetTensorDim(tensor_in_shape, data_format, 'H');
+  tensor_in_batch = GetTensorDim(tensor_in_shape, data_format, 'N');
+
+  Init(context, ksize, stride, padding, data_format);
+}
+
+// Initialization for MKL format
+void MklPoolParameters::Init(OpKernelContext* context,
+                             const std::vector<int32>& ksize,
+                             const std::vector<int32>& stride, Padding padding,
+                             TensorFormat data_format,
+                             const MklShape* mklInputShape) {
+  // Get the input sizes
+  depth = mklInputShape->GetSizes()[2];
+  tensor_in_cols = mklInputShape->GetSizes()[0];
+  tensor_in_rows = mklInputShape->GetSizes()[1];
+  tensor_in_batch = mklInputShape->GetSizes()[3];
+
+  Init(context, ksize, stride, padding, data_format);
+}
+
+// Common Initialization for TensorFlow and MKL formats
+void MklPoolParameters::Init(OpKernelContext* context,
+                             const std::vector<int32>& ksize,
+                             const std::vector<int32>& stride, Padding padding,
+                             TensorFormat data_format) {
+  // Get the data format
+  this->data_format = data_format;
+
+  // Get the output sizes
+  window_rows = GetTensorDim(ksize, data_format, 'H');
+  window_cols = GetTensorDim(ksize, data_format, 'W');
+  depth_window = GetTensorDim(ksize, data_format, 'C');
+
+  // Get the strides
+  row_stride = GetTensorDim(stride, data_format, 'H');
+  col_stride = GetTensorDim(stride, data_format, 'W');
+  depth_stride = GetTensorDim(stride, data_format, 'C');
+
+  // We only support 2D pooling across width/height and depthwise
+  // pooling, not a combination.
+  OP_REQUIRES(context,
+              (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
+              errors::Unimplemented(
                   "MaxPooling supports exactly one of pooling across depth "
                   "or pooling across width/height."));
 
-    if (depth_window == 1) {
-      OP_REQUIRES_OK(context,
-                     GetWindowedOutputSizeVerbose(tensor_in_rows,
-                                                  window_rows,
-                                                  row_stride,
-                                                  padding,
-                                                  &out_height,
-                                                  &pad_top,
-                                                  &pad_bottom));
-
-      OP_REQUIRES_OK(context,
-                     GetWindowedOutputSizeVerbose(tensor_in_cols,
-                                                  window_cols,
-                                                  col_stride,
-                                                  padding,
-                                                  &out_width,
-                                                  &pad_left,
-                                                  &pad_right));
-    } else {
-      // Our current version of depthwise max pooling does not support
-      // any padding, and expects the depth_window to equal the depth
-      // stride (no overlapping).
-      OP_REQUIRES(context, depth % depth_window == 0,
-                  errors::Unimplemented("Depthwise max pooling requires the"
-                                        " depth window to evenly divide the"
-                                        " input depth"));
-      OP_REQUIRES(context, depth_stride == depth_window,
-                  errors::Unimplemented("Depthwise max pooling requires the"
-                                        " depth window to equal the depth"
-                                        " stride"));
-
-      // The current version of depthwise max is only implemented on CPU.
-      OP_REQUIRES(context,
-                  (DeviceType(static_cast<Device*>(context->device())
-                              ->attributes()
-                              .device_type()) == DeviceType(DEVICE_CPU)),
-                  errors::Unimplemented("Depthwise max pooling is currently "
-                                        "only implemented for CPU devices."));
-
-      pad_depth = 0;
-      out_depth = depth / depth_window;
-    }
-  }
-
-  // Transfers the right parameters for pooling to the op parameters
-  // Updates context->status if there is an invalid input.
-  void ExtractMklOpParams(OpKernelContext* context,
-                          TensorFormat data_format,
-                          const MklPoolParameters &params,
-                          MklPoolingOpParams *mkl_params) {
-    mkl_params->in_sizes[0] = params.tensor_in_cols;
-    mkl_params->in_sizes[1] = params.tensor_in_rows;
-    mkl_params->in_sizes[2] = params.depth;
-    mkl_params->in_sizes[3] = params.tensor_in_batch;
-
-    GetStridesFromSizes(data_format,
-                        mkl_params->in_strides,
-                        mkl_params->in_sizes);
-
-    mkl_params->out_sizes[0] = params.out_width;
-    mkl_params->out_sizes[1] = params.out_height;
-    mkl_params->out_sizes[2] = params.depth;
-    mkl_params->out_sizes[3] = params.tensor_in_batch;
-
-    GetStridesFromSizes(data_format,
-                        mkl_params->out_strides,
-                        mkl_params->out_sizes);
-
-    mkl_params->in_offset[0] = -params.pad_left;
-    mkl_params->in_offset[1] = -params.pad_top;
-    mkl_params->in_offset[2] = -params.pad_right;
-    mkl_params->in_offset[3] = -params.pad_bottom;
-
-    mkl_params->kernel_stride[0] = params.col_stride;
-    mkl_params->kernel_stride[1] = params.row_stride;
-
-    mkl_params->kernel_size[0] = params.window_cols;
-    mkl_params->kernel_size[1] = params.window_rows;
+  if (depth_window == 1) {
+    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
+                                tensor_in_rows, window_rows, row_stride,
+                                padding, &out_height, &pad_top, &pad_bottom));
+
+    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
+                                tensor_in_cols, window_cols, col_stride,
+                                padding, &out_width, &pad_left, &pad_right));
+  } else {
+    // Our current version of depthwise max pooling does not support
+    // any padding, and expects the depth_window to equal the depth
+    // stride (no overlapping).
+    OP_REQUIRES(context, depth % depth_window == 0,
+                errors::Unimplemented("Depthwise max pooling requires the"
+                                      " depth window to evenly divide the"
+                                      " input depth"));
+    OP_REQUIRES(context, depth_stride == depth_window,
+                errors::Unimplemented("Depthwise max pooling requires the"
+                                      " depth window to equal the depth"
+                                      " stride"));
+
+    // The current version of depthwise max is only implemented on CPU.
+    OP_REQUIRES(context,
+                (DeviceType(static_cast<Device*>(context->device())
+                                ->attributes()
+                                .device_type()) == DeviceType(DEVICE_CPU)),
+                errors::Unimplemented("Depthwise max pooling is currently "
+                                      "only implemented for CPU devices."));
+
+    pad_depth = 0;
+    out_depth = depth / depth_window;
   }
-}       // namespace tensorflow
+}
+
+// Transfers the right parameters for pooling to the op parameters
+// Updates context->status if there is an invalid input.
+void ExtractMklOpParams(OpKernelContext* context, TensorFormat data_format,
+                        const MklPoolParameters& params,
+                        MklPoolingOpParams* mkl_params) {
+  mkl_params->in_sizes[0] = params.tensor_in_cols;
+  mkl_params->in_sizes[1] = params.tensor_in_rows;
+  mkl_params->in_sizes[2] = params.depth;
+  mkl_params->in_sizes[3] = params.tensor_in_batch;
+
+  GetStridesFromSizes(data_format, mkl_params->in_strides,
+                      mkl_params->in_sizes);
+
+  mkl_params->out_sizes[0] = params.out_width;
+  mkl_params->out_sizes[1] = params.out_height;
+  mkl_params->out_sizes[2] = params.depth;
+  mkl_params->out_sizes[3] = params.tensor_in_batch;
+
+  GetStridesFromSizes(data_format, mkl_params->out_strides,
+                      mkl_params->out_sizes);
+
+  mkl_params->in_offset[0] = -params.pad_left;
+  mkl_params->in_offset[1] = -params.pad_top;
+  mkl_params->in_offset[2] = -params.pad_right;
+  mkl_params->in_offset[3] = -params.pad_bottom;
+
+  mkl_params->kernel_stride[0] = params.col_stride;
+  mkl_params->kernel_stride[1] = params.row_stride;
+
+  mkl_params->kernel_size[0] = params.window_cols;
+  mkl_params->kernel_size[1] = params.window_rows;
+}
+}  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 0a7c4dd15ebe95db60fc59a3bdcea6028bb00247..92ea2beb25aa1fd4cab7fd787b04c4d086ca1b05 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -76,17 +76,16 @@ typedef struct {
   size_t in_strides[4];
   size_t out_sizes[4];
   size_t out_strides[4];
-  int    in_offset[4];
+  int in_offset[4];
   size_t kernel_stride[2];
   size_t kernel_size[2];
 } MklPoolingOpParams;
 
 // Transfers the right parameters for pooling to the op parameters
 // Updates context->status if there is an invalid input.
-void ExtractMklOpParams(OpKernelContext* context,
-                        TensorFormat data_format,
-                        const MklPoolParameters &params,
-                        MklPoolingOpParams *mkl_params);
+void ExtractMklOpParams(OpKernelContext* context, TensorFormat data_format,
+                        const MklPoolParameters& params,
+                        MklPoolingOpParams* mkl_params);
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 63c0374981fa8aa998c4b1e503f001272f393cfd..10d2937584ddcd5178f1be75bab980ab00fb05d1 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -1,397 +1,380 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// See docs in ../ops/nn_ops.cc.
-#ifdef INTEL_MKL
-
-#include "tensorflow/core/framework/numeric_op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-
-#include "tensorflow/core/platform/default/logging.h"
-#include "tensorflow/core/util/mkl_util.h"
-#include "third_party/mkl/include/mkl_dnn.h"
-#include "third_party/mkl/include/mkl_dnn_types.h"
-
-namespace tensorflow {
-
-typedef Eigen::ThreadPoolDevice CPUDevice;
-
-struct MklReluHelpers {
-  static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
-                                     const Tensor& a) {
-    OP_REQUIRES(context, a.IsSameSize(g),
-                errors::InvalidArgument("g and a must be the same size"));
-  }
-  static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
-                               const Tensor& a) {
-    ValidateSameSizeHelper(context, g, a);
-    return context->status().ok();
-  }
-};
-
-template <typename Device, typename T>
-class MklReluOp : public OpKernel {
- public:
-  ~MklReluOp() {}
-
-  explicit MklReluOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    MklReluOpContext mkl_context;
-
-    const Tensor& input = MklGetInput(context, 0);
-    GetMklShape(context, 0, &mkl_context.input_shape);
-    void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
-    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
-    if (!input_in_mkl_format && !input.dims()) {  // handle the case of a scalar
-      const TensorShape& o_shape = input.shape();
-      Tensor* out_tensor = nullptr;
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklshape(context, 0, &out_tensor, o_shape,
-                                mkl_context.output_shape);
-      void* out_o = static_cast<void*>(out_tensor->flat<T>().data());
-      (static_cast<T*>(out_o))[0] =
-          std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
-      return;
-    }
-
-    // Generate size, stride for input if input is in MKL format.
-    if (input_in_mkl_format) {
-      mkl_context.in_dims = mkl_context.input_shape.GetDimension();
-      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-      mkl_context.in_strides = new size_t[mkl_context.in_dims];
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = mkl_context.input_shape.GetSizes()[i];
-        mkl_context.in_strides[i] = mkl_context.input_shape.GetStrides()[i];
-      }
-    } else {
-      mkl_context.in_dims = input.dims();
-      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-      mkl_context.in_strides = new size_t[mkl_context.in_dims];
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = input.dim_size((mkl_context.in_dims - 1) - i);
-      }
-      mkl_context.in_strides[0] = 1;
-      for (int i = 1; i < mkl_context.in_dims; i++) {
-        mkl_context.in_strides[i] =
-            mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-      }
-    }
-
-    float negative_slope = 0.0;
-    mkl_context.MklCreateInputLayouts(context);
-    CHECK_EQ(dnnReLUCreateForward_F32(&mkl_context.prim_relu_fwd, NULL,
-                                      mkl_context.lt_input, negative_slope),
-             E_SUCCESS);
-
-    Tensor* output = nullptr;
-
-    if (input_in_mkl_format) {
-      TensorShape tf_shape;
-      mkl_context.output_shape.SetMklTensor(true);
-      mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_fwd,
-                                            dnnResourceDst);
-      mkl_context.output_shape.SetTfLayout(
-          mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
-      tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                          mkl_context.output_shape.GetMklLayout())) /
-                      sizeof(T));
-      AllocateOutputSetMklshape(context, 0, &output, tf_shape,
-                                mkl_context.output_shape);
-    } else {
-      const TensorShape& o_shape = input.shape();
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklshape(context, 0, &output, o_shape,
-                                mkl_context.output_shape);
-    }
-
-    void* user_o = static_cast<void*>(const_cast<T*>(output->flat<T>().data()));
-
-    mkl_context.relu_res[dnnResourceDst] = user_o;
-    mkl_context.relu_res[dnnResourceSrc] = user_i;
-    CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_fwd, mkl_context.relu_res),
-             E_SUCCESS);
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes;
-    size_t* in_strides;
-    MklShape input_shape, output_shape;
-    dnnPrimitive_t prim_relu_fwd = nullptr;
-    void* relu_res[dnnResourceNumber];
-    dnnLayout_t lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (!input_in_mkl_format) {
-        dnnLayoutDelete_F32(lt_input);
-        free(in_sizes);
-        free(in_strides);
-      }
-      dnnDelete_F32(prim_relu_fwd);
-    }
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (!input_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      }
-    }
-  } MklReluOpContext;
-};
-
-template <typename Device, typename T>
-class MklReluGradOp : public OpKernel {
- public:
-  ~MklReluGradOp() {}
-
-  explicit MklReluGradOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override;
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes;
-    size_t* in_strides;
-    MklShape input_shape, grad_shape, output_shape;
-    void* relu_res[dnnResourceNumber];
-    dnnPrimitive_t prim_relu_bwd;
-    dnnLayout_t lt_input, lt_grad;
-
-    void MklPrepareReluGradInputs(OpKernelContext* context,
-                                  Tensor* mkl_tmp_grad_buf_tensor,
-                                  Tensor* mkl_tmp_input_buf_tensor) {
-      dnnPrimitive_t cv_user_to_reluB_input, cv_user_to_reluB_grad;
-      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_grad;
-
-      const Tensor& g = MklGetInput(context, 0);
-      const Tensor& a = MklGetInput(context, 1);
-
-      void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-      void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_internal_grad, prim_relu_bwd, dnnResourceDiffDst),
-               E_SUCCESS);
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_input,
-                                                prim_relu_bwd, dnnResourceSrc),
-               E_SUCCESS);
-
-      if (!dnnLayoutCompare_F32(mkl_lt_internal_grad, lt_grad)) {
-        AllocTmpBuffer(context, mkl_tmp_grad_buf_tensor, mkl_lt_internal_grad,
-                       &relu_res[dnnResourceDiffDst]);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_user_to_reluB_grad, lt_grad,
-                                         mkl_lt_internal_grad),
-                 E_SUCCESS);
-        CHECK_EQ(dnnConversionExecute_F32(cv_user_to_reluB_grad, user_g,
-                                          relu_res[dnnResourceDiffDst]),
-                 E_SUCCESS);
-        dnnDelete_F32(cv_user_to_reluB_grad);
-      } else {
-        relu_res[dnnResourceDiffDst] = user_g;
-      }
-
-      if (!dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input)) {
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &relu_res[dnnResourceSrc]);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_user_to_reluB_input, lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        CHECK_EQ(dnnConversionExecute_F32(cv_user_to_reluB_input, user_i,
-                                          relu_res[dnnResourceSrc]),
-                 E_SUCCESS);
-        dnnDelete_F32(cv_user_to_reluB_input);
-      } else {
-        relu_res[dnnResourceSrc] = user_i;
-      }
-
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-      dnnLayoutDelete_F32(mkl_lt_internal_grad);
-    }
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool grad_is_mkl = grad_shape.IsMklTensor();
-      bool input_is_mkl = input_shape.IsMklTensor();
-      if (!input_is_mkl) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      }
-
-      if (!grad_is_mkl) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_grad, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_grad = static_cast<dnnLayout_t>(grad_shape.GetCurLayout());
-      }
-    }
-
-    void MklCleanup() {
-      bool grad_is_mkl = grad_shape.IsMklTensor();
-      bool input_is_mkl = input_shape.IsMklTensor();
-      dnnDelete_F32(prim_relu_bwd);
-      if (!input_is_mkl) {
-        dnnLayoutDelete_F32(lt_input);
-        free(in_sizes);
-        free(in_strides);
-      }
-      if (!grad_is_mkl) {
-        dnnLayoutDelete_F32(lt_grad);
-      }
-    }
-  } MklReluGradOpContext;
-};
-
-template <typename Device, typename T>
-
-void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
-  MklReluGradOpContext mkl_context;
-  const Tensor& g = MklGetInput(context, 0);
-  const Tensor& a = MklGetInput(context, 1);
-
-  void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-  void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-
-  GetMklShape(context, 0, &mkl_context.grad_shape);
-  GetMklShape(context, 1, &mkl_context.input_shape);
-
-  bool grad_is_mkl = mkl_context.grad_shape.IsMklTensor();
-  bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
-  if (!input_is_mkl && !grad_is_mkl &&
-      !MklReluHelpers::ValidateSameSize(context, g, a))
-    return;
-  Tensor* output = nullptr;
-  if (!input_is_mkl && !grad_is_mkl &&
-      !a.dims()) {  // handle the case of a scalar
-    // Allocate space for g and
-    const TensorShape& g_shape = g.shape();
-    mkl_context.output_shape.SetMklTensor(false);
-    AllocateOutputSetMklshape(context, 0, &output, g_shape,
-                              mkl_context.output_shape);
-    void* out_o = static_cast<void*>(output->flat<T>().data());
-    (static_cast<T*>(out_o))[0] =
-        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
-    return;
-  }
-
-  // Generate size, stride for input if input/grad is in MKL format.
-  if (grad_is_mkl || input_is_mkl) {
-    const MklShape* tmp_mkl_shape =
-        (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
-
-    mkl_context.in_dims = tmp_mkl_shape->GetDimension();
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-    for (int i = 0; i < mkl_context.in_dims; i++) {
-      mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
-      mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
-    }
-  } else {
-    mkl_context.in_dims = g.dims();
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-
-    for (int i = 0; i < mkl_context.in_dims; i++) {
-      mkl_context.in_sizes[i] = g.dim_size((mkl_context.in_dims - 1) - i);
-    }
-    mkl_context.in_strides[0] = 1;
-    for (int i = 1; i < mkl_context.in_dims; i++) {
-      mkl_context.in_strides[i] =
-          mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-    }
-  }
-
-  mkl_context.MklCreateInputLayouts(context);
-  float negative_slope = 0.0;
-  CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
-                                     mkl_context.lt_grad, mkl_context.lt_input,
-                                     negative_slope),
-           E_SUCCESS);
-  Tensor mkl_tmp_grad_buf_tensor, mkl_tmp_input_buf_tensor;
-  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_grad_buf_tensor,
-                                       &mkl_tmp_input_buf_tensor);
-
-  if (input_is_mkl ||
-      grad_is_mkl) { /*if  grad or input are MKL leave it in MKL*/
-    TensorShape tf_shape;
-    mkl_context.output_shape.SetMklTensor(true);
-    mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
-                                          dnnResourceDiffSrc);
-    mkl_context.output_shape.SetTfLayout(
-        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-    // If input_is_mkl or grad_is_mkl, then we copy strides and sizes from Mkl
-    // shape of one that is in MKL layout.
-    if (grad_is_mkl == true) {
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
-    } else {
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
-    }
-
-    tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                        mkl_context.output_shape.GetMklLayout())) /
-                    sizeof(T));
-    AllocateOutputSetMklshape(context, 0, &output, tf_shape,
-                              mkl_context.output_shape);
-
-  } else {
-    const TensorShape& o_shape = g.shape();
-    mkl_context.output_shape.SetMklTensor(false);
-    AllocateOutputSetMklshape(context, 0, &output, o_shape,
-                              mkl_context.output_shape);
-  }
-
-  mkl_context.relu_res[dnnResourceDiffSrc] =
-      static_cast<void*>(output->flat<T>().data());
-
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
-           E_SUCCESS);
-  mkl_context.MklCleanup();
-}
-
-/* Register DNN kernels for supported operations and supported types - right now
- * it is only Relu and f32*/
-#define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)                   \
-  REGISTER_KERNEL_BUILDER(Name("MklRelu")                                 \
-                              .Device(DEVICE_CPU)                         \
-                              .TypeConstraint<type>("T")                  \
-                              .Label(mkl_layer_registry::kMklLayerLabel), \
-                          MklReluOp<CPUDevice, type>);                    \
-  REGISTER_KERNEL_BUILDER(Name("MklReluGrad")                             \
-                              .Device(DEVICE_CPU)                         \
-                              .TypeConstraint<type>("T")                  \
-                              .Label(mkl_layer_registry::kMklLayerLabel), \
-                          MklReluGradOp<CPUDevice, type>);
-TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
-
-}  // namespace tensorflow
-
-#endif  // INTEL_MKL
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/platform/default/logging.h"
+#include "tensorflow/core/util/mkl_util.h"
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+struct MklReluHelpers {
+  static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
+                                     const Tensor& a) {
+    OP_REQUIRES(context, a.IsSameSize(g),
+                errors::InvalidArgument("g and a must be the same size"));
+  }
+  static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
+                               const Tensor& a) {
+    ValidateSameSizeHelper(context, g, a);
+    return context->status().ok();
+  }
+};
+
+template <typename Device, typename T>
+class MklReluOp : public OpKernel {
+ public:
+  ~MklReluOp() {}
+
+  explicit MklReluOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    MklReluOpContext mkl_context;
+
+    const Tensor& input = MklGetInput(context, 0);
+    GetMklShape(context, 0, &mkl_context.input_shape);
+    void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
+    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+    if (!input_in_mkl_format && !input.dims()) {  // handle the case of a scalar
+      const TensorShape& o_shape = input.shape();
+      Tensor* out_tensor = nullptr;
+      mkl_context.output_shape.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, 0, &out_tensor, o_shape,
+                                mkl_context.output_shape);
+      void* out_o = static_cast<void*>(out_tensor->flat<T>().data());
+      (static_cast<T*>(out_o))[0] =
+          std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+      return;
+    }
+
+    // Generate size, stride for input if input is in MKL format.
+    if (input_in_mkl_format) {
+      mkl_context.in_dims = mkl_context.input_shape.GetDimension();
+      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+      mkl_context.in_strides = new size_t[mkl_context.in_dims];
+      for (int i = 0; i < mkl_context.in_dims; i++) {
+        mkl_context.in_sizes[i] = mkl_context.input_shape.GetSizes()[i];
+        mkl_context.in_strides[i] = mkl_context.input_shape.GetStrides()[i];
+      }
+    } else {
+      mkl_context.in_dims = input.dims();
+      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+      mkl_context.in_strides = new size_t[mkl_context.in_dims];
+      for (int i = 0; i < mkl_context.in_dims; i++) {
+        mkl_context.in_sizes[i] = input.dim_size((mkl_context.in_dims - 1) - i);
+      }
+      mkl_context.in_strides[0] = 1;
+      for (int i = 1; i < mkl_context.in_dims; i++) {
+        mkl_context.in_strides[i] =
+            mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
+      }
+    }
+
+    float negative_slope = 0.0;
+    mkl_context.MklCreateInputLayouts(context);
+    CHECK_EQ(dnnReLUCreateForward_F32(&mkl_context.prim_relu_fwd, NULL,
+                                      mkl_context.lt_input, negative_slope),
+             E_SUCCESS);
+
+    Tensor* output = nullptr;
+
+    if (input_in_mkl_format) {
+      TensorShape tf_shape;
+      mkl_context.output_shape.SetMklTensor(true);
+      mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_fwd,
+                                            dnnResourceDst);
+      mkl_context.output_shape.SetTfLayout(
+          mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+      mkl_context.output_shape.SetTfDimOrder(
+          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
+      tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                          mkl_context.output_shape.GetMklLayout())) /
+                      sizeof(T));
+      AllocateOutputSetMklShape(context, 0, &output, tf_shape,
+                                mkl_context.output_shape);
+    } else {
+      const TensorShape& o_shape = input.shape();
+      mkl_context.output_shape.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, 0, &output, o_shape,
+                                mkl_context.output_shape);
+    }
+
+    void* user_o = static_cast<void*>(const_cast<T*>(output->flat<T>().data()));
+
+    mkl_context.relu_res[dnnResourceDst] = user_o;
+    mkl_context.relu_res[dnnResourceSrc] = user_i;
+    CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_fwd, mkl_context.relu_res),
+             E_SUCCESS);
+    mkl_context.MklCleanup();
+  }
+
+ private:
+  typedef struct {
+    int in_dims;
+    size_t* in_sizes;
+    size_t* in_strides;
+    MklShape input_shape, output_shape;
+    dnnPrimitive_t prim_relu_fwd = nullptr;
+    void* relu_res[dnnResourceNumber];
+    dnnLayout_t lt_input = nullptr;
+
+    void MklCleanup() {
+      bool input_in_mkl_format = input_shape.IsMklTensor();
+      if (!input_in_mkl_format) {
+        dnnLayoutDelete_F32(lt_input);
+        free(in_sizes);
+        free(in_strides);
+      }
+      dnnDelete_F32(prim_relu_fwd);
+    }
+
+    void MklCreateInputLayouts(OpKernelContext* context) {
+      bool input_in_mkl_format = input_shape.IsMklTensor();
+      if (!input_in_mkl_format) {
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+                 E_SUCCESS);
+      } else {
+        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
+      }
+    }
+  } MklReluOpContext;
+};
+
+template <typename Device, typename T>
+class MklReluGradOp : public OpKernel {
+ public:
+  ~MklReluGradOp() {}
+
+  explicit MklReluGradOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override;
+
+ private:
+  typedef struct {
+    int in_dims;
+    size_t* in_sizes;
+    size_t* in_strides;
+    MklShape input_shape, grad_shape, output_shape;
+    void* relu_res[dnnResourceNumber];
+    dnnPrimitive_t prim_relu_bwd;
+    dnnLayout_t lt_input, lt_grad;
+
+    void MklPrepareReluGradInputs(OpKernelContext* context,
+                                  Tensor* mkl_tmp_grad_buf_tensor,
+                                  Tensor* mkl_tmp_input_buf_tensor) {
+      dnnPrimitive_t cv_user_to_reluB_input, cv_user_to_reluB_grad;
+      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_grad;
+
+      const Tensor& g = MklGetInput(context, 0);
+      const Tensor& a = MklGetInput(context, 1);
+
+      void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
+      void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
+      dnnPrimitive_t cv_input_to_grad = NULL;
+      Tensor mkl_tmp_buf_tensor;
+      void* mkl_buffer_convert = nullptr;
+
+      // if input and grad are not in the same layout, do a conversion between
+      // them.
+      if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
+        AllocTmpBuffer(context, &mkl_tmp_buf_tensor, lt_grad,
+                       &mkl_buffer_convert);
+        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
+                 E_SUCCESS);
+
+        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, user_i,
+                                          mkl_buffer_convert),
+                 E_SUCCESS);
+        relu_res[dnnResourceSrc] = mkl_buffer_convert;
+        dnnDelete_F32(cv_input_to_grad);
+      } else {
+        relu_res[dnnResourceSrc] = user_i;
+      }
+
+      relu_res[dnnResourceDiffDst] = user_g;
+    }
+
+    void MklCreateInputLayouts(OpKernelContext* context) {
+      bool grad_is_mkl = grad_shape.IsMklTensor();
+      bool input_is_mkl = input_shape.IsMklTensor();
+      if (!input_is_mkl) {
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
+                 E_SUCCESS);
+      } else {
+        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
+      }
+
+      if (!grad_is_mkl) {
+        CHECK_EQ(dnnLayoutCreate_F32(&lt_grad, in_dims, in_sizes, in_strides),
+                 E_SUCCESS);
+      } else {
+        lt_grad = static_cast<dnnLayout_t>(grad_shape.GetCurLayout());
+      }
+    }
+
+    void MklCleanup() {
+      bool grad_is_mkl = grad_shape.IsMklTensor();
+      bool input_is_mkl = input_shape.IsMklTensor();
+      dnnDelete_F32(prim_relu_bwd);
+      if (!input_is_mkl) {
+        dnnLayoutDelete_F32(lt_input);
+        free(in_sizes);
+        free(in_strides);
+      }
+      if (!grad_is_mkl) {
+        dnnLayoutDelete_F32(lt_grad);
+      }
+    }
+  } MklReluGradOpContext;
+};
+
+template <typename Device, typename T>
+
+void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
+  MklReluGradOpContext mkl_context;
+  const Tensor& g = MklGetInput(context, 0);
+  const Tensor& a = MklGetInput(context, 1);
+
+  void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
+  void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
+
+  GetMklShape(context, 0, &mkl_context.grad_shape);
+  GetMklShape(context, 1, &mkl_context.input_shape);
+
+  bool grad_is_mkl = mkl_context.grad_shape.IsMklTensor();
+  bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
+  if (!input_is_mkl && !grad_is_mkl &&
+      !MklReluHelpers::ValidateSameSize(context, g, a))
+    return;
+  Tensor* output = nullptr;
+  if (!input_is_mkl && !grad_is_mkl &&
+      !a.dims()) {  // handle the case of a scalar
+    // Allocate space for g and
+    const TensorShape& g_shape = g.shape();
+    mkl_context.output_shape.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 0, &output, g_shape,
+                              mkl_context.output_shape);
+    void* out_o = static_cast<void*>(output->flat<T>().data());
+    (static_cast<T*>(out_o))[0] =
+        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
+    return;
+  }
+
+  // Generate size, stride for input if input/grad is in MKL format.
+  if (grad_is_mkl || input_is_mkl) {
+    const MklShape* tmp_mkl_shape =
+        (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
+
+    mkl_context.in_dims = tmp_mkl_shape->GetDimension();
+    mkl_context.in_strides = new size_t[mkl_context.in_dims];
+    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+    for (int i = 0; i < mkl_context.in_dims; i++) {
+      mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
+      mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
+    }
+  } else {
+    mkl_context.in_dims = g.dims();
+    mkl_context.in_strides = new size_t[mkl_context.in_dims];
+    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
+
+    for (int i = 0; i < mkl_context.in_dims; i++) {
+      mkl_context.in_sizes[i] = g.dim_size((mkl_context.in_dims - 1) - i);
+    }
+    mkl_context.in_strides[0] = 1;
+    for (int i = 1; i < mkl_context.in_dims; i++) {
+      mkl_context.in_strides[i] =
+          mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
+    }
+  }
+
+  mkl_context.MklCreateInputLayouts(context);
+  float negative_slope = 0.0;
+  CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
+                                     mkl_context.lt_grad, mkl_context.lt_grad,
+                                     negative_slope),
+           E_SUCCESS);
+  Tensor mkl_tmp_grad_buf_tensor, mkl_tmp_input_buf_tensor;
+  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_grad_buf_tensor,
+                                       &mkl_tmp_input_buf_tensor);
+
+  if (input_is_mkl ||
+      grad_is_mkl) { /*if  grad or input are MKL leave it in MKL*/
+    TensorShape tf_shape;
+    mkl_context.output_shape.SetMklTensor(true);
+    mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
+                                          dnnResourceDiffSrc);
+    mkl_context.output_shape.SetTfLayout(
+        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+    // If input_is_mkl or grad_is_mkl, then we copy strides and sizes from Mkl
+    // shape of one that is in MKL layout.
+    if (grad_is_mkl == true) {
+      mkl_context.output_shape.SetTfDimOrder(
+          mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
+    } else {
+      mkl_context.output_shape.SetTfDimOrder(
+          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
+    }
+
+    tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
+                        mkl_context.output_shape.GetMklLayout())) /
+                    sizeof(T));
+    AllocateOutputSetMklShape(context, 0, &output, tf_shape,
+                              mkl_context.output_shape);
+
+  } else {
+    const TensorShape& o_shape = g.shape();
+    mkl_context.output_shape.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 0, &output, o_shape,
+                              mkl_context.output_shape);
+  }
+
+  mkl_context.relu_res[dnnResourceDiffSrc] =
+      static_cast<void*>(output->flat<T>().data());
+
+  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
+           E_SUCCESS);
+  mkl_context.MklCleanup();
+}
+
+/* Register DNN kernels for supported operations and supported types - right now
+ * it is only Relu and f32*/
+#define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
+  REGISTER_KERNEL_BUILDER(Name("_MklRelu")                          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklReluOp<CPUDevice, type>);              \
+  REGISTER_KERNEL_BUILDER(Name("_MklReluGrad")                      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklReluGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..593aa3a2fd6052f275015b1acd2e6f5271a837dd
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -0,0 +1,149 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include <memory>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include "third_party/mkl/include/mkl_dnn.h"
+#include "third_party/mkl/include/mkl_dnn_types.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+using CPUDevice = Eigen::ThreadPoolDevice;
+template <typename Device, typename T>
+class MklReshapeOp : public OpKernel {
+ public:
+  explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = MklGetInput(context, 0);
+    const Tensor& sizes = MklGetInput(context, 1);
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
+                errors::InvalidArgument("sizes input must be 1-D, not shape ",
+                                        sizes.shape().DebugString()));
+    const int64 num_dims = sizes.NumElements();
+
+    // Compute the output shape.  Determine product of specified
+    // dimensions, and find the index of the unspecified one.
+    TensorShape shape;
+    int64 product = 1;
+    int unknown_index = -1;
+    auto vec_size = sizes.flat<int32>();
+    for (int d = 0; d < num_dims; ++d) {
+      const int32 size = vec_size(d);
+      if (size == -1) {
+        OP_REQUIRES(
+            context, unknown_index == -1,
+            errors::InvalidArgument("only one input size may be -1, not both ",
+                                    unknown_index, " and ", d));
+        unknown_index = d;
+        shape.AddDim(1);
+      } else {
+        OP_REQUIRES(context, size >= 0,
+                    errors::InvalidArgument(
+                        "size ", d, " must be non-negative, not ", size));
+        shape.AddDim(size);
+        product *= size;
+      }
+    }
+    if (unknown_index != -1) {
+      OP_REQUIRES(
+          context, product > 0,
+          errors::InvalidArgument("Reshape cannot infer the missing input size "
+                                  "for an empty tensor unless all specified "
+                                  "input sizes are non-zero"));
+      const int64 missing = input.NumElements() / product;
+      OP_REQUIRES(
+          context, product * missing == input.NumElements(),
+          errors::InvalidArgument(
+              "Input to reshape is a tensor with ", input.NumElements(),
+              " values, but the requested shape requires a multiple of ",
+              product));
+      shape.set_dim(unknown_index, missing);
+    }
+    OP_REQUIRES(context, shape.num_elements() == input.NumElements(),
+                errors::InvalidArgument("Input to reshape is a tensor with ",
+                                        input.NumElements(),
+                                        " values, but the requested shape has ",
+                                        shape.num_elements()));
+
+    MklShape mkl_shape_input;
+    GetMklShape(context, 0, &mkl_shape_input);
+    bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
+    if (input_in_mkl_format) {
+      TensorShape& shape_to = shape;
+      TensorShape shape_from;
+      for (size_t i = 0; i < mkl_shape_input.GetDimension(); i++) {
+        // Outermost to innermost dimension
+        shape_from.AddDim(
+            mkl_shape_input.GetSizes()[mkl_shape_input.tf_dim_idx(i)]);
+      }
+
+      if (shape_from == shape_to) {
+        CopyMklTensorInToOut(context, 0, 0);
+        return;
+      } else {
+        // Allocate output tensor.
+        Tensor* output_tensor = NULL;
+        MklShape mkl_shape_output;
+        mkl_shape_output.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 0, &output_tensor, shape_to,
+                                  mkl_shape_output);
+
+        // Get output layout pointer.
+        dnnLayout_t output_layout =
+            static_cast<dnnLayout_t>(mkl_shape_input.GetTfLayout());
+
+        // Execute DNNConversion.
+        // Note: we  assume an MKL tensor always have float as its data type.
+        void* input_buffer =
+            static_cast<void*>(const_cast<float*>(input.flat<float>().data()));
+        void* output_buffer = static_cast<void*>(
+            const_cast<float*>(output_tensor->flat<float>().data()));
+        mkl_shape_input.GetConvertedFlatData(output_layout, input_buffer,
+                                             output_buffer);
+
+        VLOG(1) << "MKLToTFConversion complete successfully.";
+        return;
+      }
+    } else {
+      CopyTfTensorInToOutWithShape(context, 0, 0, shape);
+    }
+  }
+};
+
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklReshape")                       \
+                              .Device(DEVICE_CPU)                   \
+                              .HostMemory("shape")                  \
+                              .TypeConstraint<T>("T")               \
+                              .TypeConstraint<int32>("Tshape")      \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklReshapeOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc
index 51f90b3f901fd5c76ec237ddf1e6fd5c6679f65e..588d6874dd635b89863141a3eccd005bcf6f0317 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.cc
+++ b/tensorflow/core/kernels/mkl_tfconv_op.cc
@@ -105,11 +105,11 @@ class MklToTfOp : public OpKernel {
 //               Register kernel
 ///////////////////////////////////////////////////////////
 
-#define REGISTER_CPU(T)                                                   \
-  REGISTER_KERNEL_BUILDER(Name("MklToTf")                                 \
-                              .Device(DEVICE_CPU)                         \
-                              .TypeConstraint<T>("T")                     \
-                              .Label(mkl_layer_registry::kMklLayerLabel), \
+#define REGISTER_CPU(T)                                             \
+  REGISTER_KERNEL_BUILDER(Name("_MklToTf")                          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
                           MklToTfOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_CPU);
diff --git a/tensorflow/core/kernels/neon/BUILD b/tensorflow/core/kernels/neon/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7641516e3b00078a5ab4b78585780ce0333743c6
--- /dev/null
+++ b/tensorflow/core/kernels/neon/BUILD
@@ -0,0 +1,43 @@
+# Description:
+# Kernel implementations using Neon intrinsics.
+#
+package(
+    default_visibility = ["//visibility:public"],
+    features = ["-parse_headers"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_kernel_library",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_kernel_library(
+    name = "neon_depthwise_conv_op",
+    hdrs = [
+        "depthwiseconv_float.h",
+        "types.h",
+    ],
+    prefix = "neon_depthwise_conv_op",
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core/kernels:ops_util",
+        "@gemmlowp//:gemmlowp",
+    ],
+)
diff --git a/tensorflow/core/kernels/neon/depthwiseconv_float.h b/tensorflow/core/kernels/neon/depthwiseconv_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..acd58a644f3b0b0b578778f8c017efff30771efa
--- /dev/null
+++ b/tensorflow/core/kernels/neon/depthwiseconv_float.h
@@ -0,0 +1,725 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/core/kernels/neon/types.h"
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+
+namespace tensorflow {
+namespace neon {
+
+// Implementation of float DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct FloatDepthwiseConvKernel {};
+
+#ifdef USE_NEON
+
+template <>
+struct FloatDepthwiseConvKernel<false, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the inputs
+      float32x4_t input[4];
+      for (int i = 0; i < 4; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 16;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
+      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
+      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
+      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x4_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 8;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<false, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    const float32x2_t filters = vld1_f32(filter_ptr);
+    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the inputs
+      float32x4_t input[4];
+      for (int i = 0; i < 4; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 16;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the inputs
+      float32x4_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 8;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the inputs
+      const float32x4_t input = vld1q_f32(input_ptr);
+      input_ptr += 4;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+      // Multiply-accumulate
+      acc = vmlaq_f32(acc, input, filters_dup2);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+    // Handle 1 output pixel at a time
+    for (; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float32x2_t input = vld1_f32(input_ptr);
+      input_ptr += 2;
+      // Load the accumulators from acc_buffer
+      float32x2_t acc = vld1_f32(acc_buffer_ptr);
+      // Multiply-accumulate
+      acc = vmla_f32(acc, input, filters);
+      // Store the accumulators back to acc_buffer
+      vst1_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 16 input channels at a time.
+      for (; ic <= input_depth - 16; ic += 16) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        float32x4_t input[4];
+        for (int i = 0; i < 4; i++) {
+          input[i] = vld1q_f32(local_input_ptr + 4 * i);
+        }
+        local_input_ptr += 16;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+        }
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle 4 input channels at a time.
+      for (; ic <= input_depth - 4; ic += 4) {
+        // Load the filters
+        float32x4_t filter;
+        filter = vld1q_f32(local_filter_ptr);
+        local_filter_ptr += 4;
+        // Load the inputs
+        float32x4_t input;
+        input = vld1q_f32(local_input_ptr);
+        local_input_ptr += 4;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc;
+        acc = vld1q_f32(acc_buffer_ptr);
+        // Multiply-accumulate
+        acc = vmlaq_f32(acc, input, filter);
+        // Store the accumulators back to acc_buffer
+        vst1q_f32(acc_buffer_ptr, acc);
+        acc_buffer_ptr += 4;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        const float input_val = *local_input_ptr++;
+        const float filter_val = *local_filter_ptr++;
+        *acc_buffer_ptr++ += filter_val * input_val;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 2 input channels at a time.
+      for (; ic <= input_depth - 2; ic += 2) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        const float32x2_t input = vld1_f32(local_input_ptr);
+        local_input_ptr += 2;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
+        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
+        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
+        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        // Load the filters
+        float32x4_t filter[2];
+        for (int i = 0; i < 2; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 8;
+        // Load the inputs
+        const float input_val = *local_input_ptr++;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+        }
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 8;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        float32x4x2_t input_dup2[2];
+        for (int i = 0; i < 2; i++) {
+          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
+          input_dup2[i] = vzipq_f32(input, input);
+        }
+        local_input_ptr += 8;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
+        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
+        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
+        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle 4 input channels at a time.
+      for (; ic <= input_depth - 4; ic += 4) {
+        // Load the filters
+        float32x2_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
+        }
+        local_filter_ptr += 8;
+        // Load the inputs
+        const float32x4_t input = vld1q_f32(local_input_ptr);
+        local_input_ptr += 4;
+        // Load the accumulators from acc_buffer
+        float32x2_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
+        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
+        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
+        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+        }
+        acc_buffer_ptr += 8;
+      }
+      // Handle 2 input channels at a time.
+      for (; ic <= input_depth - 2; ic += 2) {
+        // Load the filters
+        const float32x4_t filter = vld1q_f32(local_filter_ptr);
+        local_filter_ptr += 4;
+        // Load the inputs
+        const float32x2_t input = vld1_f32(local_input_ptr);
+        local_input_ptr += 2;
+        // Load the accumulators from acc_buffer
+        float32x2_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
+        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+        }
+        acc_buffer_ptr += 4;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        // Load the inputs
+        const float input_val = *local_input_ptr++;
+        // Multiply-accumulate
+        for (int i = 0; i < 2; i++) {
+          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
+        }
+        local_filter_ptr += 2;
+        acc_buffer_ptr += 2;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+#endif
+
+// Accumulates the effect of one row of the filter, on a segment of one row
+// of the output, accessing the corresponding one row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width,
+                                const float* input_data, int pad_width,
+                                int depth_multiplier, int filter_width,
+                                const float* filter_data,
+                                int out_x_buffer_start, int out_x_buffer_end,
+                                int output_depth, float* acc_buffer) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
+#endif
+  // Sanity check parameters. This is important in particular to ensure
+  // that we keep the number of template instantiations minimal, so we don't
+  // increase binary size unnecessarily.
+  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+  static_assert(kFixedInputDepth || kAllowStrided, "");
+  DCHECK(stride == 1 || kAllowStrided);
+  if (kFixedInputDepth) {
+    DCHECK_EQ(input_depth, kFixedInputDepth);
+  }
+  if (kFixedDepthMultiplier) {
+    DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
+  }
+  DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  const int input_ptr_increment = stride * input_depth;
+  const float* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    // For the current (filter_x, filter_y) point in the filter,
+    // compute the boundaries of the corresponding output row segment.
+    int out_x_loop_start_unclampled = 0;
+    int out_x_loop_end_unclampled = 0;
+    if (kAllowStrided) {
+      if (stride == 2) {
+        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + 1) / 2;
+      } else if (stride == 4) {
+        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + 3) / 4;
+      } else {
+        out_x_loop_start_unclampled =
+            (pad_width - filter_x + stride - 1) / stride;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + stride - 1) / stride;
+      }
+    } else {
+      out_x_loop_start_unclampled = pad_width - filter_x;
+      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
+    }
+    // The kernel will have to iterate on the segment of the
+    // output row that starts at out_x_loop_start and out_x_loop_end.
+    const int out_x_loop_start =
+        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
+
+    float* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const float* input_ptr = input_data + in_x_origin * input_depth;
+    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
+                             kFixedDepthMultiplier>::Run(num_output_pixels,
+                                                         input_depth,
+                                                         depth_multiplier,
+                                                         input_ptr,
+                                                         input_ptr_increment,
+                                                         filter_base_ptr,
+                                                         acc_buffer_ptr);
+    filter_base_ptr += output_depth;
+  }
+}
+
+// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
+inline void FloatDepthwiseConvAccumRowGeneric(
+    int stride, int input_depth, int input_width, const float* input_data,
+    int pad_width, int depth_multiplier, int filter_width,
+    const float* filter_data, int out_x_buffer_start, int out_x_buffer_end,
+    int output_depth, float* acc_buffer) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
+
+  VLOG(1) << "DepthwiseConv2d using slow path with "
+          << "stride = " << stride << ", "
+          << "input_depth = " << input_depth << ", "
+          << "depth_multiplier = " << depth_multiplier << ".";
+
+  const float* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    const int out_x_loop_start = std::max(
+        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end,
+                 (pad_width + input_width - filter_x + stride - 1) / stride);
+
+    float* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const float* input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
+      const float* filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic) {
+        const float input_val = *input_ptr++;
+        for (int m = 0; m < depth_multiplier; m++) {
+          const float filter_val = *filter_ptr++;
+          *acc_buffer_ptr++ += filter_val * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const float* bias_data,
+                                       float* acc_buffer) {
+  // TODO(benoitjacob): This might need optimized specializations
+  // for small output_depth values, if that ever becomes an important
+  // case (like it was for some quantized DepthwiseConv cases).
+  for (int i = 0; i < num_output_pixels; i++) {
+    memcpy(acc_buffer + i * output_depth, bias_data,
+           sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  DCHECK(output_depth == input_depth * depth_multiplier);
+
+  static const int kAccBufferMaxSize = 1024;
+  float acc_buffer[kAccBufferMaxSize];
+  DCHECK_GE(kAccBufferMaxSize, output_depth)
+      << "Too small kAccBufferMaxSize for this model!";
+  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+  DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize);
+  DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
+  DCHECK_GE(kOutputPixelsInAccBuffer, 1);
+
+  // row_accum_func will point to the core accumulation function to be used
+  // for this DepthwiseConv op.
+  auto* row_accum_func = FloatDepthwiseConvAccumRowGeneric;
+
+  const int kMaxFixedDepthMultiplier = 8;
+  int fixed_depth_multiplier = 0;
+  if (depth_multiplier <= kMaxFixedDepthMultiplier) {
+    fixed_depth_multiplier = depth_multiplier;
+  }
+  // kMaxUnrolling is the max number of output values that we aim to handle
+  // in one unrolled iteration of the inner loop. For practical performance
+  // reasons, it is limited by the number of available registers. We could
+  // fine-tune it depending on the architecture, but that's not worth doing
+  // since this whole code is not very optimized to begin with. The
+  // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit
+  // vector registers.
+  const int kMaxUnrolling = 8;
+  int fixed_input_depth = 0;
+  if (fixed_depth_multiplier &&
+      input_depth * fixed_depth_multiplier <= kMaxUnrolling) {
+    fixed_input_depth = input_depth;
+  }
+#define TF_NEON_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+                                         FIXED_DEPTH_MULTIPLIER)           \
+  if ((stride == 1 || ALLOW_STRIDED) &&                                    \
+      fixed_input_depth == FIXED_INPUT_DEPTH &&                            \
+      fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                  \
+    row_accum_func =                                                       \
+        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,       \
+                                   FIXED_DEPTH_MULTIPLIER>;                \
+  }
+
+#ifdef USE_NEON
+  TF_NEON_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+  TF_NEON_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
+  TF_NEON_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+  TF_NEON_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+  TF_NEON_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+#endif  // USE_NEON
+
+#undef TF_NEON_USE_DEPTHWISECONV_KERNEL
+
+  // Now that we have determined row_accum_func, we can start work.
+  float* output_ptr = output_data;
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride) - pad_height;
+      const int filter_y_start = std::max(0, -in_y_origin);
+      const int filter_y_end =
+          std::min(filter_height, input_height - in_y_origin);
+      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+           out_x_buffer_start += kOutputPixelsInAccBuffer) {
+        const int out_x_buffer_end = std::min(
+            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+        // We call a 'pixel' a group of activation that share all but the
+        // 'depth'/'channel' coordinate. num_output_pixels is the number of
+        // output pixels that we will accumulate in this loop iteration.
+        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+        // Initialize our local accumulator with the bias values, so we don't
+        // have to add them later.
+        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
+                                   acc_buffer);
+        // Accumulation loop. Most of the time should be spent in here.
+        for (int filter_y = filter_y_start; filter_y < filter_y_end;
+             ++filter_y) {
+          const int in_y = in_y_origin + filter_y;
+          row_accum_func(stride, input_depth, input_width,
+                         input_data + in_y * input_dims.strides[2] +
+                             b * input_dims.strides[3],
+                         pad_width, depth_multiplier, filter_width,
+                         filter_data + filter_y * filter_dims.strides[2],
+                         out_x_buffer_start, out_x_buffer_end, output_depth,
+                         acc_buffer);
+        }
+        // Finished accumulating. Now store to destination.
+        const int num_output_values = output_depth * num_output_pixels;
+        int i = 0;
+// TODO(benoitjacob) optimized code goes here
+#ifdef USE_NEON
+        // Handle 16 values at a time
+        for (; i <= num_output_values - 16; i += 16) {
+          float32x4_t acc[4];
+          for (int k = 0; k < 4; k++) {
+            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
+          }
+          if (Ac == FusedActivationFunctionType::kRelu) {
+            for (int k = 0; k < 4; k++) {
+              acc[k] = vmaxq_f32(vdupq_n_f32(0.f), acc[k]);
+            }
+          } else if (Ac == FusedActivationFunctionType::kRelu6) {
+            for (int k = 0; k < 4; k++) {
+              acc[k] = vmaxq_f32(vdupq_n_f32(0.f),
+                                 vminq_f32(vdupq_n_f32(6.f), acc[k]));
+            }
+          } else if (Ac == FusedActivationFunctionType::kRelu1) {
+            for (int k = 0; k < 4; k++) {
+              acc[k] = vmaxq_f32(vdupq_n_f32(-1.f),
+                                 vminq_f32(vdupq_n_f32(1.f), acc[k]));
+            }
+          }
+          for (int k = 0; k < 4; k++) {
+            vst1q_f32(output_ptr + 4 * k, acc[k]);
+          }
+          output_ptr += 16;
+        }
+        // Handle 4 values at a time
+        for (; i <= num_output_values - 4; i += 4) {
+          float32x4_t acc = vld1q_f32(acc_buffer + i);
+          if (Ac == FusedActivationFunctionType::kRelu) {
+            acc = vmaxq_f32(vdupq_n_f32(0.f), acc);
+          } else if (Ac == FusedActivationFunctionType::kRelu6) {
+            acc = vmaxq_f32(vdupq_n_f32(0.f), vminq_f32(vdupq_n_f32(6.f), acc));
+          } else if (Ac == FusedActivationFunctionType::kRelu1) {
+            acc =
+                vmaxq_f32(vdupq_n_f32(-1.f), vminq_f32(vdupq_n_f32(1.f), acc));
+          }
+          vst1q_f32(output_ptr, acc);
+          output_ptr += 4;
+        }
+#endif
+        // Handle leftover values, one by one. This is very slow.
+        for (; i < num_output_values; i++) {
+          float acc = acc_buffer[i];
+          if (Ac == FusedActivationFunctionType::kRelu) {
+            acc = std::max(0.f, acc);
+          } else if (Ac == FusedActivationFunctionType::kRelu6) {
+            acc = std::max(0.f, std::min(6.f, acc));
+          } else if (Ac == FusedActivationFunctionType::kRelu1) {
+            acc = std::max(-1.f, std::min(1.f, acc));
+          }
+          *output_ptr++ = acc;
+        }
+      }
+    }
+  }
+}
+
+}  // end namespace neon
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_DEPTHWISECONV_H_
diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..818b44aab3908648b770b34c9bb3c86eca13e7bf
--- /dev/null
+++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
@@ -0,0 +1,203 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <cmath>
+#include <type_traits>
+
+#define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
+#include "public/gemmlowp.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/neon/depthwiseconv_float.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+// A version of tensorflow/core/kernels/depthwise_conv_op.cc that
+// uses the neon intrinsics.
+class NeonDepthwiseConv2dNativeOp : public BinaryOp<float> {
+ public:
+  explicit NeonDepthwiseConv2dNativeOp(OpKernelConstruction* context)
+      : BinaryOp<float>(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, strides_[1] == strides_[2],
+                errors::InvalidArgument(
+                    "Current implementation only supports equal length "
+                    "strides in the row and column dimensions."));
+    OP_REQUIRES(
+        context, (strides_[0] == 1 && strides_[3] == 1),
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    const Tensor& filter = context->input(1);
+
+    // For 2D convolution, there should be 4 dimensions.
+    OP_REQUIRES(context, input.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input.shape().DebugString()));
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    const int32 in_depth = input.dim_size(3);
+    OP_REQUIRES(
+        context, in_depth == filter.dim_size(2),
+        errors::InvalidArgument("input and filter must have the same depth: ",
+                                in_depth, " vs ", filter.dim_size(2)));
+    const int32 batch = input.dim_size(0);
+    const int32 input_rows = input.dim_size(1);
+    const int32 input_cols = input.dim_size(2);
+
+    const int32 filter_rows = filter.dim_size(0);
+    const int32 filter_cols = filter.dim_size(1);
+    const int32 depth_multiplier = filter.dim_size(3);
+
+    const int32 out_depth = in_depth * depth_multiplier;
+
+    const int32 stride = strides_[1];
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_rows, filter_rows, stride,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_cols, filter_cols, stride,
+                                         padding_, &out_cols, &pad_cols));
+    TensorShape out_shape({batch, out_rows, out_cols, out_depth});
+    OP_REQUIRES(
+        context, out_shape.num_elements() <= 2147483647,
+        errors::InvalidArgument("total number of outputs should be within the "
+                                "range of int which is used in the GPU kernel",
+                                in_depth, " vs ", filter.dim_size(2)));
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    VLOG(2) << "NeonDepthwiseConv2dNative: "
+            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
+            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
+            << filter_cols << ", " << in_depth << ", " << depth_multiplier
+            << "]; stride = " << stride << ", pad_rows = " << pad_rows
+            << ", pad_cols = " << pad_cols << ", output: [" << batch << ", "
+            << out_rows << ", " << out_cols << ", " << out_depth << "]";
+
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
+
+    const float* input_ptr = input.template flat<float>().data();
+    const float* filter_ptr = filter.template flat<float>().data();
+    float* output_ptr = output->template flat<float>().data();
+
+    auto input_neon_dims = ToNeonDims(input.shape());
+    auto filter_neon_dims = FilterToNeonDims(filter.shape());
+    auto bias_neon_dims = BiasNeonDims(filter.shape());
+
+    int64 bias_size = bias_neon_dims.sizes[0];
+    float* bias_ptr = static_cast<float*>(port::AlignedMalloc(
+        bias_size * sizeof(float), Allocator::kAllocatorAlignment));
+    memset(bias_ptr, 0, bias_size * sizeof(float));
+
+    neon::DepthwiseConv<neon::FusedActivationFunctionType::kNone>(
+        input_ptr, input_neon_dims, filter_ptr, filter_neon_dims, bias_ptr,
+        bias_neon_dims, stride, pad_cols, pad_rows, depth_multiplier,
+        output_ptr, ToNeonDims(out_shape));
+
+    port::AlignedFree(bias_ptr);
+  }
+
+ private:
+  void SetNeonDimStrides(neon::Dims<4>* d) {
+    int64 stride = 1;
+    for (int i = 0; i < 4; ++i) {
+      d->strides[i] = stride;
+      stride *= d->sizes[i];
+    }
+  }
+
+  neon::Dims<4> ToNeonDims(const TensorShape& input) {
+    // Dims in the neon kernels are channel, x, y, batch order.
+    neon::Dims<4> result;
+    result.sizes[0] = input.dim_size(3);
+    result.sizes[1] = input.dim_size(2);
+    result.sizes[2] = input.dim_size(1);
+    result.sizes[3] = input.dim_size(0);
+    SetNeonDimStrides(&result);
+    return result;
+  }
+
+  neon::Dims<4> FilterToNeonDims(const TensorShape& filter) {
+    // Dims in the neon kernels are channel, x, y, batch order.
+    neon::Dims<4> result;
+    result.sizes[0] = filter.dim_size(2) * filter.dim_size(3);
+    result.sizes[1] = filter.dim_size(1);
+    result.sizes[2] = filter.dim_size(0);
+    result.sizes[3] = 1;
+    SetNeonDimStrides(&result);
+
+    return result;
+  }
+
+  neon::Dims<4> BiasNeonDims(const TensorShape& filter) {
+    // Dims in the neon kernels are channel, x, y, batch order.
+    // Bias has only output channel set.
+    neon::Dims<4> result;
+    result.sizes[0] =
+        filter.dim_size(2) * filter.dim_size(3);  // output channels
+    result.sizes[1] = 1;
+    result.sizes[2] = 1;
+    result.sizes[3] = 1;
+    SetNeonDimStrides(&result);
+
+    return result;
+  }
+
+  std::vector<int32> strides_;
+  Padding padding_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(NeonDepthwiseConv2dNativeOp);
+};
+
+#define REGISTER_CPU_KERNEL(T)                            \
+  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")   \
+                              .Device(DEVICE_CPU)         \
+                              .TypeConstraint<float>("T") \
+                              .Label("neon"),             \
+                          NeonDepthwiseConv2dNativeOp);
+
+TF_CALL_float(REGISTER_CPU_KERNEL);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/neon/types.h b/tensorflow/core/kernels/neon/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..e258ee0dfb053990411a07a9d1fea3e18e04e0ff
--- /dev/null
+++ b/tensorflow/core/kernels/neon/types.h
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
+
+namespace tensorflow {
+namespace neon {
+
+enum class FusedActivationFunctionType { kNone, kRelu6, kRelu1, kRelu };
+
+template <int N>
+struct Dims {
+  int sizes[N];
+  int strides[N];
+};
+
+inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
+  DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
+  DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
+  DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
+  DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
+  return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
+         i3 * dims.strides[3];
+}
+
+// Get array size, DCHECKing that the dim index is in range.
+template <int N>
+int ArraySize(const Dims<N>& array, int index) {
+  DCHECK(index >= 0 && index < N);
+  return array.sizes[index];
+}
+
+// Get common array size, DCHECKing that they all agree.
+template <typename ArrayType1, typename ArrayType2>
+int MatchingArraySize(const ArrayType1& array1, int index1,
+                      const ArrayType2& array2, int index2) {
+  DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
+  return ArraySize(array1, index1);
+}
+
+template <typename ArrayType1, typename ArrayType2, typename... Args>
+int MatchingArraySize(const ArrayType1& array1, int index1,
+                      const ArrayType2& array2, int index2, Args... args) {
+  DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
+  return MatchingArraySize(array1, index1, args...);
+}
+
+inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+  int max_offset = 0;
+  for (int i = 0; i < 4; i++) {
+    max_offset += (dims.sizes[i] - 1) * dims.strides[i];
+  }
+  return max_offset + 1;
+}
+
+}  // end namespace neon
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_NEON_TYPES_H_
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 4d4851c70cb5543df75e2cf46f8fc53f88f3ae7e..9ffe71e031e762b8563877ca846f36833fa1d000 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
@@ -89,6 +90,59 @@ static inline float ComputeIOU(typename TTypes<float, 2>::ConstTensor boxes,
   return intersection_area / (area_i + area_j - intersection_area);
 }
 
+void DoNonMaxSuppressionOp(OpKernelContext* context, const Tensor& boxes,
+                           const Tensor& scores, const Tensor& max_output_size,
+                           const float iou_threshold) {
+  OP_REQUIRES(context, iou_threshold >= 0 && iou_threshold <= 1,
+              errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+
+  int num_boxes = 0;
+  ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
+  if (!context->status().ok()) {
+    return;
+  }
+
+  const int output_size = std::min(max_output_size.scalar<int>()(), num_boxes);
+  typename TTypes<float, 2>::ConstTensor boxes_data = boxes.tensor<float, 2>();
+
+  std::vector<float> scores_data(num_boxes);
+  std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
+  std::vector<int> sorted_indices;
+  DecreasingArgSort(scores_data, &sorted_indices);
+
+  std::vector<bool> active(num_boxes, true);
+  std::vector<int> selected;
+  int num_active = active.size();
+  for (int i = 0; i < num_boxes; ++i) {
+    if (num_active == 0 || selected.size() >= output_size) break;
+    if (active[i]) {
+      selected.push_back(sorted_indices[i]);
+    } else {
+      continue;
+    }
+    for (int j = i + 1; j < num_boxes; ++j) {
+      if (active[j]) {
+        float iou =
+            ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
+        if (iou > iou_threshold) {
+          active[j] = false;
+          num_active--;
+        }
+      }
+    }
+  }
+
+  // Allocate output tensor
+  Tensor* output = nullptr;
+  TensorShape output_shape({static_cast<int>(selected.size())});
+  OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+  typename TTypes<int, 1>::Tensor selected_indices_data =
+      output->tensor<int, 1>();
+  std::copy_n(selected.begin(), selected.size(), selected_indices_data.data());
+}
+
+}  // namespace
+
 template <typename Device>
 class NonMaxSuppressionOp : public OpKernel {
  public:
@@ -98,9 +152,6 @@ class NonMaxSuppressionOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override {
-    OP_REQUIRES(context, iou_threshold_ >= 0 && iou_threshold_ <= 1,
-                errors::InvalidArgument("iou_threshold must be in [0, 1]"));
-
     // boxes: [num_boxes, 4]
     const Tensor& boxes = context->input(0);
     // scores: [num_boxes]
@@ -112,59 +163,48 @@ class NonMaxSuppressionOp : public OpKernel {
         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
                                 max_output_size.shape().DebugString()));
 
-    int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, scores, &num_boxes);
-    if (!context->status().ok()) {
-      return;
-    }
-
-    const int output_size =
-        std::min(max_output_size.scalar<int>()(), num_boxes);
-    typename TTypes<float, 2>::ConstTensor boxes_data =
-        boxes.tensor<float, 2>();
-
-    std::vector<float> scores_data(num_boxes);
-    std::copy_n(scores.flat<float>().data(), num_boxes, scores_data.begin());
-    std::vector<int> sorted_indices;
-    DecreasingArgSort(scores_data, &sorted_indices);
-
-    std::vector<bool> active(num_boxes, true);
-    std::vector<int> selected;
-    int num_active = active.size();
-    for (int i = 0; i < num_boxes; ++i) {
-      if (num_active == 0 || selected.size() >= output_size) break;
-      if (active[i]) {
-        selected.push_back(sorted_indices[i]);
-      } else {
-        continue;
-      }
-      for (int j = i + 1; j < num_boxes; ++j) {
-        if (active[j]) {
-          float iou =
-              ComputeIOU(boxes_data, sorted_indices[i], sorted_indices[j]);
-          if (iou > iou_threshold_) {
-            active[j] = false;
-            num_active--;
-          }
-        }
-      }
-    }
-
-    // Allocate output tensor
-    Tensor* output = nullptr;
-    TensorShape output_shape({static_cast<int>(selected.size())});
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
-    typename TTypes<int, 1>::Tensor selected_indices_data =
-        output->tensor<int, 1>();
-    std::copy_n(selected.begin(), selected.size(),
-                selected_indices_data.data());
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
+                          iou_threshold_);
   }
 
  private:
   float iou_threshold_;
 };
 
+template <typename Device>
+class NonMaxSuppressionV2Op : public OpKernel {
+ public:
+  explicit NonMaxSuppressionV2Op(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // boxes: [num_boxes, 4]
+    const Tensor& boxes = context->input(0);
+    // scores: [num_boxes]
+    const Tensor& scores = context->input(1);
+    // max_output_size: scalar
+    const Tensor& max_output_size = context->input(2);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(max_output_size.shape()),
+        errors::InvalidArgument("max_output_size must be 0-D, got shape ",
+                                max_output_size.shape().DebugString()));
+    // iou_threshold: scalar
+    const Tensor& iou_threshold = context->input(3);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
+
+    const float iou_threshold_val = iou_threshold.scalar<float>()();
+
+    DoNonMaxSuppressionOp(context, boxes, scores, max_output_size,
+                          iou_threshold_val);
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppression").Device(DEVICE_CPU),
                         NonMaxSuppressionOp<CPUDevice>);
 
+REGISTER_KERNEL_BUILDER(Name("NonMaxSuppressionV2").Device(DEVICE_CPU),
+                        NonMaxSuppressionV2Op<CPUDevice>);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index 72e368db77321e56dad780673de12078013953bb..55672ec83fb96b73834a18021e914de205f4e6a0 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -144,8 +144,8 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
-      << s;
+              StringPiece(s.ToString()).contains("scores has incompatible shape"))
+    << s;
 }
 
 TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
@@ -157,8 +157,8 @@ TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
-      << s;
+              StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+    << s;
 }
 
 TEST_F(NonMaxSuppressionOpTest, TestEmptyInput) {
@@ -173,4 +173,167 @@ TEST_F(NonMaxSuppressionOpTest, TestEmptyInput) {
   test::ExpectTensorEqual<int>(expected, *GetOutput(0));
 }
 
+//
+// NonMaxSuppressionV2Op Tests
+//
+
+class NonMaxSuppressionV2OpTest : public OpsTestBase {
+ protected:
+  void MakeOp() {
+    TF_EXPECT_OK(NodeDefBuilder("non_max_suppression_op", "NonMaxSuppressionV2")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+  }
+};
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest,
+       TestSelectFromThreeClustersFlippedCoordinates) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({6, 4}),
+                           {1, 1,  0, 0,  0, 0.1f,  1, 1.1f,  0, .9f, 1, -0.1f,
+                            0, 10, 1, 11, 1, 10.1f, 0, 11.1f, 1, 101, 0, 100});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectAtMostTwoBoxesFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {2});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected, {3, 0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest,
+       TestSelectAtMostThirtyBoxesFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({6}), {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({3}));
+  test::FillValues<int>(&expected, {3, 0, 5});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectSingleBox) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<float>(TensorShape({1}), {.9f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected, {0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestSelectFromTenIdenticalBoxes) {
+  MakeOp();
+
+  int num_boxes = 10;
+  std::vector<float> corners(num_boxes * 4);
+  std::vector<float> scores(num_boxes);
+  for (int i = 0; i < num_boxes; ++i) {
+    corners[i * 4 + 0] = 0;
+    corners[i * 4 + 1] = 0;
+    corners[i * 4 + 2] = 1;
+    corners[i * 4 + 3] = 1;
+    scores[i] = .9;
+  }
+  AddInputFromArray<float>(TensorShape({num_boxes, 4}), corners);
+  AddInputFromArray<float>(TensorShape({num_boxes}), scores);
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected, {0});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestInconsistentBoxAndScoreShapes) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({6, 4}),
+      {0, 0,  1, 1,  0, 0.1f,  1, 1.1f,  0, -0.1f, 1, 0.9f,
+       0, 10, 1, 11, 0, 10.1f, 1, 11.1f, 0, 100,   1, 101});
+  AddInputFromArray<float>(TensorShape({5}), {.9f, .75f, .6f, .95f, .5f});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  Status s = RunOpKernel();
+
+  ASSERT_FALSE(s.ok());
+  EXPECT_TRUE(
+      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      << s;
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestInvalidIOUThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<float>(TensorShape({1}), {.9f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {1.2f});
+  Status s = RunOpKernel();
+
+  ASSERT_FALSE(s.ok());
+  EXPECT_TRUE(
+      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      << s;
+}
+
+TEST_F(NonMaxSuppressionV2OpTest, TestEmptyInput) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({0, 4}), {});
+  AddInputFromArray<float>(TensorShape({0}), {});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected(allocator(), DT_INT32, TensorShape({0}));
+  test::FillValues<int>(&expected, {});
+  test::ExpectTensorEqual<int>(expected, *GetOutput(0));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/padded_batch_dataset_op.cc b/tensorflow/core/kernels/padded_batch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0c000dd25f87e35947c9e25362a82d35c58bbd0
--- /dev/null
+++ b/tensorflow/core/kernels/padded_batch_dataset_op.cc
@@ -0,0 +1,379 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_util.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+// The following five functions are copied from padding_fifo_queue.cc.
+// TODO(mrry): Reconcile these functions with the similar methods in the
+// queue implementation.
+Status ValidateElementToLargerSlice(const Tensor& element, Tensor* parent) {
+  DCHECK_NE(parent->dim_size(0), 0);
+  if (element.NumElements() > (parent->NumElements() / parent->dim_size(0))) {
+    TensorShape chip_shape = parent->shape();
+    chip_shape.RemoveDim(0);
+    return errors::Internal(
+        "HandleElementToLargerSlice Cannot copy slice: number of entries in "
+        "element is greater than number of elements in parent slice.  ",
+        "Shapes are: [element]: ", element.shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
+  }
+  return Status::OK();
+}
+
+template <typename T, int NDIMS>
+Status HandleElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                  int index) {
+  TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(element, parent));
+  if (element.NumElements() == 0) {
+    return Status::OK();
+  }
+  auto element_t = element.tensor<T, NDIMS>();
+  auto parent_t = parent->tensor<T, NDIMS + 1>();
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_indices;
+  slice_indices[0] = index;
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS + 1> slice_size;
+  slice_size[0] = 1;
+  for (size_t i = 1; i < slice_size.size(); ++i) {
+    slice_size[i] = element_t.dimension(i - 1);
+  }
+  parent_t.slice(slice_indices, slice_size) = element_t.reshape(slice_size);
+  return Status::OK();
+}
+
+template <int NDIMS>
+Status HandleElementToLargerSliceWithRank(const Tensor& element, Tensor* parent,
+                                          int index) {
+#define HANDLE_TYPE(T)                                                   \
+  case DataTypeToEnum<T>::value: {                                       \
+    return HandleElementToLargerSlice<T, NDIMS>(element, parent, index); \
+  }
+
+  switch (element.dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented(
+          "HandleElementToLargerSliceWithRank Unhandled data type: ",
+          element.dtype());
+  }
+}
+
+Status CopyElementToLargerSlice(const Tensor& element, Tensor* parent,
+                                int index) {
+  if (parent->dims() != element.dims() + 1) {
+    return errors::Internal(
+        "Mismatched ranks.  Element's rank is: ", element.dims(),
+        " but element is meant to be a slice in output Tensor having rank: ",
+        parent->dims(), " (should be: ", element.dims() + 1, ")");
+  }
+
+#define HANDLE_DIMS(NDIMS)                                                  \
+  case NDIMS: {                                                             \
+    TF_RETURN_IF_ERROR(                                                     \
+        HandleElementToLargerSliceWithRank<NDIMS>(element, parent, index)); \
+    return Status::OK();                                                    \
+  }
+
+  switch (element.dims()) {
+    HANDLE_DIMS(0);
+    HANDLE_DIMS(1);
+    HANDLE_DIMS(2);
+    HANDLE_DIMS(3);
+    HANDLE_DIMS(4);
+#undef HANDLE_DIMS
+    default:
+      return errors::Unimplemented("CopyElementToLargerSlice Unhandled rank: ",
+                                   element.dims());
+  }
+}
+
+Status SetElementZero(Tensor* element, const Tensor& padding) {
+#define HANDLE_TYPE(T)                                     \
+  if (element->dtype() == DataTypeToEnum<T>::value) {      \
+    element->flat<T>().setConstant(padding.scalar<T>()()); \
+    return Status::OK();                                   \
+  }
+  TF_CALL_ALL_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+  return errors::Unimplemented("SetElementZero Unhandled data type: ",
+                               element->dtype());
+}
+
+class PaddedBatchDatasetOp : public OpKernel {
+ public:
+  explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new BatchDatasetOp::Dataset, insert it in the step-local
+    // container, and return it as the output.
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    const Tensor* batch_size_t;
+    OP_REQUIRES_OK(ctx, ctx->input("batch_size", &batch_size_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(batch_size_t->shape()),
+                errors::InvalidArgument("batch_size must be a scalar"));
+    const int64 batch_size = batch_size_t->flat<int64>()(0);
+    OP_REQUIRES(
+        ctx, batch_size > 0,
+        errors::InvalidArgument("Batch size must be greater than zero."));
+
+    OpInputList padded_shape_tensors;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("padded_shapes", &padded_shape_tensors));
+    std::vector<PartialTensorShape> padded_shapes;
+    padded_shapes.reserve(padded_shape_tensors.size());
+    OP_REQUIRES(ctx,
+                padded_shape_tensors.size() == input->output_shapes().size(),
+                errors::InvalidArgument("Number of padded shapes (",
+                                        padded_shape_tensors.size(),
+                                        ") must match the number of components "
+                                        "in the input dataset's elements (",
+                                        input->output_shapes().size(), ")"));
+    for (const Tensor& padded_shape_t : padded_shape_tensors) {
+      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()),
+                  errors::InvalidArgument("All padded shapes must be vectors"));
+      PartialTensorShape padded_shape;
+      OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape(
+                              padded_shape_t.vec<int64>().data(),
+                              padded_shape_t.NumElements(), &padded_shape));
+      padded_shapes.push_back(std::move(padded_shape));
+    }
+    OpInputList padding_values_list;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input_list("padding_values", &padding_values_list));
+    std::vector<Tensor> padding_values;
+    OP_REQUIRES(ctx,
+                padding_values_list.size() == input->output_shapes().size(),
+                errors::InvalidArgument(
+                    "Number of padding values (", padding_values_list.size(),
+                    ") must match the number of components in the input "
+                    "dataset's elements (",
+                    input->output_shapes().size(), ")"));
+    for (int i = 0; i < padding_values_list.size(); ++i) {
+      const Tensor& padding_value_t = padding_values_list[i];
+      OP_REQUIRES(
+          ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()),
+          errors::InvalidArgument("All padding values must be scalars"));
+      OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i],
+                  errors::InvalidArgument(
+                      "Mismatched type between padding value ", i,
+                      " and input dataset's component ", i, ": ",
+                      DataTypeString(padding_value_t.dtype()), " vs. ",
+                      DataTypeString(input->output_dtypes()[i])));
+      padding_values.push_back(tensor::DeepCopy(padding_value_t));
+    }
+
+    DatasetBase* dataset = new Dataset(batch_size, std::move(padded_shapes),
+                                       std::move(padding_values), input);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(int64 batch_size, std::vector<PartialTensorShape> padded_shapes,
+            std::vector<Tensor> padding_values, const DatasetBase* input)
+        : batch_size_(batch_size),
+          padded_shapes_(std::move(padded_shapes)),
+          padding_values_(std::move(padding_values)),
+          input_(input) {
+      input_->Ref();
+
+      // NOTE(mrry): Currently we implement "batch up to"
+      // semantics. If we could tell statically that the input dataset
+      // is infinite, then we could always report `batch_size` as the
+      // 0th dimension.
+      // TODO(mrry): Need to validate that the input shape and the
+      // padded shape are "compatible" (i.e. that padded shape is >=
+      // input shape, with both static and dynamic checks as appropriate).
+      const auto& input_shapes = input_->output_shapes();
+      output_shapes_.reserve(input_shapes.size());
+      for (size_t i = 0; i < input_shapes.size(); ++i) {
+        output_shapes_.push_back(
+            PartialTensorShape({-1}).Concatenate(padded_shapes_[i]));
+      }
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("PaddedBatchDatasetOp(", batch_size_,
+                             ")::Dataset");
+    }
+
+   private:
+    // Copies element into the index^th slice of parent (in the 0th dimension).
+    //
+
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        // Each row of `batch_elements` is a tuple of tensors from the
+        // input iterator.
+        std::vector<std::vector<Tensor>> batch_elements;
+        batch_elements.reserve(dataset()->batch_size_);
+        {
+          mutex_lock l(mu_);
+          *end_of_sequence = false;
+          for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence;
+               ++i) {
+            std::vector<Tensor> batch_element_tuple;
+            TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
+                                                    end_of_sequence));
+            if (!*end_of_sequence) {
+              batch_elements.push_back(std::move(batch_element_tuple));
+            }
+          }
+        }
+
+        if (batch_elements.empty()) {
+          DCHECK(*end_of_sequence);
+          return Status::OK();
+        }
+
+        // Copy the retrieved batch elements into one output tensor
+        // per tuple component.
+        // NOTE(mrry): If the input or output sizes are statically
+        // known, we could potentially read the input values in-place
+        // into their respective slice locations. This would require a
+        // different GetNext() overload that supports zero-copy, and might
+        // make sense in an optimization pass.
+        const size_t num_tuple_components = batch_elements[0].size();
+        const int64 num_batch_elements = batch_elements.size();
+        for (size_t component_index = 0; component_index < num_tuple_components;
+             ++component_index) {
+          // 1. Determine the shape of the padded tensor.
+          TensorShape batch_component_shape({num_batch_elements});
+          const PartialTensorShape& padded_shape =
+              dataset()->padded_shapes_[component_index];
+
+          for (int dim = 0; dim < padded_shape.dims(); ++dim) {
+            if (padded_shape.dim_size(dim) == -1) {
+              batch_component_shape.AddDim(0);
+            } else {
+              batch_component_shape.AddDim(padded_shape.dim_size(dim));
+            }
+          }
+
+          for (int64 i = 0; i < num_batch_elements; ++i) {
+            const TensorShape& element_shape =
+                batch_elements[i][component_index].shape();
+            // TODO(mrry): Perform this check in the shape function if
+            // enough static information is available to do so.
+            if (element_shape.dims() != padded_shape.dims()) {
+              return errors::InvalidArgument(
+                  "All elements in a batch must have the same rank as the "
+                  "padded shape for component",
+                  component_index, ": expected rank ", padded_shape.dims(),
+                  " but got element with rank ", element_shape.dims());
+            }
+            for (int dim = 0; dim < padded_shape.dims(); ++dim) {
+              if (padded_shape.dim_size(dim) == -1) {
+                // Take the max of all batch elements in this dimension.
+                if (batch_elements[i][component_index].shape().dim_size(dim) >
+                    batch_component_shape.dim_size(dim + 1)) {
+                  batch_component_shape.set_dim(
+                      dim + 1,
+                      batch_elements[i][component_index].shape().dim_size(dim));
+                }
+              } else {
+                if (batch_elements[i][component_index].shape().dim_size(dim) >
+                    batch_component_shape.dim_size(dim + 1)) {
+                  return errors::DataLoss(
+                      "Attempted to pad to a smaller size than the input "
+                      "element.");
+                }
+              }
+            }
+          }
+
+          // 2. Copy each batch element to the appropriate location in
+          // the output component tensor.
+          Tensor batch_component(cpu_allocator(),
+                                 output_dtypes()[component_index],
+                                 batch_component_shape);
+          TF_RETURN_IF_ERROR(SetElementZero(
+              &batch_component, dataset()->padding_values_[component_index]));
+
+          // Build the output tuple component by copying one slice
+          // from each input element in the batch.
+          for (int64 i = 0; i < num_batch_elements; ++i) {
+            TF_RETURN_IF_ERROR(ValidateElementToLargerSlice(
+                batch_elements[i][component_index], &batch_component));
+
+            TF_RETURN_IF_ERROR(CopyElementToLargerSlice(
+                batch_elements[i][component_index], &batch_component, i));
+          }
+          out_tensors->push_back(std::move(batch_component));
+        }
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const int64 batch_size_;
+    const std::vector<PartialTensorShape> padded_shapes_;
+    const std::vector<Tensor> padding_values_;
+    const DatasetBase* const input_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("PaddedBatchDataset").Device(DEVICE_CPU),
+                        PaddedBatchDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/parallel_map_dataset_op.cc b/tensorflow/core/kernels/parallel_map_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..93ed644d72d7cfeacf77c5c765b9e1e662655db8
--- /dev/null
+++ b/tensorflow/core/kernels/parallel_map_dataset_op.cc
@@ -0,0 +1,347 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <deque>
+
+#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/random.h"
+
+#include "tensorflow/core/kernels/captured_function.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ParallelMapDatasetOp : public OpKernel {
+ public:
+  explicit ParallelMapDatasetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("other_arguments", &inputs));
+    std::vector<Tensor> other_arguments;
+    other_arguments.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      other_arguments.push_back(t);
+    }
+
+    const Tensor* num_threads_t;
+    OP_REQUIRES_OK(ctx, ctx->input("num_threads", &num_threads_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(num_threads_t->shape()),
+                errors::InvalidArgument("num_threads must be a scalar"));
+    const int32 num_threads = num_threads_t->flat<int32>()(0);
+    OP_REQUIRES(
+        ctx, num_threads > 0,
+        errors::InvalidArgument("num_threads must be greater than zero."));
+
+    const Tensor* output_buffer_size_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input("output_buffer_size", &output_buffer_size_t));
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(output_buffer_size_t->shape()),
+        errors::InvalidArgument("output_buffer_size must be a scalar."));
+    const int64 output_buffer_size = output_buffer_size_t->flat<int64>()(0);
+
+    // TODO(mrry): Relax this requirement? If the output buffer owns
+    // the (tuples of) tensors into which `f` writes its output, it
+    // seems like this constraint would make it easier to (i)
+    // constrain the memory usage of the iterator, and (ii) enforce a
+    // consistent ordering between input and output.
+    OP_REQUIRES(ctx, output_buffer_size >= num_threads,
+                errors::InvalidArgument(
+                    "output_buffer_size (", output_buffer_size,
+                    ") must be greater than or equal to num_threads (",
+                    num_threads, ")."));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(ctx, func_, graph_def_version_,
+                                                 std::move(other_arguments),
+                                                 &captured_func));
+
+    // TODO(mrry): It seems unnatural to capture the params from *this
+    // kernel's* OpKernelContext, although the captured values should
+    // be the same for any kernel in the same session. Consider adding
+    // an IteratorContext* argument to Dataset::MakeIterator(), and
+    // threading the context information through that
+    // way. Alternatively, provide a session-scoped context that will
+    // provide this information to all users in the same session (and
+    // that will have the appropriate lifetime).
+    IteratorContext::Params params;
+    params.env = ctx->env();
+    params.resource_manager = ctx->resource_manager();
+    params.runner = *(ctx->runner());
+
+    DatasetBase* dataset =
+        new Dataset(input, num_threads, output_buffer_size, std::move(params),
+                    output_types_, output_shapes_, std::move(captured_func));
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(const DatasetBase* input, int32 num_threads,
+            int64 output_buffer_size, IteratorContext::Params ctx_params,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            std::unique_ptr<CapturedFunction> captured_func)
+        : input_(input),
+          num_threads_(num_threads),
+          output_buffer_size_(output_buffer_size),
+          ctx_params_(std::move(ctx_params)),
+          output_types_(output_types),
+          output_shapes_(output_shapes),
+          captured_func_(std::move(captured_func)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "ParallelMapDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            iter_ctx_(dataset->ctx_params_),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      ~Iterator() override {
+        // Signal the mapper threads, if any, so that they terminate.
+        // We will then join those threads when we delete
+        // `this->mapper_threads_`.
+        //
+        // TODO(mrry): Replace this cancellation logic with a
+        // CancellationManager. The syntax would be more heavyweight,
+        // but it would be possible to thread a cancellation manager
+        // through the IteratorContext to upstream,
+        // potentially-blocking iterators, when we add these.
+        {
+          mutex_lock l(output_mu_);
+          cancelled_ = true;
+          cond_var_.notify_all();
+        }
+      }
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(output_mu_);
+        TF_RETURN_IF_ERROR(EnsureMapperThreadsStarted(ctx));
+
+        while (true) {
+          // 1. Wait until the next element in the output queue has
+          // been produced, or we are shutting down.
+          while (
+              !cancelled_ && active_threads_ > 0 &&
+              (output_buffer_.empty() || !output_buffer_.front().is_produced)) {
+            cond_var_.wait(l);
+          }
+
+          if (cancelled_) {
+            return errors::Cancelled(
+                "ParallelMapDatasetOp::Dataset::Iterator::GetNext");
+          }
+
+          if (!output_buffer_.empty() && output_buffer_.front().is_produced) {
+            // A new output element is available. Forward the status
+            // from computing it, and (if we successfully got an
+            // element) the output values.
+            Status s = output_buffer_.front().output_status;
+            if (s.ok()) {
+              *out_tensors = std::move(output_buffer_.front().output_value);
+            }
+            output_buffer_.pop_front();
+            *end_of_sequence = false;
+
+            // Wake one of the producing threads, in case they have been
+            // waiting for space in the queue.
+            cond_var_.notify_one();
+            return s;
+          } else if (active_threads_ == 0) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+        }
+      }
+
+     private:
+      // An output queue element comprises a bool (which indicates
+      // whether the element has been produced yet) and a vector of
+      // tensors (which contains the tuple of tensors if the bool is
+      // true).
+      struct OutputQueueElement {
+        // The producer must set `is_produced` to `true` after
+        // `output_status` or `output_value` has been written.
+        bool is_produced = false;
+        // The producer sets `output_status` if either getting the
+        // input element or applying the mapper function to it fails.
+        Status output_status;
+        // The mapped data element.
+        std::vector<Tensor> output_value;
+      };
+
+      Status EnsureMapperThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(output_mu_) {
+        if (mapper_threads_.empty()) {
+          // Choose a step ID that is guaranteed not to clash with any
+          // Session-generated step ID. DirectSession only generates
+          // non-negative step IDs (contiguous, starting from 0), and
+          // MasterSession generates 56-bit random step IDs whose MSB
+          // is always 0, so a negative random step ID should suffice.
+          f_opts_.step_id = -std::abs(static_cast<int64>(random::New64()));
+          f_opts_.runner = iter_ctx_.runner();
+
+          active_threads_ = dataset()->num_threads_;
+          for (int i = 0; i < dataset()->num_threads_; ++i) {
+            mapper_threads_.emplace_back(
+                std::unique_ptr<Thread>(ctx->env()->StartThread(
+                    {}, "mapper_thread", [this]() { MapperThread(); })));
+          }
+        }
+        return Status::OK();
+      }
+
+      void MapperThread() {
+        while (true) {
+          OutputQueueElement* output_queue_element_;
+
+          std::vector<Tensor> input_args;
+          std::vector<Tensor> output_value;
+
+          Status s;
+
+          // 1. Acquire a slot in the output queue and a corresponding input
+          // element.
+          {
+            // First acquire the input lock. Only one MapperThread may
+            // call GetNext() on the input iterator at a time, to
+            // preserve the ordering of elements.
+            mutex_lock input_lock(input_mu_);
+            {
+              // This MapperThread is now responsible for producing
+              // the next element in the output queue. We acquire a
+              // slot in the output queue atomically, which may block,
+              // but we deliberately do not release input_mu_ to
+              // prevent another MapperThread from overtaking us.
+              mutex_lock output_lock(output_mu_);
+              while (!cancelled_ &&
+                     output_buffer_.size() == dataset()->output_buffer_size_) {
+                cond_var_.wait(output_lock);
+              }
+
+              if (cancelled_) {
+                --active_threads_;
+                return;
+              }
+
+              output_buffer_.push_back(OutputQueueElement());
+              output_queue_element_ = &output_buffer_.back();
+            }
+
+            bool end_of_sequence;
+            s = input_impl_->GetNext(&iter_ctx_, &input_args, &end_of_sequence);
+            if (s.ok() && end_of_sequence) {
+              mutex_lock output_lock(output_mu_);
+              --active_threads_;
+              if (active_threads_ == 0) {
+                cond_var_.notify_all();
+              }
+              return;
+            }
+          }
+
+          if (s.ok()) {
+            s = dataset()->captured_func_->Run(f_opts_, input_args,
+                                               &output_value);
+          }
+
+          // 3. Signal that the element has been produced.
+          {
+            mutex_lock output_lock(output_mu_);
+            output_queue_element_->output_status.Update(s);
+            output_queue_element_->is_produced = true;
+            std::swap(output_queue_element_->output_value, output_value);
+            cond_var_.notify_all();
+          }
+        }
+      }
+
+      IteratorContext iter_ctx_;
+      mutex input_mu_;
+      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(input_mu_);
+      FunctionLibraryRuntime::Options f_opts_;
+      mutex output_mu_;
+      condition_variable cond_var_;
+      std::deque<OutputQueueElement> output_buffer_ GUARDED_BY(output_mu_);
+      std::vector<std::unique_ptr<Thread>> mapper_threads_
+          GUARDED_BY(output_mu_);
+      bool cancelled_ GUARDED_BY(output_mu_) = false;
+      int32 active_threads_ GUARDED_BY(output_mu_);
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList func_;
+    const int32 num_threads_;
+    const int64 output_buffer_size_;
+    const IteratorContext::Params ctx_params_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  const NameAttrList* func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ParallelMapDataset").Device(DEVICE_CPU),
+                        ParallelMapDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 44861d9595dbabcbbef6507b740fda7a7f7128ab..538dca24ae630c9b9d13058e4781bf7d2f6a3dde 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -580,8 +580,7 @@ struct LaunchMaxPooling3dGradGradOp<CPUDevice, T> {
         *(context->device()->tensorflow_cpu_worker_threads());
 
     auto shard = [&params, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat](
-        int64 start, int64 limit) {
-
+                     int64 start, int64 limit) {
       const int32 depth = params.depth;
       const int32 in_planes = params.tensor_in_planes;
       const int32 in_rows = params.tensor_in_rows;
@@ -682,10 +681,9 @@ class MaxPooling3dGradGradOp : public OpKernel {
                     "Pooling is not yet supported on the batch dimension."));
     const int32 ksize_c = GetTensorDim(ksize_, data_format_, 'C');
     const int32 stride_c = GetTensorDim(stride_, data_format_, 'C');
-    OP_REQUIRES(
-        context, ksize_c == 1 && stride_c == 1,
-        errors::Unimplemented(
-            "MaxPooling3dGradGrad is not yet supported on the depth dimension."));
+    OP_REQUIRES(context, ksize_c == 1 && stride_c == 1,
+                errors::Unimplemented("MaxPooling3dGradGrad is not yet "
+                                      "supported on the depth dimension."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -703,7 +701,7 @@ class MaxPooling3dGradGradOp : public OpKernel {
         context, out_grad_backprop.dims() == 5,
         errors::InvalidArgument("out_grad_backprop must be 5-dimensional"));
 
-    Pool3dParameters params{context, ksize_, stride_,
+    Pool3dParameters params{context,  ksize_,       stride_,
                             padding_, data_format_, tensor_in.shape()};
 
     Tensor* output = nullptr;
@@ -736,12 +734,11 @@ class MaxPooling3dGradGradOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                                 \
       Name("AvgPool3D").Device(DEVICE_##D).TypeConstraint<T>("T"),         \
       Pooling3DOp<D##Device, T, AVG>);                                     \
-  REGISTER_KERNEL_BUILDER(                                                 \
-      Name("AvgPool3DGrad")                                                \
-          .Device(DEVICE_##D)                                              \
-          .TypeConstraint<T>("T")                                          \
-          .HostMemory("orig_input_shape"),                                 \
-      AvgPooling3dGradOp<D##Device, T>);
+  REGISTER_KERNEL_BUILDER(Name("AvgPool3DGrad")                            \
+                              .Device(DEVICE_##D)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .HostMemory("orig_input_shape"),             \
+                          AvgPooling3dGradOp<D##Device, T>);
 
 #define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T)
 TF_CALL_float(REGISTER_CPU_KERNELS);
@@ -835,8 +832,7 @@ struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
 };
 
 #define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T)
-TF_CALL_float(REGISTER_GPU_KERNELS)
-TF_CALL_half(REGISTER_GPU_KERNELS)
+TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
 #undef REGISTER_GPU_KERNELS
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc b/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc
index 08af188b282cc62243de8b87fe16d8f8f430ee8e..341a43c368e66ee4e1b60a0adb3a362abc3cbd26 100644
--- a/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d_gpu.cu.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
-#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 #include "tensorflow/core/util/tensor_format.h"
 
@@ -159,12 +159,11 @@ bool MaxPool3dGradBackward<T>::operator()(
         bottom_diff);
   }
   return d.ok();
-};
+}
 
 }  // namespace functor
 
-#define DEFINE_GPU_SPECS(T) \
-  template struct functor::MaxPool3dGradBackward<T>;
+#define DEFINE_GPU_SPECS(T) template struct functor::MaxPool3dGradBackward<T>;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 #undef DEFINE_GPU_SPECS
 
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 9e7314cad5ee141deda1db5bd0e84be257d05978..37747a31999504d7a65efb5778614fa0ed5c52df 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -373,7 +373,7 @@ void DnnPoolingGradOp<T>::Compute(
   }
 }
 
-#define DEFINE_DNN_OPS(T)       \
+#define DEFINE_DNN_OPS(T)         \
   template class DnnPoolingOp<T>; \
   template class DnnPoolingGradOp<T>;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)
diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h
index 91dc52dc7dd637fb97dea2b50dc342ea033224b8..dae40c4678bc81bec2f1aac0264d4cb6e82165ee 100644
--- a/tensorflow/core/kernels/quantization_utils.h
+++ b/tensorflow/core/kernels/quantization_utils.h
@@ -79,7 +79,13 @@ float QuantizedToFloat(T input, float range_min, float range_max) {
   const int64 lowest_quantized =
       static_cast<int64>(Eigen::NumTraits<T>::lowest());
   const double offset_input = static_cast<double>(input) - lowest_quantized;
-  const double result = range_min + (offset_input * range_scale);
+  // For compatibility with DEQUANTIZE_WITH_EIGEN, we should convert
+  // range_scale to a float, otherwise range_min_rounded might be slightly
+  // different.
+  const double range_min_rounded =
+      round(range_min / static_cast<float>(range_scale)) *
+      static_cast<float>(range_scale);
+  const double result = range_min_rounded + (offset_input * range_scale);
   return static_cast<float>(result);
 }
 
@@ -113,8 +119,8 @@ void QuantizationRangeForMultiplication(float min_a, float max_a, float min_b,
 // input_array is an eigen Tensor.  q2f is a QuantizedToFloatStruct.
 // This evaluates to an eigen tensor expression, to be used like:
 // auto tensor = DEQUANTIZE_WITH_EIGEN(input_tensor, q2f);
-#define DEQUANTIZE_WITH_EIGEN(input_array, q2f)                 \
-  ((q2f.range_min - q2f.lowest_quantized() * q2f.range_scale) + \
+#define DEQUANTIZE_WITH_EIGEN(input_array, q2f)                         \
+  ((q2f.range_min_rounded - q2f.lowest_quantized() * q2f.range_scale) + \
    input_array.template cast<float>() * q2f.range_scale)
 
 // input_array is an eigen Tensor.  f2q is a FloatToQuantizedStruct.
@@ -142,10 +148,14 @@ struct QuantizedToFloatStruct {
 
   QuantizedToFloatStruct(float range_min, float range_max)
       : range_min(range_min),
-        range_scale((range_max - range_min) / (number_of_steps - 1.0)) {}
+        range_scale((range_max - range_min) / (number_of_steps - 1.0)),
+        range_min_rounded(range_max == range_min
+                              ? range_min
+                              : round(range_min / range_scale) * range_scale) {}
 
   const float range_min;
   const float range_scale;
+  const float range_min_rounded;
 };
 
 // For use with QUANTIZE_WITH_EIGEN.
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index 845660474050329db3e7f79f2902960fe2c69aa7..0c23c0586c62e20e1e06b142278bf7fb56f347d9 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -355,6 +355,20 @@ TEST_F(QuantizationUtilsTest, AvoidBias) {
     const int back_to_int = FloatToQuantized<quint8>(as_float, 0.0f, 2.0f);
     EXPECT_EQ(i, back_to_int);
   }
+
+  // All perfectly representable floats should survive quantization, even
+  // if we pick a range where min is not itself perfectly representable.
+  const float min = -0.1375f;
+  const float max = 1.1385f;
+  const float step_size = (max - min) / 255.0f;
+  const float tolerance = step_size / 1000.0f;
+  // This is the smallest perfectly representable float in the range.
+  float first_float = ceil(min / step_size) * step_size;
+  for (float f = first_float; f <= max; f += step_size) {
+    const int as_int = FloatToQuantized<quint8>(f, min, max);
+    const float back_to_float = QuantizedToFloat<quint8>(as_int, min, max);
+    EXPECT_NEAR(f, back_to_float, tolerance);
+  }
 }
 
 TEST_F(QuantizationUtilsTest, RequantizeInNewRange) {
@@ -394,11 +408,16 @@ TEST_F(QuantizationUtilsTest, RequantizeInNewRange) {
 }
 
 TEST_F(QuantizationUtilsTest, RequantizeInNewRangeRealData) {
-  const float value_as_float = -0.290169f;
   const float input_min = -0.739539f;
   const float input_max = 0.641057f;
   const float output_min = -2381.49f;
   const float output_max = 2207.6f;
+
+  // Start with a value that can be perfectly represented in 8 bits. This
+  // ensures minimal quantization error, and allows us to use EXPECT_LT below.
+  const float value_as_float =
+      QuantizedToFloat<quint8>(83, input_min, input_max);
+
   const quint8 value_as_quint8 =
       FloatToQuantized<quint8>(value_as_float, input_min, input_max);
   EXPECT_EQ(quint8(83), value_as_quint8);
diff --git a/tensorflow/core/kernels/quantize_op.cc b/tensorflow/core/kernels/quantize_op.cc
index 7b34c32cebd60e93d40259107a9e6a99dc43e01e..f649287fc1dda9e17439e517b52b1a5bb5c3ac37 100644
--- a/tensorflow/core/kernels/quantize_op.cc
+++ b/tensorflow/core/kernels/quantize_op.cc
@@ -86,6 +86,7 @@ class QuantizeV2Op : public OpKernel {
                                                   fabsf(input_max_range))) /
                           100.0f;
     max_range = std::max(input_max_range, min_range + epsilon);
+    max_range = std::max(0.0f, max_range);
 
     Tensor* output = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc
index 41996852f165ad408000516baf05f8215567340f..48bde3b497176c013cb4aff04780db9f3b3c5582 100644
--- a/tensorflow/core/kernels/quantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_op_test.cc
@@ -132,6 +132,50 @@ TEST_F(QuantizedOpTest, QuantizeV2EqualRange) {
   EXPECT_LT(0.0f, output_max);
 }
 
+TEST_F(QuantizedOpTest, QuantizeV2MovesMinToIncludeZero) {
+  TF_ASSERT_OK(NodeDefBuilder("quantize_op", "QuantizeV2")
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("mode", "MIN_FIRST")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({3}), {0.1, 0.2, 0.3});
+  AddInputFromArray<float>(TensorShape({1}), {0.1});
+  AddInputFromArray<float>(TensorShape({1}), {0.3});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_QUINT8, TensorShape({3}));
+  test::FillValues<quint8>(&expected, {85, 170, 255});
+  test::ExpectTensorEqual<quint8>(expected, *GetOutput(0));
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  EXPECT_NEAR(0.0f, output_min, 1e-5f);
+  EXPECT_NEAR(0.3f, output_max, 1e-5f);
+}
+
+TEST_F(QuantizedOpTest, QuantizeV2MovesMaxToIncludeZero) {
+  TF_ASSERT_OK(NodeDefBuilder("quantize_op", "QuantizeV2")
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("mode", "MIN_FIRST")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({3}), {-0.1, -0.2, -0.3});
+  AddInputFromArray<float>(TensorShape({1}), {-0.3});
+  AddInputFromArray<float>(TensorShape({1}), {-0.1});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_QUINT8, TensorShape({3}));
+  test::FillValues<quint8>(&expected, {170, 85, 0});
+  test::ExpectTensorEqual<quint8>(expected, *GetOutput(0));
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  EXPECT_NEAR(-0.3f, output_min, 1e-5f);
+  EXPECT_NEAR(0.0f, output_max, 1e-5f);
+}
+
 TEST_F(QuantizedOpTest, Dequantize) {
   TF_ASSERT_OK(NodeDefBuilder("dequantize_op", "Dequantize")
                    .Input(FakeInput(DT_QUINT8))
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index afa1f65aefaadbcaee05f6e6d080ae10f6cc2f20..56a7e161df442de2a00441807ff10d2b759c4479 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -233,9 +233,9 @@ class Im2ColConvFunctor {
     int filter_top_offset;
     if (padding == VALID) {
       filter_left_offset =
-          ((output_width - 1) * stride + filter_width - input_width) / 2;
+          ((output_width - 1) * stride + filter_width - input_width + 1) / 2;
       filter_top_offset =
-          ((output_height - 1) * stride + filter_height - input_height) / 2;
+          ((output_height - 1) * stride + filter_height - input_height + 1) / 2;
     } else {
       filter_left_offset =
           ((output_width - 1) * stride + filter_width - input_width) / 2;
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index a0a0cbb616dbf1c88a679f2b621db684ada9b6a7..07ff70a87524232d38d96528fa49092e70221e82 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -375,6 +375,7 @@ Status QueueBase::CopySliceToElement(const Tensor& parent, Tensor* element,
   HANDLE_TYPE(DT_QINT32);
   HANDLE_TYPE(DT_QINT16);
   HANDLE_TYPE(DT_QUINT16);
+  HANDLE_TYPE(DT_UINT16);
 #undef HANDLE_TYPE
   return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
                                parent.dtype());
@@ -405,6 +406,7 @@ Status QueueBase::CopyElementToSlice(const Tensor& element, Tensor* parent,
   HANDLE_TYPE(DT_QINT32);
   HANDLE_TYPE(DT_QINT16);
   HANDLE_TYPE(DT_QUINT16);
+  HANDLE_TYPE(DT_UINT16);
 #undef HANDLE_TYPE
   return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
                                element.dtype());
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index e13ea46e56f0f3c3c014421bb47d04be7cf7b06d..99d2d19bfda12241ee58a1ba301c618cdeb26352 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -56,6 +56,13 @@ class TypedQueueOp : public QueueOp {
  public:
   using QueueOp::QueueOp;
 
+  void Compute(OpKernelContext* context) override {
+    QueueOp::Compute(context);
+    if (queue_ && context->track_allocations()) {
+      context->record_host_persistent_memory_allocation(queue_->MemoryUsed());
+    }
+  }
+
  protected:
   template <typename TypedQueue>
   Status CreateTypedQueue(TypedQueue* queue, QueueInterface** ret) {
@@ -63,8 +70,12 @@ class TypedQueueOp : public QueueOp {
       return errors::ResourceExhausted("Failed to allocate queue.");
     }
     *ret = queue;
+    queue_ = queue;
     return queue->Initialize();
   }
+
+ private:
+  QueueInterface* queue_ = nullptr;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
index 82516153866b05c54f12a1554b55eb79c882de17..301d1420a438ce104e6cd7095c0991b5ba3099bd 100644
--- a/tensorflow/core/kernels/queue_ops.cc
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -103,11 +103,6 @@ class EnqueueOp : public QueueAccessOpKernel {
     }
 
     OP_REQUIRES_OK_ASYNC(ctx, queue->ValidateTuple(tuple), callback);
-    if (ctx->track_allocations()) {
-      // We can get persistent memory size of the queue when it is kept full, no
-      // matter whether it is before or after the enqueue.
-      ctx->record_host_persistent_memory_allocation(queue->MemoryUsed());
-    }
     queue->TryEnqueue(tuple, ctx, callback);
   }
 
@@ -160,9 +155,6 @@ class EnqueueManyOp : public QueueAccessOpKernel {
     }
 
     OP_REQUIRES_OK_ASYNC(ctx, queue->ValidateManyTuple(tuple), callback);
-    if (ctx->track_allocations()) {
-      ctx->record_host_persistent_memory_allocation(queue->MemoryUsed());
-    }
     queue->TryEnqueueMany(tuple, ctx, callback);
   }
 
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 3063fedac8f98794147c18072a86d2135614d1d5..80b1be8d4cae49509bab945f7441521df1815ead 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -178,27 +178,9 @@ namespace {
 
 static Status AllocateOutputWithShape(OpKernelContext* ctx, const Tensor& shape,
                                       int index, Tensor** output) {
-  if (!ctx->op_kernel().IsLegacyVector(shape.shape())) {
-    return errors::InvalidArgument(
-        "shape must be a vector of {int32,int64}, got shape ",
-        shape.shape().DebugString());
-  }
-  if (shape.dtype() == DataType::DT_INT32) {
-    auto vec = shape.flat<int32>();
-    TensorShape tensor_shape;
-    TF_RETURN_IF_ERROR(
-        TensorShapeUtils::MakeShape(vec.data(), vec.size(), &tensor_shape));
-    TF_RETURN_IF_ERROR(ctx->allocate_output(index, tensor_shape, output));
-  } else if (shape.dtype() == DataType::DT_INT64) {
-    auto vec = shape.flat<int64>();
-    TensorShape tensor_shape;
-    TF_RETURN_IF_ERROR(
-        TensorShapeUtils::MakeShape(vec.data(), vec.size(), &tensor_shape));
-    TF_RETURN_IF_ERROR(ctx->allocate_output(index, tensor_shape, output));
-  } else {
-    return errors::InvalidArgument("shape must be a vector of {int32,int64}.");
-  }
-  return Status::OK();
+  TensorShape tensor_shape;
+  TF_RETURN_IF_ERROR(ctx->op_kernel().MakeShape(shape, &tensor_shape));
+  return ctx->allocate_output(index, tensor_shape, output);
 }
 
 // For now, use the same interface as RandomOp, so we can choose either one
@@ -465,6 +447,12 @@ class RandomGammaOp : public OpKernel {
 #define REGISTER(TYPE)                                                      \
   template struct functor::FillPhiloxRandom<                                \
       CPUDevice, random::UniformDistribution<random::PhiloxRandom, TYPE> >; \
+  template struct functor::FillPhiloxRandom<                                \
+      CPUDevice, random::NormalDistribution<random::PhiloxRandom, TYPE> >;  \
+  template struct functor::FillPhiloxRandom<                                \
+      CPUDevice,                                                            \
+      random::TruncatedNormalDistribution<                                  \
+          random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >;       \
   REGISTER_KERNEL_BUILDER(                                                  \
       Name("RandomUniform")                                                 \
           .Device(DEVICE_CPU)                                               \
diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc
index 553a4a7f9397722c008f4735175d23fddbeea280..66123e47c6eaee57bb5a6166b748789ce188ba0f 100644
--- a/tensorflow/core/kernels/random_poisson_op.cc
+++ b/tensorflow/core/kernels/random_poisson_op.cc
@@ -291,33 +291,15 @@ class RandomPoissonOp : public OpKernel {
     const Tensor& shape_t = ctx->input(0);
     const Tensor& rate_t = ctx->input(1);
 
-    OP_REQUIRES(ctx,
-                TensorShapeUtils::IsVector(shape_t.shape()) &&
-                    (shape_t.dtype() == DataType::DT_INT32 ||
-                     shape_t.dtype() == DataType::DT_INT64),
-                errors::InvalidArgument(
-                    "shape must be a vector of {int32,int64}, got shape: ",
-                    shape_t.DebugString()));
     TensorShape samples_shape;
-    if (shape_t.dtype() == DataType::DT_INT32) {
-      auto vec = shape_t.flat<int32>();
-      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(vec.data(), vec.size(),
-                                                      &samples_shape));
-    } else if (shape_t.dtype() == DataType::DT_INT64) {
-      auto vec = shape_t.flat<int64>();
-      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(vec.data(), vec.size(),
-                                                      &samples_shape));
-    }
+    OP_REQUIRES_OK(ctx, MakeShape(shape_t, &samples_shape));
     const int64 num_samples = samples_shape.num_elements();
-    OP_REQUIRES(ctx, num_samples > 0,
-                errors::InvalidArgument(
-                    "Input shape should have non-zero element count, got: ",
-                    num_samples));
 
     samples_shape.AppendShape(rate_t.shape());
     // Allocate output samples.
     Tensor* samples_t = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, samples_shape, &samples_t));
+    if (num_samples == 0) return;
 
     const auto rate_flat = rate_t.flat<T>().data();
     const int64 num_rate = rate_t.NumElements();
diff --git a/tensorflow/core/kernels/range_dataset_op.cc b/tensorflow/core/kernels/range_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c181f6e80432a1d63ab4655b434d449b53c5bc7c
--- /dev/null
+++ b/tensorflow/core/kernels/range_dataset_op.cc
@@ -0,0 +1,128 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class RangeDatasetOp : public OpKernel {
+ public:
+  explicit RangeDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* start_t;
+    OP_REQUIRES_OK(ctx, ctx->input("start", &start_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(start_t->shape()),
+                errors::InvalidArgument("start must be a scalar"));
+    const int64 start = start_t->flat<int64>()(0);
+
+    const Tensor* stop_t;
+    OP_REQUIRES_OK(ctx, ctx->input("stop", &stop_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(stop_t->shape()),
+                errors::InvalidArgument("stop must be a scalar"));
+    const int64 stop = stop_t->flat<int64>()(0);
+
+    const Tensor* step_t;
+    OP_REQUIRES_OK(ctx, ctx->input("step", &step_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(step_t->shape()),
+                errors::InvalidArgument("step must be a scalar"));
+    const int64 step = step_t->flat<int64>()(0);
+    OP_REQUIRES(ctx, step != 0,
+                errors::InvalidArgument("step must be a non-zero integer."));
+
+    DatasetBase* dataset = new Dataset(start, stop, step);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(int64 start, int64 stop, int64 step)
+        : start_(start), stop_(stop), step_(step) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_INT64});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("RangeDatasetOp(", start_, ", ", stop_, ", ",
+                             step_, ")::Dataset");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset) {
+        next_ = dataset->start_;
+      }
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if ((dataset()->step_ > 0 && next_ >= dataset()->stop_) ||
+            (dataset()->step_ < 0 && next_ <= dataset()->stop_)) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+        Tensor value_tensor(cpu_allocator(), DT_INT64, {});
+        value_tensor.scalar<int64>()() = next_;
+        out_tensors->emplace_back(std::move(value_tensor));
+        *end_of_sequence = false;
+        next_ += dataset()->step_;
+
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 next_;
+    };
+
+    const int64 start_;
+    const int64 stop_;
+    const int64 step_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("RangeDataset").Device(DEVICE_CPU),
+                        RangeDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
index f7c1e6c52c19987a9179afc09c6d10aca3254bb5..7e57331ab4fe2199fc03446748ff978a12e93f15 100644
--- a/tensorflow/core/kernels/range_sampler.cc
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -262,7 +262,7 @@ FixedUnigramSampler::FixedUnigramSampler(int64 range,
 }
 
 float FixedUnigramSampler::Probability(int64 value) const {
-  if (value >= weights_.size() || value < 0) {
+  if (value < 0 || static_cast<size_t>(value) >= weights_.size()) {
     return 0.0;
   }
   return weights_.at(value) / total_weight_;
diff --git a/tensorflow/core/kernels/reader_dataset_ops.cc b/tensorflow/core/kernels/reader_dataset_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7f65c39cb94d8757cf9de042908b4228b2a3d36
--- /dev/null
+++ b/tensorflow/core/kernels/reader_dataset_ops.cc
@@ -0,0 +1,420 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/record_reader.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following ops.
+
+class TextLineDatasetOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* filenames_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor));
+    OP_REQUIRES(
+        ctx, filenames_tensor->dims() <= 1,
+        errors::InvalidArgument("`filenames` must be a scalar or a vector."));
+
+    std::vector<string> filenames;
+    filenames.reserve(filenames_tensor->NumElements());
+    for (int i = 0; i < filenames_tensor->NumElements(); ++i) {
+      filenames.push_back(filenames_tensor->flat<string>()(i));
+    }
+
+    DatasetBase* dataset = new Dataset(std::move(filenames));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->scalar<ResourceHandle>()() = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(std::vector<string> filenames)
+        : filenames_(std::move(filenames)) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "TextLineDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a file, so try to read the next line.
+          if (input_buffer_) {
+            string line_contents;
+            Status s = input_buffer_->ReadLine(&line_contents);
+            if (s.ok()) {
+              // Produce the line as output.
+              Tensor line_tensor(cpu_allocator(), DT_STRING, {});
+              line_tensor.scalar<string>()() = line_contents;
+              out_tensors->emplace_back(std::move(line_tensor));
+              *end_of_sequence = false;
+              return Status::OK();
+            } else if (!errors::IsOutOfRange(s)) {
+              // Report non-EOF errors to the caller.
+              return s;
+            }
+
+            // We have reached the end of the current file, so maybe
+            // move on to next file.
+            input_buffer_.reset();
+            file_.reset();
+            ++current_file_index_;
+          }
+
+          // Iteration ends when there are no more files to process.
+          if (current_file_index_ == dataset()->filenames_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          // Actually move on to next file.
+          TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
+              dataset()->filenames_[current_file_index_], &file_));
+          input_buffer_.reset(new io::InputBuffer(file_.get(), kBufferSize));
+        } while (true);
+      }
+
+     private:
+      // TODO(mrry): Make this configurable via an attr on the dataset op?
+      // Or maybe via a data input?
+      enum { kBufferSize = 256 << 10 /* 256 kB */ };
+
+      mutex mu_;
+      size_t current_file_index_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<RandomAccessFile> file_
+          GUARDED_BY(mu_);  // must outlive input_buffer_
+      std::unique_ptr<io::InputBuffer> input_buffer_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> filenames_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("TextLineDataset").Device(DEVICE_CPU),
+                        TextLineDatasetOp);
+
+class FixedLengthRecordDatasetOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* filenames_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor));
+    OP_REQUIRES(
+        ctx, filenames_tensor->dims() <= 1,
+        errors::InvalidArgument("`filenames` must be a scalar or a vector."));
+
+    std::vector<string> filenames;
+    filenames.reserve(filenames_tensor->NumElements());
+    for (int i = 0; i < filenames_tensor->NumElements(); ++i) {
+      filenames.push_back(filenames_tensor->flat<string>()(i));
+    }
+
+    const Tensor* header_bytes_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("header_bytes", &header_bytes_tensor));
+    OP_REQUIRES(ctx, header_bytes_tensor->dims() == 0,
+                errors::InvalidArgument("`header_bytes` must be a scalar."));
+    const int64 header_bytes = header_bytes_tensor->scalar<int64>()();
+
+    const Tensor* record_bytes_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("record_bytes", &record_bytes_tensor));
+    OP_REQUIRES(ctx, record_bytes_tensor->dims() == 0,
+                errors::InvalidArgument("`record_bytes` must be a scalar."));
+    const int64 record_bytes = record_bytes_tensor->scalar<int64>()();
+
+    const Tensor* footer_bytes_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("footer_bytes", &footer_bytes_tensor));
+    OP_REQUIRES(ctx, footer_bytes_tensor->dims() == 0,
+                errors::InvalidArgument("`footer_bytes` must be a scalar."));
+    const int64 footer_bytes = footer_bytes_tensor->scalar<int64>()();
+
+    DatasetBase* dataset = new Dataset(std::move(filenames), header_bytes,
+                                       record_bytes, footer_bytes);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->scalar<ResourceHandle>()() = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(std::vector<string> filenames, int64 header_bytes,
+                     int64 record_bytes, int64 footer_bytes)
+        : filenames_(std::move(filenames)),
+          header_bytes_(header_bytes),
+          record_bytes_(record_bytes),
+          footer_bytes_(footer_bytes) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override {
+      return "FixedLengthRecordDatasetOp::Dataset";
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a file, so try to read the next record.
+          if (input_buffer_) {
+            const int64 current_pos = input_buffer_->Tell();
+            DCHECK_GE(file_pos_limit_, 0);
+            if (current_pos < file_pos_limit_) {
+              string record;
+              TF_RETURN_IF_ERROR(
+                  input_buffer_->ReadNBytes(dataset()->record_bytes_, &record));
+              // Produce the record as output.
+              Tensor record_tensor(cpu_allocator(), DT_STRING, {});
+              record_tensor.scalar<string>()() = record;
+              out_tensors->emplace_back(std::move(record_tensor));
+              *end_of_sequence = false;
+              return Status::OK();
+            }
+
+            // We have reached the end of the current file, so maybe
+            // move on to next file.
+            input_buffer_.reset();
+            file_.reset();
+            ++current_file_index_;
+          }
+
+          // Iteration ends when there are no more files to process.
+          if (current_file_index_ == dataset()->filenames_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          // Actually move on to next file.
+          uint64 file_size;
+          TF_RETURN_IF_ERROR(ctx->env()->GetFileSize(
+              dataset()->filenames_[current_file_index_], &file_size));
+          file_pos_limit_ = file_size - dataset()->footer_bytes_;
+          TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
+              dataset()->filenames_[current_file_index_], &file_));
+          input_buffer_.reset(new io::InputBuffer(file_.get(), kBufferSize));
+          TF_RETURN_IF_ERROR(
+              input_buffer_->SkipNBytes(dataset()->header_bytes_));
+        } while (true);
+      }
+
+     private:
+      // TODO(mrry): Make this configurable via an attr on the dataset op?
+      // Or maybe via a data input?
+      enum { kBufferSize = 256 << 10 /* 256 kB */ };
+
+      mutex mu_;
+      size_t current_file_index_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<RandomAccessFile> file_
+          GUARDED_BY(mu_);  // must outlive input_buffer_
+      std::unique_ptr<io::InputBuffer> input_buffer_ GUARDED_BY(mu_);
+      int64 file_pos_limit_ GUARDED_BY(mu_) = -1;
+    };
+
+    const std::vector<string> filenames_;
+    const int64 header_bytes_;
+    const int64 record_bytes_;
+    const int64 footer_bytes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDataset").Device(DEVICE_CPU),
+                        FixedLengthRecordDatasetOp);
+
+class TFRecordDatasetOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* filenames_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor));
+    OP_REQUIRES(
+        ctx, filenames_tensor->dims() <= 1,
+        errors::InvalidArgument("`filenames` must be a scalar or a vector."));
+
+    std::vector<string> filenames;
+    filenames.reserve(filenames_tensor->NumElements());
+    for (int i = 0; i < filenames_tensor->NumElements(); ++i) {
+      filenames.push_back(filenames_tensor->flat<string>()(i));
+    }
+
+    const Tensor* compression_type_tensor;
+    OP_REQUIRES_OK(ctx,
+                   ctx->input("compression_type", &compression_type_tensor));
+    OP_REQUIRES(
+        ctx, compression_type_tensor->dims() == 0,
+        errors::InvalidArgument("`compression_type` must be a scalar."));
+    const string& compression_type =
+        compression_type_tensor->scalar<string>()();
+
+    DatasetBase* dataset = new Dataset(std::move(filenames), compression_type);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->scalar<ResourceHandle>()() = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(std::vector<string> filenames,
+                     const string& compression_type)
+        : filenames_(std::move(filenames)),
+          options_(io::RecordReaderOptions::CreateRecordReaderOptions(
+              compression_type)) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "TFRecordDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a file, so try to read the next record.
+          if (reader_) {
+            Tensor result_tensor(cpu_allocator(), DT_STRING, {});
+            Status s = reader_->ReadRecord(&offset_,
+                                           &result_tensor.scalar<string>()());
+            if (s.ok()) {
+              out_tensors->emplace_back(std::move(result_tensor));
+              *end_of_sequence = false;
+              return Status::OK();
+            } else if (!errors::IsOutOfRange(s)) {
+              return s;
+            }
+
+            // We have reached the end of the current file, so maybe
+            // move on to next file.
+            reader_.reset();
+            file_.reset();
+            ++current_file_index_;
+          }
+
+          // Iteration ends when there are no more files to process.
+          if (current_file_index_ == dataset()->filenames_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          // Actually move on to next file.
+          const string& next_filename =
+              dataset()->filenames_[current_file_index_];
+          TF_RETURN_IF_ERROR(
+              ctx->env()->NewRandomAccessFile(next_filename, &file_));
+          reader_.reset(new io::RecordReader(file_.get(), dataset()->options_));
+          offset_ = 0;
+        } while (true);
+      }
+
+     private:
+      mutex mu_;
+      size_t current_file_index_ GUARDED_BY(mu_) = 0;
+      uint64 offset_ GUARDED_BY(mu_) = 0;
+
+      // `reader_` will borrow the object that `file_` points to, so
+      // we must destroy `reader_` before `file_`.
+      std::unique_ptr<RandomAccessFile> file_ GUARDED_BY(mu_);
+      std::unique_ptr<io::RecordReader> reader_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<string> filenames_;
+    io::RecordReaderOptions options_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("TFRecordDataset").Device(DEVICE_CPU),
+                        TFRecordDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc
index fbc4ee31a4c588a2479b2ad64501fcc93a053fc1..ec4490db83fc38c23208991b15ee93ca072c7d50 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu.cu.cc
@@ -47,7 +47,7 @@ struct ReduceFunctor<GPUDevice, Reducer> {
 };
 
 template <typename T>
-struct ReduceFunctor<GPUDevice, Eigen::internal::MeanReducer<T> > {
+struct ReduceFunctor<GPUDevice, Eigen::internal::MeanReducer<T>> {
   template <typename OUT_T, typename IN_T, typename ReductionAxes>
   static void Reduce(const GPUDevice& d, OUT_T out, IN_T in,
                      const ReductionAxes& reduction_axes,
@@ -60,7 +60,7 @@ struct ReduceFunctor<GPUDevice, Eigen::internal::MeanReducer<T> > {
          ++i) {
       num_coeffs_to_reduce *= in.dimension(reduction_axes[i]);
     }
-    T scale = T(1.0) / num_coeffs_to_reduce;
+    T scale = T(1.0 / num_coeffs_to_reduce);
     out.device(d) = (in * scale).sum(reduction_axes);
   }
 
@@ -108,6 +108,10 @@ DEFINE_FOR_ALL_REDUCERS(double);
 
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::SumReducer<complex64>);
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::SumReducer<complex128>);
+DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::MeanReducer<complex64>);
+DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::MeanReducer<complex128>);
+DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::ProdReducer<complex64>);
+DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::ProdReducer<complex128>);
 DEFINE_FOR_TYPE_AND_R(bool, Eigen::internal::AndReducer);
 DEFINE_FOR_TYPE_AND_R(bool, Eigen::internal::OrReducer);
 #undef DEFINE_FOR_TYPE_AND_R
diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc
index e018cb55dd11b2760f5375666536e80ecb956f22..03f737b4fa16130218dae2f0681fe3646376a7da 100644
--- a/tensorflow/core/kernels/reduction_ops_mean.cc
+++ b/tensorflow/core/kernels/reduction_ops_mean.cc
@@ -24,7 +24,7 @@ namespace tensorflow {
           .TypeConstraint<type>("T")      \
           .TypeConstraint<int32>("Tidx"), \
       ReductionOp<CPUDevice, type, Eigen::internal::MeanReducer<type>>);
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 #if GOOGLE_CUDA
@@ -37,9 +37,9 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .TypeConstraint<int32>("Tidx")    \
           .HostMemory("reduction_indices"), \
       ReductionOp<GPUDevice, type, Eigen::internal::MeanReducer<type>>);
-REGISTER_GPU_KERNELS(Eigen::half);
-REGISTER_GPU_KERNELS(float);
-REGISTER_GPU_KERNELS(double);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_complex64(REGISTER_GPU_KERNELS);
+TF_CALL_complex128(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
 #endif
diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc
index e04c655dabb88ff3145dd13a595f739b566a4854..f841a981b41bd186b6bedab5bee50afb2a39f151 100644
--- a/tensorflow/core/kernels/reduction_ops_prod.cc
+++ b/tensorflow/core/kernels/reduction_ops_prod.cc
@@ -24,7 +24,7 @@ namespace tensorflow {
           .TypeConstraint<type>("T")      \
           .TypeConstraint<int32>("Tidx"), \
       ReductionOp<CPUDevice, type, Eigen::internal::ProdReducer<type>>);
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 #if GOOGLE_CUDA
@@ -37,10 +37,10 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .TypeConstraint<int32>("Tidx")    \
           .HostMemory("reduction_indices"), \
       ReductionOp<GPUDevice, type, Eigen::internal::ProdReducer<type>>);
-REGISTER_GPU_KERNELS(Eigen::half);
-REGISTER_GPU_KERNELS(int32);
-REGISTER_GPU_KERNELS(float);
-REGISTER_GPU_KERNELS(double);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_int32(REGISTER_GPU_KERNELS);
+TF_CALL_complex64(REGISTER_GPU_KERNELS);
+TF_CALL_complex128(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
 #endif
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
index 938ca66a0cb0f64456ef0e6afc3873b2c978cd66..828e1a588cd7bda8c88822c5e63d1d988e5cc055 100644
--- a/tensorflow/core/kernels/reduction_ops_sum.cc
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -24,12 +24,7 @@ namespace tensorflow {
           .TypeConstraint<type>("T")      \
           .TypeConstraint<int32>("Tidx"), \
       ReductionOp<CPUDevice, type, Eigen::internal::SumReducer<type>>);
-TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
-// NOTE: We should have mean(complex64,int32), too. But that needs to
-// change Eigen::internal::MeanReducer to cast int to complex<float>.
-// We don't see immediate need of mean(complex64,int32) anyway.
-TF_CALL_complex64(REGISTER_CPU_KERNELS);
-TF_CALL_complex128(REGISTER_CPU_KERNELS);
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 #if GOOGLE_CUDA
@@ -42,11 +37,9 @@ TF_CALL_complex128(REGISTER_CPU_KERNELS);
           .TypeConstraint<int32>("Tidx")    \
           .HostMemory("reduction_indices"), \
       ReductionOp<GPUDevice, type, Eigen::internal::SumReducer<type>>);
-REGISTER_GPU_KERNELS(Eigen::half);
-REGISTER_GPU_KERNELS(float);
-REGISTER_GPU_KERNELS(double);
-REGISTER_GPU_KERNELS(complex64);
-REGISTER_GPU_KERNELS(complex128);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_complex64(REGISTER_GPU_KERNELS);
+TF_CALL_complex128(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op.cc b/tensorflow/core/kernels/remote_fused_graph_execute_op.cc
index bd95474a62be98bae48f92e37add8f061ca9cdbe..aa3835ecc569af95126b1799d8475cbad75e424a 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op.cc
@@ -30,7 +30,11 @@ class RemoteFusedGraphExecuteOp : public OpKernel {
       : OpKernel(ctx), execute_info_() {
     string serialized_proto;
     OP_REQUIRES_OK(
-        ctx, ctx->GetAttr("serialized_graph_transfer_info", &serialized_proto));
+        ctx, ctx->GetAttr(RemoteFusedGraphExecuteUtils::
+                              ATTR_SERIALIZED_REMOTE_FUSED_GRAPH_EXECUTE_INFO,
+                          &serialized_proto));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Tinputs", &input_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutputs", &output_types_));
     execute_info_.ParseFromString(serialized_proto);
     if (!execute_info_.executor_name().empty()) {
       const RemoteFusedGraphExecuteUtils::ExecutorBuildFunc* build_func =
@@ -69,12 +73,15 @@ class RemoteFusedGraphExecuteOp : public OpKernel {
   void Compute(OpKernelContext* const ctx) final {
     CHECK(ctx != nullptr);
     const int input_count = ctx->num_inputs();
-    CHECK(input_count == execute_info_.graph_input_node_name_size())
+    const int graph_input_count = execute_info_.graph_input_node_name_size();
+    CHECK(input_count == graph_input_count &&
+          input_count == input_types_.size())
         << "input_count = " << input_count
-        << ", gt input count = " << execute_info_.graph_input_node_name_size();
+        << ", gt input count = " << execute_info_.graph_input_node_name_size()
+        << ", type count = " << input_types_.size();
 
-    // 3. Send inputs into remote processor
-    for (int i = 0; i < input_count; ++i) {
+    // 3. Send first data type inputs into remote processor
+    for (int i = 0; i < graph_input_count; ++i) {
       const Tensor& input_tensor = ctx->input(i);
       const string& input_node_name = execute_info_.graph_input_node_name(i);
       if (remote_fused_graph_executor_) {
@@ -90,7 +97,8 @@ class RemoteFusedGraphExecuteOp : public OpKernel {
 
     // 5. Load outputs from remote processor
     const int output_count = ctx->num_outputs();
-    CHECK(output_count == execute_info_.graph_output_node_name_size());
+    CHECK(output_count == execute_info_.graph_output_node_name_size() &&
+          output_count == output_types_.size());
     for (int i = 0; i < output_count; ++i) {
       Tensor* output = nullptr;
       const string& output_node_name = execute_info_.graph_output_node_name(i);
@@ -110,6 +118,8 @@ class RemoteFusedGraphExecuteOp : public OpKernel {
  private:
   RemoteFusedGraphExecuteInfo execute_info_;
   std::unique_ptr<IRemoteFusedGraphExecutor> remote_fused_graph_executor_;
+  DataTypeVector input_types_;
+  DataTypeVector output_types_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteOp);
 };
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op_test.cc b/tensorflow/core/kernels/remote_fused_graph_execute_op_test.cc
index 580be4b7db931a4546eef1fe15dc0a2a10263ff8..655de2f98f38b37dd102265a629472a8f2fc2ac4 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op_test.cc
@@ -37,20 +37,34 @@ namespace tensorflow {
 
 class RemoteFusedGraphExecuteTest : public OpsTestBase {};
 
-TEST_F(RemoteFusedGraphExecuteTest, ExecuteAddGraph) {
+TEST_F(RemoteFusedGraphExecuteTest, BuildModelWithOneDataType) {
+  DataTypeVector input_types({DT_FLOAT, DT_FLOAT});
+  DataTypeVector output_types({DT_FLOAT});
   TF_ASSERT_OK(
       NodeDefBuilder("remote_fused_graph_execute_op", "RemoteFusedGraphExecute")
           .Input(FakeInput(2, DT_FLOAT))
-          .Attr("M", 2)
-          .Attr("N", 1)
-          .Attr("T", DataTypeToEnum<float>::v())
-          .Attr("U", DataTypeToEnum<float>::v())
-          .Attr("serialized_graph_transfer_info", "")
+          .Attr("Tinputs", input_types)
+          .Attr("Toutputs", output_types)
+          .Attr("serialized_remote_fused_graph_execute_info", "")
           .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   // TODO(satok): Add benchmark
 }
 
+TEST_F(RemoteFusedGraphExecuteTest, BuildModelWithWrongDataType) {
+  DataTypeVector input_types({DT_INT32, DT_INT32});
+  DataTypeVector output_types({DT_FLOAT});
+  ASSERT_FALSE(
+      NodeDefBuilder("remote_fused_graph_execute_op", "RemoteFusedGraphExecute")
+          .Input(FakeInput(2, DT_FLOAT))
+          .Attr("Tinputs", input_types)
+          .Attr("Toutputs", output_types)
+          .Attr("serialized_remote_fused_graph_execute_info", "")
+          .Finalize(node_def())
+          .ok());
+  // TODO(satok): Add benchmark
+}
+
 ////////////////////////////
 // End-to-end test: Begin //
 ////////////////////////////
@@ -75,8 +89,8 @@ static Output BuildPlaceHolderOp(const string& name, const DataType dt,
                                  const TensorShape& tensor_shape, Scope* root) {
   const Scope& scope = root->WithOpName(name);
   Node* ret;
-  const string unique_name = scope.GetUniqueNameForOp("PlaceholderV2");
-  NodeBuilder builder = NodeBuilder(unique_name, "PlaceholderV2")
+  const string unique_name = scope.GetUniqueNameForOp("Placeholder");
+  NodeBuilder builder = NodeBuilder(unique_name, "Placeholder")
                             .Attr("dtype", dt)
                             .Attr("shape", tensor_shape);
   scope.UpdateBuilder(&builder);
@@ -94,13 +108,15 @@ static Output BuildRemoteFusedGraphExecuteOp(
   CHECK(scope.ok());
   auto node_out_list = ops::AsNodeOutList(scope, InputList(output_list));
   const auto unique_name = scope.GetUniqueNameForOp("RemoteFusedGraphExecute");
+
+  DataTypeVector input_types{DT_FLOAT};
+  DataTypeVector output_types{DT_FLOAT};
+
   auto builder = NodeBuilder(unique_name, "RemoteFusedGraphExecute")
                      .Input(node_out_list)
-                     .Attr("M", static_cast<int64>(output_list.size()))
-                     .Attr("N", static_cast<int64>(output_node_count))
-                     .Attr("T", DT_FLOAT)
-                     .Attr("U", DT_FLOAT)
-                     .Attr("serialized_graph_transfer_info",
+                     .Attr("Tinputs", input_types)
+                     .Attr("Toutputs", output_types)
+                     .Attr("serialized_remote_fused_graph_execute_info",
                            StringPiece(execute_info.SerializeAsString()));
   CHECK(scope.ok());
   scope.UpdateBuilder(&builder);
@@ -253,13 +269,13 @@ static Status RewriteGraphToFusedGraph(const GraphDef& original_graph,
 // 5. Fuse the original graph and run the inference the new fused graph
 TEST(RemoteFusedExecuteGraphOp, EndToEndTest) {
   // 5.1 Load original graph
-  const GraphDef original_graph =
-      RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
-          NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
+  GraphDef original_graph;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &original_graph));
 
   // 5.2 Fuse graph
   GraphDef fused_graph;
-  TF_CHECK_OK(RewriteGraphToFusedGraph(original_graph, &fused_graph));
+  TF_ASSERT_OK(RewriteGraphToFusedGraph(original_graph, &fused_graph));
 
   // 5.3 Setup session
   std::vector<Tensor> output_tensors;
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.cc
index 6e7d4b73d23dde82d36c3d9754a8d1d919cf0dc1..31c48082dd9715feec54e1b441afade6b489123a 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.cc
@@ -15,7 +15,10 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -36,17 +39,57 @@ namespace tensorflow {
   return Output(ret, 0);
 }
 
-/* static */ GraphDef RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+/* static */ Status RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
     const string& name0, const float val0, const string& name1,
-    const float val1, const string& name_out) {
+    const float val1, const string& name_out, GraphDef* graph_def) {
   Scope root = Scope::NewRootScope();
   Output node0 = ops::Const(root.WithOpName(name0), val0);
   Output node1 = ops::Const(root.WithOpName(name1), val1);
   RemoteFusedGraphExecuteOpTestUtils::BuildAddOp(root.WithOpName(name_out),
                                                  node0, node1);
-  GraphDef def;
-  TF_CHECK_OK(root.ToGraphDef(&def));
-  return def;
+  TF_RETURN_IF_ERROR(root.ToGraphDef(graph_def));
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteOpTestUtils::BuildMultipleAddGraph(
+    GraphDef* graph_def) {
+  Scope root = tensorflow::Scope::NewRootScope();
+
+  Tensor a_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&a_data, 1.0f);
+  Output a_const = ops::Const(root.WithOpName("A"), Input::Initializer(a_data));
+
+  Tensor b_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&b_data, 1.0f);
+  Output b_const = ops::Const(root.WithOpName("B"), Input::Initializer(b_data));
+
+  Tensor c_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&c_data, 1.0f);
+  Output c_const = ops::Const(root.WithOpName("C"), Input::Initializer(c_data));
+
+  Tensor d_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&d_data, 1.0f);
+  Output d_const = ops::Const(root.WithOpName("D"), Input::Initializer(d_data));
+
+  Tensor e_data(DT_FLOAT, TensorShape({1, 1, 1, 1}));
+  test::FillIota<float>(&e_data, 1.0f);
+  Output e_const = ops::Const(root.WithOpName("E"), Input::Initializer(e_data));
+
+  Output f_add = ops::Add(root.WithOpName("F"), a_const, b_const);
+
+  Output g_add = ops::Add(root.WithOpName("G"), d_const, e_const);
+
+  Output h_add = ops::Add(root.WithOpName("H"), f_add, c_const);
+
+  Output i_add = ops::Add(root.WithOpName("I"), c_const, g_add);
+
+  Output j_add = ops::Add(root.WithOpName("J"), h_add, i_add);
+
+  Output k_add = ops::Add(root.WithOpName("K"), j_add, g_add);
+
+  TF_RETURN_IF_ERROR(root.ToGraphDef(graph_def));
+
+  return Status::OK();
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
index 70d758ea6a84302de55a88c1d6c3443166d31b35..a0df50162b6adbb7cd0d2e71535735edba36d824 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h
@@ -28,9 +28,31 @@ namespace tensorflow {
 class RemoteFusedGraphExecuteOpTestUtils {
  public:
   static Output BuildAddOp(const Scope& scope, const Input& x, const Input& y);
-  static GraphDef BuildAddGraph(const string& name0, const float val0,
-                                const string& name1, const float val1,
-                                const string& name_out);
+  static Status BuildAddGraph(const string& name0, const float val0,
+                              const string& name1, const float val1,
+                              const string& name_out, GraphDef* graph_def);
+
+  // BuildMultipleAddGraph builds the following graph
+  //
+  //  A         B         C         D         E
+  //  |         |         |         |         |
+  //  +----+----+         |         +----+----+
+  //       |              |              |
+  //       F             / \             G
+  //       |            |   |           / \
+  //       +-----+------+   +-----+----+   +
+  //             |                |        |
+  //             H                I        |
+  //             |                |        |
+  //             +-------+--------+        |
+  //                     |                 |
+  //                     J                 |
+  //                     |                 |
+  //                     +--------+--------+
+  //                              |
+  //                              K
+  //
+  static Status BuildMultipleAddGraph(GraphDef* graph_def);
 
  private:
   RemoteFusedGraphExecuteOpTestUtils() = delete;
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
index cb16ffe6755ca2483f68cac34a3b8bc5c4ac0e53..d0ffcb1064b9262f090a9f60355239be3fff572f 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.cc
@@ -22,15 +22,104 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
+namespace {
+const Node* FindNodeByName(const string& name, const Graph& graph) {
+  for (const Node* node : graph.nodes()) {
+    CHECK_NOTNULL(node);
+    if (node->name() == name) {
+      return node;
+    }
+  }
+  return nullptr;
+}
+
+std::unordered_set<string> BuildNodeSetFromNodeNamesAndPorts(
+    const std::vector<string>& node_names_and_ports) {
+  std::unordered_set<string> retval;
+  for (const string& node_name_and_port : node_names_and_ports) {
+    const TensorId tid = ParseTensorName(node_name_and_port);
+    retval.emplace(tid.first.ToString());
+  }
+  return retval;
+}
+
+Node* FindMutableNodeByName(const string& name, Graph* graph) {
+  for (Node* node : graph->nodes()) {
+    if (node != nullptr && node->name() == name) {
+      return node;
+    }
+  }
+  return nullptr;
+}
+
+const NodeDef* FindNodeDefByName(const string& input,
+                                 const GraphDef& graph_def) {
+  const TensorId tid = ParseTensorName(input);
+  const string name = tid.first.ToString();
+  for (const NodeDef& node_def : graph_def.node()) {
+    if (node_def.name() == name) {
+      return &node_def;
+    }
+  }
+  return nullptr;
+}
+
+string DumpGraphDef(const GraphDef& graph_def) {
+  string out;
+  for (const NodeDef& node : graph_def.node()) {
+    out += strings::StrCat("node: ", node.name(), "\n    input: ");
+    for (const string& input : node.input()) {
+      out += strings::StrCat(input, ", ");
+    }
+    out += "\n";
+  }
+  return out;
+}
+
+string DumpCluster(const RemoteFusedGraphExecuteUtils::ClusterInfo& cluster) {
+  string out;
+  out += "Nodes:\n";
+  for (const string& str : std::get<0>(cluster)) {
+    out += str + ", ";
+  }
+  out += "\nInput border:\n";
+  for (const string& str : std::get<1>(cluster)) {
+    out += str + ", ";
+  }
+  out += "\nOutput border:\n";
+  for (const string& str : std::get<2>(cluster)) {
+    out += str + ", ";
+  }
+  return out;
+}
+
+}  // namespace
 
 /* static */ constexpr const char* const
     RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_DATA_TYPES;
 /* static */ constexpr const char* const
     RemoteFusedGraphExecuteUtils::ATTR_OUTPUT_SHAPES;
+/* static */ constexpr const char* const RemoteFusedGraphExecuteUtils::
+    ATTR_SERIALIZED_REMOTE_FUSED_GRAPH_EXECUTE_INFO;
+/* static */ constexpr const char* const RemoteFusedGraphExecuteUtils::
+    TRANSFORM_ARG_REMOTE_FUSED_GRAPH_EXECUTOR_NAME;
+/* static */ constexpr const char* const
+    RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_REMOTE_FUSED_GRAPH_NODE_NAME;
+/* static */ constexpr const char* const
+    RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSED_NODES;
+/* static */ constexpr const char* const
+    RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_INPUTS;
+/* static */ constexpr const char* const
+    RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_OUTPUTS;
+/* static */ constexpr const char* const
+    RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_INPUT_TYPES;
+/* static */ constexpr const char* const
+    RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_INPUT_SHAPES;
 
 RemoteFusedGraphExecuteUtils::ExecutorBuildRegistrar::ExecutorBuildRegistrar(
     const string& name, ExecutorBuildFunc executor_build_func) {
@@ -168,12 +257,12 @@ RemoteFusedGraphExecuteUtils::GetExecutorBuildRegistry() {
     output_tensors.push_back(input_node_info.second);
   }
 
-  for (int i = 0; i < output_node_names.size(); ++i) {
+  for (int i = 0; static_cast<size_t>(i) < output_node_names.size(); ++i) {
     const string& name = output_node_names.at(i);
     const Tensor& tensor = output_tensors.at(i);
     EmplaceTensorShapeType(name, tensor, tensor_shape_map);
   }
-  for (int i = 0; i < input_node_info_list.size(); ++i) {
+  for (int i = 0; static_cast<size_t>(i) < input_node_info_list.size(); ++i) {
     const string& name = input_node_info_list.at(i).first;
     const Tensor& tensor = output_tensors.at(output_node_names.size() + i);
     EmplaceTensorShapeType(name, tensor, tensor_shape_map);
@@ -187,7 +276,8 @@ RemoteFusedGraphExecuteUtils::GetExecutorBuildRegistry() {
     const std::vector<std::pair<string, Tensor>>& input_tensor_vector,
     const string& node_name) {
   for (const std::pair<string, Tensor>& pair : input_tensor_vector) {
-    if (node_name == pair.first) {
+    const TensorId tid = ParseTensorName(pair.first);
+    if (node_name == tid.first.ToString()) {
       return true;
     }
   }
@@ -259,17 +349,17 @@ RemoteFusedGraphExecuteUtils::AddOutputTensorShapeTypeByTensorShapeMap(
 }
 
 /* static */ Status RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
-    const NodeDef& node_def, std::vector<DataType>* data_types,
+    AttrSlice attrs, std::vector<DataType>* data_types,
     std::vector<TensorShape>* shapes) {
   Status status;
   if (data_types != nullptr) {
-    status = GetNodeAttr(node_def, ATTR_OUTPUT_DATA_TYPES, data_types);
+    status = GetNodeAttr(attrs, ATTR_OUTPUT_DATA_TYPES, data_types);
   }
   if (!status.ok()) {
     return status;
   }
   if (shapes != nullptr) {
-    status = GetNodeAttr(node_def, ATTR_OUTPUT_SHAPES, shapes);
+    status = GetNodeAttr(attrs, ATTR_OUTPUT_SHAPES, shapes);
     if (status.ok() && data_types != nullptr) {
       CHECK_EQ(data_types->size(), shapes->size());
     }
@@ -278,6 +368,26 @@ RemoteFusedGraphExecuteUtils::AddOutputTensorShapeTypeByTensorShapeMap(
   return status;
 }
 
+/* static */ bool RemoteFusedGraphExecuteUtils::GetOutputTensorShapeType(
+    const GraphDef& graph_def, const string& name_and_port, DataType* data_type,
+    TensorShape* shape) {
+  std::vector<DataType> data_types;
+  std::vector<TensorShape> shapes;
+  const TensorId tid = ParseTensorName(name_and_port);
+  const string node_name = tid.first.ToString();
+  const int port = tid.second;
+  const NodeDef* node_def = FindNodeDefByName(node_name, graph_def);
+  CHECK_NOTNULL(node_def);
+  GetOutputTensorShapeType(*node_def, &data_types, &shapes).IgnoreError();
+  if (data_types.empty()) {
+    return false;
+  }
+  CHECK(data_types.size() > port);
+  *data_type = data_types.at(port);
+  *shape = shapes.at(port);
+  return true;
+}
+
 /* static */ Status RemoteFusedGraphExecuteUtils::PropagateShapeInference(
     const GraphDef& graph_def,
     const std::vector<std::pair<string, Tensor>>& input_node_info_list,
@@ -416,4 +526,529 @@ RemoteFusedGraphExecuteUtils::BuildRemoteGraphInputsAndOutputsFromProto(
                      std::make_pair(tensor.dtype(), tensor.shape())));
 }
 
+/* static */ Status RemoteFusedGraphExecuteUtils::BuildAndAddTensorShapes(
+    const std::vector<std::pair<string, Tensor>>& input_tensors,
+    const bool dry_run_inference, GraphDef* graph_def) {
+  TensorShapeMap tensor_shape_map;
+  if (dry_run_inference) {
+    TF_RETURN_IF_ERROR(DryRunInferenceForAllNode(*graph_def, input_tensors,
+                                                 /*initialize_by_zero=*/true,
+                                                 &tensor_shape_map));
+  } else {
+    ImportGraphDefOptions opts;
+    Graph graph(OpRegistry::Global());
+    ShapeRefiner shape_refiner(graph.versions().producer(),
+                               graph.op_registry());
+    TF_RETURN_IF_ERROR(
+        ImportGraphDef(opts, *graph_def, &graph, &shape_refiner));
+    TF_RETURN_IF_ERROR(PropagateShapeInference(*graph_def, input_tensors,
+                                               &graph, &shape_refiner));
+    TF_RETURN_IF_ERROR(
+        BuildTensorShapeMapFromGraph(graph, shape_refiner, &tensor_shape_map));
+  }
+
+  for (NodeDef& node_def : *graph_def->mutable_node()) {
+    TF_RETURN_IF_ERROR(
+        AddOutputTensorShapeTypeByTensorShapeMap(tensor_shape_map, &node_def));
+  }
+
+  return Status::OK();
+}
+
+/* static */ Status
+RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteInfo(
+    const string& executor_name, const GraphDef& subgraph_def,
+    const std::vector<string>& inputs, const std::vector<string>& outputs,
+    const bool require_shape_type, RemoteFusedGraphExecuteInfo* execute_info,
+    DataTypeVector* input_types, DataTypeVector* output_types) {
+  CHECK_NOTNULL(execute_info);
+  CHECK_NOTNULL(input_types);
+  CHECK_NOTNULL(output_types);
+
+  execute_info->Clear();
+  execute_info->set_executor_name(executor_name);
+
+  // copy graph
+  *execute_info->mutable_remote_graph() = subgraph_def;
+
+  for (const string& input : inputs) {
+    DataType dt;
+    TensorShape shape;
+    const bool has_shapetype =
+        GetOutputTensorShapeType(subgraph_def, input, &dt, &shape);
+
+    execute_info->add_graph_input_node_name(input);
+    if (has_shapetype) {
+      RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& tensor_shape_type =
+          *execute_info->add_default_graph_input_tensor_shape();
+      tensor_shape_type.set_dtype(dt);
+      TensorShapeProto& tensor_shape_proto = *tensor_shape_type.mutable_shape();
+      for (const int64 dim : shape.dim_sizes()) {
+        tensor_shape_proto.add_dim()->set_size(dim);
+      }
+      input_types->push_back(dt);
+    } else {
+      CHECK(!require_shape_type)
+          << "No shape type found for " << input << DumpGraphDef(subgraph_def);
+      // Assuming input type is float if no data provided.
+      input_types->push_back(DT_FLOAT);
+    }
+  }
+
+  for (const string& output : outputs) {
+    DataType dt;
+    TensorShape shape;
+    const bool has_shapetype =
+        GetOutputTensorShapeType(subgraph_def, output, &dt, &shape);
+
+    execute_info->add_graph_output_node_name(output);
+    if (has_shapetype) {
+      RemoteFusedGraphExecuteInfo::TensorShapeTypeProto&
+          tensor_shape_type_proto =
+              *execute_info->add_default_graph_output_tensor_shape();
+      tensor_shape_type_proto.set_dtype(dt);
+      TensorShapeProto& tensor_shape_proto =
+          *tensor_shape_type_proto.mutable_shape();
+      for (const int64 dim : shape.dim_sizes()) {
+        tensor_shape_proto.add_dim()->set_size(dim);
+      }
+      output_types->push_back(dt);
+    } else {
+      CHECK(!require_shape_type)
+          << "No shape type found for " << output << DumpGraphDef(subgraph_def);
+      // Assuming output type is float if no data provided.
+      output_types->push_back(DT_FLOAT);
+    }
+  }
+
+  return Status::OK();
+}
+
+/* static */ Status
+RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
+    const string& node_name, const string& executor_name,
+    const GraphDef& subgraph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs, const bool require_shape_type,
+    Graph* graph, Node** created_node) {
+  CHECK_NOTNULL(graph);
+  CHECK_NOTNULL(created_node);
+
+  RemoteFusedGraphExecuteInfo execute_info;
+  DataTypeVector input_types;
+  DataTypeVector output_types;
+
+  TF_CHECK_OK(RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteInfo(
+      executor_name, subgraph_def, inputs, outputs, require_shape_type,
+      &execute_info, &input_types, &output_types));
+
+  std::vector<NodeBuilder::NodeOut> node_out_list;
+  for (const string& input : inputs) {
+    const TensorId tid = ParseTensorName(input);
+    Node* node = FindMutableNodeByName(tid.first.ToString(), graph);
+    CHECK_NOTNULL(node);
+    node_out_list.emplace_back(node, tid.second);
+  }
+
+  const string execute_info_str = execute_info.SerializeAsString();
+
+  auto builder =
+      NodeBuilder(node_name, "RemoteFusedGraphExecute")
+          .Input(node_out_list)
+          .Attr("Tinputs", input_types)
+          .Attr("Toutputs", output_types)
+          .Attr("serialized_remote_fused_graph_execute_info", execute_info_str);
+
+  TF_RETURN_IF_ERROR(builder.Finalize(graph, created_node));
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::BuildIdentityOpNode(
+    const string& node_name, const string& input_node_name,
+    const int input_node_port, const DataType dt, Graph* graph,
+    Node** created_node) {
+  Node* node = FindMutableNodeByName(input_node_name, graph);
+  CHECK_NOTNULL(node);
+  NodeBuilder::NodeOut node_out(node, input_node_port);
+
+  auto builder =
+      NodeBuilder(node_name, "Identity").Input(node_out).Attr("T", dt);
+
+  TF_RETURN_IF_ERROR(builder.Finalize(graph, created_node));
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+    const std::unordered_set<string>& node_names, const GraphDef& graph_def,
+    std::vector<ClusterInfo>* cluster_infos) {
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
+  std::unordered_set<string> remaining_nodes = node_names;
+
+  while (!remaining_nodes.empty()) {
+    ClusterInfo ci;
+
+    // Determine one cluster nodes
+    std::unordered_set<const Node*> visited;
+    std::deque<const Node*> queue;
+    queue.emplace_back(FindNodeByName(*remaining_nodes.begin(), graph));
+    while (!queue.empty()) {
+      const Node* node = queue.front();
+      CHECK_NOTNULL(node);
+      queue.pop_front();
+      const string& node_name = node->name();
+      if (node_names.count(node_name) > 0) {
+        std::get<0>(ci).emplace(node_name);
+        remaining_nodes.erase(node_name);
+      } else {
+        // Edge of subgraph.  Do nothing.
+        continue;
+      }
+      for (const Node* in : node->in_nodes()) {
+        if (visited.insert(in).second) {
+          queue.push_back(in);
+        }
+      }
+      for (const Node* out : node->out_nodes()) {
+        if (visited.insert(out).second) {
+          queue.push_back(out);
+        }
+      }
+    }
+
+    // Determine one cluster border
+    std::vector<string>& border_inputs = std::get<1>(ci);
+    std::vector<string>& border_outputs = std::get<2>(ci);
+    for (const string& node_name : node_names) {
+      Node* node = FindMutableNodeByName(node_name, &graph);
+      CHECK_NOTNULL(node);
+      int input_count = 0;
+      for (const Edge* in_edge : node->in_edges()) {
+        const Node* src_node = in_edge->src();
+        const bool src_is_outside =
+            node_names.count(src_node->name()) <= 0 && !src_node->IsSource();
+        if (src_is_outside) {
+          const string src_name =
+              strings::StrCat(src_node->name(), ":", in_edge->src_output());
+          CHECK_EQ(1, src_node->num_outputs())
+              << "output count of input border node must be one."
+              << src_node->name();
+          if (std::find(border_inputs.begin(), border_inputs.end(), src_name) ==
+              border_inputs.end()) {
+            border_inputs.emplace_back(src_name);
+          }
+        } else {
+          ++input_count;
+        }
+      }
+      CHECK(input_count == 0 || input_count == node->in_edges().size());
+
+      for (const Edge* out_edge : node->out_edges()) {
+        const Node* dst_node = out_edge->dst();
+        CHECK_NOTNULL(dst_node);
+        const bool dst_is_outside = node_names.count(dst_node->name()) <= 0;
+        const string dst_name =
+            strings::StrCat(node->name(), ":", out_edge->src_output());
+        if (dst_is_outside) {
+          if (dst_node->IsSink()) {
+            CHECK_EQ(1, node->num_outputs())
+                << "If you want to specify output node as subgraph output node "
+                << "the output count of the node must be 1 "
+                << "because that node is replaced by identity node.";
+            const string identity_dst_name =
+                strings::StrCat(node->name(), ":", 0);
+            if (std::find(border_outputs.begin(), border_outputs.end(),
+                          identity_dst_name) == border_outputs.end()) {
+              border_outputs.emplace_back(identity_dst_name);
+            }
+          } else {
+            if (std::find(border_outputs.begin(), border_outputs.end(),
+                          dst_name) == border_outputs.end()) {
+              border_outputs.emplace_back(dst_name);
+            }
+          }
+        }
+      }
+    }
+    cluster_infos->emplace_back(ci);
+    VLOG(1) << DumpCluster(ci);
+  }
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+    const ClusterInfo& cluster, const GraphDef& graph_def,
+    GraphDef* subgraph_def) {
+  const std::unordered_set<string>& node_names = std::get<0>(cluster);
+  const std::unordered_set<string>& border_input_names =
+      BuildNodeSetFromNodeNamesAndPorts(std::get<1>(cluster));
+
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
+
+  for (Node* node : graph.nodes()) {
+    if (node != nullptr && node_names.count(node->name()) <= 0 &&
+        border_input_names.count(node->name()) <= 0 && !node->IsSource() &&
+        !node->IsSink()) {
+      graph.RemoveNode(node);
+    }
+  }
+  graph.ToGraphDef(subgraph_def);
+
+  for (const string& subgraph_input : std::get<1>(cluster)) {
+    const TensorId tid = ParseTensorName(subgraph_input);
+    const string subgraph_input_name = tid.first.ToString();
+    const int subgraph_input_port = tid.second;
+    const NodeDef* node_def = FindNodeDefByName(subgraph_input_name, graph_def);
+    CHECK_NOTNULL(node_def);
+    std::vector<DataType> dt_vec;
+    std::vector<TensorShape> shape_vec;
+    GetOutputTensorShapeType(*node_def, &dt_vec, &shape_vec).IgnoreError();
+    const DataType& dt =
+        dt_vec.empty() ? DT_FLOAT : dt_vec.at(subgraph_input_port);
+    const TensorShape& shape =
+        shape_vec.empty() ? TensorShape({}) : shape_vec.at(subgraph_input_port);
+
+    TF_RETURN_IF_ERROR(ReplaceInputNodeByPlaceHolder(subgraph_input_name, dt,
+                                                     shape, subgraph_def));
+  }
+
+  // sort subgraph_def to align order in graph_def
+  std::unordered_map<string, int> name_to_id_map;
+  for (int i = 0; i < graph_def.node_size(); ++i) {
+    name_to_id_map.emplace(graph_def.node(i).name(), i);
+  }
+  std::sort(subgraph_def->mutable_node()->begin(),
+            subgraph_def->mutable_node()->end(),
+            [&name_to_id_map](const NodeDef& node0, const NodeDef& node1) {
+              CHECK(name_to_id_map.count(node0.name()) > 0);
+              CHECK(name_to_id_map.count(node1.name()) > 0);
+              const int id0 = name_to_id_map.at(node0.name());
+              const int id1 = name_to_id_map.at(node1.name());
+              return id0 < id1;
+            });
+
+  VLOG(1) << DumpGraphDef(*subgraph_def);
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+    const std::vector<string>& border_inputs,
+    const std::vector<string>& border_outputs, const GraphDef& graph_def,
+    ClusterInfo* cluster) {
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, &shape_refiner));
+
+  std::unordered_set<const Node*> visited;
+  std::deque<const Node*> queue;
+  for (const string& output : border_outputs) {
+    const TensorId tid = ParseTensorName(output);
+    const string& output_node_name = tid.first.ToString();
+    for (const Node* node : graph.nodes()) {
+      if (output_node_name == node->name()) {
+        queue.push_back(node);
+        visited.insert(node);
+      }
+    }
+  }
+
+  std::unordered_set<const Node*> border_input_nodes;
+  // propagate visit to parent nodes until input nodes
+  while (!queue.empty()) {
+    const Node* node = queue.front();
+    queue.pop_front();
+    for (const Edge* edge : node->in_edges()) {
+      const Node* src_node = edge->src();
+      CHECK_NOTNULL(src_node);
+      const int src_port = edge->src_output();
+      bool input_found = false;
+      for (const string& input : border_inputs) {
+        const TensorId tid = ParseTensorName(input);
+        if (tid.first.ToString() == src_node->name() &&
+            tid.second == src_port) {
+          input_found = true;
+          border_input_nodes.insert(src_node);
+        }
+      }
+      if (visited.insert(src_node).second) {
+        if (!input_found) {
+          queue.push_back(src_node);
+        }
+      }
+    }
+  }
+
+  for (const Node* node : visited) {
+    if (node != nullptr && !node->IsSource() && !node->IsSink() &&
+        border_input_nodes.count(node) <= 0) {
+      std::get<0>(*cluster).insert(node->name());
+    }
+  }
+  std::get<1>(*cluster) = border_inputs;
+  std::get<2>(*cluster) = border_outputs;
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::FuseCluster(
+    const GraphDef& input_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs,
+    const string& remote_fused_graph_node_name, const ClusterInfo& cluster,
+    const string& remote_graph_executor_name, const bool require_shape_type,
+    GraphDef* output_graph_def) {
+  LOG(INFO) << "Transforming quantized stripped model to a remote fused "
+               "graph execute op by fusing a specified subgraph...";
+
+  CHECK(!remote_graph_executor_name.empty());
+
+  const std::vector<string>& border_inputs = std::get<1>(cluster);
+  const std::vector<string>& border_outputs = std::get<2>(cluster);
+
+  GraphDef subgraph_def;
+  TF_RETURN_IF_ERROR(
+      BuildClusterSubgraphDef(cluster, input_graph_def, &subgraph_def));
+
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_RETURN_IF_ERROR(
+      ImportGraphDef({}, input_graph_def, &graph, &shape_refiner));
+
+  Node* fused_node;
+  TF_RETURN_IF_ERROR(BuildRemoteFusedGraphExecuteOpNode(
+      remote_fused_graph_node_name, remote_graph_executor_name, subgraph_def,
+      border_inputs, border_outputs, require_shape_type, &graph, &fused_node));
+
+  for (const Node* node : graph.nodes()) {
+    for (int i = 0; i < node->num_inputs(); ++i) {
+      const Edge* edge = nullptr;
+      TF_RETURN_IF_ERROR(node->input_edge(i, &edge));
+      for (int j = 0; j < border_outputs.size(); ++j) {
+        const string& output = border_outputs.at(j);
+        const TensorId tid = ParseTensorName(output);
+        const string output_name = tid.first.ToString();
+        Node* src_node = edge->src();
+        if (src_node != nullptr && src_node->name() == output_name &&
+            edge->src_output() == tid.second) {
+          // Source node is replaced by new fused node.
+          Node* dst_node = edge->dst();
+          const int dst_input = edge->dst_input();
+          LOG(INFO) << "Removing existing edge to " << edge->dst()->name()
+                    << " from " << edge->src()->name();
+          graph.RemoveEdge(edge);
+          graph.AddEdge(fused_node, j, dst_node, dst_input);
+        }
+      }
+    }
+  }
+
+  // Replace output nodes by identity nodes which forward outputs from
+  // RemoteFusedGraphExecuteOpNode
+  for (const string& output : outputs) {
+    const TensorId output_tid = ParseTensorName(output);
+    const string output_name = output_tid.first.ToString();
+    for (int i = 0; i < border_outputs.size(); ++i) {
+      const TensorId subgraph_output_tid =
+          ParseTensorName(border_outputs.at(i));
+      const string& subgraph_output_name = subgraph_output_tid.first.ToString();
+      if (output_name == subgraph_output_name) {
+        LOG(INFO) << "As graph output and subgraph output are same, "
+                  << "the graph output node is replaced by identity node";
+        Node* original_output_node = FindMutableNodeByName(output, &graph);
+        CHECK_NOTNULL(original_output_node);
+        CHECK_EQ(1, original_output_node->num_outputs())
+            << "Num outputs should be 1 for " << output << ".";
+        graph.RemoveNode(original_output_node);
+        Node* new_node;
+        TF_RETURN_IF_ERROR(BuildIdentityOpNode(output,
+                                               remote_fused_graph_node_name, i,
+                                               DT_FLOAT, &graph, &new_node));
+        CHECK_NOTNULL(new_node);
+      }
+    }
+  }
+
+  GraphDef result_graph_def;
+
+  graph.ToGraphDef(&result_graph_def);
+
+  ClusterInfo graph_cluster;
+  TF_RETURN_IF_ERROR(
+      BuildClusterByBorder(inputs, outputs, result_graph_def, &graph_cluster));
+
+  // Remove unvisited nodes
+  TF_RETURN_IF_ERROR(BuildClusterSubgraphDef(graph_cluster, result_graph_def,
+                                             output_graph_def));
+
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::FuseRemoteGraphByNodeNames(
+    const GraphDef& input_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs,
+    const string& remote_fused_graph_node_name_prefix,
+    const std::unordered_set<string>& subgraph_nodes,
+    const string& remote_fused_graph_executor_name,
+    const bool require_shape_type, GraphDef* output_graph_def) {
+  std::vector<ClusterInfo> ci_vec;
+  TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      subgraph_nodes, input_graph_def, &ci_vec));
+
+  for (int i = 0; i < ci_vec.size(); ++i) {
+    const string remote_fused_graph_node_name =
+        strings::StrCat(remote_fused_graph_node_name_prefix, "/", i);
+    TF_RETURN_IF_ERROR(FuseCluster(input_graph_def, inputs, outputs,
+                                   remote_fused_graph_node_name, ci_vec.at(i),
+                                   remote_fused_graph_executor_name,
+                                   require_shape_type, output_graph_def));
+  }
+  return Status::OK();
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::FuseRemoteGraphByBorder(
+    const GraphDef& input_graph_def, const std::vector<string>& inputs,
+    const std::vector<string>& outputs,
+    const string& remote_fused_graph_node_name,
+    const std::vector<string>& border_inputs,
+    const std::vector<string>& border_outputs,
+    const string& remote_graph_executor_name, const bool require_shape_type,
+    GraphDef* output_graph_def) {
+  ClusterInfo cluster;
+  TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      border_inputs, border_outputs, input_graph_def, &cluster));
+
+  return FuseCluster(
+      input_graph_def, inputs, outputs, remote_fused_graph_node_name, cluster,
+      remote_graph_executor_name, require_shape_type, output_graph_def);
+}
+
+/* static */ Status RemoteFusedGraphExecuteUtils::ReplaceInputNodeByPlaceHolder(
+    const string& input, const DataType type, const TensorShape& shape,
+    GraphDef* graph_def) {
+  const TensorId tid = ParseTensorName(input);
+  CHECK_EQ(0, tid.second);
+  const string node_name = tid.first.ToString();
+  for (NodeDef& node : *graph_def->mutable_node()) {
+    if (node.name() != node_name) {
+      continue;
+    }
+    if (node.op() == "Placeholder") {
+      return Status::OK();
+    } else {
+      NodeDef placeholder_node;
+      placeholder_node.set_op("Placeholder");
+      placeholder_node.set_name(node_name);
+      AddNodeAttr("dtype", type, &placeholder_node);
+      AddNodeAttr("shape", shape, &placeholder_node);
+      // TODO(satok): Remove once we merge attributes
+      AddOutputTensorShapeType({type}, {shape}, &placeholder_node);
+      node.Clear();
+      node = placeholder_node;
+      return Status::OK();
+    }
+  }
+  return errors::InvalidArgument(
+      strings::StrCat(node_name, " not found for replacement."));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
index a71047d42d9ed27b3d0a67e783477880532ca786..3a792824c5084231d6dd900dd63134fc5d850f7e 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_REMOTE_FUSED_GRAPH_EXECUTE_UTILS_H_
 
 #include <unordered_map>
+#include <unordered_set>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
@@ -38,6 +39,25 @@ class RemoteFusedGraphExecuteUtils {
   // TODO(satok): Use "_output_shapes" to share a spec with other ops
   static constexpr const char* const ATTR_OUTPUT_SHAPES =
       "_default_remote_output_shapes";
+  static constexpr const char* const
+      ATTR_SERIALIZED_REMOTE_FUSED_GRAPH_EXECUTE_INFO =
+          "serialized_remote_fused_graph_execute_info";
+
+  // Argument key strings to fuse a subgraph into RemoteFusedGraphExecuteOp.
+  static constexpr const char* const
+      TRANSFORM_ARG_REMOTE_FUSED_GRAPH_EXECUTOR_NAME =
+          "remote_fused_graph_executor_name";
+  static constexpr const char* const
+      TRANSFORM_ARG_REMOTE_FUSED_GRAPH_NODE_NAME =
+          "remote_fused_graph_node_name";
+  static constexpr const char* const TRANSFORM_ARG_FUSED_NODES = "fused_nodes";
+  static constexpr const char* const TRANSFORM_ARG_BORDER_INPUTS =
+      "border_inputs";
+  static constexpr const char* const TRANSFORM_ARG_BORDER_OUTPUTS =
+      "border_outputs";
+  static constexpr const char* const TRANSFORM_ARG_INPUT_TYPES = "input_types";
+  static constexpr const char* const TRANSFORM_ARG_INPUT_SHAPES =
+      "input_shapes";
 
   using ExecutorBuildFunc = std::function<Status(
       std::unique_ptr<IRemoteFusedGraphExecutor>* executor)>;
@@ -52,9 +72,12 @@ class RemoteFusedGraphExecuteUtils {
   using ExecutorBuildRegistry = std::map<string, ExecutorBuildFunc>;
 
   using TensorShapeType = std::pair<DataType, TensorShape>;
-  using TensorShapeMap =
-      std::unordered_multimap<string /* node name */,
-                              std::pair<int /* port */, TensorShapeType>>;
+  using TensorShapeMap = std::unordered_multimap<string,         // node name
+                                                 std::pair<int,  // port
+                                                           TensorShapeType>>;
+  using ClusterInfo = std::tuple<std::unordered_set<string>,  // node names
+                                 std::vector<string>,         // border inputs
+                                 std::vector<string>>;        // border outputs
 
   // Return registered ExecutorBuildFunc for given name.
   static const ExecutorBuildFunc* GetExecutorBuildFunc(const string& name);
@@ -99,10 +122,14 @@ class RemoteFusedGraphExecuteUtils {
   static Status AddOutputTensorShapeTypeByTensorShapeMap(
       const TensorShapeMap& tensor_shape_map, NodeDef* node_def);
 
-  static Status GetOutputTensorShapeType(const NodeDef& node_def,
+  static Status GetOutputTensorShapeType(AttrSlice attrs,
                                          std::vector<DataType>* data_types,
                                          std::vector<TensorShape>* shapes);
 
+  static bool GetOutputTensorShapeType(const GraphDef& graph_def,
+                                       const string& name_and_port,
+                                       DataType* data_type, TensorShape* shape);
+
   static Status PropagateShapeInference(
       const GraphDef& graph_def,
       const std::vector<std::pair<string, Tensor>>& input_node_info_list,
@@ -124,10 +151,92 @@ class RemoteFusedGraphExecuteUtils {
       std::vector<std::pair<string, Tensor>>* inputs,
       std::vector<string>* outputs);
 
+  static Status BuildAndAddTensorShapes(
+      const std::vector<std::pair<string, Tensor>>& input_tensors,
+      const bool dry_run_inference, GraphDef* graph_def);
+
+  // Build remote fused graph execute info
+  static Status BuildRemoteFusedGraphExecuteInfo(
+      const string& executor_name, const GraphDef& subgraph_def,
+      const std::vector<string>& inputs, const std::vector<string>& outputs,
+      const bool require_shape_type, RemoteFusedGraphExecuteInfo* execute_info,
+      DataTypeVector* input_types, DataTypeVector* output_types);
+
+  // Build remote fused graph execute op node by fusing specified subgraph
+  // as remote fused graph execute info
+  static Status BuildRemoteFusedGraphExecuteOpNode(
+      const string& node_name, const string& executor_name,
+      const GraphDef& subgraph_def, const std::vector<string>& inputs,
+      const std::vector<string>& outputs, const bool require_shape_type,
+      Graph* graph, Node** created_node);
+
+  // Build Identity node to forward remote graph node output
+  static Status BuildIdentityOpNode(const string& node_name,
+                                    const string& input_node_name,
+                                    const int input_node_port,
+                                    const DataType dt, Graph* graph,
+                                    Node** created_node);
+
+  // Create clusters of given nodes
+  static Status ClusterizeNodes(const std::unordered_set<string>& node_names,
+                                const GraphDef& graph_def,
+                                std::vector<ClusterInfo>* cluster_infos);
+
+  // Build GraphDef of a given cluster
+  static Status BuildClusterSubgraphDef(const ClusterInfo& cluster,
+                                        const GraphDef& graph_def,
+                                        GraphDef* subgraph_def);
+
+  // Build a cluster by given border
+  // CAVEAT: The border must be consistent for one cluster.
+  static Status BuildClusterByBorder(const std::vector<string>& border_inputs,
+                                     const std::vector<string>& border_outputs,
+                                     const GraphDef& graph_def,
+                                     ClusterInfo* cluster);
+
+  // Fuse one cluster into a newly created RemoteFusedGraphExecuteOp node.
+  // The subgraph is stored as a graph in RemoteFusedGraphExecuteInfo.
+  // CAVEAT1: This transform strips unvisited nodes with given outputs.
+  // CAVEAT2: If you want to use a graph output as a border output,
+  // that graph output node is replaced by an identity node.  Therefore,
+  // the number of output of the node must be 1.
+  static Status FuseCluster(const GraphDef& input_graph_def,
+                            const std::vector<string>& inputs,
+                            const std::vector<string>& outputs,
+                            const string& remote_fused_graph_node_name,
+                            const ClusterInfo& cluster,
+                            const string& remote_graph_executor_name,
+                            const bool require_shape_type,
+                            GraphDef* output_graph_def);
+
+  // Fuse subgraph of specified nodes
+  static Status FuseRemoteGraphByNodeNames(
+      const GraphDef& input_graph_def, const std::vector<string>& inputs,
+      const std::vector<string>& outputs,
+      const string& remote_fused_graph_node_name_prefix,
+      const std::unordered_set<string>& subgraph_nodes,
+      const string& remote_fused_graph_executor_name,
+      const bool require_shape_type, GraphDef* output_graph_def);
+
+  // Fuse subgraph of specified border
+  static Status FuseRemoteGraphByBorder(
+      const GraphDef& input_graph_def, const std::vector<string>& inputs,
+      const std::vector<string>& outputs,
+      const string& remote_fused_graph_node_name,
+      const std::vector<string>& border_inputs,
+      const std::vector<string>& border_outputs,
+      const string& remote_graph_executor_name, const bool require_shape_type,
+      GraphDef* output_graph_def);
+
  private:
   static void EmplaceTensorShapeType(const string& name, const Tensor& tensor,
                                      TensorShapeMap* tensor_shape_map);
 
+  static Status ReplaceInputNodeByPlaceHolder(const string& input,
+                                              const DataType type,
+                                              const TensorShape& shape,
+                                              GraphDef* graph_def);
+
   static ExecutorBuildRegistry* GetExecutorBuildRegistry();
 
   TF_DISALLOW_COPY_AND_ASSIGN(RemoteFusedGraphExecuteUtils);
diff --git a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
index 52afa5dde11cb9f38c479678ba6c58d216f69022..581b61a625bfc284b056b1328feda7e6c4fa1015 100644
--- a/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_execute_utils_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -23,10 +22,13 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
+namespace {
 
-constexpr const char* const NAME_A = "a";
-constexpr const char* const NAME_B = "b";
-constexpr const char* const NAME_A_PLUS_B = "a_plus_b";
+using ClusterInfo = RemoteFusedGraphExecuteUtils::ClusterInfo;
+
+constexpr const char* const NAME_A = "A";
+constexpr const char* const NAME_B = "B";
+constexpr const char* const NAME_A_PLUS_B = "A_PLUS_B";
 constexpr float NODE_A_VAL = 2.0f;
 constexpr float NODE_B_VAL = 3.0f;
 constexpr float VALUE_TOLERANCE_FLOAT = 1e-8f;
@@ -41,9 +43,101 @@ static NodeDef* GetNodeDef(const string& name, GraphDef* def) {
   return nullptr;
 }
 
+class FuseRemoteGraphMultipleAddOpsTest : public ::testing::Test {
+ protected:
+  void SetUp() final {
+    TF_ASSERT_OK(
+        RemoteFusedGraphExecuteOpTestUtils::BuildMultipleAddGraph(&graph_def_));
+  }
+
+  void TearDown() final {}
+
+  Status FuseByInOut() {
+    // Feed output shapes and types
+    RemoteFusedGraphExecuteUtils::TensorShapeMap tensor_shape_map;
+    GraphDef graph_def_with_shapetype = graph_def_;
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::BuildAndAddTensorShapes(
+        input_tensors_, /*dry_run_inference*/ true, &graph_def_with_shapetype));
+
+    return RemoteFusedGraphExecuteUtils::FuseRemoteGraphByBorder(
+        graph_def_with_shapetype, inputs_, outputs_,
+        "remote_fused_graph_node_names", subgraph_input_names_,
+        subgraph_output_names_, "remote_graph_executor_name",
+        /*require_shape_type=*/true, &result_graph_def_);
+  }
+
+  Status FuseByNodes() {
+    return RemoteFusedGraphExecuteUtils::FuseRemoteGraphByNodeNames(
+        graph_def_, inputs_, outputs_, "remote_fused_graph_node_names",
+        subgraph_node_names_, "remote_graph_executor_name",
+        /*require_shape_type=*/false, &result_graph_def_);
+  }
+
+ public:
+  const std::vector<std::pair<string, Tensor>> input_tensors_{
+      {"A", {DT_FLOAT, {1, 1, 1, 1}}}};
+  const std::vector<string> inputs_{"A"};
+  const std::vector<string> outputs_{"K"};
+  GraphDef graph_def_;
+  GraphDef result_graph_def_;
+  std::vector<string> subgraph_input_names_;
+  std::vector<string> subgraph_output_names_;
+  std::unordered_set<string> subgraph_node_names_;
+};
+
+void SetSubgraphArguments(const std::vector<string>& input_names,
+                          const std::vector<string>& output_names,
+                          FuseRemoteGraphMultipleAddOpsTest* fixture) {
+  for (const string& input_name : input_names) {
+    fixture->subgraph_input_names_.emplace_back(input_name);
+  }
+
+  fixture->subgraph_output_names_ = output_names;
+}
+
+template <typename T>
+static string IterToString(const T& set) {
+  string out;
+  for (const string& val : set) {
+    if (!out.empty()) {
+      out += ", ";
+    }
+    out += val;
+  }
+  return out;
+}
+
+static string SummarizeGraphDef(const GraphDef& graph_def) {
+  string out;
+  for (const NodeDef& node : graph_def.node()) {
+    out += strings::StrCat("node: ", node.name(), "\n    input: ");
+    for (const string& input : node.input()) {
+      out += strings::StrCat(input, ", ");
+    }
+    out += "\n";
+  }
+  return out;
+}
+
+static string DumpInOutNames(const std::vector<ClusterInfo>& ci_vec) {
+  for (int i = 0; i < ci_vec.size(); ++i) {
+    LOG(INFO) << "Cluster(" << i << ")";
+    LOG(INFO) << "input: " << IterToString(std::get<1>(ci_vec.at(i)));
+    LOG(INFO) << "output: " << IterToString(std::get<2>(ci_vec.at(i)));
+  }
+  return "";
+}
+
+static void ClearCluster(ClusterInfo* cluster) {
+  std::get<0>(*cluster).clear();
+  std::get<1>(*cluster).clear();
+  std::get<2>(*cluster).clear();
+}
+
 TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphA) {
-  GraphDef def = RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
-      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
+  GraphDef def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
   std::pair<string, Tensor> input_node_info;
   input_node_info.first = NAME_A;
   input_node_info.second = Tensor(DT_FLOAT, {});
@@ -62,8 +156,9 @@ TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphA) {
 }
 
 TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphAUninitialized) {
-  GraphDef def = RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
-      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
+  GraphDef def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
   std::pair<string, Tensor> input_node_info;
   input_node_info.first = NAME_A;
   input_node_info.second = Tensor(DT_FLOAT, {});
@@ -81,8 +176,9 @@ TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphAUninitialized) {
 }
 
 TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphAB) {
-  GraphDef def = RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
-      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
+  GraphDef def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
   std::pair<string, Tensor> input_node_info_a;
   input_node_info_a.first = NAME_A;
   input_node_info_a.second = Tensor(DT_FLOAT, {});
@@ -104,7 +200,7 @@ TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphAB) {
 }
 
 TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphForAllNodes) {
-  // Set Node "a" as an input with value (= 1.0f)
+  // Set Node "A" as an input with value (= 1.0f)
   std::pair<string, Tensor> input_node_info_a;
   input_node_info_a.first = NAME_A;
   input_node_info_a.second = Tensor(DT_FLOAT, {});
@@ -114,8 +210,9 @@ TEST(RemoteFusedGraphExecuteUtils, DryRunAddGraphForAllNodes) {
   const std::vector<std::pair<string, Tensor>> inputs{input_node_info_a};
   RemoteFusedGraphExecuteUtils::TensorShapeMap tensor_shape_map;
 
-  GraphDef def = RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
-      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
+  GraphDef def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
 
   // dryrun
   const Status status = RemoteFusedGraphExecuteUtils::DryRunInferenceForAllNode(
@@ -156,8 +253,9 @@ TEST(RemoteFusedGraphExecuteUtils, PropagateAndBuildTensorShapeMap) {
                                                       input_node_info_b};
 
   RemoteFusedGraphExecuteUtils::TensorShapeMap tensor_shape_map;
-  GraphDef def = RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
-      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B);
+  GraphDef def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
   ImportGraphDefOptions opts;
   Graph graph(OpRegistry::Global());
   ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
@@ -226,4 +324,299 @@ TEST(RemoteFusedGraphExecuteUtils, PropagateAndBuildTensorShapeMap) {
   }
 }
 
+TEST(RemoteFusedGraphExecuteUtils,
+     BuildRemoteFusedGraphExecuteInfoWithShapeInference) {
+  // Build inputs
+  std::pair<string, Tensor> input_node_info_a;
+  input_node_info_a.first = NAME_A;
+  input_node_info_a.second = Tensor(DT_FLOAT, {});
+  input_node_info_a.second.scalar<float>()() = NODE_A_VAL;
+  std::pair<string, Tensor> input_node_info_b;
+  input_node_info_b.first = NAME_B;
+  input_node_info_b.second = Tensor(DT_FLOAT, {});
+  input_node_info_b.second.scalar<float>()() = NODE_B_VAL;
+  const std::vector<std::pair<string, Tensor>> input_tensors{input_node_info_a,
+                                                             input_node_info_b};
+  const std::vector<string> inputs{NAME_A, NAME_B};
+
+  // Build outputs
+  const std::vector<string> outputs = {NAME_A_PLUS_B};
+
+  GraphDef def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildAndAddTensorShapes(
+      input_tensors, /*dry_run_inference*/ true, &def));
+
+  RemoteFusedGraphExecuteInfo execute_info0;
+  DataTypeVector input_types0;
+  DataTypeVector output_types0;
+
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteInfo(
+      "executor", def, inputs, outputs, /*require_shape_type=*/true,
+      &execute_info0, &input_types0, &output_types0));
+
+  EXPECT_EQ(inputs.size(),
+            execute_info0.default_graph_input_tensor_shape_size());
+  EXPECT_EQ(outputs.size(),
+            execute_info0.default_graph_output_tensor_shape_size());
+  EXPECT_EQ(inputs.size(), input_types0.size());
+  EXPECT_EQ(outputs.size(), output_types0.size());
+
+  EXPECT_EQ(def.node_size(), execute_info0.remote_graph().node_size());
+}
+
+TEST(RemoteFusedGraphExecuteUtils, BuildRemoteFusedGraphExecuteOpNode) {
+  const std::vector<string> inputs{NAME_A, NAME_B};
+
+  // Build outputs
+  const std::vector<string> outputs = {NAME_A_PLUS_B};
+
+  GraphDef def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildAddGraph(
+      NAME_A, NODE_A_VAL, NAME_B, NODE_B_VAL, NAME_A_PLUS_B, &def));
+
+  Graph graph(OpRegistry::Global());
+  ShapeRefiner shape_refiner(graph.versions().producer(), graph.op_registry());
+  TF_ASSERT_OK(ImportGraphDef({}, def, &graph, &shape_refiner));
+
+  Node* node;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildRemoteFusedGraphExecuteOpNode(
+      "fused_name", "executor", def, inputs, outputs,
+      /*require_shape_type=*/false, &graph, &node));
+}
+
+TEST(RemoteFusedGraphExecuteUtils, ExtractSubgraphNodes) {
+  GraphDef graph_def;
+  TF_ASSERT_OK(
+      RemoteFusedGraphExecuteOpTestUtils::BuildMultipleAddGraph(&graph_def));
+  ClusterInfo cluster;
+  const std::unordered_set<string>& node_names = std::get<0>(cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"H", "I"}, {"J"}, graph_def, &cluster));
+  EXPECT_EQ(1, node_names.size()) << IterToString(node_names);
+
+  ClearCluster(&cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"F", "C", "G"}, {"J"}, graph_def, &cluster));
+  EXPECT_EQ(3, node_names.size()) << IterToString(node_names);
+
+  ClearCluster(&cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"A", "B", "C", "D", "E"}, {"J"}, graph_def, &cluster));
+  EXPECT_EQ(5, node_names.size()) << IterToString(node_names);
+
+  ClearCluster(&cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"A", "B", "C", "D", "E"}, {"K"}, graph_def, &cluster));
+  EXPECT_EQ(6, node_names.size()) << IterToString(node_names);
+
+  ClearCluster(&cluster);
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      {"F"}, {"H"}, graph_def, &cluster));
+  EXPECT_EQ(2, node_names.size()) << IterToString(node_names);
+}
+
+TEST(RemoteFusedGraphExecuteUtils, ClusterizeNodes) {
+  GraphDef graph_def;
+  TF_ASSERT_OK(
+      RemoteFusedGraphExecuteOpTestUtils::BuildMultipleAddGraph(&graph_def));
+
+  std::vector<ClusterInfo> ci_vec;
+  TF_ASSERT_OK(
+      RemoteFusedGraphExecuteUtils::ClusterizeNodes({"J"}, graph_def, &ci_vec));
+  ASSERT_EQ(1, ci_vec.size());
+  EXPECT_EQ(2, std::get<1>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+  EXPECT_EQ(1, std::get<2>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+
+  ci_vec.clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      {"H", "I", "J"}, graph_def, &ci_vec));
+  ASSERT_EQ(1, ci_vec.size());
+  EXPECT_EQ(3, std::get<1>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+  EXPECT_EQ(1, std::get<2>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+
+  ci_vec.clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      {"F", "C", "G", "H", "I", "J"}, graph_def, &ci_vec));
+  ASSERT_EQ(1, ci_vec.size());
+  EXPECT_EQ(4, std::get<1>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+  EXPECT_EQ(2, std::get<2>(ci_vec.at(0)).size()) << DumpInOutNames(ci_vec);
+
+  ci_vec.clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      {"A", "B", "C", "D", "E"}, graph_def, &ci_vec));
+  ASSERT_EQ(5, ci_vec.size());
+
+  ci_vec.clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::ClusterizeNodes(
+      {"A", "B", "D", "E", "F", "G"}, graph_def, &ci_vec));
+  ASSERT_EQ(2, ci_vec.size());
+}
+
+TEST(RemoteFusedGraphExecuteUtils, BuildSubgraphDefByInOut) {
+  GraphDef graph_def;
+  TF_ASSERT_OK(
+      RemoteFusedGraphExecuteOpTestUtils::BuildMultipleAddGraph(&graph_def));
+
+  ClusterInfo cluster;
+  GraphDef subgraph_def;
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"H", "I"}, std::vector<string>{"J"}, graph_def,
+      &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(3, subgraph_def.node_size());
+
+  ClearCluster(&cluster);
+  subgraph_def.Clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"F", "C", "G"}, std::vector<string>{"J"}, graph_def,
+      &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(6, subgraph_def.node_size());
+
+  ClearCluster(&cluster);
+  subgraph_def.Clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"A", "B", "C", "D", "E"}, std::vector<string>{"J"},
+      graph_def, &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(10, subgraph_def.node_size());
+
+  ClearCluster(&cluster);
+  subgraph_def.Clear();
+
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"A", "B", "C", "D", "E"}, std::vector<string>{"K"},
+      graph_def, &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(11, subgraph_def.node_size());
+
+  ClearCluster(&cluster);
+  subgraph_def.Clear();
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterByBorder(
+      std::vector<string>{"F"}, std::vector<string>{"H"}, graph_def, &cluster));
+  TF_ASSERT_OK(RemoteFusedGraphExecuteUtils::BuildClusterSubgraphDef(
+      cluster, graph_def, &subgraph_def));
+  EXPECT_EQ(3, subgraph_def.node_size());
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByInOut_HI_J) {
+  SetSubgraphArguments(std::vector<string>{"H", "I"}, std::vector<string>{"J"},
+                       this);
+
+  TF_ASSERT_OK(FuseByInOut());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(11, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByInOut_FCG_J) {
+  SetSubgraphArguments(std::vector<string>{"F", "C", "G"},
+                       std::vector<string>{"J"}, this);
+
+  TF_ASSERT_OK(FuseByInOut());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(9, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByInOut_ABCDE_J) {
+  SetSubgraphArguments(std::vector<string>{"A", "B", "C", "D", "E"},
+                       std::vector<string>{"J"}, this);
+
+  TF_ASSERT_OK(FuseByInOut());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(8, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByInOut_ABCDE_K) {
+  SetSubgraphArguments(std::vector<string>{"A", "B", "C", "D", "E"},
+                       std::vector<string>{"K"}, this);
+
+  TF_ASSERT_OK(FuseByInOut());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(7, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_H) {
+  subgraph_node_names_ = {"H"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(11, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_HIJ) {
+  subgraph_node_names_ = {"H", "I", "J"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(9, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_CFGHIJ) {
+  subgraph_node_names_ = {"C", "F", "G", "H", "I", "J"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(6, result_graph_def_.node_size())
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_ABCDEFGHIJ) {
+  subgraph_node_names_ = {"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(3, result_graph_def_.node_size())  // "A", "RFG", "K"
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsTest, FuseSubgraphByNodes_ABCDEFGHIJK) {
+  subgraph_node_names_ = {"A", "B", "C", "D", "E", "F",
+                          "G", "H", "I", "J", "K"};
+
+  TF_ASSERT_OK(FuseByNodes());
+
+  EXPECT_EQ(11, graph_def_.node_size());
+  EXPECT_EQ(3, result_graph_def_.node_size())  // "A", "RFG", "K"
+      << "=== Before: \n"
+      << SummarizeGraphDef(graph_def_) << "\n\n\n=== After: \n"
+      << SummarizeGraphDef(result_graph_def_);
+}
+
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8742214e17ea85931e4a72301e4cffe6b138552f
--- /dev/null
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform.cc
@@ -0,0 +1,163 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Wraps the hexagon rewriter in a transform so it can be used as part of the
+// graph transform tool.
+// A usage example, based on inception v3 model:
+/*
+bazel build tensorflow/tools/graph_transforms:transform_graph
+
+
+// Specify remote graph by node names
+bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
+--in_graph=/tmp/tensorflow_inception_v3_stripped_optimized_quantized.pb \
+--out_graph=\
+/tmp/tensorflow_inception_v3_stripped_optimized_quantized_fused_hexagon.pb \
+--inputs='Mul' \
+--outputs='softmax' \
+--transforms='\
+fuse_remote_graph(
+input_types="float" \
+input_shapes="1,299,299,3" \
+fused_nodes="NodeA,NodeB,NodeC",
+remote_fused_graph_executor_name="executor" \
+remote_fused_graph_node_name="node_name" \
+)'
+
+// Specify remote graph by border inputs and outputs
+bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
+--in_graph=/tmp/tensorflow_inception_v3_stripped_optimized_quantized.pb \
+--out_graph=\
+/tmp/tensorflow_inception_v3_stripped_optimized_quantized_fused_hexagon.pb \
+--inputs='Mul' \
+--outputs='softmax' \
+--transforms='\
+fuse_remote_graph(
+input_types="float" \
+input_shapes="1,299,299,3" \
+border_inputs="NodeA:0,NodeB:0" \
+border_outputs="NodeC" \
+remote_fused_graph_executor_name="executor" \
+remote_fused_graph_node_name="node_name" \
+)'
+*/
+
+#include <unordered_set>
+
+#include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
+#include "tensorflow/tools/graph_transforms/transform_utils.h"
+
+namespace tensorflow {
+namespace graph_transforms {
+Status FuseRemoteGraph(const GraphDef& input_graph_def,
+                       const TransformFuncContext& context,
+                       GraphDef* output_graph_def) {
+  GraphDef mutable_input_graph_def = input_graph_def;
+
+  const std::vector<string>& inputs = context.input_names;
+  const std::vector<string>& outputs = context.output_names;
+
+  string input_types_str;
+  string input_shapes_str;
+  TF_RETURN_IF_ERROR(context.GetOneStringParameter(
+      RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_INPUT_TYPES, "",
+      &input_types_str));
+  TF_RETURN_IF_ERROR(context.GetOneStringParameter(
+      RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_INPUT_SHAPES, "",
+      &input_shapes_str));
+
+  if (!input_types_str.empty()) {
+    const std::vector<string> input_types_strs =
+        str_util::Split(input_types_str, ",");
+    const std::vector<string> input_shapes_strs =
+        str_util::Split(input_shapes_str, ":");
+    CHECK_EQ(inputs.size(), input_types_strs.size());
+    CHECK_EQ(inputs.size(), input_shapes_strs.size());
+    std::vector<std::pair<string, Tensor>> input_tensors;
+    for (int i = 0; i < inputs.size(); ++i) {
+      const string& name = inputs.at(i);
+      std::vector<int64> dims;
+      CHECK(str_util::SplitAndParseAsInts(input_shapes_strs.at(i), ',', &dims));
+      DataType data_type;
+      CHECK(DataTypeFromString(input_types_strs.at(i), &data_type))
+          << "\"" << input_types_strs.at(i) << "\" was an invalid type";
+      input_tensors.emplace_back(
+          std::make_pair(name, Tensor(data_type, TensorShape(dims))));
+    }
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::BuildAndAddTensorShapes(
+        input_tensors, /*dry_run_inference=*/true, &mutable_input_graph_def));
+  }
+
+  string fused_nodes_str;
+  string border_inputs_str;
+  string border_outputs_str;
+  string remote_graph_executor_name;
+  string remote_fused_graph_node_name;
+
+  TF_RETURN_IF_ERROR(context.GetOneStringParameter(
+      RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSED_NODES, "",
+      &fused_nodes_str));
+  TF_RETURN_IF_ERROR(context.GetOneStringParameter(
+      RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_INPUTS, "",
+      &border_inputs_str));
+  TF_RETURN_IF_ERROR(context.GetOneStringParameter(
+      RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_OUTPUTS, "",
+      &border_outputs_str));
+  TF_RETURN_IF_ERROR(context.GetOneStringParameter(
+      RemoteFusedGraphExecuteUtils::
+          TRANSFORM_ARG_REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
+      "", &remote_graph_executor_name));
+  TF_RETURN_IF_ERROR(context.GetOneStringParameter(
+      RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_REMOTE_FUSED_GRAPH_NODE_NAME,
+      "", &remote_fused_graph_node_name));
+
+  CHECK(!remote_graph_executor_name.empty());
+
+  const bool require_shape_type = !input_types_str.empty();
+  if (!fused_nodes_str.empty()) {
+    const std::vector<string>& fused_node_name_vector =
+        str_util::Split(fused_nodes_str, ",");
+    const std::unordered_set<string> fused_node_names(
+        fused_node_name_vector.begin(), fused_node_name_vector.end());
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::FuseRemoteGraphByNodeNames(
+        mutable_input_graph_def, inputs, outputs, remote_fused_graph_node_name,
+        fused_node_names, remote_graph_executor_name, require_shape_type,
+        output_graph_def));
+  } else if (!border_inputs_str.empty() && !border_outputs_str.empty()) {
+    const std::vector<string> border_inputs =
+        str_util::Split(border_inputs_str, ",");
+    const std::vector<string> border_outputs =
+        str_util::Split(border_outputs_str, ",");
+    for (int i = 0; i < border_inputs.size(); ++i) {
+      VLOG(2) << "Border Input(" << i << "): " << border_inputs.at(i);
+    }
+    for (int i = 0; i < border_outputs.size(); ++i) {
+      VLOG(2) << "Border Output(" << i << "): " << border_outputs.at(i);
+    }
+    TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::FuseRemoteGraphByBorder(
+        mutable_input_graph_def, inputs, outputs, remote_fused_graph_node_name,
+        border_inputs, border_outputs, remote_graph_executor_name,
+        require_shape_type, output_graph_def));
+  } else {
+    CHECK(false) << "Fuse targets are not specified.";
+  }
+
+  return Status::OK();
+}
+
+REGISTER_GRAPH_TRANSFORM("fuse_remote_graph", FuseRemoteGraph);
+
+}  // namespace graph_transforms
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9e061437a997d9d0f3c9e4fcc6595e82489927a0
--- /dev/null
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
@@ -0,0 +1,206 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
+#include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
+#include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/tools/graph_transforms/transform_utils.h"
+
+namespace tensorflow {
+namespace graph_transforms {
+
+// Declared here so we don't have to put it in a public header.
+Status FuseRemoteGraph(const GraphDef& input_graph_def,
+                       const TransformFuncContext& context,
+                       GraphDef* output_graph_def);
+
+namespace {
+
+constexpr const char* const REMOTE_FUSED_GRAPH_EXECUTOR_NAME =
+    "remote_fused_graph_executor_name";
+constexpr const char* const REMOTE_FUSED_GRAPH_NODE_NAME =
+    "remote_fused_graph_node_name";
+
+class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
+ protected:
+  void SetUp() final {
+    TF_ASSERT_OK(RemoteFusedGraphExecuteOpTestUtils::BuildMultipleAddGraph(
+        &input_graph_def_));
+  }
+
+  void TearDown() final {}
+
+  Status Fuse() {
+    TransformFuncContext context;
+    context.input_names = inputs_;
+    context.output_names = outputs_;
+
+    if (!input_types_.empty()) {
+      context.params.insert(std::pair<string, std::vector<string>>(
+          {RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_INPUT_TYPES,
+           {input_types_}}));
+    }
+    if (!input_shapes_.empty()) {
+      context.params.insert(std::pair<string, std::vector<string>>(
+          {RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_INPUT_SHAPES,
+           {input_shapes_}}));
+    }
+    if (!fused_node_names_str_.empty()) {
+      context.params.insert(std::pair<string, std::vector<string>>(
+          {RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_FUSED_NODES,
+           {fused_node_names_str_}}));
+    }
+
+    if (!border_inputs_str_.empty()) {
+      context.params.insert(std::pair<string, std::vector<string>>(
+          {RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_INPUTS,
+           {border_inputs_str_}}));
+    }
+    if (!border_outputs_str_.empty()) {
+      context.params.insert(std::pair<string, std::vector<string>>(
+          {RemoteFusedGraphExecuteUtils::TRANSFORM_ARG_BORDER_OUTPUTS,
+           {border_outputs_str_}}));
+    }
+
+    context.params.insert(std::pair<string, std::vector<string>>(
+        {RemoteFusedGraphExecuteUtils::
+             TRANSFORM_ARG_REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
+         {REMOTE_FUSED_GRAPH_EXECUTOR_NAME}}));
+    context.params.insert(std::pair<string, std::vector<string>>(
+        {RemoteFusedGraphExecuteUtils::
+             TRANSFORM_ARG_REMOTE_FUSED_GRAPH_NODE_NAME,
+         {REMOTE_FUSED_GRAPH_NODE_NAME}}));
+
+    return FuseRemoteGraph(input_graph_def_, context, &output_graph_def_);
+  }
+
+  void SetInputShapeType() {
+    input_types_ = "float";
+    input_shapes_ = "1,1,1,1";
+  }
+
+  void CheckGraph(int expected_node_count, int expected_cluster_count) {
+    EXPECT_EQ(expected_node_count, output_graph_def_.node_size());
+
+    int cluster_count = 0;
+    for (const NodeDef& node_def : output_graph_def_.node()) {
+      const string& name = node_def.name();
+      if (StringPiece(name).starts_with(REMOTE_FUSED_GRAPH_NODE_NAME)) {
+        ++cluster_count;
+        RemoteFusedGraphExecuteInfo info;
+        string serialized_proto;
+        TF_ASSERT_OK(
+            GetNodeAttr(node_def,
+                        RemoteFusedGraphExecuteUtils::
+                            ATTR_SERIALIZED_REMOTE_FUSED_GRAPH_EXECUTE_INFO,
+                        &serialized_proto));
+        info.ParseFromString(serialized_proto);
+        CHECK_EQ(REMOTE_FUSED_GRAPH_EXECUTOR_NAME, info.executor_name());
+      }
+    }
+    EXPECT_EQ(expected_cluster_count, cluster_count);
+  }
+
+ public:
+  const std::vector<string> inputs_{"A"};
+  const std::vector<string> outputs_{"K"};
+  GraphDef input_graph_def_;
+  string input_types_;
+  string input_shapes_;
+  GraphDef output_graph_def_;
+  string fused_node_names_str_;
+  string border_inputs_str_;
+  string border_outputs_str_;
+};
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByNodesWithShapeType_HIJ) {
+  SetInputShapeType();
+  fused_node_names_str_ = "H,I,J";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(9, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByNodesWithoutShapeType_HIJ) {
+  fused_node_names_str_ = "H,I,J";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(9, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByNodesWithShapeType_ABCDEFGHIJK) {
+  SetInputShapeType();
+  fused_node_names_str_ = "A,B,C,D,E,F,G,H,I,J,K";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(3, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByNodesWithoutShapeType_ABCDEFGHIJK) {
+  fused_node_names_str_ = "A,B,C,D,E,F,G,H,I,J,K";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(3, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByBorderWithShapeType_FCG_J) {
+  SetInputShapeType();
+  border_inputs_str_ = "F:0,C:0,G";
+  border_outputs_str_ = "J:0";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(9, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByBorderWithoutShapeType_FCG_J) {
+  border_inputs_str_ = "F:0,C:0,G";
+  border_outputs_str_ = "J:0";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(9, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByBorderWithShapeType_ABCDE_K) {
+  SetInputShapeType();
+  border_inputs_str_ = "A,B,C,D,E";
+  border_outputs_str_ = "K";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(7, 1);
+}
+
+TEST_F(FuseRemoteGraphMultipleAddOpsRewriterTest,
+       FuseRemoteGraphByBorderWithoutShapeType_ABCDE_K) {
+  border_inputs_str_ = "A,B,C,D,E";
+  border_outputs_str_ = "K";
+  TF_ASSERT_OK(Fuse());
+  CheckGraph(7, 1);
+}
+
+}  // namespace
+}  // namespace graph_transforms
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/repeat_dataset_op.cc b/tensorflow/core/kernels/repeat_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8fc59e1779113dfc61d4de1a319b5775f8eb641b
--- /dev/null
+++ b/tensorflow/core/kernels/repeat_dataset_op.cc
@@ -0,0 +1,174 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class RepeatDatasetOp : public OpKernel {
+ public:
+  explicit RepeatDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new RepeatDatasetOp::Dataset, insert it in the step-local
+    // container, and return it as the output.
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    const Tensor* count_t;
+    OP_REQUIRES_OK(ctx, ctx->input("count", &count_t));
+    const int64 count = count_t->flat<int64>()(0);
+
+    DatasetBase* dataset = new Dataset(count, input);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(int64 count, const DatasetBase* input)
+        : count_(count), input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      if (count_ < 0) {
+        return std::unique_ptr<IteratorBase>(new ForeverIterator(this));
+      } else if (count_ == 0) {
+        return std::unique_ptr<IteratorBase>(new EmptyIterator(this));
+      } else {
+        return std::unique_ptr<IteratorBase>(new FiniteIterator(this));
+      }
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override { return "RepeatDatasetOp::Dataset"; }
+
+   private:
+    class EmptyIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit EmptyIterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset) {}
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+    };
+
+    class FiniteIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit FiniteIterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            i_(0),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
+        while (i_ < dataset()->count_) {
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          if (!*end_of_sequence) {
+            return Status::OK();
+          }
+          ++i_;
+          input_impl_ = dataset()->input_->MakeIterator();
+        }
+        *end_of_sequence = true;
+        input_impl_.reset();
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    class ForeverIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit ForeverIterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset), input_impl_(nullptr) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
+        do {
+          if (!input_impl_) {
+            input_impl_ = dataset()->input_->MakeIterator();
+            TF_RETURN_IF_ERROR(
+                input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+            // If the first call to GetNext() fails because the end of
+            // sequence has been reached, we return an OutOfRange
+            // error to terminate the iteration. (Otherwise, this
+            // iterator would loop infinitely and never produce a
+            // value.)
+            if (!*end_of_sequence) {
+              return Status::OK();
+            } else {
+              input_impl_.reset();
+              return errors::OutOfRange(
+                  "Attempted to repeat an empty dataset infinitely.");
+            }
+          } else {
+            TF_RETURN_IF_ERROR(
+                input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+            if (!*end_of_sequence) {
+              return Status::OK();
+            } else {
+              input_impl_.reset();
+            }
+          }
+        } while (true);
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const int64 count_;
+    const DatasetBase* const input_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("RepeatDataset").Device(DEVICE_CPU),
+                        RepeatDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index d7b063e0c184ac5a23a993c868be0d7b0928b7e3..5131bce448e519b6997fee19e3dfaf9e732858d3 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -235,9 +235,9 @@ inline void interpolate_with_caching(
       const T* y_ptr_3 = input_b_ptr + y_wai.index_3 * in_row_width;
       if (num_channels == 3) {
         // Manually unroll case of 3 channels.
-        float cached_value_0[4];
-        float cached_value_1[4];
-        float cached_value_2[4];
+        float cached_value_0[4] = {0};
+        float cached_value_1[4] = {0};
+        float cached_value_2[4] = {0};
         for (int64 x = 0; x < resizer_state.out_width; ++x) {
           const WeightsAndIndices& x_wai = x_wais[x];
           // Shift values in cached_value_* to fill first 'advance' values.
@@ -316,7 +316,7 @@ inline void interpolate_with_caching(
         }
       } else {
         for (int64 c = 0; c < num_channels; ++c) {
-          float cached_value[4];
+          float cached_value[4] = {0};
           for (int64 x = 0; x < resizer_state.out_width; ++x) {
             const WeightsAndIndices& x_wai = x_wais[x];
             // Shift values in cached_value to fill first 'advance' values.
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
index a38fb222237f9f5938f6b68fd909fc9fb768fc66..bfd29b7ec89e6a2d0e2757db31b707be70d12c1d 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc
@@ -16,6 +16,8 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 #define EIGEN_USE_THREADS
 
+#include "tensorflow/core/kernels/resize_nearest_neighbor_op.h"
+
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -27,13 +29,10 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#if GOOGLE_CUDA
-#include "tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h"
-#endif  // GOOGLE_CUDA
-
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
 template <typename Device, typename T>
 class ResizeNearestNeighborOp : public OpKernel {
@@ -54,22 +53,27 @@ class ResizeNearestNeighborOp : public OpKernel {
                 errors::InvalidArgument("nearest neighbor requires max height "
                                         "& width of 2^24"));
 
+    // Return if the output is empty.
+    if (st.output->NumElements() == 0) return;
+
     typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
     typename TTypes<T, 4>::Tensor output_data = st.output->tensor<T, 4>();
 
-    for (int b = 0; b < st.batch_size; ++b) {
-      for (int y = 0; y < st.out_height; ++y) {
-        const int64 in_y =
-            std::min(static_cast<int64>(floorf(y * st.height_scale)),
-                     (st.in_height - 1));
-        for (int x = 0; x < st.out_width; ++x) {
-          const int64 in_x =
-              std::min(static_cast<int64>(floorf(x * st.width_scale)),
-                       (st.in_width - 1));
-          std::copy_n(&input_data(b, in_y, in_x, 0), st.channels,
-                      &output_data(b, y, x, 0));
-        }
-      }
+    bool status;
+    if (align_corners_) {
+      status =
+          functor::ResizeNearestNeighbor<Device, T, /*align_corners=*/true>()(
+              context->eigen_device<Device>(), input_data, st.height_scale,
+              st.width_scale, output_data);
+    } else {
+      status =
+          functor::ResizeNearestNeighbor<Device, T, /*align_corners=*/false>()(
+              context->eigen_device<Device>(), input_data, st.height_scale,
+              st.width_scale, output_data);
+    }
+    if (!status) {
+      context->SetStatus(
+          errors::Internal("Failed launching ResizeNearestNeighbor"));
     }
   }
 
@@ -77,6 +81,41 @@ class ResizeNearestNeighborOp : public OpKernel {
   bool align_corners_;
 };
 
+// Partial specialization of ResizeNearestNeighbor functor for a CPUDevice.
+namespace functor {
+template <typename T, bool align_corners>
+struct ResizeNearestNeighbor<CPUDevice, T, align_corners> {
+  bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
+    const int batch_size = input.dimension(0);
+    const int64 in_height = input.dimension(1);
+    const int64 in_width = input.dimension(2);
+    const int channels = input.dimension(3);
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    for (int b = 0; b < batch_size; ++b) {
+      for (int y = 0; y < out_height; ++y) {
+        const int64 in_y = std::min(
+            (align_corners) ? static_cast<int64>(roundf(y * height_scale))
+                            : static_cast<int64>(floorf(y * height_scale)),
+            in_height - 1);
+        for (int x = 0; x < out_width; ++x) {
+          const int64 in_x = std::min(
+              (align_corners) ? static_cast<int64>(roundf(x * width_scale))
+                              : static_cast<int64>(floorf(x * width_scale)),
+              in_width - 1);
+          std::copy_n(&input(b, in_y, in_x, 0), channels, &output(b, y, x, 0));
+        }
+      }
+    }
+    return true;
+  }
+};
+}  // namespace functor
+
 template <typename Device, typename T>
 class ResizeNearestNeighborOpGrad : public OpKernel {
  public:
@@ -105,22 +144,23 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
     OP_REQUIRES(context, sizes(0) > 0 && sizes(1) > 0,
                 errors::InvalidArgument("shape_t's elements must be positive"));
 
-    // Initialize shape to the batch size of the input, then add
-    // the rest of the dimensions
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0,
-                                TensorShape({input.dim_size(0), sizes(0),
-                                             sizes(1), input.dim_size(3)}),
-                                &output));
-
     const int64 batch_size = input.dim_size(0);
     const int64 in_height = input.dim_size(1);
     const int64 in_width = input.dim_size(2);
     const int64 channels = input.dim_size(3);
 
-    const int64 out_height = output->dim_size(1);
-    const int64 out_width = output->dim_size(2);
+    const int64 out_height = sizes(0);
+    const int64 out_width = sizes(1);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(
+        context,
+        context->allocate_output(
+            0, TensorShape({batch_size, out_height, out_width, channels}),
+            &output));
+
+    // Return if the output is empty.
+    if (output->NumElements() == 0) return;
 
     typename TTypes<T, 4>::ConstTensor input_data = input.tensor<T, 4>();
     typename TTypes<T, 4>::Tensor output_data = output->tensor<T, 4>();
@@ -129,28 +169,67 @@ class ResizeNearestNeighborOpGrad : public OpKernel {
         CalculateResizeScale(out_height, in_height, align_corners_);
     const float width_scale =
         CalculateResizeScale(out_width, in_width, align_corners_);
-    output_data.setZero();
 
-    for (int c = 0; c < channels; ++c) {
-      for (int y = 0; y < in_height; ++y) {
-        const int64 out_y = std::min(
-            static_cast<int64>(floorf(y * height_scale)), (out_height - 1));
+    bool status;
+    if (align_corners_) {
+      status = functor::ResizeNearestNeighborGrad<Device, T,
+                                                  /*align_corners=*/true>()(
+          context->eigen_device<Device>(), input_data, height_scale,
+          width_scale, output_data);
+    } else {
+      status = functor::ResizeNearestNeighborGrad<Device, T,
+                                                  /*align_corners=*/false>()(
+          context->eigen_device<Device>(), input_data, height_scale,
+          width_scale, output_data);
+    }
+    if (!status) {
+      context->SetStatus(
+          errors::Internal("Failed launching ResizeNearestNeighborGrad"));
+    }
+  }
 
-        for (int x = 0; x < in_width; ++x) {
-          const int64 out_x = std::min(
-              static_cast<int64>(floorf(x * width_scale)), (out_width - 1));
+ private:
+  bool align_corners_;
+};
 
-          for (int b = 0; b < batch_size; ++b) {
-            output_data(b, out_y, out_x, c) += input_data(b, y, x, c);
+// Partial specialization of ResizeNearestNeighborGrad functor for a CPUDevice.
+namespace functor {
+template <typename T, bool align_corners>
+struct ResizeNearestNeighborGrad<CPUDevice, T, align_corners> {
+  bool operator()(const CPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
+    const int batch_size = input.dimension(0);
+    const int64 in_height = input.dimension(1);
+    const int64 in_width = input.dimension(2);
+    const int channels = input.dimension(3);
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    output.setZero();
+
+    for (int y = 0; y < in_height; ++y) {
+      const int64 out_y = std::min(
+          (align_corners) ? static_cast<int64>(roundf(y * height_scale))
+                          : static_cast<int64>(floorf(y * height_scale)),
+          out_height - 1);
+      for (int x = 0; x < in_width; ++x) {
+        const int64 out_x = std::min(
+            (align_corners) ? static_cast<int64>(roundf(x * width_scale))
+                            : static_cast<int64>(floorf(x * width_scale)),
+            out_width - 1);
+        for (int b = 0; b < batch_size; ++b) {
+          for (int c = 0; c < channels; ++c) {
+            output(b, out_y, out_x, c) += input(b, y, x, c);
           }
         }
       }
     }
+    return true;
   }
-
- private:
-  bool align_corners_;
 };
+}  // namespace functor
 
 #define REGISTER_KERNEL(T)                                        \
   REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor")           \
@@ -170,121 +249,22 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
 
 #if GOOGLE_CUDA
 
-template <typename T>
-class ResizeNearestNeighborGPUOp : public OpKernel {
- public:
-  explicit ResizeNearestNeighborGPUOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input = context->input(0);
-    ImageResizerState st(align_corners_);
-    st.ValidateAndCreateOutput(context, input);
-    if (!context->status().ok()) return;
-
-    bool status = ResizeNearestNeighbor<T>(
-        input.flat<T>().data(), st.batch_size, st.in_height, st.in_width,
-        st.channels, st.out_height, st.out_width, st.height_scale,
-        st.width_scale, st.output->flat<T>().data(),
-        context->eigen_gpu_device());
-
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launching ResizeNearestNeighbor"));
-    }
-  }
-
- private:
-  bool align_corners_;
-};
-
-#define REGISTER_KERNEL(T)                              \
-  REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor") \
-                              .Device(DEVICE_GPU)       \
-                              .TypeConstraint<T>("T")   \
-                              .HostMemory("size"),      \
-                          ResizeNearestNeighborGPUOp<T>);
+#define REGISTER_KERNEL(T)                                        \
+  REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighbor")           \
+                              .Device(DEVICE_GPU)                 \
+                              .TypeConstraint<T>("T")             \
+                              .HostMemory("size"),                \
+                          ResizeNearestNeighborOp<GPUDevice, T>); \
+  REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighborGrad")       \
+                              .Device(DEVICE_GPU)                 \
+                              .TypeConstraint<T>("T")             \
+                              .HostMemory("size"),                \
+                          ResizeNearestNeighborOpGrad<GPUDevice, T>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNEL);
 
 #undef REGISTER_KERNEL
 
-template <typename T>
-class ResizeNearestNeighborGPUOpGrad : public OpKernel {
- public:
-  explicit ResizeNearestNeighborGPUOpGrad(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("align_corners", &align_corners_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // Grab and validate the input:
-    const Tensor& input = context->input(0);
-    OP_REQUIRES(context, input.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        input.shape().DebugString()));
-
-    // Grab and validate the output shape:
-    const Tensor& shape_t = context->input(1);
-    OP_REQUIRES(context, shape_t.dims() == 1,
-                errors::InvalidArgument("shape_t must be 1-dimensional",
-                                        shape_t.shape().DebugString()));
-    OP_REQUIRES(context, shape_t.NumElements() == 2,
-                errors::InvalidArgument("shape_t must have two elements",
-                                        shape_t.shape().DebugString()));
-
-    auto sizes = shape_t.vec<int32>();
-    OP_REQUIRES(context, sizes(0) > 0 && sizes(1) > 0,
-                errors::InvalidArgument("shape_t's elements must be positive"));
-
-    // Initialize shape to the batch size of the input, then add
-    // the rest of the dimensions
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0,
-                                TensorShape({input.dim_size(0), sizes(0),
-                                             sizes(1), input.dim_size(3)}),
-                                &output));
-
-    const int64 batch_size = input.dim_size(0);
-    const int64 in_height = input.dim_size(1);
-    const int64 in_width = input.dim_size(2);
-    const int64 channels = input.dim_size(3);
-
-    const int64 out_height = output->dim_size(1);
-    const int64 out_width = output->dim_size(2);
-
-    const float height_scale =
-        CalculateResizeScale(out_height, in_height, align_corners_);
-    const float width_scale =
-        CalculateResizeScale(out_width, in_width, align_corners_);
-
-    bool status = ResizeNearestNeighborBackward(
-        input.flat<T>().data(), batch_size, in_height, in_width, channels,
-        out_height, out_width, height_scale, width_scale,
-        output->flat<T>().data(), context->eigen_gpu_device());
-
-    if (!status) {
-      context->SetStatus(
-          errors::Internal("Failed launching ResizeNearestNeighborGrad"));
-    }
-  }
-  bool align_corners_;
-};
-
-#define REGISTER_KERNEL(T)                                  \
-  REGISTER_KERNEL_BUILDER(Name("ResizeNearestNeighborGrad") \
-                              .Device(DEVICE_GPU)           \
-                              .TypeConstraint<T>("T")       \
-                              .HostMemory("size"),          \
-                          ResizeNearestNeighborGPUOpGrad<T>);
-
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_KERNEL);
-
-#undef REGISTER_KERNEL
-
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.h b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9db331ffdcd6c1a1b11c3ab6271d0a949dec6630
--- /dev/null
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.h
@@ -0,0 +1,43 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_H_
+#define TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_H_
+
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T, bool align_corners>
+struct ResizeNearestNeighbor {
+  bool operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output);
+};
+
+template <typename Device, typename T, bool align_corners>
+struct ResizeNearestNeighborGrad {
+  bool operator()(const Device& d,
+                  typename TTypes<T, 4>::ConstTensor input_grad,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output_grad);
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_H_
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
index 1a3a64f482bac4d5ecb0aff3ad8d0b05fb8ab21c..d65c8fb949abe7227cbae9de36baeca4571b4ff4 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.cu.cc
@@ -19,21 +19,25 @@ limitations under the License.
 
 #include <stdio.h>
 
-#include "tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h"
+#include "tensorflow/core/kernels/resize_nearest_neighbor_op.h"
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
 namespace {
 
-template <typename T>
-__global__ void ResizeNearestNeighborNHWC(const int nthreads, const T* bottom_data,
-                                          const int in_height, const int in_width,
-                                          const int channels, const int out_height,
-                                          const int out_width, const float height_scale,
-                                          const float width_scale, T* top_data) {
+template <typename T, bool align_corners>
+__global__ void ResizeNearestNeighborNHWC(
+    const int nthreads, const T* bottom_data, const int in_height,
+    const int in_width, const int channels, const int out_height,
+    const int out_width, const float height_scale, const float width_scale,
+    T* top_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
     int c = n % channels;
@@ -44,20 +48,25 @@ __global__ void ResizeNearestNeighborNHWC(const int nthreads, const T* bottom_da
     n /= out_height;
 
     const T* bottom_data_n = bottom_data + n * channels * in_height * in_width;
-    const int in_x = min(static_cast<int>(floorf(out_x * width_scale)), in_width - 1);
-    const int in_y = min(static_cast<int>(floorf(out_y * height_scale)), in_height - 1);
+    const int in_y =
+        min((align_corners) ? static_cast<int>(roundf(out_y * height_scale))
+                            : static_cast<int>(floorf(out_y * height_scale)),
+            in_height - 1);
+    const int in_x =
+        min((align_corners) ? static_cast<int>(roundf(out_x * width_scale))
+                            : static_cast<int>(floorf(out_x * width_scale)),
+            in_width - 1);
     const int idx = (in_y * in_width + in_x) * channels + c;
     top_data[index] = ldg(bottom_data_n + idx);
   }
 }
 
-template <typename T>
+template <typename T, bool align_corners>
 __global__ void ResizeNearestNeighborBackwardNHWC(
-                                   const int nthreads, const T* top_diff,
-                                   const int in_height, const int in_width,
-                                   const int channels, const int out_height,
-                                   const int out_width, const float height_scale,
-                                   const float width_scale, T* bottom_diff) {
+    const int nthreads, const T* top_diff, const int in_height,
+    const int in_width, const int channels, const int out_height,
+    const int out_width, const float height_scale, const float width_scale,
+    T* bottom_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
     int c = n % channels;
@@ -68,8 +77,14 @@ __global__ void ResizeNearestNeighborBackwardNHWC(
     n /= in_height;
 
     T* bottom_diff_n = bottom_diff + n * channels * out_height * out_width;
-    const int out_x = min(static_cast<int>(floorf(in_x * width_scale)), out_width - 1);
-    const int out_y = min(static_cast<int>(floorf(in_y * height_scale)), out_height - 1);
+    const int out_y =
+        min((align_corners) ? static_cast<int>(roundf(in_y * height_scale))
+                            : static_cast<int>(floorf(in_y * height_scale)),
+            out_height - 1);
+    const int out_x =
+        min((align_corners) ? static_cast<int>(roundf(in_x * width_scale))
+                            : static_cast<int>(floorf(in_x * width_scale)),
+            out_width - 1);
     const int idx = (out_y * out_width + out_x) * channels + c;
     CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index));
   }
@@ -77,69 +92,86 @@ __global__ void ResizeNearestNeighborBackwardNHWC(
 
 }  // namespace
 
-template <typename T>
-bool ResizeNearestNeighbor(const T* bottom_data, const int batch,
-                           const int in_height, const int in_width,
-                           const int channels, const int out_height,
-                           const int out_width,  const float height_scale,
-                           const float width_scale, T* top_data,
-                           const Eigen::GpuDevice& d) {
-  const int output_size = batch * channels * out_height * out_width;
-  CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
-
-  ResizeNearestNeighborNHWC<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-      output_size, bottom_data, in_height, in_width, channels, out_height,
-      out_width, height_scale, width_scale, top_data);
-  return d.ok();
-}
+namespace functor {
+
+// Partial specialization of ResizeNearestNeighbor functor for a GPUDevice.
+template <typename T, bool align_corners>
+struct ResizeNearestNeighbor<GPUDevice, T, align_corners> {
+  bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
+    const int batch_size = input.dimension(0);
+    const int64 in_height = input.dimension(1);
+    const int64 in_width = input.dimension(2);
+    const int channels = input.dimension(3);
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    const int output_size = batch_size * out_height * out_width * channels;
+    if (output_size == 0) return true;
+
+    CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
+    ResizeNearestNeighborNHWC<T, align_corners>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            output_size, input.data(), in_height, in_width, channels,
+            out_height, out_width, height_scale, width_scale, output.data());
+    return d.ok();
+  }
+};
 
-#define DECLARE_GPU_SPEC(T)                                                        \
-  template bool ResizeNearestNeighbor(const T* bottom_data, const int batch,       \
-                               const int in_height, const int in_width,            \
-                               const int channels, const int out_height,           \
-                               const int out_width,  const float height_scale,     \
-                               const float width_scale, T* top_data,               \
-                               const Eigen::GpuDevice& d);
+#define DECLARE_GPU_SPEC(T)                                   \
+  template struct ResizeNearestNeighbor<GPUDevice, T, false>; \
+  template struct ResizeNearestNeighbor<GPUDevice, T, true>;
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 
 #undef DECLARE_GPU_SPEC
 
-template <typename T>
-bool ResizeNearestNeighborBackward(const T* top_diff, const int batch,
-                                   const int in_height, const int in_width,
-                                   const int channels, const int out_height,
-                                   const int out_width,
-                                   const float height_scale,
-                                   const float width_scale, T* bottom_diff,
-                                   const Eigen::GpuDevice& d) {
-  const int output_size = batch * channels * out_height * out_width;
-  CudaLaunchConfig output_config = GetCudaLaunchConfig(output_size, d);
-  SetZero<<<output_config.block_count,
-            output_config.thread_per_block, 0, d.stream()>>>(output_size, bottom_diff);
-
-  const int input_size = batch * channels * in_height * in_width;
-  CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d);
-  ResizeNearestNeighborBackwardNHWC<T><<<
-      input_config.block_count, input_config.thread_per_block, 0, d.stream()>>>(
-      input_config.virtual_thread_count, top_diff, in_height, in_width,
-      channels, out_height, out_width, height_scale, width_scale, bottom_diff);
-  return d.ok();
-}
+// Partial specialization of ResizeNearestNeighborGrad functor for a GPUDevice.
+template <typename T, bool align_corners>
+struct ResizeNearestNeighborGrad<GPUDevice, T, align_corners> {
+  bool operator()(const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,
+                  const float height_scale, const float width_scale,
+                  typename TTypes<T, 4>::Tensor output) {
+    const int batch_size = input.dimension(0);
+    const int64 in_height = input.dimension(1);
+    const int64 in_width = input.dimension(2);
+    const int channels = input.dimension(3);
+
+    const int64 out_height = output.dimension(1);
+    const int64 out_width = output.dimension(2);
+
+    const int output_size = batch_size * channels * out_height * out_width;
+
+    CudaLaunchConfig output_config = GetCudaLaunchConfig(output_size, d);
+    SetZero<<<output_config.block_count, output_config.thread_per_block, 0,
+              d.stream()>>>(output_size, output.data());
+    if (!d.ok()) return false;
+
+    const int input_size = batch_size * channels * in_height * in_width;
+    if (input_size == 0) return true;
+
+    CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d);
+    ResizeNearestNeighborBackwardNHWC<T, align_corners>
+        <<<input_config.block_count, input_config.thread_per_block, 0,
+           d.stream()>>>(input_config.virtual_thread_count, input.data(),
+                         in_height, in_width, channels, out_height, out_width,
+                         height_scale, width_scale, output.data());
+    return d.ok();
+  }
+};
 
-#define DECLARE_GPU_SPEC(T)                                                           \
-  template bool ResizeNearestNeighborBackward(const T* top_diff, const int batch,     \
-                               const int in_height, const int in_width,               \
-                               const int channels, const int out_height,              \
-                               const int out_width, const float height_scale,         \
-                               const float width_scale, T* bottom_diff,               \
-                               const Eigen::GpuDevice& d);
+#define DECLARE_GPU_SPEC(T)                                       \
+  template struct ResizeNearestNeighborGrad<GPUDevice, T, false>; \
+  template struct ResizeNearestNeighborGrad<GPUDevice, T, true>;
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPEC);
+TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 
 #undef DECLARE_GPU_SPEC
 
-}  // end namespace tensorflow
+}  // namespace functor
+
+}  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h b/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h
deleted file mode 100644
index 0a8fd6e1665833837e1de0ce4245cc767b8c74c6..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_gpu.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if !GOOGLE_CUDA
-#error This file must only be included when building with Cuda support
-#endif
-
-#ifndef TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_GPU_H_
-#define TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_GPU_H_
-
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-
-template <typename T>
-bool ResizeNearestNeighbor(const T* bottom_data, const int batch, const int in_height,
-                           const int in_width, const int channels, const int out_height,
-                           const int out_width, const float height_scale, const float width_scale,
-                           T* top_data, const Eigen::GpuDevice& d);
-
-template <typename T>
-bool ResizeNearestNeighborBackward(const T* top_diff, const int batch, const int in_height,
-                                   const int in_width, const int channels, const int out_height,
-                                   const int out_width, const float height_scale, const float width_scale,
-                                   T* bottom_diff, const Eigen::GpuDevice& d);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_RESIZE_NEAREST_NEIGHBOR_OP_GPU_H_
diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
index 34ebff6c680af99a4070c6f11a92421f5cd69e6d..ecf54c697735cc4c31da081b5a216dcb3fc8d7bf 100644
--- a/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
+++ b/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc
@@ -124,9 +124,9 @@ TEST_F(ResizeNearestNeighborOpAlignCornersTest,
 
   // clang-format off
   test::FillValues<float>(&expected,
-    {1, 1, 2,
-     1, 1, 2,
-     3, 3, 4});
+    {1, 2, 2,
+     3, 4, 4,
+     3, 4, 4});
 
   // clang-format on
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
@@ -235,9 +235,9 @@ TEST_F(ResizeNearestNeighborOpAlignCornersTest,
 
   // clang-format off
   test::FillValues<float>(&expected,
-    { 1,  2,  4,
-      5,  6,  8,
-     13, 14, 16});
+    { 1,  3,  4,
+      9, 11, 12,
+     13, 15, 16});
 
   // clang-format on
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 24b3ba31b8aaa49d93fa7b8782a3bfd6a63331f7..6f7a0a4df511ede609a0291b1284c55c8bdd84f8 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -266,6 +266,7 @@ class ReverseV2Op : public OpKernel {
                               .HostMemory("axis"),           \
                           ReverseV2Op<CPUDevice, T>)
 TF_CALL_POD_TYPES(REGISTER_KERNELS);
+TF_CALL_string(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
index 19e25b887d7f9c63570f0086d328462c24f57480..c6193f378d21c513be94dc16e0e6b53ce3ac8483 100644
--- a/tensorflow/core/kernels/reverse_op_test.cc
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -120,7 +120,7 @@ static SessionOptions GetOptions(int intra_threads) {
 
 // Creates a Graph which "reduce"s a 3D float tensor of "num" elements
 // into a scalar.
-static Graph* Reverse(TensorShape shape, int reverse_axis) {
+static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor data(DT_FLOAT, shape);
   data.flat<float>().setRandom();
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 35c5d1d4f021984762c664d96c82a864555336ca..80d490174064a366212ffe5a48681a2c48f5f42e 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include <utility>
 #include <vector>
 #include "tensorflow/core/kernels/save_restore_tensor.h"
 
@@ -79,7 +80,7 @@ void SaveTensors(
   VLOG(1) << "About to save tensors to file " << filename_t.flat<string>()(0)
           << "...";
   checkpoint::TensorSliceWriter writer(filename_t.flat<string>()(0),
-                                       builder_func);
+                                       std::move(builder_func));
 
   Status s;
   auto tensor_names_flat = tensor_names_t.flat<string>();
@@ -268,7 +269,8 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
                                          &parsed_slice, &parsed_slice_shape));
       if (!restored_full_shape.IsSameSize(parsed_full_shape)) {
         return errors::InvalidArgument(
-            "Shape in shape_and_slice spec ", parsed_full_shape.DebugString(),
+            "tensor_name = ", tensor_name, "; shape in shape_and_slice spec ",
+            parsed_full_shape.DebugString(),
             " does not match the shape stored in checkpoint: ",
             restored_full_shape.DebugString());
       }
@@ -279,10 +281,10 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
           reader.LookupSlice(tensor_name, parsed_slice, restored_tensor));
     }
     if (dtypes[i] != restored_tensor->dtype()) {
-      return errors::InvalidArgument("Expected dtype ",
-                                     DataTypeString(dtypes[i]),
-                                     " does not equal restored dtype ",
-                                     DataTypeString(restored_tensor->dtype()));
+      return errors::InvalidArgument(
+          "tensor_name = ", tensor_name, "; expected dtype ",
+          DataTypeString(dtypes[i]), " does not equal restored dtype ",
+          DataTypeString(restored_tensor->dtype()));
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/kernels/save_restore_v2_ops.cc b/tensorflow/core/kernels/save_restore_v2_ops.cc
index 2e09956578309c2a465c3316218a1cf5ebf6ef46..c665bc5b03ca741abfa868a4a089d19e97f47536 100644
--- a/tensorflow/core/kernels/save_restore_v2_ops.cc
+++ b/tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -47,8 +47,9 @@ void ValidateInputs(bool is_save_op, OpKernelContext* context,
       context, prefix.NumElements() == 1,
       errors::InvalidArgument("Input prefix should have a single element, got ",
                               prefix.NumElements(), " instead."));
-  OP_REQUIRES(context, TensorShapeUtils::IsVector(tensor_names.shape()) &&
-                           TensorShapeUtils::IsVector(shape_and_slices.shape()),
+  OP_REQUIRES(context,
+              TensorShapeUtils::IsVector(tensor_names.shape()) &&
+                  TensorShapeUtils::IsVector(shape_and_slices.shape()),
               errors::InvalidArgument(
                   "Input tensor_names and shape_and_slices "
                   "should be an 1-D tensors, got ",
@@ -105,6 +106,7 @@ class SaveV2 : public OpKernel {
     const auto& shape_and_slices_flat = shape_and_slices.flat<string>();
 
     BundleWriter writer(Env::Default(), prefix_string);
+    OP_REQUIRES_OK(context, writer.status());
     VLOG(1) << "BundleWriter, prefix_string: " << prefix_string;
 
     for (int i = 0; i < num_tensors; ++i) {
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index fffe8cd18e1b68e0e62a924d2500ba93542cb9c0..9cdbe89457cbe25a65aff6aa655776b43cbd8b4a 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -407,6 +407,8 @@ REGISTER_COMPLEX_CPU_UNSORTED_KERNELS_ALL(complex128);
   REGISTER_GPU_UNSORTED_KERNELS(type, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex64(REGISTER_GPU_UNSORTED_KERNELS_ALL);
+TF_CALL_complex128(REGISTER_GPU_UNSORTED_KERNELS_ALL);
 #undef REGISTER_GPU_UNSORTED_KERNELS
 #undef REGISTER_GPU_UNSORTED_KERNELS_ALL
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
index 5f53f098aa29be322c5c172604d9b827c162833a..b132b1e8f8b004ff4ad5c675488f33dcb74a6948 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.cc
@@ -19,8 +19,6 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/segment_reduction_ops.h"
 
-#include <stdio.h>
-
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
@@ -28,6 +26,34 @@ namespace tensorflow {
 
 using GPUDevice = Eigen::GpuDevice;
 
+// Helper for UnusortedSegmentSumCustomKernel that adds value into dest
+// atomically.
+template <typename T>
+static __device__ __forceinline__ void AccumulateInto(T* dest, const T& value) {
+  CudaAtomicAdd(dest, value);
+}
+
+// Specializations of AccumulateInto for complex types, which CudaAtomicAdd does
+// not support. We treat a std::complex<T>* as a T* (the C++ standard section
+// 26.4.4 allows this explicitly) and atomic add the real and imaginary
+// components individually. The operation as a whole is not atomic, but we can
+// safely treat the components independently for the purpose of accumulating.
+template <>
+__device__ __forceinline__ void AccumulateInto(
+    std::complex<float>* dest, const std::complex<float>& value) {
+  auto dest_scalar = reinterpret_cast<float*>(dest);
+  CudaAtomicAdd(dest_scalar, value.real());
+  CudaAtomicAdd(dest_scalar + 1, value.imag());
+}
+
+template <>
+__device__ __forceinline__ void AccumulateInto(
+    std::complex<double>* dest, const std::complex<double>& value) {
+  auto dest_scalar = reinterpret_cast<double*>(dest);
+  CudaAtomicAdd(dest_scalar, value.real());
+  CudaAtomicAdd(dest_scalar + 1, value.imag());
+}
+
 // UnsortedSegmentSumFunctor kernel processes 'input_total_size' elements.
 // Each element is mapped from input to output by a combination of its
 // 'segment_ids' mapping and 'inner_dim_size'.
@@ -48,7 +74,7 @@ __global__ void UnsortedSegmentSumCustomKernel(
     }
     const Index output_index =
         output_segment_index * inner_dim_size + segment_offset;
-    CudaAtomicAdd(output + output_index, ldg(input + input_index));
+    AccumulateInto<T>(output + output_index, ldg(input + input_index));
   }
 }
 
@@ -99,6 +125,8 @@ struct UnsortedSegmentSumFunctor<GPUDevice, T, Index>: UnsortedSegmentBaseFuncto
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+TF_CALL_complex64(DEFINE_GPU_SPECS);
+TF_CALL_complex128(DEFINE_GPU_SPECS);
 
 #undef DEFINE_GPU_SPECS
 #undef DEFINE_GPU_SPECS_INDEX
diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc
index 0a281835a4b9b76b91430efbf6303eaaec539897..bdf3c12ff92e4e60ec81fe5e6a2420f88559d952 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_test.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@@ -40,8 +40,9 @@ limitations under the License.
 namespace tensorflow {
 
 template <typename Index>
-static void BM_SegmentReduction(int iters, string reduction, Index num_rows,
-                                Index num_cols, Index segment_size) {
+static void BM_SegmentReduction(int iters, const string& reduction,
+                                Index num_rows, Index num_cols,
+                                Index segment_size) {
   testing::StopTiming();
   std::unique_ptr<Device> device(
       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
diff --git a/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc b/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc
index c647d3aaac6bc486a850f3883a4533ba96160a1c..7a1db4e558eb761fcafc60b65920a096025c9039 100644
--- a/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc
+++ b/tensorflow/core/kernels/self_adjoint_eig_v2_op.cc
@@ -69,7 +69,7 @@ class SelfAdjointEigV2Op : public LinearAlgebraOp<Scalar> {
         errors::InvalidArgument("Self Adjoint Eigen decomposition was not "
                                 "successful. The input might not be valid."));
 
-    outputs->at(0) = eig.eigenvalues();
+    outputs->at(0) = eig.eigenvalues().template cast<Scalar>();
     if (compute_v_) {
       outputs->at(1) = eig.eigenvectors();
     }
@@ -81,7 +81,15 @@ class SelfAdjointEigV2Op : public LinearAlgebraOp<Scalar> {
 
 REGISTER_LINALG_OP("SelfAdjointEigV2", (SelfAdjointEigV2Op<float>), float);
 REGISTER_LINALG_OP("SelfAdjointEigV2", (SelfAdjointEigV2Op<double>), double);
+REGISTER_LINALG_OP("SelfAdjointEigV2", (SelfAdjointEigV2Op<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("SelfAdjointEigV2", (SelfAdjointEigV2Op<complex128>),
+                   complex128);
 REGISTER_LINALG_OP("BatchSelfAdjointEigV2", (SelfAdjointEigV2Op<float>), float);
 REGISTER_LINALG_OP("BatchSelfAdjointEigV2", (SelfAdjointEigV2Op<double>),
                    double);
+REGISTER_LINALG_OP("BatchSelfAdjointEigV2", (SelfAdjointEigV2Op<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("BatchSelfAdjointEigV2", (SelfAdjointEigV2Op<complex128>),
+                   complex128);
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/session_ops.cc b/tensorflow/core/kernels/session_ops.cc
index 54eca4a20a09428dc3ba2a2293029a5f3198a8bb..27ad2fcd87eae35a67d43b083585eabb5beb4859 100644
--- a/tensorflow/core/kernels/session_ops.cc
+++ b/tensorflow/core/kernels/session_ops.cc
@@ -41,7 +41,7 @@ class GetSessionHandleOp : public OpKernel {
       : OpKernel(context) {}
 
   void Compute(OpKernelContext* ctx) override {
-    Tensor val = ctx->input(0);
+    const Tensor& val = ctx->input(0);
     int64 id = ctx->session_state()->GetNewId();
     TensorStore::TensorAndKey tk{val, id, def().device()};
     OP_REQUIRES_OK(ctx, ctx->tensor_store()->AddTensor(def().name(), tk));
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index 177a32464ba5d55f5a6536f11f8403a9a7c13fea..d78c6d26394bf2c6ac922b4bb58fde8340705333 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -82,6 +82,7 @@ REGISTER_KERNEL_BUILDER(Name("Shape")
                           ShapeOp<int64>);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 // A special GPU kernel for int32.
@@ -131,6 +132,7 @@ REGISTER_KERNEL_BUILDER(Name("ShapeN")
                           ShapeNOp<int64>)
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 // A special GPU kernel for int32.
@@ -277,6 +279,7 @@ REGISTER_KERNEL_BUILDER(Name("Size")
                               .HostMemory("output"),             \
                           SizeOp<int64>);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 // A special GPU kernel for int32.
@@ -351,6 +354,7 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                               .HostMemory("dim"),            \
                           ExpandDimsOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 REGISTER_KERNEL_BUILDER(Name("ExpandDims")
@@ -395,6 +399,7 @@ REGISTER_KERNEL_BUILDER(Name("Squeeze").Device(DEVICE_CPU), SqueezeOp);
       Name("Squeeze").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       SqueezeOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
+TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/shuffle_dataset_op.cc b/tensorflow/core/kernels/shuffle_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a48357fa9392ee9faabc66b002dc4de81c2d5c74
--- /dev/null
+++ b/tensorflow/core/kernels/shuffle_dataset_op.cc
@@ -0,0 +1,167 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ShuffleDatasetOp : public OpKernel {
+ public:
+  explicit ShuffleDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new ShuffleDatasetOp::Dataset, insert it in the step-local
+    // container, and return it as the output.
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    const Tensor* buffer_size_t;
+    OP_REQUIRES_OK(ctx, ctx->input("buffer_size", &buffer_size_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(buffer_size_t->shape()),
+                errors::InvalidArgument("buffer_size must be a scalar"));
+    const int64 buffer_size = buffer_size_t->flat<int64>()(0);
+    OP_REQUIRES(
+        ctx, buffer_size > 0,
+        errors::InvalidArgument("buffer_size must be greater than zero."));
+
+    const Tensor* seed_t;
+    OP_REQUIRES_OK(ctx, ctx->input("seed", &seed_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(seed_t->shape()),
+                errors::InvalidArgument("seed must be a scalar"));
+    const int64 seed = seed_t->flat<int64>()(0);
+
+    const Tensor* seed2_t;
+    OP_REQUIRES_OK(ctx, ctx->input("seed2", &seed2_t));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(seed2_t->shape()),
+                errors::InvalidArgument("seed2 must be a scalar"));
+    const int64 seed2 = seed2_t->flat<int64>()(0);
+
+    DatasetBase* dataset = new Dataset(input, buffer_size, seed, seed2);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(const DatasetBase* input, int64 buffer_size, int64 seed,
+            int64 seed2)
+        : input_(input), buffer_size_(buffer_size), seed_(seed), seed2_(seed2) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override {
+      return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
+                             ", ", seed2_, ")::Dataset");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            input_impl_(dataset->input_->MakeIterator()),
+            generator_(&parent_generator_) {
+        buffer_.reserve(dataset->buffer_size_);
+        int64 seed = dataset->seed_;
+        int64 seed2 = dataset->seed2_;
+        if (seed == 0 && seed2 == 0) {
+          // If both seeds are unspecified, use completely random seeds.
+          seed = random::New64();
+          seed2 = random::New64();
+        }
+        parent_generator_ = random::PhiloxRandom(seed, seed2);
+      }
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        while (!end_of_input_sequence_ &&
+               buffer_.size() < dataset()->buffer_size_) {
+          std::vector<Tensor> input_element;
+          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &input_element,
+                                                  &end_of_input_sequence_));
+          if (!end_of_input_sequence_) {
+            buffer_.emplace_back(std::move(input_element));
+          }
+        }
+
+        if (buffer_.size() > 0) {
+          *end_of_sequence = false;
+          // Choose an element to produce uniformly at random, and
+          // swap the last element into its place in the buffer.
+          int64 index = generator_() % buffer_.size();
+          *out_tensors = std::move(buffer_[index]);
+          std::swap(buffer_[index], buffer_.back());
+          buffer_.pop_back();
+        } else {
+          DCHECK(end_of_input_sequence_);
+          *end_of_sequence = true;
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::vector<std::vector<Tensor>> buffer_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      bool end_of_input_sequence_ GUARDED_BY(mu_) = false;
+      random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+      random::SingleSampleAdapter<random::PhiloxRandom> generator_
+          GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const int64 buffer_size_;
+    const int64 seed_;
+    const int64 seed2_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("ShuffleDataset").Device(DEVICE_CPU),
+                        ShuffleDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/skip_dataset_op.cc b/tensorflow/core/kernels/skip_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1cff90a05e859f0070ebc0a82d218b2733c02932
--- /dev/null
+++ b/tensorflow/core/kernels/skip_dataset_op.cc
@@ -0,0 +1,143 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class SkipDatasetOp : public OpKernel {
+ public:
+  explicit SkipDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new RepeatDatasetOp::Dataset, insert it in the step-local
+    // container, and return it as the output.
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    const Tensor* count_t;
+    OP_REQUIRES_OK(ctx, ctx->input("count", &count_t));
+    const int64 count = count_t->flat<int64>()(0);
+
+    DatasetBase* dataset = new Dataset(count, input);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(int64 count, const DatasetBase* input)
+        : count_(count), input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      if (count_ < 0) {
+        return std::unique_ptr<IteratorBase>(new EmptyIterator(this));
+      }  else if (count_ == 0) {
+        // Pass through.
+        return input_->MakeIterator();
+      } else {
+        return std::unique_ptr<IteratorBase>(new FiniteIterator(this));
+      }
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override { return "SkipDatasetOp::Dataset"; }
+
+   private:
+    class EmptyIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit EmptyIterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset) {}
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+    };
+
+    class FiniteIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit FiniteIterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            i_(0),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
+
+        // Keep calling GetNext().  TODO(vrv): Figure out a way to
+        // skip records without reading, perhaps by adding an
+        // interface to iterator.
+        while (i_ < dataset()->count_) {
+          // Fetch and throw away Tensors.
+          std::vector<Tensor> dummy_out_tensors;
+          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &dummy_out_tensors,
+                                                  end_of_sequence));
+          if (*end_of_sequence) {
+            // We reached the end before the count was reached.
+            input_impl_.reset();
+            return Status::OK();
+          }
+
+          ++i_;
+        }
+
+        // Return GetNext() on the underlying iterator.
+        TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors,
+                                                end_of_sequence));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const int64 count_;
+    const DatasetBase* const input_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("SkipDataset").Device(DEVICE_CPU),
+                        SkipDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/smooth-hinge-loss.h b/tensorflow/core/kernels/smooth-hinge-loss.h
index 45da0fb11726d9306e31cf157b3b5348d217df91..5074ad0795db0970d08dbebc93e17114d3d92a8c 100644
--- a/tensorflow/core/kernels/smooth-hinge-loss.h
+++ b/tensorflow/core/kernels/smooth-hinge-loss.h
@@ -35,7 +35,7 @@ class SmoothHingeLossUpdater : public DualLossUpdater {
                             const double current_dual, const double wx,
                             const double weighted_example_norm) const final {
     // Intutitvely there are 3 cases:
-    // a. new optimal value of the dual variable falls withing the admissible
+    // a. new optimal value of the dual variable falls within the admissible
     // range [0, 1]. In this case we set new dual to this value.
     // b. new optimal value is < 0. Then, because of convexity, the optimal
     // valid value for new dual = 0
diff --git a/tensorflow/core/kernels/softplus_op.cc b/tensorflow/core/kernels/softplus_op.cc
index 5650435781ae109d2ddc463b532f25fb205e53d7..494a83ed14e83f5fb2506774f1cbabfaf226bbed 100644
--- a/tensorflow/core/kernels/softplus_op.cc
+++ b/tensorflow/core/kernels/softplus_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/warn_about_ints.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -33,7 +34,10 @@ typedef Eigen::GpuDevice GPUDevice;
 template <typename Device, typename T>
 class SoftplusOp : public UnaryElementWiseOp<T, SoftplusOp<Device, T>> {
  public:
-  using UnaryElementWiseOp<T, SoftplusOp<Device, T>>::UnaryElementWiseOp;
+  explicit SoftplusOp(OpKernelConstruction* context)
+      : UnaryElementWiseOp<T, SoftplusOp<Device, T>>(context) {
+    WarnAboutInts(context);
+  }
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     functor::Softplus<Device, T> functor;
@@ -46,7 +50,10 @@ template <typename Device, typename T>
 class SoftplusGradOp
     : public BinaryElementWiseOp<T, SoftplusGradOp<Device, T>> {
  public:
-  using BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>::BinaryElementWiseOp;
+  explicit SoftplusGradOp(OpKernelConstruction* context)
+      : BinaryElementWiseOp<T, SoftplusGradOp<Device, T>>(context) {
+    WarnAboutInts(context);
+  }
 
   void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                          const Tensor& a, Tensor* output);
diff --git a/tensorflow/core/kernels/softsign_op.cc b/tensorflow/core/kernels/softsign_op.cc
index 33b9628b32188fc43b9f38fa50668aa5f57cd7bc..00ee649b17552da97229926392a4ed4223378711 100644
--- a/tensorflow/core/kernels/softsign_op.cc
+++ b/tensorflow/core/kernels/softsign_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/warn_about_ints.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -33,7 +34,10 @@ typedef Eigen::GpuDevice GPUDevice;
 template <typename Device, typename T>
 class SoftsignOp : public UnaryElementWiseOp<T, SoftsignOp<Device, T>> {
  public:
-  using UnaryElementWiseOp<T, SoftsignOp<Device, T>>::UnaryElementWiseOp;
+  explicit SoftsignOp(OpKernelConstruction* context)
+      : UnaryElementWiseOp<T, SoftsignOp<Device, T>>(context) {
+    WarnAboutInts(context);
+  }
 
   void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
     functor::Softsign<Device, T> functor;
@@ -46,7 +50,10 @@ template <typename Device, typename T>
 class SoftsignGradOp
     : public BinaryElementWiseOp<T, SoftsignGradOp<Device, T>> {
  public:
-  using BinaryElementWiseOp<T, SoftsignGradOp<Device, T>>::BinaryElementWiseOp;
+  explicit SoftsignGradOp(OpKernelConstruction* context)
+      : BinaryElementWiseOp<T, SoftsignGradOp<Device, T>>(context) {
+    WarnAboutInts(context);
+  }
 
   void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
                          const Tensor& a, Tensor* output);
diff --git a/tensorflow/core/kernels/spacetobatch_op.cc b/tensorflow/core/kernels/spacetobatch_op.cc
index 3815716ccd9f8022096889b21692cd3d26bbf648..c513683918e9eb53768864e901d3b322b3d18879 100644
--- a/tensorflow/core/kernels/spacetobatch_op.cc
+++ b/tensorflow/core/kernels/spacetobatch_op.cc
@@ -100,6 +100,10 @@ void SpaceToBatchOpCompute(OpKernelContext* context,
   for (int block_dim = 0; block_dim < block_dims; ++block_dim) {
     block_shape_product *= block_shape[block_dim];
   }
+  OP_REQUIRES(
+      context, block_shape_product > 0,
+      errors::InvalidArgument("Product of block sizes must be positive, got ",
+                              block_shape_product));
 
   const int internal_block_dims =
       block_dims - removed_prefix_block_dims - removed_suffix_block_dims;
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b4d5effdad925bf989a51968ee753f4413108cf
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -0,0 +1,572 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Contains OP to generate sparse crosses.
+#include <assert.h>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+namespace {
+// An interface that represents a column with batches.
+template <typename InternalType>
+class ColumnInterface {
+ public:
+  // Returns the number of features in the specified batch.
+  virtual int64 FeatureCount(int64 batch) const = 0;
+
+  // Returns the fingerprint of nth feature from the specified batch.
+  InternalType Feature(int64 batch, int64 n) const {
+    InternalType not_used = InternalType();
+    return DoFeature(batch, n, not_used);
+  }
+
+  virtual InternalType DoFeature(int64 batch, int64 n,
+                                 InternalType not_used) const = 0;
+
+  virtual ~ColumnInterface() {}
+};
+
+// A column that is backed by a sparse tensor.
+template <typename InternalType>
+class SparseTensorColumn : public ColumnInterface<InternalType> {
+ public:
+  SparseTensorColumn(const Tensor& values, std::vector<int64> feature_counts,
+                     std::vector<int64> feature_start_indices)
+      : values_(values),
+        feature_counts_(std::move(feature_counts)),
+        feature_start_indices_(std::move(feature_start_indices)) {
+    CHECK_EQ(feature_counts_.size(), feature_start_indices_.size());
+  }
+
+  int64 FeatureCount(int64 batch) const override {
+    return feature_counts_[batch];
+  }
+
+  // InternalType is int64 only when using HashCrosser.
+  int64 DoFeature(int64 batch, int64 n, int64 not_used) const {
+    const int64 start = feature_start_indices_[batch];
+    if (DT_STRING == values_.dtype())
+      return Fingerprint64(values_.vec<string>().data()[start + n]);
+    return values_.vec<int64>().data()[start + n];
+  }
+
+  // InternalType is string or StringPiece when using StringCrosser.
+  string DoFeature(int64 batch, int64 n, string not_used) const {
+    const int64 start = feature_start_indices_[batch];
+    if (DT_STRING == values_.dtype())
+      return values_.vec<string>().data()[start + n];
+    return std::to_string(values_.vec<int64>().data()[start + n]);
+  }
+
+  StringPiece DoFeature(int64 batch, int64 n, StringPiece not_used) const {
+    const int64 start = feature_start_indices_[batch];
+    return values_.vec<string>().data()[start + n];
+  }
+
+  ~SparseTensorColumn() override {}
+
+ private:
+  const Tensor& values_;
+  std::vector<int64> feature_counts_;
+  std::vector<int64> feature_start_indices_;
+};
+
+// A column that is backed by a dense tensor.
+template <typename InternalType>
+class DenseTensorColumn : public ColumnInterface<InternalType> {
+ public:
+  explicit DenseTensorColumn(const Tensor& tensor) : tensor_(tensor) {}
+
+  int64 FeatureCount(int64 batch) const override { return tensor_.dim_size(1); }
+
+  // InternalType is int64 only when using HashCrosser.
+  int64 DoFeature(int64 batch, int64 n, int64 not_used) const {
+    if (DT_STRING == tensor_.dtype())
+      return Fingerprint64(tensor_.matrix<string>()(batch, n));
+    return tensor_.matrix<int64>()(batch, n);
+  }
+
+  // Internal type is string or StringPiece when using StringCrosser.
+  string DoFeature(int64 batch, int64 n, string not_used) const {
+    if (DT_STRING == tensor_.dtype()) return tensor_.matrix<string>()(batch, n);
+    return std::to_string(tensor_.matrix<int64>()(batch, n));
+  }
+
+  StringPiece DoFeature(int64 batch, int64 n, StringPiece not_used) const {
+    return tensor_.matrix<string>()(batch, n);
+  }
+
+  ~DenseTensorColumn() override {}
+
+ private:
+  const Tensor& tensor_;
+};
+
+// Updates Output tensors with sparse crosses.
+template <typename OutType>
+class OutputUpdater {
+ public:
+  OutputUpdater(const std::vector<int64>& output_start_indices,
+                Tensor* indices_out, Tensor* values_out)
+      : output_start_indices_(output_start_indices),
+        indices_out_(indices_out),
+        values_out_(values_out) {}
+
+  void Update(const int64 batch_index, const int64 cross_count,
+              const OutType& cross) const {
+    const int64 output_index = output_start_indices_[batch_index] + cross_count;
+
+    auto indices_matrix = indices_out_->matrix<int64>();
+    indices_matrix(output_index, 0) = batch_index;
+    indices_matrix(output_index, 1) = cross_count;
+
+    auto value_vec = values_out_->vec<OutType>();
+    value_vec(output_index) = cross;
+  }
+
+ private:
+  const std::vector<int64>& output_start_indices_;
+  Tensor* indices_out_;
+  Tensor* values_out_;
+};
+
+// Generates the sparse crosses as concatenation of strings.
+template <typename InternalType>
+class StringCrosser {
+ public:
+  StringCrosser(const std::vector<
+                    std::unique_ptr<ColumnInterface<InternalType>>>& columns,
+                const int64 num_buckets_unused, const uint64 hash_key_unused)
+      : columns_(columns) {}
+
+  string Generate(const int64 batch_index,
+                  const std::vector<int>& permutation) const {
+    static const auto k_feature_separator = "_X_";
+
+    gtl::InlinedVector<InternalType, 6> cross_vec(columns_.size());
+    for (int i = 0; i < permutation.size(); i++) {
+      cross_vec[i] = columns_[i]->Feature(batch_index, permutation[i]);
+    }
+    // TODO(zakaria): this will copy the string twice, might effect
+    // performance.
+    return str_util::Join(cross_vec, k_feature_separator);
+  }
+
+ private:
+  const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>& columns_;
+};
+
+// Generates the sparse crosses as nested hash to avoid string manipulations.
+class HashCrosser {
+ public:
+  HashCrosser(
+      const std::vector<std::unique_ptr<ColumnInterface<int64>>>& columns,
+      const int64 num_buckets, const uint64 hash_key)
+      : columns_(columns), num_buckets_(num_buckets), hash_key_(hash_key) {}
+
+  int64 Generate(const int64 batch_index,
+                 const std::vector<int>& permutation) const {
+    // Do the fingerprint concatenation on uint64.
+    uint64 hashed_output = hash_key_;
+    for (size_t i = 0; i < permutation.size(); ++i) {
+      uint64 hash_i = columns_[i]->Feature(batch_index, permutation[i]);
+      hashed_output = FingerprintCat64(hashed_output, hash_i);
+    }
+    // The return value is int64 based on the number of buckets.
+    if (num_buckets_ > 0) {
+      return hashed_output % num_buckets_;
+    } else {
+      // To prevent negative output we take modulo to max int64.
+      return hashed_output % std::numeric_limits<int64>::max();
+    }
+  }
+
+ private:
+  const std::vector<std::unique_ptr<ColumnInterface<int64>>>& columns_;
+  const int64 num_buckets_;
+  const uint64 hash_key_;
+};
+
+// ProductIterator generates cartesian products based on indices.
+template <typename InternalType>
+class ProductIterator {
+ public:
+  explicit ProductIterator(
+      const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>&
+          columns,
+      int64 batch_index)
+      : columns_(columns), batch_index_(batch_index) {
+    next_permutation_.resize(columns_.size(), 0);
+    // Sets has_next_ to false if any feature column has 0 features.
+    has_next_ = true;
+    for (int i = 0; i < columns_.size(); i++) {
+      if (columns_[i]->FeatureCount(batch_index_) == 0) {
+        has_next_ = false;
+        break;
+      }
+    }
+  }
+
+  std::vector<int> Next() {
+    std::vector<int> permutation(next_permutation_);
+
+    // Generates next permutation, if available.
+    bool carry = true;
+    for (int i = next_permutation_.size() - 1; i >= 0; i--) {
+      if (carry) {
+        next_permutation_[i] = next_permutation_[i] + 1;
+      }
+      if (next_permutation_[i] == columns_[i]->FeatureCount(batch_index_)) {
+        next_permutation_[i] = 0;
+      } else {
+        carry = false;
+        break;
+      }
+    }
+    has_next_ = !carry;
+    return permutation;
+  }
+
+  bool HasNext() { return has_next_; }
+
+ private:
+  bool has_next_;
+  const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>& columns_;
+  const int64 batch_index_;
+  std::vector<int> next_permutation_;
+};
+
+template <bool HASHED_OUTPUT, typename InternalType>
+struct CrossTraits;
+
+template <typename InternalType>
+struct CrossTraits<false, InternalType> {
+  typedef StringCrosser<InternalType> Crosser;
+  typedef OutputUpdater<string> Updater;
+};
+
+template <>
+struct CrossTraits<true, int64> {
+  typedef HashCrosser Crosser;
+  typedef OutputUpdater<int64> Updater;
+};
+}  // namespace
+
+template <bool HASHED_OUTPUT, typename InternalType>
+class SparseCrossOp : public OpKernel {
+ public:
+  explicit SparseCrossOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
+    // Read signed_hash_key_ as int64 since uint64 attributes are not
+    // supported by REGISTER_OP.
+    int64 signed_hash_key_;
+    OP_REQUIRES_OK(context, context->GetAttr("hash_key", &signed_hash_key_));
+    hash_key_ = static_cast<uint64>(signed_hash_key_);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    OpInputList indices_list_in;
+    OP_REQUIRES_OK(context, context->input_list("indices", &indices_list_in));
+    OpInputList values_list_in;
+    OP_REQUIRES_OK(context, context->input_list("values", &values_list_in));
+    OpInputList shapes_list_in;
+    OP_REQUIRES_OK(context, context->input_list("shapes", &shapes_list_in));
+    OpInputList dense_list_in;
+    OP_REQUIRES_OK(context,
+                   context->input_list("dense_inputs", &dense_list_in));
+
+    ValidateInput(context, indices_list_in, values_list_in, shapes_list_in,
+                  dense_list_in);
+
+    std::vector<std::unique_ptr<ColumnInterface<InternalType>>> columns =
+        GenerateColumnsFromInput(indices_list_in, values_list_in,
+                                 shapes_list_in, dense_list_in);
+
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Crosser
+        crosser(columns, num_buckets_, hash_key_);
+    Tensor* indices_out;
+    Tensor* values_out;
+    Tensor* shape_out;
+    const int64 batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
+    std::vector<int64> output_start_indices(batch_size);
+    CreateOutputTensors(columns, batch_size, context, &indices_out, &values_out,
+                        &shape_out, &output_start_indices);
+
+    typename CrossTraits<HASHED_OUTPUT, InternalType>::Updater
+        updater(output_start_indices, indices_out, values_out);
+    auto do_work = [this, &columns, crosser, updater](int64 begin, int64 end) {
+      for (int b = begin; b < end; b++) {
+        ProductIterator<InternalType> product_iterator(columns, b);
+        int64 cross_count = 0;
+        while (product_iterator.HasNext()) {
+          const auto permutation = product_iterator.Next();
+          updater.Update(b, cross_count, crosser.Generate(b, permutation));
+          cross_count++;
+        }
+      }
+    };
+
+    auto* worker_threads = context->device()->tensorflow_cpu_worker_threads();
+    // TODO(zakaria): optimize kCostPerUnit
+    const int kCostPerUnit = 5000 * indices_list_in.size();
+    Shard(worker_threads->num_threads, worker_threads->workers, batch_size,
+          kCostPerUnit, do_work);
+  }
+
+ private:
+  // Validates input tensors.
+  void ValidateInput(OpKernelContext* context,
+                     const OpInputList& indices_list_in,
+                     const OpInputList& values_list_in,
+                     const OpInputList& shapes_list_in,
+                     const OpInputList& dense_list_in) {
+    const auto size = indices_list_in.size();
+    // Validates indices_list_in OpInputList.
+    for (int i = 0; i < size; i++) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsMatrix(indices_list_in[i].shape()),
+          errors::InvalidArgument(
+              "Input indices should be a matrix but received shape ",
+              indices_list_in[i].shape().DebugString(), " at position ", i));
+      OP_REQUIRES(
+          context, indices_list_in[i].shape().dim_size(1) == 2,
+          errors::InvalidArgument("Expected D2 of index to be 2 got ",
+                                  indices_list_in[i].shape().dim_size(1),
+                                  " at position ", i));
+    }
+
+    // Validates values_list_in OpInputList.
+    OP_REQUIRES(
+        context, values_list_in.size() == size,
+        errors::InvalidArgument("Expected ", size, " input values, got ",
+                                values_list_in.size()));
+    for (int i = 0; i < size; i++) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsVector(values_list_in[i].shape()),
+          errors::InvalidArgument(
+              "Input values should be a std::vector but received shape ",
+              values_list_in[i].shape().DebugString(), " at position ", i));
+      OP_REQUIRES(
+          context, indices_list_in[i].shape().dim_size(0) ==
+                       values_list_in[i].shape().dim_size(0),
+          errors::InvalidArgument(
+              "Expected size of values to be ",
+              indices_list_in[i].shape().dim_size(0), " got ",
+              values_list_in[i].shape().dim_size(0), " at position ", i));
+    }
+
+    // Validates shapes_list_in OpInputList
+    OP_REQUIRES(
+        context, shapes_list_in.size() == size,
+        errors::InvalidArgument("Expected ", size, " input shapes, got ",
+                                shapes_list_in.size()));
+    const auto batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
+    for (int i = 0; i < size; i++) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsVector(shapes_list_in[i].shape()),
+          errors::InvalidArgument(
+              "Input shapes should be a std::vector but received shape ",
+              shapes_list_in[i].shape().DebugString(), " at position ", i));
+
+      OP_REQUIRES(
+          context, shapes_list_in[i].vec<int64>().size() == 2,
+          errors::InvalidArgument("shape should imply a 2D tensor, but got ",
+                                  shapes_list_in[i].shape().DebugString(),
+                                  " at position ", i));
+      OP_REQUIRES(context, shapes_list_in[i].vec<int64>()(0) == batch_size,
+                  errors::InvalidArgument(
+                      "Expected batch size ", batch_size, " got ",
+                      shapes_list_in[i].vec<int64>()(0), " at position ", i));
+    }
+
+    // Validates dense_list_in OpInputList
+    for (int i = 0; i < dense_list_in.size(); ++i) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsMatrix(dense_list_in[i].shape()),
+          errors::InvalidArgument(
+              "Dense inputs should be a matrix but received shape ",
+              indices_list_in[i].shape().DebugString(), " at position ", i));
+      OP_REQUIRES(context, dense_list_in[i].dim_size(0) == batch_size,
+                  errors::InvalidArgument("Expected batch size ", batch_size,
+                                          " got ", dense_list_in[i].dim_size(0),
+                                          " at dense tensor ", i));
+    }
+  }
+
+  // Calculate the batch size from either the shapes input or the dense input.
+  int64 CalculateBatchSize(const OpInputList& shapes_list_in,
+                           const OpInputList& dense_list_in) {
+    if (shapes_list_in.size() > 0) {
+      return shapes_list_in[0].vec<int64>()(0);
+    }
+
+    if (dense_list_in.size() > 0) {
+      return dense_list_in[0].dim_size(0);
+    }
+
+    return 0;
+  }
+
+  // Generate the columns given the sparse and dense inputs.
+  std::vector<std::unique_ptr<ColumnInterface<InternalType>>>
+  GenerateColumnsFromInput(const OpInputList& indices_list_in,
+                           const OpInputList& values_list_in,
+                           const OpInputList& shapes_list_in,
+                           const OpInputList& dense_list_in) {
+    std::vector<std::unique_ptr<ColumnInterface<InternalType>>> columns;
+    const int64 batch_size = CalculateBatchSize(shapes_list_in, dense_list_in);
+    const int64 number_of_columns = shapes_list_in.size();
+
+    std::vector<std::vector<int64>> feature_counts(number_of_columns,
+                                                   std::vector<int64>());
+    std::vector<std::vector<int64>> feature_start_indices(number_of_columns,
+                                                          std::vector<int64>());
+
+    ExtractFeatureData(indices_list_in, batch_size, &feature_counts,
+                       &feature_start_indices);
+
+    for (int i = 0; i < values_list_in.size(); ++i) {
+      columns.emplace_back(new SparseTensorColumn<InternalType>(
+          values_list_in[i], std::move(feature_counts[i]),
+          std::move(feature_start_indices[i])));
+    }
+    for (int i = 0; i < dense_list_in.size(); ++i) {
+      columns.emplace_back(
+          new DenseTensorColumn<InternalType>(dense_list_in[i]));
+    }
+
+    return columns;
+  }
+
+  // Extracts data about the features and populates feature data.
+  void ExtractFeatureData(
+      const OpInputList& indices_list_in, int64 batch_size,
+      std::vector<std::vector<int64>>* feature_counts,
+      std::vector<std::vector<int64>>* feature_start_indices) {
+    gtl::InlinedVector<int64, 8> current_row(indices_list_in.size(), 0);
+    for (int b = 0; b < batch_size; b++) {
+      for (int i = 0; i < indices_list_in.size(); i++) {
+        const auto indices = indices_list_in[i].matrix<int64>();
+        int64 feature_count = 0;
+        int64 start_index = current_row[i];
+        // Loops until we reach next batch index for current feature column.
+        while (current_row[i] < indices_list_in[i].dim_size(0) &&
+               indices(current_row[i], 0) == b) {
+          feature_count++;
+          current_row[i]++;
+        }
+        (*feature_counts)[i].push_back(feature_count);
+        (*feature_start_indices)[i].push_back(start_index);
+      }
+    }
+  }
+
+  // Allocates output tensors with proper size and sets the shape tensor of
+  // the output SparseTensor.
+  // It also output_start_indices which contains the start indices for each
+  // input in the output SparseTensor.
+  void CreateOutputTensors(
+      const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>&
+          columns,
+      int64 batch_size, OpKernelContext* context, Tensor** indices_out,
+      Tensor** values_out, Tensor** shape_out,
+      std::vector<int64>* output_start_indices) {
+    // Calculates dimensions for output tensors.
+    int64 cross_count_total = 0;
+    int64 max_cross_count = 0;
+    for (int64 b = 0; b < batch_size; b++) {
+      // For each input, sets starting indices in output SparseTensor
+      (*output_start_indices)[b] = cross_count_total;
+      const auto cross_count = CrossCountByBatchIndex(columns, b);
+      max_cross_count = std::max(max_cross_count, cross_count);
+      cross_count_total += cross_count;
+    }
+
+    // Allocates tensors.
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       0, TensorShape({cross_count_total, 2}), indices_out));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, TensorShape({cross_count_total}),
+                                            values_out));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(2, TensorShape({2}), shape_out));
+
+    // Sets shape.
+    auto shape_vec = (*shape_out)->vec<int64>();
+    shape_vec(0) = batch_size;
+    shape_vec(1) = max_cross_count;
+  }
+
+  // Returns number of crosses for a given batch_index
+  int64 CrossCountByBatchIndex(
+      const std::vector<std::unique_ptr<ColumnInterface<InternalType>>>&
+          columns,
+      int batch_index) {
+    int64 cross_count = 1;
+    for (int i = 0; i < columns.size(); i++) {
+      const auto feature_count = columns[i]->FeatureCount(batch_index);
+      // If one column is missing any feature, there won't be any cross.
+      if (feature_count == 0) {
+        return 0;
+      }
+      cross_count *= feature_count;
+    }
+    return cross_count;
+  }
+  int64 num_buckets_;
+  uint64 hash_key_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("SparseCross")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("out_type")
+                            .TypeConstraint<string>("internal_type"),
+                        SparseCrossOp<false, StringPiece>);
+
+REGISTER_KERNEL_BUILDER(Name("SparseCross")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("out_type")
+                            .TypeConstraint<int64>("internal_type"),
+                        SparseCrossOp<false, string>);
+
+REGISTER_KERNEL_BUILDER(Name("SparseCross")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int64>("out_type")
+                            .TypeConstraint<string>("internal_type"),
+                        SparseCrossOp<true, int64>);
+
+REGISTER_KERNEL_BUILDER(Name("SparseCross")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<int64>("out_type")
+                            .TypeConstraint<int64>("internal_type"),
+                        SparseCrossOp<true, int64>);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 46e743b4cf9fb568cca75baa26e50d2769655bd3..22c8bc48b444c174cd7212df7042d4b23219fa85 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -1489,10 +1489,6 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
     }
   });
   // Do matrix-matrix multiplication
-  // TODO(jewillco): libxsmm doesn't support beta != 1 yet -- remove when
-  // release
-  // includes beta handling
-  memset(output_data, 0, left_dim0 * right_dim1 * sizeof(TR));
   ptrdiff_t total_num_mult_blocks =
       libxsmm_spmdm_get_num_compute_blocks(&entry->handle);
   std::atomic<int> cur_mult_block_number;
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_add_op.cc b/tensorflow/core/kernels/sparse_tensor_dense_add_op.cc
index b5093d59fc0e0f3a3246e18023d48aaefdcf38b4..48f38872e253ee969fb7a923812e6a6f0c15ce6e 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_add_op.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_add_op.cc
@@ -47,16 +47,26 @@ class SparseTensorDenseAddOp : public OpKernel {
                     "Input a_indices should be a matrix but received shape: ",
                     a_indices_t->shape().DebugString()));
     OP_REQUIRES(
-        ctx, TensorShapeUtils::IsVector(a_values_t->shape()) &&
-                 TensorShapeUtils::IsVector(a_shape_t->shape()),
+        ctx,
+        TensorShapeUtils::IsVector(a_values_t->shape()) &&
+            TensorShapeUtils::IsVector(a_shape_t->shape()),
         errors::InvalidArgument("Inputs a_values and a_shape should be vectors "
                                 "but received shapes: ",
                                 a_values_t->shape().DebugString(), " and ",
                                 a_shape_t->shape().DebugString()));
-    OP_REQUIRES(ctx, a_shape_t->NumElements() == b->dims(),
-                errors::InvalidArgument(
-                    "Two operands have different dimensions; received: ",
-                    a_shape_t->NumElements(), " and ", b->dims()));
+    OP_REQUIRES(
+        ctx, a_shape_t->NumElements() == b->dims(),
+        errors::InvalidArgument("Two operands have different ranks; received: ",
+                                a_shape_t->NumElements(), " and ", b->dims()));
+    const auto a_shape_flat = a_shape_t->flat<Index>();
+    for (int i = 0; i < b->dims(); ++i) {
+      OP_REQUIRES(
+          ctx, a_shape_flat(i) == b->dim_size(i),
+          errors::InvalidArgument(
+              "Dimension ", i,
+              " does not equal (no broadcasting is supported): sparse side ",
+              a_shape_flat(i), " vs dense side ", b->dim_size(i)));
+    }
 
     Tensor *out_t;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, b->shape(), &out_t));
@@ -82,8 +92,9 @@ class SparseTensorDenseAddOp : public OpKernel {
       NDIMS_CASE(4);
       NDIMS_CASE(5);
       default:
-        OP_REQUIRES(ctx, false, errors::InvalidArgument(
-                                    "Only tensors with ranks between 1 and 5 "
+        OP_REQUIRES(
+            ctx, false,
+            errors::InvalidArgument("Only tensors with ranks between 1 and 5 "
                                     "are currently supported.  Tensor rank: ",
                                     ndims));
 #undef NDIMS_CASE
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_add_op.h b/tensorflow/core/kernels/sparse_tensor_dense_add_op.h
index b06dcf143ec2a6f9e30bb69b36e8f88d4c53b201..353cf0e51909ea8025c3d2c06cd5b1f3ed58b917 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_add_op.h
+++ b/tensorflow/core/kernels/sparse_tensor_dense_add_op.h
@@ -24,7 +24,7 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
-// TOOD(zongheng): this should be a general functor that powers SparseAdd and
+// TODO(zongheng): this should be a general functor that powers SparseAdd and
 // ScatterNd ops.  It should be moved to its own head file, once the other ops
 // are implemented.
 template <typename Device, typename T, typename Index, int NDIMS,
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
index 1669ac47c82594b535e2b6ce717ebeb52b0809c9..30c57ef287f4c645b198da6ebf6b8554dde4fd12 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
@@ -29,7 +29,7 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename Tindices>
 class SparseTensorDenseMatMulOp : public OpKernel {
  public:
   explicit SparseTensorDenseMatMulOp(OpKernelConstruction* ctx)
@@ -65,7 +65,8 @@ class SparseTensorDenseMatMulOp : public OpKernel {
     OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_indices->shape()),
                 errors::InvalidArgument("Tensor 'a_indices' is not a matrix"));
 
-    OP_REQUIRES(ctx, a_indices->shape().dim_size(0) == a_values->NumElements(),
+    const int64 nnz = a_indices->shape().dim_size(0);
+    OP_REQUIRES(ctx, nnz == a_values->NumElements(),
                 errors::InvalidArgument("Number of rows of a_indices does not "
                                         "match number of entries in a_values"));
 
@@ -89,8 +90,28 @@ class SparseTensorDenseMatMulOp : public OpKernel {
             inner_left, " vs. ", inner_right,
             ".  Did you forget a transpose?  "
             "Dimensions of A: [",
-            a_shape_t(0), ", ", a_shape_t(1), ").  Dimensions of B: ",
-            b->shape().DebugString()));
+            a_shape_t(0), ", ", a_shape_t(1),
+            ").  Dimensions of B: ", b->shape().DebugString()));
+
+    if (std::is_same<Device, GPUDevice>::value) {
+      // The GPU implementation is optimized to use 32 bit indexing, so
+      // give a friendly error to the programmer early on if they
+      // exceed.
+      const int int32max = std::numeric_limits<int>::max();
+      OP_REQUIRES(
+          ctx,
+          (FastBoundsCheck(inner_left, int32max) &&
+           FastBoundsCheck(inner_right, int32max) &&
+           FastBoundsCheck(outer_left, int32max) &&
+           FastBoundsCheck(outer_right, int32max) &&
+           FastBoundsCheck(b->NumElements(), int32max) &&
+           FastBoundsCheck(outer_left * outer_right, int32max) &&
+           FastBoundsCheck(a_values->NumElements(), int32max)),
+          errors::InvalidArgument("Cannot use GPU for > 2^31 entry inputs"));
+      OP_REQUIRES(ctx, FastBoundsCheck(nnz * outer_right, int32max),
+                  errors::InvalidArgument(
+                      "Cannot use GPU when output.shape[1] * nnz(a) > 2^31"));
+    }
 
     TensorShape out_shape({outer_left, outer_right});
     Tensor* out = nullptr;
@@ -111,40 +132,14 @@ class SparseTensorDenseMatMulOp : public OpKernel {
       return;
     }
 
-    Tensor scratch;
-
-    if (std::is_same<Device, GPUDevice>::value) {
-      // The GPU implementation is optimized to use 32 bit indexing, so
-      // give a friendly error to the programmer early on if they exceed.
-      OP_REQUIRES(
-          ctx,
-          FastBoundsCheck(inner_left, std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(inner_right, std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(outer_left, std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(outer_right, std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(b->NumElements(),
-                              std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(out->NumElements(),
-                              std::numeric_limits<int>::max()) &&
-              FastBoundsCheck(a_values->NumElements(),
-                              std::numeric_limits<int>::max()),
-          errors::InvalidArgument("Cannot use GPU for > 2^31 entry inputs"));
-      const int nnz = static_cast<const int>(a_values->NumElements());
-      // Need nnz length vec scratch space on the GPU.
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                             TensorShape({nnz}), &scratch));
-    } else {
-      // We don't need scratch space on the CPU.
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                             TensorShape({0}), &scratch));
-    }
-
-#define MAYBE_ADJOINT(ADJ_A, ADJ_B)                                            \
-  if (adjoint_a_ == ADJ_A && adjoint_b_ == ADJ_B) {                            \
-    functor::SparseTensorDenseMatMulFunctor<Device, T, ADJ_A, ADJ_B>::Compute( \
-        ctx->eigen_device<Device>(), out->matrix<T>(),                         \
-        a_indices->matrix<int64>(), a_values->vec<T>(), b->matrix<T>(),        \
-        scratch.vec<T>());                                                     \
+#define MAYBE_ADJOINT(ADJ_A, ADJ_B)                                        \
+  if (adjoint_a_ == ADJ_A && adjoint_b_ == ADJ_B) {                        \
+    Status functor_status = functor::SparseTensorDenseMatMulFunctor<       \
+        Device, T, Tindices, ADJ_A,                                        \
+        ADJ_B>::Compute(ctx->eigen_device<Device>(), out->matrix<T>(),     \
+                        a_indices->matrix<Tindices>(), a_values->vec<T>(), \
+                        b->matrix<T>());                                   \
+    OP_REQUIRES_OK(ctx, functor_status);                                   \
   }
 
     MAYBE_ADJOINT(false, false);
@@ -160,67 +155,99 @@ class SparseTensorDenseMatMulOp : public OpKernel {
   bool adjoint_b_;
 };
 
-#define REGISTER_CPU(T)                                   \
-  REGISTER_KERNEL_BUILDER(Name("SparseTensorDenseMatMul") \
-                              .Device(DEVICE_CPU)         \
-                              .TypeConstraint<T>("T")     \
-                              .HostMemory("a_shape"),     \
-                          SparseTensorDenseMatMulOp<CPUDevice, T>);
-
-REGISTER_CPU(float);
-REGISTER_CPU(double);
-REGISTER_CPU(int32);
-REGISTER_CPU(complex64);
-REGISTER_CPU(complex128);
+#define REGISTER_CPU(TypeT, TypeIndex)           \
+  REGISTER_KERNEL_BUILDER(                       \
+      Name("SparseTensorDenseMatMul")            \
+          .Device(DEVICE_CPU)                    \
+          .TypeConstraint<TypeT>("T")            \
+          .TypeConstraint<TypeIndex>("Tindices") \
+          .HostMemory("a_shape"),                \
+      SparseTensorDenseMatMulOp<CPUDevice, TypeT, TypeIndex>);
+
+#define REGISTER_KERNELS_CPU(T) \
+  REGISTER_CPU(T, int64);       \
+  REGISTER_CPU(T, int32)
+
+REGISTER_KERNELS_CPU(float);
+REGISTER_KERNELS_CPU(double);
+REGISTER_KERNELS_CPU(int32);
+REGISTER_KERNELS_CPU(complex64);
+REGISTER_KERNELS_CPU(complex128);
 
 #if GOOGLE_CUDA
 
 namespace functor {
-#define DECLARE_GPU_SPEC(T, ADJ_A, ADJ_B)                                    \
-  template <>                                                                \
-  void SparseTensorDenseMatMulFunctor<GPUDevice, T, ADJ_A, ADJ_B>::Compute(  \
-      const GPUDevice& d, typename TTypes<T>::Matrix out,                    \
-      TTypes<int64>::ConstMatrix a_indices,                                  \
-      typename TTypes<T>::ConstVec a_values,                                 \
-      typename TTypes<T>::ConstMatrix b, typename TTypes<T>::Vec scratch);   \
-  extern template struct SparseTensorDenseMatMulFunctor<GPUDevice, T, ADJ_A, \
-                                                        ADJ_B>;
-
-#define DECLARE_ADJOINT_GPU_SPEC(T) \
-  DECLARE_GPU_SPEC(T, false, false) \
-  DECLARE_GPU_SPEC(T, false, true)  \
-  DECLARE_GPU_SPEC(T, true, false)  \
-  DECLARE_GPU_SPEC(T, true, true)
+#define DECLARE_GPU_SPEC(T, Tindices, ADJ_A, ADJ_B)                       \
+  template <>                                                             \
+  Status SparseTensorDenseMatMulFunctor<                                  \
+      GPUDevice, T, Tindices, ADJ_A,                                      \
+      ADJ_B>::Compute(const GPUDevice& d, typename TTypes<T>::Matrix out, \
+                      TTypes<Tindices>::ConstMatrix a_indices,            \
+                      typename TTypes<T>::ConstVec a_values,              \
+                      typename TTypes<T>::ConstMatrix b);                 \
+  extern template struct SparseTensorDenseMatMulFunctor<                  \
+      GPUDevice, T, Tindices, ADJ_A, ADJ_B>;
+
+#define REGISTER_GPU_SPEC(T, ADJ_A, ADJ_B)  \
+  DECLARE_GPU_SPEC(T, int32, ADJ_A, ADJ_B); \
+  DECLARE_GPU_SPEC(T, int64, ADJ_A, ADJ_B)
+
+#define DECLARE_ADJOINT_GPU_SPEC(T)  \
+  REGISTER_GPU_SPEC(T, false, false) \
+  REGISTER_GPU_SPEC(T, false, true)  \
+  REGISTER_GPU_SPEC(T, true, false)  \
+  REGISTER_GPU_SPEC(T, true, true)
 
 DECLARE_ADJOINT_GPU_SPEC(float);
 #undef DECLARE_ADJOINT_GPU_SPEC
 #undef DECLARE_GPU_SPEC
+#undef REGISTER_GPU_SPEC
 
 }  // namespace functor
 
-#define REGISTER_GPU(T)                                   \
-  REGISTER_KERNEL_BUILDER(Name("SparseTensorDenseMatMul") \
-                              .Device(DEVICE_GPU)         \
-                              .TypeConstraint<T>("T")     \
-                              .HostMemory("a_shape"),     \
-                          SparseTensorDenseMatMulOp<GPUDevice, T>);
+#define REGISTER_GPU(TypeT, TypeIndex)           \
+  REGISTER_KERNEL_BUILDER(                       \
+      Name("SparseTensorDenseMatMul")            \
+          .Device(DEVICE_GPU)                    \
+          .TypeConstraint<TypeT>("T")            \
+          .TypeConstraint<TypeIndex>("Tindices") \
+          .HostMemory("a_shape"),                \
+      SparseTensorDenseMatMulOp<GPUDevice, TypeT, TypeIndex>);
+
+#define REGISTER_KERNELS_GPU(T) \
+  REGISTER_GPU(T, int64);       \
+  REGISTER_GPU(T, int32)
 
-REGISTER_GPU(float);
+REGISTER_KERNELS_GPU(float);
 #undef REGISTER_GPU
+#undef REGISTER_KERNELS_GPU
 #endif  // GOOGLE_CUDA
 
 namespace functor {
 
-template <typename T, bool ADJ_A, bool ADJ_B>
-struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
+namespace {
+Status KOutOfBoundsError(int64 k, std::size_t i, int rhs_index_a,
+                         std::size_t lhs_right) {
+  return errors::InvalidArgument("k (", k, ") from index[", i, ",", rhs_index_a,
+                                 "] out of bounds (>=", lhs_right, ")");
+}
+
+Status MOutOfBoundsError(int64 m, std::size_t i, int lhs_index_a,
+                         int64 out_dim0) {
+  return errors::InvalidArgument("m (", m, ") from index[", i, ",", lhs_index_a,
+                                 "] out of bounds (>=", out_dim0, ")");
+}
+}  // namespace
+
+template <typename T, typename Tindices, bool ADJ_A, bool ADJ_B>
+struct SparseTensorDenseMatMulFunctor<CPUDevice, T, Tindices, ADJ_A, ADJ_B> {
   // Vectorize certain operations above this size.
   static const std::size_t kNumVectorize = 32;
 
-  static void Compute(const CPUDevice& d, typename TTypes<T>::Matrix out,
-                      TTypes<int64>::ConstMatrix a_indices,
-                      typename TTypes<T>::ConstVec a_values,
-                      typename TTypes<T>::ConstMatrix b,
-                      typename TTypes<T>::Vec scratch) {
+  static Status Compute(const CPUDevice& d, typename TTypes<T>::Matrix out,
+                        typename TTypes<Tindices>::ConstMatrix a_indices,
+                        typename TTypes<T>::ConstVec a_values,
+                        typename TTypes<T>::ConstMatrix b) {
     const std::size_t nnz = a_values.size();
     const std::size_t rhs_right = (ADJ_B ? b.dimension(0) : b.dimension(1));
     const std::size_t lhs_right = (ADJ_B ? b.dimension(1) : b.dimension(0));
@@ -236,11 +263,16 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
     if (rhs_right < kNumVectorize) {
       // Disable vectorization if the RHS of output is too small
       auto maybe_adjoint_b = MaybeAdjoint<decltype(b), ADJ_B>(b);
+
       for (std::size_t i = 0; i < nnz; ++i) {
-        const int64 m = internal::SubtleMustCopy(a_indices(i, lhs_index_a));
-        const int64 k = internal::SubtleMustCopy(a_indices(i, rhs_index_a));
-        CHECK_LT(k, lhs_right);
-        CHECK_LT(m, out.dimension(0));
+        const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a));
+        const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a));
+        if (!FastBoundsCheck(k, lhs_right)) {
+          return KOutOfBoundsError(k, i, rhs_index_a, lhs_right);
+        }
+        if (!FastBoundsCheck(m, out.dimension(0))) {
+          return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0));
+        }
         const T a_value = ADJ_A ? MaybeConj(a_values(i)) : a_values(i);
         for (std::size_t n = 0; n < rhs_right; ++n) {
           const T b_value = maybe_adjoint_b(k, n);
@@ -251,15 +283,19 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
       // Vectorization via Eigen.
       const int b_chip_index = ADJ_B ? 1 : 0;
 
-#define LOOP_NNZ(b_passed)                                               \
-  for (std::size_t i = 0; i < nnz; ++i) {                                \
-    const int64 m = internal::SubtleMustCopy(a_indices(i, lhs_index_a)); \
-    const int64 k = internal::SubtleMustCopy(a_indices(i, rhs_index_a)); \
-    const T a_value = (ADJ_A) ? MaybeConj(a_values(i)) : a_values(i);    \
-    CHECK_LT(m, out.dimension(0));                                       \
-    CHECK_LT(k, lhs_right);                                              \
-    out.template chip<0>(m) +=                                           \
-        b_passed.template chip<b_chip_index>(k) * a_value;               \
+#define LOOP_NNZ(b_passed)                                                  \
+  for (std::size_t i = 0; i < nnz; ++i) {                                   \
+    const Tindices m = internal::SubtleMustCopy(a_indices(i, lhs_index_a)); \
+    const Tindices k = internal::SubtleMustCopy(a_indices(i, rhs_index_a)); \
+    const T a_value = (ADJ_A) ? MaybeConj(a_values(i)) : a_values(i);       \
+    if (!FastBoundsCheck(k, lhs_right)) {                                   \
+      return KOutOfBoundsError(k, i, rhs_index_a, lhs_right);               \
+    }                                                                       \
+    if (!FastBoundsCheck(m, out.dimension(0))) {                            \
+      return MOutOfBoundsError(m, i, lhs_index_a, out.dimension(0));        \
+    }                                                                       \
+    out.template chip<0>(m) +=                                              \
+        b_passed.template chip<b_chip_index>(k) * a_value;                  \
   }
 
       if (ADJ_B) {
@@ -274,6 +310,7 @@ struct SparseTensorDenseMatMulFunctor<CPUDevice, T, ADJ_A, ADJ_B> {
       }
 #undef LOOP_NNZ
     }
+    return Status::OK();
   }
 };
 
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
index 3bec4ce5f2db4dd4c384e0007d35bb11365f8909..da131904949763c4b3414f391b57d5d7eaa38bed 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h
@@ -19,19 +19,19 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
 namespace functor {
 
-template <typename Device, typename T, bool ADJ_A, bool ADJ_B>
+template <typename Device, typename T, typename Tindices, bool ADJ_A,
+          bool ADJ_B>
 struct SparseTensorDenseMatMulFunctor {
-  static EIGEN_ALWAYS_INLINE void Compute(const Device& d,
-                                          typename TTypes<T>::Matrix out,
-                                          TTypes<int64>::ConstMatrix a_indices,
-                                          typename TTypes<T>::ConstVec a_values,
-                                          typename TTypes<T>::ConstMatrix b,
-                                          typename TTypes<T>::Vec scratch);
+  static EIGEN_ALWAYS_INLINE Status Compute(
+      const Device& d, typename TTypes<T>::Matrix out,
+      typename TTypes<Tindices>::ConstMatrix a_indices,
+      typename TTypes<T>::ConstVec a_values, typename TTypes<T>::ConstMatrix b);
 };
 
 template <typename MATRIX, bool ADJ>
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
index a1776968f076e578b82035d1324346104cc8c924..e261e42e0d3bf43efc3a1328f07b1362f0870dfd 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
@@ -20,142 +20,90 @@ limitations under the License.
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-namespace generator {
-
-template <typename T, bool ADJ_A, bool ADJ_B>
-class SparseTensorDenseMatMulGPUGenerator {
- public:
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SparseTensorDenseMatMulGPUGenerator(
-      typename TTypes<T, 2>::Tensor32Bit out,
-      TTypes<const int64, 2>::Tensor32Bit a_indices,
-      typename TTypes<const T, 1>::Tensor32Bit a_values,
-      typename TTypes<const T, 2>::Tensor32Bit b)
-      : out_(out),
-        lhs_index_a_(ADJ_A ? 1 : 0),
-        rhs_index_a_(ADJ_A ? 0 : 1),
-        a_indices_(a_indices),
-        a_values_(a_values),
-        lhs_right_size(ADJ_B ? b.dimension(1) : b.dimension(0)),
-        maybe_adjoint_b_(
-            functor::MaybeAdjoint<typename TTypes<const T, 2>::Tensor32Bit,
-                                  ADJ_B>(b)) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T
-  operator()(const Eigen::array<int, 2>& j_and_ix) const {
-#ifdef __CUDA_ARCH__
-    const int j = j_and_ix[0];
-    const int ix = j_and_ix[1];
-    int m = a_indices_(ix, lhs_index_a_);
-    int k = a_indices_(ix, rhs_index_a_);
-    assert(k < lhs_right_size);
-    assert(m < out_.dimension(0));
-    // If asserts are disabled, the caller is violating the sparse
-    // tensor index contract, and so we return invalid results.
-    // Force returning NaNs to try to signal that something is amiss.
-    T b_value;
-    if (k >= lhs_right_size || m >= out_.dimension(0)) {
-      m = 0;
-      k = 0;
-      b_value = std::numeric_limits<T>::quiet_NaN();
-    } else {
-      b_value = maybe_adjoint_b_(k, j);
+template <typename T, typename Tindices, bool ADJ_A, bool ADJ_B>
+__global__ void SparseTensorDenseMatMulKernel(int nnz, int m, int b_rows,
+                                              int b_cols, int p,
+                                              const Tindices* a_indices,
+                                              const T* a_values, const T* b,
+                                              T* out) {
+  // out_{ij} = sum_k {a_ik b_kj}
+  // out = A * B', out_{ij} = sum_k {a_ik (b')_kj}; b'_{kj} = b_{jk}
+  const int n = (ADJ_B) ? b_cols : b_rows;
+  CUDA_1D_KERNEL_LOOP(index, nnz * p) {
+    const int a_ix = index / p;
+    const int j = index % p;
+    const int i = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 1 : 0));
+    const int k = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 0 : 1));
+    if (!FastBoundsCheck(i, m)) {
+      continue;  // Nowhere to signal an error :(
+    }
+    // out[i, j]
+    T* out_location = out + i * p + j;
+    if (!FastBoundsCheck(k, n)) {
+      CudaAtomicAdd(out_location, std::numeric_limits<T>::quiet_NaN());
+      continue;
     }
-    atomicAdd(&out_(m, j), a_values_(ix) * b_value);
-#else
-    assert(false && "This should only be run on the device");
-#endif
-    // Return something
-    return T(0);
-  }
 
- private:
-  mutable typename TTypes<T, 2>::Tensor32Bit out_;
-  const int lhs_index_a_;
-  const int rhs_index_a_;
-  TTypes<const int64, 2>::Tensor32Bit a_indices_;
-  typename TTypes<const T, 1>::Tensor32Bit a_values_;
-  const int lhs_right_size;
-  functor::MaybeAdjoint<typename TTypes<const T, 2>::Tensor32Bit, ADJ_B>
-      maybe_adjoint_b_;
-};
+    // a_value == (ADJ_A) ? a[k, i] : a[i, k]
+    const T a_value = ldg(a_values + a_ix);
 
-}  // namespace generator
+    // b_value == (ADJ_B) ? b[j, k] : b[k, j]
+    const T b_value = ldg(b + ((ADJ_B) ? j * b_cols + k : k * b_cols + j));
+    CudaAtomicAdd(out_location, a_value * b_value);
+  }
+}
 
 namespace functor {
 
-template <typename T, bool ADJ_A, bool ADJ_B>
-struct SparseTensorDenseMatMulFunctor<GPUDevice, T, ADJ_A, ADJ_B> {
-  static EIGEN_ALWAYS_INLINE void Compute(const GPUDevice& d,
-                                          typename TTypes<T>::Matrix out,
-                                          TTypes<int64>::ConstMatrix a_indices,
-                                          typename TTypes<T>::ConstVec a_values,
-                                          typename TTypes<T>::ConstMatrix b,
-                                          typename TTypes<T>::Vec scratch) {
-    generator::SparseTensorDenseMatMulGPUGenerator<T, ADJ_A, ADJ_B>
-        sparse_tensor_dense_matmul_generator(To32Bit(out), To32Bit(a_indices),
-                                             To32Bit(a_values), To32Bit(b));
-    To32Bit(out).device(d) = To32Bit(out).constant(T(0));
+template <typename T, typename Tindices, bool ADJ_A, bool ADJ_B>
+struct SparseTensorDenseMatMulFunctor<GPUDevice, T, Tindices, ADJ_A, ADJ_B> {
+  static EIGEN_ALWAYS_INLINE Status
+  Compute(const GPUDevice& d, typename TTypes<T>::Matrix out,
+          typename TTypes<Tindices>::ConstMatrix a_indices,
+          typename TTypes<T>::ConstVec a_values,
+          typename TTypes<T>::ConstMatrix b) {
+    out.device(d) = out.constant(T(0));
     int nnz = a_values.size();
-    int n = (ADJ_B) ? b.dimension(0) : b.dimension(1);
-
-#if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::Tensor<int, 2>::Dimensions matrix_1_by_nnz{{ 1, nnz }};
-    Eigen::array<int, 2> n_by_1{{ n, 1 }};
-    Eigen::array<int, 1> reduce_on_rows{{ 0 }};
-#else
-    Eigen::IndexList<Eigen::type2index<1>, int> matrix_1_by_nnz;
-    matrix_1_by_nnz.set(1, nnz);
-    Eigen::IndexList<int, Eigen::type2index<1> > n_by_1;
-    n_by_1.set(0, n);
-    Eigen::IndexList<Eigen::type2index<0> > reduce_on_rows;
-#endif
-
-    // How this works: the generator iterates over (j, ix) where j
-    // iterates from 0 .. n - 1 and ix iterates from
-    // 0 .. nnz - 1.  A side effect of the generator is to accumulate
-    // the products of values in A and B into the appropriate location
-    // in the dense matrix out.  In order to run the iteration,
-    // we take a smaller variable and broadcast to a size (n, nnz).
-    // This is the scratch variable.  In order to enforce execution,
-    // we have to perform assignment back into scratch (taking the sum).
-    // We don't care what gets assigned to scratch - only the side effect
-    // of the execution in the generator.
-    //
-    // Note it's not sufficient that scratch be a scalar, and to
-    // broadcast it to a matrix.  Eigen splits the computation not
-    // based on the largest intermediate shape (the size of the
-    // broadcast of scratch) but based on the output shape.  So
-    // scratch needs to be a vector at least.
-    //
-    // Note also that only float type is supported because the
-    // atomicAdd operation is only supported for floats in hardware.
-    To32Bit(scratch).device(d) =
-        To32Bit(scratch)
-            .reshape(matrix_1_by_nnz)
-            .broadcast(n_by_1)
-            .generate(sparse_tensor_dense_matmul_generator)
-            .sum(reduce_on_rows);
+    // out = A * B, A is [m x n] and B is [n x p], out is [m x p]
+    int m = out.dimension(0);
+    int p = out.dimension(1);
+    int b_rows = b.dimension(0);
+    int b_cols = b.dimension(1);
+
+    // TODO(ebrevdo): Should this be alpha * nnz instead of
+    // out.size()?  Perhaps p * nnz ?
+    CudaLaunchConfig config = GetCudaLaunchConfig(p * nnz, d);
+
+    SparseTensorDenseMatMulKernel<T, Tindices, ADJ_A, ADJ_B>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            nnz, m, b_rows, b_cols, p, a_indices.data(), a_values.data(),
+            b.data(), out.data());
+
+    return Status::OK();
   }
 };
 
 }  // namespace functor
 
-#define DEFINE(T)                                                              \
-  template struct functor::SparseTensorDenseMatMulFunctor<GPUDevice, T, false, \
-                                                          false>;              \
-  template struct functor::SparseTensorDenseMatMulFunctor<GPUDevice, T, false, \
-                                                          true>;               \
-  template struct functor::SparseTensorDenseMatMulFunctor<GPUDevice, T, true,  \
-                                                          false>;              \
-  template struct functor::SparseTensorDenseMatMulFunctor<GPUDevice, T, true,  \
-                                                          true>;
-
-DEFINE(float);
+#define DEFINE(T, Tindices)                                \
+  template struct functor::SparseTensorDenseMatMulFunctor< \
+      GPUDevice, T, Tindices, false, false>;               \
+  template struct functor::SparseTensorDenseMatMulFunctor< \
+      GPUDevice, T, Tindices, false, true>;                \
+  template struct functor::SparseTensorDenseMatMulFunctor< \
+      GPUDevice, T, Tindices, true, false>;                \
+  template struct functor::SparseTensorDenseMatMulFunctor< \
+      GPUDevice, T, Tindices, true, true>;
+
+DEFINE(float, int32);
+DEFINE(float, int64);
 #undef DEFINE
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..70837f8ba8314d036263858b0b8e07bca33355a6
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
@@ -0,0 +1,223 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <numeric>
+
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+template <typename T>
+class Dataset : public DatasetBase {
+ public:
+  explicit Dataset(const sparse::SparseTensor& sparse_tensor)
+      : sparse_tensor_(sparse_tensor),
+        dtypes_({DT_INT64, sparse_tensor.dtype(), DT_INT64}),
+        shapes_({{-1, sparse_tensor.dims() - 1},
+                 {-1},
+                 {sparse_tensor.dims() - 1}}) {}
+
+  std::unique_ptr<IteratorBase> MakeIterator() const override {
+    return std::unique_ptr<IteratorBase>(new Iterator(this));
+  }
+
+  const DataTypeVector& output_dtypes() const override { return dtypes_; }
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return shapes_;
+  }
+
+  string DebugString() override {
+    return "SparseTensorSliceDatasetOp::Dataset";
+  }
+
+ private:
+  class Iterator : public DatasetIterator<Dataset<T>> {
+   public:
+    explicit Iterator(const Dataset<T>* dataset)
+        : DatasetIterator<Dataset<T>>(dataset),
+          dataset_(dataset),
+          num_elements_(dataset->sparse_tensor_.shape().dim_size(0)),
+          dense_shape_(DT_INT64, {dataset->sparse_tensor_.dims() - 1}),
+          group_iterable_(dataset->sparse_tensor_.group({0})),
+          iter_(group_iterable_.begin()) {
+      for (size_t i = 0; i < dense_shape_.NumElements(); ++i) {
+        dense_shape_.vec<int64>()(i) =
+            dataset_->sparse_tensor_.shape().dim_size(i + 1);
+      }
+    }
+
+    Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                   bool* end_of_sequence) override {
+      mutex_lock l(mu_);
+      if (i_ == num_elements_) {
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+
+      out_tensors->clear();
+      out_tensors->reserve(3);
+      const int rank = dataset_->sparse_tensor_.dims();
+
+      if (i_ > next_non_empty_i_ && iter_ != group_iterable_.end()) {
+        // We still have elements to consume from `group_iterable_`
+        // and we have emitted all elements up to and including the
+        // current position.
+        sparse::Group group = *iter_;
+        const auto indices = group.indices();
+        const auto values = group.values<T>();
+        const int64 num_entries = values.size();
+        next_non_empty_i_ = indices(0, 0);
+
+        next_indices_ = Tensor(DT_INT64, {num_entries, rank - 1});
+        next_values_ = Tensor(DataTypeToEnum<T>::value, {num_entries});
+
+        auto next_indices_t = next_indices_.matrix<int64>();
+        auto next_values_t = next_values_.vec<T>();
+
+        for (int64 i = 0; i < num_entries; ++i) {
+          for (int d = 1; d < rank; ++d) {
+            next_indices_t(i, d - 1) = indices(i, d);
+          }
+          next_values_t(i) = values(i);
+        }
+
+        ++iter_;
+      }
+
+      if (i_ == next_non_empty_i_) {
+        // The current position is non-empty in the input
+        // `SparseTensor`, and we have already read the value from the
+        // `GroupIterable`.
+        out_tensors->push_back(std::move(next_indices_));
+        out_tensors->push_back(std::move(next_values_));
+        out_tensors->push_back(dense_shape_);
+        next_non_empty_i_ = kNextNonEmptyUnknown;
+      } else {
+        DCHECK(i_ < next_non_empty_i_ || iter_ == group_iterable_.end());
+        // The current position is empty in the input `SparseTensor`,
+        // so emit empty indices and values.
+        out_tensors->push_back(Tensor(DT_INT64, TensorShape({0, rank - 1})));
+        out_tensors->push_back(Tensor(DataTypeToEnum<T>::value, {0}));
+        out_tensors->push_back(dense_shape_);
+      }
+
+      ++i_;
+      *end_of_sequence = false;
+      return Status::OK();
+    }
+
+   private:
+    const Dataset<T>* const dataset_;
+    const int64 num_elements_;
+
+    Tensor dense_shape_;
+
+    mutex mu_;
+    sparse::GroupIterable group_iterable_ GUARDED_BY(mu_);
+    sparse::GroupIterable::IteratorStep iter_ GUARDED_BY(mu_);
+    int64 i_ GUARDED_BY(mu_) = 0;
+    const int64 kNextNonEmptyUnknown = -1;
+    int64 next_non_empty_i_ GUARDED_BY(mu_) = kNextNonEmptyUnknown;
+    Tensor next_indices_ GUARDED_BY(mu_);
+    Tensor next_values_ GUARDED_BY(mu_);
+  };
+
+  const sparse::SparseTensor sparse_tensor_;
+  const DataTypeVector dtypes_;
+  const std::vector<PartialTensorShape> shapes_;
+};
+
+template <typename T>
+class SparseTensorSliceDatasetOp : public OpKernel {
+ public:
+  explicit SparseTensorSliceDatasetOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new SparseTensorSliceDatasetOp::Dataset, insert it in
+    // the step container, and return it as the output.
+    const Tensor* indices;
+    OP_REQUIRES_OK(ctx, ctx->input("indices", &indices));
+    const Tensor* values;
+    OP_REQUIRES_OK(ctx, ctx->input("values", &values));
+    const Tensor* dense_shape;
+    OP_REQUIRES_OK(ctx, ctx->input("dense_shape", &dense_shape));
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(indices->shape()),
+                errors::InvalidArgument(
+                    "Input indices should be a matrix but received shape ",
+                    indices->shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(values->shape()),
+                errors::InvalidArgument(
+                    "Input values should be a vector but received shape ",
+                    indices->shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(dense_shape->shape()),
+                errors::InvalidArgument(
+                    "Input shape should be a vector but received shape ",
+                    dense_shape->shape().DebugString()));
+
+    // We currently ensure that `sparse_tensor` is ordered in the
+    // batch dimension.
+    // TODO(mrry): Investigate ways to avoid this unconditional check
+    // if we can be sure that the sparse tensor was produced in an
+    // appropriate order (e.g. by `tf.parse_example()` or a Dataset
+    // that batches elements into rows of a SparseTensor).
+    int64 previous_batch_index = -1;
+    for (int64 i = 0; i < indices->dim_size(0); ++i) {
+      int64 next_batch_index = indices->matrix<int64>()(i, 0);
+      OP_REQUIRES(
+          ctx, next_batch_index >= previous_batch_index,
+          errors::Unimplemented("The SparseTensor must be ordered in the batch "
+                                "dimension; handling arbitrarily ordered input "
+                                "is not currently supported."));
+      previous_batch_index = next_batch_index;
+    }
+    gtl::InlinedVector<int64, 8> std_order(dense_shape->NumElements(), 0);
+    sparse::SparseTensor sparse_tensor(
+        *indices, *values, TensorShape(dense_shape->vec<int64>()), std_order);
+
+    DatasetBase* dataset = new Dataset<T>(sparse_tensor);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+};
+
+#define REGISTER_DATASET_KERNEL(type)                           \
+  REGISTER_KERNEL_BUILDER(Name("SparseTensorSliceDataset")      \
+                              .Device(DEVICE_CPU)               \
+                              .TypeConstraint<type>("Tvalues"), \
+                          SparseTensorSliceDatasetOp<type>);
+
+TF_CALL_ALL_TYPES(REGISTER_DATASET_KERNEL);
+#undef REGISTER_DATASET_KERNEL
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram.cc b/tensorflow/core/kernels/spectrogram.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7531d5d64a5bbfa6374d4c45d2136720831d6a39
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram.cc
@@ -0,0 +1,212 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/spectrogram.h"
+
+#include <math.h>
+
+#include "third_party/fft2d/fft.h"
+#include "tensorflow/core/lib/core/bits.h"
+
+namespace tensorflow {
+
+using std::complex;
+
+namespace {
+// Returns the default Hann window function for the spectrogram.
+void GetPeriodicHann(int window_length, std::vector<double>* window) {
+  // Some platforms don't have M_PI, so define a local constant here.
+  const double pi = std::atan(1) * 4;
+  window->resize(window_length);
+  for (int i = 0; i < window_length; ++i) {
+    (*window)[i] = 0.5 - 0.5 * cos((2 * pi * i) / window_length);
+  }
+}
+}  // namespace
+
+bool Spectrogram::Initialize(int window_length, int step_length) {
+  std::vector<double> window;
+  GetPeriodicHann(window_length, &window);
+  return Initialize(window, step_length);
+}
+
+bool Spectrogram::Initialize(const std::vector<double>& window,
+                             int step_length) {
+  window_length_ = window.size();
+  window_ = window;  // Copy window.
+  if (window_length_ < 2) {
+    LOG(ERROR) << "Window length too short.";
+    initialized_ = false;
+    return false;
+  }
+
+  step_length_ = step_length;
+  if (step_length_ < 1) {
+    LOG(ERROR) << "Step length must be positive.";
+    initialized_ = false;
+    return false;
+  }
+
+  fft_length_ = NextPowerOfTwo(window_length_);
+  CHECK(fft_length_ >= window_length_);
+  output_frequency_channels_ = 1 + fft_length_ / 2;
+
+  // Allocate 2 more than what rdft needs, so we can rationalize the layout.
+  fft_input_output_.assign(fft_length_ + 2, 0.0);
+
+  int half_fft_length = fft_length_ / 2;
+  fft_double_working_area_.assign(half_fft_length, 0.0);
+  fft_integer_working_area_.assign(2 + static_cast<int>(sqrt(half_fft_length)),
+                                   0);
+  // Set flag element to ensure that the working areas are initialized
+  // on the first call to cdft.  It's redundant given the assign above,
+  // but keep it as a reminder.
+  fft_integer_working_area_[0] = 0;
+  input_queue_.clear();
+  samples_to_next_step_ = window_length_;
+  initialized_ = true;
+  return true;
+}
+
+template <class InputSample, class OutputSample>
+bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<InputSample>& input,
+    std::vector<std::vector<complex<OutputSample>>>* output) {
+  if (!initialized_) {
+    LOG(ERROR) << "ComputeComplexSpectrogram() called before successful call "
+               << "to Initialize().";
+    return false;
+  }
+  CHECK(output);
+  output->clear();
+  int input_start = 0;
+  while (GetNextWindowOfSamples(input, &input_start)) {
+    DCHECK_EQ(input_queue_.size(), window_length_);
+    ProcessCoreFFT();  // Processes input_queue_ to fft_input_output_.
+    // Add a new slice vector onto the output, to save new result to.
+    output->resize(output->size() + 1);
+    // Get a reference to the newly added slice to fill in.
+    auto& spectrogram_slice = output->back();
+    spectrogram_slice.resize(output_frequency_channels_);
+    for (int i = 0; i < output_frequency_channels_; ++i) {
+      // This will convert double to float if it needs to.
+      spectrogram_slice[i] = complex<OutputSample>(
+          fft_input_output_[2 * i], fft_input_output_[2 * i + 1]);
+    }
+  }
+  return true;
+}
+// Instantiate it four ways:
+template bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<float>& input, std::vector<std::vector<complex<float>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<double>& input,
+    std::vector<std::vector<complex<float>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<float>& input,
+    std::vector<std::vector<complex<double>>>*);
+template bool Spectrogram::ComputeComplexSpectrogram(
+    const std::vector<double>& input,
+    std::vector<std::vector<complex<double>>>*);
+
+template <class InputSample, class OutputSample>
+bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<InputSample>& input,
+    std::vector<std::vector<OutputSample>>* output) {
+  if (!initialized_) {
+    LOG(ERROR) << "ComputeSquaredMagnitudeSpectrogram() called before "
+               << "successful call to Initialize().";
+    return false;
+  }
+  CHECK(output);
+  output->clear();
+  int input_start = 0;
+  while (GetNextWindowOfSamples(input, &input_start)) {
+    DCHECK_EQ(input_queue_.size(), window_length_);
+    ProcessCoreFFT();  // Processes input_queue_ to fft_input_output_.
+    // Add a new slice vector onto the output, to save new result to.
+    output->resize(output->size() + 1);
+    // Get a reference to the newly added slice to fill in.
+    auto& spectrogram_slice = output->back();
+    spectrogram_slice.resize(output_frequency_channels_);
+    for (int i = 0; i < output_frequency_channels_; ++i) {
+      // Similar to the Complex case, except storing the norm.
+      // But the norm function is known to be a performance killer,
+      // so do it this way with explicit real and imagninary temps.
+      const double re = fft_input_output_[2 * i];
+      const double im = fft_input_output_[2 * i + 1];
+      // Which finally converts double to float if it needs to.
+      spectrogram_slice[i] = re * re + im * im;
+    }
+  }
+  return true;
+}
+// Instantiate it four ways:
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<float>& input, std::vector<std::vector<float>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<double>& input, std::vector<std::vector<float>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<float>& input, std::vector<std::vector<double>>*);
+template bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
+    const std::vector<double>& input, std::vector<std::vector<double>>*);
+
+// Return true if a full window of samples is prepared; manage the queue.
+template <class InputSample>
+bool Spectrogram::GetNextWindowOfSamples(const std::vector<InputSample>& input,
+                                         int* input_start) {
+  auto input_it = input.begin() + *input_start;
+  int input_remaining = input.end() - input_it;
+  if (samples_to_next_step_ > input_remaining) {
+    // Copy in as many samples are left and return false, no full window.
+    input_queue_.insert(input_queue_.end(), input_it, input.end());
+    *input_start += input_remaining;  // Increases it to input.size().
+    samples_to_next_step_ -= input_remaining;
+    return false;  // Not enough for a full window.
+  } else {
+    // Copy just enough into queue to make a new window, then trim the
+    // front off the queue to make it window-sized.
+    input_queue_.insert(input_queue_.end(), input_it,
+                        input_it + samples_to_next_step_);
+    *input_start += samples_to_next_step_;
+    input_queue_.erase(
+        input_queue_.begin(),
+        input_queue_.begin() + input_queue_.size() - window_length_);
+    DCHECK_EQ(window_length_, input_queue_.size());
+    samples_to_next_step_ = step_length_;  // Be ready for next time.
+    return true;  // Yes, input_queue_ now contains exactly a window-full.
+  }
+}
+
+void Spectrogram::ProcessCoreFFT() {
+  for (int j = 0; j < window_length_; ++j) {
+    fft_input_output_[j] = input_queue_[j] * window_[j];
+  }
+  // Zero-pad the rest of the input buffer.
+  for (int j = window_length_; j < fft_length_; ++j) {
+    fft_input_output_[j] = 0.0;
+  }
+  const int kForwardFFT = 1;  // 1 means forward; -1 reverse.
+  // This real FFT is a fair amount faster than using cdft here.
+  rdft(fft_length_, kForwardFFT, &fft_input_output_[0],
+       &fft_integer_working_area_[0], &fft_double_working_area_[0]);
+  // Make rdft result look like cdft result;
+  // unpack the last real value from the first position's imag slot.
+  fft_input_output_[fft_length_] = fft_input_output_[1];
+  fft_input_output_[fft_length_ + 1] = 0;
+  fft_input_output_[1] = 0;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram.h b/tensorflow/core/kernels/spectrogram.h
new file mode 100644
index 0000000000000000000000000000000000000000..5476a0a961859c3953eb3d4e8e841ead1f154202
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram.h
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Class for generating spectrogram slices from a waveform.
+// Initialize() should be called before calls to other functions.  Once
+// Initialize() has been called and returned true, The Compute*() functions can
+// be called repeatedly with sequential input data (ie. the first element of the
+// next input vector directly follows the last element of the previous input
+// vector). Whenever enough audio samples are buffered to produce a
+// new frame, it will be placed in output. Output is cleared on each
+// call to Compute*(). This class is thread-unsafe, and should only be
+// called from one thread at a time.
+// With the default parameters, the output of this class should be very
+// close to the results of the following MATLAB code:
+// overlap_samples = window_length_samples - step_samples;
+// window = hann(window_length_samples, 'periodic');
+// S = abs(spectrogram(audio, window, overlap_samples)).^2;
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
+
+#include <complex>
+#include <deque>
+#include <vector>
+
+#include "third_party/fft2d/fft.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+class Spectrogram {
+ public:
+  Spectrogram() : initialized_(false) {}
+  ~Spectrogram() {}
+
+  // Initializes the class with a given window length and step length
+  // (both in samples). Internally a Hann window is used as the window
+  // function. Returns true on success, after which calls to Process()
+  // are possible. window_length must be greater than 1 and step
+  // length must be greater than 0.
+  bool Initialize(int window_length, int step_length);
+
+  // Initialize with an explicit window instead of a length.
+  bool Initialize(const std::vector<double>& window, int step_length);
+
+  // Processes an arbitrary amount of audio data (contained in input)
+  // to yield complex spectrogram frames. After a successful call to
+  // Initialize(), Process() may be called repeatedly with new input data
+  // each time.  The audio input is buffered internally, and the output
+  // vector is populated with as many temporally-ordered spectral slices
+  // as it is possible to generate from the input.  The output is cleared
+  // on each call before the new frames (if any) are added.
+  //
+  // The template parameters can be float or double.
+  template <class InputSample, class OutputSample>
+  bool ComputeComplexSpectrogram(
+      const std::vector<InputSample>& input,
+      std::vector<std::vector<std::complex<OutputSample>>>* output);
+
+  // This function works as the one above, but returns the power
+  // (the L2 norm, or the squared magnitude) of each complex value.
+  template <class InputSample, class OutputSample>
+  bool ComputeSquaredMagnitudeSpectrogram(
+      const std::vector<InputSample>& input,
+      std::vector<std::vector<OutputSample>>* output);
+
+  // Return reference to the window function used internally.
+  const std::vector<double>& GetWindow() const { return window_; }
+
+  // Return the number of frequency channels in the spectrogram.
+  int output_frequency_channels() const { return output_frequency_channels_; }
+
+ private:
+  template <class InputSample>
+  bool GetNextWindowOfSamples(const std::vector<InputSample>& input,
+                              int* input_start);
+  void ProcessCoreFFT();
+
+  int fft_length_;
+  int output_frequency_channels_;
+  int window_length_;
+  int step_length_;
+  bool initialized_;
+  int samples_to_next_step_;
+
+  std::vector<double> window_;
+  std::vector<double> fft_input_output_;
+  std::deque<double> input_queue_;
+
+  // Working data areas for the FFT routines.
+  std::vector<int> fft_integer_working_area_;
+  std::vector<double> fft_double_working_area_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(Spectrogram);
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_H_
diff --git a/tensorflow/core/kernels/spectrogram_convert_test_data.cc b/tensorflow/core/kernels/spectrogram_convert_test_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bae13c0213e3a82525e12a7556b63f5a995a19c9
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_convert_test_data.cc
@@ -0,0 +1,56 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/spectrogram_test_utils.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace wav {
+
+// This takes a CSV file representing an array of complex numbers, and saves out
+// a version using a binary format to save space in the repository.
+Status ConvertCsvToRaw(const string& input_filename) {
+  std::vector<std::vector<std::complex<double>>> input_data;
+  ReadCSVFileToComplexVectorOrDie(input_filename, &input_data);
+  const string output_filename = input_filename + ".bin";
+  if (!WriteComplexVectorToRawFloatFile(output_filename, input_data)) {
+    return errors::InvalidArgument("Failed to write raw float file ",
+                                   input_filename);
+  }
+  LOG(INFO) << "Wrote raw file to " << output_filename;
+  return Status::OK();
+}
+
+}  // namespace wav
+}  // namespace tensorflow
+
+int main(int argc, char* argv[]) {
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  if (argc < 2) {
+    LOG(ERROR) << "You must supply a CSV file as the first argument";
+    return 1;
+  }
+  tensorflow::string filename(argv[1]);
+  tensorflow::Status status = tensorflow::wav::ConvertCsvToRaw(filename);
+  if (!status.ok()) {
+    LOG(ERROR) << "Error processing '" << filename << "':" << status;
+    return 1;
+  }
+  return 0;
+}
diff --git a/tensorflow/core/kernels/spectrogram_op.cc b/tensorflow/core/kernels/spectrogram_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e64596b532d00f64813173b3ef1d56355d98450
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_op.cc
@@ -0,0 +1,121 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/audio_ops.cc
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/spectrogram.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Create a spectrogram frequency visualization from audio data.
+class AudioSpectrogramOp : public OpKernel {
+ public:
+  explicit AudioSpectrogramOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("window_size", &window_size_));
+    OP_REQUIRES_OK(context, context->GetAttr("stride", &stride_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("magnitude_squared", &magnitude_squared_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(context, input.dims() == 2,
+                errors::InvalidArgument("input must be 2-dimensional",
+                                        input.shape().DebugString()));
+    Spectrogram spectrogram;
+    OP_REQUIRES(context, spectrogram.Initialize(window_size_, stride_),
+                errors::InvalidArgument(
+                    "Spectrogram initialization failed for window size ",
+                    window_size_, " and stride ", stride_));
+
+    const auto input_as_matrix = input.matrix<float>();
+
+    const int64 sample_count = input.dim_size(0);
+    const int64 channel_count = input.dim_size(1);
+
+    const int64 output_width = spectrogram.output_frequency_channels();
+    const int64 length_minus_window = (sample_count - window_size_);
+    int64 output_height;
+    if (length_minus_window < 0) {
+      output_height = 0;
+    } else {
+      output_height = 1 + (length_minus_window / stride_);
+    }
+    const int64 output_slices = channel_count;
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(
+        context,
+        context->allocate_output(
+            0, TensorShape({output_slices, output_height, output_width}),
+            &output_tensor));
+    auto output_flat = output_tensor->flat<float>().data();
+
+    std::vector<float> input_for_channel(sample_count);
+    for (int64 channel = 0; channel < channel_count; ++channel) {
+      float* output_slice =
+          output_flat + (channel * output_height * output_width);
+      for (int i = 0; i < sample_count; ++i) {
+        input_for_channel[i] = input_as_matrix(i, channel);
+      }
+      std::vector<std::vector<float>> spectrogram_output;
+      OP_REQUIRES(context,
+                  spectrogram.ComputeSquaredMagnitudeSpectrogram(
+                      input_for_channel, &spectrogram_output),
+                  errors::InvalidArgument("Spectrogram compute failed"));
+      OP_REQUIRES(context, (spectrogram_output.size() == output_height),
+                  errors::InvalidArgument(
+                      "Spectrogram size calculation failed: Expected height ",
+                      output_height, " but got ", spectrogram_output.size()));
+      OP_REQUIRES(context,
+                  spectrogram_output.empty() ||
+                      (spectrogram_output[0].size() == output_width),
+                  errors::InvalidArgument(
+                      "Spectrogram size calculation failed: Expected width ",
+                      output_width, " but got ", spectrogram_output[0].size()));
+      for (int row_index = 0; row_index < output_height; ++row_index) {
+        const std::vector<float>& spectrogram_row =
+            spectrogram_output[row_index];
+        DCHECK_EQ(spectrogram_row.size(), output_width);
+        float* output_row = output_slice + (row_index * output_width);
+        if (magnitude_squared_) {
+          for (int i = 0; i < output_width; ++i) {
+            output_row[i] = spectrogram_row[i];
+          }
+        } else {
+          for (int i = 0; i < output_width; ++i) {
+            output_row[i] = sqrtf(spectrogram_row[i]);
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  int32 window_size_;
+  int32 stride_;
+  bool magnitude_squared_;
+};
+REGISTER_KERNEL_BUILDER(Name("AudioSpectrogram").Device(DEVICE_CPU),
+                        AudioSpectrogramOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_op_test.cc b/tensorflow/core/kernels/spectrogram_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c3cbeeeb93fb37c7718cd522d16fc582ff8ba13
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_op_test.cc
@@ -0,0 +1,104 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+using namespace ops;  // NOLINT(build/namespaces)
+
+TEST(SpectrogramOpTest, SimpleTest) {
+  Scope root = Scope::NewRootScope();
+
+  Tensor audio_tensor(DT_FLOAT, TensorShape({8, 1}));
+  test::FillValues<float>(&audio_tensor,
+                          {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f});
+
+  Output audio_const_op = Const(root.WithOpName("audio_const_op"),
+                                Input::Initializer(audio_tensor));
+
+  AudioSpectrogram spectrogram_op =
+      AudioSpectrogram(root.WithOpName("spectrogram_op"), audio_const_op, 8, 1);
+
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+
+  TF_EXPECT_OK(session.Run(ClientSession::FeedType(),
+                           {spectrogram_op.spectrogram}, &outputs));
+
+  const Tensor& spectrogram_tensor = outputs[0];
+
+  EXPECT_EQ(3, spectrogram_tensor.dims());
+  EXPECT_EQ(5, spectrogram_tensor.dim_size(2));
+  EXPECT_EQ(1, spectrogram_tensor.dim_size(1));
+  EXPECT_EQ(1, spectrogram_tensor.dim_size(0));
+
+  test::ExpectTensorNear<float>(
+      spectrogram_tensor,
+      test::AsTensor<float>({0, 1, 2, 1, 0}, TensorShape({1, 1, 5})), 1e-3);
+}
+
+TEST(SpectrogramOpTest, SquaredTest) {
+  Scope root = Scope::NewRootScope();
+
+  Tensor audio_tensor(DT_FLOAT, TensorShape({8, 1}));
+  test::FillValues<float>(&audio_tensor,
+                          {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f});
+
+  Output audio_const_op = Const(root.WithOpName("audio_const_op"),
+                                Input::Initializer(audio_tensor));
+
+  AudioSpectrogram spectrogram_op =
+      AudioSpectrogram(root.WithOpName("spectrogram_op"), audio_const_op, 8, 1,
+                       AudioSpectrogram::Attrs().MagnitudeSquared(true));
+
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+
+  TF_EXPECT_OK(session.Run(ClientSession::FeedType(),
+                           {spectrogram_op.spectrogram}, &outputs));
+
+  const Tensor& spectrogram_tensor = outputs[0];
+
+  EXPECT_EQ(3, spectrogram_tensor.dims());
+  EXPECT_EQ(5, spectrogram_tensor.dim_size(2));
+  EXPECT_EQ(1, spectrogram_tensor.dim_size(1));
+  EXPECT_EQ(1, spectrogram_tensor.dim_size(0));
+
+  test::ExpectTensorNear<float>(
+      spectrogram_tensor,
+      test::AsTensor<float>({0, 1, 4, 1, 0}, TensorShape({1, 1, 5})), 1e-3);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_test.cc b/tensorflow/core/kernels/spectrogram_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..73175a91a00e03095246e6dacef92a428b8ac307
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test.cc
@@ -0,0 +1,340 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The MATLAB test data were generated using GenerateTestData.m.
+
+#include "tensorflow/core/kernels/spectrogram.h"
+
+#include <complex>
+#include <vector>
+
+#include "tensorflow/core/kernels/spectrogram_test_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+using ::std::complex;
+
+const char kInputFilename[] =
+    "core/kernels/spectrogram_test_data/short_test_segment.wav";
+
+const char kExpectedFilename[] =
+    "core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin";
+const int kDataVectorLength = 257;
+const int kNumberOfFramesInTestData = 178;
+
+const char kExpectedNonPowerOfTwoFilename[] =
+    "core/kernels/spectrogram_test_data/"
+    "short_test_segment_spectrogram_400_200.csv.bin";
+const int kNonPowerOfTwoDataVectorLength = 257;
+const int kNumberOfFramesInNonPowerOfTwoTestData = 228;
+
+TEST(SpectrogramTest, TooLittleDataYieldsNoFrames) {
+  Spectrogram sgram;
+  sgram.Initialize(400, 200);
+  std::vector<double> input;
+  // Generate 44 samples of audio.
+  SineWave(44100, 1000.0, 0.001, &input);
+  EXPECT_EQ(44, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  EXPECT_EQ(0, output.size());
+}
+
+TEST(SpectrogramTest, StepSizeSmallerThanWindow) {
+  Spectrogram sgram;
+  EXPECT_TRUE(sgram.Initialize(400, 200));
+  std::vector<double> input;
+  // Generate 661 samples of audio.
+  SineWave(44100, 1000.0, 0.015, &input);
+  EXPECT_EQ(661, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  EXPECT_EQ(2, output.size());
+}
+
+TEST(SpectrogramTest, StepSizeBiggerThanWindow) {
+  Spectrogram sgram;
+  EXPECT_TRUE(sgram.Initialize(200, 400));
+  std::vector<double> input;
+  // Generate 882 samples of audio.
+  SineWave(44100, 1000.0, 0.02, &input);
+  EXPECT_EQ(882, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  EXPECT_EQ(2, output.size());
+}
+
+TEST(SpectrogramTest, StepSizeBiggerThanWindow2) {
+  Spectrogram sgram;
+  EXPECT_TRUE(sgram.Initialize(200, 400));
+  std::vector<double> input;
+  // Generate more than 600 but fewer than 800 samples of audio.
+  SineWave(44100, 1000.0, 0.016, &input);
+  EXPECT_GT(input.size(), 600);
+  EXPECT_LT(input.size(), 800);
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  EXPECT_EQ(2, output.size());
+}
+
+TEST(SpectrogramTest,
+     MultipleCallsToComputeComplexSpectrogramMayYieldDifferentNumbersOfFrames) {
+  // Repeatedly pass inputs with "extra" samples beyond complete windows
+  // and check that the excess points cumulate to eventually cause an
+  // extra output frame.
+  Spectrogram sgram;
+  sgram.Initialize(200, 400);
+  std::vector<double> input;
+  // Generate 882 samples of audio.
+  SineWave(44100, 1000.0, 0.02, &input);
+  EXPECT_EQ(882, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  const std::vector<int> expected_output_sizes = {
+      2,  // One pass of input leaves 82 samples buffered after two steps of
+          // 400.
+      2,  // Passing in 882 samples again will now leave 164 samples buffered.
+      3,  // Third time gives 246 extra samples, triggering an extra output
+          // frame.
+  };
+  for (int expected_output_size : expected_output_sizes) {
+    sgram.ComputeComplexSpectrogram(input, &output);
+    EXPECT_EQ(expected_output_size, output.size());
+  }
+}
+
+TEST(SpectrogramTest, CumulatingExcessInputsForOverlappingFrames) {
+  // Input frames that don't fit into whole windows are cumulated even when
+  // the windows have overlap (similar to
+  // MultipleCallsToComputeComplexSpectrogramMayYieldDifferentNumbersOfFrames
+  // but with window size/hop size swapped).
+  Spectrogram sgram;
+  sgram.Initialize(400, 200);
+  std::vector<double> input;
+  // Generate 882 samples of audio.
+  SineWave(44100, 1000.0, 0.02, &input);
+  EXPECT_EQ(882, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  const std::vector<int> expected_output_sizes = {
+      3,  // Windows 0..400, 200..600, 400..800 with 82 samples buffered.
+      4,  // 1764 frames input; outputs from 600, 800, 1000, 1200..1600.
+      5,  // 2646 frames in; outputs from 1400, 1600, 1800, 2000, 2200..2600.
+  };
+  for (int expected_output_size : expected_output_sizes) {
+    sgram.ComputeComplexSpectrogram(input, &output);
+    EXPECT_EQ(expected_output_size, output.size());
+  }
+}
+
+TEST(SpectrogramTest, StepSizeEqualToWindowWorks) {
+  Spectrogram sgram;
+  sgram.Initialize(200, 200);
+  std::vector<double> input;
+  // Generate 2205 samples of audio.
+  SineWave(44100, 1000.0, 0.05, &input);
+  EXPECT_EQ(2205, input.size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  EXPECT_EQ(11, output.size());
+}
+
+template <class ExpectedSample, class ActualSample>
+void CompareComplexData(
+    const std::vector<std::vector<complex<ExpectedSample>>>& expected,
+    const std::vector<std::vector<complex<ActualSample>>>& actual,
+    double tolerance) {
+  ASSERT_EQ(actual.size(), expected.size());
+  for (int i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(expected[i].size(), actual[i].size());
+    for (int j = 0; j < expected[i].size(); ++j) {
+      ASSERT_NEAR(real(expected[i][j]), real(actual[i][j]), tolerance)
+          << ": where i=" << i << " and j=" << j << ".";
+      ASSERT_NEAR(imag(expected[i][j]), imag(actual[i][j]), tolerance)
+          << ": where i=" << i << " and j=" << j << ".";
+    }
+  }
+}
+
+template <class Sample>
+double GetMaximumAbsolute(const std::vector<std::vector<Sample>>& spectrogram) {
+  double max_absolute = 0.0;
+  for (int i = 0; i < spectrogram.size(); ++i) {
+    for (int j = 0; j < spectrogram[i].size(); ++j) {
+      double absolute_value = std::abs(spectrogram[i][j]);
+      if (absolute_value > max_absolute) {
+        max_absolute = absolute_value;
+      }
+    }
+  }
+  return max_absolute;
+}
+
+template <class ExpectedSample, class ActualSample>
+void CompareMagnitudeData(
+    const std::vector<std::vector<complex<ExpectedSample>>>&
+        expected_complex_output,
+    const std::vector<std::vector<ActualSample>>& actual_squared_magnitude,
+    double tolerance) {
+  ASSERT_EQ(actual_squared_magnitude.size(), expected_complex_output.size());
+  for (int i = 0; i < expected_complex_output.size(); ++i) {
+    ASSERT_EQ(expected_complex_output[i].size(),
+              actual_squared_magnitude[i].size());
+    for (int j = 0; j < expected_complex_output[i].size(); ++j) {
+      ASSERT_NEAR(norm(expected_complex_output[i][j]),
+                  actual_squared_magnitude[i][j], tolerance)
+          << ": where i=" << i << " and j=" << j << ".";
+    }
+  }
+}
+
+TEST(SpectrogramTest, ReInitializationWorks) {
+  Spectrogram sgram;
+  sgram.Initialize(512, 256);
+  std::vector<double> input;
+  CHECK(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &input));
+  std::vector<std::vector<complex<double>>> first_output;
+  std::vector<std::vector<complex<double>>> second_output;
+  sgram.Initialize(512, 256);
+  sgram.ComputeComplexSpectrogram(input, &first_output);
+  // Re-Initialize it.
+  sgram.Initialize(512, 256);
+  sgram.ComputeComplexSpectrogram(input, &second_output);
+  // Verify identical outputs.
+  ASSERT_EQ(first_output.size(), second_output.size());
+  int slice_size = first_output[0].size();
+  for (int i = 0; i < first_output.size(); ++i) {
+    ASSERT_EQ(slice_size, first_output[i].size());
+    ASSERT_EQ(slice_size, second_output[i].size());
+    for (int j = 0; j < slice_size; ++j) {
+      ASSERT_EQ(first_output[i][j], second_output[i][j]);
+    }
+  }
+}
+
+TEST(SpectrogramTest, ComputedComplexDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  sgram.Initialize(512, 256);
+  std::vector<double> input;
+  CHECK(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &input));
+  EXPECT_EQ(kInputDataLength, input.size());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename),
+      kDataVectorLength, &expected_output));
+  EXPECT_EQ(kNumberOfFramesInTestData, expected_output.size());
+  EXPECT_EQ(kDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  CompareComplexData(expected_output, output, 1e-5);
+}
+
+TEST(SpectrogramTest, ComputedFloatComplexDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  sgram.Initialize(512, 256);
+  std::vector<double> double_input;
+  CHECK(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &double_input));
+  std::vector<float> input;
+  input.assign(double_input.begin(), double_input.end());
+  EXPECT_EQ(kInputDataLength, input.size());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename),
+      kDataVectorLength, &expected_output));
+  EXPECT_EQ(kNumberOfFramesInTestData, expected_output.size());
+  EXPECT_EQ(kDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<complex<float>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  CompareComplexData(expected_output, output, 1e-4);
+}
+
+TEST(SpectrogramTest, ComputedSquaredMagnitudeDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  sgram.Initialize(512, 256);
+  std::vector<double> input;
+  CHECK(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &input));
+  EXPECT_EQ(kInputDataLength, input.size());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename),
+      kDataVectorLength, &expected_output));
+  EXPECT_EQ(kNumberOfFramesInTestData, expected_output.size());
+  EXPECT_EQ(kDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<double>> output;
+  sgram.ComputeSquaredMagnitudeSpectrogram(input, &output);
+  CompareMagnitudeData(expected_output, output, 1e-3);
+}
+
+TEST(SpectrogramTest, ComputedFloatSquaredMagnitudeDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  sgram.Initialize(512, 256);
+  std::vector<double> double_input;
+  CHECK(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &double_input));
+  EXPECT_EQ(kInputDataLength, double_input.size());
+  std::vector<float> input;
+  input.assign(double_input.begin(), double_input.end());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kExpectedFilename),
+      kDataVectorLength, &expected_output));
+  EXPECT_EQ(kNumberOfFramesInTestData, expected_output.size());
+  EXPECT_EQ(kDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<float>> output;
+  sgram.ComputeSquaredMagnitudeSpectrogram(input, &output);
+  double max_absolute = GetMaximumAbsolute(output);
+  EXPECT_GT(max_absolute, 2300.0);  // Verify that we have some big numbers.
+  // Squaring increases dynamic range; max square is about 2300,
+  // so 2e-4 is about 7 decimal digits; not bad for a float.
+  CompareMagnitudeData(expected_output, output, 2e-4);
+}
+
+TEST(SpectrogramTest, ComputedNonPowerOfTwoComplexDataAgreeWithMatlab) {
+  const int kInputDataLength = 45870;
+  Spectrogram sgram;
+  sgram.Initialize(400, 200);
+  std::vector<double> input;
+  CHECK(ReadWaveFileToVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(), kInputFilename),
+      &input));
+  EXPECT_EQ(kInputDataLength, input.size());
+  std::vector<std::vector<complex<double>>> expected_output;
+  ASSERT_TRUE(ReadRawFloatFileToComplexVector(
+      tensorflow::io::JoinPath(testing::TensorFlowSrcRoot(),
+                               kExpectedNonPowerOfTwoFilename),
+      kNonPowerOfTwoDataVectorLength, &expected_output));
+  EXPECT_EQ(kNumberOfFramesInNonPowerOfTwoTestData, expected_output.size());
+  EXPECT_EQ(kNonPowerOfTwoDataVectorLength, expected_output[0].size());
+  std::vector<std::vector<complex<double>>> output;
+  sgram.ComputeComplexSpectrogram(input, &output);
+  CompareComplexData(expected_output, output, 1e-5);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_test_data/README b/tensorflow/core/kernels/spectrogram_test_data/README
new file mode 100644
index 0000000000000000000000000000000000000000..271238e0c9c6ab6f620b53f5c1227d221e435769
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_data/README
@@ -0,0 +1,8 @@
+The CSV spectrogram files in this directory are generated from the
+matlab code in ./matlab/GenerateTestData.m
+To save space in the repo, you'll then need to convert them into a binary packed
+format using the convert_test_data.cc command line tool.
+
+
+short_test_segment.wav is approximately 1s of music audio.
+
diff --git a/tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav
new file mode 100644
index 0000000000000000000000000000000000000000..7339dfd08c8b31806e4e12c4421b7b0312173953
Binary files /dev/null and b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav differ
diff --git a/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin
new file mode 100644
index 0000000000000000000000000000000000000000..67b9e2487c3f9cf960ad4af96f0688683e91cb02
Binary files /dev/null and b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram.csv.bin differ
diff --git a/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.bin b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d5e4cc5dd60e96211ced3eaf6f08929bf61cff4e
Binary files /dev/null and b/tensorflow/core/kernels/spectrogram_test_data/short_test_segment_spectrogram_400_200.csv.bin differ
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.cc b/tensorflow/core/kernels/spectrogram_test_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2141c649f78f18bbbbe00c6dce4d95db65583e6
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_utils.cc
@@ -0,0 +1,288 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/spectrogram_test_utils.h"
+
+#include <math.h>
+#include <stddef.h>
+
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/wav/wav_io.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+bool ReadWaveFileToVector(const string& file_name, std::vector<double>* data) {
+  string wav_data;
+  if (!ReadFileToString(Env::Default(), file_name, &wav_data).ok()) {
+    LOG(ERROR) << "Wave file read failed for " << file_name;
+    return false;
+  }
+  std::vector<float> decoded_data;
+  uint32 decoded_sample_count;
+  uint16 decoded_channel_count;
+  uint32 decoded_sample_rate;
+  if (!wav::DecodeLin16WaveAsFloatVector(
+           wav_data, &decoded_data, &decoded_sample_count,
+           &decoded_channel_count, &decoded_sample_rate)
+           .ok()) {
+    return false;
+  }
+  // Convert from float to double for the output value.
+  data->resize(decoded_data.size());
+  for (int i = 0; i < decoded_data.size(); ++i) {
+    (*data)[i] = decoded_data[i];
+  }
+  return true;
+}
+
+bool ReadRawFloatFileToComplexVector(
+    const string& file_name, int row_length,
+    std::vector<std::vector<std::complex<double> > >* data) {
+  data->clear();
+  string data_string;
+  if (!ReadFileToString(Env::Default(), file_name, &data_string).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  float real_out;
+  float imag_out;
+  const int kBytesPerValue = 4;
+  CHECK_EQ(sizeof(real_out), kBytesPerValue);
+  std::vector<std::complex<double> > data_row;
+  int row_counter = 0;
+  int offset = 0;
+  const int end = data_string.size();
+  while (offset < end) {
+    memcpy(&real_out, data_string.data() + offset, kBytesPerValue);
+    offset += kBytesPerValue;
+    memcpy(&imag_out, data_string.data() + offset, kBytesPerValue);
+    offset += kBytesPerValue;
+    if (row_counter >= row_length) {
+      data->push_back(data_row);
+      data_row.clear();
+      row_counter = 0;
+    }
+    data_row.push_back(std::complex<double>(real_out, imag_out));
+    ++row_counter;
+  }
+  if (row_counter >= row_length) {
+    data->push_back(data_row);
+  }
+  return true;
+}
+
+void ReadCSVFileToComplexVectorOrDie(
+    const string& file_name,
+    std::vector<std::vector<std::complex<double> > >* data) {
+  data->clear();
+  string data_string;
+  if (!ReadFileToString(Env::Default(), file_name, &data_string).ok()) {
+    LOG(FATAL) << "Failed to open file " << file_name;
+    return;
+  }
+  std::vector<string> lines = str_util::Split(data_string, '\n');
+  for (const string& line : lines) {
+    if (line == "") {
+      continue;
+    }
+    std::vector<std::complex<double> > data_line;
+    std::vector<string> values = str_util::Split(line, ',');
+    for (std::vector<string>::const_iterator i = values.begin();
+         i != values.end(); ++i) {
+      // each element of values may be in the form:
+      // 0.001+0.002i, 0.001, 0.001i, -1.2i, -1.2-3.2i, 1.5, 1.5e-03+21.0i
+      std::vector<string> parts;
+      // Find the first instance of + or - after the second character
+      // in the string, that does not immediately follow an 'e'.
+      size_t operator_index = i->find_first_of("+-", 2);
+      if (operator_index < i->size() &&
+          i->substr(operator_index - 1, 1) == "e") {
+        operator_index = i->find_first_of("+-", operator_index + 1);
+      }
+      parts.push_back(i->substr(0, operator_index));
+      if (operator_index < i->size()) {
+        parts.push_back(i->substr(operator_index, string::npos));
+      }
+
+      double real_part = 0.0;
+      double imaginary_part = 0.0;
+      for (std::vector<string>::const_iterator j = parts.begin();
+           j != parts.end(); ++j) {
+        if (j->find_first_of("ij") != string::npos) {
+          strings::safe_strtod((*j).c_str(), &imaginary_part);
+        } else {
+          strings::safe_strtod((*j).c_str(), &real_part);
+        }
+      }
+      data_line.push_back(std::complex<double>(real_part, imaginary_part));
+    }
+    data->push_back(data_line);
+  }
+}
+
+void ReadCSVFileToArrayOrDie(const string& filename,
+                             std::vector<std::vector<float> >* array) {
+  string contents;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), filename, &contents));
+  std::vector<string> lines = str_util::Split(contents, '\n');
+  contents.clear();
+
+  array->clear();
+  std::vector<float> values;
+  for (int l = 0; l < lines.size(); ++l) {
+    values.clear();
+    CHECK(str_util::SplitAndParseAsFloats(lines[l], ',', &values));
+    array->push_back(values);
+  }
+}
+
+bool WriteDoubleVectorToFile(const string& file_name,
+                             const std::vector<double>& data) {
+  std::unique_ptr<WritableFile> file;
+  if (!Env::Default()->NewWritableFile(file_name, &file).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  for (int i = 0; i < data.size(); ++i) {
+    if (!file->Append(StringPiece(reinterpret_cast<const char*>(&(data[i])),
+                                  sizeof(data[i])))
+             .ok()) {
+      LOG(ERROR) << "Failed to append to file " << file_name;
+      return false;
+    }
+  }
+  if (!file->Close().ok()) {
+    LOG(ERROR) << "Failed to close file " << file_name;
+    return false;
+  }
+  return true;
+}
+
+bool WriteFloatVectorToFile(const string& file_name,
+                            const std::vector<float>& data) {
+  std::unique_ptr<WritableFile> file;
+  if (!Env::Default()->NewWritableFile(file_name, &file).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  for (int i = 0; i < data.size(); ++i) {
+    if (!file->Append(StringPiece(reinterpret_cast<const char*>(&(data[i])),
+                                  sizeof(data[i])))
+             .ok()) {
+      LOG(ERROR) << "Failed to append to file " << file_name;
+      return false;
+    }
+  }
+  if (!file->Close().ok()) {
+    LOG(ERROR) << "Failed to close file " << file_name;
+    return false;
+  }
+  return true;
+}
+
+bool WriteDoubleArrayToFile(const string& file_name, int size,
+                            const double* data) {
+  std::unique_ptr<WritableFile> file;
+  if (!Env::Default()->NewWritableFile(file_name, &file).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  for (int i = 0; i < size; ++i) {
+    if (!file->Append(StringPiece(reinterpret_cast<const char*>(&(data[i])),
+                                  sizeof(data[i])))
+             .ok()) {
+      LOG(ERROR) << "Failed to append to file " << file_name;
+      return false;
+    }
+  }
+  if (!file->Close().ok()) {
+    LOG(ERROR) << "Failed to close file " << file_name;
+    return false;
+  }
+  return true;
+}
+
+bool WriteFloatArrayToFile(const string& file_name, int size,
+                           const float* data) {
+  std::unique_ptr<WritableFile> file;
+  if (!Env::Default()->NewWritableFile(file_name, &file).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  for (int i = 0; i < size; ++i) {
+    if (!file->Append(StringPiece(reinterpret_cast<const char*>(&(data[i])),
+                                  sizeof(data[i])))
+             .ok()) {
+      LOG(ERROR) << "Failed to append to file " << file_name;
+      return false;
+    }
+  }
+  if (!file->Close().ok()) {
+    LOG(ERROR) << "Failed to close file " << file_name;
+    return false;
+  }
+  return true;
+}
+
+bool WriteComplexVectorToRawFloatFile(
+    const string& file_name,
+    const std::vector<std::vector<std::complex<double> > >& data) {
+  std::unique_ptr<WritableFile> file;
+  if (!Env::Default()->NewWritableFile(file_name, &file).ok()) {
+    LOG(ERROR) << "Failed to open file " << file_name;
+    return false;
+  }
+  for (int i = 0; i < data.size(); ++i) {
+    for (int j = 0; j < data[i].size(); ++j) {
+      const float real_part(real(data[i][j]));
+      if (!file->Append(StringPiece(reinterpret_cast<const char*>(&real_part),
+                                    sizeof(real_part)))
+               .ok()) {
+        LOG(ERROR) << "Failed to append to file " << file_name;
+        return false;
+      }
+
+      const float imag_part(imag(data[i][j]));
+      if (!file->Append(StringPiece(reinterpret_cast<const char*>(&imag_part),
+                                    sizeof(imag_part)))
+               .ok()) {
+        LOG(ERROR) << "Failed to append to file " << file_name;
+        return false;
+      }
+    }
+  }
+  if (!file->Close().ok()) {
+    LOG(ERROR) << "Failed to close file " << file_name;
+    return false;
+  }
+  return true;
+}
+
+void SineWave(int sample_rate, float frequency, float duration_seconds,
+              std::vector<double>* data) {
+  data->clear();
+  for (int i = 0; i < static_cast<int>(sample_rate * duration_seconds); ++i) {
+    data->push_back(
+        sin(2.0 * M_PI * i * frequency / static_cast<double>(sample_rate)));
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.h b/tensorflow/core/kernels/spectrogram_test_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..59a903549e853b0d270ba8cd565830f1310b677e
--- /dev/null
+++ b/tensorflow/core/kernels/spectrogram_test_utils.h
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
+
+#include <complex>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+
+// Reads a wav format file into a vector of floating-point values with range
+// -1.0 to 1.0.
+bool ReadWaveFileToVector(const string& file_name, std::vector<double>* data);
+
+// Reads a binary file containing 32-bit floating point values in the
+// form [real_1, imag_1, real_2, imag_2, ...] into a rectangular array
+// of complex values where row_length is the length of each inner vector.
+bool ReadRawFloatFileToComplexVector(
+    const string& file_name, int row_length,
+    std::vector<std::vector<std::complex<double> > >* data);
+
+// Reads a CSV file of numbers in the format 1.1+2.2i,1.1,2.2i,3.3j into data.
+void ReadCSVFileToComplexVectorOrDie(
+    const string& file_name,
+    std::vector<std::vector<std::complex<double> > >* data);
+
+// Reads a 2D array of floats from an ASCII text file, where each line is a row
+// of the array, and elements are separated by commas.
+void ReadCSVFileToArrayOrDie(const string& filename,
+                             std::vector<std::vector<float> >* array);
+
+// Write a binary file containing 64-bit floating-point values for
+// reading by, for example, MATLAB.
+bool WriteDoubleVectorToFile(const string& file_name,
+                             const std::vector<double>& data);
+
+// Write a binary file containing 32-bit floating-point values for
+// reading by, for example, MATLAB.
+bool WriteFloatVectorToFile(const string& file_name,
+                            const std::vector<float>& data);
+
+// Write a binary file containing 64-bit floating-point values for
+// reading by, for example, MATLAB.
+bool WriteDoubleArrayToFile(const string& file_name, int size,
+                            const double* data);
+
+// Write a binary file containing 32-bit floating-point values for
+// reading by, for example, MATLAB.
+bool WriteFloatArrayToFile(const string& file_name, int size,
+                           const float* data);
+
+// Write a binary file in the format read by
+// ReadRawDoubleFileToComplexVector above.
+bool WriteComplexVectorToRawFloatFile(
+    const string& file_name,
+    const std::vector<std::vector<std::complex<double> > >& data);
+
+// Generate a sine wave with the provided parameters, and populate
+// data with the samples.
+void SineWave(int sample_rate, float frequency, float duration_seconds,
+              std::vector<double>* data);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SPECTROGRAM_TEST_UTILS_H_
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index 40b8952e14aab06d4046e687372fbf610ed9c83d..3c0b5d113b0a899e98b3463662216bd3ca9b7ff3 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -50,12 +50,16 @@ void SplitCustom<Device, T>::operator()(
 #define DEFINE_GPU_KERNELS(T) template struct Split<Eigen::GpuDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+TF_CALL_complex64(DEFINE_GPU_KERNELS);
+TF_CALL_complex128(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(bfloat16);
 
 #undef DEFINE_GPU_KERNELS
 #define DEFINE_GPU_KERNELS(T) template struct SplitCustom<Eigen::GpuDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+TF_CALL_complex64(DEFINE_GPU_KERNELS);
+TF_CALL_complex128(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(bfloat16);
 
 #undef DEFINE_GPU_KERNELS
@@ -236,12 +240,16 @@ struct SplitVOpGPULaunch {
 #define REGISTER_GPU_KERNEL(T) template struct SplitOpGPULaunch<T>;
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+TF_CALL_complex64(REGISTER_GPU_KERNEL);
+TF_CALL_complex128(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 #define REGISTER_GPU_KERNEL(T)                 \
   template struct SplitVOpGPULaunch<T, int32>; \
   template struct SplitVOpGPULaunch<T, int64>;
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+TF_CALL_complex64(REGISTER_GPU_KERNEL);
+TF_CALL_complex128(REGISTER_GPU_KERNEL);
 REGISTER_GPU_KERNEL(bfloat16);
 #undef REGISTER_GPU_KERNEL
 
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index cca2fc41c2c8a541de22da3aa1c759e7d7fc422e..cf22a22fa3af21d68fdd23d89a6efbb7694f6503 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -337,6 +337,8 @@ REGISTER_SPLIT(quint8);
                           SplitOpGPU<type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 #undef REGISTER_GPU
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index 47d98a9f084193d4e632073f3fe6e6f30e37424a..4dff1ea046b88734e3383fa2d20091b832b5800e 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -87,6 +87,12 @@ class SplitVOpBase : public OpKernel {
     // Special case 1: num_split == 1. Nothing to do.
     if (num_split == 1) {
       context->set_output(0, context->input(0));
+      OP_REQUIRES(
+          context, (*split_sizes_vec)[0] == input_size_split_dim,
+          errors::InvalidArgument("If there is only one output, it must have "
+                                  "the same size as the input. Input size: ",
+                                  input_size_split_dim,
+                                  " output size: ", (*split_sizes_vec)[0]));
       *done = true;
       return;
     }
@@ -374,6 +380,8 @@ REGISTER_SPLIT_LEN(bfloat16);
   REGISTER_GPU(type, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_LEN);
+TF_CALL_complex64(REGISTER_GPU_LEN);
+TF_CALL_complex128(REGISTER_GPU_LEN);
 REGISTER_GPU_LEN(bfloat16);
 #undef REGISTER_GPU_LEN
 #undef REGISTER_GPU
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 161ba89212763d7b0cf4bae1ea4357b73ef3667e..45168112cc38b98b82dadd284a1168c5d865d2a2 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <deque>
+#include <numeric>
 #include <vector>
 
 #include "tensorflow/core/framework/op_kernel.h"
@@ -30,26 +31,153 @@ namespace {
 
 class Buffer : public ResourceBase {
  public:
-  explicit Buffer() {}
-
+  // public types
   typedef std::vector<Tensor> Tuple;
 
+ private:
+  // private variables
+  std::size_t capacity_;
+  std::size_t memory_limit_;
+  std::size_t current_bytes_;
+  mutex mu_;
+  condition_variable non_empty_cond_var_;
+  condition_variable full_cond_var_;
+  std::deque<Tuple> buf_ GUARDED_BY(mu_);
+
+
+ private:
+  // private methods
+
+  // If the buffer is configured for bounded capacity, notify
+  // waiting inserters that space is now available
+  void notify_inserters_if_bounded(mutex_lock & l)
+  {
+    if(IsBounded())
+    {
+      l.unlock();
+      full_cond_var_.notify_one();
+    }
+  }
+
+  // Are there a limit number of elements or a memory limit
+  // configued on this buffer?
+  bool IsBounded() {
+    return capacity_ > 0 || memory_limit_ > 0;
+  }
+
+  bool IsCapacityFull() {
+    return buf_.size() >= capacity_;
+  }
+
+  bool WouldExceedMemoryLimit(std::size_t bytes) {
+    return bytes + current_bytes_ > memory_limit_;
+  }
+
+  std::size_t GetTupleBytes(const Tuple & tuple)
+  {
+    return std::accumulate(tuple.begin(), tuple.end(), 0,
+      [](const std::size_t & lhs, const Tensor & rhs) {
+        return lhs + rhs.TotalBytes();
+    });
+  }
+
+ public:
+  // public methods
+  explicit Buffer(std::size_t capacity, std::size_t memory_limit) :
+      capacity_(capacity),
+      memory_limit_(memory_limit),
+      current_bytes_(0) {}
+
   // the Buffer takes ownership of the Tuple
-  void Put(Tuple* tuple) {
+  Status Put(Tuple* tuple) {
     mutex_lock l(mu_);
+
+    std::size_t tuple_bytes = GetTupleBytes(*tuple);
+
+    // Sanity check so that we don't block for ever below
+    if(memory_limit_ > 0 && tuple_bytes > memory_limit_) {
+      return Status(errors::ResourceExhausted("Attempted to insert "
+        "tensors with combined size of '", tuple_bytes, "' bytes into "
+        "Staging Area with a memory limit of '", memory_limit_, "'."));
+    }
+
+
+    // If buffer capacity is bounded wait until elements have been removed
+    if(IsBounded()) {
+      full_cond_var_.wait(l, [tuple_bytes, this]() {
+        // If there's a memory limit, check if there's space for insertion
+        bool memory_limit_valid = memory_limit_ > 0 ?
+            !WouldExceedMemoryLimit(tuple_bytes) : true;
+        // If we're configured for capacity check if there's space for insertion
+        bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true;
+
+        // Stop waiting upon success for both conditions
+        return capacity_valid && memory_limit_valid;
+      });
+    }
+
+    // Update bytes in the Staging Area
+    current_bytes_ += tuple_bytes;
+
+    // Store tuple
     buf_.push_back(std::move(*tuple));
-    non_empty_cond_var_.notify_one();  // maybe possible to optimize by reducing
-                                       // how often this signal is sent
+
+    l.unlock();
+    // maybe possible to optimize by reducing
+    // how often this signal is sent
+    non_empty_cond_var_.notify_one();
+
+    return Status::OK();
   }
 
+  // Get tuple at front of the buffer
   void Get(Tuple* tuple) {  // TODO(zhifengc): Support cancellation.
     mutex_lock l(mu_);
-    while (buf_.empty()) {
-      non_empty_cond_var_.wait(l);
-    }
 
+    // Wait for data if the buffer is empty
+    non_empty_cond_var_.wait(l, [this]() {
+      return !buf_.empty();
+    });
+
+    // Move data into the output tuple
     *tuple = std::move(buf_.front());
     buf_.pop_front();
+
+    // Update bytes in the Staging Area
+    current_bytes_ -= GetTupleBytes(*tuple);
+
+    notify_inserters_if_bounded(l);
+  }
+
+  // Return tuple at index
+  Status Peek(std::size_t index, Tuple* tuple) {
+    mutex_lock l(mu_);
+
+    // Wait if the requested index is not available
+    non_empty_cond_var_.wait(l, [index, this]() {
+      return index < this->buf_.size();
+    });
+
+    // Place tensors in the output tuple
+    for(const auto & tensor: buf_[index]) {
+      tuple->push_back(tensor);
+    }
+
+    return Status::OK();
+  }
+
+  // Buffer size
+  size_t Size() {
+    mutex_lock l(mu_);
+    return buf_.size();
+  }
+
+  void Clear() {
+    mutex_lock l(mu_);
+    buf_.clear();
+    current_bytes_ = 0;
+
+    notify_inserters_if_bounded(l);
   }
 
   string DebugString() {
@@ -57,23 +185,27 @@ class Buffer : public ResourceBase {
     return strings::StrCat("Staging size: ", buf_.size());
   }
 
- private:
-  mutex mu_;
-  condition_variable non_empty_cond_var_;
-  std::deque<Tuple> buf_ GUARDED_BY(mu_);
 };
 
-Status CreateBuffer(Buffer** ret) {
-  *ret = new Buffer;
-  return Status::OK();
-}
-
 Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
   auto rm = ctx->resource_manager();
   ContainerInfo cinfo;
+
+  // Lambda for creating the Staging Area
+  auto create_fn = [&ndef](Buffer** ret) -> Status
+  {
+    int64 capacity;
+    int64 memory_limit;
+    TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "capacity", &capacity));
+    TF_RETURN_IF_ERROR(GetNodeAttr(ndef, "memory_limit", &memory_limit));
+    *ret = new Buffer(capacity, memory_limit);
+    return Status::OK();
+  };
+
+
   TF_RETURN_IF_ERROR(cinfo.Init(rm, ndef, true /* use name() */));
   TF_RETURN_IF_ERROR(rm->LookupOrCreate<Buffer>(cinfo.container(), cinfo.name(),
-                                                buf, CreateBuffer));
+                                                buf, create_fn));
   return Status::OK();
 }
 
@@ -88,10 +220,10 @@ class StageOp : public OpKernel {
     OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf));
     core::ScopedUnref scope(buf);
     Buffer::Tuple tuple;
-    for (int i = 0; i < ctx->num_inputs(); ++i) {
+    for (std::size_t i = 0; i < ctx->num_inputs(); ++i) {
       tuple.push_back(ctx->input(i));
     }
-    buf->Put(&tuple);
+    OP_REQUIRES_OK(ctx, buf->Put(&tuple));
   }
 };
 
@@ -114,11 +246,13 @@ class UnstageOp : public OpKernel {
     OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf));
     core::ScopedUnref scope(buf);
     Buffer::Tuple tuple;
+
     buf->Get(&tuple);
-    OP_REQUIRES(
-        ctx, tuple.size() == (size_t)ctx->num_outputs(),
+
+    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
         errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
                                 " vs. ", ctx->num_outputs()));
+
     for (size_t i = 0; i < tuple.size(); ++i) {
       ctx->set_output(i, tuple[i]);
     }
@@ -133,4 +267,97 @@ REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_GPU), UnstageOp);
 REGISTER_KERNEL_BUILDER(Name("Unstage").Device(DEVICE_SYCL), UnstageOp);
 #endif // TENSORFLOW_USE_SYCL
 
+class StagePeekOp : public OpKernel {
+ public:
+  explicit StagePeekOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Using this op in such a way that it blocks forever
+  // is an error.  As such cancellation is not handled.
+  void Compute(OpKernelContext* ctx) override {
+    Buffer* buf = nullptr;
+    OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf));
+    core::ScopedUnref scope(buf);
+    Buffer::Tuple tuple;
+
+    std::size_t index = ctx->input(0).scalar<int>()();
+
+    OP_REQUIRES_OK(ctx, buf->Peek(index, &tuple));
+
+    OP_REQUIRES(ctx, tuple.size() == (size_t)ctx->num_outputs(),
+        errors::InvalidArgument("Mismatch stage/unstage: ", tuple.size(),
+                                " vs. ", ctx->num_outputs()));
+
+    for (size_t i = 0; i < tuple.size(); ++i) {
+      ctx->set_output(i, tuple[i]);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StagePeek").Device(DEVICE_CPU),
+                                              StagePeekOp);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index").
+                            Device(DEVICE_GPU), StagePeekOp);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("StagePeek").HostMemory("index")
+                          .Device(DEVICE_SYCL), StagePeekOp);
+#endif // TENSORFLOW_USE_SYCL
+
+
+class StageSizeOp : public OpKernel {
+ public:
+  explicit StageSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Using this op in such a way that it blocks forever
+  // is an error.  As such cancellation is not handled.
+  void Compute(OpKernelContext* ctx) override {
+    Buffer* buf = nullptr;
+    OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf));
+    core::ScopedUnref scope(buf);
+
+    // Allocate size output tensor
+    Tensor * size = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}),
+                                                     &size));
+
+    // Set it to the actual size
+    size->scalar<int32>().setConstant(buf->Size());
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StageSize").Device(DEVICE_CPU), StageSizeOp);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
+                        .Device(DEVICE_GPU), StageSizeOp);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("StageSize").HostMemory("size")
+                        .Device(DEVICE_SYCL), StageSizeOp);
+#endif // TENSORFLOW_USE_SYCL
+
+class StageClearOp : public OpKernel {
+ public:
+  explicit StageClearOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  // Using this op in such a way that it blocks forever
+  // is an error.  As such cancellation is not handled.
+  void Compute(OpKernelContext* ctx) override {
+    Buffer* buf = nullptr;
+    OP_REQUIRES_OK(ctx, GetBuffer(ctx, def(), &buf));
+    core::ScopedUnref scope(buf);
+
+    buf->Clear();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_CPU), StageClearOp);
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_GPU), StageClearOp);
+#endif
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("StageClear").Device(DEVICE_SYCL), StageClearOp);
+#endif // TENSORFLOW_USE_SYCL
+
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79d0c07acdee4f3e9e979c00bf8ccbf2853a77cc
--- /dev/null
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -0,0 +1,173 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/random_op.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+namespace {
+
+class StatelessRandomOpBase : public OpKernel {
+ public:
+  explicit StatelessRandomOpBase(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Sanitize input
+    const Tensor& shape_t = context->input(0);
+    const Tensor& seed_t = context->input(1);
+    TensorShape shape;
+    OP_REQUIRES_OK(context, MakeShape(shape_t, &shape));
+    OP_REQUIRES(context, seed_t.dims() == 1 && seed_t.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_t.shape().DebugString()));
+
+    // Allocate output
+    Tensor* output;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output));
+    if (shape.num_elements() == 0) return;
+
+    // Grab the two seeds
+    const auto seed = seed_t.flat<int64>();
+    const uint64 seed0 = internal::SubtleMustCopy(seed(0));
+    const uint64 seed1 = internal::SubtleMustCopy(seed(1));
+
+    // Scramble the seeds so that the user doesn't need to worry about which
+    // part of the seed needs to be strong.
+    random::PhiloxRandom::Key key;
+    random::PhiloxRandom::ResultType counter;
+    key[0] = 0x3ec8f720;
+    key[1] = 0x02461e29;
+    counter[0] = static_cast<uint32>(seed0);
+    counter[1] = static_cast<uint32>(seed0 >> 32);
+    counter[2] = static_cast<uint32>(seed1);
+    counter[3] = static_cast<uint32>(seed1 >> 32);
+    const auto mix = random::PhiloxRandom(counter, key)();
+    key[0] = mix[0];
+    key[1] = mix[1];
+    counter[0] = counter[1] = 0;
+    counter[2] = mix[2];
+    counter[3] = mix[3];
+
+    // Fill in the random numbers
+    Fill(context, random::PhiloxRandom(counter, key), output);
+  }
+
+  // The part of Compute that depends on device, type, and distribution
+  virtual void Fill(OpKernelContext* context, random::PhiloxRandom random,
+                    Tensor* output) = 0;
+};
+
+template <typename Device, class Distribution>
+class StatelessRandomOp : public StatelessRandomOpBase {
+ public:
+  using StatelessRandomOpBase::StatelessRandomOpBase;
+
+  void Fill(OpKernelContext* context, random::PhiloxRandom random,
+            Tensor* output) override {
+    typedef typename Distribution::ResultElementType T;
+    auto flat = output->flat<T>();
+    // Reuse the compute kernels from the stateful random ops
+    functor::FillPhiloxRandom<Device, Distribution>()(
+        context, context->eigen_device<Device>(), random, flat.data(),
+        flat.size(), Distribution());
+  }
+};
+
+}  // namespace
+
+#define REGISTER(TYPE)                                                 \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("StatelessRandomUniform")                                   \
+          .Device(DEVICE_CPU)                                          \
+          .HostMemory("shape")                                         \
+          .TypeConstraint<TYPE>("dtype"),                              \
+      StatelessRandomOp<CPUDevice, random::UniformDistribution<        \
+                                       random::PhiloxRandom, TYPE> >); \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("StatelessRandomNormal")                                    \
+          .Device(DEVICE_CPU)                                          \
+          .HostMemory("shape")                                         \
+          .TypeConstraint<TYPE>("dtype"),                              \
+      StatelessRandomOp<CPUDevice, random::NormalDistribution<         \
+                                       random::PhiloxRandom, TYPE> >); \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("StatelessTruncatedNormal")                                 \
+          .Device(DEVICE_CPU)                                          \
+          .HostMemory("shape")                                         \
+          .TypeConstraint<TYPE>("dtype"),                              \
+      StatelessRandomOp<                                               \
+          CPUDevice,                                                   \
+          random::TruncatedNormalDistribution<                         \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+
+#undef REGISTER
+
+#if GOOGLE_CUDA
+
+#define REGISTER(TYPE)                                                 \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("StatelessRandomUniform")                                   \
+          .Device(DEVICE_GPU)                                          \
+          .HostMemory("shape")                                         \
+          .HostMemory("seed")                                          \
+          .TypeConstraint<int32>("T")                                  \
+          .TypeConstraint<TYPE>("dtype"),                              \
+      StatelessRandomOp<GPUDevice, random::UniformDistribution<        \
+                                       random::PhiloxRandom, TYPE> >); \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("StatelessRandomNormal")                                    \
+          .Device(DEVICE_GPU)                                          \
+          .HostMemory("shape")                                         \
+          .HostMemory("seed")                                          \
+          .TypeConstraint<int32>("T")                                  \
+          .TypeConstraint<TYPE>("dtype"),                              \
+      StatelessRandomOp<GPUDevice, random::NormalDistribution<         \
+                                       random::PhiloxRandom, TYPE> >); \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("StatelessTruncatedNormal")                                 \
+          .Device(DEVICE_GPU)                                          \
+          .HostMemory("shape")                                         \
+          .HostMemory("seed")                                          \
+          .TypeConstraint<int32>("T")                                  \
+          .TypeConstraint<TYPE>("dtype"),                              \
+      StatelessRandomOp<                                               \
+          GPUDevice,                                                   \
+          random::TruncatedNormalDistribution<                         \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+
+#undef REGISTER
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 10593516f7176077b368e4f435c9f82a532d7df8..20a6adc493af2880d3f9430ca1c09f01c8a2b0d0 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/prefetch.h"
@@ -294,8 +295,16 @@ class StridedSliceAssignOp : public OpKernel {
     gtl::InlinedVector<int64, 4> end;
     gtl::InlinedVector<int64, 4> strides;
 
-    context->forward_ref_input_to_ref_output(0, 0);
-    Tensor old_lhs = context->mutable_input(0, true);
+    Tensor old_lhs;
+    if (context->input_dtype(0) == DT_RESOURCE) {
+      Var* v;
+      OP_REQUIRES_OK(context,
+                     LookupResource(context, HandleFromInput(context, 0), &v));
+      old_lhs = *v->tensor();
+    } else {
+      context->forward_ref_input_to_ref_output(0, 0);
+      old_lhs = context->mutable_input(0, true);
+    }
 
     ShapeReadWriteFromTensorShape wrapped_processing_shape(&processing_shape);
     ShapeReadWriteFromTensorShape wrapped_final_shape(&final_shape);
@@ -354,28 +363,35 @@ class StridedSliceAssignOp : public OpKernel {
   int32 ellipsis_mask, new_axis_mask, shrink_axis_mask;
 };
 
-#define REGISTER_STRIDED_SLICE(type)                           \
-  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                 \
-                              .Device(DEVICE_CPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides"),          \
-                          StridedSliceOp<CPUDevice, type>)     \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")             \
-                              .Device(DEVICE_CPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("shape")             \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides"),          \
-                          StridedSliceGradOp<CPUDevice, type>) \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")           \
-                              .Device(DEVICE_CPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides"),          \
+#define REGISTER_STRIDED_SLICE(type)                             \
+  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                   \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides"),            \
+                          StridedSliceOp<CPUDevice, type>)       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")               \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("shape")               \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides"),            \
+                          StridedSliceGradOp<CPUDevice, type>)   \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")             \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides"),            \
+                          StridedSliceAssignOp<CPUDevice, type>) \
+  REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")     \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides"),            \
                           StridedSliceAssignOp<CPUDevice, type>)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
@@ -385,35 +401,44 @@ REGISTER_STRIDED_SLICE(bfloat16);
 
 #if GOOGLE_CUDA
 
-#define REGISTER_GPU(type)                                     \
-  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                 \
-                              .Device(DEVICE_GPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
-                          StridedSliceOp<GPUDevice, type>)     \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")             \
-                              .Device(DEVICE_GPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("shape")             \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
-                          StridedSliceGradOp<GPUDevice, type>) \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")           \
-                              .Device(DEVICE_GPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
+#define REGISTER_GPU(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                   \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<int32>("Index"),   \
+                          StridedSliceOp<GPUDevice, type>)       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")               \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("shape")               \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<int32>("Index"),   \
+                          StridedSliceGradOp<GPUDevice, type>)   \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")             \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<int32>("Index"),   \
+                          StridedSliceAssignOp<GPUDevice, type>) \
+  REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")     \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("begin")               \
+                              .HostMemory("end")                 \
+                              .HostMemory("strides")             \
+                              .TypeConstraint<int32>("Index"),   \
                           StridedSliceAssignOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -448,36 +473,53 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
                             .HostMemory("end")
                             .HostMemory("strides"),
                         StridedSliceAssignOp<CPUDevice, int32>)
+REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Index")
+                            .HostMemory("ref")
+                            .HostMemory("begin")
+                            .HostMemory("end")
+                            .HostMemory("strides"),
+                        StridedSliceAssignOp<CPUDevice, int32>)
 #undef REGISTER_GPU
 
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL(type)                                    \
-  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                 \
-                              .Device(DEVICE_SYCL)             \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
-                          StridedSliceOp<SYCLDevice, type>)    \
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")             \
-                              .Device(DEVICE_SYCL)             \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("shape")             \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
-                          StridedSliceGradOp<SYCLDevice, type>)\
-  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")           \
-                              .Device(DEVICE_SYCL)             \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("end")               \
-                              .HostMemory("strides")           \
-                              .TypeConstraint<int32>("Index"), \
+#define REGISTER_SYCL(type)                                       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSlice")                    \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<type>("T")          \
+                              .HostMemory("begin")                \
+                              .HostMemory("end")                  \
+                              .HostMemory("strides")              \
+                              .TypeConstraint<int32>("Index"),    \
+                          StridedSliceOp<SYCLDevice, type>)       \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceGrad")                \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<type>("T")          \
+                              .HostMemory("shape")                \
+                              .HostMemory("begin")                \
+                              .HostMemory("end")                  \
+                              .HostMemory("strides")              \
+                              .TypeConstraint<int32>("Index"),    \
+                          StridedSliceGradOp<SYCLDevice, type>)   \
+  REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")              \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<type>("T")          \
+                              .HostMemory("begin")                \
+                              .HostMemory("end")                  \
+                              .HostMemory("strides")              \
+                              .TypeConstraint<int32>("Index"),    \
+                          StridedSliceAssignOp<SYCLDevice, type>) \
+  REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")      \
+                              .Device(DEVICE_SYCL)                \
+                              .TypeConstraint<type>("T")          \
+                              .HostMemory("begin")                \
+                              .HostMemory("end")                  \
+                              .HostMemory("strides")              \
+                              .TypeConstraint<int32>("Index"),    \
                           StridedSliceAssignOp<SYCLDevice, type>)
 
 REGISTER_SYCL(float);
@@ -516,6 +558,15 @@ REGISTER_KERNEL_BUILDER(Name("StridedSliceAssign")
                             .HostMemory("end")
                             .HostMemory("strides"),
                         StridedSliceAssignOp<CPUDevice, int32>)
+REGISTER_KERNEL_BUILDER(Name("ResourceStridedSliceAssign")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32>("Index")
+                            .HostMemory("ref")
+                            .HostMemory("begin")
+                            .HostMemory("end")
+                            .HostMemory("strides"),
+                        StridedSliceAssignOp<CPUDevice, int32>)
 #undef REGISTER_SYCL
 #endif // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
index 3679d9b985fc6dd8eb35af52f3f13f7946f4c0ec..a8487f49f4488269e058c6b7ee94d0f82aeb5270 100644
--- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
@@ -52,6 +52,7 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::StridedSliceAssignScalar<GPUDevice, T>;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
+TF_CALL_complex128(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(int32);
 
 #undef DEFINE_GPU_KERNELS
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index d820db348e596c16e7f1af197fcbe189f66d1e8f..df7490486ec2c609a4601722233a261dfa7a0fa9 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -278,9 +278,11 @@ class HandleStridedSliceAssignCase<Device, T, 0> {
 #if GOOGLE_CUDA
 TF_CALL_GPU_PROXY_TYPES(PREVENT_FOR_N_GPU);
 TF_CALL_complex64(PREVENT_FOR_N_GPU);
+TF_CALL_complex128(PREVENT_FOR_N_GPU);
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU);
 TF_CALL_complex64(DECLARE_FOR_N_GPU);
+TF_CALL_complex128(DECLARE_FOR_N_GPU);
 DECLARE_FOR_N_GPU(int32);
 #endif  // END GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/take_dataset_op.cc b/tensorflow/core/kernels/take_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e27a36bc9b8012226480d9bf7dc17c3a2c0ba2c3
--- /dev/null
+++ b/tensorflow/core/kernels/take_dataset_op.cc
@@ -0,0 +1,132 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class TakeDatasetOp : public OpKernel {
+ public:
+  explicit TakeDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new RepeatDatasetOp::Dataset, insert it in the step-local
+    // container, and return it as the output.
+    DatasetBase* input;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &input));
+    core::ScopedUnref unref_input(input);
+
+    const Tensor* count_t;
+    OP_REQUIRES_OK(ctx, ctx->input("count", &count_t));
+    const int64 count = count_t->flat<int64>()(0);
+
+    DatasetBase* dataset = new Dataset(count, input);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(int64 count, const DatasetBase* input)
+        : count_(count), input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      if (count_ < 0) {
+        return input_->MakeIterator();
+      } else if (count_ == 0) {
+        return std::unique_ptr<IteratorBase>(new EmptyIterator(this));
+      } else {
+        return std::unique_ptr<IteratorBase>(new FiniteIterator(this));
+      }
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override { return "TakeDatasetOp::Dataset"; }
+
+   private:
+    class EmptyIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit EmptyIterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset) {}
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+    };
+
+    class FiniteIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit FiniteIterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            i_(0),
+            input_impl_(dataset->input_->MakeIterator()) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
+        while (i_ < dataset()->count_) {
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          if (!*end_of_sequence) {
+            ++i_;
+            return Status::OK();
+          }
+          break;
+        }
+        *end_of_sequence = true;
+        input_impl_.reset();
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const int64 count_;
+    const DatasetBase* const input_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("TakeDataset").Device(DEVICE_CPU),
+                        TakeDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_array.cc b/tensorflow/core/kernels/tensor_array.cc
index ad3f7cb1e55054fb7e792a73a75a61d564c38014..7b85ff2ea4105a150e15f548ca9f881f8a71a43d 100644
--- a/tensorflow/core/kernels/tensor_array.cc
+++ b/tensorflow/core/kernels/tensor_array.cc
@@ -45,6 +45,8 @@ TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU)
 
 #define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T)
 TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU);
+TF_CALL_complex64(TENSOR_ARRAY_WRITE_OR_ADD_GPU);
+TF_CALL_complex128(TENSOR_ARRAY_WRITE_OR_ADD_GPU);
 #undef TENSOR_ARRAY_WRITE_OR_ADD_GPU
 
 #endif  // GOOGLE_CUDA
@@ -67,6 +69,8 @@ TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_CPU)
 
 #define TENSOR_ARRAY_SET_ZERO_GPU(T) TENSOR_ARRAY_SET_ZERO(GPUDevice, T)
 TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU);
+TF_CALL_complex64(TENSOR_ARRAY_SET_ZERO_GPU);
+TF_CALL_complex128(TENSOR_ARRAY_SET_ZERO_GPU);
 #undef TENSOR_ARRAY_SET_ZERO_GPU
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index 47041309942c17aa374d2cd7142f91965108ddd0..b43fafe9218f0ee3072d646ac62ac02763df7140 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -61,6 +61,8 @@ TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_CPU)
 
 #define TENSOR_ARRAY_WRITE_OR_ADD_GPU(T) TENSOR_ARRAY_WRITE_OR_ADD(GPUDevice, T)
 TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_WRITE_OR_ADD_GPU);
+TF_CALL_complex64(TENSOR_ARRAY_WRITE_OR_ADD_GPU);
+TF_CALL_complex128(TENSOR_ARRAY_WRITE_OR_ADD_GPU);
 #undef TENSOR_ARRAY_WRITE_OR_ADD_GPU
 
 #endif  // GOOGLE_CUDA
@@ -86,6 +88,8 @@ TF_CALL_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_CPU)
 
 #define TENSOR_ARRAY_SET_ZERO_GPU(T) TENSOR_ARRAY_SET_ZERO(GPUDevice, T)
 TF_CALL_GPU_NUMBER_TYPES(TENSOR_ARRAY_SET_ZERO_GPU);
+TF_CALL_complex64(TENSOR_ARRAY_SET_ZERO_GPU);
+TF_CALL_complex128(TENSOR_ARRAY_SET_ZERO_GPU);
 #undef TENSOR_ARRAY_SET_ZERO_GPU
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index 8202719b4dd3ed804be452526c01f68f7188d96e..bd7556658a03193121563327e717734036b6d8c2 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -236,6 +236,8 @@ REGISTER_KERNEL_BUILDER(Name("TensorArrayV3").Device(DEVICE_CPU),
                           TensorArrayOp);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
 #undef REGISTER_GPU
 
@@ -288,8 +290,8 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
     // may no longer be resized by new Writes.
     tensor_array->DisableDynamicSize();
 
-    int32 array_size;
-    int32 marked_size;
+    int32 array_size = 0;
+    int32 marked_size = 0;
     TF_RETURN_IF_ERROR(tensor_array->Size(&array_size));
     TF_RETURN_IF_ERROR(tensor_array->MarkedSize(&marked_size));
 
@@ -432,6 +434,8 @@ TF_CALL_ALL_TYPES(REGISTER_WRITE);
                           TensorArrayWriteOp<GPUDevice, type>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
 #undef REGISTER_GPU
 
@@ -519,6 +523,8 @@ TF_CALL_ALL_TYPES(REGISTER_READ)
                           TensorArrayReadOp<GPUDevice, type>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
 #undef REGISTER_GPU
 
@@ -615,6 +621,12 @@ class TensorArrayPackOrGatherOp : public OpKernel {
 
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
+
+    // If output_tensor is empty, there is nothing to concatenate so return it.
+    if (output_shape.num_elements() == 0) {
+      return;
+    }
+
     ConstMatrixVector input_tensors_flat;
     input_tensors_flat.reserve(num_indices);
     auto output_flat =
@@ -712,6 +724,8 @@ REGISTER_GATHER_AND_PACK(bfloat16);
       TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
 #undef REGISTER_GPU
 
@@ -932,6 +946,8 @@ REGISTER_CONCAT(bfloat16);
                           TensorArrayConcatOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
 #undef REGISTER_GPU
 
@@ -1149,6 +1165,8 @@ TF_CALL_ALL_TYPES(REGISTER_SCATTER_AND_UNPACK);
                                    false /* LEGACY_UNPACK */>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 #undef REGISTER_GPU
 
 #endif  // GOOGLE_CUDA
@@ -1314,6 +1332,8 @@ TF_CALL_ALL_TYPES(REGISTER_SPLIT);
                           TensorArraySplitOp<GPUDevice, type>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_complex64(REGISTER_GPU);
+TF_CALL_complex128(REGISTER_GPU);
 #undef REGISTER_GPU
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tensor_dataset_op.cc b/tensorflow/core/kernels/tensor_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b6fcb1978615c76cf6fc7223dba665aa8e3c012
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_dataset_op.cc
@@ -0,0 +1,110 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class TensorDatasetOp : public OpKernel {
+ public:
+  explicit TensorDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new TensorDatasetOp::Dataset, insert it in the step
+    // container, and return it as the output.
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("components", &inputs));
+    // TODO(mrry): Validate that the shapes of the "components" tensors match
+    // the "shapes" attr.;
+    std::vector<Tensor> components;
+    components.reserve(inputs.size());
+    for (const Tensor& t : inputs) {
+      components.push_back(t);
+    }
+    DatasetBase* dataset = new Dataset(std::move(components));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(std::vector<Tensor> tensors)
+        : tensors_(std::move(tensors)) {
+      for (const Tensor& t : tensors_) {
+        dtypes_.push_back(t.dtype());
+        shapes_.emplace_back(t.shape().dim_sizes());
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override { return dtypes_; }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return shapes_;
+    }
+
+    string DebugString() override { return "TensorDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset), produced_(false) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (!produced_) {
+          *out_tensors = dataset()->tensors_;
+          produced_ = true;
+          *end_of_sequence = false;
+          return Status::OK();
+        } else {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+      }
+
+     private:
+      mutex mu_;
+      bool produced_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<Tensor> tensors_;
+    DataTypeVector dtypes_;
+    std::vector<PartialTensorShape> shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorDataset").Device(DEVICE_CPU),
+                        TensorDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/tensor_slice_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc70d2ecc51eceb66d1edf6c5c773817b5f688eb
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_slice_dataset_op.cc
@@ -0,0 +1,185 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class TensorSliceDatasetOp : public OpKernel {
+ public:
+  explicit TensorSliceDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    // Create a new TensorDatasetOp::Dataset, insert it in the step
+    // container, and return it as the output.
+    OpInputList inputs;
+    OP_REQUIRES_OK(ctx, ctx->input_list("components", &inputs));
+    std::vector<Tensor> components;
+    components.reserve(inputs.size());
+    OP_REQUIRES(ctx, inputs[0].dims() > 0,
+                errors::InvalidArgument(
+                    "All components must be at least 1-dimensional"));
+    const int64 num_slices = inputs[0].dim_size(0);
+    for (const Tensor& t : inputs) {
+      components.push_back(t);
+      OP_REQUIRES(ctx, t.dims() > 0,
+                  errors::InvalidArgument(
+                      "All components must be at least 1-dimensional"));
+      OP_REQUIRES(
+          ctx, t.dim_size(0) == num_slices,
+          errors::InvalidArgument(
+              "All components must have the same size in the 0th dimension"));
+    }
+    DatasetBase* dataset = new Dataset(std::move(components));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+        ctx, ctx->step_container()->name(), name());
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, handle, dataset));
+    output->flat<ResourceHandle>()(0) = handle;
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(std::vector<Tensor> tensors)
+        : tensors_(std::move(tensors)) {
+      for (const Tensor& t : tensors_) {
+        dtypes_.push_back(t.dtype());
+        gtl::InlinedVector<int64, 4> partial_dim_sizes;
+        // Handle scalar here. Check that everyone matches here? Or fail
+        // at runtime?
+        for (int i = 1; i < t.dims(); ++i) {
+          partial_dim_sizes.push_back(t.dim_size(i));
+        }
+        shapes_.emplace_back(std::move(partial_dim_sizes));
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override { return dtypes_; }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return shapes_;
+    }
+
+    string DebugString() override { return "TensorSliceDatasetOp::Dataset"; }
+
+   private:
+    template <DataType DT>
+    static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
+                                       int64 index) {
+      typedef typename EnumToDataType<DT>::Type T;
+      DCHECK_NE(parent.dim_size(0), 0);
+      DCHECK_GE(index, 0);
+      if (element->NumElements() !=
+          (parent.NumElements() / parent.dim_size(0))) {
+        TensorShape chip_shape = parent.shape();
+        chip_shape.RemoveDim(0);
+        return errors::Internal(
+            "HandleSliceToElement Cannot copy slice: number of elements does "
+            "not match.  Shapes are: [element]: ",
+            element->shape().DebugString(), ", [parent slice]: ",
+            chip_shape.DebugString());
+      }
+      auto parent_as_matrix = parent.flat_outer_dims<T>();
+      element->flat<T>() = parent_as_matrix.chip(index, 0);
+      return Status::OK();
+    }
+
+    static Status CopySliceToElement(const Tensor& parent, Tensor* element,
+                                     int64 index) {
+#define HANDLE_TYPE(DT)                                                   \
+  if (parent.dtype() == DT) {                                             \
+    TF_RETURN_IF_ERROR(HandleSliceToElement<DT>(parent, element, index)); \
+    return Status::OK();                                                  \
+  }
+      HANDLE_TYPE(DT_FLOAT);
+      HANDLE_TYPE(DT_HALF);
+      HANDLE_TYPE(DT_DOUBLE);
+      HANDLE_TYPE(DT_INT32);
+      HANDLE_TYPE(DT_UINT8);
+      HANDLE_TYPE(DT_INT16);
+      HANDLE_TYPE(DT_INT8);
+      HANDLE_TYPE(DT_STRING);
+      HANDLE_TYPE(DT_COMPLEX64);
+      HANDLE_TYPE(DT_COMPLEX128);
+      HANDLE_TYPE(DT_INT64);
+      HANDLE_TYPE(DT_BOOL);
+      HANDLE_TYPE(DT_QINT8);
+      HANDLE_TYPE(DT_QUINT8);
+      HANDLE_TYPE(DT_QINT32);
+      HANDLE_TYPE(DT_QINT16);
+      HANDLE_TYPE(DT_QUINT16);
+#undef HANDLE_TYPE
+      return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
+                                   element->dtype());
+    }
+
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset),
+            i_(0),
+            n_(dataset->tensors_[0].dim_size(0)) {}
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (i_ < n_) {
+          out_tensors->clear();
+          out_tensors->reserve(dataset()->tensors_.size());
+          for (int i = 0; i < dataset()->tensors_.size(); ++i) {
+            const Tensor& t = dataset()->tensors_[i];
+            Tensor t_slice(cpu_allocator(), t.dtype(),
+                           TensorShape(dataset()->shapes_[i].dim_sizes()));
+            TF_RETURN_IF_ERROR(CopySliceToElement(t, &t_slice, i_));
+            out_tensors->emplace_back(std::move(t_slice));
+          }
+          ++i_;
+          *end_of_sequence = false;
+        } else {
+          *end_of_sequence = true;
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int i_ GUARDED_BY(mu_);
+      const int n_;
+    };
+
+    const std::vector<Tensor> tensors_;
+    DataTypeVector dtypes_;
+    std::vector<PartialTensorShape> shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorSliceDataset").Device(DEVICE_CPU),
+                        TensorSliceDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index 9822b021ebc441110d7a2c5832d04278a46b9da5..06f20cd9ec8a462e6edda546e8d486c8a8c7db6b 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -124,6 +124,10 @@ class TileOp : public OpKernel {
                                   multiples_array[i]));
       output_shape.AddDim(input.dim_size(i) * multiples_array[i]);
     }
+    if (output_shape == input.shape()) {
+      context->set_output(0, input);
+      return;
+    }
     Tensor* result = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
 
@@ -315,6 +319,10 @@ class TileGradientOp : public OpKernel {
       output_shape.AddDim(input.dim_size(i) / multiples_array[i]);
       input_dim_size_vec.push_back(input.dim_size(i));
     }
+    if (output_shape == input.shape()) {
+      context->set_output(0, input);
+      return;
+    }
     Tensor* result = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &result));
 
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
index 00c2e6072c6a0689f6f6a50298432d7019a7e300..62f84e8a446a14736ad1d74aba10fff2c775cbab 100644
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -17,17 +17,22 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "tensorflow/core/kernels/topk_op.h"
 #include <vector>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/top_n.h"
 
 namespace tensorflow {
 
-template <typename T>
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
 class TopK : public OpKernel {
  public:
   explicit TopK(OpKernelConstruction* context) : OpKernel(context) {
@@ -59,8 +64,8 @@ class TopK : public OpKernel {
 
     const auto& input = input_in.flat_inner_dims<T>();
 
-    const auto num_rows = input.dimension(0);  // generally batch_size
-    const auto num_cols = input.dimension(1);
+    const int64 num_rows = input.dimension(0);  // generally batch_size
+    const int64 num_cols = input.dimension(1);
 
     TensorShape output_shape = input_in.shape();
     output_shape.set_dim(input_in.dims() - 1, k);
@@ -76,6 +81,24 @@ class TopK : public OpKernel {
 
     auto values = values_out->flat_inner_dims<T>();
     auto indices = indices_out->flat_inner_dims<int32>();
+    functor::TopKFunctor<Device, T>::Compute(
+        context, sorted_, k, input, num_rows, num_cols, &values, &indices);
+  }
+
+ private:
+  int k_;
+  bool sorted_;
+};
+
+namespace functor {
+
+template <typename T>
+struct TopKFunctor<CPUDevice, T> {
+  static EIGEN_ALWAYS_INLINE void Compute(
+      OpKernelContext* context, bool sorted, int k,
+      const typename TTypes<T, 2>::ConstTensor& input, const int64 num_rows,
+      const int64 num_cols, typename TTypes<T, 2>::Tensor* values,
+      typename TTypes<int, 2>::Tensor* indices) {
     gtl::TopN<std::pair<T, int32>> filter(k);
     for (int r = 0; r < num_rows; r++) {
       for (int32 c = 0; c < num_cols; ++c) {
@@ -85,40 +108,75 @@ class TopK : public OpKernel {
       }
 
       int32 i = 0;
-      if (sorted_ && k > 1) {
+      if (sorted && k > 1) {
         std::unique_ptr<std::vector<std::pair<T, int32>>> top_k(
             filter.Extract());
         for (auto top_k_it = top_k->begin(); top_k_it != top_k->end();
              ++top_k_it, ++i) {
-          values(r, i) = top_k_it->first;
-          indices(r, i) = -top_k_it->second;
+          (*values)(r, i) = top_k_it->first;
+          (*indices)(r, i) = -top_k_it->second;
         }
       } else {
         for (auto top_k_it = filter.unsorted_begin();
              top_k_it != filter.unsorted_end(); ++top_k_it, ++i) {
-          values(r, i) = top_k_it->first;
-          indices(r, i) = -top_k_it->second;
+          (*values)(r, i) = top_k_it->first;
+          (*indices)(r, i) = -top_k_it->second;
         }
       }
       filter.Reset();
     }
   }
-
- private:
-  int k_;
-  bool sorted_;
 };
 
-#define REGISTER_KERNELS_NAME(name, type) \
-  REGISTER_KERNEL_BUILDER(                \
-      Name(#name).Device(DEVICE_CPU).TypeConstraint<type>("T"), TopK<type>)
+}  // namespace functor
+
+#define REGISTER_KERNELS_NAME(name, type)                       \
+  REGISTER_KERNEL_BUILDER(                                      \
+      Name(#name).Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      TopK<CPUDevice, type>)
 
 #define REGISTER_KERNELS(type)       \
   REGISTER_KERNELS_NAME(TopK, type); \
   REGISTER_KERNELS_NAME(TopKV2, type)
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS_TO_NAME
+#undef REGISTER_KERNELS_NAME
 #undef REGISTER_KERNELS
 
-}  // namespace tensorflow
+#ifdef GOOGLE_CUDA
+
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                                  \
+  template <>                                                                \
+  void TopKFunctor<GPUDevice, T>::Compute(                                   \
+      OpKernelContext* context, bool sorted, int k,                          \
+      const typename TTypes<T, 2>::ConstTensor& input, const int64 num_rows, \
+      const int64 num_cols, typename TTypes<T, 2>::Tensor* values,           \
+      typename TTypes<int, 2>::Tensor* indices);                             \
+  extern template struct functor::TopKFunctor<GPUDevice, T>;
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPEC);
+TF_CALL_INTEGRAL_TYPES(DECLARE_GPU_SPEC);
+
+#undef DECLARE_GPU_SPEC
+
+}  // namespace functor
+
+#define REGISTER_KERNELS(type)                                   \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("TopK").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      TopK<GPUDevice, type>)                                     \
+  REGISTER_KERNEL_BUILDER(Name("TopKV2")                         \
+                              .Device(DEVICE_GPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .HostMemory("k"),                  \
+                          TopK<GPUDevice, type>)
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_KERNELS);
+TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS);
+
+#undef REGISTER_KERNELS
+
+#endif  // end GOOGLE_CUDA
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/topk_op.h b/tensorflow/core/kernels/topk_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..799c3e8345936904255d9c0135b5f6c469f3da6b
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op.h
@@ -0,0 +1,42 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_TOPK_OP_H_
+#define TENSORFLOW_TOPK_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct TopKFunctor {
+  static void Compute(OpKernelContext* context, bool sorted, int k,
+                      const typename TTypes<T, 2>::ConstTensor& input,
+                      const int64 num_rows, const int64 num_cols,
+                      typename TTypes<T, 2>::Tensor* values,
+                      typename TTypes<int, 2>::Tensor* indices);
+};
+
+}  // end namespace functor
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_TOPK_OP_H_
diff --git a/tensorflow/core/kernels/topk_op_gpu.cu.cc b/tensorflow/core/kernels/topk_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7bacc2defa15ad67c35806441028644a4adfa70e
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu.cu.cc
@@ -0,0 +1,414 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <cmath>
+#include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/lib/gtl/top_n.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace impl {
+
+enum class HeapType { kMinHeap, kMaxHeap };
+enum class PreferIndices { kLower, kHigher };
+
+template <typename T>
+struct Entry {
+  int index;
+  T value;
+
+  // Test-only.
+  static bool greater(const Entry<T>& a, const Entry<T>& b) {
+    if (a.value == b.value) {
+      return a.index < b.index;
+    }
+    return a.value > b.value;
+  }
+};
+
+template <typename T>
+struct LinearData {
+  typedef impl::Entry<T> Entry;
+
+  __device__ Entry& operator[](std::size_t index) const { return data[index]; }
+
+  __device__ int get_index(int i) const { return data[i].index; }
+  __device__ T get_value(int i) const { return data[i].value; }
+
+  Entry* const data;
+};
+
+template <typename T>
+struct IndirectLinearData {
+  typedef impl::Entry<T> Entry;
+
+  __device__ Entry& operator[](std::size_t index) const { return data[index]; }
+
+  __device__ int get_index(int i) const {
+    return backing_data[data[i].index].index;
+  }
+  __device__ T get_value(int i) const { return data[i].value; }
+
+  Entry* const data;
+  Entry* const backing_data;
+};
+
+#if GOOGLE_CUDA
+template <typename T>
+struct StridedData {
+  typedef impl::Entry<T> Entry;
+
+  __device__ Entry& operator[](std::size_t index) const {
+    return data[index * blockDim.x + threadIdx.x];
+  }
+
+  __device__ int get_index(int i) const { return (*this)[i].index; }
+  __device__ T get_value(int i) const { return (*this)[i].value; }
+
+  Entry* const data;
+};
+#endif
+
+// A heap of Entry<T> that can either work as a min-heap or as a max-heap.
+template <HeapType heapType, PreferIndices preferIndices,
+          template <typename> class Data, typename T>
+struct IndexedHeap {
+  typedef typename Data<T>::Entry Entry;
+  const Data<T> data;
+
+  __device__ bool is_above(int left, int right) {
+    T left_value = data.get_value(left);
+    T right_value = data.get_value(right);
+    if (left_value == right_value) {
+      if (preferIndices == PreferIndices::kLower) {
+        return data.get_index(left) < data.get_index(right);
+      } else {
+        return data.get_index(left) > data.get_index(right);
+      }
+    }
+    if (heapType == HeapType::kMinHeap) {
+      return left_value < right_value;
+    } else {
+      return left_value > right_value;
+    }
+  }
+
+  __device__ void assign(int i, const Entry& entry) { data[i] = entry; }
+
+  __device__ void push_up(int i) {
+    int child = i;
+    int parent;
+    for (; child > 0; child = parent) {
+      parent = (child - 1) / 2;
+      if (!is_above(child, parent)) {
+        // Heap property satisfied.
+        break;
+      }
+      swap(child, parent);
+    }
+  }
+
+  __device__ void swap(int a, int b) {
+    auto tmp = data[b];
+    data[b] = data[a];
+    data[a] = tmp;
+  }
+
+  __device__ void push_root_down(int k) { push_down(0, k); }
+
+  // MAX-HEAPIFY in Cormen
+  __device__ void push_down(int node, int k) {
+    while (true) {
+      const int left = 2 * node + 1;
+      const int right = left + 1;
+      int smallest = node;
+      if (left < k && is_above(left, smallest)) {
+        smallest = left;
+      }
+      if (right < k && is_above(right, smallest)) {
+        smallest = right;
+      }
+      if (smallest == node) {
+        break;
+      }
+      swap(smallest, node);
+      node = smallest;
+    }
+  }
+
+  // BUILD-MAX-HEAPIFY in Cormen
+  __device__ void build(int k) {
+    for (int node = (k - 1) / 2; node >= 0; node--) {
+      push_down(node, k);
+    }
+  }
+
+  // HEAP-EXTRACT-MAX in Cormen
+  __device__ void remove_root(int k) {
+    data[0] = data[k - 1];
+    push_root_down(k - 1);
+  }
+
+  // in-place HEAPSORT in Cormen
+  // This method destroys the heap property.
+  __device__ void sort(int k) {
+    for (int slot = k - 1; slot > 0; slot--) {
+      // This is like remove_root but we insert the element at the end.
+      swap(slot, 0);
+      // Heap is now an element smaller.
+      push_root_down(/*k=*/slot);
+    }
+  }
+
+  __device__ void replace_root(const Entry& entry, int k) {
+    data[0] = entry;
+    push_root_down(k);
+  }
+
+  __device__ const Entry& root() { return data[0]; }
+};
+
+template <HeapType heapType, PreferIndices preferIndices,
+          template <typename> class Data, typename T>
+__device__ IndexedHeap<heapType, preferIndices, Data, T> make_indexed_heap(
+    typename Data<T>::Entry* data) {
+  return IndexedHeap<heapType, preferIndices, Data, T>{Data<T>{data}};
+}
+
+// heapTopK walks over [input, input+length) with `step_size` stride starting at
+// `start_index`.
+// It builds a top-`k` heap that is stored in `heap_entries` using `Accessor` to
+// access elements in `heap_entries`. If sorted=true, the elements will be
+// sorted at the end.
+template <typename T, template <typename> class Data = LinearData>
+__device__ void heapTopK(const T* __restrict__ input, int length, int k,
+                         Entry<T>* __restrict__ heap_entries,
+                         bool sorted = false, int start_index = 0,
+                         int step_size = 1) {
+  assert(k <= length);
+
+  auto heap =
+      make_indexed_heap<HeapType::kMinHeap, PreferIndices::kHigher, Data, T>(
+          heap_entries);
+
+  int heap_end_index = start_index + k * step_size;
+  if (heap_end_index > length) {
+    heap_end_index = length;
+  }
+  // Initialize the min-heap.
+  for (int index = start_index, slot = 0; index < heap_end_index;
+       index += step_size, slot++) {
+    heap.assign(slot, {index, input[index]});
+  }
+
+  heap.build(k);
+
+  // Now iterate over the remaining items.
+  // If an item is smaller than the min element, it is not amongst the top k.
+  // Otherwise, replace the min element with it and push upwards.
+  for (int index = heap_end_index; index < length; index += step_size) {
+    // We prefer elements with lower indices. This is given here.
+    // Later elements automatically have higher indices, so can be discarded.
+    if (input[index] > heap.root().value) {
+      // This element should replace the min.
+      heap.replace_root({index, input[index]}, k);
+    }
+  }
+
+  // Sort if wanted.
+  if (sorted) {
+    heap.sort(k);
+  }
+}
+
+// mergeShards performs a top-k merge on `num_shards` many sorted streams that
+// are sorted and stored in `entries` in a strided way:
+// |s_1 1st|s_2 1st|...s_{num_shards} 1st|s_1 2nd|s_2 2nd|...
+// The overall top k elements are written to `top_k_values` and their indices
+// to top_k_indices.
+// `top_k_heap` is used as temporary storage for the merge heap.
+template <typename T>
+__device__ void mergeShards(int num_shards, int k,
+                            Entry<T>* __restrict__ entries,
+                            Entry<T>* __restrict__ top_k_heap, T* top_k_values,
+                            int* top_k_indices) {
+  // If k < num_shards, we can use a min-heap with k elements to get the top k
+  // of the sorted blocks.
+  // If k > num_shards, we can initialize a min-heap with the top element from
+  // each sorted block.
+  const int heap_size = k < num_shards ? k : num_shards;
+
+  // Min-heap part.
+  {
+    auto min_heap = IndexedHeap<HeapType::kMinHeap, PreferIndices::kHigher,
+                                IndirectLinearData, T>{
+        IndirectLinearData<T>{top_k_heap, entries}};
+    // Initialize the heap as a min-heap.
+    for (int slot = 0; slot < heap_size; slot++) {
+      min_heap.assign(slot, {slot, entries[slot].value});
+    }
+    min_heap.build(heap_size);
+
+    // Now perform top k with the remaining shards (if num_shards > heap_size).
+    for (int shard = heap_size; shard < num_shards; shard++) {
+      const auto entry = entries[shard];
+      const auto root = min_heap.root();
+      if (entry.value < root.value) {
+        continue;
+      }
+      if (entry.value == root.value &&
+          entry.index > entries[root.index].index) {
+        continue;
+      }
+      // This element should replace the min.
+      min_heap.replace_root({shard, entry.value}, heap_size);
+    }
+  }
+
+  // Max-part.
+  {
+    // Turn the min-heap into a max-heap in-place.
+    auto max_heap = IndexedHeap<HeapType::kMaxHeap, PreferIndices::kLower,
+                                IndirectLinearData, T>{
+        IndirectLinearData<T>{top_k_heap, entries}};
+    // Heapify into a max heap.
+    max_heap.build(heap_size);
+
+    // Now extract the minimum k-1 times.
+    // k is treated specially.
+    const int last_k = k - 1;
+    for (int rank = 0; rank < last_k; rank++) {
+      const Entry<T>& max_element = max_heap.root();
+      top_k_values[rank] = max_element.value;
+      int shard_index = max_element.index;
+      top_k_indices[rank] = entries[shard_index].index;
+      int next_shard_index = shard_index + num_shards;
+      // For rank < k-1, each top k heap still contains at least 1 element,
+      // so we can draw a replacement.
+      max_heap.replace_root({next_shard_index, entries[next_shard_index].value},
+                            heap_size);
+    }
+
+    // rank == last_k.
+    const Entry<T>& max_element = max_heap.root();
+    top_k_values[last_k] = max_element.value;
+    int shard_index = max_element.index;
+    top_k_indices[last_k] = entries[shard_index].index;
+  }
+}
+
+extern __shared__ char shared_memory[];
+
+template <typename T>
+__global__ void TopKKernel(const T* input, int length, int k, bool sorted,
+                           T* output, int* indices) {
+  const int batch_index = blockIdx.x;
+  const T* batch_input = input + batch_index * length;
+
+  const int thread_index = threadIdx.x;
+  const int thread_count = blockDim.x;
+
+  Entry<T>* shared_entries = (Entry<T>*)shared_memory;
+
+  heapTopK<T, StridedData>(batch_input, length, k, shared_entries, true,
+                           thread_index, thread_count);
+
+  __syncthreads();
+  if (thread_index == 0) {
+    const int offset = batch_index * k;
+    auto batch_output = output + offset;
+    auto batch_indices = indices + offset;
+    Entry<T>* top_k_heap = shared_entries + thread_count * k;
+
+    mergeShards(thread_count, k, shared_entries, top_k_heap, batch_output,
+                batch_indices);
+  }
+}
+
+template <typename T>
+void LaunchTopKKernel(void* stream, int num_shards, const T* input,
+                      int batch_size, int length, int k, bool sorted, T* output,
+                      int* indices) {
+  // As many shards as possible.
+  if (num_shards <= 0) {
+    constexpr auto shared_memory_size = 48 << 10;  // 48 KB
+    const auto heap_size = k * (sizeof(int) + sizeof(T));
+    // shared_memory_size = (num_shards + 1) * heap_size <=>
+    num_shards = shared_memory_size / heap_size - 1;
+    if (num_shards <= 0) {
+      num_shards = 1;
+    }
+    auto shard_size = length / num_shards;
+    auto min_shard_size = 2 * k;
+    if (shard_size < min_shard_size) {
+      num_shards = length / min_shard_size;
+    }
+    if (num_shards <= 0) {
+      num_shards = 1;
+    } else if (num_shards > 1024) {
+      num_shards = 1024;
+    }
+  }
+  // We are limited by the amount of shared memory we have per block.
+  auto shared_memory_size = (num_shards + 1) * k * sizeof(Entry<T>);
+
+  TopKKernel<<<batch_size, num_shards, shared_memory_size,
+               (cudaStream_t)stream>>>(input, length, k, sorted, output,
+                                       indices);
+}
+
+}  // end namespace impl
+
+namespace functor {
+
+template <typename T>
+struct TopKFunctor<GPUDevice, T> {
+  static EIGEN_ALWAYS_INLINE void Compute(
+      OpKernelContext* context, bool sorted, int k,
+      const typename TTypes<T, 2>::ConstTensor& input, const int64 num_rows,
+      const int64 num_cols, typename TTypes<T, 2>::Tensor* values,
+      typename TTypes<int, 2>::Tensor* indices) {
+    auto stream = context->eigen_gpu_device().stream();
+    impl::LaunchTopKKernel(stream, 0, input.data(), num_rows, num_cols, k,
+                           sorted, values->data(), indices->data());
+  }
+};
+
+}  // end namespace functor
+
+#define INSTANTIATE_TEMPLATE(type) \
+  template struct functor::TopKFunctor<GPUDevice, type>;
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(INSTANTIATE_TEMPLATE);
+TF_CALL_INTEGRAL_TYPES(INSTANTIATE_TEMPLATE);
+#undef INSTANTIATE_TEMPLATE
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_op_helpers.cc b/tensorflow/core/kernels/training_op_helpers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..11d51188fcc21d5ba23f1583bee452b6ed22babe
--- /dev/null
+++ b/tensorflow/core/kernels/training_op_helpers.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+
+namespace tensorflow {
+
+mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input) {
+  if (ctx->input_dtype(input) == DT_RESOURCE) {
+    Var* var;
+    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
+      return var->mu();
+    } else {
+      ctx->CtxFailureWithWarning(
+          errors::Internal("Invalid variable reference."));
+      return nullptr;
+    }
+  }
+  return ctx->input_ref_mutex(input);
+}
+
+// MaybeLockVariableInputMutexesInOrder is a helper function to acquire mutexes
+// in address order to mitigate deadlock.  Returns a vector of acquired mutexes.
+// Safe to pass duplicates - will only lock each distinct mutex once.  If
+// do_lock is false, returns immediately.  Note that this silently doesn't lock
+// mutexes for invalid variable references; in all usages this is followed by
+// GetInputTensor which will signal a failure.
+std::vector<mutex_lock> MaybeLockVariableInputMutexesInOrder(
+    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
+  std::vector<mutex_lock> locks;
+  if (!do_lock) {
+    return locks;
+  }
+  std::vector<mutex*> mutexes;
+  std::vector<int> acquire_order;
+  for (auto input : input_ids) {
+    mutex* mutex = GetTrainingVariableMutex(ctx, input);
+    // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
+    if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) {
+      acquire_order.push_back(mutexes.size());
+      mutexes.push_back(mutex);
+    }
+  }
+  std::sort(acquire_order.begin(), acquire_order.end(),
+            [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });
+
+  for (auto input : acquire_order) {
+    mutex* mu = GetTrainingVariableMutex(ctx, input);
+    if (mu != nullptr) {
+      locks.emplace_back(*mu);
+    }
+  }
+  return locks;
+}
+
+Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
+                                  bool lock_held, Tensor* out) {
+  if (ctx->input_dtype(input) == DT_RESOURCE) {
+    Var* var;
+    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
+      if (lock_held) {
+        *out = *var->tensor();
+      } else {
+        mutex_lock ml(*var->mu());
+        *out = *var->tensor();
+      }
+      return Status::OK();
+    } else {
+      return errors::Internal("Invalid variable reference.");
+    }
+  }
+  *out = ctx->mutable_input(input, lock_held);
+  return Status::OK();
+}
+
+void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
+                                     int output) {
+  if (ctx->input_dtype(input) != DT_RESOURCE) {
+    ctx->forward_ref_input_to_ref_output(input, output);
+  }
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2577d452fa7b008bea04ea599ac269094dbfa00
--- /dev/null
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
+#define TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input);
+
+std::vector<mutex_lock> MaybeLockVariableInputMutexesInOrder(
+    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids);
+
+Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
+                                  bool lock_held, Tensor* out);
+
+void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
+                                     int output);
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index c7a5ea3a9cae113bb0bcbd3827d135779a692532..8e2d6dc74eb710d72e6200f06c7791d76164a419 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
@@ -58,6 +59,18 @@ struct ApplyGradientDescent<SYCLDevice, T> {
 };
 #endif
 
+template <typename T>
+struct ApplyDelayCompensatedGradientDescent<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar variance,
+                  typename TTypes<T>::Flat shadow) {
+    var.device(d) -= lr() * (grad + variance() * grad * (var - shadow));
+    shadow.device(d) = var;
+  }
+};
+
 template <typename T>
 struct ApplyAdadelta<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -244,12 +257,22 @@ struct ApplyAdamNonCuda {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad) {
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
     const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) /
                     (T(1) - beta1_power());
+    // beta1 == μ
+    // beta2 == ν
+    // v     == n
+    // var   == θ
+
     m.device(d) += (grad - m) * (T(1) - beta1());
     v.device(d) += (grad.square() - v) * (T(1) - beta2());
-    var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
+    if (use_nesterov) {
+      var.device(d) -= ((grad * (T(1) - beta1()) + beta1() * m) * alpha) /
+                       (v.sqrt() + epsilon());
+    } else {
+      var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
+    }
   }
 };
 
@@ -294,80 +317,6 @@ struct ApplyCenteredRMSProp<CPUDevice, T> {
 
 }  // namespace functor
 
-mutex* GetMutex(OpKernelContext* ctx, int input) {
-  if (ctx->input_dtype(input) == DT_RESOURCE) {
-    Var* var;
-    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
-      return var->mu();
-    } else {
-      ctx->CtxFailureWithWarning(
-          errors::Internal("Invalid variable reference."));
-      return nullptr;
-    }
-  }
-  return ctx->input_ref_mutex(input);
-}
-
-// MaybeLockMutexesInOrder is a helper function to acquire mutexes in address
-// order to mitigate deadlock.  Returns a vector of acquired mutexes.  Safe to
-// pass duplicates - will only lock each distinct mutex once.  If do_lock is
-// false, returns immediately.  Note that this silently doesn't lock mutexes for
-// invalid variable references; in all usages this is followed by GetInputTensor
-// which will signal a failure.
-std::vector<mutex_lock> MaybeLockMutexesInOrder(
-    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
-  std::vector<mutex_lock> locks;
-  if (!do_lock) {
-    return locks;
-  }
-  std::vector<mutex*> mutexes;
-  std::vector<int> acquire_order;
-  for (auto input : input_ids) {
-    mutex* mutex = GetMutex(ctx, input);
-    // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
-    if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) {
-      acquire_order.push_back(input);
-      mutexes.push_back(mutex);
-    }
-  }
-  std::sort(acquire_order.begin(), acquire_order.end(),
-            [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });
-
-  for (auto input : acquire_order) {
-    mutex* mu = GetMutex(ctx, input);
-    if (mu != nullptr) {
-      locks.emplace_back(*mu);
-    }
-  }
-  return locks;
-}
-
-Status GetInputTensor(OpKernelContext* ctx, int input, bool lock_held,
-                      Tensor* out) {
-  if (ctx->input_dtype(input) == DT_RESOURCE) {
-    Var* var;
-    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
-      if (lock_held) {
-        *out = *var->tensor();
-      } else {
-        mutex_lock ml(*var->mu());
-        *out = *var->tensor();
-      }
-      return Status::OK();
-    } else {
-      return errors::Internal("Invalid variable reference.");
-    }
-  }
-  *out = ctx->mutable_input(input, lock_held);
-  return Status::OK();
-}
-
-void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
-                                     int output) {
-  if (ctx->input_dtype(input) != DT_RESOURCE) {
-    ctx->forward_ref_input_to_ref_output(input, output);
-  }
-}
 
 template <typename Device, typename T>
 class ApplyGradientDescentOp : public OpKernel {
@@ -377,9 +326,11 @@ class ApplyGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -452,6 +403,73 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyDelayCompensatedGradientDescentOp : public OpKernel {
+ public:
+  explicit ApplyDelayCompensatedGradientDescentOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 4});
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    const Tensor& alpha = ctx->input(1);
+    OP_REQUIRES(ctx, IsLegacyScalar(alpha.shape()),
+                errors::InvalidArgument("alpha is not a scalar: ",
+                                        alpha.shape().DebugString()));
+    const Tensor& delta = ctx->input(2);
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(delta.shape()),
+        errors::InvalidArgument("var and delta do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                delta.shape().DebugString()));
+    const Tensor& lambda = ctx->input(3);
+    OP_REQUIRES(ctx, IsLegacyScalar(lambda.shape()),
+                errors::InvalidArgument("lambda is not a scalar: ",
+                                        lambda.shape().DebugString()));
+    Tensor shadow;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 4, use_exclusive_lock_, &shadow));
+    OP_REQUIRES(
+        ctx, shadow.shape().IsSameSize(var.shape()),
+        errors::InvalidArgument("shadow and var do not have the same shape",
+                                shadow.shape().DebugString(), " ",
+                                var.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyDelayCompensatedGradientDescent<Device, T>()(
+        device, var.flat<T>(), alpha.scalar<T>(), delta.flat<T>(),
+        lambda.scalar<T>(), shadow.flat<T>()
+    );
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                 \
+  REGISTER_KERNEL_BUILDER(                                     \
+      Name("ApplyDelayCompensatedGradientDescent")             \
+          .Device(DEVICE_##D)                                  \
+          .HostMemory("var")                                   \
+          .HostMemory("shadow")                                \
+          .TypeConstraint<T>("T"),                             \
+      ApplyDelayCompensatedGradientDescentOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyAdadeltaOp : public OpKernel {
  public:
@@ -461,7 +479,7 @@ class ApplyAdadeltaOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     if (use_exclusive_lock_) {
-      mutex_lock l1(*GetMutex(ctx, 0));
+      mutex_lock l1(*GetTrainingVariableMutex(ctx, 0));
       // Don't try to acquire a lock on the second ref as they share the same
       // mutex.
       //
@@ -482,12 +500,14 @@ class ApplyAdadeltaOp : public OpKernel {
 
   void DoValidate(OpKernelContext* ctx) {
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx,
-                   GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &accum_update));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -534,12 +554,14 @@ class ApplyAdadeltaOp : public OpKernel {
   void DoCompute(OpKernelContext* ctx) {
     const Device& device = ctx->template eigen_device<Device>();
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx,
-                   GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &accum_update));
 
     const Tensor& lr = ctx->input(3);
     const Tensor& rho = ctx->input(4);
@@ -606,7 +628,7 @@ class SparseApplyAdadeltaOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    mutex* mu_var = GetMutex(ctx, 0);
+    mutex* mu_var = GetTrainingVariableMutex(ctx, 0);
     // mu_accum is actually the same mutex as mu_var since currently we use a
     // global mutex.
     //
@@ -615,13 +637,14 @@ class SparseApplyAdadeltaOp : public OpKernel {
       mu_var->lock();
     }
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum_grad;
-    OP_REQUIRES_OK(ctx,
-                   GetInputTensor(ctx, 1, use_exclusive_lock_, &accum_grad));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_,
+                                                   &accum_grad));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx,
-                   GetInputTensor(ctx, 2, use_exclusive_lock_, &accum_update));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &accum_update));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -756,9 +779,11 @@ class ApplyProximalGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -823,9 +848,11 @@ class SparseApplyProximalGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
                 errors::InvalidArgument("var must be at least 1 dimensional"));
 
@@ -965,11 +992,14 @@ class ApplyAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1055,11 +1085,14 @@ class ApplyProximalAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1159,11 +1192,14 @@ class SparseApplyAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1290,11 +1326,14 @@ class SparseApplyProximalAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1459,15 +1498,17 @@ class ApplyAdagradDAOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
-    Tensor gradient_accum;
     OP_REQUIRES_OK(
-        ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &gradient_accum));
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    Tensor gradient_accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_,
+                                                   &gradient_accum));
     Tensor gradient_squared_accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_,
-                                       &gradient_squared_accum));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &gradient_squared_accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1559,15 +1600,17 @@ class SparseApplyAdagradDAOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
-    Tensor gradient_accum;
     OP_REQUIRES_OK(
-        ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &gradient_accum));
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
+    Tensor gradient_accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_,
+                                                   &gradient_accum));
     Tensor gradient_squared_accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_,
-                                       &gradient_squared_accum));
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_,
+                                                   &gradient_squared_accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1753,14 +1796,18 @@ class ApplyFtrlOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     Tensor linear;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &linear));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &linear));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1864,13 +1911,17 @@ class SparseApplyFtrlOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     Tensor linear;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &linear));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &linear));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2070,12 +2121,15 @@ class ApplyMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2170,12 +2224,15 @@ class SparseApplyMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor accum;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &accum));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2280,17 +2337,22 @@ class ApplyAdamOp : public OpKernel {
  public:
   explicit ApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor m;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &m));
+    OP_REQUIRES_OK(ctx,
+                   GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &m));
     Tensor v;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &v));
+    OP_REQUIRES_OK(ctx,
+                   GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2346,17 +2408,18 @@ class ApplyAdamOp : public OpKernel {
                                 grad.shape().DebugString()));
 
     const Device& device = ctx->template eigen_device<Device>();
-    functor::ApplyAdam<Device, T>()(device, var.flat<T>(), m.flat<T>(),
-                                    v.flat<T>(), beta1_power.scalar<T>(),
-                                    beta2_power.scalar<T>(), lr.scalar<T>(),
-                                    beta1.scalar<T>(), beta2.scalar<T>(),
-                                    epsilon.scalar<T>(), grad.flat<T>());
+    functor::ApplyAdam<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), v.flat<T>(),
+        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+        grad.flat<T>(), use_nesterov_);
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
 
  private:
   bool use_exclusive_lock_;
+  bool use_nesterov_;
 };
 
 using CPUDevice = Eigen::ThreadPoolDevice;
@@ -2400,7 +2463,7 @@ namespace functor {
       typename TTypes<T>::ConstScalar beta1,                  \
       typename TTypes<T>::ConstScalar beta2,                  \
       typename TTypes<T>::ConstScalar epsilon,                \
-      typename TTypes<T>::ConstFlat grad);                    \
+      typename TTypes<T>::ConstFlat grad, bool use_nesterov); \
   extern template struct ApplyAdam<GPUDevice, T>;
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -2423,14 +2486,18 @@ class ApplyRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor ms;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &ms));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &ms));
     Tensor mom;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &mom));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -2501,17 +2568,21 @@ class ApplyCenteredRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2, 3});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor mg;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &mg));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &mg));
     Tensor ms;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &ms));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &ms));
     Tensor mom;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 3, use_exclusive_lock_, &mom));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 3, use_exclusive_lock_, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -2658,14 +2729,18 @@ class SparseApplyRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor ms;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &ms));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &ms));
     Tensor mom;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &mom));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -2783,17 +2858,21 @@ class SparseApplyCenteredRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockMutexesInOrder(ctx, use_exclusive_lock_, {0, 1, 2, 3});
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2, 3});
 
     Tensor var;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 0, use_exclusive_lock_, &var));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 0, use_exclusive_lock_, &var));
     Tensor mg;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 1, use_exclusive_lock_, &mg));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 1, use_exclusive_lock_, &mg));
     Tensor ms;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 2, use_exclusive_lock_, &ms));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 2, use_exclusive_lock_, &ms));
     Tensor mom;
-    OP_REQUIRES_OK(ctx, GetInputTensor(ctx, 3, use_exclusive_lock_, &mom));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable(ctx, 3, use_exclusive_lock_, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 6b599656ce129eacf17fc69ad2806d7cc5ec8dea..0a3c5d361ed688fb5ca5723344ea957b0383b1df 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -34,6 +34,15 @@ struct ApplyGradientDescent {
                   typename TTypes<T>::ConstFlat delta);
 };
 
+template <typename Device, typename T>
+struct ApplyDelayCompensatedGradientDescent {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::ConstScalar alpha,
+                  typename TTypes<T>::ConstFlat delta,
+                  typename TTypes<T>::ConstScalar lambda,
+                  typename TTypes<T>::Flat shadow);
+};
+
 template <typename Device, typename T>
 struct ApplyAdadelta {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -123,7 +132,7 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad);
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov);
 };
 
 template <typename Device, typename T>
@@ -148,7 +157,6 @@ struct ApplyCenteredRMSProp {
                   typename TTypes<T>::ConstScalar epsilon,
                   typename TTypes<T>::ConstFlat grad);
 };
-
 }  // end namespace functor
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index f6acdf2422c434432b66086b4fa50a9c8803fe94..3678b96e98f49994089487a833c9a0b4d662041e 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -109,7 +109,7 @@ struct ApplyAdam<GPUDevice, T> {
                   typename TTypes<T>::ConstScalar beta1,
                   typename TTypes<T>::ConstScalar beta2,
                   typename TTypes<T>::ConstScalar epsilon,
-                  typename TTypes<T>::ConstFlat grad) {
+                  typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
     Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
     bcast[0] = grad.dimension(0);
     Eigen::Sizes<1> single;
@@ -122,11 +122,25 @@ struct ApplyAdam<GPUDevice, T> {
         v +
         (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
             (grad.square() - v);
-    var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
-                      (beta1_power.constant(one) - beta1_power))
-                         .reshape(single)
-                         .broadcast(bcast) *
-                     m / (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
+
+    if (use_nesterov) {
+      var.device(d) -=
+          (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+           (beta1_power.constant(one) - beta1_power))
+              .reshape(single)
+              .broadcast(bcast) *
+          (m * beta1.reshape(single).broadcast(bcast) +
+           (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+               grad) /
+          (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
+    } else {
+      var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+                        (beta1_power.constant(one) - beta1_power))
+                           .reshape(single)
+                           .broadcast(bcast) *
+                       m /
+                       (epsilon.reshape(single).broadcast(bcast) + v.sqrt());
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index fb2ceb4a4a821295ae8a120758884029c66a68e7..bab8e1ee1213f8fa1e4365f83958900157ab3d66 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -89,7 +89,7 @@ REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
                             .HostMemory("x")
                             .HostMemory("y"),
                         InvertPermutationOp);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 // output = TransposeOp(T<any> input, T<int32> perm) takes a tensor
 // of type T and rank N, and a permutation of 0, 1, ..., N-1. It
@@ -115,11 +115,6 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
                                       perm.shape().DebugString()));
   auto Vperm = perm.vec<int32>();
   const int dims = input.dims();
-  static const int kMinDims = 0;
-  static const int kMaxDims = 10;
-  OP_REQUIRES(ctx, kMinDims <= dims && dims <= kMaxDims,
-              errors::Unimplemented("Transposing a tensor of rank ", dims,
-                                    " is not implemented."));
   OP_REQUIRES(ctx, dims == Vperm.size(),
               errors::InvalidArgument(
                   "transpose expects a vector of size ", input.dims(),
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index f5d4fcec84ce55a80ed3214fe797f451725932cd..d50e2060acfe64ea09980d0b27639cd6daeb421d 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <unordered_map>
 #include <utility>
 
 #include "tensorflow/core/framework/op_kernel.h"
@@ -21,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 
 namespace tensorflow {
 
@@ -50,8 +50,7 @@ class UniqueOp : public OpKernel {
                                 {0}, 1, input.shape(), &idx));
     auto idx_vec = idx->template vec<int32>();
 
-    std::unordered_map<T, int32> uniq;
-    uniq.reserve(2 * N);
+    gtl::FlatMap<T, int32> uniq(N);
     for (int64 i = 0, j = 0; i < N; ++i) {
       auto it = uniq.insert(std::make_pair(Tin(i), j));
       idx_vec(i) = it.first->second;
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 8c173a4ba30cdb1f1c50d2737f64a8c2a115811e..f0b5796d04a74c6fd39bca64cac2e603a19d46ba 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -76,6 +76,18 @@ class VariableOp : public OpKernel {
     // As long as the resource manager hasn't been cleared the ref we return
     // here is valid because it owns a ref on var.
     ctx->set_output_ref(0, var->mu(), var->tensor());
+    if (ctx->track_allocations() && var->tensor()->IsInitialized()) {
+      AllocatorAttributes attr;
+      attr.set_gpu_compatible(true);
+      attr.set_nic_compatible(true);
+      if (ctx->allocate_on_host(attr)) {
+        ctx->record_host_persistent_memory_allocation(
+            var->tensor()->AllocatedBytes());
+      } else {
+        ctx->record_device_persistent_memory_allocation(
+            var->tensor()->AllocatedBytes());
+      }
+    }
     var->Unref();
   }
 
@@ -115,6 +127,16 @@ class TemporaryVariableOp : public OpKernel {
     OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
                                        var_name_, tmp_var));
     context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
+    if (context->track_allocations()) {
+      AllocatorAttributes attr;
+      if (context->allocate_on_host(attr)) {
+        context->record_host_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      } else {
+        context->record_device_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      }
+    }
   }
 
  private:
diff --git a/tensorflow/core/kernels/warn_about_ints.cc b/tensorflow/core/kernels/warn_about_ints.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd0a889c99df47454a5eff1acd646b070d3a4280
--- /dev/null
+++ b/tensorflow/core/kernels/warn_about_ints.cc
@@ -0,0 +1,32 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/warn_about_ints.h"
+
+namespace tensorflow {
+
+void WarnAboutInts(OpKernelConstruction* context) {
+  DataType dtype;
+  OP_REQUIRES_OK(context, context->GetAttr("T", &dtype));
+  if (DataTypeIsInteger(dtype)) {
+    LOG(WARNING) << "Op " << context->def().name() << " of type "
+                 << context->def().op() << " used with integer dtype "
+                 << DataTypeString(dtype)
+                 << ".  This op was registered with integer support "
+                 << "accidentally, and you won't like the result.";
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/mkl_optimizer_merge.h b/tensorflow/core/kernels/warn_about_ints.h
similarity index 59%
rename from tensorflow/core/graph/mkl_optimizer_merge.h
rename to tensorflow/core/kernels/warn_about_ints.h
index b2caec58aff311c80f1454ae6d9ba7b50732ee7d..20666b230ece61074af576a6f654a658c593a2a8 100644
--- a/tensorflow/core/graph/mkl_optimizer_merge.h
+++ b/tensorflow/core/kernels/warn_about_ints.h
@@ -13,24 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// An optimization pass that performs node merging and rewrite on graph nodes
+#ifndef TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
+#define TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
 
-#ifndef TENSORFLOW_GRAPH_MKL_OPTIMIZER_MERGE_H_
-#define TENSORFLOW_GRAPH_MKL_OPTIMIZER_MERGE_H_
+#include "tensorflow/core/framework/op_kernel.h"
 
-#ifdef INTEL_MKL
+namespace tensorflow {
 
-#include <sys/types.h>
-#include <memory>
-#include "tensorflow/core/graph/graph.h"
+// Warn if a kernel is being created using ints
+// TODO(irving): Remove in TF 2.0 along with the bad op registrations.
+void WarnAboutInts(OpKernelConstruction* context);
 
-namespace tensorflow {
-// Interface to invoke the pass for unit test
-//
-// Returns true if and only if 'g' is mutated.
-extern bool OptimizeNodeMerge(std::unique_ptr<Graph>* g);
 }  // namespace tensorflow
 
-#endif  // INTEL_MKL
-
-#endif  // TENSORFLOW_GRAPH_MKL_OPTIMIZER_MERGE_H_
+#endif  // TENSORFLOW_KERNELS_WARN_ABOUT_INTS_H_
diff --git a/tensorflow/core/kernels/whole_file_read_ops.cc b/tensorflow/core/kernels/whole_file_read_ops.cc
index 148ede24c335c5cff9305df008174a1b949caeac..8f42bb28324ecc3f7d82ebbbcb3aebaff6b2c47a 100644
--- a/tensorflow/core/kernels/whole_file_read_ops.cc
+++ b/tensorflow/core/kernels/whole_file_read_ops.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/core/framework/reader_op_kernel.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/lib/io/random_inputstream.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -29,20 +31,11 @@ namespace tensorflow {
 
 static Status ReadEntireFile(Env* env, const string& filename,
                              string* contents) {
-  uint64 file_size = 0;
-  TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));
-  contents->resize(file_size);
   std::unique_ptr<RandomAccessFile> file;
   TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));
-  StringPiece data;
-  TF_RETURN_IF_ERROR(file->Read(0, file_size, &data, &(*contents)[0]));
-  if (data.size() != file_size) {
-    return errors::DataLoss("Truncated read of '", filename, "' expected ",
-                            file_size, " got ", data.size());
-  }
-  if (data.data() != &(*contents)[0]) {
-    memmove(&(*contents)[0], data.data(), data.size());
-  }
+  io::RandomAccessInputStream input_stream(file.get());
+  io::BufferedInputStream in(&input_stream, 1 << 20);
+  TF_RETURN_IF_ERROR(in.ReadAll(contents));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/kernels/window_dataset.cc b/tensorflow/core/kernels/window_dataset.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8396d9b700c4eceea8ba05953bb2c6397854c43b
--- /dev/null
+++ b/tensorflow/core/kernels/window_dataset.cc
@@ -0,0 +1,80 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/window_dataset.h"
+
+namespace tensorflow {
+namespace {
+
+class WindowDataset : public DatasetBase {
+ public:
+  WindowDataset(std::vector<std::vector<Tensor>> elements,
+                DataTypeVector output_types,
+                std::vector<PartialTensorShape> output_shapes)
+      : elements_(std::move(elements)),
+        output_types_(std::move(output_types)),
+        output_shapes_(std::move(output_shapes)) {}
+
+  std::unique_ptr<IteratorBase> MakeIterator() const override {
+    return std::unique_ptr<IteratorBase>(new Iterator(this));
+  }
+
+  const DataTypeVector& output_dtypes() const override { return output_types_; }
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return output_shapes_;
+  }
+
+  string DebugString() override { return "WindowDataset"; }
+
+ private:
+  class Iterator : public DatasetIterator<WindowDataset> {
+   public:
+    explicit Iterator(const WindowDataset* dataset)
+        : DatasetIterator<WindowDataset>(dataset) {}
+
+    Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                   bool* end_of_sequence) override {
+      mutex_lock l(mu_);
+      if (i_ == dataset()->elements_.size()) {
+        *end_of_sequence = true;
+      } else {
+        *end_of_sequence = false;
+        *out_tensors = dataset()->elements_[i_++];
+      }
+      return Status::OK();
+    }
+
+    mutex mu_;
+    size_t i_ GUARDED_BY(mu_) = 0;
+  };
+
+  const std::vector<std::vector<Tensor>> elements_;
+  const DataTypeVector output_types_;
+  const std::vector<PartialTensorShape> output_shapes_;
+};
+
+}  // namespace
+
+Status NewWindowDataset(std::vector<std::vector<Tensor>> elements,
+                        DataTypeVector output_types,
+                        std::vector<PartialTensorShape> output_shapes,
+                        DatasetBase** out_dataset) {
+  // TODO(mrry): If this becomes more public, we must validate that
+  // the elements match the output_types and output_shapes.
+  *out_dataset = new WindowDataset(std::move(elements), std::move(output_types),
+                                   std::move(output_shapes));
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/window_dataset.h b/tensorflow/core/kernels/window_dataset.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4fccf17b4c7cc064c1aec57554bb88bb7b59578
--- /dev/null
+++ b/tensorflow/core/kernels/window_dataset.h
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/dataset.h"
+
+namespace tensorflow {
+
+// Creates a dataset representing an eagerly-collected window of elements.
+//
+// The `elements` argument defines the elements of the resulting
+// dataset, which is stored in `out_dataset`.
+//
+// This dataset is constructed internally for use in datasets that
+// build nested dataset expressions (e.g. the reducer function for
+// GroupByBatchDataset). It efficiently supports multiple iterators on
+// the same window without recomputation.
+//
+// REQUIRES: `output_types` must match the types of the respective
+// element components in `elements`.
+// REQUIRES: `output_shapes` must be compatible with the shapes of the
+// respective element components in `elements`.a
+Status NewWindowDataset(std::vector<std::vector<Tensor>> elements,
+                        DataTypeVector output_types,
+                        std::vector<PartialTensorShape> output_shapes,
+                        DatasetBase** out_dataset);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_WINDOW_DATASET_H_
diff --git a/tensorflow/core/kernels/xsmm_conv2d.cc b/tensorflow/core/kernels/xsmm_conv2d.cc
index d3a29c2e3eaf82fc9d376554f34bf7432f854e96..7936cbcd46f071228d682771969f167f1709cbb6 100644
--- a/tensorflow/core/kernels/xsmm_conv2d.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d.cc
@@ -35,9 +35,9 @@ void dummy_xsmm_conv2d_ensure_file_is_not_empty(void);
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
+#include "libxsmm_main.h"  // TODO(bsteiner): API to avoid incl. header from src/
 #include "include/libxsmm_cpuid.h"
 #include "include/libxsmm_malloc.h"
-#include "libxsmm_main.h" // TODO: API to avoid incl. header from src/
 
 namespace tensorflow {
 
@@ -72,7 +72,6 @@ bool CanUseXsmmConv2D(const libxsmm_dnn_conv_desc& desc,
   return true;
 }
 
-
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 namespace functor {
@@ -83,25 +82,34 @@ static void chk_libxsmm_err(libxsmm_dnn_err_t status, string msg) {
   }
 }
 
-LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float *kcrs, int R, int S, int C, int K,int blocksifm, int blocksofm, int ifmblock,int ofmblock, int start, int end)
-{
-  LIBXSMM_VLA_DECL(4, const      float, input, rsck, S, C,K);
-  LIBXSMM_VLA_DECL(6, float, output, kcrs, blocksifm,R,S,ifmblock, ofmblock);
-  int r, s, k,c, v1,v2;
-  
-  for (k = start; k < end ; k++ ) { 
-    for(c = 0; c < blocksifm;c++){
-      for ( r = 0; r < R; r++ ) {
-        for ( s = 0; s < S; s++ ){
-          for ( v1 = c*ifmblock; v1 < std::min(C,(c+1)*ifmblock) ; v1++ ) {
-            for ( v2 = k*ofmblock; v2 < std::min(K, (k+1)*ofmblock); v2++ )
-              LIBXSMM_VLA_ACCESS(6,  output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = LIBXSMM_VLA_ACCESS(4, input, r, s, v1, v2,  S, C, K);
-            for ( v2 = K; v2 < (k+1)*ofmblock ; v2++ )
-              LIBXSMM_VLA_ACCESS(6,  output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = 0.0f; 
-            }
-          for ( v1 = C; v1 < (c+1)*ifmblock ; v1++ ) {
-            for ( v2 = k*ofmblock; v2 < (k+1)*ofmblock; v2++ )
-              LIBXSMM_VLA_ACCESS(6,  output, k,c, r, s,v1- c*ifmblock,v2-k*ofmblock, blocksifm, R, S,ifmblock,ofmblock) = 0.0f;
+LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float* kcrs, int R,
+                                        int S, int C, int K, int blocksifm,
+                                        int blocksofm, int ifmblock,
+                                        int ofmblock, int start, int end) {
+  LIBXSMM_VLA_DECL(4, const float, input, rsck, S, C, K);
+  LIBXSMM_VLA_DECL(6, float, output, kcrs, blocksifm, R, S, ifmblock, ofmblock);
+  int r, s, k, c, v1, v2;
+
+  for (k = start; k < end; k++) {
+    for (c = 0; c < blocksifm; c++) {
+      for (r = 0; r < R; r++) {
+        for (s = 0; s < S; s++) {
+          for (v1 = c * ifmblock; v1 < std::min(C, (c + 1) * ifmblock); v1++) {
+            for (v2 = k * ofmblock; v2 < std::min(K, (k + 1) * ofmblock); v2++)
+              LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
+                                 v2 - k * ofmblock, blocksifm, R, S, ifmblock,
+                                 ofmblock) =
+                  LIBXSMM_VLA_ACCESS(4, input, r, s, v1, v2, S, C, K);
+            for (v2 = K; v2 < (k + 1) * ofmblock; v2++)
+              LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
+                                 v2 - k * ofmblock, blocksifm, R, S, ifmblock,
+                                 ofmblock) = 0.0f;
+          }
+          for (v1 = C; v1 < (c + 1) * ifmblock; v1++) {
+            for (v2 = k * ofmblock; v2 < (k + 1) * ofmblock; v2++)
+              LIBXSMM_VLA_ACCESS(6, output, k, c, r, s, v1 - c * ifmblock,
+                                 v2 - k * ofmblock, blocksifm, R, S, ifmblock,
+                                 ofmblock) = 0.0f;
           }
         }
       }
@@ -109,47 +117,28 @@ LIBXSMM_INLINE void copy_RSCK_to_custom(const float* rsck, float *kcrs, int R, i
   }
 }
 
- 
+class libxsmm_dnn_conv_desc_wrap {
+ public:
+  const libxsmm_dnn_conv_desc d;
 
-class libxsmm_dnn_conv_desc_wrap{
-  public:
-    const libxsmm_dnn_conv_desc d;
- 
-    libxsmm_dnn_conv_desc_wrap(const libxsmm_dnn_conv_desc &d_) : d(d_){
-    }
-    bool operator==(const libxsmm_dnn_conv_desc_wrap  &w) const{
-      return( d.N == w.d.N &&
-              d.C == w.d.C &&
-              d.H == w.d.H &&
-              d.W == w.d.W &&
-              d.K == w.d.K &&
-              d.R == w.d.R &&
-              d.S == w.d.S &&
-              d.u == w.d.u &&
-              d.v == w.d.v &&
-              d.pad_h == w.d.pad_h &&
-              d.pad_w == w.d.pad_w
-            );
-    }
+  libxsmm_dnn_conv_desc_wrap(const libxsmm_dnn_conv_desc& d_) : d(d_) {}
+  bool operator==(const libxsmm_dnn_conv_desc_wrap& w) const {
+    return (d.N == w.d.N && d.C == w.d.C && d.H == w.d.H && d.W == w.d.W &&
+            d.K == w.d.K && d.R == w.d.R && d.S == w.d.S && d.u == w.d.u &&
+            d.v == w.d.v && d.pad_h == w.d.pad_h && d.pad_w == w.d.pad_w);
+  }
 };
- 
- 
-struct HashFunction{
-  std::size_t operator()(const libxsmm_dnn_conv_desc_wrap & w) const{
-   
-    
 
+struct HashFunction {
+  std::size_t operator()(const libxsmm_dnn_conv_desc_wrap& w) const {
+    // unsigned char ptr[sizeof(&w.d)];
 
-    //unsigned char ptr[sizeof(&w.d)];
-
-    
-    //memcpy(ptr, (unsigned char *)&w.d, sizeof(&w.d))
-                                       
+    // memcpy(ptr, (unsigned char *)&w.d, sizeof(&w.d))
 
     //
     /*
     std::ostringstream N,C,H,W,K,R,S,u,v,padh,padw;
- 
+
     N << w.d.N; C << w.d.C;
     H << w.d.H; W << w.d.W;
     K << w.d.K; R << w.d.R;
@@ -167,47 +156,53 @@ struct HashFunction{
     //
     //
     */
-    return ( std::hash<unsigned long long>()((unsigned long long)&(w.d)));
+    return (std::hash<unsigned long long>()((unsigned long long)&(w.d)));
   }
 };
- 
-class handles{
-  public:
-    libxsmm_dnn_layer* find( const libxsmm_dnn_conv_desc_wrap &w) {
-      std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i = libxsmm_handles.find(w);
-      if (i == libxsmm_handles.end()){
-        libxsmm_dnn_err_t status;
-        libxsmm_dnn_layer* libxsmm_handle = libxsmm_dnn_create_conv_layer(w.d, &status);
-        chk_libxsmm_err(status, "Create handle");
-        libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
-        return libxsmm_handle;
-      }
-      else
-        return i->second;
+
+class handles {
+ public:
+  libxsmm_dnn_layer* find(const libxsmm_dnn_conv_desc_wrap& w) {
+    std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
+                       HashFunction>::iterator i = libxsmm_handles.find(w);
+    if (i == libxsmm_handles.end()) {
+      libxsmm_dnn_err_t status;
+      libxsmm_dnn_layer* libxsmm_handle =
+          libxsmm_dnn_create_conv_layer(w.d, &status);
+      chk_libxsmm_err(status, "Create handle");
+      libxsmm_handles.insert(std::make_pair(w, libxsmm_handle));
+      return libxsmm_handle;
+    } else {
+      return i->second;
     }
-   ~handles(){
-    std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction>::iterator i;
-    for (i= libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
+  }
+  ~handles() {
+    std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
+                       HashFunction>::iterator i;
+    for (i = libxsmm_handles.begin(); i != libxsmm_handles.end(); i++)
       chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(i->second),
-                    "Destroy handle");
-    }
-  private:
- 
-    std::unordered_map<libxsmm_dnn_conv_desc_wrap , libxsmm_dnn_layer*, HashFunction> libxsmm_handles;
- 
+                      "Destroy handle");
+  }
+
+ private:
+  std::unordered_map<libxsmm_dnn_conv_desc_wrap, libxsmm_dnn_layer*,
+                     HashFunction>
+      libxsmm_handles;
 };
 
 static handles libxsmm_handles;
 
-//#define LIBXSMM_DETAILED_TIMING
+// #define LIBXSMM_DETAILED_TIMING
 
 template <typename InputPtr, typename FilterPtr, typename OutputPtr>
 static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
                                    const libxsmm_dnn_conv_desc& desc,
-                                   libxsmm_dnn_compute_kind kind, InputPtr input,
-                                   FilterPtr filter, OutputPtr output) {
+                                   libxsmm_dnn_compute_kind kind,
+                                   InputPtr input, FilterPtr filter,
+                                   OutputPtr output) {
 #if defined(LIBXSMM_DETAILED_TIMING)
-  unsigned long long l_tick1, l_tick2, l_tick3, l_tick4, l_tick5, l_tick6, l_tick7, l_tick8, l_tick9, l_tick10;
+  unsigned long long l_tick1, l_tick2, l_tick3, l_tick4, l_tick5, l_tick6,
+      l_tick7, l_tick8, l_tick9, l_tick10;
   l_tick1 = libxsmm_timer_tick();
 #endif
   // setup scoped allocator, which adopts the allocator from the context
@@ -216,14 +211,14 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   libxsmm_dnn_layer* libxsmm_handle;
   libxsmm_dnn_conv_desc_wrap w(desc);
   void* scratch;
- 
-  //if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
+
+  // if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD)
   libxsmm_handle = libxsmm_handles.find(w);
-  //else{
+  // else{
   //  libxsmm_handle = libxsmm_dnn_create_conv_layer(desc, &status);
   //  chk_libxsmm_err(status, "Create handle");
   //}
-  
+
   status = libxsmm_dnn_get_codegen_success(libxsmm_handle, kind);
   if (status == LIBXSMM_DNN_WARN_FALLBACK) {
     chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
@@ -241,12 +236,16 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 #endif
 
   int ifmblock = (libxsmm_handle->ifmblock);
-  int ofmblock = (libxsmm_handle->ofmblock); 
+  int ofmblock = (libxsmm_handle->ofmblock);
 
-  int blocksifm = desc.C%ifmblock ==0 ? desc.C/ifmblock :desc.C/ifmblock + 1;           
-  int blocksofm = desc.K%ofmblock ==0 ? desc.K/ofmblock :desc.K/ofmblock + 1;
-  float *native_filter = (float*)libxsmm_aligned_scratch( blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float), 2097152);
- 
+  int blocksifm =
+      desc.C % ifmblock == 0 ? desc.C / ifmblock : desc.C / ifmblock + 1;
+  int blocksofm =
+      desc.K % ofmblock == 0 ? desc.K / ofmblock : desc.K / ofmblock + 1;
+  float* native_filter =
+      (float*)libxsmm_aligned_scratch(blocksofm * blocksifm * desc.R * desc.S *
+                                          ifmblock * ofmblock * sizeof(float),
+                                      2097152);
 
   const DeviceBase::CpuWorkerThreads* worker_threads =
       ctx->device()->tensorflow_cpu_worker_threads();
@@ -254,90 +253,111 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   int num_threads = worker_threads->num_threads;
 
 #if 1
-  if(kind ==  LIBXSMM_DNN_COMPUTE_KIND_FWD || kind ==  LIBXSMM_DNN_COMPUTE_KIND_BWD){
-    if(blocksofm > num_threads){
+  if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD ||
+      kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+    if (blocksofm > num_threads) {
       int work = blocksofm;
       BlockingCounter count(num_threads);
       for (int i = 0; i < num_threads; ++i) {
         worker_threads->workers->Schedule([=, &count]() {
-        int start = work/num_threads*i;
-        int end =  (start + work/num_threads) > work ? work: start + work/num_threads;
-        copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S,desc.C, desc.K,blocksifm,blocksofm,ifmblock,ofmblock,start, end);
-        count.DecrementCount();
+          int start = work / num_threads * i;
+          int end = (start + work / num_threads) > work
+                        ? work
+                        : start + work / num_threads;
+          copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S, desc.C,
+                              desc.K, blocksifm, blocksofm, ifmblock, ofmblock,
+                              start, end);
+          count.DecrementCount();
         });
       }
       count.Wait();
-    }
-    else{
- 
+    } else {
       int work = blocksofm;
       int num_threads = work;
- 
+
       BlockingCounter count(num_threads);
       for (int i = 0; i < num_threads; ++i) {
         worker_threads->workers->Schedule([=, &count]() {
-        int start = i;
-        int end =  i+1;
-        copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S,desc.C, desc.K,blocksifm,blocksofm,ifmblock,ofmblock, start, end);
-        count.DecrementCount();
+          int start = i;
+          int end = i + 1;
+          copy_RSCK_to_custom(filter, native_filter, desc.R, desc.S, desc.C,
+                              desc.K, blocksifm, blocksofm, ifmblock, ofmblock,
+                              start, end);
+          count.DecrementCount();
         });
       }
       count.Wait();
     }
-  }
-  //Added: for weight update
-  else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD){
-    libxsmm_filter = libxsmm_dnn_link_filter(libxsmm_handle, LIBXSMM_DNN_FILTER, filter, LIBXSMM_DNN_TENSOR_FORMAT_RSCK_PTR, &status);
-    chk_libxsmm_err(status, "Link filter");//weight update is in RSCK as filter should be returned in RSCK format
+  } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
+    // Added: for weight update
+    libxsmm_filter =
+        libxsmm_dnn_link_filter(libxsmm_handle, LIBXSMM_DNN_FILTER, filter,
+                                LIBXSMM_DNN_TENSOR_FORMAT_RSCK_PTR, &status);
+    chk_libxsmm_err(status,
+                    "Link filter");  // weight update is in RSCK as
+                                     // filter should be returned in RSCK
+                                     // format
   }
 #else
-  memset( native_filter, 0, blocksofm*blocksifm*desc.R*desc.S*ifmblock*ofmblock*sizeof(float));
+  memset(native_filter, 0,
+         blocksofm * blocksifm * desc.R * desc.S * ifmblock * ofmblock *
+             sizeof(float));
 #endif
 
 #if defined(LIBXSMM_DETAILED_TIMING)
   l_tick3 = libxsmm_timer_tick();
 #endif
 
-  libxsmm_input = libxsmm_dnn_link_buffer(
-      libxsmm_handle, LIBXSMM_DNN_INPUT, input, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
+  libxsmm_input =
+      libxsmm_dnn_link_buffer(libxsmm_handle, LIBXSMM_DNN_INPUT, input,
+                              LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
   chk_libxsmm_err(status, "Link input buffer");
-  libxsmm_output = libxsmm_dnn_link_buffer(
-      libxsmm_handle, LIBXSMM_DNN_OUTPUT, output, LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
+  libxsmm_output =
+      libxsmm_dnn_link_buffer(libxsmm_handle, LIBXSMM_DNN_OUTPUT, output,
+                              LIBXSMM_DNN_TENSOR_FORMAT_NHWC_PTR, &status);
   chk_libxsmm_err(status, "Link output buffer");
-  if(kind == LIBXSMM_DNN_COMPUTE_KIND_FWD || kind == LIBXSMM_DNN_COMPUTE_KIND_BWD){
-  libxsmm_filter = libxsmm_dnn_link_filter(
-      libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter, LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
-  chk_libxsmm_err(status, "Link filter");
+  if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD ||
+      kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
+    libxsmm_filter = libxsmm_dnn_link_filter(
+        libxsmm_handle, LIBXSMM_DNN_FILTER, native_filter,
+        LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM_PTR, &status);
+    chk_libxsmm_err(status, "Link filter");
   }
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
     chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_output), "Zero output");
 
-    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT),
+    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
+                                            LIBXSMM_DNN_REGULAR_INPUT),
                     "Bind input forward");
-    chk_libxsmm_err(
-        libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT),
-        "Bind output forward");
-    chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER),
+    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
+                                            LIBXSMM_DNN_REGULAR_OUTPUT),
+                    "Bind output forward");
+    chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
+                                            LIBXSMM_DNN_REGULAR_FILTER),
                     "Bind filter forward");
   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
     chk_libxsmm_err(libxsmm_dnn_zero_buffer(libxsmm_input), "Zero input");
 
-    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_GRADIENT_INPUT),
+    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
+                                            LIBXSMM_DNN_GRADIENT_INPUT),
                     "Bind input backward");
-    chk_libxsmm_err(
-        libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_GRADIENT_OUTPUT),
-        "Bind output backward");
-    chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER),
+    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
+                                            LIBXSMM_DNN_GRADIENT_OUTPUT),
+                    "Bind output backward");
+    chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
+                                            LIBXSMM_DNN_REGULAR_FILTER),
                     "Bind filter backward");
   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
     chk_libxsmm_err(libxsmm_dnn_zero_filter(libxsmm_filter), "Zero filter");
 
-    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT),
-                    "Bind input weight udpate");
-    chk_libxsmm_err(
-        libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output, LIBXSMM_DNN_GRADIENT_OUTPUT),
-        "Bind output weight update");
-    chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_GRADIENT_FILTER),
+    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_input,
+                                            LIBXSMM_DNN_REGULAR_INPUT),
+                    "Bind input weight update");
+    chk_libxsmm_err(libxsmm_dnn_bind_buffer(libxsmm_handle, libxsmm_output,
+                                            LIBXSMM_DNN_GRADIENT_OUTPUT),
+                    "Bind output weight update");
+    chk_libxsmm_err(libxsmm_dnn_bind_filter(libxsmm_handle, libxsmm_filter,
+                                            LIBXSMM_DNN_GRADIENT_FILTER),
                     "Bind filter weight update");
   } else {
     /* shouldn't happen */
@@ -348,9 +368,14 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 #endif
 
   /* bind scratch */
-  scratch = (void*)libxsmm_aligned_scratch( libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status ), 2097152);
-  chk_libxsmm_err( status, "scratch allocation" );
-  chk_libxsmm_err( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ), "binding scratch" );
+  scratch = (void*)libxsmm_aligned_scratch(
+      libxsmm_dnn_get_scratch_size(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL,
+                                   &status),
+      2097152);
+  chk_libxsmm_err(status, "scratch allocation");
+  chk_libxsmm_err(libxsmm_dnn_bind_scratch(
+                      libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch),
+                  "binding scratch");
 
 #if defined(LIBXSMM_DETAILED_TIMING)
   l_tick5 = libxsmm_timer_tick();
@@ -366,7 +391,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 
 #if 1
   BlockingCounter counter(num_threads);
-  
+
   for (int i = 0; i < num_threads; ++i) {
     worker_threads->workers->Schedule([=, &counter]() {
       chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, i),
@@ -376,9 +401,11 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
   }
   counter.Wait();
 #else
-  #pragma omp parallel
+#pragma omp parallel
   {
-    chk_libxsmm_err(libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, omp_get_thread_num()), "Worker");
+    chk_libxsmm_err(
+        libxsmm_dnn_execute_st(libxsmm_handle, kind, 0, omp_get_thread_num()),
+        "Worker");
   }
 #endif
 
@@ -387,7 +414,7 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 #endif
 
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
-    libxsmm_dnn_reduce_wu_filters( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER );
+    libxsmm_dnn_reduce_wu_filters(libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER);
   }
 
 #if defined(LIBXSMM_DETAILED_TIMING)
@@ -395,19 +422,39 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 #endif
 
   /* clean up */
-  chk_libxsmm_err( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ), "release scratch" );
+  chk_libxsmm_err(
+      libxsmm_dnn_release_scratch(libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL),
+      "release scratch");
   if (kind == LIBXSMM_DNN_COMPUTE_KIND_FWD) {
-    chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ), "release input" );
-    chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ), "release output" );
-    chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ), "release filter" );
+    chk_libxsmm_err(
+        libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT),
+        "release input");
+    chk_libxsmm_err(
+        libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT),
+        "release output");
+    chk_libxsmm_err(
+        libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER),
+        "release filter");
   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_BWD) {
-    chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ), "release input" );
-    chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ), "release output" );
-    chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ), "release filter" );
+    chk_libxsmm_err(
+        libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT),
+        "release input");
+    chk_libxsmm_err(
+        libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT),
+        "release output");
+    chk_libxsmm_err(
+        libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER),
+        "release filter");
   } else if (kind == LIBXSMM_DNN_COMPUTE_KIND_UPD) {
-    chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ), "release input" );
-    chk_libxsmm_err( libxsmm_dnn_release_buffer( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ), "release output" );
-    chk_libxsmm_err( libxsmm_dnn_release_filter( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ), "release filter" );
+    chk_libxsmm_err(
+        libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT),
+        "release input");
+    chk_libxsmm_err(
+        libxsmm_dnn_release_buffer(libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT),
+        "release output");
+    chk_libxsmm_err(
+        libxsmm_dnn_release_filter(libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER),
+        "release filter");
   } else {
     /* shouldn't happen */
   }
@@ -418,9 +465,9 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 #if defined(LIBXSMM_DETAILED_TIMING)
   l_tick9 = libxsmm_timer_tick();
 #endif
-  
-  //if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
-  //chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
+
+  // if(kind != LIBXSMM_DNN_COMPUTE_KIND_FWD)
+  // chk_libxsmm_err(libxsmm_dnn_destroy_conv_layer(libxsmm_handle),
   //               "Destroy handle");
 
   libxsmm_free(native_filter);
@@ -428,17 +475,20 @@ static bool CallLibxsmmConvGeneric(OpKernelContext* ctx,
 
 #if defined(LIBXSMM_DETAILED_TIMING)
   l_tick10 = libxsmm_timer_tick();
-  printf("time for convolution (%i, %i, %i, %i, %i): %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n", desc.N, desc.C, desc.K, desc.R, desc.S, 
-                                                                                      libxsmm_timer_duration(l_tick1, l_tick2),
-                                                                                      libxsmm_timer_duration(l_tick2, l_tick3),
-                                                                                      libxsmm_timer_duration(l_tick3, l_tick4),
-                                                                                      libxsmm_timer_duration(l_tick4, l_tick5),
-                                                                                      libxsmm_timer_duration(l_tick5, l_tick6),
-                                                                                      libxsmm_timer_duration(l_tick6, l_tick7),
-                                                                                      libxsmm_timer_duration(l_tick7, l_tick8),
-                                                                                      libxsmm_timer_duration(l_tick8, l_tick9),
-                                                                                      libxsmm_timer_duration(l_tick9, l_tick10),
-                                                                                      libxsmm_timer_duration(l_tick1, l_tick10)  );
+  printf(
+      "time for convolution (%i, %i, %i, %i, %i): %f, %f, %f, %f, %f, %f, %f, "
+      "%f, %f, %f\n",
+      desc.N, desc.C, desc.K, desc.R, desc.S,
+      libxsmm_timer_duration(l_tick1, l_tick2),
+      libxsmm_timer_duration(l_tick2, l_tick3),
+      libxsmm_timer_duration(l_tick3, l_tick4),
+      libxsmm_timer_duration(l_tick4, l_tick5),
+      libxsmm_timer_duration(l_tick5, l_tick6),
+      libxsmm_timer_duration(l_tick6, l_tick7),
+      libxsmm_timer_duration(l_tick7, l_tick8),
+      libxsmm_timer_duration(l_tick8, l_tick9),
+      libxsmm_timer_duration(l_tick9, l_tick10),
+      libxsmm_timer_duration(l_tick1, l_tick10));
 #endif
 
   return true;  // Succeeded
@@ -448,8 +498,8 @@ template <typename T>
 struct XsmmFwdConv2D<CPUDevice, T> {
   bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                   const T* input, const T* filter, T* output) {
-    return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD, input,
-                                  filter, output);
+    return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_FWD,
+                                  input, filter, output);
   }
 };
 
@@ -457,8 +507,8 @@ template <typename T>
 struct XsmmBkwInputConv2D<CPUDevice, T> {
   bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                   T* input, const T* filter, const T* output) {
-    return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD, input,
-                                  filter, output);
+    return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_BWD,
+                                  input, filter, output);
   }
 };
 
@@ -466,8 +516,8 @@ template <typename T>
 struct XsmmBkwFilterConv2D<CPUDevice, T> {
   bool operator()(OpKernelContext* ctx, const libxsmm_dnn_conv_desc& desc,
                   const T* input, T* filter, const T* output) {
-    return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD, input,
-                                  filter, output);
+    return CallLibxsmmConvGeneric(ctx, desc, LIBXSMM_DNN_COMPUTE_KIND_UPD,
+                                  input, filter, output);
   }
 };
 
diff --git a/tensorflow/core/kernels/zip_dataset_op.cc b/tensorflow/core/kernels/zip_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7fc9bc6b1f1378a4af7d63f3fc9f41299d50969
--- /dev/null
+++ b/tensorflow/core/kernels/zip_dataset_op.cc
@@ -0,0 +1,148 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/dataset.h"
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ZipDatasetOp : public OpKernel {
+ public:
+  explicit ZipDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    std::vector<DatasetBase*> inputs;
+    Status s;
+    for (size_t i = 0; i < ctx->num_inputs(); ++i) {
+      // Create a new ZipDatasetOp::Dataset, insert it in the step-local
+      // container, and return it as the output.
+      DatasetBase* input;
+      s.Update(LookupResource(ctx, HandleFromInput(ctx, i), &input));
+      if (!s.ok()) {
+        break;
+      }
+      inputs.push_back(input);
+    }
+
+    if (s.ok()) {
+      DatasetBase* dataset = new Dataset(inputs);
+      Tensor* output = nullptr;
+      s = ctx->allocate_output(0, TensorShape({}), &output);
+      if (s.ok()) {
+        ResourceHandle handle = MakeResourceHandle<DatasetBase>(
+            ctx, ctx->step_container()->name(), name());
+        s = CreateResource(ctx, handle, dataset);
+        if (s.ok()) {
+          output->flat<ResourceHandle>()(0) = handle;
+        }
+      }
+    }
+
+    // TODO(mrry): Implement a container that acts as a
+    // `std::vector<core::ScopedUnref>`, to avoid having to unref the
+    // inputs manually, and re-enable the use of `OP_REQUIRES_OK()`.
+    for (DatasetBase* input : inputs) {
+      input->Unref();
+    }
+    ctx->SetStatus(s);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(const std::vector<DatasetBase*>& inputs)
+        : inputs_(inputs) {
+      for (const auto& input : inputs_) {
+        input->Ref();
+        for (DataType dt : input->output_dtypes()) {
+          output_dtypes_.push_back(dt);
+        }
+        output_shapes_.insert(output_shapes_.end(),
+                              input->output_shapes().begin(),
+                              input->output_shapes().end());
+      }
+    }
+
+    ~Dataset() override {
+      for (const auto& input : inputs_) {
+        input->Unref();
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIterator() const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(this));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_dtypes_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() override { return "ZipDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Dataset* dataset)
+          : DatasetIterator<Dataset>(dataset) {
+        input_impls_.reserve(dataset->inputs_.size());
+        for (const auto& input : dataset->inputs_) {
+          input_impls_.emplace_back(input->MakeIterator());
+        }
+      }
+
+      Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        out_tensors->clear();
+        out_tensors->reserve(dataset()->output_dtypes().size());
+        for (const auto& input_impl : input_impls_) {
+          std::vector<Tensor> input_tensors;
+          TF_RETURN_IF_ERROR(
+              input_impl->GetNext(ctx, &input_tensors, end_of_sequence));
+          if (*end_of_sequence) {
+            return Status::OK();
+          }
+          out_tensors->insert(out_tensors->end(), input_tensors.begin(),
+                              input_tensors.end());
+        }
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::vector<std::unique_ptr<IteratorBase>> input_impls_ GUARDED_BY(mu_);
+    };
+
+    const std::vector<DatasetBase*> inputs_;
+    DataTypeVector output_dtypes_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("ZipDataset").Device(DEVICE_CPU), ZipDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/bmp/testdata/lena.bmp b/tensorflow/core/lib/bmp/testdata/lena.bmp
new file mode 100644
index 0000000000000000000000000000000000000000..8c4882de4a78355fbfe9d5750627116645bc8503
Binary files /dev/null and b/tensorflow/core/lib/bmp/testdata/lena.bmp differ
diff --git a/tensorflow/core/lib/core/bits.h b/tensorflow/core/lib/core/bits.h
index 30ad0c2bea9f8d1c36bd05d3d1c4f9cd25881788..1110ef5c2a4141e58a977a5b8c7fb8c66f44d7fe 100644
--- a/tensorflow/core/lib/core/bits.h
+++ b/tensorflow/core/lib/core/bits.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_CORE_BITS_H_
 #define TENSORFLOW_LIB_CORE_BITS_H_
 
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -91,6 +92,18 @@ inline int Log2Ceiling64(uint64 n) {
     return floor + 1;
 }
 
+inline uint32 NextPowerOfTwo(uint32 value) {
+  int exponent = Log2Ceiling(value);
+  DCHECK_LT(exponent, std::numeric_limits<uint32>::digits);
+  return 1 << exponent;
+}
+
+inline uint64 NextPowerOfTwo64(uint64 value) {
+  int exponent = Log2Ceiling(value);
+  DCHECK_LT(exponent, std::numeric_limits<uint64>::digits);
+  return 1LL << exponent;
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_LIB_CORE_BITS_H_
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 2b10ebeaf7cbed4a8466a69898d6d4d6660ed5cb..c8e514df800550abc07ef8394893d8da09e7ed3d 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -47,7 +47,7 @@ struct EigenEnvironment {
                    const string& name)
       : env_(env), thread_options_(thread_options), name_(name) {}
 
-  EnvThread* CreateThread(std::function<void()> f) {
+  EnvThread* CreateThread(const std::function<void()>& f) {
     return env_->StartThread(thread_options_, name_, [=]() {
       // Set the processor flag to flush denormals to zero.
       port::ScopedFlushDenormal flush;
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index df8887b1c479e6bb70e09826f554e26e5994a2ed..b5c0d9f621dd2e6fa8c5fd64d71f886fcfb3fd1e 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -16,6 +16,7 @@ limitations under the License.
 // Functions to read images in GIF format.
 
 #include "tensorflow/core/lib/gif/gif_io.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/gif.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
@@ -24,9 +25,19 @@ limitations under the License.
 namespace tensorflow {
 namespace gif {
 
+struct InputBufferInfo {
+  const uint8_t* buf;
+  int bytes_left;
+};
+
 int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
-  if (gif_file->UserData && memcpy(buf, gif_file->UserData, size)) {
-    gif_file->UserData = ((uint8_t*)gif_file->UserData) + size;
+  InputBufferInfo* const info =
+      reinterpret_cast<InputBufferInfo*>(gif_file->UserData);
+  if (info != nullptr) {
+    if (size > info->bytes_left) size = info->bytes_left;
+    memcpy(buf, info->buf, size);
+    info->buf += size;
+    info->bytes_left -= size;
     return size;
   }
   return 0;
@@ -35,8 +46,16 @@ int input_callback(GifFileType* gif_file, GifByteType* buf, int size) {
 uint8* Decode(const void* srcdata, int datasize,
               std::function<uint8*(int, int, int, int)> allocate_output) {
   int error_code = D_GIF_SUCCEEDED;
+  InputBufferInfo info = {reinterpret_cast<const uint8*>(srcdata), datasize};
   GifFileType* gif_file =
-      DGifOpen(const_cast<void*>(srcdata), &input_callback, &error_code);
+      DGifOpen(static_cast<void*>(&info), &input_callback, &error_code);
+  const auto cleanup = gtl::MakeCleanup([gif_file]() {
+    int error_code = D_GIF_SUCCEEDED;
+    if (gif_file && DGifCloseFile(gif_file, &error_code) != GIF_OK) {
+      LOG(WARNING) << "Fail to close gif file, reason: "
+                   << GifErrorString(error_code);
+    }
+  });
   if (error_code != D_GIF_SUCCEEDED) {
     LOG(ERROR) << "Fail to open gif file, reason: "
                << GifErrorString(error_code);
@@ -52,12 +71,13 @@ uint8* Decode(const void* srcdata, int datasize,
     return nullptr;
   }
 
-  int num_frames = gif_file->ImageCount;
-  int width = gif_file->SWidth;
-  int height = gif_file->SHeight;
-  int channel = 3;
+  const int num_frames = gif_file->ImageCount;
+  const int width = gif_file->SWidth;
+  const int height = gif_file->SHeight;
+  const int channel = 3;
 
-  uint8* dstdata = allocate_output(num_frames, width, height, channel);
+  uint8* const dstdata = allocate_output(num_frames, width, height, channel);
+  if (!dstdata) return nullptr;
   for (int k = 0; k < num_frames; k++) {
     SavedImage* this_image = &gif_file->SavedImages[k];
     GifImageDesc* img_desc = &this_image->ImageDesc;
@@ -84,10 +104,6 @@ uint8* Decode(const void* srcdata, int datasize,
     }
   }
 
-  if (DGifCloseFile(gif_file, &error_code) != GIF_OK) {
-    LOG(WARNING) << "Fail to close gif file, reason: "
-                 << GifErrorString(error_code);
-  }
   return dstdata;
 }
 
diff --git a/tensorflow/core/lib/gif/testdata/lena.gif b/tensorflow/core/lib/gif/testdata/lena.gif
new file mode 100644
index 0000000000000000000000000000000000000000..12980a3b28af48982e50386f94f9a1c112a10f18
Binary files /dev/null and b/tensorflow/core/lib/gif/testdata/lena.gif differ
diff --git a/tensorflow/core/lib/gtl/inlined_vector_test.cc b/tensorflow/core/lib/gtl/inlined_vector_test.cc
index b957fedc4a04b0709deed5e026c56e5082bf694a..6e3c083f58a7f53e473acd2841a51e0209a2be62 100644
--- a/tensorflow/core/lib/gtl/inlined_vector_test.cc
+++ b/tensorflow/core/lib/gtl/inlined_vector_test.cc
@@ -816,7 +816,7 @@ static void BM_StdVectorFillString(int iters, int len) {
   }
   testing::ItemsProcessed(int64{iters} * len);
   // The purpose of the benchmark is to verify that inlined vector is
-  // efficient when moving is more efficent than copying. To do so, we
+  // efficient when moving is more efficient than copying. To do so, we
   // use strings that are larger than the small string optimization.
   CHECK(!StringRepresentedInline(strings[0]));
 }
diff --git a/tensorflow/core/lib/gtl/optional.h b/tensorflow/core/lib/gtl/optional.h
index f80b5c113d0799536d0930ddda8da0935c22e659..8ba4b091434c74c6062f7ac91aa7902b71a533e2 100644
--- a/tensorflow/core/lib/gtl/optional.h
+++ b/tensorflow/core/lib/gtl/optional.h
@@ -541,7 +541,7 @@ class optional : private internal_optional::optional_data<T>,
   // opt.emplace(arg1,arg2,arg3);  (Constructs Foo(arg1,arg2,arg3))
   //
   // If the optional is non-empty, and the `args` refer to subobjects of the
-  // current object, then behaviour is undefined.  This is because the current
+  // current object, then behavior is undefined.  This is because the current
   // object will be destructed before the new object is constructed with `args`.
   //
   template <typename... Args,
@@ -586,7 +586,7 @@ class optional : private internal_optional::optional_data<T>,
 
   // [optional.observe], observers
   // You may use `*opt`, and `opt->m`, to access the underlying T value and T's
-  // member `m`, respectively.  If the optional is empty, behaviour is
+  // member `m`, respectively.  If the optional is empty, behavior is
   // undefined.
   constexpr const T* operator->() const { return this->pointer(); }
   T* operator->() {
diff --git a/tensorflow/core/lib/gtl/top_n_test.cc b/tensorflow/core/lib/gtl/top_n_test.cc
index fafcd445b87731f722589c9754f850da5da2e964..0f2325e6faf251f949b53dc701c1753aa68b5476 100644
--- a/tensorflow/core/lib/gtl/top_n_test.cc
+++ b/tensorflow/core/lib/gtl/top_n_test.cc
@@ -254,7 +254,7 @@ TEST(TopNTest, Iteration) {
   for (int i = 0; i < 8; ++i) top.push(i);
   std::vector<int> actual(top.unsorted_begin(), top.unsorted_end());
   // Check that we have 4,5,6,7 as the top 4 (in some order, so we sort)
-  sort(actual.begin(), actual.end());
+  std::sort(actual.begin(), actual.end());
   EXPECT_EQ(actual.size(), 4);
   EXPECT_EQ(actual[0], 4);
   EXPECT_EQ(actual[1], 5);
diff --git a/tensorflow/core/lib/io/block_builder.cc b/tensorflow/core/lib/io/block_builder.cc
index 5a87da6c86a50a28c1ef9a49342554b623e7f8f3..b2921c076cc127d6e37ff7ea4fa83ab520308c6d 100644
--- a/tensorflow/core/lib/io/block_builder.cc
+++ b/tensorflow/core/lib/io/block_builder.cc
@@ -70,10 +70,12 @@ size_t BlockBuilder::CurrentSizeEstimate() const {
 
 StringPiece BlockBuilder::Finish() {
   // Append restart array
-  for (size_t i = 0; i < restarts_.size(); i++) {
-    core::PutFixed32(&buffer_, restarts_[i]);
+  CHECK_LE(restarts_.size(), std::numeric_limits<uint32_t>::max());
+  for (const auto r : restarts_) {
+    core::PutFixed32(&buffer_, r);
   }
-  core::PutFixed32(&buffer_, restarts_.size());
+  // Downcast safe because of the CHECK.
+  core::PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size()));
   finished_ = true;
   return StringPiece(buffer_);
 }
@@ -93,19 +95,24 @@ void BlockBuilder::Add(const StringPiece& key, const StringPiece& value) {
     }
   } else {
     // Restart compression
-    restarts_.push_back(buffer_.size());
+    CHECK_LE(buffer_.size(), std::numeric_limits<uint32_t>::max());
+    restarts_.push_back(static_cast<uint32_t>(buffer_.size()));
     counter_ = 0;
   }
   const size_t non_shared = key.size() - shared;
 
+  CHECK_LE(shared, std::numeric_limits<uint32_t>::max());
+  CHECK_LE(non_shared, std::numeric_limits<uint32_t>::max());
+  CHECK_LE(value.size(), std::numeric_limits<uint32_t>::max());
+
   // Add "<shared><non_shared><value_size>" to buffer_
-  core::PutVarint32(&buffer_, shared);
-  core::PutVarint32(&buffer_, non_shared);
-  core::PutVarint32(&buffer_, value.size());
+  core::PutVarint32(&buffer_, static_cast<uint32_t>(shared));
+  core::PutVarint32(&buffer_, static_cast<uint32_t>(non_shared));
+  core::PutVarint32(&buffer_, static_cast<uint32_t>(value.size()));
 
   // Add string delta to buffer_ followed by value
   buffer_.append(key.data() + shared, non_shared);
-  buffer_.append(value.data(), value.size());
+  buffer_.append(value.data(), static_cast<uint32_t>(value.size()));
 
   // Update state
   last_key_.resize(shared);
diff --git a/tensorflow/core/lib/io/buffered_inputstream.cc b/tensorflow/core/lib/io/buffered_inputstream.cc
index 7bdbc1ff8c7144a37ba317449099c779e54b2078..6f72da47131692130844c1a11d4eb7f1092dc441 100644
--- a/tensorflow/core/lib/io/buffered_inputstream.cc
+++ b/tensorflow/core/lib/io/buffered_inputstream.cc
@@ -150,6 +150,24 @@ Status BufferedInputStream::Seek(int64 position) {
   return SkipNBytes(position - bufpos);
 }
 
+Status BufferedInputStream::ReadAll(string* result) {
+  result->clear();
+  Status status;
+  while (status.ok()) {
+    status = FillBuffer();
+    if (limit_ == 0) {
+      break;
+    }
+    result->append(buf_);
+    pos_ = limit_;
+  }
+
+  if (errors::IsOutOfRange(status)) {
+    return Status::OK();
+  }
+  return status;
+}
+
 Status BufferedInputStream::Reset() {
   TF_RETURN_IF_ERROR(input_stream_->Reset());
   pos_ = 0;
diff --git a/tensorflow/core/lib/io/buffered_inputstream.h b/tensorflow/core/lib/io/buffered_inputstream.h
index d55dd5cdafbf83324ed53a1974776597217db175..b37766005a920645c604330fbf792f69df889132 100644
--- a/tensorflow/core/lib/io/buffered_inputstream.h
+++ b/tensorflow/core/lib/io/buffered_inputstream.h
@@ -75,6 +75,12 @@ class BufferedInputStream : public InputStreamInterface {
   // no special treatment.
   string ReadLineAsString();
 
+  // Reads the entire contents of the file into *result.
+  //
+  // Note: the amount of memory used by this function call is unbounded, so only
+  // use in ops that expect that behavior.
+  Status ReadAll(string* result);
+
   Status Reset() override;
 
  private:
diff --git a/tensorflow/core/lib/io/buffered_inputstream_test.cc b/tensorflow/core/lib/io/buffered_inputstream_test.cc
index 4def2de373a5d902b86c79b8bcbb522a8acecf49..7265101e1bef402a655192aeac111375aba4b51a 100644
--- a/tensorflow/core/lib/io/buffered_inputstream_test.cc
+++ b/tensorflow/core/lib/io/buffered_inputstream_test.cc
@@ -326,6 +326,42 @@ TEST(BufferedInputStream, Seek) {
   }
 }
 
+TEST(BufferedInputStream, ReadAll_Empty) {
+  Env* env = Env::Default();
+  string fname = testing::TmpDir() + "/buffered_inputstream_test";
+  const string expected = "";
+  TF_ASSERT_OK(WriteStringToFile(env, fname, expected));
+  std::unique_ptr<RandomAccessFile> file;
+  TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file));
+
+  for (auto buf_size : BufferSizes()) {
+    RandomAccessInputStream input_stream(file.get());
+    string read;
+    BufferedInputStream in(&input_stream, buf_size);
+    string contents;
+    TF_ASSERT_OK(in.ReadAll(&contents));
+    EXPECT_EQ(expected, contents);
+  }
+}
+
+TEST(BufferedInputStream, ReadAll_Text) {
+  Env* env = Env::Default();
+  string fname = testing::TmpDir() + "/buffered_inputstream_test";
+  const string expected = "line one\nline two\nline three";
+  TF_ASSERT_OK(WriteStringToFile(env, fname, expected));
+  std::unique_ptr<RandomAccessFile> file;
+  TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file));
+
+  for (auto buf_size : BufferSizes()) {
+    RandomAccessInputStream input_stream(file.get());
+    string read;
+    BufferedInputStream in(&input_stream, buf_size);
+    string contents;
+    TF_ASSERT_OK(in.ReadAll(&contents));
+    EXPECT_EQ(expected, contents);
+  }
+}
+
 }  // anonymous namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/inputbuffer.cc b/tensorflow/core/lib/io/inputbuffer.cc
index 750737a62d01bd74d4a92d290f0c57fae7f601c5..7efe2dc54341ee9780b6a0f3bd98e896f37a4700 100644
--- a/tensorflow/core/lib/io/inputbuffer.cc
+++ b/tensorflow/core/lib/io/inputbuffer.cc
@@ -47,7 +47,7 @@ Status InputBuffer::ReadLine(string* result) {
   Status s;
   do {
     size_t buf_remain = limit_ - pos_;
-    char* newline = (char*)memchr(pos_, '\n', buf_remain);
+    char* newline = static_cast<char*>(memchr(pos_, '\n', buf_remain));
     if (newline != nullptr) {
       size_t result_len = newline - pos_;
       result->append(pos_, result_len);
diff --git a/tensorflow/core/lib/io/path.cc b/tensorflow/core/lib/io/path.cc
index ab2fd7739f7687c0fa807b458e2786f2cac89e5b..d93dd0296e4f28e024600110eee45153ea9c9cbd 100644
--- a/tensorflow/core/lib/io/path.cc
+++ b/tensorflow/core/lib/io/path.cc
@@ -177,7 +177,7 @@ string CleanPath(StringPiece unclean_path) {
   }
 
   // Calculate and check the length of the cleaned path.
-  int path_length = dst - path.begin();
+  string::difference_type path_length = dst - path.begin();
   if (path_length != 0) {
     // Remove trailing '/' except if it is root path ("/" ==> path_length := 1)
     if (path_length > 1 && path[path_length - 1] == '/') {
diff --git a/tensorflow/core/lib/io/record_writer.cc b/tensorflow/core/lib/io/record_writer.cc
index d77a1016dad7c0f21190c828eb1776bca7f5b104..3657243c5d38a2076c1ca2c2e5f31b488b5a281b 100644
--- a/tensorflow/core/lib/io/record_writer.cc
+++ b/tensorflow/core/lib/io/record_writer.cc
@@ -80,15 +80,12 @@ RecordWriter::RecordWriter(WritableFile* dest,
 }
 
 RecordWriter::~RecordWriter() {
-#if !defined(IS_SLIM_BUILD)
-  if (IsZlibCompressed(options_)) {
-    Status s = dest_->Close();
+  if (dest_ != nullptr) {
+    Status s = Close();
     if (!s.ok()) {
       LOG(ERROR) << "Could not finish writing file: " << s;
     }
-    delete dest_;
   }
-#endif  // IS_SLIM_BUILD
 }
 
 static uint32 MaskedCrc(const char* data, size_t n) {
@@ -113,6 +110,18 @@ Status RecordWriter::WriteRecord(StringPiece data) {
   return dest_->Append(StringPiece(footer, sizeof(footer)));
 }
 
+Status RecordWriter::Close() {
+#if !defined(IS_SLIM_BUILD)
+  if (IsZlibCompressed(options_)) {
+    Status s = dest_->Close();
+    delete dest_;
+    dest_ = nullptr;
+    return s;
+  }
+#endif  // IS_SLIM_BUILD
+  return Status::OK();
+}
+
 Status RecordWriter::Flush() {
   if (IsZlibCompressed(options_)) {
     return dest_->Flush();
diff --git a/tensorflow/core/lib/io/record_writer.h b/tensorflow/core/lib/io/record_writer.h
index 63f0a7c5d07c2dd5e8099b1f890473c00d12b765..daed809af3c5329125628d53cc4e05b47def1052 100644
--- a/tensorflow/core/lib/io/record_writer.h
+++ b/tensorflow/core/lib/io/record_writer.h
@@ -53,6 +53,10 @@ class RecordWriter {
   RecordWriter(WritableFile* dest,
                const RecordWriterOptions& options = RecordWriterOptions());
 
+  // Calls Close() and logs if an error occurs.
+  //
+  // TODO(jhseu): Require that callers explicitly call Close() and remove the
+  // implicit Close() call in the destructor.
   ~RecordWriter();
 
   Status WriteRecord(StringPiece slice);
@@ -62,6 +66,12 @@ class RecordWriter {
   // WritableFile.
   Status Flush();
 
+  // Writes all output to the file. Does *not* close the WritableFile.
+  //
+  // After calling Close(), any further calls to `WriteRecord()` or `Flush()`
+  // are invalid.
+  Status Close();
+
  private:
   WritableFile* dest_;
   RecordWriterOptions options_;
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc
index f9846968afc60328743a6d07da9928fcc8022816..5dce3673fc0bbe81c7f6bd3252766e675cec45b9 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc
@@ -45,7 +45,7 @@ enum JPEGErrors {
   JPEGERRORS_BAD_PARAM
 };
 
-// Prevent bad compiler behaviour in ASAN mode by wrapping most of the
+// Prevent bad compiler behavior in ASAN mode by wrapping most of the
 // arguments in a struct struct.
 class FewerArgsForCompiler {
  public:
@@ -337,7 +337,8 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
 uint8* Uncompress(const void* srcdata, int datasize,
                   const UncompressFlags& flags, int64* nwarn,
                   std::function<uint8*(int, int, int)> allocate_output) {
-  FewerArgsForCompiler argball(datasize, flags, nwarn, allocate_output);
+  FewerArgsForCompiler argball(datasize, flags, nwarn,
+                               std::move(allocate_output));
   uint8* const dstdata = UncompressLow(srcdata, &argball);
 
   const float fraction_read =
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index bdc39e5d6f7b1c09ef72245d1d8f18937413f5fc..961a78f83b1af35fb0427d334946e228d7b6ca2d 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <string.h>
 #include <sys/types.h>
+#include <zlib.h>
 #include <string>
 #include <utility>
 #include <vector>
@@ -152,7 +153,10 @@ bool DecodeHeader(StringPiece png_string, int* width, int* height,
   if (components != NULL) {
     switch (context.color_type) {
       case PNG_COLOR_TYPE_PALETTE:
-        *components = (context.info_ptr->valid & PNG_INFO_tRNS) ? 4 : 3;
+        *components =
+            (png_get_valid(context.png_ptr, context.info_ptr, PNG_INFO_tRNS))
+                ? 4
+                : 3;
         break;
       case PNG_COLOR_TYPE_GRAY:
         *components = 1;
@@ -176,8 +180,11 @@ bool DecodeHeader(StringPiece png_string, int* width, int* height,
   }
   if (metadata != NULL) {
     metadata->clear();
-    for (int i = 0; i < context.info_ptr->num_text; i++) {
-      const png_text& text = context.info_ptr->text[i];
+    png_textp text_ptr = NULL;
+    int num_text = 0;
+    png_get_text(context.png_ptr, context.info_ptr, &text_ptr, &num_text);
+    for (int i = 0; i < num_text; i++) {
+      const png_text& text = text_ptr[i];
       metadata->push_back(std::make_pair(text.key, text.text));
     }
   }
@@ -228,9 +235,10 @@ bool CommonInitDecode(StringPiece png_string, int desired_channels,
     return false;
   }
   if (context->channels == 0) {  // Autodetect number of channels
-    context->channels = context->info_ptr->channels;
+    context->channels = png_get_channels(context->png_ptr, context->info_ptr);
   }
-  const bool has_tRNS = (context->info_ptr->valid & PNG_INFO_tRNS) != 0;
+  const bool has_tRNS =
+      (png_get_valid(context->png_ptr, context->info_ptr, PNG_INFO_tRNS)) != 0;
   const bool has_alpha = (context->color_type & PNG_COLOR_MASK_ALPHA) != 0;
   if ((context->channels & 1) == 0) {  // We desire alpha
     if (has_alpha) {                   // There is alpha
@@ -268,7 +276,9 @@ bool CommonInitDecode(StringPiece png_string, int desired_channels,
   const bool want_gray = (context->channels < 3);
   const bool is_gray = !(context->color_type & PNG_COLOR_MASK_COLOR);
   if (is_gray) {  // upconvert gray to 8-bit if needed.
-    if (context->bit_depth < 8) png_set_gray_1_2_4_to_8(context->png_ptr);
+    if (context->bit_depth < 8) {
+      png_set_expand_gray_1_2_4_to_8(context->png_ptr);
+    }
   }
   if (want_gray) {  // output is grayscale
     if (!is_gray)
@@ -301,7 +311,9 @@ bool CommonFinishDecode(png_bytep data, int row_bytes, DecodeContext* context) {
     }
   }
 
-  context->info_ptr->valid |= PNG_INFO_IDAT;
+  // Marks iDAT as valid.
+  png_set_rows(context->png_ptr, context->info_ptr,
+               png_get_rows(context->png_ptr, context->info_ptr));
   png_read_end(context->png_ptr, context->info_ptr);
 
   // Clean up.
diff --git a/tensorflow/core/lib/random/philox_random.h b/tensorflow/core/lib/random/philox_random.h
index 1fec5a3b4415d6a592e8d7ddefb55691d3b34183..b2adb4462ba7d71122e84f2f5b4acc3b8327d9f8 100644
--- a/tensorflow/core/lib/random/philox_random.h
+++ b/tensorflow/core/lib/random/philox_random.h
@@ -101,12 +101,15 @@ class Array {
 // 2. PhiloxRandom is compilable by gcc and nvcc.
 class PhiloxRandom {
  public:
-  typedef Array<uint32, 4> ResultType;
-  typedef uint32 ResultElementType;
+  using ResultType = Array<uint32, 4>;
+  using ResultElementType = uint32;
   // The number of elements that will be returned.
   static const int kResultElementCount = 4;
   // Cost of generation of a single element (in cycles).
   static const int kElementCost = 10;
+  // The type for the 64-bit key stored in the form of two 32-bit uint
+  // that are used in the diffusion process.
+  using Key = Array<uint32, 2>;
 
   PHILOX_DEVICE_INLINE
   PhiloxRandom() {}
@@ -125,6 +128,9 @@ class PhiloxRandom {
     counter_[3] = static_cast<uint32>(seed_hi >> 32);
   }
 
+  PHILOX_DEVICE_INLINE
+  PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
+
   // Skip the specified number of samples of 128-bits in the current stream.
   PHILOX_DEVICE_INLINE
   void Skip(uint64 count) {
@@ -178,10 +184,6 @@ class PhiloxRandom {
   }
 
  private:
-  // The type for the 64-bit key stored in the form of two 32-bit uint
-  // that are used in the diffusion process.
-  typedef Array<uint32, 2> Key;
-
   // We use the same constants as recommended by the original paper.
   static const uint32 kPhiloxW32A = 0x9E3779B9;
   static const uint32 kPhiloxW32B = 0xBB67AE85;
diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc
index 531ed781095bc1434095bf09f3b6220b7913a1e3..28ff5bf6e8e4d9db6a0c7baef616edba97f56521 100644
--- a/tensorflow/core/lib/random/random_distributions_test.cc
+++ b/tensorflow/core/lib/random/random_distributions_test.cc
@@ -70,7 +70,7 @@ void FillRandomsWithSingles(PhiloxRandom gen,
 //   z_limit: the maximum z-test we would consider the test to pass;
 template <typename T>
 bool CheckSamplesMoments(const std::vector<T>& samples,
-                         std::function<double(int)> theoretical_moments,
+                         const std::function<double(int)>& theoretical_moments,
                          int max_moments, int stride, T z_limit) {
   const T* const samples_data = &samples[0];
   const int samples_size = samples.size();
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index da369ea516a99381b1260033fc5ea16f631c8c65..c68e14f09fbd4a89ad9cd75a8df94144d0cd2c75 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -25,7 +25,7 @@ namespace str_util {
 
 static char hex_char[] = "0123456789abcdef";
 
-string CEscape(const string& src) {
+string CEscape(StringPiece src) {
   string dest;
 
   for (unsigned char c : src) {
@@ -258,6 +258,25 @@ void TitlecaseString(string* s, StringPiece delimiters) {
   }
 }
 
+string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
+                     bool replace_all) {
+  // TODO(jlebar): We could avoid having to shift data around in the string if
+  // we had a StringPiece::find() overload that searched for a StringPiece.
+  string res = s.ToString();
+  size_t pos = 0;
+  while ((pos = res.find(oldsub.data(), pos, oldsub.size())) != string::npos) {
+    res.replace(pos, oldsub.size(), newsub.data(), newsub.size());
+    pos += newsub.size();
+    if (oldsub.empty()) {
+      pos++;  // Match at the beginning of the text and after every byte
+    }
+    if (!replace_all) {
+      break;
+    }
+  }
+  return res;
+}
+
 size_t RemoveLeadingWhitespace(StringPiece* text) {
   size_t count = 0;
   const char* ptr = text->data();
diff --git a/tensorflow/core/lib/strings/str_util.h b/tensorflow/core/lib/strings/str_util.h
index bfecfef6cbb2fb2d46468068f70d694150f1b725..669f0d3c5279b90fe31398410c4a95a053d16fd5 100644
--- a/tensorflow/core/lib/strings/str_util.h
+++ b/tensorflow/core/lib/strings/str_util.h
@@ -30,7 +30,7 @@ namespace str_util {
 
 // Returns a version of 'src' where unprintable characters have been
 // escaped using C-style escape sequences.
-string CEscape(const string& src);
+string CEscape(StringPiece src);
 
 // Copies "source" to "dest", rewriting C-style escape sequences --
 // '\n', '\r', '\\', '\ooo', etc -- to their ASCII equivalents.
@@ -85,6 +85,11 @@ string Uppercase(StringPiece s);
 // set of characters that can be used as word boundaries.
 void TitlecaseString(string* s, StringPiece delimiters);
 
+// Replaces the first occurrence (if replace_all is false) or all occurrences
+// (if replace_all is true) of oldsub in s with newsub.
+string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
+                     bool replace_all);
+
 // Join functionality
 template <typename T>
 string Join(const T& s, const char* sep);
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index 486690cf57fa3506724c9f4722d6283ec329dc7d..040f7447e4d2d13a9f679ba92670ee74a866dae3 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -352,4 +352,37 @@ TEST(TitlecaseString, Basic) {
   ASSERT_EQ(s, "Dense");
 }
 
+TEST(StringReplace, Basic) {
+  EXPECT_EQ("XYZ_XYZ_XYZ", str_util::StringReplace("ABC_ABC_ABC", "ABC", "XYZ",
+                                                   /*replace_all=*/true));
+}
+
+TEST(StringReplace, OnlyFirst) {
+  EXPECT_EQ("XYZ_ABC_ABC", str_util::StringReplace("ABC_ABC_ABC", "ABC", "XYZ",
+                                                   /*replace_all=*/false));
+}
+
+TEST(StringReplace, IncreaseLength) {
+  EXPECT_EQ("a b c",
+            str_util::StringReplace("abc", "b", " b ", /*replace_all=*/true));
+}
+
+TEST(StringReplace, IncreaseLengthMultipleMatches) {
+  EXPECT_EQ("a b  b c",
+            str_util::StringReplace("abbc", "b", " b ", /*replace_all=*/true));
+}
+
+TEST(StringReplace, NoChange) {
+  EXPECT_EQ("abc",
+            str_util::StringReplace("abc", "d", "X", /*replace_all=*/true));
+}
+
+TEST(StringReplace, EmptyStringReplaceFirst) {
+  EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/false));
+}
+
+TEST(StringReplace, EmptyStringReplaceAll) {
+  EXPECT_EQ("", str_util::StringReplace("", "a", "X", /*replace_all=*/true));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/strcat.cc b/tensorflow/core/lib/strings/strcat.cc
index b078e8cf9450fbd4dc7667924affbe7b59900905..3e864c4f2821a00c89eaf5b7e0a0da275bc770a1 100644
--- a/tensorflow/core/lib/strings/strcat.cc
+++ b/tensorflow/core/lib/strings/strcat.cc
@@ -27,8 +27,6 @@ limitations under the License.
 namespace tensorflow {
 namespace strings {
 
-AlphaNum gEmptyAlphaNum("");
-
 AlphaNum::AlphaNum(const Eigen::half &f)
     : piece_(digits_, strlen(FloatToBuffer(static_cast<float>(f), digits_))) {}
 
diff --git a/tensorflow/core/lib/strings/strcat.h b/tensorflow/core/lib/strings/strcat.h
index 9434b9441107851485fcbaa63dc0939e73a55599..8e35549ed4bdd9afa497011c1f10504b59a0f350 100644
--- a/tensorflow/core/lib/strings/strcat.h
+++ b/tensorflow/core/lib/strings/strcat.h
@@ -144,8 +144,6 @@ class AlphaNum {
   TF_DISALLOW_COPY_AND_ASSIGN(AlphaNum);
 };
 
-extern AlphaNum gEmptyAlphaNum;
-
 // ----------------------------------------------------------------------
 // StrCat()
 //    This merges the given strings or numbers, with no delimiter.  This
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 97e218a793182514378678bd25cef6326197fde8..028ff26ffb9b23d57be908494005c6ed1745c981 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -262,7 +262,7 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string,
   const uint32 data_count = *sample_count * *channel_count;
   float_values->resize(data_count);
   for (int i = 0; i < data_count; ++i) {
-    int16 single_channel_value;
+    int16 single_channel_value = 0;
     TF_RETURN_IF_ERROR(
         ReadValue<int16>(wav_string, &single_channel_value, &offset));
     (*float_values)[i] = Int16SampleToFloat(single_channel_value);
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index e81490c4988d0f40bb35c277db3c89de7ff23023..b9e56a1742f262f120ee800c0e7aa364be0281a3 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -41,10 +41,10 @@ Status GetAxisForPackAndUnpack(InferenceContext* c, int32 rank_after_pack,
 }
 
 template <typename T>
-std::vector<int64> AsInt64(const Tensor* tensor, int num_elements) {
+std::vector<int64> AsInt64(const Tensor* tensor, int64 num_elements) {
   std::vector<int64> ret(num_elements);
   auto data = tensor->vec<T>();
-  for (int i = 0; i < num_elements; ++i) {
+  for (int64 i = 0; i < num_elements; ++i) {
     ret[i] = data(i);
   }
   return ret;
@@ -52,11 +52,11 @@ std::vector<int64> AsInt64(const Tensor* tensor, int num_elements) {
 
 template <typename T>
 Status PadKnown(InferenceContext* c, ShapeHandle input,
-                const Tensor* paddings_t, int32 num_dims) {
+                const Tensor* paddings_t, int64 num_dims) {
   // paddings_t is known.
   std::vector<DimensionHandle> dims(num_dims);
   auto paddings_data = paddings_t->matrix<T>();
-  for (int i = 0; i < num_dims; ++i) {
+  for (int64 i = 0; i < num_dims; ++i) {
     const T pad0 = paddings_data(i, 0);
     const T pad1 = paddings_data(i, 1);
     if (pad0 < 0 || pad1 < 0) {
@@ -209,7 +209,7 @@ The input tensors are all required to have size 1 in the first dimension.
 
 For example:
 
-```prettyprint
+```
 # 'x' is [[1, 4]]
 # 'y' is [[2, 5]]
 # 'z' is [[3, 6]]
@@ -277,7 +277,7 @@ Etc.
 
 For example:
 
-```prettyprint
+```
 # 'x' is [1, 4]
 # 'y' is [2, 5]
 # 'z' is [3, 6]
@@ -394,6 +394,28 @@ output: A `Tensor` with the concatenation of values stacked along the
   in `concat_dim` where it has the sum of the sizes.
 )doc");
 
+// TODO(vivek.v.rane@intel.com): Prefix the op names with underscore if the ops
+// are not to be made user-accessible.
+#ifdef INTEL_MKL
+REGISTER_OP("_MklConcatV2")
+    .Input("values: N * T")
+    .Input("axis: Tidx")
+    .Input("mkl_values: N * uint8")
+    .Input("mkl_axis: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("N: int >= 2")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::ConcatV2Shape)
+    .Doc(R"doc(
+MKL version of ConcatV2 operator. Uses MKL DNN APIs to perform concatenation.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+#endif
+
 REGISTER_OP("ConcatOffset")
     .Input("concat_dim: int32")
     .Input("shape: N * int32")
@@ -410,19 +432,19 @@ Computes offsets of concat inputs within its output.
 
 For example:
 
-```prettyprint
+```
 # 'x' is [2, 2, 7]
 # 'y' is [2, 3, 7]
 # 'z' is [2, 5, 7]
 concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
 ```
 
+This is typically used by gradient computations for a concat operation.
+
 concat_dim: The dimension along which to concatenate.
 shape: The `N` int32 vectors representing shape of tensors being concatenated.
 offset: The `N` int32 vectors representing the starting offset
         of input tensors within the concatenated output.
-
-This is typically used by gradient computations for a concat operation.
 )doc");
 
 // --------------------------------------------------------------------------
@@ -488,7 +510,7 @@ REGISTER_OP("SplitV")
       ShapeHandle output_shape;
       const Tensor* size_splits = c->input_tensor(1);
       if (rank == InferenceContext::kUnknownRank) {
-        // If the rank of input tensor is unknown, then return unkown shapes.
+        // If the rank of input tensor is unknown, then return unknown shapes.
         output_shape = c->UnknownShape();
         for (int i = 0; i < num_outputs; ++i) {
           c->set_output(i, output_shape);
@@ -496,8 +518,18 @@ REGISTER_OP("SplitV")
       } else if (rank == 0) {
         // Throw error if input is a scalar.
         return errors::InvalidArgument("Can't split scalars");
-      } else if (size_splits == nullptr || !c->ValueKnown(split_dimension)) {
-        // If split dimension or tensor containing the split sizes is unkown,
+      } else if (size_splits == nullptr && c->ValueKnown(split_dimension)) {
+        // If split dimension is known, but the sizes are unknown, then
+        // only the split dimension is unknown
+        output_shape = input;
+        TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
+                                         c->Value(split_dimension),
+                                         c->UnknownDim(), &output_shape));
+        for (int i = 0; i < num_outputs; ++i) {
+          c->set_output(i, output_shape);
+        }
+      } else if (size_splits == nullptr && !c->ValueKnown(split_dimension)) {
+        // If split dimension or tensor containing the split sizes is unknown,
         // then return unknown shapes of same rank as input.
         output_shape = c->UnknownShapeOfRank(rank);
         for (int i = 0; i < num_outputs; ++i) {
@@ -518,12 +550,38 @@ REGISTER_OP("SplitV")
           return errors::InvalidArgument(
               "Length of size_splits should be equal to num_outputs");
         }
+        int64_t cumsum_outputs = 0;
+        bool has_neg_one = false;
+        // If the sizes of the splits are known, then
+        // make sure that the sizes add up to the expected
+        // dimension size, with the possibility of a -1.
+        // Specify the full output shapes.
         for (int i = 0; i < num_outputs; ++i) {
           output_shape = c->UnknownShapeOfRank(rank);
           TF_RETURN_IF_ERROR(c->ReplaceDim(input, split_dim,
                                            c->MakeDim(data[i]), &output_shape));
           c->set_output(i, output_shape);
+          if (data[i] == -1 && !has_neg_one)
+            has_neg_one = true;
+          else if (data[i] == -1 && has_neg_one)
+            return errors::InvalidArgument("size_splits can only have one -1");
+          else
+            cumsum_outputs += data[i];
+        }
+        auto split_dim_size = c->Value(c->Dim(input, split_dim));
+        if (has_neg_one) {
+          if (cumsum_outputs < split_dim_size)
+            cumsum_outputs = split_dim_size;
+          else
+            cumsum_outputs = split_dim_size + 1;
         }
+        if (c->ValueKnown(c->Dim(input, split_dim)) &&
+            cumsum_outputs != c->Value(c->Dim(input, split_dim)))
+          return errors::InvalidArgument(
+              "Sum of output sizes must match "
+              "the size of the original Tensor along the split dimension "
+              "or the sum of the positive sizes must be less if it contains a "
+              "-1");
       }
 
       return Status::OK();
@@ -648,7 +706,7 @@ rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 
 For example:
 
-```prettyprint
+```
 # 'diagonal' is [1, 2, 3, 4]
 tf.diag(diagonal) ==> [[1, 0, 0, 0]
                        [0, 2, 0, 0]
@@ -700,7 +758,7 @@ tensor of rank `k` with dimensions `[D1,..., Dk]` where:
 
 For example:
 
-```prettyprint
+```
 # 'input' is [[1, 0, 0, 0]
               [0, 2, 0, 0]
               [0, 0, 3, 0]
@@ -746,7 +804,7 @@ tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
 
 For example:
 
-```prettyprint
+```
 # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
 
 and diagonal.shape = (2, 4)
@@ -858,7 +916,7 @@ The input must be at least a matrix.
 
 For example:
 
-```prettyprint
+```
 # 'input' is [[[1, 0, 0, 0]
                [0, 2, 0, 0]
                [0, 0, 3, 0]
@@ -905,7 +963,7 @@ The indicator function
 
 For example:
 
-```prettyprint
+```
 # if 'input' is [[ 0,  1,  2, 3]
                  [-1,  0,  1, 2]
                  [-2, -1,  0, 1]
@@ -924,7 +982,7 @@ tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
 
 Useful special cases:
 
-```prettyprint
+```
  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
@@ -946,7 +1004,7 @@ REGISTER_OP("Reverse")
     .Output("output: T")
     .Attr(
         "T: {uint8, int8, int32, int64, bool, half, float, double, complex64, "
-        "complex128}")
+        "complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle dims;
@@ -976,7 +1034,7 @@ of `tensor` must equal the number of elements in `dims`. In other words:
 
 For example:
 
-```prettyprint
+```
 # tensor 't' is [[[[ 0,  1,  2,  3],
 #                  [ 4,  5,  6,  7],
 #                  [ 8,  9, 10, 11]],
@@ -1023,7 +1081,7 @@ REGISTER_OP("ReverseV2")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
         "T: {uint8, int8, int32, int64, bool, half, float, double, complex64, "
-        "complex128}")
+        "complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle axis;
@@ -1052,7 +1110,7 @@ once, a InvalidArgument error is raised.
 
 For example:
 
-```prettyprint
+```
 # tensor 't' is [[[[ 0,  1,  2,  3],
 #                  [ 4,  5,  6,  7],
 #                  [ 8,  9, 10, 11]],
@@ -1223,7 +1281,7 @@ This operation creates a tensor of shape `dims` and fills it with `value`.
 
 For example:
 
-```prettyprint
+```
 # Output tensor has shape [2, 3].
 fill([2, 3], 9) ==> [[9, 9, 9]
                      [9, 9, 9]]
@@ -1244,9 +1302,12 @@ REGISTER_OP("_ParallelConcatStart")
     .Attr("dtype: type")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
-      c->set_output(0, out);
+      TensorShapeProto shape_proto;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_proto));
+      ShapeHandle output_shape;
+      TF_RETURN_IF_ERROR(
+          c->MakeShapeFromShapeProto(shape_proto, &output_shape));
+      c->set_output(0, output_shape);
       return Status::OK();
     })
     .Doc(R"doc(
@@ -1325,11 +1386,11 @@ this operation will permute `params` accordingly.
 
 `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
 `indices` are always validated to be within range. If assigned to GPU,
-out-of-bound indices result in unspecified behavior (currently the result is
-`0`, but this may become an error in the future).
+out-of-bound indices result in safe but unspecified behavior, which may include
+raising an error.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/Gather.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
 </div>
 )doc");
 
@@ -1371,20 +1432,17 @@ REGISTER_OP("GatherNd")
     .Doc(R"doc(
 Gather values or slices from `params` according to `indices`.
 
-`params` is a Tensor of rank `P` and `indices` is a Tensor of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `params`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+`indices` is an integer tensor containing indices into `params`.  The last
+dimension of `indices` can be at most the rank of `params`:
 
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `params`.
+    indices.shape[-1] <= params.rank
 
-Produces an output tensor with shape
+The last dimension of `indices` corresponds to elements
+(if `indices.shape[-1] = params.rank`) or slices
+(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+of `params`.  The output tensor has shape
 
-```
-[d_0, ..., d_{Q-2}, params.shape[K], ..., params.shape[P-1]].
-```
+    indices.shape[:-1] + params.shape[indices.shape[-1]:]
 
 Some examples below.
 
@@ -1463,10 +1521,10 @@ Batched indexing into a 3-tensor:
     output = [['b0', 'b1'], ['d0', 'c1']]
 ```
 
-params: `P-D`.  The tensor from which to gather values.
-indices: `Q-D`.  Index tensor having shape `[d_0, ..., d_{Q-2}, K]`.
-output: `(P+Q-K-1)-D`.  Values from `params` gathered from indices given by
-  `indices`.
+params: The tensor from which to gather values.
+indices: Index tensor.
+output: Values from `params` gathered from indices given by `indices`, with
+  shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
 )doc");
 
 // --------------------------------------------------------------------------
@@ -1484,6 +1542,23 @@ REGISTER_OP("Identity")
 Return a tensor with the same shape and contents as the input tensor or value.
 )Doc");
 
+#ifdef INTEL_MKL
+REGISTER_OP("_MklIdentity")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      c->set_output_handle_dtype(0, c->input_handle_dtype(0));
+      c->set_output_handle_shape(0, c->input_handle_shape(0));
+      return Status::OK();
+    })
+    .Doc(R"Doc( Mkl implementation of IdentityOp
+)Doc");
+#endif
+
 // --------------------------------------------------------------------------
 REGISTER_OP("RefIdentity")
     .Input("input: Ref(T)")
@@ -1588,7 +1663,7 @@ implied by `shape` must be the same as the number of elements in `tensor`.
 
 For example:
 
-```prettyprint
+```
 # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
 # tensor 't' has shape [9]
 reshape(t, [3, 3]) ==> [[1, 2, 3],
@@ -1635,6 +1710,21 @@ reshape(t, []) ==> 7
 shape: Defines the shape of the output tensor.
 )Doc");
 
+#ifdef INTEL_MKL
+REGISTER_OP("_MklReshape")
+    .Input("tensor: T")
+    .Input("shape: Tshape")
+    .Input("mkl_tensor: uint8")
+    .Input("mkl_shape: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: type")
+    .Attr("Tshape: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) { return SetOutputShapeForReshape(c); })
+    .Doc(R"Doc( MKL implementation of ReshapeOp.
+)Doc");
+#endif  // INTEL_MKL
+
 // --------------------------------------------------------------------------
 REGISTER_OP("InvertPermutation")
     .Input("x: T")
@@ -1660,7 +1750,7 @@ The values must include 0. There can be no duplicate values or negative values.
 
 For example:
 
-```prettyprint
+```
 # tensor `x` is [3, 4, 0, 2, 1]
 invert_permutation(x) ==> [2, 4, 3, 0, 1]
 ```
@@ -1765,7 +1855,7 @@ in the unique output `y`. In other words:
 
 For example:
 
-```prettyprint
+```
 # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
 y, idx = unique(x)
 y ==> [1, 2, 4, 7, 8]
@@ -1805,7 +1895,7 @@ contains the count of each element of `y` in `x`. In other words:
 
 For example:
 
-```prettyprint
+```
 # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
 y, idx, count = unique_with_counts(x)
 y ==> [1, 2, 4, 7, 8]
@@ -1850,7 +1940,7 @@ This operation returns a 1-D integer tensor representing the shape of `input`.
 
 For example:
 
-```prettyprint
+```
 # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
 shape(t) ==> [2, 2, 3]
 ```
@@ -1931,7 +2021,7 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension
 
 For example:
 
-```prettyprint
+```
 # Given this:
 batch_dim = 0
 seq_dim = 1
@@ -1953,7 +2043,7 @@ output[3, 2:, :, ...] = input[3, 2:, :, ...]
 
 In contrast, if:
 
-```prettyprint
+```
 # Given this:
 batch_dim = 2
 seq_dim = 0
@@ -1994,7 +2084,7 @@ This operation returns an integer representing the rank of `input`.
 
 For example:
 
-```prettyprint
+```
 # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
 # shape of tensor 't' is [2, 2, 3]
 rank(t) ==> 3
@@ -2020,7 +2110,7 @@ This operation returns an integer representing the number of elements in
 
 For example:
 
-```prettyprint
+```
 # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
 size(t) ==> 12
 ```
@@ -2253,7 +2343,7 @@ encoding is best understand by considering a non-trivial example. In
 particular,
 `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
 
-```prettyprint
+```
 begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
 end = [2, 4, x, x, -3, x]
 strides = [1, 1, x, x, -1, 1]
@@ -2395,6 +2485,32 @@ shape must be exactly the shape produced by the slice of `ref`.
 // broadcasting.
 // --------------------------------------------------------------------------
 
+REGISTER_OP("ResourceStridedSliceAssign")
+    .Input("ref: resource")
+    .Input("begin: Index")
+    .Input("end: Index")
+    .Input("strides: Index")
+    .Input("value: T")
+    .Attr("T: type")
+    .Attr("Index: {int32, int64}")
+    .Attr("begin_mask: int = 0")
+    .Attr("end_mask: int = 0")
+    .Attr("ellipsis_mask: int = 0")
+    .Attr("new_axis_mask: int = 0")
+    .Attr("shrink_axis_mask: int = 0")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Assign `value` to the sliced l-value reference of `ref`.
+
+The values of `value` are assigned to the positions in the variable
+`ref` that are selected by the slice parameters. The slice parameters
+`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+
+NOTE this op currently does not support broadcasting and so `value`'s
+shape must be exactly the shape produced by the slice of `ref`.
+
+)doc");
+
 REGISTER_OP("Tile")
     .Input("input: T")
     .Input("multiples: Tmultiples")
@@ -2475,7 +2591,7 @@ the output tensor can vary depending on how many true values there are in
 
 For example:
 
-```prettyprint
+```
 # 'input' tensor is [[True, False]
 #                    [True, False]]
 # 'input' has two true values, so output has two coordinates.
@@ -2579,7 +2695,7 @@ The padded size of each dimension D of the output is:
 
 For example:
 
-```prettyprint
+```
 # 't' is [[1, 1], [2, 2]]
 # 'paddings' is [[1, 1], [2, 2]]
 # rank of 't' is 2
@@ -2618,7 +2734,7 @@ The padded size of each dimension D of the output is:
 
 For example:
 
-```prettyprint
+```
 # 't' is [[1, 2, 3], [4, 5, 6]].
 # 'paddings' is [[1, 1]], [2, 2]].
 # 'mode' is SYMMETRIC.
@@ -2644,10 +2760,10 @@ output: The padded tensor.
 namespace {
 template <typename T>
 Status MirrorPadKnown(InferenceContext* c, ShapeHandle input,
-                      const Tensor* paddings_t, int32 input_rank) {
+                      const Tensor* paddings_t, int64 input_rank) {
   auto paddings_data = paddings_t->matrix<T>();
   std::vector<DimensionHandle> dims(input_rank);
-  for (int i = 0; i < input_rank; ++i) {
+  for (int64 i = 0; i < input_rank; ++i) {
     const int64 pad0 = static_cast<int64>(paddings_data(i, 0));
     const int64 pad1 = static_cast<int64>(paddings_data(i, 1));
     if (pad0 < 0 || pad1 < 0) {
@@ -2714,7 +2830,7 @@ The folded size of each dimension D of the output is:
 
 For example:
 
-```prettyprint
+```
 # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
 # 'paddings' is [[0, 1]], [0, 1]].
 # 'mode' is SYMMETRIC.
@@ -2734,7 +2850,7 @@ output: The folded tensor.
 REGISTER_OP("Placeholder")
     .Output("output: dtype")
     .Attr("dtype: type")
-    .Attr("shape: shape = {}")
+    .Attr("shape: shape = { unknown_rank: true }")
     .SetShapeFn([](InferenceContext* c) {
       PartialTensorShape shape;
       TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
@@ -2742,7 +2858,7 @@ REGISTER_OP("Placeholder")
       // Placeholder has legacy behavior where we cannot tell the difference
       // between a scalar shape attribute and 'unknown shape'.  So if the shape
       // is a scalar, we return an unknown shape.
-      if (shape.dims() <= 0) {
+      if (c->graph_def_version() <= 21 && shape.dims() <= 0) {
         return shape_inference::UnknownShape(c);
       }
 
@@ -2766,11 +2882,9 @@ shape: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
   shape is unconstrained.
 )doc");
 
-// This version fixes an issue with the original version of Placeholder
-// where the empty shape attribute "[]" was used to denote
-// an unknown shape.  This meant that scalars (added later) could
-// not be represented natively.  This new version fixes that
-// limitation.
+// Placeholder was modified in a backwards compatible way to do what
+// PlaceholderV2 did, so we have deprecated V2 (no one was really
+// using it).
 REGISTER_OP("PlaceholderV2")
     .Output("output: dtype")
     .Attr("dtype: type")
@@ -2783,6 +2897,7 @@ REGISTER_OP("PlaceholderV2")
       c->set_output(0, output);
       return Status::OK();
     })
+    .Deprecated(23, "Placeholder now behaves the same as PlaceholderV2.")
     .Doc(R"doc(
 A placeholder op for a value that will be fed into the computation.
 
@@ -2891,7 +3006,7 @@ which will make the shape `[1, height, width, channels]`.
 
 Other examples:
 
-```prettyprint
+```
 # 't' is a tensor of shape [2]
 shape(expand_dims(t, 0)) ==> [1, 2]
 shape(expand_dims(t, 1)) ==> [2, 1]
@@ -2993,14 +3108,14 @@ dimensions, you can remove specific size 1 dimensions by specifying
 
 For example:
 
-```prettyprint
+```
 # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
 shape(squeeze(t)) ==> [2, 3]
 ```
 
 Or, to remove specific size 1 dimensions:
 
-```prettyprint
+```
 # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
 shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
 ```
@@ -3043,14 +3158,14 @@ position of each `out` element in `x`. In other words:
 
 For example, given this input:
 
-```prettyprint
+```
 x = [1, 2, 3, 4, 5, 6]
 y = [1, 3, 5]
 ```
 
 This operation would return:
 
-```prettyprint
+```
 out ==> [2, 4, 6]
 idx ==> [1, 3, 5]
 ```
@@ -3309,34 +3424,34 @@ Some examples:
 (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
     `paddings = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 x = [[[[1], [2]], [[3], [4]]]]
 ```
 
 The output tensor has shape `[4, 1, 1, 1]` and value:
 
-```prettyprint
+```
 [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 ```
 
 (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
     `paddings = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
 
 The output tensor has shape `[4, 1, 1, 3]` and value:
 
-```prettyprint
+```
 [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 ```
 
 (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
     `paddings = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]],
       [[9],  [10], [11],  [12]],
@@ -3345,7 +3460,7 @@ x = [[[[1],   [2],  [3],  [4]],
 
 The output tensor has shape `[4, 2, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[9], [11]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3355,7 +3470,7 @@ x = [[[[1], [3]], [[9], [11]]],
 (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
     paddings = `[[0, 0], [2, 0]]`:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]]],
      [[[9],  [10], [11],  [12]],
@@ -3364,7 +3479,7 @@ x = [[[[1],   [2],  [3],  [4]],
 
 The output tensor has shape `[8, 1, 3, 1]` and value:
 
-```prettyprint
+```
 x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
      [[[0], [2], [4]]], [[[0], [10], [12]]],
      [[[0], [5], [7]]], [[[0], [13], [15]]],
@@ -3438,32 +3553,32 @@ Some examples:
 
 (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1], [2]], [[3], [4]]]]
 ```
 
 The output tensor has shape `[4, 1, 1, 1]` and value:
 
-```prettyprint
+```
 [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 ```
 
 (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
 
 The output tensor has shape `[4, 1, 1, 3]` and value:
 
-```prettyprint
+```
 [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 ```
 
 (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]],
       [[9],  [10], [11],  [12]],
@@ -3472,7 +3587,7 @@ x = [[[[1],   [2],  [3],  [4]],
 
 The output tensor has shape `[4, 2, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[9], [11]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3481,7 +3596,7 @@ x = [[[[1], [3]], [[9], [11]]],
 
 (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]]],
      [[[9],  [10], [11],  [12]],
@@ -3490,7 +3605,7 @@ x = [[[[1],   [2],  [3],  [4]],
 
 The output tensor has shape `[8, 1, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
 ```
@@ -3576,26 +3691,26 @@ Some examples:
 (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
     `crops = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 ```
 
 The output tensor has shape `[1, 2, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [2]], [[3], [4]]]]
 ```
 
 (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
     `crops = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 ```
 
 The output tensor has shape `[1, 2, 2, 3]` and value:
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
@@ -3603,7 +3718,7 @@ x = [[[[1, 2, 3], [4, 5, 6]],
 (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
     `crops = [[0, 0], [0, 0]]`:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[9], [11]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3612,7 +3727,7 @@ x = [[[[1], [3]], [[9], [11]]],
 
 The output tensor has shape `[1, 4, 4, 1]` and value:
 
-```prettyprint
+```
 x = [[[1],   [2],  [3],  [4]],
      [[5],   [6],  [7],  [8]],
      [[9],  [10], [11],  [12]],
@@ -3622,7 +3737,7 @@ x = [[[1],   [2],  [3],  [4]],
 (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
     `crops = [[0, 0], [2, 0]]`:
 
-```prettyprint
+```
 x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
      [[[0], [2], [4]]], [[[0], [10], [12]]],
      [[[0], [5], [7]]], [[[0], [13], [15]]],
@@ -3631,7 +3746,7 @@ x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
 
 The output tensor has shape `[2, 2, 4, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [3],  [4]],
       [[5],   [6],  [7],  [8]]],
      [[[9],  [10], [11],  [12]],
@@ -3696,32 +3811,32 @@ Some examples:
 
 (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
 
-```prettyprint
+```
 [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 ```
 
 The output tensor has shape `[1, 2, 2, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [2]], [[3], [4]]]]
 ```
 
 (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
 
-```prettyprint
+```
 [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 ```
 
 The output tensor has shape `[1, 2, 2, 3]` and value:
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
 
 (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[9], [11]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3730,7 +3845,7 @@ x = [[[[1], [3]], [[9], [11]]],
 
 The output tensor has shape `[1, 4, 4, 1]` and value:
 
-```prettyprint
+```
 x = [[[1],   [2],  [3],  [4]],
      [[5],   [6],  [7],  [8]],
      [[9],  [10], [11],  [12]],
@@ -3739,14 +3854,14 @@ x = [[[1],   [2],  [3],  [4]],
 
 (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
 ```
 
 The output tensor has shape `[2, 2, 4, 1]` and value:
 
-```prettyprint
+```
 x = [[[[1], [3]], [[5], [7]]],
      [[[2], [4]], [[10], [12]]],
      [[[5], [7]], [[13], [15]]],
@@ -3812,14 +3927,14 @@ purely convolutional models.
 
 For example, given this input of shape `[1, 2, 2, 1]`, and block_size of 2:
 
-```prettyprint
+```
 x = [[[[1], [2]],
       [[3], [4]]]]
 ```
 
 This operation will output a tensor of shape `[1, 1, 1, 4]`:
 
-```prettyprint
+```
 [[[[1, 2, 3, 4]]]]
 ```
 
@@ -3830,7 +3945,7 @@ The output element shape is `[1, 1, 4]`.
 
 For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
 
-```prettyprint
+```
 x = [[[[1, 2, 3], [4, 5, 6]],
       [[7, 8, 9], [10, 11, 12]]]]
 ```
@@ -3838,13 +3953,13 @@ x = [[[[1, 2, 3], [4, 5, 6]],
 This operation, for block_size of 2, will return the following tensor of shape
 `[1, 1, 1, 12]`
 
-```prettyprint
+```
 [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 ```
 
 Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
 
-```prettyprint
+```
 x = [[[[1],   [2],  [5],  [6]],
       [[3],   [4],  [7],  [8]],
       [[9],  [10], [13],  [14]],
@@ -3853,7 +3968,7 @@ x = [[[[1],   [2],  [5],  [6]],
 
 the operator will return the following tensor of shape `[1 2 2 4]`:
 
-```prettyprint
+```
 x = [[[[1, 2, 3, 4],
        [5, 6, 7, 8]],
       [[9, 10, 11, 12],
@@ -3922,14 +4037,14 @@ purely convolutional models.
 
 For example, given this input of shape `[1, 1, 1, 4]`, and a block size of 2:
 
-```prettyprint
+```
 x = [[[[1, 2, 3, 4]]]]
 
 ```
 
 This operation will output a tensor of shape `[1, 2, 2, 1]`:
 
-```prettyprint
+```
    [[[[1], [2]],
      [[3], [4]]]]
 ```
@@ -3941,14 +4056,14 @@ The output element shape is `[2, 2, 1]`.
 
 For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
 
-```prettyprint
+```
 x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 ```
 
 This operation, for block size of 2, will return the following tensor of shape
 `[1, 2, 2, 3]`
 
-```prettyprint
+```
    [[[[1, 2, 3], [4, 5, 6]],
      [[7, 8, 9], [10, 11, 12]]]]
 
@@ -3956,7 +4071,7 @@ This operation, for block size of 2, will return the following tensor of shape
 
 Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
 
-```prettyprint
+```
 x =  [[[[1, 2, 3, 4],
        [5, 6, 7, 8]],
       [[9, 10, 11, 12],
@@ -3965,7 +4080,7 @@ x =  [[[[1, 2, 3, 4],
 
 the operator will return the following tensor of shape `[1 4 4 1]`:
 
-```prettyprint
+```
 x = [[ [1],   [2],  [5],  [6]],
      [ [3],   [4],  [7],  [8]],
      [ [9],  [10], [13],  [14]],
@@ -4711,37 +4826,35 @@ REGISTER_OP("ScatterNd")
     .Attr("T: type")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(ScatterNdShape)
-    .Doc(
-        R"doc(Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` tensor according to
+    .Doc(R"doc(
+Scatter `updates` into a new (initially zero) tensor according to `indices`.
+
+Creates a new tensor by applying sparse `updates` to individual
+values or slices within a zero tensor of the given `shape` according to
 indices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)
 operator which extracts values or slices from a given tensor.
 
-TODO(simister): Add a link to Variable.__getitem__ documentation on slice
-syntax.
-
-`shape` is a `TensorShape` with rank `P` and `indices` is a `Tensor` of rank
-`Q`.
+**WARNING**: The order in which updates are applied is nondeterministic, so the
+output will be nondeterministic if `indices` contains duplicates.
 
-`indices` must be integer tensor, containing indices into `shape`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
 
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `shape`.
+    indices.shape[-1] <= shape.rank
 
-`updates` is Tensor of rank `Q-1+P-K` with shape:
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
 
-```
-[d_0, ..., d_{Q-2}, shape[K], ..., shape[P-1]].
-```
+    indices.shape[:-1] + shape[indices.shape[-1]:]
 
 The simplest form of scatter is to insert individual elements in a tensor by
 index. For example, say we want to insert 4 scattered elements in a rank-1
 tensor with 8 elements.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterNd1.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
 </div>
 
 In Python, this scatter operation would look like this:
@@ -4752,7 +4865,7 @@ In Python, this scatter operation would look like this:
     shape = tf.constant([8])
     scatter = tf.scatter_nd(indices, updates, shape)
     with tf.Session() as sess:
-      print sess.run(scatter)
+      print(sess.run(scatter))
 ```
 
 The resulting tensor would look like this:
@@ -4764,7 +4877,7 @@ example, if we wanted to insert two slices in the first dimension of a
 rank-3 tensor with two matrices of new values.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterNd2.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
 </div>
 
 In Python, this scatter operation would look like this:
@@ -4778,7 +4891,7 @@ In Python, this scatter operation would look like this:
     shape = tf.constant([4, 4, 4])
     scatter = tf.scatter_nd(indices, updates, shape)
     with tf.Session() as sess:
-      print sess.run(scatter)
+      print(sess.run(scatter))
 ```
 
 The resulting tensor would look like this:
@@ -4788,11 +4901,9 @@ The resulting tensor would look like this:
      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
 
-indices: A Tensor. Must be one of the following types: int32, int64.
-  A tensor of indices into ref.
-updates: A Tensor. Must have the same type as tensor. A tensor of updated values
-  to store in ref.
-shape: A vector. The shape of the resulting tensor.
+indices: Index tensor.
+updates: Updates to scatter into output.
+shape: 1-D. The shape of the resulting tensor.
 output: A new tensor with the given shape and updates applied according
   to the indices.
 )doc");
@@ -4800,6 +4911,7 @@ output: A new tensor with the given shape and updates applied according
 REGISTER_OP("FakeQuantWithMinMaxArgs")
     .Attr("min: float = -6.0")
     .Attr("max: float = 6.0")
+    .Attr("num_bits: int = 8")
     .Input("inputs: float")
     .Output("outputs: float")
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -4809,6 +4921,7 @@ Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
 Attributes [min; max] define the clamping range for the 'inputs' data.  Op
 divides this range into 255 steps (total of 256 values), then replaces each
 'inputs' value with the closest of the quantized step values.
+'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 
 Quantization is called fake since the output is still in floating point.
 )doc");
@@ -4816,6 +4929,7 @@ Quantization is called fake since the output is still in floating point.
 REGISTER_OP("FakeQuantWithMinMaxArgsGradient")
     .Attr("min: float = -6.0")
     .Attr("max: float = 6.0")
+    .Attr("num_bits: int = 8")
     .Input("gradients: float")
     .Input("inputs: float")
     .Output("backprops: float")
@@ -4830,6 +4944,7 @@ backprops: Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
 )doc");
 
 REGISTER_OP("FakeQuantWithMinMaxVars")
+    .Attr("num_bits: int = 8")
     .Input("inputs: float")
     .Input("min: float")
     .Input("max: float")
@@ -4848,11 +4963,13 @@ and `max` to 'outputs' tensor of same shape as `inputs`.
 [min; max] is the clamping range for the 'inputs' data.  Op divides this range
 into 255 steps (total of 256 values), then replaces each 'inputs' value with the
 closest of the quantized step values.
+'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 
 This operation has a gradient and thus allows for training `min` and `max` values.
 )doc");
 
 REGISTER_OP("FakeQuantWithMinMaxVarsGradient")
+    .Attr("num_bits: int = 8")
     .Input("gradients: float")
     .Input("inputs: float")
     .Input("min: float")
@@ -4881,6 +4998,7 @@ Compute gradients for a FakeQuantWithMinMaxVars operation.
 gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
 inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
 min, max: Quantization interval, scalar floats.
+num_bits: The bitwidth of the quantization; between 2 and 8, inclusive.
 backprops_wrt_input: Backpropagated gradients w.r.t. inputs:
   `gradients * (inputs >= min && inputs <= max)`.
 backprop_wrt_min: Backpropagated gradients w.r.t. min parameter:
@@ -4890,6 +5008,7 @@ backprop_wrt_max: Backpropagated gradients w.r.t. max parameter:
 )doc");
 
 REGISTER_OP("FakeQuantWithMinMaxVarsPerChannel")
+    .Attr("num_bits: int = 8")
     .Input("inputs: float")
     .Input("min: float")
     .Input("max: float")
@@ -4916,11 +5035,13 @@ to 'outputs' tensor of same shape as `inputs`.
 [min; max] is the clamping range for the 'inputs' data in the corresponding
 depth channel.  Op divides this range into 255 steps (total of 256 values), then
 replaces each 'inputs' value with the closest of the quantized step values.
+'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 
 This operation has a gradient and thus allows for training `min` and `max` values.
 )doc");
 
 REGISTER_OP("FakeQuantWithMinMaxVarsPerChannelGradient")
+    .Attr("num_bits: int = 8")
     .Input("gradients: float")
     .Input("inputs: float")
     .Input("min: float")
@@ -4954,6 +5075,7 @@ gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation,
 inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape
   same as `gradients`.
 min, max: Quantization interval, floats of shape `[d]`.
+num_bits: The bitwidth of the quantization; between 2 and 8, inclusive.
 backprops_wrt_input: Backpropagated gradients w.r.t. inputs, shape same as
   `inputs`:
     `gradients * (inputs >= min && inputs <= max)`.
@@ -4963,6 +5085,27 @@ backprop_wrt_max: Backpropagated gradients w.r.t. max parameter, shape `[d]`:
   `sum_per_d(gradients * (inputs > max))`.
 )doc");
 
+#ifdef INTEL_MKL
+REGISTER_OP("_MklConcat")
+    .Input("concat_dim: int32")
+    .Input("values: N * T")
+    .Input("mkl_concat_dim: uint8")
+    .Input("mkl_values: N * uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("N: int >= 2")
+    .Attr("T: type")
+    .SetShapeFn([](InferenceContext* c) {
+      return shape_inference::ConcatShape(c, c->num_inputs() - 3);
+    })
+    .Doc(R"doc(
+MKL version of Concat operator. Uses MKL DNN APIs to perform concatenation.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+#endif
+
 // Deprecated op registrations:
 
 // The following can be deleted after 10mar2017.
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index bc99fb09e5eda5ba2d8c2be9fa6368afdf09a898..ef27c513332e54b631259325ba251cee7e827aba 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -786,14 +786,14 @@ TEST(ArrayOpsTest, Placeholder_ShapeFn) {
   }
 
   {
-    // Scalar shapes are unknown shapes due to legacy.
+    // Scalar shapes are supported
     ShapeInferenceTestOp op("Placeholder");
     TensorShape shape({});
     TF_ASSERT_OK(NodeDefBuilder("test", "Placeholder")
                      .Attr("shape", shape)
                      .Attr("dtype", DT_FLOAT)
                      .Finalize(&op.node_def));
-    INFER_OK(op, "", "?");
+    INFER_OK(op, "", "[]");
   }
 
   {
@@ -809,67 +809,11 @@ TEST(ArrayOpsTest, Placeholder_ShapeFn) {
     INFER_OK(op, "", "[1,?]");
   }
 
-  {
-    ShapeInferenceTestOp op("PlaceholderWithDefault");
-    const int64 dims[2] = {1, -1};
-    PartialTensorShape shape;
-    TF_ASSERT_OK(PartialTensorShape::MakePartialShape(dims, 2, &shape));
-    TF_ASSERT_OK(NodeDefBuilder("test", "PlaceholderWithDefault")
-                     .Input("input", 0, DT_FLOAT)
-                     .Attr("shape", shape)
-                     .Attr("dtype", DT_FLOAT)
-                     .Finalize(&op.node_def));
-    INFER_OK(op, "[1,2]", "[1,?]");
-
-    // input shape is not compatible with output shape.
-    INFER_ERROR("Dimension 0 in both shapes must be equal, but are 2 and 1", op,
-                "[2,3]");
-    // Wrong rank
-    INFER_ERROR("Shapes must be equal rank, but are 3 and 2", op, "[1,3,10]");
-  }
-}
-
-TEST(ArrayOpsTest, PlaceholderV2_ShapeFn) {
-  {
-    // 2D shape
-    ShapeInferenceTestOp op("PlaceholderV2");
-    TensorShape shape({1, 2});
-    TF_ASSERT_OK(NodeDefBuilder("test", "PlaceholderV2")
-                     .Attr("shape", shape)
-                     .Attr("dtype", DT_FLOAT)
-                     .Finalize(&op.node_def));
-    INFER_OK(op, "", "[1,2]");
-  }
-
-  {
-    // Scalar shapes are supported in V2.
-    ShapeInferenceTestOp op("PlaceholderV2");
-    TensorShape shape({});
-    TF_ASSERT_OK(NodeDefBuilder("test", "PlaceholderV2")
-                     .Attr("shape", shape)
-                     .Attr("dtype", DT_FLOAT)
-                     .Finalize(&op.node_def));
-    INFER_OK(op, "", "[]");
-  }
-
-  {
-    // Partial shape
-    ShapeInferenceTestOp op("PlaceholderV2");
-    const int64 dims[2] = {1, -1};
-    PartialTensorShape shape;
-    TF_ASSERT_OK(PartialTensorShape::MakePartialShape(dims, 2, &shape));
-    TF_ASSERT_OK(NodeDefBuilder("test", "PlaceholderV2")
-                     .Attr("shape", shape)
-                     .Attr("dtype", DT_FLOAT)
-                     .Finalize(&op.node_def));
-    INFER_OK(op, "", "[1,?]");
-  }
-
   {
     // Unknown shape
-    ShapeInferenceTestOp op("PlaceholderV2");
+    ShapeInferenceTestOp op("Placeholder");
     PartialTensorShape shape;
-    TF_ASSERT_OK(NodeDefBuilder("test", "PlaceholderV2")
+    TF_ASSERT_OK(NodeDefBuilder("test", "Placeholder")
                      .Attr("shape", shape)
                      .Attr("dtype", DT_FLOAT)
                      .Finalize(&op.node_def));
@@ -1626,4 +1570,16 @@ TEST(ArrayOpsTest, QuantizedConcat_ShapeFn) {
   // Note that other cases of concat are covered in the Concat tests.
 }
 
+TEST(StateOpsTest, _ParallelConcatStart_ShapeFn) {
+  ShapeInferenceTestOp op("_ParallelConcatStart");
+  TensorShape shape({1, 2, 3});
+  TensorShapeProto shape_proto;
+  shape.AsProto(&shape_proto);
+  TF_ASSERT_OK(NodeDefBuilder("test", "_ParallelConcatStart")
+                   .Attr("shape", shape_proto)
+                   .Attr("dtype", DT_FLOAT)
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "", "[1,2,3]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/audio_ops.cc b/tensorflow/core/ops/audio_ops.cc
index d6dedc3820633eee9d9acfe700d566f94c05df47..02b13a455ceaa4c0d8c91ba63136083a47658c45 100644
--- a/tensorflow/core/ops/audio_ops.cc
+++ b/tensorflow/core/ops/audio_ops.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/bits.h"
 
 namespace tensorflow {
 
@@ -66,6 +67,59 @@ Status EncodeWavShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+Status SpectrogramShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
+  int32 window_size;
+  TF_RETURN_IF_ERROR(c->GetAttr("window_size", &window_size));
+  int32 stride;
+  TF_RETURN_IF_ERROR(c->GetAttr("stride", &stride));
+
+  DimensionHandle input_channels = c->Dim(input, 0);
+  DimensionHandle input_length = c->Dim(input, 1);
+
+  DimensionHandle output_length;
+  if (!c->ValueKnown(input_length)) {
+    output_length = c->UnknownDim();
+  } else {
+    const int64 input_length_value = c->Value(input_length);
+    const int64 length_minus_window = (input_length_value - window_size);
+    int64 output_length_value;
+    if (length_minus_window < 0) {
+      output_length_value = 0;
+    } else {
+      output_length_value = 1 + (length_minus_window / stride);
+    }
+    output_length = c->MakeDim(output_length_value);
+  }
+
+  DimensionHandle output_channels =
+      c->MakeDim(1 + NextPowerOfTwo(window_size) / 2);
+  c->set_output(0,
+                c->MakeShape({input_channels, output_length, output_channels}));
+  return Status::OK();
+}
+
+Status MfccShapeFn(InferenceContext* c) {
+  ShapeHandle spectrogram;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 3, &spectrogram));
+  ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+
+  int32 dct_coefficient_count;
+  TF_RETURN_IF_ERROR(
+      c->GetAttr("dct_coefficient_count", &dct_coefficient_count));
+
+  DimensionHandle spectrogram_channels = c->Dim(spectrogram, 0);
+  DimensionHandle spectrogram_length = c->Dim(spectrogram, 1);
+
+  DimensionHandle output_channels = c->MakeDim(dct_coefficient_count);
+
+  c->set_output(0, c->MakeShape({spectrogram_channels, spectrogram_length,
+                                 output_channels}));
+  return Status::OK();
+}
+
 }  // namespace
 
 REGISTER_OP("DecodeWav")
@@ -121,4 +175,79 @@ sample_rate: Scalar containing the sample frequency.
 contents: 0-D. WAV-encoded file contents.
 )doc");
 
+REGISTER_OP("AudioSpectrogram")
+    .Input("input: float")
+    .Attr("window_size: int")
+    .Attr("stride: int")
+    .Attr("magnitude_squared: bool = false")
+    .Output("spectrogram: float")
+    .SetShapeFn(SpectrogramShapeFn)
+    .Doc(R"doc(
+Produces a visualization of audio data over time.
+
+Spectrograms are a standard way of representing audio information as a series of
+slices of frequency information, one slice for each window of time. By joining
+these together into a sequence, they form a distinctive fingerprint of the sound
+over time.
+
+This op expects to receive audio data as an input, stored as floats in the range
+-1 to 1, together with a window width in samples, and a stride specifying how
+far to move the window between slices. From this it generates a three
+dimensional output. The lowest dimension has an amplitude value for each
+frequency during that time slice. The next dimension is time, with successive
+frequency slices. The final dimension is for the channels in the input, so a
+stereo audio input would have two here for example.
+
+This means the layout when converted and saved as an image is rotated 90 degrees
+clockwise from a typical spectrogram. Time is descending down the Y axis, and
+the frequency decreases from left to right.
+
+Each value in the result represents the square root of the sum of the real and
+imaginary parts of an FFT on the current window of samples. In this way, the
+lowest dimension represents the power of each frequency in the current window,
+and adjacent windows are concatenated in the next dimension.
+
+To get a more intuitive and visual look at what this operation does, you can run
+tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+resulting spectrogram as a PNG image.
+
+input: Float representation of audio data.
+window_size: How wide the input window is in samples. For the highest efficiency
+  this should be a power of two, but other values are accepted.
+stride: How widely apart the center of adjacent sample windows should be.
+magnitude_squared: Whether to return the squared magnitude or just the
+  magnitude. Using squared magnitude can avoid extra calculations.
+spectrogram: 3D representation of the audio frequencies as an image.
+)doc");
+
+REGISTER_OP("Mfcc")
+    .Input("spectrogram: float")
+    .Input("sample_rate: int32")
+    .Attr("upper_frequency_limit: float = 4000")
+    .Attr("lower_frequency_limit: float = 20")
+    .Attr("filterbank_channel_count: int = 40")
+    .Attr("dct_coefficient_count: int = 13")
+    .Output("output: float")
+    .SetShapeFn(MfccShapeFn)
+    .Doc(R"doc(
+Transforms a spectrogram into a form that's useful for speech recognition.
+
+Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+been effective as an input feature for machine learning. They are created by
+taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+higher frequencies that are less significant to the human ear. They have a long
+history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+is a good resource to learn more.
+
+spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+  set to true.
+sample_rate: How many samples per second the source audio used.
+upper_frequency_limit: The highest frequency to use when calculating the
+  ceptstrum.
+lower_frequency_limit: The lowest frequency to use when calculating the
+  ceptstrum.
+filterbank_channel_count: Resolution of the Mel bank used internally.
+dct_coefficient_count: How many output channels to produce per time slice.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/candidate_sampling_ops.cc b/tensorflow/core/ops/candidate_sampling_ops.cc
index 037c393574dcbe4ea6bf705b5048b657e97573df..18700be67a667359d7a86d8f81ada383be973a0a 100644
--- a/tensorflow/core/ops/candidate_sampling_ops.cc
+++ b/tensorflow/core/ops/candidate_sampling_ops.cc
@@ -55,6 +55,7 @@ REGISTER_OP("UniformCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a uniform distribution.
 
@@ -80,7 +81,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -103,6 +104,7 @@ REGISTER_OP("LogUniformCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a log-uniform distribution.
 
@@ -129,7 +131,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -152,6 +154,7 @@ REGISTER_OP("LearnedUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a learned unigram distribution.
 
@@ -177,7 +180,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -200,6 +203,7 @@ REGISTER_OP("ThreadUnsafeUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a learned unigram distribution.
 
@@ -225,7 +229,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -254,6 +258,7 @@ REGISTER_OP("FixedUnigramCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a learned unigram distribution.
 
@@ -284,7 +289,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to randomly sample per batch.
+num_sampled: Number of candidates to randomly sample.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
@@ -329,6 +334,7 @@ REGISTER_OP("AllCandidateSampler")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .SetShapeFn(CandidateSamplerShapeFn)
+    .SetIsStateful()
     .Doc(R"doc(
 Generates labels for candidate sampling with a learned unigram distribution.
 
@@ -354,7 +360,7 @@ sampled_expected_count: A vector of length num_sampled, for each sampled
   to occur in a batch of sampled candidates.  If unique=true, then this is a
   probability.
 num_true: Number of true labels per context.
-num_sampled: Number of candidates to produce per batch.
+num_sampled: Number of candidates to produce.
 unique: If unique is true, we sample with rejection, so that all sampled
   candidates in a batch are unique. This requires some approximation to
   estimate the post-rejection sampling probabilities.
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 2af4e8692b2743d38fe6a3ee0967933001f69a63..49a364e5ef080374cbd7c7c1ce11a99f063cc773 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -487,6 +487,56 @@ op {
     }
   }
 }
+op {
+  name: "AllCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Any"
   input_arg {
@@ -797,6 +847,93 @@ op {
     }
   }
 }
+op {
+  name: "ApplyAdam"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyCenteredRMSProp"
   input_arg {
@@ -1666,6 +1803,57 @@ op {
     }
   }
 }
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AudioSpectrogram"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "spectrogram"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "window_size"
+    type: "int"
+  }
+  attr {
+    name: "stride"
+    type: "int"
+  }
+  attr {
+    name: "magnitude_squared"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "AudioSummary"
   input_arg {
@@ -1783,9 +1971,9 @@ op {
   }
 }
 op {
-  name: "AvgPool3D"
+  name: "AvgPool"
   input_arg {
-    name: "input"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
@@ -1796,13 +1984,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -1814,6 +2002,19 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -1821,26 +2022,15 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "AvgPool3D"
+  name: "AvgPool"
   input_arg {
-    name: "input"
+    name: "value"
     type_attr: "T"
   }
   output_arg {
@@ -1851,13 +2041,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -1873,12 +2063,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "NHWC"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -1887,32 +2077,120 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "AvgPool3DGrad"
-  input_arg {
-    name: "orig_input_shape"
-    type: DT_INT32
-  }
+  name: "AvgPool3D"
   input_arg {
-    name: "grad"
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -1948,18 +2226,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -2020,18 +2286,67 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+      }
+    }
+  }
+}
+op {
+  name: "AvgPoolGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
         type: DT_HALF
+        type: DT_DOUBLE
       }
     }
   }
@@ -2091,7 +2406,68 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_HALF
+      }
+    }
+  }
+}
+op {
+  name: "AvgPoolGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_HALF
+        type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
@@ -2309,17 +2685,45 @@ op {
   }
 }
 op {
-  name: "BatchFFT"
+  name: "BatchDataset"
   input_arg {
-    name: "input"
-    type: DT_COMPLEX64
+    name: "input_dataset"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    type: DT_COMPLEX64
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
   }
-  deprecation {
-    version: 15
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BatchFFT"
+  input_arg {
+    name: "input"
+    type: DT_COMPLEX64
+  }
+  output_arg {
+    name: "output"
+    type: DT_COMPLEX64
+  }
+  deprecation {
+    version: 15
   }
 }
 op {
@@ -3328,6 +3732,33 @@ op {
     }
   }
 }
+op {
+  name: "Bucketize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+  }
+}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -3452,6 +3883,54 @@ op {
     }
   }
 }
+op {
+  name: "CTCLoss"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "labels_indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "labels_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "sequence_length"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "loss"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "preprocess_collapse_repeated"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "ctc_merge_repeated"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "Cast"
   input_arg {
@@ -3884,7 +4363,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -3948,7 +4426,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4012,7 +4489,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4072,18 +4548,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4125,18 +4589,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4195,18 +4647,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4255,18 +4695,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4312,18 +4740,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4382,18 +4798,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4442,18 +4846,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4499,18 +4891,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4567,6 +4947,60 @@ op {
   }
   allows_uninitialized_input: true
 }
+op {
+  name: "Copy"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "CopyHost"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  allows_uninitialized_input: true
+}
 op {
   name: "CopyHost"
   input_arg {
@@ -4588,6 +5022,14 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
   allows_uninitialized_input: true
 }
 op {
@@ -4986,6 +5428,75 @@ op {
   }
   allows_uninitialized_input: true
 }
+op {
+  name: "DebugIdentity"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
 op {
   name: "DebugNanCount"
   input_arg {
@@ -5015,6 +5526,44 @@ op {
       }
     }
   }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
   allows_uninitialized_input: true
 }
 op {
@@ -5046,6 +5595,27 @@ op {
       }
     }
   }
+  attr {
+    name: "lower_bound"
+    type: "float"
+    default_value {
+      f: -inf
+    }
+  }
+  attr {
+    name: "upper_bound"
+    type: "float"
+    default_value {
+      f: inf
+    }
+  }
+  attr {
+    name: "mute_if_healthy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   allows_uninitialized_input: true
 }
 op {
@@ -5098,6 +5668,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   allows_uninitialized_input: true
 }
 op {
@@ -5111,6 +5688,24 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "DecodeBmp"
+  input_arg {
+    name: "contents"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    type: DT_UINT8
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
 op {
   name: "DecodeCSV"
   input_arg {
@@ -5372,6 +5967,38 @@ op {
     }
   }
 }
+op {
+  name: "DenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "DenseToSparseSetOperation"
   input_arg {
@@ -6243,17 +6870,33 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
+op {
+  name: "Elu"
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "EluGrad"
   input_arg {
@@ -6275,17 +6918,37 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
+op {
+  name: "EluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "outputs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "EncodeBase64"
   input_arg {
@@ -6899,17 +7562,13 @@ op {
   }
 }
 op {
-  name: "FakeQuantWithMinMaxArgsGradient"
-  input_arg {
-    name: "gradients"
-    type: DT_FLOAT
-  }
+  name: "FakeQuantWithMinMaxArgs"
   input_arg {
     name: "inputs"
     type: DT_FLOAT
   }
   output_arg {
-    name: "backprops"
+    name: "outputs"
     type: DT_FLOAT
   }
   attr {
@@ -6926,15 +7585,106 @@ op {
       f: 6
     }
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
 }
 op {
-  name: "FakeQuantWithMinMaxVars"
+  name: "FakeQuantWithMinMaxArgsGradient"
   input_arg {
-    name: "inputs"
+    name: "gradients"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min"
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxArgsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "min"
+    type: "float"
+    default_value {
+      f: -6
+    }
+  }
+  attr {
+    name: "max"
+    type: "float"
+    default_value {
+      f: 6
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVars"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
     type: DT_FLOAT
   }
   input_arg {
@@ -6945,6 +7695,44 @@ op {
     name: "outputs"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
 }
 op {
   name: "FakeQuantWithMinMaxVarsGradient"
@@ -6976,6 +7764,32 @@ op {
     name: "backprop_wrt_max"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannel"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+  }
 }
 op {
   name: "FakeQuantWithMinMaxVarsPerChannel"
@@ -6995,6 +7809,44 @@ op {
     name: "outputs"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+}
+op {
+  name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprops_wrt_input"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "backprop_wrt_max"
+    type: DT_FLOAT
+  }
 }
 op {
   name: "FakeQuantWithMinMaxVarsPerChannelGradient"
@@ -7026,6 +7878,13 @@ op {
     name: "backprop_wrt_max"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
 }
 op {
   name: "FakeQueue"
@@ -7059,6 +7918,67 @@ op {
     type: "type"
   }
 }
+op {
+  name: "FilterDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
 op {
   name: "FixedLengthRecordReader"
   output_arg {
@@ -7101,10 +8021,11 @@ op {
   is_stateful: true
 }
 op {
-  name: "FixedLengthRecordReaderV2"
+  name: "FixedLengthRecordReader"
   output_arg {
     name: "reader_handle"
-    type: DT_RESOURCE
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "header_bytes"
@@ -7125,10 +8046,104 @@ op {
     }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "hop_bytes"
+    type: "int"
     default_value {
-      s: ""
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "header_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "record_bytes"
+    type: "int"
+  }
+  attr {
+    name: "footer_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
@@ -7241,6 +8256,145 @@ op {
     }
   }
 }
+op {
+  name: "FixedUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "vocab_file"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "distortion"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "num_reserved_ids"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+    default_value {
+      i: 1
+    }
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shard"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "unigrams"
+    type: "list(float)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "FlatMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Floor"
   input_arg {
@@ -7604,19 +8758,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -7690,19 +8831,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -7751,9 +8879,7 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -7809,9 +8935,7 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -7946,7 +9070,22 @@ op {
   }
 }
 op {
-  name: "GetSessionHandleV2"
+  name: "GetSessionHandle"
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "GetSessionHandleV2"
   input_arg {
     name: "value"
     type_attr: "T"
@@ -8040,6 +9179,60 @@ op {
     }
   }
 }
+op {
+  name: "GroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "HSVToRGB"
   input_arg {
@@ -8102,6 +9295,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "HashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "HistogramSummary"
   input_arg {
@@ -8533,6 +9763,70 @@ op {
     }
   }
 }
+op {
+  name: "InitializeTableFromTextFileV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InitializeTableV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "Inv"
   input_arg {
@@ -8700,6 +9994,66 @@ op {
   }
   allows_uninitialized_input: true
 }
+op {
+  name: "Iterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorDispose"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNext"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "L2Loss"
   input_arg {
@@ -8717,22 +10071,33 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
 }
+op {
+  name: "L2Loss"
+  input_arg {
+    name: "t"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "LRN"
   input_arg {
@@ -8901,20 +10266,76 @@ op {
   }
 }
 op {
-  name: "Less"
+  name: "LearnedUnigramCandidateSampler"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "true_classes"
+    type: DT_INT64
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
-  attr {
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Less"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
     name: "T"
     type: "type"
     allowed_values {
@@ -9189,6 +10610,62 @@ op {
     }
   }
 }
+op {
+  name: "LogUniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "LogicalAnd"
   input_arg {
@@ -9256,6 +10733,30 @@ op {
     type: "type"
   }
 }
+op {
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableFind"
   input_arg {
@@ -9284,6 +10785,34 @@ op {
     type: "type"
   }
 }
+op {
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableImport"
   input_arg {
@@ -9308,6 +10837,30 @@ op {
     type: "type"
   }
 }
+op {
+  name: "LookupTableImportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableInsert"
   input_arg {
@@ -9332,6 +10885,30 @@ op {
     type: "type"
   }
 }
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableSize"
   input_arg {
@@ -9344,6 +10921,18 @@ op {
     type: DT_INT64
   }
 }
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
 op {
   name: "LoopCond"
   input_arg {
@@ -9356,37 +10945,86 @@ op {
   }
 }
 op {
-  name: "MatMul"
+  name: "MakeIterator"
   input_arg {
-    name: "a"
-    type_attr: "T"
+    name: "dataset"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "b"
-    type_attr: "T"
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "product"
-    type_attr: "T"
+    name: "handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "f"
+    type: "func"
   }
   attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
         type: DT_HALF
         type: DT_FLOAT
@@ -9587,27 +11225,462 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "orig_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
+  input_arg {
+    name: "orig_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "orig_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_FLOAT
       }
     }
   }
-  attr {
-    name: "fast"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "MaxPool3DGrad"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "orig_input"
+    type_attr: "TInput"
   }
   input_arg {
-    name: "rhs"
+    name: "orig_output"
+    type_attr: "TInput"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -9615,110 +11688,146 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_DOUBLE
         type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "Max"
+  name: "MaxPool3DGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
-}
-op {
-  name: "MaxPool"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
       }
     }
   }
+}
+op {
+  name: "MaxPoolGrad"
+  input_arg {
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -9754,11 +11863,32 @@ op {
       }
     }
   }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
 }
 op {
-  name: "MaxPool3D"
+  name: "MaxPoolGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -9769,13 +11899,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -9787,33 +11917,52 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -9824,13 +11973,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -9846,12 +11995,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "NHWC"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -9862,36 +12011,31 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
-      }
-    }
-  }
-}
-op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
+      }
+    }
   }
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -9900,13 +12044,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -9918,6 +12062,16 @@ op {
       }
     }
   }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -9925,36 +12079,31 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
-  }
+  name: "MaxPoolGradWithArgmax"
   input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -9963,13 +12112,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 5
+    minimum: 4
   }
   attr {
     name: "padding"
@@ -9982,54 +12131,42 @@ op {
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGrad"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
-    name: "orig_input"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "orig_output"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
@@ -10058,50 +12195,47 @@ op {
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "grad"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
-  input_arg {
+  output_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -10115,22 +12249,25 @@ op {
     minimum: 4
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
@@ -10200,12 +12337,16 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
       }
     }
@@ -10359,6 +12500,49 @@ op {
     }
   }
 }
+op {
+  name: "Mfcc"
+  input_arg {
+    name: "spectrogram"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "upper_frequency_limit"
+    type: "float"
+    default_value {
+      f: 4000
+    }
+  }
+  attr {
+    name: "lower_frequency_limit"
+    type: "float"
+    default_value {
+      f: 20
+    }
+  }
+  attr {
+    name: "filterbank_channel_count"
+    type: "int"
+    default_value {
+      i: 40
+    }
+  }
+  attr {
+    name: "dct_coefficient_count"
+    type: "int"
+    default_value {
+      i: 13
+    }
+  }
+}
 op {
   name: "Min"
   input_arg {
@@ -10601,53 +12785,116 @@ op {
     name: "num_samples"
     type: DT_INT32
   }
-  output_arg {
-    name: "output"
-    type: DT_INT64
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTable"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
   }
   attr {
-    name: "seed"
-    type: "int"
+    name: "value_shape"
+    type: "shape"
     default_value {
-      i: 0
+      shape {
+      }
     }
   }
   attr {
-    name: "seed2"
+    name: "initial_num_buckets"
     type: "int"
     default_value {
-      i: 0
+      i: 131072
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
     }
   }
   is_stateful: true
 }
 op {
-  name: "MutableDenseHashTable"
+  name: "MutableDenseHashTableV2"
   input_arg {
     name: "empty_key"
     type_attr: "key_dtype"
   }
   output_arg {
     name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -10786,6 +13033,88 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "Neg"
   input_arg {
@@ -10893,6 +13222,29 @@ op {
     }
   }
 }
+op {
+  name: "NonMaxSuppressionV2"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
+  }
+}
 op {
   name: "NotEqual"
   input_arg {
@@ -10980,6 +13332,44 @@ op {
     }
   }
 }
+op {
+  name: "OneShotIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "dataset_factory"
+    type: "func"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "OnesLike"
   input_arg {
@@ -11066,6 +13456,49 @@ op {
     }
   }
 }
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "PaddingFIFOQueue"
   output_arg {
@@ -11181,6 +13614,51 @@ op {
     type: "shape"
   }
 }
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_threads"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "ParameterizedTruncatedNormal"
   input_arg {
@@ -11523,12 +14001,31 @@ op {
   }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
   }
   attr {
-    name: "out_type"
+    name: "dtype"
     type: "type"
   }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
 }
 op {
   name: "Placeholder"
@@ -11545,6 +14042,7 @@ op {
     type: "shape"
     default_value {
       shape {
+        unknown_rank: true
       }
     }
   }
@@ -11564,6 +14062,24 @@ op {
     type: "shape"
   }
 }
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  deprecation {
+    version: 23
+  }
+}
 op {
   name: "PlaceholderWithDefault"
   input_arg {
@@ -14078,6 +16594,38 @@ op {
     }
   }
 }
+op {
+  name: "RangeDataset"
+  input_arg {
+    name: "start"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stop"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Rank"
   input_arg {
@@ -14797,6 +17345,34 @@ op {
     }
   }
 }
+op {
+  name: "RepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "RequantizationRange"
   input_arg {
@@ -15395,6 +17971,86 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyAdam"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyCenteredRMSProp"
   input_arg {
@@ -16479,6 +19135,79 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceStridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Restore"
   input_arg {
@@ -16546,18 +19275,51 @@ op {
     type: DT_STRING
   }
   input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Reverse"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dims"
+    type: DT_BOOL
   }
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
@@ -16589,6 +19351,7 @@ op {
         type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
@@ -16682,6 +19445,53 @@ op {
     }
   }
 }
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "Rint"
   input_arg {
@@ -17903,6 +20713,40 @@ op {
     }
   }
 }
+op {
+  name: "SelfAdjointEigV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SerializeManySparse"
   input_arg {
@@ -18088,6 +20932,42 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "ShuffleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Sigmoid"
   input_arg {
@@ -18218,6 +21098,34 @@ op {
     }
   }
 }
+op {
+  name: "SkipDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Skipgram"
   output_arg {
@@ -19588,61 +22496,154 @@ op {
     name: "N"
     type: "int"
     has_minimum: true
-    minimum: 2
+    minimum: 2
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "SparseConditionalAccumulator"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "SparseCross"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "sparse_types"
+  }
+  input_arg {
+    name: "shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "dense_inputs"
+    type_list_attr: "dense_types"
+  }
+  output_arg {
+    name: "output_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "hashed_output"
+    type: "bool"
   }
-}
-op {
-  name: "SparseConditionalAccumulator"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
   }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "hash_key"
+    type: "int"
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_STRING
       }
     }
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "dense_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "internal_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
     }
   }
-  is_stateful: true
 }
 op {
   name: "SparseDenseCwiseAdd"
@@ -20556,6 +23557,84 @@ op {
     }
   }
 }
+op {
+  name: "SparseTensorDenseMatMul"
+  input_arg {
+    name: "a_indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "a_values"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "a_shape"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "adjoint_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adjoint_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "SparseTensorSliceDataset"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  input_arg {
+    name: "dense_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "SparseToDense"
   input_arg {
@@ -20951,20 +24030,146 @@ op {
     minimum: 1
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "T"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
 }
 op {
   name: "StopGradient"
@@ -21571,6 +24776,22 @@ op {
     type: "func"
   }
 }
+op {
+  name: "TFRecordDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
 op {
   name: "TFRecordReader"
   output_arg {
@@ -21630,6 +24851,34 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TakeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "TakeManySparseFromTensorsMap"
   input_arg {
@@ -22653,6 +25902,54 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorSliceDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "TensorSummary"
   input_arg {
@@ -22690,6 +25987,18 @@ op {
     }
   }
 }
+op {
+  name: "TextLineDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
 op {
   name: "TextLineReader"
   output_arg {
@@ -22804,6 +26113,62 @@ op {
     }
   }
 }
+op {
+  name: "ThreadUnsafeUnigramCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Tile"
   input_arg {
@@ -23143,6 +26508,62 @@ op {
     }
   }
 }
+op {
+  name: "UniformCandidateSampler"
+  input_arg {
+    name: "true_classes"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unique"
   input_arg {
@@ -23533,3 +26954,34 @@ op {
     }
   }
 }
+op {
+  name: "ZipDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_RESOURCE
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/control_flow_ops.cc b/tensorflow/core/ops/control_flow_ops.cc
index 71520f3caaf2fa0c2987e4121254aa2dd0bbd9bc..95f3f0da9db0b0e42fee670b0ff771a6c20324b6 100644
--- a/tensorflow/core/ops/control_flow_ops.cc
+++ b/tensorflow/core/ops/control_flow_ops.cc
@@ -159,7 +159,7 @@ Forwards the value of an available tensor from `inputs` to `output`.
 `Merge` waits for at least one of the tensors in `inputs` to become available.
 It is usually combined with `Switch` to implement branching.
 
-`Merge` forwards the first tensor for become available to `output`, and sets
+`Merge` forwards the first tensor to become available to `output`, and sets
 `value_index` to its index in `inputs`.
 
 inputs: The input tensors, exactly one of which will become available.
@@ -331,7 +331,10 @@ REGISTER_OP("Abort")
     .Attr("exit_without_error: bool = false")
     .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
-Raise a exception to abort the process when called. If exit_without_error is true, the process will exit normally, otherwise it will exit with a SIGABORT signal.
+Raise a exception to abort the process when called.
+
+If exit_without_error is true, the process will exit normally,
+otherwise it will exit with a SIGABORT signal.
 
 Returns nothing but an exception.
 
diff --git a/tensorflow/core/ops/ctc_ops.cc b/tensorflow/core/ops/ctc_ops.cc
index c94ce577c0beb8da9cff476a1528c7d67c6df22a..3d8c53393560e9cc31549f6574db72fbfbe35ce9 100644
--- a/tensorflow/core/ops/ctc_ops.cc
+++ b/tensorflow/core/ops/ctc_ops.cc
@@ -31,6 +31,7 @@ REGISTER_OP("CTCLoss")
     .Input("sequence_length: int32")
     .Attr("preprocess_collapse_repeated: bool = false")
     .Attr("ctc_merge_repeated: bool = true")
+    .Attr("ignore_longer_outputs_than_inputs: bool = false")
     .Output("loss: float")
     .Output("gradient: float")
     .SetShapeFn([](InferenceContext* c) {
@@ -75,6 +76,9 @@ preprocess_collapse_repeated: Scalar, if true then repeated labels are
 ctc_merge_repeated: Scalar.  If set to false, *during* CTC calculation
   repeated non-blank labels will not be merged and are interpreted as
   individual labels.  This is a simplified version of CTC.
+ignore_longer_outputs_than_inputs: Scalar. If set to true, during CTC
+  calculation items have longer input sequences than output sequences
+  are ignored by returning zero-gradient for those items.
 loss: A vector (batch) containing log-probabilities.
 gradient: The gradient of `loss`.  3-D, shape:
   `(max_time x batch_size x num_classes)`.
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 10b5df91f187ed7d6b0baeffceec30fd461840a2..282778e495d197459635d673acdca59d65d0b678 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -101,8 +101,10 @@ For example:
     outputs[1] = [30, 40]
 ```
 
+See `dynamic_stitch` for an example on how to merge partitions back.
+
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/DynamicPartition.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
 </div>
 
 partitions: Any shape.  Indices in the range `[0, num_partitions)`.
@@ -120,7 +122,7 @@ REGISTER_OP("DynamicStitch")
       TF_RETURN_IF_ERROR(c->GetAttr("N", &num_partitions));
 
       ShapeHandle extra_shape = c->UnknownShape();
-      for (int i = 0; i < num_partitions; ++i) {
+      for (int64 i = 0; i < num_partitions; ++i) {
         ShapeHandle indices_shape = c->input(i);
         ShapeHandle data_shape = c->input(i + num_partitions);
         if (!c->RankKnown(indices_shape)) {
@@ -189,8 +191,26 @@ For example:
               [51, 52], [61, 62]]
 ```
 
+This method can be used to merge partitions created by `dynamic_partition`
+as illustrated on the following example:
+
+```python
+    # Apply function (increments x_i) on elements for which a certain condition
+    # apply (x_i != -1 in this example).
+    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+    condition_mask=tf.not_equal(x,tf.constant(-1.))
+    partitioned_data = tf.dynamic_partition(
+        x, tf.cast(condition_mask, tf.int32) , 2)
+    partitioned_data[1] = partitioned_data[1] + 1.0
+    condition_indices = tf.dynamic_partition(
+        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+    x = tf.dynamic_stitch(condition_indices, partitioned_data)
+    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+    # unchanged.
+```
+
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/DynamicStitch.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
 </div>
 )doc");
 
@@ -210,10 +230,29 @@ Status TwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
   return Status::OK();
 }
 
+Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
+  ShapeHandle handle;
+  DimensionHandle unused_handle;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+  for (int i = 1; i < c->num_inputs(); ++i) {
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
+    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
+  }
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
 Status TwoElementOutput(InferenceContext* c) {
   c->set_output(0, c->Vector(2));
   return Status::OK();
 }
+
+Status ScalarOutput(InferenceContext* c) {
+  c->set_output(0, c->Scalar());
+  return Status::OK();
+}
 }  // namespace
 
 REGISTER_OP("RandomShuffleQueue")
@@ -604,7 +643,17 @@ REGISTER_OP("QueueDequeueV2")
     .Output("components: component_types")
     .Attr("component_types: list(type) >= 1")
     .Attr("timeout_ms: int = -1")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn([](InferenceContext* c) {
+      if (c->num_outputs() == 1) {
+        c->set_output(0, c->input_handle_shape(0));
+      } else {
+        // TODO(vrv): handle the case of multiple outputs.
+        for (int i = 0; i < c->num_outputs(); ++i) {
+          c->set_output(i, c->UnknownShape());
+        }
+      }
+      return Status::OK();
+    })
     .Doc(R"doc(
 Dequeues a tuple of one or more tensors from the given queue.
 
@@ -631,20 +680,20 @@ REGISTER_OP("QueueDequeueMany")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Dequeues n tuples of one or more tensors from the given queue.
+Dequeues `n` tuples of one or more tensors from the given queue.
 
-If the queue is closed and there are fewer than n elements, then an
+If the queue is closed and there are fewer than `n` elements, then an
 OutOfRange error is returned.
 
 This operation concatenates queue-element component tensors along the
 0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size n in the 0th dimension.
+in the dequeued tuple will have size `n` in the 0th dimension.
 
-This operation has k outputs, where k is the number of components in
-the tuples stored in the given queue, and output i is the ith
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
 component of the dequeued tuple.
 
-N.B. If the queue is empty, this operation will block until n elements
+N.B. If the queue is empty, this operation will block until `n` elements
 have been dequeued (or 'timeout_ms' elapses, if specified).
 
 handle: The handle to a queue.
@@ -664,20 +713,20 @@ REGISTER_OP("QueueDequeueManyV2")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Dequeues n tuples of one or more tensors from the given queue.
+Dequeues `n` tuples of one or more tensors from the given queue.
 
-If the queue is closed and there are fewer than n elements, then an
+If the queue is closed and there are fewer than `n` elements, then an
 OutOfRange error is returned.
 
 This operation concatenates queue-element component tensors along the
 0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size n in the 0th dimension.
+in the dequeued tuple will have size `n` in the 0th dimension.
 
-This operation has k outputs, where k is the number of components in
-the tuples stored in the given queue, and output i is the ith
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
 component of the dequeued tuple.
 
-N.B. If the queue is empty, this operation will block until n elements
+N.B. If the queue is empty, this operation will block until `n` elements
 have been dequeued (or 'timeout_ms' elapses, if specified).
 
 handle: The handle to a queue.
@@ -697,24 +746,24 @@ REGISTER_OP("QueueDequeueUpTo")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Dequeues n tuples of one or more tensors from the given queue.
+Dequeues `n` tuples of one or more tensors from the given queue.
 
 This operation is not supported by all queues.  If a queue does not support
 DequeueUpTo, then an Unimplemented error is returned.
 
-If the queue is closed and there are more than 0 but less than n elements
-remaining, then instead of returning an OutOfRange error like
-QueueDequeueMany, less than `n` elements are returned immediately.  If the queue
-is closed and there are 0 elements left in the queue, then an OutOfRange
-error is returned just like in QueueDequeueMany.  Otherwise the behavior
-is identical to QueueDequeueMany:
+If the queue is closed and there are more than 0 but less than `n`
+elements remaining, then instead of returning an OutOfRange error like
+QueueDequeueMany, less than `n` elements are returned immediately.  If
+the queue is closed and there are 0 elements left in the queue, then
+an OutOfRange error is returned just like in QueueDequeueMany.
+Otherwise the behavior is identical to QueueDequeueMany:
 
 This operation concatenates queue-element component tensors along the
 0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size n in the 0th dimension.
+in the dequeued tuple will have size `n` in the 0th dimension.
 
-This operation has k outputs, where k is the number of components in
-the tuples stored in the given queue, and output i is the ith
+This operation has k outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
 component of the dequeued tuple.
 
 handle: The handle to a queue.
@@ -734,24 +783,24 @@ REGISTER_OP("QueueDequeueUpToV2")
     .Attr("timeout_ms: int = -1")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
-Dequeues n tuples of one or more tensors from the given queue.
+Dequeues `n` tuples of one or more tensors from the given queue.
 
 This operation is not supported by all queues.  If a queue does not support
 DequeueUpTo, then an Unimplemented error is returned.
 
-If the queue is closed and there are more than 0 but less than n elements
-remaining, then instead of returning an OutOfRange error like
-QueueDequeueMany, less than `n` elements are returned immediately.  If the queue
-is closed and there are 0 elements left in the queue, then an OutOfRange
-error is returned just like in QueueDequeueMany.  Otherwise the behavior
-is identical to QueueDequeueMany:
+If the queue is closed and there are more than 0 but less than `n`
+elements remaining, then instead of returning an OutOfRange error like
+QueueDequeueMany, less than `n` elements are returned immediately.  If
+the queue is closed and there are 0 elements left in the queue, then
+an OutOfRange error is returned just like in QueueDequeueMany.
+Otherwise the behavior is identical to QueueDequeueMany:
 
 This operation concatenates queue-element component tensors along the
 0th dimension to make a single component tensor.  All of the components
 in the dequeued tuple will have size n in the 0th dimension.
 
-This operation has k outputs, where k is the number of components in
-the tuples stored in the given queue, and output i is the ith
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
 component of the dequeued tuple.
 
 handle: The handle to a queue.
@@ -778,7 +827,7 @@ operations that would block will fail immediately.
 
 handle: The handle to a queue.
 cancel_pending_enqueues: If true, all pending enqueue requests that are
-  blocked on the given queue will be cancelled.
+  blocked on the given queue will be canceled.
 )doc");
 
 REGISTER_OP("QueueCloseV2")
@@ -796,7 +845,7 @@ operations that would block will fail immediately.
 
 handle: The handle to a queue.
 cancel_pending_enqueues: If true, all pending enqueue requests that are
-  blocked on the given queue will be cancelled.
+  blocked on the given queue will be canceled.
 )doc");
 
 REGISTER_OP("QueueSize")
@@ -843,8 +892,10 @@ REGISTER_OP("AccumulatorSetGlobalStep")
       return Status::OK();
     })
     .Doc(R"doc(
-Updates the accumulator with a new value for global_step. Logs warning if the
-accumulator's value is already higher than new_global_step.
+Updates the accumulator with a new value for global_step.
+
+Logs warning if the accumulator's value is already higher than
+new_global_step.
 
 handle: The handle to an accumulator.
 new_global_step: The new global_step value to set.
@@ -862,20 +913,22 @@ REGISTER_OP("ConditionalAccumulator")
       return Status::OK();
     })
     .Doc(R"doc(
-A conditional accumulator for aggregating gradients. The accumulator accepts
-gradients marked with local_step greater or equal to the most recent global_step
-known to the accumulator. The average can be extracted from the accumulator,
-provided sufficient gradients have been accumulated. Extracting the average
-automatically resets the aggregate to 0, and increments the global_step recorded
-by the accumulator.
+A conditional accumulator for aggregating gradients.
+
+The accumulator accepts gradients marked with local_step greater or
+equal to the most recent global_step known to the accumulator. The
+average can be extracted from the accumulator, provided sufficient
+gradients have been accumulated. Extracting the average automatically
+resets the aggregate to 0, and increments the global_step recorded by
+the accumulator.
 
 handle: The handle to the accumulator.
 dtype: The type of the value being accumulated.
 shape: The shape of the values, can be [], in which case shape is unknown.
 container: If non-empty, this accumulator is placed in the given container.
   Otherwise, a default container is used.
-shared_name: If non-empty, this accumulator will be shared under the given name
-  across multiple sessions.
+shared_name: If non-empty, this accumulator will be shared under the
+  given name across multiple sessions.
 )doc");
 
 REGISTER_OP("AccumulatorApplyGradient")
@@ -889,8 +942,9 @@ REGISTER_OP("AccumulatorApplyGradient")
       return Status::OK();
     })
     .Doc(R"doc(
-Applies a gradient to a given accumulator. Does not add if local_step is lesser
-than the accumulator's global_step.
+Applies a gradient to a given accumulator.
+
+Does not add if local_step is lesser than the accumulator's global_step.
 
 handle: The handle to a accumulator.
 local_step: The local_step value at which the gradient was computed.
@@ -913,13 +967,13 @@ REGISTER_OP("AccumulatorTakeGradient")
     })
     .Attr("dtype: numbertype")
     .Doc(R"doc(
-Extracts the average gradient in the given ConditionalAccumulator, provided
-that sufficient (i.e., more than num_required) gradients have been accumulated.
-The op blocks until sufficient gradients have been accumulated.
-If the accumulator has already aggregated more than num_required gradients, it
-returns the average of the accumulated gradients.
-Also automatically increments the recorded global_step in the accumulator by 1,
-and resets the aggregate to 0.
+Extracts the average gradient in the given ConditionalAccumulator.
+
+The op blocks until sufficient (i.e., more than num_required)
+gradients have been accumulated.  If the accumulator has already
+aggregated more than num_required gradients, it returns the average of
+the accumulated gradients.  Also automatically increments the recorded
+global_step in the accumulator by 1, and resets the aggregate to 0.
 
 handle: The handle to an accumulator.
 num_required: Number of gradients required before we return an aggregate.
@@ -940,12 +994,14 @@ REGISTER_OP("SparseConditionalAccumulator")
       return Status::OK();
     })
     .Doc(R"doc(
-A conditional accumulator for aggregating sparse gradients. The accumulator
-accepts gradients marked with local_step greater or equal to the most recent
-global_step known to the accumulator. The average can be extracted from the
-accumulator, provided sufficient gradients have been accumulated. Extracting the
-average automatically resets the aggregate to 0, and increments the global_step
-recorded by the accumulator.
+A conditional accumulator for aggregating sparse gradients.
+
+The accumulator accepts gradients marked with local_step greater or
+equal to the most recent global_step known to the accumulator. The
+average can be extracted from the accumulator, provided sufficient
+gradients have been accumulated. Extracting the average automatically
+resets the aggregate to 0, and increments the global_step recorded by
+the accumulator.
 
 handle: The handle to the accumulator.
 dtype: The type of the value being accumulated.
@@ -970,8 +1026,10 @@ REGISTER_OP("SparseAccumulatorApplyGradient")
       return Status::OK();
     })
     .Doc(R"doc(
-Applies a sparse gradient to a given accumulator. Does not add if local_step is
-lesser than the accumulator's global_step.
+Applies a sparse gradient to a given accumulator.
+
+Does not add if local_step is smaller than the accumulator's
+global_step.
 
 handle: The handle to a accumulator.
 local_step: The local_step value at which the sparse gradient was computed.
@@ -1003,13 +1061,14 @@ REGISTER_OP("SparseAccumulatorTakeGradient")
       return shape_inference::UnknownShape(c);
     })
     .Doc(R"doc(
-Extracts the average sparse gradient in the given SparseConditionalAccumulator,
-provided that sufficient (i.e., more than num_required) gradients have been
-accumulated. The op will blocks until sufficient gradients have been
-accumulated. If the accumulator has already aggregated more than num_required
-gradients, it will return its average of the accumulated gradients.
-Also automatically increments the recorded global_step in the accumulator by 1,
-and resets the aggregate to 0.
+Extracts the average sparse gradient in a SparseConditionalAccumulator.
+
+The op will blocks until sufficient (i.e., more than num_required)
+gradients have been accumulated. If the accumulator has already
+aggregated more than num_required gradients, it will return its
+average of the accumulated gradients.  Also automatically increments
+the recorded global_step in the accumulator by 1, and resets the
+aggregate to 0.
 
 handle: The handle to a SparseConditionalAccumulator.
 num_required: Number of gradients required before we return an aggregate.
@@ -1043,7 +1102,10 @@ REGISTER_OP("StackPush")
     .Output("output: T")
     .Attr("T: type")
     .Attr("swap_memory: bool = false")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(1));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Push an element onto the stack.
 
@@ -1095,8 +1157,9 @@ REGISTER_OP("TensorArrayV3")
       return Status::OK();
     })
     .Doc(R"doc(
-An array of Tensors of given size, with data written via Write and read
-via Read or Pack.
+An array of Tensors of given size.
+
+Write data via Write and read via Read or Pack.
 
 handle: The handle to the TensorArray.
 flow: A scalar used to control gradient flow.
@@ -1412,8 +1475,10 @@ REGISTER_OP("TensorArrayCloseV3")
       return Status::OK();
     })
     .Doc(R"doc(
-Delete the TensorArray from its resource container.  This enables
-the user to close and release the resource in the middle of a step/run.
+Delete the TensorArray from its resource container.
+
+This enables the user to close and release the resource in the middle
+of a step/run.
 
 handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 )doc");
@@ -1814,7 +1879,7 @@ Subsequent TakeMany operations that would block will fail immediately.
 
 handle: The handle to a barrier.
 cancel_pending_enqueues: If true, all pending enqueue requests that are
-  blocked on the barrier's queue will be cancelled. InsertMany will fail, even
+  blocked on the barrier's queue will be canceled. InsertMany will fail, even
   if no new key is introduced.
 )doc");
 
@@ -1844,392 +1909,375 @@ size: The number of incomplete elements (i.e. those with some of their value
 
 // --------------------------------------------------------------------------
 
-REGISTER_OP("LookupTableFind")
-    .Input("table_handle: Ref(string)")
-    .Input("keys: Tin")
-    .Input("default_value: Tout")
-    .Output("values: Tout")
-    .Attr("Tin: type")
-    .Attr("Tout: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      // Default value must be scalar or vector.
-      ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
-      c->set_output(0, c->UnknownShape());
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Looks up keys in a table, outputs the corresponding values.
-
-The tensor `keys` must of the same type as the keys of the table.
-The output `values` is of the type of the table values.
-
-The scalar `default_value` is the value output for keys not present in the
-table. It must also be of the same type as the table values.
-
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Same shape as `keys`.  Values found in the table, or `default_values`
-   for missing keys.
-)doc");
-
-REGISTER_OP("LookupTableInsert")
-    .Input("table_handle: Ref(string)")
-    .Input("keys: Tin")
-    .Input("values: Tout")
-    .Attr("Tin: type")
-    .Attr("Tout: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      // TODO: Validate keys and values shape.
-      return Status::OK();
-    })
+REGISTER_OP("GetSessionHandle")
+    .Input("value: T")
+    .Output("handle: string")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
-Updates the table to associates keys with values.
-
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
+Store the input tensor in the state of the current session.
 
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Values to associate with keys.
+value: The tensor to be stored.
+handle: The handle for the tensor stored in the session state, represented
+  as a string.
 )doc");
 
-REGISTER_OP("LookupTableSize")
-    .Input("table_handle: Ref(string)")
-    .Output("size: int64")
-    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
+REGISTER_OP("GetSessionHandleV2")
+    .Input("value: T")
+    .Output("handle: resource")
+    .Attr("T: type")
+    .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
-Computes the number of elements in the given table.
+Store the input tensor in the state of the current session.
 
-table_handle: Handle to the table.
-size: Scalar that contains number of elements in the table.
+value: The tensor to be stored.
+handle: The handle for the tensor stored in the session state, represented
+  as a ResourceHandle object.
 )doc");
 
-REGISTER_OP("LookupTableExport")
-    .Input("table_handle: Ref(string)")
-    .Output("keys: Tkeys")
-    .Output("values: Tvalues")
-    .Attr("Tkeys: type")
-    .Attr("Tvalues: type")
+REGISTER_OP("GetSessionTensor")
+    .Input("handle: string")
+    .Output("value: dtype")
+    .Attr("dtype: type")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      ShapeHandle values = c->UnknownShape();
-      TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
-      ShapeHandle keys = c->Vector(c->Dim(values, 0));
-      c->set_output(0, keys);
-      c->set_output(1, values);
-      return Status::OK();
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      return shape_inference::UnknownShape(c);
     })
     .Doc(R"doc(
-Outputs all keys and values in the table.
+Get the value of the tensor specified by its handle.
 
-table_handle: Handle to the table.
-keys: Vector of all keys present in the table.
-values: Tensor of all values in the table. Indexed in parallel with `keys`.
+handle: The handle for a tensor stored in the session state.
+value: The tensor for the given handle.
+dtype: The type of the output value.
 )doc");
 
-REGISTER_OP("LookupTableImport")
-    .Input("table_handle: Ref(string)")
-    .Input("keys: Tin")
-    .Input("values: Tout")
-    .Attr("Tin: type")
-    .Attr("Tout: type")
+REGISTER_OP("DeleteSessionTensor")
+    .Input("handle: string")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      // TODO: Validate keys and values shape.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
       return Status::OK();
     })
     .Doc(R"doc(
-Replaces the contents of the table with the specified keys and values.
-
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
+Delete the tensor specified by its handle in the session.
 
-table_handle: Handle to the table.
-keys:  Any shape.  Keys to look up.
-values: Values to associate with keys.
+handle: The handle for a tensor stored in the session state.
 )doc");
 
-REGISTER_OP("HashTable")
-    .Output("table_handle: Ref(string)")
+REGISTER_OP("Stage")
+    .Input("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
-    .Attr("use_node_name_sharing: bool = false")
-    .Attr("key_dtype: type")
-    .Attr("value_dtype: type")
+    .SetShapeFn(shape_inference::UnknownShape)
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
     .Doc(R"doc(
-Creates a non-initialized hash table.
+Stage values similar to a lightweight Enqueue.
 
-This op creates a hash table, specifying the type of its keys and values.
-Before using the table you will have to initialize it.  After initialization the
-table will be immutable.
+The basic functionality of this Op is similar to a queue with many
+fewer capabilities and options.  This Op is optimized for performance.
 
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-use_node_name_sharing: If true and shared_name is empty, the table is shared
-  using the node name.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
+values: a list of tensors
+dtypes A list of data types that inserted values should adhere to.
+capacity: Maximum number of elements in the Staging Area. If > 0, inserts
+  on the container will block when the capacity is reached.
+memory_limit: The maximum number of bytes allowed for Tensors in the Staging Area.
+  If > 0, inserts will block until sufficient space is available.
+container: If non-empty, this queue is placed in the given container. Otherwise,
+  a default container is used.
+shared_name: It is necessary to match this name to the matching Unstage Op.
 )doc");
 
-REGISTER_OP("MutableHashTable")
-    .Output("table_handle: Ref(string)")
+REGISTER_OP("Unstage")
+    .Output("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
-    .Attr("use_node_name_sharing: bool = false")
-    .Attr("key_dtype: type")
-    .Attr("value_dtype: type")
+    .SetShapeFn(shape_inference::UnknownShape)
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
     .Doc(R"doc(
-Creates an empty hash table.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
+Op is similar to a lightweight Dequeue.
 
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-use_node_name_sharing: If true and shared_name is empty, the table is shared
-  using the node name.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
+The basic funtionality is similar to dequeue with many fewer
+capabilities and options.  This Op is optimized for performance.
 )doc");
 
-REGISTER_OP("MutableHashTableOfTensors")
-    .Output("table_handle: Ref(string)")
+REGISTER_OP("StagePeek")
+    .Input("index: int32")
+    .Output("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
-    .Attr("use_node_name_sharing: bool = false")
-    .Attr("key_dtype: type")
-    .Attr("value_dtype: type")
-    .Attr("value_shape: shape = {}")
+    .SetShapeFn(shape_inference::UnknownShape)
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
     .Doc(R"doc(
-Creates an empty hash table.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a vector. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
+Op peeks at the values at the specified index.  If the
+underlying container does not contain sufficient elements
+this op will block until it does.   This Op is optimized for
+performance.
+    )doc");
 
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-)doc");
 
-REGISTER_OP("MutableDenseHashTable")
-    .Input("empty_key: key_dtype")
-    .Output("table_handle: Ref(string)")
+REGISTER_OP("StageSize")
+    .Output("size: int32")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
-    .Attr("use_node_name_sharing: bool = false")
-    .Attr("key_dtype: type")
-    .Attr("value_dtype: type")
-    .Attr("value_shape: shape = {}")
-    .Attr("initial_num_buckets: int = 131072")  // 2^17
-    .Attr("max_load_factor: float = 0.8")
+    .SetShapeFn(shape_inference::ScalarShape)
     .SetIsStateful()
-    .SetShapeFn(TwoElementOutput)
     .Doc(R"doc(
-Creates an empty hash table that uses tensors as the backing store. It uses
-"open addressing" with quadratic reprobing to resolve collisions.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-
-empty_key: The key used to represent empty key buckets internally. Must not
-  be used in insert or lookup operations.
-table_handle: Handle to a table.
-container: If non-empty, this table is placed in the given container.
-  Otherwise, a default container is used.
-shared_name: If non-empty, this table is shared under the given name across
-  multiple sessions.
-key_dtype: Type of the table keys.
-value_dtype: Type of the table values.
-value_shape: The shape of each value.
-initial_num_buckets: The initial number of hash table buckets. Must be a power
-  to 2.
-max_load_factor: The maximum ratio between number of entries and number of
-  buckets before growing the table. Must be between 0 and 1.
-)doc");
+Op returns the number of elements in the underlying container.
+    )doc");
 
-REGISTER_OP("InitializeTable")
-    .Input("table_handle: Ref(string)")
-    .Input("keys: Tkey")
-    .Input("values: Tval")
-    .Attr("Tkey: type")
-    .Attr("Tval: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+REGISTER_OP("StageClear")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op removes all elements in the underlying container.
+    )doc");
 
-      ShapeHandle keys;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
-      TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
-      return Status::OK();
-    })
+// UnorderedMap
+REGISTER_OP("MapStage")
+    .Input("key: int64")
+    .Input("indices: int32")
+    .Input("values: fake_dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("fake_dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .SetIsStateful()
     .Doc(R"doc(
-Table initializer that takes two tensors for keys and values respectively.
+Stage (key, values) in the underlying container which behaves like a hashtable.
 
-table_handle: Handle to a table which will be initialized.
-keys: Keys of type Tkey.
-values: Values of type Tval.
+key: int64
+values: a list of tensors
+dtypes A list of data types that inserted values should adhere to.
+capacity: Maximum number of elements in the Staging Area. If > 0, inserts
+  on the container will block when the capacity is reached.
+container: If non-empty, this queue is placed in the given container. Otherwise,
+  a default container is used.
+shared_name: It is necessary to match this name to the matching Unstage Op.
 )doc");
 
-REGISTER_OP("InitializeTableFromTextFile")
-    .Input("table_handle: Ref(string)")
-    .Input("filename: string")
-    .Attr("key_index: int >= -2")
-    .Attr("value_index: int >= -2")
-    .Attr("vocab_size: int >= -1 = -1")
-    .Attr("delimiter: string = '\t'")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle handle;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
-      DimensionHandle unused_dim;
-      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
-
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &handle));
-      return Status::OK();
-    })
+REGISTER_OP("MapPeek")
+    .Input("key: int64")
+    .Output("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .SetIsStateful()
     .Doc(R"doc(
-Initializes a table from a text file.
-
-It inserts one key-value pair into the table for each line of the file.
-The key and value is extracted from the whole line content, elements from the
-split line based on `delimiter` or the line number (starting from zero).
-Where to extract the key and value from a line is specified by `key_index` and
-`value_index`.
-
-- A value of -1 means use the line number(starting from zero), expects `int64`.
-- A value of -2 means use the whole line content, expects `string`.
-- A value >= 0 means use the index (starting at zero) of the split line based
-  on `delimiter`.
-
-table_handle: Handle to a table which will be initialized.
-filename: Filename of a vocabulary text file.
-key_index: Column index in a line to get the table `key` values from.
-value_index: Column index that represents information of a line to get the table
-  `value` values from.
-vocab_size: Number of elements of the file, use -1 if unknown.
-delimiter: Delimiter to separate fields in a line.
-)doc");
+Op peeks at the values at the specified key.  If the
+underlying container does not contain this key
+this op will block until it does.
+    )doc");
 
-REGISTER_OP("GetSessionHandle")
-    .Input("value: T")
-    .Output("handle: string")
-    .Attr("T: type")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Deprecated(23, "Use GetSessionHandleV2");
+REGISTER_OP("MapUnstage")
+    .Input("key: int64")
+    .Output("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op removes and returns the values associated with the key
+from the underlying container.   If the underlying container
+does not contain this key, the op will block until it does.
+    )doc");
 
-REGISTER_OP("GetSessionHandleV2")
-    .Input("value: T")
-    .Output("handle: resource")
-    .Attr("T: type")
-    .SetShapeFn(shape_inference::ScalarShape)
+REGISTER_OP("MapUnstageNoKey")
+    .Output("key: int64")
+    .Output("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .SetIsStateful()
     .Doc(R"doc(
-Store the input tensor in the state of the current session.
+Op removes and returns a random (key, value)
+from the underlying container.   If the underlying container
+does not contain elements, the op will block until it does.
+      )doc");
 
-value: The tensor to be stored.
-handle: The handle for the tensor stored in the session state, represented
-  as a ResourceHandle object.
-)doc");
+REGISTER_OP("MapSize")
+    .Output("size: int32")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op returns the number of elements in the underlying container.
+    )doc");
 
-REGISTER_OP("GetSessionTensor")
-    .Input("handle: string")
-    .Output("value: dtype")
-    .Attr("dtype: type")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      return shape_inference::UnknownShape(c);
-    })
+REGISTER_OP("MapIncompleteSize")
+    .Output("size: int32")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .SetIsStateful()
     .Doc(R"doc(
-Get the value of the tensor specified by its handle.
+Op returns the number of incomplete elements in the underlying container.
+    )doc");
 
-handle: The handle for a tensor stored in the session state.
-value: The tensor for the given handle.
-dtype: The type of the output value.
-)doc");
 
-REGISTER_OP("DeleteSessionTensor")
-    .Input("handle: string")
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle unused;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      return Status::OK();
-    })
+REGISTER_OP("MapClear")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .SetIsStateful()
     .Doc(R"doc(
-Delete the tensor specified by its handle in the session.
+Op removes all elements in the underlying container.
+    )doc");
 
-handle: The handle for a tensor stored in the session state.
-)doc");
 
-REGISTER_OP("Stage")
-    .Input("values: dtypes")
+// OrderedMap
+REGISTER_OP("OrderedMapStage")
+    .Input("key: int64")
+    .Input("indices: int32")
+    .Input("values: fake_dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
     .Attr("dtypes: list(type)")
+    .Attr("fake_dtypes: list(type)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .SetIsStateful()
     .Doc(R"doc(
-Stage values similar to a lightweight Enqueue.  The basic functionality of this
-Op is similar to a queue with many fewer capabilities and options.  This Op is
-optimized for performance.
+Stage (key, values) in the underlying container which behaves like a ordered
+associative container.   Elements are ordered by key.
 
+key: int64
 values: a list of tensors
+dtypes A list of data types that inserted values should adhere to.
+capacity: Maximum number of elements in the Staging Area. If > 0, inserts
+  on the container will block when the capacity is reached.
 container: If non-empty, this queue is placed in the given container. Otherwise,
   a default container is used.
 shared_name: It is necessary to match this name to the matching Unstage Op.
-    )doc");
+)doc");
 
-REGISTER_OP("Unstage")
+REGISTER_OP("OrderedMapPeek")
+    .Input("key: int64")
     .Output("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
     .Attr("dtypes: list(type)")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
-    .SetShapeFn(shape_inference::UnknownShape)
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
     .SetIsStateful()
     .Doc(R"doc(
-Op is similar to a lightweight Dequeue.  The basic funtionality is similar to
-dequeue with many fewer capabilities and options.  This Op is optimized for
+Op peeks at the values at the specified key.  If the
+underlying container does not contain this key
+this op will block until it does.   This Op is optimized for
 performance.
     )doc");
 
+REGISTER_OP("OrderedMapUnstage")
+    .Input("key: int64")
+    .Output("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op removes and returns the values associated with the key
+from the underlying container.   If the underlying container
+does not contain this key, the op will block until it does.
+    )doc");
+
+REGISTER_OP("OrderedMapUnstageNoKey")
+    .Output("key: int64")
+    .Output("values: dtypes")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op removes and returns the (key, value) element with the smallest
+key from the underlying container.   If the underlying container
+does not contain elements, the op will block until it does.
+      )doc");
+
+REGISTER_OP("OrderedMapSize")
+    .Output("size: int32")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op returns the number of elements in the underlying container.
+    )doc");
+
+REGISTER_OP("OrderedMapIncompleteSize")
+    .Output("size: int32")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op returns the number of incomplete elements in the underlying container.
+    )doc");
+
+REGISTER_OP("OrderedMapClear")
+    .Attr("capacity: int >= 0 = 0")
+    .Attr("memory_limit: int >= 0 = 0")
+    .Attr("dtypes: list(type)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .SetIsStateful()
+    .Doc(R"doc(
+Op removes all elements in the underlying container.
+    )doc");
+
 REGISTER_OP("RecordInput")
     .Output("records: string")
     .Attr("file_pattern: string")
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9f762d25fc7d596a271a9c8199c146059d76008
--- /dev/null
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -0,0 +1,471 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+// --------------------------------------------------------------------------
+
+// The ops in this section can be composed to define an input
+// pipeline. Each op produces a (step-local) resource that represents
+// a DAG of "dataset" objects. An "dataset" object can be converted
+// to a stateful "iterator" by passing the "dataset" to the
+// "MakeIterator" op.
+
+REGISTER_OP("TensorDataset")
+    .Input("components: Toutput_types")
+    .Output("handle: resource")
+    .Attr("Toutput_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): Validate that
+                                               // `components` have shapes
+                                               // compatible with
+                                               // `output_shapes`.
+    .Doc(R"doc(
+Creates a dataset that emits `components` as a tuple of tensors once.
+)doc");
+
+REGISTER_OP("TensorSliceDataset")
+    .Input("components: Toutput_types")
+    .Output("handle: resource")
+    .Attr("Toutput_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): Validate that the
+                                               // dim-0 slices of `components`
+                                               // have shapes compatible with
+                                               // `output_shapes`.
+    .Doc(R"doc(
+Creates a dataset that emits each dim-0 slice of `components` once.
+)doc");
+
+REGISTER_OP("SparseTensorSliceDataset")
+    .Input("indices: int64")
+    .Input("values: Tvalues")
+    .Input("dense_shape: int64")
+    .Output("handle: resource")
+    .Attr("Tvalues: type")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that splits a SparseTensor into elements row-wise.
+)doc");
+
+REGISTER_OP("ZipDataset")
+    .Input("input_datasets: N * resource")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that zips together `input_datasets`.
+)doc");
+
+REGISTER_OP("RepeatDataset")
+    .Input("input_dataset: resource")
+    .Input("count: int64")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): Validate the shape
+                                               // of `count`.
+    .Doc(R"doc(
+Creates a dataset that emits the outputs of `input_dataset` `count` times.
+
+count: A scalar representing the number of times that `input_dataset` should
+  be repeated. A value of `-1` indicates that it should be repeated infinitely.
+)doc");
+
+REGISTER_OP("TakeDataset")
+    .Input("input_dataset: resource")
+    .Input("count: int64")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that contains `count` elements from the `input_dataset`.
+
+count: A scalar representing the number of elements from the `input_dataset`
+  that should be taken. A value of `-1` indicates that all of `input_dataset`
+  is taken.
+)doc");
+
+REGISTER_OP("SkipDataset")
+    .Input("input_dataset: resource")
+    .Input("count: int64")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that skips `count` elements from the `input_dataset`.
+
+count: A scalar representing the number of elements from the `input_dataset`
+  that should be skipped.  If count is -1, skips everything.
+)doc");
+
+REGISTER_OP("MapDataset")
+    .Input("input_dataset: resource")
+    .Input("other_arguments: Targuments")
+    .Output("handle: resource")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that applies `f` to the outputs of `input_dataset`.
+)doc");
+
+REGISTER_OP("ParallelMapDataset")
+    .Input("input_dataset: resource")
+    .Input("other_arguments: Targuments")
+    .Input("num_threads: int32")
+    .Input("output_buffer_size: int64")
+    .Output("handle: resource")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that applies `f` to the outputs of `input_dataset`.
+
+Unlike a "MapDataset", which applies `f` sequentially, this dataset uses
+up to `num_threads` threads to process elements from `input_dataset`
+in parallel.
+
+num_threads: The number of threads to use to process elements from
+  `input_dataset`.
+output_buffer_size: The maximum number of output elements to buffer in an
+  iterator over this dataset.
+)doc");
+
+REGISTER_OP("FlatMapDataset")
+    .Input("input_dataset: resource")
+    .Input("other_arguments: Targuments")
+    .Output("handle: resource")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that applies `f` to the outputs of `input_dataset`.
+
+Unlike MapDataset, the `f` in FlatMapDataset is expected to return a
+Dataset resource, and FlatMapDataset will flatten successive results
+into a single Dataset.
+
+f: A function mapping elements of `input_dataset`, concatenated with
+  `other_arguments`, to a Dataset resource that contains elements matching
+  `output_types` and `output_shapes`.
+)doc");
+
+REGISTER_OP("GroupByWindowDataset")
+    .Input("input_dataset: resource")
+    .Input("key_func_other_arguments: Tkey_func_other_arguments")
+    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
+    .Input("window_size: int64")
+    .Output("handle: resource")
+    .Attr("key_func: func")
+    .Attr("reduce_func: func")
+    .Attr("Tkey_func_other_arguments: list(type) >= 0")
+    .Attr("Treduce_func_other_arguments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that computes a windowed group-by on `input_dataset`.
+
+// TODO(mrry): Support non-int64 keys.
+
+key_func: A function mapping an element of `input_dataset`, concatenated
+  with `key_func_other_arguments` to a scalar value of type DT_INT64.
+)doc");
+
+REGISTER_OP("FilterDataset")
+    .Input("input_dataset: resource")
+    .Input("other_arguments: Targuments")
+    .Output("handle: resource")
+    .Attr("predicate: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset containing elements of `input_dataset` matching `predicate`.
+
+The `predicate` function must return a scalar boolean and accept the
+following arguments:
+
+* One tensor for each component of an element of `input_dataset`.
+* One tensor for each value in `other_arguments`.
+
+predicate: A function returning a scalar boolean.
+other_arguments: A list of tensors, typically values that were captured when
+  building a closure for `predicate`.
+)doc");
+
+REGISTER_OP("BatchDataset")
+    .Input("input_dataset: resource")
+    .Input("batch_size: int64")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that batches `batch_size` elements from `input_dataset`.
+
+batch_size: A scalar representing the number of elements to accumulate in a
+  batch.
+)doc");
+
+REGISTER_OP("PaddedBatchDataset")
+    .Input("input_dataset: resource")
+    .Input("batch_size: int64")
+    .Input("padded_shapes: N * int64")
+    .Input("padding_values: Toutput_types")
+    .Output("handle: resource")
+    .Attr("Toutput_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): Validate that
+                                               // `padded_shapes` are all
+                                               // vectors, the lengths of
+                                               // `output_types` and
+                                               // `output_shapes` are `N`,
+                                               // the `output_shapes` are (as
+                                               // far as possible to tell
+                                               // statically) compatible with
+                                               // `padded_shapes`, and
+                                               // that `padding_values` are
+                                               // all scalars.
+    .Doc(R"doc(
+Creates a dataset that batches and pads `batch_size` elements from the input.
+
+batch_size: A scalar representing the number of elements to accumulate in a
+  batch.
+padded_shapes: A list of int64 tensors representing the desired padded shapes
+  of the corresponding output components. These shapes may be partially
+  specified, using `-1` to indicate that a particular dimension should be
+  padded to the maximum size of all batch elements.
+padding_values: A list of scalars containing the padding value to use for
+  each of the outputs.
+)doc");
+
+REGISTER_OP("DenseToSparseBatchDataset")
+    .Input("input_dataset: resource")
+    .Input("batch_size: int64")
+    .Input("row_shape: int64")
+    .Output("handle: resource")
+    // NOTE(mrry): the 0th and 2nd elements will be DT_INT64.
+    .Attr("output_types: list(type) >= 1")
+    // NOTE(mrry): the 1st and 2nd elements will be vectors.
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that yields a SparseTensor for each element of the input.
+
+input_dataset: A handle to an input dataset. Must have a single component.
+batch_size: A scalar representing the number of elements to accumulate in a
+  batch.
+row_shape: A vector representing the dense shape of each row in the produced
+  SparseTensor.
+)doc");
+
+REGISTER_OP("RangeDataset")
+    .Input("start: int64")
+    .Input("stop: int64")
+    .Input("step: int64")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset with a range of values. Corresponds to python's xrange.
+
+start: corresponds to start in python's xrange().
+stop: corresponds to stop in python's xrange().
+step: corresponds to step in python's xrange().
+)doc");
+
+REGISTER_OP("ShuffleDataset")
+    .Input("input_dataset: resource")
+    .Input("buffer_size: int64")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Output("handle: resource")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
+
+buffer_size: The number of output elements to buffer in an iterator over
+  this dataset. Compare with the `min_after_dequeue` attr when creating a
+  `RandomShuffleQueue`.
+seed: A scalar seed for the random number generator. If either seed or
+  seed2 is set to be non-zero, the random number generator is seeded
+  by the given seed.  Otherwise, a random seed is used.
+seed2: A second scalar seed to avoid seed collision.
+)doc");
+
+REGISTER_OP("TextLineDataset")
+    .Input("filenames: string")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)  // TODO(mrry): validate
+                                               // that `filenames` is
+                                               // a scalar or a
+                                               // vector.
+    .Doc(R"doc(
+Creates a dataset that emits the lines of one or more text files.
+
+filenames: A scalar or a vector containing the name(s) of the file(s) to be
+  read.
+)doc");
+
+REGISTER_OP("FixedLengthRecordDataset")
+    .Input("filenames: string")
+    .Input("header_bytes: int64")
+    .Input("record_bytes: int64")
+    .Input("footer_bytes: int64")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that emits the records from one or more binary files.
+
+filenames: A scalar or a vector containing the name(s) of the file(s) to be
+  read.
+header_bytes: A scalar representing the number of bytes to skip at the
+  beginning of a file.
+record_bytes: A scalar representing the number of bytes in each record.
+footer_bytes: A scalar representing the number of bytes to skip at the end
+  of a file.
+)doc");
+
+REGISTER_OP("TFRecordDataset")
+    .Input("filenames: string")
+    .Input("compression_type: string")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a dataset that emits the records from one or more TFRecord files.
+
+filenames: A scalar or vector containing the name(s) of the file(s) to be
+  read.
+compression_type: A scalar containing either (i) the empty string (no
+  compression), (ii) "ZLIB", or (iii) "GZIP".
+)doc");
+
+REGISTER_OP("Iterator")
+    .Output("handle: resource")
+    .Attr("shared_name: string")
+    .Attr("container: string")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+A container for an iterator resource.
+
+handle: A handle to the iterator that can be passed to a "MakeIterator"
+  or "IteratorGetNext" op.
+)doc");
+
+REGISTER_OP("MakeIterator")
+    .Input("dataset: resource")
+    .Input("iterator: resource")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Makes a new iterator from the given `dataset` and stores it in `iterator`.
+
+This operation may be executed multiple times. Each execution will reset the
+iterator in `iterator` to the first element of `dataset`.
+)doc");
+
+REGISTER_OP("OneShotIterator")
+    .Output("handle: resource")
+    .Attr("dataset_factory: func")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Makes a "one-shot" iterator that can be iterated only once.
+
+A one-shot iterator bundles the logic for defining the dataset and
+the state of the iterator in a single op, which allows simple input
+pipelines to be defined without an additional initialization
+("MakeIterator") step.
+
+One-shot iterators have the following limitations:
+
+* They do not support parameterization: all logic for creating the underlying
+  dataset must be bundled in the `dataset_factory` function.
+* They are not resettable. Once a one-shot iterator reaches the end of its
+  underlying dataset, subsequent "IteratorGetNext" operations on that
+  iterator will always produce an `OutOfRange` error.
+
+For greater flexibility, use "Iterator" and "MakeIterator" to define
+an iterator using an arbitrary subgraph, which may capture tensors
+(including fed values) as parameters, and which may be reset multiple
+times by rerunning "MakeIterator".
+
+handle: A handle to the iterator that can be passed to an "IteratorGetNext"
+  op.
+dataset_factory: A function of type `() -> DT_RESOURCE`, where the returned
+  DT_RESOURCE is a handle to a dataset.
+)doc");
+
+REGISTER_OP("IteratorGetNext")
+    .Input("iterator: resource")
+    .Output("components: output_types")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      std::vector<PartialTensorShape> output_shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      if (output_shapes.size() != c->num_outputs()) {
+        return errors::InvalidArgument(
+            "`output_shapes` must be the same length as `output_types` (",
+            output_shapes.size(), " vs. ", c->num_outputs());
+      }
+      for (size_t i = 0; i < output_shapes.size(); ++i) {
+        shape_inference::ShapeHandle output_shape_handle;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+            output_shapes[i], &output_shape_handle));
+        c->set_output(static_cast<int>(i), output_shape_handle);
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Gets the next output from the given iterator.
+)doc");
+
+REGISTER_OP("IteratorDispose")
+    .Input("iterator: resource")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Releases any resources used by the given iterator.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/debug_ops.cc b/tensorflow/core/ops/debug_ops.cc
index 63f6b605843a9789513712649e2aa752eed07a5e..74f9722956965cbd7b84a30336d4ec373c58b045 100644
--- a/tensorflow/core/ops/debug_ops.cc
+++ b/tensorflow/core/ops/debug_ops.cc
@@ -19,17 +19,25 @@ limitations under the License.
 namespace tensorflow {
 
 // EXPERIMENTAL: tfdbg debugger-inserted ops.
+// These ops are used only internally by tfdbg. There is no API for users to
+// direct create them. Users can create them indirectly by using
+// RunOptions.debug_options during Session::Run() call. See tfdbg documentation
+// for more details.
 REGISTER_OP("Copy")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
+    .Attr("debug_ops_spec: list(string) = []")
     .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Copy Op.
 
 Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the
 device on which the tensor is allocated.
+N.B.: If the all downstream attached debug ops are disabled given the current
+gRPC gating status, the output will simply forward the input tensor without
+deep-copying. See the documentation of Debug* ops for more details.
 
 Unlike the CopyHost Op, this op does not have HostMemory constraint on its
 input or output.
@@ -37,6 +45,11 @@ input or output.
 input: Input tensor.
 output: Output tensor, deep-copied from input.
 tensor_name: The name of the input tensor.
+debug_ops_spec: A list of debug op spec (op, url, gated_grpc) for attached debug
+  ops. Each element of the list has the format
+  <debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented
+  as 0/1. E.g., "DebugIdentity;grpc://foo:3333;1",
+  "DebugIdentity;file:///tmp/tfdbg_1;0".
 )doc");
 
 REGISTER_OP("CopyHost")
@@ -44,17 +57,26 @@ REGISTER_OP("CopyHost")
     .Output("output: T")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
+    .Attr("debug_ops_spec: list(string) = []")
     .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Copy Host Op.
 
 Performs CPU-to-CPU deep-copying of tensor.
+N.B.: If the all downstream attached debug ops are disabled given the current
+gRPC gating status, the output will simply forward the input tensor without
+deep-copying. See the documentation of Debug* ops for more details.
 
 Unlike the Copy Op, this op has HostMemory constraint on its input or output.
 
 input: Input tensor.
 output: Output tensor, deep-copied from input.
 tensor_name: The name of the input tensor.
+debug_ops_spec: A list of debug op spec (op, url, gated_grpc) for attached debug
+  ops. Each element of the list has the format
+  <debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented
+  as 0/1. E.g., "DebugIdentity;grpc://foo:3333;1",
+  "DebugIdentity;file:///tmp/tfdbg_1;0".
 )doc");
 
 REGISTER_OP("DebugIdentity")
@@ -63,6 +85,7 @@ REGISTER_OP("DebugIdentity")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
     .Attr("debug_urls: list(string) = []")
+    .Attr("gated_grpc: bool = false")
     .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Debug Identity Op.
@@ -73,7 +96,13 @@ input: Input tensor, non-Reference type.
 output: Output tensor that equals the input tensor.
 tensor_name: Name of the input tensor.
 debug_urls: List of URLs to debug targets, e.g.,
-            file:///foo/tfdbg_dump, grpc:://localhost:11011
+  file:///foo/tfdbg_dump, grpc:://localhost:11011
+gated_grpc: Whether this op will be gated. If any of the debug_urls of this
+  debug node is of the grpc:// scheme, when the value of this attribute is set
+  to True, the data will not actually be sent via the grpc stream unless this
+  debug op has been enabled at the debug_url. If all of the debug_urls of this
+  debug node are of the grpc:// scheme and the debug op is enabled at none of
+  them, the output will be an empty Tensor.
 )doc");
 
 REGISTER_OP("DebugNanCount")
@@ -82,6 +111,7 @@ REGISTER_OP("DebugNanCount")
     .Attr("T: type")
     .Attr("tensor_name: string = ''")
     .Attr("debug_urls: list(string) = []")
+    .Attr("gated_grpc: bool = false")
     .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Debug NaN Value Counter Op
@@ -92,7 +122,13 @@ input: Input tensor, non-Reference type.
 output: An integer output tensor that is the number of NaNs in the input.
 tensor_name: Name of the input tensor.
 debug_urls: List of URLs to debug targets, e.g.,
-            file:///foo/tfdbg_dump, grpc:://localhost:11011
+  file:///foo/tfdbg_dump, grpc:://localhost:11011.
+gated_grpc: Whether this op will be gated. If any of the debug_urls of this
+  debug node is of the grpc:// scheme, when the value of this attribute is set
+  to True, the data will not actually be sent via the grpc stream unless this
+  debug op has been enabled at the debug_url. If all of the debug_urls of this
+  debug node are of the grpc:// scheme and the debug op is enabled at none of
+  them, the output will be an empty Tensor.
 )doc");
 
 REGISTER_OP("DebugNumericSummary")
@@ -104,6 +140,7 @@ REGISTER_OP("DebugNumericSummary")
     .Attr("lower_bound: float = -inf")
     .Attr("upper_bound: float = inf")
     .Attr("mute_if_healthy: bool = false")
+    .Attr("gated_grpc: bool = false")
     .SetAllowsUninitializedInput()
     .Doc(R"doc(
 Debug Numeric Summary Op.
@@ -144,6 +181,12 @@ upper_bound: (float) The upper bound >= which values will be included in the
 mute_if_healthy: (bool) Do not send data to the debug URLs unless at least one
   of elements [2], [3] and [7] (i.e., the nan count and the generalized -inf and
   inf counts) is non-zero.
+gated_grpc: Whether this op will be gated. If any of the debug_urls of this
+  debug node is of the grpc:// scheme, when the value of this attribute is set
+  to True, the data will not actually be sent via the grpc stream unless this
+  debug op has been enabled at the debug_url. If all of the debug_urls of this
+  debug node are of the grpc:// scheme and the debug op is enabled at none of
+  them, the output will be an empty Tensor.
 
 )doc");
 
diff --git a/tensorflow/core/ops/function_ops.cc b/tensorflow/core/ops/function_ops.cc
index 9fbebdb088a42e4da10e7f4694c9e06f252de436..ada96fa1d2ddf79b2669fa3fc437ce7b872a2eb1 100644
--- a/tensorflow/core/ops/function_ops.cc
+++ b/tensorflow/core/ops/function_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("_Arg")
+REGISTER_SYSTEM_OP("_Arg")
     .Output("output: T")
     .Attr("T: type")
     .Attr("index: int >= 0")
@@ -34,7 +34,7 @@ output: The argument.
 index: This argument is the index-th argument of the function.
 )doc");
 
-REGISTER_OP("_Retval")
+REGISTER_SYSTEM_OP("_Retval")
     .Input("input: T")
     .Attr("T: type")
     .Attr("index: int >= 0")
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 41a3aa0c38bd624e95b04eb4557d5d9767971ea4..3edae6f927463f30160d737a8efe03af4a17245a 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -349,6 +349,9 @@ The attr `ratio` allows downscaling the image by an integer factor during
 decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
 downscaling the image later.
 
+This op also supports decoding PNGs and non-animated GIFs since the interface is
+the same, though it is cleaner to use `tf.image.decode_image`.
+
 contents: 0-D.  The JPEG-encoded image.
 channels: Number of color channels for the decoded image.
 ratio: Downscaling ratio.
@@ -525,6 +528,9 @@ Accepted values are:
 If needed, the PNG-encoded image is transformed to match the requested number
 of color channels.
 
+This op also supports decoding JPEGs and non-animated GIFs since the interface
+is the same, though it is cleaner to use `tf.image.decode_image`.
+
 contents: 0-D.  The PNG-encoded image.
 channels: Number of color channels for the decoded image.
 image: 3-D with shape `[height, width, channels]`.
@@ -557,6 +563,28 @@ compression: Compression level.
 contents: 0-D. PNG-encoded image.
 )doc");
 
+// --------------------------------------------------------------------------
+REGISTER_OP("DecodeBmp")
+    .Input("contents: string")
+    .Output("image: uint8")
+    .Attr("channels: int = 0")
+    .SetShapeFn(DecodeImageShapeFn)
+    .Doc(R"doc(
+Decode the first frame of a BMP-encoded image to a uint8 tensor.
+
+The attr `channels` indicates the desired number of color channels for the
+decoded image.
+
+Accepted values are:
+
+*   0: Use the number of channels in the BMP-encoded image.
+*   3: output an RGB image.
+*   4: output an RGBA image.
+
+contents: 0-D.  The BMP-encoded image.
+image: 3-D with shape `[height, width, channels]`. RGB order
+)doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("DecodeGif")
     .Input("contents: string")
@@ -576,7 +604,10 @@ Decode the first frame of a GIF-encoded image to a uint8 tensor.
 GIF with frame or transparency compression are not supported
 convert animated GIF from compressed to uncompressed by:
 
-convert $src.gif -coalesce $dst.gif
+    convert $src.gif -coalesce $dst.gif
+
+This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+`tf.image.decode_image`.
 
 contents: 0-D.  The GIF-encoded image.
 image: 4-D with shape `[num_frames, height, width, 3]`. RGB order
@@ -869,7 +900,7 @@ boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
   in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
   `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
   `[0, 1]` interval of normalized image height is mapped to
-  `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+  `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
   which case the sampled crop is an up-down flipped version of the original
   image. The width dimension is treated similarly. Normalized coordinates
   outside the `[0, 1]` range are allowed, in which case we use
@@ -963,11 +994,50 @@ method: A string specifying the interpolation method. Only 'bilinear' is
 // --------------------------------------------------------------------------
 
 REGISTER_OP("NonMaxSuppression")
+  .Input("boxes: float")
+  .Input("scores: float")
+  .Input("max_output_size: int32")
+  .Output("selected_indices: int32")
+  .Attr("iou_threshold: float = 0.5")
+  .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Vector(c->UnknownDim()));
+      return Status::OK();
+    })
+  .Doc(R"doc(
+Greedily selects a subset of bounding boxes in descending order of score,
+pruning away boxes that have high intersection-over-union (IOU) overlap
+with previously selected boxes.  Bounding boxes are supplied as
+[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+diagonal pair of box corners and the coordinates can be provided as normalized
+(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+is agnostic to where the origin is in the coordinate system.  Note that this
+algorithm is invariant to orthogonal transformations and translations
+of the coordinate system; thus translating or reflections of the coordinate
+system result in the same boxes being selected by the algorithm.
+The output of this operation is a set of integers indexing into the input
+collection of bounding boxes representing the selected boxes.  The bounding
+box coordinates corresponding to the selected indices can then be obtained
+using the `tf.gather operation`.  For example:
+  selected_indices = tf.image.non_max_suppression(
+      boxes, scores, max_output_size, iou_threshold)
+  selected_boxes = tf.gather(boxes, selected_indices)
+boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+  score corresponding to each box (each row of boxes).
+max_output_size: A scalar integer tensor representing the maximum number of
+  boxes to be selected by non max suppression.
+iou_threshold: A float representing the threshold for deciding whether boxes
+  overlap too much with respect to IOU.
+selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
+  indices from the boxes tensor, where `M <= max_output_size`.
+)doc");
+
+REGISTER_OP("NonMaxSuppressionV2")
     .Input("boxes: float")
     .Input("scores: float")
     .Input("max_output_size: int32")
+    .Input("iou_threshold: float")
     .Output("selected_indices: int32")
-    .Attr("iou_threshold: float = 0.5")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->Vector(c->UnknownDim()));
       return Status::OK();
@@ -989,7 +1059,7 @@ collection of bounding boxes representing the selected boxes.  The bounding
 box coordinates corresponding to the selected indices can then be obtained
 using the `tf.gather operation`.  For example:
 
-  selected_indices = tf.image.non_max_suppression(
+  selected_indices = tf.image.non_max_suppression_v2(
       boxes, scores, max_output_size, iou_threshold)
   selected_boxes = tf.gather(boxes, selected_indices)
 
@@ -998,8 +1068,8 @@ scores: A 1-D float tensor of shape `[num_boxes]` representing a single
   score corresponding to each box (each row of boxes).
 max_output_size: A scalar integer tensor representing the maximum number of
   boxes to be selected by non max suppression.
-iou_threshold: A float representing the threshold for deciding whether boxes
-  overlap too much with respect to IOU.
+iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+  boxes overlap too much with respect to IOU.
 selected_indices: A 1-D integer tensor of shape `[M]` representing the selected
   indices from the boxes tensor, where `M <= max_output_size`.
 )doc");
diff --git a/tensorflow/core/ops/io_ops.cc b/tensorflow/core/ops/io_ops.cc
index 3e2583f706003b1b16451d193b357d6e16fe456b..0bce6fc0ea828b24f60d82bd472cc4ac2ce99308 100644
--- a/tensorflow/core/ops/io_ops.cc
+++ b/tensorflow/core/ops/io_ops.cc
@@ -440,6 +440,7 @@ REGISTER_OP("FixedLengthRecordReader")
     .Attr("header_bytes: int = 0")
     .Attr("record_bytes: int")
     .Attr("footer_bytes: int = 0")
+    .Attr("hop_bytes: int = 0")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
@@ -448,6 +449,11 @@ REGISTER_OP("FixedLengthRecordReader")
 A Reader that outputs fixed-length records from a file.
 
 reader_handle: The handle to reference the Reader.
+header_bytes: Number of bytes in the header, defaults to 0.
+record_bytes: Number of bytes in the record.
+footer_bytes: Number of bytes in the footer, defaults to 0.
+hop_bytes: Number of bytes to hop before each read. Default of 0 means using
+        record_bytes.
 container: If non-empty, this reader is placed in the given container.
         Otherwise, a default container is used.
 shared_name: If non-empty, this reader is named in the given bucket
@@ -459,6 +465,7 @@ REGISTER_OP("FixedLengthRecordReaderV2")
     .Attr("header_bytes: int = 0")
     .Attr("record_bytes: int")
     .Attr("footer_bytes: int = 0")
+    .Attr("hop_bytes: int = 0")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .SetIsStateful()
@@ -467,6 +474,11 @@ REGISTER_OP("FixedLengthRecordReaderV2")
 A Reader that outputs fixed-length records from a file.
 
 reader_handle: The handle to reference the Reader.
+header_bytes: Number of bytes in the header, defaults to 0.
+record_bytes: Number of bytes in the record.
+footer_bytes: Number of bytes in the footer, defaults to 0.
+hop_bytes: Number of bytes to hop before each read. Default of 0 means using
+        record_bytes.
 container: If non-empty, this reader is placed in the given container.
         Otherwise, a default container is used.
 shared_name: If non-empty, this reader is named in the given bucket
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index a2762cf206acbda7c1b47767043db2b9d3d2bf42..872824b885340a2838e9a28a11b660b9380dd4d4 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -318,7 +318,7 @@ REGISTER_OP("SelfAdjointEigV2")
     .Output("e: T")
     .Output("v: T")
     .Attr("compute_v: bool = True")
-    .Attr("T: {double, float}")
+    .Attr("T: {double, float, complex64, complex128}")
     .SetShapeFn(SelfAdjointEigV2ShapeFn)
     .Doc(R"doc(
 Computes the eigen decomposition of one or more square self-adjoint matrices.
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dac02dad8bb861fee0e16e0acb0c8e17688e05fb
--- /dev/null
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -0,0 +1,670 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+// --------------------------------------------------------------------------
+
+namespace {
+Status TwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
+  ShapeHandle handle;
+  DimensionHandle unused_handle;
+  for (int i = 0; i < c->num_inputs(); ++i) {
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
+    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
+  }
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
+Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) {
+  ShapeHandle handle;
+  DimensionHandle unused_handle;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+  for (int i = 1; i < c->num_inputs(); ++i) {
+    TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle));
+    TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle));
+  }
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
+Status TwoElementOutput(InferenceContext* c) {
+  c->set_output(0, c->Vector(2));
+  return Status::OK();
+}
+
+Status ScalarOutput(InferenceContext* c) {
+  c->set_output(0, c->Scalar());
+  return Status::OK();
+}
+}  // namespace
+
+REGISTER_OP("LookupTableFind")
+    .Input("table_handle: Ref(string)")
+    .Input("keys: Tin")
+    .Input("default_value: Tout")
+    .Output("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      // Default value must be scalar or vector.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Looks up keys in a table, outputs the corresponding values.
+
+The tensor `keys` must of the same type as the keys of the table.
+The output `values` is of the type of the table values.
+
+The scalar `default_value` is the value output for keys not present in the
+table. It must also be of the same type as the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Same shape as `keys`.  Values found in the table, or `default_values`
+   for missing keys.
+)doc");
+
+REGISTER_OP("LookupTableFindV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tin")
+    .Input("default_value: Tout")
+    .Output("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      // Default value must be scalar or vector.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &unused));
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Looks up keys in a table, outputs the corresponding values.
+
+The tensor `keys` must of the same type as the keys of the table.
+The output `values` is of the type of the table values.
+
+The scalar `default_value` is the value output for keys not present in the
+table. It must also be of the same type as the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Same shape as `keys`.  Values found in the table, or `default_values`
+   for missing keys.
+)doc");
+
+REGISTER_OP("LookupTableInsert")
+    .Input("table_handle: Ref(string)")
+    .Input("keys: Tin")
+    .Input("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      // TODO(ebrevdo): Validate keys and values shape.
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Updates the table to associates keys with values.
+
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Values to associate with keys.
+)doc");
+
+REGISTER_OP("LookupTableInsertV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tin")
+    .Input("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      // TODO: Validate keys and values shape.
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Updates the table to associates keys with values.
+
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Values to associate with keys.
+)doc");
+
+REGISTER_OP("LookupTableSize")
+    .Input("table_handle: Ref(string)")
+    .Output("size: int64")
+    .SetShapeFn(TwoElementVectorInputsAndScalarOutputs)
+    .Doc(R"doc(
+Computes the number of elements in the given table.
+
+table_handle: Handle to the table.
+size: Scalar that contains number of elements in the table.
+)doc");
+
+REGISTER_OP("LookupTableSizeV2")
+    .Input("table_handle: resource")
+    .Output("size: int64")
+    .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs)
+    .Doc(R"doc(
+Computes the number of elements in the given table.
+
+table_handle: Handle to the table.
+size: Scalar that contains number of elements in the table.
+)doc");
+
+REGISTER_OP("LookupTableExport")
+    .Input("table_handle: Ref(string)")
+    .Output("keys: Tkeys")
+    .Output("values: Tvalues")
+    .Attr("Tkeys: type")
+    .Attr("Tvalues: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      ShapeHandle values = c->UnknownShape();
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
+      ShapeHandle keys = c->Vector(c->Dim(values, 0));
+      c->set_output(0, keys);
+      c->set_output(1, values);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Outputs all keys and values in the table.
+
+table_handle: Handle to the table.
+keys: Vector of all keys present in the table.
+values: Tensor of all values in the table. Indexed in parallel with `keys`.
+)doc");
+
+REGISTER_OP("LookupTableExportV2")
+    .Input("table_handle: resource")
+    .Output("keys: Tkeys")
+    .Output("values: Tvalues")
+    .Attr("Tkeys: type")
+    .Attr("Tvalues: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      ShapeHandle values = c->UnknownShape();
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 1, &values));
+      ShapeHandle keys = c->Vector(c->Dim(values, 0));
+      c->set_output(0, keys);
+      c->set_output(1, values);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Outputs all keys and values in the table.
+
+table_handle: Handle to the table.
+keys: Vector of all keys present in the table.
+values: Tensor of all values in the table. Indexed in parallel with `keys`.
+)doc");
+
+REGISTER_OP("LookupTableImport")
+    .Input("table_handle: Ref(string)")
+    .Input("keys: Tin")
+    .Input("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      // TODO(ebrevdo): Validate keys and values shape.
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Replaces the contents of the table with the specified keys and values.
+
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Values to associate with keys.
+)doc");
+
+REGISTER_OP("LookupTableImportV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tin")
+    .Input("values: Tout")
+    .Attr("Tin: type")
+    .Attr("Tout: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      // TODO: Validate keys and values shape.
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Replaces the contents of the table with the specified keys and values.
+
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+
+table_handle: Handle to the table.
+keys:  Any shape.  Keys to look up.
+values: Values to associate with keys.
+)doc");
+
+REGISTER_OP("HashTable")
+    .Output("table_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Creates a non-initialized hash table.
+
+This op creates a hash table, specifying the type of its keys and values.
+Before using the table you will have to initialize it.  After initialization the
+table will be immutable.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+use_node_name_sharing: If true and shared_name is empty, the table is shared
+  using the node name.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("HashTableV2")
+    .Output("table_handle: resource")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .SetIsStateful()
+    .SetShapeFn(ScalarOutput)
+    .Doc(R"doc(
+Creates a non-initialized hash table.
+
+This op creates a hash table, specifying the type of its keys and values.
+Before using the table you will have to initialize it.  After initialization the
+table will be immutable.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+use_node_name_sharing: If true and shared_name is empty, the table is shared
+  using the node name.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableHashTable")
+    .Output("table_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Creates an empty hash table.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+use_node_name_sharing: If true and shared_name is empty, the table is shared
+  using the node name.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableHashTableV2")
+    .Output("table_handle: resource")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .SetIsStateful()
+    .SetShapeFn(ScalarOutput)
+    .Doc(R"doc(
+Creates an empty hash table.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+use_node_name_sharing: If true and shared_name is empty, the table is shared
+  using the node name.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableHashTableOfTensors")
+    .Output("table_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .Attr("value_shape: shape = {}")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Creates an empty hash table.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a vector. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableHashTableOfTensorsV2")
+    .Output("table_handle: resource")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .Attr("value_shape: shape = {}")
+    .SetIsStateful()
+    .SetShapeFn(ScalarOutput)
+    .Doc(R"doc(
+Creates an empty hash table.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a vector. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+)doc");
+
+REGISTER_OP("MutableDenseHashTable")
+    .Input("empty_key: key_dtype")
+    .Output("table_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .Attr("value_shape: shape = {}")
+    .Attr("initial_num_buckets: int = 131072")  // 2^17
+    .Attr("max_load_factor: float = 0.8")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+Creates an empty hash table that uses tensors as the backing store.
+
+It uses "open addressing" with quadratic reprobing to resolve
+collisions.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+empty_key: The key used to represent empty key buckets internally. Must not
+  be used in insert or lookup operations.
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+value_shape: The shape of each value.
+initial_num_buckets: The initial number of hash table buckets. Must be a power
+  to 2.
+max_load_factor: The maximum ratio between number of entries and number of
+  buckets before growing the table. Must be between 0 and 1.
+)doc");
+
+REGISTER_OP("MutableDenseHashTableV2")
+    .Input("empty_key: key_dtype")
+    .Output("table_handle: resource")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("key_dtype: type")
+    .Attr("value_dtype: type")
+    .Attr("value_shape: shape = {}")
+    .Attr("initial_num_buckets: int = 131072")  // 2^17
+    .Attr("max_load_factor: float = 0.8")
+    .SetIsStateful()
+    .SetShapeFn(ScalarOutput)
+    .Doc(R"doc(
+Creates an empty hash table that uses tensors as the backing store.
+
+It uses "open addressing" with quadratic reprobing to resolve
+collisions.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+
+empty_key: The key used to represent empty key buckets internally. Must not
+  be used in insert or lookup operations.
+table_handle: Handle to a table.
+container: If non-empty, this table is placed in the given container.
+  Otherwise, a default container is used.
+shared_name: If non-empty, this table is shared under the given name across
+  multiple sessions.
+key_dtype: Type of the table keys.
+value_dtype: Type of the table values.
+value_shape: The shape of each value.
+initial_num_buckets: The initial number of hash table buckets. Must be a power
+  to 2.
+max_load_factor: The maximum ratio between number of entries and number of
+  buckets before growing the table. Must be between 0 and 1.
+)doc");
+
+REGISTER_OP("InitializeTable")
+    .Input("table_handle: Ref(string)")
+    .Input("keys: Tkey")
+    .Input("values: Tval")
+    .Attr("Tkey: type")
+    .Attr("Tval: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      ShapeHandle keys;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
+      TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Table initializer that takes two tensors for keys and values respectively.
+
+table_handle: Handle to a table which will be initialized.
+keys: Keys of type Tkey.
+values: Values of type Tval.
+)doc");
+
+REGISTER_OP("InitializeTableV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tkey")
+    .Input("values: Tval")
+    .Attr("Tkey: type")
+    .Attr("Tval: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      ShapeHandle keys;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys));
+      TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Table initializer that takes two tensors for keys and values respectively.
+
+table_handle: Handle to a table which will be initialized.
+keys: Keys of type Tkey.
+values: Values of type Tval.
+)doc");
+
+REGISTER_OP("InitializeTableFromTextFile")
+    .Input("table_handle: Ref(string)")
+    .Input("filename: string")
+    .Attr("key_index: int >= -2")
+    .Attr("value_index: int >= -2")
+    .Attr("vocab_size: int >= -1 = -1")
+    .Attr("delimiter: string = '\t'")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &handle));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Initializes a table from a text file.
+
+It inserts one key-value pair into the table for each line of the file.
+The key and value is extracted from the whole line content, elements from the
+split line based on `delimiter` or the line number (starting from zero).
+Where to extract the key and value from a line is specified by `key_index` and
+`value_index`.
+
+- A value of -1 means use the line number(starting from zero), expects `int64`.
+- A value of -2 means use the whole line content, expects `string`.
+- A value >= 0 means use the index (starting at zero) of the split line based
+  on `delimiter`.
+
+table_handle: Handle to a table which will be initialized.
+filename: Filename of a vocabulary text file.
+key_index: Column index in a line to get the table `key` values from.
+value_index: Column index that represents information of a line to get the table
+  `value` values from.
+vocab_size: Number of elements of the file, use -1 if unknown.
+delimiter: Delimiter to separate fields in a line.
+)doc");
+
+REGISTER_OP("InitializeTableFromTextFileV2")
+    .Input("table_handle: resource")
+    .Input("filename: string")
+    .Attr("key_index: int >= -2")
+    .Attr("value_index: int >= -2")
+    .Attr("vocab_size: int >= -1 = -1")
+    .Attr("delimiter: string = '\t'")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &handle));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Initializes a table from a text file.
+
+It inserts one key-value pair into the table for each line of the file.
+The key and value is extracted from the whole line content, elements from the
+split line based on `delimiter` or the line number (starting from zero).
+Where to extract the key and value from a line is specified by `key_index` and
+`value_index`.
+
+- A value of -1 means use the line number(starting from zero), expects `int64`.
+- A value of -2 means use the whole line content, expects `string`.
+- A value >= 0 means use the index (starting at zero) of the split line based
+  on `delimiter`.
+
+table_handle: Handle to a table which will be initialized.
+filename: Filename of a vocabulary text file.
+key_index: Column index in a line to get the table `key` values from.
+value_index: Column index that represents information of a line to get the table
+  `value` values from.
+vocab_size: Number of elements of the file, use -1 if unknown.
+delimiter: Delimiter to separate fields in a line.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index b69ce1377567b1b994572abb2ab1cf5ef89c35b5..28c4ec643e588acb7068a9184237c24e0f0fd81e 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -595,7 +595,9 @@ REGISTER_OP("Mod")
     .Attr("T: {int32, int64, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
-Returns element-wise remainder of division.
+Returns element-wise remainder of division. This emulates C semantics in that
+the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+y + truncate_mod(x, y) = x`.
 
 *NOTE*: `Mod` supports broadcasting. More about broadcasting
 [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
@@ -623,12 +625,11 @@ REGISTER_OP("TruncateMod")
     .Attr("T: {int32, int64, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
-Returns element-wise remainder of division. This emulates C semantics where
+Returns element-wise remainder of division. This emulates C semantics in that
+the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+y + truncate_mod(x, y) = x`.
 
-true, this follows C semantics in that the result here is consistent
-with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
-
-*NOTE*: `Mod` supports broadcasting. More about broadcasting
+*NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
 [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 )doc");
 
@@ -662,13 +663,12 @@ Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 
 The upper regularized incomplete Gamma function is defined as:
 
-```
-Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)
-```
+\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+
 where
-```
-Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt
-```
+
+\\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+
 is the upper incomplete Gama function.
 
 Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
@@ -686,13 +686,13 @@ Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 
 The lower regularized incomplete Gamma function is defined as:
 
-```
-P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)
-```
+
+\\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+
 where
-```
-gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt
-```
+
+\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+
 is the lower incomplete Gamma function.
 
 Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
@@ -710,9 +710,9 @@ Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 
 The Hurwitz zeta function is defined as:
 
-```
-\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}
-```
+
+\\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+
 )doc");
 
 REGISTER_OP("Polygamma")
@@ -726,12 +726,27 @@ Compute the polygamma function \\(\psi^{(n)}(x)\\).
 
 The polygamma function is defined as:
 
-```
-\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)
-```
+
+\\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+
 where \\(\psi(x)\\) is the digamma function.
 )doc");
 
+REGISTER_OP("Atan2")
+    .Input("y: T")
+    .Input("x: T")
+    .Output("z: T")
+    .Attr("T: {float, double}")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
+    .Doc(R"doc(
+Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+This is the angle \( \theta \in [-\pi, \pi] \) such that
+\[ x = r \cos(\theta) \]
+and
+\[ y = r \sin(\theta) \]
+where \(r = \sqrt(x^2 + y^2) \).
+)doc");
+
 REGISTER_OP("Betainc")
     .Input("a: T")
     .Input("b: T")
@@ -775,14 +790,14 @@ Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 
 The regularized incomplete beta integral is defined as:
 
-```
-I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}
-```
+
+\\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+
 where
 
-```
-B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt
-```
+
+\\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+
 
 is the incomplete beta function and \\(B(a, b)\\) is the *complete*
 beta function.
@@ -1256,6 +1271,8 @@ REGISTER_OP("ArgMax")
     .Doc(R"doc(
 Returns the index with the largest value across dimensions of a tensor.
 
+Note that in case of ties the identity of the return value is not guaranteed.
+
 dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
   of the input Tensor to reduce across. For vectors, use dimension = 0.
 )doc");
@@ -1270,6 +1287,8 @@ REGISTER_OP("ArgMin")
     .Doc(R"doc(
 Returns the index with the smallest value across dimensions of a tensor.
 
+Note that in case of ties the identity of the return value is not guaranteed.
+
 dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
   of the input Tensor to reduce across. For vectors, use dimension = 0.
 )doc");
@@ -1396,8 +1415,8 @@ REGISTER_OP("SegmentSum")
     .Doc(R"doc(
 Computes the sum along segments of a tensor.
 
-Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)
-for an explanation of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 Computes a tensor such that
 \\(output_i = \sum_j data_j\\) where sum is over `j` such
@@ -1406,7 +1425,7 @@ that `segment_ids[j] == i`.
 If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/SegmentSum.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
 </div>
 
 segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
@@ -1426,9 +1445,8 @@ REGISTER_OP("SegmentMean")
     .Doc(R"doc(
 Computes the mean along segments of a tensor.
 
-Read [the section on
-Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 Computes a tensor such that
 \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
@@ -1438,7 +1456,7 @@ values summed.
 If the mean is empty for a given segment ID `i`, `output[i] = 0`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/SegmentMean.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
 </div>
 
 segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
@@ -1458,9 +1476,8 @@ REGISTER_OP("SegmentProd")
     .Doc(R"doc(
 Computes the product along segments of a tensor.
 
-Read [the section on
-Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 Computes a tensor such that
 \\(output_i = \prod_j data_j\\) where the product is over `j` such
@@ -1469,7 +1486,7 @@ that `segment_ids[j] == i`.
 If the product is empty for a given segment ID `i`, `output[i] = 1`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/SegmentProd.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
 </div>
 
 segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
@@ -1489,9 +1506,8 @@ REGISTER_OP("SegmentMin")
     .Doc(R"doc(
 Computes the minimum along segments of a tensor.
 
-Read [the section on
-Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 Computes a tensor such that
 \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
@@ -1500,7 +1516,7 @@ that `segment_ids[j] == i`.
 If the min is empty for a given segment ID `i`, `output[i] = 0`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/SegmentMin.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
 </div>
 
 segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
@@ -1520,8 +1536,8 @@ REGISTER_OP("SegmentMax")
     .Doc(R"doc(
 Computes the maximum along segments of a tensor.
 
-Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)
-for an explanation of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 Computes a tensor such that
 \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
@@ -1530,7 +1546,7 @@ that `segment_ids[j] == i`.
 If the max is empty for a given segment ID `i`, `output[i] = 0`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/SegmentMax.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
 </div>
 
 segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
@@ -1551,9 +1567,8 @@ REGISTER_OP("UnsortedSegmentSum")
     .Doc(R"doc(
 Computes the sum along segments of a tensor.
 
-Read [the section on
-Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 Computes a tensor such that
 `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
@@ -1566,7 +1581,7 @@ If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 `num_segments` should equal the number of distinct segment IDs.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/UnsortedSegmentSum.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
 </div>
 
 segment_ids: A tensor whose shape is a prefix of `data.shape`.
@@ -1577,7 +1592,6 @@ output: Has same shape as data, except for the first `segment_ids.rank`
 
 )doc");
 
-
 REGISTER_OP("UnsortedSegmentMax")
     .Input("data: T")
     .Input("segment_ids: Tindices")
@@ -1589,11 +1603,10 @@ REGISTER_OP("UnsortedSegmentMax")
     .Doc(R"doc(
 Computes the Max along segments of a tensor.
 
-Read [the section on
-Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
-This operator is similar to the [unsorted segment sum operator](../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
 Instead of computing the sum over segments, it computes the maximum
 such that:
 
@@ -1604,7 +1617,7 @@ If the maximum is empty for a given segment ID `i`, it outputs the smallest poss
  `output[i] = numeric_limits<T>::min()`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/UnsortedSegmentSum.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
 </div>
 
 segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
@@ -1625,9 +1638,8 @@ REGISTER_OP("SparseSegmentSum")
     .Doc(R"doc(
 Computes the sum along sparse segments of a tensor.
 
-Read [the section on
-Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
@@ -1674,9 +1686,8 @@ REGISTER_OP("SparseSegmentMean")
     .Doc(R"doc(
 Computes the mean along sparse segments of a tensor.
 
-Read [the section on
-Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
@@ -1724,9 +1735,8 @@ Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 
 N is the size of the segment being reduced.
 
-Read [the section on
-Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-of segments.
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
 
 indices: A 1-D tensor. Has same rank as `segment_ids`.
 
@@ -2365,4 +2375,35 @@ output_max: the computed max output.
 
 )doc");
 
+// --------------------------------------------------------------------------
+
+REGISTER_OP("Bucketize")
+    .Input("input: T")
+    .Output("output: int32")
+    .Attr("T: {int32, int64, float, double}")
+    .Attr("boundaries: list(float)")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Bucketizes 'input' based on 'boundaries'.
+
+For example, if the inputs are
+    boundaries = [0, 10, 100]
+    input = [[-5, 10000]
+             [150,   10]
+             [5,    100]]
+
+then the output will be
+    output = [[0, 3]
+              [3, 2]
+              [1, 3]]
+
+input: Any shape of Tensor contains with int or float type.
+boundaries: A sorted list of floats gives the boundary of the buckets.
+output: Same shape with 'input', each value of input replaced with bucket index.
+
+@compatibility(numpy)
+Equivalent to np.digitize.
+@end_compatibility
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/nn_grad.cc b/tensorflow/core/ops/nn_grad.cc
index 8a86f90e5c39b74378c9fe05d5f9bb4b2c27397b..560b71a337e2ecc673ebee73b275c4da2d335672 100644
--- a/tensorflow/core/ops/nn_grad.cc
+++ b/tensorflow/core/ops/nn_grad.cc
@@ -181,6 +181,31 @@ Status MaxPoolGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("MaxPool", MaxPoolGrad);
 
+Status AvgPoolGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  *g = FDH::Define(
+    // Arg defs
+    {"input: T", "grad: T"},
+    // Ret val defs
+    {"output: T"},
+    // Attr defs
+    {"T: {float, half} = DT_FLOAT",
+     "ksize: list(int) >= 4",
+     "strides: list(int) >= 4",
+     GetPaddingAttrString()},
+    // Nodes
+    {
+      {{"i_shape"}, "Shape", {"input"}, {{"T", "$T"}}},
+      {{"output"}, "AvgPoolGrad", {"i_shape", "grad"},
+       /*Attrs=*/{{"T", "$T"},
+                  {"ksize", "$ksize"},
+                  {"strides", "$strides"},
+                  {"padding", "$padding"}}}
+    });
+  // clang-format on
+  return Status::OK();
+}
+REGISTER_OP_GRADIENT("AvgPool", AvgPoolGrad);
 
 Status MaxPoolGradGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
@@ -213,4 +238,25 @@ Status MaxPoolGradGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("MaxPoolGrad", MaxPoolGradGrad);
 
+Status BiasAddGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  *g = FDH::Define(
+    // Arg defs
+    {"input: T", "bias: T", "grad: T"},
+    // Ret val defs
+    {"grad: T", "bias_grad: T"},
+    // Attr defs
+    {{"T: {float, double}"},
+     GetConvnetDataFormatAttrString()},
+    // Nodes
+    {
+      {{"bias_grad"}, "BiasAddGrad", {"grad"},
+           /*Attrs=*/{{"T", "$T"},
+                      {"data_format", "$data_format"}}}
+    });
+  // clang-format on
+  return Status::OK();
+}
+REGISTER_OP_GRADIENT("BiasAdd", BiasAddGrad);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index e9d5897af04e202459148fb7a5436e17531d2fd4..3e58669e30e3e43838a0455c30ed73721735db80 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -89,7 +89,7 @@ REGISTER_OP("AvgPool")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: realnumbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(shape_inference::AvgPoolShape)
     .Doc(R"doc(
 Performs average pooling on the input.
@@ -117,7 +117,7 @@ REGISTER_OP("AvgPoolGrad")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: realnumbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       // NOTE(mrry): We could in principle work out the shape from the
       // gradients and the attrs, but if we do not know orig_input_shape
@@ -272,7 +272,7 @@ REGISTER_OP("FusedBatchNorm")
     .Output("batch_variance: T")
     .Output("reserve_space_1: T")
     .Output("reserve_space_2: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
     .Attr("is_training: bool = true")
@@ -348,7 +348,7 @@ REGISTER_OP("FusedBatchNormGrad")
     .Output("offset_backprop: T")
     .Output("reserve_space_3: T")
     .Output("reserve_space_4: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
     .Attr("is_training: bool = true")
@@ -504,7 +504,7 @@ REGISTER_OP("Conv2D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -557,7 +557,7 @@ REGISTER_OP("Conv2DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -599,7 +599,7 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
@@ -735,7 +735,7 @@ REGISTER_OP("FusedResizeAndPadConv2D")
     .Input("paddings: int32")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {float}")
     .Attr("resize_align_corners: bool = false")
     .Attr(GetMirrorPadModeAttrString())
     .Attr("strides: list(int)")
@@ -777,7 +777,7 @@ REGISTER_OP("FusedPadConv2D")
     .Input("paddings: int32")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {float}")
     .Attr(GetMirrorPadModeAttrString())
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
@@ -939,7 +939,7 @@ REGISTER_OP("Conv3D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
@@ -971,7 +971,7 @@ REGISTER_OP("Conv3DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropInputV2")
@@ -997,7 +997,7 @@ REGISTER_OP("Conv3DBackpropFilter")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropFilterV2")
@@ -1026,7 +1026,7 @@ REGISTER_OP("Conv3DBackpropInputV2")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
@@ -1063,7 +1063,7 @@ REGISTER_OP("Conv3DBackpropFilterV2")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
@@ -1104,7 +1104,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D average pooling on the input.
@@ -1131,7 +1131,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: numbertype")
+    .Attr("T: {float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -1166,7 +1166,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: numbertype")
+    .Attr("T: {float}")
     .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D max pooling on the input.
@@ -1190,12 +1190,12 @@ REGISTER_OP("MaxPool3DGrad")
     .Input("orig_output: TInput")
     .Input("grad: T")
     .Output("output: T")
-    .Attr("ksize: list(int) >= 5 ")
+    .Attr("ksize: list(int) >= 5")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: numbertype = DT_FLOAT")
-    .Attr("TInput: numbertype = DT_FLOAT")
+    .Attr("T: {float} = DT_FLOAT")
+    .Attr("TInput: {float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     })
@@ -1226,7 +1226,7 @@ REGISTER_OP("MaxPool3DGradGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: realnumbertype")
+    .Attr("T: {float}")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Pool3DShape(c));
       ShapeHandle unused;
@@ -1260,7 +1260,7 @@ data_format: The data format of the input and output data. With the
 REGISTER_OP("L2Loss")
     .Input("t: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 L2 Loss.
@@ -1748,7 +1748,7 @@ backprops: The gradients:
 REGISTER_OP("Elu")
     .Input("features: T")
     .Output("activations: T")
-    .Attr("T: realnumbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
@@ -1761,7 +1761,7 @@ REGISTER_OP("EluGrad")
     .Input("gradients: T")
     .Input("outputs: T")
     .Output("backprops: T")
-    .Attr("T: realnumbertype")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
     .Doc(R"doc(
 Computes gradients for the exponential linear (Elu) operation.
@@ -2612,10 +2612,10 @@ scale_after_normalization: A bool indicating whether the resulted tensor
 )doc");
 
 #ifdef INTEL_MKL
-REGISTER_OP("MklConv2D")
+REGISTER_OP("_MklConv2D")
     .Input("input: T")
-    .Input("mkl_input: uint8")
     .Input("filter: T")
+    .Input("mkl_input: uint8")
     .Input("mkl_filter: uint8")
     .Output("output: T")
     .Output("mkl_output: uint8")
@@ -2632,12 +2632,12 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklConv2DWithBias")
+REGISTER_OP("_MklConv2DWithBias")
     .Input("input: T")
-    .Input("mkl_input: uint8")
     .Input("filter: T")
-    .Input("mkl_filter: uint8")
     .Input("bias: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
     .Input("mkl_bias: uint8")
     .Output("output: T")
     .Output("mkl_output: uint8")
@@ -2654,12 +2654,12 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklConv2DBackpropFilter")
+REGISTER_OP("_MklConv2DBackpropFilter")
     .Input("input: T")
-    .Input("mkl_input: uint8")
     .Input("filter_sizes: int32")
-    .Input("mkl_filter_size: uint8")
     .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter_size: uint8")
     .Input("mkl_out_backprop: uint8")
     .Output("output: T")
     .Output("mkl_output: uint8")
@@ -2669,7 +2669,7 @@ REGISTER_OP("MklConv2DBackpropFilter")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .SetShapeFn([](InferenceContext* c) {
-      return InputTensorShapeOrUnknown(c, 2 /* input_idx */, 4 /* ndims */);
+      return InputTensorShapeOrUnknown(c, 1 /* input_idx */, 4 /* ndims */);
     })
     .Doc(R"doc(
 MKL version of Conv2DBackpropFilter. Uses MKL DNN APIs to compute the
@@ -2679,7 +2679,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklConv2DWithBiasBackpropBias")
+REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
     .Output("output: T")
@@ -2695,12 +2695,12 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklConv2DBackpropInput")
+REGISTER_OP("_MklConv2DBackpropInput")
     .Input("input_sizes: int32")
-    .Input("mkl_input_sizes: uint8")
     .Input("filter: T")
-    .Input("mkl_filter: uint8")
     .Input("out_backprop: T")
+    .Input("mkl_input_sizes: uint8")
+    .Input("mkl_filter: uint8")
     .Input("mkl_out_backprop: uint8")
     .Output("output: T")
     .Output("mkl_output: uint8")
@@ -2720,7 +2720,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklRelu")
+REGISTER_OP("_MklRelu")
     .Input("features: T")
     .Input("mkl_features: uint8")
     .Output("activations: T")
@@ -2734,10 +2734,10 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklReluGrad")
+REGISTER_OP("_MklReluGrad")
     .Input("gradients: T")
-    .Input("mkl_gradients: uint8")
     .Input("features: T")
+    .Input("mkl_gradients: uint8")
     .Input("mkl_features: uint8")
     .Output("backprops: T")
     .Output("mkl_backprops: uint8")
@@ -2751,7 +2751,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklMaxPool")
+REGISTER_OP("_MklMaxPool")
     .Attr("T: {float, half} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
@@ -2761,8 +2761,8 @@ REGISTER_OP("MklMaxPool")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-    .Output("mkl_output: uint8")
     .Output("workspace: T")
+    .Output("mkl_output: uint8")
     .Output("mkl_workspace: uint8")
     .SetShapeFn(shape_inference::MaxPoolShape)
     .Doc(R"doc(
@@ -2773,7 +2773,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklMaxPoolGrad")
+REGISTER_OP("_MklMaxPoolGrad")
     .Attr("T: {float, half} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
@@ -2781,12 +2781,12 @@ REGISTER_OP("MklMaxPoolGrad")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Input("orig_input: T")
-    .Input("mkl_orig_input: uint8")
     .Input("orig_output: T")
-    .Input("mkl_orig_output: uint8")
     .Input("grad: T")
-    .Input("mkl_grad: uint8")
     .Input("workspace: T")
+    .Input("mkl_orig_input: uint8")
+    .Input("mkl_orig_output: uint8")
+    .Input("mkl_grad: uint8")
     .Input("mkl_workspace: uint8")
     .Output("output: T")
     .Output("mkl_output: uint8")
@@ -2801,7 +2801,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklAvgPool")
+REGISTER_OP("_MklAvgPool")
     .Input("value: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
@@ -2820,10 +2820,10 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklAvgPoolGrad")
+REGISTER_OP("_MklAvgPoolGrad")
     .Input("orig_input_shape: int32")
-    .Input("mkl_orig_input: uint8")
     .Input("grad: T")
+    .Input("mkl_orig_input: uint8")
     .Input("mkl_grad: uint8")
     .Output("output: T")
     .Output("mkl_output: uint8")
@@ -2843,7 +2843,212 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-REGISTER_OP("MklToTf")
+REGISTER_OP("_MklLRN")
+    .Input("input: T")
+    .Input("mkl_input: uint8")
+    .Output("output: T")
+    .Output("workspace: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_workspace: uint8")
+    .Attr("depth_radius: int = 5")
+    .Attr("bias: float = 1.0")
+    .Attr("alpha: float = 1.0")
+    .Attr("beta: float = 0.5")
+    .Attr("workspace_enabled: bool = false")
+    .Attr("T: {float, half} = DT_FLOAT")
+    .SetShapeFn([](InferenceContext* c) {
+      return UnchangedShapeWithRank(c, 4);
+    })
+    .Doc(R"doc(
+MKL version of LRN operator. Uses MKL DNN APIs to perform local response
+normalization.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklLRNGrad")
+    .Input("input_grads: T")
+    .Input("input_image: T")
+    .Input("output_image: T")
+    .Input("workspace: T")
+    .Input("mkl_input_grads: uint8")
+    .Input("mkl_input_image: uint8")
+    .Input("mkl_output_image: uint8")
+    .Input("mkl_workspace: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("depth_radius: int = 5")
+    .Attr("bias: float = 1.0")
+    .Attr("alpha: float = 1.0")
+    .Attr("beta: float = 0.5")
+    .Attr("workspace_enabled: bool = false")
+    .Attr("T: {float, half} = DT_FLOAT")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &s));  // input_grads
+      TF_RETURN_IF_ERROR(c->Merge(s, c->input(1), &s));     // input_image
+      TF_RETURN_IF_ERROR(c->Merge(s, c->input(2), &s));     // output_image
+      c->set_output(0, s);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of LRNGrad operator. Uses MKL DNN APIs to compute gradient for
+local response normalization.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklFusedBatchNorm")
+    .Input("x: T")
+    .Input("scale: T")
+    .Input("offset: T")
+    .Input("mean: T")
+    .Input("variance: T")
+    .Input("mkl_x: uint8")
+    .Input("mkl_scale: uint8")
+    .Input("mkl_offset: uint8")
+    .Input("mkl_mean: uint8")
+    .Input("mkl_variance: uint8")
+    .Output("y: T")
+    .Output("batch_mean: T")
+    .Output("batch_variance: T")
+    .Output("reserve_space_1: T")
+    .Output("reserve_space_2: T")
+    .Output("mkl_y: uint8")
+    .Output("mkl_batch_mean: uint8")
+    .Output("mkl_batch_variance: uint8")
+    .Output("mkl_reserve_space_1: uint8")
+    .Output("mkl_reserve_space_2: uint8")
+    .Attr("T: numbertype")
+    .Attr("epsilon: float = 0.0001")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("is_training: bool = true")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle x;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &x));
+
+      bool is_training;
+      c->GetAttr("is_training", &is_training);
+      int number_inputs = (is_training) ? 3 : 5;
+      string data_format;
+      c->GetAttr("data_format", &data_format);
+      DimensionHandle channel_dim =
+          (data_format == "NHWC") ? c->Dim(x, 3) : c->Dim(x, 1);
+
+      // covers scale, offset, and if is_training is false, mean, variance
+      for (int i = 1; i < number_inputs; ++i) {
+        ShapeHandle vec;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &vec));
+        TF_RETURN_IF_ERROR(c->Merge(channel_dim, c->Dim(vec, 0), &channel_dim));
+      }
+
+      ShapeHandle y;
+      if (data_format == "NHWC") {
+        TF_RETURN_IF_ERROR(c->ReplaceDim(x, 3, channel_dim, &y));
+      } else {
+        TF_RETURN_IF_ERROR(c->ReplaceDim(x, 1, channel_dim, &y));
+      }
+      c->set_output(0, y);
+      ShapeHandle vector_shape = c->Vector(channel_dim);
+      c->set_output(1, vector_shape);
+      c->set_output(2, vector_shape);
+      c->set_output(3, vector_shape);
+      c->set_output(4, vector_shape);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of FusedBatchNorm operator. Uses MKL DNN APIs to perform fused
+batch normalization.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklFusedBatchNormGrad")
+    .Input("y_backprop: T")
+    .Input("x: T")
+    .Input("scale: T")
+    .Input("reserve_space_1: T")
+    .Input("reserve_space_2: T")
+    .Input("mkl_y_backprop: uint8")
+    .Input("mkl_x: uint8")
+    .Input("mkl_scale: uint8")
+    .Input("mkl_reserve_space_1: uint8")
+    .Input("mkl_reserve_space_2: uint8")
+    .Output("x_backprop: T")
+    .Output("scale_backprop: T")
+    .Output("offset_backprop: T")
+    .Output("reserve_space_3: T")
+    .Output("reserve_space_4: T")
+    .Output("mkl_x_backprop: uint8")
+    .Output("mkl_scale_backprop: uint8")
+    .Output("mkl_offset_backprop: uint8")
+    .Output("mkl_reserve_space_3: uint8")
+    .Output("mkl_reserve_space_4: uint8")
+    .Attr("T: numbertype")
+    .Attr("epsilon: float = 0.0001")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("is_training: bool = true")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle y_backprop;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &y_backprop));
+      ShapeHandle x;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &x));
+
+      bool is_training;
+      string data_format;
+      c->GetAttr("is_training", &is_training);
+      c->GetAttr("data_format", &data_format);
+      DimensionHandle channel_dim = (data_format == "NHWC")
+                                        ? c->Dim(y_backprop, 3)
+                                        : c->Dim(y_backprop, 1);
+      if (data_format == "NHWC") {
+        TF_RETURN_IF_ERROR(c->Merge(channel_dim, c->Dim(x, 3), &channel_dim));
+      } else {
+        TF_RETURN_IF_ERROR(c->Merge(channel_dim, c->Dim(x, 1), &channel_dim));
+      }
+
+      // covers scale, mean (reserve_space_1), variance (reserve_space_2)
+      for (int i = 2; i < 5; ++i) {
+        ShapeHandle vec;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &vec));
+        TF_RETURN_IF_ERROR(c->Merge(channel_dim, c->Dim(vec, 0), &channel_dim));
+      }
+
+      ShapeHandle x_backprop;
+      if (data_format == "NHWC") {
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(y_backprop, 3, channel_dim, &x_backprop));
+      } else {
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(y_backprop, 1, channel_dim, &x_backprop));
+      }
+      c->set_output(0, x_backprop);
+      c->set_output(1, c->Vector(channel_dim));
+      c->set_output(2, c->Vector(channel_dim));
+      // Set the correct shapes for reserve_spaces
+      // so that gradients can be performed when
+      // the op is in a symbolic condition.
+      if (is_training) {
+        c->set_output(3, c->Vector(0));
+        c->set_output(4, c->Vector(0));
+      } else {
+        c->set_output(3, c->Vector(channel_dim));
+        c->set_output(4, c->Vector(channel_dim));
+      }
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of FusedBatchNormGrad operator. Uses MKL DNN APIs to compute
+gradients for fused batch normalization.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklToTf")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 2f85351d610129deda422c8772562f59c1e8b9d8..c7a30d5ae26865a4efa50932b88440d8281b9497 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -15,8 +15,8 @@ op {
       b: false
     }
   }
-  summary: "Raise a exception to abort the process when called. If exit_without_error is true, the process will exit normally, otherwise it will exit with a SIGABORT signal."
-  description: "Returns nothing but an exception."
+  summary: "Raise a exception to abort the process when called."
+  description: "If exit_without_error is true, the process will exit normally,\notherwise it will exit with a SIGABORT signal.\n\nReturns nothing but an exception."
 }
 op {
   name: "Abs"
@@ -85,8 +85,8 @@ op {
       }
     }
   }
-  summary: "Applies a gradient to a given accumulator. Does not add if local_step is lesser"
-  description: "than the accumulator\'s global_step."
+  summary: "Applies a gradient to a given accumulator."
+  description: "Does not add if local_step is lesser than the accumulator\'s global_step."
 }
 op {
   name: "AccumulatorNumAccumulated"
@@ -116,8 +116,8 @@ op {
     description: "The new global_step value to set."
     type: DT_INT64
   }
-  summary: "Updates the accumulator with a new value for global_step. Logs warning if the"
-  description: "accumulator\'s value is already higher than new_global_step."
+  summary: "Updates the accumulator with a new value for global_step."
+  description: "Logs warning if the accumulator\'s value is already higher than\nnew_global_step."
 }
 op {
   name: "AccumulatorTakeGradient"
@@ -160,8 +160,8 @@ op {
       }
     }
   }
-  summary: "Extracts the average gradient in the given ConditionalAccumulator, provided"
-  description: "that sufficient (i.e., more than num_required) gradients have been accumulated.\nThe op blocks until sufficient gradients have been accumulated.\nIf the accumulator has already aggregated more than num_required gradients, it\nreturns the average of the accumulated gradients.\nAlso automatically increments the recorded global_step in the accumulator by 1,\nand resets the aggregate to 0."
+  summary: "Extracts the average gradient in the given ConditionalAccumulator."
+  description: "The op blocks until sufficient (i.e., more than num_required)\ngradients have been accumulated.  If the accumulator has already\naggregated more than num_required gradients, it returns the average of\nthe accumulated gradients.  Also automatically increments the recorded\nglobal_step in the accumulator by 1, and resets the aggregate to 0."
 }
 op {
   name: "Acos"
@@ -538,7 +538,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to produce per batch."
+    description: "Number of candidates to produce."
     has_minimum: true
     minimum: 1
   }
@@ -565,6 +565,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a learned unigram distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Any"
@@ -923,6 +924,14 @@ op {
     }
     description: "If `True`, updating of the var, m, and v tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, uses the nesterov update."
+  }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)"
 }
@@ -1551,6 +1560,7 @@ op {
     }
   }
   summary: "Returns the index with the largest value across dimensions of a tensor."
+  description: "Note that in case of ties the identity of the return value is not guaranteed."
 }
 op {
   name: "ArgMin"
@@ -1603,6 +1613,7 @@ op {
     }
   }
   summary: "Returns the index with the smallest value across dimensions of a tensor."
+  description: "Note that in case of ties the identity of the return value is not guaranteed."
 }
 op {
   name: "AsString"
@@ -1903,6 +1914,66 @@ op {
   }
   summary: "Computes atan of x element-wise."
 }
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  summary: "Computes arctangent of `y/x` element-wise, respecting signs of the arguments."
+  description: "This is the angle \\( \\theta \\in [-\\pi, \\pi] \\) such that\n\\[ x = r \\cos(\\theta) \\]\nand\n\\[ y = r \\sin(\\theta) \\]\nwhere \\(r = \\sqrt(x^2 + y^2) \\)."
+}
+op {
+  name: "AudioSpectrogram"
+  input_arg {
+    name: "input"
+    description: "Float representation of audio data."
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "spectrogram"
+    description: "3D representation of the audio frequencies as an image."
+    type: DT_FLOAT
+  }
+  attr {
+    name: "window_size"
+    type: "int"
+    description: "How wide the input window is in samples. For the highest efficiency\nthis should be a power of two, but other values are accepted."
+  }
+  attr {
+    name: "stride"
+    type: "int"
+    description: "How widely apart the center of adjacent sample windows should be."
+  }
+  attr {
+    name: "magnitude_squared"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "Whether to return the squared magnitude or just the\nmagnitude. Using squared magnitude can avoid extra calculations."
+  }
+  summary: "Produces a visualization of audio data over time."
+  description: "Spectrograms are a standard way of representing audio information as a series of\nslices of frequency information, one slice for each window of time. By joining\nthese together into a sequence, they form a distinctive fingerprint of the sound\nover time.\n\nThis op expects to receive audio data as an input, stored as floats in the range\n-1 to 1, together with a window width in samples, and a stride specifying how\nfar to move the window between slices. From this it generates a three\ndimensional output. The lowest dimension has an amplitude value for each\nfrequency during that time slice. The next dimension is time, with successive\nfrequency slices. The final dimension is for the channels in the input, so a\nstereo audio input would have two here for example.\n\nThis means the layout when converted and saved as an image is rotated 90 degrees\nclockwise from a typical spectrogram. Time is descending down the Y axis, and\nthe frequency decreases from left to right.\n\nEach value in the result represents the square root of the sum of the real and\nimaginary parts of an FFT on the current window of samples. In this way, the\nlowest dimension represents the power of each frequency in the current window,\nand adjacent windows are concatenated in the next dimension.\n\nTo get a more intuitive and visual look at what this operation does, you can run\ntensorflow/examples/wav_to_spectrogram to read in an audio file and save out the\nresulting spectrogram as a PNG image."
+}
 op {
   name: "AudioSummary"
   input_arg {
@@ -2033,8 +2104,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
@@ -2100,18 +2171,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -2180,18 +2239,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -2258,8 +2305,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
@@ -2333,7 +2380,7 @@ op {
     default_value {
       b: false
     }
-    description: "If true, all pending enqueue requests that are\nblocked on the barrier\'s queue will be cancelled. InsertMany will fail, even\nif no new key is introduced."
+    description: "If true, all pending enqueue requests that are\nblocked on the barrier\'s queue will be canceled. InsertMany will fail, even\nif no new key is introduced."
   }
   summary: "Closes the given barrier."
   description: "This operation signals that no more new elements will be inserted in the\ngiven barrier. Subsequent InsertMany that try to introduce a new key will fail.\nSubsequent InsertMany operations that just add missing components to already\nexisting elements will continue to succeed. Subsequent TakeMany operations will\ncontinue to succeed if sufficient completed elements remain in the barrier.\nSubsequent TakeMany operations that would block will fail immediately."
@@ -2513,6 +2560,36 @@ op {
     explanation: "Use CholeskyGrad instead."
   }
 }
+op {
+  name: "BatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "batch_size"
+    description: "A scalar representing the number of elements to accumulate in a\nbatch."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that batches `batch_size` elements from `input_dataset`."
+  is_stateful: true
+}
 op {
   name: "BatchFFT"
   input_arg {
@@ -3202,7 +3279,7 @@ op {
   }
   output_arg {
     name: "output"
-    description: "4-D with shape `[batch, height, width, depth]`, where:\n\n      height = height_pad - crop_top - crop_bottom\n      width = width_pad - crop_left - crop_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```prettyprint\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]], [[5], [7]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```"
+    description: "4-D with shape `[batch, height, width, depth]`, where:\n\n      height = height_pad - crop_top - crop_bottom\n      width = width_pad - crop_left - crop_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```\nx = [[[[1], [3]], [[5], [7]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```"
     type_attr: "T"
   }
   attr {
@@ -3245,7 +3322,7 @@ op {
   }
   input_arg {
     name: "crops"
-    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input\n  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is\n  required that\n  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.\n\nThis operation is equivalent to the following steps:\n\n1. Reshape `input` to `reshaped` of shape:\n     [block_shape[0], ..., block_shape[M-1],\n      batch / prod(block_shape),\n      input_shape[1], ..., input_shape[N-1]]\n\n2. Permute dimensions of `reshaped` to produce `permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1], block_shape[0],\n      ...,\n      input_shape[M], block_shape[M-1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\n3. Reshape `permuted` to produce `reshaped_permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0],\n      ...,\n      input_shape[M] * block_shape[M-1],\n\n      input_shape[M+1],\n      ...,\n      input_shape[N-1]]\n\n4. Crop the start and end of dimensions `[1, ..., M]` of\n   `reshaped_permuted` according to `crops` to produce the output of shape:\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],\n      ...,\n      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```prettyprint\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [2, 0]]`:\n\n```prettyprint\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```"
+    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input\n  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is\n  required that\n  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.\n\nThis operation is equivalent to the following steps:\n\n1. Reshape `input` to `reshaped` of shape:\n     [block_shape[0], ..., block_shape[M-1],\n      batch / prod(block_shape),\n      input_shape[1], ..., input_shape[N-1]]\n\n2. Permute dimensions of `reshaped` to produce `permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1], block_shape[0],\n      ...,\n      input_shape[M], block_shape[M-1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\n3. Reshape `permuted` to produce `reshaped_permuted` of shape\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0],\n      ...,\n      input_shape[M] * block_shape[M-1],\n\n      input_shape[M+1],\n      ...,\n      input_shape[N-1]]\n\n4. Crop the start and end of dimensions `[1, ..., M]` of\n   `reshaped_permuted` according to `crops` to produce the output of shape:\n     [batch / prod(block_shape),\n\n      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],\n      ...,\n      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],\n\n      input_shape[M+1], ..., input_shape[N-1]]\n\nSome examples:\n\n(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\n(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\nThe output tensor has shape `[1, 2, 2, 3]` and value:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\n(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\nThe output tensor has shape `[1, 4, 4, 1]` and value:\n\n```\nx = [[[1],   [2],  [3],  [4]],\n     [[5],   [6],  [7],  [8]],\n     [[9],  [10], [11],  [12]],\n     [[13], [14], [15],  [16]]]\n```\n\n(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and\n    `crops = [[0, 0], [2, 0]]`:\n\n```\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nThe output tensor has shape `[2, 2, 4, 1]` and value:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```"
     type_attr: "Tcrops"
   }
   output_arg {
@@ -3314,7 +3391,7 @@ op {
     }
   }
   summary: "Compute the regularized incomplete beta integral \\\\(I_x(a, b)\\\\)."
-  description: "The regularized incomplete beta integral is defined as:\n\n```\nI_x(a, b) = \\frac{B(x; a, b)}{B(a, b)}\n```\nwhere\n\n```\nB(x; a, b) = \\int_0^x t^{a-1} (1 - t)^{b-1} dt\n```\n\nis the incomplete beta function and \\\\(B(a, b)\\\\) is the *complete*\nbeta function."
+  description: "The regularized incomplete beta integral is defined as:\n\n\n\\\\(I_x(a, b) = \\frac{B(x; a, b)}{B(a, b)}\\\\)\n\nwhere\n\n\n\\\\(B(x; a, b) = \\int_0^x t^{a-1} (1 - t)^{b-1} dt\\\\)\n\n\nis the incomplete beta function and \\\\(B(a, b)\\\\) is the *complete*\nbeta function."
 }
 op {
   name: "BiasAdd"
@@ -3623,6 +3700,38 @@ op {
   summary: "Return the reduction indices for computing gradients of s0 op s1 with broadcast."
   description: "This is typically used by gradient computations for a broadcasting operation."
 }
+op {
+  name: "Bucketize"
+  input_arg {
+    name: "input"
+    description: "Any shape of Tensor contains with int or float type."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "Same shape with \'input\', each value of input replaced with bucket index.\n\n@compatibility(numpy)\nEquivalent to np.digitize.\n@end_compatibility"
+    type: DT_INT32
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "boundaries"
+    type: "list(float)"
+    description: "A sorted list of floats gives the boundary of the buckets."
+  }
+  summary: "Bucketizes \'input\' based on \'boundaries\'."
+  description: "For example, if the inputs are\n    boundaries = [0, 10, 100]\n    input = [[-5, 10000]\n             [150,   10]\n             [5,    100]]\n\nthen the output will be\n    output = [[0, 3]\n              [3, 2]\n              [1, 3]]"
+}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -3774,6 +3883,14 @@ op {
     }
     description: "Scalar.  If set to false, *during* CTC calculation\nrepeated non-blank labels will not be merged and are interpreted as\nindividual labels.  This is a simplified version of CTC."
   }
+  attr {
+    name: "ignore_longer_outputs_than_inputs"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "Scalar. If set to true, during CTC\ncalculation items have longer input sequences than output sequences\nare ignored by returning zero-gradient for those items."
+  }
   summary: "Calculates the CTC Loss (log probability) for each batch entry.  Also calculates"
   description: "the gradient.  This class performs the softmax operation for you, so inputs\nshould be e.g. linear projections of outputs by an LSTM."
 }
@@ -4082,7 +4199,7 @@ op {
   }
   output_arg {
     name: "offset"
-    description: "The `N` int32 vectors representing the starting offset\n        of input tensors within the concatenated output.\n\nThis is typically used by gradient computations for a concat operation."
+    description: "The `N` int32 vectors representing the starting offset\nof input tensors within the concatenated output."
     type: DT_INT32
     number_attr: "N"
   }
@@ -4093,7 +4210,7 @@ op {
     minimum: 2
   }
   summary: "Computes offsets of concat inputs within its output."
-  description: "For example:\n\n```prettyprint\n# \'x\' is [2, 2, 7]\n# \'y\' is [2, 3, 7]\n# \'z\' is [2, 5, 7]\nconcat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]\n```"
+  description: "For example:\n\n```\n# \'x\' is [2, 2, 7]\n# \'y\' is [2, 3, 7]\n# \'z\' is [2, 5, 7]\nconcat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]\n```\n\nThis is typically used by gradient computations for a concat operation."
 }
 op {
   name: "ConcatV2"
@@ -4188,10 +4305,10 @@ op {
     default_value {
       s: ""
     }
-    description: "If non-empty, this accumulator will be shared under the given name\nacross multiple sessions."
+    description: "If non-empty, this accumulator will be shared under the\ngiven name across multiple sessions."
   }
-  summary: "A conditional accumulator for aggregating gradients. The accumulator accepts"
-  description: "gradients marked with local_step greater or equal to the most recent global_step\nknown to the accumulator. The average can be extracted from the accumulator,\nprovided sufficient gradients have been accumulated. Extracting the average\nautomatically resets the aggregate to 0, and increments the global_step recorded\nby the accumulator."
+  summary: "A conditional accumulator for aggregating gradients."
+  description: "The accumulator accepts gradients marked with local_step greater or\nequal to the most recent global_step known to the accumulator. The\naverage can be extracted from the accumulator, provided sufficient\ngradients have been accumulated. Extracting the average automatically\nresets the aggregate to 0, and increments the global_step recorded by\nthe accumulator."
   is_stateful: true
 }
 op {
@@ -4266,7 +4383,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4339,7 +4455,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4411,7 +4526,6 @@ op {
       list {
         type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -4477,18 +4591,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4555,18 +4657,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4622,18 +4712,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4699,18 +4777,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4766,18 +4832,6 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -4839,8 +4893,17 @@ op {
     }
     description: "The name of the input tensor."
   }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
+  }
   summary: "Copy Op."
-  description: "Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the\ndevice on which the tensor is allocated.\n\nUnlike the CopyHost Op, this op does not have HostMemory constraint on its\ninput or output."
+  description: "Performs CPU-to-CPU or GPU-to-GPU deep-copying of tensor, depending on the\ndevice on which the tensor is allocated.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the CopyHost Op, this op does not have HostMemory constraint on its\ninput or output."
   allows_uninitialized_input: true
 }
 op {
@@ -4867,8 +4930,17 @@ op {
     }
     description: "The name of the input tensor."
   }
+  attr {
+    name: "debug_ops_spec"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+    description: "A list of debug op spec (op, url, gated_grpc) for attached debug\nops. Each element of the list has the format\n<debug_op>;<grpc_url>;<gated_grpc>, wherein gated_grpc is boolean represented\nas 0/1. E.g., \"DebugIdentity;grpc://foo:3333;1\",\n\"DebugIdentity;file:///tmp/tfdbg_1;0\"."
+  }
   summary: "Copy Host Op."
-  description: "Performs CPU-to-CPU deep-copying of tensor.\n\nUnlike the Copy Op, this op has HostMemory constraint on its input or output."
+  description: "Performs CPU-to-CPU deep-copying of tensor.\nN.B.: If the all downstream attached debug ops are disabled given the current\ngRPC gating status, the output will simply forward the input tensor without\ndeep-copying. See the documentation of Debug* ops for more details.\n\nUnlike the Copy Op, this op has HostMemory constraint on its input or output."
   allows_uninitialized_input: true
 }
 op {
@@ -5306,6 +5378,14 @@ op {
     }
     description: "List of URLs to debug targets, e.g.,\nfile:///foo/tfdbg_dump, grpc:://localhost:11011"
   }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "Whether this op will be gated. If any of the debug_urls of this\ndebug node is of the grpc:// scheme, when the value of this attribute is set\nto True, the data will not actually be sent via the grpc stream unless this\ndebug op has been enabled at the debug_url. If all of the debug_urls of this\ndebug node are of the grpc:// scheme and the debug op is enabled at none of\nthem, the output will be an empty Tensor."
+  }
   summary: "Debug Identity Op."
   description: "Provides an identity mapping of the non-Ref type input tensor for debugging."
   allows_uninitialized_input: true
@@ -5341,7 +5421,15 @@ op {
       list {
       }
     }
-    description: "List of URLs to debug targets, e.g.,\nfile:///foo/tfdbg_dump, grpc:://localhost:11011"
+    description: "List of URLs to debug targets, e.g.,\nfile:///foo/tfdbg_dump, grpc:://localhost:11011."
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "Whether this op will be gated. If any of the debug_urls of this\ndebug node is of the grpc:// scheme, when the value of this attribute is set\nto True, the data will not actually be sent via the grpc stream unless this\ndebug op has been enabled at the debug_url. If all of the debug_urls of this\ndebug node are of the grpc:// scheme and the debug op is enabled at none of\nthem, the output will be an empty Tensor."
   }
   summary: "Debug NaN Value Counter Op"
   description: "Counts number of NaNs in the input tensor, for debugging."
@@ -5404,6 +5492,14 @@ op {
     }
     description: "(bool) Do not send data to the debug URLs unless at least one\nof elements [2], [3] and [7] (i.e., the nan count and the generalized -inf and\ninf counts) is non-zero."
   }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "Whether this op will be gated. If any of the debug_urls of this\ndebug node is of the grpc:// scheme, when the value of this attribute is set\nto True, the data will not actually be sent via the grpc stream unless this\ndebug op has been enabled at the debug_url. If all of the debug_urls of this\ndebug node are of the grpc:// scheme and the debug op is enabled at none of\nthem, the output will be an empty Tensor."
+  }
   summary: "Debug Numeric Summary Op."
   description: "Provide a basic summary of numeric value types, range and distribution."
   allows_uninitialized_input: true
@@ -5423,6 +5519,28 @@ op {
   summary: "Decode web-safe base64-encoded strings."
   description: "Input may or may not have padding at the end. See EncodeBase64 for padding.\nWeb-safe means that input must use - and _ instead of + and /."
 }
+op {
+  name: "DecodeBmp"
+  input_arg {
+    name: "contents"
+    description: "0-D.  The BMP-encoded image."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "image"
+    description: "3-D with shape `[height, width, channels]`. RGB order"
+    type: DT_UINT8
+  }
+  attr {
+    name: "channels"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  summary: "Decode the first frame of a BMP-encoded image to a uint8 tensor."
+  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the BMP-encoded image.\n*   3: output an RGB image.\n*   4: output an RGBA image."
+}
 op {
   name: "DecodeCSV"
   input_arg {
@@ -5478,7 +5596,7 @@ op {
     type: DT_UINT8
   }
   summary: "Decode the first frame of a GIF-encoded image to a uint8 tensor."
-  description: "GIF with frame or transparency compression are not supported\nconvert animated GIF from compressed to uncompressed by:\n\nconvert $src.gif -coalesce $dst.gif"
+  description: "GIF with frame or transparency compression are not supported\nconvert animated GIF from compressed to uncompressed by:\n\n    convert $src.gif -coalesce $dst.gif\n\nThis op also supports decoding JPEGs and PNGs, though it is cleaner to use\n`tf.image.decode_image`."
 }
 op {
   name: "DecodeJSONExample"
@@ -5556,7 +5674,7 @@ op {
     description: "string specifying a hint about the algorithm used for\ndecompression.  Defaults to \"\" which maps to a system-specific\ndefault.  Currently valid values are [\"INTEGER_FAST\",\n\"INTEGER_ACCURATE\"].  The hint may be ignored (e.g., the internal\njpeg library changes to a version that does not have that specific\noption.)"
   }
   summary: "Decode a JPEG-encoded image to a uint8 tensor."
-  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the JPEG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n\nIf needed, the JPEG-encoded image is transformed to match the requested number\nof color channels.\n\nThe attr `ratio` allows downscaling the image by an integer factor during\ndecoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than\ndownscaling the image later."
+  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the JPEG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n\nIf needed, the JPEG-encoded image is transformed to match the requested number\nof color channels.\n\nThe attr `ratio` allows downscaling the image by an integer factor during\ndecoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than\ndownscaling the image later.\n\nThis op also supports decoding PNGs and non-animated GIFs since the interface is\nthe same, though it is cleaner to use `tf.image.decode_image`."
 }
 op {
   name: "DecodePng"
@@ -5592,7 +5710,7 @@ op {
     }
   }
   summary: "Decode a PNG-encoded image to a uint8 or uint16 tensor."
-  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the PNG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n*   4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels."
+  description: "The attr `channels` indicates the desired number of color channels for the\ndecoded image.\n\nAccepted values are:\n\n*   0: Use the number of channels in the PNG-encoded image.\n*   1: output a grayscale image.\n*   3: output an RGB image.\n*   4: output an RGBA image.\n\nIf needed, the PNG-encoded image is transformed to match the requested number\nof color channels.\n\nThis op also supports decoding JPEGs and non-animated GIFs since the interface\nis the same, though it is cleaner to use `tf.image.decode_image`."
 }
 op {
   name: "DecodeRaw"
@@ -5734,11 +5852,47 @@ op {
   description: "See SetOperationOp::SetOperationFromContext for values of `set_operation`.\n\nOutput `result` is a `SparseTensor` represented by `result_indices`,\n`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this\nhas rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`\ndimension contains the result of `set_operation` applied to the corresponding\n`[0...n-1]` dimension of `set`."
 }
 op {
-  name: "DenseToSparseSetOperation"
+  name: "DenseToSparseBatchDataset"
   input_arg {
-    name: "set1"
-    description: "`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.\nDimension `n` contains values in a set, duplicates are allowed but ignored."
-    type_attr: "T"
+    name: "input_dataset"
+    description: "A handle to an input dataset. Must have a single component."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "batch_size"
+    description: "A scalar representing the number of elements to accumulate in a\nbatch."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    description: "A vector representing the dense shape of each row in the produced\nSparseTensor."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that yields a SparseTensor for each element of the input."
+  is_stateful: true
+}
+op {
+  name: "DenseToSparseSetOperation"
+  input_arg {
+    name: "set1"
+    description: "`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.\nDimension `n` contains values in a set, duplicates are allowed but ignored."
+    type_attr: "T"
   }
   input_arg {
     name: "set2_indices"
@@ -5821,7 +5975,7 @@ op {
     minimum: 2
   }
   summary: "DepthToSpace for tensors of type T."
-  description: "Rearranges data from depth into blocks of spatial data.\nThis is the reverse transformation of SpaceToDepth. More specifically,\nthis op outputs a copy of the input tensor where values from the `depth`\ndimension are moved in spatial blocks to the `height` and `width` dimensions.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Chunks of data of size `block_size * block_size` from depth are rearranged\n    into non-overlapping blocks of size `block_size x block_size`\n  * The width the output tensor is `input_depth * block_size`, whereas the\n    height is `input_height * block_size`.\n  * The depth of the input tensor must be divisible by\n    `block_size * block_size`.\n\nThat is, assuming the input is in the shape:\n`[batch, height, width, depth]`,\nthe shape of the output will be:\n`[batch, height*block_size, width*block_size, depth/(block_size*block_size)]`\n\nThis operation requires that the input tensor be of rank 4, and that\n`block_size` be >=1 and that `block_size * block_size` be a divisor of the\ninput depth.\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given this input of shape `[1, 1, 1, 4]`, and a block size of 2:\n\n```prettyprint\nx = [[[[1, 2, 3, 4]]]]\n\n```\n\nThis operation will output a tensor of shape `[1, 2, 2, 1]`:\n\n```prettyprint\n   [[[[1], [2]],\n     [[3], [4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,\nthe corresponding output will have 2x2 elements and will have a depth of\n1 channel (1 = `4 / (block_size * block_size)`).\nThe output element shape is `[2, 2, 1]`.\n\nFor an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.\n\n```prettyprint\nx = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nThis operation, for block size of 2, will return the following tensor of shape\n`[1, 2, 2, 3]`\n\n```prettyprint\n   [[[[1, 2, 3], [4, 5, 6]],\n     [[7, 8, 9], [10, 11, 12]]]]\n\n```\n\nSimilarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:\n\n```prettyprint\nx =  [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 4 4 1]`:\n\n```prettyprint\nx = [[ [1],   [2],  [5],  [6]],\n     [ [3],   [4],  [7],  [8]],\n     [ [9],  [10], [13],  [14]],\n     [ [11], [12], [15],  [16]]]\n\n```"
+  description: "Rearranges data from depth into blocks of spatial data.\nThis is the reverse transformation of SpaceToDepth. More specifically,\nthis op outputs a copy of the input tensor where values from the `depth`\ndimension are moved in spatial blocks to the `height` and `width` dimensions.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Chunks of data of size `block_size * block_size` from depth are rearranged\n    into non-overlapping blocks of size `block_size x block_size`\n  * The width the output tensor is `input_depth * block_size`, whereas the\n    height is `input_height * block_size`.\n  * The depth of the input tensor must be divisible by\n    `block_size * block_size`.\n\nThat is, assuming the input is in the shape:\n`[batch, height, width, depth]`,\nthe shape of the output will be:\n`[batch, height*block_size, width*block_size, depth/(block_size*block_size)]`\n\nThis operation requires that the input tensor be of rank 4, and that\n`block_size` be >=1 and that `block_size * block_size` be a divisor of the\ninput depth.\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given this input of shape `[1, 1, 1, 4]`, and a block size of 2:\n\n```\nx = [[[[1, 2, 3, 4]]]]\n\n```\n\nThis operation will output a tensor of shape `[1, 2, 2, 1]`:\n\n```\n   [[[[1], [2]],\n     [[3], [4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,\nthe corresponding output will have 2x2 elements and will have a depth of\n1 channel (1 = `4 / (block_size * block_size)`).\nThe output element shape is `[2, 2, 1]`.\n\nFor an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.\n\n```\nx = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nThis operation, for block size of 2, will return the following tensor of shape\n`[1, 2, 2, 3]`\n\n```\n   [[[[1, 2, 3], [4, 5, 6]],\n     [[7, 8, 9], [10, 11, 12]]]]\n\n```\n\nSimilarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:\n\n```\nx =  [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 4 4 1]`:\n\n```\nx = [[ [1],   [2],  [5],  [6]],\n     [ [3],   [4],  [7],  [8]],\n     [ [9],  [10], [13],  [14]],\n     [ [11], [12], [15],  [16]]]\n\n```"
 }
 op {
   name: "DepthwiseConv2dNative"
@@ -6134,7 +6288,7 @@ op {
     }
   }
   summary: "Returns a diagonal tensor with a given diagonal values."
-  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of\nrank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:\n\n`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.\n\nFor example:\n\n```prettyprint\n# \'diagonal\' is [1, 2, 3, 4]\ntf.diag(diagonal) ==> [[1, 0, 0, 0]\n                       [0, 2, 0, 0]\n                       [0, 0, 3, 0]\n                       [0, 0, 0, 4]]\n```"
+  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of\nrank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:\n\n`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.\n\nFor example:\n\n```\n# \'diagonal\' is [1, 2, 3, 4]\ntf.diag(diagonal) ==> [[1, 0, 0, 0]\n                       [0, 2, 0, 0]\n                       [0, 0, 3, 0]\n                       [0, 0, 0, 4]]\n```"
 }
 op {
   name: "DiagPart"
@@ -6163,7 +6317,7 @@ op {
     }
   }
   summary: "Returns the diagonal part of the tensor."
-  description: "This operation returns a tensor with the `diagonal` part\nof the `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a\ntensor of rank `k` with dimensions `[D1,..., Dk]` where:\n\n`diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.\n\nFor example:\n\n```prettyprint\n# \'input\' is [[1, 0, 0, 0]\n              [0, 2, 0, 0]\n              [0, 0, 3, 0]\n              [0, 0, 0, 4]]\n\ntf.diag_part(input) ==> [1, 2, 3, 4]\n```"
+  description: "This operation returns a tensor with the `diagonal` part\nof the `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a\ntensor of rank `k` with dimensions `[D1,..., Dk]` where:\n\n`diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.\n\nFor example:\n\n```\n# \'input\' is [[1, 0, 0, 0]\n              [0, 2, 0, 0]\n              [0, 0, 3, 0]\n              [0, 0, 0, 4]]\n\ntf.diag_part(input) ==> [1, 2, 3, 4]\n```"
 }
 op {
   name: "Digamma"
@@ -6480,7 +6634,7 @@ op {
     type: "type"
   }
   summary: "Partitions `data` into `num_partitions` tensors using indices from `partitions`."
-  description: "For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`\nbecomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`\nare placed in `outputs[i]` in lexicographic order of `js`, and the first\ndimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.\nIn detail,\n\n```python\n    outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]\n\n    outputs[i] = pack([data[js, ...] for js if partitions[js] == i])\n```\n\n`data.shape` must start with `partitions.shape`.\n\nFor example:\n\n```python\n    # Scalar partitions.\n    partitions = 1\n    num_partitions = 2\n    data = [10, 20]\n    outputs[0] = []  # Empty with shape [0, 2]\n    outputs[1] = [[10, 20]]\n\n    # Vector partitions.\n    partitions = [0, 0, 1, 1, 0]\n    num_partitions = 2\n    data = [10, 20, 30, 40, 50]\n    outputs[0] = [10, 20, 50]\n    outputs[1] = [30, 40]\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/DynamicPartition.png\" alt>\n</div>"
+  description: "For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`\nbecomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`\nare placed in `outputs[i]` in lexicographic order of `js`, and the first\ndimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.\nIn detail,\n\n```python\n    outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]\n\n    outputs[i] = pack([data[js, ...] for js if partitions[js] == i])\n```\n\n`data.shape` must start with `partitions.shape`.\n\nFor example:\n\n```python\n    # Scalar partitions.\n    partitions = 1\n    num_partitions = 2\n    data = [10, 20]\n    outputs[0] = []  # Empty with shape [0, 2]\n    outputs[1] = [[10, 20]]\n\n    # Vector partitions.\n    partitions = [0, 0, 1, 1, 0]\n    num_partitions = 2\n    data = [10, 20, 30, 40, 50]\n    outputs[0] = [10, 20, 50]\n    outputs[1] = [30, 40]\n```\n\nSee `dynamic_stitch` for an example on how to merge partitions back.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicPartition.png\" alt>\n</div>"
 }
 op {
   name: "DynamicStitch"
@@ -6509,7 +6663,7 @@ op {
     type: "type"
   }
   summary: "Interleave the values from the `data` tensors into a single tensor."
-  description: "Builds a merged tensor such that\n\n```python\n    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]\n```\n\nFor example, if each `indices[m]` is scalar or vector, we have\n\n```python\n    # Scalar indices:\n    merged[indices[m], ...] = data[m][...]\n\n    # Vector indices:\n    merged[indices[m][i], ...] = data[m][i, ...]\n```\n\nEach `data[i].shape` must start with the corresponding `indices[i].shape`,\nand the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we\nmust have `data[i].shape = indices[i].shape + constant`.  In terms of this\n`constant`, the output shape is\n\n    merged.shape = [max(indices)] + constant\n\nValues are merged in order, so if an index appears in both `indices[m][i]` and\n`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the\nmerged result.\n\nFor example:\n\n```python\n    indices[0] = 6\n    indices[1] = [4, 1]\n    indices[2] = [[5, 2], [0, 3]]\n    data[0] = [61, 62]\n    data[1] = [[41, 42], [11, 12]]\n    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]\n    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],\n              [51, 52], [61, 62]]\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/DynamicStitch.png\" alt>\n</div>"
+  description: "Builds a merged tensor such that\n\n```python\n    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]\n```\n\nFor example, if each `indices[m]` is scalar or vector, we have\n\n```python\n    # Scalar indices:\n    merged[indices[m], ...] = data[m][...]\n\n    # Vector indices:\n    merged[indices[m][i], ...] = data[m][i, ...]\n```\n\nEach `data[i].shape` must start with the corresponding `indices[i].shape`,\nand the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we\nmust have `data[i].shape = indices[i].shape + constant`.  In terms of this\n`constant`, the output shape is\n\n    merged.shape = [max(indices)] + constant\n\nValues are merged in order, so if an index appears in both `indices[m][i]` and\n`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the\nmerged result.\n\nFor example:\n\n```python\n    indices[0] = 6\n    indices[1] = [4, 1]\n    indices[2] = [[5, 2], [0, 3]]\n    data[0] = [61, 62]\n    data[1] = [[41, 42], [11, 12]]\n    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]\n    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],\n              [51, 52], [61, 62]]\n```\n\nThis method can be used to merge partitions created by `dynamic_partition`\nas illustrated on the following example:\n\n```python\n    # Apply function (increments x_i) on elements for which a certain condition\n    # apply (x_i != -1 in this example).\n    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])\n    condition_mask=tf.not_equal(x,tf.constant(-1.))\n    partitioned_data = tf.dynamic_partition(\n        x, tf.cast(condition_mask, tf.int32) , 2)\n    partitioned_data[1] = partitioned_data[1] + 1.0\n    condition_indices = tf.dynamic_partition(\n        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)\n    x = tf.dynamic_stitch(condition_indices, partitioned_data)\n    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain\n    # unchanged.\n```\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/DynamicStitch.png\" alt>\n</div>"
 }
 op {
   name: "EditDistance"
@@ -6578,15 +6732,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
@@ -6615,15 +6763,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
@@ -7013,7 +7155,7 @@ op {
     }
   }
   summary: "Inserts a dimension of 1 into a tensor\'s shape."
-  description: "Given a tensor `input`, this operation inserts a dimension of 1 at the\ndimension index `dim` of `input`\'s shape. The dimension index `dim` starts at\nzero; if you specify a negative number for `dim` it is counted backward from\nthe end.\n\nThis operation is useful if you want to add a batch dimension to a single\nelement. For example, if you have a single image of shape `[height, width,\nchannels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,\nwhich will make the shape `[1, height, width, channels]`.\n\nOther examples:\n\n```prettyprint\n# \'t\' is a tensor of shape [2]\nshape(expand_dims(t, 0)) ==> [1, 2]\nshape(expand_dims(t, 1)) ==> [2, 1]\nshape(expand_dims(t, -1)) ==> [2, 1]\n\n# \'t2\' is a tensor of shape [2, 3, 5]\nshape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]\nshape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]\nshape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]\n```\n\nThis operation requires that:\n\n`-1-input.dims() <= dim <= input.dims()`\n\nThis operation is related to `squeeze()`, which removes dimensions of\nsize 1."
+  description: "Given a tensor `input`, this operation inserts a dimension of 1 at the\ndimension index `dim` of `input`\'s shape. The dimension index `dim` starts at\nzero; if you specify a negative number for `dim` it is counted backward from\nthe end.\n\nThis operation is useful if you want to add a batch dimension to a single\nelement. For example, if you have a single image of shape `[height, width,\nchannels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,\nwhich will make the shape `[1, height, width, channels]`.\n\nOther examples:\n\n```\n# \'t\' is a tensor of shape [2]\nshape(expand_dims(t, 0)) ==> [1, 2]\nshape(expand_dims(t, 1)) ==> [2, 1]\nshape(expand_dims(t, -1)) ==> [2, 1]\n\n# \'t2\' is a tensor of shape [2, 3, 5]\nshape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]\nshape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]\nshape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]\n```\n\nThis operation requires that:\n\n`-1-input.dims() <= dim <= input.dims()`\n\nThis operation is related to `squeeze()`, which removes dimensions of\nsize 1."
 }
 op {
   name: "Expm1"
@@ -7162,11 +7304,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most\n  dimension of `input` is replaced with its 1D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fft\n@end_compatibility"
+    description: "A complex64 tensor of the same shape as `input`. The inner-most\n  dimension of `input` is replaced with its 1D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fft\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the 1-dimensional discrete Fourier Transform over the inner-most"
-  description: "dimension of `input`."
+  summary: "Fast Fourier transform."
+  description: "Computes the 1-dimensional discrete Fourier transform over the inner-most\ndimension of `input`."
 }
 op {
   name: "FFT2D"
@@ -7177,11 +7319,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n  dimensions of `input` are replaced with their 2D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fft2\n@end_compatibility"
+    description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n  dimensions of `input` are replaced with their 2D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fft2\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the 2-dimensional discrete Fourier Transform over the inner-most"
-  description: "2 dimensions of `input`."
+  summary: "2D fast Fourier transform."
+  description: "Computes the 2-dimensional discrete Fourier transform over the inner-most\n2 dimensions of `input`."
 }
 op {
   name: "FFT3D"
@@ -7192,11 +7334,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n  dimensions of `input` are replaced with their 3D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fftn with 3 dimensions.\n@end_compatibility"
+    description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n  dimensions of `input` are replaced with their 3D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.fftn with 3 dimensions.\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the 3-dimensional discrete Fourier Transform over the inner-most 3"
-  description: "dimensions of `input`."
+  summary: "3D fast Fourier transform."
+  description: "Computes the 3-dimensional discrete Fourier transform over the inner-most 3\ndimensions of `input`."
 }
 op {
   name: "FIFOQueue"
@@ -7333,8 +7475,15 @@ op {
       f: 6
     }
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
   summary: "Fake-quantize the \'inputs\' tensor, type float to \'outputs\' tensor of same type."
-  description: "Attributes [min; max] define the clamping range for the \'inputs\' data.  Op\ndivides this range into 255 steps (total of 256 values), then replaces each\n\'inputs\' value with the closest of the quantized step values.\n\nQuantization is called fake since the output is still in floating point."
+  description: "Attributes [min; max] define the clamping range for the \'inputs\' data.  Op\ndivides this range into 255 steps (total of 256 values), then replaces each\n\'inputs\' value with the closest of the quantized step values.\n\'num_bits\' is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nQuantization is called fake since the output is still in floating point."
 }
 op {
   name: "FakeQuantWithMinMaxArgsGradient"
@@ -7367,6 +7516,13 @@ op {
       f: 6
     }
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
   summary: "Compute gradients for a FakeQuantWithMinMaxArgs operation."
 }
 op {
@@ -7387,8 +7543,15 @@ op {
     name: "outputs"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
   summary: "Fake-quantize the \'inputs\' tensor of type float via global float scalars `min`"
-  description: "and `max` to \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data.  Op divides this range\ninto 255 steps (total of 256 values), then replaces each \'inputs\' value with the\nclosest of the quantized step values.\n\nThis operation has a gradient and thus allows for training `min` and `max` values."
+  description: "and `max` to \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data.  Op divides this range\ninto 255 steps (total of 256 values), then replaces each \'inputs\' value with the\nclosest of the quantized step values.\n\'num_bits\' is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nThis operation has a gradient and thus allows for training `min` and `max` values."
 }
 op {
   name: "FakeQuantWithMinMaxVarsGradient"
@@ -7425,6 +7588,14 @@ op {
     description: "Backpropagated gradients w.r.t. max parameter:\n`sum(gradients * (inputs > max))`."
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+    description: "The bitwidth of the quantization; between 2 and 8, inclusive."
+  }
   summary: "Compute gradients for a FakeQuantWithMinMaxVars operation."
 }
 op {
@@ -7445,8 +7616,15 @@ op {
     name: "outputs"
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
   summary: "Fake-quantize the \'inputs\' tensor of type float and one of the shapes: `[d]`,"
-  description: "`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`\nto \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data in the corresponding\ndepth channel.  Op divides this range into 255 steps (total of 256 values), then\nreplaces each \'inputs\' value with the closest of the quantized step values.\n\nThis operation has a gradient and thus allows for training `min` and `max` values."
+  description: "`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`\nto \'outputs\' tensor of same shape as `inputs`.\n\n[min; max] is the clamping range for the \'inputs\' data in the corresponding\ndepth channel.  Op divides this range into 255 steps (total of 256 values), then\nreplaces each \'inputs\' value with the closest of the quantized step values.\n\'num_bits\' is the bitwidth of the quantization; between 2 and 8, inclusive.\n\nThis operation has a gradient and thus allows for training `min` and `max` values."
 }
 op {
   name: "FakeQuantWithMinMaxVarsPerChannelGradient"
@@ -7483,6 +7661,14 @@ op {
     description: "Backpropagated gradients w.r.t. max parameter, shape `[d]`:\n`sum_per_d(gradients * (inputs > max))`."
     type: DT_FLOAT
   }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+    description: "The bitwidth of the quantization; between 2 and 8, inclusive."
+  }
   summary: "Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation."
 }
 op {
@@ -7520,7 +7706,77 @@ op {
     type: "type"
   }
   summary: "Creates a tensor filled with a scalar value."
-  description: "This operation creates a tensor of shape `dims` and fills it with `value`.\n\nFor example:\n\n```prettyprint\n# Output tensor has shape [2, 3].\nfill([2, 3], 9) ==> [[9, 9, 9]\n                     [9, 9, 9]]\n```"
+  description: "This operation creates a tensor of shape `dims` and fills it with `value`.\n\nFor example:\n\n```\n# Output tensor has shape [2, 3].\nfill([2, 3], 9) ==> [[9, 9, 9]\n                     [9, 9, 9]]\n```"
+}
+op {
+  name: "FilterDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    description: "A list of tensors, typically values that were captured when\nbuilding a closure for `predicate`."
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+    description: "A function returning a scalar boolean."
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset containing elements of `input_dataset` matching `predicate`."
+  description: "The `predicate` function must return a scalar boolean and accept the\nfollowing arguments:\n\n* One tensor for each component of an element of `input_dataset`.\n* One tensor for each value in `other_arguments`."
+  is_stateful: true
+}
+op {
+  name: "FixedLengthRecordDataset"
+  input_arg {
+    name: "filenames"
+    description: "A scalar or a vector containing the name(s) of the file(s) to be\nread."
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    description: "A scalar representing the number of bytes to skip at the\nbeginning of a file."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    description: "A scalar representing the number of bytes in each record."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    description: "A scalar representing the number of bytes to skip at the end\nof a file."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  summary: "Creates a dataset that emits the records from one or more binary files."
+  is_stateful: true
 }
 op {
   name: "FixedLengthRecordReader"
@@ -7536,10 +7792,12 @@ op {
     default_value {
       i: 0
     }
+    description: "Number of bytes in the header, defaults to 0."
   }
   attr {
     name: "record_bytes"
     type: "int"
+    description: "Number of bytes in the record."
   }
   attr {
     name: "footer_bytes"
@@ -7547,6 +7805,15 @@ op {
     default_value {
       i: 0
     }
+    description: "Number of bytes in the footer, defaults to 0."
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    description: "Number of bytes to hop before each read. Default of 0 means using\nrecord_bytes."
   }
   attr {
     name: "container"
@@ -7580,10 +7847,12 @@ op {
     default_value {
       i: 0
     }
+    description: "Number of bytes in the header, defaults to 0."
   }
   attr {
     name: "record_bytes"
     type: "int"
+    description: "Number of bytes in the record."
   }
   attr {
     name: "footer_bytes"
@@ -7591,6 +7860,15 @@ op {
     default_value {
       i: 0
     }
+    description: "Number of bytes in the footer, defaults to 0."
+  }
+  attr {
+    name: "hop_bytes"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    description: "Number of bytes to hop before each read. Default of 0 means using\nrecord_bytes."
   }
   attr {
     name: "container"
@@ -7643,7 +7921,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -7729,6 +8007,47 @@ op {
   }
   summary: "Generates labels for candidate sampling with a learned unigram distribution."
   description: "A unigram sampler could use a fixed unigram distribution read from a\nfile or passed in as an in-memory array instead of building up the distribution\nfrom data on the fly. There is also an option to skew the distribution by\napplying a distortion power to the weights.\n\nThe vocabulary file should be in CSV-like format, with the last field\nbeing the weight associated with the word.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
+}
+op {
+  name: "FlatMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "f"
+    type: "func"
+    description: "A function mapping elements of `input_dataset`, concatenated with\n`other_arguments`, to a Dataset resource that contains elements matching\n`output_types` and `output_shapes`."
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  description: "Unlike MapDataset, the `f` in FlatMapDataset is expected to return a\nDataset resource, and FlatMapDataset will flatten successive results\ninto a single Dataset."
+  is_stateful: true
 }
 op {
   name: "Floor"
@@ -8149,19 +8468,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -8251,19 +8557,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -8320,9 +8613,7 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -8386,9 +8677,7 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -8465,23 +8754,23 @@ op {
     }
   }
   summary: "Gather slices from `params` according to `indices`."
-  description: "`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).\nProduces an output tensor with shape `indices.shape + params.shape[1:]` where:\n\n```python\n    # Scalar indices\n    output[:, ..., :] = params[indices, :, ... :]\n\n    # Vector indices\n    output[i, :, ..., :] = params[indices[i], :, ... :]\n\n    # Higher rank indices\n    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]\n```\n\nIf `indices` is a permutation and `len(indices) == params.shape[0]` then\nthis operation will permute `params` accordingly.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/Gather.png\" alt>\n</div>"
+  description: "`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).\nProduces an output tensor with shape `indices.shape + params.shape[1:]` where:\n\n```python\n    # Scalar indices\n    output[:, ..., :] = params[indices, :, ... :]\n\n    # Vector indices\n    output[i, :, ..., :] = params[indices[i], :, ... :]\n\n    # Higher rank indices\n    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]\n```\n\nIf `indices` is a permutation and `len(indices) == params.shape[0]` then\nthis operation will permute `params` accordingly.\n\n`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in\n`indices` are always validated to be within range. If assigned to GPU,\nout-of-bound indices result in safe but unspecified behavior, which may include\nraising an error.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/Gather.png\" alt>\n</div>"
 }
 op {
   name: "GatherNd"
   input_arg {
     name: "params"
-    description: "`P-D`.  The tensor from which to gather values."
+    description: "The tensor from which to gather values."
     type_attr: "Tparams"
   }
   input_arg {
     name: "indices"
-    description: "`Q-D`.  Index tensor having shape `[d_0, ..., d_{Q-2}, K]`."
+    description: "Index tensor."
     type_attr: "Tindices"
   }
   output_arg {
     name: "output"
-    description: "`(P+Q-K-1)-D`.  Values from `params` gathered from indices given by\n`indices`."
+    description: "Values from `params` gathered from indices given by `indices`, with\nshape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`."
     type_attr: "Tparams"
   }
   attr {
@@ -8499,26 +8788,25 @@ op {
     }
   }
   summary: "Gather values or slices from `params` according to `indices`."
-  description: "`params` is a Tensor of rank `P` and `indices` is a Tensor of rank `Q`.\n\n`indices` must be integer tensor, containing indices into `params`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `params`.\n\nProduces an output tensor with shape\n\n```\n[d_0, ..., d_{Q-2}, params.shape[K], ..., params.shape[P-1]].\n```\n\nSome examples below.\n\nSimple indexing into a matrix:\n\n```python\n    indices = [[0, 0], [1, 1]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [\'a\', \'d\']\n```\n\nSlice indexing into a matrix:\n\n```python\n    indices = [[1], [0]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'c\', \'d\'], [\'a\', \'b\']]\n```\n\nIndexing into a 3-tensor:\n\n```python\n    indices = [[1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[0, 1], [1, 0]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'c0\', \'d0\'], [\'a1\', \'b1\']]\n\n\n    indices = [[0, 0, 1], [1, 0, 1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [\'b0\', \'b1\']\n```\n\nBatched indexing into a matrix:\n\n```python\n    indices = [[[0, 0]], [[0, 1]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'a\'], [\'b\']]\n```\n\nBatched slice indexing into a matrix:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[[\'c\', \'d\']], [[\'a\', \'b\']]]\n```\n\nBatched indexing into a 3-tensor:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[[\'a1\', \'b1\'], [\'c1\', \'d1\']]],\n              [[[\'a0\', \'b0\'], [\'c0\', \'d0\']]]]\n\n    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'c0\', \'d0\'], [\'a1\', \'b1\']],\n              [[\'a0\', \'b0\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'b0\', \'b1\'], [\'d0\', \'c1\']]\n```"
+  description: "`indices` is an integer tensor containing indices into `params`.  The last\ndimension of `indices` can be at most the rank of `params`:\n\n    indices.shape[-1] <= params.rank\n\nThe last dimension of `indices` corresponds to elements\n(if `indices.shape[-1] = params.rank`) or slices\n(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`\nof `params`.  The output tensor has shape\n\n    indices.shape[:-1] + params.shape[indices.shape[-1]:]\n\nSome examples below.\n\nSimple indexing into a matrix:\n\n```python\n    indices = [[0, 0], [1, 1]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [\'a\', \'d\']\n```\n\nSlice indexing into a matrix:\n\n```python\n    indices = [[1], [0]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'c\', \'d\'], [\'a\', \'b\']]\n```\n\nIndexing into a 3-tensor:\n\n```python\n    indices = [[1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[0, 1], [1, 0]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'c0\', \'d0\'], [\'a1\', \'b1\']]\n\n\n    indices = [[0, 0, 1], [1, 0, 1]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [\'b0\', \'b1\']\n```\n\nBatched indexing into a matrix:\n\n```python\n    indices = [[[0, 0]], [[0, 1]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[\'a\'], [\'b\']]\n```\n\nBatched slice indexing into a matrix:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[\'a\', \'b\'], [\'c\', \'d\']]\n    output = [[[\'c\', \'d\']], [[\'a\', \'b\']]]\n```\n\nBatched indexing into a 3-tensor:\n\n```python\n    indices = [[[1]], [[0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[[\'a1\', \'b1\'], [\'c1\', \'d1\']]],\n              [[[\'a0\', \'b0\'], [\'c0\', \'d0\']]]]\n\n    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[[\'c0\', \'d0\'], [\'a1\', \'b1\']],\n              [[\'a0\', \'b0\'], [\'c1\', \'d1\']]]\n\n\n    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]\n    params = [[[\'a0\', \'b0\'], [\'c0\', \'d0\']],\n              [[\'a1\', \'b1\'], [\'c1\', \'d1\']]]\n    output = [[\'b0\', \'b1\'], [\'d0\', \'c1\']]\n```"
 }
 op {
   name: "GetSessionHandle"
   input_arg {
     name: "value"
+    description: "The tensor to be stored."
     type_attr: "T"
   }
   output_arg {
     name: "handle"
+    description: "The handle for the tensor stored in the session state, represented\nas a string."
     type: DT_STRING
   }
   attr {
     name: "T"
     type: "type"
   }
-  deprecation {
-    version: 23
-    explanation: "Use GetSessionHandleV2"
-  }
+  summary: "Store the input tensor in the state of the current session."
 }
 op {
   name: "GetSessionHandleV2"
@@ -8627,40 +8915,142 @@ op {
   description: "*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
-  name: "HSVToRGB"
+  name: "GroupByWindowDataset"
   input_arg {
-    name: "images"
-    description: "1-D or higher rank. HSV data to convert. Last dimension must be size 3."
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "output"
-    description: "`images` converted to RGB."
-    type_attr: "T"
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
   }
-  summary: "Convert one or more images from HSV to RGB."
-  description: "Outputs a tensor of the same shape as the `images` tensor, containing the RGB\nvalue of the pixels. The output is only well defined if the value in `images`\nare in `[0,1]`.\n\nSee `rgb_to_hsv` for a description of the HSV encoding."
-}
-op {
-  name: "HashTable"
   output_arg {
-    name: "table_handle"
-    description: "Handle to a table."
-    type: DT_STRING
-    is_ref: true
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+    description: "A function mapping an element of `input_dataset`, concatenated\nwith `key_func_other_arguments` to a scalar value of type DT_INT64."
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that computes a windowed group-by on `input_dataset`."
+  description: "// TODO(mrry): Support non-int64 keys."
+  is_stateful: true
+}
+op {
+  name: "HSVToRGB"
+  input_arg {
+    name: "images"
+    description: "1-D or higher rank. HSV data to convert. Last dimension must be size 3."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "`images` converted to RGB."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  summary: "Convert one or more images from HSV to RGB."
+  description: "Outputs a tensor of the same shape as the `images` tensor, containing the RGB\nvalue of the pixels. The output is only well defined if the value in `images`\nare in `[0,1]`.\n\nSee `rgb_to_hsv` for a description of the HSV encoding."
+}
+op {
+  name: "HashTable"
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If true and shared_name is empty, the table is shared\nusing the node name."
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+    description: "Type of the table keys."
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+    description: "Type of the table values."
+  }
+  summary: "Creates a non-initialized hash table."
+  description: "This op creates a hash table, specifying the type of its keys and values.\nBefore using the table you will have to initialize it.  After initialization the\ntable will be immutable."
+  is_stateful: true
+}
+op {
+  name: "HashTableV2"
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -8749,11 +9139,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most\n  dimension of `input` is replaced with its inverse 1D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifft\n@end_compatibility"
+    description: "A complex64 tensor of the same shape as `input`. The inner-most\n  dimension of `input` is replaced with its inverse 1D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifft\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the inverse 1-dimensional discrete Fourier Transform over the inner-most"
-  description: "dimension of `input`."
+  summary: "Inverse fast Fourier transform."
+  description: "Computes the inverse 1-dimensional discrete Fourier transform over the\ninner-most dimension of `input`."
 }
 op {
   name: "IFFT2D"
@@ -8764,11 +9154,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n  dimensions of `input` are replaced with their inverse 2D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifft2\n@end_compatibility"
+    description: "A complex64 tensor of the same shape as `input`. The inner-most 2\n  dimensions of `input` are replaced with their inverse 2D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifft2\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the inverse 2-dimensional discrete Fourier Transform over the inner-most"
-  description: "2 dimensions of `input`."
+  summary: "Inverse 2D fast Fourier transform."
+  description: "Computes the inverse 2-dimensional discrete Fourier transform over the\ninner-most 2 dimensions of `input`."
 }
 op {
   name: "IFFT3D"
@@ -8779,11 +9169,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n  dimensions of `input` are replaced with their inverse 3D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifftn with 3 dimensions.\n@end_compatibility"
+    description: "A complex64 tensor of the same shape as `input`. The inner-most 3\n  dimensions of `input` are replaced with their inverse 3D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.ifftn with 3 dimensions.\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the inverse 3-dimensional discrete Fourier Transform over the inner-most"
-  description: "3 dimensions of `input`."
+  summary: "Inverse 3D fast Fourier transform."
+  description: "Computes the inverse 3-dimensional discrete Fourier transform over the\ninner-most 3 dimensions of `input`."
 }
 op {
   name: "IRFFT"
@@ -8799,11 +9189,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A float32 tensor of the same rank as `input`. The inner-most\n  dimension of `input` is replaced with the `fft_length` samples of its inverse\n  1D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.irfft\n@end_compatibility"
+    description: "A float32 tensor of the same rank as `input`. The inner-most\n  dimension of `input` is replaced with the `fft_length` samples of its inverse\n  1D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.irfft\n@end_compatibility"
     type: DT_FLOAT
   }
-  summary: "Compute the inverse 1-dimensional discrete Fourier Transform of a real-valued"
-  description: "signal over the inner-most dimension of `input`.\n\nThe inner-most dimension of `input` is assumed to be the result of `RFFT`: the\n`fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If\n`fft_length` is not provided, it is computed from the size of the inner-most\ndimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to\ncompute `input` is odd, it should be provided since it cannot be inferred\nproperly."
+  summary: "Inverse real-valued fast Fourier transform."
+  description: "Computes the inverse 1-dimensional discrete Fourier transform of a real-valued\nsignal over the inner-most dimension of `input`.\n\nThe inner-most dimension of `input` is assumed to be the result of `RFFT`: the\n`fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If\n`fft_length` is not provided, it is computed from the size of the inner-most\ndimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to\ncompute `input` is odd, it should be provided since it cannot be inferred\nproperly."
 }
 op {
   name: "IRFFT2D"
@@ -8819,11 +9209,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A float32 tensor of the same rank as `input`. The inner-most 2\n  dimensions of `input` are replaced with the `fft_length` samples of their\n  inverse 2D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.irfft2\n@end_compatibility"
+    description: "A float32 tensor of the same rank as `input`. The inner-most 2\n  dimensions of `input` are replaced with the `fft_length` samples of their\n  inverse 2D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.irfft2\n@end_compatibility"
     type: DT_FLOAT
   }
-  summary: "Compute the inverse 2-dimensional discrete Fourier Transform of a real-valued"
-  description: "signal over the inner-most 2 dimensions of `input`.\n\nThe inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:\nThe inner-most dimension contains the `fft_length / 2 + 1` unique components of\nthe DFT of a real-valued signal. If `fft_length` is not provided, it is computed\nfrom the size of the inner-most 2 dimensions of `input`. If the FFT length used\nto compute `input` is odd, it should be provided since it cannot be inferred\nproperly."
+  summary: "Inverse 2D real-valued fast Fourier transform."
+  description: "Computes the inverse 2-dimensional discrete Fourier transform of a real-valued\nsignal over the inner-most 2 dimensions of `input`.\n\nThe inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:\nThe inner-most dimension contains the `fft_length / 2 + 1` unique components of\nthe DFT of a real-valued signal. If `fft_length` is not provided, it is computed\nfrom the size of the inner-most 2 dimensions of `input`. If the FFT length used\nto compute `input` is odd, it should be provided since it cannot be inferred\nproperly."
 }
 op {
   name: "IRFFT3D"
@@ -8839,11 +9229,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A float32 tensor of the same rank as `input`. The inner-most 3\n  dimensions of `input` are replaced with the `fft_length` samples of their\n  inverse 3D real Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.irfftn with 3 dimensions.\n@end_compatibility"
+    description: "A float32 tensor of the same rank as `input`. The inner-most 3\n  dimensions of `input` are replaced with the `fft_length` samples of their\n  inverse 3D real Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.irfftn with 3 dimensions.\n@end_compatibility"
     type: DT_FLOAT
   }
-  summary: "Compute the inverse 3-dimensional discrete Fourier Transform of a real-valued"
-  description: "signal over the inner-most 3 dimensions of `input`.\n\nThe inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:\nThe inner-most dimension contains the `fft_length / 2 + 1` unique components of\nthe DFT of a real-valued signal. If `fft_length` is not provided, it is computed\nfrom the size of the inner-most 3 dimensions of `input`. If the FFT length used\nto compute `input` is odd, it should be provided since it cannot be inferred\nproperly."
+  summary: "Inverse 3D real-valued fast Fourier transform."
+  description: "Computes the inverse 3-dimensional discrete Fourier transform of a real-valued\nsignal over the inner-most 3 dimensions of `input`.\n\nThe inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:\nThe inner-most dimension contains the `fft_length / 2 + 1` unique components of\nthe DFT of a real-valued signal. If `fft_length` is not provided, it is computed\nfrom the size of the inner-most 3 dimensions of `input`. If the FFT length used\nto compute `input` is odd, it should be provided since it cannot be inferred\nproperly."
 }
 op {
   name: "Identity"
@@ -8941,7 +9331,7 @@ op {
     }
   }
   summary: "Compute the lower regularized incomplete Gamma function `Q(a, x)`."
-  description: "The lower regularized incomplete Gamma function is defined as:\n\n```\nP(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\n```\nwhere\n```\ngamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\n```\nis the lower incomplete Gamma function.\n\nNote, above `Q(a, x)` (`Igammac`) is the upper regularized complete\nGamma function."
+  description: "The lower regularized incomplete Gamma function is defined as:\n\n\n\\\\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\\\)\n\nwhere\n\n\\\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\\\)\n\nis the lower incomplete Gamma function.\n\nNote, above `Q(a, x)` (`Igammac`) is the upper regularized complete\nGamma function."
 }
 op {
   name: "Igammac"
@@ -8968,7 +9358,7 @@ op {
     }
   }
   summary: "Compute the upper regularized incomplete Gamma function `Q(a, x)`."
-  description: "The upper regularized incomplete Gamma function is defined as:\n\n```\nQ(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\n```\nwhere\n```\nGamma(a, x) = int_{x}^{\\infty} t^{a-1} exp(-t) dt\n```\nis the upper incomplete Gama function.\n\nNote, above `P(a, x)` (`Igamma`) is the lower regularized complete\nGamma function."
+  description: "The upper regularized incomplete Gamma function is defined as:\n\n\\\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\\\)\n\nwhere\n\n\\\\(Gamma(a, x) = int_{x}^{\\infty} t^{a-1} exp(-t) dt\\\\)\n\nis the upper incomplete Gama function.\n\nNote, above `P(a, x)` (`Igamma`) is the lower regularized complete\nGamma function."
 }
 op {
   name: "Imag"
@@ -9210,6 +9600,82 @@ op {
   summary: "Initializes a table from a text file."
   description: "It inserts one key-value pair into the table for each line of the file.\nThe key and value is extracted from the whole line content, elements from the\nsplit line based on `delimiter` or the line number (starting from zero).\nWhere to extract the key and value from a line is specified by `key_index` and\n`value_index`.\n\n- A value of -1 means use the line number(starting from zero), expects `int64`.\n- A value of -2 means use the whole line content, expects `string`.\n- A value >= 0 means use the index (starting at zero) of the split line based\n  on `delimiter`."
 }
+op {
+  name: "InitializeTableFromTextFileV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to a table which will be initialized."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "filename"
+    description: "Filename of a vocabulary text file."
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    description: "Column index in a line to get the table `key` values from."
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    description: "Column index that represents information of a line to get the table\n`value` values from."
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    description: "Number of elements of the file, use -1 if unknown."
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+    description: "Delimiter to separate fields in a line."
+  }
+  summary: "Initializes a table from a text file."
+  description: "It inserts one key-value pair into the table for each line of the file.\nThe key and value is extracted from the whole line content, elements from the\nsplit line based on `delimiter` or the line number (starting from zero).\nWhere to extract the key and value from a line is specified by `key_index` and\n`value_index`.\n\n- A value of -1 means use the line number(starting from zero), expects `int64`.\n- A value of -2 means use the whole line content, expects `string`.\n- A value >= 0 means use the index (starting at zero) of the split line based\n  on `delimiter`."
+  is_stateful: true
+}
+op {
+  name: "InitializeTableV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to a table which will be initialized."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    description: "Keys of type Tkey."
+    type_attr: "Tkey"
+  }
+  input_arg {
+    name: "values"
+    description: "Values of type Tval."
+    type_attr: "Tval"
+  }
+  attr {
+    name: "Tkey"
+    type: "type"
+  }
+  attr {
+    name: "Tval"
+    type: "type"
+  }
+  summary: "Table initializer that takes two tensors for keys and values respectively."
+  is_stateful: true
+}
 op {
   name: "Inv"
   input_arg {
@@ -9302,7 +9768,7 @@ op {
     }
   }
   summary: "Computes the inverse permutation of a tensor."
-  description: "This operation computes the inverse of an index permutation. It takes a 1-D\ninteger tensor `x`, which represents the indices of a zero-based array, and\nswaps each value with its index position. In other words, for an output tensor\n`y` and an input tensor `x`, this operation computes the following:\n\n`y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`\n\nThe values must include 0. There can be no duplicate values or negative values.\n\nFor example:\n\n```prettyprint\n# tensor `x` is [3, 4, 0, 2, 1]\ninvert_permutation(x) ==> [2, 4, 3, 0, 1]\n```"
+  description: "This operation computes the inverse of an index permutation. It takes a 1-D\ninteger tensor `x`, which represents the indices of a zero-based array, and\nswaps each value with its index position. In other words, for an output tensor\n`y` and an input tensor `x`, this operation computes the following:\n\n`y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`\n\nThe values must include 0. There can be no duplicate values or negative values.\n\nFor example:\n\n```\n# tensor `x` is [3, 4, 0, 2, 1]\ninvert_permutation(x) ==> [2, 4, 3, 0, 1]\n```"
 }
 op {
   name: "IsFinite"
@@ -9397,6 +9863,70 @@ op {
   description: "Outputs boolean scalar indicating whether the tensor has been initialized."
   allows_uninitialized_input: true
 }
+op {
+  name: "Iterator"
+  output_arg {
+    name: "handle"
+    description: "A handle to the iterator that can be passed to a \"MakeIterator\"\nor \"IteratorGetNext\" op."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "A container for an iterator resource."
+  is_stateful: true
+}
+op {
+  name: "IteratorDispose"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  summary: "Releases any resources used by the given iterator."
+  is_stateful: true
+}
+op {
+  name: "IteratorGetNext"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Gets the next output from the given iterator."
+  is_stateful: true
+}
 op {
   name: "L2Loss"
   input_arg {
@@ -9414,20 +9944,9 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -9594,7 +10113,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -9628,6 +10147,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a learned unigram distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Less"
@@ -9808,7 +10328,7 @@ op {
     }
   }
   summary: "Computes the difference between two lists of numbers or strings."
-  description: "Given a list `x` and a list `y`, this operation returns a list `out` that\nrepresents all values that are in `x` but not in `y`. The returned list `out`\nis sorted in the same order that the numbers appear in `x` (duplicates are\npreserved). This operation also returns a list `idx` that represents the\nposition of each `out` element in `x`. In other words:\n\n`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`\n\nFor example, given this input:\n\n```prettyprint\nx = [1, 2, 3, 4, 5, 6]\ny = [1, 3, 5]\n```\n\nThis operation would return:\n\n```prettyprint\nout ==> [2, 4, 6]\nidx ==> [1, 3, 5]\n```"
+  description: "Given a list `x` and a list `y`, this operation returns a list `out` that\nrepresents all values that are in `x` but not in `y`. The returned list `out`\nis sorted in the same order that the numbers appear in `x` (duplicates are\npreserved). This operation also returns a list `idx` that represents the\nposition of each `out` element in `x`. In other words:\n\n`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`\n\nFor example, given this input:\n\n```\nx = [1, 2, 3, 4, 5, 6]\ny = [1, 3, 5]\n```\n\nThis operation would return:\n\n```\nout ==> [2, 4, 6]\nidx ==> [1, 3, 5]\n```"
 }
 op {
   name: "Log"
@@ -9920,7 +10440,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -9954,6 +10474,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a log-uniform distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "LogicalAnd"
@@ -10032,19 +10553,79 @@ op {
   summary: "Outputs all keys and values in the table."
 }
 op {
-  name: "LookupTableFind"
+  name: "LookupTableExportV2"
   input_arg {
     name: "table_handle"
     description: "Handle to the table."
-    type: DT_STRING
-    is_ref: true
+    type: DT_RESOURCE
   }
-  input_arg {
+  output_arg {
     name: "keys"
-    description: "Any shape.  Keys to look up."
-    type_attr: "Tin"
+    description: "Vector of all keys present in the table."
+    type_attr: "Tkeys"
   }
-  input_arg {
+  output_arg {
+    name: "values"
+    description: "Tensor of all values in the table. Indexed in parallel with `keys`."
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  summary: "Outputs all keys and values in the table."
+  is_stateful: true
+}
+op {
+  name: "LookupTableFind"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    description: "Any shape.  Keys to look up."
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
+  }
+  output_arg {
+    name: "values"
+    description: "Same shape as `keys`.  Values found in the table, or `default_values`\nfor missing keys."
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  summary: "Looks up keys in a table, outputs the corresponding values."
+  description: "The tensor `keys` must of the same type as the keys of the table.\nThe output `values` is of the type of the table values.\n\nThe scalar `default_value` is the value output for keys not present in the\ntable. It must also be of the same type as the table values."
+}
+op {
+  name: "LookupTableFindV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    description: "Any shape.  Keys to look up."
+    type_attr: "Tin"
+  }
+  input_arg {
     name: "default_value"
     type_attr: "Tout"
   }
@@ -10063,6 +10644,7 @@ op {
   }
   summary: "Looks up keys in a table, outputs the corresponding values."
   description: "The tensor `keys` must of the same type as the keys of the table.\nThe output `values` is of the type of the table values.\n\nThe scalar `default_value` is the value output for keys not present in the\ntable. It must also be of the same type as the table values."
+  is_stateful: true
 }
 op {
   name: "LookupTableImport"
@@ -10093,6 +10675,35 @@ op {
   summary: "Replaces the contents of the table with the specified keys and values."
   description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
 }
+op {
+  name: "LookupTableImportV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    description: "Any shape.  Keys to look up."
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    description: "Values to associate with keys."
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  summary: "Replaces the contents of the table with the specified keys and values."
+  description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
+  is_stateful: true
+}
 op {
   name: "LookupTableInsert"
   input_arg {
@@ -10122,6 +10733,35 @@ op {
   summary: "Updates the table to associates keys with values."
   description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
 }
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    description: "Any shape.  Keys to look up."
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    description: "Values to associate with keys."
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  summary: "Updates the table to associates keys with values."
+  description: "The tensor `keys` must be of the same type as the keys of the table.\nThe tensor `values` must be of the type of the table values."
+  is_stateful: true
+}
 op {
   name: "LookupTableSize"
   input_arg {
@@ -10137,6 +10777,21 @@ op {
   }
   summary: "Computes the number of elements in the given table."
 }
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    description: "Handle to the table."
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    description: "Scalar that contains number of elements in the table."
+    type: DT_INT64
+  }
+  summary: "Computes the number of elements in the given table."
+  is_stateful: true
+}
 op {
   name: "LoopCond"
   input_arg {
@@ -10152,6 +10807,58 @@ op {
   summary: "Forwards the input to the output."
   description: "This operator represents the loop termination condition used by the\n\"pivot\" switches of a loop."
 }
+op {
+  name: "MakeIterator"
+  input_arg {
+    name: "dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  summary: "Makes a new iterator from the given `dataset` and stores it in `iterator`."
+  description: "This operation may be executed multiple times. Each execution will reset the\niterator in `iterator` to the first element of `dataset`."
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  is_stateful: true
+}
 op {
   name: "MatMul"
   input_arg {
@@ -10241,7 +10948,7 @@ op {
     type: "type"
   }
   summary: "Copy a tensor setting everything outside a central band in each innermost matrix"
-  description: "to zero.\n\nThe `band` part is computed as follows:\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor with the same shape where\n\n`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.\n\nThe indicator function\n\n`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&\n                 (num_upper < 0 || (n-m) <= num_upper)`.\n\nFor example:\n\n```prettyprint\n# if \'input\' is [[ 0,  1,  2, 3]\n                 [-1,  0,  1, 2]\n                 [-2, -1,  0, 1]\n                 [-3, -2, -1, 0]],\n\ntf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]\n                                       [-1,  0,  1, 2]\n                                       [ 0, -1,  0, 1]\n                                       [ 0,  0, -1, 0]],\n\ntf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]\n                                      [-1,  0,  1, 0]\n                                      [-2, -1,  0, 1]\n                                      [ 0, -2, -1, 0]]\n```\n\nUseful special cases:\n\n```prettyprint\n tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.\n tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.\n tf.matrix_band_part(input, 0, 0) ==> Diagonal.\n```"
+  description: "to zero.\n\nThe `band` part is computed as follows:\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor with the same shape where\n\n`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.\n\nThe indicator function\n\n`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&\n                 (num_upper < 0 || (n-m) <= num_upper)`.\n\nFor example:\n\n```\n# if \'input\' is [[ 0,  1,  2, 3]\n                 [-1,  0,  1, 2]\n                 [-2, -1,  0, 1]\n                 [-3, -2, -1, 0]],\n\ntf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]\n                                       [-1,  0,  1, 2]\n                                       [ 0, -1,  0, 1]\n                                       [ 0,  0, -1, 0]],\n\ntf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]\n                                      [-1,  0,  1, 0]\n                                      [-2, -1,  0, 1]\n                                      [ 0, -2, -1, 0]]\n```\n\nUseful special cases:\n\n```\n tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.\n tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.\n tf.matrix_band_part(input, 0, 0) ==> Diagonal.\n```"
 }
 op {
   name: "MatrixDeterminant"
@@ -10285,7 +10992,7 @@ op {
     type: "type"
   }
   summary: "Returns a batched diagonal tensor with a given batched diagonal values."
-  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a\ntensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:\n\n`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.\n\nFor example:\n\n```prettyprint\n# \'diagonal\' is [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nand diagonal.shape = (2, 4)\n\ntf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]\n                                     [0, 2, 0, 0]\n                                     [0, 0, 3, 0]\n                                     [0, 0, 0, 4]],\n                                    [[5, 0, 0, 0]\n                                     [0, 6, 0, 0]\n                                     [0, 0, 7, 0]\n                                     [0, 0, 0, 8]]]\n\nwhich has shape (2, 4, 4)\n```"
+  description: "Given a `diagonal`, this operation returns a tensor with the `diagonal` and\neverything else padded with zeros. The diagonal is computed as follows:\n\nAssume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a\ntensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:\n\n`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.\n\nFor example:\n\n```\n# \'diagonal\' is [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nand diagonal.shape = (2, 4)\n\ntf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]\n                                     [0, 2, 0, 0]\n                                     [0, 0, 3, 0]\n                                     [0, 0, 0, 4]],\n                                    [[5, 0, 0, 0]\n                                     [0, 6, 0, 0]\n                                     [0, 0, 7, 0]\n                                     [0, 0, 0, 8]]]\n\nwhich has shape (2, 4, 4)\n```"
 }
 op {
   name: "MatrixDiagPart"
@@ -10304,7 +11011,7 @@ op {
     type: "type"
   }
   summary: "Returns the batched diagonal part of a batched tensor."
-  description: "This operation returns a tensor with the `diagonal` part\nof the batched `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:\n\n`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.\n\nThe input must be at least a matrix.\n\nFor example:\n\n```prettyprint\n# \'input\' is [[[1, 0, 0, 0]\n               [0, 2, 0, 0]\n               [0, 0, 3, 0]\n               [0, 0, 0, 4]],\n              [[5, 0, 0, 0]\n               [0, 6, 0, 0]\n               [0, 0, 7, 0]\n               [0, 0, 0, 8]]]\n\nand input.shape = (2, 4, 4)\n\ntf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nwhich has shape (2, 4)\n```"
+  description: "This operation returns a tensor with the `diagonal` part\nof the batched `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:\n\n`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.\n\nThe input must be at least a matrix.\n\nFor example:\n\n```\n# \'input\' is [[[1, 0, 0, 0]\n               [0, 2, 0, 0]\n               [0, 0, 3, 0]\n               [0, 0, 0, 4]],\n              [[5, 0, 0, 0]\n               [0, 6, 0, 0]\n               [0, 0, 7, 0]\n               [0, 0, 0, 8]]]\n\nand input.shape = (2, 4, 4)\n\ntf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nwhich has shape (2, 4)\n```"
 }
 op {
   name: "MatrixInverse"
@@ -10574,6 +11281,13 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
       }
     }
@@ -10676,19 +11390,6 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -10699,12 +11400,12 @@ op {
   input_arg {
     name: "orig_input"
     description: "The original input tensor."
-    type: DT_FLOAT
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
     description: "The original output tensor."
-    type: DT_FLOAT
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
@@ -10757,29 +11458,31 @@ op {
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
   summary: "Computes gradients of max pooling function."
 }
 op {
-  name: "MaxPoolGrad"
+  name: "MaxPool3DGradGrad"
   input_arg {
     name: "orig_input"
     description: "The original input tensor."
@@ -10792,27 +11495,27 @@ op {
   }
   input_arg {
     name: "grad"
-    description: "4-D.  Gradients w.r.t. the output of `max_pool`."
+    description: "Output backprop of shape `[batch, depth, rows, cols, channels]`."
     type_attr: "T"
   }
   output_arg {
     name: "output"
-    description: "Gradients w.r.t. the input to `max_pool`."
+    description: "Gradients of gradients w.r.t. the input to `max_pool`."
     type_attr: "T"
   }
   attr {
     name: "ksize"
     type: "list(int)"
-    description: "The size of the window for each dimension of the input tensor."
+    description: "1-D tensor of length 5. The size of the window for each dimension of\nthe input tensor. Must have `ksize[0] = ksize[4] = 1`."
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
-    description: "The stride of the sliding window for each dimension of the\ninput tensor."
+    description: "1-D tensor of length 5. The stride of the sliding window for each\ndimension of `input`. Must have `strides[0] = strides[4] = 1`."
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -10829,51 +11532,210 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
-    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
+    description: "The data format of the input and output data. With the\ndefault format \"NDHWC\", the data is stored in the order of:\n    [batch, in_depth, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCDHW\", the data storage order is:\n    [batch, in_channels, in_depth, in_height, in_width]."
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_HALF
       }
     }
   }
-  summary: "Computes gradients of the maxpooling function."
+  summary: "Computes second-order gradients of the maxpooling function."
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGrad"
   input_arg {
-    name: "input"
-    description: "The original input."
+    name: "orig_input"
+    description: "The original input tensor."
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    description: "4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the\noutput of `max_pool`."
+    name: "orig_output"
+    description: "The original output tensor."
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
+    name: "grad"
+    description: "4-D.  Gradients w.r.t. the output of `max_pool`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "Gradients w.r.t. the input to `max_pool`."
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    description: "The size of the window for each dimension of the input tensor."
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    description: "The stride of the sliding window for each dimension of the\ninput tensor."
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    description: "The type of padding algorithm to use."
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  summary: "Computes gradients of the maxpooling function."
+}
+op {
+  name: "MaxPoolGradGrad"
+  input_arg {
+    name: "orig_input"
+    description: "The original input tensor."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    description: "The original output tensor."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "4-D.  Gradients of gradients w.r.t. the input of `max_pool`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "Gradients of gradients w.r.t. the input to `max_pool`."
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    description: "The size of the window for each dimension of the input tensor."
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    description: "The stride of the sliding window for each dimension of the\ninput tensor."
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    description: "The type of padding algorithm to use."
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    description: "Specify the data format of the input and output data. With the\ndefault format \"NHWC\", the data is stored in the order of:\n    [batch, in_height, in_width, in_channels].\nAlternatively, the format could be \"NCHW\", the data storage order of:\n    [batch, in_channels, in_height, in_width]."
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  summary: "Computes second-order gradients of the maxpooling function."
+}
+op {
+  name: "MaxPoolGradGradWithArgmax"
+  input_arg {
+    name: "input"
+    description: "The original input."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the\ninput of `max_pool`."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
     description: "The indices of the maximum values chosen for each output of `max_pool`."
     type_attr: "Targmax"
   }
   output_arg {
     name: "output"
-    description: "Gradients w.r.t. the input of `max_pool`."
+    description: "Gradients of gradients w.r.t. the input of `max_pool`."
     type_attr: "T"
   }
   attr {
@@ -10914,12 +11776,92 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  summary: "Computes second-order gradients of the maxpooling function."
+}
+op {
+  name: "MaxPoolGradWithArgmax"
+  input_arg {
+    name: "input"
+    description: "The original input."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the\noutput of `max_pool`."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    description: "The indices of the maximum values chosen for each output of `max_pool`."
+    type_attr: "Targmax"
+  }
+  output_arg {
+    name: "output"
+    description: "Gradients w.r.t. the input of `max_pool`."
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    description: "The size of the window for each dimension of the input tensor."
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    description: "The stride of the sliding window for each dimension of the\ninput tensor."
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    description: "The type of padding algorithm to use."
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
       }
     }
@@ -10984,12 +11926,16 @@ op {
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
       }
     }
@@ -11120,7 +12066,7 @@ op {
     minimum: 1
   }
   summary: "Forwards the value of an available tensor from `inputs` to `output`."
-  description: "`Merge` waits for at least one of the tensors in `inputs` to become available.\nIt is usually combined with `Switch` to implement branching.\n\n`Merge` forwards the first tensor for become available to `output`, and sets\n`value_index` to its index in `inputs`."
+  description: "`Merge` waits for at least one of the tensors in `inputs` to become available.\nIt is usually combined with `Switch` to implement branching.\n\n`Merge` forwards the first tensor to become available to `output`, and sets\n`value_index` to its index in `inputs`."
 }
 op {
   name: "MergeSummary"
@@ -11151,21 +12097,72 @@ op {
     description: "prefixes of V2 checkpoints to merge."
     type: DT_STRING
   }
-  input_arg {
-    name: "destination_prefix"
-    description: "scalar.  The desired final prefix.  Allowed to be the same\nas one of the checkpoint_prefixes."
-    type: DT_STRING
+  input_arg {
+    name: "destination_prefix"
+    description: "scalar.  The desired final prefix.  Allowed to be the same\nas one of the checkpoint_prefixes."
+    type: DT_STRING
+  }
+  attr {
+    name: "delete_old_dirs"
+    type: "bool"
+    default_value {
+      b: true
+    }
+    description: "see above."
+  }
+  summary: "V2 format specific: merges the metadata files of sharded checkpoints.  The"
+  description: "result is one logical checkpoint, with one physical metadata file and renamed\ndata files.\n\nIntended for \"grouping\" multiple checkpoints in a sharded checkpoint setup.\n\nIf delete_old_dirs is true, attempts to delete recursively the dirname of each\npath in the input checkpoint_prefixes.  This is useful when those paths are non\nuser-facing temporary locations."
+}
+op {
+  name: "Mfcc"
+  input_arg {
+    name: "spectrogram"
+    description: "Typically produced by the Spectrogram op, with magnitude_squared\nset to true."
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sample_rate"
+    description: "How many samples per second the source audio used."
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "upper_frequency_limit"
+    type: "float"
+    default_value {
+      f: 4000
+    }
+    description: "The highest frequency to use when calculating the\nceptstrum."
+  }
+  attr {
+    name: "lower_frequency_limit"
+    type: "float"
+    default_value {
+      f: 20
+    }
+    description: "The lowest frequency to use when calculating the\nceptstrum."
   }
   attr {
-    name: "delete_old_dirs"
-    type: "bool"
+    name: "filterbank_channel_count"
+    type: "int"
     default_value {
-      b: true
+      i: 40
     }
-    description: "see above."
+    description: "Resolution of the Mel bank used internally."
   }
-  summary: "V2 format specific: merges the metadata files of sharded checkpoints.  The"
-  description: "result is one logical checkpoint, with one physical metadata file and renamed\ndata files.\n\nIntended for \"grouping\" multiple checkpoints in a sharded checkpoint setup.\n\nIf delete_old_dirs is true, attempts to delete recursively the dirname of each\npath in the input checkpoint_prefixes.  This is useful when those paths are non\nuser-facing temporary locations."
+  attr {
+    name: "dct_coefficient_count"
+    type: "int"
+    default_value {
+      i: 13
+    }
+    description: "How many output channels to produce per time slice."
+  }
+  summary: "Transforms a spectrogram into a form that\'s useful for speech recognition."
+  description: "Mel Frequency Cepstral Coefficients are a way of representing audio data that\'s\nbeen effective as an input feature for machine learning. They are created by\ntaking the spectrum of a spectrogram (a \'cepstrum\'), and discarding some of the\nhigher frequencies that are less significant to the human ear. They have a long\nhistory in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum\nis a good resource to learn more."
 }
 op {
   name: "Min"
@@ -11307,7 +12304,7 @@ op {
     }
   }
   summary: "Pads a tensor with mirrored values."
-  description: "This operation pads a `input` with mirrored values according to the `paddings`\nyou specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is\nthe rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many values to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many values to add after the contents of `input`\nin that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater\nthan `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true\n(if false, respectively).\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```prettyprint\n# \'t\' is [[1, 2, 3], [4, 5, 6]].\n# \'paddings\' is [[1, 1]], [2, 2]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]\n                      [2, 1, 1, 2, 3, 3, 2]\n                      [5, 4, 4, 5, 6, 6, 5]\n                      [5, 4, 4, 5, 6, 6, 5]]\n```"
+  description: "This operation pads a `input` with mirrored values according to the `paddings`\nyou specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is\nthe rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many values to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many values to add after the contents of `input`\nin that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater\nthan `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true\n(if false, respectively).\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 2, 3], [4, 5, 6]].\n# \'paddings\' is [[1, 1]], [2, 2]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]\n                      [2, 1, 1, 2, 3, 3, 2]\n                      [5, 4, 4, 5, 6, 6, 5]\n                      [5, 4, 4, 5, 6, 6, 5]]\n```"
 }
 op {
   name: "MirrorPadGrad"
@@ -11355,7 +12352,7 @@ op {
     }
   }
   summary: "Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor."
-  description: "This operation folds the padded areas of `input` by `MirrorPad` according to the\n`paddings` you specify. `paddings` must be the same as `paddings` argument\ngiven to the corresponding `MirrorPad` op.\n\nThe folded size of each dimension D of the output is:\n\n`input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`\n\nFor example:\n\n```prettyprint\n# \'t\' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].\n# \'paddings\' is [[0, 1]], [0, 1]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[ 1,  5]\n                      [11, 28]]\n```"
+  description: "This operation folds the padded areas of `input` by `MirrorPad` according to the\n`paddings` you specify. `paddings` must be the same as `paddings` argument\ngiven to the corresponding `MirrorPad` op.\n\nThe folded size of each dimension D of the output is:\n\n`input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].\n# \'paddings\' is [[0, 1]], [0, 1]].\n# \'mode\' is SYMMETRIC.\n# rank of \'t\' is 2.\npad(t, paddings) ==> [[ 1,  5]\n                      [11, 28]]\n```"
 }
 op {
   name: "Mod"
@@ -11383,8 +12380,8 @@ op {
       }
     }
   }
-  summary: "Returns element-wise remainder of division."
-  description: "*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
+  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
+  description: "the result here is consistent with a truncating divide. E.g. `truncate(x / y) *\ny + truncate_mod(x, y) = x`.\n\n*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "Mul"
@@ -11547,8 +12544,82 @@ op {
     }
     description: "The maximum ratio between number of entries and number of\nbuckets before growing the table. Must be between 0 and 1."
   }
-  summary: "Creates an empty hash table that uses tensors as the backing store. It uses"
-  description: "\"open addressing\" with quadratic reprobing to resolve collisions.\n\nThis op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  summary: "Creates an empty hash table that uses tensors as the backing store."
+  description: "It uses \"open addressing\" with quadratic reprobing to resolve\ncollisions.\n\nThis op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  is_stateful: true
+}
+op {
+  name: "MutableDenseHashTableV2"
+  input_arg {
+    name: "empty_key"
+    description: "The key used to represent empty key buckets internally. Must not\nbe used in insert or lookup operations."
+    type_attr: "key_dtype"
+  }
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+    description: "Type of the table keys."
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+    description: "Type of the table values."
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+    description: "The shape of each value."
+  }
+  attr {
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
+    description: "The initial number of hash table buckets. Must be a power\nto 2."
+  }
+  attr {
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
+    description: "The maximum ratio between number of entries and number of\nbuckets before growing the table. Must be between 0 and 1."
+  }
+  summary: "Creates an empty hash table that uses tensors as the backing store."
+  description: "It uses \"open addressing\" with quadratic reprobing to resolve\ncollisions.\n\nThis op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
 op {
@@ -11650,6 +12721,103 @@ op {
   description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a vector. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
   is_stateful: true
 }
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+    description: "Type of the table keys."
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+    description: "Type of the table values."
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  summary: "Creates an empty hash table."
+  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a vector. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    description: "Handle to a table."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this table is shared under the given name across\nmultiple sessions."
+  }
+  attr {
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If true and shared_name is empty, the table is shared\nusing the node name."
+  }
+  attr {
+    name: "key_dtype"
+    type: "type"
+    description: "Type of the table keys."
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+    description: "Type of the table values."
+  }
+  summary: "Creates an empty hash table."
+  description: "This op creates a mutable hash table, specifying the type of its keys and\nvalues. Each value must be a scalar. Data can be inserted into the table using\nthe insert operations. It does not support the initialization operation."
+  is_stateful: true
+}
 op {
   name: "Neg"
   input_arg {
@@ -11776,7 +12944,37 @@ op {
     description: "A float representing the threshold for deciding whether boxes\noverlap too much with respect to IOU."
   }
   summary: "Greedily selects a subset of bounding boxes in descending order of score,"
-  description: "pruning away boxes that have high intersection-over-union (IOU) overlap\nwith previously selected boxes.  Bounding boxes are supplied as\n[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any\ndiagonal pair of box corners and the coordinates can be provided as normalized\n(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm\nis agnostic to where the origin is in the coordinate system.  Note that this\nalgorithm is invariant to orthogonal transformations and translations\nof the coordinate system; thus translating or reflections of the coordinate\nsystem result in the same boxes being selected by the algorithm.\n\nThe output of this operation is a set of integers indexing into the input\ncollection of bounding boxes representing the selected boxes.  The bounding\nbox coordinates corresponding to the selected indices can then be obtained\nusing the `tf.gather operation`.  For example:\n\n  selected_indices = tf.image.non_max_suppression(\n      boxes, scores, max_output_size, iou_threshold)\n  selected_boxes = tf.gather(boxes, selected_indices)"
+  description: "pruning away boxes that have high intersection-over-union (IOU) overlap\nwith previously selected boxes.  Bounding boxes are supplied as\n[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any\ndiagonal pair of box corners and the coordinates can be provided as normalized\n(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm\nis agnostic to where the origin is in the coordinate system.  Note that this\nalgorithm is invariant to orthogonal transformations and translations\nof the coordinate system; thus translating or reflections of the coordinate\nsystem result in the same boxes being selected by the algorithm.\nThe output of this operation is a set of integers indexing into the input\ncollection of bounding boxes representing the selected boxes.  The bounding\nbox coordinates corresponding to the selected indices can then be obtained\nusing the `tf.gather operation`.  For example:\n  selected_indices = tf.image.non_max_suppression(\n      boxes, scores, max_output_size, iou_threshold)\n  selected_boxes = tf.gather(boxes, selected_indices)"
+}
+op {
+  name: "NonMaxSuppressionV2"
+  input_arg {
+    name: "boxes"
+    description: "A 2-D float tensor of shape `[num_boxes, 4]`."
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    description: "A 1-D float tensor of shape `[num_boxes]` representing a single\nscore corresponding to each box (each row of boxes)."
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size"
+    description: "A scalar integer tensor representing the maximum number of\nboxes to be selected by non max suppression."
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    description: "A 0-D float tensor representing the threshold for deciding whether\nboxes overlap too much with respect to IOU."
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    description: "A 1-D integer tensor of shape `[M]` representing the selected\nindices from the boxes tensor, where `M <= max_output_size`."
+    type: DT_INT32
+  }
+  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
+  description: "pruning away boxes that have high intersection-over-union (IOU) overlap\nwith previously selected boxes.  Bounding boxes are supplied as\n[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any\ndiagonal pair of box corners and the coordinates can be provided as normalized\n(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm\nis agnostic to where the origin is in the coordinate system.  Note that this\nalgorithm is invariant to orthogonal transformations and translations\nof the coordinate system; thus translating or reflections of the coordinate\nsystem result in the same boxes being selected by the algorithm.\n\nThe output of this operation is a set of integers indexing into the input\ncollection of bounding boxes representing the selected boxes.  The bounding\nbox coordinates corresponding to the selected indices can then be obtained\nusing the `tf.gather operation`.  For example:\n\n  selected_indices = tf.image.non_max_suppression_v2(\n      boxes, scores, max_output_size, iou_threshold)\n  selected_boxes = tf.gather(boxes, selected_indices)"
 }
 op {
   name: "NotEqual"
@@ -11859,21 +13057,63 @@ op {
     type: "type"
   }
   attr {
-    name: "TI"
-    type: "type"
+    name: "TI"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Returns a one-hot tensor."
+  description: "The locations represented by indices in `indices` take value `on_value`,\nwhile all other locations take value `off_value`.\n\nIf the input `indices` is rank `N`, the output will have rank `N+1`,\nThe new axis is created at dimension `axis` (default: the new axis is\nappended at the end).\n\nIf `indices` is a scalar the output shape will be a vector of length `depth`.\n\nIf `indices` is a vector of length `features`, the output shape will be:\n```\n  features x depth if axis == -1\n  depth x features if axis == 0\n```\n\nIf `indices` is a matrix (batch) with shape `[batch, features]`,\nthe output shape will be:\n```\n  batch x features x depth if axis == -1\n  batch x depth x features if axis == 1\n  depth x batch x features if axis == 0\n```\n\n\nExamples\n=========\n\nSuppose that\n\n```\n  indices = [0, 2, -1, 1]\n  depth = 3\n  on_value = 5.0\n  off_value = 0.0\n  axis = -1\n```\n\nThen output is `[4 x 3]`:\n\n    ```output =\n      [5.0 0.0 0.0]  // one_hot(0)\n      [0.0 0.0 5.0]  // one_hot(2)\n      [0.0 0.0 0.0]  // one_hot(-1)\n      [0.0 5.0 0.0]  // one_hot(1)\n    ```\n\nSuppose that\n\n```\n  indices = [0, 2, -1, 1]\n  depth = 3\n  on_value = 0.0\n  off_value = 3.0\n  axis = 0\n```\n\nThen output is `[3 x 4]`:\n\n    ```output =\n      [0.0 3.0 3.0 3.0]\n      [3.0 3.0 3.0 0.0]\n      [3.0 3.0 3.0 3.0]\n      [3.0 0.0 3.0 3.0]\n    //  ^                one_hot(0)\n    //      ^            one_hot(2)\n    //          ^        one_hot(-1)\n    //              ^    one_hot(1)\n    ```\nSuppose that\n\n```\n  indices = [[0, 2], [1, -1]]\n  depth = 3\n  on_value = 1.0\n  off_value = 0.0\n  axis = -1\n```\n\nThen output is `[2 x 2 x 3]`:\n\n    ```output =\n      [\n        [1.0, 0.0, 0.0]  // one_hot(0)\n        [0.0, 0.0, 1.0]  // one_hot(2)\n      ][\n        [0.0, 1.0, 0.0]  // one_hot(1)\n        [0.0, 0.0, 0.0]  // one_hot(-1)\n      ]```"
+}
+op {
+  name: "OneShotIterator"
+  output_arg {
+    name: "handle"
+    description: "A handle to the iterator that can be passed to an \"IteratorGetNext\"\nop."
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "dataset_factory"
+    type: "func"
+    description: "A function of type `() -> DT_RESOURCE`, where the returned\nDT_RESOURCE is a handle to a dataset."
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
     default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_UINT8
-        type: DT_INT32
-        type: DT_INT64
-      }
+      s: ""
     }
   }
-  summary: "Returns a one-hot tensor."
-  description: "The locations represented by indices in `indices` take value `on_value`,\nwhile all other locations take value `off_value`.\n\nIf the input `indices` is rank `N`, the output will have rank `N+1`,\nThe new axis is created at dimension `axis` (default: the new axis is\nappended at the end).\n\nIf `indices` is a scalar the output shape will be a vector of length `depth`.\n\nIf `indices` is a vector of length `features`, the output shape will be:\n```\n  features x depth if axis == -1\n  depth x features if axis == 0\n```\n\nIf `indices` is a matrix (batch) with shape `[batch, features]`,\nthe output shape will be:\n```\n  batch x features x depth if axis == -1\n  batch x depth x features if axis == 1\n  depth x batch x features if axis == 0\n```\n\n\nExamples\n=========\n\nSuppose that\n\n```\n  indices = [0, 2, -1, 1]\n  depth = 3\n  on_value = 5.0\n  off_value = 0.0\n  axis = -1\n```\n\nThen output is `[4 x 3]`:\n\n    ```output =\n      [5.0 0.0 0.0]  // one_hot(0)\n      [0.0 0.0 5.0]  // one_hot(2)\n      [0.0 0.0 0.0]  // one_hot(-1)\n      [0.0 5.0 0.0]  // one_hot(1)\n    ```\n\nSuppose that\n\n```\n  indices = [0, 2, -1, 1]\n  depth = 3\n  on_value = 0.0\n  off_value = 3.0\n  axis = 0\n```\n\nThen output is `[3 x 4]`:\n\n    ```output =\n      [0.0 3.0 3.0 3.0]\n      [3.0 3.0 3.0 0.0]\n      [3.0 3.0 3.0 3.0]\n      [3.0 0.0 3.0 3.0]\n    //  ^                one_hot(0)\n    //      ^            one_hot(2)\n    //          ^        one_hot(-1)\n    //              ^    one_hot(1)\n    ```\nSuppose that\n\n```\n  indices = [[0, 2], [1, -1]]\n  depth = 3\n  on_value = 1.0\n  off_value = 0.0\n  axis = -1\n```\n\nThen output is `[2 x 2 x 3]`:\n\n    ```output =\n      [\n        [1.0, 0.0, 0.0]  // one_hot(0)\n        [0.0, 0.0, 1.0]  // one_hot(2)\n      ][\n        [0.0, 1.0, 0.0]  // one_hot(1)\n        [0.0, 0.0, 0.0]  // one_hot(-1)\n      ]```"
+  summary: "Makes a \"one-shot\" iterator that can be iterated only once."
+  description: "A one-shot iterator bundles the logic for defining the dataset and\nthe state of the iterator in a single op, which allows simple input\npipelines to be defined without an additional initialization\n(\"MakeIterator\") step.\n\nOne-shot iterators have the following limitations:\n\n* They do not support parameterization: all logic for creating the underlying\n  dataset must be bundled in the `dataset_factory` function.\n* They are not resettable. Once a one-shot iterator reaches the end of its\n  underlying dataset, subsequent \"IteratorGetNext\" operations on that\n  iterator will always produce an `OutOfRange` error.\n\nFor greater flexibility, use \"Iterator\" and \"MakeIterator\" to define\nan iterator using an arbitrary subgraph, which may capture tensors\n(including fed values) as parameters, and which may be reset multiple\ntimes by rerunning \"MakeIterator\"."
+  is_stateful: true
 }
 op {
   name: "OnesLike"
@@ -11935,7 +13175,7 @@ op {
     description: "Dimension along which to pack.  Negative values wrap around, so the\nvalid range is `[-(R+1), R+1)`."
   }
   summary: "Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor."
-  description: "Packs the `N` tensors in `values` into a tensor with rank one higher than each\ntensor in `values`, by packing them along the `axis` dimension.\nGiven a list of tensors of shape `(A, B, C)`;\n\nif `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.\nif `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.\nEtc.\n\nFor example:\n\n```prettyprint\n# \'x\' is [1, 4]\n# \'y\' is [2, 5]\n# \'z\' is [3, 6]\npack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\npack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]\n```\n\nThis is the opposite of `unpack`."
+  description: "Packs the `N` tensors in `values` into a tensor with rank one higher than each\ntensor in `values`, by packing them along the `axis` dimension.\nGiven a list of tensors of shape `(A, B, C)`;\n\nif `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.\nif `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.\nEtc.\n\nFor example:\n\n```\n# \'x\' is [1, 4]\n# \'y\' is [2, 5]\n# \'z\' is [3, 6]\npack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\npack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]\n```\n\nThis is the opposite of `unpack`."
 }
 op {
   name: "Pad"
@@ -11969,7 +13209,54 @@ op {
     }
   }
   summary: "Pads a tensor with zeros."
-  description: "This operation pads a `input` with zeros according to the `paddings` you\nspecify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the\nrank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many zeros to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many zeros to add after the contents of `input`\nin that dimension.\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```prettyprint\n# \'t\' is [[1, 1], [2, 2]]\n# \'paddings\' is [[1, 1], [2, 2]]\n# rank of \'t\' is 2\npad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]\n                      [0, 0, 1, 1, 0, 0]\n                      [0, 0, 2, 2, 0, 0]\n                      [0, 0, 0, 0, 0, 0]]\n```"
+  description: "This operation pads a `input` with zeros according to the `paddings` you\nspecify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the\nrank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates\nhow many zeros to add before the contents of `input` in that dimension, and\n`paddings[D, 1]` indicates how many zeros to add after the contents of `input`\nin that dimension.\n\nThe padded size of each dimension D of the output is:\n\n`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`\n\nFor example:\n\n```\n# \'t\' is [[1, 1], [2, 2]]\n# \'paddings\' is [[1, 1], [2, 2]]\n# rank of \'t\' is 2\npad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]\n                      [0, 0, 1, 1, 0, 0]\n                      [0, 0, 2, 2, 0, 0]\n                      [0, 0, 0, 0, 0, 0]]\n```"
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "batch_size"
+    description: "A scalar representing the number of elements to accumulate in a\nbatch."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    description: "A list of int64 tensors representing the desired padded shapes\nof the corresponding output components. These shapes may be partially\nspecified, using `-1` to indicate that a particular dimension should be\npadded to the maximum size of all batch elements."
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    description: "A list of scalars containing the padding value to use for\neach of the outputs."
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that batches and pads `batch_size` elements from the input."
+  is_stateful: true
 }
 op {
   name: "PaddingFIFOQueue"
@@ -12105,7 +13392,56 @@ op {
     description: "the final shape of the result; should be equal to the shapes of any input\nbut with the number of input values in the first dimension."
   }
   summary: "Concatenates a list of `N` tensors along the first dimension."
-  description: "The input tensors are all required to have size 1 in the first dimension.\n\nFor example:\n\n```prettyprint\n# \'x\' is [[1, 4]]\n# \'y\' is [[2, 5]]\n# \'z\' is [[3, 6]]\nparallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\n```\n\nThe difference between concat and parallel_concat is that concat requires all\nof the inputs be computed before the operation will begin but doesn\'t require\nthat the input shapes be known during graph construction.  Parallel concat\nwill copy pieces of the input into the output as they become available, in\nsome situations this can provide a performance benefit."
+  description: "The input tensors are all required to have size 1 in the first dimension.\n\nFor example:\n\n```\n# \'x\' is [[1, 4]]\n# \'y\' is [[2, 5]]\n# \'z\' is [[3, 6]]\nparallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.\n```\n\nThe difference between concat and parallel_concat is that concat requires all\nof the inputs be computed before the operation will begin but doesn\'t require\nthat the input shapes be known during graph construction.  Parallel concat\nwill copy pieces of the input into the output as they become available, in\nsome situations this can provide a performance benefit."
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_threads"
+    description: "The number of threads to use to process elements from\n`input_dataset`."
+    type: DT_INT32
+  }
+  input_arg {
+    name: "output_buffer_size"
+    description: "The maximum number of output elements to buffer in an\niterator over this dataset."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  description: "Unlike a \"MapDataset\", which applies `f` sequentially, this dataset uses\nup to `num_threads` threads to process elements from `input_dataset`\nin parallel."
+  is_stateful: true
 }
 op {
   name: "ParameterizedTruncatedNormal"
@@ -12509,6 +13845,7 @@ op {
     type: "shape"
     default_value {
       shape {
+        unknown_rank: true
       }
     }
     description: "(Optional) The shape of the tensor. If the shape has 0 dimensions, the\nshape is unconstrained."
@@ -12535,6 +13872,10 @@ op {
   }
   summary: "A placeholder op for a value that will be fed into the computation."
   description: "N.B. This operation will fail with an error if it is executed. It is\nintended as a way to represent a value that will always be fed, and to\nprovide attrs that enable the fed value to be checked at runtime."
+  deprecation {
+    version: 23
+    explanation: "Placeholder now behaves the same as PlaceholderV2."
+  }
 }
 op {
   name: "PlaceholderWithDefault"
@@ -12585,7 +13926,7 @@ op {
     }
   }
   summary: "Compute the polygamma function \\\\(\\psi^{(n)}(x)\\\\)."
-  description: "The polygamma function is defined as:\n\n```\n\\psi^{(n)}(x) = \\frac{d^n}{dx^n} \\psi(x)\n```\nwhere \\\\(\\psi(x)\\\\) is the digamma function."
+  description: "The polygamma function is defined as:\n\n\n\\\\(\\psi^{(n)}(x) = \\frac{d^n}{dx^n} \\psi(x)\\\\)\n\nwhere \\\\(\\psi(x)\\\\) is the digamma function."
 }
 op {
   name: "Pow"
@@ -14272,7 +15613,7 @@ op {
     default_value {
       b: false
     }
-    description: "If true, all pending enqueue requests that are\nblocked on the given queue will be cancelled."
+    description: "If true, all pending enqueue requests that are\nblocked on the given queue will be canceled."
   }
   summary: "Closes the given queue."
   description: "This operation signals that no more elements will be enqueued in the\ngiven queue. Subsequent Enqueue(Many) operations will fail.\nSubsequent Dequeue(Many) operations will continue to succeed if\nsufficient elements remain in the queue. Subsequent Dequeue(Many)\noperations that would block will fail immediately."
@@ -14290,7 +15631,7 @@ op {
     default_value {
       b: false
     }
-    description: "If true, all pending enqueue requests that are\nblocked on the given queue will be cancelled."
+    description: "If true, all pending enqueue requests that are\nblocked on the given queue will be canceled."
   }
   summary: "Closes the given queue."
   description: "This operation signals that no more elements will be enqueued in the\ngiven queue. Subsequent Enqueue(Many) operations will fail.\nSubsequent Dequeue(Many) operations will continue to succeed if\nsufficient elements remain in the queue. Subsequent Dequeue(Many)\noperations that would block will fail immediately."
@@ -14360,8 +15701,8 @@ op {
     }
     description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues n tuples of one or more tensors from the given queue."
-  description: "If the queue is closed and there are fewer than n elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has k outputs, where k is the number of components in\nthe tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until n elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: "If the queue is closed and there are fewer than `n` elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until `n` elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
 }
 op {
   name: "QueueDequeueManyV2"
@@ -14395,8 +15736,8 @@ op {
     }
     description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues n tuples of one or more tensors from the given queue."
-  description: "If the queue is closed and there are fewer than n elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has k outputs, where k is the number of components in\nthe tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until n elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: "If the queue is closed and there are fewer than `n` elements, then an\nOutOfRange error is returned.\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple.\n\nN.B. If the queue is empty, this operation will block until `n` elements\nhave been dequeued (or \'timeout_ms\' elapses, if specified)."
   is_stateful: true
 }
 op {
@@ -14432,8 +15773,8 @@ op {
     }
     description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues n tuples of one or more tensors from the given queue."
-  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than n elements\nremaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If the queue\nis closed and there are 0 elements left in the queue, then an OutOfRange\nerror is returned just like in QueueDequeueMany.  Otherwise the behavior\nis identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has k outputs, where k is the number of components in\nthe tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple."
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than `n`\nelements remaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If\nthe queue is closed and there are 0 elements left in the queue, then\nan OutOfRange error is returned just like in QueueDequeueMany.\nOtherwise the behavior is identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size `n` in the 0th dimension.\n\nThis operation has k outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple."
 }
 op {
   name: "QueueDequeueUpToV2"
@@ -14467,8 +15808,8 @@ op {
     }
     description: "If the queue has fewer than n elements, this operation\nwill block for up to timeout_ms milliseconds.\nNote: This option is not supported yet."
   }
-  summary: "Dequeues n tuples of one or more tensors from the given queue."
-  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than n elements\nremaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If the queue\nis closed and there are 0 elements left in the queue, then an OutOfRange\nerror is returned just like in QueueDequeueMany.  Otherwise the behavior\nis identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has k outputs, where k is the number of components in\nthe tuples stored in the given queue, and output i is the ith\ncomponent of the dequeued tuple."
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: "This operation is not supported by all queues.  If a queue does not support\nDequeueUpTo, then an Unimplemented error is returned.\n\nIf the queue is closed and there are more than 0 but less than `n`\nelements remaining, then instead of returning an OutOfRange error like\nQueueDequeueMany, less than `n` elements are returned immediately.  If\nthe queue is closed and there are 0 elements left in the queue, then\nan OutOfRange error is returned just like in QueueDequeueMany.\nOtherwise the behavior is identical to QueueDequeueMany:\n\nThis operation concatenates queue-element component tensors along the\n0th dimension to make a single component tensor.  All of the components\nin the dequeued tuple will have size n in the 0th dimension.\n\nThis operation has `k` outputs, where `k` is the number of components in\nthe tuples stored in the given queue, and output `i` is the ith\ncomponent of the dequeued tuple."
   is_stateful: true
 }
 op {
@@ -14666,11 +16007,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same rank as `input`. The inner-most\n  dimension of `input` is replaced with the `fft_length / 2 + 1` unique\n  frequency components of its 1D Fourier Transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfft\n@end_compatibility"
+    description: "A complex64 tensor of the same rank as `input`. The inner-most\n  dimension of `input` is replaced with the `fft_length / 2 + 1` unique\n  frequency components of its 1D Fourier transform.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfft\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the 1-dimensional discrete Fourier Transform of a real-valued signal"
-  description: "over the inner-most dimension of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the\n`fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,\nfollowed by the `fft_length / 2` positive-frequency terms."
+  summary: "Real-valued fast Fourier transform."
+  description: "Computes the 1-dimensional discrete Fourier transform of a real-valued signal\nover the inner-most dimension of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the\n`fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,\nfollowed by the `fft_length / 2` positive-frequency terms."
 }
 op {
   name: "RFFT2D"
@@ -14686,11 +16027,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same rank as `input`. The inner-most 2\n  dimensions of `input` are replaced with their 2D Fourier Transform. The\n  inner-most dimension contains `fft_length / 2 + 1` unique frequency\n  components.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfft2\n@end_compatibility"
+    description: "A complex64 tensor of the same rank as `input`. The inner-most 2\n  dimensions of `input` are replaced with their 2D Fourier transform. The\n  inner-most dimension contains `fft_length / 2 + 1` unique frequency\n  components.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfft2\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the 2-dimensional discrete Fourier Transform of a real-valued signal"
-  description: "over the inner-most 2 dimensions of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the\n`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension\nof `output`: the zero-frequency term, followed by the `fft_length / 2`\npositive-frequency terms."
+  summary: "2D real-valued fast Fourier transform."
+  description: "Computes the 2-dimensional discrete Fourier transform of a real-valued signal\nover the inner-most 2 dimensions of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the\n`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension\nof `output`: the zero-frequency term, followed by the `fft_length / 2`\npositive-frequency terms."
 }
 op {
   name: "RFFT3D"
@@ -14706,11 +16047,11 @@ op {
   }
   output_arg {
     name: "output"
-    description: "A complex64 tensor of the same rank as `input`. The inner-most 3\n  dimensions of `input` are replaced with the their 3D Fourier Transform. The\n  inner-most dimension contains `fft_length / 2 + 1` unique frequency\n  components.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfftn with 3 dimensions.\n@end_compatibility"
+    description: "A complex64 tensor of the same rank as `input`. The inner-most 3\n  dimensions of `input` are replaced with the their 3D Fourier transform. The\n  inner-most dimension contains `fft_length / 2 + 1` unique frequency\n  components.\n\n@compatibility(numpy)\nEquivalent to np.fft.rfftn with 3 dimensions.\n@end_compatibility"
     type: DT_COMPLEX64
   }
-  summary: "Compute the 3-dimensional discrete Fourier Transform of a real-valued signal"
-  description: "over the inner-most 3 dimensions of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the\n`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension\nof `output`: the zero-frequency term, followed by the `fft_length / 2`\npositive-frequency terms."
+  summary: "3D real-valued fast Fourier transform."
+  description: "Computes the 3-dimensional discrete Fourier transform of a real-valued signal\nover the inner-most 3 dimensions of `input`.\n\nSince the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the\n`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension\nof `output`: the zero-frequency term, followed by the `fft_length / 2`\npositive-frequency terms."
 }
 op {
   name: "RGBToHSV"
@@ -15309,6 +16650,42 @@ op {
   summary: "Creates a sequence of numbers."
   description: "This operation creates a sequence of numbers that begins at `start` and\nextends by increments of `delta` up to but not including `limit`.\n\nFor example:\n\n```\n# \'start\' is 3\n# \'limit\' is 18\n# \'delta\' is 3\ntf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]\n```"
 }
+op {
+  name: "RangeDataset"
+  input_arg {
+    name: "start"
+    description: "corresponds to start in python\'s xrange()."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stop"
+    description: "corresponds to stop in python\'s xrange()."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "step"
+    description: "corresponds to step in python\'s xrange()."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset with a range of values. Corresponds to python\'s xrange."
+  is_stateful: true
+}
 op {
   name: "Rank"
   input_arg {
@@ -15324,7 +16701,7 @@ op {
     type: "type"
   }
   summary: "Returns the rank of a tensor."
-  description: "This operation returns an integer representing the rank of `input`.\n\nFor example:\n\n```prettyprint\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\n# shape of tensor \'t\' is [2, 2, 3]\nrank(t) ==> 3\n```\n\n**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank\nof a tensor is the number of indices required to uniquely select each element\nof the tensor. Rank is also known as \"order\", \"degree\", or \"ndims.\""
+  description: "This operation returns an integer representing the rank of `input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\n# shape of tensor \'t\' is [2, 2, 3]\nrank(t) ==> 3\n```\n\n**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank\nof a tensor is the number of indices required to uniquely select each element\nof the tensor. Rank is also known as \"order\", \"degree\", or \"ndims.\""
 }
 op {
   name: "ReadFile"
@@ -16148,6 +17525,36 @@ op {
   }
   summary: "Computes rectified linear gradients for a Relu operation."
 }
+op {
+  name: "RepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "count"
+    description: "A scalar representing the number of times that `input_dataset` should\nbe repeated. A value of `-1` indicates that it should be repeated infinitely."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that emits the outputs of `input_dataset` `count` times."
+  is_stateful: true
+}
 op {
   name: "RequantizationRange"
   input_arg {
@@ -16295,7 +17702,7 @@ op {
     }
   }
   summary: "Reshapes a tensor."
-  description: "Given `tensor`, this operation returns a tensor that has the same values\nas `tensor` with shape `shape`.\n\nIf one component of `shape` is the special value -1, the size of that dimension\nis computed so that the total size remains constant.  In particular, a `shape`\nof `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.\n\nIf `shape` is 1-D or higher, then the operation returns a tensor with shape\n`shape` filled with the values of `tensor`. In this case, the number of elements\nimplied by `shape` must be the same as the number of elements in `tensor`.\n\nFor example:\n\n```prettyprint\n# tensor \'t\' is [1, 2, 3, 4, 5, 6, 7, 8, 9]\n# tensor \'t\' has shape [9]\nreshape(t, [3, 3]) ==> [[1, 2, 3],\n                        [4, 5, 6],\n                        [7, 8, 9]]\n\n# tensor \'t\' is [[[1, 1], [2, 2]],\n#                [[3, 3], [4, 4]]]\n# tensor \'t\' has shape [2, 2, 2]\nreshape(t, [2, 4]) ==> [[1, 1, 2, 2],\n                        [3, 3, 4, 4]]\n\n# tensor \'t\' is [[[1, 1, 1],\n#                 [2, 2, 2]],\n#                [[3, 3, 3],\n#                 [4, 4, 4]],\n#                [[5, 5, 5],\n#                 [6, 6, 6]]]\n# tensor \'t\' has shape [3, 2, 3]\n# pass \'[-1]\' to flatten \'t\'\nreshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]\n\n# -1 can also be used to infer the shape\n\n# -1 is inferred to be 9:\nreshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 2:\nreshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 3:\nreshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],\n                              [2, 2, 2],\n                              [3, 3, 3]],\n                             [[4, 4, 4],\n                              [5, 5, 5],\n                              [6, 6, 6]]]\n\n# tensor \'t\' is [7]\n# shape `[]` reshapes to a scalar\nreshape(t, []) ==> 7\n```"
+  description: "Given `tensor`, this operation returns a tensor that has the same values\nas `tensor` with shape `shape`.\n\nIf one component of `shape` is the special value -1, the size of that dimension\nis computed so that the total size remains constant.  In particular, a `shape`\nof `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.\n\nIf `shape` is 1-D or higher, then the operation returns a tensor with shape\n`shape` filled with the values of `tensor`. In this case, the number of elements\nimplied by `shape` must be the same as the number of elements in `tensor`.\n\nFor example:\n\n```\n# tensor \'t\' is [1, 2, 3, 4, 5, 6, 7, 8, 9]\n# tensor \'t\' has shape [9]\nreshape(t, [3, 3]) ==> [[1, 2, 3],\n                        [4, 5, 6],\n                        [7, 8, 9]]\n\n# tensor \'t\' is [[[1, 1], [2, 2]],\n#                [[3, 3], [4, 4]]]\n# tensor \'t\' has shape [2, 2, 2]\nreshape(t, [2, 4]) ==> [[1, 1, 2, 2],\n                        [3, 3, 4, 4]]\n\n# tensor \'t\' is [[[1, 1, 1],\n#                 [2, 2, 2]],\n#                [[3, 3, 3],\n#                 [4, 4, 4]],\n#                [[5, 5, 5],\n#                 [6, 6, 6]]]\n# tensor \'t\' has shape [3, 2, 3]\n# pass \'[-1]\' to flatten \'t\'\nreshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]\n\n# -1 can also be used to infer the shape\n\n# -1 is inferred to be 9:\nreshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 2:\nreshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],\n                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]\n# -1 is inferred to be 3:\nreshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],\n                              [2, 2, 2],\n                              [3, 3, 3]],\n                             [[4, 4, 4],\n                              [5, 5, 5],\n                              [6, 6, 6]]]\n\n# tensor \'t\' is [7]\n# shape `[]` reshapes to a scalar\nreshape(t, []) ==> 7\n```"
 }
 op {
   name: "ResizeArea"
@@ -16835,6 +18242,14 @@ op {
     }
     description: "If `True`, updating of the var, m, and v tensors will be protected\nby a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, uses the nesterov update."
+  }
   summary: "Update \'*var\' according to the Adam algorithm."
   description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)"
   is_stateful: true
@@ -18078,6 +19493,81 @@ op {
   description: "Note that in dense implementation of this algorithm, ms and mom will\nupdate even if the grad is zero, but in this sparse implementation, ms\nand mom will not update in iterations during which the grad is zero.\n\nmean_square = decay * mean_square + (1-decay) * gradient ** 2\nDelta = learning_rate * gradient / sqrt(mean_square + epsilon)\n\nms <- rho * ms_{t-1} + (1-rho) * grad * grad\nmom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\nvar <- var - mom"
   is_stateful: true
 }
+op {
+  name: "ResourceStridedSliceAssign"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "begin"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "end"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  summary: "Assign `value` to the sliced l-value reference of `ref`."
+  description: "The values of `value` are assigned to the positions in the variable\n`ref` that are selected by the slice parameters. The slice parameters\n`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.\n\nNOTE this op currently does not support broadcasting and so `value`\'s\nshape must be exactly the shape produced by the slice of `ref`."
+  is_stateful: true
+}
 op {
   name: "Restore"
   input_arg {
@@ -18213,11 +19703,12 @@ op {
         type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
   summary: "Reverses specific dimensions of a tensor."
-  description: "Given a `tensor`, and a `bool` tensor `dims` representing the dimensions\nof `tensor`, this operation reverses each dimension i of `tensor` where\n`dims[i]` is `True`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions\nof `tensor` must equal the number of elements in `dims`. In other words:\n\n`rank(tensor) = size(dims)`\n\nFor example:\n\n```prettyprint\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [False, False, False, True]\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is [False, True, False, False]\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is [False, False, True, False]\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
+  description: "Given a `tensor`, and a `bool` tensor `dims` representing the dimensions\nof `tensor`, this operation reverses each dimension i of `tensor` where\n`dims[i]` is `True`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions\nof `tensor` must equal the number of elements in `dims`. In other words:\n\n`rank(tensor) = size(dims)`\n\nFor example:\n\n```\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [False, False, False, True]\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is [False, True, False, False]\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is [False, False, True, False]\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
 }
 op {
   name: "ReverseSequence"
@@ -18267,7 +19758,7 @@ op {
     }
   }
   summary: "Reverses variable length slices."
-  description: "This op first slices `input` along the dimension `batch_dim`, and for each\nslice `i`, reverses the first `seq_lengths[i]` elements along\nthe dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims[batch_dim]`.\n\nThe output slice `i` along dimension `batch_dim` is then given by input\nslice `i`, with the first `seq_lengths[i]` slices along dimension\n`seq_dim` reversed.\n\nFor example:\n\n```prettyprint\n# Given this:\nbatch_dim = 0\nseq_dim = 1\ninput.dims = (4, 8, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```\n\nIn contrast, if:\n\n```prettyprint\n# Given this:\nbatch_dim = 2\nseq_dim = 0\ninput.dims = (8, ?, 4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]\noutput[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]\noutput[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]\noutput[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]\noutput[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]\noutput[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]\noutput[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]\n```"
+  description: "This op first slices `input` along the dimension `batch_dim`, and for each\nslice `i`, reverses the first `seq_lengths[i]` elements along\nthe dimension `seq_dim`.\n\nThe elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,\nand `seq_lengths` must be a vector of length `input.dims[batch_dim]`.\n\nThe output slice `i` along dimension `batch_dim` is then given by input\nslice `i`, with the first `seq_lengths[i]` slices along dimension\n`seq_dim` reversed.\n\nFor example:\n\n```\n# Given this:\nbatch_dim = 0\nseq_dim = 1\ninput.dims = (4, 8, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]\noutput[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]\noutput[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]\noutput[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[0, 7:, :, ...] = input[0, 7:, :, ...]\noutput[1, 2:, :, ...] = input[1, 2:, :, ...]\noutput[2, 3:, :, ...] = input[2, 3:, :, ...]\noutput[3, 2:, :, ...] = input[3, 2:, :, ...]\n```\n\nIn contrast, if:\n\n```\n# Given this:\nbatch_dim = 2\nseq_dim = 0\ninput.dims = (8, ?, 4, ...)\nseq_lengths = [7, 2, 3, 5]\n\n# then slices of input are reversed on seq_dim, but only up to seq_lengths:\noutput[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]\noutput[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]\noutput[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]\noutput[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]\n\n# while entries past seq_lens are copied through:\noutput[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]\noutput[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]\noutput[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]\noutput[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]\n```"
 }
 op {
   name: "ReverseV2"
@@ -18314,11 +19805,12 @@ op {
         type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
   summary: "Reverses specific dimensions of a tensor."
-  description: "NOTE `tf.reverse` has now changed behavior in preparation for 1.0.\n`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.\n\nGiven a `tensor`, and a `int32` tensor `axis` representing the set of\ndimensions of `tensor` to reverse. This operation reverses each dimension\n`i` for which there exists `j` s.t. `axis[j] == i`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions specified\nin `axis` may be 0 or more entries. If an index is specified more than\nonce, a InvalidArgument error is raised.\n\nFor example:\n\n```prettyprint\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [3] or \'dims\' is -1\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is \'[1]\' (or \'dims\' is \'[-3]\')\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is \'[2]\' (or \'dims\' is \'[-2]\')\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
+  description: "NOTE `tf.reverse` has now changed behavior in preparation for 1.0.\n`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.\n\nGiven a `tensor`, and a `int32` tensor `axis` representing the set of\ndimensions of `tensor` to reverse. This operation reverses each dimension\n`i` for which there exists `j` s.t. `axis[j] == i`.\n\n`tensor` can have up to 8 dimensions. The number of dimensions specified\nin `axis` may be 0 or more entries. If an index is specified more than\nonce, a InvalidArgument error is raised.\n\nFor example:\n\n```\n# tensor \'t\' is [[[[ 0,  1,  2,  3],\n#                  [ 4,  5,  6,  7],\n#                  [ 8,  9, 10, 11]],\n#                 [[12, 13, 14, 15],\n#                  [16, 17, 18, 19],\n#                  [20, 21, 22, 23]]]]\n# tensor \'t\' shape is [1, 2, 3, 4]\n\n# \'dims\' is [3] or \'dims\' is -1\nreverse(t, dims) ==> [[[[ 3,  2,  1,  0],\n                        [ 7,  6,  5,  4],\n                        [ 11, 10, 9, 8]],\n                       [[15, 14, 13, 12],\n                        [19, 18, 17, 16],\n                        [23, 22, 21, 20]]]]\n\n# \'dims\' is \'[1]\' (or \'dims\' is \'[-3]\')\nreverse(t, dims) ==> [[[[12, 13, 14, 15],\n                        [16, 17, 18, 19],\n                        [20, 21, 22, 23]\n                       [[ 0,  1,  2,  3],\n                        [ 4,  5,  6,  7],\n                        [ 8,  9, 10, 11]]]]\n\n# \'dims\' is \'[2]\' (or \'dims\' is \'[-2]\')\nreverse(t, dims) ==> [[[[8, 9, 10, 11],\n                        [4, 5, 6, 7],\n                        [0, 1, 2, 3]]\n                       [[20, 21, 22, 23],\n                        [16, 17, 18, 19],\n                        [12, 13, 14, 15]]]]\n```"
 }
 op {
   name: "Rint"
@@ -18723,7 +20215,7 @@ op {
     description: "If True, the addition will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
   summary: "Adds sparse updates to a variable reference."
-  description: "This operation computes\n\n    # Scalar indices\n    ref[indices, ...] += updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] += updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterAdd.png\" alt>\n</div>"
+  description: "This operation computes\n\n    # Scalar indices\n    ref[indices, ...] += updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] += updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterAdd.png\" alt>\n</div>"
 }
 op {
   name: "ScatterDiv"
@@ -18863,17 +20355,17 @@ op {
   name: "ScatterNd"
   input_arg {
     name: "indices"
-    description: "A Tensor. Must be one of the following types: int32, int64.\nA tensor of indices into ref."
+    description: "Index tensor."
     type_attr: "Tindices"
   }
   input_arg {
     name: "updates"
-    description: "A Tensor. Must have the same type as tensor. A tensor of updated values\nto store in ref."
+    description: "Updates to scatter into output."
     type_attr: "T"
   }
   input_arg {
     name: "shape"
-    description: "A vector. The shape of the resulting tensor."
+    description: "1-D. The shape of the resulting tensor."
     type_attr: "Tindices"
   }
   output_arg {
@@ -18895,8 +20387,8 @@ op {
       }
     }
   }
-  summary: "Creates a new tensor by applying sparse `updates` to individual"
-  description: "values or slices within a zero tensor of the given `shape` tensor according to\nindices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)\noperator which extracts values or slices from a given tensor.\n\nTODO(simister): Add a link to Variable.__getitem__ documentation on slice\nsyntax.\n\n`shape` is a `TensorShape` with rank `P` and `indices` is a `Tensor` of rank\n`Q`.\n\n`indices` must be integer tensor, containing indices into `shape`.\nIt must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.\n\nThe innermost dimension of `indices` (with length `K`) corresponds to\nindices into elements (if `K = P`) or slices (if `K < P`) along the `K`th\ndimension of `shape`.\n\n`updates` is Tensor of rank `Q-1+P-K` with shape:\n\n```\n[d_0, ..., d_{Q-2}, shape[K], ..., shape[P-1]].\n```\n\nThe simplest form of scatter is to insert individual elements in a tensor by\nindex. For example, say we want to insert 4 scattered elements in a rank-1\ntensor with 8 elements.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterNd1.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    shape = tf.constant([8])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print sess.run(scatter)\n```\n\nThe resulting tensor would look like this:\n\n    [0, 11, 0, 10, 9, 0, 0, 12]\n\nWe can also, insert entire slices of a higher rank tensor all at once. For\nexample, if we wanted to insert two slices in the first dimension of a\nrank-3 tensor with two matrices of new values.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterNd2.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[0], [2]])\n    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]],\n                           [[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]]])\n    shape = tf.constant([4, 4, 4])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print sess.run(scatter)\n```\n\nThe resulting tensor would look like this:\n\n    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],\n     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]"
+  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
+  description: "Creates a new tensor by applying sparse `updates` to individual\nvalues or slices within a zero tensor of the given `shape` according to\nindices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)\noperator which extracts values or slices from a given tensor.\n\n**WARNING**: The order in which updates are applied is nondeterministic, so the\noutput will be nondeterministic if `indices` contains duplicates.\n\n`indices` is an integer tensor containing indices into a new tensor of shape\n`shape`.  The last dimension of `indices` can be at most the rank of `shape`:\n\n    indices.shape[-1] <= shape.rank\n\nThe last dimension of `indices` corresponds to indices into elements\n(if `indices.shape[-1] = shape.rank`) or slices\n(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of\n`shape`.  `updates` is a tensor with shape\n\n    indices.shape[:-1] + shape[indices.shape[-1]:]\n\nThe simplest form of scatter is to insert individual elements in a tensor by\nindex. For example, say we want to insert 4 scattered elements in a rank-1\ntensor with 8 elements.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd1.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[4], [3], [1], [7]])\n    updates = tf.constant([9, 10, 11, 12])\n    shape = tf.constant([8])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [0, 11, 0, 10, 9, 0, 0, 12]\n\nWe can also, insert entire slices of a higher rank tensor all at once. For\nexample, if we wanted to insert two slices in the first dimension of a\nrank-3 tensor with two matrices of new values.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterNd2.png\" alt>\n</div>\n\nIn Python, this scatter operation would look like this:\n\n```python\n    indices = tf.constant([[0], [2]])\n    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]],\n                           [[5, 5, 5, 5], [6, 6, 6, 6],\n                            [7, 7, 7, 7], [8, 8, 8, 8]]])\n    shape = tf.constant([4, 4, 4])\n    scatter = tf.scatter_nd(indices, updates, shape)\n    with tf.Session() as sess:\n      print(sess.run(scatter))\n```\n\nThe resulting tensor would look like this:\n\n    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],\n     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],\n     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]"
 }
 op {
   name: "ScatterNdAdd"
@@ -19146,7 +20638,7 @@ op {
     description: "If True, the subtraction will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
   summary: "Subtracts sparse updates to a variable reference."
-  description: "```python\n    # Scalar indices\n    ref[indices, ...] -= updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] -= updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their (negated) contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterSub.png\" alt>\n</div>"
+  description: "```python\n    # Scalar indices\n    ref[indices, ...] -= updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] -= updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nDuplicate entries are handled correctly: if multiple `indices` reference\nthe same location, their (negated) contributions add.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterSub.png\" alt>\n</div>"
 }
 op {
   name: "ScatterUpdate"
@@ -19195,7 +20687,7 @@ op {
     description: "If True, the assignment will be protected by a lock;\notherwise the behavior is undefined, but may exhibit less contention."
   }
   summary: "Applies sparse updates to a variable reference."
-  description: "This operation computes\n\n```python\n    # Scalar indices\n    ref[indices, ...] = updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] = updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nIf values in `ref` is to be updated more than once, because there are\nduplicate entries in `indices`, the order at which the updates happen\nfor each value is undefined.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/ScatterUpdate.png\" alt>\n</div>"
+  description: "This operation computes\n\n```python\n    # Scalar indices\n    ref[indices, ...] = updates[...]\n\n    # Vector indices (for each i)\n    ref[indices[i], ...] = updates[i, ...]\n\n    # High rank indices (for each i, ..., j)\n    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]\n```\n\nThis operation outputs `ref` after the update is done.\nThis makes it easier to chain operations that need to use the reset value.\n\nIf values in `ref` is to be updated more than once, because there are\nduplicate entries in `indices`, the order at which the updates happen\nfor each value is undefined.\n\nRequires `updates.shape = indices.shape + ref.shape[1:]`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/ScatterUpdate.png\" alt>\n</div>"
 }
 op {
   name: "SdcaFprint"
@@ -19249,7 +20741,7 @@ op {
   }
   input_arg {
     name: "sparse_indices"
-    description: "a list of vectors where each value is the indices which has\ncorresponding weights in sparse_weights. This field maybe ommitted for the\ndense approach."
+    description: "a list of vectors where each value is the indices which has\ncorresponding weights in sparse_weights. This field maybe omitted for the\ndense approach."
     type: DT_INT64
     number_attr: "num_sparse_features"
   }
@@ -19351,7 +20843,7 @@ op {
     minimum: 1
   }
   summary: "Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for"
-  description: "linear models with L1 + L2 regularization. As global optimization objective is\nstrongly-convex, the optimizer optimizes the dual objective at each step. The\noptimizer applies each update one example at a time. Examples are sampled\nuniformly, and the optimizer is learning rate free and enjoys linear convergence\nrate.\n\nProximal Stochastic Dual Coordinate Ascent, Shalev-Shwartz, Shai; Zhang, Tong.\n2012 arXiv1211.2717S: http://arxiv.org/pdf/1211.2717v1.pdf\n\n  Loss objective = \\sum f_{i}(wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|\n\nAdding vs. Averaging in Distributed Primal-Dual Optimization.\nChenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, Peter Richtarik,\nMartin Takac http://arxiv.org/abs/1502.03508\n\nStochastic Dual Coordinate Ascent with Adaptive Probabilities\nDominik Csiba, Zheng Qu, Peter Richtarik https://arxiv.org/abs/1502.08053"
+  description: "linear models with L1 + L2 regularization. As global optimization objective is\nstrongly-convex, the optimizer optimizes the dual objective at each step. The\noptimizer applies each update one example at a time. Examples are sampled\nuniformly, and the optimizer is learning rate free and enjoys linear convergence\nrate.\n\n[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>\nShai Shalev-Shwartz, Tong Zhang. 2012\n\n$$Loss Objective = \\sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$\n\n[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>\nChenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,\nPeter Richtarik, Martin Takac. 2015\n\n[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>\nDominik Csiba, Zheng Qu, Peter Richtarik. 2015"
 }
 op {
   name: "SdcaShrinkL1"
@@ -19424,7 +20916,7 @@ op {
     }
   }
   summary: "Computes the maximum along segments of a tensor."
-  description: "Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)\nfor an explanation of segments.\n\nComputes a tensor such that\n\\\\(output_i = \\max_j(data_j)\\\\) where `max` is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the max is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/SegmentMax.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\max_j(data_j)\\\\) where `max` is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the max is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentMax.png\" alt>\n</div>"
 }
 op {
   name: "SegmentMean"
@@ -19470,7 +20962,7 @@ op {
     }
   }
   summary: "Computes the mean along segments of a tensor."
-  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\frac{\\sum_j data_j}{N}\\\\) where `mean` is\nover `j` such that `segment_ids[j] == i` and `N` is the total number of\nvalues summed.\n\nIf the mean is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/SegmentMean.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\frac{\\sum_j data_j}{N}\\\\) where `mean` is\nover `j` such that `segment_ids[j] == i` and `N` is the total number of\nvalues summed.\n\nIf the mean is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentMean.png\" alt>\n</div>"
 }
 op {
   name: "SegmentMin"
@@ -19516,7 +21008,7 @@ op {
     }
   }
   summary: "Computes the minimum along segments of a tensor."
-  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\min_j(data_j)\\\\) where `min` is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the min is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/SegmentMin.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\min_j(data_j)\\\\) where `min` is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the min is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentMin.png\" alt>\n</div>"
 }
 op {
   name: "SegmentProd"
@@ -19567,7 +21059,7 @@ op {
     }
   }
   summary: "Computes the product along segments of a tensor."
-  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\prod_j data_j\\\\) where the product is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the product is empty for a given segment ID `i`, `output[i] = 1`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/SegmentProd.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\prod_j data_j\\\\) where the product is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the product is empty for a given segment ID `i`, `output[i] = 1`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentProd.png\" alt>\n</div>"
 }
 op {
   name: "SegmentSum"
@@ -19618,7 +21110,7 @@ op {
     }
   }
   summary: "Computes the sum along segments of a tensor."
-  description: "Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)\nfor an explanation of segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/SegmentSum.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/SegmentSum.png\" alt>\n</div>"
 }
 op {
   name: "Select"
@@ -19709,6 +21201,8 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -19845,7 +21339,7 @@ op {
     }
   }
   summary: "Returns the shape of a tensor."
-  description: "This operation returns a 1-D integer tensor representing the shape of `input`.\n\nFor example:\n\n```prettyprint\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\nshape(t) ==> [2, 2, 3]\n```"
+  description: "This operation returns a 1-D integer tensor representing the shape of `input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]\nshape(t) ==> [2, 2, 3]\n```"
 }
 op {
   name: "ShapeN"
@@ -19920,7 +21414,47 @@ op {
     name: "filename"
     type: DT_STRING
   }
-  summary: "Generate a glob pattern matching all sharded file names."
+  summary: "Generate a glob pattern matching all sharded file names."
+}
+op {
+  name: "ShuffleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "buffer_size"
+    description: "The number of output elements to buffer in an iterator over\nthis dataset. Compare with the `min_after_dequeue` attr when creating a\n`RandomShuffleQueue`."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    description: "A scalar seed for the random number generator. If either seed or\nseed2 is set to be non-zero, the random number generator is seeded\nby the given seed.  Otherwise, a random seed is used."
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    description: "A second scalar seed to avoid seed collision."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that shuffles elements from `input_dataset` pseudorandomly."
+  is_stateful: true
 }
 op {
   name: "Sigmoid"
@@ -20059,7 +21593,37 @@ op {
     }
   }
   summary: "Returns the size of a tensor."
-  description: "This operation returns an integer representing the number of elements in\n`input`.\n\nFor example:\n\n```prettyprint\n# \'t\' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]\nsize(t) ==> 12\n```"
+  description: "This operation returns an integer representing the number of elements in\n`input`.\n\nFor example:\n\n```\n# \'t\' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]\nsize(t) ==> 12\n```"
+}
+op {
+  name: "SkipDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "count"
+    description: "A scalar representing the number of elements from the `input_dataset`\nthat should be skipped.  If count is -1, skips everything."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that skips `count` elements from the `input_dataset`."
+  is_stateful: true
 }
 op {
   name: "Skipgram"
@@ -20377,7 +21941,7 @@ op {
   }
   input_arg {
     name: "paddings"
-    description: "2-D tensor of non-negative integers with shape `[2, 2]`. It specifies\n  the padding of the input with zeros across the spatial dimensions as follows:\n\n      paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]\n\n  The effective spatial dimensions of the zero-padded input tensor will be:\n\n      height_pad = pad_top + height + pad_bottom\n      width_pad = pad_left + width + pad_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\n  * Non-overlapping blocks of size `block_size x block size` in the height and\n    width dimensions are rearranged into the batch dimension at each location.\n  * The batch of the output tensor is `batch * block_size * block_size`.\n  * Both height_pad and width_pad must be divisible by block_size.\n\nThe shape of the output will be:\n\n    [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,\n     depth]\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
+    description: "2-D tensor of non-negative integers with shape `[2, 2]`. It specifies\n  the padding of the input with zeros across the spatial dimensions as follows:\n\n      paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]\n\n  The effective spatial dimensions of the zero-padded input tensor will be:\n\n      height_pad = pad_top + height + pad_bottom\n      width_pad = pad_left + width + pad_right\n\nThe attr `block_size` must be greater than one. It indicates the block size.\n\n  * Non-overlapping blocks of size `block_size x block size` in the height and\n    width dimensions are rearranged into the batch dimension at each location.\n  * The batch of the output tensor is `batch * block_size * block_size`.\n  * Both height_pad and width_pad must be divisible by block_size.\n\nThe shape of the output will be:\n\n    [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,\n     depth]\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 2, 1]` and value:\n\n```\nx = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],\n     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
     type_attr: "Tpaddings"
   }
   output_arg {
@@ -20424,7 +21988,7 @@ op {
   }
   input_arg {
     name: "paddings"
-    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension\n  `i + 1`, which corresponds to spatial dimension `i`.  It is required that\n  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.\n\nThis operation is equivalent to the following steps:\n\n1. Zero-pad the start and end of dimensions `[1, ..., M]` of the\n   input according to `paddings` to produce `padded` of shape `padded_shape`.\n\n2. Reshape `padded` to `reshaped_padded` of shape:\n\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n       block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1],\n      block_shape[M-1]] +\n     remaining_shape\n\n3. Permute dimensions of `reshaped_padded` to produce\n   `permuted_reshaped_padded` of shape:\n\n     block_shape +\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\n4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch\n   dimension, producing an output tensor of shape:\n\n     [batch * prod(block_shape)] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```prettyprint\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```prettyprint\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```prettyprint\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and\n    paddings = `[[0, 0], [2, 0]]`:\n\n```prettyprint\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 3, 1]` and value:\n\n```prettyprint\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
+    description: "2-D with shape `[M, 2]`, all values must be >= 0.\n  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension\n  `i + 1`, which corresponds to spatial dimension `i`.  It is required that\n  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.\n\nThis operation is equivalent to the following steps:\n\n1. Zero-pad the start and end of dimensions `[1, ..., M]` of the\n   input according to `paddings` to produce `padded` of shape `padded_shape`.\n\n2. Reshape `padded` to `reshaped_padded` of shape:\n\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n       block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1],\n      block_shape[M-1]] +\n     remaining_shape\n\n3. Permute dimensions of `reshaped_padded` to produce\n   `permuted_reshaped_padded` of shape:\n\n     block_shape +\n     [batch] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\n4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch\n   dimension, producing an output tensor of shape:\n\n     [batch * prod(block_shape)] +\n     [padded_shape[1] / block_shape[0],\n      ...,\n      padded_shape[M] / block_shape[M-1]] +\n     remaining_shape\n\nSome examples:\n\n(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1], [2]], [[3], [4]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 1]` and value:\n\n```\n[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]\n```\n\n(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThe output tensor has shape `[4, 1, 1, 3]` and value:\n\n```\n[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]\n```\n\n(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and\n    `paddings = [[0, 0], [0, 0]]`:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]],\n      [[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[4, 2, 2, 1]` and value:\n\n```\nx = [[[[1], [3]], [[9], [11]]],\n     [[[2], [4]], [[10], [12]]],\n     [[[5], [7]], [[13], [15]]],\n     [[[6], [8]], [[14], [16]]]]\n```\n\n(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and\n    paddings = `[[0, 0], [2, 0]]`:\n\n```\nx = [[[[1],   [2],  [3],  [4]],\n      [[5],   [6],  [7],  [8]]],\n     [[[9],  [10], [11],  [12]],\n      [[13], [14], [15],  [16]]]]\n```\n\nThe output tensor has shape `[8, 1, 3, 1]` and value:\n\n```\nx = [[[[0], [1], [3]]], [[[0], [9], [11]]],\n     [[[0], [2], [4]]], [[[0], [10], [12]]],\n     [[[0], [5], [7]]], [[[0], [13], [15]]],\n     [[[0], [6], [8]]], [[[0], [14], [16]]]]\n```\n\nAmong others, this operation is useful for reducing atrous convolution into\nregular convolution."
     type_attr: "Tpaddings"
   }
   output_arg {
@@ -20486,7 +22050,7 @@ op {
     minimum: 2
   }
   summary: "SpaceToDepth for tensors of type T."
-  description: "Rearranges blocks of spatial data, into depth. More specifically,\nthis op outputs a copy of the input tensor where values from the `height`\nand `width` dimensions are moved to the `depth` dimension.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Non-overlapping blocks of size `block_size x block size` are rearranged\n    into depth at each location.\n  * The depth of the output tensor is `input_depth * block_size * block_size`.\n  * The input tensor\'s height and width must be divisible by block_size.\n\nThat is, assuming the input is in the shape:\n`[batch, height, width, depth]`,\nthe shape of the output will be:\n`[batch, height/block_size, width/block_size, depth*block_size*block_size]`\n\nThis operation requires that the input tensor be of rank 4, and that\n`block_size` be >=1 and a divisor of both the input `height` and `width`.\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given this input of shape `[1, 2, 2, 1]`, and block_size of 2:\n\n```prettyprint\nx = [[[[1], [2]],\n      [[3], [4]]]]\n```\n\nThis operation will output a tensor of shape `[1, 1, 1, 4]`:\n\n```prettyprint\n[[[[1, 2, 3, 4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,\nthe corresponding output will have a single element (i.e. width and height are\nboth 1) and will have a depth of 4 channels (1 * block_size * block_size).\nThe output element shape is `[1, 1, 4]`.\n\nFor an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.\n\n```prettyprint\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThis operation, for block_size of 2, will return the following tensor of shape\n`[1, 1, 1, 12]`\n\n```prettyprint\n[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nSimilarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:\n\n```prettyprint\nx = [[[[1],   [2],  [5],  [6]],\n      [[3],   [4],  [7],  [8]],\n      [[9],  [10], [13],  [14]],\n      [[11], [12], [15],  [16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 2 2 4]`:\n\n```prettyprint\nx = [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```"
+  description: "Rearranges blocks of spatial data, into depth. More specifically,\nthis op outputs a copy of the input tensor where values from the `height`\nand `width` dimensions are moved to the `depth` dimension.\nThe attr `block_size` indicates the input block size and how the data is moved.\n\n  * Non-overlapping blocks of size `block_size x block size` are rearranged\n    into depth at each location.\n  * The depth of the output tensor is `input_depth * block_size * block_size`.\n  * The input tensor\'s height and width must be divisible by block_size.\n\nThat is, assuming the input is in the shape:\n`[batch, height, width, depth]`,\nthe shape of the output will be:\n`[batch, height/block_size, width/block_size, depth*block_size*block_size]`\n\nThis operation requires that the input tensor be of rank 4, and that\n`block_size` be >=1 and a divisor of both the input `height` and `width`.\n\nThis operation is useful for resizing the activations between convolutions\n(but keeping all data), e.g. instead of pooling. It is also useful for training\npurely convolutional models.\n\nFor example, given this input of shape `[1, 2, 2, 1]`, and block_size of 2:\n\n```\nx = [[[[1], [2]],\n      [[3], [4]]]]\n```\n\nThis operation will output a tensor of shape `[1, 1, 1, 4]`:\n\n```\n[[[[1, 2, 3, 4]]]]\n```\n\nHere, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,\nthe corresponding output will have a single element (i.e. width and height are\nboth 1) and will have a depth of 4 channels (1 * block_size * block_size).\nThe output element shape is `[1, 1, 4]`.\n\nFor an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.\n\n```\nx = [[[[1, 2, 3], [4, 5, 6]],\n      [[7, 8, 9], [10, 11, 12]]]]\n```\n\nThis operation, for block_size of 2, will return the following tensor of shape\n`[1, 1, 1, 12]`\n\n```\n[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]\n```\n\nSimilarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:\n\n```\nx = [[[[1],   [2],  [5],  [6]],\n      [[3],   [4],  [7],  [8]],\n      [[9],  [10], [13],  [14]],\n      [[11], [12], [15],  [16]]]]\n```\n\nthe operator will return the following tensor of shape `[1 2 2 4]`:\n\n```\nx = [[[[1, 2, 3, 4],\n       [5, 6, 7, 8]],\n      [[9, 10, 11, 12],\n       [13, 14, 15, 16]]]]\n```"
 }
 op {
   name: "SparseAccumulatorApplyGradient"
@@ -20544,8 +22108,8 @@ op {
     type: "bool"
     description: "Boolean indicating whether gradient_shape is unknown, in which\ncase the input is ignored during validation."
   }
-  summary: "Applies a sparse gradient to a given accumulator. Does not add if local_step is"
-  description: "lesser than the accumulator\'s global_step."
+  summary: "Applies a sparse gradient to a given accumulator."
+  description: "Does not add if local_step is smaller than the accumulator\'s\nglobal_step."
 }
 op {
   name: "SparseAccumulatorTakeGradient"
@@ -20598,8 +22162,8 @@ op {
       }
     }
   }
-  summary: "Extracts the average sparse gradient in the given SparseConditionalAccumulator,"
-  description: "provided that sufficient (i.e., more than num_required) gradients have been\naccumulated. The op will blocks until sufficient gradients have been\naccumulated. If the accumulator has already aggregated more than num_required\ngradients, it will return its average of the accumulated gradients.\nAlso automatically increments the recorded global_step in the accumulator by 1,\nand resets the aggregate to 0."
+  summary: "Extracts the average sparse gradient in a SparseConditionalAccumulator."
+  description: "The op will blocks until sufficient (i.e., more than num_required)\ngradients have been accumulated. If the accumulator has already\naggregated more than num_required gradients, it will return its\naverage of the accumulated gradients.  Also automatically increments\nthe recorded global_step in the accumulator by 1, and resets the\naggregate to 0."
 }
 op {
   name: "SparseAdd"
@@ -21684,10 +23248,115 @@ op {
     }
     description: "If non-empty, this accumulator will be shared under the given name\nacross multiple sessions."
   }
-  summary: "A conditional accumulator for aggregating sparse gradients. The accumulator"
-  description: "accepts gradients marked with local_step greater or equal to the most recent\nglobal_step known to the accumulator. The average can be extracted from the\naccumulator, provided sufficient gradients have been accumulated. Extracting the\naverage automatically resets the aggregate to 0, and increments the global_step\nrecorded by the accumulator."
+  summary: "A conditional accumulator for aggregating sparse gradients."
+  description: "The accumulator accepts gradients marked with local_step greater or\nequal to the most recent global_step known to the accumulator. The\naverage can be extracted from the accumulator, provided sufficient\ngradients have been accumulated. Extracting the average automatically\nresets the aggregate to 0, and increments the global_step recorded by\nthe accumulator."
   is_stateful: true
 }
+op {
+  name: "SparseCross"
+  input_arg {
+    name: "indices"
+    description: "2-D.  Indices of each input `SparseTensor`."
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "values"
+    description: "1-D.   values of each `SparseTensor`."
+    type_list_attr: "sparse_types"
+  }
+  input_arg {
+    name: "shapes"
+    description: "1-D.   Shapes of each `SparseTensor`."
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "dense_inputs"
+    description: "2-D.    Columns represented by dense `Tensor`."
+    type_list_attr: "dense_types"
+  }
+  output_arg {
+    name: "output_indices"
+    description: "2-D.  Indices of the concatenated `SparseTensor`."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_values"
+    description: "1-D.  Non-empty values of the concatenated or hashed\n`SparseTensor`."
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_shape"
+    description: "1-D.  Shape of the concatenated `SparseTensor`."
+    type: DT_INT64
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "hashed_output"
+    type: "bool"
+    description: "If true, returns the hash of the cross instead of the string.\nThis will allow us avoiding string manipulations."
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    description: "It is used if hashed_output is true.\noutput = hashed_value%num_buckets if num_buckets > 0 else hashed_value."
+    has_minimum: true
+  }
+  attr {
+    name: "hash_key"
+    type: "int"
+    description: "Specify the hash_key that will be used by the `FingerprintCat64`\nfunction to combine the crosses fingerprints."
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "internal_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  summary: "Generates sparse cross from a list of sparse and dense tensors."
+  description: "The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each\nrepresenting features of one feature column. It outputs a 2D `SparseTensor` with\nthe batchwise crosses of these features.\n\nFor example, if the inputs are\n\n    inputs[0]: SparseTensor with shape = [2, 2]\n    [0, 0]: \"a\"\n    [1, 0]: \"b\"\n    [1, 1]: \"c\"\n\n    inputs[1]: SparseTensor with shape = [2, 1]\n    [0, 0]: \"d\"\n    [1, 0]: \"e\"\n\n    inputs[2]: Tensor [[\"f\"], [\"g\"]]\n\nthen the output will be\n\n    shape = [2, 2]\n    [0, 0]: \"a_X_d_X_f\"\n    [1, 0]: \"b_X_e_X_g\"\n    [1, 1]: \"c_X_e_X_g\"\n\nif hashed_output=true then the output will be\n\n    shape = [2, 2]\n    [0, 0]: FingerprintCat64(\n                Fingerprint64(\"f\"), FingerprintCat64(\n                    Fingerprint64(\"d\"), Fingerprint64(\"a\")))\n    [1, 0]: FingerprintCat64(\n                Fingerprint64(\"g\"), FingerprintCat64(\n                    Fingerprint64(\"e\"), Fingerprint64(\"b\")))\n    [1, 1]: FingerprintCat64(\n                Fingerprint64(\"g\"), FingerprintCat64(\n                    Fingerprint64(\"e\"), Fingerprint64(\"c\")))"
+}
 op {
   name: "SparseDenseCwiseAdd"
   input_arg {
@@ -22151,7 +23820,7 @@ op {
     }
   }
   summary: "Computes the mean along sparse segments of a tensor."
-  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nLike `SegmentMean`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`."
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentMean`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`."
 }
 op {
   name: "SparseSegmentMeanGrad"
@@ -22250,7 +23919,7 @@ op {
     }
   }
   summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
-  description: "N is the size of the segment being reduced.\n\nRead [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments."
+  description: "N is the size of the segment being reduced.\n\nRead @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments."
 }
 op {
   name: "SparseSegmentSqrtNGrad"
@@ -22356,7 +24025,7 @@ op {
     }
   }
   summary: "Computes the sum along sparse segments of a tensor."
-  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nLike `SegmentSum`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`.\n\nFor example:\n\n```prettyprint\nc = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])\n\n# Select two rows, one segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))\n  ==> [[0 0 0 0]]\n\n# Select two rows, two segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))\n  ==> [[ 1  2  3  4]\n       [-1 -2 -3 -4]]\n\n# Select all rows, two segments.\ntf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))\n  ==> [[0 0 0 0]\n       [5 6 7 8]]\n\n# Which is equivalent to:\ntf.segment_sum(c, tf.constant([0, 0, 1]))\n```"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nLike `SegmentSum`, but `segment_ids` can have rank less than `data`\'s first\ndimension, selecting a subset of dimension 0, specified by `indices`.\n\nFor example:\n\n```prettyprint\nc = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])\n\n# Select two rows, one segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))\n  ==> [[0 0 0 0]]\n\n# Select two rows, two segment.\ntf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))\n  ==> [[ 1  2  3  4]\n       [-1 -2 -3 -4]]\n\n# Select all rows, two segments.\ntf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))\n  ==> [[0 0 0 0]\n       [5 6 7 8]]\n\n# Which is equivalent to:\ntf.segment_sum(c, tf.constant([0, 0, 1]))\n```"
 }
 op {
   name: "SparseSoftmax"
@@ -22690,7 +24359,7 @@ op {
   input_arg {
     name: "a_indices"
     description: "2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix."
-    type: DT_INT64
+    type_attr: "Tindices"
   }
   input_arg {
     name: "a_values"
@@ -22715,6 +24384,19 @@ op {
     name: "T"
     type: "type"
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
   attr {
     name: "adjoint_a"
     type: "bool"
@@ -22734,6 +24416,31 @@ op {
   summary: "Multiply SparseTensor (of rank 2) \"A\" by dense matrix \"B\"."
   description: "No validity checking is performed on the indices of A.  However, the following\ninput format is recommended for optimal behavior:\n\nif adjoint_a == false:\n  A should be sorted in lexicographically increasing order.  Use SparseReorder\n  if you\'re not sure.\nif adjoint_a == true:\n  A should be sorted in order of increasing dimension 1 (i.e., \"column major\"\n  order instead of \"row major\" order)."
 }
+op {
+  name: "SparseTensorSliceDataset"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  input_arg {
+    name: "dense_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  summary: "Creates a dataset that splits a SparseTensor into elements row-wise."
+  is_stateful: true
+}
 op {
   name: "SparseToDense"
   input_arg {
@@ -23085,7 +24792,7 @@ op {
     has_minimum: true
   }
   summary: "Removes dimensions of size 1 from the shape of a tensor."
-  description: "Given a tensor `input`, this operation returns a tensor of the same type with\nall dimensions of size 1 removed. If you don\'t want to remove all size 1\ndimensions, you can remove specific size 1 dimensions by specifying\n`squeeze_dims`.\n\nFor example:\n\n```prettyprint\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t)) ==> [2, 3]\n```\n\nOr, to remove specific size 1 dimensions:\n\n```prettyprint\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]\n```"
+  description: "Given a tensor `input`, this operation returns a tensor of the same type with\nall dimensions of size 1 removed. If you don\'t want to remove all size 1\ndimensions, you can remove specific size 1 dimensions by specifying\n`squeeze_dims`.\n\nFor example:\n\n```\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t)) ==> [2, 3]\n```\n\nOr, to remove specific size 1 dimensions:\n\n```\n# \'t\' is a tensor of shape [1, 2, 1, 3, 1, 1]\nshape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]\n```"
 }
 op {
   name: "Stack"
@@ -23150,61 +24857,205 @@ op {
     is_ref: true
   }
   input_arg {
-    name: "elem"
-    description: "The tensor to be pushed onto the stack."
+    name: "elem"
+    description: "The tensor to be pushed onto the stack."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "The same tensor as the input \'elem\'."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "swap_memory"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "Swap `elem` to CPU. Default to false."
+  }
+  summary: "Push an element onto the stack."
+}
+op {
+  name: "Stage"
+  input_arg {
+    name: "values"
+    description: "a list of tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this queue is placed in the given container. Otherwise,\na default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "It is necessary to match this name to the matching Unstage Op."
+  }
+  summary: "Stage values similar to a lightweight Enqueue."
+  description: "The basic functionality of this Op is similar to a queue with many\nfewer capabilities and options.  This Op is optimized for performance."
+  is_stateful: true
+}
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    description: "The shape of the output tensor."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    description: "2 seeds (shape [2])."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    description: "Random values with specified shape."
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    description: "The type of the output."
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  summary: "Outputs deterministic pseudorandom values from a normal distribution."
+  description: "The generated values will have mean 0 and standard deviation 1.\n\nThe outputs are a deterministic function of `shape` and `seed`."
+}
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    description: "The shape of the output tensor."
     type_attr: "T"
   }
+  input_arg {
+    name: "seed"
+    description: "2 seeds (shape [2])."
+    type: DT_INT64
+  }
   output_arg {
     name: "output"
-    description: "The same tensor as the input \'elem\'."
-    type_attr: "T"
+    description: "Random values with specified shape."
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    description: "The type of the output."
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   attr {
-    name: "swap_memory"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: false
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
-    description: "Swap `elem` to CPU. Default to false."
   }
-  summary: "Push an element onto the stack."
+  summary: "Outputs deterministic pseudorandom random values from a uniform distribution."
+  description: "The generated values follow a uniform distribution in the range `[0, 1)`. The\nlower bound 0 is included in the range, while the upper bound 1 is excluded.\n\nThe outputs are a deterministic function of `shape` and `seed`."
 }
 op {
-  name: "Stage"
+  name: "StatelessTruncatedNormal"
   input_arg {
-    name: "values"
-    description: "a list of tensors"
-    type_list_attr: "dtypes"
+    name: "shape"
+    description: "The shape of the output tensor."
+    type_attr: "T"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "seed"
+    description: "2 seeds (shape [2])."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    description: "Random values with specified shape."
+    type_attr: "dtype"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "dtype"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_FLOAT
+    }
+    description: "The type of the output."
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
-    description: "If non-empty, this queue is placed in the given container. Otherwise,\na default container is used."
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "T"
+    type: "type"
     default_value {
-      s: ""
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
-    description: "It is necessary to match this name to the matching Unstage Op."
   }
-  summary: "Stage values similar to a lightweight Enqueue.  The basic functionality of this"
-  description: "Op is similar to a queue with many fewer capabilities and options.  This Op is\noptimized for performance."
-  is_stateful: true
+  summary: "Outputs deterministic pseudorandom values from a truncated normal distribution."
+  description: "The generated values follow a normal distribution with mean 0 and standard\ndeviation 1, except that values whose magnitude is more than 2 standard\ndeviations from the mean are dropped and re-picked.\n\nThe outputs are a deterministic function of `shape` and `seed`."
 }
 op {
   name: "StopGradient"
@@ -23303,7 +25154,7 @@ op {
     description: "a bitmask where bit `i` implies that the `i`th\nspecification should shrink the dimensionality. begin and end\nmust imply a slice of size 1 in the dimension. For example in\npython one might do `foo[:, 3, :]` which would result in\n`shrink_axis_mask` being 2."
   }
   summary: "Return a strided slice from `input`."
-  description: "Note, most python users will want to use the Python `Tensor.__getitem__`\nor `Variable.__getitem__` rather than this op directly.\n\nThe goal of this op is to produce a new tensor with a subset of\nthe elements from the `n` dimensional `input` tensor. The subset is chosen using\na sequence of `m` sparse range specifications encoded into the arguments\nof this function. Note, in some cases\n`m` could be equal to `n`, but this need not be the case. Each\nrange specification entry can be one of the following:\n\n- An ellipsis (...). Ellipses are used to imply zero or more\n  dimensions of full-dimension selection and are produced using\n  `ellipsis_mask`. For example, `foo[...]` is the identity slice.\n\n- A new axis. This is used to insert a new shape=1 dimension and is\n  produced using `new_axis_mask`. For example, `foo[:, ...]` where\n  `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.\n\n\n- A range `begin:end:stride`. This is used to specify how much to choose from\n  a given dimension. `stride` can be any integer but 0.  `begin` is an integer\n  which represents the index of the first value to select while `end` represents\n  the index of the last value to select. The number of values selected in each\n  dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.\n  `begin` and `end` can be negative where `-1` is the last element, `-2` is\n  the second to last. `begin_mask` controls whether to replace the explicitly\n  given `begin` with an implicit effective value of `0` if `stride > 0` and\n  `-1` if `stride < 0`. `end_mask` is analogous but produces the number\n  required to create the largest open interval. For example, given a shape\n  `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do\n  not assume this is equivalent to `foo[0:-1]` which has an effective `begin`\n  and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the\n  first dimension of a tensor while dropping the last two (in the original\n  order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.\n\n- A single index. This is used to keep only elements that have a given\n  index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a\n  shape `(6,)` tensor. This is encoded in `begin` and `end` and\n  `shrink_axis_mask`.\n\nEach conceptual range specification is encoded in the op\'s argument. This\nencoding is best understand by considering a non-trivial example. In\nparticular,\n`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as\n\n```prettyprint\nbegin = [1, 2, x, x, 0, x] # x denotes don\'t care (usually 0)\nend = [2, 4, x, x, -3, x]\nstrides = [1, 1, x, x, -1, 1]\nbegin_mask = 1<<4 | 1 << 5 = 48\nend_mask = 1<<5 = 32\nellipsis_mask = 1<<3 = 8\nnew_axis_mask = 1<<2 4\nshrink_axis_mask = 1<<0\n```\n\nIn this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of\nthe slice becomes (2, 1, 5, 5, 2, 5).\nLet us walk step by step through each argument specification.\n\n1.  The first argument in the example slice is turned into `begin = 1` and\n`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we\nalso set the appropriate bit in `shrink_axis_mask`.\n\n2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have\nzero bits contributed.\n\n3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1\ndimension in the final shape. Dummy values are contributed to begin,\nend and stride, while the new_axis_mask bit is set.\n\n4. `...` grab the full ranges from as many dimensions as needed to\nfully specify a slice for every dimension of the input shape.\n\n5. `:-3:-1` shows the use of negative indices. A negative index `i` associated\nwith a dimension that has shape `s` is converted to a positive index\n`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion\nis done internally so begin, end and strides receive x, -3, and -1.\nThe appropriate begin_mask bit is set to indicate the start range is the\nfull range (ignoring the x).\n\n6. `:` indicates that the entire contents of the corresponding dimension\nis selected. This is equivalent to `::` or `0::1`. begin, end, and strides\nreceive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and\n`end_mask` are also set.\n\n*Requirements*:\n  `0 != strides[i] for i in [0, m)`\n  `ellipsis_mask must be a power of two (only one ellipsis)`"
+  description: "Note, most python users will want to use the Python `Tensor.__getitem__`\nor `Variable.__getitem__` rather than this op directly.\n\nThe goal of this op is to produce a new tensor with a subset of\nthe elements from the `n` dimensional `input` tensor. The subset is chosen using\na sequence of `m` sparse range specifications encoded into the arguments\nof this function. Note, in some cases\n`m` could be equal to `n`, but this need not be the case. Each\nrange specification entry can be one of the following:\n\n- An ellipsis (...). Ellipses are used to imply zero or more\n  dimensions of full-dimension selection and are produced using\n  `ellipsis_mask`. For example, `foo[...]` is the identity slice.\n\n- A new axis. This is used to insert a new shape=1 dimension and is\n  produced using `new_axis_mask`. For example, `foo[:, ...]` where\n  `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.\n\n\n- A range `begin:end:stride`. This is used to specify how much to choose from\n  a given dimension. `stride` can be any integer but 0.  `begin` is an integer\n  which represents the index of the first value to select while `end` represents\n  the index of the last value to select. The number of values selected in each\n  dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.\n  `begin` and `end` can be negative where `-1` is the last element, `-2` is\n  the second to last. `begin_mask` controls whether to replace the explicitly\n  given `begin` with an implicit effective value of `0` if `stride > 0` and\n  `-1` if `stride < 0`. `end_mask` is analogous but produces the number\n  required to create the largest open interval. For example, given a shape\n  `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do\n  not assume this is equivalent to `foo[0:-1]` which has an effective `begin`\n  and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the\n  first dimension of a tensor while dropping the last two (in the original\n  order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.\n\n- A single index. This is used to keep only elements that have a given\n  index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a\n  shape `(6,)` tensor. This is encoded in `begin` and `end` and\n  `shrink_axis_mask`.\n\nEach conceptual range specification is encoded in the op\'s argument. This\nencoding is best understand by considering a non-trivial example. In\nparticular,\n`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as\n\n```\nbegin = [1, 2, x, x, 0, x] # x denotes don\'t care (usually 0)\nend = [2, 4, x, x, -3, x]\nstrides = [1, 1, x, x, -1, 1]\nbegin_mask = 1<<4 | 1 << 5 = 48\nend_mask = 1<<5 = 32\nellipsis_mask = 1<<3 = 8\nnew_axis_mask = 1<<2 4\nshrink_axis_mask = 1<<0\n```\n\nIn this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of\nthe slice becomes (2, 1, 5, 5, 2, 5).\nLet us walk step by step through each argument specification.\n\n1.  The first argument in the example slice is turned into `begin = 1` and\n`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we\nalso set the appropriate bit in `shrink_axis_mask`.\n\n2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have\nzero bits contributed.\n\n3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1\ndimension in the final shape. Dummy values are contributed to begin,\nend and stride, while the new_axis_mask bit is set.\n\n4. `...` grab the full ranges from as many dimensions as needed to\nfully specify a slice for every dimension of the input shape.\n\n5. `:-3:-1` shows the use of negative indices. A negative index `i` associated\nwith a dimension that has shape `s` is converted to a positive index\n`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion\nis done internally so begin, end and strides receive x, -3, and -1.\nThe appropriate begin_mask bit is set to indicate the start range is the\nfull range (ignoring the x).\n\n6. `:` indicates that the entire contents of the corresponding dimension\nis selected. This is equivalent to `::` or `0::1`. begin, end, and strides\nreceive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and\n`end_mask` are also set.\n\n*Requirements*:\n  `0 != strides[i] for i in [0, m)`\n  `ellipsis_mask must be a power of two (only one ellipsis)`"
 }
 op {
   name: "StridedSliceAssign"
@@ -23867,6 +25718,25 @@ op {
   }
   summary: "Computes the gradient function for function f via backpropagation."
 }
+op {
+  name: "TFRecordDataset"
+  input_arg {
+    name: "filenames"
+    description: "A scalar or vector containing the name(s) of the file(s) to be\nread."
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    description: "A scalar containing either (i) the empty string (no\ncompression), (ii) \"ZLIB\", or (iii) \"GZIP\"."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  summary: "Creates a dataset that emits the records from one or more TFRecord files."
+  is_stateful: true
+}
 op {
   name: "TFRecordReader"
   output_arg {
@@ -23934,6 +25804,36 @@ op {
   summary: "A Reader that outputs the records from a TensorFlow Records file."
   is_stateful: true
 }
+op {
+  name: "TakeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "count"
+    description: "A scalar representing the number of elements from the `input_dataset`\nthat should be taken. A value of `-1` indicates that all of `input_dataset`\nis taken."
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that contains `count` elements from the `input_dataset`."
+  is_stateful: true
+}
 op {
   name: "TakeManySparseFromTensorsMap"
   input_arg {
@@ -24171,8 +26071,8 @@ op {
     description: "The handle to a TensorArray (output of TensorArray or TensorArrayGrad)."
     type: DT_RESOURCE
   }
-  summary: "Delete the TensorArray from its resource container.  This enables"
-  description: "the user to close and release the resource in the middle of a step/run."
+  summary: "Delete the TensorArray from its resource container."
+  description: "This enables the user to close and release the resource in the middle\nof a step/run."
   is_stateful: true
 }
 op {
@@ -24963,8 +26863,8 @@ op {
     }
     description: "Overrides the name used for the temporary tensor_array\nresource. Default value is the name of the \'TensorArray\' op (which\nis guaranteed unique)."
   }
-  summary: "An array of Tensors of given size, with data written via Write and read"
-  description: "via Read or Pack."
+  summary: "An array of Tensors of given size."
+  description: "Write data via Write and read via Read or Pack."
   is_stateful: true
 }
 op {
@@ -25061,6 +26961,56 @@ op {
   summary: "Push an element onto the tensor_array."
   is_stateful: true
 }
+op {
+  name: "TensorDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that emits `components` as a tuple of tensors once."
+  is_stateful: true
+}
+op {
+  name: "TensorSliceDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that emits each dim-0 slice of `components` once."
+  is_stateful: true
+}
 op {
   name: "TensorSummary"
   input_arg {
@@ -25103,6 +27053,20 @@ op {
   }
   summary: "Outputs a `Summary` protocol buffer with a tensor."
 }
+op {
+  name: "TextLineDataset"
+  input_arg {
+    name: "filenames"
+    description: "A scalar or a vector containing the name(s) of the file(s) to be\nread."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  summary: "Creates a dataset that emits the lines of one or more text files."
+  is_stateful: true
+}
 op {
   name: "TextLineReader"
   output_arg {
@@ -25204,7 +27168,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -25238,6 +27202,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a learned unigram distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Tile"
@@ -25501,8 +27466,8 @@ op {
       }
     }
   }
-  summary: "Returns element-wise remainder of division. This emulates C semantics where"
-  description: "true, this follows C semantics in that the result here is consistent\nwith a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.\n\n*NOTE*: `Mod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
+  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
+  description: "the result here is consistent with a truncating divide. E.g. `truncate(x / y) *\ny + truncate_mod(x, y) = x`.\n\n*NOTE*: `TruncateMod` supports broadcasting. More about broadcasting\n[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)"
 }
 op {
   name: "TruncatedNormal"
@@ -25590,7 +27555,7 @@ op {
   attr {
     name: "num_sampled"
     type: "int"
-    description: "Number of candidates to randomly sample per batch."
+    description: "Number of candidates to randomly sample."
     has_minimum: true
     minimum: 1
   }
@@ -25624,6 +27589,7 @@ op {
   }
   summary: "Generates labels for candidate sampling with a uniform distribution."
   description: "See explanations of candidate sampling and the data formats at\ngo/candidate-sampling.\n\nFor each batch, this op picks a single set of sampled candidate labels.\n\nThe advantages of sampling candidates per-batch are simplicity and the\npossibility of efficient dense matrix multiplication. The disadvantage is that\nthe sampled candidates must be chosen independently of the context and of the\ntrue labels."
+  is_stateful: true
 }
 op {
   name: "Unique"
@@ -25660,7 +27626,7 @@ op {
     }
   }
   summary: "Finds unique elements in a 1-D tensor."
-  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```prettyprint\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx = unique(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\n```"
+  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx = unique(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\n```"
 }
 op {
   name: "UniqueWithCounts"
@@ -25702,7 +27668,7 @@ op {
     }
   }
   summary: "Finds unique elements in a 1-D tensor."
-  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. Finally, it returns a third tensor `count` that\ncontains the count of each element of `y` in `x`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```prettyprint\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx, count = unique_with_counts(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\ncount ==> [2, 1, 3, 1, 2]\n```"
+  description: "This operation returns a tensor `y` containing all of the unique elements of `x`\nsorted in the same order that they occur in `x`. This operation also returns a\ntensor `idx` the same size as `x` that contains the index of each value of `x`\nin the unique output `y`. Finally, it returns a third tensor `count` that\ncontains the count of each element of `y` in `x`. In other words:\n\n`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`\n\nFor example:\n\n```\n# tensor \'x\' is [1, 1, 2, 4, 4, 4, 7, 8, 8]\ny, idx, count = unique_with_counts(x)\ny ==> [1, 2, 4, 7, 8]\nidx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]\ncount ==> [2, 1, 3, 1, 2]\n```"
 }
 op {
   name: "Unpack"
@@ -25785,7 +27751,7 @@ op {
     }
   }
   summary: "Computes the Max along segments of a tensor."
-  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nThis operator is similar to the [unsorted segment sum operator](../../api_docs/python/math_ops.md#UnsortedSegmentSum).\nInstead of computing the sum over segments, it computes the maximum\nsuch that:\n\n\\\\(output_i = \\max_j data_j\\\\) where max is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,\n `output[i] = numeric_limits<T>::min()`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nThis operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).\nInstead of computing the sum over segments, it computes the maximum\nsuch that:\n\n\\\\(output_i = \\max_j data_j\\\\) where max is over `j` such\nthat `segment_ids[j] == i`.\n\nIf the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,\n `output[i] = numeric_limits<T>::min()`.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentSum.png\" alt>\n</div>"
 }
 op {
   name: "UnsortedSegmentSum"
@@ -25840,60 +27806,7 @@ op {
     }
   }
   summary: "Computes the sum along segments of a tensor."
-  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
-}
-op {
-  name: "UnsortedSegmentSum"
-  input_arg {
-    name: "data"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "segment_ids"
-    description: "A tensor whose shape is a prefix of `data.shape`."
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "num_segments"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`."
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  summary: "Computes the max along segments of a tensor."
-  description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n  range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"../../images/UnsortedSegmentSum.png\" alt>\n</div>"
+  description: "Read @{$math_ops#segmentation$the section on segmentation} for an explanation of\nsegments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n<div style=\"width:70%; margin:auto; margin-bottom:10px; margin-top:20px;\">\n<img style=\"width:100%\" src=\"https://www.tensorflow.org/images/UnsortedSegmentSum.png\" alt>\n</div>"
 }
 op {
   name: "Unstage"
@@ -25921,8 +27834,8 @@ op {
       s: ""
     }
   }
-  summary: "Op is similar to a lightweight Dequeue.  The basic funtionality is similar to"
-  description: "dequeue with many fewer capabilities and options.  This Op is optimized for\nperformance."
+  summary: "Op is similar to a lightweight Dequeue."
+  description: "The basic funtionality is similar to dequeue with many fewer\ncapabilities and options.  This Op is optimized for performance."
   is_stateful: true
 }
 op {
@@ -26006,7 +27919,7 @@ op {
     type: DT_INT64
   }
   summary: "Returns locations of true values in a boolean tensor."
-  description: "This operation returns the coordinates of true elements in `input`. The\ncoordinates are returned in a 2-D tensor where the first dimension (rows)\nrepresents the number of true elements, and the second dimension (columns)\nrepresents the coordinates of the true elements. Keep in mind, the shape of\nthe output tensor can vary depending on how many true values there are in\n`input`. Indices are output in row-major order.\n\nFor example:\n\n```prettyprint\n# \'input\' tensor is [[True, False]\n#                    [True, False]]\n# \'input\' has two true values, so output has two coordinates.\n# \'input\' has rank of 2, so coordinates have two indices.\nwhere(input) ==> [[0, 0],\n                  [1, 0]]\n\n# `input` tensor is [[[True, False]\n#                     [True, False]]\n#                    [[False, True]\n#                     [False, True]]\n#                    [[False, False]\n#                     [False, True]]]\n# \'input\' has 5 true values, so output has 5 coordinates.\n# \'input\' has rank of 3, so coordinates have three indices.\nwhere(input) ==> [[0, 0, 0],\n                  [0, 1, 0],\n                  [1, 0, 1],\n                  [1, 1, 1],\n                  [2, 1, 1]]\n```"
+  description: "This operation returns the coordinates of true elements in `input`. The\ncoordinates are returned in a 2-D tensor where the first dimension (rows)\nrepresents the number of true elements, and the second dimension (columns)\nrepresents the coordinates of the true elements. Keep in mind, the shape of\nthe output tensor can vary depending on how many true values there are in\n`input`. Indices are output in row-major order.\n\nFor example:\n\n```\n# \'input\' tensor is [[True, False]\n#                    [True, False]]\n# \'input\' has two true values, so output has two coordinates.\n# \'input\' has rank of 2, so coordinates have two indices.\nwhere(input) ==> [[0, 0],\n                  [1, 0]]\n\n# `input` tensor is [[[True, False]\n#                     [True, False]]\n#                    [[False, True]\n#                     [False, True]]\n#                    [[False, False]\n#                     [False, True]]]\n# \'input\' has 5 true values, so output has 5 coordinates.\n# \'input\' has rank of 3, so coordinates have three indices.\nwhere(input) ==> [[0, 0, 0],\n                  [0, 1, 0],\n                  [1, 0, 1],\n                  [1, 1, 1],\n                  [2, 1, 1]]\n```"
 }
 op {
   name: "WholeFileReader"
@@ -26120,5 +28033,37 @@ op {
     }
   }
   summary: "Compute the Hurwitz zeta function \\\\(\\zeta(x, q)\\\\)."
-  description: "The Hurwitz zeta function is defined as:\n\n```\n\\zeta(x, q) = \\sum_{n=0}^{\\infty} (q + n)^{-x}\n```"
+  description: "The Hurwitz zeta function is defined as:\n\n\n\\\\(\\zeta(x, q) = \\sum_{n=0}^{\\infty} (q + n)^{-x}\\\\)"
+}
+op {
+  name: "ZipDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_RESOURCE
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Creates a dataset that zips together `input_datasets`."
+  is_stateful: true
 }
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index 7b2da9d8e6d9666edbfbcdc0bdde00a35a043153..392ac32010396d89f0dc011b05a6eb46d8f8b3f6 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -23,17 +23,6 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-namespace {
-
-Status RandomShape(InferenceContext* c) {
-  ShapeHandle out;
-  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
-  c->set_output(0, out);
-  return Status::OK();
-}
-
-}  // namepsace
-
 REGISTER_OP("RandomUniform")
     .Input("shape: T")
     .SetIsStateful()
@@ -42,7 +31,7 @@ REGISTER_OP("RandomUniform")
     .Attr("seed2: int = 0")
     .Attr("dtype: {half,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(RandomShape)
+    .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
 Outputs random values from a uniform distribution.
 
@@ -69,7 +58,7 @@ REGISTER_OP("RandomUniformInt")
     .Attr("seed2: int = 0")
     .Attr("Tout: {int32, int64}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(RandomShape)
+    .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
 Outputs random integers from a uniform distribution.
 
@@ -100,7 +89,7 @@ REGISTER_OP("RandomStandardNormal")
     .Attr("seed2: int = 0")
     .Attr("dtype: {half,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(RandomShape)
+    .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
 Outputs random values from a normal distribution.
 
@@ -128,7 +117,7 @@ REGISTER_OP("ParameterizedTruncatedNormal")
     .Attr("seed2: int = 0")
     .Attr("dtype: {half,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(RandomShape)
+    .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
 Outputs random values from a normal distribution. The parameters may each be a
 scalar which applies to the entire output, or a vector of length shape[0] which
@@ -158,7 +147,7 @@ REGISTER_OP("TruncatedNormal")
     .Attr("seed2: int = 0")
     .Attr("dtype: {half,float,double}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(RandomShape)
+    .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
 Outputs random values from a truncated normal distribution.
 
diff --git a/tensorflow/core/ops/remote_fused_graph_ops.cc b/tensorflow/core/ops/remote_fused_graph_ops.cc
index 3d90c054d47f585e65e3237838f0c0fc3c08a00c..6e9f37a6152b50f0d2e6385125f4f7b51073033f 100644
--- a/tensorflow/core/ops/remote_fused_graph_ops.cc
+++ b/tensorflow/core/ops/remote_fused_graph_ops.cc
@@ -21,13 +21,11 @@ namespace tensorflow {
 
 // TODO(satok): Implement shape_inference
 REGISTER_OP("RemoteFusedGraphExecute")
-    .Input("values: M * T")
-    .Output("output: N * U")
-    .Attr("M: int >= 0")
-    .Attr("N: int >= 0")
-    .Attr("T: type")
-    .Attr("U: type")
-    .Attr("serialized_graph_transfer_info: string")
+    .Input("inputs: Tinputs")
+    .Output("outputs: Toutputs")
+    .Attr("Tinputs: list(type) >= 0")
+    .Attr("Toutputs: list(type) >= 0")
+    .Attr("serialized_remote_fused_graph_execute_info: string")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Execute a sub graph on a remote processor transferred by GraphTransferer.
diff --git a/tensorflow/core/ops/remote_fused_graph_ops_test.cc b/tensorflow/core/ops/remote_fused_graph_ops_test.cc
index 7fbe213e20f596a02159c79e6dcd184c4dd825d5..f5d90a676d7f80225b0af8d7eaac8fdc1426d756 100644
--- a/tensorflow/core/ops/remote_fused_graph_ops_test.cc
+++ b/tensorflow/core/ops/remote_fused_graph_ops_test.cc
@@ -26,21 +26,33 @@ namespace tensorflow {
 
 TEST(RemoteFusedGraphOpsTest, RemoteFusedGraphExecute_ShapeFn) {
   ShapeInferenceTestOp op("RemoteFusedGraphExecute");
-  auto set_n = [&op](int input_count, int output_count) {
+  auto set_n = [&op](int input1_count, int input2_count, int output_count) {
     std::vector<NodeDefBuilder::NodeOut> src_list;
-    for (int i = 0; i < input_count; ++i) {
+    DataTypeVector input_types;
+    for (int i = 0; i < input1_count; ++i) {
       src_list.emplace_back("a", 0, DT_FLOAT);
+      input_types.emplace_back(DT_FLOAT);
     }
-    TF_ASSERT_OK(NodeDefBuilder("test", "RemoteFusedGraphExecute")
-                     .Input(src_list)
-                     .Attr("M", input_count)
-                     .Attr("N", output_count)
-                     .Attr("T", DT_FLOAT)
-                     .Attr("U", DT_FLOAT)
-                     .Finalize(&op.node_def));
+    for (int i = 0; i < input2_count; ++i) {
+      src_list.emplace_back("b", 0, DT_INT32);
+      input_types.emplace_back(DT_INT32);
+    }
+    DataTypeVector output_types;
+    for (int i = 0; i < output_count; ++i) {
+      output_types.emplace_back(DT_FLOAT);
+    }
+    NodeDefBuilder builder = NodeDefBuilder("test", "RemoteFusedGraphExecute")
+                                 .Input(src_list)
+                                 .Attr("Tinputs", input_types)
+                                 .Attr("Toutputs", output_types);
+    TF_ASSERT_OK(builder.Finalize(&op.node_def));
   };
-  set_n(4, 2);
+  set_n(4, 0, 2);
   INFER_OK(op, "?;?;?;?", "?;?");  // output rank unknown
+
+  set_n(4, 3, 3);
+  INFER_OK(op, "?;?;?;?;?;?;?", "?;?;?");  // output rank unknown
+
   // TODO(satok): Implement shape inference and do its test here
 }
 
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index c190b81dde3a346a08bf82c5b4644bf02e5c6d23..c060aa6be91d6675a6c2acdadd532151e509fcdf 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -295,7 +295,7 @@ the same location, their contributions add.
 Requires `updates.shape = indices.shape + ref.shape[1:]`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterAdd.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
 </div>
 
 resource: Should be from a `Variable` node.
diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc
index 2029ed7de22fc26bddfc2a8e71eb132e6fe9358c..dea75a1af83456f730a6c98cc40fd26d02ca2fda 100644
--- a/tensorflow/core/ops/sdca_ops.cc
+++ b/tensorflow/core/ops/sdca_ops.cc
@@ -72,17 +72,17 @@ optimizer applies each update one example at a time. Examples are sampled
 uniformly, and the optimizer is learning rate free and enjoys linear convergence
 rate.
 
-Proximal Stochastic Dual Coordinate Ascent, Shalev-Shwartz, Shai; Zhang, Tong.
-2012 arXiv1211.2717S: http://arxiv.org/pdf/1211.2717v1.pdf
+[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+Shai Shalev-Shwartz, Tong Zhang. 2012
 
-  Loss objective = \sum f_{i}(wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|
+$$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
 
-Adding vs. Averaging in Distributed Primal-Dual Optimization.
-Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, Peter Richtarik,
-Martin Takac http://arxiv.org/abs/1502.03508
+[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+Peter Richtarik, Martin Takac. 2015
 
-Stochastic Dual Coordinate Ascent with Adaptive Probabilities
-Dominik Csiba, Zheng Qu, Peter Richtarik https://arxiv.org/abs/1502.08053
+[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 
 loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
   squared and hinge losses.
@@ -105,7 +105,7 @@ example_weights: a vector which contains the weight associated with each
 example_labels: a vector which contains the label/target associated with each
   example.
 sparse_indices: a list of vectors where each value is the indices which has
-  corresponding weights in sparse_weights. This field maybe ommitted for the
+  corresponding weights in sparse_weights. This field maybe omitted for the
   dense approach.
 sparse_weights: a list of vectors where each value is the weight associated with
   a sparse feature group.
diff --git a/tensorflow/core/ops/set_ops.cc b/tensorflow/core/ops/set_ops.cc
index fad7007207162015bf57ae88b51419ea8a48c64e..85d1335dcf9b362a856f058758ebe7b130302357 100644
--- a/tensorflow/core/ops/set_ops.cc
+++ b/tensorflow/core/ops/set_ops.cc
@@ -235,7 +235,7 @@ REGISTER_OP("SparseToSparseSetOperation")
       DimensionHandle input1_rank_dim = c->Dim(input1_shape_shape, 0);
       DimensionHandle output_rank_dim;
       if (c->ValueKnown(input0_rank_dim)) {
-        const int32 input0_rank = c->Value(input0_rank_dim);
+        const int64 input0_rank = c->Value(input0_rank_dim);
         if (input0_rank < 2) {
           return errors::InvalidArgument("Input 0, expected rank >= 2, got ",
                                          input0_rank, ".");
@@ -244,7 +244,7 @@ REGISTER_OP("SparseToSparseSetOperation")
             c->WithValue(input1_rank_dim, input0_rank, &input1_rank_dim));
         output_rank_dim = input0_rank_dim;
       } else if (c->ValueKnown(input1_rank_dim)) {
-        const int32 input1_rank = c->Value(input1_rank_dim);
+        const int64 input1_rank = c->Value(input1_rank_dim);
         if (input1_rank < 2) {
           return errors::InvalidArgument("Input 1, expected rank >= 2, got ",
                                          input1_rank, ".");
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index 860b3475e932474b44b86357506f5400943095c5..9bbf37cfc2cae4bdfdd61f71f43a2ced501a1503 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -128,12 +128,13 @@ pair takes space.
 )doc");
 
 REGISTER_OP("SparseTensorDenseMatMul")
-    .Input("a_indices: int64")
+    .Input("a_indices: Tindices")
     .Input("a_values: T")
     .Input("a_shape: int64")
     .Input("b: T")
     .Output("product: T")
     .Attr("T: type")
+    .Attr("Tindices: {int32,int64} = DT_INT64")
     .Attr("adjoint_a: bool = false")
     .Attr("adjoint_b: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -455,6 +456,84 @@ concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
     where rank is the number of dimensions in each input `SparseTensor`.
 )doc");
 
+REGISTER_OP("SparseCross")
+    .Input("indices: N * int64")
+    .Input("values: sparse_types")
+    .Input("shapes: N * int64")
+    .Input("dense_inputs: dense_types")
+    .Output("output_indices: int64")
+    .Output("output_values: out_type")
+    .Output("output_shape: int64")
+    .Attr("N: int >= 0")
+    .Attr("hashed_output: bool")
+    .Attr("num_buckets: int >= 0")
+    .Attr("hash_key: int")
+    .Attr("sparse_types: list({int64, string}) >= 0")
+    .Attr("dense_types: list({int64, string}) >= 0")
+    .Attr("out_type: {int64, string}")
+    .Attr("internal_type: {int64, string}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Matrix(c->UnknownDim(), 2));
+      c->set_output(1, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->Vector(2));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Generates sparse cross from a list of sparse and dense tensors.
+
+The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+representing features of one feature column. It outputs a 2D `SparseTensor` with
+the batchwise crosses of these features.
+
+For example, if the inputs are
+
+    inputs[0]: SparseTensor with shape = [2, 2]
+    [0, 0]: "a"
+    [1, 0]: "b"
+    [1, 1]: "c"
+
+    inputs[1]: SparseTensor with shape = [2, 1]
+    [0, 0]: "d"
+    [1, 0]: "e"
+
+    inputs[2]: Tensor [["f"], ["g"]]
+
+then the output will be
+
+    shape = [2, 2]
+    [0, 0]: "a_X_d_X_f"
+    [1, 0]: "b_X_e_X_g"
+    [1, 1]: "c_X_e_X_g"
+
+if hashed_output=true then the output will be
+
+    shape = [2, 2]
+    [0, 0]: FingerprintCat64(
+                Fingerprint64("f"), FingerprintCat64(
+                    Fingerprint64("d"), Fingerprint64("a")))
+    [1, 0]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("b")))
+    [1, 1]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("c")))
+
+indices: 2-D.  Indices of each input `SparseTensor`.
+values: 1-D.   values of each `SparseTensor`.
+shapes: 1-D.   Shapes of each `SparseTensor`.
+dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+hashed_output: If true, returns the hash of the cross instead of the string.
+  This will allow us avoiding string manipulations.
+num_buckets: It is used if hashed_output is true.
+  output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+  function to combine the crosses fingerprints.
+output_indices: 2-D.  Indices of the concatenated `SparseTensor`.
+output_values: 1-D.  Non-empty values of the concatenated or hashed
+  `SparseTensor`.
+output_shape: 1-D.  Shape of the concatenated `SparseTensor`.
+)doc");
+
 REGISTER_OP("SparseSplit")
     .Input("split_dim: int64")
     .Input("indices: int64")
diff --git a/tensorflow/core/ops/spectral_ops.cc b/tensorflow/core/ops/spectral_ops.cc
index 1a2b2f3dab8220bc5f5c508521db8a655e41e654..09b460fd1475db4c3bbfed0f4807a9575e819291 100644
--- a/tensorflow/core/ops/spectral_ops.cc
+++ b/tensorflow/core/ops/spectral_ops.cc
@@ -31,12 +31,14 @@ REGISTER_OP("FFT")
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
     })
     .Doc(R"doc(
-Compute the 1-dimensional discrete Fourier Transform over the inner-most
+Fast Fourier transform.
+
+Computes the 1-dimensional discrete Fourier transform over the inner-most
 dimension of `input`.
 
 input: A complex64 tensor.
 output: A complex64 tensor of the same shape as `input`. The inner-most
-  dimension of `input` is replaced with its 1D Fourier Transform.
+  dimension of `input` is replaced with its 1D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.fft
@@ -50,12 +52,14 @@ REGISTER_OP("IFFT")
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
     })
     .Doc(R"doc(
-Compute the inverse 1-dimensional discrete Fourier Transform over the inner-most
-dimension of `input`.
+Inverse fast Fourier transform.
+
+Computes the inverse 1-dimensional discrete Fourier transform over the
+inner-most dimension of `input`.
 
 input: A complex64 tensor.
 output: A complex64 tensor of the same shape as `input`. The inner-most
-  dimension of `input` is replaced with its inverse 1D Fourier Transform.
+  dimension of `input` is replaced with its inverse 1D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.ifft
@@ -69,12 +73,14 @@ REGISTER_OP("FFT2D")
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 2);
     })
     .Doc(R"doc(
-Compute the 2-dimensional discrete Fourier Transform over the inner-most
+2D fast Fourier transform.
+
+Computes the 2-dimensional discrete Fourier transform over the inner-most
 2 dimensions of `input`.
 
 input: A complex64 tensor.
 output: A complex64 tensor of the same shape as `input`. The inner-most 2
-  dimensions of `input` are replaced with their 2D Fourier Transform.
+  dimensions of `input` are replaced with their 2D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.fft2
@@ -88,12 +94,14 @@ REGISTER_OP("IFFT2D")
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 2);
     })
     .Doc(R"doc(
-Compute the inverse 2-dimensional discrete Fourier Transform over the inner-most
-2 dimensions of `input`.
+Inverse 2D fast Fourier transform.
+
+Computes the inverse 2-dimensional discrete Fourier transform over the
+inner-most 2 dimensions of `input`.
 
 input: A complex64 tensor.
 output: A complex64 tensor of the same shape as `input`. The inner-most 2
-  dimensions of `input` are replaced with their inverse 2D Fourier Transform.
+  dimensions of `input` are replaced with their inverse 2D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.ifft2
@@ -107,12 +115,14 @@ REGISTER_OP("FFT3D")
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     })
     .Doc(R"doc(
-Compute the 3-dimensional discrete Fourier Transform over the inner-most 3
+3D fast Fourier transform.
+
+Computes the 3-dimensional discrete Fourier transform over the inner-most 3
 dimensions of `input`.
 
 input: A complex64 tensor.
 output: A complex64 tensor of the same shape as `input`. The inner-most 3
-  dimensions of `input` are replaced with their 3D Fourier Transform.
+  dimensions of `input` are replaced with their 3D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.fftn with 3 dimensions.
@@ -126,12 +136,14 @@ REGISTER_OP("IFFT3D")
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     })
     .Doc(R"doc(
-Compute the inverse 3-dimensional discrete Fourier Transform over the inner-most
-3 dimensions of `input`.
+Inverse 3D fast Fourier transform.
+
+Computes the inverse 3-dimensional discrete Fourier transform over the
+inner-most 3 dimensions of `input`.
 
 input: A complex64 tensor.
 output: A complex64 tensor of the same shape as `input`. The inner-most 3
-  dimensions of `input` are replaced with their inverse 3D Fourier Transform.
+  dimensions of `input` are replaced with their inverse 3D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.ifftn with 3 dimensions.
@@ -180,7 +192,9 @@ REGISTER_OP("RFFT")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 1); })
     .Doc(R"doc(
-Compute the 1-dimensional discrete Fourier Transform of a real-valued signal
+Real-valued fast Fourier transform.
+
+Computes the 1-dimensional discrete Fourier transform of a real-valued signal
 over the inner-most dimension of `input`.
 
 Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
@@ -191,7 +205,7 @@ input: A float32 tensor.
 fft_length: An int32 tensor of shape [1]. The FFT length.
 output: A complex64 tensor of the same rank as `input`. The inner-most
   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-  frequency components of its 1D Fourier Transform.
+  frequency components of its 1D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.rfft
@@ -204,7 +218,9 @@ REGISTER_OP("IRFFT")
     .Output("output: float")
     .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 1); })
     .Doc(R"doc(
-Compute the inverse 1-dimensional discrete Fourier Transform of a real-valued
+Inverse real-valued fast Fourier transform.
+
+Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
 signal over the inner-most dimension of `input`.
 
 The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
@@ -218,7 +234,7 @@ input: A complex64 tensor.
 fft_length: An int32 tensor of shape [1]. The FFT length.
 output: A float32 tensor of the same rank as `input`. The inner-most
   dimension of `input` is replaced with the `fft_length` samples of its inverse
-  1D Fourier Transform.
+  1D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.irfft
@@ -231,7 +247,9 @@ REGISTER_OP("RFFT2D")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 2); })
     .Doc(R"doc(
-Compute the 2-dimensional discrete Fourier Transform of a real-valued signal
+2D real-valued fast Fourier transform.
+
+Computes the 2-dimensional discrete Fourier transform of a real-valued signal
 over the inner-most 2 dimensions of `input`.
 
 Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
@@ -242,7 +260,7 @@ positive-frequency terms.
 input: A float32 tensor.
 fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 output: A complex64 tensor of the same rank as `input`. The inner-most 2
-  dimensions of `input` are replaced with their 2D Fourier Transform. The
+  dimensions of `input` are replaced with their 2D Fourier transform. The
   inner-most dimension contains `fft_length / 2 + 1` unique frequency
   components.
 
@@ -257,7 +275,9 @@ REGISTER_OP("IRFFT2D")
     .Output("output: float")
     .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 2); })
     .Doc(R"doc(
-Compute the inverse 2-dimensional discrete Fourier Transform of a real-valued
+Inverse 2D real-valued fast Fourier transform.
+
+Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
 signal over the inner-most 2 dimensions of `input`.
 
 The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
@@ -271,7 +291,7 @@ input: A complex64 tensor.
 fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 output: A float32 tensor of the same rank as `input`. The inner-most 2
   dimensions of `input` are replaced with the `fft_length` samples of their
-  inverse 2D Fourier Transform.
+  inverse 2D Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.fft.irfft2
@@ -284,7 +304,9 @@ REGISTER_OP("RFFT3D")
     .Output("output: complex64")
     .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, true, 3); })
     .Doc(R"doc(
-Compute the 3-dimensional discrete Fourier Transform of a real-valued signal
+3D real-valued fast Fourier transform.
+
+Computes the 3-dimensional discrete Fourier transform of a real-valued signal
 over the inner-most 3 dimensions of `input`.
 
 Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
@@ -295,7 +317,7 @@ positive-frequency terms.
 input: A float32 tensor.
 fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 output: A complex64 tensor of the same rank as `input`. The inner-most 3
-  dimensions of `input` are replaced with the their 3D Fourier Transform. The
+  dimensions of `input` are replaced with the their 3D Fourier transform. The
   inner-most dimension contains `fft_length / 2 + 1` unique frequency
   components.
 
@@ -310,7 +332,9 @@ REGISTER_OP("IRFFT3D")
     .Output("output: float")
     .SetShapeFn([](InferenceContext* c) { return RFFTShape(c, false, 3); })
     .Doc(R"doc(
-Compute the inverse 3-dimensional discrete Fourier Transform of a real-valued
+Inverse 3D real-valued fast Fourier transform.
+
+Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
 signal over the inner-most 3 dimensions of `input`.
 
 The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
@@ -324,7 +348,7 @@ input: A complex64 tensor.
 fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 output: A float32 tensor of the same rank as `input`. The inner-most 3
   dimensions of `input` are replaced with the `fft_length` samples of their
-  inverse 3D real Fourier Transform.
+  inverse 3D real Fourier transform.
 
 @compatibility(numpy)
 Equivalent to np.irfftn with 3 dimensions.
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index cfb3ea7141172e1429051c7bea88c918969d0124..0890d5fc7c77ac4e930f69680345d17ef9bff364 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -288,7 +288,7 @@ for each value is undefined.
 Requires `updates.shape = indices.shape + ref.shape[1:]`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterUpdate.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterUpdate.png" alt>
 </div>
 
 ref: Should be from a `Variable` node.
@@ -332,7 +332,7 @@ the same location, their contributions add.
 Requires `updates.shape = indices.shape + ref.shape[1:]`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterAdd.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
 </div>
 
 ref: Should be from a `Variable` node.
@@ -376,7 +376,7 @@ the same location, their (negated) contributions add.
 Requires `updates.shape = indices.shape + ref.shape[1:]`.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/ScatterSub.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterSub.png" alt>
 </div>
 
 ref: Should be from a `Variable` node.
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b222b5b2416dfac09e1dd1abd15862e317b064c7
--- /dev/null
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -0,0 +1,94 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::ShapeHandle;
+
+static Status StatelessShape(shape_inference::InferenceContext* context) {
+  // Check seed shape
+  ShapeHandle seed;
+  TF_RETURN_IF_ERROR(context->WithRank(context->input(1), 1, &seed));
+  DimensionHandle unused;
+  TF_RETURN_IF_ERROR(context->WithValue(context->Dim(seed, 0), 2, &unused));
+
+  // Set output shape
+  shape_inference::ShapeHandle out;
+  TF_RETURN_IF_ERROR(context->MakeShapeFromShapeTensor(0, &out));
+  context->set_output(0, out);
+  return Status::OK();
+}
+
+#define REGISTER_STATELESS_OP(name)                  \
+  REGISTER_OP(name)                                  \
+      .Input("shape: T")                             \
+      .Input("seed: int64")                          \
+      .Output("output: dtype")                       \
+      .Attr("dtype: {half,float,double} = DT_FLOAT") \
+      .Attr("T: {int32, int64} = DT_INT32")          \
+      .SetShapeFn(StatelessShape)
+
+// This op is exposed through contrib/stateless only.  The interface may change.
+REGISTER_STATELESS_OP("StatelessRandomUniform").Doc(R"doc(
+Outputs deterministic pseudorandom random values from a uniform distribution.
+
+The generated values follow a uniform distribution in the range `[0, 1)`. The
+lower bound 0 is included in the range, while the upper bound 1 is excluded.
+
+The outputs are a deterministic function of `shape` and `seed`.
+
+shape: The shape of the output tensor.
+dtype: The type of the output.
+seed: 2 seeds (shape [2]).
+output: Random values with specified shape.
+)doc");
+
+// This op is exposed through contrib/stateless only.  The interface may change.
+REGISTER_STATELESS_OP("StatelessRandomNormal").Doc(R"doc(
+Outputs deterministic pseudorandom values from a normal distribution.
+
+The generated values will have mean 0 and standard deviation 1.
+
+The outputs are a deterministic function of `shape` and `seed`.
+
+shape: The shape of the output tensor.
+dtype: The type of the output.
+seed: 2 seeds (shape [2]).
+output: Random values with specified shape.
+)doc");
+
+// This op is exposed through contrib/stateless only.  The interface may change.
+REGISTER_STATELESS_OP("StatelessTruncatedNormal").Doc(R"doc(
+Outputs deterministic pseudorandom values from a truncated normal distribution.
+
+The generated values follow a normal distribution with mean 0 and standard
+deviation 1, except that values whose magnitude is more than 2 standard
+deviations from the mean are dropped and re-picked.
+
+The outputs are a deterministic function of `shape` and `seed`.
+
+shape: The shape of the output tensor.
+dtype: The type of the output.
+seed: 2 seeds (shape [2]).
+output: Random values with specified shape.
+)doc");
+
+#undef REGISTER_STATELESS_OP
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 2027bf4603d7a6ec35db5f3ba8debd42313fba41..a786fed85684dd09ad5d1e91f0a326a598a68f30 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -102,6 +102,28 @@ use_locking: If `True`, the subtraction will be protected by a lock;
   otherwise the behavior is undefined, but may exhibit less contention.
 )doc");
 
+REGISTER_OP("ApplyDelayCompensatedGradientDescent")
+    .Input("var: resource")
+    .Input("alpha: T")
+    .Input("delta: T")
+    .Input("lambda: T")
+    .Input("shadow: resource")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn(ApplyGradientDescentShapeFn)
+    .Doc(R"doc(
+var -= alpha * (delta + lambda * delta * (var - shadow))
+Update '*shadow' by changing it to the new value of 'var'
+
+var: Should be from a Variable().
+alpha: Scaling factor. Must be a scalar.
+delta: The change.
+lambda: The variance parameter.
+shadow: Same as "var".
+use_locking: If `True`, the subtraction will be protected by a lock;
+  otherwise the behavior is undefined, but may exhibit less contention.
+)doc");
+
 static Status ApplyProximalGradientDescentShapeFn(InferenceContext* c,
                                                   bool sparse) {
   ShapeHandle unused;
@@ -1004,7 +1026,7 @@ out: Same as "var".
 use_locking: If `True`, updating of the var and accum tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
-use_nesterov: If `True`, the tensor passed to compute grad will be 
+use_nesterov: If `True`, the tensor passed to compute grad will be
 var - lr * momentum * accum, so in the end, the var you get is actually
 var - lr * momentum * accum.
 )doc");
@@ -1043,7 +1065,7 @@ out: Same as "var".
 use_locking: If `True`, updating of the var and accum tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
-use_nesterov: If `True`, the tensor passed to compute grad will be 
+use_nesterov: If `True`, the tensor passed to compute grad will be
 var - lr * momentum * accum, so in the end, the var you get is actually
 var - lr * momentum * accum.
 )doc");
@@ -1075,7 +1097,7 @@ momentum: Momentum. Must be a scalar.
 use_locking: If `True`, updating of the var and accum tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
-use_nesterov: If `True`, the tensor passed to compute grad will be 
+use_nesterov: If `True`, the tensor passed to compute grad will be
 var - lr * momentum * accum, so in the end, the var you get is actually
 var - lr * momentum * accum.
 )doc");
@@ -1112,7 +1134,7 @@ momentum: Momentum. Must be a scalar.
 use_locking: If `True`, updating of the var and accum tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
-use_nesterov: If `True`, the tensor passed to compute grad will be 
+use_nesterov: If `True`, the tensor passed to compute grad will be
 var - lr * momentum * accum, so in the end, the var you get is actually
 var - lr * momentum * accum.
 )doc");
@@ -1150,6 +1172,7 @@ REGISTER_OP("ApplyAdam")
     .Output("out: Ref(T)")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdamShapeFn(c, false /* sparse */);
     })
@@ -1175,6 +1198,7 @@ out: Same as "var".
 use_locking: If `True`, updating of the var, m, and v tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
+use_nesterov: If `True`, uses the nesterov update.
 )doc");
 
 REGISTER_OP("ResourceApplyAdam")
@@ -1190,6 +1214,7 @@ REGISTER_OP("ResourceApplyAdam")
     .Input("grad: T")
     .Attr("T: numbertype")
     .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
     .SetShapeFn([](InferenceContext* c) {
       return ApplyAdamShapeFn(c, false /* sparse */);
     })
@@ -1214,6 +1239,7 @@ grad: The gradient.
 use_locking: If `True`, updating of the var, m, and v tensors will be protected
   by a lock; otherwise the behavior is undefined, but may exhibit less
   contention.
+use_nesterov: If `True`, uses the nesterov update.
 )doc");
 
 static Status ApplyRMSPropShapeFn(InferenceContext* c, bool sparse) {
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index 9c3489211c8eed4013555e5aee7d53817e43e8a1..da66fbe4ba432d5a29470d67cef3dfd07b56034b 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -21,9 +21,9 @@ limitations under the License.
 namespace tensorflow {
 
 // Used for testing the grad+indices handling for SparseApplyXYZ tests.
-static void TestGradAndIndicesErrorHandling(ShapeInferenceTestOp op,
+static void TestGradAndIndicesErrorHandling(const ShapeInferenceTestOp& op,
                                             string shape_spec_middle,
-                                            string shape_spec_end = "") {
+                                            const string& shape_spec_end = "") {
   auto shape_spec = [&shape_spec_middle, shape_spec_end](
       const char* var_spec, const char* grad_indices_spec) {
     return strings::StrCat(var_spec, ";", shape_spec_middle, ";",
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 5ee3099673f158e4736c43795027f4680de9c963..97e4c207d86fc6c8c12595be2de8707e9b3832f6 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -52,6 +52,8 @@ constexpr int kGetChildrenDefaultPageSize = 1000;
 constexpr uint64 kUploadRetryDelayMicros = 1000000L;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
+// The environment variable that overrides the size of the readahead buffer.
+constexpr char kReadaheadBufferSize[] = "GCS_READAHEAD_BUFFER_SIZE_BYTES";
 
 // The file statistics returned by Stat() for directories.
 const FileStatistics DIRECTORY_STAT(0, 0, true);
@@ -585,7 +587,16 @@ class GcsReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
 
 GcsFileSystem::GcsFileSystem()
     : auth_provider_(new GoogleAuthProvider()),
-      http_request_factory_(new HttpRequest::Factory()) {}
+      http_request_factory_(new HttpRequest::Factory()) {
+  // Apply the sys env override for the readahead buffer size if it's provided.
+  const char* readahead_buffer_size = std::getenv(kReadaheadBufferSize);
+  if (readahead_buffer_size) {
+    uint64 value;
+    if (strings::safe_strtou64(readahead_buffer_size, &value)) {
+      read_ahead_bytes_ = value;
+    }
+  }
+}
 
 GcsFileSystem::GcsFileSystem(
     std::unique_ptr<AuthProvider> auth_provider,
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 6a6437f070e348dc9b0c98c5550dceedeecfa13d..18d2de482bb27298bea7f45ad8c6f167fab6c286 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -74,6 +74,7 @@ class GcsFileSystem : public FileSystem {
 
   Status DeleteRecursively(const string& dirname, int64* undeleted_files,
                            int64* undeleted_dirs) override;
+  size_t get_readahead_buffer_size() const { return read_ahead_bytes_; }
 
  private:
   /// \brief Checks if the bucket exists. Returns OK if the check succeeded.
@@ -112,7 +113,7 @@ class GcsFileSystem : public FileSystem {
 
   // The number of bytes to read ahead for buffering purposes in the
   // RandomAccessFile implementation. Defaults to 256Mb.
-  const size_t read_ahead_bytes_ = 256 * 1024 * 1024;
+  size_t read_ahead_bytes_ = 256 * 1024 * 1024;
 
   // The initial delay for exponential backoffs when retrying failed calls.
   const int64 initial_retry_delay_usec_ = 1000000L;
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index fc79f3be110569d959dd33d50bd3fc37fe83327e..c3a8678fbc6fce15354a2b50a742f02413c46ace 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -1617,5 +1617,14 @@ TEST(GcsFileSystemTest, DeleteRecursively_NotAFolder) {
   EXPECT_EQ(1, undeleted_dirs);
 }
 
+TEST(GcsFileSystemTest, OverrideReadaheadBufferSize) {
+  GcsFileSystem fs1;
+  EXPECT_EQ(256 * 1024 * 1024, fs1.get_readahead_buffer_size());
+
+  setenv("GCS_READAHEAD_BUFFER_SIZE_BYTES", "123456789", 1);
+  GcsFileSystem fs2;
+  EXPECT_EQ(123456789L, fs2.get_readahead_buffer_size());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index 6f29d4597f18dce3813c69f86f1cdab3eca561ff..f70b431b6524cc566fa9c20fbdd5c4d555b501c6 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -67,7 +67,7 @@ constexpr char kGceTokenUrl[] =
 // The authentication token scope to request.
 constexpr char kOAuthScope[] = "https://www.googleapis.com/auth/cloud-platform";
 
-// The default intial delay between retries with exponential backoff.
+// The default initial delay between retries with exponential backoff.
 constexpr int kInitialRetryDelayUsec = 500000;  // 0.5 sec
 
 /// Returns whether the given path points to a readable file.
diff --git a/tensorflow/core/platform/cloud/http_request.cc b/tensorflow/core/platform/cloud/http_request.cc
index 825741f61409eb762a0f7ea6c4a8d559c17d525d..2d0141e50e786b8275272cce29be269c6fe8afe0 100644
--- a/tensorflow/core/platform/cloud/http_request.cc
+++ b/tensorflow/core/platform/cloud/http_request.cc
@@ -35,6 +35,10 @@ constexpr uint32 kRequestTimeoutSeconds = 3600;  // 1 hour
 // Timeout for the connection phase.
 constexpr uint32 kConnectTimeoutSeconds = 120;  // 2 minutes
 
+// The maximum period of request inactivity, after which the request
+// is terminated.
+constexpr uint64 kInactivityTimeoutSeconds = 60;  // 1 minute
+
 // Proxy to the real libcurl implementation.
 class LibCurlProxy : public LibCurl {
  public:
@@ -75,6 +79,13 @@ class LibCurlProxy : public LibCurl {
     return ::curl_easy_setopt(curl, option, param);
   }
 
+  CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
+                            int (*param)(void* clientp, curl_off_t dltotal,
+                                         curl_off_t dlnow, curl_off_t ultotal,
+                                         curl_off_t ulnow)) override {
+    return ::curl_easy_setopt(curl, option, param);
+  }
+
   CURLcode curl_easy_perform(CURL* curl) override {
     return ::curl_easy_perform(curl);
   }
@@ -111,7 +122,8 @@ class LibCurlProxy : public LibCurl {
 
 HttpRequest::HttpRequest() : HttpRequest(LibCurlProxy::Load()) {}
 
-HttpRequest::HttpRequest(LibCurl* libcurl) : libcurl_(libcurl) {
+HttpRequest::HttpRequest(LibCurl* libcurl, Env* env)
+    : libcurl_(libcurl), env_(env) {
   default_response_buffer_.reserve(CURL_MAX_WRITE_SIZE);
 }
 
@@ -152,6 +164,12 @@ Status HttpRequest::Init() {
   libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTP_VERSION,
                              CURL_HTTP_VERSION_2_0);
 
+  // Set up the progress meter.
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_NOPROGRESS, 0ULL);
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_XFERINFODATA, this);
+  libcurl_->curl_easy_setopt(curl_, CURLOPT_XFERINFOFUNCTION,
+                             &HttpRequest::ProgressCallback);
+
   // If response buffer is not set, libcurl will print results to stdout,
   // so we always set it.
   is_initialized_ = true;
@@ -470,4 +488,31 @@ string HttpRequest::GetResponseHeader(const string& name) const {
 
 uint64 HttpRequest::GetResponseCode() const { return response_code_; }
 
+// Cancels the transmission if no progress has been made for too long.
+int HttpRequest::ProgressCallback(void* this_object, curl_off_t dltotal,
+                                  curl_off_t dlnow, curl_off_t ultotal,
+                                  curl_off_t ulnow) {
+  auto that = reinterpret_cast<HttpRequest*>(this_object);
+  const auto now = that->env_->NowSeconds();
+  const auto current_progress = dlnow + ulnow;
+  if (that->last_progress_timestamp_ == 0 ||
+      current_progress > that->last_progress_bytes_) {
+    // This is the first time the callback is called or some progress
+    // was made since the last tick.
+    that->last_progress_timestamp_ = now;
+    that->last_progress_bytes_ = current_progress;
+    return 0;
+  }
+
+  if (now - that->last_progress_timestamp_ > kInactivityTimeoutSeconds) {
+    LOG(ERROR) << "The transmission has been stuck at " << current_progress
+               << " bytes for " << now - that->last_progress_timestamp_
+               << " seconds and will be aborted.";
+    return 1;  // Will abort the request.
+  }
+
+  // No progress was made since the last call, but we should wait a bit longer.
+  return 0;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/http_request.h b/tensorflow/core/platform/cloud/http_request.h
index 5365c45ca9bed2264a8cfc44ebc8ef39ceb1f6b9..afcbb9f35cfba478746b7e9beff6135aba32fa1d 100644
--- a/tensorflow/core/platform/cloud/http_request.h
+++ b/tensorflow/core/platform/cloud/http_request.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
@@ -50,7 +51,9 @@ class HttpRequest {
   };
 
   HttpRequest();
-  explicit HttpRequest(LibCurl* libcurl);
+  explicit HttpRequest(LibCurl* libcurl)
+      : HttpRequest(libcurl, Env::Default()) {}
+  HttpRequest(LibCurl* libcurl, Env* env);
   virtual ~HttpRequest();
 
   virtual Status Init();
@@ -123,11 +126,16 @@ class HttpRequest {
   /// A header callback in the form which can be accepted by libcurl.
   static size_t HeaderCallback(const void* ptr, size_t size, size_t nmemb,
                                void* this_object);
+  /// A progress meter callback in the form which can be accepted by libcurl.
+  static int ProgressCallback(void* this_object, curl_off_t dltotal,
+                              curl_off_t dlnow, curl_off_t ultotal,
+                              curl_off_t ulnow);
   Status CheckInitialized() const;
   Status CheckMethodNotSet() const;
   Status CheckNotSent() const;
 
   LibCurl* libcurl_;
+  Env* env_;
 
   FILE* put_body_ = nullptr;
 
@@ -144,6 +152,12 @@ class HttpRequest {
   std::unordered_map<string, string> response_headers_;
   uint64 response_code_ = 0;
 
+  // The timestamp of the last activity related to the request execution, in
+  // seconds since epoch.
+  uint64 last_progress_timestamp_ = 0;
+  // The last progress in terms of bytes transmitted.
+  curl_off_t last_progress_bytes_ = 0;
+
   // Members to enforce the usage flow.
   bool is_initialized_ = false;
   bool is_uri_set_ = false;
@@ -173,6 +187,10 @@ class LibCurl {
   virtual CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
                                     size_t (*param)(const void*, size_t, size_t,
                                                     void*)) = 0;
+  virtual CURLcode curl_easy_setopt(
+      CURL* curl, CURLoption option,
+      int (*param)(void* clientp, curl_off_t dltotal, curl_off_t dlnow,
+                   curl_off_t ultotal, curl_off_t ulnow)) = 0;
   virtual CURLcode curl_easy_perform(CURL* curl) = 0;
   virtual CURLcode curl_easy_getinfo(CURL* curl, CURLINFO info,
                                      uint64* value) = 0;
diff --git a/tensorflow/core/platform/cloud/http_request_test.cc b/tensorflow/core/platform/cloud/http_request_test.cc
index b918a3a8cd84200133869229f6bce5a7cf115371..6d66dfdee18c4251944189aca36da0d08d86d51a 100644
--- a/tensorflow/core/platform/cloud/http_request_test.cc
+++ b/tensorflow/core/platform/cloud/http_request_test.cc
@@ -25,11 +25,26 @@ namespace {
 
 const string kTestContent = "random original scratch content";
 
+class FakeEnv : public EnvWrapper {
+ public:
+  FakeEnv() : EnvWrapper(Env::Default()) {}
+
+  uint64 NowSeconds() override { return now_; }
+  uint64 now_ = 10000;
+};
+
 // A fake proxy that pretends to be libcurl.
 class FakeLibCurl : public LibCurl {
  public:
   FakeLibCurl(const string& response_content, uint64 response_code)
       : response_content_(response_content), response_code_(response_code) {}
+  FakeLibCurl(const string& response_content, uint64 response_code,
+              std::vector<std::tuple<uint64, curl_off_t>> progress_ticks,
+              FakeEnv* env)
+      : response_content_(response_content),
+        response_code_(response_code),
+        progress_ticks_(std::move(progress_ticks)),
+        env_(env) {}
   FakeLibCurl(const string& response_content, uint64 response_code,
               const std::vector<string>& response_headers)
       : response_content_(response_content),
@@ -86,6 +101,9 @@ class FakeLibCurl : public LibCurl {
       case CURLOPT_READDATA:
         read_data_ = reinterpret_cast<FILE*>(param);
         break;
+      case CURLOPT_XFERINFODATA:
+        progress_data_ = param;
+        break;
       default:
         break;
     }
@@ -112,6 +130,13 @@ class FakeLibCurl : public LibCurl {
     }
     return CURLE_OK;
   }
+  CURLcode curl_easy_setopt(CURL* curl, CURLoption option,
+                            int (*param)(void* clientp, curl_off_t dltotal,
+                                         curl_off_t dlnow, curl_off_t ultotal,
+                                         curl_off_t ulnow)) override {
+    progress_callback_ = param;
+    return CURLE_OK;
+  }
   CURLcode curl_easy_perform(CURL* curl) override {
     if (read_data_) {
       char buffer[3];
@@ -134,6 +159,12 @@ class FakeLibCurl : public LibCurl {
       strncpy(error_buffer_, curl_easy_perform_error_message_.c_str(),
               curl_easy_perform_error_message_.size() + 1);
     }
+    for (const auto& tick : progress_ticks_) {
+      env_->now_ = std::get<0>(tick);
+      if (progress_callback_(progress_data_, 0, std::get<1>(tick), 0, 0)) {
+        return CURLE_ABORTED_BY_CALLBACK;
+      }
+    }
     return curl_easy_perform_result_;
   }
   CURLcode curl_easy_getinfo(CURL* curl, CURLINFO info,
@@ -212,10 +243,17 @@ class FakeLibCurl : public LibCurl {
   FILE* read_data_ = nullptr;
   size_t (*read_callback_)(void* ptr, size_t size, size_t nmemb,
                            FILE* userdata) = &fread;
+  int (*progress_callback_)(void* clientp, curl_off_t dltotal, curl_off_t dlnow,
+                            curl_off_t ultotal, curl_off_t ulnow) = nullptr;
+  void* progress_data_ = nullptr;
   // Outcome of performing the request.
   string posted_content_;
   CURLcode curl_easy_perform_result_ = CURLE_OK;
   string curl_easy_perform_error_message_;
+  // A vector of <timestamp, progress in bytes> pairs that represent the
+  // progress of a transmission.
+  std::vector<std::tuple<uint64, curl_off_t>> progress_ticks_;
+  FakeEnv* env_ = nullptr;
 };
 
 TEST(HttpRequestTest, GetRequest) {
@@ -547,5 +585,44 @@ TEST(HttpRequestTest, ErrorReturnsNoResponse) {
   EXPECT_EQ("", string(scratch.begin(), scratch.end()));
 }
 
+TEST(HttpRequestTest, ProgressIsOk) {
+  // Imitate a steady progress.
+  FakeEnv env;
+  FakeLibCurl libcurl(
+      "test", 200,
+      {
+          std::make_tuple(100, 0) /* timestamp 100, 0 bytes */,
+          std::make_tuple(110, 0) /* timestamp 110, 0 bytes */,
+          std::make_tuple(200, 100) /* timestamp 200, 100 bytes */
+      },
+      &env);
+  HttpRequest http_request(&libcurl, &env);
+  TF_EXPECT_OK(http_request.Init());
+  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  TF_EXPECT_OK(http_request.Send());
+}
+
+TEST(HttpRequestTest, ProgressIsStuck) {
+  // Imitate a transmission that got stuck for more than a minute.
+  FakeEnv env;
+  FakeLibCurl libcurl(
+      "test", 200,
+      {
+          std::make_tuple(100, 10) /* timestamp 100, 10 bytes */,
+          std::make_tuple(130, 10) /* timestamp 130, 10 bytes */,
+          std::make_tuple(170, 10) /* timestamp 170, 10 bytes */
+      },
+      &env);
+  HttpRequest http_request(&libcurl, &env);
+  TF_EXPECT_OK(http_request.Init());
+  TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
+  auto status = http_request.Send();
+  EXPECT_EQ(error::UNAVAILABLE, status.code());
+  EXPECT_EQ(
+      "Error executing an HTTP request (HTTP response code 200, "
+      "error code 42, error message '')",
+      status.error_message());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index a3d5b9a6e4522780f65c6092dc8cbfa93e61b364..97d6617a040ae7d9c21c0c05b714a9f27a900aaa 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -43,7 +43,8 @@ constexpr char kJwtType[] = "JWT";
 constexpr char kGrantType[] =
     "urn%3Aietf%3Aparams%3Aoauth%3Agrant-type%3Ajwt-bearer";
 
-Status ReadJsonValue(Json::Value json, const string& name, Json::Value* value) {
+Status ReadJsonValue(const Json::Value& json, const string& name,
+                     Json::Value* value) {
   if (!value) {
     return errors::FailedPrecondition("'value' cannot be nullptr.");
   }
@@ -55,7 +56,8 @@ Status ReadJsonValue(Json::Value json, const string& name, Json::Value* value) {
   return Status::OK();
 }
 
-Status ReadJsonString(Json::Value json, const string& name, string* value) {
+Status ReadJsonString(const Json::Value& json, const string& name,
+                      string* value) {
   Json::Value json_value;
   TF_RETURN_IF_ERROR(ReadJsonValue(json, name, &json_value));
   if (!json_value.isString()) {
@@ -66,7 +68,7 @@ Status ReadJsonString(Json::Value json, const string& name, string* value) {
   return Status::OK();
 }
 
-Status ReadJsonInt(Json::Value json, const string& name, int64* value) {
+Status ReadJsonInt(const Json::Value& json, const string& name, int64* value) {
   Json::Value json_value;
   TF_RETURN_IF_ERROR(ReadJsonValue(json, name, &json_value));
   if (!json_value.isIntegral()) {
diff --git a/tensorflow/core/platform/cpu_info.cc b/tensorflow/core/platform/cpu_info.cc
index e119ad5e2a21b97157854afecaf677d725f28b6e..906826e6f834772449f5b8bd103b5424cdd0af35 100644
--- a/tensorflow/core/platform/cpu_info.cc
+++ b/tensorflow/core/platform/cpu_info.cc
@@ -68,7 +68,7 @@ int GetXCR0EAX() {
 
 // Structure for basic CPUID info
 class CPUIDInfo {
-public:
+ public:
   CPUIDInfo()
       : have_adx_(0),
         have_aes_(0),
@@ -115,12 +115,21 @@ public:
 
     uint32 eax, ebx, ecx, edx;
 
+    // Get vendor string (issue CPUID with eax = 0)
+    GETCPUID(eax, ebx, ecx, edx, 0, 0);
+    cpuid->vendor_str_.append(reinterpret_cast<char *>(&ebx), 4);
+    cpuid->vendor_str_.append(reinterpret_cast<char *>(&edx), 4);
+    cpuid->vendor_str_.append(reinterpret_cast<char *>(&ecx), 4);
+
     // To get general information and extended features we send eax = 1 and
     // ecx = 0 to cpuid.  The response is returned in eax, ebx, ecx and edx.
     // (See Intel 64 and IA-32 Architectures Software Developer's Manual
     // Volume 2A: Instruction Set Reference, A-M CPUID).
     GETCPUID(eax, ebx, ecx, edx, 1, 0);
 
+    cpuid->model_num_ = static_cast<int>((eax >> 4) & 0xf);
+    cpuid->family_ = static_cast<int>((eax >> 8) & 0xf);
+
     cpuid->have_aes_ = (ecx >> 25) & 0x1;
     cpuid->have_cmov_ = (edx >> 15) & 0x1;
     cpuid->have_cmpxchg16b_ = (ecx >> 13) & 0x1;
@@ -241,6 +250,10 @@ public:
     return false;
   }
 
+  string vendor_str() const { return vendor_str_; }
+  int family() const { return family_; }
+  int model_num() { return model_num_; }
+
  private:
   int highest_eax_;
   int have_adx_ : 1;
@@ -280,6 +293,9 @@ public:
   int have_sse4_2_ : 1;
   int have_ssse3_ : 1;
   int have_hypervisor_ : 1;
+  string vendor_str_;
+  int family_;
+  int model_num_;
 };
 
 std::once_flag cpuid_once_flag;
@@ -302,5 +318,32 @@ bool TestCPUFeature(CPUFeature feature) {
 #endif
 }
 
+std::string CPUVendorIDString() {
+#ifdef PLATFORM_IS_X86
+  InitCPUIDInfo();
+  return cpuid->vendor_str();
+#else
+  return "";
+#endif
+}
+
+int CPUFamily() {
+#ifdef PLATFORM_IS_X86
+  InitCPUIDInfo();
+  return cpuid->family();
+#else
+  return 0;
+#endif
+}
+
+int CPUModelNum() {
+#ifdef PLATFORM_IS_X86
+  InitCPUIDInfo();
+  return cpuid->model_num();
+#else
+  return 0;
+#endif
+}
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index f6eee478e8dc1fab7d8e9f0032e7bfe72b691d61..331f3e525169a93fa01739eefdf2dc6c588980a0 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_CPU_INFO_H_
 #define TENSORFLOW_PLATFORM_CPU_INFO_H_
 
+#include <string>
+
 #if defined(PLATFORM_WINDOWS)
 #include "tensorflow/core/platform/windows/cpu_info.h"
 #endif
@@ -92,6 +94,18 @@ enum CPUFeature {
 // Checks CPU registers to return hardware capabilities.
 bool TestCPUFeature(CPUFeature feature);
 
+// Returns CPU Vendor string (i.e. 'GenuineIntel', 'AuthenticAMD', etc.)
+std::string CPUVendorIDString();
+
+// Returns CPU family.
+int CPUFamily();
+
+// Returns CPU model number.
+int CPUModelNum();
+
+// Returns nominal core processor cycles per second of each processor.
+double NominalCPUFrequency();
+
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index f21a646ca1c530ae21017bc6c204119e4091b3b6..10414cbca26c404b000695ffca8799dab5ae3450 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -34,6 +34,7 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
       name = name + "_proto_srcs",
       srcs = srcs + tf_deps(protodeps, "_proto_srcs"),
       testonly = testonly,
+      visibility = visibility,
   )
 
   use_grpc_plugin = None
@@ -193,13 +194,15 @@ def tf_kernel_tests_linkstatic():
 
 def tf_additional_lib_defines():
   return select({
-      "//tensorflow:with_jemalloc": ["TENSORFLOW_USE_JEMALLOC"],
+      "//tensorflow:with_jemalloc_linux_x86_64": ["TENSORFLOW_USE_JEMALLOC"],
+      "//tensorflow:with_jemalloc_linux_ppc64le":["TENSORFLOW_USE_JEMALLOC"],
       "//conditions:default": [],
   })
 
 def tf_additional_lib_deps():
   return select({
-      "//tensorflow:with_jemalloc": ["@jemalloc"],
+      "//tensorflow:with_jemalloc_linux_x86_64": ["@jemalloc"],
+      "//tensorflow:with_jemalloc_linux_ppc64le": ["@jemalloc"],
       "//conditions:default": [],
   })
 
@@ -245,3 +248,9 @@ def tf_lib_proto_parsing_deps():
       ":protos_all_cc",
       "//tensorflow/core/platform/default/build_config:proto_parsing",
   ]
+
+def tf_additional_verbs_lib_defines():
+  return select({
+      "//tensorflow:with_verbs_support": ["TENSORFLOW_USE_VERBS"],
+      "//conditions:default": [],
+  })
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 0857010f7c9642a9544790865d41c632237427f9..9e3d5f354db4a224bd4015dc1437260b31c8face 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -58,6 +58,22 @@ cc_library(
     ],
 )
 
+# Dummy stream executor cuda plugins.
+cc_library(
+    name = "cublas_plugin",
+    srcs = [],
+)
+
+cc_library(
+    name = "cufft_plugin",
+    srcs = [],
+)
+
+cc_library(
+    name = "cudnn_plugin",
+    srcs = [],
+)
+
 # OSX framework for device driver access
 cc_library(
     name = "IOKit",
@@ -92,6 +108,7 @@ cc_library(
         "//tensorflow/core:protos_cc",
         "@com_googlesource_code_re2//:re2",
         "@farmhash_archive//:farmhash",
+        "@fft2d//:fft2d",
         "@highwayhash//:sip_hash",
         "@png_archive//:png",
     ],
@@ -170,11 +187,17 @@ cc_library(
 )
 
 filegroup(
-    name = "android_srcs",
+    name = "mobile_srcs",
     srcs = glob(["*.h"]),
     visibility = ["//visibility:public"],
 )
 
+alias(
+    name = "android_srcs",
+    actual = ":mobile_srcs",
+    visibility = ["//visibility:public"],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 79f97c12347b8cddc480caebfcbadf9bbaf635ab..eb804bfc786f34f9015b9d1319820d78f94d4403 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -22,3 +22,11 @@ def tf_additional_license_deps():
       "//tensorflow:with_xla_support": ["@llvm//:LICENSE.TXT"],
       "//conditions:default": [],
   })
+
+def tf_additional_verbs_deps():
+  return select({
+      "//tensorflow:with_verbs_support": [
+      "//tensorflow/contrib/verbs:verbs_server_lib",
+      "//tensorflow/contrib/verbs:grpc_verbs_client"], 
+      "//conditions:default": [],
+  })
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index d7299636168a1d696da70c3e491396999ab9f06a..2fdd989c9b97497c94bb035472df910a701b2692 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <deque>
+#include <utility>
 #include <vector>
 #if defined(__APPLE__)
 #include <mach-o/dyld.h>
@@ -95,7 +96,7 @@ Status Env::GetRegisteredFileSystemSchemes(std::vector<string>* schemes) {
 
 Status Env::RegisterFileSystem(const string& scheme,
                                FileSystemRegistry::Factory factory) {
-  return file_system_registry_->Register(scheme, factory);
+  return file_system_registry_->Register(scheme, std::move(factory));
 }
 
 Status Env::NewRandomAccessFile(const string& fname,
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index d0a249ddc19b4971aec34a1b7f56f23e7928f79b..b766c5d25b018c1ef787e3e11b3715d3c116f3a7 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -253,10 +253,10 @@ TEST_F(DefaultEnvTest, SleepForMicroseconds) {
   env_->SleepForMicroseconds(sleep_time);
   const int64 delta = env_->NowMicros() - start;
 
-  // Subtract 50 from the sleep_time for this check because NowMicros can
+  // Subtract 200 from the sleep_time for this check because NowMicros can
   // sometimes give slightly inconsistent values between the start and the
   // finish (e.g. because the two calls run on different CPUs).
-  EXPECT_GE(delta, sleep_time - 50);
+  EXPECT_GE(delta, sleep_time - 200);
 }
 
 class TmpDirFileSystem : public NullFileSystem {
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 3d7553e6da11aeff3e46073030eab1fdd29b001a..2abda4571458965c588b3e7bff7adc236ab2b71d 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -37,7 +37,7 @@ constexpr int kNumThreads = 8;
 
 // Run a function in parallel using a ThreadPool, but skip the ThreadPool
 // on the iOS platform due to its problems with more than a few threads.
-void ForEach(int first, int last, std::function<void(int)> f) {
+void ForEach(int first, int last, const std::function<void(int)>& f) {
 #if TARGET_OS_IPHONE
   for (int i = first; i < last; i++) {
     f(i);
diff --git a/tensorflow/core/platform/logging.h b/tensorflow/core/platform/logging.h
index 1ca36db548bf700e71bff834661020776d3d493d..985c061676c43e0c85e18dbf282786bed1f91b33 100644
--- a/tensorflow/core/platform/logging.h
+++ b/tensorflow/core/platform/logging.h
@@ -26,16 +26,6 @@ limitations under the License.
 #endif
 
 namespace tensorflow {
-namespace port {
-
-// Some platforms require that filenames be of a certain form when
-// used for logging.  This function is invoked to allow platforms to
-// adjust the filename used for logging appropriately, if necessary
-// (most ports can just do nothing).  If any changes are necessary, the
-// implementation should mutate "*filename" appropriately.
-void AdjustFilenameForLogging(string* filename);
-
-}  // namespace port
 
 namespace internal {
 // Emit "message" as a log message to the log for the specified
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index e2b9c586c8621f2bea62182c720a71c256189c1b..66c4ff37b90562828092707c73947299de3b137e 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -156,5 +156,10 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) {
 
 string Demangle(const char* mangled) { return mangled; }
 
+double NominalCPUFrequency() {
+  // TODO(yuefengz): implement it for this platform.
+  return 1.0;
+}
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/posix/subprocess.cc b/tensorflow/core/platform/posix/subprocess.cc
index fc511fdf727c1693672238c5573a064e91185b8f..cefc66831a9b9fe11a170013e64ff7a1cd2e2bcd 100644
--- a/tensorflow/core/platform/posix/subprocess.cc
+++ b/tensorflow/core/platform/posix/subprocess.cc
@@ -28,7 +28,7 @@ limitations under the License.
 // A danger of calling fork() (as opposed to clone() or vfork()) is that if
 // many people have used pthread_atfork() to acquire locks, fork() can deadlock,
 // because it's unlikely that the locking order will be correct in a large
-// programme where different layers are unaware of one another and using
+// program where different layers are unaware of one another and using
 // pthread_atfork() independently.
 //
 // The danger of not calling fork() is that if libc managed to use
diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h
index a34b23eb9de4e572d54e3c78cf22646dc17d2733..ff3aa4a6f1650a81772b88b2a383bfe9bc73fc3e 100644
--- a/tensorflow/core/platform/tracing.h
+++ b/tensorflow/core/platform/tracing.h
@@ -215,6 +215,12 @@ class Tracing::TraceMe {
  public:
   explicit TraceMe(StringPiece name);
 
+  // If tracing is enabled, set up a traceMe with a label of
+  // "<name_part1>:<name_part2>".  This can be cheaper than the
+  // single-argument constructor because the concatenation of the
+  // label string is only done if tracing is enabled.
+  TraceMe(StringPiece name_part1, StringPiece name_part2);
+
  private:
   std::unique_ptr<Engine::Tracer> tracer_;
 };
@@ -242,6 +248,15 @@ inline Tracing::TraceMe::TraceMe(StringPiece name) {
   }
 }
 
+inline Tracing::TraceMe::TraceMe(StringPiece name_part1,
+                                 StringPiece name_part2) {
+  auto e = Tracing::engine();
+  if (e && e->IsEnabled()) {
+    tracer_.reset(
+        e->StartTracing(strings::StrCat(name_part1, ":", name_part2)));
+  }
+}
+
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index fc969c1a05db43784c6c87095ace9c54d1beeb31..85b53e07c439e02f63d4600c57c925f3b8d843b9 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -58,21 +58,20 @@ int NumSchedulableCPUs() {
 
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #ifdef TENSORFLOW_USE_JEMALLOC
-    void* ptr = NULL;
-    // posix_memalign requires that the requested alignment be at least
-    // sizeof(void*). In this case, fall back on malloc which should return
-    // memory aligned to at least the size of a pointer.
-    const int required_alignment = sizeof(void*);
-    if (minimum_alignment < required_alignment) return Malloc(size);
-    int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
-    if (err != 0) {
-        return NULL;
-    }
-    else {
-        return ptr;
-    }
+  void* ptr = NULL;
+  // posix_memalign requires that the requested alignment be at least
+  // sizeof(void*). In this case, fall back on malloc which should return
+  // memory aligned to at least the size of a pointer.
+  const int required_alignment = sizeof(void*);
+  if (minimum_alignment < required_alignment) return Malloc(size);
+  int err = jemalloc_posix_memalign(&ptr, minimum_alignment, size);
+  if (err != 0) {
+    return NULL;
+  } else {
+    return ptr;
+  }
 #else
-    return _aligned_malloc(size, minimum_alignment);
+  return _aligned_malloc(size, minimum_alignment);
 #endif
 }
 
@@ -149,5 +148,10 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) {
 
 string Demangle(const char* mangled) { return mangled; }
 
+double NominalCPUFrequency() {
+  // TODO(yuefengz): implement it for this platform.
+  return 1.0;
+}
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/protobuf/cluster.proto b/tensorflow/core/protobuf/cluster.proto
new file mode 100644
index 0000000000000000000000000000000000000000..33c87eefe022eee98ba7543e6623a04df540ffc9
--- /dev/null
+++ b/tensorflow/core/protobuf/cluster.proto
@@ -0,0 +1,82 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "ClusterProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.distruntime";
+
+// This file contains protos to be used when defining a TensorFlow
+// cluster.
+//
+// EXAMPLES
+// --------
+//
+// 1. A single-process cluster, containing "/job:local/task:0".
+//
+//    Cluster:
+//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' } }
+//
+//    Server:
+//      cluster { $CLUSTER } job_name: 'local' task_index: 0
+//
+// 2. A two-process cluster, containing "/job:local/task:{0,1}".
+//
+//    Cluster:
+//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' }
+//                          tasks { key: 1 value: 'localhost:2223' } }
+//
+//    Servers:
+//      cluster { $CLUSTER } job_name: 'local' task_index: 0
+//      cluster { $CLUSTER } job_name: 'local' task_index: 1
+//
+// 3. A two-job cluster, containing "/job:worker/task:{0,1,2}" and
+//    "/job:ps/task:{0,1}".
+//
+//    Cluster:
+//      job { name: 'worker' tasks { key: 0 value: 'worker1:2222' }
+//                           tasks { key: 1 value: 'worker2:2222' }
+//                           tasks { key: 2 value: 'worker3:2222' } }
+//      job { name: 'ps'     tasks { key: 0 value: 'ps0:2222' }
+//                           tasks { key: 1 value: 'ps1:2222' } }
+//
+//    Servers:
+//      cluster { $CLUSTER } job_name: 'worker' task_index: 0
+//      cluster { $CLUSTER } job_name: 'worker' task_index: 1
+//      cluster { $CLUSTER } job_name: 'worker' task_index: 2
+//      cluster { $CLUSTER } job_name: 'ps'     task_index: 0
+//      cluster { $CLUSTER } job_name: 'ps'     task_index: 1
+
+// Defines a single job in a TensorFlow cluster.
+message JobDef {
+  // The name of this job.
+  string name = 1;
+
+  // Mapping from task ID to "hostname:port" string.
+  //
+  // If the `name` field contains "worker", and the `tasks` map contains a
+  // mapping from 7 to "example.org:2222", then the device prefix
+  // "/job:worker/task:7" will be assigned to "example.org:2222".
+  map<int32, string> tasks = 2;
+}
+
+// Defines a TensorFlow cluster as a set of jobs.
+message ClusterDef {
+  // The jobs that comprise the cluster.
+  repeated JobDef job = 1;
+}
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 98e7b171d28043cebab9cd9a4547304ac43e067d..630f47633f87d1dfddb6eddbb18ea13a3575ddc4 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -10,6 +10,7 @@ import "tensorflow/core/framework/cost_graph.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/step_stats.proto";
 import "tensorflow/core/protobuf/debug.proto";
+import "tensorflow/core/protobuf/cluster.proto";
 import "tensorflow/core/protobuf/rewriter_config.proto";
 
 message GPUOptions {
@@ -64,6 +65,18 @@ message GPUOptions {
   // PollEvents calls, when the queue is empty.  If value is not
   // set or set to 0, gets set to a non-zero default.
   int32 polling_inactive_delay_msecs = 7;
+
+  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
+  // enabling this option forces all CPU tensors to be allocated with Cuda
+  // pinned memory. Normally, TensorFlow will infer which tensors should be
+  // allocated as the pinned memory. But in case where the inference is
+  // incomplete, this option can significantly speed up the cross-device memory
+  // copy performance as long as it fits the memory.
+  // Note that this option is not something that should be
+  // enabled by default for unknown or very large models, since all Cuda pinned
+  // memory is unpageable, having too much pinned memory might negatively impact
+  // the overall host system performance.
+  bool force_gpu_compatible = 8;
 };
 
 // Options passed to the graph optimizer
@@ -247,6 +260,11 @@ message ConfigProto {
 
   // Options that apply when this session uses the distributed runtime.
   RPCOptions rpc_options = 13;
+
+  // Optional list of all workers to use in this session.
+  ClusterDef cluster_def = 14;
+
+  // Next: 15
 };
 
 // Options for a single Run() call.
diff --git a/tensorflow/core/protobuf/control_flow.proto b/tensorflow/core/protobuf/control_flow.proto
index 24f42322c0fe858641f37462cd9a1475b1e48ab2..48f503225447c26f8959ba379656361292052b44 100644
--- a/tensorflow/core/protobuf/control_flow.proto
+++ b/tensorflow/core/protobuf/control_flow.proto
@@ -61,6 +61,9 @@ message WhileContextDef {
   // List of names for exit tensors.
   repeated string loop_exit_names = 8;
 
+  // List of names for enter tensors.
+  repeated string loop_enter_names = 10;
+
   // Values and external values in control flow context.
   ValuesDef values_def = 9;
 }
diff --git a/tensorflow/core/protobuf/device_properties.proto b/tensorflow/core/protobuf/device_properties.proto
new file mode 100644
index 0000000000000000000000000000000000000000..9b1497c710d40c4c5a989f80ae0d98ee2a2dc3a8
--- /dev/null
+++ b/tensorflow/core/protobuf/device_properties.proto
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "DevicePropertiesProtos";
+
+message DeviceProperties {
+  // Device type (CPU, GPU, ...)
+  string type = 1;
+  // Vendor (Intel, nvidia, ...)
+  string vendor = 2;
+  // Model (Haswell, K40, ...)
+  string model = 3;
+  // Core Frequency in Mhz
+  int64 frequency = 4;
+  // Number of cores
+  int64 num_cores = 5;
+  // Version of the tools and libraries used with this device (e.g. gcc 4.9,
+  // cudnn 5.1)
+  map<string, string> environment = 6;
+  // Number of registers per core.
+  int64 num_registers = 7;
+  // L1 cache size in bytes
+  int64 l1_cache_size = 8;
+  // L2 cache size in bytes
+  int64 l2_cache_size = 9;
+  // L3 cache size in bytes
+  int64 l3_cache_size = 10;
+  // Shared memory size per multiprocessor in bytes. This field is
+  // applicable to GPUs only.
+  int64 shared_memory_size_per_multiprocessor = 11;
+  // Memory size in bytes
+  int64 memory_size = 12;
+  // Memory bandwidth in KB/s
+  int64 bandwidth = 13;
+}
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index de91b6133e4924463ab2bbf1a5b43a1996cd13f8..0a825bbb928c3aec583db6dd0b5ee151bc37c9c0 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -38,6 +38,9 @@ message CreateSessionRequest {
 
   // Configuration options.
   ConfigProto config = 2;
+
+  // The target string used from the client's perspective.
+  string target = 3;
 }
 
 message CreateSessionResponse {
@@ -199,7 +202,7 @@ message CloseSessionResponse {
 // Old sessions may continue to have side-effects on resources not in
 // containers listed in "containers", and thus may affect future
 // sessions' results in ways that are hard to predict.  Thus, if well-defined
-// behaviour is desired, is it recommended that all containers be listed in
+// behavior is desired, is it recommended that all containers be listed in
 // "containers".  Similarly, if a device_filter is specified, results may be
 // hard to predict.
 message ResetRequest {
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index aef69461d882c1570faeff7ddf715fb73dc580af..753edba4b848e32dd6f71233452ec682342a15c3 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -6,7 +6,27 @@ option java_outer_classname = "RewriterConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
 
+message AutoParallelOptions {
+  bool enable = 1;
+  int32 num_replicas = 2;
+}
+
 message RewriterConfig {
   bool optimize_tensor_layout = 1;
   bool disable_model_pruning = 2;
+  bool constant_folding = 3;
+
+  enum MemOptType {
+    // Fully disabled
+    NO_MEM_OPT = 0;
+    // Driven by manual annotations
+    MANUAL = 1;
+  }
+  MemOptType memory_optimization = 4;
+
+  AutoParallelOptions auto_parallel = 5;
+
+  // If non-empty, will use this as an alternative way to specify a list of
+  // optimizations to turn on and the order of the optimizations.
+  repeated string optimizers = 100;
 }
diff --git a/tensorflow/core/protobuf/tensorflow_server.proto b/tensorflow/core/protobuf/tensorflow_server.proto
index c4077bd98e452f9ed2338ada29bfd5400ebbeff3..6199e707e5ad034d9888daa49c13bd87b02b171c 100644
--- a/tensorflow/core/protobuf/tensorflow_server.proto
+++ b/tensorflow/core/protobuf/tensorflow_server.proto
@@ -16,6 +16,7 @@ limitations under the License.
 syntax = "proto3";
 
 import "tensorflow/core/protobuf/config.proto";
+import "tensorflow/core/protobuf/cluster.proto";
 
 package tensorflow;
 option cc_enable_arenas = true;
@@ -23,69 +24,6 @@ option java_outer_classname = "ServerProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
 
-// This file contains protos to be used when defining a TensorFlow
-// cluster, and a server within that cluster.
-//
-// EXAMPLES
-// --------
-//
-// 1. A single-process cluster, containing "/job:local/task:0".
-//
-//    Cluster:
-//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' } }
-//
-//    Server:
-//      cluster { $CLUSTER } job_name: 'local' task_index: 0
-//
-// 2. A two-process cluster, containing "/job:local/task:{0,1}".
-//
-//    Cluster:
-//      job { name: 'local' tasks { key: 0 value: 'localhost:2222' }
-//                          tasks { key: 1 value: 'localhost:2223' } }
-//
-//    Servers:
-//      cluster { $CLUSTER } job_name: 'local' task_index: 0
-//      cluster { $CLUSTER } job_name: 'local' task_index: 1
-//
-// 3. A two-job cluster, containing "/job:worker/task:{0,1,2}" and
-//    "/job:ps/task:{0,1}".
-//
-//    Cluster:
-//      job { name: 'worker' tasks { key: 0 value: 'worker1:2222' }
-//                           tasks { key: 1 value: 'worker2:2222' }
-//                           tasks { key: 2 value: 'worker3:2222' } }
-//      job { name: 'ps'     tasks { key: 0 value: 'ps0:2222' }
-//                           tasks { key: 1 value: 'ps1:2222' } }
-//
-//    Servers:
-//      cluster { $CLUSTER } job_name: 'worker' task_index: 0
-//      cluster { $CLUSTER } job_name: 'worker' task_index: 1
-//      cluster { $CLUSTER } job_name: 'worker' task_index: 2
-//      cluster { $CLUSTER } job_name: 'ps'     task_index: 0
-//      cluster { $CLUSTER } job_name: 'ps'     task_index: 1
-
-// Defines a single job in a TensorFlow cluster.
-message JobDef {
-  // The name of this job.
-  string name = 1;
-
-  // Mapping from task ID to "hostname:port" string.
-  //
-  // If the `name` field contains "worker", and the `tasks` map contains a
-  // mapping from 7 to "example.org:2222", then the device prefix
-  // "/job:worker/task:7" will be assigned to "example.org:2222".
-  //
-  // NOTE(mrry): Currently, only a dense task ID space starting at 0 is
-  // supported.
-  map<int32, string> tasks = 2;
-}
-
-// Defines a TensorFlow cluster as a set of jobs.
-message ClusterDef {
-  // The jobs that comprise the cluster.
-  repeated JobDef job = 1;
-}
-
 // Defines the configuration of a single TensorFlow server.
 message ServerDef {
   // The cluster of which this server is a member.
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index e3af1119e96b0522512e6287cf736d70e2fb7659..cf05aece39a1b9c23fe1c4597177655659182e15 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -28,6 +28,7 @@ import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/protobuf/config.proto";
+import "tensorflow/core/protobuf/debug.proto";
 import "tensorflow/core/protobuf/named_tensor.proto";
 import "tensorflow/core/protobuf/tensorflow_server.proto";
 
@@ -92,6 +93,9 @@ message RegisterGraphRequest {
 
   // Configuration options for the session in which this graph was created.
   GraphOptions graph_options = 4;
+
+  // Field(s) used by TensorFlow Debugger (tfdbg).
+  DebugOptions debug_options = 5;
 }
 
 message RegisterGraphResponse {
@@ -115,6 +119,10 @@ message RegisterGraphResponse {
 ////////////////////////////////////////////////////////////////////////////////
 
 message DeregisterGraphRequest {
+  // The session_handle used when registering the graph. If session_handle is
+  // empty, a single global namespace is used.
+  string session_handle = 2;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -163,6 +171,12 @@ message ExecutorOpts {
 };
 
 message RunGraphRequest {
+  // session_handle is the the master-generated unique id for this session.
+  // If session_handle is non-empty, it must be the same as used when
+  // registering the graph. If it is empty, a single global namespace is used to
+  // search for the graph_handle.
+  string session_handle = 8;
+
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
   string graph_handle = 1;
@@ -189,6 +203,8 @@ message RunGraphRequest {
   bool is_partial = 6;
   // True if this is the last partial run request in a sequence of requests.
   bool is_last_partial_run = 7;
+
+  // Next: 9
 }
 
 message RunGraphResponse {
diff --git a/tensorflow/core/public/session.h b/tensorflow/core/public/session.h
index eaa076ffb91ef9eeac1e0bf27e8c81c35e0e67e8..4792b32a529fc6a2953b2bdd4feb4c985e4a1c0e 100644
--- a/tensorflow/core/public/session.h
+++ b/tensorflow/core/public/session.h
@@ -199,7 +199,7 @@ Status NewSession(const SessionOptions& options, Session** out_session);
 /// Old sessions may continue to have side-effects on resources not in
 /// containers listed in "containers", and thus may affect future
 /// sessions' results in ways that are hard to predict.  Thus, if well-defined
-/// behaviour is desired, it is recommended that all containers be listed in
+/// behavior is desired, it is recommended that all containers be listed in
 /// "containers".
 ///
 /// `containers` is a vector of string representation of resource container
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index f7278c35020ffe07345d6722f91659b9e26e4a17..566d9aa9084f544090b8dd54be0775f06fe45113 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -20,11 +20,11 @@ limitations under the License.
 
 #define TF_MAJOR_VERSION 1
 #define TF_MINOR_VERSION 1
-#define TF_PATCH_VERSION 0-rc0
+#define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc2"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
@@ -84,10 +84,14 @@ limitations under the License.
 //     Some earlier graph def versions allowed this.
 // 21. Dropped FunctionDef.Node support, switched to node_def introduced
 //     in version 12. (11jan2017)
+// 22. Placeholder now can specify and enforce scalar and partial
+//     shapes, particularly when restoring a graph from GraphDef
+//     produced at version 22 or later.  (04/10/2016)
+// 23. Remove NonMaxSuppression in favor of NonMaxSuppressionV2.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 21
+#define TF_GRAPH_DEF_VERSION 23
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 03eb076f30ab5546a8e729738265250c2a5ce641..8373eb1f9e7e13cb7097904a075302164aaf5d80 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -93,6 +93,22 @@ bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
   return false;
 }
 
+bool ParseFloatFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
+                    float* dst, bool* value_parsing_ok) {
+  *value_parsing_ok = true;
+  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+    char extra;
+    if (sscanf(arg.data(), "%f%c", dst, &extra) != 1) {
+      LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
+                 << ".";
+      *value_parsing_ok = false;
+    }
+    return true;
+  }
+
+  return false;
+}
+
 }  // namespace
 
 Flag::Flag(const char* name, tensorflow::int32* dst, const string& usage_text)
@@ -116,6 +132,12 @@ Flag::Flag(const char* name, string* dst, const string& usage_text)
       string_value_(dst),
       usage_text_(usage_text) {}
 
+Flag::Flag(const char* name, float* dst, const string& usage_text)
+    : name_(name),
+      type_(TYPE_FLOAT),
+      float_value_(dst),
+      usage_text_(usage_text) {}
+
 bool Flag::Parse(string arg, bool* value_parsing_ok) const {
   bool result = false;
   if (type_ == TYPE_INT) {
@@ -126,6 +148,8 @@ bool Flag::Parse(string arg, bool* value_parsing_ok) const {
     result = ParseBoolFlag(arg, name_, bool_value_, value_parsing_ok);
   } else if (type_ == TYPE_STRING) {
     result = ParseStringFlag(arg, name_, string_value_, value_parsing_ok);
+  } else if (type_ == TYPE_FLOAT) {
+    result = ParseFloatFlag(arg, name_, float_value_, value_parsing_ok);
   }
   return result;
 }
@@ -195,6 +219,10 @@ bool Flag::Parse(string arg, bool* value_parsing_ok) const {
       type_name = "string";
       flag_string = strings::Printf("--%s=\"%s\"", flag.name_.c_str(),
                                     flag.string_value_->c_str());
+    } else if (flag.type_ == Flag::TYPE_FLOAT) {
+      type_name = "float";
+      flag_string =
+          strings::Printf("--%s=%f", flag.name_.c_str(), *flag.float_value_);
     }
     strings::Appendf(&usage_text, "\t%-33s\t%s\t%s\n", flag_string.c_str(),
                      type_name, flag.usage_text_.c_str());
diff --git a/tensorflow/core/util/command_line_flags.h b/tensorflow/core/util/command_line_flags.h
index 2c77d7874fd3fa5fee81aaf4c25977208b7eaada..f349df16fd478c533a36c8503a711b768d49eea0 100644
--- a/tensorflow/core/util/command_line_flags.h
+++ b/tensorflow/core/util/command_line_flags.h
@@ -65,6 +65,7 @@ class Flag {
   Flag(const char* name, int64* dst1, const string& usage_text);
   Flag(const char* name, bool* dst, const string& usage_text);
   Flag(const char* name, string* dst, const string& usage_text);
+  Flag(const char* name, float* dst, const string& usage_text);
 
  private:
   friend class Flags;
@@ -72,11 +73,12 @@ class Flag {
   bool Parse(string arg, bool* value_parsing_ok) const;
 
   string name_;
-  enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING } type_;
+  enum { TYPE_INT, TYPE_INT64, TYPE_BOOL, TYPE_STRING, TYPE_FLOAT } type_;
   int* int_value_;
   int64* int64_value_;
   bool* bool_value_;
   string* string_value_;
+  float* float_value_;
   string usage_text_;
 };
 
diff --git a/tensorflow/core/util/command_line_flags_test.cc b/tensorflow/core/util/command_line_flags_test.cc
index b002e35899eb6ba7d066ac0b57fddaa5551f6bb2..62025463af7b869354b7ee89a828ab07ea835805 100644
--- a/tensorflow/core/util/command_line_flags_test.cc
+++ b/tensorflow/core/util/command_line_flags_test.cc
@@ -32,29 +32,35 @@ std::vector<char *> CharPointerVectorFromStrings(
   }
   return result;
 }
-}
+}  // namespace
 
 TEST(CommandLineFlagsTest, BasicUsage) {
   int some_int = 10;
   int64 some_int64 = 21474836470;  // max int32 is 2147483647
   bool some_switch = false;
   string some_name = "something";
-  int argc = 5;
-  std::vector<string> argv_strings = {
-      "program_name", "--some_int=20", "--some_int64=214748364700",
-      "--some_switch", "--some_name=somethingelse"};
+  float some_float = -23.23f;
+  int argc = 6;
+  std::vector<string> argv_strings = {"program_name",
+                                      "--some_int=20",
+                                      "--some_int64=214748364700",
+                                      "--some_switch",
+                                      "--some_name=somethingelse",
+                                      "--some_float=42.0"};
   std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
   bool parsed_ok =
       Flags::Parse(&argc, argv_array.data(),
                    {Flag("some_int", &some_int, "some int"),
                     Flag("some_int64", &some_int64, "some int64"),
                     Flag("some_switch", &some_switch, "some switch"),
-                    Flag("some_name", &some_name, "some name")});
+                    Flag("some_name", &some_name, "some name"),
+                    Flag("some_float", &some_float, "some float")});
   EXPECT_EQ(true, parsed_ok);
   EXPECT_EQ(20, some_int);
   EXPECT_EQ(214748364700, some_int64);
   EXPECT_EQ(true, some_switch);
   EXPECT_EQ("somethingelse", some_name);
+  EXPECT_NEAR(42.0f, some_float, 1e-5f);
   EXPECT_EQ(argc, 1);
 }
 
@@ -85,6 +91,21 @@ TEST(CommandLineFlagsTest, BadBoolValue) {
   EXPECT_EQ(argc, 1);
 }
 
+TEST(CommandLineFlagsTest, BadFloatValue) {
+  float some_float = -23.23f;
+  int argc = 2;
+  std::vector<string> argv_strings = {"program_name",
+                                      "--some_float=notanumber"};
+  std::vector<char *> argv_array = CharPointerVectorFromStrings(argv_strings);
+  bool parsed_ok =
+      Flags::Parse(&argc, argv_array.data(),
+                   {Flag("some_float", &some_float, "some float")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_NEAR(-23.23f, some_float, 1e-5f);
+  EXPECT_EQ(argc, 1);
+}
+
 // Return whether str==pat, but allowing any whitespace in pat
 // to match zero or more whitespace characters in str.
 static bool MatchWithAnyWhitespace(const string &str, const string &pat) {
@@ -111,6 +132,8 @@ TEST(CommandLineFlagsTest, UsageString) {
   int64 some_int64 = 21474836470;  // max int32 is 2147483647
   bool some_switch = false;
   string some_name = "something";
+  // Don't test float in this case, because precision is hard to predict and
+  // match against, and we don't want a flakey test.
   const string tool_name = "some_tool_name";
   string usage = Flags::Usage(tool_name + "<flags>",
                               {Flag("some_int", &some_int, "some int"),
diff --git a/tensorflow/core/util/ctc/BUILD b/tensorflow/core/util/ctc/BUILD
index 6dfb0bd731f0aed682031a11608b4cc1781f69f5..357b2535515432a97f5a24b8670e4e3a03db6359 100644
--- a/tensorflow/core/util/ctc/BUILD
+++ b/tensorflow/core/util/ctc/BUILD
@@ -9,7 +9,7 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_tests")
 
 filegroup(
-    name = "android_srcs",
+    name = "mobile_srcs",
     srcs = [
         "ctc_beam_entry.h",
         "ctc_beam_scorer.h",
@@ -19,6 +19,11 @@ filegroup(
     ],
 )
 
+alias(
+    name = "android_srcs",
+    actual = ":mobile_srcs",
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/core/util/ctc/ctc_decoder.h b/tensorflow/core/util/ctc/ctc_decoder.h
index 294419e907e280c28627b6ef9b41e3e3852d1191..5b28aeb70ad4bd91800dda824f0bdffd5fcbea7c 100644
--- a/tensorflow/core/util/ctc/ctc_decoder.h
+++ b/tensorflow/core/util/ctc/ctc_decoder.h
@@ -89,7 +89,6 @@ class CTCGreedyDecoder : public CTCDecoder {
       std::vector<int>& output_b = (*output)[0][b];
 
       int prev_class_ix = -1;
-      std::vector<int> transcription;
       (*scores)(b, 0) = 0;
       for (int t = 0; t < seq_len_b; ++t) {
         auto row = input[t].row(b);
@@ -98,7 +97,6 @@ class CTCGreedyDecoder : public CTCDecoder {
         if (max_class_ix != blank_index_ &&
             !(merge_repeated_ && max_class_ix == prev_class_ix)) {
           output_b.push_back(max_class_ix);
-          transcription.push_back(max_class_ix);
         }
         prev_class_ix = max_class_ix;
       }
diff --git a/tensorflow/core/util/ctc/ctc_loss_calculator.h b/tensorflow/core/util/ctc/ctc_loss_calculator.h
index eacadd65af8aaf00ff206406092f25ca6711ead8..f181ab93e77688b67a91e661405c39156ef2ff86 100644
--- a/tensorflow/core/util/ctc/ctc_loss_calculator.h
+++ b/tensorflow/core/util/ctc/ctc_loss_calculator.h
@@ -48,7 +48,7 @@ class CTCLossCalculator {
   // these examples.
   //
   // Reference materials:
-  //  GravesTh: Alex Graves, "Supervised Sequence Labelling with Recurrent
+  //  GravesTh: Alex Graves, "Supervised Sequence Labeling with Recurrent
   //    Neural Networks" (PhD Thesis), Technische Universit¨at M¨unchen.
  public:
   typedef std::vector<std::vector<int>> LabelSequences;
@@ -65,7 +65,8 @@ class CTCLossCalculator {
   Status CalculateLoss(const VectorIn& seq_len, const LabelSequences& labels,
                        const std::vector<MatrixIn>& inputs,
                        bool preprocess_collapse_repeated,
-                       bool ctc_merge_repeated, VectorOut* loss,
+                       bool ctc_merge_repeated,
+                       bool ignore_longer_outputs_than_inputs, VectorOut* loss,
                        std::vector<MatrixOut>* gradients,
                        DeviceBase::CpuWorkerThreads* workers = nullptr) const;
 
@@ -90,7 +91,8 @@ class CTCLossCalculator {
   // batch.  Return value:
   //    max_{b in batch_size} l_primes[b].size()
   template <typename Vector>
-  Status PopulateLPrimes(bool preprocess_collapse_repeated, int batch_size,
+  Status PopulateLPrimes(bool preprocess_collapse_repeated,
+                         bool ignore_longer_outputs_than_inputs, int batch_size,
                          int num_classes, const Vector& seq_len,
                          const LabelSequences& labels, size_t* max_u_prime,
                          LabelSequences* l_primes) const;
@@ -108,7 +110,8 @@ template <typename VectorIn, typename VectorOut, typename MatrixIn,
 Status CTCLossCalculator::CalculateLoss(
     const VectorIn& seq_len, const LabelSequences& labels,
     const std::vector<MatrixIn>& inputs, bool preprocess_collapse_repeated,
-    bool ctc_merge_repeated, VectorOut* loss, std::vector<MatrixOut>* gradients,
+    bool ctc_merge_repeated, bool ignore_longer_outputs_than_inputs,
+    VectorOut* loss, std::vector<MatrixOut>* gradients,
     DeviceBase::CpuWorkerThreads* workers) const {
   auto num_time_steps = inputs.size();
 
@@ -155,20 +158,31 @@ Status CTCLossCalculator::CalculateLoss(
   // and calculate the maximum necessary allocation size.
   LabelSequences l_primes(batch_size);
   size_t max_u_prime = 0;
-  Status l_p_ret =
-      PopulateLPrimes(preprocess_collapse_repeated, batch_size, num_classes,
-                      seq_len, labels, &max_u_prime, &l_primes);
+  Status l_p_ret = PopulateLPrimes(
+      preprocess_collapse_repeated, ignore_longer_outputs_than_inputs,
+      batch_size, num_classes, seq_len, labels, &max_u_prime, &l_primes);
   if (!l_p_ret.ok()) {
     return l_p_ret;
   }
 
   // Process each item in a batch in parallel, using at most kMaxThreads.
-  auto ComputeLossAndGradients = [this, num_classes, &l_primes, &seq_len,
-                                  &inputs, requires_backprop,
-                                  ctc_merge_repeated, &loss, &gradients](
-      int64 start_row, int64 limit_row) {
+  auto ComputeLossAndGradients = [this, num_classes, &labels, &l_primes,
+                                  &seq_len, &inputs, requires_backprop,
+                                  ctc_merge_repeated,
+                                  ignore_longer_outputs_than_inputs, &loss,
+                                  &gradients](int64 start_row,
+                                              int64 limit_row) {
     for (int b = start_row; b < limit_row; b++) {
-      if (seq_len(b) == 0) {
+      // Return zero gradient for empty sequences or sequences with labels
+      // longer than input, which is not supported by CTC.
+      if (seq_len(b) == 0 ||
+          (ignore_longer_outputs_than_inputs &&
+           labels[b].size() > seq_len(b) - this->output_delay_)) {
+        VLOG(1) << "The sequence length is either zero or shorter than the "
+                   "target output (CTC works only with shorter target sequence "
+                   "than input sequence). You can turn this into a warning by "
+                   "using the flag ignore_longer_outputs_than_inputs - "
+                << b << ": " << str_util::Join(labels[b], " ");
         continue;
       }
 
@@ -263,12 +277,11 @@ Status CTCLossCalculator::CalculateLoss(
 }
 
 template <typename Vector>
-Status CTCLossCalculator::PopulateLPrimes(bool preprocess_collapse_repeated,
-                                          int batch_size, int num_classes,
-                                          const Vector& seq_len,
-                                          const LabelSequences& labels,
-                                          size_t* max_u_prime,
-                                          LabelSequences* l_primes) const {
+Status CTCLossCalculator::PopulateLPrimes(
+    bool preprocess_collapse_repeated, bool ignore_longer_outputs_than_inputs,
+    int batch_size, int num_classes, const Vector& seq_len,
+    const LabelSequences& labels, size_t* max_u_prime,
+    LabelSequences* l_primes) const {
   // labels is a Label array of size batch_size
   if (labels.size() != batch_size) {
     return errors::InvalidArgument("labels.size() != batch_size: ",
@@ -311,9 +324,6 @@ Status CTCLossCalculator::PopulateLPrimes(bool preprocess_collapse_repeated,
       }
     }
 
-    // Make sure there is enough time to output the target indices.
-    int time = seq_len(b) - output_delay_;
-    int required_time = label.size();
     for (int l_i : l) {
       if (l_i < 0) {
         return errors::InvalidArgument(
@@ -325,14 +335,19 @@ Status CTCLossCalculator::PopulateLPrimes(bool preprocess_collapse_repeated,
             num_classes, ", batch: ", b, " labels: ", str_util::Join(l, ","));
       }
     }
-    if (required_time > time) {
-      return errors::InvalidArgument(
-          "Not enough time for target transition sequence ("
-          "required: ",
-          required_time, ", available: ", time,
-          "), skipping data instance in batch: ", b);
+    if (!ignore_longer_outputs_than_inputs) {
+      // Make sure there is enough time to output the target indices.
+      int time = seq_len(b) - output_delay_;
+      int required_time = label.size();
+      if (required_time > time) {
+        return errors::InvalidArgument(
+            "Not enough time for target transition sequence ("
+            "required: ",
+            required_time, ", available: ", time, ")", b,
+            "You can turn this error into a warning by using the flag "
+            "ignore_longer_outputs_than_inputs");
+      }
     }
-
     // Target indices with blanks before each index and a blank at the end.
     // Length U' = 2U + 1.
     // Convert l to l_prime
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 8bb4ca8ff84a43b3ecba2950f7f8a6ddb4c40b71..46ea68687c7e7c7ce495992d59762dac188b7857 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -63,6 +63,28 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
   return config;
 }
 
+// Calculate the Cuda launch config we should use for a kernel launch. This
+// variant takes the resource limits of func into account to maximize occupancy.
+template <typename DeviceFunc>
+inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
+                                            const GPUDevice& d, DeviceFunc func,
+                                            size_t dynamic_shared_memory_size) {
+  int block_count = 0;
+  int thread_per_block = 0;
+  cudaOccupancyMaxPotentialBlockSize(&block_count, &thread_per_block, func,
+                                     dynamic_shared_memory_size,
+                                     work_element_count);
+  block_count =
+      std::min(block_count,
+               (work_element_count + thread_per_block - 1) / thread_per_block);
+
+  CudaLaunchConfig config;
+  config.virtual_thread_count = work_element_count;
+  config.thread_per_block = thread_per_block;
+  config.block_count = block_count;
+  return config;
+}
+
 struct Cuda2DLaunchConfig {
   dim3 virtual_thread_count;
   dim3 thread_per_block;
@@ -128,6 +150,28 @@ __device__ __host__ inline T ldg(const T* address) {
 #endif
 }
 
+template <>
+__device__ __host__ inline std::complex<float> ldg(
+    const std::complex<float>* address) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
+  return std::complex<float>(mem.x, mem.y);
+#else
+  return *address;
+#endif
+}
+
+template <>
+__device__ __host__ inline std::complex<double> ldg(
+    const std::complex<double>* address) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
+  return std::complex<double>(mem.x, mem.y);
+#else
+  return *address;
+#endif
+}
+
 // CUDA provides atomic ops, but not for all types.  We provide wrappers
 // for some ops and provide implementation for all reasonable types.
 #define CUDA_ATOMIC_WRAPPER(op, T) \
diff --git a/tensorflow/core/util/equal_graph_def.cc b/tensorflow/core/util/equal_graph_def.cc
index 7e7a3f52236cf712a406e35a6943f1108b1b89a3..2db026da56c8bb1dd1f563a240068d01daa5b00b 100644
--- a/tensorflow/core/util/equal_graph_def.cc
+++ b/tensorflow/core/util/equal_graph_def.cc
@@ -28,13 +28,18 @@ bool EqualGraphDef(const GraphDef& actual, const GraphDef& expected,
                    string* diff, const EqualGraphDefOptions& options) {
   // Intentionally do not check that versions match so that this routine can
   // be used for less brittle golden file tests.
+  return EqualRepeatedNodeDef(actual.node(), expected.node(), diff, options);
+}
 
+bool EqualRepeatedNodeDef(const protobuf::RepeatedPtrField<NodeDef>& actual,
+                          const protobuf::RepeatedPtrField<NodeDef>& expected,
+                          string* diff, const EqualGraphDefOptions& options) {
   std::unordered_map<string, const NodeDef*> actual_index;
-  for (const NodeDef& node : actual.node()) {
+  for (const NodeDef& node : actual) {
     actual_index[node.name()] = &node;
   }
 
-  for (const NodeDef& expected_node : expected.node()) {
+  for (const NodeDef& expected_node : expected) {
     auto actual_iter = actual_index.find(expected_node.name());
     if (actual_iter == actual_index.end()) {
       if (diff != nullptr) {
@@ -53,10 +58,9 @@ bool EqualGraphDef(const GraphDef& actual, const GraphDef& expected,
 
   if (!actual_index.empty()) {
     if (diff != nullptr) {
-      *diff = strings::StrCat("Found unexpected node '",
-                              SummarizeNodeDef(*actual_index.begin()->second),
-                              "' not in expected graph:\n",
-                              SummarizeGraphDef(expected));
+      *diff =
+          strings::StrCat("Found unexpected node '",
+                          SummarizeNodeDef(*actual_index.begin()->second), "'");
     }
     return false;
   }
diff --git a/tensorflow/core/util/equal_graph_def.h b/tensorflow/core/util/equal_graph_def.h
index 82f8bd0713bce1f50984e89f51051629f297ed6c..1ce6181c2e7e412f9f059e711538b3e689bfcd65 100644
--- a/tensorflow/core/util/equal_graph_def.h
+++ b/tensorflow/core/util/equal_graph_def.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -44,6 +45,14 @@ bool EqualGraphDef(const GraphDef& actual, const GraphDef& expected,
 bool EqualNodeDef(const NodeDef& actual, const NodeDef& expected, string* diff,
                   const EqualGraphDefOptions& options = {});
 
+// Determines if actual and expected are equal, ignoring ordering. If they're
+// different and diff != nullptr, *diff is set to an explanation of the
+// difference.
+bool EqualRepeatedNodeDef(const protobuf::RepeatedPtrField<NodeDef>& actual,
+                          const protobuf::RepeatedPtrField<NodeDef>& expected,
+                          string* diff,
+                          const EqualGraphDefOptions& options = {});
+
 #define TF_EXPECT_GRAPH_EQ(expected, actual)                  \
   do {                                                        \
     string diff;                                              \
diff --git a/tensorflow/core/util/equal_graph_def_test.cc b/tensorflow/core/util/equal_graph_def_test.cc
index 9ce951e6eff545b6758f4f538232b6dfa63e75a5..af870c5c607798fadf773037f9cf287f7823c4cb 100644
--- a/tensorflow/core/util/equal_graph_def_test.cc
+++ b/tensorflow/core/util/equal_graph_def_test.cc
@@ -47,8 +47,7 @@ class EqualGraphDefTest : public ::testing::Test {
  protected:
   EqualGraphDefTest()
       : e_(GraphDefBuilder::kFailImmediately),
-        a_(GraphDefBuilder::kFailImmediately) {
-  }
+        a_(GraphDefBuilder::kFailImmediately) {}
 
   bool Match() {
     GraphDef expected;
@@ -89,11 +88,7 @@ TEST_F(EqualGraphDefTest, ExtraNode) {
   Input(a_.opts().WithName("A"));
   Input(a_.opts().WithName("B"));
   EXPECT_FALSE(Match());
-  EXPECT_EQ(strings::StrCat(
-                "Found unexpected node 'B = Input[]()' not in expected graph:\n"
-                "versions = producer: ",
-                TF_GRAPH_DEF_VERSION, ";\n", "A = Input[]();\n"),
-            diff_);
+  EXPECT_EQ("Found unexpected node 'B = Input[]()'", diff_);
 }
 
 TEST_F(EqualGraphDefTest, NodeOrder) {
@@ -169,21 +164,23 @@ TEST_F(EqualGraphDefTest, ControlInputOrder) {
   Node* b = Input(e_.opts().WithName("B"));
   Node* c = Input(e_.opts().WithName("C"));
   Node* d = Input(e_.opts().WithName("D"));
-  Combine(a, a, e_.opts()
-                    .WithName("E")
-                    .WithControlInput(b)
-                    .WithControlInput(c)
-                    .WithControlInput(d));
+  Combine(a, a,
+          e_.opts()
+              .WithName("E")
+              .WithControlInput(b)
+              .WithControlInput(c)
+              .WithControlInput(d));
 
   a = Input(a_.opts().WithName("A"));
   b = Input(a_.opts().WithName("B"));
   c = Input(a_.opts().WithName("C"));
   d = Input(a_.opts().WithName("D"));
-  Combine(a, a, a_.opts()
-                    .WithName("E")
-                    .WithControlInput(c)
-                    .WithControlInput(d)
-                    .WithControlInput(b));
+  Combine(a, a,
+          a_.opts()
+              .WithName("E")
+              .WithControlInput(c)
+              .WithControlInput(d)
+              .WithControlInput(b));
   EXPECT_TRUE(Match()) << diff_;
 }
 
diff --git a/tensorflow/core/util/events_writer.cc b/tensorflow/core/util/events_writer.cc
index fa23ff2bcb9fe11c80608a0268ef2720cd30f4d7..7cd040ebd43b699a41391a8a888f08d59865d922 100644
--- a/tensorflow/core/util/events_writer.cc
+++ b/tensorflow/core/util/events_writer.cc
@@ -35,7 +35,7 @@ EventsWriter::EventsWriter(const string& file_prefix)
       file_prefix_(file_prefix),
       num_outstanding_events_(0) {}
 
-bool EventsWriter::Init() {
+bool EventsWriter::InitIfNeeded() {
   if (recordio_writer_.get() != nullptr) {
     CHECK(!filename_.empty());
     if (FileHasDisappeared()) {
@@ -52,10 +52,10 @@ bool EventsWriter::Init() {
 
   int64 time_in_seconds = env_->NowMicros() / 1000000;
 
-  filename_ = strings::Printf(
-      "%s.out.tfevents.%010lld.%s", file_prefix_.c_str(),
-      static_cast<long long>(time_in_seconds), port::Hostname().c_str());
-  port::AdjustFilenameForLogging(&filename_);
+  filename_ =
+      strings::Printf("%s.out.tfevents.%010lld.%s%s", file_prefix_.c_str(),
+                      static_cast<int64>(time_in_seconds),
+                      port::Hostname().c_str(), file_suffix_.c_str());
 
   Status s = env_->NewWritableFile(filename_, &recordio_file_);
   if (!s.ok()) {
@@ -84,14 +84,14 @@ bool EventsWriter::Init() {
 
 string EventsWriter::FileName() {
   if (filename_.empty()) {
-    Init();
+    InitIfNeeded();
   }
   return filename_;
 }
 
 void EventsWriter::WriteSerializedEvent(StringPiece event_str) {
   if (recordio_writer_.get() == NULL) {
-    if (!Init()) {
+    if (!InitIfNeeded()) {
       LOG(ERROR) << "Write failed because file could not be opened.";
       return;
     }
diff --git a/tensorflow/core/util/events_writer.h b/tensorflow/core/util/events_writer.h
index 2604ebdda2c0f11ca84234ad3529056941e6800b..a1a8cf790d4e2735d705cc2050c14970e5bfab4a 100644
--- a/tensorflow/core/util/events_writer.h
+++ b/tensorflow/core/util/events_writer.h
@@ -35,10 +35,10 @@ class EventsWriter {
 #endif
 
   // Events files typically have a name of the form
-  //   '/some/file/path/my.file.out.events.[timestamp].[hostname]'
+  //   '/some/file/path/my.file.out.events.[timestamp].[hostname][suffix]'
   // To create and EventWriter, the user should provide file_prefix =
   //   '/some/file/path/my.file'
-  // The EventsWriter will append '.out.events.[timestamp].[hostname]'
+  // The EventsWriter will append '.out.events.[timestamp].[hostname][suffix]'
   // to the ultimate filename once Init() is called.
   // Note that it is not recommended to simultaneously have two
   // EventWriters writing to the same file_prefix.
@@ -51,10 +51,14 @@ class EventsWriter {
   // and is open this is a no-op.  If on the other hand the file was opened,
   // but has since disappeared (e.g. deleted by another process), this will open
   // a new file with a new timestamp in its filename.
-  bool Init();
+  bool Init() { return InitWithSuffix(""); }
+  bool InitWithSuffix(const string& suffix) {
+    file_suffix_ = suffix;
+    return InitIfNeeded();
+  }
 
   // Returns the filename for the current events file:
-  // filename_ = [file_prefix_].out.events.[timestamp].[hostname]
+  // filename_ = [file_prefix_].out.events.[timestamp].[hostname][suffix]
   string FileName();
 
   // Append "event" to the file.  The "tensorflow::" part is for swig happiness.
@@ -78,9 +82,11 @@ class EventsWriter {
 
  private:
   bool FileHasDisappeared();  // True if event_file_path_ does not exist.
+  bool InitIfNeeded();
 
   Env* env_;
   const string file_prefix_;
+  string file_suffix_;
   string filename_;
   std::unique_ptr<WritableFile> recordio_file_;
   std::unique_ptr<io::RecordWriter> recordio_writer_;
diff --git a/tensorflow/core/util/example_proto_fast_parsing.h b/tensorflow/core/util/example_proto_fast_parsing.h
index 5f8b4af5fe29edec57d35ae380c4da61c4ea3ff9..20536cee163ba926a16f78e5014c5abd2958f5f2 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.h
+++ b/tensorflow/core/util/example_proto_fast_parsing.h
@@ -45,7 +45,7 @@ struct FastParseExampleConfig {
     DataType dtype;
     // These 2 fields correspond exactly to dense_shapes and dense_defaults in
     // ParseExample op.
-    // Documentation is avaliable in: tensorflow/core/ops/parsing_ops.cc
+    // Documentation is available in: tensorflow/core/ops/parsing_ops.cc
     PartialTensorShape shape;
     Tensor default_value;
     bool variable_length;
@@ -62,7 +62,7 @@ struct FastParseExampleConfig {
 };
 
 // This is exactly the output of TF's ParseExample Op.
-// Documentation is avaliable in: tensorflow/core/ops/parsing_ops.cc
+// Documentation is available in: tensorflow/core/ops/parsing_ops.cc
 struct Result {
   std::vector<Tensor> sparse_indices;
   std::vector<Tensor> sparse_values;
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 4556fae2a303b0ea188d981f45d94e9a38f846a3..6a37256ea9f0827488d10bfbee1faa454e1825a8 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "third_party/mkl/include/mkl_dnn.h"
 #include "third_party/mkl/include/mkl_dnn_types.h"
@@ -63,9 +63,7 @@ class MklShape {
 
   void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; }
 
-  void SetDimensions(const size_t dimension) {
-    dimension_ = dimension;
-  }
+  void SetDimensions(const size_t dimension) { dimension_ = dimension; }
 
   void SetMklLayout(const void* primitive, size_t resourceType) {
     CHECK_EQ(
@@ -77,7 +75,6 @@ class MklShape {
   void SetTfLayout(const size_t dimension, const size_t* sizes,
                    const size_t* strides) {
     dimension_ = dimension;
-
     if (dimension > 0) {  // MKl doesn't support zero dimension tensors
       sizes_ = new size_t[dimension];
       strides_ = new size_t[dimension];
@@ -142,6 +139,39 @@ class MklShape {
   const size_t* GetTfToMklDimMap() const { return tf_to_mkl_dim_map_; }
   size_t tf_dim_idx(int index) const { return tf_to_mkl_dim_map_[index]; }
 
+  // Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd'
+  // corresponds to MKL's Channel dimension.
+  bool IsMklChannelDim(int d) const { return tf_dim_idx(d) == MklDims::C; }
+  // Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd'
+  // corresponds to MKL's Batch dimension.
+  bool IsMklBatchDim(int d) const { return tf_dim_idx(d) == MklDims::N; }
+  // Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd'
+  // corresponds to MKL's Width dimension.
+  bool IsMklWidthDim(int d) const { return tf_dim_idx(d) == MklDims::W; }
+  // Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd'
+  // corresponds to MKL's Height dimension.
+  bool IsMklHeightDim(int d) const { return tf_dim_idx(d) == MklDims::H; }
+
+  // Check if the TF-Mkl dimension ordering map specifies if the input
+  // tensor is in NCHW format.
+  bool IsTensorInNCHWFormat() const {
+    TensorFormat data_format = FORMAT_NCHW;
+    return (IsMklBatchDim(GetTensorDimIndex<2>(data_format, 'N')) &&
+            IsMklChannelDim(GetTensorDimIndex<2>(data_format, 'C')) &&
+            IsMklHeightDim(GetTensorDimIndex<2>(data_format, 'H')) &&
+            IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W')));
+  }
+
+  // Check if the TF-Mkl dimension ordering map specifies if the input
+  // tensor is in NHWC format.
+  bool IsTensorInNHWCFormat() const {
+    TensorFormat data_format = FORMAT_NHWC;
+    return (IsMklBatchDim(GetTensorDimIndex<2>(data_format, 'N')) &&
+            IsMklChannelDim(GetTensorDimIndex<2>(data_format, 'C')) &&
+            IsMklHeightDim(GetTensorDimIndex<2>(data_format, 'H')) &&
+            IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W')));
+  }
+
   void GetConvertedFlatData(dnnLayout_t targetLayout, void* input,
                             void* output) const {
     dnnLayout_t curLayout;
@@ -196,9 +226,9 @@ class MklShape {
   (STRIDES_OFFSET(dims) + dims * sizeof(size_t))  // Location of mklLayout_
 #define TF_LAYOUT_OFFSET(dims) \
   (MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF)  // Location of tfLayout_
-// Location of tf_to_mkl_dim_map_
 #define TF_TO_MKL_DIM_MAP_OFFSET(dims) \
-  (TF_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF)
+  (TF_LAYOUT_OFFSET(dims) +            \
+   SIZE_OF_MKL_DNN_BUF)  // Location of tf_to_mkl_dim_map_
 
   // TODO(agramesh1) make sure to create a const to share with rewrite pass
   // for min size of MKL metadata tensor.
@@ -267,45 +297,166 @@ class MklShape {
   size_t dimension_ = 0;
   size_t* sizes_ = nullptr;    // Required by MKL for conversions
   size_t* strides_ = nullptr;  // Required by MKL for conversions
-  // TF dimension corresponding to this MKL dimension
-  size_t* tf_to_mkl_dim_map_ = nullptr;
+  size_t* tf_to_mkl_dim_map_ =
+      nullptr;  // TF dimension corresponding to this MKL dimension
 };
 
-int inline GetTensorDataIndex(int n) {
-  return 2 * n;  // index corresponding to nth input/output tensor
+// List of MklShape objects. Used in Concat/Split layers.
+typedef std::vector<MklShape> MklShapeList;
+
+// Check if all tensors specified by MklShapes are MKL tensors.
+inline bool AreAllMklTensors(const MklShapeList& shapes) {
+  for (auto& s : shapes) {
+    if (!s.IsMklTensor()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
+                             const MklShape& mkl_shape) {
+  Tensor output_tensor;
+  TensorShape output_shape;
+
+  for (size_t j = 0; j < mkl_shape.GetDimension(); j++) {
+    // Outermost to innermost dimension
+    output_shape.AddDim(mkl_shape.GetSizes()[mkl_shape.tf_dim_idx(j)]);
+  }
+
+  // Allocate output tensor.
+  context->allocate_temp(DataTypeToEnum<T>::v(), output_shape, &output_tensor);
+
+  dnnLayout_t output_layout = static_cast<dnnLayout_t>(mkl_shape.GetTfLayout());
+  void* input_buffer = const_cast<T*>(mkl_tensor.flat<T>().data());
+  void* output_buffer = const_cast<T*>(output_tensor.flat<T>().data());
+
+  if (mkl_tensor.NumElements() != 0) {
+    mkl_shape.GetConvertedFlatData(output_layout, input_buffer, output_buffer);
+  }
+
+  return output_tensor;
 }
 
-int inline GetTensorMetaDataIndex(int n) {
-  // index corresponding to meta data of nth input/output tensor
-  return 2 * n + 1;
+// Since our ops are going to produce and also consume N addition tensors
+// (Mkl) for N Tensorflow tensors, we can have following different
+// orderings among these 2N tensors.
+//
+// E.g., for Tensorflow tensors A, B, and C, our ops will produce and
+// consume A_m, B_m, and C_m additionally.
+//
+// INTERLEAVED: in this case 2N tensors are interleaved. So for above
+//              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
+//
+// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
+//             by N Mkl tensors. So for above example, the ordering looks
+//             like: A, B, C, A_m, B_m, C_m
+//
+// Following APIs map index of original Tensorflow tensors to their appropriate
+// position based on selected ordering. For contiguous ordering, we need to know
+// the total number of tensors (parameter total).
+//
+typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
+// NOTE: Currently, we use contiguous ordering. If you change this, then you
+// would need to change Mkl op definitions in nn_ops.cc.
+static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
+
+// Get index of MetaData tensor from index 'n' of Data tensor.
+inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // For interleaved ordering, Mkl tensor follows immediately after
+    // Tensorflow tensor.
+    return n + 1;
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
+    return n + total_tensors / 2;
+  }
 }
+
+int inline GetTensorDataIndex(int n, int total_tensors) {
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    return 2 * n;  // index corresponding to nth input/output tensor
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    return n;
+  }
+}
+
+int inline GetTensorMetaDataIndex(int n, int total_tensors) {
+  // Get index for TensorData first and then use mapping function
+  // to get TensorMetaData index from TensorData index.
+  int tidx = GetTensorDataIndex(n, total_tensors);
+  return DataIndexToMetaDataIndex(tidx, total_tensors);
+}
+
 // Get the MKL shape from the second string tensor
 inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
   mklshape->DeSerializeMklShape(
-      ctext->input(GetTensorMetaDataIndex(n)).flat<uint8>().data(),
-      ctext->input(GetTensorMetaDataIndex(n)).flat<uint8>().size() *
+      ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
+          .flat<uint8>()
+          .data(),
+      ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
+              .flat<uint8>()
+              .size() *
           sizeof(uint8));
 }
 
 // Gets the actual input
 inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) {
-  return ctext->input(GetTensorDataIndex(n));
+  return ctext->input(GetTensorDataIndex(n, ctext->num_inputs()));
+}
+
+inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
+                            OpInputList* input_tensors) {
+  CHECK_NOTNULL(input_tensors);
+  ctext->input_list(name, input_tensors);
+}
+
+inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
+                            MklShapeList* mkl_shapes) {
+  OpInputList input_mkl_tensors;
+  GetMklInputList(ctext, strings::StrCat("mkl_", name), &input_mkl_tensors);
+
+  for (int i = 0; i < input_mkl_tensors.size(); i++) {
+    (*mkl_shapes)[i].DeSerializeMklShape(
+        input_mkl_tensors[i].flat<uint8>().data(),
+        input_mkl_tensors[i].flat<uint8>().size() * sizeof(uint8));
+  }
+}
+
+// Allocate the second output tensor that will contain
+// the MKL shape serialized
+inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
+                                      const MklShape& mkl_shape) {
+  Tensor* second_tensor = nullptr;
+  TensorShape second_shape;
+  second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mkl_shape.GetDimension()));
+  OP_REQUIRES_OK(ctext, ctext->allocate_output(
+                            GetTensorMetaDataIndex(n, ctext->num_outputs()),
+                            second_shape, &second_tensor));
+  mkl_shape.SerializeMklShape(
+      second_tensor->flat<uint8>().data(),
+      second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
-inline void AllocateOutputSetMklshape(OpKernelContext* ctext, int n,
+inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
                                       Tensor** output,
-                                      const TensorShape& tfshape,
-                                      const MklShape& mklshape) {
+                                      const TensorShape& tf_shape,
+                                      const MklShape& mkl_shape) {
   Tensor* second_tensor = nullptr;
   TensorShape second_shape;
-  second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mklshape.GetDimension()));
+  second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mkl_shape.GetDimension()));
   OP_REQUIRES_OK(
-      ctext, ctext->allocate_output(GetTensorDataIndex(n), tfshape, output));
-  OP_REQUIRES_OK(ctext, ctext->allocate_output(GetTensorMetaDataIndex(n),
-                                               second_shape, &second_tensor));
-  mklshape.SerializeMklShape(
+      ctext, ctext->allocate_output(GetTensorDataIndex(n, ctext->num_outputs()),
+                                    tf_shape, output));
+  OP_REQUIRES_OK(ctext, ctext->allocate_output(
+                            GetTensorMetaDataIndex(n, ctext->num_outputs()),
+                            second_shape, &second_tensor));
+  mkl_shape.SerializeMklShape(
       second_tensor->flat<uint8>().data(),
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
@@ -344,12 +495,11 @@ inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides,
 
 inline void MklSizesToTFSizes(OpKernelContext* context,
                               TensorFormat data_format_,
-                              const MklShape& mklshape, TensorShape* tfshape) {
-  size_t tf_dim = mklshape.GetDimension();
-  const size_t* tf_sizes = mklshape.GetSizes();
+                              const MklShape& mkl_shape,
+                              TensorShape* tf_shape) {
+  size_t tf_dim = mkl_shape.GetDimension();
+  const size_t* tf_sizes = mkl_shape.GetSizes();
 
-  // TODO(agramesh1): check if this constraint is applicable in other cases
-  // (besides BackpropInput, BackpropFilter).
   OP_REQUIRES(context, tf_dim == 4,
               errors::InvalidArgument("MKLSizesToTFSizes: size must be 4-dim"));
   std::vector<int32> sizes;
@@ -366,7 +516,7 @@ inline void MklSizesToTFSizes(OpKernelContext* context,
     sizes.push_back(tf_sizes[0]);
   }
 
-  OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(sizes, tfshape));
+  OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(sizes, tf_shape));
 }
 
 inline int32 GetMklTensorDimIndex(char dimension) {
@@ -385,38 +535,107 @@ inline int32 GetMklTensorDimIndex(char dimension) {
   }
 }
 
-inline int64 GetMklTensorDim(const MklShape& mklshape, char dimension) {
+inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) {
   int index = GetMklTensorDimIndex(dimension);
-  CHECK(index >= 0 && index < mklshape.GetDimension())
+  CHECK(index >= 0 && index < mkl_shape.GetDimension())
       << "Invalid index from the dimension: " << index << ", " << dimension;
-  return mklshape.dim_size(index);
+  return mkl_shape.dim_size(index);
+}
+
+inline void CopyMklTensorInToOut(OpKernelContext* context,
+                                 int idx_in, int idx_out) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_meta_in = GetTensorMetaDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+  int idx_meta_out = GetTensorMetaDataIndex(idx_out, num_outputs);
+
+  const Tensor& data = context->input(idx_data_in);
+  const Tensor& meta = context->input(idx_meta_in);
+  Tensor output(data.dtype());
+  Tensor meta_output(meta.dtype());
+
+  // TODO(intel_tf): alternatively, call forward_input_to_output_with_shape(...)
+  CHECK(output.CopyFrom(data, data.shape()));
+  CHECK(meta_output.CopyFrom(meta, meta.shape()));
+  context->set_output(idx_data_out, output);
+  context->set_output(idx_meta_out, meta_output);
+}
+
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
+                                         int idx_in, int idx_out,
+                                         const TensorShape& shape) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  const Tensor& data = context->input(idx_data_in);
+  MklShape mkl_shape_output;
+  mkl_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape_output);
+  Tensor output(data.dtype());
+  // TODO(intel_tf): alternatively, call forward_input_to_output_with_shape(...)
+  CHECK(output.CopyFrom(data, shape));
+  context->set_output(idx_data_out, output);
 }
 
-namespace mkl_layer_registry {
+inline void FowardTfTensorInToOut(OpKernelContext* context,
+                                  int idx_in, int idx_out) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  MklShape mkl_shape_output;
+  mkl_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape_output);
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+  }
+}
+
+inline void ForwarMklTensorInToOut(OpKernelContext* context,
+                                   int idx_in, int idx_out) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_meta_in = GetTensorMetaDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+  int idx_meta_out = GetTensorMetaDataIndex(idx_out, num_outputs);
+
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+    context->forward_ref_input_to_ref_output(idx_meta_in, idx_meta_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+    context->set_output(idx_meta_out, context->input(idx_meta_in));
+  }
+}
 
-static const char* kMklLayerLabel = "MklLayer";
-static const char* kMklLayerLabelPattern = "label='MklLayer'";
+namespace mkl_op_registry {
+static const char* kMklOpLabel = "MklOp";
+static const char* kMklOpLabelPattern = "label='MklOp'";
 
 // Check whether opname with type T is registered as MKL-compliant.
 //
 // @input: name of the op
 // @input: T datatype to be used for checking op
-// @return: true if opname is registered as Mkl layer op
-static inline bool IsMklLayer(const std::string& op_name, DataType T) {
+// @return: true if opname is registered as Mkl op
+static inline bool IsMklOp(const std::string& op_name, DataType T) {
   string kernel = KernelsRegisteredForOp(op_name);
-  // Currently, MKL only supports float type for ops. So we check if
-  // the type is float. Actually, we should query kernel registration and
-  // find out if op is supported for type T. But there is no API to query
-  // kernel registration using name and type.
-  bool result = (kernel.find(kMklLayerLabelPattern) != string::npos) &&
-                (T == DT_FLOAT);
-  if (result == true) {
-    VLOG(1) << "mkl_layer_registry::" << op_name << " is " << kMklLayerLabel;
+  bool result =
+      kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
+  if (result) {
+    VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel;
   }
   return result;
 }
 
-}  // namespace mkl_layer_registry
+}  // namespace mkl_op_registry
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/util/overflow.h b/tensorflow/core/util/overflow.h
index 13b1f305aa71a5d4d415d91b79d5ddf804fb53a9..04be68a111ec1a3d861e51e9d1e09c6c2a9e030a 100644
--- a/tensorflow/core/util/overflow.h
+++ b/tensorflow/core/util/overflow.h
@@ -31,9 +31,8 @@ inline int64 MultiplyWithoutOverflow(const int64 x, const int64 y) {
   const uint64 uy = y;
   const uint64 uxy = ux * uy;
 
-  // Check for overflow, using a cheap check if both inputs are small
-  static const uint64 kSqrtInt64Max = 3037000500;  // ceil(sqrt(2**63 - 1))
-  if (TF_PREDICT_FALSE(ux >= kSqrtInt64Max || uy >= kSqrtInt64Max)) {
+  // Check if we overflow uint64, using a cheap check if both inputs are small
+  if (TF_PREDICT_FALSE((ux | uy) >> 32 != 0)) {
     // Ensure nonnegativity.  Note that negative numbers will appear "large"
     // to the unsigned comparisons above.
     CHECK(x >= 0 && y >= 0);
diff --git a/tensorflow/core/util/overflow_test.cc b/tensorflow/core/util/overflow_test.cc
index 627f77164e9de1f3534db14c3d0f84c2c0534284..f93ba885e6d24ddf9e1c1236b1e3a399f5ca90b2 100644
--- a/tensorflow/core/util/overflow_test.cc
+++ b/tensorflow/core/util/overflow_test.cc
@@ -30,8 +30,12 @@ TEST(OverflowTest, Nonnegative) {
     interesting.push_back(bit + 1);
     interesting.push_back(bit - 1);
   }
-  auto mid = static_cast<int64>(std::pow(2, 63.0 / 2));
-  for (int i = -5; i < 5; i++) interesting.push_back(mid + i);
+  for (const int64 mid : {static_cast<int64>(1) << 32,
+                          static_cast<int64>(std::pow(2, 63.0 / 2))}) {
+    for (int i = -5; i < 5; i++) {
+      interesting.push_back(mid + i);
+    }
+  }
 
   // Check all pairs
   for (auto x : interesting) {
diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
index f8f071e0fa9ca9d309de8f9e486aa9c000fb3d88..40ae85e44b0a18f5c113fa25f49237ea3266ba78 100644
--- a/tensorflow/core/util/stat_summarizer.cc
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -50,20 +50,23 @@ void StatSummarizer::Validate(const Detail* detail,
       }
       const auto& stored = detail->outputs[slot];
       const auto& current = output.tensor_description();
-      bool do_shapes_match = true;
-      if (stored.shape().dim_size() != current.shape().dim_size()) {
-        do_shapes_match = false;
-      } else {
+
+      bool do_tensors_match =
+          (stored.dtype() == current.dtype()) &&
+          (stored.shape().dim_size() == current.shape().dim_size());
+
+      if (do_tensors_match) {
         for (int i = 0; i < stored.shape().dim_size(); ++i) {
           if (stored.shape().dim(i).size() != current.shape().dim(i).size()) {
-            do_shapes_match = false;
+            do_tensors_match = false;
+            break;
           }
         }
+      }
 
-        if ((stored.dtype() != current.dtype()) || !do_shapes_match) {
-          LOG(WARNING) << "Output tensor changed between runs for '"
-                       << ns.node_name();
-        }
+      if (!do_tensors_match) {
+        LOG(WARNING) << "Output tensor changed between runs for '"
+                     << ns.node_name();
       }
     }
   }
@@ -271,11 +274,14 @@ void StatSummarizer::ComputeStatsByType(
     std::map<string, int64>* node_type_map_time,
     std::map<string, int64>* node_type_map_memory,
     int64* accumulated_us) const {
+  int64 run_count = run_total_us_.count();
+
   for (const auto& det : details_) {
     const string node_name = det.first;
     const Detail& detail = det.second;
 
-    int64 curr_time_val = detail.rel_end_us.avg();
+    int64 curr_time_val =
+        static_cast<int64>(detail.rel_end_us.sum() / run_count);
     *accumulated_us += curr_time_val;
 
     int64 curr_memory_val = detail.mem_used.newest();
diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index f93e0b5e467062c7bf4d64cbce5799adf67bcf49..77b75a3628674395ff8ce9e93b984fea3b3b7e76 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -9,9 +9,9 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 
-# To be exported to tensorflow/core:android_srcs.
+# To be exported to tensorflow/core:mobile_srcs.
 filegroup(
-    name = "android_srcs",
+    name = "mobile_srcs",
     srcs = [
         "naming.cc",
         "naming.h",
@@ -20,6 +20,11 @@ filegroup(
     ],
 )
 
+alias(
+    name = "android_srcs",
+    actual = ":mobile_srcs",
+)
+
 cc_library(
     name = "tensor_bundle",
     srcs = ["tensor_bundle.cc"],
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index b8989b2c3ede878e86ce1259c075ed8fe0c06cbd..5c2bda4770fb9213e271123c30e94fe0c925eb3e 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -249,8 +249,10 @@ BundleWriter::BundleWriter(Env* env, StringPiece prefix)
                                      random::New64())),
       out_(nullptr),
       size_(0) {
-  status_ =
-      env_->CreateDir(io::Dirname(prefix_).ToString());  // Ignores errors.
+  status_ = env_->CreateDir(io::Dirname(prefix_).ToString());
+  if (!status_.ok() && !errors::IsAlreadyExists(status_)) {
+    return;
+  }
   const string filename = DataFilename(prefix_, 0, 1);
   std::unique_ptr<WritableFile> wrapper;
   status_ = env_->NewWritableFile(tmp_data_path_, &wrapper);
@@ -261,12 +263,10 @@ BundleWriter::BundleWriter(Env* env, StringPiece prefix)
   VLOG(1) << "Writing to file " << tmp_data_path_;
 }
 
-BundleWriter::~BundleWriter() { CHECK(out_ == nullptr); }
-
 Status BundleWriter::Add(StringPiece key, const Tensor& val) {
+  if (!status_.ok()) return status_;
   CHECK_NE(key, kHeaderEntryKey);
   const string key_string = key.ToString();
-  if (!status_.ok()) return status_;
   if (entries_.find(key_string) != entries_.end()) {
     status_ = errors::InvalidArgument("Adding duplicate key: ", key);
     return status_;
@@ -301,14 +301,14 @@ Status BundleWriter::AddSlice(StringPiece full_tensor_key,
                               const TensorShape& full_tensor_shape,
                               const TensorSlice& slice_spec,
                               const Tensor& slice_tensor) {
+  if (!status_.ok()) return status_;
+  CHECK_NE(full_tensor_key, kHeaderEntryKey);
+
   // If just a singleton full slice, use the regular Add() to be more efficient.
   if (IsFullSlice(slice_spec, full_tensor_shape)) {
     return Add(full_tensor_key, slice_tensor);
   }
 
-  CHECK_NE(full_tensor_key, kHeaderEntryKey);
-  if (!status_.ok()) return status_;
-
   // Inserts/updates the full tensor's metadata entry.
   //
   // In the case of a sharded save, MergeBundles() is responsible for merging
@@ -516,7 +516,8 @@ Status MergeBundles(Env* env, gtl::ArraySlice<string> prefixes,
   // Merges all metadata tables.
   // TODO(zhifengc): KeyValue sorter if it becomes too big.
   MergeState merge;
-  env->CreateDir(io::Dirname(merged_prefix).ToString()).IgnoreError();
+  Status status = env->CreateDir(io::Dirname(merged_prefix).ToString());
+  if (!status.ok() && !errors::IsAlreadyExists(status)) return status;
   for (int i = 0; i < prefixes.size(); ++i) {
     TF_RETURN_IF_ERROR(MergeOneBundle(env, prefixes[i], &merge));
   }
@@ -534,7 +535,6 @@ Status MergeBundles(Env* env, gtl::ArraySlice<string> prefixes,
   std::unique_ptr<WritableFile> merged_metadata;
   TF_RETURN_IF_ERROR(
       env->NewWritableFile(MetaFilename(merged_prefix), &merged_metadata));
-  Status status;
   {
     table::TableBuilder builder(table::Options(), merged_metadata.get());
     // Header entry.
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
index bca3910f59cde4a59ccd798c4c634794c2827f82..2c40388250c665388e92ff317c6666547f873205 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.h
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
@@ -31,7 +31,7 @@ limitations under the License.
 // (tensorflow::table::Table).  Each key is a name of a tensor and its value is
 // a serialized BundleEntryProto.  Each BundleEntryProto describes the metadata
 // of a tensor: which of the "data" files contains the content of a tensor, the
-// offset into that file, checksum, some auxilary data, etc.
+// offset into that file, checksum, some auxiliary data, etc.
 //
 // A tensor bundle can be accessed randomly using a BundleReader.  Usage:
 //
@@ -100,11 +100,14 @@ extern const int kTensorBundleVersion;
 extern const char* const kHeaderEntryKey;
 
 // Builds a string-string table of tensor names to BundleEntryProto (metadata).
+//
+// On construction, attempts to create a directory given by the dirname of
+// "prefix", so "status()" must be checked before calling any member functions.
+//
 // All threads accessing the same BundleWriter must synchronize.
 class BundleWriter {
  public:
   BundleWriter(Env* env, StringPiece prefix);
-  ~BundleWriter();
 
   // Adds the tensor "val" under key "key".
   // Across calls "key" must be unique but can be added in any order.
@@ -209,6 +212,10 @@ class BundleReader {
 
   // Looks up the slices of the tensor keyed by "key".  On OK, "slices"
   // is non-empty if and only if the tensor is a partitioned tensor.
+  //
+  // Warning - there is no guaranteed ordering for the returned slices, so
+  // a slice with a larger start index in some dimension could come before
+  // another slice with a smaller start index in the same dimension.
   // REQUIRES: status().ok()
   Status LookupTensorSlices(StringPiece key, std::vector<TensorSlice>* slices)
       TF_MUST_USE_RESULT;
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index fe89fe852e2a94f5de40aa4311d6f0eeeb9f1dcc..8c76f0f3c5ab3b46a5751eeff0d331c5bd041605 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -177,7 +177,7 @@ inline TensorShape ShapeFromFormat(TensorFormat format, int64 N,
                                    gtl::ArraySlice<int64> spatial, int64 C) {
   gtl::InlinedVector<int64, 5> dim_sizes(spatial.size() + 2);
   dim_sizes[GetTensorBatchDimIndex(dim_sizes.size(), format)] = N;
-  for (int dim = 0; dim < spatial.size(); dim++) {
+  for (int dim = 0; static_cast<size_t>(dim) < spatial.size(); dim++) {
     dim_sizes[GetTensorSpatialDimIndex(dim_sizes.size(), format, dim)] =
         spatial[dim];
   }
diff --git a/tensorflow/core/util/tensor_slice_reader.cc b/tensorflow/core/util/tensor_slice_reader.cc
index e750b130b9e4f9687a8c43c73fe00def4166f235..cd4903471963e703f5ef2b7654a3c40418cbbb08 100644
--- a/tensorflow/core/util/tensor_slice_reader.cc
+++ b/tensorflow/core/util/tensor_slice_reader.cc
@@ -102,7 +102,8 @@ TensorSliceReader::TensorSliceReader(const string& filepattern)
 
 TensorSliceReader::TensorSliceReader(const string& filepattern,
                                      OpenTableFunction open_function)
-    : TensorSliceReader(filepattern, open_function, kLoadAllShards) {}
+    : TensorSliceReader(filepattern, std::move(open_function), kLoadAllShards) {
+}
 
 TensorSliceReader::TensorSliceReader(const string& filepattern,
                                      OpenTableFunction open_function,
diff --git a/tensorflow/core/util/tensor_slice_reader_cache.cc b/tensorflow/core/util/tensor_slice_reader_cache.cc
index cbd2922f543a9b3dea16612bfdc3dc136368b04d..0f009d7de57a3cf1471c1ba694d3a771bc00635c 100644
--- a/tensorflow/core/util/tensor_slice_reader_cache.cc
+++ b/tensorflow/core/util/tensor_slice_reader_cache.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 
+#include <utility>
+
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -36,7 +38,8 @@ const TensorSliceReader* TensorSliceReaderCacheWrapper::GetReader(
   if (!cache_) {
     cache_ = new TensorSliceReaderCache;
   }
-  return cache_->GetReader(filepattern, open_function, preferred_shard);
+  return cache_->GetReader(filepattern, std::move(open_function),
+                           preferred_shard);
 }
 
 TensorSliceReaderCache::TensorSliceReaderCache() {}
diff --git a/tensorflow/core/util/tensor_slice_reader_test.cc b/tensorflow/core/util/tensor_slice_reader_test.cc
index 854569788617e85427bac98ca941c3e0bbc5afab..f4859262e12c3560703eb7daa83c970d352eb2d7 100644
--- a/tensorflow/core/util/tensor_slice_reader_test.cc
+++ b/tensorflow/core/util/tensor_slice_reader_test.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <utility>
+
 #include "tensorflow/core/util/tensor_slice_reader.h"
 
 #include "tensorflow/core/framework/types.h"
@@ -48,8 +50,9 @@ namespace {
 //
 // We assume this is a row-major matrix.
 
-void SimpleFloatHelper(TensorSliceWriter::CreateBuilderFunction create_function,
-                       TensorSliceReader::OpenTableFunction open_function) {
+void SimpleFloatHelper(
+    const TensorSliceWriter::CreateBuilderFunction& create_function,
+    TensorSliceReader::OpenTableFunction open_function) {
   const string fname_base = io::JoinPath(testing::TmpDir(), "float_checkpoint");
 
   TensorShape shape({4, 5});
@@ -108,7 +111,7 @@ void SimpleFloatHelper(TensorSliceWriter::CreateBuilderFunction create_function,
 
   // Now we need to read the tensor slices
   const string filepattern = strings::StrCat(fname_base, "_*");
-  TensorSliceReader reader(filepattern, open_function);
+  TensorSliceReader reader(filepattern, std::move(open_function));
   TF_EXPECT_OK(reader.status());
   EXPECT_EQ(2, reader.num_files());
 
@@ -171,9 +174,10 @@ TEST(TensorSliceReaderTest, SimpleFloat) {
 }
 
 template <typename T, typename U>
-void SimpleIntXHelper(TensorSliceWriter::CreateBuilderFunction create_function,
-                      TensorSliceReader::OpenTableFunction open_function,
-                      const string& checkpoint_file) {
+void SimpleIntXHelper(
+    const TensorSliceWriter::CreateBuilderFunction& create_function,
+    TensorSliceReader::OpenTableFunction open_function,
+    const string& checkpoint_file) {
   const string fname_base = io::JoinPath(testing::TmpDir(), checkpoint_file);
 
   TensorShape shape({4, 5});
@@ -232,7 +236,7 @@ void SimpleIntXHelper(TensorSliceWriter::CreateBuilderFunction create_function,
 
   // Now we need to read the tensor slices
   const string filepattern = strings::StrCat(fname_base, "_*");
-  TensorSliceReader reader(filepattern, open_function);
+  TensorSliceReader reader(filepattern, std::move(open_function));
   TF_EXPECT_OK(reader.status());
   EXPECT_EQ(2, reader.num_files());
 
@@ -304,8 +308,8 @@ TEST_SIMPLE_INT(int8, int32)
 TEST_SIMPLE_INT(uint8, int32)
 
 void CachedTensorSliceReaderTesterHelper(
-    TensorSliceWriter::CreateBuilderFunction create_function,
-    TensorSliceReader::OpenTableFunction open_function) {
+    const TensorSliceWriter::CreateBuilderFunction& create_function,
+    const TensorSliceReader::OpenTableFunction& open_function) {
   const string fname_base = io::JoinPath(testing::TmpDir(), "float_checkpoint");
 
   TensorShape shape({4, 5});
diff --git a/tensorflow/core/util/test_log.proto b/tensorflow/core/util/test_log.proto
index 8c73ba54ff6bdb5c81c39686165418b9d801a9bf..409d5db211f19069f439e5f50448e89b9aa4f139 100644
--- a/tensorflow/core/util/test_log.proto
+++ b/tensorflow/core/util/test_log.proto
@@ -183,4 +183,7 @@ message TestResults {
     ANDROID_BENCHMARK = 3;
   }
   BenchmarkType benchmark_type = 10;
+
+  // Used for differentiating between continuous and debug builds.
+  string run_mode = 11;
 };
diff --git a/tensorflow/core/util/util.cc b/tensorflow/core/util/util.cc
index 3481a6aaa4d57d47c1e767ef0af4c7fcd69b10d4..1e5a9c571264ffe62951fe51b01aa80fea7c2728 100644
--- a/tensorflow/core/util/util.cc
+++ b/tensorflow/core/util/util.cc
@@ -85,7 +85,7 @@ void MovingAverage::AddValue(double v) {
 
 static char hex_char[] = "0123456789abcdef";
 
-string PrintMemory(const char* ptr, int n) {
+string PrintMemory(const char* ptr, size_t n) {
   string ret;
   ret.resize(n * 3);
   for (int i = 0; i < n; ++i) {
diff --git a/tensorflow/core/util/util.h b/tensorflow/core/util/util.h
index c142f4d0d26a7594a59b5ce96132c0359e3a93f4..4adf2f14dcc39138482beeec942d696146f255f3 100644
--- a/tensorflow/core/util/util.h
+++ b/tensorflow/core/util/util.h
@@ -49,7 +49,7 @@ class MovingAverage {
 
 // Returns a string printing bytes in ptr[0..n).  The output looks
 // like "00 01 ef cd cd ef".
-string PrintMemory(const char* ptr, int n);
+string PrintMemory(const char* ptr, size_t n);
 
 // Given a flattened index into a tensor, computes a string s so that
 // StrAppend("tensor", s) is a Python indexing expression.  E.g.,
diff --git a/tensorflow/docs_src/api_guides/python/client.md b/tensorflow/docs_src/api_guides/python/client.md
index f5bb256d870f7f7d000d9920adef245f598e0326..97c19863600a4b67c7af966d3fd2ef8def36fa20 100644
--- a/tensorflow/docs_src/api_guides/python/client.md
+++ b/tensorflow/docs_src/api_guides/python/client.md
@@ -3,7 +3,7 @@
 
 This library contains classes for launching graphs and executing operations.
 
-The @{$get_started} guide has
+The @{$get_started/get_started} guide has
 examples of how a graph is launched in a @{tf.Session}.
 
 ## Session management
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijector.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijector.md
deleted file mode 100644
index 16a47bfd8b111740e7995c06587df98566adbce4..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijector.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Random variable transformations (contrib)
-[TOC]
-
-Bijector Ops.
-
-An API for invertible, differentiable transformations of random variables.
-
-## Background
-
-Differentiable, bijective transformations of continuous random variables alter
-the calculations made in the cumulative/probability distribution functions and
-sample function.  This module provides a standard interface for making these
-manipulations.
-
-For more details and examples, see the `Bijector` docstring.
-
-To apply a `Bijector`, use `distributions.TransformedDistribution`.
-
-## Bijectors
-
-*   @{tf.contrib.distributions.bijector.Affine}
-*   @{tf.contrib.distributions.bijector.AffineLinearOperator}
-*   @{tf.contrib.distributions.bijector.Bijector}
-*   @{tf.contrib.distributions.bijector.Chain}
-*   @{tf.contrib.distributions.bijector.CholeskyOuterProduct}
-*   @{tf.contrib.distributions.bijector.Exp}
-*   @{tf.contrib.distributions.bijector.Identity}
-*   @{tf.contrib.distributions.bijector.Inline}
-*   @{tf.contrib.distributions.bijector.Invert}
-*   @{tf.contrib.distributions.bijector.PowerTransform}
-*   @{tf.contrib.distributions.bijector.SigmoidCentered}
-*   @{tf.contrib.distributions.bijector.SoftmaxCentered}
-*   @{tf.contrib.distributions.bijector.Softplus}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
new file mode 100644
index 0000000000000000000000000000000000000000..0ce187b329bce38fe096f2640a09cc93c71f9543
--- /dev/null
+++ b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
@@ -0,0 +1,33 @@
+# Random variable transformations (contrib)
+[TOC]
+
+Bijector Ops.
+
+An API for invertible, differentiable transformations of random variables.
+
+## Background
+
+Differentiable, bijective transformations of continuous random variables alter
+the calculations made in the cumulative/probability distribution functions and
+sample function.  This module provides a standard interface for making these
+manipulations.
+
+For more details and examples, see the `Bijector` docstring.
+
+To apply a `Bijector`, use `distributions.TransformedDistribution`.
+
+## Bijectors
+
+*   @{tf.contrib.distributions.bijectors.Affine}
+*   @{tf.contrib.distributions.bijectors.AffineLinearOperator}
+*   @{tf.contrib.distributions.bijectors.Bijector}
+*   @{tf.contrib.distributions.bijectors.Chain}
+*   @{tf.contrib.distributions.bijectors.CholeskyOuterProduct}
+*   @{tf.contrib.distributions.bijectors.Exp}
+*   @{tf.contrib.distributions.bijectors.Identity}
+*   @{tf.contrib.distributions.bijectors.Inline}
+*   @{tf.contrib.distributions.bijectors.Invert}
+*   @{tf.contrib.distributions.bijectors.PowerTransform}
+*   @{tf.contrib.distributions.bijectors.SigmoidCentered}
+*   @{tf.contrib.distributions.bijectors.SoftmaxCentered}
+*   @{tf.contrib.distributions.bijectors.Softplus}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
index 2b43e1281d73068750fbdabaa19618d470d4e803..7a3d509b75198461430195aa70a336f94b7f8cfa 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
@@ -76,7 +76,7 @@ representing the posterior or posterior predictive.
 
 ## Kullback-Leibler Divergence
 
-*   @{tf.contrib.distributions.kl}
+*   @{tf.contrib.distributions.kl_divergence}
 *   @{tf.contrib.distributions.RegisterKL}
 
 ## Utilities
diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
index f6116240792455585f76c5a7e0498c89b51707da..de4f126507930331d348cc795bd03b9971778d07 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
@@ -137,16 +137,16 @@ which to operate must always be given explicitly. This is the reason why
 
 ## Module: reroute
 
-*   @{tf.contrib.graph_editor.reroute.swap_ts}
-*   @{tf.contrib.graph_editor.reroute.reroute_ts}
-*   @{tf.contrib.graph_editor.reroute.swap_inputs}
-*   @{tf.contrib.graph_editor.reroute.reroute_inputs}
-*   @{tf.contrib.graph_editor.reroute.swap_outputs}
-*   @{tf.contrib.graph_editor.reroute.reroute_outputs}
-*   @{tf.contrib.graph_editor.reroute.swap_ios}
-*   @{tf.contrib.graph_editor.reroute.reroute_ios}
-*   @{tf.contrib.graph_editor.reroute.remove_control_inputs}
-*   @{tf.contrib.graph_editor.reroute.add_control_inputs}
+*   @{tf.contrib.graph_editor.swap_ts}
+*   @{tf.contrib.graph_editor.reroute_ts}
+*   @{tf.contrib.graph_editor.swap_inputs}
+*   @{tf.contrib.graph_editor.reroute_inputs}
+*   @{tf.contrib.graph_editor.swap_outputs}
+*   @{tf.contrib.graph_editor.reroute_outputs}
+*   @{tf.contrib.graph_editor.swap_ios}
+*   @{tf.contrib.graph_editor.reroute_ios}
+*   @{tf.contrib.graph_editor.remove_control_inputs}
+*   @{tf.contrib.graph_editor.add_control_inputs}
 
 ## Module: edit
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.integrate.md b/tensorflow/docs_src/api_guides/python/contrib.integrate.md
index e6b730b2035a7aec4ff612e7721ac18a2d642508..e95b5a2e68685fc4828eb64fbc3e363d8a1add31 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.integrate.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.integrate.md
@@ -33,7 +33,7 @@ plt.plot(x, z)
 ```
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/lorenz_attractor.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/lorenz_attractor.png" alt>
 </div>
 
 ## Ops
diff --git a/tensorflow/docs_src/api_guides/python/contrib.layers.md b/tensorflow/docs_src/api_guides/python/contrib.layers.md
index a829c0a02cab2f63852d562a1e38152eec2f19bd..d4cda3a25454cd1db344c10ed07ada520bd45da9 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.layers.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.layers.md
@@ -20,7 +20,6 @@ common machine learning algorithms.
 *   @{tf.contrib.layers.flatten}
 *   @{tf.contrib.layers.fully_connected}
 *   @{tf.contrib.layers.layer_norm}
-*   @{tf.contrib.layers.linear}
 *   @{tf.contrib.layers.max_pool2d}
 *   @{tf.contrib.layers.one_hot_encoding}
 *   @{tf.nn.relu}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.linalg.md b/tensorflow/docs_src/api_guides/python/contrib.linalg.md
index efc2d76ef1ef042a55e5a483976bfb1b8e4764f4..b2c7fcf6bbac58ea782c73d9651c0554d2ba1e8f 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.linalg.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.linalg.md
@@ -21,7 +21,7 @@ Subclasses of `LinearOperator` provide a access to common methods on a
 *   @{tf.contrib.linalg.LinearOperatorDiag}
 *   @{tf.contrib.linalg.LinearOperatorIdentity}
 *   @{tf.contrib.linalg.LinearOperatorScaledIdentity}
-*   @{tf.contrib.linalg.LinearOperatorMatrix}
+*   @{tf.contrib.linalg.LinearOperatorFullMatrix}
 *   @{tf.contrib.linalg.LinearOperatorTriL}
 *   @{tf.contrib.linalg.LinearOperatorUDVHUpdate}
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md
index cb93f9d549a76ced233693666b1fe3186bfcd435..8c289dd55631a94546aeab129edf4d530eecaeda 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.losses.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md
@@ -13,8 +13,8 @@ of samples in the batch and `d1` ... `dN` are the remaining dimensions.
 It is common, when training with multiple loss functions, to adjust the relative
 strengths of individual losses. This is performed by rescaling the losses via
 a `weight` parameter passed to the loss functions. For example, if we were
-training with both log_loss and sum_of_squares_loss, and we wished that the
-log_loss penalty be twice as severe as the sum_of_squares_loss, we would
+training with both log_loss and mean_square_error, and we wished that the
+log_loss penalty be twice as severe as the mean_square_error, we would
 implement this as:
 
 ```python
@@ -22,7 +22,7 @@ implement this as:
   tf.contrib.losses.log(predictions, labels, weight=2.0)
 
   # Uses default weight of 1.0
-  tf.contrib.losses.sum_of_squares(predictions, labels)
+  tf.contrib.losses.mean_square_error(predictions, labels)
 
   # All the losses are collected into the `GraphKeys.LOSSES` collection.
   losses = tf.get_collection(tf.GraphKeys.LOSSES)
@@ -74,7 +74,7 @@ these predictions.
   predictions = MyModelPredictions(images)
 
   weight = tf.cast(tf.greater(depths, 0), tf.float32)
-  loss  = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
+  loss  = tf.contrib.losses.mean_square_error(predictions, depths, weight)
 ```
 
 Note that when using weights for the losses, the final average is computed
@@ -100,7 +100,7 @@ weighted average over the individual prediction errors:
 
   weight = MyComplicatedWeightingFunction(labels)
   weight = tf.div(weight, tf.size(weight))
-  loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight)
+  loss = tf.contrib.losses.mean_square_error(predictions, depths, weight)
 ```
 
 @{tf.contrib.losses.absolute_difference}
@@ -118,9 +118,4 @@ weighted average over the individual prediction errors:
 @{tf.contrib.losses.softmax_cross_entropy}
 @{tf.contrib.losses.sparse_softmax_cross_entropy}
 
-The following are deprecated in favor of `mean_pairwise_squared_error` and
-`mean_squared_error`.
-@{tf.contrib.losses.sum_of_pairwise_squares}
-@{tf.contrib.losses.sum_of_squares}
-
 
diff --git a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
index 223bf4a0a313d6784e6c38db3291273e2778f679..2522e50c266db24dcea424b882ebba5509a4605f 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.seq2seq.md
@@ -91,6 +91,32 @@ not a suggested device partitioning strategy.)
 
 ## Dynamic Decoding
 
+Example usage:
+
+``` python
+cell = # instance of RNNCell
+
+if mode == "train":
+  helper = tf.contrib.seq2seq.TrainingHelper(
+    input=input_vectors,
+    sequence_length=input_lengths)
+elif mode == "infer":
+  helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
+      embedding=embedding,
+      start_tokens=tf.tile([GO_SYMBOL], [batch_size]),
+      end_token=END_SYMBOL)
+
+decoder = tf.contrib.seq2seq.BasicDecoder(
+    cell=cell,
+    helper=helper,
+    initial_state=cell.zero_state(batch_size, tf.float32))
+outputs, _ = tf.contrib.seq2seq.dynamic_decode(
+   decoder=decoder,
+   output_time_major=False,
+   impute_finished=True,
+   maximum_iterations=20)
+```
+
 ### Decoder base class and functions
 *   @{tf.contrib.seq2seq.Decoder}
 *   @{tf.contrib.seq2seq.dynamic_decode}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.staging.md b/tensorflow/docs_src/api_guides/python/contrib.staging.md
new file mode 100644
index 0000000000000000000000000000000000000000..b0ac5483427fc3138ee9a70590320b2119d193ea
--- /dev/null
+++ b/tensorflow/docs_src/api_guides/python/contrib.staging.md
@@ -0,0 +1,6 @@
+# Staging (contrib)
+[TOC]
+
+This library contains utilities for adding pipelining to a model.
+
+*   @{tf.contrib.staging.StagingArea}
diff --git a/tensorflow/docs_src/api_guides/python/index.md b/tensorflow/docs_src/api_guides/python/index.md
index 0e624df55b7af6b842a74b93bfe8ca526cb4f7de..19d50926d8821c17350d57909a8830c4cf00ba0a 100644
--- a/tensorflow/docs_src/api_guides/python/index.md
+++ b/tensorflow/docs_src/api_guides/python/index.md
@@ -40,9 +40,10 @@
 *   [Losses (contrib)](contrib.losses.md)
 *   [Metrics (contrib)](contrib.metrics.md)
 *   [Optimization (contrib)](contrib.opt.md)
-*   [Random variable transformations (contrib)](contrib.distributions.bijector.md)
+*   [Random variable transformations (contrib)](contrib.distributions.bijectors.md)
 *   [RNN and Cells (contrib)](contrib.rnn.md)
 *   [Seq2seq Library (contrib)](contrib.seq2seq.md)
+*   [Staging (contrib)](contrib.staging.md)
 *   [Statistical Distributions (contrib)](contrib.distributions.md)
 *   [Training (contrib)](contrib.training.md)
 *   [Utilities (contrib)](contrib.util.md)
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index be18d3e8be3dec50682971f4b23eae833e51c6b2..31a10d1f15d437810a9acb04d34bdae97195223c 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -4,12 +4,24 @@ We welcome contributions to the Tensorflow documentation from the community.
 This document explains how you can contribute to that documentation. In
 particular, this document explains the following:
 
-- Where the documentation is located.
-- How to make conformant edits.
-- How to build and test your documentation changes before you submit them.
+* Where the documentation is located.
+* How to make conformant edits.
+* How to build and test your documentation changes before you submit them.
 
-You can view Tensorflow documentation on tensorflow.org, and you can view and
-edit the raw files on Github.
+You can view Tensorflow documentation on https://www.tensorflow.org, and you
+can view and edit the raw files on Github. We're publishing our docs on Github
+so everybody can contribute. Whatever gets checked in tensorflow/docs_src will
+be published soon after on https://www.tensorflow.org. 
+
+Republishing TensorFlow documentation in different forms is absolutely allowed,
+but we are unlikely to accept other documentation formats (or the tooling to
+generate them) into our repository. If you do choose to republish our
+documentation in another form, please be sure to include:
+
+* The version of the API this represents (i.e. r1.0, master, etc.)
+* The commit or version from which the documentation was generated
+* Where to get the latest documentation (that is, https://www.tensorflow.org)
+* The Apache 2.0 license.
 
 ## A Note on Versions
 
@@ -166,7 +178,7 @@ tensorflow`).  Run the following command to compile TensorFlow and generate the
 documentation in the `/tmp/tfdocs` dir:
 
     bazel run tools/docs:generate -- \
-              --src_dir=`pwd`/tensorflow/docs_src/ \
+              --src_dir="$(pwd)/docs_src/" \
               --output_dir=/tmp/tfdocs/
 
 Note: You must set `src_dir` and `output_dir` to absolute file paths.
diff --git a/tensorflow/docs_src/community/style_guide.md b/tensorflow/docs_src/community/style_guide.md
index a2df61bc8090f6d713e55e06819771a0cd9c1833..f90a6cf938dcbdc83971a68cf28ae8722d4537fb 100644
--- a/tensorflow/docs_src/community/style_guide.md
+++ b/tensorflow/docs_src/community/style_guide.md
@@ -115,31 +115,31 @@ Example:
 
     def my_op(tensor_in, other_tensor_in, my_param, other_param=0.5,
               output_collections=(), name=None):
-    """My operation that adds two tensors with given coefficients.
-
-    Args:
-      tensor_in: `Tensor`, input tensor.
-      other_tensor_in: `Tensor`, same shape as `tensor_in`, other input tensor.
-      my_param: `float`, coefficient for `tensor_in`.
-      other_param: `float`, coefficient for `other_tensor_in`.
-      output_collections: `tuple` of `string`s, name of the collection to
-                          collect result of this op.
-      name: `string`, name of the operation.
-
-    Returns:
-      `Tensor` of same shape as `tensor_in`, sum of input values with coefficients.
-
-    Example:
-      >>> my_op([1., 2.], [3., 4.], my_param=0.5, other_param=0.6,
-                output_collections=['MY_OPS'], name='add_t1t2')
-      [2.3, 3.4]
-    """
-    with tf.name_scope(name, "my_op", [tensor_in, other_tensor_in]):
-      tensor_in = tf.convert_to_tensor(tensor_in)
-      other_tensor_in = tf.convert_to_tensor(other_tensor_in)
-      result = my_param * tensor_in + other_param * other_tensor_in
-      tf.add_to_collections(output_collections, result)
-      return result
+      """My operation that adds two tensors with given coefficients.
+
+      Args:
+        tensor_in: `Tensor`, input tensor.
+        other_tensor_in: `Tensor`, same shape as `tensor_in`, other input tensor.
+        my_param: `float`, coefficient for `tensor_in`.
+        other_param: `float`, coefficient for `other_tensor_in`.
+        output_collections: `tuple` of `string`s, name of the collection to
+                            collect result of this op.
+        name: `string`, name of the operation.
+
+      Returns:
+        `Tensor` of same shape as `tensor_in`, sum of input values with coefficients.
+
+      Example:
+        >>> my_op([1., 2.], [3., 4.], my_param=0.5, other_param=0.6,
+                  output_collections=['MY_OPS'], name='add_t1t2')
+        [2.3, 3.4]
+      """
+      with tf.name_scope(name, "my_op", [tensor_in, other_tensor_in]):
+        tensor_in = tf.convert_to_tensor(tensor_in)
+        other_tensor_in = tf.convert_to_tensor(other_tensor_in)
+        result = my_param * tensor_in + other_param * other_tensor_in
+        tf.add_to_collection(output_collections, result)
+        return result
 
 Usage:
 
@@ -162,9 +162,9 @@ operation.
   - `reuse`: `bool` indicator if the variable should be reused if
              it's present in the scope.
 
-* Layers that behave differently during training should have:
-  - `is_training`: `bool` to indicate if a training graph is been built.
-
+* Layers that behave differently during training should take:
+  - `is_training`: `bool` indicator to conditionally choose different 
+                   computation paths (e.g. using `tf.cond`) during execution.
 
 Example:
 
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
index 7c9717189b89c24ffbfa8a7062de7548b76707ac..4c8c4e1a97249ef766233b9aa97a1c676c0b2737 100644
--- a/tensorflow/docs_src/community/welcome.md
+++ b/tensorflow/docs_src/community/welcome.md
@@ -26,6 +26,21 @@ The TensorFlow community has created many great projects around TensorFlow, incl
 * [Rust language bindings](https://github.com/google/tensorflow-rust)
 * [Operator Vectorization Library](https://github.com/opveclib/opveclib)
 
+## TensorFlow Communities Around the World
+
+Asia:
+
+* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
+* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
+* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
+
+
+Europe:
+
+* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
+* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
+
+
 
 ## Support
 
@@ -45,3 +60,5 @@ please read the following list carefully:
     [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
     on GitHub.  For example, use the issue tracker to request a
     new operation in TensorFlow.
+    
+
diff --git a/tensorflow/docs_src/deploy/distributed.md b/tensorflow/docs_src/deploy/distributed.md
index bcc5b92db88c495d3ea933848dbd6c8ff09df185..99390f7416c87ea76fae1469797f53073ef77aca 100644
--- a/tensorflow/docs_src/deploy/distributed.md
+++ b/tensorflow/docs_src/deploy/distributed.md
@@ -2,7 +2,7 @@
 
 This document shows how to create a cluster of TensorFlow servers, and how to
 distribute a computation graph across that cluster. We assume that you are
-familiar with the @{$get_started$basic concepts} of
+familiar with the @{$get_started/get_started$basic concepts} of
 writing TensorFlow programs.
 
 ## Hello distributed TensorFlow!
@@ -178,7 +178,7 @@ simplify the work of specifying a replicated model. Possible approaches include:
   values for the current parameters, compute gradients in parallel, and then
   apply them together. It is compatible with in-graph replication (e.g. using
   gradient averaging as in the
-  [CIFAR-10 multi-GPU trainer](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_multi_gpu_train.py)),
+  [CIFAR-10 multi-GPU trainer](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py)),
   and between-graph replication (e.g. using the
   @{tf.train.SyncReplicasOptimizer}).
 
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 45f75305063f29d4a12826003e07c854276efe4d..a8c28e98c9b20c35f30192907b84abdbf4860c81 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -42,7 +42,7 @@ To incorporate your custom op you'll need to:
     Python @{tf.test.compute_gradient_error$gradient checker}.
     See
     [`relu_op_test.py`](https://www.tensorflow.org/code/tensorflow/python/kernel_tests/relu_op_test.py) as
-    an example that does tests the forward functions of Relu-like operators and
+    an example that tests the forward functions of Relu-like operators and
     their gradients.
 
 PREREQUISITES:
@@ -121,16 +121,16 @@ class ZeroOutOp : public OpKernel {
     Tensor* output_tensor = NULL;
     OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                      &output_tensor));
-    auto output = output_tensor->flat<int32>();
+    auto output_flat = output_tensor->flat<int32>();
 
     // Set all but the first element of the output tensor to 0.
     const int N = input.size();
     for (int i = 1; i < N; i++) {
-      output(i) = 0;
+      output_flat(i) = 0;
     }
 
     // Preserve the first input value if possible.
-    if (N > 0) output(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
   }
 };
 ```
@@ -152,6 +152,163 @@ REGISTER_KERNEL_BUILDER(Name("ZeroOut").Device(DEVICE_CPU), ZeroOutOp);
 >   Consider using a [`ResourceMgr`](https://www.tensorflow.org/code/tensorflow/core/framework/resource_mgr.h)
 >   to keep track of op state.
 
+### Multi-threaded CPU kernels
+
+To write a multi-threaded CPU kernel, the Shard function in
+[`work_sharder.h`](https://www.tensorflow.org/code/tensorflow/core/framework/work_sharder.h)
+can be used. This function shards a computation function across the
+threads configured to be used for intra-op threading (see
+intra_op_parallelism_threads in
+[`config.proto`](https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto)).
+
+### GPU kernels
+
+A GPU kernel is implemented in two parts: the OpKernel and the CUDA kernel and
+its launch code.
+
+Sometimes the OpKernel implementation is common between a CPU and GPU kernel,
+such as around inspecting inputs and allocating outputs.  In that case, a
+suggested implementation is to:
+
+1. Define the OpKernel templated on the Device and the primitive type of the
+   tensor.
+2. To do the actual computation of the output, the Compute function calls a
+    templated functor struct.
+3. The specialization of that functor for the CPUDevice is defined in the same
+   file, but the specialization for the GPUDevice is defined in a .cu.cc file,
+   since it will be compiled with the CUDA compiler.
+
+<!--zippy-->
+
+Expand this to see the example implementation.
+
+```c++
+// example.h
+#ifndef KERNEL_EXAMPLE_H_
+#define KERNEL_EXAMPLE_H_
+
+template <typename Device, typename T>
+struct ExampleFunctor {
+  void operator()(const Device& d, int size, const T* in, T* out);
+};
+
+#endif KERNEL_EXAMPLE_H_
+```
+
+```c++
+// example.cc
+#define EIGEN_USE_THREADS
+#include "example.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+using namespace tensorflow;
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+// CPU specialization of actual computation.
+template <typename T>
+struct ExampleFunctor<CPUDevice, T> {
+  void operator()(const CPUDevice& d, int size, const T* in, T* out) {
+    for (int i = 0; i < size; ++i) {
+      out[i] = 2 * in[i];
+    }
+  }
+};
+
+// OpKernel definition.
+// template parameter <T> is the datatype of the tensors.
+template <typename Device, typename T>
+class ExampleOp : public OpKernel {
+ public:
+  explicit ExampleOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Grab the input tensor
+    const Tensor& input_tensor = context->input(0);
+
+    // Create an output tensor
+    Tensor* output_tensor = NULL;
+    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
+                                                     &output_tensor));
+
+    // Do the computation.
+    OP_REQUIRES(context, input_tensor.NumElements() <= tensorflow::kint32max,
+                errors::InvalidArgument("Too many elements in tensor"));
+    ExampleFunctor<Device, T>()(
+        context->eigen_device<Device>(),
+        static_cast<int>(input_tensor.NumElements()),
+        input_tensor.flat<T>().data(),
+        output_tensor->flat<T>().data());
+  }
+};
+
+// Register the CPU kernels.
+#define REGISTER_CPU(T)                                          \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("Example").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      ExampleOp<CPUDevice, T>);
+REGISTER_CPU(float);
+REGISTER_CPU(int32);
+
+// Register the GPU kernels.
+#ifdef GOOGLE_CUDA
+#define REGISTER_GPU(T)                                          \
+  REGISTER_KERNEL_BUILDER(                                       \
+      Name("Example").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      ExampleOp<GPUDevice, T>);
+REGISTER_GPU(float);
+REGISTER_GPU(int32);
+#endif  // GOOGLE_CUDA
+```
+
+```c++
+#ifdef GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+#define EIGEN_USE_THREADS
+
+#include "example.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+using namespace tensorflow;
+
+#define EIGEN_USE_GPU
+
+// Define the CUDA kernel.
+template <typename T>
+__global__ void ExampleCudaKernel(const int size, const T* in, T* out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    out[i] = 2 * ldg(in + i);
+  }
+}
+
+// Define the GPU implementation that launches the CUDA kernel.
+template <typename T>
+struct ExampleFunctor<GPUDevice, T> {
+  void operator()(const GPUDevice& d, int size, const T* in, T* out) {
+    // Launch the cuda kernel.
+    //
+    // See core/util/cuda_kernel_helper.h for example of computing
+    // block count and thread_per_block count.
+    int block_count = 1024;
+    int thread_per_block = 20;
+    ExampleCudaKernel<T>
+        <<<block_count, thread_per_block, 0, d.stream()>>>(size, in, out);
+  }
+};
+
+// Instantiate functors for the types of OpKernels registered.
+typedef Eigen::GpuDevice GPUDevice;
+template struct ExampleFunctor<GPUDevice, float>;
+template struct ExampleFunctor<GPUDevice, int32>;
+
+#endif  // GOOGLE_CUDA
+```
+
+<!--endzippy-->
+
 ## Build the op library
 ### Compile the op using your system compiler (TensorFlow binary installation)
 
@@ -160,7 +317,7 @@ or `clang` available on your system. The binary PIP package installs the header
 files and the library that you need to compile your op in locations that are
 system specific. However, the TensorFlow python library provides the
 `get_include` function to get the header directory.
-Here is the output of this function on a Ubuntu machine.
+Here is the output of this function on an Ubuntu machine.
 
 ```bash
 $ python
@@ -182,13 +339,13 @@ g++ -std=c++11 -shared zero_out.cc -o zero_out.so -fPIC -I $TF_INC -O2
 On Mac OS X, the additional flag "-undefined dynamic_lookup" is required when
 building the `.so` file.
 
->   Note on gcc version 5: gcc5 uses the new C++
->   [ABI](https://gcc.gnu.org/gcc-5/changes.html#libstdcxx). The binary pip
->   packages available on the TensorFlow website are built with gcc4 that uses
->   the older ABI. If you compile your op library with gcc5, add
+>   Note on `gcc` version `>=5`: gcc uses the new C++
+>   [ABI](https://gcc.gnu.org/gcc-5/changes.html#libstdcxx) since version `5`. The binary pip
+>   packages available on the TensorFlow website are built with `gcc4` that uses
+>   the older ABI. If you compile your op library with `gcc>=5`, add
 >   `-D_GLIBCXX_USE_CXX11_ABI=0` to the command line to make the library
 >   compatible with the older abi.
->   Furthermore if you are using TensorFlow package created from source remember to add `-cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"`
+>   Furthermore if you are using TensorFlow package created from source remember to add `--cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"`
 >   as bazel command to compile the Python package.
 
 ### Compile the op using bazel (TensorFlow source installation)
@@ -225,7 +382,7 @@ TensorFlow Python API provides the
 load the dynamic library and register the op with the TensorFlow
 framework. `load_op_library` returns a Python module that contains the Python
 wrappers for the op and the kernel. Thus, once you have built the op, you can
-do the following to run it from Python :
+do the following to run it from Python:
 
 ```python
 import tensorflow as tf
@@ -1058,6 +1215,8 @@ you'll need to specify the path explicitly in the second (g++) command above.
 For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in
 `/usr/local/cuda-8.0`.
 
+>   Note in some linux settings, additional options to `nvcc` compiling step are needed. Add `-D_MWAITXINTRIN_H_INCLUDED` to the `nvcc` command line to avoid errors from `mwaitxintrin.h`.
+
 ### Implement the gradient in Python {#implement-gradient}
 
 Given a graph of ops, TensorFlow uses automatic differentiation
diff --git a/tensorflow/docs_src/extend/architecture.md b/tensorflow/docs_src/extend/architecture.md
index 085f74c0560c40de8b7b2b3e05217bcc057f4edc..21816502acec7abfca670cac1bceda3e29144b53 100644
--- a/tensorflow/docs_src/extend/architecture.md
+++ b/tensorflow/docs_src/extend/architecture.md
@@ -7,7 +7,7 @@ learning models and system-level optimizations.
 This document describes the system architecture that makes possible this
 combination of scale and flexibility. It assumes that you have basic familiarity
 with TensorFlow programming concepts such as the computation graph, operations,
-and sessions. See @{$get_started$Getting Started}
+and sessions. See @{$get_started/get_started$Getting Started}
 for an introduction to these topics. Some familiarity
 with @{$distributed$distributed TensorFlow}
 will also be helpful.
@@ -25,7 +25,7 @@ The TensorFlow runtime is a cross-platform library. Figure 1 illustrates its
 general architecture. A C API separates user level code in different languages
 from the core runtime.
 
-![TensorFlow Layers](../images/layers.png){: width="300"}
+![TensorFlow Layers](https://www.tensorflow.org/images/layers.png){: width="300"}
 
 **Figure 1**
 
@@ -57,7 +57,7 @@ Other tasks send updates to these parameters as they work on optimizing the
 parameters. This particular division of labor between tasks is not required, but
 it is common for distributed training.
 
-![TensorFlow Architecture Diagram](../images/diag1.svg){: width="500"}
+![TensorFlow Architecture Diagram](https://www.tensorflow.org/images/diag1.svg){: width="500"}
 
 **Figure 2**
 
@@ -91,7 +91,7 @@ In Figure 3, the client has built a graph that applies weights (w) to a
 feature vector (x), adds a bias term (b) and saves the result in a variable
 (s).
 
-![TensorFlow Architecture Diagram: Client](../images/graph_client.svg){: width="700"}
+![TensorFlow Architecture Diagram: Client](https://www.tensorflow.org/images/graph_client.svg){: width="700"}
 
 **Figure 3**
 
@@ -114,7 +114,7 @@ a step, it applies standard optimizations such as common subexpression
 elimination and constant folding. It then coordinates execution of the
 optimized subgraphs across a set of tasks.
 
-![TensorFlow Architecture Diagram: Master](../images/graph_master_cln.svg){: width="700"}
+![TensorFlow Architecture Diagram: Master](https://www.tensorflow.org/images/graph_master_cln.svg){: width="700"}
 
 **Figure 4**
 
@@ -123,7 +123,7 @@ Figure 5 shows a possible partition of our example graph. The distributed
 master has grouped the model parameters in order to place them together on the
 parameter server.
 
-![Partitioned Graph](../images/graph_split1.svg){: width="700"}
+![Partitioned Graph](https://www.tensorflow.org/images/graph_split1.svg){: width="700"}
 
 **Figure 5**
 
@@ -132,14 +132,14 @@ Where graph edges are cut by the partition, the distributed master inserts
 send and receive nodes to pass information between the distributed tasks
 (Figure 6).
 
-![Partitioned Graph](../images/graph_split2.svg){: width="700"}
+![Partitioned Graph](https://www.tensorflow.org/images/graph_split2.svg){: width="700"}
 
 **Figure 6**
 
 
 The distributed master then ships the graph pieces to the distributed tasks.
 
-![Partitioned Graph](../images/graph_workers_cln.svg){: width="700"}
+![Partitioned Graph](https://www.tensorflow.org/images/graph_workers_cln.svg){: width="700"}
 
 **Figure 7**
 
@@ -181,7 +181,7 @@ We also have preliminary support for NVIDIA's NCCL library for multi-GPU
 communication (see [`tf.contrib.nccl`](
 https://www.tensorflow.org/code/tensorflow/contrib/nccl/python/ops/nccl_ops.py)).
 
-![Partitioned Graph](../images/graph_send_recv.svg){: width="700"}
+![Partitioned Graph](https://www.tensorflow.org/images/graph_send_recv.svg){: width="700"}
 
 **Figure 8**
 
diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md
index 28f62e01ab020a3f5194a5e423c57f68e37f145a..f972ee5f50ba001347252a4ec8c054ed701aa6db 100644
--- a/tensorflow/docs_src/extend/estimators.md
+++ b/tensorflow/docs_src/extend/estimators.md
@@ -37,14 +37,17 @@ measurements. You'll learn how to do the following:
 ## Prerequisites
 
 This tutorial assumes you already know tf.contrib.learn API basics, such as
-feature columns and `fit()` operations. If you've never used tf.contrib.learn
-before, or need a refresher, you should first review the following tutorials:
+feature columns, input functions, and `fit()`/`evaluate()`/`predict()`
+operations. If you've never used tf.contrib.learn before, or need a refresher,
+you should first review the following tutorials:
 
 *   @{$tflearn$tf.contrib.learn Quickstart}: Quick introduction to
     training a neural network using tf.contrib.learn.
 *   @{$wide$TensorFlow Linear Model Tutorial}: Introduction to
     feature columns, and an overview on building a linear classifier in
     tf.contrib.learn.
+*   @{$input_fn$Building Input Functions with tf.contrib.learn}: Overview of how
+    to construct an input_fn to preprocess and feed data into your models.
 
 ## An Abalone Age Predictor {#abalone-predictor}
 
@@ -72,7 +75,7 @@ for abalone:
 
 The label to predict is number of rings, as a proxy for abalone age.
 
-![Abalone shell](../images/abalone_shell.jpg) **[“Abalone
+![Abalone shell](https://www.tensorflow.org/abalone_shell.jpg) **[“Abalone
 shell”](https://www.flickr.com/photos/thenickster/16641048623/) (by [Nicki Dugan
 Pogue](https://www.flickr.com/photos/thenickster/), CC BY-SA 2.0)**
 
@@ -239,7 +242,7 @@ nn = tf.contrib.learn.Estimator(
 *   `params`: An optional dict of hyperparameters (e.g., learning rate, dropout)
     that will be passed into the `model_fn`.
 
-NOTE: Just like `tf.contrib.learn`'s predefined regressors and classifiers, the
+Note: Just like `tf.contrib.learn`'s predefined regressors and classifiers, the
 `Estimator` initializer also accepts the general configuration arguments
 `model_dir` and `config`.
 
@@ -252,7 +255,7 @@ code (highlighted in bold below), right after the logging configuration:
 <strong># Learning rate for the model
 LEARNING_RATE = 0.001</strong></code></pre>
 
-NOTE: Here, `LEARNING_RATE` is set to `0.001`, but you can tune this value as
+Note: Here, `LEARNING_RATE` is set to `0.001`, but you can tune this value as
 needed to achieve the best results during model training.
 
 Then, add the following code to `main()`, which creates the dict `model_params`
@@ -576,7 +579,7 @@ required arguments:
         algorithm
         (@{tf.train.RMSPropOptimizer})
 
-NOTE: The `optimize_loss` function supports additional optional arguments to
+Note: The `optimize_loss` function supports additional optional arguments to
 further configure the optimizer, such as for implementing decay. See the
 @{tf.contrib.layers.optimize_loss$API docs} for more info.
 
@@ -654,15 +657,30 @@ Add the following code to the end of `main()` to fit the neural network to the
 training data and evaluate accuracy:
 
 ```python
+def get_train_inputs():
+  x = tf.constant(training_set.data)
+  y = tf.constant(training_set.target)
+  return x, y
+
 # Fit
-nn.fit(x=training_set.data, y=training_set.target, steps=5000)
+nn.fit(input_fn=get_train_inputs, steps=5000)
+
+def get_test_inputs():
+  x = tf.constant(test_set.data)
+  y = tf.constant(test_set.target)
+  return x, y
 
 # Score accuracy
-ev = nn.evaluate(x=test_set.data, y=test_set.target, steps=1)
+ev = nn.evaluate(input_fn=get_test_inputs, steps=1)
 print("Loss: %s" % ev["loss"])
 print("Root Mean Squared Error: %s" % ev["rmse"])
 ```
 
+Note: The above code uses input functions to feed feature (`x`) and label (`y`)
+`Tensor`s into the model for both training (`get_train_inputs()`) and evaluation
+(`get_test_inputs()`). To learn more about input functions, see the tutorial
+@{$input_fn$Building Input Functions with tf.contrib.learn}.
+
 Then run the code. You should see output like the following:
 
 ```none
diff --git a/tensorflow/docs_src/extend/tool_developers/index.md b/tensorflow/docs_src/extend/tool_developers/index.md
index 3705b310edb8d1e52296f8516dca54f6101e3d60..06fc5e70dd0e191730cc8469f4f9a457ba0abc23 100644
--- a/tensorflow/docs_src/extend/tool_developers/index.md
+++ b/tensorflow/docs_src/extend/tool_developers/index.md
@@ -63,7 +63,7 @@ There are actually two different formats that a ProtoBuf can be saved in.
 TextFormat is a human-readable form, which makes it nice for debugging and
 editing, but can get large when there's numerical data like weights stored in
 it. You can see a small example of that in
-[graph_run_run2.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run2.pbtxt).
+[graph_run_run2.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/demo/data/graph_run_run2.pbtxt).
 
 Binary format files are a lot smaller than their text equivalents, even though
 they're not as readable for us. In this script, we ask the user to supply a
diff --git a/tensorflow/docs_src/get_started/embedding_viz.md b/tensorflow/docs_src/get_started/embedding_viz.md
index 64042497035b8408ae39eb1b6df1a03e3b24b745..84245b11bea455c230c6c299706f6899479d4413 100644
--- a/tensorflow/docs_src/get_started/embedding_viz.md
+++ b/tensorflow/docs_src/get_started/embedding_viz.md
@@ -21,7 +21,7 @@ interested in word embeddings,
 gives a good introduction.
 
 <video autoplay loop style="max-width: 100%;">
-  <source src="../images/embedding-mnist.mp4" type="video/mp4">
+  <source src="https://www.tensorflow.org/images/embedding-mnist.mp4" type="video/mp4">
   Sorry, your browser doesn't support HTML5 video in MP4 format.
 </video>
 
@@ -39,7 +39,7 @@ labels/images to the data points. You can do this by generating a
 [metadata file](#metadata) containing the labels for each point and configuring
 the projector either by using our Python API, or manually constructing and
 saving a
-<code>[projector_config.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto)</code>
+<code>[projector_config.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/plugins/projector/projector_config.proto)</code>
 in the same directory as your checkpoint file.
 
 ## Setup
@@ -68,7 +68,7 @@ saver.save(session, os.path.join(LOG_DIR, "model.ckpt"), step)
 
 If you have any metadata (labels, images) associated with your embedding, you
 can tell TensorBoard about it either by directly storing a
-<code>[projector_config.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto)</code>
+<code>[projector_config.pbtxt](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/plugins/projector/projector_config.proto)</code>
 in the <code>LOG_DIR</code>, or use our python API.
 
 For instance, the following <code>projector_config.ptxt</code> associates the
@@ -91,7 +91,7 @@ N = 10000 # Number of items (vocab size).
 D = 200 # Dimensionality of the embedding.
 embedding_var = tf.Variable(tf.random_normal([N,D]), name='word_embedding')
 
-# Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
+# Format: tensorflow/tensorboard/plugins/projector/projector_config.proto
 config = projector.ProjectorConfig()
 
 # You can add multiple embeddings. Here we add only one.
@@ -173,7 +173,7 @@ last data point in the bottom right:
 
 Note in the example above that the last row doesn't have to be filled. For a
 concrete example of a sprite, see
-[this sprite image](../images/mnist_10k_sprite.png) of 10,000 MNIST digits
+[this sprite image](https://www.tensorflow.org/images/mnist_10k_sprite.png) of 10,000 MNIST digits
 (100x100).
 
 Note: We currently support sprites up to 8192px X 8192px.
@@ -247,7 +247,7 @@ further analysis on their own with the "Isolate Points" button in the Inspector
 pane on the right hand side.
 
 
-![Selection of nearest neighbors](../images/embedding-nearest-points.png "Selection of nearest neighbors")
+![Selection of nearest neighbors](https://www.tensorflow.org/images/embedding-nearest-points.png "Selection of nearest neighbors")
 *Selection of the nearest neighbors of “important” in a word embedding dataset.*
 
 The combination of filtering with custom projection can be powerful. Below, we filtered
@@ -260,10 +260,10 @@ You can see that on the right side we have “ideas”, “science”, “perspe
 <table width="100%;">
   <tr>
     <td style="width: 30%;">
-      <img src="../images/embedding-custom-controls.png" alt="Custom controls panel" title="Custom controls panel" />
+      <img src="https://www.tensorflow.org/images/embedding-custom-controls.png" alt="Custom controls panel" title="Custom controls panel" />
     </td>
     <td style="width: 70%;">
-      <img src="../images/embedding-custom-projection.png" alt="Custom projection" title="Custom projection" />
+      <img src="https://www.tensorflow.org/images/embedding-custom-projection.png" alt="Custom projection" title="Custom projection" />
     </td>
   </tr>
   <tr>
@@ -284,4 +284,4 @@ projection) as a small file. The Projector can then be pointed to a set of one
 or more of these files, producing the panel below. Other users can then walk
 through a sequence of bookmarks.
 
-<img src="../images/embedding-bookmark.png" alt="Bookmark panel" style="width:300px;">
+<img src="https://www.tensorflow.org/images/embedding-bookmark.png" alt="Bookmark panel" style="width:300px;">
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index 0047127530629ce82a9721fc8a0aa7b44723c528..00cc10cd347143f23b44549672c9bb7f56eaaac6 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -1,4 +1,3 @@
-
 # Getting Started With TensorFlow
 
 This guide gets you started programming in TensorFlow. Before using this guide,
@@ -50,7 +49,6 @@ The canonical import statement for TensorFlow programs is as follows:
 
 ```python
 import tensorflow as tf
-
 ```
 This gives Python access to all of TensorFlow's classes, methods, and symbols.
 Most of the documentation assumes you have already done this.
@@ -70,12 +68,15 @@ or more tensors as inputs and produces a tensor as an output. One type of node
 is a constant. Like all TensorFlow constants, it takes no inputs, and it outputs
 a value it stores internally. We can create two floating point Tensors `node1`
 and `node2` as follows:
+
 ```python
 node1 = tf.constant(3.0, dtype=tf.float32)
 node2 = tf.constant(4.0) # also tf.float32 implicitly
 print(node1, node2)
 ```
+
 The final print statement produces
+
 ```
 Tensor("Const:0", shape=(), dtype=float32) Tensor("Const_1:0", shape=(), dtype=float32)
 ```
@@ -94,7 +95,9 @@ running the computational graph in a session as follows:
 sess = tf.Session()
 print(sess.run([node1, node2]))
 ```
+
 we see the expected values of 3.0 and 4.0:
+
 ```
 [3.0, 4.0]
 ```
@@ -108,9 +111,11 @@ node3 = tf.add(node1, node2)
 print("node3: ", node3)
 print("sess.run(node3): ",sess.run(node3))
 ```
+
 The last two print statements produce
+
 ```
-node3:  Tensor("Add_2:0", shape=(), dtype=float32)
+node3:  Tensor("Add:0", shape=(), dtype=float32)
 sess.run(node3):  7.0
 ```
 
@@ -118,7 +123,7 @@ TensorFlow provides a utility called TensorBoard that can display a picture of
 the computational graph. Here is a screenshot showing how TensorBoard
 visualizes the graph:
 
-![TensorBoard screenshot](../images/getting_started_add.png)
+![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_add.png)
 
 As it stands, this graph is not especially interesting because it always
 produces a constant result. A graph can be parameterized to accept external
@@ -141,13 +146,15 @@ print(sess.run(adder_node, {a: 3, b:4.5}))
 print(sess.run(adder_node, {a: [1,3], b: [2, 4]}))
 ```
 resulting in the output
+
 ```
 7.5
 [ 3.  7.]
 ```
+
 In TensorBoard, the graph looks like this:
 
-![TensorBoard screenshot](../images/getting_started_adder.png)
+![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_adder.png)
 
 We can make the computational graph more complex by adding another operation.
 For example,
@@ -163,7 +170,7 @@ produces the output
 
 The preceding computational graph would look as follows in TensorBoard:
 
-![TensorBoard screenshot](../images/getting_started_triple.png)
+![TensorBoard screenshot](https://www.tensorflow.org/images/getting_started_triple.png)
 
 In machine learning we will typically want a model that can take arbitrary
 inputs, such as the one above.  To make the model trainable, we need to be able
@@ -316,7 +323,7 @@ for i in range(1000):
   sess.run(train, {x:x_train, y:y_train})
 
 # evaluate training accuracy
-curr_W, curr_b, curr_loss  = sess.run([W, b, loss], {x:x_train, y:y_train})
+curr_W, curr_b, curr_loss = sess.run([W, b, loss], {x:x_train, y:y_train})
 print("W: %s b: %s loss: %s"%(curr_W, curr_b, curr_loss))
 ```
 When run, it produces
@@ -324,8 +331,12 @@ When run, it produces
 W: [-0.9999969] b: [ 0.99999082] loss: 5.69997e-11
 ```
 
+Notice that the loss is a very small number (close to zero). If you run this
+program your loss will not be exactly the same, because the model is initialized
+with random values.
+
 This more complicated program can still be visualized in TensorBoard
-![TensorBoard final model visualization](../images/getting_started_final.png)
+![TensorBoard final model visualization](https://www.tensorflow.org/images/getting_started_final.png)
 
 ## `tf.contrib.learn`
 
@@ -361,25 +372,36 @@ features = [tf.contrib.layers.real_valued_column("x", dimension=1)]
 estimator = tf.contrib.learn.LinearRegressor(feature_columns=features)
 
 # TensorFlow provides many helper methods to read and set up data sets.
-# Here we use `numpy_input_fn`. We have to tell the function how many batches
+# Here we use two data sets: one for training and one for evaluation
+# We have to tell the function how many batches
 # of data (num_epochs) we want and how big each batch should be.
-x = np.array([1., 2., 3., 4.])
-y = np.array([0., -1., -2., -3.])
-input_fn = tf.contrib.learn.io.numpy_input_fn({"x":x}, y, batch_size=4,
+x_train = np.array([1., 2., 3., 4.])
+y_train = np.array([0., -1., -2., -3.])
+x_eval = np.array([2., 5., 8., 1.])
+y_eval = np.array([-1.01, -4.1, -7, 0.])
+input_fn = tf.contrib.learn.io.numpy_input_fn({"x":x_train}, y_train,
+                                              batch_size=4,
                                               num_epochs=1000)
+eval_input_fn = tf.contrib.learn.io.numpy_input_fn(
+    {"x":x_eval}, y_eval, batch_size=4, num_epochs=1000)
 
-# We can invoke 1000 training steps by invoking the `fit` method and passing the
+# We can invoke 1000 training steps by invoking the  method and passing the
 # training data set.
 estimator.fit(input_fn=input_fn, steps=1000)
 
-# Here we evaluate how well our model did. In a real example, we would want
-# to use a separate validation and testing data set to avoid overfitting.
-print(estimator.evaluate(input_fn=input_fn))
+# Here we evaluate how well our model did.
+train_loss = estimator.evaluate(input_fn=input_fn)
+eval_loss = estimator.evaluate(input_fn=eval_input_fn)
+print("train loss: %r"% train_loss)
+print("eval loss: %r"% eval_loss)
 ```
 When run, it produces
 ```
-    {'global_step': 1000, 'loss': 1.9650059e-11}
+    train loss: {'global_step': 1000, 'loss': 4.3049088e-08}
+    eval loss: {'global_step': 1000, 'loss': 0.0025487561}
 ```
+Notice how our eval data has a higher loss, but it is still close to zero.
+That means we are learning properly.
 
 ### A custom model
 
@@ -421,19 +443,25 @@ def model(features, labels, mode):
       train_op=train)
 
 estimator = tf.contrib.learn.Estimator(model_fn=model)
-# define our data set
-x = np.array([1., 2., 3., 4.])
-y = np.array([0., -1., -2., -3.])
-input_fn = tf.contrib.learn.io.numpy_input_fn({"x": x}, y, 4, num_epochs=1000)
+# define our data sets
+x_train = np.array([1., 2., 3., 4.])
+y_train = np.array([0., -1., -2., -3.])
+x_eval = np.array([2., 5., 8., 1.])
+y_eval = np.array([-1.01, -4.1, -7, 0.])
+input_fn = tf.contrib.learn.io.numpy_input_fn({"x": x_train}, y_train, 4, num_epochs=1000)
 
 # train
 estimator.fit(input_fn=input_fn, steps=1000)
-# evaluate our model
-print(estimator.evaluate(input_fn=input_fn, steps=10))
+# Here we evaluate how well our model did. 
+train_loss = estimator.evaluate(input_fn=input_fn)
+eval_loss = estimator.evaluate(input_fn=eval_input_fn)
+print("train loss: %r"% train_loss)
+print("eval loss: %r"% eval_loss)
 ```
 When run, it produces
-```python
-{'loss': 5.9819476e-11, 'global_step': 1000}
+```
+train loss: {'global_step': 1000, 'loss': 4.9380226e-11}
+eval loss: {'global_step': 1000, 'loss': 0.01010081}
 ```
 
 Notice how the contents of the custom `model()` function are very similar
diff --git a/tensorflow/docs_src/get_started/graph_viz.md b/tensorflow/docs_src/get_started/graph_viz.md
index b69103299ea151a948954b9598a36f9a4a12f969..06ec427b757d6a34270b646341786bc8925473d5 100644
--- a/tensorflow/docs_src/get_started/graph_viz.md
+++ b/tensorflow/docs_src/get_started/graph_viz.md
@@ -2,7 +2,7 @@
 
 TensorFlow computation graphs are powerful but complicated. The graph visualization can help you understand and debug them. Here's an example of the visualization at work.
 
-![Visualization of a TensorFlow graph](../images/graph_vis_animation.gif "Visualization of a TensorFlow graph")
+![Visualization of a TensorFlow graph](https://www.tensorflow.org/images/graph_vis_animation.gif "Visualization of a TensorFlow graph")
 *Visualization of a TensorFlow graph.*
 
 To see your own graph, run TensorBoard pointing it to the log directory of the job, click on the graph tab on the top pane and select the appropriate run using the menu at the upper left corner. For in depth information on how to run TensorBoard and make sure you are logging all the necessary information, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning}.
@@ -43,10 +43,10 @@ expanded states.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/pool1_collapsed.png" alt="Unexpanded name scope" title="Unexpanded name scope" />
+      <img src="https://www.tensorflow.org/images/pool1_collapsed.png" alt="Unexpanded name scope" title="Unexpanded name scope" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/pool1_expanded.png" alt="Expanded name scope" title="Expanded name scope" />
+      <img src="https://www.tensorflow.org/images/pool1_expanded.png" alt="Expanded name scope" title="Expanded name scope" />
     </td>
   </tr>
   <tr>
@@ -87,10 +87,10 @@ and the auxiliary area.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/conv_1.png" alt="conv_1 is part of the main graph" title="conv_1 is part of the main graph" />
+      <img src="https://www.tensorflow.org/images/conv_1.png" alt="conv_1 is part of the main graph" title="conv_1 is part of the main graph" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/save.png" alt="save is extracted as auxiliary node" title="save is extracted as auxiliary node" />
+      <img src="https://www.tensorflow.org/images/save.png" alt="save is extracted as auxiliary node" title="save is extracted as auxiliary node" />
     </td>
   </tr>
   <tr>
@@ -114,10 +114,10 @@ specific set of nodes.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/series.png" alt="Sequence of nodes" title="Sequence of nodes" />
+      <img src="https://www.tensorflow.org/images/series.png" alt="Sequence of nodes" title="Sequence of nodes" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/series_expanded.png" alt="Expanded sequence of nodes" title="Expanded sequence of nodes" />
+      <img src="https://www.tensorflow.org/images/series_expanded.png" alt="Expanded sequence of nodes" title="Expanded sequence of nodes" />
     </td>
   </tr>
   <tr>
@@ -135,15 +135,15 @@ for constants and summary nodes. To summarize, here's a table of node symbols:
 
 Symbol | Meaning
 --- | ---
-![Name scope](../images/namespace_node.png "Name scope") | *High-level* node representing a name scope. Double-click to expand a high-level node.
-![Sequence of unconnected nodes](../images/horizontal_stack.png "Sequence of unconnected nodes") | Sequence of numbered nodes that are not connected to each other.
-![Sequence of connected nodes](../images/vertical_stack.png "Sequence of connected nodes") | Sequence of numbered nodes that are connected to each other.
-![Operation node](../images/op_node.png "Operation node") | An individual operation node.
-![Constant node](../images/constant.png "Constant node") | A constant.
-![Summary node](../images/summary.png "Summary node") | A summary node.
-![Data flow edge](../images/dataflow_edge.png "Data flow edge") | Edge showing the data flow between operations.
-![Control dependency edge](../images/control_edge.png "Control dependency edge") | Edge showing the control dependency between operations.
-![Reference edge](../images/reference_edge.png "Reference edge") | A reference edge showing that the outgoing operation node can mutate the incoming tensor.
+![Name scope](https://www.tensorflow.org/images/namespace_node.png "Name scope") | *High-level* node representing a name scope. Double-click to expand a high-level node.
+![Sequence of unconnected nodes](https://www.tensorflow.org/images/horizontal_stack.png "Sequence of unconnected nodes") | Sequence of numbered nodes that are not connected to each other.
+![Sequence of connected nodes](https://www.tensorflow.org/images/vertical_stack.png "Sequence of connected nodes") | Sequence of numbered nodes that are connected to each other.
+![Operation node](https://www.tensorflow.org/images/op_node.png "Operation node") | An individual operation node.
+![Constant node](https://www.tensorflow.org/images/constant.png "Constant node") | A constant.
+![Summary node](https://www.tensorflow.org/images/summary.png "Summary node") | A summary node.
+![Data flow edge](https://www.tensorflow.org/images/dataflow_edge.png "Data flow edge") | Edge showing the data flow between operations.
+![Control dependency edge](https://www.tensorflow.org/images/control_edge.png "Control dependency edge") | Edge showing the control dependency between operations.
+![Reference edge](https://www.tensorflow.org/images/reference_edge.png "Reference edge") | A reference edge showing that the outgoing operation node can mutate the incoming tensor.
 
 ## Interaction {#interaction}
 
@@ -161,10 +161,10 @@ right corner of the visualization.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/infocard.png" alt="Info card of a name scope" title="Info card of a name scope" />
+      <img src="https://www.tensorflow.org/images/infocard.png" alt="Info card of a name scope" title="Info card of a name scope" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/infocard_op.png" alt="Info card of operation node" title="Info card of operation node" />
+      <img src="https://www.tensorflow.org/images/infocard_op.png" alt="Info card of operation node" title="Info card of operation node" />
     </td>
   </tr>
   <tr>
@@ -207,10 +207,10 @@ The images below give an illustration for a piece of a real-life graph.
 <table width="100%;">
   <tr>
     <td style="width: 50%;">
-      <img src="../images/colorby_structure.png" alt="Color by structure" title="Color by structure" />
+      <img src="https://www.tensorflow.org/images/colorby_structure.png" alt="Color by structure" title="Color by structure" />
     </td>
     <td style="width: 50%;">
-      <img src="../images/colorby_device.png" alt="Color by device" title="Color by device" />
+      <img src="https://www.tensorflow.org/images/colorby_device.png" alt="Color by device" title="Color by device" />
     </td>
   </tr>
   <tr>
@@ -233,7 +233,7 @@ The images below show the CIFAR-10 model with tensor shape information:
 <table width="100%;">
   <tr>
     <td style="width: 100%;">
-      <img src="../images/tensor_shapes.png" alt="CIFAR-10 model with tensor shape information" title="CIFAR-10 model with tensor shape information" />
+      <img src="https://www.tensorflow.org/images/tensor_shapes.png" alt="CIFAR-10 model with tensor shape information" title="CIFAR-10 model with tensor shape information" />
     </td>
   </tr>
   <tr>
@@ -303,13 +303,13 @@ tensor output sizes.
 <table width="100%;">
   <tr style="height: 380px">
     <td>
-      <img src="../images/colorby_compute_time.png" alt="Color by compute time" title="Color by compute time"/>
+      <img src="https://www.tensorflow.org/images/colorby_compute_time.png" alt="Color by compute time" title="Color by compute time"/>
     </td>
     <td>
-      <img src="../images/run_metadata_graph.png" alt="Run metadata graph" title="Run metadata graph" />
+      <img src="https://www.tensorflow.org/images/run_metadata_graph.png" alt="Run metadata graph" title="Run metadata graph" />
     </td>
     <td>
-      <img src="../images/run_metadata_infocard.png" alt="Run metadata info card" title="Run metadata info card" />
+      <img src="https://www.tensorflow.org/images/run_metadata_infocard.png" alt="Run metadata info card" title="Run metadata info card" />
     </td>
   </tr>
 </table>
diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md
index 74ed5fbebff26c3c31af533acaabab82d6a8bc05..a053617b5895bd2a92784e64b4dfd6f1ac35ab53 100644
--- a/tensorflow/docs_src/get_started/input_fn.md
+++ b/tensorflow/docs_src/get_started/input_fn.md
@@ -12,7 +12,7 @@ When training a neural network using tf.contrib.learn, it's possible to pass
 your feature and target data directly into your `fit`, `evaluate`, or `predict`
 operations. Here's an example taken from the @{$tflearn$tf.contrib.learn quickstart tutorial}:
 
-```py
+```python
 training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
     filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
 test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index d688d27ae2ce119f9edd27a184c1e1e8b8c1f40c..812f248d3ebfdf7439d9324b47825c2facf951c2 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -9,3 +9,4 @@ monitors.md
 summaries_and_tensorboard.md
 embedding_viz.md
 graph_viz.md
+tensorboard_histograms.md
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
index f6d6b230b39c4aaef7ec5f7d700d34d137dd07f3..624d91647484bb0adf85b47179c2ac686ffc890f 100644
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ b/tensorflow/docs_src/get_started/mnist/beginners.md
@@ -15,7 +15,7 @@ MNIST is a simple computer vision dataset. It consists of images of handwritten
 digits like these:
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/MNIST.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/MNIST.png">
 </div>
 
 It also includes labels for each image, telling us which digit it is. For
@@ -88,7 +88,7 @@ Each image is 28 pixels by 28 pixels. We can interpret this as a big array of
 numbers:
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/MNIST-Matrix.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/MNIST-Matrix.png">
 </div>
 
 We can flatten this array into a vector of 28x28 = 784 numbers. It doesn't
@@ -110,7 +110,7 @@ Each entry in the tensor is a pixel intensity between 0 and 1, for a particular
 pixel in a particular image.
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/mnist-train-xs.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/mnist-train-xs.png">
 </div>
 
 Each image in MNIST has a corresponding label, a number between 0 and 9
@@ -124,7 +124,7 @@ vector which is 1 in the \\(n\\)th dimension. For example, 3 would be
 `[55000, 10]` array of floats.
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/mnist-train-ys.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/mnist-train-ys.png">
 </div>
 
 We're now ready to actually make our model!
@@ -157,7 +157,7 @@ classes. Red represents negative weights, while blue represents positive
 weights.
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/softmax-weights.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-weights.png">
 </div>
 
 We also add some extra evidence called a bias. Basically, we want to be able
@@ -202,13 +202,14 @@ although with a lot more \\(x\\)s. For each output, we compute a weighted sum of
 the \\(x\\)s, add a bias, and then apply softmax.
 
 <div style="width:55%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/softmax-regression-scalargraph.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-scalargraph.png">
 </div>
 
 If we write that out as equations, we get:
 
 <div style="width:52%; margin-left:25%; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/softmax-regression-scalarequation.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-scalarequation.png"
+   alt="[y1, y2, y3] = softmax(W11*x1 + W12*x2 + W13*x3 + b1,  W21*x1 + W22*x2 + W23*x3 + b2,  W31*x1 + W32*x2 + W33*x3 + b3)">
 </div>
 
 We can "vectorize" this procedure, turning it into a matrix multiplication
@@ -216,7 +217,8 @@ and vector addition. This is helpful for computational efficiency. (It's also
 a useful way to think.)
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../../images/softmax-regression-vectorequation.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-regression-vectorequation.png"
+ alt="[y1, y2, y3] = softmax([[W11, W12, W13], [W21, W22, W23], [W31, W32, W33]]*[x1, x2, x3] + [b1, b2, b3])">
 </div>
 
 More compactly, we can just write:
@@ -360,7 +362,7 @@ minimize. Then it can apply your choice of optimization algorithm to modify the
 variables and reduce the loss.
 
 ```python
-train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
+train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)
 ```
 
 In this case, we ask TensorFlow to minimize `cross_entropy` using the
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
index b55a5c19ff9db1b407924d52b11079b4b16f5ff8..48d9a395f2859e81cf9627f37ce2677f3479ce22 100644
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@@ -34,7 +34,7 @@ MNIST is a classic problem in machine learning. The problem is to look at
 greyscale 28x28 pixel images of handwritten digits and determine which digit
 the image represents, for all the digits from zero to nine.
 
-![MNIST Digits](../../images/mnist_digits.png "MNIST Digits")
+![MNIST Digits](https://www.tensorflow.org/images/mnist_digits.png "MNIST Digits")
 
 For more information, refer to [Yann LeCun's MNIST page](http://yann.lecun.com/exdb/mnist/)
 or [Chris Olah's visualizations of MNIST](http://colah.github.io/posts/2014-10-Visualizing-MNIST/).
@@ -90,7 +90,7 @@ loss.
 and apply gradients.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/mnist_subgraph.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/mnist_subgraph.png">
 </div>
 
 ### Inference
@@ -384,7 +384,7 @@ summary_writer.add_summary(summary_str, step)
 When the events files are written, TensorBoard may be run against the training
 folder to display the values from the summaries.
 
-![MNIST TensorBoard](../../images/mnist_tensorboard.png "MNIST TensorBoard")
+![MNIST TensorBoard](https://www.tensorflow.org/images/mnist_tensorboard.png "MNIST TensorBoard")
 
 **NOTE**: For more info about how to build and run Tensorboard, please see the accompanying tutorial @{$summaries_and_tensorboard$Tensorboard: Visualizing Learning}.
 
diff --git a/tensorflow/docs_src/get_started/mnist/pros.md b/tensorflow/docs_src/get_started/mnist/pros.md
index 6f0ba492d309d99a2955323873f4d516eb6b740d..5dbb00c0b528eb77e40045dcc56fd4901692418e 100644
--- a/tensorflow/docs_src/get_started/mnist/pros.md
+++ b/tensorflow/docs_src/get_started/mnist/pros.md
@@ -65,12 +65,12 @@ programs is to first create a graph and then launch it in a session.
 Here we instead use the convenient `InteractiveSession` class, which makes
 TensorFlow more flexible about how you structure your code.  It allows you to
 interleave operations which build a
-@{$get_started#the_computational_graph$computation graph}
+@{$get_started/get_started#the_computational_graph$computation graph}
 with ones that run the graph.  This is particularly convenient when working in
 interactive contexts like IPython.  If you are not using an
 `InteractiveSession`, then you should build the entire computation graph before
 starting a session and
-@{$get_started#the_computational_graph$launching the graph}.
+@{$get_started/get_started#the_computational_graph$launching the graph}.
 
 ```python
 import tensorflow as tf
@@ -95,8 +95,8 @@ similar to that used in Theano or Torch.
 
 The role of the Python code is therefore to build this external computation
 graph, and to dictate which parts of the computation graph should be run. See
-the @{$get_started#the_computational_graph$Computation Graph}
-section of @{$get_started} for more detail.
+the @{$get_started/get_started#the_computational_graph$Computation Graph}
+section of @{$get_started/get_started} for more detail.
 
 ## Build a Softmax Regression Model
 
diff --git a/tensorflow/docs_src/get_started/monitors.md b/tensorflow/docs_src/get_started/monitors.md
index 99d583b23dc988ce16c23532363836452e776b1a..d9c605b013cca5e4bad21fd7167a0cca345c3251 100644
--- a/tensorflow/docs_src/get_started/monitors.md
+++ b/tensorflow/docs_src/get_started/monitors.md
@@ -65,7 +65,7 @@ if __name__ == "__main__":
 
 Copy the above code into a file, and download the corresponding
 [training](http://download.tensorflow.org/data/iris_training.csv) and
-@{tf.test} data sets to the same
+[test](http://download.tensorflow.org/data/iris_test.csv) data sets to the same
 directory.
 
 In the following sections, you'll progressively make updates to the above code
@@ -282,18 +282,15 @@ validation_metrics = {
     "accuracy":
         tf.contrib.learn.MetricSpec(
             metric_fn=tf.contrib.metrics.streaming_accuracy,
-            prediction_key=tf.contrib.learn.prediction_key.PredictionKey.
-            CLASSES),
+            prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
     "precision":
         tf.contrib.learn.MetricSpec(
             metric_fn=tf.contrib.metrics.streaming_precision,
-            prediction_key=tf.contrib.learn.prediction_key.PredictionKey.
-            CLASSES),
+            prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
     "recall":
         tf.contrib.learn.MetricSpec(
             metric_fn=tf.contrib.metrics.streaming_recall,
-            prediction_key=tf.contrib.learn.prediction_key.PredictionKey.
-            CLASSES)
+            prediction_key=tf.contrib.learn.PredictionKey.CLASSES)
 }
 ```
 
@@ -404,6 +401,6 @@ Then navigate to `http://0.0.0.0:`*`<port_number>`* in your browser, where
 If you click on the accuracy field, you'll see an image like the following,
 which shows accuracy plotted against step count:
 
-![Accuracy over step count in TensorBoard](../images/validation_monitor_tensorboard_accuracy.png "Accuracy over step count in TensorBoard")
+![Accuracy over step count in TensorBoard](https://www.tensorflow.org/images/validation_monitor_tensorboard_accuracy.png "Accuracy over step count in TensorBoard")
 
 For more on using TensorBoard, see @{$summaries_and_tensorboard$TensorBoard: Visualizing Learning} and @{$graph_viz$TensorBoard: Graph Visualization}.
diff --git a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
index 6e06c9e41e4c16c370584f4402d42238adddebae..45d43e7a6e76ef9adc95cf2ebe5fe346de22caee 100644
--- a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
@@ -8,7 +8,7 @@ your TensorFlow graph, plot quantitative metrics about the execution of your
 graph, and show additional data like images that pass through it. When
 TensorBoard is fully configured, it looks like this:
 
-![MNIST TensorBoard](../images/mnist_tensorboard.png "MNIST TensorBoard")
+![MNIST TensorBoard](https://www.tensorflow.org/images/mnist_tensorboard.png "MNIST TensorBoard")
 
 <div class="video-wrapper">
   <iframe class="devsite-embedded-youtube-video" data-video-id="eBbEDRsCmv4"
diff --git a/tensorflow/docs_src/get_started/tensorboard_histograms.md b/tensorflow/docs_src/get_started/tensorboard_histograms.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3dd13497eb598d7e86efae5529396bd472edc31
--- /dev/null
+++ b/tensorflow/docs_src/get_started/tensorboard_histograms.md
@@ -0,0 +1,243 @@
+# TensorBoard Histogram Dashboard
+
+The TensorBoard Histogram Dashboard displays how the distribution of some
+`Tensor` in your TensorFlow graph has changed over time. It does this by showing
+many histograms visualizations of your tensor at different points in time.
+
+## A Basic Example
+
+Let's start with a simple case: a normally-distributed variable, where the mean
+shifts over time.
+TensorFlow has an op
+[`tf.random_normal`](https://www.tensorflow.org/api_docs/python/tf/random_normal)
+which is perfect for this purpose. As is usually the case with TensorBoard, we
+will ingest data using a summary op; in this case,
+['tf.summary.histogram'](https://www.tensorflow.org/api_docs/python/tf/summary/histogram).
+For a primer on how summaries work, please see the general
+[TensorBoard tutorial](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
+
+Here is a code snippet that will generate some histogram summaries containing
+normally distributed data, where the mean of the distribution increases over
+time.
+
+```python
+import tensorflow as tf
+
+k = tf.placeholder(tf.float32)
+
+# Make a normal distribution, with a shifting mean
+mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1)
+# Record that distribution into a histogram summary
+tf.summary.histogram("normal/moving_mean", mean_moving_normal)
+
+# Setup a session and summary writer
+sess = tf.Session()
+writer = tf.summary.FileWriter("/tmp/histogram_example")
+
+# Setup a loop and write the summaries to disk
+N = 400
+for step in range(N):
+  k_val = step/float(N)
+  summ = sess.run(summaries, feed_dict={k: k_val})
+  writer.add_summary(summ, global_step=step)
+```
+
+Once that code runs, we can load the data into TensorBoard via the command line:
+
+
+```sh
+tensorboard --logdir=/tmp/histogram_example
+```
+
+Once TensorBoard is running, load it in Chrome or Firefox and navigate to the
+Histogram Dashboard. Then we can see a histogram visualization for our normally
+distributed data.
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/1_moving_mean.png)
+
+`tf.summary.histogram` takes an arbitrarily sized and shaped Tensor, and
+compresses it into a histogram data structure consisting of many bins with
+widths and counts. For example, let's say we want to organize the numbers
+`[0.5, 1.1, 1.3, 2.2, 2.9, 2.99]` into bins. We could make three bins:
+* a bin
+containing everything from 0 to 1 (it would contain one element, 0.5),
+* a bin
+containing everything from 1-2 (it would contain two elements, 1.1 and 1.3),
+* a bin containing everything from 2-3 (it would contain three elements: 2.2,
+2.9 and 2.99).
+
+TensorFlow uses a similar approach to create bins, but unlike in our example, it
+doesn't create integer bins. For large, sparse datasets, that might result in
+many thousands of bins.
+Instead, [the bins are exponentially distributed, with many bins close to 0 and
+comparatively few bins for very large numbers.](https://github.com/tensorflow/tensorflow/blob/c8b59c046895fa5b6d79f73e0b5817330fcfbfc1/tensorflow/core/lib/histogram/histogram.cc#L28)
+However, visualizing exponentially-distributed bins is tricky; if height is used
+to encode count, then wider bins take more space, even if they have the same
+number of elements. Conversely, encoding count in the area makes height
+comparisons impossible. Instead, the histograms [resample the data](https://github.com/tensorflow/tensorflow/blob/17c47804b86e340203d451125a721310033710f1/tensorflow/tensorboard/components/tf_backend/backend.ts#L400)
+into uniform bins. This can lead to unfortunate artifacts in some cases.
+
+Each slice in the histogram visualizer displays a single histogram.
+The slices are organized by step;
+older slices (e.g. step 0) are further "back" and darker, while newer slices
+(e.g. step 400) are close to the foreground, and lighter in color.
+The y-axis on the right shows the step number.
+
+You can mouse over the histogram to see tooltips with some more detailed
+information. For example, in the following image we can see that the histogram
+at timestep 176 has a bin centered at 2.25 with 177 elements in that bin.
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/2_moving_mean_tooltip.png)
+
+Also, you may note that the histogram slices are not always evenly spaced in
+step count or time. This is because TensorBoard uses
+[reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling) to keep a
+subset of all the histograms, to save on memory. Reservoir sampling guarantees
+that every sample has an equal likelihood of being included, but because it is
+a randomized algorithm, the samples chosen don't occur at even steps.
+
+## Overlay Mode
+
+There is a control on the left of the dashboard that allows you to toggle the
+histogram mode from "offset" to "overlay":
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/3_overlay_offset.png)
+
+In "offset" mode, the visualization rotates 45 degrees, so that the individual
+histogram slices are no longer spread out in time, but instead are all plotted
+on the same y-axis.
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/4_overlay.png)
+Now, each slice is a separate line on the chart, and the y-axis shows the item
+count within each bucket. Darker lines are older, earlier steps, and lighter
+lines are more recent, later steps. Once again, you can mouse over the chart to
+see some additional information.
+
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/5_overlay_tooltips.png)
+
+In general, the overlay visualization is useful if you want to directly compare
+the counts of different histograms.
+
+## Multimodal Distributions
+
+The Histogram Dashboard is great for visualizing multimodal
+distributions. Let's construct a simple bimodal distribution by concatenating
+the outputs from two different normal distributions. The code will look like
+this:
+
+```python
+import tensorflow as tf
+
+k = tf.placeholder(tf.float32)
+
+# Make a normal distribution, with a shifting mean
+mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1)
+# Record that distribution into a histogram summary
+tf.summary.histogram("normal/moving_mean", mean_moving_normal)
+
+# Make a normal distribution with shrinking variance
+variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k))
+# Record that distribution too
+tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal)
+
+# Let's combine both of those distributions into one dataset
+normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0)
+# We add another histogram summary to record the combined distribution
+tf.summary.histogram("normal/bimodal", normal_combined)
+
+summaries = tf.summary.merge_all()
+
+# Setup a session and summary writer
+sess = tf.Session()
+writer = tf.summary.FileWriter("/tmp/histogram_example")
+
+# Setup a loop and write the summaries to disk
+N = 400
+for step in range(N):
+  k_val = step/float(N)
+  summ = sess.run(summaries, feed_dict={k: k_val})
+  writer.add_summary(summ, global_step=step)
+```
+
+You already remember our "moving mean" normal distribution from the example
+above. Now we also have a "shrinking variance" distribution. Side-by-side, they
+look like this:
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/6_two_distributions.png)
+
+When we concatenate them, we get a chart that clearly reveals the divergent,
+bimodal structure:
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/7_bimodal.png)
+
+## Some more distributions
+
+Just for fun, let's generate and visualize a few more distributions, and then
+combine them all into one chart. Here's the code we'll use:
+
+```python
+import tensorflow as tf
+
+k = tf.placeholder(tf.float32)
+
+# Make a normal distribution, with a shifting mean
+mean_moving_normal = tf.random_normal(shape=[1000], mean=(5*k), stddev=1)
+# Record that distribution into a histogram summary
+tf.summary.histogram("normal/moving_mean", mean_moving_normal)
+
+# Make a normal distribution with shrinking variance
+variance_shrinking_normal = tf.random_normal(shape=[1000], mean=0, stddev=1-(k))
+# Record that distribution too
+tf.summary.histogram("normal/shrinking_variance", variance_shrinking_normal)
+
+# Let's combine both of those distributions into one dataset
+normal_combined = tf.concat([mean_moving_normal, variance_shrinking_normal], 0)
+# We add another histogram summary to record the combined distribution
+tf.summary.histogram("normal/bimodal", normal_combined)
+
+# Add a gamma distribution
+gamma = tf.random_gamma(shape=[1000], alpha=k)
+tf.summary.histogram("gamma", gamma)
+
+# And a poisson distribution
+poisson = tf.random_poisson(shape=[1000], lam=k)
+tf.summary.histogram("poisson", poisson)
+
+# And a uniform distribution
+uniform = tf.random_uniform(shape=[1000], maxval=k*10)
+tf.summary.histogram("uniform", uniform)
+
+# Finally, combine everything together!
+all_distributions = [mean_moving_normal, variance_shrinking_normal,
+                     gamma, poisson, uniform]
+all_combined = tf.concat(all_distributions, 0)
+tf.summary.histogram("all_combined", all_combined)
+
+summaries = tf.summary.merge_all()
+
+# Setup a session and summary writer
+sess = tf.Session()
+writer = tf.summary.FileWriter("/tmp/histogram_example")
+
+# Setup a loop and write the summaries to disk
+N = 400
+for step in range(N):
+  k_val = step/float(N)
+  summ = sess.run(summaries, feed_dict={k: k_val})
+  writer.add_summary(summ, global_step=step)
+```
+### Gamma Distribution
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/8_gamma.png)
+
+### Uniform Distribution
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/9_uniform.png)
+
+### Poisson Distribution
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/10_poisson.png)
+The poisson distribution is defined over the integers. So, all of the values
+being generated are perfect integers. The histogram compression moves the data
+into floating-point bins, causing the visualization to show little
+bumps over the integer values rather than perfect spikes.
+
+### All Together Now
+Finally, we can concatenate all of the data into one funny-looking curve.
+![](https://www.tensorflow.org/images/tensorboard/histogram_dashboard/11_all_combined.png)
+
diff --git a/tensorflow/docs_src/get_started/tflearn.md b/tensorflow/docs_src/get_started/tflearn.md
index 0912c7a5b4ab243640e653c1d16c3b34d9d2e3c0..ed21969b3e9fe98428c19da6baa822a5395abd3e 100644
--- a/tensorflow/docs_src/get_started/tflearn.md
+++ b/tensorflow/docs_src/get_started/tflearn.md
@@ -118,7 +118,7 @@ The [Iris data set](https://en.wikipedia.org/wiki/Iris_flower_data_set) contains
 150 rows of data, comprising 50 samples from each of three related Iris species:
 *Iris setosa*, *Iris virginica*, and *Iris versicolor*.
 
-![Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor](../images/iris_three_species.jpg) **From left to right,
+![Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor](https://www.tensorflow.org/images/iris_three_species.jpg) **From left to right,
 [*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by
 [Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0),
 [*Iris versicolor*](https://commons.wikimedia.org/w/index.php?curid=248095) (by
@@ -278,11 +278,11 @@ Then, the code creates a `DNNClassifier` model using the following arguments:
 
 The `tf.contrib.learn` API uses input functions, which create the TensorFlow
 operations that generate data for the model. In this case, the data is small
-enough that it can be stored in @{tf.constant TensorFlow constants}. The
+enough that it can be stored in @{tf.constant$TensorFlow constants}. The
 following code produces the simplest possible input pipeline:
 
 ```python
-# Define the test inputs
+# Define the training inputs
 def get_train_inputs():
   x = tf.constant(training_set.data)
   y = tf.constant(training_set.target)
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 56c4b8d2c58d64a62fced80963a1df0ccb0a08cc..72d0c7b1ff49a674eac5de1a35379d8452e793f9 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -50,7 +50,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
          <project>
              <modelVersion>4.0.0</modelVersion>
              <groupId>org.myorg</groupId>
-             <artifactId>label-image</artifactId>
+             <artifactId>hellotf</artifactId>
              <version>1.0-SNAPSHOT</version>
              <properties>
                <exec.mainClass>HelloTF</exec.mainClass>
@@ -105,8 +105,8 @@ As an example, these steps will create a Maven project that uses TensorFlow:
      <b>mvn -q compile exec:java</b></pre>
 
 
-The preceeding command should output <tt>Hello from <i>version</i></tt>. If it
-does, you've succesfully set up TensorFlow for Java and are ready to use it in
+The preceding command should output <tt>Hello from <i>version</i></tt>. If it
+does, you've successfully set up TensorFlow for Java and are ready to use it in
 Maven projects. If not, check
 [Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow)
 for possible solutions.  You can skip reading the rest of this document.
@@ -211,15 +211,20 @@ two files are available to the JVM:
   * the downloaded `.jar` file
   * the extracted JNI library
 
-For example, the following command line executes the `HelloTF` program:
+For example, the following command line executes the `HelloTF` program on Linux
+and Mac OS X:
 
 <pre><b>java -cp libtensorflow-1.1.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
+And the following comand line executes the `HelloTF` program on Windows:
+
+<pre><b>java -cp libtensorflow-1.1.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
 outputs something else, check
-[Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow)
-for possible solutions.
+[Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow) for
+possible solutions.
 
 
 ### Advanced Example
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 80331e7ea8d95d19dfc646cf0cde653daea7d004..47c8fc77ee62779ad0f2f99804b31bb62f8f5248 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -137,7 +137,11 @@ Take the following steps to install TensorFlow with Virtualenv:
 
      <pre> (tensorflow)$ </pre>
 
-  4. Issue one of the following commands to install TensorFlow in the active
+  4. Ensure pip ≥8.1 is installed:
+
+     <pre> (tensorflow)$ <b>easy_install -U pip</b></pre>
+
+  5. Issue one of the following commands to install TensorFlow in the active
      virtualenv environment:
 
      <pre> (tensorflow)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
@@ -148,26 +152,6 @@ Take the following steps to install TensorFlow with Virtualenv:
      If the preceding command succeeds, skip Step 5. If the preceding
      command fails, perform Step 5.
 
-  5. (Optional) If Step 4 failed (typically because you invoked a pip version
-     lower than 8.1), install TensorFlow in the active virtualenv environment
-     by issuing a command of the following format:
-
-     <pre> (tensorflow)$ <b>pip install --upgrade</b> <i>tfBinaryURL</i>   # Python 2.7
-     (tensorflow)$ <b>pip3 install --upgrade</b> <i>tfBinaryURL</i>  # Python 3.n </pre>
-
-     where <code><em>tfBinaryURL</em></code> identifies the URL of the
-     TensorFlow Python package. The appropriate value of
-     <code><em>tfBinaryURL</em></code>depends on the operating system,
-     Python version, and GPU support. Find the appropriate value for
-     <code><em>tfBinaryURL</em></code> for your system
-     [here](#the_url_of_the_tensorflow_python_package).  For example, if you
-     are installing TensorFlow for Linux, Python 2.7, and CPU-only support,
-     issue the following command to install TensorFlow in the active
-     virtualenv environment:
-
-     <pre> (tensorflow)$ <b>pip install --upgrade \\
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.0.1-cp27-none-linux_x86_64.whl</b></pre>
-
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
 
@@ -269,8 +253,10 @@ take the following steps:
      install TensorFlow for Linux, Python 2.7, and CPU-only support, issue
      the following command:
 
-     <pre> $ <b>sudo pip install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.0.1-cp27-none-linux_x86_64.whl</b></pre>
+     <pre>
+     $ <b>sudo pip3 install --upgrade \
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.1.0-cp34-cp34m-linux_x86_64.whl</b>
+     </pre>
 
      If this step fails, see
      [Common Installation Problems](#common_installation_problems).
@@ -336,9 +322,9 @@ where:
     * <tt>gcr.io/tensorflow/tensorflow:latest-devel</tt>, which is the latest
       TensorFlow CPU Binary image plus source code.
     * <tt>gcr.io/tensorflow/tensorflow:<i>version</i></tt>, which is the
-      specified version (for example, 1.0.1) of TensorFlow CPU binary image.
+      specified version (for example, 1.1.0rc1) of TensorFlow CPU binary image.
     * <tt>gcr.io/tensorflow/tensorflow:<i>version</i>-devel</tt>, which is
-      the specified version (for example, 1.0.1) of the TensorFlow GPU
+      the specified version (for example, 1.1.0rc1) of the TensorFlow GPU
       binary image plus source code.
 
     <tt>gcr.io</tt> is the Google Container Registry. Note that some
@@ -456,7 +442,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.0.1-cp27-none-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.1.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -507,7 +493,7 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started$Getting Started with TensorFlow}.
+If you are new to TensorFlow, see @{$get_started/get_started$Getting Started with TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
 installation problems](#common_installation_problems).
@@ -624,14 +610,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.0.1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.1.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.0.1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.1.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -643,14 +629,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.0.1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.1.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.0.1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.1.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -662,14 +648,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.0.1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.1.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.0.1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.1.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -681,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.0.1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.1.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.0.1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.1.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 592036d1eb39f144f465fde37740f3b4bc054735..43ffc961aa9030fe431c6cca60f56b1b18dbb257 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -2,55 +2,7 @@
 
 This guide explains how to install TensorFlow on Mac OS X.
 
-## Determine which TensorFlow to install
-
-You must choose the type of TensorFlow to install.  Your choices are as follows:
-
-  * **TensorFlow with CPU support only**. If your system does not have a
-    NVIDIA CUDA® GPU, you should install this version. Note that TensorFlow
-    with CPU support is typically easier to install than TensorFlow with
-    GPU support. Therefore, even if you have an NVIDIA CUDA GPU, we recommend
-    installing this version first as a diagnostic step just in case you run
-    into problems installing TensorFlow with GPU support.
-  * **TensorFlow with GPU support**. TensorFlow programs typically run
-    significantly faster on a GPU than on a CPU. Therefore, if your system has
-    a NVIDIA CUDA GPU meeting the prerequisites shown below and you need
-    to run performance-critical applications, you should ultimately
-    install this version.
-
-
-### Requirements to run TensorFlow with GPU support
-
-If you are installing TensorFlow with GPU support using one of the mechanisms
-described in this guide, then the following NVIDIA software must be
-installed on your system:
-
-
-  * CUDA Toolkit 8.0. For details, see
-    [NVIDIA's documentation](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x).
-    Ensure that you append the relevant CUDA pathnames to the
-    `LD_LIBRARY_PATH` environment variable as described in the
-    NVIDIA documentation.
-  * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v5.1. For details, see
-    [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
-    Ensure that you create the `CUDA_HOME` environment variable as described in
-    the NVIDIA documentation.
-  * GPU card with CUDA Compute Capability 3.0 or higher.  See
-    [NVIDIA documentation](https://developer.nvidia.com/cuda-gpus)
-    for a list of supported GPU cards.
-
-If you have an earlier version of the preceding packages, please upgrade to
-the specified versions. If upgrading is not possible, you may still run
-TensorFlow with GPU support, but only if you do both of the following:
-
-  * Install TensorFlow from sources as described
-    @{$install_sources$here}.
-  * Install or upgrade to at least the following NVIDIA versions:
-    * CUDA toolkit 7.0 or greater
-    * cuDNN v3 or greater
-    * GPU card with CUDA Compute Capability 3.0 or higher.
-
+Note: As of version 1.2, TensorFlow no longer provides GPU support on Mac OS X.
 
 ## Determine how to install TensorFlow
 
@@ -88,10 +40,6 @@ large (hundreds of MBs). You might choose the Docker installation if you are
 incorporating TensorFlow into a larger application architecture that
 already uses Docker.
 
-Important: Docker currently does not support TensorFlow with GPU support
-on Mac OS; that is, on Mac OS, Docker only supports TensorFlow with
-CPU support.
-
 In Anaconda, you may use conda to create a virtual environment.
 However, within Anaconda, we recommend installing TensorFlow with the
 `pip install` command, not with the `conda install` command.
@@ -133,40 +81,18 @@ Take the following steps to install TensorFlow with Virtualenv:
 
      <pre> (tensorflow)$ </pre>
 
-  5. If pip version 8.1 or later is installed on your system, issue one of
-     the following commands to install TensorFlow and all the packages that
-     TensorFlow requires into the active Virtualenv environment:
-
-     <pre> $ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
-     $ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
-     $ <b>pip install --upgrade tensorflow-gpu</b>  # for Python 2.7 and GPU
-     $ <b>pip3 install --upgrade tensorflow-gpu</b> # for Python 3.n and GPU </pre>
-
-     If the preceding command succeed, skip Step 6. If it failed,
-     perform Step 6.
-
-  6. Optional. If Step 5 failed (typically because you invoked a pip version
-     lower than 8.1), install TensorFlow in the active
-     virtualenv environment by issuing a command of the following format:
+  5. Ensure pip ≥8.1 is installed:
 
-     <pre> $ <b>pip install --upgrade</b> <i>tfBinaryURL</i>   # Python 2.7
-     $ <b>pip3 install --upgrade</b> <i>tfBinaryURL</i>  # Python 3.n </pre>
+     <pre> (tensorflow)$ <b>easy_install -U pip</b></pre>
 
-     where <i>tfBinaryURL</i> identifies the URL
-     of the TensorFlow Python package. The appropriate value of
-     <i>tfBinaryURL</i> depends on the operating system,
-     Python version, and GPU support. Find the appropriate value for
-     <i>tfBinaryURL</i> for your system
-     [here](#the_url_of_the_tensorflow_python_package).
-     For example, if you are installing TensorFlow for Mac OS X,
-     Python 2.7, and CPU-only support, the command to install
-     TensorFlow in the active Virtualenv is as follows:
+  6. Issue one of the following commands to install TensorFlow and all the
+     packages that TensorFlow requires into the active Virtualenv environment:
 
-     <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.0.1-py2-none-any.whl</b></pre>
+     <pre> (tensorflow)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
+     (tensorflow)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
 
 If you encounter installation problems, see
-[Common Installation Problems](#CommonInstallationProblems).
+[Common Installation Problems](#common-installation-problems).
 
 
 ### Next Steps
@@ -263,10 +189,8 @@ take the following steps:
 
   1. Install TensorFlow by invoking **one** of the following commands:
 
-     <pre> $ <b>pip install tensorflow</b>      # Python 2.7; CPU support (no GPU support)
-     $ <b>pip3 install tensorflow</b>     # Python 3.n; CPU support (no GPU support)
-     $ <b>pip install tensorflow-gpu</b>  # Python 2.7;  GPU support
-     $ <b>pip3 install tensorflow-gpu</b> # Python 3.n; GPU support </pre>
+     <pre> $ <b>pip install tensorflow</b>      # Python 2.7; CPU support
+     $ <b>pip3 install tensorflow</b>     # Python 3.n; CPU support
 
      If the preceding command runs to completion, you should now
      [validate your installation](#ValidateYourInstallation).
@@ -279,17 +203,17 @@ take the following steps:
 
      where <i>tfBinaryURL</i> identifies the URL of the TensorFlow Python
      package. The appropriate value of <i>tfBinaryURL</i> depends on the
-     operating system, Python version, and GPU support. Find the appropriate
+     operating system and Python version. Find the appropriate
      value for <i>tfBinaryURL</i>
      [here](#the_url_of_the_tensorflow_python_package).  For example, if
-     you are installing TensorFlow for Mac OS, Python 2.7, and CPU-only
-     support, issue the following command:
+     you are installing TensorFlow for Mac OS and Python 2.7
+     issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.0.1-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
-     [Common installation problems](#CommonInstallationProblems).
+     [installation problems](#common-installation-problems).
 
 
 
@@ -320,9 +244,6 @@ Follow these steps to install TensorFlow through Docker.
 
 The remainder of this section explains how to launch a Docker container.
 
-**Note**: You may only launch a Docker container with CPU support.
-(Docker doesn't provide GPU support on Mac OS.)
-
 To launch a Docker container that holds the TensorFlow binary image,
 enter a command of the following format:
 
@@ -398,7 +319,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.0.1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -449,7 +370,7 @@ writing TensorFlow programs:
 <pre>Hello, TensorFlow!</pre>
 
 If you are new to TensorFlow, see
-@{$get_started$Getting Started with TensorFlow}.
+@{$get_started/get_started$Getting Started with TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see
 [Common installation problems](#common_installation_problems).
@@ -468,17 +389,6 @@ the `tensorflow` tag.
 <table>
 <tr> <th>Stack Overflow Link</th> <th>Error Message</th> </tr>
 
-<tr>
-  <td><a href="https://stackoverflow.com/q/36159194">36159194</a></td>
-  <td><pre>ImportError: libcudart.so.<i>Version</i>: cannot open shared object file:
-  No such file or directory</pre></td>
-</tr>
-
-<tr>
-  <td><a href="https://stackoverflow.com/q/41991101">41991101</a></td>
-  <td><pre>ImportError: libcudnn.<i>Version</i>: cannot open shared object file:
-  No such file or directory</pre></td>
-</tr>
 
 <tr>
   <td><a href="http://stackoverflow.com/q/42006320">42006320</a></td>
@@ -544,17 +454,6 @@ ImportError: cannot import name 'descriptor'</pre>
   </td>
 </tr>
 
-<tr>
-  <td><a href="http://stackoverflow.com/q/42073336">42073336</a></td>
-  <td>An <tt>import tensorflow</tt> statement triggers the following error:
-<pre>
->>> import tensorflow as tf
-I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcublas.dylib locally
-I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcudnn.dylib locally
-I tensorflow/stream_executor/dso_loader.cc:108] successfully opened CUDA library libcufft.dylib locally
-"import tensorflow" terminated by signal SIGSEGV (Address boundary error)
-</pre></td>
-</tr>
 
 <tr>
   <td><a href="http://stackoverflow.com/q/42075397">42075397</a></td>
@@ -572,15 +471,6 @@ Terminal window to review and agree to the Xcode license agreements.
 RuntimeError: Broken toolchain: cannot link a simple C program</pre>
 </td>
 
-<tr>
-  <td><a href="http://stackoverflow.com/questions/42376790/">42376790</a></td>
-  <td>After installing for GPU, an `import tensorflow` statement
-      triggers the following error:
-<pre>tensorflow/stream_executor/dso_loader.cc:135] successfully opened CUDA
-  library libcublas.8.0.dylib locally
-  Segmentation fault: 11
-</pre></td>
-</tr>
 
 </table>
 
@@ -595,45 +485,24 @@ The value you specify depends on three factors:
 
   * operating system
   * Python version
-  * CPU only vs. GPU support
 
 This section documents the relevant values for Mac OS installations.
 
 ### Python 2.7
 
-CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.0.1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py2-none-any.whl
 </pre>
 
-GPU support:
-
-<pre>
-https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-1.0.1-py2-none-any.whl
-</pre>
-
-Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see
-[Installing TensorFlow from Sources](install_sources.md).
-
 
 ### Python 3.4, 3.5, or 3.6
 
-CPU only:
-
-<pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.0.1-py3-none-any.whl
-</pre>
-
-GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow_gpu-1.0.1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.1.0rc2-py3-none-any.whl
 </pre>
 
-Requires CUDA toolkit 8.0 and CuDNN v5. For other versions, see
-[Installing TensorFlow from Sources](install_sources.md).
-
 
 
 <a name="Protobuf31"></a>
@@ -659,12 +528,12 @@ the custom binary protobuf pip package, invoke one of the following commands:
   * for Python 2.7:
 
     <pre>$ <b>pip install --upgrade \
-    https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.1.0-cp27-none-linux_x86_64.whl</b></pre>
+    https://storage.googleapis.com/tensorflow/mac/cpu/protobuf-3.1.0-cp27-none-macosx_10_11_x86_64.whl</b></pre>
 
   * for Python 3.n:
 
     <pre>$ <b>pip3 install --upgrade \
-    https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.1.0-cp35-none-linux_x86_64.whl</b> </pre>
+    https://storage.googleapis.com/tensorflow/mac/cpu/protobuf-3.1.0-cp35-none-macosx_10_11_x86_64.whl</b></pre>
 
 Installing this protobuf package will overwrite the existing protobuf package.
 Note that the binary pip package already has support for protobufs
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 3597a68b83b992d578f30ce0e6f25830215e78c7..8dd7870faa5b2105d5d4ded14669d3d96cab228e 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -298,7 +298,7 @@ invoke the following command:
 
 <pre>$ <b>bazel build --config=opt --config=cuda //tensorflow/tools/pip_package:build_pip_package</b> </pre>
 
-**NOTE on gcc 5 or later:** the binary pip packages available on the TensorFlow website are built with gcc 4, which uses the older ABI. To make your build compatible with the older ABI, you need to add `-cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"` to your `bazel build` command. ABI compatibility allows custom ops built against the TensorFlow pip package to continue to work against your built package.
+**NOTE on gcc 5 or later:** the binary pip packages available on the TensorFlow website are built with gcc 4, which uses the older ABI. To make your build compatible with the older ABI, you need to add `--cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0"` to your `bazel build` command. ABI compatibility allows custom ops built against the TensorFlow pip package to continue to work against your built package.
 
 <b>Tip:</b> By default, building TensorFlow from sources consumes
 a lot of RAM.  If RAM is an issue on your system, you may limit RAM usage
@@ -319,10 +319,11 @@ $ <b>bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pk
 Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
-for TensorFlow 1.0.1 on Linux:
+
+for TensorFlow 1.1.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.0.1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.1.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -353,7 +354,7 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started$Getting Started with
+If you are new to TensorFlow, see @{$get_started/get_started$Getting Started with
 TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
@@ -367,6 +368,7 @@ of one of the following guides:
 
   * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux}
   * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS}
+  * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows}
 
 Beyond the errors documented in those two guides, the following table
 notes additional errors specific to building TensorFlow.  Note that we
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 3e5451ff4df379ea133cd5cc13ab1e6e5ce2db6b..db7c661aa13bd54b3d3f340b127d3d09f84b9a11 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -103,7 +103,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
   2. Create a conda environment named <tt>tensorflow</tt>
      by invoking the following command:
 
-     <pre>C:\> <b>conda create -n tensorflow</b> </pre>
+     <pre>C:\> <b>conda create -n tensorflow python=3.5</b> </pre>
 
   3. Activate the conda environment by issuing the following command:
 
@@ -114,12 +114,12 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      environment. To install the CPU-only version of TensorFlow, enter the
      following command:
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.0.1-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/cpu/tensorflow-1.1.0-cp35-cp35m-win_amd64.whl</b> </pre>
 
      To install the GPU version of TensorFlow, enter the following command
      (on a single line):
 
-     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.0.1-cp35-cp35m-win_amd64.whl</b> </pre>
+     <pre>(tensorflow)C:\> <b>pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/windows/gpu/tensorflow_gpu-1.1.0-cp35-cp35m-win_amd64.whl</b> </pre>
 
 ## Validate your installation
 
@@ -145,7 +145,7 @@ TensorFlow programs:
 
 <pre>Hello, TensorFlow!</pre>
 
-If you are new to TensorFlow, see @{$get_started$Getting Started with
+If you are new to TensorFlow, see @{$get_started/get_started$Getting Started with
 TensorFlow}.
 
 If the system outputs an error message instead of a greeting, see [Common
@@ -193,5 +193,20 @@ ImportError: cannot import name 'descriptor'</pre>
   <td><pre>No module named "pywrap_tensorflow"</pre></td>
 </tr>
 
-<table>
+<tr>
+  <td><a href="https://stackoverflow.com/q/42217532">42217532</a></td>
+  <td>
+  <pre>OpKernel ('op: "BestSplits" device_type: "CPU"') for unknown op: BestSplits</pre>
+  </td>
+</tr>
+
+<tr>
+  <td><a href="https://stackoverflow.com/q/43134753">43134753</a></td>
+  <td>
+  <pre>The TensorFlow library wasn't compiled to use SSE instructions</pre>
+  </td>
+</tr>
+
+
+</table>
 
diff --git a/tensorflow/docs_src/performance/benchmarks.md b/tensorflow/docs_src/performance/benchmarks.md
new file mode 100644
index 0000000000000000000000000000000000000000..47ab028e2058b5d7f722604ecc3eeb9753270ead
--- /dev/null
+++ b/tensorflow/docs_src/performance/benchmarks.md
@@ -0,0 +1,414 @@
+# Benchmarks
+
+## Overview
+
+A selection of image classification models were tested across multiple platforms
+to create a point of reference for the TensorFlow community. The
+[Methodology](#methodology) section details how the tests were executed and has
+links to the scripts used.
+
+## Results for image classification models
+
+InceptionV3 ([arXiv:1512.00567](https://arxiv.org/abs/1512.00567)), ResNet-50
+([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), ResNet-152
+([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), VGG16
+([arXiv:1409.1556](https://arxiv.org/abs/1409.1556)), and
+[AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf)
+were tested using the [ImageNet](http://www.image-net.org/) data set. Tests were
+run on Google Compute Engine, Amazon Elastic Compute Cloud (Amazon EC2), and an
+NVIDIA® DGX-1™. Most of the tests were run with both synthetic and real data.
+Testing with synthetic data was done by using a `tf.Variable` set to the same
+shape as the data expected by each model for ImageNet. We believe it is
+important to include real data measurements when benchmarking a platform. This
+load tests both the underlying hardware and the framework at preparing data for
+actual training. We start with synthetic data to remove disk I/O as a variable
+and to set a baseline. Real data is then used to verify that the TensorFlow
+input pipeline and the underlying disk I/O are saturating the compute units.
+
+### Training with NVIDIA® DGX-1™ (NVIDIA® Tesla® P100)
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_p100_single_server.png">
+</div>
+
+Details and additional results are in the [Details for NVIDIA® DGX-1™ (NVIDIA®
+Tesla® P100)](#details_for_nvidia_dgx-1tm_nvidia_tesla_p100) section.
+
+### Training with NVIDIA® Tesla® K80
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_k80_single_server.png">
+</div>
+
+Details and additional results are in the [Details for Google Compute Engine
+(NVIDIA® Tesla® K80)](#details_for_google_compute_engine_nvidia_tesla_k80) and
+[Details for Amazon EC2 (NVIDIA® Tesla®
+K80)](#details_for_amazon_ec2_nvidia_tesla_k80) sections.
+
+### Distributed training with NVIDIA® Tesla® K80
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_k80_aws_distributed.png">
+</div>
+
+Details and additional results are in the [Details for Amazon EC2 Distributed
+(NVIDIA® Tesla® K80)](#details_for_amazon_ec2_distributed_nvidia_tesla_k80)
+section.
+
+### Compare synthetic with real data training
+
+**NVIDIA® Tesla® P100**
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_summary_p100_data_compare_inceptionv3.png">
+  <img style="width:35%" src="../images/perf_summary_p100_data_compare_resnet50.png">
+</div>
+
+**NVIDIA® Tesla® K80**
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_summary_k80_data_compare_inceptionv3.png">
+  <img style="width:35%" src="../images/perf_summary_k80_data_compare_resnet50.png">
+</div>
+
+## Details for NVIDIA® DGX-1™ (NVIDIA® Tesla® P100)
+
+### Environment
+
+*   **Instance type**: NVIDIA® DGX-1™
+*   **GPU:** 8x NVIDIA® Tesla® P100
+*   **OS:** Ubuntu 16.04 LTS with tests run via Docker
+*   **CUDA / cuDNN:** 8.0 / 5.1
+*   **TensorFlow GitHub hash:** b1e174e
+*   **Benchmark GitHub hash:** 9165a70
+*   **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda
+    //tensorflow/tools/pip_package:build_pip_package`
+*   **Disk:** Local SSD
+*   **DataSet:** ImageNet
+*   **Test Date:** May 2017
+
+Batch size and optimizer used for each model are listed in the table below. In
+addition to the batch sizes listed in the table, InceptionV3, ResNet-50,
+ResNet-152, and VGG16 were tested with a batch size of 32. Those results are in
+the *other results* section.
+
+Options            | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+------------------ | ----------- | --------- | ---------- | ------- | -----
+Batch size per GPU | 64          | 64        | 64         | 512     | 64
+Optimizer          | sgd         | sgd       | sgd        | sgd     | sgd
+
+Configuration used for each model.
+
+Model       | variable_update        | local_parameter_device
+----------- | ---------------------- | ----------------------
+InceptionV3 | parameter_server       | cpu
+ResNet50    | parameter_server       | cpu
+ResNet152   | parameter_server       | cpu
+AlexNet     | replicated (with NCCL) | n/a
+VGG16       | replicated (with NCCL) | n/a
+
+### Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_p100_single_server.png">
+</div>
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_dgx1_synth_p100_single_server_scaling.png">
+  <img style="width:35%" src="../images/perf_dgx1_real_p100_single_server_scaling.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 142         | 219       | 91.8       | 2987    | 154
+2    | 284         | 422       | 181        | 5658    | 295
+4    | 569         | 852       | 356        | 10509   | 584
+8    | 1131        | 1734      | 716        | 17822   | 1081
+
+**Training real data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 142         | 218       | 91.4       | 2890    | 154
+2    | 278         | 425       | 179        | 4448    | 284
+4    | 551         | 853       | 359        | 7105    | 534
+8    | 1079        | 1630      | 708        | N/A     | 898
+
+Training AlexNet with real data on 8 GPUs was excluded from the graph and table
+above due to it maxing out the input pipeline.
+
+### Other Results
+
+The results below are all with a batch size of 32.
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16
+---- | ----------- | --------- | ---------- | -----
+1    | 128         | 195       | 82.7       | 144
+2    | 259         | 368       | 160        | 281
+4    | 520         | 768       | 317        | 549
+8    | 995         | 1485      | 632        | 820
+
+**Training real data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16
+---- | ----------- | --------- | ---------- | -----
+1    | 130         | 193       | 82.4       | 144
+2    | 257         | 369       | 159        | 253
+4    | 507         | 760       | 317        | 457
+8    | 966         | 1410      | 609        | 690
+
+## Details for Google Compute Engine (NVIDIA® Tesla® K80)
+
+### Environment
+
+*   **Instance type**: n1-standard-32-k80x8
+*   **GPU:** 8x NVIDIA® Tesla® K80
+*   **OS:** Ubuntu 16.04 LTS
+*   **CUDA / cuDNN:** 8.0 / 5.1
+*   **TensorFlow GitHub hash:** b1e174e
+*   **Benchmark GitHub hash:** 9165a70
+*   **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda
+    //tensorflow/tools/pip_package:build_pip_package`
+*   **Disk:** 1.7 TB Shared SSD persistent disk (800 MB/s)
+*   **DataSet:** ImageNet
+*   **Test Date:** May 2017
+
+Batch size and optimizer used for each model are listed in the table below. In
+addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were
+tested with a batch size of 32. Those results are in the *other results*
+section.
+
+Options            | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+------------------ | ----------- | --------- | ---------- | ------- | -----
+Batch size per GPU | 64          | 64        | 32         | 512     | 32
+Optimizer          | sgd         | sgd       | sgd        | sgd     | sgd
+
+The configuration used for each model was `variable_update` equal to
+`parameter_server` and `local_parameter_device` equal to `cpu`.
+
+### Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_gce_synth_k80_single_server_scaling.png">
+  <img style="width:35%" src="../images/perf_gce_real_k80_single_server_scaling.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 30.5        | 51.9      | 20.0       | 656     | 35.4
+2    | 57.8        | 99.0      | 38.2       | 1209    | 64.8
+4    | 116         | 195       | 75.8       | 2328    | 120
+8    | 227         | 387       | 148        | 4640    | 234
+
+**Training real data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 30.6        | 51.2      | 20.0       | 639     | 34.2
+2    | 58.4        | 98.8      | 38.3       | 1136    | 62.9
+4    | 115         | 194       | 75.4       | 2067    | 118
+8    | 225         | 381       | 148        | 4056    | 230
+
+### Other Results
+
+**Training synthetic data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 29.3                        | 49.5
+2    | 55.0                        | 95.4
+4    | 109                         | 183
+8    | 216                         | 362
+
+**Training real data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 29.5                        | 49.3
+2    | 55.4                        | 95.3
+4    | 110                         | 186
+8    | 216                         | 359
+
+## Details for Amazon EC2 (NVIDIA® Tesla® K80)
+
+### Environment
+
+*   **Instance type**: p2.8xlarge
+*   **GPU:** 8x NVIDIA® Tesla® K80
+*   **OS:** Ubuntu 16.04 LTS
+*   **CUDA / cuDNN:** 8.0 / 5.1
+*   **TensorFlow GitHub hash:** b1e174e
+*   **Benchmark GitHub hash:** 9165a70
+*   **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda
+    //tensorflow/tools/pip_package:build_pip_package`
+*   **Disk:** 1TB Amazon EFS (burst 100 MiB/sec for 12 hours, continuous 50
+    MiB/sec)
+*   **DataSet:** ImageNet
+*   **Test Date:** May 2017
+
+Batch size and optimizer used for each model are listed in the table below. In
+addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were
+tested with a batch size of 32. Those results are in the *other results*
+section.
+
+Options            | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+------------------ | ----------- | --------- | ---------- | ------- | -----
+Batch size per GPU | 64          | 64        | 32         | 512     | 32
+Optimizer          | sgd         | sgd       | sgd        | sgd     | sgd
+
+Configuration used for each model.
+
+Model       | variable_update           | local_parameter_device
+----------- | ------------------------- | ----------------------
+InceptionV3 | parameter_server          | cpu
+ResNet-50   | replicated (without NCCL) | gpu
+ResNet-152  | replicated (without NCCL) | gpu
+AlexNet     | parameter_server          | gpu
+VGG16       | parameter_server          | gpu
+
+### Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:35%" src="../images/perf_aws_synth_k80_single_server_scaling.png">
+  <img style="width:35%" src="../images/perf_aws_real_k80_single_server_scaling.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 30.8        | 51.5      | 19.7       | 684     | 36.3
+2    | 58.7        | 98.0      | 37.6       | 1244    | 69.4
+4    | 117         | 195       | 74.9       | 2479    | 141
+8    | 230         | 384       | 149        | 4853    | 260
+
+**Training real data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16
+---- | ----------- | --------- | ---------- | ------- | -----
+1    | 30.5        | 51.3      | 19.7       | 674     | 36.3
+2    | 59.0        | 94.9      | 38.2       | 1227    | 67.5
+4    | 118         | 188       | 75.2       | 2201    | 136
+8    | 228         | 373       | 149        | N/A     | 242
+
+Training AlexNet with real data on 8 GPUs was excluded from the graph and table
+above due to our EFS setup not providing enough throughput.
+
+### Other Results
+
+**Training synthetic data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 29.9                        | 49.0
+2    | 57.5                        | 94.1
+4    | 114                         | 184
+8    | 216                         | 355
+
+**Training real data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 30.0                        | 49.1
+2    | 57.5                        | 95.1
+4    | 113                         | 185
+8    | 212                         | 353
+
+## Details for Amazon EC2 Distributed (NVIDIA® Tesla® K80)
+
+### Environment
+
+*   **Instance type**: p2.8xlarge
+*   **GPU:** 8x NVIDIA® Tesla® K80
+*   **OS:** Ubuntu 16.04 LTS
+*   **CUDA / cuDNN:** 8.0 / 5.1
+*   **TensorFlow GitHub hash:** b1e174e
+*   **Benchmark GitHub hash:** 9165a70
+*   **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda
+    //tensorflow/tools/pip_package:build_pip_package`
+*   **Disk:** 1.0 TB EFS (burst 100 MB/sec for 12 hours, continuous 50 MB/sec)
+*   **DataSet:** ImageNet
+*   **Test Date:** May 2017
+
+The batch size and optimizer used for the tests are listed in the table. In
+addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were
+tested with a batch size of 32. Those results are in the *other results*
+section.
+
+Options            | InceptionV3 | ResNet-50 | ResNet-152
+------------------ | ----------- | --------- | ----------
+Batch size per GPU | 64          | 64        | 32
+Optimizer          | sgd         | sgd       | sgd
+
+Configuration used for each model.
+
+Model       | variable_update        | local_parameter_device | cross_replica_sync
+----------- | ---------------------- | ---------------------- | ------------------
+InceptionV3 | distributed_replicated | n/a                    | True
+ResNet-50   | distributed_replicated | n/a                    | True
+ResNet-152  | distributed_replicated | n/a                    | True
+
+To simplify server setup, EC2 instances (p2.8xlarge) running worker servers also
+ran parameter servers. Equal numbers of parameter servers and worker servers were
+used with the following exceptions:
+
+*   InceptionV3: 8 instances / 6 parameter servers
+*   ResNet-50: (batch size 32) 8 instances / 4 parameter servers
+*   ResNet-152: 8 instances / 4 parameter servers
+
+### Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:80%" src="../images/perf_summary_k80_aws_distributed.png">
+</div>
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:70%" src="../images/perf_aws_synth_k80_distributed_scaling.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 | ResNet-50 | ResNet-152
+---- | ----------- | --------- | ----------
+1    | 29.7        | 52.4      | 19.4
+8    | 229         | 378       | 146
+16   | 459         | 751       | 291
+32   | 902         | 1388      | 565
+64   | 1783        | 2744      | 981
+
+### Other Results
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:50%" src="../images/perf_aws_synth_k80_multi_server_batch32.png">
+</div>
+
+**Training synthetic data**
+
+GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
+---- | --------------------------- | -------------------------
+1    | 29.2                        | 48.4
+8    | 219                         | 333
+16   | 427                         | 667
+32   | 820                         | 1180
+64   | 1608                        | 2315
+
+## Methodology
+
+This
+[script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+was run on the various platforms to generate the above results.
+@{$performance_models$High-Performance Models} details techniques in the script
+along with examples of how to execute the script.
+
+In order to create results that are as repeatable as possible, each test was run
+5 times and then the times were averaged together. GPUs are run in their default
+state on the given platform. For NVIDIA® Tesla® K80 this means leaving on [GPU
+Boost](https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/).
+For each test, 10 warmup steps are done and then the next 100 steps are
+averaged.
diff --git a/tensorflow/docs_src/performance/index.md b/tensorflow/docs_src/performance/index.md
index 0ff4d2ee0041ee142c65a9975b55c89387612a26..7c1cd152d372cdf0f4506b17b15cc8a816088bd7 100644
--- a/tensorflow/docs_src/performance/index.md
+++ b/tensorflow/docs_src/performance/index.md
@@ -2,11 +2,19 @@
 
 Performance is often a significant issue when training a machine learning
 model.  This section explains various ways to optimize performance.  Start
-your investigation with the following guide:
+your investigation with the @{$performance_guide$Performance Guide} and then go
+deeper with techniques detailed in @{$performance_models$High-Performance Models}:
 
-  * @{$performance_guide$Performance}, which contains a collection of best
+  * @{$performance_guide$Performance Guide}, which contains a collection of best
     practices for optimizing your TensorFlow code.
 
+  * @{$performance_models$High-Performance Models}, which contains a collection
+    of advanced techniques to build highly scalable models targeting different
+    system types and network topologies.
+
+  * @{$benchmarks$Benchmarks}, which contains a collection of benchmark
+    results.
+
 XLA (Accelerated Linear Algebra) is an experimental compiler for linear
 algebra that optimizes TensorFlow computations. The following guides explore
 XLA:
diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files
index 0f30cc7fa5c8a6a5d2501b75dba0a86365ab5aae..d22847322084d584a4ddc713486109ede838fee8 100644
--- a/tensorflow/docs_src/performance/leftnav_files
+++ b/tensorflow/docs_src/performance/leftnav_files
@@ -1,4 +1,8 @@
 performance_guide.md
+performance_models.md
+benchmarks.md
+quantization.md
+>>>
 xla/index.md
 xla/broadcasting.md
 xla/developing_new_backend.md
@@ -6,4 +10,3 @@ xla/jit.md
 xla/operation_semantics.md
 xla/shapes.md
 xla/tfcompile.md
-quantization.md
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 8a1bba883aeee93c7702c936c6130d51cc552457..07c5d3087f35e6a3dbe7369006d1a4d84517e9e4 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -1,8 +1,10 @@
-# Performance
+# Performance Guide
 
 This guide contains a collection of best practices for optimizing your
 TensorFlow code. The best practices apply to both new and experienced
-Tensorflow users.
+Tensorflow users.  As a complement to the best practices in this document, the
+@{$performance_models$High-Performance Models} document links to example code
+and details for creating models that scale on a variety of hardware.
 
 ## Best Practices
 While optimizing implementations of different types of models can be different,
@@ -73,7 +75,7 @@ Unless for a special circumstance or for example code, do not feed data
 into the session from Python variables, e.g. `dictionary`.
 
 ```python
-# This will result in poor performance.
+# Using feed_dict often results in suboptimal performance when using large inputs.
 sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 ```
 
@@ -141,3 +143,4 @@ bn = tf.contrib.layers.batch_norm(
 The non-fused batch norm does computations using several individual Ops. Fused
 batch norm combines the individual operations into a single kernel, which runs
 faster.
+
diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..d48431eaa083d5e1d189ecf15accc6dac595f074
--- /dev/null
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -0,0 +1,422 @@
+# High-Performance Models
+
+This document and accompanying
+[scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+detail how to build highly scalable models that target a variety of system types
+and network topologies. The techniques in this document utilize some low-level
+TensorFlow Python primitives. In the future, many of these techniques will be
+incorporated into high-level APIs.
+
+## Input Pipeline
+
+The @{$performance_guide$Performance Guide} explains how to identify possible
+input pipeline issues and best practices. We found that using @{tf.FIFOQueue}
+and @{tf.train.queue_runner} could not saturate multiple current generation GPUs
+when using large inputs and processing with higher samples per second, such
+as training ImageNet with [AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
+This is due to the use of Python threads as its underlying implementation. The
+overhead of Python threads is too large.
+
+Another approach, which we have implemented in the
+[scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks),
+is to build an input pipeline using the native parallelism in TensorFlow. Our
+implementation is made up of 3 stages:
+
+*   I/O reads: Choose and read image files from disk.
+*   Image Processing: Decode image records into images, preprocess, and organize
+    into mini-batches.
+*   CPU-to-GPU Data Transfer: Transfer images from CPU to GPU.
+
+The dominant part of each stage is executed in parallel with the other stages
+using `data_flow_ops.StagingArea`. `StagingArea` is a queue-like operator
+similar to @{tf.FIFOQueue}. The difference is that `StagingArea` offers simpler
+functionality and can be executed on both CPU and GPU in parallel with other
+stages. Breaking the input pipeline into 3 stages that operate independently in
+parallel is scalable and takes full advantage of large multi-core environments.
+The rest of this section details the stages followed by details about using
+`data_flow_ops.StagingArea`.
+
+### Parallelize I/O Reads
+
+`data_flow_ops.RecordInput` is used to parallelize reading from disk. Given a
+list of input files representing TFRecords, `RecordInput` continuously reads
+records using background threads. The records are placed into its own large
+internal pool and when it has loaded at least half of its capacity, it produces
+output tensors.
+
+This op has its own internal threads that are dominated by I/O time that consume
+minimal CPU, which allows it to run smoothly in parallel with the rest of the
+model.
+
+### Parallelize Image Processing
+
+After images are read from `RecordInput` they are passed as tensors to the image
+processing pipeline. To make the image processing pipeline easier to explain,
+assume that the input pipeline is targeting 8 GPUs with a batch size of 256 (32
+per GPU).
+
+256 records are read and processed individually in parallel. This starts with
+256 independent `RecordInput` read ops in the graph. Each read op is followed by
+an identical set of ops for image preprocessing that are considered independent
+and executed in parallel. The image preprocessing ops include operations such as
+image decoding, distortion, and resizing.
+
+Once the images are through preprocessing, they are concatenated together into 8
+tensors each with a batch-size of 32. Rather than using @{tf.concat} for this
+purpose, which is implemented as a single op that waits for all the inputs to be
+ready before concatenating them together, @{tf.parallel_stack} is used.
+@{tf.parallel_stack} allocates an uninitialized tensor as an output, and each
+input tensor is written to its designated portion of the output tensor as soon
+as the input is available.
+
+When all the input tensors are finished, the output tensor is passed along in
+the graph. This effectively hides all the memory latency with the long tail of
+producing all the input tensors.
+
+### Parallelize CPU-to-GPU Data Transfer
+
+Continuing with the assumption that the target is 8 GPUs with a batch size of
+256 (32 per GPU). Once the input images are processed and concatenated together
+by the CPU, we have 8 tensors each with a batch-size of 32.
+
+TensorFlow enables tensors from one device to be used on any other device
+directly. TensorFlow inserts implicit copies to make the tensors available on
+any devices where they are used. The runtime schedules the copy between devices
+to run before the tensors are actually used. However, if the copy cannot finish
+in time, the computation that needs those tensors will stall and result in
+decreased performance.
+
+In this implementation, `data_flow_ops.StagingArea` is used to explicitly
+schedule the copy in parallel. The end result is that when computation starts on
+the GPU, all the tensors are already available.
+
+### Software Pipelining
+
+With all the stages capable of being driven by different processors,
+`data_flow_ops.StagingArea` is used between them so they run in parallel.
+`StagingArea` is a queue-like operator similar to @{tf.FIFOQueue} that offers
+simpler functionalities that can be executed on both CPU and GPU.
+
+Before the model starts running all the stages, the input pipeline stages are
+warmed up to prime the staging buffers in between with one set of data.
+During each run step, one set of data is read from the staging buffers at
+the beginning of each stage, and one set is pushed at the end.
+
+For example: if there are three stages: A, B and C. There are two staging areas
+in between: S1 and S2. During the warm up, we run:
+
+```
+Warm up:
+Step 1: A0
+Step 2: A1  B0
+
+Actual execution:
+Step 3: A2  B1  C0
+Step 4: A3  B2  C1
+Step 5: A4  B3  C2
+```
+
+After the warm up, S1 and S2 each have one set of data in them. For each step of
+the actual execution, one set of data is consumed from each staging area, and
+one set is added to each.
+
+Benefits of using this scheme:
+
+*   All stages are non-blocking, since the staging areas always have one set of
+    data after the warm up.
+*   Each stage can run in parallel since they can all start immediately.
+*   The staging buffers have a fixed memory overhead. They will have at most one
+    extra set of data.
+*   Only a single`session.run()` call is needed to run all stages of the step,
+    which makes profiling and debugging much easier.
+
+## Best Practices in Building High-Performance Models
+
+Collected below are a couple of additional best practices that can improve
+performance and increase the flexiblity of models.
+
+### Build the model with both NHWC and NCHW
+
+Most TensorFlow operations used by a CNN support both NHWC and NCHW data format.
+On GPU, NCHW is faster. But on CPU, NHWC is sometimes faster.
+
+Building a model to support both data formats keeps the model flexible and
+capable of operating optimally regardless of platform. Most TensorFlow
+operations used by a CNN support both NHWC and NCHW data formats. The benchmark
+script was written to support both NCHW and NHWC. NCHW should always be used
+when training with GPUs. NHWC is sometimes faster on CPU. A flexible model can
+be trained on GPUs using NCHW with inference done on CPU using NHWC with the
+weights obtained from training.
+
+### Use Fused Batch-Normalization
+
+The default batch-normalization in TensorFlow is implemented as composite
+operations. This is very general, but often leads to suboptimal performance. An
+alternative is to use fused batch-normalization which often has much better
+performance on GPU. Below is an example of using @{tf.contrib.layers.batch_norm}
+to implement fused batch-normalization.
+
+```python
+bn = tf.contrib.layers.batch_norm(
+          input_layer, fused=True, data_format='NCHW'
+          scope=scope)
+```
+
+## Variable Distribution and Gradient Aggregation
+
+During training, training variable values are updated using aggregated gradients
+and deltas. In the benchmark script, we demonstrate that with the flexible and
+general-purpose TensorFlow primitives, a diverse range of high-performance
+distribution and aggregation schemes can be built.
+
+Three examples of variable distribution and aggregation were included in the
+script:
+
+*   `parameter_server` where each replica of the training model reads the
+    variables from a parameter server and updates the variable independently.
+    When each model needs the variables, they are copied over through the
+    standard implicit copies added by the TensorFlow runtime. The example
+    [script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+    illustrates using this method for local training, distributed synchronous
+    training, and distributed asynchronous training.
+*   `replicated` places an identical copy of each training variable on each
+    GPU. The forward and backward computation can start immediately as the
+    variable data is immediately available. Gradients are accumulated across all
+    GPUs, and the aggregated total is applied to each GPU's copy of the
+    variables to keep them in sync.
+*   `distributed_replicated` places an identical copy of the training parameters
+    on each GPU along with a master copy on the parameter servers. The forward
+    and backward computation can start immediately as the variable data is
+    immediately available. Gradients are accumulated across all GPUs on each
+    server and then the per-server aggregated gradients are applied to the
+    master copy. After all workers do this, each worker updates its copy of the
+    variable from the master copy.
+
+Below are additional details about each approach.
+
+### Parameter Server Variables
+
+The most common way trainable variables are managed in TensorFlow models is
+parameter server mode.
+
+In a distributed system, each worker process runs the same model, and parameter
+server processes own the master copies of the variables. When a worker needs a
+variable from a parameter server, it refers to it directly. The TensorFlow
+runtime adds implicit copies to the graph to make the variable value available
+on the computation device that needs it. When a gradient is computed on a
+worker, it is sent to the parameter server that owns the particular variable,
+and the corresponding optimizer is used to update the variable.
+
+There are some techniques to improve throughput:
+
+*   The variables are spread among parameter servers based on their size, for
+    load balancing.
+*   When each worker has multiple GPUs, gradients are accumulated across the
+    GPUs and a single aggregated gradient is sent to the parameter server. This
+    reduces the network bandwidth and the amount of work done by the parameter
+    servers.
+
+For coordinating between workers, a very common mode is async updates, where
+each worker updates the master copy of the variables without synchronizing with
+other workers. In our model, we demonstrate that it is fairly easy to introduce
+synchronization across workers so updates for all workers are finished in one
+step before the next step can start.
+
+The parameter server method can also be used for local training, In this case,
+instead of spreading the master copies of variables across parameters servers,
+they are either on the CPU or spread across the available GPUs.
+
+Due to the simple nature of this setup, this architecture has gained a lot of
+popularity within the community.
+
+This mode can be used in the script by passing
+`--variable_update=parameter_server`.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" alt="parameter_server mode in distributed training"
+   src="../images/perf_parameter_server_mode_doc.png">
+</div>
+
+### Replicated Variables
+
+In this design, each GPU on the server has its own copy of each variable. The
+values are kept in sync across GPUs by applying the fully aggregated gradient to
+each GPU's copy of the variable.
+
+The variables and data are available at the start of training, so the forward
+pass of training can start immediately. Gradients are aggregated across the
+devices and the fully aggregated gradient is then applied to each local copy.
+
+Gradient aggregation across the server can be done in different ways:
+
+*   Using standard TensorFlow operations to accumulate the total on a single
+    device (CPU or GPU) and then copy it back to all GPUs.
+*   Using NVIDIA® NCCL, described below in the NCCL section.
+
+This mode can be used in the script by passing `--variable_update=replicated`.
+
+### Replicated Variables in Distributed Training
+
+The replicated method for variables can be extended to distributed training. One
+way to do this like the replicated mode: aggregate the gradients fully across
+the cluster and apply them to each local copy of the variable. This may be shown
+in a future version of this scripts; the scripts do present a different
+variation, described here.
+
+In this mode, in addition to each GPU's copy of the variables, a master copy is
+stored on the parameter servers. As with the replicated mode, training can start
+immediately using the local copies of the variables.
+
+As the gradients of the weights become available, they are sent back to the
+parameter servers and all local copies are updated:
+
+1.  All the gradients from the GPU on the same worker are aggregated together.
+2.  Aggregated gradients from each worker are sent to the parameter server that
+    owns the variable, where the specified optimizer is used to update the
+    master copy of the variable.
+3.  Each worker updates its local copy of the variable from the master. In the
+    example model, this is done with a cross-replica barrier that waits for all
+    the workers to finish updating the variables, and fetches the new variable
+    only after the barrier has been released by all replicas. Once the copy
+    finishes for all variables, this marks the end of a training step, and a new
+    step can start.
+
+Although this sounds similar to the standard use of parameter servers, the
+performance is often better in many cases. This is largely due to the fact the
+computation can happen without any delay, and much of the copy latency of early
+gradients can be hidden by later computation layers.
+
+This mode can be used in the script by passing
+`--variable_update=distributed_replicated`.
+
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" alt="distributed_replicated mode"
+   src="../images/perf_distributed_replicated_mode_doc.png">
+</div>
+
+#### NCCL
+
+In order to broadcast variables and aggregate gradients across different GPUs
+within the same host machine, we can use the default TensorFlow implicit copy
+mechanism.
+
+However, we can instead use the optional NCCL (@{tf.contrib.nccl}) support. NCCL
+is an NVIDIA® library that can efficiently broadcast and aggregate data across
+different GPUs. It schedules a cooperating kernel on each GPU that knows how to
+best utilize the underlying hardware topology; this kernel uses a single SM of
+the GPU.
+
+In our experiment, we demonstrate that although NCCL often leads to much faster
+data aggregation by itself, it doesn't necessarily lead to faster training. Our
+hypothesis is that the implicit copies are essentially free since they go to the
+copy engine on GPU, as long as its latency can be hidden by the main computation
+itself. Although NCCL can transfer data faster, it takes one SM away, and adds
+more pressure to the underlying L2 cache. Our results show that for 8-GPUs, NCCL
+often leads to better performance. However, for fewer GPUs, the implicit copies
+often perform better.
+
+#### Staged Variables
+
+We further introduce a staged-variable mode where we use staging areas for both
+the variable reads, and their updates. Similar to software pipelining of the
+input pipeline, this can hide the data copy latency. If the computation time
+takes longer than the copy and aggregation, the copy itself becomes essentially
+free.
+
+The downside is that all the weights read are from the previous training step.
+So it is a different algorithm from SGD. But it is possible to improve its
+convergence by adjusting learning rate and other hyperparameters.
+
+## Executing the script
+
+This section lists the core command line arguments and a few basic examples for
+executing the main script
+([tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py)).
+
+> Note: `tf_cnn_benchmarks.py` uses the config `force_gpu_compatible`,
+> which was introduced after TensorFlow 1.1. Until TensorFlow 1.2 is released
+> building from source is advised.
+
+#### Base command line arguments
+
+*   **`model`**: Model to use, e.g. `resnet50`, `inception3`, `vgg16`, and
+    `alexnet`.
+*   **`num_gpus`**: Number of GPUs to use.
+*   **`data_dir`**: Path to data to process. If not set, synthetic data is used.
+    To use Imagenet data use these
+    [instructions](https://github.com/tensorflow/models/tree/master/inception#getting-started)
+    as a starting point.
+*   **`batch_size`**: Batch size for each GPU.
+*   **`variable_update`**: The method for managing variables: `parameter_server`
+    ,`replicated`, `distributed_replicated`, `independent`
+*   **`local_parameter_device`**: Device to use as parameter server: `cpu` or
+    `gpu`.
+
+#### Single instance examples
+
+```bash
+# VGG16 training ImageNet with 8 GPUs using arguments that optimize for
+# Google Compute Engine.
+python tf_cnn_benchmarks.py --local_parameter_device=cpu --num_gpus=8 \
+--batch_size=32 --model=vgg16 --data_dir=/home/ubuntu/imagenet/train \
+--variable_update=parameter_server --nodistortions
+
+# VGG16 training synthetic ImageNet data with 8 GPUs using arguments that
+# optimize for the NVIDIA DGX-1.
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=vgg16 --variable_update=replicated --use_nccl=True
+
+# VGG16 training ImageNet data with 8 GPUs using arguments that optimize for
+# Amazon EC2.
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=vgg16 --variable_update=parameter_server
+
+# ResNet-50 training ImageNet data with 8 GPUs using arguments that optimize for
+# Amazon EC2.
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=replicated --use_nccl=False
+
+```
+
+#### Distributed command line arguments
+
+*   **`ps_hosts`**: Comma separated list of hosts to use as parameter servers
+    in the format of ```<host>:port```, e.g. ```10.0.0.2:50000```.
+*   **`worker_hosts`**: Comma separated list of hosts to use as workers in the
+    format of ```<host>:port```, e.g. ```10.0.0.2:50001```.
+*   **`task_index`**: Index of the host in the list of `ps_hosts` or
+    `worker_hosts` being started.
+*   **`job_name`**: Type of job, e.g `ps` or `worker`
+
+#### Distributed examples
+
+Below is an example of training ResNet-50 on 2 hosts: host_0 (10.0.0.1) and
+host_1 (10.0.0.2). The example uses synthetic data. To use real data pass the
+`--data_dir` argument.
+
+```bash
+# Run the following commands on host_0 (10.0.0.1):
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \
+--job_name=worker --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \
+--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=0
+
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \
+--job_name=ps --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \
+--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=0
+
+
+# Run the following commands on host_1 (10.0.0.2):
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \
+--job_name=worker --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \
+--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=1
+
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \
+--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \
+--job_name=ps --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \
+--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=1
+
+```
diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 86d2b92494c32f2ee896f2e780fbc41ebb1134e4..49c25027fc9502f9ad37819930817cd2ecf3cd65 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -91,8 +91,8 @@ eight-bit computations:
 ```sh
 curl http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz -o /tmp/inceptionv3.tgz
 tar xzf /tmp/inceptionv3.tgz -C /tmp/
-bazel build tensorflow/tools/quantization/tools:quantize_graph
-bazel-bin/tensorflow/tools/quantization/tools/quantize_graph \
+bazel build tensorflow/tools/quantization:quantize_graph
+bazel-bin/tensorflow/tools/quantization/quantize_graph \
   --input=/tmp/classify_image_graph_def.pb \
   --output_node_names="softmax" --output=/tmp/quantized_graph.pb \
   --mode=eightbit
@@ -143,13 +143,13 @@ conversion functions before and after to move the data between float and
 eight-bit. Below is an example of what they look like. First here's the original
 Relu operation, with float inputs and outputs:
 
-![Relu Diagram](https://www.tensorflow.org/../images/quantization0.png)
+![Relu Diagram](https://www.tensorflow.org/images/quantization0.png)
 
 Then, this is the equivalent converted subgraph, still with float inputs and
 outputs, but with internal conversions so the calculations are done in eight
 bit.
 
-![Converted Diagram](https://www.tensorflow.org/../images/quantization1.png)
+![Converted Diagram](https://www.tensorflow.org/images/quantization1.png)
 
 The min and max operations actually look at the values in the input float
 tensor, and then feeds them into the Dequantize operation that converts the
@@ -162,7 +162,7 @@ operations that all have float equivalents, then there will be a lot of adjacent
 Dequantize/Quantize ops. This stage spots that pattern, recognizes that they
 cancel each other out, and removes them, like this:
 
-![Stripping Diagram](https://www.tensorflow.org/../images/quantization2.png)
+![Stripping Diagram](https://www.tensorflow.org/images/quantization2.png)
 
 Applied on a large scale to models where all of the operations have quantized
 equivalents, this gives a graph where all of the tensor calculations are done in
diff --git a/tensorflow/docs_src/performance/xla/index.md b/tensorflow/docs_src/performance/xla/index.md
index 9c23e79845d3e58b57edeb11a92a62b0e136a83a..19045b45d92a2ca42c3943bc0662ca42bd0c2c24 100644
--- a/tensorflow/docs_src/performance/xla/index.md
+++ b/tensorflow/docs_src/performance/xla/index.md
@@ -62,16 +62,16 @@ well as the NVIDIA GPU backend are in the TensorFlow source tree.
 The following diagram shows the compilation process in XLA:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img src="../../images/how-does-xla-work.png">
+  <img src="https://www.tensorflow.org/images/how-does-xla-work.png">
 </div>
 
-XLA comes with several optimizations and analyses that are target-independent,
+XLA comes with several optimizations and analyzes that are target-independent,
 such as [CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination),
 target-independent operation fusion, and buffer analysis for allocating runtime
 memory for the computation.
 
 After the target-independent step, XLA sends the HLO computation to a backend.
-The backend can perform further HLO-level analyses and optimizations, this time
+The backend can perform further HLO-level analyzes and optimizations, this time
 with target specific information and needs in mind. For example, the XLA GPU
 backend may perform operation fusion beneficial specifically for the GPU
 programming model and determine how to partition the computation into streams.
diff --git a/tensorflow/docs_src/performance/xla/jit.md b/tensorflow/docs_src/performance/xla/jit.md
index 4d2a643b7f837e31d485cb72806d8d80429d9ad2..d4dc3e57c8fb5ec2a979b6ba7ebe2a3b6c3a5f94 100644
--- a/tensorflow/docs_src/performance/xla/jit.md
+++ b/tensorflow/docs_src/performance/xla/jit.md
@@ -124,7 +124,7 @@ open the timeline file created when the script finishes: `timeline.ctf.json`.
 The rendered timeline should look similar to the picture below with multiple
 green boxes labeled `MatMul`, possibly across multiple CPUs.
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/jit_timeline_gpu.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/jit_timeline_gpu.png">
 </div>
 
 ### Step #3 Run with XLA
@@ -139,7 +139,7 @@ TF_XLA_FLAGS=--xla_generate_hlo_graph=.* python mnist_softmax_xla.py
 Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
 should look similar to the picture below with one long bar labeled `_XlaLaunch`.
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/jit_timeline_gpu_xla.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/jit_timeline_gpu_xla.png">
 </div>
 
 To understand what is happening in `_XlaLaunch`, look at the console output for
@@ -165,5 +165,5 @@ dot -Tpng hlo_graph_80.dot -o hlo_graph_80.png
 
 The result will look like the following:
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/jit_gpu_xla_graph.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/jit_gpu_xla_graph.png">
 </div>
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index d5eeb5d7d5f307f8eaa7d5c08f5c11043f068c00..424c994e72d0c44966f164a798f8ebdddf86999a 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -178,7 +178,7 @@ Concat({a, b}, 0)
 
 Diagram:
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_concatenate.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_concatenate.png">
 </div>
 
 ## ConvertElementType
@@ -707,7 +707,7 @@ are all 0. Figure below shows examples of different `edge_padding` and
 `interior_padding` values for a two dimensional array.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_pad.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
 </div>
 
 ## Reduce
@@ -781,13 +781,13 @@ Here's an example of reducing a 2D array (matrix). The shape has rank 2,
 dimension 0 of size 2 and dimension 1 of size 3:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="../../images/ops_2d_matrix.png">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_2d_matrix.png">
 </div>
 
 Results of reducing dimensions 0 or 1 with an "add" function:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="../../images/ops_reduce_from_2d_matrix.png">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_2d_matrix.png">
 </div>
 
 Note that both reduction results are 1D arrays. The diagram shows one as column
@@ -798,7 +798,7 @@ size 4, dimension 1 of size 2 and dimension 2 of size 3. For simplicity, the
 values 1 to 6 are replicated across dimension 0.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="../../images/ops_reduce_from_3d_matrix.png">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_3d_matrix.png">
 </div>
 
 Similarly to the 2D example, we can reduce just one dimension. If we reduce
@@ -890,7 +890,7 @@ builder.ReduceWindow(
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="../../images/ops_reduce_window.png">
+  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_window.png">
 </div>
 
 Stride of 1 in a dimension specifies that the position of a window in the
@@ -902,7 +902,7 @@ are the same as though the input came in with the dimensions it has after
 padding.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:75%" src="../../images/ops_reduce_window_stride.png">
+  <img style="width:75%" src="https://www.tensorflow.org/images/ops_reduce_window_stride.png">
 </div>
 
 The evaluation order of the reduction function is arbitrary and may be
@@ -1144,7 +1144,7 @@ addition `scatter` function produces the output element of value 8 (2 + 6).
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
   <img style="width:100%"
-    src="../../images/ops_scatter_to_selected_window_element.png">
+    src="https://www.tensorflow.org/images/ops_scatter_to_selected_window_element.png">
 </div>
 
 The evaluation order of the `scatter` function is arbitrary and may be
@@ -1482,5 +1482,5 @@ while (result(0) < 1000) {
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_while.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/ops_while.png">
 </div>
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 7ecddc548feb1734b0f67e3a3df07b4f15c75fbf..78819969b71a88e772813a73f4d1907fccbddec9 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -24,7 +24,7 @@ This code trains a simple NN for MNIST digit image recognition. Notice that the
 accuracy increases slightly after the first training step, but then gets stuck
 at a low (near-chance) level:
 
-![debug_mnist training fails](../images/tfdbg_screenshot_mnist_symptom.png)
+![debug_mnist training fails](https://www.tensorflow.org/images/tfdbg_screenshot_mnist_symptom.png)
 
 Scratching your head, you suspect that certain nodes in the training graph
 generated bad numeric values such as `inf`s and `nan`s. The computation-graph
@@ -89,7 +89,7 @@ The debug wrapper session will prompt you when it is about to execute the first
 `run()` call, with information regarding the fetched tensor and feed
 dictionaries displayed on the screen.
 
-![tfdbg run-start UI](../images/tfdbg_screenshot_run_start.png)
+![tfdbg run-start UI](https://www.tensorflow.org/images/tfdbg_screenshot_run_start.png)
 
 This is what we refer to as the *run-start UI*. If the screen size is
 too small to display the content of the message in its entirety, you can resize
@@ -108,7 +108,7 @@ intermediate tensors from the run. (These tensors can also be obtained by
 running the command `lt` after you executed `run`.) This is called the
 **run-end UI**:
 
-![tfdbg run-end UI: accuracy](../images/tfdbg_screenshot_run_end_accuracy.png)
+![tfdbg run-end UI: accuracy](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_accuracy.png)
 
 ### tfdbg CLI Frequently-Used Commands
 
@@ -130,6 +130,8 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | `lo -r hidden/Relu:0` | List the recipients of the output of the node `hidden/Relu`, recursively—i.e., the output recipient tree. |
 | `lt -n softmax.*` | List all dumped tensors whose names match the regular-expression pattern `softmax.*`. |
 | `lt -t MatMul` | List all dumped tensors whose node type is `MatMul`. |
+| `ls` | List all Python source files responsible for constructing the nodes (and tensors) in the current graph. |
+| `ls -n softmax.*` | List Python source files responsible for constructing the nodes whose names match the pattern `softmax.*`. |
 | `ps /path/to/source.py` | Print the Python source file source.py, with the lines annotated with the ops created at each of them, respectively. |
 | `ps -t /path/to/source.py` | Same as the command above, but perform annotation using dumped Tensors, instead of ops. |
 | `ps -b 30 /path/to/source.py` | Annotate source.py beginning at line 30. |
@@ -179,7 +181,7 @@ screen with a red-colored title line indicating **tfdbg** stopped immediately
 after a `run()` call generated intermediate tensors that passed the specified
 filter `has_inf_or_nan`:
 
-![tfdbg run-end UI: infs and nans](../images/tfdbg_screenshot_run_end_inf_nan.png)
+![tfdbg run-end UI: infs and nans](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_inf_nan.png)
 
 As the screen display indicates, the `has_inf_or_nan` filter is first passed
 during the fourth `run()` call: an [Adam optimizer](https://arxiv.org/abs/1412.6980)
@@ -218,7 +220,7 @@ item on the top or entering the equivalent command:
 tfdbg> ni cross_entropy/Log
 ```
 
-![tfdbg run-end UI: infs and nans](../images/tfdbg_screenshot_run_end_node_info.png)
+![tfdbg run-end UI: infs and nans](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_node_info.png)
 
 You can see that this node has the op type `Log`
 and that its input is the node `softmax/Softmax`. Run the following command to
@@ -261,7 +263,7 @@ simply click the underlined line numbers in the stack trace output of the
 `ni -t <op_name>` commands, or use the `ps` (or `print_source`) command such as:
 `ps /path/to/source.py`. See the screenshot below for an example of `ps` output:
 
-![tfdbg run-end UI: annotated Python source file](../images/tfdbg_screenshot_run_end_annotated_source.png)
+![tfdbg run-end UI: annotated Python source file](https://www.tensorflow.org/images/tfdbg_screenshot_run_end_annotated_source.png)
 
 Apply a value clipping on the input to @{tf.log}
 to resolve this problem:
@@ -407,6 +409,25 @@ python -m tensorflow.python.debug.examples.debug_errors \
     --error uninitialized_variable --debug
 ```
 
+**Q**: _How can I let my tfdbg-wrapped Sessions or Hooks run the debug mode
+only from the main thread?_
+
+**A**:
+This is a common use case, in which the `Session` object is used from multiple
+threads concurrently. Typically, the child threads take care of background tasks
+such as running enqueue operations. Oftentimes, you want to debug only the main
+thread (or less frequently, only one of the child threads). You can use the
+`thread_name_filter` keyword argument of `LocalCLIDebugWrapperSession` to
+achieve this type of thread-selective debugging. For example, if you would like
+to debug from only the main thread, you can do:
+
+```python
+sess = tf_debug.LocalCLIDebugWrapperSession(sess, thread_name_filter="MainThread$")
+```
+
+The above example relies on the fact that main threads in Python have the
+default name `MainThread`.
+
 **Q**: _The model I am debugging is very large. The data dumped by tfdbg
 fills up the free space of my disk. What can I do?_
 
@@ -418,8 +439,9 @@ that the graph contains, this kind of disk space issue can happen.
 There are three possible workarounds or solutions:
 
 1. The constructors of `LocalCLIDebugWrapperSession` and `LocalCLIDebugHook`
-   provide a keyword argument, `dump_root`, with which you can specify the path 
+   provide a keyword argument, `dump_root`, with which you can specify the path
    to which **tfdbg** dumps the debug data. For example:
+
    ``` python
    # For LocalCLIDebugWrapperSession
    sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
@@ -432,6 +454,7 @@ There are three possible workarounds or solutions:
 2. Reduce the batch size used during the runs.
 3. Use the filtering options of **tfdbg**'s `run` command to watch only specific
    nodes in the graph. For example:
+
    ```
    tfdbg> run --node_name_filter .*hidden.*
    tfdbg> run --op_type_filter Variable.*
diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index 309b39451fd11d7185359209a41f0a9dbb8efdb0..acdca2bad4f4765173a239b3e10ed9a700e4b637 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -39,6 +39,11 @@ trained graph.  The following guide details `MetaGraph` objects:
 
   * @{$meta_graph$Exporting and Importing a MetaGraph}.
 
+`SavedModel` is the universal serialization format for Tensorflow models. TensorFlow provides SavedModel CLI (command-line interface) as a tool to inspect and execute a MetaGraph in a SavedModel. The detailed usages and examples are
+documented in the following guide:
+
+  * @{$saved_model_cli$SavedModel CLI (Command-Line Interface)}.
+
 To learn about the TensorFlow versioning scheme, consult the following two
 guides:
 
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index d397917219097084c7ab23070986a9769ae37a4b..322e11cbd697ab427bc4857647234e2a9014ae6a 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -8,6 +8,7 @@ supervisor.md
 debugger.md
 tfdbg-tflearn.md
 meta_graph.md
+saved_model_cli.md
 version_semantics.md
 data_versions.md
 faq.md
diff --git a/tensorflow/docs_src/programmers_guide/reading_data.md b/tensorflow/docs_src/programmers_guide/reading_data.md
index 7c3a37417d7b220653167472aa250f2273bf2d5e..3c31d3a1a7065ed04b1eeb20960fd7687374bf28 100644
--- a/tensorflow/docs_src/programmers_guide/reading_data.md
+++ b/tensorflow/docs_src/programmers_guide/reading_data.md
@@ -133,7 +133,7 @@ uses a file format where each record is represented using a fixed number of
 bytes: 1 byte for the label followed by 3072 bytes of image data. Once you have
 a uint8 tensor, standard operations can slice out each piece and reformat as
 needed. For CIFAR-10, you can see how to do the reading and decoding in
-[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py)
+[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py)
 and described in
 @{$deep_cnn#prepare-the-data$this tutorial}.
 
@@ -170,7 +170,7 @@ You can then do any preprocessing of these examples you want. This would be any
 processing that doesn't depend on trainable parameters. Examples include
 normalization of your data, picking a random slice, adding noise or distortions,
 etc.  See
-[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/cifar10_input.py)
+[`tensorflow_models/tutorials/image/cifar10/cifar10_input.py`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/cifar10_input.py)
 for an example.
 
 ### Batching
@@ -309,7 +309,7 @@ operations, so that our training loop can dequeue examples from the example
 queue.
 
 <div style="width:70%; margin-left:12%; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/AnimatedFileQueues.gif">
+<img style="width:100%" src="https://www.tensorflow.org/images/AnimatedFileQueues.gif">
 </div>
 
 The helpers in `tf.train` that create these queues and enqueuing operations add
@@ -332,7 +332,7 @@ limit has been reached and no more examples are available.
 
 The last ingredient is the
 @{tf.train.Coordinator}. This is responsible
-for letting all the threads know if anything has signalled a shut down. Most
+for letting all the threads know if anything has signaled a shut down. Most
 commonly this would be because an exception was raised, for example one of the
 threads got an error when running some operation (or an ordinary Python
 exception).
diff --git a/tensorflow/docs_src/programmers_guide/saved_model_cli.md b/tensorflow/docs_src/programmers_guide/saved_model_cli.md
new file mode 100644
index 0000000000000000000000000000000000000000..eb9e60e42ed4346fe78e16eabf8401c34e87c17e
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/saved_model_cli.md
@@ -0,0 +1,251 @@
+# SavedModel CLI (Command-Line Interface)
+
+[`SavedModel`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md)
+is a universal serialization format for Tensorflow. It provides a
+language-neutral format to save machine-learned models and enables higher-level
+systems and tools to produce, consume and transform TensorFlow models.
+
+We provide SavedModel CLI(command-line interface) as a tool to inspect and
+execute a [`MetaGraph`](https://www.tensorflow.org/programmers_guide/meta_graph)
+in a SavedModel. You can inspect for example, what
+[`SignatureDefs`](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/g3doc/signature_defs.md),
+including their input and output tensors, are in the model without writing any
+code. This can be useful in situations such as when you want to quickly check
+your input dtype and shape match with the model. Moreover, if you want to test
+out the model, it also allows you to do a sanity check by passing in sample
+inputs in the format of for example, python expressions, and fetch the outputs
+simply through command line.
+
+## Get SavedModel CLI
+
+If TensorFlow is installed on your system through pip, the `saved_model_cli`
+binary can be invoked directly from command line.
+
+To build the binary from source, run the following command:
+
+```
+$bazel build tensorflow/python/tools:saved_model_cli
+```
+
+## Commands
+
+SavedModel CLI allows users to both show and run computations on a
+[`MetaGraphDef`](https://www.tensorflow.org/code/tensorflow/core/protobuf/meta_graph.proto)
+in a SavedModel. These are done through `show` and `run` commands. We will
+explain the usages of both commands with detailed examples. SavedModel CLI will
+also display this information with `-h` option.
+
+### `show` command
+
+A SavedModel contains one or more MetaGraphs, identified by their tag-sets. Each
+MetaGraph contains both a TensorFlow GraphDef as well as associated metadata
+necessary for running computation in a graph. In order to serve a model, you
+might wonder what kind of SignatureDefs are in each model, and what are their
+inputs and outputs etc. The `show` command let you examine the content of the
+SavedModel in a hierarchical order.
+
+```
+usage: saved_model_cli show [-h] --dir DIR [--all]
+[--tag_set TAG_SET] [--signature_def SIGNATURE_DEF_KEY]
+```
+
+#### Examples
+
+To show all available MetaGraphDef tag-sets in the SavedModel:
+
+```
+$saved_model_cli show --dir /tmp/saved_model_dir
+The given SavedModel contains the following tag-sets:
+serve
+serve, gpu
+```
+
+To show all available SignatureDef keys in a MetaGraphDef:
+
+```
+$saved_model_cli show --dir /tmp/saved_model_dir --tag_set serve
+The given SavedModel MetaGraphDef contains SignatureDefs with the following keys:
+SignatureDef key: "classify_x2_to_y3"
+SignatureDef key: "classify_x_to_y"
+SignatureDef key: "regress_x2_to_y3"
+SignatureDef key: "regress_x_to_y"
+SignatureDef key: "regress_x_to_y2"
+SignatureDef key: "serving_default"
+```
+
+For a MetaGraphDef with multiple tags in the tag-set, all tags must be passed
+in, separated by ',':
+
+```
+$saved_model_cli show --dir /tmp/saved_model_dir --tag_set serve,gpu
+```
+
+To show all inputs and outputs TensorInfo for a specific SignatureDef, pass in
+the SignatureDef key to `signature_def` option. This is very useful when you
+want to know the tensor key value, dtype and shape of the input tensors for
+executing the computation graph later.
+
+```
+$saved_model_cli show --dir \
+/tmp/saved_model_dir --tag_set serve --signature_def serving_default
+The given SavedModel SignatureDef contains the following input(s):
+inputs['x'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['y'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y:0
+Method name is: tensorflow/serving/predict
+```
+
+To show all available information in the SavedModel, use `--all` option:
+
+```
+$saved_model_cli show --dir /tmp/saved_model_dir --all
+MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
+
+signature_def['classify_x2_to_y3']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['inputs'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x2:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['scores'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y3:0
+Method name is: tensorflow/serving/classify
+
+...
+
+signature_def['serving_default']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['x'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['y'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y:0
+Method name is: tensorflow/serving/predict
+```
+
+### `run` command
+
+SavedModel CLI also allows you to run a graph computation by passing in inputs,
+displaying, and saving the outputs.
+
+```
+usage: saved_model_cli run [-h] --dir DIR --tag_set TAG_SET --signature_def
+                           SIGNATURE_DEF_KEY [--inputs INPUTS]
+                           [--input_exprs INPUT_EXPRS] [--outdir OUTDIR]
+                           [--overwrite] [--tf_debug]
+```
+
+Tensor keys are used to specify which input we are passing in the values for.
+There are two ways to pass inputs to the model. With '--inputs' option, you can
+pass in numpy ndarray by files. With '--input_exprs' option, you can use python
+expressions as inputs.
+
+#### Input By File
+
+To pass in inputs by files, use `--inputs` option in the format of
+`<input_key>=<filename>`, or `<input_key>=<filename>[<variable_name>]`. Each
+input is separated by semicolon. File specified by `filename` will be loaded
+using `numpy.load`. Inputs can be loaded from only `.npy`, `.npz` or pickle
+files. The `variable_name` key is optional depending on the input file type as
+descripted in more details below.
+
+When loading from a `.npy` file, which always contains a numpy ndarray, the
+content will be directly assigned to the specified input tensor. If a
+`variable_name` is specified, it will be ignored and a warning will be issued.
+
+When loading from a `.npz` zip file, user can specify which variable within the
+zip file to load for the input tensor key with `variable_name`. If nothing is
+specified, SavedModel CLI will check that only one file is included in the zip
+file and load it for the specified input tensor key.
+
+When loading from a pickle file, if no `variable_name` is specified in the
+square brackets, whatever that is inside the pickle file will be passed to the
+specified input tensor key. Else SavedModel CLI will assume a dictionary is
+stored in the pickle file and the value corresponding to the variable_name will
+be used.
+
+#### Input By Python Expression
+
+To pass in inputs by python expressions, use `--input_exprs` option. `numpy`
+module is available as `np`. For example, `input_key=np.ones((32, 32, 3))` or
+`input_key=[[1], [2], [3]]`. This can be useful for when you don't have data
+files lying around, but still want to sanity check the model with some simple
+inputs that match the dtype and shape of the model signature.
+
+#### Save Output
+
+By default, SavedModel CLI will print outputs to console. If a directory is
+passed to `--outdir` option, the outputs will be saved as npy files named after
+output tensor keys under the given directory. Use `--overwite` to overwrite
+existing output files.
+
+#### TensorFlow Debugger (tfdbg) Integration
+
+If `--tf_debug` option is set, SavedModel CLI will use TensorFlow Debugger
+(tfdbg) to watch the intermediate Tensors and runtime GraphDefs while running
+the SavedModel.
+
+#### Examples
+
+If we have a simple model that adds `x1` and `x2` to get output `y`, where all
+tensors are of shape `(-1, 1)`, and we have two `npz` files. File
+`/tmp/my_data1.npy` contains a numpy ndarray `[[1], [2], [3]]`, file
+`/tmp/my_data2.npy` contains another numpy ndarray `[[0.5], [0.5], [0.5]]`. Now
+let's run these two `npy` files through the model to get `y`:
+
+```
+$saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \
+--signature_def x1_x2_to_y --inputs x1=/tmp/my_data1.npy;x2=/tmp/my_data2.npy \
+--outdir /tmp/out
+Result for output key y:
+[[ 1.5]
+ [ 2.5]
+ [ 3.5]]
+```
+
+Similarly, we can run input tensors from `npz` file and pickle file, as well as
+overwrite the previous output file:
+
+```
+$saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \
+--signature_def x1_x2_to_y \
+--inputs x1=/tmp/my_data1.npz[x];x2=/tmp/my_data2.pkl --outdir /tmp/out \
+--overwrite
+Result for output key y:
+[[ 1.5]
+ [ 2.5]
+ [ 3.5]]
+```
+
+You can also use python expression instead of input file. Here we replace input
+`x2` with a python expression:
+
+```
+$saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \
+--signature_def x1_x2_to_y --inputs x1=/tmp/my_data1.npz[x] \
+--input_exprs 'x2=np.ones((3,1))'
+Result for output key y:
+[[ 2]
+ [ 3]
+ [ 4]]
+```
+
+To run model with TensorFlow Debugger on:
+
+```
+$saved_model_cli run --dir /tmp/saved_model_dir --tag_set serve \
+--signature_def serving_default --inputs x=/tmp/data.npz[x] --tf_debug
+```
diff --git a/tensorflow/docs_src/programmers_guide/supervisor.md b/tensorflow/docs_src/programmers_guide/supervisor.md
index 82ed1c2cf76679f2b3cc86807fba882941722e6e..55a090df5898673cec7812021b1feea9606d6376 100644
--- a/tensorflow/docs_src/programmers_guide/supervisor.md
+++ b/tensorflow/docs_src/programmers_guide/supervisor.md
@@ -362,8 +362,8 @@ following keyword arguments to the `Supervisor()` constructor:
    If not specified, the supervisor uses the first op in the
    `tf.GraphKeys.LOCAL_INIT_OP` collection.  If the collection is empty the
    supervisor adds an op to initialize all the tables and local variables in
-   the graph by calling `tf.initialize_all_tables()` and
-   `tf.initialize_all_local_variables()`.
+   the graph by calling `tf.tables_initializer()` and
+   `tf.local_variables_initializer()`.
 
    Pass `None` to not use a local init op.
 
diff --git a/tensorflow/docs_src/programmers_guide/threading_and_queues.md b/tensorflow/docs_src/programmers_guide/threading_and_queues.md
index 1999cf6941095b5cd658c49965dc384e7d58ff6b..835e8060466428ddbb82f9ef6d1b78c76a0c9890 100644
--- a/tensorflow/docs_src/programmers_guide/threading_and_queues.md
+++ b/tensorflow/docs_src/programmers_guide/threading_and_queues.md
@@ -14,7 +14,7 @@ that takes an item off the queue, adds one to that item, and puts it back on the
 end of the queue. Slowly, the numbers on the queue increase.
 
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/IncremeterFifoQueue.gif">
+<img style="width:100%" src="https://www.tensorflow.org/images/IncremeterFifoQueue.gif">
 </div>
 
 `Enqueue`, `EnqueueMany`, and `Dequeue` are special nodes. They take a pointer
diff --git a/tensorflow/docs_src/programmers_guide/variable_scope.md b/tensorflow/docs_src/programmers_guide/variable_scope.md
index 5084acbab97b0d87ca3232b44b0e07e92e8a6ff4..f4d2b3f37b875f589e2de69d8681a09e90f99360 100644
--- a/tensorflow/docs_src/programmers_guide/variable_scope.md
+++ b/tensorflow/docs_src/programmers_guide/variable_scope.md
@@ -5,7 +5,7 @@ in the way described in the @{$variables$Variables HowTo}.
 But when building complex models you often need to share large sets of
 variables and you might want to initialize all of them in one place.
 This tutorial shows how this can be done using `tf.variable_scope()` and
-the `tf.get_variable()`.
+`tf.get_variable()`.
 
 ## The Problem
 
@@ -368,6 +368,6 @@ sequence-to-sequence models.
 
 File | What's in it?
 --- | ---
-`models/tutorials/image/cifar10/cifar10.py` | Model for detecting objects in images.
-`models/tutorials/rnn/rnn_cell.py` | Cell functions for recurrent neural networks.
-`models/tutorials/rnn/seq2seq.py` | Functions for building sequence-to-sequence models.
+`tutorials/image/cifar10/cifar10.py` | Model for detecting objects in images.
+`tutorials/rnn/rnn_cell.py` | Cell functions for recurrent neural networks.
+`tutorials/rnn/seq2seq.py` | Functions for building sequence-to-sequence models.
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index 04bfca5f3bdc8dad9e4dedf3ce8691ae01eb1f44..e8d1e519f0b8fd05039b107a5501ea0da7cc29a6 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -62,7 +62,7 @@ with tf.device("/job:ps/task:7"):
   v = tf.Variable(...)
 ```
 
-**N.B.** Operations that mutate a variable, such as
+**NOTE** Operations that mutate a variable, such as
 @{tf.Variable.assign} and the parameter
 update operations in a
 @{tf.train.Optimizer} *must* run on
diff --git a/tensorflow/docs_src/programmers_guide/version_semantics.md b/tensorflow/docs_src/programmers_guide/version_semantics.md
index 0f06d4b07685c39f0f652b25114b79eaeb70a39d..cee3b105de4c2faba801c5ea4e01765391d1173b 100644
--- a/tensorflow/docs_src/programmers_guide/version_semantics.md
+++ b/tensorflow/docs_src/programmers_guide/version_semantics.md
@@ -54,9 +54,9 @@ patch versions.  The public APIs consist of
 Some API functions are explicitly marked as "experimental" and can change in
 backward incompatible ways between minor releases. These include:
 
-* **Experimental APIs**: The @{tf.contrib} module and its submodules in Python
-  and any functions in the C API or fields in protocol buffers that are
-  explicitly commented as being experimental.
+*   **Experimental APIs**: The @{tf.contrib} module and its submodules in Python
+    and any functions in the C API or fields in protocol buffers that are
+    explicitly commented as being experimental.
 
 *   **Other languages**: TensorFlow APIs in languages other than Python and C,
     such as:
@@ -93,6 +93,22 @@ backward incompatible ways between minor releases. These include:
     single cluster is unsupported. There are no guarantees about backwards
     compatibility of the wire protocol.
 
+*   **Bugs:** We reserve the right to make backwards incompatible behavior
+    (though not API) changes if the current implementation is clearly broken,
+    i.e., if it is contradicting the documentation, or if a well-known and
+    well-defined intended behavior is not properly implemented due to a bug.
+    For example, if an optimizer claims to implement a well-known optimization
+    algorithm but, due to a bug, does not match that algorithm we will fix the
+    optimizer. This may break code relying on the wrong behavior for
+    convergence. We will note such changes in the release notes.
+
+*   **Error messages:** We reserve the right to change the text of error
+    messages. In addition, the type of an error may change unless the type is
+    specified in the documentation. For example, a function that says in some
+    condition it will raise an `InvalidArgument` exception, it will continue to
+    raise `InvalidArgument`, but the human-readable message contents can change.
+
+
 Furthermore, any API methods marked "deprecated" in the 1.0 release can
 be deleted in any subsequent minor release.
 
@@ -102,7 +118,7 @@ Many users of TensorFlow will be saving graphs and trained models to disk for
 later evaluation or more training, often changing versions of TensorFlow in the
 process.  First, following semver, any graph or checkpoint written out with one
 version of TensorFlow can be loaded and evaluated with a later version of
-TensorFlow with the same major release.  However, we will endeavour to preserve
+TensorFlow with the same major release.  However, we will endeavor to preserve
 backwards compatibility even across major releases when possible, so that the
 serialized files are usable over long periods of time.
 
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index ba3fbe12804630dc88c3da2f787f9d29b102ed94..b0617326ff32ce8d219985d8eb1baa1c0ffc0cc4 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -83,7 +83,7 @@ for details.  It consists of 1,068,298 learnable parameters and requires about
 ## Code Organization
 
 The code for this tutorial resides in
-[`tensorflow_models/tutorials/image/cifar10/`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/).
+[`models/tutorials/image/cifar10/`](https://www.tensorflow.org/code/tensorflow_models/tutorials/image/cifar10/).
 
 File | Purpose
 --- | ---
@@ -141,7 +141,7 @@ so that we may visualize them in @{$summaries_and_tensorboard$TensorBoard}.
 This is a good practice to verify that inputs are built correctly.
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:70%" src="../images/cifar_image_summary.png">
+  <img style="width:70%" src="https://www.tensorflow.org/images/cifar_image_summary.png">
 </div>
 
 Reading images from disk and distorting them can use a non-trivial amount of
@@ -170,7 +170,7 @@ Layer Name | Description
 Here is a graph generated from TensorBoard describing the inference operation:
 
 <div style="width:15%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../images/cifar_graph.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/cifar_graph.png">
 </div>
 
 > **EXERCISE**: The output of `inference` are un-normalized logits. Try editing
@@ -205,7 +205,7 @@ loss and all these weight decay terms, as returned by the `loss()` function.
 
 We visualize it in TensorBoard with a @{tf.summary.scalar}:
 
-![CIFAR-10 Loss](../images/cifar_loss.png "CIFAR-10 Total Loss")
+![CIFAR-10 Loss](https://www.tensorflow.org/images/cifar_loss.png "CIFAR-10 Total Loss")
 
 We train the model using standard
 [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent)
@@ -214,7 +214,7 @@ with a learning rate that
 @{tf.train.exponential_decay$exponentially decays}
 over time.
 
-![CIFAR-10 Learning Rate Decay](../images/cifar_lr_decay.png "CIFAR-10 Learning Rate Decay")
+![CIFAR-10 Learning Rate Decay](https://www.tensorflow.org/images/cifar_lr_decay.png "CIFAR-10 Learning Rate Decay")
 
 The `train()` function adds the operations needed to minimize the objective by
 calculating the gradient and updating the learned variables (see
@@ -295,8 +295,8 @@ For instance, we can watch how the distribution of activations and degree of
 sparsity in `local3` features evolve during training:
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px; display: flex; flex-direction: row">
-  <img style="flex-grow:1; flex-shrink:1;" src="../images/cifar_sparsity.png">
-  <img style="flex-grow:1; flex-shrink:1;" src="../images/cifar_activations.png">
+  <img style="flex-grow:1; flex-shrink:1;" src="https://www.tensorflow.org/images/cifar_sparsity.png">
+  <img style="flex-grow:1; flex-shrink:1;" src="https://www.tensorflow.org/images/cifar_activations.png">
 </div>
 
 Individual loss functions, as well as the total loss, are particularly
@@ -378,7 +378,7 @@ processing a batch of data.
 Here is a diagram of this model:
 
 <div style="width:40%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../images/Parallelism.png">
+  <img style="width:100%" src="https://www.tensorflow.org/images/Parallelism.png">
 </div>
 
 Note that each GPU computes inference as well as the gradients for a unique
diff --git a/tensorflow/docs_src/tutorials/image_recognition.md b/tensorflow/docs_src/tutorials/image_recognition.md
index bf03427fc5b011c96cabeff3a0b5c081e1b9309c..88ae451cd5365d878d8b4fea83cebe4c6ff57c91 100644
--- a/tensorflow/docs_src/tutorials/image_recognition.md
+++ b/tensorflow/docs_src/tutorials/image_recognition.md
@@ -36,7 +36,7 @@ images into [1000 classes], like "Zebra", "Dalmatian", and "Dishwasher".
 For example, here are the results from [AlexNet] classifying some images:
 
 <div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/AlexClassification.png">
+<img style="width:100%" src="https://www.tensorflow.org/images/AlexClassification.png">
 </div>
 
 To compare models, we examine how often the model fails to predict the
@@ -75,7 +75,7 @@ Start by cloning the [TensorFlow models repo](https://github.com/tensorflow/mode
 The above command will classify a supplied image of a panda bear.
 
 <div style="width:15%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../images/cropped_panda.jpg">
+  <img style="width:100%" src="https://www.tensorflow.org/images/cropped_panda.jpg">
 </div>
 
 If the model runs correctly, the script will produce the following output:
@@ -137,7 +137,7 @@ score of 0.8.
 
 
 <div style="width:45%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../images/grace_hopper.jpg">
+  <img style="width:100%" src="https://www.tensorflow.org/images/grace_hopper.jpg">
 </div>
 
 Next, try it out on your own images by supplying the --image= argument, e.g.
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index c42bb8a023eaac7fdb5ae17c6940e760d8cbee51..85e6ec76dc4ed966492e404d0c3ab59824f41413 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -14,11 +14,16 @@ laptop, without requiring a GPU. This tutorial will show you how to run the
 example script on your own images, and will explain some of the options you have
 to help control the training process.
 
+Note: This version of the tutorial mainly uses bazel. A bazel free version is
+also available
+[as a codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0).
+
 [TOC]
 
 ## Training on Flowers
 
-![Daisies by Kelly Sikkema](../images/daisies.jpg)
+![Daisies by Kelly Sikkema](https://www.tensorflow.org/images/daisies.jpg)
+
 [Image by Kelly Sikkema](https://www.flickr.com/photos/95072945@N05/9922116524/)
 
 Before you start any training, you'll need a set of images to teach the network
@@ -148,10 +153,10 @@ Here's an example of how to build and run the label_image example with your
 retrained graphs:
 
 ```sh
-bazel build tensorflow/examples/label_image:label_image && \
-bazel-bin/tensorflow/examples/label_image/label_image \
+bazel build tensorflow/examples/image_retraining:label_image && \
+bazel-bin/tensorflow/examples/image_retraining/label_image \
 --graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---output_layer=final_result \
+--output_layer=final_result:0 \
 --image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
 ```
 
@@ -160,7 +165,10 @@ You should see a list of flower labels, in most cases with daisy on top
 `--image` parameter with your own images to try those out, and use the C++ code
 as a template to integrate with your own applications.
 
-If you'd like to use the retrained model in a Python program [this example from @eldor4do shows what you'll need to do](https://github.com/eldor4do/TensorFlow-Examples/blob/master/retraining-example.py).
+If you'd like to use the retrained model in your own Python program, then the
+above
+[`label_image` script](https://www.tensorflow.org/code/tensorflow/examples/image_retraining/label_image.py)
+is a reasonable starting point.
 
 ## Training on Your Own Categories
 
@@ -174,7 +182,7 @@ you do that and pass the root folder of the subdirectories as the argument to
 Here's what the folder structure of the flowers archive looks like, to give you
 and example of the kind of layout the script is looking for:
 
-![Folder Structure](../images/folder_structure.png)
+![Folder Structure](https://www.tensorflow.org/images/folder_structure.png)
 
 In practice it may take some work to get the accuracy you want. I'll try to
 guide you through some of the common problems you might encounter below.
@@ -260,7 +268,7 @@ them destroys their meaning.
 
 There are several other parameters you can try adjusting to see if they help
 your results. The `--learning_rate` controls the magnitude of the updates to the
-final layer during training. Intuitively if this is smaller then the learning
+final layer during training. Intuitively if this is smaller than the learning
 will take longer, but it can end up helping the overall precision. That's not
 always the case though, so you need to experiment carefully to see what works
 for your case. The `--train_batch_size` controls how many images are examined
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index 2550bd3e4287a9abb4ac25b11fb4ca779e233875..289a45e2ed5d783c04d19a3948ed1a9e188b673f 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -7,7 +7,7 @@ activation functions, and applying dropout regularization. In this tutorial,
 you'll learn how to use `layers` to build a convolutional neural network model
 to recognize the handwritten digits in the MNIST data set.
 
-![handwritten digits 0–9 from the MNIST data set](../images/mnist_0-9.png)
+![handwritten digits 0–9 from the MNIST data set](https://www.tensorflow.org/images/mnist_0-9.png)
 
 **The [MNIST dataset](http://yann.lecun.com/exdb/mnist/) comprises 60,000
 training examples and 10,000 test examples of the handwritten digits 0–9,
@@ -316,11 +316,11 @@ of 2, which indicates that the subregions extracted by the filter should be
 separated by 2 pixels in both the width and height dimensions (for a 2x2 filter,
 this means that none of the regions extracted will overlap). If you want to set
 different stride values for width and height, you can instead specify a tuple or
-list (e.g., `stride=[3,6]`).
+list (e.g., `stride=[3, 6]`).
 
 Our output tensor produced by `max_pooling2d()` (`pool1`) has a shape of
-<code>[<em>batch_size</em>, 14, 14, 1]</code>: the 2x2 filter reduces width and
-height by 50%.
+<code>[<em>batch_size</em>, 14, 14, 32]</code>: the 2x2 filter reduces width and
+height by 50% each.
 
 ### Convolutional Layer #2 and Pooling Layer #2
 
@@ -341,7 +341,7 @@ pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
 ```
 
 Note that convolutional layer #2 takes the output tensor of our first pooling
-layer (`pool1`) as input, and produces the tensor `h_conv2` as output. `conv2`
+layer (`pool1`) as input, and produces the tensor `conv2` as output. `conv2`
 has a shape of <code>[<em>batch_size</em>, 14, 14, 64]</code>, the same width
 and height as `pool1` (due to `padding="same"`), and 64 channels for the 64
 filters applied.
diff --git a/tensorflow/docs_src/tutorials/mandelbrot.md b/tensorflow/docs_src/tutorials/mandelbrot.md
index 7d8abbdcba67fd86c7e26e735cfa30bb637989c6..1c0a548129c22f2c57107061bd7eda6239eabdb8 100755
--- a/tensorflow/docs_src/tutorials/mandelbrot.md
+++ b/tensorflow/docs_src/tutorials/mandelbrot.md
@@ -109,7 +109,7 @@ Let's see what we've got.
 DisplayFractal(ns.eval())
 ```
 
-![jpeg](../images/mandelbrot_output.jpg)
+![jpeg](https://www.tensorflow.org/images/mandelbrot_output.jpg)
 
 Not bad!
 
diff --git a/tensorflow/docs_src/tutorials/pdes.md b/tensorflow/docs_src/tutorials/pdes.md
index ec6915074ba7392967a8a72a67d7c54ff8d981ae..425e8d7084e7f2505b7a3013b431345b72b38cf0 100755
--- a/tensorflow/docs_src/tutorials/pdes.md
+++ b/tensorflow/docs_src/tutorials/pdes.md
@@ -93,7 +93,7 @@ for n in range(40):
 DisplayArray(u_init, rng=[-0.1, 0.1])
 ```
 
-![jpeg](../images/pde_output_1.jpg)
+![jpeg](https://www.tensorflow.org/images/pde_output_1.jpg)
 
 
 Now let's specify the details of the differential equation.
diff --git a/tensorflow/docs_src/tutorials/recurrent.md b/tensorflow/docs_src/tutorials/recurrent.md
index a1c0532f5a2cd62ba7980583ae71afec1e3ab7ee..708a9620dd7ec2b71905d932dffa2af74cfceb96 100644
--- a/tensorflow/docs_src/tutorials/recurrent.md
+++ b/tensorflow/docs_src/tutorials/recurrent.md
@@ -51,11 +51,28 @@ The core of the model consists of an LSTM cell that processes one word at a
 time and computes probabilities of the possible values for the next word in the
 sentence. The memory state of the network is initialized with a vector of zeros
 and gets updated after reading each word. For computational reasons, we will
-process data in mini-batches of size `batch_size`.
+process data in mini-batches of size `batch_size`.  In this example, it is important 
+to note that `current_batch_of_words` does not correspond to a "sentence" of words.  
+Every word in a batch should correspond to time t.  Tensorflow will automatically sum 
+the gradients of each batch for you.
+
+For example:
+```
+ t=0  t=1    t=2  t=3     t=4
+[The, brown, fox, is,     quick]
+[The, red,   fox, jumped, high]
+
+words_in_dataset[0] = [The, The]
+words_in_dataset[1] = [fox, fox]
+words_in_dataset[2] = [is, jumped]
+words_in_dataset[3] = [quick, high]
+num_batches = 4, batch_size = 2, time_steps = 5
+```
 
 The basic pseudocode is as follows:
 
 ```python
+words_in_dataset = tf.placeholder(tf.float32, [num_batches, batch_size, num_features])
 lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
 # Initial state of the LSTM memory.
 state = tf.zeros([batch_size, lstm.state_size])
@@ -156,9 +173,10 @@ the second and so on.
 We have a class called `MultiRNNCell` that makes the implementation seamless:
 
 ```python
-lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, state_is_tuple=False)
-stacked_lstm = tf.contrib.rnn.MultiRNNCell([lstm] * number_of_layers,
-    state_is_tuple=False)
+def lstm_cell():
+  return tf.contrib.rnn.BasicLSTMCell(lstm_size)
+stacked_lstm = tf.contrib.rnn.MultiRNNCell(
+    [lstm_cell() for _ in range(number_of_layers)])
 
 initial_state = state = stacked_lstm.zero_state(batch_size, tf.float32)
 for i in range(num_steps):
@@ -173,15 +191,22 @@ final_state = state
 
 ## Run the Code
 
-Start by cloning the [TensorFlow models repo](https://github.com/tensorflow/models) from GitHub.
-You'll also need to download the PTB dataset, as discussed at the beginning of
-this tutorial; we'll assume the dataset is located in `/tmp/simple-examples/data`.
+Before running the code, download the PTB dataset, as discussed at the beginning
+of this tutorial.  Then, extract the PTB dataset underneath your home directory
+as follows:
+
+```bsh
+tar xvfz simple-examples.tgz -C $HOME
+```
+_(Note: On Windows, you may need to use
+[other tools](https://wiki.haskell.org/How_to_unpack_a_tar_file_in_Windows).)_
 
-Run the following commands:
+Now, clone the [TensorFlow models repo](https://github.com/tensorflow/models)
+from GitHub. Run the following commands:
 
-```bash
+```bsh
 cd models/tutorials/rnn/ptb
-python ptb_word_lm.py --data_path=/tmp/simple-examples/data/ --model=small
+python ptb_word_lm.py --data_path=$HOME/simple-examples/data/ --model=small
 ```
 
 There are 3 supported model configurations in the tutorial code: "small",
diff --git a/tensorflow/docs_src/tutorials/seq2seq.md b/tensorflow/docs_src/tutorials/seq2seq.md
index a3db3e51cfd1cdba489c14d9c1b50f1512ce240f..6ffe3e8b037a8e21b38cded7e3b0d617b4ddb212 100644
--- a/tensorflow/docs_src/tutorials/seq2seq.md
+++ b/tensorflow/docs_src/tutorials/seq2seq.md
@@ -40,7 +40,7 @@ networks (RNNs): an *encoder* that processes the input and a *decoder* that
 generates the output. This basic architecture is depicted below.
 
 <div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/basic_seq2seq.png" />
+<img style="width:100%" src="https://www.tensorflow.org/images/basic_seq2seq.png" />
 </div>
 
 Each box in the picture above represents a cell of the RNN, most commonly
@@ -62,7 +62,7 @@ decoding step. A multi-layer sequence-to-sequence network with LSTM cells and
 attention mechanism in the decoder looks like this.
 
 <div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/attention_seq2seq.png" />
+<img style="width:100%" src="https://www.tensorflow.org/images/attention_seq2seq.png" />
 </div>
 
 ## TensorFlow seq2seq library
diff --git a/tensorflow/docs_src/tutorials/using_gpu.md b/tensorflow/docs_src/tutorials/using_gpu.md
index d64cdafdefb287d9ddb15ac98c45ab389384ef5a..dcec62d2749b794e48a0f8b0b5d89484cbffaa13 100644
--- a/tensorflow/docs_src/tutorials/using_gpu.md
+++ b/tensorflow/docs_src/tutorials/using_gpu.md
@@ -57,14 +57,17 @@ have the same device assignment.
 with tf.device('/cpu:0'):
   a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
   b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
-  c = tf.matmul(a, b)
+c = tf.matmul(a, b)
 # Creates a session with log_device_placement set to True.
 sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
 # Runs the op.
 print(sess.run(c))
 ```
 
-You will see that now `a` and `b` are assigned to `cpu:0`.
+You will see that now `a` and `b` are assigned to `cpu:0`. Since a device was
+not explicitly specified for the `MatMul` operation, the TensorFlow runtime will
+choose one based on the operation and available devices (`gpu:0` in this
+example) and automatically copy tensors between devices if required.
 
 ```
 Device mapping:
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index 1b72ba0746d0bad55cf88fd9277d4a1addf86e77..ce820099037757d79b79fe89c8c0d1ef200400e7 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -14,26 +14,22 @@ To try the code for this tutorial:
 
 1.  @{$install$Install TensorFlow} if you haven't already.
 
-2.  Download [the tutorial code](
-https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/learn/wide_n_deep_tutorial.py).
+2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
 3.  Install the pandas data analysis library. tf.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
-    1. Get `pip`:
 
-       ```shell
-       # Ubuntu/Linux 64-bit
-       $ sudo apt-get install python-pip python-dev
+    a. Get `pip`:
 
-       # Mac OS X
-       $ sudo easy_install pip
-       $ sudo easy_install --upgrade six
-       ```
+        # Ubuntu/Linux 64-bit
+        $ sudo apt-get install python-pip python-dev
 
-    2. Use `pip` to install pandas:
+        # Mac OS X
+        $ sudo easy_install pip
+        $ sudo easy_install --upgrade six
 
-       ```shell
-       $ sudo pip install pandas
-       ```
+    b. Use `pip` to install pandas:
+
+        $ sudo pip install pandas
 
     If you have trouble installing pandas, consult the
     [instructions](http://pandas.pydata.org/pandas-docs/stable/install.html)
@@ -42,9 +38,7 @@ https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/learn/w
 4. Execute the tutorial code with the following command to train the linear
 model described in this tutorial:
 
-   ```shell
-   $ python wide_n_deep_tutorial.py --model_type=wide
-   ```
+        $ python wide_n_deep_tutorial.py --model_type=wide
 
 Read on to find out how this code builds its linear model.
 
@@ -387,7 +381,7 @@ which means the accuracy is 83.6%. Feel free to try more features and
 transformations and see if you can do even better!
 
 If you'd like to see a working end-to-end example, you can download our
-[example code](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/learn/wide_n_deep_tutorial.py)
+[example code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 and set the `model_type` flag to `wide`.
 
 ## Adding Regularization to Prevent Overfitting
diff --git a/tensorflow/docs_src/tutorials/wide_and_deep.md b/tensorflow/docs_src/tutorials/wide_and_deep.md
index dd830eeca91cb61481afc77f4d32db1fa735828a..77c905fd51369ec63bb47b3cdb7dc58f862a6410 100644
--- a/tensorflow/docs_src/tutorials/wide_and_deep.md
+++ b/tensorflow/docs_src/tutorials/wide_and_deep.md
@@ -17,8 +17,7 @@ large-scale regression and classification problems with sparse input features
 you're interested in learning more about how Wide & Deep Learning works, please
 check out our [research paper](http://arxiv.org/abs/1606.07792).
 
-![Wide & Deep Spectrum of Models]
-(../images/wide_n_deep.svg "Wide & Deep")
+![Wide & Deep Spectrum of Models](https://www.tensorflow.org/images/wide_n_deep.svg "Wide & Deep")
 
 The figure above shows a comparison of a wide model (logistic regression with
 sparse features and transformations), a deep model (feed-forward neural network
@@ -39,27 +38,24 @@ And that's it! Let's go through a simple example.
 
 To try the code for this tutorial:
 
-1.  @{$install$Install TensorFlow}[Install TensorFlow]  if you haven't already.
+1.  @{$install$Install TensorFlow} if you haven't already.
 
-2.  Download [the tutorial code](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/learn/wide_n_deep_tutorial.py).
+2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
 3.  Install the pandas data analysis library. tf.learn doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
-    1. Get `pip`:
 
-       ```shell
-       # Ubuntu/Linux 64-bit
-       $ sudo apt-get install python-pip python-dev
+    a. Get `pip`:
 
-       # Mac OS X
-       $ sudo easy_install pip
-       $ sudo easy_install --upgrade six
-      ```
+        # Ubuntu/Linux 64-bit
+        $ sudo apt-get install python-pip python-dev
 
-    2. Use `pip` to install pandas:
+        # Mac OS X
+        $ sudo easy_install pip
+        $ sudo easy_install --upgrade six
 
-       ```shell
-       $ sudo pip install pandas
-       ```
+    b. Use `pip` to install pandas:
+
+        $ sudo pip install pandas
 
     If you have trouble installing pandas, consult the
     [instructions](http://pandas.pydata.org/pandas-docs/stable/install.html)
@@ -68,9 +64,7 @@ To try the code for this tutorial:
 4. Execute the tutorial code with the following command to train the linear
 model described in this tutorial:
 
-   ```shell
-   $ python wide_n_deep_tutorial.py --model_type=wide_n_deep
-   ```
+        $ python wide_n_deep_tutorial.py --model_type=wide_n_deep
 
 Read on to find out how this code builds its linear model.
 
@@ -262,7 +256,7 @@ The first line of the output should be something like `accuracy: 0.84429705`. We
 can see that the accuracy was improved from about 83.6% using a wide-only linear
 model to about 84.4% using a Wide & Deep model. If you'd like to see a working
 end-to-end example, you can download our
-[example code](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/learn/wide_n_deep_tutorial.py).
+[example code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
 
 Note that this tutorial is just a quick example on a small dataset to get you
 familiar with the API. Wide & Deep Learning will be even more powerful if you
diff --git a/tensorflow/docs_src/tutorials/word2vec.md b/tensorflow/docs_src/tutorials/word2vec.md
index 3845e67496c0cff76515faf6d04f808c62dc042f..d7a9089949c05aebf033d7024ae2333e9b8a31f6 100644
--- a/tensorflow/docs_src/tutorials/word2vec.md
+++ b/tensorflow/docs_src/tutorials/word2vec.md
@@ -23,7 +23,7 @@ straight in, feel free to look at the minimalistic implementation in
 This basic example contains the code needed to download some data, train on it a
 bit and visualize the result. Once you get comfortable with reading and running
 the basic version, you can graduate to
-[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py)
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py)
 which is a more serious implementation that showcases some more advanced
 TensorFlow principles about how to efficiently use threads to move data into a
 text model, how to checkpoint during training, etc.
@@ -51,7 +51,7 @@ means that we may need more data in order to successfully train statistical
 models.  Using vector representations can overcome some of these obstacles.
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/audio-image-text.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/audio-image-text.png" alt>
 </div>
 
 [Vector space models](https://en.wikipedia.org/wiki/Vector_space_model) (VSMs)
@@ -108,7 +108,7 @@ $$
 
 where \\(\text{score}(w_t, h)\\) computes the compatibility of word \\(w_t\\)
 with the context \\(h\\) (a dot product is commonly used). We train this model
-by maximizing its [log-likelihood](https://en.wikipedia.org/wiki/Likelihood_function) 
+by maximizing its [log-likelihood](https://en.wikipedia.org/wiki/Likelihood_function)
 on the training set, i.e. by maximizing
 
 $$
@@ -125,18 +125,18 @@ probability using the score for all other \\(V\\) words \\(w'\\) in the current
 context \\(h\\), *at every training step*.
 
 <div style="width:60%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/softmax-nplm.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/softmax-nplm.png" alt>
 </div>
 
 On the other hand, for feature learning in word2vec we do not need a full
 probabilistic model. The CBOW and skip-gram models are instead trained using a
-binary classification objective ([logistic regression](https://en.wikipedia.org/wiki/Logistic_regression)) 
+binary classification objective ([logistic regression](https://en.wikipedia.org/wiki/Logistic_regression))
 to discriminate the real target words \\(w_t\\) from \\(k\\) imaginary (noise) words \\(\tilde w\\), in the
 same context. We illustrate this below for a CBOW model. For skip-gram the
 direction is simply inverted.
 
 <div style="width:60%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/nce-nplm.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/nce-nplm.png" alt>
 </div>
 
 Mathematically, the objective (for each example) is to maximize
@@ -233,7 +233,7 @@ below (see also for example
 [Mikolov et al., 2013](http://www.aclweb.org/anthology/N13-1090)).
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/linear-relationships.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/linear-relationships.png" alt>
 </div>
 
 This explains why these vectors are also useful as features for many canonical
@@ -335,13 +335,13 @@ After training has finished we can visualize the learned embeddings using
 t-SNE.
 
 <div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="../images/tsne.png" alt>
+<img style="width:100%" src="https://www.tensorflow.org/images/tsne.png" alt>
 </div>
 
 Et voila! As expected, words that are similar end up clustering nearby each
 other. For a more heavyweight implementation of word2vec that showcases more of
 the advanced features of TensorFlow, see the implementation in
-[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
 
 ## Evaluating Embeddings: Analogical Reasoning
 
@@ -357,7 +357,7 @@ Download the dataset for this task from
 
 To see how we do this evaluation, have a look at the `build_eval_graph()` and
 `eval()` functions in
-[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
 
 The choice of hyperparameters can strongly influence the accuracy on this task.
 To achieve state-of-the-art performance on this task requires training over a
@@ -385,13 +385,13 @@ your model is seriously bottlenecked on input data, you may want to implement a
 custom data reader for your problem, as described in
 @{$new_data_formats$New Data Formats}.  For the case of Skip-Gram
 modeling, we've actually already done this for you as an example in
-[tensorflow_models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
+[models/tutorials/embedding/word2vec.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec.py).
 
 If your model is no longer I/O bound but you want still more performance, you
 can take things further by writing your own TensorFlow Ops, as described in
 @{$adding_an_op$Adding a New Op}.  Again we've provided an
 example of this for the Skip-Gram case
-[tensorflow_models/tutorials/embedding/word2vec_optimized.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec_optimized.py).
+[models/tutorials/embedding/word2vec_optimized.py](https://www.tensorflow.org/code/tensorflow_models/tutorials/embedding/word2vec_optimized.py).
 Feel free to benchmark these against each other to measure performance
 improvements at each stage.
 
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 0414566b98470cc8b37b46f4bbf090278e13dbc4..270f654ed729891af29b020a046c6ff88c24cc1e 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -10,8 +10,8 @@ TensorFlow in mobile applications.
 
 Inference is done using the [TensorFlow Android Inference Interface](../../../tensorflow/contrib/android),
 which may be built separately if you want a standalone library to drop into your
-existing application. Object tracking and YUV -> RGB conversion is handled by
-libtensorflow_demo.so.
+existing application. Object tracking and efficient YUV -> RGB conversion are
+handled by `libtensorflow_demo.so`.
 
 A device running Android 5.0 (API 21) or higher is required to run the demo due
 to the use of the camera2 API, although the native libraries themselves can run
@@ -28,20 +28,22 @@ on API >= 14 devices.
         using Deep Neural Networks](https://arxiv.org/abs/1312.2249) to
         localize and track people in the camera preview in real-time.
 3. [TF Stylize](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java):
-        Uses a model based on [A Learned Representation For Artistic Style]
-        (https://arxiv.org/abs/1610.07629) to restyle the camera preview image
-        to that of a number of different artists.
+        Uses a model based on [A Learned Representation For Artistic
+        Style](https://arxiv.org/abs/1610.07629) to restyle the camera preview 
+        image to that of a number of different artists.
 
 <img src="sample_images/classify1.jpg" width="30%"><img src="sample_images/stylize1.jpg" width="30%"><img src="sample_images/detect1.jpg" width="30%">
 
-## Prebuilt APK:
+## Prebuilt Components:
 
 If you just want the fastest path to trying the demo, you may download the
 nightly build
 [here](https://ci.tensorflow.org/view/Nightly/job/nightly-android/). Expand the
 "View" and then the "out" folders under "Last Successful Artifacts" to find
-tensorflow_demo.apk. Also available are precompiled native libraries that you
-may drop into your own applications. See
+tensorflow_demo.apk.
+
+Also available are precompiled native libraries, and a jcenter package that you
+may simply drop into your own applications. See
 [tensorflow/contrib/android/README.md](../../../tensorflow/contrib/android/README.md)
 for more details.
 
@@ -54,7 +56,35 @@ While running the activities, pressing the volume keys on your device will
 toggle debug visualizations on/off, rendering additional info to the screen
 that may be useful for development purposes.
 
-## Building the Demo from Source
+## Building in Android Studio using the TensorFlow AAR from JCenter
+
+The simplest way to compile the demo app yourself, and try out changes to the
+project code is to use AndroidStudio. Simply set this `android` directory as the project root.
+
+Then edit the `build.gradle` file and change the value of `nativeBuildSystem`
+to `'none'` so that the project is built in the simplest way possible:
+
+```None
+def nativeBuildSystem = 'none'
+```
+
+While this project includes full build integration for TensorFlow, this setting
+disables it, and uses the TensorFlow Inference Interface package from JCenter.
+
+Note: Currently, in this build mode, YUV -> RGB is done using a less efficient
+Java implementation, and object tracking is not available in the "TF Detect"
+activity. Setting the build system to `'cmake'` currently only builds
+`libtensorflow_demo.so`, which provides fast YUV -> RGB conversion and object
+tracking, while still acquiring TensorFlow support via the downloaded AAR, so
+it may be a lightweight way to enable these features.
+
+For any project that does not include custom low level TensorFlow code, this is
+likely sufficient.
+
+For details on how to include this JCenter package in your own project see
+[tensorflow/contrib/android/README.md](../../../tensorflow/contrib/android/README.md)
+
+## Building the Demo with TensorFlow from Source
 
 Pick your preferred approach below. At the moment, we have full support for
 Bazel, and partial support for gradle, cmake, make, and Android Studio.
@@ -156,7 +186,7 @@ root to install the APK:
 adb install -r bazel-bin/tensorflow/examples/android/tensorflow_demo.apk
 ```
 
-### Android Studio
+### Android Studio with Bazel
 
 Android Studio may be used to build the demo in conjunction with Bazel. First,
 make sure that you can build with Bazel following the above directions. Then,
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 4f241027f4b6e80089be2bca179a8d4d4f565032..5a173b129be1e5d0598bd48848749ed1ab0446c3 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -7,20 +7,46 @@
 // libraries back into the appropriate directory.
 //
 // Alternatively, experimental support for Makefile builds is provided by
-// setting buildWithMake below to true. This will allow building the demo
+// setting nativeBuildSystem below to 'makefile'. This will allow building the demo
 // on Windows machines, but note that full equivalence with the Bazel
 // build is not yet guaranteed. See comments below for caveats and tips
 // for speeding up the build, such as as enabling ccache.
-
-// Set to true to build with make.
 // NOTE: Running a make build will cause subsequent Bazel builds to *fail*
 // unless the contrib/makefile/downloads/ and gen/ dirs are deleted afterwards.
-def buildWithMake = false
+
+// The cmake build only creates libtensorflow_demo.so. In this situation,
+// libtensorflow_inference.so will be acquired via the tensorflow.aar dependency.
+
+// It is necessary to customize Gradle's build directory, as otherwise
+// it will conflict with the BUILD file used by Bazel on case-insensitive OSs.
+project.buildDir = 'gradleBuild'
+getProject().setBuildDir('gradleBuild')
+
+buildscript {
+    repositories {
+        jcenter()
+    }
+
+    dependencies {
+        classpath 'com.android.tools.build:gradle:2.3.0'
+        classpath 'org.apache.httpcomponents:httpclient:4.5.2'
+    }
+}
+
+allprojects {
+    repositories {
+        jcenter()
+    }
+}
+
+// set to 'bazel', 'cmake', 'makefile', 'none'
+def nativeBuildSystem = 'bazel'
 
 // Controls output directory in APK and CPU type for Bazel builds.
 // NOTE: Does not affect the Makefile build target API (yet), which currently
 // assumes armeabi-v7a. If building with make, changing this will require
 // editing the Makefile as well.
+// The CMake build has only been tested with armeabi-v7a; others may not work.
 def cpuType = 'armeabi-v7a'
 
 // Output directory in the local directory for packaging into the APK.
@@ -30,62 +56,66 @@ def nativeOutDir = 'libs/' + cpuType
 def nativeBuildRule = 'buildNativeBazel'
 def demoLibPath = '../../../bazel-bin/tensorflow/examples/android/libtensorflow_demo.so'
 def inferenceLibPath = '../../../bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so'
-if (buildWithMake) {
+if (nativeBuildSystem == 'makefile') {
     nativeBuildRule = 'buildNativeMake'
     demoLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_demo.so'
     inferenceLibPath = '../../../tensorflow/contrib/makefile/gen/lib/libtensorflow_inference.so'
 }
 
-// Defines the NDK location for Makefile builds. Does *not* affect Bazel builds.
-// Override with your absolute NDK location if this fails to get the location
-// automatically.
-def makeNdkRoot = System.getenv('NDK_ROOT')
-
 // If building with Bazel, this is the location of the bazel binary.
 // NOTE: Bazel does not yet support building for Android on Windows,
 // so in this case the Makefile build must be used as described above.
 def bazelLocation = '/usr/local/bin/bazel'
 
-project.buildDir = 'gradleBuild'
-getProject().setBuildDir('gradleBuild')
-
 // import DownloadModels task
 project.ext.ASSET_DIR = projectDir.toString() + '/assets'
 project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
 
-buildscript {
-    repositories {
-        jcenter()
-    }
-
-    dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.0'
-    }
-}
-
 apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 23
     buildToolsVersion "25.0.2"
 
+    if (nativeBuildSystem == 'cmake') {
+        defaultConfig {
+            applicationId = 'com.tensorflow.demo'
+            minSdkVersion 21
+            targetSdkVersion 23
+            ndk {
+                abiFilters "${cpuType}"
+            }
+            externalNativeBuild {
+                cmake {
+                    arguments '-DANDROID_TOOLCHAIN=gcc', '-DANDROID_STL=gnustl_static'
+                }
+            }
+        }
+        externalNativeBuild {
+            cmake {
+                path './jni/CMakeLists.txt'
+            }
+        }
+    }
+
     lintOptions {
         abortOnError false
     }
 
     sourceSets {
         main {
-            // TensorFlow Java API sources.
-            java {
-                srcDir '../../java/src/main/java'
-                exclude '**/examples/**'
+            if (nativeBuildSystem == 'bazel' || nativeBuildSystem == 'makefile') {
+                // TensorFlow Java API sources.
+                java {
+                    srcDir '../../java/src/main/java'
+                    exclude '**/examples/**'
+                }
+
+                // Android TensorFlow wrappers, etc.
+                java {
+                    srcDir '../../contrib/android/java'
+                }
             }
-
-            // Android TensorFlow wrappers, etc.
-            java {
-                srcDir '../../contrib/android/java'
-            }
-
             // Android demo app sources.
             java {
                 srcDir 'src'
@@ -115,7 +145,7 @@ task buildNativeBazel(type: Exec) {
 }
 
 task buildNativeMake(type: Exec) {
-    environment "NDK_ROOT", makeNdkRoot
+    environment "NDK_ROOT", android.ndkDirectory
     // Tip: install ccache and uncomment the following to speed up
     // builds significantly.
     // environment "CC_PREFIX", 'ccache'
@@ -138,13 +168,14 @@ task copyNativeLibs(type: Copy) {
     fileMode 0644
 }
 
-
 tasks.whenTaskAdded { task ->
-    if (task.name == 'assembleDebug') {
-        task.dependsOn 'copyNativeLibs'
-    }
-    if (task.name == 'assembleRelease') {
-        task.dependsOn 'copyNativeLibs'
+    if (nativeBuildSystem == 'bazel' || nativeBuildSystem == 'makefile') {
+        if (task.name == 'assembleDebug') {
+            task.dependsOn 'copyNativeLibs'
+        }
+        if (task.name == 'assembleRelease') {
+            task.dependsOn 'copyNativeLibs'
+        }
     }
 }
 
@@ -152,3 +183,9 @@ tasks.whenTaskAdded { task ->
 // place them in the "assets" directory and comment out this line.
 apply from: "download-models.gradle"
 
+
+dependencies {
+    if (nativeBuildSystem == 'cmake' || nativeBuildSystem == 'none') {
+        compile 'org.tensorflow:tensorflow-android:1.2.0-preview'
+    }
+}
diff --git a/tensorflow/examples/android/jni/CMakeLists.txt b/tensorflow/examples/android/jni/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c8238464f579ea4ecaed6946a8554674325cd32f
--- /dev/null
+++ b/tensorflow/examples/android/jni/CMakeLists.txt
@@ -0,0 +1,51 @@
+#
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+project(TENSORFLOW_DEMO)
+cmake_minimum_required(VERSION 3.4.1)
+
+set(CMAKE_VERBOSE_MAKEFILE on)
+
+get_filename_component(TF_SRC_ROOT ${CMAKE_SOURCE_DIR}/../../../..  ABSOLUTE)
+get_filename_component(SAMPLE_SRC_DIR  ${CMAKE_SOURCE_DIR}/..  ABSOLUTE)
+
+if (ANDROID_ABI MATCHES "^armeabi-v7a$")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=softfp -mfpu=neon")
+elseif(ANDROID_ABI MATCHES "^arm64-v8a")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -ftree-vectorize")
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSTANDALONE_DEMO_LIB \
+                    -std=c++11 -fno-exceptions -fno-rtti -O2 -Wno-narrowing \
+                    -fPIE")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} \
+                              -Wl,--allow-multiple-definition \
+                              -Wl,--whole-archive -fPIE -v")
+
+file(GLOB_RECURSE tensorflow_demo_sources ${SAMPLE_SRC_DIR}/jni/*.*)
+add_library(tensorflow_demo SHARED
+            ${tensorflow_demo_sources})
+target_include_directories(tensorflow_demo PRIVATE
+                           ${TF_SRC_ROOT}
+                           ${CMAKE_SOURCE_DIR})
+
+target_link_libraries(tensorflow_demo
+                      android
+                      log
+                      jnigraphics
+                      m
+                      atomic
+                      z)
diff --git a/tensorflow/examples/android/jni/object_tracking/image_utils.h b/tensorflow/examples/android/jni/object_tracking/image_utils.h
index 2d712e77f91ddf73083d37f5f46ef2d3b0677451..ac9ffd90f8a167199bbcc777df74c11630a1ef41 100644
--- a/tensorflow/examples/android/jni/object_tracking/image_utils.h
+++ b/tensorflow/examples/android/jni/object_tracking/image_utils.h
@@ -67,7 +67,7 @@ inline static void MarkImage(const int x, const int y, const int radius,
     // reduce the number of iterations required as compared to starting from
     // either 0 and counting up or radius and counting down.
     for (int d_x = radius - d_y; d_x <= radius; ++d_x) {
-      // The first time this critera is met, we know the width of the circle at
+      // The first time this criteria is met, we know the width of the circle at
       // this row (without using sqrt).
       if (squared_y_dist + Square(d_x) >= squared_radius) {
         const int min_x = MAX(x - d_x, 0);
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
index b26a2316782dfbcde73c75556b99e624e836549d..bc391269255f64cb17bdc3f7ff65f801b0c60e67 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
@@ -194,13 +194,12 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
           yuvBytes[0],
           yuvBytes[1],
           yuvBytes[2],
-          rgbBytes,
           previewWidth,
           previewHeight,
           yRowStride,
           uvRowStride,
           uvPixelStride,
-          false);
+          rgbBytes);
 
       image.close();
     } catch (final Exception e) {
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
index 206a99f3e3d5bde2f09f6fc5d5ebded97f787f0a..5800f80651bdbd07b3a861299421501cf47b1716 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -124,7 +124,7 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
     borderedText = new BorderedText(textSizePx);
     borderedText.setTypeface(Typeface.MONOSPACE);
 
-    tracker = new MultiBoxTracker(getResources().getDisplayMetrics());
+    tracker = new MultiBoxTracker(this);
 
     if (USE_YOLO) {
       detector =
@@ -273,13 +273,12 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
           yuvBytes[0],
           yuvBytes[1],
           yuvBytes[2],
-          rgbBytes,
           previewWidth,
           previewHeight,
           yRowStride,
           uvRowStride,
           uvPixelStride,
-          false);
+          rgbBytes);
 
       image.close();
     } catch (final Exception e) {
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
index 7634be5c020d93225f29308b11358f5b84f8ee74..7afe2bf5412694c94a0e5b6d575e0a73e42dcb72 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java
@@ -65,10 +65,6 @@ import org.tensorflow.demo.R;
  * Artistic Style" (https://arxiv.org/abs/1610.07629)
  */
 public class StylizeActivity extends CameraActivity implements OnImageAvailableListener {
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
-
   private static final Logger LOGGER = new Logger();
 
   private static final String MODEL_FILE = "file:///android_asset/stylize_quantized.pb";
@@ -509,17 +505,17 @@ public class StylizeActivity extends CameraActivity implements OnImageAvailableL
       final int yRowStride = planes[0].getRowStride();
       final int uvRowStride = planes[1].getRowStride();
       final int uvPixelStride = planes[1].getPixelStride();
+
       ImageUtils.convertYUV420ToARGB8888(
           yuvBytes[0],
           yuvBytes[1],
           yuvBytes[2],
-          rgbBytes,
           previewWidth,
           previewHeight,
           yRowStride,
           uvRowStride,
           uvPixelStride,
-          false);
+          rgbBytes);
 
       image.close();
     } catch (final Exception e) {
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java
index f660178ebeb1993a86879c309c50acd60dc2b2a4..5756bd6b64f47018e53081c83fb5c62004f87474 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java
@@ -32,10 +32,6 @@ import org.tensorflow.contrib.android.TensorFlowInferenceInterface;
 
 /** A classifier specialized to label images using TensorFlow. */
 public class TensorFlowImageClassifier implements Classifier {
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
-
   private static final String TAG = "TensorFlowImageClassifier";
 
   // Only return this many results with at least this confidence.
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
index f3e7114335ff2367e3b1e4ae58073145710c8fea..1dcf9f55efe810345e1e8280dd8f22098c61a7b3 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowMultiBoxDetector.java
@@ -41,10 +41,6 @@ import org.tensorflow.demo.env.Logger;
 public class TensorFlowMultiBoxDetector implements Classifier {
   private static final Logger LOGGER = new Logger();
 
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
-
   // Only return this many results with at least this confidence.
   private static final int MAX_RESULTS = Integer.MAX_VALUE;
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowYoloDetector.java b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowYoloDetector.java
index 174723071da9979de6c7d8b004ffa64689af471b..b7e36a2379d264403f4894537ee4a810cbd3f78b 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowYoloDetector.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowYoloDetector.java
@@ -31,10 +31,6 @@ import org.tensorflow.demo.env.SplitTimer;
 public class TensorFlowYoloDetector implements Classifier {
   private static final Logger LOGGER = new Logger();
 
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
-
   // Only return this many results with at least this confidence.
   private static final int MAX_RESULTS = 5;
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
index db929e5e087545b0ea190f80e945376c41a4b37e..5f2ff9164cc7ad4055359e16fd5dfdd4a67786a2 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
@@ -27,6 +27,14 @@ import java.io.FileOutputStream;
 public class ImageUtils {
   @SuppressWarnings("unused")
   private static final Logger LOGGER = new Logger();
+  
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.w("Native library not found, native RGB -> YUV conversion may be unavailable.");
+    }
+  }
 
   /**
    * Utility method to compute the allocated size in bytes of a YUV420SP image
@@ -83,10 +91,84 @@ public class ImageUtils {
     }
   }
 
+  // This value is 2 ^ 18 - 1, and is used to clamp the RGB values before their ranges
+  // are normalized to eight bits.
+  static final int kMaxChannelValue = 262143;
+
+  // Always prefer the native implementation if available.
+  private static boolean useNativeConversion = true;
+
+  public static void convertYUV420ToARGB8888(
+      byte[] yData,
+      byte[] uData,
+      byte[] vData,
+      int width,
+      int height,
+      int yRowStride,
+      int uvRowStride,
+      int uvPixelStride,
+      int[] out) {
+    if (useNativeConversion) {
+      try {
+        convertYUV420ToARGB8888(
+            yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false);
+        return;
+      } catch (UnsatisfiedLinkError e) {
+        LOGGER.w("Native YUV -> RGB implementation not found, falling back to Java implementation");
+        useNativeConversion = false;
+      }
+    }
+
+    int i = 0;
+    for (int y = 0; y < height; y++) {
+      int pY = yRowStride * y;
+      int uv_row_start = uvRowStride * (y >> 1);
+      int pUV = uv_row_start;
+      int pV = uv_row_start;
+
+      for (int x = 0; x < width; x++) {
+        int uv_offset = pUV + (x >> 1) * uvPixelStride;
+        out[i++] =
+            YUV2RGB(
+                convertByteToInt(yData, pY + x),
+                convertByteToInt(uData, uv_offset),
+                convertByteToInt(vData, uv_offset));
+      }
+    }
+  }
+
+  private static int convertByteToInt(byte[] arr, int pos) {
+    return arr[pos] & 0xFF;
+  }
+
+  private static int YUV2RGB(int nY, int nU, int nV) {
+    nY -= 16;
+    nU -= 128;
+    nV -= 128;
+    if (nY < 0) nY = 0;
+
+    // This is the floating point equivalent. We do the conversion in integer
+    // because some Android devices do not have floating point in hardware.
+    // nR = (int)(1.164 * nY + 2.018 * nU);
+    // nG = (int)(1.164 * nY - 0.813 * nV - 0.391 * nU);
+    // nB = (int)(1.164 * nY + 1.596 * nV);
+
+    final int foo = 1192 * nY;
+    int nR = foo + 1634 * nV;
+    int nG = foo - 833 * nV - 400 * nU;
+    int nB = foo + 2066 * nU;
+
+    nR = Math.min(kMaxChannelValue, Math.max(0, nR));
+    nG = Math.min(kMaxChannelValue, Math.max(0, nG));
+    nB = Math.min(kMaxChannelValue, Math.max(0, nB));
+
+    return 0xff000000 | ((nR << 6) & 0x00ff0000) | ((nG >> 2) & 0x0000FF00) | ((nB >> 10) & 0xff);
+  }
+
   /**
-   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width
-   * and height. The input and output must already be allocated and non-null.
-   * For efficiency, no error checking is performed.
+   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width and height. The
+   * input and output must already be allocated and non-null. For efficiency, no error checking is
+   * performed.
    *
    * @param input The array of YUV 4:2:0 input data.
    * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
index 49c91d600da4df62a69bfb88ed0b9cb21584fb55..91d1f9feb184f2b145089ed8a410561842b93906 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -15,6 +15,7 @@ limitations under the License.
 
 package org.tensorflow.demo.tracking;
 
+import android.content.Context;
 import android.graphics.Canvas;
 import android.graphics.Color;
 import android.graphics.Matrix;
@@ -24,9 +25,9 @@ import android.graphics.Paint.Join;
 import android.graphics.Paint.Style;
 import android.graphics.RectF;
 import android.text.TextUtils;
-import android.util.DisplayMetrics;
 import android.util.Pair;
 import android.util.TypedValue;
+import android.widget.Toast;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Queue;
@@ -69,6 +70,7 @@ public class MultiBoxTracker {
 
   private static class TrackedRecognition {
     ObjectTracker.TrackedObject trackedObject;
+    RectF location;
     float detectionConfidence;
     int color;
     String title;
@@ -87,8 +89,10 @@ public class MultiBoxTracker {
   private int frameHeight;
 
   private int sensorOrientation;
+  private Context context;
 
-  public MultiBoxTracker(final DisplayMetrics metrics) {
+  public MultiBoxTracker(final Context context) {
+    this.context = context;
     for (final int color : COLORS) {
       availableColors.add(color);
     }
@@ -100,7 +104,9 @@ public class MultiBoxTracker {
     boxPaint.setStrokeJoin(Join.ROUND);
     boxPaint.setStrokeMiter(100);
 
-    textSizePx = TypedValue.applyDimension(TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, metrics);
+    textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, context.getResources().getDisplayMetrics());
     borderedText = new BorderedText(textSizePx);
   }
 
@@ -152,10 +158,6 @@ public class MultiBoxTracker {
   }
 
   public synchronized void draw(final Canvas canvas) {
-    if (objectTracker == null) {
-      return;
-    }
-
     // TODO(andrewharp): This may not work for non-90 deg rotations.
     final float multiplier =
         Math.min(canvas.getWidth() / (float) frameHeight, canvas.getHeight() / (float) frameWidth);
@@ -168,9 +170,11 @@ public class MultiBoxTracker {
             sensorOrientation,
             false);
     for (final TrackedRecognition recognition : trackedObjects) {
-      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
+      final RectF trackedPos =
+          (objectTracker != null)
+              ? recognition.trackedObject.getTrackedPositionInPreviewFrame()
+              : new RectF(recognition.location);
 
-      final RectF trackedPos = trackedObject.getTrackedPositionInPreviewFrame();
       getFrameToCanvasMatrix().mapRect(trackedPos);
       boxPaint.setColor(recognition.color);
 
@@ -185,6 +189,8 @@ public class MultiBoxTracker {
     }
   }
 
+  private boolean initialized = false;
+
   public synchronized void onFrame(
       final int w,
       final int h,
@@ -192,7 +198,7 @@ public class MultiBoxTracker {
       final int sensorOrienation,
       final byte[] frame,
       final long timestamp) {
-    if (objectTracker == null) {
+    if (objectTracker == null && !initialized) {
       ObjectTracker.clearInstance();
 
       logger.i("Initializing ObjectTracker: %dx%d", w, h);
@@ -200,6 +206,19 @@ public class MultiBoxTracker {
       frameWidth = w;
       frameHeight = h;
       this.sensorOrientation = sensorOrienation;
+      initialized = true;
+
+      if (objectTracker == null) {
+        String message =
+            "Object tracking support not found. "
+                + "See tensorflow/examples/android/README.md for details.";
+        Toast.makeText(context, message, Toast.LENGTH_LONG).show();
+        logger.e(message);
+      }
+    }
+
+    if (objectTracker == null) {
+      return;
     }
 
     objectTracker.nextFrame(frame, null, timestamp, null, true);
@@ -255,7 +274,20 @@ public class MultiBoxTracker {
     }
 
     if (objectTracker == null) {
-      logger.w("No ObjectTracker, can't track anything!");
+      trackedObjects.clear();
+      for (final Pair<Float, Recognition> potential : rectsToTrack) {
+        final TrackedRecognition trackedRecognition = new TrackedRecognition();
+        trackedRecognition.detectionConfidence = potential.first;
+        trackedRecognition.location = new RectF(potential.second.getLocation());
+        trackedRecognition.trackedObject = null;
+        trackedRecognition.title = potential.second.getTitle();
+        trackedRecognition.color = COLORS[trackedObjects.size()];
+        trackedObjects.add(trackedRecognition);
+
+        if (trackedObjects.size() >= COLORS.length) {
+          break;
+        }
+      }
       return;
     }
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
index 82de634baff6f9e80cb7aeb45ee98258953321f7..69f202b56816b5db1c3122471798970f32ddb98a 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
@@ -48,7 +48,18 @@ import org.tensorflow.demo.env.Size;
  * ObjectTracker still exists.
  */
 public class ObjectTracker {
-  private final Logger logger = new Logger();
+  private static final Logger LOGGER = new Logger();
+
+  private static boolean libraryFound = false;
+
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+      libraryFound = true;
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.e("libtensorflow_demo.so not found, tracking unavailable");
+    }
+  }
 
   private static final boolean DRAW_TEXT = false;
 
@@ -194,6 +205,13 @@ public class ObjectTracker {
 
   public static synchronized ObjectTracker getInstance(
       final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
+    if (!libraryFound) {
+      LOGGER.e(
+          "Native object tracking support not found. "
+              + "See tensorflow/examples/android/README.md for details.");
+      return null;
+    }
+
     if (instance == null) {
       instance = new ObjectTracker(frameWidth, frameHeight, rowStride, alwaysTrack);
       instance.init();
@@ -519,7 +537,7 @@ public class ObjectTracker {
       checkValidObject();
       synchronized (ObjectTracker.this) {
         if (lastExternalPositionTime > timestamp) {
-          logger.w("Tried to use older position time!");
+          LOGGER.w("Tried to use older position time!");
           return;
         }
         final RectF externalPosition = downscaleRect(position);
@@ -640,8 +658,4 @@ public class ObjectTracker {
 
   protected static native void downsampleImageNative(
       int width, int height, int rowStride, byte[] input, int factor, byte[] output);
-
-  static {
-    System.loadLibrary("tensorflow_demo");
-  }
 }
diff --git a/tensorflow/examples/benchmark/BUILD b/tensorflow/examples/benchmark/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c4bb0a5bd952ea175a4fd2444a3d632dc13445de
--- /dev/null
+++ b/tensorflow/examples/benchmark/BUILD
@@ -0,0 +1,31 @@
+# Description:
+# Examples of adding a benchmark to TensorFlow.
+
+load(
+    "//tensorflow/tools/test:performance.bzl",
+    "tf_py_logged_benchmark",
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_test(
+    name = "sample_benchmark",
+    srcs = ["sample_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+tf_py_logged_benchmark(
+    name = "sample_logged_benchmark",
+    target = "//tensorflow/examples/benchmark:sample_benchmark",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**/*"]),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/examples/benchmark/sample_benchmark.py b/tensorflow/examples/benchmark/sample_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e98d7a2b5f09c08f8796d982e218081ca248de58
--- /dev/null
+++ b/tensorflow/examples/benchmark/sample_benchmark.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sample TensorFlow benchmark."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import tensorflow as tf
+
+
+# Define a class that extends from tf.test.Benchmark.
+class SampleBenchmark(tf.test.Benchmark):
+
+  # Note: benchmark method name must start with `benchmark`.
+  def benchmarkSum(self):
+    with tf.Session() as sess:
+      x = tf.constant(10)
+      y = tf.constant(5)
+      result = tf.add(x, y)
+
+      iters = 100
+      start_time = time.time()
+      for _ in range(iters):
+        sess.run(result)
+      total_wall_time = time.time() - start_time
+
+      # Call report_benchmark to report a metric value.
+      self.report_benchmark(
+          name="sum_wall_time",
+          # This value should always be per iteration.
+          wall_time=total_wall_time/iters,
+          iters=iters)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/examples/how_tos/__init__.py b/tensorflow/examples/how_tos/__init__.py
index 878841c1840a68deddfedb90508c5d2150208bd3..2069def242083e4cfa344f06308b3a909e768764 100644
--- a/tensorflow/examples/how_tos/__init__.py
+++ b/tensorflow/examples/how_tos/__init__.py
@@ -1,3 +1,19 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Declaring how_tos a python package.
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py
index 8cd296d7520b7f60b7db0ca9179f04777bae9136..e29387ab9d01bcd6615cc8204cdb333ac85e10e9 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py
@@ -88,7 +88,8 @@ def run_training():
     saver = tf.train.Saver()
 
     # Create the op for initializing variables.
-    init_op = tf.global_variables_initializer()
+    init_op = tf.group(tf.global_variables_initializer(),
+                       tf.local_variables_initializer())
 
     # Create a session for running Ops on the Graph.
     sess = tf.Session()
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index a9e73def6a9ca0c86e18a452d4ebdaee82141061..a9ed02dd1a60ad79c2943212155bad864a750a99 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -17,7 +17,8 @@
 
 This version is like fully_connected_feed.py but uses data converted
 to a TFRecords file containing tf.train.Example protocol buffers.
-See tensorflow/g3doc/how_tos/reading_data.md#reading-from-files
+See:
+https://www.tensorflow.org/programmers_guide/reading_data#reading_from_files
 for context.
 
 YOU MUST run convert_to_records before running this (but you only need to
diff --git a/tensorflow/examples/image_retraining/BUILD b/tensorflow/examples/image_retraining/BUILD
index 5a885e33c25af4fd60ede267301b28179fcb24f7..394c413b33ef08902281f5207dd1e3b5bbad0367 100644
--- a/tensorflow/examples/image_retraining/BUILD
+++ b/tensorflow/examples/image_retraining/BUILD
@@ -24,13 +24,30 @@ py_binary(
     ],
 )
 
+py_binary(
+    name = "label_image",
+    srcs = [
+        "label_image.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 py_test(
     name = "retrain_test",
     size = "small",
     srcs = [
+        "label_image.py",
         "retrain.py",
         "retrain_test.py",
     ],
+    data = [
+        ":data/labels.txt",
+        "//tensorflow/examples/label_image:data/grace_hopper.jpg",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":retrain",
diff --git a/tensorflow/examples/image_retraining/data/labels.txt b/tensorflow/examples/image_retraining/data/labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bc1131ac4591ca1bdb840695b55f79a6feb95db3
--- /dev/null
+++ b/tensorflow/examples/image_retraining/data/labels.txt
@@ -0,0 +1,3 @@
+Runner-up
+Winner
+Loser
diff --git a/tensorflow/examples/image_retraining/label_image.py b/tensorflow/examples/image_retraining/label_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecfa672462b89456d04f194579d5daeabef709dd
--- /dev/null
+++ b/tensorflow/examples/image_retraining/label_image.py
@@ -0,0 +1,147 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Simple image classification with Inception.
+
+Run image classification with your model.
+
+This script is usually used with retrain.py found in this same
+directory.
+
+This program creates a graph from a saved GraphDef protocol buffer,
+and runs inference on an input JPEG image. You are required
+to pass in the graph file and the txt file.
+
+It outputs human readable strings of the top 5 predictions along with
+their probabilities.
+
+Change the --image_file argument to any jpg image to compute a
+classification of that image.
+
+Example usage:
+python label_image.py --graph=retrained_graph.pb
+  --labels=retrained_labels.txt
+  --image=flower_photos/daisy/54377391_15648e8d18.jpg
+
+NOTE: To learn to use this file and retrain.py, please see:
+
+https://codelabs.developers.google.com/codelabs/tensorflow-for-poets
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+
+import tensorflow as tf
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--image', required=True, type=str, help='Absolute path to image file.')
+parser.add_argument(
+    '--num_top_predictions',
+    type=int,
+    default=5,
+    help='Display this many predictions.')
+parser.add_argument(
+    '--graph',
+    required=True,
+    type=str,
+    help='Absolute path to graph file (.pb)')
+parser.add_argument(
+    '--labels',
+    required=True,
+    type=str,
+    help='Absolute path to labels file (.txt)')
+parser.add_argument(
+    '--output_layer',
+    type=str,
+    default='final_result:0',
+    help='Name of the result operation')
+parser.add_argument(
+    '--input_layer',
+    type=str,
+    default='DecodeJpeg/contents:0',
+    help='Name of the input operation')
+
+
+def load_image(filename):
+  """Read in the image_data to be classified."""
+  return tf.gfile.FastGFile(filename, 'rb').read()
+
+
+def load_labels(filename):
+  """Read in labels, one label per line."""
+  return [line.rstrip() for line in tf.gfile.GFile(filename)]
+
+
+def load_graph(filename):
+  """Unpersists graph from file as default graph."""
+  with tf.gfile.FastGFile(filename, 'rb') as f:
+    graph_def = tf.GraphDef()
+    graph_def.ParseFromString(f.read())
+    tf.import_graph_def(graph_def, name='')
+
+
+def run_graph(image_data, labels, input_layer_name, output_layer_name,
+              num_top_predictions):
+  with tf.Session() as sess:
+    # Feed the image_data as input to the graph.
+    #   predictions  will contain a two-dimensional array, where one
+    #   dimension represents the input image count, and the other has
+    #   predictions per class
+    softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name)
+    predictions, = sess.run(softmax_tensor, {input_layer_name: image_data})
+
+    # Sort to show labels in order of confidence
+    top_k = predictions.argsort()[-num_top_predictions:][::-1]
+    for node_id in top_k:
+      human_string = labels[node_id]
+      score = predictions[node_id]
+      print('%s (score = %.5f)' % (human_string, score))
+
+    return 0
+
+
+def main(argv):
+  """Runs inference on an image."""
+  if argv[1:]:
+    raise ValueError('Unused Command Line Args: %s' % argv[1:])
+
+  if not tf.gfile.Exists(FLAGS.image):
+    tf.logging.fatal('image file does not exist %s', FLAGS.image)
+
+  if not tf.gfile.Exists(FLAGS.labels):
+    tf.logging.fatal('labels file does not exist %s', FLAGS.labels)
+
+  if not tf.gfile.Exists(FLAGS.graph):
+    tf.logging.fatal('graph file does not exist %s', FLAGS.graph)
+
+  # load image
+  image_data = load_image(FLAGS.image)
+
+  # load labels
+  labels = load_labels(FLAGS.labels)
+
+  # load graph, which is stored in the default session
+  load_graph(FLAGS.graph)
+
+  run_graph(image_data, labels, FLAGS.input_layer, FLAGS.output_layer,
+            FLAGS.num_top_predictions)
+
+
+if __name__ == '__main__':
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=sys.argv[:1]+unparsed)
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index a3a4ba310e5f689906a1148f4df2e32d30bc8a8c..6c1b40b442b0bf877592146c1eda206586dd9e9f 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Simple transfer learning with an Inception v3 architecture model which
-displays summaries in TensorBoard.
+r"""Simple transfer learning with an Inception v3 architecture model.
+
+With support for TensorBoard.
 
 This example shows how to take a Inception v3 architecture model trained on
 ImageNet images, and train a new top layer that can recognize other classes of
@@ -39,9 +40,20 @@ The subfolder names are important, since they define what label is applied to
 each image, but the filenames themselves don't matter. Once your images are
 prepared, you can run the training with a command like this:
 
+
+```bash
 bazel build tensorflow/examples/image_retraining:retrain && \
 bazel-bin/tensorflow/examples/image_retraining/retrain \
---image_dir ~/flower_photos
+    --image_dir ~/flower_photos
+```
+
+Or, if you have a pip installation of tensorflow, `retrain.py` can be run
+without bazel:
+
+```bash
+python tensorflow/examples/image_retraining/retrain.py \
+    --image_dir ~/flower_photos
+```
 
 You can replace the image_dir argument with any folder containing subfolders of
 images. The label for each image is taken from the name of the subfolder it's
@@ -244,7 +256,7 @@ def create_inception_graph():
     Graph holding the trained Inception network, and various tensors we'll be
     manipulating.
   """
-  with tf.Session() as sess:
+  with tf.Graph().as_default() as graph:
     model_filename = os.path.join(
         FLAGS.model_dir, 'classify_image_graph_def.pb')
     with gfile.FastGFile(model_filename, 'rb') as f:
@@ -254,7 +266,7 @@ def create_inception_graph():
           tf.import_graph_def(graph_def, name='', return_elements=[
               BOTTLENECK_TENSOR_NAME, JPEG_DATA_TENSOR_NAME,
               RESIZED_INPUT_TENSOR_NAME]))
-  return sess.graph, bottleneck_tensor, jpeg_data_tensor, resized_input_tensor
+  return graph, bottleneck_tensor, jpeg_data_tensor, resized_input_tensor
 
 
 def run_bottleneck_on_image(sess, image_data, image_data_tensor,
@@ -315,7 +327,7 @@ def ensure_dir_exists(dir_name):
     os.makedirs(dir_name)
 
 
-def write_list_of_floats_to_file(list_of_floats , file_path):
+def write_list_of_floats_to_file(list_of_floats, file_path):
   """Writes a given list of floats to a binary file.
 
   Args:
@@ -346,18 +358,28 @@ def read_list_of_floats_from_file(file_path):
 
 bottleneck_path_2_bottleneck_values = {}
 
+
 def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
-                           image_dir, category, sess, jpeg_data_tensor, bottleneck_tensor):
+                           image_dir, category, sess, jpeg_data_tensor,
+                           bottleneck_tensor):
+  """Create a single bottleneck file."""
   print('Creating bottleneck at ' + bottleneck_path)
-  image_path = get_image_path(image_lists, label_name, index, image_dir, category)
+  image_path = get_image_path(image_lists, label_name, index,
+                              image_dir, category)
   if not gfile.Exists(image_path):
     tf.logging.fatal('File does not exist %s', image_path)
   image_data = gfile.FastGFile(image_path, 'rb').read()
-  bottleneck_values = run_bottleneck_on_image(sess, image_data, jpeg_data_tensor, bottleneck_tensor)
+  try:
+    bottleneck_values = run_bottleneck_on_image(
+        sess, image_data, jpeg_data_tensor, bottleneck_tensor)
+  except:
+    raise RuntimeError('Error during processing file %s' % image_path)
+
   bottleneck_string = ','.join(str(x) for x in bottleneck_values)
   with open(bottleneck_path, 'w') as bottleneck_file:
     bottleneck_file.write(bottleneck_string)
 
+
 def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
                              category, bottleneck_dir, jpeg_data_tensor,
                              bottleneck_tensor):
@@ -387,25 +409,32 @@ def get_or_create_bottleneck(sess, image_lists, label_name, index, image_dir,
   sub_dir = label_lists['dir']
   sub_dir_path = os.path.join(bottleneck_dir, sub_dir)
   ensure_dir_exists(sub_dir_path)
-  bottleneck_path = get_bottleneck_path(image_lists, label_name, index, bottleneck_dir, category)
+  bottleneck_path = get_bottleneck_path(image_lists, label_name, index,
+                                        bottleneck_dir, category)
   if not os.path.exists(bottleneck_path):
-    create_bottleneck_file(bottleneck_path, image_lists, label_name, index, image_dir, category, sess, jpeg_data_tensor, bottleneck_tensor)
+    create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
+                           image_dir, category, sess, jpeg_data_tensor,
+                           bottleneck_tensor)
   with open(bottleneck_path, 'r') as bottleneck_file:
     bottleneck_string = bottleneck_file.read()
   did_hit_error = False
   try:
     bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
-  except:
-    print("Invalid float found, recreating bottleneck")
+  except ValueError:
+    print('Invalid float found, recreating bottleneck')
     did_hit_error = True
   if did_hit_error:
-    create_bottleneck_file(bottleneck_path, image_lists, label_name, index, image_dir, category, sess, jpeg_data_tensor, bottleneck_tensor)
+    create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
+                           image_dir, category, sess, jpeg_data_tensor,
+                           bottleneck_tensor)
     with open(bottleneck_path, 'r') as bottleneck_file:
       bottleneck_string = bottleneck_file.read()
-    # Allow exceptions to propagate here, since they shouldn't happen after a fresh creation
+    # Allow exceptions to propagate here, since they shouldn't happen after a
+    # fresh creation
     bottleneck_values = [float(x) for x in bottleneck_string.split(',')]
   return bottleneck_values
 
+
 def cache_bottlenecks(sess, image_lists, image_dir, bottleneck_dir,
                       jpeg_data_tensor, bottleneck_tensor):
   """Ensures all the training, testing, and validation bottlenecks are cached.
@@ -718,7 +747,11 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor):
   layer_name = 'final_training_ops'
   with tf.name_scope(layer_name):
     with tf.name_scope('weights'):
-      layer_weights = tf.Variable(tf.truncated_normal([BOTTLENECK_TENSOR_SIZE, class_count], stddev=0.001), name='final_weights')
+      initial_value = tf.truncated_normal([BOTTLENECK_TENSOR_SIZE, class_count],
+                                          stddev=0.001)
+
+      layer_weights = tf.Variable(initial_value, name='final_weights')
+
       variable_summaries(layer_weights)
     with tf.name_scope('biases'):
       layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
@@ -738,8 +771,8 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor):
   tf.summary.scalar('cross_entropy', cross_entropy_mean)
 
   with tf.name_scope('train'):
-    train_step = tf.train.GradientDescentOptimizer(FLAGS.learning_rate).minimize(
-        cross_entropy_mean)
+    optimizer = tf.train.GradientDescentOptimizer(FLAGS.learning_rate)
+    train_step = optimizer.minimize(cross_entropy_mean)
 
   return (train_step, cross_entropy_mean, bottleneck_input, ground_truth_input,
           final_tensor)
@@ -794,115 +827,125 @@ def main(_):
   do_distort_images = should_distort_images(
       FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
       FLAGS.random_brightness)
-  sess = tf.Session()
 
-  if do_distort_images:
-    # We will be applying distortions, so setup the operations we'll need.
-    distorted_jpeg_data_tensor, distorted_image_tensor = add_input_distortions(
-        FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
-        FLAGS.random_brightness)
-  else:
-    # We'll make sure we've calculated the 'bottleneck' image summaries and
-    # cached them on disk.
-    cache_bottlenecks(sess, image_lists, FLAGS.image_dir, FLAGS.bottleneck_dir,
-                      jpeg_data_tensor, bottleneck_tensor)
-
-  # Add the new layer that we'll be training.
-  (train_step, cross_entropy, bottleneck_input, ground_truth_input,
-   final_tensor) = add_final_training_ops(len(image_lists.keys()),
-                                          FLAGS.final_tensor_name,
-                                          bottleneck_tensor)
-
-  # Create the operations we need to evaluate the accuracy of our new layer.
-  evaluation_step, prediction = add_evaluation_step(
-      final_tensor, ground_truth_input)
-
-  # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
-  merged = tf.summary.merge_all()
-  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
-                                       sess.graph)
-  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')
-
-  # Set up all our weights to their initial default values.
-  init = tf.global_variables_initializer()
-  sess.run(init)
-
-  # Run the training for as many cycles as requested on the command line.
-  for i in range(FLAGS.how_many_training_steps):
-    # Get a batch of input bottleneck values, either calculated fresh every time
-    # with distortions applied, or from the cache stored on disk.
+  with tf.Session(graph=graph) as sess:
+
     if do_distort_images:
-      train_bottlenecks, train_ground_truth = get_random_distorted_bottlenecks(
-          sess, image_lists, FLAGS.train_batch_size, 'training',
-          FLAGS.image_dir, distorted_jpeg_data_tensor,
-          distorted_image_tensor, resized_image_tensor, bottleneck_tensor)
+      # We will be applying distortions, so setup the operations we'll need.
+      (distorted_jpeg_data_tensor,
+       distorted_image_tensor) = add_input_distortions(
+           FLAGS.flip_left_right, FLAGS.random_crop,
+           FLAGS.random_scale, FLAGS.random_brightness)
     else:
-      train_bottlenecks, train_ground_truth, _ = get_random_cached_bottlenecks(
-          sess, image_lists, FLAGS.train_batch_size, 'training',
-          FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-          bottleneck_tensor)
-    # Feed the bottlenecks and ground truth into the graph, and run a training
-    # step. Capture training summaries for TensorBoard with the `merged` op.
-    train_summary, _ = sess.run([merged, train_step],
-             feed_dict={bottleneck_input: train_bottlenecks,
-                        ground_truth_input: train_ground_truth})
-    train_writer.add_summary(train_summary, i)
-
-    # Every so often, print out how well the graph is training.
-    is_last_step = (i + 1 == FLAGS.how_many_training_steps)
-    if (i % FLAGS.eval_step_interval) == 0 or is_last_step:
-      train_accuracy, cross_entropy_value = sess.run(
-          [evaluation_step, cross_entropy],
+      # We'll make sure we've calculated the 'bottleneck' image summaries and
+      # cached them on disk.
+      cache_bottlenecks(sess, image_lists, FLAGS.image_dir,
+                        FLAGS.bottleneck_dir, jpeg_data_tensor,
+                        bottleneck_tensor)
+
+    # Add the new layer that we'll be training.
+    (train_step, cross_entropy, bottleneck_input, ground_truth_input,
+     final_tensor) = add_final_training_ops(len(image_lists.keys()),
+                                            FLAGS.final_tensor_name,
+                                            bottleneck_tensor)
+
+    # Create the operations we need to evaluate the accuracy of our new layer.
+    evaluation_step, prediction = add_evaluation_step(
+        final_tensor, ground_truth_input)
+
+    # Merge all the summaries and write them out to the summaries_dir
+    merged = tf.summary.merge_all()
+    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
+                                         sess.graph)
+
+    validation_writer = tf.summary.FileWriter(
+        FLAGS.summaries_dir + '/validation')
+
+    # Set up all our weights to their initial default values.
+    init = tf.global_variables_initializer()
+    sess.run(init)
+
+    # Run the training for as many cycles as requested on the command line.
+    for i in range(FLAGS.how_many_training_steps):
+      # Get a batch of input bottleneck values, either calculated fresh every
+      # time with distortions applied, or from the cache stored on disk.
+      if do_distort_images:
+        (train_bottlenecks,
+         train_ground_truth) = get_random_distorted_bottlenecks(
+             sess, image_lists, FLAGS.train_batch_size, 'training',
+             FLAGS.image_dir, distorted_jpeg_data_tensor,
+             distorted_image_tensor, resized_image_tensor, bottleneck_tensor)
+      else:
+        (train_bottlenecks,
+         train_ground_truth, _) = get_random_cached_bottlenecks(
+             sess, image_lists, FLAGS.train_batch_size, 'training',
+             FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
+             bottleneck_tensor)
+      # Feed the bottlenecks and ground truth into the graph, and run a training
+      # step. Capture training summaries for TensorBoard with the `merged` op.
+
+      train_summary, _ = sess.run(
+          [merged, train_step],
           feed_dict={bottleneck_input: train_bottlenecks,
                      ground_truth_input: train_ground_truth})
-      print('%s: Step %d: Train accuracy = %.1f%%' % (datetime.now(), i,
-                                                      train_accuracy * 100))
-      print('%s: Step %d: Cross entropy = %f' % (datetime.now(), i,
-                                                 cross_entropy_value))
-      validation_bottlenecks, validation_ground_truth, _ = (
-          get_random_cached_bottlenecks(
-              sess, image_lists, FLAGS.validation_batch_size, 'validation',
-              FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
-              bottleneck_tensor))
-      # Run a validation step and capture training summaries for TensorBoard
-      # with the `merged` op.
-      validation_summary, validation_accuracy = sess.run(
-          [merged, evaluation_step],
-          feed_dict={bottleneck_input: validation_bottlenecks,
-                     ground_truth_input: validation_ground_truth})
-      validation_writer.add_summary(validation_summary, i)
-      print('%s: Step %d: Validation accuracy = %.1f%% (N=%d)' %
-            (datetime.now(), i, validation_accuracy * 100,
-             len(validation_bottlenecks)))
-
-  # We've completed all our training, so run a final test evaluation on
-  # some new images we haven't used before.
-  test_bottlenecks, test_ground_truth, test_filenames = (
-      get_random_cached_bottlenecks(sess, image_lists, FLAGS.test_batch_size,
-                                    'testing', FLAGS.bottleneck_dir,
-                                    FLAGS.image_dir, jpeg_data_tensor,
-                                    bottleneck_tensor))
-  test_accuracy, predictions = sess.run(
-      [evaluation_step, prediction],
-      feed_dict={bottleneck_input: test_bottlenecks,
-                 ground_truth_input: test_ground_truth})
-  print('Final test accuracy = %.1f%% (N=%d)' % (
-      test_accuracy * 100, len(test_bottlenecks)))
-
-  if FLAGS.print_misclassified_test_images:
-    print('=== MISCLASSIFIED TEST IMAGES ===')
-    for i, test_filename in enumerate(test_filenames):
-      if predictions[i] != test_ground_truth[i].argmax():
-        print('%70s  %s' % (test_filename,
-                            list(image_lists.keys())[predictions[i]]))
-
-  # Write out the trained graph and labels with the weights stored as constants.
-  output_graph_def = graph_util.convert_variables_to_constants(
-      sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
-  with gfile.FastGFile(FLAGS.output_graph, 'wb') as f:
-    f.write(output_graph_def.SerializeToString())
-  with gfile.FastGFile(FLAGS.output_labels, 'w') as f:
-    f.write('\n'.join(image_lists.keys()) + '\n')
+      train_writer.add_summary(train_summary, i)
+
+      # Every so often, print out how well the graph is training.
+      is_last_step = (i + 1 == FLAGS.how_many_training_steps)
+      if (i % FLAGS.eval_step_interval) == 0 or is_last_step:
+        train_accuracy, cross_entropy_value = sess.run(
+            [evaluation_step, cross_entropy],
+            feed_dict={bottleneck_input: train_bottlenecks,
+                       ground_truth_input: train_ground_truth})
+        print('%s: Step %d: Train accuracy = %.1f%%' % (datetime.now(), i,
+                                                        train_accuracy * 100))
+        print('%s: Step %d: Cross entropy = %f' % (datetime.now(), i,
+                                                   cross_entropy_value))
+        validation_bottlenecks, validation_ground_truth, _ = (
+            get_random_cached_bottlenecks(
+                sess, image_lists, FLAGS.validation_batch_size, 'validation',
+                FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
+                bottleneck_tensor))
+        # Run a validation step and capture training summaries for TensorBoard
+        # with the `merged` op.
+        validation_summary, validation_accuracy = sess.run(
+            [merged, evaluation_step],
+            feed_dict={bottleneck_input: validation_bottlenecks,
+                       ground_truth_input: validation_ground_truth})
+        validation_writer.add_summary(validation_summary, i)
+        print('%s: Step %d: Validation accuracy = %.1f%% (N=%d)' %
+              (datetime.now(), i, validation_accuracy * 100,
+               len(validation_bottlenecks)))
+
+    # We've completed all our training, so run a final test evaluation on
+    # some new images we haven't used before.
+    test_bottlenecks, test_ground_truth, test_filenames = (
+        get_random_cached_bottlenecks(sess, image_lists, FLAGS.test_batch_size,
+                                      'testing', FLAGS.bottleneck_dir,
+                                      FLAGS.image_dir, jpeg_data_tensor,
+                                      bottleneck_tensor))
+    test_accuracy, predictions = sess.run(
+        [evaluation_step, prediction],
+        feed_dict={bottleneck_input: test_bottlenecks,
+                   ground_truth_input: test_ground_truth})
+    print('Final test accuracy = %.1f%% (N=%d)' % (
+        test_accuracy * 100, len(test_bottlenecks)))
+
+    if FLAGS.print_misclassified_test_images:
+      print('=== MISCLASSIFIED TEST IMAGES ===')
+      for i, test_filename in enumerate(test_filenames):
+        if predictions[i] != test_ground_truth[i].argmax():
+          print('%70s  %s' % (test_filename,
+                              list(image_lists.keys())[predictions[i]]))
+
+    # Write out the trained graph and labels with the weights stored as
+    # constants.
+    output_graph_def = graph_util.convert_variables_to_constants(
+        sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
+    with gfile.FastGFile(FLAGS.output_graph, 'wb') as f:
+      f.write(output_graph_def.SerializeToString())
+    with gfile.FastGFile(FLAGS.output_labels, 'w') as f:
+      f.write('\n'.join(image_lists.keys()) + '\n')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
index 00ccea174f79a8fcac36771a373381d68f726b66..8af5cc71149c3ecb5f3f95dadaaacb64514525dc 100644
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ b/tensorflow/examples/image_retraining/retrain_test.py
@@ -19,7 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 import tensorflow as tf
+import os
 
+from tensorflow.examples.image_retraining import label_image
 from tensorflow.examples.image_retraining import retrain
 from tensorflow.python.framework import test_util
 
@@ -81,5 +83,35 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
       gt = tf.placeholder(tf.float32, [1], name='gt')
       self.assertIsNotNone(retrain.add_evaluation_step(final, gt))
 
+  def testLabelImage(self):
+
+    image_filename = ('../label_image/data/grace_hopper.jpg')
+
+    # Load some default data
+    label_path = os.path.join(tf.resource_loader.get_data_files_path(),
+                              'data/labels.txt')
+    labels = label_image.load_labels(label_path)
+    self.assertEqual(len(labels), 3)
+
+    image_path = os.path.join(tf.resource_loader.get_data_files_path(),
+                              image_filename)
+
+    image = label_image.load_image(image_path)
+    self.assertEqual(len(image), 61306)
+
+    # Create trivial graph; note that the two nodes don't meet
+    with tf.Graph().as_default():
+      jpeg = tf.constant(image)
+      # Input node that doesn't lead anywhere.
+      tf.image.decode_jpeg(jpeg, name='DecodeJpeg')
+
+      # Output node, that always outputs a constant.
+      tf.constant([[10, 30, 5]], name='final')
+
+      # As label_image outputs via print, we assume that
+      # if it returns, everything is OK.
+      result = label_image.run_graph(image, labels, jpeg, 'final:0', 3)
+      self.assertEqual(result, 0)
+
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD
index 021372fa7b853585608c3246cac6a8e449d7e6a5..d677e58ac323e1789f493bfc6aa9a33cf807612d 100644
--- a/tensorflow/examples/label_image/BUILD
+++ b/tensorflow/examples/label_image/BUILD
@@ -12,12 +12,32 @@ cc_binary(
     srcs = [
         "main.cc",
     ],
-    linkopts = ["-lm"],
-    deps = [
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:tensorflow",
-    ],
+    linkopts = select({
+        "//tensorflow:android": [
+            "-pie",
+            "-landroid",
+            "-ljnigraphics",
+            "-llog",
+            "-lm",
+            "-z defs",
+            "-s",
+            "-Wl,--exclude-libs,ALL",
+        ],
+        "//conditions:default": ["-lm"],
+    }),
+    deps = select({
+        "//tensorflow:android": [
+            # cc:cc_ops is used to include image ops (for label_image)
+            # Jpg, gif, and png related code won't be included
+            "//tensorflow/cc:cc_ops",
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+        "//conditions:default": [
+            "//tensorflow/cc:cc_ops",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:tensorflow",
+        ],
+    }),
 )
 
 filegroup(
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index 8e3f69a6d629cf42dfab9fda2263877b519dcf9b..a98c0817e30662b3848807472a63b50fb8d333fd 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -30,8 +30,12 @@ limitations under the License.
 // the top of the main() function.
 //
 // The googlenet_graph.pb file included by default is created from Inception.
+//
+// Note that, for GIF inputs, to reuse existing code, only single-frame ones
+// are supported.
 
 #include <fstream>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/cc/ops/const_op.h"
@@ -46,6 +50,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -62,7 +67,7 @@ using tensorflow::int32;
 // Takes a file name, and loads a list of labels from it, one per line, and
 // returns a vector of the strings. It pads with empty strings so the length
 // of the result is a multiple of 16, because our model expects that.
-Status ReadLabelsFile(string file_name, std::vector<string>* result,
+Status ReadLabelsFile(const string& file_name, std::vector<string>* result,
                       size_t* found_label_count) {
   std::ifstream file(file_name);
   if (!file) {
@@ -82,9 +87,32 @@ Status ReadLabelsFile(string file_name, std::vector<string>* result,
   return Status::OK();
 }
 
+static Status ReadEntireFile(tensorflow::Env* env, const string& filename,
+                             Tensor* output) {
+
+  tensorflow::uint64 file_size = 0;
+  TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));
+
+  string contents;
+  contents.resize(file_size);
+
+  std::unique_ptr<tensorflow::RandomAccessFile> file;
+  TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));
+
+  tensorflow::StringPiece data;
+  TF_RETURN_IF_ERROR(file->Read(0, file_size, &data, &(contents)[0]));
+  if (data.size() != file_size) {
+    return tensorflow::errors::DataLoss("Truncated read of '", filename,
+                                        "' expected ", file_size, " got ",
+                                        data.size());
+  }
+  output->scalar<string>()() = data.ToString();
+  return Status::OK();
+}
+
 // Given an image file name, read in the data, try to decode it as an image,
 // resize it to the requested size, and then scale the values as desired.
-Status ReadTensorFromImageFile(string file_name, const int input_height,
+Status ReadTensorFromImageFile(const string& file_name, const int input_height,
                                const int input_width, const float input_mean,
                                const float input_std,
                                std::vector<Tensor>* out_tensors) {
@@ -93,8 +121,20 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
 
   string input_name = "file_reader";
   string output_name = "normalized";
-  auto file_reader =
-      tensorflow::ops::ReadFile(root.WithOpName(input_name), file_name);
+
+  // read file_name into a tensor named input
+  Tensor input(tensorflow::DT_STRING, tensorflow::TensorShape());
+  TF_RETURN_IF_ERROR(ReadEntireFile(tensorflow::Env::Default(), file_name,
+                                    &input));
+
+  // use a placeholder to read input data
+  auto file_reader = Placeholder(root.WithOpName("input"),
+                                 tensorflow::DataType::DT_STRING);
+
+  std::vector<std::pair<string, tensorflow::Tensor>> inputs = {
+    {"input", input},
+  };
+
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
@@ -102,7 +142,12 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
   } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
-    image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
+    // gif decoder returns 4-D tensor, remove the first dim
+    image_reader =
+        Squeeze(root.WithOpName("squeeze_first_dim"),
+                DecodeGif(root.WithOpName("gif_reader"), file_reader));
+  } else if (tensorflow::StringPiece(file_name).ends_with(".bmp")) {
+    image_reader = DecodeBmp(root.WithOpName("bmp_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
     image_reader = DecodeJpeg(root.WithOpName("jpeg_reader"), file_reader,
@@ -132,13 +177,13 @@ Status ReadTensorFromImageFile(string file_name, const int input_height,
   std::unique_ptr<tensorflow::Session> session(
       tensorflow::NewSession(tensorflow::SessionOptions()));
   TF_RETURN_IF_ERROR(session->Create(graph));
-  TF_RETURN_IF_ERROR(session->Run({}, {output_name}, {}, out_tensors));
+  TF_RETURN_IF_ERROR(session->Run({inputs}, {output_name}, {}, out_tensors));
   return Status::OK();
 }
 
 // Reads a model graph definition from disk, and creates a session object you
 // can use to run it.
-Status LoadGraph(string graph_file_name,
+Status LoadGraph(const string& graph_file_name,
                  std::unique_ptr<tensorflow::Session>* session) {
   tensorflow::GraphDef graph_def;
   Status load_graph_status =
@@ -185,7 +230,7 @@ Status GetTopLabels(const std::vector<Tensor>& outputs, int how_many_labels,
 // Given the output of a model run, and the name of a file containing the labels
 // this prints out the top five highest-scoring values.
 Status PrintTopLabels(const std::vector<Tensor>& outputs,
-                      string labels_file_name) {
+                      const string& labels_file_name) {
   std::vector<string> labels;
   size_t label_count;
   Status read_labels_status =
@@ -307,11 +352,11 @@ int main(int argc, char* argv[]) {
   }
 
   // This is for automated testing to make sure we get the expected result with
-  // the default settings. We know that label 866 (military uniform) should be
+  // the default settings. We know that label 653 (military uniform) should be
   // the top label for the Admiral Hopper image.
   if (self_test) {
     bool expected_matches;
-    Status check_status = CheckTopLabel(outputs, 866, &expected_matches);
+    Status check_status = CheckTopLabel(outputs, 653, &expected_matches);
     if (!check_status.ok()) {
       LOG(ERROR) << "Running check failed: " << check_status;
       return -1;
diff --git a/tensorflow/examples/learn/iris_custom_decay_dnn.py b/tensorflow/examples/learn/iris_custom_decay_dnn.py
index 73c526cd4e6d5903fe951998a394388549c6a504..31acbd30cd33a5211350d1adea1f7cdbbf80e874 100644
--- a/tensorflow/examples/learn/iris_custom_decay_dnn.py
+++ b/tensorflow/examples/learn/iris_custom_decay_dnn.py
@@ -11,6 +11,8 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+"""Example of DNNClassifier for Iris plant dataset, with exponential decay."""
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/examples/learn/text_classification_character_cnn.py b/tensorflow/examples/learn/text_classification_character_cnn.py
index 0c96976146345a3717defd9f33fb8a2ec0ee9cc0..5ad53acf9f3f49ad3b217c73a642c6d0ca5d657a 100644
--- a/tensorflow/examples/learn/text_classification_character_cnn.py
+++ b/tensorflow/examples/learn/text_classification_character_cnn.py
@@ -11,7 +11,8 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-"""This is an example of using convolutional networks over characters for DBpedia dataset to predict class from description of an entity.
+"""This is an example of using convolutional networks over characters for
+   DBpedia dataset to predict class from description of an entity.
 
 This model is similar to one described in this paper:
    "Character-level Convolutional Networks for Text Classification"
@@ -54,7 +55,7 @@ def char_cnn_model(features, target):
     # Apply Convolution filtering on input sequence.
     conv1 = tf.contrib.layers.convolution2d(
         byte_list, N_FILTERS, FILTER_SHAPE1, padding='VALID')
-    # Add a RELU for non linearity.
+    # Add a ReLU for non linearity.
     conv1 = tf.nn.relu(conv1)
     # Max pooling across output of Convolution+Relu.
     pool1 = tf.nn.max_pool(
diff --git a/tensorflow/examples/multibox_detector/main.cc b/tensorflow/examples/multibox_detector/main.cc
index 0d6875671b1192ed8f09e3672440e83c7092937d..e38704fd98cea6928231f2fc2bc989705ae46bb4 100644
--- a/tensorflow/examples/multibox_detector/main.cc
+++ b/tensorflow/examples/multibox_detector/main.cc
@@ -159,7 +159,7 @@ Status SaveImage(const Tensor& tensor, const string& file_path) {
 
 // Reads a model graph definition from disk, and creates a session object you
 // can use to run it.
-Status LoadGraph(string graph_file_name,
+Status LoadGraph(const string& graph_file_name,
                  std::unique_ptr<tensorflow::Session>* session) {
   tensorflow::GraphDef graph_def;
   Status load_graph_status =
diff --git a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
index 4ff8e368c44c975b4fd6c363c48b0fe20406b064..186c14b4fd0f1ac8c874fb952089ef568a61f697 100644
--- a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
+++ b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb
@@ -120,7 +120,7 @@
    },
    "outputs": [],
    "source": [
-    "#!wget https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip && unzip inception5h.zip"
+    "!wget -nc https://storage.googleapis.com/download.tensorflow.org/models/inception5h.zip && unzip -n inception5h.zip"
    ]
   },
   {
diff --git a/tensorflow/examples/tutorials/estimators/abalone.py b/tensorflow/examples/tutorials/estimators/abalone.py
index 932ce8a8b25f0b82d61a2ec3e5ea0b980994e1e4..3c0ea2e409076671b282253d22f99516bfa99ffc 100644
--- a/tensorflow/examples/tutorials/estimators/abalone.py
+++ b/tensorflow/examples/tutorials/estimators/abalone.py
@@ -134,12 +134,22 @@ def main(unused_argv):
 
   # Instantiate Estimator
   nn = tf.contrib.learn.Estimator(model_fn=model_fn, params=model_params)
-
+  
+  def get_train_inputs():
+    x = tf.constant(training_set.data)
+    y = tf.constant(training_set.target)
+    return x, y
+  
   # Fit
-  nn.fit(x=training_set.data, y=training_set.target, steps=5000)
+  nn.fit(input_fn=get_train_inputs, steps=5000)
 
   # Score accuracy
-  ev = nn.evaluate(x=test_set.data, y=test_set.target, steps=1)
+  def get_test_inputs():
+    x = tf.constant(test_set.data)
+    y = tf.constant(test_set.target)
+    return x, y
+  
+  ev = nn.evaluate(input_fn=get_test_inputs, steps=1)
   print("Loss: %s" % ev["loss"])
   print("Root Mean Squared Error: %s" % ev["rmse"])
 
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index 75ea0b9c6759de513cc9d89cf0e60ed77fd20bbb..dc0d87031584ae0357db08b7bca9bdc1f1f3c08c 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """A simple MNIST classifier which displays summaries in TensorBoard.
 
- This is an unimpressive MNIST model, but it is a good example of using
+This is an unimpressive MNIST model, but it is a good example of using
 tf.name_scope to make a graph legible in the TensorBoard graph explorer, and of
 naming summary tags so that they are grouped meaningfully in TensorBoard.
 
@@ -78,7 +78,7 @@ def train():
   def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
     """Reusable code for making a simple neural net layer.
 
-    It does a matrix multiply, bias add, and then uses relu to nonlinearize.
+    It does a matrix multiply, bias add, and then uses ReLU to nonlinearize.
     It also sets up name scoping so that the resultant graph is easy to read,
     and adds a number of summary ops.
     """
@@ -135,7 +135,8 @@ def train():
       accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', accuracy)
 
-  # Merge all the summaries and write them out to /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
+  # Merge all the summaries and write them out to
+  # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
   merged = tf.summary.merge_all()
   train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
   test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
@@ -196,9 +197,15 @@ if __name__ == '__main__':
                       help='Initial learning rate')
   parser.add_argument('--dropout', type=float, default=0.9,
                       help='Keep probability for training dropout.')
-  parser.add_argument('--data_dir', type=str, default='/tmp/tensorflow/mnist/input_data',
-                      help='Directory for storing input data')
-  parser.add_argument('--log_dir', type=str, default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
-                      help='Summaries log directory')
+  parser.add_argument(
+      '--data_dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/input_data',
+      help='Directory for storing input data')
+  parser.add_argument(
+      '--log_dir',
+      type=str,
+      default='/tmp/tensorflow/mnist/logs/mnist_with_summaries',
+      help='Summaries log directory')
   FLAGS, unparsed = parser.parse_known_args()
   tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py
index 37ed7395e61f04f5fd5b3e58e3cec69da1c28602..850d105f7b1b33fadd40bc6a6cab3d08c0da3734 100644
--- a/tensorflow/examples/tutorials/monitors/iris_monitors.py
+++ b/tensorflow/examples/tutorials/monitors/iris_monitors.py
@@ -21,7 +21,6 @@ import os
 
 import numpy as np
 import tensorflow as tf
-from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 
 tf.logging.set_verbosity(tf.logging.INFO)
 
@@ -41,18 +40,15 @@ def main(unused_argv):
       "accuracy":
           tf.contrib.learn.MetricSpec(
               metric_fn=tf.contrib.metrics.streaming_accuracy,
-              prediction_key=
-              tf.contrib.learn.PredictionKey.CLASSES),
+              prediction_key="classes"),
       "precision":
           tf.contrib.learn.MetricSpec(
               metric_fn=tf.contrib.metrics.streaming_precision,
-              prediction_key=
-              tf.contrib.learn.PredictionKey.CLASSES),
+              prediction_key="classes"),
       "recall":
           tf.contrib.learn.MetricSpec(
               metric_fn=tf.contrib.metrics.streaming_recall,
-              prediction_key=
-              tf.contrib.learn.PredictionKey.CLASSES)
+              prediction_key="classes")
   }
   validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
       test_set.data,
@@ -66,26 +62,6 @@ def main(unused_argv):
   # Specify that all features have real-value data
   feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
 
-  validation_metrics = {
-      "accuracy": MetricSpec(
-                          metric_fn=tf.contrib.metrics.streaming_accuracy,
-                          prediction_key="classes"),
-      "recall": MetricSpec(
-                          metric_fn=tf.contrib.metrics.streaming_recall,
-                          prediction_key="classes"),
-      "precision": MetricSpec(
-                          metric_fn=tf.contrib.metrics.streaming_precision,
-                          prediction_key="classes")
-                        }
-  validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
-      test_set.data,
-      test_set.target,
-      every_n_steps=50,
-      metrics=validation_metrics,
-      early_stopping_metric="loss",
-      early_stopping_metric_minimize=True,
-      early_stopping_rounds=200)
-
   # Build 3 layer DNN with 10, 20, 10 units respectively.
   classifier = tf.contrib.learn.DNNClassifier(
       feature_columns=feature_columns,
diff --git a/tensorflow/examples/udacity/1_notmnist.ipynb b/tensorflow/examples/udacity/1_notmnist.ipynb
index 521cbf30006d5a6c8463cf2a3011cd014b7402e8..39674e1aa49ad70216b778444d2448d89f44d952 100644
--- a/tensorflow/examples/udacity/1_notmnist.ipynb
+++ b/tensorflow/examples/udacity/1_notmnist.ipynb
@@ -70,7 +70,7 @@
         "colab_type": "text"
       },
       "source": [
-        "First, we'll download the dataset to our local machine. The data consists of characters rendered in a variety of fonts on a 28x28 image. The labels are limited to 'A' through 'J' (10 classes). The training set has about 500k and the testset 19000 labelled examples. Given these sizes, it should be possible to train models quickly on any machine."
+        "First, we'll download the dataset to our local machine. The data consists of characters rendered in a variety of fonts on a 28x28 image. The labels are limited to 'A' through 'J' (10 classes). The training set has about 500k and the testset 19000 labeled examples. Given these sizes, it should be possible to train models quickly on any machine."
       ]
     },
     {
@@ -109,7 +109,7 @@
         "outputId": "0d0f85df-155f-4a89-8e7e-ee32df36ec8d"
       },
       "source": [
-        "url = 'http://commondatastorage.googleapis.com/books1000/'\n",
+        "url = 'https://commondatastorage.googleapis.com/books1000/'\n",
         "last_percent_reported = None\n",
         "data_root = '.' # Change me to store data elsewhere\n",
         "\n",
@@ -168,7 +168,7 @@
       },
       "source": [
         "Extract the dataset from the compressed .tar.gz file.\n",
-        "This should give you a set of directories, labelled A through J."
+        "This should give you a set of directories, labeled A through J."
       ]
     },
     {
diff --git a/tensorflow/examples/wav_to_spectrogram/BUILD b/tensorflow/examples/wav_to_spectrogram/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1e72324fb0564d179b02ce42a5e1803100b9b074
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/BUILD
@@ -0,0 +1,68 @@
+# Description:
+#   TensorFlow C++ inference example for labeling images.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+    features = [
+        "-layering_check",
+        "-parse_headers",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "wav_to_spectrogram_lib",
+    srcs = [
+        "wav_to_spectrogram.cc",
+    ],
+    hdrs = [
+        "wav_to_spectrogram.h",
+    ],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+cc_binary(
+    name = "wav_to_spectrogram",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        ":wav_to_spectrogram_lib",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+cc_test(
+    name = "wav_to_spectrogram_test",
+    size = "medium",
+    srcs = ["wav_to_spectrogram_test.cc"],
+    deps = [
+        ":wav_to_spectrogram_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "bin/**",
+            "gen/**",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/examples/wav_to_spectrogram/README.md b/tensorflow/examples/wav_to_spectrogram/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f7eb43700c187e0049f9b7e6911cd1ed8d05a5b
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/README.md
@@ -0,0 +1,49 @@
+# TensorFlow Spectrogram Example
+
+This example shows how you can load audio from a .wav file, convert it to a
+spectrogram, and then save it out as a PNG image. A spectrogram is a
+visualization of the frequencies in sound over time, and can be useful as a
+feature for neural network recognition on noise or speech.
+
+## Building
+
+To build it, run this command:
+
+```bash
+bazel build tensorflow/examples/wav_to_spectrogram/...
+```
+
+That should build a binary executable that you can then run like this:
+
+```bash
+bazel-bin/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram
+```
+
+This uses a default test audio file that's part of the TensorFlow source code,
+and writes out the image to the current directory as spectrogram.png.
+
+## Options
+
+To load your own audio, you need to supply a .wav file in LIN16 format, and use
+the `--input_audio` flag to pass in the path.
+
+To control how the spectrogram is created, you can specify the `--window_size`
+and `--stride` arguments, which control how wide the window used to estimate
+frequencies is, and how widely adjacent windows are spaced.
+
+The `--output_image` flag sets the path to save the image file to. This is
+always written out in PNG format, even if you specify a different file
+extension.
+
+If your result seems too dark, try using the `--brightness` flag to make the
+output image easier to see.
+
+Here's an example of how to use all of them together:
+
+```bash
+bazel-bin/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram \
+--input_wav=/tmp/my_audio.wav \
+--window=1024 \
+--stride=512 \
+--output_image=/tmp/my_spectrogram.png
+```
diff --git a/tensorflow/examples/wav_to_spectrogram/main.cc b/tensorflow/examples/wav_to_spectrogram/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..539e6c4fe4277936cceeb089799e2f3eb081f123
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/main.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+int main(int argc, char* argv[]) {
+  // These are the command-line flags the program can understand.
+  // They define where the graph and input data is located, and what kind of
+  // input the model expects. If you train your own model, or use something
+  // other than inception_v3, then you'll need to update these.
+  tensorflow::string input_wav =
+      "tensorflow/core/kernels/spectrogram_test_data/short_test_segment.wav";
+  tensorflow::int32 window_size = 256;
+  tensorflow::int32 stride = 128;
+  float brightness = 64.0f;
+  tensorflow::string output_image = "spectrogram.png";
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("input_wav", &input_wav, "audio file to load"),
+      tensorflow::Flag("window_size", &window_size,
+                       "frequency sample window width"),
+      tensorflow::Flag("stride", &stride,
+                       "how far apart to place frequency windows"),
+      tensorflow::Flag("brightness", &brightness,
+                       "controls how bright the output image is"),
+      tensorflow::Flag("output_image", &output_image,
+                       "where to save the spectrogram image to"),
+  };
+  tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    LOG(ERROR) << usage;
+    return -1;
+  }
+
+  // We need to call this to set up global state for TensorFlow.
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
+    return -1;
+  }
+
+  tensorflow::Status wav_status = WavToSpectrogram(
+      input_wav, window_size, stride, brightness, output_image);
+  if (!wav_status.ok()) {
+    LOG(ERROR) << "WavToSpectrogram failed with " << wav_status;
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c69a3596378f7cffb087e32461134b308f518792
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include <vector>
+
+#include "tensorflow/cc/ops/audio_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+using tensorflow::DT_FLOAT;
+using tensorflow::DT_UINT8;
+using tensorflow::Output;
+using tensorflow::TensorShape;
+
+// Runs a TensorFlow graph to convert an audio file into a visualization.
+tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
+                                    tensorflow::int32 window_size,
+                                    tensorflow::int32 stride, float brightness,
+                                    const tensorflow::string& output_image) {
+  auto root = tensorflow::Scope::NewRootScope();
+  using namespace tensorflow::ops;  // NOLINT(build/namespaces)
+  // The following block creates a TensorFlow graph that:
+  //  - Reads and decodes the audio file into a tensor of float samples.
+  //  - Creates a float spectrogram from those samples.
+  //  - Scales, clamps, and converts that spectrogram to 0 to 255 uint8's.
+  //  - Reshapes the tensor so that it's [height, width, 1] for imaging.
+  //  - Encodes it as a PNG stream and saves it out to a file.
+  Output file_reader = ReadFile(root.WithOpName("input_wav"), input_wav);
+  DecodeWav wav_decoder =
+      DecodeWav(root.WithOpName("wav_decoder"), file_reader);
+  Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
+                                        wav_decoder.audio, window_size, stride);
+  Output brightness_placeholder =
+      Placeholder(root.WithOpName("brightness_placeholder"), DT_FLOAT,
+                  Placeholder::Attrs().Shape(TensorShape({})));
+  Output mul = Mul(root.WithOpName("mul"), spectrogram, brightness_placeholder);
+  Output min_const = Const(root.WithOpName("min_const"), 255.0f);
+  Output min = Minimum(root.WithOpName("min"), mul, min_const);
+  Output cast = Cast(root.WithOpName("cast"), min, DT_UINT8);
+  Output expand_dims_const = Const(root.WithOpName("expand_dims_const"), -1);
+  Output expand_dims =
+      ExpandDims(root.WithOpName("expand_dims"), cast, expand_dims_const);
+  Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
+                           Squeeze::Attrs().SqueezeDims({0}));
+  Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
+  WriteFile file_writer =
+      WriteFile(root.WithOpName("output_image"), output_image, png_encoder);
+  tensorflow::GraphDef graph;
+  TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
+
+  // Build a session object from this graph definition. The power of TensorFlow
+  // is that you can reuse complex computations like this, so usually we'd run a
+  // lot of different inputs through it. In this example, we're just doing a
+  // one-off run, so we'll create it and then use it immediately.
+  std::unique_ptr<tensorflow::Session> session(
+      tensorflow::NewSession(tensorflow::SessionOptions()));
+  TF_RETURN_IF_ERROR(session->Create(graph));
+
+  // We're passing in the brightness as an input, so create a tensor to hold the
+  // value.
+  tensorflow::Tensor brightness_tensor(DT_FLOAT, TensorShape({}));
+  brightness_tensor.scalar<float>()() = brightness;
+
+  // Run the session to analyze the audio and write out the file.
+  TF_RETURN_IF_ERROR(
+      session->Run({{"brightness_placeholder", brightness_tensor}}, {},
+                   {"output_image"}, nullptr));
+  return tensorflow::Status::OK();
+}
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa8cb0abe951957e621703b7e2b9a6774200ac33
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+#define THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+// Runs a TensorFlow graph to convert an audio file into a visualization. Takes
+// in the path to the audio file, the window size and stride parameters
+// controlling the spectrogram creation, the brightness scaling to use, and a
+// path to save the output PNG file to.
+tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
+                                    tensorflow::int32 window_size,
+                                    tensorflow::int32 stride, float brightness,
+                                    const tensorflow::string& output_image);
+
+#endif  // THIRD_PARTY_TENSORFLOW_EXAMPLES_WAV_TO_SPECTROGRAM_WAV_TO_SPECTROGRAM_H_
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5997114454b7226c8d5d7b0871f9fe6b06d1a04
--- /dev/null
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram_test.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/wav/wav_io.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+TEST(WavToSpectrogramTest, WavToSpectrogramTest) {
+  const tensorflow::string input_wav =
+      tensorflow::io::JoinPath(tensorflow::testing::TmpDir(), "input_wav.wav");
+  const tensorflow::string output_image = tensorflow::io::JoinPath(
+      tensorflow::testing::TmpDir(), "output_image.png");
+  float audio[8] = {-1.0f, 0.0f, 1.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  tensorflow::string wav_string;
+  TF_ASSERT_OK(
+      tensorflow::wav::EncodeAudioAsS16LEWav(audio, 44100, 1, 8, &wav_string));
+  TF_ASSERT_OK(tensorflow::WriteStringToFile(tensorflow::Env::Default(),
+                                             input_wav, wav_string));
+  TF_ASSERT_OK(WavToSpectrogram(input_wav, 4, 4, 64.0f, output_image));
+  TF_EXPECT_OK(tensorflow::Env::Default()->FileExists(output_image));
+}
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index e32c21ca72056f1150aaa59ff5903d0054f7d14e..a1b4255292b0908fd5f022ce641967ba1b30f75c 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -9,24 +9,22 @@ Construct and execute TensorFlow graphs in Go.
 > (`github.com/tensorflow/tensorflow/tensorflow/go`).
 
 ## Quickstart
-
 1.  Download and extract the TensorFlow C library, preferably into `/usr/local`.
     GPU-enabled versions require CUDA 8.0 and cuDNN 5.1. For other versions, the
     TensorFlow C library will have to be built from source (see below).
 
     -   Linux:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-1.0.0.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.0.0.tar.gz)
+        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-linux-x86_64-1.1.0.tar.gz),
+        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-linux-x86_64-1.1.0.tar.gz)
     -   OS X
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-1.0.0.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-gpu-darwin-x86_64-1.0.0.tar.gz)
+        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-cpu-darwin-x86_64-1.1.0.tar.gz),
 
     The following shell snippet downloads and extracts into `/usr/local`:
 
     ```sh
     TF_TYPE="cpu" # Set to "gpu" for GPU support
     curl -L \
-      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.0.0.tar.gz" |
+      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.1.0.tar.gz" |
     sudo tar -C /usr/local -xz
     ```
 
@@ -41,20 +39,7 @@ Construct and execute TensorFlow graphs in Go.
 
 ### Installing into locations other than `/usr/local`
 
-The TensorFlow C library (`libtensorflow.so`) needs to be available at build
-time (e.g., `go build`) and run time (`go test` or executing binaries). If the
-library has not been extracted into `/usr/local`, then it needs to be made
-available through the `LIBRARY_PATH` environment variable at build time and the
-`LD_LIBRARY_PATH` environment variable (`DYLD_LIBRARY_PATH` on OS X) at run
-time.
-
-For example, if the TensorFlow C library was extracted into `/dir`, then:
-
-```sh
-export LIBRARY_PATH=/dir/lib
-export LD_LIBRARY_PATH=/dir/lib   # For Linux
-export DYLD_LIBRARY_PATH=/dir/lib # For OS X
-```
+Refer to [Installing TensorFlow for Go](https://www.tensorflow.org/install/install_go)
 
 ## Building the TensorFlow C library from source
 
diff --git a/tensorflow/go/example_inception_inference_test.go b/tensorflow/go/example_inception_inference_test.go
index 87056b85a275190a9a61c5c4d153613f60a9bce3..682bd245cc73c1cabad96fc7a4fa1dd7db7e2c57 100644
--- a/tensorflow/go/example_inception_inference_test.go
+++ b/tensorflow/go/example_inception_inference_test.go
@@ -28,8 +28,8 @@ import (
 	"os"
 	"path/filepath"
 
-	tf "github.com/tensorflow/tensorflow/tensorflow/go"
 	"github.com/tensorflow/tensorflow/tensorflow/go/op"
+	tf "github.com/tensorflow/tensorflow/tensorflow/go"
 )
 
 func Example() {
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index 7c9c5a4d6ea8f9f2af3b6dd8d64c9ab99800f5e7..dec08dee1ca4f2d85f9bac834323889adad178d3 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -158,12 +158,12 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 `))
 
 	tmplOp = template.Must(template.New("op").Funcs(template.FuncMap{
-		"MakeComment": makeComment,
-		"GoType":      goType,
-		"CamelCase":   camelCase,
-		"Identifier":  identifier,
-		"IsListArg":   isListArg,
-		"IsListAttr":  isListAttr,
+		"MakeComment":       makeComment,
+		"GoType":            goType,
+		"CamelCase":         camelCase,
+		"Identifier":        identifier,
+		"IsListArg":         isListArg,
+		"IsListAttr":        isListAttr,
 		"StripLeadingColon": stripLeadingColon,
 	}).Parse(`
 {{if .OptionalAttrs -}}
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index e65619e80b54a7285b5e1cecafc55cfbe8a72117..46c600eab17c6c467d0b3a3312f848541f382e80 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -185,11 +185,11 @@ func (g *Graph) AddOperation(args OpSpec) (*Operation, error) {
 			return nil, fmt.Errorf("%v (memory will be leaked)", err)
 		}
 	}
-	op := &Operation{
-		c: C.TF_FinishOperation(cdesc, status.c),
-		g: g,
+	c := C.TF_FinishOperation(cdesc, status.c)
+	if err := status.Err(); err != nil {
+		return nil, err
 	}
-	return op, status.Err()
+	return &Operation{c, g}, nil
 }
 
 func setAttr(cdesc *C.TF_OperationDescription, status *status, name string, value interface{}) error {
diff --git a/tensorflow/go/lib.go b/tensorflow/go/lib.go
index 551cfa0b019a6f864555b3a0473be41d6ade3abe..2800eded60b75ecf3bcf09312f4a8bedbcbbae92 100644
--- a/tensorflow/go/lib.go
+++ b/tensorflow/go/lib.go
@@ -18,14 +18,4 @@ package tensorflow
 
 // #cgo LDFLAGS: -ltensorflow
 // #cgo CFLAGS: -I${SRCDIR}/../../
-//
-// // TODO(ashankar): Remove this after TensorFlow 1.1 has been released.
-// // Till then, the TensorFlow C API binary releases do not contain
-// // the TF_DeletePRunHandle symbol. We work around that by
-// // implementing the equivalent in session.cpp
-// extern void tfDeletePRunHandle(const char*);
 import "C"
-
-func deletePRunHandle(h *C.char) {
-	C.tfDeletePRunHandle(h)
-}
diff --git a/tensorflow/go/op/op_test.go b/tensorflow/go/op/op_test.go
index 65877dca96bc1c38ca70116867195450cf72e763..2451ba360699a7ac24f64209339e7b4f92ffb548 100644
--- a/tensorflow/go/op/op_test.go
+++ b/tensorflow/go/op/op_test.go
@@ -19,6 +19,7 @@ limitations under the License.
 package op
 
 import (
+	"strings"
 	"testing"
 
 	tf "github.com/tensorflow/tensorflow/tensorflow/go"
@@ -33,3 +34,27 @@ func TestPlaceholder(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestAddOperationFailure(t *testing.T) {
+	// Inspired from https://github.com/tensorflow/tensorflow/issues/9931
+	s := NewScope()
+
+	resize := ResizeArea(s, Placeholder(s, tf.Float), Const(s, []int64{80, 80}))
+	if err := s.Err(); err == nil {
+		t.Fatal("ResizeArea expects an int32 Tensor for size, should fail when an int64 is provided")
+	}
+	// And any use of resize should panic with an error message more informative than SIGSEGV
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+		s, ok := r.(string)
+		if ok && strings.Contains(s, "see Scope.Err() for details") {
+			return
+		}
+		t.Errorf("Expected panic string to Scope.Err(), found %T: %q", r, r)
+	}()
+	_ = resize.Shape()
+	t.Errorf("resize.Shape() should have paniced since the underlying Operation was not created")
+}
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index de014abafa92704ea88c221e5ba52cf7f6ede45f..f508b63b138682c556cc5f1fc87c03075079c09e 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -57,7 +57,7 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 // Requires `updates.shape = indices.shape + ref.shape[1:]`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/ScatterAdd.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
 // </div>
 //
 // Arguments:
@@ -195,6 +195,19 @@ func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...Va
 	return op.Output(0)
 }
 
+// FakeQuantWithMinMaxVarsPerChannelGradientAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannelGradient.
+type FakeQuantWithMinMaxVarsPerChannelGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsPerChannelGradientNumBits sets the optional num_bits attribute to value.
+//
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsPerChannelGradientNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
 // Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation.
 //
 // Arguments:
@@ -211,20 +224,36 @@ func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...Va
 //   `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter, shape `[d]`:
 // `sum_per_d(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter, shape `[d]`:
 // `sum_per_d(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "FakeQuantWithMinMaxVarsPerChannelGradient",
 		Input: []tf.Input{
 			gradients, inputs, min, max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
 // Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
 // and `max` to 'outputs' tensor of same shape as `inputs`.
@@ -232,17 +261,23 @@ func FakeQuantWithMinMaxVarsPerChannelGradient(scope *Scope, gradients tf.Output
 // [min; max] is the clamping range for the 'inputs' data.  Op divides this range
 // into 255 steps (total of 256 values), then replaces each 'inputs' value with the
 // closest of the quantized step values.
+// 'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
 // This operation has a gradient and thus allows for training `min` and `max` values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output) (outputs tf.Output) {
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
 			inputs, min, max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -330,33 +365,6 @@ func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Concatenates quantized tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
 type QuantizeAndDequantizeAttr func(optionalAttr)
 
@@ -653,14 +661,14 @@ func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides
 //
 // For example, given this input of shape `[1, 1, 1, 4]`, and a block size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3, 4]]]]
 //
 // ```
 //
 // This operation will output a tensor of shape `[1, 2, 2, 1]`:
 //
-// ```prettyprint
+// ```
 //    [[[[1], [2]],
 //      [[3], [4]]]]
 // ```
@@ -672,14 +680,14 @@ func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides
 //
 // For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 // ```
 //
 // This operation, for block size of 2, will return the following tensor of shape
 // `[1, 2, 2, 3]`
 //
-// ```prettyprint
+// ```
 //    [[[[1, 2, 3], [4, 5, 6]],
 //      [[7, 8, 9], [10, 11, 12]]]]
 //
@@ -687,7 +695,7 @@ func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides
 //
 // Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
 //
-// ```prettyprint
+// ```
 // x =  [[[[1, 2, 3, 4],
 //        [5, 6, 7, 8]],
 //       [[9, 10, 11, 12],
@@ -696,7 +704,7 @@ func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides
 //
 // the operator will return the following tensor of shape `[1 4 4 1]`:
 //
-// ```prettyprint
+// ```
 // x = [[ [1],   [2],  [5],  [6]],
 //      [ [3],   [4],  [7],  [8]],
 //      [ [9],  [10], [13],  [14]],
@@ -784,26 +792,26 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 // (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
 //     `crops = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 // ```
 //
 // The output tensor has shape `[1, 2, 2, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [2]], [[3], [4]]]]
 // ```
 //
 // (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
 //     `crops = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 // ```
 //
 // The output tensor has shape `[1, 2, 2, 3]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
@@ -811,7 +819,7 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 // (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
 //     `crops = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [3]], [[9], [11]]],
 //      [[[2], [4]], [[10], [12]]],
 //      [[[5], [7]], [[13], [15]]],
@@ -820,7 +828,7 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //
 // The output tensor has shape `[1, 4, 4, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[1],   [2],  [3],  [4]],
 //      [[5],   [6],  [7],  [8]],
 //      [[9],  [10], [11],  [12]],
@@ -830,7 +838,7 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 // (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
 //     `crops = [[0, 0], [2, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
 //      [[[0], [2], [4]]], [[[0], [10], [12]]],
 //      [[[0], [5], [7]]], [[[0], [13], [15]]],
@@ -839,7 +847,7 @@ func DepthToSpace(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //
 // The output tensor has shape `[2, 2, 4, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]]],
 //      [[[9],  [10], [11],  [12]],
@@ -897,32 +905,32 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops
 //
 // (1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [2]], [[3], [4]]]]
 // ```
 //
 // The output tensor has shape `[4, 1, 1, 1]` and value:
 //
-// ```prettyprint
+// ```
 // [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 // ```
 //
 // (2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
 //
 // The output tensor has shape `[4, 1, 1, 3]` and value:
 //
-// ```prettyprint
+// ```
 // [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 // ```
 //
 // (3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]],
 //       [[9],  [10], [11],  [12]],
@@ -931,7 +939,7 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops
 //
 // The output tensor has shape `[4, 2, 2, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [3]], [[9], [11]]],
 //      [[[2], [4]], [[10], [12]]],
 //      [[[5], [7]], [[13], [15]]],
@@ -940,7 +948,7 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops
 //
 // (4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]]],
 //      [[[9],  [10], [11],  [12]],
@@ -949,7 +957,7 @@ func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops
 //
 // The output tensor has shape `[8, 1, 2, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
 //      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
 // ```
@@ -1142,34 +1150,34 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output,
 // (1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
 //     `paddings = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [2]], [[3], [4]]]]
 // ```
 //
 // The output tensor has shape `[4, 1, 1, 1]` and value:
 //
-// ```prettyprint
+// ```
 // [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 // ```
 //
 // (2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
 //     `paddings = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
 //
 // The output tensor has shape `[4, 1, 1, 3]` and value:
 //
-// ```prettyprint
+// ```
 // [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 // ```
 //
 // (3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
 //     `paddings = [[0, 0], [0, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]],
 //       [[9],  [10], [11],  [12]],
@@ -1178,7 +1186,7 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output,
 //
 // The output tensor has shape `[4, 2, 2, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [3]], [[9], [11]]],
 //      [[[2], [4]], [[10], [12]]],
 //      [[[5], [7]], [[13], [15]]],
@@ -1188,7 +1196,7 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output,
 // (4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
 //     paddings = `[[0, 0], [2, 0]]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [3],  [4]],
 //       [[5],   [6],  [7],  [8]]],
 //      [[[9],  [10], [11],  [12]],
@@ -1197,7 +1205,7 @@ func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output,
 //
 // The output tensor has shape `[8, 1, 3, 1]` and value:
 //
-// ```prettyprint
+// ```
 // x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
 //      [[[0], [2], [4]]], [[[0], [10], [12]]],
 //      [[[0], [5], [7]]], [[[0], [13], [15]]],
@@ -1220,65 +1228,6 @@ func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddin
 	return op.Output(0)
 }
 
-// ListDiffAttr is an optional argument to ListDiff.
-type ListDiffAttr func(optionalAttr)
-
-// ListDiffOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Computes the difference between two lists of numbers or strings.
-//
-// Given a list `x` and a list `y`, this operation returns a list `out` that
-// represents all values that are in `x` but not in `y`. The returned list `out`
-// is sorted in the same order that the numbers appear in `x` (duplicates are
-// preserved). This operation also returns a list `idx` that represents the
-// position of each `out` element in `x`. In other words:
-//
-// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-//
-// For example, given this input:
-//
-// ```prettyprint
-// x = [1, 2, 3, 4, 5, 6]
-// y = [1, 3, 5]
-// ```
-//
-// This operation would return:
-//
-// ```prettyprint
-// out ==> [2, 4, 6]
-// idx ==> [1, 3, 5]
-// ```
-//
-// Arguments:
-//	x: 1-D. Values to keep.
-//	y: 1-D. Values to remove.
-//
-// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
-func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ListDiff",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // SqueezeAttr is an optional argument to Squeeze.
 type SqueezeAttr func(optionalAttr)
 
@@ -1304,14 +1253,14 @@ func SqueezeSqueezeDims(value []int64) SqueezeAttr {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
 // shape(squeeze(t)) ==> [2, 3]
 // ```
 //
 // Or, to remove specific size 1 dimensions:
 //
-// ```prettyprint
+// ```
 // # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
 // shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
 // ```
@@ -1342,6 +1291,8 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 
 // A placeholder op for a value that will be fed into the computation.
 //
+// DEPRECATED at GraphDef version 23: Placeholder now behaves the same as PlaceholderV2.
+//
 // N.B. This operation will fail with an error if it is executed. It is
 // intended as a way to represent a value that will always be fed, and to
 // provide attrs that enable the fed value to be checked at runtime.
@@ -1373,7 +1324,7 @@ type PlaceholderAttr func(optionalAttr)
 //
 // value: (Optional) The shape of the tensor. If the shape has 0 dimensions, the
 // shape is unconstrained.
-// If not specified, defaults to <>
+// If not specified, defaults to <unknown_rank:true >
 func PlaceholderShape(value tf.Shape) PlaceholderAttr {
 	return func(m optionalAttr) {
 		m["shape"] = value
@@ -1424,7 +1375,7 @@ func Placeholder(scope *Scope, dtype tf.DataType, optional ...PlaceholderAttr) (
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 't' is [[1, 2, 3], [4, 5, 6]].
 // # 'paddings' is [[1, 1]], [2, 2]].
 // # 'mode' is SYMMETRIC.
@@ -1508,7 +1459,7 @@ func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'input' tensor is [[True, False]
 // #                    [True, False]]
 // # 'input' has two true values, so output has two coordinates.
@@ -1710,101 +1661,25 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t
 	return output
 }
 
-// Reshapes a tensor.
-//
-// Given `tensor`, this operation returns a tensor that has the same values
-// as `tensor` with shape `shape`.
-//
-// If one component of `shape` is the special value -1, the size of that dimension
-// is computed so that the total size remains constant.  In particular, a `shape`
-// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
-//
-// If `shape` is 1-D or higher, then the operation returns a tensor with shape
-// `shape` filled with the values of `tensor`. In this case, the number of elements
-// implied by `shape` must be the same as the number of elements in `tensor`.
-//
-// For example:
-//
-// ```prettyprint
-// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-// # tensor 't' has shape [9]
-// reshape(t, [3, 3]) ==> [[1, 2, 3],
-//                         [4, 5, 6],
-//                         [7, 8, 9]]
-//
-// # tensor 't' is [[[1, 1], [2, 2]],
-// #                [[3, 3], [4, 4]]]
-// # tensor 't' has shape [2, 2, 2]
-// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-//                         [3, 3, 4, 4]]
-//
-// # tensor 't' is [[[1, 1, 1],
-// #                 [2, 2, 2]],
-// #                [[3, 3, 3],
-// #                 [4, 4, 4]],
-// #                [[5, 5, 5],
-// #                 [6, 6, 6]]]
-// # tensor 't' has shape [3, 2, 3]
-// # pass '[-1]' to flatten 't'
-// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
-//
-// # -1 can also be used to infer the shape
-//
-// # -1 is inferred to be 9:
-// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 2:
-// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 3:
-// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-//                               [2, 2, 2],
-//                               [3, 3, 3]],
-//                              [[4, 4, 4],
-//                               [5, 5, 5],
-//                               [6, 6, 6]]]
+// Checks a tensor for NaN and Inf values.
 //
-// # tensor 't' is [7]
-// # shape `[]` reshapes to a scalar
-// reshape(t, []) ==> 7
-// ```
+// When run, reports an `InvalidArgument` error if `tensor` has any values
+// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
 //
 // Arguments:
 //
-//	shape: Defines the shape of the output tensor.
-func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
+//	message: Prefix of the error message.
+func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"message": message}
 	opspec := tf.OpSpec{
-		Type: "Reshape",
+		Type: "CheckNumerics",
 		Input: []tf.Input{
-			tensor, shape,
+			tensor,
 		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Checks a tensor for NaN and Inf values.
-//
-// When run, reports an `InvalidArgument` error if `tensor` has any values
-// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
-//
-// Arguments:
-//
-//	message: Prefix of the error message.
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"message": message}
-	opspec := tf.OpSpec{
-		Type: "CheckNumerics",
-		Input: []tf.Input{
-			tensor,
-		},
-		Attrs: attrs,
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -1909,20 +1784,17 @@ func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 
 // Gather values or slices from `params` according to `indices`.
 //
-// `params` is a Tensor of rank `P` and `indices` is a Tensor of rank `Q`.
+// `indices` is an integer tensor containing indices into `params`.  The last
+// dimension of `indices` can be at most the rank of `params`:
 //
-// `indices` must be integer tensor, containing indices into `params`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//     indices.shape[-1] <= params.rank
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `params`.
+// The last dimension of `indices` corresponds to elements
+// (if `indices.shape[-1] = params.rank`) or slices
+// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+// of `params`.  The output tensor has shape
 //
-// Produces an output tensor with shape
-//
-// ```
-// [d_0, ..., d_{Q-2}, params.shape[K], ..., params.shape[P-1]].
-// ```
+//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
 //
 // Some examples below.
 //
@@ -2002,11 +1874,11 @@ func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 // ```
 //
 // Arguments:
-//	params: `P-D`.  The tensor from which to gather values.
-//	indices: `Q-D`.  Index tensor having shape `[d_0, ..., d_{Q-2}, K]`.
+//	params: The tensor from which to gather values.
+//	indices: Index tensor.
 //
-// Returns `(P+Q-K-1)-D`.  Values from `params` gathered from indices given by
-// `indices`.
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
 func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -2153,7 +2025,7 @@ func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'input' is [[1, 0, 0, 0]
 //               [0, 2, 0, 0]
 //               [0, 0, 3, 0]
@@ -2349,21 +2221,21 @@ func Split(scope *Scope, split_dim tf.Output, value tf.Output, num_split int64)
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'x' is [2, 2, 7]
 // # 'y' is [2, 3, 7]
 // # 'z' is [2, 5, 7]
 // concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
 // ```
 //
+// This is typically used by gradient computations for a concat operation.
+//
 // Arguments:
 //	concat_dim: The dimension along which to concatenate.
 //	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
 // Returns The `N` int32 vectors representing the starting offset
-//         of input tensors within the concatenated output.
-//
-// This is typically used by gradient computations for a concat operation.
+// of input tensors within the concatenated output.
 func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -2418,7 +2290,7 @@ func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.O
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'x' is [[1, 4]]
 // # 'y' is [[2, 5]]
 // # 'z' is [[3, 6]]
@@ -2454,6 +2326,83 @@ func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf
 	return op.Output(0)
 }
 
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
+
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["upper_frequency_limit"] = value
+	}
+}
+
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+//
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["lower_frequency_limit"] = value
+	}
+}
+
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+//
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["filterbank_channel_count"] = value
+	}
+}
+
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+//
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
+	}
+}
+
+// Transforms a spectrogram into a form that's useful for speech recognition.
+//
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
+//
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Mfcc",
+		Input: []tf.Input{
+			spectrogram, sample_rate,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // UniqueAttr is an optional argument to Unique.
 type UniqueAttr func(optionalAttr)
 
@@ -2476,7 +2425,7 @@ func UniqueOutIdx(value tf.DataType) UniqueAttr {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
 // y, idx = unique(x)
 // y ==> [1, 2, 4, 7, 8]
@@ -2610,7 +2559,7 @@ func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce per batch.
+//	num_sampled: Number of candidates to produce.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -2768,7 +2717,7 @@ func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSampler
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
+//	num_sampled: Number of candidates to randomly sample.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -2841,7 +2790,7 @@ func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
+//	num_sampled: Number of candidates to randomly sample.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -2894,7 +2843,10 @@ func AbortExitWithoutError(value bool) AbortAttr {
 	}
 }
 
-// Raise a exception to abort the process when called. If exit_without_error is true, the process will exit normally, otherwise it will exit with a SIGABORT signal.
+// Raise a exception to abort the process when called.
+//
+// If exit_without_error is true, the process will exit normally,
+// otherwise it will exit with a SIGABORT signal.
 //
 // Returns nothing but an exception.
 //
@@ -2956,14 +2908,14 @@ func ControlTrigger(scope *Scope) (o *tf.Operation) {
 //
 // For example, given this input of shape `[1, 2, 2, 1]`, and block_size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1], [2]],
 //       [[3], [4]]]]
 // ```
 //
 // This operation will output a tensor of shape `[1, 1, 1, 4]`:
 //
-// ```prettyprint
+// ```
 // [[[[1, 2, 3, 4]]]]
 // ```
 //
@@ -2974,7 +2926,7 @@ func ControlTrigger(scope *Scope) (o *tf.Operation) {
 //
 // For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3], [4, 5, 6]],
 //       [[7, 8, 9], [10, 11, 12]]]]
 // ```
@@ -2982,13 +2934,13 @@ func ControlTrigger(scope *Scope) (o *tf.Operation) {
 // This operation, for block_size of 2, will return the following tensor of shape
 // `[1, 1, 1, 12]`
 //
-// ```prettyprint
+// ```
 // [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
 // ```
 //
 // Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
 //
-// ```prettyprint
+// ```
 // x = [[[[1],   [2],  [5],  [6]],
 //       [[3],   [4],  [7],  [8]],
 //       [[9],  [10], [13],  [14]],
@@ -2997,7 +2949,7 @@ func ControlTrigger(scope *Scope) (o *tf.Operation) {
 //
 // the operator will return the following tensor of shape `[1 2 2 4]`:
 //
-// ```prettyprint
+// ```
 // x = [[[[1, 2, 3, 4],
 //        [5, 6, 7, 8]],
 //       [[9, 10, 11, 12],
@@ -3023,37 +2975,34 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 	return op.Output(0)
 }
 
-// Creates a new tensor by applying sparse `updates` to individual
+// Scatter `updates` into a new (initially zero) tensor according to `indices`.
 //
-// values or slices within a zero tensor of the given `shape` tensor according to
+// Creates a new tensor by applying sparse `updates` to individual
+// values or slices within a zero tensor of the given `shape` according to
 // indices.  This operator is the inverse of the [tf.gather_nd](#gather_nd)
 // operator which extracts values or slices from a given tensor.
 //
-// TODO(simister): Add a link to Variable.__getitem__ documentation on slice
-// syntax.
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates.
 //
-// `shape` is a `TensorShape` with rank `P` and `indices` is a `Tensor` of rank
-// `Q`.
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
 //
-// `indices` must be integer tensor, containing indices into `shape`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//     indices.shape[-1] <= shape.rank
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `shape`.
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
 //
-// `updates` is Tensor of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, shape[K], ..., shape[P-1]].
-// ```
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
 //
 // The simplest form of scatter is to insert individual elements in a tensor by
 // index. For example, say we want to insert 4 scattered elements in a rank-1
 // tensor with 8 elements.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/ScatterNd1.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
 // </div>
 //
 // In Python, this scatter operation would look like this:
@@ -3064,7 +3013,7 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //     shape = tf.constant([8])
 //     scatter = tf.scatter_nd(indices, updates, shape)
 //     with tf.Session() as sess:
-//       print sess.run(scatter)
+//       print(sess.run(scatter))
 // ```
 //
 // The resulting tensor would look like this:
@@ -3076,7 +3025,7 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 // rank-3 tensor with two matrices of new values.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/ScatterNd2.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
 // </div>
 //
 // In Python, this scatter operation would look like this:
@@ -3090,7 +3039,7 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //     shape = tf.constant([4, 4, 4])
 //     scatter = tf.scatter_nd(indices, updates, shape)
 //     with tf.Session() as sess:
-//       print sess.run(scatter)
+//       print(sess.run(scatter))
 // ```
 //
 // The resulting tensor would look like this:
@@ -3101,11 +3050,9 @@ func SpaceToDepth(scope *Scope, input tf.Output, block_size int64) (output tf.Ou
 //      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
 //
 // Arguments:
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as tensor. A tensor of updated values
-// to store in ref.
-//	shape: A vector. The shape of the resulting tensor.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//	shape: 1-D. The shape of the resulting tensor.
 //
 // Returns A new tensor with the given shape and updates applied according
 // to the indices.
@@ -3205,7 +3152,7 @@ func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAtt
 // `Merge` waits for at least one of the tensors in `inputs` to become available.
 // It is usually combined with `Switch` to implement branching.
 //
-// `Merge` forwards the first tensor for become available to `output`, and sets
+// `Merge` forwards the first tensor to become available to `output`, and sets
 // `value_index` to its index in `inputs`.
 //
 // Arguments:
@@ -3331,6 +3278,18 @@ func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
 	}
 }
 
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+//
+// value: Scalar. If set to true, during CTC
+// calculation items have longer input sequences than output sequences
+// are ignored by returning zero-gradient for those items.
+// If not specified, defaults to false
+func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ignore_longer_outputs_than_inputs"] = value
+	}
+}
+
 // Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
 //
 // the gradient.  This class performs the softmax operation for you, so inputs
@@ -3389,10 +3348,10 @@ func StageSharedName(value string) StageAttr {
 	}
 }
 
-// Stage values similar to a lightweight Enqueue.  The basic functionality of this
+// Stage values similar to a lightweight Enqueue.
 //
-// Op is similar to a queue with many fewer capabilities and options.  This Op is
-// optimized for performance.
+// The basic functionality of this Op is similar to a queue with many
+// fewer capabilities and options.  This Op is optimized for performance.
 //
 // Arguments:
 //	values: a list of tensors
@@ -3435,11 +3394,20 @@ func FakeQuantWithMinMaxArgsMax(value float32) FakeQuantWithMinMaxArgsAttr {
 	}
 }
 
+// FakeQuantWithMinMaxArgsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsNumBits(value int64) FakeQuantWithMinMaxArgsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
 // Fake-quantize the 'inputs' tensor, type float to 'outputs' tensor of same type.
 //
 // Attributes [min; max] define the clamping range for the 'inputs' data.  Op
 // divides this range into 255 steps (total of 256 values), then replaces each
 // 'inputs' value with the closest of the quantized step values.
+// 'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
 // Quantization is called fake since the output is still in floating point.
 func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsAttr) (outputs tf.Output) {
@@ -3506,23 +3474,6 @@ func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value t
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceGatherAttr is an optional argument to ResourceGather.
 type ResourceGatherAttr func(optionalAttr)
 
@@ -3568,9 +3519,10 @@ func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype t
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.  This enables
+// Delete the TensorArray from its resource container.
 //
-// the user to close and release the resource in the middle of a step/run.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
 //	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
@@ -3651,7 +3603,7 @@ func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSam
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
+//	num_sampled: Number of candidates to randomly sample.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -3737,7 +3689,7 @@ func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'diagonal' is [1, 2, 3, 4]
 // tf.diag(diagonal) ==> [[1, 0, 0, 0]
 //                        [0, 2, 0, 0]
@@ -3927,37 +3879,6 @@ func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0), op.Output(1)
 }
 
-// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
-type TensorArrayConcatV2Attr func(optionalAttr)
-
-// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
-	}
-}
-
-// Deprecated. Use TensorArrayConcatV3
-func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Returns the batched diagonal part of a batched tensor.
 //
 // This operation returns a tensor with the `diagonal` part
@@ -3972,7 +3893,7 @@ func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtyp
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'input' is [[[1, 0, 0, 0]
 //                [0, 2, 0, 0]
 //                [0, 0, 3, 0]
@@ -4008,50 +3929,6 @@ func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	return op.Output(0)
 }
 
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
-
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
-//
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be cancelled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
-	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
-	}
-}
-
-// Closes the given queue.
-//
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
-//
-// Arguments:
-//	handle: The handle to a queue.
-//
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Concatenates tensors along one dimension.
 //
 // Arguments:
@@ -4092,24 +3969,24 @@ func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
 	}
 }
 
-// Dequeues n tuples of one or more tensors from the given queue.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
 // This operation is not supported by all queues.  If a queue does not support
 // DequeueUpTo, then an Unimplemented error is returned.
 //
-// If the queue is closed and there are more than 0 but less than n elements
-// remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If the queue
-// is closed and there are 0 elements left in the queue, then an OutOfRange
-// error is returned just like in QueueDequeueMany.  Otherwise the behavior
-// is identical to QueueDequeueMany:
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
 //
 // This operation concatenates queue-element component tensors along the
 // 0th dimension to make a single component tensor.  All of the components
 // in the dequeued tuple will have size n in the 0th dimension.
 //
-// This operation has k outputs, where k is the number of components in
-// the tuples stored in the given queue, and output i is the ith
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
 // component of the dequeued tuple.
 //
 // Arguments:
@@ -4177,20 +4054,20 @@ func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
 	}
 }
 
-// Dequeues n tuples of one or more tensors from the given queue.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// If the queue is closed and there are fewer than n elements, then an
+// If the queue is closed and there are fewer than `n` elements, then an
 // OutOfRange error is returned.
 //
 // This operation concatenates queue-element component tensors along the
 // 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
+// in the dequeued tuple will have size `n` in the 0th dimension.
 //
-// This operation has k outputs, where k is the number of components in
-// the tuples stored in the given queue, and output i is the ith
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
 // component of the dequeued tuple.
 //
-// N.B. If the queue is empty, this operation will block until n elements
+// N.B. If the queue is empty, this operation will block until `n` elements
 // have been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
@@ -4273,6 +4150,77 @@ func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, opti
 	return scope.AddOperation(opspec)
 }
 
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
+//
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+//
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
+//
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceStridedSliceAssign",
+		Input: []tf.Input{
+			ref, begin, end, strides, value,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // UnstageAttr is an optional argument to Unstage.
 type UnstageAttr func(optionalAttr)
 
@@ -4292,10 +4240,10 @@ func UnstageSharedName(value string) UnstageAttr {
 	}
 }
 
-// Op is similar to a lightweight Dequeue.  The basic funtionality is similar to
+// Op is similar to a lightweight Dequeue.
 //
-// dequeue with many fewer capabilities and options.  This Op is optimized for
-// performance.
+// The basic funtionality is similar to dequeue with many fewer
+// capabilities and options.  This Op is optimized for performance.
 func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -4673,7 +4621,7 @@ func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
 // particular,
 // `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
 //
-// ```prettyprint
+// ```
 // begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
 // end = [2, 4, x, x, -3, x]
 // strides = [1, 1, x, x, -1, 1]
@@ -4824,10 +4772,28 @@ func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow
 //               [51, 52], [61, 62]]
 // ```
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/DynamicStitch.png" alt>
-// </div>
-func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4841,188 +4807,219 @@ func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
+
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+	return func(m optionalAttr) {
+		m["cancel_pending_enqueues"] = value
+	}
+}
+
+// Closes the given queue.
 //
-// Inputs are the logits, not probabilities.
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+//	handle: The handle to a queue.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Type: "QueueCloseV2",
 		Input: []tf.Input{
-			features, labels,
+			handle,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
+// Releases any resources used by the given iterator.
 //
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
+// Returns the created operation.
+func IteratorDispose(scope *Scope, iterator tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IteratorDispose",
+		Input: []tf.Input{
+			iterator,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// TensorSummaryLabels sets the optional labels attribute to value.
-//
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["labels"] = value
+// Gets the next output from the given iterator.
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNext",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
 	}
+	return components
 }
 
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
+// Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["display_name"] = value
+// This operation may be executed multiple times. Each execution will reset the
+// iterator in `iterator` to the first element of `dataset`.
+//
+// Returns the created operation.
+func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MakeIterator",
+		Input: []tf.Input{
+			dataset, iterator,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor.
+// Creates a dataset that emits the records from one or more TFRecord files.
 //
 // Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+//	filenames: A scalar or vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
+		Type: "TFRecordDataset",
 		Input: []tf.Input{
-			tensor,
+			filenames, compression_type,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softplus gradients for a softplus operation.
+// Concatenates quantized tensors along one dimension.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
+		Type: "QuantizedConcat",
 		Input: []tf.Input{
-			gradients, features,
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+// Creates a dataset that emits the records from one or more binary files.
+//
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	header_bytes: A scalar representing the number of bytes to skip at the
+// beginning of a file.
+//	record_bytes: A scalar representing the number of bytes in each record.
+//	footer_bytes: A scalar representing the number of bytes to skip at the end
+// of a file.
+func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softplus",
+		Type: "FixedLengthRecordDataset",
 		Input: []tf.Input{
-			features,
+			filenames, header_bytes, record_bytes, footer_bytes,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
+// Deprecated. Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_y"] = value
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradV2",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// Creates a dataset that yields a SparseTensor for each element of the input.
 //
 // Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+//
+func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "DenseToSparseBatchDataset",
 		Input: []tf.Input{
-			x, y,
+			input_dataset, batch_size, row_shape,
 		},
 		Attrs: attrs,
 	}
@@ -5030,60 +5027,28 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
-
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// Arguments:
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "PaddedBatchDataset",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
 		},
 		Attrs: attrs,
 	}
@@ -5091,43 +5056,54 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 	return op.Output(0)
 }
 
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
+// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
+type TensorArrayConcatV2Attr func(optionalAttr)
+
+// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape_except0"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayConcatV3
+func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Relu6",
+		Type: "TensorArrayConcatV2",
 		Input: []tf.Input{
-			features,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -5135,116 +5111,189 @@ func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, ou
 	return op.Output(0)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
-
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseTensorSliceDataset",
+		Input: []tf.Input{
+			indices, values, dense_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// Reshapes a tensor.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
+// Given `tensor`, this operation returns a tensor that has the same values
+// as `tensor` with shape `shape`.
+//
+// If one component of `shape` is the special value -1, the size of that dimension
+// is computed so that the total size remains constant.  In particular, a `shape`
+// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
+//
+// If `shape` is 1-D or higher, then the operation returns a tensor with shape
+// `shape` filled with the values of `tensor`. In this case, the number of elements
+// implied by `shape` must be the same as the number of elements in `tensor`.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
+// # tensor 't' has shape [9]
+// reshape(t, [3, 3]) ==> [[1, 2, 3],
+//                         [4, 5, 6],
+//                         [7, 8, 9]]
+//
+// # tensor 't' is [[[1, 1], [2, 2]],
+// #                [[3, 3], [4, 4]]]
+// # tensor 't' has shape [2, 2, 2]
+// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
+//                         [3, 3, 4, 4]]
+//
+// # tensor 't' is [[[1, 1, 1],
+// #                 [2, 2, 2]],
+// #                [[3, 3, 3],
+// #                 [4, 4, 4]],
+// #                [[5, 5, 5],
+// #                 [6, 6, 6]]]
+// # tensor 't' has shape [3, 2, 3]
+// # pass '[-1]' to flatten 't'
+// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+//
+// # -1 can also be used to infer the shape
+//
+// # -1 is inferred to be 9:
+// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 2:
+// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 3:
+// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
+//                               [2, 2, 2],
+//                               [3, 3, 3]],
+//                              [[4, 4, 4],
+//                               [5, 5, 5],
+//                               [6, 6, 6]]]
+//
+// # tensor 't' is [7]
+// # shape `[]` reshapes to a scalar
+// reshape(t, []) ==> 7
+// ```
+//
+// Arguments:
+//
+//	shape: Defines the shape of the output tensor.
+func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Reshape",
+		Input: []tf.Input{
+			tensor, shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gradient for batch normalization.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+//
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.
-//	reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be used in the gradient computation.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			boxes, scores, max_output_size, iou_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
 
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["method"] = value
 	}
 }
 
-// Performs 3D average pooling on the input.
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
+		Type: "CropAndResizeGradBoxes",
 		Input: []tf.Input{
-			input,
+			grads, image, boxes, box_ind,
 		},
 		Attrs: attrs,
 	}
@@ -5252,74 +5301,83 @@ func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Produces the max pool of the input tensor for quantized types.
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
 // Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//
+//
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
-	}
-}
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
 	return func(m optionalAttr) {
-		m["max"] = value
+		m["method"] = value
 	}
 }
 
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
 // Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
 //
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
+		Type: "CropAndResizeGradImage",
 		Input: []tf.Input{
-			gradients, inputs,
+			grads, boxes, box_ind, image_size,
 		},
 		Attrs: attrs,
 	}
@@ -5327,124 +5385,107 @@ func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs t
 	return op.Output(0)
 }
 
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// A container for an iterator resource.
 //
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
-		Input: []tf.Input{
-			input, grad, argmax,
-		},
+		Type: "Iterator",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
+
+// ExtractGlimpseCentered sets the optional centered attribute to value.
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["centered"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the truth value of (x == y) element-wise.
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
 //
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Equal",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["normalized"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
-
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["uniform_noise"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// Extracts a glimpse from the input tensor.
+//
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
+//
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
+//
+// The argument `normalized` and `centered` controls how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
+//
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
+		Type: "ExtractGlimpse",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			input, size, offsets,
 		},
 		Attrs: attrs,
 	}
@@ -5452,191 +5493,150 @@ func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output
 	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
+// Draw bounding boxes on a batch of images.
+//
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example, if an image is 100 x 200 pixels and the bounding box is
+// `[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
+// bounding box will be `(10, 40)` to `(50, 180)`.
+//
+// Parts of the bounding box may fall outside the image.
 //
 // Arguments:
-//	x: a tensor of type T.
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			x,
+			images, boxes,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division.
+// Convert one or more images from HSV to RGB.
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// See `rgb_to_hsv` for a description of the HSV encoding.
+//
+// Arguments:
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mod",
+		Type: "HSVToRGB",
 		Input: []tf.Input{
-			x, y,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
 //
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
+//     convert $src.gif -coalesce $dst.gif
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+//	contents: 0-D.  The GIF-encoded image.
+//
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			contents,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
 
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+// DecodePngChannels sets the optional channels attribute to value.
 //
-// value: The dimension along which reversal is performed.
+// value: Number of color channels for the decoded image.
 // If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+func DecodePngChannels(value int64) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["batch_dim"] = value
+		m["channels"] = value
 	}
 }
 
-// Reverses variable length slices.
-//
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
-//
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-//
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
-//
-// For example:
-//
-// ```prettyprint
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// In contrast, if:
+// Accepted values are:
 //
-// ```prettyprint
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
 //
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
+//	contents: 0-D.  The PNG-encoded image.
 //
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			input, seq_lengths,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -5644,151 +5644,144 @@ func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_d
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
+// Adjust the contrast of one or more images.
 //
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
+//
+// Contrast is adjusted independently for each channel of each image.
+//
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
+//
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
+		Type: "AdjustContrastv2",
 		Input: []tf.Input{
-			x, y,
+			images, contrast_factor,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor transferred by GraphTransferer.
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
 //
-// The graph specifications are serialized by protobuf as graph_transfer_info.
-// The implementation / limitations may differ for each platform
-// and each available peripheral.
-func RemoteFusedGraphExecute(scope *Scope, values []tf.Output, N int64, U tf.DataType, serialized_graph_transfer_info string) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"N": N, "U": U, "serialized_graph_transfer_info": serialized_graph_transfer_info}
-	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
-		Input: []tf.Input{
-			tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	return output
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+// DecodeJpegRatio sets the optional ratio attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["ratio"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
 // If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["try_recover_truncated"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["acceptable_fraction"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
 // If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["dct_method"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			size,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -5796,341 +5789,283 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
+
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			grads, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
+
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			data,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
+// Returns the set of files matching one or more glob patterns.
 //
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+//
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NoOp",
+		Type: "MatchingFiles",
+		Input: []tf.Input{
+			pattern,
+		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+// Shuffle dimensions of x according to a permutation.
+//
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softsign",
+		Type: "Transpose",
 		Input: []tf.Input{
-			features,
+			x, perm,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
-//
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-//
-// Arguments:
-//
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			input, filter,
+			filename,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// Store the input tensor in the state of the current session.
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	value: The tensor to be stored.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "GetSessionHandleV2",
 		Input: []tf.Input{
-			value, bias,
+			value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
-
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// Adjust the hue of one or more images.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// Computes gradient of the FractionalAvgPool function.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "AdjustHue",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			images, delta,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
-
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// Restore a Reader to its initial clean state.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes the gradients of convolution with respect to the input.
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
 
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["description"] = value
 	}
 }
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["labels"] = value
 	}
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["display_name"] = value
 	}
 }
 
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Outputs a `Summary` protocol buffer with a tensor.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
-//
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be used in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6139,121 +6074,102 @@ func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
+	return op.Output(0)
 }
 
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// Computes softplus gradients for a softplus operation.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			shape,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
-//
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "Softplus",
 		Input: []tf.Input{
-			x,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
 
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
 //
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
 // If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["adj_y"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// Multiplies slices of two tensors in batches.
+//
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
 //
 // Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6262,9 +6178,9 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -6272,34 +6188,49 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
 
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
 // If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["adjoint_a"] = value
 	}
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 //
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6308,114 +6239,92 @@ func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "SparseTensorDenseMatMul",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			a_indices, a_values, a_shape, b,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
-
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the ids of the positions in sampled_candidates that match true_labels.
-//
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
-//
-// Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
-//
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "Relu6",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
+// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
+type AudioSpectrogramAttr func(optionalAttr)
 
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+// value: Whether to return the squared magnitude or just the
+// magnitude. Using squared magnitude can avoid extra calculations.
+// If not specified, defaults to false
+func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["magnitude_squared"] = value
 	}
 }
 
-// Computes gradients of the average pooling function.
+// Produces a visualization of audio data over time.
+//
+// Spectrograms are a standard way of representing audio information as a series of
+// slices of frequency information, one slice for each window of time. By joining
+// these together into a sequence, they form a distinctive fingerprint of the sound
+// over time.
+//
+// This op expects to receive audio data as an input, stored as floats in the range
+// -1 to 1, together with a window width in samples, and a stride specifying how
+// far to move the window between slices. From this it generates a three
+// dimensional output. The lowest dimension has an amplitude value for each
+// frequency during that time slice. The next dimension is time, with successive
+// frequency slices. The final dimension is for the channels in the input, so a
+// stereo audio input would have two here for example.
+//
+// This means the layout when converted and saved as an image is rotated 90 degrees
+// clockwise from a typical spectrogram. Time is descending down the Y axis, and
+// the frequency decreases from left to right.
+//
+// Each value in the result represents the square root of the sum of the real and
+// imaginary parts of an FFT on the current window of samples. In this way, the
+// lowest dimension represents the power of each frequency in the current window,
+// and adjacent windows are concatenated in the next dimension.
+//
+// To get a more intuitive and visual look at what this operation does, you can run
+// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+// resulting spectrogram as a PNG image.
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+//	input: Float representation of audio data.
+//	window_size: How wide the input window is in samples. For the highest efficiency
+// this should be a power of two, but other values are accepted.
+//	stride: How widely apart the center of adjacent sample windows should be.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// Returns 3D representation of the audio frequencies as an image.
+func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "AudioSpectrogram",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -6423,201 +6332,190 @@ func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/SegmentMax.png" alt>
-// </div>
+// Computes the gradient of morphological 2-D dilation with respect to the input.
 //
 // Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "Dilation2DBackpropInput",
 		Input: []tf.Input{
-			data, segment_ids,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
+
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-// Elements of the `shapes_and_slices` input must either be:
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
-//
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
-//
-// See also `Save`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SaveSlices",
-		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
-		},
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Returns the rank of a tensor.
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// This operation returns an integer representing the rank of `input`.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
 //
-// For example:
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
-// ```prettyprint
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
-// ```
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.
+//	reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be used in the gradient computation.
 //
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Rank",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			input,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
 
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
 	return func(m optionalAttr) {
-		m["field_delim"] = value
+		m["data_format"] = value
 	}
 }
 
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
+	return op.Output(0)
+}
+
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
 	}
-	return output
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
+	}
+}
 
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["num_bits"] = value
 	}
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
-//
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6626,9 +6524,9 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "FakeQuantWithMinMaxArgsGradient",
 		Input: []tf.Input{
-			out_backprop,
+			gradients, inputs,
 		},
 		Attrs: attrs,
 	}
@@ -6636,52 +6534,57 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
+		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
-			json_examples,
+			input, grad, argmax,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
 // Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "Dilation2DBackpropFilter",
 		Input: []tf.Input{
-			serialized,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6689,294 +6592,2279 @@ func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (outp
 	return op.Output(0)
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the truth value of (x == y) element-wise.
+//
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Acos",
+		Type: "Equal",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
+
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
 //
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```prettyprint
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
-//
-// Useful special cases:
-//
-// ```prettyprint
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs 3D average pooling on the input.
 //
 // Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "AvgPool3D",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
-
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
-//
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
-	return func(m optionalAttr) {
-		m["little_endian"] = value
-	}
-}
-
-// Reinterpret the bytes of a string as a vector of numbers.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	bytes: All the elements must have the same length.
-//
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			bytes,
+			input, min_input, max_input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["data_format"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
-//
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
-//
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// Computes the gradients of 3-D convolution with respect to the input.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
-//
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "Conv3DBackpropInputV2",
 		Input: []tf.Input{
-			handle,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns a tensor of ones with the same shape and type as x.
+//
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "OnesLike",
+		Input: []tf.Input{
+			x,
+		},
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
-type ParseSingleSequenceExampleAttr func(optionalAttr)
-
-// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mod",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropFilter",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropInput",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+//
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+	return func(m optionalAttr) {
+		m["batch_dim"] = value
+	}
+}
+
+// Reverses variable length slices.
+//
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
+//
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
+//
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
+//
+// For example:
+//
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 2:, :, ...] = input[3, 2:, :, ...]
+// ```
+//
+// In contrast, if:
+//
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
+//
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+// ```
+//
+// Arguments:
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
+//
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ReverseSequence",
+		Input: []tf.Input{
+			input, seq_lengths,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the rsqrt of `x` wrt its input.
+//
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RsqrtGrad",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Execute a sub graph on a remote processor transferred by GraphTransferer.
+//
+// The graph specifications are serialized by protobuf as graph_transfer_info.
+// The implementation / limitations may differ for each platform
+// and each available peripheral.
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	opspec := tf.OpSpec{
+		Type: "RemoteFusedGraphExecute",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
+	}
+	return outputs
+}
+
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
+
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropFilterV2",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayV2",
+		Input: []tf.Input{
+			size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
+//
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeManySparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Makes its input available to the next iteration.
+//
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NextIteration",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Does nothing. Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NoOp",
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softsign",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bilinear interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBilinear",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
+
+// ProdKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the product of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
+//
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Prod",
+		Input: []tf.Input{
+			input, reduction_indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
+//
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNative",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
+
+// CropAndResizeMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
+	}
+}
+
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
+//
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+	return func(m optionalAttr) {
+		m["extrapolation_value"] = value
+	}
+}
+
+// Extracts crops from the input image tensor and bilinearly resizes them (possibly
+//
+// with aspect ratio change) to a common output size specified by `crop_size`. This
+// is more general than the `crop_to_bounding_box` op which extracts a fixed size
+// slice from the input image and does not allow resizing or aspect ratio change.
+//
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`.
+//
+// Arguments:
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
+//
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CropAndResize",
+		Input: []tf.Input{
+			image, boxes, box_ind, crop_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
+
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds `bias` to `value`.
+//
+// This is a deprecated version of BiasAdd and will be soon removed.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BiasAddV1",
+		Input: []tf.Input{
+			value, bias,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
+//
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
+	}
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
+	}
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+//
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
+	}
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+//
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+//
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
+	}
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
+//
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
+//
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
+//
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeJpeg",
+		Input: []tf.Input{
+			image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gradients for batch normalization.
+//
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+//
+// This op is deprecated. See `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+//
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	opspec := tf.OpSpec{
+		Type: "BatchNormWithGlobalNormalizationGrad",
+		Input: []tf.Input{
+			t, m, v, gamma, backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
+
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the input.
+//
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2DBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
+
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be used in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNorm",
+		Input: []tf.Input{
+			x, scale, offset, mean, variance,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomStandardNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes sigmoid of `x` element-wise.
+//
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sigmoid",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
+
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes the ids of the positions in sampled_candidates that match true_labels.
+//
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
+//
+// Arguments:
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
+//
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ComputeAccidentalHits",
+		Input: []tf.Input{
+			true_classes, sampled_candidates,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
+
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the average pooling function.
+//
+// Arguments:
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPoolGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the maximum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMax",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Saves input tensors slices to disk.
+//
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
+//
+// Elements of the `shapes_and_slices` input must either be:
+//
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
+//
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
+//
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SaveSlices",
+		Input: []tf.Input{
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Writes contents to the file at input filename. Creates file if not existing.
+//
+// Arguments:
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
+//
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "WriteFile",
+		Input: []tf.Input{
+			filename, contents,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the Cholesky decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix Cholesky
+// decomposition above. The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cholesky",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the rank of a tensor.
+//
+// This operation returns an integer representing the rank of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
+//
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rank",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+//
+// value: delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
+//
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
+//
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeCSV",
+		Input: []tf.Input{
+			records, tf.OutputList(record_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
+}
+
+// Convert JSON-encoded Example records to binary protocol buffer strings.
+//
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
+//
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+//
+// Arguments:
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
+//
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "ParseTensor",
+		Input: []tf.Input{
+			serialized,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acos",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Copy a tensor setting everything outside a central band in each innermost matrix
+//
+// to zero.
+//
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
+//
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixBandPart",
+		Input: []tf.Input{
+			input, num_lower, num_upper,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+//
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
+//
+// Arguments:
+//	bytes: All the elements must have the same length.
+//
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeRaw",
+		Input: []tf.Input{
+			bytes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
+
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Dequeues a tuple of one or more tensors from the given queue.
+//
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueDequeueV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
+	}
+	return components
+}
+
+// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
+type ParseSingleSequenceExampleAttr func(optionalAttr)
+
+// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+//
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+//
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
 // The shape of context_dense_values[j] will match context_dense_shapes[j].
 // If not specified, defaults to <>
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+//
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A scalar containing a binary serialized SequenceExample proto.
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExample.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExample.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	debug_name: A scalar containing the name of the serialized proto.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty scalar if no name is available.
+func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleSequenceExample",
+		Input: []tf.Input{
+			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+}
+
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
+
+// RandomGammaSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from the Gamma distribution(s) described by alpha.
+//
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
+//
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomGamma",
+		Input: []tf.Input{
+			shape, alpha,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
+
+// RandomShuffleSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
+		m["seed"] = value
 	}
 }
 
-// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly shuffles a tensor along its first dimension.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```prettyprint
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
+//
+// Arguments:
+//	value: The tensor to be shuffled.
+//
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomShuffle",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FakeQuantWithMinMaxVarsPerChannelAttr is an optional argument to FakeQuantWithMinMaxVarsPerChannel.
+type FakeQuantWithMinMaxVarsPerChannelAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsPerChannelNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsPerChannelNumBits(value int64) FakeQuantWithMinMaxVarsPerChannelAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
+//
+// `[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
+// to 'outputs' tensor of same shape as `inputs`.
+//
+// [min; max] is the clamping range for the 'inputs' data in the corresponding
+// depth channel.  Op divides this range into 255 steps (total of 256 values), then
+// replaces each 'inputs' value with the closest of the quantized step values.
+// 'num_bits' is the bitwidth of the quantization; between 2 and 8, inclusive.
+//
+// This operation has a gradient and thus allows for training `min` and `max` values.
+func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsPerChannelAttr) (outputs tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxVarsPerChannel",
+		Input: []tf.Input{
+			inputs, min, max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncatedNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
+	}
+}
+
+// SkipgramMinCount sets the optional min_count attribute to value.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Skipgram",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
+
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
+		m["seed"] = value
 	}
 }
 
-// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
-//
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
+		m["seed2"] = value
 	}
 }
 
-// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+// Outputs random values from a normal distribution. The parameters may each be a
+//
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	serialized: A scalar containing a binary serialized SequenceExample proto.
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExample.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExample.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	debug_name: A scalar containing the name of the serialized proto.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty scalar if no name is available.
-func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
+//
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6985,94 +8873,108 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleSequenceExample",
+		Type: "ParameterizedTruncatedNormal",
 		Input: []tf.Input{
-			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+			shape, means, stdevs, minvals, maxvals,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
+	return op.Output(0)
+}
+
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
+
+// EncodePngCompression sets the optional compression attribute to value.
+//
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
+	return func(m optionalAttr) {
+		m["compression"] = value
 	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
+}
+
+// PNG-encode an image.
+//
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
+//
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
+//
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
+//
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "EncodePng",
+		Input: []tf.Input{
+			image,
+		},
+		Attrs: attrs,
 	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
 // value: If either `seed` or `seed2` are set to be non-zero, the random number
 // generator is seeded by the given seed.  Otherwise, it is seeded by a
 // random seed.
 // If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
 // value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Outputs random integers from a uniform distribution.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7081,9 +8983,9 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			shape, alpha,
+			shape, minval, maxval,
 		},
 		Attrs: attrs,
 	}
@@ -7091,140 +8993,250 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
 
-// RandomShuffleSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["transpose_a"] = value
+	}
+}
+
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". This op is optimized for the case where at
+// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
+// matrix multiply on one platform was 30% zero values in the sparse matrix.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseMatMul",
+		Input: []tf.Input{
+			a, b,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
+
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+// value: Whether to use Adapative SDCA for the inner loop.
+// If not specified, defaults to false
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["adaptative"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
 //
-// ```prettyprint
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 //
 // Arguments:
-//	value: The tensor to be shuffled.
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
 //
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			value,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// Fake-quantize the 'inputs' tensor of type float and one of the shapes: `[d]`,
+// Computes the minimum along segments of a tensor.
 //
-// `[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
-// to 'outputs' tensor of same shape as `inputs`.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// [min; max] is the clamping range for the 'inputs' data in the corresponding
-// depth channel.  Op divides this range into 255 steps (total of 256 values), then
-// replaces each 'inputs' value with the closest of the quantized step values.
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// This operation has a gradient and thus allows for training `min` and `max` values.
-func FakeQuantWithMinMaxVarsPerChannel(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output) (outputs tf.Output) {
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsPerChannel",
+		Type: "SegmentMin",
 		Input: []tf.Input{
-			inputs, min, max,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
 
-// TruncatedNormalSeed sets the optional seed attribute to value.
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// Restores a tensor from checkpoint files.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
+//
+// See also `RestoreSlice`.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "Restore",
 		Input: []tf.Input{
-			shape,
+			file_pattern, tensor_name,
 		},
 		Attrs: attrs,
 	}
@@ -7232,254 +9244,395 @@ func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
-// SkipgramWindowSize sets the optional window_size attribute to value.
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1),
+// which exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
-		m["window_size"] = value
+		m["resize_align_corners"] = value
 	}
 }
 
-// SkipgramMinCount sets the optional min_count attribute to value.
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedResizeAndPadConv2D",
+		Input: []tf.Input{
+			input, size, paddings, filter,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
+
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["subsample"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Parses a text file and creates a batch of examples.
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
+		Type: "DenseToSparseSetOperation",
+		Input: []tf.Input{
+			set1, set2_indices, set2_values, set2_shape,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+// Delete the tensor specified by its handle in the session.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DeleteSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
+
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Outputs random values from a normal distribution. The parameters may each be a
+// Applies set operation along last dimension of 2 `Tensor` inputs.
 //
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Convert one or more images from HSV to RGB.
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
+
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Number of unique elements along last dimension of input `set`.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
 //
 // Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "SetSize",
 		Input: []tf.Input{
-			images,
+			set_indices, set_values, set_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
 
-// RandomUniformIntSeed sets the optional seed attribute to value.
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["container"] = value
 	}
 }
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
 //
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
+//
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
+//
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			sparse_handles,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
 
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
+		m["container"] = value
 	}
 }
 
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Multiply matrix "a" by matrix "b".
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". This op is optimized for the case where at
-// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
-// matrix multiply on one platform was 30% zero values in the sparse matrix.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7488,9 +9641,9 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "AddSparseToTensorsMap",
 		Input: []tf.Input{
-			a, b,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -7498,405 +9651,324 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 	return op.Output(0)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// DecodeBmpAttr is an optional argument to DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
-//
-// value: Whether to use Adapative SDCA for the inner loop.
-// If not specified, defaults to false
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+// DecodeBmpChannels sets the optional channels attribute to value.
+// If not specified, defaults to 0
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["channels"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// Proximal Stochastic Dual Coordinate Ascent, Shalev-Shwartz, Shai; Zhang, Tong.
-// 2012 arXiv1211.2717S: http://arxiv.org/pdf/1211.2717v1.pdf
+// Decode the first frame of a BMP-encoded image to a uint8 tensor.
 //
-//   Loss objective = \sum f_{i}(wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// Adding vs. Averaging in Distributed Primal-Dual Optimization.
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan, Peter Richtarik,
-// Martin Takac http://arxiv.org/abs/1502.03508
+// Accepted values are:
 //
-// Stochastic Dual Coordinate Ascent with Adaptive Probabilities
-// Dominik Csiba, Zheng Qu, Peter Richtarik https://arxiv.org/abs/1502.08053
+// *   0: Use the number of channels in the BMP-encoded image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe ommitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	contents: 0-D.  The BMP-encoded image.
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`. RGB order
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "DecodeBmp",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+	return op.Output(0)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read [the section on
-// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-// of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
+// Computes softmax activations.
 //
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+// For each batch `i` and class `j` we have
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/SegmentMin.png" alt>
-// </div>
+//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
 //
 // Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "Softmax",
 		Input: []tf.Input{
-			data, segment_ids,
+			logits,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
 
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
+	}
+}
+
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
 // If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["capacity"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
 //
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["min_after_dequeue"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
 //
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// See also `RestoreSlice`.
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+//	component_types: The type of each component in a value.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
-		Input: []tf.Input{
-			file_pattern, tensor_name,
-		},
+		Type: "RandomShuffleQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
-
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
-	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
-	}
-}
-
-// Performs a resize and padding as a preprocess during a convolution.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			tags, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
-
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
+// Constructs a tensor by tiling a given tensor.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// This operation creates a new tensor by replicating `input` `multiples` times.
+// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+// and the values of `input` are replicated `multiples[i]` times along the 'i'th
+// dimension. For example, tiling `[a b c d]` by `[2]` produces
+// `[a b c d a b c d]`.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
-//
-//
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+//	input: 1-D or higher.
+//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
+func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "Tile",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			input, multiples,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
-
-// TopKV2Sorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
+	return op.Output(0)
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// Returns the element-wise min of two SparseTensors.
 //
-// If two elements are equal, the lower-index element appears first.
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			input, k,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Delete the tensor specified by its handle in the session.
+// Computes the gradient of the sigmoid of `x` wrt its input.
 //
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SigmoidGrad",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Generate a sharded filename. The filename is printf formatted as
 //
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
+		Type: "ShardedFilename",
 		Input: []tf.Input{
-			handle,
+			basename, shard, num_shards,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
 
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
 // If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
 		m["validate_indices"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
 //
 // Output `result` is a `SparseTensor` represented by `result_indices`,
 // `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
@@ -7905,16 +9977,26 @@ func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperatio
 // `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
 //
 //
 // Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
 // the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
 // is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7923,9 +10005,9 @@ func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			set1, set2,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
@@ -7933,585 +10015,556 @@ func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
-
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Number of unique elements along last dimension of input `set`.
-//
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
-//
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			a_indices, a_values, a_shape, b,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
-
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// ListDiffAttr is an optional argument to ListDiff.
+type ListDiffAttr func(optionalAttr)
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+// ListDiffOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["out_idx"] = value
 	}
 }
 
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-//
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
+// Computes the difference between two lists of numbers or strings.
 //
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
+// Given a list `x` and a list `y`, this operation returns a list `out` that
+// represents all values that are in `x` but not in `y`. The returned list `out`
+// is sorted in the same order that the numbers appear in `x` (duplicates are
+// preserved). This operation also returns a list `idx` that represents the
+// position of each `out` element in `x`. In other words:
 //
-// ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-// ```
+// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
 //
-// and
+// For example, given this input:
 //
 // ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// x = [1, 2, 3, 4, 5, 6]
+// y = [1, 3, 5]
 // ```
 //
-// then the final `SparseTensor` will be:
+// This operation would return:
 //
 // ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// out ==> [2, 4, 6]
+// idx ==> [1, 3, 5]
 // ```
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
+//	x: 1-D. Values to keep.
+//	y: 1-D. Values to remove.
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
+func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "ListDiff",
 		Input: []tf.Input{
-			sparse_handles,
+			x, y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
-
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+// Generates sparse cross from a list of sparse and dense tensors.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+// For example, if the inputs are
 //
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
 //
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
 //
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+//     inputs[2]: Tensor [["f"], ["g"]]
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+// then the output will be
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softmax activations.
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
 //
-// For each batch `i` and class `j` we have
+// if hashed_output=true then the output will be
 //
-//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
 //
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "Softmax",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			logits,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["shapes"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["overlapping"] = value
 	}
 }
 
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
 //
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
+		m["deterministic"] = value
 	}
 }
 
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
 //
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
 //
-// value: A second seed to avoid seed collision.
+// value: An second seed to avoid seed collision.
 // If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// Performs fractional max pooling on the input.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that randomizes the order of elements.
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
+//
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
+		Type: "FractionalMaxPool",
+		Input: []tf.Input{
+			value,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
+//
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			tags, values,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Constructs a tensor by tiling a given tensor.
-//
-// This operation creates a new tensor by replicating `input` `multiples` times.
-// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-// and the values of `input` are replicated `multiples[i]` times along the 'i'th
-// dimension. For example, tiling `[a b c d]` by `[2]` produces
-// `[a b c d a b c d]`.
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	input: 1-D or higher.
-//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
-func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tile",
-		Input: []tf.Input{
-			input, multiples,
-		},
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the element-wise min of two SparseTensors.
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
 //
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
-		},
+		Type: "IdentityReaderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// Performs a padding as a preprocess during a convolution.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "FusedPadConv2D",
 		Input: []tf.Input{
-			pattern,
+			input, paddings, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
+//
+// Arguments:
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			x, y,
+			l, grad,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// Returns immutable tensor from memory region.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+// The current implementation memmaps the tensor from a file.
+//
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
-		Input: []tf.Input{
-			basename, shard, num_shards,
-		},
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
-
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
 //
-// Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+// and
 //
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
-		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+// then the final deserialized `SparseTensor` will be:
 //
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			serialized_sparse,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
 // Batch normalization.
@@ -9075,15 +11128,16 @@ func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Compute the 1-dimensional discrete Fourier Transform over the inner-most
+// Fast Fourier transform.
 //
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
 // dimension of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
 //
 // Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier Transform.
+//   dimension of `input` is replaced with its 1D Fourier transform.
 //
 // @compatibility(numpy)
 // Equivalent to np.fft.fft
@@ -9168,139 +11222,69 @@ func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
 	opspec := tf.OpSpec{
 		Type: "SelfAdjointEig",
 		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
-
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
-	}
-}
-
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
-//
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
-//
-// Then, row_pooling_sequence should satisfy:
-//
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
+// Computes gradient of the FractionalAvgPool function.
 //
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			value,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
 // Reorders a SparseTensor into the canonical, row-major ordering.
@@ -9362,7 +11346,7 @@ func PackAxis(value int64) PackAttr {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 'x' is [1, 4]
 // # 'y' is [2, 5]
 // # 'z' is [3, 6]
@@ -9448,53 +11432,6 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Computes rectified linear gradients for a Relu operation.
 //
 // Arguments:
@@ -9535,51 +11472,6 @@ func ReciprocalGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix Cholesky
-// decomposition above. The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cholesky",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Writes contents to the file at input filename. Creates file if not existing.
-//
-// Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
-//
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteFile",
-		Input: []tf.Input{
-			filename, contents,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Reverses specific dimensions of a tensor.
 //
 // NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
@@ -9595,7 +11487,7 @@ func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Oper
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # tensor 't' is [[[[ 0,  1,  2,  3],
 // #                  [ 4,  5,  6,  7],
 // #                  [ 8,  9, 10, 11]],
@@ -9714,15 +11606,16 @@ func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms
 	return scope.AddOperation(opspec)
 }
 
-// Compute the inverse 3-dimensional discrete Fourier Transform over the inner-most
+// Inverse 3D fast Fourier transform.
 //
-// 3 dimensions of `input`.
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
 //
 // Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier Transform.
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
 //
 // @compatibility(numpy)
 // Equivalent to np.fft.ifftn with 3 dimensions.
@@ -9741,6 +11634,35 @@ func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Looks up keys in a table, outputs the corresponding values.
+//
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
+//
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableFindV2",
+		Input: []tf.Input{
+			table_handle, keys, default_value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
 // range that covers the actual values present in that tensor.  This op is
@@ -9767,6 +11689,35 @@ func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, inp
 	return op.Output(0), op.Output(1)
 }
 
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradGradWithArgmax",
+		Input: []tf.Input{
+			input, grad, argmax,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
 type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
@@ -9800,22 +11751,47 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN
 // of the convolution.
 //	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that contains `count` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
+//
+//
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -9953,6 +11929,16 @@ func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
 	}
 }
 
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
 // Update '*var' according to the Adam algorithm.
 //
 // lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
@@ -9991,15 +11977,16 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b
 	return scope.AddOperation(opspec)
 }
 
-// Compute the 3-dimensional discrete Fourier Transform over the inner-most 3
+// 3D fast Fourier transform.
 //
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
 // dimensions of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
 //
 // Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier Transform.
+//   dimensions of `input` are replaced with their 3D Fourier transform.
 //
 // @compatibility(numpy)
 // Equivalent to np.fft.fftn with 3 dimensions.
@@ -10036,7 +12023,7 @@ func SizeOutType(value tf.DataType) SizeAttr {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
 // size(t) ==> 12
 // ```
@@ -10184,15 +12171,16 @@ func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf
 	return scope.AddOperation(opspec)
 }
 
-// Compute the 2-dimensional discrete Fourier Transform over the inner-most
+// 2D fast Fourier transform.
 //
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
 // 2 dimensions of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
 //
 // Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier Transform.
+//   dimensions of `input` are replaced with their 2D Fourier transform.
 //
 // @compatibility(numpy)
 // Equivalent to np.fft.fft2
@@ -10217,7 +12205,7 @@ func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # Output tensor has shape [2, 3].
 // fill([2, 3], 9) ==> [[9, 9, 9]
 //                      [9, 9, 9]]
@@ -10244,15 +12232,16 @@ func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Compute the inverse 2-dimensional discrete Fourier Transform over the inner-most
+// Inverse 2D fast Fourier transform.
 //
-// 2 dimensions of `input`.
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
 // Arguments:
 //	input: A complex64 tensor.
 //
 // Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier Transform.
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 //
 // @compatibility(numpy)
 // Equivalent to np.fft.ifft2
@@ -10271,53 +12260,6 @@ func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
-
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
-//
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // TensorArrayV3Attr is an optional argument to TensorArrayV3.
 type TensorArrayV3Attr func(optionalAttr)
 
@@ -10368,9 +12310,9 @@ func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
 	}
 }
 
-// An array of Tensors of given size, with data written via Write and read
+// An array of Tensors of given size.
 //
-// via Read or Pack.
+// Write data via Write and read via Read or Pack.
 //
 // Arguments:
 //	size: The size of the array.
@@ -10534,47 +12476,97 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 	return scope.AddOperation(opspec)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
 
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes gradients of average pooling function.
+// Update '*var' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalMaxPool function.
+//
+// Arguments:
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPoolGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -10624,6 +12616,8 @@ func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, ma
 type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
 // FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+//
+// value: Number of bytes in the header, defaults to 0.
 // If not specified, defaults to 0
 func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
@@ -10632,6 +12626,8 @@ func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2
 }
 
 // FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+//
+// value: Number of bytes in the footer, defaults to 0.
 // If not specified, defaults to 0
 func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
@@ -10639,6 +12635,17 @@ func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2
 	}
 }
 
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+//
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
+	}
+}
+
 // FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
 // value: If non-empty, this reader is placed in the given container.
@@ -10663,6 +12670,9 @@ func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2
 
 // A Reader that outputs fixed-length records from a file.
 //
+// Arguments:
+//	record_bytes: Number of bytes in the record.
+//
 // Returns The handle to reference the Reader.
 func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
@@ -10681,6 +12691,30 @@ func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...Fix
 	return op.Output(0)
 }
 
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+//
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
 type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
@@ -10748,9 +12782,8 @@ func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Outp
 
 // Computes the mean along segments of a tensor.
 //
-// Read [the section on
-// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-// of segments.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Computes a tensor such that
 // \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
@@ -10760,7 +12793,7 @@ func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Outp
 // If the mean is empty for a given segment ID `i`, `output[i] = 0`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/SegmentMean.png" alt>
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
 // </div>
 //
 // Arguments:
@@ -10898,7 +12931,7 @@ func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
 // # 'paddings' is [[0, 1]], [0, 1]].
 // # 'mode' is SYMMETRIC.
@@ -10943,7 +12976,7 @@ func MirrorPadGrad(scope *Scope, input tf.Output, paddings tf.Output, mode strin
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # tensor `x` is [3, 4, 0, 2, 1]
 // invert_permutation(x) ==> [2, 4, 3, 0, 1]
 // ```
@@ -10979,7 +13012,7 @@ func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 //
 // For example:
 //
-// ```prettyprint
+// ```
 // # tensor 't' is [[[[ 0,  1,  2,  3],
 // #                  [ 4,  5,  6,  7],
 // #                  [ 8,  9, 10, 11]],
@@ -11256,8 +13289,9 @@ func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	return op.Output(0)
 }
 
-// Compute the inverse 2-dimensional discrete Fourier Transform of a real-valued
+// Inverse 2D real-valued fast Fourier transform.
 //
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
 // signal over the inner-most 2 dimensions of `input`.
 //
 // The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
@@ -11273,7 +13307,7 @@ func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 //
 // Returns A float32 tensor of the same rank as `input`. The inner-most 2
 //   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier Transform.
+//   inverse 2D Fourier transform.
 //
 // @compatibility(numpy)
 // Equivalent to np.fft.irfft2
@@ -11292,12 +13326,12 @@ func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics where
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// true, this follows C semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
@@ -11501,7 +13535,7 @@ func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCa
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
+//	num_sampled: Number of candidates to randomly sample.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
@@ -11583,6 +13617,27 @@ func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.O
 	return scope.AddOperation(opspec)
 }
 
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Decode web-safe base64-encoded strings.
 //
 // Input may or may not have padding at the end. See EncodeBase64 for padding.
@@ -11602,20 +13657,68 @@ func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 			input,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes gradients of average pooling function.
+//
+// Arguments:
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "AvgPool3DGrad",
 		Input: []tf.Input{
-			x,
+			orig_input_shape, grad,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -11806,306 +13909,88 @@ func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDisto
 //	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
 // associated with the image.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
-		Input: []tf.Input{
-			image_size, bounding_boxes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns the truth value of (x > y) element-wise.
-//
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
-//
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
-
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["mode"] = value
-	}
-}
-
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// number_of_steps = 1 << (# of bits in T)
-// range_adjust = number_of_steps / (number_of_steps - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = number_of_steps / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
-//
-// Arguments:
-//
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"T": T}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
-		Input: []tf.Input{
-			input, min_range, max_range,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns which elements of x are Inf.
+// Returns the truth value of (x > y) element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "Greater",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
+// value: If `True`, updating of the var, ms, and mom tensors is protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Update '*var' according to the RMSProp algorithm.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12114,233 +13999,153 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Shuffle dimensions of x according to a permutation.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Transpose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReadFile",
-		Input: []tf.Input{
-			filename,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// PrintMessage sets the optional message attribute to value.
-//
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["mode"] = value
 	}
 }
 
-// PrintFirstN sets the optional first_n attribute to value.
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["first_n"] = value
-	}
-}
-
-// PrintSummarize sets the optional summarize attribute to value.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
 //
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Prints a list of tensors.
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// Passes `input` through to `output` and prints `data` when evaluating.
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+// *MIN_COMBINED Mode Example*
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Print",
-		Input: []tf.Input{
-			input, tf.OutputList(data),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using area interpolation.
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// number_of_steps = 1 << (# of bits in T)
+// range_adjust = number_of_steps / (number_of_steps - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = number_of_steps / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
 //
-// Input images can be of different types but output images are always float.
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			images, size,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["data_format"] = value
 	}
 }
 
-// Returns the real part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
-// For example:
+// Arguments:
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			input,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -12348,139 +14153,62 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
-//
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNBias sets the optional bias attribute to value.
-//
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNBeta sets the optional beta attribute to value.
-//
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
-//
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// Returns which elements of x are Inf.
 //
-// Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "IsInf",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
+//	linear: Should be from a Variable().
 //	grad: The gradient.
 //	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12489,67 +14217,106 @@ func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, l
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
-
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNGradBias sets the optional bias attribute to value.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseMul",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// LRNGradAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// LRNGradBeta sets the optional beta attribute to value.
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// value: An exponent.
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
 // If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["iou_threshold"] = value
 	}
 }
 
-// Gradients for Local Response Normalization.
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12558,9 +14325,9 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			boxes, scores, max_output_size,
 		},
 		Attrs: attrs,
 	}
@@ -12568,26 +14335,38 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 	return op.Output(0)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// StringToNumberOutType sets the optional out_type attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// Update '*var' according to the adadelta scheme.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12596,178 +14375,114 @@ func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToN
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			string_tensor,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "Fact",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
-//
-// ```prettyprint
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pad",
-		Input: []tf.Input{
-			input, paddings,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
 
-// Computes the number of elements in the given queue.
-//
-// Arguments:
-//	handle: The handle to a queue.
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
-		Input: []tf.Input{
-			handle,
-		},
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			tag, values,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
 
-// AsStringPrecision sets the optional precision attribute to value.
+// PrintMessage sets the optional message attribute to value.
 //
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
 	return func(m optionalAttr) {
-		m["precision"] = value
+		m["message"] = value
 	}
 }
 
-// AsStringScientific sets the optional scientific attribute to value.
+// PrintFirstN sets the optional first_n attribute to value.
 //
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["scientific"] = value
+		m["first_n"] = value
 	}
 }
 
-// AsStringShortest sets the optional shortest attribute to value.
+// PrintSummarize sets the optional summarize attribute to value.
 //
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
-// If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["shortest"] = value
+		m["summarize"] = value
 	}
 }
 
-// AsStringWidth sets the optional width attribute to value.
+// Prints a list of tensors.
 //
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["width"] = value
-	}
-}
-
-// AsStringFill sets the optional fill attribute to value.
+// Passes `input` through to `output` and prints `data` when evaluating.
 //
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
-// If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
-	return func(m optionalAttr) {
-		m["fill"] = value
-	}
-}
-
-// Converts each entry in the given tensor to strings.  Supports many numeric
+// Arguments:
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12776,9 +14491,9 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AsString",
+		Type: "Print",
 		Input: []tf.Input{
-			input,
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
@@ -12786,38 +14501,44 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
+
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using area interpolation.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			predictions, targets,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -12825,40 +14546,31 @@ func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (pr
 	return op.Output(0)
 }
 
-// GatherAttr is an optional argument to Gather.
-type GatherAttr func(optionalAttr)
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// GatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func GatherValidateIndices(value bool) GatherAttr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["Tout"] = value
 	}
 }
 
-// Gather slices from `params` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// Returns the real part of a complex number.
 //
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// For example:
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
 // ```
-//
-// If `indices` is a permutation and `len(indices) == params.shape[0]` then
-// this operation will permute `params` accordingly.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12867,9 +14579,9 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Gather",
+		Type: "Real",
 		Input: []tf.Input{
-			params, indices,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -12877,125 +14589,218 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 	return op.Output(0)
 }
 
-// Adjust the contrast of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
-//
-// Contrast is adjusted independently for each channel of each image.
+// Returns the next record (key, value pair) produced by a Reader.
 //
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
+		Type: "ReaderReadV2",
 		Input: []tf.Input{
-			images, contrast_factor,
+			reader_handle, queue_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes softsign gradients for a softsign operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
-//
-// Returns The gradients: `gradients / (1 + abs(-features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "ZipDataset",
 		Input: []tf.Input{
-			gradients, features,
+			tf.OutputList(input_datasets),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
+
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
-// The polygamma function is defined as:
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// ```
-// \psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)
-// ```
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+//
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+//
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
+
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+//
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["max_load_factor"] = value
+	}
+}
+
+// Creates an empty hash table that uses tensors as the backing store.
+//
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			a, x,
+			empty_key,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
 //
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNAlpha sets the optional alpha attribute to value.
 //
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
+//
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
 //
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
 //
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "LRN",
 		Input: []tf.Input{
-			input, filter,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -13003,58 +14808,105 @@ func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64
 	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// convert $src.gif -coalesce $dst.gif
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			contents,
+			var_, accum, lr, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// EncodeBase64Pad sets the optional pad attribute to value.
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["pad"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// Encode strings into web-safe base64 format.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13063,9 +14915,9 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			input,
+			input_grads, input_image, output_image,
 		},
 		Attrs: attrs,
 	}
@@ -13073,233 +14925,206 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			reader_handle,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// If `pos` is negative or specifies a character index larger than any of the input
-// strings, then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
-//
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
-//
-// output = [b'ell', b'orl']
-// ```
-//
-// Using `pos` and `len` with same shape as `input`:
-//
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
-//
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
-//
-// Broadcasting `pos` and `len` onto `input`:
-//
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
-//
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
-//
-// Broadcasting `input` onto `pos` and `len`:
-//
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
-//
-// output = [b'hir', b'ee', b'n"]
-// ```
-//
-// Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
-//
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "LogicalNot",
 		Input: []tf.Input{
-			input, pos, len,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// Pads a tensor with zeros.
 //
-// N is the size of the segment being reduced.
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// Read [the section on
-// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-// of segments.
+// The padded size of each dimension D of the output is:
 //
-// Arguments:
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// For example:
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "Pad",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input, paddings,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Computes the number of elements in the given queue.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	handle: The handle to a queue.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads the value of a variable.
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// The tensor returned by this operation is immutable.
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			resource,
+			tag, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
 
-// ProdKeepDims sets the optional keep_dims attribute to value.
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["precision"] = value
+	}
+}
+
+// AsStringScientific sets the optional scientific attribute to value.
+//
+// value: Use scientific notation for floating point numbers.
 // If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
+func AsStringScientific(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["scientific"] = value
 	}
 }
 
-// Computes the product of elements across dimensions of a tensor.
+// AsStringShortest sets the optional shortest attribute to value.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
+	return func(m optionalAttr) {
+		m["shortest"] = value
+	}
+}
+
+// AsStringWidth sets the optional width attribute to value.
 //
-// Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["width"] = value
+	}
+}
+
+// AsStringFill sets the optional fill attribute to value.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...ProdAttr) (output tf.Output) {
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
+// If not specified, defaults to ""
+func AsStringFill(value string) AsStringAttr {
+	return func(m optionalAttr) {
+		m["fill"] = value
+	}
+}
+
+// Converts each entry in the given tensor to strings.  Supports many numeric
+//
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13308,9 +15133,9 @@ func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "AsString",
 		Input: []tf.Input{
-			input, reduction_indices,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -13318,33 +15143,45 @@ func Prod(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// GatherAttr is an optional argument to Gather.
+type GatherAttr func(optionalAttr)
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+// GatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func GatherValidateIndices(value bool) GatherAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// Gather slices from `params` according to `indices`.
 //
-// Input images can be of different types but output images are always float.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+//
+// If `indices` is a permutation and `len(indices) == params.shape[0]` then
+// this operation will permute `params` accordingly.
+//
+// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+// `indices` are always validated to be within range. If assigned to GPU,
+// out-of-bound indices result in safe but unspecified behavior, which may include
+// raising an error.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13353,9 +15190,9 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "Gather",
 		Input: []tf.Input{
-			images, size,
+			params, indices,
 		},
 		Attrs: attrs,
 	}
@@ -13363,166 +15200,128 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
+// Computes softsign gradients for a softsign operation.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Returns The gradients: `gradients / (1 + abs(-features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftsignGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// The polygamma function is defined as:
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
 //
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of
-// rate.
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "Polygamma",
 		Input: []tf.Input{
-			shape, rate,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the adadelta scheme.
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
 //
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
+//
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
+//
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
+//
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
 
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
 	return func(m optionalAttr) {
-		m["iou_threshold"] = value
+		m["pad"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
+// Encode strings into web-safe base64 format.
 //
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
 //
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Web-safe means that the encoder uses - and _ instead of + and /.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+//	input: Strings to be encoded.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13531,9 +15330,9 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -13541,90 +15340,148 @@ func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_outp
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
-//
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
+// Produce a string tensor that encodes the state of a Reader.
 //
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "ReaderSerializeStateV2",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			reader_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
+// Return substrings from `Tensor` of strings.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
+//
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
+//
+// If `pos` is negative or specifies a character index larger than any of the input
+// strings, then an `InvalidArgumentError` is thrown.
+//
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
+// ```
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
+// ```
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n"]
+// ```
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
+//
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "Substr",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			input, pos, len,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["dtype"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// Arguments:
+// The generated values will have mean 0 and standard deviation 1.
 //
+// The outputs are a deterministic function of `shape` and `seed`.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13633,422 +15490,293 @@ func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_f
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
-
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+// Inverse fast Fourier transform.
 //
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
-	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
-//
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+// N is the size of the segment being reduced.
 //
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// UnpackAxis sets the optional axis attribute to value.
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
-//
-// This is the opposite of `pack`.
+// Computes the gradient of bilinear interpolation.
 //
 // Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			value,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
-//
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// Computes the number of elements in the given table.
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+//	table_handle: Handle to the table.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "LookupTableSizeV2",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			table_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
-}
-
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// ReduceJoinSeparator sets the optional separator attribute to value.
-//
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
+	return op.Output(0)
 }
 
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.
-//
-// For example:
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-// tf.reduce_join(a, []) ==> ["abcd"]
-// ```
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			sp_indices, sp_values, sp_shape, dense,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
-//
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
-	}
-}
-
-// SvdFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the singular value decompositions of one or more matrices.
+// Reads the value of a variable.
 //
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// The tensor returned by this operation is immutable.
 //
-// ```prettyprint
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			input,
+			resource,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
+// Computes the absolute value of a tensor.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Abs",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Asserts that the given condition is true.
+// Restore a reader to a previously saved state.
 //
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
 //
 // Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "ReaderRestoreStateV2",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			reader_handle, state,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
 
-// RandomUniformSeed sets the optional seed attribute to value.
+// RandomPoissonSeed sets the optional seed attribute to value.
 //
 // value: If either `seed` or `seed2` are set to be non-zero, the random number
 // generator is seeded by the given seed.  Otherwise, it is seeded by a
 // random seed.
 // If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
 //
 // value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+//
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of
+// rate.
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			shape,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -14056,421 +15784,441 @@ func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ..
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
-
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
-		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
-
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the "logical or" of elements across dimensions of a tensor.
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			input, reduction_indices,
+			sp_indices, sp_values, sp_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+// Computes gradients for SparseSegmentMean.
 //
-// The Hurwitz zeta function is defined as:
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// ```
-// \zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}
-// ```
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			x, q,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the inverse 1-dimensional discrete Fourier Transform over the inner-most
+// Converts one or more images from RGB to HSV.
 //
-// dimension of `input`.
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier Transform.
+// Arguments:
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			input,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the inverse 1-dimensional discrete Fourier Transform of a real-valued
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
+
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+	return func(m optionalAttr) {
+		m["fast"] = value
+	}
+}
+
+// Solves one or more linear least-squares problems.
 //
-// signal over the inner-most dimension of `input`.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]
+// in the least squares sense.
 //
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// matrix and right-hand sides in the batch:
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+// `matrix`=\\(A \in \Re^{m \times n}\\),
+// `rhs`=\\(B  \in \Re^{m \times k}\\),
+// `output`=\\(X  \in \Re^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda\\).
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier Transform.
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^T A + \lambda I)^{-1} A^T B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^T (A A^T + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||Z||_F^2 \\), subject to
+// \\(A Z = B\\). Notice that the fast path is only numerically stable when
+// \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.irfft
+// Equivalent to np.linalg.lstsq
 // @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			input, fft_length,
+			matrix, rhs, l2_regularizer,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
-//
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
+
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
 // Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			features, max_value, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
 
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
 //
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
+// value: see above.
 // If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["lower"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.triangular_solve
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
+//
+// Arguments:
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
+//
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MergeV2Checkpoints",
+		Input: []tf.Input{
+			checkpoint_prefixes, destination_prefix,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
+
+// UnpackAxis sets the optional axis attribute to value.
 //
-// backsubstitution.
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
+	}
+}
+
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "Unpack",
 		Input: []tf.Input{
-			matrix, rhs,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
-//
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
 	}
-	return scope.AddOperation(opspec)
+	return output
 }
 
-// Compute the 1-dimensional discrete Fourier Transform of a real-valued signal
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// over the inner-most dimension of `input`.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+// Graphically the output tensors are:
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier Transform.
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			input, fft_length,
+			split_dim, indices, values, shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the tanh of `x` wrt its input.
-//
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			x, y,
-		},
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
+// value: The separator to use when joining.
 // If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["separator"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+// Joins a string Tensor across the given dimensions.
 //
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// For example:
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
+// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
+// tf.reduce_join(a, []) ==> ["abcd"]
+// ```
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14479,9 +16227,9 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -14489,72 +16237,57 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
+
+// SvdComputeUv sets the optional compute_uv attribute to value.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
+	return func(m optionalAttr) {
+		m["compute_uv"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
-
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// SvdFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+// Computes the singular value decompositions of one or more matrices.
 //
-// For example:
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
 //
 // ```prettyprint
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
 // ```
 //
 // Arguments:
-//	x: 1-D.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14563,9 +16296,9 @@ func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
+		Type: "Svd",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -14573,34 +16306,30 @@ func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAtt
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
+// AssertSummarize sets the optional summarize attribute to value.
+//
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["summarize"] = value
 	}
 }
 
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
+// Asserts that the given condition is true.
 //
-// The input tensors `real` and `imag` must have the same shape.
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
-// For example:
+// Arguments:
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14609,52 +16338,62 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "Assert",
 		Input: []tf.Input{
-			real, imag,
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
 
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
+// RandomUniformSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["seed"] = value
 	}
 }
 
-// Returns the imaginary part of a complex number.
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a uniform distribution.
 //
-// For example:
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			input,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -14662,114 +16401,95 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
-//
-// Arguments:
-//	input: vector of strings to compute fingerprints on.
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
+
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
-		Input: []tf.Input{
-			input,
-		},
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns a batched diagonal tensor with a given batched diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
-//
-// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
-//
-// For example:
-//
-// ```prettyprint
-// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// and diagonal.shape = (2, 4)
-//
-// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-//                                      [0, 2, 0, 0]
-//                                      [0, 0, 3, 0]
-//                                      [0, 0, 0, 4]],
-//                                     [[5, 0, 0, 0]
-//                                      [0, 6, 0, 0]
-//                                      [0, 0, 7, 0]
-//                                      [0, 0, 0, 8]]]
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// which has shape (2, 4, 4)
-// ```
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	diagonal: Rank `k`, where `k >= 1`.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
-func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDiag",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			diagonal,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
+// AnyKeepDims sets the optional keep_dims attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// Computes the "logical or" of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "Any",
 		Input: []tf.Input{
-			input,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -14777,208 +16497,238 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Returns x // y element-wise.
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorDiv",
+		Type: "Zeta",
 		Input: []tf.Input{
-			x, y,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
-
-// TopKSorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
-}
-
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// Inverse real-valued fast Fourier transform.
 //
-// If two elements are equal, the lower-index element appears first.
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
 //
-// If `k` varies dynamically, use `TopKV2` below.
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			input,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
-//
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
+//
+//
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			reader_handle,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// Saves tensors in V2 checkpoint format.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
+//
+// Arguments:
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "SaveV2",
 		Input: []tf.Input{
-			x,
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x - y element-wise.
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
 //
-// *NOTE*: `Sub` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
+	}
+}
+
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.triangular_solve
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Solves systems of linear equations with upper or lower triangular matrices by
+//
+// backsubstitution.
+//
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			x, y,
+			matrix, rhs,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
-//
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
+// Adds a value to the current value of a variable.
 //
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			input, delimiter,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Compute the inverse 3-dimensional discrete Fourier Transform of a real-valued
+// Real-valued fast Fourier transform.
 //
-// signal over the inner-most 3 dimensions of `input`.
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
 //
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier Transform.
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
+// Equivalent to np.fft.rfft
 // @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "RFFT",
 		Input: []tf.Input{
 			input, fft_length,
 		},
@@ -14987,16 +16737,16 @@ func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Out
 	return op.Output(0)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// Computes the gradient for the tanh of `x` wrt its input.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "TanhGrad",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -15005,152 +16755,89 @@ func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
-
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
-	}
-}
-
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
-	}
-}
-
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+// Outputs all keys and values in the table.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Performs fractional average pooling on the input.
+// Arguments:
+//	table_handle: Handle to the table.
 //
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
 //
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			value,
+			table_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// RandomCropAttr is an optional argument to RandomCrop.
-type RandomCropAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// RandomCropSeed sets the optional seed attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomCropSeed(value int64) RandomCropAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["container"] = value
 	}
 }
 
-// RandomCropSeed2 sets the optional seed2 attribute to value.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomCropSeed2(value int64) RandomCropAttr {
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Randomly crop `image`.
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
 //
-// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
-// width.  The values must be non negative.
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
 //
-// This Op picks a random location in `image` and crops a `height` by `width`
-// rectangle from that location.  The random location is picked so the cropped
-// area will fit inside the original image.
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	image: 3-D of shape `[height, width, channels]`.
-//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns 3-D of shape `[crop_height, crop_width, channels].`
-func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15159,9 +16846,9 @@ func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...Rando
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomCrop",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			image, size,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -15169,304 +16856,332 @@ func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...Rando
 	return op.Output(0)
 }
 
-// Returns immutable tensor from memory region.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// The current implementation memmaps the tensor from a file.
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
 //
 // Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
+		Type: "StringToHashBucketFast",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
+
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// then the final deserialized `SparseTensor` will be:
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Gather specific elements from the TensorArray into output `value`.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			serialized_sparse,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Performs a padding as a preprocess during a convolution.
-//
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+// Deprecated. Disallowed in GraphDef version >= 2.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
+		Type: "AdjustContrast",
 		Input: []tf.Input{
-			input, paddings, filter,
+			images, contrast_factor, min_value, max_value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a reader to a previously saved state.
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
+
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			reader_handle, state,
+			orig_input, orig_output, grad,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
+// 3D real-valued fast Fourier transform.
 //
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Abs",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			x,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
+
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
 //
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
 //
 // Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
+//	x: 1-D.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
+		Type: "UniqueWithCounts",
 		Input: []tf.Input{
-			l, grad,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// Arguments:
 //
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-// then the output will be
 //
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SkipDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
 //
-// Graphically this is equivalent to doing
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// The input tensors `real` and `imag` must have the same shape.
 //
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+// For example:
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "Complex",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			real, imag,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
 
-// IdentityReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["Tout"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// Returns the imaginary part of a complex number.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the queued work as both the key and value.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// For example:
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15475,242 +17190,337 @@ func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
+		Type: "Imag",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produces the average pool of the input tensor for quantized types.
+// Computes fingerprints of the input strings.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	input: vector of strings to compute fingerprints on.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
+		Type: "SdcaFprint",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
-//
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+// Creates a dataset that emits the lines of one or more text files.
 //
 // Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+func TextLineDataset(scope *Scope, filenames tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TextLineDataset",
+		Input: []tf.Input{
+			filenames,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of records this Reader has produced.
 //
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
-//
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
+// Computes exponential of x - 1 element-wise.
+//
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Expm1",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// Returns x - y element-wise.
+//
+// *NOTE*: `Sub` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
 //
-// Arguments:
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
+//
+// Arguments:
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			input, delimiter,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Quantized Batch normalization.
+// Inverse 3D real-valued fast Fourier transform.
 //
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
+//
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x != y) element-wise.
 //
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
+		Type: "NotEqual",
 		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Add all input tensors element wise.
+// Says whether the targets are in the top `K` predictions.
+//
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
 // Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "AddN",
+		Type: "InTopK",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			predictions, targets,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
-
-// MaxKeepDims sets the optional keep_dims attribute to value.
+// Returns a batched diagonal tensor with a given batched diagonal values.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the maximum of elements across dimensions of a tensor.
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
+// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+//
+// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// and diagonal.shape = (2, 4)
+//
+// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
+//                                      [0, 2, 0, 0]
+//                                      [0, 0, 3, 0]
+//                                      [0, 0, 0, 4]],
+//                                     [[5, 0, 0, 0]
+//                                      [0, 6, 0, 0]
+//                                      [0, 0, 7, 0]
+//                                      [0, 0, 0, 8]]]
+//
+// which has shape (2, 4, 4)
+// ```
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MaxAttr) (output tf.Output) {
+// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
+func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "MatrixDiag",
 		Input: []tf.Input{
-			input, reduction_indices,
+			diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
+
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs 3D max pooling on the input.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -15718,16 +17528,16 @@ func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the truth value of x AND y element-wise.
+// Returns x // y element-wise.
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "FloorDiv",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -15736,622 +17546,648 @@ func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
 
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+// TopKSorted sets the optional sorted attribute to value.
+//
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["sorted"] = value
 	}
 }
 
-// Computes the complex absolute value of a tensor.
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"k": k}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "TopK",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels and the bounding box is
-// `[0.1, 0.2, 0.5, 0.9]`, the bottom-left and upper-right coordinates of the
-// bounding box will be `(10, 40)` to `(50, 180)`.
-//
-// Parts of the bounding box may fall outside the image.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+// TopKV2Attr is an optional argument to TopKV2.
+type TopKV2Attr func(optionalAttr)
+
+// TopKV2Sorted sets the optional sorted attribute to value.
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
-		Input: []tf.Input{
-			images, boxes,
-		},
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKV2Sorted(value bool) TopKV2Attr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the element-wise max of two SparseTensors.
+// Finds values and indices of the `k` largest elements for the last dimension.
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "TopKV2",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			input, k,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
-//
-// DEPRECATED at GraphDef version 17: Use ReciprocalGrad
+// RandomCropAttr is an optional argument to RandomCrop.
+type RandomCropAttr func(optionalAttr)
+
+// RandomCropSeed sets the optional seed attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomCropSeed(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "InvGrad",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// RandomCropSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomCropSeed2(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Randomly crop `image`.
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+//
+// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
+// width.  The values must be non negative.
+//
+// This Op picks a random location in `image` and crops a `height` by `width`
+// rectangle from that location.  The random location is picked so the cropped
+// area will fit inside the original image.
+//
+// Arguments:
+//	image: 3-D of shape `[height, width, channels]`.
+//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//
+// Returns 3-D of shape `[crop_height, crop_width, channels].`
+func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Reciprocal",
+		Type: "RandomCrop",
 		Input: []tf.Input{
-			x,
+			image, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
+
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Elu",
-		Input: []tf.Input{
-			features,
-		},
+}
+
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Square",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
 //
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional average pooling on the input.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FloorMod",
+		Type: "FractionalAvgPool",
 		Input: []tf.Input{
-			x, y,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes square root of x element-wise.
+// Updates the table to associates keys with values.
 //
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sqrt",
+		Type: "LookupTableInsertV2",
 		Input: []tf.Input{
-			x,
+			table_handle, keys, values,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inserts a dimension of 1 into a tensor's shape.
-//
-// Given a tensor `input`, this operation inserts a dimension of 1 at the
-// dimension index `dim` of `input`'s shape. The dimension index `dim` starts at
-// zero; if you specify a negative number for `dim` it is counted backward from
-// the end.
-//
-// This operation is useful if you want to add a batch dimension to a single
-// element. For example, if you have a single image of shape `[height, width,
-// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
-// which will make the shape `[1, height, width, channels]`.
-//
-// Other examples:
-//
-// ```prettyprint
-// # 't' is a tensor of shape [2]
-// shape(expand_dims(t, 0)) ==> [1, 2]
-// shape(expand_dims(t, 1)) ==> [2, 1]
-// shape(expand_dims(t, -1)) ==> [2, 1]
-//
-// # 't2' is a tensor of shape [2, 3, 5]
-// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
-// ```
+// Produces the average pool of the input tensor for quantized types.
 //
-// This operation requires that:
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-// `-1-input.dims() <= dim <= input.dims()`
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "QuantizedAvgPool",
+		Input: []tf.Input{
+			input, min_input, max_input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
-// This operation is related to `squeeze()`, which removes dimensions of
-// size 1.
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
 //
 // Arguments:
 //
-//	dim: 0-D (scalar). Specifies the dimension index at which to
-// expand the shape of `input`.
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
 //
-// Returns Contains the same data as `input`, but its shape has an additional
-// dimension of size 1 added.
-func ExpandDims(scope *Scope, input tf.Output, dim tf.Output) (output tf.Output) {
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ExpandDims",
+		Type: "QuantizedBiasAdd",
 		Input: []tf.Input{
-			input, dim,
+			input, bias, min_input, max_input, min_bias, max_bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
 
-// AllKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["out_type"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AllAttr) (output tf.Output) {
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			input, reduction_indices,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
-type CTCBeamSearchDecoderAttr func(optionalAttr)
-
-// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
-//
-// value: If true, merge repeated classes in output.
-// If not specified, defaults to true
-func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
-	return func(m optionalAttr) {
-		m["merge_repeated"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Performs beam search decoding on the logits given in input.
+// Quantized Batch normalization.
 //
-// A note about the attribute merge_repeated: For the beam search decoder,
-// this means that if consecutive entries in a beam are the same, only
-// the first of these is emitted.  That is, when the top path is "A B B B B",
-// "A B" is returned if merge_repeated = True but "A B B B B" is
-// returned if merge_repeated = False.
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch)`.
-//	beam_width: A scalar >= 0 (beam search beam width).
-//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
 //
-// Returns A list (length: top_paths) of indices matrices.  Matrix j,
-// size `(total_decoded_outputs[j] x 2)`, has indices of a
-// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
-// size `(length total_decoded_outputs[j])`, has the values of a
-// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
-// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
-// sequence log-probabilities.
-func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "CTCBeamSearchDecoder",
+		Type: "QuantizedBatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	log_probability = op.Output(idx)
-	return decoded_indices, decoded_values, decoded_shape, log_probability
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes reciprocal of square root of x element-wise.
+// Add all input tensors element wise.
 //
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rsqrt",
+		Type: "AddN",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RecordInputAttr is an optional argument to RecordInput.
-type RecordInputAttr func(optionalAttr)
-
-// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
-//
-// value: Random seeds used to produce randomized records.
-// If not specified, defaults to 301
-func RecordInputFileRandomSeed(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_random_seed"] = value
-	}
-}
-
-// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
-//
-// value: Shifts the list of files after the list is randomly
-// shuffled.
-// If not specified, defaults to 0
-func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_shuffle_shift_ratio"] = value
-	}
-}
-
-// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
-//
-// value: The randomization shuffling buffer.
-// If not specified, defaults to 10000
-func RecordInputFileBufferSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_buffer_size"] = value
-	}
-}
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
 
-// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: How many sstables are opened and concurrently iterated over.
-// If not specified, defaults to 16
-func RecordInputFileParallelism(value int64) RecordInputAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
-		m["file_parallelism"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// RecordInputBatchSize sets the optional batch_size attribute to value.
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// value: The batch size.
-// If not specified, defaults to 32
-func RecordInputBatchSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["batch_size"] = value
-	}
-}
-
-// Emits randomized records.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	file_pattern: Glob pattern for the data files.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns A tensor of shape [batch_size].
-func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"file_pattern": file_pattern}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RecordInput",
-
+		Type: "Max",
+		Input: []tf.Input{
+			input, reduction_indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Rounds the values of a tensor to the nearest integer, element-wise.
-//
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "Cast",
 		Input: []tf.Input{
 			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
+// Returns the truth value of x AND y element-wise.
 //
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "LogicalAnd",
 		Input: []tf.Input{
-			start, stop, num,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of x element-wise.
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
+
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Computes the complex absolute value of a tensor.
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Log",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
 			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
+// Returns the element-wise max of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation.
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "SparseSparseMaximum",
 		Input: []tf.Input{
-			gradients, features,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
-
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Resize `images` to `size` using bicubic interpolation.
-//
-// Input images can be of different types but output images are always float.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// DEPRECATED at GraphDef version 17: Use ReciprocalGrad
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			images, size,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// Computes the reciprocal of x element-wise.
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "Reciprocal",
 		Input: []tf.Input{
 			x,
 		},
@@ -16360,28 +18196,33 @@ func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+//
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Lgamma",
+		Type: "Elu",
 		Input: []tf.Input{
-			x,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes square of x element-wise.
+//
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Floor",
+		Type: "Square",
 		Input: []tf.Input{
 			x,
 		},
@@ -16390,28 +18231,36 @@ func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+//
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+//
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Erf",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes square root of x element-wise.
+//
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Erfc",
+		Type: "Sqrt",
 		Input: []tf.Input{
 			x,
 		},
@@ -16420,223 +18269,185 @@ func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Compute the 2-dimensional discrete Fourier Transform of a real-valued signal
+// Inserts a dimension of 1 into a tensor's shape.
 //
-// over the inner-most 2 dimensions of `input`.
+// Given a tensor `input`, this operation inserts a dimension of 1 at the
+// dimension index `dim` of `input`'s shape. The dimension index `dim` starts at
+// zero; if you specify a negative number for `dim` it is counted backward from
+// the end.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// This operation is useful if you want to add a batch dimension to a single
+// element. For example, if you have a single image of shape `[height, width,
+// channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+// which will make the shape `[1, height, width, channels]`.
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+// Other examples:
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier Transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+// ```
+// # 't' is a tensor of shape [2]
+// shape(expand_dims(t, 0)) ==> [1, 2]
+// shape(expand_dims(t, 1)) ==> [2, 1]
+// shape(expand_dims(t, -1)) ==> [2, 1]
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RFFT2D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gradients for batch normalization.
+// # 't2' is a tensor of shape [2, 3, 5]
+// shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
+// shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
+// shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
+// ```
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// This operation requires that:
 //
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// `-1-input.dims() <= dim <= input.dims()`
+//
+// This operation is related to `squeeze()`, which removes dimensions of
+// size 1.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+//	dim: 0-D (scalar). Specifies the dimension index at which to
+// expand the shape of `input`.
+//
+// Returns Contains the same data as `input`, but its shape has an additional
+// dimension of size 1 added.
+func ExpandDims(scope *Scope, input tf.Output, dim tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "ExpandDims",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			input, dim,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
-//
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
-	}
+	return op.Output(0)
 }
 
-// EncodeJpegQuality sets the optional quality attribute to value.
-//
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
-	}
-}
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// EncodeJpegProgressive sets the optional progressive attribute to value.
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If True, create a JPEG that loads progressively (coarse to fine).
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["progressive"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
-
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
-	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// Arguments:
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...AllAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegXDensity sets the optional x_density attribute to value.
-//
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// EncodeJpegYDensity sets the optional y_density attribute to value.
-//
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
+	opspec := tf.OpSpec{
+		Type: "All",
+		Input: []tf.Input{
+			input, reduction_indices,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
+type CTCBeamSearchDecoderAttr func(optionalAttr)
+
+// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: If true, merge repeated classes in output.
+// If not specified, defaults to true
+func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["merge_repeated"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// Performs beam search decoding on the logits given in input.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// A note about the attribute merge_repeated: For the beam search decoder,
+// this means that if consecutive entries in a beam are the same, only
+// the first of these is emitted.  That is, when the top path is "A B B B B",
+// "A B" is returned if merge_repeated = True but "A B B B B" is
+// returned if merge_repeated = False.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch)`.
+//	beam_width: A scalar >= 0 (beam search beam width).
+//	top_paths: A scalar >= 0, <= beam_width (controls output size).
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns A list (length: top_paths) of indices matrices.  Matrix j,
+// size `(total_decoded_outputs[j] x 2)`, has indices of a
+// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
+// size `(length total_decoded_outputs[j])`, has the values of a
+// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
+// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
+// sequence log-probabilities.
+func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "CTCBeamSearchDecoder",
 		Input: []tf.Input{
-			image,
+			inputs, sequence_length,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	log_probability = op.Output(idx)
+	return decoded_indices, decoded_values, decoded_shape, log_probability
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes reciprocal of square root of x element-wise.
+//
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sin",
+		Type: "Rsqrt",
 		Input: []tf.Input{
 			x,
 		},
@@ -16645,37 +18456,93 @@ func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the determinant of one ore more square matrices.
+// RecordInputAttr is an optional argument to RecordInput.
+type RecordInputAttr func(optionalAttr)
+
+// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// value: Random seeds used to produce randomized records.
+// If not specified, defaults to 301
+func RecordInputFileRandomSeed(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_random_seed"] = value
+	}
+}
+
+// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
+//
+// value: Shifts the list of files after the list is randomly
+// shuffled.
+// If not specified, defaults to 0
+func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_shuffle_shift_ratio"] = value
+	}
+}
+
+// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
+//
+// value: The randomization shuffling buffer.
+// If not specified, defaults to 10000
+func RecordInputFileBufferSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_buffer_size"] = value
+	}
+}
+
+// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+//
+// value: How many sstables are opened and concurrently iterated over.
+// If not specified, defaults to 16
+func RecordInputFileParallelism(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_parallelism"] = value
+	}
+}
+
+// RecordInputBatchSize sets the optional batch_size attribute to value.
+//
+// value: The batch size.
+// If not specified, defaults to 32
+func RecordInputBatchSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["batch_size"] = value
+	}
+}
+
+// Emits randomized records.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	file_pattern: Glob pattern for the data files.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A tensor of shape [batch_size].
+func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"file_pattern": file_pattern}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "RecordInput",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+// Rounds the values of a tensor to the nearest integer, element-wise.
+//
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cos",
+		Type: "Round",
 		Input: []tf.Input{
 			x,
 		},
@@ -16684,173 +18551,82 @@ func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Generates values in an interval.
+//
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
+//
+// Arguments:
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
+//
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			x,
+			start, stop, num,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
-//
-// Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```prettyprint
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```prettyprint
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-//
-// ```prettyprint
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```prettyprint
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-//
-// ```prettyprint
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```prettyprint
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
-//
-// ```prettyprint
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Computes natural logarithm of x element-wise.
 //
-// ```prettyprint
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "Log",
 		Input: []tf.Input{
-			input, crops,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```prettyprint
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
+// Resize `images` to `size` using bicubic interpolation.
 //
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16859,9 +18635,9 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "ResizeBicubic",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -16869,82 +18645,87 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes rectified linear 6 gradients for a Relu6 operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation.
+//
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asin",
+		Type: "Relu6Grad",
 		Input: []tf.Input{
-			x,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// output range specified with 'requested_output_min' and 'requested_output_max'.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+// Computes natural logarithm of (1 + x) element-wise.
 //
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "Lgamma",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
+// Returns x / y element-wise for real types.
 //
-// Arguments:
+// If `x` and `y` are reals, this will return the floating-point division.
 //
-//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
-// of the input Tensor to reduce across. For vectors, use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "RealDiv",
 		Input: []tf.Input{
-			input, dimension,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "Floor",
 		Input: []tf.Input{
 			x,
 		},
@@ -16953,81 +18734,79 @@ func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Erf",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
+		Type: "Erfc",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
+// 2D real-valued fast Fourier transform.
+//
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
 //
 // @compatibility(numpy)
-// Equivalent to np.isnan
+// Equivalent to np.fft.rfft2
 // @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			x,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
-//
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
-//
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "Sin",
 		Input: []tf.Input{
 			x,
 		},
@@ -17036,28 +18815,37 @@ func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Returns element-wise smallest integer in not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the determinant of one ore more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Ceil",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exp",
+		Type: "Cos",
 		Input: []tf.Input{
 			x,
 		},
@@ -17066,107 +18854,136 @@ func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the Max along segments of a tensor.
-//
-// Read [the section on
-// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-// of segments.
-//
-// This operator is similar to the [unsorted segment sum operator](../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum
-// such that:
-//
-// \\(output_i = \max_j data_j\\) where max is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
-//  `output[i] = numeric_limits<T>::min()`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/UnsortedSegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.
-//
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "TensorSliceDataset",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Add",
+		Type: "Tan",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
-
-// BiasAddDataFormat sets the optional data_format attribute to value.
+// BatchToSpace for 4-D tensors of type T.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Adds `bias` to `value`.
+// This is a legacy version of the more general BatchToSpaceND.
+//
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
+//
+// Arguments:
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
+//
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+//
+//
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
+//
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// The output tensor has shape `[1, 4, 4, 1]` and value:
 //
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			value, bias,
+			input, crops,
 		},
 		Attrs: attrs,
 	}
@@ -17174,41 +18991,53 @@ func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddA
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Converts a sparse representation into a dense tensor.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
+// Builds an array `dense` with shape `output_shape` such that
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// ```prettyprint
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17217,505 +19046,415 @@ func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns x * y element-wise.
-//
-// *NOTE*: `Mul` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mul",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise.
-//
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Div",
+		Type: "Asin",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
-
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
-	return func(m optionalAttr) {
-		m["tolerance"] = value
-	}
-}
-
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+//
+// output range specified with 'requested_output_min' and 'requested_output_max'.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "Requantize",
 		Input: []tf.Input{
-			x, y,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
+// of the input Tensor to reduce across. For vectors, use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
+		Type: "ArgMin",
 		Input: []tf.Input{
-			x, y,
+			input, dimension,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
-
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample per batch.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			true_classes,
+			var_, alpha, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
+
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["vocab_size"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Less",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+//
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["delimiter"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute gradients for a FakeQuantWithMinMaxVars operation.
+// Initializes a table from a text file.
 //
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
-// min, max: Quantization interval, scalar floats.
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
 //
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
 //
-// Returns Backpropagated gradients w.r.t. inputs:
-// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
-// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
-// `sum(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsGradient",
-		Input: []tf.Input{
-			gradients, inputs, min, max,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
-//
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			x, y,
+			table_handle, filename,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
-//
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "Atan",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
-//
-// ```
-// Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)
-// ```
-// where
-// ```
-// Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt
-// ```
-// is the upper incomplete Gama function.
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Igammac",
-		Input: []tf.Input{
-			a, x,
-		},
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
+// var: Should be from a Variable().
 //
-// The lower regularized incomplete Gamma function is defined as:
+// Arguments:
 //
-// ```
-// P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)
-// ```
-// where
-// ```
-// gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt
-// ```
-// is the lower incomplete Gamma function.
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Igamma",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			a, x,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-// ```
-// I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}
-// ```
-// where
-//
-// ```
-// B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt
-// ```
+// Returns which elements of x are NaN.
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "IsNan",
 		Input: []tf.Input{
-			a, b, x,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+// Returns an element-wise indication of the sign of a number.
 //
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
 //
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
+		Type: "Sign",
 		Input: []tf.Input{
-			logits,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
-//
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns element-wise smallest integer in not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "Ceil",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
-//
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "Exp",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Selects elements from `t` or `e`, depending on `condition`.
-//
-// The `t`, and `e` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `t` and `e` are scalars.
-// If `t` and `e` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `t`, or must have
-// the same shape as `t`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `t` (if true) or `e` (if false).
-//
-// If `condition` is a vector and `t` and `e` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `t` and `e`.
-// If `condition` has the same shape as `t` and `e`, then it chooses which
-// element to copy from `t` and `e`.
+// Computes the Max along segments of a tensor.
 //
-// For example:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// ```prettyprint
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 6],
-//                              [7, 4]]
+// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum
+// such that:
 //
+// \\(output_i = \max_j data_j\\) where max is over `j` such
+// that `segment_ids[j] == i`.
 //
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
+//  `output[i] = numeric_limits<T>::min()`.
 //
-// ```
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
 // Arguments:
 //
-//	t: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `t` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	e: = A `Tensor` with the same type and shape as `t`.
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
 //
-// Returns = A `Tensor` with the same type and shape as `t` and `e`.
-func Select(scope *Scope, condition tf.Output, t tf.Output, e tf.Output) (output tf.Output) {
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
-			condition, t, e,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
-
-// MatMulTransposeA sets the optional transpose_a attribute to value.
+// Returns x + y element-wise.
 //
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Add",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
+
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["data_format"] = value
 	}
 }
 
-// Multiply the matrix "a" by the matrix "b".
+// Adds `bias` to `value`.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17724,9 +19463,9 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			a, b,
+			value, bias,
 		},
 		Attrs: attrs,
 	}
@@ -17734,32 +19473,108 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 	return op.Output(0)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
 
-// MeanKeepDims sets the optional keep_dims attribute to value.
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the mean of elements across dimensions of a tensor.
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReduceSumSparse",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns x * y element-wise.
 //
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MeanAttr) (output tf.Output) {
+// *NOTE*: `Mul` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mul",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns x / y element-wise.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Div",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+	return func(m optionalAttr) {
+		m["tolerance"] = value
+	}
+}
+
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17768,9 +19583,9 @@ func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			input, reduction_indices,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -17778,149 +19593,145 @@ func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional .
 	return op.Output(0)
 }
 
-// Returns which elements of x are finite.
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "Maximum",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
+
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
-// of the input Tensor to reduce across. For vectors, use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "LogUniformCandidateSampler",
 		Input: []tf.Input{
-			input, dimension,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read [the section on Segmentation](../../api_docs/python/math_ops.md#segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/SegmentSum.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Returns the truth value of (x < y) element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "Less",
 		Input: []tf.Input{
-			data, segment_ids,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
-
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
 
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
 //
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["bad_color"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
+//	out_backprop: Any number of dimensions.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17929,9 +19740,9 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			tag, tensor,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -17939,395 +19750,294 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 	return op.Output(0)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
-//
-// Reshaping does not affect the order of values in the SparseTensor.
-//
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+// Computes the power of one value to another.
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "Pow",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read [the section on
-// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-// of segments.
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
+// The upper regularized incomplete Gamma function is defined as:
 //
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/SegmentProd.png" alt>
-// </div>
+// where
 //
-// Arguments:
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// is the upper incomplete Gama function.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "Igammac",
 		Input: []tf.Input{
-			data, segment_ids,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read [the section on
-// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-// of segments.
-//
-// Computes a tensor such that
-// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
+// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// The lower regularized incomplete Gamma function is defined as:
 //
-// `num_segments` should equal the number of distinct segment IDs.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/UnsortedSegmentSum.png" alt>
-// </div>
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
 //
-// Arguments:
+// where
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
 //
+// is the lower incomplete Gamma function.
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "Igamma",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Read [the section on
-// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-// of segments.
-//
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// For example:
-//
-// ```prettyprint
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-//   ==> [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-//   ==> [[ 1  2  3  4]
-//        [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-//   ==> [[0 0 0 0]
-//        [5 6 7 8]]
-//
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "Atan2",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
+//
+// The regularized incomplete beta integral is defined as:
 //
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
 //
-// Values in `arr` outside of the range [0, size) are ignored.
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
 //
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+// where
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "Betainc",
 		Input: []tf.Input{
-			arr, size, weights,
+			a, b, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
-
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+// Computes log softmax activations.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
-
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// For each batch `i` and class `j` we have
+//
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
+		Type: "LogSoftmax",
 		Input: []tf.Input{
-			grads, image, boxes, box_ind,
+			logits,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
-//
-// ```
-//
-// Arguments:
-//
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
+// Returns the truth value of (x <= y) element-wise.
 //
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// Returns the truth value of x OR y element-wise.
 //
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "LogicalOr",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
+// Selects elements from `t` or `e`, depending on `condition`.
 //
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
+// The `t`, and `e` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `t` and `e` are scalars.
+// If `t` and `e` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `t`, or must have
+// the same shape as `t`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `t` (if true) or `e` (if false).
+//
+// If `condition` is a vector and `t` and `e` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `t` and `e`.
+// If `condition` has the same shape as `t` and `e`, then it chooses which
+// element to copy from `t` and `e`.
 //
 // For example:
 //
-// ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```prettyprint
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 6],
+//                              [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
 // ```
 //
 // Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+//	t: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `t` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	e: = A `Tensor` with the same type and shape as `t`.
+//
+// Returns = A `Tensor` with the same type and shape as `t` and `e`.
+func Select(scope *Scope, condition tf.Output, t tf.Output, e tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "Select",
 		Input: []tf.Input{
-			start, limit, delta,
+			condition, t, e,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
 
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// MatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If true, "a" is transposed before multiplication.
 // If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+func MatMulTransposeA(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// MatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
+// value: If true, "b" is transposed before multiplication.
 // If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+func MatMulTransposeB(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-//
-// accum = accum * momentum + grad
-// var -= lr * accum
+// Multiply the matrix "a" by the matrix "b".
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18336,89 +20046,53 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "MatMul",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			a, b,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns the complex conjugate of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
-//
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Conj",
-		Input: []tf.Input{
-			input,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
-
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
+
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// Computes the mean of elements across dimensions of a tensor.
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "Mean",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -18426,402 +20100,540 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 	return op.Output(0)
 }
 
-// A placeholder op that passes through `input` when its output is not fed.
+// Returns which elements of x are finite.
+//
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsFinite",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the index with the largest value across dimensions of a tensor.
+//
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	input: The default value to produce when `output` is not fed.
-//	shape: The (possibly partial) shape of the tensor.
 //
-// Returns A placeholder tensor that defaults to `input` if it is not fed.
-func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+//	dimension: int32, 0 <= dimension < rank(input).  Describes which dimension
+// of the input Tensor to reduce across. For vectors, use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "PlaceholderWithDefault",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			input,
+			input, dimension,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Computes the sum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
 
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
+//
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["max_images"] = value
 	}
 }
 
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["bad_color"] = value
 	}
 }
 
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// Outputs a `Summary` protocol buffer with images.
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
+//
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ImageSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// Reshapes a SparseTensor to represent values in a new dense shape.
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Tactivation"] = value
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReshape",
+		Input: []tf.Input{
+			input_indices, input_shape, new_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// Bucketizes 'input' based on 'boundaries'.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
+//
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
+//
+// Returns Same shape with 'input', each value of input replaced with bucket index.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
-
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
+	return op.Output(0)
 }
 
-// Returns x * y element-wise, working on quantized buffers.
+// Computes the product along segments of a tensor.
 //
-// Arguments:
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "SegmentProd",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
-
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Computes the sum along segments of a tensor.
 //
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Enqueues zero or more tuples of one or more tensors in the given queue.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
+// Computes a tensor such that
+// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
 //
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Forwards the input to the output.
+// Computes the sum along sparse segments of a tensor.
 //
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// For example:
+//
+// ```prettyprint
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+//   ==> [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+//   ==> [[ 1  2  3  4]
+//        [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+//   ==> [[0 0 0 0]
+//        [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
 //
 // Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
 //
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LoopCond",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			input,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Counts the number of occurrences of each value in an integer array.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
+//
+// Values in `arr` outside of the range [0, size) are ignored.
+//
+// Arguments:
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
+//
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "Bincount",
 		Input: []tf.Input{
-			x, y,
+			arr, size, weights,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// ```
 //
 // Arguments:
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
 //
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			tensor, shape, input_min, input_max,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
-
-// DecodePngChannels sets the optional channels attribute to value.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
+//
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Creates a sequence of numbers.
 //
-// Accepted values are:
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// For example:
 //
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "Range",
 		Input: []tf.Input{
-			contents,
+			start, limit, delta,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// That is for rows we have grad for, we update var and accum as follows:
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18830,119 +20642,82 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			tag, tensor, sample_rate,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
-
-// QrFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes the QR decompositions of one or more matrices.
+// Returns the complex conjugate of a complex number.
 //
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
 //
-// ```prettyprint
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
 //
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+// For example:
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "Conj",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
 
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["dtype"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			tag, tensor,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -18950,42 +20725,51 @@ func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate flo
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
 
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Resize `images` to `size` using nearest neighbor interpolation.
+// Restores a tensor from checkpoint files.
+//
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
+//
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			images, size,
+			file_pattern, tensor_name, shape_and_slice,
 		},
 		Attrs: attrs,
 	}
@@ -18993,69 +20777,39 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 	return op.Output(0)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// A placeholder op that passes through `input` when its output is not fed.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	input: The default value to produce when `output` is not fed.
+//	shape: The (possibly partial) shape of the tensor.
+//
+// Returns A placeholder tensor that defaults to `input` if it is not fed.
+func PlaceholderWithDefault(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "PlaceholderWithDefault",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
-
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
-//
-// The op uses LU decomposition with partial pivoting to compute the inverses.
-//
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Deprecated. Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "TensorArrayReadV2",
 		Input: []tf.Input{
-			input,
+			handle, index, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -19063,88 +20817,65 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 	return op.Output(0)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
 
-// TFRecordReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["Toutput"] = value
 	}
 }
 
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
-		Attrs: attrs,
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
-
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["Tactivation"] = value
 	}
 }
 
-// Solves systems of linear equations.
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19153,42 +20884,42 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			matrix, rhs,
+			a, b, min_a, max_a, min_b, max_b,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
 
-// SumKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["Toutput"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Returns x * y element-wise, working on quantized buffers.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
 //
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...SumAttr) (output tf.Output) {
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19197,237 +20928,313 @@ func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			input, reduction_indices,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
+
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// For example:
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues zero or more tuples of one or more tensors in the given queue.
 //
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="../../images/DynamicPartition.png" alt>
-// </div>
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
+//
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "QueueEnqueueManyV2",
 		Input: []tf.Input{
-			data, partitions,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
-	}
-	return outputs
+	return scope.AddOperation(opspec)
 }
 
-// Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+// Forwards the input to the output.
+//
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
+//
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "LoopCond",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
-//
-// DEPRECATED at GraphDef version 17: Use Reciprocal
+// Returns (x - y)(x - y) element-wise.
 //
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "SquaredDifference",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for integer types.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = 1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			x, y,
+			input, input_min, input_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Restores tensors from a V2 checkpoint.
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
+
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
 //
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the QR decompositions of one or more matrices.
+//
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+//
+// ```prettyprint
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-	return tensors
+	opspec := tf.OpSpec{
+		Type: "Qr",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
 
-// AvgPoolDataFormat sets the optional data_format attribute to value.
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Performs average pooling on the input.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			value,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
@@ -19435,77 +21242,87 @@ func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// Replaces the contents of the table with the specified keys and values.
 //
-// Read [the section on
-// Segmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation
-// of segments.
-//
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
 // Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "LookupTableImportV2",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			table_handle, keys, values,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
+// value: If non-empty, this table is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
 // If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+func HashTableV2SharedName(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A Reader that outputs the entire contents of a file as a value.
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates a non-initialized hash table.
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
+		Type: "HashTableV2",
 
 		Attrs: attrs,
 	}
@@ -19513,137 +21330,100 @@ func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_
 	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
-//
-// Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
+
+// MutableHashTableV2Container sets the optional container attribute to value.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
-		},
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
-
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the filter.
+// Creates an empty hash table.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+		Type: "MutableHashTableV2",
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
-//
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Digamma",
-		Input: []tf.Input{
-			x,
-		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// MinKeepDims sets the optional keep_dims attribute to value.
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the minimum of elements across dimensions of a tensor.
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// Reduces `input` along the dimensions given in `reduction_indices`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	reduction_indices: The dimensions to reduce.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MinAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19652,131 +21432,132 @@ func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			input, reduction_indices,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
-//
-// If `x` and `y` are reals, this will return the floating-point division.
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RealDiv",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
 
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["value_shape"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
-//
-// Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
+// Creates an empty hash table.
 //
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
-		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
-		},
+		Type: "MutableHashTableOfTensorsV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// Table initializer that takes two tensors for keys and values respectively.
+//
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
+//
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InitializeTableV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
+type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
+//
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["num_bits"] = value
 	}
 }
 
-// Returns the shape of a tensor.
+// Compute gradients for a FakeQuantWithMinMaxVars operation.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
+// min, max: Quantization interval, scalar floats.
 //
-// For example:
 //
-// ```prettyprint
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+//
+// Returns Backpropagated gradients w.r.t. inputs:
+// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
+// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
+// `sum(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19785,67 +21566,86 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "FakeQuantWithMinMaxVarsGradient",
 		Input: []tf.Input{
-			input,
+			gradients, inputs, min, max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Inputs are the logits, not probabilities.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Minimum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the sqrt of `x` wrt its input.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "SqrtGrad",
 		Input: []tf.Input{
-			features, labels,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
 // If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// Computes the inverse of one or more square invertible matrices or their
+//
+// adjoints (conjugate transposes).
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
+//
+// The op uses LU decomposition with partial pivoting to compute the inverses.
+//
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19854,9 +21654,9 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			grads, original_image,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -19864,31 +21664,43 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
 
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["container"] = value
 	}
 }
 
-// Computes the gradient of nearest neighbor interpolation.
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19897,109 +21709,121 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
-		Input: []tf.Input{
-			grads, size,
-		},
+		Type: "TFRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
-
-// DecodeJpegChannels sets the optional channels attribute to value.
+// Adjust the saturation of one or more images.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeJpegRatio sets the optional ratio attribute to value.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
+//
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustSaturation",
+		Input: []tf.Input{
+			images, scale,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
+
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
 // If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+		m["compute_v"] = value
 	}
 }
 
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
+//
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+//
+// ```prettyprint
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Arguments:
+//	input: `Tensor` input of shape `[N, N]`.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SelfAdjointEigV2",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
+
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
+// If not specified, defaults to false
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// Solves systems of linear equations.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20008,9 +21832,9 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			contents,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
@@ -20018,45 +21842,43 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 	return op.Output(0)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// SumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Computes the sum of elements across dimensions of a tensor.
 //
-// All elements selected by `indices` must have the same shape.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "Sum",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -20064,236 +21886,297 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Compute the 3-dimensional discrete Fourier Transform of a real-valued signal
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// over the inner-most 3 dimensions of `input`.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier Transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+// `data.shape` must start with `partitions.shape`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// For example:
+//
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
+//
+// See `dynamic_stitch` for an example on how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			input, fft_length,
+			data, partitions,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
+	}
+	return outputs
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "SerializeSparse",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
+// Computes the reciprocal of x element-wise.
 //
-// Arguments:
-//	value: The tensor to be stored.
+// DEPRECATED at GraphDef version 17: Use Reciprocal
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
+		Type: "Inv",
 		Input: []tf.Input{
-			value,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
+// Returns x / y element-wise for integer types.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = 1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "TruncateDiv",
 		Input: []tf.Input{
-			reader_handle,
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
+// Restores tensors from a V2 checkpoint.
+//
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			images, delta,
+			prefix, tensor_names, shape_and_slices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
 
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
 	return func(m optionalAttr) {
-		m["compute_v"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
-//
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+// Performs average pooling on the input.
 //
-// ```prettyprint
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "AvgPool",
 		Input: []tf.Input{
-			input,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Adjust the saturation of one or more images.
+// Computes the mean along sparse segments of a tensor.
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			images, scale,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
 
-// EncodePngCompression sets the optional compression attribute to value.
+// WholeFileReaderV2Container sets the optional container attribute to value.
 //
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
 	return func(m optionalAttr) {
-		m["compression"] = value
+		m["container"] = value
 	}
 }
 
-// PNG-encode an image.
-//
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
-//
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
 //
-// Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20302,83 +22185,95 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
-		Input: []tf.Input{
-			image,
-		},
+		Type: "WholeFileReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
+//
+// Arguments:
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
+//
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadUpToV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle, num_records,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
 
-// MatrixSolveLsFast sets the optional fast attribute to value.
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
+
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
 // If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Rhs is a tensor of shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]
-// in the least squares sense.
-//
-// matrix and right-hand sides in the batch:
-//
-// `matrix`=\\(A \in \Re^{m \times n}\\),
-// `rhs`=\\(B  \in \Re^{m \times k}\\),
-// `output`=\\(X  \in \Re^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda\\).
-//
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^T A + \lambda I)^{-1} A^T B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^T (A A^T + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||Z||_F^2 \\), subject to
-// \\(A Z = B\\). Notice that the fast path is only numerically stable when
-// \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "Conv2DBackpropFilter",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -20386,106 +22281,49 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
-//
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
-//
-// Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "Digamma",
 		Input: []tf.Input{
-			images,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
-
-// ExtractGlimpseCentered sets the optional centered attribute to value.
-//
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["centered"] = value
-	}
-}
-
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
-//
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["normalized"] = value
-	}
-}
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+// MinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
-		m["uniform_noise"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Extracts a glimpse from the input tensor.
-//
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
-//
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
-//
-// The argument `normalized` and `centered` controls how the windows are built:
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// Reduces `input` along the dimensions given in `reduction_indices`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	input: The tensor to reduce.
+//	reduction_indices: The dimensions to reduce.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, reduction_indices tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20494,9 +22332,9 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "Min",
 		Input: []tf.Input{
-			input, size, offsets,
+			input, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -20504,111 +22342,46 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 	return op.Output(0)
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
-
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
+// Returns the number of work units this Reader has finished processing.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "ReaderNumWorkUnitsCompletedV2",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
-
-// CropAndResizeMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
-	}
-}
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
 
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
-//
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
 	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
+		m["out_type"] = value
 	}
 }
 
-// Extracts crops from the input image tensor and bilinearly resizes them (possibly
-//
-// with aspect ratio change) to a common output size specified by `crop_size`. This
-// is more general than the `crop_to_bounding_box` op which extracts a fixed size
-// slice from the input image and does not allow resizing or aspect ratio change.
+// Returns the shape of a tensor.
 //
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
+// For example:
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20617,12 +22390,37 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "Shape",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
+
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Inputs are the logits, not probabilities.
+//
+// Arguments:
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
+//
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftmaxCrossEntropyWithLogits",
+		Input: []tf.Input{
+			features, labels,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
diff --git a/tensorflow/go/operation.go b/tensorflow/go/operation.go
index e8f67c4f7371d078c4f33c6f62b2af50b0af1d92..8fcad61f4c6eec597d2b14fb8c9b4fa59987a829 100644
--- a/tensorflow/go/operation.go
+++ b/tensorflow/go/operation.go
@@ -113,6 +113,11 @@ func (p Output) Shape() Shape {
 }
 
 func (p Output) c() C.TF_Output {
+	if p.Op == nil {
+		// Attempt to provide a more useful panic message than "nil
+		// pointer dereference".
+		panic("nil-Operation. If the Output was created with a Scope object, see Scope.Err() for details.")
+	}
 	return C.TF_Output{oper: p.Op.c, index: C.int(p.Index)}
 }
 
diff --git a/tensorflow/go/session.cpp b/tensorflow/go/session.cpp
deleted file mode 100644
index efa225505b8fc84ddda06177991b74aa0c74a348..0000000000000000000000000000000000000000
--- a/tensorflow/go/session.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
-Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-// TODO(ashankar): Remove this file when TensorFlow 1.1 is released.
-// See lib.go for details.
-
-extern "C" {
-extern void tfDeletePRunHandle(const char* h);
-}
-
-void tfDeletePRunHandle(const char* h) {
-  delete[] h;
-}
diff --git a/tensorflow/go/session.go b/tensorflow/go/session.go
index 3add412dcd8626d61d99313297bada677d8e844e..afa73030b8894c00ae2c619254bdfbc5068c9a53 100644
--- a/tensorflow/go/session.go
+++ b/tensorflow/go/session.go
@@ -199,7 +199,7 @@ func (s *Session) NewPartialRun(feeds, fetches []Output, targets []*Operation) (
 		return nil, err
 	}
 	runtime.SetFinalizer(pr, func(pr *PartialRun) {
-		deletePRunHandle(pr.handle)
+		C.TF_DeletePRunHandle(pr.handle)
 	})
 	return pr, nil
 }
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index f2904ad5a6942227e1952a1dbbd36c896e15aaa8..a8910248c1381b597f2cb4fc5ffb44896aa5aec2 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -54,6 +54,18 @@ java_test(
     ],
 )
 
+java_test(
+    name = "OperationTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/OperationTest.java"],
+    test_class = "org.tensorflow.OperationTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
 java_test(
     name = "SavedModelBundleTest",
     size = "small",
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 1a9c99bd759f373b2622486ddea3ac07654163d8..337b55bccf025190d4eab48a72e27fe3b92c9fd5 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -1,154 +1,18 @@
 # TensorFlow for Java
 
-Java bindings for TensorFlow. ([Javadoc](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary))
-
-[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow)
-
 > *WARNING*: The TensorFlow Java API is not currently covered by the TensorFlow
 > [API stability guarantees](https://www.tensorflow.org/programmers_guide/version_semantics).
 >
-> For using TensorFlow on Android refer to
+> For using TensorFlow on Android refer instead to
 > [contrib/android](https://www.tensorflow.org/code/tensorflow/contrib/android),
 > [makefile](https://www.tensorflow.org/code/tensorflow/contrib/makefile#android)
-> and/or the [Android
-> demo](https://www.tensorflow.org/code/tensorflow/examples/android).
-
-## Quickstart: Using [Apache Maven](https://maven.apache.org)
-
-TensorFlow for Java releases are included in
-[Maven Central](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.tensorflow%22%20AND%20a%3A%22tensorflow%22)
-and support Linux, OS X and Windows. To use it, add the following dependency to
-your project's `pom.xml`:
-
-```xml
-<dependency>
-  <groupId>org.tensorflow</groupId>
-  <artifactId>tensorflow</artifactId>
-  <version>1.1.0-rc0-windows-fix</version>
-</dependency>
-```
-
-That's all. As an example, to create a Maven project for the
-[label image example](https://www.tensorflow.org/code/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java):
-
-1.  Create a `pom.xml`:
-
-    ```xml
-    <project>
-        <modelVersion>4.0.0</modelVersion>
-        <groupId>org.myorg</groupId>
-        <artifactId>label-image</artifactId>
-        <version>1.0-SNAPSHOT</version>
-        <properties>
-          <exec.mainClass>org.tensorflow.examples.LabelImage</exec.mainClass>
-          <!-- The LabelImage example code requires at least JDK 1.7. -->
-          <!-- The maven compiler plugin defaults to a lower version -->
-          <maven.compiler.source>1.7</maven.compiler.source>
-          <maven.compiler.target>1.7</maven.compiler.target>
-        </properties>
-        <dependencies>
-          <dependency>
-            <groupId>org.tensorflow</groupId>
-            <artifactId>tensorflow</artifactId>
-            <version>1.1.0-rc0-windows-fix</version>
-          </dependency>
-        </dependencies>
-    </project>
-    ```
-
-2.  Download the [example source](https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java)
-    into `src/main/java/org/tensorflow/examples`. On Linux and OS X, the following script should work:
-
-    ```sh
-    mkdir -p src/main/java/org/tensorflow/examples
-    curl -L "https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java" -o src/main/java/org/tensorflow/examples/LabelImage.java
-    ```
-
-3.  Compile and execute:
-
-    ```sh
-    mvn compile exec:java
-    ```
-
-## Quickstart: Using `java` and `javac`
-
-This section describes how to use TensorFlow armed with just a JDK installation.
-
-1.  Download the Java archive (JAR):
-    [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.1.0-rc0.jar)
-    (optionally, the Java sources:
-    [libtensorflow-src.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-src-1.1.0-rc0.jar)).
-
-2.  Download the native library. GPU-enabled versions required CUDA 8 and cuDNN
-    5.1. For other versions, the native library will need to be built from
-    source (see below).
-
-    -   Linux:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-linux-x86_64-1.1.0-rc0.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-gpu-linux-x86_64-1.1.0-rc0.tar.gz)
-    -   OS X:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-darwin-x86_64-1.1.0-rc0.tar.gz),
-        [GPU-enabled](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-gpu-darwin-x86_64-1.1.0-rc0.tar.gz)
-    -   Windows:
-        [CPU-only](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.1.0-rc0.zip)
-
-
-    The following shell snippet downloads and extracts the native library on
-    Linux and OS X. For Windows, download and extract manually.
-
-    ```sh
-    TF_TYPE="cpu" # Set to "gpu" to enable GPU support
-    OS=$(uname -s | tr '[:upper:]' '[:lower:]')
-    mkdir -p ./jni
-    curl -L \
-      "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.1.0-rc0.tar.gz" |
-    tar -xz -C ./jni
-    ```
-
-3.  Include the downloaded `.jar` in the classpath during compilation. For
-    example, if your program looks like the following:
-
-    ```java
-    import org.tensorflow.TensorFlow;
-
-    public class MyClass {
-      public static void main(String[] args) {
-        System.out.println("I'm using TensorFlow version: " +  TensorFlow.version());
-      }
-    }
-    ```
-
-    then it should be compiled with:
+> and/or the [Android demo](https://www.tensorflow.org/code/tensorflow/examples/android).
 
-    ```sh
-    javac -cp libtensorflow-1.1.0-rc0.jar MyClass.java
-    ```
+## Quickstart
 
-    For a more sophisticated example, see
-    [LabelImage.java](https://www.tensorflow.org/code/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java),
-    which can be compiled with:
-
-    ```sh
-    javac \
-      -cp libtensorflow-1.1.0-rc0.jar \
-      ./src/main/java/org/tensorflow/examples/LabelImage.java
-    ```
-
-4.  Include the downloaded `.jar` in the classpath and the native library in the
-    library path during execution. For example:
-
-    ```sh
-    java -cp libtensorflow-1.1.0-rc0.jar:. -Djava.library.path=./jni MyClass
-    ```
-
-    or for the `LabelImage` example:
-
-    ```sh
-    java \
-      -Djava.library.path=./jni \
-      -cp libtensorflow-1.1.0-rc0.jar:./src/main/java \
-      org.tensorflow.examples.LabelImage
-    ```
+-   Refer to [Installing TensorFlow for Java](https://www.tensorflow.org/install/install_java)
+-   [Javadoc](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary)
+-   [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow)
 
 ## Building from source
 
@@ -172,7 +36,6 @@ native libraries will need to be built from source.
     brew install swig
     ```
 
-
 3.  [Configure](https://www.tensorflow.org/install/install_sources#configure_the_installation)
     (e.g., enable GPU support) and build:
 
@@ -183,20 +46,39 @@ native libraries will need to be built from source.
       //tensorflow/java:libtensorflow_jni
     ```
 
-The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so` on
-Linux, `libtensorflow_jni.dylib` on OS X, `tensorflow_jni.dll` on Windows) will
-be in `bazel-bin/tensorflow/java`. Using these artifacts follow both steps 3
-and 4 in the previous section in order to get your application
-up and running.
+The command above will produce two files in the `bazel-bin/tensorflow/java`
+directory:
+
+*   An archive of Java classes: `libtensorflow.jar`
+*   A native library: `libtensorflow_jni.so` on Linux, `libtensorflow_jni.dylib`
+    on OS X, or `tensorflow_jni.dll` on Windows.
+
+To compile Java code that uses the TensorFlow Java API, include
+`libtensorflow.jar` in the classpath. For example:
+
+```sh
+javac -cp bazel-bin/tensorflow/java/libtensorflow.jar ...
+```
+
+To execute the compiled program, include `libtensorflow.jar` in the classpath
+and the native library in the library path. For example:
+
+```sh
+java -cp bazel-bin/tensorflow/java/libtensorflow.jar \
+  -Djava.library.path=bazel-bin/tensorflow/java \
+  ...
+```
 
-Installation on Windows requires the more experimental [bazel on Windows](https://bazel.build/versions/master/docs/windows.html).
-Details are elided here, but find inspiration in the script used for
-building the release archive:
+Installation on Windows requires the more experimental [bazel on
+Windows](https://bazel.build/versions/master/docs/windows.html). Details are
+omitted here, but find inspiration in the script used for building the release
+archive:
 [`tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh`](https://www.tensorflow.org/code/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh).
 
 ### Maven
 
-Details of the release process for Maven Central are in [`maven/README.md`](https://www.tensorflow.org/code/tensorflow/java/maven/README.md).
+Details of the release process for Maven Central are in
+[`maven/README.md`](https://www.tensorflow.org/code/tensorflow/java/maven/README.md).
 However, for development, you can push the library built from source to a local
 Maven repository with:
 
@@ -207,14 +89,14 @@ mvn install:install-file \
   -DpomFile=../../bazel-bin/tensorflow/java/pom.xml
 ```
 
-And then rever to this library in a project's `pom.xml` with:
-(replacing 1.0.head with the appropriate version):
+And then refer to this library in a project's `pom.xml` with: (replacing
+VERSION with the appropriate version of TensorFlow):
 
 ```xml
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.0.head</version>
+  <version>VERSION</version>
 </dependency>
 ```
 
diff --git a/tensorflow/java/maven/.gitignore b/tensorflow/java/maven/.gitignore
index 50eda946435b3980130a7ff97c2299543f8406da..0e11e83a0cb649425b2072f24b0d7106c08cff81 100644
--- a/tensorflow/java/maven/.gitignore
+++ b/tensorflow/java/maven/.gitignore
@@ -7,3 +7,5 @@ libtensorflow_jni/src
 libtensorflow_jni/target
 tensorflow/src
 tensorflow/target
+proto/src
+proto/target
diff --git a/tensorflow/java/maven/README.md b/tensorflow/java/maven/README.md
index 19a214f42da7768f5548113f5b5b5ff9d97c07b6..17bb799961d5d4ae040c88a0348d6dd9d1f077d4 100644
--- a/tensorflow/java/maven/README.md
+++ b/tensorflow/java/maven/README.md
@@ -20,7 +20,7 @@ Hence, the process for building and uploading release artifacts is not a single
 
 ## Artifact Structure
 
-There are four artifacts and thus `pom.xml`s involved in this release:
+There are five artifacts and thus `pom.xml`s involved in this release:
 
 1.  `tensorflow`: The single dependency for projects requiring TensorFlow for
     Java. This convenience package depends on the two below, and is the one that
@@ -34,8 +34,11 @@ There are four artifacts and thus `pom.xml`s involved in this release:
 3.  `libtensorflow_jni`: The native libraries required by `libtensorflow`.
     Native code for all supported platforms is packaged into a single `.jar`.
 
-4.  [`parentpom`](https://maven.apache.org/pom/index.html): Common settings
-    shared between the above three.
+4.  `proto`: Generated Java code for TensorFlow protocol buffers
+    (e.g., `MetaGraphDef`, `ConfigProto` etc.)
+
+5.  [`parentpom`](https://maven.apache.org/pom/index.html): Common settings
+    shared by all of the above.
 
 ## Updating the release
 
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index d8d6a50da778f27f26fab3ef72fb244289fc79b6..e8817c3459dd64dc0c4baa66787aec535269c76b 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.1.0-rc0</version>
+    <version>1.1.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index a675859638bf3529efed0c2e7a7274a6da943e3a..65f331979ff23e7a4ee143cd685104234498ecd1 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.1.0-rc0</version>
+    <version>1.1.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 35f9c2ecc340c8d71fe7a48cb786aa76acd442e2..59d798effe8df25645056437c132da2318e754b7 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.1.0-rc0</version>
+  <version>1.1.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
@@ -30,6 +30,7 @@
     <module>libtensorflow</module>
     <module>libtensorflow_jni</module>
     <module>tensorflow</module>
+    <module>proto</module>
   </modules>
 
 
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a306dd67691060690346a2cbe807ee43f359b894
--- /dev/null
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -0,0 +1,62 @@
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+
+  <modelVersion>4.0.0</modelVersion>
+  <description>Java API for TensorFlow protocol buffers.</description>
+  <parent>
+    <groupId>org.tensorflow</groupId>
+    <artifactId>parentpom</artifactId>
+    <version>1.1.0</version>
+    <relativePath>../</relativePath>
+  </parent>
+  <artifactId>proto</artifactId>
+  <packaging>jar</packaging>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.google.protobuf</groupId>
+      <artifactId>protobuf-java</artifactId>
+      <version>3.2.0</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.6.1</version>
+        <configuration>
+          <source>1.8</source>
+          <target>1.8</target>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-source-plugin</artifactId>
+        <version>2.2.1</version>
+        <executions>
+          <execution>
+            <id>attach-sources</id>
+            <goals>
+              <goal>jar-no-fork</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-javadoc-plugin</artifactId>
+        <version>2.9.1</version>
+        <executions>
+          <execution>
+            <id>attach-javadocs</id>
+            <goals>
+              <goal>jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index 684cfe1868ac15a90ecef37ad980e57c8d8fd20a..b5e2bfc3a6fa7d09510108c77fc921d08d628adf 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -23,6 +23,7 @@ IS_SNAPSHOT="false"
 if [[ "${TF_VERSION}" == *"-SNAPSHOT" ]]; then
   IS_SNAPSHOT="true"
 fi
+PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.2.0/protoc-3.2.0-linux-x86_64.zip"
 
 set -ex
 
@@ -81,6 +82,50 @@ download_libtensorflow_jni() {
   cd "${DIR}"
 }
 
+# Ideally, the .jar for generated Java code for TensorFlow protocol buffer files
+# would have been produced by bazel rules. However, protocol buffer library
+# support in bazel is in flux. Once
+# https://github.com/bazelbuild/bazel/issues/2626 has been resolved, perhaps
+# TensorFlow can move to something like
+# https://bazel.build/blog/2017/02/27/protocol-buffers.html
+# for generating C++, Java and Python code for protocol buffers.
+#
+# At that point, perhaps the libtensorflow build scripts
+# (tensorflow/tools/ci_build/builds/libtensorflow.sh) can build .jars for
+# generated code and this function would not need to download protoc to generate
+# code.
+generate_java_protos() {
+  # Clean any previous attempts
+  rm -rf "${DIR}/proto/tmp"
+
+  # Download protoc
+  curl -L "${PROTOC_RELEASE_URL}" -o "/tmp/protoc.zip"
+  mkdir -p "${DIR}/proto/tmp/protoc"
+  unzip -d "${DIR}/proto/tmp/protoc" "/tmp/protoc.zip"
+  rm -f "/tmp/protoc.zip"
+
+  # Download the release archive of TensorFlow protos.
+  if [[ "${IS_SNAPSHOT}" == "true" ]]; then
+    URL="http://ci.tensorflow.org/view/Nightly/job/nightly-libtensorflow/TYPE=cpu-slave/lastSuccessfulBuild/artifact/lib_package/libtensorflow_proto.zip"
+  else
+    URL="${RELEASE_URL_PREFIX}/libtensorflow_proto-${TF_VERSION}.zip"
+  fi
+  curl -L "${URL}" -o /tmp/libtensorflow_proto.zip
+  mkdir -p "${DIR}/proto/tmp/src"
+  unzip -d "${DIR}/proto/tmp/src" "/tmp/libtensorflow_proto.zip"
+  rm -f "/tmp/libtensorflow_proto.zip"
+
+  # Generate Java code
+  mkdir -p "${DIR}/proto/src/main/java"
+  find "${DIR}/proto/tmp/src" -name "*.proto" | xargs \
+  ${DIR}/proto/tmp/protoc/bin/protoc \
+    --proto_path="${DIR}/proto/tmp/src" \
+    --java_out="${DIR}/proto/src/main/java"
+
+  # Cleanup
+  rm -rf "${DIR}/proto/tmp"
+}
+
 if [ -z "${TF_VERSION}" ]
 then
   echo "Must set the TF_VERSION environment variable"
@@ -99,6 +144,7 @@ clean
 update_version_in_pom
 download_libtensorflow
 download_libtensorflow_jni
+generate_java_protos
 # Build the release artifacts
 mvn verify
 # If successfully built, try to deploy.
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index d54face7c4966a90caf2bc170ffaf57d4ca5b62e..74adb35ba8d1fc2332acfc8cca41416c02e2c955 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.1.0-rc0</version>
+    <version>1.1.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Graph.java b/tensorflow/java/src/main/java/org/tensorflow/Graph.java
index 42d7f484644af9d1ebb0b7b5504430c2ba24fbe8..c08fa9b14574a8a219609c754faaa3a395283fd7 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Graph.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Graph.java
@@ -20,7 +20,7 @@ package org.tensorflow;
  *
  * <p>Instances of a Graph are thread-safe.
  *
- * <p><b>WARNING:</b> Resources consumed by the Graph object msut be explicitly freed by invoking
+ * <p><b>WARNING:</b> Resources consumed by the Graph object must be explicitly freed by invoking
  * the {@link #close()} method then the Graph object is no longer needed.
  */
 public final class Graph implements AutoCloseable {
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Operation.java b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
index 48db554e072707ecb33c2008e1af41761f415729..43dbaf125c9b76dcae645b2eb9d8deba42c6c521 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Operation.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Operation.java
@@ -70,6 +70,28 @@ public final class Operation {
     }
   }
 
+  /**
+   * Returns the size of the list of Tensors produced by this operation.
+   *
+   * <p>An Operation has multiple named outputs, each of which produces either
+   * a single tensor or a list of tensors. This method returns the size of
+   * the list of tensors for a specific named output of the operation.
+   *
+   * @param name identifier of the list of tensors (of which there may
+   *        be many) produced by this operation.
+   * @returns the size of the list of Tensors produced by this named output.
+   * @throws IllegalArgumentException if this operation has no output
+   *         with the provided name.
+   */
+  public int outputListLength(final String name) {
+    Graph.Reference r = graph.ref();
+    try {
+      return outputListLength(unsafeNativeHandle, name);
+    } finally {
+      r.close();
+    }
+  }
+
   /** Returns a symbolic handle to one of the tensors produced by this operation. */
   public Output output(int idx) {
     return new Output(this, idx);
@@ -108,6 +130,8 @@ public final class Operation {
 
   private static native int numOutputs(long handle);
 
+  private static native int outputListLength(long handle, String name);
+
   private static native long[] shape(long graphHandle, long opHandle, int output);
 
   private static native int dtype(long graphHandle, long opHandle, int output);
diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
index cd59cf504a7166540818c36c97a58fdc46214bab..38ffa2a8e1932390780e1e762b7be2e7e7b27e8b 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
@@ -73,6 +73,29 @@ public final class OperationBuilder {
     return this;
   }
 
+  /**
+   * Ensure that the operation does not execute before the control operation does.
+   *
+   * <p>A control input is an Operation that must be executed before running the operation currently
+   * being built.
+   *
+   * <p>For example, an Assert operation may be added as a control input for this operation. The
+   * Assert now behaves as a pre-condition that will always verify itself before running the
+   * operation.
+   *
+   * @param control operation that must be executed before running this operation.
+   * @return the OperationBuilder instance for chaining.
+   */
+  public OperationBuilder addControlInput(Operation control) {
+    Graph.Reference r = graph.ref();
+    try {
+      addControlInput(unsafeNativeHandle, control.getUnsafeNativeHandle());
+    } finally {
+      r.close();
+    }
+    return this;
+  }
+
   public OperationBuilder addInputList(Output[] inputs) {
     Graph.Reference r = graph.ref();
     try {
@@ -244,6 +267,8 @@ public final class OperationBuilder {
 
   private static native void addInputList(long handle, long[] opHandles, int[] indices);
 
+  private static native void addControlInput(long handle, long opHandle);
+
   private static native void setDevice(long handle, String device);
 
   // The names of all the setAttr* family functions below correspond to the C library types, not the
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java
index 45332bc73f3ecc9148885a1b750d86c4229df86c..0d071e1674e3a7951248742b911f023a0dce0edf 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Session.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java
@@ -120,10 +120,15 @@ public final class Session implements AutoCloseable {
     /**
      * Avoid evaluating {@code operation} and substitute {@code t} for the value it produces.
      *
-     * <p>This method is a shorthand for {@code feed(operation, 0, t)}.
+     * @param operation Is either the string name of the operation, in which case this method is a
+     *     shorthand for {@code feed(operation, 0)}, or it is a string of the form
+     *     <tt>operation_name:output_index</tt> , in which case this method acts like {@code
+     *     feed(operation_name, output_index)}. These colon-separated names are commonly used in the
+     *     {@code SignatureDef} protocol buffer messages that are included in {@link
+     *     SavedModelBundle.metaGraphDef()}.
      */
     public Runner feed(String operation, Tensor t) {
-      return feed(operation, 0, t);
+      return feed(parseOutput(operation), t);
     }
 
     /**
@@ -155,10 +160,15 @@ public final class Session implements AutoCloseable {
     /**
      * Make {@link #run()} return the output of {@code operation}.
      *
-     * <p>This method is a shorthand for {@code fetch(operation, 0)}
+     * @param operation Is either the string name of the operation, in which case this method is a
+     *     shorthand for {@code fetch(operation, 0)}, or it is a string of the form
+     *     <tt>operation_name:output_index</tt> , in which case this method acts like {@code
+     *     fetch(operation_name, output_index)}. These colon-separated names are commonly used in
+     *     the {@code SignatureDef} protocol buffer messages that are included in {@link
+     *     SavedModelBundle.metaGraphDef()}.
      */
     public Runner fetch(String operation) {
-      return fetch(operation, 0);
+      return fetch(parseOutput(operation));
     }
 
     /**
@@ -345,6 +355,20 @@ public final class Session implements AutoCloseable {
       return op;
     }
 
+    private Output parseOutput(String opName) {
+      int colon = opName.lastIndexOf(':');
+      if (colon == -1 || colon == opName.length() - 1) {
+        return new Output(operationByName(opName), 0);
+      }
+      try {
+        String op = opName.substring(0, colon);
+        int index = Integer.parseInt(opName.substring(colon + 1));
+        return new Output(operationByName(op), index);
+      } catch (NumberFormatException e) {
+        return new Output(operationByName(opName), 0);
+      }
+    }
+
     private ArrayList<Output> inputs = new ArrayList<Output>();
     private ArrayList<Tensor> inputTensors = new ArrayList<Tensor>();
     private ArrayList<Output> outputs = new ArrayList<Output>();
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Shape.java b/tensorflow/java/src/main/java/org/tensorflow/Shape.java
index f6677e9a15d5fa5759afe778e217195892f43036..90d6cf7b85436f9645f279326e5b60a77c4b77f7 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Shape.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Shape.java
@@ -1,16 +1,17 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
 
 package org.tensorflow;
 
diff --git a/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java b/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java
index ff3113372900abcb2a548385edcbe0cc603934ce..c21214b76311249690237af0753d6e65cbf3e230 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/TensorFlow.java
@@ -20,6 +20,15 @@ public final class TensorFlow {
   /** Returns the version of the underlying TensorFlow runtime. */
   public static native String version();
 
+  /**
+   * All the TensorFlow operations available in this address space.
+   *
+   * @return A serialized representation of an <a
+   *     href="https://www.tensorflow.org/code/tensorflow/core/framework/op_def.proto">OpList</a>
+   *     protocol buffer, which lists all the available TensorFlow operations.
+   */
+  public static native byte[] registeredOpList();
+
   private TensorFlow() {}
 
   /** Load the TensorFlow runtime C library. */
@@ -30,5 +39,4 @@ public final class TensorFlow {
   static {
     init();
   }
-
 }
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.cc b/tensorflow/java/src/main/native/operation_builder_jni.cc
index 5724c54f9116c269efdd842646bfc3da47d57ab0..4c54eecd9b5904c3cdd03f2373ea89bf2f14cf63 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.cc
+++ b/tensorflow/java/src/main/native/operation_builder_jni.cc
@@ -115,6 +115,20 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInputList(
   TF_AddInputList(d, o.get(), n);
 }
 
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addControlInput(
+    JNIEnv* env, jclass clazz, jlong handle, jlong op_handle) {
+  if (op_handle == 0) {
+    throwException(env, kIllegalStateException,
+                   "control input is not valid, "
+                   "perhaps the Graph containing it has been closed()?");
+    return;
+  }
+  TF_Operation* control = reinterpret_cast<TF_Operation*>(op_handle);
+  TF_OperationDescription* d = requireHandle(env, handle);
+  if (d == nullptr) return;
+  TF_AddControlInput(d, control);
+}
+
 JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setDevice(
     JNIEnv* env, jclass clazz, jlong handle, jstring device) {
   TF_OperationDescription* d = requireHandle(env, handle);
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.h b/tensorflow/java/src/main/native/operation_builder_jni.h
index ae953c0fd63a473d3336b63ecd2954730cd3aab7..9b64c328203ad406953dea0e9cddcf6f468c043d 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.h
+++ b/tensorflow/java/src/main/native/operation_builder_jni.h
@@ -55,6 +55,14 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInput(
 JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addInputList(
     JNIEnv *, jclass, jlong, jlongArray, jintArray);
 
+/*
+ * Class:     org_tensorflow_OperationBuilder
+ * Method:    addControlInput
+ * Signature: (JJ)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_addControlInput(
+    JNIEnv *, jclass, jlong, jlong);
+
 /*
  * Class:     org_tensorflow_OperationBuilder
  * Method:    setDevice
diff --git a/tensorflow/java/src/main/native/operation_jni.cc b/tensorflow/java/src/main/native/operation_jni.cc
index 32e59bc0aedf59a12c9b85de79e5d4faef8aaf77..b3d5fc4ec374fe6e5214799581878d94315a7ea7 100644
--- a/tensorflow/java/src/main/native/operation_jni.cc
+++ b/tensorflow/java/src/main/native/operation_jni.cc
@@ -66,6 +66,24 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_numOutputs(JNIEnv* env,
   return TF_OperationNumOutputs(op);
 }
 
+JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_outputListLength(JNIEnv* env,
+                                                                      jclass clazz,
+                                                                      jlong handle,
+                                                                      jstring name) {
+  TF_Operation* op = requireHandle(env, handle);
+  if (op == nullptr) return 0;
+
+  TF_Status* status = TF_NewStatus();
+
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  int result = TF_OperationOutputListLength(op, cname, status);
+  env->ReleaseStringUTFChars(name, cname);
+
+  throwExceptionIfNotOK(env, status);
+  TF_DeleteStatus(status);
+  return result;
+}
+
 JNIEXPORT jlongArray JNICALL Java_org_tensorflow_Operation_shape(
     JNIEnv* env, jclass clazz, jlong graph_handle, jlong op_handle,
     jint output_index) {
diff --git a/tensorflow/java/src/main/native/operation_jni.h b/tensorflow/java/src/main/native/operation_jni.h
index fe14882dde3438a8203652ef2aadc0ee3dd9e60c..b5d156f7c2749f7fbba3145f79e269f12e53a055 100644
--- a/tensorflow/java/src/main/native/operation_jni.h
+++ b/tensorflow/java/src/main/native/operation_jni.h
@@ -1,4 +1,3 @@
-
 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -47,6 +46,16 @@ JNIEXPORT jstring JNICALL Java_org_tensorflow_Operation_type(JNIEnv *, jclass,
 JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_numOutputs(JNIEnv *,
                                                                 jclass, jlong);
 
+/*
+ * Class:     org_tensorflow_Operation
+ * Method:    outputListLength
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_Operation_outputListLength(JNIEnv *,
+                                                                      jclass,
+                                                                      jlong,
+                                                                      jstring);
+
 /*
  * Class:     org_tensorflow_Operation
  * Method:    shape
diff --git a/tensorflow/java/src/main/native/tensorflow_jni.cc b/tensorflow/java/src/main/native/tensorflow_jni.cc
index 746550adbd24221e122effc11c28a0bb905fb283..c553582e38d34c67d58bf4501d9c1686b29f9a73 100644
--- a/tensorflow/java/src/main/native/tensorflow_jni.cc
+++ b/tensorflow/java/src/main/native/tensorflow_jni.cc
@@ -20,3 +20,13 @@ JNIEXPORT jstring JNICALL Java_org_tensorflow_TensorFlow_version(JNIEnv* env,
                                                                  jclass clazz) {
   return env->NewStringUTF(TF_Version());
 }
+
+JNIEXPORT jbyteArray JNICALL
+Java_org_tensorflow_TensorFlow_registeredOpList(JNIEnv* env, jclass clazz) {
+  TF_Buffer* buf = TF_GetAllOpList();
+  jint length = static_cast<int>(buf->length);
+  jbyteArray ret = env->NewByteArray(length);
+  env->SetByteArrayRegion(ret, 0, length, static_cast<const jbyte*>(buf->data));
+  TF_DeleteBuffer(buf);
+  return ret;
+}
diff --git a/tensorflow/java/src/main/native/tensorflow_jni.h b/tensorflow/java/src/main/native/tensorflow_jni.h
index 102951c472c38d3ce9ad2c4091eae3507fb6f8df..ecd9b15828dea07ab43ac60a0d148ba17a21af11 100644
--- a/tensorflow/java/src/main/native/tensorflow_jni.h
+++ b/tensorflow/java/src/main/native/tensorflow_jni.h
@@ -30,6 +30,14 @@ extern "C" {
 JNIEXPORT jstring JNICALL Java_org_tensorflow_TensorFlow_version(JNIEnv*,
                                                                  jclass);
 
+/*
+ * Class:     org_tensorflow_TensorFlow
+ * Method:    registeredOpList
+ * Signature: ()[B
+ */
+JNIEXPORT jbyteArray JNICALL
+Java_org_tensorflow_TensorFlow_registeredOpList(JNIEnv*, jclass);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
index 951136180def4401a7c72beb6173ffad54ca5e12..b3bc3aaef9ce1e65f0bef74b269c8ae5ce19dcef 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
@@ -101,7 +101,7 @@ public class OperationBuilderTest {
       assertTrue(hasNode(g, "StringAndBool"));
       // int (TF "int" attributes are 64-bit signed, so a Java long).
       g.opBuilder("RandomUniform", "Int")
-          .addInput(TestUtil.constant(g, "RandomUniformShape", 1))
+          .addInput(TestUtil.constant(g, "RandomUniformShape", new int[]{1}))
           .setAttr("seed", 10)
           .setAttr("dtype", DataType.FLOAT)
           .build();
@@ -149,6 +149,33 @@ public class OperationBuilderTest {
     }
   }
 
+  @Test
+  public void addControlInput() {
+    try (Graph g = new Graph();
+        Session s = new Session(g);
+        Tensor yes = Tensor.create(true);
+        Tensor no = Tensor.create(false)) {
+      Output placeholder = TestUtil.placeholder(g, "boolean", DataType.BOOL);
+      Operation check =
+          g.opBuilder("Assert", "assert")
+              .addInput(placeholder)
+              .addInputList(new Output[] {placeholder})
+              .build();
+      Operation noop = g.opBuilder("NoOp", "noop").addControlInput(check).build();
+
+      // No problems when the Assert check succeeds
+      s.runner().feed(placeholder, yes).addTarget(noop).run();
+
+      // Exception thrown by the execution of the Assert node
+      try {
+        s.runner().feed(placeholder, no).addTarget(noop).run();
+        fail("Did not run control operation.");
+      } catch (IllegalArgumentException e) {
+        // expected
+      }
+    }
+  }
+
   private static boolean hasNode(Graph g, String name) {
     return g.operation(name) != null;
   }
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..101839e6d74536d82aa1eaa5b3a9bd11b6462ca6
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationTest.java
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+
+/** Unit tests for {@link org.tensorflow.Operation}. */
+@RunWith(JUnit4.class)
+public class OperationTest {
+
+  @Test
+  public void outputListLengthFailsOnInvalidName() {
+    try (Graph g = new Graph()) {
+      Operation op =
+          g.opBuilder("Add", "Add")
+              .addInput(TestUtil.constant(g, "x", 1))
+              .addInput(TestUtil.constant(g, "y", 2))
+              .build();
+      assertEquals(1, op.outputListLength("z"));
+
+      try {
+        op.outputListLength("unknown");
+        fail("Did not catch bad name");
+      } catch (IllegalArgumentException iae) {
+        // expected
+      }
+    }
+  }
+
+  @Test
+  public void outputListLength() {
+    assertEquals(1, split(new int[] {0, 1}, 1));
+    assertEquals(2, split(new int[] {0, 1}, 2));
+    assertEquals(3, split(new int[] {0, 1, 2}, 3));
+  }
+
+  private int split(int[] values, int num_split) {
+    try (Graph g = new Graph()) {
+      return g.opBuilder("Split", "Split")
+          .addInput(TestUtil.constant(g, "split_dim", 0))
+          .addInput(TestUtil.constant(g, "values", values))
+          .setAttr("num_split", num_split)
+          .build()
+          .outputListLength("output");
+    }
+  }
+}
diff --git a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
index 2ccbdf51bc5a8287eb552ebf99925074aaa7fb3d..0d2dbc5b88006d497bfcf8d70c48ed7bb93d5538 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/SessionTest.java
@@ -62,6 +62,36 @@ public class SessionTest {
     }
   }
 
+  @Test
+  public void runUsingColonSeparatedNames() {
+    try (Graph g = new Graph();
+        Session s = new Session(g)) {
+      Operation split =
+          g.opBuilder("Split", "Split")
+              .addInput(TestUtil.constant(g, "split_dim", 0))
+              .addInput(TestUtil.constant(g, "value", new int[] {1, 2, 3, 4}))
+              .setAttr("num_split", 2)
+              .build();
+      g.opBuilder("Add", "Add")
+          .addInput(split.output(0))
+          .addInput(split.output(1))
+          .build()
+          .output(0);
+      // Fetch using colon separated names.
+      try (Tensor fetched = s.runner().fetch("Split:1").run().get(0)) {
+        final int[] expected = {3, 4};
+        assertArrayEquals(expected, fetched.copyTo(new int[2]));
+      }
+      // Feed using colon separated names.
+      try (Tensor fed = Tensor.create(new int[] {4, 3, 2, 1});
+          Tensor fetched =
+              s.runner().feed("Split:0", fed).feed("Split:1", fed).fetch("Add").run().get(0)) {
+        final int[] expected = {8, 6, 4, 2};
+        assertArrayEquals(expected, fetched.copyTo(new int[4]));
+      }
+    }
+  }
+
   @Test
   public void runWithMetadata() {
     try (Graph g = new Graph();
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java
index ff89aeffbbc4042920d476f75c043ea83a9aa490..a31ea900d1c86d6077972662d05a885820dc8a3a 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorFlowTest.java
@@ -28,4 +28,12 @@ public class TensorFlowTest {
   public void version() {
     assertTrue(TensorFlow.version().length() > 0);
   }
+
+  @Test
+  public void registeredOpList() {
+    // Would be nice to actually parse the output as a tensorflow.OpList protocol buffer message,
+    // but as of May 2017, bazel support for generating Java code from protocol buffer definitions
+    // was not sorted out. Revisit? Till then, at least exercise the code.
+    assertTrue(TensorFlow.registeredOpList().length > 0);
+  }
 }
diff --git a/tensorflow/opensource_only/eigen.threadpool b/tensorflow/opensource_only/eigen.threadpool
deleted file mode 100644
index d2639af4d97dedc482bbdecac28a8639c7659d65..0000000000000000000000000000000000000000
--- a/tensorflow/opensource_only/eigen.threadpool
+++ /dev/null
@@ -1 +0,0 @@
-#include "unsupported/Eigen/CXX11/ThreadPool"
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 9db763ce78fd270182c53136c4ceeb041b745088..a20b86a2352ad411a6835a5bc39f168d1a1a16c4 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -25,6 +25,7 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_py
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_lib_deps")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_plugin_deps")
 load("//tensorflow/python:build_defs.bzl", "tf_gen_op_wrapper_private_py")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "tf_additional_verbs_deps")
 
 py_library(
     name = "python",
@@ -41,6 +42,7 @@ py_library(
         "//tensorflow/tools/quantization:__pkg__",  # TODO(b/34059704): remove when fixed
     ],
     deps = [
+        ":tf_optimizer",
         ":array_ops",
         ":check_ops",
         ":client",
@@ -60,7 +62,6 @@ py_library(
         ":nn",
         ":platform",
         ":script_ops",
-        ":sdca_ops",
         ":session_ops",
         ":sets",
         ":sparse_ops",
@@ -80,7 +81,9 @@ py_library(
         ":weights_broadcast_ops",
         "//third_party/py/numpy",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/ops/losses",
+        "//tensorflow/python/ops/distributions",
         "//tensorflow/python/saved_model",
     ] + if_not_windows([
         "//tensorflow/contrib:contrib_py",
@@ -170,6 +173,8 @@ cc_library(
     hdrs = ["lib/core/ndarray_tensor_bridge.h"],
     deps = [
         ":numpy_lib",
+        "//tensorflow/c:c_api",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -583,12 +588,19 @@ py_library(
         ":platform",
         ":platform_test",
         ":pywrap_tensorflow",
+        ":training",
         ":util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
+py_library(
+    name = "distributed_framework_test_lib",
+    srcs_version = "PY2AND3",
+    deps = [":framework_test_lib"],
+)
+
 py_library(
     name = "client_testlib",
     srcs = ["platform/test.py"],
@@ -1016,7 +1028,16 @@ tf_gen_op_wrapper_private_py(
     require_shape_functions = True,
     visibility = [
         "//learning/brain/python/ops:__pkg__",
-        "//tensorflow/contrib/lookup:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
+tf_gen_op_wrapper_private_py(
+    name = "dataset_ops_gen",
+    require_shape_functions = True,
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow:__subpackages__",
         "//tensorflow/python/kernel_tests:__pkg__",
     ],
 )
@@ -1051,6 +1072,16 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "lookup_ops_gen",
+    require_shape_functions = True,
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/contrib/lookup:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "math_ops_gen",
     require_shape_functions = True,
@@ -1318,6 +1349,7 @@ py_library(
     deps = [
         ":array_ops",
         ":clip_ops",
+        ":data_flow_grad",
         ":data_flow_ops",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -1468,6 +1500,20 @@ py_library(
     ],
 )
 
+py_library(
+    name = "lookup_ops",
+    srcs = ["ops/lookup_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":framework",
+        ":framework_for_generated_wrappers",
+        ":lookup_ops_gen",
+        ":math_ops",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "math_grad",
     srcs = ["ops/math_grad.py"],
@@ -1497,6 +1543,7 @@ py_library(
         ":framework_ops",
         ":graph_util",
         ":math_ops_gen",
+        ":nn_ops_gen",
         ":sparse_ops_gen",
         ":sparse_tensor",
         ":spectral_ops_gen",
@@ -1555,6 +1602,7 @@ py_library(
         ":rnn",
         ":sparse_ops",
         ":util",
+        ":variables",
     ],
 )
 
@@ -1667,6 +1715,7 @@ py_library(
     deps = [
         ":array_ops",
         ":framework_for_generated_wrappers",
+        ":layers_base",
         ":util",
     ],
 )
@@ -1853,6 +1902,7 @@ py_library(
         ":io_ops",
         ":linalg_ops",
         ":logging_ops",
+        ":lookup_ops",
         ":math_grad",
         ":math_ops",
         ":numerics",
@@ -2152,6 +2202,7 @@ cuda_py_test(
     srcs = ["ops/math_ops_test.py"],
     additional_deps = [
         ":array_ops",
+        ":errors",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":gradients",
@@ -2204,6 +2255,9 @@ cuda_py_test(
         ":nn",
         ":nn_grad",
         ":nn_ops",
+        ":partitioned_variables",
+        ":variable_scope",
+        ":variables",
         "//third_party/py/numpy",
     ],
 )
@@ -2257,6 +2311,7 @@ py_library(
         ":io_ops",
         ":io_ops_gen",
         ":lib",
+        ":lookup_ops",
         ":math_ops",
         ":platform",
         ":protos_all_py",
@@ -2264,6 +2319,7 @@ py_library(
         ":random_ops",
         ":resource_variable_ops",
         ":resources",
+        ":sdca_ops",
         ":sparse_ops",
         ":state_ops",
         ":string_ops",
@@ -2373,6 +2429,50 @@ py_test(
     ],
 )
 
+py_test(
+    name = "tf_contextlib_test",
+    size = "small",
+    srcs = ["util/tf_contextlib_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":util",
+    ],
+)
+
+py_test(
+    name = "tf_decorator_test",
+    size = "small",
+    srcs = ["util/tf_decorator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":util",
+    ],
+)
+
+py_test(
+    name = "tf_should_use_test",
+    size = "small",
+    srcs = ["util/tf_should_use_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":util",
+    ],
+)
+
+py_test(
+    name = "tf_inspect_test",
+    size = "small",
+    srcs = ["util/tf_inspect_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":util",
+    ],
+)
+
 py_library(
     name = "util_example_parser_configuration",
     srcs = ["util/example_parser_configuration.py"],
@@ -2535,6 +2635,7 @@ tf_py_wrap_cc(
         "client/tf_session.i",
         "framework/cpp_shape_inference.i",
         "framework/python_op_gen.i",
+        "grappler/tf_optimizer.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
         "lib/io/file_io.i",
@@ -2563,6 +2664,9 @@ tf_py_wrap_cc(
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core:lib",
         "//tensorflow/core:reader_base",
         "//tensorflow/core/debug",
@@ -2570,7 +2674,9 @@ tf_py_wrap_cc(
         "//tensorflow/tools/graph_transforms:transform_graph_lib",
         "//tensorflow/tools/tfprof/internal:print_model_analysis",
         "//util/python:python_headers",
-    ] + tf_additional_lib_deps() + tf_additional_plugin_deps(),
+    ] + (tf_additional_lib_deps() +
+         tf_additional_plugin_deps() +
+         tf_additional_verbs_deps()),
 )
 
 py_library(
@@ -2585,6 +2691,7 @@ py_library(
         ":errors",
         ":pywrap_tensorflow",
         ":util",
+        "@six_archive//:six",
     ],
 )
 
@@ -2739,6 +2846,7 @@ cuda_py_test(
     additional_deps = [
         ":client",
         ":client_testlib",
+        ":distributed_framework_test_lib",
         ":framework_for_generated_wrappers",
         ":partitioned_variables",
         ":training",
@@ -2871,6 +2979,26 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "moving_averages_test",
+    size = "small",
+    srcs = [
+        "training/moving_averages_test.py",
+    ],
+    additional_deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":constant_op",
+        ":dtypes",
+        ":framework_for_generated_wrappers",
+        ":framework_ops",
+        ":training",
+        ":variable_scope",
+        ":variables",
+    ],
+    tags = ["notsan"],
+)
+
 cuda_py_tests(
     name = "training_tests",
     size = "small",
@@ -2879,13 +3007,13 @@ cuda_py_tests(
         "training/adagrad_da_test.py",
         "training/adagrad_test.py",
         "training/basic_loops_test.py",
+        "training/checkpoint_utils_test.py",
         "training/coordinator_test.py",
         "training/device_setter_test.py",
         "training/ftrl_test.py",
         "training/gradient_descent_test.py",
         "training/learning_rate_decay_test.py",
         "training/momentum_test.py",
-        "training/moving_averages_test.py",
         "training/optimizer_test.py",
         "training/proximal_adagrad_test.py",
         "training/proximal_gradient_descent_test.py",
@@ -2908,6 +3036,7 @@ cuda_py_tests(
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
+        ":lookup_ops",
         ":gradients",
         ":math_ops",
         ":nn_grad",
@@ -2938,7 +3067,7 @@ py_library(
     srcs = ["training/saver_test_utils.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":data_flow_ops_gen",
+        ":lookup_ops_gen",
         ":training",
     ],
 )
@@ -3181,16 +3310,35 @@ py_tests(
 )
 
 py_library(
-    name = "layers",
+    name = "layers_base",
     srcs = [
         "layers/__init__.py",
         "layers/base.py",
+        "layers/utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":framework",
+        ":framework_for_generated_wrappers",
+        ":init_ops",
+        ":util",
+        ":variable_scope",
+        ":variables",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "layers",
+    srcs = [
         "layers/convolutional.py",
         "layers/core.py",
         "layers/layers.py",
         "layers/normalization.py",
         "layers/pooling.py",
-        "layers/utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -3199,6 +3347,7 @@ py_library(
         ":framework",
         ":framework_for_generated_wrappers",
         ":init_ops",
+        ":layers_base",
         ":math_ops",
         ":nn",
         ":standard_ops",
@@ -3307,39 +3456,6 @@ py_test(
     ],
 )
 
-py_library(
-    name = "docs",
-    srcs = ["framework/docs.py"],
-    srcs_version = "PY2AND3",
-)
-
-py_library(
-    name = "gen_docs_combined_lib",
-    srcs = ["framework/gen_docs_combined.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":docs",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
-        "//tensorflow/python/debug:debug_py",
-    ],
-)
-
-py_binary(
-    name = "gen_docs_combined",
-    srcs = ["framework/gen_docs_combined.py"],
-    main = "framework/gen_docs_combined.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":client",
-        ":docs",
-        ":framework",
-        ":framework_for_generated_wrappers",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python/debug:debug_py",
-    ],
-)
-
 # -----------------------------------------------------------------------------
 # Quantization
 
@@ -3489,3 +3605,45 @@ cuda_py_test(
     ],
     main = "client/session_benchmark.py",
 )
+
+py_library(
+    name = "tf_optimizer",
+    srcs = [
+        "grappler/tf_optimizer.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [":pywrap_tensorflow_internal"],
+)
+
+py_test(
+    name = "tf_optimizer_test",
+    size = "small",
+    srcs = ["grappler/tf_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],  # tf_optimizer is not available in pip.
+    deps = [
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        ":tf_optimizer",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "memory_optimizer_test",
+    size = "medium",
+    srcs = [
+        "grappler/memory_optimizer_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        ":tf_optimizer",
+        "//tensorflow/core:protos_all_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 3663d8af7aea2a2f9dce083d503b389bb84aff7d..d4f8b8b2f81a9b65b2df311a01fa43d6eeee305d 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -26,11 +26,9 @@ import tensorflow as tf
 
 import ctypes
 import importlib
-import inspect
 import sys
 import traceback
 
-
 # TODO(drpng): write up instructions for editing this file in a doc and point to
 # the doc instead.
 # If you want to edit this file to expose modules in public tensorflow API, you
@@ -57,6 +55,7 @@ from tensorflow.core.framework.summary_pb2 import *
 from tensorflow.core.framework.attr_value_pb2 import *
 from tensorflow.core.protobuf.meta_graph_pb2 import TensorInfo
 from tensorflow.core.protobuf.config_pb2 import *
+from tensorflow.core.protobuf.tensorflow_server_pb2 import *
 from tensorflow.core.protobuf.rewriter_config_pb2 import *
 from tensorflow.core.util.event_pb2 import *
 
@@ -76,11 +75,11 @@ from tensorflow.python.ops.standard_ops import *
 
 # Bring in subpackages.
 from tensorflow.python.estimator import estimator_lib as estimator
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
-from tensorflow.python.ops import sdca_ops as sdca
 from tensorflow.python.ops import sets
 from tensorflow.python.ops import spectral_ops as spectral
 from tensorflow.python.ops.losses import losses
@@ -132,7 +131,9 @@ from tensorflow.python.ops import tensor_array_ops
 # documentation, or remove.
 _allowed_symbols = [
     'AttrValue',
+    'AutoParallelOptions',
     'ConfigProto',
+    'ClusterDef',
     'DeviceSpec',
     'Event',
     'GPUOptions',
@@ -170,7 +171,7 @@ _allowed_symbols.extend([
     'parse_single_sequence_example',
     'serialize_many_sparse',
     'serialize_sparse',
-    'sparse_matmul',   ## use tf.matmul instead.
+    'sparse_matmul',  ## use tf.matmul instead.
 ])
 
 # This is needed temporarily because we import it explicitly.
@@ -212,6 +213,7 @@ _allowed_symbols.extend([
     'compat',
     'errors',
     'estimator',
+    'feature_column',
     'flags',
     'gfile',
     'graph_util',
@@ -224,7 +226,6 @@ _allowed_symbols.extend([
     'python_io',
     'resource_loader',
     'saved_model',
-    'sdca',
     'sets',
     'spectral',
     'summary',
diff --git a/tensorflow/python/client/events_writer.i b/tensorflow/python/client/events_writer.i
index ab83688074d806dfe4437052132db0426956a72b..de030fcb4282912475ed8853bae9d41cde2c085d 100644
--- a/tensorflow/python/client/events_writer.i
+++ b/tensorflow/python/client/events_writer.i
@@ -28,6 +28,7 @@ limitations under the License.
 %unignore tensorflow::EventsWriter;
 %unignore tensorflow::EventsWriter::EventsWriter;
 %unignore tensorflow::EventsWriter::~EventsWriter;
+%unignore tensorflow::EventsWriter::InitWithSuffix;
 %unignore tensorflow::EventsWriter::FileName;
 %rename("_WriteSerializedEvent") tensorflow::EventsWriter::WriteSerializedEvent;
 %unignore tensorflow::EventsWriter::Flush;
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 6900ac9a4f4abefa8612d6204f5a51a41576d3ec..05ba1a0d4302cbf347fb0c6ee5c804fd0d4b057b 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -594,6 +594,9 @@ class BaseSession(SessionInterface):
       try:
         status = tf_session.TF_NewStatus()
         tf_session.TF_DeleteDeprecatedSession(self._session, status)
+      except AttributeError:
+        # 'NoneType' object has no attribute 'TF_NewStatus'
+        pass
       finally:
         if status is not None:
           tf_session.TF_DeleteStatus(status)
@@ -989,13 +992,100 @@ class BaseSession(SessionInterface):
     movers = self._update_with_movers(feed_dict_string, feed_map)
     final_fetches = fetch_handler.fetches()
     final_targets = fetch_handler.targets()
-    if final_fetches or final_targets:
+    # We only want to really perform the run if fetches or targets are provided,
+    # or if the call is a partial run that specifies feeds.
+    if final_fetches or final_targets or (handle and feed_dict_string):
       results = self._do_run(handle, final_targets, final_fetches,
                              feed_dict_string, options, run_metadata)
     else:
       results = []
     return fetch_handler.build_results(self, results)
 
+  def make_callable(self, fetches, feed_list=None):
+    """Returns a Python callable that runs a particular step.
+
+    The returned callable will take `len(feed_list)` arguments whose types
+    must be compatible feed values for the respective elements of `feed_list`.
+    For example, if element `i` of `feed_list` is a `tf.Tensor`, the `i`th
+    argument to the returned callable must be a numpy ndarray (or something
+    convertible to an ndarray) with matching element type and shape. See
+    @{tf.Session.run} for details of the allowable feed key and value types.
+
+    The returned callable will have the same return type as
+    `tf.Session.run(fetches, ...)`. For example, if `fetches` is a `tf.Tensor`,
+    the callable will return a numpy ndarray; if `fetches` is a `tf.Operation`,
+    it will return `None`.
+
+    Args:
+      fetches: A value or list of values to fetch. See @{tf.Session.run}
+        for details of the allowable fetch types.
+      feed_list: (Optional.) A list of `feed_dict` keys. See
+        @{tf.Session.run} for details of the allowable feed key types.
+
+    Returns:
+      A function that when called will execute the step defined by
+      `feed_list` and `fetches` in this session.
+
+    Raises:
+      TypeError: If `fetches` or `feed_list` cannot be interpreted
+        as arguments to @{tf.Session.run}.
+    """
+    if feed_list is not None:
+      if not isinstance(feed_list, (list, tuple)):
+        raise TypeError('`feed_list` must be a list or tuple.')
+      # Delegate any non-empty feed lists to the existing `run()` logic.
+      # TODO(mrry): Refactor the feed handling logic from
+      # `Session._run()` so that we can convert the feeds to a list of
+      # strings here.
+      def _generic_run(*feed_args):
+        feed_dict = {feed: feed_val
+                     for feed, feed_val in zip(feed_list, feed_args)}
+        return self.run(fetches, feed_dict=feed_dict)
+      return _generic_run
+
+    # Ensure any changes to the graph are reflected in the runtime.
+    # Note that we don't need to do this on subsequent calls to the
+    # returned object, because the arguments to `fetches` must already be
+    # in the graph.
+    self._extend_graph()
+
+    # Create a fetch handler to take care of the structure of fetches.
+    fetch_handler = _FetchHandler(self._graph, fetches, {})
+    fetch_list_as_strings = fetch_handler.fetches()
+    target_list_as_strings = fetch_handler.targets()
+
+    if isinstance(fetches, ops.Operation):
+      # Special case for fetching a single operation, because the
+      # function will have no return value.
+      assert not fetch_list_as_strings
+      assert len(target_list_as_strings) == 1
+      def _single_operation_run():
+        with errors.raise_exception_on_not_ok_status() as status:
+          tf_session.TF_Run(self._session, None, {}, [],
+                            target_list_as_strings, status, None)
+      return _single_operation_run
+    elif isinstance(fetches, ops.Tensor):
+      # Special case for fetching a single tensor, because the
+      # function can return the result of `TF_Run()` directly.
+      assert len(fetch_list_as_strings) == 1
+      assert not target_list_as_strings
+      def _single_tensor_run():
+        with errors.raise_exception_on_not_ok_status() as status:
+          results = tf_session.TF_Run(self._session, None, {},
+                                      fetch_list_as_strings, [], status, None)
+        return results[0]
+      return _single_tensor_run
+    else:
+      # In all other cases, we must use `fetch_handler` to build the
+      # results for us.
+      def _fetch_handler_run():
+        with errors.raise_exception_on_not_ok_status() as status:
+          results = tf_session.TF_Run(self._session, None, {},
+                                      fetch_list_as_strings,
+                                      target_list_as_strings, status, None)
+        return fetch_handler.build_results(self, results)
+      return _fetch_handler_run
+
   # Captures the name of a node in an error status.
   _NODEDEF_NAME_RE = re.compile(r'\[\[Node: ([^ ]*?) =')
 
@@ -1094,8 +1184,9 @@ class BaseSession(SessionInterface):
     if tensors_to_delete:
       feeds = {}
       fetches = []
-      for tensor_handle in tensors_to_delete:
+      for deleter_key, tensor_handle in enumerate(tensors_to_delete):
         holder, deleter = session_ops._get_handle_deleter(self.graph,
+                                                          deleter_key,
                                                           tensor_handle)
         feeds[holder] = tensor_handle
         fetches.append(deleter)
diff --git a/tensorflow/python/client/session_benchmark.py b/tensorflow/python/client/session_benchmark.py
index 614eede6816b76bd4ad21595fde9a106c2cf3c05..721bca91b71aa00479c27fad102d5888d58d35b1 100644
--- a/tensorflow/python/client/session_benchmark.py
+++ b/tensorflow/python/client/session_benchmark.py
@@ -92,26 +92,127 @@ class SessionBenchmark(test.Benchmark):
     print("%s %d %f" % (name, size, np.median(times)))
     self.report_benchmark(iters=1, wall_time=np.median(times), name=name)
 
+  def _benchmarkFetchPrebuilt(self, name, target, size, iters):
+    """Runs a microbenchmark to measure the cost of fetching a tensor.
+
+    Reports the median cost of fetching a tensor of `size` * `sizeof(float)`
+    bytes.
+
+    Args:
+      name: A human-readable name for logging the output.
+      target: The session target to use for the benchmark.
+      size: The number of floating-point numbers to be fetched.
+      iters: The number of iterations to perform.
+    """
+    times = []
+    with ops.Graph().as_default():
+      # Define the tensor to be fetched as a variable, to avoid
+      # constant-folding.
+      v = variables.Variable(random_ops.random_normal([size]))
+      with session.Session(target) as sess:
+        sess.run(v.initializer)
+        runner = sess.make_callable(v)
+        runner()  # Warm-up run.
+        for _ in xrange(iters):
+          start_time = time.time()
+          runner()
+          end_time = time.time()
+          times.append(end_time - start_time)
+    print("%s %d %f" % (name, size, np.median(times)))
+    self.report_benchmark(iters=1, wall_time=np.median(times), name=name)
+
+  def _benchmarkRunOp(self, name, target, iters):
+    """Runs a microbenchmark to measure the cost of running an op.
+
+    Reports the median cost of running a trivial (Variable) op.
+
+    Args:
+      name: A human-readable name for logging the output.
+      target: The session target to use for the benchmark.
+      iters: The number of iterations to perform.
+    """
+    times = []
+    with ops.Graph().as_default():
+      # Define the op to be run as a variable, to avoid
+      # constant-folding.
+      v = variables.Variable(random_ops.random_normal([]))
+      with session.Session(target) as sess:
+        sess.run(v.initializer)
+        sess.run(v.op)  # Warm-up run.
+        for _ in xrange(iters):
+          start_time = time.time()
+          sess.run(v.op)
+          end_time = time.time()
+          times.append(end_time - start_time)
+    print("%s %f" % (name, np.median(times)))
+    self.report_benchmark(iters=1, wall_time=np.median(times), name=name)
+
+  def _benchmarkRunOpPrebuilt(self, name, target, iters):
+    """Runs a microbenchmark to measure the cost of running an op.
+
+    Reports the median cost of running a trivial (Variable) op.
+
+    Args:
+      name: A human-readable name for logging the output.
+      target: The session target to use for the benchmark.
+      iters: The number of iterations to perform.
+    """
+    times = []
+    with ops.Graph().as_default():
+      # Define the op to be run as a variable, to avoid
+      # constant-folding.
+      v = variables.Variable(random_ops.random_normal([]))
+      with session.Session(target) as sess:
+        sess.run(v.initializer)
+        runner = sess.make_callable(v.op)
+        runner()  # Warm-up run.
+        for _ in xrange(iters):
+          start_time = time.time()
+          runner()
+          end_time = time.time()
+          times.append(end_time - start_time)
+    print("%s %f" % (name, np.median(times)))
+    self.report_benchmark(iters=1, wall_time=np.median(times), name=name)
+
   def benchmarkGrpcSession(self):
     server = server_lib.Server.create_local_server()
     self._benchmarkFeed("benchmark_session_feed_grpc_4B", server.target, 1,
-                        10000)
+                        30000)
     session.Session.reset(server.target)
-    self._benchmarkFeed("benchmark_session_feed_grpc_4MB", server.target, 1
-                        << 20, 100)
+    self._benchmarkFeed("benchmark_session_feed_grpc_4MB", server.target,
+                        1 << 20, 25000)
     session.Session.reset(server.target)
     self._benchmarkFetch("benchmark_session_fetch_grpc_4B", server.target, 1,
-                         20000)
+                         40000)
+    session.Session.reset(server.target)
+    self._benchmarkFetch("benchmark_session_fetch_grpc_4MB", server.target,
+                         1 << 20, 20000)
     session.Session.reset(server.target)
-    self._benchmarkFetch("benchmark_session_fetch_grpc_4MB", server.target, 1
-                         << 20, 100)
+    self._benchmarkFetchPrebuilt("benchmark_session_fetchprebuilt_grpc_4B",
+                                 server.target, 1, 50000)
+    session.Session.reset(server.target)
+    self._benchmarkFetchPrebuilt("benchmark_session_fetchprebuilt_grpc_4MB",
+                                 server.target, 1 << 20, 50000)
+    session.Session.reset(server.target)
+    self._benchmarkRunOp("benchmark_session_runop_grpc", server.target, 50000)
+    session.Session.reset(server.target)
+    self._benchmarkRunOpPrebuilt("benchmark_session_runopprebuilt_grpc",
+                                 server.target, 100000)
     session.Session.reset(server.target)
 
   def benchmarkDirectSession(self):
-    self._benchmarkFeed("benchmark_session_feed_direct_4B", "", 1, 5000)
-    self._benchmarkFeed("benchmark_session_feed_direct_4MB", "", 1 << 20, 200)
-    self._benchmarkFetch("benchmark_session_fetch_direct_4B", "", 1, 5000)
-    self._benchmarkFetch("benchmark_session_fetch_direct_4MB", "", 1 << 20, 100)
+    self._benchmarkFeed("benchmark_session_feed_direct_4B", "", 1, 80000)
+    self._benchmarkFeed("benchmark_session_feed_direct_4MB", "", 1 << 20, 20000)
+    self._benchmarkFetch("benchmark_session_fetch_direct_4B", "", 1, 100000)
+    self._benchmarkFetch("benchmark_session_fetch_direct_4MB", "", 1 << 20,
+                         20000)
+    self._benchmarkFetchPrebuilt("benchmark_session_fetchprebuilt_direct_4B",
+                                 "", 1, 200000)
+    self._benchmarkFetchPrebuilt("benchmark_session_fetchprebuilt_direct_4MB",
+                                 "", 1 << 20, 200000)
+    self._benchmarkRunOp("benchmark_session_runop_direct", "", 200000)
+    self._benchmarkRunOpPrebuilt("benchmark_session_runopprebuilt_direct", "",
+                                 200000)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index e53a046a34d2062c3636b223e7b90a244f0c57cf..9128abf0bef7b10984a6432cc16497db60dee863 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -19,6 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import os
+import sys
 import threading
 import time
 
@@ -27,7 +29,9 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.lib.core import error_codes_pb2
+from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
@@ -42,6 +46,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
@@ -193,6 +200,12 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(42.0, res)
       res = sess.run(a.op)  # An op, not a tensor.
       self.assertEqual(None, res)
+      tensor_runner = sess.make_callable(a)
+      res = tensor_runner()
+      self.assertEqual(42.0, res)
+      op_runner = sess.make_callable(a.op)
+      res = op_runner()
+      self.assertEqual(None, res)
 
   def testFetchSingletonByName(self):
     with session.Session() as sess:
@@ -211,12 +224,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       assign = v.assign([63.0])
       res = sess.run([a, b, c, a.name, assign.op])
       self.assertTrue(isinstance(res, list))
-      self.assertEqual(42.0, res[0])
-      self.assertEqual(None, res[1])
-      self.assertEqual(44.0, res[2])
-      self.assertEqual(42.0, res[3])
-      self.assertEqual(None, res[4])
-      self.assertEqual(63.0, sess.run(v))
+      self.assertEqual([42.0, None, 44.0, 42.0, None], res)
+      list_runner = sess.make_callable([a, b, c, a.name, assign.op])
+      res = list_runner()
+      self.assertTrue(isinstance(res, list))
+      self.assertEqual([42.0, None, 44.0, 42.0, None], res)
 
   def testFetchTuple(self):
     with session.Session() as sess:
@@ -225,10 +237,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       c = constant_op.constant(44.0)
       res = sess.run((a, b, c, a.name))
       self.assertTrue(isinstance(res, tuple))
-      self.assertEqual(42.0, res[0])
-      self.assertEqual(None, res[1])
-      self.assertEqual(44.0, res[2])
-      self.assertEqual(42.0, res[3])
+      self.assertEqual((42.0, None, 44.0, 42.0), res)
+      tuple_runner = sess.make_callable((a, b, c, a.name))
+      res = tuple_runner()
+      self.assertTrue(isinstance(res, tuple))
+      self.assertEqual((42.0, None, 44.0, 42.0), res)
 
   def testFetchNamedTuple(self):
     # pylint: disable=invalid-name
@@ -243,6 +256,12 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(42.0, res.a)
       self.assertEqual(None, res.b)
       self.assertEqual(44.0, res.c)
+      namedtuple_runner = sess.make_callable(ABC(a, b, c))
+      res = namedtuple_runner()
+      self.assertTrue(isinstance(res, ABC))
+      self.assertEqual(42.0, res.a)
+      self.assertEqual(None, res.b)
+      self.assertEqual(44.0, res.c)
 
   def testFetchDict(self):
     with session.Session() as sess:
@@ -1181,6 +1200,11 @@ class SessionTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(np_array, out_v)
           self.assertAllEqual(np_array, feed_v)
 
+          feed_fetch_runner = sess.make_callable([out_t, feed_t], [feed_t])
+          out_v, feed_v = feed_fetch_runner(np_array)
+          self.assertAllEqual(np_array, out_v)
+          self.assertAllEqual(np_array, feed_v)
+
   def testFeedError(self):
     with session.Session() as sess:
       feed_t = array_ops.placeholder(dtype=dtypes.float32)
@@ -1404,6 +1428,80 @@ class SessionTest(test_util.TensorFlowTestCase):
     r2 = sess.partial_run(h, [b, c])
     self.assertEqual(r1, r2)
 
+  def runTestPartialRunMissingPlaceholderFeedException(self, sess):
+    x = array_ops.placeholder(dtypes.float32, shape=())
+    fetches = [x * 2, x * 3]
+    handle = sess.partial_run_setup(fetches=fetches, feeds=[])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'You must feed a value for placeholder'):
+      sess.partial_run(handle, fetches[0])
+
+  def runTestPartialRunUnspecifiedFeed(self, sess):
+    a = array_ops.placeholder(dtypes.float32, shape=[])
+    b = array_ops.placeholder(dtypes.float32, shape=[])
+    c = array_ops.placeholder(dtypes.float32, shape=[])
+    r1 = math_ops.add(a, b)
+
+    h = sess.partial_run_setup([r1], [a, b])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'was not specified in partial_run_setup.$'):
+      sess.partial_run(h, r1, feed_dict={a: 1, b: 2, c: 3})
+
+  def runTestPartialRunUnspecifiedFetch(self, sess):
+    a = array_ops.placeholder(dtypes.float32, shape=[])
+    b = array_ops.placeholder(dtypes.float32, shape=[])
+    c = array_ops.placeholder(dtypes.float32, shape=[])
+    r1 = math_ops.add(a, b)
+    r2 = math_ops.multiply(a, c)
+
+    h = sess.partial_run_setup([r1], [a, b, c])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'was not specified in partial_run_setup.$'):
+      sess.partial_run(h, r2, feed_dict={a: 1, c: 3})
+
+  def runTestPartialRunAlreadyFed(self, sess):
+    a = array_ops.placeholder(dtypes.float32, shape=[])
+    b = array_ops.placeholder(dtypes.float32, shape=[])
+    c = array_ops.placeholder(dtypes.float32, shape=[])
+    r1 = math_ops.add(a, b)
+    r2 = math_ops.multiply(a, c)
+
+    h = sess.partial_run_setup([r1, r2], [a, b, c])
+    sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'has already been fed.$'):
+      sess.partial_run(h, r2, feed_dict={a: 1, c: 3})
+
+  def runTestPartialRunAlreadyFetched(self, sess):
+    a = array_ops.placeholder(dtypes.float32, shape=[])
+    b = array_ops.placeholder(dtypes.float32, shape=[])
+    c = array_ops.placeholder(dtypes.float32, shape=[])
+    r1 = math_ops.add(a, b)
+    r2 = math_ops.multiply(a, c)
+
+    h = sess.partial_run_setup([r1, r2], [a, b, c])
+    sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 'has already been fetched.$'):
+      sess.partial_run(h, r1, feed_dict={c: 3})
+
+  def runTestPartialRunEmptyFetches(self, sess):
+    a = array_ops.placeholder(dtypes.float32)
+    b = a * 2.0
+
+    h = sess.partial_run_setup(fetches=[b], feeds=[a])
+    sess.partial_run(h, [], {a: 3.0})
+    r = sess.partial_run(h, [b], {})
+    self.assertEqual([6.0], r)
+
+  def testInvalidPartialRunSetup(self):
+    sess = session.Session()
+    x = array_ops.placeholder(dtypes.float32, shape=[])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        'specify at least one target to fetch or execute.'):
+      sess.partial_run_setup(fetches=[], feeds=[x])
+
   def testPartialRunDirect(self):
     self.runTestPartialRun(session.Session())
 
@@ -1419,6 +1517,24 @@ class SessionTest(test_util.TensorFlowTestCase):
   def testRunAndPartialRunDirect(self):
     self.runTestRunAndPartialRun(session.Session())
 
+  def testPartialRunMissingPlaceholderFeedExceptionDirect(self):
+    self.runTestPartialRunMissingPlaceholderFeedException(session.Session())
+
+  def testPartialRunUnspecifiedFeedDirect(self):
+    self.runTestPartialRunUnspecifiedFeed(session.Session())
+
+  def testPartialRunUnspecifiedFetchDirect(self):
+    self.runTestPartialRunUnspecifiedFetch(session.Session())
+
+  def testPartialRunAlreadyFedDirect(self):
+    self.runTestPartialRunAlreadyFed(session.Session())
+
+  def testPartialRunAlreadyFetchedDirect(self):
+    self.runTestPartialRunAlreadyFetched(session.Session())
+
+  def testPartialRunEmptyFetchesDirect(self):
+    self.runTestPartialRunEmptyFetches(session.Session())
+
   def testPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.runTestPartialRun(session.Session(server.target))
@@ -1439,6 +1555,31 @@ class SessionTest(test_util.TensorFlowTestCase):
     server = server_lib.Server.create_local_server()
     self.runTestRunAndPartialRun(session.Session(server.target))
 
+  def testPartialRunMissingPlaceholderFeedExceptionDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunMissingPlaceholderFeedException(
+        session.Session(server.target))
+
+  def testPartialRunUnspecifiedFeedDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunUnspecifiedFeed(session.Session(server.target))
+
+  def testPartialRunUnspecifiedFetchDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunUnspecifiedFetch(session.Session(server.target))
+
+  def testPartialRunAlreadyFedDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunAlreadyFed(session.Session(server.target))
+
+  def testPartialRunAlreadyFetchedDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunAlreadyFetched(session.Session(server.target))
+
+  def testPartialRunEmptyFetchesDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestPartialRunEmptyFetches(session.Session(server.target))
+
   def testFeedDictKeyException(self):
     with session.Session() as sess:
       a = constant_op.constant(1.0, dtypes.float32, name='a')
@@ -1624,6 +1765,397 @@ class SessionTest(test_util.TensorFlowTestCase):
       squared_eval = sess.partial_run(partial_run, squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
 
+  def testDefaultLogDevicePlacement(self):
+    class CaptureStderr(str):
+      """Class to capture stderr from C++ shared library."""
+
+      def __enter__(self):
+        self._esc = compat.as_str('\b')
+        self._output = compat.as_str('')
+        self._stderr = sys.stderr
+        self._fd = self._stderr.fileno()
+        self._out_pipe, in_pipe = os.pipe()
+        # Save the original io stream.
+        self._dup_fd = os.dup(self._fd)
+        # Replace the original io stream with in pipe.
+        os.dup2(in_pipe, self._fd)
+        return self
+
+      def __exit__(self, *args):
+        self._stderr.write(self._esc)
+        self._stderr.flush()
+        self.read()
+        os.close(self._out_pipe)
+        # Restore the original io stream.
+        os.dup2(self._dup_fd, self._fd)
+
+      def read(self):
+        while True:
+          data = os.read(self._out_pipe, 1)
+          if not data or compat.as_str(data) == self._esc:
+            break
+          self._output += compat.as_str(data)
+
+      def __str__(self):
+        return self._output
+
+    # Passing the config to the server, but not the session should still result
+    # in logging device placement.
+    config = config_pb2.ConfigProto(log_device_placement=True)
+    server = server_lib.Server.create_local_server(config=config)
+    a = constant_op.constant(1)
+    b = constant_op.constant(2)
+    c = a + b
+    with session.Session(server.target) as sess:
+      with CaptureStderr() as log:
+        sess.run(c)
+      # Ensure that we did log device placement.
+      self.assertTrue('/job:local/replica:0/task:0/cpu:0' in str(log), str(log))
+
+  def testLocalMasterSessionTimeout(self):
+    # Test that the timeout passed in a config to the session works correctly.
+    config = config_pb2.ConfigProto(operation_timeout_in_ms=1000)
+    server = server_lib.Server.create_local_server()
+    q = data_flow_ops.FIFOQueue(1, dtypes.float32)
+    dequeued_t = q.dequeue()
+
+    with session.Session(server.target, config=config) as sess:
+      # Intentionally do not run any enqueue_ops so that dequeue will block
+      # until operation_timeout_in_ms.
+      with self.assertRaises(errors.DeadlineExceededError):
+        sess.run(dequeued_t)
+
+  def testDefaultServerTimeout(self):
+    # Test that the default server config timeout gets used when no Session
+    # config is provided.
+    config = config_pb2.ConfigProto(operation_timeout_in_ms=1000)
+    server = server_lib.Server.create_local_server(config=config)
+    q = data_flow_ops.FIFOQueue(1, dtypes.float32)
+    dequeued_t = q.dequeue()
+
+    with session.Session(server.target) as sess:
+      # Intentionally do not run any enqueue_ops so that dequeue will block
+      # until operation_timeout_in_ms.
+      with self.assertRaises(errors.DeadlineExceededError):
+        sess.run(dequeued_t)
+
+  def runTestBuildGraphError(self, sess):
+    # Ensure that errors from building the graph get propagated.
+    data = array_ops.placeholder(dtypes.float32, shape=[])
+    enter_1 = control_flow_ops.enter(data, 'foo_1', False)
+    enter_2 = control_flow_ops.enter(data, 'foo_2', False)
+    res = math_ops.add(enter_1, enter_2)
+    with self.assertRaisesOpError('has inputs from different frames'):
+      sess.run(res, feed_dict={data: 1.0})
+
+  def testBuildGraphErrorDirect(self):
+    self.runTestBuildGraphError(session.Session())
+
+  def testBuildGraphErrorDist(self):
+    server = server_lib.Server.create_local_server()
+    self.runTestBuildGraphError(session.Session(server.target))
+
+  def testClusterSpecPropagationSimple(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config)
+    output = sess.run(const)
+    self.assertEqual(17, output)
+
+  def testClusterSpecPropagationWorker2Placement(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.Graph().as_default() as g, ops.device('/job:worker/task:1'):
+      const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config, graph=g)
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+    output = sess.run(const, options=run_options, run_metadata=run_metadata)
+    self.assertEqual(17, output)
+    self.assertEqual(1,
+                     len([
+                         node_stats
+                         for dev_stats in run_metadata.step_stats.dev_stats
+                         for node_stats in dev_stats.node_stats
+                         if '/job:worker/replica:0/task:1/device:CPU:0' ==
+                         dev_stats.device and 'Const' == node_stats.node_name
+                     ]))
+
+  def testClusterSpecPropagationWorker1Placement(self):
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.Graph().as_default() as g, ops.device('/job:worker/task:0'):
+      const = constant_op.constant(17)
+    sess = session.Session(server1.target, config=config, graph=g)
+    output = sess.run(const)
+    self.assertEqual(17, output)
+
+  def testClusterSpecPropagationThreeServers2Graphs(self):
+    """Boots 3 servers, creates 2 sessions, ensures appropriate operations.
+
+    We create 2 clusterspecs:
+     1. server2 as the master, server1 as a worker
+     2. server2 as the master, server3 as a worker
+
+    We ensure that variables on the workers are independent.
+    """
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    server3 = server_lib.Server.create_local_server()
+    cluster_def1 = cluster_pb2.ClusterDef()
+    job1 = cluster_def1.job.add()
+    job1.name = 'worker1'
+    job1.tasks[0] = server2.target[len('grpc://'):]
+    job1.tasks[1] = server1.target[len('grpc://'):]
+
+    cluster_def2 = cluster_pb2.ClusterDef()
+    job2 = cluster_def2.job.add()
+    job2.name = 'worker2'
+    job2.tasks[0] = server2.target[len('grpc://'):]
+    job2.tasks[1] = server3.target[len('grpc://'):]
+
+    config1 = config_pb2.ConfigProto(cluster_def=cluster_def1)
+    config2 = config_pb2.ConfigProto(cluster_def=cluster_def2)
+
+    with ops.Graph().as_default() as g1:
+      with ops.device('/job:worker1/task:1'):
+        var1 = variables.Variable(array_ops.zeros([2]), name='var1')
+        update_op1 = state_ops.assign_add(
+            var1, array_ops.ones([2]), name='var1_assign_add')
+        init1 = variables.global_variables_initializer()
+
+    with ops.Graph().as_default() as g2:
+      with ops.device('/job:worker2/task:1'):
+        var2 = variables.Variable(array_ops.zeros([2]), name='var2')
+        update_op2 = state_ops.assign_add(
+            var2, array_ops.ones([2]), name='var2_assign_add')
+        init2 = variables.global_variables_initializer()
+
+    sess1 = session.Session(server2.target, graph=g1, config=config1)
+    sess2 = session.Session(server2.target, graph=g2, config=config2)
+
+    init1.run(session=sess1)
+    init2.run(session=sess2)
+
+    expected_zeros = np.zeros([2])
+    expected_ones = np.ones([2])
+
+    self.assertAllEqual(expected_zeros, sess1.run(var1))
+    self.assertAllEqual(expected_zeros, sess2.run(var2))
+
+    self.assertAllEqual(expected_ones, sess1.run(update_op1))
+    self.assertAllEqual(expected_ones, sess1.run(var1))
+    self.assertAllEqual(expected_zeros, sess2.run(var2))
+    self.assertAllEqual(expected_ones, sess2.run(update_op2))
+    self.assertAllEqual(expected_ones + expected_ones, sess1.run(update_op1))
+    self.assertAllEqual(expected_ones, sess2.run(var2))
+    self.assertAllEqual(expected_ones + expected_ones, sess1.run(var1))
+
+  def testClusterSpecPropagationThreeServers(self):
+    """Boots 3 servers, creates 2 sessions, ensures appropriate operations.
+
+    We create 2 clusterspecs:
+     1. server2 as the master, server1 as a worker
+     2. server2 as the master, server3 as a worker
+
+    We ensure that variables on the workers are independent.
+    """
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    server3 = server_lib.Server.create_local_server()
+    cluster_def1 = cluster_pb2.ClusterDef()
+    job1 = cluster_def1.job.add()
+    job1.name = 'worker'
+    job1.tasks[0] = server2.target[len('grpc://'):]
+    job1.tasks[1] = server1.target[len('grpc://'):]
+
+    cluster_def2 = cluster_pb2.ClusterDef()
+    job2 = cluster_def2.job.add()
+    job2.name = 'worker'
+    job2.tasks[0] = server2.target[len('grpc://'):]
+    job2.tasks[1] = server3.target[len('grpc://'):]
+
+    config1 = config_pb2.ConfigProto(cluster_def=cluster_def1)
+    config2 = config_pb2.ConfigProto(cluster_def=cluster_def2)
+
+    with ops.device('/job:worker/task:1'):
+      var = variables.Variable(array_ops.zeros([2]), name='var')
+      feed = array_ops.placeholder(dtypes.float32, shape=(2))
+      update_op = var.assign_add(feed)
+
+    sess1 = session.Session(server2.target, config=config1)
+    sess2 = session.Session(server2.target, config=config2)
+
+    variables.global_variables_initializer().run(session=sess1)
+    variables.global_variables_initializer().run(session=sess2)
+
+    expected_zeros = np.zeros([2])
+    expected_ones = np.ones([2])
+
+    self.assertAllEqual(expected_zeros, sess1.run(var))
+    self.assertAllEqual(expected_zeros, sess2.run(var))
+    self.assertAllEqual(expected_ones,
+                        sess1.run(update_op, feed_dict={feed: expected_ones}))
+    self.assertAllEqual(expected_ones, sess1.run(var))
+    self.assertAllEqual(expected_zeros, sess2.run(var))
+    self.assertAllEqual(expected_ones,
+                        sess2.run(update_op, feed_dict={feed: expected_ones}))
+    self.assertAllEqual(expected_ones + expected_ones,
+                        sess1.run(update_op, feed_dict={feed: expected_ones}))
+    self.assertAllEqual(expected_ones, sess2.run(var))
+    self.assertAllEqual(expected_ones + expected_ones, sess1.run(var))
+
+  def testClusterSpecPropagationThreeServersOneCluster(self):
+    """Boots 3 servers, ensures appropriate communication across workers.
+
+    Additionally, in this cluster, we ensure the master is not the 0-th worker.
+
+    Note: this test only uses one session.
+    """
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+    server3 = server_lib.Server.create_local_server()
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server3.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    job.tasks[2] = server1.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    # Add ops to the devices in non-linear order.
+
+    with ops.device('/job:worker/task:1'):
+      feed1 = array_ops.placeholder(dtypes.float32, shape=(2))
+      const1 = constant_op.constant(2.0)
+      mul1 = const1 * feed1
+
+    with ops.device('/job:worker/task:2'):
+      feed2 = array_ops.placeholder(dtypes.float32, shape=(2))
+      const2 = constant_op.constant(2.0)
+      mul2 = const2 * feed2
+
+    with ops.device('/job:worker/task:0'):
+      feed0 = array_ops.placeholder(dtypes.float32, shape=(2))
+      const0 = constant_op.constant(2.0)
+      mul0 = const0 * feed0
+
+    sum_op = mul0 + mul1 + mul2
+
+    ones = np.ones([2])
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+
+    # Run!
+    with session.Session(server1.target, config=config) as sess:
+      output = sess.run(
+          sum_op,
+          options=run_options,
+          run_metadata=run_metadata,
+          feed_dict={feed1: ones,
+                     feed2: ones,
+                     feed0: ones})
+      self.assertAllEqual(6 * ones, output)
+
+      self.assertEqual(
+          3,
+          len([
+              dev_stats.device
+              for dev_stats in run_metadata.step_stats.dev_stats
+              for node_stats in dev_stats.node_stats
+              if '/job:worker/replica:0/task:' in dev_stats.device and
+              node_stats.node_name.startswith('Const')
+          ]), run_metadata)
+
+  def testClusterSpecPropagationPartialRun(self):
+    """Test successful partial run with ClusterSpec propagation."""
+    server1 = server_lib.Server.create_local_server()
+    server2 = server_lib.Server.create_local_server()
+
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server1.target[len('grpc://'):]
+    job.tasks[1] = server2.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    with ops.device('/job:worker/task:0'):
+      a = array_ops.placeholder(dtypes.float32, shape=[])
+    with ops.device('/job:worker/task:1'):
+      b = array_ops.placeholder(dtypes.float32, shape=[])
+      c = array_ops.placeholder(dtypes.float32, shape=[])
+      r1 = math_ops.add(a, b)
+    with ops.device('/job:worker/task:0'):
+      r2 = math_ops.multiply(r1, c)
+
+    with session.Session(server1.target, config=config) as sess:
+      h = sess.partial_run_setup([r1, r2], [a, b, c])
+      res = sess.partial_run(h, r1, feed_dict={a: 1, b: 2})
+      self.assertEqual(3, res)
+      res = sess.partial_run(h, r2, feed_dict={c: 3})
+      self.assertEqual(9, res)
+
+  def testGraphOptimizer(self):
+    rewrite_options = rewriter_config_pb2.RewriterConfig(
+        disable_model_pruning=False, constant_folding=True)
+    graph_options = config_pb2.GraphOptions(
+        rewrite_options=rewrite_options, build_cost_model=1)
+    config = config_pb2.ConfigProto(graph_options=graph_options)
+
+    with ops.Graph().as_default() as g:
+      r1 = random_ops.random_normal(shape=[2, 3], name='R1')
+      r2 = random_ops.random_normal(shape=[2, 3], name='R2')
+      copy1 = array_ops.stop_gradient(r1)
+      copy2 = array_ops.identity(r2)
+      result = copy1 + copy2
+
+      with session.Session(graph=g, config=config) as sess:
+        metadata = config_pb2.RunMetadata()
+        sess.run(result, run_metadata=metadata)
+
+    # Check that we optimized the graph by looking at the cost model: the add
+    # node should have been reconnected directly to the R1 and R2 nodes.
+    found_valid_nodes = 0
+    for node in metadata.cost_graph.node:
+      if node.name == 'R1':
+        r1_cost_id = node.id
+        found_valid_nodes += 1
+      if node.name == 'R2':
+        r2_cost_id = node.id
+        found_valid_nodes += 1
+      if node.name == 'add':
+        if node.input_info[0].preceding_node == r1_cost_id:
+          self.assertEqual(node.input_info[1].preceding_node, r2_cost_id)
+          found_valid_nodes += 1
+        elif node.input_info[0].preceding_node == r2_cost_id:
+          self.assertEqual(node.input_info[1].preceding_node, r1_cost_id)
+          found_valid_nodes += 1
+    self.assertEqual(3, found_valid_nodes)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 3331f17217966d9ac7fdaa7940ffff8970e973a3..4083611cf7c0155a6f03a2faad930531929d29fb 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -18,6 +18,7 @@ limitations under the License.
 %{
 
 #include "tensorflow/python/client/tf_session_helper.h"
+#include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/public/version.h"
 
@@ -57,6 +58,9 @@ tensorflow::ImportNumpy();
   }
 }
 
+// Constants used by TensorHandle (get_session_handle).
+%constant const char* TENSOR_HANDLE_KEY = tensorflow::SessionState::kTensorHandleResourceTypeName;
+
 ////////////////////////////////////////////////////////////////////////////////
 // BEGIN TYPEMAPS FOR tensorflow::TF_Run_wrapper()
 ////////////////////////////////////////////////////////////////////////////////
@@ -65,7 +69,8 @@ tensorflow::ImportNumpy();
 // represented as a list of strings.
 %typemap(in) const tensorflow::NameVector& (
     tensorflow::NameVector temp,
-    tensorflow::Safe_PyObjectPtr temp_string_list(tensorflow::make_safe(nullptr))) {
+    tensorflow::Safe_PyObjectPtr temp_string_list(
+        tensorflow::make_safe(static_cast<PyObject*>(nullptr)))) {
   if (!PyList_Check($input)) {
     SWIG_fail;
   }
@@ -112,7 +117,7 @@ tensorflow::ImportNumpy();
 
 // Build a Python list of outputs and return it.
 %typemap(argout) tensorflow::PyObjectVector* out_values {
-  tensorflow::Safe_PyObjectVector out_values_safe;
+  std::vector<tensorflow::Safe_PyObjectPtr> out_values_safe;
   for (size_t i = 0; i < $1->size(); ++i) {
     out_values_safe.emplace_back(tensorflow::make_safe($1->at(i)));
   }
@@ -152,35 +157,56 @@ tensorflow::ImportNumpy();
       reinterpret_cast<const char*>($1.data), $1.length);
 }
 
-// Include the functions from c_api.h, except TF_Run.
-%ignoreall
-%unignore TF_Code;
-%unignore TF_Status;
-%unignore TF_Buffer;
-%unignore TF_NewBuffer;
-%unignore TF_NewBufferFromString;
-%unignore TF_DeleteBuffer;
-%unignore TF_GetBuffer;
-%unignore TF_NewStatus;
-%unignore TF_DeleteStatus;
-%unignore TF_GetCode;
-%unignore TF_Message;
-%unignore TF_SessionOptions;
+%inline %{
+// Helper function to convert a Python list of Tensors to a C++ vector of
+// TF_Outputs.
+//
+// Caller should have already checked that `py_tensor_list` is a list (this
+// isn't done in this function to allow for function-specific error messages)
+void PyTensorListToVector(PyObject* py_tensor_list,
+                          std::vector<TF_Output>* vec) {
+  size_t size = PyList_Size(py_tensor_list);
+  for (int i = 0; i < size; ++i) {
+    PyObject* item = PyList_GetItem(py_tensor_list, i);
+    TF_Output* input_ptr;
+    SWIG_ConvertPtr(item, reinterpret_cast<void**>(&input_ptr),
+                    SWIGTYPE_p_TF_Output, 0);
+    vec->push_back(*input_ptr);
+  }
+}
+%}
+
+// Converts input Python list of wrapped TF_Outputs into a single array
+%typemap(in) (const TF_Output* inputs, int num_inputs)
+    (std::vector<TF_Output> inputs) {
+  if (!PyList_Check($input)) {
+    SWIG_exception_fail(
+        SWIG_TypeError, "$symname: expected Python list of wrapped TF_Outputs");
+  }
+  PyTensorListToVector($input, &inputs);
+  $1 = inputs.data();
+  $2 = inputs.size();
+}
+
+// TODO(skyewm): SWIG emits a warning for the const char* in TF_WhileParams,
+// skip for now
+%ignore TF_WhileParams;
+%ignore TF_NewWhile;
+%ignore TF_FinishWhile;
+%ignore TF_AbortWhile;
+
+// These are defined below, avoid duplicate definitions
+%ignore TF_Run;
+%ignore TF_PRun;
+%ignore TF_PRunSetup;
+
 %rename("_TF_SetTarget") TF_SetTarget;
 %rename("_TF_SetConfig") TF_SetConfig;
 %rename("_TF_NewSessionOptions") TF_NewSessionOptions;
-%unignore TF_DeleteSessionOptions;
-%unignore TF_NewDeprecatedSession;
-%unignore TF_CloseDeprecatedSession;
-%unignore TF_DeleteDeprecatedSession;
-%unignore TF_ExtendGraph;
-%unignore TF_NewLibrary;
-%unignore TF_LoadLibrary;
-%unignore TF_DeleteLibraryHandle;
-%unignore TF_GetOpList;
+
 %include "tensorflow/c/c_api.h"
-%ignoreall
 
+%ignoreall
 %insert("python") %{
   def TF_NewSessionOptions(target=None, config=None):
     # NOTE: target and config are validated in the session constructor.
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index 99c154bd99787a0ea51a5def09755fe57746a4dc..a046c7ebb1229180bef5e6255ea7bbfb1eedc33a 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cstring>
 
+#include "tensorflow/c/c_api.h"
 #include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/log_memory.h"
@@ -30,20 +31,8 @@ namespace tensorflow {
 
 namespace {
 
-// Container types for the various temporary values used internally in
-// the wrapper.
-
-// A TF_TensorVector is a vector of borrowed pointers to TF_Tensors.
-typedef gtl::InlinedVector<TF_Tensor*, 8> TF_TensorVector;
-
-// Safe containers for (an) owned TF_Tensor(s). On destruction, the
-// tensor will be deleted by TF_DeleteTensor.
-typedef std::unique_ptr<TF_Tensor, decltype(&TF_DeleteTensor)>
-    Safe_TF_TensorPtr;
-typedef std::vector<Safe_TF_TensorPtr> Safe_TF_TensorVector;
-Safe_TF_TensorPtr make_safe(TF_Tensor* tensor) {
-  return Safe_TF_TensorPtr(tensor, TF_DeleteTensor);
-}
+static const char* kFeedDictErrorMsg =
+    "feed_dict must be a dictionary mapping strings to NumPy arrays.";
 
 Status PyArrayDescr_to_TF_DataType(PyArray_Descr* descr,
                                    TF_DataType* out_tf_datatype) {
@@ -142,78 +131,6 @@ Status PyArray_TYPE_to_TF_DataType(PyArrayObject* array,
   return Status::OK();
 }
 
-Status TF_DataType_to_PyArray_TYPE(TF_DataType tf_datatype,
-                                   int* out_pyarray_type) {
-  switch (tf_datatype) {
-    case TF_HALF:
-      *out_pyarray_type = NPY_FLOAT16;
-      break;
-    case TF_FLOAT:
-      *out_pyarray_type = NPY_FLOAT32;
-      break;
-    case TF_DOUBLE:
-      *out_pyarray_type = NPY_FLOAT64;
-      break;
-    case TF_INT32:
-      *out_pyarray_type = NPY_INT32;
-      break;
-    case TF_UINT8:
-      *out_pyarray_type = NPY_UINT8;
-      break;
-    case TF_UINT16:
-      *out_pyarray_type = NPY_UINT16;
-      break;
-    case TF_INT8:
-      *out_pyarray_type = NPY_INT8;
-      break;
-    case TF_INT16:
-      *out_pyarray_type = NPY_INT16;
-      break;
-    case TF_INT64:
-      *out_pyarray_type = NPY_INT64;
-      break;
-    case TF_BOOL:
-      *out_pyarray_type = NPY_BOOL;
-      break;
-    case TF_COMPLEX64:
-      *out_pyarray_type = NPY_COMPLEX64;
-      break;
-    case TF_COMPLEX128:
-      *out_pyarray_type = NPY_COMPLEX128;
-      break;
-    case TF_STRING:
-      *out_pyarray_type = NPY_OBJECT;
-      break;
-    case TF_RESOURCE:
-      *out_pyarray_type = NPY_VOID;
-      break;
-    // TODO(keveman): These should be changed to NPY_VOID, and the type used for
-    // the resulting numpy array should be the custom struct types that we
-    // expect for quantized types.
-    case TF_QINT8:
-      *out_pyarray_type = NPY_INT8;
-      break;
-    case TF_QUINT8:
-      *out_pyarray_type = NPY_UINT8;
-      break;
-    case TF_QINT16:
-      *out_pyarray_type = NPY_INT16;
-      break;
-    case TF_QUINT16:
-      *out_pyarray_type = NPY_UINT16;
-      break;
-    case TF_QINT32:
-      *out_pyarray_type = NPY_INT32;
-      break;
-    case TF_BFLOAT16:
-      *out_pyarray_type = NPY_UINT16;
-      break;
-    default:
-      return errors::Internal("Unsupported fetch type");
-  }
-  return Status::OK();
-}
-
 // Iterate over the string array 'array', extract the ptr and len of each string
 // element and call f(ptr, len).
 template <typename F>
@@ -375,6 +292,8 @@ Status GetPyArrayDescrForTensor(const TF_Tensor* tensor,
     PyObject* fields = PyList_New(1);
     PyList_SetItem(fields, 0, field);
     int convert_result = PyArray_DescrConverter(fields, descr);
+    Py_CLEAR(field);
+    Py_CLEAR(fields);
     if (convert_result != 1) {
       return errors::Internal("Failed to create numpy array description for ",
                               "TF_RESOURCE-type tensor");
@@ -389,29 +308,39 @@ Status GetPyArrayDescrForTensor(const TF_Tensor* tensor,
   return Status::OK();
 }
 
-// Converts the given TF_Tensor to a Numpy array.
+// Converts the given TF_Tensor to a numpy ndarray.
 // If the returned status is OK, the caller becomes the owner of *out_array.
-Status TF_Tensor_to_PyObject(TF_Tensor* tensor, PyObject** out_array) {
+Status TFTensorToPyArray(Safe_TF_TensorPtr tensor, PyObject** out_ndarray) {
   // A fetched operation will correspond to a null tensor, and a None
   // in Python.
   if (tensor == nullptr) {
     Py_INCREF(Py_None);
-    *out_array = Py_None;
+    *out_ndarray = Py_None;
     return Status::OK();
   }
 
   tensorflow::int64 nelems = -1;
   gtl::InlinedVector<npy_intp, 4> dims =
-      GetPyArrayDimensionsForTensor(tensor, &nelems);
+      GetPyArrayDimensionsForTensor(tensor.get(), &nelems);
 
   // Convert TensorFlow dtype to numpy type descriptor.
   PyArray_Descr* descr = nullptr;
-  TF_RETURN_IF_ERROR(GetPyArrayDescrForTensor(tensor, &descr));
+  TF_RETURN_IF_ERROR(GetPyArrayDescrForTensor(tensor.get(), &descr));
+
+  // If the type is neither string nor resource we can reuse the Tensor memory.
+  TF_Tensor* original = tensor.get();
+  TF_Tensor* moved = TF_TensorMaybeMove(tensor.release());
+  if (moved != nullptr) {
+    if (ArrayFromMemory(dims.size(), dims.data(), TF_TensorData(moved),
+                        static_cast<DataType>(TF_TensorType(moved)),
+                        [moved] { TF_DeleteTensor(moved); }, out_ndarray)
+            .ok()) {
+      return Status::OK();
+    }
+  }
+  tensor.reset(original);
 
   // Copy the TF_TensorData into a newly-created ndarray and return it.
-  // TODO(mrry): Perhaps investigate zero-copy approaches. This would involve
-  // creating an ndarray-like object that wraps the TF_Tensor buffer, and
-  // maps its destructor to TF_DeleteTensor.
   Safe_PyObjectPtr safe_out_array =
       tensorflow::make_safe(PyArray_Empty(dims.size(), dims.data(), descr, 0));
   if (!safe_out_array) {
@@ -420,55 +349,119 @@ Status TF_Tensor_to_PyObject(TF_Tensor* tensor, PyObject** out_array) {
   PyArrayObject* py_array =
       reinterpret_cast<PyArrayObject*>(safe_out_array.get());
   if (PyArray_NBYTES(py_array) !=
-      static_cast<int64>(TF_TensorByteSize(tensor))) {
-    if (TF_TensorType(tensor) == TF_STRING) {
+      static_cast<int64>(TF_TensorByteSize(tensor.get()))) {
+    if (TF_TensorType(tensor.get()) == TF_STRING) {
       // Copy element by element.
       auto iter = tensorflow::make_safe(PyArray_IterNew(safe_out_array.get()));
       for (tensorflow::int64 i = 0; i < nelems; ++i) {
-        auto s =
-            CopyStringToPyArrayElement(py_array, iter.get(), tensor, nelems, i);
+        auto s = CopyStringToPyArrayElement(py_array, iter.get(), tensor.get(),
+                                            nelems, i);
         if (!s.ok()) {
           return s;
         }
         PyArray_ITER_NEXT(iter.get());
       }
-    } else if (TF_TensorType(tensor) == TF_RESOURCE) {
+    } else if (TF_TensorType(tensor.get()) == TF_RESOURCE) {
       ResourceHandle* resource_handle =
-          reinterpret_cast<ResourceHandle*>(TF_TensorData(tensor));
+          reinterpret_cast<ResourceHandle*>(TF_TensorData(tensor.get()));
       memcpy(PyArray_DATA(py_array),
              resource_handle->SerializeAsString().c_str(),
              PyArray_NBYTES(py_array));
     } else {
       return errors::Internal("ndarray was ", PyArray_NBYTES(py_array),
                               " bytes but TF_Tensor was ",
-                              TF_TensorByteSize(tensor), " bytes");
+                              TF_TensorByteSize(tensor.get()), " bytes");
     }
   } else {
-    memcpy(PyArray_DATA(py_array), TF_TensorData(tensor),
+    memcpy(PyArray_DATA(py_array), TF_TensorData(tensor.get()),
            PyArray_NBYTES(py_array));
   }
 
   // PyArray_Return turns rank 0 arrays into numpy scalars
-  *out_array = PyArray_Return(
+  *out_ndarray = PyArray_Return(
       reinterpret_cast<PyArrayObject*>(safe_out_array.release()));
   return Status::OK();
 }
 
+// Converts the given numpy ndarray to a (safe) TF_Tensor. If `ndarray` contains
+// a resource handle, `*resource_handle` will be set to the deserialized
+// handle. Otherwise it is set to nullptr. `resource_handle` and `out_tensor`
+// must be non-null. Caller retains ownership of `ndarray` and becomes owner of
+// `*resource_handle` if it's set.
+Status PyArrayToTFTensor(PyObject* ndarray, Safe_TF_TensorPtr* out_tensor,
+                         ResourceHandle** resource_handle) {
+  DCHECK(out_tensor != nullptr);
+  DCHECK(resource_handle != nullptr);
+  *resource_handle = nullptr;
+
+  // Make sure we dereference this array object in case of error, etc.
+  Safe_PyObjectPtr array_safe(make_safe(
+      PyArray_FromAny(ndarray, nullptr, 0, 0, NPY_ARRAY_CARRAY, nullptr)));
+  if (!array_safe) return errors::InvalidArgument(kFeedDictErrorMsg);
+  PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
+
+  // Convert numpy dtype to TensorFlow dtype.
+  TF_DataType dtype = TF_FLOAT;
+  TF_RETURN_IF_ERROR(PyArray_TYPE_to_TF_DataType(array, &dtype));
+
+  tensorflow::int64 nelems = 1;
+  gtl::InlinedVector<int64_t, 4> dims;
+  for (int i = 0; i < PyArray_NDIM(array); ++i) {
+    dims.push_back(PyArray_SHAPE(array)[i]);
+    nelems *= dims[i];
+  }
+
+  // Create a TF_Tensor based on the fed data. In the case of non-string data
+  // type, this steals a reference to array, which will be relinquished when
+  // the underlying buffer is deallocated. For string, a new temporary buffer
+  // is allocated into which the strings are encoded.
+  if (dtype == TF_RESOURCE) {
+    const string serialized(reinterpret_cast<char*>(PyArray_DATA(array)),
+                            PyArray_NBYTES(array));
+    *resource_handle = new ResourceHandle();
+    (*resource_handle)->ParseFromString(serialized);
+    TF_Tensor* tf_tensor =
+        TF_AllocateTensor(dtype, {}, 0, sizeof(ResourceHandle));
+    std::memcpy(TF_TensorData(tf_tensor),
+                reinterpret_cast<void*>(*resource_handle),
+                sizeof(ResourceHandle));
+    *out_tensor = make_safe(tf_tensor);
+  } else if (dtype != TF_STRING) {
+    size_t size = PyArray_NBYTES(array);
+    array_safe.release();
+    *out_tensor = make_safe(TF_NewTensor(dtype, dims.data(), dims.size(),
+                                         PyArray_DATA(array), size,
+                                         &DelayedNumpyDecref, array));
+  } else {
+    size_t size = 0;
+    void* encoded = nullptr;
+    TF_RETURN_IF_ERROR(EncodePyBytesArray(array, nelems, &size, &encoded));
+    *out_tensor =
+        make_safe(TF_NewTensor(dtype, dims.data(), dims.size(), encoded, size,
+                               [](void* data, size_t len, void* arg) {
+                                 delete[] reinterpret_cast<char*>(data);
+                               },
+                               array));
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 Safe_PyObjectPtr make_safe(PyObject* o) {
   return Safe_PyObjectPtr(o, Py_DECREF_wrapper);
 }
 
+Safe_TF_TensorPtr make_safe(TF_Tensor* tensor) {
+  return Safe_TF_TensorPtr(tensor, TF_DeleteTensor);
+}
+
 void TF_Run_wrapper_helper(TF_DeprecatedSession* session, const char* handle,
                            const TF_Buffer* run_options, PyObject* feed_dict,
                            const NameVector& output_names,
                            const NameVector& target_nodes,
                            TF_Status* out_status, PyObjectVector* out_values,
                            TF_Buffer* run_outputs) {
-  static const char* kFeedDictErrorMsg =
-      "feed_dict must be a dictionary mapping strings to NumPy arrays.";
-
   // 1. Convert the feed inputs to the appropriate form for TF_Run.
   if (!PyDict_Check(feed_dict)) {
     Set_TF_Status_from_Status(out_status,
@@ -477,7 +470,7 @@ void TF_Run_wrapper_helper(TF_DeprecatedSession* session, const char* handle,
   }
 
   NameVector input_names;
-  Safe_TF_TensorVector inputs_safe;  // Used to delete tensors.
+  std::vector<Safe_TF_TensorPtr> inputs_safe;  // Used to delete tensors.
   TF_TensorVector inputs_unsafe;     // Used to contain the arg to TF_Run.
 
   PyObject* key;
@@ -496,71 +489,13 @@ void TF_Run_wrapper_helper(TF_DeprecatedSession* session, const char* handle,
     }
     input_names.push_back(key_string);
 
-    // The array object will be dereferenced at the end of this iteration
-    // (or if we return early due to an error).
-    Safe_PyObjectPtr array_safe(make_safe(
-        PyArray_FromAny(value, nullptr, 0, 0, NPY_ARRAY_CARRAY, nullptr)));
-    if (!array_safe) {
-      Set_TF_Status_from_Status(out_status,
-                                errors::InvalidArgument(kFeedDictErrorMsg));
-      return;
-    }
-    PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
-
-    // Convert numpy dtype to TensorFlow dtype.
-    TF_DataType dtype = TF_FLOAT;
-    s = PyArray_TYPE_to_TF_DataType(array, &dtype);
-    if (!s.ok()) {
-      Set_TF_Status_from_Status(out_status, s);
-      return;
-    }
-
-    tensorflow::int64 nelems = 1;
-    gtl::InlinedVector<int64_t, 4> dims;
-    for (int i = 0; i < PyArray_NDIM(array); ++i) {
-      dims.push_back(PyArray_SHAPE(array)[i]);
-      nelems *= dims[i];
-    }
-
-    // Create a TF_Tensor based on the fed data. In the case of non-string data
-    // type, this steals a reference to array, which will be relinquished when
-    // the underlying buffer is deallocated. For string, a new temporary buffer
-    // is allocated into which the strings are encoded.
-    if (dtype == TF_RESOURCE) {
-      const string serialized(reinterpret_cast<char*>(PyArray_DATA(array)),
-                              PyArray_NBYTES(array));
-      std::shared_ptr<ResourceHandle> resource_handle(new ResourceHandle());
-      resource_handle->ParseFromString(serialized);
+    inputs_safe.emplace_back(make_safe(static_cast<TF_Tensor*>(nullptr)));
+    ResourceHandle* resource_handle;
+    s = PyArrayToTFTensor(value, &inputs_safe.back(), &resource_handle);
+    inputs_unsafe.push_back(inputs_safe.back().get());
+    if (resource_handle != nullptr) {
       resource_handles.emplace_back(resource_handle);
-      TF_Tensor* tensor =
-          TF_AllocateTensor(dtype, {}, 0, sizeof(ResourceHandle));
-      std::memcpy(TF_TensorData(tensor),
-                  reinterpret_cast<void*>(resource_handle.get()),
-                  sizeof(ResourceHandle));
-      inputs_safe.emplace_back(make_safe(tensor));
-    } else if (dtype != TF_STRING) {
-      size_t size = PyArray_NBYTES(array);
-      array_safe.release();
-      TF_Tensor* tensor =
-          TF_NewTensor(dtype, dims.data(), dims.size(), PyArray_DATA(array),
-                       size, &DelayedNumpyDecref, array);
-      inputs_safe.emplace_back(make_safe(tensor));
-    } else {
-      size_t size = 0;
-      void* encoded = nullptr;
-      Status s = EncodePyBytesArray(array, nelems, &size, &encoded);
-      if (!s.ok()) {
-        Set_TF_Status_from_Status(out_status, s);
-        return;
-      }
-      inputs_safe.emplace_back(
-          make_safe(TF_NewTensor(dtype, dims.data(), dims.size(), encoded, size,
-                                 [](void* data, size_t len, void* arg) {
-                                   delete[] reinterpret_cast<char*>(data);
-                                 },
-                                 array)));
     }
-    inputs_unsafe.push_back(inputs_safe.back().get());
     ++index;
   }
 
@@ -598,17 +533,17 @@ void TF_Run_wrapper_helper(TF_DeprecatedSession* session, const char* handle,
 
   // 4. We now own the fetched tensors, so set up a safe container to
   // delete them when we exit this scope.
-  Safe_TF_TensorVector tf_outputs_safe;
+  std::vector<Safe_TF_TensorPtr> tf_outputs_safe;
   for (const auto& output : outputs) {
     tf_outputs_safe.emplace_back(make_safe(output));
   }
 
   // 5. Convert the fetched tensors into numpy ndarrays. Store them in a safe
   // container so that we do not leak
-  Safe_PyObjectVector py_outputs_safe;
+  std::vector<Safe_PyObjectPtr> py_outputs_safe;
   for (size_t i = 0; i < output_names.size(); ++i) {
     PyObject* py_array;
-    s = TF_Tensor_to_PyObject(outputs[i], &py_array);
+    s = TFTensorToPyArray(std::move(tf_outputs_safe[i]), &py_array);
     if (!s.ok()) {
       Set_TF_Status_from_Status(out_status, s);
       return;
@@ -648,6 +583,14 @@ void TF_PRunSetup_wrapper(TF_DeprecatedSession* session,
       const_cast<const char**>(output_names.data()), output_names.size(),
       const_cast<const char**>(target_nodes.data()), target_nodes.size(),
       out_handle, out_status);
+  // TF_PRunSetup leaves out_handle undefined if it fails, but SWIG will call
+  // free(out_handle) on the returned handle regardless. Thus, must make sure it
+  // is valid.
+  if (TF_GetCode(out_status) != TF_OK) {
+    char* tmp = new char[1];
+    tmp[0] = '\0';
+    *out_handle = tmp;
+  }
   Py_END_ALLOW_THREADS;
 }
 
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index b36faf1f95fc77fe60546369fe29dd9811eb92ef..8cbacdd1dc97d249b9ba6cf55f00fb4812771f2d 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -37,14 +37,24 @@ typedef tensorflow::gtl::InlinedVector<const char*, 8> NameVector;
 // A PyObjectVector is a vector of borrowed pointers to PyObjects.
 typedef tensorflow::gtl::InlinedVector<PyObject*, 8> PyObjectVector;
 
-// Safe containers for (an) owned PyObject(s). On destruction, the
-// reference count of the contained object will be decremented.
+// A TF_TensorVector is a vector of borrowed pointers to TF_Tensors.
+typedef gtl::InlinedVector<TF_Tensor*, 8> TF_TensorVector;
+
+// Safe container for an owned PyObject. On destruction, the reference count of
+// the contained object will be decremented.
 inline void Py_DECREF_wrapper(PyObject* o) { Py_DECREF(o); }
+// Note: can't use decltype(&Py_DECREF_wrapper) due to SWIG
 typedef void (*Py_DECREF_wrapper_type)(PyObject*);
 typedef std::unique_ptr<PyObject, Py_DECREF_wrapper_type> Safe_PyObjectPtr;
-typedef std::vector<Safe_PyObjectPtr> Safe_PyObjectVector;
 Safe_PyObjectPtr make_safe(PyObject* o);
 
+// Safe containers for an owned TF_Tensor. On destruction, the tensor will be
+// deleted by TF_DeleteTensor.
+// Note: can't use decltype(&TF_DeleteTensor) due to SWIG
+typedef void (*TF_DeleteTensor_type)(TF_Tensor*);
+typedef std::unique_ptr<TF_Tensor, TF_DeleteTensor_type> Safe_TF_TensorPtr;
+Safe_TF_TensorPtr make_safe(TF_Tensor* tensor);
+
 // Run the graph associated with the session starting with the
 // supplied inputs[].  Regardless of success or failure, inputs[] are
 // stolen by the implementation (i.e. the implementation will
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 56dd7ceba520591f12fd71bfd89bb0a4033753dd..3074811733988fbe9f66c4ce6a708d7280579d83 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -63,6 +63,10 @@ py_library(
     name = "source_utils",
     srcs = ["lib/source_utils.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":profiling",
+        "//third_party/py/numpy",
+    ],
 )
 
 py_library(
@@ -111,6 +115,7 @@ py_library(
     srcs = ["cli/tensor_format.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":debug_data",
         ":debugger_cli_common",
         "//third_party/py/numpy",
     ],
@@ -142,6 +147,29 @@ py_library(
         ":debugger_cli_common",
         ":source_utils",
         ":ui_factory",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "profiling",
+    srcs = ["lib/profiling.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "profile_analyzer_cli",
+    srcs = ["cli/profile_analyzer_cli.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cli_shared",
+        ":command_parser",
+        ":debug_data",
+        ":debugger_cli_common",
+        ":profiling",
+        ":source_utils",
+        ":ui_factory",
+        "//tensorflow/python:util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -185,6 +213,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":base_ui",
+        ":cli_shared",
         ":command_parser",
         ":curses_widgets",
         ":debugger_cli_common",
@@ -240,6 +269,7 @@ py_library(
         ":debug_data",
         ":debugger_cli_common",
         ":framework",
+        ":profile_analyzer_cli",
         ":stepper_cli",
         ":ui_factory",
     ],
@@ -375,6 +405,7 @@ py_test(
         ":debug_utils",
         ":source_utils",
         "//tensorflow/python:client",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
@@ -422,13 +453,15 @@ py_test(
 )
 
 py_test(
-    name = "curses_widgets_test",
+    name = "profiling_test",
     size = "small",
-    srcs = ["cli/curses_widgets_test.py"],
+    srcs = ["lib/profiling_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":curses_widgets",
+        ":profiling",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
     ],
 )
 
@@ -588,6 +621,7 @@ cuda_py_test(
         ":debug_data",
         ":debug_utils",
         ":debugger_cli_common",
+        ":source_utils",
         "//third_party/py/numpy",
         "@six_archive//:six",
         "//tensorflow:tensorflow_py",
@@ -598,10 +632,30 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
 )
 
+py_test(
+    name = "profile_analyzer_cli_test",
+    size = "small",
+    srcs = [
+        "cli/profile_analyzer_cli_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":command_parser",
+        ":profile_analyzer_cli",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "stepper_cli_test",
     size = "small",
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index d4a84f62cde39143e40b411013968f5c0c4f9896..750d21f80d318a16c571aa7438e25115b6912bfc 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -25,6 +25,8 @@ See the @{$python/tfdbg} guide.
 @@has_inf_or_nan
 @@DumpingDebugHook
 @@DumpingDebugWrapperSession
+@@GrpcDebugHook
+@@GrpcDebugWrapperSession
 @@LocalCLIDebugHook
 @@LocalCLIDebugWrapperSession
 @@WatchOptions
@@ -46,7 +48,9 @@ from tensorflow.python.debug.lib.debug_utils import watch_graph_with_blacklists
 
 from tensorflow.python.debug.wrappers.dumping_wrapper import DumpingDebugWrapperSession
 from tensorflow.python.debug.wrappers.framework import WatchOptions
+from tensorflow.python.debug.wrappers.grpc_wrapper import GrpcDebugWrapperSession
 from tensorflow.python.debug.wrappers.hooks import DumpingDebugHook
+from tensorflow.python.debug.wrappers.hooks import GrpcDebugHook
 from tensorflow.python.debug.wrappers.hooks import LocalCLIDebugHook
 from tensorflow.python.debug.wrappers.local_cli_wrapper import LocalCLIDebugWrapperSession
 
diff --git a/tensorflow/python/debug/cli/analyzer_cli.py b/tensorflow/python/debug/cli/analyzer_cli.py
index 0c8004e25453da52573f9737aec9187ab319ff67..da27f4cebeaf6e7d0a1db2b74245b45279066a3d 100644
--- a/tensorflow/python/debug/cli/analyzer_cli.py
+++ b/tensorflow/python/debug/cli/analyzer_cli.py
@@ -27,7 +27,6 @@ import argparse
 import copy
 import re
 
-import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.debug.cli import cli_shared
@@ -345,12 +344,31 @@ class DebugAnalyzer(object):
         help="Print source beginning at line number (1-based.)")
     self._arg_parsers["print_source"] = ap
 
+    # Parser for list_source.
+    ap = argparse.ArgumentParser(
+        description="List source files responsible for constructing nodes and "
+        "tensors present in the run().",
+        usage=argparse.SUPPRESS)
+    ap.add_argument(
+        "-p",
+        "--path_filter",
+        type=str,
+        default="",
+        help="Regular expression filter for file path.")
+    ap.add_argument(
+        "-n",
+        "--node_name_filter",
+        type=str,
+        default="",
+        help="Regular expression filter for node name.")
+    self._arg_parsers["list_source"] = ap
+
     # TODO(cais): Implement list_nodes.
 
   def add_tensor_filter(self, filter_name, filter_callable):
     """Add a tensor filter.
 
-    A tensor filter is a named callable of the siganture:
+    A tensor filter is a named callable of the signature:
       filter_callable(dump_datum, tensor),
 
     wherein dump_datum is an instance of debug_data.DebugTensorDatum carrying
@@ -979,6 +997,15 @@ class DebugAnalyzer(object):
 
     return output
 
+  def _reconstruct_print_source_command(self,
+                                        parsed,
+                                        line_begin_decrease=0,
+                                        max_elements_per_line_increase=0):
+    return "ps %s %s -b %d -m %d" % (
+        parsed.source_file_path, "-t" if parsed.tensors else "",
+        max(parsed.line_begin - line_begin_decrease, 1),
+        parsed.max_elements_per_line + max_elements_per_line_increase)
+
   def print_source(self, args, screen_info=None):
     """Print the content of a source file."""
     del screen_info  # Unused.
@@ -991,21 +1018,25 @@ class DebugAnalyzer(object):
         do_dumped_tensors=parsed.tensors,
         min_line=parsed.line_begin)
 
-    with open(parsed.source_file_path, "rU") as f:
-      source_text = f.read()
-
-    source_lines = source_text.split("\n")
-    num_lines = len(source_lines)
-    line_num_width = int(np.ceil(np.log10(num_lines))) + 3
+    source_lines, line_num_width = source_utils.load_source(
+        parsed.source_file_path)
 
     labeled_source_lines = []
     if parsed.line_begin > 1:
-      labeled_source_lines.append(
-          RL("(... Omitted %d source lines ...)" % (parsed.line_begin - 1),
-             "bold"))
+      omitted_info_line = RL(
+          "(... Omitted %d source lines ...) " % (parsed.line_begin - 1),
+          "bold")
+      omitted_info_line += RL(
+          "+5",
+          debugger_cli_common.MenuItem(
+              None,
+              self._reconstruct_print_source_command(
+                  parsed, line_begin_decrease=5)))
+      labeled_source_lines.append(omitted_info_line)
 
     for i, line in enumerate(source_lines[parsed.line_begin - 1:]):
-      annotated_line = RL("L%d" % (i + parsed.line_begin), "yellow")
+      annotated_line = RL("L%d" % (i + parsed.line_begin),
+                          cli_shared.COLOR_YELLOW)
       annotated_line += " " * (line_num_width - len(annotated_line))
       annotated_line += line
       labeled_source_lines.append(annotated_line)
@@ -1014,11 +1045,19 @@ class DebugAnalyzer(object):
         sorted_elements = sorted(source_annotation[i + parsed.line_begin])
         for k, element in enumerate(sorted_elements):
           if k >= parsed.max_elements_per_line:
-            labeled_source_lines.append(
-                "    (... Omitted %d of %d %s ...)" % (
-                    len(sorted_elements) - parsed.max_elements_per_line,
-                    len(sorted_elements),
-                    "tensor(s)" if parsed.tensors else "op(s)"))
+            # TODO(cais): Replace this accordion pattern with the easier-to-use
+            # INIT_SCROLL_POS_KEY.
+            omitted_info_line = RL("    (... Omitted %d of %d %s ...) " % (
+                len(sorted_elements) - parsed.max_elements_per_line,
+                len(sorted_elements),
+                "tensor(s)" if parsed.tensors else "op(s)"))
+            omitted_info_line += RL(
+                "+5",
+                debugger_cli_common.MenuItem(
+                    None,
+                    self._reconstruct_print_source_command(
+                        parsed, max_elements_per_line_increase=5)))
+            labeled_source_lines.append(omitted_info_line)
             break
 
           label = RL(" " * 4)
@@ -1026,7 +1065,7 @@ class DebugAnalyzer(object):
               debug_data.get_node_name(element)):
             attribute = debugger_cli_common.MenuItem("", "pt %s" % element)
           else:
-            attribute = "blue"
+            attribute = cli_shared.COLOR_BLUE
 
           label += RL(element, attribute)
           labeled_source_lines.append(label)
@@ -1036,6 +1075,109 @@ class DebugAnalyzer(object):
     _add_main_menu(output, node_name=None)
     return output
 
+  def _make_source_table(self, source_list, is_tf_py_library):
+    """Make a table summarizing the source files that create nodes and tensors.
+
+    Args:
+      source_list: List of source files and related information as a list of
+        tuples (file_path, is_tf_library, num_nodes, num_tensors, num_dumps,
+        first_line).
+      is_tf_py_library: (`bool`) whether this table is for files that belong
+        to the TensorFlow Python library.
+
+    Returns:
+      The table as a `debugger_cli_common.RichTextLines` object.
+    """
+    path_head = "Source file path"
+    num_nodes_head = "#(nodes)"
+    num_tensors_head = "#(tensors)"
+    num_dumps_head = "#(tensor dumps)"
+
+    if is_tf_py_library:
+      # Use color to mark files that are guessed to belong to TensorFlow Python
+      # library.
+      color = cli_shared.COLOR_GRAY
+      lines = [RL("TensorFlow Python library file(s):", color)]
+    else:
+      color = cli_shared.COLOR_WHITE
+      lines = [RL("File(s) outside TensorFlow Python library:", color)]
+
+    if not source_list:
+      lines.append(RL("[No files.]"))
+      lines.append(RL())
+      return debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
+
+    path_column_width = max(
+        max([len(item[0]) for item in source_list]), len(path_head)) + 1
+    num_nodes_column_width = max(
+        max([len(str(item[2])) for item in source_list]),
+        len(num_nodes_head)) + 1
+    num_tensors_column_width = max(
+        max([len(str(item[3])) for item in source_list]),
+        len(num_tensors_head)) + 1
+
+    head = RL(path_head + " " * (path_column_width - len(path_head)), color)
+    head += RL(num_nodes_head + " " * (
+        num_nodes_column_width - len(num_nodes_head)), color)
+    head += RL(num_tensors_head + " " * (
+        num_tensors_column_width - len(num_tensors_head)), color)
+    head += RL(num_dumps_head, color)
+
+    lines.append(head)
+
+    for (file_path, _, num_nodes, num_tensors, num_dumps,
+         first_line_num) in source_list:
+      path_attributes = [color]
+      if source_utils.is_extension_uncompiled_python_source(file_path):
+        path_attributes.append(
+            debugger_cli_common.MenuItem(None, "ps %s -b %d" %
+                                         (file_path, first_line_num)))
+
+      line = RL(file_path, path_attributes)
+      line += " " * (path_column_width - len(line))
+      line += RL(
+          str(num_nodes) + " " * (num_nodes_column_width - len(str(num_nodes))),
+          color)
+      line += RL(
+          str(num_tensors) + " " *
+          (num_tensors_column_width - len(str(num_tensors))), color)
+      line += RL(str(num_dumps), color)
+      lines.append(line)
+    lines.append(RL())
+
+    return debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
+
+  def list_source(self, args, screen_info=None):
+    """List Python source files that constructed nodes and tensors."""
+    del screen_info  # Unused.
+
+    parsed = self._arg_parsers["list_source"].parse_args(args)
+    source_list = source_utils.list_source_files_against_dump(
+        self._debug_dump,
+        path_regex_whitelist=parsed.path_filter,
+        node_name_regex_whitelist=parsed.node_name_filter)
+
+    top_lines = [
+        RL("List of source files that created nodes in this run", "bold")]
+    if parsed.path_filter:
+      top_lines.append(
+          RL("File path regex filter: \"%s\"" % parsed.path_filter))
+    if parsed.node_name_filter:
+      top_lines.append(
+          RL("Node name regex filter: \"%s\"" % parsed.node_name_filter))
+    top_lines.append(RL())
+    output = debugger_cli_common.rich_text_lines_from_rich_line_list(top_lines)
+    if not source_list:
+      output.append("[No source file information.]")
+      return output
+
+    output.extend(self._make_source_table(
+        [item for item in source_list if not item[1]], False))
+    output.extend(self._make_source_table(
+        [item for item in source_list if item[1]], True))
+    _add_main_menu(output, node_name=None)
+    return output
+
   def _list_inputs_or_outputs(self,
                               recursive,
                               node_name,
@@ -1395,6 +1537,11 @@ def create_analyzer_ui(debug_dump,
       analyzer.print_source,
       analyzer.get_help("print_source"),
       prefix_aliases=["ps"])
+  cli.register_command_handler(
+      "list_source",
+      analyzer.list_source,
+      analyzer.get_help("list_source"),
+      prefix_aliases=["ls"])
 
   dumped_tensor_names = []
   for datum in debug_dump.dumped_tensor_data:
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index bb2d72e2e4beb23cad9789687347897c87fedad2..8b191f332e8975e25977421fa2c55d7611875410 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import os
 import shutil
 import tempfile
@@ -28,10 +27,12 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.cli import analyzer_cli
+from tensorflow.python.debug.cli import cli_shared
 from tensorflow.python.debug.cli import command_parser
 from tensorflow.python.debug.cli import debugger_cli_common
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_utils
+from tensorflow.python.debug.lib import source_utils
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
@@ -39,10 +40,11 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
 
 
 def line_number_above():
-  return inspect.stack()[1][2] - 1
+  return tf_inspect.stack()[1][2] - 1
 
 
 def parse_op_and_node(line):
@@ -501,7 +503,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
       cls._main_device = "/job:localhost/replica:0/task:0/cpu:0"
 
     cls._curr_file_path = os.path.abspath(
-        inspect.getfile(inspect.currentframe()))
+        tf_inspect.getfile(tf_inspect.currentframe()))
 
     cls._sess = session.Session()
     with cls._sess as sess:
@@ -569,6 +571,11 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         cls._analyzer.print_source,
         cls._analyzer.get_help("print_source"),
         prefix_aliases=["ps"])
+    cls._registry.register_command_handler(
+        "list_source",
+        cls._analyzer.list_source,
+        cls._analyzer.get_help("list_source"),
+        prefix_aliases=["ls"])
 
   @classmethod
   def tearDownClass(cls):
@@ -906,7 +913,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         ["ERROR: There is no node named \"bar\" in the partition graphs"],
         out.lines)
     # Check color indicating error.
-    self.assertEqual({0: [(0, 59, "red")]}, out.font_attr_segs)
+    self.assertEqual({0: [(0, 59, cli_shared.COLOR_RED)]}, out.font_attr_segs)
     check_main_menu(self, out, list_tensors_enabled=True)
 
   def testPrintTensor(self):
@@ -1172,7 +1179,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
                      out.font_attr_segs[index + 1][0][2].content)
     # simple_mul_add/u/Assign is not used in this run because the Variable has
     # already been initialized.
-    self.assertEqual("blue", out.font_attr_segs[index + 2][0][2])
+    self.assertEqual(cli_shared.COLOR_BLUE, out.font_attr_segs[index + 2][0][2])
     self.assertEqual("pt simple_mul_add/u/read",
                      out.font_attr_segs[index + 3][0][2].content)
 
@@ -1234,6 +1241,12 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         screen_info={"cols": 80})
 
     self.assertIn("Omitted 2 source lines", out.lines[0])
+    self.assertTrue(out.lines[0].endswith("+5"))
+    expand_lines_command = out.font_attr_segs[0][-1][2].content
+    self.assertStartsWith(expand_lines_command,
+                          "ps %s " % self._curr_file_path)
+    self.assertIn("-b 1", expand_lines_command)
+
     self.assertIsNone(self._findSourceLine(out, 1))
     self.assertIsNone(self._findSourceLine(out, 2))
     self.assertIsNotNone(self._findSourceLine(out, 3))
@@ -1250,7 +1263,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
                      out.font_attr_segs[index + 1][0][2].content)
     # simple_mul_add/u/Assign is not used in this run because the Variable has
     # already been initialized.
-    self.assertEqual("blue", out.font_attr_segs[index + 2][0][2])
+    self.assertEqual(cli_shared.COLOR_BLUE, out.font_attr_segs[index + 2][0][2])
     self.assertEqual("pt simple_mul_add/u/read",
                      out.font_attr_segs[index + 3][0][2].content)
 
@@ -1266,10 +1279,108 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
         ["L%d         u = variables.Variable(u_init, name=u_name)" %
          self._u_line_number,
          "    simple_mul_add/u",
-         "    (... Omitted 2 of 3 op(s) ...)"],
+         "    (... Omitted 2 of 3 op(s) ...) +5"],
         out.lines[index : index + 3])
     self.assertEqual("pt simple_mul_add/u",
                      out.font_attr_segs[index + 1][0][2].content)
+    more_elements_command = out.font_attr_segs[index + 2][-1][2].content
+    self.assertStartsWith(more_elements_command,
+                          "ps %s " % self._curr_file_path)
+    self.assertIn(" -m 6", more_elements_command)
+
+  def testListSourceWorks(self):
+    self._debug_dump.set_python_graph(self._sess.graph)
+    out = self._registry.dispatch_command("list_source", [])
+
+    non_tf_lib_files_start = [
+        i for i in xrange(len(out.lines))
+        if out.lines[i].startswith("Source file path")][0] + 1
+    non_tf_lib_files_end = [
+        i for i in xrange(len(out.lines))
+        if out.lines[i].startswith("TensorFlow Python library file(s):")][0] - 1
+    non_tf_lib_files = [
+        line.split(" ")[0] for line
+        in out.lines[non_tf_lib_files_start : non_tf_lib_files_end]]
+    self.assertIn(self._curr_file_path, non_tf_lib_files)
+
+    # Check that the TF library files are marked with special color attribute.
+    for i in xrange(non_tf_lib_files_end + 1, len(out.lines)):
+      if not out.lines[i]:
+        continue
+      for attr_seg in  out.font_attr_segs[i]:
+        self.assertTrue(cli_shared.COLOR_GRAY in attr_seg[2] or
+                        attr_seg[2] == cli_shared.COLOR_GRAY)
+
+  def testListSourceWithNodeNameFilterWithMatchesWorks(self):
+    self._debug_dump.set_python_graph(self._sess.graph)
+    out = self._registry.dispatch_command("list_source", ["-n", ".*/read"])
+
+    self.assertStartsWith(out.lines[1], "Node name regex filter: \".*/read\"")
+
+    non_tf_lib_files_start = [
+        i for i in xrange(len(out.lines))
+        if out.lines[i].startswith("Source file path")][0] + 1
+    non_tf_lib_files_end = [
+        i for i in xrange(len(out.lines))
+        if out.lines[i].startswith("TensorFlow Python library file(s):")][0] - 1
+    non_tf_lib_files = [
+        line.split(" ")[0] for line
+        in out.lines[non_tf_lib_files_start : non_tf_lib_files_end]]
+    self.assertIn(self._curr_file_path, non_tf_lib_files)
+
+    # Check that the TF library files are marked with special color attribute.
+    for i in xrange(non_tf_lib_files_end + 1, len(out.lines)):
+      if not out.lines[i]:
+        continue
+      for attr_seg in  out.font_attr_segs[i]:
+        self.assertTrue(cli_shared.COLOR_GRAY in attr_seg[2] or
+                        attr_seg[2] == cli_shared.COLOR_GRAY)
+
+  def testListSourceWithNodeNameFilterWithNoMatchesWorks(self):
+    self._debug_dump.set_python_graph(self._sess.graph)
+    out = self._registry.dispatch_command("list_source", ["-n", "^$"])
+
+    self.assertEqual([
+        "List of source files that created nodes in this run",
+        "Node name regex filter: \"^$\"", "",
+        "[No source file information.]"], out.lines)
+
+  def testListSourceWithPathAndNodeNameFiltersWorks(self):
+    self._debug_dump.set_python_graph(self._sess.graph)
+    out = self._registry.dispatch_command(
+        "list_source", ["-p", self._curr_file_path, "-n", ".*read"])
+
+    self.assertEqual([
+        "List of source files that created nodes in this run",
+        "File path regex filter: \"%s\"" % self._curr_file_path,
+        "Node name regex filter: \".*read\"", ""], out.lines[:4])
+
+  def testListSourceWithCompiledPythonSourceWorks(self):
+    def fake_list_source_files_against_dump(dump,
+                                            path_regex_whitelist=None,
+                                            node_name_regex_whitelist=None):
+      del dump, path_regex_whitelist, node_name_regex_whitelist
+      return [("compiled_1.pyc", False, 10, 20, 30, 4),
+              ("compiled_2.pyo", False, 10, 20, 30, 5),
+              ("uncompiled.py", False, 10, 20, 30, 6)]
+
+    with test.mock.patch.object(
+        source_utils, "list_source_files_against_dump",
+        side_effect=fake_list_source_files_against_dump):
+      out = self._registry.dispatch_command("list_source", [])
+
+      self.assertStartsWith(out.lines[4], "compiled_1.pyc")
+      self.assertEqual((0, 14, [cli_shared.COLOR_WHITE]),
+                       out.font_attr_segs[4][0])
+      self.assertStartsWith(out.lines[5], "compiled_2.pyo")
+      self.assertEqual((0, 14, [cli_shared.COLOR_WHITE]),
+                       out.font_attr_segs[5][0])
+      self.assertStartsWith(out.lines[6], "uncompiled.py")
+      self.assertEqual(0, out.font_attr_segs[6][0][0])
+      self.assertEqual(13, out.font_attr_segs[6][0][1])
+      self.assertEqual(cli_shared.COLOR_WHITE, out.font_attr_segs[6][0][2][0])
+      self.assertEqual("ps uncompiled.py -b 6",
+                       out.font_attr_segs[6][0][2][1].content)
 
 
 class AnalyzerCLIPrintLargeTensorTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/debug/cli/cli_shared.py b/tensorflow/python/debug/cli/cli_shared.py
index b195347950285d82f8faa92ca0e576c130b088d9..9164e18bcf582d55502b0f9530dfa51ea416b00e 100644
--- a/tensorflow/python/debug/cli/cli_shared.py
+++ b/tensorflow/python/debug/cli/cli_shared.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import math
+
 import numpy as np
 import six
 
@@ -32,6 +34,21 @@ RL = debugger_cli_common.RichLine
 # when printing the value of the tensor.
 DEFAULT_NDARRAY_DISPLAY_THRESHOLD = 2000
 
+COLOR_BLACK = "black"
+COLOR_BLUE = "blue"
+COLOR_CYAN = "cyan"
+COLOR_GRAY = "gray"
+COLOR_GREEN = "green"
+COLOR_MAGENTA = "magenta"
+COLOR_RED = "red"
+COLOR_WHITE = "white"
+COLOR_YELLOW = "yellow"
+
+TIME_UNIT_US = "us"
+TIME_UNIT_MS = "ms"
+TIME_UNIT_S = "s"
+TIME_UNITS = [TIME_UNIT_US, TIME_UNIT_MS, TIME_UNIT_S]
+
 
 def bytes_to_readable_str(num_bytes, include_b=False):
   """Generate a human-readable string representing number of bytes.
@@ -63,6 +80,34 @@ def bytes_to_readable_str(num_bytes, include_b=False):
   return result
 
 
+def time_to_readable_str(value_us, force_time_unit=None):
+  """Convert time value to human-readable string.
+
+  Args:
+    value_us: time value in microseconds.
+    force_time_unit: force the output to use the specified time unit. Must be
+      in TIME_UNITS.
+
+  Returns:
+    Human-readable string representation of the time value.
+
+  Raises:
+    ValueError: if force_time_unit value is not in TIME_UNITS.
+  """
+  if not value_us:
+    return "0"
+  if force_time_unit:
+    if force_time_unit not in TIME_UNITS:
+      raise ValueError("Invalid time unit: %s" % force_time_unit)
+    order = TIME_UNITS.index(force_time_unit)
+    time_unit = force_time_unit
+    return "{:.10g}{}".format(value_us / math.pow(10.0, 3*order), time_unit)
+  else:
+    order = min(len(TIME_UNITS) - 1, int(math.log(value_us, 10) / 3))
+    time_unit = TIME_UNITS[order]
+    return "{:.3g}{}".format(value_us / math.pow(10.0, 3*order), time_unit)
+
+
 def parse_ranges_highlight(ranges_string):
   """Process ranges highlight string.
 
@@ -154,7 +199,7 @@ def error(msg):
   """
 
   return debugger_cli_common.rich_text_lines_from_rich_line_list([
-      RL("ERROR: " + msg, "red")])
+      RL("ERROR: " + msg, COLOR_RED)])
 
 
 def _get_fetch_name(fetch):
diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py
index 1ef3c3425460b394d7ccc1d52fe11246cf7b0ef1..647bbd5f0f29591a641028342d64b2786e37610c 100644
--- a/tensorflow/python/debug/cli/cli_shared_test.py
+++ b/tensorflow/python/debug/cli/cli_shared_test.py
@@ -70,6 +70,41 @@ class BytesToReadableStrTest(test_util.TensorFlowTestCase):
             1024**3, include_b=True))
 
 
+class TimeToReadableStrTest(test_util.TensorFlowTestCase):
+
+  def testNoneTimeWorks(self):
+    self.assertEqual("0", cli_shared.time_to_readable_str(None))
+
+  def testMicrosecondsTime(self):
+    self.assertEqual("40us", cli_shared.time_to_readable_str(40))
+
+  def testMillisecondTime(self):
+    self.assertEqual("40ms", cli_shared.time_to_readable_str(40e3))
+
+  def testSecondTime(self):
+    self.assertEqual("40s", cli_shared.time_to_readable_str(40e6))
+
+  def testForceTimeUnit(self):
+    self.assertEqual("40s",
+                     cli_shared.time_to_readable_str(
+                         40e6, force_time_unit=cli_shared.TIME_UNIT_S))
+    self.assertEqual("40000ms",
+                     cli_shared.time_to_readable_str(
+                         40e6, force_time_unit=cli_shared.TIME_UNIT_MS))
+    self.assertEqual("40000000us",
+                     cli_shared.time_to_readable_str(
+                         40e6, force_time_unit=cli_shared.TIME_UNIT_US))
+    self.assertEqual("4e-05s",
+                     cli_shared.time_to_readable_str(
+                         40, force_time_unit=cli_shared.TIME_UNIT_S))
+    self.assertEqual("0",
+                     cli_shared.time_to_readable_str(
+                         0, force_time_unit=cli_shared.TIME_UNIT_S))
+
+    with self.assertRaisesRegexp(ValueError, r"Invalid time unit: ks"):
+      cli_shared.time_to_readable_str(100, force_time_unit="ks")
+
+
 class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/cli/command_parser.py b/tensorflow/python/debug/cli/command_parser.py
index 68b0b9a8af973ca7eb9be87f0f073779a77c106d..143c1045199dc1e4f471d187219992557f2483c8 100644
--- a/tensorflow/python/debug/cli/command_parser.py
+++ b/tensorflow/python/debug/cli/command_parser.py
@@ -26,6 +26,31 @@ _BRACKETS_PATTERN = re.compile(r"\[[^\]]*\]")
 _QUOTES_PATTERN = re.compile(r"\"[^\"]*\"")
 _WHITESPACE_PATTERN = re.compile(r"\s+")
 
+_NUMBER_PATTERN = re.compile(r"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?")
+
+
+class Interval(object):
+  """Represents an interval between a start and end value."""
+
+  def __init__(self, start, start_included, end, end_included):
+    self.start = start
+    self.start_included = start_included
+    self.end = end
+    self.end_included = end_included
+
+  def contains(self, value):
+    if value < self.start or value == self.start and not self.start_included:
+      return False
+    if value > self.end or value == self.end and not self.end_included:
+      return False
+    return True
+
+  def __eq__(self, other):
+    return (self.start == other.start and
+            self.start_included == other.start_included and
+            self.end == other.end and
+            self.end_included == other.end_included)
+
 
 def parse_command(command):
   """Parse command string into a list of arguments.
@@ -91,15 +116,26 @@ def extract_output_file_path(args):
   if args and args[-1].endswith(">"):
     raise SyntaxError("Redirect file path is empty")
   elif args and args[-1].startswith(">"):
-    output_file_path = args[-1][1:]
-    args = args[:-1]
+    try:
+      _parse_interval(args[-1])
+      if len(args) > 1 and args[-2].startswith("-"):
+        output_file_path = None
+      else:
+        output_file_path = args[-1][1:]
+        args = args[:-1]
+    except ValueError:
+      output_file_path = args[-1][1:]
+      args = args[:-1]
   elif len(args) > 1 and args[-2] == ">":
     output_file_path = args[-1]
     args = args[:-2]
   elif args and args[-1].count(">") == 1:
     gt_index = args[-1].index(">")
-    output_file_path = args[-1][gt_index + 1:]
-    args[-1] = args[-1][:gt_index]
+    if gt_index > 0 and args[-1][gt_index - 1] == "=":
+      output_file_path = None
+    else:
+      output_file_path = args[-1][gt_index + 1:]
+      args[-1] = args[-1][:gt_index]
   elif len(args) > 1 and args[-2].endswith(">"):
     output_file_path = args[-1]
     args = args[:-1]
@@ -243,6 +279,131 @@ def parse_ranges(range_string):
   return ranges
 
 
+def parse_memory_interval(interval_str):
+  """Convert a human-readable memory interval to a tuple of start and end value.
+
+  Args:
+    interval_str: (`str`) A human-readable str representing an interval
+      (e.g., "[10kB, 20kB]", "<100M", ">100G"). Only the units "kB", "MB", "GB"
+      are supported. The "B character at the end of the input `str` may be
+      omitted.
+
+  Returns:
+    `Interval` object where start and end are in bytes.
+
+  Raises:
+    ValueError: if the input is not valid.
+  """
+  str_interval = _parse_interval(interval_str)
+  interval_start = 0
+  interval_end = float("inf")
+  if str_interval.start:
+    interval_start = parse_readable_size_str(str_interval.start)
+  if str_interval.end:
+    interval_end = parse_readable_size_str(str_interval.end)
+  if interval_start > interval_end:
+    raise ValueError(
+        "Invalid interval %s. Start of interval must be less than or equal "
+        "to end of interval." % interval_str)
+  return Interval(interval_start, str_interval.start_included,
+                  interval_end, str_interval.end_included)
+
+
+def parse_time_interval(interval_str):
+  """Convert a human-readable time interval to a tuple of start and end value.
+
+  Args:
+    interval_str: (`str`) A human-readable str representing an interval
+      (e.g., "[10us, 20us]", "<100s", ">100ms"). Supported time suffixes are
+      us, ms, s.
+
+  Returns:
+    `Interval` object where start and end are in microseconds.
+
+  Raises:
+    ValueError: if the input is not valid.
+  """
+  str_interval = _parse_interval(interval_str)
+  interval_start = 0
+  interval_end = float("inf")
+  if str_interval.start:
+    interval_start = parse_readable_time_str(str_interval.start)
+  if str_interval.end:
+    interval_end = parse_readable_time_str(str_interval.end)
+  if interval_start > interval_end:
+    raise ValueError(
+        "Invalid interval %s. Start must be before end of interval." %
+        interval_str)
+  return Interval(interval_start, str_interval.start_included,
+                  interval_end, str_interval.end_included)
+
+
+def _parse_interval(interval_str):
+  """Convert a human-readable interval to a tuple of start and end value.
+
+  Args:
+    interval_str: (`str`) A human-readable str representing an interval
+      (e.g., "[1M, 2M]", "<100k", ">100ms"). The items following the ">", "<",
+      ">=" and "<=" signs have to start with a number (e.g., 3.0, -2, .98).
+      The same requirement applies to the items in the parentheses or brackets.
+
+  Returns:
+    Interval object where start or end can be None
+    if the range is specified as "<N" or ">N" respectively.
+
+  Raises:
+    ValueError: if the input is not valid.
+  """
+  interval_str = interval_str.strip()
+  if interval_str.startswith("<="):
+    if _NUMBER_PATTERN.match(interval_str[2:].strip()):
+      return Interval(start=None, start_included=False,
+                      end=interval_str[2:].strip(), end_included=True)
+    else:
+      raise ValueError("Invalid value string after <= in '%s'" % interval_str)
+  if interval_str.startswith("<"):
+    if _NUMBER_PATTERN.match(interval_str[1:].strip()):
+      return Interval(start=None, start_included=False,
+                      end=interval_str[1:].strip(), end_included=False)
+    else:
+      raise ValueError("Invalid value string after < in '%s'" % interval_str)
+  if interval_str.startswith(">="):
+    if _NUMBER_PATTERN.match(interval_str[2:].strip()):
+      return Interval(start=interval_str[2:].strip(), start_included=True,
+                      end=None, end_included=False)
+    else:
+      raise ValueError("Invalid value string after >= in '%s'" % interval_str)
+  if interval_str.startswith(">"):
+    if _NUMBER_PATTERN.match(interval_str[1:].strip()):
+      return Interval(start=interval_str[1:].strip(), start_included=False,
+                      end=None, end_included=False)
+    else:
+      raise ValueError("Invalid value string after > in '%s'" % interval_str)
+
+  if (not interval_str.startswith(("[", "("))
+      or not interval_str.endswith(("]", ")"))):
+    raise ValueError(
+        "Invalid interval format: %s. Valid formats are: [min, max], "
+        "(min, max), <max, >min" % interval_str)
+  interval = interval_str[1:-1].split(",")
+  if len(interval) != 2:
+    raise ValueError(
+        "Incorrect interval format: %s. Interval should specify two values: "
+        "[min, max] or (min, max)." % interval_str)
+
+  start_item = interval[0].strip()
+  if not _NUMBER_PATTERN.match(start_item):
+    raise ValueError("Invalid first item in interval: '%s'" % start_item)
+  end_item = interval[1].strip()
+  if not _NUMBER_PATTERN.match(end_item):
+    raise ValueError("Invalid second item in interval: '%s'" % end_item)
+
+  return Interval(start=start_item,
+                  start_included=(interval_str[0] == "["),
+                  end=end_item,
+                  end_included=(interval_str[-1] == "]"))
+
+
 def parse_readable_size_str(size_str):
   """Convert a human-readable str representation to number of bytes.
 
@@ -277,6 +438,34 @@ def parse_readable_size_str(size_str):
                      size_str)
 
 
+def parse_readable_time_str(time_str):
+  """Parses a time string in the format N, Nus, Nms, Ns.
+
+  Args:
+    time_str: (`str`) string consisting of an integer time value optionally
+      followed by 'us', 'ms', or 's' suffix. If suffix is not specified,
+      value is assumed to be in microseconds. (e.g. 100us, 8ms, 5s, 100).
+
+  Returns:
+    Microseconds value.
+  """
+  def parse_positive_float(value_str):
+    value = float(value_str)
+    if value < 0:
+      raise ValueError(
+          "Invalid time %s. Time value must be positive." % value_str)
+    return value
+
+  time_str = time_str.strip()
+  if time_str.endswith("us"):
+    return int(parse_positive_float(time_str[:-2]))
+  elif time_str.endswith("ms"):
+    return int(parse_positive_float(time_str[:-2]) * 1e3)
+  elif time_str.endswith("s"):
+    return int(parse_positive_float(time_str[:-1]) * 1e6)
+  return int(parse_positive_float(time_str))
+
+
 def evaluate_tensor_slice(tensor, tensor_slicing):
   """Call eval on the slicing of a tensor, with validation.
 
diff --git a/tensorflow/python/debug/cli/command_parser_test.py b/tensorflow/python/debug/cli/command_parser_test.py
index 3f8b8744c320a3b35bb7b9919b39507865f756ad..1ea890be8c9747e3b3d6b78f9e76f5ed6741fe03 100644
--- a/tensorflow/python/debug/cli/command_parser_test.py
+++ b/tensorflow/python/debug/cli/command_parser_test.py
@@ -132,6 +132,63 @@ class ExtractOutputFilePathTest(test_util.TensorFlowTestCase):
     self.assertEqual(["pt", "a:0"], args)
     self.assertEqual(output_path, "/tmp/foo.txt")
 
+  def testFlagWithEqualGreaterThanShouldIgnoreIntervalFlags(self):
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--execution_time=>100ms"])
+    self.assertEqual(["lp", "--execution_time=>100ms"], args)
+    self.assertIsNone(output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--execution_time", ">1.2s"])
+    self.assertEqual(["lp", "--execution_time", ">1.2s"], args)
+    self.assertIsNone(output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "-e", ">1200"])
+    self.assertEqual(["lp", "-e", ">1200"], args)
+    self.assertIsNone(output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--foo_value", ">-.2MB"])
+    self.assertEqual(["lp", "--foo_value", ">-.2MB"], args)
+    self.assertIsNone(output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--bar_value", ">-42e3GB"])
+    self.assertEqual(["lp", "--bar_value", ">-42e3GB"], args)
+    self.assertIsNone(output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--execution_time", ">=100ms"])
+    self.assertEqual(["lp", "--execution_time", ">=100ms"], args)
+    self.assertIsNone(output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--execution_time=>=100ms"])
+    self.assertEqual(["lp", "--execution_time=>=100ms"], args)
+    self.assertIsNone(output_path)
+
+  def testFlagWithEqualGreaterThanShouldRecognizeFilePaths(self):
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", ">1.2s"])
+    self.assertEqual(["lp"], args)
+    self.assertEqual("1.2s", output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--execution_time", ">x.yms"])
+    self.assertEqual(["lp", "--execution_time"], args)
+    self.assertEqual("x.yms", output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--memory", ">a.1kB"])
+    self.assertEqual(["lp", "--memory"], args)
+    self.assertEqual("a.1kB", output_path)
+
+    args, output_path = command_parser.extract_output_file_path(
+        ["lp", "--memory", ">e002MB"])
+    self.assertEqual(["lp", "--memory"], args)
+    self.assertEqual("e002MB", output_path)
+
   def testOneArgumentIsHandledCorrectly(self):
     args, output_path = command_parser.extract_output_file_path(["lt"])
     self.assertEqual(["lt"], args)
@@ -297,5 +354,161 @@ class ParseReadableSizeStrTest(test_util.TensorFlowTestCase):
       command_parser.parse_readable_size_str("2EB")
 
 
+class ParseReadableTimeStrTest(test_util.TensorFlowTestCase):
+
+  def testParseNoUnitWorks(self):
+    self.assertEqual(0, command_parser.parse_readable_time_str("0"))
+    self.assertEqual(100, command_parser.parse_readable_time_str("100 "))
+    self.assertEqual(25, command_parser.parse_readable_time_str(" 25 "))
+
+  def testParseSeconds(self):
+    self.assertEqual(1e6, command_parser.parse_readable_time_str("1 s"))
+    self.assertEqual(2e6, command_parser.parse_readable_time_str("2s"))
+
+  def testParseMicros(self):
+    self.assertEqual(2, command_parser.parse_readable_time_str("2us"))
+
+  def testParseMillis(self):
+    self.assertEqual(2e3, command_parser.parse_readable_time_str("2ms"))
+
+  def testParseUnsupportedUnitRaisesException(self):
+    with self.assertRaisesRegexp(
+        ValueError, r".*float.*2us.*"):
+      command_parser.parse_readable_time_str("2uss")
+
+    with self.assertRaisesRegexp(
+        ValueError, r".*float.*2m.*"):
+      command_parser.parse_readable_time_str("2m")
+
+    with self.assertRaisesRegexp(
+        ValueError, r"Invalid time -1. Time value must be positive."):
+      command_parser.parse_readable_time_str("-1s")
+
+
+class ParseInterval(test_util.TensorFlowTestCase):
+
+  def testParseTimeInterval(self):
+    self.assertEquals(
+        command_parser.Interval(10, True, 1e3, True),
+        command_parser.parse_time_interval("[10us, 1ms]"))
+    self.assertEquals(
+        command_parser.Interval(10, False, 1e3, False),
+        command_parser.parse_time_interval("(10us, 1ms)"))
+    self.assertEquals(
+        command_parser.Interval(10, False, 1e3, True),
+        command_parser.parse_time_interval("(10us, 1ms]"))
+    self.assertEquals(
+        command_parser.Interval(10, True, 1e3, False),
+        command_parser.parse_time_interval("[10us, 1ms)"))
+    self.assertEquals(command_parser.Interval(0, False, 1e3, True),
+                      command_parser.parse_time_interval("<=1ms"))
+    self.assertEquals(
+        command_parser.Interval(1e3, True, float("inf"), False),
+        command_parser.parse_time_interval(">=1ms"))
+    self.assertEquals(command_parser.Interval(0, False, 1e3, False),
+                      command_parser.parse_time_interval("<1ms"))
+    self.assertEquals(
+        command_parser.Interval(1e3, False, float("inf"), False),
+        command_parser.parse_time_interval(">1ms"))
+
+  def testParseTimeGreaterLessThanWithInvalidValueStrings(self):
+    with self.assertRaisesRegexp(ValueError, "Invalid value string after >= "):
+      command_parser.parse_time_interval(">=wms")
+    with self.assertRaisesRegexp(ValueError, "Invalid value string after > "):
+      command_parser.parse_time_interval(">Yms")
+    with self.assertRaisesRegexp(ValueError, "Invalid value string after <= "):
+      command_parser.parse_time_interval("<= _ms")
+    with self.assertRaisesRegexp(ValueError, "Invalid value string after < "):
+      command_parser.parse_time_interval("<-ms")
+
+  def testParseTimeIntervalsWithInvalidValueStrings(self):
+    with self.assertRaisesRegexp(ValueError, "Invalid first item in interval:"):
+      command_parser.parse_time_interval("[wms, 10ms]")
+    with self.assertRaisesRegexp(ValueError,
+                                 "Invalid second item in interval:"):
+      command_parser.parse_time_interval("[ 0ms, _ms]")
+    with self.assertRaisesRegexp(ValueError, "Invalid first item in interval:"):
+      command_parser.parse_time_interval("(xms, _ms]")
+    with self.assertRaisesRegexp(ValueError, "Invalid first item in interval:"):
+      command_parser.parse_time_interval("((3ms, _ms)")
+
+  def testInvalidTimeIntervalRaisesException(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Invalid interval format: \[10us, 1ms. Valid formats are: "
+        r"\[min, max\], \(min, max\), <max, >min"):
+      command_parser.parse_time_interval("[10us, 1ms")
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Incorrect interval format: \[10us, 1ms, 2ms\]. Interval should "
+        r"specify two values: \[min, max\] or \(min, max\)"):
+      command_parser.parse_time_interval("[10us, 1ms, 2ms]")
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Invalid interval \[1s, 1ms\]. Start must be before end of interval."):
+      command_parser.parse_time_interval("[1s, 1ms]")
+
+  def testParseMemoryInterval(self):
+    self.assertEquals(
+        command_parser.Interval(1024, True, 2048, True),
+        command_parser.parse_memory_interval("[1k, 2k]"))
+    self.assertEquals(
+        command_parser.Interval(1024, False, 2048, False),
+        command_parser.parse_memory_interval("(1kB, 2kB)"))
+    self.assertEquals(
+        command_parser.Interval(1024, False, 2048, True),
+        command_parser.parse_memory_interval("(1k, 2k]"))
+    self.assertEquals(
+        command_parser.Interval(1024, True, 2048, False),
+        command_parser.parse_memory_interval("[1k, 2k)"))
+    self.assertEquals(
+        command_parser.Interval(0, False, 2048, True),
+        command_parser.parse_memory_interval("<=2k"))
+    self.assertEquals(
+        command_parser.Interval(11, True, float("inf"), False),
+        command_parser.parse_memory_interval(">=11"))
+    self.assertEquals(command_parser.Interval(0, False, 2048, False),
+                      command_parser.parse_memory_interval("<2k"))
+    self.assertEquals(
+        command_parser.Interval(11, False, float("inf"), False),
+        command_parser.parse_memory_interval(">11"))
+
+  def testParseMemoryIntervalsWithInvalidValueStrings(self):
+    with self.assertRaisesRegexp(ValueError, "Invalid value string after >= "):
+      command_parser.parse_time_interval(">=wM")
+    with self.assertRaisesRegexp(ValueError, "Invalid value string after > "):
+      command_parser.parse_time_interval(">YM")
+    with self.assertRaisesRegexp(ValueError, "Invalid value string after <= "):
+      command_parser.parse_time_interval("<= _MB")
+    with self.assertRaisesRegexp(ValueError, "Invalid value string after < "):
+      command_parser.parse_time_interval("<-MB")
+
+  def testInvalidMemoryIntervalRaisesException(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Invalid interval \[5k, 3k\]. Start of interval must be less than or "
+        "equal to end of interval."):
+      command_parser.parse_memory_interval("[5k, 3k]")
+
+  def testIntervalContains(self):
+    interval = command_parser.Interval(
+        start=1, start_included=True, end=10, end_included=True)
+    self.assertTrue(interval.contains(1))
+    self.assertTrue(interval.contains(10))
+    self.assertTrue(interval.contains(5))
+
+    interval.start_included = False
+    self.assertFalse(interval.contains(1))
+    self.assertTrue(interval.contains(10))
+
+    interval.end_included = False
+    self.assertFalse(interval.contains(1))
+    self.assertFalse(interval.contains(10))
+
+    interval.start_included = True
+    self.assertTrue(interval.contains(1))
+    self.assertFalse(interval.contains(10))
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/cli/curses_ui.py b/tensorflow/python/debug/cli/curses_ui.py
index d8d3bce3de7bf746f47956a90e6fff6f4becfdac..6a571c097ee699bec87353114d6c68ce3308283b 100644
--- a/tensorflow/python/debug/cli/curses_ui.py
+++ b/tensorflow/python/debug/cli/curses_ui.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import collections
 import curses
 from curses import textpad
+import os
 import signal
 import sys
 import threading
@@ -27,6 +28,7 @@ import threading
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.debug.cli import base_ui
+from tensorflow.python.debug.cli import cli_shared
 from tensorflow.python.debug.cli import command_parser
 from tensorflow.python.debug.cli import curses_widgets
 from tensorflow.python.debug.cli import debugger_cli_common
@@ -42,6 +44,9 @@ _SCROLL_HOME = "home"
 _SCROLL_END = "end"
 _SCROLL_TO_LINE_INDEX = "scroll_to_line_index"
 
+_COLOR_READY_COLORTERMS = ["gnome-terminal", "xfce4-terminal"]
+_COLOR_ENABLED_TERM = "xterm-256color"
+
 
 def _get_command_from_line_attr_segs(mouse_x, attr_segs):
   """Attempt to extract command from the attribute segments of a line.
@@ -77,7 +82,7 @@ class ScrollBar(object):
   event in the screen region it occupies.
   """
 
-  BASE_ATTR = "black_on_white"
+  BASE_ATTR = cli_shared.COLOR_BLACK + "_on_" + cli_shared.COLOR_WHITE
 
   def __init__(self,
                min_x,
@@ -119,7 +124,7 @@ class ScrollBar(object):
       raise ValueError("Insufficient height for ScrollBar (%d)" %
                        (self._max_y - self._min_y + 1))
 
-  def _block_y(self):
+  def _block_y(self, screen_coord_sys=False):
     """Get the 0-based y coordinate of the scroll block.
 
     This y coordinate takes into account the presence of the UP and DN buttons
@@ -127,9 +132,13 @@ class ScrollBar(object):
     location, the return value will be 1; at the bottom location, the return
     value will be self._scroll_bar_height - 2.
 
+    Args:
+      screen_coord_sys: (`bool`) whether the return value will be in the
+        screen coordinate system.
+
     Returns:
       (int) 0-based y coordinate of the scroll block, in the ScrollBar
-        coordinate system, i.e., not the screen coordinate system. For example,
+        coordinate system by default. For example,
         when scroll position is at the top, this return value will be 1 (not 0,
         because of the presence of the UP button). When scroll position is at
         the bottom, this return value will be self._scroll_bar_height - 2
@@ -137,8 +146,10 @@ class ScrollBar(object):
         button).
     """
 
-    return int(float(self._scroll_position) / (self._output_num_rows - 1) *
-               (self._scroll_bar_height - 3)) + 1
+    rel_block_y = int(
+        float(self._scroll_position) / (self._output_num_rows - 1) *
+        (self._scroll_bar_height - 3)) + 1
+    return rel_block_y + self._min_y if screen_coord_sys else rel_block_y
 
   def layout(self):
     """Get the RichTextLines layout of the scroll bar.
@@ -187,9 +198,11 @@ class ScrollBar(object):
       return _SCROLL_UP_A_LINE
     elif mouse_y == self._max_y:
       return _SCROLL_DOWN_A_LINE
-    elif mouse_y > self._block_y() and mouse_y < self._max_y:
+    elif (mouse_y > self._block_y(screen_coord_sys=True) and
+          mouse_y < self._max_y):
       return _SCROLL_DOWN
-    elif mouse_y < self._block_y() and mouse_y > self._min_y:
+    elif (mouse_y < self._block_y(screen_coord_sys=True) and
+          mouse_y > self._min_y):
       return _SCROLL_UP
     else:
       return None
@@ -225,27 +238,36 @@ class CursesUI(base_ui.BaseUI):
   }
 
   _FOREGROUND_COLORS = {
-      "white": curses.COLOR_WHITE,
-      "red": curses.COLOR_RED,
-      "green": curses.COLOR_GREEN,
-      "yellow": curses.COLOR_YELLOW,
-      "blue": curses.COLOR_BLUE,
-      "cyan": curses.COLOR_CYAN,
-      "magenta": curses.COLOR_MAGENTA,
-      "black": curses.COLOR_BLACK,
+      cli_shared.COLOR_WHITE: curses.COLOR_WHITE,
+      cli_shared.COLOR_RED: curses.COLOR_RED,
+      cli_shared.COLOR_GREEN: curses.COLOR_GREEN,
+      cli_shared.COLOR_YELLOW: curses.COLOR_YELLOW,
+      cli_shared.COLOR_BLUE: curses.COLOR_BLUE,
+      cli_shared.COLOR_CYAN: curses.COLOR_CYAN,
+      cli_shared.COLOR_MAGENTA: curses.COLOR_MAGENTA,
+      cli_shared.COLOR_BLACK: curses.COLOR_BLACK,
   }
   _BACKGROUND_COLORS = {
-      "white": curses.COLOR_WHITE,
-      "black": curses.COLOR_BLACK,
+      "transparent": -1,
+      cli_shared.COLOR_WHITE: curses.COLOR_WHITE,
+      cli_shared.COLOR_BLACK: curses.COLOR_BLACK,
   }
 
   # Font attribute for search and highlighting.
-  _SEARCH_HIGHLIGHT_FONT_ATTR = "black_on_white"
-  _ARRAY_INDICES_COLOR_PAIR = "black_on_white"
-  _ERROR_TOAST_COLOR_PAIR = "red_on_white"
-  _INFO_TOAST_COLOR_PAIR = "blue_on_white"
-  _STATUS_BAR_COLOR_PAIR = "black_on_white"
-  _UI_WAIT_COLOR_PAIR = "magenta_on_white"
+  _SEARCH_HIGHLIGHT_FONT_ATTR = (
+      cli_shared.COLOR_BLACK + "_on_" + cli_shared.COLOR_WHITE)
+  _ARRAY_INDICES_COLOR_PAIR = (
+      cli_shared.COLOR_BLACK + "_on_" + cli_shared.COLOR_WHITE)
+  _ERROR_TOAST_COLOR_PAIR = (
+      cli_shared.COLOR_RED + "_on_" + cli_shared.COLOR_WHITE)
+  _INFO_TOAST_COLOR_PAIR = (
+      cli_shared.COLOR_BLUE + "_on_" + cli_shared.COLOR_WHITE)
+  _STATUS_BAR_COLOR_PAIR = (
+      cli_shared.COLOR_BLACK + "_on_" + cli_shared.COLOR_WHITE)
+  _UI_WAIT_COLOR_PAIR = (
+      cli_shared.COLOR_MAGENTA + "_on_" + cli_shared.COLOR_WHITE)
+  _NAVIGATION_WARNING_COLOR_PAIR = (
+      cli_shared.COLOR_RED + "_on_" + cli_shared.COLOR_WHITE)
 
   _UI_WAIT_MESSAGE = "Processing..."
 
@@ -370,29 +392,43 @@ class CursesUI(base_ui.BaseUI):
 
     Creates curses stdscr and initialize the color pairs for display.
     """
-
+    # If the terminal type is color-ready, enable it.
+    if os.getenv("COLORTERM") in _COLOR_READY_COLORTERMS:
+      os.environ["TERM"] = _COLOR_ENABLED_TERM
     self._stdscr = curses.initscr()
     self._command_window = None
+    self._screen_color_init()
 
-    # Prepare color pairs.
+  def _screen_color_init(self):
+    """Initialization of screen colors."""
     curses.start_color()
-
+    curses.use_default_colors()
     self._color_pairs = {}
     color_index = 0
 
+    # Prepare color pairs.
     for fg_color in self._FOREGROUND_COLORS:
       for bg_color in self._BACKGROUND_COLORS:
-
         color_index += 1
         curses.init_pair(color_index, self._FOREGROUND_COLORS[fg_color],
                          self._BACKGROUND_COLORS[bg_color])
 
         color_name = fg_color
-        if bg_color != "black":
+        if bg_color != "transparent":
           color_name += "_on_" + bg_color
 
         self._color_pairs[color_name] = curses.color_pair(color_index)
 
+    # Try getting color(s) available only under 256-color support.
+    try:
+      color_index += 1
+      curses.init_pair(color_index, 245, -1)
+      self._color_pairs[cli_shared.COLOR_GRAY] = curses.color_pair(color_index)
+    except curses.error:
+      # Use fall-back color(s):
+      self._color_pairs[cli_shared.COLOR_GRAY] = (
+          self._color_pairs[cli_shared.COLOR_GREEN])
+
     # A_BOLD or A_BLINK is not really a "color". But place it here for
     # convenience.
     self._color_pairs["bold"] = curses.A_BOLD
@@ -400,7 +436,7 @@ class CursesUI(base_ui.BaseUI):
     self._color_pairs["underline"] = curses.A_UNDERLINE
 
     # Default color pair to use when a specified color pair does not exist.
-    self._default_color_pair = self._color_pairs["white"]
+    self._default_color_pair = self._color_pairs[cli_shared.COLOR_WHITE]
 
   def _screen_launch(self, enable_mouse_on_start):
     """Launch the curses screen."""
@@ -477,7 +513,7 @@ class CursesUI(base_ui.BaseUI):
   def get_help(self):
     return self._command_handler_registry.get_help()
 
-  def _screen_create_command_textbox(self, existing_command):
+  def _screen_create_command_textbox(self, existing_command=None):
     """Create command textbox on screen.
 
     Args:
@@ -588,7 +624,7 @@ class CursesUI(base_ui.BaseUI):
         scroll_position = item.scroll_position
       else:
         self._toast("At the LATEST in navigation history!",
-                    color="red_on_white")
+                    color=self._NAVIGATION_WARNING_COLOR_PAIR)
         return
     else:
       if self._nav_history.can_go_back():
@@ -596,7 +632,7 @@ class CursesUI(base_ui.BaseUI):
         scroll_position = item.scroll_position
       else:
         self._toast("At the OLDEST in navigation history!",
-                    color="red_on_white")
+                    color=self._NAVIGATION_WARNING_COLOR_PAIR)
         return
 
     self._display_output(item.screen_output)
@@ -674,7 +710,8 @@ class CursesUI(base_ui.BaseUI):
       # Empty command: take no action. Should not exit.
       return
 
-    screen_info = {"cols": self._max_x}
+    # Take into account scroll bar width.
+    screen_info = {"cols": self._max_x - 2}
     exit_token = None
     if self._command_handler_registry.is_registered(prefix):
       try:
@@ -811,6 +848,7 @@ class CursesUI(base_ui.BaseUI):
         else:
           command = self._fetch_hyperlink_command(mouse_x, mouse_y)
           if command:
+            self._screen_create_command_textbox()
             exit_token = self._dispatch_command(command)
             if exit_token is not None:
               raise debugger_cli_common.CommandLineExit(exit_token=exit_token)
@@ -870,13 +908,14 @@ class CursesUI(base_ui.BaseUI):
     """Automatically key in a command to the command Textbox.
 
     Args:
-      command: The command, as a string.
+      command: The command, as a string or None.
       erase_existing: (bool) whether existing text (if any) is to be erased
           first.
     """
     if erase_existing:
       self._erase_existing_command()
 
+    command = command or ""
     for c in command:
       self._command_textbox.do_command(ord(c))
 
@@ -959,7 +998,7 @@ class CursesUI(base_ui.BaseUI):
       self._curr_wrapped_output.lines.append("Output cut off at %d lines!" %
                                              self.max_output_lines)
       self._curr_wrapped_output.font_attr_segs[self.max_output_lines] = [
-          (0, len(output.lines[-1]), "magenta")
+          (0, len(output.lines[-1]), cli_shared.COLOR_MAGENTA)
       ]
 
     self._display_nav_bar()
@@ -1039,6 +1078,9 @@ class CursesUI(base_ui.BaseUI):
         self._toast("Pattern not found", color=self._ERROR_TOAST_COLOR_PAIR)
     elif is_refresh:
       self._scroll_output(_SCROLL_REFRESH)
+    elif debugger_cli_common.INIT_SCROLL_POS_KEY in output.annotations:
+      line_index = output.annotations[debugger_cli_common.INIT_SCROLL_POS_KEY]
+      self._scroll_output(_SCROLL_TO_LINE_INDEX, line_index=line_index)
     else:
       self._output_pad_row = 0
       self._scroll_output(_SCROLL_HOME)
@@ -1199,9 +1241,9 @@ class CursesUI(base_ui.BaseUI):
 
     self._scroll_bar = ScrollBar(
         self._max_x - 2,
-        2,
+        3,
         self._max_x - 1,
-        self._output_num_rows,
+        self._output_num_rows + 1,
         self._output_pad_row,
         self._output_pad_height - self._output_pad_screen_height)
 
@@ -1518,7 +1560,9 @@ class CursesUI(base_ui.BaseUI):
 
     pad, _, _ = self._display_lines(
         debugger_cli_common.RichTextLines(
-            message, font_attr_segs={0: [(0, len(message), color or "white")]}),
+            message,
+            font_attr_segs={
+                0: [(0, len(message), color or cli_shared.COLOR_WHITE)]}),
         0)
 
     right_end = min(len(message), self._max_x - 2)
diff --git a/tensorflow/python/debug/cli/curses_ui_test.py b/tensorflow/python/debug/cli/curses_ui_test.py
index 8219f47ef3a85e796c001c7d23343d2d84b4a932..15e1356d2921cd4c8d8df94710a7a91e90163ab0 100644
--- a/tensorflow/python/debug/cli/curses_ui_test.py
+++ b/tensorflow/python/debug/cli/curses_ui_test.py
@@ -113,7 +113,7 @@ class MockCursesUI(curses_ui.CursesUI):
   def _screen_create_command_window(self):
     pass
 
-  def _screen_create_command_textbox(self, existing_command):
+  def _screen_create_command_textbox(self, existing_command=None):
     """Override to insert observer of existing commands.
 
     Used in testing of history navigation and tab completion.
@@ -1646,6 +1646,25 @@ class ScrollBarTest(test_util.TensorFlowTestCase):
                      scroll_bar.get_click_command(7))
     self.assertIsNone(scroll_bar.get_click_command(8))
 
+  def testClickCommandsAreCorrectForScrollBarNotAtZeroMinY(self):
+    scroll_bar = curses_ui.ScrollBar(0, 5, 1, 12, 10, 20)
+    self.assertIsNone(scroll_bar.get_click_command(0))
+    self.assertIsNone(scroll_bar.get_click_command(4))
+    self.assertEqual(curses_ui._SCROLL_UP_A_LINE,
+                     scroll_bar.get_click_command(5))
+    self.assertEqual(curses_ui._SCROLL_UP,
+                     scroll_bar.get_click_command(6))
+    self.assertEqual(curses_ui._SCROLL_UP,
+                     scroll_bar.get_click_command(7))
+    self.assertIsNone(scroll_bar.get_click_command(8))
+    self.assertEqual(curses_ui._SCROLL_DOWN,
+                     scroll_bar.get_click_command(10))
+    self.assertEqual(curses_ui._SCROLL_DOWN,
+                     scroll_bar.get_click_command(11))
+    self.assertEqual(curses_ui._SCROLL_DOWN_A_LINE,
+                     scroll_bar.get_click_command(12))
+    self.assertIsNone(scroll_bar.get_click_command(13))
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/cli/debugger_cli_common.py b/tensorflow/python/debug/cli/debugger_cli_common.py
index 64a22e6be4a9512ec22da465b4a26a9dc0b553f6..12e79ab07a4655c7d41f41d2e71906273e154a08 100644
--- a/tensorflow/python/debug/cli/debugger_cli_common.py
+++ b/tensorflow/python/debug/cli/debugger_cli_common.py
@@ -32,6 +32,7 @@ HELP_INDENT = "  "
 
 EXPLICIT_USER_EXIT = "explicit_user_exit"
 REGEX_MATCH_LINES_KEY = "regex_match_lines"
+INIT_SCROLL_POS_KEY = "init_scroll_pos"
 
 MAIN_MENU_KEY = "mm:"
 
@@ -108,11 +109,12 @@ class RichLine(object):
     return len(self.text)
 
 
-def rich_text_lines_from_rich_line_list(rich_text_list):
+def rich_text_lines_from_rich_line_list(rich_text_list, annotations=None):
   """Convert a list of RichLine objects or strings to a RichTextLines object.
 
   Args:
     rich_text_list: a list of RichLine objects or strings
+    annotations: annotatoins for the resultant RichTextLines object.
 
   Returns:
     A corresponding RichTextLines object.
@@ -126,7 +128,7 @@ def rich_text_lines_from_rich_line_list(rich_text_list):
         font_attr_segs[i] = rl.font_attr_segs
     else:
       lines.append(rl)
-  return RichTextLines(lines, font_attr_segs)
+  return RichTextLines(lines, font_attr_segs, annotations=annotations)
 
 
 class RichTextLines(object):
@@ -648,7 +650,7 @@ class CommandHandlerRegistry(object):
         3) the handler is found for the prefix, but it fails to return a
           RichTextLines or raise any exception.
       CommandLineExit:
-        If the command handler raises this type of exception, tihs method will
+        If the command handler raises this type of exception, this method will
         simply pass it along.
     """
     if not prefix:
@@ -838,7 +840,7 @@ class TabCompletionRegistry(object):
 
     Args:
       context_words: A list of context words belonging to the context being
-        registerd. It is a list of str, instead of a single string, to support
+        registered. It is a list of str, instead of a single string, to support
         synonym words triggering the same tab-completion context, e.g.,
         both "drink" and the short-hand "dr" can trigger the same context.
       comp_items: A list of completion items, as a list of str.
diff --git a/tensorflow/python/debug/cli/profile_analyzer_cli.py b/tensorflow/python/debug/cli/profile_analyzer_cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..3304194b1cb056154d430842b3462d62398c5142
--- /dev/null
+++ b/tensorflow/python/debug/cli/profile_analyzer_cli.py
@@ -0,0 +1,799 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Formats and displays profiling information."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import re
+
+import numpy as np
+
+from tensorflow.python.debug.cli import cli_shared
+from tensorflow.python.debug.cli import command_parser
+from tensorflow.python.debug.cli import debugger_cli_common
+from tensorflow.python.debug.cli import ui_factory
+from tensorflow.python.debug.lib import profiling
+from tensorflow.python.debug.lib import source_utils
+
+RL = debugger_cli_common.RichLine
+
+SORT_OPS_BY_OP_NAME = "node"
+SORT_OPS_BY_OP_TYPE = "op_type"
+SORT_OPS_BY_OP_TIME = "op_time"
+SORT_OPS_BY_EXEC_TIME = "exec_time"
+SORT_OPS_BY_START_TIME = "start_time"
+SORT_OPS_BY_LINE = "line"
+
+_DEVICE_NAME_FILTER_FLAG = "device_name_filter"
+_NODE_NAME_FILTER_FLAG = "node_name_filter"
+_OP_TYPE_FILTER_FLAG = "op_type_filter"
+
+
+class ProfileDataTableView(object):
+  """Table View of profiling data."""
+
+  def __init__(self, profile_datum_list, time_unit=cli_shared.TIME_UNIT_US):
+    """Constructor.
+
+    Args:
+      profile_datum_list: List of `ProfileDatum` objects.
+      time_unit: must be in cli_shared.TIME_UNITS.
+    """
+    self._profile_datum_list = profile_datum_list
+    self.formatted_start_time = [
+        datum.start_time for datum in profile_datum_list]
+    self.formatted_op_time = [
+        cli_shared.time_to_readable_str(datum.op_time,
+                                        force_time_unit=time_unit)
+        for datum in profile_datum_list]
+    self.formatted_exec_time = [
+        cli_shared.time_to_readable_str(
+            datum.node_exec_stats.all_end_rel_micros,
+            force_time_unit=time_unit)
+        for datum in profile_datum_list]
+
+    self._column_names = ["Node",
+                          "Op Type",
+                          "Start Time (us)",
+                          "Op Time (%s)" % time_unit,
+                          "Exec Time (%s)" % time_unit,
+                          "Filename:Lineno(function)"]
+    self._column_sort_ids = [SORT_OPS_BY_OP_NAME, SORT_OPS_BY_OP_TYPE,
+                             SORT_OPS_BY_START_TIME, SORT_OPS_BY_OP_TIME,
+                             SORT_OPS_BY_EXEC_TIME, SORT_OPS_BY_LINE]
+
+  def value(self,
+            row,
+            col,
+            device_name_filter=None,
+            node_name_filter=None,
+            op_type_filter=None):
+    """Get the content of a cell of the table.
+
+    Args:
+      row: (int) row index.
+      col: (int) column index.
+      device_name_filter: Regular expression to filter by device name.
+      node_name_filter: Regular expression to filter by node name.
+      op_type_filter: Regular expression to filter by op type.
+
+    Returns:
+      A debuggre_cli_common.RichLine object representing the content of the
+      cell, potentially with a clickable MenuItem.
+
+    Raises:
+      IndexError: if row index is out of range.
+    """
+    menu_item = None
+    if col == 0:
+      text = self._profile_datum_list[row].node_exec_stats.node_name
+    elif col == 1:
+      text = self._profile_datum_list[row].op_type
+    elif col == 2:
+      text = str(self.formatted_start_time[row])
+    elif col == 3:
+      text = str(self.formatted_op_time[row])
+    elif col == 4:
+      text = str(self.formatted_exec_time[row])
+    elif col == 5:
+      command = "ps"
+      if device_name_filter:
+        command += " --%s %s" % (_DEVICE_NAME_FILTER_FLAG,
+                                 device_name_filter)
+      if node_name_filter:
+        command += " --%s %s" % (_NODE_NAME_FILTER_FLAG, node_name_filter)
+      if op_type_filter:
+        command += " --%s %s" % (_OP_TYPE_FILTER_FLAG, op_type_filter)
+      command += " %s --init_line %d" % (
+          self._profile_datum_list[row].file_path,
+          self._profile_datum_list[row].line_number)
+      menu_item = debugger_cli_common.MenuItem(None, command)
+      text = self._profile_datum_list[row].file_line_func
+    else:
+      raise IndexError("Invalid column index %d." % col)
+
+    return RL(text, font_attr=menu_item)
+
+  def row_count(self):
+    return len(self._profile_datum_list)
+
+  def column_count(self):
+    return len(self._column_names)
+
+  def column_names(self):
+    return self._column_names
+
+  def column_sort_id(self, col):
+    return self._column_sort_ids[col]
+
+
+def _list_profile_filter(
+    profile_datum,
+    node_name_regex,
+    file_path_regex,
+    op_type_regex,
+    op_time_interval,
+    exec_time_interval,
+    min_lineno=-1,
+    max_lineno=-1):
+  """Filter function for list_profile command.
+
+  Args:
+    profile_datum: A `ProfileDatum` object.
+    node_name_regex: Regular expression pattern object to filter by name.
+    file_path_regex: Regular expression pattern object to filter by file path.
+    op_type_regex: Regular expression pattern object to filter by op type.
+    op_time_interval: `Interval` for filtering op time.
+    exec_time_interval: `Interval` for filtering exec time.
+    min_lineno: Lower bound for 1-based line number, inclusive.
+      If <= 0, has no effect.
+    max_lineno: Upper bound for 1-based line number, exclusive.
+      If <= 0, has no effect.
+    # TODO(cais): Maybe filter by function name.
+
+  Returns:
+    True iff profile_datum should be included.
+  """
+  if node_name_regex and not node_name_regex.match(
+      profile_datum.node_exec_stats.node_name):
+    return False
+  if file_path_regex:
+    if (not profile_datum.file_path or
+        not file_path_regex.match(profile_datum.file_path)):
+      return False
+  if (min_lineno > 0 and profile_datum.line_number and
+      profile_datum.line_number < min_lineno):
+    return False
+  if (max_lineno > 0 and profile_datum.line_number and
+      profile_datum.line_number >= max_lineno):
+    return False
+  if (profile_datum.op_type is not None and op_type_regex and
+      not op_type_regex.match(profile_datum.op_type)):
+    return False
+  if op_time_interval is not None and not op_time_interval.contains(
+      profile_datum.op_time):
+    return False
+  if exec_time_interval and not exec_time_interval.contains(
+      profile_datum.node_exec_stats.all_end_rel_micros):
+    return False
+  return True
+
+
+def _list_profile_sort_key(profile_datum, sort_by):
+  """Get a profile_datum property to sort by in list_profile command.
+
+  Args:
+    profile_datum: A `ProfileDatum` object.
+    sort_by: (string) indicates a value to sort by.
+      Must be one of SORT_BY* constants.
+
+  Returns:
+    profile_datum property to sort by.
+  """
+  if sort_by == SORT_OPS_BY_OP_NAME:
+    return profile_datum.node_exec_stats.node_name
+  elif sort_by == SORT_OPS_BY_OP_TYPE:
+    return profile_datum.op_type
+  elif sort_by == SORT_OPS_BY_LINE:
+    return profile_datum.file_line_func
+  elif sort_by == SORT_OPS_BY_OP_TIME:
+    return profile_datum.op_time
+  elif sort_by == SORT_OPS_BY_EXEC_TIME:
+    return profile_datum.node_exec_stats.all_end_rel_micros
+  else:  # sort by start time
+    return profile_datum.node_exec_stats.all_start_micros
+
+
+class ProfileAnalyzer(object):
+  """Analyzer for profiling data."""
+
+  def __init__(self, graph, run_metadata):
+    """ProfileAnalyzer constructor.
+
+    Args:
+      graph: (tf.Graph) Python graph object.
+      run_metadata: A `RunMetadata` protobuf object.
+
+    Raises:
+      ValueError: If run_metadata is None.
+    """
+    self._graph = graph
+    if not run_metadata:
+      raise ValueError("No RunMetadata passed for profile analysis.")
+    self._run_metadata = run_metadata
+    self._arg_parsers = {}
+    ap = argparse.ArgumentParser(
+        description="List nodes profile information.",
+        usage=argparse.SUPPRESS)
+    ap.add_argument(
+        "-d",
+        "--%s" % _DEVICE_NAME_FILTER_FLAG,
+        dest=_DEVICE_NAME_FILTER_FLAG,
+        type=str,
+        default="",
+        help="filter device name by regex.")
+    ap.add_argument(
+        "-n",
+        "--%s" % _NODE_NAME_FILTER_FLAG,
+        dest=_NODE_NAME_FILTER_FLAG,
+        type=str,
+        default="",
+        help="filter node name by regex.")
+    ap.add_argument(
+        "-t",
+        "--%s" % _OP_TYPE_FILTER_FLAG,
+        dest=_OP_TYPE_FILTER_FLAG,
+        type=str,
+        default="",
+        help="filter op type by regex.")
+    # TODO(annarev): allow file filtering at non-stack top position.
+    ap.add_argument(
+        "-f",
+        "--file_path_filter",
+        dest="file_path_filter",
+        type=str,
+        default="",
+        help="filter by file name at the top position of node's creation "
+             "stack that does not belong to TensorFlow library.")
+    ap.add_argument(
+        "--min_lineno",
+        dest="min_lineno",
+        type=int,
+        default=-1,
+        help="(Inclusive) lower bound for 1-based line number in source file. "
+             "If <= 0, has no effect.")
+    ap.add_argument(
+        "--max_lineno",
+        dest="max_lineno",
+        type=int,
+        default=-1,
+        help="(Exclusive) upper bound for 1-based line number in source file. "
+             "If <= 0, has no effect.")
+    ap.add_argument(
+        "-e",
+        "--execution_time",
+        dest="execution_time",
+        type=str,
+        default="",
+        help="Filter by execution time interval "
+             "(includes compute plus pre- and post -processing time). "
+             "Supported units are s, ms and us (default). "
+             "E.g. -e >100s, -e <100, -e [100us,1000ms]")
+    ap.add_argument(
+        "-o",
+        "--op_time",
+        dest="op_time",
+        type=str,
+        default="",
+        help="Filter by op time interval (only includes compute time). "
+             "Supported units are s, ms and us (default). "
+             "E.g. -e >100s, -e <100, -e [100us,1000ms]")
+    ap.add_argument(
+        "-s",
+        "--sort_by",
+        dest="sort_by",
+        type=str,
+        default=SORT_OPS_BY_START_TIME,
+        help=("the field to sort the data by: (%s)" %
+              " | ".join([SORT_OPS_BY_OP_NAME, SORT_OPS_BY_OP_TYPE,
+                          SORT_OPS_BY_START_TIME, SORT_OPS_BY_OP_TIME,
+                          SORT_OPS_BY_EXEC_TIME, SORT_OPS_BY_LINE])))
+    ap.add_argument(
+        "-r",
+        "--reverse",
+        dest="reverse",
+        action="store_true",
+        help="sort the data in reverse (descending) order")
+    ap.add_argument(
+        "--time_unit",
+        dest="time_unit",
+        type=str,
+        default=cli_shared.TIME_UNIT_US,
+        help="Time unit (" + " | ".join(cli_shared.TIME_UNITS) + ")")
+
+    self._arg_parsers["list_profile"] = ap
+
+    ap = argparse.ArgumentParser(
+        description="Print a Python source file with line-level profile "
+                    "information",
+        usage=argparse.SUPPRESS)
+    ap.add_argument(
+        "source_file_path",
+        type=str,
+        help="Path to the source_file_path")
+    ap.add_argument(
+        "--cost_type",
+        type=str,
+        choices=["exec_time", "op_time"],
+        default="exec_time",
+        help="Type of cost to display")
+    ap.add_argument(
+        "--time_unit",
+        dest="time_unit",
+        type=str,
+        default=cli_shared.TIME_UNIT_US,
+        help="Time unit (" + " | ".join(cli_shared.TIME_UNITS) + ")")
+    ap.add_argument(
+        "-d",
+        "--%s" % _DEVICE_NAME_FILTER_FLAG,
+        dest=_DEVICE_NAME_FILTER_FLAG,
+        type=str,
+        default="",
+        help="Filter device name by regex.")
+    ap.add_argument(
+        "-n",
+        "--%s" % _NODE_NAME_FILTER_FLAG,
+        dest=_NODE_NAME_FILTER_FLAG,
+        type=str,
+        default="",
+        help="Filter node name by regex.")
+    ap.add_argument(
+        "-t",
+        "--%s" % _OP_TYPE_FILTER_FLAG,
+        dest=_OP_TYPE_FILTER_FLAG,
+        type=str,
+        default="",
+        help="Filter op type by regex.")
+    ap.add_argument(
+        "--init_line",
+        dest="init_line",
+        type=int,
+        default=0,
+        help="The 1-based line number to scroll to initially.")
+
+    self._arg_parsers["print_source"] = ap
+
+  def list_profile(self, args, screen_info=None):
+    """Command handler for list_profile.
+
+    List per-operation profile information.
+
+    Args:
+      args: Command-line arguments, excluding the command prefix, as a list of
+        str.
+      screen_info: Optional dict input containing screen information such as
+        cols.
+
+    Returns:
+      Output text lines as a RichTextLines object.
+    """
+    screen_cols = 80
+    if screen_info and "cols" in screen_info:
+      screen_cols = screen_info["cols"]
+
+    parsed = self._arg_parsers["list_profile"].parse_args(args)
+    op_time_interval = (command_parser.parse_time_interval(parsed.op_time)
+                        if parsed.op_time else None)
+    exec_time_interval = (
+        command_parser.parse_time_interval(parsed.execution_time)
+        if parsed.execution_time else None)
+    node_name_regex = (re.compile(parsed.node_name_filter)
+                       if parsed.node_name_filter else None)
+    file_path_regex = (re.compile(parsed.file_path_filter)
+                       if parsed.file_path_filter else None)
+    op_type_regex = (re.compile(parsed.op_type_filter)
+                     if parsed.op_type_filter else None)
+
+    output = debugger_cli_common.RichTextLines([""])
+    device_name_regex = (re.compile(parsed.device_name_filter)
+                         if parsed.device_name_filter else None)
+    data_generator = self._get_profile_data_generator()
+    device_count = len(self._run_metadata.step_stats.dev_stats)
+    for index in range(device_count):
+      device_stats = self._run_metadata.step_stats.dev_stats[index]
+      if not device_name_regex or device_name_regex.match(device_stats.device):
+        profile_data = [
+            datum for datum in data_generator(device_stats)
+            if _list_profile_filter(
+                datum, node_name_regex, file_path_regex, op_type_regex,
+                op_time_interval, exec_time_interval,
+                min_lineno=parsed.min_lineno, max_lineno=parsed.max_lineno)]
+        profile_data = sorted(
+            profile_data,
+            key=lambda datum: _list_profile_sort_key(datum, parsed.sort_by),
+            reverse=parsed.reverse)
+        output.extend(
+            self._get_list_profile_lines(
+                device_stats.device, index, device_count,
+                profile_data, parsed.sort_by, parsed.reverse, parsed.time_unit,
+                device_name_filter=parsed.device_name_filter,
+                node_name_filter=parsed.node_name_filter,
+                op_type_filter=parsed.op_type_filter,
+                screen_cols=screen_cols))
+    return output
+
+  def _get_profile_data_generator(self):
+    """Get function that generates `ProfileDatum` objects.
+
+    Returns:
+      A function that generates `ProfileDatum` objects.
+    """
+    node_to_file_path = {}
+    node_to_line_number = {}
+    node_to_func_name = {}
+    node_to_op_type = {}
+    for op in self._graph.get_operations():
+      for trace_entry in reversed(op.traceback):
+        file_path = trace_entry[0]
+        line_num = trace_entry[1]
+        func_name = trace_entry[2]
+        if not source_utils.guess_is_tensorflow_py_library(file_path):
+          break
+      node_to_file_path[op.name] = file_path
+      node_to_line_number[op.name] = line_num
+      node_to_func_name[op.name] = func_name
+      node_to_op_type[op.name] = op.type
+
+    def profile_data_generator(device_step_stats):
+      for node_stats in device_step_stats.node_stats:
+        if node_stats.node_name == "_SOURCE" or node_stats.node_name == "_SINK":
+          continue
+        yield profiling.ProfileDatum(
+            device_step_stats.device,
+            node_stats,
+            node_to_file_path.get(node_stats.node_name, ""),
+            node_to_line_number.get(node_stats.node_name, 0),
+            node_to_func_name.get(node_stats.node_name, ""),
+            node_to_op_type.get(node_stats.node_name, ""))
+    return profile_data_generator
+
+  def _get_list_profile_lines(
+      self, device_name, device_index, device_count,
+      profile_datum_list, sort_by, sort_reverse, time_unit,
+      device_name_filter=None, node_name_filter=None, op_type_filter=None,
+      screen_cols=80):
+    """Get `RichTextLines` object for list_profile command for a given device.
+
+    Args:
+      device_name: (string) Device name.
+      device_index: (int) Device index.
+      device_count: (int) Number of devices.
+      profile_datum_list: List of `ProfileDatum` objects.
+      sort_by: (string) Identifier of column to sort. Sort identifier
+          must match value of SORT_OPS_BY_OP_NAME, SORT_OPS_BY_OP_TYPE,
+          SORT_OPS_BY_EXEC_TIME, SORT_OPS_BY_MEMORY or SORT_OPS_BY_LINE.
+      sort_reverse: (bool) Whether to sort in descending instead of default
+          (ascending) order.
+      time_unit: time unit, must be in cli_shared.TIME_UNITS.
+      device_name_filter: Regular expression to filter by device name.
+      node_name_filter: Regular expression to filter by node name.
+      op_type_filter: Regular expression to filter by op type.
+      screen_cols: (int) Number of columns available on the screen (i.e.,
+        available screen width).
+
+    Returns:
+      `RichTextLines` object containing a table that displays profiling
+      information for each op.
+    """
+    profile_data = ProfileDataTableView(profile_datum_list, time_unit=time_unit)
+
+    # Calculate total time early to calculate column widths.
+    total_op_time = sum(datum.op_time for datum in profile_datum_list)
+    total_exec_time = sum(datum.node_exec_stats.all_end_rel_micros
+                          for datum in profile_datum_list)
+    device_total_row = [
+        "Device Total", "",
+        cli_shared.time_to_readable_str(total_op_time,
+                                        force_time_unit=time_unit),
+        cli_shared.time_to_readable_str(total_exec_time,
+                                        force_time_unit=time_unit)]
+
+    # Calculate column widths.
+    column_widths = [
+        len(column_name) for column_name in profile_data.column_names()]
+    for col in range(len(device_total_row)):
+      column_widths[col] = max(column_widths[col], len(device_total_row[col]))
+    for col in range(len(column_widths)):
+      for row in range(profile_data.row_count()):
+        column_widths[col] = max(
+            column_widths[col], len(profile_data.value(
+                row,
+                col,
+                device_name_filter=device_name_filter,
+                node_name_filter=node_name_filter,
+                op_type_filter=op_type_filter)))
+      column_widths[col] += 2  # add margin between columns
+
+    # Add device name.
+    output = [RL("-" * screen_cols)]
+    device_row = "Device %d of %d: %s" % (
+        device_index + 1, device_count, device_name)
+    output.append(RL(device_row))
+    output.append(RL())
+
+    # Add headers.
+    base_command = "list_profile"
+    row = RL()
+    for col in range(profile_data.column_count()):
+      column_name = profile_data.column_names()[col]
+      sort_id = profile_data.column_sort_id(col)
+      command = "%s -s %s" % (base_command, sort_id)
+      if sort_by == sort_id and not sort_reverse:
+        command += " -r"
+      head_menu_item = debugger_cli_common.MenuItem(None, command)
+      row += RL(column_name, font_attr=[head_menu_item, "bold"])
+      row += RL(" " * (column_widths[col] - len(column_name)))
+
+    output.append(row)
+
+    # Add data rows.
+    for row in range(profile_data.row_count()):
+      new_row = RL()
+      for col in range(profile_data.column_count()):
+        new_cell = profile_data.value(
+            row,
+            col,
+            device_name_filter=device_name_filter,
+            node_name_filter=node_name_filter,
+            op_type_filter=op_type_filter)
+        new_row += new_cell
+        new_row += RL(" " * (column_widths[col] - len(new_cell)))
+      output.append(new_row)
+
+    # Add stat totals.
+    row_str = ""
+    for col in range(len(device_total_row)):
+      row_str += ("{:<%d}" % column_widths[col]).format(device_total_row[col])
+    output.append(RL())
+    output.append(RL(row_str))
+    return debugger_cli_common.rich_text_lines_from_rich_line_list(output)
+
+  def _measure_list_profile_column_widths(self, profile_data):
+    """Determine the maximum column widths for each data list.
+
+    Args:
+      profile_data: list of ProfileDatum objects.
+
+    Returns:
+      List of column widths in the same order as columns in data.
+    """
+    num_columns = len(profile_data.column_names())
+    widths = [len(column_name) for column_name in profile_data.column_names()]
+    for row in range(profile_data.row_count()):
+      for col in range(num_columns):
+        widths[col] = max(
+            widths[col], len(str(profile_data.row_values(row)[col])) + 2)
+    return widths
+
+  _LINE_COST_ATTR = cli_shared.COLOR_CYAN
+  _LINE_NUM_ATTR = cli_shared.COLOR_YELLOW
+  _NUM_NODES_HEAD = "#nodes"
+  _NUM_EXECS_SUB_HEAD = "(#execs)"
+  _LINENO_HEAD = "lineno"
+  _SOURCE_HEAD = "source"
+
+  def print_source(self, args, screen_info=None):
+    """Print a Python source file with line-level profile information.
+
+    Args:
+      args: Command-line arguments, excluding the command prefix, as a list of
+        str.
+      screen_info: Optional dict input containing screen information such as
+        cols.
+
+    Returns:
+      Output text lines as a RichTextLines object.
+    """
+    del screen_info
+
+    parsed = self._arg_parsers["print_source"].parse_args(args)
+
+    device_name_regex = (re.compile(parsed.device_name_filter)
+                         if parsed.device_name_filter else None)
+
+    profile_data = []
+    data_generator = self._get_profile_data_generator()
+    device_count = len(self._run_metadata.step_stats.dev_stats)
+    for index in range(device_count):
+      device_stats = self._run_metadata.step_stats.dev_stats[index]
+      if device_name_regex and not device_name_regex.match(device_stats.device):
+        continue
+      profile_data.extend([datum for datum in data_generator(device_stats)])
+
+    source_annotation = source_utils.annotate_source_against_profile(
+        profile_data,
+        os.path.expanduser(parsed.source_file_path),
+        node_name_filter=parsed.node_name_filter,
+        op_type_filter=parsed.op_type_filter)
+    if not source_annotation:
+      return debugger_cli_common.RichTextLines(
+          ["The source file %s does not contain any profile information for "
+           "the previous Session run under the following "
+           "filters:" % parsed.source_file_path,
+           "  --%s: %s" % (_DEVICE_NAME_FILTER_FLAG, parsed.device_name_filter),
+           "  --%s: %s" % (_NODE_NAME_FILTER_FLAG, parsed.node_name_filter),
+           "  --%s: %s" % (_OP_TYPE_FILTER_FLAG, parsed.op_type_filter)])
+
+    max_total_cost = 0
+    for line_index in source_annotation:
+      total_cost = self._get_total_cost(source_annotation[line_index],
+                                        parsed.cost_type)
+      max_total_cost = max(max_total_cost, total_cost)
+
+    source_lines, line_num_width = source_utils.load_source(
+        parsed.source_file_path)
+
+    cost_bar_max_length = 10
+    total_cost_head = parsed.cost_type
+    column_widths = {
+        "cost_bar": cost_bar_max_length + 3,
+        "total_cost": len(total_cost_head) + 3,
+        "num_nodes_execs": len(self._NUM_EXECS_SUB_HEAD) + 1,
+        "line_number": line_num_width,
+    }
+
+    head = RL(
+        " " * column_widths["cost_bar"] +
+        total_cost_head +
+        " " * (column_widths["total_cost"] - len(total_cost_head)) +
+        self._NUM_NODES_HEAD +
+        " " * (column_widths["num_nodes_execs"] - len(self._NUM_NODES_HEAD)),
+        font_attr=self._LINE_COST_ATTR)
+    head += RL(self._LINENO_HEAD, font_attr=self._LINE_NUM_ATTR)
+    sub_head = RL(
+        " " * (column_widths["cost_bar"] +
+               column_widths["total_cost"]) +
+        self._NUM_EXECS_SUB_HEAD +
+        " " * (column_widths["num_nodes_execs"] -
+               len(self._NUM_EXECS_SUB_HEAD)) +
+        " " * column_widths["line_number"],
+        font_attr=self._LINE_COST_ATTR)
+    sub_head += RL(self._SOURCE_HEAD, font_attr="bold")
+    lines = [head, sub_head]
+
+    output_annotations = {}
+    for i, line in enumerate(source_lines):
+      lineno = i + 1
+      if lineno in source_annotation:
+        annotation = source_annotation[lineno]
+        cost_bar = self._render_normalized_cost_bar(
+            self._get_total_cost(annotation, parsed.cost_type), max_total_cost,
+            cost_bar_max_length)
+        annotated_line = cost_bar
+        annotated_line += " " * (column_widths["cost_bar"] - len(cost_bar))
+
+        total_cost = RL(cli_shared.time_to_readable_str(
+            self._get_total_cost(annotation, parsed.cost_type),
+            force_time_unit=parsed.time_unit),
+                        font_attr=self._LINE_COST_ATTR)
+        total_cost += " " * (column_widths["total_cost"] - len(total_cost))
+        annotated_line += total_cost
+
+        file_path_filter = re.escape(parsed.source_file_path) + "$"
+        command = "lp --file_path_filter %s --min_lineno %d --max_lineno %d" % (
+            file_path_filter, lineno, lineno + 1)
+        if parsed.device_name_filter:
+          command += " --%s %s" % (_DEVICE_NAME_FILTER_FLAG,
+                                   parsed.device_name_filter)
+        if parsed.node_name_filter:
+          command += " --%s %s" % (_NODE_NAME_FILTER_FLAG,
+                                   parsed.node_name_filter)
+        if parsed.op_type_filter:
+          command += " --%s %s" % (_OP_TYPE_FILTER_FLAG,
+                                   parsed.op_type_filter)
+        menu_item = debugger_cli_common.MenuItem(None, command)
+        num_nodes_execs = RL("%d(%d)" % (annotation.node_count,
+                                         annotation.node_exec_count),
+                             font_attr=[self._LINE_COST_ATTR, menu_item])
+        num_nodes_execs += " " * (
+            column_widths["num_nodes_execs"] - len(num_nodes_execs))
+        annotated_line += num_nodes_execs
+      else:
+        annotated_line = RL(
+            " " * sum(column_widths[col_name] for col_name in column_widths
+                      if col_name != "line_number"))
+
+      line_num_column = RL(" L%d" % (lineno), self._LINE_NUM_ATTR)
+      line_num_column += " " * (
+          column_widths["line_number"] - len(line_num_column))
+      annotated_line += line_num_column
+      annotated_line += line
+      lines.append(annotated_line)
+
+      if parsed.init_line == lineno:
+        output_annotations[
+            debugger_cli_common.INIT_SCROLL_POS_KEY] = len(lines) - 1
+
+    return debugger_cli_common.rich_text_lines_from_rich_line_list(
+        lines, annotations=output_annotations)
+
+  def _get_total_cost(self, aggregated_profile, cost_type):
+    if cost_type == "exec_time":
+      return aggregated_profile.total_exec_time
+    elif cost_type == "op_time":
+      return aggregated_profile.total_op_time
+    else:
+      raise ValueError("Unsupported cost type: %s" % cost_type)
+
+  def _render_normalized_cost_bar(self, cost, max_cost, length):
+    """Render a text bar representing a normalized cost.
+
+    Args:
+      cost: the absolute value of the cost.
+      max_cost: the maximum cost value to normalize the absolute cost with.
+      length: (int) length of the cost bar, in number of characters, excluding
+        the brackets on the two ends.
+
+    Returns:
+      An instance of debugger_cli_common.RichTextLine.
+    """
+    num_ticks = int(np.ceil(float(cost) / max_cost * length))
+    num_ticks = num_ticks or 1  # Minimum is 1 tick.
+    output = RL("[", font_attr=self._LINE_COST_ATTR)
+    output += RL("|" * num_ticks + " " * (length - num_ticks),
+                 font_attr=["bold", self._LINE_COST_ATTR])
+    output += RL("]", font_attr=self._LINE_COST_ATTR)
+    return output
+
+  def get_help(self, handler_name):
+    return self._arg_parsers[handler_name].format_help()
+
+
+def create_profiler_ui(graph,
+                       run_metadata,
+                       ui_type="curses",
+                       on_ui_exit=None):
+  """Create an instance of CursesUI based on a `tf.Graph` and `RunMetadata`.
+
+  Args:
+    graph: Python `Graph` object.
+    run_metadata: A `RunMetadata` protobuf object.
+    ui_type: (str) requested UI type, e.g., "curses", "readline".
+    on_ui_exit: (`Callable`) the callback to be called when the UI exits.
+
+  Returns:
+    (base_ui.BaseUI) A BaseUI subtype object with a set of standard analyzer
+      commands and tab-completions registered.
+  """
+
+  analyzer = ProfileAnalyzer(graph, run_metadata)
+
+  cli = ui_factory.get_ui(ui_type, on_ui_exit=on_ui_exit)
+  cli.register_command_handler(
+      "list_profile",
+      analyzer.list_profile,
+      analyzer.get_help("list_profile"),
+      prefix_aliases=["lp"])
+  cli.register_command_handler(
+      "print_source",
+      analyzer.print_source,
+      analyzer.get_help("print_source"),
+      prefix_aliases=["ps"])
+
+  return cli
diff --git a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a709cb4107e772871dea8d887a29723dbb7b5e1f
--- /dev/null
+++ b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
@@ -0,0 +1,456 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for profile_analyzer_cli."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+from tensorflow.core.framework import step_stats_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.cli import debugger_cli_common
+from tensorflow.python.debug.cli import profile_analyzer_cli
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
+
+
+def _line_number_above():
+  return tf_inspect.stack()[1][2] - 1
+
+
+def _at_least_one_line_matches(pattern, lines):
+  pattern_re = re.compile(pattern)
+  for i, line in enumerate(lines):
+    if pattern_re.search(line):
+      return True, i
+  return False, None
+
+
+def _assert_at_least_one_line_matches(pattern, lines):
+  any_match, _ = _at_least_one_line_matches(pattern, lines)
+  if not any_match:
+    raise AssertionError(
+        "%s does not match any line in %s." % (pattern, str(lines)))
+
+
+def _assert_no_lines_match(pattern, lines):
+  any_match, _ = _at_least_one_line_matches(pattern, lines)
+  if any_match:
+    raise AssertionError(
+        "%s matched at least one line in %s." % (pattern, str(lines)))
+
+
+class ProfileAnalyzerListProfileTest(test_util.TensorFlowTestCase):
+
+  def testNodeInfoEmpty(self):
+    graph = ops.Graph()
+    run_metadata = config_pb2.RunMetadata()
+
+    prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(graph, run_metadata)
+    prof_output = prof_analyzer.list_profile([]).lines
+    self.assertEquals([""], prof_output)
+
+  def testSingleDevice(self):
+    node1 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=4)
+
+    node2 = step_stats_pb2.NodeExecStats(
+        node_name="Mul/456",
+        op_start_rel_micros=1,
+        op_end_rel_micros=2,
+        all_end_rel_micros=3)
+
+    run_metadata = config_pb2.RunMetadata()
+    device1 = run_metadata.step_stats.dev_stats.add()
+    device1.device = "deviceA"
+    device1.node_stats.extend([node1, node2])
+
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = "Add/123"
+    op1.traceback = [("a/b/file1", 10, "some_var")]
+    op1.type = "add"
+    op2 = test.mock.MagicMock()
+    op2.name = "Mul/456"
+    op2.traceback = [("a/b/file1", 11, "some_var")]
+    op2.type = "mul"
+    graph.get_operations.return_value = [op1, op2]
+
+    prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(graph, run_metadata)
+    prof_output = prof_analyzer.list_profile([]).lines
+
+    _assert_at_least_one_line_matches(r"Device 1 of 1: deviceA", prof_output)
+    _assert_at_least_one_line_matches(r"^Add/123.*add.*2us.*4us", prof_output)
+    _assert_at_least_one_line_matches(r"^Mul/456.*mul.*1us.*3us", prof_output)
+
+  def testMultipleDevices(self):
+    node1 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=3)
+
+    run_metadata = config_pb2.RunMetadata()
+    device1 = run_metadata.step_stats.dev_stats.add()
+    device1.device = "deviceA"
+    device1.node_stats.extend([node1])
+
+    device2 = run_metadata.step_stats.dev_stats.add()
+    device2.device = "deviceB"
+    device2.node_stats.extend([node1])
+
+    graph = test.mock.MagicMock()
+    op = test.mock.MagicMock()
+    op.name = "Add/123"
+    op.traceback = [("a/b/file1", 10, "some_var")]
+    op.type = "abc"
+    graph.get_operations.return_value = [op]
+
+    prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(graph, run_metadata)
+    prof_output = prof_analyzer.list_profile([]).lines
+
+    _assert_at_least_one_line_matches(r"Device 1 of 2: deviceA", prof_output)
+    _assert_at_least_one_line_matches(r"Device 2 of 2: deviceB", prof_output)
+
+    # Try filtering by device.
+    prof_output = prof_analyzer.list_profile(["-d", "deviceB"]).lines
+    _assert_at_least_one_line_matches(r"Device 2 of 2: deviceB", prof_output)
+    _assert_no_lines_match(r"Device 1 of 2: deviceA", prof_output)
+
+  def testWithSession(self):
+    options = config_pb2.RunOptions()
+    options.trace_level = config_pb2.RunOptions.FULL_TRACE
+    run_metadata = config_pb2.RunMetadata()
+
+    with session.Session() as sess:
+      a = constant_op.constant([1, 2, 3])
+      b = constant_op.constant([2, 2, 1])
+      result = math_ops.add(a, b)
+
+      sess.run(result, options=options, run_metadata=run_metadata)
+
+      prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(
+          sess.graph, run_metadata)
+      prof_output = prof_analyzer.list_profile([]).lines
+
+      _assert_at_least_one_line_matches("Device 1 of", prof_output)
+      expected_headers = [
+          "Node", r"Start Time \(us\)", r"Op Time \(.*\)", r"Exec Time \(.*\)",
+          r"Filename:Lineno\(function\)"]
+      _assert_at_least_one_line_matches(
+          ".*".join(expected_headers), prof_output)
+      _assert_at_least_one_line_matches(r"^Add/", prof_output)
+      _assert_at_least_one_line_matches(r"Device Total", prof_output)
+
+  def testSorting(self):
+    node1 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        all_start_micros=123,
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=4)
+
+    node2 = step_stats_pb2.NodeExecStats(
+        node_name="Mul/456",
+        all_start_micros=122,
+        op_start_rel_micros=1,
+        op_end_rel_micros=2,
+        all_end_rel_micros=5)
+
+    run_metadata = config_pb2.RunMetadata()
+    device1 = run_metadata.step_stats.dev_stats.add()
+    device1.device = "deviceA"
+    device1.node_stats.extend([node1, node2])
+
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = "Add/123"
+    op1.traceback = [("a/b/file2", 10, "some_var")]
+    op1.type = "add"
+    op2 = test.mock.MagicMock()
+    op2.name = "Mul/456"
+    op2.traceback = [("a/b/file1", 11, "some_var")]
+    op2.type = "mul"
+    graph.get_operations.return_value = [op1, op2]
+
+    prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(graph, run_metadata)
+
+    # Default sort by start time (i.e. all_start_micros).
+    prof_output = prof_analyzer.list_profile([]).lines
+    self.assertRegexpMatches("".join(prof_output), r"Mul/456.*Add/123")
+    # Default sort in reverse.
+    prof_output = prof_analyzer.list_profile(["-r"]).lines
+    self.assertRegexpMatches("".join(prof_output), r"Add/123.*Mul/456")
+    # Sort by name.
+    prof_output = prof_analyzer.list_profile(["-s", "node"]).lines
+    self.assertRegexpMatches("".join(prof_output), r"Add/123.*Mul/456")
+    # Sort by op time (i.e. op_end_rel_micros - op_start_rel_micros).
+    prof_output = prof_analyzer.list_profile(["-s", "op_time"]).lines
+    self.assertRegexpMatches("".join(prof_output), r"Mul/456.*Add/123")
+    # Sort by exec time (i.e. all_end_rel_micros).
+    prof_output = prof_analyzer.list_profile(["-s", "exec_time"]).lines
+    self.assertRegexpMatches("".join(prof_output), r"Add/123.*Mul/456")
+    # Sort by line number.
+    prof_output = prof_analyzer.list_profile(["-s", "line"]).lines
+    self.assertRegexpMatches("".join(prof_output), r"Mul/456.*Add/123")
+
+  def testFiltering(self):
+    node1 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        all_start_micros=123,
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=4)
+
+    node2 = step_stats_pb2.NodeExecStats(
+        node_name="Mul/456",
+        all_start_micros=122,
+        op_start_rel_micros=1,
+        op_end_rel_micros=2,
+        all_end_rel_micros=5)
+
+    run_metadata = config_pb2.RunMetadata()
+    device1 = run_metadata.step_stats.dev_stats.add()
+    device1.device = "deviceA"
+    device1.node_stats.extend([node1, node2])
+
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = "Add/123"
+    op1.traceback = [("a/b/file2", 10, "some_var")]
+    op1.type = "add"
+    op2 = test.mock.MagicMock()
+    op2.name = "Mul/456"
+    op2.traceback = [("a/b/file1", 11, "some_var")]
+    op2.type = "mul"
+    graph.get_operations.return_value = [op1, op2]
+
+    prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(graph, run_metadata)
+
+    # Filter by name
+    prof_output = prof_analyzer.list_profile(["-n", "Add"]).lines
+    _assert_at_least_one_line_matches(r"Add/123", prof_output)
+    _assert_no_lines_match(r"Mul/456", prof_output)
+    # Filter by op_type
+    prof_output = prof_analyzer.list_profile(["-t", "mul"]).lines
+    _assert_at_least_one_line_matches(r"Mul/456", prof_output)
+    _assert_no_lines_match(r"Add/123", prof_output)
+    # Filter by file name.
+    prof_output = prof_analyzer.list_profile(["-f", ".*file2"]).lines
+    _assert_at_least_one_line_matches(r"Add/123", prof_output)
+    _assert_no_lines_match(r"Mul/456", prof_output)
+    # Fitler by execution time.
+    prof_output = prof_analyzer.list_profile(["-e", "[5, 10]"]).lines
+    _assert_at_least_one_line_matches(r"Mul/456", prof_output)
+    _assert_no_lines_match(r"Add/123", prof_output)
+    # Fitler by op time.
+    prof_output = prof_analyzer.list_profile(["-o", ">=2"]).lines
+    _assert_at_least_one_line_matches(r"Add/123", prof_output)
+    _assert_no_lines_match(r"Mul/456", prof_output)
+
+  def testSpecifyingTimeUnit(self):
+    node1 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        all_start_micros=123,
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=4)
+
+    node2 = step_stats_pb2.NodeExecStats(
+        node_name="Mul/456",
+        all_start_micros=122,
+        op_start_rel_micros=1,
+        op_end_rel_micros=2,
+        all_end_rel_micros=5)
+
+    run_metadata = config_pb2.RunMetadata()
+    device1 = run_metadata.step_stats.dev_stats.add()
+    device1.device = "deviceA"
+    device1.node_stats.extend([node1, node2])
+
+    graph = test.mock.MagicMock()
+    op1 = test.mock.MagicMock()
+    op1.name = "Add/123"
+    op1.traceback = [("a/b/file2", 10, "some_var")]
+    op1.type = "add"
+    op2 = test.mock.MagicMock()
+    op2.name = "Mul/456"
+    op2.traceback = [("a/b/file1", 11, "some_var")]
+    op2.type = "mul"
+    graph.get_operations.return_value = [op1, op2]
+
+    prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(graph, run_metadata)
+
+    # Force time unit.
+    prof_output = prof_analyzer.list_profile(["--time_unit", "ms"]).lines
+    _assert_at_least_one_line_matches(r"Add/123.*add.*0\.002ms", prof_output)
+    _assert_at_least_one_line_matches(r"Mul/456.*mul.*0\.005ms", prof_output)
+    _assert_at_least_one_line_matches(r"Device Total.*0\.009ms", prof_output)
+
+
+class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    super(ProfileAnalyzerPrintSourceTest, self).setUp()
+
+    options = config_pb2.RunOptions()
+    options.trace_level = config_pb2.RunOptions.FULL_TRACE
+    run_metadata = config_pb2.RunMetadata()
+    with session.Session() as sess:
+      loop_cond = lambda x: math_ops.less(x, 10)
+      self.loop_cond_lineno = _line_number_above()
+      loop_body = lambda x: math_ops.add(x, 1)
+      self.loop_body_lineno = _line_number_above()
+      x = constant_op.constant(0, name="x")
+      self.x_lineno = _line_number_above()
+      loop = control_flow_ops.while_loop(loop_cond, loop_body, [x])
+      self.loop_lineno = _line_number_above()
+      self.assertEqual(
+          10, sess.run(loop, options=options, run_metadata=run_metadata))
+
+      self.prof_analyzer = profile_analyzer_cli.ProfileAnalyzer(
+          sess.graph, run_metadata)
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    super(ProfileAnalyzerPrintSourceTest, self).tearDown()
+
+  def testPrintSourceForWhileLoop(self):
+    prof_output = self.prof_analyzer.print_source([__file__])
+
+    _assert_at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*us .*2\(22\) .*L%d.*(\S)+" % self.loop_cond_lineno,
+        prof_output.lines)
+    _assert_at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*us .*2\(20\) .*L%d.*(\S)+" % self.loop_body_lineno,
+        prof_output.lines)
+    _assert_at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*us .*7\(55\) .*L%d.*(\S)+" % self.loop_lineno,
+        prof_output.lines)
+
+  def testPrintSourceOutputContainsClickableLinks(self):
+    prof_output = self.prof_analyzer.print_source([__file__])
+    any_match, line_index = _at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*us .*2\(22\) .*L%d.*(\S)+" % self.loop_cond_lineno,
+        prof_output.lines)
+    self.assertTrue(any_match)
+    any_menu_item_match = False
+    for seg in prof_output.font_attr_segs[line_index]:
+      if (isinstance(seg[2][1], debugger_cli_common.MenuItem) and
+          seg[2][1].content.startswith("lp --file_path_filter ") and
+          "--min_lineno %d" % self.loop_cond_lineno in seg[2][1].content and
+          "--max_lineno %d" % (self.loop_cond_lineno + 1) in seg[2][1].content):
+        any_menu_item_match = True
+        break
+    self.assertTrue(any_menu_item_match)
+
+  def testPrintSourceWithNonDefaultTimeUnit(self):
+    prof_output = self.prof_analyzer.print_source([
+        __file__, "--time_unit", "ms"])
+
+    _assert_at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*ms .*2\(22\) .*L%d.*(\S)+" % self.loop_cond_lineno,
+        prof_output.lines)
+    _assert_at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*ms .*2\(20\) .*L%d.*(\S)+" % self.loop_body_lineno,
+        prof_output.lines)
+    _assert_at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*ms .*7\(55\) .*L%d.*(\S)+" % self.loop_lineno,
+        prof_output.lines)
+
+  def testPrintSourceWithNodeNameFilter(self):
+    prof_output = self.prof_analyzer.print_source([
+        __file__, "--node_name_filter", "x$"])
+
+    _assert_at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*us .*1\(1\) .*L%d.*(\S)+" % self.x_lineno,
+        prof_output.lines)
+    _assert_no_lines_match(
+        r"\[(\|)+(\s)*\] .*us .*2\(22\) .*L%d.*(\S)+" % self.loop_cond_lineno,
+        prof_output.lines)
+    _assert_no_lines_match(
+        r"\[(\|)+(\s)*\] .*us .*2\(20\) .*L%d.*(\S)+" % self.loop_body_lineno,
+        prof_output.lines)
+    _assert_no_lines_match(
+        r"\[(\|)+(\s)*\] .*ms .*7\(55\) .*L%d.*(\S)+" % self.loop_lineno,
+        prof_output.lines)
+
+    # Check clickable link.
+    _, line_index = _at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*us .*1\(1\) .*L%d.*(\S)+" % self.x_lineno,
+        prof_output.lines)
+    any_menu_item_match = False
+    for seg in prof_output.font_attr_segs[line_index]:
+      if (isinstance(seg[2][1], debugger_cli_common.MenuItem) and
+          seg[2][1].content.startswith("lp --file_path_filter ") and
+          "--node_name_filter x$" in seg[2][1].content and
+          "--min_lineno %d" % self.x_lineno in seg[2][1].content and
+          "--max_lineno %d" % (self.x_lineno + 1) in seg[2][1].content):
+        any_menu_item_match = True
+        break
+    self.assertTrue(any_menu_item_match)
+
+  def testPrintSourceWithOpTypeFilter(self):
+    prof_output = self.prof_analyzer.print_source([
+        __file__, "--op_type_filter", "Less"])
+
+    _assert_at_least_one_line_matches(
+        r"\[(\|)+(\s)*\] .*us .*1\(11\) .*L%d.*(\S)+" % self.loop_cond_lineno,
+        prof_output.lines)
+    _assert_no_lines_match(
+        r"\[(\|)+(\s)*\] .*us .*2\(20\) .*L%d.*(\S)+" % self.loop_body_lineno,
+        prof_output.lines)
+    _assert_no_lines_match(
+        r"\[(\|)+(\s)*\] .*us .*7\(55\) .*L%d.*(\S)+" % self.loop_lineno,
+        prof_output.lines)
+
+  def testPrintSourceWithNonexistentDeviceGivesCorrectErrorMessage(self):
+    prof_output = self.prof_analyzer.print_source([
+        __file__, "--device_name_filter", "foo_device"])
+
+    _assert_at_least_one_line_matches(
+        r"The source file .* does not contain any profile information for the "
+        "previous Session run", prof_output.lines)
+    _assert_at_least_one_line_matches(
+        r".*--device_name_filter: foo_device", prof_output.lines)
+
+  def testPrintSourceWithUnrelatedFileShowsCorrectErrorMessage(self):
+    prof_output = self.prof_analyzer.print_source([tf_inspect.__file__])
+    _assert_at_least_one_line_matches(
+        r"The source file .* does not contain any profile information for the "
+        "previous Session run", prof_output.lines)
+
+  def testPrintSourceOutputContainsInitScrollPosAnnotation(self):
+    prof_output = self.prof_analyzer.print_source([
+        __file__, "--init_line", str(self.loop_cond_lineno)])
+    self.assertEqual(
+        self.loop_cond_lineno + 1,  # The extra line is due to the head lines.
+        prof_output.annotations[debugger_cli_common.INIT_SCROLL_POS_KEY])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/cli/stepper_cli.py b/tensorflow/python/debug/cli/stepper_cli.py
index aee084983211f3881606e1e8cd4018ba5e649946..94eb2754da21b2a6c66271f53a2a0917deb25515 100644
--- a/tensorflow/python/debug/cli/stepper_cli.py
+++ b/tensorflow/python/debug/cli/stepper_cli.py
@@ -68,19 +68,19 @@ class NodeStepperCLI(object):
   _UPDATED_ATTRIBUTE = "bold"
 
   _STATE_COLORS = {
-      STATE_CONT: "green",
-      STATE_DIRTY_VARIABLE: "magenta",
-      STATE_DUMPED_INTERMEDIATE: "blue",
-      STATE_OVERRIDDEN: "yellow",
-      STATE_IS_PLACEHOLDER: "cyan",
-      STATE_UNFEEDABLE: "red",
+      STATE_CONT: cli_shared.COLOR_GREEN,
+      STATE_DIRTY_VARIABLE: cli_shared.COLOR_MAGENTA,
+      STATE_DUMPED_INTERMEDIATE: cli_shared.COLOR_BLUE,
+      STATE_OVERRIDDEN: cli_shared.COLOR_YELLOW,
+      STATE_IS_PLACEHOLDER: cli_shared.COLOR_CYAN,
+      STATE_UNFEEDABLE: cli_shared.COLOR_RED,
   }
 
   _FEED_COLORS = {
-      stepper.NodeStepper.FEED_TYPE_CLIENT: "white",
-      stepper.NodeStepper.FEED_TYPE_HANDLE: "green",
-      stepper.NodeStepper.FEED_TYPE_OVERRIDE: "yellow",
-      stepper.NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE: "blue",
+      stepper.NodeStepper.FEED_TYPE_CLIENT: cli_shared.COLOR_WHITE,
+      stepper.NodeStepper.FEED_TYPE_HANDLE: cli_shared.COLOR_GREEN,
+      stepper.NodeStepper.FEED_TYPE_OVERRIDE: cli_shared.COLOR_YELLOW,
+      stepper.NodeStepper.FEED_TYPE_DUMPED_INTERMEDIATE: cli_shared.COLOR_BLUE,
   }
 
   def __init__(self, node_stepper):
diff --git a/tensorflow/python/debug/cli/tensor_format.py b/tensorflow/python/debug/cli/tensor_format.py
index c3c4bcf215020baab6ce7fbd1125e47abc6d001d..bb7ac314303269481de802460383dfd94bba1355 100644
--- a/tensorflow/python/debug/cli/tensor_format.py
+++ b/tensorflow/python/debug/cli/tensor_format.py
@@ -24,6 +24,7 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.debug.cli import debugger_cli_common
+from tensorflow.python.debug.lib import debug_data
 
 _NUMPY_OMISSION = "...,"
 _NUMPY_DEFAULT_EDGE_ITEMS = 3
@@ -112,10 +113,10 @@ def format_tensor(tensor,
           (8 + proper_len + 1, 8 + proper_len + 1 + debug_op_len, "yellow")
       ]
 
-  if tensor is None:
+  if isinstance(tensor, debug_data.InconvertibleTensorProto):
     if lines:
       lines.append("")
-    lines.append("Uninitialized tensor")
+    lines.extend(str(tensor).split("\n"))
     return debugger_cli_common.RichTextLines(lines)
   elif not isinstance(tensor, np.ndarray):
     # If tensor is not a np.ndarray, return simple text-line representation of
diff --git a/tensorflow/python/debug/cli/tensor_format_test.py b/tensorflow/python/debug/cli/tensor_format_test.py
index 8392a873675a07ea549e7b6cef5cc3f8a856e47a..ec80bb998ef59de9dc0f6f3a4bcc29b910acf5f7 100644
--- a/tensorflow/python/debug/cli/tensor_format_test.py
+++ b/tensorflow/python/debug/cli/tensor_format_test.py
@@ -20,7 +20,11 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import tensor_pb2
+from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.core.framework import types_pb2
 from tensorflow.python.debug.cli import tensor_format
+from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
 
@@ -363,10 +367,28 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
       if i < 1:
         self.assertNotIn(p + i * 6 + 5, out.annotations)
 
-  def testFormatNone(self):
-    out = tensor_format.format_tensor(None, "a")
+  def testFormatUninitializedTensor(self):
+    tensor_proto = tensor_pb2.TensorProto(
+        dtype=types_pb2.DataType.Value("DT_FLOAT"),
+        tensor_shape=tensor_shape_pb2.TensorShapeProto(
+            dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]))
+    out = tensor_format.format_tensor(
+        debug_data.InconvertibleTensorProto(tensor_proto, False), "a")
+
+    self.assertEqual(["Tensor \"a\":", "", "Uninitialized tensor:"],
+                     out.lines[:3])
+    self.assertEqual(str(tensor_proto).split("\n"), out.lines[3:])
 
-    self.assertEqual(["Tensor \"a\":", "", "Uninitialized tensor"], out.lines)
+  def testFormatResourceTypeTensor(self):
+    tensor_proto = tensor_pb2.TensorProto(
+        dtype=types_pb2.DataType.Value("DT_RESOURCE"),
+        tensor_shape=tensor_shape_pb2.TensorShapeProto(
+            dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]))
+    out = tensor_format.format_tensor(
+        debug_data.InconvertibleTensorProto(tensor_proto), "a")
+
+    self.assertEqual(["Tensor \"a\":", ""], out.lines[:2])
+    self.assertEqual(str(tensor_proto).split("\n"), out.lines[2:])
 
   def testLocateTensorElement1DNoEllipsis(self):
     a = np.zeros(20)
@@ -821,9 +843,15 @@ class RichTextLinesTest(test_util.TensorFlowTestCase):
     self.assertEqual([12, None], end_cols)
 
   def testLocateTensorElementAnnotationsUnavailable(self):
-    out = tensor_format.format_tensor(None, "a")
+    tensor_proto = tensor_pb2.TensorProto(
+        dtype=types_pb2.DataType.Value("DT_FLOAT"),
+        tensor_shape=tensor_shape_pb2.TensorShapeProto(
+            dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)]))
+    out = tensor_format.format_tensor(
+        debug_data.InconvertibleTensorProto(tensor_proto, False), "a")
 
-    self.assertEqual(["Tensor \"a\":", "", "Uninitialized tensor"], out.lines)
+    self.assertEqual(["Tensor \"a\":", "", "Uninitialized tensor:"],
+                     out.lines[:3])
 
     with self.assertRaisesRegexp(
         AttributeError, "tensor_metadata is not available in annotations"):
diff --git a/tensorflow/python/debug/examples/README.md b/tensorflow/python/debug/examples/README.md
index 41bf777c97a6a1f3536c2f64136aa111243125a6..cb4d484092fe39698de1ff11e4d50d4879960e0c 100644
--- a/tensorflow/python/debug/examples/README.md
+++ b/tensorflow/python/debug/examples/README.md
@@ -1,4 +1,9 @@
 Hi, there!
 
-The documentation of **TensorFlow Debugger (tfdbg)** has moved to
-[this new location](../../../g3doc/how_tos/debugger/index.md).
+The documentation of **TensorFlow Debugger (tfdbg)** has moved.
+
+See the source version at
+[this new location](../../../docs_src/programmers_guide/debugger.md).
+
+See the public website version at
+[https://www.tensorflow.org/programmers_guide/debugger](https://www.tensorflow.org/programmers_guide/debugger).
diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh
index 7830954f0560e646452bbbf3248143f11d5f822c..0b5401a7f29a3e979de157b47f952047cecd6a4e 100755
--- a/tensorflow/python/debug/examples/examples_test.sh
+++ b/tensorflow/python/debug/examples/examples_test.sh
@@ -66,17 +66,18 @@ run
 exit
 EOF
 
-# Use a large enough "run -t" number to let the process end properly.
-cat << EOF | ${DEBUG_MNIST_BIN} --debug --fake_data --ui_type=readline
+cat << EOF | ${DEBUG_MNIST_BIN} --debug --max_steps=1 --fake_data --ui_type=readline
+run -t 1
+run --node_name_filter hidden --op_type_filter MatMul
 run -f has_inf_or_nan
-run -t 1000
 EOF
 
 # Test the custom dump_root option.
 CUSTOM_DUMP_ROOT=$(mktemp -d)
 mkdir -p ${CUSTOM_DUMP_ROOT}
 
-cat << EOF | ${DEBUG_TFLEARN_IRIS_BIN} --debug --fake_data --train_steps=1 --dump_root="${CUSTOM_DUMP_ROOT}" --ui_type=readline
+cat << EOF | ${DEBUG_TFLEARN_IRIS_BIN} --debug --fake_data --train_steps=2 --dump_root="${CUSTOM_DUMP_ROOT}" --ui_type=readline
+run -p
 run -f has_inf_or_nan
 EOF
 
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 96772aed8f63b3252b4e716beddfff63f593c54b..1f5b8af0db78bc5ebaba29e0e163fca6a3a513ac 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -39,6 +39,30 @@ FETCHES_INFO_FILE_TAG = "fetches_info_"
 FEED_KEYS_INFO_FILE_TAG = "feed_keys_info_"
 
 
+class InconvertibleTensorProto(object):
+  """Represents a TensorProto that cannot be converted to np.ndarray."""
+
+  def __init__(self, tensor_proto, initialized=True):
+    """Constructor.
+
+    Args:
+      tensor_proto: the `TensorProto` object that cannot be represented as a
+        `np.ndarray` object.
+      initialized: (`bool`) whether the Tensor is initialized.
+    """
+    self._tensor_proto = tensor_proto
+    self._initialized = initialized
+
+  def __str__(self):
+    output = "" if self._initialized else "Uninitialized tensor:\n"
+    output += str(self._tensor_proto)
+    return output
+
+  @property
+  def initialized(self):
+    return self._initialized
+
+
 def load_tensor_from_event_file(event_file_path):
   """Load a tensor from an event file.
 
@@ -69,26 +93,27 @@ def load_tensor_from_event(event):
         summary.value[0] field.
 
   Returns:
-    The tensor value loaded from the event file, as a `numpy.ndarray`. For
-    uninitialized Tensors, returns `None`. For Tensors of data types that
-    cannot be converted to `numpy.ndarray` (e.g., `tf.resource`), return
-    `None`.
+    The tensor value loaded from the event file, as a `numpy.ndarray`, if
+    representation of the tensor value by a `numpy.ndarray` is possible.
+    For uninitialized Tensors, returns `None`. For Tensors of data types that
+    cannot be represented as `numpy.ndarray` (e.g., `tf.resource`), return
+    the `TensorProto` protobuf object without converting it to a
+    `numpy.ndarray`.
   """
 
-  if (event.summary.value[0].tensor.tensor_content or
-      event.summary.value[0].tensor.string_val):
+  tensor_proto = event.summary.value[0].tensor
+  if tensor_proto.tensor_content or tensor_proto.string_val:
     # Initialized tensor.
-    tensor_proto = event.summary.value[0].tensor
     if tensor_proto.dtype == types_pb2.DT_RESOURCE:
-      return None
+      tensor_value = InconvertibleTensorProto(tensor_proto)
     else:
       try:
         tensor_value = tensor_util.MakeNdarray(tensor_proto)
       except KeyError:
-        tensor_value = None
+        tensor_value = InconvertibleTensorProto(tensor_proto)
   else:
     # Uninitialized tensor or tensor of unconvertible data type.
-    tensor_value = None
+    tensor_value = InconvertibleTensorProto(tensor_proto, False)
 
   return tensor_value
 
@@ -199,7 +224,7 @@ def _get_tensor_watch_key(node_name, output_slot, debug_op):
   return "%s:%s" % (_get_tensor_name(node_name, output_slot), debug_op)
 
 
-def _is_copy_node(node_name):
+def is_copy_node(node_name):
   """Determine whether a node name is that of a debug Copy node.
 
   Such nodes are inserted by TensorFlow core upon request in
@@ -215,7 +240,7 @@ def _is_copy_node(node_name):
   return node_name.startswith("__copy_")
 
 
-def _is_debug_node(node_name):
+def is_debug_node(node_name):
   """Determine whether a node name is that of a debug node.
 
   Such nodes are inserted by TensorFlow core upon request in
@@ -230,7 +255,7 @@ def _is_debug_node(node_name):
   return node_name.startswith("__dbg_")
 
 
-def _parse_debug_node_name(node_name):
+def parse_debug_node_name(node_name):
   """Parse the name of a debug node.
 
   Args:
@@ -290,8 +315,10 @@ def has_inf_or_nan(datum, tensor):
 
   _ = datum  # Datum metadata is unused in this predicate.
 
-  if tensor is None:
+  if isinstance(tensor, InconvertibleTensorProto):
     # Uninitialized tensor doesn't have bad numerical values.
+    # Also return False for data types that cannot be represented as numpy
+    # arrays.
     return False
   elif (np.issubdtype(tensor.dtype, np.float) or
         np.issubdtype(tensor.dtype, np.complex) or
@@ -494,6 +521,10 @@ class DebugTensorDatum(object):
     return self._dump_size_bytes
 
 
+class WatchKeyDoesNotExistInDebugDumpDirError(ValueError):
+  pass
+
+
 class DebugDumpDir(object):
   """Data set from a debug-dump directory on filesystem.
 
@@ -791,12 +822,12 @@ class DebugDumpDir(object):
       ValueError: If duplicate node names are encountered.
     """
 
-    if _is_debug_node(node.name):
+    if is_debug_node(node.name):
       # This is a debug node. Parse the node name and retrieve the
       # information about debug watches on tensors. But do not include
       # the node in the graph.
       (watched_node_name, watched_output_slot, _,
-       debug_op) = _parse_debug_node_name(node.name)
+       debug_op) = parse_debug_node_name(node.name)
 
       self._debug_watches[watched_node_name][watched_output_slot].add(
           debug_op)
@@ -820,7 +851,7 @@ class DebugDumpDir(object):
     self._node_op_types[node.name] = node.op
 
     for inp in node.input:
-      if _is_copy_node(inp) and node.op == "_Send":
+      if is_copy_node(inp) and (node.op == "_Send" or node.op == "_Retval"):
         self._copy_send_nodes.append(node.name)
 
       if inp.startswith("^"):
@@ -855,14 +886,14 @@ class DebugDumpDir(object):
       if node in self._copy_send_nodes:
         continue
 
-      if _is_copy_node(node):
+      if is_copy_node(node):
         copy_nodes.append(node)
 
       inputs = self._node_inputs[node]
 
       for i in xrange(len(inputs)):
         inp = inputs[i]
-        if _is_copy_node(inp):
+        if is_copy_node(inp):
           # Find the input to the Copy node, which should be the original
           # input to the node.
           orig_inp = self._node_inputs[inp][0]
@@ -878,7 +909,7 @@ class DebugDumpDir(object):
       ctrl_inputs = self._node_ctrl_inputs[node]
       debug_op_inputs = []
       for ctrl_inp in ctrl_inputs:
-        if _is_debug_node(ctrl_inp):
+        if is_debug_node(ctrl_inp):
           debug_op_inputs.append(ctrl_inp)
       for debug_op_inp in debug_op_inputs:
         ctrl_inputs.remove(debug_op_inp)
@@ -948,7 +979,7 @@ class DebugDumpDir(object):
       slot = datum.output_slot
       # In some cases (e.g., system clocks with insufficient precision),
       # the upstream and downstream tensors may have identical timestamps, the
-      # following check examines this possibilty and avoids raising an error if
+      # following check examines this possibility and avoids raising an error if
       # that is the case.
       if not self._satisfied_at_timestamp(
           pending_inputs[node], datum.timestamp, start_i=i + 1):
@@ -1381,13 +1412,14 @@ class DebugDumpDir(object):
         may be dumped multiple times.
 
     Raises:
-      ValueError: If the tensor does not exist in the debug-dump data.
+      WatchKeyDoesNotExistInDebugDumpDirError: If the tensor does not exist in
+        the debug-dump data.
     """
 
     watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
     if watch_key not in self._watch_key_to_datum:
-      raise ValueError("Watch key \"%s\" does not exist in the debug dump" %
-                       watch_key)
+      raise WatchKeyDoesNotExistInDebugDumpDirError(
+          "Watch key \"%s\" does not exist in the debug dump" % watch_key)
 
     return [datum.file_path for datum in self._watch_key_to_datum[watch_key]]
 
@@ -1406,13 +1438,14 @@ class DebugDumpDir(object):
       List of tensors (`numpy.ndarray`) loaded from the debug-dump file(s).
 
     Raises:
-      ValueError: If the tensor does not exist in the debug-dump data.
+      WatchKeyDoesNotExistInDebugDumpDirError: If the tensor does not exist in
+        the debug-dump data.
     """
 
     watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
     if watch_key not in self._watch_key_to_datum:
-      raise ValueError("Watch key \"%s\" does not exist in the debug dump" %
-                       watch_key)
+      raise WatchKeyDoesNotExistInDebugDumpDirError(
+          "Watch key \"%s\" does not exist in the debug dump" % watch_key)
 
     return [datum.get_tensor() for datum in self._watch_key_to_datum[watch_key]]
 
@@ -1433,13 +1466,14 @@ class DebugDumpDir(object):
       (`list` of `int`) list of relative timestamps.
 
     Raises:
-      ValueError: If the tensor watch key does not exist in the debug dump data.
+      WatchKeyDoesNotExistInDebugDumpDirError: If the tensor watch key does not
+        exist in the debug dump data.
     """
 
     watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
     if watch_key not in self._watch_key_to_datum:
-      raise ValueError("Watch key \"%s\" does not exist in the debug dump" %
-                       watch_key)
+      raise WatchKeyDoesNotExistInDebugDumpDirError(
+          "Watch key \"%s\" does not exist in the debug dump" % watch_key)
 
     return self._watch_key_to_rel_time[watch_key]
 
@@ -1457,13 +1491,14 @@ class DebugDumpDir(object):
       (`list` of `int`): list of dump file sizes in bytes.
 
     Raises:
-      ValueError: If the tensor watch key does not exist in the debug dump data.
+      WatchKeyDoesNotExistInDebugDumpDirError: If the tensor watch key does not
+        exist in the debug dump data.
     """
 
     watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
     if watch_key not in self._watch_key_to_datum:
-      raise ValueError("Watch key \"%s\" does not exist in the debug dump" %
-                       watch_key)
+      raise WatchKeyDoesNotExistInDebugDumpDirError(
+          "Watch key \"%s\" does not exist in the debug dump" % watch_key)
 
     return self._watch_key_to_dump_size_bytes[watch_key]
 
diff --git a/tensorflow/python/debug/lib/debug_data_test.py b/tensorflow/python/debug/lib/debug_data_test.py
index 7ca222785b4e82f36f5be42c9884fff7553c496d..dc45e8df6cedb59aafb4e50e145084ee4f19c1a8 100644
--- a/tensorflow/python/debug/lib/debug_data_test.py
+++ b/tensorflow/python/debug/lib/debug_data_test.py
@@ -23,6 +23,7 @@ import tempfile
 
 import numpy as np
 
+from tensorflow.core.framework import tensor_pb2
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@@ -47,24 +48,24 @@ class ParseNodeOrTensorNameTest(test_util.TensorFlowTestCase):
 class NodeNameChecksTest(test_util.TensorFlowTestCase):
 
   def testIsCopyNode(self):
-    self.assertTrue(debug_data._is_copy_node("__copy_ns1/ns2/node3_0"))
+    self.assertTrue(debug_data.is_copy_node("__copy_ns1/ns2/node3_0"))
 
-    self.assertFalse(debug_data._is_copy_node("copy_ns1/ns2/node3_0"))
-    self.assertFalse(debug_data._is_copy_node("_copy_ns1/ns2/node3_0"))
-    self.assertFalse(debug_data._is_copy_node("_copyns1/ns2/node3_0"))
-    self.assertFalse(debug_data._is_copy_node("__dbg_ns1/ns2/node3_0"))
+    self.assertFalse(debug_data.is_copy_node("copy_ns1/ns2/node3_0"))
+    self.assertFalse(debug_data.is_copy_node("_copy_ns1/ns2/node3_0"))
+    self.assertFalse(debug_data.is_copy_node("_copyns1/ns2/node3_0"))
+    self.assertFalse(debug_data.is_copy_node("__dbg_ns1/ns2/node3_0"))
 
   def testIsDebugNode(self):
     self.assertTrue(
-        debug_data._is_debug_node("__dbg_ns1/ns2/node3:0_0_DebugIdentity"))
+        debug_data.is_debug_node("__dbg_ns1/ns2/node3:0_0_DebugIdentity"))
 
     self.assertFalse(
-        debug_data._is_debug_node("dbg_ns1/ns2/node3:0_0_DebugIdentity"))
+        debug_data.is_debug_node("dbg_ns1/ns2/node3:0_0_DebugIdentity"))
     self.assertFalse(
-        debug_data._is_debug_node("_dbg_ns1/ns2/node3:0_0_DebugIdentity"))
+        debug_data.is_debug_node("_dbg_ns1/ns2/node3:0_0_DebugIdentity"))
     self.assertFalse(
-        debug_data._is_debug_node("_dbgns1/ns2/node3:0_0_DebugIdentity"))
-    self.assertFalse(debug_data._is_debug_node("__copy_ns1/ns2/node3_0"))
+        debug_data.is_debug_node("_dbgns1/ns2/node3:0_0_DebugIdentity"))
+    self.assertFalse(debug_data.is_debug_node("__copy_ns1/ns2/node3_0"))
 
 
 class ParseDebugNodeNameTest(test_util.TensorFlowTestCase):
@@ -72,7 +73,7 @@ class ParseDebugNodeNameTest(test_util.TensorFlowTestCase):
   def testParseDebugNodeName_valid(self):
     debug_node_name_1 = "__dbg_ns_a/ns_b/node_c:1_0_DebugIdentity"
     (watched_node, watched_output_slot, debug_op_index,
-     debug_op) = debug_data._parse_debug_node_name(debug_node_name_1)
+     debug_op) = debug_data.parse_debug_node_name(debug_node_name_1)
 
     self.assertEqual("ns_a/ns_b/node_c", watched_node)
     self.assertEqual(1, watched_output_slot)
@@ -83,20 +84,20 @@ class ParseDebugNodeNameTest(test_util.TensorFlowTestCase):
     invalid_debug_node_name_1 = "__copy_ns_a/ns_b/node_c:1_0_DebugIdentity"
 
     with self.assertRaisesRegexp(ValueError, "Invalid prefix"):
-      debug_data._parse_debug_node_name(invalid_debug_node_name_1)
+      debug_data.parse_debug_node_name(invalid_debug_node_name_1)
 
   def testParseDebugNodeName_missingDebugOpIndex(self):
     invalid_debug_node_name_1 = "__dbg_node1:0_DebugIdentity"
 
     with self.assertRaisesRegexp(ValueError, "Invalid debug node name"):
-      debug_data._parse_debug_node_name(invalid_debug_node_name_1)
+      debug_data.parse_debug_node_name(invalid_debug_node_name_1)
 
   def testParseDebugNodeName_invalidWatchedTensorName(self):
     invalid_debug_node_name_1 = "__dbg_node1_0_DebugIdentity"
 
     with self.assertRaisesRegexp(ValueError,
                                  "Invalid tensor name in debug node name"):
-      debug_data._parse_debug_node_name(invalid_debug_node_name_1)
+      debug_data.parse_debug_node_name(invalid_debug_node_name_1)
 
 
 class HasNanOrInfTest(test_util.TensorFlowTestCase):
@@ -125,9 +126,15 @@ class HasNanOrInfTest(test_util.TensorFlowTestCase):
     a = np.array([])
     self.assertFalse(debug_data.has_inf_or_nan(self._dummy_datum, a))
 
-  def testNone(self):
-    a = None
-    self.assertFalse(debug_data.has_inf_or_nan(self._dummy_datum, a))
+  def testInconvertibleTensorProto(self):
+    self.assertFalse(debug_data.has_inf_or_nan(
+        self._dummy_datum,
+        debug_data.InconvertibleTensorProto(tensor_pb2.TensorProto(),
+                                            initialized=False)))
+    self.assertFalse(debug_data.has_inf_or_nan(
+        self._dummy_datum,
+        debug_data.InconvertibleTensorProto(tensor_pb2.TensorProto(),
+                                            initialized=True)))
 
   def testDTypeComplexWorks(self):
     a = np.array([1j, 3j, 3j, 7j], dtype=np.complex128)
diff --git a/tensorflow/python/debug/lib/debug_utils.py b/tensorflow/python/debug/lib/debug_utils.py
index 9013cb096d9dc7874dbc3cb931bb27bebd57e880..f1e972940b7154aab607bbe11a19ecd74199aee4 100644
--- a/tensorflow/python/debug/lib/debug_utils.py
+++ b/tensorflow/python/debug/lib/debug_utils.py
@@ -121,7 +121,7 @@ def watch_graph(run_options,
       are set, the two filtering operations will occur in a logical `AND`
       relation. In other words, a node will be included if and only if it
       hits both whitelists.
-    tensor_dtype_regex_whitelist: Regular-experssion whitelist for Tensor
+    tensor_dtype_regex_whitelist: Regular-expression whitelist for Tensor
       data type, e.g., `"^int.*"`.
       This whitelist operates in logical `AND` relations to the two whitelists
       above.
@@ -210,7 +210,7 @@ def watch_graph_with_blacklists(run_options,
       relation. In other words, a node will be excluded if it hits either of
       the two blacklists; a node will be included if and only if it hits
       neither of the blacklists.
-    tensor_dtype_regex_blacklist: Regular-experssion blacklist for Tensor
+    tensor_dtype_regex_blacklist: Regular-expression blacklist for Tensor
       data type, e.g., `"^int.*"`.
       This blacklist operates in logical `OR` relations to the two whitelists
       above.
diff --git a/tensorflow/python/debug/lib/debug_utils_test.py b/tensorflow/python/debug/lib/debug_utils_test.py
index d4978fa235d1210d837c560b5baa88285f6af180..5b1875e092b4fbf8b2b1bc28a46be6999c049e34 100644
--- a/tensorflow/python/debug/lib/debug_utils_test.py
+++ b/tensorflow/python/debug/lib/debug_utils_test.py
@@ -25,6 +25,8 @@ from tensorflow.python.debug.lib import debug_utils
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
diff --git a/tensorflow/python/debug/lib/profiling.py b/tensorflow/python/debug/lib/profiling.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd580251a690f1044bbffc11e6cce968524bdcf6
--- /dev/null
+++ b/tensorflow/python/debug/lib/profiling.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Data structures and algorithms for profiling information."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+
+class ProfileDatum(object):
+  """Profile data point."""
+
+  def __init__(self,
+               device_name,
+               node_exec_stats,
+               file_path,
+               line_number,
+               func_name,
+               op_type):
+    """Constructor.
+
+    Args:
+      device_name: (string) name of the device.
+      node_exec_stats: `NodeExecStats` proto.
+      file_path: path to the source file involved in creating the op.
+      line_number: line number in the file involved in creating the op.
+      func_name: name of the function that the line belongs to.
+      op_type: (string) Operation type.
+    """
+    self.device_name = device_name
+    self.node_exec_stats = node_exec_stats
+    self.file_path = file_path
+    self.line_number = line_number
+    self.func_name = func_name
+    if self.file_path:
+      self.file_line_func = "%s:%d(%s)" % (
+          os.path.basename(self.file_path), self.line_number, self.func_name)
+    else:
+      self.file_line_func = ""
+    self.op_type = op_type
+    self.start_time = self.node_exec_stats.all_start_micros
+    self.op_time = (self.node_exec_stats.op_end_rel_micros -
+                    self.node_exec_stats.op_start_rel_micros)
+
+  @property
+  def exec_time(self):
+    """Op execution time plus pre- and post-processing."""
+    return self.node_exec_stats.all_end_rel_micros
+
+
+class AggregateProfile(object):
+  """Profile summary data for aggregating a number of ProfileDatum."""
+
+  def __init__(self, profile_datum):
+    """Constructor.
+
+    Args:
+      profile_datum: (`ProfileDatum`) an instance of `ProfileDatum` to
+        initialize this object with.
+    """
+
+    self.total_op_time = profile_datum.op_time
+    self.total_exec_time = profile_datum.exec_time
+    device_and_node = "%s:%s" % (profile_datum.device_name,
+                                 profile_datum.node_exec_stats.node_name)
+    self._node_to_exec_count = {device_and_node: 1}
+
+  def add(self, profile_datum):
+    """Accumulate a new instance of ProfileDatum.
+
+    Args:
+      profile_datum: (`ProfileDatum`) an instance of `ProfileDatum` to
+        accumulate to this object.
+    """
+
+    self.total_op_time += profile_datum.op_time
+    self.total_exec_time += profile_datum.exec_time
+    device_and_node = "%s:%s" % (profile_datum.device_name,
+                                 profile_datum.node_exec_stats.node_name)
+
+    device_and_node = "%s:%s" % (profile_datum.device_name,
+                                 profile_datum.node_exec_stats.node_name)
+    if device_and_node in self._node_to_exec_count:
+      self._node_to_exec_count[device_and_node] += 1
+    else:
+      self._node_to_exec_count[device_and_node] = 1
+
+  @property
+  def node_count(self):
+    return len(self._node_to_exec_count)
+
+  @property
+  def node_exec_count(self):
+    return sum(self._node_to_exec_count.values())
diff --git a/tensorflow/python/debug/lib/profiling_test.py b/tensorflow/python/debug/lib/profiling_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7b120ba5d96647e2e3cca515ac77ee23f37315f
--- /dev/null
+++ b/tensorflow/python/debug/lib/profiling_test.py
@@ -0,0 +1,100 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for the basic data structures and algorithms for profiling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import step_stats_pb2
+from tensorflow.python.debug.lib import profiling
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+class AggregateProfile(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    node_1 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        op_start_rel_micros=3,
+        op_end_rel_micros=5,
+        all_end_rel_micros=4)
+    self.profile_datum_1 = profiling.ProfileDatum(
+        "cpu:0", node_1, "/foo/bar.py", 10, "func1", "Add")
+
+    node_2 = step_stats_pb2.NodeExecStats(
+        node_name="Mul/456",
+        op_start_rel_micros=13,
+        op_end_rel_micros=16,
+        all_end_rel_micros=17)
+    self.profile_datum_2 = profiling.ProfileDatum(
+        "cpu:0", node_2, "/foo/bar.py", 11, "func1", "Mul")
+
+    node_3 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        op_start_rel_micros=103,
+        op_end_rel_micros=105,
+        all_end_rel_micros=4)
+    self.profile_datum_3 = profiling.ProfileDatum(
+        "cpu:0", node_3, "/foo/bar.py", 12, "func1", "Add")
+
+    node_4 = step_stats_pb2.NodeExecStats(
+        node_name="Add/123",
+        op_start_rel_micros=203,
+        op_end_rel_micros=205,
+        all_end_rel_micros=4)
+    self.profile_datum_4 = profiling.ProfileDatum(
+        "gpu:0", node_4, "/foo/bar.py", 13, "func1", "Add")
+
+  def testAggregateProfileConstructorWorks(self):
+    aggregate_data = profiling.AggregateProfile(self.profile_datum_1)
+
+    self.assertEqual(2, aggregate_data.total_op_time)
+    self.assertEqual(4, aggregate_data.total_exec_time)
+    self.assertEqual(1, aggregate_data.node_count)
+    self.assertEqual(1, aggregate_data.node_exec_count)
+
+  def testAddToAggregateProfileWithDifferentNodeWorks(self):
+    aggregate_data = profiling.AggregateProfile(self.profile_datum_1)
+    aggregate_data.add(self.profile_datum_2)
+
+    self.assertEqual(5, aggregate_data.total_op_time)
+    self.assertEqual(21, aggregate_data.total_exec_time)
+    self.assertEqual(2, aggregate_data.node_count)
+    self.assertEqual(2, aggregate_data.node_exec_count)
+
+  def testAddToAggregateProfileWithSameNodeWorks(self):
+    aggregate_data = profiling.AggregateProfile(self.profile_datum_1)
+    aggregate_data.add(self.profile_datum_2)
+    aggregate_data.add(self.profile_datum_3)
+
+    self.assertEqual(7, aggregate_data.total_op_time)
+    self.assertEqual(25, aggregate_data.total_exec_time)
+    self.assertEqual(2, aggregate_data.node_count)
+    self.assertEqual(3, aggregate_data.node_exec_count)
+
+  def testAddToAggregateProfileWithDifferentDeviceSameNodeWorks(self):
+    aggregate_data = profiling.AggregateProfile(self.profile_datum_1)
+    aggregate_data.add(self.profile_datum_4)
+
+    self.assertEqual(4, aggregate_data.total_op_time)
+    self.assertEqual(8, aggregate_data.total_exec_time)
+    self.assertEqual(2, aggregate_data.node_count)
+    self.assertEqual(2, aggregate_data.node_exec_count)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 511ddb1673699ea35a6686e24cfdfe1ad141e764..deb8249343fd33461b0eb30a05e4c715c970dc28 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -158,6 +158,52 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     return simple_add_results(u_init_val, v_init_val, u, v, w, u_name, v_name,
                               w_name, dump)
 
+  def testCopyNodesHaveCorrectDebugOpsAndURLsAttributeValues(self):
+    with session.Session() as sess:
+      u = variables.Variable(2.1, name="u")
+      v = variables.Variable(20.0, name="v")
+      w = math_ops.multiply(u, v, name="w")
+
+      sess.run(variables.global_variables_initializer())
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_urls = self._debug_urls()
+      debug_utils.add_debug_tensor_watch(
+          run_options,
+          "u",
+          0, ["DebugNumericSummary(gated_grpc=True)", "DebugIdentity"],
+          debug_urls=debug_urls)
+      debug_utils.add_debug_tensor_watch(
+          run_options, "v", 0, ["DebugNumericSummary"], debug_urls=debug_urls)
+
+      run_metadata = config_pb2.RunMetadata()
+      r = sess.run(w, options=run_options, run_metadata=run_metadata)
+      self.assertAllClose(42.0, r)
+
+      u_copy_node_def = None
+      v_copy_node_def = None
+      for partition_graph in run_metadata.partition_graphs:
+        for node_def in partition_graph.node:
+          if debug_data.is_copy_node(node_def.name):
+            if node_def.name == "__copy_u_0":
+              u_copy_node_def = node_def
+            elif node_def.name == "__copy_v_0":
+              v_copy_node_def = node_def
+
+      self.assertIsNotNone(u_copy_node_def)
+      debug_ops_spec = u_copy_node_def.attr["debug_ops_spec"].list.s
+      self.assertEqual(2, len(debug_ops_spec))
+      self.assertEqual("DebugNumericSummary;%s;1" % debug_urls[0],
+                       debug_ops_spec[0].decode("utf-8"))
+      self.assertEqual("DebugIdentity;%s;0" % debug_urls[0],
+                       debug_ops_spec[1].decode("utf-8"))
+
+      self.assertIsNotNone(v_copy_node_def)
+      debug_ops_spec = v_copy_node_def.attr["debug_ops_spec"].list.s
+      self.assertEqual(1, len(debug_ops_spec))
+      self.assertEqual("DebugNumericSummary;%s;0" % debug_urls[0],
+                       debug_ops_spec[0].decode("utf-8"))
+
   def testConcurrentDumpingToPathsWithOverlappingParentDirsWorks(self):
     results = self._generate_dump_from_simple_addition_graph()
     self.assertTrue(results.dump.loaded_partition_graphs())
@@ -312,9 +358,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       u_vals = dump.get_tensors(u_name, 0, "DebugIdentity")
       s_vals = dump.get_tensors(s_name, 0, "DebugIdentity")
       self.assertEqual(1, len(u_vals))
-      self.assertIsNone(u_vals[0])
+      self.assertIsInstance(u_vals[0], debug_data.InconvertibleTensorProto)
+      self.assertFalse(u_vals[0].initialized)
       self.assertEqual(1, len(s_vals))
-      self.assertIsNone(s_vals[0])
+      self.assertIsInstance(s_vals[0], debug_data.InconvertibleTensorProto)
+      self.assertFalse(s_vals[0].initialized)
 
       # Call run() again, to check that u is initialized properly.
       self.assertAllClose(u_init_val, sess.run(u))
@@ -484,9 +532,15 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       sess.run(variables.global_variables_initializer())
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
-      debug_utils.watch_graph(run_options,
-                              sess.graph,
-                              debug_urls=self._debug_urls())
+      debug_utils.watch_graph_with_blacklists(
+          run_options,
+          sess.graph,
+          node_name_regex_blacklist="(.*rnn/while/.*|.*TensorArray.*)",
+          debug_urls=self._debug_urls())
+      # b/36870549: Nodes with these name patterns need to be excluded from
+      # tfdbg in order to prevent MSAN warnings of uninitialized Tensors
+      # under both file:// and grpc:// debug URL schemes.
+
       run_metadata = config_pb2.RunMetadata()
       sess.run(train_op, feed_dict={concat_inputs: input_values},
                options=run_options, run_metadata=run_metadata)
@@ -1370,7 +1424,10 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
           self._dump_root, partition_graphs=run_metadata.partition_graphs)
       self.assertTrue(dump.loaded_partition_graphs())
 
-      self.assertIsNone(dump.get_tensors("fifo_queue", 0, "DebugIdentity")[0])
+      fifo_queue_tensor = dump.get_tensors("fifo_queue", 0, "DebugIdentity")[0]
+      self.assertIsInstance(fifo_queue_tensor,
+                            debug_data.InconvertibleTensorProto)
+      self.assertTrue(fifo_queue_tensor.initialized)
       self.assertAllClose(
           [101.0, 202.0, 303.0],
           dump.get_tensors("enqueue_many/component_0", 0, "DebugIdentity")[0])
diff --git a/tensorflow/python/debug/lib/source_utils.py b/tensorflow/python/debug/lib/source_utils.py
index cc949932cb1df455860244d6b914e71712845ed2..ad4e37d22e25ebd91bb7fdbfa0786623d1b1d77d 100644
--- a/tensorflow/python/debug/lib/source_utils.py
+++ b/tensorflow/python/debug/lib/source_utils.py
@@ -18,13 +18,78 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
+import re
+
+import numpy as np
+
+from tensorflow.python.debug.lib import profiling
+
+
+_TENSORFLOW_BASEDIR = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.dirname(
+        os.path.normpath(os.path.abspath(__file__))))))
+
+UNCOMPILED_SOURCE_SUFFIXES = (".py")
+COMPILED_SOURCE_SUFFIXES = (".pyc", ".pyo")
+
+
+def _norm_abs_path(file_path):
+  return os.path.normpath(os.path.abspath(file_path))
+
+
+def is_extension_uncompiled_python_source(file_path):
+  _, extension = os.path.splitext(file_path)
+  return extension.lower() in UNCOMPILED_SOURCE_SUFFIXES
+
+
+def is_extension_compiled_python_source(file_path):
+  _, extension = os.path.splitext(file_path)
+  return extension.lower() in COMPILED_SOURCE_SUFFIXES
 
 
 def _convert_watch_key_to_tensor_name(watch_key):
   return watch_key[:watch_key.rfind(":")]
 
 
+def guess_is_tensorflow_py_library(py_file_path):
+  """Guess whether a Python source file is a part of the tensorflow library.
+
+  Special cases:
+    1) Returns False for unit-test files in the library (*_test.py),
+    2) Returns False for files under python/debug/examples.
+
+  Args:
+    py_file_path: full path of the Python source file in question.
+
+  Returns:
+    (`bool`) Whether the file is a part of the tensorflow library.
+
+  Raises:
+    ValueError: if the extension name of py_file_path does not indicate a Python
+      source file (compiled or uncomplied).
+  """
+  if (not is_extension_uncompiled_python_source(py_file_path) and
+      not is_extension_compiled_python_source(py_file_path)):
+    raise ValueError(
+        "Input file path (%s) is not a Python source file." % py_file_path)
+  py_file_path = _norm_abs_path(py_file_path)
+
+  return (py_file_path.startswith(_TENSORFLOW_BASEDIR) and
+          not py_file_path.endswith("_test.py") and
+          not os.path.dirname(py_file_path).endswith(
+              os.path.normpath("python/debug/examples")))
+
+
+def load_source(source_file_path):
+  with open(source_file_path, "rU") as f:
+    source_text = f.read()
+  source_lines = source_text.split("\n")
+  line_num_width = int(np.ceil(np.log10(len(source_lines)))) + 3
+  return source_lines, line_num_width
+
+
 def annotate_source(dump,
                     source_file_path,
                     do_dumped_tensors=False,
@@ -61,21 +126,16 @@ def annotate_source(dump,
     raise ValueError("Cannot perform source annotation due to a lack of set "
                      "Python graph in the dump object")
 
-  source_file_path = os.path.normpath(source_file_path)
+  source_file_path = _norm_abs_path(source_file_path)
 
   line_to_op_names = {}
   for op in py_graph.get_operations():
-    try:
-      traceback = dump.node_traceback(op.name)
-    except KeyError:
-      pass
-
-    for file_path, line_number, _, _ in reversed(traceback):
+    for file_path, line_number, _, _ in reversed(dump.node_traceback(op.name)):
       if (min_line is not None and line_number < min_line or
           max_line is not None and line_number >= max_line):
         continue
 
-      if os.path.normpath(file_path) != source_file_path:
+      if _norm_abs_path(file_path) != source_file_path:
         continue
 
       if do_dumped_tensors:
@@ -95,3 +155,161 @@ def annotate_source(dump,
         break
 
   return line_to_op_names
+
+
+def list_source_files_against_dump(dump,
+                                   path_regex_whitelist=None,
+                                   node_name_regex_whitelist=None):
+  """Generate a list of source files with information regarding ops and tensors.
+
+  Args:
+    dump: (`DebugDumpDir`) A `DebugDumpDir` object of which the Python graph
+      has been loaded.
+    path_regex_whitelist: A regular-expression filter for source file path.
+    node_name_regex_whitelist: A regular-expression filter for node names.
+
+  Returns:
+    A list of tuples regarding the Python source files involved in constructing
+    the ops and tensors contained in `dump`. Each tuple is:
+      (source_file_path, is_tf_library, num_nodes, num_tensors, num_dumps,
+       first_line)
+
+      is_tf_library: (`bool`) A guess of whether the file belongs to the
+        TensorFlow Python library.
+      num_nodes: How many nodes were created by lines of this source file.
+        These include nodes with dumps and those without.
+      num_tensors: How many Tensors were created by lines of this source file.
+        These include Tensors with dumps and those without.
+      num_dumps: How many debug Tensor dumps were from nodes (and Tensors)
+        that were created by this source file.
+      first_line: The first line number (1-based) that created any nodes or
+        Tensors in this source file.
+
+    The list is sorted by ascending order of source_file_path.
+
+  Raises:
+    ValueError: If the dump object does not have a Python graph set.
+  """
+
+  py_graph = dump.python_graph
+  if not py_graph:
+    raise ValueError("Cannot generate source list due to a lack of set "
+                     "Python graph in the dump object")
+
+  path_to_node_names = collections.defaultdict(set)
+  path_to_tensor_names = collections.defaultdict(set)
+  path_to_first_line = {}
+  tensor_name_to_num_dumps = {}
+
+  path_regex = (re.compile(path_regex_whitelist)
+                if path_regex_whitelist else None)
+  node_name_regex = (re.compile(node_name_regex_whitelist)
+                     if node_name_regex_whitelist else None)
+
+  to_skip_file_paths = set()
+  for op in py_graph.get_operations():
+    if node_name_regex and not node_name_regex.match(op.name):
+      continue
+
+    for file_path, line_number, _, _ in dump.node_traceback(op.name):
+      file_path = _norm_abs_path(file_path)
+      if (file_path in to_skip_file_paths or
+          path_regex and not path_regex.match(file_path) or
+          not os.path.isfile(file_path)):
+        to_skip_file_paths.add(file_path)
+        continue
+
+      path_to_node_names[file_path].add(op.name)
+      if file_path in path_to_first_line:
+        if path_to_first_line[file_path] > line_number:
+          path_to_first_line[file_path] = line_number
+      else:
+        path_to_first_line[file_path] = line_number
+
+      for output_tensor in op.outputs:
+        tensor_name = output_tensor.name
+        path_to_tensor_names[file_path].add(tensor_name)
+
+      watch_keys = dump.debug_watch_keys(op.name)
+      for watch_key in watch_keys:
+        node_name, output_slot, debug_op = watch_key.split(":")
+        tensor_name = "%s:%s" % (node_name, output_slot)
+        if tensor_name not in tensor_name_to_num_dumps:
+          tensor_name_to_num_dumps[tensor_name] = len(
+              dump.get_tensors(node_name, int(output_slot), debug_op))
+
+  path_to_num_dumps = {}
+  for path in path_to_tensor_names:
+    path_to_num_dumps[path] = sum(
+        tensor_name_to_num_dumps.get(tensor_name, 0)
+        for tensor_name in path_to_tensor_names[path])
+
+  output = []
+  for file_path in path_to_node_names:
+    output.append((
+        file_path,
+        guess_is_tensorflow_py_library(file_path),
+        len(path_to_node_names.get(file_path, {})),
+        len(path_to_tensor_names.get(file_path, {})),
+        path_to_num_dumps.get(file_path, 0),
+        path_to_first_line[file_path]))
+
+  return sorted(output, key=lambda x: x[0])
+
+
+def annotate_source_against_profile(profile_data,
+                                    source_file_path,
+                                    node_name_filter=None,
+                                    op_type_filter=None,
+                                    min_line=None,
+                                    max_line=None):
+  """Annotate a Python source file with profiling information at each line.
+
+  (The annotation doesn't change the source file itself.)
+
+  Args:
+    profile_data: (`list` of `ProfileDatum`) A list of `ProfileDatum`.
+    source_file_path: (`str`) Path to the source file being annotated.
+    node_name_filter: Regular expression to filter by node name.
+    op_type_filter: Regular expression to filter by op type.
+    min_line: (`None` or `int`) The 1-based line to start annotate the source
+      file from (inclusive).
+    max_line: (`None` or `int`) The 1-based line number to end the annotation
+      at (exclusive).
+
+  Returns:
+    A `dict` mapping 1-based line number to a the namedtuple
+      `profiling.LineOrFuncProfileSummary`.
+  """
+
+  source_file_path = _norm_abs_path(source_file_path)
+
+  node_name_regex = re.compile(node_name_filter) if node_name_filter else None
+  op_type_regex = re.compile(op_type_filter) if op_type_filter else None
+
+  line_to_profile_summary = {}
+  for profile_datum in profile_data:
+    if not profile_datum.file_path:
+      continue
+
+    if _norm_abs_path(profile_datum.file_path) != source_file_path:
+      continue
+
+    if (min_line is not None and profile_datum.line_number < min_line or
+        max_line is not None and profile_datum.line_number >= max_line):
+      continue
+
+    if (node_name_regex and
+        not node_name_regex.match(profile_datum.node_exec_stats.node_name)):
+      continue
+
+    if op_type_regex and not op_type_regex.match(profile_datum.op_type):
+      continue
+
+    if profile_datum.line_number not in line_to_profile_summary:
+      line_to_profile_summary[profile_datum.line_number] = (
+          profiling.AggregateProfile(profile_datum))
+    else:
+      line_to_profile_summary[profile_datum.line_number].add(profile_datum)
+
+  return line_to_profile_summary
diff --git a/tensorflow/python/debug/lib/source_utils_test.py b/tensorflow/python/debug/lib/source_utils_test.py
index 5d28bff2072b4e3b537d5ca208303ac61eb3c99e..4a8d4eaa99f28db26f05a00e7759c79699ca9ab4 100644
--- a/tensorflow/python/debug/lib/source_utils_test.py
+++ b/tensorflow/python/debug/lib/source_utils_test.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import os
 import shutil
 import tempfile
@@ -33,13 +32,48 @@ from tensorflow.python.debug.lib import source_utils
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
+from tensorflow.python.util import tf_inspect
 
 
 def line_number_above():
-  return inspect.stack()[1][2] - 1
+  return tf_inspect.stack()[1][2] - 1
+
+
+class GuessIsTensorFlowLibraryTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self.curr_file_path = os.path.normpath(os.path.abspath(__file__))
+
+  def tearDown(self):
+    ops.reset_default_graph()
+
+  def testGuessedBaseDirIsProbablyCorrect(self):
+    self.assertEqual("tensorflow",
+                     os.path.basename(source_utils._TENSORFLOW_BASEDIR))
+
+  def testUnitTestFileReturnsFalse(self):
+    self.assertFalse(
+        source_utils.guess_is_tensorflow_py_library(self.curr_file_path))
+
+  def testSourceUtilModuleReturnsTrue(self):
+    self.assertTrue(
+        source_utils.guess_is_tensorflow_py_library(source_utils.__file__))
+
+  def testFileInPythonKernelsPathReturnsTrue(self):
+    x = constant_op.constant(42.0, name="x")
+    self.assertTrue(
+        source_utils.guess_is_tensorflow_py_library(x.op.traceback[-1][0]))
+
+  def testNonPythonFileRaisesException(self):
+    with self.assertRaisesRegexp(ValueError, r"is not a Python source file"):
+      source_utils.guess_is_tensorflow_py_library(
+          os.path.join(os.path.dirname(self.curr_file_path), "foo.cc"))
 
 
 class SourceHelperTest(test_util.TensorFlowTestCase):
@@ -53,7 +87,7 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
 
     self.dump_root = self.get_temp_dir()
     self.curr_file_path = os.path.abspath(
-        inspect.getfile(inspect.currentframe()))
+        tf_inspect.getfile(tf_inspect.currentframe()))
 
     # Run a simple TF graph to generate some debug dumps that can be used in
     # source annotation.
@@ -103,27 +137,21 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
 
     self.assertIn(self.u_init.op.name,
                   source_annotation[self.u_init_line_number])
-    self.assertIn(self.u.op.name,
-                  source_annotation[self.u_line_number])
+    self.assertIn(self.u.op.name, source_annotation[self.u_line_number])
     self.assertIn(self.v_init.op.name,
                   source_annotation[self.v_init_line_number])
-    self.assertIn(self.v.op.name,
-                  source_annotation[self.v_line_number])
-    self.assertIn(self.w.op.name,
-                  source_annotation[self.w_line_number])
+    self.assertIn(self.v.op.name, source_annotation[self.v_line_number])
+    self.assertIn(self.w.op.name, source_annotation[self.w_line_number])
 
     # In the non-stack-top (default) mode, the helper line should be annotated
     # with all the ops as well.
     self.assertIn(self.u_init.op.name,
                   source_annotation[self.helper_line_number])
-    self.assertIn(self.u.op.name,
-                  source_annotation[self.helper_line_number])
+    self.assertIn(self.u.op.name, source_annotation[self.helper_line_number])
     self.assertIn(self.v_init.op.name,
                   source_annotation[self.helper_line_number])
-    self.assertIn(self.v.op.name,
-                  source_annotation[self.helper_line_number])
-    self.assertIn(self.w.op.name,
-                  source_annotation[self.helper_line_number])
+    self.assertIn(self.v.op.name, source_annotation[self.helper_line_number])
+    self.assertIn(self.w.op.name, source_annotation[self.helper_line_number])
 
   def testAnnotateWithStackTopGivesCorrectResult(self):
     source_annotation = source_utils.annotate_source(
@@ -131,14 +159,11 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
 
     self.assertIn(self.u_init.op.name,
                   source_annotation[self.u_init_line_number])
-    self.assertIn(self.u.op.name,
-                  source_annotation[self.u_line_number])
+    self.assertIn(self.u.op.name, source_annotation[self.u_line_number])
     self.assertIn(self.v_init.op.name,
                   source_annotation[self.v_init_line_number])
-    self.assertIn(self.v.op.name,
-                  source_annotation[self.v_line_number])
-    self.assertIn(self.w.op.name,
-                  source_annotation[self.w_line_number])
+    self.assertIn(self.v.op.name, source_annotation[self.v_line_number])
+    self.assertIn(self.w.op.name, source_annotation[self.w_line_number])
 
     # In the stack-top mode, the helper line should not have been annotated.
     self.assertNotIn(self.helper_line_number, source_annotation)
@@ -150,8 +175,7 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
         min_line=self.u_line_number,
         max_line=self.u_line_number + 1)
 
-    self.assertIn(self.u.op.name,
-                  source_annotation[self.u_line_number])
+    self.assertIn(self.u.op.name, source_annotation[self.u_line_number])
     self.assertNotIn(self.v_line_number, source_annotation)
 
   def testAnnotateDumpedTensorsGivesCorrectResult(self):
@@ -160,26 +184,17 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
 
     # Note: Constant Tensors u_init and v_init may not get dumped due to
     #   constant-folding.
-    self.assertIn(self.u.name,
-                  source_annotation[self.u_line_number])
-    self.assertIn(self.v.name,
-                  source_annotation[self.v_line_number])
-    self.assertIn(self.w.name,
-                  source_annotation[self.w_line_number])
-
-    self.assertNotIn(self.u.op.name,
-                     source_annotation[self.u_line_number])
-    self.assertNotIn(self.v.op.name,
-                     source_annotation[self.v_line_number])
-    self.assertNotIn(self.w.op.name,
-                     source_annotation[self.w_line_number])
-
-    self.assertIn(self.u.name,
-                  source_annotation[self.helper_line_number])
-    self.assertIn(self.v.name,
-                  source_annotation[self.helper_line_number])
-    self.assertIn(self.w.name,
-                  source_annotation[self.helper_line_number])
+    self.assertIn(self.u.name, source_annotation[self.u_line_number])
+    self.assertIn(self.v.name, source_annotation[self.v_line_number])
+    self.assertIn(self.w.name, source_annotation[self.w_line_number])
+
+    self.assertNotIn(self.u.op.name, source_annotation[self.u_line_number])
+    self.assertNotIn(self.v.op.name, source_annotation[self.v_line_number])
+    self.assertNotIn(self.w.op.name, source_annotation[self.w_line_number])
+
+    self.assertIn(self.u.name, source_annotation[self.helper_line_number])
+    self.assertIn(self.v.name, source_annotation[self.helper_line_number])
+    self.assertIn(self.w.name, source_annotation[self.helper_line_number])
 
   def testCallingAnnotateSourceWithoutPythonGraphRaisesException(self):
     self.dump.set_python_graph(None)
@@ -192,12 +207,139 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
     with open(unrelated_source_path, "wt") as source_file:
       source_file.write("print('hello, world')\n")
 
-    self.assertEqual(
-        {}, source_utils.annotate_source(self.dump, unrelated_source_path))
+    self.assertEqual({},
+                     source_utils.annotate_source(self.dump,
+                                                  unrelated_source_path))
 
     # Clean up unrelated source file.
     os.remove(unrelated_source_path)
 
 
+class ListSourceAgainstDumpTest(test_util.TensorFlowTestCase):
+
+  def createAndRunGraphWithWhileLoop(self):
+    """Create and run a TensorFlow Graph with a while loop to generate dumps."""
+
+    self.dump_root = self.get_temp_dir()
+    self.curr_file_path = os.path.abspath(
+        tf_inspect.getfile(tf_inspect.currentframe()))
+
+    # Run a simple TF graph to generate some debug dumps that can be used in
+    # source annotation.
+    with session.Session() as sess:
+      loop_body = lambda i: math_ops.add(i, 2)
+      self.traceback_first_line = line_number_above()
+
+      loop_cond = lambda i: math_ops.less(i, 16)
+
+      i = constant_op.constant(10, name="i")
+      loop = control_flow_ops.while_loop(loop_cond, loop_body, [i])
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(
+          run_options, sess.graph, debug_urls=["file://%s" % self.dump_root])
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(loop, options=run_options, run_metadata=run_metadata)
+
+      self.dump = debug_data.DebugDumpDir(
+          self.dump_root, partition_graphs=run_metadata.partition_graphs)
+      self.dump.set_python_graph(sess.graph)
+
+  def setUp(self):
+    self.createAndRunGraphWithWhileLoop()
+
+  def tearDown(self):
+    if os.path.isdir(self.dump_root):
+      shutil.rmtree(self.dump_root)
+    ops.reset_default_graph()
+
+  def testGenerateSourceList(self):
+    source_list = source_utils.list_source_files_against_dump(self.dump)
+
+    # Assert that the file paths are sorted and unique.
+    file_paths = [item[0] for item in source_list]
+    self.assertEqual(sorted(file_paths), file_paths)
+    self.assertEqual(len(set(file_paths)), len(file_paths))
+
+    # Assert that each item of source_list has length 6.
+    for item in source_list:
+      self.assertTrue(isinstance(item, tuple))
+      self.assertEqual(6, len(item))
+
+    # The while loop body should have executed 3 times. The following table
+    # lists the tensors and how many times each of them is dumped.
+    #   Tensor name            # of times dumped:
+    #   i:0                    1
+    #   while/Enter:0          1
+    #   while/Merge:0          4
+    #   while/Merge:1          4
+    #   while/Less/y:0         4
+    #   while/Less:0           4
+    #   while/LoopCond:0       4
+    #   while/Switch:0         1
+    #   while/Swtich:1         3
+    #   while/Identity:0       3
+    #   while/Add/y:0          3
+    #   while/Add:0            3
+    #   while/NextIteration:0  3
+    #   while/Exit:0           1
+    # ----------------------------
+    #   (Total)                39
+    #
+    # The total number of nodes is 12.
+    # The total number of tensors is 14 (2 of the nodes have 2 outputs:
+    #   while/Merge, while/Switch).
+
+    _, is_tf_py_library, num_nodes, num_tensors, num_dumps, first_line = (
+        source_list[file_paths.index(self.curr_file_path)])
+    self.assertFalse(is_tf_py_library)
+    self.assertEqual(12, num_nodes)
+    self.assertEqual(14, num_tensors)
+    self.assertEqual(39, num_dumps)
+    self.assertEqual(self.traceback_first_line, first_line)
+
+  def testGenerateSourceListWithNodeNameFilter(self):
+    source_list = source_utils.list_source_files_against_dump(
+        self.dump, node_name_regex_whitelist=r"while/Add.*")
+
+    # Assert that the file paths are sorted.
+    file_paths = [item[0] for item in source_list]
+    self.assertEqual(sorted(file_paths), file_paths)
+    self.assertEqual(len(set(file_paths)), len(file_paths))
+
+    # Assert that each item of source_list has length 4.
+    for item in source_list:
+      self.assertTrue(isinstance(item, tuple))
+      self.assertEqual(6, len(item))
+
+    # Due to the node-name filtering the result should only contain 2 nodes
+    # and 2 tensors. The total number of dumped tensors should be 6:
+    #   while/Add/y:0          3
+    #   while/Add:0            3
+    _, is_tf_py_library, num_nodes, num_tensors, num_dumps, _ = (
+        source_list[file_paths.index(self.curr_file_path)])
+    self.assertFalse(is_tf_py_library)
+    self.assertEqual(2, num_nodes)
+    self.assertEqual(2, num_tensors)
+    self.assertEqual(6, num_dumps)
+
+  def testGenerateSourceListWithPathRegexFilter(self):
+    curr_file_basename = os.path.basename(self.curr_file_path)
+    source_list = source_utils.list_source_files_against_dump(
+        self.dump,
+        path_regex_whitelist=(
+            ".*" + curr_file_basename.replace(".", "\\.") + "$"))
+
+    self.assertEqual(1, len(source_list))
+    (file_path, is_tf_py_library, num_nodes, num_tensors, num_dumps,
+     first_line) = source_list[0]
+    self.assertEqual(self.curr_file_path, file_path)
+    self.assertFalse(is_tf_py_library)
+    self.assertEqual(12, num_nodes)
+    self.assertEqual(14, num_tensors)
+    self.assertEqual(39, num_dumps)
+    self.assertEqual(self.traceback_first_line, first_line)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 825c5593120e431a796959ee80934d5a3fab6143..78e7b3b5ebaf9f33a808b754775e420750706c15 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -591,7 +591,7 @@ class StepperAssignAddTest(test_util.TensorFlowTestCase):
     with NodeStepper(self.sess, [self.q, self.v_add]) as stepper:
       self.assertIsNone(stepper.last_updated())
 
-  def testContToUpdateInvalidatesDumpedIntermedates(self):
+  def testContToUpdateInvalidatesDumpedIntermediates(self):
     with NodeStepper(self.sess, [self.q, self.v_add]) as stepper:
       self.assertAllClose(400.0, stepper.cont("q:0"))
       self.assertItemsEqual(["v/read:0", "p:0"],
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper.py b/tensorflow/python/debug/wrappers/dumping_wrapper.py
index 15f30e6f972eab94957fd6d26d36e356c3de4841..63229a85398ef92469bb35e3ee4010c1de7e0ee2 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper.py
@@ -31,7 +31,12 @@ from tensorflow.python.platform import gfile
 class DumpingDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
   """Debug Session wrapper that dumps debug data to filesystem."""
 
-  def __init__(self, sess, session_root, watch_fn=None, log_usage=True):
+  def __init__(self,
+               sess,
+               session_root,
+               watch_fn=None,
+               thread_name_filter=None,
+               log_usage=True):
     """Constructor of DumpingDebugWrapperSession.
 
     Args:
@@ -48,6 +53,9 @@ class DumpingDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
       watch_fn: (`Callable`) A Callable that can be used to define per-run
         debug ops and watched tensors. See the doc of
         `NonInteractiveDebugWrapperSession.__init__()` for details.
+      thread_name_filter: Regular-expression white list for threads on which the
+        wrapper session will be active. See doc of `BaseDebugWrapperSession` for
+        more details.
       log_usage: (`bool`) whether the usage of this class is to be logged.
 
     Raises:
@@ -59,7 +67,7 @@ class DumpingDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
       pass  # No logging for open-source.
 
     framework.NonInteractiveDebugWrapperSession.__init__(
-        self, sess, watch_fn=watch_fn)
+        self, sess, watch_fn=watch_fn, thread_name_filter=thread_name_filter)
 
     if gfile.Exists(session_root):
       if not gfile.IsDirectory(session_root):
@@ -78,7 +86,7 @@ class DumpingDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
     """Implementation of abstrat method in superclass.
 
     See doc of `NonInteractiveDebugWrapperSession.prepare_run_debug_urls()`
-    for details. This implentation creates a run-specific subdirectory under
+    for details. This implementation creates a run-specific subdirectory under
     self._session_root and stores information regarding run `fetches` and
     `feed_dict.keys()` in the subdirectory.
 
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index 54bffd689a465292a04d1f389b49141fa93c0627..c9ddd4798400b729a50ffb359346fc52fef028c4 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -21,6 +21,7 @@ import glob
 import os
 import shutil
 import tempfile
+import threading
 
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
@@ -335,6 +336,25 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(repr(self.inc_v), dump.run_fetches_info)
       self.assertEqual(repr(None), dump.run_feed_keys_info)
 
+  def testDumpingFromMultipleThreadsObeysThreadNameFilter(self):
+    sess = dumping_wrapper.DumpingDebugWrapperSession(
+        self.sess, session_root=self.session_root, log_usage=False,
+        thread_name_filter=r"MainThread$")
+
+    self.assertAllClose(1.0, sess.run(self.delta))
+    def child_thread_job():
+      sess.run(sess.run(self.eta))
+
+    thread = threading.Thread(name="ChildThread", target=child_thread_job)
+    thread.start()
+    thread.join()
+
+    dump_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
+    self.assertEqual(1, len(dump_dirs))
+    dump = debug_data.DebugDumpDir(dump_dirs[0])
+    self.assertEqual(1, dump.size)
+    self.assertEqual("delta", dump.dumped_tensor_data[0].node_name)
+
   def testCallingInvokeNodeStepperOnDumpingWrapperRaisesException(self):
     sess = dumping_wrapper.DumpingDebugWrapperSession(
         self.sess, session_root=self.session_root, log_usage=False)
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index b487671e90af814e28a0d89da5f040ece7ee5f82..2c239038e44e617ed31b266bb6b6dcef6e3d598d 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -112,6 +112,8 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import re
+import threading
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -217,6 +219,9 @@ class OnRunStartAction(object):
   # Run once with debug tensor-watching.
   DEBUG_RUN = "debug_run"
 
+  # Run once with profiler.
+  PROFILE_RUN = "profile_run"
+
   # Run without debug tensor-watching.
   NON_DEBUG_RUN = "non_debug_run"
 
@@ -327,11 +332,17 @@ class BaseDebugWrapperSession(session.SessionInterface):
   # TODO(cais): Add on_cont_start and on_cont_end callbacks once the stepper is
   # is available.
 
-  def __init__(self, sess):
+  def __init__(self, sess, thread_name_filter=None):
     """Constructor of `BaseDebugWrapperSession`.
 
     Args:
       sess: An (unwrapped) TensorFlow session instance.
+      thread_name_filter: Regular-expression filter (whitelist) for name(s) of
+        thread(s) on which the wrapper session will be active. This regular
+        expression is used in a start-anchored fashion on the thread name, i.e.,
+        by applying the `match` method of the compiled pattern. The default
+        `None` means that the wrapper session will be active on all threads.
+        E.g., r"MainThread$", r"QueueRunnerThread.*".
 
     Raises:
       ValueError: On invalid `OnSessionInitAction` value.
@@ -340,14 +351,10 @@ class BaseDebugWrapperSession(session.SessionInterface):
 
     _check_type(sess, session.BaseSession)
 
-    # TODO(cais): Remove this check once tfdbg is integrated with GrpcSession.
-    if sess.sess_str:
-      raise NotImplementedError(
-          "Non-DirectSession support is not available from TensorFlow "
-          "Debugger yet (sess_str=%s)" % sess.sess_str)
-
     # The session being wrapped.
     self._sess = sess
+    self._thread_name_filter_pattern = (re.compile(thread_name_filter)
+                                        if thread_name_filter else None)
 
     # Keeps track of number of run calls that have been performed on this
     # debug-wrapper session.
@@ -404,6 +411,11 @@ class BaseDebugWrapperSession(session.SessionInterface):
     """
 
     self._run_call_count += 1
+    if self._is_disabled_thread():
+      return self._sess.run(fetches,
+                            feed_dict=feed_dict,
+                            options=options,
+                            run_metadata=run_metadata)
 
     # Invoke on-run-start callback and obtain response.
     run_start_resp = self.on_run_start(
@@ -416,7 +428,7 @@ class BaseDebugWrapperSession(session.SessionInterface):
       decorated_run_options = options or config_pb2.RunOptions()
       run_metadata = run_metadata or config_pb2.RunMetadata()
 
-      self._decorate_run_options(
+      self._decorate_run_options_for_debug(
           decorated_run_options,
           run_start_resp.debug_urls,
           debug_ops=run_start_resp.debug_ops,
@@ -445,6 +457,19 @@ class BaseDebugWrapperSession(session.SessionInterface):
           client_graph_def=self._sess.graph.as_graph_def(),
           tf_error=tf_error)
 
+    elif run_start_resp.action == OnRunStartAction.PROFILE_RUN:
+      decorated_run_options = options or config_pb2.RunOptions()
+      run_metadata = run_metadata or config_pb2.RunMetadata()
+      self._decorate_run_options_for_profile(decorated_run_options)
+      retvals = self._sess.run(fetches,
+                               feed_dict=feed_dict,
+                               options=decorated_run_options,
+                               run_metadata=run_metadata)
+      run_end_req = OnRunEndRequest(
+          run_start_resp.action,
+          run_metadata=run_metadata,
+          client_graph_def=self._sess.graph.as_graph_def())
+
     elif (run_start_resp.action == OnRunStartAction.NON_DEBUG_RUN or
           run_start_resp.action == OnRunStartAction.INVOKE_STEPPER):
       if run_start_resp.action == OnRunStartAction.INVOKE_STEPPER:
@@ -473,6 +498,11 @@ class BaseDebugWrapperSession(session.SessionInterface):
 
     return retvals
 
+  def _is_disabled_thread(self):
+    thread_name = threading.current_thread().name or ""
+    return (self._thread_name_filter_pattern and
+            not self._thread_name_filter_pattern.match(thread_name))
+
   def partial_run_setup(self, fetches, feeds=None):
     """Sets up the feeds and fetches for partial runs in the session."""
     raise NotImplementedError(
@@ -482,14 +512,15 @@ class BaseDebugWrapperSession(session.SessionInterface):
     raise NotImplementedError(
         "partial_run is not implemented for debug-wrapper sessions.")
 
-  def _decorate_run_options(self,
-                            run_options,
-                            debug_urls,
-                            debug_ops="DebugIdentity",
-                            node_name_regex_whitelist=None,
-                            op_type_regex_whitelist=None,
-                            tensor_dtype_regex_whitelist=None,
-                            tolerate_debug_op_creation_failures=False):
+  def _decorate_run_options_for_debug(
+      self,
+      run_options,
+      debug_urls,
+      debug_ops="DebugIdentity",
+      node_name_regex_whitelist=None,
+      op_type_regex_whitelist=None,
+      tensor_dtype_regex_whitelist=None,
+      tolerate_debug_op_creation_failures=False):
     """Modify a RunOptions object for debug tensor watching.
 
     Specifies request for outputting partition graphs. Adds
@@ -520,6 +551,15 @@ class BaseDebugWrapperSession(session.SessionInterface):
         tensor_dtype_regex_whitelist=tensor_dtype_regex_whitelist,
         tolerate_debug_op_creation_failures=tolerate_debug_op_creation_failures)
 
+  def _decorate_run_options_for_profile(self, run_options):
+    """Modify a RunOptions object for profiling TensorFlow graph execution.
+
+    Args:
+      run_options: (RunOptions) the modified RunOptions object.
+    """
+
+    run_options.trace_level = config_pb2.RunOptions.FULL_TRACE
+
   @abc.abstractmethod
   def on_session_init(self, request):
     """Callback invoked during construction of the debug-wrapper session.
@@ -626,7 +666,7 @@ class WatchOptions(object):
         are set, the two filtering operations will occur in a logical `AND`
         relation. In other words, a node will be included if and only if it
         hits both whitelists.
-      tensor_dtype_regex_whitelist: Regular-experssion whitelist for Tensor
+      tensor_dtype_regex_whitelist: Regular-expression whitelist for Tensor
         data type, e.g., `"^int.*"`.
         This whitelist operates in logical `AND` relations to the two whitelists
         above.
@@ -656,7 +696,7 @@ class WatchOptions(object):
 class NonInteractiveDebugWrapperSession(BaseDebugWrapperSession):
   """Base class for non-interactive (i.e., non-CLI) debug wrapper sessions."""
 
-  def __init__(self, sess, watch_fn=None):
+  def __init__(self, sess, watch_fn=None, thread_name_filter=None):
     """Constructor of DumpingDebugWrapperSession.
 
     Args:
@@ -672,11 +712,15 @@ class NonInteractiveDebugWrapperSession(BaseDebugWrapperSession):
            the debug ops to use, the node names, op types and/or tensor data
            types to watch, etc. See the documentation of `tf_debug.WatchOptions`
            for more details.
+      thread_name_filter: Regular-expression white list for threads on which the
+        wrapper session will be active. See doc of `BaseDebugWrapperSession` for
+        more details.
     Raises:
        TypeError: If a non-None `watch_fn` is specified and it is not callable.
     """
 
-    BaseDebugWrapperSession.__init__(self, sess)
+    BaseDebugWrapperSession.__init__(
+        self, sess, thread_name_filter=thread_name_filter)
 
     self._watch_fn = None
     if watch_fn is not None:
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 0b2f73d412fa71ab64a8c0e4cd5fe17c8c587fcb..536365b6925cb29233de6c992fd476d7adbe7e86 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import shutil
 import tempfile
+import threading
 
 import numpy as np
 
@@ -32,6 +34,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
@@ -39,7 +43,7 @@ from tensorflow.python.platform import googletest
 class TestDebugWrapperSession(framework.BaseDebugWrapperSession):
   """A concrete implementation of BaseDebugWrapperSession for test."""
 
-  def __init__(self, sess, dump_root, observer):
+  def __init__(self, sess, dump_root, observer, thread_name_filter=None):
     # Supply dump root.
     self._dump_root = dump_root
 
@@ -47,7 +51,8 @@ class TestDebugWrapperSession(framework.BaseDebugWrapperSession):
     self._obs = observer
 
     # Invoke superclass constructor.
-    framework.BaseDebugWrapperSession.__init__(self, sess)
+    framework.BaseDebugWrapperSession.__init__(
+        self, sess, thread_name_filter=thread_name_filter)
 
   def on_session_init(self, request):
     """Override abstract on-session-init callback method."""
@@ -155,7 +160,7 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self._c_val = np.array([[-4.0], [6.0]])
 
     self._a_init = constant_op.constant(
-        self._a_init_val, shape=[2, 2], name="a1_init")
+        self._a_init_val, shape=[2, 2], name="a_init")
     self._b_init = constant_op.constant(
         self._b_init_val, shape=[2, 1], name="b_init")
 
@@ -180,7 +185,8 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def tearDown(self):
     # Tear down temporary dump directory.
-    shutil.rmtree(self._dump_root)
+    if os.path.isdir(self._dump_root):
+      shutil.rmtree(self._dump_root)
 
     ops.reset_default_graph()
 
@@ -321,17 +327,64 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
                                       self._observer)
     wrapper.close()
 
-  def testUsingNonDirectSessionRaisesNotImplementedError(self):
-    # TODO(cais): Remove this test once tfdbg is integrated with GrpcSession.
-    fake_non_direct_session = session.Session()
-    fake_non_direct_session._target = "foo"
+  def testWrapperThreadNameFilterMainThread(self):
+    wrapper = TestDebugWrapperSession(
+        self._sess, self._dump_root, self._observer,
+        thread_name_filter="MainThread")
+
+    child_run_output = []
+    def child_thread_job():
+      child_run_output.append(wrapper.run(self._b_init))
 
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        r"Non-DirectSession support is not available from TensorFlow Debugger "
-        r"yet \(sess_str=foo\)"):
-      TestDebugWrapperSession(
-          fake_non_direct_session, self._dump_root, self._observer)
+    thread = threading.Thread(name="ChildThread", target=child_thread_job)
+    thread.start()
+    self.assertAllClose(self._a_init_val, wrapper.run(self._a_init))
+    thread.join()
+    self.assertAllClose([self._b_init_val], child_run_output)
+
+    dump = debug_data.DebugDumpDir(self._dump_root)
+    self.assertEqual(1, dump.size)
+    self.assertEqual("a_init", dump.dumped_tensor_data[0].node_name)
+
+  def testWrapperThreadNameFilterChildThread(self):
+    wrapper = TestDebugWrapperSession(
+        self._sess, self._dump_root, self._observer,
+        thread_name_filter=r"Child.*")
+
+    child_run_output = []
+    def child_thread_job():
+      child_run_output.append(wrapper.run(self._b_init))
+
+    thread = threading.Thread(name="ChildThread", target=child_thread_job)
+    thread.start()
+    self.assertAllClose(self._a_init_val, wrapper.run(self._a_init))
+    thread.join()
+    self.assertAllClose([self._b_init_val], child_run_output)
+
+    dump = debug_data.DebugDumpDir(self._dump_root)
+    self.assertEqual(1, dump.size)
+    self.assertEqual("b_init", dump.dumped_tensor_data[0].node_name)
+
+  def testWrapperThreadNameFilterBothThreads(self):
+    wrapper = TestDebugWrapperSession(
+        self._sess, self._dump_root, self._observer,
+        thread_name_filter=None)
+
+    child_run_output = []
+    def child_thread_job():
+      child_run_output.append(wrapper.run(self._b_init))
+
+    thread = threading.Thread(name="ChildThread", target=child_thread_job)
+    thread.start()
+    self.assertAllClose(self._a_init_val, wrapper.run(self._a_init))
+    thread.join()
+    self.assertAllClose([self._b_init_val], child_run_output)
+
+    dump = debug_data.DebugDumpDir(self._dump_root, validate=False)
+    self.assertEqual(2, dump.size)
+    self.assertItemsEqual(
+        ["a_init", "b_init"],
+        [datum.node_name for datum in dump.dumped_tensor_data])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index ec328e7dd60157a3445410e7690c9125a31ab4bd..4062016607c8a56eb275fe4712a47c84bc7ed01c 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -30,6 +30,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
                sess,
                grpc_debug_server_addresses,
                watch_fn=None,
+               thread_name_filter=None,
                log_usage=True):
     """Constructor of DumpingDebugWrapperSession.
 
@@ -43,6 +44,9 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
       watch_fn: (`Callable`) A Callable that can be used to define per-run
         debug ops and watched tensors. See the doc of
         `NonInteractiveDebugWrapperSession.__init__()` for details.
+      thread_name_filter: Regular-expression white list for threads on which the
+        wrapper session will be active. See doc of `BaseDebugWrapperSession` for
+        more details.
       log_usage: (`bool`) whether the usage of this class is to be logged.
 
     Raises:
@@ -54,7 +58,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
       pass  # No logging for open-source.
 
     framework.NonInteractiveDebugWrapperSession.__init__(
-        self, sess, watch_fn=watch_fn)
+        self, sess, watch_fn=watch_fn, thread_name_filter=thread_name_filter)
 
     if isinstance(grpc_debug_server_addresses, str):
       self._grpc_debug_server_urls = [
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index c86cc413c118594ef4900cbdc06ee0343aa68f40..f6194f5fad78bf809b65f827a60c45bfebb08ce4 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -39,7 +39,10 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
   `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
   """
 
-  def __init__(self, ui_type="curses", dump_root=None):
+  def __init__(self,
+               ui_type="curses",
+               dump_root=None,
+               thread_name_filter=None):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
@@ -48,10 +51,14 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
         directory that does not exist or an empty directory. If the directory
         does not exist, it will be created by the debugger core during debug
         `run()` calls and removed afterwards.
+      thread_name_filter: Regular-expression white list for threads on which the
+        wrapper session will be active. See doc of `BaseDebugWrapperSession` for
+        more details.
     """
 
     self._ui_type = ui_type
     self._dump_root = dump_root
+    self._thread_name_filter = thread_name_filter
     self._wrapper_initialized = False
     self._pending_tensor_filters = {}
 
@@ -59,7 +66,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
     """Add a tensor filter.
 
     See doc of `LocalCLIDebugWrapperSession.add_tensor_filter()` for details.
-    Override default behavior to accomodate the possibility of this method being
+    Override default behavior to accommodate the possibility of this method being
     called prior to the initialization of the underlying
     `LocalCLIDebugWrapperSession` object.
 
@@ -85,7 +92,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
           self,
           run_context.session,
           ui_type=self._ui_type,
-          dump_root=self._dump_root)
+          dump_root=self._dump_root,
+          thread_name_filter=self._thread_name_filter)
 
       # Actually register tensor filters registered prior to the construction
       # of the underlying LocalCLIDebugWrapperSession object.
@@ -122,6 +130,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook,
                   on_run_start_response.tensor_dtype_regex_whitelist),
               tolerate_debug_op_creation_failures=(
                   on_run_start_response.tolerate_debug_op_creation_failures)))
+    elif self._performed_action == framework.OnRunStartAction.PROFILE_RUN:
+      self._decorate_run_options_for_profile(run_args.options)
     elif self._performed_action == framework.OnRunStartAction.INVOKE_STEPPER:
       # The _finalized property must be set to False so that the NodeStepper
       # can insert ops for retrieving TensorHandles.
@@ -168,7 +178,11 @@ class DumpingDebugHook(session_run_hook.SessionRunHook,
   `tf.contrib.learn`'s `Estimator`s and `Experiment`s.
   """
 
-  def __init__(self, session_root, watch_fn=None, log_usage=True):
+  def __init__(self,
+               session_root,
+               watch_fn=None,
+               thread_name_filter=None,
+               log_usage=True):
     """Create a local debugger command-line interface (CLI) hook.
 
     Args:
@@ -176,11 +190,15 @@ class DumpingDebugHook(session_run_hook.SessionRunHook,
         `dumping_wrapper.DumpingDebugWrapperSession.__init__`.
       watch_fn: See doc of
         `dumping_wrapper.DumpingDebugWrapperSession.__init__`.
+      thread_name_filter: Regular-expression white list for threads on which the
+        wrapper session will be active. See doc of `BaseDebugWrapperSession` for
+        more details.
       log_usage: (bool) Whether usage is to be logged.
     """
 
     self._session_root = session_root
     self._watch_fn = watch_fn
+    self._thread_name_filter = thread_name_filter
     self._log_usage = log_usage
     self._wrapper_initialized = False
 
@@ -196,6 +214,7 @@ class DumpingDebugHook(session_run_hook.SessionRunHook,
           run_context.session,
           self._session_root,
           watch_fn=self._watch_fn,
+          thread_name_filter=self._thread_name_filter,
           log_usage=self._log_usage)
       self._wrapper_initialized = True
 
@@ -240,6 +259,7 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
   def __init__(self,
                grpc_debug_server_addresses,
                watch_fn=None,
+               thread_name_filter=None,
                log_usage=True):
     """Constructs a GrpcDebugHook.
 
@@ -250,6 +270,9 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
       watch_fn: A function that allows for customizing which ops to watch at
         which specific steps. See doc of
         `dumping_wrapper.DumpingDebugWrapperSession.__init__` for details.
+      thread_name_filter: Regular-expression white list for threads on which the
+        wrapper session will be active. See doc of `BaseDebugWrapperSession` for
+        more details.
       log_usage: (bool) Whether usage is to be logged.
 
     Raises:
@@ -265,6 +288,7 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
 
     # A wrapper session responsible for GRPC communication.
     self._grpc_debug_wrapper_session = None
+    self._thread_name_filter = thread_name_filter
 
     self._grpc_debug_server_addresses = grpc_debug_server_addresses
     self._watch_fn = watch_fn
@@ -286,6 +310,7 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
           run_context.session,
           self._grpc_debug_server_addresses,
           watch_fn=self._watch_fn,
+          thread_name_filter=self._thread_name_filter,
           log_usage=self._log_usage)
 
     fetches = run_context.original_args.fetches
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
index b29259c901d6b62bd0385ecbf62b05d5c9c0dc56..fe822df6ce3ca21a26825abc8385f8c120f55b0c 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py
@@ -27,6 +27,7 @@ import tempfile
 from tensorflow.python.debug.cli import analyzer_cli
 from tensorflow.python.debug.cli import cli_shared
 from tensorflow.python.debug.cli import debugger_cli_common
+from tensorflow.python.debug.cli import profile_analyzer_cli
 from tensorflow.python.debug.cli import stepper_cli
 from tensorflow.python.debug.cli import ui_factory
 from tensorflow.python.debug.lib import debug_data
@@ -48,7 +49,8 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
                sess,
                dump_root=None,
                log_usage=True,
-               ui_type="curses"):
+               ui_type="curses",
+               thread_name_filter=None):
     """Constructor of LocalCLIDebugWrapperSession.
 
     Args:
@@ -61,6 +63,8 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
       log_usage: (`bool`) whether the usage of this class is to be logged.
       ui_type: (`str`) requested UI type. Currently supported:
         (curses | readline)
+      thread_name_filter: Regular-expression white list for thread name. See
+        the doc of `BaseDebugWrapperSession` for details.
 
     Raises:
       ValueError: If dump_root is an existing and non-empty directory or if
@@ -70,7 +74,8 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     if log_usage:
       pass  # No logging for open-source.
 
-    framework.BaseDebugWrapperSession.__init__(self, sess)
+    framework.BaseDebugWrapperSession.__init__(
+        self, sess, thread_name_filter=thread_name_filter)
 
     if not dump_root:
       self._dump_root = tempfile.mktemp(prefix=_DUMP_ROOT_PREFIX)
@@ -158,6 +163,12 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
         default="",
         help="Regular-expression filter for tensor dtype to be watched in the "
         "run, e.g., (float32|float64), int.*")
+    ap.add_argument(
+        "-p",
+        "--profile",
+        dest="profile",
+        action="store_true",
+        help="Run and profile TensorFlow graph execution.")
     self._argparsers["run"] = ap
 
     ap = argparse.ArgumentParser(
@@ -314,12 +325,16 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
           passed_filter = self._active_tensor_filter
           self._active_tensor_filter = None
 
-      self._prep_cli_for_run_end(debug_dump, request.tf_error, passed_filter)
+      self._prep_debug_cli_for_run_end(
+          debug_dump, request.tf_error, passed_filter)
 
       self._run_start_response = self._launch_cli()
 
       # Clean up the dump generated by this run.
       self._remove_dump_root()
+    elif request.performed_action == framework.OnRunStartAction.PROFILE_RUN:
+      self._prep_profile_cli_for_run_end(self._sess.graph, request.run_metadata)
+      self._run_start_response = self._launch_cli()
     else:
       # No debug information to show following a non-debug run() call.
       self._run_start_response = None
@@ -332,7 +347,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     if os.path.isdir(self._dump_root):
       shutil.rmtree(self._dump_root)
 
-  def _prep_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
+  def _prep_debug_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
     """Prepare (but not launch) CLI for run-end, with debug dump from the run.
 
     Args:
@@ -387,6 +402,12 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
     if help_intro:
       self._run_cli.set_help_intro(help_intro)
 
+  def _prep_profile_cli_for_run_end(self, py_graph, run_metadata):
+    self._init_command = "lp"
+    self._run_cli = profile_analyzer_cli.create_profiler_ui(
+        py_graph, run_metadata, ui_type=self._ui_type)
+    self._title = "run-end (profiler mode): " + self._run_description
+
   def _launch_cli(self):
     """Launch the interactive command-line interface.
 
@@ -421,13 +442,18 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession):
   def _run_handler(self, args, screen_info=None):
     """Command handler for "run" command during on-run-start."""
 
-    _ = screen_info  # Currently unused.
+    del screen_info  # Currently unused.
 
     parsed = self._argparsers["run"].parse_args(args)
     parsed.node_name_filter = parsed.node_name_filter or None
     parsed.op_type_filter = parsed.op_type_filter or None
     parsed.tensor_dtype_filter = parsed.tensor_dtype_filter or None
 
+    if parsed.profile:
+      raise debugger_cli_common.CommandLineExit(
+          exit_token=framework.OnRunStartResponse(
+              framework.OnRunStartAction.PROFILE_RUN, []))
+
     if parsed.till_filter_pass:
       # For the run-till-bad-numerical-value-appears mode, use the DEBUG_RUN
       # option to access the intermediate tensors, and set the corresponding
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index e22f6e783e83bcb81a94c13e510346c3bfe3ec7f..f8e32eca25e7f97060527a8c408e16b1e556bbbb 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -33,6 +33,8 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
@@ -71,15 +73,21 @@ class LocalCLIDebuggerWrapperSessionForTest(
         "tf_errors": [],
         "run_start_cli_run_numbers": [],
         "run_end_cli_run_numbers": [],
+        "profiler_py_graphs": [],
+        "profiler_run_metadata": [],
     }
 
   def _prep_cli_for_run_start(self):
     pass
 
-  def _prep_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
+  def _prep_debug_cli_for_run_end(self, debug_dump, tf_error, passed_filter):
     self.observers["debug_dumps"].append(debug_dump)
     self.observers["tf_errors"].append(tf_error)
 
+  def _prep_profile_cli_for_run_end(self, py_graph, run_metadata):
+    self.observers["profiler_py_graphs"].append(py_graph)
+    self.observers["profiler_run_metadata"].append(run_metadata)
+
   def _launch_cli(self):
     if self._is_run_start:
       self.observers["run_start_cli_run_numbers"].append(self._run_call_count)
@@ -468,6 +476,19 @@ class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual(1, dumps.size)
     self.assertEqual("w_int_inner", dumps.dumped_tensor_data[0].node_name)
 
+  def testRunUnderProfilerModeWorks(self):
+    wrapped_sess = LocalCLIDebuggerWrapperSessionForTest(
+        [["-p"], []], self.sess)
+
+    wrapped_sess.run(self.w_int)
+
+    self.assertEqual(1, len(wrapped_sess.observers["profiler_run_metadata"]))
+    self.assertTrue(
+        wrapped_sess.observers["profiler_run_metadata"][0].step_stats)
+    self.assertEqual(1, len(wrapped_sess.observers["profiler_py_graphs"]))
+    self.assertIsInstance(
+        wrapped_sess.observers["profiler_py_graphs"][0], ops.Graph)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 616b7ae49b1a46b107825e5a246650473cca7fb8..b7f83afdb19f6bfbcf0aee579e712a36bffe0da0 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -17,10 +17,11 @@ py_library(
     srcs = ["estimator_lib.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":checkpoint_utils",
+        ":dnn",
         ":estimator",
         ":export",
         ":inputs",
+        ":linear",
         ":model_fn",
         ":run_config",
         "//tensorflow/python:util",
@@ -39,33 +40,6 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-py_library(
-    name = "checkpoint_utils",
-    srcs = ["checkpoint_utils.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:platform",
-        "//tensorflow/python:training",
-    ],
-)
-
-py_test(
-    name = "checkpoint_utils_test",
-    size = "small",
-    srcs = ["checkpoint_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":checkpoint_utils",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:partitioned_variables",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-    ],
-)
-
 py_library(
     name = "model_fn",
     srcs = ["model_fn.py"],
@@ -101,6 +75,65 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
+py_test(
+    name = "run_config_test",
+    size = "small",
+    srcs = ["run_config_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":run_config",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "dnn",
+    srcs = ["canned/dnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":estimator",
+        ":head",
+        ":model_fn",
+        ":optimizers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+    ],
+)
+
+py_test(
+    name = "dnn_test",
+    size = "medium",
+    srcs = ["canned/dnn_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dnn",
+        ":export_export",
+        ":metric_keys",
+        ":numpy_io",
+        ":prediction_keys",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
+    ],
+)
+
 py_library(
     name = "estimator",
     srcs = [
@@ -108,18 +141,19 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":checkpoint_utils",
         ":export",
         ":model_fn",
         ":run_config",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client",
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:metrics",
         "//tensorflow/python:platform",
+        "//tensorflow/python:random_seed",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python/saved_model:builder",
         "//tensorflow/python/saved_model:tag_constants",
     ],
@@ -131,20 +165,31 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":export",
         ":model_fn",
         ":numpy_io",
         ":run_config",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:layers",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:saver_test_utils",
         "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
     ],
 )
 
@@ -208,6 +253,59 @@ py_test(
     ],
 )
 
+py_library(
+    name = "head",
+    srcs = ["canned/head.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_output",
+        ":metric_keys",
+        ":model_fn",
+        ":prediction_keys",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:weights_broadcast_ops",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
+py_test(
+    name = "head_test",
+    size = "small",
+    srcs = ["canned/head_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":head",
+        ":metric_keys",
+        ":model_fn",
+        ":numpy_io",
+        ":prediction_keys",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/saved_model:signature_constants",
+    ],
+)
+
 py_library(
     name = "inputs",
     srcs = ["inputs/inputs.py"],
@@ -219,6 +317,58 @@ py_library(
     ],
 )
 
+py_library(
+    name = "linear",
+    srcs = ["canned/linear.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":estimator",
+        ":head",
+        ":optimizers",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+    ],
+)
+
+py_test(
+    name = "linear_test",
+    size = "medium",
+    srcs = ["canned/linear_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":estimator",
+        ":export_export",
+        ":linear",
+        ":metric_keys",
+        ":numpy_io",
+        ":run_config",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
+    ],
+)
+
+py_library(
+    name = "metric_keys",
+    srcs = ["canned/metric_keys.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":model_fn",
+    ],
+)
+
 py_library(
     name = "numpy_io",
     srcs = ["inputs/numpy_io.py"],
@@ -241,6 +391,27 @@ py_test(
     ],
 )
 
+py_library(
+    name = "optimizers",
+    srcs = ["canned/optimizers.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "optimizers_test",
+    size = "small",
+    srcs = ["canned/optimizers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":optimizers",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:training",
+    ],
+)
+
 py_library(
     name = "pandas_io",
     srcs = ["inputs/pandas_io.py"],
@@ -262,6 +433,13 @@ py_test(
     ],
 )
 
+py_library(
+    name = "prediction_keys",
+    srcs = ["canned/prediction_keys.py"],
+    srcs_version = "PY2AND3",
+    deps = [],
+)
+
 py_library(
     name = "inputs_queues",
     srcs = [
diff --git a/.gitmodules b/tensorflow/python/estimator/canned/__init__.py
similarity index 100%
rename from .gitmodules
rename to tensorflow/python/estimator/canned/__init__.py
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecc4fd3594dc33094cede0b5cf3fd846739e029a
--- /dev/null
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -0,0 +1,253 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Deep Neural Network estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.layers import core as core_layers
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
+
+# The default learning rate of 0.05 is a historical artifact of the initial
+# implementation, but seems a reasonable choice.
+_LEARNING_RATE = 0.05
+
+
+def _add_hidden_layer_summary(value, tag):
+  summary.scalar('%s_fraction_of_zero_values' % tag, nn.zero_fraction(value))
+  summary.histogram('%s_activation' % tag, value)
+
+
+def _dnn_model_fn(
+    features, labels, mode, head, hidden_units, feature_columns,
+    optimizer='Adagrad', activation_fn=nn.relu, dropout=None,
+    input_layer_partitioner=None, config=None):
+  """Deep Neural Net model_fn.
+
+  Args:
+    features: Dict of `Tensor` (depends on data passed to `train`).
+    labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
+      dtype `int32` or `int64` in the range `[0, n_classes)`.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `head_lib._Head` instance.
+    hidden_units: Iterable of integer number of hidden units per layer.
+    feature_columns: Iterable of `feature_column._FeatureColumn` model inputs.
+    optimizer: String, `tf.Optimizer` object, or callable that creates the
+      optimizer to use for training. If not specified, will use the Adagrad
+      optimizer with a default learning rate of 0.05.
+    activation_fn: Activation function applied to each layer.
+    dropout: When not `None`, the probability we will drop out a given
+      coordinate.
+    input_layer_partitioner: Partitioner for input layer. Defaults
+      to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+    config: `RunConfig` object to configure the runtime settings.
+
+  Returns:
+    predictions: A dict of `Tensor` objects.
+    loss: A scalar containing the loss of the step.
+    train_op: The op for training.
+  """
+  optimizer = optimizers.get_optimizer_instance(
+      optimizer, learning_rate=_LEARNING_RATE)
+  num_ps_replicas = config.num_ps_replicas if config else 0
+
+  partitioner = partitioned_variables.min_max_variable_partitioner(
+      max_partitions=num_ps_replicas)
+  with variable_scope.variable_scope(
+      'dnn',
+      values=tuple(six.itervalues(features)),
+      partitioner=partitioner):
+    input_layer_partitioner = input_layer_partitioner or (
+        partitioned_variables.min_max_variable_partitioner(
+            max_partitions=num_ps_replicas,
+            min_slice_size=64 << 20))
+    with variable_scope.variable_scope(
+        'input_from_feature_columns',
+        values=tuple(six.itervalues(features)),
+        partitioner=input_layer_partitioner):
+      net = feature_column_lib.input_layer(
+          features=features,
+          feature_columns=feature_columns)
+
+    for layer_id, num_hidden_units in enumerate(hidden_units):
+      with variable_scope.variable_scope(
+          'hiddenlayer_%d' % layer_id,
+          values=(net,)) as hidden_layer_scope:
+        net = core_layers.dense(
+            net,
+            units=num_hidden_units,
+            activation=activation_fn,
+            kernel_initializer=init_ops.glorot_uniform_initializer(),
+            name=hidden_layer_scope)
+        if dropout is not None and mode == model_fn.ModeKeys.TRAIN:
+          net = core_layers.dropout(net, rate=dropout, training=True)
+      _add_hidden_layer_summary(net, hidden_layer_scope.name)
+
+    with variable_scope.variable_scope(
+        'logits',
+        values=(net,)) as logits_scope:
+      logits = core_layers.dense(
+          net,
+          units=head.logits_dimension,
+          activation=None,
+          kernel_initializer=init_ops.glorot_uniform_initializer(),
+          name=logits_scope)
+    _add_hidden_layer_summary(logits, logits_scope.name)
+
+    def _train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      return optimizer.minimize(
+          loss,
+          global_step=training_util.get_global_step())
+
+    return head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+
+
+class DNNRegressor(estimator.Estimator):
+  """A regressor for TensorFlow DNN models.
+
+  Example:
+
+  ```python
+  sparse_feature_a = sparse_column_with_hash_bucket(...)
+  sparse_feature_b = sparse_column_with_hash_bucket(...)
+
+  sparse_feature_a_emb = embedding_column(sparse_id_column=sparse_feature_a,
+                                          ...)
+  sparse_feature_b_emb = embedding_column(sparse_id_column=sparse_feature_b,
+                                          ...)
+
+  estimator = DNNRegressor(
+      feature_columns=[sparse_feature_a, sparse_feature_b],
+      hidden_units=[1024, 512, 256])
+
+  # Or estimator using the ProximalAdagradOptimizer optimizer with
+  # regularization.
+  estimator = DNNRegressor(
+      feature_columns=[sparse_feature_a, sparse_feature_b],
+      hidden_units=[1024, 512, 256],
+      optimizer=tf.train.ProximalAdagradOptimizer(
+        learning_rate=0.1,
+        l1_regularization_strength=0.001
+      ))
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    pass
+  estimator.train(input_fn=input_fn_train)
+
+  def input_fn_eval: # returns x, y
+    pass
+  estimator.evaluate(input_fn=input_fn_eval)
+  def input_fn_predict: # returns x, None
+    pass
+  estimator.predict_scores(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+    otherwise there will be a `KeyError`:
+
+  * if `weight_feature_key` is not `None`, a feature with
+    `key=weight_feature_key` whose value is a `Tensor`.
+  * for each `column` in `feature_columns`:
+    - if `column` is a `SparseColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `WeightedSparseColumn`, two features: the first with
+      `key` the id column name, the second with `key` the weight column name.
+      Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `RealValuedColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+  """
+
+  def __init__(self,
+               hidden_units,
+               feature_columns,
+               model_dir=None,
+               label_dimension=1,
+               weight_feature_key=None,
+               optimizer='Adagrad',
+               activation_fn=nn.relu,
+               dropout=None,
+               input_layer_partitioner=None,
+               config=None):
+    """Initializes a `DNNRegressor` instance.
+
+    Args:
+      hidden_units: Iterable of number hidden units per layer. All layers are
+        fully connected. Ex. `[64, 32]` means first layer has 64 nodes and
+        second one has 32.
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      label_dimension: Number of regression targets per example. This is the
+        size of the last dimension of the labels and logits `Tensor` objects
+        (typically, these have shape `[batch_size, label_dimension]`).
+      weight_feature_key: A string defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+      optimizer: An instance of `tf.Optimizer` used to train the model. If
+        `None`, will use an Adagrad optimizer.
+      activation_fn: Activation function applied to each layer. If `None`, will
+        use `tf.nn.relu`.
+      dropout: When not `None`, the probability we will drop out a given
+        coordinate.
+      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
+        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Returns:
+      A `DNNRegressor` estimator.
+    """
+    def _model_fn(features, labels, mode, config):
+      return _dnn_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          # pylint: disable=protected-access
+          head=head_lib._regression_head_with_mean_squared_error_loss(
+              label_dimension=label_dimension,
+              weight_feature_key=weight_feature_key),
+          # pylint: enable=protected-access
+          hidden_units=hidden_units,
+          feature_columns=tuple(feature_columns or []),
+          optimizer=optimizer,
+          activation_fn=activation_fn,
+          dropout=dropout,
+          input_layer_partitioner=input_layer_partitioner,
+          config=config)
+    super(DNNRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/python/estimator/canned/dnn_test.py b/tensorflow/python/estimator/canned/dnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4654e97d6ea5f24cef05c399989a409c54e7f6d4
--- /dev/null
+++ b/tensorflow/python/estimator/canned/dnn_test.py
@@ -0,0 +1,1122 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dnn.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.estimator.canned import dnn
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary import summary as summary_lib
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import saver
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+
+# Names of variables created by model.
+_LEARNING_RATE_NAME = 'dnn/regression_head/dnn/learning_rate'
+_HIDDEN_WEIGHTS_NAME_PATTERN = 'dnn/hiddenlayer_%d/kernel'
+_HIDDEN_BIASES_NAME_PATTERN = 'dnn/hiddenlayer_%d/bias'
+_LOGITS_WEIGHTS_NAME = 'dnn/logits/kernel'
+_LOGITS_BIASES_NAME = 'dnn/logits/bias'
+
+
+def _create_checkpoint(weights_and_biases, global_step, model_dir):
+  """Create checkpoint file with provided model weights.
+
+  Args:
+    weights_and_biases: Iterable of tuples of weight and bias values.
+    global_step: Initial global step to save in checkpoint.
+    model_dir: Directory into which checkpoint is saved.
+  """
+  weights, biases = zip(*weights_and_biases)
+  model_weights = {}
+
+  # Hidden layer weights.
+  for i in range(0, len(weights) - 1):
+    model_weights[_HIDDEN_WEIGHTS_NAME_PATTERN % i] = weights[i]
+    model_weights[_HIDDEN_BIASES_NAME_PATTERN % i] = biases[i]
+
+  # Output layer weights.
+  model_weights[_LOGITS_WEIGHTS_NAME] = weights[-1]
+  model_weights[_LOGITS_BIASES_NAME] = biases[-1]
+
+  with ops.Graph().as_default():
+    # Create model variables.
+    for k, v in six.iteritems(model_weights):
+      variables_lib.Variable(v, name=k, dtype=dtypes.float32)
+
+    # Create non-model variables.
+    global_step_var = training_util.create_global_step()
+    # TODO(ptucker): We shouldn't have this in the checkpoint for constant LRs.
+    # Learning rate.
+    variables_lib.Variable(.5, name=_LEARNING_RATE_NAME, dtype=dtypes.float32)
+
+    # Initialize vars and save checkpoint.
+    with tf_session.Session() as sess:
+      variables_lib.global_variables_initializer().run()
+      global_step_var.assign(global_step).eval()
+      saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
+
+
+class DNNRegressorEvaluateTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def test_simple(self):
+    # Create checkpoint: num_inputs=1, hidden_units=(2, 2), num_outputs=1.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.),), (3., 4.)),
+        (((5., 6.), (7., 8.),), (9., 10.)),
+        (((11.,), (12.,),), (13.,))
+    ), global_step, self._model_dir)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('age'),),
+        model_dir=self._model_dir)
+    def _input_fn():
+      return {'age': ((1,),)}, ((10.,),)
+    # TODO(ptucker): Point to tool for calculating a neural net output?
+    # prediction = 1778
+    # loss = (10-1778)^2 = 3125824
+    expected_loss = 3125824
+    self.assertAllClose({
+        metric_keys.MetricKeys.LOSS: expected_loss,
+        metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=_input_fn, steps=1))
+
+  def test_weighted(self):
+    # Create checkpoint: num_inputs=1, hidden_units=(2, 2), num_outputs=1.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.),), (3., 4.)),
+        (((5., 6.), (7., 8.),), (9., 10.)),
+        (((11.,), (12.,),), (13.,))
+    ), global_step, self._model_dir)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('age'),),
+        model_dir=self._model_dir,
+        weight_feature_key='label_weight')
+    def _input_fn():
+      return {'age': ((1,),), 'label_weight': ((1.5,),)}, ((10.,),)
+    self.assertAllClose({
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        # prediction = 1778
+        # loss = 1.5*((10-1778)^2) = 4688736
+        metric_keys.MetricKeys.LOSS: 4688736,
+        # average_loss = loss / 1.5 = 3125824
+        metric_keys.MetricKeys.LOSS_MEAN: 3125824,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=_input_fn, steps=1))
+
+  def test_multi_example(self):
+    # Create initial checkpoint, 1 input, 2x2 hidden dims, 1 outputs.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.),), (3., 4.)),
+        (((5., 6.), (7., 8.),), (9., 10.)),
+        (((11.,), (12.,),), (13.,))
+    ), global_step, self._model_dir)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('age'),),
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={'age': np.array(((1,), (2,), (3,)))},
+        y=np.array(((10,), (9,), (8,))),
+        batch_size=3,
+        shuffle=False)
+    self.assertAllClose({
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        # predictions = 1778, 2251, 2724
+        # loss = ((10-1778)^2 + (9-2251)^2 + (8-2724)^2) = 15529044
+        metric_keys.MetricKeys.LOSS: 15529044.,
+        # average_loss = loss / 3 = 5176348
+        metric_keys.MetricKeys.LOSS_MEAN: 5176348.,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=input_fn, steps=1))
+
+  def test_multi_batch(self):
+    # Create checkpoint: num_inputs=1, hidden_units=(2, 2), num_outputs=1.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.),), (3., 4.)),
+        (((5., 6.), (7., 8.),), (9., 10.)),
+        (((11.,), (12.,),), (13.,))
+    ), global_step, self._model_dir)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('age'),),
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={'age': np.array(((1,), (2,), (3,)))},
+        y=np.array(((10,), (9,), (8,))),
+        batch_size=1,
+        shuffle=False)
+    # TODO(ptucker): Point to tool for calculating a neural net output?
+    # predictions = 1778, 2251, 2724
+    # loss = ((10-1778)^2 + (9-2251)^2 + (8-2724)^2) / 3 = 5176348
+    expected_loss = 5176348.
+    self.assertAllClose({
+        metric_keys.MetricKeys.LOSS: expected_loss,
+        metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=input_fn, steps=3))
+
+  def test_weighted_multi_example(self):
+    # Create checkpoint: num_inputs=4, hidden_units=(2, 2), num_outputs=3.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.), (3., 4.), (5., 6.), (7., 8.),), (9., 8.)),
+        (((7., 6.), (5., 4.),), (3., 2.)),
+        (((1., 2., 3.), (4., 5., 6.),), (7., 8., 9.)),
+    ), global_step, self._model_dir)
+
+    # Create batched input.
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            # Dimensions are (batch_size, feature_column.dimension).
+            'x': np.array((
+                (15., 0., 1.5, 135.2),
+                (45., 45000., 1.8, 158.8),
+                (21., 33000., 1.7, 207.1),
+                (60., 10000., 1.6, 90.2)
+            )),
+            # TODO(ptucker): Add test for different weight shapes when we fix
+            # head._compute_weighted_loss (currently it requires weights to be
+            # same shape as labels & logits).
+            'label_weight': np.array((
+                (1., 1., 0.),
+                (.5, 1., .1),
+                (.5, 0., .9),
+                (0., 0., 0.),
+            ))
+        },
+        # Label shapes is (batch_size, num_outputs).
+        y=np.array((
+            (5., 2., 2.),
+            (-2., 1., -4.),
+            (-1., -1., -1.),
+            (-4., 3., 9.),
+        )),
+        batch_size=4,
+        shuffle=False)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(
+            # Dimension is number of inputs.
+            feature_column.numeric_column(
+                'x', dtype=dtypes.int32, shape=(4,)),
+        ),
+        model_dir=self._model_dir,
+        label_dimension=3,
+        weight_feature_key='label_weight')
+    self.assertAllClose({
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        # predictions = [
+        #   [  54033.5    76909.6    99785.7]
+        #   [8030393.8 11433082.4 14835771.0]
+        #   [5923209.2  8433014.8 10942820.4]
+        #   [1810021.6  2576969.6  3343917.6]
+        # ]
+        # loss = sum(label_weights*(labels-predictions)^2) = 3.10290850204e+14
+        metric_keys.MetricKeys.LOSS: 3.10290850204e+14,
+        # average_loss = loss / sum(label_weights) = 3.10290850204e+14 / 5.
+        #              = 6.205817e+13
+        metric_keys.MetricKeys.LOSS_MEAN: 6.205817e+13,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=input_fn, steps=1))
+
+  def test_weighted_multi_example_multi_column(self):
+    # Create checkpoint: num_inputs=4, hidden_units=(2, 2), num_outputs=3.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.), (3., 4.), (5., 6.), (7., 8.),), (9., 8.)),
+        (((7., 6.), (5., 4.),), (3., 2.)),
+        (((1., 2., 3.), (4., 5., 6.),), (7., 8., 9.)),
+    ), global_step, self._model_dir)
+
+    # Create batched input.
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            # Dimensions are (batch_size, feature_column.dimension).
+            'x': np.array((
+                (15., 0.),
+                (45., 45000.),
+                (21., 33000.),
+                (60., 10000.)
+            )),
+            'y': np.array((
+                (1.5, 135.2),
+                (1.8, 158.8),
+                (1.7, 207.1),
+                (1.6, 90.2)
+            )),
+            # TODO(ptucker): Add test for different weight shapes when we fix
+            # head._compute_weighted_loss (currently it requires weights to be
+            # same shape as labels & logits).
+            'label_weight': np.array((
+                (1., 1., 0.),
+                (.5, 1., .1),
+                (.5, 0., .9),
+                (0., 0., 0.),
+            ))
+        },
+        # Label shapes is (batch_size, num_outputs).
+        y=np.array((
+            (5., 2., 2.),
+            (-2., 1., -4.),
+            (-1., -1., -1.),
+            (-4., 3., 9.),
+        )),
+        batch_size=4,
+        shuffle=False)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(
+            # Dimensions add up to 4 (number of inputs).
+            feature_column.numeric_column(
+                'x', dtype=dtypes.int32, shape=(2,)),
+            feature_column.numeric_column(
+                'y', dtype=dtypes.float32, shape=(2,)),
+        ),
+        model_dir=self._model_dir,
+        label_dimension=3,
+        weight_feature_key='label_weight')
+    self.assertAllClose({
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        # predictions = [
+        #   [  54033.5    76909.6    99785.7]
+        #   [8030393.8 11433082.4 14835771.0]
+        #   [5923209.2  8433014.8 10942820.4]
+        #   [1810021.6  2576969.6  3343917.6]
+        # ]
+        # loss = sum(label_weights*(labels-predictions)^2) = 3.10290850204e+14
+        metric_keys.MetricKeys.LOSS: 3.10290850204e+14,
+        # average_loss = loss / sum(label_weights) = 3.10290850204e+14 / 5.
+        #              = 6.205817e+13
+        metric_keys.MetricKeys.LOSS_MEAN: 6.205817e+13,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=input_fn, steps=1))
+
+  def test_weighted_multi_batch(self):
+    # Create checkpoint: num_inputs=4, hidden_units=(2, 2), num_outputs=3.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.), (3., 4.), (5., 6.), (7., 8.),), (9., 8.)),
+        (((7., 6.), (5., 4.),), (3., 2.)),
+        (((1., 2., 3.), (4., 5., 6.),), (7., 8., 9.)),
+    ), global_step, self._model_dir)
+
+    # Create batched input.
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            # Dimensions are (batch_size, feature_column.dimension).
+            'x': np.array((
+                (15., 0., 1.5, 135.2),
+                (45., 45000., 1.8, 158.8),
+                (21., 33000., 1.7, 207.1),
+                (60., 10000., 1.6, 90.2)
+            )),
+            # TODO(ptucker): Add test for different weight shapes when we fix
+            # head._compute_weighted_loss (currently it requires weights to be
+            # same shape as labels & logits).
+            'label_weights': np.array((
+                (1., 1., 0.),
+                (.5, 1., .1),
+                (.5, 0., .9),
+                (0., 0., 0.),
+            ))
+        },
+        # Label shapes is (batch_size, num_outputs).
+        y=np.array((
+            (5., 2., 2.),
+            (-2., 1., -4.),
+            (-1., -1., -1.),
+            (-4., 3., 9.),
+        )),
+        batch_size=1,
+        shuffle=False)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(
+            # Dimension is number of inputs.
+            feature_column.numeric_column(
+                'x', dtype=dtypes.int32, shape=(4,)),
+        ),
+        model_dir=self._model_dir,
+        label_dimension=3,
+        weight_feature_key='label_weights')
+    self.assertAllClose({
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        # predictions = [
+        #   [  54033.5    76909.6    99785.7]
+        #   [8030393.8 11433082.4 14835771.0]
+        #   [5923209.2  8433014.8 10942820.4]
+        #   [1810021.6  2576969.6  3343917.6]
+        # ]
+        # losses = label_weights*(labels-predictions)^2 = [
+        #  [  2.91907881e+09   5.91477894e+09                0]
+        #  [  3.22436284e+13   1.30715350e+14   2.20100220e+13]
+        #  [  1.75422095e+13                0   1.07770806e+14]
+        #  [               0                0                0]
+        # ]
+        # total_loss = sum(losses) = 3.10290850204e+14
+        # loss = total_loss / 4 = 7.7572712551e+13
+        metric_keys.MetricKeys.LOSS: 7.7572712551e+13,
+        # average_loss = total_loss / sum(label_weights) = 6.20581700408e+13
+        metric_keys.MetricKeys.LOSS_MEAN: 6.20581700408e+13,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=input_fn, steps=4))
+
+  def test_multi_dim(self):
+    # Create checkpoint: num_inputs=3, hidden_units=(2, 2), num_outputs=2.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.), (3., 4.), (5., 6.),), (7., 8.)),
+        (((9., 8.), (7., 6.),), (5., 4.)),
+        (((3., 2.), (1., 2.),), (3., 4.)),
+    ), global_step, self._model_dir)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('x', shape=(3,)),),
+        label_dimension=2,
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={'x': np.array(((2., 4., 5.),))},
+        y=np.array(((46., 58.),)),
+        batch_size=1,
+        shuffle=False)
+    self.assertAllClose({
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        # predictions = 3198, 3094
+        # loss = ((46-3198)^2 + (58-3094)^2) = 19152400
+        metric_keys.MetricKeys.LOSS: 19152400,
+        # average_loss = loss / 2 = 9576200
+        metric_keys.MetricKeys.LOSS_MEAN: 9576200,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=input_fn, steps=1))
+
+  def test_multi_feature_column(self):
+    # Create checkpoint: num_inputs=2, hidden_units=(2, 2), num_outputs=1.
+    global_step = 100
+    _create_checkpoint((
+        (((1., 2.), (3., 4.),), (5., 6.)),
+        (((7., 8.), (9., 8.),), (7., 6.)),
+        (((5.,), (4.,),), (3.,))
+    ), global_step, self._model_dir)
+
+    # Create DNNRegressor and evaluate.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('age'),
+                         feature_column.numeric_column('height')),
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={'age': np.array(((20,), (40,))), 'height': np.array(((4,), (8,)))},
+        y=np.array(((213.,), (421.,))),
+        batch_size=2,
+        shuffle=False)
+    self.assertAllClose({
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        # predictions = 7315, 13771
+        # loss = ((213-7315)^2 + (421-13771)^2) / 2 = 228660896
+        metric_keys.MetricKeys.LOSS: 228660896.,
+        # average_loss = loss / 2 = 114330452
+        metric_keys.MetricKeys.LOSS_MEAN: 114330452.,
+        ops.GraphKeys.GLOBAL_STEP: global_step
+    }, dnn_regressor.evaluate(input_fn=input_fn, steps=1))
+
+
+class DNNRegressorPredictTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def test_1d(self):
+    """Tests predict when all variables are one-dimensional."""
+    # Create checkpoint: num_inputs=1, hidden_units=(2, 2), num_outputs=1.
+    _create_checkpoint((
+        (((1., 2.),), (3., 4.)),
+        (((5., 6.), (7., 8.),), (9., 10.)),
+        (((11.,), (12.,),), (13.,))
+    ), global_step=0, model_dir=self._model_dir)
+
+    # Create DNNRegressor and predict.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('x'),),
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={'x': np.array(((1.,),))}, batch_size=1, shuffle=False)
+    # TODO(ptucker): Point to tool for calculating a neural net output?
+    # prediction = 1778
+    self.assertAllClose({
+        prediction_keys.PredictionKeys.PREDICTIONS: (1778.,)
+    }, next(dnn_regressor.predict(input_fn=input_fn)))
+
+  def test_multi_dim(self):
+    """Tests predict when all variables are multi-dimenstional."""
+    # Create checkpoint: num_inputs=4, hidden_units=(2, 2), num_outputs=3.
+    _create_checkpoint((
+        (((1., 2.), (3., 4.), (5., 6.), (7., 8.),), (9., 8.)),
+        (((7., 6.), (5., 4.),), (3., 2.)),
+        (((1., 2., 3.), (4., 5., 6.),), (7., 8., 9.)),
+    ), 100, self._model_dir)
+
+    # Create DNNRegressor and predict.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('x', shape=(4,)),),
+        label_dimension=3,
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        # Inputs shape is (batch_size, num_inputs).
+        x={'x': np.array(((1., 2., 3., 4.), (5., 6., 7., 8.)))},
+        batch_size=2,
+        shuffle=False)
+    # Output shape=(batch_size, num_outputs).
+    self.assertAllClose((
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        (3275., 4660., 6045.),
+        (6939., 9876., 12813.)
+    ), tuple([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in dnn_regressor.predict(input_fn=input_fn)
+    ]), rtol=1e-04)
+
+  def test_two_feature_columns(self):
+    """Tests predict with two feature columns."""
+    # Create checkpoint: num_inputs=2, hidden_units=(2, 2), num_outputs=1.
+    _create_checkpoint((
+        (((1., 2.), (3., 4.),), (5., 6.)),
+        (((7., 8.), (9., 8.),), (7., 6.)),
+        (((5.,), (4.,),), (3.,))
+    ), 100, self._model_dir)
+
+    # Create DNNRegressor and predict.
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=(feature_column.numeric_column('x'),
+                         feature_column.numeric_column('y')),
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={'x': np.array((20.,)), 'y': np.array((4.,))},
+        batch_size=1,
+        shuffle=False)
+    self.assertAllClose({
+        # TODO(ptucker): Point to tool for calculating a neural net output?
+        # predictions = 7315
+        prediction_keys.PredictionKeys.PREDICTIONS: (7315,)
+    }, next(dnn_regressor.predict(input_fn=input_fn)))
+
+
+class DNNRegressorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def test_complete_flow(self):
+    label_dimension = 2
+    batch_size = 10
+    feature_columns = [feature_column.numeric_column('x', shape=(2,))]
+    est = dnn.DNNRegressor(
+        hidden_units=(2, 2),
+        feature_columns=feature_columns,
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+
+    # TRAIN
+    # learn y = x
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    num_steps = 200
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUTE
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        shuffle=False)
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        batch_size=batch_size,
+        shuffle=False)
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+    # TODO(ptucker): Deterministic test for predicted values?
+
+    # EXPORT
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+
+def _full_var_name(var_name):
+  return '%s/part_0:0' % var_name
+
+
+def _assert_close(expected, actual, rtol=1e-04, name='assert_close'):
+  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
+    expected = ops.convert_to_tensor(expected, name='expected')
+    actual = ops.convert_to_tensor(actual, name='actual')
+    rdiff = math_ops.abs(expected - actual, 'diff') / expected
+    rtol = ops.convert_to_tensor(rtol, name='rtol')
+    return check_ops.assert_less(
+        rdiff,
+        rtol,
+        data=(
+            'Condition expected =~ actual did not hold element-wise:'
+            'expected = ', expected,
+            'actual = ', actual,
+            'rdiff = ', rdiff,
+            'rtol = ', rtol,
+        ),
+        name=scope)
+
+
+class _SummaryHook(session_run_hook.SessionRunHook):
+  """Saves summaries every N steps."""
+
+  def __init__(self):
+    self._summaries = []
+
+  def begin(self):
+    self._summary_op = summary_lib.merge_all()
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs({'summary': self._summary_op})
+
+  def after_run(self, run_context, run_values):
+    s = summary_pb2.Summary()
+    s.ParseFromString(run_values.results['summary'])
+    self._summaries.append(s)
+
+  def summaries(self):
+    return tuple(self._summaries)
+
+
+class DNNRegressorTrainTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def _assert_checkpoint(
+      self, global_step, input_units, hidden_units, output_units):
+    """Asserts checkpoint contains expected variables with proper shapes.
+
+    Args:
+      global_step: Expected global step value.
+      input_units: The dimension of input layer.
+      hidden_units: Iterable of integer sizes for the hidden layers.
+      output_units: The dimension of output layer (logits).
+    """
+    shapes = {
+        name: shape
+        for (name, shape) in checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    # Global step.
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(
+        global_step,
+        checkpoint_utils.load_variable(
+            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
+
+    # Hidden layer weights.
+    prev_layer_units = input_units
+    for i in range(len(hidden_units)):
+      layer_units = hidden_units[i]
+      self.assertAllEqual((prev_layer_units, layer_units),
+                          shapes[_HIDDEN_WEIGHTS_NAME_PATTERN % i])
+      self.assertAllEqual((layer_units,),
+                          shapes[_HIDDEN_BIASES_NAME_PATTERN % i])
+      prev_layer_units = layer_units
+
+    # Output layer weights.
+    self.assertAllEqual((prev_layer_units, output_units),
+                        shapes[_LOGITS_WEIGHTS_NAME])
+    self.assertAllEqual((output_units,), shapes[_LOGITS_BIASES_NAME])
+
+  def _mockOptimizer(self, hidden_units, expected_loss=None):
+    hidden_weights_names = [
+        (_HIDDEN_WEIGHTS_NAME_PATTERN + '/part_0:0') % i
+        for i in range(len(hidden_units))]
+    hidden_biases_names = [
+        (_HIDDEN_BIASES_NAME_PATTERN + '/part_0:0') % i
+        for i in range(len(hidden_units))]
+    expected_var_names = (
+        hidden_weights_names + hidden_biases_names +
+        [_LOGITS_WEIGHTS_NAME + '/part_0:0', _LOGITS_BIASES_NAME + '/part_0:0'])
+
+    def _minimize(loss, global_step):
+      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(
+          expected_var_names,
+          [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        return state_ops.assign_add(global_step, 1).op
+      assert_loss = _assert_close(
+          math_ops.to_float(expected_loss, name='expected'), loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        return state_ops.assign_add(global_step, 1).op
+
+    mock_optimizer = test.mock.NonCallableMagicMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    return mock_optimizer
+
+  def _assert_simple_summary(self, expected_values, actual_summary):
+    """Assert summary the specified simple values.
+
+    Args:
+      expected_values: Dict of expected tags and simple values.
+      actual_summary: `summary_pb2.Summary`.
+    """
+    self.assertAllClose(expected_values, {
+        v.tag: v.simple_value
+        for v in actual_summary.value if (v.tag in expected_values)
+    })
+
+  def test_from_scratch_with_default_optimizer(self):
+    hidden_units = (2, 2)
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=hidden_units,
+        feature_columns=(feature_column.numeric_column('age'),),
+        model_dir=self._model_dir)
+
+    # Train for a few steps, then validate final checkpoint.
+    num_steps = 5
+    dnn_regressor.train(
+        input_fn=lambda: ({'age': ((1,),)}, ((10,),)), steps=num_steps)
+    self._assert_checkpoint(
+        num_steps, input_units=1, hidden_units=hidden_units, output_units=1)
+
+  def test_from_scratch(self):
+    hidden_units = (2, 2)
+    mock_optimizer = self._mockOptimizer(hidden_units=hidden_units)
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=hidden_units,
+        feature_columns=(feature_column.numeric_column('age'),),
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, then validate optimizer, summaries, and
+    # checkpoint.
+    num_steps = 5
+    summary_hook = _SummaryHook()
+    dnn_regressor.train(
+        input_fn=lambda: ({'age': ((1,),)}, ((5.,),)), steps=num_steps,
+        hooks=(summary_hook,))
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        num_steps, input_units=1, hidden_units=hidden_units, output_units=1)
+    summaries = summary_hook.summaries()
+    self.assertEqual(num_steps, len(summaries))
+    for summary in summaries:
+      summary_keys = [v.tag for v in summary.value]
+      self.assertIn(metric_keys.MetricKeys.LOSS, summary_keys)
+      self.assertIn(metric_keys.MetricKeys.LOSS_MEAN, summary_keys)
+
+  def test_simple(self):
+    base_global_step = 100
+    hidden_units = (2, 2)
+    _create_checkpoint((
+        (((1., 2.),), (3., 4.)),
+        (((5., 6.), (7., 8.),), (9., 10.)),
+        (((11.,), (12.,),), (13.,))
+    ), base_global_step, self._model_dir)
+
+    # Create DNNRegressor with mock optimizer.
+    # TODO(ptucker): Point to tool for calculating a neural net output?
+    # prediction = 1778
+    # loss = (10-1778)^2 = 3125824
+    expected_loss = 3125824.
+    mock_optimizer = self._mockOptimizer(
+        hidden_units=hidden_units, expected_loss=expected_loss)
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=hidden_units,
+        feature_columns=(feature_column.numeric_column('age'),),
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, then validate optimizer, summaries, and
+    # checkpoint.
+    num_steps = 5
+    summary_hook = _SummaryHook()
+    dnn_regressor.train(
+        input_fn=lambda: ({'age': ((1,),)}, ((10.,),)), steps=num_steps,
+        hooks=(summary_hook,))
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    summaries = summary_hook.summaries()
+    self.assertEqual(num_steps, len(summaries))
+    for summary in summaries:
+      self._assert_simple_summary({
+          metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
+          'dnn/dnn/hiddenlayer_0_activation': 0.,
+          'dnn/dnn/hiddenlayer_0_fraction_of_zero_values': 0.,
+          'dnn/dnn/hiddenlayer_1_activation': 0.,
+          'dnn/dnn/hiddenlayer_1_fraction_of_zero_values': 0.,
+          'dnn/dnn/logits_activation': 0.,
+          'dnn/dnn/logits_fraction_of_zero_values': 0.,
+          metric_keys.MetricKeys.LOSS: expected_loss,
+      }, summary)
+    self._assert_checkpoint(
+        base_global_step + num_steps, input_units=1, hidden_units=hidden_units,
+        output_units=1)
+
+  def test_activation_fn(self):
+    base_global_step = 100
+    hidden_units = (2, 2)
+    _create_checkpoint((
+        (((1., 2.),), (3., 4.)),
+        (((5., 6.), (7., 8.),), (9., 10.)),
+        (((11.,), (12.,),), (13.,))
+    ), base_global_step, self._model_dir)
+
+    # Create DNNRegressor with mock optimizer.
+    # TODO(ptucker): Point to tool for calculating a neural net output?
+    # prediction = 36
+    # loss = (10-36)^2 = 676
+    expected_loss = 676.
+    mock_optimizer = self._mockOptimizer(
+        hidden_units=hidden_units, expected_loss=expected_loss)
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=hidden_units,
+        feature_columns=(feature_column.numeric_column('age'),),
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir,
+        activation_fn=nn.tanh)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, then validate optimizer, summaries, and
+    # checkpoint.
+    num_steps = 5
+    summary_hook = _SummaryHook()
+    dnn_regressor.train(
+        input_fn=lambda: ({'age': ((1,),)}, ((10.,),)), steps=num_steps,
+        hooks=(summary_hook,))
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    summaries = summary_hook.summaries()
+    self.assertEqual(num_steps, len(summaries))
+    for summary in summaries:
+      self._assert_simple_summary({
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
+          'dnn/dnn/hiddenlayer_0_activation': 0.,
+          'dnn/dnn/hiddenlayer_0_fraction_of_zero_values': 0.,
+          'dnn/dnn/hiddenlayer_1_activation': 0.,
+          'dnn/dnn/hiddenlayer_1_fraction_of_zero_values': 0.,
+          'dnn/dnn/logits_activation': 0.,
+          'dnn/dnn/logits_fraction_of_zero_values': 0.,
+      }, summary)
+    self._assert_checkpoint(
+        base_global_step + num_steps, input_units=1, hidden_units=hidden_units,
+        output_units=1)
+
+  def test_weighted_multi_example_multi_column(self):
+    hidden_units = (2, 2)
+    base_global_step = 100
+    _create_checkpoint((
+        (((1., 2.), (3., 4.), (5., 6.), (7., 8.),), (9., 8.)),
+        (((7., 6.), (5., 4.),), (3., 2.)),
+        (((1., 2., 3.), (4., 5., 6.),), (7., 8., 9.)),
+    ), base_global_step, self._model_dir)
+
+    # Create DNNRegressor with mock optimizer.
+    # TODO(ptucker): Point to tool for calculating a neural net output?
+    # predictions = [
+    #   [  54033.5    76909.6    99785.7]
+    #   [8030393.8 11433082.4 14835771.0]
+    #   [5923209.2  8433014.8 10942820.4]
+    #   [1810021.6  2576969.6  3343917.6]
+    # ]
+    # loss = sum(label_weights*(labels-predictions)^2) = 3.10290850204e+14
+    expected_loss = 3.10290850204e+14
+    mock_optimizer = self._mockOptimizer(
+        hidden_units=hidden_units, expected_loss=expected_loss)
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=hidden_units,
+        feature_columns=(
+            # Dimensions add up to 4 (number of inputs).
+            feature_column.numeric_column(
+                'x', dtype=dtypes.int32, shape=(2,)),
+            feature_column.numeric_column(
+                'y', dtype=dtypes.float32, shape=(2,)),
+        ),
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir,
+        label_dimension=3,
+        weight_feature_key='label_weights')
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Create batched inputs.
+    input_fn = numpy_io.numpy_input_fn(
+        # NOTE: feature columns are concatenated in alphabetic order of keys.
+        x={
+            # Inputs shapes are (batch_size, feature_column.dimension).
+            'x': np.array((
+                (15., 0.),
+                (45., 45000.),
+                (21., 33000.),
+                (60., 10000.)
+            )),
+            'y': np.array((
+                (1.5, 135.2),
+                (1.8, 158.8),
+                (1.7, 207.1),
+                (1.6, 90.2)
+            )),
+            # TODO(ptucker): Add test for different weight shapes when we fix
+            # head._compute_weighted_loss (currently it requires weights to be
+            # same shape as labels & logits).
+            'label_weights': np.array((
+                (1., 1., 0.),
+                (.5, 1., .1),
+                (.5, 0., .9),
+                (0., 0., 0.),
+            ))
+        },
+        # Labels shapes is (batch_size, num_outputs).
+        y=np.array((
+            (5., 2., 2.),
+            (-2., 1., -4.),
+            (-1., -1., -1.),
+            (-4., 3., 9.),
+        )),
+        batch_size=4,
+        num_epochs=None,
+        shuffle=False)
+
+    # Train for 1 step, then validate optimizer, summaries, and checkpoint.
+    summary_hook = _SummaryHook()
+    dnn_regressor.train(input_fn=input_fn, steps=1, hooks=(summary_hook,))
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    summaries = summary_hook.summaries()
+    self.assertEqual(1, len(summaries))
+    self._assert_simple_summary({
+        metric_keys.MetricKeys.LOSS: expected_loss,
+        # average_loss = loss / sum(label_weights) = 3.10290850204e+14 / 5.
+        #              = 6.205817e+13
+        metric_keys.MetricKeys.LOSS_MEAN: 6.205817e+13,
+        'dnn/dnn/hiddenlayer_0_activation': 0.,
+        'dnn/dnn/hiddenlayer_0_fraction_of_zero_values': 0.,
+        'dnn/dnn/hiddenlayer_1_activation': 0.,
+        'dnn/dnn/hiddenlayer_1_fraction_of_zero_values': 0.,
+        'dnn/dnn/logits_activation': 0.,
+        'dnn/dnn/logits_fraction_of_zero_values': 0.,
+    }, summaries[0])
+    self._assert_checkpoint(
+        base_global_step + 1,
+        input_units=4,  # Sum of feature column dimensions.
+        hidden_units=hidden_units,
+        output_units=3)  # = label_dimension
+
+    # Train for 3 steps - we should still get the same loss since we're not
+    # updating weights.
+    dnn_regressor.train(input_fn=input_fn, steps=3)
+    self.assertEqual(2, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        base_global_step + 4,
+        input_units=4,  # Sum of feature column dimensions.
+        hidden_units=hidden_units,
+        output_units=3)  # = label_dimension
+
+  def test_weighted_multi_batch(self):
+    hidden_units = (2, 2)
+    base_global_step = 100
+    _create_checkpoint((
+        (((1., 2.), (3., 4.), (5., 6.), (7., 8.),), (9., 8.)),
+        (((7., 6.), (5., 4.),), (3., 2.)),
+        (((1., 2., 3.), (4., 5., 6.),), (7., 8., 9.)),
+    ), base_global_step, self._model_dir)
+
+    mock_optimizer = self._mockOptimizer(hidden_units=hidden_units)
+    dnn_regressor = dnn.DNNRegressor(
+        hidden_units=hidden_units,
+        feature_columns=(
+            # Dimension is number of inputs.
+            feature_column.numeric_column(
+                'x', dtype=dtypes.int32, shape=(4,)),
+        ),
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir,
+        label_dimension=3,
+        weight_feature_key='label_weights')
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Create batched input.
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            # Inputs shape is (batch_size, feature_column.dimension).
+            'x': np.array((
+                (15., 0., 1.5, 135.2),
+                (45., 45000., 1.8, 158.8),
+                (21., 33000., 1.7, 207.1),
+                (60., 10000., 1.6, 90.2)
+            )),
+            # TODO(ptucker): Add test for different weight shapes when we fix
+            # head._compute_weighted_loss (currently it requires weights to be
+            # same shape as labels & logits).
+            'label_weights': np.array((
+                (1., 1., 0.),
+                (.5, 1., .1),
+                (.5, 0., .9),
+                (0., 0., 0.),
+            ))
+        },
+        # Labels shapes is (batch_size, num_outputs).
+        y=np.array((
+            (5., 2., 2.),
+            (-2., 1., -4.),
+            (-1., -1., -1.),
+            (-4., 3., 9.),
+        )),
+        batch_size=1,
+        shuffle=False)
+
+    # Train for 1 step, then validate optimizer, summaries, and checkpoint.
+    num_steps = 4
+    summary_hook = _SummaryHook()
+    dnn_regressor.train(
+        input_fn=input_fn, steps=num_steps, hooks=(summary_hook,))
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    summaries = summary_hook.summaries()
+    self.assertEqual(num_steps, len(summaries))
+    # TODO(ptucker): Point to tool for calculating a neural net output?
+    # predictions = [
+    #   [  54033.5    76909.6    99785.7]
+    #   [8030393.8 11433082.4 14835771.0]
+    #   [5923209.2  8433014.8 10942820.4]
+    #   [1810021.6  2576969.6  3343917.6]
+    # ]
+    # losses = label_weights*(labels-predictions)^2 = [
+    #   [2.91907881e+09 5.91477894e+09              0]
+    #   [3.22436284e+13 1.30715350e+14 2.20100220e+13]
+    #   [1.75422095e+13              0 1.07770806e+14]
+    #   [             0              0              0]
+    # ]
+    # step_losses = [sum(losses[i]) for i in 0...3]
+    #             = [8833857750, 1.84969e+14, 1.2531302e+14, 0]
+    expected_step_losses = (8833857750, 1.84969e+14, 1.2531302e+14, 0)
+    # step_average_losses = [
+    #     step_losses[i] / sum(label_weights[i]) for i in 0...3
+    # ] = [4416928875, 1.1560563e+14, 8.95093e+13, 0]
+    expected_step_average_losses = (4416928875, 1.1560563e+14, 8.95093e+13, 0)
+    for i in range(len(summaries)):
+      self._assert_simple_summary({
+          metric_keys.MetricKeys.LOSS: expected_step_losses[i],
+          metric_keys.MetricKeys.LOSS_MEAN: expected_step_average_losses[i],
+          'dnn/dnn/hiddenlayer_0_activation': 0.,
+          'dnn/dnn/hiddenlayer_0_fraction_of_zero_values': 0.,
+          'dnn/dnn/hiddenlayer_1_activation': 0.,
+          'dnn/dnn/hiddenlayer_1_fraction_of_zero_values': 0.,
+          'dnn/dnn/logits_activation': 0.,
+          'dnn/dnn/logits_fraction_of_zero_values': 0.,
+      }, summaries[i])
+    self._assert_checkpoint(
+        base_global_step + num_steps,
+        input_units=4,  # Sum of feature column dimensions.
+        hidden_units=hidden_units,
+        output_units=3)  # = label_dimension
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
new file mode 100644
index 0000000000000000000000000000000000000000..65142def8feabd38d929dc0b7c65aa6c629f4096
--- /dev/null
+++ b/tensorflow/python/estimator/canned/head.py
@@ -0,0 +1,669 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Abstractions for the head(s) of a model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import tf_logging as logging
+
+
+class _Head(object):
+  """Interface for the head/top of a model.
+
+  Given logits (or output of a hidden layer), a Head knows how to compute
+  predictions, loss, train_op, metrics and export outputs. It is meant to:
+
+  1. Simplify writing model_fn and to make model_fn more configurable
+  2. Support wide range of machine learning models. Since most heads can work
+     with logits, they can support DNN, RNN, Wide, Wide&Deep,
+     Global objectives, Gradient boosted trees and many other types
+     of machine learning models.
+
+  Common usage:
+  Here is simplified model_fn to build a DNN regression model.
+    ```python
+    def _my_dnn_model_fn(features, labels, mode, params, config=None):
+      # Optionally your callers can pass head to model_fn as a param.
+      head = tf.contrib.learn.regression_head(...)
+      input = tf.contrib.layers.input_from_feature_columns(features, ...)
+      last_hidden_layer_out = tf.contrib.layers.stack(
+          input, tf.contrib.layers.fully_connected, [1000, 500])
+      logits = tf.contrib.layers.fully_connected(
+          last_hidden_layer_out, head.logits_dimension, activation_fn=None)
+
+      def _train_op_fn(loss):
+        return optimizer.minimize(loss)
+
+      return head.create_estimator_spec(
+          features=features,
+          labels=labels,
+          mode=mode,
+          logits=logits,
+          train_op_fn=_train_op_fn)
+    ```
+
+  There are cases where computing and applying gradients can not be meaningfully
+  captured with train_op_fn we support (for example, with sync optimizer). In
+  such case, you can take the responsibility on your own. Here is a common
+  use case,
+    ```python
+    estimator_spec = head.create_estimator_spec(
+        features=features,
+        labels=labels,
+        mode=mode,
+        logits=logits,
+        train_op_fn=tf.contrib.learn.no_op_train_fn)
+    if mode == model_fn.ModeKeys.TRAIN:
+      optimizer = ...
+      sync = tf.train.SyncReplicasOptimizer(opt=optimizer, ...)
+      update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
+                                                  loss=estimator_spec.loss, ...)
+      hooks = [sync.make_session_run_hook(is_chief)]
+      ... upate train_op and hooks in EstimatorSpec and return
+    ```
+  """
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractproperty
+  def logits_dimension(self):
+    """Size of the last dimension of the logits `Tensor`.
+
+    Typically, logits is of shape `[batch_size, logits_dimension]`.
+
+    Returns:
+      The expected size of the `logits` tensor.
+    """
+    raise NotImplementedError('Calling an abstract method.')
+
+  @abc.abstractmethod
+  def create_estimator_spec(
+      self, features, mode, logits, labels=None, train_op_fn=None):
+    """Returns `EstimatorSpec` that a model_fn can return.
+
+    Please note that,
+    + Exactly one of `logits` and `logits_input` must be provided.
+    + All args must be passed via name.
+
+    Args:
+      features: Input `dict` of `Tensor` objects.
+      mode: Estimator's `ModeKeys`.
+      logits: logits `Tensor` to be used by the head.
+      labels: Labels `Tensor`, or `dict` of same.
+      train_op_fn: Function that takes a scalar loss `Tensor` and returns an op
+          to optimize the model with the loss. This is used in TRAIN mode and
+          must not be None. None is allowed in other modes. If you want to
+          optimize loss yourself you can pass `no_op_train_fn` and then use
+          EstimatorSpec.loss to compute and apply gradients.
+
+    Returns:
+      `EstimatorSpec`.
+    """
+    raise NotImplementedError('Calling an abstract method.')
+
+
+def _check_labels(labels, expected_labels_dimension):
+  """Check labels type and shape."""
+  with ops.name_scope(None, 'labels', (labels,)) as scope:
+    labels = sparse_tensor.convert_to_tensor_or_sparse_tensor(labels)
+    if isinstance(labels, sparse_tensor.SparseTensor):
+      raise ValueError('SparseTensor labels are not supported.')
+    labels_shape = array_ops.shape(labels)
+    err_msg = 'labels shape must be [batch_size, {}]'.format(
+        expected_labels_dimension)
+    assert_rank = check_ops.assert_rank(labels, 2, message=err_msg)
+    with ops.control_dependencies([assert_rank]):
+      static_shape = labels.shape
+      if static_shape is not None:
+        dim1 = static_shape[1]
+        if (dim1 is not None) and (dim1 != expected_labels_dimension):
+          raise ValueError(
+              'labels shape must be [batch_size, labels_dimension], got %s.' %
+              (static_shape,))
+      assert_dimension = check_ops.assert_equal(
+          expected_labels_dimension, labels_shape[1], message=err_msg)
+      with ops.control_dependencies([assert_dimension]):
+        return array_ops.identity(labels, name=scope)
+
+
+def _check_logits(logits, expected_logits_dimension):
+  """Check logits type and shape."""
+  with ops.name_scope(None, 'logits', (logits,)) as scope:
+    logits = math_ops.to_float(logits)
+    logits_shape = array_ops.shape(logits)
+    assert_rank = check_ops.assert_rank(
+        logits, 2, data=[logits_shape],
+        message='logits shape must be [batch_size, logits_dimension]')
+    with ops.control_dependencies([assert_rank]):
+      static_shape = logits.shape
+      if static_shape is not None:
+        dim1 = static_shape[1]
+        if (dim1 is not None) and (dim1 != expected_logits_dimension):
+          raise ValueError(
+              'logits shape must be [batch_size, logits_dimension], got %s.' %
+              (static_shape,))
+      assert_dimension = check_ops.assert_equal(
+          expected_logits_dimension, logits_shape[1], data=[logits_shape],
+          message='logits shape must be [batch_size, logits_dimension]')
+      with ops.control_dependencies([assert_dimension]):
+        return array_ops.identity(logits, name=scope)
+
+
+def _indicator_labels_mean(labels, weights=None, name=None):
+  with ops.name_scope(name, 'labels_mean', (labels, weights)) as scope:
+    labels = math_ops.to_float(labels, name='labels')
+    if weights is not None:
+      weights = weights_broadcast_ops.broadcast_weights(weights, labels)
+    return metrics_lib.mean(labels, weights=weights, name=scope)
+
+
+def _accuracy_baseline(labels_mean):
+  """Return accuracy baseline based on labels mean.
+
+  This is the best the model could do by always predicting one class.
+
+  Args:
+    labels_mean: Tuple of value and update op.
+
+  Returns:
+    Tuple of value and update op.
+  """
+  with ops.name_scope(None, 'accuracy_baseline', labels_mean):
+    value, update_op = labels_mean
+    return (
+        math_ops.maximum(value, 1. - value, name='value'),
+        math_ops.maximum(update_op, 1 - update_op, name='update_op'))
+
+
+def _predictions_mean(predictions, weights=None, name=None):
+  with ops.name_scope(
+      name, 'predictions_mean', (predictions, weights)) as scope:
+    predictions = math_ops.to_float(predictions, name='predictions')
+    if weights is not None:
+      weights = weights_broadcast_ops.broadcast_weights(weights, predictions)
+    return metrics_lib.mean(predictions, weights=weights, name=scope)
+
+
+def _auc(labels, predictions, weights=None, curve='ROC', name=None):
+  with ops.name_scope(name, 'auc', (predictions, labels, weights)) as scope:
+    predictions = math_ops.to_float(predictions, name='predictions')
+    if labels.dtype.base_dtype != dtypes.bool:
+      logging.warning('Casting %s labels to bool.', labels.dtype)
+      labels = math_ops.cast(labels, dtypes.bool)
+    if weights is not None:
+      weights = weights_broadcast_ops.broadcast_weights(weights, predictions)
+    return metrics_lib.auc(
+        labels=labels, predictions=predictions, weights=weights, curve=curve,
+        name=scope)
+
+
+def _accuracy_at_threshold(labels, predictions, weights, threshold, name=None):
+  with ops.name_scope(
+      name, 'accuracy_at_%s' % threshold,
+      (predictions, labels, weights, threshold)) as scope:
+    threshold_predictions = math_ops.to_float(
+        math_ops.greater_equal(predictions, threshold))
+    return metrics_lib.accuracy(
+        labels=labels, predictions=threshold_predictions, weights=weights,
+        name=scope)
+
+
+def _precision_at_threshold(labels, predictions, weights, threshold, name=None):
+  with ops.name_scope(
+      name, 'precision_at_%s' % threshold,
+      (predictions, labels, weights, threshold)) as scope:
+    precision_tensor, update_op = metrics_lib.precision_at_thresholds(
+        labels=labels, predictions=predictions, thresholds=(threshold,),
+        weights=weights, name=scope)
+    return array_ops.squeeze(precision_tensor), array_ops.squeeze(update_op)
+
+
+def _recall_at_threshold(labels, predictions, weights, threshold, name=None):
+  with ops.name_scope(
+      name, 'recall_at_%s' % threshold,
+      (predictions, labels, weights, threshold)) as scope:
+    precision_tensor, update_op = metrics_lib.recall_at_thresholds(
+        labels=labels, predictions=predictions, thresholds=(threshold,),
+        weights=weights, name=scope)
+    return array_ops.squeeze(precision_tensor), array_ops.squeeze(update_op)
+
+
+# TODO(xiejw): Add class ids for eval metrics?
+def _multi_class_head_with_softmax_cross_entropy_loss(
+    n_classes, weight_column_name=None):
+  """Creates a '_Head' for multi class classification.
+
+  This head expects to be fed integer labels specifying the class index.
+
+  Args:
+    n_classes: Number of classes, must be greater than 2 (for 2 classes, use
+      `_BinaryLogisticHeadWithSigmoidCrossEntropyLoss`).
+    weight_column_name: A string defining feature column name representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example.
+
+  Returns:
+    An instance of `_Head` for  multi class classification.
+
+  Raises:
+    ValueError: if `n_classes`, `metric_class_ids` or `label_keys` is invalid.
+  """
+  return _MultiClassHeadWithSoftmaxCrossEntropyLoss(
+      n_classes, weight_column_name)
+
+
+class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
+  """See `_multi_class_head_with_softmax_cross_entropy_loss`."""
+
+  def __init__(self, n_classes, weight_column_name=None):
+    if (n_classes is None) or (n_classes <= 2):
+      raise ValueError('n_classes must be > 2: %s.' % n_classes)
+    self._n_classes = n_classes
+    self._weight_column_name = weight_column_name
+
+  @property
+  def logits_dimension(self):
+    return self._n_classes
+
+  def _eval_metric_ops(self, labels, probabilities, logits,
+                       class_ids, weights, unweighted_loss):
+    """Returns the Eval metric ops."""
+    with ops.name_scope(
+        None, 'metrics',
+        (labels, probabilities, logits, class_ids, weights, unweighted_loss)):
+      keys = metric_keys.MetricKeys
+      metric_ops = {
+          # Estimator already adds a metric for loss.
+          # TODO(xiejw): Any other metrics?
+          keys.LOSS_MEAN: metrics_lib.mean(
+              unweighted_loss, weights=weights, name=keys.LOSS_MEAN),
+          keys.ACCURACY: metrics_lib.accuracy(
+              labels=labels, predictions=class_ids, weights=weights,
+              name=keys.ACCURACY),
+      }
+    return metric_ops
+
+  def create_estimator_spec(
+      self, features, mode, logits, labels=None, train_op_fn=None):
+    """See `Head`."""
+    with variable_scope.variable_scope(
+        None,
+        default_name='multi_class_head',
+        values=(tuple(six.itervalues(features)) + (labels, logits))):
+      logits = _check_logits(logits, self.logits_dimension)
+
+      # Predict.
+      pred_keys = prediction_keys.PredictionKeys
+      with ops.name_scope(None, 'predictions', (logits,)):
+        # class_ids's shape is [batch_size]
+        class_ids = math_ops.argmax(logits, 1, name=pred_keys.CLASSES)
+        probabilities = nn.softmax(logits, name=pred_keys.PROBABILITIES)
+        predictions = {
+            pred_keys.LOGITS: logits,
+            pred_keys.PROBABILITIES: probabilities,
+            # Expand to [batch_size, 1]
+            pred_keys.CLASSES: array_ops.expand_dims(class_ids, axis=(1,))
+        }
+      if mode == model_fn.ModeKeys.PREDICT:
+        batch_size = array_ops.shape(probabilities)[0]
+        output_classes = array_ops.tile(
+            input=array_ops.expand_dims(input=math_ops.range(self._n_classes),
+                                        axis=0),
+            multiples=[batch_size, 1])
+        return model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.PREDICT,
+            predictions=predictions,
+            export_outputs={'': export_output.ClassificationOutput(
+                scores=probabilities,
+                # `ClassificationOutput` requires string classes.
+                # TODO(xiejw): Support label_keys or label_column
+                classes=string_ops.as_string(output_classes,
+                                             name='str_classes'))})
+
+      # Eval.
+      labels = _check_labels(labels, 1)
+      # Check that we got integer for classification.
+      if not labels.dtype.is_integer:
+        raise ValueError('Labels dtype should be integer '
+                         'Instead got %s.' % labels.dtype)
+      assert_less = check_ops.assert_less(
+          labels, ops.convert_to_tensor(self._n_classes, dtype=labels.dtype),
+          message='Label IDs must < n_classes')
+      assert_greater = check_ops.assert_non_negative(
+          labels, message='Label Ids must >= 0')
+      with ops.control_dependencies((assert_less, assert_greater)):
+        labels = array_ops.identity(labels)
+
+      unweighted_loss = losses.sparse_softmax_cross_entropy(
+          labels=labels, logits=logits, reduction=losses.Reduction.NONE)
+      # Restore the squeezed dim, so unweighted_loss matches the weights shape.
+      unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=(1,))
+      weights = (
+          1. if (self._weight_column_name is None) else
+          features[self._weight_column_name])
+      weights = math_ops.to_float(weights, name='weights')
+      training_loss = losses.compute_weighted_loss(
+          unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+      if mode == model_fn.ModeKeys.EVAL:
+        return model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.EVAL,
+            predictions=predictions,
+            loss=training_loss,
+            eval_metric_ops=self._eval_metric_ops(
+                labels=labels,
+                probabilities=probabilities,
+                logits=logits,
+                class_ids=class_ids,
+                unweighted_loss=unweighted_loss,
+                weights=weights))
+
+      # Train.
+      if train_op_fn is None:
+        raise ValueError('train_op_fn can not be None.')
+      logging_ops.scalar_summary(metric_keys.MetricKeys.LOSS, training_loss)
+      logging_ops.scalar_summary(
+          metric_keys.MetricKeys.LOSS_MEAN,
+          losses.compute_weighted_loss(
+              unweighted_loss, weights=weights,
+              reduction=losses.Reduction.MEAN))
+      return model_fn.EstimatorSpec(
+          mode=model_fn.ModeKeys.TRAIN,
+          predictions=predictions,
+          loss=training_loss,
+          train_op=train_op_fn(training_loss))
+
+
+def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
+    weight_feature_key=None, thresholds=(0.5,)):
+  """Creates a `Head` for single label binary classification.
+
+  This head uses `sigmoid_cross_entropy_with_logits` loss.
+
+  This head expects to be fed float labels of shape `(batch_size, 1)`.
+
+  Args:
+    weight_feature_key: A string defining feature column name representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example.
+    thresholds: Iterable of floats in the range `(0, 1)`. For binary
+      classification metrics such as precision and recall, an eval metric is
+      generated for each threshold value. This threshold is applied to the
+      logistic values to determine the binary classification (i.e., above the
+      threshold is `true`, below is `false`.
+
+  Returns:
+    An instance of `Head` for binary classification.
+
+  Raises:
+    ValueError: if `thresholds` contains a value outside of `(0, 1)`.
+  """
+  for threshold in thresholds:
+    if (threshold <= 0.0) or (threshold >= 1.0):
+      raise ValueError('thresholds not in (0, 1): %s.' % (thresholds,))
+  return _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(
+      weight_feature_key=weight_feature_key, thresholds=thresholds)
+
+
+class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
+  """See `_binary_logistic_head_with_sigmoid_cross_entropy_loss`."""
+
+  def __init__(self, weight_feature_key=None, thresholds=None):
+    self._weight_feature_key = weight_feature_key
+    self._thresholds = tuple(thresholds)
+
+  @property
+  def logits_dimension(self):
+    return 1
+
+  def _eval_metric_ops(
+      self, labels, logits, logistic, scores, classes, unweighted_loss,
+      weights=None):
+    with ops.name_scope(
+        None, 'metrics',
+        (labels, logits, logistic, scores, classes, unweighted_loss, weights)):
+      keys = metric_keys.MetricKeys
+      labels_mean = _indicator_labels_mean(
+          labels=labels, weights=weights, name=keys.LABEL_MEAN)
+      metric_ops = {
+          # Estimator already adds a metric for loss.
+          keys.LOSS_MEAN: metrics_lib.mean(
+              unweighted_loss, weights=weights, name=keys.LOSS_MEAN),
+          keys.ACCURACY: metrics_lib.accuracy(
+              labels=labels, predictions=classes, weights=weights,
+              name=keys.ACCURACY),
+          keys.PREDICTION_MEAN: _predictions_mean(
+              predictions=logistic, weights=weights, name=keys.PREDICTION_MEAN),
+          keys.LABEL_MEAN: labels_mean,
+          keys.ACCURACY_BASELINE: _accuracy_baseline(labels_mean),
+          keys.AUC: _auc(
+              labels=labels, predictions=logistic, weights=weights,
+              name=keys.AUC),
+          keys.AUC_PR: _auc(
+              labels=labels, predictions=logistic, weights=weights, curve='PR',
+              name=keys.AUC_PR)
+      }
+      for threshold in self._thresholds:
+        accuracy_key = keys.ACCURACY_AT_THRESHOLD % threshold
+        metric_ops[accuracy_key] = _accuracy_at_threshold(
+            labels=labels, predictions=logistic, weights=weights,
+            threshold=threshold, name=accuracy_key)
+        # Precision for positive examples.
+        precision_key = keys.PRECISION_AT_THRESHOLD % threshold
+        metric_ops[precision_key] = _precision_at_threshold(
+            labels=labels, predictions=logistic, weights=weights,
+            threshold=threshold, name=precision_key)
+        # Recall for positive examples.
+        recall_key = keys.RECALL_AT_THRESHOLD % threshold
+        metric_ops[recall_key] = _recall_at_threshold(
+            labels=labels, predictions=logistic, weights=weights,
+            threshold=threshold, name=recall_key)
+      return metric_ops
+
+  def create_estimator_spec(
+      self, features, mode, logits, labels=None, train_op_fn=None):
+    """See `Head`."""
+    with variable_scope.variable_scope(
+        None, default_name='binary_logistic_head',
+        values=(tuple(six.itervalues(features)) + (labels, logits))):
+
+      # Predict.
+      pred_keys = prediction_keys.PredictionKeys
+      logits = _check_logits(logits, self.logits_dimension)
+      logistic = math_ops.sigmoid(logits, name=pred_keys.LOGISTIC)
+      two_class_logits = array_ops.concat(
+          (array_ops.zeros_like(logits), logits), 1, name='two_class_logits')
+      scores = nn.softmax(two_class_logits, name=pred_keys.PROBABILITIES)
+      classes = array_ops.reshape(
+          math_ops.argmax(two_class_logits, axis=1), (-1, 1), name='classes')
+      predictions = {
+          pred_keys.LOGITS: logits,
+          pred_keys.LOGISTIC: logistic,
+          pred_keys.PROBABILITIES: scores,
+          pred_keys.CLASSES: classes
+      }
+      if mode == model_fn.ModeKeys.PREDICT:
+        return model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.PREDICT,
+            predictions=predictions,
+            export_outputs={'': export_output.ClassificationOutput(
+                scores=scores,
+                # `ClassificationOutput` requires string classes.
+                # TODO(ptucker): Support label_keys.
+                classes=string_ops.as_string(classes, name='str_classes'))})
+
+      # Eval.
+      labels = _check_labels(math_ops.to_float(labels), self.logits_dimension)
+      unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
+          labels=labels, logits=logits, name='loss')
+      weights = (
+          1. if (self._weight_feature_key is None) else
+          features[self._weight_feature_key])
+      weights = math_ops.to_float(weights, name='weights')
+      training_loss = losses.compute_weighted_loss(
+          unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+      if mode == model_fn.ModeKeys.EVAL:
+        return model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.EVAL,
+            predictions=predictions,
+            loss=training_loss,
+            eval_metric_ops=self._eval_metric_ops(
+                labels=labels,
+                logits=logits,
+                logistic=logistic,
+                scores=scores,
+                classes=classes,
+                unweighted_loss=unweighted_loss,
+                weights=weights))
+
+      # Train.
+      if train_op_fn is None:
+        raise ValueError('train_op_fn can not be None.')
+      logging_ops.scalar_summary(metric_keys.MetricKeys.LOSS, training_loss)
+      logging_ops.scalar_summary(
+          metric_keys.MetricKeys.LOSS_MEAN,
+          losses.compute_weighted_loss(
+              unweighted_loss, weights=weights,
+              reduction=losses.Reduction.MEAN))
+      return model_fn.EstimatorSpec(
+          mode=model_fn.ModeKeys.TRAIN,
+          predictions=predictions,
+          loss=training_loss,
+          train_op=train_op_fn(training_loss))
+
+
+def _regression_head_with_mean_squared_error_loss(
+    weight_feature_key=None,
+    label_dimension=1):
+  """Creates a `_Head` for regression using the mean squared loss.
+
+  Args:
+    weight_feature_key: A string defining feature column name representing
+      weights. It is used to down weight or boost examples during training. It
+      will be multiplied by the loss of the example.
+    label_dimension: Number of regression labels per example. This is the size
+      of the last dimension of the labels `Tensor` (typically, this has shape
+      `[batch_size, label_dimension]`).
+
+  Returns:
+    An instance of `_Head` for linear regression.
+  """
+  return _RegressionHeadWithMeanSquaredErrorLoss(
+      weight_feature_key=weight_feature_key,
+      label_dimension=label_dimension)
+
+
+class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
+  """`Head` for regression using the mean squared loss."""
+
+  def __init__(self,
+               label_dimension,
+               weight_feature_key=None):
+    """`Head` for regression.
+
+    Args:
+      label_dimension: Number of regression labels per example. This is the
+        size of the last dimension of the labels `Tensor` (typically, this has
+        shape `[batch_size, label_dimension]`).
+      weight_feature_key: A string defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+
+    Raises:
+      ValueError: if `label_dimension` < 1.
+    """
+    if label_dimension < 1:
+      raise ValueError('Invalid label_dimension %s.' % label_dimension)
+    self._logits_dimension = label_dimension
+    self._weight_feature_key = weight_feature_key
+
+  @property
+  def logits_dimension(self):
+    return self._logits_dimension
+
+  def create_estimator_spec(
+      self, features, mode, logits, labels=None, train_op_fn=None):
+    """See `Head`."""
+    with variable_scope.variable_scope(
+        None,
+        default_name='regression_head',
+        values=(tuple(six.itervalues(features)) + (labels, logits))):
+
+      # Predict.
+      logits = _check_logits(logits, self._logits_dimension)
+      predictions = {prediction_keys.PredictionKeys.PREDICTIONS: logits}
+      if mode == model_fn.ModeKeys.PREDICT:
+        return model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.PREDICT,
+            predictions=predictions,
+            export_outputs={'': export_output.RegressionOutput(value=logits)})
+
+      # Eval.
+      labels = _check_labels(math_ops.to_float(labels), self._logits_dimension)
+      unweighted_loss = losses.mean_squared_error(
+          labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
+      weights = (
+          1. if (self._weight_feature_key is None) else
+          features[self._weight_feature_key])
+      weights = math_ops.to_float(weights, name='weights')
+      training_loss = losses.compute_weighted_loss(
+          unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+      if mode == model_fn.ModeKeys.EVAL:
+        # Estimator already adds a metric for loss.
+        eval_metric_ops = {
+            metric_keys.MetricKeys.LOSS_MEAN: metrics_lib.mean(
+                unweighted_loss, weights=weights)
+        }
+        return model_fn.EstimatorSpec(
+            mode=model_fn.ModeKeys.EVAL,
+            predictions=predictions,
+            loss=training_loss,
+            eval_metric_ops=eval_metric_ops)
+
+      # Train.
+      if train_op_fn is None:
+        raise ValueError('train_op_fn can not be None.')
+      logging_ops.scalar_summary(metric_keys.MetricKeys.LOSS, training_loss)
+      logging_ops.scalar_summary(
+          metric_keys.MetricKeys.LOSS_MEAN,
+          losses.compute_weighted_loss(
+              unweighted_loss, weights=weights,
+              reduction=losses.Reduction.MEAN))
+      return model_fn.EstimatorSpec(
+          mode=model_fn.ModeKeys.TRAIN,
+          predictions=predictions,
+          loss=training_loss,
+          train_op=train_op_fn(training_loss))
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..516e89d3664a0d4c44d9a4b7da6c566b82348311
--- /dev/null
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -0,0 +1,1570 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for head.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training import queue_runner_impl
+
+
+_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+
+
+def _initialize_variables(test_case, scaffold):
+  scaffold.finalize()
+  test_case.assertIsNone(scaffold.init_feed_dict)
+  test_case.assertIsNone(scaffold.init_fn)
+  scaffold.init_op.run()
+  scaffold.ready_for_local_init_op.eval()
+  scaffold.local_init_op.run()
+  scaffold.ready_op.eval()
+  test_case.assertIsNotNone(scaffold.saver)
+
+
+def _assert_simple_summaries(test_case, expected_summaries, summary_str,
+                             tol=1e-6):
+  """Assert summary the specified simple values.
+
+  Args:
+    test_case: test case.
+    expected_summaries: Dict of expected tags and simple values.
+    summary_str: Serialized `summary_pb2.Summary`.
+    tol: Tolerance for relative and absolute.
+  """
+  summary = summary_pb2.Summary()
+  summary.ParseFromString(summary_str)
+  test_case.assertAllClose(expected_summaries, {
+      v.tag: v.simple_value for v in summary.value
+  }, rtol=tol, atol=tol)
+
+
+def _assert_no_hooks(test_case, spec):
+  test_case.assertAllEqual([], spec.training_chief_hooks)
+  test_case.assertAllEqual([], spec.training_hooks)
+
+
+def _sigmoid(logits):
+  return 1 / (1 + np.exp(-logits))
+
+
+class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
+
+  def test_n_classes_is_none(self):
+    with self.assertRaisesRegexp(ValueError, 'n_classes must be > 2'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=None)
+
+  def test_n_classes_is_2(self):
+    with self.assertRaisesRegexp(ValueError, 'n_classes must be > 2'):
+      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+          n_classes=2)
+
+  def test_invalid_logits_shape(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+    self.assertEqual(n_classes, head.logits_dimension)
+
+    # Logits should be shape (batch_size, 3).
+    logits_2x2 = np.array(((45., 44.), (41., 42.),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'logits shape'):
+      head.create_estimator_spec(
+          features={'x': np.array(((30.,), (42.,),))},
+          mode=model_fn.ModeKeys.PREDICT,
+          logits=logits_2x2)
+
+    # Dynamic shape.
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((30.,), (42.,),))},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
+        spec.predictions[prediction_keys.PredictionKeys.PROBABILITIES].eval({
+            logits_placeholder: logits_2x2
+        })
+
+  def test_invalid_labels_shape(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+    self.assertEqual(n_classes, head.logits_dimension)
+
+    # Logits should be shape (batch_size, 3).
+    # Labels should be shape (batch_size, 1).
+    labels_2x2 = np.array(((45, 44), (41, 42),), dtype=np.int)
+    logits_2x3 = np.array(((1., 2., 3.), (1., 2., 3.),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'labels shape'):
+      head.create_estimator_spec(
+          features={'x': np.array(((42.,),))},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=logits_2x3,
+          labels=labels_2x2)
+
+    # Dynamic shape.
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),))},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_placeholder,
+        labels=labels_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'labels shape'):
+        spec.loss.eval({
+            logits_placeholder: logits_2x3,
+            labels_placeholder: labels_2x2
+        })
+
+  def test_invalid_labels_type(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+    self.assertEqual(n_classes, head.logits_dimension)
+
+    # Logits should be shape (batch_size, 3).
+    # Labels should be shape (batch_size, 1).
+    labels_2x1 = np.array(((1.,), (1.,),))
+    logits_2x3 = np.array(((1., 2., 3.), (1., 2., 3.),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'Labels dtype'):
+      head.create_estimator_spec(
+          features={'x': np.array(((42.,),))},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=logits_2x3,
+          labels=labels_2x1)
+
+    # Dynamic shape.
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'Labels dtype'):
+      head.create_estimator_spec(
+          features={'x': np.array(((42.,),))},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=logits_placeholder,
+          labels=labels_placeholder)
+
+  def test_invalid_labels_values(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+    self.assertEqual(n_classes, head.logits_dimension)
+
+    labels_2x1_with_large_id = np.array(((45,), (1,),), dtype=np.int)
+    labels_2x1_with_negative_id = np.array(((-5,), (1,),), dtype=np.int)
+    logits_2x3 = np.array(((1., 2., 4.), (1., 2., 3.),))
+
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),))},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_placeholder,
+        labels=labels_placeholder)
+    with self.test_session():
+      with self.assertRaisesOpError('Label IDs must < n_classes'):
+        spec.loss.eval({
+            labels_placeholder: labels_2x1_with_large_id,
+            logits_placeholder: logits_2x3
+        })
+
+    with self.test_session():
+      with self.assertRaisesOpError('Label Ids must >= 0'):
+        spec.loss.eval({
+            labels_placeholder: labels_2x1_with_negative_id,
+            logits_placeholder: logits_2x3
+        })
+
+  def test_invalid_labels_sparse_tensor(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+    self.assertEqual(n_classes, head.logits_dimension)
+
+    labels_2x1 = sparse_tensor.SparseTensor(
+        values=['english', 'italian'],
+        indices=[[0, 0], [1, 0]],
+        dense_shape=[2, 1])
+    logits_2x3 = np.array(((1., 2., 4.), (1., 2., 3.),))
+
+    with self.assertRaisesRegexp(
+        ValueError, 'SparseTensor labels are not supported.'):
+      head.create_estimator_spec(
+          features={'x': np.array(((42.,),))},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=logits_2x3,
+          labels=labels_2x1)
+
+  def test_incompatible_labels_shape(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+    self.assertEqual(n_classes, head.logits_dimension)
+
+    # Logits should be shape (batch_size, 3).
+    # Labels should be shape (batch_size, 1).
+    # Here batch sizes are different.
+    values_3x1 = np.array(((1,), (1,), (1,),))
+    values_2x3 = np.array(((1., 2., 3.), (1., 2., 3.),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'Dimensions must be equal'):
+      head.create_estimator_spec(
+          features={'x': values_2x3},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=values_2x3,
+          labels=values_3x1)
+
+    # Dynamic shape.
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': values_2x3},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_placeholder,
+        labels=labels_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(
+          errors.OpError,
+          'logits and labels must have the same first dimension'):
+        spec.loss.eval({
+            labels_placeholder: values_3x1,
+            logits_placeholder: values_2x3
+        })
+
+  def test_predict(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+    self.assertEqual(n_classes, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((1., 0., 0.), (0., 0., 1.),), dtype=np.int32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+        np.array([[0.576117, 0.2119416, 0.2119416],
+                  [0.2119416, 0.2119416, 0.576117]], dtype=np.float32),
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((0,), (2,)), dtype=np.int64),
+    }
+
+    expected_output_classes = [[b'0', b'1', b'2'], [b'0', b'1', b'2']]
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+    self.assertIsNone(spec.loss)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNone(spec.train_op)
+    self.assertItemsEqual(
+        ('', _DEFAULT_SERVING_KEY), spec.export_outputs.keys())
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions and export_outputs.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      self.assertAllClose(expected_predictions, sess.run(spec.predictions))
+      self.assertAllClose(
+          expected_predictions[prediction_keys.PredictionKeys.PROBABILITIES],
+          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].scores))
+      self.assertAllEqual(
+          expected_output_classes,
+          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].classes))
+
+  def test_weighted_multi_example_predict(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes, weight_column_name='label_weights')
+    self.assertEqual(n_classes, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((1., 0., 0.), (0., 0., 1.),), dtype=np.int32)
+    weights_2x1 = np.array(((1.,), (2.,),), dtype=np.float64)
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42,),), dtype=np.int32),
+            'label_weights': weights_2x1,
+        },
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+        np.array([[0.576117, 0.2119416, 0.2119416],
+                  [0.2119416, 0.2119416, 0.576117]], dtype=np.float32),
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((0,), (2,)), dtype=np.int64),
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+
+    # Assert predictions.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      self.assertAllClose(expected_predictions, sess.run(spec.predictions))
+
+  def test_eval(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+
+    # Create estimator spec.
+    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
+    labels = np.array(((1,), (1,)), dtype=np.int64)
+    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
+    expected_loss = 10.
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+            np.array(((1., 0., 0.), (0., 1., 0.),), dtype=np.float32),
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((0,), (1,)), dtype=np.int64),
+    }
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_loss / 2,
+        keys.ACCURACY: 0.5,  # 1 of 2 labels is correct.
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+    self.assertIsNotNone(spec.loss)
+    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
+    self.assertIsNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, and metrics.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      predictions, loss, metrics = sess.run((
+          spec.predictions, spec.loss, update_ops))
+      self.assertAllClose(expected_predictions, predictions, rtol=tol, atol=tol)
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
+          rtol=tol, atol=tol)
+
+  def test_weighted_multi_example_eval(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes, weight_column_name='label_weights')
+
+    # Create estimator spec.
+    logits = np.array(((10, 0, 0), (0, 10, 0), (0, 0, 10),), dtype=np.float32)
+    labels = np.array(((1,), (2,), (2,)), dtype=np.int64)
+    weights_3x1 = np.array(((1.,), (2.,), (3.,)), dtype=np.float64)
+    # loss = sum(cross_entropy(labels, logits) * [1, 2, 3])
+    #      = sum([10, 10, 0] * [1, 2, 3]) = 30
+    expected_loss = 30.
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42,),), dtype=np.int32),
+            'label_weights': weights_3x1,
+        },
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+            np.array(((1., 0., 0.), (0., 1., 0.), (0., 0., 1.)),
+                     dtype=np.float32),
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((0,), (1,), (2,)), dtype=np.int64),
+    }
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_loss / np.sum(weights_3x1),
+        # Weighted accuracy is 1 * 3.0 / sum weights = 0.5
+        keys.ACCURACY: 0.5,
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+    self.assertIsNotNone(spec.loss)
+    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
+    self.assertIsNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, and metrics.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      predictions, loss, metrics = sess.run((
+          spec.predictions, spec.loss, update_ops))
+      self.assertAllClose(expected_predictions, predictions, rtol=tol, atol=tol)
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
+          rtol=tol, atol=tol)
+
+  def test_train(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+
+    # Create estimator spec.
+    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
+    labels = np.array(((1,), (1,)), dtype=np.int64)
+    expected_train_result = 'my_train_op'
+    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
+    expected_loss = 10.
+
+    def _train_op_fn(loss):
+      return string_ops.string_join(
+          [constant_op.constant(expected_train_result),
+           string_ops.as_string(loss, precision=2)])
+
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.float32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+            np.array(((1., 0., 0.), (0., 1., 0.),), dtype=np.float32),
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((0,), (1,)), dtype=np.int64),
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+    self.assertIsNotNone(spec.loss)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNotNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, train_op, and summaries.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      predictions, loss, train_result, summary_str = sess.run((
+          spec.predictions, spec.loss, spec.train_op, spec.scaffold.summary_op))
+      self.assertAllClose(
+          expected_predictions, predictions, rtol=tol, atol=tol)
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
+          train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
+      }, summary_str, tol)
+
+  def test_weighted_multi_example_train(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes, weight_column_name='label_weights')
+
+    # Create estimator spec.
+    logits = np.array(((10, 0, 0), (0, 10, 0), (0, 0, 10),), dtype=np.float32)
+    labels = np.array(((1,), (2,), (2,)), dtype=np.int64)
+    weights_3x1 = np.array(((1.,), (2.,), (3.,)), dtype=np.float64)
+    expected_train_result = 'my_train_op'
+    # loss = sum(cross_entropy(labels, logits) * [1, 2, 3])
+    #      = sum([10, 10, 0] * [1, 2, 3]) = 30
+    expected_loss = 30.
+
+    def _train_op_fn(loss):
+      return string_ops.string_join(
+          [constant_op.constant(expected_train_result),
+           string_ops.as_string(loss, precision=2)])
+
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42,),), dtype=np.float32),
+            'label_weights': weights_3x1,
+        },
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+            np.array(((1., 0., 0.), (0., 1., 0.), (0., 0., 1.),),
+                     dtype=np.float32),
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((0,), (1,), (2,)), dtype=np.int64),
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+    self.assertIsNotNone(spec.loss)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNotNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, train_op, and summaries.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      predictions, loss, train_result, summary_str = sess.run((
+          spec.predictions, spec.loss, spec.train_op, spec.scaffold.summary_op))
+      self.assertAllClose(
+          expected_predictions, predictions, rtol=tol, atol=tol)
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
+          train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          # loss mean = sum(cross_entropy(labels, logits) * [1,2,3]) / (1+2+3)
+          #      = sum([10, 10, 0] * [1, 2, 3]) / 6 = 30 / 6
+          metric_keys.MetricKeys.LOSS_MEAN:
+              expected_loss / np.sum(weights_3x1),
+      }, summary_str, tol)
+
+
+# TODO(ptucker): Add thresholds tests.
+class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
+
+  def test_threshold_too_small(self):
+    with self.assertRaisesRegexp(ValueError, r'thresholds not in \(0, 1\)'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          thresholds=(0., 0.5))
+
+  def test_threshold_too_large(self):
+    with self.assertRaisesRegexp(ValueError, r'thresholds not in \(0, 1\)'):
+      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+          thresholds=(0.5, 1.))
+
+  def test_invalid_logits_shape(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Logits should be shape (batch_size, 1).
+    logits_2x2 = np.array(((45., 44.), (41., 42.),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'logits shape'):
+      head.create_estimator_spec(
+          features={'x': np.array(((42.,),))},
+          mode=model_fn.ModeKeys.PREDICT,
+          logits=logits_2x2)
+
+    # Dynamic shape.
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),))},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
+        spec.predictions[prediction_keys.PredictionKeys.PROBABILITIES].eval({
+            logits_placeholder: logits_2x2
+        })
+
+  def test_invalid_labels_shape(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Labels and logits should be shape (batch_size, 1).
+    labels_2x2 = np.array(((45., 44.), (41., 42.),))
+    logits_2x1 = np.array(((45.,), (41.,),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'labels shape'):
+      head.create_estimator_spec(
+          features={'x': np.array(((42.,),))},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=logits_2x1,
+          labels=labels_2x2)
+
+    # Dynamic shape.
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),))},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_placeholder,
+        labels=labels_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'labels shape'):
+        spec.loss.eval({
+            logits_placeholder: logits_2x1,
+            labels_placeholder: labels_2x2
+        })
+
+  def test_incompatible_labels_shape(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Both logits and labels should be shape (batch_size, 1).
+    values_2x1 = np.array(((43.,), (44.,),))
+    values_3x1 = np.array(((45.,), (46.,), (47.,),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(
+        ValueError, 'logits and labels must have the same shape'):
+      head.create_estimator_spec(
+          features={'x': values_2x1},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=values_2x1,
+          labels=values_3x1)
+    with self.assertRaisesRegexp(
+        ValueError, 'logits and labels must have the same shape'):
+      head.create_estimator_spec(
+          features={'x': values_2x1},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=values_3x1,
+          labels=values_2x1)
+
+    # Dynamic shape.
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': values_2x1},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_placeholder,
+        labels=labels_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+        spec.loss.eval({
+            labels_placeholder: values_2x1,
+            logits_placeholder: values_3x1
+        })
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+        spec.loss.eval({
+            labels_placeholder: values_3x1,
+            logits_placeholder: values_2x1
+        })
+
+  def test_predict(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (-41,),), dtype=np.int32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.LOGISTIC:
+            _sigmoid(logits).astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+            np.array(((0., 1.), (1., 0.),), dtype=np.float32),
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((1,), (0,)), dtype=np.int64),
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+    self.assertIsNone(spec.loss)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNone(spec.train_op)
+    self.assertItemsEqual(
+        ('', _DEFAULT_SERVING_KEY), spec.export_outputs.keys())
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      self.assertAllClose(expected_predictions, sess.run(spec.predictions))
+
+  def test_eval(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+
+    # Create estimator spec.
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.float32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=np.array(((1,), (1,),), dtype=np.int32))
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.LOGISTIC:
+            _sigmoid(logits).astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+            np.array(((0., 1.), (1., 0.),), dtype=np.float32),
+        # TODO(ptucker): Should this be (batch_size, 1) instead of (batch_size)?
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((1,), (0,)), dtype=np.int64),
+    }
+    default_threshold = .5
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        # loss = sum(cross_entropy(labels, logits)) = sum(0, 41) = 41
+        # loss_mean = loss/2 = 41./2 = 20.5
+        keys.LOSS_MEAN: 20.5,
+        keys.ACCURACY: 1./2,
+        keys.PREDICTION_MEAN: 1./2,
+        keys.LABEL_MEAN: 2./2,
+        keys.ACCURACY_BASELINE: 2./2,
+        keys.AUC: 0.,
+        keys.AUC_PR: 1.,
+        keys.ACCURACY_AT_THRESHOLD % default_threshold: 1./2,
+        keys.PRECISION_AT_THRESHOLD % default_threshold: 2./2,
+        keys.RECALL_AT_THRESHOLD % default_threshold: 1./2,
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+    self.assertIsNotNone(spec.loss)
+    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
+    self.assertIsNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      predictions, loss, metrics = sess.run((
+          spec.predictions, spec.loss, update_ops))
+      self.assertAllClose(expected_predictions, predictions)
+      self.assertAllClose(41., loss)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
+
+  def test_train(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+
+    # Create estimator spec.
+    logits = np.array(((45,), (-41,),), dtype=np.float32)
+    expected_train_result = b'my_train_op'
+        # loss = sum(cross_entropy(labels, logits)) = sum(0, 41) = 41
+    expected_loss = 41.
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.float32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=np.array(((1,), (1,),), dtype=np.float64),
+        train_op_fn=_train_op_fn)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.LOGISTIC:
+            _sigmoid(logits).astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+            np.array(((0., 1.), (1., 0.),), dtype=np.float32),
+        # TODO(ptucker): Should this be (batch_size, 1) instead of (batch_size)?
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((1,), (0,)), dtype=np.int64),
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+    self.assertIsNotNone(spec.loss)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNotNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, train_op, and summaries.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      predictions, loss, train_result, summary_str = sess.run((
+          spec.predictions, spec.loss, spec.train_op, spec.scaffold.summary_op))
+      self.assertAllClose(expected_predictions, predictions)
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          # loss_mean = loss/2 = 41/2 = 20.5
+          metric_keys.MetricKeys.LOSS_MEAN: 20.5,
+      }, summary_str)
+
+  def test_weighted_multi_example_predict(self):
+    """3 examples, 1 batch."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_feature_key='label_weights')
+
+    # Create estimator spec.
+    logits = np.array(((45,), (-41,), (44,)), dtype=np.int32)
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42,), (43,), (44,)), dtype=np.int32),
+            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float32),
+        },
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    expected_predictions = {
+        prediction_keys.PredictionKeys.LOGITS: logits.astype(np.float32),
+        prediction_keys.PredictionKeys.LOGISTIC:
+            _sigmoid(logits).astype(np.float32),
+        prediction_keys.PredictionKeys.PROBABILITIES:
+            np.array(((0., 1.), (1., 0.), (0., 1.)), dtype=np.float32),
+        # TODO(ptucker): Should this be (batch_size, 1) instead of (batch_size)?
+        prediction_keys.PredictionKeys.CLASSES:
+            np.array(((1,), (0,), (1,)), dtype=np.int64),
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertItemsEqual(expected_predictions.keys(), spec.predictions.keys())
+    self.assertEqual(
+        {k: v.dtype for k, v in six.iteritems(expected_predictions)},
+        {k: v.dtype.as_numpy_dtype for k, v in six.iteritems(spec.predictions)})
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertAllClose(expected_predictions, sess.run(spec.predictions))
+
+  def test_weighted_multi_example_eval(self):
+    """3 examples, 1 batch."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_feature_key='label_weights')
+
+    # Create estimator spec.
+    logits = np.array(((45,), (-41,), (44,)), dtype=np.int32)
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42,), (43,), (44,)), dtype=np.int32),
+            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float32),
+        },
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=np.array(((1,), (1,), (0,)), dtype=np.int32))
+
+    default_threshold = .5
+    # label_mean = (1*1 + .1*1 + 1.5*0)/(1 + .1 + 1.5) = 1.1/2.6
+    #            = .42307692307
+    expected_label_mean = .42307692307
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        # losses = label_weights*cross_entropy(labels, logits)
+        #        = (1*0 + .1*41 + 1.5*44) = (1, 4.1, 66)
+        # loss = sum(losses) = 1 + 4.1 + 66 = 70.1
+        # loss_mean = loss/sum(label_weights) = 70.1/(1 + .1 + 1.5)
+        #           = 70.1/2.6 = 26.9615384615
+        keys.LOSS_MEAN: 26.9615384615,
+        # accuracy = (1*1 + .1*0 + 1.5*0)/(1 + .1 + 1.5) = 1/2.6 = .38461538461
+        keys.ACCURACY: .38461538461,
+        # prediction_mean = (1*1 + .1*0 + 1.5*1)/(1 + .1 + 1.5) = 2.5/2.6
+        #                 = .96153846153
+        keys.PREDICTION_MEAN: .96153846153,
+        keys.LABEL_MEAN: expected_label_mean,
+        keys.ACCURACY_BASELINE: 1 - expected_label_mean,
+        keys.AUC: .45454565,
+        keys.AUC_PR: .6737757325172424,
+        keys.ACCURACY_AT_THRESHOLD % default_threshold: .38461538461,
+        # precision = (1*1 + 1.5*0)/(1 + 1.5) = 1/2.5 = .4
+        keys.PRECISION_AT_THRESHOLD % default_threshold: .4,
+        # recall = (1*1 + .1*0)/(1 + .1) = 1/1.1 = .90909090909
+        keys.RECALL_AT_THRESHOLD % default_threshold: .90909090909,
+    }
+
+    # Assert spec contains expected tensors.
+    self.assertIsNotNone(spec.loss)
+    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      loss, metrics = sess.run((spec.loss, update_ops))
+      self.assertAllClose(70.1, loss)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
+
+  def test_weighted_multi_example_train(self):
+    """3 examples, 1 batch."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_feature_key='label_weights')
+
+    # Create estimator spec.
+    logits = np.array(((45,), (-41,), (44,)), dtype=np.float32)
+    expected_train_result = b'my_train_op'
+    # losses = label_weights*cross_entropy(labels, logits)
+    #        = (1*0 + .1*41 + 1.5*44) = (1, 4.1, 66)
+    # loss = sum(losses) = 1 + 4.1 + 66 = 70.1
+    expected_loss = 70.1
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42.,), (43.,), (44.,)), dtype=np.float32),
+            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float64),
+        },
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=np.array(((1.,), (1.,), (0.,))),
+        train_op_fn=_train_op_fn)
+
+    # Assert spec contains expected tensors.
+    self.assertIsNotNone(spec.loss)
+    self.assertIsNotNone(spec.train_op)
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      loss, train_result, summary_str = sess.run((
+          spec.loss, spec.train_op, spec.scaffold.summary_op))
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          # loss_mean = loss/sum(label_weights) = 70.1/(1 + .1 + 1.5)
+          #           = 70.1/2.6 = 26.9615384615
+          metric_keys.MetricKeys.LOSS_MEAN: 26.9615384615,
+      }, summary_str)
+
+
+class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
+
+  def test_invalid_label_dimension(self):
+    with self.assertRaisesRegexp(ValueError, r'Invalid label_dimension'):
+      head_lib._regression_head_with_mean_squared_error_loss(label_dimension=-1)
+    with self.assertRaisesRegexp(ValueError, r'Invalid label_dimension'):
+      head_lib._regression_head_with_mean_squared_error_loss(label_dimension=0)
+
+  def test_invalid_logits(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=3)
+    self.assertEqual(3, head.logits_dimension)
+    logits_1d = np.array(((45.,), (41.,),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'logits shape'):
+      head.create_estimator_spec(
+          features={'x': np.array(((42.,),))},
+          mode=model_fn.ModeKeys.PREDICT,
+          logits=logits_1d)
+
+    # Dynamic shape.
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),))},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
+        spec.predictions[prediction_keys.PredictionKeys.PREDICTIONS].eval({
+            logits_placeholder: logits_1d
+        })
+
+  def test_incompatible_labels_eval(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=3)
+    self.assertEqual(3, head.logits_dimension)
+    values_3d = np.array(((45., 46., 47.), (41., 42., 43.),))
+    values_1d = np.array(((43.,), (44.,),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'labels shape'):
+      head.create_estimator_spec(
+          features={'x': values_1d},
+          mode=model_fn.ModeKeys.EVAL,
+          logits=values_3d,
+          labels=values_1d)
+    with self.assertRaisesRegexp(ValueError, 'logits shape'):
+      head.create_estimator_spec(
+          features={'x': values_3d}, labels=values_3d,
+          mode=model_fn.ModeKeys.EVAL, logits=values_1d, train_op_fn=None)
+
+    # Dynamic shape.
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': values_1d},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits_placeholder,
+        labels=labels_placeholder)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
+        spec.loss.eval({
+            labels_placeholder: values_3d,
+            logits_placeholder: values_1d
+        })
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'labels shape'):
+        spec.loss.eval({
+            labels_placeholder: values_1d,
+            logits_placeholder: values_3d
+        })
+
+  def test_incompatible_labels_train(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        label_dimension=3)
+    self.assertEqual(3, head.logits_dimension)
+    values_3d = np.array(((45., 46., 47.), (41., 42., 43.),))
+    values_1d = np.array(((43.,), (44.,),))
+
+    # Static shape.
+    with self.assertRaisesRegexp(ValueError, 'labels shape'):
+      head.create_estimator_spec(
+          features={'x': values_1d},
+          mode=model_fn.ModeKeys.TRAIN,
+          logits=values_3d,
+          labels=values_1d,
+          train_op_fn=lambda x: x)
+    with self.assertRaisesRegexp(ValueError, 'logits shape'):
+      head.create_estimator_spec(
+          features={'x': values_3d},
+          mode=model_fn.ModeKeys.TRAIN,
+          logits=values_1d,
+          labels=values_3d,
+          train_op_fn=lambda x: x)
+
+    # Dynamic shape.
+    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    spec = head.create_estimator_spec(
+        features={'x': values_1d},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits_placeholder,
+        labels=labels_placeholder,
+        train_op_fn=lambda x: x)
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
+        spec.loss.eval({
+            labels_placeholder: values_3d,
+            logits_placeholder: values_1d
+        })
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.OpError, 'labels shape'):
+        spec.loss.eval({
+            labels_placeholder: values_1d,
+            logits_placeholder: values_3d
+        })
+
+  def test_predict(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,),), dtype=np.int32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    # Assert spec contains expected tensors.
+    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
+    self.assertIsNone(spec.loss)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNone(spec.train_op)
+    self.assertItemsEqual(
+        ('', signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY),
+        spec.export_outputs.keys())
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions.
+    with self.test_session():
+      _initialize_variables(self, spec.scaffold)
+      self.assertAllClose(logits, spec.predictions[prediction_key].eval())
+
+  def test_eval(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.float32)},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=np.array(((43,), (44,),), dtype=np.int32))
+
+    # Assert spec contains expected tensors.
+    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
+    self.assertEqual(dtypes.float32, spec.loss.dtype)
+    self.assertItemsEqual(
+        (metric_keys.MetricKeys.LOSS_MEAN,), spec.eval_metric_ops.keys())
+    self.assertIsNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      loss_mean_value_op, loss_mean_update_op = spec.eval_metric_ops[
+          metric_keys.MetricKeys.LOSS_MEAN]
+      predictions, loss, loss_mean = sess.run((
+          spec.predictions[prediction_key], spec.loss, loss_mean_update_op))
+      self.assertAllClose(logits, predictions)
+      # loss = (43-45)^2 + (44-41)^2 = 4+9 = 13
+      self.assertAllClose(13., loss)
+      # loss_mean = loss/2 = 13/2 = 6.5
+      expected_loss_mean = 6.5
+      # Check results of both update (in `loss_mean`) and value ops.
+      self.assertAllClose(expected_loss_mean, loss_mean)
+      self.assertAllClose(expected_loss_mean, loss_mean_value_op.eval())
+
+  def test_train(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss()
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,),), dtype=np.float32)
+    expected_train_result = b'my_train_op'
+    # loss = (43-45)^2 + (44-41)^2 = 4 + 9 = 13
+    expected_loss = 13
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+    spec = head.create_estimator_spec(
+        features={'x': np.array(((42.,),), dtype=np.float32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=np.array(((43.,), (44.,),), dtype=np.float64),
+        train_op_fn=_train_op_fn)
+
+    # Assert spec contains expected tensors.
+    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
+    self.assertEqual(dtypes.float32, spec.loss.dtype)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNotNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, train_op, and summaries.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      predictions, loss, train_result, summary_str = sess.run((
+          spec.predictions[prediction_key], spec.loss, spec.train_op,
+          spec.scaffold.summary_op))
+      self.assertAllClose(logits, predictions)
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          # loss_mean = loss/2 = 13/2 = 6.5
+          metric_keys.MetricKeys.LOSS_MEAN: 6.5,
+      }, summary_str)
+
+  def test_weighted_multi_example_eval(self):
+    """1d label, 3 examples, 1 batch."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_feature_key='label_weights')
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,), (44,)), dtype=np.int32)
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42,), (43,), (44,)), dtype=np.int32),
+            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float32),
+        },
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=np.array(((35,), (42,), (45,)), dtype=np.int32))
+
+    # Assert spec contains expected tensors.
+    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
+    self.assertEqual(dtypes.float32, spec.loss.dtype)
+    self.assertItemsEqual(
+        (metric_keys.MetricKeys.LOSS_MEAN,), spec.eval_metric_ops.keys())
+    self.assertIsNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      loss_mean_value_op, loss_mean_update_op = spec.eval_metric_ops[
+          metric_keys.MetricKeys.LOSS_MEAN]
+      predictions, loss, loss_mean = sess.run((
+          spec.predictions[prediction_key], spec.loss, loss_mean_update_op))
+      self.assertAllClose(logits, predictions)
+      # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
+      self.assertAllClose(101.6, loss)
+      # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.0769231
+      expected_loss_mean = 39.0769231
+      # Check results of both update (in `loss_mean`) and value ops.
+      self.assertAllClose(expected_loss_mean, loss_mean)
+      self.assertAllClose(expected_loss_mean, loss_mean_value_op.eval())
+
+  def test_weighted_multi_example_train(self):
+    """1d label, 3 examples, 1 batch."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_feature_key='label_weights')
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45,), (41,), (44,)), dtype=np.float32)
+    expected_train_result = b'my_train_op'
+    # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
+    expected_loss = 101.6
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42,), (43,), (44,)), dtype=np.float32),
+            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float64),
+        },
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=np.array(((35.,), (42.,), (45.,)), dtype=np.float32),
+        train_op_fn=_train_op_fn)
+
+    # Assert spec contains expected tensors.
+    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
+    self.assertEqual(dtypes.float32, spec.loss.dtype)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNotNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, train_op, and summaries.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      predictions, loss, train_result, summary_str = sess.run((
+          spec.predictions[prediction_key], spec.loss, spec.train_op,
+          spec.scaffold.summary_op))
+      self.assertAllClose(logits, predictions)
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.0769231
+          metric_keys.MetricKeys.LOSS_MEAN: 39.0769231,
+      }, summary_str)
+
+  def test_weighted_multi_value_eval(self):
+    """3d label, 1 example, 1 batch."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_feature_key='label_weights', label_dimension=3)
+    self.assertEqual(3, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45., 41., 44.),))
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42., 43., 44.),)),
+            'label_weights': np.array(((1., .1, 1.5),)),
+        },
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=np.array(((35., 42., 45.),)))
+
+    # Assert spec contains expected tensors.
+    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
+    self.assertEqual(dtypes.float32, spec.loss.dtype)
+    self.assertItemsEqual(
+        (metric_keys.MetricKeys.LOSS_MEAN,), spec.eval_metric_ops.keys())
+    self.assertIsNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Assert predictions, loss, and metrics.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      loss_mean_value_op, loss_mean_update_op = spec.eval_metric_ops[
+          metric_keys.MetricKeys.LOSS_MEAN]
+      predictions, loss, loss_mean = sess.run((
+          spec.predictions[prediction_key], spec.loss, loss_mean_update_op))
+      self.assertAllClose(logits, predictions)
+      # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
+      self.assertAllClose(101.6, loss)
+      # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.076923
+      expected_loss_mean = 39.076923
+      # Check results of both update (in `loss_mean`) and value ops.
+      self.assertAllClose(expected_loss_mean, loss_mean)
+      self.assertAllClose(expected_loss_mean, loss_mean_value_op.eval())
+
+  def test_weighted_multi_value_train(self):
+    """3d label, 1 example, 1 batch."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_feature_key='label_weights', label_dimension=3)
+    self.assertEqual(3, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45., 41., 44.),))
+    expected_train_result = b'my_train_op'
+    # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
+    expected_loss = 101.6
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+    spec = head.create_estimator_spec(
+        features={
+            'x': np.array(((42., 43., 44.),)),
+            'label_weights': np.array(((1., .1, 1.5),)),
+        },
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=np.array(((35., 42., 45.),)),
+        train_op_fn=_train_op_fn)
+
+    # Assert spec contains expected tensors.
+    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
+    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
+    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
+    self.assertEqual(dtypes.float32, spec.loss.dtype)
+    self.assertEqual({}, spec.eval_metric_ops)
+    self.assertIsNotNone(spec.train_op)
+    self.assertIsNone(spec.export_outputs)
+    _assert_no_hooks(self, spec)
+
+    # Evaluate predictions, loss, train_op, and summaries.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      predictions, loss, train_result, summary_str = sess.run((
+          spec.predictions[prediction_key], spec.loss, spec.train_op,
+          spec.scaffold.summary_op))
+      self.assertAllClose(logits, predictions)
+      self.assertAllClose(expected_loss, loss)
+      self.assertEqual(expected_train_result, train_result)
+      _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.076923
+          metric_keys.MetricKeys.LOSS_MEAN: 39.076923,
+      }, summary_str)
+
+  def test_weighted_multi_batch_eval(self):
+    """1d label, 1 example, 3 batches."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_feature_key='label_weights')
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45.,), (41.,), (44.,)))
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'x': np.array(((42.,), (43.,), (44.,))),
+            'label_weights': np.array(((1.,), (.1,), (1.5,))),
+            # 'logits' is not a feature, but we use `numpy_input_fn` to make a
+            # batched version of it, and pop it off before passing to
+            # `create_estimator_spec`.
+            'logits': logits,
+        },
+        y=np.array(((35.,), (42.,), (45.,))),
+        batch_size=1,
+        num_epochs=1,
+        shuffle=False)
+    batched_features, batched_labels = input_fn()
+    batched_logits = batched_features.pop('logits')
+    spec = head.create_estimator_spec(
+        features=batched_features,
+        mode=model_fn.ModeKeys.EVAL,
+        logits=batched_logits,
+        labels=batched_labels,
+        train_op_fn=None)
+
+    # losses = [1*(35-45)^2, .1*(42-41)^2, 1.5*(45-44)^2] = [100, .1, 1.5]
+    # loss = sum(losses) = 100+.1+1.5 = 101.6
+    # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.076923
+    expected_metrics = {metric_keys.MetricKeys.LOSS_MEAN: 39.076923}
+
+    # Assert spec contains expected tensors.
+    self.assertEqual(dtypes.float32, spec.loss.dtype)
+    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
+    self.assertIsNone(spec.train_op)
+    _assert_no_hooks(self, spec)
+
+    with self.test_session() as sess:
+      # Finalize graph and initialize variables.
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      queue_runner_impl.start_queue_runners()
+
+      # Run tensors for `steps` steps.
+      steps = len(logits)
+      results = tuple([
+          sess.run((
+              spec.loss,
+              # The `[1]` gives us the metric update op.
+              {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+          )) for _ in range(steps)
+      ])
+
+      # Assert losses and metrics.
+      self.assertAllClose((100, .1, 1.5), [r[0] for r in results])
+      # For metrics, check results of both update (in `results`) and value ops.
+      # Note: we only check the result of the last step for streaming metrics.
+      self.assertAllClose(expected_metrics, results[steps - 1][1])
+      self.assertAllClose(expected_metrics, {
+          k: spec.eval_metric_ops[k][0].eval() for k in spec.eval_metric_ops
+      })
+
+  def test_weighted_multi_batch_train(self):
+    """1d label, 1 example, 3 batches."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_feature_key='label_weights')
+    self.assertEqual(1, head.logits_dimension)
+
+    # Create estimator spec.
+    logits = np.array(((45.,), (41.,), (44.,)))
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'x': np.array(((42.,), (43.,), (44.,))),
+            'label_weights': np.array(((1.,), (.1,), (1.5,))),
+            # 'logits' is not a feature, but we use `numpy_input_fn` to make a
+            # batched version of it, and pop it off before passing to
+            # `create_estimator_spec`.
+            'logits': logits,
+        },
+        y=np.array(((35.,), (42.,), (45.,))),
+        batch_size=1,
+        num_epochs=1,
+        shuffle=False)
+    batched_features, batched_labels = input_fn()
+    batched_logits = batched_features.pop('logits')
+    spec = head.create_estimator_spec(
+        features=batched_features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=batched_logits,
+        labels=batched_labels,
+        train_op_fn=lambda loss: loss * -7.)
+
+    # Assert spec contains expected tensors.
+    self.assertEqual(dtypes.float32, spec.loss.dtype)
+    self.assertIsNotNone(spec.train_op)
+
+    with self.test_session() as sess:
+      # Finalize graph and initialize variables.
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNotNone(spec.scaffold.summary_op)
+      queue_runner_impl.start_queue_runners()
+
+      results = tuple([
+          sess.run((spec.loss, spec.train_op)) for _ in range(len(logits))
+      ])
+
+      # losses = [1*(35-45)^2, .1*(42-41)^2, 1.5*(45-44)^2] = [100, .1, 1.5]
+      expected_losses = np.array((100, .1, 1.5))
+      self.assertAllClose(expected_losses, [r[0] for r in results])
+      self.assertAllClose(expected_losses * -7., [r[1] for r in results])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e471f5c918b5df126f63f8703dd30f17e8711be
--- /dev/null
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -0,0 +1,293 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Linear Estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import six
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import ftrl
+from tensorflow.python.training import training_util
+
+
+# The default learning rate of 0.2 is a historical artifact of the initial
+# implementation, but seems a reasonable choice.
+_LEARNING_RATE = 0.2
+
+
+def _get_default_optimizer(feature_columns):
+  learning_rate = min(_LEARNING_RATE, 1.0 / math.sqrt(len(feature_columns)))
+  return ftrl.FtrlOptimizer(learning_rate=learning_rate)
+
+
+# TODO(b/36813849): Revisit passing params vs named arguments.
+def _linear_model_fn(features, labels, mode, params, config):
+  """A model_fn for linear models that use a gradient-based optimizer.
+
+  Args:
+    features: Dict of `Tensor`.
+    labels: `Tensor` of shape `[batch_size, logits_dimension]`.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    params: A dict of hyperparameters.
+      The following hyperparameters are expected:
+      * head: A `Head` instance.
+      * feature_columns: An iterable containing all the feature columns used by
+          the model.
+      * optimizer: string, `Optimizer` object, or callable that defines the
+          optimizer to use for training. If `None`, will use a FTRL optimizer.
+    config: `RunConfig` object to configure the runtime settings.
+
+  Returns:
+    An `EstimatorSpec` instance.
+
+  Raises:
+    ValueError: If mode or params are invalid.
+  """
+  head = params['head']
+  feature_columns = tuple(params['feature_columns'])
+  optimizer = optimizers.get_optimizer_instance(
+      params.get('optimizer') or _get_default_optimizer(feature_columns),
+      learning_rate=_LEARNING_RATE)
+  num_ps_replicas = config.num_ps_replicas if config else 0
+
+  partitioner = params.get('partitioner') or (
+      partitioned_variables.min_max_variable_partitioner(
+          max_partitions=num_ps_replicas,
+          min_slice_size=64 << 20))
+
+  with variable_scope.variable_scope(
+      'linear',
+      values=tuple(six.itervalues(features)),
+      partitioner=partitioner):
+
+    logits = feature_column_lib.linear_model(
+        features=features,
+        feature_columns=feature_columns,
+        units=head.logits_dimension)
+
+    def _train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      return optimizer.minimize(
+          loss,
+          global_step=training_util.get_global_step())
+
+    return head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+
+
+class LinearClassifier(estimator.Estimator):
+  """Linear classifier model.
+
+  Train a linear model to classify instances into one of multiple possible
+  classes. When number of possible classes is 2, this is binary classification.
+
+  Example:
+
+  ```python
+  sparse_column_a = sparse_column_with_hash_bucket(...)
+  sparse_column_b = sparse_column_with_hash_bucket(...)
+
+  sparse_feature_a_x_sparse_feature_b = crossed_column(...)
+
+  # Estimator using the default optimizer.
+  estimator = LinearClassifier(
+      feature_columns=[sparse_column_a, sparse_feature_a_x_sparse_feature_b])
+
+  # Or estimator using the FTRL optimizer with regularization.
+  estimator = LinearClassifier(
+      feature_columns=[sparse_column_a, sparse_feature_a_x_sparse_feature_b],
+      optimizer=tf.train.FtrlOptimizer(
+        learning_rate=0.1,
+        l1_regularization_strength=0.001
+      ))
+
+  # Input builders
+  def input_fn_train: # returns x, y (where y represents label's class index).
+    ...
+  def input_fn_eval: # returns x, y (where y represents label's class index).
+    ...
+  estimator.train(input_fn=input_fn_train)
+  estimator.evaluate(input_fn=input_fn_eval)
+  estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+    otherwise there will be a `KeyError`:
+
+  * if `weight_feature_key` is not `None`, a feature with
+    `key=weight_feature_key` whose value is a `Tensor`.
+  * for each `column` in `feature_columns`:
+    - if `column` is a `SparseColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `WeightedSparseColumn`, two features: the first with
+      `key` the id column name, the second with `key` the weight column name.
+      Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `RealValuedColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+  """
+
+  def __init__(self,
+               feature_columns,
+               model_dir=None,
+               n_classes=2,
+               weight_feature_key=None,
+               optimizer=None,
+               config=None,
+               partitioner=None):
+    """Construct a `LinearClassifier` estimator object.
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      n_classes: number of label classes. Default is binary classification.
+        Note that class labels are integers representing the class index (i.e.
+        values from 0 to n_classes-1). For arbitrary label values (e.g. string
+        labels), convert to class indices first.
+      weight_feature_key: A string defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+      optimizer: The optimizer used to train the model. If specified, it should
+        be either an instance of `tf.Optimizer` or the SDCAOptimizer. If `None`,
+        the Ftrl optimizer will be used.
+      config: `RunConfig` object to configure the runtime settings.
+      partitioner: Optional. Partitioner for input layer.
+
+    Returns:
+      A `LinearClassifier` estimator.
+
+    Raises:
+      ValueError: if n_classes < 2.
+    """
+    super(LinearClassifier, self).__init__(
+        model_fn=_linear_model_fn,
+        model_dir=model_dir,
+        config=config,
+        params={
+            # pylint: disable=protected-access
+            # TODO(xiejw): Switch to the classifier head.
+            'head': head_lib._regression_head_with_mean_squared_error_loss(
+                label_dimension=n_classes,
+                weight_feature_key=weight_feature_key),
+            # pylint: enable=protected-access
+            'feature_columns': feature_columns,
+            'optimizer': optimizer,
+            'partitioner': partitioner,
+        })
+
+
+class LinearRegressor(estimator.Estimator):
+  """An estimator for TensorFlow Linear regression problems.
+
+  Train a linear regression model to predict label value given observation of
+  feature values.
+
+  Example:
+
+  ```python
+  sparse_column_a = sparse_column_with_hash_bucket(...)
+  sparse_column_b = sparse_column_with_hash_bucket(...)
+
+  sparse_feature_a_x_sparse_feature_b = crossed_column(...)
+
+  estimator = LinearRegressor(
+      feature_columns=[sparse_column_a, sparse_feature_a_x_sparse_feature_b])
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    ...
+  def input_fn_eval: # returns x, y
+    ...
+  estimator.train(input_fn=input_fn_train)
+  estimator.evaluate(input_fn=input_fn_eval)
+  estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+    otherwise there will be a KeyError:
+
+  * if `weight_feature_key` is not `None`:
+    key=weight_feature_key, value=a `Tensor`
+  * for column in `feature_columns`:
+    - if isinstance(column, `SparseColumn`):
+        key=column.name, value=a `SparseTensor`
+    - if isinstance(column, `WeightedSparseColumn`):
+        {key=id column name, value=a `SparseTensor`,
+         key=weight column name, value=a `SparseTensor`}
+    - if isinstance(column, `RealValuedColumn`):
+        key=column.name, value=a `Tensor`
+  """
+
+  def __init__(self,
+               feature_columns,
+               model_dir=None,
+               label_dimension=1,
+               weight_feature_key=None,
+               optimizer=None,
+               config=None,
+               partitioner=None):
+    """Initializes a `LinearRegressor` instance.
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      label_dimension: Number of regression targets per example. This is the
+        size of the last dimension of the labels and logits `Tensor` objects
+        (typically, these have shape `[batch_size, label_dimension]`).
+      weight_feature_key: A string defining feature column name representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example.
+      optimizer: string, `tf.Optimizer` object, or callable that returns
+        `tf.Optimizer`. Defines the optimizer to use for training. If `None`,
+        will use the FTRL optimizer.
+      config: `RunConfig` object to configure the runtime settings.
+      partitioner: Optional. Partitioner for input layer.
+    """
+    super(LinearRegressor, self).__init__(
+        model_fn=_linear_model_fn,
+        model_dir=model_dir,
+        config=config,
+        params={
+            # pylint: disable=protected-access
+            'head': head_lib._regression_head_with_mean_squared_error_loss(
+                label_dimension=label_dimension,
+                weight_feature_key=weight_feature_key),
+            # pylint: enable=protected-access
+            'feature_columns': feature_columns,
+            'optimizer': optimizer,
+            'partitioner': partitioner,
+        })
diff --git a/tensorflow/python/estimator/canned/linear_test.py b/tensorflow/python/estimator/canned/linear_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca3af89473958fcb794c56269eca11f0a5f50158
--- /dev/null
+++ b/tensorflow/python/estimator/canned/linear_test.py
@@ -0,0 +1,662 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for linear.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator.canned import linear
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import saver
+from tensorflow.python.training import session_run_hook
+
+
+# Names of variables created by model.
+_AGE_WEIGHT_NAME = 'linear/linear_model/age/weights'
+_HEIGHT_WEIGHT_NAME = 'linear/linear_model/height/weights'
+_BIAS_NAME = 'linear/linear_model/bias_weights'
+_LANGUAGE_WEIGHT_NAME = 'linear/linear_model/language/weights'
+
+
+def _save_variables_to_ckpt(model_dir):
+  init_all_op = [variables.global_variables_initializer()]
+  with tf_session.Session() as sess:
+    sess.run(init_all_op)
+    saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
+
+
+class _CheckPartitionerVarHook(session_run_hook.SessionRunHook):
+  """A `SessionRunHook` to check a paritioned variable."""
+
+  def __init__(self, test_case, var_name, var_dim, partitions):
+    self._test_case = test_case
+    self._var_name = var_name
+    self._var_dim = var_dim
+    self._partitions = partitions
+
+  def begin(self):
+    with variable_scope.variable_scope(
+        variable_scope.get_variable_scope()) as scope:
+      scope.reuse_variables()
+      partitioned_weight = variable_scope.get_variable(
+          self._var_name, shape=(self._var_dim, 1))
+      self._test_case.assertTrue(
+          isinstance(partitioned_weight, variables.PartitionedVariable))
+      for part in partitioned_weight:
+        self._test_case.assertEqual(self._var_dim // self._partitions,
+                                    part.get_shape()[0])
+
+
+class LinearRegressorPartitionerTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def testPartitioner(self):
+    x_dim = 64
+    partitions = 4
+
+    def _partitioner(shape, dtype):
+      del dtype  # unused; required by Fn signature.
+      # Only partition the embedding tensor.
+      return [partitions, 1] if shape[0] == x_dim else [1]
+
+    regressor = linear.LinearRegressor(
+        feature_columns=(
+            feature_column_lib.categorical_column_with_hash_bucket(
+                'language', hash_bucket_size=x_dim),),
+        partitioner=_partitioner,
+        model_dir=self._model_dir)
+
+    def _input_fn():
+      return {
+          'language': sparse_tensor.SparseTensor(
+              values=['english', 'spanish'],
+              indices=[[0, 0], [0, 1]],
+              dense_shape=[1, 2])
+      }, [[10.]]
+
+    hook = _CheckPartitionerVarHook(
+        self, _LANGUAGE_WEIGHT_NAME, x_dim, partitions)
+    regressor.train(
+        input_fn=_input_fn, steps=1, hooks=[hook])
+
+  def testDefaultPartitionerWithMultiplePsReplicas(self):
+    partitions = 2
+    x_dim = 4 * 64 << 20
+
+    class FakeRunConfig(run_config.RunConfig):
+
+      @property
+      def num_ps_replicas(self):
+        return partitions
+
+    # Mock the device setter as ps is not available on test machines.
+    with test.mock.patch.object(estimator,
+                                '_get_replica_device_setter',
+                                return_value=lambda _: '/cpu:0'):
+      linear_regressor = linear.LinearRegressor(
+          feature_columns=(
+              feature_column_lib.categorical_column_with_hash_bucket(
+                  'language', hash_bucket_size=x_dim),),
+          config=FakeRunConfig(),
+          model_dir=self._model_dir)
+
+      def _input_fn():
+        return {
+            'language': sparse_tensor.SparseTensor(
+                values=['english', 'spanish'],
+                indices=[[0, 0], [0, 1]],
+                dense_shape=[1, 2])
+        }, [[10.]]
+
+      hook = _CheckPartitionerVarHook(
+          self, _LANGUAGE_WEIGHT_NAME, x_dim, partitions)
+      linear_regressor.train(
+          input_fn=_input_fn, steps=1, hooks=[hook])
+
+
+# TODO(b/36813849): Add tests with dynamic shape inputs using placeholders.
+class LinearRegressorEvaluationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def test_evaluation_for_simple_data(self):
+    with ops.Graph().as_default():
+      variables.Variable([[11.0]], name=_AGE_WEIGHT_NAME)
+      variables.Variable([2.0], name=_BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(feature_column_lib.numeric_column('age'),),
+        model_dir=self._model_dir)
+    eval_metrics = linear_regressor.evaluate(
+        input_fn=lambda: ({'age': ((1,),)}, ((10.,),)), steps=1)
+
+    # Logit is (1. * 11.0 + 2.0) = 13, while label is 10. Loss is 3**2 = 9.
+    self.assertDictEqual({
+        metric_keys.MetricKeys.LOSS: 9.,
+        metric_keys.MetricKeys.LOSS_MEAN: 9.,
+        ops.GraphKeys.GLOBAL_STEP: 100
+    }, eval_metrics)
+
+  def test_evaluation_batch(self):
+    """Tests evaluation for batch_size==2."""
+    with ops.Graph().as_default():
+      variables.Variable([[11.0]], name=_AGE_WEIGHT_NAME)
+      variables.Variable([2.0], name=_BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(feature_column_lib.numeric_column('age'),),
+        model_dir=self._model_dir)
+    eval_metrics = linear_regressor.evaluate(
+        input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1)
+
+    # Logit is (1. * 11.0 + 2.0) = 13, while label is 10.
+    # Loss per example is 3**2 = 9.
+    # Training loss is the sum over batch = 9 + 9 = 18
+    # Average loss is the average over batch = 9
+    self.assertDictEqual({
+        metric_keys.MetricKeys.LOSS: 18.,
+        metric_keys.MetricKeys.LOSS_MEAN: 9.,
+        ops.GraphKeys.GLOBAL_STEP: 100
+    }, eval_metrics)
+
+  def test_evaluation_weights(self):
+    """Tests evaluation with weights."""
+    with ops.Graph().as_default():
+      variables.Variable([[11.0]], name=_AGE_WEIGHT_NAME)
+      variables.Variable([2.0], name=_BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    def _input_fn():
+      features = {
+          'age': ((1,), (1,)),
+          'weights': ((1.,), (2.,))
+      }
+      labels = ((10.,), (10.,))
+      return features, labels
+
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(feature_column_lib.numeric_column('age'),),
+        weight_feature_key='weights',
+        model_dir=self._model_dir)
+    eval_metrics = linear_regressor.evaluate(input_fn=_input_fn, steps=1)
+
+    # Logit is (1. * 11.0 + 2.0) = 13, while label is 10.
+    # Loss per example is 3**2 = 9.
+    # Training loss is the weighted sum over batch = 9 + 2*9 = 27
+    # average loss is the weighted average = 9 + 2*9 / (1 + 2) = 9
+    self.assertDictEqual({
+        metric_keys.MetricKeys.LOSS: 27.,
+        metric_keys.MetricKeys.LOSS_MEAN: 9.,
+        ops.GraphKeys.GLOBAL_STEP: 100
+    }, eval_metrics)
+
+  def test_evaluation_for_multi_dimensions(self):
+    x_dim = 3
+    label_dim = 2
+    with ops.Graph().as_default():
+      variables.Variable(
+          [[1.0, 2.0],
+           [3.0, 4.0],
+           [5.0, 6.0]],
+          name=_AGE_WEIGHT_NAME)
+      variables.Variable([7.0, 8.0], name=_BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(
+            feature_column_lib.numeric_column('age', shape=(x_dim,)),),
+        label_dimension=label_dim,
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'age': np.array([[2., 4., 5.]]),
+        },
+        y=np.array([[46., 58.]]),
+        batch_size=1,
+        num_epochs=None,
+        shuffle=False)
+    eval_metrics = linear_regressor.evaluate(
+        input_fn=input_fn, steps=1)
+
+    self.assertItemsEqual((
+        metric_keys.MetricKeys.LOSS,
+        metric_keys.MetricKeys.LOSS_MEAN,
+        ops.GraphKeys.GLOBAL_STEP
+    ), eval_metrics.keys())
+
+    # Logit is
+    #   [2., 4., 5.] * [1.0, 2.0] + [7.0, 8.0] = [39, 50] + [7.0, 8.0]
+    #                  [3.0, 4.0]
+    #                  [5.0, 6.0]
+    # which is [46, 58]
+    self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
+
+  def test_evaluation_for_multiple_feature_columns(self):
+    with ops.Graph().as_default():
+      variables.Variable([[10.0]], name=_AGE_WEIGHT_NAME)
+      variables.Variable([[2.0]], name=_HEIGHT_WEIGHT_NAME)
+      variables.Variable([5.0], name=_BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    batch_size = 2
+    feature_columns = [
+        feature_column_lib.numeric_column('age'),
+        feature_column_lib.numeric_column('height')
+    ]
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'age': np.array([20, 40]),
+            'height': np.array([4, 8])
+        },
+        y=np.array([[213.], [421.]]),
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=False)
+
+    est = linear.LinearRegressor(
+        feature_columns=feature_columns,
+        model_dir=self._model_dir)
+
+    eval_metrics = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertItemsEqual((
+        metric_keys.MetricKeys.LOSS,
+        metric_keys.MetricKeys.LOSS_MEAN,
+        ops.GraphKeys.GLOBAL_STEP
+    ), eval_metrics.keys())
+
+    # Logit is [(20. * 10.0 + 4 * 2.0 + 5.0), (40. * 10.0 + 8 * 2.0 + 5.0)] =
+    # [213.0, 421.0], while label is [213., 421.]. Loss = 0.
+    self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
+
+
+class LinearRegressorPredictTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def test_1d(self):
+    """Tests predict when all variables are one-dimensional."""
+    with ops.Graph().as_default():
+      variables.Variable([[10.]], name='linear/linear_model/x/weights')
+      variables.Variable([.2], name=_BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(feature_column_lib.numeric_column('x'),),
+        model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': np.array([[2.]])}, y=None, batch_size=1, num_epochs=1,
+        shuffle=False)
+    predictions = linear_regressor.predict(input_fn=predict_input_fn)
+    predicted_scores = list([x['predictions'] for x in predictions])
+    # x * weight + bias = 2. * 10. + .2 = 20.2
+    self.assertAllClose([[20.2]], predicted_scores)
+
+  def testMultiDim(self):
+    """Tests predict when all variables are multi-dimenstional."""
+    batch_size = 2
+    label_dimension = 3
+    x_dim = 4
+    feature_columns = (
+        feature_column_lib.numeric_column('x', shape=(x_dim,)),)
+    with ops.Graph().as_default():
+      variables.Variable(  # shape=[x_dim, label_dimension]
+          [[1., 2., 3.],
+           [2., 3., 4.],
+           [3., 4., 5.],
+           [4., 5., 6.]],
+          name='linear/linear_model/x/weights')
+      variables.Variable(  # shape=[label_dimension]
+          [.2, .4, .6], name=_BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=feature_columns,
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        # x shape=[batch_size, x_dim]
+        x={'x': np.array([[1., 2., 3., 4.],
+                          [5., 6., 7., 8.]])},
+        y=None, batch_size=batch_size, num_epochs=1, shuffle=False)
+    predictions = linear_regressor.predict(input_fn=predict_input_fn)
+    predicted_scores = list([x['predictions'] for x in predictions])
+    # score = x * weight + bias, shape=[batch_size, label_dimension]
+    self.assertAllClose(
+        [[30.2, 40.4, 50.6], [70.2, 96.4, 122.6]], predicted_scores)
+
+  def testTwoFeatureColumns(self):
+    """Tests predict with two feature columns."""
+    with ops.Graph().as_default():
+      variables.Variable([[10.]], name='linear/linear_model/x0/weights')
+      variables.Variable([[20.]], name='linear/linear_model/x1/weights')
+      variables.Variable([.2], name=_BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(
+            feature_column_lib.numeric_column('x0'),
+            feature_column_lib.numeric_column('x1')),
+        model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x0': np.array([[2.]]),
+           'x1': np.array([[3.]])},
+        y=None, batch_size=1, num_epochs=1,
+        shuffle=False)
+    predictions = linear_regressor.predict(input_fn=predict_input_fn)
+    predicted_scores = list([x['predictions'] for x in predictions])
+    # x0 * weight0 + x1 * weight1 + bias = 2. * 10. + 3. * 20 + .2 = 80.2
+    self.assertAllClose([[80.2]], predicted_scores)
+
+
+class LinearRegressorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def test_complete_flow(self):
+    label_dimension = 2
+    batch_size = 10
+    feature_columns = [
+        feature_column_lib.numeric_column('x', shape=(2,))
+    ]
+    est = linear.LinearRegressor(
+        feature_columns=feature_columns, label_dimension=label_dimension,
+        model_dir=self._model_dir)
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+
+    # TRAIN
+    # learn y = x
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, y=data, batch_size=batch_size, num_epochs=None,
+        shuffle=True)
+    est.train(train_input_fn, steps=200)
+
+    # EVALUTE
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, y=data, batch_size=batch_size, num_epochs=1,
+        shuffle=False)
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
+
+    # PREDICT
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, y=None, batch_size=batch_size, num_epochs=1,
+        shuffle=False)
+    predictions = list(
+        [x['predictions'] for x in est.predict(predict_input_fn)])
+    self.assertAllClose(data, predictions, atol=0.01)
+
+    # EXPORT
+    feature_spec = feature_column_lib.make_parse_example_spec(
+        feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+
+def _assert_close(expected, actual, rtol=1e-04, name='assert_close'):
+  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
+    expected = ops.convert_to_tensor(expected, name='expected')
+    actual = ops.convert_to_tensor(actual, name='actual')
+    rdiff = math_ops.abs(expected - actual, 'diff') / expected
+    rtol = ops.convert_to_tensor(rtol, name='rtol')
+    return check_ops.assert_less(
+        rdiff,
+        rtol,
+        data=(
+            'Condition expected =~ actual did not hold element-wise:'
+            'expected = ', expected,
+            'actual = ', actual,
+            'rdiff = ', rdiff,
+            'rtol = ', rtol,
+        ),
+        name=scope)
+
+
+class LinearRegressorTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def _mockOptimizer(self, expected_loss=None):
+    expected_var_names = [
+        '%s/part_0:0' % _AGE_WEIGHT_NAME,
+        '%s/part_0:0' % _BIAS_NAME
+    ]
+
+    def _minimize(loss, global_step):
+      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(
+          expected_var_names,
+          [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        return state_ops.assign_add(global_step, 1).op
+      assert_loss = _assert_close(
+          math_ops.to_float(expected_loss, name='expected'), loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        return state_ops.assign_add(global_step, 1).op
+
+    mock_optimizer = test.mock.NonCallableMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
+    # So, return mock_optimizer itself for deepcopy.
+    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
+    return mock_optimizer
+
+  def _assertCheckpoint(
+      self, expected_global_step, expected_age_weight=None, expected_bias=None):
+    shapes = {
+        name: shape for (name, shape) in
+        checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(
+        expected_global_step,
+        checkpoint_utils.load_variable(
+            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
+
+    self.assertEqual([1, 1], shapes[_AGE_WEIGHT_NAME])
+    if expected_age_weight is not None:
+      self.assertEqual(
+          expected_age_weight,
+          checkpoint_utils.load_variable(self._model_dir, _AGE_WEIGHT_NAME))
+
+    self.assertEqual([1], shapes[_BIAS_NAME])
+    if expected_bias is not None:
+      self.assertEqual(
+          expected_bias,
+          checkpoint_utils.load_variable(self._model_dir, _BIAS_NAME))
+
+  def testFromScratchWithDefaultOptimizer(self):
+    # Create LinearRegressor.
+    label = 5.
+    age = 17
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(feature_column_lib.numeric_column('age'),),
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    linear_regressor.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self._assertCheckpoint(num_steps)
+
+  def testFromScratch(self):
+    # Create LinearRegressor.
+    label = 5.
+    age = 17
+    # loss = (logits - label)^2 = (0 - 5.)^2 = 25.
+    mock_optimizer = self._mockOptimizer(expected_loss=25.)
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(feature_column_lib.numeric_column('age'),),
+        model_dir=self._model_dir, optimizer=mock_optimizer)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    linear_regressor.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assertCheckpoint(
+        expected_global_step=num_steps,
+        expected_age_weight=0.,
+        expected_bias=0.)
+
+  def testFromCheckpoint(self):
+    # Create initial checkpoint.
+    age_weight = 10.0
+    bias = 5.0
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable([[age_weight]], name=_AGE_WEIGHT_NAME)
+      variables.Variable([bias], name=_BIAS_NAME)
+      variables.Variable(
+          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    # logits = age * age_weight + bias = 17 * 10. + 5. = 175
+    # loss = (logits - label)^2 = (175 - 5)^2 = 28900
+    mock_optimizer = self._mockOptimizer(expected_loss=28900.)
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(feature_column_lib.numeric_column('age'),),
+        model_dir=self._model_dir, optimizer=mock_optimizer)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    linear_regressor.train(
+        input_fn=lambda: ({'age': ((17,),)}, ((5.,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assertCheckpoint(
+        expected_global_step=initial_global_step + num_steps,
+        expected_age_weight=age_weight,
+        expected_bias=bias)
+
+  def testFromCheckpointMultiBatch(self):
+    # Create initial checkpoint.
+    age_weight = 10.0
+    bias = 5.0
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable([[age_weight]], name=_AGE_WEIGHT_NAME)
+      variables.Variable([bias], name=_BIAS_NAME)
+      variables.Variable(
+          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      _save_variables_to_ckpt(self._model_dir)
+
+    # logits = age * age_weight + bias
+    # logits[0] = 17 * 10. + 5. = 175
+    # logits[1] = 15 * 10. + 5. = 155
+    # loss = sum(logits - label)^2 = (175 - 5)^2 + (155 - 3)^2 = 52004
+    mock_optimizer = self._mockOptimizer(expected_loss=52004.)
+    linear_regressor = linear.LinearRegressor(
+        feature_columns=(feature_column_lib.numeric_column('age'),),
+        model_dir=self._model_dir, optimizer=mock_optimizer)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    linear_regressor.train(
+        input_fn=lambda: ({'age': ((17,), (15,))}, ((5.,), (3.,))),
+        steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assertCheckpoint(
+        expected_global_step=initial_global_step + num_steps,
+        expected_age_weight=age_weight,
+        expected_bias=bias)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/estimator/canned/metric_keys.py b/tensorflow/python/estimator/canned/metric_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..1261d1dcfb13243a491e8dff47b3975e1c3c6803
--- /dev/null
+++ b/tensorflow/python/estimator/canned/metric_keys.py
@@ -0,0 +1,41 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Enum for model prediction keys."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import model_fn
+
+
+# TODO(pucker): Merge with model_fn.MetricKeys once we've worked out out naming
+# conventions.
+class MetricKeys(object):
+  """Metric key strings."""
+  LOSS = model_fn.MetricKeys.LOSS
+  LOSS_MEAN = model_fn.MetricKeys.AVERAGE_LOSS
+
+  ACCURACY = 'accuracy'
+  ACCURACY_BASELINE = 'accuracy_baseline'
+  AUC = 'auc'
+  AUC_PR = 'auc_precision_recall'
+  LABEL_MEAN = 'label/mean'
+  PREDICTION_MEAN = 'prediction/mean'
+
+  # The following require a threshold applied, should be float in range (0, 1).
+  ACCURACY_AT_THRESHOLD = 'accuracy/positive_threshold_%g'
+  PRECISION_AT_THRESHOLD = 'precision/positive_threshold_%g'
+  RECALL_AT_THRESHOLD = 'recall/positive_threshold_%g'
diff --git a/tensorflow/python/estimator/canned/optimizers.py b/tensorflow/python/estimator/canned/optimizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..f72c5ca5cbb2721d967ad9ef9dfa896f7ccce240
--- /dev/null
+++ b/tensorflow/python/estimator/canned/optimizers.py
@@ -0,0 +1,78 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Methods related to optimizers used in canned_estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import adam
+from tensorflow.python.training import ftrl
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import rmsprop
+
+
+_OPTIMIZER_CLS_NAMES = {
+    'Adagrad': adagrad.AdagradOptimizer,
+    'Adam': adam.AdamOptimizer,
+    'Ftrl': ftrl.FtrlOptimizer,
+    'RMSProp': rmsprop.RMSPropOptimizer,
+    'SGD': gradient_descent.GradientDescentOptimizer,
+}
+
+
+def get_optimizer_instance(opt, learning_rate=None):
+  """Returns an optimizer instance.
+
+  Supports the following types for the given `opt`:
+  * An `Optimizer` instance: Returns the given `opt`.
+  * A string: Creates an `Optimizer` subclass with the given `learning_rate`.
+    Supported strings:
+    * 'Adagrad': Returns an `AdagradOptimizer`.
+    * 'Adam': Returns an `AdamOptimizer`.
+    * 'Ftrl': Returns an `FtrlOptimizer`.
+    * 'RMSProp': Returns an `RMSPropOptimizer`.
+    * 'SGD': Returns a `GradientDescentOptimizer`.
+
+  Args:
+    opt: An `Optimizer` instance, or string, as discussed above.
+    learning_rate: A float. Only used if `opt` is a string.
+
+  Returns:
+    An `Optimizer` instance.
+
+  Raises:
+    ValueError: If `opt` is an unsupported string.
+    ValueError: If `opt` is a supported string but `learning_rate` was not
+      specified.
+    ValueError: If `opt` is none of the above types.
+  """
+  if isinstance(opt, six.string_types):
+    if opt in six.iterkeys(_OPTIMIZER_CLS_NAMES):
+      if not learning_rate:
+        raise ValueError('learning_rate must be specified when opt is string.')
+      return _OPTIMIZER_CLS_NAMES[opt](learning_rate=learning_rate)
+    raise ValueError(
+        'Unsupported optimizer name: {}. Supported names are: {}'.format(
+            opt, tuple(sorted(six.iterkeys(_OPTIMIZER_CLS_NAMES)))))
+  if not isinstance(opt, optimizer_lib.Optimizer):
+    raise ValueError(
+        'The given object is not an Optimizer instance. Given: {}'.format(opt))
+  return opt
diff --git a/tensorflow/python/estimator/canned/optimizers_test.py b/tensorflow/python/estimator/canned/optimizers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee28756155afd5ae3421475c3d41542db9411345
--- /dev/null
+++ b/tensorflow/python/estimator/canned/optimizers_test.py
@@ -0,0 +1,85 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for optimizers.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.platform import test
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import adam
+from tensorflow.python.training import ftrl
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import rmsprop
+
+
+class GetOptimizerInstance(test.TestCase):
+
+  def test_unsupported_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Unsupported optimizer name: unsupported_name'):
+      optimizers.get_optimizer_instance('unsupported_name', learning_rate=0.1)
+
+  def test_supported_name_but_learning_rate_none(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'learning_rate must be specified when opt is string'):
+      optimizers.get_optimizer_instance('Adagrad', learning_rate=None)
+
+  def test_adagrad(self):
+    opt = optimizers.get_optimizer_instance('Adagrad', learning_rate=0.1)
+    self.assertIsInstance(opt, adagrad.AdagradOptimizer)
+    self.assertAlmostEqual(0.1, opt._learning_rate)
+
+  def test_adam(self):
+    opt = optimizers.get_optimizer_instance('Adam', learning_rate=0.1)
+    self.assertIsInstance(opt, adam.AdamOptimizer)
+    self.assertAlmostEqual(0.1, opt._lr)
+
+  def test_ftrl(self):
+    opt = optimizers.get_optimizer_instance('Ftrl', learning_rate=0.1)
+    self.assertIsInstance(opt, ftrl.FtrlOptimizer)
+    self.assertAlmostEqual(0.1, opt._learning_rate)
+
+  def test_rmsprop(self):
+    opt = optimizers.get_optimizer_instance('RMSProp', learning_rate=0.1)
+    self.assertIsInstance(opt, rmsprop.RMSPropOptimizer)
+    self.assertAlmostEqual(0.1, opt._learning_rate)
+
+  def test_sgd(self):
+    opt = optimizers.get_optimizer_instance('SGD', learning_rate=0.1)
+    self.assertIsInstance(opt, gradient_descent.GradientDescentOptimizer)
+    self.assertAlmostEqual(0.1, opt._learning_rate)
+
+  def test_object(self):
+    class _TestOptimizer(optimizer_lib.Optimizer):
+
+      def __init__(self):
+        super(_TestOptimizer, self).__init__(
+            use_locking=False, name='TestOptimizer')
+
+    opt = optimizers.get_optimizer_instance(_TestOptimizer())
+    self.assertIsInstance(opt, _TestOptimizer)
+
+  def test_object_invalid(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'The given object is not an Optimizer instance'):
+      optimizers.get_optimizer_instance((1, 2, 3))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/estimator/canned/prediction_keys.py b/tensorflow/python/estimator/canned/prediction_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dfb9e184f0b66ff9c0a5d511b72415b8ec593b5
--- /dev/null
+++ b/tensorflow/python/estimator/canned/prediction_keys.py
@@ -0,0 +1,33 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Enum for model prediction keys."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class PredictionKeys(object):
+  """Enum for canonical model prediction keys.
+
+  The following values are defined:
+  PREDICTIONS: Used by models that predict values, such as regressor models.
+  """
+
+  CLASSES = 'classes'
+  LOGISTIC = 'logistic'
+  LOGITS = 'logits'
+  PREDICTIONS = 'predictions'
+  PROBABILITIES = 'probabilities'
diff --git a/tensorflow/python/estimator/checkpoint_utils.py b/tensorflow/python/estimator/checkpoint_utils.py
deleted file mode 100644
index 7ad2730f205173fc18c9ef1e37dbd4b7424367b7..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/checkpoint_utils.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tools to work with checkpoints."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.platform import gfile
-from tensorflow.python.training import saver
-from tensorflow.python.training import training as train
-
-
-def _get_checkpoint_filename(filepattern):
-  """Returns checkpoint filename given directory or specific filepattern."""
-  if gfile.IsDirectory(filepattern):
-    return saver.latest_checkpoint(filepattern)
-  return filepattern
-
-
-def _load_checkpoint(filepattern):
-  """Returns CheckpointReader for latest checkpoint.
-
-  Args:
-    filepattern: Directory with checkpoints file or path to checkpoint.
-
-  Returns:
-    `CheckpointReader` object.
-
-  Raises:
-    ValueError: if checkpoint_dir doesn't have 'checkpoint' file or checkpoints.
-  """
-  filename = _get_checkpoint_filename(filepattern)
-  if filename is None:
-    raise ValueError("Couldn't find 'checkpoint' file or checkpoints in "
-                     "given directory %s" % filepattern)
-  return train.NewCheckpointReader(filename)
-
-
-def load_variable(checkpoint_dir, name):
-  """Returns a Tensor with the contents of the given variable in the checkpoint.
-
-  Args:
-    checkpoint_dir: Directory with checkpoints file or path to checkpoint.
-    name: Name of the tensor to return.
-
-  Returns:
-    `Tensor` object.
-  """
-  # TODO(b/29227106): Fix this in the right place and remove this.
-  if name.endswith(":0"):
-    name = name[:-2]
-  reader = _load_checkpoint(checkpoint_dir)
-  return reader.get_tensor(name)
-
-
-def list_variables(checkpoint_dir):
-  """Returns list of all variables in the latest checkpoint.
-
-  Args:
-    checkpoint_dir: Directory with checkpoints file or path to checkpoint.
-
-  Returns:
-    List of tuples `(name, shape)`.
-  """
-  reader = _load_checkpoint(checkpoint_dir)
-  variable_map = reader.get_variable_to_shape_map()
-  names = sorted(variable_map.keys())
-  result = []
-  for name in names:
-    result.append((name, variable_map[name]))
-  return result
diff --git a/tensorflow/python/estimator/checkpoint_utils_test.py b/tensorflow/python/estimator/checkpoint_utils_test.py
deleted file mode 100644
index cf11039aa201def38d1ff2caefd80d5b3f6180a0..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/checkpoint_utils_test.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for checkpoint_utils.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from tensorflow.python.estimator import checkpoint_utils
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-from tensorflow.python.training import saver as saver_lib
-
-
-def _create_checkpoints(sess, checkpoint_dir):
-  checkpoint_prefix = os.path.join(checkpoint_dir, "model")
-  checkpoint_state_name = "checkpoint"
-  v1 = variable_scope.get_variable("var1", [1, 10])
-  v2 = variable_scope.get_variable("var2", [10, 10])
-  v3 = variable_scope.get_variable("var3", [100, 100])
-  with variable_scope.variable_scope("useful_scope"):
-    v4 = variable_scope.get_variable("var4", [9, 9])
-  sess.run(variables.global_variables_initializer())
-  v1_value, v2_value, v3_value, v4_value = sess.run([v1, v2, v3, v4])
-  saver = saver_lib.Saver()
-  saver.save(
-      sess,
-      checkpoint_prefix,
-      global_step=0,
-      latest_filename=checkpoint_state_name)
-  return v1_value, v2_value, v3_value, v4_value
-
-
-class CheckpointsTest(test.TestCase):
-
-  def testNoCheckpoints(self):
-    checkpoint_dir = self.get_temp_dir() + "/no_checkpoints"
-    with self.assertRaises(errors_impl.OpError):
-      self.assertAllEqual(
-          checkpoint_utils.load_variable(checkpoint_dir, "var1"), [])
-
-  def testNoTensor(self):
-    checkpoint_dir = self.get_temp_dir()
-    with self.test_session() as session:
-      _, _, _, _ = _create_checkpoints(session, checkpoint_dir)
-    with self.assertRaises(errors_impl.OpError):
-      self.assertAllEqual(
-          checkpoint_utils.load_variable(checkpoint_dir, "var5"), [])
-
-  def testGetTensor(self):
-    checkpoint_dir = self.get_temp_dir()
-    with self.test_session() as session:
-      v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)
-    self.assertAllEqual(
-        checkpoint_utils.load_variable(checkpoint_dir, "var1"), v1)
-    self.assertAllEqual(
-        checkpoint_utils.load_variable(checkpoint_dir, "var2"), v2)
-    self.assertAllEqual(
-        checkpoint_utils.load_variable(checkpoint_dir, "var3"), v3)
-    self.assertAllEqual(
-        checkpoint_utils.load_variable(checkpoint_dir, "useful_scope/var4"), v4)
-
-  def testGetAllVariables(self):
-    checkpoint_dir = self.get_temp_dir()
-    with self.test_session() as session:
-      _create_checkpoints(session, checkpoint_dir)
-    self.assertEqual(
-        checkpoint_utils.list_variables(checkpoint_dir),
-        [("useful_scope/var4", [9, 9]), ("var1", [1, 10]), ("var2", [10, 10]),
-         ("var3", [100, 100])])
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index d83cd8b3702be1b12ad74122592f30d622846907..230f103c5d15c808e1c19223b262623c3161bee5 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -20,7 +20,6 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import inspect
 import os
 import tempfile
 
@@ -48,6 +47,9 @@ from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
 from tensorflow.python.training import training
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
 
 _VALID_MODEL_FN_ARGS = set(
     ['features', 'labels', 'mode', 'params', 'config'])
@@ -89,14 +91,18 @@ class Estimator(object):
 
     Args:
       model_fn: Model function. Follows the signature:
+
         * Args:
-          * `features`: single `Tensor` or `dict` of `Tensor`s
-                 (depending on data passed to `train`),
-          * `labels`: `Tensor` or `dict` of `Tensor`s (for multi-head
-                 models). If mode is `ModeKeys.PREDICT`, `labels=None` will be
-                 passed. If the `model_fn`'s signature does not accept
-                 `mode`, the `model_fn` must still be able to handle
-                 `labels=None`.
+
+          * `features`: This is the first item returned from the `input_fn`
+                 passed to `train`, 'evaluate`, and `predict`. This should be a
+                 single `Tensor` or `dict` of same.
+          * `labels`: This is the second item returned from the `input_fn`
+                 passed to `train`, 'evaluate`, and `predict`. This should be a
+                 single `Tensor` or `dict` of same (for multi-head models). If
+                 mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If
+                 the `model_fn`'s signature does not accept `mode`, the
+                 `model_fn` must still be able to handle `labels=None`.
           * `mode`: Optional. Specifies if this training, evaluation or
                  prediction. See `ModeKeys`.
           * `params`: Optional `dict` of hyperparameters.  Will receive what
@@ -109,9 +115,12 @@ class Estimator(object):
 
         * Returns:
           `EstimatorSpec`
+
       model_dir: Directory to save model parameters, graph and etc. This can
         also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
+        continue training a previously saved model. If `None`, the model_dir in
+        `config` will be used if set. If both are set, they must be same. If
+        both are `None`, a temporary directory will be used.
       config: Configuration object.
       params: `dict` of hyper parameters that will be passed into `model_fn`.
               Keys are names of parameters, values are basic python types.
@@ -122,12 +131,6 @@ class Estimator(object):
         a member of `Estimator`.
     """
     Estimator._assert_members_are_not_overridden(self)
-    # Model directory.
-    self._model_dir = model_dir
-    if self._model_dir is None:
-      self._model_dir = tempfile.mkdtemp()
-      logging.warning('Using temporary folder as model directory: %s',
-                      self._model_dir)
 
     if config is None:
       self._config = run_config.RunConfig()
@@ -139,8 +142,30 @@ class Estimator(object):
             config)
       self._config = config
 
+    # Model directory.
+    if (model_dir is not None) and (self._config.model_dir is not None):
+      if model_dir != self._config.model_dir:
+        # pylint: disable=g-doc-exception
+        raise ValueError(
+            "model_dir are set both in constructor and RunConfig, but with "
+            "different values. In constructor: '{}', in RunConfig: "
+            "'{}' ".format(model_dir, self._config.model_dir))
+        # pylint: enable=g-doc-exception
+
+    self._model_dir = model_dir or self._config.model_dir
+    if self._model_dir is None:
+      self._model_dir = tempfile.mkdtemp()
+      logging.warning('Using temporary folder as model directory: %s',
+                      self._model_dir)
+    if self._config.model_dir is None:
+      self._config = self._config.replace(model_dir=self._model_dir)
     logging.info('Using config: %s', str(vars(self._config)))
 
+    if self._config.session_config is None:
+      self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    else:
+      self._session_config = self._config.session_config
+
     self._device_fn = _get_replica_device_setter(self._config)
 
     if model_fn is None:
@@ -175,7 +200,7 @@ class Estimator(object):
         error. 'steps' works incrementally. If you call two times
         train(steps=10) then training occurs in total 20 steps. If `OutOfRange`
         or `StopIteration` error occurs in the middle, training stops before 20
-        steps. If you don't want to have incremental behaviour please set
+        steps. If you don't want to have incremental behavior please set
         `max_steps` instead. If set, `max_steps` must be `None`.
       max_steps: Number of total steps for which to train model. If `None`,
         train forever or train until input_fn generates the `OutOfRange` or
@@ -198,10 +223,10 @@ class Estimator(object):
     if (steps is not None) and (max_steps is not None):
       raise ValueError('Can not provide both steps and max_steps.')
     if steps is not None and steps <= 0:
-      raise ValueError('Must specify steps >= 0, given: {}'.format(steps))
+      raise ValueError('Must specify steps > 0, given: {}'.format(steps))
     if max_steps is not None and max_steps <= 0:
       raise ValueError(
-          'Must specify max_steps >= 0, given: {}'.format(max_steps))
+          'Must specify max_steps > 0, given: {}'.format(max_steps))
 
     if max_steps is not None:
       start_step = _load_global_step_from_checkpoint_dir(self._model_dir)
@@ -256,7 +281,7 @@ class Estimator(object):
     hooks = _check_hooks_type(hooks)
     if steps is not None:
       if steps <= 0:
-        raise ValueError('Must specify steps >= 0, given: {}'.format(steps))
+        raise ValueError('Must specify steps > 0, given: {}'.format(steps))
       hooks.append(evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
           num_evals=steps))
 
@@ -266,7 +291,11 @@ class Estimator(object):
         checkpoint_path=checkpoint_path,
         name=name)
 
-  def predict(self, input_fn, predict_keys=None, hooks=None, checkpoint_path=None):
+  def predict(self,
+              input_fn,
+              predict_keys=None,
+              hooks=None,
+              checkpoint_path=None):
     """Returns predictions for given features.
 
     Args:
@@ -313,7 +342,7 @@ class Estimator(object):
           session_creator=training.ChiefSessionCreator(
               checkpoint_filename_with_path=checkpoint_path,
               scaffold=estimator_spec.scaffold,
-              config=config_pb2.ConfigProto(allow_soft_placement=True)),
+              config=self._session_config),
           hooks=hooks) as mon_sess:
         while not mon_sess.should_stop():
           preds_evaluated = mon_sess.run(predictions)
@@ -504,7 +533,7 @@ class Estimator(object):
     Raises:
       ValueError: if model_fn returns invalid objects.
     """
-    model_fn_args = _get_arguments(self._model_fn).args
+    model_fn_args = _model_fn_args(self._model_fn)
     kwargs = {}
     if 'mode' in model_fn_args:
       kwargs['mode'] = mode
@@ -548,7 +577,8 @@ class Estimator(object):
                               training.Saver(
                                   sharded=True,
                                   max_to_keep=self._config.keep_checkpoint_max,
-                                  defer_build=True))
+                                  defer_build=True,
+                                  save_relative_paths=True))
 
       chief_hooks = []
       if (self._config.save_checkpoints_secs or
@@ -556,7 +586,7 @@ class Estimator(object):
         saver_hook_exists = any([
             isinstance(h, training.CheckpointSaverHook)
             for h in (all_hooks + chief_hooks +
-                      estimator_spec.training_chief_hooks)
+                      list(estimator_spec.training_chief_hooks))
         ])
         if not saver_hook_exists:
           chief_hooks = [
@@ -572,10 +602,11 @@ class Estimator(object):
           checkpoint_dir=self._model_dir,
           scaffold=estimator_spec.scaffold,
           hooks=all_hooks,
-          chief_only_hooks=chief_hooks + estimator_spec.training_chief_hooks,
+          chief_only_hooks=(
+              tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
           save_checkpoint_secs=0,  # Saving is handled by a hook.
           save_summaries_steps=self._config.save_summary_steps,
-          config=config_pb2.ConfigProto(allow_soft_placement=True)) as mon_sess:
+          config=self._session_config) as mon_sess:
         loss = None
         while not mon_sess.should_stop():
           _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
@@ -630,7 +661,7 @@ class Estimator(object):
           eval_ops=update_op,
           final_ops=eval_dict,
           hooks=hooks,
-          config=config_pb2.ConfigProto(allow_soft_placement=True))
+          config=self._session_config)
 
       _write_dict_to_summary(
           output_dir=eval_dir,
@@ -639,12 +670,6 @@ class Estimator(object):
 
     return eval_results
 
-  def _verify_default_metric_key(self, metric_key, eval_dict):
-    if metric_key in six.iterkeys(eval_dict):
-      raise ValueError(
-          'Metric with name `%s` is not allowed, because Estimator '
-          'already defines a default metric with the same name.' % metric_key)
-
 
 def _check_hooks_type(hooks):
   """Returns hooks if all are SessionRunHook, raises TypeError otherwise."""
@@ -689,35 +714,48 @@ def _get_replica_device_setter(config):
     return None
 
 
-def _get_arguments(func):
-  """Returns a spec of given func."""
-  if hasattr(func, '__code__'):
-    # Regular function.
-    return inspect.getargspec(func)
-  elif hasattr(func, '__call__'):
-    # Callable object.
-    return _get_arguments(func.__call__)
-  elif hasattr(func, 'func'):
-    # Partial function.
-    return _get_arguments(func.func)
+def _model_fn_args(fn):
+  """Get argument names for function-like object.
+
+  Args:
+    fn: Function, or function-like object (e.g., result of `functools.partial`).
+
+  Returns:
+    `tuple` of string argument names.
+
+  Raises:
+    ValueError: if partial function has positionally bound arguments
+  """
+  _, fn = tf_decorator.unwrap(fn)
+  if hasattr(fn, 'func') and hasattr(fn, 'keywords') and hasattr(fn, 'args'):
+    # Handle functools.partial and similar objects.
+    return tuple([
+        arg for arg in tf_inspect.getargspec(fn.func).args[len(fn.args):]
+        if arg not in set(fn.keywords.keys())
+    ])
+  # Handle function.
+  return tuple(tf_inspect.getargspec(fn).args)
 
 
 def _verify_model_fn_args(model_fn, params):
   """Verifies model fn arguments."""
-  fn_spec = _get_arguments(model_fn)
-  if 'features' not in fn_spec.args:
+  args = set(_model_fn_args(model_fn))
+  if 'features' not in args:
     raise ValueError('model_fn (%s) must include features argument.' % model_fn)
-  if 'labels' not in fn_spec.args:
+  if 'labels' not in args:
     raise ValueError('model_fn (%s) must include labels argument.' % model_fn)
-  if params is not None and 'params' not in fn_spec.args:
+  if params is not None and 'params' not in args:
     raise ValueError('model_fn (%s) does not include params argument, '
                      'but params (%s) is passed to Estimator.' % (model_fn,
                                                                   params))
-  if params is None and 'params' in fn_spec.args:
+  if params is None and 'params' in args:
     logging.warning('Estimator\'s model_fn (%s) includes params '
                     'argument, but params are not passed to Estimator.',
                     model_fn)
-  non_valid_args = list(set(fn_spec.args) - _VALID_MODEL_FN_ARGS)
+  if tf_inspect.ismethod(model_fn):
+    if 'self' in args:
+      args.remove('self')
+  non_valid_args = list(args - _VALID_MODEL_FN_ARGS)
   if non_valid_args:
     raise ValueError('model_fn (%s) has following not expected args: %s' %
                      (model_fn, non_valid_args))
@@ -779,13 +817,20 @@ def _write_dict_to_summary(output_dir,
   for key in dictionary:
     if dictionary[key] is None:
       continue
+    if key == 'global_step':
+      continue
     value = summary_proto.value.add()
     value.tag = key
     if (isinstance(dictionary[key], np.float32) or
         isinstance(dictionary[key], float)):
       value.simple_value = float(dictionary[key])
+    elif (isinstance(dictionary[key], np.int64) or
+          isinstance(dictionary[key], np.int32) or
+          isinstance(dictionary[key], int)):
+      value.simple_value = int(dictionary[key])
     else:
-      logging.warn('Skipping summary for %s, must be a float or np.float32.',
-                   key)
+      logging.warn(
+          'Skipping summary for %s, must be a float, np.float32, np.int64, np.int32 or int.',
+          key)
   summary_writer.add_summary(summary_proto, current_global_step)
   summary_writer.flush()
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index cac0b55bd38e3f76b39ce75d6c5bd2b239acade7..b25c3ba93abe5212ad6208508033ee92b9b8080c 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 import tempfile
 
 import numpy as np
+import six
+
+from google.protobuf import text_format
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator
@@ -34,10 +38,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import layers
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import state_ops
@@ -48,12 +54,17 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import checkpoint_state_pb2
 from tensorflow.python.training import saver
 from tensorflow.python.training import saver_test_utils
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training
 from tensorflow.python.util import compat
 
+_TMP_DIR = '/tmp'
+_ANOTHER_TMP_DIR = '/another_tmp'
+
 
 def dummy_model_fn(features, labels, params):
   _, _, _ = features, labels, params
@@ -118,7 +129,7 @@ class EstimatorConstructorTest(test.TestCase):
     def model_fn(features, labels, params):
       _, _, _ = features, labels, params
 
-    class FakeConfig(run_config.RunConfig):  # pylint: disable=g-wrong-blank-lines
+    class FakeConfig(run_config.RunConfig):
       pass
 
     params = {'hidden_layers': [3, 4]}
@@ -141,8 +152,68 @@ class EstimatorConstructorTest(test.TestCase):
     def model_fn(features, labels):
       _, _ = features, labels
 
-    est = estimator.Estimator(model_fn=model_fn)
-    self.assertTrue(est.model_dir is not None)
+    with test.mock.patch.object(tempfile, 'mkdtemp', return_value=_TMP_DIR):
+      est = estimator.Estimator(model_fn=model_fn)
+      self.assertEqual(_TMP_DIR, est.config.model_dir)
+      self.assertEqual(_TMP_DIR, est.model_dir)
+
+  def test_model_dir_in_constructor(self):
+
+    def model_fn(features, labels):
+      _, _ = features, labels
+
+    est = estimator.Estimator(model_fn=model_fn, model_dir=_TMP_DIR)
+    self.assertEqual(_TMP_DIR, est.config.model_dir)
+    self.assertEqual(_TMP_DIR, est.model_dir)
+
+  def test_model_dir_in_run_config(self):
+
+    class FakeConfig(run_config.RunConfig):
+
+      @property
+      def model_dir(self):
+        return _TMP_DIR
+
+    def model_fn(features, labels):
+      _, _ = features, labels
+
+    est = estimator.Estimator(model_fn=model_fn, config=FakeConfig())
+    self.assertEqual(_TMP_DIR, est.config.model_dir)
+    self.assertEqual(_TMP_DIR, est.model_dir)
+
+  def test_same_model_dir_in_constructor_and_run_config(self):
+
+    class FakeConfig(run_config.RunConfig):
+
+      @property
+      def model_dir(self):
+        return _TMP_DIR
+
+    def model_fn(features, labels):
+      _, _ = features, labels
+
+    est = estimator.Estimator(
+        model_fn=model_fn, config=FakeConfig(), model_dir=_TMP_DIR)
+    self.assertEqual(_TMP_DIR, est.config.model_dir)
+    self.assertEqual(_TMP_DIR, est.model_dir)
+
+  def test_different_model_dir_in_constructor_and_run_config(self):
+
+    class FakeConfig(run_config.RunConfig):
+
+      @property
+      def model_dir(self):
+        return _TMP_DIR
+
+    def model_fn(features, labels):
+      _, _ = features, labels
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'model_dir are set both in constructor and RunConfig, but '
+        'with different values'):
+      estimator.Estimator(
+          model_fn=model_fn, config=FakeConfig(), model_dir=_ANOTHER_TMP_DIR)
 
   def test_model_fn_args_must_include_features(self):
 
@@ -185,6 +256,18 @@ class EstimatorConstructorTest(test.TestCase):
         features, labels, 'something')
     estimator.Estimator(model_fn=new_model_fn)
 
+  def test_if_model_fn_is_a_member_function_of_a_class(self):
+
+    class ModelFnClass(object):
+
+      def __init__(self):
+        estimator.Estimator(model_fn=self.model_fn)
+
+      def model_fn(self, features, labels, mode):
+        _, _, _ = features, labels, mode
+
+    ModelFnClass()
+
 
 def dummy_input_fn():
   return ({'x': constant_op.constant([[1], [1]])},
@@ -200,8 +283,120 @@ def model_fn_global_step_incrementer(features, labels, mode):
       train_op=state_ops.assign_add(global_step, 1))
 
 
+def _estimator_spec(
+    expected_features, expected_labels, actual_features, actual_labels, mode):
+  assert_ops = tuple([
+      check_ops.assert_equal(
+          expected_features[k], actual_features[k], name='assert_%s' % k)
+      for k in expected_features
+  ] + [
+      check_ops.assert_equal(
+          expected_labels, actual_labels, name='assert_labels')
+  ])
+  with ops.control_dependencies(assert_ops):
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        predictions=constant_op.constant(0.),
+        loss=constant_op.constant(0.),
+        train_op=constant_op.constant(0.))
+
+
+def _make_input_fn(features, labels):
+  def _input_fn():
+    return {
+        k: constant_op.constant(v)
+        for k, v in six.iteritems(features)
+    }, constant_op.constant(labels)
+  return _input_fn
+
+
 class EstimatorTrainTest(test.TestCase):
 
+  def test_minimal_model_fn_args(self):
+    expected_features = {'x': 42., 'y': 43.}
+    expected_labels = 44.
+
+    # TODO(ptucker): We have to roll our own mock since Estimator._get_arguments
+    # doesn't work with mock fns.
+    model_fn_call_count = [0]
+
+    def _model_fn(features, labels):
+      model_fn_call_count[0] += 1
+      self.assertItemsEqual(expected_features.keys(), features.keys())
+      return _estimator_spec(
+          expected_features, expected_labels, features, labels,
+          model_fn_lib.ModeKeys.TRAIN)
+
+    with self.assertRaisesRegexp(ValueError, 'does not include params'):
+      estimator.Estimator(model_fn=_model_fn, params={'a': 'b'})
+    est = estimator.Estimator(model_fn=_model_fn, config=run_config.RunConfig())
+    self.assertEqual(0, model_fn_call_count[0])
+    est.train(
+        input_fn=_make_input_fn(expected_features, expected_labels), steps=1)
+    self.assertEqual(1, model_fn_call_count[0])
+
+  def test_all_model_fn_args(self):
+    expected_features = {'x': 42., 'y': 43.}
+    expected_labels = 44.
+    expected_params = {'some_param': 'some_value'}
+    expected_config = run_config.RunConfig()
+    expected_config.i_am_test = True
+
+    # TODO(ptucker): We have to roll our own mock since Estimator._get_arguments
+    # doesn't work with mock fns.
+    model_fn_call_count = [0]
+
+    # Note that args are all passed by keyword, so can be in any order.
+    def _model_fn(mode, params, features, labels, config):
+      model_fn_call_count[0] += 1
+      self.assertItemsEqual(expected_features.keys(), features.keys())
+      self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
+      self.assertEqual(expected_params, params)
+      self.assertTrue(config.i_am_test)
+      return _estimator_spec(
+          expected_features, expected_labels, features, labels, mode)
+
+    est = estimator.Estimator(
+        model_fn=_model_fn, params=expected_params, config=expected_config)
+    self.assertEqual(0, model_fn_call_count[0])
+    est.train(
+        input_fn=_make_input_fn(expected_features, expected_labels), steps=1)
+    self.assertEqual(1, model_fn_call_count[0])
+
+  def test_partial_model_fn_args(self):
+    expected_features = {'x': 42., 'y': 43.}
+    expected_labels = 44.
+    expected_params = {'some_param': 'some_value'}
+    expected_config = run_config.RunConfig()
+    expected_config.i_am_test = True
+    expected_foo = 45.
+    expected_bar = 46.
+
+    # TODO(ptucker): We have to roll our own mock since Estimator._get_arguments
+    # doesn't work with mock fns.
+    model_fn_call_count = [0]
+
+    def _model_fn(features, labels, foo, mode, params, config, bar):
+      model_fn_call_count[0] += 1
+      self.assertEqual(expected_foo, foo)
+      self.assertEqual(expected_bar, bar)
+      self.assertItemsEqual(expected_features.keys(), features.keys())
+      self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
+      self.assertEqual(expected_params, params)
+      self.assertTrue(config.i_am_test)
+      return _estimator_spec(
+          expected_features, expected_labels, features, labels, mode)
+    partial_model_fn = functools.partial(
+        _model_fn, foo=expected_foo, bar=expected_bar)
+
+    est = estimator.Estimator(
+        model_fn=partial_model_fn, params=expected_params,
+        config=expected_config)
+    self.assertEqual(0, model_fn_call_count[0])
+    est.train(
+        input_fn=_make_input_fn(expected_features, expected_labels), steps=1)
+    self.assertEqual(1, model_fn_call_count[0])
+
   def test_model_fn_must_return_estimator_spec(self):
 
     def model_fn(features, labels):
@@ -236,28 +431,66 @@ class EstimatorTrainTest(test.TestCase):
     self.assertEqual(
         5, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
 
+  def test_checkpoint_contains_relative_paths(self):
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(
+        model_dir=tmpdir,
+        model_fn=model_fn_global_step_incrementer)
+    est.train(dummy_input_fn, steps=5)
+
+    checkpoint_file_content = file_io.read_file_to_string(
+        os.path.join(tmpdir, 'checkpoint'))
+    ckpt = checkpoint_state_pb2.CheckpointState()
+    text_format.Merge(checkpoint_file_content, ckpt)
+    self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
+    self.assertAllEqual(
+        ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
+
+  def test_train_save_copy_reload(self):
+    tmpdir = tempfile.mkdtemp()
+    model_dir1 = os.path.join(tmpdir, 'model_dir1')
+    est1 = estimator.Estimator(
+        model_dir=model_dir1,
+        model_fn=model_fn_global_step_incrementer)
+    est1.train(dummy_input_fn, steps=5)
+
+    # We have to clear the cache before we can rename the directory,
+    # otherwise open file handles will prevent the delete on Windows.
+    writer_cache.FileWriterCache.clear()
+    model_dir2 = os.path.join(tmpdir, 'model_dir2')
+    os.renames(model_dir1, model_dir2)
+
+    est2 = estimator.Estimator(
+        model_dir=model_dir2,
+        model_fn=model_fn_global_step_incrementer)
+    self.assertEqual(
+        5, estimator._load_global_step_from_checkpoint_dir(est2.model_dir))
+    est2.train(dummy_input_fn, steps=5)
+    self.assertEqual(
+        10, estimator._load_global_step_from_checkpoint_dir(est2.model_dir))
+
   def test_steps0_raises_error(self):
     est = estimator.Estimator(
         model_fn=_model_fn_with_eval_metric_ops)
-    with self.assertRaisesRegexp(ValueError, 'Must specify steps >= 0'):
+    with self.assertRaisesRegexp(ValueError, 'Must specify steps > 0'):
       est.train(dummy_input_fn, steps=0)
 
   def test_steps_negative_raises_error(self):
     est = estimator.Estimator(
         model_fn=_model_fn_with_eval_metric_ops)
-    with self.assertRaisesRegexp(ValueError, 'Must specify steps >= 0'):
+    with self.assertRaisesRegexp(ValueError, 'Must specify steps > 0'):
       est.train(dummy_input_fn, steps=-1)
 
   def test_max_steps0_raises_error(self):
     est = estimator.Estimator(
         model_fn=_model_fn_with_eval_metric_ops)
-    with self.assertRaisesRegexp(ValueError, 'Must specify max_steps >= 0'):
+    with self.assertRaisesRegexp(ValueError, 'Must specify max_steps > 0'):
       est.train(dummy_input_fn, max_steps=0)
 
   def test_max_steps_negative_raises_error(self):
     est = estimator.Estimator(
         model_fn=_model_fn_with_eval_metric_ops)
-    with self.assertRaisesRegexp(ValueError, 'Must specify max_steps >= 0'):
+    with self.assertRaisesRegexp(ValueError, 'Must specify max_steps > 0'):
       est.train(dummy_input_fn, max_steps=-1)
 
   def test_scaffold_is_used(self):
@@ -321,7 +554,7 @@ class EstimatorTrainTest(test.TestCase):
           training_chief_hooks=[chief_hook],
           training_hooks=[hook])
 
-    class NonChiefRunConfig(run_config.RunConfig):  # pylint: disable=g-wrong-blank-lines
+    class NonChiefRunConfig(run_config.RunConfig):
       @property
       def is_chief(self):  # pylint: disable=g-wrong-blank-lines
         return False
@@ -329,6 +562,8 @@ class EstimatorTrainTest(test.TestCase):
     # Mocking the SessionManager.wait_for_session, so that worker doesn't wait
     # for chief.
     def get_initialized_session(*args, **kwargs):
+      # Session doesn't take 'max_wait_secs' argument.
+      kwargs.pop('max_wait_secs', None)
       scaffold = training.Scaffold().finalize()
       sess = session.Session(*args, **kwargs)
       sess.run(scaffold.init_op)
@@ -475,14 +710,14 @@ class EstimatorEvaluateTest(test.TestCase):
     est = estimator.Estimator(
         model_fn=_model_fn_with_eval_metric_ops)
     est.train(dummy_input_fn, steps=5)
-    with self.assertRaisesRegexp(ValueError, 'Must specify steps >= 0'):
+    with self.assertRaisesRegexp(ValueError, 'Must specify steps > 0'):
       est.evaluate(dummy_input_fn, steps=0)
 
   def test_steps_negative_raises_error(self):
     est = estimator.Estimator(
         model_fn=_model_fn_with_eval_metric_ops)
     est.train(dummy_input_fn, steps=5)
-    with self.assertRaisesRegexp(ValueError, 'Must specify steps >= 0'):
+    with self.assertRaisesRegexp(ValueError, 'Must specify steps > 0'):
       est.evaluate(dummy_input_fn, steps=-1)
 
   def test_global_step_metric_raises_error(self):
@@ -627,7 +862,10 @@ class EstimatorPredictTest(test.TestCase):
   def test_no_trained_model_invalid_checkpoint_path(self):
     est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
     with self.assertRaises(ValueError):
-      next(est.predict(dummy_input_fn, checkpoint_path=saver.latest_checkpoint("fakedir")))
+      next(
+          est.predict(
+              dummy_input_fn,
+              checkpoint_path=saver.latest_checkpoint('fakedir')))
 
   def test_tensor_predictions(self):
 
@@ -848,9 +1086,12 @@ class EstimatorPredictTest(test.TestCase):
     est1 = estimator.Estimator(model_fn=_model_fn)
     est1.train(dummy_input_fn, steps=1)
     est2 = estimator.Estimator(model_fn=_model_fn, model_dir=est1.model_dir)
-    self.assertEqual([32.], next(est2.predict(
-      dummy_input_fn,
-      checkpoint_path=saver.latest_checkpoint(est1.model_dir))))
+    self.assertEqual(
+        [32.],
+        next(
+            est2.predict(
+                dummy_input_fn,
+                checkpoint_path=saver.latest_checkpoint(est1.model_dir))))
 
   def test_scaffold_is_used(self):
 
@@ -1172,9 +1413,10 @@ class EstimatorExportTest(test.TestCase):
       my_int = variables.Variable(1, name='my_int',
                                   collections=[ops.GraphKeys.LOCAL_VARIABLES])
       scores = constant_op.constant([3.])
-      with ops.control_dependencies(
-          [variables.local_variables_initializer(),
-           data_flow_ops.tables_initializer()]):
+      with ops.control_dependencies([
+          variables.local_variables_initializer(),
+          lookup_ops.tables_initializer()
+      ]):
         assign_op = state_ops.assign(my_int, 12345)
 
       # local_initSop must be an Operation, not a Tensor.
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 37a98cf481521cc571e60c264b0980077ee01580..a1ecd794df6f114483a1ce4eeacd6fcbd4634392 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -23,6 +23,8 @@ import collections
 import os
 import time
 
+import six
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -56,7 +58,7 @@ class ServingInputReceiver(collections.namedtuple('ServingInputReceiver',
     if not isinstance(features, dict):
       features = {_SINGLE_FEATURE_DEFAULT_NAME: features}
     for name, tensor in features.items():
-      if not isinstance(name, str):
+      if not isinstance(name, six.string_types):
         raise ValueError('feature keys must be strings: {}.'.format(name))
       if not (isinstance(tensor, ops.Tensor)
               or isinstance(tensor, sparse_tensor.SparseTensor)):
@@ -68,7 +70,7 @@ class ServingInputReceiver(collections.namedtuple('ServingInputReceiver',
     if not isinstance(receiver_tensors, dict):
       receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
     for name, tensor in receiver_tensors.items():
-      if not isinstance(name, str):
+      if not isinstance(name, six.string_types):
         raise ValueError(
             'receiver_tensors keys must be strings: {}.'.format(name))
       if not isinstance(tensor, ops.Tensor):
diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py
index 69be0f687c17fb85a2a1f2830360af284006afdf..49bcd06d504bc4b1faa4920b87ebe92510190731 100644
--- a/tensorflow/python/estimator/export/export_output.py
+++ b/tensorflow/python/estimator/export/export_output.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -171,7 +173,7 @@ class PredictOutput(ExportOutput):
           'Prediction outputs must be given as a dict of string to Tensor; '
           'got {}'.format(outputs))
     for key, value in outputs.items():
-      if not isinstance(key, str):
+      if not isinstance(key, six.string_types):
         raise ValueError(
             'Prediction output key must be a string; got {}.'.format(key))
       if not isinstance(value, ops.Tensor):
diff --git a/tensorflow/python/estimator/export/export_output_test.py b/tensorflow/python/estimator/export/export_output_test.py
index 27a088e551c25062a770f44acb4b3b907b880051..035a9a143e6ffa18ae78ef2544614f342363b22d 100644
--- a/tensorflow/python/estimator/export/export_output_test.py
+++ b/tensorflow/python/estimator/export/export_output_test.py
@@ -22,7 +22,9 @@ from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.estimator.export import export_output as export_output_lib
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
@@ -197,6 +199,33 @@ class ExportOutputTest(test.TestCase):
         signature_constants.CLASSIFY_METHOD_NAME)
     self.assertEqual(actual_signature_def, expected_signature_def)
 
+  def test_predict_output_constructor(self):
+    """Tests that no errors are raised when input is expected."""
+    outputs = {
+        "output0": constant_op.constant([0]),
+        u"output1": constant_op.constant([1]),
+    }
+    export_output_lib.PredictOutput(outputs)
+
+  def test_predict_output_outputs_invalid(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Prediction outputs must be given as a dict of string to Tensor"):
+      export_output_lib.PredictOutput(constant_op.constant([0]))
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Prediction output key must be a string"):
+      export_output_lib.PredictOutput({1: constant_op.constant([0])})
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Prediction output value must be a Tensor"):
+      export_output_lib.PredictOutput({
+          "prediction1": sparse_tensor.SparseTensor(
+              indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+      })
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index fdd924f2e1cc4936c9655c0b26b85e6f954f016d..7946bd88ba0b577fb9f4885b80829cec6f26c919 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -28,13 +28,11 @@ from tensorflow.core.example import example_pb2
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.export import export_output
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
@@ -43,6 +41,69 @@ from tensorflow.python.saved_model import signature_def_utils
 
 class ExportTest(test_util.TensorFlowTestCase):
 
+  def test_serving_input_receiver_constructor(self):
+    """Tests that no errors are raised when input is expected."""
+    features = {
+        "feature0": constant_op.constant([0]),
+        u"feature1": constant_op.constant([1]),
+        "feature2": sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+    }
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+    export.ServingInputReceiver(features, receiver_tensors)
+
+  def test_serving_input_receiver_features_invalid(self):
+    receiver_tensors = {
+        "example0": array_ops.placeholder(dtypes.string, name="example0"),
+        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
+    }
+
+    with self.assertRaisesRegexp(ValueError, "features must be defined"):
+      export.ServingInputReceiver(
+          features=None,
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(ValueError, "feature keys must be strings"):
+      export.ServingInputReceiver(
+          features={1: constant_op.constant([1])},
+          receiver_tensors=receiver_tensors)
+
+    with self.assertRaisesRegexp(
+        ValueError, "feature feature1 must be a Tensor or SparseTensor"):
+      export.ServingInputReceiver(
+          features={"feature1": [1]},
+          receiver_tensors=receiver_tensors)
+
+  def test_serving_input_receiver_receiver_tensors_invalid(self):
+    features = {
+        "feature0": constant_op.constant([0]),
+        u"feature1": constant_op.constant([1]),
+        "feature2": sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+    }
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensors must be defined"):
+      export.ServingInputReceiver(
+          features=features,
+          receiver_tensors=None)
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensors keys must be strings"):
+      export.ServingInputReceiver(
+          features=features,
+          receiver_tensors={
+              1: array_ops.placeholder(dtypes.string, name="example0")})
+
+    with self.assertRaisesRegexp(
+        ValueError, "receiver_tensor example1 must be a Tensor"):
+      export.ServingInputReceiver(
+          features=features,
+          receiver_tensors={"example1": [1]})
+
   def test_single_feature_single_receiver(self):
     feature = constant_op.constant(5)
     receiver_tensor = array_ops.placeholder(dtypes.string)
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index dab8ffea7578ea19d81046a2cd77a1302b2b0ece..a6f5157680f5733a930b3d3e1fd8c2b63af690be 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import collections
 import random
+import types as tp
 import numpy as np
 import six
-import types as tp
 
 from tensorflow.python.estimator.inputs.queues import feeding_queue_runner as fqr
 from tensorflow.python.framework import dtypes
@@ -245,8 +245,8 @@ class _GeneratorFeedFn(object):
 
   def __call__(self):
     if self._num_epochs and self._epoch >= self._num_epochs:
-      raise errors.OutOfRangeError(
-          None, None, "Already emitted %s epochs." % self._epoch)
+      raise errors.OutOfRangeError(None, None,
+                                   "Already emitted %s epochs." % self._epoch)
     list_dict = {}
     list_dict_size = 0
     while list_dict_size < self._batch_size:
@@ -258,8 +258,9 @@ class _GeneratorFeedFn(object):
         data_row = next(self._iterator)
       for index, key in enumerate(self._keys):
         if key not in data_row.keys():
-          raise KeyError('key mismatch between dicts emitted by GenFun'
-              'Expected {} keys; got {}'.format( self._keys, data_row.keys()))
+          raise KeyError("key mismatch between dicts emitted by GenFun"
+                         "Expected {} keys; got {}".format(
+                             self._keys, data_row.keys()))
         list_dict.setdefault(self._col_placeholders[index],
                              list()).append(data_row[key])
         list_dict_size += 1
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index ee5999c78bc97f05cf353a192d2d54eecc47e5b8..1aa2623962059aa54b5e1e84482681614a56b043 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -51,6 +51,7 @@ class ModeKeys(object):
 class MetricKeys(object):
   """Metric key strings."""
   LOSS = 'loss'
+  AVERAGE_LOSS = 'average_loss'
 
 
 class EstimatorSpec(
@@ -142,10 +143,10 @@ class EstimatorSpec(
         Multi-headed models should specify one entry for each head, one of
         which must be named using
         signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY.
-      training_chief_hooks: A list of `tf.train.SessionRunHook` objects to
+      training_chief_hooks: Iterable of `tf.train.SessionRunHook` objects to
         run on the chief worker during training.
-      training_hooks: A list of `tf.train.SessionRunHook` objects that to run on
-        all workers during training.
+      training_hooks: Iterable of `tf.train.SessionRunHook` objects that to run
+        on all workers during training.
       scaffold: A `tf.train.Scaffold` object that can be used to set
         initialization, saver, and more to be used in training.
 
@@ -238,23 +239,43 @@ class EstimatorSpec(
 
     # Validate that all tensors and ops are from the default graph.
     default_graph = ops.get_default_graph()
-    for value in _prediction_values(predictions):
-      if value.graph is not default_graph:
-        raise ValueError('prediction values must be from the default graph.')
+
+    # We enumerate possible error causes here to aid in debugging.
+    error_message_template = (
+        '{0} with "{1}" must be from the default graph. '
+        'Possible causes of this error include: \n\n'
+        '1) {0} was created outside the context of the default graph.'
+        '\n\n'
+        '2) The object passed through to EstimatorSpec was not created '
+        'in the most recent call to "model_fn".')
+
+    if isinstance(predictions, dict):
+      for key, value in six.iteritems(predictions):
+        if value.graph is not default_graph:
+          raise ValueError(error_message_template.format(
+              'prediction values',
+              '{0}: {1}'.format(key, value.name)))
+    elif predictions is not None:
+      # 'predictions' must be a single Tensor.
+      if predictions.graph is not default_graph:
+        raise ValueError(error_message_template.format(
+            'prediction values', predictions.name))
+
     if loss is not None and loss.graph is not default_graph:
-      raise ValueError('loss must be from the default graph.')
+      raise ValueError(error_message_template.format('loss', loss.name))
     if train_op is not None and train_op.graph is not default_graph:
-      raise ValueError('train_op must be from the default graph.')
-    for value in nest.flatten(list(eval_metric_ops.values())):
-      if value.graph is not default_graph:
-        raise ValueError(
-            'eval_metric_ops values must be from the default graph.')
+      raise ValueError(error_message_template.format('train_op', train_op.name))
+    for key, value in list(six.iteritems(eval_metric_ops)):
+      values = nest.flatten(value)
+      for value in values:
+        if value.graph is not default_graph:
+          raise ValueError(error_message_template.format(
+              'eval_metric_ops',
+              '{0}: {1}'.format(key, value.name)))
 
     # Validate hooks.
-    if training_chief_hooks is None:
-      training_chief_hooks = []
-    if training_hooks is None:
-      training_hooks = []
+    training_chief_hooks = tuple(training_chief_hooks or [])
+    training_hooks = tuple(training_hooks or [])
     for hook in training_hooks + training_chief_hooks:
       if not isinstance(hook, session_run_hook.SessionRunHook):
         raise TypeError(
@@ -289,12 +310,3 @@ def _check_is_tensor(x, tensor_name):
   if not isinstance(x, ops.Tensor):
     raise TypeError('{} must be Tensor, given: {}'.format(tensor_name, x))
   return x
-
-
-def _prediction_values(predictions):
-  """Returns the values of the given predictions dict or `Tensor`."""
-  if predictions is None:
-    return []
-  if isinstance(predictions, dict):
-    return list(six.itervalues(predictions))
-  return [predictions]
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index c6e6c609917352516d88e0e81ec5fdc21013a318..30ba18d07dbb804297fbe6d668abf91756867086 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -18,6 +18,81 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
+
+import six
+
+from tensorflow.core.protobuf import config_pb2
+
+
+# A list of the property names in RunConfig user allows to change.
+_DEFAULT_REPLACEABLE_LIST = [
+    'model_dir',
+    'tf_random_seed',
+    'save_summary_steps',
+    'save_checkpoints_steps',
+    'save_checkpoints_secs',
+    'session_config',
+    'keep_checkpoint_max',
+    'keep_checkpoint_every_n_hours',
+]
+
+_SAVE_CKPT_ERR = (
+    '`save_checkpoints_steps` and `save_checkpoints_secs` cannot be both set.'
+)
+
+
+def _validate_save_ckpt_with_replaced_keys(new_copy, replaced_keys):
+  """Validates the save ckpt properties."""
+  # Ensure one (and only one) of save_steps and save_secs is not None.
+  # Also, if user sets one save ckpt property, say steps, the other one (secs)
+  # should be set as None to improve usability.
+
+  save_steps = new_copy.save_checkpoints_steps
+  save_secs = new_copy.save_checkpoints_secs
+
+  if ('save_checkpoints_steps' in replaced_keys and
+      'save_checkpoints_secs' in replaced_keys):
+    # If user sets both properties explicitly, we need to error out if both
+    # are set or neither of them are set.
+    if save_steps is not None and save_secs is not None:
+      raise ValueError(_SAVE_CKPT_ERR)
+  elif 'save_checkpoints_steps' in replaced_keys and save_steps is not None:
+    new_copy._save_checkpoints_secs = None  # pylint: disable=protected-access
+  elif 'save_checkpoints_secs' in replaced_keys and save_secs is not None:
+    new_copy._save_checkpoints_steps = None  # pylint: disable=protected-access
+
+
+def _validate_properties(run_config):
+  """Validates the properties."""
+  def _validate(property_name, cond, message):
+    property_value = getattr(run_config, property_name)
+    if property_value is not None and not cond(property_value):
+      raise ValueError(message)
+
+  _validate('model_dir', lambda dir: dir,
+            message='model_dir should be non-empty')
+
+  _validate('save_summary_steps', lambda steps: steps >= 0,
+            message='save_summary_steps should be >= 0')
+
+  _validate('save_checkpoints_steps', lambda steps: steps >= 0,
+            message='save_checkpoints_steps should be >= 0')
+  _validate('save_checkpoints_secs', lambda secs: secs >= 0,
+            message='save_checkpoints_secs should be >= 0')
+
+  _validate('session_config',
+            lambda sc: isinstance(sc, config_pb2.ConfigProto),
+            message='session_config must be instance of ConfigProto')
+
+  _validate('keep_checkpoint_max', lambda keep_max: keep_max >= 0,
+            message='keep_checkpoint_max should be >= 0')
+  _validate('keep_checkpoint_every_n_hours', lambda keep_hours: keep_hours > 0,
+            message='keep_checkpoint_every_n_hours should be > 0')
+
+  _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types),
+            message='tf_random_seed must be integer.')
+
 
 class TaskType(object):
   MASTER = 'master'
@@ -28,6 +103,17 @@ class TaskType(object):
 class RunConfig(object):
   """This class specifies the configurations for an `Estimator` run."""
 
+  def __init__(self):
+    self._model_dir = None
+    self._tf_random_seed = 1
+    self._save_summary_steps = 100
+    self._save_checkpoints_secs = 600
+    self._save_checkpoints_steps = None
+    self._session_config = None
+    self._keep_checkpoint_max = 5
+    self._keep_checkpoint_every_n_hours = 10000
+    _validate_properties(self)
+
   @property
   def cluster_spec(self):
     return None
@@ -62,24 +148,98 @@ class RunConfig(object):
 
   @property
   def tf_random_seed(self):
-    return 1
+    return self._tf_random_seed
 
   @property
   def save_summary_steps(self):
-    return 100
+    return self._save_summary_steps
 
   @property
   def save_checkpoints_secs(self):
-    return 600
+    return self._save_checkpoints_secs
+
+  @property
+  def session_config(self):
+    return self._session_config
 
   @property
   def save_checkpoints_steps(self):
-    return None
+    return self._save_checkpoints_steps
 
   @property
   def keep_checkpoint_max(self):
-    return 5
+    return self._keep_checkpoint_max
 
   @property
   def keep_checkpoint_every_n_hours(self):
-    return 10000
+    return self._keep_checkpoint_every_n_hours
+
+  @property
+  def model_dir(self):
+    return self._model_dir
+
+  def replace(self, **kwargs):
+    """Returns a new instance of `RunConfig` replacing specified properties.
+
+    Only the properties in the following list are allowed to be replaced:
+      - `model_dir`.
+      - `tf_random_seed`,
+      - `save_summary_steps`,
+      - `save_checkpoints_steps`,
+      - `save_checkpoints_secs`,
+      - `session_config`,
+      - `keep_checkpoint_max`,
+      - `keep_checkpoint_every_n_hours`,
+
+    In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
+    can be set (should not be both).
+
+    Args:
+      **kwargs: keyword named properties with new values.
+
+    Raises:
+      ValueError: If any property name in `kwargs` does not exist or is not
+        allowed to be replaced, or both `save_checkpoints_steps` and
+        `save_checkpoints_secs` are set.
+
+    Returns:
+      a new instance of `RunConfig`.
+    """
+    return self._replace(
+        allowed_properties_list=_DEFAULT_REPLACEABLE_LIST, **kwargs)
+
+  def _replace(self, allowed_properties_list=None, **kwargs):
+    """See `replace`.
+
+    N.B.: This implementation assumes that for key named "foo", the underlying
+    property the RunConfig holds is "_foo" (with one leading underscore).
+
+    Args:
+      allowed_properties_list: The property name list allowed to be replaced.
+      **kwargs: keyword named properties with new values.
+
+    Raises:
+      ValueError: If any property name in `kwargs` does not exist or is not
+        allowed to be replaced, or both `save_checkpoints_steps` and
+        `save_checkpoints_secs` are set.
+
+    Returns:
+      a new instance of `RunConfig`.
+    """
+
+    new_copy = copy.deepcopy(self)
+
+    allowed_properties_list = allowed_properties_list or []
+
+    for key, new_value in six.iteritems(kwargs):
+      if key in allowed_properties_list:
+        setattr(new_copy, '_' + key, new_value)
+        continue
+
+      raise ValueError(
+          'Replacing {} is not supported. Allowed properties are {}.'.format(
+              key, allowed_properties_list))
+
+    _validate_save_ckpt_with_replaced_keys(new_copy, kwargs.keys())
+    _validate_properties(new_copy)
+    return new_copy
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..18d5b2a3f64845fa0f60567e73b346688c482703
--- /dev/null
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -0,0 +1,183 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RunConfig tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.platform import test
+
+_TEST_DIR = 'test_dir'
+_MASTER = 'master_'
+_NOT_SUPPORTED_REPLACE_PROPERTY_MSG = 'Replacing .*is not supported'
+_SAVE_CKPT_ERR = (
+    '`save_checkpoints_steps` and `save_checkpoints_secs` cannot be both set.'
+)
+_MODEL_DIR_ERR = 'model_dir should be non-empty'
+_SAVE_SUMMARY_STEPS_ERR = 'save_summary_steps should be >= 0'
+_SAVE_CKPT_STEPS_ERR = 'save_checkpoints_steps should be >= 0'
+_SAVE_CKPT_SECS_ERR = 'save_checkpoints_secs should be >= 0'
+_SESSION_CONFIG_ERR = 'session_config must be instance of ConfigProto'
+_KEEP_CKPT_MAX_ERR = 'keep_checkpoint_max should be >= 0'
+_KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0'
+_TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer'
+
+
+class RunConfigTest(test.TestCase):
+
+  def test_default_property_values(self):
+    config = run_config_lib.RunConfig()
+    self.assertIsNone(config.model_dir)
+    self.assertIsNone(config.session_config)
+    self.assertEqual(1, config.tf_random_seed)
+    self.assertEqual(100, config.save_summary_steps)
+    self.assertEqual(600, config.save_checkpoints_secs)
+    self.assertIsNone(config.save_checkpoints_steps)
+    self.assertEqual(5, config.keep_checkpoint_max)
+    self.assertEqual(10000, config.keep_checkpoint_every_n_hours)
+
+  def test_model_dir(self):
+    empty_config = run_config_lib.RunConfig()
+    self.assertIsNone(empty_config.model_dir)
+
+    new_config = empty_config.replace(model_dir=_TEST_DIR)
+    self.assertEqual(_TEST_DIR, new_config.model_dir)
+
+  def test_replace_with_allowed_properties(self):
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+
+    config = run_config_lib.RunConfig().replace(
+        tf_random_seed=11,
+        save_summary_steps=12,
+        save_checkpoints_secs=14,
+        session_config=session_config,
+        keep_checkpoint_max=16,
+        keep_checkpoint_every_n_hours=17)
+    self.assertEqual(11, config.tf_random_seed)
+    self.assertEqual(12, config.save_summary_steps)
+    self.assertEqual(14, config.save_checkpoints_secs)
+    self.assertEqual(session_config, config.session_config)
+    self.assertEqual(16, config.keep_checkpoint_max)
+    self.assertEqual(17, config.keep_checkpoint_every_n_hours)
+
+  def test_replace_none_value(self):
+    config = run_config_lib.RunConfig().replace(
+        tf_random_seed=None,
+        model_dir=None,
+        save_summary_steps=None,
+        save_checkpoints_secs=None,
+        save_checkpoints_steps=None,
+        session_config=None,
+        keep_checkpoint_max=None,
+        keep_checkpoint_every_n_hours=None)
+    self.assertIsNone(config.tf_random_seed)
+    self.assertIsNone(config.model_dir)
+    self.assertIsNone(config.save_summary_steps)
+    self.assertIsNone(config.save_checkpoints_secs)
+    self.assertIsNone(config.save_checkpoints_steps)
+    self.assertIsNone(config.session_config)
+    self.assertIsNone(config.keep_checkpoint_max)
+    self.assertIsNone(config.keep_checkpoint_every_n_hours)
+
+  def test_replace_with_disallowallowed_properties(self):
+    config = run_config_lib.RunConfig()
+    with self.assertRaises(ValueError):
+      # tf_random_seed is not allowed to be replaced.
+      config.replace(master='_master')
+    with self.assertRaises(ValueError):
+      config.replace(some_undefined_property=123)
+
+  def test_replace(self):
+    config = run_config_lib.RunConfig()
+
+    with self.assertRaisesRegexp(
+        ValueError, _NOT_SUPPORTED_REPLACE_PROPERTY_MSG):
+      # master is not allowed to be replaced.
+      config.replace(master=_MASTER)
+
+    with self.assertRaisesRegexp(
+        ValueError, _NOT_SUPPORTED_REPLACE_PROPERTY_MSG):
+      config.replace(some_undefined_property=_MASTER)
+
+  def test_replace_invalid_values(self):
+    config = run_config_lib.RunConfig()
+
+    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
+      config.replace(model_dir='')
+    with self.assertRaisesRegexp(ValueError, _SAVE_SUMMARY_STEPS_ERR):
+      config.replace(save_summary_steps=-1)
+    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_STEPS_ERR):
+      config.replace(save_checkpoints_steps=-1)
+    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_SECS_ERR):
+      config.replace(save_checkpoints_secs=-1)
+    with self.assertRaisesRegexp(ValueError, _SESSION_CONFIG_ERR):
+      config.replace(session_config={})
+    with self.assertRaisesRegexp(ValueError, _KEEP_CKPT_MAX_ERR):
+      config.replace(keep_checkpoint_max=-1)
+    with self.assertRaisesRegexp(ValueError, _KEEP_CKPT_HOURS_ERR):
+      config.replace(keep_checkpoint_every_n_hours=0)
+    with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
+      config.replace(tf_random_seed=1.0)
+
+
+class RunConfigSaveCheckpointsTest(test.TestCase):
+
+  def test_save_checkpoint(self):
+    empty_config = run_config_lib.RunConfig()
+    self.assertEqual(600, empty_config.save_checkpoints_secs)
+    self.assertIsNone(empty_config.save_checkpoints_steps)
+
+    config_with_steps = empty_config.replace(save_checkpoints_steps=100)
+    del empty_config
+    self.assertEqual(100, config_with_steps.save_checkpoints_steps)
+    self.assertIsNone(config_with_steps.save_checkpoints_secs)
+
+    config_with_secs = config_with_steps.replace(save_checkpoints_secs=200)
+    del config_with_steps
+    self.assertEqual(200, config_with_secs.save_checkpoints_secs)
+    self.assertIsNone(config_with_secs.save_checkpoints_steps)
+
+  def test_save_checkpoint_both_steps_and_secs_are_not_none(self):
+    empty_config = run_config_lib.RunConfig()
+    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_ERR):
+      empty_config.replace(save_checkpoints_steps=100,
+                           save_checkpoints_secs=200)
+
+  def test_save_checkpoint_both_steps_and_secs_are_none(self):
+    config_with_secs = run_config_lib.RunConfig()
+    config_without_ckpt = config_with_secs.replace(
+        save_checkpoints_steps=None, save_checkpoints_secs=None)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
+
+  def test_save_checkpoint_flip_secs_to_none(self):
+    config_with_secs = run_config_lib.RunConfig()
+    config_without_ckpt = config_with_secs.replace(save_checkpoints_secs=None)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
+
+  def test_save_checkpoint_flip_steps_to_none(self):
+    config_with_steps = run_config_lib.RunConfig().replace(
+        save_checkpoints_steps=100)
+    config_without_ckpt = config_with_steps.replace(save_checkpoints_steps=None)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
+    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..673111be9a45b0dd8f3b1b4c0d35da1da6ac67ed
--- /dev/null
+++ b/tensorflow/python/feature_column/BUILD
@@ -0,0 +1,85 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+    features = [
+        "-layering_check",
+        "-parse_headers",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "feature_column_py",
+    srcs = ["feature_column_lib.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_column",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "feature_column",
+    srcs = ["feature_column.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+filegroup(
+    name = "vocabulary_testdata",
+    srcs = [
+        "testdata/embedding.ckpt.data-00000-of-00001",
+        "testdata/embedding.ckpt.index",
+        "testdata/embedding.ckpt.meta",
+        "testdata/warriors_vocabulary.txt",
+        "testdata/wire_vocabulary.txt",
+    ],
+)
+
+py_test(
+    name = "feature_column_test",
+    srcs = ["feature_column_test.py"],
+    data = [":vocabulary_testdata"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":feature_column_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:inputs",
+    ],
+)
diff --git a/third_party/nccl/BUILD b/tensorflow/python/feature_column/__init__.py
similarity index 100%
rename from third_party/nccl/BUILD
rename to tensorflow/python/feature_column/__init__.py
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d7923ea6119b562587687f90dfdb2d8f7488a9f
--- /dev/null
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -0,0 +1,2449 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This API defines FeatureColumn abstraction.
+
+FeatureColumns provide a high level abstraction for ingesting and representing
+features. FeatureColumns are also the primary way of encoding features for
+canned ${tf.estimator.Estimator}s.
+
+When using FeatureColumns with `Estimators`, the type of feature column you
+should choose depends on (1) the feature type and (2) the model type.
+
+1. Feature type:
+
+  * Continuous features can be represented by `numeric_column`.
+  * Categorical features can be represented by any `categorical_column_with_*`
+  column:
+    - `categorical_column_with_vocabulary_list`
+    - `categorical_column_with_vocabulary_file`
+    - `categorical_column_with_hash_bucket`
+    - `categorical_column_with_identity`
+    - `weighted_categorical_column`
+
+2. Model type:
+
+  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
+
+    Continuous features can be directly fed into deep neural network models.
+
+      age_column = numeric_column("age")
+
+    To feed sparse features into DNN models, wrap the column with
+    `embedding_column` or `indicator_column`. `indicator_column` is recommended
+    for features with only a few possible values. For features with many
+    possible values, to reduce the size of your model, `embedding_column` is
+    recommended.
+
+      embedded_dept_column = embedding_column(
+          categorical_column_with_vocabulary_list(
+              "department", ["math", "philosphy", ...]), dimension=10)
+
+  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
+
+    Sparse features can be fed directly into linear models. They behave like an
+    indicator column but with an efficient implementation.
+
+      dept_column = categorical_column_with_vocabulary_list("department",
+          ["math", "philosophy", "english"])
+
+    It is recommended that continuous features be bucketized before being
+    fed into linear models.
+
+      bucketized_age_column = bucketized_column(
+          source_column=age_column,
+          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
+
+    Sparse features can be crossed (also known as conjuncted or combined) in
+    order to form non-linearities, and then fed into linear models.
+
+      cross_dept_age_column = crossed_column(
+          columns=["department", bucketized_age_column],
+          hash_bucket_size=1000)
+
+Example of building canned `Estimator`s using FeatureColumns:
+
+  ```python
+  # Define features and transformations
+  deep_feature_columns = [age_column, embedded_dept_column]
+  wide_feature_columns = [dept_column, bucketized_age_column,
+      cross_dept_age_column]
+
+  # Build deep model
+  estimator = DNNClassifier(
+      feature_columns=deep_feature_columns,
+      hidden_units=[500, 250, 50])
+  estimator.train(...)
+
+  # Or build a wide model
+  estimator = LinearClassifier(
+      feature_columns=wide_feature_columns)
+  estimator.train(...)
+
+  # Or build a wide and deep model!
+  estimator = DNNLinearCombinedClassifier(
+      linear_feature_columns=wide_feature_columns,
+      dnn_feature_columns=deep_feature_columns,
+      dnn_hidden_units=[500, 250, 50])
+  estimator.train(...)
+  ```
+
+
+FeatureColumns can also be transformed into a generic input layer for
+custom models using `input_layer`.
+
+Example of building model using FeatureColumns, this can be used in a
+`model_fn` which is given to the {tf.estimator.Estimator}:
+
+  ```python
+  # Building model via layers
+
+  deep_feature_columns = [age_column, embedded_dept_column]
+  columns_to_tensor = parse_feature_columns_from_examples(
+      serialized=my_data,
+      feature_columns=deep_feature_columns)
+  first_layer = input_layer(
+      features=columns_to_tensor,
+      feature_columns=deep_feature_columns)
+  second_layer = fully_connected(first_layer, ...)
+  ```
+
+NOTE: Functions prefixed with "_" indicate experimental or private parts of
+the API subject to change, and should not be relied upon!
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import collections
+import math
+
+import numpy as np
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.util import nest
+
+
+def input_layer(features,
+                feature_columns,
+                weight_collections=None,
+                trainable=True):
+  """Returns a dense `Tensor` as input layer based on given `feature_columns`.
+
+  Generally a single example in training data is described with FeatureColumns.
+  At the first layer of the model, this column oriented data should be converted
+  to a single `Tensor`.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  keywords_embedded = embedding_column(
+      categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
+  columns = [price, keywords_embedded, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  for units in [128, 64, 32]:
+    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
+  prediction = tf.layers.dense(dense_tensor, 1)
+  ```
+
+  Args:
+    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at 'price' key in
+      this dict. Values can be a `SparseTensor` or a `Tensor` depends on
+      corresponding `_FeatureColumn`.
+    feature_columns: An iterable containing the FeatureColumns to use as inputs
+      to your model. All items should be instances of classes derived from
+      `_DenseColumn` such as `numeric_column`, `embedding_column`,
+      `bucketized_column`, `indicator_column`. If you have categorical features,
+      you can wrap them with an `embedding_column` or `indicator_column`.
+    weight_collections: A list of collection names to which the Variable will be
+      added. Note that, variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+
+  Returns:
+    A `Tensor` which represents input layer of a model. Its shape
+    is (batch_size, first_layer_dimension) and its dtype is `float32`.
+    first_layer_dimension is determined based on given `feature_columns`.
+
+  Raises:
+    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
+  """
+  _check_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, _DenseColumn):
+      raise ValueError(
+          'Items of feature_columns must be a _DenseColumn. '
+          'You can wrap a categorical column with an '
+          'embedding_column or indicator_column. Given: {}'.format(column))
+  weight_collections = list(weight_collections or [])
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+  with variable_scope.variable_scope(
+      None, default_name='input_layer', values=features.values()):
+    builder = _LazyBuilder(features)
+    output_tensors = []
+    ordered_columns = []
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      with variable_scope.variable_scope(None, default_name=column.name):
+        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
+        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
+        batch_size = array_ops.shape(tensor)[0]
+        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
+        output_tensors.append(tensor)
+    _verify_static_batch_size_equality(output_tensors, ordered_columns)
+    return array_ops.concat(output_tensors, 1)
+
+
+def linear_model(features,
+                 feature_columns,
+                 units=1,
+                 sparse_combiner='sum',
+                 weight_collections=None,
+                 trainable=True):
+  """Returns a linear prediction `Tensor` based on given `feature_columns`.
+
+  This function generates a weighted sum based on output dimension `units`.
+  Weighted sum refers to logits in classification problems. It refers to the
+  prediction itself for linear regression problems.
+
+  Note on supported columns: `linear_model` treats categorical columns as
+  `indicator_column`s while `input_layer` explicitly requires wrapping each
+  of them with an `embedding_column` or an `indicator_column`.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
+  keywords = categorical_column_with_hash_bucket("keywords", 10K)
+  keywords_price = crossed_column('keywords', price_buckets, ...)
+  columns = [price_buckets, keywords, keywords_price ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  prediction = linear_model(features, columns)
+  ```
+
+  Args:
+    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at 'price' key in
+      this dict. Values are `Tensor` or `SparseTensor` depending on
+      corresponding `_FeatureColumn`.
+    feature_columns: An iterable containing the FeatureColumns to use as inputs
+      to your model. All items should be instances of classes derived from
+      `_FeatureColumn`s.
+    units: An integer, dimensionality of the output space. Default value is 1.
+    sparse_combiner: A string specifying how to reduce if a sparse column is
+      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum"
+      the default. "sqrtn" often achieves good accuracy, in particular with
+      bag-of-words columns. It combines each sparse columns independently.
+        * "sum": do not normalize features in the column
+        * "mean": do l1 normalization on features in the column
+        * "sqrtn": do l2 normalization on features in the column
+    weight_collections: A list of collection names to which the Variable will be
+      added. Note that, variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+
+  Returns:
+    A `Tensor` which represents predictions/logits of a linear model. Its shape
+    is (batch_size, units) and its dtype is `float32`.
+
+  Raises:
+    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
+      nor `_CategoricalColumn`.
+  """
+  _check_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
+      raise ValueError('Items of feature_columns must be either a _DenseColumn '
+                       'or _CategoricalColumn. Given: {}'.format(column))
+  weight_collections = list(weight_collections or [])
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+  with variable_scope.variable_scope(
+      None, default_name='linear_model', values=features.values()):
+    weighted_sums = []
+    ordered_columns = []
+    builder = _LazyBuilder(features)
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      with variable_scope.variable_scope(None, default_name=column.name):
+        ordered_columns.append(column)
+        if isinstance(column, _CategoricalColumn):
+          weighted_sums.append(_create_categorical_column_weighted_sum(
+              column, builder, units, sparse_combiner, weight_collections,
+              trainable))
+        else:
+          weighted_sums.append(_create_dense_column_weighted_sum(
+              column, builder, units, weight_collections, trainable))
+    _verify_static_batch_size_equality(weighted_sums, ordered_columns)
+    predictions_no_bias = math_ops.add_n(
+        weighted_sums, name='weighted_sum_no_bias')
+    bias = variable_scope.get_variable(
+        'bias_weights',
+        shape=[units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
+    predictions = nn_ops.bias_add(
+        predictions_no_bias, bias, name='weighted_sum')
+
+    return predictions
+
+
+def _transform_features(features, feature_columns):
+  """Returns transformed features based on features columns passed in.
+
+  Please note that most probably you would not need to use this function. Please
+  check `input_layer` and `linear_model` to see whether they will
+  satisfy your use case or not.
+
+  Example:
+
+  ```python
+  # Define features and transformations
+  crosses_a_x_b = crossed_column(
+      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
+  price_buckets = bucketized_column(
+      source_column=numeric_column("price"), boundaries=[...])
+
+  columns = [crosses_a_x_b, price_buckets]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  transformed = transform_features(features=features, feature_columns=columns)
+
+  assertCountEqual(columns, transformed.keys())
+  ```
+
+  Args:
+    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at 'price' key in
+      this dict. Values can be a `SparseTensor` or a `Tensor` depends on
+      corresponding `_FeatureColumn`.
+    feature_columns: An iterable containing all the `_FeatureColumn`s.
+
+  Returns:
+    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
+  """
+  _check_feature_columns(feature_columns)
+  outputs = {}
+  with ops.name_scope(
+      None, default_name='transform_features', values=features.values()):
+    builder = _LazyBuilder(features)
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      with ops.name_scope(None, default_name=column.name):
+        outputs[column] = builder.get(column)
+  return outputs
+
+
+def make_parse_example_spec(feature_columns):
+  """Creates parsing spec dictionary from input feature_columns.
+
+  The returned dictionary can be used as arg 'features' in `tf.parse_example`.
+
+  Typical usage example:
+
+  ```python
+  # Define features and transformations
+  feature_b = numeric_column(...)
+  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
+  feature_a_x_feature_c = crossed_column(
+      columns=["feature_a", feature_c_bucketized], ...)
+
+  feature_columns = set(
+      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
+  features = tf.parse_example(
+      serialized=serialized_examples,
+      features=make_parse_example_spec(feature_columns))
+  ```
+
+  For the above example, make_parse_example_spec would return the dict:
+  {
+    "feature_a": parsing_ops.VarLenFeature(tf.string),
+    "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
+    "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
+  }
+
+  Args:
+    feature_columns: An iterable containing all feature columns. All items
+      should be instances of classes derived from `_FeatureColumn`.
+
+  Returns:
+    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
+    value.
+
+  Raises:
+    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
+      instance.
+  """
+  result = {}
+  for column in feature_columns:
+    if not isinstance(column, _FeatureColumn):
+      raise ValueError(
+          'All feature_columns must be _FeatureColumn instances. '
+          'Given: {}'.format(column))
+    config = column._parse_example_spec  # pylint: disable=protected-access
+    for key, value in six.iteritems(config):
+      if key in result and value != result[key]:
+        raise ValueError(
+            'feature_columns contain different parse_spec for key '
+            '{}. Given {} and {}'.format(key, value, result[key]))
+    result.update(config)
+  return result
+
+
+def embedding_column(
+    categorical_column, dimension, combiner='mean', initializer=None,
+    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
+    trainable=True):
+  """`_DenseColumn` that converts from sparse, categorical input.
+
+  Use this when your inputs are sparse, but you want to convert them to a dense
+  representation (e.g., to feed to a DNN).
+
+  Inputs must be a `_CategoricalColumn` created by any of the
+  `categorical_column_*` function. Here is an example embedding of an identity
+  column for a DNN model:
+
+  ```python
+  video_id = categorical_column_with_identity(
+      key='video_id', num_buckets=1000000, default_value=0)
+  columns = [embedding_column(video_id, 9),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    categorical_column: A `_CategoricalColumn` created by a
+      `categorical_column_with_*` function. This column produces the sparse IDs
+      that are inputs to the embedding lookup.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries
+      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+      with bag-of-words columns. Each of this can be thought as example level
+      normalizations on the column. For more information, see
+      `tf.embedding_lookup_sparse`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+    ckpt_to_load_from: String representing checkpoint name/pattern from which to
+      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
+    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
+      which to restore the column weights. Required if `ckpt_to_load_from` is
+      not `None`.
+    max_norm: If not `None`, embedding values are l2-normalized to this value.
+    trainable: Whether or not the embedding is trainable. Default is True.
+
+  Returns:
+    `_DenseColumn` that converts from sparse input.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
+      is specified.
+    ValueError: if `initializer` is specified and is not callable.
+  """
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
+    raise ValueError('Must specify both `ckpt_to_load_from` and '
+                     '`tensor_name_in_ckpt` or none of them.')
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified. '
+                     'Embedding of column_name: {}'.format(
+                         categorical_column.name))
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1 / math.sqrt(dimension))
+
+  return _EmbeddingColumn(
+      categorical_column=categorical_column,
+      dimension=dimension,
+      combiner=combiner,
+      initializer=initializer,
+      ckpt_to_load_from=ckpt_to_load_from,
+      tensor_name_in_ckpt=tensor_name_in_ckpt,
+      max_norm=max_norm,
+      trainable=trainable)
+
+
+def numeric_column(key,
+                   shape=(1,),
+                   default_value=None,
+                   dtype=dtypes.float32,
+                   normalizer_fn=None):
+  """Represents real valued or numerical features.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  columns = [price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+
+  # or
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  columns = [bucketized_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    shape: An iterable of integers specifies the shape of the `Tensor`. An
+      integer can be given which means a single dimension `Tensor` with given
+      width. The `Tensor` representing the column will have the shape of
+      [batch_size] + `shape`.
+    default_value: A single value compatible with `dtype` or an iterable of
+      values compatible with `dtype` which the column takes on during
+      `tf.Example` parsing if data is missing. A default value of `None` will
+      cause `tf.parse_example` to fail if an example does not contain this
+      column. If a single value is provided, the same value will be applied as
+      the default value for every item. If an iterable of values is provided,
+      the shape of the `default_value` should be equal to the given `shape`.
+    dtype: defines the type of values. Default value is `tf.float32`. Must be a
+      non-quantized, real integer or floating point type.
+    normalizer_fn: If not `None`, a function that can be used to normalize the
+      value of the tensor after `default_value` is applied for parsing.
+      Normalizer function takes the input `Tensor` as its argument, and returns
+      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
+      even though the most common use case of this function is normalization, it
+      can be used for any kind of Tensorflow transformations.
+
+  Returns:
+    A `_NumericColumn`.
+
+  Raises:
+    TypeError: if any dimension in shape is not an int
+    ValueError: if any dimension in shape is not a positive integer
+    TypeError: if `default_value` is an iterable but not compatible with `shape`
+    TypeError: if `default_value` is not compatible with `dtype`.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  shape = _check_shape(shape, key)
+  if not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype must be convertible to float. '
+                     'dtype: {}, key: {}'.format(dtype, key))
+  default_value = _check_default_value(shape, default_value, dtype, key)
+
+  if normalizer_fn is not None and not callable(normalizer_fn):
+    raise TypeError(
+        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
+
+  return _NumericColumn(
+      key,
+      shape=shape,
+      default_value=default_value,
+      dtype=dtype,
+      normalizer_fn=normalizer_fn)
+
+
+def bucketized_column(source_column, boundaries):
+  """Represents discretized dense input.
+
+  Buckets include the left boundary, and exclude the right boundary. Namely,
+  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
+  `[1., 2.)`, and `[2., +inf)`.
+
+  For example, if the inputs are
+    `boundaries` = [0, 10, 100]
+    input tensor = [[-5, 10000]
+                    [150,   10]
+                    [5,    100]]
+
+  then the output will be
+    output = [[0, 3]
+              [3, 2]
+              [1, 3]]
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  columns = [bucketized_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+
+  # or
+  columns = [bucketized_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  `bucketized_column` can also be crossed with another categorical column using
+  `crossed_column`:
+  ```python
+  price = numeric_column('price')
+  # bucketized_column converts numerical feature to a categorical one.
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  # 'keywords' is a string feature.
+  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K)
+  columns = [price_x_keywords, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Args:
+    source_column: A one-dimensional dense column which is generated with
+      `numeric_column`.
+    boundaries: A sorted list or tuple of floats specifying the boundaries.
+
+  Returns:
+    A `_BucketizedColumn`.
+
+  Raises:
+    ValueError: If `source_column` is not a numeric column, or if it is not
+      one-dimensional.
+    ValueError: If `boundaries` is not a sorted list or tuple.
+  """
+  if not isinstance(source_column, _NumericColumn):
+    raise ValueError(
+        'source_column must be a column generated with numeric_column(). '
+        'Given: {}'.format(source_column))
+  if len(source_column.shape) > 1:
+    raise ValueError(
+        'source_column must be one-dimensional column. '
+        'Given: {}'.format(source_column))
+  if (not boundaries or
+      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
+    raise ValueError('boundaries must be a sorted list.')
+  for i in range(len(boundaries) - 1):
+    if boundaries[i] >= boundaries[i + 1]:
+      raise ValueError('boundaries must be a sorted list.')
+  return _BucketizedColumn(source_column, tuple(boundaries))
+
+
+def _assert_string_or_int(dtype, prefix):
+  if (dtype != dtypes.string) and (not dtype.is_integer):
+    raise ValueError(
+        '{} dtype must be string or integer. dtype: {}.'.format(prefix, dtype))
+
+
+def categorical_column_with_hash_bucket(key,
+                                        hash_bucket_size,
+                                        dtype=dtypes.string):
+  """Represents sparse feature where ids are set by hashing.
+
+  Use this when your sparse features are in string or integer format, and you
+  want to distribute your inputs into a finite number of buckets by hashing.
+  output_id = Hash(input_feature_string) % bucket_size
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string. Note that these values are independent of the
+  `default_value` argument.
+
+  Example:
+
+  ```python
+  keywords = categorical_column_with_hash_bucket("keywords", 10K)
+  columns = [keywords, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+
+  # or
+  keywords_embedded = embedding_column(keywords, 16)
+  columns = [keywords_embedded, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    hash_bucket_size: An int > 1. The number of buckets.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_HashedCategoricalColumn`.
+
+  Raises:
+    ValueError: `hash_bucket_size` is not greater than 1.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  if hash_bucket_size is None:
+    raise ValueError('hash_bucket_size must be set. ' 'key: {}'.format(key))
+
+  if hash_bucket_size < 1:
+    raise ValueError('hash_bucket_size must be at least 1. '
+                     'hash_bucket_size: {}, key: {}'.format(
+                         hash_bucket_size, key))
+
+  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+
+  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
+
+
+def categorical_column_with_vocabulary_file(
+    key, vocabulary_file, vocabulary_size, num_oov_buckets=0,
+    default_value=None, dtype=dtypes.string):
+  """A `_CategoricalColumn` with a vocabulary file.
+
+  Use this when your inputs are in string or integer format, and you have a
+  vocabulary file that maps each value to an integer ID. By default,
+  out-of-vocabulary values are ignored. Use either (but not both) of
+  `num_oov_buckets` and `default_value` to specify how to include
+  out-of-vocabulary values.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string. Note that these values are independent of the
+  `default_value` argument.
+
+  Example with `num_oov_buckets`:
+  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
+  abbreviation. All inputs with values in that file are assigned an ID 0-49,
+  corresponding to its line number. All other values are hashed and assigned an
+  ID 50-54.
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
+      num_oov_buckets=5)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Example with `default_value`:
+  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
+  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
+  in input, and other values missing from the file, will be assigned ID 0. All
+  others are assigned the corresponding line number 1-50.
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
+      default_value=0)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  And to make an embedding with either:
+  ```python
+  columns = [embedding_column(states, 3),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    vocabulary_file: The vocabulary file name.
+    vocabulary_size: Number of the elements in the vocabulary. This must be no
+      greater than length of `vocabulary_file`, if less than length, later
+      values are ignored.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
+      the input value. A positive `num_oov_buckets` can not be specified with
+      `default_value`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_CategoricalColumn` with a vocabulary file.
+
+  Raises:
+    ValueError: `vocabulary_file` is missing.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is not a non-negative integer.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  if not vocabulary_file:
+    raise ValueError('Missing vocabulary_file in {}.'.format(key))
+  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
+  if (vocabulary_size is None) or (vocabulary_size < 1):
+    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
+  if num_oov_buckets:
+    if default_value is not None:
+      raise ValueError(
+          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
+              key))
+    if num_oov_buckets < 0:
+      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
+          num_oov_buckets, key))
+  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+  return _VocabularyFileCategoricalColumn(
+      key=key,
+      vocabulary_file=vocabulary_file,
+      vocabulary_size=vocabulary_size,
+      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
+      default_value=-1 if default_value is None else default_value,
+      dtype=dtype)
+
+
+def categorical_column_with_vocabulary_list(
+    key, vocabulary_list, dtype=None, default_value=-1):
+  """A `_CategoricalColumn` with in-memory vocabulary.
+
+  Logic for feature f is:
+  id = vocabulary_list.index_of(f) if f in vocabulary_list else default_value
+
+  Use this when your inputs are in string or integer format, and you have an
+  in-memory vocabulary mapping each value to an integer ID. By default,
+  out-of-vocabulary values are ignored. Use `default_value` to specify how to
+  include out-of-vocabulary values.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string. Note that these values are independent of the
+  `default_value` argument.
+
+  In the following examples, each input in `vocabulary_list` is assigned an ID
+  0-4 corresponding to its index (e.g., input 'B' produces output 2). All other
+  inputs are assigned `default_value` 0.
+
+  Linear model:
+  ```python
+  colors = categorical_column_with_vocabulary_list(
+      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
+  columns = [colors, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  Embedding for a DNN model:
+  ```python
+  columns = [embedding_column(colors, 3),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
+      is mapped to the index of its value (if present) in `vocabulary_list`.
+      Must be castable to `dtype`.
+    dtype: The type of features. Only string and integer types are supported.
+      If `None`, it will be inferred from `vocabulary_list`.
+    default_value: The value to use for values not in `vocabulary_list`.
+
+  Returns:
+    A `_CategoricalColumn` with in-memory vocabulary.
+
+  Raises:
+    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
+    ValueError: if `dtype` is not integer or string.
+  """
+  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
+    raise ValueError(
+        'vocabulary_list {} must be non-empty, column_name: {}'.format(
+            vocabulary_list, key))
+  if len(set(vocabulary_list)) != len(vocabulary_list):
+    raise ValueError(
+        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
+            vocabulary_list, key))
+  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
+  _assert_string_or_int(
+      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
+  if dtype is None:
+    dtype = vocabulary_dtype
+  elif dtype.is_integer != vocabulary_dtype.is_integer:
+    raise ValueError(
+        'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
+            dtype, vocabulary_dtype, key))
+  _assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
+
+  return _VocabularyListCategoricalColumn(
+      key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
+      default_value=default_value)
+
+
+def categorical_column_with_identity(key, num_buckets, default_value=None):
+  """A `_CategoricalColumn` that returns identity values.
+
+  Use this when your inputs are integers in the range `[0, num_buckets)`, and
+  you want to use the input value itself as the categorical ID. Values outside
+  this range will result in `default_value` if specified, otherwise it will
+  fail.
+
+  Typically, this is used for contiguous ranges of integer indexes, but
+  it doesn't have to be. This might be inefficient, however, if many of IDs
+  are unused. Consider `categorical_column_with_hash_bucket` in that case.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string. Note that these values are independent of the
+  `default_value` argument.
+
+  In the following examples, each input in the range `[0, 1000000)` is assigned
+  the same value. All other inputs are assigned `default_value` 0. Note that a
+  literal 0 in inputs will result in the same default ID.
+
+  Linear model:
+  ```python
+  video_id = categorical_column_with_identity(
+      key='video_id', num_buckets=1000000, default_value=0)
+  columns = [video_id, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  Embedding for a DNN model:
+  ```python
+  columns = [embedding_column(video_id, 9),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
+    default_value: If `None`, this column's graph operations will fail for
+      out-of-range inputs. Otherwise, this value must be in the range
+      `[0, num_buckets)`, and will replace inputs in that range.
+
+  Returns:
+    A `_CategoricalColumn` that returns identity values.
+
+  Raises:
+    ValueError: if `num_buckets` is less than one.
+    ValueError: if `default_value` is not in range `[0, num_buckets)`.
+  """
+  if num_buckets < 1:
+    raise ValueError(
+        'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
+  if (default_value is not None) and (
+      (default_value < 0) or (default_value >= num_buckets)):
+    raise ValueError(
+        'default_value {} not in range [0, {}), column_name {}'.format(
+            default_value, num_buckets, key))
+  return _IdentityCategoricalColumn(
+      key=key, num_buckets=num_buckets, default_value=default_value)
+
+
+def indicator_column(categorical_column):
+  """Represents multi-hot representation of given categorical column.
+
+  Used to wrap any `categorical_column_*` (e.g., to feed to DNN). Use
+  `embedding_column` if the inputs are sparse.
+
+  ```python
+  name = indicator_column(categorical_column_with_vocabulary_list('name',
+      ['bob', 'george', 'wanda'])
+  columns = [name, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+
+  dense_tensor == [[1, 0, 0]]  # If "name" bytes_list is ["bob"]
+  dense_tensor == [[1, 0, 1]]  # If "name" bytes_list is ["bob", "wanda"]
+  dense_tensor == [[2, 0, 0]]  # If "name" bytes_list is ["bob", "bob"]
+  ```
+
+  Args:
+    categorical_column: A `_CategoricalColumn` which is created by
+      `categorical_column_with_*` or `crossed_column` functions.
+
+  Returns:
+    An `_IndicatorColumn`.
+  """
+  return _IndicatorColumn(categorical_column)
+
+
+def weighted_categorical_column(
+    categorical_column, weight_feature_key, dtype=dtypes.float32):
+  """Applies weight values to a `_CategoricalColumn`.
+
+  Use this when each of your sparse inputs has both an ID and a value. For
+  example, if you're representing text documents as a collection of word
+  frequencies, you can provide 2 parallel sparse input features ('terms' and
+  'frequencies' below).
+
+  Example:
+
+  Input `tf.Example` objects:
+  [
+    features {
+      feature {
+        key: "terms"
+        value {bytes_list {value: "very" value: "model"}}
+      }
+      feature {
+        key: "frequencies"
+        value {float_list {value: 0.3 value: 0.1}}
+      }
+    },
+    features {
+      feature {
+        key: "terms"
+        value {bytes_list {value: "when" value: "course" value: "human"}}
+      }
+      feature {
+        key: "frequencies"
+        value {float_list {value: 0.4 value: 0.1 value: 0.2}}
+      }
+    }
+  ]
+
+  ```python
+  categorical_column = categorical_column_with_hash_bucket(
+      column_name='terms', hash_bucket_size=1000)
+  weighted_column = weighted_categorical_column(
+      categorical_column=categorical_column, weight_feature_key='frequencies')
+  columns = [weighted_column, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  This assumes the input dictionary contains a `SparseTensor` for key
+  'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have
+  the same indices and dense shape.
+
+  Args:
+    categorical_column: A `_CategoricalColumn` created by
+      `categorical_column_with_*` functions.
+    weight_feature_key: String key for weight values.
+    dtype: Type of weights, such as `tf.float32`. Only float and integer weights
+      are supported.
+
+  Returns:
+    A `_CategoricalColumn` composed of two sparse features: one represents id,
+    the other represents weight (value) of the id feature in that example.
+
+  Raises:
+    ValueError: if `dtype` is not convertible to float.
+  """
+  if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype {} is not convertible to float.'.format(dtype))
+  return _WeightedCategoricalColumn(
+      categorical_column=categorical_column,
+      weight_feature_key=weight_feature_key,
+      dtype=dtype)
+
+
+def crossed_column(keys, hash_bucket_size, hash_key=None):
+  """Returns a column for performing crosses of categorical features.
+
+  Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
+  the transformation can be thought of as:
+    Hash(cartesian product of features) % `hash_bucket_size`
+
+  For example, if the input features are:
+  * SparseTensor referred by first key: shape = [2, 2]
+      [0, 0]: "a"
+      [1, 0]: "b"
+      [1, 1]: "c"
+
+  * SparseTensor referred by second key: shape = [2, 1]
+      [0, 0]: "d"
+      [1, 0]: "e"
+
+  then crossed feature will look like:
+      shape = [2, 2]
+      [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
+      [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
+      [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
+
+  Here is an example to create a linear model with crosses of string features:
+  ```python
+  keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50K)
+  columns = [keywords_x_doc_terms, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  You could also use vocabulary lookup before crossing:
+  ```python
+  keywords = categorical_column_with_vocabulary_file(
+      'keywords', '/path/to/vocabulary/file', vocabulary_size=1K)
+  keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50K)
+  columns = [keywords_x_doc_terms, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  If an input feature is of numeric type, you can use
+  `categorical_column_with_identity`, or `bucketized_column`, as in the example:
+  ```python
+  # vertical_id is an integer categorical feature.
+  vertical_id = categorical_column_with_identity('vertical_id', 10K)
+  price = numeric_column('price')
+  # bucketized_column converts numerical feature to a categorical one.
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K)
+  columns = [vertical_id_x_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  To use crossed column in DNN model, you need to add it in an embedding column
+  as in this example:
+  ```python
+  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K)
+  vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
+  dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
+  ```
+
+  Args:
+    keys: An iterable identifying the features to be crossed. Each element can
+      be either:
+      * string: Will use the corresponding feature which must be of string type.
+      * `_CategoricalColumn`: Will use the transformed tensor produced by this
+        column. Does not support hashed categorical column.
+    hash_bucket_size: An int > 1. The number of buckets.
+    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+      function to combine the crosses fingerprints on SparseCrossOp (optional).
+
+  Returns:
+    A `_CrossedColumn`.
+
+  Raises:
+    ValueError: If `len(keys) < 2`.
+    ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
+    ValueError: If any of the keys is `_HashedCategoricalColumn`.
+    ValueError: If `hash_bucket_size < 1`.
+  """
+  if not hash_bucket_size or hash_bucket_size < 1:
+    raise ValueError('hash_bucket_size must be > 1. '
+                     'hash_bucket_size: {}'.format(hash_bucket_size))
+  if not keys or len(keys) < 2:
+    raise ValueError(
+        'keys must be a list with length > 1. Given: {}'.format(keys))
+  for key in keys:
+    if (not isinstance(key, six.string_types) and
+        not isinstance(key, _CategoricalColumn)):
+      raise ValueError(
+          'Unsupported key type. All keys must be either string, or '
+          'categorical column except _HashedCategoricalColumn. '
+          'Given: {}'.format(key))
+    if isinstance(key, _HashedCategoricalColumn):
+      raise ValueError(
+          '_HashedCategoricalColumn is not supported. Instead, use the feature '
+          'name as a string. Given: {}'.format(key))
+  return _CrossedColumn(
+      keys=tuple(keys), hash_bucket_size=hash_bucket_size,
+      hash_key=hash_key)
+
+
+class _FeatureColumn(object):
+  """Represents a feature column abstraction.
+
+  WARNING: Do not subclass this layer unless you know what you are doing:
+  the API is subject to future changes.
+
+  To distinguish the concept of a feature family and a specific binary feature
+  within a family, we refer to a feature family like "country" as a feature
+  column. Following is an example feature in a `tf.Example` format:
+    {key: "country",  value: [ "US" ]}
+  In this example the value of feature is "US" and "country" refers to the
+  column of the feature.
+
+  This class is an abstract class. User should not create instances of this.
+  """
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractproperty
+  def name(self):
+    """Returns string. used for variable_scope and naming."""
+    pass
+
+  @abc.abstractmethod
+  def _transform_feature(self, inputs):
+    """Returns intermediate representation (usually a `Tensor`).
+
+    Uses `inputs` to create an intermediate representation (usually a `Tensor`)
+    that other feature columns can use.
+
+    Example usage of `inputs`:
+    Let's say a Feature column depends on raw feature ('raw') and another
+    `_FeatureColumn` (input_fc). To access corresponding `Tensor`s, inputs will
+    be used as follows:
+
+    ```python
+    raw_tensor = inputs.get('raw')
+    fc_tensor = inputs.get(input_fc)
+    ```
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+
+    Returns:
+      Transformed feature `Tensor`.
+    """
+    pass
+
+  @abc.abstractproperty
+  def _parse_example_spec(self):
+    """Returns a `tf.Example` parsing spec as dict.
+
+    It is used for get_parsing_spec for `tf.parse_example`. Returned spec is a
+    dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other
+    supported objects. Please check documentation of ${tf.parse_example} for all
+    supported spec objects.
+
+    Let's say a Feature column depends on raw feature ('raw') and another
+    `_FeatureColumn` (input_fc). One possible implementation of
+    _parse_example_spec is as follows:
+
+    ```python
+    spec = {'raw': tf.FixedLenFeature(...)}
+    spec.update(input_fc._parse_example_spec)
+    return spec
+    ```
+    """
+    pass
+
+
+class _DenseColumn(_FeatureColumn):
+  """Represents a column which can be represented as `Tensor`.
+
+  WARNING: Do not subclass this layer unless you know what you are doing:
+  the API is subject to future changes.
+
+  Some examples of this type are: numeric_column, embedding_column,
+  indicator_column.
+  """
+
+  __metaclass__ = abc.ABCMeta
+
+  @abc.abstractproperty
+  def _variable_shape(self):
+    """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
+    pass
+
+  @abc.abstractmethod
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    """Returns a `Tensor`.
+
+    The output of this function will be used by model-builder-functions. For
+    example the pseudo code of `input_layer` will be like:
+
+    ```python
+    def input_layer(features, feature_columns, ...):
+      outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
+      return tf.concat(outputs)
+    ```
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+      weight_collections: List of graph collections to which Variables (if any
+        will be created) are added.
+      trainable: If `True` also add variables to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.Variable}).
+
+    Returns:
+      `Tensor` of shape [batch_size] + `_variable_shape`.
+    """
+    pass
+
+
+def _create_dense_column_weighted_sum(
+    column, builder, units, weight_collections, trainable):
+  """Create a weighted sum of a dense column for linear_model."""
+  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
+  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
+  batch_size = array_ops.shape(tensor)[0]
+  tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
+  weight = variable_scope.get_variable(
+      name='weights',
+      shape=[num_elements, units],
+      initializer=init_ops.zeros_initializer(),
+      trainable=trainable,
+      collections=weight_collections)
+  return math_ops.matmul(tensor, weight, name='weighted_sum')
+
+
+class _CategoricalColumn(_FeatureColumn):
+  """Represents a categorical feature.
+
+  WARNING: Do not subclass this layer unless you know what you are doing:
+  the API is subject to future changes.
+
+  A categorical feature typically handled with a ${tf.SparseTensor} of IDs.
+  """
+  __metaclass__ = abc.ABCMeta
+
+  IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
+      'IdWeightPair', ['id_tensor', 'weight_tensor'])
+
+  @abc.abstractproperty
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    pass
+
+  @abc.abstractmethod
+  def _get_sparse_tensors(self,
+                          inputs,
+                          weight_collections=None,
+                          trainable=None):
+    """Returns an IdWeightPair.
+
+    `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
+    weights.
+
+    `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
+    `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
+    `SparseTensor` of `float` or `None` to indicate all weights should be
+    taken to be 1. If specified, `weight_tensor` must have exactly the same
+    shape and indices as `sp_ids`. Expected `SparseTensor` is same as parsing
+    output of a `VarLenFeature` which is a ragged matrix.
+
+    Args:
+      inputs: A `LazyBuilder` as a cache to get input tensors required to
+        create `IdWeightPair`.
+      weight_collections: List of graph collections to which variables (if any
+        will be created) are added.
+      trainable: If `True` also add variables to the graph collection
+        `GraphKeys.TRAINABLE_VARIABLES` (see ${tf.get_variable}).
+    """
+    pass
+
+
+def _create_categorical_column_weighted_sum(
+    column, builder, units, sparse_combiner, weight_collections, trainable):
+  """Create a weighted sum of a categorical column for linear_model."""
+  sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
+  id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
+      array_ops.shape(sparse_tensors.id_tensor)[0], -1
+  ])
+  weight_tensor = sparse_tensors.weight_tensor
+  if weight_tensor is not None:
+    weight_tensor = sparse_ops.sparse_reshape(
+        weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
+
+  weight = variable_scope.get_variable(
+      name='weights',
+      shape=(column._num_buckets, units),  # pylint: disable=protected-access
+      initializer=init_ops.zeros_initializer(),
+      trainable=trainable,
+      collections=weight_collections)
+  return _safe_embedding_lookup_sparse(
+      weight,
+      id_tensor,
+      sparse_weights=weight_tensor,
+      combiner=sparse_combiner,
+      name='weighted_sum')
+
+
+class _LazyBuilder(object):
+  """Handles caching of transformations while building the model.
+
+  `_FeatureColumn` specifies how to digest an input column to the network. Some
+  feature columns require data transformations. This class caches those
+  transformations.
+
+  Some features may be used in more than one place. For example, one can use a
+  bucketized feature by itself and a cross with it. In that case we
+  should create only one bucketization op instead of creating ops for each
+  feature column separately. To handle re-use of transformed columns,
+  `_LazyBuilder` caches all previously transformed columns.
+
+  Example:
+  We're trying to use the following `_FeatureColumn`s:
+
+  ```python
+    bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
+    keywords = fc.categorical_column_with_hash_buckets("keywords", ...)
+    age_X_keywords = fc.crossed_column([bucketized_age, "keywords"])
+    ... = linear_model(features,
+                            [bucketized_age, keywords, age_X_keywords]
+  ```
+
+  If we transform each column independently, then we'll get duplication of
+  bucketization (one for cross, one for bucketization itself).
+  The `_LazyBuilder` eliminates this duplication.
+  """
+
+  def __init__(self, features):
+    """Creates a `_LazyBuilder`.
+
+    Args:
+      features: A mapping from feature column to objects that are `Tensor` or
+        `SparseTensor`, or can be converted to same via
+        `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
+        signifies a base feature (not-transformed). A `_FeatureColumn` key
+        means that this `Tensor` is the output of an existing `_FeatureColumn`
+        which can be reused.
+    """
+    self._features = features.copy()
+    self._feature_tensors = {}
+
+  def get(self, key):
+    """Returns a `Tensor` for the given key.
+
+    A `str` key is used to access a base feature (not-transformed). When a
+    `_FeatureColumn` is passed, the transformed feature is returned if it
+    already exists, otherwise the given `_FeatureColumn` is asked to provide its
+    transformed output, which is then cached.
+
+    Args:
+      key: a `str` or a `_FeatureColumn`.
+
+    Returns:
+      The transformed `Tensor` corresponding to the `key`.
+
+    Raises:
+      ValueError: if key is not found or a transformed `Tensor` cannot be
+        computed.
+    """
+    if key in self._feature_tensors:
+      # FeatureColumn is already transformed or converted.
+      return self._feature_tensors[key]
+
+    if key in self._features:
+      feature_tensor = self._maybe_expand_dims(self._features[key])
+      self._feature_tensors[key] = feature_tensor
+      return feature_tensor
+
+    if not isinstance(key, (str, _FeatureColumn)):
+      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
+                      'Provided: {}'.format(key))
+
+    if not isinstance(key, _FeatureColumn):
+      raise ValueError('Feature {} is not in features dictionary.'.format(key))
+
+    column = key
+    logging.debug('Transforming feature_column %s.', column)
+    transformed = column._transform_feature(self)  # pylint: disable=protected-access
+    if transformed is None:
+      raise ValueError('Column {} is not supported.'.format(column.name))
+    self._feature_tensors[column] = transformed
+    return transformed
+
+  def _maybe_expand_dims(self, raw_feature):
+    """Converts the `raw_feature` to (sparse) tensor and maybe expand dim.
+
+    For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
+    the rank is 1. This supports dynamic rank also.
+
+    Args:
+      raw_feature: The raw feature from FeatureColumn.
+
+    Returns:
+      A `Tensor` or `SparseTensor`.
+    """
+    feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
+        raw_feature)
+
+    rank = feature_tensor.get_shape().ndims
+    if (rank is not None) and rank != 1:
+      return feature_tensor
+
+    def expand_dims(input_tensor):
+      # Input_tensor has rank 1.
+      if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
+        return sparse_ops.sparse_reshape(
+            input_tensor, [array_ops.shape(input_tensor)[0], -1])
+      else:
+        return array_ops.expand_dims(input_tensor, -1)
+
+    if rank is None:
+      return control_flow_ops.cond(
+          math_ops.equal(1, array_ops.rank(feature_tensor)),
+          lambda: expand_dims(feature_tensor),
+          lambda: feature_tensor)
+    else:
+      return expand_dims(feature_tensor)
+
+
+# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
+def _shape_offsets(shape):
+  """Returns moving offset for each dimension given shape."""
+  offsets = []
+  for dim in reversed(shape):
+    if offsets:
+      offsets.append(dim * offsets[-1])
+    else:
+      offsets.append(dim)
+  offsets.reverse()
+  return offsets
+
+
+# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
+def _to_sparse_input(input_tensor, ignore_value=None):
+  """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
+
+  If `input_tensor` is already a `SparseTensor`, just return it.
+
+  Args:
+    input_tensor: A string or integer `Tensor`.
+    ignore_value: Entries in `dense_tensor` equal to this value will be
+      absent from the resulting `SparseTensor`. If `None`, default value of
+      `dense_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
+
+  Returns:
+    A `SparseTensor` with the same shape as `input_tensor`.
+
+  Raises:
+    ValueError: when `input_tensor`'s rank is `None`.
+  """
+  input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
+      input_tensor)
+  if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
+    return input_tensor
+  with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
+    input_rank = input_tensor.get_shape().ndims
+    if input_rank is None:
+      # TODO(b/32318825): Implement dense_to_sparse_tensor for undefined rank.
+      raise ValueError('Undefined input_tensor shape.')
+    if ignore_value is None:
+      ignore_value = '' if input_tensor.dtype == dtypes.string else -1
+    dense_shape = math_ops.cast(array_ops.shape(input_tensor), dtypes.int64)
+    indices = array_ops.where(math_ops.not_equal(
+        input_tensor, math_ops.cast(ignore_value, input_tensor.dtype)))
+    # Flattens the tensor and indices for use with gather.
+    flat_tensor = array_ops.reshape(input_tensor, [-1])
+    flat_indices = indices[:, input_rank - 1]
+    # Computes the correct flattened indices for 2d (or higher) tensors.
+    if input_rank > 1:
+      higher_dims = indices[:, :input_rank - 1]
+      shape_offsets = array_ops.stack(
+          _shape_offsets(array_ops.unstack(dense_shape)[1:]))
+      offsets = math_ops.reduce_sum(
+          math_ops.multiply(higher_dims, shape_offsets),
+          reduction_indices=[1])
+      flat_indices = math_ops.add(flat_indices, offsets)
+    values = array_ops.gather(flat_tensor, flat_indices)
+    return sparse_tensor_lib.SparseTensor(indices, values, dense_shape)
+
+
+def _check_feature_columns(feature_columns):
+  """Verifies feature_columns input."""
+  if isinstance(feature_columns, dict):
+    raise ValueError('Expected feature_columns to be iterable, found dict.')
+  for column in feature_columns:
+    if not isinstance(column, _FeatureColumn):
+      raise ValueError('Items of feature_columns must be a _FeatureColumn.'
+                       'Given (type {}): {}.'.format(type(column), column))
+  if not feature_columns:
+    raise ValueError('feature_columns must not be empty.')
+  name_to_column = dict()
+  for column in feature_columns:
+    if column.name in name_to_column:
+      raise ValueError('Duplicate feature column name found for columns: {} '
+                       'and {}. This usually means that these columns refer to '
+                       'same base feature. Either one must be discarded or a '
+                       'duplicated but renamed item must be inserted in '
+                       'features dict.'.format(column,
+                                               name_to_column[column.name]))
+    name_to_column[column.name] = column
+
+
+class _NumericColumn(_DenseColumn,
+                     collections.namedtuple('_NumericColumn', [
+                         'key', 'shape', 'default_value', 'dtype',
+                         'normalizer_fn'
+                     ])):
+  """see `numeric_column`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {
+        self.key:
+            parsing_ops.FixedLenFeature(self.shape, self.dtype,
+                                        self.default_value)
+    }
+
+  def _transform_feature(self, inputs):
+    input_tensor = inputs.get(self.key)
+    if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
+      raise ValueError(
+          'The corresponding Tensor of numerical column must be a Tensor. '
+          'SparseTensor is not supported. key: {}'.format(self.key))
+    if self.normalizer_fn is not None:
+      input_tensor = self.normalizer_fn(input_tensor)
+    return math_ops.to_float(input_tensor)
+
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape(self.shape)
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    """Returns dense `Tensor` representing numeric feature.
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+      weight_collections: Unused `weight_collections` since no variables are
+        created in this function.
+      trainable: Unused `trainable` bool since no variables are created in
+        this function.
+
+    Returns:
+      Dense `Tensor` created within `_transform_feature`.
+    """
+    # Do nothing with weight_collections and trainable since no variables are
+    # created in this function.
+    del weight_collections
+    del trainable
+    # Feature has been already transformed. Return the intermediate
+    # representation created by _transform_feature.
+    return inputs.get(self)
+
+
+class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
+                        collections.namedtuple('_BucketizedColumn', [
+                            'source_column', 'boundaries'])):
+  """See `bucketized_column`."""
+
+  @property
+  def name(self):
+    return '{}_bucketized'.format(self.source_column.name)
+
+  @property
+  def _parse_example_spec(self):
+    return self.source_column._parse_example_spec  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    source_tensor = inputs.get(self.source_column)
+    return math_ops._bucketize(  # pylint: disable=protected-access
+        source_tensor,
+        boundaries=self.boundaries)
+
+  @property
+  def _variable_shape(self):
+    return tensor_shape.TensorShape(
+        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    input_tensor = inputs.get(self)
+    return array_ops.one_hot(
+        indices=math_ops.to_int64(input_tensor),
+        depth=len(self.boundaries) + 1,
+        on_value=1.,
+        off_value=0.)
+
+  @property
+  def _num_buckets(self):
+    # By construction, source_column is always one-dimensional.
+    return (len(self.boundaries) + 1) * self.source_column.shape[0]
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    input_tensor = inputs.get(self)
+    batch_size = array_ops.shape(input_tensor)[0]
+    # By construction, source_column is always one-dimensional.
+    source_dimension = self.source_column.shape[0]
+
+    i1 = array_ops.reshape(
+        array_ops.tile(
+            array_ops.expand_dims(math_ops.range(0, batch_size), 1),
+            [1, source_dimension]),
+        (-1,))
+    i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
+    # Flatten the bucket indices and unique them across dimensions
+    # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
+    bucket_indices = (
+        array_ops.reshape(input_tensor, (-1,)) +
+        (len(self.boundaries) + 1) * i2)
+
+    indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
+    dense_shape = math_ops.to_int64(array_ops.stack(
+        [batch_size, source_dimension]))
+    sparse_tensor = sparse_tensor_lib.SparseTensor(
+        indices=indices,
+        values=bucket_indices,
+        dense_shape=dense_shape)
+    return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
+
+
+class _EmbeddingColumn(
+    _DenseColumn,
+    collections.namedtuple('_EmbeddingColumn', (
+        'categorical_column', 'dimension', 'combiner', 'initializer',
+        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
+    ))):
+  """See `_embedding_column`."""
+
+  @property
+  def name(self):
+    if not hasattr(self, '_name'):
+      self._name = '{}_embedding'.format(self.categorical_column.name)
+    return self._name
+
+  @property
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    return inputs.get(self.categorical_column)
+
+  @property
+  def _variable_shape(self):
+    if not hasattr(self, '_shape'):
+      self._shape = tensor_shape.vector(self.dimension)
+    return self._shape
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    # Get sparse IDs and weights.
+    sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
+        inputs, weight_collections=weight_collections, trainable=trainable)
+    sparse_ids = sparse_tensors.id_tensor
+    sparse_weights = sparse_tensors.weight_tensor
+
+    # Create embedding weight, and restore from checkpoint if necessary.
+    embedding_weights = variable_scope.get_variable(
+        name='embedding_weights',
+        shape=(self.categorical_column._num_buckets, self.dimension),  # pylint: disable=protected-access
+        dtype=dtypes.float32,
+        initializer=self.initializer,
+        trainable=self.trainable and trainable,
+        collections=weight_collections)
+    if self.ckpt_to_load_from is not None:
+      to_restore = embedding_weights
+      if isinstance(to_restore, variables.PartitionedVariable):
+        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
+      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
+          self.tensor_name_in_ckpt: to_restore
+      })
+
+    # Return embedding lookup result.
+    return _safe_embedding_lookup_sparse(
+        embedding_weights=embedding_weights,
+        sparse_ids=sparse_ids,
+        sparse_weights=sparse_weights,
+        combiner=self.combiner,
+        name='%s_weights' % self.name,
+        max_norm=self.max_norm)
+
+
+def _create_tuple(shape, value):
+  """Returns a tuple with given shape and filled with value."""
+  if shape:
+    return tuple([_create_tuple(shape[1:], value) for _ in range(shape[0])])
+  return value
+
+
+def _as_tuple(value):
+  if not nest.is_sequence(value):
+    return value
+  return tuple([_as_tuple(v) for v in value])
+
+
+def _check_shape(shape, key):
+  """Returns shape if it's valid, raises error otherwise."""
+  assert shape is not None
+  if not nest.is_sequence(shape):
+    shape = [shape]
+  shape = tuple(shape)
+  for dimension in shape:
+    if not isinstance(dimension, int):
+      raise TypeError('shape dimensions must be integer. '
+                      'shape: {}, key: {}'.format(shape, key))
+    if dimension < 1:
+      raise ValueError('shape dimensions must be greater than 0. '
+                       'shape: {}, key: {}'.format(shape, key))
+  return shape
+
+
+def _is_shape_and_default_value_compatible(default_value, shape):
+  """Verifies compatibility of shape and default_value."""
+  # Invalid condition:
+  #  * if default_value is not a scalar and shape is empty
+  #  * or if default_value is an iterable and shape is not empty
+  if nest.is_sequence(default_value) != bool(shape):
+    return False
+  if not shape:
+    return True
+  if len(default_value) != shape[0]:
+    return False
+  for i in range(shape[0]):
+    if not _is_shape_and_default_value_compatible(default_value[i], shape[1:]):
+      return False
+  return True
+
+
+def _check_default_value(shape, default_value, dtype, key):
+  """Returns default value as tuple if it's valid, otherwise raises errors.
+
+  This function verifies that `default_value` is compatible with both `shape`
+  and `dtype`. If it is not compatible, it raises an error. If it is compatible,
+  it casts default_value to a tuple and returns it. `key` is used only
+  for error message.
+
+  Args:
+    shape: An iterable of integers specifies the shape of the `Tensor`.
+    default_value: If a single value is provided, the same value will be applied
+      as the default value for every item. If an iterable of values is
+      provided, the shape of the `default_value` should be equal to the given
+      `shape`.
+    dtype: defines the type of values. Default value is `tf.float32`. Must be a
+      non-quantized, real integer or floating point type.
+    key: Column name, used only for error messages.
+
+  Returns:
+    A tuple which will be used as default value.
+
+  Raises:
+    TypeError: if `default_value` is an iterable but not compatible with `shape`
+    TypeError: if `default_value` is not compatible with `dtype`.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  if default_value is None:
+    return None
+
+  if isinstance(default_value, int):
+    return _create_tuple(shape, default_value)
+
+  if isinstance(default_value, float) and dtype.is_floating:
+    return _create_tuple(shape, default_value)
+
+  if callable(getattr(default_value, 'tolist', None)):  # Handles numpy arrays
+    default_value = default_value.tolist()
+
+  if nest.is_sequence(default_value):
+    if not _is_shape_and_default_value_compatible(default_value, shape):
+      raise ValueError(
+          'The shape of default_value must be equal to given shape. '
+          'default_value: {}, shape: {}, key: {}'.format(
+              default_value, shape, key))
+    # Check if the values in the list are all integers or are convertible to
+    # floats.
+    is_list_all_int = all(
+        isinstance(v, int) for v in nest.flatten(default_value))
+    is_list_has_float = any(
+        isinstance(v, float) for v in nest.flatten(default_value))
+    if is_list_all_int:
+      return _as_tuple(default_value)
+    if is_list_has_float and dtype.is_floating:
+      return _as_tuple(default_value)
+  raise TypeError('default_value must be compatible with dtype. '
+                  'default_value: {}, dtype: {}, key: {}'.format(
+                      default_value, dtype, key))
+
+
+class _HashedCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_HashedCategoricalColumn',
+                           ['key', 'hash_bucket_size', 'dtype'])):
+  """see `categorical_column_with_hash_bucket`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input(inputs.get(self.key))
+    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
+      raise ValueError('SparseColumn input must be a SparseTensor.')
+
+    _assert_string_or_int(
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
+
+    if self.dtype.is_integer != input_tensor.dtype.is_integer:
+      raise ValueError(
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
+
+    if self.dtype == dtypes.string:
+      sparse_values = input_tensor.values
+    else:
+      sparse_values = string_ops.as_string(input_tensor.values)
+
+    sparse_id_values = string_ops.string_to_hash_bucket_fast(
+        sparse_values, self.hash_bucket_size, name='lookup')
+    return sparse_tensor_lib.SparseTensor(
+        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return self.hash_bucket_size
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+class _VocabularyFileCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_VocabularyFileCategoricalColumn', (
+        'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'dtype',
+        'default_value'
+    ))):
+  """See `categorical_column_with_vocabulary_file`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input(inputs.get(self.key))
+
+    if self.dtype.is_integer != input_tensor.dtype.is_integer:
+      raise ValueError(
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
+
+    _assert_string_or_int(
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
+
+    key_dtype = self.dtype
+    if input_tensor.dtype.is_integer:
+      # `index_table_from_file` requires 64-bit integer keys.
+      key_dtype = dtypes.int64
+      input_tensor = math_ops.to_int64(input_tensor)
+
+    return lookup_ops.index_table_from_file(
+        vocabulary_file=self.vocabulary_file,
+        num_oov_buckets=self.num_oov_buckets,
+        vocab_size=self.vocabulary_size,
+        default_value=self.default_value,
+        key_dtype=key_dtype,
+        name='{}_lookup'.format(self.key)).lookup(input_tensor)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return self.vocabulary_size + self.num_oov_buckets
+
+  def _get_sparse_tensors(
+      self, inputs, weight_collections=None, trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+class _VocabularyListCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_VocabularyListCategoricalColumn', (
+        'key', 'vocabulary_list', 'dtype', 'default_value'
+    ))):
+  """See `categorical_column_with_vocabulary_list`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input(inputs.get(self.key))
+
+    if self.dtype.is_integer != input_tensor.dtype.is_integer:
+      raise ValueError(
+          'Column dtype and SparseTensors dtype must be compatible. '
+          'key: {}, column dtype: {}, tensor dtype: {}'.format(
+              self.key, self.dtype, input_tensor.dtype))
+
+    _assert_string_or_int(
+        input_tensor.dtype,
+        prefix='column_name: {} input_tensor'.format(self.key))
+
+    key_dtype = self.dtype
+    if input_tensor.dtype.is_integer:
+      # `index_table_from_tensor` requires 64-bit integer keys.
+      key_dtype = dtypes.int64
+      input_tensor = math_ops.to_int64(input_tensor)
+
+    return lookup_ops.index_table_from_tensor(
+        vocabulary_list=tuple(self.vocabulary_list),
+        default_value=self.default_value,
+        dtype=key_dtype,
+        name='{}_lookup'.format(self.key)).lookup(input_tensor)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return len(self.vocabulary_list)
+
+  def _get_sparse_tensors(
+      self, inputs, weight_collections=None, trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+class _IdentityCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_IdentityCategoricalColumn', (
+        'key', 'num_buckets', 'default_value'
+    ))):
+
+  """See `categorical_column_with_identity`."""
+
+  @property
+  def name(self):
+    return self.key
+
+  @property
+  def _parse_example_spec(self):
+    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}
+
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input(inputs.get(self.key))
+
+    if not input_tensor.dtype.is_integer:
+      raise ValueError(
+          'Invalid input, not integer. key: {} dtype: {}'.format(
+              self.key, input_tensor.dtype))
+
+    values = math_ops.to_int64(input_tensor.values, name='values')
+    num_buckets = math_ops.to_int64(self.num_buckets, name='num_buckets')
+    zero = math_ops.to_int64(0, name='zero')
+    if self.default_value is None:
+      # Fail if values are out-of-range.
+      assert_less = check_ops.assert_less(
+          values, num_buckets, data=(values, num_buckets),
+          name='assert_less_than_num_buckets')
+      assert_greater = check_ops.assert_greater_equal(
+          values, zero, data=(values,),
+          name='assert_greater_or_equal_0')
+      with ops.control_dependencies((assert_less, assert_greater)):
+        values = array_ops.identity(values)
+    else:
+      # Assign default for out-of-range values.
+      values = array_ops.where(
+          math_ops.logical_or(
+              values < zero, values >= num_buckets, name='out_of_range'),
+          array_ops.fill(
+              dims=array_ops.shape(values),
+              value=math_ops.to_int64(self.default_value),
+              name='default_values'),
+          values)
+
+    return sparse_tensor_lib.SparseTensor(
+        indices=input_tensor.indices,
+        values=values,
+        dense_shape=input_tensor.dense_shape)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return self.num_buckets
+
+  def _get_sparse_tensors(
+      self, inputs, weight_collections=None, trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+class _WeightedCategoricalColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_WeightedCategoricalColumn', (
+        'categorical_column', 'weight_feature_key', 'dtype'
+    ))):
+  """See `weighted_categorical_column`."""
+
+  @property
+  def name(self):
+    return '{}_weighted_by_{}'.format(
+        self.categorical_column.name, self.weight_feature_key)
+
+  @property
+  def _parse_example_spec(self):
+    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+    if self.weight_feature_key in config:
+      raise ValueError('Parse config {} already exists for {}.'.format(
+          config[self.weight_feature_key], self.weight_feature_key))
+    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
+    return config
+
+  @property
+  def _num_buckets(self):
+    return self.categorical_column._num_buckets  # pylint: disable=protected-access
+
+  def _transform_feature(self, inputs):
+    weight_tensor = inputs.get(self.weight_feature_key)
+    if weight_tensor is None:
+      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
+    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
+        weight_tensor)
+    if self.dtype != weight_tensor.dtype.base_dtype:
+      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
+          self.dtype, weight_tensor.dtype))
+    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
+      # The weight tensor can be a regular Tensor. In this case, sparsify it.
+      weight_tensor = _to_sparse_input(weight_tensor, ignore_value=0.0)
+    if not weight_tensor.dtype.is_floating:
+      weight_tensor = math_ops.to_float(weight_tensor)
+    return (inputs.get(self.categorical_column), weight_tensor)
+
+  def _get_sparse_tensors(
+      self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    tensors = inputs.get(self)
+    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
+
+
+class _CrossedColumn(
+    _CategoricalColumn,
+    collections.namedtuple('_CrossedColumn',
+                           ['keys', 'hash_bucket_size', 'hash_key'])):
+  """See `crossed_column`."""
+
+  @property
+  def name(self):
+    feature_names = []
+    for key in _collect_leaf_level_keys(self):
+      if isinstance(key, _FeatureColumn):
+        feature_names.append(key.name)
+      else:  # key must be a string
+        feature_names.append(key)
+    return '_X_'.join(sorted(feature_names))
+
+  @property
+  def _parse_example_spec(self):
+    config = {}
+    for key in self.keys:
+      if isinstance(key, _FeatureColumn):
+        config.update(key._parse_example_spec)  # pylint: disable=protected-access
+      else:  # key must be a string
+        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
+    return config
+
+  def _transform_feature(self, inputs):
+    feature_tensors = []
+    for key in _collect_leaf_level_keys(self):
+      if isinstance(key, six.string_types):
+        feature_tensors.append(inputs.get(key))
+      elif isinstance(key, _CategoricalColumn):
+        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+        if ids_and_weights.weight_tensor is not None:
+          raise ValueError(
+              'crossed_column does not support weight_tensor, but the given '
+              'column populates weight_tensor. '
+              'Given column: {}'.format(key.name))
+        feature_tensors.append(ids_and_weights.id_tensor)
+      else:
+        raise ValueError('Unsupported column type. Given: {}'.format(key))
+    return sparse_ops._sparse_cross_hashed(  # pylint: disable=protected-access
+        inputs=feature_tensors,
+        num_buckets=self.hash_bucket_size,
+        hash_key=self.hash_key)
+
+  @property
+  def _num_buckets(self):
+    """Returns number of buckets in this sparse feature."""
+    return self.hash_bucket_size
+
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+
+def _collect_leaf_level_keys(cross):
+  """Collects base keys by expanding all nested crosses.
+
+  Args:
+    cross: A `_CrossedColumn`.
+
+  Returns:
+    A list of strings or `_CategoricalColumn` instances.
+  """
+  leaf_level_keys = []
+  for k in cross.keys:
+    if isinstance(k, _CrossedColumn):
+      leaf_level_keys.extend(_collect_leaf_level_keys(k))
+    else:
+      leaf_level_keys.append(k)
+  return leaf_level_keys
+
+
+# TODO(zakaria): Move this to embedding_ops and make it public.
+def _safe_embedding_lookup_sparse(embedding_weights,
+                                  sparse_ids,
+                                  sparse_weights=None,
+                                  combiner='mean',
+                                  default_id=None,
+                                  name=None,
+                                  partition_strategy='div',
+                                  max_norm=None):
+  """Lookup embedding results, accounting for invalid IDs and empty features.
+
+  The partitioned embedding in `embedding_weights` must all be the same shape
+  except for the first dimension. The first dimension is allowed to vary as the
+  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
+  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
+  with non-positive weight. For an entry with no features, the embedding vector
+  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.
+
+  The ids and weights may be multi-dimensional. Embeddings are always aggregated
+  along the last dimension.
+
+  Args:
+    embedding_weights:  A list of `P` float `Tensor`s or values representing
+        partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
+        created by partitioning along dimension 0.  The total unpartitioned
+        shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the
+        vocab size and `e_1, ..., e_m` are the embedding dimensions.
+    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
+        ids. `d_0` is typically batch size.
+    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
+        float weights corresponding to `sparse_ids`, or `None` if all weights
+        are be assumed to be 1.0.
+    combiner: A string specifying how to combine embedding results for each
+        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
+        the default.
+    default_id: The id to use for an entry with no features.
+    name: A name for this operation (optional).
+    partition_strategy: A string specifying the partitioning strategy.
+        Currently `"div"` and `"mod"` are supported. Default is `"div"`.
+    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
+        combining.
+
+
+  Returns:
+    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
+
+  Raises:
+    ValueError: if `embedding_weights` is empty.
+  """
+  if embedding_weights is None:
+    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
+  if isinstance(embedding_weights, variables.PartitionedVariable):
+    embedding_weights = list(embedding_weights)  # get underlying Variables.
+  if not isinstance(embedding_weights, list):
+    embedding_weights = [embedding_weights]
+  if len(embedding_weights) < 1:
+    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
+
+  dtype = sparse_weights.dtype if sparse_weights is not None else None
+  embedding_weights = [
+      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
+  ]
+
+  with ops.name_scope(name, 'embedding_lookup',
+                      embedding_weights + [sparse_ids,
+                                           sparse_weights]) as scope:
+    # Reshape higher-rank sparse ids and weights to linear segment ids.
+    original_shape = sparse_ids.dense_shape
+    original_rank_dim = sparse_ids.dense_shape.get_shape()[0]
+    original_rank = (
+        array_ops.size(original_shape)
+        if original_rank_dim.value is None
+        else original_rank_dim.value)
+    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
+        math_ops.reduce_prod(
+            array_ops.slice(original_shape, [0], [original_rank - 1])),
+        array_ops.gather(original_shape, original_rank - 1)])
+    if sparse_weights is not None:
+      sparse_weights = sparse_tensor_lib.SparseTensor(
+          sparse_ids.indices,
+          sparse_weights.values, sparse_ids.dense_shape)
+
+    # Prune invalid ids and weights.
+    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+
+    # Fill in dummy values for empty features, if necessary.
+    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
+                                                                 default_id or
+                                                                 0)
+    if sparse_weights is not None:
+      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)
+
+    result = embedding_ops.embedding_lookup_sparse(
+        embedding_weights,
+        sparse_ids,
+        sparse_weights,
+        combiner=combiner,
+        partition_strategy=partition_strategy,
+        name=None if default_id is None else scope,
+        max_norm=max_norm)
+
+    if default_id is None:
+      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
+      # for use in Select.
+      is_row_empty = array_ops.tile(
+          array_ops.reshape(is_row_empty, [-1, 1]),
+          array_ops.stack([1, array_ops.shape(result)[1]]))
+
+      result = array_ops.where(is_row_empty,
+                               array_ops.zeros_like(result),
+                               result,
+                               name=scope)
+
+    # Reshape back from linear ids back into higher-dimensional dense result.
+    final_result = array_ops.reshape(
+        result,
+        array_ops.concat([
+            array_ops.slice(
+                math_ops.cast(original_shape, dtypes.int32), [0],
+                [original_rank - 1]),
+            array_ops.slice(array_ops.shape(result), [1], [-1])
+        ], 0))
+    final_result.set_shape(tensor_shape.unknown_shape(
+        (original_rank_dim - 1).value).concatenate(result.get_shape()[1:]))
+    return final_result
+
+
+def _prune_invalid_ids(sparse_ids, sparse_weights):
+  """Prune invalid IDs (< 0) from the input ids and weights."""
+  is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
+  if sparse_weights is not None:
+    is_id_valid = math_ops.logical_and(
+        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+  sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
+  if sparse_weights is not None:
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
+  return sparse_ids, sparse_weights
+
+
+class _IndicatorColumn(_DenseColumn,
+                       collections.namedtuple('_IndicatorColumn',
+                                              ['categorical_column'])):
+  """Represents a one-hot column for use in deep networks.
+
+  Args:
+    categorical_column: A `_CategoricalColumn` which is created by
+      `categorical_column_with_*` function.
+  """
+
+  @property
+  def name(self):
+    return '{}_indicator'.format(self.categorical_column.name)
+
+  def _transform_feature(self, inputs):
+    """Returns dense `Tensor` representing feature.
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+
+    Returns:
+      Transformed feature `Tensor`.
+
+    Raises:
+      ValueError: if input rank is not known at graph building time.
+    """
+    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    id_tensor = id_weight_pair.id_tensor
+    weight_tensor = id_weight_pair.weight_tensor
+
+    # If the underlying column is weighted, return the input as a dense tensor.
+    if weight_tensor is not None:
+      weighted_column = sparse_ops.sparse_merge(
+          sp_ids=id_tensor,
+          sp_values=weight_tensor,
+          vocab_size=self._variable_shape[-1])
+      return sparse_ops.sparse_tensor_to_dense(weighted_column)
+
+    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
+        id_tensor, default_value=-1)
+
+    # One hot must be float for tf.concat reasons since all other inputs to
+    # input_layer are float32.
+    one_hot_id_tensor = array_ops.one_hot(
+        dense_id_tensor,
+        depth=self._variable_shape[-1],
+        on_value=1.0,
+        off_value=0.0)
+
+    # Reduce to get a multi-hot per example.
+    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])
+
+  @property
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
+  @property
+  def _variable_shape(self):
+    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
+    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    """Returns dense `Tensor` representing feature.
+
+    Args:
+      inputs: A `_LazyBuilder` object to access inputs.
+      weight_collections: Unused `weight_collections` since no variables are
+        created in this function.
+      trainable: Unused `trainable` bool since no variables are created in
+        this function.
+
+    Returns:
+      Dense `Tensor` created within `_transform_feature`.
+    """
+    # Do nothing with weight_collections and trainable since no variables are
+    # created in this function.
+    del weight_collections
+    del trainable
+    # Feature has been already transformed. Return the intermediate
+    # representation created by _transform_feature.
+    return inputs.get(self)
+
+
+def _verify_static_batch_size_equality(tensors, columns):
+  # bath_size is a tf.Dimension object.
+  expected_batch_size = None
+  for i in range(0, len(tensors)):
+    if tensors[i].shape[0].value is not None:
+      if expected_batch_size is None:
+        bath_size_column_index = i
+        expected_batch_size = tensors[i].shape[0]
+      elif not expected_batch_size.is_compatible_with(tensors[i].shape[0]):
+        raise ValueError(
+            'Batch size (first dimension) of each feature must be same. '
+            'Batch size of columns ({}, {}): ({}, {})'.format(
+                columns[bath_size_column_index].name, columns[i].name,
+                expected_batch_size, tensors[i].shape[0]))
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a57986764f9f5e2cff788817cc7706089dc73b0
--- /dev/null
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -0,0 +1,43 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""FeatureColumns: tools for ingesting and representing features."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.python.feature_column.feature_column import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+# pylint: enable=unused-import,line-too-long
+
+_allowed_symbols = [
+    'input_layer',
+    'linear_model',
+    'make_parse_example_spec',
+    'embedding_column',
+    'crossed_column',
+    'numeric_column',
+    'bucketized_column',
+    'categorical_column_with_hash_bucket',
+    'categorical_column_with_vocabulary_file',
+    'categorical_column_with_vocabulary_list',
+    'categorical_column_with_identity',
+    'weighted_categorical_column',
+    'indicator_column',
+]
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda6c3faf4ec4fb02e94a8d0b5aeffa44281e6cd
--- /dev/null
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -0,0 +1,3920 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+
+import numpy as np
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.client import session
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column.feature_column import _CategoricalColumn
+from tensorflow.python.feature_column.feature_column import _DenseColumn
+from tensorflow.python.feature_column.feature_column import _FeatureColumn
+from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.feature_column.feature_column import _transform_features
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import queue_runner_impl
+
+
+def _initialized_session():
+  sess = session.Session()
+  sess.run(variables_lib.global_variables_initializer())
+  sess.run(lookup_ops.tables_initializer())
+  return sess
+
+
+class LazyColumnTest(test.TestCase):
+
+  def test_transormations_called_once(self):
+
+    class TransformCounter(_FeatureColumn):
+
+      def __init__(self):
+        self.num_transform = 0
+
+      @property
+      def name(self):
+        return 'TransformCounter'
+
+      def _transform_feature(self, cache):
+        self.num_transform += 1  # Count transform calls.
+        return cache.get('a')
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    column = TransformCounter()
+    self.assertEqual(0, column.num_transform)
+    builder.get(column)
+    self.assertEqual(1, column.num_transform)
+    builder.get(column)
+    self.assertEqual(1, column.num_transform)
+
+  def test_returns_transform_output(self):
+
+    class Transformer(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'Transformer'
+
+      def _transform_feature(self, cache):
+        return 'Output'
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    column = Transformer()
+    self.assertEqual('Output', builder.get(column))
+    self.assertEqual('Output', builder.get(column))
+
+  def test_does_not_pollute_given_features_dict(self):
+
+    class Transformer(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'Transformer'
+
+      def _transform_feature(self, cache):
+        return 'Output'
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    features = {'a': [[2], [3.]]}
+    builder = _LazyBuilder(features=features)
+    builder.get(Transformer())
+    self.assertEqual(['a'], list(features.keys()))
+
+  def test_error_if_feature_is_not_found(self):
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    with self.assertRaisesRegexp(ValueError,
+                                 'bbb is not in features dictionary'):
+      builder.get('bbb')
+
+  def test_not_supported_feature_column(self):
+
+    class NotAProperColumn(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'NotAProperColumn'
+
+      def _transform_feature(self, cache):
+        # It should return not None.
+        pass
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    with self.assertRaisesRegexp(ValueError,
+                                 'NotAProperColumn is not supported'):
+      builder.get(NotAProperColumn())
+
+  def test_key_should_be_string_or_feature_colum(self):
+
+    class NotAFeatureColumn(object):
+      pass
+
+    builder = _LazyBuilder(features={'a': [[2], [3.]]})
+    with self.assertRaisesRegexp(
+        TypeError, '"key" must be either a "str" or "_FeatureColumn".'):
+      builder.get(NotAFeatureColumn())
+
+
+class NumericColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    a = fc.numeric_column('aaa')
+    self.assertEqual('aaa', a.key)
+    self.assertEqual((1,), a.shape)
+    self.assertIsNone(a.default_value)
+    self.assertEqual(dtypes.float32, a.dtype)
+    self.assertIsNone(a.normalizer_fn)
+
+  def test_shape_saved_as_tuple(self):
+    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    self.assertEqual((1, 2), a.shape)
+
+  def test_default_value_saved_as_tuple(self):
+    a = fc.numeric_column('aaa', default_value=4.)
+    self.assertEqual((4.,), a.default_value)
+    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    self.assertEqual(((3., 2.),), a.default_value)
+
+  def test_shape_and_default_value_compatibility(self):
+    fc.numeric_column('aaa', shape=[2], default_value=[1, 2.])
+    with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
+      fc.numeric_column('aaa', shape=[2], default_value=[1, 2, 3.])
+    fc.numeric_column(
+        'aaa', shape=[3, 2], default_value=[[2, 3], [1, 2], [2, 3.]])
+    with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
+      fc.numeric_column(
+          'aaa', shape=[3, 1], default_value=[[2, 3], [1, 2], [2, 3.]])
+    with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
+      fc.numeric_column(
+          'aaa', shape=[3, 3], default_value=[[2, 3], [1, 2], [2, 3.]])
+
+  def test_default_value_type_check(self):
+    fc.numeric_column(
+        'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.float32)
+    fc.numeric_column(
+        'aaa', shape=[2], default_value=[1, 2], dtype=dtypes.int32)
+    with self.assertRaisesRegexp(TypeError, 'must be compatible with dtype'):
+      fc.numeric_column(
+          'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.int32)
+    with self.assertRaisesRegexp(TypeError,
+                                 'default_value must be compatible with dtype'):
+      fc.numeric_column('aaa', default_value=['string'])
+
+  def test_shape_must_be_positive_integer(self):
+    with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
+      fc.numeric_column(
+          'aaa', shape=[
+              1.0,
+          ])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'shape dimensions must be greater than 0'):
+      fc.numeric_column(
+          'aaa', shape=[
+              0,
+          ])
+
+  def test_dtype_is_convertable_to_float(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'dtype must be convertible to float'):
+      fc.numeric_column('aaa', dtype=dtypes.string)
+
+  def test_scalar_deafult_value_fills_the_shape(self):
+    a = fc.numeric_column('aaa', shape=[2, 3], default_value=2.)
+    self.assertEqual(((2., 2., 2.), (2., 2., 2.)), a.default_value)
+
+  def test_parse_spec(self):
+    a = fc.numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32)
+    self.assertEqual({
+        'aaa': parsing_ops.FixedLenFeature((2, 3), dtype=dtypes.int32)
+    }, a._parse_example_spec)
+
+  def test_parse_example_no_default_value(self):
+    price = fc.numeric_column('price', shape=[2])
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'price':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([price]))
+    self.assertIn('price', features)
+    with self.test_session():
+      self.assertAllEqual([[20., 110.]], features['price'].eval())
+
+  def test_parse_example_with_default_value(self):
+    price = fc.numeric_column('price', shape=[2], default_value=11.)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'price':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.]))
+        }))
+    no_data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'something_else':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString(),
+                    no_data.SerializeToString()],
+        features=fc.make_parse_example_spec([price]))
+    self.assertIn('price', features)
+    with self.test_session():
+      self.assertAllEqual([[20., 110.], [11., 11.]], features['price'].eval())
+
+  def test_normalizer_fn_must_be_callable(self):
+    with self.assertRaisesRegexp(TypeError, 'must be a callable'):
+      fc.numeric_column('price', normalizer_fn='NotACallable')
+
+  def test_normalizer_fn_transform_feature(self):
+
+    def _increment_two(input_tensor):
+      return input_tensor + 2.
+
+    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price])
+    with self.test_session():
+      self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
+
+  def test_get_dense_tensor(self):
+
+    def _increment_two(input_tensor):
+      return input_tensor + 2.
+
+    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    builder = _LazyBuilder({'price': [[1., 2.], [5., 6.]]})
+    self.assertEqual(builder.get(price), price._get_dense_tensor(builder))
+
+  def test_sparse_tensor_not_supported(self):
+    price = fc.numeric_column('price')
+    builder = _LazyBuilder({
+        'price':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0]], values=[0.3], dense_shape=[1, 1])
+    })
+    with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
+      price._transform_feature(builder)
+
+  def test_deep_copy(self):
+    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
+    a_copy = copy.deepcopy(a)
+    self.assertEqual(a_copy.name, 'aaa')
+    self.assertEqual(a_copy.shape, (1, 2))
+    self.assertEqual(a_copy.default_value, ((3., 2.),))
+
+  def test_numpy_default_value(self):
+    a = fc.numeric_column(
+        'aaa', shape=[1, 2], default_value=np.array([[3., 2.]]))
+    self.assertEqual(a.default_value, ((3., 2.),))
+
+  def test_linear_model(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[10.], [50.]], predictions.eval())
+
+
+class BucketizedColumnTest(test.TestCase):
+
+  def test_invalid_source_column_type(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', hash_bucket_size=10)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'source_column must be a column generated with numeric_column'):
+      fc.bucketized_column(a, boundaries=[0, 1])
+
+  def test_invalid_source_column_shape(self):
+    a = fc.numeric_column('aaa', shape=[2, 3])
+    with self.assertRaisesRegexp(
+        ValueError, 'source_column must be one-dimensional column'):
+      fc.bucketized_column(a, boundaries=[0, 1])
+
+  def test_invalid_boundaries(self):
+    a = fc.numeric_column('aaa')
+    with self.assertRaisesRegexp(
+        ValueError, 'boundaries must be a sorted list'):
+      fc.bucketized_column(a, boundaries=None)
+    with self.assertRaisesRegexp(
+        ValueError, 'boundaries must be a sorted list'):
+      fc.bucketized_column(a, boundaries=1.)
+    with self.assertRaisesRegexp(
+        ValueError, 'boundaries must be a sorted list'):
+      fc.bucketized_column(a, boundaries=[1, 0])
+    with self.assertRaisesRegexp(
+        ValueError, 'boundaries must be a sorted list'):
+      fc.bucketized_column(a, boundaries=[1, 1])
+
+  def test_name(self):
+    a = fc.numeric_column('aaa', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    self.assertEqual('aaa_bucketized', b.name)
+
+  def test_parse_spec(self):
+    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    self.assertEqual({
+        'aaa': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32)
+    }, b._parse_example_spec)
+
+  def test_variable_shape(self):
+    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    # Column 'aaa` has shape [2] times three buckets -> variable_shape=[2, 3].
+    self.assertAllEqual((2, 3), b._variable_shape)
+
+  def test_num_buckets(self):
+    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    # Column 'aaa` has shape [2] times three buckets -> num_buckets=6.
+    self.assertEqual(6, b._num_buckets)
+
+  def test_parse_example(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'price':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([bucketized_price]))
+    self.assertIn('price', features)
+    with self.test_session():
+      self.assertAllEqual([[20., 110.]], features['price'].eval())
+
+  def test_transform_feature(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      transformed_tensor = _transform_features({
+          'price': [[-1., 1.], [5., 6.]]
+      }, [bucketized_price])
+      with _initialized_session():
+        self.assertAllEqual([[0, 1], [3, 4]],
+                            transformed_tensor[bucketized_price].eval())
+
+  def test_get_dense_tensor_one_input_value(self):
+    """Tests _get_dense_tensor() for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
+      with _initialized_session():
+        bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
+        self.assertAllClose(
+            # One-hot tensor.
+            [[[1., 0., 0., 0., 0.]],
+             [[0., 1., 0., 0., 0.]],
+             [[0., 0., 0., 1., 0.]],
+             [[0., 0., 0., 0., 1.]]],
+            bucketized_price_tensor.eval())
+
+  def test_get_dense_tensor_two_input_values(self):
+    """Tests _get_dense_tensor() for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
+      with _initialized_session():
+        bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
+        self.assertAllClose(
+            # One-hot tensor.
+            [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
+             [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
+            bucketized_price_tensor.eval())
+
+  def test_get_sparse_tensors_one_input_value(self):
+    """Tests _get_sparse_tensors() for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
+      with _initialized_session() as sess:
+        id_weight_pair = bucketized_price._get_sparse_tensors(builder)
+        self.assertIsNone(id_weight_pair.weight_tensor)
+        id_tensor_value = sess.run(id_weight_pair.id_tensor)
+        self.assertAllEqual(
+            [[0, 0], [1, 0], [2, 0], [3, 0]], id_tensor_value.indices)
+        self.assertAllEqual([0, 1, 3, 4], id_tensor_value.values)
+        self.assertAllEqual([4, 1], id_tensor_value.dense_shape)
+
+  def test_get_sparse_tensors_two_input_values(self):
+    """Tests _get_sparse_tensors() for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
+      with _initialized_session() as sess:
+        id_weight_pair = bucketized_price._get_sparse_tensors(builder)
+        self.assertIsNone(id_weight_pair.weight_tensor)
+        id_tensor_value = sess.run(id_weight_pair.id_tensor)
+        self.assertAllEqual(
+            [[0, 0], [0, 1], [1, 0], [1, 1]], id_tensor_value.indices)
+        # Values 0-4 correspond to the first column of the input price.
+        # Values 5-9 correspond to the second column of the input price.
+        self.assertAllEqual([0, 6, 3, 9], id_tensor_value.values)
+        self.assertAllEqual([2, 2], id_tensor_value.dense_shape)
+
+  def test_sparse_tensor_input_not_supported(self):
+    price = fc.numeric_column('price')
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
+    builder = _LazyBuilder({
+        'price':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0]], values=[0.3], dense_shape=[1, 1])
+    })
+    with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
+      bucketized_price._transform_feature(builder)
+
+  def test_deep_copy(self):
+    a = fc.numeric_column('aaa', shape=[2])
+    a_bucketized = fc.bucketized_column(a, boundaries=[0, 1])
+    a_bucketized_copy = copy.deepcopy(a_bucketized)
+    self.assertEqual(a_bucketized_copy.name, 'aaa_bucketized')
+    self.assertAllEqual(a_bucketized_copy._variable_shape, (2, 3))
+    self.assertEqual(a_bucketized_copy.boundaries, (0, 1))
+
+  def test_linear_model_one_input_value(self):
+    """Tests linear_model() for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
+      predictions = fc.linear_model(features, [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight variable per bucket, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        sess.run(bucketized_price_var.assign(
+            [[10.], [20.], [30.], [40.], [50.]]))
+        # price -1. is in the 0th bucket, whose weight is 10.
+        # price 1. is in the 1st bucket, whose weight is 20.
+        # price 5. is in the 3rd bucket, whose weight is 40.
+        # price 6. is in the 4th bucket, whose weight is 50.
+        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+
+  def test_linear_model_two_input_values(self):
+    """Tests linear_model() for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1., 1.], [5., 6.]]}
+      predictions = fc.linear_model(features, [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight per bucket per input column, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
+            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(bucketized_price_var.assign(
+            [[10.], [20.], [30.], [40.], [50.],
+             [60.], [70.], [80.], [90.], [100.]]))
+        # 1st example:
+        #   price -1. is in the 0th bucket, whose weight is 10.
+        #   price 1. is in the 6th bucket, whose weight is 70.
+        # 2nd example:
+        #   price 5. is in the 3rd bucket, whose weight is 40.
+        #   price 6. is in the 9th bucket, whose weight is 100.
+        self.assertAllClose([[80.], [140.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[81.], [141.]], predictions.eval())
+
+
+class HashedCategoricalColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    self.assertEqual('aaa', a.name)
+    self.assertEqual('aaa', a.key)
+    self.assertEqual(10, a.hash_bucket_size)
+    self.assertEqual(dtypes.string, a.dtype)
+
+  def test_bucket_size_should_be_given(self):
+    with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be set.'):
+      fc.categorical_column_with_hash_bucket('aaa', None)
+
+  def test_bucket_size_should_be_positive(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'hash_bucket_size must be at least 1'):
+      fc.categorical_column_with_hash_bucket('aaa', 0)
+
+  def test_dtype_should_be_string_or_integer(self):
+    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
+    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
+      fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
+
+  def test_deep_copy(self):
+    original = fc.categorical_column_with_hash_bucket('aaa', 10)
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', column.name)
+      self.assertEqual(10, column.hash_bucket_size)
+      self.assertEqual(10, column._num_buckets)
+      self.assertEqual(dtypes.string, column.dtype)
+
+  def test_parse_spec_string(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.string)
+    }, a._parse_example_spec)
+
+  def test_parse_spec_int(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+    }, a._parse_example_spec)
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_strings_should_be_hashed(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    wire_tensor = sparse_tensor.SparseTensor(
+        values=['omar', 'stringer', 'marlo'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    outputs = _transform_features({'wire': wire_tensor}, [hashed_sparse])
+    output = outputs[hashed_sparse]
+    # Check exact hashed output. If hashing changes this test will break.
+    expected_values = [6, 4, 1]
+    with self.test_session():
+      self.assertEqual(dtypes.int64, output.values.dtype)
+      self.assertAllEqual(expected_values, output.values.eval())
+      self.assertAllEqual(wire_tensor.indices.eval(), output.indices.eval())
+      self.assertAllEqual(wire_tensor.dense_shape.eval(),
+                          output.dense_shape.eval())
+
+  def test_tensor_dtype_should_be_string_or_integer(self):
+    string_fc = fc.categorical_column_with_hash_bucket(
+        'a_string', 10, dtype=dtypes.string)
+    int_fc = fc.categorical_column_with_hash_bucket(
+        'a_int', 10, dtype=dtypes.int32)
+    float_fc = fc.categorical_column_with_hash_bucket(
+        'a_float', 10, dtype=dtypes.string)
+    int_tensor = sparse_tensor.SparseTensor(
+        values=[101],
+        indices=[[0, 0]],
+        dense_shape=[1, 1])
+    string_tensor = sparse_tensor.SparseTensor(
+        values=['101'],
+        indices=[[0, 0]],
+        dense_shape=[1, 1])
+    float_tensor = sparse_tensor.SparseTensor(
+        values=[101.],
+        indices=[[0, 0]],
+        dense_shape=[1, 1])
+    builder = _LazyBuilder({
+        'a_int': int_tensor,
+        'a_string': string_tensor,
+        'a_float': float_tensor
+    })
+    builder.get(string_fc)
+    builder.get(int_fc)
+    with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
+      builder.get(float_fc)
+
+  def test_dtype_should_match_with_tensor(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket(
+        'wire', 10, dtype=dtypes.int64)
+    wire_tensor = sparse_tensor.SparseTensor(
+        values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+    builder = _LazyBuilder({'wire': wire_tensor})
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      builder.get(hashed_sparse)
+
+  def test_ints_should_be_hashed(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket(
+        'wire', 10, dtype=dtypes.int64)
+    wire_tensor = sparse_tensor.SparseTensor(
+        values=[101, 201, 301],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    builder = _LazyBuilder({'wire': wire_tensor})
+    output = builder.get(hashed_sparse)
+    # Check exact hashed output. If hashing changes this test will break.
+    expected_values = [3, 7, 5]
+    with self.test_session():
+      self.assertAllEqual(expected_values, output.values.eval())
+
+  def test_int32_64_is_compatible(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket(
+        'wire', 10, dtype=dtypes.int64)
+    wire_tensor = sparse_tensor.SparseTensor(
+        values=constant_op.constant([101, 201, 301], dtype=dtypes.int32),
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    builder = _LazyBuilder({'wire': wire_tensor})
+    output = builder.get(hashed_sparse)
+    # Check exact hashed output. If hashing changes this test will break.
+    expected_values = [3, 7, 5]
+    with self.test_session():
+      self.assertAllEqual(expected_values, output.values.eval())
+
+  def test_get_sparse_tensors(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    builder = _LazyBuilder({
+        'wire':
+            sparse_tensor.SparseTensor(
+                values=['omar', 'stringer', 'marlo'],
+                indices=[[0, 0], [1, 0], [1, 1]],
+                dense_shape=[2, 2])
+    })
+    id_weight_pair = hashed_sparse._get_sparse_tensors(builder)
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
+
+  def test_get_sparse_tensors_weight_collections(self):
+    column = fc.categorical_column_with_hash_bucket('aaa', 10)
+    inputs = sparse_tensor.SparseTensor(
+        values=['omar', 'stringer', 'marlo'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': inputs
+        }), weight_collections=('my_weights',))
+
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertItemsEqual([], ops.get_collection('my_weights'))
+
+  def test_get_sparse_tensors_dense_input(self):
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    builder = _LazyBuilder({'wire': (('omar', ''), ('stringer', 'marlo'))})
+    id_weight_pair = hashed_sparse._get_sparse_tensors(builder)
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
+
+  def test_linear_model(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          wire_column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=('marlo', 'skywalker', 'omar'),
+              dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 3: wire_var[3] = 4
+        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+
+
+class CrossedColumnTest(test.TestCase):
+
+  def test_keys_empty(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'keys must be a list with length > 1'):
+      fc.crossed_column([], 10)
+
+  def test_keys_length_one(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'keys must be a list with length > 1'):
+      fc.crossed_column(['a'], 10)
+
+  def test_key_type_unsupported(self):
+    with self.assertRaisesRegexp(ValueError, 'Unsupported key type'):
+      fc.crossed_column(['a', fc.numeric_column('c')], 10)
+
+    with self.assertRaisesRegexp(
+        ValueError, '_HashedCategoricalColumn is not supported'):
+      fc.crossed_column(
+          ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10)
+
+  def test_hash_bucket_size_negative(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'hash_bucket_size must be > 1'):
+      fc.crossed_column(['a', 'c'], -1)
+
+  def test_hash_bucket_size_zero(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'hash_bucket_size must be > 1'):
+      fc.crossed_column(['a', 'c'], 0)
+
+  def test_hash_bucket_size_none(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'hash_bucket_size must be > 1'):
+      fc.crossed_column(['a', 'c'], None)
+
+  def test_name(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+
+    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+  def test_name_ordered_alphabetically(self):
+    """Tests that the name does not depend on the order of given columns."""
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+
+    crossed2 = fc.crossed_column([crossed1, 'c', b], 10)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+  def test_name_leaf_keys_ordered_alphabetically(self):
+    """Tests that the name does not depend on the order of given columns."""
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d2', 'c'], 10)
+
+    crossed2 = fc.crossed_column([crossed1, 'd1', b], 10)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+  def test_parse_spec(self):
+    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed = fc.crossed_column([b, 'c'], 10)
+    self.assertEqual({
+        'a': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32),
+        'c': parsing_ops.VarLenFeature(dtypes.string),
+    }, crossed._parse_example_spec)
+
+  def test_num_buckets(self):
+    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed = fc.crossed_column([b, 'c'], 15)
+    self.assertEqual(15, crossed._num_buckets)
+
+  def test_deep_copy(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    crossed2_copy = copy.deepcopy(crossed2)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,)
+    self.assertEqual(15, crossed2_copy.hash_bucket_size)
+    self.assertEqual(5, crossed2_copy.hash_key)
+
+  def test_parse_example(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'price':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[20., 110.])),
+            'wire':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer'])),
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([price_cross_wire]))
+    self.assertIn('price', features)
+    self.assertIn('wire', features)
+    with self.test_session():
+      self.assertAllEqual([[20., 110.]], features['price'].eval())
+      wire_sparse = features['wire']
+      self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval())
+      # Use byte constants to pass the open-source test.
+      self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval())
+      self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval())
+
+  def test_transform_feature(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    hash_bucket_size = 10
+    price_cross_wire = fc.crossed_column(
+        [bucketized_price, 'wire'], hash_bucket_size)
+    features = {
+        'price': constant_op.constant([[1., 2.], [5., 6.]]),
+        'wire': sparse_tensor.SparseTensor(
+            values=['omar', 'stringer', 'marlo'],
+            indices=[[0, 0], [1, 0], [1, 1]],
+            dense_shape=[2, 2]),
+    }
+    outputs = _transform_features(features, [price_cross_wire])
+    output = outputs[price_cross_wire]
+    with self.test_session() as sess:
+      output_val = sess.run(output)
+      self.assertAllEqual(
+          [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
+      for val in output_val.values:
+        self.assertIn(val, list(range(hash_bucket_size)))
+      self.assertAllEqual([2, 4], output_val.dense_shape)
+
+  def test_get_sparse_tensors(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+          'd1':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['d1A', 'd1B', 'd1C'],
+                  dense_shape=(2, 2)),
+          'd2':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['d2A', 'd2B', 'd2C'],
+                  dense_shape=(2, 2)),
+      })
+      id_weight_pair = crossed2._get_sparse_tensors(builder)
+      with _initialized_session():
+        id_tensor_eval = id_weight_pair.id_tensor.eval()
+        self.assertAllEqual(
+            ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
+             (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13),
+             (1, 14), (1, 15)),
+            id_tensor_eval.indices)
+        # Check exact hashed output. If hashing changes this test will break.
+        # All values are within [0, hash_bucket_size).
+        expected_values = (
+            6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0, 10, 11)
+        self.assertAllEqual(expected_values, id_tensor_eval.values)
+        self.assertAllEqual((2, 16), id_tensor_eval.dense_shape)
+
+  def test_get_sparse_tensors_simple(self):
+    """Same as test_get_sparse_tensors, but with simpler values."""
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      builder = _LazyBuilder({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+      })
+      id_weight_pair = crossed._get_sparse_tensors(builder)
+      with _initialized_session():
+        id_tensor_eval = id_weight_pair.id_tensor.eval()
+        self.assertAllEqual(
+            ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)),
+            id_tensor_eval.indices)
+        # Check exact hashed output. If hashing changes this test will break.
+        # All values are within [0, hash_bucket_size).
+        expected_values = (1, 0, 1, 3, 4, 2)
+        self.assertAllEqual(expected_values, id_tensor_eval.values)
+        self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
+
+  def test_linear_model(self):
+    """Tests linear_model.
+
+    Uses data from test_get_sparse_tesnsors_simple.
+    """
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          'a': constant_op.constant(((-1., .5), (.5, 1.))),
+          'c': sparse_tensor.SparseTensor(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=['cA', 'cB', 'cC'],
+              dense_shape=(2, 2)),
+      }, (crossed,))
+      bias = get_linear_model_bias()
+      crossed_var = get_linear_model_column_var(crossed)
+      with _initialized_session() as sess:
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(
+            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+
+  def test_linear_model_with_weights(self):
+    class _TestColumnWithWeights(_CategoricalColumn):
+      """Produces sparse IDs and sparse weights."""
+
+      @property
+      def name(self):
+        return 'test_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {
+            self.name: parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name): parsing_ops.VarLenFeature(
+                dtypes.float32),
+            }
+
+      @property
+      def _num_buckets(self):
+        return 5
+
+      def _transform_feature(self, inputs):
+        return (inputs.get(self.name),
+                inputs.get('{}_weights'.format(self.name)))
+
+      def _get_sparse_tensors(self, inputs, weight_collections=None,
+                              trainable=None):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = inputs.get(self)
+        return _CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        fc.linear_model({
+            t.name: sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=[0, 1, 2],
+                dense_shape=(2, 2)),
+            '{}_weights'.format(t.name): sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=[1., 10., 2.],
+                dense_shape=(2, 2)),
+            'c': sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=['cA', 'cB', 'cC'],
+                dense_shape=(2, 2)),
+        }, (crossed,))
+
+
+def get_linear_model_bias():
+  with variable_scope.variable_scope('linear_model', reuse=True):
+    return variable_scope.get_variable('bias_weights')
+
+
+def get_linear_model_column_var(column):
+  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                            'linear_model/' + column.name)[0]
+
+
+class LinearModelTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.linear_model(features={}, feature_columns=[])
+
+  def test_should_be_feature_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
+      fc.linear_model(features={'a': [[0]]}, feature_columns='NotSupported')
+
+  def test_should_be_dense_or_categorical_column(self):
+
+    class NotSupportedColumn(_FeatureColumn):
+
+      @property
+      def name(self):
+        return 'NotSupportedColumn'
+
+      def _transform_feature(self, cache):
+        pass
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    with self.assertRaisesRegexp(
+        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.linear_model(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_dense_bias(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        sess.run(price_var.assign([[10.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[15.], [55.]], predictions.eval())
+
+  def test_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_and_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [wire_cast, price])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+
+  def test_dense_and_sparse_column(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _DenseAndSparseColumn(_DenseColumn, _CategoricalColumn):
+
+      @property
+      def name(self):
+        return 'dense_and_sparse_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {self.name: parsing_ops.VarLenFeature(self.dtype)}
+
+      def _transform_feature(self, inputs):
+        return inputs.get(self.name)
+
+      @property
+      def _variable_shape(self):
+        raise ValueError('Should not use this method.')
+
+      def _get_dense_tensor(self, inputs, weight_collections=None,
+                            trainable=None):
+        raise ValueError('Should not use this method.')
+
+      @property
+      def _num_buckets(self):
+        return 4
+
+      def _get_sparse_tensors(self, inputs, weight_collections=None,
+                              trainable=None):
+        sp_tensor = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [1, 0], [1, 1]],
+            values=[2, 0, 3],
+            dense_shape=[2, 2])
+        return _CategoricalColumn.IdWeightPair(sp_tensor, None)
+
+    dense_and_sparse_column = _DenseAndSparseColumn()
+    with ops.Graph().as_default():
+      sp_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {dense_and_sparse_column.name: sp_tensor}
+      predictions = fc.linear_model(features, [dense_and_sparse_column])
+      bias = get_linear_model_bias()
+      dense_and_sparse_column_var = get_linear_model_column_var(
+          dense_and_sparse_column)
+      with _initialized_session() as sess:
+        sess.run(dense_and_sparse_column_var.assign(
+            [[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_multi_output(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        sess.run(price_var.assign([[10., 100., 1000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
+                            predictions.eval())
+
+  def test_sparse_multi_output(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast], units=3)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        sess.run(
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
+                1000., 1100., 1200.
+            ], [10000., 11000., 12000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
+                            predictions.eval())
+
+  def test_dense_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc.linear_model(features, [price])
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_sparse_multi_rank(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = array_ops.sparse_placeholder(dtypes.string)
+      wire_value = sparse_tensor.SparseTensorValue(
+          values=['omar', 'stringer', 'marlo', 'omar'],  # hashed = [2, 0, 3, 2]
+          indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
+          dense_shape=[2, 2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(features, [wire_cast])
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(
+            np.zeros((2, 1)),
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        self.assertAllClose(
+            [[1010.], [11000.]],
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+
+  def test_sparse_combiner(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc.linear_model(
+          features, [wire_cast], sparse_combiner='mean')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+
+  def test_dense_multi_dimension_multi_output(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
+        sess.run(bias.assign([2., 3., 4.]))
+        self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
+                            predictions.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc.linear_model(features, [price])
+      with _initialized_session():
+        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+          predictions.eval()
+
+  def test_dense_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      predictions = fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_dense_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price1_var.eval())
+        self.assertAllClose([[0.]], price2_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price1_var.assign([[10.], [100.]]))
+        sess.run(price2_var.assign([[1000.]]))
+        sess.run(bias.assign([7.]))
+        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+
+  def test_dense_collection(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      self.assertIn(bias, my_vars)
+      self.assertIn(price_var, my_vars)
+
+  def test_sparse_collection(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(
+          features, [wire_cast], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, my_vars)
+      self.assertIn(wire_cast_var, my_vars)
+
+  def test_dense_trainable_default(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(price_var, trainable_vars)
+
+  def test_sparse_trainable_default(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(features, [wire_cast])
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(wire_cast_var, trainable_vars)
+
+  def test_dense_trainable_false(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc.linear_model(features, [price], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_sparse_trainable_false(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc.linear_model(features, [wire_cast], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc.linear_model(
+          features, [price_a, wire_cast, price_b],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc.linear_model(
+          features, [wire_cast, price_b, price_a],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+      fc.linear_model(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.linear_model(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'must have the same size and shape'):
+          sess.run(
+              predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      predictions = fc.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            predictions,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+
+class InputLayerTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.input_layer(features={}, feature_columns=[])
+
+  def test_should_be_dense_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
+      fc.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[
+              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+          ])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.input_layer(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_one_column(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1.], [5.]], net.eval())
+
+  def test_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+          net.eval()
+
+  def test_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      net = fc.input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+  def test_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      net = fc.input_layer(features, [price1, price2])
+      with _initialized_session():
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    with ops.Graph().as_default():
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+      }
+      net1 = fc.input_layer(features, [price_a, price_b])
+      net2 = fc.input_layer(features, [price_b, price_a])
+      with _initialized_session():
+        self.assertAllClose([[1., 3.]], net1.eval())
+        self.assertAllClose([[1., 3.]], net2.eval())
+
+  def test_fails_for_categorical_column(self):
+    animal = fc.categorical_column_with_identity('animal', num_buckets=4)
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+      with self.assertRaisesRegexp(Exception, 'must be a _DenseColumn'):
+        fc.input_layer(features, [animal])
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.input_layer(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.input_layer(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      net = fc.input_layer(features, [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'Dimensions of inputs should match'):
+          sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      net = fc.input_layer(features, [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            net,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+  def test_with_numpy_input_fn(self):
+    embedding_values = (
+        (1., 2., 3., 4., 5.),  # id 0
+        (6., 7., 8., 9., 10.),  # id 1
+        (11., 12., 13., 14., 15.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in input_layer
+    price = fc.numeric_column('price')
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    # one_hot_body_style has 3 dims in input_layer.
+    one_hot_body_style = fc.indicator_column(body_style)
+    # embedded_body_style has 5 dims in input_layer.
+    embedded_body_style = fc.embedding_column(body_style, dimension=5,
+                                              initializer=_initializer)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([11., 12., 13., 14.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_body_style])
+    self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      # Each row is formed by concatenating `embedded_body_style`,
+      # `one_hot_body_style`, and `price` in order.
+      self.assertAllEqual(
+          [[11., 12., 13., 14., 15., 0., 0., 1., 11.],
+           [1., 2., 3., 4., 5., 1., 0., 0., 12]],
+          sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def test_with_1d_sparse_tensor(self):
+    embedding_values = (
+        (1., 2., 3., 4., 5.),  # id 0
+        (6., 7., 8., 9., 10.),  # id 1
+        (11., 12., 13., 14., 15.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in input_layer
+    price = fc.numeric_column('price')
+
+    # one_hot_body_style has 3 dims in input_layer.
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    one_hot_body_style = fc.indicator_column(body_style)
+
+    # embedded_body_style has 5 dims in input_layer.
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+    embedded_country = fc.embedding_column(country, dimension=5,
+                                           initializer=_initializer)
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': constant_op.constant([11., 12.,]),
+        'body-style': sparse_tensor.SparseTensor(
+            indices=((0,), (1,)),
+            values=('sedan', 'hardtop'),
+            dense_shape=(2,)),
+        # This is dense tensor for the categorical_column.
+        'country': constant_op.constant(['CA', 'US']),
+    }
+    self.assertEqual(1, features['price'].shape.ndims)
+    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
+    self.assertEqual(1, features['country'].shape.ndims)
+
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_country])
+    self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+
+      # Each row is formed by concatenating `embedded_body_style`,
+      # `one_hot_body_style`, and `price` in order.
+      self.assertAllEqual(
+          [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
+           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
+          sess.run(net))
+
+  def test_with_1d_unknown_shape_sparse_tensor(self):
+    embedding_values = (
+        (1., 2., 3., 4., 5.),  # id 0
+        (6., 7., 8., 9., 10.),  # id 1
+        (11., 12., 13., 14., 15.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in input_layer
+    price = fc.numeric_column('price')
+
+    # one_hot_body_style has 3 dims in input_layer.
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    one_hot_body_style = fc.indicator_column(body_style)
+
+    # embedded_body_style has 5 dims in input_layer.
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+    embedded_country = fc.embedding_column(country, dimension=5,
+                                           initializer=_initializer)
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+        'body-style': array_ops.sparse_placeholder(dtypes.string),
+        # This is dense tensor for the categorical_column.
+        'country': array_ops.placeholder(dtypes.string),
+    }
+    self.assertIsNone(features['price'].shape.ndims)
+    self.assertIsNone(features['body-style'].get_shape().ndims)
+    self.assertIsNone(features['country'].shape.ndims)
+
+    price_data = np.array([11., 12.])
+    body_style_data = sparse_tensor.SparseTensorValue(
+        indices=((0,), (1,)),
+        values=('sedan', 'hardtop'),
+        dense_shape=(2,))
+
+    # Dense categorical_column with unknown shape is not allowed.
+    with self.assertRaisesRegexp(ValueError, 'Undefined input_tensor shape.'):
+      fc.input_layer(features, [price, one_hot_body_style, embedded_country])
+
+    net = fc.input_layer(features, [price, one_hot_body_style])
+    self.assertEqual(1 + 3, net.shape[1])
+    with _initialized_session() as sess:
+
+      # Each row is formed by concatenating `embedded_body_style`,
+      # `one_hot_body_style`, and `price` in order.
+      self.assertAllEqual(
+          [[0., 0., 1., 11.], [1., 0., 0., 12.]],
+          sess.run(net, feed_dict={
+              features['price']: price_data,
+              features['body-style']: body_style_data}))
+
+
+class MakeParseExampleSpecTest(test.TestCase):
+
+  class _TestFeatureColumn(_FeatureColumn,
+                           collections.namedtuple('_TestFeatureColumn',
+                                                  ['parse_spec'])):
+
+    @property
+    def _parse_example_spec(self):
+      return self.parse_spec
+
+  def test_no_feature_columns(self):
+    actual = fc.make_parse_example_spec([])
+    self.assertDictEqual({}, actual)
+
+  def test_invalid_type(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'All feature_columns must be _FeatureColumn instances.*invalid_column'):
+      fc.make_parse_example_spec(
+          (self._TestFeatureColumn({key1: parse_spec1}), 'invalid_column'))
+
+  def test_one_feature_column(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),))
+    self.assertDictEqual({key1: parse_spec1}, actual)
+
+  def test_two_feature_columns(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    key2 = 'key2'
+    parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),
+         self._TestFeatureColumn({key2: parse_spec2})))
+    self.assertDictEqual({key1: parse_spec1, key2: parse_spec2}, actual)
+
+  def test_equal_keys_different_parse_spec(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'feature_columns contain different parse_spec for key key1'):
+      fc.make_parse_example_spec(
+          (self._TestFeatureColumn({key1: parse_spec1}),
+           self._TestFeatureColumn({key1: parse_spec2})))
+
+  def test_equal_keys_equal_parse_spec(self):
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),
+         self._TestFeatureColumn({key1: parse_spec1})))
+    self.assertDictEqual({key1: parse_spec1}, actual)
+
+  def test_multiple_features_dict(self):
+    """parse_spc for one column is a dict with length > 1."""
+    key1 = 'key1'
+    parse_spec1 = parsing_ops.FixedLenFeature(
+        shape=(2,), dtype=dtypes.float32, default_value=0.)
+    key2 = 'key2'
+    parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
+    key3 = 'key3'
+    parse_spec3 = parsing_ops.VarLenFeature(dtype=dtypes.int32)
+    actual = fc.make_parse_example_spec(
+        (self._TestFeatureColumn({key1: parse_spec1}),
+         self._TestFeatureColumn({key2: parse_spec2, key3: parse_spec3})))
+    self.assertDictEqual(
+        {key1: parse_spec1, key2: parse_spec2, key3: parse_spec3}, actual)
+
+
+def _assert_sparse_tensor_value(test_case, expected, actual):
+  test_case.assertEqual(np.int64, np.array(actual.indices).dtype)
+  test_case.assertAllEqual(expected.indices, actual.indices)
+
+  test_case.assertEqual(
+      np.array(expected.values).dtype, np.array(actual.values).dtype)
+  test_case.assertAllEqual(expected.values, actual.values)
+
+  test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
+  test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
+
+
+class VocabularyFileCategoricalColumnTest(test.TestCase):
+
+  def setUp(self):
+    super(VocabularyFileCategoricalColumnTest, self).setUp()
+
+    # Contains ints, Golden State Warriors jersey numbers: 30, 35, 11, 23, 22
+    self._warriors_vocabulary_file_name = test.test_src_dir_path(
+        'python/feature_column/testdata/warriors_vocabulary.txt')
+    self._warriors_vocabulary_size = 5
+
+    # Contains strings, character names from 'The Wire': omar, stringer, marlo
+    self._wire_vocabulary_file_name = test.test_src_dir_path(
+        'python/feature_column/testdata/wire_vocabulary.txt')
+    self._wire_vocabulary_size = 3
+
+  def test_defaults(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
+    self.assertEqual('aaa', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.string)
+    }, column._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
+        num_oov_buckets=4, dtype=dtypes.int32)
+    self.assertEqual(7, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+    }, column._parse_example_spec)
+
+  def test_deep_copy(self):
+    original = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
+        num_oov_buckets=4, dtype=dtypes.int32)
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', column.name)
+      self.assertEqual(7, column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+      }, column._parse_example_spec)
+
+  def test_vocabulary_file_none(self):
+    with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file=None, vocabulary_size=3)
+
+  def test_vocabulary_file_empty_string(self):
+    with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file='', vocabulary_size=3)
+
+  def test_invalid_vocabulary_file(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='file_does_not_exist', vocabulary_size=10)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    with self.assertRaisesRegexp(errors.OpError, 'file_does_not_exist'):
+      with self.test_session():
+        lookup_ops.tables_initializer().run()
+
+  def test_invalid_vocabulary_size(self):
+    with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=None)
+    with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=-1)
+    with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=0)
+
+  def test_too_large_vocabulary_size(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size + 1)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    with self.assertRaisesRegexp(errors.OpError, 'Invalid vocab_size'):
+      with self.test_session():
+        lookup_ops.tables_initializer().run()
+
+  def test_invalid_num_oov_buckets(self):
+    with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          num_oov_buckets=-1)
+
+  def test_invalid_dtype(self):
+    with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          dtype=dtypes.float64)
+
+  def test_invalid_buckets_and_default_value(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'both num_oov_buckets and default_value'):
+      fc.categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=self._wire_vocabulary_size,
+          num_oov_buckets=100,
+          default_value=2)
+
+  def test_invalid_input_dtype_int32(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        dtype=dtypes.string)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(12, 24, 36),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_invalid_input_dtype_string(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._warriors_vocabulary_file_name,
+        vocabulary_size=self._warriors_vocabulary_size,
+        dtype=dtypes.int32)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_get_sparse_tensors(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_transform_feature(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_tensor = _transform_features({'aaa': inputs}, [column])[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(self,
+                                  sparse_tensor.SparseTensorValue(
+                                      indices=inputs.indices,
+                                      values=np.array(
+                                          (2, -1, 0), dtype=np.int64),
+                                      dense_shape=inputs.dense_shape),
+                                  id_tensor.eval())
+
+  def test_get_sparse_tensors_weight_collections(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    inputs = sparse_tensor.SparseTensor(
+        values=['omar', 'stringer', 'marlo'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': inputs
+        }), weight_collections=('my_weights',))
+
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertItemsEqual([], ops.get_collection('my_weights'))
+
+  def test_get_sparse_tensors_dense_input(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': (('marlo', ''), ('skywalker', 'omar'))
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_default_value_in_vocabulary(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        default_value=2)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 2, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_with_oov_buckets(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=100)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (1, 2)),
+        values=('marlo', 'skywalker', 'omar', 'heisenberg'),
+        dense_shape=(2, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 33, 0, 62), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_small_vocabulary_size(self):
+    # 'marlo' is the last entry in our vocabulary file, so be setting
+    # `vocabulary_size` to 1 less than number of entries in file, we take
+    # 'marlo' out of the vocabulary.
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size - 1)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((-1, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._warriors_vocabulary_file_name,
+        vocabulary_size=self._warriors_vocabulary_size,
+        dtype=dtypes.int32)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+        values=(11, 100, 30, 22),
+        dense_shape=(3, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0, 4), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32_dense_input(self):
+    default_value = -100
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._warriors_vocabulary_file_name,
+        vocabulary_size=self._warriors_vocabulary_size,
+        dtype=dtypes.int32,
+        default_value=default_value)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': ((11, -1, -1), (100, 30, -1), (-1, -1, 22))
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+              values=np.array((2, default_value, 0, 4), dtype=np.int64),
+              dense_shape=(3, 3)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32_with_oov_buckets(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._warriors_vocabulary_file_name,
+        vocabulary_size=self._warriors_vocabulary_size,
+        dtype=dtypes.int32,
+        num_oov_buckets=100)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+        values=(11, 100, 30, 22),
+        dense_shape=(3, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 60, 0, 4), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          wire_column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=('marlo', 'skywalker', 'omar'),
+              dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
+
+class VocabularyListCategoricalColumnTest(test.TestCase):
+
+  def test_defaults_string(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    self.assertEqual('aaa', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.string)
+    }, column._parse_example_spec)
+
+  def test_defaults_int(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36))
+    self.assertEqual('aaa', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, column._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32,
+        default_value=-99)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+    }, column._parse_example_spec)
+
+  def test_deep_copy(self):
+    original = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32)
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', column.name)
+      self.assertEqual(3, column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int32)
+      }, column._parse_example_spec)
+
+  def test_invalid_dtype(self):
+    with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          dtype=dtypes.float32)
+
+  def test_invalid_mapping_dtype(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'vocabulary dtype must be string or integer'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12., 24., 36.))
+
+  def test_mismatched_int_dtype(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'dtype.*and vocabulary dtype.*do not match'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          dtype=dtypes.int32)
+
+  def test_mismatched_string_dtype(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'dtype.*and vocabulary dtype.*do not match'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.string)
+
+  def test_none_mapping(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'vocabulary_list.*must be non-empty'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=None)
+
+  def test_empty_mapping(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'vocabulary_list.*must be non-empty'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=tuple([]))
+
+  def test_duplicate_mapping(self):
+    with self.assertRaisesRegexp(ValueError, 'Duplicate keys'):
+      fc.categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12, 24, 12))
+
+  def test_invalid_input_dtype_int32(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(12, 24, 36),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_invalid_input_dtype_string(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=(12, 24, 36))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_parse_example_string(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_parse_example_int(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(11, 21, 31))
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                    value=[11, 21]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=[11, 21],
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_get_sparse_tensors(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_transform_feature(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_tensor = _transform_features({'aaa': inputs}, [column])[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_tensor.eval())
+
+  def test_get_sparse_tensors_weight_collections(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    inputs = sparse_tensor.SparseTensor(
+        values=['omar', 'stringer', 'marlo'],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': inputs
+        }), weight_collections=('my_weights',))
+
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertItemsEqual([], ops.get_collection('my_weights'))
+
+  def test_get_sparse_tensors_dense_input(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': (('marlo', ''), ('skywalker', 'omar'))
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_default_value_in_vocabulary(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        default_value=2)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, 2, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32(self):
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
+        dtype=dtypes.int32)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+        values=np.array((11, 100, 30, 22), dtype=np.int32),
+        dense_shape=(3, 3))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0, 4), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_int32_dense_input(self):
+    default_value = -100
+    column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
+        dtype=dtypes.int32,
+        default_value=default_value)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa':
+                np.array(
+                    ((11, -1, -1), (100, 30, -1), (-1, -1, 22)), dtype=np.int32)
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+              values=np.array((2, default_value, 0, 4), dtype=np.int64),
+              dense_shape=(3, 3)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+    self.assertEqual(3, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          wire_column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=('marlo', 'skywalker', 'omar'),
+              dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> None, 'omar' -> 0: wire_var[0] = 1
+        self.assertAllClose(((3.,), (1.,)), predictions.eval())
+
+
+class IdentityCategoricalColumnTest(test.TestCase):
+
+  def test_constructor(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual('aaa', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, column._parse_example_spec)
+
+  def test_deep_copy(self):
+    original = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', column.name)
+      self.assertEqual(3, column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+      }, column._parse_example_spec)
+
+  def test_invalid_num_buckets_zero(self):
+    with self.assertRaisesRegexp(ValueError, 'num_buckets 0 < 1'):
+      fc.categorical_column_with_identity(key='aaa', num_buckets=0)
+
+  def test_invalid_num_buckets_negative(self):
+    with self.assertRaisesRegexp(ValueError, 'num_buckets -1 < 1'):
+      fc.categorical_column_with_identity(key='aaa', num_buckets=-1)
+
+  def test_invalid_default_value_too_small(self):
+    with self.assertRaisesRegexp(ValueError, 'default_value -1 not in range'):
+      fc.categorical_column_with_identity(
+          key='aaa', num_buckets=3, default_value=-1)
+
+  def test_invalid_default_value_too_big(self):
+    with self.assertRaisesRegexp(ValueError, 'default_value 3 not in range'):
+      fc.categorical_column_with_identity(
+          key='aaa', num_buckets=3, default_value=3)
+
+  def test_invalid_input_dtype(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'Invalid input, not integer'):
+      column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_identity(key='aaa', num_buckets=30)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                    value=[11, 21]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([11, 21], dtype=np.int64),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_get_sparse_tensors(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0, 1, 0),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((0, 1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_transform_feature(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0, 1, 0),
+        dense_shape=(2, 2))
+    id_tensor = _transform_features({'aaa': inputs}, [column])[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((0, 1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_tensor.eval())
+
+  def test_get_sparse_tensors_weight_collections(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0, 1, 0),
+        dense_shape=(2, 2))
+    column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': inputs
+        }), weight_collections=('my_weights',))
+
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertItemsEqual([], ops.get_collection('my_weights'))
+
+  def test_get_sparse_tensors_dense_input(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({
+            'aaa': ((0, -1), (1, 0))
+        }))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((0, 1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_with_inputs_too_small(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, -1, 0),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      with self.assertRaisesRegexp(
+          errors.OpError, 'assert_greater_or_equal_0'):
+        id_weight_pair.id_tensor.eval()
+
+  def test_get_sparse_tensors_with_inputs_too_big(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, 99, 0),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      with self.assertRaisesRegexp(
+          errors.OpError, 'assert_less_than_num_buckets'):
+        id_weight_pair.id_tensor.eval()
+
+  def test_get_sparse_tensors_with_default_value(self):
+    column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=4, default_value=3)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, -1, 99),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((1, 3, 3), dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_weight_pair.id_tensor.eval())
+
+  def test_get_sparse_tensors_with_default_value_and_placeholder_inputs(self):
+    column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=4, default_value=3)
+    input_indices = array_ops.placeholder(dtype=dtypes.int64)
+    input_values = array_ops.placeholder(dtype=dtypes.int32)
+    input_shape = array_ops.placeholder(dtype=dtypes.int64)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=input_indices,
+        values=input_values,
+        dense_shape=input_shape)
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=np.array(((0, 0), (1, 0), (1, 1)), dtype=np.int64),
+              values=np.array((1, 3, 3), dtype=np.int64),
+              dense_shape=np.array((2, 2), dtype=np.int64)),
+          id_weight_pair.id_tensor.eval(feed_dict={
+              input_indices: ((0, 0), (1, 0), (1, 1)),
+              input_values: (1, -1, 99),
+              input_shape: (2, 2),
+          }))
+
+  def test_linear_model(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual(3, column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          column.name: sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] = 1
+        # weight_var[2] + weight_var[1] = 3+2 = 5
+        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
+
+class TransformFeaturesTest(test.TestCase):
+
+  # All transform tests are distributed in column test.
+  # Here we only test multi column case and naming
+  def transform_multi_column(self):
+    bucketized_price = fc.bucketized_column(
+        fc.numeric_column('price'), boundaries=[0, 2, 4, 6])
+    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    with ops.Graph().as_default():
+      features = {
+          'price': [[-1.], [5.]],
+          'wire':
+              sparse_tensor.SparseTensor(
+                  values=['omar', 'stringer', 'marlo'],
+                  indices=[[0, 0], [1, 0], [1, 1]],
+                  dense_shape=[2, 2])
+      }
+      transformed = _transform_features(features,
+                                        [bucketized_price, hashed_sparse])
+      with _initialized_session():
+        self.assertIn(bucketized_price.name, transformed[bucketized_price].name)
+        self.assertAllEqual([[0], [3]], transformed[bucketized_price].eval())
+        self.assertIn(hashed_sparse.name, transformed[hashed_sparse].name)
+        self.assertAllEqual([6, 4, 1], transformed[hashed_sparse].values.eval())
+
+  def test_column_order(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _LoggerColumn(_FeatureColumn):
+
+      def __init__(self, name):
+        self._name = name
+
+      @property
+      def name(self):
+        return self._name
+
+      def _transform_feature(self, inputs):
+        del inputs
+        self.call_order = call_logger['count']
+        call_logger['count'] += 1
+        return 'Anything'
+
+      @property
+      def _parse_example_spec(self):
+        pass
+
+    with ops.Graph().as_default():
+      column1 = _LoggerColumn('1')
+      column2 = _LoggerColumn('2')
+      call_logger = {'count': 0}
+      _transform_features({}, [column1, column2])
+      self.assertEqual(0, column1.call_order)
+      self.assertEqual(1, column2.call_order)
+
+      call_logger = {'count': 0}
+      _transform_features({}, [column2, column1])
+      self.assertEqual(0, column1.call_order)
+      self.assertEqual(1, column2.call_order)
+
+
+class IndicatorColumnTest(test.TestCase):
+
+  def test_indicator_column(self):
+    a = fc.categorical_column_with_hash_bucket('a', 4)
+    indicator_a = fc.indicator_column(a)
+    self.assertEqual(indicator_a.categorical_column.name, 'a')
+    self.assertEqual(indicator_a._variable_shape, [1, 4])
+
+    b = fc.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    indicator_b = fc.indicator_column(b)
+    self.assertEqual(indicator_b.categorical_column.name, 'b')
+    self.assertEqual(indicator_b._variable_shape, [1, 100])
+
+  def test_1D_shape_succeeds(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_hash_bucket('animal', 4))
+    builder = _LazyBuilder({'animal': ['fox', 'fox']})
+    output = builder.get(animal)
+    with self.test_session():
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+  def test_2D_shape_succeeds(self):
+    # TODO(ispir/cassandrax): Swith to categorical_column_with_keys when ready.
+    animal = fc.indicator_column(
+        fc.categorical_column_with_hash_bucket('animal', 4))
+    builder = _LazyBuilder({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [1, 0]],
+                values=['fox', 'fox'],
+                dense_shape=[2, 1])
+    })
+    output = builder.get(animal)
+    with self.test_session():
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+  def test_multi_hot(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+
+    builder = _LazyBuilder({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [0, 1]], values=[1, 1], dense_shape=[1, 2])
+    })
+    output = builder.get(animal)
+    with self.test_session():
+      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+
+  def test_multi_hot2(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    builder = _LazyBuilder({
+        'animal':
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+    })
+    output = builder.get(animal)
+    with self.test_session():
+      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
+
+  def test_deep_copy(self):
+    a = fc.categorical_column_with_hash_bucket('a', 4)
+    column = fc.indicator_column(a)
+    column_copy = copy.deepcopy(column)
+    self.assertEqual(column_copy.categorical_column.name, 'a')
+    self.assertEqual(column.name, 'a_indicator')
+    self.assertEqual(column._variable_shape, [1, 4])
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_indicator = fc.indicator_column(a)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a_indicator]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_transform(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_indicator = fc.indicator_column(a)
+    features = {
+        'aaa': sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=('marlo', 'skywalker', 'omar'),
+            dense_shape=(2, 2))
+    }
+    indicator_tensor = _transform_features(features, [a_indicator])[a_indicator]
+    with _initialized_session():
+      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+
+  def test_linear_model(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+
+      predictions = fc.linear_model(features, [animal])
+      weight_var = get_linear_model_column_var(animal)
+      with _initialized_session():
+        # All should be zero-initialized.
+        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
+        self.assertAllClose([[0.]], predictions.eval())
+        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
+        self.assertAllClose([[2. + 3.]], predictions.eval())
+
+  def test_input_layer(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+      net = fc.input_layer(features, [animal])
+      with _initialized_session():
+        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+
+
+class EmbeddingColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension)
+    self.assertIs(categorical_column, embedding_column.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column.dimension)
+    self.assertEqual('mean', embedding_column.combiner)
+    self.assertIsNotNone(embedding_column.initializer)
+    self.assertIsNone(embedding_column.ckpt_to_load_from)
+    self.assertIsNone(embedding_column.tensor_name_in_ckpt)
+    self.assertIsNone(embedding_column.max_norm)
+    self.assertTrue(embedding_column.trainable)
+    self.assertEqual('aaa_embedding', embedding_column.name)
+    self.assertEqual(
+        (embedding_dimension,), embedding_column._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        combiner='my_combiner', initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42., trainable=False)
+    self.assertIs(categorical_column, embedding_column.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column.dimension)
+    self.assertEqual('my_combiner', embedding_column.combiner)
+    self.assertEqual('my_initializer', embedding_column.initializer())
+    self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
+    self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
+    self.assertEqual(42., embedding_column.max_norm)
+    self.assertFalse(embedding_column.trainable)
+    self.assertEqual('aaa_embedding', embedding_column.name)
+    self.assertEqual(
+        (embedding_dimension,), embedding_column._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column._parse_example_spec)
+
+  def test_deep_copy(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    original = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        combiner='my_combiner', initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42., trainable=False)
+    for embedding_column in (original, copy.deepcopy(original)):
+      self.assertEqual('aaa', embedding_column.categorical_column.name)
+      self.assertEqual(3, embedding_column.categorical_column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+      }, embedding_column.categorical_column._parse_example_spec)
+
+      self.assertEqual(embedding_dimension, embedding_column.dimension)
+      self.assertEqual('my_combiner', embedding_column.combiner)
+      self.assertEqual('my_initializer', embedding_column.initializer())
+      self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
+      self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
+      self.assertEqual(42., embedding_column.max_norm)
+      self.assertFalse(embedding_column.trainable)
+      self.assertEqual('aaa_embedding', embedding_column.name)
+      self.assertEqual(
+          (embedding_dimension,), embedding_column._variable_shape)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+      }, embedding_column._parse_example_spec)
+
+  def test_invalid_initializer(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
+      fc.embedding_column(categorical_column, dimension=2, initializer='not_fn')
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_embedded = fc.embedding_column(a, dimension=2)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer']))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a_embedded]))
+    self.assertIn('aaa', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+
+  def test_transform_feature(self):
+    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    a_embedded = fc.embedding_column(a, dimension=2)
+    features = {
+        'aaa': sparse_tensor.SparseTensor(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(0, 1, 0),
+            dense_shape=(2, 2))
+    }
+    outputs = _transform_features(features, [a, a_embedded])
+    output_a = outputs[a]
+    output_embedded = outputs[a_embedded]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self, output_a.eval(), output_embedded.eval())
+
+  def test_get_dense_tensor(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_get_dense_tensor_3d(self):
+    # Inputs.
+    vocabulary_size = 4
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0, 0), (1, 1, 0), (1, 1, 4), (3, 0, 0), (3, 1, 2)),
+        values=(2, 0, 1, 1, 2),
+        dense_shape=(4, 2, 5))
+
+    # Embedding variable.
+    embedding_dimension = 3
+    embedding_values = (
+        (1., 2., 4.),   # id 0
+        (3., 5., 1.),   # id 1
+        (7., 11., 2.),  # id 2
+        (2., 7., 12.)   # id 3
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [[2], []], embedding = [[7, 11, 2], [0, 0, 0]]
+        ((7., 11., 2.), (0., 0., 0.)),
+        # example 1, ids [[], [0, 1]], embedding
+        # = mean([[], [1, 2, 4] + [3, 5, 1]]) = [[0, 0, 0], [2, 3.5, 2.5]]
+        ((0., 0., 0.), (2., 3.5, 2.5)),
+        # example 2, ids [[], []], embedding = [[0, 0, 0], [0, 0, 0]]
+        ((0., 0., 0.), (0., 0., 0.)),
+        # example 3, ids [[1], [2]], embedding = [[3, 5, 1], [7, 11, 2]]
+        ((3., 5., 1.), (7., 11., 2.)),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_get_dense_tensor_weight_collections(self):
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_column = fc.embedding_column(categorical_column, dimension=2)
+
+    # Provide sparse input and get dense result.
+    embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }), weight_collections=('my_vars',))
+
+    # Assert expected embedding variable and lookups.
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    my_vars = ops.get_collection('my_vars')
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in my_vars]))
+
+  def test_get_dense_tensor_placeholder_inputs(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    input_indices = array_ops.placeholder(dtype=dtypes.int64)
+    input_values = array_ops.placeholder(dtype=dtypes.int64)
+    input_shape = array_ops.placeholder(dtype=dtypes.int64)
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa':
+                sparse_tensor.SparseTensorValue(
+                    indices=input_indices,
+                    values=input_values,
+                    dense_shape=input_shape)
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval(
+          feed_dict={
+              input_indices: sparse_input.indices,
+              input_values: sparse_input.values,
+              input_shape: sparse_input.dense_shape,
+          }))
+
+  def test_get_dense_tensor_restore_from_ckpt(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable. The checkpoint file contains _embedding_values.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    ckpt_path = test.test_src_dir_path(
+        'python/feature_column/testdata/embedding.ckpt')
+    ckpt_tensor = 'my_embedding'
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        ckpt_to_load_from=ckpt_path,
+        tensor_name_in_ckpt=ckpt_tensor)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_linear_model(self):
+    # Inputs.
+    batch_size = 4
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(batch_size, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          categorical_column.name: sparse_input
+      }, (embedding_column,))
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_embedding/weights:0',
+          'linear_model/aaa_embedding/embedding_weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v for v in ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_embedding/embedding_weights:0']
+      linear_weights = trainable_vars[
+          'linear_model/aaa_embedding/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # example 2, ids [], embedding[2] = [0, 0]
+        # example 3, ids [1], embedding[3] = [3, 5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+
+  def test_input_layer(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('input_layer/aaa_embedding/embedding_weights:0',),
+        tuple([v.name for v in global_vars]))
+    trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertItemsEqual(
+        ('input_layer/aaa_embedding/embedding_weights:0',),
+        tuple([v.name for v in trainable_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, trainable_vars[0].eval())
+      self.assertAllEqual(expected_lookups, input_layer.eval())
+
+  def test_input_layer_not_trainable(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer, trainable=False)
+
+    # Provide sparse input and get dense result.
+    input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('input_layer/aaa_embedding/embedding_weights:0',),
+        tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(
+        [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, input_layer.eval())
+
+
+class WeightedCategoricalColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    self.assertEqual('ids_weighted_by_values', column.name)
+    self.assertEqual(3, column._num_buckets)
+    self.assertEqual({
+        'ids': parsing_ops.VarLenFeature(dtypes.int64),
+        'values': parsing_ops.VarLenFeature(dtypes.float32)
+    }, column._parse_example_spec)
+
+  def test_deep_copy(self):
+    """Tests deepcopy of categorical_column_with_hash_bucket."""
+    original = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    for column in (original, copy.deepcopy(original)):
+      self.assertEqual('ids_weighted_by_values', column.name)
+      self.assertEqual(3, column._num_buckets)
+      self.assertEqual({
+          'ids': parsing_ops.VarLenFeature(dtypes.int64),
+          'values': parsing_ops.VarLenFeature(dtypes.float32)
+      }, column._parse_example_spec)
+
+  def test_invalid_dtype_none(self):
+    with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
+      fc.weighted_categorical_column(
+          categorical_column=fc.categorical_column_with_identity(
+              key='ids', num_buckets=3),
+          weight_feature_key='values',
+          dtype=None)
+
+  def test_invalid_dtype_string(self):
+    with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
+      fc.weighted_categorical_column(
+          categorical_column=fc.categorical_column_with_identity(
+              key='ids', num_buckets=3),
+          weight_feature_key='values',
+          dtype=dtypes.string)
+
+  def test_invalid_input_dtype(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    strings = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(ValueError, 'Bad dtype'):
+      _transform_features({'ids': strings, 'values': strings}, (column,))
+
+  def test_column_name_collision(self):
+    with self.assertRaisesRegexp(ValueError, r'Parse config.*already exists'):
+      fc.weighted_categorical_column(
+          categorical_column=fc.categorical_column_with_identity(
+              key='aaa', num_buckets=3),
+          weight_feature_key='aaa')._parse_example_spec()
+
+  def test_missing_weights(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('omar', 'stringer', 'marlo'),
+        dense_shape=(2, 2))
+    with self.assertRaisesRegexp(
+        ValueError, 'values is not in features dictionary'):
+      _transform_features({'ids': inputs}, (column,))
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_weighted = fc.weighted_categorical_column(a, weight_feature_key='weights')
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer'])),
+            'weights':
+                feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                    value=[1., 10.]))
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a_weighted]))
+    self.assertIn('aaa', features)
+    self.assertIn('weights', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([1., 10.], dtype=np.float32),
+              dense_shape=[1, 2]),
+          features['weights'].eval())
+
+  def test_transform_features(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0, 1, 0),
+        dense_shape=(2, 2))
+    weights = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0.5, 1.0, 0.1),
+        dense_shape=(2, 2))
+    id_tensor, weight_tensor = _transform_features({
+        'ids': inputs,
+        'values': weights,
+    }, (column,))[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array(inputs.values, dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=weights.indices,
+              values=np.array(weights.values, dtype=np.float32),
+              dense_shape=weights.dense_shape),
+          weight_tensor.eval())
+
+  def test_transform_features_dense_input(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    weights = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(0.5, 1.0, 0.1),
+        dense_shape=(2, 2))
+    id_tensor, weight_tensor = _transform_features({
+        'ids': ((0, -1), (1, 0)),
+        'values': weights,
+    }, (column,))[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((0, 1, 0), dtype=np.int64),
+              dense_shape=(2, 2)),
+          id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=weights.indices,
+              values=np.array(weights.values, dtype=np.float32),
+              dense_shape=weights.dense_shape),
+          weight_tensor.eval())
+
+  def test_transform_features_dense_weights(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 1, 0),
+        dense_shape=(2, 2))
+    id_tensor, weight_tensor = _transform_features({
+        'ids': inputs,
+        'values': ((.5, 0.), (1., .1)),
+    }, (column,))[column]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array(inputs.values, dtype=np.int64),
+              dense_shape=inputs.dense_shape),
+          id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=np.array((.5, 1., .1), dtype=np.float32),
+              dense_shape=(2, 2)),
+          weight_tensor.eval())
+
+  def test_linear_model(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          'ids': sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2)),
+          'values': sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(.5, 1., .1),
+              dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+  def test_linear_model_mismatched_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError, r'Dimensions.*are not compatible'):
+        fc.linear_model({
+            'ids': sparse_tensor.SparseTensorValue(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=(0, 2, 1),
+                dense_shape=(2, 2)),
+            'values': sparse_tensor.SparseTensorValue(
+                indices=((0, 0), (0, 1), (1, 0), (1, 1)),
+                values=(.5, 11., 1., .1),
+                dense_shape=(2, 2))
+        }, (column,))
+
+  def test_linear_model_mismatched_dense_values(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          'ids': sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2)),
+          'values': ((.5,), (1.,))
+      }, (column,))
+      with _initialized_session():
+        with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+          predictions.eval()
+
+  def test_linear_model_mismatched_dense_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          'ids': sparse_tensor.SparseTensorValue(
+              indices=((0, 0), (1, 0), (1, 1)),
+              values=(0, 2, 1),
+              dense_shape=(2, 2)),
+          'values': ((.5,), (1.,), (.1,))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+  # TODO(ptucker): Add test with embedding of weighted categorical.
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/feature_column/testdata/embedding.ckpt.data-00000-of-00001 b/tensorflow/python/feature_column/testdata/embedding.ckpt.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..5cc36d86d60d4a76b1cf005fe207e0d8af0f3f06
Binary files /dev/null and b/tensorflow/python/feature_column/testdata/embedding.ckpt.data-00000-of-00001 differ
diff --git a/tensorflow/python/feature_column/testdata/embedding.ckpt.index b/tensorflow/python/feature_column/testdata/embedding.ckpt.index
new file mode 100644
index 0000000000000000000000000000000000000000..c1f35a8fcfffed90eb44b3d784f998cafb59d3aa
Binary files /dev/null and b/tensorflow/python/feature_column/testdata/embedding.ckpt.index differ
diff --git a/tensorflow/python/feature_column/testdata/embedding.ckpt.meta b/tensorflow/python/feature_column/testdata/embedding.ckpt.meta
new file mode 100644
index 0000000000000000000000000000000000000000..65bc3f2becb000010273d8e9835e7e39d553f5c7
Binary files /dev/null and b/tensorflow/python/feature_column/testdata/embedding.ckpt.meta differ
diff --git a/tensorflow/python/feature_column/testdata/warriors_vocabulary.txt b/tensorflow/python/feature_column/testdata/warriors_vocabulary.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6c917fa699903c367734f220953f0c97a39bc9ef
--- /dev/null
+++ b/tensorflow/python/feature_column/testdata/warriors_vocabulary.txt
@@ -0,0 +1,5 @@
+30
+35
+11
+23
+22
diff --git a/tensorflow/python/feature_column/testdata/wire_vocabulary.txt b/tensorflow/python/feature_column/testdata/wire_vocabulary.txt
new file mode 100644
index 0000000000000000000000000000000000000000..32c6b5692a0d4c8b2935cd7b32f3a5396857ee3d
--- /dev/null
+++ b/tensorflow/python/feature_column/testdata/wire_vocabulary.txt
@@ -0,0 +1,3 @@
+omar
+stringer
+marlo
diff --git a/tensorflow/python/framework/common_shapes_test.py b/tensorflow/python/framework/common_shapes_test.py
index dc99720e8a1e4899fef7b87b823716e8e8706a8c..62d9b568041a3a7a6a35c4ab7b8377c4bdbf7fdb 100644
--- a/tensorflow/python/framework/common_shapes_test.py
+++ b/tensorflow/python/framework/common_shapes_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
@@ -26,6 +28,38 @@ from tensorflow.python.platform import googletest
 
 class CommonShapesTest(test_util.TensorFlowTestCase):
 
+  # Asserts that we get the same result with numpy (for known shapes), and that
+  # the order of arguments does not matter (i.e., broadcasting is reflexive).
+  def _assert_incompatible_broadcast(self, shape1, shape2):
+    if shape1.dims is not None and shape2.dims is not None:
+      zeros1 = np.zeros(shape1.as_list())
+      zeros2 = np.zeros(shape2.as_list())
+      with self.assertRaises(ValueError):
+        np.broadcast(zeros1, zeros2)
+      with self.assertRaises(ValueError):
+        np.broadcast(zeros2, zeros1)
+    with self.assertRaises(ValueError):
+      common_shapes.broadcast_shape(shape1, shape2)
+    with self.assertRaises(ValueError):
+      common_shapes.broadcast_shape(shape2, shape1)
+
+  # Asserts that we get the same result with numpy (for known shapes), and that
+  # the order of arguments does not matter (i.e., broadcasting is reflexive).
+  def _assert_broadcast(self, expected, shape1, shape2):
+    if shape1.dims is not None and shape2.dims is not None:
+      expected_np = expected.as_list()
+      zeros1 = np.zeros(shape1.as_list())
+      zeros2 = np.zeros(shape2.as_list())
+      self.assertAllEqual(expected_np, np.broadcast(zeros1, zeros2).shape)
+      self.assertAllEqual(expected_np, np.broadcast(zeros2, zeros1).shape)
+      self.assertEqual(
+          expected, common_shapes.broadcast_shape(shape1, shape2))
+      self.assertEqual(
+          expected, common_shapes.broadcast_shape(shape2, shape1))
+    else:
+      self.assertEqual(expected, common_shapes.broadcast_shape(shape1, shape2))
+      self.assertEqual(expected, common_shapes.broadcast_shape(shape2, shape1))
+
   def testBroadcast_one_dimension(self):
     s1 = tensor_shape.vector(5)
     s2 = tensor_shape.vector(7)
@@ -35,29 +69,138 @@ class CommonShapesTest(test_util.TensorFlowTestCase):
     expanded_scalar = tensor_shape.TensorShape([1])
 
     # Tensors with same shape should have the same broadcast result.
-    self.assertEqual(s1, common_shapes.broadcast_shape(s1, s1))
-    self.assertEqual(s2, common_shapes.broadcast_shape(s2, s2))
-    self.assertEqual(unknown, common_shapes.broadcast_shape(unknown, unknown))
-    self.assertEqual(scalar, common_shapes.broadcast_shape(scalar, scalar))
-    self.assertEqual(expanded_scalar, common_shapes.broadcast_shape(
-        expanded_scalar, expanded_scalar))
+    for shape in (s1, s2, unknown, scalar, expanded_scalar):
+      self._assert_broadcast(expected=shape, shape1=shape, shape2=shape)
 
-    # [] acts like an identity.
-    self.assertEqual(s1, common_shapes.broadcast_shape(s1, scalar))
-    self.assertEqual(s2, common_shapes.broadcast_shape(s2, scalar))
+    # [] and [1] act like identity.
+    self._assert_broadcast(expected=s1, shape1=s1, shape2=scalar)
+    self._assert_broadcast(expected=s2, shape1=s2, shape2=scalar)
+    self._assert_broadcast(expected=s1, shape1=s1, shape2=expanded_scalar)
+    self._assert_broadcast(expected=s2, shape1=s2, shape2=expanded_scalar)
 
-    self.assertEqual(s1, common_shapes.broadcast_shape(s1, expanded_scalar))
-    self.assertEqual(s2, common_shapes.broadcast_shape(s2, expanded_scalar))
+    self._assert_broadcast(expected=unknown, shape1=s1, shape2=unknown)
+    self._assert_broadcast(expected=unknown, shape1=s2, shape2=unknown)
 
-    self.assertEqual(unknown, common_shapes.broadcast_shape(s1, unknown))
-    self.assertEqual(unknown, common_shapes.broadcast_shape(s2, unknown))
+    self._assert_broadcast(
+        expected=expanded_scalar, shape1=scalar, shape2=expanded_scalar)
 
-    self.assertEqual(expanded_scalar, common_shapes.broadcast_shape(
-        scalar, expanded_scalar))
+    self._assert_incompatible_broadcast(shape1=s1, shape2=s2)
 
-    with self.assertRaises(ValueError):
-      common_shapes.broadcast_shape(s1, s2)
-      common_shapes.broadcast_shape(s2, s1)
+  def testBroadcast_many_dimensions(self):
+    unknown = tensor_shape.unknown_shape()
+    shape_0 = tensor_shape.scalar()
+    shape_1 = tensor_shape.vector(1)
+    shape_4 = tensor_shape.vector(4)
+    shape_1x4 = tensor_shape.matrix(1, 4)
+    shape_4x1 = tensor_shape.matrix(4, 1)
+    shape_3x4 = tensor_shape.matrix(3, 4)
+    shape_4x3 = tensor_shape.matrix(4, 3)
+
+    # Tensors with same shape should have the same broadcast result.
+    for shape in (
+        shape_0, shape_1, shape_4, shape_1x4, shape_4x1, shape_3x4, shape_4x3):
+      self._assert_broadcast(expected=shape, shape1=shape, shape2=shape)
+
+    # [] and [1] act like identity.
+    for identity in (shape_0, shape_1):
+      for shape in (shape_4, shape_1x4, shape_4x1, shape_3x4, shape_4x3):
+        self._assert_broadcast(expected=shape, shape1=identity, shape2=shape)
+
+    # Unknown in, unknown out.
+    for shape in (shape_4, shape_1x4, shape_4x1, shape_3x4, shape_4x3):
+      self._assert_broadcast(expected=unknown, shape1=shape, shape2=unknown)
+
+    self._assert_broadcast(expected=shape_1x4, shape1=shape_4, shape2=shape_1x4)
+    shape_4x4 = tensor_shape.matrix(4, 4)
+    self._assert_broadcast(expected=shape_4x4, shape1=shape_4, shape2=shape_4x1)
+    self._assert_broadcast(expected=shape_3x4, shape1=shape_4, shape2=shape_3x4)
+    self._assert_incompatible_broadcast(shape1=shape_4, shape2=shape_4x3)
+    self._assert_broadcast(
+        expected=shape_4x4, shape1=shape_1x4, shape2=shape_4x1)
+    self._assert_broadcast(
+        expected=shape_3x4, shape1=shape_1x4, shape2=shape_3x4)
+    self._assert_incompatible_broadcast(shape1=shape_1x4, shape2=shape_4x3)
+    self._assert_incompatible_broadcast(shape1=shape_4x1, shape2=shape_3x4)
+    self._assert_broadcast(
+        expected=shape_4x3, shape1=shape_4x1, shape2=shape_4x3)
+    self._assert_incompatible_broadcast(shape1=shape_3x4, shape2=shape_4x3)
+
+  # Asserts that the order of arguments does not matter (i.e., broadcasting is
+  # reflexive).
+  def _assert_broadcast_with_unknown_dims(self, expected, shape1, shape2):
+    actual_dims = common_shapes.broadcast_shape(shape1, shape2).dims
+    reflexive_actual_dims = common_shapes.broadcast_shape(shape2, shape1).dims
+
+    if actual_dims is None:
+      self.assertIsNone(reflexive_actual_dims)
+    elif reflexive_actual_dims is None:
+      self.assertIsNone(actual_dims)
+    else:
+      self.assertEqual(len(actual_dims), len(reflexive_actual_dims))
+      for actual_dim, reflexive_actual_dim in zip(
+          actual_dims, reflexive_actual_dims):
+        self.assertEqual(actual_dim.value, reflexive_actual_dim.value)
+
+    expected_dims = expected.dims
+    if expected_dims is None:
+      self.assertIsNone(actual_dims)
+    elif actual_dims is None:
+      self.assertIsNone(expected_dims)
+    else:
+      self.assertEqual(len(expected_dims), len(actual_dims))
+      for expected_dim, actual_dim in zip(expected_dims, actual_dims):
+        self.assertEqual(expected_dim.value, actual_dim.value)
+
+  def testBroadcast_unknown_dims(self):
+    unknown = tensor_shape.unknown_shape()
+    shape_0 = tensor_shape.scalar()
+    shape_1 = tensor_shape.vector(1)
+    # pylint: disable=invalid-name
+    shape_U = tensor_shape.vector(None)
+    shape_1xU = tensor_shape.matrix(1, None)
+    shape_Ux1 = tensor_shape.matrix(None, 1)
+    shape_4xU = tensor_shape.matrix(4, None)
+    shape_Ux4 = tensor_shape.matrix(None, 4)
+    # pylint: enable=invalid-name
+
+    # Tensors with same shape should have the same broadcast result.
+    for shape in (shape_U, shape_1xU, shape_Ux1, shape_4xU, shape_Ux4):
+      self._assert_broadcast_with_unknown_dims(
+          expected=shape, shape1=shape, shape2=shape)
+
+    # [] and [1] act like identity.
+    for identity in (shape_0, shape_1):
+      for shape in (shape_U, shape_1xU, shape_Ux1, shape_4xU, shape_Ux4):
+        self._assert_broadcast_with_unknown_dims(
+            expected=shape, shape1=identity, shape2=shape)
+
+    # Unknown in, unknown out.
+    for shape in (shape_U, shape_1xU, shape_Ux1, shape_4xU, shape_Ux4):
+      self._assert_broadcast_with_unknown_dims(
+          expected=unknown, shape1=shape, shape2=unknown)
+
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_1xU, shape1=shape_U, shape2=shape_1xU)
+    shape_UxU = tensor_shape.matrix(None, None)  # pylint: disable=invalid-name
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_UxU, shape1=shape_U, shape2=shape_Ux1)
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_4xU, shape1=shape_U, shape2=shape_4xU)
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_Ux4, shape1=shape_U, shape2=shape_Ux4)
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_UxU, shape1=shape_1xU, shape2=shape_Ux1)
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_4xU, shape1=shape_1xU, shape2=shape_4xU)
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_Ux4, shape1=shape_1xU, shape2=shape_Ux4)
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_4xU, shape1=shape_Ux1, shape2=shape_4xU)
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_Ux4, shape1=shape_Ux1, shape2=shape_Ux4)
+    shape_4x4 = tensor_shape.matrix(4, 4)
+    self._assert_broadcast_with_unknown_dims(
+        expected=shape_4x4, shape1=shape_4xU, shape2=shape_Ux4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index d1485d419c3710ede9cc06ccf8a9e89c86b65dc7..49f5739194b241d5ef4cb78d87afd7b691b932e8 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -129,14 +129,24 @@ def _tensor_shape_tensor_conversion_function(s, dtype=None, name=None,
   if not s.is_fully_defined():
     raise ValueError(
         "Cannot convert a partially known TensorShape to a Tensor: %s" % s)
+  s_list = s.as_list()
+  int64_value = 0
+  for dim in s_list:
+    if dim >= 2**31:
+      int64_value = dim
+      break
+
   if dtype is not None:
     if dtype not in (dtypes.int32, dtypes.int64):
       raise TypeError("Cannot convert a TensorShape to dtype: %s" % dtype)
+    if dtype == dtypes.int32 and int64_value:
+      raise ValueError("Cannot convert a TensorShape to dtype int32; "
+                       "a dimension is too large (%s)" % int64_value)
   else:
-    dtype = dtypes.int32
+    dtype = dtypes.int64 if int64_value else dtypes.int32
   if name is None:
     name = "shape_as_tensor"
-  return constant(s.as_list(), dtype=dtype, name=name)
+  return constant(s_list, dtype=dtype, name=name)
 
 ops.register_tensor_conversion_function(
     tensor_shape.TensorShape, _tensor_shape_tensor_conversion_function, 100)
diff --git a/tensorflow/python/framework/contrib_test.py b/tensorflow/python/framework/contrib_test.py
index 8ca0c69d775442902c29e48f6906398f738026cf..f2eaf7c2eea86792bb604fa9e5799f6c479caf66 100644
--- a/tensorflow/python/framework/contrib_test.py
+++ b/tensorflow/python/framework/contrib_test.py
@@ -18,9 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
-
 from tensorflow.python.platform import test
+from tensorflow.python.util import tf_inspect
 
 
 class ContribTest(test.TestCase):
@@ -29,17 +28,17 @@ class ContribTest(test.TestCase):
     # pylint: disable=g-import-not-at-top
     import tensorflow as tf
     _ = tf.contrib.layers  # `tf.contrib` is loaded lazily on first use.
-    assert inspect.ismodule(tf.contrib)
+    assert tf_inspect.ismodule(tf.contrib)
 
   def testLayers(self):
     # pylint: disable=g-import-not-at-top
     import tensorflow as tf
-    assert inspect.ismodule(tf.contrib.layers)
+    assert tf_inspect.ismodule(tf.contrib.layers)
 
   def testLinearOptimizer(self):
     # pylint: disable=g-import-not-at-top
     import tensorflow as tf
-    assert inspect.ismodule(tf.contrib.linear_optimizer)
+    assert tf_inspect.ismodule(tf.contrib.linear_optimizer)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/framework/docs.py b/tensorflow/python/framework/docs.py
deleted file mode 100644
index 4ae0046117b846a122f320003cd5e7e5f62890de..0000000000000000000000000000000000000000
--- a/tensorflow/python/framework/docs.py
+++ /dev/null
@@ -1,647 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Updates generated docs from Python doc comments.
-
-Updates the documentation files.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import inspect
-import os
-import re
-
-
-_arg_re = re.compile(" *([*]{0,2}[a-zA-Z][a-zA-Z0-9_]*):")
-_section_re = re.compile("([A-Z][a-zA-Z ]*):$")
-_always_drop_symbol_re = re.compile("_[_a-zA-Z0-9]")
-_anchor_re = re.compile(r"^[\w.]+$")
-_member_mark = "@@"
-_indiv_dir = "functions_and_classes"
-_num_subdirs = 10
-_subdir_prefix = "shard"
-
-
-class Document(object):
-  """Base class for an automatically generated document."""
-
-  def write_markdown_to_file(self, f):
-    """Writes a Markdown-formatted version of this document to file `f`.
-
-    Args:
-      f: The output file.
-    """
-    raise NotImplementedError("Document.WriteToFile")
-
-
-class Index(Document):
-  """An automatically generated index for a collection of documents."""
-
-  def __init__(self, module_to_name, members, filename_to_library_map,
-               path_prefix):
-    """Creates a new Index.
-
-    Args:
-      module_to_name: Dictionary mapping modules to short names.
-      members: Dictionary mapping member name to (fullname, member).
-      filename_to_library_map: A list of (filename, Library) pairs. The order
-        corresponds to the order in which the libraries appear in the index.
-      path_prefix: Prefix to add to links in the index.
-    """
-    self._module_to_name = module_to_name
-    self._members = members
-    self._filename_to_library_map = filename_to_library_map
-    self._path_prefix = path_prefix
-
-  def write_markdown_to_file(self, f):
-    """Writes this index to file `f`.
-
-    The output is formatted as an unordered list. Each list element
-    contains the title of the library, followed by a list of symbols
-    in that library hyperlinked to the corresponding anchor in that
-    library.
-
-    Args:
-      f: The output file.
-    """
-    print("<!-- This file is machine generated: DO NOT EDIT! -->", file=f)
-    print("", file=f)
-    print("# TensorFlow Python reference documentation", file=f)
-    print("", file=f)
-    fullname_f = lambda name: self._members[name][0]
-    anchor_f = lambda name: get_anchor(self._module_to_name, fullname_f(name))
-
-    for filename, library in self._filename_to_library_map:
-      sorted_names = sorted(library.mentioned, key=lambda x: (str.lower(x), x))
-      member_names = [n for n in sorted_names if n in self._members]
-      # TODO(wicke): This is a hack that should be removed as soon as the
-      # website code allows it.
-      full_filename = self._path_prefix + filename
-      links = ["[`%s`](%s#%s)" % (name, full_filename, anchor_f(name))
-               for name in member_names]
-      if links:
-        print("* **[%s](%s)**:" % (library.title, full_filename), file=f)
-        for link in links:
-          print("  * %s" % link, file=f)
-        print("", file=f)
-
-
-def collect_members(module_to_name, exclude=()):
-  """Collect all symbols from a list of modules.
-
-  Args:
-    module_to_name: Dictionary mapping modules to short names.
-    exclude: Set of fully qualified names to exclude.
-
-  Returns:
-    Dictionary mapping name to (fullname, member) pairs.
-
-  Raises:
-    RuntimeError: if we can not resolve a name collision.
-  """
-  members = {}
-  for module, module_name in module_to_name.items():
-    all_names = getattr(module, "__all__", None)
-    for name, member in inspect.getmembers(module):
-      if ((inspect.isfunction(member)
-           or inspect.isclass(member)
-           or isinstance(member, functools.partial))
-          and not _always_drop_symbol_re.match(name) and
-          (all_names is None or name in all_names)):
-        fullname = "%s.%s" % (module_name, name)
-        if fullname in exclude:
-          continue
-        if name in members:
-          other_fullname, other_member = members[name]
-          if member is not other_member:
-            raise RuntimeError("Short name collision between %s and %s" %
-                               (fullname, other_fullname))
-          if len(fullname) == len(other_fullname):
-            raise RuntimeError("Can't decide whether to use %s or %s for %s: "
-                               "both full names have length %d" %
-                               (fullname, other_fullname, name, len(fullname)))
-          if len(fullname) > len(other_fullname):
-            continue  # Use the shorter full name
-        members[name] = fullname, member
-  return members
-
-
-def get_anchor(module_to_name, fullname):
-  """Turn a full member name into an anchor.
-
-  Args:
-    module_to_name: Dictionary mapping modules to short names.
-    fullname: Fully qualified name of symbol.
-
-  Returns:
-    HTML anchor string.  The longest module name prefix of fullname is
-    removed to make the anchor.
-
-  Raises:
-    ValueError: If fullname uses characters invalid in an anchor.
-  """
-  if not _anchor_re.match(fullname):
-    raise ValueError("'%s' is not a valid anchor" % fullname)
-  anchor = fullname
-  for module_name in module_to_name.values():
-    if fullname.startswith(module_name + "."):
-      rest = fullname[len(module_name)+1:]
-      # Use this prefix iff it is longer than any found before
-      if len(anchor) > len(rest):
-        anchor = rest
-  return anchor
-
-
-def _stable_hash(s):
-  """A simple string hash that won't change from run to run."""
-  ret = 0
-  for c in s:
-    ret = ret * 97 + ord(c)
-  return ret
-
-
-class Library(Document):
-  """An automatically generated document for a set of functions and classes."""
-
-  def __init__(self,
-               title,
-               module,
-               module_to_name,
-               members,
-               documented,
-               exclude_symbols=(),
-               prefix=None):
-    """Creates a new Library.
-
-    Args:
-      title: A human-readable title for the library.
-      module: Module to pull high level docstring from (for table of contents,
-        list of Ops to document, etc.).
-      module_to_name: Dictionary mapping modules to short names.
-      members: Dictionary mapping member name to (fullname, member).
-      documented: Set of documented names to update.
-      exclude_symbols: A list of specific symbols to exclude.
-      prefix: A string to include at the beginning of the page.
-    """
-    self._title = title
-    self._module = module
-    self._module_to_name = module_to_name
-    self._members = dict(members)  # Copy since we mutate it below
-    self._exclude_symbols = frozenset(exclude_symbols)
-    documented.update(exclude_symbols)
-    self._documented = documented
-    self._mentioned = set()
-    self._prefix = prefix or ""
-
-  @property
-  def title(self):
-    """The human-readable title for this library."""
-    return self._title
-
-  @property
-  def mentioned(self):
-    """Set of names mentioned in this library."""
-    return self._mentioned
-
-  @property
-  def exclude_symbols(self):
-    """Set of excluded symbols."""
-    return self._exclude_symbols
-
-  def _should_include_member(self, name):
-    """Returns True if this member should be included in the document."""
-    # __x__ should be documented always
-    name_is_operator = name.startswith("__") and name.endswith("__")
-    name_is_private = name.startswith("_") and not name_is_operator
-    name_is_excluded = name in self._exclude_symbols
-    return not (name_is_private or name_is_excluded)
-
-  def get_imported_modules(self, module):
-    """Returns the list of modules imported from `module`."""
-    for name, member in inspect.getmembers(module):
-      if inspect.ismodule(member):
-        yield name, member
-
-  def get_class_members(self, cls_name, cls):
-    """Returns the list of class members to document in `cls`.
-
-    This function filters the class member to ONLY return those
-    defined by the class.  It drops the inherited ones.
-
-    Args:
-      cls_name: Qualified name of `cls`.
-      cls: An inspect object of type 'class'.
-
-    Yields:
-      name, member tuples.
-    """
-    for name, member in inspect.getmembers(cls):
-      # Only show methods and properties presently.  In Python 3,
-      # methods register as isfunction.
-      is_method = (inspect.ismethod(member) or inspect.isfunction(member)
-                   or isinstance(member, functools.partial))
-      if not (is_method or isinstance(member, property)):
-        continue
-      if self._should_include_member(name):
-        yield name, ("%s.%s" % (cls_name, name), member)
-
-  def shard_dir(self, name):
-    """Returns the path of the doc subdirectory for member `name`.
-
-    When generating individual files for each function and class, we shard
-    the files across several directories to avoid hitting the limit for
-    files per directory. This function determines the subdirectory for
-    a member based on a stable hash of its name.
-
-    Args:
-      name: string. The name of a function or class.
-
-    Returns:
-      The path to a subdirectory of the api docs directory.
-    """
-    index = _stable_hash(name) % _num_subdirs
-    return os.path.join(self.functions_and_classes_dir,
-                        _subdir_prefix + str(index))
-
-  def set_functions_and_classes_dir(self, dirname):
-    """Sets the name of the directory for function and class markdown files.
-
-    Args:
-      dirname: string. The name of the directory in which to store function
-        and class markdown files.
-    """
-    self.functions_and_classes_dir = dirname
-
-  def _generate_signature_for_function(self, func):
-    """Given a function, returns a string representing its args."""
-    args_list = []
-    if isinstance(func, functools.partial):
-      argspec = inspect.getargspec(func.func)
-      # Remove the args from the original function that have been used up.
-      first_default_arg = (
-          len(argspec.args or []) - len(argspec.defaults or []))
-      partial_args = len(func.args)
-      if argspec.args:
-        argspec_args = list(argspec.args[partial_args:])
-      else:
-        argspec_args = []
-      if argspec.defaults:
-        argspec_defaults = list(argspec.defaults[
-            max(0, partial_args-first_default_arg):])
-      else:
-        argspec_defaults = []
-      first_default_arg = max(0, first_default_arg - partial_args)
-      for kwarg in func.keywords:
-        if kwarg in argspec_args:
-          i = argspec_args.index(kwarg)
-          argspec_args.pop(i)
-          if i >= first_default_arg:
-            argspec_defaults.pop(i-first_default_arg)
-          else:
-            first_default_arg -= 1
-      argspec_varargs = None
-      argspec_keywords = None
-
-    else:
-      argspec = inspect.getargspec(func)
-      argspec_args = argspec.args
-      argspec_defaults = argspec.defaults
-      argspec_varargs = argspec.varargs
-      argspec_keywords = argspec.keywords
-
-    first_arg_with_default = (
-        len(argspec_args or []) - len(argspec_defaults or []))
-    for arg in argspec_args[:first_arg_with_default]:
-      if arg == "self":
-        # Python documentation typically skips `self` when printing method
-        # signatures.
-        continue
-      args_list.append(arg)
-
-    # TODO(mrry): This is a workaround for documenting signature of
-    # functions that have the @contextlib.contextmanager decorator.
-    # TODO(aselle): This workaround is brittle on TestCase.__call__
-    #  so we need to wrap this in a try/catch
-    # We should do something better.
-    if argspec_varargs == "args" and argspec_keywords == "kwds":
-      try:
-        original_func = func.__closure__[0].cell_contents
-        return self._generate_signature_for_function(original_func)
-      except TypeError:
-        pass
-
-    if argspec_defaults:
-      for arg, default in zip(
-          argspec_args[first_arg_with_default:], argspec_defaults):
-        if callable(default):
-          if hasattr(default, "__name__"):
-            args_list.append("%s=%s" % (arg, default.__name__))
-          else:
-            # A callable may be a class instance.
-            # TODO(fchollet): handle case with non-default constructor
-            # arguments (currently not present in the TF codebase).
-            args_list.append("%s=%s()" % (arg, default.__class__.__name__))
-        else:
-          args_list.append("%s=%r" % (arg, default))
-    if argspec_varargs:
-      args_list.append("*" + argspec_varargs)
-    if argspec_keywords:
-      args_list.append("**" + argspec_keywords)
-    return "(" + ", ".join(args_list) + ")"
-
-  def _remove_docstring_indent(self, docstring):
-    """Remove indenting.
-
-    We follow Python's convention and remove the minimum indent of the lines
-    after the first, see:
-    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
-    preserving relative indentation.
-
-    Args:
-      docstring: A docstring.
-
-    Returns:
-      A list of strings, one per line, with the minimum indent stripped.
-    """
-    docstring = docstring or ""
-    lines = docstring.strip().split("\n")
-
-    min_indent = len(docstring)
-    for l in lines[1:]:
-      l = l.rstrip()
-      if l:
-        i = 0
-        while i < len(l) and l[i] == " ":
-          i += 1
-        if i < min_indent: min_indent = i
-    for i in range(1, len(lines)):
-      l = lines[i].rstrip()
-      if len(l) >= min_indent:
-        l = l[min_indent:]
-      lines[i] = l
-    return lines
-
-  def _print_formatted_docstring(self, docstring, f):
-    """Formats the given `docstring` as Markdown and prints it to `f`."""
-    lines = self._remove_docstring_indent(docstring)
-
-    # Output the lines, identifying "Args" and other section blocks.
-    i = 0
-
-    def _at_start_of_section():
-      """Returns the header if lines[i] is at start of a docstring section."""
-      l = lines[i]
-      match = _section_re.match(l)
-      if match and i + 1 < len(
-          lines) and lines[i + 1].startswith(" "):
-        return match.group(1)
-      else:
-        return None
-
-    while i < len(lines):
-      l = lines[i]
-
-      section_header = _at_start_of_section()
-      if section_header:
-        if i == 0 or lines[i-1]:
-          print("", file=f)
-        # Use at least H4 to keep these out of the TOC.
-        print("##### " + section_header + ":", file=f)
-        print("", file=f)
-        i += 1
-        outputting_list = False
-        while i < len(lines):
-          l = lines[i]
-          # A new section header terminates the section.
-          if _at_start_of_section():
-            break
-          match = _arg_re.match(l)
-          if match:
-            if not outputting_list:
-              # We need to start a list. In Markdown, a blank line needs to
-              # precede a list.
-              print("", file=f)
-              outputting_list = True
-            suffix = l[len(match.group()):].lstrip()
-            print("*  <b>`" + match.group(1) + "`</b>: " + suffix, file=f)
-          else:
-            # For lines that don't start with _arg_re, continue the list if it
-            # has enough indentation.
-            outputting_list &= l.startswith("   ")
-            print(l, file=f)
-          i += 1
-      else:
-        print(l, file=f)
-        i += 1
-
-  def _print_function(self, f, prefix, fullname, func):
-    """Prints the given function to `f`."""
-    heading = prefix + " `" + fullname
-    if not isinstance(func, property):
-      heading += self._generate_signature_for_function(func)
-    heading += "` {#%s}" % get_anchor(self._module_to_name, fullname)
-    print(heading, file=f)
-    print("", file=f)
-    self._print_formatted_docstring(inspect.getdoc(func), f)
-    print("", file=f)
-
-  def _write_member_markdown_to_file(self, f, prefix, name, member):
-    """Print `member` to `f`."""
-    if (inspect.isfunction(member) or inspect.ismethod(member)
-        or (isinstance(member, functools.partial)
-            and inspect.isfunction(member.func))
-        or isinstance(member, property)):
-      print("- - -", file=f)
-      print("", file=f)
-      self._print_function(f, prefix, name, member)
-      print("", file=f)
-
-      # Write an individual file for each function.
-      if inspect.isfunction(member):
-        indivf = open(
-            os.path.join(self.shard_dir(name), name + ".md"), "w+")
-        self._print_function(indivf, prefix, name, member)
-    elif (inspect.isclass(member)
-          or (isinstance(member, functools.partial)
-              and inspect.isclass(member.func))):
-      print("- - -", file=f)
-      print("", file=f)
-      print("%s `class %s` {#%s}" % (prefix, name,
-                                     get_anchor(self._module_to_name, name)),
-            file=f)
-      print("", file=f)
-      self._write_class_markdown_to_file(f, name, member)
-      print("", file=f)
-
-      # Write an individual file for each class.
-      indivf = open(
-          os.path.join(self.shard_dir(name), name + ".md"), "w+")
-      self._write_class_markdown_to_file(indivf, name, member)
-    else:
-      raise RuntimeError("Member %s has unknown type %s" % (name, type(member)))
-
-  def _write_docstring_markdown_to_file(self, f, prefix, docstring, members,
-                                        imports):
-    for l in self._remove_docstring_indent(docstring):
-      if l.startswith(_member_mark):
-        name = l[len(_member_mark):].strip(" \t")
-        if name in members:
-          self._documented.add(name)
-          self._mentioned.add(name)
-          self._write_member_markdown_to_file(f, prefix, *members[name])
-          del members[name]
-        elif name in imports:
-          self._write_module_markdown_to_file(f, imports[name])
-        else:
-          raise ValueError("%s: unknown member `%s`, markdown=`%s`." % (
-              self._title, name, l))
-      else:
-        print(l, file=f)
-
-  def _write_class_markdown_to_file(self, f, name, cls):
-    """Write the class doc to `f`.
-
-    Args:
-      f: File to write to.
-      name: name to use.
-      cls: class object.
-    """
-    # Build the list of class methods to document.
-    methods = dict(self.get_class_members(name, cls))
-    # Used later to check if any methods were called out in the class
-    # docstring.
-    num_methods = len(methods)
-    try:
-      self._write_docstring_markdown_to_file(f, "####", inspect.getdoc(cls),
-                                             methods, {})
-    except ValueError as e:
-      raise ValueError(str(e) + " in class `%s`" % cls.__name__)
-
-    # If some methods were not described, describe them now if they are
-    # defined by the class itself (not inherited).  If NO methods were
-    # described, describe all methods.
-    #
-    # TODO(touts): when all methods have been categorized make it an error
-    # if some methods are not categorized.
-    any_method_called_out = (len(methods) != num_methods)
-    if any_method_called_out:
-      other_methods = {n: m for n, m in methods.items() if n in cls.__dict__}
-      if other_methods:
-        print("\n#### Other Methods", file=f)
-    else:
-      other_methods = methods
-    for name in sorted(other_methods):
-      self._write_member_markdown_to_file(f, "####", *other_methods[name])
-
-  def _write_module_markdown_to_file(self, f, module):
-    imports = dict(self.get_imported_modules(module))
-    self._write_docstring_markdown_to_file(f, "###", inspect.getdoc(module),
-                                           self._members, imports)
-
-  def write_markdown_to_file(self, f):
-    """Prints this library to file `f`.
-
-    Args:
-      f: File to write to.
-
-    Returns:
-      Dictionary of documented members.
-    """
-    print("<!-- This file is machine generated: DO NOT EDIT! -->", file=f)
-    print("", file=f)
-    # TODO(touts): Do not insert these.  Let the doc writer put them in
-    # the module docstring explicitly.
-    print("#", self._title, file=f)
-    if self._prefix:
-      print(self._prefix, file=f)
-    print("[TOC]", file=f)
-    print("", file=f)
-    if self._module is not None:
-      self._write_module_markdown_to_file(f, self._module)
-
-  def write_other_members(self, f, catch_all=False):
-    """Writes the leftover members to `f`.
-
-    Args:
-      f: File to write to.
-      catch_all: If true, document all missing symbols from any module.
-        Otherwise, document missing symbols from just this module.
-    """
-    if catch_all:
-      names = self._members.items()
-    else:
-      names = inspect.getmembers(self._module)
-      all_names = getattr(self._module, "__all__", None)
-      if all_names is not None:
-        names = [(n, m) for n, m in names if n in all_names]
-    leftovers = []
-    for name, _ in names:
-      if name in self._members and name not in self._documented:
-        leftovers.append(name)
-    if leftovers:
-      print("%s: undocumented members: %d" % (self._title, len(leftovers)))
-      print("\n## Other Functions and Classes", file=f)
-      for name in sorted(leftovers):
-        print("  %s" % name)
-        self._documented.add(name)
-        self._mentioned.add(name)
-        self._write_member_markdown_to_file(f, "###", *self._members[name])
-
-  def assert_no_leftovers(self):
-    """Generate an error if there are leftover members."""
-    leftovers = []
-    for name in self._members:
-      if name in self._members and name not in self._documented:
-        leftovers.append(name)
-    if leftovers:
-      raise RuntimeError("%s: undocumented members: %s" %
-                         (self._title, ", ".join(leftovers)))
-
-
-def write_libraries(output_dir, libraries):
-  """Write a list of libraries to disk.
-
-  Args:
-    output_dir: Output directory.
-    libraries: List of (filename, library) pairs.
-  """
-  files = [open(os.path.join(output_dir, k), "w") for k, _ in libraries]
-
-  # Set the directory in which to save individual class and function md files,
-  # creating it if it doesn't exist. Create subdirectories to avoid hitting
-  # the limit for number of files in a directory.
-  indiv_dir = os.path.join(output_dir, _indiv_dir)
-  if not os.path.exists(indiv_dir):
-    os.makedirs(indiv_dir)
-
-  for i in range(0, _num_subdirs):
-    subdir = os.path.join(indiv_dir, _subdir_prefix + str(i))
-    if not os.path.exists(subdir):
-      os.makedirs(subdir)
-
-  # Document mentioned symbols for all libraries
-  for f, (_, v) in zip(files, libraries):
-    v.set_functions_and_classes_dir(indiv_dir)
-    v.write_markdown_to_file(f)
-  # Document symbols that no library mentioned.  We do this after writing
-  # out all libraries so that earlier libraries know what later libraries
-  # documented.
-  for f, (_, v) in zip(files, libraries):
-    v.write_other_members(f)
-    f.close()
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index d373bac47a2a44e64ce989c7d5150cd42fea219a..3e6c04982b4b1c1ca219cfd1bc1a1954e2b520a1 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -270,6 +270,9 @@ class DType(object):
     """Returns the string name for this `DType`."""
     return _TYPE_TO_STRING[self._type_enum]
 
+  def __int__(self):
+    return self._type_enum
+
   def __str__(self):
     return "<dtype: %r>" % self.name
 
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index fac2cf4def9111caab07a375fde24a45e2c03a88..5bb60763b6e30d23c622b1a281f62e3577c77692 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tensorflow.python.framework.importer."""
+"""Tests for tensorflow.python.framework.dtypes."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -45,8 +45,8 @@ class TypesTest(test_util.TensorFlowTestCase):
     for datatype_enum in types_pb2.DataType.values():
       if datatype_enum == types_pb2.DT_INVALID:
         continue
-      self.assertEqual(datatype_enum,
-                       dtypes.as_dtype(datatype_enum).as_datatype_enum)
+      dt = dtypes.as_dtype(datatype_enum)
+      self.assertEqual(datatype_enum, dt.as_datatype_enum)
 
   def testAllTypesConvertibleToNumpyDtype(self):
     for datatype_enum in types_pb2.DataType.values():
diff --git a/tensorflow/python/framework/file_system_test.py b/tensorflow/python/framework/file_system_test.py
index 26b2a5b9b94b24da144587ac7246e76a657d466b..5eb59141a2aeaf0c3b552b525fdb738f0838ab97 100644
--- a/tensorflow/python/framework/file_system_test.py
+++ b/tensorflow/python/framework/file_system_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-"""Tests for functions."""
+"""Tests for file_system."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 3c663c3a9b2705a82903bcb8f3a0906c6104177d..08faa3a6d2eb753b2c5659dad767e3d8d5fdbbde 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -23,7 +23,6 @@ from __future__ import print_function
 
 import collections
 import hashlib
-import inspect
 import re
 
 from tensorflow.core.framework import attr_value_pb2
@@ -36,6 +35,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 
 def _make_argname_from_tensor_name(name):
@@ -112,18 +113,19 @@ def _add_op_node(op, func, input_dict):
   node_def = func.node_def[-1]
   for i in range(len(node_def.input)):
     if not node_def.input[i].startswith("^"):
-      assert node_def.input[i] in input_dict, (
-          "%s missing from %s" % (node_def.input[i], input_dict.items()))
+      assert node_def.input[i] in input_dict, ("%s missing from %s" %
+                                               (node_def.input[i],
+                                                input_dict.items()))
       node_def.input[i] = input_dict[node_def.input[i]]
 
 
-def _graph_to_function_def(graph, inputs, outputs, out_names=None):
+def _graph_to_function_def(graph, operations, inputs, outputs, out_names=None):
   """Returns `graph` as a `FunctionDef` protocol buffer.
 
   This method creates a [`FunctionDef`](
   https://www.tensorflow.org/code/tensorflow/core/framework/function.proto)
-  protocol buffer that contains all the ops present in the graph.  The
-  graph effectively becomes the body of the function.
+  protocol buffer that contains all the ops in `operations`.  The
+  operations become the body of the function.
 
   The arguments `inputs` and `outputs` will be listed as the inputs
   and outputs tensors of the function.  They must be lists of
@@ -131,6 +133,8 @@ def _graph_to_function_def(graph, inputs, outputs, out_names=None):
 
   Args:
     graph: Graph.
+    operations: the operations to put in the function. Must be a subset of
+     the operations in the graph.
     inputs: List of tensors. Inputs to the function.
     outputs: List of tensors. Outputs of the function.
     out_names: Optional list of string names for the outputs.
@@ -144,12 +148,12 @@ def _graph_to_function_def(graph, inputs, outputs, out_names=None):
   func = function_pb2.FunctionDef()
   func.signature.name = "_"
   used_names = set()
-  func.signature.input_arg.extend([_tensor_to_argdef(i, used_names=used_names)
-                                   for i in inputs])
+  func.signature.input_arg.extend(
+      [_tensor_to_argdef(i, used_names=used_names) for i in inputs])
   if out_names is None:
     used_names = set()
-    func.signature.output_arg.extend([
-        _tensor_to_argdef(o, used_names=used_names) for o in outputs])
+    func.signature.output_arg.extend(
+        [_tensor_to_argdef(o, used_names=used_names) for o in outputs])
   elif len(outputs) != len(out_names):
     raise ValueError(
         "Length of out_names (%d) does not match number of outputs (%d): %s" %
@@ -158,12 +162,12 @@ def _graph_to_function_def(graph, inputs, outputs, out_names=None):
     raise ValueError(
         "Must not have duplicates in out_names: %s" % ", ".join(out_names))
   else:
-    func.signature.output_arg.extend([
-        _tensor_to_argdef(o, name=n) for o, n in zip(outputs, out_names)])
+    func.signature.output_arg.extend(
+        [_tensor_to_argdef(o, name=n) for o, n in zip(outputs, out_names)])
   func_arg_placeholders = set([i.name for i in inputs])
   input_dict = _create_input_dict(graph, func_arg_placeholders)
 
-  for op in graph.get_operations():
+  for op in operations:
     if _is_in_placeholders(op, func_arg_placeholders):
       continue
     _add_op_node(op, func, input_dict)
@@ -259,10 +263,11 @@ def _call(sig, *inputs, **kwargs):
 
 
 def _get_func_name(func):
+  _, func = tf_decorator.unwrap(func)
   if callable(func):
-    if inspect.isfunction(func):
+    if tf_inspect.isfunction(func):
       return func.__name__
-    elif inspect.ismethod(func):
+    elif tf_inspect.ismethod(func):
       return "%s.%s" % (func.__self__.__name__, func.__name__)
     else:  # Probably a class instance with __call__
       return type(func)
@@ -276,7 +281,7 @@ class _FuncGraph(ops.Graph):
   _FuncGraph overrides ops.Graph's create_op() so that we can keep
   track of every inputs into every op created inside the function.  If
   any input is from other graphs, we keep track of it in self.capture
-  and substitue the input with a place holder.
+  and substitute the input with a place holder.
 
   Each captured input's corresponding place holder is converted into a
   function argument and the caller passes in the captured tensor.
@@ -293,16 +298,18 @@ class _FuncGraph(ops.Graph):
     self.extra_args = []
     self.extra_vars = []
 
-  def getvar(self,
-             getter,
-             name,
-             shape=None,
-             dtype=None,
-             initializer=None,
-             trainable=True,
-             collections=None,  # pylint: disable=redefined-outer-name
-             use_resource=None,
-             **kwargs):
+  def getvar(
+      self,
+      getter,
+      name,
+      shape=None,
+      dtype=None,
+      initializer=None,
+      reuse=None,
+      trainable=True,
+      collections=None,  # pylint: disable=redefined-outer-name
+      use_resource=None,
+      **kwargs):
     """A custom variable getter."""
     # Here, we switch the default graph to the outer graph and ask the
     # variable scope in which the function is defined to give us the
@@ -319,6 +326,7 @@ class _FuncGraph(ops.Graph):
           shape=shape,
           dtype=dtype,
           initializer=initializer,
+          reuse=reuse,
           trainable=trainable,
           collections=collections,
           use_resource=use_resource)
@@ -534,20 +542,23 @@ class _DefinedFunction(object):
 
     # Build the FunctionDef
     self._definition = _graph_to_function_def(
-        temp_graph, inputs, outputs, out_names=self._out_names)
+        temp_graph,
+        temp_graph.get_operations(),
+        inputs,
+        outputs,
+        out_names=self._out_names)
 
     # Extra kwargs are treated as attrs on the function def.
     sig_pre_func_name = self._func_name or _get_func_name(self._func)
-    kwargs_attr = _parse_kwargs_as_attrs(
-        sig_pre_func_name, **self._extra_kwargs)
+    kwargs_attr = _parse_kwargs_as_attrs(sig_pre_func_name,
+                                         **self._extra_kwargs)
     for k in kwargs_attr:
       self._definition.attr[k].CopyFrom(kwargs_attr[k])
 
     # Hash the definition and its dependencies.
     self._hash_str = self._create_hash_str(
         self._definition.signature.input_arg,
-        self._definition.signature.output_arg,
-        self._definition.node_def)
+        self._definition.signature.output_arg, self._definition.node_def)
 
     # Finally, we decide the function name to use.  If not specified,
     # make up something which is almost certainly unique (but deterministic).
@@ -654,8 +665,8 @@ def _from_definition(fdef, grad_func=None):
   # have access to such a callable here).
   func = None
   argnames = [arg.name for arg in fdef.signature.input_arg]
-  input_types = tuple(dtypes.as_dtype(arg.type)
-                      for arg in fdef.signature.input_arg)
+  input_types = tuple(
+      dtypes.as_dtype(arg.type) for arg in fdef.signature.input_arg)
   func_name = fdef.signature.name
   # Note: FunctionDefs do not include python gradient functions, so if the
   # original _DefinedFunction included one it will not be reflected here.
@@ -671,8 +682,7 @@ def _from_definition(fdef, grad_func=None):
   result._extra_inputs = []
   result._hash_str = result._create_hash_str(
       result._definition.signature.input_arg,
-      result._definition.signature.output_arg,
-      result._definition.node_def)
+      result._definition.signature.output_arg, result._definition.node_def)
   # pylint: enable=protected-access
   return result
 
@@ -692,7 +702,8 @@ def _from_library(lib):
   Raises:
     ValueError: `lib` is invalid
   """
-  if not lib.function and not lib.gradient: return []
+  if not lib.function and not lib.gradient:
+    return []
 
   # function name -> FunctionDef proto
   funcs = {fdef.signature.name: fdef for fdef in lib.function}
@@ -716,8 +727,9 @@ def _from_library(lib):
     grad_to_funcs[gdef.gradient_func].append(gdef.function_name)
 
   # Start with functions without gradients
-  ready = [fdef for fdef in lib.function
-           if func_to_grad[fdef.signature.name] is None]
+  ready = [
+      fdef for fdef in lib.function if func_to_grad[fdef.signature.name] is None
+  ]
   if not ready:
     raise ValueError("FunctionDefLibrary contains cyclic gradient functions!\n"
                      + str(lib))
@@ -729,7 +741,8 @@ def _from_library(lib):
     name = fdef.signature.name
 
     grad = initialized.get(func_to_grad[name])
-    if func_to_grad[name]: assert grad
+    if func_to_grad[name]:
+      assert grad
     defined_func = _from_definition(fdef, grad_func=grad)
     initialized[name] = defined_func
 
@@ -831,10 +844,15 @@ class _OverloadedFunction(object):
       name = self._func_name
       if name is not None:
         name = "_".join([name, key])
-      defined = _DefinedFunction(self._func, self._argnames, input_types, name,
-                                 None, self._python_grad_func,
-                                 out_names=self._out_names,
-                                 **self._extra_kwargs)
+      defined = _DefinedFunction(
+          self._func,
+          self._argnames,
+          input_types,
+          name,
+          None,
+          self._python_grad_func,
+          out_names=self._out_names,
+          **self._extra_kwargs)
       _ = defined.name  # Fully instantiate the function definition.
       if self._grad_func:
         # If _grad_func is given, it is another
@@ -845,8 +863,8 @@ class _OverloadedFunction(object):
             for _ in defined.definition.signature.output_arg
         ]
         # pylint: disable=protected-access
-        defined._grad_func = self._grad_func.instantiate(input_types +
-                                                         output_types)
+        defined._grad_func = self._grad_func.instantiate(
+            input_types + output_types)
         # pylint: enable=protected-access
       self._overload[key] = defined
     return defined
@@ -886,6 +904,11 @@ class Defun(object):
   default graph. Because the addition of the function into the graph
   is deferred, the decorator can be used anywhere in the program.
 
+  Any variables created inside of the function are hoisted into the outer graph.
+  Note that the variables are created in the variable scope that was active
+  during the first call to the function. Subsequent function calls will refer to
+  the same set of variables.
+
   Definitions of functions are frozen in a graph as soon as the graph is used to
   create a session. Therefore, nodes using the function must be created in the
   graph before the corresponding session is created.
@@ -948,7 +971,7 @@ class Defun(object):
       raise ValueError("func %s must be callable" % func)
 
     # Func should not use kwargs and defaults.
-    argspec = inspect.getargspec(func)
+    argspec = tf_inspect.getargspec(func)
     if argspec.keywords or argspec.defaults:
       raise ValueError("Functions with argument defaults or keyword "
                        "arguments are not supported.")
@@ -959,7 +982,7 @@ class Defun(object):
     if argspec.varargs:
       max_args = 1000000
     argnames = argspec.args
-    if inspect.ismethod(func):
+    if tf_inspect.ismethod(func):
       # 1st argument is the "class" type.
       min_args -= 1
       argnames = argnames[1:]
@@ -972,22 +995,36 @@ class Defun(object):
         raise ValueError(
             "The function has fewer arguments than the number of specified "
             "input types.")
-      return _DefinedFunction(func, argnames, self._input_types,
-                              self._func_name, self._grad_func,
-                              self._python_grad_func,
-                              out_names=self._out_names, **self._extra_kwargs)
+      return _DefinedFunction(
+          func,
+          argnames,
+          self._input_types,
+          self._func_name,
+          self._grad_func,
+          self._python_grad_func,
+          out_names=self._out_names,
+          **self._extra_kwargs)
 
     # 'func' expects no arguments and input types is an empty list.
     if min_args == 0 and max_args == 0:
-      return _DefinedFunction(func, [], [], self._func_name, self._grad_func,
-                              self._python_grad_func,
-                              out_names=self._out_names, **self._extra_kwargs)
+      return _DefinedFunction(
+          func, [], [],
+          self._func_name,
+          self._grad_func,
+          self._python_grad_func,
+          out_names=self._out_names,
+          **self._extra_kwargs)
 
     # Input types are unknown. It's an overloaded function and hence
     # its definition needs to be deferred until it's called.
-    return _OverloadedFunction(func, argnames, self._func_name, self._grad_func,
-                               self._python_grad_func,
-                               out_names=self._out_names, **self._extra_kwargs)
+    return _OverloadedFunction(
+        func,
+        argnames,
+        self._func_name,
+        self._grad_func,
+        self._python_grad_func,
+        out_names=self._out_names,
+        **self._extra_kwargs)
 
 
 class Declare(object):
@@ -1030,8 +1067,10 @@ class Declare(object):
       names = [n for n, t in args]
       if len(names) != len(set(names)):
         raise ValueError("Expected names to all be unique: %s" % str(names))
-      return [op_def_pb2.OpDef.ArgDef(type=t.as_datatype_enum, name=n)
-              for n, t in args]
+      return [
+          op_def_pb2.OpDef.ArgDef(type=t.as_datatype_enum, name=n)
+          for n, t in args
+      ]
 
     self._sig.input_arg.extend(_to_argdef_list(inputs))
     self._sig.output_arg.extend(_to_argdef_list(outputs))
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 96bf7bde29f037ea65aed81ce919b2084ea77c62..416ab263afc685aef50fd22a8cdff6c754f94d8b 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
 import time
 
 import numpy as np
@@ -323,6 +324,48 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         _ = MyFn(100.0).eval()
 
+  def testControlFlowStrictness(self):
+    """Inlined functions must not execute in a untaken control flow branch."""
+
+    @function.Defun(dtypes.int32)
+    def AssertFail(x):
+      # Assertion that always fails and does not have a data dependency on `x`.
+      assert_false = control_flow_ops.Assert(False, [42])
+      with ops.control_dependencies([assert_false]):
+        return array_ops.identity(x)
+
+    with ops.device("CPU"):
+      pred = array_ops.placeholder(dtypes.bool)
+      x = array_ops.placeholder(dtypes.int32)
+      cond = control_flow_ops.cond(pred, lambda: x + 1, lambda: AssertFail(x))
+      # pylint: disable=unnecessary-lambda
+      loop = control_flow_ops.while_loop(lambda y: pred,
+                                         lambda y: AssertFail(y), [x])
+      # pylint: enable=unnecessary-lambda
+
+    # Enables inlining.
+    config = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
+        optimizer_options=config_pb2.OptimizerOptions(
+            opt_level=config_pb2.OptimizerOptions.L0,
+            do_common_subexpression_elimination=True,
+            do_function_inlining=True,
+            do_constant_folding=True)))
+
+    with session.Session(config=config) as sess:
+      # Since the 'False' branch is not taken, the assertion should not fire.
+      self.assertEqual(4, sess.run(cond, {pred: True, x: 3}))
+
+      # The assertion should still fire if the False branch is taken.
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "assertion"):
+        sess.run(cond, {pred: False, x: 3})
+
+      # Similarly for loops.
+      self.assertEqual(3, sess.run(loop, {pred: False, x: 3}))
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "assertion"):
+        sess.run(loop, {pred: True, x: 3})
+
   def testVar(self):
 
     @function.Defun(dtypes.float32)
@@ -722,6 +765,58 @@ class FunctionTest(test.TestCase):
       y = Bar(array_ops.zeros([1, 2, 3]))
       self.assertAllEqual(y.get_shape().as_list(), [1, 1, 2, 3])
 
+  def testVariableReuse(self):
+    def LinearWithReuse(input_tensor, reuse=None):
+      size = input_tensor.shape.dims[1]
+      with variable_scope.variable_scope("linear", reuse=reuse):
+        w = variable_scope.get_variable("w", shape=[size, size],
+                                        dtype=input_tensor.dtype)
+      return math_ops.matmul(input_tensor, w)
+
+    @function.Defun(dtypes.float32)
+    def Foo(inputs):
+      inputs = array_ops.reshape(inputs, [32, 100])
+      hidden = LinearWithReuse(inputs)
+      return LinearWithReuse(hidden, reuse=True)
+
+    input_op = array_ops.placeholder(shape=[32, 100], dtype=dtypes.float32)
+    output_op = Foo(input_op)
+
+    global_vars = variables.global_variables()
+    self.assertEqual(len(global_vars), 1)
+    self.assertEqual(global_vars[0].name, "linear/w:0")
+
+    with session.Session() as sess:
+      sess.run(variables.global_variables_initializer())
+      output_val = sess.run(output_op,
+                            feed_dict={input_op: np.random.rand(32, 100)})
+      self.assertEqual(output_val.shape, (32, 100))
+
+  def testFunctionCallInDifferentVariableScopes(self):
+    @function.Defun(dtypes.float32)
+    def Foo(inputs):
+      var = variable_scope.get_variable("var", shape=[10], dtype=dtypes.float32,
+                                        initializer=init_ops.ones_initializer())
+      return inputs + var
+
+    input_op = array_ops.placeholder(shape=[10], dtype=dtypes.float32)
+    with variable_scope.variable_scope("vs1"):
+      out1_op = Foo(input_op)
+
+    with variable_scope.variable_scope("vs2"):
+      out2_op = Foo(input_op)
+
+    global_vars = variables.global_variables()
+    self.assertEqual(len(global_vars), 1)
+    self.assertEqual(global_vars[0].name, "vs1/var:0")
+
+    with session.Session() as sess:
+      sess.run(variables.global_variables_initializer())
+      out1, out2 = sess.run([out1_op, out2_op],
+                            feed_dict={input_op: np.linspace(1, 10, 10)})
+      self.assertAllEqual(out1, np.linspace(2, 11, 10))
+      self.assertAllEqual(out2, np.linspace(2, 11, 10))
+
 
 class FunctionsFromProtos(test.TestCase):
 
@@ -1102,6 +1197,7 @@ class FunctionInlineControlTest(test.TestCase):
             do_common_subexpression_elimination=True,
             do_function_inlining=True,
             do_constant_folding=True)))
+    cell_func_call_pattern = re.compile(r"Cell[^/]*\(")
     for noinline in [False, True]:
 
       @function.Defun(dtype, noinline=noinline)
@@ -1140,7 +1236,7 @@ class FunctionInlineControlTest(test.TestCase):
       def MetadataHasCell(run_metadata):
         for dev_stats in run_metadata.step_stats.dev_stats:
           for node_stats in dev_stats.node_stats:
-            if "Cell" in node_stats.timeline_label:
+            if cell_func_call_pattern.search(node_stats.timeline_label):
               return True
         return False
 
diff --git a/tensorflow/python/framework/gen_docs_combined.py b/tensorflow/python/framework/gen_docs_combined.py
deleted file mode 100644
index 65379dda209225f3c81f4eb5ee789cb954b600d7..0000000000000000000000000000000000000000
--- a/tensorflow/python/framework/gen_docs_combined.py
+++ /dev/null
@@ -1,332 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Updates generated docs from Python doc comments."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import collections
-import os.path
-import sys
-
-import tensorflow as tf
-
-from tensorflow.contrib import ffmpeg
-from tensorflow.python import debug as tf_debug
-from tensorflow.python.client import client_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import docs
-from tensorflow.python.framework import framework_lib
-
-FLAGS = None
-
-
-PREFIX_TEXT = """
-Note: Functions taking `Tensor` arguments can also take anything accepted by
-@{tf.convert_to_tensor}.
-"""
-
-
-def module_names():
-  return [
-      "tf",
-      "tf.errors",
-      "tf.image",
-      "tf.nn",
-      "tf.train",
-      "tf.python_io",
-      "tf.saved_model",
-      "tf.summary",
-      "tf.test",
-      "tf.contrib.bayesflow.entropy",
-      "tf.contrib.bayesflow.monte_carlo",
-      "tf.contrib.bayesflow.stochastic_graph",
-      "tf.contrib.bayesflow.stochastic_tensor",
-      "tf.contrib.bayesflow.variational_inference",
-      "tf.contrib.copy_graph",
-      "tf.contrib.crf",
-      "tf.contrib.distributions",
-      "tf.contrib.distributions.bijector",
-      "tf.contrib.ffmpeg",
-      "tf.contrib.framework",
-      "tf.contrib.graph_editor",
-      "tf.contrib.integrate",
-      "tf.contrib.layers",
-      "tf.contrib.learn",
-      "tf.contrib.learn.monitors",
-      "tf.contrib.legacy_seq2seq",
-      "tf.contrib.linalg",
-      "tf.contrib.losses",
-      "tf.contrib.metrics",
-      "tf.contrib.opt",
-      "tf.contrib.rnn",
-      "tf.contrib.solvers",
-      "tf.contrib.training",
-      "tf.contrib.util",
-      "tf_debug",
-  ]
-
-
-def find_module(base_module, name):
-  if name == "tf":
-    return base_module
-  # Special case for ffmpeg is needed since it's not linked in by default due
-  # to size concerns.
-  elif name == "tf.contrib.ffmpeg":
-    return ffmpeg
-  elif name == "tf_debug":
-    return tf_debug
-  elif name.startswith("tf."):
-    subname = name[3:]
-    subnames = subname.split(".")
-    parent_module = base_module
-    for s in subnames:
-      if not hasattr(parent_module, s):
-        raise ValueError(
-            "Module not found: {}. Submodule {} not found in parent module {}."
-            " Possible candidates are {}".format(
-                name, s, parent_module.__name__, dir(parent_module)))
-      parent_module = getattr(parent_module, s)
-    return parent_module
-  else:
-    raise ValueError(
-        "Invalid module name: {}. Module names must start with 'tf.'".format(
-            name))
-
-
-def get_module_to_name(names):
-  return collections.OrderedDict([(find_module(tf, x), x) for x in names])
-
-
-def all_libraries(module_to_name, members, documented):
-  """Make a list of the individual files that we want to create.
-
-  Args:
-    module_to_name: Dictionary mapping modules to short names.
-    members: Dictionary mapping member name to (fullname, member).
-    documented: Set of documented names to update.
-
-  Returns:
-    List of (filename, docs.Library) pairs.
-  """
-  def library(name, title, module=None, **args):
-    if module is None:
-      module = sys.modules["tensorflow.python.ops." + name]
-    return (name + ".md", docs.Library(title=title,
-                                       module_to_name=module_to_name,
-                                       members=members,
-                                       documented=documented,
-                                       module=module,
-                                       **args))
-  return collections.OrderedDict([
-      # Splits of module 'tf'.
-      library("framework", "Building Graphs", framework_lib),
-      library("check_ops", "Asserts and boolean checks."),
-      library("constant_op", "Constants, Sequences, and Random Values",
-              constant_op, prefix=PREFIX_TEXT),
-      library("state_ops",
-              "Variables",
-              exclude_symbols=["create_partitioned_variables"],
-              prefix=PREFIX_TEXT),
-      library("array_ops",
-              "Tensor Transformations",
-              exclude_symbols=["list_diff"],
-              prefix=PREFIX_TEXT),
-      library("math_ops",
-              "Math",
-              exclude_symbols=["sparse_matmul", "arg_min", "arg_max",
-                               "lin_space", "sparse_segment_mean_grad"],
-              prefix=PREFIX_TEXT),
-      library("string_ops", "Strings",
-              prefix=PREFIX_TEXT),
-      library("histogram_ops", "Histograms"),
-      library("control_flow_ops", "Control Flow", prefix=PREFIX_TEXT),
-      library("functional_ops", "Higher Order Functions", prefix=PREFIX_TEXT),
-      library("tensor_array_ops", "TensorArray Operations", prefix=PREFIX_TEXT),
-      library("session_ops", "Tensor Handle Operations", prefix=PREFIX_TEXT),
-      library("image", "Images", tf.image, exclude_symbols=["ResizeMethod"],
-              prefix=PREFIX_TEXT),
-      library("sparse_ops",
-              "Sparse Tensors",
-              exclude_symbols=["serialize_sparse", "serialize_many_sparse",
-                               "deserialize_many_sparse"],
-              prefix=PREFIX_TEXT),
-      library("io_ops",
-              "Inputs and Readers",
-              exclude_symbols=["LookupTableBase", "HashTable",
-                               "initialize_all_tables",
-                               "tables_initializer",
-                               "parse_single_sequence_example",
-                               "string_to_hash_bucket"],
-              prefix=PREFIX_TEXT),
-      library("python_io", "Data IO (Python functions)", tf.python_io),
-      library("nn",
-              "Neural Network",
-              tf.nn,
-              exclude_symbols=["conv2d_backprop_input",
-                               "conv2d_backprop_filter", "avg_pool_grad",
-                               "max_pool_grad", "max_pool_grad_with_argmax",
-                               "batch_norm_with_global_normalization_grad",
-                               "lrn_grad", "relu6_grad", "softplus_grad",
-                               "softsign_grad", "xw_plus_b", "relu_layer",
-                               "lrn", "batch_norm_with_global_normalization",
-                               "batch_norm_with_global_normalization_grad",
-                               "all_candidate_sampler", "seq2seq"],
-              prefix=PREFIX_TEXT),
-      library("client", "Running Graphs", client_lib),
-      library("train",
-              "Training",
-              tf.train,
-              exclude_symbols=["Feature", "Features", "BytesList", "FloatList",
-                               "Int64List", "Example", "InferenceExample",
-                               "FeatureList", "FeatureLists", "RankingExample",
-                               "SequenceExample"]),
-      library("script_ops",
-              "Wraps python functions",
-              prefix=PREFIX_TEXT),
-      library("summary", "Summary Operations", tf.summary),
-      library("test", "Testing", tf.test),
-      library("contrib.bayesflow.entropy",
-              "BayesFlow Entropy (contrib)",
-              tf.contrib.bayesflow.entropy),
-      library("contrib.bayesflow.monte_carlo",
-              "BayesFlow Monte Carlo (contrib)",
-              tf.contrib.bayesflow.monte_carlo),
-      library("contrib.bayesflow.stochastic_graph",
-              "BayesFlow Stochastic Graph (contrib)",
-              tf.contrib.bayesflow.stochastic_graph),
-      library("contrib.bayesflow.stochastic_tensor",
-              "BayesFlow Stochastic Tensors (contrib)",
-              tf.contrib.bayesflow.stochastic_tensor),
-      library("contrib.bayesflow.variational_inference",
-              "BayesFlow Variational Inference (contrib)",
-              tf.contrib.bayesflow.variational_inference),
-      library("contrib.crf", "CRF (contrib)", tf.contrib.crf),
-      library("contrib.distributions", "Statistical Distributions (contrib)",
-              tf.contrib.distributions),
-      library("contrib.distributions.bijector",
-              "Random variable transformations (contrib)",
-              tf.contrib.distributions.bijector),
-      library("contrib.ffmpeg", "FFmpeg (contrib)", ffmpeg),
-      library("contrib.framework", "Framework (contrib)", tf.contrib.framework),
-      library("contrib.graph_editor", "Graph Editor (contrib)",
-              tf.contrib.graph_editor),
-      library("contrib.integrate", "Integrate (contrib)", tf.contrib.integrate),
-      library("contrib.layers", "Layers (contrib)", tf.contrib.layers),
-      library("contrib.learn", "Learn (contrib)", tf.contrib.learn),
-      library("contrib.learn.monitors", "Monitors (contrib)",
-              tf.contrib.learn.monitors),
-      library("contrib.legacy_seq2seq", "Sequence to Sequence (contrib)",
-              tf.contrib.legacy_seq2seq),
-      library("contrib.linalg", "Linear Algebra (contrib)",
-              tf.contrib.linalg),
-      library("contrib.losses", "Losses (contrib)", tf.contrib.losses),
-      library("contrib.opt", "Optimization (contrib)", tf.contrib.opt),
-      library("contrib.rnn", "RNN and Cells (contrib)", tf.contrib.rnn),
-      library("contrib.metrics", "Metrics (contrib)", tf.contrib.metrics),
-      library("contrib.training", "Training (contrib)", tf.contrib.training),
-      library("contrib.util", "Utilities (contrib)", tf.contrib.util),
-      library("contrib.copy_graph", "Copying Graph Elements (contrib)",
-              tf.contrib.copy_graph),
-      library("tf_debug", "TensorFlow Debugger", tf_debug),
-  ])
-
-_hidden_symbols = ["Event", "LogMessage", "Summary", "SessionLog", "xrange",
-                   "HistogramProto", "ConfigProto", "NodeDef", "GraphDef",
-                   "GPUOptions", "GraphOptions", "RunOptions", "RunMetadata",
-                   "SessionInterface", "BaseSession", "NameAttrList",
-                   "AttrValue", "OptimizerOptions",
-                   "CollectionDef", "MetaGraphDef", "QueueRunnerDef",
-                   "SaverDef", "VariableDef", "TestCase", "GrpcServer",
-                   "ClusterDef", "JobDef", "ServerDef", "TensorInfo"]
-
-# TODO(skleinfeld, deannarubin) Address shortname
-# conflict between tf.contrib.learn.NanLossDuringTrainingError and
-# tf.contrib.learn.monitors.NanLossDuringTrainingError, arising due
-# to imports in learn/python/learn/__init__.py
-# TODO(wicke): Remove contrib.layers.relu* after shortnames are
-# disabled.  These conflict with tf.nn.relu*
-EXCLUDE = frozenset(["tf.contrib.learn.monitors.NanLossDuringTrainingError",
-                     "tf.contrib.layers.dropout",
-                     "tf.contrib.layers.bias_add",
-                     "tf.contrib.layers.conv2d",
-                     "tf.contrib.layers.conv2d_transpose",
-                     "tf.contrib.layers.separable_conv2d",
-                     "tf.contrib.layers.softmax",
-                     "tf.contrib.layers.relu", "tf.contrib.layers.relu6",
-                     "tf.contrib.framework.assert_global_step",
-                     "tf.contrib.framework.get_global_step",
-                     "tf.contrib.learn.NanLossDuringTrainingError",
-                     "tf.contrib.layers.stack",
-                     "tf.contrib.layers.ProblemType",
-                     "tf.confusion_matrix"])
-
-
-def main(unused_argv):
-  if not FLAGS.out_dir:
-    tf.logging.error("out_dir not specified")
-    return -1
-
-  # Document libraries
-  documented = set()
-  module_to_name = get_module_to_name(module_names())
-  members = docs.collect_members(module_to_name, exclude=EXCLUDE)
-  libraries = all_libraries(module_to_name, members, documented).items()
-
-  # Define catch_all library before calling write_libraries to avoid complaining
-  # about generically hidden symbols.
-  catch_all = docs.Library(title="Catch All", module=None,
-                           exclude_symbols=_hidden_symbols,
-                           module_to_name=module_to_name, members=members,
-                           documented=documented)
-
-  # Write docs to files
-  docs.write_libraries(FLAGS.out_dir, libraries)
-
-  # Make it easy to search for hidden symbols
-  if FLAGS.print_hidden_regex:
-    hidden = set(_hidden_symbols)
-    for _, lib in libraries:
-      hidden.update(lib.exclude_symbols)
-    print(r"hidden symbols regex = r'\b(%s)\b'" % "|".join(sorted(hidden)))
-
-  # Verify that all symbols are mentioned in some library doc.
-  catch_all.assert_no_leftovers()
-
-  # Generate index
-  with open(os.path.join(FLAGS.out_dir, "index.md"), "w") as f:
-    docs.Index(module_to_name, members, libraries,
-               "../../api_docs/python/").write_markdown_to_file(f)
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.register("type", "bool", lambda v: v.lower() == "true")
-  parser.add_argument(
-      "--out_dir",
-      type=str,
-      default=None,
-      help="Directory to which docs should be written.")
-  parser.add_argument(
-      "--print_hidden_regex",
-      type="bool",
-      nargs="?",
-      const=True,
-      default=False,
-      help="Dump a regular expression matching any hidden symbol")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index b52f4e3772b4e93134fb069efec865b5cfb62b88..65bd3dce2e6c3a4560027798d2923625f8d76c54 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -241,6 +241,8 @@ def convert_variables_to_constants(sess, input_graph_def, output_node_names,
     else:
       output_node.CopyFrom(input_node)
     output_graph_def.node.extend([output_node])
+
+  output_graph_def.library.CopyFrom(inference_graph.library)
   print("Converted %d variables to const ops." % how_many_converted)
   return output_graph_def
 
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index 4d8cb6daae58b63cfaaa79b4319669fda3909796..f6e9bc9dad3551420093f7d35f6cd6ba3f24c6b9 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
@@ -187,6 +188,36 @@ class DeviceFunctionsTest(test.TestCase):
     self.assertEqual("n3", sub_graph.node[2].name)
     self.assertEqual("n5", sub_graph.node[3].name)
 
+  def testConvertVariablesToConstsWithFunctions(self):
+    @function.Defun(dtypes.float32)
+    def plus_one(x):
+      return x + 1.0
+
+    with ops.Graph().as_default():
+      variable_node = variables.Variable(1.0, name="variable_node")
+      _ = variables.Variable(1.0, name="unused_variable_node")
+      defun_node = plus_one(variable_node)
+      output_node = math_ops_lib.multiply(
+          defun_node, 2.0, name="output_node")
+
+      with session.Session() as sess:
+        init = variables.initialize_variables([variable_node])
+        sess.run(init)
+        output = sess.run(output_node)
+        self.assertNear(4.0, output, 0.00001)
+        variable_graph_def = sess.graph.as_graph_def()
+
+        # First get the constant_graph_def when variable_names_whitelist is set,
+        # note that if variable_names_whitelist is not set an error will be
+        # thrown because unused_variable_node is not initialized.
+        constant_graph_def = graph_util.convert_variables_to_constants(
+            sess,
+            variable_graph_def, ["output_node"],
+            variable_names_whitelist=set(["variable_node"]))
+
+        self.assertEqual(variable_graph_def.library,
+                         constant_graph_def.library)
+
   def testConvertVariablesToConsts(self):
     with ops.Graph().as_default():
       variable_node = variables.Variable(1.0, name="variable_node")
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index fcddd9546d94e0a3e5aa76daef5819b8143eebec..ed579224d32562be6ec64c51c49bce2862f928be 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -275,6 +275,9 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
 
     # 1. Add operations without their inputs.
     for node in graph_def.node:
+      # Check to see if this op's name matches a previously seen op
+      if node.name in name_to_op:
+        raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
       # Set any default attr values that aren't present.
       if node.op not in op_dict:
         raise ValueError('No op named %s in defined operations.' % node.op)
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index c4ccc3d1892c4991648be0d03af1191f95c94096..2b2398f83329b373f9d6363be036ae99597c76a2 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -685,6 +685,17 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual("return_elements must be a list of strings.",
                        str(e.exception))
 
+  def testDuplicateOperationNames(self):
+    with ops.Graph().as_default():
+      with self.assertRaises(ValueError) as e:
+        importer.import_graph_def(
+            self._MakeGraphDef("""
+            node { name: 'A' op: 'Oi' }
+            node { name: 'B' op: 'Oi' }
+            node { name: 'A' op: 'Oi' }
+            """))
+      self.assertEqual("Duplicate name 'A' in GraphDef.", str(e.exception))
+
   def testWithExtensionAndAttr(self):
     with ops.Graph().as_default() as g:
       c = constant_op.constant(5.0, dtype=dtypes.float32, name="c")
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 26344d38528c0f937ca36b6bbeceae478cf2f30c..783612c942fc299dc120382ffe7740258ea008a1 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -422,14 +422,15 @@ def import_scoped_meta_graph(meta_graph_or_file,
                              graph=None,
                              import_scope=None,
                              input_map=None,
-                             unbound_inputs_col_name="unbound_inputs"):
-  """Recreates a`Graph` saved in a `MetaGraphDef` proto.
+                             unbound_inputs_col_name="unbound_inputs",
+                             restore_collections_predicate=(lambda key: True)):
+  """Recreates a `Graph` saved in a `MetaGraphDef` proto.
 
   This function takes a `MetaGraphDef` protocol buffer as input. If
   the argument is a file containing a `MetaGraphDef` protocol buffer ,
   it constructs a protocol buffer from the file content. The function
   then adds all the nodes from the `graph_def` field to the
-  current graph, recreates all the collections, and returns a saver
+  current graph, recreates the desired collections, and returns a saver
   constructed from the `saver_def` field.
 
   In combination with `export_scoped_meta_graph()`, this function can be used to
@@ -453,6 +454,10 @@ def import_scoped_meta_graph(meta_graph_or_file,
       `Tensor` objects. The values of the named input tensors in the imported
       graph will be re-mapped to the respective `Tensor` values.
     unbound_inputs_col_name: Collection name for looking up unbound inputs.
+    restore_collections_predicate: a predicate on collection names. A collection
+      named c (i.e whose key is c) will be restored iff
+      1) `restore_collections_predicate(c)` is True, and
+      2) `c != unbound_inputs_col_name`.
 
   Returns:
     A dictionary of all the `Variables` imported into the name scope.
@@ -498,11 +503,16 @@ def import_scoped_meta_graph(meta_graph_or_file,
         input_graph_def, name=(import_scope or ""), input_map=input_map,
         producer_op_list=producer_op_list)
 
+    scope_to_prepend_to_names = "/".join(
+        [part for part in [graph.get_name_scope(), import_scope] if part])
+
     # Restores all the other collections.
     for key, col_def in meta_graph_def.collection_def.items():
       # Don't add unbound_inputs to the new graph.
       if key == unbound_inputs_col_name:
         continue
+      if not restore_collections_predicate(key):
+        continue
 
       kind = col_def.WhichOneof("kind")
       if kind is None:
@@ -517,13 +527,13 @@ def import_scoped_meta_graph(meta_graph_or_file,
           proto = proto_type()
           proto.ParseFromString(value)
           graph.add_to_collection(
-              key, from_proto(proto, import_scope=import_scope))
+              key, from_proto(proto, import_scope=scope_to_prepend_to_names))
       else:
         field = getattr(col_def, kind)
         if kind == "node_list":
           for value in field.value:
             col_op = graph.as_graph_element(
-                ops.prepend_name_scope(value, import_scope))
+                ops.prepend_name_scope(value, scope_to_prepend_to_names))
             graph.add_to_collection(key, col_op)
         elif kind == "int64_list":
           # NOTE(opensource): This force conversion is to work around the fact
@@ -534,13 +544,13 @@ def import_scoped_meta_graph(meta_graph_or_file,
         else:
           for value in field.value:
             graph.add_to_collection(
-                key, ops.prepend_name_scope(value, import_scope))
+                key, ops.prepend_name_scope(value, scope_to_prepend_to_names))
 
     var_list = {}
     variables = graph.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                                     scope=import_scope)
+                                     scope=scope_to_prepend_to_names)
     for v in variables:
-      var_list[ops.strip_name_scope(v.name, import_scope)] = v
+      var_list[ops.strip_name_scope(v.name, scope_to_prepend_to_names)] = v
 
   return var_list
 
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index de17eeb5b40f3cbdb061a0557596650676c37fef..10236576eafe296f0a20235688778a2e6c7a4495 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
@@ -332,7 +333,82 @@ class ScopedMetaGraphTest(test.TestCase):
     del orig_meta_graphs[0].collection_def["unbound_inputs"]
     del new_meta_graphs[0].collection_def["unbound_inputs"]
     for a, b in zip(orig_meta_graphs, new_meta_graphs):
-      self.assertProtoEquals(a, b)
+      test_util.assert_meta_graph_protos_equal(self, a, b)
+
+  def testScopedImportUnderNameScope(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      variables.Variable(initial_value=1.0, trainable=True, name="myvar")
+    meta_graph_def, _ = meta_graph.export_scoped_meta_graph(graph=graph)
+
+    graph = ops.Graph()
+    with graph.as_default():
+      with ops.name_scope("foo"):
+        imported_variables = meta_graph.import_scoped_meta_graph(
+            meta_graph_def, import_scope="bar")
+        self.assertEqual(len(imported_variables), 1)
+        self.assertEqual(list(imported_variables.values())[0].name,
+                         "foo/bar/myvar:0")
+
+  def testScopedImportWithSelectedCollections(self):
+    meta_graph_filename = os.path.join(
+        _TestDir("selected_collections_import"), "meta_graph.pb")
+
+    graph = ops.Graph()
+    # Add a variable to populate two collections. The functionality tested is
+    # not specific to variables, but using variables in the test is convenient.
+    with graph.as_default():
+      variables.Variable(initial_value=1.0, trainable=True)
+    self.assertTrue(
+        all([
+            graph.get_collection(key)
+            for key in
+            [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES]
+        ]))
+    meta_graph.export_scoped_meta_graph(
+        filename=meta_graph_filename, graph=graph)
+
+    def _test_import(include_collection_keys, omit_collection_keys):
+      assert set(include_collection_keys).isdisjoint(omit_collection_keys)
+      newgraph = ops.Graph()
+      import_scope = "some_scope_name"
+
+      def _restore_collections_predicate(collection_key):
+        return (collection_key in include_collection_keys and
+                collection_key not in omit_collection_keys)
+
+      meta_graph.import_scoped_meta_graph(
+          meta_graph_filename,
+          graph=newgraph,
+          import_scope=import_scope,
+          restore_collections_predicate=_restore_collections_predicate)
+      collection_values = [
+          newgraph.get_collection(name=key, scope=import_scope)
+          for key in include_collection_keys
+      ]
+      self.assertTrue(all(collection_values))
+      collection_values = [
+          newgraph.get_collection(name=key, scope=import_scope)
+          for key in omit_collection_keys
+      ]
+      self.assertFalse(any(collection_values))
+
+    _test_import(
+        include_collection_keys=[
+            ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES
+        ],
+        omit_collection_keys=[])
+    _test_import(
+        include_collection_keys=[ops.GraphKeys.GLOBAL_VARIABLES],
+        omit_collection_keys=[ops.GraphKeys.TRAINABLE_VARIABLES])
+    _test_import(
+        include_collection_keys=[ops.GraphKeys.TRAINABLE_VARIABLES],
+        omit_collection_keys=[ops.GraphKeys.GLOBAL_VARIABLES])
+    _test_import(
+        include_collection_keys=[],
+        omit_collection_keys=[
+            ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES
+        ])
 
   def _testScopedExportWithQueue(self, test_dir, exported_filename):
     graph = ops.Graph()
diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py
index 7f2b03e350981b205a918bc64f5efd909a311e56..662c2c679c8113cbba3ba4bbfcd6c587fb0fbc2c 100644
--- a/tensorflow/python/framework/op_def_library.py
+++ b/tensorflow/python/framework/op_def_library.py
@@ -19,8 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-
 import six
 
 from tensorflow.core.framework import attr_value_pb2
@@ -33,6 +31,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_contextlib
 
 
 def _Attr(op_def, name):
@@ -241,7 +240,7 @@ class _OpInfo(object):
 
 
 # pylint: disable=g-doc-return-or-yield
-@contextlib.contextmanager
+@tf_contextlib.contextmanager
 def _MaybeColocateWith(inputs):
   """A context manager for (maybe) colocating with a list of input tensors.
 
@@ -329,7 +328,7 @@ class OpDefLibrary(object):
       # Need to flatten all the arguments into a list.
       # pylint: disable=protected-access
       g = ops._get_graph_from_inputs(_Flatten(keywords.values()))
-      # pyline: enable=protected-access
+      # pylint: enable=protected-access
     except AssertionError as e:
       raise RuntimeError(
           "Cannot determine graph for Op '%s' due to: %s"
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 08572955d965495de76b6b7ae18eab41d313c22a..db2b8d4d89081bb4e5e4789b8161bb0d6aa5b4b2 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -20,7 +20,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import contextlib
 import copy
 import linecache
 import re
@@ -35,8 +34,10 @@ from tensorflow.core.framework import node_def_pb2
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.framework import versions_pb2
+from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import registry
 from tensorflow.python.framework import tensor_shape
@@ -44,6 +45,25 @@ from tensorflow.python.framework import versions
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import decorator_utils
+from tensorflow.python.util import tf_contextlib
+
+
+# Temporary global switch determining if we should enable the work-in-progress
+# calls to the C API. Currently disabled by default but can be manually enabled
+# e.g. in tests. This will be removed once all functionality is supported and
+# there's no performance penalty with it enabled.
+#
+# TODO(skyewm) before we can remove this:
+# - functions
+# - import_graph_def() incrementally adds inputs to ops (i.e. creates an
+#   Operation and then calls _add_input()). The current code requires that all
+#   inputs be specified when creating the Operation (since we call
+#   TF_FinishOperation()).
+# - ops_test.py (and others?) create unregistered op types
+# - while loop
+# - performance (e.g. delete/refactor redundant Python functionality, switch to
+#   new session API)
+_USE_C_API = False
 
 
 def _override_helper(clazz_object, operator, func):
@@ -70,25 +90,33 @@ def _override_helper(clazz_object, operator, func):
   setattr(clazz_object, operator, func)
 
 
-def _convert_stack(stack):
+def _convert_stack(stack, include_func_start_lineno=False):
   """Converts a stack extracted using _extract_stack() to a traceback stack.
 
   Args:
-    stack: A list of n 4-tuples, (filename, lineno, name, frame_globals).
+    stack: A list of n 5-tuples,
+      (filename, lineno, name, frame_globals, func_start_lineno).
+    include_func_start_lineno: True if function start line number should be
+      included as the 5th entry in return tuples.
 
   Returns:
-    A list of n 4-tuples (filename, lineno, name, code), where the code tuple
-    element is calculated from the corresponding elements of the input tuple.
+    A list of n 4-tuples or 5-tuples
+    (filename, lineno, name, code, [optional: func_start_lineno]), where the
+    code tuple element is calculated from the corresponding elements of the
+    input tuple.
   """
   ret = []
-  for filename, lineno, name, frame_globals in stack:
+  for filename, lineno, name, frame_globals, func_start_lineno in stack:
     linecache.checkcache(filename)
     line = linecache.getline(filename, lineno, frame_globals)
     if line:
       line = line.strip()
     else:
       line = None
-    ret.append((filename, lineno, name, line))
+    if include_func_start_lineno:
+      ret.append((filename, lineno, name, line, func_start_lineno))
+    else:
+      ret.append((filename, lineno, name, line))
   return ret
 
 
@@ -103,7 +131,8 @@ def _extract_stack():
     be formatted etc. using traceback methods.
 
   Returns:
-    A list of 4-tuples (filename, lineno, name, frame_globals) corresponding to
+    A list of 5-tuples
+    (filename, lineno, name, frame_globals, func_start_lineno) corresponding to
     the call stack of the current thread.
   """
   # pylint: enable=line-too-long
@@ -118,7 +147,8 @@ def _extract_stack():
     filename = co.co_filename
     name = co.co_name
     frame_globals = f.f_globals
-    ret.append((filename, lineno, name, frame_globals))
+    func_start_lineno = co.co_firstlineno
+    ret.append((filename, lineno, name, frame_globals, func_start_lineno))
     f = f.f_back
   ret.reverse()
   return ret
@@ -457,6 +487,13 @@ class Tensor(_TensorLike):
     else:
       return "%s:%d" % (self._op.name, self._value_index)
 
+  def _as_tf_output(self):
+    assert self.op._c_op  # pylint: disable=protected-access
+    tf_output = c_api.TF_Output()
+    tf_output.oper = self.op._c_op  # pylint: disable=protected-access
+    tf_output.index = self.value_index
+    return tf_output
+
   def __str__(self):
     return "Tensor(\"%s\"%s%s%s)" % (
         self.name,
@@ -1206,7 +1243,11 @@ class Operation(object):
     else:
       if not all(x.is_compatible_with(i.dtype)
                  for i, x in zip(self._inputs, input_types)):
-        raise TypeError("Inputs are not compatible with input types")
+        raise TypeError("In op '%s', input types (%s) are not compatible "
+                        "with expected types (%s)" % (
+                            self.node_def.name,
+                            [i.dtype for i in self._inputs],
+                            input_types))
     self._input_types = input_types
 
     # Build the list of control inputs.
@@ -1238,6 +1279,103 @@ class Operation(object):
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._recompute_node_def()
 
+    if _USE_C_API:
+      assert self._graph._c_graph, (  # pylint: disable=protected-access
+          "_USE_C_API set to False when creating Graph, you may need to "
+          "manually set 'ops._USE_C_API = True' before creating the Graph")
+      if self._op_def:
+        # TODO(skyewm): op_def_library.apply_op() flattens the incoming
+        # inputs. Refactor so we don't have to do this here.
+        grouped_inputs = self._reconstruct_sequence_inputs(
+            self._op_def, self._inputs, self._node_def.attr)
+      else:
+        # If no OpDef is specified, assume all inputs are scalar.
+        grouped_inputs = self._inputs
+
+      self._c_op = self._create_c_op(self._graph, self._node_def,
+                                     grouped_inputs, self._control_inputs)
+    else:
+      self._c_op = None
+
+  def _create_c_op(self, graph, node_def, inputs, control_inputs):
+    """Creates a TF_Operation.
+
+    Arguments:
+      graph: a `Graph`.
+      node_def: `node_def_pb2.NodeDef` for the operation to create.
+      inputs: A list of `Tensor`s (corresponding to scalar inputs) and lists of
+        `Tensor`s (corresponding to sequence inputs, e.g. "int64 * N",
+        "list(int64)"). The length of the list should be equal to the number of
+        inputs specified by this operation's op def.
+      control_inputs: A list of `Operation`s to set as control dependencies.
+
+    Returns:
+      A wrapped TF_Operation*.
+    """
+    # pylint: disable=protected-access
+    op_desc = c_api.TF_NewOperation(graph._c_graph, compat.as_str(node_def.op),
+                                    compat.as_str(node_def.name))
+    # Add inputs
+    for op_input in inputs:
+      if isinstance(op_input, (list, tuple)):
+        c_api.TF_AddInputList(op_desc, [t._as_tf_output() for t in op_input])
+      else:
+        c_api.TF_AddInput(op_desc, op_input._as_tf_output())
+
+    # Add control inputs
+    for control_input in control_inputs:
+      c_api.TF_AddControlInput(op_desc, control_input._c_op)
+    # pylint: enable=protected-access
+
+    # Add attrs
+    for name, attr_value in node_def.attr.items():
+      serialized = attr_value.SerializeToString()
+      # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
+      # It might be worth creating a convenient way to re-use the same status.
+      with errors.raise_exception_on_not_ok_status() as status:
+        c_api.TF_SetAttrValueProto(op_desc, compat.as_str(name), serialized,
+                                   status)
+
+    with errors.raise_exception_on_not_ok_status() as status:
+      c_op = c_api.TF_FinishOperation(op_desc, status)
+
+    return c_op
+
+  def _reconstruct_sequence_inputs(self, op_def, inputs, attrs):
+    """Regroups a flat list of input tensors into scalar and sequence inputs.
+
+    Arguments:
+      op_def: The `op_def_pb2.OpDef` (for knowing the input types)
+      inputs: a list of input `Tensor`s to the op.
+      attrs: mapping from attr name to `attr_value_pb2.AttrValue` (these define
+        how long each sequence is)
+
+    Returns:
+      A list of `Tensor`s (corresponding to scalar inputs) and lists of
+      `Tensor`s (corresponding to sequence inputs).
+    """
+    grouped_inputs = []
+    i = 0
+    for input_arg in op_def.input_arg:
+      if input_arg.number_attr:
+        input_len = attrs[input_arg.number_attr].i
+        is_sequence = True
+      elif input_arg.type_list_attr:
+        input_len = len(attrs[input_arg.type_list_attr].list.type)
+        is_sequence = True
+      else:
+        input_len = 1
+        is_sequence = False
+
+      if is_sequence:
+        grouped_inputs.append(inputs[i:i + input_len])
+      else:
+        grouped_inputs.append(inputs[i])
+      i += input_len
+
+    assert i == len(inputs)
+    return grouped_inputs
+
   def colocation_groups(self):
     """Returns the list of colocation groups of the op."""
     default_colocation_group = [compat.as_bytes("loc:@%s" %
@@ -1501,6 +1639,15 @@ class Operation(object):
     """Returns the call stack from when this operation was constructed."""
     return _convert_stack(self._traceback)
 
+  @property
+  def traceback_with_start_lines(self):
+    """Same as traceback but includes start line of function definition.
+
+    Returns:
+      A list of 5-tuples (filename, lineno, name, code, func_start_lineno).
+    """
+    return _convert_stack(self._traceback, include_func_start_lineno=True)
+
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
 
@@ -1524,12 +1671,18 @@ class Operation(object):
     if x.HasField("list"):
       for f in fields:
         if getattr(x.list, f):
-          return list(getattr(x.list, f))
+          if f == "type":
+            return [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
+          else:
+            return list(getattr(x.list, f))
       return []
     else:
       for f in fields:
         if x.HasField(f):
-          return getattr(x, f)
+          if f == "type":
+            return dtypes.as_dtype(getattr(x, f))
+          else:
+            return getattr(x, f)
       assert False, "Unsupported field type in " + str(x)
 
   def run(self, feed_dict=None, session=None):
@@ -1888,6 +2041,18 @@ def _name_from_scope_name(name):
   return name[:-1] if name[-1] == "/" else name
 
 
+class _ScopedTF_Graph(object):
+
+  def __init__(self):
+    self.graph = c_api.TF_NewGraph()
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api.TF_DeleteGraph is not None:
+      c_api.TF_DeleteGraph(self.graph)
+
+
 class Graph(object):
   """A TensorFlow computation, represented as a dataflow graph.
 
@@ -2001,6 +2166,13 @@ class Graph(object):
     self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
 
+    # TODO(skyewm): fold as much of the above as possible into the C
+    # implementation
+    if _USE_C_API:
+      self._scoped_c_graph = _ScopedTF_Graph()
+    else:
+      self._scoped_c_graph = None
+
   def _check_not_finalized(self):
     """Check if the graph is finalized.
 
@@ -2036,6 +2208,12 @@ class Graph(object):
       self._version = max(self._version, op._id)
       # pylint: enable=protected-access
 
+  @property
+  def _c_graph(self):
+    if self._scoped_c_graph:
+      return self._scoped_c_graph.graph
+    return None
+
   @property
   def version(self):
     """Returns a version number that increases as ops are added to the graph.
@@ -2682,11 +2860,11 @@ class Graph(object):
     Args:
       name: The key for the collection. For example, the `GraphKeys` class
         contains many standard names for collections.
-      scope: (Optional.) If supplied, the resulting list is filtered to include
-        only items whose `name` attribute matches using `re.match`. Items
-        without a `name` attribute are never returned if a scope is supplied and
-        the choice or `re.match` means that a `scope` without special tokens
-        filters by prefix.
+      scope: (Optional.) A string. If supplied, the resulting list is filtered
+        to include only items whose `name` attribute matches `scope` using
+        `re.match`. Items without a `name` attribute are never returned if a
+        scope is supplied. The choice of `re.match` means that a `scope` without
+        special tokens filters by prefix.
 
     Returns:
       The list of values in the collection with the given `name`, or
@@ -2725,7 +2903,7 @@ class Graph(object):
       if name in self._collections:
         del self._collections[name]
 
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def _original_op(self, op):
     """Python 'with' handler to help annotate ops with their originator.
 
@@ -2751,7 +2929,7 @@ class Graph(object):
       self._default_original_op = old_original_op
 
   # pylint: disable=g-doc-return-or-yield
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def name_scope(self, name):
     r"""Returns a context manager that creates hierarchical names for operations.
 
@@ -2907,7 +3085,24 @@ class Graph(object):
         self._names_in_use[name] = 1
     return name
 
-  @contextlib.contextmanager
+  def get_name_scope(self):
+    """Returns the current name scope.
+
+    For example:
+
+    ```python
+    with tf.name_scope('scope1'):
+      with tf.name_scope('scope2'):
+        print(tf.get_default_graph().get_name_scope())
+    ```
+    would print the string `scope1/scope2`.
+
+    Returns:
+      A string representing the current name scope.
+    """
+    return self._name_stack
+
+  @tf_contextlib.contextmanager
   def colocate_with(self, op, ignore_existing=False):
     """Returns a context manager that specifies an op to colocate with.
 
@@ -2982,7 +3177,7 @@ class Graph(object):
       if ignore_existing:
         self._colocation_stack = current_stack
 
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def device(self, device_name_or_function):
     """Returns a context manager that specifies the default device to use.
 
@@ -3064,7 +3259,7 @@ class Graph(object):
       op._set_device(device_function(op))
 
   # pylint: disable=g-doc-return-or-yield
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def container(self, container_name):
     """Returns a context manager that specifies the resource container to use.
 
@@ -3332,7 +3527,7 @@ class Graph(object):
     return self._ControlDependenciesController(self, control_ops)
 
   # pylint: disable=g-doc-return-or-yield
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def _attr_scope(self, attr_map):
     """EXPERIMENTAL: A context manager for setting attributes on operators.
 
@@ -3397,7 +3592,7 @@ class Graph(object):
   # pylint: enable=g-doc-return-or-yield
 
   # pylint: disable=g-doc-return-or-yield
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def _kernel_label_map(self, op_to_kernel_label_map):
     """EXPERIMENTAL: A context manager for setting kernel labels.
 
@@ -3459,7 +3654,7 @@ class Graph(object):
   # pylint: enable=g-doc-return-or-yield
 
   # pylint: disable=g-doc-return-or-yield
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def gradient_override_map(self, op_type_map):
     """EXPERIMENTAL: A context manager for overriding gradient functions.
 
@@ -3617,7 +3812,7 @@ class _DefaultStack(threading.local):
   def enforce_nesting(self, value):
     self._enforce_nesting = value
 
-  @contextlib.contextmanager
+  @tf_contextlib.contextmanager
   def get_controller(self, default):
     """A context manager for manipulating a default stack."""
     try:
@@ -3960,9 +4155,13 @@ class GraphKeys(object):
     for more details.
   * `REGULARIZATION_LOSSES`: regularization losses collected during graph
     construction.
-  * `WEIGHTS`: weights inside neural network layers
-  * `BIASES`: biases inside neural network layers
-  * `ACTIVATIONS`: activations of neural network layers
+
+  The following standard keys are _defined_, but their collections are **not**
+  automatically populated as many of the others are:
+
+  * `WEIGHTS`
+  * `BIASES`
+  * `ACTIVATIONS`
   """
 
   # Key to collect Variable objects that are global (shared across machines).
@@ -4120,7 +4319,7 @@ def get_all_collection_keys():
 
 
 # pylint: disable=g-doc-return-or-yield
-@contextlib.contextmanager
+@tf_contextlib.contextmanager
 def name_scope(name, default_name=None, values=None):
   """Returns a context manager for use when defining a Python op.
 
@@ -4182,10 +4381,15 @@ def strip_name_scope(name, export_scope):
     is None.
   """
   if export_scope:
-    # Strips export_scope/, export_scope///,
-    # ^export_scope/, loc:@export_scope/.
-    str_to_replace = r"([\^]|loc:@|^)" + export_scope + r"[\/]+(.*)"
-    return re.sub(str_to_replace, r"\1\2", compat.as_str(name), count=1)
+    try:
+      # Strips export_scope/, export_scope///,
+      # ^export_scope/, loc:@export_scope/.
+      str_to_replace = r"([\^]|loc:@|^)" + export_scope + r"[\/]+(.*)"
+      return re.sub(str_to_replace, r"\1\2", compat.as_str(name), count=1)
+    except TypeError as e:
+      # If the name is not of a type we can process, simply return it.
+      logging.warning(e)
+      return name
   else:
     return name
 
@@ -4202,15 +4406,20 @@ def prepend_name_scope(name, import_scope):
     is None.
   """
   if import_scope:
-    str_to_replace = r"([\^]|loc:@|^)(.*)"
-    return re.sub(str_to_replace, r"\1" + import_scope + r"/\2",
-                  compat.as_str(name))
+    try:
+      str_to_replace = r"([\^]|loc:@|^)(.*)"
+      return re.sub(str_to_replace, r"\1" + import_scope + r"/\2",
+                    compat.as_str(name))
+    except TypeError as e:
+      # If the name is not of a type we can process, simply return it.
+      logging.warning(e)
+      return name
   else:
     return name
 
 
 # pylint: disable=g-doc-return-or-yield
-@contextlib.contextmanager
+@tf_contextlib.contextmanager
 def op_scope(values, name, default_name=None):
   """DEPRECATED. Same as name_scope above, just different argument order."""
   logging.warn("tf.op_scope(values, name, default_name) is deprecated,"
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index bd4764982f507b83964fe1b0995d782279e3997e..32d9d52d00cf5e250dd3eac66bd53910ff19290a 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -18,7 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import gc
+import weakref
+
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
@@ -31,6 +37,7 @@ from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_ops_2
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resources
@@ -351,6 +358,32 @@ class OperationTest(test_util.TensorFlowTestCase):
         ops._NodeDef("noop", "op1"), ops.Graph(), [], [dtypes.float32])
     self.assertEqual("<tf.Operation 'op1' type=noop>", repr(op))
 
+  def testGetAttr(self):
+    list_value = attr_value_pb2.AttrValue.ListValue()
+    list_value.type.append(types_pb2.DT_STRING)
+    list_value.type.append(types_pb2.DT_DOUBLE)
+    op = ops.Operation(
+        ops._NodeDef(
+            "noop",
+            "op1",
+            attrs={
+                "value": attr_value_pb2.AttrValue(i=32),
+                "dtype": attr_value_pb2.AttrValue(type=types_pb2.DT_INT32),
+                "list": attr_value_pb2.AttrValue(list=list_value)
+            }), ops.Graph(), [], [dtypes.int32])
+    self.assertEqual(32, op.get_attr("value"))
+
+    d = op.get_attr("dtype")
+    # First check that d is a DType, because the assertEquals will
+    # work no matter what since DType overrides __eq__
+    self.assertIsInstance(d, dtypes.DType)
+    self.assertEqual(dtypes.int32, d)
+
+    l = op.get_attr("list")
+    for x in l:
+      self.assertIsInstance(x, dtypes.DType)
+    self.assertEqual([dtypes.string, dtypes.double], l)
+
 
 class CreateOpTest(test_util.TensorFlowTestCase):
 
@@ -1021,18 +1054,28 @@ class ComparisonTest(test_util.TensorFlowTestCase):
 class ControlDependenciesTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
-    g = ops.Graph()
-    a = _apply_op(g, "const", [], [dtypes.float32])
-    b = _apply_op(g, "const", [], [dtypes.float32])
-    with g.control_dependencies([a]):
-      c = _apply_op(g, "const", [], [dtypes.float32])
-      d = _apply_op(g, "identity", [b], [dtypes.float32])
-      e = _apply_op(g, "identity", [c], [dtypes.float32])
-
-    self.assertEqual(c.op.control_inputs, [a.op])
-    self.assertEqual(d.op.control_inputs, [a.op])
-    # e should be dominated by c.
-    self.assertEqual(e.op.control_inputs, [])
+    ops._USE_C_API = True
+    try:
+      g = ops.Graph()
+      with g.as_default():
+        # Creating unregistered ops with _apply_op() doesn't work with the C API
+        # TODO(skyewm): address this more consistently. Possible solutions are
+        # to use registered ops in all tests, create a way to register ops in
+        # Python tests, or conditionally disable the op registration check in
+        # the C API.
+        a = constant_op.constant(1.0)
+        b = constant_op.constant(1.0)
+        with g.control_dependencies([a]):
+          c = constant_op.constant(1.0)
+          d = array_ops.identity(b)
+          e = array_ops.identity(c)
+
+      self.assertEqual(c.op.control_inputs, [a.op])
+      self.assertEqual(d.op.control_inputs, [a.op])
+      # e should be dominated by c.
+      self.assertEqual(e.op.control_inputs, [])
+    finally:
+      ops._USE_C_API = False
 
   def testBasicWithConversion(self):
     g = ops.Graph()
@@ -1298,6 +1341,32 @@ class GraphTest(test_util.TensorFlowTestCase):
     with self.assertRaises(TypeError):
       g.as_graph_element(NonConvertibleObj())
 
+  # Regression test against creating custom __del__ functions in classes
+  # involved in cyclic references, e.g. Graph and Operation. (Python won't gc
+  # cycles that require calling a __del__ method, because the __del__ method can
+  # theoretically increase the object's refcount to "save" it from gc, and any
+  # already-deleted objects in the cycle would have be to restored.)
+  def testGarbageCollected(self):
+    # Create a graph we can delete and a weak reference to monitor if it's gc'd
+    g = ops.Graph()
+    g_ref = weakref.ref(g)
+    # Create some ops
+    with g.as_default():
+      a = constant_op.constant(2.0)
+      b = constant_op.constant(3.0)
+      c = math_ops.add(a, b)
+    # Create a session we can delete
+    with session.Session(graph=g) as sess:
+      sess.run(c)
+    # Delete all references and trigger gc
+    del g
+    del a
+    del b
+    del c
+    del sess
+    gc.collect()
+    self.assertIsNone(g_ref())
+
 
 class AttrScopeTest(test_util.TensorFlowTestCase):
 
@@ -1662,6 +1731,37 @@ class NameScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual(es, striped)
       self.assertEqual(ep, ops.prepend_name_scope(striped, name_scope_to_add))
 
+  def testGetNameScope(self):
+    with ops.Graph().as_default() as g:
+      with ops.name_scope("scope1"):
+        with ops.name_scope("scope2"):
+          with ops.name_scope("scope3"):
+            self.assertEqual("scope1/scope2/scope3", g.get_name_scope())
+          self.assertEqual("scope1/scope2", g.get_name_scope())
+        self.assertEqual("scope1", g.get_name_scope())
+      self.assertEqual("", g.get_name_scope())
+
+
+class TracebackTest(test_util.TensorFlowTestCase):
+
+  def testTracebackWithStartLines(self):
+    with self.test_session() as sess:
+      a = constant_op.constant(2.0)
+      sess.run(
+          a,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assertTrue(sess.graph.get_operations())
+
+      # Tests that traceback_with_start_lines is the same as traceback
+      # but includes one more element at the end.
+      for op in sess.graph.get_operations():
+        self.assertEquals(len(op.traceback), len(op.traceback_with_start_lines))
+        for frame, frame_with_start_line in zip(
+            op.traceback, op.traceback_with_start_lines):
+          self.assertEquals(5, len(frame_with_start_line))
+          self.assertEquals(frame, frame_with_start_line[:-1])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 64be2c70a11df09b96a5818445a440bcb2fd5286..a3168a008834bd9144b21e852e04d42bf3afcd90 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -21,8 +21,11 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_def.pb_text.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor.pb_text.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -66,7 +69,8 @@ bool IsPythonReserved(const string& s) {
        "ZeroDivisionError", "__debug__", "__doc__", "__import__", "__name__",
        "__package__",
        // Imports and symbols used in the generated code:
-       "_op_def_lib", "text_format", "op_def_pb2", "op_def_library", "ops"});
+       "_text_format", "_op_def_pb2", "_common_shapes", "_op_def_registry",
+       "_ops", "_op_def_library"});
 
   return kPythonReserved->count(s) > 0;
 }
@@ -175,13 +179,12 @@ string ArgTypeName(const OpDef& op_def, const OpDef::ArgDef& arg,
         prefix = "A list of";
       }
     } else {
-      prefix = strings::StrCat(
-          "A list with the same number of `Tensor` objects as `",
-          AvoidPythonReserved(*original_arg), "` of");
+      prefix = strings::StrCat("A list with the same length as `",
+                               AvoidPythonReserved(*original_arg), "` of");
     }
 
     if (arg.type() != DT_INVALID) {
-      return strings::StrCat(prefix, " `Tensor` objects of type ",
+      return strings::StrCat(prefix, " `Tensor` objects with type ",
                              TypeString(arg.type(), arg.is_ref()), ".");
     } else {
       original_arg = gtl::FindOrNull(inferred_attrs, arg.type_attr());
@@ -189,20 +192,22 @@ string ArgTypeName(const OpDef& op_def, const OpDef::ArgDef& arg,
         strings::StrAppend(&prefix, " mutable");
       }
       if (original_arg == nullptr) {
-        return strings::StrCat(prefix, " `Tensor` objects of type ",
-                               arg.type_attr(), ".");
+        return strings::StrCat(prefix, " `Tensor` objects with type `",
+                               arg.type_attr(), "`.");
       } else if (*original_arg == arg.name()) {
         const OpDef::AttrDef* attr = FindAttr(arg.type_attr(), op_def);
         if (attr->has_allowed_values()) {
           return strings::StrCat(prefix,
-                                 " `Tensor` objects of the same type in: ",
+                                 " `Tensor` objects with the same type in: ",
                                  TypeListString(attr->allowed_values()), ".");
         } else {
-          return strings::StrCat(prefix, " `Tensor` objects of the same type.");
+          return strings::StrCat(prefix,
+                                 " `Tensor` objects with the same type.");
         }
       } else {
-        return strings::StrCat(prefix, " `Tensor` objects of the same type as ",
-                               AvoidPythonReserved(*original_arg), ".");
+        return strings::StrCat(prefix,
+                               " `Tensor` objects with the same type as `",
+                               AvoidPythonReserved(*original_arg), "`.");
       }
     }
   } else if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) {
@@ -241,19 +246,19 @@ string ArgTypeName(const OpDef& op_def, const OpDef::ArgDef& arg,
   }
 }
 
-static string GetReturns(const OpDef& op_def,
-                         const std::vector<string>& output_type_string) {
+string GetReturns(const OpDef& op_def,
+                  const std::vector<string>& output_type_string) {
   string result;
   DCHECK_EQ(op_def.output_arg_size(), output_type_string.size());
   const int num_outs = op_def.output_arg_size();
-  strings::Appendf(&result, "\n  Returns:\n");
+  strings::StrAppend(&result, "\n  Returns:\n");
   if (num_outs == 0) {
-    strings::Appendf(&result, "    The created Operation.\n");
+    strings::StrAppend(&result, "    The created Operation.\n");
   } else {
     if (num_outs == 1) {
       StringPiece description = op_def.output_arg(0).description();
       if (ConsumeEquals(&description)) {  // Skip the generated type info.
-        strings::Appendf(&result, "%s", Indent(4, 4, description).c_str());
+        strings::StrAppend(&result, Indent(4, 4, description));
       } else {
         // Special case of one output, don't use the name of the output unless
         // there is no description.
@@ -272,7 +277,7 @@ static string GetReturns(const OpDef& op_def,
         } else if (!description.empty()) {
           AppendWithinWidth(&desc, description, kRightMargin - 4 /* indent */);
         }
-        strings::Appendf(&result, "%s", Indent(4, 4, desc).c_str());
+        strings::StrAppend(&result, Indent(4, 4, desc));
       }
     } else {
       std::vector<string> out_names(num_outs);
@@ -283,8 +288,8 @@ static string GetReturns(const OpDef& op_def,
           out_names[i] = strings::StrCat("output", i);
         }
       }
-      strings::Appendf(&result, "    A tuple of `Tensor` objects (%s).\n",
-                       str_util::Join(out_names, ", ").c_str());
+      strings::StrAppend(&result, "    A tuple of `Tensor` objects (",
+                         str_util::Join(out_names, ", "), ").\n\n");
       for (int i = 0; i < num_outs; ++i) {
         string desc = strings::StrCat(out_names[i], ": ");
         StringPiece description = op_def.output_arg(i).description();
@@ -307,7 +312,7 @@ static string GetReturns(const OpDef& op_def,
             strings::StrAppend(&desc, type);
           }
         }
-        strings::Appendf(&result, "%s", Indent(4, 6, desc).c_str());
+        strings::StrAppend(&result, Indent(4, 6, desc));
       }
     }
   }
@@ -337,6 +342,10 @@ string ShapeToPython(const TensorShapeProto& shape) {
   return python;
 }
 
+string TensorToPython(const TensorProto& proto) {
+  return ProtoShortDebugString(proto);
+}
+
 string AttrListToPython(const AttrValue& value) {
   string ret;
   if (value.list().s_size() > 0) {
@@ -369,6 +378,16 @@ string AttrListToPython(const AttrValue& value) {
       if (i > 0) strings::StrAppend(&ret, ", ");
       strings::StrAppend(&ret, ShapeToPython(value.list().shape(i)));
     }
+  } else if (value.list().tensor_size() > 0) {
+    for (int i = 0; i < value.list().tensor_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, TensorToPython(value.list().tensor(i)));
+    }
+  } else if (value.list().func_size() > 0) {
+    for (int i = 0; i < value.list().func_size(); ++i) {
+      if (i > 0) strings::StrAppend(&ret, ", ");
+      strings::StrAppend(&ret, StringToPython(value.list().func(i).name()));
+    }
   }
   return ret;
 }
@@ -386,12 +405,36 @@ string AttrValueToPython(const string& type, const AttrValue& value) {
     return DataTypeToPython(value.type());
   } else if (type == "shape") {
     return ShapeToPython(value.shape());
-  } else {
+  } else if (type == "tensor") {
+    return TensorToPython(value.tensor());
+  } else if (type == "func") {
+    return StringToPython(value.func().name());
+  } else if (StringPiece(type).starts_with("list(")) {
     return strings::StrCat("[", AttrListToPython(value), "]");
+  } else {
+    return "?";
   }
 }
 
-static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
+void GenerateLowerCaseOpName(const string& str, string* result) {
+  const char joiner = '_';
+  const int last_index = str.size() - 1;
+  for (int i = 0; i <= last_index; ++i) {
+    const char c = str[i];
+    // Emit a joiner only if a previous-lower-to-now-upper or a
+    // now-upper-to-next-lower transition happens.
+    if (isupper(c) && (i > 0)) {
+      if (islower(str[i - 1]) || ((i < last_index) && islower(str[i + 1]))) {
+        result->push_back(joiner);
+      }
+    }
+    result->push_back(tolower(c));
+  }
+}
+
+}  // namespace
+
+string GetPythonOp(const OpDef& op_def, bool is_hidden, const string& op_name) {
   string result;
   // Map from attr name to the first input arg it is inferred from.
   std::unordered_map<string, string> inferred_attrs;
@@ -399,7 +442,7 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
   // defaults.
   std::vector<string> args_no_default;
   // The parameters with defaults (these have to be listed after those without).
-  // No input args are included, just attrs and the graph ("g") parameter.
+  // No input args are included, just attrs.
   std::vector<string> args_with_defaults;
   for (int i = 0; i < op_def.input_arg_size(); ++i) {
     const auto& arg(op_def.input_arg(i));
@@ -430,8 +473,7 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
   // those with defaults go at the end.
   std::vector<string> attrs;
   // Get the attrs in the order we want by taking the attrs without defaults
-  // from the end of args_no_default, and adding args_no_default (before
-  // "g" gets added to args_no_default, so it only has attrs).
+  // from the end of args_no_default, and adding args_no_default.
   attrs.reserve(args_no_default.size() - op_def.input_arg_size() +
                 args_with_defaults.size());
   attrs.insert(attrs.end(), args_no_default.begin() + op_def.input_arg_size(),
@@ -454,51 +496,51 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
     strings::StrAppend(&parameters, param, "=None");
     param_names.push_back(param);
   }
-  const bool has_args = args_no_default.size() + args_with_defaults.size() > 0;
 
   const string lower_op_name = strings::StrCat(is_hidden ? "_" : "", op_name);
 
-  // Prepare the list of output names
   const int num_outs = op_def.output_arg_size();
-  std::vector<string> out_names(num_outs);
-  for (int i = 0; i < num_outs; ++i) {
-    if (!op_def.output_arg(i).name().empty()) {
-      out_names[i] = op_def.output_arg(i).name();
-    } else {
-      out_names[i] = strings::StrCat("output", i);
-    }
-  }
-  string out_names_list =
-      strings::StrCat("[\"", str_util::Join(out_names, "\", \""), "\"]");
-
-  // Provide the output names as a Python list
-  string lower_op_name_outputs =
-      strings::StrCat("_", lower_op_name, "_outputs");
-  const string outputs_prefix = strings::StrCat(lower_op_name_outputs, " = ");
-  strings::Appendf(
-      &result, "%s\n",
-      WordWrap(outputs_prefix, out_names_list, kRightMargin).c_str());
-  strings::Appendf(&result, "\n\n");
-
   // Prepare a NamedTuple type to hold the outputs, if there are multiple
   if (num_outs > 1) {
-    const string tuple_type_prefix = strings::StrCat(
-        "_", op_def.name(), "Output = _collections.namedtuple(");
+    // Prepare the list of output names
+    std::vector<string> out_names(num_outs);
+    for (int i = 0; i < num_outs; ++i) {
+      if (!op_def.output_arg(i).name().empty()) {
+        out_names[i] = op_def.output_arg(i).name();
+      } else {
+        out_names[i] = strings::StrCat("output", i);
+      }
+    }
+    string out_names_list =
+        strings::StrCat("[\"", str_util::Join(out_names, "\", \""), "\"]");
+
+    // Provide the output names as a Python list
+    string lower_op_name_outputs =
+        strings::StrCat("_", lower_op_name, "_outputs");
+    const string outputs_prefix = strings::StrCat(lower_op_name_outputs, " = ");
+    strings::StrAppend(&result, "\n",
+                       WordWrap(outputs_prefix, out_names_list, kRightMargin),
+                       "\n");
+
+    strings::StrAppend(&result, "_", op_def.name(),
+                       "Output = _collections.namedtuple(\n");
+    const string tuple_type_prefix = "    ";
     const string tuple_type_suffix = strings::StrCat(
         "\"", op_def.name(), "\", ", lower_op_name_outputs, ")");
-    strings::Appendf(
-        &result, "%s\n",
-        WordWrap(tuple_type_prefix, tuple_type_suffix, kRightMargin).c_str());
-    strings::Appendf(&result, "\n\n");
+    strings::StrAppend(
+        &result, WordWrap(tuple_type_prefix, tuple_type_suffix, kRightMargin),
+        "\n\n");
   }
+  strings::StrAppend(&result, "\n");
 
   // Print: def Function(parameters):
   const string def_prefix = strings::StrCat("def ", lower_op_name, "(");
+  const bool has_args = args_no_default.size() + args_with_defaults.size() > 0;
   const string def_suffix =
       strings::StrCat(parameters, has_args ? ", " : "", "name=None):");
 
-  strings::Appendf(&result, "%s\n",
-                   WordWrap(def_prefix, def_suffix, kRightMargin).c_str());
+  strings::StrAppend(&result, WordWrap(def_prefix, def_suffix, kRightMargin),
+                     "\n");
 
   // Format the Op's descriptions so that it can be a Python docstring.
   string comment;
@@ -511,7 +553,7 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
     }
   }
 
-  strings::Appendf(&result, "  r\"\"\"%s\n  Args:\n", comment.c_str());
+  strings::StrAppend(&result, "  r\"\"\"", comment, "\n  Args:\n");
 
   // Inputs
   for (int i = 0; i < op_def.input_arg_size(); ++i) {
@@ -527,7 +569,7 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
     if (!description.empty()) {
       AppendWithinWidth(&desc, description, kRightMargin - 4 /* indent */);
     }
-    strings::Appendf(&result, "%s", Indent(4, 6, desc).c_str());
+    strings::StrAppend(&result, Indent(4, 6, desc));
   }
 
   // Attrs
@@ -549,6 +591,10 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
         {"shape", "`tf.TensorShape` or list of `ints`"},
         {"list(shape)",
          "list of shapes (each a `tf.TensorShape` or list of `ints`)"},
+        {"tensor", "`tf.TensorProto`"},
+        {"list(tensor)", "list of `tf.TensorProto` objects"},
+        {"func", "function decorated with @Defun"},
+        {"list(func)", "list of functions decorated with @Defun"},
     };
     for (size_t i = 0; i < TF_ARRAYSIZE(kAttrTypeName); ++i) {
       if (attr.type() == kAttrTypeName[i][0]) {
@@ -592,14 +638,15 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
       AppendWithinWidth(&desc, attr.description(),
                         kRightMargin - 4 /* indent */);
     }
-    strings::Appendf(&result, "%s", Indent(4, 6, desc).c_str());
+    strings::StrAppend(&result, Indent(4, 6, desc));
   }
 
-  strings::Appendf(&result, "    name: A name for the operation (optional).\n");
+  strings::StrAppend(&result,
+                     "    name: A name for the operation (optional).\n");
 
   std::vector<string> output_type_string;
-  output_type_string.reserve(op_def.output_arg_size());
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+  output_type_string.reserve(num_outs);
+  for (int i = 0; i < num_outs; ++i) {
     output_type_string.push_back(
         ArgTypeName(op_def, op_def.output_arg(i), inferred_attrs, true));
   }
@@ -612,46 +659,27 @@ static string GetPythonOp(const OpDef& op_def, bool is_hidden, string op_name) {
   }
   strings::StrAppend(&return_args, "name=name)");
 
-  strings::Appendf(&result, "  \"\"\"\n%s\n",
-                   // Wrap the arguments, and indent to the (.
-                   WordWrap(return_prefix, return_args, kRightMargin).c_str());
+  strings::StrAppend(&result, "  \"\"\"\n",
+                     // Wrap the arguments, and indent to the (.
+                     WordWrap(return_prefix, return_args, kRightMargin), "\n");
 
   if (num_outs <= 1) {
-    strings::Appendf(&result, "  return result\n");
+    strings::StrAppend(&result, "  return result\n");
   } else {
-    string return_tuple =
-        strings::StrCat("  return _", op_def.name(), "Output._make(result)\n");
-    strings::Appendf(&result, "%s", return_tuple.c_str());
+    strings::StrAppend(&result, "  return _", op_def.name(),
+                       "Output._make(result)\n");
   }
+  strings::StrAppend(&result, "\n\n");
 
-  strings::Appendf(&result, "\n\n");
   return result;
 }
 
-void GenerateLowerCaseOpName(const string& str, string* result) {
-  char joiner = '_';
-  int last_index = str.size() - 1;
-  for (int i = 0; i <= last_index; ++i) {
-    char c = str[i];
-    // Emit a joiner only if a previous-lower-to-now-upper or a
-    // now-upper-to-next-lower transition happens.
-    if (isupper(c) && (i > 0)) {
-      if (islower(str[i - 1]) || ((i < last_index) && islower(str[i + 1]))) {
-        result->push_back(joiner);
-      }
-    }
-    result->push_back(tolower(c));
-  }
-}
-
-}  // namespace
-
 string GetPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
                     bool require_shapes) {
   string result;
   // Header
   // TODO(josh11b): Mention the library for which wrappers are being generated.
-  strings::Appendf(&result, R"("""Python wrappers around Brain.
+  strings::StrAppend(&result, R"("""Python wrappers around TensorFlow ops.
 
 This file is MACHINE GENERATED! Do not edit.
 """
@@ -699,8 +727,8 @@ from tensorflow.python.framework import op_def_library as _op_def_library
                        GetPythonOp(op_def, is_hidden, lower_case_name));
 
     if (!require_shapes) {
-      strings::Appendf(&result, "_ops.RegisterShape(\"%s\")(None)\n",
-                       op_def.name().c_str());
+      strings::StrAppend(&result, "_ops.RegisterShape(\"", op_def.name(),
+                         "\")(None)\n");
     }
 
     auto added = out->Add();
@@ -722,7 +750,7 @@ _InitOpDefLibrary.op_list_ascii = """%s"""
 
 _op_def_lib = _InitOpDefLibrary()
 )",
-                   cleaned_ops.DebugString().c_str());
+                   ProtoDebugString(cleaned_ops).c_str());
   return result;
 }
 
@@ -731,8 +759,8 @@ void PrintPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
   printf("%s", GetPythonOps(ops, hidden_ops, require_shapes).c_str());
 }
 
-string GetPythonWrappers(const char* op_wrapper_buf, size_t op_wrapper_len) {
-  string op_list_str(op_wrapper_buf, op_wrapper_len);
+string GetPythonWrappers(const char* op_list_buf, size_t op_list_len) {
+  string op_list_str(op_list_buf, op_list_len);
   OpList ops;
   ops.ParseFromString(op_list_str);
   return GetPythonOps(ops, {}, false);
diff --git a/tensorflow/python/framework/python_op_gen.h b/tensorflow/python/framework/python_op_gen.h
index 424244fcc55006943340ed865e97b9572a14102e..d865c238743ae7b8a5dc5a0101e2f154fca9baed 100644
--- a/tensorflow/python/framework/python_op_gen.h
+++ b/tensorflow/python/framework/python_op_gen.h
@@ -31,11 +31,13 @@ void PrintPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
                     bool require_shapes);
 string GetPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
                     bool require_shapes);
+string GetPythonOp(const OpDef& op_def, bool is_hidden, const string& op_name);
 
 // Get the python wrappers for a list of ops in a OpList.
-// buf should be a pointer to a buffer containing the binary encoded OpList
-// proto, and len should be the length of that buffer.
-string GetPythonWrappers(const char* op_wrapper_buf, size_t op_wrapper_len);
+// `op_list_buf` should be a pointer to a buffer containing
+// the binary encoded OpList proto, and `op_list_len` should be the
+// length of that buffer.
+string GetPythonWrappers(const char* op_list_buf, size_t op_list_len);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/python/framework/python_op_gen.i b/tensorflow/python/framework/python_op_gen.i
index 740eff4ecdb24a933abceeec11e5be1c60037e5b..26ec4e8e66b5d4e3be433c9e59f9b6034109d153 100644
--- a/tensorflow/python/framework/python_op_gen.i
+++ b/tensorflow/python/framework/python_op_gen.i
@@ -25,7 +25,7 @@ limitations under the License.
 // going from python bytes to const char* tries to decode the
 // contents from utf-8 to unicode for Python version >= 3, but
 // we want the bytes to be uninterpreted.
-%typemap(in) (const char* op_wrapper_buf, size_t op_wrapper_len) {
+%typemap(in) (const char* op_list_buf, size_t op_list_len) {
   char* c_string;
   Py_ssize_t py_size;
   if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
diff --git a/tensorflow/python/framework/random_seed_test.py b/tensorflow/python/framework/random_seed_test.py
index d64500fbc96821d0cd007c3f82d25b4e0599ed07..c1d2b05b0b7ffd76d59452a7be5106b5b92872a6 100644
--- a/tensorflow/python/framework/random_seed_test.py
+++ b/tensorflow/python/framework/random_seed_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tensorflow.python.framework.ops."""
+"""Tests for tensorflow.python.framework.random_seed."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py
index 19a2b187b9b50ef99f2ca626b3db55a91ccf0e01..e709eaeda14e1eaae93ff39a4dc6b85970e976e1 100644
--- a/tensorflow/python/framework/sparse_tensor_test.py
+++ b/tensorflow/python/framework/sparse_tensor_test.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tests for tensorflow.python.framework.ops."""
+"""Tests for tensorflow.python.framework.sparse_tensor."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
diff --git a/tensorflow/python/framework/subscribe.py b/tensorflow/python/framework/subscribe.py
index 91c6e33f22c6a7cee267950b4acf7cb077e7416d..2654bca31c8b13474f8c6e547a03ba33c75260d2 100644
--- a/tensorflow/python/framework/subscribe.py
+++ b/tensorflow/python/framework/subscribe.py
@@ -276,7 +276,7 @@ def subscribe(tensors, side_effects):
     Subscribed tensors, which are identity copies of the passed in tensors
       in the same passed in structure, but the graph has been modified
       such that these are downstream of the control dependencies for
-      the side effect graphs. Use these functionally equivelant tensors
+      the side effect graphs. Use these functionally equivalent tensors
       instead of the passed in tensors for further construction or running.
   """
   if not hasattr(side_effects, '__iter__'):
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 3664710caa331a1d0960e95f7728db1c68d1706d..3aedbfef0d592885f2ef5a6d48be668fe0ee0abf 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Helper classes for tensor shape inference."""
 from __future__ import absolute_import
 from __future__ import division
@@ -31,8 +30,8 @@ class Dimension(object):
       self._value = None
     else:
       self._value = int(value)
-      if (not isinstance(value, compat.bytes_or_text_types)
-          and self._value != value):
+      if (not isinstance(value, compat.bytes_or_text_types) and
+          self._value != value):
         raise ValueError("Ambiguous dimension: %s" % value)
       if self._value < 0:
         raise ValueError("Dimension %d must be >= 0" % self._value)
@@ -67,6 +66,11 @@ class Dimension(object):
   def __int__(self):
     return self._value
 
+  # This is needed for Windows.
+  # See https://github.com/tensorflow/tensorflow/pull/9780
+  def __long__(self):
+    return self._value
+
   def __index__(self):
     # Allow use in Python 3 range
     return self._value
@@ -89,9 +93,8 @@ class Dimension(object):
       True if this Dimension and `other` are compatible.
     """
     other = as_dimension(other)
-    return (self._value is None
-            or other.value is None
-            or self._value == other.value)
+    return (self._value is None or other.value is None or
+            self._value == other.value)
 
   def assert_is_compatible_with(self, other):
     """Raises an exception if `other` is not compatible with this Dimension.
@@ -104,8 +107,8 @@ class Dimension(object):
         is_compatible_with).
     """
     if not self.is_compatible_with(other):
-      raise ValueError("Dimensions %s and %s are not compatible"
-                       % (self, other))
+      raise ValueError("Dimensions %s and %s are not compatible" % (self,
+                                                                    other))
 
   def merge_with(self, other):
     """Returns a Dimension that combines the information in `self` and `other`.
@@ -385,18 +388,17 @@ class TensorShape(object):
   `Tensor`. It may be one of the following:
 
   * *Fully-known shape:* has a known number of dimensions and a known size
-    for each dimension.
+    for each dimension. e.g. `TensorShape([16, 256])`
   * *Partially-known shape:* has a known number of dimensions, and an unknown
-    size for one or more dimension.
+    size for one or more dimension. e.g. `TensorShape([None, 256])`
   * *Unknown shape:* has an unknown number of dimensions, and an unknown
-    size in all dimensions.
+    size in all dimensions. e.g. `TensorShape(None)`
 
   If a tensor is produced by an operation of type `"Foo"`, its shape
   may be inferred if there is a registered shape function for
-  `"Foo"`. See @{$adding_an_op#shape-functions-in-c$`Shape functions in   C++`} for
-  details of shape functions and how to register them. Alternatively,
-  the shape may be set explicitly using
-  @{tf.Tensor.set_shape}.
+  `"Foo"`. See @{$adding_an_op#shape-functions-in-c$`Shape functions in C++`}
+  for details of shape functions and how to register them. Alternatively,
+  the shape may be set explicitly using @{tf.Tensor.set_shape}.
   """
 
   def __init__(self, dims):
@@ -414,7 +416,7 @@ class TensorShape(object):
       self._dims = None
     elif isinstance(dims, compat.bytes_or_text_types):
       raise TypeError("A string has ambiguous TensorShape, please wrap in a "
-                       "list or convert to an int: %s" % dims)
+                      "list or convert to an int: %s" % dims)
     elif isinstance(dims, tensor_shape_pb2.TensorShapeProto):
       if dims.unknown_rank:
         self._dims = None
@@ -422,7 +424,8 @@ class TensorShape(object):
         self._dims = [
             # Protos store variable-size dimensions as -1
             as_dimension(dim.size if dim.size != -1 else None)
-            for dim in dims.dim]
+            for dim in dims.dim
+        ]
     elif isinstance(dims, TensorShape):
       self._dims = dims.dims
     else:
@@ -519,7 +522,7 @@ class TensorShape(object):
           # suffixes of otherwise unknown shapes.
           return unknown_shape()
         else:
-          return unknown_shape(ndims=stop-start)
+          return unknown_shape(ndims=stop - start)
       else:
         return Dimension(None)
 
@@ -560,8 +563,7 @@ class TensorShape(object):
           new_dims.append(dim.merge_with(other[i]))
         return TensorShape(new_dims)
       except ValueError:
-        raise ValueError("Shapes %s and %s are not compatible" %
-                         (self, other))
+        raise ValueError("Shapes %s and %s are not compatible" % (self, other))
 
   def concatenate(self, other):
     """Returns the concatenation of the dimension in `self` and `other`.
@@ -599,8 +601,8 @@ class TensorShape(object):
     other = as_shape(other)
     if self.ndims is not None and other.ndims is not None:
       if self.ndims != other.ndims:
-        raise ValueError(
-            "Shapes %s and %s must have the same rank" % (self, other))
+        raise ValueError("Shapes %s and %s must have the same rank" % (self,
+                                                                       other))
 
   def assert_has_rank(self, rank):
     """Raises an exception if `self` is not compatible with the given `rank`.
@@ -736,8 +738,8 @@ class TensorShape(object):
 
   def is_fully_defined(self):
     """Returns True iff `self` is fully defined in every dimension."""
-    return (self._dims is not None
-            and all(dim.value is not None for dim in self._dims))
+    return (self._dims is not None and all(dim.value is not None
+                                           for dim in self._dims))
 
   def assert_is_fully_defined(self):
     """Raises an exception if `self` is not fully defined in every dimension.
@@ -767,9 +769,10 @@ class TensorShape(object):
       return tensor_shape_pb2.TensorShapeProto(unknown_rank=True)
     else:
       return tensor_shape_pb2.TensorShapeProto(dim=[
-          tensor_shape_pb2.TensorShapeProto.Dim(
-              size=-1 if d.value is None else d.value)
-          for d in self._dims])
+          tensor_shape_pb2.TensorShapeProto.Dim(size=-1
+                                                if d.value is None else d.value)
+          for d in self._dims
+      ])
 
   def __eq__(self, other):
     """Returns True if `self` is equivalent to `other`."""
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index c2378ac4b22b693b63ee3e3b68abed55f71e403b..10811100010614a8b0b18cd4f24ddd5dcc5bb542 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -368,7 +368,9 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
       np_dt = dtype.as_numpy_dtype
     else:
       np_dt = None
-    if np.prod(shape) == 0:
+    # If shape is None, numpy.prod returns None when dtype is not set, but raises
+    # exception when dtype is set to np.int64
+    if shape is not None and np.prod(shape, dtype=np.int64) == 0:
       nparray = np.empty(shape, dtype=np_dt)
     else:
       _AssertCompatible(values, dtype)
@@ -414,7 +416,7 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
     shape_size = nparray.size
   else:
     shape = [int(dim) for dim in shape]
-    shape_size = np.prod(shape)
+    shape_size = np.prod(shape, dtype=np.int64)
     is_same_size = shape_size == nparray.size
 
     if verify_shape:
@@ -491,7 +493,7 @@ def MakeNdarray(tensor):
 
   """
   shape = [d.size for d in tensor.tensor_shape.dim]
-  num_elements = np.prod(shape)
+  num_elements = np.prod(shape, dtype=np.int64)
   tensor_dtype = dtypes.as_dtype(tensor.dtype)
   dtype = tensor_dtype.as_numpy_dtype
 
@@ -671,6 +673,13 @@ def _ConstantValue(tensor):
         return None
       values.append(value)
     return np.array(values)
+  elif tensor.op.type == "Fill":
+    fill_shape = tensor.shape
+    fill_value = constant_value(tensor.op.inputs[1])
+    if fill_shape.is_fully_defined() and fill_value is not None:
+      return np.full(fill_shape.as_list(), fill_value, dtype=fill_value.dtype)
+    else:
+      return None
   else:
     return None
 
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index 727438a56df7a2c5e5ebb3f053120941f46b53be..8949702b8752796c0de796b37959e355c7fe12ef 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 import sys
+import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -48,13 +48,13 @@ class TensorUtilTest(test.TestCase):
 
   def testFloatN(self):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0])
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_FLOAT  
         tensor_shape { dim { size: 3 } }  
         tensor_content: "A \000\000A\240\000\000A\360\000\000"  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_FLOAT
         tensor_shape { dim { size: 3 } }
@@ -66,12 +66,12 @@ class TensorUtilTest(test.TestCase):
 
   def testFloatTyped(self):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], dtype=dtypes.float32)
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_FLOAT  
         tensor_shape { dim { size: 3 } }  
         tensor_content: "A \000\000A\240\000\000A\360\000\000"  
-        """, t)  
+        """, t)
     else:
       self.assertProtoEquals("""
         dtype: DT_FLOAT
@@ -84,13 +84,13 @@ class TensorUtilTest(test.TestCase):
 
   def testFloatTypeCoerce(self):
     t = tensor_util.make_tensor_proto([10, 20, 30], dtype=dtypes.float32)
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_FLOAT  
         tensor_shape { dim { size: 3 } }  
         tensor_content: "A \000\000A\240\000\000A\360\000\000"  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_FLOAT
         tensor_shape { dim { size: 3 } }
@@ -103,13 +103,13 @@ class TensorUtilTest(test.TestCase):
   def testFloatTypeCoerceNdarray(self):
     arr = np.asarray([10, 20, 30], dtype="int")
     t = tensor_util.make_tensor_proto(arr, dtype=dtypes.float32)
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_FLOAT  
         tensor_shape { dim { size: 3 } }  
         tensor_content: "A \000\000A\240\000\000A\360\000\000"  
-        """, t)  
-    else: 
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_FLOAT
         tensor_shape { dim { size: 3 } }
@@ -121,13 +121,13 @@ class TensorUtilTest(test.TestCase):
 
   def testFloatSizes(self):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], shape=[1, 3])
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_FLOAT  
         tensor_shape { dim { size: 1 } dim { size: 3 } }  
         tensor_content: "A \000\000A\240\000\000A\360\000\000"  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_FLOAT
         tensor_shape { dim { size: 1 } dim { size: 3 } }
@@ -139,13 +139,13 @@ class TensorUtilTest(test.TestCase):
 
   def testFloatSizes2(self):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], shape=[3, 1])
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_FLOAT  
         tensor_shape { dim { size: 3 } dim { size: 1 } }  
         tensor_content: "A \000\000A\240\000\000A\360\000\000"  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_FLOAT
         tensor_shape { dim { size: 3 } dim { size: 1 } }
@@ -167,13 +167,13 @@ class TensorUtilTest(test.TestCase):
   def testFloatNpArrayFloat64(self):
     t = tensor_util.make_tensor_proto(
         np.array([[10.0, 20.0, 30.0]], dtype=np.float64))
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_DOUBLE  
         tensor_shape { dim { size: 1 } dim { size: 3 } }  
         tensor_content: "@$\000\000\000\000\000\000@4\000\000\000\000\000\000@>\000\000\000\000\000\000"  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_DOUBLE
         tensor_shape { dim { size: 1 } dim { size: 3 } }
@@ -258,13 +258,13 @@ class TensorUtilTest(test.TestCase):
 
   def testIntNDefaultType(self):
     t = tensor_util.make_tensor_proto([10, 20, 30, 40], shape=[2, 2])
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_INT32  
         tensor_shape { dim { size: 2 } dim { size: 2 } }  
         tensor_content: "\000\000\000\\n\000\000\000\024\000\000\000\036\000\000\000("  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_INT32
         tensor_shape { dim { size: 2 } dim { size: 2 } }
@@ -328,13 +328,13 @@ class TensorUtilTest(test.TestCase):
   def testLongN(self):
     t = tensor_util.make_tensor_proto(
         [10, 20, 30], shape=[1, 3], dtype=dtypes.int64)
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_INT64  
         tensor_shape { dim { size: 1 } dim { size: 3 } }  
         tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"  
-        """, t)  
-    else: 
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_INT64
         tensor_shape { dim { size: 1 } dim { size: 3 } }
@@ -346,13 +346,13 @@ class TensorUtilTest(test.TestCase):
 
   def testLongNpArray(self):
     t = tensor_util.make_tensor_proto(np.array([10, 20, 30]))
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_INT64  
         tensor_shape { dim { size: 3 } }  
         tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_INT64
         tensor_shape { dim { size: 3 } }
@@ -367,13 +367,13 @@ class TensorUtilTest(test.TestCase):
     data = [(21,), (22,), (23,)]
 
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.qint32)
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_QINT32  
         tensor_shape { dim { size: 3 } }  
         tensor_content: "\000\000\000\025\000\000\000\026\000\000\000\027"  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_QINT32
         tensor_shape { dim { size: 3 } }
@@ -404,13 +404,13 @@ class TensorUtilTest(test.TestCase):
     self.assertAllEqual(np.array(data, dtype=a.dtype), a)
 
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.quint16)
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_QUINT16  
         tensor_shape { dim { size: 3 } }  
         tensor_content: "\000\025\000\026\000\027"  
-        """, t)  
-    else:  
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_QUINT16
         tensor_shape { dim { size: 3 } }
@@ -421,13 +421,13 @@ class TensorUtilTest(test.TestCase):
     self.assertAllEqual(np.array(data, dtype=a.dtype), a)
 
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.qint16)
-    if sys.byteorder == "big":  
-      self.assertProtoEquals("""  
+    if sys.byteorder == "big":
+      self.assertProtoEquals("""
         dtype: DT_QINT16  
         tensor_shape { dim { size: 3 } }  
         tensor_content: "\000\025\000\026\000\027"  
-        """, t)  
-    else: 
+        """, t)
+    else:
       self.assertProtoEquals("""
         dtype: DT_QINT16
         tensor_shape { dim { size: 3 } }
@@ -669,7 +669,9 @@ class TensorUtilTest(test.TestCase):
     self.assertFalse(tensor_util.ShapeEquals(t, [4]))
 
   def testMockArray(self):
+
     class MockArray(object):
+
       def __init__(self, array):
         self.array = array
 
@@ -711,6 +713,13 @@ class ConstantValueTest(test.TestCase):
     self.assertAllEqual(np_val, c_val)
     self.assertEqual(np.int32, c_val.dtype)
 
+  def testFill(self):
+    np_val = np.array([-1, -1, -1], dtype=np.float32)
+    tf_val = array_ops.fill([3], constant_op.constant(-1.0))
+    c_val = tensor_util.constant_value(tf_val)
+    self.assertAllEqual(np_val, c_val)
+    self.assertEqual(np.float32, c_val.dtype)
+
   def testSize(self):
     tf_val = array_ops.size(constant_op.constant(0.0, shape=[1, 2, 3]))
     c_val = tensor_util.constant_value(tf_val)
diff --git a/tensorflow/python/framework/test_file_system.cc b/tensorflow/python/framework/test_file_system.cc
index 32b9b82e56b0b63c1ce167356bd9dea06ec51673..094ea6f658ab800736eebce2db7ee80da151a033 100644
--- a/tensorflow/python/framework/test_file_system.cc
+++ b/tensorflow/python/framework/test_file_system.cc
@@ -18,14 +18,20 @@ limitations under the License.
 namespace tensorflow {
 
 class TestRandomAccessFile : public RandomAccessFile {
-  // The filecontents is all A's
+  // The file contents is 10 bytes of all A's
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
+    Status s;
     for (int i = 0; i < n; ++i) {
+      if (offset + i >= 10) {
+        n = i;
+        s = errors::OutOfRange("EOF");
+        break;
+      }
       scratch[i] = 'A';
     }
     *result = StringPiece(scratch, n);
-    return Status::OK();
+    return s;
   }
 };
 
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index a9755506abd19238dfed700d7806df3fc97ac873..485bb3b109b6e260cd96deffd59b8b09ddef892d 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -29,6 +29,12 @@ import threading
 import numpy as np
 import six
 
+try:
+  import portpicker  # pylint: disable=g-import-not-at-top
+except ImportError as _portpicker_import_error:
+  portpicker = None
+
+# pylint: disable=g-import-not-at-top
 from google.protobuf import descriptor_pool
 from google.protobuf import text_format
 
@@ -45,6 +51,7 @@ from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 from tensorflow.python.util.protobuf import compare
 
@@ -117,6 +124,36 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
     raise AssertionError(compat.as_str(diff))
 
 
+def assert_meta_graph_protos_equal(tester, a, b):
+  """Compares MetaGraphDefs `a` and `b` in unit test class `tester`."""
+  # Carefully check the collection_defs
+  tester.assertEqual(set(a.collection_def), set(b.collection_def))
+  collection_keys = a.collection_def.keys()
+  for k in collection_keys:
+    a_value = a.collection_def[k]
+    b_value = b.collection_def[k]
+    proto_type = ops.get_collection_proto_type(k)
+    if proto_type:
+      a_proto = proto_type()
+      b_proto = proto_type()
+      # Number of entries in the collections is the same
+      tester.assertEqual(len(a_value.bytes_list.value),
+                         len(b_value.bytes_list.value))
+      for (a_value_item, b_value_item) in zip(
+          a_value.bytes_list.value,
+          b_value.bytes_list.value):
+        a_proto.ParseFromString(a_value_item)
+        b_proto.ParseFromString(b_value_item)
+        tester.assertProtoEquals(a_proto, b_proto)
+    else:
+      tester.assertEquals(a_value, b_value)
+  # Compared the fields directly, remove their raw values from the
+  # proto comparison below.
+  a.ClearField("collection_def")
+  b.ClearField("collection_def")
+  tester.assertProtoEquals(a, b)
+
+
 # Matches attributes named via _SHARDED_SUFFIX in
 # tensorflow/python/training/saver.py
 _SHARDED_SAVE_OP_PATTERN = "_temp_[0-9a-z]{32}/part"
@@ -220,7 +257,7 @@ class TensorFlowTestCase(googletest.TestCase):
     """Returns a unique temporary directory for the test to use.
 
     If you call this method multiple times during in a test, it will return the
-    same folder. However, accross different runs the directories will be
+    same folder. However, across different runs the directories will be
     different. This will ensure that across different runs tests will not be
     able to pollute each others environment.
     If you need multiple unique directories within a single test, you should
@@ -534,15 +571,7 @@ class TensorFlowTestCase(googletest.TestCase):
       a = np.array(a)
     return a
 
-  def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
-    """Asserts that two numpy arrays have near values.
-
-    Args:
-      a: a numpy ndarray or anything can be converted to one.
-      b: a numpy ndarray or anything can be converted to one.
-      rtol: relative tolerance.
-      atol: absolute tolerance.
-    """
+  def _assertArrayLikeAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
     self.assertEqual(a.shape, b.shape, "Shape mismatch: expected %s, got %s." %
@@ -570,7 +599,37 @@ class TensorFlowTestCase(googletest.TestCase):
       print("not close dif = ", np.abs(x - y))
       print("not close tol = ", atol + rtol * np.abs(y))
       print("dtype = %s, shape = %s" % (a.dtype, a.shape))
-      np.testing.assert_allclose(a, b, rtol=rtol, atol=atol)
+      np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, err_msg=msg)
+
+  def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6):
+    """Asserts that two numpy arrays, or dicts of same, have near values.
+
+    This does not support nested dicts.
+
+    Args:
+      a: A numpy ndarray (or anything can be converted to one), or dict of same.
+        Must be a dict iff `b` is a dict.
+      b: A numpy ndarray (or anything can be converted to one), or dict of same.
+        Must be a dict iff `a` is a dict.
+      rtol: relative tolerance.
+      atol: absolute tolerance.
+
+    Raises:
+      ValueError: if only one of `a` and `b` is a dict.
+    """
+    is_a_dict = isinstance(a, dict)
+    if is_a_dict != isinstance(b, dict):
+      raise ValueError("Can't compare dict to non-dict, %s vs %s." % (a, b))
+    if is_a_dict:
+      self.assertItemsEqual(
+          a.keys(), b.keys(),
+          msg="mismatched keys, expected %s, got %s" % (a.keys(), b.keys()))
+      for k in a:
+        self._assertArrayLikeAllClose(
+            a[k], b[k], rtol=rtol, atol=atol,
+            msg="%s: expected %s, got %s." % (k, a, b))
+    else:
+      self._assertArrayLikeAllClose(a, b, rtol=rtol, atol=atol)
 
   def assertAllCloseAccordingToType(self,
                                     a,
@@ -724,3 +783,62 @@ class TensorFlowTestCase(googletest.TestCase):
     assertItemsEqual = googletest.TestCase.assertCountEqual
 
     # pylint: enable=invalid-name
+
+
+def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+  """Create and start local servers and return the associated `Server` objects.
+
+  Example:
+  ```python
+  workers, _ = tf.test.create_local_cluster(num_workers=2, num_ps=2)
+
+  worker_sessions = [tf.Session(w.target) for w in workers]
+
+  with tf.device("/job:ps/task:0"):
+    ...
+  with tf.device("/job:ps/task:1"):
+    ...
+  with tf.device("/job:worker/task:0"):
+    ...
+  with tf.device("/job:worker/task:1"):
+    ...
+
+  worker_sessions[0].run(...)
+  ```
+
+  Args:
+    num_workers: Number of worker servers to start.
+    num_ps: Number of PS servers to start.
+    protocol: Communication protocol.  Allowed values are documented in
+      the documentation of `tf.train.Server`.
+
+  Returns:
+    A tuple `(worker_servers, ps_servers)`.  `worker_servers` is a list
+    of `num_workers` objects of type `tf.train.Server` (all running locally);
+    and `ps_servers` is a list of `num_ps` objects of similar type.
+
+  Raises:
+    ImportError: if portpicker module was not found at load time
+  """
+  if not portpicker:
+    raise _portpicker_import_error
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+  cluster_dict = {
+      "worker": ["localhost:%s" % port for port in worker_ports],
+      "ps": ["localhost:%s" % port for port in ps_ports]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+
+  workers = [
+      server_lib.Server(
+          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_workers)
+  ]
+  ps_servers = [
+      server_lib.Server(
+          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+      for ix in range(num_ps)
+  ]
+
+  return workers, ps_servers
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index e457b35f004eea91eb9a13150f5f38dde9f73ef8..6129fa2e0d06e3ac271ace515a0e3ab8fb98ac9d 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -196,7 +196,47 @@ class TestUtilTest(test_util.TensorFlowTestCase):
   def testAllCloseScalars(self):
     self.assertAllClose(7, 7 + 1e-8)
     with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"):
-      self.assertAllClose(7, 8)
+      self.assertAllClose(7, 7 + 1e-5)
+
+  def testAllCloseDictToNonDict(self):
+    with self.assertRaisesRegexp(ValueError, r"Can't compare dict to non-dict"):
+      self.assertAllClose(1, {"a": 1})
+    with self.assertRaisesRegexp(ValueError, r"Can't compare dict to non-dict"):
+      self.assertAllClose({"a": 1}, 1)
+
+  def testAllCloseDicts(self):
+    a = 7
+    b = (2., 3.)
+    c = np.ones((3, 2, 4)) * 7.
+    expected = {"a": a, "b": b, "c": c}
+
+    # Identity.
+    self.assertAllClose(expected, expected)
+    self.assertAllClose(expected, dict(expected))
+
+    # With each item removed.
+    for k in expected:
+      actual = dict(expected)
+      del actual[k]
+      with self.assertRaisesRegexp(AssertionError, r"mismatched keys"):
+        self.assertAllClose(expected, actual)
+
+    # With each item changed.
+    with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"):
+      self.assertAllClose(expected, {"a": a + 1e-5, "b": b, "c": c})
+    with self.assertRaisesRegexp(AssertionError, r"Shape mismatch"):
+      self.assertAllClose(expected, {"a": a, "b": b + (4.,), "c": c})
+    c_copy = np.array(c)
+    c_copy[1, 1, 1] += 1e-5
+    with self.assertRaisesRegexp(AssertionError, r"Not equal to tolerance"):
+      self.assertAllClose(expected, {"a": a, "b": b, "c": c_copy})
+
+  def testAllCloseNestedDicts(self):
+    a = {"a": 1, "b": 2, "nested": {"d": 3, "e": 4}}
+    with self.assertRaisesRegexp(
+        TypeError,
+        r"inputs could not be safely coerced to any supported types"):
+      self.assertAllClose(a, a)
 
   def testArrayNear(self):
     a = [1, 2]
@@ -212,8 +252,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertArrayNear(a, b, 0.001)
 
   def testForceGPU(self):
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "Cannot assign a device to node"):
+    with self.assertRaises(errors.InvalidArgumentError):
       with self.test_session(force_gpu=True):
         # this relies on us not having a GPU implementation for assert, which
         # seems sensible
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..581f17c2ca21d2d1634bdbc695156f66dd1d4b35
--- /dev/null
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -0,0 +1,89 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the swig wrapper tf_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class MemoryOptimizerTest(test.TestCase):
+  """Tests the Grappler memory optimizer."""
+
+  def testNoSwapping(self):
+    """Make sure the graph is preserved when there is nothing to swap."""
+    a = constant_op.constant(10, name='a')
+    b = constant_op.constant(20, name='b')
+    c = math_ops.add_n([a, b], name='c')
+    d = math_ops.add_n([b, c], name='d')
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(d)
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
+    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+
+    self.assertEqual(len(graph.node), 4)
+    self.assertItemsEqual([node.name
+                           for node in graph.node], ['a', 'b', 'c', 'd'])
+
+  def testSimpleSwap(self):
+    """Check that the swap annotations are followed."""
+    a = constant_op.constant(10, name='a')
+    b = constant_op.constant(20, name='b')
+    c = math_ops.add_n([a, b], name='c')
+    d = math_ops.add_n([b, c], name='d')
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(d)
+
+    d.op.node_def.attr['_swap_to_host'].i = 0
+
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
+    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+
+    self.assertEqual(len(graph.node), 6)
+    self.assertItemsEqual([node.name for node in graph.node], [
+        'a',
+        'b',
+        'c',
+        'd',
+        'swap_in_d_0',
+        'swap_out_d_0',
+    ])
+    for node in graph.node:
+      if node.name == 'swap_in_d_0':
+        self.assertEqual('swap_out_d_0', node.input[0])
+        self.assertEqual('^b', node.input[1])
+      elif node.name == 'swap_out_d_0':
+        self.assertEqual('b', node.input[0])
+      elif node.name == 'd':
+        self.assertEqual('swap_in_d_0', node.input[0])
+        self.assertEqual('c', node.input[1])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
new file mode 100644
index 0000000000000000000000000000000000000000..404ce351801464ce9941505b7b51c3b9f009ba2c
--- /dev/null
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -0,0 +1,94 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+%include "tensorflow/python/platform/base.i"
+
+%typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    SWIG_fail;
+  }
+
+  if (!temp.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The MetaGraphDef could not be parsed as a valid protocol buffer");
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
+
+%typemap(in) const tensorflow::RewriterConfig& (
+    tensorflow::RewriterConfig temp) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    SWIG_fail;
+  }
+
+  if (!temp.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The RewriterConfig could not be parsed as a valid protocol buffer");
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
+
+%{
+  #include <memory>
+  #include "tensorflow/c/tf_status_helper.h"
+  #include "tensorflow/core/lib/core/status.h"
+  #include "tensorflow/core/framework/graph.pb.h"
+  #include "tensorflow/core/grappler/grappler_item.h"
+  #include "tensorflow/core/grappler/grappler_item_builder.h"
+  #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+  #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+  #include "tensorflow/core/protobuf/meta_graph.pb.h"
+  #include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+PyObject* TF_OptimizeGraph(
+      const tensorflow::RewriterConfig& rewriter_config,
+      const tensorflow::MetaGraphDef& metagraph,
+      const string& graph_id, TF_Status* out_status) {
+    const tensorflow::grappler::ItemConfig item_config;
+    std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
+        tensorflow::grappler::GrapplerItemFromMetaGraphDef(graph_id, metagraph, item_config);
+    std::unordered_map<string, tensorflow::DeviceProperties> device_map;
+    tensorflow::grappler::VirtualCluster cluster(device_map);
+    tensorflow::GraphDef out_graph;
+    tensorflow::Status status = tensorflow::grappler::RunMetaOptimizer(
+        *grappler_item, rewriter_config, &cluster, &out_graph);
+    tensorflow::Set_TF_Status_from_Status(out_status, status);
+    string out_graph_str = out_graph.SerializeAsString();
+    PyObject* ret = PyBytes_FromStringAndSize(out_graph_str.data(),
+                                              out_graph_str.size());
+    return ret;
+  }
+%}
+
+
+// Wrap this function
+PyObject* TF_OptimizeGraph(
+    const tensorflow::RewriterConfig& rewriter_config,
+    const tensorflow::MetaGraphDef& metagraph,
+    const string& graph_id, TF_Status* out_status);
+
+
+
diff --git a/tensorflow/python/grappler/tf_optimizer.py b/tensorflow/python/grappler/tf_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0464c6054293b8499231526317d5bd42bc88752
--- /dev/null
+++ b/tensorflow/python/grappler/tf_optimizer.py
@@ -0,0 +1,35 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Provides a proper python API for the symbols exported through swig."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python import pywrap_tensorflow as tf_opt
+from tensorflow.python.framework import errors
+
+
+def OptimizeGraph(rewriter_config, metagraph, graph_id=b'graph_to_optimize'):
+  """Optimize the provided metagraph."""
+  with errors.raise_exception_on_not_ok_status() as status:
+    ret_from_swig = tf_opt.TF_OptimizeGraph(rewriter_config.SerializeToString(),
+                                            metagraph.SerializeToString(),
+                                            graph_id, status)
+  if ret_from_swig is None:
+    return None
+  out_graph = graph_pb2.GraphDef().FromString(ret_from_swig)
+  return out_graph
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1efc2dbfbb60f1b606e6268074fe3a1e39c5562
--- /dev/null
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -0,0 +1,53 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the swig wrapper tf_optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class PyWrapOptimizeGraphTest(test.TestCase):
+
+  def testBasic(self):
+    """Make sure arguments can be passed correctly."""
+    a = constant_op.constant(10, name='a')
+    b = constant_op.constant(20, name='b')
+    c = math_ops.add_n([a, b], name='c')
+    d = math_ops.add_n([b, c], name='d')
+    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+    train_op.append(d)
+    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    rewriter_config.optimizers.append('constfold')
+
+    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+
+    self.assertEqual(len(graph.node), 5)
+    self.assertItemsEqual([node.name for node in graph.node],
+                          ['a', 'b', 'c', 'd', 'ConstantFolding/c'])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 333c6b8f2ba2422c34dacfe2f4ad7cad795496f3..88754015a67dc129b83a0ddf94b9f01e655197f1 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -214,6 +214,19 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "decode_bmp_op_test",
+    size = "small",
+    srcs = ["decode_bmp_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:nn_grad",
+    ],
+)
+
 tf_py_test(
     name = "decode_image_op_test",
     size = "small",
@@ -402,6 +415,22 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "lookup_ops_test",
+    size = "small",
+    srcs = ["lookup_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_py_test(
     name = "losses_test",
     size = "medium",
@@ -801,7 +830,7 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "topk_op_test",
     size = "small",
     srcs = ["topk_op_test.py"],
@@ -941,6 +970,19 @@ tf_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "aggregate_ops_test",
+    size = "small",
+    srcs = ["aggregate_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 cuda_py_test(
     name = "argmax_op_test",
     size = "small",
@@ -967,6 +1009,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
         "//tensorflow/python:test_ops",
         "//tensorflow/python:variables",
     ],
@@ -1084,6 +1127,7 @@ cuda_py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:data_flow_ops_gen",
+        "//tensorflow/python:distributed_framework_test_lib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -1534,6 +1578,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -1578,9 +1623,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "scalar_strict_test",
+    name = "scalar_test",
     size = "small",
-    srcs = ["scalar_strict_test.py"],
+    srcs = ["scalar_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
@@ -1774,7 +1819,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "split_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["split_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1863,14 +1908,18 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops_gen",
+        "//tensorflow/python:distributed_framework_test_lib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_grad",
+        "//tensorflow/python:training",
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:variables",
     ],
+    flaky = 1,  # create_local_cluster sometimes times out.
+    tags = ["nomsan"],  # b/38390993
 )
 
 cuda_py_test(
@@ -2057,6 +2106,21 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "neon_depthwise_conv_op_test",
+    size = "medium",
+    srcs = ["neon_depthwise_conv_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_grad",
+        "//tensorflow/python:nn_ops",
+    ],
+)
+
 cuda_py_test(
     name = "division_future_test",
     size = "medium",
@@ -2109,6 +2173,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
     ],
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -2124,6 +2189,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
     ],
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -2273,6 +2339,20 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "map_stage_op_test",
+    size = "small",
+    srcs = ["map_stage_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:data_flow_ops",
+    ],
+)
+
 cuda_py_test(
     name = "concat_op_test",
     size = "medium",
@@ -2331,6 +2411,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:math_ops_gen",
+        "//tensorflow/python:nn_grad",
         "//tensorflow/python:platform",
         "//tensorflow/python:variables",
     ],
@@ -2346,7 +2427,6 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:data_flow_grad",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework",
@@ -2552,6 +2632,30 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "bucketize_op_test",
+    size = "small",
+    srcs = ["bucketize_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+tf_py_test(
+    name = "sparse_cross_op_test",
+    size = "small",
+    srcs = ["sparse_cross_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:sparse_ops",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/python/kernel_tests/aggregate_ops_test.py b/tensorflow/python/kernel_tests/aggregate_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f56917f7e9b5315e9ec2f01607b7bfa99112f53c
--- /dev/null
+++ b/tensorflow/python/kernel_tests/aggregate_ops_test.py
@@ -0,0 +1,79 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for aggregate_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class AddNTest(test.TestCase):
+  # AddN special-cases adding the first M inputs to make (N - M) divisible by 8,
+  # after which it adds the remaining (N - M) tensors 8 at a time in a loop.
+  # Test N in [1, 10] so we check each special-case from 1 to 9 and one
+  # iteration of the loop.
+  _MAX_N = 10
+
+  def _supported_types(self):
+    if test.is_gpu_available():
+      return [dtypes.float16, dtypes.float32, dtypes.float64, dtypes.complex64,
+              dtypes.complex128]
+    return [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+            dtypes.float16, dtypes.float32, dtypes.float64, dtypes.complex64,
+            dtypes.complex128]
+
+  def _buildData(self, shape, dtype):
+    data = np.random.randn(*shape).astype(dtype.as_numpy_dtype)
+    # For complex types, add an index-dependent imaginary component so we can
+    # tell we got the right value.
+    if dtype.is_complex:
+      return data + 10j * data
+    return data
+
+  def testAddN(self):
+    np.random.seed(12345)
+    with self.test_session(use_gpu=True) as sess:
+      for dtype in self._supported_types():
+        for count in range(1, self._MAX_N + 1):
+          data = [self._buildData((2, 2), dtype) for _ in range(count)]
+          actual = sess.run(math_ops.add_n(data))
+          expected = np.sum(np.vstack(
+              [np.expand_dims(d, 0) for d in data]), axis=0)
+          tol = 5e-3 if dtype == dtypes.float16 else 5e-7
+          self.assertAllClose(expected, actual, rtol=tol, atol=tol)
+
+  def testUnknownShapes(self):
+    np.random.seed(12345)
+    with self.test_session(use_gpu=True) as sess:
+      for dtype in self._supported_types():
+        data = self._buildData((2, 2), dtype)
+        for count in range(1, self._MAX_N + 1):
+          data_ph = array_ops.placeholder(dtype=dtype)
+          actual = sess.run(math_ops.add_n([data_ph] * count), {data_ph: data})
+          expected = np.sum(np.vstack([np.expand_dims(data, 0)] * count),
+                            axis=0)
+          tol = 5e-3 if dtype == dtypes.float16 else 5e-7
+          self.assertAllClose(expected, actual, rtol=tol, atol=tol)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index caa4cd22238c5ed03113b0a74c39ad53d9adf480..7b8cd256643c27754724e5110797068cbcc6cc0d 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -33,6 +33,8 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
 
@@ -238,7 +240,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
         self.assertAllEqual(x_tf, x_np)
 
   def _reverse1DimAuto(self, np_dtype):
-    x_np = np.array([1, 2, 3, 4, 5], dtype=np_dtype)
+    x_np = np.array([1, 200, 3, 40, 5], dtype=np_dtype)
 
     for use_gpu in [False, True]:
       with self.test_session(use_gpu=use_gpu):
@@ -246,7 +248,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
         self.assertAllEqual(x_tf, np.asarray(x_np)[::-1])
 
   def _reverse2DimAuto(self, np_dtype):
-    x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np_dtype)
+    x_np = np.array([[1, 200, 3], [4, 5, 60]], dtype=np_dtype)
 
     for reverse_f in [array_ops.reverse_v2, array_ops.reverse]:
       for use_gpu in [False, True]:
@@ -281,14 +283,14 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
   def testReverse1DimAuto(self):
     for dtype in [
         np.uint8, np.int8, np.int32, np.int64, np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128
+        np.float64, np.complex64, np.complex128, np.array(b"").dtype.type
     ]:
       self._reverse1DimAuto(dtype)
 
   def testReverse2DimAuto(self):
     for dtype in [
         np.uint8, np.int8, np.int32, np.int64, np.bool, np.float16, np.float32,
-        np.float64, np.complex64, np.complex128
+        np.float64, np.complex64, np.complex128, np.array(b"").dtype.type
     ]:
       self._reverse2DimAuto(dtype)
 
@@ -396,11 +398,12 @@ class StridedSliceChecker(object):
   REF_TENSOR_ALIGNED = np.arange(1, 97, dtype=np.float32).reshape(3, 4, 8)
 
   def __init__(self, test, x, tensor_type=dtypes.int32, check_type_infer=True):
+    self.x_np = np.array(x).astype(tensor_type.as_numpy_dtype)
+    # Give the value a non-zero imaginary component for complex types.
+    if tensor_type.is_complex:
+      self.x_np -= 1j * self.x_np
     self.test = test
-    self.x = math_ops.cast(
-        constant_op.constant(
-            x, dtype=dtypes.float32), dtype=tensor_type)
-    self.x_np = np.array(x)
+    self.x = constant_op.constant(self.x_np, dtype=tensor_type)
     self.check_type_infer = check_type_infer
 
   def __getitem__(self, spec):
@@ -435,7 +438,8 @@ class StridedSliceChecker(object):
 
 
 STRIDED_SLICE_TYPES = [dtypes.int32, dtypes.int64, dtypes.int16, dtypes.int8,
-                       dtypes.float32, dtypes.float64, dtypes.complex64]
+                       dtypes.float32, dtypes.float64, dtypes.complex64,
+                       dtypes.complex128]
 
 
 class StridedSliceTest(test_util.TensorFlowTestCase):
@@ -443,134 +447,126 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
 
   def test_basic_slice(self):
     for tensor_type in STRIDED_SLICE_TYPES:
-      for use_gpu in [False, True]:
-        with self.test_session(use_gpu=use_gpu):
-          checker = StridedSliceChecker(
-              self, StridedSliceChecker.REF_TENSOR, tensor_type=tensor_type)
-          _ = checker[:, :, :]
-          # Various ways of representing identity slice
-          _ = checker[:, :, :]
-          _ = checker[::, ::, ::]
-          _ = checker[::1, ::1, ::1]
-          # Not zero slice
-          _ = checker[::1, ::5, ::2]
-          # Reverse in each dimension independently
-          _ = checker[::-1, :, :]
-          _ = checker[:, ::-1, :]
-          _ = checker[:, :, ::-1]
-          ## negative index tests i.e. n-2 in first component
-          _ = checker[-2::-1, :, ::1]
-          # negative index tests i.e. n-2 in first component, non-unit stride
-          _ = checker[-2::-1, :, ::2]
-
-          # Check rank-0 examples
-          checker2 = StridedSliceChecker(self, 5, tensor_type=tensor_type)
-          _ = checker2[None]
-          _ = checker2[...]
-          _ = checker2[tuple()]
+      with self.test_session(use_gpu=True):
+        checker = StridedSliceChecker(
+            self, StridedSliceChecker.REF_TENSOR, tensor_type=tensor_type)
+        _ = checker[:, :, :]
+        # Various ways of representing identity slice
+        _ = checker[:, :, :]
+        _ = checker[::, ::, ::]
+        _ = checker[::1, ::1, ::1]
+        # Not zero slice
+        _ = checker[::1, ::5, ::2]
+        # Reverse in each dimension independently
+        _ = checker[::-1, :, :]
+        _ = checker[:, ::-1, :]
+        _ = checker[:, :, ::-1]
+        ## negative index tests i.e. n-2 in first component
+        _ = checker[-2::-1, :, ::1]
+        # negative index tests i.e. n-2 in first component, non-unit stride
+        _ = checker[-2::-1, :, ::2]
+
+        # Check rank-0 examples
+        checker2 = StridedSliceChecker(self, 5, tensor_type=tensor_type)
+        _ = checker2[None]
+        _ = checker2[...]
+        _ = checker2[tuple()]
 
   def testDegenerateSlices(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR)
-        # degenerate by offering a forward interval with a negative stride
-        _ = checker[0:-1:-1, :, :]
-        # degenerate with a reverse interval with a positive stride
-        _ = checker[-1:0, :, :]
-        # empty interval in every dimension
-        _ = checker[-1:0, 2:2, 2:3:-1]
+    with self.test_session(use_gpu=True):
+      checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR)
+      # degenerate by offering a forward interval with a negative stride
+      _ = checker[0:-1:-1, :, :]
+      # degenerate with a reverse interval with a positive stride
+      _ = checker[-1:0, :, :]
+      # empty interval in every dimension
+      _ = checker[-1:0, 2:2, 2:3:-1]
 
   def testEllipsis(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        raw = [[[[[1, 2], [3, 4], [5, 6]]], [[[7, 8], [9, 10], [11, 12]]]]]
-        checker = StridedSliceChecker(self, raw)
-
-        _ = checker[0:]
-        # implicit ellipsis
-        _ = checker[0:, ...]
-        # ellipsis alone
-        _ = checker[...]
-        # ellipsis at end
-        _ = checker[0:1, ...]
-        # ellipsis at begin
-        _ = checker[..., 0:1]
-        # ellipsis at middle
-        _ = checker[0:1, ..., 0:1]
-        # multiple ellipses not allowed
-        with self.assertRaisesRegexp(ValueError, "Multiple ellipses"):
-          _ = checker[..., :, ...].eval()
+    with self.test_session(use_gpu=True):
+      raw = [[[[[1, 2], [3, 4], [5, 6]]], [[[7, 8], [9, 10], [11, 12]]]]]
+      checker = StridedSliceChecker(self, raw)
+
+      _ = checker[0:]
+      # implicit ellipsis
+      _ = checker[0:, ...]
+      # ellipsis alone
+      _ = checker[...]
+      # ellipsis at end
+      _ = checker[0:1, ...]
+      # ellipsis at begin
+      _ = checker[..., 0:1]
+      # ellipsis at middle
+      _ = checker[0:1, ..., 0:1]
+      # multiple ellipses not allowed
+      with self.assertRaisesRegexp(ValueError, "Multiple ellipses"):
+        _ = checker[..., :, ...].eval()
 
   def testShrink(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
-                [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
-        checker = StridedSliceChecker(self, raw)
-        _ = checker[:, :, :, :, 3]
-        _ = checker[..., 3]
-        _ = checker[:, 0]
-        _ = checker[:, :, 0]
+    with self.test_session(use_gpu=True):
+      raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
+              [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
+      checker = StridedSliceChecker(self, raw)
+      _ = checker[:, :, :, :, 3]
+      _ = checker[..., 3]
+      _ = checker[:, 0]
+      _ = checker[:, :, 0]
 
   def testTensorIndexing(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
-                [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
-        checker = StridedSliceChecker(self, raw, check_type_infer=False)
-        bar = constant_op.constant(2)
-        bar2 = constant_op.constant(3)
-        _ = checker[..., bar:bar2]
-        _ = checker[..., bar]
-        with self.assertRaisesRegexp(
-            TypeError,
-            "Value passed to parameter 'begin' has DataType float32 not in "
-            "list of allowed values"):
-          _ = checker[..., 3.0]
-        _ = checker[..., 3]
+    with self.test_session(use_gpu=True):
+      raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
+              [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
+      checker = StridedSliceChecker(self, raw, check_type_infer=False)
+      bar = constant_op.constant(2)
+      bar2 = constant_op.constant(3)
+      _ = checker[..., bar:bar2]
+      _ = checker[..., bar]
+      with self.assertRaisesRegexp(
+          TypeError,
+          "Value passed to parameter 'begin' has DataType float32 not in "
+          "list of allowed values"):
+        _ = checker[..., 3.0]
+      _ = checker[..., 3]
 
   def testExpand(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
-                [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
-        checker = StridedSliceChecker(self, raw)
-        # new axis (followed by implicit ellipsis)
-        _ = checker[np.newaxis]
-        # newaxis after ellipsis
-        _ = checker[..., np.newaxis]
-        # newaxis in between ellipsis and explicit range
-        _ = checker[..., np.newaxis, :]
-        _ = checker[:, ..., np.newaxis, :, :]
-        # Reverse final dimension with new axis
-        _ = checker[:, :, np.newaxis, :, 2::-1]
-        # Ellipsis in middle of two newaxis
-        _ = checker[np.newaxis, ..., np.newaxis]
+    with self.test_session(use_gpu=True):
+      raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
+              [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
+      checker = StridedSliceChecker(self, raw)
+      # new axis (followed by implicit ellipsis)
+      _ = checker[np.newaxis]
+      # newaxis after ellipsis
+      _ = checker[..., np.newaxis]
+      # newaxis in between ellipsis and explicit range
+      _ = checker[..., np.newaxis, :]
+      _ = checker[:, ..., np.newaxis, :, :]
+      # Reverse final dimension with new axis
+      _ = checker[:, :, np.newaxis, :, 2::-1]
+      # Ellipsis in middle of two newaxis
+      _ = checker[np.newaxis, ..., np.newaxis]
 
   def testExpandVariable(self):
-    for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
-        x = variables.Variable(7, dtype=dtypes.int32)
-        x.initializer.run()
-        y = x[None].eval()
-        self.assertEqual(y.shape, (1,))
-        self.assertAllEqual(y, (7,))
+    with self.test_session(use_gpu=True):
+      x = variables.Variable(7, dtype=dtypes.int32)
+      x.initializer.run()
+      y = x[None].eval()
+      self.assertEqual(y.shape, (1,))
+      self.assertAllEqual(y, (7,))
 
   def testOptimizedCases(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        checker = StridedSliceChecker(self,
-                                      StridedSliceChecker.REF_TENSOR_ALIGNED)
-        # Identity
-        _ = checker[:]
-        # Identity
-        _ = checker[...]
-        # Identity
-        _ = checker[np.newaxis, ..., np.newaxis]
-        # First axis slice
-        _ = checker[1:]
-        # First axis slice
-        _ = checker[np.newaxis, 1:]
+    with self.test_session(use_gpu=True):
+      checker = StridedSliceChecker(self,
+                                    StridedSliceChecker.REF_TENSOR_ALIGNED)
+      # Identity
+      _ = checker[:]
+      # Identity
+      _ = checker[...]
+      # Identity
+      _ = checker[np.newaxis, ..., np.newaxis]
+      # First axis slice
+      _ = checker[1:]
+      # First axis slice
+      _ = checker[np.newaxis, 1:]
 
 
 class StridedSliceShapeChecker(object):
@@ -587,7 +583,7 @@ class StridedSliceShapeTest(test_util.TensorFlowTestCase):
   """Test the shape inference of StridedSliceShapes."""
 
   def testUnknown(self):
-    with self.test_session(use_gpu=False):
+    with self.test_session(use_gpu=True):
       uncertain_tensor = array_ops.placeholder(dtypes.float32)
       a = StridedSliceShapeChecker(uncertain_tensor)
       a_slice_shape = a[...]
@@ -598,45 +594,43 @@ class StridedSliceShapeTest(test_util.TensorFlowTestCase):
     self.assertEqual(x.as_list(), y.as_list())
 
   def testTensorShapeUncertain(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        uncertain_tensor = array_ops.placeholder(
-            dtypes.float32, shape=(5, None, 7))
-        a = StridedSliceShapeChecker(uncertain_tensor)
-        self.tensorShapeEqual(a[3:5], tensor_shape.TensorShape([2, None, 7]))
-        self.tensorShapeEqual(a[3:5, :, 4], tensor_shape.TensorShape([2, None]))
-        self.tensorShapeEqual(a[3:5, 3:4, 4],
-                              tensor_shape.TensorShape([2, None]))
-        self.tensorShapeEqual(a[3:5, :, 5:10],
-                              tensor_shape.TensorShape([2, None, 2]))
-        self.tensorShapeEqual(a[3:5, :, 50:3],
-                              tensor_shape.TensorShape([2, None, 0]))
-        self.tensorShapeEqual(a[3:5, :, array_ops.newaxis, 50:3,],
-                              tensor_shape.TensorShape([2, None, 1, 0]))
-        self.tensorShapeEqual(a[1:5:2, :, array_ops.newaxis, 50:3,],
-                              tensor_shape.TensorShape([2, None, 1, 0]))
-        self.tensorShapeEqual(a[:5:3, :, array_ops.newaxis, 50:3,],
-                              tensor_shape.TensorShape([2, None, 1, 0]))
-        self.tensorShapeEqual(a[:2:3, :, array_ops.newaxis, 50:3,],
-                              tensor_shape.TensorShape([1, None, 1, 0]))
-        self.tensorShapeEqual(a[::-1, :, array_ops.newaxis, ::-2],
-                              tensor_shape.TensorShape([5, None, 1, 4]))
+    with self.test_session(use_gpu=True):
+      uncertain_tensor = array_ops.placeholder(
+          dtypes.float32, shape=(5, None, 7))
+      a = StridedSliceShapeChecker(uncertain_tensor)
+      self.tensorShapeEqual(a[3:5], tensor_shape.TensorShape([2, None, 7]))
+      self.tensorShapeEqual(a[3:5, :, 4], tensor_shape.TensorShape([2, None]))
+      self.tensorShapeEqual(a[3:5, 3:4, 4],
+                            tensor_shape.TensorShape([2, None]))
+      self.tensorShapeEqual(a[3:5, :, 5:10],
+                            tensor_shape.TensorShape([2, None, 2]))
+      self.tensorShapeEqual(a[3:5, :, 50:3],
+                            tensor_shape.TensorShape([2, None, 0]))
+      self.tensorShapeEqual(a[3:5, :, array_ops.newaxis, 50:3,],
+                            tensor_shape.TensorShape([2, None, 1, 0]))
+      self.tensorShapeEqual(a[1:5:2, :, array_ops.newaxis, 50:3,],
+                            tensor_shape.TensorShape([2, None, 1, 0]))
+      self.tensorShapeEqual(a[:5:3, :, array_ops.newaxis, 50:3,],
+                            tensor_shape.TensorShape([2, None, 1, 0]))
+      self.tensorShapeEqual(a[:2:3, :, array_ops.newaxis, 50:3,],
+                            tensor_shape.TensorShape([1, None, 1, 0]))
+      self.tensorShapeEqual(a[::-1, :, array_ops.newaxis, ::-2],
+                            tensor_shape.TensorShape([5, None, 1, 4]))
 
   def testTensorValuedIndexShape(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
-        defined_shape_tensor = array_ops.placeholder(
-            dtypes.float32, shape=(5, 3, 7))
-        index_value = array_ops.placeholder(dtypes.int32, shape=())
-        a = StridedSliceShapeChecker(defined_shape_tensor)
-        self.tensorShapeEqual(a[index_value], tensor_shape.TensorShape([3, 7]))
-        self.tensorShapeEqual(a[index_value, ::-1],
-                              tensor_shape.TensorShape([3, 7]))
-        self.tensorShapeEqual(a[index_value, ::-2],
-                              tensor_shape.TensorShape([2, 7]))
-        other_scalar = array_ops.placeholder(dtypes.int32, shape=())
-        self.tensorShapeEqual(a[index_value, other_scalar:2],
-                              tensor_shape.TensorShape([None, 7]))
+    with self.test_session(use_gpu=True):
+      defined_shape_tensor = array_ops.placeholder(
+          dtypes.float32, shape=(5, 3, 7))
+      index_value = array_ops.placeholder(dtypes.int32, shape=())
+      a = StridedSliceShapeChecker(defined_shape_tensor)
+      self.tensorShapeEqual(a[index_value], tensor_shape.TensorShape([3, 7]))
+      self.tensorShapeEqual(a[index_value, ::-1],
+                            tensor_shape.TensorShape([3, 7]))
+      self.tensorShapeEqual(a[index_value, ::-2],
+                            tensor_shape.TensorShape([2, 7]))
+      other_scalar = array_ops.placeholder(dtypes.int32, shape=())
+      self.tensorShapeEqual(a[index_value, other_scalar:2],
+                            tensor_shape.TensorShape([None, 7]))
 
 
 class GradSliceChecker(object):
@@ -681,35 +675,33 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
   """Test that strided slice's custom gradient produces correct gradients."""
 
   def testGradient(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu) as sess:
-        var = variables.Variable(
-            array_ops.reshape(
-                math_ops.range(1, 97, 1), shape=(6, 4, 4)))
-        init = variables.global_variables_initializer()
-        sess.run(init)
-
-        grad = GradSliceChecker(self, sess, var,
-                                np.array(range(1, 97, 1)).reshape((6, 4, 4)))
-        _ = grad[2:6:2, 1:3, 1:3]
-        _ = grad[3:0:-2, 1:3, 1:3]
-        _ = grad[3:0:-2, array_ops.newaxis, 1:3, 2, array_ops.newaxis]
-        _ = grad[3:0:-2, 1:3, 2]
-        _ = grad[:, -1, :]
-        _ = grad[:, -2, :]
-        with self.assertRaisesRegexp(ValueError, "out of bounds"):
-          _ = grad[:, -200, :]
-        with self.assertRaisesRegexp(ValueError, "out of bounds"):
-          _ = grad[:, 200, :]
+    with self.test_session(use_gpu=True) as sess:
+      var = variables.Variable(
+          array_ops.reshape(
+              math_ops.range(1, 97, 1), shape=(6, 4, 4)))
+      init = variables.global_variables_initializer()
+      sess.run(init)
+
+      grad = GradSliceChecker(self, sess, var,
+                              np.array(range(1, 97, 1)).reshape((6, 4, 4)))
+      _ = grad[2:6:2, 1:3, 1:3]
+      _ = grad[3:0:-2, 1:3, 1:3]
+      _ = grad[3:0:-2, array_ops.newaxis, 1:3, 2, array_ops.newaxis]
+      _ = grad[3:0:-2, 1:3, 2]
+      _ = grad[:, -1, :]
+      _ = grad[:, -2, :]
+      with self.assertRaisesRegexp(ValueError, "out of bounds"):
+        _ = grad[:, -200, :]
+      with self.assertRaisesRegexp(ValueError, "out of bounds"):
+        _ = grad[:, 200, :]
 
   def testGradientZero(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu) as sess:
-        var = variables.Variable(8)
-        init = variables.global_variables_initializer()
-        sess.run(init)
-        grad = GradSliceChecker(self, sess, var, np.array(8))
-        _ = grad[tuple()]
+    with self.test_session(use_gpu=True) as sess:
+      var = variables.Variable(8)
+      init = variables.global_variables_initializer()
+      sess.run(init)
+      grad = GradSliceChecker(self, sess, var, np.array(8))
+      _ = grad[tuple()]
 
 
 class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
@@ -816,25 +808,37 @@ class StridedSliceBenchmark(test_lib.Benchmark):
 
 class StridedSliceAssignChecker(object):
 
-  def __init__(self, test, x, tensor_type=dtypes.float32):
+  def __init__(self, test, x, tensor_type=dtypes.float32, use_resource=False):
     self.tensor_type = tensor_type
     self.test = test
-    self.x = math_ops.cast(
-        constant_op.constant(
-            x, dtype=dtypes.float32), dtype=tensor_type)
-    self.x_np = np.array(x)
+    self._use_resource = use_resource
+
+    self.x_np = np.array(x).astype(tensor_type.as_numpy_dtype)
+    # Give the value a non-zero imaginary component for complex types.
+    if tensor_type.is_complex:
+      self.x_np -= 1j * self.x_np
+    self.x = constant_op.constant(self.x_np, dtype=tensor_type)
 
   def __setitem__(self, index, value):
-    for use_gpu in [False, True]:
-      with self.test.test_session(use_gpu=use_gpu) as sess:
+    value = np.array(value).astype(self.tensor_type.as_numpy_dtype)
+    # Give the value a non-zero imaginary component for complex types.
+    if self.tensor_type.is_complex:
+      value -= 1j * value
+
+    with self.test.test_session(use_gpu=True) as sess:
+      if self._use_resource:
+        var = resource_variable_ops.ResourceVariable(self.x)
+      else:
         var = variables.Variable(self.x)
-        sess.run(variables.initialize_variables([var]))
-        val = sess.run(var[index].assign(
-            constant_op.constant(
-                value, dtype=self.tensor_type)))
-        valnp = np.copy(self.x_np)
-        valnp[index] = np.array(value)
-        self.test.assertAllEqual(val, valnp)
+      sess.run(variables.initialize_variables([var]))
+      val = sess.run(var[index].assign(value))
+      # val_copy is used to check that tf.assign works equivalently to the
+      # assign method above.
+      val_copy = sess.run(state_ops.assign(var[index], value))
+      valnp = np.copy(self.x_np)
+      valnp[index] = np.array(value)
+      self.test.assertAllEqual(val, valnp)
+      self.test.assertAllEqual(val_copy, valnp)
 
 
 class SliceAssignTest(test_util.TensorFlowTestCase):
@@ -847,9 +851,10 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
         bar = foo[:2].assign(constant_op.constant([1, 2]))
         sess.run(bar)
 
-  def testSliceAssign(self):
+  def doTestSliceAssign(self, use_resource):
     for dtype in STRIDED_SLICE_TYPES:
       checker = StridedSliceAssignChecker(self, [[1, 2, 3], [4, 5, 6]],
+                                          use_resource=use_resource,
                                           tensor_type=dtype)
       # Check if equal
       checker[:] = [[10, 20, 30], [40, 50, 60]]
@@ -874,6 +879,12 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     checker2[...] = 6  # ellipsis
     checker2[None] = [6]  # new axis
 
+  def testSliceAssign(self):
+    self.doTestSliceAssign(use_resource=False)
+
+  def testSliceAssignResource(self):
+    self.doTestSliceAssign(use_resource=True)
+
   def testUninitialized(self):
     with self.assertRaisesRegexp(
         errors.FailedPreconditionError,
diff --git a/tensorflow/python/kernel_tests/barrier_ops_test.py b/tensorflow/python/kernel_tests/barrier_ops_test.py
index e90543a44b0022476f660f7c6915ff117f01374f..7f49c639577b590758c542e96f9d7f38994f2f71 100644
--- a/tensorflow/python/kernel_tests/barrier_ops_test.py
+++ b/tensorflow/python/kernel_tests/barrier_ops_test.py
@@ -402,7 +402,7 @@ class BarrierTest(test.TestCase):
       with self.assertRaisesOpError("is closed"):
         fail_insert_op.run()
 
-      # This op should succeed because the barrier has not cancelled
+      # This op should succeed because the barrier has not canceled
       # pending enqueues
       insert_1_op.run()
       self.assertEquals(size_t.eval(), [3])
@@ -461,7 +461,7 @@ class BarrierTest(test.TestCase):
       with self.assertRaisesOpError("is closed"):
         fail_insert_op.run()
 
-      # This op should fail because the queue is cancelled.
+      # This op should fail because the queue is canceled.
       with self.assertRaisesOpError("is closed"):
         insert_2_op.run()
 
diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py
index afdb436dc685bb4b07f2e0509c3c950138b8fb49..08b03f851803a34dd050721e47471bafd1cd6cac 100644
--- a/tensorflow/python/kernel_tests/betainc_op_test.py
+++ b/tensorflow/python/kernel_tests/betainc_op_test.py
@@ -25,76 +25,78 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
 class BetaincTest(test.TestCase):
-  use_gpu = False
 
-  def _testBetaInc(self, dtype):
+  def _testBetaInc(self, a_s, b_s, x_s, dtype):
     try:
       from scipy import special  # pylint: disable=g-import-not-at-top
       np_dt = dtype.as_numpy_dtype
 
       # Test random values
-      a_s = np.abs(np.random.randn(10, 10) * 30).astype(np_dt)  # in (0, infty)
-      b_s = np.abs(np.random.randn(10, 10) * 30).astype(np_dt)  # in (0, infty)
-      x_s = np.random.rand(10, 10).astype(np_dt)  # in (0, 1)
-      with self.test_session(use_gpu=self.use_gpu):
-        tf_a_s = constant_op.constant(a_s, dtype=dtype)
-        tf_b_s = constant_op.constant(b_s, dtype=dtype)
-        tf_x_s = constant_op.constant(x_s, dtype=dtype)
-        tf_out = math_ops.betainc(tf_a_s, tf_b_s, tf_x_s).eval()
+      a_s = a_s.astype(np_dt)  # in (0, infty)
+      b_s = b_s.astype(np_dt)  # in (0, infty)
+      x_s = x_s.astype(np_dt)  # in (0, 1)
+      tf_a_s = constant_op.constant(a_s, dtype=dtype)
+      tf_b_s = constant_op.constant(b_s, dtype=dtype)
+      tf_x_s = constant_op.constant(x_s, dtype=dtype)
+      tf_out_t = math_ops.betainc(tf_a_s, tf_b_s, tf_x_s)
+      with self.test_session():
+        tf_out = tf_out_t.eval()
       scipy_out = special.betainc(a_s, b_s, x_s).astype(np_dt)
 
       # the scipy version of betainc uses a double-only implementation.
       # TODO(ebrevdo): identify reasons for (sometime) precision loss
       # with doubles
       tol = 1e-4 if dtype == dtypes.float32 else 5e-5
-      self.assertAllCloseAccordingToType(scipy_out, tf_out, rtol=tol, atol=tol)
+      self.assertAllCloseAccordingToType(scipy_out, tf_out, rtol=tol, atol=0)
 
       # Test out-of-range values (most should return nan output)
       combinations = list(itertools.product([-1, 0, 0.5, 1.0, 1.5], repeat=3))
       a_comb, b_comb, x_comb = np.asarray(list(zip(*combinations)), dtype=np_dt)
-      with self.test_session(use_gpu=self.use_gpu):
+      with self.test_session():
         tf_comb = math_ops.betainc(a_comb, b_comb, x_comb).eval()
       scipy_comb = special.betainc(a_comb, b_comb, x_comb).astype(np_dt)
       self.assertAllCloseAccordingToType(scipy_comb, tf_comb)
 
       # Test broadcasting between scalars and other shapes
-      with self.test_session(use_gpu=self.use_gpu):
+      with self.test_session():
         self.assertAllCloseAccordingToType(
             special.betainc(0.1, b_s, x_s).astype(np_dt),
             math_ops.betainc(0.1, b_s, x_s).eval(),
             rtol=tol,
-            atol=tol)
+            atol=0)
         self.assertAllCloseAccordingToType(
             special.betainc(a_s, 0.1, x_s).astype(np_dt),
             math_ops.betainc(a_s, 0.1, x_s).eval(),
             rtol=tol,
-            atol=tol)
+            atol=0)
         self.assertAllCloseAccordingToType(
             special.betainc(a_s, b_s, 0.1).astype(np_dt),
             math_ops.betainc(a_s, b_s, 0.1).eval(),
             rtol=tol,
-            atol=tol)
+            atol=0)
         self.assertAllCloseAccordingToType(
             special.betainc(0.1, b_s, 0.1).astype(np_dt),
             math_ops.betainc(0.1, b_s, 0.1).eval(),
             rtol=tol,
-            atol=tol)
+            atol=0)
         self.assertAllCloseAccordingToType(
             special.betainc(0.1, 0.1, 0.1).astype(np_dt),
             math_ops.betainc(0.1, 0.1, 0.1).eval(),
             rtol=tol,
-            atol=tol)
+            atol=0)
 
       with self.assertRaisesRegexp(ValueError, "must be equal"):
         math_ops.betainc(0.5, [0.5], [[0.5]])
 
-      with self.test_session(use_gpu=self.use_gpu):
+      with self.test_session():
         with self.assertRaisesOpError("Shapes of .* are inconsistent"):
           a_p = array_ops.placeholder(dtype)
           b_p = array_ops.placeholder(dtype)
@@ -108,14 +110,79 @@ class BetaincTest(test.TestCase):
       tf_logging.warn("Cannot test special functions: %s" % str(e))
 
   def testBetaIncFloat(self):
-    self._testBetaInc(dtypes.float32)
+    a_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
+    b_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
+    x_s = np.random.rand(10, 10)  # in (0, 1)
+    self._testBetaInc(a_s, b_s, x_s, dtypes.float32)
 
   def testBetaIncDouble(self):
-    self._testBetaInc(dtypes.float64)
-
-
-class BetaincTestGPU(BetaincTest):
-  use_gpu = True
+    a_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
+    b_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
+    x_s = np.random.rand(10, 10)  # in (0, 1)
+    self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
+
+  def testBetaIncDoubleVeryLargeValues(self):
+    a_s = np.abs(np.random.randn(10, 10) * 1e15)  # in (0, infty)
+    b_s = np.abs(np.random.randn(10, 10) * 1e15)  # in (0, infty)
+    x_s = np.random.rand(10, 10)  # in (0, 1)
+    self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
+
+  def testBetaIncDoubleVerySmallValues(self):
+    a_s = np.abs(np.random.randn(10, 10) * 1e-16)  # in (0, infty)
+    b_s = np.abs(np.random.randn(10, 10) * 1e-16)  # in (0, infty)
+    x_s = np.random.rand(10, 10)  # in (0, 1)
+    self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
+
+  def testBetaIncFloatVerySmallValues(self):
+    a_s = np.abs(np.random.randn(10, 10) * 1e-8)  # in (0, infty)
+    b_s = np.abs(np.random.randn(10, 10) * 1e-8)  # in (0, infty)
+    x_s = np.random.rand(10, 10)  # in (0, 1)
+    self._testBetaInc(a_s, b_s, x_s, dtypes.float32)
+
+  def testBetaIncFpropAndBpropAreNeverNAN(self):
+    with self.test_session() as sess:
+      space = np.logspace(-8, 5).tolist()
+      space_x = np.linspace(1e-16, 1 - 1e-16).tolist()
+      ga_s, gb_s, gx_s = zip(*list(itertools.product(space, space, space_x)))
+      # Test grads are never nan
+      ga_s_t = constant_op.constant(ga_s, dtype=dtypes.float32)
+      gb_s_t = constant_op.constant(gb_s, dtype=dtypes.float32)
+      gx_s_t = constant_op.constant(gx_s, dtype=dtypes.float32)
+      tf_gout_t = math_ops.betainc(ga_s_t, gb_s_t, gx_s_t)
+      tf_gout, grads_x = sess.run(
+          [tf_gout_t,
+           gradients_impl.gradients(tf_gout_t, [ga_s_t, gb_s_t, gx_s_t])[2]])
+
+      # Equivalent to `assertAllFalse` (if it existed).
+      self.assertAllEqual(np.zeros_like(grads_x).astype(np.bool),
+                          np.isnan(tf_gout))
+      self.assertAllEqual(np.zeros_like(grads_x).astype(np.bool),
+                          np.isnan(grads_x))
+
+  def testBetaIncGrads(self):
+    err_tolerance = 1e-3
+    with self.test_session():
+      # Test gradient
+      ga_s = np.abs(np.random.randn(2, 2) * 30)  # in (0, infty)
+      gb_s = np.abs(np.random.randn(2, 2) * 30)  # in (0, infty)
+      gx_s = np.random.rand(2, 2)  # in (0, 1)
+      tf_ga_s = constant_op.constant(ga_s, dtype=dtypes.float64)
+      tf_gb_s = constant_op.constant(gb_s, dtype=dtypes.float64)
+      tf_gx_s = constant_op.constant(gx_s, dtype=dtypes.float64)
+      tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s)
+      err = gradient_checker.compute_gradient_error(
+          [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape)
+      print("betainc gradient err = %g " % err)
+      self.assertLess(err, err_tolerance)
+
+      # Test broadcast gradient
+      gx_s = np.random.rand()  # in (0, 1)
+      tf_gx_s = constant_op.constant(gx_s, dtype=dtypes.float64)
+      tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s)
+      err = gradient_checker.compute_gradient_error(
+          [tf_gx_s], [()], tf_gout_t, ga_s.shape)
+      print("betainc gradient err = %g " % err)
+      self.assertLess(err, err_tolerance)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py
index 2a1641b10fa188702934468f593307e7d0bf533c..7a610debd1d0c94cf7529e6c386f06fdfb11402f 100644
--- a/tensorflow/python/kernel_tests/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/bincount_op_test.py
@@ -73,13 +73,13 @@ class BincountTest(test_util.TensorFlowTestCase):
         else:
           weights = np.random.random(num_samples)
         self.assertAllEqual(
-            math_ops.bincount(arr, weights=weights).eval(),
+            math_ops.bincount(arr, weights).eval(),
             np.bincount(arr, weights))
 
   def test_zero_weights(self):
     with self.test_session():
       self.assertAllEqual(
-          math_ops.bincount(np.arange(1000), weights=np.zeros(1000)).eval(),
+          math_ops.bincount(np.arange(1000), np.zeros(1000)).eval(),
           np.zeros(1000))
 
   def test_negative(self):
diff --git a/tensorflow/contrib/layers/python/kernel_tests/bucketization_op_test.py b/tensorflow/python/kernel_tests/bucketize_op_test.py
similarity index 55%
rename from tensorflow/contrib/layers/python/kernel_tests/bucketization_op_test.py
rename to tensorflow/python/kernel_tests/bucketize_op_test.py
index abc6cc5674ce69fa2d7b27cdad773e9d29ee938e..ed53cc62940650c7312ea49afebf585ca2d705d0 100644
--- a/tensorflow/contrib/layers/python/kernel_tests/bucketization_op_test.py
+++ b/tensorflow/python/kernel_tests/bucketize_op_test.py
@@ -12,35 +12,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for bucketization_op."""
+"""Tests for bucketize_op."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.layers.python.ops import bucketization_op
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 class BucketizationOpTest(test.TestCase):
 
-  def test_normal_usecase(self):
-    op = bucketization_op.bucketize(
+  def testInt(self):
+    op = math_ops._bucketize(
         constant_op.constant([-5, 0, 2, 3, 5, 8, 10, 11, 12]),
         boundaries=[0, 3, 8, 11])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
     with self.test_session() as sess:
       self.assertAllEqual(expected_out, sess.run(op))
 
-  def test_invalid_boundaries_order(self):
-    op = bucketization_op.bucketize(
+  def testFloat(self):
+    op = math_ops._bucketize(
+        constant_op.constant([-5., 0., 2., 3., 5., 8., 10., 11., 12.]),
+        boundaries=[0., 3., 8., 11.])
+    expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
+    with self.test_session() as sess:
+      self.assertAllEqual(expected_out, sess.run(op))
+
+  def test2DInput(self):
+    op = math_ops._bucketize(
+        constant_op.constant([[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]),
+        boundaries=[0, 3, 8, 11])
+    expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]]
+    with self.test_session() as sess:
+      self.assertAllEqual(expected_out, sess.run(op))
+
+  def testInvalidBoundariesOrder(self):
+    op = math_ops._bucketize(
         constant_op.constant([-5, 0]), boundaries=[0, 8, 3, 11])
     with self.test_session() as sess:
-      with self.assertRaises(errors_impl.InvalidArgumentError):
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError, "Expected sorted boundaries"):
         sess.run(op)
 
+  def testBoundariesNotList(self):
+    with self.assertRaisesRegexp(
+        TypeError, "Expected list for attr boundaries"):
+      math_ops._bucketize(constant_op.constant([-5, 0]), boundaries=0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index d688b1647837a55ea331ea2dbe11fcdff352dca6..ed859e37741fe391c2f003a038a64eb292e385f1 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -80,23 +80,35 @@ class AssertEqualTest(test.TestCase):
 
   def test_raises_when_greater(self):
     with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 4], name="big")
+      # Static check
+      static_small = constant_op.constant([1, 2], name="small")
+      static_big = constant_op.constant([3, 4], name="big")
+      with self.assertRaisesRegexp(ValueError, "fail"):
+        check_ops.assert_equal(static_big, static_small, message="fail")
+      # Dynamic check
+      small = array_ops.placeholder(dtypes.int32, name="small")
+      big = array_ops.placeholder(dtypes.int32, name="big")
       with ops.control_dependencies(
           [check_ops.assert_equal(
               big, small, message="fail")]):
         out = array_ops.identity(small)
       with self.assertRaisesOpError("fail.*big.*small"):
-        out.eval()
+        out.eval(feed_dict={small: [1, 2], big: [3, 4]})
 
   def test_raises_when_less(self):
     with self.test_session():
-      small = constant_op.constant([3, 1], name="small")
-      big = constant_op.constant([4, 2], name="big")
+      # Static check
+      static_small = constant_op.constant([3, 1], name="small")
+      static_big = constant_op.constant([4, 2], name="big")
+      with self.assertRaisesRegexp(ValueError, "fail"):
+        check_ops.assert_equal(static_big, static_small, message="fail")
+      # Dynamic check
+      small = array_ops.placeholder(dtypes.int32, name="small")
+      big = array_ops.placeholder(dtypes.int32, name="big")
       with ops.control_dependencies([check_ops.assert_equal(small, big)]):
         out = array_ops.identity(small)
       with self.assertRaisesOpError("small.*big"):
-        out.eval()
+        out.eval(feed_dict={small: [3, 1], big: [4, 2]})
 
   def test_doesnt_raise_when_equal_and_broadcastable_shapes(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index bbe1d052f032b16a4eb614472eb2dba604bcc417..d95200ec92a204fddc03b4d991c157233bc9f322 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -48,13 +48,15 @@ class CholeskyOpTest(test.TestCase):
 
   def _verifyCholesky(self, x):
     # Verify that LL^T == x.
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       chol = linalg_ops.cholesky(x)
       verification = math_ops.matmul(chol, chol, adjoint_b=True)
       self._verifyCholeskyBase(sess, x, chol, verification)
 
   def testBasic(self):
-    self._verifyCholesky(np.array([[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]))
+    for dtype in (np.float32, np.float64):
+      self._verifyCholesky(
+          np.array([[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]).astype(dtype))
 
   def testBatch(self):
     simple_array = np.array([[[1., 0.], [0., 5.]]])  # shape (1, 2, 2)
@@ -84,11 +86,12 @@ class CholeskyOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       linalg_ops.cholesky(tensor3)
 
-  def testNotInvertible(self):
+  def testNotInvertibleCPU(self):
     # The input should be invertible.
-    with self.test_session():
-      with self.assertRaisesOpError("LLT decomposition was not successful. The"
-                                    " input might not be valid."):
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesOpError(
+          "Cholesky decomposition was not successful. The"
+          " input might not be valid."):
         # All rows of the matrix below add to zero
         self._verifyCholesky(
             np.array([[1., -1., 0.], [-1., 1., -1.], [0., -1., 1.]]))
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index a9cd0dd726b96115f229cf2c815ea7ae3cbe9266..0bb5b551555ae9234afbd79ba69668c1d4f8d1ee 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.platform import test
 class ConcatOpTest(test.TestCase):
 
   def testHStack(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       p2 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       c = array_ops.concat([p1, p2], 0)
@@ -50,7 +50,7 @@ class ConcatOpTest(test.TestCase):
     self.assertAllEqual(result[4:, :], params[p2])
 
   def testVStack(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       p2 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       c = array_ops.concat([p1, p2], 1)
@@ -76,7 +76,7 @@ class ConcatOpTest(test.TestCase):
     self.assertAllEqual(result[2:, :], p2)
 
   def testRefType(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       p1 = np.random.rand(4, 4).astype("f")
       p2 = np.random.rand(4, 4).astype("f")
       v1 = variables.Variable(p1)
@@ -89,7 +89,7 @@ class ConcatOpTest(test.TestCase):
     self.assertAllEqual(result[:4, :], p1)
     self.assertAllEqual(result[4:, :], p2)
 
-  def _testRandom(self, dtype, use_gpu=False):
+  def _testRandom(self, dtype):
     # Random dims of rank 5
     shape = np.random.randint(1, 5, size=5)
     # Random number of tensors, but always > 1.
@@ -101,7 +101,7 @@ class ConcatOpTest(test.TestCase):
       dtype_feed = dtypes.float32
     else:
       dtype_feed = dtype
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=True):
       p = []
       for i in np.arange(num_tensors):
         input_shape = shape
@@ -139,11 +139,11 @@ class ConcatOpTest(test.TestCase):
 
   def testRandom(self):
     self._testRandom(dtypes.float32)
-    self._testRandom(dtypes.float32, use_gpu=True)
     self._testRandom(dtypes.int16)
-    self._testRandom(dtypes.int32, use_gpu=True)
+    self._testRandom(dtypes.int32)
     self._testRandom(dtypes.bfloat16)
-    self._testRandom(dtypes.bfloat16, use_gpu=True)
+    self._testRandom(dtypes.complex64)
+    self._testRandom(dtypes.complex128)
 
   def testInvalidConcatDimTypeAndShape(self):
     a = variables.Variable(constant_op.constant(1.0, shape=[1]))
@@ -166,38 +166,42 @@ class ConcatOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       array_ops.concat(1, constant_op.constant(0, shape=[1]))
 
-  def _testGradientsSimple(self, use_gpu):
+  def _testGradientsSimple(self, dtype):
     # Test both positive and negative concat axis.
     # -2 and 1 correspond to the same axis for 3-dimensional tensors.
     for axis in [-2, 1]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.test_session(use_gpu=True):
         inp = []
         inp_tensors = []
         for x in [1, 2, 6]:
           shape = [10, x, 2]
-          t = np.random.rand(*shape).astype("f")
+          t = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
+          if dtype.is_complex:
+            t += -1j * t
           inp.append(t)
           inp_tensors.append(
               constant_op.constant(
-                  [float(y) for y in t.flatten()],
+                  t.flatten(),
                   shape=shape,
-                  dtype=dtypes.float32))
+                  dtype=dtype))
         c = array_ops.concat(inp_tensors, axis)
         output_shape = [10, 9, 2]
-        grad_inp = np.random.rand(*output_shape).astype("f")
+        grad_inp = np.random.rand(*output_shape).astype(dtype.as_numpy_dtype)
+        if dtype.is_complex:
+          grad_inp += -1j * grad_inp
         grad_tensor = constant_op.constant(
-            [float(x) for x in grad_inp.flatten()], shape=output_shape)
+            grad_inp.flatten(), shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, axis)
         result = concated_grad.eval()
     self.assertAllEqual(result, grad_inp)
 
-  def testGradientsSimpleAll(self):
-    self._testGradientsSimple(use_gpu=True)
-    self._testGradientsSimple(use_gpu=False)
+  def testGradientsSimple(self):
+    self._testGradientsSimple(dtypes.float32)
+    self._testGradientsSimple(dtypes.complex64)
 
-  def _testGradientsFirstDim(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+  def testGradientsFirstDim(self):
+    with self.test_session(use_gpu=True):
       inp = []
       inp_tensors = []
       for x in [1, 2, 6]:
@@ -206,29 +210,25 @@ class ConcatOpTest(test.TestCase):
         inp.append(t)
         inp_tensors.append(
             constant_op.constant(
-                [float(y) for y in t.flatten()],
+                t.flatten(),
                 shape=shape,
                 dtype=dtypes.float32))
       c = array_ops.concat(inp_tensors, 0)
       output_shape = [9, 10, 2]
       grad_inp = np.random.rand(*output_shape).astype("f")
       grad_tensor = constant_op.constant(
-          [float(x) for x in grad_inp.flatten()], shape=output_shape)
+          grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, 0)
       result = concated_grad.eval()
 
     self.assertAllEqual(result, grad_inp)
 
-  def testGradientsFirstDimAll(self):
-    self._testGradientsFirstDim(use_gpu=False)
-    self._testGradientsFirstDim(use_gpu=True)
-
-  def _testGradientsLastDim(self, use_gpu):
+  def testGradientsLastDim(self):
     # Test both positive and negative concat axis.
     # -1 and 2 correspond to the same axis for 3-dimensional tensors.
     for axis in [-1, 2]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.test_session(use_gpu=True):
         inp = []
         inp_tensors = []
         for x in [1, 2, 6]:
@@ -237,25 +237,21 @@ class ConcatOpTest(test.TestCase):
           inp.append(t)
           inp_tensors.append(
               constant_op.constant(
-                  [float(y) for y in t.flatten()],
+                  t.flatten(),
                   shape=shape,
                   dtype=dtypes.float32))
         c = array_ops.concat(inp_tensors, 2)
         output_shape = [10, 2, 9]
         grad_inp = np.random.rand(*output_shape).astype("f")
         grad_tensor = constant_op.constant(
-            [float(x) for x in grad_inp.flatten()], shape=output_shape)
+            grad_inp.flatten(), shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, axis)
         result = concated_grad.eval()
 
     self.assertAllEqual(result, grad_inp)
 
-  def testGradientsLastDimAll(self):
-    self._testGradientsLastDim(use_gpu=False)
-    self._testGradientsLastDim(use_gpu=True)
-
-  def _RunAndVerifyGradientsRandom(self, use_gpu):
+  def _RunAndVerifyGradientsRandom(self):
     # Random dims of rank 5
     input_shape = np.random.randint(1, 5, size=5)
     # Random number of tensors
@@ -263,7 +259,7 @@ class ConcatOpTest(test.TestCase):
     # Random dim to concat on
     concat_dim = np.random.randint(5)
     concat_dim_sizes = np.random.randint(1, 5, size=num_tensors)
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=True):
       inp = []
       inp_tensors = []
       for x in concat_dim_sizes:
@@ -272,16 +268,13 @@ class ConcatOpTest(test.TestCase):
         t = np.random.rand(*shape).astype("f")
         inp.append(t)
         inp_tensors.append(
-            constant_op.constant(
-                [float(y) for y in t.flatten()],
-                shape=shape,
-                dtype=dtypes.float32))
+            constant_op.constant(t.flatten(), shape=shape,
+                                 dtype=dtypes.float32))
       c = array_ops.concat(inp_tensors, concat_dim)
       output_shape = input_shape
       output_shape[concat_dim] = concat_dim_sizes.sum()
       grad_inp = np.random.rand(*output_shape).astype("f")
-      grad_tensor = constant_op.constant(
-          [float(x) for x in grad_inp.flatten()], shape=output_shape)
+      grad_tensor = constant_op.constant(grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, concat_dim)
       result = concated_grad.eval()
@@ -290,8 +283,7 @@ class ConcatOpTest(test.TestCase):
 
   def testGradientsRandom(self):
     for _ in range(5):
-      self._RunAndVerifyGradientsRandom(use_gpu=False)
-      self._RunAndVerifyGradientsRandom(use_gpu=True)
+      self._RunAndVerifyGradientsRandom()
 
   def testGradientWithUnknownInputDim(self):
     with self.test_session(use_gpu=True):
@@ -302,7 +294,7 @@ class ConcatOpTest(test.TestCase):
       output_shape = [10, 2, 9]
       grad_inp = np.random.rand(*output_shape).astype("f")
       grad_tensor = constant_op.constant(
-          [float(inp) for inp in grad_inp.flatten()], shape=output_shape)
+          grad_inp.flatten(), shape=output_shape)
 
       grad = gradients_impl.gradients([c], [x, y], [grad_tensor])
       concated_grad = array_ops.concat(grad, 2)
@@ -364,24 +356,23 @@ class ConcatOpTest(test.TestCase):
   def testZeroSize(self):
     # Verify that concat doesn't crash and burn for zero size inputs
     np.random.seed(7)
-    for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu) as sess:
-        for shape0 in (), (2,):
-          axis = len(shape0)
-          for shape1 in (), (3,):
-            for n0 in 0, 1, 2:
-              for n1 in 0, 1, 2:
-                x0 = np.random.randn(*(shape0 + (n0,) + shape1))
-                x1 = np.random.randn(*(shape0 + (n1,) + shape1))
-                correct = np.concatenate([x0, x1], axis=axis)
-                # TODO(irving): Make tf.concat handle map, then drop list().
-                xs = list(map(constant_op.constant, [x0, x1]))
-                c = array_ops.concat(xs, axis)
-                self.assertAllEqual(c.eval(), correct)
-                # Check gradients
-                dc = np.random.randn(*c.get_shape().as_list())
-                dxs = sess.run(gradients_impl.gradients(c, xs, dc))
-                self.assertAllEqual(dc, np.concatenate(dxs, axis=axis))
+    with self.test_session(use_gpu=True) as sess:
+      for shape0 in (), (2,):
+        axis = len(shape0)
+        for shape1 in (), (3,):
+          for n0 in 0, 1, 2:
+            for n1 in 0, 1, 2:
+              x0 = np.random.randn(*(shape0 + (n0,) + shape1))
+              x1 = np.random.randn(*(shape0 + (n1,) + shape1))
+              correct = np.concatenate([x0, x1], axis=axis)
+              # TODO(irving): Make tf.concat handle map, then drop list().
+              xs = list(map(constant_op.constant, [x0, x1]))
+              c = array_ops.concat(xs, axis)
+              self.assertAllEqual(c.eval(), correct)
+              # Check gradients
+              dc = np.random.randn(*c.get_shape().as_list())
+              dxs = sess.run(gradients_impl.gradients(c, xs, dc))
+              self.assertAllEqual(dc, np.concatenate(dxs, axis=axis))
 
   def testTensorConcatDim0Grad(self):
     x_shapes = [[20, 7, 3], [10, 7, 3], [14, 7, 3]]
@@ -565,7 +556,7 @@ class ConcatOpTest(test.TestCase):
       c = array_ops.concat(inp_tensors, axis)
       grad_inp = np.random.rand(*output_shape).astype("f")
       grad_tensor = constant_op.constant(
-          [float(x) for x in grad_inp.flatten()], shape=output_shape)
+          grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, axis)
       result = concated_grad.eval(feed_dict=feed_dict)
@@ -578,7 +569,7 @@ class ConcatOpTest(test.TestCase):
           array_ops.concat(inp_tensors, axis), gather_indexes)
       grad_inp = np.random.rand(*output_shape).astype("f")
       grad_tensor = constant_op.constant(
-          [float(x) for x in grad_inp.flatten()], shape=output_shape)
+          grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.gather(
           array_ops.concat(grad, axis), gather_indexes)
@@ -617,15 +608,14 @@ class ConcatOpTest(test.TestCase):
 class ConcatOffsetTest(test.TestCase):
 
   def testBasic(self):
-    for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu) as sess:
-        cdim = constant_op.constant(1, dtypes.int32)
-        s0 = constant_op.constant([2, 3, 5], dtypes.int32)
-        s1 = constant_op.constant([2, 7, 5], dtypes.int32)
-        s2 = constant_op.constant([2, 20, 5], dtypes.int32)
-        off = gen_array_ops._concat_offset(cdim, [s0, s1, s2])
-        ans = sess.run(off)
-        self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
+    with self.test_session(use_gpu=True) as sess:
+      cdim = constant_op.constant(1, dtypes.int32)
+      s0 = constant_op.constant([2, 3, 5], dtypes.int32)
+      s1 = constant_op.constant([2, 7, 5], dtypes.int32)
+      s2 = constant_op.constant([2, 20, 5], dtypes.int32)
+      off = gen_array_ops._concat_offset(cdim, [s0, s1, s2])
+      ans = sess.run(off)
+      self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
   def testNotVector(self):
     with self.test_session() as sess:
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 128a6529f0d31ffbb7f0a3cf04773c05e3abec09..40c6a9e614dfd1897732c7b3808cb73a8de9c84d 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -20,9 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
+from google.protobuf import text_format
+
+from tensorflow.core.framework import graph_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -226,6 +230,29 @@ class AsTensorTest(test.TestCase):
       self.assertEqual(dtypes_lib.int32, x.dtype)
       self.assertAllEqual([1, 2, 3], x.eval())
 
+      x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31-1, 2, 3]))
+      self.assertEqual(dtypes_lib.int32, x.dtype)
+      self.assertAllEqual([2**31-1, 2, 3], x.eval())
+
+      x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31-1, 2, 3]),
+                                dtype=dtypes_lib.int32)
+      self.assertEqual(dtypes_lib.int32, x.dtype)
+      self.assertAllEqual([2**31-1, 2, 3], x.eval())
+
+      x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]))
+      self.assertEqual(dtypes_lib.int64, x.dtype)
+      self.assertAllEqual([2**31, 2, 3], x.eval())
+
+      x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]),
+                                dtype=dtypes_lib.int64)
+      self.assertEqual(dtypes_lib.int64, x.dtype)
+      self.assertAllEqual([2**31, 2, 3], x.eval())
+
+      with self.assertRaisesRegexp(
+          ValueError, "a dimension is too large .2147483648."):
+        x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]),
+                                  dtype=dtypes_lib.int32)
+
       x = ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3]), dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
@@ -346,7 +373,7 @@ class ZerosTest(test.TestCase):
 
 class ZerosLikeTest(test.TestCase):
 
-  def _compareZeros(self, dtype, use_gpu):
+  def _compareZeros(self, dtype, fully_defined_shape, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       # Creates a tensor of non-zero values with shape 2 x 3.
       # NOTE(kearnes): The default numpy dtype associated with tf.string is
@@ -357,16 +384,24 @@ class ZerosLikeTest(test.TestCase):
         numpy_dtype = np.string_
       else:
         numpy_dtype = dtype.as_numpy_dtype
-      d = constant_op.constant(np.ones((2, 3), dtype=numpy_dtype), dtype=dtype)
+      if fully_defined_shape:
+        d = constant_op.constant(
+            np.ones((2, 3), dtype=numpy_dtype), dtype=dtype)
+      else:
+        d = array_ops.placeholder(dtype=dtype)
       # Constructs a tensor of zeros of the same dimensions and type as "d".
       z_var = array_ops.zeros_like(d)
       # Test that the type is correct
       self.assertEqual(z_var.dtype, dtype)
       # Test that the shape is correct
-      self.assertEqual([2, 3], z_var.get_shape())
+      if fully_defined_shape:
+        self.assertEqual([2, 3], z_var.get_shape())
 
       # Test that the value is correct
-      z_value = z_var.eval()
+      feed_dict = {}
+      if not fully_defined_shape:
+        feed_dict[d] = np.ones((2, 3), dtype=numpy_dtype)
+      z_value = z_var.eval(feed_dict=feed_dict)
       self.assertFalse(np.any(z_value))
       self.assertEqual((2, 3), z_value.shape)
 
@@ -377,14 +412,16 @@ class ZerosLikeTest(test.TestCase):
         dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.int64,
         dtypes_lib.string
     ]:
-      self._compareZeros(dtype, False)
+      self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
+      self._compareZeros(dtype, fully_defined_shape=True, use_gpu=False)
 
   def testZerosLikeGPU(self):
     for dtype in [
         dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
         dtypes_lib.bool, dtypes_lib.int64, dtypes_lib.string
     ]:
-      self._compareZeros(dtype, True)
+      self._compareZeros(dtype, fully_defined_shape=False, use_gpu=True)
+      self._compareZeros(dtype, fully_defined_shape=True, use_gpu=True)
 
   def testZerosLikePartialShape(self):
     d = array_ops.placeholder(dtypes_lib.float32, shape=[None, 4, None])
@@ -589,18 +626,6 @@ class FillTest(test.TestCase):
 class PlaceholderTest(test.TestCase):
 
   def testDtype(self):
-    with self.test_session():
-      p = array_ops.placeholder(dtypes_lib.float32, name="p")
-      p_identity = array_ops.identity(p)
-      feed_array = np.random.rand(10, 10)
-      self.assertAllClose(
-          p_identity.eval(feed_dict={p: feed_array}), feed_array)
-
-      with self.assertRaisesOpError(
-          "must feed a value for placeholder tensor 'p' with dtype float"):
-        p_identity.eval()
-
-  def testShape(self):
     with self.test_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=(10, 10), name="p")
       p_identity = array_ops.identity(p)
@@ -608,67 +633,13 @@ class PlaceholderTest(test.TestCase):
       self.assertAllClose(
           p_identity.eval(feed_dict={p: feed_array}), feed_array)
 
-      with self.assertRaisesOpError(
-          "must feed a value for placeholder tensor 'p' with dtype float and "
-          r"shape \[10,10\]"):
-        p_identity.eval()
-
-      with self.assertRaisesWithPredicateMatch(
-          ValueError, lambda e: "Cannot feed value of shape" in str(e)):
-        p_identity.eval(feed_dict={p: feed_array[:5, :5]})
-
-  def testPartialShape(self):
-    with self.test_session():
-      p = array_ops.placeholder(dtypes_lib.float32, shape=[None, 3], name="p")
-      p_identity = array_ops.identity(p)
-      feed_array = np.random.rand(10, 3)
-      self.assertAllClose(
-          p_identity.eval(feed_dict={p: feed_array}), feed_array)
-
-      with self.assertRaisesWithPredicateMatch(
-          ValueError, lambda e: "Cannot feed value of shape" in str(e)):
-        p_identity.eval(feed_dict={p: feed_array[:5, :2]})
-
-  def testControlDependency(self):
-    with self.test_session():
-      p = array_ops.placeholder(dtypes_lib.int32, shape=[], name="p")
-      with ops.control_dependencies([p]):
-        c = constant_op.constant(5, dtypes_lib.int32)
-      d = math_ops.multiply(p, c)
-      self.assertEqual(10, d.eval(feed_dict={p: 2}))
-
-  def testBadShape(self):
-    with self.assertRaises(ValueError):
-      array_ops.placeholder(dtypes_lib.float32, shape=(-1, 10))
-
-  def testTensorStr(self):
-    a = array_ops.placeholder(dtypes_lib.float32, name="a")
-    self.assertEqual("<tf.Tensor 'a:0' shape=<unknown> dtype=float32>", repr(a))
-
-    b = array_ops.placeholder(dtypes_lib.int32, shape=(32, 40), name="b")
-    self.assertEqual("<tf.Tensor 'b:0' shape=(32, 40) dtype=int32>", repr(b))
-
-    c = array_ops.placeholder(dtypes_lib.qint32, shape=(32, None, 2), name="c")
-    self.assertEqual("<tf.Tensor 'c:0' shape=(32, ?, 2) dtype=qint32>", repr(c))
-
-
-class PlaceholderV2Test(test.TestCase):
-
-  def testDtype(self):
-    with self.test_session():
-      p = array_ops.placeholder_v2(dtypes_lib.float32, shape=None, name="p")
-      p_identity = array_ops.identity(p)
-      feed_array = np.random.rand(10, 10)
-      self.assertAllClose(
-          p_identity.eval(feed_dict={p: feed_array}), feed_array)
-
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float"):
         p_identity.eval()
 
   def testShape(self):
     with self.test_session():
-      p = array_ops.placeholder_v2(dtypes_lib.float32, shape=(10, 10), name="p")
+      p = array_ops.placeholder(dtypes_lib.float32, shape=(10, 10), name="p")
       p_identity = array_ops.identity(p)
       feed_array = np.random.rand(10, 10)
       self.assertAllClose(
@@ -685,7 +656,7 @@ class PlaceholderV2Test(test.TestCase):
 
   def testUnknownShape(self):
     with self.test_session():
-      p = array_ops.placeholder_v2(dtypes_lib.float32, shape=None, name="p")
+      p = array_ops.placeholder(dtypes_lib.float32, shape=None, name="p")
       p_identity = array_ops.identity(p)
       # can feed anything
       feed_array = np.random.rand(10, 3)
@@ -697,14 +668,13 @@ class PlaceholderV2Test(test.TestCase):
 
   def testScalarShape(self):
     with self.test_session():
-      p = array_ops.placeholder_v2(dtypes_lib.float32, shape=[], name="p")
+      p = array_ops.placeholder(dtypes_lib.float32, shape=[], name="p")
       p_identity = array_ops.identity(p)
       self.assertAllClose(p_identity.eval(feed_dict={p: 5}), 5)
 
   def testPartialShape(self):
     with self.test_session():
-      p = array_ops.placeholder_v2(
-          dtypes_lib.float32, shape=[None, 3], name="p")
+      p = array_ops.placeholder(dtypes_lib.float32, shape=[None, 3], name="p")
       p_identity = array_ops.identity(p)
       feed_array = np.random.rand(10, 3)
       self.assertAllClose(
@@ -716,7 +686,7 @@ class PlaceholderV2Test(test.TestCase):
 
   def testControlDependency(self):
     with self.test_session():
-      p = array_ops.placeholder_v2(dtypes_lib.int32, shape=[], name="p")
+      p = array_ops.placeholder(dtypes_lib.int32, shape=[], name="p")
       with ops.control_dependencies([p]):
         c = constant_op.constant(5, dtypes_lib.int32)
       d = math_ops.multiply(p, c)
@@ -725,19 +695,94 @@ class PlaceholderV2Test(test.TestCase):
 
   def testBadShape(self):
     with self.assertRaises(ValueError):
-      array_ops.placeholder_v2(dtypes_lib.float32, shape=(-1, 10))
+      array_ops.placeholder(dtypes_lib.float32, shape=(-1, 10))
 
   def testTensorStr(self):
-    a = array_ops.placeholder_v2(dtypes_lib.float32, shape=None, name="a")
+    a = array_ops.placeholder(dtypes_lib.float32, shape=None, name="a")
     self.assertEqual("<tf.Tensor 'a:0' shape=<unknown> dtype=float32>", repr(a))
 
-    b = array_ops.placeholder_v2(dtypes_lib.int32, shape=(32, 40), name="b")
+    b = array_ops.placeholder(dtypes_lib.int32, shape=(32, 40), name="b")
     self.assertEqual("<tf.Tensor 'b:0' shape=(32, 40) dtype=int32>", repr(b))
 
-    c = array_ops.placeholder_v2(
-        dtypes_lib.qint32, shape=(32, None, 2), name="c")
+    c = array_ops.placeholder(dtypes_lib.qint32, shape=(32, None, 2), name="c")
     self.assertEqual("<tf.Tensor 'c:0' shape=(32, ?, 2) dtype=qint32>", repr(c))
 
+  def testOldGraph(self):
+    # Load graph generated from earlier version of TF where
+    # placeholder shape was not set.
+    #
+    # a = tf.placeholder(tf.float32)
+    # b = a + 1.0
+    #
+    # Older graph's default shape is 'shape {}', not 'shape {
+    # unknown_rank: true }'
+    graph = """
+node {
+  name: "Placeholder"
+  op: "Placeholder"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+      }
+    }
+  }
+}
+node {
+  name: "add/y"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "Placeholder"
+  input: "add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 21
+}
+"""
+    gdef = graph_pb2.GraphDef()
+    text_format.Merge(graph, gdef)
+    with self.test_session():
+      p, ret = importer.import_graph_def(
+          gdef, return_elements=["Placeholder:0", "add:0"])
+
+      # Feed in a vector of two elements.  Since the producer version
+      # of 21, a shape of {} is interpreted as "any shape".  If
+      # producer version were 22, then we'd get a shape mismatch
+      # error.
+      self.assertAllEqual([2.0, 3.0], ret.eval(feed_dict={p: [1.0, 2.0]}))
+
 
 class PlaceholderWithDefaultTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 6c7cbbff9cbcf8debfa54d51a42b27cc318dc368..d4ab4ca7aa4ac8538a252e0646995b1f090a7ed8 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -45,6 +45,8 @@ from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -196,7 +198,7 @@ class ControlFlowTest(test.TestCase):
 
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
-          lambda e: "The tensor returned for" in str(e)):
+          lambda e: "Retval[0] does not have value" in str(e)):
         dead_branch.eval()
 
   def testSwitchMergeLess(self):
@@ -323,6 +325,15 @@ class ControlFlowTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError, "must not be a Python bool"):
       _ = control_flow_ops.cond(False, fn1, fn2)
 
+  def testCondInt(self):
+    p = array_ops.placeholder(dtypes.bool, shape=[])
+    v = constant_op.constant(10)
+    fn1 = lambda: math_ops.add(v, 1)
+    fn2 = lambda: math_ops.subtract(v, 1)
+    y = control_flow_ops.cond(p, fn1, fn2)
+    grad = gradients_impl.gradients(y, [v])
+    self.assertAllEqual([None], grad)
+
   def testFetchables(self):
     with self.test_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -2139,6 +2150,29 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, y)[0]
       self.assertEqual(388.0, r.eval())
 
+  def testStopGradMultiFlows(self):
+    with self.test_session():
+      def body(i, y, r):
+        x = variable_scope.get_variable(
+            "x", shape=(), dtype=dtypes.float32,
+            initializer=init_ops.ones_initializer())
+        y *= x
+        return [i + 1, y, r + math_ops.reduce_sum(y)]
+
+      i0 = constant_op.constant(0)
+      y0 = array_ops.ones(5)
+      r0 = constant_op.constant(0.0)
+      cond = lambda i, y, r: i < 1
+      _, _, r = control_flow_ops.while_loop(
+          cond, body, [i0, y0, r0], back_prop=True)
+
+      vars_ = variables.global_variables()
+      grads = linalg_ops.norm(gradients_impl.gradients(r, vars_)[0])
+      z = math_ops.add(r, array_ops.stop_gradient(math_ops.reduce_sum(grads)))
+      result = gradients_impl.gradients(z, vars_)[0]
+      variables.global_variables_initializer().run()
+      self.assertEqual(5.0, result.eval())
+
   def testOneValueCond(self):
     with self.test_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index 04c43ef5fa482d8c43b8ddf2daeb1ddf52a2bbca..14622ab4678864cd21257fe293a7984b39e59204 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -330,7 +330,7 @@ class Conv3DTest(test.TestCase):
 
     if test.is_gpu_available() and use_gpu:
       data_type = dtypes.float32
-      # TOOD(mjanusz): Modify gradient_checker to also provide max relative
+      # TODO(mjanusz): Modify gradient_checker to also provide max relative
       # error and synchronize the tolerance levels between the tests for forward
       # and backward computations.
       if test.is_gpu_available():
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index cfe2754b3233a27f71fdc9acc727747842b03f1a..0846470abc6c0be452a836da93f66dea803ea5c0 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
@@ -165,6 +166,9 @@ class UnaryOpTest(test.TestCase):
   def _sigmoid(self, x):
     return 1.0 / (1.0 + np.exp(-x))
 
+  def _log_sigmoid(self, x):
+    return np.log(self._sigmoid(x))
+
   def _replace_domain_error_with_inf(self, fn):
 
     def func(x):
@@ -198,6 +202,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(z, np.log1p, math_ops.log1p)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     self._compareBoth(x, self._sigmoid, math_ops.sigmoid)
+    self._compareBoth(x, self._log_sigmoid, math_ops.log_sigmoid)
     self._compareBoth(y, np.sign, math_ops.sign)
     self._compareBoth(x, np.sin, math_ops.sin)
     self._compareBoth(x, np.cos, math_ops.cos)
@@ -372,10 +377,10 @@ class UnaryOpTest(test.TestCase):
     x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3,
                                                     2).astype(np.complex64)
     y = x + 0.5  # no zeros
-    self._compareCpu(x, np.abs, math_ops.abs)
-    self._compareCpu(x, np.abs, _ABS)
-    self._compareCpu(x, np.negative, math_ops.negative)
-    self._compareCpu(x, np.negative, _NEG)
+    self._compareBoth(x, np.abs, math_ops.abs)
+    self._compareBoth(x, np.abs, _ABS)
+    self._compareBoth(x, np.negative, math_ops.negative)
+    self._compareBoth(x, np.negative, _NEG)
     self._compareCpu(y, self._inv, math_ops.reciprocal)
     self._compareCpu(x, np.square, math_ops.square)
     self._compareCpu(y, np.sqrt, math_ops.sqrt)
@@ -399,17 +404,17 @@ class UnaryOpTest(test.TestCase):
     def complex_sign(x):
       return x / np.abs(x)
 
-    self._compareCpu(y, complex_sign, math_ops.sign)
+    self._compareBoth(y, complex_sign, math_ops.sign)
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
   def testComplex128Basic(self):
     x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3,
                                                     2).astype(np.complex128)
     y = x + 0.5  # no zeros
-    self._compareCpu(x, np.abs, math_ops.abs)
-    self._compareCpu(x, np.abs, _ABS)
-    self._compareCpu(x, np.negative, math_ops.negative)
-    self._compareCpu(x, np.negative, _NEG)
+    self._compareBoth(x, np.abs, math_ops.abs)
+    self._compareBoth(x, np.abs, _ABS)
+    self._compareBoth(x, np.negative, math_ops.negative)
+    self._compareBoth(x, np.negative, _NEG)
     self._compareCpu(y, self._inv, math_ops.reciprocal)
     self._compareCpu(x, np.square, math_ops.square)
     self._compareCpu(y, np.sqrt, math_ops.sqrt)
@@ -433,7 +438,7 @@ class UnaryOpTest(test.TestCase):
     def complex_sign(x):
       return x / np.abs(x)
 
-    self._compareCpu(y, complex_sign, math_ops.sign)
+    self._compareBoth(y, complex_sign, math_ops.sign)
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
   def testGradGrad(self):
@@ -585,7 +590,8 @@ class BinaryOpTest(test.TestCase):
 
   def _compareBoth(self, x, y, np_func, tf_func, also_compare_variables=False):
     self._compareCpu(x, y, np_func, tf_func, also_compare_variables)
-    if x.dtype in (np.float16, np.float32, np.float64):
+    if x.dtype in (np.float16, np.float32, np.float64, np.complex64,
+                   np.complex128):
       if tf_func not in (_FLOORDIV, math_ops.floordiv, math_ops.igamma,
                          math_ops.igammac, math_ops.zeta, math_ops.polygamma):
         self._compareGradientX(x, y, np_func, tf_func)
@@ -609,6 +615,13 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.multiply, _MUL)
     self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV)
     self._compareBoth(x, y + 0.1, np.floor_divide, _FLOORDIV)
+    self._compareBoth(x, y, np.arctan2, math_ops.atan2)
+    x1 = np.random.randn(5, 6).astype(np.float32)
+    x2 = np.random.randn(5, 6).astype(np.float32)
+    # Remove tiny values--atan2 gradients are flaky near the origin.
+    x1[np.abs(x1) < 0.05] = 0.05 * np.sign(x1[np.abs(x1) < 0.05])
+    x2[np.abs(x2) < 0.05] = 0.05 * np.sign(x2[np.abs(x2) < 0.05])
+    self._compareBoth(x1, x2, np.arctan2, math_ops.atan2)
     try:
       from scipy import special  # pylint: disable=g-import-not-at-top
       a_pos_small = np.linspace(0.1, 2, 15).reshape(1, 3, 5).astype(np.float32)
@@ -666,6 +679,13 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.multiply, _MUL)
     self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV)
     self._compareBoth(x, y + 0.1, np.floor_divide, _FLOORDIV)
+    self._compareBoth(x, y, np.arctan2, math_ops.atan2)
+    x1 = np.random.randn(7, 4).astype(np.float64)
+    x2 = np.random.randn(7, 4).astype(np.float64)
+    # Remove tiny values--atan2 gradients are flaky near the origin.
+    x1[np.abs(x1) < 0.5] = 0.5 * np.sign(x1[np.abs(x1) < 0.5])
+    x2[np.abs(x2) < 0.5] = 0.5 * np.sign(x2[np.abs(x2) < 0.5])
+    self._compareBoth(x1, x2, np.arctan2, math_ops.atan2)
     try:
       from scipy import special  # pylint: disable=g-import-not-at-top
       a_pos_small = np.linspace(0.1, 2, 15).reshape(1, 3, 5).astype(np.float32)
@@ -1084,11 +1104,24 @@ class BinaryOpTest(test.TestCase):
           error = gradient_checker.compute_gradient_error(y, [], z, [])
           self.assertLess(error, 2e-4)
 
+  def testAtan2SpecialValues(self):
+    x1l, x2l = zip((+0.0, +0.0), (+0.0, -0.0), (-0.0, +0.0), (-0.0, -0.0),
+                   (1.2345, float("inf")), (1.2345, -float("inf")),
+                   (-4.321, float("inf")), (-4.125, -float("inf")),
+                   (float("inf"), float("inf")), (float("inf"), -float("inf")),
+                   (-float("inf"), float("inf")), (-float("inf"),
+                                                   -float("inf")))
+    for dtype in np.float32, np.float64:
+      x1 = np.array(x1l).astype(dtype)
+      x2 = np.array(x2l).astype(dtype)
+      self._compareCpu(x1, x2, np.arctan2, math_ops.atan2)
+      self._compareGpu(x1, x2, np.arctan2, math_ops.atan2)
+
 
 class ComparisonOpTest(test.TestCase):
 
-  def _compare(self, func, x, y, dtype):
-    with self.test_session(use_gpu=False):
+  def _compareScalar(self, func, x, y, dtype):
+    with self.test_session(use_gpu=True):
       out = func(
           ops.convert_to_tensor(np.array([x]).astype(dtype)),
           ops.convert_to_tensor(np.array([y]).astype(dtype)))
@@ -1101,38 +1134,30 @@ class ComparisonOpTest(test.TestCase):
     for t in dtypes:
       for x in data:
         for y in data:
-          self.assertEqual(self._compare(math_ops.less, x, y, t), x < y)
-          self.assertEqual(self._compare(math_ops.less_equal, x, y, t), x <= y)
-          self.assertEqual(self._compare(math_ops.greater, x, y, t), x > y)
+          self.assertEqual(self._compareScalar(math_ops.less, x, y, t), x < y)
+          self.assertEqual(self._compareScalar(math_ops.less_equal, x, y, t),
+                           x <= y)
+          self.assertEqual(self._compareScalar(math_ops.greater, x, y, t),
+                           x > y)
           self.assertEqual(
-              self._compare(math_ops.greater_equal, x, y, t), x >= y)
-          self.assertEqual(self._compare(math_ops.equal, x, y, t), x == y)
-          self.assertEqual(self._compare(math_ops.not_equal, x, y, t), x != y)
+              self._compareScalar(math_ops.greater_equal, x, y, t), x >= y)
+          self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y)
+          self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t),
+                           x != y)
     data = [-1, 0, 1, -1j, 1j, 1 + 1j, 1 - 1j]
     for t in [np.complex64, np.complex128]:
       for x in data:
         for y in data:
-          self.assertEqual(self._compare(math_ops.equal, x, y, t), x == y)
-          self.assertEqual(self._compare(math_ops.not_equal, x, y, t), x != y)
-
-  def _compareCpu(self, x, y, np_func, tf_func):
-    np_ans = np_func(x, y)
-    with self.test_session(use_gpu=False):
-      out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
-      tf_cpu = out.eval()
-    self.assertAllEqual(np_ans, tf_cpu)
+          self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y)
+          self.assertEqual(self._compareScalar(math_ops.not_equal, x, y, t),
+                           x != y)
 
-  def _compareGpu(self, x, y, np_func, tf_func):
+  def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
     with self.test_session(use_gpu=True):
       out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
-      tf_gpu = out.eval()
-    self.assertAllEqual(np_ans, tf_gpu)
-
-  def _compareBoth(self, x, y, np_func, tf_func):
-    self._compareCpu(x, y, np_func, tf_func)
-    if x.dtype == np.float16 or x.dtype == np.float32 or x.dtype == np.float64:
-      self._compareGpu(x, y, np_func, tf_func)
+      tf_ans = out.eval()
+    self.assertAllEqual(np_ans, tf_ans)
 
   def testTensorCompareTensor(self):
     x = np.linspace(-15, 15, 6).reshape(1, 3, 2)
@@ -1140,28 +1165,31 @@ class ComparisonOpTest(test.TestCase):
     for t in [np.float16, np.float32, np.float64, np.int32, np.int64]:
       xt = x.astype(t)
       yt = y.astype(t)
-      self._compareBoth(xt, yt, np.less, math_ops.less)
-      self._compareBoth(xt, yt, np.less_equal, math_ops.less_equal)
-      self._compareBoth(xt, yt, np.greater, math_ops.greater)
-      self._compareBoth(xt, yt, np.greater_equal, math_ops.greater_equal)
-      self._compareBoth(xt, yt, np.equal, math_ops.equal)
-      self._compareBoth(xt, yt, np.not_equal, math_ops.not_equal)
-    # TODO(zhifengc): complex64 doesn't work on GPU yet.
+      self._compare(xt, yt, np.less, math_ops.less)
+      self._compare(xt, yt, np.less_equal, math_ops.less_equal)
+      self._compare(xt, yt, np.greater, math_ops.greater)
+      self._compare(xt, yt, np.greater_equal, math_ops.greater_equal)
+      self._compare(xt, yt, np.equal, math_ops.equal)
+      self._compare(xt, yt, np.not_equal, math_ops.not_equal)
+    # Complex types do not support ordering but do support equality tests.
     for t in [np.complex64, np.complex128]:
-      self._compareCpu(x.astype(t), y.astype(t), np.equal, math_ops.equal)
-      self._compareCpu(
-          x.astype(t), y.astype(t), np.not_equal, math_ops.not_equal)
+      xt = x.astype(t)
+      xt -= 1j * xt
+      yt = y.astype(t)
+      yt -= 1j * yt
+      self._compare(xt, yt, np.equal, math_ops.equal)
+      self._compare(xt, yt, np.not_equal, math_ops.not_equal)
 
   def _compareBCast(self, xs, ys, dtype, np_func, tf_func):
     x = np.linspace(-15, 15, np.prod(xs)).astype(dtype).reshape(xs)
     y = np.linspace(20, -10, np.prod(ys)).astype(dtype).reshape(ys)
-    self._compareCpu(x, y, np_func, tf_func)
-    self._compareCpu(y, x, np_func, tf_func)
-    if x.dtype == np.float16 or x.dtype == np.float32 or x.dtype == np.float64:
-      self._compareGpu(x, y, np_func, tf_func)
-      self._compareGpu(y, x, np_func, tf_func)
+    if dtype in (np.complex64, np.complex128):
+      x -= 1j * x
+      y -= 1j * y
+    self._compare(x, y, np_func, tf_func)
+    self._compare(y, x, np_func, tf_func)
 
-  def _testBCastByFunc(self, np_func, tf_func):
+  def _testBCastByFunc(self, np_func, tf_func, include_complex=False):
     shapes = [
         ([1, 3, 2], [1]),
         ([1, 3, 2], [2]),
@@ -1182,6 +1210,9 @@ class ComparisonOpTest(test.TestCase):
         np.int32,
         np.int64,
     ]
+    if include_complex:
+      dtypes.extend([np.complex64, np.complex128])
+
     for (xs, ys) in shapes:
       for dtype in dtypes:
         self._compareBCast(xs, ys, dtype, np_func, tf_func)
@@ -1199,10 +1230,11 @@ class ComparisonOpTest(test.TestCase):
     self._testBCastByFunc(np.greater_equal, math_ops.greater_equal)
 
   def testBCastEqual(self):
-    self._testBCastByFunc(np.equal, math_ops.equal)
+    self._testBCastByFunc(np.equal, math_ops.equal, include_complex=True)
 
   def testBCastNotEqual(self):
-    self._testBCastByFunc(np.not_equal, math_ops.not_equal)
+    self._testBCastByFunc(np.not_equal, math_ops.not_equal,
+                          include_complex=True)
 
   def testShapeMismatch(self):
     dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64]
diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..783492a6f255b7e665615e91d0d1db380e42b7a9
--- /dev/null
+++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DecodeBmpOp."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import image_ops
+from tensorflow.python.platform import test
+
+
+
+class DecodeBmpOpTest(test.TestCase):
+
+  def testex1(self):
+    img_bytes = [[[0, 0, 255], [0, 255, 0]], [[255, 0, 0], [255, 255, 255]]]
+    # Encoded BMP bytes from Wikipedia
+    encoded_bytes = [
+        0x42, 0x40,
+        0x46, 0, 0, 0,
+        0, 0,
+        0, 0,
+        0x36, 0, 0, 0,
+        0x28, 0, 0, 0,
+        0x2, 0, 0, 0,
+        0x2, 0, 0, 0,
+        0x1, 0,
+        0x18, 0,
+        0, 0, 0, 0,
+        0x10, 0, 0, 0,
+        0x13, 0xb, 0, 0,
+        0x13, 0xb, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0xff,
+        0xff, 0xff, 0xff,
+        0, 0,
+        0xff, 0, 0,
+        0, 0xff, 0,
+        0, 0,
+    ]
+
+    byte_string = bytes(bytearray(encoded_bytes))
+    img_in = constant_op.constant(byte_string, dtype=dtypes.string)
+    decode = array_ops.squeeze(image_ops.decode_bmp(img_in))
+
+    with self.test_session():
+      decoded = decode.eval()
+      self.assertAllEqual(decoded, img_bytes)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/decode_image_op_test.py b/tensorflow/python/kernel_tests/decode_image_op_test.py
index 52f48c3368be8c372db581e48ed8f05927f45a34..58280432d6332770b8c3f1916cd61782a26a8f85 100644
--- a/tensorflow/python/kernel_tests/decode_image_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_image_op_test.py
@@ -33,13 +33,24 @@ prefix_path = "tensorflow/core/lib"
 
 class DecodeImageOpTest(test.TestCase):
 
+  def testBmp(self):
+    # Read a real bmp and verify shape
+    path = os.path.join(prefix_path, "bmp", "testdata", "lena.bmp")
+    with self.test_session(use_gpu=True) as sess:
+      bmp0 = io_ops.read_file(path)
+      image0 = image_ops.decode_image(bmp0)
+      image1 = image_ops.decode_bmp(bmp0)
+      bmp0, image0, image1 = sess.run([bmp0, image0, image1])
+      self.assertEqual(len(bmp0), 4194)
+      self.assertAllEqual(image0, image1)
+
   def testGif(self):
     # Read some real GIFs
     path = os.path.join(prefix_path, "gif", "testdata", "scan.gif")
-    WIDTH = 20
-    HEIGHT = 40
-    STRIDE = 5
-    shape = (12, HEIGHT, WIDTH, 3)
+    width = 20
+    height = 40
+    stride = 5
+    shape = (12, height, width, 3)
 
     with self.test_session(use_gpu=True) as sess:
       gif0 = io_ops.read_file(path)
@@ -52,13 +63,13 @@ class DecodeImageOpTest(test.TestCase):
 
       for frame_idx, frame in enumerate(image0):
         gt = np.zeros(shape[1:], dtype=np.uint8)
-        start = frame_idx * STRIDE
-        end = (frame_idx + 1) * STRIDE
-        if end <= WIDTH:
+        start = frame_idx * stride
+        end = (frame_idx + 1) * stride
+        if end <= width:
           gt[:, start:end, :] = 255
         else:
-          start -= WIDTH
-          end -= WIDTH
+          start -= width
+          end -= width
           gt[start:end, :, :] = 255
 
         self.assertAllClose(frame, gt)
@@ -79,11 +90,15 @@ class DecodeImageOpTest(test.TestCase):
       self.assertEqual(image0.shape, (256, 128, 3))
       self.assertAllEqual(image0, image1)
 
+      bad_channels = image_ops.decode_image(jpeg0, channels=4)
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        bad_channels.eval()
+
   def testPng(self):
     # Read some real PNGs, converting to different channel numbers
     inputs = [(1, "lena_gray.png")]
     for channels_in, filename in inputs:
-      for channels in 0, 1, 3:
+      for channels in 0, 1, 3, 4:
         with self.test_session(use_gpu=True) as sess:
           path = os.path.join(prefix_path, "png", "testdata", filename)
           png0 = io_ops.read_file(path)
@@ -100,11 +115,6 @@ class DecodeImageOpTest(test.TestCase):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         decode.eval()
 
-  def testInvalidChannels(self):
-    image_bytes = b"unused"
-    with self.assertRaises(ValueError):
-      decode = image_ops.decode_image(image_bytes, channels=4)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index cd7216c52796f868623e94b2c61a3bde18204f08..fbaf335efb8ae3b12b2014e4298df12c2967a583 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -77,6 +77,14 @@ class DecodeRawOpTest(test.TestCase):
 
       self.assertAllEqual(expected_result, result)
 
+  def testEmptyStringInput(self):
+    with self.test_session():
+      in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+      decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.float16)
+
+      result = decode.eval(feed_dict={in_bytes: [""]})
+      self.assertEqual(len(result), 1)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index a881ed0dc9adda6418a1712656caf2d63d24ae05..2fc34bd4d17860e57d66e5eda1218d430cfc6b4a 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -113,10 +113,9 @@ class DepthwiseConv2DTest(test.TestCase):
       total_size_1 *= s
     for s in filter_in_sizes:
       total_size_2 *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
+    # Initializes the input and filter tensor with numbers incrementing from 1.
     x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [1.0 for f in range(1, total_size_2 + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
     with self.test_session(use_gpu=use_gpu) as sess:
       t1 = constant_op.constant(x1, shape=tensor_in_sizes)
       t1.set_shape(tensor_in_sizes)
@@ -147,8 +146,9 @@ class DepthwiseConv2DTest(test.TestCase):
       native_result = sess.run(conv_native)
       interface_result = sess.run(conv_interface)
 
-    print("diff matrix:",
-          np.amax(np.ravel(native_result) - np.ravel(interface_result)))
+    print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes,
+          ", stride:", stride, ", padding: ", padding, ", max diff: ",
+          np.amax(np.absolute(native_result - interface_result)))
     self.assertArrayNear(
         np.ravel(native_result), np.ravel(interface_result), 1e-5)
     self.assertShapeEqual(native_result, conv_native)
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..50a079520048f8b2fc1ae0769b26507bb452d8b1
--- /dev/null
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -0,0 +1,279 @@
+# Tests of TensorFlow kernels written using the Python API.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+    features = [
+        "-layering_check",
+        "-parse_headers",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+cuda_py_test(
+    name = "bijector_test",
+    size = "small",
+    srcs = ["bijector_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "kullback_leibler_test",
+    size = "small",
+    srcs = ["kullback_leibler_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "beta_test",
+    size = "small",
+    srcs = ["beta_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "bernoulli_test",
+    size = "small",
+    srcs = ["bernoulli_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "categorical_test",
+    size = "small",
+    srcs = ["categorical_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "dirichlet_test",
+    size = "small",
+    srcs = ["dirichlet_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "dirichlet_multinomial_test",
+    size = "medium",
+    srcs = ["dirichlet_multinomial_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "exponential_test",
+    srcs = ["exponential_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "gamma_test",
+    srcs = ["gamma_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "laplace_test",
+    srcs = ["laplace_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "multinomial_test",
+    srcs = ["multinomial_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_test(
+    name = "student_t_test",
+    size = "small",
+    srcs = ["student_t_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    tags = ["nomsan"],  # disable to avoid false positives from scipy.
+)
+
+cuda_py_test(
+    name = "uniform_test",
+    size = "small",
+    srcs = ["uniform_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "normal_test",
+    size = "medium",
+    srcs = ["normal_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "special_math_test",
+    size = "medium",
+    srcs = ["special_math_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "identity_bijector_test",
+    size = "small",
+    srcs = ["identity_bijector_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/distributions",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/kernel_tests/distributions/__init__.py b/tensorflow/python/kernel_tests/distributions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..94dd13c89057795d4f5182cbf25867441f76c8b3
--- /dev/null
+++ b/tensorflow/python/kernel_tests/distributions/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Kernel tests for tf.distributions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
similarity index 90%
rename from tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py
rename to tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index 6ba872ef9ca07aa2566fc46b04742b8a3a0dfa4b..ef93c4dab088c1e8bcb8ba1673d964eabb79835d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -18,15 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-import scipy.special
-from tensorflow.contrib.distributions.python.ops import bernoulli
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bernoulli
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
 
 
 def make_bernoulli(batch_shape, dtype=dtypes.int32):
@@ -54,13 +69,16 @@ class BernoulliTest(test.TestCase):
     with self.test_session():
       self.assertAllClose(logits, dist.logits.eval())
 
+    if not special:
+      return
+
     with self.test_session():
-      self.assertAllClose(scipy.special.expit(logits), dist.probs.eval())
+      self.assertAllClose(special.expit(logits), dist.probs.eval())
 
     p = [0.01, 0.99, 0.42]
     dist = bernoulli.Bernoulli(probs=p)
     with self.test_session():
-      self.assertAllClose(scipy.special.logit(p), dist.logits.eval())
+      self.assertAllClose(special.logit(p), dist.logits.eval())
 
   def testInvalidP(self):
     invalid_ps = [1.01, 2.]
@@ -148,10 +166,21 @@ class BernoulliTest(test.TestCase):
               p: [0.2, 0.3, 0.4]
           }), [[0.2, 0.7, 0.4]])
 
+  def testPmfInvalid(self):
+    p = [0.1, 0.2, 0.7]
+    with self.test_session():
+      dist = bernoulli.Bernoulli(probs=p, validate_args=True)
+      with self.assertRaisesOpError("must be non-negative."):
+        dist.prob([1, 1, -1]).eval()
+      with self.assertRaisesOpError("is not less than or equal to 1."):
+        dist.prob([2, 0, 1]).eval()
+
   def testPmfWithP(self):
     p = [[0.2, 0.4], [0.3, 0.6]]
     self._testPmf(probs=p)
-    self._testPmf(logits=scipy.special.logit(p))
+    if not special:
+      return
+    self._testPmf(logits=special.logit(p))
 
   def testBroadcasting(self):
     with self.test_session():
@@ -277,7 +306,7 @@ class BernoulliTest(test.TestCase):
       a = bernoulli.Bernoulli(probs=a_p)
       b = bernoulli.Bernoulli(probs=b_p)
 
-      kl = kullback_leibler.kl(a, b)
+      kl = kullback_leibler.kl_divergence(a, b)
       kl_val = sess.run(kl)
 
       kl_expected = (a_p * np.log(a_p / b_p) + (1. - a_p) * np.log(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py b/tensorflow/python/kernel_tests/distributions/beta_test.py
similarity index 93%
rename from tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
rename to tensorflow/python/kernel_tests/distributions/beta_test.py
index f524986cec8d881b262a2a2009da021d7e1e91e9..91a451f033ffbb01d54c3dacce952b406564b7b4 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py
+++ b/tensorflow/python/kernel_tests/distributions/beta_test.py
@@ -16,18 +16,33 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import special
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import beta as beta_lib
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
+
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import beta as beta_lib
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
 
 
 class BetaTest(test.TestCase):
@@ -167,18 +182,22 @@ class BetaTest(test.TestCase):
     with session.Session():
       a = [1., 2, 3]
       b = [2., 4, 1.2]
-      expected_mean = stats.beta.mean(a, b)
       dist = beta_lib.Beta(a, b)
       self.assertEqual(dist.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_mean = stats.beta.mean(a, b)
       self.assertAllClose(expected_mean, dist.mean().eval())
 
   def testBetaVariance(self):
     with session.Session():
       a = [1., 2, 3]
       b = [2., 4, 1.2]
-      expected_variance = stats.beta.var(a, b)
       dist = beta_lib.Beta(a, b)
       self.assertEqual(dist.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variance = stats.beta.var(a, b)
       self.assertAllClose(expected_variance, dist.variance().eval())
 
   def testBetaMode(self):
@@ -228,9 +247,11 @@ class BetaTest(test.TestCase):
     with session.Session():
       a = [1., 2, 3]
       b = [2., 4, 1.2]
-      expected_entropy = stats.beta.entropy(a, b)
       dist = beta_lib.Beta(a, b)
       self.assertEqual(dist.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = stats.beta.entropy(a, b)
       self.assertAllClose(expected_entropy, dist.entropy().eval())
 
   def testBetaSample(self):
@@ -243,6 +264,8 @@ class BetaTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000,))
       self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
       self.assertLess(
           stats.kstest(
               # Beta is a univariate distribution.
@@ -286,6 +309,8 @@ class BetaTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000, 3, 2, 2))
       self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
       self.assertAllClose(
           sample_values[:, 1, :].mean(axis=0),
           stats.beta.mean(a, b)[1, :],
@@ -301,6 +326,8 @@ class BetaTest(test.TestCase):
         actual = beta_lib.Beta(a, b).cdf(x).eval()
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
         self.assertAllClose(stats.beta.cdf(x, a, b), actual, rtol=1e-4, atol=0)
 
   def testBetaLogCdf(self):
@@ -313,6 +340,8 @@ class BetaTest(test.TestCase):
         actual = math_ops.exp(beta_lib.Beta(a, b).log_cdf(x)).eval()
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x)
         self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x)
+        if not stats:
+          return
         self.assertAllClose(stats.beta.cdf(x, a, b), actual, rtol=1e-4, atol=0)
 
   def testBetaWithSoftplusConcentration(self):
@@ -342,6 +371,8 @@ class BetaTest(test.TestCase):
         d2_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a2_sp,
                                                        concentration0=b2_sp)
 
+        if not special:
+          return
         kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) +
                        (a1 - a2) * special.digamma(a1) +
                        (b1 - b2) * special.digamma(b1) +
@@ -349,13 +380,13 @@ class BetaTest(test.TestCase):
 
         for dist1 in [d1, d1_sp]:
           for dist2 in [d2, d2_sp]:
-            kl = kullback_leibler.kl(dist1, dist2)
+            kl = kullback_leibler.kl_divergence(dist1, dist2)
             kl_val = sess.run(kl)
             self.assertEqual(kl.get_shape(), shape)
             self.assertAllClose(kl_val, kl_expected)
 
         # Make sure KL(d1||d1) is 0
-        kl_same = sess.run(kullback_leibler.kl(d1, d1))
+        kl_same = sess.run(kullback_leibler.kl_divergence(d1, d1))
         self.assertAllClose(kl_same, np.zeros_like(kl_expected))
 
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/bijector_test.py b/tensorflow/python/kernel_tests/distributions/bijector_test.py
similarity index 95%
rename from tensorflow/contrib/distributions/python/kernel_tests/bijectors/bijector_test.py
rename to tensorflow/python/kernel_tests/distributions/bijector_test.py
index 94f3bc959b6b714860bdb1f4b9b4e88d9c40920a..9f9fb5c0bb4c0e9d68ddf6034a8649ad5a6bd8e9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bijector_test.py
@@ -22,9 +22,9 @@ import abc
 
 import six
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector as bijector_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 from tensorflow.python.platform import test
 
 
@@ -36,10 +36,10 @@ class BaseBijectorTest(test.TestCase):
       with self.assertRaisesRegexp(TypeError,
                                    ("Can't instantiate abstract class Bijector "
                                     "with abstract methods __init__")):
-        bijector_lib.Bijector()
+        bijector.Bijector()  # pylint: disable=abstract-class-instantiated
 
   def testDefaults(self):
-    class _BareBonesBijector(bijector_lib.Bijector):
+    class _BareBonesBijector(bijector.Bijector):
       """Minimal specification of a `Bijector`."""
 
       def __init__(self):
@@ -80,7 +80,7 @@ class IntentionallyMissingError(Exception):
   pass
 
 
-class BrokenBijector(bijector_lib.Bijector):
+class BrokenBijector(bijector.Bijector):
   """Forward and inverse are not inverses of each other."""
 
   def __init__(self, forward_missing=False, inverse_missing=False):
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
similarity index 83%
rename from tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
rename to tensorflow/python/kernel_tests/distributions/categorical_test.py
index 0b42581e79f15827f74749094bd50ceba5de50c2..33db933e82a3fdc794c34aa3a93de82fdd89e3be 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import categorical
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
@@ -30,6 +28,8 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import categorical
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
 
 
@@ -126,6 +126,63 @@ class CategoricalTest(test.TestCase):
     with self.test_session():
       self.assertAllClose(dist.prob(0).eval(), 0.2)
 
+  def testCDFWithDynamicEventShape(self):
+    """Test that dynamically-sized events with unknown shape work."""
+    batch_size = 2
+    histograms = array_ops.placeholder(dtype=dtypes.float32,
+                                       shape=(batch_size, None))
+    event = array_ops.placeholder(dtype=dtypes.float32, shape=(batch_size,))
+    dist = categorical.Categorical(probs=histograms)
+    cdf_op = dist.cdf(event)
+
+    # Feed values into the placeholder with different shapes
+    # three classes.
+    event_feed_one = [0, 1]
+    histograms_feed_one = [[0.5, 0.3, 0.2], [1.0, 0.0, 0.0]]
+    expected_cdf_one = [0.0, 1.0]
+    feed_dict_one = {
+        histograms: histograms_feed_one,
+        event: event_feed_one
+    }
+
+    # six classes.
+    event_feed_two = [2, 5]
+    histograms_feed_two = [[0.9, 0.0, 0.0, 0.0, 0.0, 0.1],
+                           [0.15, 0.2, 0.05, 0.35, 0.13, 0.12]]
+    expected_cdf_two = [0.9, 0.88]
+    feed_dict_two = {
+        histograms: histograms_feed_two,
+        event: event_feed_two
+    }
+
+    with self.test_session() as sess:
+      actual_cdf_one = sess.run(cdf_op, feed_dict=feed_dict_one)
+      actual_cdf_two = sess.run(cdf_op, feed_dict=feed_dict_two)
+
+    self.assertAllClose(actual_cdf_one, expected_cdf_one)
+    self.assertAllClose(actual_cdf_two, expected_cdf_two)
+
+  def testCDFWithBatch(self):
+    histograms = [[0.1, 0.2, 0.3, 0.25, 0.15],
+                  [0.0, 0.75, 0.2, 0.05, 0.0]]
+    event = [0, 3]
+    expected_cdf = [0.0, 0.95]
+    dist = categorical.Categorical(probs=histograms)
+    cdf_op = dist.cdf(event)
+
+    with self.test_session():
+      self.assertAllClose(cdf_op.eval(), expected_cdf)
+
+  def testCDFNoBatch(self):
+    histogram = [0.1, 0.2, 0.3, 0.4]
+    event = 2
+    expected_cdf = 0.3
+    dist = categorical.Categorical(probs=histogram)
+    cdf_op = dist.cdf(event)
+
+    with self.test_session():
+      self.assertAlmostEqual(cdf_op.eval(), expected_cdf)
+
   def testLogPMF(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = categorical.Categorical(logits)
@@ -278,10 +335,10 @@ class CategoricalTest(test.TestCase):
           a = categorical.Categorical(logits=a_logits)
           b = categorical.Categorical(logits=b_logits)
 
-          kl = kullback_leibler.kl(a, b)
+          kl = kullback_leibler.kl_divergence(a, b)
           kl_val = sess.run(kl)
           # Make sure KL(a||a) is 0
-          kl_same = sess.run(kullback_leibler.kl(a, a))
+          kl_same = sess.run(kullback_leibler.kl_divergence(a, a))
 
           prob_a = np_softmax(a_logits)
           prob_b = np_softmax(b_logits)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py
rename to tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
index 54691d2095dab843ddc821464b581fa14284528f..d009f4e9319293c636f90a76d49f8b90d473cb0d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
@@ -17,13 +17,15 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.contrib import distributions
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import dirichlet_multinomial
 from tensorflow.python.platform import test
 
-ds = distributions
+
+ds = dirichlet_multinomial
 
 
 class DirichletMultinomialTest(test.TestCase):
@@ -87,9 +89,10 @@ class DirichletMultinomialTest(test.TestCase):
       dist.prob([3., 0, 2]).eval()
       dist.prob([3.0, 0, 2.0]).eval()
       # Both equality and integer checking fail.
+      placeholder = array_ops.placeholder(dtypes.float32)
       with self.assertRaisesOpError(
           "counts cannot contain fractional components"):
-        dist.prob([1.0, 2.5, 1.5]).eval()
+        dist.prob(placeholder).eval(feed_dict={placeholder: [1.0, 2.5, 1.5]})
       dist = ds.DirichletMultinomial(n, alpha, validate_args=False)
       dist.prob([1., 2., 3.]).eval()
       # Non-integer arguments work.
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
similarity index 94%
rename from tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py
rename to tensorflow/python/kernel_tests/distributions/dirichlet_test.py
index cd634da09dde3227cb09ef68150790fc67eec747..a2f1de5aaf3a75c1cfac820cc4494af34d082250 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
@@ -16,14 +16,29 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import dirichlet as dirichlet_lib
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import dirichlet as dirichlet_lib
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
 
 
 class DirichletTest(test.TestCase):
@@ -132,9 +147,11 @@ class DirichletTest(test.TestCase):
   def testMean(self):
     with self.test_session():
       alpha = [1., 2, 3]
-      expected_mean = stats.dirichlet.mean(alpha)
       dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
       self.assertEqual(dirichlet.mean().get_shape(), [3])
+      if not stats:
+        return
+      expected_mean = stats.dirichlet.mean(alpha)
       self.assertAllClose(dirichlet.mean().eval(), expected_mean)
 
   def testCovarianceFromSampling(self):
@@ -177,11 +194,13 @@ class DirichletTest(test.TestCase):
     with self.test_session():
       alpha = [1., 2, 3]
       denominator = np.sum(alpha)**2 * (np.sum(alpha) + 1)
+      dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
+      self.assertEqual(dirichlet.covariance().get_shape(), (3, 3))
+      if not stats:
+        return
       expected_covariance = np.diag(stats.dirichlet.var(alpha))
       expected_covariance += [[0., -2, -3], [-2, 0, -6],
                               [-3, -6, 0]] / denominator
-      dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
-      self.assertEqual(dirichlet.covariance().get_shape(), (3, 3))
       self.assertAllClose(dirichlet.covariance().eval(), expected_covariance)
 
   def testMode(self):
@@ -213,9 +232,11 @@ class DirichletTest(test.TestCase):
   def testEntropy(self):
     with self.test_session():
       alpha = [1., 2, 3]
-      expected_entropy = stats.dirichlet.entropy(alpha)
       dirichlet = dirichlet_lib.Dirichlet(concentration=alpha)
       self.assertEqual(dirichlet.entropy().get_shape(), ())
+      if not stats:
+        return
+      expected_entropy = stats.dirichlet.entropy(alpha)
       self.assertAllClose(dirichlet.entropy().eval(), expected_entropy)
 
   def testSample(self):
@@ -227,6 +248,8 @@ class DirichletTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000, 2))
       self.assertTrue(np.all(sample_values > 0.0))
+      if not stats:
+        return
       self.assertLess(
           stats.kstest(
               # Beta is a univariate distribution.
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py b/tensorflow/python/kernel_tests/distributions/exponential_test.py
similarity index 88%
rename from tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py
rename to tensorflow/python/kernel_tests/distributions/exponential_test.py
index 617120241383c1574ae88ce5b7ee5a95bbc94eba..7afdf0f947605c6b982e8bf7defdd6224180e089 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py
+++ b/tensorflow/python/kernel_tests/distributions/exponential_test.py
@@ -18,13 +18,28 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import exponential as exponential_lib
+
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import exponential as exponential_lib
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
 
 
 class ExponentialTest(test.TestCase):
@@ -36,14 +51,17 @@ class ExponentialTest(test.TestCase):
       lam_v = 2.0
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
       exponential = exponential_lib.Exponential(rate=lam)
-      expected_log_pdf = stats.expon.logpdf(x, scale=1 / lam_v)
 
       log_pdf = exponential.log_prob(x)
       self.assertEqual(log_pdf.get_shape(), (6,))
-      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
 
       pdf = exponential.prob(x)
       self.assertEqual(pdf.get_shape(), (6,))
+
+      if not stats:
+        return
+      expected_log_pdf = stats.expon.logpdf(x, scale=1 / lam_v)
+      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
       self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf))
 
   def testExponentialCDF(self):
@@ -54,34 +72,43 @@ class ExponentialTest(test.TestCase):
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       exponential = exponential_lib.Exponential(rate=lam)
-      expected_cdf = stats.expon.cdf(x, scale=1 / lam_v)
 
       cdf = exponential.cdf(x)
       self.assertEqual(cdf.get_shape(), (6,))
+
+      if not stats:
+        return
+      expected_cdf = stats.expon.cdf(x, scale=1 / lam_v)
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testExponentialMean(self):
     with session.Session():
       lam_v = np.array([1.0, 4.0, 2.5])
-      expected_mean = stats.expon.mean(scale=1 / lam_v)
       exponential = exponential_lib.Exponential(rate=lam_v)
       self.assertEqual(exponential.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_mean = stats.expon.mean(scale=1 / lam_v)
       self.assertAllClose(exponential.mean().eval(), expected_mean)
 
   def testExponentialVariance(self):
     with session.Session():
       lam_v = np.array([1.0, 4.0, 2.5])
-      expected_variance = stats.expon.var(scale=1 / lam_v)
       exponential = exponential_lib.Exponential(rate=lam_v)
       self.assertEqual(exponential.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variance = stats.expon.var(scale=1 / lam_v)
       self.assertAllClose(exponential.variance().eval(), expected_variance)
 
   def testExponentialEntropy(self):
     with session.Session():
       lam_v = np.array([1.0, 4.0, 2.5])
-      expected_entropy = stats.expon.entropy(scale=1 / lam_v)
       exponential = exponential_lib.Exponential(rate=lam_v)
       self.assertEqual(exponential.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = stats.expon.entropy(scale=1 / lam_v)
       self.assertAllClose(exponential.entropy().eval(), expected_entropy)
 
   def testExponentialSample(self):
@@ -95,6 +122,8 @@ class ExponentialTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(sample_values.shape, (100000, 2))
       self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
       for i in range(2):
         self.assertLess(
             stats.kstest(
@@ -116,6 +145,8 @@ class ExponentialTest(test.TestCase):
       sample_values = samples.eval()
 
       self.assertFalse(np.any(sample_values < 0.0))
+      if not stats:
+        return
       for i in range(2):
         self.assertLess(
             stats.kstest(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py b/tensorflow/python/kernel_tests/distributions/gamma_test.py
similarity index 92%
rename from tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py
rename to tensorflow/python/kernel_tests/distributions/gamma_test.py
index fd627102372d1fc9c19646729a715290a5851449..5e4813ac0762d2855d7fbe6754fe1466c29c06c9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py
+++ b/tensorflow/python/kernel_tests/distributions/gamma_test.py
@@ -17,18 +17,32 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import special
-from scipy import stats
 
-from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import gamma as gamma_lib
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
 
 
 class GammaTest(test.TestCase):
@@ -53,13 +67,14 @@ class GammaTest(test.TestCase):
       beta_v = 3.0
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
       gamma = gamma_lib.Gamma(concentration=alpha, rate=beta)
-      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
       log_pdf = gamma.log_prob(x)
       self.assertEqual(log_pdf.get_shape(), (6,))
-      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
-
       pdf = gamma.prob(x)
       self.assertEqual(pdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
+      self.assertAllClose(log_pdf.eval(), expected_log_pdf)
       self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf))
 
   def testGammaLogPDFMultidimensional(self):
@@ -71,15 +86,16 @@ class GammaTest(test.TestCase):
       beta_v = np.array([3.0, 4.0])
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       gamma = gamma_lib.Gamma(concentration=alpha, rate=beta)
-      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
       log_pdf = gamma.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(log_pdf_values, expected_log_pdf)
-
       pdf = gamma.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+      if not stats:
+        return
+      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
+      self.assertAllClose(log_pdf_values, expected_log_pdf)
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testGammaLogPDFMultidimensionalBroadcasting(self):
@@ -91,15 +107,17 @@ class GammaTest(test.TestCase):
       beta_v = 3.0
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       gamma = gamma_lib.Gamma(concentration=alpha, rate=beta)
-      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
       log_pdf = gamma.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(log_pdf_values, expected_log_pdf)
-
       pdf = gamma.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+
+      if not stats:
+        return
+      expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v)
+      self.assertAllClose(log_pdf_values, expected_log_pdf)
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testGammaCDF(self):
@@ -112,10 +130,11 @@ class GammaTest(test.TestCase):
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       gamma = gamma_lib.Gamma(concentration=alpha, rate=beta)
-      expected_cdf = stats.gamma.cdf(x, alpha_v, scale=1 / beta_v)
-
       cdf = gamma.cdf(x)
       self.assertEqual(cdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_cdf = stats.gamma.cdf(x, alpha_v, scale=1 / beta_v)
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testGammaMean(self):
@@ -123,8 +142,10 @@ class GammaTest(test.TestCase):
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v)
-      expected_means = stats.gamma.mean(alpha_v, scale=1 / beta_v)
       self.assertEqual(gamma.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_means = stats.gamma.mean(alpha_v, scale=1 / beta_v)
       self.assertAllClose(gamma.mean().eval(), expected_means)
 
   def testGammaModeAllowNanStatsIsFalseWorksWhenAllBatchMembersAreDefined(self):
@@ -165,8 +186,10 @@ class GammaTest(test.TestCase):
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v)
-      expected_variances = stats.gamma.var(alpha_v, scale=1 / beta_v)
       self.assertEqual(gamma.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variances = stats.gamma.var(alpha_v, scale=1 / beta_v)
       self.assertAllClose(gamma.variance().eval(), expected_variances)
 
   def testGammaStd(self):
@@ -174,17 +197,21 @@ class GammaTest(test.TestCase):
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
       gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v)
-      expected_stddev = stats.gamma.std(alpha_v, scale=1. / beta_v)
       self.assertEqual(gamma.stddev().get_shape(), (3,))
+      if not stats:
+        return
+      expected_stddev = stats.gamma.std(alpha_v, scale=1. / beta_v)
       self.assertAllClose(gamma.stddev().eval(), expected_stddev)
 
   def testGammaEntropy(self):
     with self.test_session():
       alpha_v = np.array([1.0, 3.0, 2.5])
       beta_v = np.array([1.0, 4.0, 5.0])
-      expected_entropy = stats.gamma.entropy(alpha_v, scale=1 / beta_v)
       gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v)
       self.assertEqual(gamma.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = stats.gamma.entropy(alpha_v, scale=1 / beta_v)
       self.assertAllClose(gamma.entropy().eval(), expected_entropy)
 
   def testGammaSampleSmallAlpha(self):
@@ -199,6 +226,9 @@ class GammaTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (n,))
       self.assertEqual(sample_values.shape, (n,))
+      self.assertTrue(self._kstest(alpha_v, beta_v, sample_values))
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(),
           stats.gamma.mean(
@@ -208,7 +238,6 @@ class GammaTest(test.TestCase):
           sample_values.var(),
           stats.gamma.var(alpha_v, scale=1 / beta_v),
           atol=.15)
-      self.assertTrue(self._kstest(alpha_v, beta_v, sample_values))
 
   def testGammaSample(self):
     with session.Session():
@@ -222,6 +251,9 @@ class GammaTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (n,))
       self.assertEqual(sample_values.shape, (n,))
+      self.assertTrue(self._kstest(alpha_v, beta_v, sample_values))
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(),
           stats.gamma.mean(
@@ -231,7 +263,6 @@ class GammaTest(test.TestCase):
           sample_values.var(),
           stats.gamma.var(alpha_v, scale=1 / beta_v),
           atol=.15)
-      self.assertTrue(self._kstest(alpha_v, beta_v, sample_values))
 
   def testGammaSampleMultiDimensional(self):
     with session.Session():
@@ -246,6 +277,8 @@ class GammaTest(test.TestCase):
       zeros = np.zeros_like(alpha_v + beta_v)  # 10 x 100
       alpha_bc = alpha_v + zeros
       beta_bc = beta_v + zeros
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(axis=0),
           stats.gamma.mean(
@@ -266,6 +299,8 @@ class GammaTest(test.TestCase):
 
   def _kstest(self, alpha, beta, samples):
     # Uses the Kolmogorov-Smirnov test for goodness of fit.
+    if not stats:
+      return True  # If we can't test, return that the test passes.
     ks, _ = stats.kstest(samples, stats.gamma(alpha, scale=1 / beta).cdf)
     # Return True when the test passes.
     return ks < 0.02
@@ -279,6 +314,12 @@ class GammaTest(test.TestCase):
       sample_vals, pdf_vals = sess.run([samples, pdfs])
       self.assertEqual(samples.get_shape(), (num, 2, 2))
       self.assertEqual(pdfs.get_shape(), (num, 2, 2))
+      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
+      if not stats:
+        return
       self.assertAllClose(
           stats.gamma.mean(
               [[7., 11.], [7., 11.]], scale=1 / np.array([[5., 5.], [6., 6.]])),
@@ -289,10 +330,6 @@ class GammaTest(test.TestCase):
                           scale=1 / np.array([[5., 5.], [6., 6.]])),
           sample_vals.var(axis=0),
           atol=.1)
-      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
 
   def _assertIntegral(self, sample_vals, pdf_vals, err=1e-3):
     s_p = zip(sample_vals, pdf_vals)
@@ -345,11 +382,15 @@ class GammaTest(test.TestCase):
       g1 = gamma_lib.Gamma(concentration=alpha1, rate=beta1)
       x = g0.sample(int(1e4), seed=0)
       kl_sample = math_ops.reduce_mean(g0.log_prob(x) - g1.log_prob(x), 0)
-      kl_actual = kullback_leibler.kl(g0, g1)
+      kl_actual = kullback_leibler.kl_divergence(g0, g1)
 
     # Execute graph.
     [kl_sample_, kl_actual_] = sess.run([kl_sample, kl_actual])
 
+    self.assertEqual(beta0.shape, kl_actual.get_shape())
+
+    if not special:
+      return
     kl_expected = ((alpha0 - alpha1) * special.digamma(alpha0)
                    + special.gammaln(alpha1)
                    - special.gammaln(alpha0)
@@ -357,7 +398,6 @@ class GammaTest(test.TestCase):
                    - alpha1 * np.log(beta1)
                    + alpha0 * (beta1 / beta0 - 1.))
 
-    self.assertEqual(beta0.shape, kl_actual.get_shape())
     self.assertAllClose(kl_expected, kl_actual_, atol=0., rtol=1e-6)
     self.assertAllClose(kl_sample_, kl_actual_, atol=0., rtol=1e-2)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/identity_test.py b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
similarity index 85%
rename from tensorflow/contrib/distributions/python/kernel_tests/bijectors/identity_test.py
rename to tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
index f6aabe0d6303fb35ee602048737b481a5f7619c0..e8f9d0b728d8f831becc82cdba0ae2bf3d5da52a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/identity_test.py
+++ b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector_test_util
-from tensorflow.contrib.distributions.python.ops.bijectors import identity as identity_lib
+from tensorflow.python.ops.distributions import bijector_test_util
+from tensorflow.python.ops.distributions import identity_bijector
 from tensorflow.python.platform import test
 
 
@@ -28,7 +28,7 @@ class IdentityBijectorTest(test.TestCase):
 
   def testBijector(self):
     with self.test_session():
-      bijector = identity_lib.Identity()
+      bijector = identity_bijector.Identity()
       self.assertEqual("identity", bijector.name)
       x = [[[0.], [1.]]]
       self.assertAllEqual(x, bijector.forward(x).eval())
@@ -38,7 +38,7 @@ class IdentityBijectorTest(test.TestCase):
 
   def testScalarCongruency(self):
     with self.test_session():
-      bijector = identity_lib.Identity()
+      bijector = identity_bijector.Identity()
       bijector_test_util.assert_scalar_congruency(
           bijector, lower_x=-2., upper_x=2.)
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
similarity index 76%
rename from tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py
rename to tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index 6b3d886e01b92099bb6476779564f7a5953d550d..b1d8da771612fe42a153a1a11b6cb26bdcb983a0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import normal
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal
 from tensorflow.python.platform import test
 
 # pylint: disable=protected-access
@@ -43,7 +43,7 @@ class KLTest(test.TestCase):
       return name
 
     a = MyDist(loc=0.0, scale=1.0)
-    self.assertEqual("OK", kullback_leibler.kl(a, a, name="OK"))
+    self.assertEqual("OK", kullback_leibler.kl_divergence(a, a, name="OK"))
 
   def testDomainErrorExceptions(self):
 
@@ -60,11 +60,11 @@ class KLTest(test.TestCase):
 
     with self.test_session():
       a = MyDistException(loc=0.0, scale=1.0)
-      kl = kullback_leibler.kl(a, a, allow_nan_stats=False)
+      kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
         kl.eval()
-      kl_ok = kullback_leibler.kl(a, a)
+      kl_ok = kullback_leibler.kl_divergence(a, a)
       self.assertAllEqual([float("nan")], kl_ok.eval())
 
   def testRegistrationFailures(self):
@@ -116,16 +116,16 @@ class KLTest(test.TestCase):
     sub2 = Sub2(loc=0.0, scale=1.0)
     sub11 = Sub11(loc=0.0, scale=1.0)
 
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub1, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl(sub1, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl(sub2, sub1))
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub11, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl(sub11, sub2))
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl(sub11, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl(sub2, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl(sub1, sub11))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub1))
+    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub1, sub2))
+    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub1))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub11))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
+    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
+    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
+    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub11))
+    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub11))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py b/tensorflow/python/kernel_tests/distributions/laplace_test.py
similarity index 92%
rename from tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py
rename to tensorflow/python/kernel_tests/distributions/laplace_test.py
index 1f58d495f02f5d9f894ff4cccc3ae6f32b21441b..55577386c450c7ac63f62c8a6dfd277af50e2387 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py
+++ b/tensorflow/python/kernel_tests/distributions/laplace_test.py
@@ -17,15 +17,31 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import laplace as laplace_lib
+
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import laplace as laplace_lib
 from tensorflow.python.platform import test
 
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
+
 
 class LaplaceTest(test.TestCase):
 
@@ -49,9 +65,11 @@ class LaplaceTest(test.TestCase):
       scale_v = 3.0
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
       log_pdf = laplace.log_prob(x)
       self.assertEqual(log_pdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
       self.assertAllClose(log_pdf.eval(), expected_log_pdf)
 
       pdf = laplace.prob(x)
@@ -67,15 +85,17 @@ class LaplaceTest(test.TestCase):
       scale_v = np.array([3.0, 4.0])
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
       log_pdf = laplace.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(log_pdf_values, expected_log_pdf)
 
       pdf = laplace.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+      if not stats:
+        return
+      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
+      self.assertAllClose(log_pdf_values, expected_log_pdf)
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testLaplaceLogPDFMultidimensionalBroadcasting(self):
@@ -87,15 +107,17 @@ class LaplaceTest(test.TestCase):
       scale_v = 3.0
       x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
       log_pdf = laplace.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(log_pdf_values, expected_log_pdf)
 
       pdf = laplace.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+      if not stats:
+        return
+      expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v)
+      self.assertAllClose(log_pdf_values, expected_log_pdf)
       self.assertAllClose(pdf_values, np.exp(expected_log_pdf))
 
   def testLaplaceCDF(self):
@@ -108,10 +130,12 @@ class LaplaceTest(test.TestCase):
       x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_cdf = stats.laplace.cdf(x, loc_v, scale=scale_v)
 
       cdf = laplace.cdf(x)
       self.assertEqual(cdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_cdf = stats.laplace.cdf(x, loc_v, scale=scale_v)
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testLaplaceLogCDF(self):
@@ -124,10 +148,12 @@ class LaplaceTest(test.TestCase):
       x = np.array([-2.5, 2.5, -4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_cdf = stats.laplace.logcdf(x, loc_v, scale=scale_v)
 
       cdf = laplace.log_cdf(x)
       self.assertEqual(cdf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_cdf = stats.laplace.logcdf(x, loc_v, scale=scale_v)
       self.assertAllClose(cdf.eval(), expected_cdf)
 
   def testLaplaceLogSurvivalFunction(self):
@@ -140,10 +166,12 @@ class LaplaceTest(test.TestCase):
       x = np.array([-2.5, 2.5, -4.0, 0.1, 1.0, 2.0], dtype=np.float32)
 
       laplace = laplace_lib.Laplace(loc=loc, scale=scale)
-      expected_sf = stats.laplace.logsf(x, loc_v, scale=scale_v)
 
       sf = laplace.log_survival_function(x)
       self.assertEqual(sf.get_shape(), (6,))
+      if not stats:
+        return
+      expected_sf = stats.laplace.logsf(x, loc_v, scale=scale_v)
       self.assertAllClose(sf.eval(), expected_sf)
 
   def testLaplaceMean(self):
@@ -151,8 +179,10 @@ class LaplaceTest(test.TestCase):
       loc_v = np.array([1.0, 3.0, 2.5])
       scale_v = np.array([1.0, 4.0, 5.0])
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
-      expected_means = stats.laplace.mean(loc_v, scale=scale_v)
       self.assertEqual(laplace.mean().get_shape(), (3,))
+      if not stats:
+        return
+      expected_means = stats.laplace.mean(loc_v, scale=scale_v)
       self.assertAllClose(laplace.mean().eval(), expected_means)
 
   def testLaplaceMode(self):
@@ -168,8 +198,10 @@ class LaplaceTest(test.TestCase):
       loc_v = np.array([1.0, 3.0, 2.5])
       scale_v = np.array([1.0, 4.0, 5.0])
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
-      expected_variances = stats.laplace.var(loc_v, scale=scale_v)
       self.assertEqual(laplace.variance().get_shape(), (3,))
+      if not stats:
+        return
+      expected_variances = stats.laplace.var(loc_v, scale=scale_v)
       self.assertAllClose(laplace.variance().eval(), expected_variances)
 
   def testLaplaceStd(self):
@@ -177,17 +209,21 @@ class LaplaceTest(test.TestCase):
       loc_v = np.array([1.0, 3.0, 2.5])
       scale_v = np.array([1.0, 4.0, 5.0])
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
-      expected_stddev = stats.laplace.std(loc_v, scale=scale_v)
       self.assertEqual(laplace.stddev().get_shape(), (3,))
+      if not stats:
+        return
+      expected_stddev = stats.laplace.std(loc_v, scale=scale_v)
       self.assertAllClose(laplace.stddev().eval(), expected_stddev)
 
   def testLaplaceEntropy(self):
     with self.test_session():
       loc_v = np.array([1.0, 3.0, 2.5])
       scale_v = np.array([1.0, 4.0, 5.0])
-      expected_entropy = stats.laplace.entropy(loc_v, scale=scale_v)
       laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v)
       self.assertEqual(laplace.entropy().get_shape(), (3,))
+      if not stats:
+        return
+      expected_entropy = stats.laplace.entropy(loc_v, scale=scale_v)
       self.assertAllClose(laplace.entropy().eval(), expected_entropy)
 
   def testLaplaceSample(self):
@@ -202,6 +238,8 @@ class LaplaceTest(test.TestCase):
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (n,))
       self.assertEqual(sample_values.shape, (n,))
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(),
           stats.laplace.mean(
@@ -228,6 +266,8 @@ class LaplaceTest(test.TestCase):
       zeros = np.zeros_like(loc_v + scale_v)  # 10 x 100
       loc_bc = loc_v + zeros
       scale_bc = scale_v + zeros
+      if not stats:
+        return
       self.assertAllClose(
           sample_values.mean(axis=0),
           stats.laplace.mean(
@@ -250,6 +290,8 @@ class LaplaceTest(test.TestCase):
 
   def _kstest(self, loc, scale, samples):
     # Uses the Kolmogorov-Smirnov test for goodness of fit.
+    if not stats:
+      return True  # If scipy isn't available, return "True" for passing
     ks, _ = stats.kstest(samples, stats.laplace(loc, scale=scale).cdf)
     # Return True when the test passes.
     return ks < 0.02
@@ -263,6 +305,12 @@ class LaplaceTest(test.TestCase):
       sample_vals, pdf_vals = sess.run([samples, pdfs])
       self.assertEqual(samples.get_shape(), (num, 2, 2))
       self.assertEqual(pdfs.get_shape(), (num, 2, 2))
+      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
+      if not stats:
+        return
       self.assertAllClose(
           stats.laplace.mean(
               [[7., 11.], [7., 11.]], scale=np.array([[5., 5.], [6., 6.]])),
@@ -275,10 +323,6 @@ class LaplaceTest(test.TestCase):
           sample_vals.var(axis=0),
           rtol=0.05,
           atol=0.)
-      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
 
   def _assertIntegral(self, sample_vals, pdf_vals, err=1e-3):
     s_p = zip(sample_vals, pdf_vals)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
similarity index 86%
rename from tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py
rename to tensorflow/python/kernel_tests/distributions/multinomial_test.py
index 169498be24697c2be11bf2188e24765b4f801264..80caf10391d7e9e9735b71a48c6676812f4d637e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -17,14 +17,14 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.contrib import distributions
+
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import multinomial
 from tensorflow.python.platform import test
 
-ds = distributions
-
 
 class MultinomialTest(test.TestCase):
 
@@ -34,7 +34,7 @@ class MultinomialTest(test.TestCase):
   def testSimpleShapes(self):
     with self.test_session():
       p = [.1, .3, .6]
-      dist = ds.Multinomial(total_count=1., probs=p)
+      dist = multinomial.Multinomial(total_count=1., probs=p)
       self.assertEqual(3, dist.event_shape_tensor().eval())
       self.assertAllEqual([], dist.batch_shape_tensor().eval())
       self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape)
@@ -44,7 +44,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = 0.5 * np.ones([3, 2, 2], dtype=np.float32)
       n = [[3., 2], [4, 5], [6, 7]]
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       self.assertEqual(2, dist.event_shape_tensor().eval())
       self.assertAllEqual([3, 2], dist.batch_shape_tensor().eval())
       self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape)
@@ -54,14 +54,14 @@ class MultinomialTest(test.TestCase):
     p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
     n = [[3.], [4]]
     with self.test_session():
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       self.assertEqual((2, 1), dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
   def testP(self):
     p = [[0.1, 0.2, 0.7]]
     with self.test_session():
-      dist = ds.Multinomial(total_count=3., probs=p)
+      dist = multinomial.Multinomial(total_count=3., probs=p)
       self.assertEqual((1, 3), dist.probs.get_shape())
       self.assertEqual((1, 3), dist.logits.get_shape())
       self.assertAllClose(p, dist.probs.eval())
@@ -70,7 +70,7 @@ class MultinomialTest(test.TestCase):
     p = np.array([[0.1, 0.2, 0.7]], dtype=np.float32)
     logits = np.log(p) - 50.
     with self.test_session():
-      multinom = ds.Multinomial(total_count=3., logits=logits)
+      multinom = multinomial.Multinomial(total_count=3., logits=logits)
       self.assertEqual((1, 3), multinom.probs.get_shape())
       self.assertEqual((1, 3), multinom.logits.get_shape())
       self.assertAllClose(p, multinom.probs.eval())
@@ -80,7 +80,7 @@ class MultinomialTest(test.TestCase):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
     with self.test_session():
-      dist = ds.Multinomial(total_count=n, probs=p, validate_args=True)
+      dist = multinomial.Multinomial(total_count=n, probs=p, validate_args=True)
       dist.prob([2., 3, 0]).eval()
       dist.prob([3., 0, 2]).eval()
       with self.assertRaisesOpError("must be non-negative"):
@@ -93,18 +93,21 @@ class MultinomialTest(test.TestCase):
     n = [[5.]]
     with self.test_session():
       # No errors with integer n.
-      multinom = ds.Multinomial(total_count=n, probs=p, validate_args=True)
+      multinom = multinomial.Multinomial(
+          total_count=n, probs=p, validate_args=True)
       multinom.prob([2., 1, 2]).eval()
       multinom.prob([3., 0, 2]).eval()
       # Counts don't sum to n.
       with self.assertRaisesOpError("counts must sum to `self.total_count`"):
         multinom.prob([2., 3, 2]).eval()
       # Counts are non-integers.
+      x = array_ops.placeholder(dtypes.float32)
       with self.assertRaisesOpError(
           "cannot contain fractional components."):
-        multinom.prob([1.0, 2.5, 1.5]).eval()
+        multinom.prob(x).eval(feed_dict={x: [1.0, 2.5, 1.5]})
 
-      multinom = ds.Multinomial(total_count=n, probs=p, validate_args=False)
+      multinom = multinomial.Multinomial(
+          total_count=n, probs=p, validate_args=False)
       multinom.prob([1., 2., 2.]).eval()
       # Non-integer arguments work.
       multinom.prob([1.0, 2.5, 1.5]).eval()
@@ -114,7 +117,7 @@ class MultinomialTest(test.TestCase):
       # Both zero-batches.  No broadcast
       p = [0.5, 0.5]
       counts = [1., 0]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose(0.5, pmf.eval())
       self.assertEqual((), pmf.get_shape())
 
@@ -123,7 +126,7 @@ class MultinomialTest(test.TestCase):
       # Both zero-batches.  No broadcast
       p = [0.1, 0.9]
       counts = [3., 2]
-      dist = ds.Multinomial(total_count=5., probs=p)
+      dist = multinomial.Multinomial(total_count=5., probs=p)
       pmf = dist.prob(counts)
       # 5 choose 3 = 5 choose 2 = 10. 10 * (.9)^2 * (.1)^3 = 81/10000.
       self.assertAllClose(81. / 10000, pmf.eval())
@@ -133,7 +136,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = [[0.1, 0.9]]
       counts = [[1., 0], [0, 1]]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose([0.1, 0.9], pmf.eval())
       self.assertEqual((2), pmf.get_shape())
 
@@ -141,7 +144,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = [0.1, 0.9]
       counts = [[1., 0], [0, 1]]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose([0.1, 0.9], pmf.eval())
       self.assertEqual((2), pmf.get_shape())
 
@@ -149,7 +152,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
       counts = [[1., 0]]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose(pmf.eval(), [0.1, 0.7])
       self.assertEqual((2), pmf.get_shape())
 
@@ -157,7 +160,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
       counts = [1., 0]
-      pmf = ds.Multinomial(total_count=1., probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
       self.assertAllClose(pmf.eval(), [0.1, 0.7])
       self.assertEqual(pmf.get_shape(), (2))
 
@@ -169,7 +172,7 @@ class MultinomialTest(test.TestCase):
       n = [[3., 3], [3, 3]]
       # [2]
       counts = [2., 1]
-      pmf = ds.Multinomial(total_count=n, probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
       pmf.eval()
       self.assertEqual(pmf.get_shape(), (2, 2))
 
@@ -178,7 +181,7 @@ class MultinomialTest(test.TestCase):
       p = [0.1, 0.9]
       counts = [3., 2]
       n = np.full([4, 3], 5., dtype=np.float32)
-      pmf = ds.Multinomial(total_count=n, probs=p).prob(counts)
+      pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
       pmf.eval()
       self.assertEqual((4, 3), pmf.get_shape())
 
@@ -186,7 +189,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       expected_means = 5 * np.array(p, dtype=np.float32)
       self.assertEqual((3,), dist.mean().get_shape())
       self.assertAllClose(expected_means, dist.mean().eval())
@@ -195,7 +198,7 @@ class MultinomialTest(test.TestCase):
     with self.test_session():
       n = 5.
       p = [0.1, 0.2, 0.7]
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       expected_covariances = [[9. / 20, -1 / 10, -7 / 20],
                               [-1 / 10, 4 / 5, -7 / 10],
                               [-7 / 20, -7 / 10, 21 / 20]]
@@ -208,7 +211,7 @@ class MultinomialTest(test.TestCase):
       n = [5.] * 2
       # Shape [4, 1, 2]
       p = [[[0.1, 0.9]], [[0.1, 0.9]]] * 2
-      dist = ds.Multinomial(total_count=n, probs=p)
+      dist = multinomial.Multinomial(total_count=n, probs=p)
       # Shape [2, 2]
       inner_var = [[9. / 20, -9 / 20], [-9 / 20, 9 / 20]]
       # Shape [4, 2, 2, 2]
@@ -226,8 +229,8 @@ class MultinomialTest(test.TestCase):
     ns2 = np.random.randint(low=1, high=11, size=[6, 1]).astype(np.float32)
 
     with self.test_session():
-      dist = ds.Multinomial(ns, p)
-      dist2 = ds.Multinomial(ns2, p2)
+      dist = multinomial.Multinomial(ns, p)
+      dist2 = multinomial.Multinomial(ns2, p2)
 
       covariance = dist.covariance()
       covariance2 = dist2.covariance()
@@ -244,7 +247,8 @@ class MultinomialTest(test.TestCase):
     # doesn't support different total counts.
     n = np.float32(5)
     with self.test_session() as sess:
-      dist = ds.Multinomial(n, theta)  # batch_shape=[2], event_shape=[3]
+      # batch_shape=[2], event_shape=[3]
+      dist = multinomial.Multinomial(n, theta)
       x = dist.sample(int(250e3), seed=1)
       sample_mean = math_ops.reduce_mean(x, 0)
       x_centered = x - sample_mean[array_ops.newaxis, ...]
@@ -279,7 +283,7 @@ class MultinomialTest(test.TestCase):
 
   def testSampleUnbiasedNonScalarBatch(self):
     with self.test_session() as sess:
-      dist = ds.Multinomial(
+      dist = multinomial.Multinomial(
           total_count=5.,
           logits=math_ops.log(2. * self._rng.rand(4, 3, 2).astype(np.float32)))
       n = int(3e3)
@@ -308,7 +312,7 @@ class MultinomialTest(test.TestCase):
 
   def testSampleUnbiasedScalarBatch(self):
     with self.test_session() as sess:
-      dist = ds.Multinomial(
+      dist = multinomial.Multinomial(
           total_count=5.,
           logits=math_ops.log(2. * self._rng.rand(4).astype(np.float32)))
       n = int(5e3)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/normal_test.py b/tensorflow/python/kernel_tests/distributions/normal_test.py
similarity index 86%
rename from tensorflow/contrib/distributions/python/kernel_tests/normal_test.py
rename to tensorflow/python/kernel_tests/distributions/normal_test.py
index 9b70efaa37b74f388994728115f093b13a1c2968..07c7d6d11d0f3bcecfd1029295d3249c3ea8584b 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/normal_test.py
+++ b/tensorflow/python/kernel_tests/distributions/normal_test.py
@@ -18,12 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
 import math
 
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import normal as normal_lib
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,7 +31,21 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import normal as normal_lib
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+stats = try_import("scipy.stats")
 
 
 class NormalTest(test.TestCase):
@@ -89,10 +102,8 @@ class NormalTest(test.TestCase):
       sigma = constant_op.constant([math.sqrt(10.0)] * batch_size)
       x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
 
       log_pdf = normal.log_prob(x)
-      self.assertAllClose(expected_log_pdf, log_pdf.eval())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
                           log_pdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
@@ -101,12 +112,17 @@ class NormalTest(test.TestCase):
       self.assertAllEqual(normal.batch_shape, log_pdf.eval().shape)
 
       pdf = normal.prob(x)
-      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.eval().shape)
       self.assertAllEqual(normal.batch_shape, pdf.get_shape())
       self.assertAllEqual(normal.batch_shape, pdf.eval().shape)
 
+      if not stats:
+        return
+      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+
   def testNormalLogPDFMultidimensional(self):
     with self.test_session():
       batch_size = 6
@@ -115,12 +131,10 @@ class NormalTest(test.TestCase):
                                    batch_size)
       x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
 
       log_pdf = normal.log_prob(x)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
-      self.assertAllClose(expected_log_pdf, log_pdf_values)
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
                           log_pdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
@@ -131,12 +145,17 @@ class NormalTest(test.TestCase):
       pdf = normal.prob(x)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
-      self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), pdf_values.shape)
       self.assertAllEqual(normal.batch_shape, pdf.get_shape())
       self.assertAllEqual(normal.batch_shape, pdf_values.shape)
 
+      if not stats:
+        return
+      expected_log_pdf = stats.norm(mu.eval(), sigma.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf_values)
+      self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
+
   def testNormalCDF(self):
     with self.test_session():
       batch_size = 50
@@ -145,14 +164,15 @@ class NormalTest(test.TestCase):
       x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_cdf = stats.norm(mu, sigma).cdf(x)
-
       cdf = normal.cdf(x)
-      self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.eval().shape)
       self.assertAllEqual(normal.batch_shape, cdf.get_shape())
       self.assertAllEqual(normal.batch_shape, cdf.eval().shape)
+      if not stats:
+        return
+      expected_cdf = stats.norm(mu, sigma).cdf(x)
+      self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
 
   def testNormalSurvivalFunction(self):
     with self.test_session():
@@ -162,14 +182,16 @@ class NormalTest(test.TestCase):
       x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_sf = stats.norm(mu, sigma).sf(x)
 
       sf = normal.survival_function(x)
-      self.assertAllClose(expected_sf, sf.eval(), atol=0)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.eval().shape)
       self.assertAllEqual(normal.batch_shape, sf.get_shape())
       self.assertAllEqual(normal.batch_shape, sf.eval().shape)
+      if not stats:
+        return
+      expected_sf = stats.norm(mu, sigma).sf(x)
+      self.assertAllClose(expected_sf, sf.eval(), atol=0)
 
   def testNormalLogCDF(self):
     with self.test_session():
@@ -179,15 +201,18 @@ class NormalTest(test.TestCase):
       x = np.linspace(-100.0, 10.0, batch_size).astype(np.float64)
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_cdf = stats.norm(mu, sigma).logcdf(x)
 
       cdf = normal.log_cdf(x)
-      self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), cdf.eval().shape)
       self.assertAllEqual(normal.batch_shape, cdf.get_shape())
       self.assertAllEqual(normal.batch_shape, cdf.eval().shape)
 
+      if not stats:
+        return
+      expected_cdf = stats.norm(mu, sigma).logcdf(x)
+      self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
+
   def testFiniteGradientAtDifficultPoints(self):
     for dtype in [np.float32, np.float64]:
       g = ops.Graph()
@@ -216,15 +241,18 @@ class NormalTest(test.TestCase):
       x = np.linspace(-10.0, 100.0, batch_size).astype(np.float64)
 
       normal = normal_lib.Normal(loc=mu, scale=sigma)
-      expected_sf = stats.norm(mu, sigma).logsf(x)
 
       sf = normal.log_survival_function(x)
-      self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
       self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(), sf.eval().shape)
       self.assertAllEqual(normal.batch_shape, sf.get_shape())
       self.assertAllEqual(normal.batch_shape, sf.eval().shape)
 
+      if not stats:
+        return
+      expected_sf = stats.norm(mu, sigma).logsf(x)
+      self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
+
   def testNormalEntropyWithScalarInputs(self):
     # Scipy.stats.norm cannot deal with the shapes in the other test.
     with self.test_session():
@@ -232,16 +260,18 @@ class NormalTest(test.TestCase):
       sigma_v = 4.56
       normal = normal_lib.Normal(loc=mu_v, scale=sigma_v)
 
-      # scipy.stats.norm cannot deal with these shapes.
-      expected_entropy = stats.norm(mu_v, sigma_v).entropy()
       entropy = normal.entropy()
-      self.assertAllClose(expected_entropy, entropy.eval())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
                           entropy.get_shape())
       self.assertAllEqual(normal.batch_shape_tensor().eval(),
                           entropy.eval().shape)
       self.assertAllEqual(normal.batch_shape, entropy.get_shape())
       self.assertAllEqual(normal.batch_shape, entropy.eval().shape)
+      # scipy.stats.norm cannot deal with these shapes.
+      if not stats:
+        return
+      expected_entropy = stats.norm(mu_v, sigma_v).entropy()
+      self.assertAllClose(expected_entropy, entropy.eval())
 
   def testNormalEntropy(self):
     with self.test_session():
@@ -276,6 +306,54 @@ class NormalTest(test.TestCase):
       self.assertAllEqual((3,), normal.mode().get_shape())
       self.assertAllEqual([7., 7, 7], normal.mode().eval())
 
+  def testNormalQuantile(self):
+    with self.test_session():
+      batch_size = 52
+      mu = self._rng.randn(batch_size)
+      sigma = self._rng.rand(batch_size) + 1.0
+      p = np.linspace(0., 1.0, batch_size - 2).astype(np.float64)
+      # Quantile performs piecewise rational approximation so adding some
+      # special input values to make sure we hit all the pieces.
+      p = np.hstack((p, np.exp(-33), 1. - np.exp(-33)))
+
+      normal = normal_lib.Normal(loc=mu, scale=sigma)
+      x = normal.quantile(p)
+
+      self.assertAllEqual(normal.batch_shape_tensor().eval(), x.get_shape())
+      self.assertAllEqual(normal.batch_shape_tensor().eval(), x.eval().shape)
+      self.assertAllEqual(normal.batch_shape, x.get_shape())
+      self.assertAllEqual(normal.batch_shape, x.eval().shape)
+
+      if not stats:
+        return
+      expected_x = stats.norm(mu, sigma).ppf(p)
+      self.assertAllClose(expected_x, x.eval(), atol=0.)
+
+  def _baseQuantileFiniteGradientAtDifficultPoints(self, dtype):
+    g = ops.Graph()
+    with g.as_default():
+      mu = variables.Variable(dtype(0.0))
+      sigma = variables.Variable(dtype(1.0))
+      dist = normal_lib.Normal(loc=mu, scale=sigma)
+      p = variables.Variable(
+          np.array([0.,
+                    np.exp(-32.), np.exp(-2.),
+                    1. - np.exp(-2.), 1. - np.exp(-32.),
+                    1.]).astype(dtype))
+
+      value = dist.quantile(p)
+      grads = gradients_impl.gradients(value, [mu, p])
+      with self.test_session(graph=g):
+        variables.global_variables_initializer().run()
+        self.assertAllFinite(grads[0])
+        self.assertAllFinite(grads[1])
+
+  def testQuantileFiniteGradientAtDifficultPointsFloat32(self):
+    self._baseQuantileFiniteGradientAtDifficultPoints(np.float32)
+
+  def testQuantileFiniteGradientAtDifficultPointsFloat64(self):
+    self._baseQuantileFiniteGradientAtDifficultPoints(np.float64)
+
   def testNormalVariance(self):
     with self.test_session():
       # sigma will be broadcast to [7, 7, 7]
@@ -404,7 +482,7 @@ class NormalTest(test.TestCase):
       n_a = normal_lib.Normal(loc=mu_a, scale=sigma_a)
       n_b = normal_lib.Normal(loc=mu_b, scale=sigma_b)
 
-      kl = kullback_leibler.kl(n_a, n_b)
+      kl = kullback_leibler.kl_divergence(n_a, n_b)
       kl_val = sess.run(kl)
 
       kl_expected = ((mu_a - mu_b)**2 / (2 * sigma_b**2) + 0.5 * (
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
similarity index 85%
rename from tensorflow/contrib/distributions/python/kernel_tests/special_math_test.py
rename to tensorflow/python/kernel_tests/distributions/special_math_test.py
index 795087e6a4c8e87bf54634990be21c61527cc8b5..dc462bae56b5fbc18036e80f6bbd4177b7b9fff2 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -19,18 +19,30 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import importlib
 
 import numpy as np
-from scipy import special
-from scipy import stats
 
-from tensorflow.contrib.distributions.python.ops import special_math
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import variables
+from tensorflow.python.ops.distributions import special_math
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
 
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+special = try_import("scipy.special")
+stats = try_import("scipy.stats")
 sm = special_math
 
 
@@ -56,6 +68,51 @@ GridSpec = collections.namedtuple("GridSpec", ["min", "max", "shape"])
 ErrorSpec = collections.namedtuple("ErrorSpec", ["rtol", "atol"])
 
 
+class NdtriTest(test.TestCase):
+
+  def assertAllFinite(self, tensor):
+    is_finite = np.isfinite(tensor.eval())
+    all_true = np.ones_like(is_finite, dtype=np.bool)
+    self.assertAllEqual(all_true, is_finite)
+
+  def testNdtri(self):
+    """Verifies that ndtri computation is correct."""
+    with self.test_session():
+      if not special:
+        return
+
+      p = np.linspace(0., 1.0, 50).astype(np.float64)
+      # Quantile performs piecewise rational approximation so adding some
+      # special input values to make sure we hit all the pieces.
+      p = np.hstack((p, np.exp(-32), 1. - np.exp(-32),
+                     np.exp(-2), 1. - np.exp(-2)))
+      expected_x = special.ndtri(p)
+      x = special_math.ndtri(p)
+      self.assertAllClose(expected_x, x.eval(), atol=0.)
+
+  def _baseNdtriFiniteGradientTest(self, dtype):
+    """Verifies that ndtri has finite gradients at interesting points."""
+    g = ops.Graph()
+    with g.as_default():
+      # Tests gradients at 0, 1, and piece-wise boundaries.
+      p = variables.Variable(
+          np.array([0.,
+                    np.exp(-32.), np.exp(-2.),
+                    1. - np.exp(-2.), 1. - np.exp(-32.),
+                    1.]).astype(dtype))
+    value = special_math.ndtri(p)
+    grads = gradients_impl.gradients(value, p)
+    with self.test_session(graph=g):
+      variables.global_variables_initializer().run()
+      self.assertAllFinite(grads[0])
+
+  def testNdtriFiniteGradientFloat32(self):
+    self._baseNdtriFiniteGradientTest(np.float32)
+
+  def testNdtriFiniteGradientFloat64(self):
+    self._baseNdtriFiniteGradientTest(np.float64)
+
+
 class NdtrTest(test.TestCase):
   _use_log = False
   # Grid min/max chosen to ensure 0 < cdf(x) < 1.
@@ -71,6 +128,9 @@ class NdtrTest(test.TestCase):
       self._test_grid_no_log(dtype, grid_spec, error_spec)
 
   def _test_grid_log(self, dtype, grid_spec, error_spec):
+    if not special:
+      return
+
     with self.test_session():
       grid = _make_grid(dtype, grid_spec)
       actual = sm.log_ndtr(grid).eval()
@@ -95,6 +155,9 @@ class NdtrTest(test.TestCase):
           atol=error_spec.atol)
 
   def _test_grid_no_log(self, dtype, grid_spec, error_spec):
+    if not special:
+      return
+
     with self.test_session():
       grid = _make_grid(dtype, grid_spec)
       actual = sm.ndtr(grid).eval()
@@ -225,6 +288,9 @@ class NdtrGradientTest(test.TestCase):
       self.assert_all_true(np.isfinite(grad_eval))
 
       # Versus scipy.
+      if not (special and stats):
+        return
+
       expected = stats.norm.pdf(raw_grid)
       if self._use_log:
         expected /= special.ndtr(raw_grid)
@@ -281,6 +347,9 @@ class LogCDFLaplaceTest(test.TestCase):
       _check_strictly_increasing(actual)
 
       # Versus scipy.
+      if not stats:
+        return
+
       scipy_dist = stats.laplace(loc=0., scale=1.)
       expected = scipy_dist.logcdf(grid.astype(scipy_dtype))
       self.assertAllClose(
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py b/tensorflow/python/kernel_tests/distributions/student_t_test.py
similarity index 83%
rename from tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py
rename to tensorflow/python/kernel_tests/distributions/student_t_test.py
index 209ef696caa96411210a054dd473da88db80c76f..f1150de58e0dae5da25f74f95fb391c340a01262 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py
+++ b/tensorflow/python/kernel_tests/distributions/student_t_test.py
@@ -18,19 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
 import math
 
 import numpy as np
-from scipy import stats
-from tensorflow.contrib import distributions
-from tensorflow.contrib.distributions.python.ops import student_t
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import student_t
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
 
-ds = distributions
+
+stats = try_import("scipy.stats")
 
 
 class StudentTTest(test.TestCase):
@@ -45,7 +56,7 @@ class StudentTTest(test.TestCase):
       mu_v = 7.
       sigma_v = 8.
       t = np.array([-2.5, 2.5, 8., 0., -1., 2.], dtype=np.float32)
-      student = ds.StudentT(df, loc=mu, scale=-sigma)
+      student = student_t.StudentT(df, loc=mu, scale=-sigma)
 
       log_pdf = student.log_prob(t)
       self.assertEquals(log_pdf.get_shape(), (6,))
@@ -54,6 +65,9 @@ class StudentTTest(test.TestCase):
       self.assertEquals(pdf.get_shape(), (6,))
       pdf_values = pdf.eval()
 
+      if not stats:
+        return
+
       expected_log_pdf = stats.t.logpdf(t, df_v, loc=mu_v, scale=sigma_v)
       expected_pdf = stats.t.pdf(t, df_v, loc=mu_v, scale=sigma_v)
       self.assertAllClose(expected_log_pdf, log_pdf_values)
@@ -72,13 +86,16 @@ class StudentTTest(test.TestCase):
       mu_v = np.array([3., -3.])
       sigma_v = np.array([np.sqrt(10.), np.sqrt(15.)])
       t = np.array([[-2.5, 2.5, 4., 0., -1., 2.]], dtype=np.float32).T
-      student = ds.StudentT(df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df, loc=mu, scale=sigma)
       log_pdf = student.log_prob(t)
       log_pdf_values = log_pdf.eval()
       self.assertEqual(log_pdf.get_shape(), (6, 2))
       pdf = student.prob(t)
       pdf_values = pdf.eval()
       self.assertEqual(pdf.get_shape(), (6, 2))
+
+      if not stats:
+        return
       expected_log_pdf = stats.t.logpdf(t, df_v, loc=mu_v, scale=sigma_v)
       expected_pdf = stats.t.pdf(t, df_v, loc=mu_v, scale=sigma_v)
       self.assertAllClose(expected_log_pdf, log_pdf_values)
@@ -105,6 +122,8 @@ class StudentTTest(test.TestCase):
       self.assertEquals(cdf.get_shape(), (6,))
       cdf_values = cdf.eval()
 
+      if not stats:
+        return
       expected_log_cdf = stats.t.logcdf(t, df_v, loc=mu_v, scale=sigma_v)
       expected_cdf = stats.t.cdf(t, df_v, loc=mu_v, scale=sigma_v)
       self.assertAllClose(expected_log_cdf, log_cdf_values, atol=0., rtol=1e-5)
@@ -119,7 +138,7 @@ class StudentTTest(test.TestCase):
     mu_v = np.array([[1., -1, 0]])  # 1x3
     sigma_v = np.array([[1., -2., 3.]]).T  # transposed => 3x1
     with self.test_session():
-      student = ds.StudentT(df=df_v, loc=mu_v, scale=sigma_v)
+      student = student_t.StudentT(df=df_v, loc=mu_v, scale=sigma_v)
       ent = student.entropy()
       ent_values = ent.eval()
 
@@ -128,6 +147,8 @@ class StudentTTest(test.TestCase):
     sigma_bc = np.abs(sigma_v) * ones
     mu_bc = ones.T * mu_v
     df_bc = ones.T * df_v
+    if not stats:
+      return
     expected_entropy = stats.t.entropy(
         np.reshape(df_bc, [-1]),
         loc=np.reshape(mu_bc, [-1]),
@@ -144,7 +165,7 @@ class StudentTTest(test.TestCase):
       mu_v = 3.
       sigma_v = np.sqrt(10.)
       n = constant_op.constant(200000)
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       samples = student.sample(n, seed=123456)
       sample_values = samples.eval()
       n_val = 200000
@@ -166,11 +187,13 @@ class StudentTTest(test.TestCase):
       n = constant_op.constant(100)
 
       random_seed.set_random_seed(654321)
-      student = ds.StudentT(df=df, loc=mu, scale=sigma, name="student_t1")
+      student = student_t.StudentT(
+          df=df, loc=mu, scale=sigma, name="student_t1")
       samples1 = student.sample(n, seed=123456).eval()
 
       random_seed.set_random_seed(654321)
-      student2 = ds.StudentT(df=df, loc=mu, scale=sigma, name="student_t2")
+      student2 = student_t.StudentT(
+          df=df, loc=mu, scale=sigma, name="student_t2")
       samples2 = student2.sample(n, seed=123456).eval()
 
       self.assertAllClose(samples1, samples2)
@@ -180,7 +203,7 @@ class StudentTTest(test.TestCase):
       df_v = [1e-1, 1e-5, 1e-10, 1e-20]
       df = constant_op.constant(df_v)
       n = constant_op.constant(200000)
-      student = ds.StudentT(df=df, loc=1., scale=1.)
+      student = student_t.StudentT(df=df, loc=1., scale=1.)
       samples = student.sample(n, seed=123456)
       sample_values = samples.eval()
       n_val = 200000
@@ -198,7 +221,7 @@ class StudentTTest(test.TestCase):
       mu_v = [3., -3.]
       sigma_v = [np.sqrt(10.), np.sqrt(15.)]
       n = constant_op.constant(200000)
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       samples = student.sample(n, seed=123456)
       sample_values = samples.eval()
       self.assertEqual(samples.get_shape(), (200000, batch_size, 2))
@@ -222,6 +245,8 @@ class StudentTTest(test.TestCase):
   def _checkKLApprox(self, df, mu, sigma, samples):
     n = samples.size
     np.random.seed(137)
+    if not stats:
+      return
     sample_scipy = stats.t.rvs(df, loc=mu, scale=sigma, size=n)
     covg = 0.99
     r = stats.t.interval(covg, df, loc=mu, scale=sigma)
@@ -247,9 +272,9 @@ class StudentTTest(test.TestCase):
       self.assertEqual(student.prob(2.).get_shape(), (3,))
       self.assertEqual(student.sample(37, seed=123456).get_shape(), (37, 3,))
 
-    _check(ds.StudentT(df=[2., 3., 4.,], loc=2., scale=1.))
-    _check(ds.StudentT(df=7., loc=[2., 3., 4.,], scale=1.))
-    _check(ds.StudentT(df=7., loc=3., scale=[2., 3., 4.,]))
+    _check(student_t.StudentT(df=[2., 3., 4.,], loc=2., scale=1.))
+    _check(student_t.StudentT(df=7., loc=[2., 3., 4.,], scale=1.))
+    _check(student_t.StudentT(df=7., loc=3., scale=[2., 3., 4.,]))
 
   def testBroadcastingPdfArgs(self):
 
@@ -266,9 +291,9 @@ class StudentTTest(test.TestCase):
       xs = xs.T
       _assert_shape(student, xs, (3, 3))
 
-    _check(ds.StudentT(df=[2., 3., 4.,], loc=2., scale=1.))
-    _check(ds.StudentT(df=7., loc=[2., 3., 4.,], scale=1.))
-    _check(ds.StudentT(df=7., loc=3., scale=[2., 3., 4.,]))
+    _check(student_t.StudentT(df=[2., 3., 4.,], loc=2., scale=1.))
+    _check(student_t.StudentT(df=7., loc=[2., 3., 4.,], scale=1.))
+    _check(student_t.StudentT(df=7., loc=3., scale=[2., 3., 4.,]))
 
     def _check2d(student):
       _assert_shape(student, 2., (1, 3))
@@ -279,9 +304,9 @@ class StudentTTest(test.TestCase):
       xs = xs.T
       _assert_shape(student, xs, (3, 3))
 
-    _check2d(ds.StudentT(df=[[2., 3., 4.,]], loc=2., scale=1.))
-    _check2d(ds.StudentT(df=7., loc=[[2., 3., 4.,]], scale=1.))
-    _check2d(ds.StudentT(df=7., loc=3., scale=[[2., 3., 4.,]]))
+    _check2d(student_t.StudentT(df=[[2., 3., 4.,]], loc=2., scale=1.))
+    _check2d(student_t.StudentT(df=7., loc=[[2., 3., 4.,]], scale=1.))
+    _check2d(student_t.StudentT(df=7., loc=3., scale=[[2., 3., 4.,]]))
 
     def _check2d_rows(student):
       _assert_shape(student, 2., (3, 1))
@@ -292,22 +317,23 @@ class StudentTTest(test.TestCase):
       xs = xs.T  # (3,1)
       _assert_shape(student, xs, (3, 1))
 
-    _check2d_rows(ds.StudentT(df=[[2.], [3.], [4.]], loc=2., scale=1.))
-    _check2d_rows(ds.StudentT(df=7., loc=[[2.], [3.], [4.]], scale=1.))
-    _check2d_rows(ds.StudentT(df=7., loc=3., scale=[[2.], [3.], [4.]]))
+    _check2d_rows(student_t.StudentT(df=[[2.], [3.], [4.]], loc=2., scale=1.))
+    _check2d_rows(student_t.StudentT(df=7., loc=[[2.], [3.], [4.]], scale=1.))
+    _check2d_rows(student_t.StudentT(df=7., loc=3., scale=[[2.], [3.], [4.]]))
 
   def testMeanAllowNanStatsIsFalseWorksWhenAllBatchMembersAreDefined(self):
     with self.test_session():
       mu = [1., 3.3, 4.4]
-      student = ds.StudentT(df=[3., 5., 7.], loc=mu, scale=[3., 2., 1.])
+      student = student_t.StudentT(df=[3., 5., 7.], loc=mu, scale=[3., 2., 1.])
       mean = student.mean().eval()
       self.assertAllClose([1., 3.3, 4.4], mean)
 
   def testMeanAllowNanStatsIsFalseRaisesWhenBatchMemberIsUndefined(self):
     with self.test_session():
       mu = [1., 3.3, 4.4]
-      student = ds.StudentT(df=[0.5, 5., 7.], loc=mu, scale=[3., 2., 1.],
-                            allow_nan_stats=False)
+      student = student_t.StudentT(
+          df=[0.5, 5., 7.], loc=mu, scale=[3., 2., 1.],
+          allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
         student.mean().eval()
 
@@ -315,8 +341,9 @@ class StudentTTest(test.TestCase):
     with self.test_session():
       mu = [-2, 0., 1., 3.3, 4.4]
       sigma = [5., 4., 3., 2., 1.]
-      student = ds.StudentT(df=[0.5, 1., 3., 5., 7.], loc=mu, scale=sigma,
-                            allow_nan_stats=True)
+      student = student_t.StudentT(
+          df=[0.5, 1., 3., 5., 7.], loc=mu, scale=sigma,
+          allow_nan_stats=True)
       mean = student.mean().eval()
       self.assertAllClose([np.nan, np.nan, 1., 3.3, 4.4], mean)
 
@@ -327,7 +354,8 @@ class StudentTTest(test.TestCase):
       df = [0.5, 1.5, 3., 5., 7.]
       mu = [-2, 0., 1., 3.3, 4.4]
       sigma = [5., 4., 3., 2., 1.]
-      student = ds.StudentT(df=df, loc=mu, scale=sigma, allow_nan_stats=True)
+      student = student_t.StudentT(
+          df=df, loc=mu, scale=sigma, allow_nan_stats=True)
       var = student.variance().eval()
       ## scipy uses inf for variance when the mean is undefined.  When mean is
       # undefined we say variance is undefined as well.  So test the first
@@ -336,6 +364,8 @@ class StudentTTest(test.TestCase):
       self.assertTrue(np.isnan(var[0]))
       var[0] = np.inf
 
+      if not stats:
+        return
       expected_var = [
           stats.t.var(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma)
       ]
@@ -348,9 +378,11 @@ class StudentTTest(test.TestCase):
       df = [1.5, 3., 5., 7.]
       mu = [0., 1., 3.3, 4.4]
       sigma = [4., 3., 2., 1.]
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       var = student.variance().eval()
 
+      if not stats:
+        return
       expected_var = [
           stats.t.var(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma)
       ]
@@ -359,13 +391,15 @@ class StudentTTest(test.TestCase):
   def testVarianceAllowNanStatsFalseRaisesForUndefinedBatchMembers(self):
     with self.test_session():
       # df <= 1 ==> variance not defined
-      student = ds.StudentT(df=1., loc=0., scale=1., allow_nan_stats=False)
+      student = student_t.StudentT(
+          df=1., loc=0., scale=1., allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
         student.variance().eval()
 
     with self.test_session():
       # df <= 1 ==> variance not defined
-      student = ds.StudentT(df=0.5, loc=0., scale=1., allow_nan_stats=False)
+      student = student_t.StudentT(
+          df=0.5, loc=0., scale=1., allow_nan_stats=False)
       with self.assertRaisesOpError("x < y"):
         student.variance().eval()
 
@@ -375,11 +409,13 @@ class StudentTTest(test.TestCase):
       df = [3.5, 5., 3., 5., 7.]
       mu = [-2.2]
       sigma = [5., 4., 3., 2., 1.]
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       # Test broadcast of mu across shape of df/sigma
       stddev = student.stddev().eval()
       mu *= len(df)
 
+      if not stats:
+        return
       expected_stddev = [
           stats.t.std(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma)
       ]
@@ -390,14 +426,14 @@ class StudentTTest(test.TestCase):
       df = [0.5, 1., 3]
       mu = [-1, 0., 1]
       sigma = [5., 4., 3.]
-      student = ds.StudentT(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentT(df=df, loc=mu, scale=sigma)
       # Test broadcast of mu across shape of df/sigma
       mode = student.mode().eval()
       self.assertAllClose([-1., 0, 1], mode)
 
   def testPdfOfSample(self):
     with self.test_session() as sess:
-      student = ds.StudentT(df=3., loc=np.pi, scale=1.)
+      student = student_t.StudentT(df=3., loc=np.pi, scale=1.)
       num = 20000
       samples = student.sample(num, seed=123456)
       pdfs = student.prob(samples)
@@ -410,13 +446,15 @@ class StudentTTest(test.TestCase):
       self.assertEqual(mean.get_shape(), ())
       self.assertNear(np.pi, np.mean(sample_vals), err=0.02)
       self.assertNear(np.pi, mean_val, err=1e-6)
-      self.assertNear(stats.t.pdf(np.pi, 3., loc=np.pi), mean_pdf_val, err=1e-6)
       # Verify integral over sample*pdf ~= 1.
       self._assertIntegral(sample_vals, pdf_vals, err=2e-3)
+      if not stats:
+        return
+      self.assertNear(stats.t.pdf(np.pi, 3., loc=np.pi), mean_pdf_val, err=1e-6)
 
   def testPdfOfSampleMultiDims(self):
     with self.test_session() as sess:
-      student = ds.StudentT(df=[7., 11.], loc=[[5.], [6.]], scale=3.)
+      student = student_t.StudentT(df=[7., 11.], loc=[[5.], [6.]], scale=3.)
       self.assertAllEqual([], student.event_shape)
       self.assertAllEqual([], student.event_shape_tensor().eval())
       self.assertAllEqual([2, 2], student.batch_shape)
@@ -429,6 +467,12 @@ class StudentTTest(test.TestCase):
       self.assertEqual(pdfs.get_shape(), (num, 2, 2))
       self.assertNear(5., np.mean(sample_vals[:, 0, :]), err=.03)
       self.assertNear(6., np.mean(sample_vals[:, 1, :]), err=.03)
+      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
+      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
+      if not stats:
+        return
       self.assertNear(
           stats.t.var(7., loc=0., scale=3.),  # loc d.n. effect var
           np.var(sample_vals[:, :, 0]),
@@ -437,10 +481,6 @@ class StudentTTest(test.TestCase):
           stats.t.var(11., loc=0., scale=3.),  # loc d.n. effect var
           np.var(sample_vals[:, :, 1]),
           err=.4)
-      self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02)
-      self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02)
 
   def _assertIntegral(self, sample_vals, pdf_vals, err=1.5e-3):
     s_p = zip(sample_vals, pdf_vals)
@@ -454,8 +494,8 @@ class StudentTTest(test.TestCase):
 
   def testNegativeDofFails(self):
     with self.test_session():
-      student = ds.StudentT(df=[2, -5.], loc=0., scale=1.,
-                            validate_args=True, name="S")
+      student = student_t.StudentT(df=[2, -5.], loc=0., scale=1.,
+                                   validate_args=True, name="S")
       with self.assertRaisesOpError(r"Condition x > 0 did not hold"):
         student.mean().eval()
 
@@ -464,7 +504,8 @@ class StudentTTest(test.TestCase):
       df = constant_op.constant([-3.2, -4.6])
       mu = constant_op.constant([-4.2, 3.4])
       sigma = constant_op.constant([-6.4, -8.8])
-      student = ds.StudentTWithAbsDfSoftplusScale(df=df, loc=mu, scale=sigma)
+      student = student_t.StudentTWithAbsDfSoftplusScale(
+          df=df, loc=mu, scale=sigma)
       self.assertAllClose(
           math_ops.floor(math_ops.abs(df)).eval(), student.df.eval())
       self.assertAllClose(mu.eval(), student.loc.eval())
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py b/tensorflow/python/kernel_tests/distributions/uniform_test.py
similarity index 93%
rename from tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py
rename to tensorflow/python/kernel_tests/distributions/uniform_test.py
index c3c97b98f0d59fe3e7d632e40e61c3e4738a50b5..df99a0ed257da20179909eb44eacf7d44528dad2 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py
+++ b/tensorflow/python/kernel_tests/distributions/uniform_test.py
@@ -18,15 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import importlib
+
 import numpy as np
-from scipy import stats
-from tensorflow.contrib.distributions.python.ops import uniform as uniform_lib
+
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import uniform as uniform_lib
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
 
 
 class UniformTest(test.TestCase):
@@ -126,7 +141,7 @@ class UniformTest(test.TestCase):
       b_v = np.array([1.0, 2.0, 3.0], dtype=np.float32)
       uniform = uniform_lib.Uniform(low=a_v, high=b_v, validate_args=True)
 
-      with self.assertRaisesWithPredicateMatch(errors_impl.InvalidArgumentError,
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
                                                "x < y"):
         uniform.low.eval()
 
@@ -187,6 +202,8 @@ class UniformTest(test.TestCase):
       a = 10.0
       b = 100.0
       uniform = uniform_lib.Uniform(low=a, high=b)
+      if not stats:
+        return
       s_uniform = stats.uniform(loc=a, scale=b - a)
       self.assertAllClose(uniform.mean().eval(), s_uniform.mean())
 
@@ -195,6 +212,8 @@ class UniformTest(test.TestCase):
       a = 10.0
       b = 100.0
       uniform = uniform_lib.Uniform(low=a, high=b)
+      if not stats:
+        return
       s_uniform = stats.uniform(loc=a, scale=b - a)
       self.assertAllClose(uniform.variance().eval(), s_uniform.var())
 
@@ -203,6 +222,8 @@ class UniformTest(test.TestCase):
       a = 10.0
       b = 100.0
       uniform = uniform_lib.Uniform(low=a, high=b)
+      if not stats:
+        return
       s_uniform = stats.uniform(loc=a, scale=b - a)
       self.assertAllClose(uniform.stddev().eval(), s_uniform.std())
 
diff --git a/tensorflow/python/kernel_tests/division_future_test.py b/tensorflow/python/kernel_tests/division_future_test.py
index 4d943decf2ab93f7572e0689e85c1e6a75cd8430..e681b32856ab5e91d11d42490a0c137d1f39f8c9 100644
--- a/tensorflow/python/kernel_tests/division_future_test.py
+++ b/tensorflow/python/kernel_tests/division_future_test.py
@@ -38,15 +38,19 @@ class DivisionTestCase(test.TestCase):
     # TODO(irving): Test int8, int16 once we support casts for those.
     dtypes = np.int32, np.int64, np.float32, np.float64
 
+    tensors = []
+    checks = []
+
     def check(x, y):
-      if isinstance(x, ops.Tensor):
-        x = x.eval()
-      if isinstance(y, ops.Tensor):
-        y = y.eval()
-      self.assertEqual(x.dtype, y.dtype)
-      self.assertEqual(x, y)
+      x = ops.convert_to_tensor(x)
+      y = ops.convert_to_tensor(y)
+      tensors.append((x, y))
+      def f(x, y):
+        self.assertEqual(x.dtype, y.dtype)
+        self.assertEqual(x, y)
+      checks.append(f)
 
-    with self.test_session():
+    with self.test_session() as sess:
       for dtype in dtypes:
         for x in map(dtype, values):
           for y in map(dtype, values):
@@ -60,6 +64,9 @@ class DivisionTestCase(test.TestCase):
                 floordiv = x // y
                 tf_floordiv = tf_x // tf_y
                 check(floordiv, tf_floordiv)
+      # Do only one sess.run for speed
+      for f, (x, y) in zip(checks, sess.run(tensors)):
+        f(x, y)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/division_past_test.py b/tensorflow/python/kernel_tests/division_past_test.py
index 8f446d01bb3648a287fb6d1ad5a95815703f7555..2ff2f894077ebd2ec418deb984170beac31e0d08 100644
--- a/tensorflow/python/kernel_tests/division_past_test.py
+++ b/tensorflow/python/kernel_tests/division_past_test.py
@@ -38,15 +38,19 @@ class DivisionTestCase(test.TestCase):
     # TODO(irving): Test int8, int16 once we support casts for those.
     dtypes = np.int32, np.int64, np.float32, np.float64
 
+    tensors = []
+    checks = []
+
     def check(x, y):
-      if isinstance(x, ops.Tensor):
-        x = x.eval()
-      if isinstance(y, ops.Tensor):
-        y = y.eval()
-      self.assertEqual(x.dtype, y.dtype)
-      self.assertEqual(x, y)
+      x = ops.convert_to_tensor(x)
+      y = ops.convert_to_tensor(y)
+      tensors.append((x, y))
+      def f(x, y):
+        self.assertEqual(x.dtype, y.dtype)
+        self.assertEqual(x, y)
+      checks.append(f)
 
-    with self.test_session():
+    with self.test_session() as sess:
       for dtype in dtypes:
         for x in map(dtype, values):
           for y in map(dtype, values):
@@ -60,6 +64,9 @@ class DivisionTestCase(test.TestCase):
                 floordiv = x // y
                 tf_floordiv = tf_x // tf_y
                 check(floordiv, tf_floordiv)
+      # Do only one sess.run for speed
+      for f, (x, y) in zip(checks, sess.run(tensors)):
+        f(x, y)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 8cd378825701e2aea2251a76095e89bb18eed1ba..2bd21fb01d1f187af9cf4cf9670d0fd3948a7df8 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-import tensorflow.python.ops.data_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import compat
diff --git a/tensorflow/python/kernel_tests/fft_ops_test.py b/tensorflow/python/kernel_tests/fft_ops_test.py
index f66996494892e55585d96eba6d234c47251091fd..84928bd2e1fe2fe03a4ae7c3564651e2a9a9c3cc 100644
--- a/tensorflow/python/kernel_tests/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/fft_ops_test.py
@@ -39,30 +39,26 @@ class BaseFFTOpsTest(test.TestCase):
     self._CompareBackward(x, rank, fft_length, use_placeholder)
 
   def _CompareForward(self, x, rank, fft_length=None, use_placeholder=False):
-    if test.is_gpu_available(cuda_only=True):
-      x_np = self._npFFT(x, rank, fft_length)
-      if use_placeholder:
-        x_ph = array_ops.placeholder(dtype=dtypes.as_dtype(x.dtype))
-        x_tf = self._tfFFT(x_ph, rank, fft_length, use_gpu=True,
-                           feed_dict={x_ph: x})
-      else:
-        x_tf = self._tfFFT(x, rank, fft_length, use_gpu=True)
+    x_np = self._npFFT(x, rank, fft_length)
+    if use_placeholder:
+      x_ph = array_ops.placeholder(dtype=dtypes.as_dtype(x.dtype))
+      x_tf = self._tfFFT(
+          x_ph, rank, fft_length, use_gpu=True, feed_dict={x_ph: x})
+    else:
+      x_tf = self._tfFFT(x, rank, fft_length, use_gpu=True)
 
-      # GPU/Forward
-      self.assertAllClose(x_np, x_tf, rtol=1e-4, atol=1e-4)
+    self.assertAllClose(x_np, x_tf, rtol=1e-4, atol=1e-4)
 
   def _CompareBackward(self, x, rank, fft_length=None, use_placeholder=False):
-    if test.is_gpu_available(cuda_only=True):
-      x_np = self._npIFFT(x, rank, fft_length)
-      if use_placeholder:
-        x_ph = array_ops.placeholder(dtype=dtypes.as_dtype(x.dtype))
-        x_tf = self._tfIFFT(x_ph, rank, fft_length, use_gpu=True,
-                            feed_dict={x_ph: x})
-      else:
-        x_tf = self._tfIFFT(x, rank, fft_length, use_gpu=True)
+    x_np = self._npIFFT(x, rank, fft_length)
+    if use_placeholder:
+      x_ph = array_ops.placeholder(dtype=dtypes.as_dtype(x.dtype))
+      x_tf = self._tfIFFT(
+          x_ph, rank, fft_length, use_gpu=True, feed_dict={x_ph: x})
+    else:
+      x_tf = self._tfIFFT(x, rank, fft_length, use_gpu=True)
 
-      # GPU/Backward
-      self.assertAllClose(x_np, x_tf, rtol=1e-4, atol=1e-4)
+    self.assertAllClose(x_np, x_tf, rtol=1e-4, atol=1e-4)
 
   def _checkGradComplex(self, func, x, y, result_is_complex=True,
                         use_gpu=False):
@@ -151,12 +147,11 @@ class FFTOpsTest(BaseFFTOpsTest):
       raise ValueError("invalid rank")
 
   def testEmpty(self):
-    if test.is_gpu_available(cuda_only=True):
-      for rank in VALID_FFT_RANKS:
-        for dims in xrange(rank, rank + 3):
-          x = np.zeros((0,) * dims).astype(np.complex64)
-          self.assertEqual(x.shape, self._tfFFT(x, rank).shape)
-          self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
+    for rank in VALID_FFT_RANKS:
+      for dims in xrange(rank, rank + 3):
+        x = np.zeros((0,) * dims).astype(np.complex64)
+        self.assertEqual(x.shape, self._tfFFT(x, rank).shape)
+        self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
   def testBasic(self):
     for rank in VALID_FFT_RANKS:
@@ -184,41 +179,41 @@ class FFTOpsTest(BaseFFTOpsTest):
         self._Compare(gen((4,) * dims), rank)
 
   def testError(self):
-    if test.is_gpu_available(cuda_only=True):
-      for rank in VALID_FFT_RANKS:
-        for dims in xrange(0, rank):
-          x = np.zeros((1,) * dims).astype(np.complex64)
-          with self.assertRaisesWithPredicateMatch(
-              ValueError, "Shape must be .*rank {}.*".format(rank)):
-            self._tfFFT(x, rank)
-          with self.assertRaisesWithPredicateMatch(
-              ValueError, "Shape must be .*rank {}.*".format(rank)):
-            self._tfIFFT(x, rank)
+    for rank in VALID_FFT_RANKS:
+      for dims in xrange(0, rank):
+        x = np.zeros((1,) * dims).astype(np.complex64)
+        with self.assertRaisesWithPredicateMatch(
+            ValueError, "Shape must be .*rank {}.*".format(rank)):
+          self._tfFFT(x, rank)
+        with self.assertRaisesWithPredicateMatch(
+            ValueError, "Shape must be .*rank {}.*".format(rank)):
+          self._tfIFFT(x, rank)
 
   def testGrad_Simple(self):
-    if test.is_gpu_available(cuda_only=True):
-      for rank in VALID_FFT_RANKS:
-        for dims in xrange(rank, rank + 2):
-          re = np.ones(shape=(4,) * dims, dtype=np.float32) / 10.0
-          im = np.zeros(shape=(4,) * dims, dtype=np.float32)
-          self._checkGradComplex(self._tfFFTForRank(rank), re, im, use_gpu=True)
-          self._checkGradComplex(
-              self._tfIFFTForRank(rank), re, im, use_gpu=True)
+    for rank in VALID_FFT_RANKS:
+      for dims in xrange(rank, rank + 2):
+        re = np.ones(shape=(4,) * dims, dtype=np.float32) / 10.0
+        im = np.zeros(shape=(4,) * dims, dtype=np.float32)
+        self._checkGradComplex(self._tfFFTForRank(rank), re, im, use_gpu=True)
+        self._checkGradComplex(self._tfIFFTForRank(rank), re, im, use_gpu=True)
 
   def testGrad_Random(self):
-    if test.is_gpu_available(cuda_only=True):
-      np.random.seed(54321)
-      for rank in VALID_FFT_RANKS:
-        for dims in xrange(rank, rank + 2):
-          re = np.random.rand(*((3,) * dims)).astype(np.float32) * 2 - 1
-          im = np.random.rand(*((3,) * dims)).astype(np.float32) * 2 - 1
-          self._checkGradComplex(self._tfFFTForRank(rank), re, im, use_gpu=True)
-          self._checkGradComplex(
-              self._tfIFFTForRank(rank), re, im, use_gpu=True)
+    np.random.seed(54321)
+    for rank in VALID_FFT_RANKS:
+      for dims in xrange(rank, rank + 2):
+        re = np.random.rand(*((3,) * dims)).astype(np.float32) * 2 - 1
+        im = np.random.rand(*((3,) * dims)).astype(np.float32) * 2 - 1
+        self._checkGradComplex(self._tfFFTForRank(rank), re, im, use_gpu=True)
+        self._checkGradComplex(self._tfIFFTForRank(rank), re, im, use_gpu=True)
 
 
 class RFFTOpsTest(BaseFFTOpsTest):
 
+  def _CompareBackward(self, x, rank, fft_length=None, use_placeholder=False):
+    if test.is_gpu_available(cuda_only=True):
+      super(RFFTOpsTest, self)._CompareBackward(x, rank, fft_length,
+                                                use_placeholder)
+
   def _tfFFT(self, x, rank, fft_length=None, use_gpu=False, feed_dict=None):
     with self.test_session(use_gpu=use_gpu):
       return self._tfFFTForRank(rank)(x, fft_length).eval(feed_dict=feed_dict)
@@ -268,12 +263,12 @@ class RFFTOpsTest(BaseFFTOpsTest):
       raise ValueError("invalid rank")
 
   def testEmpty(self):
-    if test.is_gpu_available(cuda_only=True):
-      for rank in VALID_FFT_RANKS:
-        for dims in xrange(rank, rank + 3):
-          x = np.zeros((0,) * dims).astype(np.float32)
-          self.assertEqual(x.shape, self._tfFFT(x, rank).shape)
-          x = np.zeros((0,) * dims).astype(np.complex64)
+    for rank in VALID_FFT_RANKS:
+      for dims in xrange(rank, rank + 3):
+        x = np.zeros((0,) * dims).astype(np.float32)
+        self.assertEqual(x.shape, self._tfFFT(x, rank).shape)
+        x = np.zeros((0,) * dims).astype(np.complex64)
+        if test.is_gpu_available(cuda_only=True):
           self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
   def testBasic(self):
@@ -327,36 +322,35 @@ class RFFTOpsTest(BaseFFTOpsTest):
           self._CompareBackward(gen_complex(complex_dims), rank, (size,) * rank)
 
   def testError(self):
-    if test.is_gpu_available(cuda_only=True):
-      for rank in VALID_FFT_RANKS:
-        for dims in xrange(0, rank):
-          x = np.zeros((1,) * dims).astype(np.complex64)
-          with self.assertRaisesWithPredicateMatch(
-              ValueError, "Shape must be .*rank {}.*".format(rank)):
-            self._tfFFT(x, rank)
-          with self.assertRaisesWithPredicateMatch(
-              ValueError, "Shape must be .*rank {}.*".format(rank)):
-            self._tfIFFT(x, rank)
-        for dims in xrange(rank, rank + 2):
-          x = np.zeros((1,) * rank)
-
-          # Test non-rank-1 fft_length produces an error.
-          fft_length = np.zeros((1, 1)).astype(np.int32)
-          with self.assertRaisesWithPredicateMatch(ValueError,
-                                                   "Shape must be .*rank 1"):
-            self._tfFFT(x, rank, fft_length)
-          with self.assertRaisesWithPredicateMatch(ValueError,
-                                                   "Shape must be .*rank 1"):
-            self._tfIFFT(x, rank, fft_length)
-
-          # Test wrong fft_length length.
-          fft_length = np.zeros((rank + 1,)).astype(np.int32)
-          with self.assertRaisesWithPredicateMatch(
-              ValueError, "Dimension must be .*but is {}.*".format(rank + 1)):
-            self._tfFFT(x, rank, fft_length)
-          with self.assertRaisesWithPredicateMatch(
-              ValueError, "Dimension must be .*but is {}.*".format(rank + 1)):
-            self._tfIFFT(x, rank, fft_length)
+    for rank in VALID_FFT_RANKS:
+      for dims in xrange(0, rank):
+        x = np.zeros((1,) * dims).astype(np.complex64)
+        with self.assertRaisesWithPredicateMatch(
+            ValueError, "Shape must be .*rank {}.*".format(rank)):
+          self._tfFFT(x, rank)
+        with self.assertRaisesWithPredicateMatch(
+            ValueError, "Shape must be .*rank {}.*".format(rank)):
+          self._tfIFFT(x, rank)
+      for dims in xrange(rank, rank + 2):
+        x = np.zeros((1,) * rank)
+
+        # Test non-rank-1 fft_length produces an error.
+        fft_length = np.zeros((1, 1)).astype(np.int32)
+        with self.assertRaisesWithPredicateMatch(ValueError,
+                                                 "Shape must be .*rank 1"):
+          self._tfFFT(x, rank, fft_length)
+        with self.assertRaisesWithPredicateMatch(ValueError,
+                                                 "Shape must be .*rank 1"):
+          self._tfIFFT(x, rank, fft_length)
+
+        # Test wrong fft_length length.
+        fft_length = np.zeros((rank + 1,)).astype(np.int32)
+        with self.assertRaisesWithPredicateMatch(
+            ValueError, "Dimension must be .*but is {}.*".format(rank + 1)):
+          self._tfFFT(x, rank, fft_length)
+        with self.assertRaisesWithPredicateMatch(
+            ValueError, "Dimension must be .*but is {}.*".format(rank + 1)):
+          self._tfIFFT(x, rank, fft_length)
 
   def testGrad_Simple(self):
     if test.is_gpu_available(cuda_only=True):
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index dda20e3a93db7d5cde7071fe0e81686871aa851f..85e7b635d800b1aec1d61e27129cf5a5d14f25a3 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -1372,7 +1372,8 @@ class FIFOQueueTest(test.TestCase):
       dtypes = [
           dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
           dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.int8, dtypes_lib.int64,
-          dtypes_lib.bool, dtypes_lib.complex64, dtypes_lib.complex128
+          dtypes_lib.uint16, dtypes_lib.bool, dtypes_lib.complex64,
+          dtypes_lib.complex128
       ]
       shape = (32, 4, 128)
       q = data_flow_ops.FIFOQueue(32, dtypes, [shape[1:]] * len(dtypes))
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index dac8d58b3560db9cdbe615a8258787b170e5984f..b3ce234d4e88d0eccd471c528bfc53d33ab93d2f 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -27,65 +27,87 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.platform import test
 
+_TEST_TYPES = (dtypes.float32, dtypes.complex64, dtypes.complex128)
+
 
 class GatherTest(test.TestCase):
-  use_gpu = False
+
+  def _buildParams(self, data, dtype):
+    data = data.astype(dtype.as_numpy_dtype)
+    # For complex types, add an index-dependent imaginary component so we can
+    # tell we got the right value.
+    if dtype.is_complex:
+      return data + 10j * data
+    return data
 
   def testScalar1D(self):
-    with self.test_session(use_gpu=self.use_gpu):
-      params = constant_op.constant([0, 1, 2, 3, 7, 5])
-      indices = constant_op.constant(4)
-      gather_t = array_ops.gather(params, indices)
-      gather_val = gather_t.eval()
-    self.assertAllEqual(7, gather_val)
-    self.assertEqual([], gather_t.get_shape())
+    with self.test_session(use_gpu=True):
+      data = np.array([0, 1, 2, 3, 7, 5])
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices = constant_op.constant(4)
+        gather_t = array_ops.gather(params, indices)
+        gather_val = gather_t.eval()
+        self.assertAllEqual(params_np[4], gather_val)
+        self.assertEqual([], gather_t.get_shape())
 
   def testScalar2D(self):
-    with self.test_session(use_gpu=self.use_gpu):
-      params = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8],
-                                     [9, 10, 11], [12, 13, 14]])
-      indices = constant_op.constant(2)
-      gather_t = array_ops.gather(params, indices)
-      gather_val = gather_t.eval()
-    self.assertAllEqual([6, 7, 8], gather_val)
-    self.assertEqual([3], gather_t.get_shape())
+    with self.test_session(use_gpu=True):
+      data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8],
+                       [9, 10, 11], [12, 13, 14]])
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices = constant_op.constant(2)
+        gather_t = array_ops.gather(params, indices)
+        gather_val = gather_t.eval()
+        self.assertAllEqual(params_np[2], gather_val)
+        self.assertEqual([3], gather_t.get_shape())
 
   def testSimpleTwoD32(self):
-    with self.test_session(use_gpu=self.use_gpu):
-      params = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8],
-                                     [9, 10, 11], [12, 13, 14]])
-      indices = constant_op.constant([0, 4, 0, 2])
-      gather_t = array_ops.gather(params, indices)
-      gather_val = gather_t.eval()
-    self.assertAllEqual([[0, 1, 2], [12, 13, 14], [0, 1, 2], [6, 7, 8]],
-                        gather_val)
-    self.assertEqual([4, 3], gather_t.get_shape())
+    with self.test_session(use_gpu=True):
+      data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8],
+                       [9, 10, 11], [12, 13, 14]])
+      for dtype in _TEST_TYPES:
+        params_np = self._buildParams(data, dtype)
+        params = constant_op.constant(params_np)
+        indices = constant_op.constant([0, 4, 0, 2])
+        gather_t = array_ops.gather(params, indices)
+        gather_val = gather_t.eval()
+        self.assertAllEqual(params_np[[0, 4, 0, 2]], gather_val)
+        self.assertEqual([4, 3], gather_t.get_shape())
 
   def testHigherRank(self):
     np.random.seed(1)
     # We check that scalar and empty shapes work as well
     for shape in (7, 0), (4, 3, 2):
       for indices_shape in (), (0,), (3, 0), (3, 5):
-        params = np.random.randn(*shape)
-        indices = np.random.randint(shape[0], size=indices_shape)
-        with self.test_session(use_gpu=self.use_gpu):
-          tf_params = constant_op.constant(params)
-          tf_indices = constant_op.constant(indices)
-          gather = array_ops.gather(tf_params, tf_indices)
-          self.assertAllEqual(params[indices], gather.eval())
-          self.assertEqual(indices.shape + params.shape[1:], gather.get_shape())
-          # Test gradients
-          gather_grad = np.random.randn(*gather.get_shape().as_list())
-          params_grad, indices_grad = gradients_impl.gradients(
-              gather, [tf_params, tf_indices], gather_grad)
-          self.assertEqual(indices_grad, None)
-          self.assertEqual(type(params_grad), ops.IndexedSlices)
-          params_grad = ops.convert_to_tensor(params_grad)
-          correct_params_grad = np.zeros(shape)
-          for i, g in zip(indices.flat,
-                          gather_grad.reshape((indices.size,) + shape[1:])):
-            correct_params_grad[i] += g
-          self.assertAllClose(correct_params_grad, params_grad.eval())
+        for dtype in _TEST_TYPES:
+          params = self._buildParams(np.random.randn(*shape), dtype)
+          indices = np.random.randint(shape[0], size=indices_shape)
+          with self.test_session(use_gpu=True):
+            tf_params = constant_op.constant(params)
+            tf_indices = constant_op.constant(indices)
+            gather = array_ops.gather(tf_params, tf_indices)
+            self.assertAllEqual(params[indices], gather.eval())
+            self.assertEqual(indices.shape + params.shape[1:],
+                             gather.get_shape())
+            # Test gradients
+            gather_grad = np.random.randn(*gather.get_shape().as_list()).astype(
+                dtype.as_numpy_dtype)
+            if dtype.is_complex:
+              gather_grad -= 1j * gather_grad
+            params_grad, indices_grad = gradients_impl.gradients(
+                gather, [tf_params, tf_indices], gather_grad)
+            self.assertEqual(indices_grad, None)
+            self.assertEqual(type(params_grad), ops.IndexedSlices)
+            params_grad = ops.convert_to_tensor(params_grad)
+            correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype)
+            for i, g in zip(indices.flat,
+                            gather_grad.reshape((indices.size,) + shape[1:])):
+              correct_params_grad[i] += g
+            self.assertAllClose(correct_params_grad, params_grad.eval())
 
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
@@ -94,7 +116,7 @@ class GatherTest(test.TestCase):
     self.assertEqual(None, gather_t.get_shape())
 
   def testBadIndices(self):
-    with self.test_session(use_gpu=False):
+    with self.test_session(use_gpu=True):
       params = [0, 1, 2]
       indices = [[7]]
       gather = array_ops.gather(params, indices)
@@ -102,18 +124,14 @@ class GatherTest(test.TestCase):
         gather.eval()
 
   def testEmptySlices(self):
-    with self.test_session(use_gpu=self.use_gpu):
-      for dtype in np.float32, np.float64:
+    with self.test_session(use_gpu=True):
+      for dtype in _TEST_TYPES:
         for itype in np.int32, np.int64:
-          params = np.zeros((7, 0), dtype=dtype)
+          params = np.zeros((7, 0), dtype=dtype.as_numpy_dtype)
           indices = np.array([3, 4], dtype=itype)
           gather = array_ops.gather(params, indices)
           self.assertAllEqual(gather.eval(), np.zeros((2, 0)))
 
 
-class GatherGpuTest(GatherTest):
-  use_gpu = True
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg_ops_test.py b/tensorflow/python/kernel_tests/linalg_ops_test.py
index ff299e651165e59e1d2e4991c965b09570158812..2d31ac85b02d688ab260f840cb62e38435764f23 100644
--- a/tensorflow/python/kernel_tests/linalg_ops_test.py
+++ b/tensorflow/python/kernel_tests/linalg_ops_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tensorflow.python.ops.special_math_ops."""
+"""Tests for tensorflow.python.ops.linalg_ops."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,7 +28,7 @@ from tensorflow.python.platform import test
 
 
 def _random_pd_matrix(n, rng):
-  """Random postive definite matrix."""
+  """Random positive definite matrix."""
   temp = rng.randn(n, n)
   return temp.dot(temp.T)
 
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a90bc539bb6127993872651a99458daccdc71ad
--- /dev/null
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -0,0 +1,1321 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lookup ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class HashTableOpTest(test.TestCase):
+
+  def testHashTable(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testHashTableFindHighRank(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(
+          [["brain", "salad"], ["tank", "tarkus"]])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([[0, 1], [-1, -1]], result)
+
+  def testHashTableInitWithPythonArrays(self):
+    with self.test_session():
+      default_val = -1
+      keys = ["brain", "salad", "surgery"]
+      values = [0, 1, 2]
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(
+              keys, values, value_dtype=dtypes.int64), default_val)
+      table.init.run()
+
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testHashTableInitWithNumPyArrays(self):
+    with self.test_session():
+      default_val = -1
+      keys = np.array(["brain", "salad", "surgery"], dtype=np.str)
+      values = np.array([0, 1, 2], dtype=np.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testMultipleHashTables(self):
+    with self.test_session() as sess:
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+
+      table1 = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table2 = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table3 = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual(3, table1.size().eval())
+      self.assertAllEqual(3, table2.size().eval())
+      self.assertAllEqual(3, table3.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output1 = table1.lookup(input_string)
+      output2 = table2.lookup(input_string)
+      output3 = table3.lookup(input_string)
+
+      out1, out2, out3 = sess.run([output1, output2, output3])
+      self.assertAllEqual([0, 1, -1], out1)
+      self.assertAllEqual([0, 1, -1], out2)
+      self.assertAllEqual([0, 1, -1], out3)
+
+  def testHashTableWithTensorDefault(self):
+    with self.test_session():
+      default_val = constant_op.constant(-1, dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testHashTableWithSparseTensorInput(self):
+    with self.test_session() as sess:
+      default_val = constant_op.constant(-1, dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      sp_indices = [[0, 0], [0, 1], [1, 0]]
+      sp_shape = [2, 2]
+      input_tensor = sparse_tensor.SparseTensor(
+          constant_op.constant(sp_indices, dtypes.int64),
+          constant_op.constant(["brain", "salad", "tank"]),
+          constant_op.constant(sp_shape, dtypes.int64))
+      output = table.lookup(input_tensor)
+
+      out_indices, out_values, out_shape = sess.run(output)
+
+      self.assertAllEqual([0, 1, -1], out_values)
+      self.assertAllEqual(sp_indices, out_indices)
+      self.assertAllEqual(sp_shape, out_shape)
+
+  def testSignatureMismatch(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      input_string = constant_op.constant([1, 2, 3], dtypes.int64)
+      with self.assertRaises(TypeError):
+        table.lookup(input_string)
+
+      with self.assertRaises(TypeError):
+        lookup_ops.HashTable(
+            lookup_ops.KeyValueTensorInitializer(keys, values), "UNK")
+
+  def testDTypes(self):
+    with self.test_session():
+      default_val = -1
+      with self.assertRaises(TypeError):
+        lookup_ops.HashTable(
+            lookup_ops.KeyValueTensorInitializer(["a"], [1], [dtypes.string],
+                                                 dtypes.int64), default_val)
+
+  def testNotInitialized(self):
+    with self.test_session():
+      default_val = -1
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(
+              ["a"], [1], value_dtype=dtypes.int64), default_val)
+
+      input_string = constant_op.constant(["brain", "salad", "surgery"])
+      output = table.lookup(input_string)
+
+      with self.assertRaisesOpError("Table not initialized"):
+        output.eval()
+
+  def testInitializeTwice(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      with self.assertRaisesOpError("Table already initialized"):
+        table.init.run()
+
+  def testInitializationWithInvalidDimensions(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2, 3, 4], dtypes.int64)
+
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+
+  def testMultipleSessions(self):
+    # Start a server
+    server = server_lib.Server(
+        {
+            "local0": ["localhost:0"]
+        }, protocol="grpc", start=True)
+    # Create two sessions sharing the same state
+    session1 = session.Session(server.target)
+    session2 = session.Session(server.target)
+
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.HashTable(
+        lookup_ops.KeyValueTensorInitializer(keys, values),
+        default_val,
+        name="t1")
+
+    # Init the table in the first session.
+    with session1:
+      table.init.run()
+      self.assertAllEqual(3, table.size().eval())
+
+    # Init the table in the second session and verify that we do not get a
+    # "Table already initialized" error.
+    with session2:
+      table.init.run()
+      self.assertAllEqual(3, table.size().eval())
+
+
+class IndexTableFromFile(test.TestCase):
+
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def test_string_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_int32_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab2.txt", values=("42", "1", "-1000"))
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_dtype=dtypes.int32)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_int64_index_table_from_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab3.txt", values=("42", "1", "-1000"))
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_dtype=dtypes.int64)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_index_table_from_file_with_default_value(self):
+    default_value = -42
+    vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, default_value=default_value)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, default_value), ids.eval())
+
+  def test_index_table_from_file_with_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1000)
+      ids = table.lookup(
+          constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual(
+          (
+              1,  # From vocabulary file.
+              2,  # From vocabulary file.
+              867,  # 3 + fingerprint("tarkus") mod 300.
+              860),  # 3 + fingerprint("toccata") mod 300.
+          ids.eval())
+
+  def test_index_table_from_file_with_only_oov_buckets(self):
+    self.assertRaises(
+        ValueError, lookup_ops.index_table_from_file, vocabulary_file=None)
+
+  def test_index_table_from_file_with_vocab_size_too_small(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=2)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, -1, -1), ids.eval())
+      self.assertEqual(2, table.size().eval())
+
+  def test_index_table_from_file_with_vocab_size_too_large(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=4)
+      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                              "Invalid vocab_size", table.init.run)
+
+  def test_index_table_from_file_with_vocab_size(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
+
+    self.assertRaises(
+        ValueError,
+        lookup_ops.index_table_from_file,
+        vocabulary_file=vocabulary_file,
+        vocab_size=0)
+
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=3)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, -1), ids.eval())
+      self.assertEqual(3, table.size().eval())
+
+  def test_index_table_from_file_with_invalid_hashers(self):
+    vocabulary_file = self._createVocabFile("invalid_hasher.txt")
+    with self.test_session():
+      with self.assertRaises(TypeError):
+        lookup_ops.index_table_from_file(
+            vocabulary_file=vocabulary_file,
+            vocab_size=3,
+            num_oov_buckets=1,
+            hasher_spec=1)
+
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          vocab_size=3,
+          num_oov_buckets=1,
+          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+
+      self.assertRaises(ValueError, table.lookup,
+                        constant_op.constant(["salad", "surgery", "tarkus"]))
+
+
+class KeyValueTensorInitializerTest(test.TestCase):
+
+  def test_string(self):
+    with ops.Graph().as_default(), self.test_session():
+      init = lookup_ops.KeyValueTensorInitializer(
+          ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
+      table = lookup_ops.HashTable(init, default_value=-1)
+      table.init.run()
+
+  def test_int64(self):
+    with ops.Graph().as_default(), self.test_session():
+      init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                                  dtypes.int64, dtypes.int64)
+      table = lookup_ops.HashTable(init, default_value=-1)
+      table.init.run()
+
+  def test_int32(self):
+    with ops.Graph().as_default(), self.test_session():
+      init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                                  dtypes.int32, dtypes.int64)
+      table = lookup_ops.HashTable(init, default_value=-1)
+      with self.assertRaisesRegexp(
+          errors_impl.OpError, "No OpKernel was registered"):
+        table.init.run()
+
+
+class IndexTableFromTensor(test.TestCase):
+
+  def test_index_table_from_tensor_with_tensor_init(self):
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_int32_index_table_from_tensor_with_tensor_init(self):
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int32)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_int64_index_table_from_tensor_with_tensor_init(self):
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int64)
+      ids = table.lookup(
+          constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_index_table_from_tensor_with_default_value(self):
+    default_value = -42
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=["brain", "salad", "surgery"],
+          default_value=default_value)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, default_value), ids.eval())
+
+  def test_index_table_from_tensor_missing_vocabulary_list(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError,
+                                   "vocabulary_list must be specified"):
+        lookup_ops.index_table_from_tensor(
+            vocabulary_list=None, num_oov_buckets=1)
+
+  def test_index_table_from_tensor_empty_vocabulary_list(self):
+    with self.test_session():
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "brain"]))
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaisesRegexp(
+          errors_impl.OpError, "keys and values cannot be empty"):
+        lookup_ops.tables_initializer().run()
+
+  def test_index_table_from_tensor_with_invalid_hashers(self):
+    with self.test_session():
+      with self.assertRaises(TypeError):
+        lookup_ops.index_table_from_tensor(
+            vocabulary_list=["brain", "salad", "surgery"],
+            num_oov_buckets=1,
+            hasher_spec=1)
+
+      table = lookup_ops.index_table_from_tensor(
+          vocabulary_list=["brain", "salad", "surgery"],
+          num_oov_buckets=1,
+          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+
+      self.assertRaises(ValueError, table.lookup,
+                        constant_op.constant(["salad", "surgery", "tarkus"]))
+
+
+class IndexToStringTableFromFileTest(test.TestCase):
+
+  def _createVocabFile(self, basename):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(["brain", "salad", "surgery"]) + "\n")
+    return vocabulary_file
+
+  def test_index_to_string_table(self):
+    vocabulary_file = self._createVocabFile("i2f_vocab1.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file)
+      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          features.eval())
+
+  def test_index_to_string_table_with_default_value(self):
+    default_value = b"NONE"
+    vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file, default_value=default_value)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"salad", b"surgery", default_value),
+                          features.eval())
+
+  def test_index_to_string_table_with_vocab_size_too_small(self):
+    default_value = b"NONE"
+    vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file,
+          vocab_size=2,
+          default_value=default_value)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"salad", default_value, default_value),
+                          features.eval())
+
+  def test_index_to_string_table_with_vocab_size_too_large(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=4)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+
+      self.assertRaises(errors_impl.OpError, features.eval)
+      init = lookup_ops.tables_initializer()
+      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                              "Invalid vocab_size", init.run)
+
+  def test_index_to_string_table_with_vocab_size(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file, vocab_size=3)
+      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
+
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"salad", b"surgery", b"UNK"), features.eval())
+
+
+class IndexToStringTableFromTensorTest(test.TestCase):
+
+  def test_index_to_string_table_from_tensor(self):
+    with self.test_session():
+      vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list)
+
+      indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      features = table.lookup(indices)
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          features.eval())
+
+  def test_duplicate_entries(self):
+    with self.test_session():
+      vocabulary_list = constant_op.constant(["hello", "hello"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list)
+      indices = constant_op.constant([0, 1, 4], dtypes.int64)
+      features = table.lookup(indices)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"hello", b"hello", b"UNK"), features.eval())
+
+  def test_index_to_string_with_default_value(self):
+    default_value = b"NONE"
+    with self.test_session():
+      vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.index_to_string_table_from_tensor(
+          vocabulary_list=vocabulary_list, default_value=default_value)
+      indices = constant_op.constant([1, 2, 4], dtypes.int64)
+      features = table.lookup(indices)
+      self.assertRaises(errors_impl.OpError, features.eval)
+
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"salad", b"surgery", default_value),
+                          features.eval())
+
+
+class InitializeTableFromFileOpTest(test.TestCase):
+
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def testInitializeStringTable(self):
+    vocabulary_file = self._createVocabFile("one_column_1.txt")
+
+    with self.test_session():
+      default_value = -1
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file, dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      table.init.run()
+
+      output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testInitializeInt64Table(self):
+    vocabulary_file = self._createVocabFile(
+        "one_column_int64.txt", values=("42", "1", "-1000"))
+
+    with self.test_session():
+      default_value = -1
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file, dtypes.int64,
+              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      table.init.run()
+
+      output = table.lookup(
+          constant_op.constant((42, 1, 11), dtype=dtypes.int64))
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testInitializeIndexTable(self):
+    vocabulary_file = self._createVocabFile("one_column_2.txt")
+
+    with self.test_session():
+      default_value = "UNK"
+      key_index = lookup_ops.TextFileIndex.LINE_NUMBER
+      value_index = lookup_ops.TextFileIndex.WHOLE_LINE
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                         key_index, dtypes.string, value_index),
+          default_value)
+      table.init.run()
+
+      input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      output = table.lookup(input_values)
+
+      result = output.eval()
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
+
+  def testMultiColumn(self):
+    vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")
+
+    with self.test_session():
+      default_value = -1
+      key_index = 1
+      value_index = 2
+
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         key_index, dtypes.int64, value_index),
+          default_value)
+      table.init.run()
+
+      input_string = constant_op.constant(["brain", "salad", "surgery"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([1, 5, 6], result)
+
+  def testInvalidDataTypeInMultiColumn(self):
+    vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")
+
+    with self.test_session():
+      default_value = -1
+      key_index = 2
+      value_index = 1
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         key_index, dtypes.int64, value_index),
+          default_value)
+      with self.assertRaisesOpError("is not a valid"):
+        table.init.run()
+
+  def testInvalidDataType(self):
+    vocabulary_file = self._createVocabFile("one_column_3.txt")
+
+    with self.test_session():
+      default_value = "UNK"
+      key_index = lookup_ops.TextFileIndex.WHOLE_LINE
+      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
+
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                           key_index, dtypes.string,
+                                           value_index), default_value)
+
+  def testInvalidIndex(self):
+    vocabulary_file = self._createVocabFile("one_column_4.txt")
+    with self.test_session():
+      default_value = -1
+      key_index = 1  # second column of the line
+      value_index = lookup_ops.TextFileIndex.LINE_NUMBER
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         key_index, dtypes.int64, value_index),
+          default_value)
+
+      with self.assertRaisesOpError("Invalid number of columns"):
+        table.init.run()
+
+  def testInitializeSameTableWithMultipleNodes(self):
+    vocabulary_file = self._createVocabFile("one_column_5.txt")
+
+    with self.test_session() as sess:
+      shared_name = "shared-one-columm"
+      default_value = -1
+      table1 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         lookup_ops.TextFileIndex.WHOLE_LINE,
+                                         dtypes.int64,
+                                         lookup_ops.TextFileIndex.LINE_NUMBER),
+          default_value,
+          shared_name=shared_name)
+      table2 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         lookup_ops.TextFileIndex.WHOLE_LINE,
+                                         dtypes.int64,
+                                         lookup_ops.TextFileIndex.LINE_NUMBER),
+          default_value,
+          shared_name=shared_name)
+      table3 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
+                                         lookup_ops.TextFileIndex.WHOLE_LINE,
+                                         dtypes.int64,
+                                         lookup_ops.TextFileIndex.LINE_NUMBER),
+          default_value,
+          shared_name=shared_name)
+
+      lookup_ops.tables_initializer().run()
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+
+      output1 = table1.lookup(input_string)
+      output2 = table2.lookup(input_string)
+      output3 = table3.lookup(input_string)
+
+      out1, out2, out3 = sess.run([output1, output2, output3])
+      self.assertAllEqual([0, 1, -1], out1)
+      self.assertAllEqual([0, 1, -1], out2)
+      self.assertAllEqual([0, 1, -1], out3)
+
+  def testInitializeTableWithNoFilename(self):
+    with self.test_session():
+      default_value = -1
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.TextFileInitializer(
+                "", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+            default_value)
+
+  def testInitializeWithVocabSize(self):
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      vocabulary_file1 = self._createVocabFile("one_column6.txt")
+      table1 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file1,
+              dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE,
+              dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER,
+              vocab_size=vocab_size), default_value)
+
+      # Initialize from file.
+      table1.init.run()
+      self.assertEquals(vocab_size, table1.size().eval())
+
+      vocabulary_file2 = self._createVocabFile("one_column7.txt")
+      vocab_size = 5
+      table2 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file2,
+              dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE,
+              dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER,
+              vocab_size=vocab_size), default_value)
+      with self.assertRaisesOpError("Invalid vocab_size"):
+        table2.init.run()
+
+      vocab_size = 1
+      vocabulary_file3 = self._createVocabFile("one_column3.txt")
+      table3 = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              vocabulary_file3,
+              dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE,
+              dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER,
+              vocab_size=vocab_size), default_value)
+
+      # Smaller vocab size reads only vocab_size records.
+      table3.init.run()
+      self.assertEquals(vocab_size, table3.size().eval())
+
+  def testFeedVocabularyName(self):
+    vocabulary_file = self._createVocabFile("feed_vocabulary.txt")
+
+    with self.test_session():
+      default_value = -1
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileInitializer(
+              "old_file.txt", dtypes.string,
+              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
+              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+
+      # Initialize with non existing file (old_file.txt) should fail.
+      # TODO(yleon): Update message, which might change per FileSystem.
+      with self.assertRaisesOpError("old_file.txt"):
+        table.init.run()
+
+      # Initialize the model feeding the vocabulary file.
+      filenames = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
+      table.init.run(feed_dict={filenames[0]: vocabulary_file})
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testInvalidFilenames(self):
+    vocabulary_file = self._createVocabFile("filename_shape.txt")
+
+    with self.test_session():
+      default_value = -1
+
+      # Invalid data type
+      other_type = constant_op.constant(1)
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.TextFileInitializer(
+                other_type, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+            default_value)
+
+      # Non-scalar filename
+      filenames = constant_op.constant([vocabulary_file, vocabulary_file])
+      with self.assertRaises(ValueError):
+        lookup_ops.HashTable(
+            lookup_ops.TextFileInitializer(
+                filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+            default_value)
+
+  def testIdToStringTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_1.txt")
+    with self.test_session():
+      default_value = "UNK"
+      vocab_size = 3
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileStringTableInitializer(
+              vocab_file, vocab_size=vocab_size), default_value)
+
+      table.init.run()
+
+      input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+
+      out = table.lookup(input_values)
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], out.eval())
+      self.assertEquals(vocab_size, table.size().eval())
+
+  def testStringToIdTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_2.txt")
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), default_value)
+      table.init.run()
+
+      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
+
+      out = table.lookup(input_string)
+      self.assertAllEqual([0, 1, 2, -1], out.eval())
+      self.assertEquals(vocab_size, table.size().eval())
+
+  def testInt64ToIdTable(self):
+    vocab_file = self._createVocabFile(
+        "feat_to_id_3.txt", values=("42", "1", "-1000"))
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      table = lookup_ops.HashTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+          default_value)
+      table.init.run()
+
+      out = table.lookup(
+          constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
+      self.assertAllEqual((0, 1, 2, -1), out.eval())
+      self.assertEquals(vocab_size, table.size().eval())
+
+
+class IdTableWithHashBucketsTest(test.TestCase):
+
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def testStringIdTableWithHashBuckets(self):
+    vocab_file = self._createVocabFile("feat_to_id_1.txt")
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size), default_value),
+          oov_buckets)
+
+      table.init.run()
+
+      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
+
+      out = table.lookup(input_string)
+      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+
+  def testInt32IdTableWithHashBuckets(self):
+    vocab_file = self._createVocabFile("feat_to_id_2.txt", ("42", "1", "-1000"))
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+              default_value),
+          oov_buckets,
+          key_dtype=dtypes.int32)
+
+      table.init.run()
+
+      values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int32)
+
+      out = table.lookup(values)
+      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+
+  def testInt64IdTableWithHashBuckets(self):
+    vocab_file = self._createVocabFile("feat_to_id_3.txt", ("42", "1", "-1000"))
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+              default_value), oov_buckets)
+
+      table.init.run()
+
+      values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64)
+
+      out = table.lookup(values)
+      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+
+  def testStringIdTableWithOnlyHashBucket(self):
+    with self.test_session():
+      oov_buckets = 5
+
+      # Set a table that only uses hash buckets, for each input value returns
+      # an id calculated by fingerprint("input") mod oov_buckets.
+      table = lookup_ops.IdTableWithHashBuckets(None, oov_buckets)
+      table.init.run()
+
+      values = constant_op.constant(("brain", "salad", "surgery"))
+
+      out = table.lookup(values)
+      self.assertAllEqual(
+          [
+              3,  # fingerprint("brain") mod 5.
+              1,  # fingerprint("salad") mod 5.
+              4  # fingerprint("surgery") mod 5
+          ],
+          out.eval())
+      self.assertEquals(oov_buckets, table.size().eval())
+
+  def testInt32IdTableWithOnlyHashBucket(self):
+    with self.test_session():
+      oov_buckets = 5
+
+      # Set a table that only uses hash buckets, for each input value returns
+      # an id calculated by fingerprint("input") mod oov_buckets.
+      table = lookup_ops.IdTableWithHashBuckets(
+          None, oov_buckets, key_dtype=dtypes.int32)
+      table.init.run()
+
+      input_string = constant_op.constant([42, 1, -1000], dtype=dtypes.int32)
+
+      out = table.lookup(input_string)
+      self.assertAllEqual(
+          [
+              1,  # fingerprint("42") mod 5.
+              4,  # fingerprint("1") mod 5.
+              2  # fingerprint("-1000") mod 5
+          ],
+          out.eval())
+      self.assertEquals(oov_buckets, table.size().eval())
+
+  def testFloat64IdTableWithOnlyHashBucket(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(TypeError, "Invalid key_dtype"):
+        lookup_ops.IdTableWithHashBuckets(
+            None, num_oov_buckets=5, key_dtype=dtypes.float64)
+
+  def testBoolIdTableWithOnlyHashBucket(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(TypeError, "Invalid key_dtype"):
+        lookup_ops.IdTableWithHashBuckets(
+            None, num_oov_buckets=5, key_dtype=dtypes.bool)
+
+  def testIdTableWithHashBucketsWithMultipleInitializers(self):
+    vocab_file = self._createVocabFile("feat_to_id_4.txt")
+    with self.test_session() as sess:
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 3
+
+      vocab_table = lookup_ops.HashTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), default_value)
+      table1 = lookup_ops.IdTableWithHashBuckets(
+          vocab_table,
+          oov_buckets,
+          hasher_spec=lookup_ops.FastHashSpec,
+          name="table1")
+
+      table2 = lookup_ops.IdTableWithHashBuckets(
+          vocab_table,
+          oov_buckets,
+          hasher_spec=lookup_ops.StrongHashSpec((1, 2)),
+          name="table2")
+
+      lookup_ops.tables_initializer().run()
+
+      input_string = constant_op.constant(
+          ["fruit", "brain", "salad", "surgery", "UNK"])
+
+      out1 = table1.lookup(input_string)
+      out2 = table2.lookup(input_string)
+
+      out1, out2 = sess.run([out1, out2])
+      self.assertAllEqual([5, 0, 1, 2, 5], out1)
+      self.assertAllEqual([5, 0, 1, 2, 3], out2)
+      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+      test_util.assert_ops_in_graph({
+          "table1_Lookup/hash_bucket": "StringToHashBucketFast",
+          "table2_Lookup/hash_bucket": "StringToHashBucketStrong",
+      }, sess.graph)
+
+  def testIdTableWithHashBucketsInitializationAcrossSessions(self):
+    vocab_file = self._createVocabFile("feat_to_id_5.txt")
+    shared_name = "across-sessions"
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      table1 = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size),
+              default_value,
+              shared_name=shared_name), oov_buckets)
+
+      table1.init.run()
+
+      input_string_1 = constant_op.constant(
+          ["brain", "salad", "surgery", "UNK"])
+
+      out1 = table1.lookup(input_string_1)
+
+      self.assertAllEqual([0, 1, 2, 3], out1.eval())
+      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
+
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+
+      # Underlying lookup table already initialized in previous session.
+      # No need to call table2.init.run()
+      table2 = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size),
+              default_value,
+              shared_name=shared_name), oov_buckets)
+
+      input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
+
+      out2 = table2.lookup(input_string_2)
+
+      self.assertAllEqual([3, 1, 3], out2.eval())
+      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+
+  def testIdTableWithHashBucketsWithMultipleInitializersDifferentDefault(self):
+    vocab_file = self._createVocabFile("feat_to_id_6.txt")
+    with self.test_session() as sess:
+      default_value1 = -1
+      vocab_size = 3
+      oov_buckets = 0
+      table1 = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size), default_value1),
+          oov_buckets)
+
+      default_value2 = -2
+      table2 = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(
+                  vocab_file, vocab_size=vocab_size), default_value2),
+          oov_buckets)
+
+      lookup_ops.tables_initializer().run()
+
+      input_string_1 = constant_op.constant(
+          ["brain", "salad", "surgery", "UNK"])
+      input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
+
+      out1 = table1.lookup(input_string_1)
+      out2 = table2.lookup(input_string_2)
+
+      out1, out2 = sess.run([out1, out2])
+      self.assertAllEqual([0, 1, 2, -1], out1)
+      self.assertAllEqual([-2, 1, -2], out2)
+      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+
+  def testSparseTensor(self):
+    vocab_file = self._createVocabFile("feat_to_id_7.txt")
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.test_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant(["brain", "salad", "brain", "surgery", "tarkus"],
+                               dtypes.string),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.TextFileIdTableInitializer(vocab_file, vocab_size=3),
+              -1), 1)
+      table.init.run()
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testInt32SparseTensor(self):
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.test_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant([42, 1, 42, -1000, 11], dtypes.int32),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.KeyValueTensorInitializer(
+                  (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), -1),
+          1,
+          key_dtype=dtypes.int32)
+      table.init.run()
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testInt64SparseTensor(self):
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.test_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant([42, 1, 42, -1000, 11], dtypes.int64),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_ops.HashTable(
+              lookup_ops.KeyValueTensorInitializer(
+                  (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), -1),
+          1,
+          key_dtype=dtypes.int64)
+      table.init.run()
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testIdTableWithHashBucketsWithInvalidHashers(self):
+    vocab_file = self._createVocabFile("feat_to_id_4.txt")
+    with self.test_session():
+      default_value = -1
+      vocab_size = 3
+      oov_buckets = 1
+      lookup_table = lookup_ops.HashTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), default_value)
+
+      with self.assertRaises(TypeError):
+        lookup_ops.IdTableWithHashBuckets(
+            lookup_table, oov_buckets, hasher_spec=1)
+
+      table = lookup_ops.IdTableWithHashBuckets(
+          lookup_table,
+          oov_buckets,
+          hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None))
+
+      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
+
+      with self.assertRaises(ValueError):
+        table.lookup(input_string)
+
+      with self.assertRaises(ValueError):
+        table = lookup_ops.IdTableWithHashBuckets(
+            lookup_table,
+            oov_buckets,
+            hasher_spec=lookup_ops.StrongHashSpec([]))
+
+      with self.assertRaises(ValueError):
+        table = lookup_ops.IdTableWithHashBuckets(
+            lookup_table,
+            oov_buckets,
+            hasher_spec=lookup_ops.StrongHashSpec([1, 2, 3]))
+
+      with self.assertRaises(TypeError):
+        table = lookup_ops.IdTableWithHashBuckets(
+            lookup_table,
+            oov_buckets,
+            hasher_spec=lookup_ops.StrongHashSpec([None, 2]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index f94561b48e572135d7b94ab6d1401d5c06ae779a..a23a8058b7d33a620cd3515827cb067f766e0bd7 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -319,6 +319,21 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
                           feed_dict={weights: ((1.2,), (3.4,), (5.6,))})
       self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss_val, 3)
 
+  def testUnknownShapePlaceholderForLogitsLabelsButScalarWeights(self):
+    logits = array_ops.placeholder(dtypes.float32)
+    labels = array_ops.placeholder(dtypes.int32)
+    weights = 1.0
+    with self.test_session() as sess:
+      loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
+      loss_val = sess.run(loss,
+                          feed_dict={
+                              logits: [[10.0, 0.0, 0.0],
+                                       [0.0, 10.0, 0.0],
+                                       [0.0, 0.0, 10.0]],
+                              labels: [[2], [0], [1]],
+                          })
+      self.assertAlmostEqual((1.0 + 1.0 + 1.0) * 10.0 / 3.0, loss_val, 3)
+
   def testNonZeroLossWithPlaceholderForLogitsLabelsAndWeights(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 3))
     labels = array_ops.placeholder(dtypes.int32, shape=(None, 1))
@@ -447,7 +462,8 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
                                      [-100.0, -100.0, 100.0]])
       labels = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
       loss = losses.sigmoid_cross_entropy(labels, logits)
-      self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value')
+      self.assertEquals(logits.dtype, loss.dtype)
+      self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
       self.assertAlmostEqual(0.0, loss.eval(), 3)
 
   def testLossWithSingleDimPlaceholderForLogitsAndWeights1(self):
@@ -456,6 +472,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     weights = array_ops.ones_like(logits, dtype=dtypes.float32)
 
     loss = losses.sigmoid_cross_entropy(labels, logits, weights)
+    self.assertEquals(logits.dtype, loss.dtype)
 
     with self.test_session() as sess:
       loss = sess.run(loss,
@@ -471,6 +488,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     weights = array_ops.ones_like(logits, dtype=dtypes.float32)
 
     loss = losses.sigmoid_cross_entropy(labels, logits, weights)
+    self.assertEquals(logits.dtype, loss.dtype)
 
     with self.test_session() as sess:
       loss = sess.run(loss,
@@ -487,7 +505,8 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
                                      [-100.0, -100.0, 100.0]])
       labels = constant_op.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0]])
       loss = losses.sigmoid_cross_entropy(labels, logits)
-      self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value')
+      self.assertEquals(logits.dtype, loss.dtype)
+      self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
       self.assertAlmostEqual(loss.eval(), 600.0 / 9.0, 3)
 
   def testAllWrongSigmoidWithMeasurementSpecificWeights(self):
@@ -498,7 +517,8 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       labels = constant_op.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0]])
       weights = constant_op.constant([[3, 4, 5], [2, 6, 0], [8, 0, 1]])
       loss = losses.sigmoid_cross_entropy(labels, logits, weights)
-      self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value')
+      self.assertEquals(logits.dtype, loss.dtype)
+      self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
       self.assertAlmostEqual(1700.0 / 7.0, loss.eval(), 3)
 
   def testMultiCorrectSigmoid(self):
@@ -507,10 +527,43 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
                                    [-100.0, 100.0, 100.0]])
     labels = constant_op.constant([[1, 0, 1], [1, 1, 0], [0, 1, 1]])
     loss = losses.sigmoid_cross_entropy(labels, logits)
-    self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value')
+    self.assertEquals(logits.dtype, loss.dtype)
+    self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
 
     with self.test_session():
-      self.assertAlmostEqual(loss.eval(), 0.0, 3)
+      self.assertAlmostEqual(0.0, loss.eval(), 3)
+
+  def testSigmoidFloat64(self):
+    logits = constant_op.constant((
+        (100.0, -100.0, 100.0),
+        (100.0, -100.0, 100.0),
+        (100.0, 100.0, -100.0)
+    ), dtype=dtypes.float64)
+    labels = constant_op.constant((
+        (1, 0, 1), (1, 1, 0), (0, 1, 1)
+    ), dtype=dtypes.int64)
+    loss = losses.sigmoid_cross_entropy(labels, logits)
+    self.assertEquals(logits.dtype, loss.dtype)
+
+    with self.test_session():
+      self.assertAlmostEqual(44.444, loss.eval(), 3)
+
+  def testSigmoidNoReduction(self):
+    logits = constant_op.constant((
+        (100.0, -100.0, 100.0),
+        (100.0, -100.0, 100.0),
+        (100.0, 100.0, -100.0)))
+    labels = constant_op.constant(((1, 0, 1), (1, 1, 0), (0, 1, 1)))
+    loss = losses.sigmoid_cross_entropy(
+        labels, logits, reduction=losses.Reduction.NONE)
+    self.assertEquals(logits.dtype, loss.dtype)
+
+    with self.test_session():
+      self.assertAllClose((
+          (0., 0., 0.),
+          (0., 100., 100.),
+          (100., 0., 100.)
+      ), loss.eval(), 3)
 
   def testSigmoidLabelSmoothingCorrect(self):
     with self.test_session():
@@ -530,7 +583,8 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       label_smoothing = 0.1
       loss = losses.sigmoid_cross_entropy(
           labels, logits, label_smoothing=label_smoothing)
-      self.assertEquals(loss.op.name, 'sigmoid_cross_entropy_loss/value')
+      self.assertEquals(logits.dtype, loss.dtype)
+      self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
       expected_value = (100.0 + 50.0 * label_smoothing) / 3.0
       self.assertAlmostEqual(loss.eval(), expected_value, 3)
 
@@ -541,6 +595,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       sigmoid_labels = constant_op.constant([[1, 0, 1]])
       sigmoid_loss = losses.sigmoid_cross_entropy(
           sigmoid_labels, sigmoid_logits, label_smoothing=label_smoothing)
+      self.assertEquals(sigmoid_logits.dtype, sigmoid_loss.dtype)
 
       softmax_logits = constant_op.constant(
           [[0.0, 100.0], [100.0, 0.0], [100.0, 0.0]])
@@ -753,6 +808,44 @@ class HingeLossTest(test.TestCase):
       self.assertAllClose(loss.eval(), 0.875, atol=1e-3)
 
 
+class HuberLossTest(test.TestCase):
+
+  def testIncompatibleShapes(self):
+    with self.test_session():
+      predictions = constant_op.constant([[-1.0], [2.1]])
+      labels = constant_op.constant([0.0, 1.0])
+      with self.assertRaises(ValueError):
+        _ = losses.huber_loss(labels, predictions).eval()
+
+  def testAllQuadratic(self):
+    with self.test_session():
+      predictions = constant_op.constant([1.5, -1.4, -1.0, 0.0])
+      labels = constant_op.constant([1.0, -1.0, 0.0, 0.5])
+      loss = losses.huber_loss(labels, predictions)
+      self.assertAllClose(loss.eval(),
+                          0.5 * (0.25 + 0.16 + 1.0 + 0.25) / 4., atol=1e-5)
+
+  def testAllLinear(self):
+    with self.test_session():
+      predictions = constant_op.constant([1.5, -1.4, -1.0, 0.0])
+      labels = constant_op.constant([0.0, 1.0, 0.0, 1.5])
+      loss = losses.huber_loss(labels, predictions)
+      self.assertAllClose(loss.eval(),
+                          (1.5 + 2.4 + 1.0 + 1.5) / 4. - 0.5, atol=1e-5)
+
+  def testMixedQuadraticLinear(self):
+    with self.test_session():
+      predictions = constant_op.constant([[1.5, -1.4, -1.0, 0.0],
+                                          [1.5, -1.4, -1.0, 0.0]])
+      labels = constant_op.constant([[1.0, -1.0, 0.0, 0.5],
+                                     [0.0, 1.0, 0.0, 1.5]])
+      loss = losses.huber_loss(labels, predictions)
+      quadratic = 0.5 * (0.25 + 0.16 + 1.0 + 0.25) / 4.
+      linear = (1.5 + 2.4 + 1.0 + 1.5) / 4. - 0.5
+      expected_loss = (quadratic + linear) / 2.
+      self.assertAllClose(loss.eval(), expected_loss, atol=1e-5)
+
+
 class MeanSquaredErrorTest(test.TestCase):
 
   def setUp(self):
@@ -1188,27 +1281,44 @@ class ComputeWeightedLossTest(test.TestCase):
           next_loss += 1.0
     raw_losses.setflags(write=False)
     self._raw_losses = raw_losses
-    self._unweighted_loss = np.mean(self._raw_losses)
 
   def testUnweighted(self):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      raw_losses = self._raw_losses
-      unweighted_losses = (
-          losses.compute_weighted_loss(raw_losses),
-          losses.compute_weighted_loss(raw_losses, weights=np.ones((1, 1, 1))),
-          losses.compute_weighted_loss(raw_losses, weights=np.ones((1, 1, 4))),
-          losses.compute_weighted_loss(raw_losses, weights=np.ones((1, 2, 1))),
-          losses.compute_weighted_loss(raw_losses, weights=np.ones((1, 2, 4))),
-          losses.compute_weighted_loss(raw_losses, weights=np.ones((3, 1, 1))),
-          losses.compute_weighted_loss(raw_losses, weights=np.ones((3, 1, 4))),
-          losses.compute_weighted_loss(raw_losses, weights=np.ones((3, 2, 1))),
-          losses.compute_weighted_loss(raw_losses, weights=np.ones(self._shape))
-      )
-      self.assertEqual(9, len(util.get_losses()))
-      with self.test_session():
-        for unweighted_loss in unweighted_losses:
-          self.assertAllClose(self._unweighted_loss, unweighted_loss.eval())
+    for reduction in losses.Reduction.all():
+      with ops.Graph().as_default() as g:
+        self.assertEqual(0, len(util.get_losses()))
+        raw_losses = self._raw_losses
+        unweighted_losses = (
+            losses.compute_weighted_loss(raw_losses, reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 1, 1)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 1, 4)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 2, 1)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((1, 2, 4)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((3, 1, 1)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((3, 1, 4)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones((3, 2, 1)), reduction=reduction),
+            losses.compute_weighted_loss(
+                raw_losses, weights=np.ones(self._shape), reduction=reduction)
+        )
+        self.assertEqual(9, len(util.get_losses()))
+        with self.test_session(g):
+          for unweighted_loss in unweighted_losses:
+            if reduction == losses.Reduction.NONE:
+              self.assertAllClose(self._raw_losses, unweighted_loss.eval())
+            elif reduction == losses.Reduction.SUM:
+              self.assertAllClose(
+                  np.sum(self._raw_losses), unweighted_loss.eval())
+            else:
+              # reduction one of losses.Reduction.MEAN and
+              # losses.Reduction.SUM_BY_NONZERO_WEIGHTS.
+              self.assertAllClose(
+                  np.mean(self._raw_losses), unweighted_loss.eval())
 
   def testScalarWeight(self):
     with ops.Graph().as_default():
@@ -1281,15 +1391,29 @@ class ComputeWeightedLossTest(test.TestCase):
     self._test_invalid_weights((17.0,),)
 
   def _test_valid_weights(self, weights):
-    with ops.Graph().as_default():
-      self.assertEqual(0, len(util.get_losses()))
-      weighted_loss = losses.compute_weighted_loss(
-          self._raw_losses, weights=weights)
-      self.assertEqual(1, len(util.get_losses()))
-      with self.test_session():
-        self.assertAllClose(
-            np.mean(weights * self._raw_losses),
-            weighted_loss.eval())
+    for reduction in losses.Reduction.all():
+      with ops.Graph().as_default() as g:
+        self.assertEqual(0, len(util.get_losses()))
+        weighted_loss = losses.compute_weighted_loss(
+            self._raw_losses, weights=weights, reduction=reduction)
+        self.assertEqual(1, len(util.get_losses()))
+        with self.test_session(g):
+          weighted_losses = weights * self._raw_losses
+          weighted_sum = np.sum(weighted_losses)
+          if reduction == losses.Reduction.NONE:
+            self.assertAllClose(weighted_losses, weighted_loss.eval())
+          elif reduction == losses.Reduction.SUM:
+            self.assertAllClose(weighted_sum, weighted_loss.eval())
+          else:
+            broadcast_weights = weights * np.ones_like(self._raw_losses)
+            if reduction == losses.Reduction.MEAN:
+              self.assertAllClose(
+                  weighted_sum / np.sum(broadcast_weights),
+                  weighted_loss.eval())
+            elif reduction == losses.Reduction.SUM_BY_NONZERO_WEIGHTS:
+              self.assertAllClose(
+                  weighted_sum / np.count_nonzero(broadcast_weights),
+                  weighted_loss.eval())
 
   def test1x1x1Weight(self):
     self._test_valid_weights((((17.0,),),))
@@ -1298,7 +1422,7 @@ class ComputeWeightedLossTest(test.TestCase):
     self._test_valid_weights((((17.0,), (3.0,),),))
 
   def test1x1x4Weight(self):
-    self._test_valid_weights((((17.0, 13.0, 2.0, 5.0),),))
+    self._test_valid_weights((((17.0, 0.0, 2.0, 5.0),),))
 
   def test3x1x1Weight(self):
     self._test_valid_weights((((17.0,),), ((5.0,),), ((2.0,),),))
@@ -1312,22 +1436,22 @@ class ComputeWeightedLossTest(test.TestCase):
 
   def test3x1x4Weight(self):
     self._test_valid_weights((
-        ((17.0, 13.0, 2.0, 5.0),),
+        ((17.0, 0.0, 2.0, 5.0),),
         ((5.0, 31.0, 17.0, 5.0),),
         ((7.0, 3.0, 11.0, 5.0),),
     ))
 
   def test1x2x4Weight(self):
     self._test_valid_weights(((
-        (17.0, 13.0, 2.0, 5.0),
+        (17.0, 0.0, 2.0, 5.0),
         (3.0, 13.0, 11.0, 2.0),
     ),))
 
   def test3x2x4Weight(self):
     self._test_valid_weights((
-        ((17.0, 13.0, 2.0, 5.0), (3.0, 13.0, 11.0, 2.0),),
-        ((5.0, 31.0, 17.0, 5.0), (13.0, 3.0, 1.0, 11.0),),
-        ((7.0, 3.0, 11.0, 5.0), (13.0, 11.0, 1.0, 7.0),),
+        ((17.0, 0.0, 2.0, 5.0), (3.0, 13.0, 11.0, 2.0),),
+        ((5.0, 31.0, 17.0, 5.0), (13.0, 3.0, 0.0, 11.0),),
+        ((0.0, 3.0, 11.0, 5.0), (13.0, 11.0, 1.0, 7.0),),
     ))
 
 
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..718d8aebd875369832ae35c8422835c1eb30cb48
--- /dev/null
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -0,0 +1,383 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class MapStageTest(test.TestCase):
+
+  def testSimple(self):
+    with self.test_session(use_gpu=True) as sess:
+      with ops.device('/cpu:0'):
+        x = array_ops.placeholder(dtypes.float32)
+        pi = array_ops.placeholder(dtypes.int64)
+        gi = array_ops.placeholder(dtypes.int64)
+        v = 2. * (array_ops.zeros([128, 128]) + x)
+      with ops.device(test.gpu_device_name()):
+        stager = data_flow_ops.MapStagingArea([dtypes.float32])
+        stage = stager.put(pi, [v], [0])
+        k, y = stager.get(gi)
+        y = math_ops.reduce_max(math_ops.matmul(y, y))
+      sess.run(stage, feed_dict={x: -1, pi: 0})
+      for i in range(10):
+        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i+1, gi:i})
+        self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
+
+  def testMultiple(self):
+    with self.test_session(use_gpu=True) as sess:
+      with ops.device('/cpu:0'):
+        x = array_ops.placeholder(dtypes.float32)
+        pi = array_ops.placeholder(dtypes.int64)
+        gi = array_ops.placeholder(dtypes.int64)
+        v = 2. * (array_ops.zeros([128, 128]) + x)
+      with ops.device(test.gpu_device_name()):
+        stager = data_flow_ops.MapStagingArea([dtypes.float32, dtypes.float32])
+        stage = stager.put(pi, [x, v], [0, 1])
+        k, (z, y) = stager.get(gi)
+        y = math_ops.reduce_max(z * math_ops.matmul(y, y))
+      sess.run(stage, feed_dict={x: -1, pi: 0})
+      for i in range(10):
+        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i+1, gi:i})
+        self.assertAllClose(
+            4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
+
+  def testDictionary(self):
+    with self.test_session(use_gpu=True) as sess:
+      with ops.device('/cpu:0'):
+        x = array_ops.placeholder(dtypes.float32)
+        pi = array_ops.placeholder(dtypes.int64)
+        gi = array_ops.placeholder(dtypes.int64)
+        v = 2. * (array_ops.zeros([128, 128]) + x)
+      with ops.device(test.gpu_device_name()):
+        stager = data_flow_ops.MapStagingArea(
+            [dtypes.float32, dtypes.float32],
+            shapes=[[], [128, 128]],
+            names=['x', 'v'])
+        stage = stager.put(pi,{'x': x, 'v': v})
+        key, ret = stager.get(gi)
+        z = ret['x']
+        y = ret['v']
+        y = math_ops.reduce_max(z * math_ops.matmul(y, y))
+      sess.run(stage, feed_dict={x: -1, pi: 0})
+      for i in range(10):
+        _, yval = sess.run([stage, y], feed_dict={x: i, pi: i+1, gi:i})
+        self.assertAllClose(
+            4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
+
+  def testColocation(self):
+    gpu_dev = test.gpu_device_name()
+
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.float32)
+      v = 2. * (array_ops.zeros([128, 128]) + x)
+    with ops.device(gpu_dev):
+      stager = data_flow_ops.MapStagingArea([dtypes.float32])
+      y = stager.put(1, [v], [0])
+      self.assertEqual(y.device, '/device:GPU:0' if gpu_dev
+                                                 else gpu_dev)
+    with ops.device('/cpu:0'):
+      _, x = stager.get(1)
+      y = stager.peek(1)
+      _, z = stager.get()
+      self.assertEqual(x.device, '/device:CPU:0')
+      self.assertEqual(y.device, '/device:CPU:0')
+      self.assertEqual(z.device, '/device:CPU:0')
+
+  def testPeek(self):
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.int32, name='x')
+      pi = array_ops.placeholder(dtypes.int64)
+      gi = array_ops.placeholder(dtypes.int64)
+      p = array_ops.placeholder(dtypes.int32, name='p')
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.MapStagingArea([dtypes.int32, ], shapes=[[]])
+      stage = stager.put(pi,[x], [0])
+      peek = stager.peek(gi)
+      size = stager.size()
+
+    n = 10
+
+    with self.test_session(use_gpu=True) as sess:
+      for i in range(n):
+        sess.run(stage, feed_dict={x:i, pi:i})
+
+      for i in range(n):
+        self.assertTrue(sess.run(peek, feed_dict={gi: i}) == i)
+
+      self.assertTrue(sess.run(size) == 10)
+
+  def testSizeAndClear(self):
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.float32, name='x')
+      pi = array_ops.placeholder(dtypes.int64)
+      gi = array_ops.placeholder(dtypes.int64)
+      v = 2. * (array_ops.zeros([128, 128]) + x)
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.MapStagingArea(
+          [dtypes.float32, dtypes.float32],
+          shapes=[[], [128, 128]],
+          names=['x', 'v'])
+      stage = stager.put(pi,{'x': x, 'v': v})
+      size = stager.size()
+      clear = stager.clear()
+
+    with self.test_session(use_gpu=True) as sess:
+      sess.run(stage, feed_dict={x: -1, pi: 3})
+      self.assertEqual(sess.run(size), 1)
+      sess.run(stage, feed_dict={x: -1, pi: 1})
+      self.assertEqual(sess.run(size), 2)
+      sess.run(clear)
+      self.assertEqual(sess.run(size), 0)
+
+
+  def testCapacity(self):
+    capacity = 3
+
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.int32, name='x')
+      pi = array_ops.placeholder(dtypes.int64, name='pi')
+      gi = array_ops.placeholder(dtypes.int64, name='gi')
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.MapStagingArea([dtypes.int32, ],
+        capacity=capacity, shapes=[[]])
+
+      stage = stager.put(pi, [x], [0])
+      get = stager.get()
+      size = stager.size()
+
+    from six.moves import queue as Queue
+    import threading
+
+    queue = Queue.Queue()
+    n = 5
+    missed = 0
+
+    with self.test_session(use_gpu=True) as sess:
+      # Stage data in a separate thread which will block
+      # when it hits the staging area's capacity and thus
+      # not fill the queue with n tokens
+      def thread_run():
+        for i in range(n):
+          sess.run(stage, feed_dict={x: i, pi: i})
+          queue.put(0)
+
+      t = threading.Thread(target=thread_run)
+      t.start()
+
+      # Get tokens from the queue, making notes of when we timeout
+      for i in range(n):
+        try:
+          queue.get(timeout=0.05)
+        except Queue.Empty:
+          missed += 1
+
+      # We timed out n - capacity times waiting for queue puts
+      self.assertTrue(missed == n - capacity)
+
+      # Clear the staging area out a bit
+      for i in range(n - capacity):
+        sess.run(get)
+
+      # This should now succeed
+      t.join()
+
+      self.assertTrue(sess.run(size) == capacity)
+
+      # Clear out the staging area completely
+      for i in range(capacity):
+        sess.run(get)
+
+  def testMemoryLimit(self):
+    memory_limit = 512*1024  # 512K
+    chunk = 200*1024 # 256K
+    capacity = memory_limit // chunk
+
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.uint8, name='x')
+      pi = array_ops.placeholder(dtypes.int64, name='pi')
+      gi = array_ops.placeholder(dtypes.int64, name='gi')
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.MapStagingArea([dtypes.uint8],
+        memory_limit=memory_limit, shapes=[[]])
+      stage = stager.put(pi, [x], [0])
+      get = stager.get()
+      size = stager.size()
+
+    from six.moves import queue as Queue
+    import threading
+    import numpy as np
+
+    queue = Queue.Queue()
+    n = 5
+    missed = 0
+
+    with self.test_session(use_gpu=True) as sess:
+      # Stage data in a separate thread which will block
+      # when it hits the staging area's capacity and thus
+      # not fill the queue with n tokens
+      def thread_run():
+        for i in range(n):
+          sess.run(stage, feed_dict={x: np.full(chunk, i, dtype=np.uint8),
+                                    pi: i})
+          queue.put(0)
+
+      t = threading.Thread(target=thread_run)
+      t.start()
+
+      # Get tokens from the queue, making notes of when we timeout
+      for i in range(n):
+        try:
+          queue.get(timeout=0.05)
+        except Queue.Empty:
+          missed += 1
+
+      # We timed out n - capacity times waiting for queue puts
+      self.assertTrue(missed == n - capacity)
+
+      # Clear the staging area out a bit
+      for i in range(n - capacity):
+        sess.run(get)
+
+      # This should now succeed
+      t.join()
+
+      self.assertTrue(sess.run(size) == capacity)
+
+      # Clear out the staging area completely
+      for i in range(capacity):
+        sess.run(get)
+
+  def testOrdering(self):
+    import six
+    import random
+
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.int32, name='x')
+      pi = array_ops.placeholder(dtypes.int64, name='pi')
+      gi = array_ops.placeholder(dtypes.int64, name='gi')
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.MapStagingArea([dtypes.int32, ],
+        shapes=[[]], ordered=True)
+      stage = stager.put(pi, [x], [0])
+      get = stager.get()
+      size = stager.size()
+
+    n = 10
+
+    with self.test_session(use_gpu=True) as sess:
+      # Keys n-1..0
+      keys = list(reversed(six.moves.range(n)))
+
+      for i in keys:
+        sess.run(stage, feed_dict={pi: i, x: i})
+
+      self.assertTrue(sess.run(size) == n)
+
+      # Check that key, values come out in ascending order
+      for i, k in enumerate(reversed(keys)):
+        get_key, values = sess.run(get)
+        self.assertTrue(i == k == get_key == values)
+
+      self.assertTrue(sess.run(size) == 0)
+
+  def testBarrier(self):
+    with self.test_session(use_gpu=True) as sess:
+      with ops.device('/cpu:0'):
+        x = array_ops.placeholder(dtypes.float32)
+        f = array_ops.placeholder(dtypes.float32)
+        v = array_ops.placeholder(dtypes.float32)
+        pi = array_ops.placeholder(dtypes.int64)
+        gi = array_ops.placeholder(dtypes.int64)
+      with ops.device(test.gpu_device_name()):
+        # Test barrier with dictionary
+        stager = data_flow_ops.MapStagingArea(
+            [dtypes.float32, dtypes.float32, dtypes.float32],
+            names=['x', 'v', 'f'])
+        stage_xf = stager.put(pi,{'x': x, 'f': f})
+        stage_v = stager.put(pi, {'v': v})
+        key, ret = stager.get(gi)
+        size = stager.size()
+        isize = stager.incomplete_size()
+
+        # 0 complete and incomplete entries
+        self.assertTrue(sess.run([size, isize]) == [0, 0])
+        # Stage key 0, x and f tuple entries
+        sess.run(stage_xf, feed_dict={pi: 0, x: 1, f: 2})
+        self.assertTrue(sess.run([size, isize]) == [0, 1])
+        # Stage key 1, x and f tuple entries
+        sess.run(stage_xf, feed_dict={pi: 1, x: 1, f: 2})
+        self.assertTrue(sess.run([size, isize]) == [0, 2])
+
+        # Now complete key 0 with tuple entry v
+        sess.run(stage_v, feed_dict={pi: 0, v: 1})
+        # 1 complete and 1 incomplete entry
+        self.assertTrue(sess.run([size, isize]) == [1, 1])
+        # We can now obtain tuple associated with key 0
+        self.assertTrue(sess.run([key, ret], feed_dict={gi:0})
+                                == [0, { 'x':1, 'f':2, 'v':1}])
+
+        # 0 complete and 1 incomplete entry
+        self.assertTrue(sess.run([size, isize]) == [0, 1])
+        # Now complete key 1 with tuple entry v
+        sess.run(stage_v, feed_dict={pi: 1, v: 3})
+        # We can now obtain tuple associated with key 1
+        self.assertTrue(sess.run([key, ret], feed_dict={gi:1})
+                                == [1, { 'x':1, 'f':2, 'v':3}])
+
+        # Test again with index inserts
+        stager = data_flow_ops.MapStagingArea(
+            [dtypes.float32, dtypes.float32, dtypes.float32])
+        stage_xf = stager.put(pi, [x, f], [0, 2])
+        stage_v = stager.put(pi, [v], [1])
+        key, ret = stager.get(gi)
+        size = stager.size()
+        isize = stager.incomplete_size()
+
+        # 0 complete and incomplete entries
+        self.assertTrue(sess.run([size, isize]) == [0, 0])
+        # Stage key 0, x and f tuple entries
+        sess.run(stage_xf, feed_dict={pi: 0, x: 1, f: 2})
+        self.assertTrue(sess.run([size, isize]) == [0, 1])
+        # Stage key 1, x and f tuple entries
+        sess.run(stage_xf, feed_dict={pi: 1, x: 1, f: 2})
+        self.assertTrue(sess.run([size, isize]) == [0, 2])
+
+        # Now complete key 0 with tuple entry v
+        sess.run(stage_v, feed_dict={pi: 0, v: 1})
+        # 1 complete and 1 incomplete entry
+        self.assertTrue(sess.run([size, isize]) == [1, 1])
+        # We can now obtain tuple associated with key 0
+        self.assertTrue(sess.run([key, ret], feed_dict={gi:0})
+                                == [0, [1, 1, 2]])
+
+        # 0 complete and 1 incomplete entry
+        self.assertTrue(sess.run([size, isize]) == [0, 1])
+        # Now complete key 1 with tuple entry v
+        sess.run(stage_v, feed_dict={pi: 1, v: 3})
+        # We can now obtain tuple associated with key 1
+        self.assertTrue(sess.run([key, ret], feed_dict={gi:1})
+                                == [1, [1,3, 2]])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
index 263f90c4f1f2cb16228e38f516e7818e4b4ed1fc..c4418dfd43a66e3b1b30fa0ddd07e8ea7933e727 100644
--- a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
@@ -32,7 +32,7 @@ class InverseOpTest(test.TestCase):
     for np_type in [np.float32, np.float64]:
       for adjoint in False, True:
         y = x.astype(np_type)
-        with self.test_session():
+        with self.test_session(use_gpu=True):
           # Verify that x^{-1} * x == Identity matrix.
           inv = linalg_ops.matrix_inverse(y, adjoint=adjoint)
           tf_ans = math_ops.matmul(inv, y, adjoint_b=adjoint)
@@ -86,8 +86,8 @@ class InverseOpTest(test.TestCase):
     with self.test_session():
       with self.assertRaisesOpError("Input is not invertible."):
         # All rows of the matrix below add to zero.
-        tensor3 = constant_op.constant(
-            [[1., 0., -1.], [-1., 1., 0.], [0., -1., 1.]])
+        tensor3 = constant_op.constant([[1., 0., -1.], [-1., 1., 0.],
+                                        [0., -1., 1.]])
         linalg_ops.matrix_inverse(tensor3).eval()
 
   def testEmpty(self):
diff --git a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
index cdf828f3cae7c6faa6496d15666472cfe09e708a..262c197480e5ab3e36b1b8d553b08756a58b76de 100644
--- a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.platform import test
 
@@ -27,16 +29,16 @@ from tensorflow.python.platform import test
 class MatrixTriangularSolveOpTest(test.TestCase):
 
   def _verifySolveAllWays(self, x, y, batch_dims=None):
-    for use_gpu in True, False:
-      for lower in True, False:
-        for adjoint in True, False:
+    for lower in True, False:
+      for adjoint in True, False:
+        for use_placeholder in True, False:
           self._verifySolve(
               x,
               y,
               lower=lower,
               adjoint=adjoint,
               batch_dims=batch_dims,
-              use_gpu=use_gpu)
+              use_placeholder=use_placeholder)
 
   def _verifySolve(self,
                    x,
@@ -44,7 +46,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
                    lower=True,
                    adjoint=False,
                    batch_dims=None,
-                   use_gpu=False):
+                   use_placeholder=False):
     for np_type in [np.float32, np.float64]:
       a = x.astype(np_type)
       b = y.astype(np_type)
@@ -64,16 +66,30 @@ class MatrixTriangularSolveOpTest(test.TestCase):
         a_np = np.tile(a_np, batch_dims + [1, 1])
         b = np.tile(b, batch_dims + [1, 1])
 
-      with self.test_session(use_gpu=use_gpu):
-        tf_ans = linalg_ops.matrix_triangular_solve(
-            a, b, lower=lower, adjoint=adjoint)
-        out = tf_ans.eval()
-        np_ans = np.linalg.solve(a_np, b)
-        self.assertEqual(np_ans.shape, tf_ans.get_shape())
-        self.assertEqual(np_ans.shape, out.shape)
-        self.assertAllClose(np_ans, out)
+      with self.test_session(use_gpu=True) as sess:
+        if use_placeholder:
+          a_tf = array_ops.placeholder(a.dtype)
+          b_tf = array_ops.placeholder(b.dtype)
+          tf_ans = linalg_ops.matrix_triangular_solve(
+              a_tf, b_tf, lower=lower, adjoint=adjoint)
+          tf_val = sess.run(tf_ans, feed_dict={a_tf: a, b_tf: b})
+          np_ans = np.linalg.solve(a_np, b)
+        else:
+          a_tf = constant_op.constant(a)
+          b_tf = constant_op.constant(b)
+          tf_ans = linalg_ops.matrix_triangular_solve(
+              a_tf, b_tf, lower=lower, adjoint=adjoint)
+          tf_val = tf_ans.eval()
+          np_ans = np.linalg.solve(a_np, b)
+          self.assertEqual(np_ans.shape, tf_ans.get_shape())
+        self.assertEqual(np_ans.shape, tf_val.shape)
+        self.assertAllClose(np_ans, tf_val)
 
   def testSolve(self):
+    # 1x1 matrix, single rhs.
+    matrix = np.array([[0.1]])
+    rhs0 = np.array([[1.]])
+    self._verifySolveAllWays(matrix, rhs0)
     # 2x2 matrices, single right-hand side.
     matrix = np.array([[1., 2.], [3., 4.]])
     rhs0 = np.array([[1.], [1.]])
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index cd5bee362d79674fef02ef471a53b670ba1b3a2a..543039bdd3dcdddce2a9926384d919cba2b4623a 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -1169,7 +1169,7 @@ class AUCTest(test.TestCase):
       self.assertAlmostEqual(1, auc.eval(), 6)
 
   def np_auc(self, predictions, labels, weights):
-    """Computes the AUC explicitely using Numpy.
+    """Computes the AUC explicitly using Numpy.
 
     Args:
       predictions: an ndarray with shape [N].
diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..30795eed8a063076a69ec2ec7851788775fe4dc6
--- /dev/null
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -0,0 +1,287 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for neon kernel for depthwise convolutional operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+
+def ConfigsToTest():
+  """Iterator for different convolution shapes, strides and paddings.
+
+  Yields:
+    Tuple (input_size, filter_size, out_size, stride, padding), the depthwise
+    convolution parameters.
+  """
+  input_sizes = [[4, 5, 5, 48], [4, 8, 8, 84], [4, 17, 17, 48], [4, 35, 35, 2],
+                 [4, 147, 147, 2], [3, 299, 299, 3], [5, 183, 183, 1]]
+  filter_sizes = [[1, 1, 48, 2], [1, 3, 84, 1], [3, 1, 48, 4], [5, 5, 2, 1],
+                  [3, 3, 2, 8], [2, 2, 3, 8], [5, 5, 1, 2]]
+  out_sizes = [[4, 5, 5, 96], [4, 8, 8, 84], [4, 17, 17, 192], [4, 35, 35, 2],
+               [4, 49, 49, 16], [3, 150, 150, 24], [5, 92, 92, 2]]
+  strides = [1, 1, 1, 1, 3, 2, 2]
+  # pylint: disable=invalid-name
+  VALID = "VALID"
+  SAME = "SAME"
+  # pylint: enable=invalid-name
+  paddings = [SAME, SAME, SAME, SAME, VALID, SAME, SAME, SAME]
+  for i, f, o, s, p in zip(input_sizes, filter_sizes, out_sizes, strides,
+                           paddings):
+    yield i, f, o, s, p
+
+
+def CheckGradConfigsToTest():
+  """Iterator for different convolution shapes, strides and paddings.
+
+  compute_gradient_error() is very expensive. So the configs should be
+  relatively small.
+
+  Yields:
+    Tuple (input_size, filter_size, out_size, stride, padding), the depthwise
+    convolution parameters.
+  """
+  input_sizes = [[2, 5, 8, 1], [4, 5, 5, 1], [2, 4, 4, 2], [1, 15, 15, 2],
+                 [2, 15, 16, 1]]
+  filter_sizes = [[4, 4, 1, 2], [2, 2, 1, 2], [3, 1, 2, 2], [1, 3, 2, 1],
+                  [3, 3, 1, 2]]
+  out_sizes = [[2, 5, 8, 2], [4, 2, 2, 2], [2, 4, 4, 4], [1, 15, 15, 2],
+               [2, 5, 5, 2]]
+  strides = [1, 2, 1, 1, 3]
+  # pylint: disable=invalid-name
+  VALID = "VALID"
+  SAME = "SAME"
+  # pylint: enable=invalid-name
+  paddings = [SAME, VALID, SAME, SAME, VALID]
+  for i, f, o, s, p in zip(input_sizes, filter_sizes, out_sizes, strides,
+                           paddings):
+    yield i, f, o, s, p
+
+
+class DepthwiseConv2DTest(test.TestCase):
+
+  # This is testing that depthwise_conv2d and depthwise_conv2d_native
+  # produce the same results.  It also tests that NCHW and NWHC
+  # formats agree, by comparing the depthwise_conv2d_native with
+  # 'NCHW' format (with transposition) matches the 'NHWC' format using
+  # the higher level interface.
+  def _VerifyValues(self,
+                    tensor_in_sizes,
+                    filter_in_sizes,
+                    stride,
+                    padding,
+                    use_gpu,
+                    data_format="NHWC"):
+    """Verifies the output values of the convolution function.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in
+        [batch, input_rows, input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in
+        [filter_rows, filter_cols, input_depth, depth_multiplier].
+      stride: Stride.
+      padding: Padding type.
+      use_gpu: Whether to use GPU.
+      data_format: The data_format of the input.  "NHWC" or "NCHW".
+    """
+    total_size_1 = 1
+    total_size_2 = 1
+    for s in tensor_in_sizes:
+      total_size_1 *= s
+    for s in filter_in_sizes:
+      total_size_2 *= s
+    # Initializes the input and filter tensor with numbers incrementing from 1.
+    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    with self.test_session(use_gpu=use_gpu) as sess:
+      with sess.graph._kernel_label_map({"DepthwiseConv2dNative": "neon"}):
+        t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+        t1.set_shape(tensor_in_sizes)
+        t2 = constant_op.constant(x2, shape=filter_in_sizes)
+
+      native_t1 = t1
+      strides = [1, stride, stride, 1]
+      if data_format == "NCHW":
+        # Transpose from NWHC input to NCHW
+        # Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
+        native_t1 = array_ops.transpose(t1, [0, 3, 1, 2])
+        strides = [1, 1, stride, stride]
+
+      conv_native = nn_ops.depthwise_conv2d_native(
+          native_t1,
+          t2,
+          strides=strides,
+          data_format=data_format,
+          padding=padding)
+
+      if data_format == "NCHW":
+        # Transpose back from NCHW to NHWC
+        conv_native = array_ops.transpose(conv_native, [0, 2, 3, 1])
+
+      conv_interface = nn_impl.depthwise_conv2d(
+          t1, t2, strides=[1, stride, stride, 1], padding=padding)
+
+      native_result = sess.run(conv_native)
+      interface_result = sess.run(conv_interface)
+
+    print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes,
+          ", stride:", stride, ", padding: ", padding, ", max diff: ",
+          np.amax(np.absolute(native_result - interface_result)))
+    self.assertArrayNear(
+        np.ravel(native_result), np.ravel(interface_result), 1e-5)
+    self.assertShapeEqual(native_result, conv_native)
+    self.assertShapeEqual(native_result, conv_interface)
+
+  def testDepthwiseConv2D(self):
+    for index, (input_size, filter_size, _, stride,
+                padding) in enumerate(ConfigsToTest()):
+      print("Processing ", index, "th config.")
+      if index == 2:
+        self._VerifyValues(
+            input_size, filter_size, stride, padding, use_gpu=True)
+      self._VerifyValues(
+          input_size, filter_size, stride, padding, use_gpu=False)
+
+  def testDepthwiseConv2DFormat(self):
+    if not test.is_gpu_available():
+      return
+
+    for index, (input_size, filter_size, _, stride,
+                padding) in enumerate(ConfigsToTest()):
+      print("Processing ", index, "th config.")
+      self._VerifyValues(
+          input_size,
+          filter_size,
+          stride,
+          padding,
+          use_gpu=True,
+          data_format="NCHW")
+
+# This is testing against hand calculated results.
+
+  def _VerifyHandValues(self, tensor_in_sizes, filter_in_sizes, stride, padding,
+                        expected, use_gpu):
+    """Verifies the output values of the depthwise convolution function.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in
+        [batch, input_rows, input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in
+        [filter_rows, filter_cols, input_depth, depth_multiplier].
+      stride: Stride.
+      padding: Padding type.
+      expected: An array containing the expected operation outputs.
+      use_gpu: Whether to use GPU.
+    """
+    total_size_1 = 1
+    total_size_2 = 1
+    for s in tensor_in_sizes:
+      total_size_1 *= s
+    for s in filter_in_sizes:
+      total_size_2 *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    with self.test_session(use_gpu=use_gpu) as sess:
+      with sess.graph._kernel_label_map({"DepthwiseConv2dNative": "neon"}):
+        t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+        t1.set_shape(tensor_in_sizes)
+        t2 = constant_op.constant(x2, shape=filter_in_sizes)
+        conv = nn_ops.depthwise_conv2d_native(
+            t1, t2, strides=[1, stride, stride, 1], padding=padding)
+        value = sess.run(conv)
+    print("value = ", value)
+    self.assertArrayNear(expected, np.ravel(value), 1e-5)
+    self.assertShapeEqual(value, conv)
+
+  def testConv2D2x2Filter(self):
+    # The inputs look like this (it's a 3 x 2 matrix, each of depth 2):
+    #
+    # [ (1.0, 2.0), (3.0,  4.0), ( 5.0,  6.0) ]
+    # [ (7.0, 8.0), (9.0, 10.0), (11.0, 12.0) ]
+    #  We can view this as two inputs
+    #
+    #  input depth 0:
+    #
+    #  [ 1.0,  3.0,  5.0 ]
+    #  [ 7.0,  9.0, 11.0 ]
+    #
+    #  input depth 1:
+    #
+    #  [ 2.0,  4.0,  6.0 ]
+    #  [ 8.0, 10.0, 12.0 ]
+    #
+    # The filter looks like this (it has two 2 x 2 patches, each generating 2
+    # depths):
+    #
+    #  filter #0:
+    #
+    #  [ (1.0,  3.0), ( 5.0,  7.0)]
+    #  [ (9.0, 11.0), (13.0, 15.0)]
+    #
+    #  filter #1:
+    #
+    #  [ ( 2.0,  4.0), ( 6.0,  8.0)]
+    #  [ (10.0, 12.0), (14.0, 16.0)]
+    #
+    # So the outputs are:
+    #
+    # (position 0, 0: in_depth 0, output_depth 0 -- using filter #0)
+    #  1.0 * 1.0 + 7.0 * 9.0 + 3.0 * 5.0 + 9.0 * 13.0 = 196
+    # (position 0, 0: in_depth 0, output_depth 1 -- using filter #1)
+    #  1.0 * 2.0 + 7.0 * 10.0 + 3.0 * 6.0 + 9.0 * 14.0 = 216
+    # (position 0, 0: in_depth 1, output_depth 2 -- using filter #0)
+    #  2.0 * 3.0 + 8.0 * 11.0 + 4.0 * 7.0 + 10.0 * 15.0 = 272
+    # (position 0, 0: in_depth 1, output_depth 3 -- using filter #1)
+    #  2.0 * 4.0 + 8.0 * 12.0 + 4.0 * 8.0 + 10.0 * 16.0 = 296
+    #
+    # (position 1, 0: in_depth 0, output_depth 0 -- using filter #0)
+    #  3.0 * 1.0 + 9.0 * 9.0 + 5.0 * 5.0 + 11.0 * 13.0 = 252
+    # (position 1, 0: in_depth 0, output_depth 1 -- using filter #1)
+    #  3.0 * 2.0 + 9.0 * 10.0 + 5.0 * 6.0 + 11.0 * 14.0 = 280
+    # (position 1, 0: in_depth 1, output_depth 2 -- using filter #0)
+    #  4.0 * 3.0 + 10.0 * 11.0 + 6.0 * 7.0 + 12.0 * 15.0 = 344
+    # (position 1, 0: in_depth 1, output_depth 3 -- using filter #1)
+    #  4.0 * 4.0 + 10.0 * 12.0 + 6.0 * 8.0 + 12.0 * 16.0 = 376
+    expected_output = [196, 216, 272, 296, 252, 280, 344, 376]
+    self._VerifyHandValues(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[2, 2, 2, 2],
+        stride=1,
+        padding="VALID",
+        expected=expected_output,
+        use_gpu=False)
+
+    self._VerifyHandValues(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[2, 2, 2, 2],
+        stride=1,
+        padding="VALID",
+        expected=expected_output,
+        use_gpu=True)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
index 5e9b7766a7745dd6dfede2b331d9f6e3731b748f..fa1553a3f6b421e551b19ad763cb5434bb528eb6 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
@@ -261,7 +261,7 @@ class PoolingTest(test.TestCase):
           padding=padding,
           data_format=data_format,
           name=func_name)
-      t_g = gradients_impl.gradients(t ** 2, input_tensor)[0]
+      t_g = gradients_impl.gradients(t**2, input_tensor)[0]
 
       err_g = gradient_checker.compute_gradient_error(
           input_tensor,
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 85b01be26633e65d2c273b092ebee847c456d8ec..1b6c8bef9864e754dce6e5224e5015cd44b8b3ab 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -24,10 +24,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -97,7 +97,7 @@ class PoolingTest(test.TestCase):
     # Initializes the input tensor with array containing incrementing
     # numbers from 1.
     x = [f * 1.0 for f in range(1, total_size + 1)]
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.test_session(use_gpu=use_gpu):
       t = constant_op.constant(x, shape=input_sizes, dtype=data_type)
       if data_format == "NCHW":
         t = test_util.NHWCToNCHW(t)
@@ -497,7 +497,7 @@ class PoolingTest(test.TestCase):
                                          strides,
                                          error_msg,
                                          use_gpu=False):
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.test_session(use_gpu=use_gpu):
       t = constant_op.constant(1.0, shape=in_size)
       with self.assertRaisesRegexp(errors_impl.UnimplementedError, error_msg):
         t = nn_ops.max_pool(
@@ -562,7 +562,8 @@ class PoolingTest(test.TestCase):
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
       # accurate than the GPU version that does the accumulation on fp32
-      self.assertAllCloseAccordingToType(cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
+      self.assertAllCloseAccordingToType(
+          cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
 
   def _CompareMaxPoolingGradBk(self, input_shape, output_shape, ksize, strides,
                                padding):
@@ -570,14 +571,13 @@ class PoolingTest(test.TestCase):
       # Generate numbers in a narrow range, so that there are many duplicates
       # in the input.
       tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
-      tensor_output = np.random.rand(*output_shape).astype(dtype)
       with self.test_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
         argmax = argmax_op.eval()
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
-        out_op = gen_nn_ops._max_pool_grad_grad_with_argmax(t, grad_in, argmax,
-                                                            ksize, strides, padding)
+        out_op = gen_nn_ops._max_pool_grad_grad_with_argmax(
+            t, grad_in, argmax, ksize, strides, padding)
         gpu_val = out_op.eval()
         self.assertShapeEqual(gpu_val, out_op)
       with self.test_session(use_gpu=False):
@@ -585,13 +585,14 @@ class PoolingTest(test.TestCase):
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
         orig_out = out_op.eval()
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
-        out_op = gen_nn_ops._max_pool_grad_grad(t, orig_out, grad_in, ksize, strides,
-                                                padding)
+        out_op = gen_nn_ops._max_pool_grad_grad(t, orig_out, grad_in, ksize,
+                                                strides, padding)
         cpu_val = out_op.eval()
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
       # accurate than the GPU version that does the accumulation on fp32
-      self.assertAllCloseAccordingToType(cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
+      self.assertAllCloseAccordingToType(
+          cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
 
   def testMaxPoolingWithArgmax(self):
     # MaxPoolWithArgMax is implemented only on CUDA.
@@ -619,7 +620,7 @@ class PoolingTest(test.TestCase):
     orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     tensor_input = [11.0, 12.0, 13.0, 14.0]
     tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
-    with self.test_session(use_gpu=True) as sess:
+    with self.test_session(use_gpu=True):
       orig_in = constant_op.constant(orig_input, shape=[1, 3, 3, 1])
       t = constant_op.constant(tensor_input, shape=[1, 2, 2, 1])
       argmax = constant_op.constant(
@@ -642,7 +643,7 @@ class PoolingTest(test.TestCase):
     orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     tensor_input = [11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0]
     tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
-    with self.test_session(use_gpu=True) as sess:
+    with self.test_session(use_gpu=True):
       orig_in = constant_op.constant(orig_input, shape=[1, 3, 3, 1])
       t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
       argmax = constant_op.constant(
@@ -655,8 +656,7 @@ class PoolingTest(test.TestCase):
           strides=[1, 1, 1, 1],
           padding="VALID")
       out = out_op.eval().flatten()
-      self.assertAllClose(out,
-                          [11.0, 12.0, 14.0, 16.0])
+      self.assertAllClose(out, [11.0, 12.0, 14.0, 16.0])
 
   def _ConstructAndTestGradient(self,
                                 pool_func,
@@ -791,16 +791,16 @@ class PoolingTest(test.TestCase):
         strides = [1, row_stride, col_stride, 1]
         t = input_tensor
       t = pool_func(
-        t,
-        ksize=ksize,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        name=func_name)
+          t,
+          ksize=ksize,
+          strides=strides,
+          padding=padding,
+          data_format=data_format,
+          name=func_name)
       if data_format == "NCHW":
         t = test_util.NHWCToNCHW(t)
 
-      t_g = gradients_impl.gradients(t ** 2, input_tensor)[0]
+      t_g = gradients_impl.gradients(t**2, input_tensor)[0]
       err = gradient_checker.compute_gradient_error(
           input_tensor,
           input_sizes,
@@ -952,7 +952,7 @@ class PoolingTest(test.TestCase):
                              expected_input_backprop, input_sizes, output_sizes,
                              window_rows, window_cols, row_stride, col_stride,
                              padding, use_gpu):
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.test_session(use_gpu=use_gpu):
       input_tensor = constant_op.constant(input_data, shape=input_sizes)
       output_tensor = nn_ops.max_pool(input_tensor,
                                       [1, window_rows, window_cols, 1],
@@ -1312,8 +1312,10 @@ class PoolingTest(test.TestCase):
       A Tensor.
     """
     return gen_nn_ops._max_pool_grad_grad(orig_input, orig_output, grad,
-                                          [1, window_rows, window_cols, 1],
-                                          [1, row_stride, col_stride, 1], padding)
+                                          [1, window_rows, window_cols,
+                                           1], [1, row_stride, col_stride,
+                                                1], padding)
+
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testAvgPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1501,7 +1503,9 @@ def GetMaxPoolGradTest(input_size, filter_size, output_size, strides, padding):
 
   return Test
 
-def GetMaxPoolGradGradTest(input_size, filter_size, output_size, strides, padding):
+
+def GetMaxPoolGradGradTest(input_size, filter_size, output_size, strides,
+                           padding):
 
   def Test(self):
     # MaxPoolWithArgMax is implemented only on CUDA.
@@ -1522,6 +1526,6 @@ if __name__ == "__main__":
             GetMaxPoolGradTest(input_size_, filter_size_, output_size_, stride_,
                                padding_))
     setattr(PoolingTest, "testMaxPoolGradGrad_" + name_,
-            GetMaxPoolGradGradTest(input_size_, filter_size_, output_size_, stride_,
-                                   padding_))
+            GetMaxPoolGradGradTest(input_size_, filter_size_, output_size_,
+                                   stride_, padding_))
   test.main()
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index f1bb3bdc22896ae2554cbd6b197a8a36be936b23..e098cf3ff9ca88bfee7746b2916e8dd947f664f2 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -186,9 +186,9 @@ class PyOpTest(test.TestCase):
 
       def bad():
         # Non-string python objects aren't supported.
-        return dtypes.float32
+        return {"foo": dtypes.float32}
 
-      z, = script_ops.py_func(bad, [], [dtypes.float64])
+      z, = script_ops.py_func(bad, [], [dtypes.int64])
 
       with self.assertRaisesRegexp(errors.UnimplementedError,
                                    "Unsupported object type"):
@@ -283,6 +283,28 @@ class PyOpTest(test.TestCase):
     with self.test_session() as sess:
       self.assertEqual(sess.run(f), [])
 
+  def _testExceptionHandling(self, py_exp, tf_exp):
+
+    def raise_exception():
+      raise py_exp("blah")  # pylint: disable=not-callable
+
+    f = script_ops.py_func(raise_exception, [], [])
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(tf_exp, "blah"):
+        sess.run(f)
+
+  def testExceptionHandling(self):
+    self._testExceptionHandling(ValueError, errors.InvalidArgumentError)
+    self._testExceptionHandling(TypeError, errors.InvalidArgumentError)
+    self._testExceptionHandling(StopIteration, errors.OutOfRangeError)
+    self._testExceptionHandling(MemoryError, errors.ResourceExhaustedError)
+    self._testExceptionHandling(NotImplementedError, errors.UnimplementedError)
+
+    class WeirdError(Exception):
+      pass
+
+    self._testExceptionHandling(WeirdError, errors.UnknownError)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/random_ops_test.py b/tensorflow/python/kernel_tests/random_ops_test.py
index fa323b24963e23c356ab1643744dd9313e3a2514..d44c0b3d9fbff3aa50a6221305655848186960a1 100644
--- a/tensorflow/python/kernel_tests/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -157,6 +158,13 @@ class TruncatedNormalTest(test.TestCase):
       print("std(x)", np.std(x), abs(np.std(x) / stddev - 0.85))
       self.assertTrue(abs(np.std(x) / stddev - 0.85) < 0.04)
 
+  def testLargeShape(self):
+    with self.test_session(use_gpu=True):
+      v = variables.Variable(
+          array_ops.zeros(dtype=dtypes.float32, shape=[2**33, 1]))
+      n = random_ops.truncated_normal(v.shape)
+      self.assertEqual([8589934592, 1], n.shape.as_list())
+
   def testNoCSE(self):
     with self.test_session(use_gpu=True):
       shape = [2, 3, 4]
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 5e8f8e8673a24f6370159a50d908170c20c54407..10f34751d0b2e55faaba7652377b2b7b8659323d 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -352,9 +352,20 @@ class FixedLengthRecordReaderTest(test.TestCase):
     self._record_bytes = 3
     self._footer_bytes = 2
 
+    self._hop_bytes = 2
+    self._num_overlapped_records = 3
+
   def _Record(self, f, r):
     return compat.as_bytes(str(f * 2 + r) * self._record_bytes)
 
+  def _OverlappedRecord(self, f, r):
+    record_str = "".join([
+        str(i)[0]
+        for i in range(r * self._hop_bytes,
+                       r * self._hop_bytes + self._record_bytes)
+    ])
+    return compat.as_bytes(record_str)
+
   def _CreateFiles(self):
     filenames = []
     for i in range(self._num_files):
@@ -367,6 +378,23 @@ class FixedLengthRecordReaderTest(test.TestCase):
         f.write(b"F" * self._footer_bytes)
     return filenames
 
+  def _CreateOverlappedRecordFiles(self):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(),
+                        "fixed_length_overlapped_record.%d.txt" % i)
+      filenames.append(fn)
+      with open(fn, "wb") as f:
+        f.write(b"H" * self._header_bytes)
+        all_records_str = "".join([
+            str(i)[0]
+            for i in range(self._record_bytes + self._hop_bytes *
+                           (self._num_overlapped_records - 1))
+        ])
+        f.write(compat.as_bytes(all_records_str))
+        f.write(b"F" * self._footer_bytes)
+    return filenames
+
   def testOneEpoch(self):
     files = self._CreateFiles()
     with self.test_session() as sess:
@@ -374,6 +402,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
           header_bytes=self._header_bytes,
           record_bytes=self._record_bytes,
           footer_bytes=self._footer_bytes,
+          hop_bytes=0,
           name="test_reader")
       queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
       key, value = reader.read(queue)
@@ -390,6 +419,31 @@ class FixedLengthRecordReaderTest(test.TestCase):
                                     "\\(requested 1, current size 0\\)"):
         k, v = sess.run([key, value])
 
+  def testOneEpochWithHopBytes(self):
+    files = self._CreateOverlappedRecordFiles()
+    with self.test_session() as sess:
+      reader = io_ops.FixedLengthRecordReader(
+          header_bytes=self._header_bytes,
+          record_bytes=self._record_bytes,
+          footer_bytes=self._footer_bytes,
+          hop_bytes=self._hop_bytes,
+          name="test_reader")
+      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue_many([files]).run()
+      queue.close().run()
+      for i in range(self._num_files):
+        for j in range(self._num_overlapped_records):
+          k, v = sess.run([key, value])
+          print(v)
+          self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
+          self.assertAllEqual(self._OverlappedRecord(i, j), v)
+
+      with self.assertRaisesOpError("is closed and has insufficient elements "
+                                    "\\(requested 1, current size 0\\)"):
+        k, v = sess.run([key, value])
+
 
 class TFRecordReaderTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 316c23609cd667e3df1c3096b5cbd5d34f5fa1fa..1dfc7f48d572516c0d2dfbdffe06c30084b46c30 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -18,6 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+import numbers
+
 import numpy as np
 
 from tensorflow.python.framework import constant_op
@@ -29,6 +32,26 @@ from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
+# The maximum input rank to test.
+_MAX_RANK = 5
+
+
+def _powerset(iterable):
+  """Helper for generating all possible reduction_axes arguments.
+
+  Example:
+  powerset([0,1,2]): () (0,) (1,) (2,) (0,1) (0,2) (1,2) (0,1,2)
+
+  Args:
+    iterable: An iterable of items to generate the powerset of.
+
+  Returns:
+    The powerset of all items in iterable.
+  """
+  s = list(iterable)
+  return itertools.chain.from_iterable(
+      itertools.combinations(s, r) for r in range(len(s)+1))
+
 
 class ReducedShapeTest(test.TestCase):
 
@@ -68,23 +91,30 @@ class ReducedShapeTest(test.TestCase):
       self._check([10, 10, 10], [-3], [1, 10, 10])
 
 
-class SumReductionTest(test.TestCase):
+class BaseReductionTest(test.TestCase):
 
-  def _compare(self,
-               x,
-               reduction_axes,
-               keep_dims,
-               use_gpu=False,
-               feed_dict=None):
-    np_ans = x
-    if reduction_axes is None:
-      np_ans = np.sum(np_ans, keepdims=keep_dims)
-    else:
-      reduction_axes = np.array(reduction_axes).astype(np.int32)
-      for ra in reduction_axes.ravel()[::-1]:
-        np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
-    with self.test_session(use_gpu=use_gpu) as sess:
-      tf_ans = math_ops.reduce_sum(x, reduction_axes, keep_dims)
+  def _tf_reduce(self, x, reduction_axes, keep_dims):
+    raise NotImplementedError()
+
+  def _np_reduce(self, x, reduction_axes, keep_dims):
+    raise NotImplementedError()
+
+  def _makeIncremental(self, shape, dtype):
+    data = np.arange(np.prod(shape)).reshape(shape).astype(dtype.as_numpy_dtype)
+    if dtype.is_complex:
+      data -= 2j * data
+    return data
+
+  def _makeRandom(self, shape, dtype):
+    data = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
+    if dtype.is_complex:
+      data -= 2j * data
+    return data
+
+  def _compare(self, x, reduction_axes, keep_dims, feed_dict=None):
+    np_ans = self._np_reduce(x, reduction_axes, keep_dims)
+    with self.test_session(use_gpu=True) as sess:
+      tf_ans = self._tf_reduce(x, reduction_axes, keep_dims)
       out = sess.run(tf_ans, feed_dict)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -93,10 +123,45 @@ class SumReductionTest(test.TestCase):
     if reduction_axes is not None and np.shape(reduction_axes) == (1,):
       # Test scalar reduction_axes argument
       self._compareAll(x, reduction_axes[0])
-    self._compare(x, reduction_axes, False, use_gpu=True, feed_dict=feed_dict)
-    self._compare(x, reduction_axes, False, use_gpu=False, feed_dict=feed_dict)
-    self._compare(x, reduction_axes, True, use_gpu=True, feed_dict=feed_dict)
-    self._compare(x, reduction_axes, True, use_gpu=False, feed_dict=feed_dict)
+    self._compare(x, reduction_axes, keep_dims=False, feed_dict=feed_dict)
+    self._compare(x, reduction_axes, keep_dims=True, feed_dict=feed_dict)
+
+  def _compareAllAxes(self, x, feed_dict=None):
+    self._compareAll(x, None)
+    for axes in _powerset(range(x.ndim)):
+      self._compareAll(x, axes, feed_dict)
+
+  def _compareGradient(self, x, reduction_axes, rtol=1e-8, atol=1e-8):
+    if reduction_axes is not None and np.shape(reduction_axes) == (1,):
+      # Test scalar reduction_axes argument
+      self._compareGradient(x, reduction_axes[0], rtol=rtol, atol=atol)
+    with self.test_session(use_gpu=True):
+      t = ops.convert_to_tensor(x)
+      su = self._tf_reduce(t, reduction_axes, False)
+      jacob_t, jacob_n = gradient_checker.compute_gradient(
+          t, x.shape, su, su.get_shape().as_list(), x_init_value=x, delta=1)
+    self.assertAllClose(jacob_t, jacob_n, rtol=rtol, atol=atol)
+
+  def _compareGradientAxes(self, x, rtol=1e-8, atol=1e-8):
+    self._compareGradient(x, None, rtol=rtol, atol=atol)
+    self._compareGradient(x, [], rtol=rtol, atol=atol)
+    self._compareGradient(x, 0, rtol=rtol, atol=atol)
+    self._compareGradient(x, [1], rtol=rtol, atol=atol)
+    self._compareGradient(x, [2], rtol=rtol, atol=atol)
+    self._compareGradient(x, [1, 2], rtol=rtol, atol=atol)
+    self._compareGradient(x, [0, 1, 2, 3], rtol=rtol, atol=atol)
+
+
+class SumReductionTest(BaseReductionTest):
+
+  def _tf_reduce(self, x, reduction_axes, keep_dims):
+    return math_ops.reduce_sum(x, reduction_axes, keep_dims)
+
+  def _np_reduce(self, x, reduction_axes, keep_dims):
+    if isinstance(reduction_axes, list) or isinstance(reduction_axes,
+                                                      np.ndarray):
+      reduction_axes = tuple(reduction_axes)
+    return np.sum(x, axis=reduction_axes, keepdims=keep_dims)
 
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
@@ -105,95 +170,30 @@ class SumReductionTest(test.TestCase):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
-  def testFloatReduce1D(self):
-    # Create a 1D array of floats
-    np_arr = np.arange(1, 6).reshape([5]).astype(np.float32)
-    self._compareAll(np_arr, [0])
+  def testInt32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
+      self._compareAllAxes(np_arr)
 
-  def testFloatReduce2D(self):
-    # Create a 2D array of floats and reduce across all possible
-    # dimensions
-    np_arr = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-    self._compareAll(np_arr, [1])
-    self._compareAll(np_arr, [0, 1])
+  def testFloat32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
+      self._compareAllAxes(np_arr)
 
-  def testFloatReduce3D(self):
-    # Create a 3D array of floats and reduce across all possible
-    # dimensions
-    np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float32)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-    self._compareAll(np_arr, [1])
-    self._compareAll(np_arr, [2])
-    self._compareAll(np_arr, [0, 1])
-    self._compareAll(np_arr, [1, 2])
-    self._compareAll(np_arr, [0, 2])
-    self._compareAll(np_arr, [0, 1, 2])
-    self._compareAll(np_arr, [-1])
-    self._compareAll(np_arr, [-1, -3])
-    self._compareAll(np_arr, [-1, 1])
+  def testFloat64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
+      self._compareAllAxes(np_arr)
 
-  def testFloatReduce4D(self):
-    # Create a 4D array of floats and reduce across some
-    # dimensions
-    np_arr = np.arange(0, 210).reshape([2, 3, 5, 7]).astype(np.float32)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-    self._compareAll(np_arr, [1])
-    self._compareAll(np_arr, [2])
-    self._compareAll(np_arr, [0, 1])
-    self._compareAll(np_arr, [1, 2])
-    # Need specialization for reduce(4D, [0, 2])
-    # self._compareAll(np_arr, [0, 2])
-    self._compareAll(np_arr, [0, 1, 2])
-    self._compareAll(np_arr, [1, 2, 3])
-    self._compareAll(np_arr, [0, 1, 2, 3])
+  def testComplex64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
+      self._compareAllAxes(np_arr)
 
-  def testFloatReduce5D(self):
-    # Create a 5D array of floats and reduce across some dimensions
-    np_arr = np.arange(0, 840).reshape([2, 3, 5, 7, 4]).astype(np.float32)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-    self._compareAll(np_arr, [1])
-    self._compareAll(np_arr, [2])
-    self._compareAll(np_arr, [0, 1])
-    self._compareAll(np_arr, [1, 2])
-    # Need specialization for reduce(4D, [0, 2])
-    # self._compareAll(np_arr, [0, 2])
-    self._compareAll(np_arr, [0, 1, 2])
-    self._compareAll(np_arr, [1, 2, 3])
-    self._compareAll(np_arr, [0, 1, 2, 3])
-    self._compareAll(np_arr, [1, 2, 3, 4])
-    self._compareAll(np_arr, [0, 1, 2, 3, 4])
-
-  # Simple tests for various types.
-  def testDoubleReduce1D(self):
-    np_arr = np.arange(1, 6).reshape([5]).astype(np.float64)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-
-  def testInt32Reduce1D(self):
-    np_arr = np.arange(1, 6).reshape([5]).astype(np.int32)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-
-  def testComplex64Reduce1D(self):
-    np_arr = np.arange(1, 6).reshape([5]).astype(np.complex64)
-    self._compare(np_arr, [], False)
-    self._compare(np_arr, [0], False)
-
-  def testComplex128Reduce1D(self):
-    np_arr = np.arange(1, 6).reshape([5]).astype(np.complex128)
-    self._compare(np_arr, [], False)
-    self._compare(np_arr, [0], False)
+  def testComplex128(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
+      self._compareAllAxes(np_arr)
 
   def testInvalidIndex(self):
     np_arr = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
@@ -250,32 +250,11 @@ class SumReductionTest(test.TestCase):
 
   # Int64??
 
-  def _compareGradient(self, shape, sum_shape, reduction_axes):
-    if reduction_axes is not None and np.shape(reduction_axes) == (1,):
-      # Test scalar reduction_axes argument
-      self._compareGradient(shape, sum_shape, reduction_axes[0])
-    x = np.arange(1.0, 49.0).reshape(shape).astype(np.float64)
-    with self.test_session():
-      t = ops.convert_to_tensor(x)
-      su = math_ops.reduce_sum(t, reduction_axes)
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, shape, su, sum_shape, x_init_value=x, delta=1)
-    self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
-
   def testGradient(self):
-    self._compareGradient([2, 3, 4, 2], [2, 2], [1, 2])
-
-  def testGradient2(self):
-    self._compareGradient([2, 3, 4, 2], [2, 4, 2], [1])
-
-  def testGradient3(self):
-    self._compareGradient([2, 3, 4, 2], [2, 3, 2], [2])
-
-  def testGradient4(self):
-    self._compareGradient([2, 3, 4, 2], [], None)
-
-  def testGradient5(self):
-    self._compareGradient([2, 3, 4, 2], [3, 4, 2], 0)
+    for dtype in [dtypes.float32, dtypes.float64, dtypes.complex64,
+                  dtypes.complex128]:
+      x = self._makeIncremental([2, 3, 4, 2], dtype)
+      self._compareGradientAxes(x)
 
   def testHighRank(self):
     # Do a bunch of random high dimensional reductions
@@ -300,61 +279,45 @@ class SumReductionTest(test.TestCase):
     self._compareAll(x, [1])
 
   def testEmptyGradients(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = array_ops.zeros([0, 3])
       y = math_ops.reduce_sum(x, [1])
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
   def testDegenerate(self):
-    for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
-        for dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
-                      dtypes.complex64, dtypes.complex128):
-          # A large number is needed to get Eigen to die
-          x = array_ops.zeros((0, 9938), dtype=dtype)
-          y = math_ops.reduce_sum(x, [0])
-          self.assertAllEqual(y.eval(), np.zeros(9938))
+    with self.test_session(use_gpu=True):
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
+                    dtypes.complex64, dtypes.complex128):
+        # A large number is needed to get Eigen to die
+        x = array_ops.zeros((0, 9938), dtype=dtype)
+        y = math_ops.reduce_sum(x, [0])
+        self.assertAllEqual(y.eval(), np.zeros(9938))
 
 
-class MeanReductionTest(test.TestCase):
+class MeanReductionTest(BaseReductionTest):
 
-  def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
-    np_ans = x
-    if reduction_axes is None:
-      np_ans = np.mean(np_ans, keepdims=keep_dims)
-    else:
-      reduction_axes = np.array(reduction_axes).astype(np.int32)
-      count = 1
-      for ra in reduction_axes.ravel()[::-1]:
-        np_ans = np.sum(np_ans, axis=ra, keepdims=keep_dims)
-        count *= x.shape[ra]
-      np_ans /= count
-    with self.test_session(use_gpu=use_gpu):
-      tf_ans = math_ops.reduce_mean(x, reduction_axes, keep_dims)
-      out = tf_ans.eval()
-    self.assertAllClose(np_ans, out)
-    self.assertShapeEqual(np_ans, tf_ans)
+  def _tf_reduce(self, x, reduction_axes, keep_dims):
+    return math_ops.reduce_mean(x, reduction_axes, keep_dims)
 
-  def _compareAll(self, x, reduction_axes):
-    self._compare(x, reduction_axes, False, use_gpu=True)
-    self._compare(x, reduction_axes, True, use_gpu=True)
-    self._compare(x, reduction_axes, False, use_gpu=False)
-    self._compare(x, reduction_axes, True, use_gpu=False)
+  def _np_reduce(self, x, reduction_axes, keep_dims):
+    if isinstance(reduction_axes, list) or isinstance(reduction_axes,
+                                                      np.ndarray):
+      reduction_axes = tuple(reduction_axes)
+    elif isinstance(reduction_axes, numbers.Integral):
+      reduction_axes = (reduction_axes,)
 
-  def testFloatReduce3D(self):
-    # Create a 3D array of floats and reduce across all possible
-    # dimensions
-    np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float32)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-    self._compareAll(np_arr, [1])
-    self._compareAll(np_arr, [2])
-    self._compareAll(np_arr, [0, 1])
-    self._compareAll(np_arr, [1, 2])
-    self._compareAll(np_arr, [0, 2])
-    self._compareAll(np_arr, [0, 1, 2])
+    if reduction_axes is None:
+      count = np.prod(x.shape)
+    else:
+      count = np.prod([x.shape[ax] for ax in reduction_axes])
+    # np.mean automatically converts integer inputs to float, while TensorFlow's
+    # reduce_mean does not. For integer inputs, we emulate TensorFlow's behavior
+    # using np.sum and truncating division.
+    np_sum = np.sum(x, axis=reduction_axes, keepdims=keep_dims)
+    if np.issubdtype(x.dtype, np.integer):
+      return np_sum // count
+    return np_sum / count
 
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
@@ -363,83 +326,64 @@ class MeanReductionTest(test.TestCase):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
-  def testDoubleReduce3D(self):
-    # Create a 3D array of doubles and reduce across all possible
-    # dimensions
-    np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float64)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-    self._compareAll(np_arr, [1])
-    self._compareAll(np_arr, [2])
-    self._compareAll(np_arr, [0, 1])
-    self._compareAll(np_arr, [1, 2])
-    self._compareAll(np_arr, [0, 2])
-    self._compareAll(np_arr, [0, 1, 2])
+  def testInt32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
+      self._compareAllAxes(np_arr)
 
-  def testGradient(self):
-    s = [2, 3, 4, 2]
-    x = np.arange(1.0, 49.0).reshape(s).astype(np.float32)
-    with self.test_session():
-      t = ops.convert_to_tensor(x)
-      su = math_ops.reduce_mean(t, [1, 2])
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, s, su, [2, 2], x_init_value=x, delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
+  def testFloat32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
+      self._compareAllAxes(np_arr)
 
-      su = math_ops.reduce_mean(t, [0, 1, 2, 3])
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, s, su, [1], x_init_value=x, delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
+  def testFloat64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
+      self._compareAllAxes(np_arr)
 
-      su = math_ops.reduce_mean(t, [])
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, s, su, [2, 3, 4, 2], x_init_value=x, delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
+  def testComplex64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
+      self._compareAllAxes(np_arr)
 
-      su = math_ops.reduce_mean(t, 0)
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, s, su, [3, 4, 2], x_init_value=x, delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
+  def testComplex128(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
+      self._compareAllAxes(np_arr)
+
+  def testGradient(self):
+    s = [2, 3, 4, 2]
+    for dtype in [dtypes.float32, dtypes.float64]:
+      x = self._makeIncremental(s, dtype)
+      self._compareGradientAxes(x, rtol=1e-3, atol=1e-3)
 
   def testEmptyGradients(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = array_ops.zeros([0, 3])
       y = math_ops.reduce_mean(x, [1])
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
   def testDegenerate(self):
-    for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
-        for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
-          # A large number is needed to get Eigen to die
-          x = array_ops.zeros((0, 9938), dtype=dtype)
-          y = math_ops.reduce_mean(x, [0]).eval()
-          self.assertEqual(y.shape, (9938,))
-          self.assertTrue(np.all(np.isnan(y)))
+    with self.test_session(use_gpu=True):
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        # A large number is needed to get Eigen to die
+        x = array_ops.zeros((0, 9938), dtype=dtype)
+        y = math_ops.reduce_mean(x, [0]).eval()
+        self.assertEqual(y.shape, (9938,))
+        self.assertTrue(np.all(np.isnan(y)))
 
 
-class ProdReductionTest(test.TestCase):
+class ProdReductionTest(BaseReductionTest):
 
-  def _compare(self, x, reduction_axes, keep_dims):
-    np_ans = x
-    if reduction_axes is None:
-      np_ans = np.prod(np_ans, keepdims=keep_dims)
-    else:
-      for ra in reduction_axes[::-1]:
-        np_ans = np.prod(np_ans, axis=ra, keepdims=keep_dims)
-    with self.test_session():
-      if reduction_axes is not None:
-        reduction_axes = np.array(reduction_axes).astype(np.int32)
-      tf_ans = math_ops.reduce_prod(x, reduction_axes, keep_dims)
-      out = tf_ans.eval()
-    self.assertAllClose(np_ans, out)
-    self.assertShapeEqual(np_ans, tf_ans)
+  def _tf_reduce(self, x, reduction_axes, keep_dims):
+    return math_ops.reduce_prod(x, reduction_axes, keep_dims)
 
-  def _compareAll(self, x, reduction_axes):
-    self._compare(x, reduction_axes, False)
-    self._compare(x, reduction_axes, True)
+  def _np_reduce(self, x, reduction_axes, keep_dims):
+    if isinstance(reduction_axes, list) or isinstance(reduction_axes,
+                                                      np.ndarray):
+      reduction_axes = tuple(reduction_axes)
+    return np.prod(x, axis=reduction_axes, keepdims=keep_dims)
 
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
@@ -448,81 +392,70 @@ class ProdReductionTest(test.TestCase):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
-  def testFloatReduce3D(self):
-    # Create a 3D array of floats and reduce across all possible
-    # dimensions
-    np_arr = np.arange(0, 30).reshape([2, 3, 5]).astype(np.float32)
-    self._compareAll(np_arr, None)
-    self._compareAll(np_arr, [])
-    self._compareAll(np_arr, [0])
-    self._compareAll(np_arr, [1])
-    self._compareAll(np_arr, [2])
-    self._compareAll(np_arr, [0, 1])
-    self._compareAll(np_arr, [1, 2])
-    self._compareAll(np_arr, [0, 2])
-    self._compareAll(np_arr, [0, 1, 2])
-
-  def _compareGradient(self, x):
-    with self.test_session():
-      t = ops.convert_to_tensor(x)
-
-      su = math_ops.reduce_prod(t, [])
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, x.shape, su, [2, 3, 4, 2], x_init_value=x, delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
-
-      su = math_ops.reduce_prod(t, [1, 2])
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, x.shape, su, [2, 2], x_init_value=x, delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
-
-      su = math_ops.reduce_prod(t, [0, 1, 2, 3])
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, x.shape, su, [1], x_init_value=x, delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
-
-      su = math_ops.reduce_prod(t, 0)
-      jacob_t, jacob_n = gradient_checker.compute_gradient(
-          t, x.shape, su, [3, 4, 2], x_init_value=x, delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
+  def testInt32(self):
+    # Numpy automatically upgrades the type of np.prod from int32 to int64, so
+    # Numpy does not overflow an int32 np.prod while TensorFlow does. To avoid
+    # overflow, divide the incremental int32 array by 2.
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.int32) / 2
+      self._compareAllAxes(np_arr)
+
+  def testFloat32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
+      self._compareAllAxes(np_arr)
+
+  def testFloat64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
+      self._compareAllAxes(np_arr)
+
+  def testComplex64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
+      self._compareAllAxes(np_arr)
+
+  def testComplex128(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
+      self._compareAllAxes(np_arr)
 
   def testGradientWithZeros(self):
     s = [2, 3, 4, 2]
-    x = np.arange(1.0, 49.0).reshape(s).astype(np.float32) / 20.
+    x = self._makeIncremental(s, dtypes.float32) / 20.
     # No zeros in input
-    self._compareGradient(x)
+    self._compareGradientAxes(x, rtol=1e-3, atol=1e-3)
     # Zero at beginning
     x1 = x.copy()
     x1[:, :, 0, :] = 0
-    self._compareGradient(x1)
+    self._compareGradientAxes(x1, rtol=1e-3, atol=1e-3)
     # Zero at end
     x2 = x.copy()
     x2[:, :, -1, :] = 0
-    self._compareGradient(x2)
+    self._compareGradientAxes(x2, rtol=1e-3, atol=1e-3)
     # Zero in middle
     x3 = x.copy()
     x3[:, :, 2, :] = 0
-    self._compareGradient(x3)
+    self._compareGradientAxes(x3, rtol=1e-3, atol=1e-3)
     # All zeros
     x4 = x.copy()
     x4[:, :, :, :] = 0
-    self._compareGradient(x4)
+    self._compareGradientAxes(x4, rtol=1e-3, atol=1e-3)
 
   def testEmptyGradients(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = array_ops.zeros([0, 3])
       y = math_ops.reduce_prod(x, [1])
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
   def testDegenerate(self):
-    for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
-        for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
-          # A large number is needed to get Eigen to die
-          x = array_ops.zeros((0, 9938), dtype=dtype)
-          y = math_ops.reduce_prod(x, [0])
-          self.assertAllEqual(y.eval(), np.ones(9938))
+    with self.test_session(use_gpu=True):
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        # A large number is needed to get Eigen to die
+        x = array_ops.zeros((0, 9938), dtype=dtype)
+        y = math_ops.reduce_prod(x, [0])
+        self.assertAllEqual(y.eval(), np.ones(9938))
 
 
 class MinReductionTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 0b81dcb8afecb3865587cad0c7b594bef1af430d..3be4eb06d3ee391d90a6ac0d257ed7aaf5ae2ed1 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -151,6 +152,14 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v.assign(2.0).eval()
       self.assertEqual(2.0, v.value().eval())
 
+  def testToFromProto(self):
+    with self.test_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+
+      w = resource_variable_ops.ResourceVariable.from_proto(v.to_proto())
+      self.assertEquals(2, math_ops.add(w, 1).eval())
+
   def testAssignAddMethod(self):
     with self.test_session():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -195,6 +204,47 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertIsInstance(w.dtype, dtypes.DType)
     self.assertEqual(v.dtype, w.dtype)
 
+  def testCachingDevice(self):
+    with ops.device("/job:server/task:1"):
+      v = resource_variable_ops.ResourceVariable(
+          2.0, caching_device="/job:localhost")
+      self.assertEqual("/job:localhost", v.value().device)
+      with self.assertRaisesRegexp(ValueError, "No attr named '_class'"):
+        _ = v.value().op.get_attr("_class")
+
+    with ops.colocate_with(v.op):
+      w = resource_variable_ops.ResourceVariable(
+          2.0, caching_device="/job:localhost")
+      self.assertEqual("/job:localhost", w.value().device)
+      with self.assertRaisesRegexp(ValueError, "No attr named '_class'"):
+        _ = w.value().op.get_attr("_class")
+
+  def testSharedName(self):
+    with self.test_session():
+      v = resource_variable_ops.ResourceVariable(300.0, name="var1")
+      v.initializer.run()
+
+      w = resource_variable_ops.var_handle_op(dtype=v.dtype.base_dtype,
+                                              shape=v.get_shape(),
+                                              shared_name="var1")
+      w_read = resource_variable_ops.read_variable_op(w, v.dtype.base_dtype)
+      self.assertEqual(300.0, w_read.eval())
+
+      x = resource_variable_ops.var_handle_op(dtype=v.dtype.base_dtype,
+                                              shape=v.get_shape(),
+                                              shared_name="var1/")
+      x_read = resource_variable_ops.read_variable_op(x, v.dtype.base_dtype)
+      with self.assertRaisesOpError("Resource .*/var1//.* does not exist"):
+        _ = x_read.eval()
+
+  def testSetInitialValue(self):
+    with self.test_session():
+      # Initialize variable with a value different from the initial value passed
+      # in the constructor.
+      v = resource_variable_ops.ResourceVariable(2.0)
+      v.initializer.run(feed_dict={v.initial_value: 3.0})
+      self.assertEqual(3.0, v.value().eval())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index c5d26408c27130020c9a9e8ce8787d1d22c3e400..5f61c54c784dc9c5a475e49c7b113b39fbb875a8 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -402,7 +402,7 @@ def dynamic_rnn_swap_memory_benchmark(batch_size, max_time, num_units):
 
 
 def rnn_long_sequence_benchmark(batch_size, seqlen, num_units, dynamic,
-                                swap_memory):
+                                swap_memory, nn):
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
 
@@ -415,7 +415,7 @@ def rnn_long_sequence_benchmark(batch_size, seqlen, num_units, dynamic,
   ]
   inputs = np.dstack(inputs_list).transpose([0, 2, 1])  # batch x time x depth
 
-  for _ in range(5):
+  for _ in range(nn):
     if dynamic:
       with session.Session(config=config, graph=ops_lib.Graph()) as sess:
         inputs_t = variables_lib.Variable(inputs, trainable=False).value()
@@ -548,6 +548,23 @@ class BenchmarkRNN(test.Benchmark):
                 iters=20,
                 wall_time=t_dt)
 
+  def _benchmarkDynamicLSTMMemorySwapLongSeq(self):
+    """The memory swapping test for the SOSP submission."""
+    print("Calculation: Long LSTM Sequence")
+    print("batch \t len \t units \t dynamic \t elapsed_t \t elapsed_t/len")
+    batch_size = 512
+    seqlen = 800
+    num_units = 512
+    dynamic = True
+    swap_memory = True
+    # Some warming up.
+    if swap_memory:
+      rnn_long_sequence_benchmark(batch_size, seqlen, num_units,
+                                  dynamic, swap_memory, 2)
+    # Measure the performance.
+    for slen in xrange(100, 1100, 100):
+      rnn_long_sequence_benchmark(batch_size, slen, num_units, dynamic,
+                                  swap_memory, 3)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/scalar_strict_test.py b/tensorflow/python/kernel_tests/scalar_test.py
similarity index 95%
rename from tensorflow/python/kernel_tests/scalar_strict_test.py
rename to tensorflow/python/kernel_tests/scalar_test.py
index e208217637c0841b07320e74db56cdb24b84f351..b34426cc21590d585bf7ef7b24b778adbf0cd084 100644
--- a/tensorflow/python/kernel_tests/scalar_strict_test.py
+++ b/tensorflow/python/kernel_tests/scalar_test.py
@@ -27,20 +27,15 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import sparse_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
-from tensorflow.python.platform import control_imports
 from tensorflow.python.platform import test
 
 
-class ScalarStrictTest(test.TestCase):
+class ScalarTest(test.TestCase):
 
   def check(self, op, args, error, correct=None):
     # Within Google, the switch to scalar strict occurred at version 6.
-    if control_imports.USE_OSS:
-      lenient = []
-      strict = [5, 6]
-    else:
-      lenient = [5]
-      strict = [6]
+    lenient = []
+    strict = [5, 6]
 
     # Use placeholders to bypass shape inference, since only the C++
     # GraphDef level is ever scalar lenient.
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index fead455bfaabc8de9bdff6b68488bce48e2cbb36..33269c912343311cba22aace50bdb8b0ba87b127 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -36,8 +36,11 @@ class SegmentReductionHelper(test.TestCase):
       num_elem *= x
     values = np.arange(1, num_elem + 1)
     np_values = values.reshape(input_shape).astype(dtype.as_numpy_dtype)
+    # Add a non-zero imaginary component to complex types.
+    if dtype.is_complex:
+      np_values -= 1j * np_values
     return constant_op.constant(
-        values, shape=input_shape, dtype=dtype), np_values
+        np_values, shape=input_shape, dtype=dtype), np_values
 
   def _segmentReduce(self, indices, x, op1, op2=None, num_out_rows=None):
     if not x.size:
@@ -228,11 +231,10 @@ class SegmentReductionOpTest(SegmentReductionHelper):
             s, [3, 4],
             x_init_value=np_x.astype(np.double),
             delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
+      self.assertAllClose(jacob_t, jacob_n)
 
 
 class UnsortedSegmentSumTest(SegmentReductionHelper):
-  use_gpu = False
 
   def testValues(self):
     dtypes = [
@@ -244,7 +246,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
     for indices in indices_flat, indices_flat.reshape(5, 2):
       shape = indices.shape + (2,)
       for dtype in dtypes:
-        with self.test_session(use_gpu=self.use_gpu):
+        with self.test_session(use_gpu=True):
           tf_x, np_x = self._input(shape, dtype=dtype)
           np_ans = self._segmentReduce(
               indices, np_x, np.add, op2=None, num_out_rows=num_segments)
@@ -258,19 +260,21 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
     num_cols = 2
     indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
     num_segments = max(indices_flat) + 3
-    for indices in indices_flat, indices_flat.reshape(5, 2):
-      shape = indices.shape + (num_cols,)
-      with self.test_session(use_gpu=self.use_gpu):
-        tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64)
-        s = math_ops.unsorted_segment_sum(
-            data=tf_x, segment_ids=indices, num_segments=num_segments)
-        jacob_t, jacob_n = gradient_checker.compute_gradient(
-            tf_x,
-            shape,
-            s, [num_segments, num_cols],
-            x_init_value=np_x.astype(np.double),
-            delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
+    for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
+                  dtypes_lib.complex128]:
+      for indices in indices_flat, indices_flat.reshape(5, 2):
+        shape = indices.shape + (num_cols,)
+        with self.test_session(use_gpu=True):
+          tf_x, np_x = self._input(shape, dtype=dtype)
+          s = math_ops.unsorted_segment_sum(
+              data=tf_x, segment_ids=indices, num_segments=num_segments)
+          jacob_t, jacob_n = gradient_checker.compute_gradient(
+              tf_x,
+              shape,
+              s, [num_segments, num_cols],
+              x_init_value=np_x,
+              delta=1)
+        self.assertAllClose(jacob_t, jacob_n)
 
   def testGradientMatchesSegmentSum(self):
     # Strategy: compute the gradient for UnsortedSegmentSum and SegmentSum
@@ -283,27 +287,28 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
     num_cols = 2
     shape = [n, num_cols]
     num_segments = max(indices) + 1
-    with self.test_session(use_gpu=self.use_gpu):
-      tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64)
-      # Results from UnsortedSegmentSum
-      unsorted_s = math_ops.unsorted_segment_sum(
-          data=tf_x, segment_ids=indices, num_segments=num_segments)
-      (unsorted_jacob_t, unsorted_jacob_n) = gradient_checker.compute_gradient(
-          tf_x,
-          shape,
-          unsorted_s, [num_segments, num_cols],
-          x_init_value=np_x.astype(np.double),
-          delta=1)
-      # Results from SegmentSum
-      sorted_s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
-      sorted_jacob_t, sorted_jacob_n = gradient_checker.compute_gradient(
-          tf_x,
-          shape,
-          sorted_s, [num_segments, num_cols],
-          x_init_value=np_x.astype(np.double),
-          delta=1)
-    self.assertAllClose(unsorted_jacob_t, sorted_jacob_t, rtol=1e-3, atol=1e-3)
-    self.assertAllClose(unsorted_jacob_n, sorted_jacob_n, rtol=1e-3, atol=1e-3)
+    for dtype in [dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64,
+                  dtypes_lib.complex128]:
+      with self.test_session(use_gpu=True):
+        tf_x, np_x = self._input(shape, dtype=dtype)
+        # Results from UnsortedSegmentSum
+        unsorted_s = math_ops.unsorted_segment_sum(
+            data=tf_x, segment_ids=indices, num_segments=num_segments)
+        unsorted_jacob_t, unsorted_jacob_n = (
+            gradient_checker.compute_gradient(tf_x, shape, unsorted_s,
+                                              [num_segments, num_cols],
+                                              x_init_value=np_x, delta=1))
+
+        # Results from SegmentSum
+        sorted_s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
+        sorted_jacob_t, sorted_jacob_n = gradient_checker.compute_gradient(
+            tf_x,
+            shape,
+            sorted_s, [num_segments, num_cols],
+            x_init_value=np_x,
+            delta=1)
+      self.assertAllClose(unsorted_jacob_t, sorted_jacob_t)
+      self.assertAllClose(unsorted_jacob_n, sorted_jacob_n)
 
   def testBadIndices(self):
     # Note: GPU kernel does not return the out-of-range error needed for this
@@ -319,7 +324,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
     dtypes = [
         np.float32, np.float64, np.int64, np.int32, np.complex64, np.complex128
     ]
-    with self.test_session(use_gpu=self.use_gpu):
+    with self.test_session(use_gpu=True):
       for dtype in dtypes:
         for itype in (np.int32, np.int64):
           data = np.zeros((2, 0), dtype=dtype)
@@ -333,7 +338,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
     num_segments = max(indices_flat) + 3
     for indices in indices_flat, indices_flat.reshape(5, 2):
       shape = indices.shape + (num_cols,)
-      with self.test_session():
+      with self.test_session(use_gpu=True):
         tf_x, np_x = self._input(shape, dtype=dtypes_lib.float64)
         s = math_ops.unsorted_segment_max(data=tf_x, segment_ids=indices,
                                     num_segments=num_segments)
@@ -343,10 +348,7 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
             s,
             [num_segments, num_cols],
             x_init_value=np_x.astype(np.double), delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
-
-class UnsortedSegmentSumGpuTest(UnsortedSegmentSumTest):
-  use_gpu = True
+      self.assertAllClose(jacob_t, jacob_n)
 
 
 class SparseSegmentReductionHelper(SegmentReductionHelper):
@@ -539,7 +541,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             s, [3, 4],
             x_init_value=np_x.astype(np.double),
             delta=1)
-      self.assertAllClose(jacob_t, jacob_n, rtol=1e-3, atol=1e-3)
+      self.assertAllClose(jacob_t, jacob_n)
 
   def testGradientValid(self):
     # Baseline for the testGradient*Invalid* methods below.
diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
index 36b3ed33d85a49653958750e90986a86865350b0..f26b51013be304d2f137fa7833193ce326483c88 100644
--- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
+++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
@@ -76,9 +76,9 @@ def _GetSelfAdjointEigTest(dtype_, shape_):
     batch_shape = shape_[:-2]
     a = np.random.uniform(
         low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(dtype_)
-    a += a.T
+    a += np.conj(a.T)
     a = np.tile(a, batch_shape + (1, 1))
-    if dtype_ == np.float32:
+    if dtype_ == np.float32 or dtype_ == np.complex64:
       atol = 1e-4
     else:
       atol = 1e-12
@@ -118,7 +118,7 @@ def _GetSelfAdjointEigGradTest(dtype_, shape_):
     batch_shape = shape_[:-2]
     a = np.random.uniform(
         low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(dtype_)
-    a += a.T
+    a += np.conj(a.T)
     a = np.tile(a, batch_shape + (1, 1))
     # Optimal stepsize for central difference is O(epsilon^{1/3}).
     epsilon = np.finfo(dtype_).eps
@@ -135,7 +135,7 @@ def _GetSelfAdjointEigGradTest(dtype_, shape_):
       for b in tf_e, tf_v:
         x_init = np.random.uniform(
             low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(dtype_)
-        x_init += x_init.T
+        x_init += np.conj(x_init.T)
         x_init = np.tile(x_init, batch_shape + (1, 1))
         theoretical, numerical = gradient_checker.compute_gradient(
             tf_a,
@@ -151,12 +151,15 @@ def _GetSelfAdjointEigGradTest(dtype_, shape_):
 
 if __name__ == '__main__':
   for dtype in np.float32, np.float64:
+    # TODO(rmlarsen): Re-enable for np.complex64, np.complex128
+    # when we have a fix for the crash in numpy.linalg.eig.
     for size in 1, 2, 5, 10:
       for batch_dims in [(), (3,)] + [(3, 2)] * (max(size, size) < 10):
         shape = batch_dims + (size, size)
         name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape)))
         setattr(SelfAdjointEigTest, 'testSelfAdjointEig_' + name,
                 _GetSelfAdjointEigTest(dtype, shape))
-        setattr(SelfAdjointEigGradTest, 'testSelfAdjointEigGrad_' + name,
-                _GetSelfAdjointEigGradTest(dtype, shape))
+        if dtype in [np.float32, np.float64]:
+          setattr(SelfAdjointEigGradTest, 'testSelfAdjointEigGrad_' + name,
+                  _GetSelfAdjointEigGradTest(dtype, shape))
   test.main()
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index 875ac3a4276bb9677eb5ca1c0a52c89c69215b1b..97d61d52af5ccbf51ceb3ab6934ebe14c1165063 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -147,6 +147,14 @@ class ShapeOpsTest(test.TestCase):
     self._testAll(np.random.randn(2, 3, 5, 7, 11))
     self._testAll(np.random.randn(2, 3, 5, 7, 11, 13))
 
+  def testBool(self):
+    self._testAll(np.random.choice((False, True), size=(2,)))
+    self._testAll(np.random.choice((False, True), size=(2, 3)))
+    self._testAll(np.random.choice((False, True), size=(2, 3, 5)))
+    self._testAll(np.random.choice((False, True), size=(2, 3, 5, 7)))
+    self._testAll(np.random.choice((False, True), size=(2, 3, 5, 7, 11)))
+    self._testAll(np.random.choice((False, True), size=(2, 3, 5, 7, 11, 13)))
+
   # Disabled because it takes too long to run, but manually verified
   # as passing at time of writing.
   def _test64BitOutput(self):
@@ -197,12 +205,38 @@ class ShapeOpsTest(test.TestCase):
     self._compareExpandDimsAll(np.zeros([2, 3, 5]), -3)
     self._compareExpandDimsAll(np.zeros([2, 3, 5]), -4)
 
+  def testExpandDimsBool(self):
+    choice = lambda s: np.random.choice((False, True), size=s)
+    self._compareExpandDimsAll(choice([2]), 0)
+    self._compareExpandDimsAll(choice([2]), 1)
+    self._compareExpandDimsAll(choice([2]), -1)
+
+    self._compareExpandDimsAll(choice([2, 3]), 0)
+    self._compareExpandDimsAll(choice([2, 3]), 1)
+    self._compareExpandDimsAll(choice([2, 3]), 2)
+    self._compareExpandDimsAll(choice([2, 3]), -1)
+    self._compareExpandDimsAll(choice([2, 3]), -2)
+
+    self._compareExpandDimsAll(choice([2, 3, 5]), 0)
+    self._compareExpandDimsAll(choice([2, 3, 5]), 1)
+    self._compareExpandDimsAll(choice([2, 3, 5]), 2)
+    self._compareExpandDimsAll(choice([2, 3, 5]), 3)
+
+    self._compareExpandDimsAll(choice([2, 3, 5]), -1)
+    self._compareExpandDimsAll(choice([2, 3, 5]), -2)
+    self._compareExpandDimsAll(choice([2, 3, 5]), -3)
+    self._compareExpandDimsAll(choice([2, 3, 5]), -4)
+
   def testExpandDimsErrors(self):
     with self.test_session():
       self.assertRaises(ValueError, array_ops.expand_dims,
                         np.zeros([2, 3, 5]), -5)
+      self.assertRaises(ValueError, array_ops.expand_dims,
+                        [False, True, True], -5)
       self.assertRaises(ValueError, array_ops.expand_dims,
                         np.zeros([2, 3, 5]), 4)
+      self.assertRaises(ValueError, array_ops.expand_dims,
+                        [False, True, True], 4)
 
   def testExpandDimsGradient(self):
     with self.test_session():
@@ -220,6 +254,10 @@ class ShapeOpsTest(test.TestCase):
       self.assertAllEqual([7], array_ops.expand_dims(inp, 0).eval())
       self.assertAllEqual([7], array_ops.expand_dims(inp, -1).eval())
 
+      inp = constant_op.constant(True)
+      self.assertAllEqual([True], array_ops.expand_dims(inp, 0).eval())
+      self.assertAllEqual([True], array_ops.expand_dims(inp, -1).eval())
+
   def _compareSqueeze(self, x, squeeze_dims, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       if squeeze_dims:
@@ -250,6 +288,18 @@ class ShapeOpsTest(test.TestCase):
     # Squeeze on both ends.
     self._compareSqueezeAll(np.zeros([1, 2, 1, 3, 1]))
 
+  def testSqueezeBool(self):
+    choice = lambda s: np.random.choice((False, True), size=s)
+    # Nothing to squeeze.
+    self._compareSqueezeAll(choice([2]))
+    self._compareSqueezeAll(choice([2, 3]))
+
+    # Squeeze the middle element away.
+    self._compareSqueezeAll(choice([2, 1, 2]))
+
+    # Squeeze on both ends.
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]))
+
   def testSqueezeSpecificDimension(self):
     # Positive squeeze dim index.
     self._compareSqueezeAll(np.zeros([1, 2, 1, 3, 1]), [0])
@@ -261,6 +311,18 @@ class ShapeOpsTest(test.TestCase):
     self._compareSqueezeAll(np.zeros([1, 2, 1, 3, 1]), [-3, -5])
     self._compareSqueezeAll(np.zeros([1, 2, 1, 3, 1]), [-3, -5, -1])
 
+  def testSqueezeSpecificDimensionBool(self):
+    choice = lambda s: np.random.choice((False, True), size=s)
+    # Positive squeeze dim index.
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [0])
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [2, 4])
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [0, 4, 2])
+
+    # Negative squeeze dim index.
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [-1])
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [-3, -5])
+    self._compareSqueezeAll(choice([1, 2, 1, 3, 1]), [-3, -5, -1])
+
   def testSqueezeAllOnes(self):
     # Numpy squeezes a 1 element tensor into a zero dimensional tensor.
     # Verify that we do the same.
@@ -271,6 +333,16 @@ class ShapeOpsTest(test.TestCase):
         tf_ans = tensor.eval()
         self.assertEqual(np.shape(1), tf_ans.shape)
 
+  def testSqueezeAllOnesBool(self):
+    # Numpy squeezes a 1 element tensor into a zero dimensional tensor.
+    # Verify that we do the same.
+    for use_gpu in [False, True]:
+      with self.test_session(use_gpu=use_gpu):
+        tensor = array_ops.squeeze([[[False]]], [])
+        self.assertEqual(np.shape(1), tensor.get_shape())
+        tf_ans = tensor.eval()
+        self.assertEqual(np.shape(1), tf_ans.shape)
+
   def testSqueezeOnlyOnes(self):
     for use_gpu in [False, True]:
       with self.test_session(use_gpu=use_gpu):
@@ -348,6 +420,16 @@ class TileTest(test.TestCase):
     self.assertEqual([4, 4], tiled.get_shape())
     self.assertTrue((result == np.tile(inp, (1, 4))).all())
 
+  def testIdentityTileAndGrad(self):
+    with self.test_session():
+      inp = np.random.rand(4, 1).astype(np.float32)
+      a = constant_op.constant(inp)
+      tiled = array_ops.tile(a, [1, 1])
+      result = tiled.eval()
+    self.assertEqual(result.shape, (4, 1))
+    self.assertEqual([4, 1], tiled.get_shape())
+    self.assertTrue((result == np.tile(inp, (1, 1))).all())
+
   def testEmpty(self):
     with self.test_session():
       inp = np.random.rand(2, 3).astype(np.float32)
@@ -528,6 +610,7 @@ class TileTest(test.TestCase):
     self._RunAndVerifyGradientResult([], [])
 
   def testGradientRandom(self):
+    self._RunAndVerifyGradientResult([2, 2, 1, 1, 3], [1, 1, 1, 1, 1])
     self._RunAndVerifyGradientResult([2, 2, 1, 1, 3], [1, 2, 1, 3, 1])
     self._RunAndVerifyGradientResult([2, 3, 1, 1, 3], [3, 1, 1, 2, 2])
     self._RunAndVerifyGradientResult([2, 1, 3, 3, 2], [1, 3, 3, 1, 2])
diff --git a/tensorflow/python/kernel_tests/softplus_op_test.py b/tensorflow/python/kernel_tests/softplus_op_test.py
index f70f60c0f5ef6d0d238412ab8c0bcbba577b180c..b8e7c50a378317636fe184abc411483c96c6ebbf 100644
--- a/tensorflow/python/kernel_tests/softplus_op_test.py
+++ b/tensorflow/python/kernel_tests/softplus_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -85,6 +86,45 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  def testGradGrad(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          name="x")
+      y = nn_ops.softplus(x, name="softplus")
+      (grad,) = gradients_impl.gradients(y, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], grad, [2, 5], x_init_value=x_init)
+    print("softplus (float) gradient of gradient err = ", err)
+    self.assertLess(err, 5e-5)
+
+  def testGradGradGrad(self):
+    with self.test_session():
+      x = constant_op.constant(
+          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
+          shape=[2, 5],
+          name="x")
+      y = nn_ops.softplus(x, name="softplus")
+      (grad,) = gradients_impl.gradients(y, x)
+      (grad_grad,) = gradients_impl.gradients(grad, x)
+      x_init = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker.compute_gradient_error(
+          x, [2, 5], grad_grad, [2, 5], x_init_value=x_init)
+    print("softplus (float) third-order gradient err = ", err)
+    self.assertLess(err, 5e-5)
+
+  def testWarnInts(self):
+    # Running the op triggers address sanitizer errors, so we just make it
+    nn_ops.softplus(constant_op.constant(7))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/softsign_op_test.py b/tensorflow/python/kernel_tests/softsign_op_test.py
index 5fd5253c092d71f66d31d1411d306004a5ff1666..371f86ff151f35764e5f976aba8301d250e199a9 100644
--- a/tensorflow/python/kernel_tests/softsign_op_test.py
+++ b/tensorflow/python/kernel_tests/softsign_op_test.py
@@ -65,6 +65,12 @@ class SoftsignTest(test.TestCase):
     print("softsign (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  def testWarnInts(self):
+    # NOTE(irving): Actually I don't know how to intercept the warning, but
+    # let's make sure it runs.  I promised I've looked, and there was a warning.
+    with self.test_session():
+      nn_ops.softsign(constant_op.constant(7)).eval()
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/sparse_add_op_test.py b/tensorflow/python/kernel_tests/sparse_add_op_test.py
index 874dcbabf10911fff5dfa1257b5310b2b60494a9..555c16194e10105eb7c28344f688ad643d3aae4b 100644
--- a/tensorflow/python/kernel_tests/sparse_add_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_add_op_test.py
@@ -88,6 +88,7 @@ class SparseAddTest(test.TestCase):
       for sp_a in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
         for sp_b in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
           sp_sum = sparse_ops.sparse_add(sp_a, sp_b)
+          self.assertAllEqual((3, 3), sp_sum.get_shape())
 
           sum_out = sess.run(sp_sum)
 
diff --git a/tensorflow/python/kernel_tests/sparse_cross_op_test.py b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d09badf27e621ec244730eb6c1f6b637546219f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
@@ -0,0 +1,398 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sparse_cross_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+
+
+class SparseCrossOpTest(test.TestCase):
+
+  def test_simple(self):
+    """Tests a simple scenario."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([['batch1-FC1-F1'],
+                             ['batch2-FC1-F1', 'batch2-FC1-F2']]),
+        self._sparse_tensor([['batch1-FC2-F1'],
+                             ['batch2-FC2-F1', 'batch2-FC2-F2']])
+    ])
+    expected_out = self._sparse_tensor([['batch1-FC1-F1_X_batch1-FC2-F1'], [
+        'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
+        'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_dense(self):
+    """Tests only dense inputs."""
+    op = sparse_ops._sparse_cross([
+        constant_op.constant([['batch1-FC1-F1', 'batch1-FC1-F2'],
+                              ['batch2-FC1-F1', 'batch2-FC1-F2']],
+                             dtypes.string),
+        constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
+                              ['batch2-FC2-F1', 'batch2-FC2-F2']],
+                             dtypes.string),
+    ])
+    expected_out = self._sparse_tensor([[
+        'batch1-FC1-F1_X_batch1-FC2-F1', 'batch1-FC1-F1_X_batch1-FC2-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F1', 'batch1-FC1-F2_X_batch1-FC2-F2'
+    ], [
+        'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
+        'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_integer_mixed_string_sparse(self):
+    """Tests mixed type."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([[11], [333, 55555]]),
+        self._sparse_tensor([['batch1-FC2-F1'],
+                             ['batch2-FC2-F1', 'batch2-FC2-F2']])
+    ])
+    expected_out = self._sparse_tensor([['11_X_batch1-FC2-F1'], [
+        '333_X_batch2-FC2-F1', '333_X_batch2-FC2-F2', '55555_X_batch2-FC2-F1',
+        '55555_X_batch2-FC2-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_integer_mixed_string_dense(self):
+    """Tests mixed dense inputs."""
+    op = sparse_ops._sparse_cross([
+        constant_op.constant([[11, 333], [55555, 999999]], dtypes.int64),
+        constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
+                              ['batch2-FC2-F1', 'batch2-FC2-F2']],
+                             dtypes.string),
+    ])
+    expected_out = self._sparse_tensor([[
+        '11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2', '333_X_batch1-FC2-F1',
+        '333_X_batch1-FC2-F2'
+    ], [
+        '55555_X_batch2-FC2-F1', '55555_X_batch2-FC2-F2',
+        '999999_X_batch2-FC2-F1', '999999_X_batch2-FC2-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_sparse_cross_dense(self):
+    """Tests sparse and dense inputs."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([['batch1-FC1-F1'],
+                             ['batch2-FC1-F1', 'batch2-FC1-F2']]),
+        constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
+                              ['batch2-FC2-F1', 'batch2-FC2-F2']],
+                             dtypes.string),
+    ])
+    expected_out = self._sparse_tensor(
+        [['batch1-FC1-F1_X_batch1-FC2-F1', 'batch1-FC1-F1_X_batch1-FC2-F2'], [
+            'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
+            'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
+        ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_integer_sparse_input(self):
+    """Tests mixed type sparse and dense inputs."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([[11], [333, 5555]]),
+        constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
+                              ['batch2-FC2-F1', 'batch2-FC2-F2']],
+                             dtypes.string),
+    ])
+    expected_out = self._sparse_tensor(
+        [['11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2'], [
+            '333_X_batch2-FC2-F1', '333_X_batch2-FC2-F2',
+            '5555_X_batch2-FC2-F1', '5555_X_batch2-FC2-F2'
+        ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_permutation_3x3x3(self):
+    """Tests 3x3x3 permutation."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor(
+            [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
+        self._sparse_tensor(
+            [['batch1-FC2-F1', 'batch1-FC2-F2', 'batch1-FC2-F3']]),
+        self._sparse_tensor(
+            [['batch1-FC3-F1', 'batch1-FC3-F2', 'batch1-FC3-F3']])
+    ])
+    expected_out = self._sparse_tensor([[
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F3',
+        'batch1-FC1-F1_X_batch1-FC2-F2_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F2_X_batch1-FC3-F2',
+        'batch1-FC1-F1_X_batch1-FC2-F2_X_batch1-FC3-F3',
+        'batch1-FC1-F1_X_batch1-FC2-F3_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F3_X_batch1-FC3-F2',
+        'batch1-FC1-F1_X_batch1-FC2-F3_X_batch1-FC3-F3',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F3',
+        'batch1-FC1-F2_X_batch1-FC2-F2_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F2_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F2_X_batch1-FC3-F3',
+        'batch1-FC1-F2_X_batch1-FC2-F3_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F3_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F3_X_batch1-FC3-F3',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F3',
+        'batch1-FC1-F3_X_batch1-FC2-F2_X_batch1-FC3-F1',
+        'batch1-FC1-F3_X_batch1-FC2-F2_X_batch1-FC3-F2',
+        'batch1-FC1-F3_X_batch1-FC2-F2_X_batch1-FC3-F3',
+        'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F1',
+        'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F2',
+        'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F3'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_permutation_3x1x2(self):
+    """Tests 3x1x2 permutation."""
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor(
+            [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
+        self._sparse_tensor([['batch1-FC2-F1']]),
+        self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
+    ])
+    expected_out = self._sparse_tensor([[
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2'
+    ]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_large_batch(self):
+    """Tests with large batch size to force multithreading."""
+    batch_size = 5000
+    col1 = []
+    col2 = []
+    col3 = []
+    for b in range(batch_size):
+      col1.append(
+          ['batch%d-FC1-F1' % b, 'batch%d-FC1-F2' % b, 'batch%d-FC1-F3' % b])
+      col2.append(['batch%d-FC2-F1' % b])
+      col3.append(['batch%d-FC3-F1' % b, 'batch%d-FC3-F2' % b])
+
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor(col1), self._sparse_tensor(col2),
+        self._sparse_tensor(col3)
+    ])
+
+    col_out = []
+    for b in range(batch_size):
+      col_out.append([
+          'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
+          'batch%d-FC1-F1_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b),
+          'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
+          'batch%d-FC1-F2_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b),
+          'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F1' % (b, b, b),
+          'batch%d-FC1-F3_X_batch%d-FC2-F1_X_batch%d-FC3-F2' % (b, b, b)
+      ])
+
+    expected_out = self._sparse_tensor(col_out)
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_one_column_empty(self):
+    """Tests when one column is empty.
+
+    The crossed tensor should be empty.
+    """
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']]),
+        self._sparse_tensor([], 1),
+        self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
+    ])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_empty(sess.run(op))
+
+  def test_some_columns_empty(self):
+    """Tests when more than one columns are empty.
+
+    Cross for the corresponding batch should be empty.
+    """
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([['batch1-FC1-F1', 'batch1-FC1-F2']], 2),
+        self._sparse_tensor([['batch1-FC2-F1'], ['batch2-FC2-F1']], 2),
+        self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']], 2)
+    ])
+    expected_out = self._sparse_tensor([[
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F1_X_batch1-FC2-F1_X_batch1-FC3-F2',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F1',
+        'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2'
+    ]], 2)
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_all_columns_empty(self):
+    """Tests when all columns are empty.
+
+    The crossed tensor should be empty.
+    """
+    op = sparse_ops._sparse_cross([
+        self._sparse_tensor([]), self._sparse_tensor([]),
+        self._sparse_tensor([])
+    ])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_empty(sess.run(op))
+
+  def test_hashed_zero_bucket_no_hash_key(self):
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor([['batch1-FC1-F1']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1']])
+        ])
+    # Check actual hashed output to prevent unintentional hashing changes.
+    expected_out = self._sparse_tensor([[1971693436396284976]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_hashed_zero_bucket(self):
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor([['batch1-FC1-F1']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1']])
+        ],
+        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
+    # Check actual hashed output to prevent unintentional hashing changes.
+    expected_out = self._sparse_tensor([[4847552627144134031]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  # TODO(sibyl-Aix6ihai): Add benchmark to compare Hashed vs Non-hashed.
+  def test_hashed_no_hash_key(self):
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor([['batch1-FC1-F1']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1']])
+        ],
+        num_buckets=100)
+    # Check actual hashed output to prevent unintentional hashing changes.
+    expected_out = self._sparse_tensor([[83]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_hashed_output(self):
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor([['batch1-FC1-F1']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1']])
+        ],
+        num_buckets=100,
+        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
+    # Check actual hashed output to prevent unintentional hashing changes.
+    expected_out = self._sparse_tensor([[31]])
+    with self.test_session() as sess:
+      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+
+  def test_hashed__has_no_collision(self):
+    """Tests that fingerprint concatenation has no collisions."""
+    # Although the last 10 bits of 359 and 1024+359 are identical.
+    # As a result, all the crosses shouldn't collide.
+    t1 = constant_op.constant([[359], [359 + 1024]])
+    t2 = constant_op.constant([list(range(10)), list(range(10))])
+    cross = sparse_ops._sparse_cross_hashed(
+        [t2, t1],
+        num_buckets=1024,
+        hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
+    cross_dense = sparse_ops.sparse_tensor_to_dense(cross)
+    with session.Session():
+      values = cross_dense.eval()
+      self.assertTrue(numpy.not_equal(values[0], values[1]).all())
+
+  def test_hashed_3x1x2(self):
+    """Tests 3x1x2 permutation with hashed output."""
+    op = sparse_ops._sparse_cross_hashed(
+        [
+            self._sparse_tensor(
+                [['batch1-FC1-F1', 'batch1-FC1-F2', 'batch1-FC1-F3']]),
+            self._sparse_tensor([['batch1-FC2-F1']]),
+            self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
+        ],
+        num_buckets=1000)
+    with self.test_session() as sess:
+      out = sess.run(op)
+      self.assertEqual(6, len(out.values))
+      self.assertAllEqual([[0, i] for i in range(6)], out.indices)
+      self.assertTrue(all(x < 1000 and x >= 0 for x in out.values))
+      all_values_are_different = len(out.values) == len(set(out.values))
+      self.assertTrue(all_values_are_different)
+
+  def _assert_sparse_tensor_empty(self, sp):
+    self.assertEquals(0, sp.indices.size)
+    self.assertEquals(0, sp.values.size)
+    # TODO(zakaria): check if we can ignore the first dim of the shape.
+    self.assertEquals(0, sp.dense_shape[1])
+
+  def _assert_sparse_tensor_equals(self, sp1, sp2):
+    self.assertAllEqual(sp1.indices.eval(), sp2.indices)
+    self.assertAllEqual(sp1.values.eval(), sp2.values)
+    self.assertAllEqual(sp1.dense_shape.eval(), sp2.dense_shape)
+
+  def _sparse_tensor(self, data, batch_size=-1):
+    """Generates a SparseTensor.
+
+    Args:
+      data: Should be a list of list of strings or int64. Each item of the outer
+          list represents a batch. Each item of the batch is a feature of a
+          specific feature column.
+      batch_size: optional batch size, especially for cases when data has no
+          entry for some batches.
+
+    Returns:
+     A SparseTensor.
+    """
+    indices = []
+    values = []
+    max_col_count = 0
+    for batch, batch_ix in zip(data, range(len(data))):
+      for column, column_ix in zip(batch, range(len(batch))):
+        indices.append([batch_ix, column_ix])
+        values.append(column)
+        max_col_count = max(max_col_count, column_ix + 1)
+    shape = [batch_size if batch_size != -1 else len(data), max_col_count]
+    value_type = (dtypes.string if not values or isinstance(values[0], str) else
+                  dtypes.int64)
+    return sparse_tensor.SparseTensor(
+        constant_op.constant(indices, dtypes.int64, [len(indices), 2]),
+        constant_op.constant(values, value_type, [len(indices)]),
+        constant_op.constant(shape, dtypes.int64))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 06d5cbaf2d0f88e63bbf2a693ec9afab63ba3399..bad11a29df0a63033fd169b91e5493319e9181c0 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -328,6 +328,12 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     return sparse_tensor.SparseTensorValue(self._IND_2_5_6, self._VAL_2_5_6,
                                            self._SHP_2_5_6)
 
+  def testStaticShapeInfoPreservedWhenNewShapeIsProvidedAndStatic(self):
+    sp_input = self._SparseTensor_2x5x6()
+    new_shape = np.array([3, 6, 7], dtype=np.int64)
+    sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
+    self.assertAllEqual([3, 6, 7], sp_output.get_shape())
+
   def testBasic(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
@@ -397,14 +403,21 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x == y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: np.array([3, 7], dtype=np.int64)})
 
-  def testInvalidDimensionSize(self):
+  def testInvalidDimensionSizeStatic(self):
+    sp_input = self._SparseTensor_2x5x6()
+    new_shape = np.array([3, 7, 5], dtype=np.int64)
+
+    with self.assertRaisesRegexp(ValueError, "should have dimension sizes"):
+      sparse_ops.sparse_reset_shape(sp_input, new_shape)
+
+  def testInvalidDimensionSizeDynamic(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
-      new_shape = np.array([3, 7, 5], dtype=np.int64)
+      new_shape = array_ops.placeholder(dtype=dtypes.int32)
       out = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
       with self.assertRaisesOpError("x <= y did not hold element-wise"):
-        sess.run(out)
+        sess.run(out, feed_dict={new_shape: [3, 7, 5]})
 
   def testInvalidDimensionSizeInputUnavailableInGraphConstruction(self):
     sp_input = array_ops.sparse_placeholder(dtype=dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
index 5136cdadead4d6dfd6961f4c128acb5de18963b8..18335d665af833fb7d9fef0b517b2c4efc4a005e 100644
--- a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
@@ -48,6 +48,13 @@ class SparseReorderTest(test.TestCase):
     shape = np.array([5, 6]).astype(np.int64)
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
+  def testStaticShapeInfoPreserved(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_5x6(np.arange(6)))
+    self.assertAllEqual((5, 6), sp_input.get_shape())
+    sp_output = sparse_ops.sparse_reorder(sp_input)
+    self.assertAllEqual((5, 6), sp_output.get_shape())
+
   def testAlreadyInOrder(self):
     with self.test_session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6(np.arange(6))
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index 1bb05aa3b2aab1141e27ad486ad436b4c6bc2dd3..e87fa0c94c4cf3346c0127dd17b037cabb3cbb56 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -50,6 +50,13 @@ class SparseReshapeTest(test.TestCase):
     shape = np.array([2, 3, 4])
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
+  def testStaticShapeInfoPreserved(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_5x6())
+    self.assertAllEqual((5, 6), sp_input.get_shape())
+    sp_output = sparse_ops.sparse_reshape(sp_input, shape=(1, 5, 2, 3))
+    self.assertAllEqual((1, 5, 2, 3), sp_output.get_shape())
+
   def testSameShape(self):
     with self.test_session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6()
@@ -71,6 +78,18 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  def testWorksWellWithTfShape(self):
+    with self.test_session(use_gpu=False) as sess:
+      sp_input = self._SparseTensorPlaceholder()
+      input_val = self._SparseTensorValue_5x6()
+      shape = array_ops.shape(sp_input)  # tf.shape generates int32 output
+      sp_output = sparse_ops.sparse_reshape(sp_input, shape)
+
+      output_val = sess.run(sp_output, {sp_input: input_val})
+      self.assertAllEqual(output_val.indices, input_val.indices)
+      self.assertAllEqual(output_val.values, input_val.values)
+      self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
+
   def testFeedSameShapeWithInferredDim(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -180,6 +199,12 @@ class SparseReshapeTest(test.TestCase):
       with self.assertRaisesOpError("only one output shape size may be -1"):
         sess.run(sp_output, {sp_input: input_val})
 
+  def testProvideStaticallyMismatchedSizes(self):
+    input_val = self._SparseTensorValue_5x6()
+    sp_input = sparse_tensor.SparseTensor.from_value(input_val)
+    with self.assertRaisesRegexp(ValueError, "Cannot reshape"):
+      sparse_ops.sparse_reshape(sp_input, [4, 7])
+
   def testFeedMismatchedSizes(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
index df5462dd2d0f50f515a4aa3b61f8ba945089a6e0..e8b94294b1b85849760356ca102df44603ae5a3f 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
@@ -30,34 +30,44 @@ from tensorflow.python.platform import test
 
 class SparseTensorDenseMatMulGradientTest(test.TestCase):
 
-  def _sparsify(self, x):
+  def _sparsify(self, x, indices_dtype=np.int64):
     x[x < 0.5] = 0
 
     non_zero = np.where(x)
-    x_indices = np.vstack(non_zero).astype(np.int64).T
+    x_indices = np.vstack(non_zero).astype(indices_dtype).T
     x_values = x[non_zero]
     x_shape = x.shape
 
     return sparse_tensor.SparseTensor(
         indices=x_indices, values=x_values, dense_shape=x_shape), len(x_values)
 
-  def _randomTensor(self, size, np_dtype, adjoint=False, sparse=False):
+  def _randomTensor(self,
+                    size,
+                    values_dtype,
+                    adjoint=False,
+                    sparse=False,
+                    indices_dtype=np.int64):
     n, m = size
-    x = np.random.randn(n, m).astype(np_dtype)
+    x = np.random.randn(n, m).astype(values_dtype)
 
     if adjoint:
       x = x.transpose()
 
     if sparse:
-      return self._sparsify(x)
+      return self._sparsify(x, indices_dtype=indices_dtype)
     else:
-      return constant_op.constant(x, dtype=np_dtype)
+      return constant_op.constant(x, dtype=values_dtype)
 
-  def _testGradients(self, adjoint_a, adjoint_b, name, np_dtype):
+  def _testGradients(self, adjoint_a, adjoint_b, name, values_dtype,
+                     indices_dtype):
     n, k, m = np.random.randint(1, 10, size=3)
     sp_t, nnz = self._randomTensor(
-        [n, k], np_dtype, adjoint=adjoint_a, sparse=True)
-    dense_t = self._randomTensor([k, m], np_dtype, adjoint=adjoint_b)
+        [n, k],
+        values_dtype,
+        adjoint=adjoint_a,
+        sparse=True,
+        indices_dtype=indices_dtype)
+    dense_t = self._randomTensor([k, m], values_dtype, adjoint=adjoint_b)
 
     matmul = sparse_ops.sparse_tensor_dense_matmul(
         sp_t, dense_t, adjoint_a=adjoint_a, adjoint_b=adjoint_b, name=name)
@@ -71,17 +81,19 @@ class SparseTensorDenseMatMulGradientTest(test.TestCase):
       print("%s gradient err = %s" % (name, err))
       self.assertLess(err, 1e-3)
 
-  def _testGradientsType(self, np_dtype):
+  def _testGradientsType(self, values_dtype, indices_dtype):
     for adjoint_a in [True, False]:
       for adjoint_b in [True, False]:
-        name = "sparse_tensor_dense_matmul_%s_%s_%s" % (adjoint_a, adjoint_b,
-                                                        np_dtype.__name__)
-        self._testGradients(adjoint_a, adjoint_b, name, np_dtype)
+        name = "sparse_tensor_dense_matmul_%s_%s_%s_%s" % (
+            adjoint_a, adjoint_b, values_dtype.__name__, indices_dtype.__name__)
+        self._testGradients(adjoint_a, adjoint_b, name, values_dtype,
+                            indices_dtype)
 
   def testGradients(self):
     np.random.seed(5)  # Fix seed to avoid flakiness
-    self._testGradientsType(np.float32)
-    self._testGradientsType(np.float64)
+    self._testGradientsType(np.float32, np.int64)
+    self._testGradientsType(np.float64, np.int64)
+    self._testGradientsType(np.float32, np.int32)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
index 25da6691e62088ee797090047ed3111f0680db74..a0bd178e247019470a907275cdf8d42d162be38e 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -45,7 +46,12 @@ def _maybe_complex(x):
 
 class SparseTensorDenseMatMulTest(test.TestCase):
 
-  def _testMatmul(self, x, y, adjoint_a=False, adjoint_b=False):
+  def _testMatmul(self,
+                  x,
+                  y,
+                  adjoint_a=False,
+                  adjoint_b=False,
+                  indices_dtype=np.int64):
     x_mat = np.matrix(x)
     if adjoint_a:
       x_mat = x_mat.H
@@ -55,7 +61,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
 
     np_ans = x_mat * y_mat
 
-    x_indices = np.vstack(np.where(x)).astype(np.int64).T
+    x_indices = np.vstack(np.where(x)).astype(indices_dtype).T
     x_values = x[np.where(x)]
     x_shape = x.shape
 
@@ -82,13 +88,13 @@ class SparseTensorDenseMatMulTest(test.TestCase):
         else:
           self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-4)
 
-  def _testBasic(self, np_dtype):
-    x = _maybe_complex(np.random.rand(10, 10).astype(np_dtype))
+  def _testBasic(self, value_dtype, indices_dtype=np.int64):
+    x = _maybe_complex(np.random.rand(10, 10).astype(value_dtype))
     x[np.abs(x) < 0.5] = 0  # Make it sparse
 
-    y = _maybe_complex(np.random.randn(10, 20).astype(np_dtype))
+    y = _maybe_complex(np.random.randn(10, 20).astype(value_dtype))
 
-    self._testMatmul(x, y)
+    self._testMatmul(x, y, indices_dtype=indices_dtype)
 
   def testBasic(self):
     np.random.seed(127)  # Repeatable results
@@ -97,6 +103,8 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     self._testBasic(np.float64)
     self._testBasic(np.complex64)
     self._testBasic(np.complex128)
+    self._testBasic(np.int32, indices_dtype=np.int32)
+    self._testBasic(np.float32, indices_dtype=np.int32)
 
   def testShapeInference(self):
     x = np.random.rand(10, 10)
@@ -123,6 +131,77 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Dimensions must be equal"):
       sparse_ops.sparse_tensor_dense_matmul(x_st_shape_inconsistent, y)
 
+  def testInvalidIndicesForSparseTensorDenseMatmul(self):
+    # Note: use_gpu=False because nice errors are only returned from CPU kernel.
+    with self.test_session(use_gpu=False):
+      indices = np.matrix([[1, 10]]).astype(np.int64)
+      values = np.array([10]).astype(np.float32)
+      shape = [3, 2]
+      sparse_t = sparse_tensor.SparseTensor(indices, values, shape)
+
+      # Test multiplying by both a small and large dense matrix, to hit
+      # both cases in the kernel.
+      dense_t = np.matrix([[1] * 5, [2] * 5], dtype=np.float32)
+      with self.assertRaisesOpError(
+          "k .10. from index.0,1. out of bounds .>=2."):
+        sparse_ops.sparse_tensor_dense_matmul(sparse_t, dense_t).eval()
+      dense_t = np.matrix([[1] * 500, [2] * 500], dtype=np.float32)
+      with self.assertRaisesOpError(
+          "k .10. from index.0,1. out of bounds .>=2."):
+        sparse_ops.sparse_tensor_dense_matmul(sparse_t, dense_t).eval()
+
+      # Repeat with adjoint_a, to get a different error.
+      dense_t = np.matrix([[1] * 5, [2] * 5, [3] * 5], dtype=np.float32)
+      with self.assertRaisesOpError(
+          "m .10. from index.0,1. out of bounds .>=2."):
+        sparse_ops.sparse_tensor_dense_matmul(
+            sparse_t, dense_t, adjoint_a=True).eval()
+      dense_t = np.matrix([[1] * 500, [2] * 500, [3] * 500], dtype=np.float32)
+      with self.assertRaisesOpError(
+          "m .10. from index.0,1. out of bounds .>=2."):
+        sparse_ops.sparse_tensor_dense_matmul(
+            sparse_t, dense_t, adjoint_a=True).eval()
+
+  def testInvalidIndicesForSparseTensorDenseMatmulOnGPU(self):
+    # Note: use_gpu=False because nice errors are only returned from CPU kerne
+    if not test.is_gpu_available():
+      return
+    with self.test_session(use_gpu=True):
+      indices = np.array([[1, 10]]).astype(np.int64)
+      values = np.array([10]).astype(np.float32)
+      shape = [3, 2]
+      sparse_t = sparse_tensor.SparseTensor(indices, values, shape)
+
+      # Test multiplying by both a small and large dense matrix, to hit
+      # both cases in the kernel.
+      dense_t = np.matrix([[1] * 5, [2] * 5], dtype=np.float32)
+      expected_t = np.array([[0] * 5, [np.nan] * 5, [0] * 5], dtype=np.float32)
+      self.assertAllClose(expected_t,
+                          sparse_ops.sparse_tensor_dense_matmul(
+                              sparse_t, dense_t).eval())
+      dense_t = np.matrix([[1] * 500, [2] * 500], dtype=np.float32)
+      expected_t = np.array(
+          [[0] * 500, [np.nan] * 500, [0] * 500], dtype=np.float32)
+      self.assertAllClose(expected_t,
+                          sparse_ops.sparse_tensor_dense_matmul(
+                              sparse_t, dense_t).eval())
+
+      # Repeat with adjoint_a, now the error is that the sparse index
+      # is OOO w.r.t. the output.  The GPU kernel can't do much here,
+      # so it just doesn't accumulate.
+
+      dense_t = np.matrix([[1] * 5, [2] * 5, [3] * 5], dtype=np.float32)
+      expected_t = np.array([[0] * 5, [0] * 5], dtype=np.float32)
+      self.assertAllClose(expected_t,
+                          sparse_ops.sparse_tensor_dense_matmul(
+                              sparse_t, dense_t, adjoint_a=True).eval())
+
+      dense_t = np.matrix([[1] * 500, [2] * 500, [3] * 500], dtype=np.float32)
+      expected_t = np.array([[0] * 500, [0] * 500], dtype=np.float32)
+      self.assertAllClose(expected_t,
+                          sparse_ops.sparse_tensor_dense_matmul(
+                              sparse_t, dense_t, adjoint_a=True).eval())
+
   # Tests setting one dimension to be a high value.
   def _testLarge(self, np_dtype):
     r1 = np.random.randint(6000, 20000)
@@ -137,9 +216,12 @@ class SparseTensorDenseMatMulTest(test.TestCase):
 
       y = _maybe_complex(np.random.randn(k, n).astype(np_dtype))
 
-      self._testMatmul(x, y)
+      self._testMatmul(x, y, adjoint_a=False, adjoint_b=False)
+      self._testMatmul(x.transpose(), y, adjoint_a=True, adjoint_b=False)
+      self._testMatmul(x, y.transpose(), adjoint_a=False, adjoint_b=True)
+      self._testMatmul(
+          x.transpose(), y.transpose(), adjoint_a=True, adjoint_b=True)
 
-  def testLarge(self):
     np.random.seed(127)  # Repeatable results
     self._testLarge(np.float32)
     self._testLarge(np.float64)
@@ -183,7 +265,9 @@ def _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(x, y, adjoint_a,
         lambda t, _: t < iterations,
         body, (t0, v0),
         parallel_iterations=1,
-        back_prop=False)
+        back_prop=False,
+        shape_invariants=(tensor_shape.TensorShape(()),
+                          tensor_shape.TensorShape(None)))
     return [final]
 
   return _timeit
@@ -208,7 +292,9 @@ def _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(x_ind, x_val, x_shape,
         lambda t, _: t < iterations,
         body, (t0, v0),
         parallel_iterations=1,
-        back_prop=False)
+        back_prop=False,
+        shape_invariants=(tensor_shape.TensorShape(()),
+                          tensor_shape.TensorShape(None)))
     return [final]
 
   return _timeit
@@ -253,7 +339,7 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
   if skip_dense:
     delta_dense = float("nan")
   else:
-    with session.Session("", config=config, graph=ops.Graph()) as sess:
+    with session.Session(config=config, graph=ops.Graph()) as sess:
       if not use_gpu:
         with ops.device("/cpu:0"):
           x_t = constant_op.constant(x)
@@ -261,12 +347,12 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
           ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
               x_t, y_t, adjoint_a, adjoint_b)
       else:
-        x_t = constant_op.constant(x)
-        y_t = constant_op.constant(y)
-        ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(x_t, y_t,
-                                                                      adjoint_a,
-                                                                      adjoint_b)
-      delta_dense = _timer(sess, ops_fn, 1000)
+        with ops.device("/gpu:0"):
+          x_t = constant_op.constant(x)
+          y_t = constant_op.constant(y)
+          ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_dense(
+              x_t, y_t, adjoint_a, adjoint_b)
+      delta_dense = _timer(sess, ops_fn, 200)
 
   # Using sparse_tensor_dense_matmul.
   with session.Session("", config=config, graph=ops.Graph()) as sess:
@@ -279,13 +365,14 @@ def sparse_tensor_dense_vs_dense_matmul_benchmark(thresh,
         ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
             x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
     else:
-      x_ind = constant_op.constant(np.vstack(np.where(x)).astype(np.int64).T)
-      x_val = constant_op.constant(x[np.where(x)])
-      x_shape = constant_op.constant(np.array(x.shape).astype(np.int64))
-      y_t = constant_op.constant(y)
-      ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
-          x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
-    delta_sparse = _timer(sess, ops_fn, 1000)
+      with ops.device("/gpu:0"):
+        x_ind = constant_op.constant(np.vstack(np.where(x)).astype(np.int64).T)
+        x_val = constant_op.constant(x[np.where(x)])
+        x_shape = constant_op.constant(np.array(x.shape).astype(np.int64))
+        y_t = constant_op.constant(y)
+        ops_fn = _sparse_tensor_dense_vs_dense_matmul_benchmark_sparse(
+            x_ind, x_val, x_shape, y_t, adjoint_a, adjoint_b)
+    delta_sparse = _timer(sess, ops_fn, 200)
 
   print("%g \t %d \t %s \t %d \t %d \t %g \t %g \t %g" %
         (1 - thresh, n, use_gpu, m, k, delta_dense, delta_sparse,
@@ -302,7 +389,7 @@ def main(_):
         "\t dt(sparse)/dt(dense)")
 
   for thresh in (0.99, 0.8, 0.5, 0.2):
-    for n in (1, 10, 25):
+    for n in (50, 100):
       for use_gpu in (True, False):
         for m in (100, 1000):
           for k in (100, 1000):
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 9eed5bedab245763352e65a18d3454330da2e815..3dcafd2496565b8c0f9c42829e12f051185fa345 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -22,21 +22,73 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
+_TEST_DTYPES = (dtypes.float32, dtypes.float64, dtypes.complex64,
+                dtypes.complex128)
+
 
 class SplitOpTest(test.TestCase):
 
+  def _makeData(self, shape, dtype):
+    data = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
+    if dtype.is_complex:
+      data -= 1j * data
+    return data
+
+  def testShapeInference(self):
+    model_input = array_ops.placeholder(dtypes.float32, shape=(1, 10))
+
+    # check that we fail during static shape inference if sizes are known
+    with self.assertRaises(ValueError):
+      # pylint: disable=expression-not-assigned
+      array_ops.split(model_input, [4], axis=1)[0]
+      # pylint: enable=expression-not-assigned
+
+    model_input = array_ops.placeholder(dtypes.float32)
+    inp = np.zeros((1, 10))
+    # check that we still fail at runtime if the shapes were unknown
+    with self.test_session(use_gpu=False) as sess:
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        sess.run(array_ops.split(model_input, [4]), {model_input: inp})
+
+    # test that we can pass a scalar Tensor as num_splits
+    with self.test_session(use_gpu=False) as sess:
+      result = sess.run(
+          array_ops.split(
+              array_ops.ones([4, 4]),
+              num_or_size_splits=array_ops.ones([2, 2]).get_shape()[1],
+              axis=0))
+
+    self.assertEqual(result[0].shape, (2, 4))
+    self.assertEqual(result[1].shape, (2, 4))
+
+    # test that none split dimensions remain, even if we don't know how
+    # the split_dim will be split, but we do know the axis
+    result = array_ops.split(
+        array_ops.ones([5, 2]), array_ops.constant([2, 1, 2]) * 1, axis=0)
+
+    self.assertEqual(result[0].shape[1], 2)
+    self.assertEqual(result[1].shape[1], 2)
+    self.assertEqual(result[2].shape[1], 2)
+
+    model_input2 = array_ops.placeholder(dtypes.float32, shape=[None, 2])
+    result = array_ops.split(model_input2, [2, 2], axis=0)[0]
+
+    with self.test_session(use_gpu=False) as sess:
+      sess.run(result, feed_dict={model_input2: np.ones([4, 2])})
+
   def testExplicitNum(self):
     size_splits = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
 
     value = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.test_session(use_gpu=True) as sess:
       with self.assertRaises(ValueError) as context:
         sess.run(array_ops.split(value, size_splits), {size_splits: [2, 2, 6]})
 
@@ -55,13 +107,13 @@ class SplitOpTest(test.TestCase):
 
     value = np.random.rand(11, 11)
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.test_session(use_gpu=True) as sess:
       result = sess.run(array_ops.split(value, [a, b]))
 
     self.assertAllEqual(result[0], value[0:5, :])
     self.assertAllEqual(result[1], value[5:, :])
 
-  def _RunAndVerifyVariable(self, use_gpu, large_num_splits=False):
+  def _RunAndVerifyVariable(self, dtype, large_num_splits=False):
     # Random dims of rank 5
     shape = np.random.randint(1, 5, size=5)
     split_dim = np.random.randint(0, 5)
@@ -71,8 +123,8 @@ class SplitOpTest(test.TestCase):
       num_split = np.random.randint(2, 8)
     size_splits = np.random.randint(2, 8, num_split)
     shape[split_dim] = np.sum(size_splits)
-    inp = np.random.rand(*shape).astype("f")
-    with self.test_session(use_gpu=use_gpu) as sess:
+    inp = self._makeData(shape, dtype)
+    with self.test_session(use_gpu=True) as sess:
       result = sess.run(array_ops.split(inp, size_splits, split_dim))
     slices = [slice(0, x) for x in shape]
     offset = 0
@@ -81,10 +133,10 @@ class SplitOpTest(test.TestCase):
       offset += size_splits[i]
       self.assertAllEqual(result[i], inp[slices])
 
-  def _testSpecialCasesVariable(self, use_gpu):
+  def _testSpecialCasesVariable(self):
     inp = np.random.rand(4, 4).astype("f")
 
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.test_session(use_gpu=True) as sess:
       result = sess.run(array_ops.split(inp, [4], 0))
       self.assertAllEqual(result[0], inp)
 
@@ -92,13 +144,13 @@ class SplitOpTest(test.TestCase):
       self.assertAllEqual(result[0], inp[0:1, :])
       self.assertAllEqual(result[1], inp[1:4, :])
 
-  def _testHugeNumberOfTensorsVariable(self, use_gpu):
+  def _testHugeNumberOfTensorsVariable(self, dtype):
     num_split = 10000
     size_splits = np.random.randint(1, 3, num_split)
     shape = [3, np.sum(size_splits)]
     split_dim = 1
-    inp = np.random.rand(*shape).astype("f")
-    with self.test_session(use_gpu=use_gpu) as sess:
+    inp = self._makeData(shape, dtype)
+    with self.test_session(use_gpu=True) as sess:
       result = sess.run(array_ops.split(inp, size_splits, split_dim))
     slices = [slice(0, x) for x in shape]
     offset = 0
@@ -108,18 +160,17 @@ class SplitOpTest(test.TestCase):
       self.assertAllEqual(result[i], inp[slices])
 
   def testSpecialCasesVariable(self):
-    self._testSpecialCasesVariable(False)
-    self._testSpecialCasesVariable(True)
-    self._testHugeNumberOfTensorsVariable(False)
-    self._testHugeNumberOfTensorsVariable(True)
+    self._testSpecialCasesVariable()
+    for dtype in _TEST_DTYPES:
+      self._testHugeNumberOfTensorsVariable(dtype)
 
-  def _testGradientsSimpleVariable(self, use_gpu):
-    inp = np.random.rand(4, 4).astype("f")
-    with self.test_session(use_gpu=use_gpu):
+  def _testGradientsSimpleVariable(self, dtype):
+    inp = self._makeData((4, 4), dtype)
+    with self.test_session(use_gpu=True):
       inp_tensor = ops.convert_to_tensor(inp)
       s = array_ops.split(inp_tensor, [1, 3], 1)
       inp_grads = [
-          np.random.rand(4, 1).astype("f"), np.random.rand(4, 3).astype("f")
+          self._makeData((4, 1), dtype), self._makeData((4, 3), dtype)
       ]
       grad_tensors = [constant_op.constant(x) for x in inp_grads]
       grad = gradients_impl.gradients(s, [inp_tensor], grad_tensors)[-1]
@@ -129,16 +180,16 @@ class SplitOpTest(test.TestCase):
     self.assertAllEqual(result[:, 1:4], inp_grads[1])
 
   def testOutputShape(self):
-    with self.test_session(use_gpu=False):
+    with self.test_session(use_gpu=True):
       tensor = array_ops.placeholder(dtypes.float32, shape=[None, 12])
       size_splits = [3, 7, 2]
       outputs = array_ops.split(tensor, size_splits, 1)
       for i, output in enumerate(outputs):
         self.assertEqual(output.get_shape().as_list(), [None, size_splits[i]])
 
-  def _compare(self, x, dim, num, use_gpu):
+  def _compare(self, x, dim, num):
     np_ans = np.split(x, num, dim)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.test_session(use_gpu=True) as sess:
       tf_ans = array_ops.split(value=x, num_or_size_splits=num, axis=dim)
       out = sess.run(tf_ans)
     self.assertEqual(num, len(np_ans))
@@ -148,21 +199,15 @@ class SplitOpTest(test.TestCase):
       self.assertAllEqual(np_ans[i], out[i])
       self.assertShapeEqual(np_ans[i], tf_ans[i])
 
-  def _testSplitRows(self, use_gpu):
-    inp = np.random.rand(4, 4).astype("f")
-    self._compare(inp, 0, 4, use_gpu)
-
-  def testSplitRowsAll(self):
-    self._testSplitRows(use_gpu=False)
-    self._testSplitRows(use_gpu=True)
+  def testSplitRows(self):
+    for dtype in _TEST_DTYPES:
+      inp = self._makeData((4, 4), dtype)
+      self._compare(inp, 0, 4)
 
-  def _testSplitCols(self, use_gpu):
-    inp = np.random.rand(4, 4).astype("f")
-    self._compare(inp, 1, 4, use_gpu)
-
-  def testSplitColsAll(self):
-    self._testSplitRows(use_gpu=False)
-    self._testSplitCols(use_gpu=True)
+  def testSplitCols(self):
+    for dtype in _TEST_DTYPES:
+      inp = self._makeData((4, 4), dtype)
+      self._compare(inp, 1, 4)
 
   def _testEmpty(self, x, dim, num, expected_shape):
     with self.test_session() as sess:
@@ -177,27 +222,28 @@ class SplitOpTest(test.TestCase):
   def testEmpty(self):
     # Note: np.split returns a rank-0 empty ndarray
     # if the input ndarray is empty.
-    inp = np.random.rand(8, 0, 21).astype("f")
-    self._testEmpty(inp, 0, 2, (4, 0, 21))
-    self._testEmpty(inp, 0, 4, (2, 0, 21))
-    self._testEmpty(inp, 1, 4, (8, 0, 21))
-    self._testEmpty(inp, 2, 3, (8, 0, 7))
-    self._testEmpty(inp, 2, 7, (8, 0, 3))
+    for dtype in _TEST_DTYPES:
+      inp = self._makeData((8, 0, 21), dtype)
+      self._testEmpty(inp, 0, 2, (4, 0, 21))
+      self._testEmpty(inp, 0, 4, (2, 0, 21))
+      self._testEmpty(inp, 1, 4, (8, 0, 21))
+      self._testEmpty(inp, 2, 3, (8, 0, 7))
+      self._testEmpty(inp, 2, 7, (8, 0, 3))
 
   def testIdentity(self):
-    inp = np.random.rand(2, 2, 2).astype("f")
-    for use_gpu in [False, True]:
-      self._compare(inp, 0, 1, use_gpu)
-      self._compare(inp, 1, 1, use_gpu)
-      self._compare(inp, 2, 1, use_gpu)
+    for dtype in _TEST_DTYPES:
+      inp = self._makeData((2, 2, 2), dtype)
+      self._compare(inp, 0, 1)
+      self._compare(inp, 1, 1)
+      self._compare(inp, 2, 1)
 
   def testSplitDim0(self):
-    for use_gpu in [False, True]:
-      self._compare(np.random.rand(6, 10, 18).astype("f"), 0, 3, use_gpu)
-      self._compare(np.random.rand(6, 7, 18).astype("f"), 0, 3, use_gpu)
-      self._compare(np.random.rand(6, 7, 9).astype("f"), 0, 3, use_gpu)
+    for dtype in _TEST_DTYPES:
+      self._compare(self._makeData((6, 10, 18), dtype), 0, 3)
+      self._compare(self._makeData((6, 7, 18), dtype), 0, 3)
+      self._compare(self._makeData((6, 7, 9), dtype), 0, 3)
 
-  def _RunAndVerify(self, use_gpu, large_num_splits=False):
+  def _RunAndVerify(self, dtype, large_num_splits=False):
     # Random dims of rank 5
     shape = np.random.randint(0, 5, size=5)
     split_dim = np.random.randint(0, 5)
@@ -206,8 +252,8 @@ class SplitOpTest(test.TestCase):
     else:
       num_split = np.random.randint(2, 8)
     shape[split_dim] = np.random.randint(2, 5) * num_split
-    inp = np.random.rand(*shape).astype("f")
-    with self.test_session(use_gpu=use_gpu) as sess:
+    inp = self._makeData(shape, dtype)
+    with self.test_session(use_gpu=True) as sess:
       result = sess.run(
           array_ops.split(
               value=inp, num_or_size_splits=num_split, axis=split_dim))
@@ -220,20 +266,19 @@ class SplitOpTest(test.TestCase):
       self.assertAllEqual(result[i], inp[slices])
 
   def testRandom(self):
-    for _ in range(5):
-      self._RunAndVerify(use_gpu=False)
-      self._RunAndVerify(use_gpu=True)
-      self._RunAndVerify(use_gpu=True, large_num_splits=True)
-      self._RunAndVerifyVariable(use_gpu=False)
-      self._RunAndVerifyVariable(use_gpu=True)
-      self._RunAndVerifyVariable(use_gpu=True, large_num_splits=True)
-
-  def _testGradientsSimple(self, use_gpu):
-    inp = np.random.rand(4, 4).astype("f")
-    with self.test_session(use_gpu=use_gpu):
+    for dtype in _TEST_DTYPES:
+      for _ in range(5):
+        self._RunAndVerify(dtype)
+        self._RunAndVerify(dtype, large_num_splits=True)
+        self._RunAndVerifyVariable(dtype)
+        self._RunAndVerifyVariable(dtype, large_num_splits=True)
+
+  def _testGradientsSimple(self, dtype):
+    inp = self._makeData((4, 4), dtype)
+    with self.test_session(use_gpu=True):
       inp_tensor = ops.convert_to_tensor(inp)
       s = array_ops.split(value=inp_tensor, num_or_size_splits=4, axis=1)
-      inp_grads = [np.random.rand(4, 1).astype("f") for _ in range(4)]
+      inp_grads = [self._makeData((4, 1), dtype)for _ in range(4)]
       grad_tensors = [constant_op.constant(x) for x in inp_grads]
       grad = gradients_impl.gradients(s, [inp_tensor], grad_tensors)[0]
       result = grad.eval()
@@ -241,10 +286,9 @@ class SplitOpTest(test.TestCase):
       self.assertAllEqual(result[:, i:i + 1], inp_grads[i])
 
   def testGradientsAll(self):
-    self._testGradientsSimple(use_gpu=False)
-    self._testGradientsSimple(use_gpu=True)
-    self._testGradientsSimpleVariable(use_gpu=False)
-    self._testGradientsSimpleVariable(use_gpu=True)
+    for dtype in _TEST_DTYPES:
+      self._testGradientsSimple(dtype)
+      self._testGradientsSimpleVariable(dtype)
 
   def testShapeFunctionEdgeCases(self):
     # split_dim greater than rank of input.
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 81eee48d2e86ff7a8d49885ad9b8c03d1c9c28db..645ac2f13028ccd21aa7397fc058ab4a3dcd6db4 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@ class StageTest(test.TestCase):
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         v = 2. * (array_ops.zeros([128, 128]) + x)
-      with ops.device('/gpu:0'):
+      with ops.device(test.gpu_device_name()):
         stager = data_flow_ops.StagingArea([dtypes.float32])
         stage = stager.put([v])
         y = stager.get()
@@ -78,18 +78,174 @@ class StageTest(test.TestCase):
         self.assertAllClose(
             4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
-  def testColocation1(self):
+  def testColocation(self):
+    gpu_dev = test.gpu_device_name()
+
     with ops.device('/cpu:0'):
       x = array_ops.placeholder(dtypes.float32)
       v = 2. * (array_ops.zeros([128, 128]) + x)
-    with ops.device('/gpu:0'):
+    with ops.device(gpu_dev):
       stager = data_flow_ops.StagingArea([dtypes.float32])
       y = stager.put([v])
-      self.assertEqual(y.device, '/device:GPU:0')
+      self.assertEqual(y.device, '/device:GPU:0' if gpu_dev
+                                                 else gpu_dev)
     with ops.device('/cpu:0'):
       x = stager.get()
       self.assertEqual(x.device, '/device:CPU:0')
 
+  def testPeek(self):
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.int32, name='x')
+      p = array_ops.placeholder(dtypes.int32, name='p')
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.StagingArea([dtypes.int32, ], shapes=[[]])
+      stage = stager.put([x])
+      peek = stager.peek(p)
+      ret = stager.get()
+
+    with self.test_session(use_gpu=True) as sess:
+      for i in range(10):
+        sess.run(stage, feed_dict={x:i})
+
+      for i in range(10):
+        self.assertTrue(sess.run(peek, feed_dict={p:i}) == i)
+
+  def testSizeAndClear(self):
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.float32, name='x')
+      v = 2. * (array_ops.zeros([128, 128]) + x)
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.StagingArea(
+          [dtypes.float32, dtypes.float32],
+          shapes=[[], [128, 128]],
+          names=['x', 'v'])
+      stage = stager.put({'x': x, 'v': v})
+      ret = stager.get()
+      size = stager.size()
+      clear = stager.clear()
+
+    with self.test_session(use_gpu=True) as sess:
+      sess.run(stage, feed_dict={x: -1})
+      self.assertEqual(sess.run(size), 1)
+      sess.run(stage, feed_dict={x: -1})
+      self.assertEqual(sess.run(size), 2)
+      sess.run(clear)
+      self.assertEqual(sess.run(size), 0)
+
+  def testCapacity(self):
+    capacity = 3
+
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.int32, name='x')
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.StagingArea([dtypes.int32, ],
+        capacity=capacity, shapes=[[]])
+      stage = stager.put([x])
+      ret = stager.get()
+      size = stager.size()
+
+    from six.moves import queue as Queue
+    import threading
+
+    queue = Queue.Queue()
+    n = 5
+    missed = 0
+
+    with self.test_session(use_gpu=True) as sess:
+      # Stage data in a separate thread which will block
+      # when it hits the staging area's capacity and thus
+      # not fill the queue with n tokens
+      def thread_run():
+        for i in range(n):
+          sess.run(stage, feed_dict={x: i})
+          queue.put(0)
+
+      t = threading.Thread(target=thread_run)
+      t.start()
+
+      # Get tokens from the queue, making notes of when we timeout
+      for i in range(n):
+        try:
+          queue.get(timeout=0.05)
+        except Queue.Empty:
+          missed += 1
+
+      # We timed out n - capacity times waiting for queue puts
+      self.assertTrue(missed == n - capacity)
+
+      # Clear the staging area out a bit
+      for i in range(n - capacity):
+        self.assertTrue(sess.run(ret) == i)
+
+      # Thread should be able to join now
+      t.join()
+
+      self.assertTrue(sess.run(size) == capacity)
+
+      # Clear the staging area completely
+      for i in range(capacity):
+        self.assertTrue(sess.run(ret) == i+(n-capacity))
+
+      self.assertTrue(sess.run(size) == 0)
+
+  def testMemoryLimit(self):
+    memory_limit = 512*1024  # 512K
+    chunk = 200*1024 # 256K
+    capacity = memory_limit // chunk
+
+    with ops.device('/cpu:0'):
+      x = array_ops.placeholder(dtypes.uint8, name='x')
+    with ops.device(test.gpu_device_name()):
+      stager = data_flow_ops.StagingArea([dtypes.uint8, ],
+        memory_limit=memory_limit, shapes=[[]])
+      stage = stager.put([x])
+      ret = stager.get()
+      size = stager.size()
+
+    from six.moves import queue as Queue
+    import threading
+    import numpy as np
+
+    queue = Queue.Queue()
+    n = 5
+    missed = 0
+
+    with self.test_session(use_gpu=True) as sess:
+      # Stage data in a separate thread which will block
+      # when it hits the staging area's capacity and thus
+      # not fill the queue with n tokens
+      def thread_run():
+        for i in range(n):
+          sess.run(stage, feed_dict={x: np.full(chunk, i, dtype=np.uint8)})
+          queue.put(0)
+
+      t = threading.Thread(target=thread_run)
+      t.start()
+
+      # Get tokens from the queue, making notes of when we timeout
+      for i in range(n):
+        try:
+          queue.get(timeout=0.05)
+        except Queue.Empty:
+          missed += 1
+
+      # We timed out n - capacity times waiting for queue puts
+      self.assertTrue(missed == n - capacity)
+
+      # Clear the staging area out a bit
+      for i in range(n - capacity):
+        self.assertTrue(sess.run(ret)[0] == i)
+
+      # Thread should be able to join now
+      t.join()
+
+      self.assertTrue(sess.run(size) == capacity)
+
+      # Clear the staging area completely
+      for i in range(capacity):
+        self.assertTrue(sess.run(ret)[0] == i+(n-capacity))
+
+      self.assertTrue(sess.run(size) == 0)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/substr_op_test.py b/tensorflow/python/kernel_tests/substr_op_test.py
index 0c0710fed438ebd51b40e46b4a7f3be399788ea8..854394b0dde867f7b351619e0832a39a77c3556b 100644
--- a/tensorflow/python/kernel_tests/substr_op_test.py
+++ b/tensorflow/python/kernel_tests/substr_op_test.py
@@ -183,7 +183,7 @@ class SubstrOpTest(test.TestCase):
 
     position = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]], dtype)
     length = np.array([[2, 3, 4]], dtype)
-    # Should fail: postion/length have different dimensionality
+    # Should fail: position/length have different dimensionality
     with self.assertRaises(ValueError):
       substr_op = string_ops.substr(test_string, position, length)
 
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index be2d6a566abb611e5c5265b9f7787df2571636ba..54e8098e4e6002c8f3f6c6260b15bc614cca7b83 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -306,7 +306,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(custom_getter_count[0], 2)
 
     # Test that custom getter is called when the variable scope is created
-    # during construction
+  # during construction
     custom_getter_count[0] = 0
     tmpl2 = template.make_template(
         "s2",
@@ -319,6 +319,28 @@ class TemplateTest(test.TestCase):
     tmpl2()
     self.assertEqual(custom_getter_count[0], 2)
 
+  def test_fails_gracefully(self):
+    for create_scope_now in [True, False]:
+      def module_function_with_one_arg(inputs):
+        w = variable_scope.get_variable(
+            "w", shape=[1], initializer=init_ops.zeros_initializer())
+        return inputs * w
+
+      templatized_function = template.make_template(
+          "f1", module_function_with_one_arg,
+          create_scope_now_=create_scope_now)
+      data = array_ops.zeros(1)
+      try:
+        # Try to connect with a kwarg which is unsupported.
+        templatized_function(data, is_training=True)
+      except TypeError:
+        pass
+
+      # The failed __call__ hasn't modified the inner state.
+      self.assertFalse(templatized_function._variables_created)
+      templatized_function(data)
+      self.assertTrue(templatized_function._variables_created)
+
   def test_name_scopes_for_variable_scopes(self):
     # Test that name scopes are not unnecessarily uniquified (but are
     # still uniquified when necessary).
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index e52ae95281adb13096ab0f3698be7b28f4f07442..5b0f318efe28d56c73e89584d39d81cb288e5a86 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -36,6 +38,19 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+def _make_converter(tf_dtype):
+  def _converter(x):
+    if tf_dtype == dtypes.string:
+      # In Python3, np.str is unicode, while we always want bytes
+      return np.asarray(x).astype("|S")
+    x = np.asarray(x).astype(tf_dtype.as_numpy_dtype)
+    if tf_dtype.is_complex:
+      # Add a non-zero imaginary component to x.
+      x -= 1j * x
+    return x
+  return _converter
+
+
 class TensorArrayTest(test.TestCase):
 
   def testTensorArrayWriteRead(self):
@@ -60,16 +75,11 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(-3.0, d2)
 
   def _testTensorArrayWritePack(self, tf_dtype):
-    dtype = tf_dtype.as_numpy_dtype()
     with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
 
-      if tf_dtype == dtypes.string:
-        # In Python3, np.str is unicode, while we always want bytes
-        convert = lambda x: np.asarray(x).astype("|S")
-      else:
-        convert = lambda x: np.asarray(x).astype(dtype)
+      convert = _make_converter(tf_dtype)
 
       w0 = ta.write(0, convert([[4.0, 5.0]]))
       w1 = w0.write(1, convert([[6.0, 7.0]]))
@@ -92,17 +102,26 @@ class TensorArrayTest(test.TestCase):
   def testTensorArrayWritePack(self):
     self._testTensorArrayWritePackMaybeLegacy()
 
+  def testEmptyTensorArrayPack(self):
+    with self.test_session(use_gpu=True):
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, tensor_array_name="foo", size=3)
+
+      empty_element = np.zeros((0, 1), dtype=np.float32)
+      w0 = ta.write(0, empty_element)
+      w1 = w0.write(1, empty_element)
+      w2 = w1.write(2, empty_element)
+
+      c0 = w2.stack()
+
+      self.assertAllEqual([3, 0, 1], c0.eval().shape)
+
   def _testTensorArrayWriteConcat(self, tf_dtype):
-    dtype = tf_dtype.as_numpy_dtype()
     with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3, infer_shape=False)
 
-      if tf_dtype == dtypes.string:
-        # In Python3, np.str is unicode, while we always want bytes
-        convert = lambda x: np.asarray(x).astype("|S")
-      else:
-        convert = lambda x: np.asarray(x).astype(dtype)
+      convert = _make_converter(tf_dtype)
 
       w0 = ta.write(0, convert([[4.0, 5.0], [104.0, 105.0], [204.0, 205.0]]))
       w1 = w0.write(1, convert([[6.0, 7.0], [106.0, 107.0]]))
@@ -124,7 +143,7 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayWriteConcat(dtypes.string)
 
   def _testTensorArrayPackNotAllValuesAvailableFails(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
 
@@ -136,16 +155,11 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayPackNotAllValuesAvailableFails()
 
   def _testTensorArrayUnpackRead(self, tf_dtype):
-    dtype = tf_dtype.as_numpy_dtype()
     with self.test_session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
 
-      if tf_dtype is dtypes.string:
-        # In Python3, np.str is unicode, while we always want bytes
-        convert = lambda x: np.asarray(x).astype("|S")
-      else:
-        convert = lambda x: np.asarray(x).astype(dtype)
+      convert = _make_converter(tf_dtype)
 
       # Unpack a vector into scalars
       w0 = ta.unstack(convert([1.0, 2.0, 3.0]))
@@ -201,16 +215,11 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayUnpackReadMaybeLegacy()
 
   def _testTensorArraySplitRead(self, tf_dtype):
-    dtype = tf_dtype.as_numpy_dtype()
     with self.test_session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3, infer_shape=False)
 
-      if tf_dtype == dtypes.string:
-        # In Python3, np.str is unicode, while we always want bytes
-        convert = lambda x: np.asarray(x).astype("|S")
-      else:
-        convert = lambda x: np.asarray(x).astype(dtype)
+      convert = _make_converter(tf_dtype)
 
       # Split an empty vector
       lengths = constant_op.constant([0, 0, 0])
@@ -831,7 +840,7 @@ class TensorArrayTest(test.TestCase):
         dynamic_size=True, dtype=dtypes.float32)
 
   def testGradSerialTwoLoops(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       num_steps = 100
       acc = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -937,7 +946,7 @@ class TensorArrayTest(test.TestCase):
         self._grad_source_for_name("foo/gradients/bar/gradients_0/baz"))
 
   def testWriteShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       c0 = constant_op.constant([4.0, 5.0])
@@ -961,7 +970,7 @@ class TensorArrayTest(test.TestCase):
         w0.write(0, c2)
 
   def testPartlyUnknownShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=6)
 
@@ -1001,7 +1010,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([5, 4, 2, 3], r5.get_shape().as_list())
 
   def _testUnpackShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -1027,7 +1036,7 @@ class TensorArrayTest(test.TestCase):
     self._testUnpackShape()
 
   def testSplitShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -1050,7 +1059,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(r0.get_shape(), tensor_shape.unknown_shape())
 
   def testWriteUnknownShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -1121,7 +1130,7 @@ class TensorArrayTest(test.TestCase):
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=True)
       self.assertEqual(0, ta.size().eval())
       # Don't actually perform the pack.  This stores the static shape.
-      ta.unstack(array_ops.zeros([0, 3, 5]))
+      ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
       packed = ta.stack()
       self.assertAllEqual([0, 3, 5], packed.eval().shape)
       # Concatenating zero tensors along their first dimension gives a
@@ -1187,85 +1196,86 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(expected_grad, grad_vals[0])
 
   def testTensorArrayGetsDeviceFromFirstWrite(self):
-    with ops.device("/gpu:1"):
+    with ops.device("/job:worker/task:0/cpu:0"):
+      # this initial device will be ignored.
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
-    # parent device was ignored when creating the TensorArray
-    self.assertEqual(ta.handle.device, "")
-    self.assertEqual(ta.flow.device, "")
-    with ops.device("/gpu:0"):
-      # the first write sets the op's device
+    with ops.device("/job:worker/task:1/cpu:0"):
+      # the first write sets the op's device.
       ta = ta.write(0, 1.0)
-    self.assertTrue("gpu:0" in ta.handle.device.lower())
-    self.assertTrue("gpu:0" in ta.flow.device.lower())
-    with ops.device("/gpu:1"):
-      # subsequent writes do not modify the op's device
+    with ops.device("/job:worker/task:2/cpu:0"):
+      # subsequent writes do not modify the op's device.
       ta = ta.write(1, 1.0)
-    self.assertTrue("gpu:0" in ta.handle.device.lower())
-    self.assertTrue("gpu:0" in ta.flow.device.lower())
 
+    # The gradient TA will sit on the same device as the forward TA.
     ta_grad = ta.grad("grad")
-    self.assertTrue("gpu:0" in ta_grad.handle.device.lower())
-    self.assertTrue("gpu:0" in ta_grad.flow.device.lower())
+    flows = [ta.flow, ta_grad.flow]
 
     # Similar tests for unpack and split
-    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
-    self.assertEqual(ta.handle.device, "")
-    self.assertEqual(ta.flow.device, "")
-    with ops.device("/gpu:0"):
+    with ops.device("/job:worker/task:0/cpu:0"):
+      ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=3)
+    with ops.device("/job:worker/task:1/cpu:0"):
       ta = ta.unstack([1.0, 2.0])
-    self.assertTrue("gpu:0" in ta.handle.device.lower())
-    self.assertTrue("gpu:0" in ta.flow.device.lower())
-    with ops.device("/gpu:1"):
-      ta = ta.unstack([1.0, 2.0])
-    self.assertTrue("gpu:0" in ta.handle.device.lower())
-    self.assertTrue("gpu:0" in ta.flow.device.lower())
+    with ops.device("/job:worker/task:2/cpu:0"):
+      ta = ta.write(2, 3.0)
+    flows.append(ta.flow)
 
-    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
-    self.assertEqual(ta.handle.device, "")
-    self.assertEqual(ta.flow.device, "")
-    with ops.device("/gpu:0"):
-      ta = ta.split([1.0, 2.0], [1, 1])
-    self.assertTrue("gpu:0" in ta.handle.device.lower())
-    self.assertTrue("gpu:0" in ta.flow.device.lower())
-    with ops.device("/gpu:1"):
+    with ops.device("/job:worker/task:0/cpu:0"):
+      ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
+    with ops.device("/job:worker/task:1/cpu:0"):
       ta = ta.split([1.0, 2.0], [1, 1])
-    self.assertTrue("gpu:0" in ta.handle.device.lower())
-    self.assertTrue("gpu:0" in ta.flow.device.lower())
+    flows.append(ta.flow)
+
+    workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
+    session = session_lib.Session(workers[0].target)
+
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+
+    session.run(flows, options=run_options, run_metadata=run_metadata)
+    self.assertTrue(run_metadata.HasField("step_stats"))
+    dev_stats = {d.device: d.node_stats
+                 for d in run_metadata.step_stats.dev_stats}
+    for d in dev_stats:
+      if "/task:1/" in d:
+        self.assertTrue(
+            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+      else:
+        self.assertFalse(
+            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
   def testTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
-    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
+    with ops.device("/job:worker/task:0/cpu:0"):
+      ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
 
     def _body(i, ta_i):
-      with ops.device("/gpu:0"):
+      with ops.device("/job:worker/task:1/cpu:0"):
         return i + 1, ta_i.write(i, 0.0)
 
-    self.assertEqual(ta.handle.device, "")
-    self.assertEqual(ta.flow.device, "")
-
     _, ta_out = control_flow_ops.while_loop(
         lambda i, ta: i < 2, _body, loop_vars=[0, ta])
 
-    self.assertTrue("gpu:0" in ta_out.handle.device.lower())
-    self.assertTrue("gpu:0" in ta.handle.device.lower())
-
-  def testTensorArrayLazyDeviceSettingDoesNotConfuseInitialAccess(self):
-    with self.test_session(use_gpu=True) as session:
-      ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
-      self.assertEqual(ta.handle.device, "")
-
-      with ops.device("/cpu:0"):
-        size = ta.size()
-      with ops.device("/gpu:0"):
-        ta = ta.write(0, 0.0)
-
-      self.assertTrue("gpu:0" in ta.handle.device.lower())
-
-      # This should use the TensorArray on /gpu:0
-      size_value, _ = session.run((size, ta.flow))
-      self.assertEqual(2, size_value)
+    workers, _ = test.create_local_cluster(num_workers=3, num_ps=0)
+    session = session_lib.Session(workers[0].target)
+
+    run_options = config_pb2.RunOptions(
+        trace_level=config_pb2.RunOptions.FULL_TRACE)
+    run_metadata = config_pb2.RunMetadata()
+
+    session.run(ta_out.flow, options=run_options, run_metadata=run_metadata)
+    self.assertTrue(run_metadata.HasField("step_stats"))
+    dev_stats = {d.device: d.node_stats
+                 for d in run_metadata.step_stats.dev_stats}
+    for d in dev_stats:
+      if "/task:1/" in d:
+        self.assertTrue(
+            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+      else:
+        self.assertFalse(
+            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
   def testTensorArrayIdentity(self):
-    with self.test_session() as session:
+    with self.test_session(use_gpu=True) as session:
       ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2,
                                          infer_shape=False)
       ta1 = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=4,
diff --git a/tensorflow/python/kernel_tests/tensor_priority_test.py b/tensorflow/python/kernel_tests/tensor_priority_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..574538a837a0a112e1a806ddea7a13fe44beacc2
--- /dev/null
+++ b/tensorflow/python/kernel_tests/tensor_priority_test.py
@@ -0,0 +1,86 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the binary ops priority mechanism."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test as test_lib
+
+
+class TensorPriorityTest(test_lib.TestCase):
+
+  def testSupportedRhsWithoutDelegation(self):
+
+    class NumpyArraySubclass(np.ndarray):
+      pass
+
+    supported_rhs_without_delegation = (3, 3.0, [1.0, 2.0], np.array(
+        [1.0, 2.0]), NumpyArraySubclass(
+            shape=(1, 2), buffer=np.array([1.0, 2.0])),
+                                        ops.convert_to_tensor([[1.0, 2.0]]))
+    for rhs in supported_rhs_without_delegation:
+      tensor = ops.convert_to_tensor([[10.0, 20.0]])
+      res = tensor + rhs
+      self.assertIsInstance(res, ops.Tensor)
+
+  def testUnsupportedRhsWithoutDelegation(self):
+
+    class WithoutReverseAdd(object):
+      pass
+
+    tensor = ops.convert_to_tensor([[10.0, 20.0]])
+    rhs = WithoutReverseAdd()
+    with self.assertRaisesWithPredicateMatch(
+        TypeError, lambda e: "Expected float" in str(e)):
+      # pylint: disable=pointless-statement
+      tensor + rhs
+
+  def testUnsupportedRhsWithDelegation(self):
+
+    class WithReverseAdd(object):
+
+      def __radd__(self, lhs):
+        return "Works!"
+
+    tensor = ops.convert_to_tensor([[10.0, 20.0]])
+    rhs = WithReverseAdd()
+    res = tensor + rhs
+    self.assertEqual(res, "Works!")
+
+  def testFullDelegationControlUsingRegistry(self):
+
+    class NumpyArraySubclass(np.ndarray):
+
+      def __radd__(self, lhs):
+        return "Works!"
+
+    def raise_to_delegate(value, dtype=None, name=None, as_ref=False):
+      del value, dtype, name, as_ref  # Unused.
+      raise TypeError
+
+    ops.register_tensor_conversion_function(
+        NumpyArraySubclass, raise_to_delegate, priority=0)
+    tensor = ops.convert_to_tensor([[10.0, 20.0]])
+    rhs = NumpyArraySubclass(shape=(1, 2), buffer=np.array([1.0, 2.0]))
+    res = tensor + rhs
+    self.assertEqual(res, "Works!")
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index 9d89e250f59a15a93f445aa6514606d2fec7fe2d..b61995373fd6b085645dcdeb8acdcbb8daf14cde 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -37,16 +37,41 @@ class TopKTest(test.TestCase):
                     expected_values,
                     expected_indices,
                     sorted=True):
-    np_values = np.array(expected_values)
-    np_indices = np.array(expected_indices)
-    with self.test_session():
+    np_expected_values = np.array(expected_values)
+    np_expected_indices = np.array(expected_indices)
+    with self.test_session(use_gpu=True):
       values_op, indices_op = nn_ops.top_k(inputs, k, sorted=sorted)
       values = values_op.eval()
       indices = indices_op.eval()
-      self.assertAllClose(np_values, values)
-      self.assertAllEqual(np_indices, indices)
-      self.assertShapeEqual(np_values, values_op)
-      self.assertShapeEqual(np_indices, indices_op)
+
+      self.assertShapeEqual(np_expected_values, values_op)
+      self.assertShapeEqual(np_expected_indices, indices_op)
+
+      if sorted:
+        self.assertAllClose(np_expected_values, values)
+        self.assertAllEqual(np_expected_indices, indices)
+      else:
+        np_inputs = np.array(inputs)
+
+        # Check that the indices are valid.
+        for result_index, src_index in np.ndenumerate(indices):
+          value = values[result_index]
+          expected_value = np_inputs[result_index[0], src_index]
+          np.testing.utils.assert_almost_equal(value, expected_value)
+
+        # Check that if two elements are equal, the lower-index element appears
+        # first.
+        shape = values.shape
+        for batch_index in range(shape[0]):
+          for index in range(shape[1] - 1):
+            if np.isclose(values[batch_index, index],
+                          values[batch_index, index + 1]):
+              self.assertLess(indices[batch_index, index],
+                              indices[batch_index, index + 1])
+
+        # Now check the results, ignoring order.
+        self.assertAllEqual(np.sort(np_expected_indices), np.sort(indices))
+        self.assertAllClose(np.sort(np_expected_values), np.sort(values))
 
   def testTop1(self):
     inputs = [[0.1, 0.3, 0.2, 0.4], [0.1, 0.3, 0.3, 0.2]]
@@ -79,7 +104,7 @@ class TopKTest(test.TestCase):
 
   def testKNegative(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       k = array_ops.placeholder(dtypes.int32)
       values, _ = nn_ops.top_k(inputs, k)
       with self.assertRaisesOpError("Need k >= 0, got -7"):
@@ -92,7 +117,7 @@ class TopKTest(test.TestCase):
       nn_ops.top_k(inputs, 4)
 
   def testTopKGradients(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       inputs = array_ops.placeholder(dtypes.int32, shape=[2, 5])
       values, _ = nn_ops.top_k(inputs, 3)
       grad = sess.run(
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index 939a86c0b50e2956c7364fdf564ffbeb038499af..7b112a6a17b9a8a366be9e00ecbf4f5a68665e18 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -358,9 +358,6 @@ class TransposeTest(test.TestCase):
     with self.assertRaises(ValueError):
       array_ops.transpose(
           np.arange(0., 30).reshape([2, 3, 5]), [[0, 1], [2, 3]])
-    self._testError(
-        np.arange(0., 2**11).reshape([2] * 11), np.arange(11),
-        "not implemented")
     with self.assertRaises(ValueError):
       array_ops.transpose(np.arange(0., 30).reshape([2, 3, 5]), [0, 1, 3])
     self._testError(
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 69d1a6f60e1eed15f71d5884be720cceac3171b5..7108131d53d88e0bfd6039b7d6f75a51293d081c 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -115,7 +115,7 @@ class VariableScopeTest(test.TestCase):
           dtypes.int64, dtypes.bool
       ]
 
-      # Use different varibale_name to distinguish various dtypes
+      # Use different variable_name to distinguish various dtypes
       for (i, dtype) in enumerate(types):
         x = variable_scope.get_variable(
             name="x%d" % i, shape=(3, 4), dtype=dtype)
@@ -774,6 +774,11 @@ class VariableScopeTest(test.TestCase):
         self.assertEqual([v.name
                           for v in scope.global_variables()], ["foo/b:0"])
 
+  def testGetVariableWithRefDtype(self):
+    v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
+    # Ensure it is possible to do get_variable with a _ref dtype passed in.
+    _ = variable_scope.get_variable("w", shape=[5, 6], dtype=v.dtype)
+
 
 def axis0_into1_partitioner(shape=None, **unused_kwargs):
   part = [1] * len(shape)
@@ -802,7 +807,7 @@ class VariableScopeWithPartitioningTest(test.TestCase):
           dtypes.int64, dtypes.bool
       ]
 
-      # Use different varibale_name to distinguish various dtypes
+      # Use different variable_name to distinguish various dtypes
       for (i, dtype) in enumerate(types):
         x = variable_scope.get_variable(
             name="x%d" % i,
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 9b76585f9fbbb7ef75edfc5a20dea18ec5d19f1e..8410f12f3e4ea8301b9cf3bac0f3fa8c81e71af5 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -25,7 +25,6 @@ from __future__ import print_function
 
 import copy
 import functools
-import inspect
 import re
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
@@ -36,9 +35,10 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
 
 
-class _Layer(object):
+class Layer(object):
   """Base layer class.
 
   WARNING: Do not subclass this layer unless you know what you are doing:
@@ -60,6 +60,8 @@ class _Layer(object):
     variables: List of all variables of this layer, trainable and non-trainable.
     updates: List of update ops of this layer.
     losses: List of losses added by this layer.
+    input_spec: Object specifying the constraints on inputs that can be
+      accepted by the layer.
   """
 
   def __init__(self, trainable=True, name=None,
@@ -80,23 +82,28 @@ class _Layer(object):
       if kwarg not in allowed_kwargs:
         raise TypeError('Keyword argument not understood:', kwarg)
 
-    self._trainable = trainable
-    self._built = False
-    self._trainable_variables = []
-    self._non_trainable_variables = []
+    self.trainable = trainable
+    self.built = False
+    self._trainable_weights = []
+    self._non_trainable_weights = []
     self._updates = []
     self._losses = []
     self._reuse = kwargs.get('_reuse')
     self._graph = ops.get_default_graph()
-    self.dtype = dtype
+    self._per_input_losses = {}
+    self._per_input_updates = {}
+    self.dtype = dtypes.as_dtype(dtype).name
+    self.input_spec = None
 
-    # Determine base name (non-unique).
+    # Determine layer name (non-unique).
     if isinstance(name, vs.VariableScope):
       base_name = name.name
     else:
       base_name = name
+      self.name = name
     if not name:
       base_name = _to_snake_case(self.__class__.__name__)
+      self.name = _unique_layer_name(base_name)
     self._base_name = base_name
 
     # Determine variable scope.
@@ -106,45 +113,43 @@ class _Layer(object):
     else:
       self._scope = None
 
-    # Unique name is borrowed from scope to match variable names.
-    if self._scope is not None:
-      self._name = self._scope.name
-    else:
-      # No name available until we see a scope
-      self._name = None
-
-  def __setattr__(self, name, value):
-    if hasattr(self, name):
-      # Only allow private attributes to be set more than once, under the
-      # convention that private attributes should only be set from inside
-      # the class.
-      # All attributes meant to be set several times should be set to private.
-      if name[0] != '_':
-        raise AttributeError('Read-only property cannot be set: %s' % name)
-    super(_Layer, self).__setattr__(name, value)
+  @property
+  def scope_name(self):
+    if not self._scope:
+      raise ValueError('No name available for layer scope because the layer "' +
+                       self.name + '" has not been used yet. The scope name ' +
+                       ' is determined the first time the layer instance is ' +
+                       'called. You must therefore call the layer before ' +
+                       'querying `scope_name`.')
+    return self._scope.name
+
+  @property
+  def trainable_weights(self):
+    return self._trainable_weights if self.trainable else []
 
   @property
-  def name(self):
-    if self._name is None:
-      raise ValueError(
-          'No name available for layer because it has not been used yet.')
-    return self._name
+  def non_trainable_weights(self):
+    if self.trainable:
+      return self._non_trainable_weights
+    else:
+      return self._trainable_weights + self._non_trainable_weights
 
   @property
   def trainable_variables(self):
-    return self._trainable_variables if self.trainable else []
+    return self.trainable_weights
 
   @property
   def non_trainable_variables(self):
-    return self._non_trainable_variables if self.trainable else self.variables
+    return self.non_trainable_weights
 
   @property
-  def trainable_weights(self):
-    return self.trainable_variables
+  def weights(self):
+    """Returns the list of all layer variables/weights.
 
-  @property
-  def non_trainable_weights(self):
-    return self.non_trainable_variables
+    Returns:
+      A list of variables.
+    """
+    return self.trainable_weights + self.non_trainable_weights
 
   @property
   def variables(self):
@@ -153,37 +158,141 @@ class _Layer(object):
     Returns:
       A list of variables.
     """
-    return self._trainable_variables + self._non_trainable_variables
+    return self.weights
 
   @property
   def updates(self):
     return self._updates
 
+  def add_update(self, updates, inputs=None):
+    """Add update op(s), potentially dependent on layer inputs.
+
+    Weight updates (for instance, the updates of the moving mean and variance
+    in a BatchNormalization layer) may be dependent on the inputs passed
+    when calling a layer. Hence, when reusing a same layer on
+    different inputs `a` and `b`, some entries in `layer.updates` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_updates_for` method allows to retrieve the updates relevant to a
+    specific set of inputs.
+
+    Arguments:
+      updates: Update op, or list/tuple of update ops.
+      inputs: Optional input tensor(s) that the update(s) depend on. Must
+        match the `inputs` argument passed to the `__call__` method at the time
+        the updates are created. If `None` is passed, the updates are assumed
+        to be unconditional, and will apply across all dataflows of the layer.
+    """
+    updates = _to_list(updates)
+    if not updates:
+      return
+    self._updates += updates
+    if inputs is not None:
+      inputs = _to_list(inputs)
+    if not inputs:
+      inputs = None
+    if inputs is not None:
+      # We compute an ID that uniquely identifies the list of tensors.
+      # This ID is order-sensitive.
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    if inputs_hash not in self._per_input_updates:
+      self._per_input_updates[inputs_hash] = []
+    self._per_input_updates[inputs_hash] += updates
+
+  def get_updates_for(self, inputs):
+    """Retrieves updates relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+        Must match the `inputs` argument passed to the `__call__` method
+        at the time the updates were created.
+        If you pass `inputs=None`, unconditional updates are returned.
+
+    Returns:
+      List of update ops of the layer that depend on `inputs`.
+    """
+    if inputs is not None:
+      inputs = _to_list(inputs)
+    if not inputs:
+      inputs = None
+    if inputs is not None:
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    return self._per_input_updates.get(inputs_hash, [])
+
   @property
   def losses(self):
     return self._losses
 
-  @property
-  def built(self):
-    return self._built
+  def add_loss(self, losses, inputs=None):
+    """Add loss tensor(s), potentially dependent on layer inputs.
 
-  @property
-  def trainable(self):
-    return self._trainable
+    Some losses (for instance, activity regularization losses) may be dependent
+    on the inputs passed when calling a layer. Hence, when reusing a same layer
+    on different inputs `a` and `b`, some entries in `layer.losses` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
 
-  @property
-  def weights(self):
-    """Returns the list of all layer variables/weights.
+    The `get_losses_for` method allows to retrieve the losses relevant to a
+    specific set of inputs.
+
+    Arguments:
+      losses: Loss tensor, or list/tuple of tensors.
+      inputs: Optional input tensor(s) that the loss(es) depend on. Must
+        match the `inputs` argument passed to the `__call__` method at the time
+        the losses are created. If `None` is passed, the losses are assumed
+        to be unconditional, and will apply across all dataflows of the layer
+        (e.g. weight regularization losses).
+    """
+    losses = _to_list(losses)
+    if not losses:
+      return
+    self._losses += losses
+    if inputs is not None:
+      inputs = _to_list(inputs)
+    if not inputs:
+      inputs = None
+    if inputs is not None:
+      # We compute an ID that uniquely identifies the list of tensors.
+      # This ID is order-sensitive.
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    if inputs_hash not in self._per_input_losses:
+      self._per_input_losses[inputs_hash] = []
+    self._per_input_losses[inputs_hash] += losses
+
+  def get_losses_for(self, inputs):
+    """Retrieves losses relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+        Must match the `inputs` argument passed to the `__call__`
+        method at the time the losses were created.
+        If you pass `inputs=None`, unconditional losses are returned,
+        such as weight regularization losses.
 
     Returns:
-      A list of variables.
+      List of loss tensors of the layer that depend on `inputs`.
     """
-    return self.variables
+    if inputs is not None:
+      inputs = _to_list(inputs)
+    if not inputs:
+      inputs = None
+    if inputs is not None:
+      inputs_hash = _object_list_uid(inputs)
+    else:
+      inputs_hash = None
+    return self._per_input_losses.get(inputs_hash, [])
 
   def build(self, _):
     """Creates the variables of the layer.
     """
-    self._built = True
+    self.built = True
 
   def call(self, inputs, **kwargs):
     """The logic of the layer lives here.
@@ -201,6 +310,9 @@ class _Layer(object):
     """Computes the output shape of the layer given the input shape.
 
     Assumes that the layer will be built to match that input shape.
+    If this method is not implemented by child classes, the default
+    assumption will be that the layer does not alter the shape of the tensors
+    passing through it.
 
     Args:
       input_shape: A (possibly nested tuple of) `TensorShape`.  It need not
@@ -215,12 +327,21 @@ class _Layer(object):
       ValueError: if `input_shape` is incomplete or is incompatible with the
         the layer.
     """
-    raise NotImplementedError
+    return input_shape
+
+  def _set_scope(self, scope=None):
+    if self._scope is None:
+      # If constructed with _scope=None, lazy setting of scope.
+      if self._reuse:
+        self._scope = next(vs.variable_scope(
+            scope if scope is not None else self._base_name).gen)
+      else:
+        self._scope = next(vs.variable_scope(
+            scope, default_name=self._base_name).gen)
 
-  def _add_variable(self, name, shape, dtype=None,
-                    initializer=None, regularizer=None, trainable=True,
-                    variable_getter=vs.get_variable):
-    """Adds a new variable to the layer.
+  def add_variable(self, name, shape, dtype=None,
+                   initializer=None, regularizer=None, trainable=True):
+    """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
       name: variable name.
@@ -231,7 +352,6 @@ class _Layer(object):
       trainable: whether the variable should be part of the layer's
         "trainable_variables" (e.g. variables, biases)
         or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
-      variable_getter: The getter to use for TensorFlow variables.
 
     Returns:
       The created variable.
@@ -239,38 +359,43 @@ class _Layer(object):
     if dtype is None:
       dtype = self.dtype
     existing_variables = set(tf_variables.global_variables())
-    variable = variable_getter(name,
-                               shape=shape,
-                               initializer=initializer,
-                               dtype=dtype,
-                               trainable=trainable and self.trainable)
-    # TODO(sguada) fix name = variable.op.name
-    if variable in existing_variables:
-      return variable
-    if regularizer:
-      # To match the behavior of tf.get_variable(), we only
-      # apply regularization if the variable is newly created.
-      if isinstance(variable, tf_variables.PartitionedVariable):
-        for v in variable:
-          with ops.colocate_with(v.op):
-            with ops.name_scope(name + '/Regularizer'):
-              regularization = regularizer(v)
-          if regularization is not None:
-            self._losses.append(regularization)
-            _add_elements_to_collection(
-                regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
-      else:
-        with ops.colocate_with(variable.op):
-          with ops.name_scope(name + '/Regularizer'):
-            regularization = regularizer(variable)
-        if regularization is not None:
-          self._losses.append(regularization)
-          _add_elements_to_collection(
-              regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
+
+    self._set_scope(None)
+
+    with vs.variable_scope(self._scope,
+                           reuse=self.built or self._reuse) as scope:
+      with ops.name_scope(scope.original_name_scope):
+        variable = vs.get_variable(name,
+                                   shape=shape,
+                                   initializer=initializer,
+                                   dtype=dtypes.as_dtype(dtype),
+                                   trainable=trainable and self.trainable)
+        if variable in existing_variables:
+          return variable
+        if regularizer:
+          # To match the behavior of tf.get_variable(), we only
+          # apply regularization if the variable is newly created.
+          if isinstance(variable, tf_variables.PartitionedVariable):
+            for v in variable:
+              with ops.colocate_with(v.op):
+                with ops.name_scope(name + '/Regularizer'):
+                  regularization = regularizer(v)
+              if regularization is not None:
+                self.add_loss(regularization)
+                _add_elements_to_collection(
+                    regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
+          else:
+            with ops.colocate_with(variable.op):
+              with ops.name_scope(name + '/Regularizer'):
+                regularization = regularizer(variable)
+            if regularization is not None:
+              self.add_loss(regularization)
+              _add_elements_to_collection(
+                  regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
     if trainable:
-      self._trainable_variables.append(variable)
+      self._trainable_weights.append(variable)
     else:
-      self._non_trainable_variables.append(variable)
+      self._non_trainable_weights.append(variable)
     return variable
 
   def __call__(self, inputs, *args, **kwargs):
@@ -280,46 +405,25 @@ class _Layer(object):
       inputs: input tensor(s).
       *args: additional positional arguments to be passed to `self.call`.
       **kwargs: additional keyword arguments to be passed to `self.call`.
-        **Note**, the kwarg 'scope' is reserved for use by the Layer.
-
+        **Note**: kwarg `scope` is reserved for use by the layer.
     Returns:
       Output tensor(s).
     """
-    scope = kwargs.pop('scope', None)
-
-    # Define a custom getter to override tf.get_variable when creating layer
-    # variables. The current custom getter is nested by the variable scope.
-    def variable_getter(getter, name, shape, dtype=None, initializer=None,
-                        regularizer=None, trainable=True, **getter_kwargs):
-      return self._add_variable(
-          name, shape, initializer=initializer, regularizer=regularizer,
-          dtype=dtype, trainable=trainable,
-          variable_getter=functools.partial(getter, **getter_kwargs))
-
-    if not self._built and self._scope is None:
-      # If constructed with _scope=None, lazy setting of scope.
-      if self._reuse:
-        self._scope = next(vs.variable_scope(
-            scope if scope is not None else self._base_name).gen)
-      else:
-        self._scope = next(vs.variable_scope(
-            scope, default_name=self._base_name).gen)
-      self._name = self._scope.name
+    self._set_scope(kwargs.pop('scope', None))
 
-    # Build (if necessary) and call the layer, inside a variable
-    # scope.
-    with vs.variable_scope(self._scope,
-                           reuse=True if self._built else self._reuse,
-                           custom_getter=variable_getter) as scope:
-      # Ensure the Layer, if being reused, is working with inputs from
-      # the same graph as where it was created.
-      try:
-        ops._get_graph_from_inputs(nest.flatten(inputs), graph=self.graph)  # pylint: disable=protected-access
-      except ValueError as e:
-        raise ValueError("Inputs' and Layer's graphs are not the same: %s" % e)
+    # Ensure the Layer, if being reused, is working with inputs from
+    # the same graph as where it was created.
+    try:
+      ops._get_graph_from_inputs(nest.flatten(inputs), graph=self.graph)  # pylint: disable=protected-access
+    except ValueError as e:
+      raise ValueError('Input graph and Layer graph are not the same: %s' % e)
 
+    with vs.variable_scope(self._scope,
+                           reuse=self.built or self._reuse) as scope:
       with ops.name_scope(scope.original_name_scope):
         if not self.built:
+          # Check input assumptions set before layer building, e.g. input rank.
+          self._assert_input_compatibility(inputs)
           input_list = [
               ops.convert_to_tensor(x, name='input')
               for x in nest.flatten(inputs)]
@@ -328,7 +432,10 @@ class _Layer(object):
             self.build(input_shapes[0])
           else:
             self.build(input_shapes)
-          self._built = True
+        if 'scope' in tf_inspect.getargspec(self.call).args:
+          kwargs['scope'] = scope
+        # Check input assumptions set after layer building, e.g. input shape.
+        self._assert_input_compatibility(inputs)
         outputs = self.call(inputs, *args, **kwargs)
 
         # Apply activity regularization.
@@ -339,12 +446,13 @@ class _Layer(object):
           for output in output_list:
             with ops.name_scope('ActivityRegularizer'):
               activity_regularization = self.activity_regularizer(output)
-            self._losses.append(activity_regularization)
+            self.add_loss(activity_regularization)
             _add_elements_to_collection(
                 activity_regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
 
     # Update global default collections.
     _add_elements_to_collection(self.updates, ops.GraphKeys.UPDATE_OPS)
+    self.built = True
     return outputs
 
   @property
@@ -366,19 +474,153 @@ class _Layer(object):
         setattr(result, k, copy.deepcopy(v, memo))
     return result
 
-  def apply(self, inputs, **kwargs):
+  def apply(self, inputs, *args, **kwargs):
     """Apply the layer on a input.
 
     This simply wraps `self.__call__`.
 
     Arguments:
       inputs: Input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
       **kwargs: additional keyword arguments to be passed to `self.call`.
 
     Returns:
       Output tensor(s).
     """
-    return self.__call__(inputs, **kwargs)
+    return self.__call__(inputs, *args, **kwargs)
+
+  def _assert_input_compatibility(self, inputs):
+    """Checks compatibility between the layer and provided inputs.
+
+    This checks that the tensor(s) `inputs` verify the input assumptions
+    of the layer (if any). If not, a clear and actional exception gets raised.
+
+    Arguments:
+        inputs: input tensor or list of input tensors.
+
+    Raises:
+        ValueError: in case of mismatch between
+            the provided inputs and the expectations of the layer.
+    """
+    if not self.input_spec:
+      return
+    if not isinstance(self.input_spec, (list, tuple)):
+      input_spec = _to_list(self.input_spec)
+    else:
+      input_spec = self.input_spec
+    inputs = _to_list(inputs)
+    if len(inputs) != len(input_spec):
+      raise ValueError('Layer ' + self.name + ' expects ' +
+                       str(len(input_spec)) + ' inputs, '
+                       'but it received ' + str(len(inputs)) +
+                       ' input tensors. Inputs received: ' + str(inputs))
+    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+      if spec is None:
+        continue
+
+      if (spec.ndim is not None or
+          spec.min_ndim is not None or
+          spec.max_ndim is not None):
+        if x.get_shape().ndims is None:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'its rank is undefined, by the layer requires a '
+                           'defined rank.')
+
+      # Check ndim.
+      if spec.ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim != spec.ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected ndim=' + str(spec.ndim) + ', found ndim='
+                           + str(ndim) + '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      if spec.max_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim > spec.max_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected max_ndim=' + str(spec.max_ndim) +
+                           ', found ndim=' + str(ndim))
+      if spec.min_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim < spec.min_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           ': expected min_ndim=' + str(spec.min_ndim) +
+                           ', found ndim=' + str(ndim) +
+                           '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      # Check dtype.
+      if spec.dtype is not None:
+        if x.dtype != spec.dtype:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected dtype=' + str(spec.dtype) +
+                           ', found dtype=' + str(x.dtype))
+      # Check specific shape axes.
+      if spec.axes:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for axis, value in spec.axes.items():
+            if hasattr(value, 'value'):
+              value = value.value
+            if value is not None and shape[int(axis)] not in {value, None}:
+              raise ValueError(
+                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
+                  ' incompatible with the layer: expected axis ' + str(axis) +
+                  ' of input shape to have value ' + str(value) +
+                  ' but received input with shape ' + str(shape))
+      # Check shape.
+      if spec.shape is not None:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for spec_dim, dim in zip(spec.shape, shape):
+            if spec_dim is not None and dim is not None:
+              if spec_dim != dim:
+                raise ValueError('Input ' + str(input_index) +
+                                 ' is incompatible with layer ' + self.name +
+                                 ': expected shape=' + str(spec.shape) +
+                                 ', found shape=' + str(shape))
+
+
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected DataType of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
 
 
 def _to_snake_case(name):
@@ -417,3 +659,39 @@ def _add_elements_to_collection(elements, collections):
     for element in elements:
       if element not in collection_set:
         collection.append(element)
+
+
+def _object_list_uid(object_list):
+  object_list = _to_list(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def _unique_layer_name(name):
+  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
+
+  Arguments:
+    name: String name to make unique.
+
+  Returns:
+    Unique string name.
+
+  Example:
+
+  ```
+    >>> _unique_layer_name('dense')
+    dense_1
+    >>> _unique_layer_name('dense')
+    dense_2
+  ```
+  """
+  layer_name_uids_collection = ops.get_collection('LAYER_NAME_UIDS')
+  if not layer_name_uids_collection:
+    layer_name_uids = {}
+    ops.add_to_collection('LAYER_NAME_UIDS', layer_name_uids)
+  else:
+    layer_name_uids = layer_name_uids_collection[0]
+  if name not in layer_name_uids:
+    layer_name_uids[name] = 1
+  else:
+    layer_name_uids[name] += 1
+  return name + '_' + str(layer_name_uids[name])
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 83ae1b6e83588f4b1466c33d7b85342b224de9cb..81fbe5fbf70e95637fa5687bd5a146b1d1bac4b4 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -22,6 +22,7 @@ import copy
 
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import base as base_layers
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -32,26 +33,24 @@ from tensorflow.python.platform import test
 class BaseLayerTest(test.TestCase):
 
   def testLayerProperties(self):
-    layer = base_layers._Layer(name='my_layer')
+    layer = base_layers.Layer(name='my_layer')
     self.assertListEqual(layer.variables, [])
     self.assertListEqual(layer.trainable_variables, [])
     self.assertListEqual(layer.non_trainable_variables, [])
     self.assertListEqual(layer.updates, [])
     self.assertListEqual(layer.losses, [])
     self.assertEqual(layer.built, False)
-    with self.assertRaisesRegexp(ValueError, 'not been used yet'):
-      _ = layer.name
-    layer = base_layers._Layer(name='my_layer', trainable=False)
+    layer = base_layers.Layer(name='my_layer', trainable=False)
     self.assertEqual(layer.trainable, False)
 
   def testAddWeight(self):
     with self.test_session():
-      layer = base_layers._Layer(name='my_layer')
+      layer = base_layers.Layer(name='my_layer')
 
       # Test basic variable creation.
-      variable = layer._add_variable(
+      variable = layer.add_variable(
           'my_var', [2, 2], initializer=init_ops.zeros_initializer())
-      self.assertEqual(variable.name, 'my_var:0')
+      self.assertEqual(variable.name, 'my_layer/my_var:0')
       self.assertListEqual(layer.variables, [variable])
       self.assertListEqual(layer.trainable_variables, [variable])
       self.assertListEqual(layer.non_trainable_variables, [])
@@ -60,8 +59,8 @@ class BaseLayerTest(test.TestCase):
           ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
 
       # Test non-trainable variable creation.
-      # layer._add_variable should work even outside `build` and `call`.
-      variable_2 = layer._add_variable(
+      # layer.add_variable should work even outside `build` and `call`.
+      variable_2 = layer.add_variable(
           'non_trainable_var', [2, 2],
           initializer=init_ops.zeros_initializer(),
           trainable=False)
@@ -73,7 +72,7 @@ class BaseLayerTest(test.TestCase):
 
       # Test with regularizer.
       regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
-      variable = layer._add_variable(
+      variable = layer.add_variable(
           'reg_var', [2, 2],
           initializer=init_ops.zeros_initializer(),
           regularizer=regularizer)
@@ -81,81 +80,70 @@ class BaseLayerTest(test.TestCase):
 
   def testGetVariable(self):
     with self.test_session():
-      # From inside `build` and `call` it should be possible to use
-      # either tf.get_variable
 
-      class MyLayer(base_layers._Layer):
+      class MyLayer(base_layers.Layer):
 
         def build(self, input_shape):
-          self.my_var = variable_scope.get_variable(
+          self.my_var = self.add_variable(
               'my_var', [2, 2], initializer=init_ops.zeros_initializer())
 
         def call(self, inputs):
-          variable_scope.get_variable(
-              'my_call_var', [2, 2], initializer=init_ops.zeros_initializer())
-          return inputs
+          return inputs * 2
 
       layer = MyLayer(name='my_layer')
       inputs = random_ops.random_uniform((5,), seed=1)
       layer.apply(inputs)
       layer.apply(inputs)
       self.assertListEqual([v.name for v in layer.variables],
-                           ['my_layer/my_var:0', 'my_layer/my_call_var:0'])
+                           ['my_layer/my_var:0'])
 
       # Creating a layer with no scope leads to lazy construction of
       # the scope at apply() time.  It uses scope "<current scope>/base_name"
       lazy_layer = MyLayer(_reuse=True)
       with variable_scope.variable_scope('new_scope'):
-        # This should attempt to reuse 'my_var' and 'my_call_var' in 'new_scope'
+        # This should attempt to reuse 'my_var' in 'new_scope'
         with self.assertRaisesRegexp(
             ValueError, r'new_scope/my_layer/my_var does not exist'):
           lazy_layer.apply(inputs)
         with variable_scope.variable_scope('my_layer'):
           variable_scope.get_variable('my_var', [2, 2])
-        with self.assertRaisesRegexp(
-            ValueError, r'new_scope/my_layer/my_call_var does not exist'):
-          lazy_layer.apply(inputs)
-        with variable_scope.variable_scope('my_layer'):
-          variable_scope.get_variable('my_call_var', [2, 2])
+
         # Smoke test: it runs.
         lazy_layer.apply(inputs)
         # The variables were created outside of the Layer, and
         # reuse=True, so the Layer does not own them and they are not
         # stored in its collection.
         self.assertListEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer.name, 'new_scope/my_layer')
+        self.assertEqual(lazy_layer._scope.name, 'new_scope/my_layer')
 
       # Creating a layer with no scope leads to lazy construction of
       # the scope at apply() time.  If 'scope' argument is passed to
       # apply(), it uses that scope when accessing variables.
       lazy_layer = MyLayer(_reuse=True)
       with variable_scope.variable_scope('new_scope') as new_scope:
-        # This should attempt to reuse 'my_var' and 'my_call_var' in 'new_scope'
+        # This should attempt to reuse 'my_var' in 'new_scope'
         with self.assertRaisesRegexp(
             ValueError, r'new_scope/my_var does not exist'):
           lazy_layer.apply(inputs, scope=new_scope)
         variable_scope.get_variable('my_var', [2, 2])
-        with self.assertRaisesRegexp(
-            ValueError, r'new_scope/my_call_var does not exist'):
-          lazy_layer.apply(inputs, scope=new_scope)
-        variable_scope.get_variable('my_call_var', [2, 2])
+
         # Smoke test: it runs.
         lazy_layer.apply(inputs, scope=new_scope)
         # The variables were created outside of the Layer, and
         # reuse=True, so the Layer does not own them and they are not
         # stored in its collection.
         self.assertListEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer.name, 'new_scope')
+        self.assertEqual(lazy_layer._scope.name, 'new_scope')
 
       with ops.Graph().as_default():
         inputs_ng = random_ops.random_uniform((5,), seed=1)
         with self.assertRaisesRegexp(ValueError,
-                                     r'graphs are not the same'):
+                                     r'graph are not the same'):
           layer.apply(inputs_ng)
 
   def testCall(self):
 
-    class MyLayer(base_layers._Layer):
+    class MyLayer(base_layers.Layer):
 
       def call(self, inputs):
         return math_ops.square(inputs)
@@ -166,9 +154,39 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer.built, True)
     self.assertEqual(outputs.op.name, 'my_layer/Square')
 
+  def testFirstCallCanCreateVariablesButSecondCanNotWhenBuildEmpty(self):
+
+    class MyLayer(base_layers.Layer):
+
+      def build(self, _):
+        # Do not mark the layer as built.
+        pass
+
+      def call(self, inputs):
+        self.my_var = self.add_variable('my_var', [2, 2])
+        if self.built:
+          # Skip creating on the first call; try to create after it's
+          # built.  This is expected to fail.
+          self.add_variable('this_will_break_on_second_call', [2, 2])
+        return inputs + math_ops.square(self.my_var)
+
+    layer = MyLayer(name='my_layer')
+    inputs = random_ops.random_uniform((2,), seed=1)
+    outputs = layer.apply(inputs)
+    self.assertEqual(layer.built, True)
+    self.assertEqual(outputs.op.name, 'my_layer/add')
+    self.assertListEqual(
+        [v.name for v in layer.variables], ['my_layer/my_var:0'])
+    with self.assertRaisesRegexp(ValueError,
+                                 'my_layer/this_will_break_on_second_call'):
+      layer.apply(inputs)
+    # The list of variables hasn't changed.
+    self.assertListEqual(
+        [v.name for v in layer.variables], ['my_layer/my_var:0'])
+
   def testDeepCopy(self):
 
-    class MyLayer(base_layers._Layer):
+    class MyLayer(base_layers.Layer):
 
       def call(self, inputs):
         return math_ops.square(inputs)
@@ -184,9 +202,9 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer_copy._scope.name, layer._scope.name)
     self.assertEqual(layer_copy._graph, layer._graph)
 
-  def testNaming(self):
+  def testScopeNaming(self):
 
-    class PrivateLayer(base_layers._Layer):
+    class PrivateLayer(base_layers.Layer):
 
       def call(self, inputs):
         return None
@@ -194,41 +212,189 @@ class BaseLayerTest(test.TestCase):
     inputs = random_ops.random_uniform((5,))
     default_layer = PrivateLayer()
     _ = default_layer.apply(inputs)
-    self.assertEqual(default_layer.name, 'private_layer')
+    self.assertEqual(default_layer._scope.name, 'private_layer')
     default_layer1 = PrivateLayer()
     default_layer1.apply(inputs)
-    self.assertEqual(default_layer1.name, 'private_layer_1')
+    self.assertEqual(default_layer1._scope.name, 'private_layer_1')
     my_layer = PrivateLayer(name='my_layer')
     my_layer.apply(inputs)
-    self.assertEqual(my_layer.name, 'my_layer')
+    self.assertEqual(my_layer._scope.name, 'my_layer')
     my_layer1 = PrivateLayer(name='my_layer')
     my_layer1.apply(inputs)
-    self.assertEqual(my_layer1.name, 'my_layer_1')
+    self.assertEqual(my_layer1._scope.name, 'my_layer_1')
     my_layer2 = PrivateLayer(name='my_layer')
     my_layer2.apply(inputs)
-    self.assertEqual(my_layer2.name, 'my_layer_2')
+    self.assertEqual(my_layer2._scope.name, 'my_layer_2')
     # Name scope shouldn't affect names.
     with ops.name_scope('some_name_scope'):
       default_layer2 = PrivateLayer()
       default_layer2.apply(inputs)
-      self.assertEqual(default_layer2.name, 'private_layer_2')
+      self.assertEqual(default_layer2._scope.name, 'private_layer_2')
       my_layer3 = PrivateLayer(name='my_layer')
       my_layer3.apply(inputs)
-      self.assertEqual(my_layer3.name, 'my_layer_3')
+      self.assertEqual(my_layer3._scope.name, 'my_layer_3')
       other_layer = PrivateLayer(name='other_layer')
       other_layer.apply(inputs)
-      self.assertEqual(other_layer.name, 'other_layer')
-    # Variable scope gets added to names.
+      self.assertEqual(other_layer._scope.name, 'other_layer')
+    # Variable scope gets added to scope names.
     with variable_scope.variable_scope('var_scope'):
       default_layer_scoped = PrivateLayer()
       default_layer_scoped.apply(inputs)
-      self.assertEqual(default_layer_scoped.name, 'var_scope/private_layer')
+      self.assertEqual(default_layer_scoped._scope.name,
+                       'var_scope/private_layer')
       my_layer_scoped = PrivateLayer(name='my_layer')
       my_layer_scoped.apply(inputs)
-      self.assertEqual(my_layer_scoped.name, 'var_scope/my_layer')
+      self.assertEqual(my_layer_scoped._scope.name, 'var_scope/my_layer')
       my_layer_scoped1 = PrivateLayer(name='my_layer')
       my_layer_scoped1.apply(inputs)
-      self.assertEqual(my_layer_scoped1.name, 'var_scope/my_layer_1')
+      self.assertEqual(my_layer_scoped1._scope.name, 'var_scope/my_layer_1')
+
+  def testInputSpecNdimCheck(self):
+
+    class CustomerLayer(base_layers.Layer):
+
+      def __init__(self):
+        super(CustomerLayer, self).__init__()
+        self.input_spec = base_layers.InputSpec(ndim=2)
+
+      def call(self, inputs):
+        return inputs
+
+    layer = CustomerLayer()
+    with self.assertRaisesRegexp(ValueError,
+                                 r'requires a defined rank'):
+      layer.apply(array_ops.placeholder('int32'))
+
+    with self.assertRaisesRegexp(ValueError,
+                                 r'expected ndim=2'):
+      layer.apply(array_ops.placeholder('int32', shape=(None,)))
+
+    # Works
+    layer.apply(array_ops.placeholder('int32', shape=(None, None)))
+
+  def testInputSpecMinNdimCheck(self):
+
+    class CustomerLayer(base_layers.Layer):
+
+      def __init__(self):
+        super(CustomerLayer, self).__init__()
+        self.input_spec = base_layers.InputSpec(min_ndim=2)
+
+      def call(self, inputs):
+        return inputs
+
+    layer = CustomerLayer()
+    with self.assertRaisesRegexp(ValueError,
+                                 r'requires a defined rank'):
+      layer.apply(array_ops.placeholder('int32'))
+
+    with self.assertRaisesRegexp(ValueError,
+                                 r'expected min_ndim=2'):
+      layer.apply(array_ops.placeholder('int32', shape=(None,)))
+
+    # Works
+    layer.apply(array_ops.placeholder('int32', shape=(None, None)))
+    layer.apply(array_ops.placeholder('int32', shape=(None, None, None)))
+
+  def testInputSpecMaxNdimCheck(self):
+
+    class CustomerLayer(base_layers.Layer):
+
+      def __init__(self):
+        super(CustomerLayer, self).__init__()
+        self.input_spec = base_layers.InputSpec(max_ndim=2)
+
+      def call(self, inputs):
+        return inputs
+
+    layer = CustomerLayer()
+    with self.assertRaisesRegexp(ValueError,
+                                 r'requires a defined rank'):
+      layer.apply(array_ops.placeholder('int32'))
+
+    with self.assertRaisesRegexp(ValueError,
+                                 r'expected max_ndim=2'):
+      layer.apply(array_ops.placeholder('int32', shape=(None, None, None)))
+
+    # Works
+    layer.apply(array_ops.placeholder('int32', shape=(None, None)))
+    layer.apply(array_ops.placeholder('int32', shape=(None,)))
+
+  def testInputSpecDtypeCheck(self):
+
+    class CustomerLayer(base_layers.Layer):
+
+      def __init__(self):
+        super(CustomerLayer, self).__init__()
+        self.input_spec = base_layers.InputSpec(dtype='float32')
+
+      def call(self, inputs):
+        return inputs
+
+    layer = CustomerLayer()
+    with self.assertRaisesRegexp(ValueError,
+                                 r'expected dtype=float32'):
+      layer.apply(array_ops.placeholder('int32'))
+
+    # Works
+    layer.apply(array_ops.placeholder('float32', shape=(None, None)))
+
+  def testInputSpecAxesCheck(self):
+
+    class CustomerLayer(base_layers.Layer):
+
+      def __init__(self):
+        super(CustomerLayer, self).__init__()
+        self.input_spec = base_layers.InputSpec(axes={-1: 2})
+
+      def call(self, inputs):
+        return inputs
+
+    layer = CustomerLayer()
+    with self.assertRaisesRegexp(ValueError,
+                                 r'expected axis'):
+      layer.apply(array_ops.placeholder('int32', shape=(None, 3)))
+
+    # Works
+    layer.apply(array_ops.placeholder('int32', shape=(None, None, 2)))
+    layer.apply(array_ops.placeholder('int32', shape=(None, 2)))
+
+  def testInputSpecShapeCheck(self):
+
+    class CustomerLayer(base_layers.Layer):
+
+      def __init__(self):
+        super(CustomerLayer, self).__init__()
+        self.input_spec = base_layers.InputSpec(shape=(None, 3))
+
+      def call(self, inputs):
+        return inputs
+
+    layer = CustomerLayer()
+    with self.assertRaisesRegexp(ValueError,
+                                 r'expected shape'):
+      layer.apply(array_ops.placeholder('int32', shape=(None, 2)))
+
+    # Works
+    layer.apply(array_ops.placeholder('int32', shape=(None, 3)))
+    layer.apply(array_ops.placeholder('int32', shape=(2, 3)))
+
+  def testNoInputSpec(self):
+
+    class CustomerLayer(base_layers.Layer):
+
+      def __init__(self):
+        super(CustomerLayer, self).__init__()
+        self.input_spec = None
+
+      def call(self, inputs):
+        return inputs
+
+    layer = CustomerLayer()
+
+    # Works
+    layer.apply(array_ops.placeholder('int32'))
+    layer.apply(array_ops.placeholder('int32', shape=(2, 3)))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 3b8959e2106af660fe2f5536fecd180394441e23..b61168695a4c1170067c0aba83169fae9f6b1487 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -32,12 +32,13 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import standard_ops
 from tensorflow.python.ops import variable_scope as vs
-
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
+from tensorflow.python import framework
 
 
-class _Conv(base._Layer):  # pylint: disable=protected-access
+class _Conv(base.Layer):
   """Abstract nD convolution layer (private, used as implementation base).
 
   This layer creates a convolution kernel that is convolved
@@ -114,12 +115,10 @@ class _Conv(base._Layer):  # pylint: disable=protected-access
     self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
     self.activity_regularizer = activity_regularizer
+    self.input_spec = base.InputSpec(ndim=self.rank + 2)
 
   def build(self, input_shape):
-    if len(input_shape) != self.rank + 2:
-      raise ValueError('Inputs should have rank ' +
-                       str(self.rank + 2) +
-                       '. Received input shape: ' + str(input_shape))
+    input_shape = tensor_shape.TensorShape(input_shape)
     if self.data_format == 'channels_first':
       channel_axis = 1
     else:
@@ -130,37 +129,71 @@ class _Conv(base._Layer):  # pylint: disable=protected-access
     input_dim = input_shape[channel_axis].value
     kernel_shape = self.kernel_size + (input_dim, self.filters)
 
-    self.kernel = vs.get_variable('kernel',
-                                  shape=kernel_shape,
-                                  initializer=self.kernel_initializer,
-                                  regularizer=self.kernel_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=(self.filters,),
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     else:
       self.bias = None
+    self.input_spec = base.InputSpec(ndim=self.rank + 2,
+                                     axes={channel_axis: input_dim})
+    self.built = True
 
   def call(self, inputs):
+    if (self.data_format == 'channels_first' and
+        not framework.test_util.gpu_device_name()):
+      # `nn.convolution` is not implemented on CPU for `channels_first` format.
+      # In cases where we are most likely running on CPU using `channels_first`,
+      # we reshape the inputs to use `channels_last` (and reshape them back
+      # afterwards). This is a temporary fix; a better solution would be a fix
+      # at the op level.
+      # TODO(chollet): remove this when `nn.convolution` is feature-complete.
+      data_format = 'channels_last'
+      if self.rank == 1:
+        inputs = array_ops.transpose(inputs, (0, 2, 1))
+      elif self.rank == 2:
+        inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
+      elif self.rank == 3:
+        inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
+    else:
+      data_format = self.data_format
     outputs = nn.convolution(
         input=inputs,
         filter=self.kernel,
         dilation_rate=self.dilation_rate,
         strides=self.strides,
         padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, self.rank + 2))
+        data_format=utils.convert_data_format(data_format,
+                                              self.rank + 2))
+    if (self.data_format == 'channels_first' and
+        not framework.test_util.gpu_device_name()):
+      if self.rank == 1:
+        outputs = array_ops.transpose(outputs, (0, 2, 1))
+      elif self.rank == 2:
+        outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
+      elif self.rank == 3:
+        outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+
     if self.bias is not None:
-      if self.rank != 2 and self.data_format == 'channels_first':
-        # bias_add does not support channels_first for non-4D inputs.
+      if self.data_format == 'channels_first':
+        # bias_add only supports NHWC.
+        # TODO(fchollet): remove this when `bias_add` is feature-complete.
         if self.rank == 1:
           bias = array_ops.reshape(self.bias, (1, self.filters, 1))
           outputs += bias
-        elif self.rank == 3:
+        if self.rank == 2:
+          bias = array_ops.reshape(self.bias, (1, self.filters, 1, 1))
+          outputs += bias
+        if self.rank == 3:
           # As of Mar 2017, direct addition is significantly slower than
           # bias_add when computing gradients. To use bias_add, we collapse Z
           # and Y into a single dimension to obtain a 4D input tensor.
@@ -186,6 +219,35 @@ class _Conv(base._Layer):  # pylint: disable=protected-access
       return self.activation(outputs)
     return outputs
 
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
 
 class Conv1D(_Conv):
   """1D convolution layer (e.g. temporal convolution).
@@ -806,6 +868,7 @@ class SeparableConv2D(Conv2D):
                        '`SeparableConv2D` '
                        'should be defined. Found `None`.')
     input_dim = int(input_shape[channel_axis])
+    self.input_spec = base.InputSpec(ndim=4, axes={channel_axis: input_dim})
     depthwise_kernel_shape = (self.kernel_size[0],
                               self.kernel_size[1],
                               input_dim,
@@ -814,29 +877,30 @@ class SeparableConv2D(Conv2D):
                               self.depth_multiplier * input_dim,
                               self.filters)
 
-    self.depthwise_kernel = vs.get_variable(
-        'depthwise_kernel',
+    self.depthwise_kernel = self.add_variable(
+        name='depthwise_kernel',
         shape=depthwise_kernel_shape,
         initializer=self.depthwise_initializer,
         regularizer=self.depthwise_regularizer,
         trainable=True,
         dtype=self.dtype)
-    self.pointwise_kernel = vs.get_variable(
-        'pointwise_kernel',
+    self.pointwise_kernel = self.add_variable(
+        name='pointwise_kernel',
         shape=pointwise_kernel_shape,
         initializer=self.pointwise_initializer,
         regularizer=self.pointwise_regularizer,
         trainable=True,
         dtype=self.dtype)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=(self.filters,),
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     if self.data_format == 'channels_first':
@@ -866,6 +930,26 @@ class SeparableConv2D(Conv2D):
       return self.activation(outputs)
     return outputs
 
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    else:
+      rows = input_shape[1]
+      cols = input_shape[2]
+
+    rows = utils.conv_output_length(rows, self.kernel_size[0],
+                                    self.padding, self.strides[0])
+    cols = utils.conv_output_length(cols, self.kernel_size[1],
+                                    self.padding, self.strides[1])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], self.filters, rows, cols])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, self.filters])
+
 
 def separable_conv2d(inputs,
                      filters,
@@ -970,7 +1054,7 @@ def separable_conv2d(inputs,
 
 
 class Conv2DTranspose(Conv2D):
-  """Transposed convolution layer (sometimes called Deconvolution).
+  """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
   The need for transposed convolutions generally arises
   from the desire to use a transformation going in the opposite direction
@@ -1039,6 +1123,7 @@ class Conv2DTranspose(Conv2D):
         trainable=trainable,
         name=name,
         **kwargs)
+    self.input_spec = base.InputSpec(ndim=4)
 
   def build(self, input_shape):
     if len(input_shape) != 4:
@@ -1053,23 +1138,25 @@ class Conv2DTranspose(Conv2D):
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = input_shape[channel_axis]
+    self.input_spec = base.InputSpec(ndim=4, axes={channel_axis: input_dim})
     kernel_shape = self.kernel_size + (self.filters, input_dim)
 
-    self.kernel = vs.get_variable('kernel',
-                                  shape=kernel_shape,
-                                  initializer=self.kernel_initializer,
-                                  regularizer=self.kernel_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=(self.filters,),
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  trainable=True,
-                                  dtype=self.dtype)
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    trainable=True,
+                                    dtype=self.dtype)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     inputs_shape = array_ops.shape(inputs)
@@ -1083,20 +1170,15 @@ class Conv2DTranspose(Conv2D):
     kernel_h, kernel_w = self.kernel_size
     stride_h, stride_w = self.strides
 
-    def get_deconv_dim(dim_size, stride_size, kernel_size, padding):
-      if isinstance(dim_size, ops.Tensor):
-        dim_size = math_ops.multiply(dim_size, stride_size)
-      elif dim_size is not None:
-        dim_size *= stride_size
-
-      if padding == 'valid' and dim_size is not None:
-        dim_size += max(kernel_size - stride_size, 0)
-      return dim_size
-
     # Infer the dynamic output shape:
-    out_height = get_deconv_dim(height, stride_h, kernel_h, self.padding)
-    out_width = get_deconv_dim(width, stride_w, kernel_w, self.padding)
-
+    out_height = utils.deconv_output_length(height,
+                                            kernel_h,
+                                            self.padding,
+                                            stride_h)
+    out_width = utils.deconv_output_length(width,
+                                           kernel_w,
+                                           self.padding,
+                                           stride_w)
     if self.data_format == 'channels_first':
       output_shape = (batch_size, self.filters, out_height, out_width)
       strides = (1, 1, stride_h, stride_w)
@@ -1116,10 +1198,14 @@ class Conv2DTranspose(Conv2D):
     # Infer the static output shape:
     out_shape = inputs.get_shape().as_list()
     out_shape[c_axis] = self.filters
-    out_shape[h_axis] = get_deconv_dim(
-        out_shape[h_axis], stride_h, kernel_h, self.padding)
-    out_shape[w_axis] = get_deconv_dim(
-        out_shape[w_axis], stride_w, kernel_w, self.padding)
+    out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
+                                                   kernel_h,
+                                                   self.padding,
+                                                   stride_h)
+    out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
+                                                   kernel_w,
+                                                   self.padding,
+                                                   stride_w)
     outputs.set_shape(out_shape)
 
     if self.bias:
@@ -1132,6 +1218,24 @@ class Conv2DTranspose(Conv2D):
       return self.activation(outputs)
     return outputs
 
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[h_axis] = utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
+
 
 def conv2d_transpose(inputs,
                      filters,
@@ -1149,7 +1253,7 @@ def conv2d_transpose(inputs,
                      trainable=True,
                      name=None,
                      reuse=None):
-  """Transposed convolution layer (sometimes called Deconvolution).
+  """Functional interface for transposed 2D convolution layer.
 
   The need for transposed convolutions generally arises
   from the desire to use a transformation going in the opposite direction
@@ -1169,6 +1273,264 @@ def conv2d_transpose(inputs,
       of the convolution. Can be a single integer to specify the same value for
       all spatial dimensions.
     padding: one of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+    activation: Activation function. Set it to `None` to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If `None`, then no
+      bias will be applied.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Returns:
+    Output tensor.
+  """
+  layer = Conv2DTranspose(
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      activation=activation,
+      use_bias=use_bias,
+      kernel_initializer=kernel_initializer,
+      bias_initializer=bias_initializer,
+      kernel_regularizer=kernel_regularizer,
+      bias_regularizer=bias_regularizer,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      name=name,
+      _reuse=reuse,
+      _scope=name)
+  return layer.apply(inputs)
+
+
+class Conv3DTranspose(Conv3D):
+  """Transposed 3D convolution layer (sometimes called 3D Deconvolution).
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 3 integers, specifying the
+      depth, height and width of the 3D convolution window.
+      Can be a single integer to specify the same value for all spatial
+      dimensions.
+    strides: An integer or tuple/list of 3 integers, specifying the strides
+      of the convolution along the depth, height and width.
+      Can be a single integer to specify the same value for all spatial
+      dimensions.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, channels, depth, height, width)`.
+    activation: Activation function. Set it to `None` to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If `None`, then no
+      bias will be applied.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1, 1),
+               padding='valid',
+               data_format='channels_last',
+               activation=None,
+               use_bias=True,
+               kernel_initializer=None,
+               bias_initializer=init_ops.zeros_initializer(),
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(Conv3DTranspose, self).__init__(
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        name=name,
+        **kwargs)
+
+  def build(self, input_shape):
+    if len(input_shape) != 5:
+      raise ValueError('Inputs should have rank 5, received input shape:',
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined, found None: ' + str(input_shape))
+    input_dim = input_shape[channel_axis]
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+
+    self.kernel = self.add_variable(
+        'kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(
+          'bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          trainable=True,
+          dtype=self.dtype)
+    else:
+      self.bias = None
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    depth = inputs_shape[d_axis]
+    height = inputs_shape[h_axis]
+    width = inputs_shape[w_axis]
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_depth = utils.deconv_output_length(depth,
+                                           kernel_d,
+                                           self.padding,
+                                           stride_d)
+    out_height = utils.deconv_output_length(height,
+                                            kernel_h,
+                                            self.padding,
+                                            stride_h)
+    out_width = utils.deconv_output_length(width,
+                                           kernel_w,
+                                           self.padding,
+                                           stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_depth, out_height,
+                      out_width)
+      strides = (1, 1, stride_d, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_depth, out_height, out_width,
+                      self.filters)
+      strides = (1, stride_d, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv3d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        data_format=utils.convert_data_format(self.data_format, ndim=5),
+        padding=self.padding.upper())
+
+    # Infer the static output shape:
+    out_shape = inputs.get_shape().as_list()
+    out_shape[c_axis] = self.filters
+    out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis],
+                                                   kernel_d,
+                                                   self.padding,
+                                                   stride_d)
+    out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
+                                                   kernel_h,
+                                                   self.padding,
+                                                   stride_h)
+    out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
+                                                   kernel_w,
+                                                   self.padding,
+                                                   stride_w)
+    outputs.set_shape(out_shape)
+
+    if self.bias:
+      outputs_shape = outputs.shape.as_list()
+      if self.data_format == 'channels_first':
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1],
+            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
+        ])
+      else:
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
+            outputs_shape[3], outputs_shape[4]
+        ])
+      outputs_4d = nn.bias_add(
+          outputs_4d,
+          self.bias,
+          data_format=utils.convert_data_format(self.data_format, ndim=4))
+      outputs = array_ops.reshape(outputs_4d, outputs_shape)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+
+def conv3d_transpose(inputs,
+                     filters,
+                     kernel_size,
+                     strides=(1, 1, 1),
+                     padding='valid',
+                     data_format='channels_last',
+                     activation=None,
+                     use_bias=True,
+                     kernel_initializer=None,
+                     bias_initializer=init_ops.zeros_initializer(),
+                     kernel_regularizer=None,
+                     bias_regularizer=None,
+                     activity_regularizer=None,
+                     trainable=True,
+                     name=None,
+                     reuse=None):
+  """Functional interface for transposed 3D convolution layer.
+
+  Arguments:
+    inputs: Input tensor.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A tuple or list of 3 positive integers specifying the spatial
+      dimensions of of the filters. Can be a single integer to specify the same
+      value for all spatial dimensions.
+    strides: A tuple or list of 3 positive integers specifying the strides
+      of the convolution. Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: one of `"valid"` or `"same"` (case-insensitive).
     data_format: A string, one of `channels_last` (default) or `channels_first`.
       The ordering of the dimensions in the inputs.
       `channels_last` corresponds to inputs with shape
@@ -1192,7 +1554,7 @@ def conv2d_transpose(inputs,
   Returns:
     Output tensor.
   """
-  layer = Conv2DTranspose(
+  layer = Conv3DTranspose(
       filters=filters,
       kernel_size=kernel_size,
       strides=strides,
@@ -1219,9 +1581,10 @@ Convolution2D = Conv2D
 Convolution3D = Conv3D
 SeparableConvolution2D = SeparableConv2D
 Convolution2DTranspose = Deconvolution2D = Deconv2D = Conv2DTranspose
+Convolution3DTranspose = Deconvolution3D = Deconv3D = Conv3DTranspose
 convolution1d = conv1d
 convolution2d = conv2d
 convolution3d = conv3d
 separable_convolution2d = separable_conv2d
 convolution2d_transpose = deconvolution2d = deconv2d = conv2d_transpose
-
+convolution3d_transpose = deconvolution3d = deconv3d = conv3d_transpose
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index c3e133d08b2689254dff6cae35c5fc88c4362e32..42a2d775349042f20c48e40385c440705d92912b 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.layers.core."""
+"""Tests for tf.layers.convolutional."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -651,5 +651,174 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertEqual(len(variables.trainable_variables()), 4)
 
 
+class Conv3DTransposeTest(test.TestCase):
+
+  def testInvalidDataFormat(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
+    with self.assertRaisesRegexp(ValueError, 'data_format'):
+      conv_layers.conv3d_transpose(volumes, 4, 3, data_format='invalid')
+
+  def testInvalidStrides(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
+    with self.assertRaisesRegexp(ValueError, 'strides'):
+      conv_layers.conv3d_transpose(volumes, 4, 3, strides=(1, 2))
+
+    with self.assertRaisesRegexp(ValueError, 'strides'):
+      conv_layers.conv3d_transpose(volumes, 4, 3, strides=None)
+
+  def testInvalidKernelSize(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
+    with self.assertRaisesRegexp(ValueError, 'kernel_size'):
+      conv_layers.conv3d_transpose(volumes, 4, (1, 2))
+
+    with self.assertRaisesRegexp(ValueError, 'kernel_size'):
+      conv_layers.conv3d_transpose(volumes, 4, None)
+
+  def testCreateConv3DTranspose(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32))
+    layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], activation=nn_ops.relu)
+    output = layer.apply(volumes)
+    self.assertEqual(output.op.name, 'conv3d_transpose/Relu')
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth + 2, height + 2, width + 2, 4])
+    self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
+    self.assertListEqual(layer.bias.get_shape().as_list(), [4])
+
+  def testCreateConv3DTransposeIntegerKernelSize(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32))
+    layer = conv_layers.Conv3DTranspose(4, 3)
+    output = layer.apply(volumes)
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth + 2, height + 2, width + 2, 4])
+    self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
+    self.assertListEqual(layer.bias.get_shape().as_list(), [4])
+
+  def testCreateConv3DTransposeChannelsFirst(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, 32, depth, height, width))
+    layer = conv_layers.Conv3DTranspose(
+        4, [3, 3, 3], data_format='channels_first')
+    output = layer.apply(volumes)
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, 4, depth + 2, height + 2, width + 2])
+    self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
+    self.assertListEqual(layer.bias.get_shape().as_list(), [4])
+
+  def testConv3DTransposePaddingSame(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 64), seed=1)
+    layer = conv_layers.Conv3DTranspose(
+        32, volumes.get_shape()[1:4], padding='same')
+    output = layer.apply(volumes)
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth, height, width, 32])
+
+  def testCreateConv3DTransposeWithStrides(self):
+    depth, height, width = 4, 6, 8
+    # Test strides tuple.
+    volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
+    layer = conv_layers.Conv3DTranspose(
+        4, [3, 3, 3], strides=(2, 2, 2), padding='same')
+    output = layer.apply(volumes)
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth * 2, height * 2, width * 2, 4])
+
+    # Test strides integer.
+    layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], strides=2, padding='same')
+    output = layer.apply(volumes)
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth * 2, height * 2, width * 2, 4])
+
+    # Test unequal strides.
+    layer = conv_layers.Conv3DTranspose(
+        4, [3, 3, 3], strides=(2, 1, 1), padding='same')
+    output = layer.apply(volumes)
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth * 2, height, width, 4])
+
+  def testConv3DTransposeKernelRegularizer(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32))
+    reg = lambda x: 0.1 * math_ops.reduce_sum(x)
+    layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], kernel_regularizer=reg)
+    layer.apply(volumes)
+    loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+    self.assertEqual(len(loss_keys), 1)
+    self.assertListEqual(layer.losses, loss_keys)
+
+  def testConv3DTransposeBiasRegularizer(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32))
+    reg = lambda x: 0.1 * math_ops.reduce_sum(x)
+    layer = conv_layers.Conv3DTranspose(4, [3, 3, 3], bias_regularizer=reg)
+    layer.apply(volumes)
+    loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+    self.assertEqual(len(loss_keys), 1)
+    self.assertListEqual(layer.losses, loss_keys)
+
+  def testConv3DTransposeNoBias(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32))
+    layer = conv_layers.Conv3DTranspose(
+        4, [3, 3, 3], activation=nn_ops.relu, use_bias=False)
+    output = layer.apply(volumes)
+    self.assertEqual(output.op.name, 'conv3d_transpose/Relu')
+    self.assertListEqual(output.get_shape().as_list(),
+                         [5, depth + 2, height + 2, width + 2, 4])
+    self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
+    self.assertEqual(layer.bias, None)
+
+  def testFunctionalConv3DTransposeReuse(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
+    conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
+    self.assertEqual(len(variables.trainable_variables()), 2)
+    conv_layers.conv3d_transpose(
+        volumes, 4, [3, 3, 3], name='deconv1', reuse=True)
+    self.assertEqual(len(variables.trainable_variables()), 2)
+
+  def testFunctionalConv3DTransposeReuseFromScope(self):
+    with variable_scope.variable_scope('scope'):
+      depth, height, width = 5, 7, 9
+      volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
+      conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
+      self.assertEqual(len(variables.trainable_variables()), 2)
+    with variable_scope.variable_scope('scope', reuse=True):
+      conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
+      self.assertEqual(len(variables.trainable_variables()), 2)
+
+  def testFunctionalConv3DTransposeInitializerFromScope(self):
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          'scope', initializer=init_ops.ones_initializer()):
+        depth, height, width = 5, 7, 9
+        volumes = random_ops.random_uniform(
+            (5, depth, height, width, 32), seed=1)
+        conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
+        weights = variables.trainable_variables()
+        # Check the names of weights in order.
+        self.assertTrue('kernel' in weights[0].name)
+        self.assertTrue('bias' in weights[1].name)
+        sess.run(variables.global_variables_initializer())
+        weights = sess.run(weights)
+        # Check that the kernel weights got initialized to ones (from scope)
+        self.assertAllClose(weights[0], np.ones((3, 3, 3, 4, 32)))
+        # Check that the bias still got initialized to zeros.
+        self.assertAllClose(weights[1], np.zeros((4)))
+
+  def testFunctionalConv3DTransposeNoReuse(self):
+    depth, height, width = 5, 7, 9
+    volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
+    conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3])
+    self.assertEqual(len(variables.trainable_variables()), 2)
+    conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3])
+    self.assertEqual(len(variables.trainable_variables()), 4)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index b5846ae3d2f8d53e99de1e292800bd0863fe7695..c63f8e5792c342e25e915efa6fd92a49ad42aad7 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -38,7 +38,7 @@ from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 
 
-class Dense(base._Layer):  # pylint: disable=protected-access
+class Dense(base.Layer):
   """Densely-connected layer class.
 
   This layer implements the operation:
@@ -101,35 +101,31 @@ class Dense(base._Layer):  # pylint: disable=protected-access
     self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
     self.activity_regularizer = activity_regularizer
+    self.input_spec = base.InputSpec(min_ndim=2)
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
-    if input_shape.ndims is None:
-      raise ValueError('Inputs to `Dense` should have known rank.')
-    if len(input_shape) < 2:
-      raise ValueError('Inputs to `Dense` should have rank >= 2.')
     if input_shape[-1].value is None:
       raise ValueError('The last dimension of the inputs to `Dense` '
                        'should be defined. Found `None`.')
-    # Note that we set `trainable=True` because this is a trainable
-    # weight of the layer. If the layer is not trainable
-    # (self.trainable = False), the variable will not be added to
-    # tf.trainable_variables(), and self.trainable_weights will be empty.
-    self.kernel = vs.get_variable('kernel',
-                                  shape=[input_shape[-1].value, self.units],
-                                  initializer=self.kernel_initializer,
-                                  regularizer=self.kernel_regularizer,
-                                  dtype=self.dtype,
-                                  trainable=True)
+    self.input_spec = base.InputSpec(min_ndim=2,
+                                     axes={-1: input_shape[-1].value})
+    self.kernel = self.add_variable('kernel',
+                                    shape=[input_shape[-1].value, self.units],
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    dtype=self.dtype,
+                                    trainable=True)
     if self.use_bias:
-      self.bias = vs.get_variable('bias',
-                                  shape=[self.units,],
-                                  initializer=self.bias_initializer,
-                                  regularizer=self.bias_regularizer,
-                                  dtype=self.dtype,
-                                  trainable=True)
+      self.bias = self.add_variable('bias',
+                                    shape=[self.units,],
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    dtype=self.dtype,
+                                    trainable=True)
     else:
       self.bias = None
+    self.built = True
 
   def call(self, inputs):
     inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
@@ -219,7 +215,7 @@ def dense(
   return layer.apply(inputs)
 
 
-class Dropout(base._Layer):  # pylint: disable=protected-access
+class Dropout(base.Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting a fraction `rate` of input units to 0
@@ -248,14 +244,20 @@ class Dropout(base._Layer):  # pylint: disable=protected-access
                name=None,
                **kwargs):
     super(Dropout, self).__init__(name=name, **kwargs)
-    self.rate = rate
+    self.rate = min(1., max(0., rate))
     self.noise_shape = noise_shape
     self.seed = seed
 
+  def _get_noise_shape(self, _):
+    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
+    # which will override `self.noise_shape`, and allows for custom noise
+    # shapes with dynamically sized inputs.
+    return self.noise_shape
+
   def call(self, inputs, training=False):
     def dropped_inputs():
       return nn.dropout(inputs, 1  - self.rate,
-                        noise_shape=self.noise_shape,
+                        noise_shape=self._get_noise_shape(inputs),
                         seed=self.seed)
     return utils.smart_cond(training,
                             dropped_inputs,
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index df650535d4e21d9d48ba6ac3e1317efeb0bdd2a3..0ee477371a03a1f364ffb83b3eef38db18fb6627 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -44,16 +44,14 @@ class DenseTest(test.TestCase):
     self.assertEqual(dense.bias_regularizer, None)
     self.assertEqual(dense.activity_regularizer, None)
     self.assertEqual(dense.use_bias, True)
-    with self.assertRaisesRegexp(ValueError, 'not been used yet'):
-      _ = dense.name
 
     # Test auto-naming
     dense = core_layers.Dense(2, activation=nn_ops.relu)
-    dense.apply(np.random.randn(0, 2))
-    self.assertEqual(dense.name, 'dense')
-    dense = core_layers.Dense(2, activation=nn_ops.relu)
-    dense.apply(np.random.randn(0, 2))
+    dense.apply(random_ops.random_uniform((5, 2)))
     self.assertEqual(dense.name, 'dense_1')
+    dense = core_layers.Dense(2, activation=nn_ops.relu)
+    dense.apply(random_ops.random_uniform((5, 2)))
+    self.assertEqual(dense.name, 'dense_2')
 
   def testCall(self):
     dense = core_layers.Dense(2, activation=nn_ops.relu, name='my_dense')
@@ -62,8 +60,6 @@ class DenseTest(test.TestCase):
     self.assertListEqual(dense.variables, [dense.kernel, dense.bias])
     self.assertListEqual(dense.trainable_variables, [dense.kernel, dense.bias])
     self.assertListEqual(dense.non_trainable_variables, [])
-    self.assertListEqual(dense._trainable_variables, [dense.kernel, dense.bias])
-    self.assertListEqual(dense._non_trainable_variables, [])
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
     self.assertEqual(dense.kernel.name, 'my_dense/kernel:0')
@@ -89,8 +85,6 @@ class DenseTest(test.TestCase):
     self.assertListEqual(dense.non_trainable_variables,
                          [dense.kernel, dense.bias])
     self.assertListEqual(dense.trainable_variables, [])
-    self.assertListEqual(dense._trainable_variables, [dense.kernel, dense.bias])
-    self.assertListEqual(dense._non_trainable_variables, [])
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 0)
 
@@ -289,7 +283,7 @@ class DenseTest(test.TestCase):
 class DropoutTest(test.TestCase):
 
   def testDropoutProperties(self):
-    dp = core_layers.Dropout(0.5)
+    dp = core_layers.Dropout(0.5, name='dropout')
     self.assertEqual(dp.rate, 0.5)
     self.assertEqual(dp.noise_shape, None)
     dp.apply(np.ones(()))
diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py
index 9f02757d5bc754c522417a4f787ea54d28727be1..aa46eb5d27d98d0250d89f32f017a9f67fa20efe 100644
--- a/tensorflow/python/layers/layers.py
+++ b/tensorflow/python/layers/layers.py
@@ -23,6 +23,7 @@
 @@conv3d
 @@separable_conv2d
 @@conv2d_transpose
+@@conv3d_transpose
 @@average_pooling1d
 @@max_pooling1d
 @@average_pooling2d
@@ -50,6 +51,7 @@ from tensorflow.python.layers.convolutional import conv2d
 from tensorflow.python.layers.convolutional import conv3d
 from tensorflow.python.layers.convolutional import separable_conv2d
 from tensorflow.python.layers.convolutional import conv2d_transpose
+from tensorflow.python.layers.convolutional import conv3d_transpose
 
 # Pooling layers.
 from tensorflow.python.layers.pooling import average_pooling1d
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 8659382834553315f883633218a09f1fd32ab95d..8733d6576909e6555e5d8c135a6b7abf087ed61a 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -41,7 +41,7 @@ from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 
 
-class BatchNormalization(base._Layer):  # pylint: disable=protected-access
+class BatchNormalization(base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
   "Batch Normalization: Accelerating Deep Network Training by Reducing
@@ -69,6 +69,19 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
     trainable: Boolean, if `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     name: A string, the name of the layer.
+    renorm: Whether to use Batch Renormalization
+      (https://arxiv.org/abs/1702.03275). This adds extra variables during
+      training. The inference is the same for either value of this parameter.
+    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
+      scalar `Tensors` used to clip the renorm correction. The correction
+      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
+      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
+      dmax are set to inf, 0, inf, respectively.
+    renorm_momentum: Momentum used to update the moving means and standard
+      deviations with renorm. Unlike `momentum`, this affects training
+      and should be neither too small (which would add noise) nor too large
+      (which would give stale estimates). Note that `momentum` is still applied
+      to get the means and variances for inference.
   """
 
   def __init__(self,
@@ -83,6 +96,9 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
                moving_variance_initializer=init_ops.ones_initializer(),
                beta_regularizer=None,
                gamma_regularizer=None,
+               renorm=False,
+               renorm_clipping=None,
+               renorm_momentum=0.99,
                trainable=True,
                name=None,
                **kwargs):
@@ -99,6 +115,15 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
     self.moving_variance_initializer = moving_variance_initializer
     self.beta_regularizer = beta_regularizer
     self.gamma_regularizer = gamma_regularizer
+    self.renorm = renorm
+    if renorm:
+      renorm_clipping = renorm_clipping or {}
+      keys = ['rmax', 'rmin', 'dmax']
+      if set(renorm_clipping) - set(keys):
+        raise ValueError('renorm_clipping %s contains keys not in %s' %
+                         (renorm_clipping, keys))
+      self.renorm_clipping = renorm_clipping
+      self.renorm_momentum = renorm_momentum
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
@@ -116,40 +141,128 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
     if not param_dim.value:
       raise ValueError('Input has undefined `axis` dimension. Input shape: ',
                        input_shape)
+    self.input_spec = base.InputSpec(ndim=ndim,
+                                     axes={self.axis: param_dim.value})
 
     if self.center:
-      self.beta = vs.get_variable('beta',
-                                  shape=(param_dim,),
-                                  initializer=self.beta_initializer,
-                                  regularizer=self.beta_regularizer,
-                                  trainable=True)
+      self.beta = self.add_variable(name='beta',
+                                    shape=(param_dim,),
+                                    initializer=self.beta_initializer,
+                                    regularizer=self.beta_regularizer,
+                                    trainable=True)
     else:
       self.beta = None
     if self.scale:
-      self.gamma = vs.get_variable('gamma',
-                                   shape=(param_dim,),
-                                   initializer=self.gamma_initializer,
-                                   regularizer=self.gamma_regularizer,
-                                   trainable=True)
+      self.gamma = self.add_variable(name='gamma',
+                                     shape=(param_dim,),
+                                     initializer=self.gamma_initializer,
+                                     regularizer=self.gamma_regularizer,
+                                     trainable=True)
     else:
       self.gamma = None
 
     # Disable variable partitioning when creating the moving mean and variance
-    partitioner = vs.get_variable_scope().partitioner
+    partitioner = self._scope.partitioner
     try:
-      vs.get_variable_scope().set_partitioner(None)
-      self.moving_mean = vs.get_variable(
-          'moving_mean',
+      self._scope.set_partitioner(None)
+      self.moving_mean = self.add_variable(
+          name='moving_mean',
           shape=(param_dim,),
           initializer=self.moving_mean_initializer,
           trainable=False)
-      self.moving_variance = vs.get_variable(
-          'moving_variance',
+      self.moving_variance = self.add_variable(
+          name='moving_variance',
           shape=(param_dim,),
           initializer=self.moving_variance_initializer,
           trainable=False)
+      if self.renorm:
+        # Create variables to maintain the moving mean and standard deviation.
+        # These are used in training and thus are different from the moving
+        # averages above. The renorm variables are colocated with moving_mean
+        # and moving_variance.
+        # NOTE: below, the outer `with device` block causes the current device
+        # stack to be cleared. The nested ones use a `lambda` to set the desired
+        # device and ignore any devices that may be set by the custom getter.
+        def _renorm_variable(name, shape):
+          var = self.add_variable(name=name,
+                                  shape=shape,
+                                  initializer=init_ops.zeros_initializer(),
+                                  trainable=False)
+          return var
+        with ops.device(None):
+          with ops.device(lambda _: self.moving_mean.device):
+            self.renorm_mean = _renorm_variable('renorm_mean', (param_dim,))
+            self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
+          # We initialize renorm_stddev to 0, and maintain the (0-initialized)
+          # renorm_stddev_weight. This allows us to (1) mix the average
+          # stddev with the minibatch stddev early in training, and (2) compute
+          # the unbiased average stddev by dividing renorm_stddev by the weight.
+          with ops.device(lambda _: self.moving_variance.device):
+            self.renorm_stddev = _renorm_variable('renorm_stddev', (param_dim,))
+            self.renorm_stddev_weight = _renorm_variable(
+                'renorm_stddev_weight', ())
     finally:
-      vs.get_variable_scope().set_partitioner(partitioner)
+      self._scope.set_partitioner(partitioner)
+    self.built = True
+
+  def _renorm_correction_and_moments(self, mean, variance, training):
+    """Returns the correction and update values for renorm."""
+    stddev = math_ops.sqrt(variance + self.epsilon)
+    # Compute the average mean and standard deviation, as if they were
+    # initialized with this batch's moments.
+    mixed_renorm_mean = (self.renorm_mean +
+                         (1. - self.renorm_mean_weight) * mean)
+    mixed_renorm_stddev = (self.renorm_stddev +
+                           (1. - self.renorm_stddev_weight) * stddev)
+    # Compute the corrections for batch renorm.
+    r = stddev / mixed_renorm_stddev
+    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
+    # Ensure the corrections use pre-update moving averages.
+    with ops.control_dependencies([r, d]):
+      mean = array_ops.identity(mean)
+      stddev = array_ops.identity(stddev)
+    rmin, rmax, dmax = [self.renorm_clipping.get(key)
+                        for key in ['rmin', 'rmax', 'dmax']]
+    if rmin is not None:
+      r = math_ops.maximum(r, rmin)
+    if rmax is not None:
+      r = math_ops.minimum(r, rmax)
+    if dmax is not None:
+      d = math_ops.maximum(d, -dmax)
+      d = math_ops.minimum(d, dmax)
+    # When not training, use r=1, d=0, and decay=1 meaning no updates.
+    r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
+    d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
+    decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.)
+    def _update_renorm_variable(var, weight, value):
+      """Updates a moving average and weight, returns the unbiased value."""
+      # Update the variables without zero debiasing. The debiasing will be
+      # accomplished by dividing the exponential moving average by the weight.
+      # For example, after a single update, the moving average would be
+      # (1-decay) * value. and the weight will be 1-decay, with their ratio
+      # giving value.
+      # Make sure the weight is not updated until before r and d computation.
+      value = array_ops.identity(value)
+      with ops.control_dependencies([value]):
+        weight_value = array_ops.constant(1., dtype=weight.dtype)
+      new_var = moving_averages.assign_moving_average(
+          var, value, decay, zero_debias=False)
+      new_weight = moving_averages.assign_moving_average(
+          weight, weight_value, decay, zero_debias=False)
+      return new_var / new_weight
+
+    with ops.colocate_with(self.moving_mean):
+      new_mean = _update_renorm_variable(self.renorm_mean,
+                                         self.renorm_mean_weight,
+                                         mean)
+    with ops.colocate_with(self.moving_variance):
+      new_stddev = _update_renorm_variable(self.renorm_stddev,
+                                           self.renorm_stddev_weight,
+                                           stddev)
+      # Make sqrt(moving_variance + epsilon) = new_stddev.
+      new_variance = math_ops.square(new_stddev) - self.epsilon
+
+    return (r, d, new_mean, new_variance)
 
   def call(self, inputs, training=False):
     # First, compute the axes along which to reduce the mean / variance,
@@ -164,82 +277,62 @@ class BatchNormalization(base._Layer):  # pylint: disable=protected-access
     # Determines whether broadcasting is needed.
     needs_broadcasting = (sorted(reduction_axes) != list(range(ndim))[:-1])
 
+    scale, offset = self.gamma, self.beta
+
     # Determine a boolean value for `training`: could be True, False, or None.
     training_value = utils.constant_value(training)
-
-    if needs_broadcasting:
-      # In this case we must explictly broadcast all parameters.
-      if self.center:
-        broadcast_beta = array_ops.reshape(self.beta, broadcast_shape)
-      else:
-        broadcast_beta = None
-      if self.scale:
-        broadcast_gamma = array_ops.reshape(self.gamma, broadcast_shape)
-      else:
-        broadcast_gamma = None
-
     if training_value is not False:
-      if needs_broadcasting:
-        broadcast_mean, broadcast_variance = nn.moments(
-            inputs, reduction_axes, keep_dims=True)
-        mean = array_ops.reshape(broadcast_mean, [-1])
-        variance = array_ops.reshape(broadcast_variance, [-1])
-      else:
-        mean, variance = nn.moments(inputs, reduction_axes)
-
-      # Prepare updates if necessary.
-      if not self.updates:
-        mean_update = moving_averages.assign_moving_average(
-            self.moving_mean, mean, self.momentum, zero_debias=False)
-        variance_update = moving_averages.assign_moving_average(
-            self.moving_variance, variance, self.momentum, zero_debias=False)
-        # In the future this should be refactored into a self.add_update
-        # methods in order to allow for instance-based BN layer sharing
-        # across unrelated input streams (e.g. like in Keras).
-        self.updates.append(mean_update)
-        self.updates.append(variance_update)
-
-    # Normalize batch. We do this inside separate functions for training
-    # and inference so as to avoid evaluating both branches.
-    def normalize_in_test():
-      if needs_broadcasting:
-        broadcast_moving_mean = array_ops.reshape(self.moving_mean,
-                                                  broadcast_shape)
-        broadcast_moving_variance = array_ops.reshape(self.moving_variance,
-                                                      broadcast_shape)
-        return nn.batch_normalization(inputs,
-                                      broadcast_moving_mean,
-                                      broadcast_moving_variance,
-                                      broadcast_beta,
-                                      broadcast_gamma,
-                                      self.epsilon)
+      # Some of the computations here are not necessary when training==False
+      # but not a constant. However, this makes the code simpler.
+      mean, variance = nn.moments(inputs, reduction_axes)
+      mean = _smart_select(training,
+                           lambda: mean,
+                           lambda: self.moving_mean)
+      variance = _smart_select(training,
+                               lambda: variance,
+                               lambda: self.moving_variance)
+
+      if self.renorm:
+        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
+            mean, variance, training)
+        # When training, the normalized values (say, x) will be transformed as
+        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
+        # = x * (r * gamma) + (d * gamma + beta) with renorm.
+        scale = array_ops.stop_gradient(r, name='renorm_r')
+        offset = array_ops.stop_gradient(d, name='renorm_d')
+        if self.gamma is not None:
+          scale *= self.gamma
+          offset *= self.gamma
+        if self.beta is not None:
+          offset += self.beta
       else:
-        return nn.batch_normalization(inputs,
-                                      self.moving_mean,
-                                      self.moving_variance,
-                                      self.beta if self.center else None,
-                                      self.gamma if self.scale else None,
-                                      self.epsilon)
-
-    def normalize_in_training():
-      if needs_broadcasting:
-        return nn.batch_normalization(inputs,
-                                      broadcast_mean,
-                                      broadcast_variance,
-                                      broadcast_beta,
-                                      broadcast_gamma,
-                                      self.epsilon)
-      else:
-        return nn.batch_normalization(inputs,
-                                      mean,
-                                      variance,
-                                      self.beta if self.center else None,
-                                      self.gamma if self.scale else None,
-                                      self.epsilon)
+        new_mean, new_variance = mean, variance
+
+      # Update moving averages when training, and prevent updates otherwise.
+      decay = _smart_select(training, lambda: self.momentum, lambda: 1.)
+      mean_update = moving_averages.assign_moving_average(
+          self.moving_mean, new_mean, decay, zero_debias=False)
+      variance_update = moving_averages.assign_moving_average(
+          self.moving_variance, new_variance, decay, zero_debias=False)
+
+      self.add_update(mean_update, inputs=inputs)
+      self.add_update(variance_update, inputs=inputs)
+
+    else:
+      mean, variance = self.moving_mean, self.moving_variance
 
-    return utils.smart_cond(training,
-                            normalize_in_training,
-                            normalize_in_test)
+    def _broadcast(v):
+      if needs_broadcasting and v is not None:
+        # In this case we must explicitly broadcast all parameters.
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+
+    return nn.batch_normalization(inputs,
+                                  _broadcast(mean),
+                                  _broadcast(variance),
+                                  _broadcast(offset),
+                                  _broadcast(scale),
+                                  self.epsilon)
 
 
 def batch_normalization(inputs,
@@ -257,7 +350,10 @@ def batch_normalization(inputs,
                         training=False,
                         trainable=True,
                         name=None,
-                        reuse=None):
+                        reuse=None,
+                        renorm=False,
+                        renorm_clipping=None,
+                        renorm_momentum=0.99):
   """Functional interface for the batch normalization layer.
 
   Reference: http://arxiv.org/abs/1502.03167
@@ -267,6 +363,25 @@ def batch_normalization(inputs,
 
   Sergey Ioffe, Christian Szegedy
 
+  Note: the operations which update the `moving_mean` and `moving_variance`
+  variables will not be added as dependencies of your training operation and so
+  must be run separately. For example:
+
+  ```
+  extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+  sess.run([train_op, extra_update_ops], ...)
+  ```
+  Alternatively, add the operations as a dependency to your training operation
+  manually, and then just run your training operation as normal:
+
+  ```
+  extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+  with tf.control_dependencies(extra_update_ops):
+    train_op = optimizer.minimize(loss)
+  ...
+  sess.run([train_op], ...)
+  ```
+
   Arguments:
     inputs: Tensor input.
     axis: Integer, the axis that should be normalized (typically the features
@@ -288,12 +403,27 @@ def batch_normalization(inputs,
     training: Either a Python boolean, or a TensorFlow boolean scalar tensor
       (e.g. a placeholder). Whether to return the output in training mode
       (normalized with statistics of the current batch) or in inference mode
-      (normalized with moving statistics).
+      (normalized with moving statistics). **NOTE**: make sure to set this
+      parameter correctly, or else your training/inference will not work
+      properly.
     trainable: Boolean, if `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     name: String, the name of the layer.
     reuse: Boolean, whether to reuse the weights of a previous layer
       by the same name.
+    renorm: Whether to use Batch Renormalization
+      (https://arxiv.org/abs/1702.03275). This adds extra variables during
+      training. The inference is the same for either value of this parameter.
+    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
+      scalar `Tensors` used to clip the renorm correction. The correction
+      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
+      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
+      dmax are set to inf, 0, inf, respectively.
+    renorm_momentum: Momentum used to update the moving means and standard
+      deviations with renorm. Unlike `momentum`, this affects training
+      and should be neither too small (which would add noise) nor too large
+      (which would give stale estimates). Note that `momentum` is still applied
+      to get the means and variances for inference.
 
   Returns:
     Output tensor.
@@ -311,6 +441,9 @@ def batch_normalization(inputs,
       beta_regularizer=beta_regularizer,
       gamma_regularizer=gamma_regularizer,
       trainable=trainable,
+      renorm=renorm,
+      renorm_clipping=renorm_clipping,
+      renorm_momentum=renorm_momentum,
       name=name,
       _reuse=reuse,
       _scope=name)
@@ -321,3 +454,39 @@ def batch_normalization(inputs,
 
 BatchNorm = BatchNormalization
 batch_norm = batch_normalization
+
+
+# Helper function
+
+
+def _smart_select(pred, fn_then, fn_else):
+  """Selects fn_then() or fn_else() based on the value of pred.
+
+  The purpose of this function is the same as `utils.smart_cond`. However, at
+  the moment there is a bug (b/36297356) that seems to kick in only when
+  `smart_cond` delegates to `tf.cond`, which sometimes results in the training
+  hanging when using parameter servers. This function will output the result
+  of `fn_then` or `fn_else` if `pred` is known at graph construction time.
+  Otherwise, it will use `tf.where` which will result in some redundant work
+  (both branches will be computed but only one selected). However, the tensors
+  involved will usually be small (means and variances in batchnorm), so the
+  cost will be small and will not be incurred at all if `pred` is a constant.
+
+  Args:
+    pred: A boolean scalar `Tensor`.
+    fn_then: A callable to use when pred==True.
+    fn_else: A callable to use when pred==False.
+
+  Returns:
+    A `Tensor` whose value is fn_then() or fn_else() based on the value of pred.
+  """
+  pred_value = utils.constant_value(pred)
+  if pred_value:
+    return fn_then()
+  elif pred_value is False:
+    return fn_else()
+  t_then = array_ops.expand_dims(fn_then(), 0)
+  t_else = array_ops.expand_dims(fn_else(), 0)
+  pred = array_ops.reshape(pred, [1])
+  result = array_ops.where(pred, t_then, t_else)
+  return array_ops.squeeze(result, [0])
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index 91b7cb6f4833abc0242853cece4b22f7aac4debf..933f196e01160ba058cdb57557a10e33eacd1cc7 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.layers.core."""
+"""Tests for tf.layers.normalization."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import normalization as normalization_layers
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
@@ -513,6 +514,64 @@ class BNTest(test.TestCase):
     _ = bn.apply(inputs, training=training)
     self.assertEqual(len(bn.losses), 1)
 
+  def testRenorm(self):
+    shape = (4, 3)
+    xt = array_ops.placeholder(dtypes.float32, shape)
+    momentum = 0.99
+    renorm_momentum = 0.8
+    rmax = 1.1
+    rmin = 0.9
+    dmax = 0.1
+    gamma = 2.
+    beta = 3.
+    epsilon = 0.001
+    bn = normalization_layers.BatchNormalization(
+        axis=1,
+        gamma_initializer=init_ops.constant_initializer(gamma),
+        beta_initializer=init_ops.constant_initializer(beta),
+        epsilon=epsilon,
+        momentum=momentum,
+        renorm=True,
+        renorm_clipping={'rmax': rmax, 'rmin': rmin, 'dmax': dmax},
+        renorm_momentum=renorm_momentum)
+    training = array_ops.placeholder(dtypes.bool)
+    yt = bn.apply(xt, training=training)
+
+    moving_mean = 0.
+    moving_variance = 1.
+    renorm_mean = renorm_stddev = 0.
+    renorm_weight = 0.
+    with self.test_session(use_gpu=True) as sess:
+      sess.run(variables.global_variables_initializer())
+      for _ in range(5):
+        x = np.random.random(shape)
+
+        mean = x.mean(0)
+        stddev = np.sqrt(x.var(0) + epsilon)
+        adj_mean = renorm_mean + (1. - renorm_weight) * mean
+        adj_stddev = renorm_stddev + (1. - renorm_weight) * stddev
+        r = (stddev / adj_stddev).clip(rmin, rmax)
+        d = ((mean - adj_mean) / adj_stddev).clip(-dmax, dmax)
+        y_train = ((x - mean) / stddev * r + d) * gamma + beta
+        renorm_mean += (mean - renorm_mean) * (1. - renorm_momentum)
+        renorm_stddev += (stddev - renorm_stddev) * (1. - renorm_momentum)
+        renorm_weight += (1. - renorm_weight) * (1. - renorm_momentum)
+        moving_mean += (renorm_mean / renorm_weight -
+                        moving_mean) * (1. - momentum)
+        moving_variance += ((renorm_stddev / renorm_weight) ** 2 - epsilon -
+                            moving_variance) * (1. - momentum)
+
+        y_test = ((x - moving_mean) / (moving_variance + epsilon) ** 0.5 *
+                  gamma) + beta
+
+        yt_val_train, _, _ = sess.run([yt] + bn.updates,
+                                      feed_dict={xt: x, training: True})
+        yt_val_test, _, _ = sess.run([yt] + bn.updates,
+                                     feed_dict={xt: x, training: False})
+
+        self.assertAllClose(y_train, yt_val_train, atol=1e-5)
+        self.assertAllClose(y_test, yt_val_test, atol=1e-5)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 3e40423ad638f4e0cb1c231fdc9cfa71cbb16f72..6cd644b6428d0ff5b99665ea2e48f79ac03ae210 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -31,12 +31,13 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import standard_ops
 from tensorflow.python.ops import variable_scope as vs
-
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
+from tensorflow.python import framework
 
 
-class _Pooling1D(base._Layer):  # pylint: disable=protected-access
+class _Pooling1D(base.Layer):
   """Pooling layer for arbitrary pooling functions, for 1D inputs.
 
   This class only exists for code reuse. It will never be an exposed API.
@@ -66,11 +67,7 @@ class _Pooling1D(base._Layer):  # pylint: disable=protected-access
     self.strides = utils.normalize_tuple(strides, 1, 'strides')
     self.padding = utils.normalize_padding(padding)
     self.data_format = utils.normalize_data_format(data_format)
-
-  def build(self, input_shape):
-    if len(input_shape) != 3:
-      raise ValueError('Inputs should have rank 3. '
-                       'Received input shape:', str(input_shape))
+    self.input_spec = base.InputSpec(ndim=3)
 
   def call(self, inputs):
     # There is no TF op for 1D pooling, hence we make the inputs 4D.
@@ -97,6 +94,12 @@ class _Pooling1D(base._Layer):  # pylint: disable=protected-access
     else:
       return array_ops.squeeze(outputs, 1)
 
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    length = utils.conv_output_length(input_shape[1], self.pool_size[0],
+                                      self.padding, self.strides[0])
+    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+
 
 class AveragePooling1D(_Pooling1D):
   """Average Pooling layer for 1D inputs.
@@ -222,7 +225,7 @@ def max_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
-class _Pooling2D(base._Layer):  # pylint: disable=protected-access
+class _Pooling2D(base.Layer):
   """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
 
   This class only exists for code reuse. It will never be an exposed API.
@@ -256,25 +259,54 @@ class _Pooling2D(base._Layer):  # pylint: disable=protected-access
     self.strides = utils.normalize_tuple(strides, 2, 'strides')
     self.padding = utils.normalize_padding(padding)
     self.data_format = utils.normalize_data_format(data_format)
-
-  def build(self, input_shape):
-    if len(input_shape) != 4:
-      raise ValueError('Inputs should have rank 4. '
-                       'Received input shape:', str(input_shape))
+    self.input_spec = base.InputSpec(ndim=4)
 
   def call(self, inputs):
-    if self.data_format == 'channels_last':
+    if (self.data_format == 'channels_first' and
+        not framework.test_util.gpu_device_name()):
+      # `nn.convolution` is not implemented on CPU for `channels_first` format.
+      # TODO(chollet): remove this when `nn.convolution` is feature-complete.
+      data_format = 'channels_last'
+      inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
+    else:
+      data_format = self.data_format
+
+    if data_format == 'channels_last':
       pool_shape = (1,) + self.pool_size + (1,)
       strides = (1,) + self.strides + (1,)
     else:
       pool_shape = (1, 1) + self.pool_size
       strides = (1, 1) + self.strides
-    return self.pool_function(
+    outputs = self.pool_function(
         inputs,
         ksize=pool_shape,
         strides=strides,
         padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, 4))
+        data_format=utils.convert_data_format(data_format, 4))
+
+    if (self.data_format == 'channels_first' and
+        not framework.test_util.gpu_device_name()):
+      outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    else:
+      rows = input_shape[1]
+      cols = input_shape[2]
+    rows = utils.conv_output_length(rows, self.pool_size[0], self.padding,
+                                    self.strides[0])
+    cols = utils.conv_output_length(cols, self.pool_size[1], self.padding,
+                                    self.strides[1])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], rows, cols])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, input_shape[3]])
 
 
 class AveragePooling2D(_Pooling2D):
@@ -407,7 +439,7 @@ def max_pooling2d(inputs,
   return layer.apply(inputs)
 
 
-class _Pooling3D(base._Layer):  # pylint: disable=protected-access
+class _Pooling3D(base.Layer):
   """Pooling layer for arbitrary pooling functions, for 3D inputs.
 
   This class only exists for code reuse. It will never be an exposed API.
@@ -443,19 +475,16 @@ class _Pooling3D(base._Layer):  # pylint: disable=protected-access
     self.strides = utils.normalize_tuple(strides, 3, 'strides')
     self.padding = utils.normalize_padding(padding)
     self.data_format = utils.normalize_data_format(data_format)
-
-  def build(self, input_shape):
-    if len(input_shape) != 5:
-      raise ValueError('Inputs should have rank 5. '
-                       'Received input shape:', str(input_shape))
+    self.input_spec = base.InputSpec(ndim=5)
 
   def call(self, inputs):
     pool_shape = (1,) + self.pool_size + (1,)
     strides = (1,) + self.strides + (1,)
 
     if self.data_format == 'channels_first':
-      # TF does not support channels first with 3D pooling operations,
+      # TF does not support `channels_first` with 3D pooling operations,
       # so we must handle this case manually.
+      # TODO(fchollet): remove this when TF pooling is feature-complete.
       inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
 
     outputs = self.pool_function(
@@ -468,6 +497,29 @@ class _Pooling3D(base._Layer):  # pylint: disable=protected-access
       outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
     return outputs
 
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      len_dim1 = input_shape[2]
+      len_dim2 = input_shape[3]
+      len_dim3 = input_shape[4]
+    else:
+      len_dim1 = input_shape[1]
+      len_dim2 = input_shape[2]
+      len_dim3 = input_shape[3]
+    len_dim1 = utils.conv_output_length(len_dim1, self.pool_size[0],
+                                        self.padding, self.strides[0])
+    len_dim2 = utils.conv_output_length(len_dim2, self.pool_size[1],
+                                        self.padding, self.strides[1])
+    len_dim3 = utils.conv_output_length(len_dim3, self.pool_size[2],
+                                        self.padding, self.strides[2])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
+
 
 class AveragePooling3D(_Pooling3D):
   """Average pooling layer for 3D inputs (e.g. volumes).
diff --git a/tensorflow/python/layers/pooling_test.py b/tensorflow/python/layers/pooling_test.py
index f9929eaf209fb301ab9b4f03886cc0a8961df969..589fee5f7196cc542b39506c5bda580a92647f0d 100644
--- a/tensorflow/python/layers/pooling_test.py
+++ b/tensorflow/python/layers/pooling_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.layers import pooling as pooling_layers
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
@@ -63,6 +64,36 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4])
 
+  def testCreateMaxPooling2DChannelsFirst(self):
+    height, width = 7, 9
+    images = random_ops.random_uniform((5, 2, height, width))
+    layer = pooling_layers.MaxPooling2D([2, 2],
+                                        strides=1,
+                                        data_format='channels_first')
+    output = layer.apply(images)
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 6, 8])
+
+  def testCreateAveragePooling2DChannelsFirst(self):
+    height, width = 5, 6
+    images = random_ops.random_uniform((3, 4, height, width))
+    layer = pooling_layers.AveragePooling2D((2, 2),
+                                            strides=(1, 1),
+                                            padding='valid',
+                                            data_format='channels_first')
+    output = layer.apply(images)
+    self.assertListEqual(output.get_shape().as_list(), [3, 4, 4, 5])
+
+  def testCreateAveragePooling2DChannelsFirstWithNoneBatch(self):
+    height, width = 5, 6
+    images = array_ops.placeholder(dtype='float32',
+                                   shape=(None, 4, height, width))
+    layer = pooling_layers.AveragePooling2D((2, 2),
+                                            strides=(1, 1),
+                                            padding='valid',
+                                            data_format='channels_first')
+    output = layer.apply(images)
+    self.assertListEqual(output.get_shape().as_list(), [None, 4, 4, 5])
+
   def testCreateMaxPooling1D(self):
     width = 7
     images = random_ops.random_uniform((5, width, 4))
@@ -85,6 +116,14 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
 
+  def testCreateAveragePooling1DChannelsFirst(self):
+    width = 7
+    images = random_ops.random_uniform((5, width, 4))
+    layer = pooling_layers.AveragePooling1D(
+        2, strides=2, data_format='channels_first')
+    output = layer.apply(images)
+    self.assertListEqual(output.get_shape().as_list(), [5, 3, 4])
+
   def testCreateMaxPooling3D(self):
     depth, height, width = 6, 7, 9
     images = random_ops.random_uniform((5, depth, height, width, 4))
@@ -99,13 +138,21 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 4, 4])
 
-  def testmaxPooling3DChannelsFirst(self):
+  def testMaxPooling3DChannelsFirst(self):
+    depth, height, width = 6, 7, 9
+    images = random_ops.random_uniform((5, 2, depth, height, width))
+    layer = pooling_layers.MaxPooling3D(
+        [2, 2, 2], strides=2, data_format='channels_first')
+    output = layer.apply(images)
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 3, 4])
+
+  def testAveragePooling3DChannelsFirst(self):
     depth, height, width = 6, 7, 9
-    images = random_ops.random_uniform((5, 4, depth, height, width))
+    images = random_ops.random_uniform((5, 2, depth, height, width))
     layer = pooling_layers.AveragePooling3D(
         [2, 2, 2], strides=2, data_format='channels_first')
     output = layer.apply(images)
-    self.assertListEqual(output.get_shape().as_list(), [5, 4, 3, 3, 4])
+    self.assertListEqual(output.get_shape().as_list(), [5, 2, 3, 3, 4])
 
   def testCreateMaxPooling2DIntegerPoolSize(self):
     height, width = 7, 9
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index 666d475690b4e297941d90795823f6d7970ee150..5e206c3bf9d8713843c6ab9c8b1bbe6256ce6f36 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -26,6 +26,7 @@ import numpy as np
 
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 
@@ -109,6 +110,78 @@ def normalize_padding(value):
   return padding
 
 
+def conv_output_length(input_length, filter_size, padding, stride, dilation=1):
+  """Determines output length of a convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+      dilation: dilation rate, integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+  if padding == 'same':
+    output_length = input_length
+  elif padding == 'valid':
+    output_length = input_length - dilated_filter_size + 1
+  elif padding == 'full':
+    output_length = input_length + dilated_filter_size - 1
+  return (output_length + stride - 1) // stride
+
+
+def conv_input_length(output_length, filter_size, padding, stride):
+  """Determines input length of a convolution given output length.
+
+  Arguments:
+      output_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The input length (integer).
+  """
+  if output_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  if padding == 'same':
+    pad = filter_size // 2
+  elif padding == 'valid':
+    pad = 0
+  elif padding == 'full':
+    pad = filter_size - 1
+  return (output_length - 1) * stride - 2 * pad + filter_size
+
+
+def deconv_output_length(input_length, filter_size, padding, stride):
+  """Determines output length of a transposed convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  input_length *= stride
+  if padding == 'valid':
+    input_length += max(filter_size - stride, 0)
+  elif padding == 'full':
+    input_length -= (stride + filter_size - 2)
+  return input_length
+
+
 def smart_cond(pred, fn1, fn2, name=None):
   """Return either `fn1()` or `fn2()` based on the boolean predicate `pred`.
 
diff --git a/tensorflow/python/layers/utils_test.py b/tensorflow/python/layers/utils_test.py
index ace8046a0bb8ca94e16c84ce1240581698447637..a560f6b6d21efc0c1070d5a9296a7a8e914e2eb9 100644
--- a/tensorflow/python/layers/utils_test.py
+++ b/tensorflow/python/layers/utils_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.layers.core."""
+"""Tests for tf.layers.utils."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -25,20 +25,20 @@ from tensorflow.python.platform import test
 class ConvUtilsTest(test.TestCase):
 
   def testConvertDataFormat(self):
-    self.assertEqual(utils.convert_data_format('channels_first', 5), 'NCDHW')
-    self.assertEqual(utils.convert_data_format('channels_first', 4), 'NCHW')
-    self.assertEqual(utils.convert_data_format('channels_first', 3), 'NCW')
-    self.assertEqual(utils.convert_data_format('channels_last', 4), 'NHWC')
-    self.assertEqual(utils.convert_data_format('channels_last', 3), 'NWC')
-    self.assertEqual(utils.convert_data_format('channels_last', 5), 'NDHWC')
+    self.assertEqual('NCDHW', utils.convert_data_format('channels_first', 5))
+    self.assertEqual('NCHW', utils.convert_data_format('channels_first', 4))
+    self.assertEqual('NCW', utils.convert_data_format('channels_first', 3))
+    self.assertEqual('NHWC', utils.convert_data_format('channels_last', 4))
+    self.assertEqual('NWC', utils.convert_data_format('channels_last', 3))
+    self.assertEqual('NDHWC', utils.convert_data_format('channels_last', 5))
 
     with self.assertRaises(ValueError):
       utils.convert_data_format('invalid', 2)
 
   def testNormalizeTuple(self):
-    self.assertEqual(utils.normalize_tuple(2, n=3, name='strides'), (2, 2, 2))
+    self.assertEqual((2, 2, 2), utils.normalize_tuple(2, n=3, name='strides'))
     self.assertEqual(
-        utils.normalize_tuple((2, 1, 2), n=3, name='strides'), (2, 1, 2))
+        (2, 1, 2), utils.normalize_tuple((2, 1, 2), n=3, name='strides'))
 
     with self.assertRaises(ValueError):
       utils.normalize_tuple((2, 1), n=3, name='strides')
@@ -48,20 +48,44 @@ class ConvUtilsTest(test.TestCase):
 
   def testNormalizeDataFormat(self):
     self.assertEqual(
-        utils.normalize_data_format('Channels_Last'), 'channels_last')
+        'channels_last', utils.normalize_data_format('Channels_Last'))
     self.assertEqual(
-        utils.normalize_data_format('CHANNELS_FIRST'), 'channels_first')
+        'channels_first', utils.normalize_data_format('CHANNELS_FIRST'))
 
     with self.assertRaises(ValueError):
       utils.normalize_data_format('invalid')
 
   def testNormalizePadding(self):
-    self.assertEqual(utils.normalize_padding('SAME'), 'same')
-    self.assertEqual(utils.normalize_padding('VALID'), 'valid')
+    self.assertEqual('same', utils.normalize_padding('SAME'))
+    self.assertEqual('valid', utils.normalize_padding('VALID'))
 
     with self.assertRaises(ValueError):
       utils.normalize_padding('invalid')
 
+  def testConvOutputLength(self):
+    self.assertEqual(4, utils.conv_output_length(4, 2, 'same', 1, 1))
+    self.assertEqual(2, utils.conv_output_length(4, 2, 'same', 2, 1))
+    self.assertEqual(3, utils.conv_output_length(4, 2, 'valid', 1, 1))
+    self.assertEqual(2, utils.conv_output_length(4, 2, 'valid', 2, 1))
+    self.assertEqual(5, utils.conv_output_length(4, 2, 'full', 1, 1))
+    self.assertEqual(3, utils.conv_output_length(4, 2, 'full', 2, 1))
+    self.assertEqual(2, utils.conv_output_length(5, 2, 'valid', 2, 2))
+
+  def testConvInputLength(self):
+    self.assertEqual(3, utils.conv_input_length(4, 2, 'same', 1))
+    self.assertEqual(2, utils.conv_input_length(2, 2, 'same', 2))
+    self.assertEqual(4, utils.conv_input_length(3, 2, 'valid', 1))
+    self.assertEqual(4, utils.conv_input_length(2, 2, 'valid', 2))
+    self.assertEqual(3, utils.conv_input_length(4, 2, 'full', 1))
+    self.assertEqual(4, utils.conv_input_length(3, 2, 'full', 2))
+
+  def testDeconvOutputLength(self):
+    self.assertEqual(4, utils.deconv_output_length(4, 2, 'same', 1))
+    self.assertEqual(8, utils.deconv_output_length(4, 2, 'same', 2))
+    self.assertEqual(5, utils.deconv_output_length(4, 2, 'valid', 1))
+    self.assertEqual(8, utils.deconv_output_length(4, 2, 'valid', 2))
+    self.assertEqual(3, utils.deconv_output_length(4, 2, 'full', 1))
+    self.assertEqual(6, utils.deconv_output_length(4, 2, 'full', 2))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index baba144daf1f95039a217bea716beae080487e18..b9a43bd8203cc5b6c3d8f850230cc2fffe4fa69c 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 
@@ -54,4 +56,158 @@ void ClearDecrefCache() {
   DecrefCache()->clear();
 }
 
+// Structure which keeps a reference to a Tensor alive while numpy has a pointer
+// to it.
+struct TensorReleaser {
+  // Python macro to include standard members.
+  PyObject_HEAD
+
+      // Destructor responsible for releasing the memory.
+      std::function<void()>* destructor;
+};
+
+extern PyTypeObject TensorReleaserType;
+
+static void TensorReleaser_dealloc(TensorReleaser* self) {
+  (*self->destructor)();
+  delete self->destructor;
+  TensorReleaserType.tp_free(self);
+}
+
+PyTypeObject TensorReleaserType = {
+    PyVarObject_HEAD_INIT(nullptr, 0) /* head init */
+    "tensorflow_wrapper",             /* tp_name */
+    sizeof(TensorReleaser),           /* tp_basicsize */
+    0,                                /* tp_itemsize */
+    /* methods */
+    (destructor)TensorReleaser_dealloc, /* tp_dealloc */
+    0,                                  /* tp_print */
+    0,                                  /* tp_getattr */
+    0,                                  /* tp_setattr */
+    0,                                  /* tp_compare */
+    0,                                  /* tp_repr */
+    0,                                  /* tp_as_number */
+    0,                                  /* tp_as_sequence */
+    0,                                  /* tp_as_mapping */
+    0,                                  /* tp_hash */
+    0,                                  /* tp_call */
+    0,                                  /* tp_str */
+    0,                                  /* tp_getattro */
+    0,                                  /* tp_setattro */
+    0,                                  /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
+    "Wrapped TensorFlow Tensor",        /* tp_doc */
+    0,                                  /* tp_traverse */
+    0,                                  /* tp_clear */
+    0,                                  /* tp_richcompare */
+};
+
+Status TF_DataType_to_PyArray_TYPE(TF_DataType tf_datatype,
+                                   int* out_pyarray_type) {
+  switch (tf_datatype) {
+    case TF_HALF:
+      *out_pyarray_type = NPY_FLOAT16;
+      break;
+    case TF_FLOAT:
+      *out_pyarray_type = NPY_FLOAT32;
+      break;
+    case TF_DOUBLE:
+      *out_pyarray_type = NPY_FLOAT64;
+      break;
+    case TF_INT32:
+      *out_pyarray_type = NPY_INT32;
+      break;
+    case TF_UINT8:
+      *out_pyarray_type = NPY_UINT8;
+      break;
+    case TF_UINT16:
+      *out_pyarray_type = NPY_UINT16;
+      break;
+    case TF_INT8:
+      *out_pyarray_type = NPY_INT8;
+      break;
+    case TF_INT16:
+      *out_pyarray_type = NPY_INT16;
+      break;
+    case TF_INT64:
+      *out_pyarray_type = NPY_INT64;
+      break;
+    case TF_BOOL:
+      *out_pyarray_type = NPY_BOOL;
+      break;
+    case TF_COMPLEX64:
+      *out_pyarray_type = NPY_COMPLEX64;
+      break;
+    case TF_COMPLEX128:
+      *out_pyarray_type = NPY_COMPLEX128;
+      break;
+    case TF_STRING:
+      *out_pyarray_type = NPY_OBJECT;
+      break;
+    case TF_RESOURCE:
+      *out_pyarray_type = NPY_VOID;
+      break;
+    // TODO(keveman): These should be changed to NPY_VOID, and the type used for
+    // the resulting numpy array should be the custom struct types that we
+    // expect for quantized types.
+    case TF_QINT8:
+      *out_pyarray_type = NPY_INT8;
+      break;
+    case TF_QUINT8:
+      *out_pyarray_type = NPY_UINT8;
+      break;
+    case TF_QINT16:
+      *out_pyarray_type = NPY_INT16;
+      break;
+    case TF_QUINT16:
+      *out_pyarray_type = NPY_UINT16;
+      break;
+    case TF_QINT32:
+      *out_pyarray_type = NPY_INT32;
+      break;
+    case TF_BFLOAT16:
+      *out_pyarray_type = NPY_UINT16;
+      break;
+    default:
+      return errors::Internal("Tensorflow type ", tf_datatype,
+                              " not convertible to numpy dtype.");
+  }
+  return Status::OK();
+}
+
+Status ArrayFromMemory(int dim_size, npy_intp* dims, void* data, DataType dtype,
+                       std::function<void()> destructor, PyObject** result) {
+  int size = 1;
+  for (int i = 0; i < dim_size; ++i) {
+    size *= dims[i];
+  }
+  if (dtype == DT_STRING || dtype == DT_RESOURCE || size == 0) {
+    return errors::FailedPrecondition(
+        "Cannot convert strings, resources, or empty Tensors.");
+  }
+
+  int type_num = -1;
+  Status s =
+      TF_DataType_to_PyArray_TYPE(static_cast<TF_DataType>(dtype), &type_num);
+  if (!s.ok()) {
+    return s;
+  }
+
+  PyObject* np_array =
+      PyArray_SimpleNewFromData(dim_size, dims, type_num, data);
+  if (PyType_Ready(&TensorReleaserType) == -1) {
+    return errors::Unknown("Python type initialization failed.");
+  }
+  TensorReleaser* releaser = reinterpret_cast<TensorReleaser*>(
+      TensorReleaserType.tp_alloc(&TensorReleaserType, 0));
+  releaser->destructor = new std::function<void()>(std::move(destructor));
+  if (PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(np_array),
+                            reinterpret_cast<PyObject*>(releaser)) == -1) {
+    Py_DECREF(releaser);
+    return errors::Unknown("Python array refused to use memory.");
+  }
+  *result = PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.h b/tensorflow/python/lib/core/ndarray_tensor_bridge.h
index 5890e1328e5205745a7174d9ab2d9c9c317ec496..029c0d3ef0ac5d1cc1ef30ad99e9c4ffb694f816 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.h
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.h
@@ -15,6 +15,15 @@ limitations under the License.
 #ifndef TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_BRIDGE_H_
 #define TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_BRIDGE_H_
 
+// Must be included first.
+#include "tensorflow/python/lib/core/numpy.h"
+
+#include <functional>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
 namespace tensorflow {
 
 // Destructor passed to TF_NewTensor when it reuses a numpy buffer. Stores a
@@ -26,6 +35,17 @@ void DelayedNumpyDecref(void* data, size_t len, void* obj);
 // holding the GIL.
 void ClearDecrefCache();
 
+// Creates a numpy array with shapes specified by dim_size and dims and content
+// in data. The array does not own the memory, and destructor will be called to
+// release it. If the status is not ok the caller is responsible for releasing
+// the memory.
+Status ArrayFromMemory(int dim_size, npy_intp* dims, void* data, DataType dtype,
+                       std::function<void()> destructor, PyObject** result);
+
+// Converts TF_DataType to the corresponding numpy type.
+Status TF_DataType_to_PyArray_TYPE(TF_DataType tf_datatype,
+                                   int* out_pyarray_type);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PYTHON_LIB_CORE_NDARRAY_TENSOR_BRIDGE_H_
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 040a4513caa775a657ac931fb92592be6b29b071..89e93a86a9af3f9455bb5294557f7c401100fe71 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -40,50 +41,6 @@ PyObject* GetPyTrampoline() {
   return py_trampoline;
 }
 
-// Returns the corresponding numpy dtype in 'np' for tf data type
-// 'tf'.  Returns an error if the type is not supported by this
-// module.
-Status TfDTypeToNpDType(const DataType& tf, int* np) {
-  switch (tf) {
-    case DT_FLOAT:
-      *np = NPY_FLOAT32;
-      break;
-    case DT_DOUBLE:
-      *np = NPY_FLOAT64;
-      break;
-    case DT_INT32:
-      *np = NPY_INT32;
-      break;
-    case DT_UINT8:
-      *np = NPY_UINT8;
-      break;
-    case DT_INT8:
-      *np = NPY_INT8;
-      break;
-    case DT_INT16:
-      *np = NPY_INT16;
-      break;
-    case DT_INT64:
-      *np = NPY_INT64;
-      break;
-    case DT_BOOL:
-      *np = NPY_BOOL;
-      break;
-    case DT_COMPLEX64:
-      *np = NPY_COMPLEX64;
-      break;
-    case DT_COMPLEX128:
-      *np = NPY_COMPLEX128;
-      break;
-    case DT_STRING:
-      *np = NPY_OBJECT;
-      break;
-    default:
-      return errors::Unimplemented("Unsupported tf type ", DataTypeString(tf));
-  }
-  return Status::OK();
-}
-
 // A call to the registered python function.
 struct PyCall {
   // Passed to python runtime to call the python function registered
@@ -172,6 +129,48 @@ bool IsSingleNone(PyObject* obj) {
   return item == Py_None;
 }
 
+// py.__class__.__name__
+const char* ClassName(PyObject* py) {
+/* PyPy doesn't have a separate C API for old-style classes. */
+#if PY_MAJOR_VERSION < 3 && !defined(PYPY_VERSION)
+  if (PyClass_Check(py))
+    return PyString_AS_STRING(
+        CHECK_NOTNULL(reinterpret_cast<PyClassObject*>(py)->cl_name));
+  if (PyInstance_Check(py))
+    return PyString_AS_STRING(CHECK_NOTNULL(
+        reinterpret_cast<PyInstanceObject*>(py)->in_class->cl_name));
+#endif
+  if (Py_TYPE(py) == &PyType_Type) {
+    return reinterpret_cast<PyTypeObject*>(py)->tp_name;
+  }
+  return Py_TYPE(py)->tp_name;
+}
+
+string PyExcFetch() {
+  CHECK(PyErr_Occurred()) << "Must only call PyExcFetch after an exception.";
+  PyObject* ptype;
+  PyObject* pvalue;
+  PyObject* ptraceback;
+  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
+  string err = ClassName(ptype);
+  if (pvalue) {
+    PyObject* str = PyObject_Str(pvalue);
+    if (str) {
+#if PY_MAJOR_VERSION < 3
+      strings::StrAppend(&err, ": ", PyString_AS_STRING(str));
+#else
+      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str));
+#endif
+      Py_DECREF(str);
+    }
+    Py_DECREF(pvalue);
+  }
+  Py_DECREF(ptype);
+  Py_XDECREF(ptraceback);
+  return err;
+}
+
 // Calls the registered py function through the trampoline.
 Status DoCallPyFunc(PyCall* call) {
   PyObject* trampoline = GetPyTrampoline();
@@ -189,11 +188,24 @@ Status DoCallPyFunc(PyCall* call) {
   Py_DECREF(args);
   if (result == nullptr) {
     if (PyErr_Occurred()) {
-      // TODO(zhifengc): Consider pretty-print error using LOG(STDERR).
-      PyErr_Print();
+      if (PyErr_ExceptionMatches(PyExc_ValueError) ||
+          PyErr_ExceptionMatches(PyExc_TypeError)) {
+        return errors::InvalidArgument(PyExcFetch());
+      } else if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+        return errors::OutOfRange(PyExcFetch());
+      } else if (PyErr_ExceptionMatches(PyExc_MemoryError)) {
+        return errors::ResourceExhausted(PyExcFetch());
+      } else if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) {
+        return errors::Unimplemented(PyExcFetch());
+      } else {
+        // TODO(ebrevdo): Check if exception is an OpError and use the
+        // OpError.error_code property to map it back in the Status.
+        return errors::Unknown(PyExcFetch());
+      }
+    } else {
+      return errors::Internal("Failed to run py callback ", call->token,
+                              ": see error log.");
     }
-    return errors::Internal("Failed to run py callback ", call->token,
-                            ": see error log.");
   }
 
   // Process the return values and converts them to tf Tensors.
@@ -249,7 +261,7 @@ class NumpyTensorBuffer : public TensorBuffer {
     proto->set_requested_bytes(rb);
     proto->set_allocator_name(tensorflow::cpu_allocator()->Name());
   }
-  Tensor MakeTensor(DataType dtype, TensorShape shape) {
+  Tensor MakeTensor(DataType dtype, const TensorShape& shape) {
     CHECK_EQ(len_, shape.num_elements() * DataTypeSize(dtype));
     return Tensor(dtype, shape, this);
   }
@@ -265,7 +277,7 @@ class NumpyTensorBuffer : public TensorBuffer {
 
 Status ConvertNdarrayToTensor(PyObject* obj, Tensor* ret) {
   PyArrayObject* input = reinterpret_cast<PyArrayObject*>(obj);
-  DataType dtype;
+  DataType dtype = DT_INVALID;
   TensorShape shape;
   for (int i = 0; i < PyArray_NDIM(input); ++i) {
     shape.AddDim(PyArray_SHAPE(input)[i]);
@@ -326,17 +338,27 @@ Status ConvertNdarrayToTensor(PyObject* obj, Tensor* ret) {
   return Status::OK();
 }
 
-// Creates a numpy array in 'ret' and copies the content of tensor 't'
-// into 'ret'.
+// Creates a numpy array in 'ret' which either aliases the content of 't' or has
+// a copy.
 Status ConvertTensorToNdarray(const Tensor& t, PyObject** ret) {
   int typenum = -1;
-  TF_RETURN_IF_ERROR(TfDTypeToNpDType(t.dtype(), &typenum));
+  TF_RETURN_IF_ERROR(TF_DataType_to_PyArray_TYPE(
+      static_cast<TF_DataType>(t.dtype()), &typenum));
   PyArray_Descr* descr = PyArray_DescrFromType(typenum);
   CHECK(descr);
   std::vector<npy_intp> dims;
   for (int i = 0; i < t.dims(); ++i) {
     dims.push_back(t.dim_size(i));
   }
+  Tensor* copy = new Tensor(t);
+  if (ArrayFromMemory(dims.size(), dims.data(),
+                      const_cast<char*>(copy->tensor_data().data()), t.dtype(),
+                      [copy]() { delete copy; }, ret)
+          .ok()) {
+    return Status::OK();
+  }
+  delete copy;
+
   PyObject* obj = PyArray_Empty(dims.size(), dims.data(), descr, 0);
   if (obj == nullptr) {
     return errors::Internal("Failed to allocate np array: ",
diff --git a/tensorflow/python/lib/core/strings.i b/tensorflow/python/lib/core/strings.i
index d2b05588826363ab4426df527045c45e1bf58636..b74eb91cd55259923803448b951221aebd77b61d 100644
--- a/tensorflow/python/lib/core/strings.i
+++ b/tensorflow/python/lib/core/strings.i
@@ -87,7 +87,7 @@ bool _BytesToStringPiece(PyObject* obj, tensorflow::StringPiece* result) {
   if (!temp_string_list) {
     SWIG_fail;
   }
-  tensorflow::Safe_PyObjectVector converted;
+  std::vector<tensorflow::Safe_PyObjectPtr> converted;
   converted.reserve(size);
   for (const string& op : $1) {
     // Always treat strings as bytes, consistent with the typemap
@@ -103,4 +103,3 @@ bool _BytesToStringPiece(PyObject* obj, tensorflow::StringPiece* result) {
   }
   $result = temp_string_list.release();
 }
-
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index ace03e3d1b190d3103c6584040d5110f90a94ed0..c212d2071f216881b58c6a2a37626eaebd3be4ca 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -24,9 +24,12 @@ from __future__ import print_function
 import os
 import uuid
 
+import six
+
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 
 
 class FileIO(object):
@@ -121,11 +124,51 @@ class FileIO(object):
       return self._prepare_value(
           pywrap_tensorflow.ReadFromStream(self._read_buf, length, status))
 
-  def seek(self, position):
-    """Seeks to the position in the file."""
+  @deprecation.deprecated_args(
+      None,
+      "position is deprecated in favor of the offset argument.",
+      "position")
+  def seek(self, offset=None, whence=0, position=None):
+    # TODO(jhseu): Delete later. Used to omit `position` from docs.
+    # pylint: disable=g-doc-args
+    """Seeks to the offset in the file.
+
+    Args:
+      offset: The byte count relative to the whence argument.
+      whence: Valid values for whence are:
+        0: start of the file (default)
+        1: relative to the current position of the file
+        2: relative to the end of file. offset is usually negative.
+    """
+    # pylint: enable=g-doc-args
     self._preread_check()
+    # We needed to make offset a keyword argument for backwards-compatibility.
+    # This check exists so that we can convert back to having offset be a
+    # positional argument.
+    # TODO(jhseu): Make `offset` a positional argument after `position` is
+    # deleted.
+    if offset is None and position is None:
+      raise TypeError("seek(): offset argument required")
+    if offset is not None and position is not None:
+      raise TypeError("seek(): offset and position may not be set "
+                      "simultaneously.")
+
+    if position is not None:
+      offset = position
+
     with errors.raise_exception_on_not_ok_status() as status:
-      ret_status = self._read_buf.Seek(position)
+      if whence == 0:
+        pass
+      elif whence == 1:
+        offset += self.tell()
+      elif whence == 2:
+        offset += self.size()
+      else:
+        raise errors.InvalidArgumentError(
+            None, None,
+            "Invalid whence argument: {}. Valid values are 0, 1, or 2."
+            .format(whence))
+      ret_status = self._read_buf.Seek(offset)
       pywrap_tensorflow.Set_TF_Status_from_Status(status, ret_status)
 
   def readline(self):
@@ -263,23 +306,33 @@ def write_string_to_file(filename, file_content):
 
 
 def get_matching_files(filename):
-  """Returns a list of files that match the given pattern.
+  """Returns a list of files that match the given pattern(s).
 
   Args:
-    filename: string, the pattern
+    filename: string or iterable of strings. The glob pattern(s).
 
   Returns:
-    Returns a list of strings containing filenames that match the given pattern.
+    A list of strings containing filenames that match the given pattern(s).
 
   Raises:
     errors.OpError: If there are filesystem / directory listing errors.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    # Convert each element to string, since the return values of the
-    # vector of string should be interpreted as strings, not bytes.
-    return [compat.as_str_any(matching_filename)
-            for matching_filename in pywrap_tensorflow.GetMatchingFiles(
-                compat.as_bytes(filename), status)]
+    if isinstance(filename, six.string_types):
+      return [
+          # Convert the filenames to string from bytes.
+          compat.as_str_any(matching_filename)
+          for matching_filename in pywrap_tensorflow.GetMatchingFiles(
+              compat.as_bytes(filename), status)
+      ]
+    else:
+      return [
+          # Convert the filenames to string from bytes.
+          compat.as_str_any(matching_filename)
+          for single_filename in filename
+          for matching_filename in pywrap_tensorflow.GetMatchingFiles(
+              compat.as_bytes(single_filename), status)
+      ]
 
 
 def create_dir(dirname):
diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py
index 72931217d9a4171e321f031260d4ecd9285a9eb1..e60b93b84fbb09e984a19f3e5a4920ac8e29dd28 100644
--- a/tensorflow/python/lib/io/file_io_test.py
+++ b/tensorflow/python/lib/io/file_io_test.py
@@ -129,6 +129,12 @@ class FileIoTest(test.TestCase):
     self.assertItemsEqual(
         file_io.get_matching_files(os.path.join(dir_path, "file*.txt")),
         expected_match)
+    self.assertItemsEqual(file_io.get_matching_files(tuple()), [])
+    files_subset = [
+        os.path.join(dir_path, files[0]), os.path.join(dir_path, files[2])
+    ]
+    self.assertItemsEqual(
+        file_io.get_matching_files(files_subset), files_subset)
     file_io.delete_recursively(dir_path)
     self.assertFalse(file_io.file_exists(os.path.join(dir_path, "file3.txt")))
 
@@ -392,6 +398,40 @@ class FileIoTest(test.TestCase):
     with self.assertRaises(errors.InvalidArgumentError):
       f.seek(-1)
 
+    with self.assertRaises(TypeError):
+      f.seek()
+
+    # TODO(jhseu): Delete after position deprecation.
+    with self.assertRaises(TypeError):
+      f.seek(offset=0, position=0)
+    f.seek(position=9)
+    self.assertEqual(9, f.tell())
+    self.assertEqual("testing2\n", f.readline())
+
+  def testSeekFromWhat(self):
+    file_path = os.path.join(self._base_dir, "temp_file")
+    with file_io.FileIO(file_path, mode="r+") as f:
+      f.write("testing1\ntesting2\ntesting3\n\ntesting5")
+    self.assertEqual("testing1\n", f.readline())
+    self.assertEqual(9, f.tell())
+
+    # Seek to 18
+    f.seek(9, 1)
+    self.assertEqual(18, f.tell())
+    self.assertEqual("testing3\n", f.readline())
+
+    # Seek back to 9
+    f.seek(9, 0)
+    self.assertEqual(9, f.tell())
+    self.assertEqual("testing2\n", f.readline())
+
+    f.seek(-f.size(), 2)
+    self.assertEqual(0, f.tell())
+    self.assertEqual("testing1\n", f.readline())
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      f.seek(0, 3)
+
   def testReadingIterator(self):
     file_path = os.path.join(self._base_dir, "temp_file")
     data = ["testing1\n", "testing2\n", "testing3\n", "\n", "testing5"]
diff --git a/tensorflow/python/lib/io/py_record_writer.cc b/tensorflow/python/lib/io/py_record_writer.cc
index 039e59756ec0d57c80f9d3da60ac5a607edc398b..df35c43c3d4abdd6a3e0bf7c1fb07bedf556afb7 100644
--- a/tensorflow/python/lib/io/py_record_writer.cc
+++ b/tensorflow/python/lib/io/py_record_writer.cc
@@ -37,18 +37,16 @@ PyRecordWriter* PyRecordWriter::New(const string& filename,
     return nullptr;
   }
   PyRecordWriter* writer = new PyRecordWriter;
-  writer->file_ = file.release();
+  writer->file_ = std::move(file);
 
   RecordWriterOptions options =
       RecordWriterOptions::CreateRecordWriterOptions(compression_type_string);
 
-  writer->writer_ = new RecordWriter(writer->file_, options);
+  writer->writer_.reset(new RecordWriter(writer->file_.get(), options));
   return writer;
 }
 
 PyRecordWriter::~PyRecordWriter() {
-  delete writer_;
-  delete file_;
 }
 
 bool PyRecordWriter::WriteRecord(tensorflow::StringPiece record) {
@@ -57,11 +55,19 @@ bool PyRecordWriter::WriteRecord(tensorflow::StringPiece record) {
   return s.ok();
 }
 
-void PyRecordWriter::Close() {
-  delete writer_;
-  delete file_;
-  writer_ = nullptr;
-  file_ = nullptr;
+void PyRecordWriter::Close(TF_Status* out_status) {
+  Status s = writer_->Close();
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+  writer_.reset(nullptr);
+  s = file_->Close();
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+  file_.reset(nullptr);
 }
 
 }  // namespace io
diff --git a/tensorflow/python/lib/io/py_record_writer.h b/tensorflow/python/lib/io/py_record_writer.h
index 483b7b9df04715fc4265779cff9829f09b731b61..8c53420ce687ab5878d1ffd7eae6579a48f112c5 100644
--- a/tensorflow/python/lib/io/py_record_writer.h
+++ b/tensorflow/python/lib/io/py_record_writer.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_PYTHON_LIB_IO_PY_RECORD_WRITER_H_
 #define TENSORFLOW_PYTHON_LIB_IO_PY_RECORD_WRITER_H_
 
+#include <memory>
+
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/macros.h"
@@ -42,13 +44,13 @@ class PyRecordWriter {
   ~PyRecordWriter();
 
   bool WriteRecord(tensorflow::StringPiece record);
-  void Close();
+  void Close(TF_Status* out_status);
 
  private:
   PyRecordWriter();
 
-  WritableFile* file_;        // Owned
-  io::RecordWriter* writer_;  // Owned
+  std::unique_ptr<io::RecordWriter> writer_;
+  std::unique_ptr<WritableFile> file_;
   TF_DISALLOW_COPY_AND_ASSIGN(PyRecordWriter);
 };
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 450b89e4c1678923334c74d69362cd66cc27742b..3d0cdc2153c211c6f6e804dfefda826c1dec2730 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -123,4 +123,5 @@ class TFRecordWriter(object):
 
   def close(self):
     """Close the file."""
-    self._writer.Close()
+    with errors.raise_exception_on_not_ok_status() as status:
+      self._writer.Close(status)
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 8594d811e89ecb318c20f0b9b52ba6ce04bf3eea..5c6d309e6c766b135089b3659b90687505114168 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -354,9 +354,15 @@ def _PreventGradientGrad(op, _):
 def _GatherGrad(op, grad):
   """Gradient for Gather op."""
   # params can be large, so colocate the shape calculation with it.
+  #
+  # params can be very large for sparse model, array_ops.shape raises
+  # exception on the Windows platform when any dimension is larger than
+  # int32. params_shape is not used in optimizer apply_sparse gradients,
+  # so it's fine to convert it back to int32 regardless of truncation.
   params = op.inputs[0]
   with ops.colocate_with(params):
-    params_shape = array_ops.shape(params)
+    params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
+    params_shape = math_ops.to_int32(params_shape)
 
   # Build appropriately shaped IndexedSlices
   indices = op.inputs[1]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 60057b9ab1e6430abefd80a2f1967c3309cee3de..f0dba04e44c68fdc96b2db823d06910886ba30df 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -84,7 +84,6 @@ from __future__ import print_function
 
 import sys
 import numpy as np
-import six
 
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
@@ -196,7 +195,8 @@ def broadcast_dynamic_shape(shape_x, shape_y):
 
   Args:
     shape_x: A rank 1 integer `Tensor`, representing the shape of x.
-    shape_y: A rank 1 integer `Tensor`, representing the shape of x.
+    shape_y: A rank 1 integer `Tensor`, representing the shape of y.
+
   Returns:
     A rank 1 integer `Tensor` representing the broadcasted shape.
   """
@@ -470,7 +470,10 @@ def _SliceHelper(tensor, slice_spec, var=None):
     else:
       begin.append(s)
       end.append(s + 1)
-      strides.append(1)
+      if isinstance(s, ops.Tensor):
+        strides.append(constant(1, s.dtype))
+      else:
+        strides.append(np.ones_like(s).dtype.type(1))
       shrink_axis_mask |= (1 << index)
     index += 1
 
@@ -557,7 +560,13 @@ def strided_slice(input_,
                   shrink_axis_mask=0,
                   var=None,
                   name=None):
-  """Extracts a strided slice from a tensor.
+  """Extracts a strided slice of a tensor (generalized python array indexing).
+
+  **Most users will want to use @{tf.Tensor.__getitem__} and
+  @{tf.Variable.__getitem__}.** That allows  NumPy style slicing syntax (i.e.
+  `tensor[..., 3:4:-1, tf.newaxis, 3]`).
+  This op is the low-level interface that are used to implement operators.
+  Those interfaces are much more friendly, and highly recommended.
 
   To a first order, this operation extracts a slice of size `end - begin`
   from a tensor `input`
@@ -654,19 +663,23 @@ def strided_slice(input_,
       new_axis_mask=new_axis_mask,
       shrink_axis_mask=shrink_axis_mask)
 
-  def assign(val):
+  parent_name = name
+
+  def assign(val, name=None):
     """Closure that holds all the arguments to create an assignment."""
 
     if var is None:
       raise ValueError("Sliced assignment is only supported for variables")
 
-    return gen_array_ops.strided_slice_assign(
-        ref=var,
+    if name is None:
+      name = parent_name + "_assign"
+
+    return var._strided_slice_assign(
         begin=begin,
         end=end,
         strides=strides,
         value=val,
-        name=name + "_assign",
+        name=name,
         begin_mask=begin_mask,
         end_mask=end_mask,
         ellipsis_mask=ellipsis_mask,
@@ -1151,13 +1164,14 @@ def sparse_mask(a, mask_indices, name=None):
 def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   """Splits a tensor into sub tensors.
 
-  If `num_or_size_splits` is a scalar, `num_split`, then splits `value` along
-  dimension `axis` into `num_split` smaller tensors.
+  If `num_or_size_splits` is an integer type, `num_split`, then splits `value`
+  along dimension `axis` into `num_split` smaller tensors.
   Requires that `num_split` evenly divides `value.shape[axis]`.
 
-  If `num_or_size_splits` is a tensor, `size_splits`, then splits `value` into
-  `len(size_splits)` pieces. The shape of the `i`-th piece has the same size as
-  the `value` except along dimension `axis` where the size is `size_splits[i]`.
+  If `num_or_size_splits` is not an integer type, it is presumed to be a Tensor
+  `size_splits`, then splits `value` into `len(size_splits)` pieces. The shape
+  of the `i`-th piece has the same size as the `value` except along dimension
+  `axis` where the size is `size_splits[i]`.
 
   For example:
 
@@ -1175,11 +1189,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
 
   Args:
     value: The `Tensor` to split.
-    num_or_size_splits: Either an integer indicating the number of splits along
-      split_dim or a 1-D Tensor containing the sizes of each output tensor
-      along split_dim. If an integer then it must evenly divide
-      `value.shape[axis]`; otherwise the sum of sizes along the split
-      dimension must match that of the `value`.
+    num_or_size_splits: Either a 0-D integer `Tensor` indicating the number of
+      splits along split_dim or a 1-D integer `Tensor` integer tensor containing
+      the sizes of each output tensor along split_dim. If a scalar then it must
+      evenly divide `value.shape[axis]`; otherwise the sum of sizes along the
+      split dimension must match that of the `value`.
     axis: A 0-D `int32` `Tensor`. The dimension along which to split.
       Must be in the range `[0, rank(value))`. Defaults to 0.
     num: Optional, used to specify the number of outputs when it cannot be
@@ -1195,11 +1209,11 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   Raises:
     ValueError: If `num` is unspecified and cannot be inferred.
   """
-  if isinstance(num_or_size_splits, six.integer_types):
+  size_splits = ops.convert_to_tensor(num_or_size_splits)
+  if size_splits.get_shape().ndims == 0 and size_splits.dtype.is_integer:
     return gen_array_ops._split(
         split_dim=axis, num_split=num_or_size_splits, value=value, name=name)
   else:
-    size_splits = ops.convert_to_tensor(num_or_size_splits)
     if num is None:
       size_splits_shape = size_splits.get_shape()
       num = size_splits_shape.dims[0]
@@ -1292,6 +1306,17 @@ def matrix_transpose(a, name="matrix_transpose"):
   # tf.matrix_transpose(x) is shape [1, 2, 4, 3]
   ```
 
+  Note that `tf.matmul` provides kwargs allowing for transpose of arguments.
+  This is done with minimal cost, and is preferable to using this function. E.g.
+
+  ```
+  # Good!  Transpose is taken at minimal additional cost.
+  tf.matmul(matrix, b, transpose_b=True)
+
+  # Inefficient!
+  tf.matmul(matrix, tf.matrix_transpose(b))
+  ```
+
   Args:
     a: A `Tensor` with `rank >= 2`.
     name: A name for the operation (optional).
@@ -1391,10 +1416,15 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
   """
   with ops.name_scope(name, "zeros_like", [tensor]) as name:
     tensor = ops.convert_to_tensor(tensor, name="tensor")
-    if dtype is not None and tensor.dtype != dtype:
-      ret = zeros(shape_internal(tensor, optimize=optimize), dtype, name=name)
-      ret.set_shape(tensor.get_shape())
-      return ret
+
+    if tensor.shape.is_fully_defined():
+      # We can produce a zeros tensor independent of the value of 'tensor',
+      # since the shape is known statically.
+      return zeros(tensor.shape, dtype=dtype or tensor.dtype, name=name)
+
+    if dtype is not None and dtype != tensor.dtype:
+      return zeros(shape_internal(tensor, optimize=optimize), dtype=dtype,
+                   name=name)
     else:
       return gen_array_ops._zeros_like(tensor, name=name)
 
@@ -1498,17 +1528,7 @@ def placeholder(dtype, shape=None, name=None):
     A `Tensor` that may be used as a handle for feeding a value, but not
     evaluated directly.
   """
-  shape = tensor_shape.as_shape(shape)
-  if shape.is_fully_defined():
-    dim_list = shape.as_list()
-  else:
-    dim_list = []
-  ret = gen_array_ops._placeholder(
-      dtype=dtype,
-      shape=dim_list,
-      name=name)
-  ret.set_shape(shape)
-  return ret
+  return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
 
 
 # pylint: disable=redefined-outer-name
@@ -1670,21 +1690,21 @@ def meshgrid(*args, **kwargs):
   results in
 
   ```prettyprint
-    X = [[1, 1, 1],
-         [2, 2, 2],
-         [3, 3, 3]]
-    Y = [[4, 5, 6],
-         [4, 5, 6],
-         [4, 5, 6]]
+    X = [[1, 2, 3],
+         [1, 2, 3],
+         [1, 2, 3]]
+    Y = [[4, 4, 4],
+         [5, 5, 5],
+         [6, 6, 6]]
   ```
 
   Args:
-    *args: `Tensor`s with rank 1
-    indexing: Either 'xy' or 'ij' (optional, default: 'xy')
+    *args: `Tensor`s with rank 1.
+    indexing: Either 'xy' or 'ij' (optional, default: 'xy').
     name: A name for the operation (optional).
 
   Returns:
-    outputs: A list of N `Tensor`s with rank N
+    outputs: A list of N `Tensor`s with rank N.
   """
 
   indexing = kwargs.pop("indexing", "xy")
@@ -1865,7 +1885,8 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"):
 @ops.RegisterGradient("FakeQuantWithMinMaxArgs")
 def _FakeQuantWithMinMaxArgsGradient(op, grad):
   """Gradient for FakeQuantWithMinMaxArgs op."""
-  return fake_quant_with_min_max_args_gradient(grad, op.inputs[0])
+  return fake_quant_with_min_max_args_gradient(
+      grad, op.inputs[0], min=op.get_attr("min"), max=op.get_attr("max"))
 
 
 @ops.RegisterGradient("FakeQuantWithMinMaxVars")
diff --git a/tensorflow/python/ops/batch_norm_benchmark.py b/tensorflow/python/ops/batch_norm_benchmark.py
index 397ed91078b1166c95809e03bb0d87f8d639f4b8..c2ee2b383231333239c6e2d4e874a0ad1cdf493e 100644
--- a/tensorflow/python/ops/batch_norm_benchmark.py
+++ b/tensorflow/python/ops/batch_norm_benchmark.py
@@ -198,7 +198,7 @@ class BatchNormBenchmark(test.Benchmark):
     if FLAGS.use_gpu:
       t1 = self._run_graph("gpu", shape, axes, 10, "op", True, True, 50)
       t2 = self._run_graph("gpu", shape, axes, 10, "py", True, True, 50)
-      t2 = self._run_graph("gpu", shape, axes, 10, "slow", True, True, 50)
+      t3 = self._run_graph("gpu", shape, axes, 10, "slow", True, True, 50)
       print_difference("op vs py", t1, t2)
       print_difference("py vs slow", t2, t3)
     print("Forward convolution (higher layers).")
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index 285c199b10988af7de88d9eaccd5e927828a3468..3053a333bfcd38b4cc74bc509af3b2baffe5be43 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -53,7 +53,9 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
     true_classes: A `Tensor` of type `int64` and shape `[batch_size,
       num_true]`. The target classes.
     num_true: An `int`.  The number of target classes per training example.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of classes to randomly sample. The
+      `sampled_candidates` return value will have shape `[num_sampled]`. If
+      `unique=True`, `num_sampled` must be less than or equal to `range_max`.
     unique: A `bool`. Determines whether all sampled classes in a batch are
       unique.
     range_max: An `int`. The number of possible classes.
@@ -61,8 +63,10 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
     name: A name for the operation (optional).
 
   Returns:
-    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.
-      The sampled classes.
+    sampled_candidates: A tensor of type `int64` and shape `[num_sampled]`.  The
+      sampled classes, either with possible duplicates (`unique=False`) or all
+      unique (`unique=True`). In either case, `sampled_candidates` is
+      independent of the true classes.
     true_expected_count: A tensor of type `float`.  Same shape as
       `true_classes`. The expected counts under the sampling distribution
       of each of `true_classes`.
@@ -111,7 +115,7 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
     true_classes: A `Tensor` of type `int64` and shape `[batch_size,
       num_true]`. The target classes.
     num_true: An `int`.  The number of target classes per training example.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of classes to randomly sample.
     unique: A `bool`. Determines whether all sampled classes in a batch are
       unique.
     range_max: An `int`. The number of possible classes.
@@ -166,7 +170,7 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
     true_classes: A `Tensor` of type `int64` and shape `[batch_size,
       num_true]`. The target classes.
     num_true: An `int`.  The number of target classes per training example.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of classes to randomly sample.
     unique: A `bool`. Determines whether all sampled classes in a batch are
       unique.
     range_max: An `int`. The number of possible classes.
@@ -230,7 +234,7 @@ def fixed_unigram_candidate_sampler(true_classes,
     true_classes: A `Tensor` of type `int64` and shape `[batch_size,
       num_true]`. The target classes.
     num_true: An `int`.  The number of target classes per training example.
-    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_sampled: An `int`.  The number of classes to randomly sample.
     unique: A `bool`. Determines whether all sampled classes in a batch are
       unique.
     range_max: An `int`. The number of possible classes.
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 8401f5493b8693ab4b8253c2384d0c239e6fe7db..753999a67288a571e527582a71887eeaac98aa9f 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -84,6 +84,22 @@ __all__ = [
 ]
 
 
+def _maybe_constant_value_string(t):
+  if not isinstance(t, ops.Tensor):
+    return str(t)
+  const_t = tensor_util.constant_value(t)
+  if const_t is not None:
+    return str(const_t)
+  return t
+
+
+def _assert_static(condition, data):
+  """Raises a static ValueError with as much information as possible."""
+  if not condition:
+    data_static = [_maybe_constant_value_string(x) for x in data]
+    raise ValueError('\n'.join(data_static))
+
+
 def assert_proper_iterable(values):
   """Static assert that values is a "proper" iterable.
 
@@ -140,7 +156,9 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
       data = [
-          message, 'Condition x < 0 did not hold element-wise: x = ', x.name, x]
+          message,
+          'Condition x < 0 did not hold element-wise:',
+          'x (%s) = ' % x.name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less(x, zero, data=data, summarize=summarize)
 
@@ -174,7 +192,8 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
       data = [
-          message, 'Condition x > 0 did not hold element-wise: x = ', x.name, x]
+          message, 'Condition x > 0 did not hold element-wise:',
+          'x (%s) = ' % x.name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less(zero, x, data=data, summarize=summarize)
 
@@ -210,7 +229,8 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
     if data is None:
       data = [
           message,
-          'Condition x >= 0 did not hold element-wise: x = ', x.name, x]
+          'Condition x >= 0 did not hold element-wise:',
+          'x (%s) = ' % x.name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less_equal(zero, x, data=data, summarize=summarize)
 
@@ -246,7 +266,8 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
     if data is None:
       data = [
           message,
-          'Condition x <= 0 did not hold element-wise: x = ', x.name, x]
+          'Condition x <= 0 did not hold element-wise:'
+          'x (%s) = ' % x.name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less_equal(x, zero, data=data, summarize=summarize)
 
@@ -284,10 +305,16 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
     if data is None:
       data = [
           message,
-          'Condition x == y did not hold element-wise: x = ', x.name, x, 'y = ',
-          y.name, y
+          'Condition x == y did not hold element-wise:',
+          'x (%s) = ' % x.name, x,
+          'y (%s) = ' % y.name, y
       ]
     condition = math_ops.reduce_all(math_ops.equal(x, y))
+    x_static = tensor_util.constant_value(x)
+    y_static = tensor_util.constant_value(y)
+    if x_static is not None and y_static is not None:
+      condition_static = (x_static == y_static).all()
+      _assert_static(condition_static, data)
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
@@ -326,9 +353,9 @@ def assert_none_equal(
     if data is None:
       data = [
           message,
-          'Condition x != y did not hold for every single element: x = ',
-          x.name, x,
-          'y = ', y.name, y
+          'Condition x != y did not hold for every single element:'
+          'x (%s) = ' % x.name, x,
+          'y (%s) = ' % y.name, y
       ]
     condition = math_ops.reduce_all(math_ops.not_equal(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -367,8 +394,8 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
     if data is None:
       data = [
           message,
-          'Condition x < y did not hold element-wise: x = ', x.name, x, 'y = ',
-          y.name, y
+          'Condition x < y did not hold element-wise:'
+          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
       ]
     condition = math_ops.reduce_all(math_ops.less(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -407,8 +434,8 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
     if data is None:
       data = [
           message,
-          'Condition x <= y did not hold element-wise: x = ', x.name, x, 'y = ',
-          y.name, y
+          'Condition x <= y did not hold element-wise:'
+          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
       ]
     condition = math_ops.reduce_all(math_ops.less_equal(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -447,8 +474,8 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
     if data is None:
       data = [
           message,
-          'Condition x > y did not hold element-wise: x = ', x.name, x, 'y = ',
-          y.name, y
+          'Condition x > y did not hold element-wise:'
+          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
       ]
     condition = math_ops.reduce_all(math_ops.greater(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -489,8 +516,8 @@ def assert_greater_equal(x, y, data=None, summarize=None, message=None,
     if data is None:
       data = [
           message,
-          'Condition x >= y did not hold element-wise: x = ', x.name, x, 'y = ',
-          y.name, y
+          'Condition x >= y did not hold element-wise:'
+          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
       ]
     condition = math_ops.reduce_all(math_ops.greater_equal(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 133528a1cd6e5388a2caf266899062d44d03be65..496c5addad0760105dbb6541c18dad508b9c6b01 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -49,7 +49,7 @@ def _SwitchGrad(op, *grad):
       # This is the second time this Switch is visited. It comes from
       # the non-exit branch of the Switch, so update the second input
       # to the Merge.
-      # TODO: Perform shape inference with this new input.
+      # TODO(yuanbyu): Perform shape inference with this new input.
       if grad[1] is not None:
         # pylint: disable=protected-access
         control_flow_ops._AddNextAndBackEdge(merge_grad, grad[1])
@@ -72,6 +72,9 @@ def _SwitchGrad(op, *grad):
     good_grad = grad[op_ctxt.branch]
     zero_grad = grad[1 - op_ctxt.branch]
     # At this point, we have created zero_grad guarded by the right switch.
+    # Unfortunately, we may still get None here for not trainable data types.
+    if zero_grad is None:
+      return None, None
     return merge([good_grad, zero_grad], name="cond_grad")[0], None
   else:
     false_grad = switch(grad[0], op.inputs[1])[0]
@@ -162,11 +165,14 @@ def _ExitGrad(op, grad):
     dense_shape = grad.dense_shape
     if dense_shape is not None:
       grad_ctxt.AddName(dense_shape.name)
-  enter_fn = control_flow_ops._Enter  # pylint: disable=protected-access
   grad_ctxt.Enter()
-  result = enter_fn(grad, grad_ctxt.name, is_constant=False,
-                    parallel_iterations=grad_ctxt.parallel_iterations,
-                    name="b_exit")
+  # pylint: disable=protected-access
+  result = control_flow_ops._Enter(
+      grad, grad_ctxt.name, is_constant=False,
+      parallel_iterations=grad_ctxt.parallel_iterations,
+      name="b_exit")
+  # pylint: enable=protected-access
+  grad_ctxt.loop_enters.append(result)
   grad_ctxt.Exit()
   return result
 
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index c4a27009c3c3ceb5ac119ba75ba4988d2dc45360..96ace6e79b4502d94df32ba92fb70fea53458e28 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -73,7 +73,9 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.gen_control_flow_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_should_use
 
 
 # We override the 'tuple' for a control flow op, so we keep python's
@@ -86,6 +88,7 @@ _basetuple = tuple
 
 # Assert and Print are special symbols in python, so we must
 # use an upper-case version of them.
+@tf_should_use.should_use_result
 def Assert(condition, data, summarize=None, name=None):
   """Asserts that the given condition is true.
 
@@ -275,7 +278,7 @@ def exit(data, name=None):
 def switch(data, pred, dtype=None, name=None):
   """Forwards `data` to an output determined by `pred`.
 
-  If `pred` is false, the `data` input is forwared to the first output.
+  If `pred` is false, the `data` input is forwarded to the first output.
   Otherwise, the data goes to the second output.
 
   This op handles `Tensor`s and `IndexedSlices`.
@@ -334,7 +337,7 @@ def _SwitchRefOrTensor(data, pred, name="Switch"):
     name: A name for this operation (optional).
 
   Returns:
-    `(output_false, output_false)`: If `pred` is true, data will be forwarded to
+    `(output_false, output_true)`: If `pred` is true, data will be forwarded to
     `output_true`, otherwise it goes to `output_false`.
 
   Raises:
@@ -426,10 +429,11 @@ def merge(inputs, name=None):
 # pylint: enable=protected-access
 
 
-def _convert_tensorarrays_to_flows(tensors_or_tensor_arrays):
-  return [ta.flow if isinstance(ta, tensor_array_ops.TensorArray)
-          else ta
-          for ta in tensors_or_tensor_arrays]
+def _convert_tensorarray_to_flow(tensor_or_tensor_array):
+  if isinstance(tensor_or_tensor_array, tensor_array_ops.TensorArray):
+    return tensor_or_tensor_array.flow
+  else:
+    return tensor_or_tensor_array
 
 
 def _make_tensor_array(ta, t_or_flow):
@@ -1047,15 +1051,19 @@ class ControlFlowState(object):
     """
     loop_exits = []
     for _, grad_state in self._map.items():
+      # pylint: disable=protected-access
       for y in grad_state.forward_loop_exits:
-        # pylint: disable=protected-access
         if pending_count[y.op._id] == 0:
           grad_state.pending_exits_count -= 1
           if y.op._id not in to_ops_set:
             grad_state.unused_exits.append(y)
           if grad_state.pending_exits_count == 0:
             loop_exits.extend(grad_state.unused_exits)
-        # pylint: enable=protected-access
+      # Need to include Enters in backprop for higher-order gradients.
+      for y in grad_state.forward_context.loop_enters:
+        if pending_count[y.op._id] == 0:
+          pending_count[y.op._id] = 1
+      # pylint: enable=protected-access
     return loop_exits
 
   def EnterGradWhileContext(self, op, before):
@@ -1302,11 +1310,15 @@ def ZerosLikeOutsideLoop(op, index):
     return array_ops.zeros_like(val, optimize=False)
   else:
     op_ctxt = op._get_control_flow_context()
-    pred = op_ctxt.pred
-    branch = op_ctxt.branch
-    switch_val = switch(op.inputs[0], pred)[1 - branch]
-    zeros_shape = array_ops.shape_internal(switch_val, optimize=False)
-    return array_ops.zeros(zeros_shape, dtype=val.dtype)
+    if op_ctxt:
+      # We are in a cond context. Use a switch to create zeros only when needed.
+      pred = op_ctxt.pred
+      branch = op_ctxt.branch
+      switch_val = switch(op.inputs[0], pred)[1 - branch]
+      zeros_shape = array_ops.shape_internal(switch_val, optimize=False)
+      return array_ops.zeros(zeros_shape, dtype=val.dtype)
+    else:
+      return array_ops.zeros_like(val, optimize=False)
 
 
 class ControlFlowContext(object):
@@ -1417,8 +1429,7 @@ class ControlFlowContext(object):
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
-      for x in result:
-        self._outer_context.AddName(x.name)
+      nest.map_structure(lambda x: self._outer_context.AddName(x.name), result)
 
   def GetWhileContext(self):
     """Return the while context containing this context."""
@@ -1454,6 +1465,14 @@ class ControlFlowContext(object):
     return internal_control_inputs
   # pylint: enable=protected-access
 
+  def AddInnerOp(self, op):
+    """Notifies a scope about an operator added to an inner scope."""
+    pass
+
+  def GetControlPivot(self):
+    """Returns the pivot node for this context, or None."""
+    return None
+
 
 class CondContext(ControlFlowContext):
   """The context for the conditional construct."""
@@ -1617,6 +1636,11 @@ class CondContext(ControlFlowContext):
           # pylint: enable=protected-access
       for x in op.outputs:
         self._values.add(x.name)
+      # pylint: disable=protected-access
+      if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
+        op._add_control_input(self._pivot.op)
+      # pylint: enable=protected-access
+
     if self._outer_context or not IsLoopExit(op):
       op.graph.prevent_fetching(op)
 
@@ -1637,73 +1661,100 @@ class CondContext(ControlFlowContext):
         real_val = external_val
     return real_val
 
+  def _BuildCondTensor(self, v):
+    if isinstance(v, ops.Operation):
+      # Use pivot as the proxy for this op.
+      return with_dependencies([v], self._pivot)
+    elif isinstance(v, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
+      values = self._ProcessOutputTensor(v.values)
+      indices = self._ProcessOutputTensor(v.indices)
+      if isinstance(v, ops.IndexedSlices):
+        dense_shape = v.dense_shape
+        if dense_shape is not None:
+          dense_shape = self._ProcessOutputTensor(dense_shape)
+        return ops.IndexedSlices(values, indices, dense_shape)
+      else:
+        dense_shape = self._ProcessOutputTensor(v.dense_shape)
+        return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    else:
+      v = nest.map_structure(_convert_tensorarray_to_flow, v)
+      return self._ProcessOutputTensor(ops.convert_to_tensor(v))
+
   def BuildCondBranch(self, fn):
     """Add the subgraph defined by fn() to the graph."""
-    r = fn()
-    original_r = r
-    result = []
-    if r is not None:
-      if not isinstance(r, list) and not isinstance(r, _basetuple):
-        r = [r]
-        original_r = [original_r]
-      r = _convert_tensorarrays_to_flows(r)
-      for v in r:
-        real_v = v
-        if isinstance(v, ops.Operation):
-          # Use pivot as the proxy for this op.
-          real_v = with_dependencies([v], self._pivot)
-        else:
-          if isinstance(v, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-            values = self._ProcessOutputTensor(v.values)
-            indices = self._ProcessOutputTensor(v.indices)
-            if isinstance(v, ops.IndexedSlices):
-              dense_shape = v.dense_shape
-              if dense_shape is not None:
-                dense_shape = self._ProcessOutputTensor(dense_shape)
-              real_v = ops.IndexedSlices(values, indices, dense_shape)
-            else:
-              dense_shape = self._ProcessOutputTensor(v.dense_shape)
-              real_v = sparse_tensor.SparseTensor(indices, values, dense_shape)
-          else:
-            real_v = self._ProcessOutputTensor(v)
-        result.append(real_v)
-    return original_r, result
+    original_result = fn()
+    if original_result is None:
+      return None, None
 
+    result = nest.map_structure(self._BuildCondTensor, original_result)
+    if not isinstance(result, (list, _basetuple)):
+      result = [result]
+    return original_result, result
 
-def cond(pred, fn1, fn2, name=None):
-  """Return either fn1() or fn2() based on the boolean predicate `pred`.
 
-  `fn1` and `fn2` both return lists of output tensors. `fn1` and `fn2` must have
-  the same non-zero number and type of outputs.
+def _UnpackIfSingleton(res):
+  if isinstance(res, (list, _basetuple)) and len(res) == 1:
+    return res[0]
+  else:
+    return res
+
+
+# pylint: disable=g-doc-args
+@deprecation.deprecated_args(
+    None,
+    "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
+    "fn1", "fn2")
+def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
+         fn1=None, fn2=None):
+  """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
+
+  `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
+  `false_fn` must have the same non-zero number and type of outputs.
 
   Note that the conditional execution applies only to the operations defined in
-  fn1 and fn2. Consider the following simple program:
+  `true_fn` and `false_fn`. Consider the following simple program:
 
   ```python
   z = tf.multiply(a, b)
   result = tf.cond(x < y, lambda: tf.add(x, z), lambda: tf.square(y))
   ```
 
-  If x < y, the `tf.add` operation will be executed and `tf.square`
-  operation will not be executed. Since z is needed for at least one
-  branch of the cond, the `tf.multiply` operation is always executed, unconditionally.
+  If `x < y`, the `tf.add` operation will be executed and `tf.square`
+  operation will not be executed. Since `z` is needed for at least one
+  branch of the `cond`, the `tf.multiply` operation is always executed,
+  unconditionally.
   Although this behavior is consistent with the dataflow model of TensorFlow,
   it has occasionally surprised some users who expected a lazier semantics.
 
+  Note that `cond` calls `true_fn` and `false_fn` *exactly once* (inside the
+  call to `cond`, and not at all during `Session.run()`). `cond`
+  stitches together the graph fragments created during the `true_fn` and
+  `false_fn` calls with some additional graph nodes to ensure that the right
+  branch gets executed depending on the value of `pred`.
+
+  `tf.cond` supports nested structures as implemented in
+  `tensorflow.python.util.nest`. Both `true_fn` and `false_fn` must return the
+  same (possibly nested) value structure of lists, tuples, and/or named tuples.
+  Singleton lists and tuples form the only exceptions to this: when returned by
+  `true_fn` and/or `false_fn`, they are implicitly unpacked to single values.
+  This behavior is disabled by passing `strict=True`.
+
   Args:
-    pred: A scalar determining whether to return the result of `fn1` or `fn2`.
-    fn1: The callable to be performed if pred is true.
-    fn2: The callable to be performed if pred is false.
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    strict: A boolean that enables/disables 'strict' mode; see above.
     name: Optional name prefix for the returned tensors.
 
   Returns:
-    Tensors returned by the call to either `fn1` or `fn2`. If the callables
-    return a singleton list, the element is extracted from the list.
+    Tensors returned by the call to either `true_fn` or `false_fn`. If the
+    callables return a singleton list, the element is extracted from the list.
 
   Raises:
-    TypeError: if `fn1` or `fn2` is not callable.
-    ValueError: if `fn1` and `fn2` do not return the same number of tensors, or
-                return tensors of different types.
+    TypeError: if `true_fn` or `false_fn` is not callable.
+    ValueError: if `true_fn` and `false_fn` do not return the same number of
+      tensors, or return tensors of different types.
 
   Example:
 
@@ -1718,12 +1769,30 @@ def cond(pred, fn1, fn2, name=None):
   ```
 
   """
-  with ops.name_scope(name, "cond", [pred]) as name:
-    if not callable(fn1):
-      raise TypeError("fn1 must be callable.")
-    if not callable(fn2):
-      raise TypeError("fn2 must be callable.")
+  # We needed to make true_fn/false_fn keyword arguments for
+  # backwards-compatibility. This check exists so that we can convert back to
+  # having them be positional arguments.
+  # TODO(josh11b): Make `true_fn` and `false_fn` positional arguments after
+  # `fn1` and `fn2` are deleted.
+  if fn1 is not None:
+    if true_fn is not None:
+      raise TypeError("cond(): true_fn and fn1 may not be set simultaneously.")
+    true_fn = fn1
+  elif true_fn is None:
+    raise TypeError("cond(): true_fn argument required")
+  if fn2 is not None:
+    if false_fn is not None:
+      raise TypeError("cond(): false_fn and fn2 may not be set simultaneously.")
+    false_fn = fn2
+  elif false_fn is None:
+    raise TypeError("cond(): false_fn argument required")
+
+  if not callable(true_fn):
+    raise TypeError("true_fn must be callable.")
+  if not callable(false_fn):
+    raise TypeError("false_fn must be callable.")
 
+  with ops.name_scope(name, "cond", [pred]) as name:
     # Add the Switch to the graph.
     if isinstance(pred, bool):
       raise TypeError("pred must not be a Python bool")
@@ -1738,23 +1807,43 @@ def cond(pred, fn1, fn2, name=None):
     # Build the graph for the true branch in a new context.
     context_t = CondContext(pred, pivot_1, branch=1)
     context_t.Enter()
-    orig_res, res_t = context_t.BuildCondBranch(fn1)
+    orig_res_t, res_t = context_t.BuildCondBranch(true_fn)
+    if orig_res_t is None:
+      raise ValueError("true_fn must have a return value.")
     context_t.ExitResult(res_t)
     context_t.Exit()
 
     # Build the graph for the false branch in a new context.
     context_f = CondContext(pred, pivot_2, branch=0)
     context_f.Enter()
-    _, res_f = context_f.BuildCondBranch(fn2)
+    orig_res_f, res_f = context_f.BuildCondBranch(false_fn)
+    if orig_res_f is None:
+      raise ValueError("false_fn must have a return value.")
     context_f.ExitResult(res_f)
     context_f.Exit()
 
+    if not strict:
+      orig_res_t = _UnpackIfSingleton(orig_res_t)
+      orig_res_f = _UnpackIfSingleton(orig_res_f)
+
+    # Check that the return values of the two branches have the same structure.
+    try:
+      nest.assert_same_structure(orig_res_t, orig_res_f)
+    except TypeError as e:
+      raise TypeError(
+          "Incompatible return types of true_fn and false_fn: {}".format(e))
+    except ValueError as e:
+      raise ValueError(
+          "Incompatible return values of true_fn and false_fn: {}".format(e))
+
     # Add the final merge to the graph.
-    if len(res_t) != len(res_f):
-      raise ValueError("fn1 and fn2 must return the same number of results.")
     if not res_t:
-      raise ValueError("fn1 and fn2 must return at least one result.")
-    for x, y in zip(res_t, res_f):
+      raise ValueError("true_fn and false_fn must return at least one result.")
+
+    res_t_flat = nest.flatten(res_t)
+    res_f_flat = nest.flatten(res_f)
+
+    for x, y in zip(res_t_flat, res_f_flat):
       assert ((isinstance(x, ops.IndexedSlices) and
                isinstance(y, ops.IndexedSlices)) or
               (isinstance(x, sparse_tensor.SparseTensor) and
@@ -1763,16 +1852,24 @@ def cond(pred, fn1, fn2, name=None):
       val_x = x if isinstance(x, ops.Tensor) else x.values
       val_y = y if isinstance(y, ops.Tensor) else y.values
       if val_x.dtype.base_dtype != val_y.dtype.base_dtype:
-        raise ValueError("Outputs of fn1 and fn2 must have the same type: "
-                         "%s, %s" % (val_x.dtype.name, val_y.dtype.name))
-    merges = [merge([x[0], x[1]])[0] for x in zip(res_f, res_t)]
-    merges = _convert_flows_to_tensorarrays(orig_res, merges)
+        raise ValueError(
+            "Outputs of true_fn and false_fn must have the same type: %s, %s" %
+            (val_x.dtype.name, val_y.dtype.name))
+
+    merges = [merge(pair)[0] for pair in zip(res_f_flat, res_t_flat)]
+    merges = _convert_flows_to_tensorarrays(nest.flatten(orig_res_t), merges)
 
     # Add to collections
     ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
     ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
 
-    return merges[0] if len(merges) == 1 else merges
+    merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges)
+
+    # Singleton lists and tuples are automatically unpacked if strict == False.
+    if not strict:
+      merges = _UnpackIfSingleton(merges)
+    return merges
+# pylint: enable=g-doc-args
 
 
 def _resource_safe_shape(t):
@@ -1844,6 +1941,8 @@ class WhileContext(ControlFlowContext):
     self._pivot = None
     # The list of exit tensors for loop variables.
     self._loop_exits = []
+    # The list of enter tensors for loop variables.
+    self._loop_enters = []
 
   def _init_from_proto(self, context_def, import_scope=None):
     """Creates a new `WhileContext` from protocol buffer.
@@ -1873,6 +1972,10 @@ class WhileContext(ControlFlowContext):
     self._loop_exits = [g.as_graph_element(
         ops.prepend_name_scope(exit_name, import_scope))
                         for exit_name in context_def.loop_exit_names]
+    # The list of enter tensors for loop variables.
+    self._loop_enters = [g.as_graph_element(
+        ops.prepend_name_scope(enter_name, import_scope))
+                         for enter_name in context_def.loop_enter_names]
     super(WhileContext, self).__init__(values_def=context_def.values_def,
                                        import_scope=import_scope)
 
@@ -1900,6 +2003,11 @@ class WhileContext(ControlFlowContext):
     """The boolean tensor representing the loop termination condition."""
     return self._pivot
 
+  @property
+  def loop_enters(self):
+    """The list of enter tensors for loop variables."""
+    return self._loop_enters
+
   @property
   def loop_exits(self):
     """The list of exit tensors for loop variables."""
@@ -1933,10 +2041,12 @@ class WhileContext(ControlFlowContext):
           self._pivot_for_body.name, export_scope)
       context_def.pivot_name = ops.strip_name_scope(
           self._pivot.name, export_scope)
-      if self._loop_exits:
-        context_def.loop_exit_names.extend(
-            [ops.strip_name_scope(l.name, export_scope)
-             for l in self._loop_exits])
+      context_def.loop_exit_names.extend(
+          [ops.strip_name_scope(l.name, export_scope)
+           for l in self._loop_exits])
+      context_def.loop_enter_names.extend(
+          [ops.strip_name_scope(l.name, export_scope)
+           for l in self._loop_enters])
       context_def.values_def.MergeFrom(
           super(WhileContext, self)._to_proto(
               export_scope=export_scope))
@@ -1997,6 +2107,8 @@ class WhileContext(ControlFlowContext):
         enter = _Enter(result, self._name, is_constant=True,
                        parallel_iterations=self._parallel_iterations)
         enter.graph.prevent_feeding(enter)
+        if self._outer_context:
+          self._outer_context.AddInnerOp(enter.op)
       # Fix the control inputs and control flow context of these enter ops.
       self._FixControlInputsAndContext([enter])
 
@@ -2066,11 +2178,19 @@ class WhileContext(ControlFlowContext):
       for x in op.outputs:
         op.graph.prevent_feeding(x)
 
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
   def _MaybeAddControlDependency(self, op):
     """Add a control input to the op if it only depends on loop invariants."""
     def _IsOpFree(op):
+      """Determines if `op` needs a control dependency."""
       if op.control_inputs:
         return False
+      # pylint: disable=protected-access
+      if op.graph._is_function(op.type) or op.type == "SymbolicGradient":
+        return True
+      # pylint: enable=protected-access
       for x in op.inputs:
         if not _IsLoopConstantEnter(x.op):
           return False
@@ -2111,6 +2231,8 @@ class WhileContext(ControlFlowContext):
     enter_n = _Enter(n, self._name, is_constant=False,
                      parallel_iterations=self._parallel_iterations,
                      name="f_count")
+    self.loop_enters.append(enter_n)
+
     merge_n = merge([enter_n, enter_n])[0]
     switch_n = switch(merge_n, self._pivot)
 
@@ -2151,6 +2273,8 @@ class WhileContext(ControlFlowContext):
     enter_count = _Enter(count, self._name, is_constant=False,
                          parallel_iterations=self._parallel_iterations,
                          name="b_count")
+    self.loop_enters.append(enter_count)
+
     merge_count = merge([enter_count, enter_count])[0]
     self._pivot_for_pred = merge_count
 
@@ -2238,6 +2362,8 @@ class WhileContext(ControlFlowContext):
     enter_acc = _Enter(acc, self._name, is_constant=False,
                        parallel_iterations=self._parallel_iterations,
                        name="b_acc")
+    self.loop_enters.append(enter_acc)
+
     merge_acc = merge([enter_acc, enter_acc], name="b_acc")[0]
     switch_acc_false, switch_acc_true = switch(merge_acc, self._pivot)
 
@@ -2245,10 +2371,10 @@ class WhileContext(ControlFlowContext):
     next_acc = _NextIteration(add_acc)
     merge_acc.op._update_input(1, next_acc)  # pylint: disable=protected-access
 
-    acc_result = exit(switch_acc_false, name="b_acc")
-    self.loop_exits.append(acc_result)
-    self.ExitResult([acc_result])
-    return acc_result
+    result_acc = exit(switch_acc_false, name="b_acc")
+    self.loop_exits.append(result_acc)
+    self.ExitResult([result_acc])
+    return result_acc
 
   def AddBackPropIndexedSlicesAccumulator(self, op, grad):
     """This is used for accumulating gradients that are IndexedSlices.
@@ -2305,6 +2431,8 @@ class WhileContext(ControlFlowContext):
     enter_acc = [_Enter(x, self._name, is_constant=False,
                         parallel_iterations=self._parallel_iterations,
                         name="b_acc") for x in init_acc]
+    self.loop_enters.extend(enter_acc)
+
     merge_acc = [merge([x, x], name="b_acc")[0] for x in enter_acc]
     switch_acc = [switch(x, self._pivot) for x in merge_acc]
 
@@ -2322,13 +2450,13 @@ class WhileContext(ControlFlowContext):
     for xm, xn in zip(merge_acc, next_acc):
       xm.op._update_input(1, xn)  # pylint: disable=protected-access
 
-    acc_exits = [exit(x[0], name="b_acc") for x in switch_acc]
-    self.loop_exits.extend(acc_exits)
+    exit_acc = [exit(x[0], name="b_acc") for x in switch_acc]
+    self.loop_exits.extend(exit_acc)
 
-    self.ExitResult(acc_exits)
+    self.ExitResult(exit_acc)
     return ops.IndexedSlices(
-        indices=acc_exits[0], values=acc_exits[1],
-        dense_shape=acc_exits[2] if shape_acc is not None else None)
+        indices=exit_acc[0], values=exit_acc[1],
+        dense_shape=exit_acc[2] if shape_acc is not None else None)
 
   def _InitializeValues(self, values):
     """Makes the values known to this context."""
@@ -2366,19 +2494,30 @@ class WhileContext(ControlFlowContext):
                     for x in real_vars]
       for x in enter_vars:
         x.graph.prevent_feeding(x)
+        if self._outer_context:
+          self._outer_context.AddInnerOp(x.op)
+
+    # Finds the closest enclosing non-None control pivot.
+    outer_context = self._outer_context
+    control_pivot = None
+    while outer_context is not None and control_pivot is None:
+      control_pivot = outer_context.GetControlPivot()
+      # pylint: disable=protected-access
+      outer_context = outer_context._outer_context
+      # pylint: enable=protected-access
 
-    if self._outer_context:
-      control_pivot = self._outer_context.GetControlPivot().op
+    if control_pivot is not None:
       for var in enter_vars:
         if _IsLoopConstantEnter(var.op.inputs[0].op):
           # pylint: disable=protected-access
-          var.op._add_control_input(control_pivot)
+          var.op._add_control_input(control_pivot.op)
           # pylint: enable=protected-access
     _SetShapeInvariants(real_vars, enter_vars, shape_invariants)
 
     # Fix the control inputs and control flow context of these enter ops.
     self._FixControlInputsAndContext(enter_vars)
     self._InitializeValues(enter_vars)
+    self._loop_enters = enter_vars
 
     merge_vars = [merge([x, x])[0] for x in enter_vars]
     self._pivot_for_pred = merge_vars[0]
@@ -2415,8 +2554,8 @@ class WhileContext(ControlFlowContext):
     # Store body_result to keep track of TensorArrays returned by body
     original_body_result = body_result
     # Convert TensorArrays returned by body into their flow variables
-    flat_result = nest.flatten(body_result)
-    result = _convert_tensorarrays_to_flows(flat_result)
+    result = nest.map_structure(_convert_tensorarray_to_flow,
+                                nest.flatten(body_result))
     result = ops.convert_n_to_tensor_or_indexed_slices(result)
 
     # Add NextIteration and the back edges to complete the loop.
@@ -2446,9 +2585,9 @@ class WhileContext(ControlFlowContext):
 
     # Keep original_loop_vars to identify which are TensorArrays
     original_loop_vars = loop_vars
-    flat_loop_vars = nest.flatten(loop_vars)
     # Convert TensorArrays to their flow variables
-    loop_vars = _convert_tensorarrays_to_flows(flat_loop_vars)
+    loop_vars = nest.map_structure(_convert_tensorarray_to_flow,
+                                   nest.flatten(loop_vars))
     loop_vars = ops.convert_n_to_tensor_or_indexed_slices(loop_vars)
     try:
       self.Enter()
@@ -2504,12 +2643,16 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
   `cond` and `body`. `cond` and `body` both take as many arguments as there are
   `loop_vars`.
 
-  While `cond` evaluates to true, `body` is executed.
-
   In addition to regular Tensors or IndexedSlices, the body may accept and
   return TensorArray objects.  The flows of the TensorArray objects will
   be appropriately forwarded between loops and during gradient calculations.
 
+  Note that `while_loop` calls `cond` and `body` *exactly once* (inside the
+  call to `while_loop`, and not at all during `Session.run()`). `while_loop`
+  stitches together the graph fragments created during the `cond` and `body`
+  calls with some additional graph nodes to make something the repeats
+  `body` until `cond` returns false.
+
   For correctness, `tf.while_loop()` strictly enforces shape invariants for
   the loop variables. A shape invariant is a (possibly partial) shape that
   is unchanged across the iterations of the loop. An error will be raised
@@ -2820,7 +2963,7 @@ def tuple(tensors, name=None, control_inputs=None):
     return tpl
 
 
-def case(pred_fn_pairs, default, exclusive=False, name="case"):
+def case(pred_fn_pairs, default, exclusive=False, strict=False, name="case"):
   """Create a case operation.
 
   The `pred_fn_pairs` parameter is a dict or list of pairs of size N.
@@ -2837,6 +2980,18 @@ def case(pred_fn_pairs, default, exclusive=False, name="case"):
   are returned immediately. If none of the predicates evaluate to True, this
   operation returns the tensors generated by `default`.
 
+  `tf.case` supports nested structures as implemented in
+  `tensorflow.python.util.nest`. All of the callables must return the same
+  (possibly nested) value structure of lists, tuples, and/or named tuples.
+  Singleton lists and tuples form the only exceptions to this: when returned by
+  a callable, they are implicitly unpacked to single values. This
+  behavior is disabled by passing `strict=True`.
+
+  If an unordered dictionary is used for `pred_fn_pairs`, the order of the
+  conditional tests is not guaranteed. However, the order is guaranteed to be
+  deterministic, so that variables created in conditional branches are created
+  in fixed order across runs.
+
   Example 1:
     Pseudocode:
     ```
@@ -2862,9 +3017,6 @@ def case(pred_fn_pairs, default, exclusive=False, name="case"):
 
     Expressions:
     ```
-      x = tf.constant(0)
-      y = tf.constant(1)
-      z = tf.constant(2)
       def f1(): return tf.constant(17)
       def f2(): return tf.constant(23)
       def f3(): return tf.constant(-1)
@@ -2877,6 +3029,7 @@ def case(pred_fn_pairs, default, exclusive=False, name="case"):
                    callable which returns a list of tensors.
     default: A callable that returns a list of tensors.
     exclusive: True iff at most one predicate is allowed to evaluate to `True`.
+    strict: A boolean that enables/disables 'strict' mode; see above.
     name: A name for this operation (optional).
 
   Returns:
@@ -2894,11 +3047,14 @@ def case(pred_fn_pairs, default, exclusive=False, name="case"):
           or isinstance(pfp, dict)):
     raise TypeError("fns must be a list, tuple, or dict")
   if isinstance(pfp, dict):
-    pfp = pfp.items()
-    if not exclusive:
-      logging.warn("%s: Provided dictionary of predicate/fn pairs, but "
-                   "exclusive=False.  Order of conditional tests is "
-                   "not guaranteed.", name)
+    if isinstance(pfp, collections.OrderedDict):
+      pfp = pfp.items()
+    else:
+      pfp = sorted(pfp.items(), key=lambda item: item[0].name)
+      if not exclusive:
+        logging.warn("%s: An unordered dictionary of predicate/fn pairs was "
+                     "provided, but exclusive=False. The order of conditional "
+                     "tests is deterministic but not guaranteed.", name)
   for tup in pfp:
     if not isinstance(tup, _basetuple) or len(tup) != 2:
       raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
@@ -2941,20 +3097,31 @@ def case(pred_fn_pairs, default, exclusive=False, name="case"):
 
     # Create an empty tensor, or list, with the right type and shape
     with ops.name_scope("case_create_empty"):
-      dummy_value = default()
+      def _create_empty_constant(dtype, shape):
+        value = ("" if dtype == dtypes.string else dtype.as_numpy_dtype())
+        if shape.ndims is None:
+          return array_ops.constant(value, dtype=dtype)
+        else:
+          temp_shape = [1 if x.value is None else x.value for x in shape]
+          result = array_ops.constant(value, shape=temp_shape, dtype=dtype)
+          result._shape = shape  # pylint: disable=protected-access
+          return result
+
       def _correct_empty(v):
         if isinstance(v, ops.Operation):
           return no_op()
-        elif v.dtype == dtypes.string:
-          return array_ops.constant("")
+        elif isinstance(v, tensor_array_ops.TensorArray):
+          return v
+        elif not hasattr(v, "dtype"):
+          return ops.convert_to_tensor(v)
+        elif isinstance(v, sparse_tensor.SparseTensor):
+          return sparse_tensor.SparseTensor(indices=[[0] * len(v.get_shape())],
+                                            values=[v.dtype.as_numpy_dtype()],
+                                            dense_shape=v.get_shape())
         else:
-          return array_ops.constant(v.dtype.as_numpy_dtype())
+          return _create_empty_constant(v.dtype, v.get_shape())
 
-      if isinstance(dummy_value, collections.Sequence):
-        dummy_type = type(dummy_value)
-        empty = lambda: dummy_type(_correct_empty(v) for v in dummy_value)
-      else:
-        empty = lambda: _correct_empty(dummy_value)
+      empty = lambda: nest.map_structure(_correct_empty, default())
 
     # case_sequence = [
     #   cond(~p3 & ~p2 & ~p1, default, empty),
@@ -2972,7 +3139,7 @@ def case(pred_fn_pairs, default, exclusive=False, name="case"):
         prev_case = cond(
             cp, fn,
             empty if i == 0 else lambda: prev_case,
-            name="If_%d" % i)
+            strict=strict, name="If_%d" % i)
       return prev_case
 
     if exclusive:
@@ -2994,6 +3161,8 @@ def case(pred_fn_pairs, default, exclusive=False, name="case"):
     else:
       case_seq = _build_case()
 
+    if not strict:
+      case_seq = _UnpackIfSingleton(case_seq)
     return case_seq
 
 
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 9037dd042ddfbb36bf492f96220e1c3d62c9e3aa..4e95783e5a81f01499bb3d164683d34de258b9b9 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -18,11 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import numpy as np
+
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework.test_util import TensorFlowTestCase
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -37,9 +42,14 @@ from tensorflow.python.ops import variables
 import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import googletest
 from tensorflow.python.training import momentum
+from tensorflow.python.util import nest
 from tensorflow.python.util.protobuf import compare
 
 
+TestTuple = collections.namedtuple("TestTuple", "a b")
+SingletonTestTuple = collections.namedtuple("SingletonTestTuple", "a")
+
+
 class GroupTestCase(TensorFlowTestCase):
 
   def _StripNode(self, nd):
@@ -303,6 +313,79 @@ class SwitchTestCase(TensorFlowTestCase):
         self.assertEquals(o, 6)
         self.assertAllEqual(grad, [1] * 3)
 
+  def testGradientThroughSingleBranchOutsideOfContext(self):
+    with self.test_session():
+      x = constant_op.constant(2.)
+      s = constant_op.constant(True)
+      x_false, x_true = control_flow_ops.switch(x, s)
+      grad_x_true = gradients_impl.gradients(x_true, x)[0]
+      grad_x_false = gradients_impl.gradients(x_false, x)[0]
+      self.assertEquals(grad_x_true.eval(), 1.)
+      self.assertEquals(grad_x_false.eval(), 0.)
+
+
+class CondTest(TensorFlowTestCase):
+
+  def testCondTrue(self):
+    with self.test_session():
+      x = constant_op.constant(2)
+      y = constant_op.constant(5)
+      z = control_flow_ops.cond(
+          math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
+          lambda: math_ops.add(y, 23))
+      self.assertEquals(z.eval(), 34)
+
+  def testCondFalse(self):
+    with self.test_session():
+      x = constant_op.constant(2)
+      y = constant_op.constant(1)
+      z = control_flow_ops.cond(
+          math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
+          lambda: math_ops.add(y, 23))
+      self.assertEquals(z.eval(), 24)
+
+  def testCondTrueLegacy(self):
+    with self.test_session():
+      x = constant_op.constant(2)
+      y = constant_op.constant(5)
+      z = control_flow_ops.cond(
+          math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
+          fn2=lambda: math_ops.add(y, 23))
+      self.assertEquals(z.eval(), 34)
+
+  def testCondFalseLegacy(self):
+    with self.test_session():
+      x = constant_op.constant(2)
+      y = constant_op.constant(1)
+      z = control_flow_ops.cond(
+          math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
+          fn2=lambda: math_ops.add(y, 23))
+      self.assertEquals(z.eval(), 24)
+
+  def testCondMissingArg1(self):
+    with self.test_session():
+      x = constant_op.constant(1)
+      with self.assertRaises(TypeError):
+        control_flow_ops.cond(True, false_fn=lambda: x)
+
+  def testCondMissingArg2(self):
+    with self.test_session():
+      x = constant_op.constant(1)
+      with self.assertRaises(TypeError):
+        control_flow_ops.cond(True, lambda: x)
+
+  def testCondDuplicateArg1(self):
+    with self.test_session():
+      x = constant_op.constant(1)
+      with self.assertRaises(TypeError):
+        control_flow_ops.cond(True, lambda: x, lambda: x, fn1=lambda: x)
+
+  def testCondDuplicateArg2(self):
+    with self.test_session():
+      x = constant_op.constant(1)
+      with self.assertRaises(TypeError):
+        control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
+
 
 class ContextTest(TensorFlowTestCase):
 
@@ -334,5 +417,356 @@ class ContextTest(TensorFlowTestCase):
               control_flow_ops.WhileContext.from_proto(c.to_proto()).to_proto())
 
 
+def _GetNestedShape(nested):
+  def _GetShape(tensor):
+    if isinstance(tensor, tensor_array_ops.TensorArray):
+      return tensor_array_ops.TensorArray
+    elif isinstance(tensor, ops.IndexedSlices):
+      return tensor.dense_shape
+    else:
+      return tensor.get_shape()
+
+  return nest.map_structure(_GetShape, nested)
+
+
+def _CreateTensorArray(size, shape):
+  ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=size,
+                                    clear_after_read=False)
+  for i in range(size):
+    ta = ta.write(i, array_ops.zeros(shape))
+  return ta
+
+
+def _RawNestedShape(nested_shape):
+  def _RawShape(shape):
+    if isinstance(shape, tensor_shape.TensorShape) and shape.ndims is not None:
+      return [x.value for x in shape]
+    else:
+      return None
+  return nest.map_structure(_RawShape, nested_shape)
+
+
+# TODO(yori): Add tests for indexed slices.
+class DataTypesTest(TensorFlowTestCase):
+
+  def assertAllEqualNested(self, a, b):
+    if isinstance(a, (list, tuple)):
+      for entry_a, entry_b in zip(a, b):
+        self.assertAllEqualNested(entry_a, entry_b)
+    else:
+      self.assertAllEqual(a, b)
+
+  def _testShape(self, fn_true, fn_false, expected_shape,
+                 strict=False):
+    condition = array_ops.placeholder(dtypes.bool)
+    output_cond = control_flow_ops.cond(condition, fn_true, fn_false,
+                                        strict=strict)
+    self.assertEqual(_RawNestedShape(_GetNestedShape(output_cond)),
+                     _RawNestedShape(expected_shape))
+
+    output_case = control_flow_ops.case([(condition, fn_true)], fn_false,
+                                        strict=strict)
+    self.assertEqual(_RawNestedShape(_GetNestedShape(output_case)),
+                     _RawNestedShape(expected_shape))
+
+  def _testReturnValues(self, fn_true, fn_false, expected_value_true,
+                        expected_value_false, strict=False,
+                        check_cond=True):
+    condition = array_ops.placeholder(dtypes.bool)
+    output_cond = control_flow_ops.cond(condition, fn_true, fn_false,
+                                        strict=strict)
+    output_case = control_flow_ops.case([(condition, fn_true)], fn_false,
+                                        strict=strict)
+
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      result_cond, result_case = sess.run([output_cond, output_case],
+                                          feed_dict={condition: True})
+      self.assertAllEqualNested(result_cond, expected_value_true)
+      if check_cond:
+        self.assertAllEqualNested(result_case, expected_value_true)
+      result_cond, result_case = sess.run([output_cond, output_case],
+                                          feed_dict={condition: False})
+      self.assertAllEqualNested(result_cond, expected_value_false)
+      if check_cond:
+        self.assertAllEqualNested(result_case, expected_value_false)
+
+  def test_int(self):
+    shape = tensor_shape.TensorShape([])
+    fn_true = lambda: 1
+    fn_false = lambda: 2
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, 1, 2)
+    self._testShape(fn_true, fn_false, shape, strict=True)
+    self._testReturnValues(fn_true, fn_false, 1, 2, strict=True)
+
+  def test_float(self):
+    shape = tensor_shape.TensorShape([])
+    fn_true = lambda: 1.0
+    fn_false = lambda: 2.0
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, 1.0, 2.0)
+
+  def test_noop(self):
+    shape = tensor_shape.TensorShape(None)
+    self._testShape(control_flow_ops.no_op, control_flow_ops.no_op, shape)
+    self._testReturnValues(control_flow_ops.no_op, control_flow_ops.no_op,
+                           True, False, check_cond=False)
+
+  def test_string(self):
+    shape = tensor_shape.TensorShape([])
+    fn_true = lambda: "abc"
+    fn_false = lambda: "xyz"
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, b"abc", b"xyz")
+
+  def test_variable(self):
+    shape = tensor_shape.TensorShape([])
+    fn_true = lambda: variables.Variable(3.0)
+    fn_false = lambda: variables.Variable(4.0)
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, 3.0, 4.0)
+
+  def test_none(self):
+    fn_none = lambda: None
+    fn_tensor = lambda: constant_op.constant(1)
+
+    with self.assertRaises(ValueError):
+      control_flow_ops.cond(constant_op.constant(True), fn_none, fn_tensor)
+
+    with self.assertRaises(ValueError):
+      control_flow_ops.cond(constant_op.constant(True), fn_tensor, fn_none)
+
+  def test_tensors(self):
+    def _BuildTrueBranch(dtype):
+      def _Build():
+        return (array_ops.zeros([2, 2], dtype=dtype),
+                array_ops.ones([3, 3], dtype=dtype))
+      return _Build
+
+    def _BuildFalseBranch(dtype):
+      def _Build():
+        return (array_ops.ones([2, 2], dtype=dtype),
+                array_ops.zeros([3, 3], dtype=dtype))
+      return _Build
+
+    for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
+      shape = (tensor_shape.TensorShape([2, 2]),
+               tensor_shape.TensorShape([3, 3]))
+      fn_true = _BuildTrueBranch(dtype)
+      fn_false = _BuildFalseBranch(dtype)
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false,
+                             (np.zeros([2, 2]), np.ones([3, 3])),
+                             (np.ones([2, 2]), np.zeros([3, 3])))
+
+  def test_tensors_unknown_shape(self):
+    def _BuildTrueBranch(dtype):
+      def _Build():
+        tensor = array_ops.zeros([2, 2], dtype=dtype)
+        tensor._shape = tensor_shape.TensorShape(None)
+        return tensor
+      return _Build
+
+    def _BuildFalseBranch(dtype):
+      def _Build():
+        tensor = array_ops.ones([2, 2], dtype=dtype)
+        tensor._shape = tensor_shape.TensorShape(None)
+        return tensor
+      return _Build
+
+    for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
+      shape = tensor_shape.TensorShape(None)
+      fn_true = _BuildTrueBranch(dtype)
+      fn_false = _BuildFalseBranch(dtype)
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false,
+                             np.zeros([2, 2]), np.ones([2, 2]))
+
+  def test_sparse_tensors(self):
+    shape = tensor_shape.TensorShape([None, None])
+
+    def FnTrue():
+      return [sparse_tensor.SparseTensor(indices=[[0, 0], [1, 2]],
+                                         values=[1, 2], dense_shape=[3, 4])]
+
+    def FnFalse():
+      return [sparse_tensor.SparseTensor(indices=[[0, 0], [2, 1]],
+                                         values=[3, 4], dense_shape=[3, 4])]
+
+    value1 = sparse_tensor.SparseTensorValue(indices=[[0, 0], [1, 2]],
+                                             values=[1, 2], dense_shape=[3, 4])
+    value2 = sparse_tensor.SparseTensorValue(indices=[[0, 0], [2, 1]],
+                                             values=[3, 4], dense_shape=[3, 4])
+    self._testShape(FnTrue, FnFalse, shape)
+    self._testReturnValues(FnTrue, FnFalse, value1, value2)
+    self._testShape(FnTrue, FnFalse, [shape], strict=True)
+    self._testReturnValues(FnTrue, FnFalse, [value1], [value2], strict=True)
+
+  def test_tensors_with_partially_specified_shapes(self):
+    def _BuildBranch(dtype, shape):
+      def _Build():
+        a = array_ops.zeros([2, 2], dtype=dtype)
+        b = array_ops.zeros([5], dtype=dtype)
+        c = array_ops.ones([3, 3], dtype=dtype)
+        a._shape = tensor_shape.TensorShape(shape[0])
+        b._shape = tensor_shape.TensorShape(shape[1])
+        c._shape = tensor_shape.TensorShape(shape[2])
+        return a, b, c
+      return _Build
+
+    for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
+      shape = (tensor_shape.TensorShape([None, 2]),
+               tensor_shape.TensorShape([None]),
+               tensor_shape.TensorShape([3, None]))
+      fn_true = _BuildBranch(dtype, shape)
+      fn_false = _BuildBranch(dtype, shape)
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false,
+                             (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])),
+                             (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])))
+
+  def test_tensor_arrays(self):
+    element_shape = tensor_shape.TensorShape([2])
+    ta1 = _CreateTensorArray(4, element_shape)
+    ta2 = _CreateTensorArray(4, element_shape)
+    shape = tensor_array_ops.TensorArray
+    fn_true = lambda: ta1
+    fn_false = lambda: ta2
+    self._testShape(fn_true, fn_false, shape)
+
+  def test_tensor_array_reads(self):
+    shape = tensor_shape.TensorShape([2])
+    ta = _CreateTensorArray(4, shape)
+    fn_true = lambda: ta.read(0)
+    fn_false = lambda: ta.read(1)
+    self._testShape(fn_true, fn_false, shape)
+
+  def test_list(self):
+    shape = [tensor_shape.TensorShape([]), tensor_shape.TensorShape([]),
+             tensor_shape.TensorShape([])]
+    fn_true = lambda: [constant_op.constant(1), 2, variables.Variable(3.0)]
+    fn_false = lambda: [constant_op.constant(3), 4, variables.Variable(5.0)]
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, [1, 2, 3.0], [3, 4, 5.0])
+
+  def test_non_strict(self):
+    shape = tensor_shape.TensorShape([])
+    fn_tensor = lambda: constant_op.constant(1)
+    fn_list = lambda: [constant_op.constant(2)]
+    fn_tuple = lambda: (constant_op.constant(3),)
+    self._testShape(fn_tensor, fn_list, shape)
+    self._testShape(fn_tensor, fn_tuple, shape)
+    self._testShape(fn_list, fn_tuple, shape)
+    self._testReturnValues(fn_tensor, fn_list, 1, 2)
+    self._testReturnValues(fn_tensor, fn_tuple, 1, 3)
+    self._testReturnValues(fn_list, fn_tuple, 2, 3)
+
+  def test_singleton_strict(self):
+    fn_tensor = lambda: constant_op.constant(1)
+    fn_list = lambda: [constant_op.constant(2)]
+    fn_tuple = lambda: (constant_op.constant(3),)
+
+    with self.assertRaises(ValueError):
+      control_flow_ops.cond(constant_op.constant(True), fn_tensor, fn_list,
+                            strict=True)
+
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(constant_op.constant(True), fn_list, fn_tuple,
+                            strict=True)
+
+    with self.assertRaises(ValueError):
+      control_flow_ops.case([(constant_op.constant(True), fn_tensor)], fn_list,
+                            strict=True)
+
+    with self.assertRaises(TypeError):
+      control_flow_ops.case([(constant_op.constant(True), fn_list)], fn_tuple,
+                            strict=True)
+
+  def test_singleton_list(self):
+    shape = tensor_shape.TensorShape([])
+    fn_true = lambda: [constant_op.constant(1)]
+    fn_false = lambda: [constant_op.constant(3)]
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, 1, 3)
+    self._testShape(fn_true, fn_false, [shape], strict=True)
+    self._testReturnValues(fn_true, fn_false, [1], [3], strict=True)
+
+  def test_singleton_tuple(self):
+    shape = tensor_shape.TensorShape([])
+    fn_true = lambda: (constant_op.constant(1),)
+    fn_false = lambda: (constant_op.constant(3),)
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, 1, 3)
+    self._testShape(fn_true, fn_false, (shape,), strict=True)
+    self._testReturnValues(fn_true, fn_false, (1,), (3,),
+                           strict=True)
+
+  def test_singleton_namedtuple(self):
+    shape = tensor_shape.TensorShape([])
+    fn_true = lambda: SingletonTestTuple(constant_op.constant(1))
+    fn_false = lambda: SingletonTestTuple(constant_op.constant(3))
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, 1, 3)
+    self._testShape(fn_true, fn_false, SingletonTestTuple(shape),
+                    strict=True)
+    self._testReturnValues(fn_true, fn_false, SingletonTestTuple(1),
+                           SingletonTestTuple(3), strict=True)
+
+  def test_tuple(self):
+    shape = (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+    fn_true = lambda: (constant_op.constant(1), 2)
+    fn_false = lambda: (constant_op.constant(3), 4)
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, (1, 2), (3, 4))
+
+  def test_namedtuple(self):
+    shape = TestTuple(tensor_shape.TensorShape([]),
+                      tensor_shape.TensorShape([]))
+    fn_true = lambda: TestTuple(constant_op.constant(1), 2)
+    fn_false = lambda: TestTuple(constant_op.constant(3), 4)
+    self._testShape(fn_true, fn_false, shape)
+    self._testReturnValues(fn_true, fn_false, TestTuple(1, 2), TestTuple(3, 4))
+
+  def test_nested(self):
+    shape = [tensor_shape.TensorShape([]),
+             TestTuple(tensor_shape.TensorShape([]),
+                       [tensor_shape.TensorShape([]),
+                        tensor_shape.TensorShape([])]),
+             tensor_shape.TensorShape([5, 5]),
+             tensor_shape.TensorShape([])]
+
+    def FnTrue():
+      return [constant_op.constant(1),
+              TestTuple(constant_op.constant(2), [3, 4]),
+              array_ops.zeros([5, 5]), 6]
+
+    def FnFalse():
+      return [constant_op.constant(11),
+              TestTuple(constant_op.constant(12), [13, 14]),
+              array_ops.ones([5, 5]), 16]
+
+    self._testShape(FnTrue, FnFalse, shape)
+    self._testReturnValues(FnTrue, FnFalse,
+                           [1, TestTuple(2, [3, 4]), np.zeros([5, 5]), 6],
+                           [11, TestTuple(12, [13, 14]), np.ones([5, 5]), 16])
+
+  def test_cond_inside_while_loop(self):
+    def Body(i, matrix):
+      result_tuple, unused_matrix = control_flow_ops.cond(
+          constant_op.constant(True),
+          lambda: (TestTuple(matrix * 2, matrix * 4), matrix),
+          lambda: (TestTuple(matrix * 4, matrix * 2), matrix))
+      return [i+1, result_tuple.a]
+
+    iteration, matrix = control_flow_ops.while_loop(
+        lambda i, matrix: i < 10,
+        Body,
+        loop_vars=[constant_op.constant(0), array_ops.ones([2, 2])])
+
+    self.assertEqual(iteration.get_shape(), tensor_shape.TensorShape([]))
+    self.assertEqual(matrix.get_shape(), tensor_shape.TensorShape([2, 2]))
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index b0a1fc3dd1b4139daca3434cc238e5f6b22e0626..477c0d1cb49ad44c64da8a14d05fbc796cecb9de 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -30,16 +30,15 @@ from tensorflow.python.ops.nn_grad import _BroadcastMul
 # pylint: disable=protected-access, invalid-name
 def ctc_loss(labels, inputs, sequence_length,
              preprocess_collapse_repeated=False,
-             ctc_merge_repeated=True, time_major=True):
+             ctc_merge_repeated=True,
+             ignore_longer_outputs_than_inputs=False, time_major=True):
   """Computes the CTC (Connectionist Temporal Classification) Loss.
 
   This op implements the CTC loss as presented in the article:
 
-  A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
-  Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
-  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, pp. 369-376.
-
-  http://www.cs.toronto.edu/~graves/icml_2006.pdf
+  [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
+  Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
 
   Input requirements:
 
@@ -96,6 +95,11 @@ def ctc_loss(labels, inputs, sequence_length,
 
     Untested.  Very likely will not learn to output repeated classes.
 
+  The `ignore_longer_outputs_than_inputs` option allows to specify the behavior
+  of the CTCLoss when dealing with sequences that have longer outputs than
+  inputs. If true, the CTCLoss will simply return zero gradient for those
+  items, otherwise an InvalidArgument error is returned, stopping training.
+
   Args:
     labels: An `int32` `SparseTensor`.
       `labels.indices[i, :] == [b, t]` means `labels.values[i]` stores
@@ -113,6 +117,8 @@ def ctc_loss(labels, inputs, sequence_length,
     preprocess_collapse_repeated: Boolean.  Default: False.
       If True, repeated labels are collapsed prior to the CTC calculation.
     ctc_merge_repeated: Boolean.  Default: True.
+    ignore_longer_outputs_than_inputs: Boolean. Default: False.
+      If True, sequences with longer outputs than inputs will be ignored.
     time_major: The shape format of the `inputs` Tensors.
       If True, these `Tensors` must be shaped `[max_time, batch_size, num_classes]`.
       If False, these `Tensors` must be shaped `[batch_size, max_time, num_classes]`.
@@ -142,7 +148,8 @@ def ctc_loss(labels, inputs, sequence_length,
       labels.values,
       sequence_length,
       preprocess_collapse_repeated=preprocess_collapse_repeated,
-      ctc_merge_repeated=ctc_merge_repeated)
+      ctc_merge_repeated=ctc_merge_repeated,
+      ignore_longer_outputs_than_inputs=ignore_longer_outputs_than_inputs)
 
   return loss
 
@@ -197,7 +204,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
     merge_repeated: Boolean.  Default: True.
 
   Returns:
-    A tuple `(decoded, log_probabilities)` where
+    A tuple `(decoded, neg_sum_logits)` where
     decoded: A single-element list. `decoded[0]`
       is an `SparseTensor` containing the decoded outputs s.t.:
       `decoded.indices`: Indices matrix `(total_decoded_outputs x 2)`.
@@ -206,8 +213,9 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
         The vector stores the decoded classes.
       `decoded.shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length]`
-    log_probability: A `float` matrix `(batch_size x 1)` containing sequence
-        log-probabilities.
+    neg_sum_logits: A `float` matrix `(batch_size x 1)` containing, for the
+        sequence found, the negative of the sum of the greatest logit at each
+        timeframe.
   """
   outputs = gen_ctc_ops._ctc_greedy_decoder(
       inputs, sequence_length, merge_repeated=merge_repeated)
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index bbe765a3af491b12e9a78418bb84772a1f660823..00f339c3935926332110a7f1f94c674b7561a5ec 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@ from __future__ import print_function
 
 import collections
 import hashlib
-import re
 import threading
 
 import six
@@ -39,7 +38,6 @@ from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_data_flow_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.util.deprecation import deprecated
 
 
 def _as_type_list(dtypes):
@@ -56,6 +54,7 @@ def _as_type_list(dtypes):
 def _as_shape_list(shapes, dtypes, unknown_dim_allowed=False,
                    unknown_rank_allowed=False):
   """Convert shapes to a list of tuples of int (or None)."""
+  del dtypes
   if unknown_dim_allowed:
     if (not isinstance(shapes, collections.Sequence)
         or not shapes
@@ -517,7 +516,7 @@ class QueueBase(object):
     that would block will fail immediately.
 
     If `cancel_pending_enqueues` is `True`, all pending requests will also
-    be cancelled.
+    be canceled.
 
     Args:
       cancel_pending_enqueues: (Optional.) A boolean, defaulting to
@@ -925,16 +924,18 @@ class Barrier(object):
     If barrier has no completed elements, this operation will block
     until there are 'num_elements' elements to take.
 
+    TODO(b/25743580): the semantics of `allow_small_batch` are experimental
+    and may be extended to other cases in the future.
+
+    TODO(ebrevdo): If a take_many(allow_small_batch=True) is blocking
+    already when the barrier is closed, it will block for ever. Fix this
+    by using asynchronous operations.
+
     Args:
       num_elements: The number of elements to take.
       allow_small_batch: If the barrier is closed, don't block if there are less
         completed elements than requested, but instead return all available
         completed elements.
-        TODO(b/25743580): the semantics of `allow_small_batch` are experimental
-        and may be extended to other cases in the future.
-        TODO(ebrevdo): If a take_many(allow_small_batch=True) is blocking
-        already when the barrier is closed, it will block for ever. Fix this
-        by using asynchronous operations.
       timeout: This specifies the number of milliseconds to block
         before returning with DEADLINE_EXCEEDED. (This option is not
         supported yet.)
@@ -987,7 +988,7 @@ class Barrier(object):
     TakeMany operations that would block will fail immediately.
 
     If `cancel_pending_enqueues` is `True`, all pending requests to the
-    underlying queue will also be cancelled, and completing of already
+    underlying queue will also be canceled, and completing of already
     started values is also not acceptable anymore.
 
     Args:
@@ -1035,47 +1036,6 @@ class Barrier(object):
         self._barrier_ref, name=name)
 
 
-@deprecated("2017-03-02", "Use `tf.tables_initializer` instead.")
-def initialize_all_tables(name="init_all_tables"):
-  """Returns an Op that initializes all tables of the default graph.
-
-  Args:
-    name: Optional name for the initialization op.
-
-  Returns:
-    An Op that initializes all tables.  Note that if there are
-    not tables the returned Op is a NoOp.
-  """
-  return tables_initializer(name)
-
-
-def tables_initializer(name="init_all_tables"):
-  """Returns an Op that initializes all tables of the default graph.
-
-  Args:
-    name: Optional name for the initialization op.
-
-  Returns:
-    An Op that initializes all tables.  Note that if there are
-    not tables the returned Op is a NoOp.
-  """
-  initializers = ops.get_collection(ops.GraphKeys.TABLE_INITIALIZERS)
-  if initializers:
-    return control_flow_ops.group(*initializers, name=name)
-  return control_flow_ops.no_op(name=name)
-
-
-ops.NotDifferentiable("LookupTableFind")
-ops.NotDifferentiable("LookupTableInsert")
-ops.NotDifferentiable("LookupTableSize")
-ops.NotDifferentiable("HashTable")
-ops.NotDifferentiable("InitializeTable")
-ops.NotDifferentiable("InitializeTableFromTextFile")
-ops.NotDifferentiable("MutableDenseHashTable")
-ops.NotDifferentiable("MutableHashTable")
-ops.NotDifferentiable("MutableHashTableOfTensors")
-
-
 class ConditionalAccumulatorBase(object):
   """A conditional accumulator for aggregating gradients.
 
@@ -1384,73 +1344,30 @@ class SparseConditionalAccumulator(ConditionalAccumulatorBase):
         dense_shape=return_val.shape)
 
 
-class StagingArea(object):
-  """Class for staging inputs. No ordering guarantees.
-
-  A `StagingArea` is a TensorFlow data structure that stores tensors across
-  multiple steps, and exposes operations that can put and get
-  tensors.
-
-  Each `StagingArea` element is a tuple of one or more tensors, where each
-  tuple component has a static dtype, and may have a static shape.
-
-  The capacity of a `StagingArea` is unbounded and supports multiple
-  concurrent producers and consumers; and provides exactly-once delivery.
-
-  Each element of a `StagingArea` is a fixed-length tuple of tensors whose
-  dtypes are described by `dtypes`, and whose shapes are optionally described
-  by the `shapes` argument.
-
-  If the `shapes` argument is specified, each component of a staging area
-  element must have the respective fixed shape. If it is
-  unspecified, different elements may have different shapes,
-  """
-
+class BaseStagingArea(object):
+  """Base class for Staging Areas."""
   _identifier = 0
   _lock = threading.Lock()
 
-  def __init__(self, dtypes, shapes=None, names=None, shared_name=None):
-    """Constructs a staging area object.
-
-    The two optional lists, `shapes` and `names`, must be of the same length
-    as `dtypes` if provided.  The values at a given index `i` indicate the
-    shape and name to use for the corresponding queue component in `dtypes`.
-
-    The device scope at the time of object creation determines where the
-    storage for the `StagingArea` will reside.  Calls to `put` will incur a copy
-    to this memory space, if necessary.  Tensors returned by `get` will be
-    placed according to the device scope when `get` is called.
-
-    Args:
-      dtypes:  A list of types.  The length of dtypes must equal the number
-        of tensors in each element.
-      shapes: (Optional.) Constraints on the shapes of tensors in an element.
-        A list of shape tuples or None. This list is the same length
-        as dtypes.  If the shape of any tensors in the element are constrained,
-        all must be; shapes can be None if the shapes should not be constrained.
-      names: (Optional.) If provided, the `get()` and
-        `put()` methods will use dictionaries with these names as keys.
-        Must be None or a list or tuple of the same length as `dtypes`.
-      shared_name: (Optional.) A name to be used for the shared object. By
-        passing the same name to two different python objects they will share
-        the underlying staging area. Must be a string.
-
-    Raises:
-      ValueError: If one of the arguments is invalid.
-    """
+  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
+                  capacity=0, memory_limit=0):
     if shared_name is None:
-      self._name = ops.get_default_graph().unique_name("StagingArea")
+      self._name = (ops.get_default_graph()
+                       .unique_name(self.__class__.__name__))
     elif isinstance(shared_name, six.string_types):
       self._name = shared_name
     else:
       raise ValueError("shared_name must be a string")
+
     self._dtypes = dtypes
+
     if shapes is not None:
       if len(shapes) != len(dtypes):
         raise ValueError("StagingArea shapes must be the same length as dtypes")
       self._shapes = [tensor_shape.TensorShape(s) for s in shapes]
     else:
       self._shapes = [tensor_shape.unknown_shape() for _ in self._dtypes]
+
     if names is not None:
       if len(names) != len(dtypes):
         raise ValueError("StagingArea names must be the same length as dtypes")
@@ -1458,6 +1375,9 @@ class StagingArea(object):
     else:
       self._names = None
 
+    self._capacity = capacity
+    self._memory_limit = memory_limit
+
     # all get and put ops must colocate with this op
     with ops.name_scope("%s_root" % self._name):
       self._coloc_op = control_flow_ops.no_op()
@@ -1482,52 +1402,140 @@ class StagingArea(object):
     """The list of names for each component of a staging area element."""
     return self._names
 
-  def _check_put_dtypes(self, vals):
+  @property
+  def capacity(self):
+    """The maximum number of elements of this staging area."""
+    return self._capacity
+
+  @property
+  def memory_limit(self):
+    """The maximum number of bytes of this staging area."""
+    return self._memory_limit
+
+  def _check_put_dtypes(self, vals, indices=None):
     """Validate and convert `vals` to a list of `Tensor`s.
 
     The `vals` argument can be a Tensor, a list or tuple of tensors, or a
     dictionary with tensor values.
 
+    If `vals` is a list, then the appropriate indices associated with the
+    values must be provided.
+
     If it is a dictionary, the staging area must have been constructed with a
     `names` attribute and the dictionary keys must match the staging area names.
+    `indices` will be inferred from the dictionary keys.
     If the staging area was constructed with a `names` attribute, `vals` must
     be a dictionary.
 
+    Checks that the dtype and shape of each value matches that
+    of the staging area.
+
     Args:
       vals: A tensor, a list or tuple of tensors, or a dictionary..
 
     Returns:
-      A list of `Tensor` objects.
+      A (tensors, indices) tuple where `tensors` is a list of `Tensor` objects
+      and `indices` is a list of indices associed with the tensors.
 
     Raises:
-      ValueError: If `vals` is invalid.
+      ValueError: If `vals` or `indices` is invalid.
     """
     if isinstance(vals, dict):
       if not self._names:
         raise ValueError(
             "Staging areas must have names to enqueue a dictionary")
-      if sorted(self._names) != sorted(vals.keys()):
+      if not set(vals.keys()).issubset(self._names):
         raise ValueError("Keys in dictionary to put do not match names "
                          "of staging area. Dictionary: (%s), Queue: (%s)" %
                          (sorted(vals.keys()), sorted(self._names)))
       # The order of values in `self._names` indicates the order in which the
       # tensors in the dictionary `vals` must be listed.
-      vals = [vals[k] for k in self._names]
+      vals, indices, n = zip(*[(vals[k], i, k) for i, k in enumerate(self._names)
+                                                  if k in vals])
     else:
       if self._names:
         raise ValueError("You must enqueue a dictionary in a staging area "
                          "with names")
+
+      if indices is None:
+        raise ValueError("Indices must be supplied when inserting a list "
+                        "of tensors")
+
+      if len(indices) != len(vals):
+        raise ValueError("Number of indices '%s' doesn't match "
+                         "number of values '%s'")
+
       if not isinstance(vals, (list, tuple)):
         vals = [vals]
+        indices = [0]
+
+    # Sanity check number of values
+    if not len(vals) <= len(self._dtypes):
+      raise ValueError("Unexpected number of inputs '%s' vs '%s'" % (
+                          len(values), len(self._dtypes)))
 
     tensors = []
-    for i, (val, dtype) in enumerate(zip(vals, self._dtypes)):
-      tensors.append(
-          ops.convert_to_tensor(
-              val, dtype=dtype, name="component_%d" % i))
+
+    for val, i in zip(vals, indices):
+      dtype, shape = self._dtypes[i], self._shapes[i]
+      # Check dtype
+      if not val.dtype == dtype:
+        raise ValueError("Datatypes do not match. '%s' != '%s'" %(
+                        str(val.dtype), str(dtype)))
+
+      # Check shape
+      val.get_shape().assert_is_compatible_with(shape)
+
+      tensors.append(ops.convert_to_tensor(val, dtype=dtype,
+                                          name="component_%d" % i))
+
+    return tensors, indices
+
+  def _create_device_transfers(self, tensors):
+    """Encode inter-device transfers if the current device
+    is not the same as the Staging Area's device
+    """
+
+    if not isinstance(tensors, (tuple, list)):
+      tensors = [tensors]
+
+    curr_device_scope = control_flow_ops.no_op().device
+
+    if curr_device_scope != self._coloc_op.device:
+      tensors = [array_ops.identity(t) for t in tensors]
 
     return tensors
 
+  def _get_return_value(self, tensors):
+    """Return the value to return from a get op.
+
+    If the staging area has names, return a dictionary with the
+    names as keys.  Otherwise return either a single tensor
+    or a list of tensors depending on the length of `tensors`.
+
+    Args:
+      tensors: List of tensors from the get op.
+
+    Returns:
+      A single tensor, a list of tensors, or a dictionary
+      of tensors.
+    """
+
+    tensors = self._create_device_transfers(tensors)
+
+    # Sets shape
+    for output, shape in zip(tensors, self._shapes):
+      output.set_shape(shape)
+
+    if self._names:
+      # The returned values in `tensors` are in the same order as
+      # the names in `self._names`.
+      return {n: tensors[i] for i, n in enumerate(self._names)}
+    elif len(tensors) == 1:
+      return tensors[0]
+    else:
+      return tensors
+
   def _scope_vals(self, vals):
     """Return a list of values to pass to `name_scope()`.
 
@@ -1544,9 +1552,86 @@ class StagingArea(object):
     else:
       return [vals]
 
+class StagingArea(BaseStagingArea):
+  """Class for staging inputs. No ordering guarantees.
+
+  A `StagingArea` is a TensorFlow data structure that stores tensors across
+  multiple steps, and exposes operations that can put and get tensors.
+
+  Each `StagingArea` element is a tuple of one or more tensors, where each
+  tuple component has a static dtype, and may have a static shape.
+
+  The capacity of a `StagingArea` may be bounded or unbounded.
+  It supports multiple concurrent producers and consumers; and
+  provides exactly-once delivery.
+
+  Each element of a `StagingArea` is a fixed-length tuple of tensors whose
+  dtypes are described by `dtypes`, and whose shapes are optionally described
+  by the `shapes` argument.
+
+  If the `shapes` argument is specified, each component of a staging area
+  element must have the respective fixed shape. If it is
+  unspecified, different elements may have different shapes,
+
+  It can be configured with a capacity in which case
+  put(values) will block until space becomes available.
+
+  Similarly, it can be configured with a memory limit which
+  will block put(values) until space is available.
+  This is mostly useful for limiting the number of tensors on
+  devices such as GPUs.
+
+  All get() and peek() commands block if the the requested data
+  is not present in the Staging Area.
+
+  """
+
+  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
+                  capacity=0, memory_limit=0):
+    """Constructs a staging area object.
+
+    The two optional lists, `shapes` and `names`, must be of the same length
+    as `dtypes` if provided.  The values at a given index `i` indicate the
+    shape and name to use for the corresponding queue component in `dtypes`.
+
+    The device scope at the time of object creation determines where the
+    storage for the `StagingArea` will reside.  Calls to `put` will incur a copy
+    to this memory space, if necessary.  Tensors returned by `get` will be
+    placed according to the device scope when `get` is called.
+
+    Args:
+      dtypes:  A list of types.  The length of dtypes must equal the number
+        of tensors in each element.
+      capacity: (Optional.) Maximum number of elements.
+        An integer. If zero, the Staging Area is unbounded
+      memory_limit: (Optional.) Maximum number of bytes of all tensors
+        in the Staging Area.
+        An integer. If zero, the Staging Area is unbounded
+      shapes: (Optional.) Constraints on the shapes of tensors in an element.
+        A list of shape tuples or None. This list is the same length
+        as dtypes.  If the shape of any tensors in the element are constrained,
+        all must be; shapes can be None if the shapes should not be constrained.
+      names: (Optional.) If provided, the `get()` and
+        `put()` methods will use dictionaries with these names as keys.
+        Must be None or a list or tuple of the same length as `dtypes`.
+      shared_name: (Optional.) A name to be used for the shared object. By
+        passing the same name to two different python objects they will share
+        the underlying staging area. Must be a string.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+
+    super(StagingArea, self).__init__(dtypes, shapes,
+                                          names, shared_name,
+                                          capacity, memory_limit)
+
   def put(self, values, name=None):
     """Create an op that places a value into the staging area.
 
+    This operation will block if the `StagingArea` has reached
+    its capacity.
+
     Args:
       values: Tensor (or a tuple of Tensors) to place into the staging area.
       name: A name for the operation (optional).
@@ -1559,46 +1644,23 @@ class StagingArea(object):
     """
     with ops.name_scope(name, "%s_put" % self._name,
                         self._scope_vals(values)) as scope:
-      vals = self._check_put_dtypes(values)
-      if len(values) != len(self._dtypes):
-        raise ValueError("Unexpected number of inputs " + str(len(values)) +
-                         "vs " + str(len(self._dtypes)))
-      for val, dtype in zip(vals, self._dtypes):
-        if val.dtype != dtype:
-          raise ValueError("Datatypes do not match. " + str(val.dtype) + " != "
-                           + str(dtype))
 
-      for val, shape in zip(vals, self._shapes):
-        val.get_shape().assert_is_compatible_with(shape)
+      # Hard-code indices for this staging area
+      indices = range(len(values)) if isinstance(values, (list, tuple)) else None
+      vals, _ = self._check_put_dtypes(values, indices)
 
       with ops.colocate_with(self._coloc_op):
         op = gen_data_flow_ops.stage(values=vals, shared_name=self._name,
-                                     name=scope)
+                                     name=scope, capacity=self._capacity,
+                                     memory_limit=self._memory_limit)
 
       return op
 
-  def _get_return_value(self, tensors):
-    """Return the value to return from a get op.
-
-    If the staging area has names, return a dictionary with the
-    names as keys.  Otherwise return either a single tensor
-    or a list of tensors depending on the length of `tensors`.
-
-    Args:
-      tensors: List of tensors from the get op.
+  def __internal_get(self, get_fn, name):
+    with ops.colocate_with(self._coloc_op):
+      ret = get_fn()
 
-    Returns:
-      A single tensor, a list of tensors, or a dictionary
-      of tensors.
-    """
-    if self._names:
-      # The returned values in `tensors` are in the same order as
-      # the names in `self._names`.
-      return {n: tensors[i] for i, n in enumerate(self._names)}
-    elif len(tensors) == 1:
-      return tensors[0]
-    else:
-      return tensors
+    return self._get_return_value(ret)
 
   def get(self, name=None):
     """Gets one element from this staging area.
@@ -1606,6 +1668,13 @@ class StagingArea(object):
     If the staging area is empty when this operation executes, it will block
     until there is an element to dequeue.
 
+    Note that unlike others ops that can block, like the queue Dequeue
+    operations, this can stop other work from happening.  To avoid this, the
+    intended use is for this to be called only when there will be an element
+    already available.  One method for doing this in a training loop would be to
+    run a `put()` call during a warmup session.run call, and then call both
+    `get()` and `put()` in each subsequent step.
+
     The placement of the returned tensor will be determined by the current
     device scope when this function is called.
 
@@ -1618,19 +1687,388 @@ class StagingArea(object):
     if name is None:
       name = "%s_get" % self._name
 
+    fn = lambda: gen_data_flow_ops.unstage(dtypes=self._dtypes,
+                    shared_name=self._name, name=name,
+                    capacity=self._capacity,
+                    memory_limit=self._memory_limit)
+
+    return self.__internal_get(fn, name)
+
+  def peek(self, index, name=None):
+    """Peeks at an element in the staging area.
+
+    If the staging area is too small to contain the element at
+    the specified index, it will block until enough elements
+    are inserted to complete the operation.
+
+    The placement of the returned tensor will be determined by
+    the current device scope when this function is called.
+
+    Args:
+      index: The index of the tensor within the staging area
+              to look up.
+      name: A name for the operation (optional).
+
+    Returns:
+      The tuple of tensors that was gotten.
+    """
+    if name is None:
+      name = "%s_peek" % self._name
+
+    fn = lambda: gen_data_flow_ops.stage_peek(index,
+                    dtypes=self._dtypes, shared_name=self._name,
+                    name=name, capacity=self._capacity,
+                    memory_limit=self._memory_limit)
+
+    return self.__internal_get(fn, name)
+
+  def size(self, name=None):
+    """Returns the number of elements in the staging area.
+
+    Args:
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+    if name is None:
+      name = "%s_size" % self._name
+
+    return gen_data_flow_ops.stage_size(name=name, shared_name=self._name,
+                        dtypes=self._dtypes, capacity=self._capacity,
+                        memory_limit=self._memory_limit)
+
+  def clear(self, name=None):
+    """Clears the staging area.
+
+    Args:
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+    if name is None:
+      name = "%s_clear" % self._name
+
+    return gen_data_flow_ops.stage_clear(name=name, shared_name=self._name,
+                        dtypes=self._dtypes, capacity=self._capacity,
+                        memory_limit=self._memory_limit)
+
+class MapStagingArea(BaseStagingArea):
+  """
+  A `MapStagingArea` is a TensorFlow data structure that stores tensors across
+  multiple steps, and exposes operations that can put and get tensors.
+
+  Each `MapStagingArea` element is a (key, value) pair.
+  Only int64 keys are supported, other types should be
+  hashed to produce a key.
+  Values are a tuple of one or more tensors.
+  Each tuple component has a static dtype,
+  and may have a static shape.
+
+  The capacity of a `MapStagingArea` may be bounded or unbounded.
+  It supports multiple concurrent producers and consumers; and
+  provides exactly-once delivery.
+
+  Each value tuple of a `MapStagingArea` is a fixed-length tuple of tensors whose
+  dtypes are described by `dtypes`, and whose shapes are optionally described
+  by the `shapes` argument.
+
+  If the `shapes` argument is specified, each component of a staging area
+  element must have the respective fixed shape. If it is
+  unspecified, different elements may have different shapes,
+
+  It behaves like an associative container with support for:
+
+   - put(key, values)
+   - peek(key)         like dict.get(key)
+   - get(key)          like dict.pop(key)
+   - get(key=None)     like dict.popitem()
+   - size()
+   - clear()
+
+  If ordered a tree structure ordered by key will be used and
+  get(key=None) will remove (key, value) pairs in increasing key order.
+  Otherwise a hashtable
+
+  It can be configured with a capacity in which case
+  put(key, values) will block until space becomes available.
+
+  Similarly, it can be configured with a memory limit which
+  will block put(key, values) until space is available.
+  This is mostly useful for limiting the number of tensors on
+  devices such as GPUs.
+
+  All get() and peek() commands block if the requested
+  (key, value) pair is not present in the staging area.
+
+  Incomplete puts are supported and will be placed in an incomplete
+  hash until such time as all values associated with the key have
+  been inserted. Once completed, this (key, value) pair will be
+  inserted into the main data structure. Data in the incomplete set
+  counts towards the memory limit, but not towards capacity limit.
+  """
+
+  def __init__(self, dtypes, shapes=None, names=None, shared_name=None,
+                      ordered=False, capacity=0, memory_limit=0):
+    """
+    Args:
+      dtypes:  A list of types.  The length of dtypes must equal the number
+        of tensors in each element.
+      capacity: (Optional.) Maximum number of elements.
+        An integer. If zero, the Staging Area is unbounded
+      memory_limit: (Optional.) Maximum number of bytes of all tensors
+        in the Staging Area (excluding keys).
+        An integer. If zero, the Staging Area is unbounded
+      ordered: (Optional.) If True the underlying data structure
+        is a tree ordered on key. Otherwise assume a hashtable.
+      shapes: (Optional.) Constraints on the shapes of tensors in an element.
+        A list of shape tuples or None. This list is the same length
+        as dtypes.  If the shape of any tensors in the element are constrained,
+        all must be; shapes can be None if the shapes should not be constrained.
+      names: (Optional.) If provided, the `get()` and
+        `put()` methods will use dictionaries with these names as keys.
+        Must be None or a list or tuple of the same length as `dtypes`.
+      shared_name: (Optional.) A name to be used for the shared object. By
+        passing the same name to two different python objects they will share
+        the underlying staging area. Must be a string.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+
+    """
+
+    super(MapStagingArea, self).__init__(dtypes, shapes,
+                                      names, shared_name,
+                                      capacity, memory_limit)
+
+    # Defer to different methods depending if the map is ordered
+    self._ordered = ordered
+
+    if ordered:
+      self._put_fn = gen_data_flow_ops.ordered_map_stage
+      self._pop_fn = gen_data_flow_ops.ordered_map_unstage
+      self._popitem_fn = gen_data_flow_ops.ordered_map_unstage_no_key
+      self._peek_fn = gen_data_flow_ops.ordered_map_peek
+      self._size_fn = gen_data_flow_ops.ordered_map_size
+      self._incomplete_size_fn = gen_data_flow_ops.ordered_map_incomplete_size
+      self._clear_fn = gen_data_flow_ops.ordered_map_clear
+    else:
+      self._put_fn = gen_data_flow_ops.map_stage
+      self._pop_fn = gen_data_flow_ops.map_unstage
+      self._popitem_fn = gen_data_flow_ops.map_unstage_no_key
+      self._peek_fn = gen_data_flow_ops.map_peek
+      self._size_fn = gen_data_flow_ops.map_size
+      self._incomplete_size_fn = gen_data_flow_ops.map_incomplete_size
+      self._clear_fn = gen_data_flow_ops.map_clear
+
+  def put(self, key, vals, indices=None, name=None):
+    """
+    Create an op that stores the (key, vals) pair in the staging area.
+
+    Incomplete puts are possible, preferably using a dictionary for vals
+    as the appropriate dtypes and shapes can be inferred from the value names
+    dictionary key values. If vals is a list or tuple, indices must
+    also be specified so that the op knows at which element position
+    to perform the insert.
+
+    This operation will block if the capacity or memory limit of this
+    container is reached.
+
+    Args:
+        key: Key associated with the data
+        vals: Tensor (or a dict/tuple of Tensors) to place
+                into the staging area.
+        indices: (Optional) if vals is a tuple/list, this is required.
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+
+    Raises:
+        ValueError: If the number or type of inputs don't match the staging area.
+    """
+
+    with ops.name_scope(name, "%s_put" % self._name,
+                        self._scope_vals(vals)) as scope:
+
+      vals, indices = self._check_put_dtypes(vals, indices)
+
+      with ops.colocate_with(self._coloc_op):
+        op = self._put_fn(key, indices, vals, dtypes=self._dtypes,
+                             shared_name=self._name, name=scope,
+                             capacity=self._capacity,
+                             memory_limit=self._memory_limit)
+    return op
+
+  def peek(self, key, name=None):
+    """
+    Peeks at staging area data associated with the key.
+
+    If the key is not in the staging area, it will block
+    until the associated (key, value) is inserted.
+
+    Args:
+        key: Key associated with the required data
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+
+    if name is None:
+      name = "%s_pop" % self._name
+
     with ops.colocate_with(self._coloc_op):
-      ret = gen_data_flow_ops.unstage(dtypes=self._dtypes,
-                                      shared_name=self._name, name=name)
+      result = self._peek_fn(key, shared_name=self._name,
+                      dtypes=self._dtypes,
+                      name=name,
+                      capacity=self._capacity,
+                      memory_limit=self._memory_limit)
 
-    curr_device_scope = control_flow_ops.no_op().device
-    if curr_device_scope != self._coloc_op.device:
-      for i in range(len(ret)):
-        ret[i] = array_ops.identity(ret[i])
+    return self._get_return_value(result)
 
-    for output, shape in zip(ret, self._shapes):
-      output.set_shape(shape)
+  def get(self, key=None, name=None):
+    """
+    If the key is provided, the associated (key, value)
+    is returned from the staging area. If the key is not
+    in the staging area, this method will block until
+    the associated (key, value) is inserted.
 
-    return self._get_return_value(ret)
+    If no key is provided and the staging area is ordered,
+    the (key, value) with the smallest key will be returned.
+    Otherwise, a random (key, value) will be returned.
+
+    If the staging area is empty when this operation executes,
+    it will block until there is an element to dequeue.
+
+    Args:
+        key: Key associated with the required data (Optional)
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+    if key is None:
+      return self._popitem(name)
+    else:
+      return self._pop(key, name)
+
+
+  def _pop(self, key, name=None):
+    """
+    Remove and return the associated (key, value)
+    is returned from the staging area. If the key is not
+    in the staging area, this method will block until
+    the associated (key, value) is inserted.
+
+    Args:
+        key: Key associated with the required data
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+    if name is None:
+      name = "%s_get" % self._name
+
+    with ops.colocate_with(self._coloc_op):
+      result = self._pop_fn(key, shared_name=self._name,
+                      dtypes=self._dtypes,
+                      name=name,
+                      capacity=self._capacity,
+                      memory_limit=self._memory_limit)
+
+    return key, self._get_return_value(result)
+
+  def _popitem(self, name=None):
+    """
+    If the staging area is ordered,
+    the (key, value) with the smallest key will be returned.
+    Otherwise, a random (key, value) will be returned.
+
+    If the staging area is empty when this operation executes,
+    it will block until there is an element to dequeue.
+
+    Args:
+        key: Key associated with the required data
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+    if name is None:
+      name = "%s_get_nokey" % self._name
+
+    with ops.colocate_with(self._coloc_op):
+      key, result = self._popitem_fn(shared_name=self._name,
+                              dtypes=self._dtypes,
+                              name=name,
+                              capacity=self._capacity,
+                              memory_limit=self._memory_limit)
+
+    # Separate keys and results out from
+    # underlying namedtuple
+    key = self._create_device_transfers(key)[0]
+    result = self._get_return_value(result)
+
+    return key, result
+
+  def size(self, name=None):
+    """
+    Returns the number of elements in the staging area.
+
+    Args:
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+    if name is None:
+      name = "%s_size" % self._name
+
+    return self._size_fn(shared_name=self._name,
+                        name=name, dtypes=self._dtypes,
+                        capacity=self._capacity,
+                        memory_limit=self._memory_limit)
+
+  def incomplete_size(self, name=None):
+    """
+    Returns the number of incomplete elements in the staging area.
+
+    Args:
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+    if name is None:
+      name = "%s_incomplete_size" % self._name
+
+    return self._incomplete_size_fn(shared_name=self._name,
+                        name=name, dtypes=self._dtypes,
+                        capacity=self._capacity,
+                        memory_limit=self._memory_limit)
+
+
+
+  def clear(self, name=None):
+    """
+    Clears the staging area.
+
+    Args:
+        name: A name for the operation (optional)
+
+    Returns:
+        The created op
+    """
+    if name is None:
+      name = "%s_clear" % self._name
+
+    return self._clear_fn(shared_name=self._name,
+                        name=name, dtypes=self._dtypes,
+                        capacity=self._capacity,
+                        memory_limit=self._memory_limit)
 
 
 class RecordInput(object):
diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..833239eb5fa7ad68b3cd2cc5d2346f060a1727ca
--- /dev/null
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -0,0 +1,42 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+    features = [
+        "-layering_check",
+        "-parse_headers",
+    ],
+)
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+licenses(["notice"])  # Apache 2.0
+
+py_library(
+    name = "distributions",
+    srcs = glob(["*.py"]),
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:special_math_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/ops/distributions/__init__.py b/tensorflow/python/ops/distributions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..563b189990cfed5d6418c7cfca6c0fdf4226995f
--- /dev/null
+++ b/tensorflow/python/ops/distributions/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Core module for TensorFlow distribution objects and helpers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/distributions/python/ops/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
similarity index 90%
rename from tensorflow/contrib/distributions/python/ops/bernoulli.py
rename to tensorflow/python/ops/distributions/bernoulli.py
index 33e6dbd78b904310923e7107b72b94f8df8a725e..3281b57e83e374ddae9ac9cb1d4ef0154c12f836 100644
--- a/tensorflow/contrib/distributions/python/ops/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -18,17 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Bernoulli(distribution.Distribution):
@@ -120,6 +121,7 @@ class Bernoulli(distribution.Distribution):
     return math_ops.cast(sample, self.dtype)
 
   def _log_prob(self, event):
+    event = self._maybe_assert_valid_sample(event)
     # TODO(jaana): The current sigmoid_cross_entropy_with_logits has
     # inconsistent  behavior for logits = inf/-inf.
     event = math_ops.cast(event, self.logits.dtype)
@@ -160,6 +162,17 @@ class Bernoulli(distribution.Distribution):
     """Returns `1` if `prob > 0.5` and `0` otherwise."""
     return math_ops.cast(self.probs > 0.5, self.dtype)
 
+  def _maybe_assert_valid_sample(self, event, check_integer=True):
+    if not self.validate_args:
+      return event
+    event = distribution_util.embed_check_nonnegative_discrete(
+        event, check_integer=check_integer)
+    return control_flow_ops.with_dependencies([
+        check_ops.assert_less_equal(
+            event, array_ops.ones_like(event),
+            message="event is not less than or equal to 1."),
+    ], event)
+
 
 class BernoulliWithSigmoidProbs(Bernoulli):
   """Bernoulli with `probs = nn.sigmoid(logits)`."""
diff --git a/tensorflow/contrib/distributions/python/ops/beta.py b/tensorflow/python/ops/distributions/beta.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/beta.py
rename to tensorflow/python/ops/distributions/beta.py
index 463808ea9af800515691e6bf7e7226b44a5ce68c..2b93478cdf9f9e80f4c2c19ad25cb270a8e7aa98 100644
--- a/tensorflow/contrib/distributions/python/ops/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +30,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/bijector.py b/tensorflow/python/ops/distributions/bijector.py
similarity index 92%
rename from tensorflow/contrib/distributions/python/ops/bijectors/bijector.py
rename to tensorflow/python/ops/distributions/bijector.py
index b0727cd8f36b8e954bc57897636181e47ead6e1e..70e9fdadd20e42b5618a23f4b03aa24decd267ba 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/bijector.py
+++ b/tensorflow/python/ops/distributions/bijector.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.bijector_impl import *
+from tensorflow.python.ops.distributions.bijector_impl import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
similarity index 100%
rename from tensorflow/contrib/distributions/python/ops/bijectors/bijector_impl.py
rename to tensorflow/python/ops/distributions/bijector_impl.py
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py b/tensorflow/python/ops/distributions/bijector_test_util.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py
rename to tensorflow/python/ops/distributions/bijector_test_util.py
index a0834423329da7bd512c3c825c888185f66af6bf..ff3535c62642d98bdd9b18808f45deae27d6d88d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py
+++ b/tensorflow/python/ops/distributions/bijector_test_util.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import uniform as uniform_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import uniform as uniform_lib
 
 
 def assert_finite(array):
diff --git a/tensorflow/contrib/distributions/python/ops/categorical.py b/tensorflow/python/ops/distributions/categorical.py
similarity index 90%
rename from tensorflow/contrib/distributions/python/ops/categorical.py
rename to tensorflow/python/ops/distributions/categorical.py
index abdb94b3e9c18098e5a70a668e0ad3586a6cc6b7..bad7e6e42f008849a60759e69f3902a3a713f293 100644
--- a/tensorflow/contrib/distributions/python/ops/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -18,9 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -29,6 +26,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 class Categorical(distribution.Distribution):
@@ -193,6 +193,22 @@ class Categorical(distribution.Distribution):
         array_ops.concat([[n], self.batch_shape_tensor()], 0))
     return ret
 
+  def _cdf(self, k):
+    k = ops.convert_to_tensor(k, name="k")
+
+    # If there are multiple batch dimension, flatten them into one.
+    batch_flattened_probs = array_ops.reshape(self._probs,
+                                              [-1, self._event_size])
+    batch_flattened_k = array_ops.reshape(k, (-1,))
+
+    # Form a tensor to sum over.
+    mask_tensor = array_ops.sequence_mask(batch_flattened_k, self._event_size)
+    to_sum_over = array_ops.where(mask_tensor,
+                                  batch_flattened_probs,
+                                  array_ops.zeros_like(batch_flattened_probs))
+    batch_flat_cdf = math_ops.reduce_sum(to_sum_over, axis=-1)
+    return array_ops.reshape(batch_flat_cdf, self._batch_shape())
+
   def _log_prob(self, k):
     k = ops.convert_to_tensor(k, name="k")
     if self.logits.get_shape()[:-1] == k.get_shape():
diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/dirichlet.py
rename to tensorflow/python/ops/distributions/dirichlet.py
index c524f322b0d67f858c29b318839aacd96504f322..923696a553caae80592be65f7ffeecb3f9373bb0 100644
--- a/tensorflow/contrib/distributions/python/ops/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -29,6 +27,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
similarity index 99%
rename from tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
rename to tensorflow/python/ops/distributions/dirichlet_multinomial.py
index e647a4981ca6444c0c7e0404d181b1a2c4438229..662a7655584b8dc6aeed5251f98dd17fb24f3606 100644
--- a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -28,6 +26,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/distribution.py b/tensorflow/python/ops/distributions/distribution.py
similarity index 96%
rename from tensorflow/contrib/distributions/python/ops/distribution.py
rename to tensorflow/python/ops/distributions/distribution.py
index 5beb3999bc5c43ec63592debaa57a5ab9b1eae15..a0be433a616103fc9525c157494629044704ec02 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -20,19 +20,19 @@ from __future__ import print_function
 
 import abc
 import contextlib
-import inspect
 import types
 
 import numpy as np
 import six
 
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import util
+from tensorflow.python.util import tf_inspect
 
 
 _DISTRIBUTION_PUBLIC_METHOD_WRAPPERS = [
@@ -154,12 +154,12 @@ class _DistributionMeta(abc.ABCMeta):
       if class_special_attr_value is None:
         # No _special method available, no need to update the docstring.
         continue
-      class_special_attr_docstring = inspect.getdoc(class_special_attr_value)
+      class_special_attr_docstring = tf_inspect.getdoc(class_special_attr_value)
       if not class_special_attr_docstring:
         # No docstring to append.
         continue
       class_attr_value = _copy_fn(base_attr_value)
-      class_attr_docstring = inspect.getdoc(base_attr_value)
+      class_attr_docstring = tf_inspect.getdoc(base_attr_value)
       if class_attr_docstring is None:
         raise ValueError(
             "Expected base class fn to contain a docstring: %s.%s"
@@ -241,7 +241,7 @@ class Distribution(_BaseDistribution):
   docstrings for their method specializations. For example:
 
   ```python
-  @distribution_util.AppendDocstring("Some other details.")
+  @util.AppendDocstring("Some other details.")
   def _log_prob(self, value):
     ...
   ```
@@ -870,6 +870,36 @@ class Distribution(_BaseDistribution):
     with self._name_scope(name):
       return self._mean()
 
+  def _quantile(self, value):
+    raise NotImplementedError("quantile is not implemented")
+
+  def _call_quantile(self, value, name, **kwargs):
+    with self._name_scope(name, values=[value]):
+      value = ops.convert_to_tensor(value, name="value")
+      try:
+        return self._quantile(value, **kwargs)
+      except NotImplementedError as original_exception:
+        raise original_exception
+
+  def quantile(self, value, name="quantile"):
+    """Quantile function. Aka "inverse cdf" or "percent point function".
+
+    Given random variable `X` and `p in [0, 1]`, the `quantile` is:
+
+    ```none
+    quantile(p) := x such that P[X <= x] == p
+    ```
+
+    Args:
+      value: `float` or `double` `Tensor`.
+      name: The name to give this op.
+
+    Returns:
+      quantile: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
+        values of type `self.dtype`.
+    """
+    return self._call_quantile(value, name)
+
   def _variance(self):
     raise NotImplementedError("variance is not implemented")
 
@@ -1003,10 +1033,9 @@ class Distribution(_BaseDistribution):
     if ndims is None:
       # Maybe expand_dims.
       ndims = array_ops.rank(x)
-      expanded_shape = distribution_util.pick_vector(
+      expanded_shape = util.pick_vector(
           math_ops.equal(ndims, 0),
-          np.array([1], dtype=np.int32),
-          array_ops.shape(x))
+          np.array([1], dtype=np.int32), array_ops.shape(x))
       x = array_ops.reshape(x, expanded_shape)
     elif ndims == 0:
       # Definitely expand_dims.
diff --git a/tensorflow/contrib/distributions/python/ops/exponential.py b/tensorflow/python/ops/distributions/exponential.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/exponential.py
rename to tensorflow/python/ops/distributions/exponential.py
index a293d1e0dc27ece2c9bd6c326674e2b2414b675a..281641b9156b9631199efc78ea1c2d30119dadb8 100644
--- a/tensorflow/contrib/distributions/python/ops/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import gamma
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import gamma
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/gamma.py b/tensorflow/python/ops/distributions/gamma.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/gamma.py
rename to tensorflow/python/ops/distributions/gamma.py
index f46e2116e107da5bd418507cde565242d16e8e6b..4ac2b9b4ef894fd9a603ff67bf9c8754f1e23b8e 100644
--- a/tensorflow/contrib/distributions/python/ops/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +30,9 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/identity_impl.py b/tensorflow/python/ops/distributions/identity_bijector.py
similarity index 95%
rename from tensorflow/contrib/distributions/python/ops/bijectors/identity_impl.py
rename to tensorflow/python/ops/distributions/identity_bijector.py
index 9438a5226cd83142b460a99ea7899a46fea73a16..f277eda8bbfb88f2344dfd620c573e0acd8d8078 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/identity_impl.py
+++ b/tensorflow/python/ops/distributions/identity_bijector.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import bijector
 from tensorflow.python.framework import constant_op
+from tensorflow.python.ops.distributions import bijector
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
similarity index 80%
rename from tensorflow/contrib/distributions/python/ops/kullback_leibler.py
rename to tensorflow/python/ops/distributions/kullback_leibler.py
index bb94a8768096aed6fd129843599db3f72af92156..9770d82bd8398a9f6d88c4360b77a7a691e72e5a 100644
--- a/tensorflow/contrib/distributions/python/ops/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -18,12 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
-
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import tf_inspect
 
 
 _DIVERGENCES = {}
@@ -31,8 +30,8 @@ _DIVERGENCES = {}
 
 def _registered_kl(type_a, type_b):
   """Get the KL function registered for classes a and b."""
-  hierarchy_a = inspect.getmro(type_a)
-  hierarchy_b = inspect.getmro(type_b)
+  hierarchy_a = tf_inspect.getmro(type_a)
+  hierarchy_b = tf_inspect.getmro(type_b)
   dist_to_children = None
   kl_fn = None
   for mro_to_a, parent_a in enumerate(hierarchy_a):
@@ -45,11 +44,13 @@ def _registered_kl(type_a, type_b):
   return kl_fn
 
 
-def kl(dist_a, dist_b, allow_nan_stats=True, name=None):
-  """Get the KL-divergence KL(dist_a || dist_b).
+def kl_divergence(distribution_a, distribution_b,
+                  allow_nan_stats=True, name=None):
+  """Get the KL-divergence KL(distribution_a || distribution_b).
 
-  If there is no KL method registered specifically for `type(dist_a)` and
-  `type(dist_b)`, then the class hierarchies of these types are searched.
+  If there is no KL method registered specifically for `type(distribution_a)`
+  and `type(distribution_b)`, then the class hierarchies of these types are
+  searched.
 
   If one KL method is registered between any pairs of classes in these two
   parent hierarchies, it is used.
@@ -59,11 +60,11 @@ def kl(dist_a, dist_b, allow_nan_stats=True, name=None):
 
   If more than one such shortest path exists, the first method
   identified in the search is used (favoring a shorter MRO distance to
-  `type(dist_a)`).
+  `type(distribution_a)`).
 
   Args:
-    dist_a: The first distribution.
-    dist_b: The second distribution.
+    distribution_a: The first distribution.
+    distribution_b: The second distribution.
     allow_nan_stats: Python `bool`, default `True`. When `True`,
       statistics (e.g., mean, mode, variance) use the value "`NaN`" to
       indicate the result is undefined. When `False`, an exception is raised
@@ -71,20 +72,22 @@ def kl(dist_a, dist_b, allow_nan_stats=True, name=None):
     name: Python `str` name prefixed to Ops created by this class.
 
   Returns:
-    A Tensor with the batchwise KL-divergence between dist_a and dist_b.
+    A Tensor with the batchwise KL-divergence between `distribution_a`
+    and `distribution_b`.
 
   Raises:
     NotImplementedError: If no KL method is defined for distribution types
-      of dist_a and dist_b.
+      of `distribution_a` and `distribution_b`.
   """
-  kl_fn = _registered_kl(type(dist_a), type(dist_b))
+  kl_fn = _registered_kl(type(distribution_a), type(distribution_b))
   if kl_fn is None:
     raise NotImplementedError(
-        "No KL(dist_a || dist_b) registered for dist_a type %s and dist_b "
-        "type %s" % (type(dist_a).__name__, type(dist_b).__name__))
+        "No KL(distribution_a || distribution_b) registered for distribution_a "
+        "type %s and distribution_b type %s"
+        % (type(distribution_a).__name__, type(distribution_b).__name__))
 
   with ops.name_scope("KullbackLeibler"):
-    kl_t = kl_fn(dist_a, dist_b, name=name)
+    kl_t = kl_fn(distribution_a, distribution_b, name=name)
     if allow_nan_stats:
       return kl_t
 
@@ -97,7 +100,7 @@ def kl(dist_a, dist_b, allow_nan_stats=True, name=None):
                 math_ops.reduce_any(math_ops.is_nan(kl_t))),
             ["KL calculation between %s and %s returned NaN values "
              "(and was called with allow_nan_stats=False). Values:"
-             % (dist_a.name, dist_b.name), kl_t])]):
+             % (distribution_a.name, distribution_b.name), kl_t])]):
       return array_ops.identity(kl_t, name="checked_kl")
 
 
diff --git a/tensorflow/contrib/distributions/python/ops/laplace.py b/tensorflow/python/ops/distributions/laplace.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/laplace.py
rename to tensorflow/python/ops/distributions/laplace.py
index eff4f5f9b8906385c2b8635c97eeccb0b08e9e68..5c964ff78a53b6d2dec588b85abff2c5b1173c06 100644
--- a/tensorflow/contrib/distributions/python/ops/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -22,8 +22,6 @@ import math
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import special_math
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +31,8 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import special_math
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/multinomial.py
rename to tensorflow/python/ops/distributions/multinomial.py
index e5e24cc87f05f5e000402e0269bb043c76cacf44..a5bea7b4bad0e644cb7776446195f2734750ce7e 100644
--- a/tensorflow/contrib/distributions/python/ops/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -27,6 +25,8 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
diff --git a/tensorflow/contrib/distributions/python/ops/normal.py b/tensorflow/python/ops/distributions/normal.py
similarity index 93%
rename from tensorflow/contrib/distributions/python/ops/normal.py
rename to tensorflow/python/ops/distributions/normal.py
index c06dd570b95ac4aed75fd135698976727b30954a..0ef1c91df8c83146fdae086d6056b1d947bae128 100644
--- a/tensorflow/contrib/distributions/python/ops/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -20,9 +20,6 @@ from __future__ import print_function
 
 import math
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import kullback_leibler
-from tensorflow.contrib.distributions.python.ops import special_math
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +29,9 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
+from tensorflow.python.ops.distributions import special_math
 
 
 __all__ = [
@@ -70,14 +70,14 @@ class Normal(distribution.Distribution):
 
   ```python
   # Define a single scalar Normal distribution.
-  dist = tf.contrib.distributions.Normal(loc=0., scale=3.)
+  dist = tf.distributions.Normal(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Normals.
   # The first has mean 1 and standard deviation 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Normal(loc=[1, 2.], scale=[11, 22.])
+  dist = tf.distributions.Normal(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -92,7 +92,7 @@ class Normal(distribution.Distribution):
   ```python
   # Define a batch of two scalar valued Normals.
   # Both have mean 1, but different standard deviations.
-  dist = tf.contrib.distributions.Normal(loc=1., scale=[11, 22.])
+  dist = tf.distributions.Normal(loc=1., scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
@@ -215,6 +215,9 @@ class Normal(distribution.Distribution):
   def _mean(self):
     return self.loc * array_ops.ones_like(self.scale)
 
+  def _quantile(self, p):
+    return self._inv_z(special_math.ndtri(p))
+
   def _stddev(self):
     return self.scale * array_ops.ones_like(self.loc)
 
@@ -226,6 +229,11 @@ class Normal(distribution.Distribution):
     with ops.name_scope("standardize", values=[x]):
       return (x - self.loc) / self.scale
 
+  def _inv_z(self, z):
+    """Reconstruct input `x` from a its normalized version."""
+    with ops.name_scope("reconstruct", values=[z]):
+      return z * self.scale + self.loc
+
 
 class NormalWithSoftplusScale(Normal):
   """Normal with softplus applied to `scale`."""
diff --git a/tensorflow/contrib/distributions/python/ops/special_math.py b/tensorflow/python/ops/distributions/special_math.py
similarity index 63%
rename from tensorflow/contrib/distributions/python/ops/special_math.py
rename to tensorflow/python/ops/distributions/special_math.py
index e5e5e1963e063f3f15b6a01bea88be218728a5db..f96eafed71255a78abd94a3d41bac8b83bf14b25 100644
--- a/tensorflow/contrib/distributions/python/ops/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import math_ops
 
 __all__ = [
     "ndtr",
+    "ndtri",
     "log_ndtr",
     "log_cdf_laplace",
 ]
@@ -100,6 +101,137 @@ def _ndtr(x):
   return 0.5 * y
 
 
+def ndtri(p, name="ndtri"):
+  """The inverse of the CDF of the Normal distribution function.
+
+  Returns x such that the area under the pdf from minus infinity to x is equal
+  to p.
+
+  A piece-wise rational approximation is done for the function.
+  This is a port of the implementation in netlib.
+
+  Args:
+    p: `Tensor` of type `float32`, `float64`.
+    name: Python string. A name for the operation (default="ndtri").
+
+  Returns:
+    x: `Tensor` with `dtype=p.dtype`.
+
+  Raises:
+    TypeError: if `p` is not floating-type.
+  """
+
+  with ops.name_scope(name, values=[p]):
+    p = ops.convert_to_tensor(p, name="p")
+    if p.dtype.as_numpy_dtype not in [np.float32, np.float64]:
+      raise TypeError(
+          "p.dtype=%s is not handled, see docstring for supported types."
+          % p.dtype)
+    return _ndtri(p)
+
+
+def _ndtri(p):
+  """Implements ndtri core logic."""
+
+  # Constants used in piece-wise rational approximations. Taken from the cephes
+  # library:
+  # https://github.com/scipy/scipy/blob/master/scipy/special/cephes/ndtri.c
+  p0 = list(reversed([-5.99633501014107895267E1,
+                      9.80010754185999661536E1,
+                      -5.66762857469070293439E1,
+                      1.39312609387279679503E1,
+                      -1.23916583867381258016E0]))
+  q0 = list(reversed([1.0,
+                      1.95448858338141759834E0,
+                      4.67627912898881538453E0,
+                      8.63602421390890590575E1,
+                      -2.25462687854119370527E2,
+                      2.00260212380060660359E2,
+                      -8.20372256168333339912E1,
+                      1.59056225126211695515E1,
+                      -1.18331621121330003142E0]))
+  p1 = list(reversed([4.05544892305962419923E0,
+                      3.15251094599893866154E1,
+                      5.71628192246421288162E1,
+                      4.40805073893200834700E1,
+                      1.46849561928858024014E1,
+                      2.18663306850790267539E0,
+                      -1.40256079171354495875E-1,
+                      -3.50424626827848203418E-2,
+                      -8.57456785154685413611E-4]))
+  q1 = list(reversed([1.0,
+                      1.57799883256466749731E1,
+                      4.53907635128879210584E1,
+                      4.13172038254672030440E1,
+                      1.50425385692907503408E1,
+                      2.50464946208309415979E0,
+                      -1.42182922854787788574E-1,
+                      -3.80806407691578277194E-2,
+                      -9.33259480895457427372E-4]))
+  p2 = list(reversed([3.23774891776946035970E0,
+                      6.91522889068984211695E0,
+                      3.93881025292474443415E0,
+                      1.33303460815807542389E0,
+                      2.01485389549179081538E-1,
+                      1.23716634817820021358E-2,
+                      3.01581553508235416007E-4,
+                      2.65806974686737550832E-6,
+                      6.23974539184983293730E-9]))
+  q2 = list(reversed([1.0,
+                      6.02427039364742014255E0,
+                      3.67983563856160859403E0,
+                      1.37702099489081330271E0,
+                      2.16236993594496635890E-1,
+                      1.34204006088543189037E-2,
+                      3.28014464682127739104E-4,
+                      2.89247864745380683936E-6,
+                      6.79019408009981274425E-9]))
+
+  def _create_polynomial(var, coeffs):
+    """Compute n_th order polynomial via Horner's method."""
+    if not coeffs:
+      return 0.
+    return coeffs[0] + _create_polynomial(var, coeffs[1:]) * var
+
+  maybe_complement_p = array_ops.where(p > 1. - np.exp(-2.), 1. - p, p)
+  # Write in an arbitrary value in place of 0 for p since 0 will cause NaNs
+  # later on. The result from the computation when p == 0 is not used so any
+  # number that doesn't result in NaNs is fine.
+  sanitized_mcp = array_ops.where(
+      maybe_complement_p <= 0.,
+      constant_op.constant(0.5, dtype=p.dtype, shape=p.shape),
+      maybe_complement_p)
+
+  # Compute x for p > exp(-2): x/sqrt(2pi) = w + w**3 P0(w**2)/Q0(w**2).
+  w = sanitized_mcp - 0.5
+  ww = w ** 2
+  x_for_big_p = w + w * ww * (_create_polynomial(ww, p0)
+                              / _create_polynomial(ww, q0))
+  x_for_big_p *= -np.sqrt(2. * np.pi)
+
+  # Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
+  # where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
+  # arrays based on wether p < exp(-32).
+  z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp))
+  first_term = z - math_ops.log(z) / z
+  second_term_small_p = (_create_polynomial(1. / z, p2)
+                         / _create_polynomial(1. / z, q2)) / z
+  second_term_otherwise = (_create_polynomial(1. / z, p1)
+                           / _create_polynomial(1. / z, q1)) / z
+  x_for_small_p = first_term - second_term_small_p
+  x_otherwise = first_term - second_term_otherwise
+
+  x = array_ops.where(sanitized_mcp > np.exp(-2.),
+                      x_for_big_p,
+                      array_ops.where(z >= 8.0, x_for_small_p, x_otherwise))
+
+  x = array_ops.where(p > 1. - np.exp(-2.), x, -x)
+  infinity = constant_op.constant(np.inf, dtype=x.dtype, shape=x.shape)
+  x_nan_replaced = array_ops.where(
+      p <= 0.0, -infinity, array_ops.where(p >= 1.0, infinity, x))
+  return x_nan_replaced
+
+
 def log_ndtr(x, series_order=3, name="log_ndtr"):
   """Log Normal distribution function.
 
diff --git a/tensorflow/contrib/distributions/python/ops/student_t.py b/tensorflow/python/ops/distributions/student_t.py
similarity index 96%
rename from tensorflow/contrib/distributions/python/ops/student_t.py
rename to tensorflow/python/ops/distributions/student_t.py
index 87f5ecd7ae76f1cdcb4fa3606f97dda9e07af423..073ac4286be170dcfd564f61f1026a85d95c772c 100644
--- a/tensorflow/contrib/distributions/python/ops/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution
-from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -33,6 +31,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import util as distribution_util
 
 
 __all__ = [
@@ -42,8 +42,10 @@ __all__ = [
 
 
 class StudentT(distribution.Distribution):
-  # pylint: disable=line-too-long
-  """Student's t-distribution with degree of freedom `df`, location `loc`, and `scale` parameters.
+  """Student's t-distribution.
+
+  This distribution has parameters: degree of freedom `df`, location `loc`,
+  and `scale`.
 
   #### Mathematical details
 
@@ -82,7 +84,7 @@ class StudentT(distribution.Distribution):
 
   ```python
   # Define a single scalar Student t distribution.
-  single_dist = tf.contrib.distributions.StudentT(df=3)
+  single_dist = tf.distributions.StudentT(df=3)
 
   # Evaluate the pdf at 1, returning a scalar Tensor.
   single_dist.prob(1.)
@@ -90,7 +92,7 @@ class StudentT(distribution.Distribution):
   # Define a batch of two scalar valued Student t's.
   # The first has degrees of freedom 2, mean 1, and scale 11.
   # The second 3, 2 and 22.
-  multi_dist = tf.contrib.distributions.StudentT(df=[2, 3],
+  multi_dist = tf.distributions.StudentT(df=[2, 3],
                                                  loc=[1, 2.],
                                                  scale=[11, 22.])
 
@@ -107,7 +109,7 @@ class StudentT(distribution.Distribution):
   ```python
   # Define a batch of two Student's t distributions.
   # Both have df 2 and mean 1, but different scales.
-  dist = tf.contrib.distributions.StudentT(df=2, loc=1, scale=[11, 22.])
+  dist = tf.distributions.StudentT(df=2, loc=1, scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
similarity index 92%
rename from tensorflow/contrib/distributions/python/ops/transformed_distribution.py
rename to tensorflow/python/ops/distributions/transformed_distribution.py
index 844f78ca968ca831ccb1b294f3c72ef3a821ddc2..1be3819569cc1fca599b8967667cb93253edb8f8 100644
--- a/tensorflow/contrib/distributions/python/ops/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -19,9 +19,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.distributions.python.ops import distribution as distributions
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.bijectors import identity as identity_lib
+# Bijectors must be directly imported because `remove_undocumented` prevents
+# individual file imports.
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -31,6 +30,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+from tensorflow.python.ops.distributions import identity_bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
 __all__ = [
     "TransformedDistribution",
@@ -119,7 +121,7 @@ def _is_scalar_from_shape(shape):
   return _logical_equal(_ndims_from_shape(shape), 0)
 
 
-class TransformedDistribution(distributions.Distribution):
+class TransformedDistribution(distribution_lib.Distribution):
   """A Transformed Distribution.
 
   A `TransformedDistribution` models `p(y)` given a base distribution `p(x)`,
@@ -146,49 +148,19 @@ class TransformedDistribution(distributions.Distribution):
 
   A `TransformedDistribution` implements the following operations:
 
-    * `sample`:
+    * `sample`
+      Mathematically:   `Y = g(X)`
+      Programmatically: `bijector.forward(distribution.sample(...))`
 
-      Mathematically:
+    * `log_prob`
+      Mathematically:   `(log o pdf)(Y=y) = (log o pdf o g^{-1})(y)
+                         + (log o abs o det o J o g^{-1})(y)`
+      Programmatically: `(distribution.log_prob(bijector.inverse(y))
+                         + bijector.inverse_log_det_jacobian(y))`
 
-      ```none
-      Y = g(X)
-      ```
-
-      Programmatically:
-
-      ```python
-      return bijector.forward(distribution.sample(...))
-      ```
-
-    * `log_prob`:
-
-      Mathematically:
-
-      ```none
-      (log o pdf)(Y=y) = (log o pdf o g^{-1})(y) +
-                           (log o abs o det o J o g^{-1})(y)
-      ```
-
-      Programmatically:
-
-      ```python
-      return (distribution.log_prob(bijector.inverse(y)) +
-              bijector.inverse_log_det_jacobian(y))
-      ```
-
-    * `log_cdf`:
-
-      Mathematically:
-
-      ```none
-      (log o cdf)(Y=y) = (log o cdf o g^{-1})(y)
-      ```
-
-      Programmatically:
-
-      ```python
-      return distribution.log_cdf(bijector.inverse(x))
-      ```
+    * `log_cdf`
+      Mathematically:   `(log o cdf)(Y=y) = (log o cdf o g^{-1})(y)`
+      Programmatically: `distribution.log_cdf(bijector.inverse(x))`
 
     * and similarly for: `cdf`, `prob`, `log_survival_function`,
      `survival_function`.
@@ -199,7 +171,7 @@ class TransformedDistribution(distributions.Distribution):
   ```python
   ds = tf.contrib.distributions
   log_normal = ds.TransformedDistribution(
-    distribution=ds.Normal(loc=mu, scale=sigma),
+    distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Exp(),
     name="LogNormalTransformedDistribution")
   ```
@@ -209,7 +181,7 @@ class TransformedDistribution(distributions.Distribution):
   ```python
   ds = tf.contrib.distributions
   log_normal = ds.TransformedDistribution(
-    distribution=ds.Normal(loc=mu, scale=sigma),
+    distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Inline(
       forward_fn=tf.exp,
       inverse_fn=tf.log,
@@ -223,8 +195,11 @@ class TransformedDistribution(distributions.Distribution):
   ```python
   ds = tf.contrib.distributions
   normal = ds.TransformedDistribution(
-    distribution=ds.Normal(loc=0, scale=1),
-    bijector=ds.bijectors.ScaleAndShift(loc=mu, scale=sigma, event_ndims=0),
+    distribution=ds.Normal(loc=0., scale=1.),
+    bijector=ds.bijectors.Affine(
+      shift=-1.,
+      scale_identity_multiplier=2.,
+      event_ndims=0),
     name="NormalTransformedDistribution")
   ```
 
@@ -237,7 +212,6 @@ class TransformedDistribution(distributions.Distribution):
   multivariate Normal as a `TransformedDistribution`.
 
   ```python
-  bs = tf.contrib.distributions.bijector
   ds = tf.contrib.distributions
   # We will create two MVNs with batch_shape = event_shape = 2.
   mean = [[-1., 0],      # batch:0
@@ -248,7 +222,7 @@ class TransformedDistribution(distributions.Distribution):
                [2, 2]]]  # batch:1
   mvn1 = ds.TransformedDistribution(
       distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.Affine(shift=mean, tril=chol_cov),
+      bijector=ds.bijectors.Affine(shift=mean, scale_tril=chol_cov),
       batch_shape=[2],  # Valid because base_distribution.batch_shape == [].
       event_shape=[2])  # Valid because base_distribution.event_shape == [].
   mvn2 = ds.MultivariateNormalTriL(loc=mean, scale_tril=chol_cov)
@@ -291,7 +265,7 @@ class TransformedDistribution(distributions.Distribution):
       self._empty = constant_op.constant([], dtype=dtypes.int32, name="empty")
 
       if bijector is None:
-        bijector = identity_lib.Identity(validate_args=validate_args)
+        bijector = identity_bijector.Identity(validate_args=validate_args)
 
       # We will keep track of a static and dynamic version of
       # self._is_{batch,event}_override. This way we can do more prior to graph
@@ -365,7 +339,7 @@ class TransformedDistribution(distributions.Distribution):
             self.distribution.event_shape_tensor()))
 
   def _event_shape(self):
-    # If there's a chance that the event_shape has been overriden, we return
+    # If there's a chance that the event_shape has been overridden, we return
     # what we statically know about the `event_shape_override`. This works
     # because: `_is_maybe_event_override` means `static_override` is `None` or a
     # non-empty list, i.e., we don't statically know the `event_shape` or we do.
@@ -386,7 +360,7 @@ class TransformedDistribution(distributions.Distribution):
         self.distribution.batch_shape_tensor())
 
   def _batch_shape(self):
-    # If there's a chance that the batch_shape has been overriden, we return
+    # If there's a chance that the batch_shape has been overridden, we return
     # what we statically know about the `batch_shape_override`. This works
     # because: `_is_maybe_batch_override` means `static_override` is `None` or a
     # non-empty list, i.e., we don't statically know the `batch_shape` or we do.
diff --git a/tensorflow/contrib/distributions/python/ops/uniform.py b/tensorflow/python/ops/distributions/uniform.py
similarity index 98%
rename from tensorflow/contrib/distributions/python/ops/uniform.py
rename to tensorflow/python/ops/distributions/uniform.py
index 81a4c8cdefeeb1ffcab96cd0af717fbfee700cad..9b555f87eae14fe30ff020f996778a4ad8f98ab9 100644
--- a/tensorflow/contrib/distributions/python/ops/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import math
 
-from tensorflow.contrib.distributions.python.ops import distribution
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -29,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
 
 
 class Uniform(distribution.Distribution):
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..74a7b53a3cecf50e087034e53fe0fd5bc9c9af43
--- /dev/null
+++ b/tensorflow/python/ops/distributions/util.py
@@ -0,0 +1,693 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for probability distributions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import hashlib
+import math
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+
+
+def assert_close(
+    x, y, data=None, summarize=None, message=None, name="assert_close"):
+  """Assert that that x and y are within machine epsilon of each other.
+
+  Args:
+    x: Floating-point `Tensor`
+    y: Floating-point `Tensor`
+    data: The tensors to print out if the condition is `False`. Defaults to
+      error message and first few entries of `x` and `y`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).
+
+  Returns:
+    Op raising `InvalidArgumentError` if |x - y| > machine epsilon.
+  """
+  message = message or ""
+  x = ops.convert_to_tensor(x, name="x")
+  y = ops.convert_to_tensor(y, name="y")
+
+  if data is None:
+    data = [
+        message,
+        "Condition x ~= y did not hold element-wise: x = ", x.name, x, "y = ",
+        y.name, y
+    ]
+
+  if x.dtype.is_integer:
+    return check_ops.assert_equal(
+        x, y, data=data, summarize=summarize, message=message, name=name)
+
+  with ops.name_scope(name, "assert_close", [x, y, data]):
+    tol = np.finfo(x.dtype.as_numpy_dtype).eps
+    condition = math_ops.reduce_all(math_ops.less_equal(math_ops.abs(x-y), tol))
+    return control_flow_ops.Assert(
+        condition, data, summarize=summarize)
+
+
+def assert_integer_form(
+    x, data=None, summarize=None, message=None, name="assert_integer_form"):
+  """Assert that x has integer components (or floats equal to integers).
+
+  Args:
+    x: Floating-point `Tensor`
+    data: The tensors to print out if the condition is `False`. Defaults to
+      error message and first few entries of `x` and `y`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).
+
+  Returns:
+    Op raising `InvalidArgumentError` if round(x) != x.
+  """
+
+  message = message or "x has non-integer components"
+  x = ops.convert_to_tensor(x, name="x")
+  casted_x = math_ops.to_int64(x)
+  return check_ops.assert_equal(
+      x, math_ops.cast(math_ops.round(casted_x), x.dtype),
+      data=data, summarize=summarize, message=message, name=name)
+
+
+def assert_symmetric(matrix):
+  matrix_t = array_ops.matrix_transpose(matrix)
+  return control_flow_ops.with_dependencies(
+      [check_ops.assert_equal(matrix, matrix_t)], matrix)
+
+
+def embed_check_nonnegative_discrete(x, check_integer=True):
+  """Assert x is a non-negative tensor, and optionally of integers."""
+  assertions = [check_ops.assert_non_negative(
+      x, message="x must be non-negative.")]
+  if check_integer:
+    assertions += [assert_integer_form(
+        x, message="x cannot contain fractional components.")]
+  return control_flow_ops.with_dependencies(assertions, x)
+
+
+def same_dynamic_shape(a, b):
+  """Returns whether a and b have the same dynamic shape.
+
+  Args:
+    a: `Tensor`
+    b: `Tensor`
+
+  Returns:
+    `bool` `Tensor` representing if both tensors have the same shape.
+  """
+  a = ops.convert_to_tensor(a, name="a")
+  b = ops.convert_to_tensor(b, name="b")
+
+  # Here we can't just do math_ops.equal(a.shape, b.shape), since
+  # static shape inference may break the equality comparison between
+  # shape(a) and shape(b) in math_ops.equal.
+  def all_shapes_equal():
+    return math_ops.reduce_all(math_ops.equal(
+        array_ops.concat([array_ops.shape(a), array_ops.shape(b)], 0),
+        array_ops.concat([array_ops.shape(b), array_ops.shape(a)], 0)))
+
+  # One of the shapes isn't fully defined, so we need to use the dynamic
+  # shape.
+  return control_flow_ops.cond(
+      math_ops.equal(array_ops.rank(a), array_ops.rank(b)),
+      all_shapes_equal,
+      lambda: constant_op.constant(False))
+
+
+def get_logits_and_probs(logits=None,
+                         probs=None,
+                         multidimensional=False,
+                         validate_args=False,
+                         name="get_logits_and_probs"):
+  """Converts logit to probabilities (or vice-versa), and returns both.
+
+  Args:
+    logits: Floating-point `Tensor` representing log-odds.
+    probs: Floating-point `Tensor` representing probabilities.
+    multidimensional: Python `bool`, default `False`.
+      If `True`, represents whether the last dimension of `logits` or `probs`,
+      a `[N1, N2, ...  k]` dimensional tensor, representing the
+      logit or probability of `shape[-1]` classes.
+    validate_args: Python `bool`, default `False`. When `True`, either assert
+      `0 <= probs <= 1` (if not `multidimensional`) or that the last dimension
+      of `probs` sums to one.
+    name: A name for this operation (optional).
+
+  Returns:
+    logits, probs: Tuple of `Tensor`s. If `probs` has an entry that is `0` or
+      `1`, then the corresponding entry in the returned logit will be `-Inf` and
+      `Inf` respectively.
+
+  Raises:
+    ValueError: if neither `probs` nor `logits` were passed in, or both were.
+  """
+  with ops.name_scope(name, values=[probs, logits]):
+    if (probs is None) == (logits is None):
+      raise ValueError("Must pass probs or logits, but not both.")
+
+    if probs is None:
+      logits = ops.convert_to_tensor(logits, name="logits")
+      if multidimensional:
+        return logits, nn.softmax(logits, name="probs")
+      return logits, math_ops.sigmoid(logits, name="probs")
+
+    probs = ops.convert_to_tensor(probs, name="probs")
+    if validate_args:
+      with ops.name_scope("validate_probs"):
+        one = constant_op.constant(1., probs.dtype)
+        dependencies = [check_ops.assert_non_negative(probs)]
+        if multidimensional:
+          dependencies += [assert_close(math_ops.reduce_sum(probs, -1), one,
+                                        message="probs does not sum to 1.")]
+        else:
+          dependencies += [check_ops.assert_less_equal(
+              probs, one, message="probs has components greater than 1.")]
+        probs = control_flow_ops.with_dependencies(dependencies, probs)
+
+    with ops.name_scope("logits"):
+      if multidimensional:
+        # Here we don't compute the multidimensional case, in a manner
+        # consistent with respect to the unidimensional case. We do so
+        # following the TF convention. Typically, you might expect to see
+        # logits = log(probs) - log(probs[pivot]). A side-effect of
+        # being consistent with the TF approach is that the unidimensional case
+        # implicitly handles the second dimension but the multidimensional case
+        # explicitly keeps the pivot dimension.
+        return math_ops.log(probs), probs
+      return math_ops.log(probs) - math_ops.log1p(-1. * probs), probs
+
+
+def log_combinations(n, counts, name="log_combinations"):
+  """Multinomial coefficient.
+
+  Given `n` and `counts`, where `counts` has last dimension `k`, we compute
+  the multinomial coefficient as:
+
+  ```n! / sum_i n_i!```
+
+  where `i` runs over all `k` classes.
+
+  Args:
+    n: Floating-point `Tensor` broadcastable with `counts`. This represents `n`
+      outcomes.
+    counts: Floating-point `Tensor` broadcastable with `n`. This represents
+      counts in `k` classes, where `k` is the last dimension of the tensor.
+    name: A name for this operation (optional).
+
+  Returns:
+    `Tensor` representing the multinomial coefficient between `n` and `counts`.
+  """
+  # First a bit about the number of ways counts could have come in:
+  # E.g. if counts = [1, 2], then this is 3 choose 2.
+  # In general, this is (sum counts)! / sum(counts!)
+  # The sum should be along the last dimension of counts. This is the
+  # "distribution" dimension. Here n a priori represents the sum of counts.
+  with ops.name_scope(name, values=[n, counts]):
+    n = ops.convert_to_tensor(n, name="n")
+    counts = ops.convert_to_tensor(counts, name="counts")
+    total_permutations = math_ops.lgamma(n + 1)
+    counts_factorial = math_ops.lgamma(counts + 1)
+    redundant_permutations = math_ops.reduce_sum(counts_factorial, axis=[-1])
+    return total_permutations - redundant_permutations
+
+
+def matrix_diag_transform(matrix, transform=None, name=None):
+  """Transform diagonal of [batch-]matrix, leave rest of matrix unchanged.
+
+  Create a trainable covariance defined by a Cholesky factor:
+
+  ```python
+  # Transform network layer into 2 x 2 array.
+  matrix_values = tf.contrib.layers.fully_connected(activations, 4)
+  matrix = tf.reshape(matrix_values, (batch_size, 2, 2))
+
+  # Make the diagonal positive. If the upper triangle was zero, this would be a
+  # valid Cholesky factor.
+  chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
+
+  # OperatorPDCholesky ignores the upper triangle.
+  operator = OperatorPDCholesky(chol)
+  ```
+
+  Example of heteroskedastic 2-D linear regression.
+
+  ```python
+  # Get a trainable Cholesky factor.
+  matrix_values = tf.contrib.layers.fully_connected(activations, 4)
+  matrix = tf.reshape(matrix_values, (batch_size, 2, 2))
+  chol = matrix_diag_transform(matrix, transform=tf.nn.softplus)
+
+  # Get a trainable mean.
+  mu = tf.contrib.layers.fully_connected(activations, 2)
+
+  # This is a fully trainable multivariate normal!
+  dist = tf.contrib.distributions.MVNCholesky(mu, chol)
+
+  # Standard log loss. Minimizing this will "train" mu and chol, and then dist
+  # will be a distribution predicting labels as multivariate Gaussians.
+  loss = -1 * tf.reduce_mean(dist.log_prob(labels))
+  ```
+
+  Args:
+    matrix:  Rank `R` `Tensor`, `R >= 2`, where the last two dimensions are
+      equal.
+    transform:  Element-wise function mapping `Tensors` to `Tensors`. To
+      be applied to the diagonal of `matrix`. If `None`, `matrix` is returned
+      unchanged. Defaults to `None`.
+    name:  A name to give created ops.
+      Defaults to "matrix_diag_transform".
+
+  Returns:
+    A `Tensor` with same shape and `dtype` as `matrix`.
+  """
+  with ops.name_scope(name, "matrix_diag_transform", [matrix]):
+    matrix = ops.convert_to_tensor(matrix, name="matrix")
+    if transform is None:
+      return matrix
+    # Replace the diag with transformed diag.
+    diag = array_ops.matrix_diag_part(matrix)
+    transformed_diag = transform(diag)
+    transformed_mat = array_ops.matrix_set_diag(matrix, transformed_diag)
+
+  return transformed_mat
+
+
+def rotate_transpose(x, shift, name="rotate_transpose"):
+  """Circularly moves dims left or right.
+
+  Effectively identical to:
+
+  ```python
+  numpy.transpose(x, numpy.roll(numpy.arange(len(x.shape)), shift))
+  ```
+
+  When `validate_args=False` additional graph-runtime checks are
+  performed. These checks entail moving data from to GPU to CPU.
+
+  Example:
+
+    ```python
+    x = ...  # Tensor of shape [1, 2, 3, 4].
+    rotate_transpose(x, -1)  # result shape: [2, 3, 4, 1]
+    rotate_transpose(x, -2)  # result shape: [3, 4, 1, 2]
+    rotate_transpose(x,  1)  # result shape: [4, 1, 2, 3]
+    rotate_transpose(x,  2)  # result shape: [3, 4, 1, 2]
+    rotate_transpose(x, 7) == rotate_transpose(x, 3)
+    rotate_transpose(x, -7) == rotate_transpose(x, -3)
+    ```
+
+  Args:
+    x: `Tensor`.
+    shift: `Tensor`. Number of dimensions to transpose left (shift<0) or
+      transpose right (shift>0).
+    name: Python `str`. The name to give this op.
+
+  Returns:
+    rotated_x: Input `Tensor` with dimensions circularly rotated by shift.
+
+  Raises:
+    TypeError: if shift is not integer type.
+  """
+  with ops.name_scope(name, values=[x, shift]):
+    x = ops.convert_to_tensor(x, name="x")
+    shift = ops.convert_to_tensor(shift, name="shift")
+    # We do not assign back to preserve constant-ness.
+    check_ops.assert_integer(shift)
+    shift_value_static = tensor_util.constant_value(shift)
+    ndims = x.get_shape().ndims
+    if ndims is not None and shift_value_static is not None:
+      if ndims < 2: return x
+      shift_value_static = np.sign(shift_value_static) * (
+          abs(shift_value_static) % ndims)
+      if shift_value_static == 0: return x
+      perm = np.roll(np.arange(ndims), shift_value_static)
+      return array_ops.transpose(x, perm=perm)
+    else:
+      # Consider if we always had a positive shift, and some specified
+      # direction.
+      # When shifting left we want the new array:
+      #   last(x, n-shift) + first(x, shift)
+      # and if shifting right then we want:
+      #   last(x, shift) + first(x, n-shift)
+      # Observe that last(a) == slice(a, n) and first(a) == slice(0, a).
+      # Also, we can encode direction and shift as one: direction * shift.
+      # Combining these facts, we have:
+      #   a = cond(shift<0, -shift, n-shift)
+      #   last(x, n-a) + first(x, a) == x[a:n] + x[0:a]
+      # Finally, we transform shift by modulo length so it can be specified
+      # independently from the array upon which it operates (like python).
+      ndims = array_ops.rank(x)
+      shift = array_ops.where(math_ops.less(shift, 0),
+                              math_ops.mod(-shift, ndims),
+                              ndims - math_ops.mod(shift, ndims))
+      first = math_ops.range(0, shift)
+      last = math_ops.range(shift, ndims)
+      perm = array_ops.concat([last, first], 0)
+      return array_ops.transpose(x, perm=perm)
+
+
+def pick_vector(cond,
+                true_vector,
+                false_vector,
+                name="pick_vector"):
+  """Picks possibly different length row `Tensor`s based on condition.
+
+  Value `Tensor`s should have exactly one dimension.
+
+  If `cond` is a python Boolean or `tf.constant` then either `true_vector` or
+  `false_vector` is immediately returned. I.e., no graph nodes are created and
+  no validation happens.
+
+  Args:
+    cond: `Tensor`. Must have `dtype=tf.bool` and be scalar.
+    true_vector: `Tensor` of one dimension. Returned when cond is `True`.
+    false_vector: `Tensor` of one dimension. Returned when cond is `False`.
+    name: Python `str`. The name to give this op.
+
+  Example:
+
+  ```python
+  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))
+  # result is tensor: [10, 11].
+  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))
+  # result is tensor: [15, 16, 17].
+  ```
+
+  Returns:
+    true_or_false_vector: `Tensor`.
+
+  Raises:
+    TypeError: if `cond.dtype != tf.bool`
+    TypeError: if `cond` is not a constant and
+      `true_vector.dtype != false_vector.dtype`
+  """
+  with ops.name_scope(name, values=(cond, true_vector, false_vector)):
+    cond = ops.convert_to_tensor(cond, name="cond")
+    if cond.dtype != dtypes.bool:
+      raise TypeError("%s.dtype=%s which is not %s" %
+                      (cond.name, cond.dtype, dtypes.bool))
+    cond_value_static = tensor_util.constant_value(cond)
+    if cond_value_static is not None:
+      return true_vector if cond_value_static else false_vector
+    true_vector = ops.convert_to_tensor(true_vector, name="true_vector")
+    false_vector = ops.convert_to_tensor(false_vector, name="false_vector")
+    if true_vector.dtype != false_vector.dtype:
+      raise TypeError(
+          "%s.dtype=%s does not match %s.dtype=%s"
+          % (true_vector.name, true_vector.dtype,
+             false_vector.name, false_vector.dtype))
+    n = array_ops.shape(true_vector)[0]
+    return array_ops.slice(
+        array_ops.concat([true_vector, false_vector], 0),
+        [array_ops.where(cond, 0, n)], [array_ops.where(cond, n, -1)])
+
+
+def gen_new_seed(seed, salt):
+  """Generate a new seed, from the given seed and salt."""
+  if seed is None:
+    return None
+  string = (str(seed) + salt).encode("utf-8")
+  return int(hashlib.md5(string).hexdigest()[:8], 16) & 0x7FFFFFFF
+
+
+def fill_lower_triangular(x, validate_args=False, name="fill_lower_triangular"):
+  """Creates a (batch of) lower triangular matrix from a vector of inputs.
+
+  If `x.get_shape()` is `[b1, b2, ..., bK, d]` then the output shape is `[b1,
+  b2, ..., bK, n, n]` where `n` is such that `d = n(n+1)/2`, i.e.,
+  `n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))`.
+
+  Although the non-batch complexity is O(n**2), large constants and sub-optimal
+  vectorization means the complexity of this function is 5x slower than zeroing
+  out the upper triangular, i.e., `tf.matrix_band_part(X, -1, 0)`. This
+  function becomes competitive only when several matmul/cholesky/etc ops can be
+  ellided in constructing the input. Example: wiring a fully connected layer as
+  a covariance matrix; this function reduces the final layer by 2x and possibly
+  reduces the network arch complexity considerably. In most cases it is better
+  to simply build a full matrix and zero out the upper triangular elements,
+  e.g., `tril = tf.matrix_band_part(full, -1, 0)`, rather than directly
+  construct a lower triangular.
+
+  Example:
+
+  ```python
+  fill_lower_triangular([1, 2, 3, 4, 5, 6])
+  # Returns: [[1, 0, 0],
+  #           [2, 3, 0],
+  #           [4, 5, 6]]
+  ```
+
+  For comparison, a pure numpy version of this function can be found in
+  `distribution_util_test.py`, function `_fill_lower_triangular`.
+
+  Args:
+    x: `Tensor` representing lower triangular elements.
+    validate_args: Python `bool`, default `False`. Whether to ensure the shape
+      of `x` can be mapped to a lower triangular matrix (controls non-static
+      checks only).
+    name: Python `str`. The name to give this op.
+
+  Returns:
+    tril: `Tensor` with lower triangular elements filled from `x`.
+
+  Raises:
+    ValueError: if shape if `x` has static shape which cannot be mapped to a
+      lower triangular matrix.
+  """
+  # TODO(jvdillon): Replace this code with dedicated op when it exists.
+  with ops.name_scope(name, values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    if (x.get_shape().ndims is not None and
+        x.get_shape()[-1].value is not None):
+      d = x.get_shape()[-1].value
+      # d = n(n+1)/2 implies n is:
+      n = int(0.5 * (math.sqrt(1. + 8. * d) - 1.))
+      d_inferred = n * (n + 1) /2
+      if d != d_inferred:
+        raise ValueError("Input cannot be mapped to a lower triangular; "
+                         "n*(n+1)/2 = %d != %d" % (d_inferred, d))
+      final_shape = x.get_shape()[:-1].concatenate(
+          tensor_shape.TensorShape([n, n]))
+    else:
+      d = math_ops.cast(array_ops.shape(x)[-1], dtype=dtypes.float32)
+      # d = n(n+1)/2 implies n is:
+      n = math_ops.cast(0.5 * (dtypes.sqrt(1. + 8. * d) - 1.),
+                        dtype=dtypes.int32)
+      if validate_args:
+        is_valid_input_shape = check_ops.assert_equal(
+            n * (n + 1) / 2, d,
+            message="Input cannot be mapped to a lower triangular.")
+        n = control_flow_ops.with_dependencies([is_valid_input_shape], n)
+      final_shape = x.get_shape()[:-1].concatenate(
+          tensor_shape.TensorShape([None, None]))
+
+    def tril_ids(n):
+      """Internal helper to create vector of linear indices into y."""
+      # Build the ids statically; chose 512 because it implies 1MiB.
+      if not tensor_util.is_tensor(n) and n <= 512:
+        ids = np.arange(n**2, dtype=np.int32)
+        rows = (ids / n).astype(np.int32)  # Implicit floor.
+        # We need to stop incrementing the index when we encounter
+        # upper-triangular elements. The idea here is to compute the
+        # lower-right number of zeros then by "symmetry" subtract this from the
+        # total number of zeros, n(n-1)/2.
+        # Then we note that: n(n-1)/2 - (n-r)*(n-r-1)/2 = r(2n-r-1)/2
+        offset = (rows * (2 * n - rows - 1) / 2).astype(np.int32)
+        # We could also zero out when (rows < cols) == (rows < ids-n*rows).
+        # mask = (ids <= (n + 1) * rows).astype(np.int32)
+      else:
+        ids = math_ops.range(n**2)
+        rows = math_ops.cast(ids / n, dtype=dtypes.int32)
+        offset = math_ops.cast(rows * (2 * n - rows - 1) / 2,
+                               dtype=dtypes.int32)
+      return ids - offset
+
+    # Special-case non-batch case.
+    if x.get_shape().ndims == 1:
+      y = array_ops.gather(x, array_ops.reshape(tril_ids(n), [n, n]))
+      y = array_ops.matrix_band_part(y, -1, 0)
+      y.set_shape(y.get_shape().merge_with(final_shape))
+      return y
+
+    # Make ids for each batch dim.
+    if (x.get_shape().ndims is not None and
+        x.get_shape()[:-1].is_fully_defined()):
+      batch_shape = np.asarray(x.get_shape()[:-1].as_list(), dtype=np.int32)
+      m = np.prod(batch_shape).astype(np.int32)
+    else:
+      batch_shape = array_ops.shape(x)[:-1]
+      m = array_ops.reduce_prod(array_ops.shape(x)[:-1])
+    batch_ids = math_ops.range(m)
+
+    # Assemble the tril_ids into batch,tril_id pairs.
+    idx = array_ops.stack([
+        array_ops.tile(array_ops.expand_dims(batch_ids, 1), [1, n * n]),
+        array_ops.tile(array_ops.expand_dims(tril_ids(n), 0), [m, 1])
+    ])
+    idx = array_ops.transpose(idx, [1, 2, 0])
+
+    # Gather up, reshape, and return.
+    y = array_ops.reshape(x, [-1, d])
+    y = array_ops.gather_nd(y, idx)
+    y = array_ops.reshape(y, array_ops.concat([batch_shape, [n, n]], 0))
+    y = array_ops.matrix_band_part(y, -1, 0)
+    y.set_shape(y.get_shape().merge_with(final_shape))
+    return y
+
+
+# TODO(jvdillon): Merge this test back into:
+# tensorflow/python/ops/softplus_op_test.py
+# once TF core is accepting new ops.
+def softplus_inverse(x, name=None):
+  """Computes the inverse softplus, i.e., x = softplus_inverse(softplus(x)).
+
+  Mathematically this op is equivalent to:
+
+  ```none
+  softplus_inverse = log(exp(x) - 1.)
+  ```
+
+  Args:
+    x: `Tensor`. Non-negative (not enforced), floating-point.
+    name: A name for the operation (optional).
+
+  Returns:
+    `Tensor`. Has the same type/shape as input `x`.
+  """
+  with ops.name_scope(name, "softplus_inverse", values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    # We begin by deriving a more numerically stable softplus_inverse:
+    # x = softplus(y) = Log[1 + exp{y}], (which means x > 0).
+    # ==> exp{x} = 1 + exp{y}                                (1)
+    # ==> y = Log[exp{x} - 1]                                (2)
+    #       = Log[(exp{x} - 1) / exp{x}] + Log[exp{x}]
+    #       = Log[(1 - exp{-x}) / 1] + Log[exp{x}]
+    #       = Log[1 - exp{-x}] + x                           (3)
+    # (2) is the "obvious" inverse, but (3) is more stable than (2) for large x.
+    # For small x (e.g. x = 1e-10), (3) will become -inf since 1 - exp{-x} will
+    # be zero. To fix this, we use 1 - exp{-x} approx x for small x > 0.
+    #
+    # In addition to the numerically stable derivation above, we clamp
+    # small/large values to be congruent with the logic in:
+    # tensorflow/core/kernels/softplus_op.h
+    #
+    # Finally, we set the input to one whenever the input is too large or too
+    # small. This ensures that no unchosen codepath is +/- inf. This is
+    # necessary to ensure the gradient doesn't get NaNs. Recall that the
+    # gradient of `where` behaves like `pred*pred_true + (1-pred)*pred_false`
+    # thus an `inf` in an unselected path results in `0*inf=nan`. We are careful
+    # to overwrite `x` with ones only when we will never actually use this
+    # value. Note that we use ones and not zeros since `log(expm1(0.)) = -inf`.
+    threshold = np.log(np.finfo(x.dtype.as_numpy_dtype).eps) + 2.
+    is_too_small = math_ops.less(x, np.exp(threshold))
+    is_too_large = math_ops.greater(x, -threshold)
+    too_small_value = math_ops.log(x)
+    too_large_value = x
+    # This `where` will ultimately be a NOP because we won't select this
+    # codepath whenever we used the surrogate `ones_like`.
+    x = array_ops.where(math_ops.logical_or(is_too_small, is_too_large),
+                        array_ops.ones_like(x), x)
+    y = x + math_ops.log(-math_ops.expm1(-x))  # == log(expm1(x))
+    return array_ops.where(is_too_small, too_small_value,
+                           array_ops.where(is_too_large, too_large_value, y))
+
+
+# TODO(b/35290280): Add unit-tests.
+def dimension_size(x, axis):
+  """Returns the size of a specific dimension."""
+  # Since tf.gather isn't "constant-in, constant-out", we must first check the
+  # static shape or fallback to dynamic shape.
+  num_rows = (None if x.get_shape().ndims is None
+              else x.get_shape()[axis].value)
+  if num_rows is not None:
+    return num_rows
+  return array_ops.shape(x)[axis]
+
+
+class AppendDocstring(object):
+  """Helper class to promote private subclass docstring to public counterpart.
+
+  Example:
+
+  ```python
+  class TransformedDistribution(Distribution):
+    @distribution_util.AppendDocstring(
+      additional_note="A special note!",
+      kwargs_dict={"foo": "An extra arg."})
+    def _prob(self, y, foo=None):
+      pass
+  ```
+
+  In this case, the `AppendDocstring` decorator appends the `additional_note` to
+  the docstring of `prob` (not `_prob`) and adds a new `kwargs`
+  section with each dictionary item as a bullet-point.
+
+  For a more detailed example, see `TransformedDistribution`.
+  """
+
+  def __init__(self, additional_note="", kwargs_dict=None):
+    """Initializes the AppendDocstring object.
+
+    Args:
+      additional_note: Python string added as additional docstring to public
+        version of function.
+      kwargs_dict: Python string/string dictionary representing
+        specific kwargs expanded from the **kwargs input.
+
+    Raises:
+      ValueError: if kwargs_dict.key contains whitespace.
+      ValueError: if kwargs_dict.value contains newlines.
+    """
+    self._additional_note = additional_note
+    if kwargs_dict:
+      bullets = []
+      for key in sorted(kwargs_dict.keys()):
+        value = kwargs_dict[key]
+        if any(x.isspace() for x in key):
+          raise ValueError(
+              "Parameter name \"%s\" contains whitespace." % key)
+        value = value.lstrip()
+        if "\n" in value:
+          raise ValueError(
+              "Parameter description for \"%s\" contains newlines." % key)
+        bullets.append("*  `%s`: %s" % (key, value))
+      self._additional_note += ("\n\n##### `kwargs`:\n\n" +
+                                "\n".join(bullets))
+
+  def __call__(self, fn):
+    @functools.wraps(fn)
+    def _fn(*args, **kwargs):
+      return fn(*args, **kwargs)
+    if _fn.__doc__ is None:
+      _fn.__doc__ = self._additional_note
+    else:
+      _fn.__doc__ += "\n%s" % self._additional_note
+    return _fn
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 2aeb9ce14d310758e3ab7ddc36fdaf7f3b90f867..315e7d4b43cc5cecdd744d72b0187a61d4913b47 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -26,6 +26,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+# Imports gradient definitions.
+from tensorflow.python.ops import data_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -33,16 +35,16 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 
 
-def _do_gather(params, ids, validate_indices=True, name=None):
+def _do_gather(params, ids, name=None):
   """Deals with doing gather differently for resource variables."""
   if isinstance(params, resource_variable_ops.ResourceVariable):
     return params.sparse_read(ids, name=name)
-  return array_ops.gather(
-      params, ids, name=name, validate_indices=validate_indices)
+  return array_ops.gather(params, ids, name=name)
 
 
 def embedding_lookup(params, ids, partition_strategy="mod", name=None,
-                     validate_indices=True, max_norm=None):
+                     validate_indices=True,  # pylint: disable=unused-argument
+                     max_norm=None):
   """Looks up `ids` in a list of embedding tensors.
 
   This function is used to perform parallel lookups on the list of
@@ -82,7 +84,10 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
       if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
       is `"mod"`.
     name: A name for the operation (optional).
-    validate_indices: Whether or not to validate gather indices.
+    validate_indices: DEPRECATED. If this operation is assigned to CPU, values
+      in `indices` are always validated to be within range.  If assigned to GPU,
+      out-of-bound indices result in safe but unspecified behavior, which may
+      include raising an error.
     max_norm: If not None, embedding values are l2-normalized to the value of
      max_norm.
 
@@ -92,7 +97,7 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
   Raises:
     ValueError: If `params` is empty.
   """
-  if params is None or params == []:  # pylint: disable=g-explicit-bool-comparison
+  if params in (None, (), []):
     raise ValueError("Need at least one param")
   if isinstance(params, variables.PartitionedVariable):
     params = list(params)  # Iterate to get the underlying Variables.
@@ -114,9 +119,7 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
       params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
     if np == 1:
       with ops.colocate_with(params[0]):
-        return maybe_normalize(
-            _do_gather(
-                params[0], ids, validate_indices=validate_indices, name=name))
+        return maybe_normalize(_do_gather(params[0], ids, name=name))
     else:
       ids = ops.convert_to_tensor(ids, name="ids")
       flat_ids = array_ops.reshape(ids, [-1])
@@ -176,9 +179,7 @@ def embedding_lookup(params, ids, partition_strategy="mod", name=None,
       partitioned_result = []
       for p in xrange(np):
         with ops.colocate_with(params[p]):
-          partitioned_result.append(
-              _do_gather(params[p], gather_ids[p],
-                         validate_indices=validate_indices))
+          partitioned_result.append(_do_gather(params[p], gather_ids[p]))
       # Stitch these back together
       ret = data_flow_ops.dynamic_stitch(pindices, partitioned_result,
                                          name=name)
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 2adf8f05d898780964659ac18a4d444d83d23e55..bd8a5c86acc7501b30489c78b714293ee14763c1 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -273,28 +273,6 @@ def _VerifyGeneratedGradients(grads, op):
   if len(grads) != len(op.inputs):
     raise ValueError("Num gradients %d generated for op %s do not match num "
                      "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
-    for i in xrange(len(grads)):
-      grad = grads[i]
-      inp = op.inputs[i]
-      if grad is None:
-        continue
-      if grad.dtype.is_floating:
-        if not inp.dtype.is_floating:
-          raise TypeError("Gradient type %s generated for real-valued op %s "
-                           "with type %s must be real" %
-                           (dtypes.as_dtype(grad.dtype).name, op.node_def,
-                            dtypes.as_dtype(inp.dtype).name))
-      elif grad.dtype.is_complex:
-        if not inp.dtype.is_complex:
-          raise TypeError("Gradient type %s generated for complex-valued op %s"
-                           " with type %s must be complex" %
-                           (dtypes.as_dtype(grad.dtype).name, op.node_def,
-                            dtypes.as_dtype(inp.dtype).name))
-      else:
-        raise TypeError("Gradient type %s generated for op %s "
-                         "with type %s must be either real or complex" %
-                         (dtypes.as_dtype(grad.dtype).name, op.node_def,
-                          dtypes.as_dtype(inp.dtype).name))
 
 
 def _StopOps(from_ops, pending_count):
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index cd4c37d7e03a50d240d64690fa78a4f607d0f5ee..aefed34d74425e239e562ace5966af984a60ff4a 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -577,14 +577,16 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
   def testWarnings(self):
     # TODO(gunan) Reenable after this issue is fixed:
     # https://github.com/google/protobuf/issues/2812
-    if sys.version_info < (3, 6):
-      # Smaller than the threshold: no warning.
-      c_sparse = ops.IndexedSlices(
-          array_ops.placeholder(dtypes.float32),
-          array_ops.placeholder(dtypes.int32), constant([4, 4, 4, 4]))
-      with warnings.catch_warnings(record=True) as w:
-        math_ops.multiply(c_sparse, 1.0)
-      self.assertEqual(0, len(w))
+    if sys.version_info >= (3, 6):
+      self.skipTest("Skipped test for Python 3.6+")
+
+    # Smaller than the threshold: no warning.
+    c_sparse = ops.IndexedSlices(
+        array_ops.placeholder(dtypes.float32),
+        array_ops.placeholder(dtypes.int32), constant([4, 4, 4, 4]))
+    with warnings.catch_warnings(record=True) as w:
+      math_ops.multiply(c_sparse, 1.0)
+    self.assertEqual(0, len(w))
 
     # Greater than or equal to the threshold: warning.
     c_sparse = ops.IndexedSlices(
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index 4981cb6a2eb79ae9f36490819edcd173fe741249..57a5b982abfc8211b25808c14d2aefb2bb889b66 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -63,16 +63,27 @@ GetSessionHandle
 GetSessionHandleV2
 GetSessionTensor
 HashTable
+HashTableV2
 InitializeTable
+InitializeTableV2
 InitializeTableFromTextFile
+InitializeTableFromTextFileV2
 LookupTableExport
+LookupTableExportV2
 LookupTableFind
+LookupTableFindV2
 LookupTableImport
+LookupTableImportV2
 LookupTableInsert
+LookupTableInsertV2
 LookupTableSize
+LookupTableSizeV2
 MutableDenseHashTable
+MutableDenseHashTableV2
 MutableHashTable
+MutableHashTableV2
 MutableHashTableOfTensors
+MutableHashTableOfTensorsV2
 Mutex
 MutexAcquire
 MutexRelease
@@ -220,6 +231,7 @@ BatchFFT3D
 BatchIFFT
 BatchIFFT2D
 BatchIFFT3D
+Bucketize
 Complex
 ComplexAbs
 Conj
@@ -301,9 +313,6 @@ PyFunc
 PyFuncStateless
 
 # sdca_ops
-SdcaFprint
-SdcaOptimizer
-SdcaShrinkL1
 
 # state_ops
 Variable
@@ -321,6 +330,7 @@ SerializeSparse
 SparseAdd
 SparseAddGrad
 SparseConcat
+SparseCross
 SparseSplit
 SparseSelectLastK
 SparseReorder
diff --git a/tensorflow/python/ops/image_ops.py b/tensorflow/python/ops/image_ops.py
index c29ae26f04ed07cb44ba821e3a3c5296faa2840d..51d0276140200ecfbe97379b292aca6efa447719 100644
--- a/tensorflow/python/ops/image_ops.py
+++ b/tensorflow/python/ops/image_ops.py
@@ -18,6 +18,7 @@
 
 See the @{$python/image} guide.
 
+@@decode_bmp
 @@decode_gif
 @@decode_jpeg
 @@encode_jpeg
@@ -59,6 +60,7 @@ See the @{$python/image} guide.
 @@per_image_standardization
 @@draw_bounding_boxes
 @@non_max_suppression
+@@non_max_suppression_v2
 @@sample_distorted_bounding_box
 @@total_variation
 """
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 78621d3b5707a527c5ed930e9e6369c7e6f8b38a..42cd3441345417c212f74fcf4fcdcf8874a07520 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -32,9 +32,9 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_image_ops
 from tensorflow.python.ops import gen_nn_ops
-from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 
 
@@ -52,6 +52,7 @@ ops.NotDifferentiable('SampleDistortedBoundingBox')
 # latent bugs here.
 ops.NotDifferentiable('ExtractGlimpse')
 ops.NotDifferentiable('NonMaxSuppression')
+ops.NotDifferentiable('NonMaxSuppressionV2')
 
 
 def _assert(cond, ex_type, msg):
@@ -218,7 +219,8 @@ def random_flip_up_down(image, seed=None):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
   mirror_cond = math_ops.less(uniform_random, .5)
   result = control_flow_ops.cond(mirror_cond,
@@ -246,7 +248,8 @@ def random_flip_left_right(image, seed=None):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
   mirror_cond = math_ops.less(uniform_random, .5)
   result = control_flow_ops.cond(mirror_cond,
@@ -273,7 +276,8 @@ def flip_left_right(image):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
 
 
@@ -295,7 +299,8 @@ def flip_up_down(image):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
 
 
@@ -312,7 +317,8 @@ def rot90(image, k=1, name=None):
   """
   with ops.name_scope(name, 'rot90', [image, k]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    _Check3DImage(image, require_static=False)
+    image = control_flow_ops.with_dependencies(
+        _Check3DImage(image, require_static=False), image)
     k = ops.convert_to_tensor(k, dtype=dtypes.int32, name='k')
     k.get_shape().assert_has_rank(0)
     k = math_ops.mod(k, 4)
@@ -350,7 +356,8 @@ def transpose_image(image):
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
 
 
@@ -379,12 +386,14 @@ def central_crop(image, central_fraction):
     3-D float Tensor
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
   if central_fraction <= 0.0 or central_fraction > 1.0:
     raise ValueError('central_fraction must be within (0, 1]')
   if central_fraction == 1.0:
     return image
 
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
+
   img_shape = array_ops.shape(image)
   depth = image.get_shape()[2]
   fraction_offset = int(1 / ((1 - central_fraction) / 2.0))
@@ -435,9 +444,6 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
   """
   image = ops.convert_to_tensor(image, name='image')
 
-  assert_ops = []
-  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
-
   is_batch = True
   image_shape = image.get_shape()
   if image_shape.ndims == 3:
@@ -450,6 +456,8 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
   elif image_shape.ndims != 4:
     raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
+  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
+
   batch, height, width, depth = _ImageDimensions(image, rank=4)
 
   after_padding_width = target_width - offset_width - width
@@ -515,9 +523,6 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
   """
   image = ops.convert_to_tensor(image, name='image')
 
-  assert_ops = []
-  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
-
   is_batch = True
   image_shape = image.get_shape()
   if image_shape.ndims == 3:
@@ -530,6 +535,8 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
   elif image_shape.ndims != 4:
     raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
+  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
+
   batch, height, width, depth = _ImageDimensions(image, rank=4)
 
   assert_ops += _assert(offset_width >= 0, ValueError,
@@ -602,8 +609,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
   elif image_shape.ndims != 4:
     raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
-  assert_ops = []
-  assert_ops += _CheckAtLeast3DImage(image, require_static=False)
+  assert_ops = _CheckAtLeast3DImage(image, require_static=False)
   assert_ops += _assert(target_width > 0, ValueError,
                         'target_width must be > 0.')
   assert_ops += _assert(target_height > 0, ValueError,
@@ -614,7 +620,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
   # Make sure our checks come first, so that error messages are clearer.
   if _is_tensor(target_height):
     target_height = control_flow_ops.with_dependencies(
-      assert_ops, target_height)
+        assert_ops, target_height)
   if _is_tensor(target_width):
     target_width = control_flow_ops.with_dependencies(assert_ops, target_width)
 
@@ -693,9 +699,12 @@ def resize_images(images,
 
   `method` can be one of:
 
-  *   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](https://en.wikipedia.org/wiki/Bilinear_interpolation)
-  *   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
-  *   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](https://en.wikipedia.org/wiki/Bicubic_interpolation)
+  *   <b>`ResizeMethod.BILINEAR`</b>: [Bilinear interpolation.](
+    https://en.wikipedia.org/wiki/Bilinear_interpolation)
+  *   <b>`ResizeMethod.NEAREST_NEIGHBOR`</b>: [Nearest neighbor interpolation.](
+    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
+  *   <b>`ResizeMethod.BICUBIC`</b>: [Bicubic interpolation.](
+    https://en.wikipedia.org/wiki/Bicubic_interpolation)
   *   <b>`ResizeMethod.AREA`</b>: Area interpolation.
 
   Args:
@@ -800,7 +809,8 @@ def per_image_standardization(image):
     ValueError: if the shape of 'image' is incompatible with this function.
   """
   image = ops.convert_to_tensor(image, name='image')
-  _Check3DImage(image, require_static=False)
+  image = control_flow_ops.with_dependencies(
+      _Check3DImage(image, require_static=False), image)
   num_pixels = math_ops.reduce_prod(array_ops.shape(image))
 
   image = math_ops.cast(image, dtype=dtypes.float32)
@@ -955,6 +965,7 @@ def adjust_contrast(images, contrast_factor):
 
 def adjust_gamma(image, gamma=1, gain=1):
   """Performs Gamma Correction on the input image.
+
     Also known as Power Law Transform. This function transforms the
     input image pixelwise according to the equation Out = In**gamma
     after scaling each pixel to the range 0 to 1.
@@ -967,6 +978,9 @@ def adjust_gamma(image, gamma=1, gain=1):
   Returns:
     A Tensor. Gamma corrected output image.
 
+  Raises:
+    ValueError: If gamma is negative.
+
   Notes:
     For gamma greater than 1, the histogram will shift towards left and
     the output image will be darker than the input image.
@@ -977,16 +991,17 @@ def adjust_gamma(image, gamma=1, gain=1):
     [1] http://en.wikipedia.org/wiki/Gamma_correction
   """
 
-  with ops.op_scope([image, gamma, gain], None, 'adjust_gamma') as name:
+  with ops.op_scope([image, gamma, gain], None, 'adjust_gamma'):
     # Convert pixel value to DT_FLOAT for computing adjusted image
     img = ops.convert_to_tensor(image, name='img', dtype=dtypes.float32)
     # Keep image dtype for computing the scale of corresponding dtype
     image = ops.convert_to_tensor(image, name='image')
 
     if gamma < 0:
-      raise ValueError("Gamma should be a non-negative real number")
+      raise ValueError('Gamma should be a non-negative real number')
     # scale = max(dtype) - min(dtype)
-    scale = constant_op.constant(image.dtype.limits[1] - image.dtype.limits[0], dtype=dtypes.float32)
+    scale = constant_op.constant(image.dtype.limits[1] - image.dtype.limits[0],
+                                 dtype=dtypes.float32)
     # According to the definition of gamma correction
     adjusted_img = (img / scale) ** gamma * scale * gain
 
@@ -1298,15 +1313,18 @@ def adjust_saturation(image, saturation_factor, name=None):
 
 
 def decode_image(contents, channels=None, name=None):
-  """Convenience function for `decode_gif`, `decode_jpeg`, and `decode_png`.
-  Detects whether an image is a GIF, JPEG, or PNG, and performs the appropriate
-  operation to convert the input bytes `string` into a `Tensor` of type `uint8`.
+  """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
+  and `decode_png`.
+
+  Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the
+  appropriate operation to convert the input bytes `string` into a `Tensor` of
+  type `uint8`.
 
   Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as
-  opposed to `decode_jpeg` and `decode_png`, which return 3-D arrays
-  `[height, width, num_channels]`. Make sure to take this into account when
-  constructing your graph if you are intermixing GIF files with JPEG and/or PNG
-  files.
+  opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D
+  arrays `[height, width, num_channels]`. Make sure to take this into account
+  when constructing your graph if you are intermixing GIF files with BMP, JPEG,
+  and/or PNG files.
 
   Args:
     contents: 0-D `string`. The encoded image bytes.
@@ -1316,39 +1334,73 @@ def decode_image(contents, channels=None, name=None):
 
   Returns:
     `Tensor` with type `uint8` with shape `[height, width, num_channels]` for
-      JPEG and PNG images and shape `[num_frames, height, width, 3]` for GIF
-      images.
+      BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for
+      GIF images.
+
+  Raises:
+    ValueError: On incorrect number of channels.
   """
-  with ops.name_scope(name, 'decode_image') as scope:
-    if channels not in (None, 0, 1, 3):
-      raise ValueError('channels must be in (None, 0, 1, 3)')
-    substr = string_ops.substr(contents, 0, 4)
+  with ops.name_scope(name, 'decode_image'):
+    if channels not in (None, 0, 1, 3, 4):
+      raise ValueError('channels must be in (None, 0, 1, 3, 4)')
+    substr = string_ops.substr(contents, 0, 3)
+
+    def _bmp():
+      """Decodes a GIF image."""
+      signature = string_ops.substr(contents, 0, 2)
+      # Create assert op to check that bytes are BMP decodable
+      is_bmp = math_ops.equal(signature, 'BM', name='is_bmp')
+      decode_msg = 'Unable to decode bytes as JPEG, PNG, GIF, or BMP'
+      assert_decode = control_flow_ops.Assert(is_bmp, [decode_msg])
+      bmp_channels = 0 if channels is None else channels
+      good_channels = math_ops.not_equal(bmp_channels, 1, name='check_channels')
+      channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images'
+      assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
+      with ops.control_dependencies([assert_decode, assert_channels]):
+        return gen_image_ops.decode_bmp(contents)
 
     def _gif():
-      # Create assert op to check that bytes are GIF decodable
-      is_gif = math_ops.equal(substr, b'\x47\x49\x46\x38', name='is_gif')
-      decode_msg = 'Unable to decode bytes as JPEG, PNG, or GIF'
-      assert_decode = control_flow_ops.Assert(is_gif, [decode_msg])
       # Create assert to make sure that channels is not set to 1
       # Already checked above that channels is in (None, 0, 1, 3)
+
       gif_channels = 0 if channels is None else channels
-      good_channels = math_ops.not_equal(gif_channels, 1, name='check_channels')
+      good_channels = math_ops.logical_and(
+          math_ops.not_equal(gif_channels, 1, name='check_gif_channels'),
+          math_ops.not_equal(gif_channels, 4, name='check_gif_channels')
+      )
       channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
-      with ops.control_dependencies([assert_decode, assert_channels]):
+      with ops.control_dependencies([assert_channels]):
         return gen_image_ops.decode_gif(contents)
 
+    def check_gif():
+      # Create assert op to check that bytes are GIF decodable
+      is_gif = math_ops.equal(substr, b'\x47\x49\x46', name='is_gif')
+      return control_flow_ops.cond(is_gif, _gif, _bmp, name='cond_gif')
+
     def _png():
+      """Decodes a PNG image."""
       return gen_image_ops.decode_png(contents, channels)
 
     def check_png():
-      is_png = math_ops.equal(substr, b'\211PNG', name='is_png')
-      return control_flow_ops.cond(is_png, _png, _gif, name='cond_png')
+      """Checks if an image is PNG."""
+      is_png = math_ops.equal(substr, b'\211PN', name='is_png')
+      return control_flow_ops.cond(is_png, _png, check_gif, name='cond_png')
 
     def _jpeg():
-      return gen_image_ops.decode_jpeg(contents, channels)
+      """Decodes a jpeg image."""
+      jpeg_channels = 0 if channels is None else channels
+      good_channels = math_ops.not_equal(jpeg_channels, 4,
+                                         name='check_jpeg_channels')
+      channels_msg = ('Channels must be in (None, 0, 1, 3) when decoding JPEG '
+                      'images')
+      assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
+      with ops.control_dependencies([assert_channels]):
+        return gen_image_ops.decode_jpeg(contents, channels)
 
-    is_jpeg = math_ops.equal(substr, b'\xff\xd8\xff\xe0', name='is_jpeg')
+    # Decode normal JPEG images (start with \xff\xd8\xff\xe0)
+    # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1).
+    is_jpeg = math_ops.equal(substr, b'\xff\xd8\xff', name='is_jpeg')
     return control_flow_ops.cond(is_jpeg, _jpeg, check_png, name='cond_jpeg')
 
 
@@ -1416,7 +1468,7 @@ def total_variation(images, name=None):
 
     # Calculate the total variation by taking the absolute value of the
     # pixel-differences and summing over the appropriate axis.
-    tot_var = math_ops.reduce_sum(math_ops.abs(pixel_dif1), axis=sum_axis) + \
-              math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis)
+    tot_var = (math_ops.reduce_sum(math_ops.abs(pixel_dif1), axis=sum_axis) +
+               math_ops.reduce_sum(math_ops.abs(pixel_dif2), axis=sum_axis))
 
   return tot_var
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 887140c726ae587beb2417fcfc1361b41b44d90d..5588d18ef1d75e38bd7a91f02c50c508f22044ad 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import colorsys
+import functools
 import math
 import os
 import time
@@ -1175,12 +1176,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     offset_height, offset_width = [0, 0]
     target_height, target_width = [2, 2]
 
-    for x_shape in ([3, 5],):
-      self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
-                         target_width,
-                         "'image' must be at least three-dimensional.")
-
-    for x_shape in ([1, 3, 5, 1, 1],):
+    for x_shape in ([3, 5], [1, 3, 5, 1, 1]):
       self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
                          target_width,
                          "'image' must have either 3 or 4 dimensions.")
@@ -1426,12 +1422,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     offset_height, offset_width = [0, 0]
     target_height, target_width = [2, 2]
 
-    for x_shape in ([3, 5],):
-      self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
-                         target_width,
-                         "'image' must be at least three-dimensional")
-
-    for x_shape in ([1, 3, 5, 1, 1],):
+    for x_shape in ([3, 5], [1, 3, 5, 1, 1]):
       self._assertRaises(x, x_shape, offset_height, offset_width, target_height,
                          target_width,
                          "'image' must have either 3 or 4 dimensions.")
@@ -1458,7 +1449,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
           use_tensor_inputs_options=[False])
 
       # The orignal error message does not contain back slashes. However, they
-      # are added by either the assert op or the runtime. If this behaviour
+      # are added by either the assert op or the runtime. If this behavior
       # changes in the future, the match string will also needs to be changed.
       self._assertRaises(
           x,
@@ -1652,8 +1643,8 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     self.assertEqual(y.get_shape().as_list(), [None] + post_shape)
 
   def shouldRunOnGPU(self, opt, nptype):
-    if opt == image_ops.ResizeMethod.NEAREST_NEIGHBOR \
-            and nptype in [np.float32, np.float64]:
+    if (opt == image_ops.ResizeMethod.NEAREST_NEIGHBOR and
+        nptype in [np.float32, np.float64]):
       return True
     else:
       return False
@@ -1676,15 +1667,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       img_np = np.array(data, dtype=nptype).reshape(img_shape)
 
       for opt in self.OPTIONS:
-        if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
-          with self.test_session(use_gpu=True) as sess:
-            image = constant_op.constant(img_np, shape=img_shape)
-            y = image_ops.resize_images(image, [target_height, target_width],
-                                        opt)
-            yshape = array_ops.shape(y)
-            resized, newshape = sess.run([y, yshape])
-            self.assertAllEqual(img_shape, newshape)
-            self.assertAllClose(resized, img_np, atol=1e-5)
+        with self.test_session(use_gpu=True) as sess:
+          image = constant_op.constant(img_np, shape=img_shape)
+          y = image_ops.resize_images(image, [target_height, target_width], opt)
+          yshape = array_ops.shape(y)
+          resized, newshape = sess.run([y, yshape])
+          self.assertAllEqual(img_shape, newshape)
+          self.assertAllClose(resized, img_np, atol=1e-5)
 
       # Resizing with a single image must leave the shape unchanged also.
       with self.test_session(use_gpu=True):
@@ -1822,7 +1811,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               resized = y.eval()
               self.assertAllClose(resized, expected, atol=1e-5)
 
-  def testResizeUp(self):
+  def testResizeUpAlignCornersFalse(self):
     img_shape = [1, 3, 2, 1]
     data = [64, 32,
             32, 64,
@@ -1857,16 +1846,63 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image_ops.ResizeMethod.BILINEAR,
           image_ops.ResizeMethod.NEAREST_NEIGHBOR,
           image_ops.ResizeMethod.AREA]:
-        if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
-          with self.test_session(use_gpu=True):
-            img_np = np.array(data, dtype=nptype).reshape(img_shape)
-            image = constant_op.constant(img_np, shape=img_shape)
-            y = image_ops.resize_images(
-                image, [target_height, target_width], opt)
-            resized = y.eval()
-            expected = np.array(expected_data[opt]).reshape(
-                [1, target_height, target_width, 1])
-            self.assertAllClose(resized, expected, atol=1e-05)
+        with self.test_session(use_gpu=True):
+          img_np = np.array(data, dtype=nptype).reshape(img_shape)
+          image = constant_op.constant(img_np, shape=img_shape)
+          y = image_ops.resize_images(
+              image, [target_height, target_width], opt, align_corners=False)
+          resized = y.eval()
+          expected = np.array(expected_data[opt]).reshape(
+              [1, target_height, target_width, 1])
+          self.assertAllClose(resized, expected, atol=1e-05)
+
+  def testResizeUpAlignCornersTrue(self):
+    img_shape = [1, 3, 2, 1]
+    data = [6, 3,
+            3, 6,
+            6, 9]
+    target_height = 5
+    target_width = 4
+    expected_data = {}
+    expected_data[image_ops.ResizeMethod.BILINEAR] = [
+        6.0, 5.0, 4.0, 3.0,
+        4.5, 4.5, 4.5, 4.5,
+        3.0, 4.0, 5.0, 6.0,
+        4.5, 5.5, 6.5, 7.5,
+        6.0, 7.0, 8.0, 9.0
+    ]
+    expected_data[image_ops.ResizeMethod.NEAREST_NEIGHBOR] = [
+        6.0, 6.0, 3.0, 3.0,
+        3.0, 3.0, 6.0, 6.0,
+        3.0, 3.0, 6.0, 6.0,
+        6.0, 6.0, 9.0, 9.0,
+        6.0, 6.0, 9.0, 9.0
+    ]
+    # TODO(b/37749740): Improve alignment of ResizeMethod.AREA when
+    # align_corners=True.
+    expected_data[image_ops.ResizeMethod.AREA] = [
+        6.0, 6.0, 6.0, 3.0,
+        6.0, 6.0, 6.0, 3.0,
+        3.0, 3.0, 3.0, 6.0,
+        3.0, 3.0, 3.0, 6.0,
+        6.0, 6.0, 6.0, 9.0
+    ]
+
+    for nptype in self.TYPES:
+      for opt in [
+          image_ops.ResizeMethod.BILINEAR,
+          image_ops.ResizeMethod.NEAREST_NEIGHBOR,
+          image_ops.ResizeMethod.AREA
+      ]:
+        with self.test_session(use_gpu=True):
+          img_np = np.array(data, dtype=nptype).reshape(img_shape)
+          image = constant_op.constant(img_np, shape=img_shape)
+          y = image_ops.resize_images(
+              image, [target_height, target_width], opt, align_corners=True)
+          resized = y.eval()
+          expected = np.array(expected_data[opt]).reshape(
+              [1, target_height, target_width, 1])
+          self.assertAllClose(resized, expected, atol=1e-05)
 
   def testResizeUpBicubic(self):
     img_shape = [1, 6, 6, 1]
@@ -2245,7 +2281,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
           use_tensor_inputs_options=[False])
 
       # The orignal error message does not contain back slashes. However, they
-      # are added by either the assert op or the runtime. If this behaviour
+      # are added by either the assert op or the runtime. If this behavior
       # changes in the future, the match string will also needs to be changed.
       self._assertRaises(
           x,
@@ -2757,5 +2793,37 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     self._test(multi, tot_var * np.array([1.0, 1.1, 1.2]))
 
 
+class FormatTest(test_util.TensorFlowTestCase):
+
+  def testFormats(self):
+    prefix = "tensorflow/core/lib"
+    paths = ("png/testdata/lena_gray.png", "jpeg/testdata/jpeg_merge_test1.jpg",
+             "gif/testdata/lena.gif")
+    decoders = {
+        "jpeg": functools.partial(image_ops.decode_jpeg, channels=3),
+        "png": functools.partial(image_ops.decode_png, channels=3),
+        "gif": lambda s: array_ops.squeeze(image_ops.decode_gif(s), axis=0),
+    }
+    with self.test_session():
+      for path in paths:
+        contents = io_ops.read_file(os.path.join(prefix, path)).eval()
+        images = {}
+        for name, decode in decoders.items():
+          image = decode(contents).eval()
+          self.assertEqual(image.ndim, 3)
+          for prev_name, prev in images.items():
+            print("path %s, names %s %s, shapes %s %s" %
+                  (path, name, prev_name, image.shape, prev.shape))
+            self.assertAllEqual(image, prev)
+          images[name] = image
+
+  def testError(self):
+    path = "tensorflow/core/lib/gif/testdata/scan.gif"
+    with self.test_session():
+      for decode in image_ops.decode_jpeg, image_ops.decode_png:
+        with self.assertRaisesOpError(r"Got 12 frames"):
+          decode(io_ops.read_file(path)).eval()
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 7610a4629b5375fb3777f7a3d6e006a376eb32d4..1e2f999995756ad4b4c432ddfc31c39254818622 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -39,7 +39,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import math_ops
 
 
 class Initializer(object):
@@ -49,30 +51,64 @@ class Initializer(object):
   def __call__(self, shape, dtype=None, partition_info=None):
     raise NotImplementedError
 
+  def get_config(self):
+    """Returns the configuration of the initializer as a JSON-serializable dict.
+
+    Returns:
+      A JSON-serializable Python dict.
+    """
+    return {}
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates an initializer from a configuration dictionary.
+
+    Example:
+
+    ```
+    initializer = RandomUniform(-1, 1)
+    config = initializer.get_config()
+    initializer = RandomUniform.from_config(config)
+    ```
+
+    Arguments:
+      config: A Python dictionary.
+        It will typically be the output of `get_config`.
+
+    Returns:
+      An Initializer instance.
+    """
+    return cls(**config)
+
 
 class Zeros(Initializer):
   """Initializer that generates tensors initialized to 0."""
 
   def __init__(self, dtype=dtypes.float32):
-    self.dtype = dtype
+    self.dtype = dtypes.as_dtype(dtype)
 
   def __call__(self, shape, dtype=None, partition_info=None):
     if dtype is None:
       dtype = self.dtype
-    return constant_op.constant(False if dtype is dtypes.bool else 0,
-                                dtype=dtype, shape=shape)
+    return array_ops.zeros(shape, dtype)
+
+  def get_config(self):
+    return {"dtype": self.dtype.name}
 
 
 class Ones(Initializer):
   """Initializer that generates tensors initialized to 1."""
 
   def __init__(self, dtype=dtypes.float32):
-    self.dtype = dtype
+    self.dtype = dtypes.as_dtype(dtype)
 
   def __call__(self, shape, dtype=None, partition_info=None):
     if dtype is None:
       dtype = self.dtype
-    return constant_op.constant(1, dtype=dtype, shape=shape)
+    return array_ops.ones(shape, dtype)
+
+  def get_config(self):
+    return {"dtype": self.dtype.name}
 
 
 class Constant(Initializer):
@@ -152,14 +188,27 @@ class Constant(Initializer):
 
   def __init__(self, value=0, dtype=dtypes.float32, verify_shape=False):
     self.value = value
-    self.dtype = dtype
-    self.verify_shape = verify_shape
+    self.dtype = dtypes.as_dtype(dtype)
+    self._verify_shape = verify_shape
 
-  def __call__(self, shape, dtype=None, partition_info=None):
+  def __call__(self, shape,
+               dtype=None,
+               partition_info=None,
+               verify_shape=None):
     if dtype is None:
       dtype = self.dtype
+    if verify_shape is None:
+      verify_shape = self._verify_shape
     return constant_op.constant(self.value, dtype=dtype, shape=shape,
-                                verify_shape=self.verify_shape)
+                                verify_shape=verify_shape)
+
+  def get_config(self):
+    # We don't include `verify_shape` for compatibility with Keras.
+    # `verify_shape` should be passed as an argument to `__call__` rather
+    # than as a constructor argument: conceptually it isn't a property
+    # of the initializer.
+    return {"value": self.value,
+            "dtype": self.dtype.name}
 
 
 class RandomUniform(Initializer):
@@ -180,7 +229,7 @@ class RandomUniform(Initializer):
     self.minval = minval
     self.maxval = maxval
     self.seed = seed
-    self.dtype = dtype
+    self.dtype = dtypes.as_dtype(dtype)
 
   def __call__(self, shape, dtype=None, partition_info=None):
     if dtype is None:
@@ -188,6 +237,12 @@ class RandomUniform(Initializer):
     return random_ops.random_uniform(shape, self.minval, self.maxval,
                                      dtype, seed=self.seed)
 
+  def get_config(self):
+    return {"minval": self.minval,
+            "maxval": self.maxval,
+            "seed": self.seed,
+            "dtype": self.dtype.name}
+
 
 class RandomNormal(Initializer):
   """Initializer that generates tensors with a normal distribution.
@@ -207,7 +262,7 @@ class RandomNormal(Initializer):
     self.mean = mean
     self.stddev = stddev
     self.seed = seed
-    self.dtype = _assert_float_dtype(dtype)
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
 
   def __call__(self, shape, dtype=None, partition_info=None):
     if dtype is None:
@@ -215,6 +270,12 @@ class RandomNormal(Initializer):
     return random_ops.random_normal(shape, self.mean, self.stddev,
                                     dtype, seed=self.seed)
 
+  def get_config(self):
+    return {"mean": self.mean,
+            "stddev": self.stddev,
+            "seed": self.seed,
+            "dtype": self.dtype.name}
+
 
 class TruncatedNormal(Initializer):
   """Initializer that generates a truncated normal distribution.
@@ -239,7 +300,7 @@ class TruncatedNormal(Initializer):
     self.mean = mean
     self.stddev = stddev
     self.seed = seed
-    self.dtype = _assert_float_dtype(dtype)
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
 
   def __call__(self, shape, dtype=None, partition_info=None):
     if dtype is None:
@@ -247,6 +308,12 @@ class TruncatedNormal(Initializer):
     return random_ops.truncated_normal(shape, self.mean, self.stddev,
                                        dtype, seed=self.seed)
 
+  def get_config(self):
+    return {"mean": self.mean,
+            "stddev": self.stddev,
+            "seed": self.seed,
+            "dtype": self.dtype.name}
+
 
 class UniformUnitScaling(Initializer):
   """Initializer that generates tensors without scaling variance.
@@ -278,7 +345,7 @@ class UniformUnitScaling(Initializer):
   def __init__(self, factor=1.0, seed=None, dtype=dtypes.float32):
     self.factor = factor
     self.seed = seed
-    self.dtype = _assert_float_dtype(dtype)
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
 
   def __call__(self, shape, dtype=None, partition_info=None):
     if dtype is None:
@@ -299,6 +366,11 @@ class UniformUnitScaling(Initializer):
     return random_ops.random_uniform(shape, -max_val, max_val,
                                      dtype, seed=self.seed)
 
+  def get_config(self):
+    return {"factor": self.factor,
+            "seed": self.seed,
+            "dtype": self.dtype.name}
+
 
 class VarianceScaling(Initializer):
   """Initializer capable of adapting its scale to the shape of weights tensors.
@@ -343,7 +415,7 @@ class VarianceScaling(Initializer):
     self.mode = mode
     self.distribution = distribution
     self.seed = seed
-    self.dtype = _assert_float_dtype(dtype)
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
 
   def __call__(self, shape, dtype=None, partition_info=None):
     if dtype is None:
@@ -368,6 +440,13 @@ class VarianceScaling(Initializer):
       return random_ops.random_uniform(shape, -limit, limit,
                                        dtype, seed=self.seed)
 
+  def get_config(self):
+    return {"scale": self.scale,
+            "mode": self.mode,
+            "distribution": self.distribution,
+            "seed": self.seed,
+            "dtype": self.dtype.name}
+
 
 class Orthogonal(Initializer):
   """Initializer that generates an orthogonal matrix.
@@ -389,9 +468,9 @@ class Orthogonal(Initializer):
       for behavior.
   """
 
-  def __init__(self, gain=1.0, dtype=dtypes.float32, seed=None):
+  def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
     self.gain = gain
-    self.dtype = _assert_float_dtype(dtype)
+    self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
     self.seed = seed
 
   def __call__(self, shape, dtype=None, partition_info=None):
@@ -410,18 +489,25 @@ class Orthogonal(Initializer):
     flat_shape = (num_rows, num_cols)
 
     # Generate a random matrix
-    a = random_ops.random_uniform(flat_shape, dtype=dtype, seed=self.seed)
-    # Compute the svd
-    _, u, v = linalg_ops.svd(a, full_matrices=False)
-    # Pick the appropriate singular value decomposition
-    if num_rows > num_cols:
-      q = u
-    else:
-      # Tensorflow departs from numpy conventions
-      # such that we need to transpose axes here
-      q = array_ops.transpose(v)
+    a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
+    # Compute the qr factorization
+    q, r = linalg_ops.qr(a, full_matrices=False)
+    # Make Q uniform
+    square_len = math_ops.minimum(num_rows, num_cols)
+    d = array_ops.diag_part(r[:square_len, :square_len])
+    ph = d / math_ops.abs(d)
+    q *= ph
+    # Pad zeros to Q (if rows smaller than cols)
+    if num_rows < num_cols:
+      padding = array_ops.zeros([num_rows, num_cols - num_rows], dtype=dtype)
+      q = array_ops.concat([q, padding], 1)
     return self.gain * array_ops.reshape(q, shape)
 
+  def get_config(self):
+    return {"gain": self.gain,
+            "seed": self.seed,
+            "dtype": self.dtype.name}
+
 
 # Aliases.
 
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index ae45c40aec4ff181b710ac76fbe9f900ae1dea7e..68ecc219e4f45922a2ad7b9a88f0600c113f7a7d 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -391,7 +391,11 @@ class FixedLengthRecordReader(ReaderBase):
   """
   # TODO(josh11b): Support serializing and restoring state.
 
-  def __init__(self, record_bytes, header_bytes=None, footer_bytes=None,
+  def __init__(self,
+               record_bytes,
+               header_bytes=None,
+               footer_bytes=None,
+               hop_bytes=None,
                name=None):
     """Create a FixedLengthRecordReader.
 
@@ -399,11 +403,15 @@ class FixedLengthRecordReader(ReaderBase):
       record_bytes: An int.
       header_bytes: An optional int. Defaults to 0.
       footer_bytes: An optional int. Defaults to 0.
+      hop_bytes: An optional int. Defaults to 0.
       name: A name for the operation (optional).
     """
     rr = gen_io_ops._fixed_length_record_reader_v2(
-        record_bytes=record_bytes, header_bytes=header_bytes,
-        footer_bytes=footer_bytes, name=name)
+        record_bytes=record_bytes,
+        header_bytes=header_bytes,
+        footer_bytes=footer_bytes,
+        hop_bytes=hop_bytes,
+        name=name)
     super(FixedLengthRecordReader, self).__init__(rr)
 
 
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index e2fd25675ec50f1d244fe85241918764b6d62208..9b6420317da6c5e88a4276b0fd32b9326b58b0c7 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -257,7 +257,9 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
     name: string, optional name of the operation.
 
   Returns:
-    s: Singular values. Shape is `[..., P]`.
+    s: Singular values. Shape is `[..., P]`. The values are sorted in reverse
+      order of magnitude, so s[..., 0] is the largest value, s[..., 1] is the
+      second largest, etc.
     u: Left singular vectors. If `full_matrices` is `False` (default) then
       shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
       `[..., M, M]`. Not returned if `compute_uv` is `False`.
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 9c49ef78896740a0a66e995f759e200b7ecf6cde..08e3f83a0b21a8444ad3500c62fe624440edc255 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -86,7 +86,7 @@ def histogram_summary(tag, values, collections=None, name=None):
   This ops is deprecated. Please switch to tf.summary.histogram.
 
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The generated
   [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
@@ -124,7 +124,7 @@ def image_summary(tag, tensor, max_images=3, collections=None, name=None):
   """Outputs a `Summary` protocol buffer with images.
 
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The summary has up to `max_images` summary values containing images. The
   images are built from `tensor` which must be 4-D with shape `[batch_size,
@@ -190,7 +190,7 @@ def audio_summary(tag,
 
   This op is deprecated. Please switch to tf.summary.audio.
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The summary has up to `max_outputs` summary values containing audio. The
   audio is built from `tensor` which must be 3-D with shape `[batch_size,
@@ -326,7 +326,7 @@ def scalar_summary(tags, values, collections=None, name=None):
 
   This ops is deprecated. Please switch to tf.summary.scalar.
   For an explanation of why this op was deprecated, and information on how to
-  migrate, look ['here'](https://www.tensorflow.org/code/tensorflow/contrib/deprecated/__init__.py)
+  migrate, look ['here'](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/deprecated/__init__.py)
 
   The input `tags` and `values` must have the same shape.  The generated
   summary has a summary value for each tag-value pair in `tags` and `values`.
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..82277ebaccbf32eb4e8935c97110301ccfb00d7c
--- /dev/null
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -0,0 +1,1215 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#==============================================================================
+"""Lookup operations."""
+# pylint: disable=g-bad-name
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import functools
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_lookup_ops import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated
+
+
+# TODO(yleon): Remove this function.
+@deprecated("2017-03-02", "Use `tf.tables_initializer` instead.")
+def initialize_all_tables(name="init_all_tables"):
+  """Returns an Op that initializes all tables of the default graph.
+
+  Args:
+    name: Optional name for the initialization op.
+
+  Returns:
+    An Op that initializes all tables.  Note that if there are
+    not tables the returned Op is a NoOp.
+  """
+  return tables_initializer(name)
+
+
+def tables_initializer(name="init_all_tables"):
+  """Returns an Op that initializes all tables of the default graph.
+
+  Args:
+    name: Optional name for the initialization op.
+
+  Returns:
+    An Op that initializes all tables.  Note that if there are
+    not tables the returned Op is a NoOp.
+  """
+  initializers = ops.get_collection(ops.GraphKeys.TABLE_INITIALIZERS)
+  if initializers:
+    return control_flow_ops.group(*initializers, name=name)
+  return control_flow_ops.no_op(name=name)
+
+
+def _check_table_dtypes(table, key_dtype, value_dtype):
+  """Check that the given key_dtype and value_dtype matches the table dtypes.
+
+  Args:
+    table: The table to check types against to.
+    key_dtype: The key data type to check.
+    value_dtype: The value data type to check.
+
+  Raises:
+    TypeError: when 'key_dtype' or 'value_dtype' doesn't match the table data
+      types.
+  """
+  if key_dtype != table.key_dtype:
+    raise TypeError("Invalid key dtype, expected %s but got %s." %
+                    (table.key_dtype, key_dtype))
+  if value_dtype != table.value_dtype:
+    raise TypeError("Invalid value dtype, expected %s but got %s." %
+                    (table.value_dtype, value_dtype))
+
+
+class LookupInterface(object):
+  """Represent a lookup table that persists across different steps."""
+
+  def __init__(self, key_dtype, value_dtype, name):
+    """Construct a lookup table interface.
+
+    Args:
+      key_dtype: The table key type.
+      value_dtype: The table value type.
+      name: A name for the operation (optional).
+    """
+    self._key_dtype = dtypes.as_dtype(key_dtype)
+    self._value_dtype = dtypes.as_dtype(value_dtype)
+    self._name = name
+
+  @property
+  def key_dtype(self):
+    """The table key dtype."""
+    return self._key_dtype
+
+  @property
+  def value_dtype(self):
+    """The table value dtype."""
+    return self._value_dtype
+
+  @property
+  def name(self):
+    """The name of the table."""
+    return self._name
+
+  @property
+  def init(self):
+    """The table initialization op."""
+    raise NotImplementedError
+
+  def size(self, name=None):
+    """Compute the number of elements in this table."""
+    raise NotImplementedError
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values."""
+    raise NotImplementedError
+
+
+class InitializableLookupTableBase(LookupInterface):
+  """Initializable lookup table interface.
+
+  An initializable lookup tables persist across different steps.
+  """
+
+  def __init__(self, table_ref, default_value, initializer):
+    """Construct a table object from a table reference.
+
+    If requires a table initializer object (subclass of `TableInitializerBase`).
+    It provides the table key and value types, as well as the op to initialize
+    the table. The caller is responsible to execute the initialization op.
+
+    Args:
+      table_ref: The table reference, i.e. the output of the lookup table ops.
+      default_value: The value to use if a key is missing in the table.
+      initializer: The table initializer to use.
+    """
+    super(InitializableLookupTableBase,
+          self).__init__(initializer.key_dtype, initializer.value_dtype,
+                         table_ref.op.name.split("/")[-1])
+    self._table_ref = table_ref
+    self._default_value = ops.convert_to_tensor(
+        default_value, dtype=self._value_dtype)
+    self._default_value.get_shape().merge_with(tensor_shape.scalar())
+    self._init = initializer.initialize(self)
+
+  @property
+  def table_ref(self):
+    """Get the underlying table reference."""
+    return self._table_ref
+
+  @property
+  def default_value(self):
+    """The default value of the table."""
+    return self._default_value
+
+  @property
+  def init(self):
+    """The table initialization op."""
+    return self._init
+
+  def size(self, name=None):
+    """Compute the number of elements in this table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A scalar tensor containing the number of elements in this table.
+    """
+    with ops.name_scope(name, "%s_Size" % self._name,
+                        [self._table_ref]) as scope:
+      # pylint: disable=protected-access
+      return gen_lookup_ops._lookup_table_size_v2(self._table_ref, name=scope)
+      # pylint: enable=protected-access
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values.
+
+    The `default_value` is used for keys not present in the table.
+
+    Args:
+      keys: Keys to look up. May be either a `SparseTensor` or dense `Tensor`.
+      name: A name for the operation (optional).
+
+    Returns:
+      A `SparseTensor` if keys are sparse, otherwise a dense `Tensor`.
+
+    Raises:
+      TypeError: when `keys` or `default_value` doesn't match the table data
+        types.
+    """
+    key_tensor = keys
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      key_tensor = keys.values
+
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+
+    with ops.name_scope(name, "%s_Lookup" % self._name,
+                        (self._table_ref, key_tensor,
+                         self._default_value)) as scope:
+      # pylint: disable=protected-access
+      values = gen_lookup_ops._lookup_table_find_v2(
+          self._table_ref, key_tensor, self._default_value, name=scope)
+      # pylint: enable=protected-access
+
+    values.set_shape(key_tensor.get_shape())
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(keys.indices, values, keys.dense_shape)
+    else:
+      return values
+
+
+class HashTable(InitializableLookupTableBase):
+  """A generic hash table implementation.
+
+  Example usage:
+
+  ```python
+  table = tf.contrib.lookup.HashTable(
+      tf.contrib.lookup.KeyValueTensorInitializer(keys, values), -1)
+  out = table.lookup(input_tensor).
+  table.init.run()
+  print out.eval()
+  ```
+  """
+
+  def __init__(self, initializer, default_value, shared_name=None, name=None):
+    """Creates a non-initialized `HashTable` object.
+
+    Creates a table, the type of its keys and values are specified by the
+    initializer.
+    Before using the table you will have to initialize it. After initialization
+    the table will be immutable.
+
+    Args:
+      initializer: The table initializer to use. See `HashTable` kernel for
+        supported key and value types.
+      default_value: The value to use if a key is missing in the table.
+      shared_name: If non-empty, this table will be shared under
+        the given name across multiple sessions.
+      name: A name for the operation (optional).
+
+    Returns:
+      A `HashTable` object.
+    """
+    with ops.name_scope(name, "hash_table", (initializer,
+                                             default_value)) as scope:
+      # pylint: disable=protected-access
+      table_ref = gen_lookup_ops._hash_table_v2(
+          shared_name=shared_name,
+          key_dtype=initializer.key_dtype,
+          value_dtype=initializer.value_dtype,
+          name=scope)
+      # pylint: enable=protected-access
+
+      super(HashTable, self).__init__(table_ref, default_value, initializer)
+
+
+class TableInitializerBase(object):
+  """Base class for lookup table initializers."""
+
+  def __init__(self, key_dtype, value_dtype):
+    """Construct a table initializer object.
+
+    Args:
+      key_dtype: Type of the table keys.
+      value_dtype: Type of the table values.
+    """
+    self._key_dtype = dtypes.as_dtype(key_dtype)
+    self._value_dtype = dtypes.as_dtype(value_dtype)
+
+  @property
+  def key_dtype(self):
+    """The expected table key dtype."""
+    return self._key_dtype
+
+  @property
+  def value_dtype(self):
+    """The expected table value dtype."""
+    return self._value_dtype
+
+  def initialize(self, table):
+    """Returns the table initialization op."""
+    raise NotImplementedError
+
+
+class KeyValueTensorInitializer(TableInitializerBase):
+  """Table initializers given `keys` and `values` tensors."""
+
+  def __init__(self, keys, values, key_dtype=None, value_dtype=None, name=None):
+    """Constructs a table initializer object based on keys and values tensors.
+
+    Args:
+      keys: The tensor for the keys.
+      values: The tensor for the values.
+      key_dtype: The `keys` data type. Used when `keys` is a python array.
+      value_dtype: The `values` data type. Used when `values` is a python array.
+      name: A name for the operation (optional).
+    """
+    with ops.name_scope(name, "key_value_init", [keys, values]) as scope:
+      self._keys = ops.convert_to_tensor(keys, dtype=key_dtype, name="keys")
+      self._values = ops.convert_to_tensor(
+          values, dtype=value_dtype, name="values")
+      self._name = scope
+
+    super(KeyValueTensorInitializer, self).__init__(self._keys.dtype,
+                                                    self._values.dtype)
+
+  def initialize(self, table):
+    """Initializes the given `table` with `keys` and `values` tensors.
+
+    Args:
+      table: The table to initialize.
+
+    Returns:
+      The operation that initializes the table.
+
+    Raises:
+      TypeError: when the keys and values data types do not match the table
+      key and value data types.
+    """
+    _check_table_dtypes(table, self._keys.dtype, self._values.dtype)
+    with ops.name_scope(
+        self._name, values=(table.table_ref, self._keys,
+                            self._values)) as scope:
+      # pylint: disable=protected-access
+      init_op = gen_lookup_ops._initialize_table_v2(
+          table.table_ref, self._keys, self._values, name=scope)
+      # pylint: enable=protected-access
+    ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
+    return init_op
+
+
+class TextFileIndex(object):
+  WHOLE_LINE = -2
+  LINE_NUMBER = -1
+
+
+class TextFileInitializer(TableInitializerBase):
+  """Table initializers from a text file.
+
+  This initializer assigns one entry in the table for each line in the file.
+
+  The key and value type of the table to initialize is given by `key_dtype` and
+  `value_dtype`.
+
+  The key and value content to get from each line is specified by
+  the `key_index` and `value_index`.
+
+  * `TextFileIndex.LINE_NUMBER` means use the line number starting from zero,
+    expects data type int64.
+  * `TextFileIndex.WHOLE_LINE` means use the whole line content, expects data
+    type string.
+  * A value `>=0` means use the index (starting at zero) of the split line based
+      on `delimiter`.
+
+  For example if we have a file with the following content:
+
+  ```
+  emerson 10
+  lake 20
+  palmer 30
+  ```
+
+  The following snippet initializes a table with the first column as keys and
+  second column as values:
+
+  * `emerson -> 10`
+  * `lake -> 20`
+  * `palmer -> 30`
+
+  ```python
+  table = tf.contrib.lookup.HashTable(tf.contrib.lookup.TextFileInitializer(
+      "test.txt", tf.string, 0, tf.int64, 1, delimiter=" "), -1)
+  ...
+  table.init.run()
+  ```
+
+  Similarly to initialize the whole line as keys and the line number as values.
+
+  * `emerson 10 -> 0`
+  * `lake 20 -> 1`
+  * `palmer 30 -> 2`
+
+  ```python
+  table = tf.contrib.lookup.HashTable(tf.contrib.lookup.TextFileInitializer(
+      "test.txt", tf.string, tf.contrib.lookup.TextFileIndex.WHOLE_LINE,
+      tf.int64, tf.contrib.lookup.TextFileIndex.LINE_NUMBER, delimiter=" "), -1)
+  ...
+  table.init.run()
+  ```
+  """
+
+  def __init__(self,
+               filename,
+               key_dtype,
+               key_index,
+               value_dtype,
+               value_index,
+               vocab_size=None,
+               delimiter="\t",
+               name=None):
+    """Constructs a table initializer object to populate from a text file.
+
+    It generates one key-value pair per line. The type of table key and
+    value are specified by `key_dtype` and `value_dtype`, respectively.
+    Similarly the content of the key and value are specified by the key_index
+    and value_index.
+
+    - TextFileIndex.LINE_NUMBER means use the line number starting from zero,
+      expects data type int64.
+    - TextFileIndex.WHOLE_LINE means use the whole line content, expects data
+      type string.
+    - A value >=0 means use the index (starting at zero) of the split line based
+      on `delimiter`.
+
+    Args:
+      filename: The filename of the text file to be used for initialization.
+        The path must be accessible from wherever the graph is initialized
+        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      key_dtype: The `key` data type.
+      key_index: the index that represents information of a line to get the
+        table 'key' values from.
+      value_dtype: The `value` data type.
+      value_index: the index that represents information of a line to get the
+        table 'value' values from.'
+      vocab_size: The number of elements in the file, if known.
+      delimiter: The delimiter to separate fields in a line.
+      name: A name for the operation (optional).
+
+    Raises:
+      ValueError: when the filename is empty, or when the table key and value
+      data types do not match the expected data types.
+    """
+    if not isinstance(filename, ops.Tensor) and not filename:
+      raise ValueError("Filename required for %s." % name)
+
+    key_dtype = dtypes.as_dtype(key_dtype)
+    value_dtype = dtypes.as_dtype(value_dtype)
+
+    if key_index < -2:
+      raise ValueError("Invalid key index %s." % (key_index))
+
+    if key_index == TextFileIndex.LINE_NUMBER and key_dtype != dtypes.int64:
+      raise ValueError("Signature mismatch. Keys must be dtype %s, got %s." %
+                       (dtypes.int64, key_dtype))
+    if ((key_index == TextFileIndex.WHOLE_LINE) and
+        (not key_dtype.is_integer) and (key_dtype != dtypes.string)):
+      raise ValueError(
+          "Signature mismatch. Keys must be integer or string, got %s." %
+          key_dtype)
+    if value_index < -2:
+      raise ValueError("Invalid value index %s." % (value_index))
+
+    if value_index == TextFileIndex.LINE_NUMBER and value_dtype != dtypes.int64:
+      raise ValueError("Signature mismatch. Values must be dtype %s, got %s." %
+                       (dtypes.int64, value_dtype))
+    if value_index == TextFileIndex.WHOLE_LINE and value_dtype != dtypes.string:
+      raise ValueError("Signature mismatch. Values must be dtype %s, got %s." %
+                       (dtypes.string, value_dtype))
+
+    if (vocab_size is not None) and (vocab_size <= 0):
+      raise ValueError("Invalid vocab_size %s." % vocab_size)
+
+    self._filename = filename
+    self._key_index = key_index
+    self._value_index = value_index
+    self._vocab_size = vocab_size
+    self._delimiter = delimiter
+    self._name = name
+
+    super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
+
+  def initialize(self, table):
+    """Initializes the table from a text file.
+
+    Args:
+      table: The table to be initialized.
+
+    Returns:
+      The operation that initializes the table.
+
+    Raises:
+      TypeError: when the keys and values data types do not match the table
+      key and value data types.
+    """
+    _check_table_dtypes(table, self.key_dtype, self.value_dtype)
+    with ops.name_scope(self._name, "text_file_init",
+                        (table.table_ref,)) as scope:
+      filename = ops.convert_to_tensor(
+          self._filename, dtypes.string, name="asset_filepath")
+      # pylint: disable=protected-access
+      init_op = gen_lookup_ops._initialize_table_from_text_file_v2(
+          table.table_ref,
+          filename,
+          self._key_index,
+          self._value_index,
+          -1 if self._vocab_size is None else self._vocab_size,
+          self._delimiter,
+          name=scope)
+      # pylint: enable=protected-access
+    ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
+    ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
+    return init_op
+
+
+class TextFileStringTableInitializer(TextFileInitializer):
+  """Table initializer for `int64` IDs to string tables from a text file."""
+
+  def __init__(self,
+               filename,
+               key_column_index=TextFileIndex.LINE_NUMBER,
+               value_column_index=TextFileIndex.WHOLE_LINE,
+               vocab_size=None,
+               delimiter="\t",
+               name="text_file_string_table_init"):
+    """Constructs an initializer for an id-to-string table from a text file.
+
+    It populates a table that its key and value types are int64 and string,
+    respectively. It generates one key-value pair per line.
+    The content of the key and value are specified by `key_column_index`
+    and `value_column_index`.
+
+    - TextFileIndex.LINE_NUMBER means use the line number starting from zero,
+      expects data type int64.
+    - TextFileIndex.WHOLE_LINE means use the whole line content, expects data
+      type string.
+    - A value >=0 means use the index (starting at zero) of the split line based
+      on `delimiter`.
+
+    Args:
+      filename: The filename of the text file to be used for initialization.
+        The path must be accessible from wherever the graph is initialized
+        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      key_column_index: The column index from the text file to get the keys
+        from. The default is 0 that represents the whole line content.
+      value_column_index: The column index from the text file to get the
+        values from. The default is to use the line number, starting from zero.
+      vocab_size: The number of elements in the file, if known.
+      delimiter: The delimiter to separate fields in a line.
+      name: Optional name for the op.
+
+    Raises:
+      TypeError: when the filename is empty, or when the table key and value
+      data types do not match the expected data types.
+    """
+    super(TextFileStringTableInitializer, self).__init__(
+        filename,
+        dtypes.int64,
+        key_column_index,
+        dtypes.string,
+        value_column_index,
+        vocab_size=vocab_size,
+        delimiter=delimiter,
+        name=name)
+
+
+class TextFileIdTableInitializer(TextFileInitializer):
+  """Table initializer for string to `int64` IDs tables from a text file."""
+
+  def __init__(self,
+               filename,
+               key_column_index=TextFileIndex.WHOLE_LINE,
+               value_column_index=TextFileIndex.LINE_NUMBER,
+               vocab_size=None,
+               delimiter="\t",
+               name="text_file_id_table_init",
+               key_dtype=dtypes.string):
+    """Constructs an initializer for an string-to-id table from a text file.
+
+    It populates a table that its key and value types are string and int64,
+    respectively. It generates one key-value pair per line.
+    The content of the key and value are specified by the key_index
+    and value_index.
+
+    - TextFileIndex.LINE_NUMBER means use the line number starting from zero,
+      expects data type int64.
+    - TextFileIndex.WHOLE_LINE means use the whole line content, expects data
+      type string.
+    - A value >=0 means use the index (starting at zero) of the split line based
+      on `delimiter`.
+
+    Args:
+      filename: The filename of the text file to be used for initialization.
+        The path must be accessible from wherever the graph is initialized
+        (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
+      key_column_index: The column index from the text file to get the `key`
+        values from. The default is to use the line number, starting from zero.
+      value_column_index: The column index from the text file ro get the `value`
+        values from. The default is 0 that represents the whole line content.
+      vocab_size: The number of elements in the file, if known.
+      delimiter: The delimiter to separate fields in a line.
+      name: Optional name for the op.
+      key_dtype: The `key` data type.
+
+    Raises:
+      TypeError: when the filename is empty, or when the table key and value
+      data types do not match the expected data types.
+    """
+    super(TextFileIdTableInitializer, self).__init__(
+        filename,
+        key_dtype,
+        key_column_index,
+        dtypes.int64,
+        value_column_index,
+        vocab_size=vocab_size,
+        delimiter=delimiter,
+        name=name)
+
+
+class HasherSpec(collections.namedtuple("HasherSpec", ["hasher", "key"])):
+  """A structure for the spec of the hashing function to use for hash buckets.
+
+  `hasher` is the name of the hashing function to use (eg. "fasthash",
+  "stronghash").
+  `key` is optional and specify the key to use for the hash function if
+  supported, currently only used by a strong hash.
+
+  Fields:
+    hasher: The hasher name to use.
+    key: The key to be used by the hashing function, if required.
+  """
+  __slots__ = ()
+
+
+FastHashSpec = HasherSpec("fasthash", None)  # pylint: disable=invalid-name
+
+
+class StrongHashSpec(HasherSpec):
+  """A structure to specify a key of the strong keyed hash spec.
+
+  The strong hash requires a `key`, which is a list of 2 unsigned integer
+  numbers. These should be non-zero; random numbers generated from random.org
+  would be a fine choice.
+
+  Fields:
+    key: The key to be used by the keyed hashing function.
+  """
+  __slots__ = ()
+
+  def __new__(cls, key):
+    if len(key) != 2:
+      raise ValueError("key must have size 2, got %s." % len(key))
+
+    if not isinstance(key[0], compat.integral_types) or not isinstance(
+        key[1], compat.integral_types):
+      raise TypeError("Invalid key %s. Must be unsigned integer values." % key)
+
+    return super(cls, StrongHashSpec).__new__(cls, "stronghash", key)
+
+
+def _as_string(tensor):
+  if dtypes.string == tensor.dtype.base_dtype:
+    return tensor
+  return string_ops.as_string(tensor)
+
+
+class IdTableWithHashBuckets(LookupInterface):
+  """String to Id table wrapper that assigns out-of-vocabulary keys to buckets.
+
+  For example, if an instance of `IdTableWithHashBuckets` is initialized with a
+  string-to-id table that maps:
+  - emerson -> 0
+  - lake -> 1
+  - palmer -> 2
+
+  The `IdTableWithHashBuckets` object will performs the following mapping:
+  - emerson -> 0
+  - lake -> 1
+  - palmer -> 2
+  - <other term> -> bucket id between 3 and 3 + num_oov_buckets, calculated by:
+    hash(<term>) % num_oov_buckets + vocab_size
+
+  If input_tensor is ["emerson", "lake", "palmer", "king", "crimson"],
+  the lookup result is [0, 1, 2, 4, 7]
+
+  If `table` is None, only out-of-vocabulary buckets are used.
+
+  Example usage:
+
+  ```python
+  num_oov_buckets = 3
+  input_tensor = tf.constant(["emerson", "lake", "palmer", "king", "crimnson"])
+  table = tf.IdTableWithHashBuckets(
+      tf.HashTable(tf.TextFileIdTableInitializer(filename), default_value),
+      num_oov_buckets)
+  out = table.lookup(input_tensor).
+  table.init.run()
+  print out.eval()
+  ```
+
+  The hash function used for generating out-of-vocabulary buckets ID is handled
+  by `hasher_spec`.
+  """
+
+  def __init__(self,
+               table,
+               num_oov_buckets,
+               hasher_spec=FastHashSpec,
+               name=None,
+               key_dtype=None):
+    """Construct a `IdTableWithHashBuckets` object.
+
+    Args:
+      table: Table that maps `tf.string` or `tf.int64` keys to `tf.int64` ids.
+      num_oov_buckets: Number of buckets to use for out-of-vocabulary keys.
+      hasher_spec: A `HasherSpec` to specify the hash function to use for
+        assignation of out-of-vocabulary buckets  (optional).
+      name: A name for the operation (optional).
+      key_dtype: Data type of keys passed to `lookup`. Defaults to
+        `table.key_dtype` if `table` is specified, otherwise `tf.string`.
+        Must be string or integer, and must be castable to `table.key_dtype`.
+
+    Raises:
+      ValueError: when `table` in None and `num_oov_buckets` is not positive.
+      TypeError: when `hasher_spec` is invalid.
+    """
+    # If a name ends with a '/' it is a "name scope", remove all trailing '/'
+    # characters to use as table name.
+    if name:
+      name = name.rstrip("/")
+    if table:
+      if key_dtype is None:
+        key_dtype = table.key_dtype
+      supported_table_key_dtypes = (dtypes.int64, dtypes.string)
+      if table.key_dtype not in supported_table_key_dtypes:
+        raise TypeError("Invalid key dtype, expected one of %s, but got %s." %
+                        (supported_table_key_dtypes, key_dtype))
+      if table.key_dtype.is_integer != key_dtype.is_integer:
+        raise TypeError("Invalid key dtype, expected %s but got %s." %
+                        ("integer" if key_dtype.is_integer else "non-integer",
+                         table.key_dtype))
+      if table.value_dtype != dtypes.int64:
+        raise TypeError("Invalid value dtype, expected %s but got %s." %
+                        (dtypes.int64, table.value_dtype))
+      self._table = table
+      name = name or self._table.name
+    else:
+      if num_oov_buckets <= 0:
+        raise ValueError("oov_buckets must be > 0 if no table is supplied.")
+      key_dtype = dtypes.string if key_dtype is None else key_dtype
+      self._table = None
+      name = name or "hash_bucket"
+    if (not key_dtype.is_integer) and (dtypes.string != key_dtype):
+      raise TypeError(
+          "Invalid key_dtype, expected integer or string, got %s." % key_dtype)
+    self._num_oov_buckets = num_oov_buckets
+
+    if not isinstance(hasher_spec, HasherSpec):
+      raise TypeError(
+          "hasher_spec must be of type HasherSpec, got %s" % hasher_spec)
+    self._hasher_spec = hasher_spec
+    super(IdTableWithHashBuckets, self).__init__(key_dtype, dtypes.int64,
+                                                 name.split("/")[-1])
+
+  @property
+  def init(self):
+    """The table initialization op."""
+    if self._table:
+      return self._table.init
+    with ops.name_scope(None, "init"):
+      return control_flow_ops.no_op()
+
+  def size(self, name=None):
+    """Compute the number of elements in this table."""
+    with ops.name_scope(name, "%s_Size" % self.name) as scope:
+      if self._table:
+        tsize = self._table.size(scope)
+      else:
+        tsize = ops.convert_to_tensor(0, dtype=dtypes.int64)
+      return tsize + self._num_oov_buckets
+
+  def _get_string_to_hash_bucket_fn(self, hasher_spec):
+    """Returns the string_to_hash_bucket op to use based on `hasher_spec`."""
+    if not isinstance(hasher_spec, HasherSpec):
+      raise TypeError("hasher_spec must be of type HasherSpec %s" % hasher_spec)
+    if hasher_spec.hasher == "fasthash":
+      return string_ops.string_to_hash_bucket_fast
+    if hasher_spec.hasher == "legacy":
+      return string_ops.string_to_hash_bucket
+    if hasher_spec.hasher == "stronghash":
+      return functools.partial(
+          string_ops.string_to_hash_bucket_strong, key=hasher_spec.key)
+    raise ValueError("Unknown hasher %s" % hasher_spec.hasher)
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in the table, outputs the corresponding values.
+
+    It assigns out-of-vocabulary keys to buckets based in their hashes.
+
+    Args:
+      keys: Keys to look up. May be either a `SparseTensor` or dense `Tensor`.
+      name: Optional name for the op.
+
+    Returns:
+      A `SparseTensor` if keys are sparse, otherwise a dense `Tensor`.
+
+    Raises:
+      TypeError: when `keys` doesn't match the table key data type.
+    """
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+    values = keys
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      values = keys.values
+    if self._table and (self._table.key_dtype.base_dtype == dtypes.int64):
+      values = math_ops.to_int64(values)
+
+    if self._num_oov_buckets == 0:
+      ids = self._table.lookup(values, name=name)
+    else:
+      # TODO(yleon): Consider moving this functionality to its own kernel.
+      with ops.name_scope(name, "%s_Lookup" % self.name) as scope:
+        str_to_hash_bucket = self._get_string_to_hash_bucket_fn(
+            self._hasher_spec)
+        buckets = str_to_hash_bucket(
+            _as_string(values),
+            num_buckets=self._num_oov_buckets,
+            name="hash_bucket")
+        if self._table:
+          ids = self._table.lookup(values)
+          buckets = math_ops.add(buckets, self._table.size())
+          is_id_non_default = math_ops.not_equal(ids, self._table.default_value)
+          ids = array_ops.where(is_id_non_default, ids, buckets, name=scope)
+        else:
+          ids = buckets
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(keys.indices, ids, keys.dense_shape)
+    return ids
+
+
+def index_table_from_file(vocabulary_file=None,
+                          num_oov_buckets=0,
+                          vocab_size=None,
+                          default_value=-1,
+                          hasher_spec=FastHashSpec,
+                          key_dtype=dtypes.string,
+                          name=None):
+  """Returns a lookup table that converts a string tensor into int64 IDs.
+
+  This operation constructs a lookup table to convert tensor of strings into
+  int64 IDs. The mapping can be initialized from a vocabulary file specified in
+  `vocabulary_file`, where the whole line is the key and the zero-based line
+  number is the ID.
+
+  Any lookup of an out-of-vocabulary token will return a bucket ID based on its
+  hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the
+  `default_value`.
+  The bucket ID range is `[vocabulary size, vocabulary size + num_oov_buckets]`.
+
+  The underlying table must be initialized by calling
+  `tf.tables_initializer.run()` or `table.init.run()` once.
+
+  Sample Usages:
+
+  If we have a vocabulary file "test.txt" with the following content:
+
+  ```
+  emerson
+  lake
+  palmer
+  ```
+
+  ```python
+  features = tf.constant(["emerson", "lake", "and", "palmer"])
+  table = tf.contrib.lookup.index_table_from_file(
+      vocabulary_file="test.txt", num_oov_buckets=1)
+  ids = table.lookup(features)
+  ...
+  tf.tables_initializer().run()
+
+  ids.eval()  ==> [0, 1, 3, 2]  # where 3 is the out-of-vocabulary bucket
+  ```
+
+  Args:
+    vocabulary_file: The vocabulary filename.
+    num_oov_buckets: The number of out-of-vocabulary buckets.
+    vocab_size: Number of the elements in the vocabulary, if known.
+    default_value: The value to use for out-of-vocabulary feature values.
+      Defaults to -1.
+    hasher_spec: A `HasherSpec` to specify the hash function to use for
+      assignation of out-of-vocabulary buckets.
+    key_dtype: The `key` data type.
+    name: A name for this op (optional).
+
+  Returns:
+    The lookup table to map a `key_dtype` `Tensor` to index `int64` `Tensor`.
+
+  Raises:
+    ValueError: If `vocabulary_file` is not set.
+    ValueError: If `num_oov_buckets` is negative or `vocab_size` is not greater
+      than zero.
+  """
+  if not vocabulary_file:
+    raise ValueError("vocabulary_file must be specified.")
+  if num_oov_buckets < 0:
+    raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
+                     % num_oov_buckets)
+  if vocab_size is not None and vocab_size < 1:
+    raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
+  if (not key_dtype.is_integer) and (dtypes.string != key_dtype.base_dtype):
+    raise TypeError("Only integer and string keys are supported.")
+
+  with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
+    table = None
+    shared_name = ""
+    with ops.name_scope(None, "hash_table") as hash_table_scope:
+      if vocab_size:
+        # Keep the shared_name:
+        # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
+        shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
+                                                  TextFileIndex.WHOLE_LINE,
+                                                  TextFileIndex.LINE_NUMBER)
+      else:
+        # Keep the shared_name
+        # <table_type>_<filename>_<key_index>_<value_index>
+        shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
+                                               TextFileIndex.WHOLE_LINE,
+                                               TextFileIndex.LINE_NUMBER)
+      init = TextFileIdTableInitializer(
+          vocabulary_file,
+          vocab_size=vocab_size,
+          key_dtype=dtypes.int64 if key_dtype.is_integer else key_dtype,
+          name="table_init")
+
+      table = HashTable(
+          init, default_value, shared_name=shared_name, name=hash_table_scope)
+    if num_oov_buckets:
+      table = IdTableWithHashBuckets(
+          table,
+          num_oov_buckets=num_oov_buckets,
+          hasher_spec=hasher_spec,
+          name=feat_to_id_scope,
+          key_dtype=key_dtype)
+
+    return table
+
+
+def index_table_from_tensor(vocabulary_list,
+                            num_oov_buckets=0,
+                            default_value=-1,
+                            hasher_spec=FastHashSpec,
+                            dtype=dtypes.string,
+                            name=None):
+  """Returns a lookup table that converts a string tensor into int64 IDs.
+
+  This operation constructs a lookup table to convert tensor of strings into
+  int64 IDs. The mapping can be initialized from a string `vocabulary_list` 1-D
+  tensor where each element is a key and corresponding index within the tensor
+  is the value.
+
+  Any lookup of an out-of-vocabulary token will return a bucket ID based on its
+  hash if `num_oov_buckets` is greater than zero. Otherwise it is assigned the
+  `default_value`.
+  The bucket ID range is `[mapping size, mapping size + num_oov_buckets]`.
+
+  The underlying table must be initialized by calling
+  `tf.tables_initializer.run()` or `table.init.run()` once.
+
+  Elements in `mapping` cannot have duplicates, otherwise when executing the
+  table initializer op, it will throw a `FailedPreconditionError`.
+
+  Sample Usages:
+
+  ```python
+  vocabulary_list = t.constant(["emerson", "lake", "palmer")
+  table = tf.contrib.lookup.index_table_from_tensor(
+      vocabulary_list=vocabulary_list, num_oov_buckets=1, default_value=-1)
+  features = tf.constant(["emerson", "lake", "and", "palmer"])
+  ids = table.lookup(features)
+  ...
+  tf.tables_initializer().run()
+
+  ids.eval()  ==> [0, 1, 4, 2]
+  ```
+
+  Args:
+    vocabulary_list: A 1-D `Tensor` that specifies the mapping of keys to
+      indices. Thetype of this object must be castable to `dtype`.
+    num_oov_buckets: The number of out-of-vocabulary buckets.
+    default_value: The value to use for out-of-vocabulary feature values.
+      Defaults to -1.
+    hasher_spec: A `HasherSpec` to specify the hash function to use for
+      assignment of out-of-vocabulary buckets.
+    dtype: The type of values passed to `lookup`. Only string and integers are
+      supported.
+    name: A name for this op (optional).
+
+  Returns:
+    The lookup table to map an input `Tensor` to index `int64` `Tensor`.
+
+  Raises:
+    ValueError: If `mapping` is invalid.
+    ValueError: If `num_oov_buckets` is negative.
+  """
+  if vocabulary_list is None:
+    raise ValueError("vocabulary_list must be specified.")
+
+  if num_oov_buckets < 0:
+    raise ValueError("num_oov_buckets must be greater or equal than 0, got %d."
+                     % num_oov_buckets)
+
+  if (not dtype.is_integer) and (dtypes.string != dtype.base_dtype):
+    raise TypeError("Only integer and string keys are supported.")
+
+  with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
+    keys = ops.convert_to_tensor(vocabulary_list)
+    if keys.dtype.is_integer != dtype.is_integer:
+      raise ValueError("Expected %s, got %s." %
+                       ("integer"
+                        if dtype.is_integer else "non-integer", keys.dtype))
+    if (not dtype.is_integer) and (keys.dtype.base_dtype != dtype):
+      raise ValueError("Expected %s, got %s." % (dtype, keys.dtype))
+    num_elements = array_ops.size(keys)
+    values = math_ops.to_int64(math_ops.range(num_elements))
+
+    shared_name = ""
+    with ops.name_scope(None, "hash_table") as hash_table_scope:
+      table_keys = math_ops.to_int64(keys) if keys.dtype.is_integer else keys
+      init = KeyValueTensorInitializer(
+          table_keys,
+          values,
+          table_keys.dtype.base_dtype,
+          dtypes.int64,
+          name="table_init")
+      table = HashTable(
+          init, default_value, shared_name=shared_name, name=hash_table_scope)
+    if num_oov_buckets:
+      table = IdTableWithHashBuckets(
+          table,
+          num_oov_buckets=num_oov_buckets,
+          hasher_spec=hasher_spec,
+          name=feat_to_id_scope,
+          key_dtype=dtype)
+
+    return table
+
+
+def index_to_string_table_from_file(vocabulary_file,
+                                    vocab_size=None,
+                                    default_value="UNK",
+                                    name=None):
+  """Returns a lookup table that maps a `Tensor` of indices into strings.
+
+  This operation constructs a lookup table to map int64 indices into string
+  values. The table is initialized from a vocabulary file specified in
+  `vocabulary_file`, where the whole line is the value and the
+  zero-based line number is the index.
+
+  Any input which does not have a corresponding index in the vocabulary file
+  (an out-of-vocabulary entry) is assigned the `default_value`
+
+  The underlying table must be initialized by calling
+  `tf.tables_initializer.run()` or `table.init.run()` once.
+
+  Sample Usages:
+
+  If we have a vocabulary file "test.txt" with the following content:
+
+  ```
+  emerson
+  lake
+  palmer
+  ```
+
+  ```python
+  indices = tf.constant([1, 5], tf.int64)
+  table = tf.contrib.lookup.index_to_string_table_from_file(
+      vocabulary_file="test.txt", default_value="UNKNOWN")
+  values = table.lookup(indices)
+  ...
+  tf.tables_initializer().run()
+
+  values.eval() ==> ["lake", "UNKNOWN"]
+  ```
+
+  Args:
+    vocabulary_file: The vocabulary filename.
+    vocab_size: Number of the elements in the vocabulary, if known.
+    default_value: The value to use for out-of-vocabulary indices.
+    name: A name for this op (optional).
+
+  Returns:
+    The lookup table to map a string values associated to a given index `int64`
+    `Tensors`.
+
+  Raises:
+    ValueError: when `vocabulary_file` is empty.
+    ValueError: when `vocab_size` is invalid.
+  """
+  if not vocabulary_file:
+    raise ValueError("vocabulary_file must be specified.")
+  if vocab_size is not None and vocab_size < 1:
+    raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
+
+  with ops.name_scope(name, "index_to_string") as scope:
+    shared_name = ""
+    if vocab_size:
+      # Keep a shared_name
+      # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
+                                                TextFileIndex.LINE_NUMBER,
+                                                TextFileIndex.WHOLE_LINE)
+    else:
+      # Keep a shared_name <table_type>_<filename>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
+                                             TextFileIndex.LINE_NUMBER,
+                                             TextFileIndex.WHOLE_LINE)
+    init = TextFileStringTableInitializer(
+        vocabulary_file, vocab_size=vocab_size, name="table_init")
+
+    # TODO(yleon): Use a more effienct structure.
+    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+
+
+def index_to_string_table_from_tensor(vocabulary_list,
+                                      default_value="UNK",
+                                      name=None):
+  """Returns a lookup table that maps a `Tensor` of indices into strings.
+
+  This operation constructs a lookup table to map int64 indices into string
+  values. The mapping is initialized from a string `mapping` 1-D `Tensor` where
+  each element is a value and the corresponding index within the tensor is the
+  key.
+
+  Any input which does not have a corresponding index in 'mapping'
+  (an out-of-vocabulary entry) is assigned the `default_value`
+
+  The underlying table must be initialized by calling
+  `tf.tables_initializer.run()` or `table.init.run()` once.
+
+  Elements in `mapping` cannot have duplicates, otherwise when executing the
+  table initializer op, it will throw a `FailedPreconditionError`.
+
+  Sample Usages:
+
+  ```python
+  vocabulary_list = t.constant(["emerson", "lake", "palmer")
+  indices = tf.constant([1, 5], tf.int64)
+  table = tf.contrib.lookup.index_to_string_table_from_tensor(
+      vocabulary_list, default_value="UNKNOWN")
+  values = table.lookup(indices)
+  ...
+  tf.tables_initializer().run()
+
+  values.eval() ==> ["lake", "UNKNOWN"]
+  ```
+
+  Args:
+    vocabulary_list: A 1-D string `Tensor` that specifies the strings to map
+      from indices.
+    default_value: The value to use for out-of-vocabulary indices.
+    name: A name for this op (optional).
+
+  Returns:
+    The lookup table to map a string values associated to a given index `int64`
+    `Tensors`.
+
+  Raises:
+    ValueError: when `vocabulary_list` is not set.
+  """
+
+  if vocabulary_list is None:
+    raise ValueError("vocabulary_list must be specified.")
+
+  with ops.name_scope(name, "index_to_string") as scope:
+    vocabulary_list = ops.convert_to_tensor(vocabulary_list, dtypes.string)
+    num_elements = array_ops.size(vocabulary_list)
+    keys = math_ops.to_int64(math_ops.range(num_elements))
+
+    shared_name = ""
+    init = KeyValueTensorInitializer(
+        keys, vocabulary_list, dtypes.int64, dtypes.string, name="table_init")
+    # TODO(yleon): Use a more effienct structure.
+    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+
+
+ops.NotDifferentiable("LookupTableFind")
+ops.NotDifferentiable("LookupTableFindV2")
+ops.NotDifferentiable("LookupTableInsert")
+ops.NotDifferentiable("LookupTableInsertV2")
+ops.NotDifferentiable("LookupTableSize")
+ops.NotDifferentiable("LookupTableSizeV2")
+ops.NotDifferentiable("HashTable")
+ops.NotDifferentiable("HashTableV2")
+ops.NotDifferentiable("InitializeTable")
+ops.NotDifferentiable("InitializeTableV2")
+ops.NotDifferentiable("InitializeTableFromTextFile")
+ops.NotDifferentiable("InitializeTableFromTextFileV2")
+ops.NotDifferentiable("MutableDenseHashTable")
+ops.NotDifferentiable("MutableDenseHashTableV2")
+ops.NotDifferentiable("MutableHashTable")
+ops.NotDifferentiable("MutableHashTableV2")
+ops.NotDifferentiable("MutableHashTableOfTensors")
+ops.NotDifferentiable("MutableHashTableOfTensorsV2")
diff --git a/tensorflow/python/ops/losses/losses.py b/tensorflow/python/ops/losses/losses.py
index 28f004228d8e96379c9b3510e13f935bc719d4ce..8532c19ad6b3348823cdc8b24f9fa301cea6d3b5 100644
--- a/tensorflow/python/ops/losses/losses.py
+++ b/tensorflow/python/ops/losses/losses.py
@@ -16,10 +16,12 @@
 
 Note: All the losses are added to the `GraphKeys.LOSSES` collection by default.
 
+@@Reduction
 @@absolute_difference
 @@compute_weighted_loss
 @@cosine_distance
 @@hinge_loss
+@@huber_loss
 @@log_loss
 @@mean_pairwise_squared_error
 @@mean_squared_error
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 89daa9594a2a810032d0cd7c201c4b4e2eaa1fd2..97078d71d93e9d26eb7e013c86aebd323d982960 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -27,29 +27,36 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
+from tensorflow.python.platform import tf_logging as logging
 
 
-def _scale_losses(losses, weights):
-  """Computes the scaled loss.
+class Reduction(object):
+  """Types of loss reduction."""
 
-  Args:
-    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
-    weights: `Tensor` of shape `[]`, `[batch_size]` or
-      `[batch_size, d1, ... dN]`. The `losses` are reduced (`tf.reduce_sum`)
-      until its dimension matches that of `weights` at which point the reduced
-      `losses` are element-wise multiplied by `weights` and a final `reduce_sum`
-      is computed on the result. Conceptually, this operation is similar to
-      broadcasting (tiling) `weights` to be the same shape as `losses`,
-      performing an element-wise multiplication, and summing the result. Note,
-      however, that the dimension matching is right-to-left, not left-to-right;
-      i.e., the opposite of standard NumPy/Tensorflow broadcasting.
+  # Un-reduced weighted losses with the same shape as input.
+  NONE = "none"
 
-  Returns:
-    A scalar tf.float32 `Tensor` whose value represents the sum of the scaled
-      `losses`.
-  """
-  weighted_losses = math_ops.multiply(losses, weights)
-  return math_ops.reduce_sum(weighted_losses)
+  # Scalar sum of `NONE`.
+  SUM = "weighted_sum"
+
+  # Scalar `SUM` divided by sum of weights.
+  MEAN = "weighted_mean"
+
+  # Scalar `SUM` divided by number of non-zero weights.
+  SUM_BY_NONZERO_WEIGHTS = "weighted_sum_by_nonzero_weights"
+
+  @classmethod
+  def all(cls):
+    return (
+        cls.NONE,
+        cls.SUM,
+        cls.MEAN,
+        cls.SUM_BY_NONZERO_WEIGHTS)
+
+  @classmethod
+  def validate(cls, key):
+    if key not in cls.all():
+      raise ValueError("Invalid ReductionKey %s." % key)
 
 
 def _safe_div(numerator, denominator, name="value"):
@@ -129,7 +136,8 @@ def _num_present(losses, weights, per_batch=False):
 
 
 def compute_weighted_loss(
-    losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES):
+    losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
+    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Computes the weighted loss.
 
   Args:
@@ -139,15 +147,18 @@ def compute_weighted_loss(
       be either `1`, or the same as the corresponding `losses` dimension).
     scope: the scope for the operations performed in computing the loss.
     loss_collection: the loss will be added to these collections.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` that returns the weighted loss.
+    Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
+    `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
 
   Raises:
     ValueError: If `weights` is `None` or the shape is not compatible with
       `losses`, or if the number of dimensions (rank) of either `losses` or
       `weights` is missing.
   """
+  Reduction.validate(reduction)
   with ops.name_scope(scope, "weighted_loss", (losses, weights)):
     with ops.control_dependencies((
         weights_broadcast_ops.assert_broadcastable(weights, losses),)):
@@ -155,18 +166,28 @@ def compute_weighted_loss(
       input_dtype = losses.dtype
       losses = math_ops.to_float(losses)
       weights = math_ops.to_float(weights)
-      total_loss = _scale_losses(losses, weights)
-      num_present = _num_present(losses, weights)
-      mean_loss = _safe_mean(total_loss, num_present)
+      weighted_losses = math_ops.multiply(losses, weights)
+      if reduction == Reduction.NONE:
+        loss = weighted_losses
+      else:
+        loss = math_ops.reduce_sum(weighted_losses)
+        if reduction == Reduction.MEAN:
+          loss = _safe_mean(
+              loss,
+              math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
+        elif reduction == Reduction.SUM_BY_NONZERO_WEIGHTS:
+          loss = _safe_mean(loss, _num_present(losses, weights))
+
       # Convert the result back to the input type.
-      mean_loss = math_ops.cast(mean_loss, input_dtype)
-      util.add_loss(mean_loss, loss_collection)
-      return mean_loss
+      loss = math_ops.cast(loss, input_dtype)
+      util.add_loss(loss, loss_collection)
+      return loss
 
 
 def absolute_difference(
     labels, predictions, weights=1.0, scope=None,
-    loss_collection=ops.GraphKeys.LOSSES):
+    loss_collection=ops.GraphKeys.LOSSES,
+    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Adds an Absolute Difference loss to the training procedure.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
@@ -185,9 +206,11 @@ def absolute_difference(
       be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which this loss will be added.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` representing the loss value.
+    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+    shape as `labels`; otherwise, it is scalar.
 
   Raises:
     ValueError: If the shape of `predictions` doesn't match that of `labels` or
@@ -199,12 +222,14 @@ def absolute_difference(
     labels = math_ops.to_float(labels)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
     losses = math_ops.abs(math_ops.subtract(predictions, labels))
-    return compute_weighted_loss(losses, weights, scope, loss_collection)
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
 
 
 def cosine_distance(
     labels, predictions, dim=None, weights=1.0, scope=None,
-    loss_collection=ops.GraphKeys.LOSSES):
+    loss_collection=ops.GraphKeys.LOSSES,
+    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Adds a cosine-distance loss to the training procedure.
 
   Note that the function assumes that `predictions` and `labels` are already
@@ -219,9 +244,11 @@ def cosine_distance(
       be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which this loss will be added.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` representing the loss value.
+    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+    shape as `labels`; otherwise, it is scalar.
 
   Raises:
     ValueError: If `predictions` shape doesn't match `labels` shape, or
@@ -237,11 +264,13 @@ def cosine_distance(
 
     radial_diffs = math_ops.multiply(predictions, labels)
     losses = 1 - math_ops.reduce_sum(radial_diffs, axis=(dim,), keep_dims=True)
-    return compute_weighted_loss(losses, weights, scope, loss_collection)
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
 
 
 def hinge_loss(labels, logits, weights=1.0, scope=None,
-               loss_collection=ops.GraphKeys.LOSSES):
+               loss_collection=ops.GraphKeys.LOSSES,
+               reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Adds a hinge loss to the training procedure.
 
   Args:
@@ -253,9 +282,11 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
       be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` of the loss value.
+    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+    shape as `labels`; otherwise, it is scalar.
 
   Raises:
     ValueError: If the shapes of `logits` and `labels` don't match.
@@ -269,11 +300,76 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
     labels = math_ops.subtract(2 * labels, all_ones)
     losses = nn_ops.relu(
         math_ops.subtract(all_ones, math_ops.multiply(labels, logits)))
-    return compute_weighted_loss(losses, weights, scope, loss_collection)
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
+
+
+def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
+               loss_collection=ops.GraphKeys.LOSSES,
+               reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
+  """Adds a Huber Loss term to the training procedure.
+
+  For each value x in `error=labels-predictions`, the following is calculated:
+
+  ```
+    0.5 * x^2                  if |x| <= d
+    0.5 * d^2 + d * (|x| - d)  if |x| > d
+  ```
+
+  where d is `delta`.
+
+  See: https://en.wikipedia.org/wiki/Huber_loss
+
+  `weights` acts as a coefficient for the loss. If a scalar is provided, then
+  the loss is simply scaled by the given value. If `weights` is a tensor of size
+  [batch_size], then the total loss for each sample of the batch is rescaled
+  by the corresponding element in the `weights` vector. If the shape of
+  `weights` matches the shape of `predictions`, then the loss of each
+  measurable element of `predictions` is scaled by the corresponding value of
+  `weights`.
+
+  Args:
+    labels: The ground truth output tensor, same dimensions as 'predictions'.
+    predictions: The predicted outputs.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `losses` dimension).
+    delta: `float`, the point where the huber loss function
+      changes from a quadratic to linear.
+    scope: The scope for the operations performed in computing the loss.
+    loss_collection: collection to which the loss will be added.
+    reduction: Type of reduction to apply to loss.
+
+  Returns:
+    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+    shape as `labels`; otherwise, it is scalar.
+
+  Raises:
+    ValueError: If the shape of `predictions` doesn't match that of `labels` or
+      if the shape of `weights` is invalid.
+  """
+  with ops.name_scope(scope, "huber_loss",
+                      (predictions, labels, weights)) as scope:
+    predictions = math_ops.to_float(predictions)
+    labels = math_ops.to_float(labels)
+    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
+    error = math_ops.subtract(predictions, labels)
+    abs_error = math_ops.abs(error)
+    quadratic = math_ops.minimum(abs_error, delta)
+    # The following expression is the same in value as
+    # tf.maximum(abs_error - delta, 0), but importantly the gradient for the
+    # expression when abs_error == delta is 0 (for tf.maximum it would be 1).
+    # This is necessary to avoid doubling the gradient, since there is already a
+    # nonzero contribution to the gradient from the quadratic term.
+    linear = (abs_error - quadratic)
+    losses = 0.5 * quadratic**2 + delta * linear
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
 
 
 def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
-             loss_collection=ops.GraphKeys.LOSSES):
+             loss_collection=ops.GraphKeys.LOSSES,
+             reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Adds a Log Loss term to the training procedure.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
@@ -293,9 +389,11 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
     epsilon: A small increment to add to avoid taking a log of zero.
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` representing the loss value.
+    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+    shape as `labels`; otherwise, it is scalar.
 
   Raises:
     ValueError: If the shape of `predictions` doesn't match that of `labels` or
@@ -310,11 +408,14 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
         labels,
         math_ops.log(predictions + epsilon)) - math_ops.multiply(
             (1 - labels), math_ops.log(1 - predictions + epsilon))
-    return compute_weighted_loss(losses, weights, scope, loss_collection)
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
 
 
-def mean_pairwise_squared_error(labels, predictions, weights=1.0, scope=None,
-                                loss_collection=ops.GraphKeys.LOSSES):
+# TODO(b/37208492): Add reduction arg.
+def mean_pairwise_squared_error(
+    labels, predictions, weights=1.0, scope=None,
+    loss_collection=ops.GraphKeys.LOSSES):
   """Adds a pairwise-errors-squared loss to the training procedure.
 
   Unlike `mean_squared_error`, which is a measure of the differences between
@@ -349,7 +450,7 @@ def mean_pairwise_squared_error(labels, predictions, weights=1.0, scope=None,
     loss_collection: collection to which the loss will be added.
 
   Returns:
-    A scalar `Tensor` representing the loss value.
+    A scalar `Tensor` that returns the weighted loss.
 
   Raises:
     ValueError: If the shape of `predictions` doesn't match that of `labels` or
@@ -381,7 +482,8 @@ def mean_pairwise_squared_error(labels, predictions, weights=1.0, scope=None,
       term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
                               math_ops.square(num_present_per_batch))
 
-      loss = _scale_losses(term1 - term2, weights)
+      weighted_losses = math_ops.multiply(term1 - term2, weights)
+      loss = math_ops.reduce_sum(weighted_losses)
 
       mean_loss = array_ops.where(
           math_ops.reduce_sum(num_present_per_batch) > 0,
@@ -392,8 +494,10 @@ def mean_pairwise_squared_error(labels, predictions, weights=1.0, scope=None,
       return mean_loss
 
 
-def mean_squared_error(labels, predictions, weights=1.0, scope=None,
-                       loss_collection=ops.GraphKeys.LOSSES):
+def mean_squared_error(
+    labels, predictions, weights=1.0, scope=None,
+    loss_collection=ops.GraphKeys.LOSSES,
+    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Adds a Sum-of-Squares loss to the training procedure.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided, then
@@ -412,9 +516,11 @@ def mean_squared_error(labels, predictions, weights=1.0, scope=None,
       be either `1`, or the same as the corresponding `losses` dimension).
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` representing the loss value.
+    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+    shape as `labels`; otherwise, it is scalar.
 
   Raises:
     ValueError: If the shape of `predictions` doesn't match that of `labels` or
@@ -426,12 +532,14 @@ def mean_squared_error(labels, predictions, weights=1.0, scope=None,
     labels = math_ops.to_float(labels)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
     losses = math_ops.square(math_ops.subtract(predictions, labels))
-    return compute_weighted_loss(losses, weights, scope, loss_collection)
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
 
 
 def sigmoid_cross_entropy(
     multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None,
-    loss_collection=ops.GraphKeys.LOSSES):
+    loss_collection=ops.GraphKeys.LOSSES,
+    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -447,16 +555,18 @@ def sigmoid_cross_entropy(
   Args:
     multi_class_labels: `[batch_size, num_classes]` target integer labels in
       `(0, 1)`.
-    logits: `[batch_size, num_classes]` logits outputs of the network.
+    logits: Float `[batch_size, num_classes]` logits outputs of the network.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
       be either `1`, or the same as the corresponding `losses` dimension).
     label_smoothing: If greater than `0` then smooth the labels.
     scope: The scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` representing the loss value.
+    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
+    `NONE`, this has the same shape as `logits`; otherwise, it is scalar.
 
   Raises:
     ValueError: If the shape of `logits` doesn't match that of
@@ -466,7 +576,9 @@ def sigmoid_cross_entropy(
   with ops.name_scope(scope, "sigmoid_cross_entropy_loss",
                       (logits, multi_class_labels, weights)) as scope:
     logits = ops.convert_to_tensor(logits)
+    logging.info("logits.dtype=%s.", logits.dtype)
     multi_class_labels = math_ops.cast(multi_class_labels, logits.dtype)
+    logging.info("multi_class_labels.dtype=%s.", multi_class_labels.dtype)
     logits.get_shape().assert_is_compatible_with(multi_class_labels.get_shape())
 
     if label_smoothing > 0:
@@ -476,12 +588,15 @@ def sigmoid_cross_entropy(
     losses = nn.sigmoid_cross_entropy_with_logits(labels=multi_class_labels,
                                                   logits=logits,
                                                   name="xentropy")
-    return compute_weighted_loss(losses, weights, scope, loss_collection)
+    logging.info("losses.dtype=%s.", losses.dtype)
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
 
 
 def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
-    loss_collection=ops.GraphKeys.LOSSES):
+    loss_collection=ops.GraphKeys.LOSSES,
+    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Creates a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -503,9 +618,11 @@ def softmax_cross_entropy(
     label_smoothing: If greater than 0 then smooth the labels.
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` representing the mean loss value.
+    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
+    `NONE`, this has shape `[batch_size]`; otherwise, it is scalar.
 
   Raises:
     ValueError: If the shape of `logits` doesn't match that of `onehot_labels`
@@ -527,7 +644,8 @@ def softmax_cross_entropy(
     losses = nn.softmax_cross_entropy_with_logits(labels=onehot_labels,
                                                   logits=logits,
                                                   name="xentropy")
-    return compute_weighted_loss(losses, weights, scope, loss_collection)
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
 
 
 # TODO(ptucker): Merge this with similar method in metrics_impl.
@@ -572,7 +690,7 @@ def _remove_squeezable_dimensions(
     # Use dynamic rank.
     rank_diff = array_ops.rank(weights) - array_ops.rank(labels)
     if (weights_rank is None) or (
-        weights_shape.dims[-1].is_compatible_with(1)):
+        weights_rank > 0 and weights_shape.dims[-1].is_compatible_with(1)):
       weights = control_flow_ops.cond(
           math_ops.equal(1, rank_diff),
           lambda: array_ops.squeeze(weights, [-1]),
@@ -581,8 +699,10 @@ def _remove_squeezable_dimensions(
   return labels, predictions, weights
 
 
-def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
-                                 loss_collection=ops.GraphKeys.LOSSES):
+def sparse_softmax_cross_entropy(
+    labels, logits, weights=1.0, scope=None,
+    loss_collection=ops.GraphKeys.LOSSES,
+    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
   """Cross-entropy loss using `tf.nn.sparse_softmax_cross_entropy_with_logits`.
 
   `weights` acts as a coefficient for the loss. If a scalar is provided,
@@ -602,9 +722,11 @@ def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
       `labels`
     scope: the scope for the operations performed in computing the loss.
     loss_collection: collection to which the loss will be added.
+    reduction: Type of reduction to apply to loss.
 
   Returns:
-    A scalar `Tensor` representing the mean loss value.
+    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
+    `NONE`, this has the same shape as `labels`; otherwise, it is scalar.
 
   Raises:
     ValueError: If the shapes of logits, labels, and weight are incompatible, or
@@ -620,4 +742,5 @@ def sparse_softmax_cross_entropy(labels, logits, weights=1.0, scope=None,
     losses = nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                          logits=logits,
                                                          name="xentropy")
-    return compute_weighted_loss(losses, weights, scope, loss_collection)
+    return compute_weighted_loss(
+        losses, weights, scope, loss_collection, reduction=reduction)
diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index 15d9d86f013a45bcb19fda5cda1de5bedcc1f8f8..3414df475f5744f11482add71b403fb7d86c2265 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -57,13 +57,13 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
 
 
 def get_regularization_losses(scope=None):
-  """Gets the regularization losses.
+  """Gets the list of regularization losses.
 
   Args:
     scope: An optional scope for filtering the losses to return.
 
   Returns:
-    A list of loss variables.
+    A list of regularization losses as Tensors.
   """
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
@@ -88,7 +88,11 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"):
 def get_total_loss(add_regularization_losses=True, name="total_loss"):
   """Returns a tensor whose value represents the total loss.
 
-  Notice that the function adds the given losses to the regularization losses.
+  In particular, this adds any losses you have added with `tf.add_loss()` to
+  any regularization losses that have been added by regularization parameters
+  on layers constructors e.g. `tf.layers`. Be very sure to use this if you
+  are constructing a loss_op manually. Otherwise regularization arguments
+  on `tf.layers` methods will not function.
 
   Args:
     add_regularization_losses: A boolean indicating whether or not to use the
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 8aa8de530c22a5e14ce3a86724dcaec588127561..024158e7097f389ccf6d72acc60bdab3bfafeb92 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -429,7 +429,7 @@ def _DigammaGrad(op, grad):
 
 @ops.RegisterGradient("Igamma")
 def _IgammaGrad(op, grad):
-  """Returns gradient of igamma(a, x) with respect to a and x."""
+  """Returns gradient of igamma(a, x) with respect to x."""
   # TODO(ebrevdo): Perhaps add the derivative w.r.t. a
   a = op.inputs[0]
   x = op.inputs[1]
@@ -440,14 +440,43 @@ def _IgammaGrad(op, grad):
   # Perform operations in log space before summing, because Gamma(a)
   # and Gamma'(a) can grow large.
   partial_x = math_ops.exp(-x + (a - 1) * math_ops.log(x) - math_ops.lgamma(a))
+  # TODO(b/36815900): Mark None return values as NotImplemented
   return (None,
           array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
 
 
 @ops.RegisterGradient("Igammac")
 def _IgammacGrad(op, grad):
-  """Returns gradient of igammac(a, x) = 1 - igamma(a, x) w.r.t. a and x."""
-  return [-1 * g if g is not None else None for g in _IgammaGrad(op, grad)]
+  """Returns gradient of igammac(a, x) = 1 - igamma(a, x) w.r.t. x."""
+  _, igamma_grad_x = _IgammaGrad(op, grad)
+  return None, -igamma_grad_x
+
+
+@ops.RegisterGradient("Betainc")
+def _BetaincGrad(op, grad):
+  """Returns gradient of betainc(a, b, x) with respect to x."""
+  # TODO(ebrevdo): Perhaps add the derivative w.r.t. a, b
+  a, b, x = op.inputs
+
+  # two cases: x is a scalar and a/b are same-shaped tensors, or vice
+  # versa; so its sufficient to check against shape(a).
+  sa = array_ops.shape(a)
+  sx = array_ops.shape(x)
+  # pylint: disable=protected-access
+  _, rx = gen_array_ops._broadcast_gradient_args(sa, sx)
+  # pylint: enable=protected-access
+
+  # Perform operations in log space before summing, because terms
+  # can grow large.
+  log_beta = (gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b)
+              - gen_math_ops.lgamma(a + b))
+  partial_x = math_ops.exp(
+      (b - 1) * math_ops.log(1 - x) + (a - 1) * math_ops.log(x) - log_beta)
+
+  # TODO(b/36815900): Mark None return values as NotImplemented
+  return (None,  # da
+          None,  # db
+          array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
 
 
 @ops.RegisterGradient("Zeta")
@@ -465,6 +494,7 @@ def _ZetaGrad(op, grad):
     x = math_ops.conj(x)
     q = math_ops.conj(q)
     partial_q = -x * math_ops.zeta(x + 1, q)
+    # TODO(b/36815900): Mark None return values as NotImplemented
     return (None,
             array_ops.reshape(math_ops.reduce_sum(partial_q * grad, rq), sq))
 
@@ -484,6 +514,7 @@ def _PolygammaGrad(op, grad):
     n = math_ops.conj(n)
     x = math_ops.conj(x)
     partial_x = math_ops.polygamma(n + 1, x)
+    # TODO(b/36815900): Mark None return values as NotImplemented
     return (None,
             array_ops.reshape(math_ops.reduce_sum(partial_x * grad, rx), sx))
 
@@ -582,6 +613,16 @@ def _AtanGrad(op, grad):
     return grad * inv
 
 
+@ops.RegisterGradient("Atan2")
+def _Atan2Grad(op, grad):
+  """Returns grad * x / (x^2 + y^2), grad * -y / (x^2 + y^2)."""
+  y = op.inputs[0]
+  x = op.inputs[1]
+  with ops.control_dependencies([grad.op]):
+    grad_inv = grad / (math_ops.square(x) + math_ops.square(y))
+    return x * grad_inv, -y * grad_inv
+
+
 @ops.RegisterGradient("AddN")
 def _AddNGrad(op, grad):
   """Copies the gradient to all inputs."""
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 6734f3fc73e31b04945e69f25a8914386b1c6398..9e989210257b43915b3c22d747a2c0625c01f525 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -56,6 +56,7 @@ See the @{$python/math_ops} guide.
 @@acos
 @@asin
 @@atan
+@@atan2
 @@lgamma
 @@digamma
 @@erf
@@ -151,6 +152,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_sparse_ops
 from tensorflow.python.ops import gen_spectral_ops
 from tensorflow.python.ops import gen_state_ops
@@ -240,6 +242,12 @@ def abs(x, name=None):
 # pylint: enable=g-docstring-has-escape
 
 
+# pylint: disable=redefined-builtin
+def _bucketize(input, boundaries, name=None):
+  return gen_math_ops._bucketize(input=input, boundaries=boundaries, name=name)
+# pylint: enable=redefined-builtin
+
+
 class DivideDelegateWithName(object):
   """Use Python2/Python3 division delegation to implement divide for tensors."""
 
@@ -378,7 +386,7 @@ def sign(x, name=None):
     A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
 
   @compatibility(numpy)
-  Equivalent to numpy.sign except for the behaviour for input values of NaN.
+  Equivalent to numpy.sign except for the behavior for input values of NaN.
   @end_compatibility
   """
   with ops.name_scope(name, "Sign", [x]) as name:
@@ -817,7 +825,16 @@ def _OverrideBinaryOperatorHelper(func, op_name, clazz_object=ops.Tensor):
   def binary_op_wrapper(x, y):
     with ops.name_scope(None, op_name, [x, y]) as name:
       if not isinstance(y, sparse_tensor.SparseTensor):
-        y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
+        try:
+          y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
+        except TypeError:
+          # If the RHS is not a tensor, it might be a tensor aware object
+          # that can implement the operator with knowledge of itself
+          # and the tensor.
+          if hasattr(type(y), "__r%s__" % op_name):
+            return NotImplemented
+          else:
+            raise
       return func(x, y, name=name)
 
   def binary_op_wrapper_sparse(sp_x, y):
@@ -1065,8 +1082,6 @@ _OverrideBinaryOperatorHelper(_mul_dispatch, "mul")
 _OverrideBinaryOperatorHelper(_div_python2, "div")
 _OverrideBinaryOperatorHelper(_truediv_python3, "truediv")
 _OverrideBinaryOperatorHelper(floordiv, "floordiv")
-# TODO(aselle): Switch mod to floor_mod when ready
-# _OverrideBinaryOperatorHelper(gen_math_ops.floor_mod, "mod")
 _OverrideBinaryOperatorHelper(gen_math_ops._floor_mod, "mod")
 _OverrideBinaryOperatorHelper(pow, "pow")
 
@@ -1919,6 +1934,12 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   NOTE: This operation is not differentiable and cannot be used if inputs depend
   on trainable variables. Please use `tf.add_n` for such cases.
 
+  Aside from differentiability, `tf.accumulate_n` performs the same operation as
+  `tf.add_n`, but does not wait for all of its inputs to be ready before
+  beginning to sum. This can save memory if inputs are ready at different times,
+  since minimum temporary storage is proportional to the output size rather than
+  the inputs size.
+
   For example:
 
   ```python
@@ -2004,6 +2025,24 @@ def sigmoid(x, name=None):
     return gen_math_ops._sigmoid(x, name=name)
 
 
+def log_sigmoid(x, name=None):
+  """Computes log sigmoid of `x` element-wise.
+
+  Specifically, `y = log(1 / (1 + exp(-x)))`.  For numerical stability,
+  we use `y = -tf.nn.softplus(-x)`.
+
+  Args:
+    x: A Tensor with type `float32` or `float64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A Tensor with the same type as `x`.
+  """
+  with ops.name_scope(name, "LogSigmoid", [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    return gen_math_ops._neg(gen_nn_ops.softplus(-x), name=name)
+
+
 def tanh(x, name=None):
   """Computes hyperbolic tangent of `x` element-wise.
 
@@ -2026,9 +2065,9 @@ def tanh(x, name=None):
 
 
 def bincount(arr,
+             weights=None,
              minlength=None,
              maxlength=None,
-             weights=None,
              dtype=dtypes.int32):
   """Counts the number of occurrences of each value in an integer array.
 
@@ -2040,13 +2079,13 @@ def bincount(arr,
 
   Args:
     arr: An int32 tensor of non-negative values.
+    weights: If non-None, must be the same shape as arr. For each value in
+        `arr`, the bin will be incremented by the corresponding weight instead
+        of 1.
     minlength: If given, ensures the output has length at least `minlength`,
         padding with zeros at the end if necessary.
     maxlength: If given, skips values in `arr` that are equal or greater than
         `maxlength`, ensuring that the output has length at most `maxlength`.
-    weights: If non-None, must be the same shape as arr. For each value in
-        `arr`, the bin will be incremented by the corresponding weight instead
-        of 1.
     dtype: If `weights` is None, determines the type of the output bins.
 
   Returns:
@@ -2290,7 +2329,7 @@ def tensordot(a, b, axes, name=None):
     using `array_ops.transpose` and `array_ops.reshape`. The method takes a
     tensor and performs the correct transpose and reshape operation for a given
     set of indices. It returns the reshaped tensor as well as a list of indices
-    necesary to reshape the tensor again after matrix multiplication.
+    necessary to reshape the tensor again after matrix multiplication.
 
     Args:
       a: `Tensor`.
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 7dbc8efe16aab6f7618b3d4ccc93fb019f9b15c9..617d2305bd87df5eda7374b6fa8756ef6fd5553a 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -28,6 +29,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
+ops._USE_C_API = True
+
 exp = np.exp
 log = np.log
 
@@ -53,7 +56,8 @@ class ReduceTest(test_util.TensorFlowTestCase):
   def testReduceInvalidAxis(self):
     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
     axis = np.array([[0], [1]])
-    with self.assertRaisesRegexp(ValueError, "must be at most rank 1"):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "must be at most rank 1"):
       math_ops.reduce_sum(x, axis)
 
 
@@ -278,7 +282,8 @@ class AddNTest(test_util.TensorFlowTestCase):
     for _ in range(98):
       partials.append(math_ops.add_n([constant_op.constant(1)]))
     partials.append(
-        math_ops.add_n([constant_op.constant(1), constant_op.constant(1)]))
+        math_ops.add_n([constant_op.constant(1),
+                        constant_op.constant(1)]))
 
     res = math_ops.add_n(partials) + constant_op.constant(0)
     with self.test_session(use_gpu=True):
@@ -408,9 +413,9 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       tf_divs = array_ops.constant(divs)
       tf2_result = (tf_nums // tf_divs * tf_divs + tf_nums % tf_divs).eval()
       np_result = (nums // divs) * divs + (nums % divs)
-      # consistentcy with numpy
+      # Consistent with numpy
       self.assertAllEqual(tf_result, np_result)
-      # consistentcy with two forms of divide
+      # Consistent with two forms of divide
       self.assertAllEqual(tf_result, tf2_result)
       # consistency for truncation form
       tf3_result = (math_ops.truncatediv(nums, divs) * divs +
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index bdb34dd78e64f59f48ccd747d323cf2b0d076176..0d35f50894f5d4f860fd6a7966c4e0252c80ffe1 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import sets
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
 from tensorflow.python.ops import weights_broadcast_ops
 
 
@@ -45,7 +44,7 @@ def _local_variable(initial_value, validate_shape=True, name=None):
   Returns:
     New variable.
   """
-  return variables.Variable(
+  return variable_scope.variable(
       initial_value, trainable=False,
       collections=[ops.GraphKeys.LOCAL_VARIABLES],
       validate_shape=validate_shape, name=name)
@@ -189,8 +188,8 @@ def _create_local(name, shape, collections=None, validate_shape=True,
   # Make sure local variables are added to tf.GraphKeys.LOCAL_VARIABLES
   collections = list(collections or [])
   collections += [ops.GraphKeys.LOCAL_VARIABLES]
-  return variables.Variable(
-      initial_value=array_ops.zeros(shape, dtype=dtype),
+  return variable_scope.variable(
+      array_ops.zeros(shape, dtype=dtype),
       name=name,
       trainable=False,
       collections=collections,
@@ -1736,7 +1735,7 @@ def _streaming_sparse_true_positive_at_k(labels,
     A tuple of `Variable` and update `Operation`.
 
   Raises:
-    ValueError: If `weights` is not `None` and has an incomptable shape.
+    ValueError: If `weights` is not `None` and has an incompatible shape.
   """
   with ops.name_scope(
       name, _at_k_name('true_positive', k, class_id=class_id),
@@ -1832,7 +1831,7 @@ def _streaming_sparse_false_negative_at_k(labels,
     A tuple of `Variable` and update `Operation`.
 
   Raises:
-    ValueError: If `weights` is not `None` and has an incomptable shape.
+    ValueError: If `weights` is not `None` and has an incompatible shape.
   """
   with ops.name_scope(
       name, _at_k_name('false_negative', k, class_id=class_id),
@@ -1925,7 +1924,74 @@ def recall_at_k(labels,
     labels = _maybe_expand_labels(labels, predictions)
 
     _, top_k_idx = nn.top_k(predictions, k)
-    top_k_idx = math_ops.to_int64(top_k_idx)
+    return _sparse_recall_at_top_k(
+        labels=labels,
+        predictions_idx=top_k_idx,
+        k=k,
+        class_id=class_id,
+        weights=weights,
+        metrics_collections=metrics_collections,
+        updates_collections=updates_collections,
+        name=scope)
+
+
+def _sparse_recall_at_top_k(labels,
+                            predictions_idx,
+                            k=None,
+                            class_id=None,
+                            weights=None,
+                            metrics_collections=None,
+                            updates_collections=None,
+                            name=None):
+  """Computes recall@k of top-k predictions with respect to sparse labels.
+
+  Differs from `recall_at_k` in that predictions must be in the form of top `k`
+  class indices, whereas `recall_at_k` expects logits. Refer to `recall_at_k`
+  for more details.
+
+  Args:
+    labels: `int64` `Tensor` or `SparseTensor` with shape
+      [D1, ... DN, num_labels] or [D1, ... DN], where the latter implies
+      num_labels=1. N >= 1 and num_labels is the number of target classes for
+      the associated prediction. Commonly, N=1 and `labels` has shape
+      [batch_size, num_labels]. [D1, ... DN] must match `predictions`. Values
+      should be in range [0, num_classes), where num_classes is the last
+      dimension of `predictions`. Values outside this range always count
+      towards `false_negative_at_<k>`.
+    predictions_idx: Integer `Tensor` with shape [D1, ... DN, k] where N >= 1.
+      Commonly, N=1 and predictions has shape [batch size, k]. The final
+      dimension contains the top `k` predicted class indices. [D1, ... DN] must
+      match `labels`.
+    k: Integer, k for @k metric.
+    class_id: Integer class ID for which we want binary metrics. This should be
+      in range [0, num_classes), where num_classes is the last dimension of
+      `predictions`. If class_id is outside this range, the method returns NAN.
+    weights: `Tensor` whose rank is either 0, or n-1, where n is the rank of
+      `labels`. If the latter, it must be broadcastable to `labels` (i.e., all
+      dimensions must be either `1`, or the same as the corresponding `labels`
+      dimension).
+    metrics_collections: An optional list of collections that values should
+      be added to.
+    updates_collections: An optional list of collections that updates should
+      be added to.
+    name: Name of new update operation, and namespace for other dependent ops.
+
+  Returns:
+    recall: Scalar `float64` `Tensor` with the value of `true_positives` divided
+      by the sum of `true_positives` and `false_negatives`.
+    update_op: `Operation` that increments `true_positives` and
+      `false_negatives` variables appropriately, and whose value matches
+      `recall`.
+
+  Raises:
+    ValueError: If `weights` is not `None` and its shape doesn't match
+    `predictions`, or if either `metrics_collections` or `updates_collections`
+    are not a list or tuple.
+  """
+  with ops.name_scope(name,
+                      _at_k_name('recall', k, class_id=class_id),
+                      (predictions_idx, labels, weights)) as scope:
+    top_k_idx = math_ops.to_int64(predictions_idx)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
         predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
         weights=weights)
@@ -2587,7 +2653,7 @@ def _streaming_sparse_false_positive_at_k(labels,
     A tuple of `Variable` and update `Operation`.
 
   Raises:
-    ValueError: If `weights` is not `None` and has an incomptable shape.
+    ValueError: If `weights` is not `None` and has an incompatible shape.
   """
   with ops.name_scope(
       name, _at_k_name('false_positive', k, class_id=class_id),
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index c5c5169231167ac36294a2821db3b5d440dd2b35..7b6494e0c97305723e98220129a6c04f0ebce81e 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -27,6 +27,7 @@ See the @{$python/nn} guide.
 @@dropout
 @@bias_add
 @@sigmoid
+@@log_sigmoid
 @@tanh
 @@convolution
 @@conv2d
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 93f6f97ee4a6b75b9233851b2303e298dda7d319..028d82aa4da50a506d80cd24b9fd8e0c7fa584e1 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import gen_nn_ops
 
 
 @ops.RegisterGradient("Conv2DBackpropInput")
@@ -132,12 +133,12 @@ def _AvgPool3DGrad(op, grad):
 
 @ops.RegisterGradient("AvgPool3DGrad")
 def _AvgPool3DGradGrad(op, grad):
-  return (array_ops.stop_gradient(op.inputs[0]),
-          gen_nn_ops.avg_pool3d(grad,
-                                op.get_attr("ksize"),
-                                op.get_attr("strides"),
-                                op.get_attr("padding"),
-                                data_format=op.get_attr("data_format")))
+  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops.avg_pool3d(
+      grad,
+      op.get_attr("ksize"),
+      op.get_attr("strides"),
+      op.get_attr("padding"),
+      data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool3D")
@@ -154,32 +155,34 @@ def _MaxPool3DGrad(op, grad):
 
 @ops.RegisterGradient("MaxPool3DGrad")
 def _MaxPool3DGradGrad(op, grad):
-  return (array_ops.zeros(shape=array_ops.shape(op.inputs[0]),
-                          dtype=op.inputs[0].dtype),
-          array_ops.zeros(shape=array_ops.shape(op.inputs[1]),
-                          dtype=op.inputs[1].dtype),
-          gen_nn_ops._max_pool3d_grad_grad(op.inputs[0],
-                                           op.inputs[1],
-                                           grad,
-                                           op.get_attr("ksize"),
-                                           op.get_attr("strides"),
-                                           padding=op.get_attr("padding"),
-                                           data_format=op.get_attr("data_format")))
+  return (array_ops.zeros(
+      shape=array_ops.shape(op.inputs[0]),
+      dtype=op.inputs[0].dtype), array_ops.zeros(
+          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+          gen_nn_ops._max_pool3d_grad_grad(
+              op.inputs[0],
+              op.inputs[1],
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              padding=op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool3DGradGrad")
 def _MaxPool3DGradGradGrad(op, grad):
-  return (array_ops.zeros(shape=array_ops.shape(op.inputs[0]),
-                          dtype=op.inputs[0].dtype),
-          array_ops.zeros(shape=array_ops.shape(op.inputs[1]),
-                          dtype=op.inputs[1].dtype),
-          gen_nn_ops._max_pool3d_grad(op.inputs[0],
-                                      op.inputs[1],
-                                      grad,
-                                      op.get_attr("ksize"),
-                                      op.get_attr("strides"),
-                                      padding=op.get_attr("padding"),
-                                      data_format=op.get_attr("data_format")))
+  return (array_ops.zeros(
+      shape=array_ops.shape(op.inputs[0]),
+      dtype=op.inputs[0].dtype), array_ops.zeros(
+          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+          gen_nn_ops._max_pool3d_grad(
+              op.inputs[0],
+              op.inputs[1],
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              padding=op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("Softmax")
@@ -328,7 +331,7 @@ def _EluGradGrad(op, grad):
   return (gen_nn_ops._elu_grad(grad, op.outputs[0]),
           array_ops.where(
               x < 0., gen_nn_ops._elu_grad(grad, op.outputs[0] + 1),
-              array_ops.zeros(shape = array_ops.shape(x), dtype = x.dtype)))
+              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
 
 
 @ops.RegisterGradient("Relu6")
@@ -346,6 +349,19 @@ def _SoftplusGrad(op, grad):
   return gen_nn_ops._softplus_grad(grad, op.inputs[0])
 
 
+@ops.RegisterGradient("SoftplusGrad")
+def _SoftplusGradGrad(op, grad):
+  # Let:
+  #   y = tf.nn.softplus(x)
+  #   dx = gen_nn_ops._softplus_grad(dy, x) = dy / (1 + exp(-x))
+  # This op computes (ddy, d2x) from op.inputs == [dy, x] and grad == ddx.
+  dy, x = op.inputs
+  with ops.control_dependencies([grad.op]):
+    ddy = gen_nn_ops._softplus_grad(grad, x)  # pylint: disable=protected-access
+    d2x = grad * dy / (math_ops.exp(-x) + 2.0 + math_ops.exp(x))
+    return (ddy, d2x)
+
+
 @ops.RegisterGradient("Softsign")
 def _SoftsignGrad(op, grad):
   return gen_nn_ops._softsign_grad(grad, op.inputs[0])
@@ -385,12 +401,20 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
   softmax_grad = op.outputs[1]
   grad = _BroadcastMul(grad_loss, softmax_grad)
 
-  if grad_grad.op.type not in ('ZerosLike', 'Zeros'):
+  def IsZero(g):
+    # Some introspection to check if the gradient is feeding zeros
+    if g.op.type in ("ZerosLike", "Zeros"):
+      return True
+    const_fill_value = tensor_util.constant_value(g)
+    return const_fill_value is not None and (const_fill_value == 0).all()
+
+  if not IsZero(grad_grad):
     logits = op.inputs[0]
     softmax = nn_ops.softmax(logits)
 
-    grad += ((grad_grad - array_ops.squeeze(math_ops.matmul(grad_grad[:, None, :],
-                                                              softmax[:, :, None]), axis=1)) * softmax)
+    grad += ((grad_grad - array_ops.squeeze(
+        math_ops.matmul(grad_grad[:, None, :],
+                        softmax[:, :, None]), axis=1)) * softmax)
 
   return grad, None
 
@@ -482,12 +506,12 @@ def _AvgPoolGrad(op, grad):
 
 @ops.RegisterGradient("AvgPoolGrad")
 def _AvgPoolGradGrad(op, grad):
-  return (array_ops.stop_gradient(op.inputs[0]),
-          gen_nn_ops._avg_pool(grad,
-                               op.get_attr("ksize"),
-                               op.get_attr("strides"),
-                               op.get_attr("padding"),
-                               data_format=op.get_attr("data_format")))
+  return (array_ops.stop_gradient(op.inputs[0]), gen_nn_ops._avg_pool(
+      grad,
+      op.get_attr("ksize"),
+      op.get_attr("strides"),
+      op.get_attr("padding"),
+      data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPool")
@@ -501,34 +525,46 @@ def _MaxPoolGrad(op, grad):
                                    data_format=op.get_attr("data_format"))
 
 
+@ops.RegisterGradient("MaxPoolWithArgmax")
+def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
+  return gen_nn_ops._max_pool_grad_with_argmax(op.inputs[0],
+                                               grad,
+                                               op.outputs[1],
+                                               op.get_attr("ksize"),
+                                               op.get_attr("strides"),
+                                               padding=op.get_attr("padding"))
+
+
 @ops.RegisterGradient("MaxPoolGrad")
 def _MaxPoolGradGrad(op, grad):
-  return (array_ops.zeros(shape=array_ops.shape(op.inputs[0]),
-                          dtype=op.inputs[0].dtype),
-          array_ops.zeros(shape=array_ops.shape(op.inputs[1]),
-                          dtype=op.inputs[1].dtype),
-          gen_nn_ops._max_pool_grad_grad(op.inputs[0],
-                                         op.inputs[1],
-                                         grad,
-                                         op.get_attr("ksize"),
-                                         op.get_attr("strides"),
-                                         padding=op.get_attr("padding"),
-                                         data_format=op.get_attr("data_format")))
+  return (array_ops.zeros(
+      shape=array_ops.shape(op.inputs[0]),
+      dtype=op.inputs[0].dtype), array_ops.zeros(
+          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+          gen_nn_ops._max_pool_grad_grad(
+              op.inputs[0],
+              op.inputs[1],
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              padding=op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("MaxPoolGradGrad")
 def _MaxPoolGradGradGrad(op, grad):
-  return (array_ops.zeros(shape=array_ops.shape(op.inputs[0]),
-                          dtype=op.inputs[0].dtype),
-          array_ops.zeros(shape=array_ops.shape(op.inputs[1]),
-                          dtype=op.inputs[1].dtype),
-          gen_nn_ops._max_pool_grad(op.inputs[0],
-                                    op.inputs[1],
-                                    grad,
-                                    op.get_attr("ksize"),
-                                    op.get_attr("strides"),
-                                    padding=op.get_attr("padding"),
-                                    data_format=op.get_attr("data_format")))
+  return (array_ops.zeros(
+      shape=array_ops.shape(op.inputs[0]),
+      dtype=op.inputs[0].dtype), array_ops.zeros(
+          shape=array_ops.shape(op.inputs[1]), dtype=op.inputs[1].dtype),
+          gen_nn_ops._max_pool_grad(
+              op.inputs[0],
+              op.inputs[1],
+              grad,
+              op.get_attr("ksize"),
+              op.get_attr("strides"),
+              padding=op.get_attr("padding"),
+              data_format=op.get_attr("data_format")))
 
 
 @ops.RegisterGradient("FractionalMaxPool")
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index dc044a29216d2d7223158772311ce45f39d8a86b..0a00e3d76508b9e910e755203337940d9b53a6ea 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -301,9 +301,8 @@ def zero_fraction(value, name=None):
   This is useful in summaries to measure and report sparsity.  For example,
 
   ```python
-      z = tf.Relu(...)
-      summ = tf.contrib.deprecated.scalar_summary('sparsity',
-      tf.nn.zero_fraction(z))
+      z = tf.nn.relu(...)
+      summ = tf.summary.scalar('sparsity', tf.nn.zero_fraction(z))
   ```
 
   Args:
@@ -639,18 +638,22 @@ def moments(x, axes, shift=None, name=None, keep_dims=False):
           math_ops.reduce_mean(y, axes, keep_dims=True))
     else:
       shift = math_ops.cast(shift, y.dtype)
-    counts, m_ss, v_ss, shift = sufficient_statistics(
-        y, axes, shift=shift, keep_dims=keep_dims, name=name)
-    # Reshape shift as needed.
-    shift = array_ops.reshape(shift, array_ops.shape(m_ss))
-    shift.set_shape(m_ss.get_shape())
-    with ops.control_dependencies([counts, m_ss, v_ss]):
-      mean, variance = normalize_moments(counts, m_ss, v_ss, shift, name=name)
-      if x.dtype == dtypes.float16:
-        return (math_ops.cast(mean, dtypes.float16),
-                math_ops.cast(variance, dtypes.float16))
-      else:
-        return (mean, variance)
+    shifted_mean = math_ops.reduce_mean(
+        math_ops.subtract(y, shift), axes, keep_dims=True, name="shifted_mean")
+    variance = math_ops.subtract(
+        math_ops.reduce_mean(
+            math_ops.squared_difference(y, shift), axes, keep_dims=True),
+        math_ops.square(shifted_mean),
+        name="variance")
+    mean = math_ops.add(shifted_mean, shift, name="mean")
+    if not keep_dims:
+      mean = array_ops.squeeze(mean, axes)
+      variance = array_ops.squeeze(variance, axes)
+    if x.dtype == dtypes.float16:
+      return (math_ops.cast(mean, dtypes.float16), math_ops.cast(
+          variance, dtypes.float16))
+    else:
+      return (mean, variance)
 
 
 def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
@@ -920,7 +923,8 @@ def _compute_sampled_logits(weights,
     weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
         objects whose concatenation along dimension 0 has shape
         `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
-    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    biases: A `Tensor` of shape `[num_classes]`.  The (possibly-partitioned)
+        class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
         the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
@@ -985,7 +989,8 @@ def _compute_sampled_logits(weights,
     # weights shape is [num_classes, dim]
     all_w = embedding_ops.embedding_lookup(
         weights, all_ids, partition_strategy=partition_strategy)
-    all_b = embedding_ops.embedding_lookup(biases, all_ids)
+    all_b = embedding_ops.embedding_lookup(
+        biases, all_ids, partition_strategy=partition_strategy)
     # true_w shape is [batch_size * num_true, dim]
     # true_b is a [batch_size * num_true] tensor
     true_w = array_ops.slice(
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index b2ccec0a9d21a85589e67a5822b89df32098ba8b..e4eaeff67ad6b90d6c9fb129b0f7ad8f732d77b0 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -39,6 +39,8 @@ from tensorflow.python.ops.gen_nn_ops import *
 # Aliases for some automatically-generated names.
 local_response_normalization = gen_nn_ops.lrn
 
+# pylint: disable=protected-access
+
 
 def _non_atrous_convolution(input, filter, padding, data_format=None,  # pylint: disable=redefined-builtin
                             strides=None, name=None):
@@ -373,7 +375,9 @@ def with_space_to_batch(
     input_shape_list = input.get_shape().as_list()
     input_spatial_shape = [input_shape_list[i] for i in spatial_dims]
   if input_spatial_shape is None or None in input_spatial_shape:
-    input_spatial_shape = array_ops.gather(array_ops.shape(input), spatial_dims)
+    input_shape_tensor = array_ops.shape(input)
+    input_spatial_shape = array_ops.stack(
+        [input_shape_tensor[i] for i in spatial_dims])
 
   paddings, crops = array_ops.required_space_to_batch_paddings(
       input_shape=input_spatial_shape,
@@ -836,6 +840,11 @@ def pool(input,  # pylint: disable=redefined-builtin
 def atrous_conv2d(value, filters, rate, padding, name=None):
   """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
 
+  This function is a simpler wrapper around the more general
+  @{tf.nn.convolution}, and exists only for backwards compatibility. You can
+  use @{tf.nn.convolution} to perform 1-D, 2-D, or 3-D atrous convolution.
+
+
   Computes a 2-D atrous convolution, also known as convolution with holes or
   dilated convolution, given 4-D `value` and `filters` tensors. If the `rate`
   parameter is equal to one, it performs regular 2-D convolution. If the `rate`
@@ -955,93 +964,12 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
     ValueError: If input/output depth does not match `filters`' shape, or if
       padding is other than `'VALID'` or `'SAME'`.
   """
-  with ops.name_scope(name, "atrous_conv2d", [value, filters]) as name:
-    value = ops.convert_to_tensor(value, name="value")
-    filters = ops.convert_to_tensor(filters, name="filters")
-    if not value.get_shape()[3].is_compatible_with(filters.get_shape()[2]):
-      raise ValueError(
-          "value's input channels does not match filters' input channels, "
-          "{} != {}".format(value.get_shape()[3], filters.get_shape()[2]))
-    if rate < 1:
-      raise ValueError("rate {} cannot be less than one".format(rate))
-
-    if rate == 1:
-      value = gen_nn_ops.conv2d(input=value,
-                                filter=filters,
-                                strides=[1, 1, 1, 1],
-                                padding=padding)
-      return value
-
-    # We have two padding contributions. The first is used for converting "SAME"
-    # to "VALID". The second is required so that the height and width of the
-    # zero-padded value tensor are multiples of rate.
-
-    # Padding required to reduce to "VALID" convolution
-    if padding == "SAME":
-      # Handle filters whose shape is unknown during graph creation.
-      if filters.get_shape().is_fully_defined():
-        filter_shape = filters.get_shape().as_list()
-      else:
-        filter_shape = array_ops.shape(filters)
-      filter_height, filter_width = filter_shape[0], filter_shape[1]
-
-      # Spatial dimensions of the filters and the upsampled filters in which we
-      # introduce (rate - 1) zeros between consecutive filter values.
-      filter_height_up = filter_height + (filter_height - 1) * (rate - 1)
-      filter_width_up = filter_width + (filter_width - 1) * (rate - 1)
-
-      pad_height = filter_height_up - 1
-      pad_width = filter_width_up - 1
-
-      # When pad_height (pad_width) is odd, we pad more to bottom (right),
-      # following the same convention as conv2d().
-      pad_top = pad_height // 2
-      pad_bottom = pad_height - pad_top
-      pad_left = pad_width // 2
-      pad_right = pad_width - pad_left
-    elif padding == "VALID":
-      pad_top = 0
-      pad_bottom = 0
-      pad_left = 0
-      pad_right = 0
-    else:
-      raise ValueError("Invalid padding")
-
-    # Handle input whose shape is unknown during graph creation.
-    if value.get_shape().is_fully_defined():
-      value_shape = value.get_shape().as_list()
-    else:
-      value_shape = array_ops.shape(value)
-
-    in_height = value_shape[1] + pad_top + pad_bottom
-    in_width = value_shape[2] + pad_left + pad_right
-
-    # More padding so that rate divides the height and width of the input.
-    pad_bottom_extra = (rate - in_height % rate) % rate
-    pad_right_extra = (rate - in_width % rate) % rate
-
-    # The paddings argument to space_to_batch includes both padding components.
-    space_to_batch_pad = [[pad_top, pad_bottom + pad_bottom_extra],
-                          [pad_left, pad_right + pad_right_extra]]
-
-    value = array_ops.space_to_batch(input=value,
-                                     paddings=space_to_batch_pad,
-                                     block_size=rate)
-
-    value = gen_nn_ops.conv2d(input=value,
-                              filter=filters,
-                              strides=[1, 1, 1, 1],
-                              padding="VALID",
-                              name=name)
-
-    # The crops argument to batch_to_space is just the extra padding component.
-    batch_to_space_crop = [[0, pad_bottom_extra], [0, pad_right_extra]]
-
-    value = array_ops.batch_to_space(input=value,
-                                     crops=batch_to_space_crop,
-                                     block_size=rate)
-
-    return value
+  return convolution(
+      input=value,
+      filter=filters,
+      padding=padding,
+      dilation_rate=np.broadcast_to(rate, (2,)),
+      name=name)
 
 
 def conv2d_transpose(value,
@@ -1268,7 +1196,7 @@ def conv3d_transpose(value,
                      output_shape,
                      strides,
                      padding="SAME",
-                     data_format=None,
+                     data_format="NDHWC",
                      name=None):
   """The transpose of `conv3d`.
 
@@ -1304,10 +1232,11 @@ def conv3d_transpose(value,
                       [value, filter, output_shape]) as name:
     value = ops.convert_to_tensor(value, name="value")
     filter = ops.convert_to_tensor(filter, name="filter")
-    if not value.get_shape()[4].is_compatible_with(filter.get_shape()[4]):
+    axis = 1 if data_format == "NCDHW" else 4
+    if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[4]):
       raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[4], filter.get_shape(
-                       )[4]))
+                       "{} != {}".format(value.get_shape()[axis],
+                                         filter.get_shape()[4]))
 
     output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
     if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)):
@@ -1395,7 +1324,7 @@ def crelu(features, name=None):
   Concatenates a ReLU which selects only the positive part of the activation
   with a ReLU which selects only the *negative* part of the activation.
   Note that as a result this non-linearity doubles the depth of the activations.
-  Source: https://arxiv.org/abs/1603.05201
+  Source: [Understanding and Improving Convolutional Neural Networks via Concatenated Rectified Linear Units. W. Shang, et al.](https://arxiv.org/abs/1603.05201) 
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
@@ -1413,6 +1342,7 @@ def crelu(features, name=None):
 
 def relu6(features, name=None):
   """Computes Rectified Linear 6: `min(max(features, 0), 6)`.
+  Source: [Convolutional Deep Belief Networks on CIFAR-10. A. Krizhevsky](http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf)
 
   Args:
     features: A `Tensor` with type `float`, `double`, `int32`, `int64`, `uint8`,
@@ -1474,14 +1404,14 @@ def _softmax(logits, compute_op, dim=-1, name=None):
     InvalidArgumentError: if `logits` is empty or `dim` is beyond the last
       dimension of `logits`.
   """
-  def _swap_axis(logits, dim_index, last_index):
+  def _swap_axis(logits, dim_index, last_index, name=None):
     """Swaps logits's dim_index and last_index."""
     return array_ops.transpose(logits,
                                array_ops.concat([
                                    math_ops.range(dim_index), [last_index],
                                    math_ops.range(dim_index + 1, last_index),
                                    [dim_index]
-                               ], 0))
+                               ], 0), name=name)
 
   logits = ops.convert_to_tensor(logits)
 
@@ -1497,8 +1427,8 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   if is_last_dim:
     input_shape = array_ops.shape(logits)
     logits = _flatten_outer_dims(logits)
-    output = compute_op(logits, name=name)
-    output = array_ops.reshape(output, input_shape)
+    output = compute_op(logits)
+    output = array_ops.reshape(output, input_shape, name=name)
     return output
 
   # If dim is not the last dimension, we have to do a reshape and transpose so
@@ -1513,11 +1443,11 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   logits = _flatten_outer_dims(logits)
 
   # Do the actual softmax on its last dimension.
-  output = compute_op(logits, name=name)
+  output = compute_op(logits)
 
   # Transform back the output tensor.
   output = array_ops.reshape(output, shape_after_swap)
-  output = _swap_axis(output, dim, math_ops.subtract(input_rank, 1))
+  output = _swap_axis(output, dim, math_ops.subtract(input_rank, 1), name=name)
 
   # Make shape inference work since reshape and transpose may erase its static
   # shape.
@@ -1604,8 +1534,9 @@ def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  `logits` and `labels` must have the same shape `[batch_size, num_classes]`
-  and the same dtype (either `float16`, `float32`, or `float64`).
+  `logits` and `labels` must have the same shape, e.g.
+  `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
+  or `float64`).
 
   **Note that to avoid confusion, it is required to pass only named arguments to
   this function.**
@@ -2019,7 +1950,7 @@ def top_k(input, k=1, sorted=True, name=None):
 def conv1d(value, filters, stride, padding,
            use_cudnn_on_gpu=None, data_format=None,
            name=None):
-  """Computes a 1-D convolution given 3-D input and filter tensors.
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
 
   Given an input tensor of shape
     [batch, in_width, in_channels]
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index fdb036ebfde9768b3c857fe750ab57a8e4c8b223..5cf8c3291cdea496a8fd902d19e9d7faa4a974d8 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -30,6 +30,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops.nn_impl import _compute_sampled_logits
 from tensorflow.python.platform import test as test_lib
@@ -412,13 +415,25 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     biases = np.random.randn(self._num_classes).astype(np.float32)
     hidden_acts = np.random.randn(self._batch_size,
                                   self._dim).astype(np.float32)
-    sharded_weights = [
-        weights[[
-            row for row in range(self._num_classes)
-            if row % self._num_shards == shard
-        ]] for shard in range(self._num_shards)
-    ]
-    return weights, biases, hidden_acts, sharded_weights
+
+    with ops.Graph().as_default() as g:
+      sharded_weights = variable_scope.get_variable(
+          "w",
+          partitioner=partitioned_variables.fixed_size_partitioner(
+              self._num_shards),
+          initializer=constant_op.constant(weights))
+      sharded_biases = variable_scope.get_variable(
+          "b",
+          partitioner=partitioned_variables.fixed_size_partitioner(
+              self._num_shards),
+          initializer=constant_op.constant(biases))
+      with self.test_session(graph=g) as sess:
+        variables.global_variables_initializer().run()
+
+        sharded_weights_v, sharded_biases_v = sess.run(
+            [list(sharded_weights), list(sharded_biases)])
+
+    return weights, biases, hidden_acts, sharded_weights_v, sharded_biases_v
 
   def _ComputeSampledLogitsNP(self,
                               true_w,
@@ -466,7 +481,10 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
       weights_tf = [constant_op.constant(shard) for shard in weights]
     else:
       weights_tf = constant_op.constant(weights)
-    biases_tf = constant_op.constant(biases)
+    if isinstance(biases, list):
+      biases_tf = [constant_op.constant(shard) for shard in biases]
+    else:
+      biases_tf = constant_op.constant(biases)
     hidden_acts_tf = constant_op.constant(
         hidden_acts, shape=(self._batch_size, self._dim))
     labels_tf = constant_op.constant(
@@ -483,12 +501,13 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
         sampled_vals,
         subtract_log_q=subtract_log_q,
         remove_accidental_hits=remove_accidental_hits,
-        name=name)
+        name=name,
+        partition_strategy="div")
     return pred_logits_tf, pred_labels_tf
 
   def testComputeSampledLogitsShapes(self):
     # We just check that the shapes of the returned values are correct.
-    weights, biases, hidden_acts, _ = self._GenerateTestInputs()
+    weights, biases, hidden_acts, _, _ = self._GenerateTestInputs()
     sampled = [1, 0, 2, 3]
     num_sampled = len(sampled)
     true_exp = sampled_exp = [1., 1., 1., 1.]
@@ -529,7 +548,8 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
 
   def testComputeSampledLogitsValues(self):
     # Here we check the actual numerics.
-    weights, biases, hidden_acts, sharded_weights = self._GenerateTestInputs()
+    weights, biases, hidden_acts, sharded_weights, sharded_biases = (
+        self._GenerateTestInputs())
     eps = 1e-3
     sampled = [1, 0, 2, 3]
     num_sampled = len(sampled)
@@ -627,7 +647,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
         self.assertAllClose(logits_np, logits_tf_val, eps)
         self.assertAllClose(labels_np, labels_tf_val, eps)
 
-        # Test 4: Test 1, with sharded weights
+        # Test 4: Test 1, with sharded weights and sharded biases.
         logits_np, labels_np = self._ComputeSampledLogitsNP(
             true_w,
             true_b,
@@ -637,7 +657,7 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
             num_true=num_true_test)
         logits_tf, labels_tf = self._ComputeSampledLogitsTF(
             sharded_weights,
-            biases,
+            sharded_biases,
             hidden_acts,
             labels,
             num_sampled,
@@ -663,7 +683,8 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
       pred = np.minimum(np.maximum(pred, eps), 1 - eps)
       return -targets * np.log(pred) - (1. - targets) * np.log(1. - pred)
 
-    weights, biases, hidden_acts, sharded_weights = self._GenerateTestInputs()
+    weights, biases, hidden_acts, sharded_weights, sharded_biases = (
+        self._GenerateTestInputs())
     labels = [0, 1, 2]
     true_w, true_b = weights[labels], biases[labels]
     sampled = [1, 0, 2, 3]
@@ -697,23 +718,25 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
           biases_tf,
           labels_tf,
           inputs_tf,
-          num_sampled=1,
+          num_sampled=num_sampled,
           num_classes=self._num_classes,
           num_true=1,
-          sampled_values=test_sampled_vals)
+          sampled_values=test_sampled_vals,
+          partition_strategy="div")
 
       self.assertAllClose(nce_loss_np, nce_loss_tf.eval(), 1e-4)
 
-      # Test with sharded weights
+      # Test with sharded weights and sharded biases.
       nce_loss_tf = nn_impl.nce_loss(
-          [constant_op.constant(shard) for shard in sharded_weights],
-          biases_tf,
+          sharded_weights,
+          sharded_biases,
           labels_tf,
           inputs_tf,
-          num_sampled=1,
+          num_sampled=num_sampled,
           num_classes=self._num_classes,
           num_true=1,
-          sampled_values=test_sampled_vals)
+          sampled_values=test_sampled_vals,
+          partition_strategy="div")
 
       self.assertAllClose(nce_loss_np, nce_loss_tf.eval(), 1e-4)
 
@@ -728,7 +751,8 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
       pred = stable_exp_logits / np.sum(stable_exp_logits, 1, keepdims=True)
       return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)
 
-    weights, biases, hidden_acts, sharded_weights = self._GenerateTestInputs()
+    weights, biases, hidden_acts, sharded_weights, sharded_biases = (
+        self._GenerateTestInputs())
     labels = [0, 1, 2]
     true_w, true_b = weights[labels], biases[labels]
     sampled = [1, 0, 2, 3]
@@ -760,26 +784,28 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
           biases=biases_tf,
           labels=labels_tf,
           inputs=inputs_tf,
-          num_sampled=1,
+          num_sampled=num_sampled,
           num_classes=self._num_classes,
           num_true=1,
           sampled_values=test_sampled_vals,
-          remove_accidental_hits=False)
+          remove_accidental_hits=False,
+          partition_strategy="div")
 
       self.assertAllClose(sampled_softmax_loss_np,
                           sampled_softmax_loss_tf.eval(), 1e-4)
 
-      # Test with sharded weights
+      # Test with sharded weights and sharded biases.
       sampled_softmax_loss_tf = nn_impl.sampled_softmax_loss(
-          weights=[constant_op.constant(shard) for shard in sharded_weights],
-          biases=biases_tf,
+          weights=sharded_weights,
+          biases=sharded_biases,
           labels=labels_tf,
           inputs=inputs_tf,
-          num_sampled=1,
+          num_sampled=num_sampled,
           num_classes=self._num_classes,
           num_true=1,
           sampled_values=test_sampled_vals,
-          remove_accidental_hits=False)
+          remove_accidental_hits=False,
+          partition_strategy="div")
 
       self.assertAllClose(sampled_softmax_loss_np,
                           sampled_softmax_loss_tf.eval(), 1e-4)
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index b1994acdc4bd74853b29adc0e22f6c93f109c68d..c2f9961731630173127ed5367789a31550ce1c0d 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -58,7 +58,7 @@ class SparseFeature(
         ["index_key", "value_key", "dtype", "size", "already_sorted"])):
   """Configuration for parsing a sparse input feature from an `Example`.
 
-  Note, preferrably use `VarLenFeature` (possibly in combination with a
+  Note, preferably use `VarLenFeature` (possibly in combination with a
   `SequenceExample`) in order to parse out `SparseTensor`s instead of
   `SparseFeature` due to its simplicity.
 
@@ -845,7 +845,7 @@ def parse_single_sequence_example(
   Parses a single serialized [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
   proto given in `serialized`.
 
-  This op parses a serialize sequence example into a tuple of dictionaries
+  This op parses a serialized sequence example into a tuple of dictionaries
   mapping keys to `Tensor` and `SparseTensor` objects respectively.
   The first dictionary contains mappings for keys appearing in
   `context_features`, and the second dictionary contains mappings for keys
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 19689622b14308fe2a84e1c97c3045b2aea1b6dc..15613289a0b0a96b4631c60afa9d196da408480d 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -300,7 +300,8 @@ def random_crop(value, size, seed=None, name=None):
     shape = array_ops.shape(value)
     check = control_flow_ops.Assert(
         math_ops.reduce_all(shape >= size),
-        ["Need value.shape >= size, got ", shape, size])
+        ["Need value.shape >= size, got ", shape, size],
+        summarize=1000)
     shape = control_flow_ops.with_dependencies([check], shape)
     limit = shape - size + 1
     offset = random_uniform(
@@ -324,7 +325,7 @@ def multinomial(logits, num_samples, seed=None, name=None):
 
   Args:
     logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
-      `[i, :]` represents the unnormalized log probabilities for all classes.
+      `[i, :]` represents the log-odds for all classes.
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index e84fa21868f0e943a17154f40c41337dfcde6cf2..9ac50345eab4c10439af44d24acd660f88a6e0a3 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import variables
 # go/tf-wildcard-import
@@ -34,7 +35,7 @@ from tensorflow.python.ops.gen_resource_variable_ops import *
 from tensorflow.python.util import compat
 
 
-class ResourceVariable(object):
+class ResourceVariable(variables.Variable):
   """Variable based on resource handles.
 
   TODO(apassos): fill this out explaining the semantics and Variable
@@ -159,16 +160,15 @@ class ResourceVariable(object):
     with ops.control_dependencies(None):
       with ops.name_scope(name, "Variable", [] if init_from_fn else
                           [initial_value]) as name:
+        # pylint: disable=protected-access
+        true_name = ops._name_from_scope_name(name)
         if init_from_fn:
           # Use attr_scope and device(None) to simulate the behavior of
           # colocate_with when the variable we want to colocate with doesn't
           # yet exist.
-          # pylint: disable=protected-access
-          true_name = ops._name_from_scope_name(name)
           attr = attr_value_pb2.AttrValue(
               list=attr_value_pb2.AttrValue.ListValue(
                   s=[compat.as_bytes("loc:@%s" % true_name)]))
-          # pylint: disable=protected-access
           with ops.get_default_graph()._attr_scope({"_class": attr}):
             with ops.name_scope("Initializer"), ops.device(None):
               self._initial_value = ops.convert_to_tensor(
@@ -176,7 +176,8 @@ class ResourceVariable(object):
             self._handle = gen_resource_variable_ops.var_handle_op(
                 shape=self._initial_value.get_shape(),
                 dtype=self._initial_value.dtype.base_dtype,
-                shared_name=name, name=name)
+                shared_name=true_name, name=name)
+        # pylint: enable=protected-access
 
         # Or get the initial value from a Tensor or Python object.
         else:
@@ -185,7 +186,7 @@ class ResourceVariable(object):
           self._handle = gen_resource_variable_ops.var_handle_op(
               shape=self._initial_value.get_shape(),
               dtype=self._initial_value.dtype.base_dtype,
-              shared_name=name, name=name)
+              shared_name=true_name, name=name)
 
         self._dtype = self._initial_value.dtype.base_dtype
 
@@ -197,12 +198,22 @@ class ResourceVariable(object):
             self._initialize_op = gen_resource_variable_ops.assign_variable_op(
                 self._handle, self._initial_value, name=n)
         with ops.name_scope("Read"), ops.colocate_with(self._handle):
-          value = gen_resource_variable_ops.read_variable_op(
-              self._handle, dtype=self._dtype)
+          # Manually assign reads to the handle's device to avoid log messages.
+          with ops.device(self._handle.device):
+            value = gen_resource_variable_ops.read_variable_op(
+                self._handle, dtype=self._dtype)
           self._graph_element = value
           if caching_device is not None:
-            with ops.device(caching_device):
-              self._cached_value = array_ops.identity(value)
+            # Variables may be created in a tf.device() or ops.colocate_with()
+            # context. At the same time, users would expect caching device to be
+            # independent of this context, and/or would not expect the current
+            # device context to be merged with the caching device spec.
+            # Therefore we reset the colocation stack before creating the cached
+            # value. Note that resetting the colocation stack will also reset
+            # the device stack.
+            with ops.colocate_with(None, ignore_existing=True):
+              with ops.device(caching_device):
+                self._cached_value = array_ops.identity(value)
           else:
             self._cached_value = None
           ops.add_to_collections(collections, self)
@@ -234,6 +245,7 @@ class ResourceVariable(object):
       self._save_slice_info = None
     self._caching_device = None
     self._dtype = dtypes.as_dtype(self._handle.op.get_attr("dtype"))
+    self._graph_element = self.value()
 
   @property
   def dtype(self):
@@ -245,6 +257,11 @@ class ResourceVariable(object):
     """The device this variable is on."""
     return self._handle.device
 
+  @property
+  def graph(self):
+    """The `Graph` of this variable."""
+    return self._handle.graph
+
   @property
   def name(self):
     """The name of the handle for this variable."""
@@ -268,8 +285,10 @@ class ResourceVariable(object):
     """A cached operation which reads the value of this variable."""
     if self._cached_value is not None:
       return self._cached_value
-    return gen_resource_variable_ops.read_variable_op(
-        self._handle, dtype=self._dtype)
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(self._handle.device):
+        return gen_resource_variable_ops.read_variable_op(
+            self._handle, dtype=self._dtype)
 
   def _as_graph_element(self):
     """Conversion function for Graph.as_graph_element()."""
@@ -280,6 +299,11 @@ class ResourceVariable(object):
     """The op responsible for initializing this variable."""
     return self._initialize_op
 
+  @property
+  def initial_value(self):
+    """Returns the Tensor used as the initial value for the variable."""
+    return self._initial_value
+
   @property
   def op(self):
     """The op for this variable."""
@@ -310,8 +334,9 @@ class ResourceVariable(object):
      the read operation.
     """
     with ops.name_scope("Read"):
-      value = gen_resource_variable_ops.read_variable_op(
-          self._handle, dtype=self._dtype)
+      with ops.device(self._handle.device):
+        value = gen_resource_variable_ops.read_variable_op(
+            self._handle, dtype=self._dtype)
     # Return an identity so it can get placed on whatever device the context
     # specifies instead of the device where the variable is.
     return array_ops.identity(value)
@@ -369,6 +394,10 @@ class ResourceVariable(object):
   def _AsTensor(self):
     return self.value()
 
+  def _ref(self):
+    """Unsupported."""
+    raise NotImplementedError("ResourceVariable does not implement _ref()")
+
   @staticmethod
   def _OverloadOperator(operator):  # pylint: disable=invalid-name
     """Defer an operator overload to `ops.Tensor`.
@@ -416,18 +445,51 @@ class ResourceVariable(object):
             ops.convert_to_tensor(value, dtype=self.dtype), name=name)]):
       return self.read_value()
 
+  def _strided_slice_assign(self,
+                            begin,
+                            end,
+                            strides,
+                            value,
+                            name,
+                            begin_mask,
+                            end_mask,
+                            ellipsis_mask,
+                            new_axis_mask,
+                            shrink_axis_mask):
+    with ops.control_dependencies([gen_array_ops.resource_strided_slice_assign(
+        ref=self.handle,
+        begin=begin,
+        end=end,
+        strides=strides,
+        value=value,
+        name=name,
+        begin_mask=begin_mask,
+        end_mask=end_mask,
+        ellipsis_mask=ellipsis_mask,
+        new_axis_mask=new_axis_mask,
+        shrink_axis_mask=shrink_axis_mask)]):
+      return self.value()
+
 
 # pylint: disable=unused-argument,protected-access
 def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
   if dtype is not None and dtype != var.value().dtype:
     print("trying to switch the dtype to ", dtype, " from ", var.value().dtype)
     return NotImplemented
+  if as_ref:
+    return var.read_value().op.inputs[0]
   return var.value()
 # pylint: enable=unused-argument,protected-access
 
 # Register a conversion function which reads the value of the variable,
 # allowing instances of the class to be used as tensors.
+
+# Note: registering for Variable after ResourceVariable because inheritance will
+# otherwise lead to the wrong behavior.
 ops.register_tensor_conversion_function(ResourceVariable, _dense_var_to_tensor)
+ops.register_tensor_conversion_function(
+    variables.Variable,
+    variables.Variable._TensorConversionFunction)  # pylint: disable=protected-access
 
 # pylint: disable=protected-access
 ResourceVariable._OverloadAllOperators()
diff --git a/tensorflow/python/ops/resources.py b/tensorflow/python/ops/resources.py
index 41fb8a74a9e16984d115866a4e5123ed296e164a..57ba0084e846a612ba3deedb600f53c123545571 100644
--- a/tensorflow/python/ops/resources.py
+++ b/tensorflow/python/ops/resources.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import tf_should_use
 
 
 _Resource = collections.namedtuple("_Resource",
@@ -98,6 +99,7 @@ def report_uninitialized_resources(resource_list=None,
     return array_ops.boolean_mask(variable_names_tensor, variables_mask)
 
 
+@tf_should_use.should_use_result
 def initialize_resources(resource_list, name="init"):
   """Initializes the resources in the given list.
 
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 162b13ec2129a3b3d25616bbcd5ab8440dce17c4..a6fba046da287a223517521331a4a604b0aee78b 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -33,10 +33,41 @@ from tensorflow.python.util import nest
 
 
 # pylint: disable=protected-access
-_state_size_with_prefix = rnn_cell_impl._state_size_with_prefix
+_concat = rnn_cell_impl._concat
+_like_rnncell = rnn_cell_impl._like_rnncell
 # pylint: enable=protected-access
 
 
+def _transpose_batch_time(x):
+  """Transpose the batch and time dimensions of a Tensor.
+
+  Retains as much of the static shape information as possible.
+
+  Args:
+    x: A tensor of rank 2 or higher.
+
+  Returns:
+    x transposed along the first two dimensions.
+
+  Raises:
+    ValueError: if `x` is rank 1 or lower.
+  """
+  x_static_shape = x.get_shape()
+  if x_static_shape.ndims is not None and x_static_shape.ndims < 2:
+    raise ValueError(
+        "Expected input tensor %s to have rank at least 2, but saw shape: %s" %
+        (x, x_static_shape))
+  x_rank = array_ops.rank(x)
+  x_t = array_ops.transpose(
+      x, array_ops.concat(
+          ([1, 0], math_ops.range(2, x_rank)), axis=0))
+  x_t.set_shape(
+      tensor_shape.TensorShape([
+          x_static_shape[1].value, x_static_shape[0].value
+      ]).concatenate(x_static_shape[2:]))
+  return x_t
+
+
 def _infer_state_dtype(explicit_dtype, state):
   """Infer the dtype of an RNN state.
 
@@ -67,15 +98,6 @@ def _infer_state_dtype(explicit_dtype, state):
     return state.dtype
 
 
-def _on_device(fn, device):
-  """Build the subgraph defined by lambda `fn` on `device` if it's not None."""
-  if device:
-    with ops.device(device):
-      return fn()
-  else:
-    return fn()
-
-
 # pylint: disable=unused-argument
 def _rnn_step(
     time, sequence_length, min_sequence_length, max_sequence_length,
@@ -137,9 +159,8 @@ def _rnn_step(
 
   def _copy_one_through(output, new_output):
     copy_cond = (time >= sequence_length)
-    return _on_device(
-        lambda: array_ops.where(copy_cond, output, new_output),
-        device=new_output.op.device)
+    with ops.colocate_with(new_output):
+      return array_ops.where(copy_cond, output, new_output)
 
   def _copy_some_through(flat_new_output, flat_new_state):
     # Use broadcasting select to determine which values should get
@@ -258,11 +279,10 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                               swap_memory=False, time_major=False, scope=None):
   """Creates a dynamic version of bidirectional recurrent neural network.
 
-  Similar to the unidirectional case above (rnn) but takes input and builds
-  independent forward and backward RNNs. The input_size of forward and
-  backward cell must match. The initial state for both directions is zero by
-  default (but can be set optionally) and no intermediate states are ever
-  returned -- the network is fully unrolled for the given (passed in)
+  Takes input and builds independent forward and backward RNNs. The input_size
+  of forward and backward cell must match. The initial state for both directions
+  is zero by default (but can be set optionally) and no intermediate states are
+  ever returned -- the network is fully unrolled for the given (passed in)
   length(s) of the sequence(s) or completely unrolled if length(s) is not
   given.
 
@@ -332,12 +352,10 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
     TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
   """
 
-  # pylint: disable=protected-access
-  if not isinstance(cell_fw, rnn_cell_impl._RNNCell):
+  if not _like_rnncell(cell_fw):
     raise TypeError("cell_fw must be an instance of RNNCell")
-  if not isinstance(cell_bw, rnn_cell_impl._RNNCell):
+  if not _like_rnncell(cell_bw):
     raise TypeError("cell_bw must be an instance of RNNCell")
-  # pylint: enable=protected-access
 
   with vs.variable_scope(scope or "bidirectional_rnn"):
     # Forward direction
@@ -389,12 +407,10 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                 time_major=False, scope=None):
   """Creates a recurrent neural network specified by RNNCell `cell`.
 
-  This function is functionally identical to the function `rnn` above, but
-  performs fully dynamic unrolling of `inputs`.
+  Performs fully dynamic unrolling of `inputs`.
 
-  Unlike `rnn`, the input `inputs` is not a Python list of `Tensors`, one for
-  each frame.  Instead, `inputs` may be a single `Tensor` where
-  the maximum time is either the first or second dimension (see the parameter
+  `Inputs` may be a single `Tensor` where the maximum time is either the first
+  or second dimension (see the parameter
   `time_major`).  Alternatively, it may be a (possibly nested) tuple of
   Tensors, each of them having matching batch and time dimensions.
   The corresponding output is either a single `Tensor` having the same number
@@ -403,7 +419,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
 
   The parameter `sequence_length` is optional and is used to copy-through state
   and zero-out outputs when past a batch element's sequence length. So it's more
-  for correctness than performance, unlike in rnn().
+  for correctness than performance.
 
   Args:
     cell: An instance of RNNCell.
@@ -480,10 +496,8 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
     ValueError: If inputs is None or an empty list.
   """
 
-  # pylint: disable=protected-access
-  if not isinstance(cell, rnn_cell_impl._RNNCell):
+  if not _like_rnncell(cell):
     raise TypeError("cell must be an instance of RNNCell")
-  # pylint: enable=protected-access
 
   # By default, time_major==False and inputs are batch-major: shaped
   #   [batch, time, depth]
@@ -492,8 +506,8 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
 
   if not time_major:
     # (B,T,D) => (T,B,D)
-    flat_input = tuple(array_ops.transpose(input_, [1, 0, 2])
-                       for input_ in flat_input)
+    flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
+    flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
 
   parallel_iterations = parallel_iterations or 32
   if sequence_length is not None:
@@ -556,11 +570,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
     # to shape [batch, time, depth]
     if not time_major:
       # (T,B,D) => (B,T,D)
-      flat_output = nest.flatten(outputs)
-      flat_output = [array_ops.transpose(output, [1, 0, 2])
-                     for output in flat_output]
-      outputs = nest.pack_sequence_as(
-          structure=outputs, flat_sequence=flat_output)
+      outputs = nest.map_structure(_transpose_batch_time, outputs)
 
     return (outputs, final_state)
 
@@ -637,7 +647,7 @@ def _dynamic_rnn_loop(cell,
 
   # Prepare dynamic conditional copying of state & output
   def _create_zero_arrays(size):
-    size = _state_size_with_prefix(size, prefix=[batch_size])
+    size = _concat(batch_size, size)
     return array_ops.zeros(
         array_ops.stack(size), _infer_state_dtype(dtype, state))
 
@@ -723,8 +733,8 @@ def _dynamic_rnn_loop(cell,
 
   # Restore some shape information
   for output, output_size in zip(final_outputs, flat_output_size):
-    shape = _state_size_with_prefix(
-        output_size, prefix=[const_time_steps, const_batch_size])
+    shape = _concat(
+        [const_time_steps, const_batch_size], output_size, static=True)
     output.set_shape(shape)
 
   final_outputs = nest.pack_sequence_as(
@@ -898,10 +908,8 @@ def raw_rnn(cell, loop_fn,
       a `callable`.
   """
 
-  # pylint: disable=protected-access
-  if not isinstance(cell, rnn_cell_impl._RNNCell):
+  if not _like_rnncell(cell):
     raise TypeError("cell must be an instance of RNNCell")
-  # pylint: enable=protected-access
   if not callable(loop_fn):
     raise TypeError("loop_fn must be a callable")
 
@@ -958,9 +966,7 @@ def raw_rnn(cell, loop_fn,
     emit_ta = nest.pack_sequence_as(structure=emit_structure,
                                     flat_sequence=flat_emit_ta)
     flat_zero_emit = [
-        array_ops.zeros(
-            _state_size_with_prefix(size_i, prefix=[batch_size]),
-            dtype_i)
+        array_ops.zeros(_concat(batch_size, size_i), dtype_i)
         for size_i, dtype_i in zip(flat_emit_size, flat_emit_dtypes)]
     zero_emit = nest.pack_sequence_as(structure=emit_structure,
                                       flat_sequence=flat_zero_emit)
@@ -1003,34 +1009,19 @@ def raw_rnn(cell, loop_fn,
 
       def _copy_some_through(current, candidate):
         """Copy some tensors through via array_ops.where."""
-        current_flat = nest.flatten(current)
-        candidate_flat = nest.flatten(candidate)
-        # pylint: disable=g-long-lambda,cell-var-from-loop
-        result_flat = [
-            _on_device(
-                lambda: array_ops.where(
-                    elements_finished, current_i, candidate_i),
-                device=candidate_i.op.device)
-            for (current_i, candidate_i) in zip(current_flat, candidate_flat)]
-        # pylint: enable=g-long-lambda,cell-var-from-loop
-        return nest.pack_sequence_as(
-            structure=current, flat_sequence=result_flat)
+        def copy_fn(cur_i, cand_i):
+          with ops.colocate_with(cand_i):
+            return array_ops.where(elements_finished, cur_i, cand_i)
+        return nest.map_structure(copy_fn, current, candidate)
 
       emit_output = _copy_some_through(zero_emit, emit_output)
       next_state = _copy_some_through(state, next_state)
 
-      emit_output_flat = nest.flatten(emit_output)
-      emit_ta_flat = nest.flatten(emit_ta)
+      emit_ta = nest.map_structure(
+          lambda ta, emit: ta.write(time, emit), emit_ta, emit_output)
 
       elements_finished = math_ops.logical_or(elements_finished, next_finished)
 
-      emit_ta_flat = [
-          ta.write(time, emit)
-          for (ta, emit) in zip(emit_ta_flat, emit_output_flat)]
-
-      emit_ta = nest.pack_sequence_as(
-          structure=emit_structure, flat_sequence=emit_ta_flat)
-
       return (next_time, elements_finished, next_input,
               emit_ta, next_state, loop_state)
 
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index c3dddf85f3d7191db11f34ab90419230285acf7c..9c0fb1db23dbd16ce5faffa68daede7d13accd59 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -24,61 +24,104 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import nest
 
 
-def _state_size_with_prefix(state_size, prefix=None):
-  """Helper function that enables int or TensorShape shape specification.
+def _like_rnncell(cell):
+  """Checks that a given object is an RNNCell by using duck typing."""
+  conditions = [hasattr(cell, "output_size"), hasattr(cell, "state_size"),
+                hasattr(cell, "zero_state"), callable(cell)]
+  return all(conditions)
 
-  This function takes a size specification, which can be an integer or a
-  TensorShape, and converts it into a list of integers. One may specify any
-  additional dimensions that precede the final state size specification.
+
+def _concat(prefix, suffix, static=False):
+  """Concat that enables int, Tensor, or TensorShape values.
+
+  This function takes a size specification, which can be an integer, a
+  TensorShape, or a Tensor, and converts it into a concatenated Tensor
+  (if static = False) or a list of integers (if static = True).
 
   Args:
-    state_size: TensorShape or int that specifies the size of a tensor.
-    prefix: optional additional list of dimensions to prepend.
+    prefix: The prefix; usually the batch size (and/or time step size).
+      (TensorShape, int, or Tensor.)
+    suffix: TensorShape, int, or Tensor.
+    static: If `True`, return a python list with possibly unknown dimensions.
+      Otherwise return a `Tensor`.
 
   Returns:
-    result_state_size: list of dimensions the resulting tensor size.
+    shape: the concatenation of prefix and suffix.
+
+  Raises:
+    ValueError: if `suffix` is not a scalar or vector (or TensorShape).
+    ValueError: if prefix or suffix was `None` and asked for dynamic
+      Tensors out.
   """
-  result_state_size = tensor_shape.as_shape(state_size).as_list()
-  if prefix is not None:
-    if not isinstance(prefix, list):
-      raise TypeError("prefix of _state_size_with_prefix should be a list.")
-    result_state_size = prefix + result_state_size
-  return result_state_size
+  if isinstance(prefix, ops.Tensor):
+    p = prefix
+    p_static = tensor_util.constant_value(prefix)
+    if p.shape.ndims == 0:
+      p = array_ops.expand_dims(p, 0)
+    elif p.shape.ndims != 1:
+      raise ValueError("prefix tensor must be either a scalar or vector, "
+                       "but saw tensor: %s" % p)
+  else:
+    p = tensor_shape.as_shape(prefix)
+    p_static = p.as_list() if p.ndims is not None else None
+    p = (constant_op.constant(p.as_list(), dtype=dtypes.int32)
+         if p.is_fully_defined() else None)
+  if isinstance(suffix, ops.Tensor):
+    s = suffix
+    s_static = tensor_util.constant_value(suffix)
+    if s.shape.ndims == 0:
+      s = array_ops.expand_dims(s, 0)
+    elif s.shape.ndims != 1:
+      raise ValueError("suffix tensor must be either a scalar or vector, "
+                       "but saw tensor: %s" % s)
+  else:
+    s = tensor_shape.as_shape(suffix)
+    s_static = s.as_list() if s.ndims is not None else None
+    s = (constant_op.constant(s.as_list(), dtype=dtypes.int32)
+         if s.is_fully_defined() else None)
+
+  if static:
+    shape = tensor_shape.as_shape(p_static).concatenate(s_static)
+    shape = shape.as_list() if shape.ndims is not None else None
+  else:
+    if p is None or s is None:
+      raise ValueError("Provided a prefix or suffix of None: %s and %s"
+                       % (prefix, suffix))
+    shape = array_ops.concat((p, s), 0)
+  return shape
 
 
 def _zero_state_tensors(state_size, batch_size, dtype):
   """Create tensors of zeros based on state_size, batch_size, and dtype."""
-  if nest.is_sequence(state_size):
-    state_size_flat = nest.flatten(state_size)
-    zeros_flat = [
-        array_ops.zeros(
-            array_ops.stack(_state_size_with_prefix(
-                s, prefix=[batch_size])),
-            dtype=dtype) for s in state_size_flat
-    ]
-    for s, z in zip(state_size_flat, zeros_flat):
-      z.set_shape(_state_size_with_prefix(s, prefix=[None]))
-    zeros = nest.pack_sequence_as(structure=state_size,
-                                  flat_sequence=zeros_flat)
-  else:
-    zeros_size = _state_size_with_prefix(state_size, prefix=[batch_size])
-    zeros = array_ops.zeros(array_ops.stack(zeros_size), dtype=dtype)
-    zeros.set_shape(_state_size_with_prefix(state_size, prefix=[None]))
-
-  return zeros
+  def get_state_shape(s):
+    """Combine s with batch_size to get a proper tensor shape."""
+    c = _concat(batch_size, s)
+    c_static = _concat(batch_size, s, static=True)
+    size = array_ops.zeros(c, dtype=dtype)
+    size.set_shape(c_static)
+    return size
+  return nest.map_structure(get_state_shape, state_size)
 
 
-class _RNNCell(object):
+class _RNNCell(base_layer.Layer):
   """Abstract object representing an RNN cell.
 
-  Every `RNNCell` must have the properties below and implement `__call__` with
-  the following signature.
+  Every `RNNCell` must have the properties below and implement `call` with
+  the signature `(output, next_state) = call(input, state)`.  The optional
+  third input argument, `scope`, is allowed for backwards compatibility
+  purposes; but should be left off for new subclasses.
 
   This definition of cell differs from the definition used in the literature.
   In the literature, 'cell' refers to an object with a single scalar output.
@@ -89,8 +132,9 @@ class _RNNCell(object):
   This operation results in an output matrix with `self.output_size` columns.
   If `self.state_size` is an integer, this operation also results in a new
   state matrix with `self.state_size` columns.  If `self.state_size` is a
-  tuple of integers, then it results in a tuple of `len(state_size)` state
-  matrices, each with a column size corresponding to values in `state_size`.
+  (possibly nested tuple of) TensorShape object(s), then it should return a
+  matching structure of Tensors having shape `[batch_size].concatenate(s)`
+  for each `s` in `self.batch_size`.
   """
 
   def __call__(self, inputs, state, scope=None):
@@ -111,7 +155,25 @@ class _RNNCell(object):
       - New state: Either a single `2-D` tensor, or a tuple of tensors matching
         the arity and shapes of `state`.
     """
-    raise NotImplementedError("Abstract method")
+    if scope is not None:
+      with vs.variable_scope(scope,
+                             custom_getter=self._rnn_get_variable) as scope:
+        return super(_RNNCell, self).__call__(inputs, state, scope=scope)
+    else:
+      with vs.variable_scope(vs.get_variable_scope(),
+                             custom_getter=self._rnn_get_variable):
+        return super(_RNNCell, self).__call__(inputs, state)
+
+  def _rnn_get_variable(self, getter, *args, **kwargs):
+    variable = getter(*args, **kwargs)
+    trainable = (variable in tf_variables.trainable_variables() or
+                 (isinstance(variable, tf_variables.PartitionedVariable) and
+                  list(variable)[0] in tf_variables.trainable_variables()))
+    if trainable and variable not in self._trainable_weights:
+      self._trainable_weights.append(variable)
+    elif not trainable and variable not in self._non_trainable_weights:
+      self._non_trainable_weights.append(variable)
+    return variable
 
   @property
   def state_size(self):
@@ -127,6 +189,11 @@ class _RNNCell(object):
     """Integer or TensorShape: size of outputs produced by this cell."""
     raise NotImplementedError("Abstract method")
 
+  def build(self, _):
+    # This tells the parent Layer object that it's OK to call
+    # self.add_variable() inside the call() method.
+    pass
+
   def zero_state(self, batch_size, dtype):
     """Return zero-filled state tensor(s).
 
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index 0a06982ad7ca07c51250e274747177c2818c0b96..de43b562f9f38beb726a86fc812cdb417cb30515 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -26,11 +26,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
 import numpy as np
 
 from tensorflow.core.framework import resource_handle_pb2
+from tensorflow.python import pywrap_tensorflow_internal
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -39,16 +38,6 @@ from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.util import compat
 
 
-def decode_resource_handle(encoded):
-  """Decode a ResourceHandle proto encoded as custom numpy struct type."""
-  resource_handle = resource_handle_pb2.ResourceHandle()
-  if sys.version_info.major < 3:
-    resource_handle.ParseFromString("".join([chr(ch[0]) for ch in encoded]))
-  else:
-    resource_handle.ParseFromString(bytes([ch[0] for ch in encoded]))
-  return resource_handle
-
-
 def encode_resource_handle(resource_handle):
   """Encode a ResourceHandle proto as custom numpy struct type."""
   return np.asarray(bytearray(resource_handle.SerializeToString()),
@@ -69,8 +58,8 @@ class TensorHandle(object):
       dtype: The data type of the tensor represented by `handle`.
       session: The session in which the tensor is produced.
     """
-    self._resource_handle = decode_resource_handle(handle)
-    self._handle = compat.as_str_any(self._resource_handle.name)
+    self._handle = compat.as_str_any(handle)
+    self._resource_handle = None
     self._dtype = dtype
     self._session = session
     self._auto_gc_enabled = True
@@ -82,9 +71,14 @@ class TensorHandle(object):
   def __str__(self):
     return self._handle
 
-  @property
-  def resource_handle(self):
+  def _get_resource_handle(self):
     """The ResourceHandle representation of this handle."""
+    if not self._resource_handle:
+      self._resource_handle = resource_handle_pb2.ResourceHandle()
+      self._resource_handle.device = self._handle.split(";")[-1]
+      self._resource_handle.container = (
+          pywrap_tensorflow_internal.TENSOR_HANDLE_KEY)
+      self._resource_handle.name = self._handle
     return self._resource_handle
 
   def to_numpy_array(self):
@@ -94,7 +88,7 @@ class TensorHandle(object):
       A numpy array of a custom struct type that can be used as a feed value
       to run().
     """
-    return encode_resource_handle(self.resource_handle)
+    return encode_resource_handle(self._get_resource_handle())
 
   @property
   def handle(self):
@@ -116,7 +110,7 @@ class TensorHandle(object):
       raise TypeError("Persistent tensor %s may have already been deleted."
                       % self.handle)
     self._auto_gc_enabled = False
-    holder, deleter = _get_handle_deleter(self._session.graph, self._handle)
+    holder, deleter = _get_handle_deleter(self._session.graph, 0, self._handle)
     self._session.run(deleter, feed_dict={holder: self.handle})
 
   def get_raw_handle(self):
@@ -141,11 +135,6 @@ class TensorHandle(object):
     handle_parts = str(handle).split(";")
     return handle_parts[0] + ";" + handle_parts[-1]
 
-  @staticmethod
-  def _get_deleter_key(handle):
-    """The graph key for deleter."""
-    return str(handle).split(";")[-1]
-
   @staticmethod
   def _get_mover_key(feeder, handle):
     """The graph key for mover."""
@@ -191,7 +180,7 @@ def get_session_handle(data, name=None):
 
   # Colocate this operation with data.
   with ops.colocate_with(data):
-    return gen_data_flow_ops._get_session_handle_v2(data, name=name)  # pylint: disable=protected-access
+    return gen_data_flow_ops._get_session_handle(data, name=name)  # pylint: disable=protected-access
 
 
 def get_session_tensor(handle, dtype, name=None):
@@ -296,16 +285,15 @@ def _get_handle_mover(graph, feeder, handle):
     # Create mover if we haven't done it.
     holder, reader = _get_handle_reader(graph, handle, dtype)
     with graph.as_default(), graph.device(feeder.op.device):
-      mover = gen_data_flow_ops._get_session_handle_v2(reader)  # pylint: disable=protected-access
+      mover = gen_data_flow_ops._get_session_handle(reader)  # pylint: disable=protected-access
     result = (holder, mover)
     graph._handle_movers[graph_key] = result
   return result
 
 
-def _get_handle_deleter(graph, handle):
+def _get_handle_deleter(graph, deleter_key, handle):
   """Return a deletion subgraph for this handle."""
-  graph_key = TensorHandle._get_deleter_key(handle)
-  result = graph._handle_deleters.get(graph_key)
+  result = graph._handle_deleters.get(deleter_key)
   if result is None:
     # Create deleter if we haven't done it.
     handle_device = TensorHandle._get_device_name(handle)
@@ -313,5 +301,5 @@ def _get_handle_deleter(graph, handle):
       holder = array_ops.placeholder(dtypes.string)
       deleter = gen_data_flow_ops._delete_session_tensor(holder)
     result = (holder, deleter)
-    graph._handle_deleters[graph_key] = result
+    graph._handle_deleters[deleter_key] = result
   return result
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index fa015856cecac5ecbd329ecc00a97cd5d8af57d9..b8e356c78ccb5b840d46fdf530f61403101a3a4f 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -136,12 +136,13 @@ def _SparseTensorDenseMatMulGrad(op, grad):
   Raises:
     TypeError: When the two operands don't have the same type.
   """
-  sp_t = sparse_tensor.SparseTensor(*op.inputs[:3])
+  a_indices, a_values, a_shape = op.inputs[:3]
+  b = op.inputs[3]
   adj_a = op.get_attr("adjoint_a")
   adj_b = op.get_attr("adjoint_b")
 
-  a_type = sp_t.values.dtype.base_dtype
-  b_type = op.inputs[3].dtype.base_dtype
+  a_type = a_values.dtype.base_dtype
+  b_type = b.dtype.base_dtype
   if a_type != b_type:
     raise TypeError("SparseTensorDenseMatMul op received operands with "
                     "different types: ", a_type, " and ", b_type)
@@ -150,15 +151,12 @@ def _SparseTensorDenseMatMulGrad(op, grad):
                               "complex gradients.")
 
   # gradient w.r.t. dense
-  b_grad = sparse_ops.sparse_tensor_dense_matmul(sp_t, grad,
-                                                 adjoint_a=not adj_a)
+  b_grad = gen_sparse_ops._sparse_tensor_dense_mat_mul(  # pylint: disable=protected-access
+      a_indices, a_values, a_shape, grad, adjoint_a=not adj_a)
   if adj_b:
     b_grad = array_ops.transpose(b_grad)
 
   # gradient w.r.t. sparse values
-  a_indices = op.inputs[0]
-  b = op.inputs[3]
-
   rows = a_indices[:, 0]
   cols = a_indices[:, 1]
 
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 9f4e6607d105bd54deefda6a441a901bde970ac6..ee08ef534b6cd106ee59e90522a7f4f4c9b5e405 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -51,6 +51,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -240,6 +241,8 @@ def sparse_add(a, b, thresh=0):
   of arguments does not matter.  Use vanilla `tf.add()` for adding two dense
   `Tensor`s.
 
+  The shapes of the two operands must match: broadcasting is not supported.
+
   The indices of any input `SparseTensor` are assumed ordered in standard
   lexicographic order.  If this is not the case, before this step run
   `SparseReorder` to restore index ordering.
@@ -288,12 +291,21 @@ def sparse_add(a, b, thresh=0):
 
   if all(isinstance(inp, sparse_classes) for inp in [a, b]):
     a = _convert_to_sparse_tensor(a)
+    b = _convert_to_sparse_tensor(b)
     thresh = ops.convert_to_tensor(
         thresh, dtype=a.values.dtype.real_dtype, name="thresh")
     output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add(
         a.indices, a.values, a.dense_shape,
         b.indices, b.values, b.dense_shape,
         thresh))
+
+    # Attempt to get output_shape statically.
+    a.get_shape().assert_is_compatible_with(b.get_shape())
+    static_shape = array_ops.broadcast_static_shape(
+        a.get_shape(), b.get_shape())
+    if static_shape.is_fully_defined():
+      output_shape = static_shape.as_list()
+
     return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
   else:
     # swap to make `a` the SparseTensor.
@@ -303,6 +315,126 @@ def sparse_add(a, b, thresh=0):
         a.indices, a.values, a.dense_shape, b)
 
 
+def _sparse_cross(inputs, name=None):
+  """Generates sparse cross from a list of sparse and dense tensors.
+
+  For example, if the inputs are
+  * inputs[0]: SparseTensor with shape = [2, 2]
+    [0, 0]: "a"
+    [1, 0]: "b"
+    [1, 1]: "c"
+  * inputs[1]: SparseTensor with shape = [2, 1]
+    [0, 0]: "d"
+    [1, 0]: "e"
+  * inputs[2]: Tensor [["f"], ["g"]]
+
+  then the output will be:
+    shape = [2, 2]
+    [0, 0]: "a_X_d_X_f"
+    [1, 0]: "b_X_e_X_g"
+    [1, 1]: "c_X_e_X_g"
+
+  Args:
+    inputs: An iterable of `Tensor` or `SparseTensor`.
+    name: Optional name for the op.
+
+  Returns:
+    A `SparseTensor` of type `string`.
+  """
+  return _sparse_cross_internal(inputs=inputs, hashed_output=False, name=name)
+
+
+def _sparse_cross_hashed(inputs, num_buckets=0, hash_key=None, name=None):
+  """Generates hashed sparse cross from a list of sparse and dense tensors.
+
+  For example, if the inputs are
+  * inputs[0]: SparseTensor with shape = [2, 2]
+    [0, 0]: "a"
+    [1, 0]: "b"
+    [1, 1]: "c"
+  * inputs[1]: SparseTensor with shape = [2, 1]
+    [0, 0]: "d"
+    [1, 0]: "e"
+  * inputs[2]: Tensor [["f"], ["g"]]
+
+  then the output will be:
+    shape = [2, 2]
+    [0, 0]: FingerprintCat64(
+                Fingerprint64("f"), FingerprintCat64(
+                    Fingerprint64("d"), Fingerprint64("a")))
+    [1, 0]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("b")))
+    [1, 1]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("c")))
+
+  Args:
+    inputs: An iterable of `Tensor` or `SparseTensor`.
+    num_buckets: An `int` that is `>= 0`.
+      output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+    hash_key: Integer hash_key that will be used by the `FingerprintCat64`
+      function. If not given, will use a default key.
+    name: Optional name for the op.
+
+  Returns:
+    A `SparseTensor` of type `int64`.
+  """
+  return _sparse_cross_internal(
+      inputs=inputs,
+      hashed_output=True,
+      num_buckets=num_buckets,
+      hash_key=hash_key,
+      name=name)
+
+
+_DEFAULT_HASH_KEY = 0xDECAFCAFFE
+
+
+def _sparse_cross_internal(
+    inputs, hashed_output=False, num_buckets=0, hash_key=None, name=None):
+  """See gen_sparse_ops._sparse_cross."""
+  if not isinstance(inputs, list):
+    raise TypeError("Inputs must be a list")
+  if not all(isinstance(i, sparse_tensor.SparseTensor) or
+             isinstance(i, ops.Tensor) for i in inputs):
+    raise TypeError("All inputs must be SparseTensors")
+
+  sparse_inputs = [i for i in inputs
+                   if isinstance(i, sparse_tensor.SparseTensor)]
+  dense_inputs = [i for i in inputs
+                  if not isinstance(i, sparse_tensor.SparseTensor)]
+
+  indices = [sp_input.indices for sp_input in sparse_inputs]
+  values = [sp_input.values for sp_input in sparse_inputs]
+  shapes = [sp_input.dense_shape for sp_input in sparse_inputs]
+  out_type = dtypes.int64 if hashed_output else dtypes.string
+
+  internal_type = dtypes.string
+  for i in range(len(values)):
+    if values[i].dtype != dtypes.string:
+      values[i] = math_ops.to_int64(values[i])
+      internal_type = dtypes.int64
+  for i in range(len(dense_inputs)):
+    if dense_inputs[i].dtype != dtypes.string:
+      dense_inputs[i] = math_ops.to_int64(dense_inputs[i])
+      internal_type = dtypes.int64
+
+  indices_out, values_out, shape_out = gen_sparse_ops._sparse_cross(
+      indices=indices,
+      values=values,
+      shapes=shapes,
+      dense_inputs=dense_inputs,
+      hashed_output=hashed_output,
+      num_buckets=num_buckets,
+      hash_key=hash_key or _DEFAULT_HASH_KEY,
+      out_type=out_type,
+      internal_type=internal_type,
+      name=name)
+
+  return sparse_tensor.SparseTensor(indices_out, values_out, shape_out)
+
+
 def sparse_dense_cwise_add(sp_t, dense_t):
   """Adds up a SparseTensor and a dense Tensor, using these special rules:
 
@@ -368,8 +500,12 @@ def sparse_reorder(sp_input, name=None):
   reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder(
       sp_input.indices, sp_input.values, sp_input.dense_shape, name=name))
 
-  return sparse_tensor.SparseTensor(reordered_ind, reordered_val,
-                                    array_ops.identity(sp_input.dense_shape))
+  if sp_input.get_shape().is_fully_defined():
+    dense_shape = sp_input.get_shape().as_list()
+  else:
+    dense_shape = array_ops.identity(sp_input.dense_shape)
+
+  return sparse_tensor.SparseTensor(reordered_ind, reordered_val, dense_shape)
 
 
 def sparse_reshape(sp_input, shape, name=None):
@@ -416,13 +552,30 @@ def sparse_reshape(sp_input, shape, name=None):
 
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
+    ValueError:  If argument `shape` requests a `SparseTensor` with a different
+      number of elements than `sp_input`.
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
+  shape = math_ops.cast(shape, dtype=dtypes.int64)
 
   with ops.name_scope(name, "SparseReshape", [sp_input]) as name:
     reshaped_ind, reshaped_shape = gen_sparse_ops._sparse_reshape(
         sp_input.indices, sp_input.dense_shape, shape, name=name)
 
+    reshaped_shape_const = tensor_util.constant_value(shape)
+    if (reshaped_shape_const is not None
+        and sp_input.get_shape().is_fully_defined()):
+      # Don't deal with inferred dimensions. That would add significant code.
+      if all(n >= 0 for n in reshaped_shape_const):
+        reshaped_size = np.prod(reshaped_shape_const)
+        in_shape_size = np.prod(sp_input.get_shape().as_list())
+        if reshaped_size != in_shape_size:
+          raise ValueError(
+              "Cannot reshape a tensor with %d elements to shape %s "
+              "(%d elements)."
+              % (in_shape_size, reshaped_shape_const, reshaped_size))
+        reshaped_shape = reshaped_shape_const
+
     return sparse_tensor.SparseTensor(
         reshaped_ind, array_ops.identity(sp_input.values),
         reshaped_shape)
@@ -986,6 +1139,8 @@ def sparse_reset_shape(sp_input, new_shape=None):
     TypeError: If `sp_input` is not a `SparseTensor`.
     ValueError: If `new_shape` represents a tensor with a different rank from
       that of `sp_input` (if shapes are known when graph is constructed).
+    ValueError:  If `new_shape` is determined during graph build to have
+      dimension sizes that are too small.
     OpError:
       - If `new_shape` has dimension sizes that are too small.
       - If shapes are not known during graph construction time, and during run
@@ -1009,14 +1164,27 @@ def sparse_reset_shape(sp_input, new_shape=None):
     # error before the sparse_tensor.SparseTensor catches it.
     output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0])
 
-    # For cases where shape is not known during graph construction.
-    output_shape_tensor = control_flow_ops.with_dependencies(
-        [check_ops.assert_equal(
-            array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))],
-        output_shape_tensor)
-    output_shape_tensor = control_flow_ops.with_dependencies(
-        [check_ops.assert_less_equal(in_shape, output_shape_tensor)],
+    output_shape_tensor_const = tensor_util.constant_value(
         output_shape_tensor)
+    # For cases where all shapes are known during graph construction
+    if (output_shape_tensor_const is not None
+        and sp_input.get_shape().is_fully_defined()):
+      in_shape_const = np.array(sp_input.get_shape().as_list())
+      if not np.all(in_shape_const <= output_shape_tensor_const):
+        raise ValueError(
+            "Requested new_shape should have dimension sizes >= sp_input.shape."
+            "  Found new_shape (%s), sp_input.shape (%s)."
+            % (in_shape_const, output_shape_tensor_const))
+      output_shape_tensor = output_shape_tensor_const
+    else:
+      # For cases where shape is not known during graph construction.
+      output_shape_tensor = control_flow_ops.with_dependencies(
+          [check_ops.assert_equal(
+              array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))],
+          output_shape_tensor)
+      output_shape_tensor = control_flow_ops.with_dependencies(
+          [check_ops.assert_less_equal(in_shape, output_shape_tensor)],
+          output_shape_tensor)
 
   return sparse_tensor.SparseTensor(in_indices, in_values, output_shape_tensor)
 
@@ -1229,36 +1397,72 @@ def sparse_tensor_dense_matmul(sp_a,
   # pylint: disable=line-too-long
   """Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 
-  No validity checking is performed on the indices of A.  However, the following
-  input format is recommended for optimal behavior:
+  No validity checking is performed on the indices of `A`.  However, the
+  following input format is recommended for optimal behavior:
+
+  * If `adjoint_a == false`: `A` should be sorted in lexicographically
+    increasing order.  Use `sparse_reorder` if you're not sure.
+  * If `adjoint_a == true`: `A` should be sorted in order of increasing
+    dimension 1 (i.e., "column major" order instead of "row major" order).
+
+  Using `tf.nn.embedding_lookup_sparse` for sparse multiplication:
+
+  It's not obvious but you can consider `embedding_lookup_sparse` as another
+  sparse and dense multiplication. In some situations, you may prefer to use
+  `embedding_lookup_sparse` even though you're not dealing with embeddings.
+
+  There are two questions to ask in the decision process: Do you need gradients
+  computed as sparse too? Is your sparse data represented as two
+  `SparseTensor`s: ids and values? There is more explanation about data format
+  below. If you answer any of these questions as yes, consider using
+  `tf.nn.embedding_lookup_sparse`.
+
+  Following explains differences between the expected SparseTensors:
+  For example if dense form of your sparse data has shape `[3, 5]` and values:
+
+      [[  a      ]
+       [b       c]
+       [    d    ]]
+
+
+  `SparseTensor` format expected by `sparse_tensor_dense_matmul`:
+   `sp_a` (indices, values):
+
+      [0, 1]: a
+      [1, 0]: b
+      [1, 4]: c
+      [2, 2]: d
+
+  `SparseTensor` format expected by `embedding_lookup_sparse`:
+   `sp_ids`                 `sp_weights`
+
+      [0, 0]: 1                [0, 0]: a
+      [1, 0]: 0                [1, 0]: b
+      [1, 1]: 4                [1, 1]: c
+      [2, 0]: 2                [2, 0]: d
 
-  if adjoint_a == false:
-    A should be sorted in lexicographically increasing order.  Use
-    sparse_reorder if you're not sure.
-  if adjoint_a == true:
-    A should be sorted in order of increasing dimension 1 (i.e., "column major"
-    order instead of "row major" order).
 
-  Deciding when to use sparse_tensor_dense_matmul vs. matmul(sp_a=True):
+  Deciding when to use `sparse_tensor_dense_matmul` vs.
+  `matmul`(a_is_sparse=True):
 
   There are a number of questions to ask in the decision process, including:
 
-  * Will the SparseTensor A fit in memory if densified?
+  * Will the SparseTensor `A` fit in memory if densified?
   * Is the column count of the product large (>> 1)?
-  * Is the density of A larger than approximately 15%?
+  * Is the density of `A` larger than approximately 15%?
 
   If the answer to several of these questions is yes, consider
   converting the `SparseTensor` to a dense one and using `tf.matmul` with
-  `sp_a=True`.
+  `a_is_sparse=True`.
 
-  This operation tends to perform well when A is more sparse, if the column size
-  of the product is small (e.g. matrix-vector multiplication), if
+  This operation tends to perform well when `A` is more sparse, if the column
+  size of the product is small (e.g. matrix-vector multiplication), if
   `sp_a.dense_shape` takes on large values.
 
-  Below is a rough speed comparison between sparse_tensor_dense_matmul,
-  labelled 'sparse', and matmul(sp_a=True), labelled 'dense'.  For purposes of
-  the comparison, the time spent converting from a SparseTensor to a dense
-  Tensor is not included, so it is overly conservative with respect to
+  Below is a rough speed comparison between `sparse_tensor_dense_matmul`,
+  labeled 'sparse', and `matmul`(a_is_sparse=True), labeled 'dense'.  For
+  purposes of the comparison, the time spent converting from a `SparseTensor` to
+  a dense `Tensor` is not included, so it is overly conservative with respect to
   the time ratio.
 
   Benchmark system:
@@ -1383,9 +1587,9 @@ def sparse_tensor_dense_matmul(sp_a,
 
   Returns:
     A dense matrix (pseudo-code in dense np.matrix notation):
-      A = A.H if adjoint_a else A
-      B = B.H if adjoint_b else B
-      return A*B
+      `A = A.H if adjoint_a else A`
+      `B = B.H if adjoint_b else B`
+      `return A*B`
   """
   # pylint: enable=line-too-long
   sp_a = _convert_to_sparse_tensor(sp_a)
@@ -1560,7 +1764,7 @@ def sparse_transpose(sp_input, perm=None, name=None):
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
   """
-  with ops.op_scope([sp_input], name, "SparseTranspose") as name:
+  with ops.name_scope(name, "SparseTranspose", [sp_input]) as name:
     if perm is None:
       rank = array_ops.rank(sp_input)
       perm = (rank - 1) - math_ops.range(0, rank, 1)
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index e24246464ec85ebd0ade33ba5bc81a401787535f..b561203bb474a366cf32dea05dadb032fa510e44 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -35,19 +35,20 @@ from tensorflow.python.platform import tf_logging as logging
 
 # TODO(b/27419586) Change docstring for required dtype of x once int allowed
 def lbeta(x, name='lbeta'):
-  r"""Computes `ln(|Beta(x)|)`, reducing along the last dimension.
+  r"""Computes \\(ln(|Beta(x)|)\\), reducing along the last dimension.
 
   Given one-dimensional `z = [z_0,...,z_{K-1}]`, we define
 
-  ```Beta(z) = \prod_j Gamma(z_j) / Gamma(\sum_j z_j)```
+  $$Beta(z) = \prod_j Gamma(z_j) / Gamma(\sum_j z_j)$$
 
   And for `n + 1` dimensional `x` with shape `[N1, ..., Nn, K]`, we define
-  `lbeta(x)[i1, ..., in] = Log(|Beta(x[i1, ..., in, :])|)`.  In other words,
-  the last dimension is treated as the `z` vector.
+  $$lbeta(x)[i1, ..., in] = Log(|Beta(x[i1, ..., in, :])|)$$.
+
+  In other words, the last dimension is treated as the `z` vector.
 
   Note that if `z = [u, v]`, then
-  `Beta(z) = int_0^1 t^{u-1} (1 - t)^{v-1} dt`, which defines the traditional
-  bivariate beta function.
+  \\(Beta(z) = int_0^1 t^{u-1} (1 - t)^{v-1} dt\\), which defines the
+  traditional bivariate beta function.
 
   If the last dimension is empty, we follow the convention that the sum over
   the empty set is zero, and the product is one.
@@ -57,7 +58,7 @@ def lbeta(x, name='lbeta'):
     name: A name for the operation (optional).
 
   Returns:
-    The logarithm of `|Beta(x)|` reducing along the last dimension.
+    The logarithm of \\(|Beta(x)|\\) reducing along the last dimension.
   """
   # In the event that the last dimension has zero entries, we return -inf.
   # This is consistent with a convention that the sum over the empty set 0, and
@@ -423,7 +424,7 @@ def _exponential_space_einsum(equation, *inputs):
   missing_idx = set(idx_out).difference(idx_all)
   if missing_idx:
     raise ValueError(
-        'Unknown ouput axes: %s' % missing_idx
+        'Unknown output axes: %s' % missing_idx
     )
 
   axis_order = {}
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 9d24eb242d6e504a8d4b492e14d9a086c0bb74c5..a6b14f6f6f35a497f20908868a9ef5f2dfeef48e 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -57,6 +57,8 @@ from tensorflow.python.ops.io_ops import *
 from tensorflow.python.ops.linalg_ops import *
 from tensorflow.python.ops.logging_ops import Print
 from tensorflow.python.ops.logging_ops import get_summary_op
+from tensorflow.python.ops.lookup_ops import initialize_all_tables
+from tensorflow.python.ops.lookup_ops import tables_initializer
 from tensorflow.python.ops.math_ops import *
 from tensorflow.python.ops.numerics import *
 from tensorflow.python.ops.parsing_ops import *
@@ -145,6 +147,7 @@ _allowed_symbols_math_ops = [
     # These are documented in nn.
     # We are are not importing nn because it would create a circular dependency.
     "sigmoid",
+    "log_sigmoid",
     "tanh",
 ]
 
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 2a64cb7b705647a154de803f4b6e2ca934db8e39..63394d52145209c9d409d7b5f85fec58039c26bb 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -209,7 +209,7 @@ def assign_sub(ref, value, use_locking=None, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.assign_sub(
         ref, value, use_locking=use_locking, name=name)
-  return ref.assign_sub(value, name=name)
+  return ref.assign_sub(value)
 
 
 def assign_add(ref, value, use_locking=None, name=None):
@@ -237,14 +237,15 @@ def assign_add(ref, value, use_locking=None, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.assign_add(
         ref, value, use_locking=use_locking, name=name)
-  return ref.assign_add(value, name=name)
+  return ref.assign_add(value)
 
 
 def assign(ref, value, validate_shape=None, use_locking=None, name=None):
   """Update 'ref' by assigning 'value' to it.
 
-  This operation outputs "ref" after the assignment is done.
-  This makes it easier to chain operations that need to use the reset value.
+  This operation outputs a Tensor that holds the new value of 'ref' after
+    the value has been assigned. This makes it easier to chain operations
+    that need to use the reset value.
 
   Args:
     ref: A mutable `Tensor`.
@@ -261,11 +262,11 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None):
     name: A name for the operation (optional).
 
   Returns:
-    Same as "ref".  Returned as a convenience for operations that want
-    to use the new value after the variable has been reset.
+    A `Tensor` that will hold the new value of 'ref' after
+      the assignment has completed.
   """
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.assign(
         ref, value, use_locking=use_locking, name=name,
         validate_shape=validate_shape)
-  return ref.assign(value, name=name)
+  return ref.assign(value)
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 80dd74521bed07f22b519b5869b8352ecd7a8f78..48be9e2cdae685a0d72d0a773a37d76e74f179a0 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -261,20 +261,23 @@ class Template(object):
           return self._call_func(args, kwargs, check_for_new_variables=True)
       else:
         # This is the first visit to __call__, but the scope has already been
-        # created in the constructor. Set _variables_created so that subsequent
-        # calls take the if branch above.
-        self._variables_created = True
+        # created in the constructor. Set _variables_created after the inner
+        # function is successfully called so that subsequent calls take the if
+        # branch above.
         with variable_scope.variable_scope(self._variable_scope):
-          return self._call_func(args, kwargs, check_for_new_variables=False)
+          result = self._call_func(args, kwargs, check_for_new_variables=False)
+          self._variables_created = True
+          return result
     else:
       # The scope was not created at construction time, so create it here.
       # Subsequent calls should reuse variables.
-      self._variables_created = True
       with variable_scope.variable_scope(
           self._unique_name, self._name,
           custom_getter=self._custom_getter) as vs:
         self._variable_scope = vs
-        return self._call_func(args, kwargs, check_for_new_variables=False)
+        result = self._call_func(args, kwargs, check_for_new_variables=False)
+        self._variables_created = True
+        return result
 
   @property
   def variable_scope(self):
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index d1013c53ddfa2c936eaf780e5e135fb979edbdc1..8b119f58421c909fd6fd3c27fe587cc8e8d74d9c 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -22,30 +22,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
+
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import math_ops
-
-
-def _maybe_set_device(handle_op, value_t):
-  # NOTE(ebrevdo): Do not try this at home, kids
-  # _______________________________________________
-  # | I WILL NOT ACCESS PRIVATE METHODS ^^^^^^^^\ |
-  # | I WILL NOT ACCESS PRIVATE METHODS |       | |
-  # | I WILL NOT ACCESS PRIVATE METHODS |_ __   | |
-  # | I WILL NOT ACCESS PRIVATE METHODS (.(. )  | |
-  # | I WILL NOT ACCESS PRIVATE         (_      ) |
-  # |                           \\      /___/' /  |
-  # |                           _\\_      \    |  |
-  # |                          ((   )     /====|  |
-  # |                           \  <.__._-      \ |
-  # |___________________________ <//___.         ||
-  #
-  if not handle_op.device and value_t.device:
-    handle_op._set_device(value_t.device)  # pylint: disable=protected-access
+from tensorflow.python.util import tf_should_use
 
 
 # TensorArray object accesses many of the hidden generated ops, but is
@@ -131,6 +116,12 @@ class TensorArray(object):
     dynamic_size = dynamic_size or False
 
     self._dtype = dtype
+
+    # Used to keep track of what tensors the TensorArray should be
+    # colocated with.  We choose to colocate the TensorArray with the
+    # first tensor written to it.
+    self._colocate_with = []
+
     # Record the current static shape for the array elements. The element
     # shape is defined either by `element_shape` or the shape of the tensor
     # of the first write. If `infer_shape` is true, all writes checks for
@@ -196,6 +187,24 @@ class TensorArray(object):
     else:
       self._element_shape.append(shape)
 
+  @contextlib.contextmanager
+  def _maybe_colocate_with(self, value):
+    """Colocate operations with an internal colocation group or `value`.
+
+    Args:
+      value: `Tensor`, the tensor to try to colocate with.
+
+    Yields:
+      Does not yield anything, but the new context is a colocation context.
+
+    If no internal colocation group is set, colocate with `value` and set
+    the internal colocation group to be value.
+    """
+    if not self._colocate_with:
+      self._colocate_with.append(value)
+    with ops.colocate_with(self._colocate_with[0]):
+      yield
+
   def identity(self):
     """Returns a TensorArray with the same content and properties.
 
@@ -208,6 +217,7 @@ class TensorArray(object):
     ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow,
                      infer_shape=self._infer_shape)
     ta._element_shape = self._element_shape
+    ta._colocate_with = self._colocate_with
     return ta
 
   def grad(self, source, flow=None, name=None):
@@ -241,17 +251,17 @@ class TensorArray(object):
     Returns:
       The tensor at index `index`.
     """
-    with ops.colocate_with(self._handle):
-      value = gen_data_flow_ops._tensor_array_read_v3(
-          handle=self._handle,
-          index=index,
-          flow_in=self._flow,
-          dtype=self._dtype,
-          name=name)
-      if self._element_shape:
-        value.set_shape(self._element_shape[0].dims)
-      return value
+    value = gen_data_flow_ops._tensor_array_read_v3(
+        handle=self._handle,
+        index=index,
+        flow_in=self._flow,
+        dtype=self._dtype,
+        name=name)
+    if self._element_shape:
+      value.set_shape(self._element_shape[0].dims)
+    return value
 
+  @tf_should_use.should_use_result
   def write(self, index, value, name=None):
     """Write `value` into index `index` of the TensorArray.
 
@@ -269,8 +279,7 @@ class TensorArray(object):
     """
     with ops.name_scope(name, "TensorArrayWrite", [self._handle, index, value]):
       value = ops.convert_to_tensor(value, name="value")
-      _maybe_set_device(self._handle.op, value)
-      with ops.colocate_with(self._handle):
+      with self._maybe_colocate_with(value):
         flow_out = gen_data_flow_ops._tensor_array_write_v3(
             handle=self._handle,
             index=index,
@@ -280,6 +289,7 @@ class TensorArray(object):
       ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
       if ta._infer_shape:
         ta._merge_element_shape(value.get_shape())
       return ta
@@ -314,21 +324,20 @@ class TensorArray(object):
     Returns:
       The in the `TensorArray` selected by `indices`, packed into one tensor.
     """
-    with ops.colocate_with(self._handle):
-      if self._element_shape:
-        element_shape = self._element_shape[0]
-      else:
-        element_shape = tensor_shape.TensorShape(None)
-      value = gen_data_flow_ops._tensor_array_gather_v3(
-          handle=self._handle,
-          indices=indices,
-          flow_in=self._flow,
-          dtype=self._dtype,
-          name=name,
-          element_shape=element_shape)
-      if self._element_shape and self._element_shape[0].dims is not None:
-        value.set_shape([None] + self._element_shape[0].dims)
-      return value
+    if self._element_shape:
+      element_shape = self._element_shape[0]
+    else:
+      element_shape = tensor_shape.TensorShape(None)
+    value = gen_data_flow_ops._tensor_array_gather_v3(
+        handle=self._handle,
+        indices=indices,
+        flow_in=self._flow,
+        dtype=self._dtype,
+        name=name,
+        element_shape=element_shape)
+    if self._element_shape and self._element_shape[0].dims is not None:
+      value.set_shape([None] + self._element_shape[0].dims)
+    return value
 
   def concat(self, name=None):
     """Return the values in the TensorArray as a concatenated `Tensor`.
@@ -347,17 +356,17 @@ class TensorArray(object):
           tensor_shape.TensorShape(self._element_shape[0].dims[1:]))
     else:
       element_shape_except0 = tensor_shape.TensorShape(None)
-    with ops.colocate_with(self._handle):
-      value, _ = gen_data_flow_ops._tensor_array_concat_v3(
-          handle=self._handle,
-          flow_in=self._flow,
-          dtype=self._dtype,
-          name=name,
-          element_shape_except0=element_shape_except0)
-      if self._element_shape and self._element_shape[0].dims is not None:
-        value.set_shape([None] + self._element_shape[0].dims[1:])
-      return value
+    value, _ = gen_data_flow_ops._tensor_array_concat_v3(
+        handle=self._handle,
+        flow_in=self._flow,
+        dtype=self._dtype,
+        name=name,
+        element_shape_except0=element_shape_except0)
+    if self._element_shape and self._element_shape[0].dims is not None:
+      value.set_shape([None] + self._element_shape[0].dims[1:])
+    return value
 
+  @tf_should_use.should_use_result
   def unstack(self, value, name=None):
     """Unstack the values of a `Tensor` in the TensorArray.
 
@@ -380,6 +389,7 @@ class TensorArray(object):
       return self.scatter(
           indices=math_ops.range(0, num_elements), value=value, name=name)
 
+  @tf_should_use.should_use_result
   def scatter(self, indices, value, name=None):
     """Scatter the values of a `Tensor` in specific indices of a `TensorArray`.
 
@@ -399,8 +409,7 @@ class TensorArray(object):
     with ops.name_scope(name, "TensorArrayScatter",
                         [self._handle, value, indices]):
       value = ops.convert_to_tensor(value, name="value")
-      _maybe_set_device(self._handle.op, value)
-      with ops.colocate_with(self._handle):
+      with self._maybe_colocate_with(value):
         flow_out = gen_data_flow_ops._tensor_array_scatter_v3(
             handle=self._handle,
             indices=indices,
@@ -410,6 +419,7 @@ class TensorArray(object):
       ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
       if ta._infer_shape:
         val_shape = flow_out.op.inputs[2].get_shape()
         element_shape = tensor_shape.unknown_shape()
@@ -418,6 +428,7 @@ class TensorArray(object):
         ta._merge_element_shape(element_shape)
       return ta
 
+  @tf_should_use.should_use_result
   def split(self, value, lengths, name=None):
     """Split the values of a `Tensor` into the TensorArray.
 
@@ -437,9 +448,8 @@ class TensorArray(object):
     with ops.name_scope(name, "TensorArraySplit",
                         [self._handle, value, lengths]):
       value = ops.convert_to_tensor(value, name="value")
-      _maybe_set_device(self._handle.op, value)
-      lengths_64 = math_ops.to_int64(lengths)
-      with ops.colocate_with(self._handle):
+      with self._maybe_colocate_with(value):
+        lengths_64 = math_ops.to_int64(lengths)
         flow_out = gen_data_flow_ops._tensor_array_split_v3(
             handle=self._handle,
             value=value,
@@ -449,6 +459,7 @@ class TensorArray(object):
       ta = TensorArray(dtype=self._dtype, handle=self._handle, flow=flow_out)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
       if ta._infer_shape:
         val_shape = flow_out.op.inputs[1].get_shape()
         clengths = tensor_util.constant_value(flow_out.op.inputs[2])
@@ -462,14 +473,13 @@ class TensorArray(object):
 
   def size(self, name=None):
     """Return the size of the TensorArray."""
-    with ops.colocate_with(self._handle):
-      return gen_data_flow_ops._tensor_array_size_v3(
-          handle=self._handle, flow_in=self.flow, name=name)
+    return gen_data_flow_ops._tensor_array_size_v3(
+        handle=self._handle, flow_in=self.flow, name=name)
 
+  @tf_should_use.should_use_result
   def close(self, name=None):
     """Close the current TensorArray."""
-    with ops.colocate_with(self._handle):
-      return gen_data_flow_ops._tensor_array_close_v3(
-          handle=self._handle, name=name)
+    return gen_data_flow_ops._tensor_array_close_v3(
+        handle=self._handle, name=name)
 
 # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 2f97abdc7919159144f88180fe9ef3b3860e4ffe..aceffd373af18ee55d4c3f3ecb44c5307a99fc0c 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -20,7 +20,6 @@ from __future__ import division
 from __future__ import print_function
 
 import collections as collections_lib
-import contextlib
 import copy
 import functools
 import traceback
@@ -36,6 +35,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_contextlib
 
 __all__ = ["VariableScope", "get_variable_scope",
            "get_variable", "get_local_variable", "variable_scope",
@@ -280,6 +280,17 @@ class _VariableStore(object):
       raise ValueError(
           "Passed a custom_getter which is not callable: %s" % custom_getter)
 
+    # If a *_ref type is passed in an error would be triggered further down the
+    # stack. We prevent this using base_dtype to get a non-ref version of the
+    # type, before doing anything else. When _ref types are removed in favor of
+    # resources, this line can be removed.
+    try:
+      dtype = dtype.base_dtype
+    except AttributeError:
+      # .base_dtype not existing means that we will try and use the raw dtype
+      # which was passed in - this might be a NumPy type which is valid.
+      pass
+
     # This is the main logic of get_variable.  However, custom_getter
     # may override this logic.  So we save it as a callable and pass
     # it to custom_getter.
@@ -904,6 +915,7 @@ class VariableScope(object):
                    dtype=None,
                    initializer=None,
                    regularizer=None,
+                   reuse=None,
                    trainable=True,
                    collections=None,
                    caching_device=None,
@@ -920,6 +932,8 @@ class VariableScope(object):
       partitioner = self._partitioner
     if custom_getter is None:
       custom_getter = self._custom_getter
+    if reuse is None:
+      reuse = self._reuse
 
     full_name = self.name + "/" + name if self.name else name
     # Variable names only depend on variable_scope (full_name here),
@@ -942,7 +956,7 @@ class VariableScope(object):
 
       return var_store.get_variable(
           full_name, shape=shape, dtype=dtype, initializer=initializer,
-          regularizer=regularizer, reuse=self.reuse, trainable=trainable,
+          regularizer=regularizer, reuse=reuse, trainable=trainable,
           collections=collections, caching_device=caching_device,
           partitioner=partitioner, validate_shape=validate_shape,
           use_resource=use_resource, custom_getter=custom_getter)
@@ -971,6 +985,8 @@ class VariableScope(object):
       partitioner = self._partitioner
     if dtype is None:
       dtype = self._dtype
+    if use_resource is None:
+      use_resource = self._use_resource
 
     if self._custom_getter is not None:
       raise ValueError(
@@ -1245,7 +1261,7 @@ def _get_partitioned_variable(name,
   # pylint: enable=protected-access
 
 
-@contextlib.contextmanager
+@tf_contextlib.contextmanager
 def _pure_variable_scope(name_or_scope,
                          reuse=None,
                          initializer=None,
@@ -1276,7 +1292,7 @@ def _pure_variable_scope(name_or_scope,
       well-defined semantics. Defaults to False (will later change to True).
 
   Yields:
-    A scope that can be to captured and reused.
+    A scope that can be captured and reused.
 
   Raises:
     ValueError: when trying to reuse within a create scope, or create within
@@ -1404,7 +1420,7 @@ def _get_unique_variable_scope(prefix):
 
 
 # pylint: disable=g-doc-return-or-yield
-@contextlib.contextmanager
+@tf_contextlib.contextmanager
 def variable_scope(name_or_scope,
                    default_name=None,
                    values=None,
@@ -1482,6 +1498,9 @@ def variable_scope(name_or_scope,
   Note that the `reuse` flag is inherited: if we open a reusing scope,
   then all its sub-scopes become reusing as well.
 
+  A note about name scoping: Setting `reuse` does not impact the naming of other
+  ops such as mult. See related discussion on [github#6189](https://github.com/tensorflow/tensorflow/issues/6189)
+
   Args:
     name_or_scope: `string` or `VariableScope`: the scope to open.
     default_name: The default name to use if the `name_or_scope` argument is
@@ -1577,7 +1596,7 @@ def variable_scope(name_or_scope,
 
 
 # pylint: disable=g-doc-return-or-yield
-@contextlib.contextmanager
+@tf_contextlib.contextmanager
 def variable_op_scope(values,
                       name_or_scope,
                       default_name=None,
@@ -1634,3 +1653,22 @@ def _compute_slice_dim_and_shape(full_shape, slicing):
   if slice_dim is None:
     slice_dim = 0
   return slice_dim, slice_shape
+
+
+def variable(initial_value=None,
+             trainable=True,
+             collections=None,
+             validate_shape=True,
+             caching_device=None,
+             name=None,
+             dtype=None):
+  if get_variable_scope().use_resource:
+    return resource_variable_ops.ResourceVariable(
+        initial_value=initial_value, trainable=trainable,
+        collections=collections, validate_shape=validate_shape,
+        caching_device=caching_device, name=name, dtype=dtype)
+  else:
+    return variables.Variable(
+        initial_value=initial_value, trainable=trainable,
+        collections=collections, validate_shape=validate_shape,
+        caching_device=caching_device, name=name, dtype=dtype)
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 6a3dc75096e018ed59fc1c85b23eddc553406581..778cb3aac0709069974b498fcc09d9c7919c7863 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -24,9 +24,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
 
 
@@ -163,8 +165,9 @@ class Variable(object):
       name: Optional name for the variable. Defaults to `'Variable'` and gets
         uniquified automatically.
       variable_def: `VariableDef` protocol buffer. If not `None`, recreates
-        the Variable object with its contents. `variable_def` and the other
-        arguments are mutually exclusive.
+        the Variable object with its contents, referencing the variable's nodes
+        in the graph, which must already exist. The graph is not changed.
+        `variable_def` and the other arguments are mutually exclusive.
       dtype: If set, initial_value will be converted to the given type.
         If `None`, either the datatype will be kept (if `initial_value` is
         a Tensor), or `convert_to_tensor` will decide.
@@ -320,10 +323,11 @@ class Variable(object):
     self._save_slice_info = None
 
   def _init_from_proto(self, variable_def, import_scope=None):
-    """Creates a new variable from `VariableDef` protocol buffer.
+    """Recreates the Variable object from a `VariableDef` protocol buffer.
 
     Args:
-      variable_def: `VariableDef` protocol buffer.
+      variable_def: `VariableDef` protocol buffer, describing a variable
+          whose nodes already exists in the graph.
       import_scope: Optional `string`. Name scope to add.
     """
     assert isinstance(variable_def, variable_pb2.VariableDef)
@@ -566,6 +570,29 @@ class Variable(object):
         sparse_delta.values,
         use_locking=use_locking)
 
+  def _strided_slice_assign(self,
+                            begin,
+                            end,
+                            strides,
+                            value,
+                            name,
+                            begin_mask,
+                            end_mask,
+                            ellipsis_mask,
+                            new_axis_mask,
+                            shrink_axis_mask):
+    return gen_array_ops.strided_slice_assign(ref=self._ref(),
+                                              begin=begin,
+                                              end=end,
+                                              strides=strides,
+                                              value=value,
+                                              name=name,
+                                              begin_mask=begin_mask,
+                                              end_mask=end_mask,
+                                              ellipsis_mask=ellipsis_mask,
+                                              new_axis_mask=new_axis_mask,
+                                              shrink_axis_mask=shrink_axis_mask)
+
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
 
@@ -1150,6 +1177,7 @@ def variables_initializer(var_list, name="init"):
   return control_flow_ops.no_op(name=name)
 
 
+@tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.variables_initializer` instead.")
 def initialize_variables(var_list, name="init"):
   """See `tf.variables_initializer`."""
@@ -1159,7 +1187,7 @@ def initialize_variables(var_list, name="init"):
 def global_variables_initializer():
   """Returns an Op that initializes global variables.
 
-  This is just a shortcut for `variable_initializers(global_variables())`
+  This is just a shortcut for `variable_initializer(global_variables())`
 
   Returns:
     An Op that initializes global variables in the graph.
@@ -1167,6 +1195,7 @@ def global_variables_initializer():
   return variables_initializer(global_variables())
 
 
+@tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.global_variables_initializer` instead.")
 def initialize_all_variables():
   """See `tf.global_variables_initializer`."""
@@ -1176,7 +1205,7 @@ def initialize_all_variables():
 def local_variables_initializer():
   """Returns an Op that initializes all local variables.
 
-  This is just a shortcut for `variable_initializers(local_variables())`
+  This is just a shortcut for `variable_initializer(local_variables())`
 
   Returns:
     An Op that initializes all local variables in the graph.
@@ -1184,12 +1213,14 @@ def local_variables_initializer():
   return variables_initializer(local_variables())
 
 
+@tf_should_use.should_use_result
 @deprecated("2017-03-02", "Use `tf.local_variables_initializer` instead.")
 def initialize_local_variables():
   """See `tf.local_variables_initializer`."""
   return local_variables_initializer()
 
 
+@tf_should_use.should_use_result
 def is_variable_initialized(variable):
   """Tests if a variable has been initialized.
 
@@ -1203,6 +1234,7 @@ def is_variable_initialized(variable):
   return state_ops.is_variable_initialized(variable)
 
 
+@tf_should_use.should_use_result
 def assert_variables_initialized(var_list=None):
   """Returns an Op to check if variables are initialized.
 
@@ -1244,6 +1276,7 @@ def assert_variables_initialized(var_list=None):
       return array_ops.stack(ranks)
 
 
+@tf_should_use.should_use_result
 def report_uninitialized_variables(var_list=None,
                                    name="report_uninitialized_variables"):
   """Adds ops to list the names of uninitialized variables.
@@ -1284,8 +1317,6 @@ def report_uninitialized_variables(var_list=None,
       return array_ops.boolean_mask(variable_names_tensor, variables_mask)
 
 # pylint: disable=protected-access
-ops.register_tensor_conversion_function(Variable,
-                                        Variable._TensorConversionFunction)
 Variable._OverloadAllOperators()
 
 ops.register_tensor_conversion_function(
diff --git a/tensorflow/python/ops/weights_broadcast_ops.py b/tensorflow/python/ops/weights_broadcast_ops.py
index 257b9f1faa4c2c7bd492c882bc2bdcb66b9d22d0..35e93249c31b7446be387bf165284fe54fcaa8e0 100644
--- a/tensorflow/python/ops/weights_broadcast_ops.py
+++ b/tensorflow/python/ops/weights_broadcast_ops.py
@@ -97,9 +97,10 @@ def assert_broadcastable(weights, values):
         return control_flow_ops.no_op(name="static_scalar_check_success")
       if weights_rank_static != values_rank_static:
         raise ValueError(
-            "%s values.rank=%s. weights.rank=%s." % (
+            "%s values.rank=%s. weights.rank=%s."
+            " values.shape=%s. weights.shape=%s." % (
                 _ASSERT_BROADCASTABLE_ERROR_PREFIX, values_rank_static,
-                weights_rank_static))
+                weights_rank_static, values.shape, weights.shape))
       weights_shape_static = tensor_util.constant_value(weights_shape)
       values_shape_static = tensor_util.constant_value(values_shape)
       if weights_shape_static is not None and values_shape_static is not None:
diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py
index ea29399ed2f738d1aa9c503aaabe9b7757ce1fcf..bd2ef3617059808acc9f80ad97f3adc494a612c6 100644
--- a/tensorflow/python/platform/benchmark.py
+++ b/tensorflow/python/platform/benchmark.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import numbers
 import os
 import re
@@ -33,6 +32,8 @@ from tensorflow.python.client import timeline
 from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_inspect
+
 
 # When a subclass of the Benchmark class is created, it is added to
 # the registry automatically
@@ -72,11 +73,6 @@ def _global_report_benchmark(
                  cpu_time is not None else -1, throughput if
                  throughput is not None else -1, str(extras) if extras else "")
 
-  test_env = os.environ.get(TEST_REPORTER_TEST_ENV, None)
-  if test_env is None:
-    # Reporting was not requested
-    return
-
   entries = test_log_pb2.BenchmarkEntries()
   entry = entries.entry.add()
   entry.name = name
@@ -95,6 +91,12 @@ def _global_report_benchmark(
       else:
         entry.extras[k].string_value = str(v)
 
+  test_env = os.environ.get(TEST_REPORTER_TEST_ENV, None)
+  if test_env is None:
+    # Reporting was not requested, just print the proto
+    print(str(entries))
+    return
+
   serialized_entry = entries.SerializeToString()
 
   mangled_name = name.replace("/", "__")
@@ -135,7 +137,7 @@ class Benchmark(six.with_metaclass(_BenchmarkRegistrar, object)):
     """Returns full name of class and method calling report_benchmark."""
 
     # Find the caller method (outermost Benchmark class)
-    stack = inspect.stack()
+    stack = tf_inspect.stack()
     calling_class = None
     name = None
     for frame in stack[::-1]:
diff --git a/tensorflow/python/platform/control_imports.py b/tensorflow/python/platform/control_imports.py
index 61b29ca4e57c50bec34bd2a30151651d6e173342..b8e8e78ef3bfd555c026bce44381964fbce163f3 100644
--- a/tensorflow/python/platform/control_imports.py
+++ b/tensorflow/python/platform/control_imports.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Switch between Google or open source dependencies."""
 # Switch between Google and OSS dependencies
 USE_OSS = True
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index 1e74b1512b877ed7331ea84b4154ef23f3e7fb53..96219faab719e28a5fa8a9a21c83f81a6f8478e6 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import atexit
-import inspect
 import itertools
 import os
 import sys
@@ -35,6 +34,9 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.platform import app
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
 
 Benchmark = benchmark.TensorFlowBenchmark  # pylint: disable=invalid-name
 
@@ -101,9 +103,9 @@ def GetTempDir():
   """Return a temporary directory for tests to use."""
   global _googletest_temp_dir
   if not _googletest_temp_dir:
-    first_frame = inspect.stack()[-1][0]
-    temp_dir = os.path.join(
-        tempfile.gettempdir(), os.path.basename(inspect.getfile(first_frame)))
+    first_frame = tf_inspect.stack()[-1][0]
+    temp_dir = os.path.join(tempfile.gettempdir(),
+                            os.path.basename(tf_inspect.getfile(first_frame)))
     temp_dir = tempfile.mkdtemp(prefix=temp_dir.rstrip('.py'))
 
     def delete_temp_dir(dirname=temp_dir):
@@ -204,15 +206,16 @@ class StubOutForTesting(object):
     Raises:
       AttributeError: If the attribute cannot be found.
     """
-    if (inspect.ismodule(obj) or
-        (not inspect.isclass(obj) and attr_name in obj.__dict__)):
+    _, obj = tf_decorator.unwrap(obj)
+    if (tf_inspect.ismodule(obj) or
+        (not tf_inspect.isclass(obj) and attr_name in obj.__dict__)):
       orig_obj = obj
       orig_attr = getattr(obj, attr_name)
     else:
-      if not inspect.isclass(obj):
-        mro = list(inspect.getmro(obj.__class__))
+      if not tf_inspect.isclass(obj):
+        mro = list(tf_inspect.getmro(obj.__class__))
       else:
-        mro = list(inspect.getmro(obj))
+        mro = list(tf_inspect.getmro(obj))
 
       mro.reverse()
 
diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py
index a53fc541cb7d2a3d3242f5832b0f3f93a3148784..2455acb4c0c469acbb928c4ec44571e50e06de1f 100644
--- a/tensorflow/python/platform/resource_loader.py
+++ b/tensorflow/python/platform/resource_loader.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Resource management library.
 
 @@get_data_files_path
@@ -25,10 +24,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect as _inspect
 import os as _os
 import sys as _sys
 
+from tensorflow.python.util import tf_inspect as _inspect
 from tensorflow.python.util.all_util import remove_undocumented
 
 
@@ -44,9 +43,8 @@ def load_resource(path):
   Raises:
     IOError: If the path is not found, or the resource can't be opened.
   """
-  tensorflow_root = (
-      _os.path.join(
-          _os.path.dirname(__file__), _os.pardir, _os.pardir))
+  tensorflow_root = (_os.path.join(
+      _os.path.dirname(__file__), _os.pardir, _os.pardir))
   path = _os.path.join(tensorflow_root, path)
   path = _os.path.abspath(path)
   with open(path, 'rb') as f:
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 452b8f5d3b81cbb310fa0c09551a4e3882eff2c6..5cb2c152b048db9baaa0d63360be9fd71726338e 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -27,12 +27,15 @@ See the @{$python/test} guide.
 @@gpu_device_name
 @@compute_gradient
 @@compute_gradient_error
+@@create_local_cluster
+
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+
 # pylint: disable=g-bad-import-order
 from tensorflow.python.client import device_lib as _device_lib
 from tensorflow.python.framework import test_util as _test_util
@@ -41,6 +44,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 # pylint: disable=unused-import
 from tensorflow.python.framework.test_util import assert_equal_graph_def
+from tensorflow.python.framework.test_util import create_local_cluster
 from tensorflow.python.framework.test_util import TensorFlowTestCase as TestCase
 from tensorflow.python.framework.test_util import gpu_device_name
 
@@ -108,6 +112,7 @@ def is_gpu_available(cuda_only=False):
     return any((x.device_type == 'GPU' or x.device_type == 'SYCL')
                for x in _device_lib.list_local_devices())
 
+
 _allowed_symbols = [
     # We piggy-back googletest documentation.
     'Benchmark',
diff --git a/tensorflow/python/saved_model/README.md b/tensorflow/python/saved_model/README.md
index 9fd795adb511bbe7e8a6493cf33b30247406a369..f19127ecd54af161a7bd20033d9e253ae0028c10 100644
--- a/tensorflow/python/saved_model/README.md
+++ b/tensorflow/python/saved_model/README.md
@@ -22,6 +22,8 @@ The following is a summary of the features in SavedModel:
       and outputs. This is called a `Signature`.
     * SavedModel uses [SignatureDefs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/meta_graph.proto)
       to allow generic support for signatures that may need to be saved with the graphs.
+    * For commonly used SignatureDefs in the context of TensorFlow Serving,
+      please see documentation [here](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/g3doc/signature_defs.md).
 * Support for `Assets`.
     * For cases where ops depend on external files for initialization, such as
       vocabularies, SavedModel supports this via `assets`.
@@ -100,7 +102,7 @@ The typical usage of `builder` is as follows:
 ~~~python
 export_dir = ...
 ...
-builder = saved_model_builder.SavedModelBuilder(export_dir)
+builder = tf.saved_model_builder.SavedModelBuilder(export_dir)
 with tf.Session(graph=tf.Graph()) as sess:
   ...
   builder.add_meta_graph_and_variables(sess,
@@ -130,7 +132,7 @@ the specific meta graph def, will be restored into the supplied session.
 export_dir = ...
 ...
 with tf.Session(graph=tf.Graph()) as sess:
-  loader.load(sess, [tag_constants.TRAINING], export_dir)
+  tf.saved_model.loader.load(sess, [tag_constants.TRAINING], export_dir)
   ...
 ~~~
 
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index d075a04ca2a49d9b2692490eaf4729da10b0fe0d..6899cb10a985080ad79f4574c8a8b78b57ed2038 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -57,7 +57,7 @@ class SavedModelBuilder(object):
   Typical usage for the `SavedModelBuilder`:
   ```python
   ...
-  builder = saved_model.builder.SavedModelBuilder(export_dir)
+  builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
 
   with tf.Session(graph=tf.Graph()) as sess:
     ...
@@ -96,53 +96,13 @@ class SavedModelBuilder(object):
     # weights.
     self._has_saved_variables = False
 
-  def _asset_path_from_tensor(self, path_tensor):
-    """Returns the filepath value stored in constant `path_tensor`.
-
-    Args:
-      path_tensor: Tensor of a file-path.
-
-    Returns:
-      The string value i.e. path of the tensor, if valid.
-
-    Raises:
-      TypeError if tensor does not match expected op type, dtype or value.
-    """
-    if not isinstance(path_tensor, ops.Tensor):
-      raise TypeError("Asset path tensor must be a Tensor.")
-    if path_tensor.op.type != "Const":
-      raise TypeError("Asset path tensor must be of type constant.")
-    if path_tensor.dtype != dtypes.string:
-      raise TypeError("Asset path tensor must be of dtype string.")
-    str_values = path_tensor.op.get_attr("value").string_val
-    if len(str_values) != 1:
-      raise TypeError("Asset path tensor must be a scalar.")
-    return str_values[0]
-
-  def _add_asset_to_collection(self, asset_filename, asset_tensor):
-    """Builds an asset proto and adds it to the asset collection of the graph.
-
-    Args:
-      asset_filename: The filename of the asset to be added.
-      asset_tensor: The asset tensor used to populate the tensor info of the
-          asset proto.
-    """
-    asset_proto = meta_graph_pb2.AssetFileDef()
-    asset_proto.filename = asset_filename
-    asset_proto.tensor_info.name = asset_tensor.name
-
-    asset_any_proto = Any()
-    asset_any_proto.Pack(asset_proto)
-    ops.add_to_collection(constants.ASSETS_KEY, asset_any_proto)
-
   def _save_and_write_assets(self, assets_collection_to_add=None):
     """Saves asset to the meta graph and writes asset files to disk.
 
     Args:
       assets_collection_to_add: The collection where the asset paths are setup.
     """
-    asset_source_filepath_list = self._maybe_save_assets(
-        assets_collection_to_add)
+    asset_source_filepath_list = _maybe_save_assets(assets_collection_to_add)
 
     # Return if there are no assets to write.
     if len(asset_source_filepath_list) is 0:
@@ -201,42 +161,6 @@ class SavedModelBuilder(object):
         raise TypeError("main_op needs to be an Operation: %r" % main_op)
       ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
 
-  def _maybe_save_assets(self, assets_collection_to_add=None):
-    """Saves assets to the meta graph.
-
-    Args:
-      assets_collection_to_add: The collection where the asset paths are setup.
-
-    Returns:
-      The list of filepaths to the assets in the assets collection.
-
-    Raises:
-      ValueError: Indicating an invalid filepath tensor.
-    """
-    asset_source_filepath_list = []
-
-    if assets_collection_to_add is None:
-      tf_logging.info("No assets to save.")
-      return asset_source_filepath_list
-
-    # Iterate over the supplied asset collection, build the `AssetFile` proto
-    # and add them to the collection with key `constants.ASSETS_KEY`, in the
-    # graph.
-    for asset_tensor in assets_collection_to_add:
-      asset_source_filepath = self._asset_path_from_tensor(asset_tensor)
-      if not asset_source_filepath:
-        raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)
-
-      asset_source_filename = os.path.basename(asset_source_filepath)
-
-      # Build `AssetFile` proto and add it to the asset collection in the graph.
-      self._add_asset_to_collection(asset_source_filename, asset_tensor)
-
-      asset_source_filepath_list.append(asset_source_filepath)
-
-    tf_logging.info("Assets added to graph.")
-    return asset_source_filepath_list
-
   def _tag_and_add_meta_graph(self, meta_graph_def, tags, signature_def_map):
     """Tags the meta graph def and adds it to the SavedModel.
 
@@ -475,3 +399,81 @@ class SavedModelBuilder(object):
     tf_logging.info("SavedModel written to: %s", path)
 
     return path
+
+
+def _maybe_save_assets(assets_collection_to_add=None):
+  """Saves assets to the meta graph.
+
+  Args:
+    assets_collection_to_add: The collection where the asset paths are setup.
+
+  Returns:
+    The list of filepaths to the assets in the assets collection.
+
+  Raises:
+    ValueError: Indicating an invalid filepath tensor.
+  """
+  asset_source_filepath_list = []
+
+  if assets_collection_to_add is None:
+    tf_logging.info("No assets to save.")
+    return asset_source_filepath_list
+
+  # Iterate over the supplied asset collection, build the `AssetFile` proto
+  # and add them to the collection with key `constants.ASSETS_KEY`, in the
+  # graph.
+  for asset_tensor in assets_collection_to_add:
+    asset_source_filepath = _asset_path_from_tensor(asset_tensor)
+    if not asset_source_filepath:
+      raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)
+
+    asset_source_filename = os.path.basename(asset_source_filepath)
+
+    # Build `AssetFile` proto and add it to the asset collection in the graph.
+    _add_asset_to_collection(asset_source_filename, asset_tensor)
+
+    asset_source_filepath_list.append(asset_source_filepath)
+
+  tf_logging.info("Assets added to graph.")
+  return asset_source_filepath_list
+
+
+def _asset_path_from_tensor(path_tensor):
+  """Returns the filepath value stored in constant `path_tensor`.
+
+  Args:
+    path_tensor: Tensor of a file-path.
+
+  Returns:
+    The string value i.e. path of the tensor, if valid.
+
+  Raises:
+    TypeError if tensor does not match expected op type, dtype or value.
+  """
+  if not isinstance(path_tensor, ops.Tensor):
+    raise TypeError("Asset path tensor must be a Tensor.")
+  if path_tensor.op.type != "Const":
+    raise TypeError("Asset path tensor must be of type constant.")
+  if path_tensor.dtype != dtypes.string:
+    raise TypeError("Asset path tensor must be of dtype string.")
+  str_values = path_tensor.op.get_attr("value").string_val
+  if len(str_values) != 1:
+    raise TypeError("Asset path tensor must be a scalar.")
+  return str_values[0]
+
+
+def _add_asset_to_collection(asset_filename, asset_tensor):
+  """Builds an asset proto and adds it to the asset collection of the graph.
+
+  Args:
+    asset_filename: The filename of the asset to be added.
+    asset_tensor: The asset tensor used to populate the tensor info of the
+        asset proto.
+  """
+  asset_proto = meta_graph_pb2.AssetFileDef()
+  asset_proto.filename = asset_filename
+  asset_proto.tensor_info.name = asset_tensor.name
+
+  asset_any_proto = Any()
+  asset_any_proto.Pack(asset_proto)
+  ops.add_to_collection(constants.ASSETS_KEY, asset_any_proto)
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index a9d999dad3a55b45d58395f7be535d236deaecc9..32526521749d26c02e29f8bcda7b934faecfddfe 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -195,46 +195,47 @@ def load(sess, tags, export_dir, **saver_kwargs):
   Raises:
     RuntimeError: MetaGraphDef associated with the tags cannot be found.
   """
-  # Build the SavedModel protocol buffer and find the requested meta graph def.
-  saved_model = _parse_saved_model(export_dir)
-  found_match = False
-  for meta_graph_def in saved_model.meta_graphs:
-    if set(meta_graph_def.meta_info_def.tags) == set(tags):
-      meta_graph_def_to_load = meta_graph_def
-      found_match = True
-      break
-
-  if not found_match:
-    raise RuntimeError("MetaGraphDef associated with tags " + str(tags).strip(
-        "[]") + " could not be found in SavedModel")
-
-  # Build a saver by importing the meta graph def to load.
-  saver = tf_saver.import_meta_graph(meta_graph_def_to_load, **saver_kwargs)
-
-  if saver:
-    # Build the checkpoint path where the variables are located.
-    variables_path = os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes(constants.VARIABLES_DIRECTORY),
-        compat.as_bytes(constants.VARIABLES_FILENAME))
-
-    # Restore the variables using the built saver in the provided session.
-    saver.restore(sess, variables_path)
-  else:
-    tf_logging.info("The specified SavedModel has no variables; no "
-                    "checkpoints were restored.")
-
-  # Get asset tensors, if any.
-  asset_tensors_dictionary = _get_asset_tensors(export_dir,
-                                                meta_graph_def_to_load)
-
-  main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load)
-  if main_op_tensor is not None:
-    sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
-  else:
-    legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load)
-    if legacy_init_op_tensor is not None:
-      sess.run(fetches=[legacy_init_op_tensor],
-               feed_dict=asset_tensors_dictionary)
-
-  return meta_graph_def_to_load
+  with sess.graph.as_default():
+    # Build the SavedModel protocol buffer and find requested meta graph def.
+    saved_model = _parse_saved_model(export_dir)
+    found_match = False
+    for meta_graph_def in saved_model.meta_graphs:
+      if set(meta_graph_def.meta_info_def.tags) == set(tags):
+        meta_graph_def_to_load = meta_graph_def
+        found_match = True
+        break
+
+    if not found_match:
+      raise RuntimeError("MetaGraphDef associated with tags " + str(tags).strip(
+          "[]") + " could not be found in SavedModel")
+
+    # Build a saver by importing the meta graph def to load.
+    saver = tf_saver.import_meta_graph(meta_graph_def_to_load, **saver_kwargs)
+
+    if saver:
+      # Build the checkpoint path where the variables are located.
+      variables_path = os.path.join(
+          compat.as_bytes(export_dir),
+          compat.as_bytes(constants.VARIABLES_DIRECTORY),
+          compat.as_bytes(constants.VARIABLES_FILENAME))
+
+      # Restore the variables using the built saver in the provided session.
+      saver.restore(sess, variables_path)
+    else:
+      tf_logging.info("The specified SavedModel has no variables; no "
+                      "checkpoints were restored.")
+
+    # Get asset tensors, if any.
+    asset_tensors_dictionary = _get_asset_tensors(export_dir,
+                                                  meta_graph_def_to_load)
+
+    main_op_tensor = _get_main_op_tensor(meta_graph_def_to_load)
+    if main_op_tensor is not None:
+      sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
+    else:
+      legacy_init_op_tensor = _get_legacy_init_op_tensor(meta_graph_def_to_load)
+      if legacy_init_op_tensor is not None:
+        sess.run(
+            fetches=[legacy_init_op_tensor], feed_dict=asset_tensors_dictionary)
+
+    return meta_graph_def_to_load
diff --git a/tensorflow/python/saved_model/main_op_impl.py b/tensorflow/python/saved_model/main_op_impl.py
index 66cf9d4d8af53b2d22f14d54ab054bcfa49df967..355fd57bf1d2166f58a5fdc95d04695ea05b56b3 100644
--- a/tensorflow/python/saved_model/main_op_impl.py
+++ b/tensorflow/python/saved_model/main_op_impl.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops as tf_data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 
 
@@ -35,7 +35,7 @@ def main_op():
   """
   init = variables.global_variables_initializer()
   init_local = variables.local_variables_initializer()
-  init_tables = tf_data_flow_ops.tables_initializer()
+  init_tables = lookup_ops.tables_initializer()
   return control_flow_ops.group(init, init_local, init_tables)
 
 
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index a81f74417529eb84763656ae5407bbcfe4a9e077..fcd6bc39547066617be14b8f9e70127dd7fdadab 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -151,6 +151,27 @@ class SavedModelTest(test.TestCase):
                                    constants.SAVED_MODEL_FILENAME_PBTXT):
         loader.load(sess, ["foo"], export_dir)
 
+  def testVerifySessionGraphUsage(self):
+    export_dir = os.path.join(test.get_temp_dir(),
+                              "test_verify_session_graph_usage")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    # Build a session and supply it to the load operation.
+    sess = session.Session(graph=ops.Graph())
+    loader.load(sess, [tag_constants.TRAINING], export_dir)
+
+    # Check the variable within the scope of the session and its graph.
+    with sess:
+      self.assertEqual(
+          42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
+
   def testSequence(self):
     export_dir = os.path.join(test.get_temp_dir(), "test_sequence")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index d130588fa2978156a9e4b5aacc5a3aa91fad7bd1..efcc59465a6a23c8e2a0008a6507e9a6d4a46aa3 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -190,6 +190,11 @@ def histogram(name, values, collections=None):
   # pylint: disable=line-too-long
   """Outputs a `Summary` protocol buffer with a histogram.
 
+  Adding a histogram summary makes it possible to visualize your data's
+  distribution in TensorBoard. You can see a detailed explanation of the
+  TensorBoard histogram dashboard
+  [here](https://www.tensorflow.org/get_started/tensorboard_histograms).
+
   The generated
   [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
   has one summary value containing a histogram for `values`.
diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py
index 82dee45d267cdbbb2d30fbd82010f54e3fb20399..52bc913b2ada672bc0b263d4f0646a98698d2895 100644
--- a/tensorflow/python/summary/text_summary.py
+++ b/tensorflow/python/summary/text_summary.py
@@ -34,12 +34,17 @@ def text_summary(name, tensor, collections=None):
   """Summarizes textual data.
 
   Text data summarized via this plugin will be visible in the Text Dashboard
-  in TensorBoard.
+  in TensorBoard. The standard TensorBoard Text Dashboard will render markdown
+  in the strings, and will automatically organize 1d and 2d tensors into tables.
+  If a tensor with more than 2 dimensions is provided, a 2d subarray will be
+  displayed along with a warning message. (Note that this behavior is not
+  intrinsic to the text summary api, but rather to the default TensorBoard text
+  plugin.)
 
   Args:
     name: A name for the generated node. Will also serve as a series name in
       TensorBoard.
-    tensor: a scalar string-type Tensor to summarize.
+    tensor: a string-type Tensor to summarize.
     collections: Optional list of ops.GraphKeys.  The collections to add the
       summary to.  Defaults to [_ops.GraphKeys.SUMMARIES]
 
@@ -49,16 +54,12 @@ def text_summary(name, tensor, collections=None):
     type `string` which contains `Summary` protobufs.
 
   Raises:
-    ValueError: If tensor has the wrong shape or type.
+    ValueError: If tensor has the wrong type.
   """
   if tensor.dtype != dtypes.string:
     raise ValueError("Expected tensor %s to have dtype string, got %s" %
                      (tensor.name, tensor.dtype))
 
-  if tensor.shape.ndims != 0:
-    raise ValueError("Expected tensor %s to be scalar, has shape %s" %
-                     (tensor.name, tensor.shape))
-
   t_summary = tensor_summary(name, tensor, collections=collections)
   text_assets = plugin_asset.get_plugin_asset(TextSummaryPluginAsset)
   text_assets.register_tensor(t_summary.op.name)
diff --git a/tensorflow/python/summary/text_summary_test.py b/tensorflow/python/summary/text_summary_test.py
index 69739573c10f26378691c83e679b380ca8e68128..31009702ca41e1f0b5ff5742ae81cfc962d3061d 100644
--- a/tensorflow/python/summary/text_summary_test.py
+++ b/tensorflow/python/summary/text_summary_test.py
@@ -40,17 +40,19 @@ class TextPluginTest(test_util.TensorFlowTestCase):
         num = array_ops.constant(1)
         text_summary.text_summary("foo", num)
 
-      with self.assertRaises(ValueError):
-        arr = array_ops.constant(["one", "two", "three"])
-        text_summary.text_summary("foo", arr)
+      # The API accepts vectors.
+      arr = array_ops.constant(["one", "two", "three"])
+      summ = text_summary.text_summary("foo", arr)
+      self.assertEqual(summ.op.type, "TensorSummary")
 
+      # the API accepts scalars
       summ = text_summary.text_summary("foo", array_ops.constant("one"))
       self.assertEqual(summ.op.type, "TensorSummary")
 
-      text_summary.text_summary("bar", array_ops.constant("2"), collections=[])
-      summaries = framework_ops.get_collection(
-          framework_ops.GraphKeys.SUMMARIES)
-      self.assertEqual(len(summaries), 1)
+  def testTextSummaryCollections(self):
+    text_summary.text_summary("bar", array_ops.constant("2"), collections=[])
+    summaries = framework_ops.get_collection(framework_ops.GraphKeys.SUMMARIES)
+    self.assertEqual(len(summaries), 0)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/summary/writer/event_file_writer.py b/tensorflow/python/summary/writer/event_file_writer.py
index 7142998ce726bf94dd6408db152a0e735ec2cf93..2936a279bd4becc9c4548b138fa196714a2b224b 100644
--- a/tensorflow/python/summary/writer/event_file_writer.py
+++ b/tensorflow/python/summary/writer/event_file_writer.py
@@ -24,6 +24,7 @@ import time
 
 import six
 
+from tensorflow.core.util import event_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.platform import gfile
 from tensorflow.python.util import compat
@@ -37,7 +38,8 @@ class EventFileWriter(object):
   is encoded using the tfrecord format, which is similar to RecordIO.
   """
 
-  def __init__(self, logdir, max_queue=10, flush_secs=120):
+  def __init__(self, logdir, max_queue=10, flush_secs=120,
+               filename_suffix=None):
     """Creates a `EventFileWriter` and an event file to write to.
 
     On construction the summary writer creates a new event file in `logdir`.
@@ -57,6 +59,8 @@ class EventFileWriter(object):
       max_queue: Integer. Size of the queue for pending events and summaries.
       flush_secs: Number. How often, in seconds, to flush the
         pending events and summaries to disk.
+      filename_suffix: A string. Every event file's name is suffixed with
+        `filename_suffix`.
     """
     self._logdir = logdir
     if not gfile.IsDirectory(self._logdir):
@@ -64,12 +68,20 @@ class EventFileWriter(object):
     self._event_queue = six.moves.queue.Queue(max_queue)
     self._ev_writer = pywrap_tensorflow.EventsWriter(
         compat.as_bytes(os.path.join(self._logdir, "events")))
+    self._flush_secs = flush_secs
+    self._sentinel_event = self._get_sentinel_event()
+    if filename_suffix:
+      self._ev_writer.InitWithSuffix(compat.as_bytes(filename_suffix))
     self._closed = False
     self._worker = _EventLoggerThread(self._event_queue, self._ev_writer,
-                                      flush_secs)
+                                      self._flush_secs, self._sentinel_event)
 
     self._worker.start()
 
+  def _get_sentinel_event(self):
+    """Generate a sentinel event for terminating worker."""
+    return event_pb2.Event()
+
   def get_logdir(self):
     """Returns the directory where event file will be written."""
     return self._logdir
@@ -83,6 +95,9 @@ class EventFileWriter(object):
     Does nothing if the EventFileWriter was not closed.
     """
     if self._closed:
+      self._worker = _EventLoggerThread(self._event_queue, self._ev_writer,
+                                        self._flush_secs, self._sentinel_event)
+      self._worker.start()
       self._closed = False
 
   def add_event(self, event):
@@ -108,7 +123,9 @@ class EventFileWriter(object):
 
     Call this method when you do not need the summary writer anymore.
     """
+    self.add_event(self._sentinel_event)
     self.flush()
+    self._worker.join()
     self._ev_writer.Close()
     self._closed = True
 
@@ -116,7 +133,7 @@ class EventFileWriter(object):
 class _EventLoggerThread(threading.Thread):
   """Thread that logs events."""
 
-  def __init__(self, queue, ev_writer, flush_secs):
+  def __init__(self, queue, ev_writer, flush_secs, sentinel_event):
     """Creates an _EventLoggerThread.
 
     Args:
@@ -125,6 +142,8 @@ class _EventLoggerThread(threading.Thread):
        the visualizer.
       flush_secs: How often, in seconds, to flush the
         pending file to disk.
+      sentinel_event: A sentinel element in queue that tells this thread to
+        terminate.
     """
     threading.Thread.__init__(self)
     self.daemon = True
@@ -133,10 +152,14 @@ class _EventLoggerThread(threading.Thread):
     self._flush_secs = flush_secs
     # The first event will be flushed immediately.
     self._next_event_flush_time = 0
+    self._sentinel_event = sentinel_event
 
   def run(self):
     while True:
       event = self._queue.get()
+      if event is self._sentinel_event:
+        self._queue.task_done()
+        break
       try:
         self._ev_writer.WriteEvent(event)
         # Flush the event writer every so often.
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 2dad7b88d37e77e1a509b5b51f51f96de5fac554..05f97fb28417dbac6e26c7767f34a2152baea8c0 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -32,7 +32,6 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import plugin_asset
 from tensorflow.python.summary.writer.event_file_writer import EventFileWriter
 
-
 _PLUGINS_DIR = "plugins"
 
 
@@ -81,12 +80,11 @@ class SummaryToEventTransformer(object):
       self.add_graph(graph=graph, graph_def=graph_def)
       # Also export the meta_graph_def in this case.
       # graph may itself be a graph_def due to positional arguments
-      maybe_graph_as_def = (
-          graph.as_graph_def(add_shapes=True) if isinstance(graph, ops.Graph)
-          else graph)
+      maybe_graph_as_def = (graph.as_graph_def(add_shapes=True)
+                            if isinstance(graph, ops.Graph) else graph)
       self.add_meta_graph(
-          meta_graph.create_meta_graph_def(
-              graph_def=graph_def or maybe_graph_as_def))
+          meta_graph.create_meta_graph_def(graph_def=graph_def or
+                                           maybe_graph_as_def))
 
   def add_summary(self, summary, global_step=None):
     """Adds a `Summary` protocol buffer to the event file.
@@ -214,8 +212,8 @@ class SummaryToEventTransformer(object):
       TypeError: If both `meta_graph_def` is not an instance of `MetaGraphDef`.
     """
     if not isinstance(meta_graph_def, meta_graph_pb2.MetaGraphDef):
-      raise TypeError("meta_graph_def must be type MetaGraphDef, saw type: %s"
-                      % type(meta_graph_def))
+      raise TypeError("meta_graph_def must be type MetaGraphDef, saw type: %s" %
+                      type(meta_graph_def))
     meta_graph_bytes = meta_graph_def.SerializeToString()
     event = event_pb2.Event(meta_graph_def=meta_graph_bytes)
     self._add_event(event, global_step)
@@ -266,7 +264,8 @@ class FileWriter(SummaryToEventTransformer):
                graph=None,
                max_queue=10,
                flush_secs=120,
-               graph_def=None):
+               graph_def=None,
+               filename_suffix=None):
     """Creates a `FileWriter` and an event file.
 
     On construction the summary writer creates a new event file in `logdir`.
@@ -304,8 +303,11 @@ class FileWriter(SummaryToEventTransformer):
       flush_secs: Number. How often, in seconds, to flush the
         pending events and summaries to disk.
       graph_def: DEPRECATED: Use the `graph` argument instead.
+      filename_suffix: A string. Every event file's name is suffixed with
+        `suffix`.
     """
-    event_writer = EventFileWriter(logdir, max_queue, flush_secs)
+    event_writer = EventFileWriter(logdir, max_queue, flush_secs,
+                                   filename_suffix)
     super(FileWriter, self).__init__(event_writer, graph, graph_def)
 
   def get_logdir(self):
diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py
index 21870e788ef00c4e72b27f17fa7399cb5fef74f7..bad289303c0fd0de7836b03a6762d04505521a89 100644
--- a/tensorflow/python/summary/writer/writer_cache.py
+++ b/tensorflow/python/summary/writer/writer_cache.py
@@ -39,6 +39,10 @@ class FileWriterCache(object):
   def clear():
     """Clear cached summary writers. Currently only used for unit tests."""
     with FileWriterCache._lock:
+      # Make sure all the writers are closed now (otherwise open file handles
+      # may hang around, blocking deletions on Windows).
+      for item in FileWriterCache._cache.values():
+        item.close()
       FileWriterCache._cache = {}
 
   @staticmethod
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 50797483ee52e6b0f54aea5f766bdb496e0f54a6..8c34eb82e35cba6db8716f797a39f56e778a74af 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -258,6 +258,15 @@ class SummaryWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  def testNonBlockingClose(self):
+    test_dir = self._CleanTestDir("non_blocking_close")
+    sw = writer.FileWriter(test_dir)
+    # Sleep 1.2 seconds to make sure event queue is empty.
+    time.sleep(1.2)
+    time_before_close = time.time()
+    sw.close()
+    self._assertRecent(time_before_close)
+
   # Checks that values returned from session Run() calls are added correctly to
   # summaries.  These are numpy types so we need to check they fit in the
   # protocol buffers correctly.
@@ -308,6 +317,22 @@ class SummaryWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  def testFileWriterWithSuffix(self):
+    test_dir = self._CleanTestDir("test_suffix")
+    sw = writer.FileWriter(test_dir, filename_suffix="_test_suffix")
+    for _ in range(10):
+      sw.add_summary(
+          summary_pb2.Summary(value=[
+              summary_pb2.Summary.Value(tag="float_ten", simple_value=10.0)
+          ]),
+          10)
+      sw.close()
+      sw.reopen()
+    sw.close()
+    event_filenames = glob.glob(os.path.join(test_dir, "event*"))
+    for filename in event_filenames:
+      self.assertTrue(filename.endswith("_test_suffix"))
+
 
 class SummaryWriterCacheTest(test.TestCase):
   """SummaryWriterCache tests."""
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index a0009031ac300326c06714131b68cb88159f29b3..5c2ad417e2f3c95c2dd137387f7808fb3c8f6ad7 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -40,3 +40,5 @@ limitations under the License.
 %include "tensorflow/python/util/kernel_registry.i"
 
 %include "tensorflow/python/util/transform_graph.i"
+
+%include "tensorflow/python/grappler/tf_optimizer.i"
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index a98f600c40658a8b4680d446fdd85f51cc4f21f8..48b84f9a96e5f62485d14a47461e140bea39ffdd 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -17,6 +17,7 @@ py_library(
         ":inspect_checkpoint",
         ":optimize_for_inference",
         ":print_selective_registration_header",
+        ":saved_model_cli",
         ":strip_unused",
     ],
 )
@@ -197,6 +198,29 @@ py_test(
     ],
 )
 
+py_binary(
+    name = "saved_model_cli",
+    srcs = ["saved_model_cli.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/saved_model:saved_model_py",
+        "//tensorflow/python",
+        "//tensorflow/python/debug:local_cli_wrapper",
+    ],
+)
+
+py_test(
+    name = "saved_model_cli_test",
+    srcs = ["saved_model_cli_test.py"],
+    data = [
+        "//tensorflow/cc/saved_model:saved_model_half_plus_two",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":saved_model_cli",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 38e3e17a88424735e19e6adf5c7e1e0a1bec3ca3..bd046a7fd099c71518e694c7a44c62616c960178 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Converts checkpoint variables into Const ops in a standalone GraphDef file.
+r"""Converts checkpoint variables into Const ops in a standalone GraphDef file.
 
 This script is designed to take a GraphDef proto, a SaverDef proto, and a set of
 variable values stored in a checkpoint file, and output a GraphDef with all of
@@ -55,29 +55,20 @@ from tensorflow.python.training import saver as saver_lib
 FLAGS = None
 
 
-def freeze_graph(input_graph,
-                 input_saver,
-                 input_binary,
-                 input_checkpoint,
-                 output_node_names,
-                 restore_op_name,
-                 filename_tensor_name,
-                 output_graph,
-                 clear_devices,
-                 initializer_nodes,
-                 variable_names_blacklist=""):
+def freeze_graph_with_def_protos(
+    input_graph_def,
+    input_saver_def,
+    input_checkpoint,
+    output_node_names,
+    restore_op_name,
+    filename_tensor_name,
+    output_graph,
+    clear_devices,
+    initializer_nodes,
+    variable_names_blacklist=""):
   """Converts all variables in a graph and checkpoint into constants."""
-
   del restore_op_name, filename_tensor_name  # Unused by updated loading code.
 
-  if not gfile.Exists(input_graph):
-    print("Input graph file '" + input_graph + "' does not exist!")
-    return -1
-
-  if input_saver and not gfile.Exists(input_saver):
-    print("Input saver file '" + input_saver + "' does not exist!")
-    return -1
-
   # 'input_checkpoint' may be a prefix if we're using Saver V2 format
   if not saver_lib.checkpoint_exists(input_checkpoint):
     print("Input checkpoint '" + input_checkpoint + "' doesn't exist!")
@@ -87,13 +78,6 @@ def freeze_graph(input_graph,
     print("You need to supply the name of a node to --output_node_names.")
     return -1
 
-  input_graph_def = graph_pb2.GraphDef()
-  mode = "rb" if input_binary else "r"
-  with gfile.FastGFile(input_graph, mode) as f:
-    if input_binary:
-      input_graph_def.ParseFromString(f.read())
-    else:
-      text_format.Merge(f.read(), input_graph_def)
   # Remove all the explicit device specifications for this node. This helps to
   # make the graph more portable.
   if clear_devices:
@@ -103,15 +87,9 @@ def freeze_graph(input_graph,
   _ = importer.import_graph_def(input_graph_def, name="")
 
   with session.Session() as sess:
-    if input_saver:
-      with gfile.FastGFile(input_saver, mode) as f:
-        saver_def = saver_pb2.SaverDef()
-        if input_binary:
-          saver_def.ParseFromString(f.read())
-        else:
-          text_format.Merge(f.read(), saver_def)
-        saver = saver_lib.Saver(saver_def=saver_def)
-        saver.restore(sess, input_checkpoint)
+    if input_saver_def:
+      saver = saver_lib.Saver(saver_def=input_saver_def)
+      saver.restore(sess, input_checkpoint)
     else:
       var_list = {}
       reader = pywrap_tensorflow.NewCheckpointReader(input_checkpoint)
@@ -142,6 +120,65 @@ def freeze_graph(input_graph,
   print("%d ops in the final graph." % len(output_graph_def.node))
 
 
+def _parse_input_graph_proto(input_graph, input_binary):
+  """Parser input tensorflow graph into GraphDef proto."""
+  if not gfile.Exists(input_graph):
+    print("Input graph file '" + input_graph + "' does not exist!")
+    return -1
+  input_graph_def = graph_pb2.GraphDef()
+  mode = "rb" if input_binary else "r"
+  with gfile.FastGFile(input_graph, mode) as f:
+    if input_binary:
+      input_graph_def.ParseFromString(f.read())
+    else:
+      text_format.Merge(f.read(), input_graph_def)
+  return input_graph_def
+
+
+def _parse_input_saver_proto(input_saver, input_binary):
+  """Parser input tensorflow Saver into SaverDef proto."""
+  if not gfile.Exists(input_saver):
+    print("Input saver file '" + input_saver + "' does not exist!")
+    return -1
+  mode = "rb" if input_binary else "r"
+  with gfile.FastGFile(input_saver, mode) as f:
+    saver_def = saver_pb2.SaverDef()
+    if input_binary:
+      saver_def.ParseFromString(f.read())
+    else:
+      text_format.Merge(f.read(), saver_def)
+  return saver_def
+
+
+def freeze_graph(input_graph,
+                 input_saver,
+                 input_binary,
+                 input_checkpoint,
+                 output_node_names,
+                 restore_op_name,
+                 filename_tensor_name,
+                 output_graph,
+                 clear_devices,
+                 initializer_nodes,
+                 variable_names_blacklist=""):
+  """Converts all variables in a graph and checkpoint into constants."""
+  input_graph_def = _parse_input_graph_proto(input_graph, input_binary)
+  input_saver_def = None
+  if input_saver:
+    input_saver_def = _parse_input_saver_proto(input_saver, input_binary)
+  freeze_graph_with_def_protos(
+      input_graph_def,
+      input_saver_def,
+      input_checkpoint,
+      output_node_names,
+      restore_op_name,
+      filename_tensor_name,
+      output_graph,
+      clear_devices,
+      initializer_nodes,
+      variable_names_blacklist)
+
+
 def main(unused_args):
   freeze_graph(FLAGS.input_graph, FLAGS.input_saver, FLAGS.input_binary,
                FLAGS.input_checkpoint, FLAGS.output_node_names,
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bb055e978630bcb399e327ddc968961b4978bca
--- /dev/null
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ================================
+"""Imports a protobuf model as a graph in Tensorboard."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.summary import summary
+
+
+def import_to_tensorboard(model_dir, log_dir):
+  """View an imported protobuf model (`.pb` file) as a graph in Tensorboard.
+
+  Args:
+    model_dir: The location of the protobuf (`pb`) model to visualize
+    log_dir: The location for the Tensorboard log to begin visualization from.
+
+  Usage:
+    Call this function with your model location and desired log directory.
+    Launch Tensorboard by pointing it to the log directory.
+    View your imported `.pb` model as a graph.
+  """
+  with session.Session(graph=ops.Graph()) as sess:
+    with gfile.FastGFile(model_dir, "rb") as f:
+      graph_def = graph_pb2.GraphDef()
+      graph_def.ParseFromString(f.read())
+      importer.import_graph_def(graph_def)
+
+    pb_visual_writer = summary.FileWriter(log_dir)
+    pb_visual_writer.add_graph(sess.graph)
+    print("Model Imported. Visualize by running: "
+          "> tensorboard --logdir={}".format(log_dir))
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 6faf570de722ab7e6167056a46bb29ced24a931f..47a74e5abfb45e9bfd87b72d1511ae2e7c2f7d6c 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -46,7 +46,7 @@ def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors):
     reader = pywrap_tensorflow.NewCheckpointReader(file_name)
     if all_tensors:
       var_to_shape_map = reader.get_variable_to_shape_map()
-      for key in var_to_shape_map:
+      for key in sorted(var_to_shape_map):
         print("tensor_name: ", key)
         print(reader.get_tensor(key))
     elif not tensor_name:
@@ -95,7 +95,7 @@ def parse_numpy_printoption(kv_str):
         "Setting '%s' from the command line is not supported." % k)
   try:
     v = (v_type(v_str) if v_type is not bool
-         else flags.BooleanParser().Parse(v_str))
+         else flags.BooleanParser().parse(v_str))
   except ValueError as e:
     raise argparse.ArgumentTypeError(e.message)
   np.set_printoptions(**{k: v})
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1be3055052aeeb1355bbb71d6232ab8d60cc974
--- /dev/null
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -0,0 +1,643 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Command-line interface to inspect and execute a graph in a SavedModel.
+
+For detailed usages and examples, please refer to:
+https://www.tensorflow.org/programmers_guide/saved_model_cli
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import re
+import sys
+import warnings
+
+import numpy as np
+
+from tensorflow.contrib.saved_model.python.saved_model import reader
+from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
+from tensorflow.core.framework import types_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.wrappers import local_cli_wrapper
+from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.platform import app
+from tensorflow.python.saved_model import loader
+
+
+def _show_tag_sets(saved_model_dir):
+  """Prints the tag-sets stored in SavedModel directory.
+
+  Prints all the tag-sets for MetaGraphs stored in SavedModel directory.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel to inspect.
+  """
+  tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
+  print('The given SavedModel contains the following tag-sets:')
+  for tag_set in sorted(tag_sets):
+    print(', '.join(sorted(tag_set)))
+
+
+def _show_signature_def_map_keys(saved_model_dir, tag_set):
+  """Prints the keys for each SignatureDef in the SignatureDef map.
+
+  Prints the list of SignatureDef keys from the SignatureDef map specified by
+  the given tag-set and SavedModel directory.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel to inspect.
+    tag_set: Group of tag(s) of the MetaGraphDef to get SignatureDef map from,
+        in string format, separated by ','. For tag-set contains multiple tags,
+        all tags must be passed in.
+  """
+  signature_def_map = get_signature_def_map(saved_model_dir, tag_set)
+  print('The given SavedModel MetaGraphDef contains SignatureDefs with the '
+        'following keys:')
+  for signature_def_key in sorted(signature_def_map.keys()):
+    print('SignatureDef key: \"%s\"' % signature_def_key)
+
+
+def _get_inputs_tensor_info_from_meta_graph_def(meta_graph_def,
+                                                signature_def_key):
+  """Gets TensorInfo for all inputs of the SignatureDef.
+
+  Returns a dictionary that maps each input key to its TensorInfo for the given
+  signature_def_key in the meta_graph_def
+
+  Args:
+    meta_graph_def: MetaGraphDef protocol buffer with the SignatureDef map to
+        look up SignatureDef key.
+    signature_def_key: A SignatureDef key string.
+
+  Returns:
+    A dictionary that maps input tensor keys to TensorInfos.
+  """
+  return signature_def_utils.get_signature_def_by_key(meta_graph_def,
+                                                      signature_def_key).inputs
+
+
+def _get_outputs_tensor_info_from_meta_graph_def(meta_graph_def,
+                                                 signature_def_key):
+  """Gets TensorInfos for all outputs of the SignatureDef.
+
+  Returns a dictionary that maps each output key to its TensorInfo for the given
+  signature_def_key in the meta_graph_def.
+
+  Args:
+    meta_graph_def: MetaGraphDef protocol buffer with the SignatureDefmap to
+    look up signature_def_key.
+    signature_def_key: A SignatureDef key string.
+
+  Returns:
+    A dictionary that maps output tensor keys to TensorInfos.
+  """
+  return signature_def_utils.get_signature_def_by_key(meta_graph_def,
+                                                      signature_def_key).outputs
+
+
+def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key):
+  """Prints input and output TensorInfos.
+
+  Prints the details of input and output TensorInfos for the SignatureDef mapped
+  by the given signature_def_key.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel to inspect.
+    tag_set: Group of tag(s) of the MetaGraphDef, in string format, separated by
+        ','. For tag-set contains multiple tags, all tags must be passed in.
+    signature_def_key: A SignatureDef key string.
+  """
+  meta_graph_def = get_meta_graph_def(saved_model_dir, tag_set)
+  inputs_tensor_info = _get_inputs_tensor_info_from_meta_graph_def(
+      meta_graph_def, signature_def_key)
+  outputs_tensor_info = _get_outputs_tensor_info_from_meta_graph_def(
+      meta_graph_def, signature_def_key)
+
+  print('The given SavedModel SignatureDef contains the following input(s):')
+  for input_key, input_tensor in sorted(inputs_tensor_info.items()):
+    print('inputs[\'%s\'] tensor_info:' % input_key)
+    _print_tensor_info(input_tensor)
+
+  print('The given SavedModel SignatureDef contains the following output(s):')
+  for output_key, output_tensor in sorted(outputs_tensor_info.items()):
+    print('outputs[\'%s\'] tensor_info:' % output_key)
+    _print_tensor_info(output_tensor)
+
+  print('Method name is: %s' %
+        meta_graph_def.signature_def[signature_def_key].method_name)
+
+
+def _print_tensor_info(tensor_info):
+  """Prints details of the given tensor_info.
+
+  Args:
+    tensor_info: TensorInfo object to be printed.
+  """
+  print('    dtype: ' + types_pb2.DataType.keys()[tensor_info.dtype])
+  # Display shape as tuple.
+  if tensor_info.tensor_shape.unknown_rank:
+    shape = 'unknown_rank'
+  else:
+    dims = [str(dim.size) for dim in tensor_info.tensor_shape.dim]
+    shape = ', '.join(dims)
+    shape = '(' + shape + ')'
+  print('    shape: ' + shape)
+  print('    name: ' + tensor_info.name)
+
+
+def _show_all(saved_model_dir):
+  """Prints tag-set, SignatureDef and Inputs/Outputs information in SavedModel.
+
+  Prints all tag-set, SignatureDef and Inputs/Outputs information stored in
+  SavedModel directory.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel to inspect.
+  """
+  tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
+  for tag_set in sorted(tag_sets):
+    tag_set = ', '.join(tag_set)
+    print('\nMetaGraphDef with tag-set: \'' + tag_set +
+          '\' contains the following SignatureDefs:')
+
+    signature_def_map = get_signature_def_map(saved_model_dir, tag_set)
+    for signature_def_key in sorted(signature_def_map.keys()):
+      print('\nsignature_def[\'' + signature_def_key + '\']:')
+      _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key)
+
+
+def get_meta_graph_def(saved_model_dir, tag_set):
+  """Gets MetaGraphDef from SavedModel.
+
+  Returns the MetaGraphDef for the given tag-set and SavedModel directory.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel to inspect or execute.
+    tag_set: Group of tag(s) of the MetaGraphDef to load, in string format,
+        separated by ','. For tag-set contains multiple tags, all tags must be
+        passed in.
+
+  Raises:
+    RuntimeError: An error when the given tag-set does not exist in the
+        SavedModel.
+
+  Returns:
+    A MetaGraphDef corresponding to the tag-set.
+  """
+  saved_model = reader.read_saved_model(saved_model_dir)
+  set_of_tags = set(tag_set.split(','))
+  for meta_graph_def in saved_model.meta_graphs:
+    if set(meta_graph_def.meta_info_def.tags) == set_of_tags:
+      return meta_graph_def
+
+  raise RuntimeError('MetaGraphDef associated with tag-set ' + tag_set +
+                     ' could not be found in SavedModel')
+
+
+def get_signature_def_map(saved_model_dir, tag_set):
+  """Gets SignatureDef map from a MetaGraphDef in a SavedModel.
+
+  Returns the SignatureDef map for the given tag-set in the SavedModel
+  directory.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel to inspect or execute.
+    tag_set: Group of tag(s) of the MetaGraphDef with the SignatureDef map, in
+        string format, separated by ','. For tag-set contains multiple tags, all
+        tags must be passed in.
+
+  Returns:
+    A SignatureDef map that maps from string keys to SignatureDefs.
+  """
+  meta_graph = get_meta_graph_def(saved_model_dir, tag_set)
+  return meta_graph.signature_def
+
+
+def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key,
+                                   input_tensor_key_feed_dict, outdir,
+                                   overwrite_flag, tf_debug=False):
+  """Runs SavedModel and fetch all outputs.
+
+  Runs the input dictionary through the MetaGraphDef within a SavedModel
+  specified by the given tag_set and SignatureDef. Also save the outputs to file
+  if outdir is not None.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel to execute.
+    tag_set: Group of tag(s) of the MetaGraphDef with the SignatureDef map, in
+        string format, separated by ','. For tag-set contains multiple tags, all
+        tags must be passed in.
+    signature_def_key: A SignatureDef key string.
+    input_tensor_key_feed_dict: A dictionary maps input keys to numpy ndarrays.
+    outdir: A directory to save the outputs to. If the directory doesn't exist,
+        it will be created.
+    overwrite_flag: A boolean flag to allow overwrite output file if file with
+        the same name exists.
+    tf_debug: A boolean flag to use TensorFlow Debugger (TFDBG) to observe the
+        intermediate Tensor values and runtime GraphDefs while running the
+        SavedModel.
+
+  Raises:
+    RuntimeError: An error when output file already exists and overwrite is not
+    enabled.
+  """
+  # Get a list of output tensor names.
+  meta_graph_def = get_meta_graph_def(saved_model_dir, tag_set)
+
+  # Re-create feed_dict based on input tensor name instead of key as session.run
+  # uses tensor name.
+  inputs_tensor_info = _get_inputs_tensor_info_from_meta_graph_def(
+      meta_graph_def, signature_def_key)
+  inputs_feed_dict = {
+      inputs_tensor_info[key].name: tensor
+      for key, tensor in input_tensor_key_feed_dict.items()
+  }
+  # Get outputs
+  outputs_tensor_info = _get_outputs_tensor_info_from_meta_graph_def(
+      meta_graph_def, signature_def_key)
+  # Sort to preserve order because we need to go from value to key later.
+  output_tensor_keys_sorted = sorted(outputs_tensor_info.keys())
+  output_tensor_names_sorted = [
+      outputs_tensor_info[tensor_key].name
+      for tensor_key in output_tensor_keys_sorted
+  ]
+
+  with session.Session(graph=ops_lib.Graph()) as sess:
+    loader.load(sess, tag_set.split(','), saved_model_dir)
+
+    if tf_debug:
+      sess = local_cli_wrapper.LocalCLIDebugWrapperSession(sess)
+
+    outputs = sess.run(output_tensor_names_sorted, feed_dict=inputs_feed_dict)
+
+    for i, output in enumerate(outputs):
+      output_tensor_key = output_tensor_keys_sorted[i]
+      print('Result for output key %s:\n%s' % (output_tensor_key, output))
+
+      # Only save if outdir is specified.
+      if outdir:
+        # Create directory if outdir does not exist
+        if not os.path.isdir(outdir):
+          os.makedirs(outdir)
+        output_full_path = os.path.join(outdir, output_tensor_key + '.npy')
+
+        # If overwrite not enabled and file already exist, error out
+        if not overwrite_flag and os.path.exists(output_full_path):
+          raise RuntimeError(
+              'Output file %s already exists. Add \"--overwrite\" to overwrite'
+              ' the existing output files.' % output_full_path)
+
+        np.save(output_full_path, output)
+        print('Output %s is saved to %s' % (output_tensor_key,
+                                            output_full_path))
+
+
+def preprocess_inputs_arg_string(inputs_str):
+  """Parses input arg into dictionary that maps input to file/variable tuple.
+
+  Parses input string in the format of, for example,
+  "input1=filename1[variable_name1],input2=filename2" into a
+  dictionary looks like
+  {'input_key1': (filename1, variable_name1),
+   'input_key2': (file2, None)}
+  , which maps input keys to a tuple of file name and variable name(None if
+  empty).
+
+  Args:
+    inputs_str: A string that specified where to load inputs. Inputs are
+    separated by semicolons.
+        * For each input key:
+            '<input_key>=<filename>' or
+            '<input_key>=<filename>[<variable_name>]'
+        * The optional 'variable_name' key will be set to None if not specified.
+
+  Returns:
+    A dictionary that maps input keys to a tuple of file name and variable name.
+
+  Raises:
+    RuntimeError: An error when the given input string is in a bad format.
+  """
+  input_dict = {}
+  inputs_raw = inputs_str.split(';')
+  for input_raw in filter(bool, inputs_raw):  # skip empty strings
+    # Format of input=filename[variable_name]'
+    match = re.match(r'([^=]+)=([^\[\]]+)\[([^\[\]]+)\]$', input_raw)
+
+    if match:
+      input_dict[match.group(1)] = match.group(2), match.group(3)
+    else:
+      # Format of input=filename'
+      match = re.match(r'([^=]+)=([^\[\]]+)$', input_raw)
+      if match:
+        input_dict[match.group(1)] = match.group(2), None
+      else:
+        raise RuntimeError(
+            '--inputs "%s" format is incorrect. Please follow'
+            '"<input_key>=<filename>", or'
+            '"<input_key>=<filename>[<variable_name>]"' % input_raw)
+
+  return input_dict
+
+
+def preprocess_input_exprs_arg_string(input_exprs_str):
+  """Parses input arg into dictionary that maps input key to python expression.
+
+  Parses input string in the format of 'input_key=<python expression>' into a
+  dictionary that maps each input_key to its python expression.
+
+  Args:
+    input_exprs_str: A string that specifies python expression for input keys.
+    Each input is separated by semicolon. For each input key:
+        'input_key=<python expression>'
+
+  Returns:
+    A dictionary that maps input keys to python expressions.
+
+  Raises:
+    RuntimeError: An error when the given input string is in a bad format.
+  """
+  input_dict = {}
+
+  for input_raw in filter(bool, input_exprs_str.split(';')):
+    if '=' not in input_exprs_str:
+      raise RuntimeError('--input_exprs "%s" format is incorrect. Please follow'
+                         '"<input_key>=<python expression>"' % input_exprs_str)
+    input_key, expr = input_raw.split('=')
+    input_dict[input_key] = expr
+
+  return input_dict
+
+
+def load_inputs_from_input_arg_string(inputs_str, input_exprs_str):
+  """Parses input arg strings and create inputs feed_dict.
+
+  Parses '--inputs' string for inputs to be loaded from file, and parses
+  '--input_exprs' string for inputs to be evaluated from python expression.
+
+  Args:
+    inputs_str: A string that specified where to load inputs. Each input is
+        separated by semicolon.
+        * For each input key:
+            '<input_key>=<filename>' or
+            '<input_key>=<filename>[<variable_name>]'
+        * The optional 'variable_name' key will be set to None if not specified.
+        * File specified by 'filename' will be loaded using numpy.load. Inputs
+            can be loaded from only .npy, .npz or pickle files.
+        * The "[variable_name]" key is optional depending on the input file type
+            as descripted in more details below.
+        When loading from a npy file, which always contains a numpy ndarray, the
+        content will be directly assigned to the specified input tensor. If a
+        variable_name is specified, it will be ignored and a warning will be
+        issued.
+        When loading from a npz zip file, user can specify which variable within
+        the zip file to load for the input tensor inside the square brackets. If
+        nothing is specified, this function will check that only one file is
+        included in the zip and load it for the specified input tensor.
+        When loading from a pickle file, if no variable_name is specified in the
+        square brackets, whatever that is inside the pickle file will be passed
+        to the specified input tensor, else SavedModel CLI will assume a
+        dictionary is stored in the pickle file and the value corresponding to
+        the variable_name will be used.
+    input_exprs_str: A string that specified python expressions for inputs.
+        * In the format of: '<input_key>=<python expression>'.
+        * numpy module is available as np.
+
+  Returns:
+    A dictionary that maps input tensor keys to numpy ndarrays.
+
+  Raises:
+    RuntimeError: An error when a key is specified, but the input file contains
+        multiple numpy ndarrays, none of which matches the given key.
+    RuntimeError: An error when no key is specified, but the input file contains
+        more than one numpy ndarrays.
+  """
+  tensor_key_feed_dict = {}
+
+  inputs = preprocess_inputs_arg_string(inputs_str)
+  input_exprs = preprocess_input_exprs_arg_string(input_exprs_str)
+
+  for input_tensor_key, (filename, variable_name) in inputs.items():
+    data = np.load(filename)
+
+    # When a variable_name key is specified for the input file
+    if variable_name:
+      # if file contains a single ndarray, ignore the input name
+      if isinstance(data, np.ndarray):
+        warnings.warn(
+            'Input file %s contains a single ndarray. Name key \"%s\" ignored.'
+            % (filename, variable_name))
+        tensor_key_feed_dict[input_tensor_key] = data
+      else:
+        if variable_name in data:
+          tensor_key_feed_dict[input_tensor_key] = data[variable_name]
+        else:
+          raise RuntimeError(
+              'Input file %s does not contain variable with name \"%s\".' %
+              (filename, variable_name))
+    # When no key is specified for the input file.
+    else:
+      # Check if npz file only contains a single numpy ndarray.
+      if isinstance(data, np.lib.npyio.NpzFile):
+        variable_name_list = data.files
+        if len(variable_name_list) != 1:
+          raise RuntimeError(
+              'Input file %s contains more than one ndarrays. Please specify '
+              'the name of ndarray to use.' % filename)
+        tensor_key_feed_dict[input_tensor_key] = data[variable_name_list[0]]
+      else:
+        tensor_key_feed_dict[input_tensor_key] = data
+
+  # When input is a python expression:
+  for input_tensor_key, py_expr in input_exprs.items():
+    if input_tensor_key in tensor_key_feed_dict:
+      warnings.warn(
+          'input_key %s has been specified with both --inputs and --input_exprs'
+          ' options. Value in --input_exprs will be used.' % input_tensor_key)
+
+    # ast.literal_eval does not work with numpy expressions
+    tensor_key_feed_dict[input_tensor_key] = eval(py_expr)  # pylint: disable=eval-used
+
+  return tensor_key_feed_dict
+
+
+def show(args):
+  """Function triggered by show command.
+
+  Args:
+    args: A namespace parsed from command line.
+  """
+  # If all tag is specified, display all information.
+  if args.all:
+    _show_all(args.dir)
+  else:
+    # If no tag is specified, display all tag_set, if no signaure_def key is
+    # specified, display all SignatureDef keys, else show input output tensor
+    # information corresponding to the given SignatureDef key
+    if args.tag_set is None:
+      _show_tag_sets(args.dir)
+    else:
+      if args.signature_def is None:
+        _show_signature_def_map_keys(args.dir, args.tag_set)
+      else:
+        _show_inputs_outputs(args.dir, args.tag_set, args.signature_def)
+
+
+def run(args):
+  """Function triggered by run command.
+
+  Args:
+    args: A namespace parsed from command line.
+
+  Raises:
+    AttributeError: An error when neither --inputs nor --input_exprs is passed
+    to run command.
+  """
+  if not args.inputs and not args.input_exprs:
+    raise AttributeError(
+        'At least one of --inputs and --input_exprs must be required')
+  tensor_key_feed_dict = load_inputs_from_input_arg_string(
+      args.inputs, args.input_exprs)
+  run_saved_model_with_feed_dict(args.dir, args.tag_set, args.signature_def,
+                                 tensor_key_feed_dict, args.outdir,
+                                 args.overwrite, tf_debug=args.tf_debug)
+
+
+def create_parser():
+  """Creates a parser that parse the command line arguments.
+
+  Returns:
+    A namespace parsed from command line arguments.
+  """
+  parser = argparse.ArgumentParser(
+      description='saved_model_cli: Command-line interface for SavedModel')
+  parser.add_argument('-v', '--version', action='version', version='0.1.0')
+
+  subparsers = parser.add_subparsers(
+      title='commands', description='valid commands', help='additional help')
+
+  # show command
+  show_msg = (
+      'Usage examples:\n'
+      'To show all tag-sets in a SavedModel:\n'
+      '$saved_model_cli show --dir /tmp/saved_model\n'
+      'To show all available SignatureDef keys in a '
+      'MetaGraphDef specified by its tag-set:\n'
+      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve\n'
+      'For a MetaGraphDef with multiple tags in the tag-set, all tags must be '
+      'passed in, separated by \';\':\n'
+      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve,gpu\n\n'
+      'To show all inputs and outputs TensorInfo for a specific'
+      ' SignatureDef specified by the SignatureDef key in a'
+      ' MetaGraph.\n'
+      '$saved_model_cli show --dir /tmp/saved_model --tag_set serve'
+      '--signature_def serving_default\n\n'
+      'To show all available information in the SavedModel\n:'
+      '$saved_model_cli show --dir /tmp/saved_model --all')
+  parser_show = subparsers.add_parser(
+      'show',
+      description=show_msg,
+      formatter_class=argparse.RawTextHelpFormatter)
+  parser_show.add_argument(
+      '--dir',
+      type=str,
+      required=True,
+      help='directory containing the SavedModel to inspect')
+  parser_show.add_argument(
+      '--all',
+      action='store_true',
+      help='if set, will output all information in given SavedModel')
+  parser_show.add_argument(
+      '--tag_set',
+      type=str,
+      default=None,
+      help='tag-set of graph in SavedModel to show, separated by \',\'')
+  parser_show.add_argument(
+      '--signature_def',
+      type=str,
+      default=None,
+      metavar='SIGNATURE_DEF_KEY',
+      help='key of SignatureDef to display input(s) and output(s) for')
+  parser_show.set_defaults(func=show)
+
+  # run command
+  run_msg = ('Usage example:\n'
+             'To run input tensors from files through a MetaGraphDef and save'
+             ' the output tensors to files:\n'
+             '$saved_model_cli show --dir /tmp/saved_model --tag_set serve'
+             '--signature_def serving_default '
+             '--inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy'
+             '--input_exprs \'input3_key=np.ones(2)\' --outdir=/out\n\n'
+             'For more information about input file format, please see:\n'
+             'https://www.tensorflow.org/programmers_guide/saved_model_cli\n')
+  parser_run = subparsers.add_parser(
+      'run', description=run_msg, formatter_class=argparse.RawTextHelpFormatter)
+  parser_run.add_argument(
+      '--dir',
+      type=str,
+      required=True,
+      help='directory containing the SavedModel to execute')
+  parser_run.add_argument(
+      '--tag_set',
+      type=str,
+      required=True,
+      help='tag-set of graph in SavedModel to load, separated by \',\'')
+  parser_run.add_argument(
+      '--signature_def',
+      type=str,
+      required=True,
+      metavar='SIGNATURE_DEF_KEY',
+      help='key of SignatureDef to run')
+  msg = ('Loading inputs from files, in the format of \'<input_key>=<filename>,'
+         ' or \'<input_key>=<filename>[<variable_name>]\', separated by \';\'.'
+         ' The file format can only be from .npy, .npz or pickle.')
+  parser_run.add_argument('--inputs', type=str, default='', help=msg)
+  msg = ('Specifying inputs by python expressions, in the format of'
+         ' "<input_key>=\'<python expression>\'", separated by \';\'. '
+         'numpy module is available as \'np\'. '
+         'Will override duplicate input_keys from --inputs option.')
+  parser_run.add_argument('--input_exprs', type=str, default='', help=msg)
+  parser_run.add_argument(
+      '--outdir',
+      type=str,
+      default=None,
+      help='if specified, output tensor(s) will be saved to given directory')
+  parser_run.add_argument(
+      '--overwrite',
+      action='store_true',
+      help='if set, output file will be overwritten if it already exists.')
+  parser_run.add_argument(
+      '--tf_debug',
+      action='store_true',
+      help='if set, will use TensorFlow Debugger (tfdbg) to watch the '
+           'intermediate Tensors and runtime GraphDefs while running the '
+           'SavedModel.')
+  parser_run.set_defaults(func=run)
+
+  return parser
+
+
+def main():
+  parser = create_parser()
+  args = parser.parse_args()
+  args.func(args)
+
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/tensorflow/python/tools/saved_model_cli_test.py b/tensorflow/python/tools/saved_model_cli_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f79c888ebd3c82affde5d17ff0c5db2232a6c46
--- /dev/null
+++ b/tensorflow/python/tools/saved_model_cli_test.py
@@ -0,0 +1,455 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SavedModelCLI tool.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import os
+import pickle
+import shutil
+import sys
+
+import numpy as np
+from six import StringIO
+
+from tensorflow.python.debug.wrappers import local_cli_wrapper
+from tensorflow.python.platform import test
+from tensorflow.python.tools import saved_model_cli
+
+SAVED_MODEL_PATH = ('cc/saved_model/testdata/half_plus_two/00000123')
+
+
+@contextlib.contextmanager
+def captured_output():
+  new_out, new_err = StringIO(), StringIO()
+  old_out, old_err = sys.stdout, sys.stderr
+  try:
+    sys.stdout, sys.stderr = new_out, new_err
+    yield sys.stdout, sys.stderr
+  finally:
+    sys.stdout, sys.stderr = old_out, old_err
+
+
+class SavedModelCLITestCase(test.TestCase):
+
+  def testShowCommandAll(self):
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    self.parser = saved_model_cli.create_parser()
+    args = self.parser.parse_args(['show', '--dir', base_path, '--all'])
+    with captured_output() as (out, err):
+      saved_model_cli.show(args)
+    output = out.getvalue().strip()
+    # pylint: disable=line-too-long
+    exp_out = """MetaGraphDef with tag-set: 'serve' contains the following SignatureDefs:
+
+signature_def['classify_x2_to_y3']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['inputs'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x2:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['scores'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y3:0
+Method name is: tensorflow/serving/classify
+
+signature_def['classify_x_to_y']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['inputs'] tensor_info:
+    dtype: DT_STRING
+    shape: unknown_rank
+    name: tf_example:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['scores'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y:0
+Method name is: tensorflow/serving/classify
+
+signature_def['regress_x2_to_y3']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['inputs'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x2:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['outputs'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y3:0
+Method name is: tensorflow/serving/regress
+
+signature_def['regress_x_to_y']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['inputs'] tensor_info:
+    dtype: DT_STRING
+    shape: unknown_rank
+    name: tf_example:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['outputs'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y:0
+Method name is: tensorflow/serving/regress
+
+signature_def['regress_x_to_y2']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['inputs'] tensor_info:
+    dtype: DT_STRING
+    shape: unknown_rank
+    name: tf_example:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['outputs'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y2:0
+Method name is: tensorflow/serving/regress
+
+signature_def['serving_default']:
+The given SavedModel SignatureDef contains the following input(s):
+inputs['x'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: x:0
+The given SavedModel SignatureDef contains the following output(s):
+outputs['y'] tensor_info:
+    dtype: DT_FLOAT
+    shape: (-1, 1)
+    name: y:0
+Method name is: tensorflow/serving/predict"""
+    # pylint: enable=line-too-long
+    self.assertMultiLineEqual(output, exp_out)
+    self.assertEqual(err.getvalue().strip(), '')
+
+  def testShowCommandTags(self):
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    self.parser = saved_model_cli.create_parser()
+    args = self.parser.parse_args(['show', '--dir', base_path])
+    with captured_output() as (out, err):
+      saved_model_cli.show(args)
+    output = out.getvalue().strip()
+    exp_out = 'The given SavedModel contains the following tag-sets:\nserve'
+    self.assertMultiLineEqual(output, exp_out)
+    self.assertEqual(err.getvalue().strip(), '')
+
+  def testShowCommandSignature(self):
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    self.parser = saved_model_cli.create_parser()
+    args = self.parser.parse_args(
+        ['show', '--dir', base_path, '--tag_set', 'serve'])
+    with captured_output() as (out, err):
+      saved_model_cli.show(args)
+    output = out.getvalue().strip()
+    exp_header = ('The given SavedModel MetaGraphDef contains SignatureDefs '
+                  'with the following keys:')
+    exp_start = 'SignatureDef key: '
+    exp_keys = [
+        '"classify_x2_to_y3"', '"classify_x_to_y"', '"regress_x2_to_y3"',
+        '"regress_x_to_y"', '"regress_x_to_y2"', '"serving_default"'
+    ]
+    # Order of signatures does not matter
+    self.assertMultiLineEqual(
+        output,
+        '\n'.join([exp_header] + [exp_start + exp_key for exp_key in exp_keys]))
+    self.assertEqual(err.getvalue().strip(), '')
+
+  def testShowCommandErrorNoTagSet(self):
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    self.parser = saved_model_cli.create_parser()
+    args = self.parser.parse_args(
+        ['show', '--dir', base_path, '--tag_set', 'badtagset'])
+    with self.assertRaises(RuntimeError):
+      saved_model_cli.show(args)
+
+  def testShowCommandInputsOutputs(self):
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    self.parser = saved_model_cli.create_parser()
+    args = self.parser.parse_args([
+        'show', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'serving_default'
+    ])
+    with captured_output() as (out, err):
+      saved_model_cli.show(args)
+    output = out.getvalue().strip()
+    expected_output = (
+        'The given SavedModel SignatureDef contains the following input(s):\n'
+        'inputs[\'x\'] tensor_info:\n'
+        '    dtype: DT_FLOAT\n    shape: (-1, 1)\n    name: x:0\n'
+        'The given SavedModel SignatureDef contains the following output(s):\n'
+        'outputs[\'y\'] tensor_info:\n'
+        '    dtype: DT_FLOAT\n    shape: (-1, 1)\n    name: y:0\n'
+        'Method name is: tensorflow/serving/predict')
+    self.assertEqual(output, expected_output)
+    self.assertEqual(err.getvalue().strip(), '')
+
+  def testInputPreProcessFormats(self):
+    input_str = 'input1=/path/file.txt[ab3];input2=file2'
+    input_expr_str = 'input3=np.zeros([2,2]);input4=[4,5]'
+    input_dict = saved_model_cli.preprocess_inputs_arg_string(input_str)
+    input_expr_dict = saved_model_cli.preprocess_input_exprs_arg_string(
+        input_expr_str)
+    self.assertTrue(input_dict['input1'] == ('/path/file.txt', 'ab3'))
+    self.assertTrue(input_dict['input2'] == ('file2', None))
+    self.assertTrue(input_expr_dict['input3'] == 'np.zeros([2,2])')
+    self.assertTrue(input_expr_dict['input4'] == '[4,5]')
+    self.assertTrue(len(input_dict) == 2)
+    self.assertTrue(len(input_expr_dict) == 2)
+
+  def testInputPreProcessFileNames(self):
+    input_str = (r'inputx=C:\Program Files\data.npz[v:0];'
+                 r'input:0=c:\PROGRA~1\data.npy')
+    input_dict = saved_model_cli.preprocess_inputs_arg_string(input_str)
+    print(input_dict)
+    self.assertTrue(input_dict['inputx'] == (r'C:\Program Files\data.npz',
+                                             'v:0'))
+    self.assertTrue(input_dict['input:0'] == (r'c:\PROGRA~1\data.npy', None))
+
+  def testInputPreProcessErrorBadFormat(self):
+    input_str = 'inputx=file[[v1]v2'
+    with self.assertRaises(RuntimeError):
+      saved_model_cli.preprocess_inputs_arg_string(input_str)
+    input_str = 'inputx:file'
+    with self.assertRaises(RuntimeError):
+      saved_model_cli.preprocess_inputs_arg_string(input_str)
+    input_str = 'inputx:np.zeros((5))'
+    with self.assertRaises(RuntimeError):
+      saved_model_cli.preprocess_input_exprs_arg_string(input_str)
+
+  def testInputParserNPY(self):
+    x0 = np.array([[1], [2]])
+    x1 = np.array(range(6)).reshape(2, 3)
+    input0_path = os.path.join(test.get_temp_dir(), 'input0.npy')
+    input1_path = os.path.join(test.get_temp_dir(), 'input1.npy')
+    np.save(input0_path, x0)
+    np.save(input1_path, x1)
+    input_str = 'x0=' + input0_path + '[x0];x1=' + input1_path
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+    self.assertTrue(np.all(feed_dict['x0'] == x0))
+    self.assertTrue(np.all(feed_dict['x1'] == x1))
+
+  def testInputParserNPZ(self):
+    x0 = np.array([[1], [2]])
+    input_path = os.path.join(test.get_temp_dir(), 'input.npz')
+    np.savez(input_path, a=x0)
+    input_str = 'x=' + input_path + '[a];y=' + input_path
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+    self.assertTrue(np.all(feed_dict['x'] == x0))
+    self.assertTrue(np.all(feed_dict['y'] == x0))
+
+  def testInputParserPickle(self):
+    pkl0 = {'a': 5, 'b': np.array(range(4))}
+    pkl1 = np.array([1])
+    pkl2 = np.array([[1], [3]])
+    input_path0 = os.path.join(test.get_temp_dir(), 'pickle0.pkl')
+    input_path1 = os.path.join(test.get_temp_dir(), 'pickle1.pkl')
+    input_path2 = os.path.join(test.get_temp_dir(), 'pickle2.pkl')
+    with open(input_path0, 'wb') as f:
+      pickle.dump(pkl0, f)
+    with open(input_path1, 'wb') as f:
+      pickle.dump(pkl1, f)
+    with open(input_path2, 'wb') as f:
+      pickle.dump(pkl2, f)
+    input_str = 'x=' + input_path0 + '[b];y=' + input_path1 + '[c];'
+    input_str += 'z=' + input_path2
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+    self.assertTrue(np.all(feed_dict['x'] == pkl0['b']))
+    self.assertTrue(np.all(feed_dict['y'] == pkl1))
+    self.assertTrue(np.all(feed_dict['z'] == pkl2))
+
+  def testInputParserPythonExpression(self):
+    x1 = np.ones([2, 10])
+    x2 = np.array([[1], [2], [3]])
+    x3 = np.mgrid[0:5, 0:5]
+    x4 = [[3], [4]]
+    input_expr_str = ('x1=np.ones([2,10]);x2=np.array([[1],[2],[3]]);'
+                      'x3=np.mgrid[0:5,0:5];x4=[[3],[4]]')
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        '', input_expr_str)
+    self.assertTrue(np.all(feed_dict['x1'] == x1))
+    self.assertTrue(np.all(feed_dict['x2'] == x2))
+    self.assertTrue(np.all(feed_dict['x3'] == x3))
+    self.assertTrue(np.all(feed_dict['x4'] == x4))
+
+  def testInputParserBoth(self):
+    x0 = np.array([[1], [2]])
+    input_path = os.path.join(test.get_temp_dir(), 'input.npz')
+    np.savez(input_path, a=x0)
+    x1 = np.ones([2, 10])
+    input_str = 'x0=' + input_path + '[a]'
+    input_expr_str = 'x1=np.ones([2,10])'
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        input_str, input_expr_str)
+    self.assertTrue(np.all(feed_dict['x0'] == x0))
+    self.assertTrue(np.all(feed_dict['x1'] == x1))
+
+  def testInputParserBothDuplicate(self):
+    x0 = np.array([[1], [2]])
+    input_path = os.path.join(test.get_temp_dir(), 'input.npz')
+    np.savez(input_path, a=x0)
+    x1 = np.ones([2, 10])
+    input_str = 'x0=' + input_path + '[a]'
+    input_expr_str = 'x0=np.ones([2,10])'
+    feed_dict = saved_model_cli.load_inputs_from_input_arg_string(
+        input_str, input_expr_str)
+    self.assertTrue(np.all(feed_dict['x0'] == x1))
+
+  def testInputParserErrorNoName(self):
+    x0 = np.array([[1], [2]])
+    x1 = np.array(range(5))
+    input_path = os.path.join(test.get_temp_dir(), 'input.npz')
+    np.savez(input_path, a=x0, b=x1)
+    input_str = 'x=' + input_path
+    with self.assertRaises(RuntimeError):
+      saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+
+  def testInputParserErrorWrongName(self):
+    x0 = np.array([[1], [2]])
+    x1 = np.array(range(5))
+    input_path = os.path.join(test.get_temp_dir(), 'input.npz')
+    np.savez(input_path, a=x0, b=x1)
+    input_str = 'x=' + input_path + '[c]'
+    with self.assertRaises(RuntimeError):
+      saved_model_cli.load_inputs_from_input_arg_string(input_str, '')
+
+  def testRunCommandExistingOutdir(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    x = np.array([[1], [2]])
+    x_notused = np.zeros((6, 3))
+    input_path = os.path.join(test.get_temp_dir(), 'testRunCommand_inputs.npz')
+    np.savez(input_path, x0=x, x1=x_notused)
+    output_file = os.path.join(test.get_temp_dir(), 'outputs.npy')
+    if os.path.exists(output_file):
+      os.remove(output_file)
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'regress_x2_to_y3', '--inputs', 'inputs=' + input_path + '[x0]',
+        '--outdir',
+        test.get_temp_dir()
+    ])
+    saved_model_cli.run(args)
+    y_actual = np.load(output_file)
+    y_expected = np.array([[3.5], [4.0]])
+    self.assertAllClose(y_expected, y_actual)
+
+  def testRunCommandNewOutdir(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    x = np.array([[1], [2]])
+    x_notused = np.zeros((6, 3))
+    input_path = os.path.join(test.get_temp_dir(),
+                              'testRunCommandNewOutdir_inputs.npz')
+    output_dir = os.path.join(test.get_temp_dir(), 'new_dir')
+    if os.path.isdir(output_dir):
+      shutil.rmtree(output_dir)
+    np.savez(input_path, x0=x, x1=x_notused)
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'serving_default', '--inputs', 'x=' + input_path + '[x0]', '--outdir',
+        output_dir
+    ])
+    saved_model_cli.run(args)
+    y_actual = np.load(os.path.join(output_dir, 'y.npy'))
+    y_expected = np.array([[2.5], [3.0]])
+    self.assertAllClose(y_expected, y_actual)
+
+  def testRunCommandOutOverwrite(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    x = np.array([[1], [2]])
+    x_notused = np.zeros((6, 3))
+    input_path = os.path.join(test.get_temp_dir(),
+                              'testRunCommandOutOverwrite_inputs.npz')
+    np.savez(input_path, x0=x, x1=x_notused)
+    output_file = os.path.join(test.get_temp_dir(), 'y.npy')
+    open(output_file, 'a').close()
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'serving_default', '--inputs', 'x=' + input_path + '[x0]', '--outdir',
+        test.get_temp_dir(), '--overwrite'
+    ])
+    saved_model_cli.run(args)
+    y_actual = np.load(output_file)
+    y_expected = np.array([[2.5], [3.0]])
+    self.assertAllClose(y_expected, y_actual)
+
+  def testRunCommandOutputFileExistError(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    x = np.array([[1], [2]])
+    x_notused = np.zeros((6, 3))
+    input_path = os.path.join(test.get_temp_dir(),
+                              'testRunCommandOutOverwrite_inputs.npz')
+    np.savez(input_path, x0=x, x1=x_notused)
+    output_file = os.path.join(test.get_temp_dir(), 'y.npy')
+    open(output_file, 'a').close()
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'serving_default', '--inputs', 'x=' + input_path + '[x0]', '--outdir',
+        test.get_temp_dir()
+    ])
+    with self.assertRaises(RuntimeError):
+      saved_model_cli.run(args)
+
+  def testRunCommandInputNotGivenError(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'serving_default'
+    ])
+    with self.assertRaises(AttributeError):
+      saved_model_cli.run(args)
+
+  def testRunCommandWithDebuggerEnabled(self):
+    self.parser = saved_model_cli.create_parser()
+    base_path = test.test_src_dir_path(SAVED_MODEL_PATH)
+    x = np.array([[1], [2]])
+    x_notused = np.zeros((6, 3))
+    input_path = os.path.join(test.get_temp_dir(),
+                              'testRunCommandNewOutdir_inputs.npz')
+    output_dir = os.path.join(test.get_temp_dir(), 'new_dir')
+    if os.path.isdir(output_dir):
+      shutil.rmtree(output_dir)
+    np.savez(input_path, x0=x, x1=x_notused)
+    args = self.parser.parse_args([
+        'run', '--dir', base_path, '--tag_set', 'serve', '--signature_def',
+        'serving_default', '--inputs', 'x=' + input_path + '[x0]', '--outdir',
+        output_dir, '--tf_debug'
+    ])
+
+    def fake_wrapper_session(sess):
+      return sess
+
+    with test.mock.patch.object(local_cli_wrapper,
+                                'LocalCLIDebugWrapperSession',
+                                side_effect=fake_wrapper_session,
+                                autospec=True) as fake:
+      saved_model_cli.run(args)
+      fake.assert_called_with(test.mock.ANY)
+
+    y_actual = np.load(os.path.join(output_dir, 'y.npy'))
+    y_expected = np.array([[2.5], [3.0]])
+    self.assertAllClose(y_expected, y_actual)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/tools/strip_unused_lib.py b/tensorflow/python/tools/strip_unused_lib.py
index 8f9e20ab8e7f1ca564c4cbac1ad069e4d4439fa2..b1d195607604b406f68b28824564afc642cc43ad 100644
--- a/tensorflow/python/tools/strip_unused_lib.py
+++ b/tensorflow/python/tools/strip_unused_lib.py
@@ -41,14 +41,26 @@ def strip_unused(input_graph_def, input_node_names, output_node_names,
         a list that specifies one value per input node name.
 
   Returns:
-    A GraphDef with all unnecessary ops removed.
+    A `GraphDef` with all unnecessary ops removed.
+
+  Raises:
+    ValueError: If any element in `input_node_names` refers to a tensor instead
+      of an operation.
+    KeyError: If any element in `input_node_names` is not found in the graph.
   """
+  for name in input_node_names:
+    if ":" in name:
+      raise ValueError("Name '%s' appears to refer to a Tensor, "
+                       "not a Operation." % name)
+
   # Here we replace the nodes we're going to override as inputs with
   # placeholders so that any unused nodes that are inputs to them are
   # automatically stripped out by extract_sub_graph().
+  not_found = {name for name in input_node_names}
   inputs_replaced_graph_def = graph_pb2.GraphDef()
   for node in input_graph_def.node:
     if node.name in input_node_names:
+      not_found.remove(node.name)
       placeholder_node = node_def_pb2.NodeDef()
       placeholder_node.op = "Placeholder"
       placeholder_node.name = node.name
@@ -67,6 +79,9 @@ def strip_unused(input_graph_def, input_node_names, output_node_names,
     else:
       inputs_replaced_graph_def.node.extend([copy.deepcopy(node)])
 
+  if not_found:
+    raise KeyError("The following input nodes were not found: %s\n" % not_found)
+
   output_graph_def = graph_util.extract_sub_graph(inputs_replaced_graph_def,
                                                   output_node_names)
   return output_graph_def
diff --git a/tensorflow/python/tools/strip_unused_test.py b/tensorflow/python/tools/strip_unused_test.py
index d492a0e8229802b0cc67855ec9c2f7321b49a1b7..7cf0c3e3ed9b5748b263913566150eff8acf857a 100644
--- a/tensorflow/python/tools/strip_unused_test.py
+++ b/tensorflow/python/tools/strip_unused_test.py
@@ -58,16 +58,25 @@ class StripUnusedTest(test_util.TensorFlowTestCase):
     # routine.
     input_graph_path = os.path.join(self.get_temp_dir(), input_graph_name)
     input_binary = False
-    input_node_names = "wanted_input_node"
     output_binary = True
     output_node_names = "output_node"
     output_graph_path = os.path.join(self.get_temp_dir(), output_graph_name)
 
-    strip_unused_lib.strip_unused_from_files(input_graph_path, input_binary,
-                                             output_graph_path, output_binary,
-                                             input_node_names,
-                                             output_node_names,
-                                             dtypes.float32.as_datatype_enum)
+    def strip(input_node_names):
+      strip_unused_lib.strip_unused_from_files(input_graph_path, input_binary,
+                                               output_graph_path, output_binary,
+                                               input_node_names,
+                                               output_node_names,
+                                               dtypes.float32.as_datatype_enum)
+
+    with self.assertRaises(KeyError):
+      strip("does_not_exist")
+
+    with self.assertRaises(ValueError):
+      strip("wanted_input_node:0")
+
+    input_node_names = "wanted_input_node"
+    strip(input_node_names)
 
     # Now we make sure the variable is now a constant, and that the graph still
     # produces the expected result.
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 111461f78427fd0845a9e3488614fe6520a3db7a..5ae82524a3836d3e2333072ce49a8bff4d35ecce 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -21,8 +21,9 @@ from __future__ import print_function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import training_ops
 
@@ -60,7 +61,10 @@ class AdamOptimizer(optimizer.Optimizer):
 
     The default value of 1e-8 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
-    current good choice is 1.0 or 0.1.
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
 
     The sparse implementation of this algorithm (used when the gradient is an
     IndexedSlices object, typically because of `tf.gather` or an embedding
@@ -77,7 +81,9 @@ class AdamOptimizer(optimizer.Optimizer):
         The exponential decay rate for the 1st moment estimates.
       beta2: A float value or a constant float tensor.
         The exponential decay rate for the 2nd moment estimates.
-      epsilon: A small constant for numerical stability.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
       use_locking: If True use locks for update operations.
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adam".
@@ -111,12 +117,12 @@ class AdamOptimizer(optimizer.Optimizer):
     if (self._beta1_power is None or
         self._beta1_power.graph is not var_list[0].graph):
       with ops.colocate_with(var_list[0]):
-        self._beta1_power = variables.Variable(self._beta1,
-                                               name="beta1_power",
-                                               trainable=False)
-        self._beta2_power = variables.Variable(self._beta2,
-                                               name="beta2_power",
-                                               trainable=False)
+        self._beta1_power = variable_scope.variable(self._beta1,
+                                                    name="beta1_power",
+                                                    trainable=False)
+        self._beta2_power = variable_scope.variable(self._beta2,
+                                                    name="beta2_power",
+                                                    trainable=False)
     # Create slots for the first and second moments.
     for v in var_list:
       self._zeros_slot(v, "m", self._name)
@@ -154,7 +160,7 @@ class AdamOptimizer(optimizer.Optimizer):
         math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
         grad, use_locking=self._use_locking)
 
-  def _apply_sparse(self, grad, var):
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
     beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
     beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
@@ -164,23 +170,39 @@ class AdamOptimizer(optimizer.Optimizer):
     lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, "m")
-    m_scaled_g_values = grad.values * (1 - beta1_t)
+    m_scaled_g_values = grad * (1 - beta1_t)
     m_t = state_ops.assign(m, m * beta1_t,
                            use_locking=self._use_locking)
-    m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
-                                use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
     v = self.get_slot(var, "v")
-    v_scaled_g_values = (grad.values * grad.values) * (1 - beta2_t)
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
     v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
-    v_t = state_ops.scatter_add(v_t, grad.indices, v_scaled_g_values,
-                                use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
     v_sqrt = math_ops.sqrt(v_t)
     var_update = state_ops.assign_sub(var,
                                       lr * m_t / (v_sqrt + epsilon_t),
                                       use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
+  def _apply_sparse(self, grad, var):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking))
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    return self._apply_sparse_shared(
+        grad, var, indices, self._resource_scatter_add)
+
   def _finish(self, update_ops, name_scope):
     # Update the power accumulators.
     with ops.control_dependencies(update_ops):
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index 00ff5d9b9d422c72cedb5479b058d7d1c01b6778..62b171e234eebcb3e12508d8e000f565e5e89903 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -52,7 +52,7 @@ def adam_update_numpy(param,
 
 class AdamOptimizerTest(test.TestCase):
 
-  def testSparse(self):
+  def doTestSparse(self, use_resource=False):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.test_session():
         # Initialize variables for numpy implementation.
@@ -62,8 +62,12 @@ class AdamOptimizerTest(test.TestCase):
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
         grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
 
-        var0 = variables.Variable(var0_np)
-        var1 = variables.Variable(var1_np)
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
         grads0_np_indices = np.array([0, 1], dtype=np.int32)
         grads0 = ops.IndexedSlices(
             constant_op.constant(grads0_np),
@@ -95,6 +99,12 @@ class AdamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
           self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
   def testSparseDevicePlacement(self):
     for index_dtype in [dtypes.int32, dtypes.int64]:
       with self.test_session(force_gpu=test.is_gpu_available()):
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index f13b87dfed68d194cb8893fabf1f99042c375fee..6fd20ce8013faa3e5a3732edc2b668c457914356 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -40,6 +40,7 @@ from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 from tensorflow.python.training.session_run_hook import SessionRunArgs
@@ -124,7 +125,7 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
 
   def __init__(self, tensors, every_n_iter=None, every_n_secs=None,
                formatter=None):
-    """Initializes a LoggingHook monitor.
+    """Initializes a `LoggingTensorHook`.
 
     Args:
       tensors: `dict` that maps string-valued tags to tensors/tensor names,
@@ -189,10 +190,10 @@ class LoggingTensorHook(session_run_hook.SessionRunHook):
 
 
 class StopAtStepHook(session_run_hook.SessionRunHook):
-  """Monitor to request stop at a specified step."""
+  """Hook that requests stop at a specified step."""
 
   def __init__(self, num_steps=None, last_step=None):
-    """Create a StopAtStep Hook.
+    """Initializes a `StopAtStepHook`.
 
     This hook requests stop after either a number of steps have been
     executed or a last step has been reached. Only one of the two options can be
@@ -234,51 +235,48 @@ class StopAtStepHook(session_run_hook.SessionRunHook):
 
 
 class CheckpointSaverListener(object):
-  """An interface for event hooks that depend on a checkpoint.
-
-  CheckpointSaverListeners are similar to SessionRunHooks, and can be useful to
-  track training, report progress, and more.  The distinction is that
-  CheckpointSaverListeners run only in steps when CheckpointSaverHook is
-  triggered, and provide callbacks to run before or after the checkpoint is
-  generated.  This is in contrast to SessionRunHooks, which may run in steps
-  when no checkpoint is written, and which have no guaranteed execution order
-  in any case.  CheckpointSaverListeners use the observer pattern and notify at
-  the following points:
-   - when a session starts being used
+  """Interface for listeners that take action before or after checkpoint save.
+
+  `CheckpointSaverListener` triggers only in steps when `CheckpointSaverHook` is
+  triggered, and provides callbacks at the following points:
+   - before using the session
    - before each call to `Saver.save()`
    - after each call to `Saver.save()`
-   - when the session closed
-
-  Custom CheckpointSaverListeners look like this:
-    class ExampleCheckpointSaverListerner(CheckpointSaverListener):
-      def begin(self):
-        # You can add ops to the graph here.
-        print('Starting the session.')
-        self.your_tensor = ...
-
-      def before_save(self, session, global_step_value):
-        print('About to write a checkpoint')
-
-      def after_save(self, session, global_step_value):
-        print('Done writing checkpoint.')
-
-      def end(self, session, global_step_value):
-        print('Done with the session.')
-
-  A CheckpointSaverListener may simply take some action after every checkpoint.
-  It is also possible for the listener to use its own schedule to act less
-  frequently, based on wall clock time or on global_step_value.  In this case,
-  implementors must be careful about what happens at end().  When end is called,
-  The CheckpointSaverHook will have already triggered after_save() in the same
-  global_step, but the listener may or may not have actually acted on it.
-  The listener may want to be sure to act at end() if there is a fresh
-  checkpoint available, but should not act twice if after_save() already handled
-  it.  In this case, end() should have logic to detect the situation and do the
-  right thing, similar to what CheckpointSaverHook.end() does using
-  self._timer.last_triggered_step().
-
-  To use such listeners, in your `model_fn` return a `CheckpointSaverHook` as
-  part of `training_chief_hooks`.
+   - at the end of session
+
+  To use a listener, implement a class and pass the listener to a
+  `CheckpointSaverHook`, as in this example:
+
+  ```python
+  class ExampleCheckpointSaverListerner(CheckpointSaverListener):
+    def begin(self):
+      # You can add ops to the graph here.
+      print('Starting the session.')
+      self.your_tensor = ...
+
+    def before_save(self, session, global_step_value):
+      print('About to write a checkpoint')
+
+    def after_save(self, session, global_step_value):
+      print('Done writing checkpoint.')
+
+    def end(self, session, global_step_value):
+      print('Done with the session.')
+
+  ...
+  listener = ExampleCheckpointSaverListerner()
+  saver_hook = tf.train.CheckpointSaverHook(
+      checkpoint_dir, listeners=[listener])
+  with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]):
+    ...
+  ```
+
+  A `CheckpointSaverListener` may simply take some action after every
+  checkpoint save. It is also possible for the listener to use its own schedule
+  to act less frequently, e.g. based on global_step_value. In this case,
+  implementors should implement the `end()` method to handle actions related to
+  the last checkpoint save. But the listener should not act twice if
+  `after_save()` already handled this last checkpoint save.
   """
 
   def begin(self):
@@ -305,7 +303,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
                checkpoint_basename="model.ckpt",
                scaffold=None,
                listeners=None):
-    """Initialize CheckpointSaverHook monitor.
+    """Initializes a `CheckpointSaverHook`.
 
     Args:
       checkpoint_dir: `str`, base directory for the checkpoint files.
@@ -315,18 +313,18 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
       checkpoint_basename: `str`, base name for the checkpoint files.
       scaffold: `Scaffold`, use to get saver object.
       listeners: List of `CheckpointSaverListener` subclass instances.
-        Used for callbacks that run immediately after the corresponding
-        CheckpointSaverHook callbacks, only in steps where the
-        CheckpointSaverHook was triggered.
+        Used for callbacks that run immediately before or after this hook saves
+        the checkpoint.
 
     Raises:
       ValueError: One of `save_steps` or `save_secs` should be set.
       ValueError: Exactly one of saver or scaffold should be set.
     """
     logging.info("Create CheckpointSaverHook.")
-    if ((saver is None and scaffold is None) or
-        (saver is not None and scaffold is not None)):
-      raise ValueError("Exactly one of saver or scaffold must be provided.")
+    if saver is not None and scaffold is not None:
+      raise ValueError("You cannot provide both saver and scaffold.")
+    if saver is None and scaffold is None:
+      saver = saver_lib._get_saver_or_default()  # pylint: disable=protected-access
     self._saver = saver
     self._checkpoint_dir = checkpoint_dir
     self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
@@ -401,7 +399,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
 
 
 class StepCounterHook(session_run_hook.SessionRunHook):
-  """Steps per second monitor."""
+  """Hook that counts steps per second."""
 
   def __init__(self,
                every_n_steps=100,
@@ -453,14 +451,13 @@ class NanLossDuringTrainingError(RuntimeError):
 
 
 class NanTensorHook(session_run_hook.SessionRunHook):
-  """NaN Loss monitor.
+  """Monitors the loss tensor and stops training if loss is NaN.
 
-  Monitors loss and stops training if loss is NaN.
   Can either fail with exception or just stop training.
   """
 
   def __init__(self, loss_tensor, fail_on_nan_loss=True):
-    """Initializes NanLoss monitor.
+    """Initializes a `NanTensorHook`.
 
     Args:
       loss_tensor: `Tensor`, the loss tensor.
@@ -494,7 +491,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook):
                summary_writer=None,
                scaffold=None,
                summary_op=None):
-    """Initializes a `SummarySaver` monitor.
+    """Initializes a `SummarySaverHook`.
 
     Args:
       save_steps: `int`, save summaries every N steps. Exactly one of
@@ -590,7 +587,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook):
 
 
 class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
-  """Delay execution until global step reaches to wait_until_step.
+  """Delays execution until global step reaches `wait_until_step`.
 
   This hook delays execution until global step reaches to `wait_until_step`. It
   is used to gradually start workers in distributed settings. One example usage
@@ -599,7 +596,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
   """
 
   def __init__(self, wait_until_step):
-    """Create a _GlobalStepWaiterHook.
+    """Initializes a `GlobalStepWaiterHook`.
 
     Args:
       wait_until_step: an `int` shows until which global step should we wait.
@@ -637,10 +634,10 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
 
 
 class FinalOpsHook(session_run_hook.SessionRunHook):
-  """A run hook which evaluates `Tensors` at the end of a session."""
+  """A hook which evaluates `Tensors` at the end of a session."""
 
   def __init__(self, final_ops, final_ops_feed_dict=None):
-    """Constructs the FinalOpHook with ops to run at the end of the session.
+    """Initializes `FinalOpHook` with ops to run at the end of the session.
 
     Args:
       final_ops: A single `Tensor`, a list of `Tensors` or a dictionary of
@@ -666,10 +663,11 @@ class FeedFnHook(session_run_hook.SessionRunHook):
   """Runs `feed_fn` and sets the `feed_dict` accordingly."""
 
   def __init__(self, feed_fn):
-    """Constructs the FeedFnHook with given `feed_fn`.
+    """Initializes a `FeedFnHook`.
 
     Args:
-      feed_fn: function, no arguments and returns `dict` to feed.
+      feed_fn: function that takes no arguments and returns `dict` of `Tensor`
+        to feed.
     """
     self.feed_fn = feed_fn
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index c2636d46f59c8fee2650934cd5dfec7e542d6b3a..ecb61d447bfca32558826dd3a12994c34abbb272 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -346,6 +346,98 @@ class CheckpointSaverHookTest(test.TestCase):
           'end': 1
       }, listener.get_counts())
 
+  def test_listener_with_monitored_session(self):
+    with ops.Graph().as_default():
+      scaffold = monitored_session.Scaffold()
+      global_step = variables.get_or_create_global_step()
+      train_op = state_ops.assign_add(global_step, 1)
+      listener = MockCheckpointSaverListener()
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=1,
+          scaffold=scaffold,
+          listeners=[listener])
+      with monitored_session.SingularMonitoredSession(
+          hooks=[hook],
+          scaffold=scaffold,
+          checkpoint_dir=self.model_dir) as sess:
+        sess.run(train_op)
+        sess.run(train_op)
+        global_step_val = sess.run(global_step)
+      listener_counts = listener.get_counts()
+    self.assertEqual(2, global_step_val)
+    self.assertEqual({
+        'begin': 1,
+        'before_save': 2,
+        'after_save': 2,
+        'end': 1
+    }, listener_counts)
+
+  def test_listener_with_default_saver(self):
+    with ops.Graph().as_default():
+      global_step = variables.get_or_create_global_step()
+      train_op = state_ops.assign_add(global_step, 1)
+      listener = MockCheckpointSaverListener()
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=1,
+          listeners=[listener])
+      with monitored_session.SingularMonitoredSession(
+          hooks=[hook],
+          checkpoint_dir=self.model_dir) as sess:
+        sess.run(train_op)
+        sess.run(train_op)
+        global_step_val = sess.run(global_step)
+      listener_counts = listener.get_counts()
+    self.assertEqual(2, global_step_val)
+    self.assertEqual({
+        'begin': 1,
+        'before_save': 2,
+        'after_save': 2,
+        'end': 1
+    }, listener_counts)
+
+    with ops.Graph().as_default():
+      global_step = variables.get_or_create_global_step()
+      with monitored_session.SingularMonitoredSession(
+          checkpoint_dir=self.model_dir) as sess2:
+        global_step_saved_val = sess2.run(global_step)
+    self.assertEqual(2, global_step_saved_val)
+
+  def test_two_listeners_with_default_saver(self):
+    with ops.Graph().as_default():
+      global_step = variables.get_or_create_global_step()
+      train_op = state_ops.assign_add(global_step, 1)
+      listener1 = MockCheckpointSaverListener()
+      listener2 = MockCheckpointSaverListener()
+      hook = basic_session_run_hooks.CheckpointSaverHook(
+          self.model_dir,
+          save_steps=1,
+          listeners=[listener1, listener2])
+      with monitored_session.SingularMonitoredSession(
+          hooks=[hook],
+          checkpoint_dir=self.model_dir) as sess:
+        sess.run(train_op)
+        sess.run(train_op)
+        global_step_val = sess.run(global_step)
+      listener1_counts = listener1.get_counts()
+      listener2_counts = listener2.get_counts()
+    self.assertEqual(2, global_step_val)
+    self.assertEqual({
+        'begin': 1,
+        'before_save': 2,
+        'after_save': 2,
+        'end': 1
+    }, listener1_counts)
+    self.assertEqual(listener1_counts, listener2_counts)
+
+    with ops.Graph().as_default():
+      global_step = variables.get_or_create_global_step()
+      with monitored_session.SingularMonitoredSession(
+          checkpoint_dir=self.model_dir) as sess2:
+        global_step_saved_val = sess2.run(global_step)
+    self.assertEqual(2, global_step_saved_val)
+
   @test.mock.patch('time.time')
   def test_save_secs_saves_periodically(self, mock_time):
     # Let's have a realistic start time
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d52cf9a4367dd7728245cbe4fe35b47dd5c0dd25
--- /dev/null
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -0,0 +1,326 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tools to work with checkpoints."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import saver
+from tensorflow.python.training import training as train
+
+__all__ = [
+    "load_checkpoint", "load_variable", "list_variables", "init_from_checkpoint"
+]
+
+
+def load_checkpoint(ckpt_dir_or_file):
+  """Returns `CheckpointReader` for checkpoint found in `ckpt_dir_or_file`.
+
+  If `ckpt_dir_or_file` resolves to a directory with multiple checkpoints,
+  reader for the latest checkpoint is returned.
+
+  Args:
+    ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint
+      file.
+
+  Returns:
+    `CheckpointReader` object.
+
+  Raises:
+    ValueError: If `ckpt_dir_or_file` resolves to a directory with no
+      checkpoints.
+  """
+  filename = _get_checkpoint_filename(ckpt_dir_or_file)
+  if filename is None:
+    raise ValueError("Couldn't find 'checkpoint' file or checkpoints in "
+                     "given directory %s" % ckpt_dir_or_file)
+  return train.NewCheckpointReader(filename)
+
+
+def load_variable(ckpt_dir_or_file, name):
+  """Returns the tensor value of the given variable in the checkpoint.
+
+  Args:
+    ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint.
+    name: Name of the variable to return.
+
+  Returns:
+    A numpy `ndarray` with a copy of the value of this variable.
+  """
+  # TODO(b/29227106): Fix this in the right place and remove this.
+  if name.endswith(":0"):
+    name = name[:-2]
+  reader = load_checkpoint(ckpt_dir_or_file)
+  return reader.get_tensor(name)
+
+
+def list_variables(ckpt_dir_or_file):
+  """Returns list of all variables in the checkpoint.
+
+  Args:
+    ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint.
+
+  Returns:
+    List of tuples `(name, shape)`.
+  """
+  reader = load_checkpoint(ckpt_dir_or_file)
+  variable_map = reader.get_variable_to_shape_map()
+  names = sorted(variable_map.keys())
+  result = []
+  for name in names:
+    result.append((name, variable_map[name]))
+  return result
+
+
+def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
+  """Initializes current variables with tensors loaded from given checkpoint.
+
+  Note: This overrides default initialization ops of specified variables and
+  redefines dtype.
+
+  Assignment map supports following syntax:
+
+  * `'checkpoint_scope_name/': 'scope_name/'` - will load all variables in
+    current `scope_name` from `checkpoint_scope_name` with matching tensor
+    names.
+  * `'checkpoint_scope_name/some_other_variable': 'scope_name/variable_name'` -
+    will initialize `scope_name/variable_name` variable
+    from `checkpoint_scope_name/some_other_variable`.
+  * `'scope_variable_name': variable` - will initialize given `tf.Variable`
+    object with tensor 'scope_variable_name' from the checkpoint.
+  * `'scope_variable_name': list(variable)` - will initialize list of
+    partitioned variables with tensor 'scope_variable_name' from the checkpoint.
+  * `'/': 'scope_name/'` - will load all variables in current `scope_name` from
+    checkpoint's root (e.g. no scope).
+
+  Supports loading into partitioned variables, which are represented as
+  `'<variable>/part_<part #>'`.
+
+  Example:
+
+  ```python
+
+  # Say, '/tmp/model.ckpt' has the following tensors:
+  #  -- name='old_scope_1/var1', shape=[20, 2]
+  #  -- name='old_scope_1/var2', shape=[50, 4]
+  #  -- name='old_scope_2/var3', shape=[100, 100]
+
+  # Create new model's variables
+  with tf.variable_scope('new_scope_1'):
+    var1 = tf.get_variable('var1', shape=[20, 2],
+                           initializer=tf.zeros_initializer())
+  with tf.variable_scope('new_scope_2'):
+    var2 = tf.get_variable('var2', shape=[50, 4],
+                           initializer=tf.zeros_initializer())
+    # Partition into 5 variables along the first axis.
+    var3 = tf.get_variable(name='var3', shape=[100, 100],
+                           initializer=tf.zeros_initializer(),
+                           partitioner=lambda shape, dtype: [5, 1])
+
+  # Initialize all variables in `new_scope_1` from `old_scope_1`.
+  init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/', 'new_scope_1'})
+
+  # Use names to specify which variables to initialize from checkpoint.
+  init_from_checkpoint('/tmp/model.ckpt',
+                       {'old_scope_1/var1': 'new_scope_1/var1',
+                        'old_scope_1/var2': 'new_scope_2/var2'})
+
+  # Or use tf.Variable objects to identify what to initialize.
+  init_from_checkpoint('/tmp/model.ckpt',
+                       {'old_scope_1/var1': var1,
+                        'old_scope_1/var2': var2})
+
+  # Initialize partitioned variables using variable's name
+  init_from_checkpoint('/tmp/model.ckpt',
+                       {'old_scope_2/var3': 'new_scope_2/var3'})
+
+  # Or specify the list of tf.Variable objects.
+  init_from_checkpoint('/tmp/model.ckpt',
+                       {'old_scope_2/var3': var3._get_variable_list()})
+
+  ```
+
+  Args:
+    ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint.
+    assignment_map: Dict, where keys are names of the variables in the
+      checkpoint and values are current variables or names of current variables
+      (in default graph).
+
+  Raises:
+    tf.errors.OpError: If missing checkpoints or tensors in checkpoints.
+    ValueError: If missing variables in current graph.
+  """
+  ckpt_file = _get_checkpoint_filename(ckpt_dir_or_file)
+  reader = load_checkpoint(ckpt_dir_or_file)
+  variable_map = reader.get_variable_to_shape_map()
+  for tensor_name_in_ckpt, current_var_or_name in six.iteritems(assignment_map):
+    var = None
+    # Check if this is Variable object or list of Variable objects (in case of
+    # partitioned variables).
+    is_var = lambda x: isinstance(x, variables.Variable)
+    if is_var(current_var_or_name) or (
+        isinstance(current_var_or_name, list)
+        and all(is_var(v) for v in current_var_or_name)):
+      var = current_var_or_name
+    else:
+      store_vars = vs._get_default_variable_store()._vars  # pylint:disable=protected-access
+      # Check if this variable is in var_store.
+      var = store_vars.get(current_var_or_name, None)
+      # Also check if variable is partitioned as list.
+      if var is None:
+        var = _collect_partitioned_variable(current_var_or_name, store_vars)
+    if var is not None:
+      # If 1 to 1 mapping was provided, find variable in the checkpoint.
+      if tensor_name_in_ckpt not in variable_map:
+        raise ValueError("Tensor %s is not found in %s checkpoint %s" % (
+            tensor_name_in_ckpt, ckpt_dir_or_file, variable_map
+        ))
+      if is_var(var):
+        # Additional at-call-time checks.
+        if not var.get_shape().is_compatible_with(
+            variable_map[tensor_name_in_ckpt]):
+          raise ValueError(
+              "Shape of variable %s (%s) doesn't match with shape of "
+              "tensor %s (%s) from checkpoint reader." % (
+                  var.name, str(var.get_shape()),
+                  tensor_name_in_ckpt, str(variable_map[tensor_name_in_ckpt])
+              ))
+        var_name = var.name
+      else:
+        var_name = ",".join([v.name for v in var])
+      _set_variable_or_list_initializer(var, ckpt_file, tensor_name_in_ckpt)
+      logging.info("Initialize variable %s from checkpoint %s with %s",
+                   var_name, ckpt_dir_or_file, tensor_name_in_ckpt)
+    else:
+      scopes = ""
+      # TODO(vihanjain): Support list of 'current_var_or_name' here.
+      if "/" in current_var_or_name:
+        scopes = current_var_or_name[:current_var_or_name.rindex("/")]
+      if not tensor_name_in_ckpt.endswith("/"):
+        raise ValueError(
+            "Assignment map with scope only name {} should map to scope only "
+            "{}. Should be 'scope/': 'other_scope/'.".format(
+                scopes, tensor_name_in_ckpt))
+      # If scope to scope mapping was provided, find all variables in the scope
+      # and create variable to variable mapping.
+      scope_variables = set()
+      for var_name in store_vars:
+        if not scopes or var_name.startswith(scopes + "/"):
+          # Consume /part_ if partitioned variable.
+          if "/part_" in var_name:
+            var_name = var_name[:var_name.index("/part_")]
+          scope_variables.add(var_name)
+      for var_name in scope_variables:
+        # Lookup name with specified prefix and suffix from current variable.
+        # If tensor_name given is '/' (root), don't use it for full name.
+        full_tensor_name = var_name[len(scopes):]
+        if current_var_or_name != "/":
+          full_tensor_name = full_tensor_name[1:]
+        if tensor_name_in_ckpt != "/":
+          full_tensor_name = tensor_name_in_ckpt + full_tensor_name
+        if full_tensor_name not in variable_map:
+          raise ValueError(
+              "Tensor %s (%s in %s) is not found in %s checkpoint" % (
+                  full_tensor_name, var_name[len(scopes) + 1:],
+                  tensor_name_in_ckpt, ckpt_dir_or_file
+              ))
+        var = store_vars.get(var_name, None)
+        if var is None:
+          var = _collect_partitioned_variable(var_name, store_vars)
+        _set_variable_or_list_initializer(var, ckpt_file, full_tensor_name)
+        logging.info("Initialize variable %s from checkpoint %s with %s",
+                     var_name, ckpt_dir_or_file, full_tensor_name)
+
+
+def _get_checkpoint_filename(ckpt_dir_or_file):
+  """Returns checkpoint filename given directory or specific checkpoint file."""
+  if gfile.IsDirectory(ckpt_dir_or_file):
+    return saver.latest_checkpoint(ckpt_dir_or_file)
+  return ckpt_dir_or_file
+
+
+def _set_checkpoint_initializer(variable,
+                                ckpt_file,
+                                tensor_name,
+                                slice_spec,
+                                name="checkpoint_initializer"):
+  """Overrides given variable's initialization op.
+
+  Sets variable initializer to assign op that initializes variable from tensor's
+  value in the checkpoint.
+
+  Args:
+    variable: `tf.Variable` object.
+    ckpt_file: string, full path of the checkpoint.
+    tensor_name: Name of the tensor to load from the checkpoint.
+    slice_spec: Slice specification for loading partitioned tensors.
+    name: Name of the operation.
+  """
+  base_type = variable.dtype.base_dtype
+  restore_op = io_ops.restore_v2(
+      ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
+  variable._initializer_op = state_ops.assign(variable, restore_op)  # pylint:disable=protected-access
+
+
+def _set_variable_or_list_initializer(variable_or_list, ckpt_file,
+                                      tensor_name):
+  """Overrides initialization op of given variable or list of variables.
+
+  Calls `_set_checkpoint_initializer` for each variable in the given list of
+  variables.
+
+  Args:
+    variable_or_list: `tf.Variable` object or a list of `tf.Variable` objects.
+    ckpt_file: string, full path of the checkpoint.
+    tensor_name: Name of the tensor to load from the checkpoint.
+
+  Raises:
+    ValueError: if all objects in `variable_or_list` are not partitions of the
+      same large variable.
+  """
+  if isinstance(variable_or_list, (list, tuple)):
+    # A set of slices.
+    slice_name = None
+    for v in variable_or_list:
+      slice_info = v._save_slice_info  # pylint:disable=protected-access
+      if slice_name is None:
+        slice_name = slice_info.full_name
+      elif slice_name != slice_info.full_name:
+        raise ValueError("Slices must all be from the same tensor: %s != %s" %
+                         (slice_name, slice_info.full_name))
+      _set_checkpoint_initializer(v, ckpt_file, tensor_name, slice_info.spec)
+  else:
+    _set_checkpoint_initializer(variable_or_list, ckpt_file, tensor_name, "")
+
+
+def _collect_partitioned_variable(name, all_vars):
+  """Returns list of `tf.Variable` that comprise the partitioned variable."""
+  if name + "/part_0" in all_vars:
+    var = []
+    i = 0
+    while name + "/part_%d" % i in all_vars:
+      var.append(all_vars[name + "/part_%d" % i])
+      i += 1
+    return var
+  return None
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0af922c0c949ef6f22a78bcab65d4817fce6750
--- /dev/null
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -0,0 +1,323 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for checkpoints tools."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import saver as saver_lib
+
+
+def _create_checkpoints(sess, checkpoint_dir):
+  checkpoint_prefix = os.path.join(checkpoint_dir, "model")
+  checkpoint_state_name = "checkpoint"
+  v1 = variable_scope.get_variable("var1", [1, 10])
+  v2 = variable_scope.get_variable("var2", [10, 10])
+  v3 = variable_scope.get_variable("var3", [100, 100])
+  with variable_scope.variable_scope("useful_scope"):
+    v4 = variable_scope.get_variable("var4", [9, 9])
+  sess.run(variables.global_variables_initializer())
+  v1_value, v2_value, v3_value, v4_value = sess.run([v1, v2, v3, v4])
+  saver = saver_lib.Saver()
+  saver.save(
+      sess,
+      checkpoint_prefix,
+      global_step=0,
+      latest_filename=checkpoint_state_name)
+  return v1_value, v2_value, v3_value, v4_value
+
+
+def _create_partition_checkpoints(sess, checkpoint_dir):
+  checkpoint_prefix = os.path.join(checkpoint_dir, "model")
+  checkpoint_state_name = "checkpoint"
+  with variable_scope.variable_scope("scope"):
+    v1 = variable_scope.get_variable(
+        name="var1",
+        shape=[100, 100],
+        initializer=init_ops.truncated_normal_initializer(0.5),
+        partitioner=partitioned_variables.min_max_variable_partitioner(
+            max_partitions=5, axis=0, min_slice_size=8 << 10))
+  sess.run(variables.global_variables_initializer())
+  v1_value = sess.run(v1._get_variable_list())
+  saver = saver_lib.Saver()
+  saver.save(
+      sess,
+      checkpoint_prefix,
+      global_step=0,
+      latest_filename=checkpoint_state_name)
+  return v1_value
+
+
+class CheckpointsTest(test.TestCase):
+
+  def testNoCheckpoints(self):
+    checkpoint_dir = self.get_temp_dir() + "/no_checkpoints"
+    with self.assertRaises(errors_impl.OpError):
+      self.assertAllEqual(
+          checkpoint_utils.load_variable(checkpoint_dir, "var1"), [])
+
+  def testNoTensor(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      _, _, _, _ = _create_checkpoints(session, checkpoint_dir)
+    with self.assertRaises(errors_impl.OpError):
+      self.assertAllEqual(
+          checkpoint_utils.load_variable(checkpoint_dir, "var5"), [])
+
+  def testGetTensor(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)
+    self.assertAllEqual(
+        checkpoint_utils.load_variable(checkpoint_dir, "var1"), v1)
+    self.assertAllEqual(
+        checkpoint_utils.load_variable(checkpoint_dir, "var2"), v2)
+    self.assertAllEqual(
+        checkpoint_utils.load_variable(checkpoint_dir, "var3"), v3)
+    self.assertAllEqual(
+        checkpoint_utils.load_variable(checkpoint_dir, "useful_scope/var4"), v4)
+
+  def testGetAllVariables(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      _create_checkpoints(session, checkpoint_dir)
+    self.assertEqual(
+        checkpoint_utils.list_variables(checkpoint_dir),
+        [("useful_scope/var4", [9, 9]), ("var1", [1, 10]), ("var2", [10, 10]),
+         ("var3", [100, 100])])
+
+  def testInitFromCheckpoint(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        with variable_scope.variable_scope("some_scope"):
+          my1 = variable_scope.get_variable("my1", [1, 10])
+          with variable_scope.variable_scope("some_other_scope"):
+            my2 = variable_scope.get_variable("my2", [10, 10])
+            with variable_scope.variable_scope("other_useful_scope"):
+              my4 = variable_scope.get_variable("var4", [9, 9])
+        my3 = variable_scope.get_variable("my3", [100, 100])
+
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
+            "var1": "some_scope/my1",
+            "useful_scope/": "some_scope/some_other_scope/other_useful_scope/",
+        })
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
+            "var2": "some_scope/some_other_scope/my2",
+            "var3": my3,
+        })
+
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(my1.eval(session), v1)
+        self.assertAllEqual(my2.eval(session), v2)
+        self.assertAllEqual(my3.eval(session), v3)
+        self.assertAllEqual(my4.eval(session), v4)
+
+        # Check that tensors are not explicitly in the graph.
+        self.assertLess(len(str(session.graph.as_graph_def())), 27000)
+
+  def testInitWithScopeDoesNotCaptureSuffixes(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      _, _, _, v4 = _create_checkpoints(session, checkpoint_dir)
+
+    with ops.Graph().as_default() as g:
+      with variable_scope.variable_scope("useful_scope"):
+        my4 = variable_scope.get_variable("var4", [9, 9])
+      with variable_scope.variable_scope("useful_scope_1"):
+        my5_init = [[1.0, 2.0], [3.0, 4.0]]
+        my5 = variable_scope.get_variable("var5", initializer=my5_init)
+
+      checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                            {"useful_scope/": "useful_scope/"})
+      with self.test_session(graph=g) as session:
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(my4.eval(session), v4)
+        self.assertAllEqual(my5.eval(session), my5_init)
+
+  def testInitFromRootCheckpoint(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        with variable_scope.variable_scope("some_scope"):
+          my1 = variable_scope.get_variable("var1", [1, 10])
+          my2 = variable_scope.get_variable("var2", [10, 10])
+          my3 = variable_scope.get_variable("var3", [100, 100])
+          with variable_scope.variable_scope("useful_scope"):
+            my4 = variable_scope.get_variable("var4", [9, 9])
+
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                              {"/": "some_scope/",})
+
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(my1.eval(session), v1)
+        self.assertAllEqual(my2.eval(session), v2)
+        self.assertAllEqual(my3.eval(session), v3)
+        self.assertAllEqual(my4.eval(session), v4)
+
+  def testInitToRootCheckpoint(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        my1 = variable_scope.get_variable("var1", [1, 10])
+        my2 = variable_scope.get_variable("var2", [10, 10])
+        my3 = variable_scope.get_variable("var3", [100, 100])
+        with variable_scope.variable_scope("useful_scope"):
+          my4 = variable_scope.get_variable("var4", [9, 9])
+
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                              {"/": "/",})
+
+        session.run(variables.global_variables_initializer())
+        self.assertAllEqual(my1.eval(session), v1)
+        self.assertAllEqual(my2.eval(session), v2)
+        self.assertAllEqual(my3.eval(session), v3)
+        self.assertAllEqual(my4.eval(session), v4)
+
+  def testInitFromPartitionVar(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      v1 = _create_partition_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        with variable_scope.variable_scope("some_scope"):
+          my1 = variable_scope.get_variable(
+              name="my1",
+              shape=[100, 100],
+              initializer=init_ops.zeros_initializer(),
+              partitioner=partitioned_variables.min_max_variable_partitioner(
+                  max_partitions=5, axis=0, min_slice_size=8 << 10))
+          my1_var_list = my1._get_variable_list()
+        # Create another variable with different partitions than the variable in
+        # the checkpoint.
+        with variable_scope.variable_scope("some_other_scope"):
+          my2 = variable_scope.get_variable(
+              name="var1",
+              shape=[100, 100],
+              initializer=init_ops.zeros_initializer(),
+              partitioner=partitioned_variables.min_max_variable_partitioner(
+                  max_partitions=5, axis=0, min_slice_size=16 << 10))
+          my2_var_list = my2._get_variable_list()
+
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
+            "scope/var1": "some_scope/my1",
+            "scope/": "some_other_scope/"})
+
+        session.run(variables.global_variables_initializer())
+        my1_values = session.run(my1_var_list)
+        self.assertAllEqual(my1_values, v1)
+        my2_values = session.run(my2_var_list)
+        # Verify we created different number of partitions.
+        self.assertNotEquals(len(my2_values), len(v1))
+        # Verify the values were correctly initialized inspite of different
+        # partitions.
+        full_my2_values = np.concatenate(my2_values, axis=0)
+        full_v1_values = np.concatenate(v1, axis=0)
+        self.assertAllEqual(full_my2_values, full_v1_values)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        with variable_scope.variable_scope("some_scope"):
+          my1 = variable_scope.get_variable(
+              name="my1",
+              shape=[100, 100],
+              initializer=init_ops.truncated_normal_initializer(0.5),
+              partitioner=partitioned_variables.min_max_variable_partitioner(
+                  max_partitions=5, axis=0, min_slice_size=8 << 10))
+          my1_var_list = my1._get_variable_list()
+
+        checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                              {"scope/var1": my1_var_list,})
+
+        session.run(variables.global_variables_initializer())
+        my1_values = session.run(my1_var_list)
+        self.assertAllEqual(my1_values, v1)
+
+  def testInitFromCheckpointMissing(self):
+    checkpoint_dir = self.get_temp_dir()
+    with self.test_session() as session:
+      _, _, _, _ = _create_checkpoints(session, checkpoint_dir)
+
+    # New graph and session.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as session:
+        with variable_scope.variable_scope("some_scope"):
+          _ = variable_scope.get_variable("my1", [10, 10])
+          _ = variable_scope.get_variable(
+              "my2", [1, 10],
+              dtype=dtypes.int64,
+              initializer=init_ops.zeros_initializer())
+
+        # No directory.
+        with self.assertRaises(errors_impl.OpError):
+          checkpoint_utils.init_from_checkpoint("no_dir",
+                                                {"var1": "some_scope/my1"})
+
+        # No variable in checkpoint.
+        with self.assertRaises(ValueError):
+          checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                                {"no_var": "some_scope/my1"})
+
+        # No variable in the graph.
+        with self.assertRaises(ValueError):
+          checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                                {"var3": "some_scope/no_var"})
+
+        # Shape mismatch.
+        with self.assertRaises(ValueError):
+          checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                                {"var1": "some_scope/my1"})
+
+        # Variable 'my1' and 'my2' are missing in given checkpoint scope.
+        with self.assertRaises(ValueError):
+          checkpoint_utils.init_from_checkpoint(
+              checkpoint_dir, {"useful_scope/": "some_scope/"})
+
+        # Mapping is not to scope name.
+        with self.assertRaises(ValueError):
+          checkpoint_utils.init_from_checkpoint(checkpoint_dir,
+                                                {"useful_scope": "some_scope/"})
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index 2863afb21e2b489ae53e4bbe821b6e5561e5cda2..23e863876464886855e1db671da6f02fbebeafbb 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -62,7 +62,7 @@ class Coordinator(object):
   #### Exception handling:
 
   A thread can report an exception to the coordinator as part of the
-  `should_stop()` call.  The exception will be re-raised from the
+  `request_stop()` call.  The exception will be re-raised from the
   `coord.join()` call.
 
   Thread code:
@@ -106,7 +106,7 @@ class Coordinator(object):
   After a thread has called `coord.request_stop()` the other threads have a
   fixed time to stop, this is called the 'stop grace period' and defaults to 2
   minutes.  If any of the threads is still alive after the grace period expires
-  `coord.join()` raises a RuntimeException reporting the laggards.
+  `coord.join()` raises a RuntimeError reporting the laggards.
 
   ```python
   try:
@@ -117,7 +117,7 @@ class Coordinator(object):
     ...start thread N...(coord, ...)
     # Wait for all the threads to terminate, give them 10s grace period
     coord.join(threads, stop_grace_period_secs=10)
-  except RuntimeException:
+  except RuntimeError:
     ...one of the threads took more than 10s to stop after request_stop()
     ...was called.
   except Exception:
@@ -366,7 +366,7 @@ class Coordinator(object):
     # If any thread is still alive, wait for the grace period to expire.
     # By the time this check is executed, threads may still be shutting down,
     # so we add a sleep of increasing duration to give them a chance to shut
-    # down without loosing too many cycles.
+    # down without losing too many cycles.
     # The sleep duration is limited to the remaining grace duration.
     stop_wait_secs = 0.001
     while any(t.is_alive() for t in threads) and stop_grace_period_secs >= 0.0:
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index 7f403f49275692687dcd33b4d2ec029fb1b04357..02155a98d7d6e38cca6df77b52d66828181e593f 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -94,31 +94,31 @@ class _ReplicaDeviceChooser(object):
     Returns:
       The device to use for the `Operation`.
     """
+    # If we don't return early here, either merge_devices is True, or op.device
+    # is empty (in which case merging is a no-op). So we can always merge below.
     if not self._merge_devices and op.device:
       return op.device
+
     current_device = pydev.DeviceSpec.from_string(op.device or "")
-    spec = pydev.DeviceSpec()
-    if self._ps_tasks and self._ps_device:
-      node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
-      if node_def.op in self._ps_ops:
-        device_string = "%s/task:%d" % (
-            self._ps_device, self._ps_strategy(op))
-        if self._merge_devices:
-          spec = pydev.DeviceSpec.from_string(device_string)
-          spec.merge_from(current_device)
-          return spec.to_string()
-        else:
-          return device_string
-    if self._worker_device:
-      if not self._merge_devices:
-        return self._worker_device
-      spec = pydev.DeviceSpec.from_string(self._worker_device)
-
-    if not self._merge_devices:
-      return ""
-
-    spec.merge_from(current_device)
-    return spec.to_string()
+
+    # The ps_device will be used for specified ops (ps_ops) whenever it is
+    # present and ps_tasks is non-zero. However, its task number will only be
+    # set (using ps_strategy) if there is a job field in ps_device that won't be
+    # changed by the job field (if present) in current_device.
+    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
+    if self._ps_tasks and self._ps_device and node_def.op in self._ps_ops:
+      ps_device = pydev.DeviceSpec.from_string(self._ps_device)
+
+      current_job, ps_job = current_device.job, ps_device.job
+      if ps_job and (not current_job or current_job == ps_job):
+        ps_device.task = self._ps_strategy(op)
+
+      ps_device.merge_from(current_device)
+      return ps_device.to_string()
+
+    worker_device = pydev.DeviceSpec.from_string(self._worker_device or "")
+    worker_device.merge_from(current_device)
+    return worker_device.to_string()
 
 
 def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
@@ -186,7 +186,7 @@ def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
       cluster_spec = cluster.as_dict()
     else:
       cluster_spec = server_lib.ClusterSpec(cluster).as_dict()
-    # Get ps_job_name from ps_device by striping "/job:".
+    # Get ps_job_name from ps_device by stripping "/job:".
     ps_job_name = pydev.DeviceSpec.from_string(ps_device).job
     if ps_job_name not in cluster_spec or cluster_spec[ps_job_name] is None:
       return None
@@ -198,7 +198,7 @@ def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
   if ps_ops is None:
     # TODO(sherrym): Variables in the LOCAL_VARIABLES collection should not be
     # placed in the parameter server.
-    ps_ops = ["Variable", "VariableV2"]
+    ps_ops = ["Variable", "VariableV2", "VarHandleOp"]
 
   if not merge_devices:
     logging.warning(
diff --git a/tensorflow/python/training/device_setter_test.py b/tensorflow/python/training/device_setter_test.py
index e05f0f6a1c7028bd271dbaafcc14bdc3225e7ec8..85b75502ab0943013f12a34002e72b71d187bf68 100644
--- a/tensorflow/python/training/device_setter_test.py
+++ b/tensorflow/python/training/device_setter_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import device_setter
@@ -46,6 +47,12 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker/cpu:0", a.device)
 
+  def testResource(self):
+    with ops.device(
+        device_setter.replica_device_setter(cluster=self._cluster_spec)):
+      v = resource_variable_ops.ResourceVariable([1, 2])
+      self.assertDeviceEqual("/job:ps/task:0", v.device)
+
   def testPS2TasksWithClusterSpecClass(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
@@ -58,6 +65,50 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  def testPS2TasksPinVariableToJob(self):
+    with ops.device(
+        device_setter.replica_device_setter(cluster=self._cluster_spec)):
+      v = variables.Variable([1, 2])
+      with ops.device("/job:moon"):
+        w = variables.Variable([2, 1])
+        with ops.device("/job:ps"):  # Explicit PS job will get task set.
+          x = variables.Variable([0, 1])
+      a = v + w + x
+      self.assertDeviceEqual("/job:ps/task:0", v.device)
+      self.assertDeviceEqual("/job:ps/task:0", v.initializer.device)
+      self.assertDeviceEqual("/job:moon", w.device)
+      self.assertDeviceEqual("/job:moon", w.initializer.device)
+      self.assertDeviceEqual("/job:ps/task:1", x.device)
+      self.assertDeviceEqual("/job:ps/task:1", x.initializer.device)
+      self.assertDeviceEqual("/job:worker", a.device)
+
+  def testPS2TasksUseCpuForPS(self):
+    with ops.device(
+        device_setter.replica_device_setter(ps_tasks=1, ps_device="/cpu:0")):
+      v = variables.Variable([1, 2])
+      with ops.device("/job:moon"):
+        w = variables.Variable([2, 1])
+      a = v + w
+      self.assertDeviceEqual("/cpu:0", v.device)
+      self.assertDeviceEqual("/cpu:0", v.initializer.device)
+      self.assertDeviceEqual("/job:moon/cpu:0", w.device)
+      self.assertDeviceEqual("/job:moon/cpu:0", w.initializer.device)
+      self.assertDeviceEqual("/job:worker", a.device)
+
+  def testPS2TasksNoMerging(self):
+    with ops.device(
+        device_setter.replica_device_setter(
+            cluster=self._cluster_spec, merge_devices=False)):
+      v = variables.Variable([1, 2])
+      with ops.device("/job:ps"):  # Won't assign task when merge_devices=False.
+        w = variables.Variable([2, 1])
+      a = v + w
+      self.assertDeviceEqual("/job:ps/task:0", v.device)
+      self.assertDeviceEqual("/job:ps/task:0", v.initializer.device)
+      self.assertDeviceEqual("/job:ps", w.device)
+      self.assertDeviceEqual("/job:ps", w.initializer.device)
+      self.assertDeviceEqual("/job:worker", a.device)
+
   def testPS2TasksWithClusterSpecDict(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec.as_dict(
diff --git a/tensorflow/python/training/evaluation.py b/tensorflow/python/training/evaluation.py
index 7c46591d07b19303be395220f96181d462942ff5..bbaa3931c20ecb0659da7abb4018b4704d891058 100644
--- a/tensorflow/python/training/evaluation.py
+++ b/tensorflow/python/training/evaluation.py
@@ -113,7 +113,7 @@ def _evaluate_once(checkpoint_path,
 
   One may also consider using a `tf.contrib.training.SummaryAtEndHook` to record
   summaries after the `eval_ops` have run. If `eval_ops` is `None`, the
-  summaries run immedietly after the model checkpoint has been restored.
+  summaries run immediately after the model checkpoint has been restored.
 
   Note that `evaluate_once` creates a local variable used to track the number of
   evaluations run via `tf.contrib.training.get_or_create_eval_step`.
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index f5b895961207472bcc3f60d85bfaa2ea4f617f61..618f3baf089bdb11f8931d3f078983a7a3b9ac96 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Ftrl-proximal for TensorFlow."""
 from __future__ import absolute_import
 from __future__ import division
@@ -32,12 +31,16 @@ class FtrlOptimizer(optimizer.Optimizer):
   https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf).
   """
 
-  def __init__(self, learning_rate,
+  def __init__(self,
+               learning_rate,
                learning_rate_power=-0.5,
                initial_accumulator_value=0.1,
                l1_regularization_strength=0.0,
                l2_regularization_strength=0.0,
-               use_locking=False, name="Ftrl"):
+               use_locking=False,
+               name="Ftrl",
+               accum_name=None,
+               linear_name=None):
     """Construct a new FTRL optimizer.
 
     Args:
@@ -52,6 +55,10 @@ class FtrlOptimizer(optimizer.Optimizer):
       use_locking: If `True` use locks for update operations.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Ftrl".
+      accum_name: The suffix for the variable that keeps the gradient squared
+        accumulator.  If not present, defaults to name.
+      linear_name: The suffix for the variable that keeps the linear gradient
+        accumulator.  If not present, defaults to name + "_1".
 
     Raises:
       ValueError: If one of the arguments is invalid.
@@ -82,35 +89,36 @@ class FtrlOptimizer(optimizer.Optimizer):
     self._learning_rate_power_tensor = None
     self._l1_regularization_strength_tensor = None
     self._l2_regularization_strength_tensor = None
+    self._accum_name = accum_name
+    self._linear_name = linear_name
 
   def _create_slots(self, var_list):
     # Create the "accum" and "linear" slots.
     for v in var_list:
       with ops.colocate_with(v):
-        val = constant_op.constant(self._initial_accumulator_value,
-                                   dtype=v.dtype, shape=v.get_shape())
-        self._get_or_make_slot(v, val, "accum", self._name)
-        self._zeros_slot(v, "linear", self._name)
+        val = constant_op.constant(
+            self._initial_accumulator_value, dtype=v.dtype, shape=v.get_shape())
+        self._get_or_make_slot(v, val, "accum", self._accum_name or self._name)
+        self._zeros_slot(v, "linear", self._linear_name or self._name)
 
   def _prepare(self):
     self._learning_rate_tensor = ops.convert_to_tensor(
-        self._learning_rate,
-        name="learning_rate")
+        self._learning_rate, name="learning_rate")
     self._l1_regularization_strength_tensor = ops.convert_to_tensor(
-        self._l1_regularization_strength,
-        name="l1_regularization_strength")
+        self._l1_regularization_strength, name="l1_regularization_strength")
     self._l2_regularization_strength_tensor = ops.convert_to_tensor(
-        self._l2_regularization_strength,
-        name="l2_regularization_strength")
+        self._l2_regularization_strength, name="l2_regularization_strength")
     self._learning_rate_power_tensor = ops.convert_to_tensor(
-        self._learning_rate_power,
-        name="learning_rate_power")
+        self._learning_rate_power, name="learning_rate_power")
 
   def _apply_dense(self, grad, var):
     accum = self.get_slot(var, "accum")
     linear = self.get_slot(var, "linear")
     return training_ops.apply_ftrl(
-        var, accum, linear, grad,
+        var,
+        accum,
+        linear,
+        grad,
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       var.dtype.base_dtype),
@@ -123,7 +131,10 @@ class FtrlOptimizer(optimizer.Optimizer):
     accum = self.get_slot(var, "accum")
     linear = self.get_slot(var, "linear")
     return training_ops.resource_apply_ftrl(
-        var.handle, accum.handle, linear.handle, grad,
+        var.handle,
+        accum.handle,
+        linear.handle,
+        grad,
         math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       grad.dtype.base_dtype),
@@ -136,7 +147,11 @@ class FtrlOptimizer(optimizer.Optimizer):
     accum = self.get_slot(var, "accum")
     linear = self.get_slot(var, "linear")
     return training_ops.sparse_apply_ftrl(
-        var, accum, linear, grad.values, grad.indices,
+        var,
+        accum,
+        linear,
+        grad.values,
+        grad.indices,
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._l1_regularization_strength_tensor,
                       var.dtype.base_dtype),
@@ -149,11 +164,13 @@ class FtrlOptimizer(optimizer.Optimizer):
     accum = self.get_slot(var, "accum")
     linear = self.get_slot(var, "linear")
     return training_ops.resource_sparse_apply_ftrl(
-        var.handle, accum.handle, linear.handle, grad, indices,
+        var.handle,
+        accum.handle,
+        linear.handle,
+        grad,
+        indices,
         math_ops.cast(self._learning_rate_tensor, grad.dtype),
-        math_ops.cast(self._l1_regularization_strength_tensor,
-                      grad.dtype),
-        math_ops.cast(self._l2_regularization_strength_tensor,
-                      grad.dtype),
+        math_ops.cast(self._l1_regularization_strength_tensor, grad.dtype),
+        math_ops.cast(self._l2_regularization_strength_tensor, grad.dtype),
         math_ops.cast(self._learning_rate_power_tensor, grad.dtype),
         use_locking=self._use_locking)
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 06e21fb42044767c9dda1dfe08a1fd2d70d942e8..e9fe9215ae4e3cd61751b3844e9b911d0ca61521 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -40,7 +40,7 @@ from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.summary import summary
 from tensorflow.python.training import queue_runner
 
@@ -63,9 +63,10 @@ def match_filenames_once(pattern, name=None):
     A variable that is initialized to the list of files matching the pattern(s).
   """
   with ops.name_scope(name, "matching_filenames", [pattern]) as name:
-    return variables.Variable(io_ops.matching_files(pattern), trainable=False,
-                              name=name, validate_shape=False,
-                              collections=[ops.GraphKeys.LOCAL_VARIABLES])
+    return vs.variable(
+        name=name, initial_value=io_ops.matching_files(pattern),
+        trainable=False, validate_shape=False,
+        collections=[ops.GraphKeys.LOCAL_VARIABLES])
 
 
 def limit_epochs(tensor, num_epochs=None, name=None):
@@ -92,7 +93,7 @@ def limit_epochs(tensor, num_epochs=None, name=None):
     raise ValueError("num_epochs must be > 0 not %d." % num_epochs)
   with ops.name_scope(name, "limit_epochs", [tensor]) as name:
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
-    epochs = variables.Variable(
+    epochs = vs.variable(
         zero64, name="epochs", trainable=False,
         collections=[ops.GraphKeys.LOCAL_VARIABLES])
     counter = epochs.count_up_to(num_epochs)
@@ -196,7 +197,10 @@ def string_input_producer(string_tensor,
     seed: An integer (optional). Seed used if shuffle == True.
     capacity: An integer. Sets the queue capacity.
     shared_name: (optional). If set, this queue will be shared under the given
-      name across multiple sessions.
+      name across multiple sessions. All sessions open to the device which has
+      this queue will be able to access it via the shared_name. Using this in
+      a distributed setting means each name will only be seen by one of the
+      sessions which has access to this operation.
     name: A name for the operations (optional).
     cancel_op: Cancel op for the queue (optional).
 
@@ -875,13 +879,11 @@ def batch(tensors, batch_size, num_threads=1, capacity=32,
   `get_shape` method will have a first `Dimension` value of `None`, and
   operations that depend on fixed batch_size would fail.
 
-  Note: if `num_epochs` is not `None`, this function creates local counter
-  `epochs`. Use `local_variables_initializer()` to initialize local variables.
-
   Args:
     tensors: The list or dictionary of tensors to enqueue.
     batch_size: The new batch size pulled from the queue.
-    num_threads: The number of threads enqueuing `tensors`.
+    num_threads: The number of threads enqueuing `tensors`.  The batching will
+      be nondeterministic if `num_threads > 1`.
     capacity: An integer. The maximum number of elements in the queue.
     enqueue_many: Whether each tensor in `tensors` is a single example.
     shapes: (Optional) The shapes for each example.  Defaults to the
@@ -933,7 +935,8 @@ def maybe_batch(tensors, keep_input, batch_size, num_threads=1, capacity=32,
       corresponding value in `keep_input` is `True`. This tensor essentially
       acts as a filtering mechanism.
     batch_size: The new batch size pulled from the queue.
-    num_threads: The number of threads enqueuing `tensors`.
+    num_threads: The number of threads enqueuing `tensors`.  The batching will
+      be nondeterministic if `num_threads > 1`.
     capacity: An integer. The maximum number of elements in the queue.
     enqueue_many: Whether each tensor in `tensors` is a single example.
     shapes: (Optional) The shapes for each example.  Defaults to the
@@ -977,6 +980,9 @@ def batch_join(tensors_list, batch_size, capacity=32, enqueue_many=False,
   dictionaries of tensors.  Each element in the list is treated similarly
   to the `tensors` argument of `tf.train.batch()`.
 
+  WARNING: This function is nondeterministic, since it starts a separate thread
+  for each tensor.
+
   Enqueues a different list of tensors in different threads.
   Implemented using a queue -- a `QueueRunner` for the queue
   is added to the current `Graph`'s `QUEUE_RUNNER` collection.
@@ -1172,9 +1178,6 @@ def shuffle_batch(tensors, batch_size, capacity, min_after_dequeue,
   `get_shape` method will have a first `Dimension` value of `None`, and
   operations that depend on fixed batch_size would fail.
 
-  Note: if `num_epochs` is not `None`, this function creates local counter
-  `epochs`. Use `local_variables_initializer()` to initialize local variables.
-
   Args:
     tensors: The list or dictionary of tensors to enqueue.
     batch_size: The new batch size pulled from the queue.
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index e7289504223a8b0ecda77f51134e82719cacde3f..8232882822d91d24e20f5e88e11b1566a4207056 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -23,6 +23,8 @@ import math
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_state_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
@@ -40,7 +42,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
 
   def testStaircase(self):
     with self.test_session():
-      step = gen_state_ops._variable(shape=[], dtype=dtypes.int32, 
+      step = gen_state_ops._variable(shape=[], dtype=dtypes.int32,
           name="step", container="", shared_name="")
       assign_100 = state_ops.assign(step, 100)
       assign_1 = state_ops.assign(step, 1)
diff --git a/tensorflow/python/training/localhost_cluster_performance_test.py b/tensorflow/python/training/localhost_cluster_performance_test.py
index 9de681837d0d03617312e7f778dcad3b48c94a6e..7c097b943d05cd1a049886af6ef1d018d7b2c9ab 100644
--- a/tensorflow/python/training/localhost_cluster_performance_test.py
+++ b/tensorflow/python/training/localhost_cluster_performance_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import time
 
 import numpy as np
-import portpicker
 
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import dtypes
@@ -31,37 +30,12 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import device_setter
-from tensorflow.python.training import server_lib
-
-
-def create_local_cluster(num_workers, num_ps, protocol="grpc"):
-  """Create local GRPC servers and return their servers."""
-  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
-  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
-  cluster_dict = {
-      "worker": ["localhost:%s" % port for port in worker_ports],
-      "ps": ["localhost:%s" % port for port in ps_ports]
-  }
-  cs = server_lib.ClusterSpec(cluster_dict)
-
-  workers = [
-      server_lib.Server(
-          cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
-      for ix in range(num_workers)
-  ]
-  ps_servers = [
-      server_lib.Server(
-          cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
-      for ix in range(num_ps)
-  ]
-
-  return workers, ps_servers
 
 
 class CreateLocalClusterTest(test.TestCase):
 
   def testCreateLocalCluster(self):
-    workers, _ = create_local_cluster(num_workers=2, num_ps=2)
+    workers, _ = test.create_local_cluster(num_workers=2, num_ps=2)
     worker_sessions = [session_lib.Session(w.target) for w in workers]
     with ops.device("/job:ps/task:0"):
       var0 = variables.Variable(0.0)
@@ -88,7 +62,7 @@ class CreateLocalClusterBenchmark(test.Benchmark):
     iters = 5
     for _ in range(iters):
       start_time = time.time()
-      create_local_cluster(num_workers=1, num_ps=10)
+      test.create_local_cluster(num_workers=1, num_ps=10)
       end_time = time.time()
       deltas.append(end_time - start_time)
 
@@ -104,7 +78,7 @@ class CreateLocalClusterBenchmark(test.Benchmark):
 class PartitionedVariablesBenchmark(test.Benchmark):
 
   def benchmark_create_1000_partitions_with_100_parameter_servers(self):
-    workers, _ = create_local_cluster(num_workers=1, num_ps=100)
+    workers, _ = test.create_local_cluster(num_workers=1, num_ps=100)
     worker_sessions = [session_lib.Session(w.target) for w in workers]
     worker = worker_sessions[0]
     partition_sizes = (1, 512, 1024 * 32, 1024 * 128)
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index e42f3b639cac526a375472452bbf544688cfa016..ffd7c12c427aefc531cd785351993cea05a512e1 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -28,8 +28,11 @@ class MomentumOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Momentum algorithm.
 
   Computes (if `use_nesterov = False`):
-    accumulation = momentum * accumulation + gradient
-    variable -= learning_rate * accumulation
+  
+  ```
+  accumulation = momentum * accumulation + gradient
+  variable -= learning_rate * accumulation
+  ```
 
   Note that in the dense version of this algorithm, `accumulation` is updated
   and applied regardless of a gradient's value, whereas the sparse version (when
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index ae76a1ab580c9e53595e8bf771bce3d61047a4da..b184aae1590ca1a8ba86536e3eeb70482a821f15 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -22,12 +22,11 @@ from __future__ import print_function
 import abc
 
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -103,7 +102,8 @@ class Scaffold(object):
                ready_for_local_init_op=None,
                local_init_op=None,
                summary_op=None,
-               saver=None):
+               saver=None,
+               copy_from_scaffold=None):
     """Create a scaffold.
 
     Args:
@@ -126,22 +126,42 @@ class Scaffold(object):
         string tensor containing a serialized `Summary` proto.
       saver: Optional `tf.train.Saver` object to use to save and restore
         variables.
+      copy_from_scaffold: Optional scaffold object to copy fields from. Its
+        fields will be overwritten by the provided fields in this function.
     """
+    if copy_from_scaffold is not None:
+      if not isinstance(copy_from_scaffold, Scaffold):
+        raise TypeError('copy_from_scaffold is not a Scaffold instance.')
+      # We need _coalesce since Tensor is not converted to bool automatically,
+      # so the common idiom of (a or b) does not work.
+      coalesce = lambda a, b: a if a is not None else b
+      init_op = coalesce(init_op, copy_from_scaffold.init_op)
+      init_feed_dict = coalesce(init_feed_dict,
+                                copy_from_scaffold.init_feed_dict)
+      # Use the original init_fn provided by the user to init the new Scaffold.
+      init_fn = coalesce(init_fn, copy_from_scaffold._user_init_fn)  # pylint: disable=protected-access
+      ready_op = coalesce(ready_op, copy_from_scaffold.ready_op)
+      ready_for_local_init_op = coalesce(
+          ready_for_local_init_op, copy_from_scaffold.ready_for_local_init_op)
+      local_init_op = coalesce(local_init_op, copy_from_scaffold.local_init_op)
+      summary_op = coalesce(summary_op, copy_from_scaffold.summary_op)
+      saver = coalesce(saver, copy_from_scaffold.saver)
 
     # NOTE(touts): modifying the init function to be passed the scaffold is a
     # hack to make it easy to find the saver.  Is there a better way?
+    self._user_init_fn = init_fn
     if init_fn:
       self._init_fn = lambda sess: init_fn(self, sess)
     else:
       self._init_fn = None
 
     self._init_op = init_op
+    self._init_feed_dict = init_feed_dict
     self._ready_op = ready_op
     self._ready_for_local_init_op = ready_for_local_init_op
     self._local_init_op = local_init_op
     self._summary_op = summary_op
     self._saver = saver
-    self._init_feed_dict = init_feed_dict
 
   def finalize(self):
     """Creates operations if needed and finalizes the graph."""
@@ -180,11 +200,7 @@ class Scaffold(object):
                                                  summary.merge_all)
     # pylint: disable=g-long-lambda
     if self._saver is None:
-      self._saver = Scaffold.get_or_default(
-          'saver',
-          ops.GraphKeys.SAVERS,
-          lambda: training_saver.Saver(sharded=True, allow_empty=True,
-                                       write_version=saver_pb2.SaverDef.V2))
+      self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
     # pylint: enable=g-long-lambda
     self._saver.build()
 
@@ -243,7 +259,7 @@ class Scaffold(object):
   @staticmethod
   def _default_local_init_op():
     return control_flow_ops.group(variables.local_variables_initializer(),
-                                  data_flow_ops.tables_initializer())
+                                  lookup_ops.tables_initializer())
 
 
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
@@ -257,7 +273,7 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              save_summaries_secs=None,
                              config=None,
                              stop_grace_period_secs=120,
-                             log_step_count_steps=10000):
+                             log_step_count_steps=100):
   """Creates a `MonitoredSession` for training.
 
   For a chief, this utility sets proper session initializer/restorer. It also
@@ -427,7 +443,9 @@ class WorkerSessionCreator(SessionCreator):
   def create_session(self):
     self._scaffold.finalize()
     return self._get_session_manager().wait_for_session(
-        self._master, config=self._config)
+        self._master, config=self._config,
+        max_wait_secs=30 * 60  # Wait up to 30 mins for the session to be ready.
+    )
 
 
 class _MonitoredSession(object):
@@ -562,7 +580,7 @@ class MonitoredSession(_MonitoredSession):
 
   ```python
   saver_hook = CheckpointSaverHook(...)
-  summary_hook = SummaryHook(...)
+  summary_hook = SummarySaverHook(...)
   with MonitoredSession(session_creator=ChiefSessionCreator(...),
                         hooks=[saver_hook, summary_hook]) as sess:
     while not sess.should_stop():
@@ -651,7 +669,7 @@ class SingularMonitoredSession(_MonitoredSession):
   Example usage:
   ```python
   saver_hook = CheckpointSaverHook(...)
-  summary_hook = SummaryHook(...)
+  summary_hook = SummarySaverHook(...)
   with SingularMonitoredSession(hooks=[saver_hook, summary_hook]) as sess:
     while not sess.should_stop():
       sess.run(train_op)
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 41f8fb34869fa520d4b033c933002e044ddea08f..85a5ceeb08f4c658d5bede90a11d01127c531f91 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -147,6 +147,68 @@ class ScaffoldTest(test.TestCase):
                                    'Graph is finalized and cannot be modified'):
         constant_op.constant([0])
 
+  def test_new_scaffold_from_default_scaffold(self):
+    scaffold1 = monitored_session.Scaffold()
+    with ops.Graph().as_default():
+      variables.Variable([1])
+      saver = saver_lib.Saver()
+      scaffold2 = monitored_session.Scaffold(
+          init_op=2,
+          init_feed_dict=3,
+          init_fn=lambda scaffold, sess: 4,
+          ready_op=5,
+          ready_for_local_init_op=6,
+          local_init_op=7,
+          saver=saver,
+          copy_from_scaffold=scaffold1)
+
+      scaffold2.finalize()
+      self.assertEqual(2, scaffold2.init_op)
+      self.assertEqual(3, scaffold2.init_feed_dict)
+      self.assertTrue(callable(scaffold2.init_fn))
+      self.assertEqual(5, scaffold2.ready_op)
+      self.assertEqual(6, scaffold2.ready_for_local_init_op)
+      self.assertEqual(7, scaffold2.local_init_op)
+      self.assertEqual(saver, scaffold2.saver)
+
+  def test_new_scaffold_from_existing_scaffold(self):
+    with ops.Graph().as_default():
+      variables.Variable([1])
+      saver = saver_lib.Saver()
+      scaffold1 = monitored_session.Scaffold(
+          init_op=2,
+          init_feed_dict=3,
+          init_fn=lambda scaffold, sess: 4,
+          ready_op=5,
+          ready_for_local_init_op=6,
+          local_init_op=7,
+          saver=saver)
+
+      scaffold2 = monitored_session.Scaffold(
+          init_op=4,
+          init_feed_dict=6,
+          init_fn=lambda scaffold, sess: 8,
+          ready_op=10,
+          ready_for_local_init_op=12,
+          local_init_op=14,
+          saver=saver,
+          copy_from_scaffold=scaffold1)
+
+      scaffold2.finalize()
+      self.assertEqual(4, scaffold2.init_op)
+      self.assertEqual(6, scaffold2.init_feed_dict)
+      self.assertTrue(callable(scaffold2.init_fn))
+      self.assertEqual(10, scaffold2.ready_op)
+      self.assertEqual(12, scaffold2.ready_for_local_init_op)
+      self.assertEqual(14, scaffold2.local_init_op)
+      self.assertEqual(saver, scaffold2.saver)
+
+  def test_copy_from_scaffold_is_scaffold(self):
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          TypeError, 'copy_from_scaffold is not a Scaffold instance'):
+        monitored_session.Scaffold(copy_from_scaffold=1)
+
 
 def _test_dir(temp_dir, test_name):
   """Create an empty dir to use for tests.
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 94cadbe55fb004539d062c4d905d9a488ab70fb1..0e10704dc866e544a890cdf34f087a6686fade58 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -143,15 +143,15 @@ class _StreamingModelPortProcessor(_OptimizableVariable):
     return self._v
 
   def update_op(self, optimizer, g):
-    return self._v
+    return g
 
 
 def _get_processor(v):
   """The processor of v."""
-  if isinstance(v, variables.Variable):
-    return _RefVariableProcessor(v)
   if v.op.type == "VarHandleOp":
     return _DenseResourceVariableProcessor(v)
+  if isinstance(v, variables.Variable):
+    return _RefVariableProcessor(v)
   if v.op.type == "SubmodelPort":
     return _StreamingModelPortProcessor(v)
   raise NotImplementedError("Trying to optimize unsupported type ", v)
diff --git a/tensorflow/python/training/quantize_training.i b/tensorflow/python/training/quantize_training.i
index 59cc895084acfeaf01c3ec3bf18a3721be7131ae..40c60769731d3f7255647a07141d86b1c2594b01 100644
--- a/tensorflow/python/training/quantize_training.i
+++ b/tensorflow/python/training/quantize_training.i
@@ -24,8 +24,10 @@ static PyObject* DoQuantizeTrainingOnGraphDefHelper(
     int num_bits,
     TF_Status* out_status) {
   string result;
+  // TODO(suharshs): Make the QuantizeAndDequantizeV2 configurable.
   tensorflow::Status status =
-      tensorflow::DoQuantizeTrainingOnSerializedGraphDef(input_graph, num_bits, &result);
+      tensorflow::DoQuantizeTrainingOnSerializedGraphDef(input_graph, num_bits,
+      "QuantizeAndDequantizeV2", &result);
   if (!status.ok()) {
     Set_TF_Status_from_Status(out_status, status);
     Py_RETURN_NONE;
diff --git a/tensorflow/python/training/queue_runner_impl.py b/tensorflow/python/training/queue_runner_impl.py
index 3901470fbcf514e2388fdf4491896291ebebaa3a..d713e222aee7f287bb73070401c33b8df100a13f 100644
--- a/tensorflow/python/training/queue_runner_impl.py
+++ b/tensorflow/python/training/queue_runner_impl.py
@@ -227,11 +227,14 @@ class QueueRunner(object):
     """
     decremented = False
     try:
+      # Make a cached callable from the `enqueue_op` to decrease the
+      # Python overhead in the queue-runner loop.
+      enqueue_callable = sess.make_callable(enqueue_op)
       while True:
         if coord and coord.should_stop():
           break
         try:
-          sess.run(enqueue_op)
+          enqueue_callable()
         except self._queue_closed_exception_types:  # pylint: disable=catching-non-exception
           # This exception indicates that a queue was closed.
           with self._lock:
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index ae5fc54d8548bc3a1a3d52e23e8b19486909b486..d66950f89f28597a4282630602cbef194348df7f 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -511,9 +511,7 @@ class BaseSaverBuilder(object):
           raise ValueError("At least two variables have the same name: %s" %
                            var.name)
         names_to_saveables[var.name] = var
-      elif ((isinstance(var, variables.Variable) or
-             isinstance(var, resource_variable_ops.ResourceVariable)) and
-            var._save_slice_info):
+      elif isinstance(var, variables.Variable) and var._save_slice_info:
         name = var._save_slice_info.full_name
         if name in names_to_saveables:
           if not isinstance(names_to_saveables[name], list):
@@ -573,8 +571,7 @@ class BaseSaverBuilder(object):
         slice_name = None
         # pylint: disable=protected-access
         for variable in op:
-          if (not isinstance(variable, variables.Variable) and
-              not isinstance(variable, resource_variable_ops.ResourceVariable)):
+          if not isinstance(variable, variables.Variable):
             raise ValueError("Slices must all be Variables: %s" % variable)
           if not variable._save_slice_info:
             raise ValueError("Slices must all be slices: %s" % variable)
@@ -712,6 +709,33 @@ class BaseSaverBuilder(object):
         version=self._write_version)
 
 
+def _get_saver_or_default():
+  """Returns the saver from SAVERS collection, or creates a default one.
+
+  This method is used by other members of the training module, such as
+  `Scaffold`, or `CheckpointSaverHook`.
+
+  Returns:
+    `Saver`.
+
+  Raises:
+    RuntimeError: If the SAVERS collection already has more than one items.
+  """
+  collection_key = ops.GraphKeys.SAVERS
+  savers = ops.get_collection(collection_key)
+  if savers:
+    if len(savers) > 1:
+      raise RuntimeError(
+          "More than one item in collection {}. "
+          "Please indicate which one to use by passing it to the constructor.".
+          format(collection_key))
+    return savers[0]
+  saver = Saver(sharded=True, allow_empty=True)
+  if saver is not None:
+    ops.add_to_collection(collection_key, saver)
+  return saver
+
+
 def _GetCheckpointFilename(save_dir, latest_filename):
   """Returns a filename for storing the CheckpointState.
 
@@ -908,11 +932,11 @@ def get_checkpoint_state(checkpoint_dir, latest_filename=None):
           ckpt.all_model_checkpoint_paths[i] = os.path.join(checkpoint_dir, p)
   except errors.OpError as e:
     # It's ok if the file cannot be read
-    logging.warning(str(e))
+    logging.warning("%s: %s", type(e).__name__, e)
     logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
     return None
   except text_format.ParseError as e:
-    logging.warning(str(e))
+    logging.warning("%s: %s", type(e).__name__, e)
     logging.warning("%s: Checkpoint ignored", coord_checkpoint_filename)
     return None
   finally:
@@ -1434,28 +1458,31 @@ class Saver(object):
             "'latest_filename' collides with 'save_path': '%s' and '%s'" %
             (latest_filename, save_path))
 
-    if not gfile.IsDirectory(os.path.dirname(save_path)):
-      raise ValueError(
-          "Parent directory of {} doesn't exist, can't save.".format(save_path))
-
-    save_path = os.path.dirname(save_path)
     if not isinstance(sess, session.SessionInterface):
       raise TypeError("'sess' must be a Session; %s" % sess)
 
+    save_path_parent = os.path.dirname(save_path)
     if not self._is_empty:
-      model_checkpoint_path = sess.run(
-          self.saver_def.save_tensor_name,
-          {self.saver_def.filename_tensor_name: checkpoint_file})
-      model_checkpoint_path = compat.as_str(model_checkpoint_path)
-      if write_state:
-        self._MaybeDeleteOldCheckpoints(
-            model_checkpoint_path, meta_graph_suffix=meta_graph_suffix)
-        _update_checkpoint_state(
-            save_dir=save_path,
-            model_checkpoint_path=model_checkpoint_path,
-            all_model_checkpoint_paths=self.last_checkpoints,
-            latest_filename=latest_filename,
-            save_relative_paths=self._save_relative_paths)
+      try:
+        model_checkpoint_path = sess.run(
+            self.saver_def.save_tensor_name,
+            {self.saver_def.filename_tensor_name: checkpoint_file})
+        model_checkpoint_path = compat.as_str(model_checkpoint_path)
+        if write_state:
+          self._MaybeDeleteOldCheckpoints(
+              model_checkpoint_path, meta_graph_suffix=meta_graph_suffix)
+          _update_checkpoint_state(
+              save_dir=save_path_parent,
+              model_checkpoint_path=model_checkpoint_path,
+              all_model_checkpoint_paths=self.last_checkpoints,
+              latest_filename=latest_filename,
+              save_relative_paths=self._save_relative_paths)
+      except (errors.FailedPreconditionError, errors.NotFoundError) as exc:
+        if not gfile.IsDirectory(save_path_parent):
+          exc = ValueError(
+              "Parent directory of {} doesn't exist, can't save.".format(
+                  save_path))
+        raise exc
 
     if write_meta_graph:
       meta_graph_filename = self._MetaGraphFilename(
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index c0f0b309c1f0aac37e8257d213ad2729637709e6..5d1f434a5623cc7dd7569c12d1747c51b4678184 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -46,6 +46,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -156,6 +157,18 @@ class SaverTest(test.TestCase):
   def testResourceBasic(self):
     self.basicSaveRestore(resource_variable_ops.ResourceVariable)
 
+  def testResourceSaveRestoreCachingDevice(self):
+    save_path = os.path.join(self.get_temp_dir(), "resource_cache")
+    v = resource_variable_ops.ResourceVariable([1], caching_device="/cpu:0")
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      save = saver_module.Saver()
+      save.save(sess, save_path)
+    with self.test_session() as sess:
+      save2 = saver_module.Saver()
+      save2.restore(sess, save_path)
+      self.assertEquals(v.eval(), [1])
+
   def testSaveCopyRestoreWithSaveRelativePaths(self):
     """Save, copy checkpoint dir and restore from copied dir.
 
@@ -542,32 +555,46 @@ class SaverTest(test.TestCase):
     self.testSaveWithGlobalStep(pad_step_number=True)
 
   def testSaveToNonexistingPath(self):
+    file_io.write_string_to_file(
+        os.path.join(self.get_temp_dir(), "actually_a_file"), "")
+    paths = [
+        os.path.join(self.get_temp_dir(), "nonexisting_dir/path"),
+        os.path.join(self.get_temp_dir(), "other_nonexisting_dir/path1/path2"),
+        os.path.join(self.get_temp_dir(), "actually_a_file/path"),
+    ]
+
+    for save_path in paths:
+      # Build a graph with 2 parameter nodes, and Save and
+      # Restore nodes for them.
+      v0 = variables.Variable(10.0, name="v0")
+      v1 = variables.Variable(20.0, name="v1")
+      save = saver_module.Saver({"v0": v0, "v1": v1}, restore_sequentially=True)
+      init_all_op = variables.global_variables_initializer()
 
-    save_path = os.path.join(self.get_temp_dir(), "nonexisting_dir/path")
-
-    # Build a graph with 2 parameter nodes, and Save and
-    # Restore nodes for them.
-    v0 = variables.Variable(10.0, name="v0")
-    v1 = variables.Variable(20.0, name="v1")
-    save = saver_module.Saver({"v0": v0, "v1": v1}, restore_sequentially=True)
-    init_all_op = variables.global_variables_initializer()
-
-    with self.test_session() as sess:
-      # Initialize all variables
-      sess.run(init_all_op)
+      # In the case where the parent directory doesn't exist, whether or not the
+      # save succeeds or fails is implementation dependent.  Therefore we allow
+      # both cases.
+      try:
+        with self.test_session() as sess:
+          # Initialize all variables
+          sess.run(init_all_op)
 
-      # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
+          # Check that the parameter nodes have been initialized.
+          self.assertEqual(10.0, v0.eval())
+          self.assertEqual(20.0, v1.eval())
 
-      error_msg_template = "Parent directory of {} doesn't exist, can't save."
+          # Save the graph.
+          save.save(sess, save_path)
 
-      # Assert saving fails when parent dir of save path doesn't exist
-      with self.assertRaisesWithPredicateMatch(
-          ValueError,
-          lambda e: error_msg_template.format(save_path) in str(e)
-      ):
-        save.save(sess, save_path)
+        with self.test_session() as sess:
+          # Restore the saved values in the parameter nodes.
+          save.restore(sess, save_path)
+          # Check that the parameter nodes have been restored.
+          self.assertEqual(10.0, v0.eval())
+          self.assertEqual(20.0, v1.eval())
+      except ValueError as exc:
+        error_msg_template = "Parent directory of {} doesn't exist, can't save."
+        self.assertEqual(error_msg_template.format(save_path), str(exc))
 
   def testSaveToURI(self):
     # ParseURI functions don't work on Windows yet.
@@ -1470,7 +1497,9 @@ class MetaGraphTest(test.TestCase):
       # Generates a new MetaGraphDef.
       new_meta_graph_def = new_saver.export_meta_graph()
       # It should be the same as the original.
-      self.assertProtoEquals(meta_graph_def, new_meta_graph_def)
+
+    test_util.assert_meta_graph_protos_equal(
+        self, meta_graph_def, new_meta_graph_def)
 
   def testAddCollectionDefFails(self):
     with self.test_session():
@@ -1785,11 +1814,9 @@ class MetaGraphTest(test.TestCase):
     # Test that we can import a meta graph into a namescope.
     test_dir = self._get_test_dir("import_into_namescope")
     filename = os.path.join(test_dir, "ckpt")
-    image = array_ops.placeholder(dtypes.float32, [None, 784])
-    label = array_ops.placeholder(dtypes.float32, [None, 10])
+    image = array_ops.placeholder(dtypes.float32, [None, 784], name="image")
+    label = array_ops.placeholder(dtypes.float32, [None, 10], name="label")
     with session.Session() as sess:
-      label = array_ops.identity(label, name="label")
-      image = array_ops.identity(image, name="image")
       weights = variables.Variable(
           random_ops.random_uniform([784, 10]), name="weights")
       bias = variables.Variable(array_ops.zeros([10]), name="bias")
@@ -1832,8 +1859,8 @@ class MetaGraphTest(test.TestCase):
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(
           meta_graph_def, clear_devices=False, import_scope="new_model")
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Cannot assign a device to node"):
+      # Device refers to GPU, which is not available here.
+      with self.assertRaises(errors_impl.InvalidArgumentError):
         sess.run(variables.global_variables_initializer())
 
     with session.Session(graph=ops_lib.Graph()) as sess:
@@ -2060,6 +2087,18 @@ class ScopedGraphTest(test.TestCase):
         biases3 = variables.Variable(array_ops.zeros([10]), name="biases")
         logits = math_ops.matmul(hidden2, weights3) + biases3
         ops_lib.add_to_collection("logits", logits)
+
+        # Adds user_defined proto in three formats: string, bytes and Any.
+        # Any proto should just pass through.
+        queue_runner = queue_runner_pb2.QueueRunnerDef(queue_name="test_queue")
+        ops_lib.add_to_collection("user_defined_string_collection",
+                                  str(queue_runner))
+        ops_lib.add_to_collection("user_defined_bytes_collection",
+                                  queue_runner.SerializeToString())
+        any_buf = Any()
+        any_buf.Pack(queue_runner)
+        ops_lib.add_to_collection("user_defined_any_collection", any_buf)
+
       _, var_list = meta_graph.export_scoped_meta_graph(
           filename=os.path.join(test_dir, exported_filename),
           graph=ops_lib.get_default_graph(),
diff --git a/tensorflow/python/training/saver_test_utils.py b/tensorflow/python/training/saver_test_utils.py
index 5f31e2aa539d25ba4fa4a76f4441f8b6f7e11e62..6a73565f82bf373836adca87a4af17ebe2641f8b 100644
--- a/tensorflow/python/training/saver_test_utils.py
+++ b/tensorflow/python/training/saver_test_utils.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.ops import gen_data_flow_ops
+from tensorflow.python.ops import gen_lookup_ops
 from tensorflow.python.training import saver as saver_module
 
 
@@ -34,7 +34,7 @@ class CheckpointedOp(object):
   # pylint: disable=protected-access
   def __init__(self, name, table_ref=None):
     if table_ref is None:
-      self.table_ref = gen_data_flow_ops._mutable_hash_table(
+      self.table_ref = gen_lookup_ops._mutable_hash_table(
           key_dtype=dtypes.string, value_dtype=dtypes.float32, name=name)
     else:
       self.table_ref = table_ref
@@ -52,10 +52,10 @@ class CheckpointedOp(object):
     return self._saveable
 
   def insert(self, keys, values):
-    return gen_data_flow_ops._lookup_table_insert(self.table_ref, keys, values)
+    return gen_lookup_ops._lookup_table_insert(self.table_ref, keys, values)
 
   def lookup(self, keys, default):
-    return gen_data_flow_ops._lookup_table_find(self.table_ref, keys, default)
+    return gen_lookup_ops._lookup_table_find(self.table_ref, keys, default)
 
   def keys(self):
     return self._export()[0]
@@ -64,8 +64,8 @@ class CheckpointedOp(object):
     return self._export()[1]
 
   def _export(self):
-    return gen_data_flow_ops._lookup_table_export(self.table_ref, dtypes.string,
-                                                  dtypes.float32)
+    return gen_lookup_ops._lookup_table_export(self.table_ref, dtypes.string,
+                                               dtypes.float32)
 
   class CustomSaveable(saver_module.BaseSaverBuilder.SaveableObject):
     """A custom saveable for CheckpointedOp."""
@@ -81,6 +81,6 @@ class CheckpointedOp(object):
       super(CheckpointedOp.CustomSaveable, self).__init__(table, specs, name)
 
     def restore(self, restore_tensors, shapes):
-      return gen_data_flow_ops._lookup_table_import(
+      return gen_lookup_ops._lookup_table_import(
           self.op.table_ref, restore_tensors[0], restore_tensors[1])
   # pylint: enable=protected-access
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index d2ccf37d8856dcc9d2399b67f9c0799539c956d5..2091eca0b9c6f0af4a043a4639b6fb72b90cef56 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import errors
@@ -276,14 +277,14 @@ class ClusterSpec(object):
                           "from integers to strings." % job_name)
         self._cluster_spec[job_name] = job_tasks
       self._make_cluster_def()
-    elif isinstance(cluster, tensorflow_server_pb2.ClusterDef):
+    elif isinstance(cluster, cluster_pb2.ClusterDef):
       self._cluster_def = cluster
       self._cluster_spec = {}
       for job_def in self._cluster_def.job:
         self._cluster_spec[job_def.name] = {
             i: t for i, t in job_def.tasks.items()}
     elif isinstance(cluster, ClusterSpec):
-      self._cluster_def = tensorflow_server_pb2.ClusterDef()
+      self._cluster_def = cluster_pb2.ClusterDef()
       self._cluster_def.MergeFrom(cluster.as_cluster_def())
       self._cluster_spec = {}
       for job_def in self._cluster_def.job:
@@ -440,7 +441,7 @@ class ClusterSpec(object):
       TypeError: If `cluster_spec` is not a dictionary mapping strings to lists
         of strings.
     """
-    self._cluster_def = tensorflow_server_pb2.ClusterDef()
+    self._cluster_def = cluster_pb2.ClusterDef()
 
     # NOTE(mrry): Sort by job_name to produce deterministic protobufs.
     for job_name, tasks in sorted(self._cluster_spec.items()):
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index 591488b60d8eb1cec1c39b353239ded4f61b8a2f..0a8ec4901c9ef050014b6a04cdab34ca08f292c1 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -34,6 +34,8 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import input as input_ops
+from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import server_lib
 
 
@@ -225,6 +227,20 @@ class GrpcServerTest(test.TestCase):
       _ = server_lib.Server(
           {"local_2": [server.target[len("grpc://"):]]})
 
+  def testExtendAfterQueueRunners(self):
+    server = self._cached_server
+    with session.Session(server.target) as sess:
+      input_queue = input_ops.input_producer(constant_op.constant(
+          [0.], dtype=dtypes.float32))
+      self.assertIsNotNone(input_queue)
+
+      var = variables.Variable(1., dtype=dtypes.float32, trainable=False,
+                               name="var")
+
+      sess.run(variables.global_variables_initializer())
+      queue_runner_impl.start_queue_runners(sess)
+      sess.run(var.assign(3.0))
+
 
 class ServerDefTest(test.TestCase):
 
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 6bcc6e25c363d60e0f338ef71807da0758ec477e..a13b6dd976a835d14c03ed90f40b172e0bcbfd07 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -27,6 +27,23 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as saver_mod
 
 
+def _maybe_name(obj):
+  """Returns object name if it has one, or a message otherwise.
+
+  This is useful for names that apper in error messages.
+  Args:
+    obj: Object to get the name of.
+  Returns:
+    name, "None", or a "no name" message.
+  """
+  if obj is None:
+    return "None"
+  elif hasattr(obj, "name"):
+    return obj.name
+  else:
+    return "<no name for %s>" % type(obj)
+
+
 class SessionManager(object):
   """Training helper that restores from checkpoint and creates session.
 
@@ -267,8 +284,8 @@ class SessionManager(object):
     if not local_init_success:
       raise RuntimeError(
           "Init operations did not make model ready for local_init.  "
-          "Init op: %s, init fn: %s, error: %s" % ("None" if init_op is None
-                                                   else init_op.name, init_fn,
+          "Init op: %s, init fn: %s, error: %s" % (_maybe_name(init_op),
+                                                   init_fn,
                                                    msg))
 
     is_ready, msg = self._model_ready(sess)
@@ -276,8 +293,7 @@ class SessionManager(object):
       raise RuntimeError(
           "Init operations did not make model ready.  "
           "Init op: %s, init fn: %s, local_init_op: %s, error: %s" %
-          (None if init_op is None else init_op.name, init_fn,
-           self._local_init_op, msg))
+          (_maybe_name(init_op), init_fn, self._local_init_op, msg))
     return sess
 
   def recover_session(self,
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index 246e95110a6f081e70187e60acabe7117b8afd2f..4dc1d5abb71f9d7b8d63da016876bcec84edd9eb 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -497,6 +497,23 @@ class SessionManagerTest(test.TestCase):
                                    "Init operations did not make model ready"):
         sm2.prepare_session("", init_op=v.initializer)
 
+  def testPrepareSessionDidNotInitLocalVariableList(self):
+    with ops.Graph().as_default():
+      v = variables.Variable(1, name="v")
+      w = variables.Variable(
+          v,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          name="w")
+      with self.test_session():
+        self.assertEqual(False, variables.is_variable_initialized(v).eval())
+        self.assertEqual(False, variables.is_variable_initialized(w).eval())
+      sm2 = session_manager.SessionManager(
+          ready_op=variables.report_uninitialized_variables())
+      with self.assertRaisesRegexp(RuntimeError,
+                                   "Init operations did not make model ready"):
+        sm2.prepare_session("", init_op=[v.initializer])
+
   def testPrepareSessionWithReadyNotReadyForLocal(self):
     with ops.Graph().as_default():
       v = variables.Variable(1, name="v")
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index 9435bdfa1cccf441252c38f936057ef0f9e4b3bc..230ed1db6874da6bbb106f687da616cda1f896f9 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -27,7 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary as _summary
@@ -426,8 +426,10 @@ class Supervisor(object):
       local_init_op = self._get_first_op_from_collection(
           ops.GraphKeys.LOCAL_INIT_OP)
       if local_init_op is None:
-        op_list = [variables.local_variables_initializer(),
-                   data_flow_ops.tables_initializer()]
+        op_list = [
+            variables.local_variables_initializer(),
+            lookup_ops.tables_initializer()
+        ]
         if op_list:
           local_init_op = control_flow_ops.group(*op_list)
           ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
@@ -994,35 +996,39 @@ class SVSummaryThread(coordinator.LooperThread):
       summary_strs = self._sess.run(self._sv.summary_op)
       global_step = None
     if self._sv.summary_writer:
-      logging.info("Recording summary at step %d.", global_step)
+      logging.info("Recording summary at step %s.", global_step)
       self._sv.summary_writer.add_summary(summary_strs, global_step)
 
 
 class SVStepCounterThread(coordinator.LooperThread):
   """Threads to count steps and measure their duration."""
 
-  def __init__(self, sv, sess):
+  def __init__(self, sv, sess, step_counter=None):
     """Create a `SVStepCounterThread`.
 
     Args:
       sv: A `Supervisor`.
       sess: A `Session`.
+      step_counter: A `Tensor` holding the step counter. By defaults, it uses
+        sv.global_step.
     """
     super(SVStepCounterThread, self).__init__(sv.coord, sv.save_summaries_secs)
     self._sv = sv
     self._sess = sess
     self._last_time = 0.0
     self._last_step = 0
-    self._summary_tag = "%s/sec" % self._sv.global_step.op.name
+    step_counter = sv.global_step if step_counter is None else step_counter
+    self._step_counter = step_counter
+    self._summary_tag = "%s/sec" % self._step_counter.op.name
 
   def start_loop(self):
     self._last_time = time.time()
     self._last_step = training_util.global_step(
-        self._sess, self._sv.global_step)
+        self._sess, self._step_counter)
 
   def run_loop(self):
     # Count the steps.
-    current_step = training_util.global_step(self._sess, self._sv.global_step)
+    current_step = training_util.global_step(self._sess, self._step_counter)
     added_steps = current_step - self._last_step
     self._last_step = current_step
     # Measure the elapsed time.
@@ -1030,7 +1036,10 @@ class SVStepCounterThread(coordinator.LooperThread):
     elapsed_time = current_time - self._last_time
     self._last_time = current_time
     # Reports the number of steps done per second
-    steps_per_sec = added_steps / elapsed_time if elapsed_time != 0. else float("inf")
+    if elapsed_time > 0.:
+      steps_per_sec = added_steps / elapsed_time
+    else:
+      steps_per_sec = float("inf")
     summary = Summary(value=[Summary.Value(tag=self._summary_tag,
                                            simple_value=steps_per_sec)])
     if self._sv.summary_writer:
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 3cee0b2f592d120cc7bd922360794a547d1512d7..f1830bd3fcf311937caa6ff04a67fda088640f3a 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -24,6 +24,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer
@@ -239,12 +240,17 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
     aggregated_grad = []
     var_list = []
 
-    self._local_step = variables.Variable(
-        initial_value=0,
-        trainable=False,
-        collections=[ops.GraphKeys.LOCAL_VARIABLES],
-        dtype=global_step.dtype.base_dtype,
-        name="sync_rep_local_step")
+    # local_anchor op will be placed on this worker task by default.
+    local_anchor = control_flow_ops.no_op()
+    # Colocating local_step variable prevents it being placed on the PS.
+    with ops.colocate_with(local_anchor):
+      self._local_step = variable_scope.variable(
+          initial_value=0,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          dtype=global_step.dtype.base_dtype,
+          name="sync_rep_local_step")
+
     self.local_step_init_op = state_ops.assign(self._local_step, global_step)
     chief_init_ops = [self.local_step_init_op]
     self.ready_for_local_init_op = variables.report_uninitialized_variables(
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 28c0668d24e981112c99f1332376d9e3192244b6..f4ac3c9758712182d2aee26a1a53c83e92e97b63 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -68,6 +68,7 @@ See the @{$python/train} guide.
 @@LoggingTensorHook
 @@StopAtStepHook
 @@CheckpointSaverHook
+@@CheckpointSaverListener
 @@NewCheckpointReader
 @@StepCounterHook
 @@NanLossDuringTrainingError
@@ -94,10 +95,14 @@ from __future__ import print_function
 import sys as _sys
 
 from tensorflow.python.ops import io_ops as _io_ops
+from tensorflow.python.ops import sdca_ops as _sdca_ops
 from tensorflow.python.ops import state_ops as _state_ops
 from tensorflow.python.util.all_util import remove_undocumented
 
 # pylint: disable=g-bad-import-order,unused-import
+from tensorflow.python.ops.sdca_ops import sdca_optimizer
+from tensorflow.python.ops.sdca_ops import sdca_fprint
+from tensorflow.python.ops.sdca_ops import sdca_shrink_l1
 from tensorflow.python.training.adadelta import AdadeltaOptimizer
 from tensorflow.python.training.adagrad import AdagradOptimizer
 from tensorflow.python.training.adagrad_da import AdagradDAOptimizer
@@ -128,6 +133,7 @@ from tensorflow.python.training.basic_session_run_hooks import SecondOrStepTimer
 from tensorflow.python.training.basic_session_run_hooks import LoggingTensorHook
 from tensorflow.python.training.basic_session_run_hooks import StopAtStepHook
 from tensorflow.python.training.basic_session_run_hooks import CheckpointSaverHook
+from tensorflow.python.training.basic_session_run_hooks import CheckpointSaverListener
 from tensorflow.python.training.basic_session_run_hooks import StepCounterHook
 from tensorflow.python.training.basic_session_run_hooks import NanLossDuringTrainingError
 from tensorflow.python.training.basic_session_run_hooks import NanTensorHook
@@ -180,8 +186,8 @@ from tensorflow.python.training.learning_rate_decay import *
 # pylint: enable=wildcard-import
 
 # Distributed computing support.
-from tensorflow.core.protobuf.tensorflow_server_pb2 import ClusterDef
-from tensorflow.core.protobuf.tensorflow_server_pb2 import JobDef
+from tensorflow.core.protobuf.cluster_pb2 import ClusterDef
+from tensorflow.core.protobuf.cluster_pb2 import JobDef
 from tensorflow.core.protobuf.tensorflow_server_pb2 import ServerDef
 from tensorflow.python.training.server_lib import ClusterSpec
 from tensorflow.python.training.server_lib import Server
@@ -190,36 +196,36 @@ from tensorflow.python.training.server_lib import Server
 _allowed_symbols = [
     # TODO(cwhipkey): review these and move to contrib or expose through
     # documentation.
-    "generate_checkpoint_state_proto",   # Used internally by saver.
+    "generate_checkpoint_state_proto",  # Used internally by saver.
     "checkpoint_exists",  # Only used in test?
     "get_checkpoint_mtimes",  # Only used in test?
 
     # Legacy: remove.
     "do_quantize_training_on_graphdef",  # At least use grah_def, not graphdef.
-                                         # No uses within tensorflow.
+    # No uses within tensorflow.
     "queue_runner",  # Use tf.train.start_queue_runner etc directly.
-                     # This is also imported internally.
+    # This is also imported internally.
 
     # TODO(drpng): document these. The reference in howtos/distributed does
     # not link.
     "SyncReplicasOptimizer",
     # Protobufs:
-    "BytesList",          # from example_pb2.
+    "BytesList",  # from example_pb2.
     "ClusterDef",
-    "Example",            # from example_pb2
-    "Feature",            # from example_pb2
-    "Features",           # from example_pb2
-    "FeatureList",        # from example_pb2
-    "FeatureLists",       # from example_pb2
-    "FloatList",          # from example_pb2.
-    "Int64List",          # from example_pb2.
+    "Example",  # from example_pb2
+    "Feature",  # from example_pb2
+    "Features",  # from example_pb2
+    "FeatureList",  # from example_pb2
+    "FeatureLists",  # from example_pb2
+    "FloatList",  # from example_pb2.
+    "Int64List",  # from example_pb2.
     "JobDef",
-    "SaverDef",           # From saver_pb2.
-    "SequenceExample",    # from example_pb2.
+    "SaverDef",  # From saver_pb2.
+    "SequenceExample",  # from example_pb2.
     "ServerDef",
 ]
 # Include extra modules for docstrings because:
 # * Input methods in tf.train are documented in io_ops.
 # * Saver methods in tf.train are documented in state_ops.
 remove_undocumented(__name__, _allowed_symbols,
-                    [_sys.modules[__name__], _io_ops, _state_ops])
+                    [_sys.modules[__name__], _io_ops, _sdca_ops, _state_ops])
diff --git a/tensorflow/python/training/training_ops_test.py b/tensorflow/python/training/training_ops_test.py
index 4b8bdbc98ba00b87ccaa901b4ba4992c8eeb4614..9ee48d1a4514f9914f690a7cb1d915cc668d0e2b 100644
--- a/tensorflow/python/training/training_ops_test.py
+++ b/tensorflow/python/training/training_ops_test.py
@@ -25,6 +25,8 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework.test_util import TensorFlowTestCase
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.training import training_ops
diff --git a/tensorflow/python/util/all_util.py b/tensorflow/python/util/all_util.py
index 08f336575102e60213391131b909124a8718180a..50d480f8707b35a4d0c11b81fa62fe9bf645e086 100644
--- a/tensorflow/python/util/all_util.py
+++ b/tensorflow/python/util/all_util.py
@@ -18,10 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect as _inspect
 import re as _re
 import sys as _sys
 
+from tensorflow.python.util import tf_inspect as _tf_inspect
+
+
 _reference_pattern = _re.compile(r'^@@(\w+)$', flags=_re.MULTILINE)
 
 
@@ -45,7 +47,7 @@ def make_all(module_name, doc_string_modules=None):
   if doc_string_modules is None:
     doc_string_modules = [_sys.modules[module_name]]
   cur_members = set([name for name, _
-                     in _inspect.getmembers(_sys.modules[module_name])])
+                     in _tf_inspect.getmembers(_sys.modules[module_name])])
 
   results = set()
   for doc_module in doc_string_modules:
diff --git a/tensorflow/python/util/compat.py b/tensorflow/python/util/compat.py
index 6fd7c4ab284c3a32ed4ff9fd76c27d7068aa868f..07382d93dfe5ebe3f063b86bc5afa288970330f6 100644
--- a/tensorflow/python/util/compat.py
+++ b/tensorflow/python/util/compat.py
@@ -69,7 +69,7 @@ def as_text(bytes_or_text, encoding='utf-8'):
   """Returns the given argument as a unicode string.
 
   Args:
-    bytes_or_text: A `bytes`, `str, or `unicode` object.
+    bytes_or_text: A `bytes`, `str`, or `unicode` object.
     encoding: A string indicating the charset for decoding unicode.
 
   Returns:
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index d09476a680d89d6b035779b9e24f47e284205f02..1e1599afb4bd9a3e8f6184748178d613ed34cc22 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 
 import collections
 import functools
-import inspect
 import re
 
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import decorator_utils
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 
 def _add_deprecated_function_notice_to_docstring(doc, date, instructions):
@@ -33,7 +34,8 @@ def _add_deprecated_function_notice_to_docstring(doc, date, instructions):
       doc, instructions,
       'DEPRECATED FUNCTION',
       '(deprecated)', [
-          'THIS FUNCTION IS DEPRECATED. It will be removed after %s.' % date,
+          'THIS FUNCTION IS DEPRECATED. It will be removed %s.' % (
+              'in a future version' if date is None else ('after %s' % date)),
           'Instructions for updating:'])
 
 
@@ -44,14 +46,13 @@ def _add_deprecated_arg_notice_to_docstring(doc, date, instructions):
       'DEPRECATED FUNCTION ARGUMENTS',
       '(deprecated arguments)', [
           'SOME ARGUMENTS ARE DEPRECATED. '
-          'They will be removed after %s.' % date,
+          'They will be removed %s.' % (
+              'in a future version' if date is None else ('after %s' % date)),
           'Instructions for updating:'])
 
 
 def _validate_deprecation_args(date, instructions):
-  if not date:
-    raise ValueError('Tell us what date this will be deprecated!')
-  if not re.match(r'20\d\d-[01]\d-[0123]\d', date):
+  if date is not None and not re.match(r'20\d\d-[01]\d-[0123]\d', date):
     raise ValueError('Date must be YYYY-MM-DD.')
   if not instructions:
     raise ValueError('Don\'t deprecate things without conversion instructions!')
@@ -59,7 +60,7 @@ def _validate_deprecation_args(date, instructions):
 
 def _call_location():
   """Returns call location given level up from current call."""
-  frame = inspect.currentframe()
+  frame = tf_inspect.currentframe()
   if frame:
     # CPython internals are available, use them for performance.
     # walk back two frames to get to deprecated function caller.
@@ -69,7 +70,7 @@ def _call_location():
     return '%s:%d' % (frame.f_code.co_filename, frame.f_lineno)
   else:
     # Slow fallback path
-    stack = inspect.stack(0)  # 0 avoids generating unused context
+    stack = tf_inspect.stack(0)  # 0 avoids generating unused context
     entry = stack[2]
     return '%s:%d' % (entry[1], entry[2])
 
@@ -84,6 +85,7 @@ def deprecated(date, instructions):
     Instructions for updating:
     <instructions>
 
+  If `date` is None, 'after <date>' is replaced with 'in a future version'.
   <function> will include the class name if it is a method.
 
   It also edits the docstring of the function: ' (deprecated)' is appended
@@ -91,8 +93,8 @@ def deprecated(date, instructions):
   to the rest of the docstring.
 
   Args:
-    date: String. The date the function is scheduled to be removed. Must be
-      ISO 8601 (YYYY-MM-DD).
+    date: String or None. The date the function is scheduled to be removed.
+      Must be ISO 8601 (YYYY-MM-DD), or None.
     instructions: String. Instructions on how to update code using the
       deprecated function.
 
@@ -100,7 +102,8 @@ def deprecated(date, instructions):
     Decorated function or method.
 
   Raises:
-    ValueError: If date is not in ISO 8601 format, or instructions are empty.
+    ValueError: If date is not None or in ISO 8601 format, or instructions are
+      empty.
   """
   _validate_deprecation_args(date, instructions)
 
@@ -110,15 +113,17 @@ def deprecated(date, instructions):
     @functools.wraps(func)
     def new_func(*args, **kwargs):
       logging.warning(
-          'From %s: %s (from %s) is deprecated and will be removed '
-          'after %s.\n'
+          'From %s: %s (from %s) is deprecated and will be removed %s.\n'
           'Instructions for updating:\n%s',
           _call_location(), decorator_utils.get_qualified_name(func),
-          func.__module__, date, instructions)
+          func.__module__,
+          'in a future version' if date is None else ('after %s' % date),
+          instructions)
       return func(*args, **kwargs)
-    new_func.__doc__ = _add_deprecated_function_notice_to_docstring(
-        func.__doc__, date, instructions)
-    return new_func
+    return tf_decorator.make_decorator(
+        func, new_func, 'deprecated',
+        _add_deprecated_function_notice_to_docstring(func.__doc__, date,
+                                                     instructions))
   return deprecated_wrapper
 
 
@@ -136,15 +141,16 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples):
     removed after <date>. Instructions for updating:
       <instructions>
 
-  <function> will include the class name if it is a method.
+  If `date` is None, 'after <date>' is replaced with 'in a future version'.
+  <function> includes the class name if it is a method.
 
   It also edits the docstring of the function: ' (deprecated arguments)' is
   appended to the first line of the docstring and a deprecation notice is
   prepended to the rest of the docstring.
 
   Args:
-    date: String. The date the function is scheduled to be removed. Must be
-      ISO 8601 (YYYY-MM-DD).
+    date: String or None. The date the function is scheduled to be removed.
+      Must be ISO 8601 (YYYY-MM-DD), or None.
     instructions: String. Instructions on how to update code using the
       deprecated function.
     *deprecated_arg_names_or_tuples: String. or 2-Tuple(String,
@@ -156,7 +162,7 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples):
     Decorated function or method.
 
   Raises:
-    ValueError: If date is not in ISO 8601 format, instructions are
+    ValueError: If date is not None or in ISO 8601 format, instructions are
       empty, the deprecated arguments are not present in the function
       signature, or the second element of a deprecated_tuple is not a
       list.
@@ -176,7 +182,7 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples):
     return d
 
   def _get_deprecated_positional_arguments(names_to_ok_vals, arg_spec):
-    """Builds a dictionary from deprecated arguments to thier spec.
+    """Builds a dictionary from deprecated arguments to their spec.
 
     Returned dict is keyed by argument name.
     Each value is a DeprecatedArgSpec with the following fields:
@@ -189,7 +195,7 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples):
     Args:
       names_to_ok_vals: dict from string arg_name to a list of values,
         possibly empty, which should not elicit a warning.
-      arg_spec: Output from inspect.getargspec on the called function.
+      arg_spec: Output from tf_inspect.getargspec on the called function.
 
     Returns:
       Dictionary from arg_name to DeprecatedArgSpec.
@@ -209,7 +215,7 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples):
     decorator_utils.validate_callable(func, 'deprecated_args')
     deprecated_arg_names = _get_arg_names_to_ok_vals()
 
-    arg_spec = inspect.getargspec(func)
+    arg_spec = tf_inspect.getargspec(func)
     deprecated_positions = _get_deprecated_positional_arguments(
         deprecated_arg_names, arg_spec)
 
@@ -256,7 +262,7 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples):
     def new_func(*args, **kwargs):
       """Deprecation wrapper."""
       invalid_args = []
-      named_args = inspect.getcallargs(func, *args, **kwargs)
+      named_args = tf_inspect.getcallargs(func, *args, **kwargs)
       for arg_name, spec in iter(deprecated_positions.items()):
         if (spec.position < len(args) and
             not (spec.has_ok_value and
@@ -275,13 +281,15 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples):
       for arg_name in invalid_args:
         logging.warning(
             'From %s: calling %s (from %s) with %s is deprecated and will '
-            'be removed after %s.\nInstructions for updating:\n%s',
+            'be removed %s.\nInstructions for updating:\n%s',
             _call_location(), decorator_utils.get_qualified_name(func),
-            func.__module__, arg_name, date, instructions)
+            func.__module__, arg_name,
+            'in a future version' if date is None else ('after %s' % date),
+            instructions)
       return func(*args, **kwargs)
-    new_func.__doc__ = _add_deprecated_arg_notice_to_docstring(
-        func.__doc__, date, instructions)
-    return new_func
+    return tf_decorator.make_decorator(func, new_func, 'deprecated',
+                                       _add_deprecated_arg_notice_to_docstring(
+                                           func.__doc__, date, instructions))
   return deprecated_wrapper
 
 
@@ -295,6 +303,7 @@ def deprecated_arg_values(date, instructions, **deprecated_kwargs):
     will be removed after <date>. Instructions for updating:
       <instructions>
 
+  If `date` is None, 'after <date>' is replaced with 'in a future version'.
   <function> will include the class name if it is a method.
 
   It also edits the docstring of the function: ' (deprecated arguments)' is
@@ -302,8 +311,8 @@ def deprecated_arg_values(date, instructions, **deprecated_kwargs):
   prepended to the rest of the docstring.
 
   Args:
-    date: String. The date the function is scheduled to be removed. Must be
-      ISO 8601 (YYYY-MM-DD).
+    date: String or None. The date the function is scheduled to be removed.
+      Must be ISO 8601 (YYYY-MM-DD), or None
     instructions: String. Instructions on how to update code using the
       deprecated function.
     **deprecated_kwargs: The deprecated argument values.
@@ -312,7 +321,8 @@ def deprecated_arg_values(date, instructions, **deprecated_kwargs):
     Decorated function or method.
 
   Raises:
-    ValueError: If date is not in ISO 8601 format, or instructions are empty.
+    ValueError: If date is not None or in ISO 8601 format, or instructions are
+      empty.
   """
   _validate_deprecation_args(date, instructions)
   if not deprecated_kwargs:
@@ -324,18 +334,20 @@ def deprecated_arg_values(date, instructions, **deprecated_kwargs):
     @functools.wraps(func)
     def new_func(*args, **kwargs):
       """Deprecation wrapper."""
-      named_args = inspect.getcallargs(func, *args, **kwargs)
+      named_args = tf_inspect.getcallargs(func, *args, **kwargs)
       for arg_name, arg_value in deprecated_kwargs.items():
         if arg_name in named_args and named_args[arg_name] == arg_value:
           logging.warning(
               'From %s: calling %s (from %s) with %s=%s is deprecated and will '
-              'be removed after %s.\nInstructions for updating:\n%s',
+              'be removed %s.\nInstructions for updating:\n%s',
               _call_location(), decorator_utils.get_qualified_name(func),
-              func.__module__, arg_name, arg_value, date, instructions)
+              func.__module__, arg_name, arg_value,
+              'in a future version' if date is None else ('after %s' % date),
+              instructions)
       return func(*args, **kwargs)
-    new_func.__doc__ = _add_deprecated_arg_notice_to_docstring(
-        func.__doc__, date, instructions)
-    return new_func
+    return tf_decorator.make_decorator(func, new_func, 'deprecated',
+                                       _add_deprecated_arg_notice_to_docstring(
+                                           func.__doc__, date, instructions))
   return deprecated_wrapper
 
 
diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py
index 07d6bd6a48f513f6113cdf27b5108377596ba627..cce0bb1b4ee9981a8425dd16ed79f56f9d4cccf5 100644
--- a/tensorflow/python/util/deprecation_test.py
+++ b/tensorflow/python/util/deprecation_test.py
@@ -33,9 +33,7 @@ class DeprecationTest(test.TestCase):
 
   def test_deprecated_illegal_args(self):
     instructions = "This is how you update..."
-    with self.assertRaisesRegexp(ValueError, "date"):
-      deprecation.deprecated(None, instructions)
-    with self.assertRaisesRegexp(ValueError, "date"):
+    with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"):
       deprecation.deprecated("", instructions)
     with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"):
       deprecation.deprecated("07-04-2016", instructions)
@@ -45,6 +43,46 @@ class DeprecationTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "instructions"):
       deprecation.deprecated(date, "")
 
+  @test.mock.patch.object(logging, "warning", autospec=True)
+  def test_no_date(self, mock_warning):
+    date = None
+    instructions = "This is how you update..."
+
+    @deprecation.deprecated(date, instructions)
+    def _fn(arg0, arg1):
+      """fn doc.
+
+      Args:
+        arg0: Arg 0.
+        arg1: Arg 1.
+
+      Returns:
+        Sum of args.
+      """
+      return arg0 + arg1
+
+    self.assertEqual(
+        "fn doc. (deprecated)"
+        "\n"
+        "\nTHIS FUNCTION IS DEPRECATED. It will be removed in a future version."
+        "\nInstructions for updating:\n%s"
+        "\n"
+        "\nArgs:"
+        "\n  arg0: Arg 0."
+        "\n  arg1: Arg 1."
+        "\n"
+        "\nReturns:"
+        "\n  Sum of args." % instructions, _fn.__doc__)
+
+    # Assert calling new fn issues log warning.
+    self.assertEqual(3, _fn(1, 2))
+    self.assertEqual(1, mock_warning.call_count)
+    (args, _) = mock_warning.call_args
+    self.assertRegexpMatches(
+        args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["in a future version", instructions]),
+                        set(args[1:]))
+
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_static_fn_with_doc(self, mock_warning):
     date = "2016-07-04"
@@ -82,8 +120,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_static_fn_with_one_line_doc(self, mock_warning):
@@ -107,8 +145,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_static_fn_no_doc(self, mock_warning):
@@ -132,8 +170,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_instance_fn_with_doc(self, mock_warning):
@@ -177,8 +215,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(3, _Object()._fn(1, 2))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_instance_fn_with_one_line_doc(self, mock_warning):
@@ -207,8 +245,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(3, _Object()._fn(1, 2))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_instance_fn_no_doc(self, mock_warning):
@@ -236,8 +274,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(3, _Object()._fn(1, 2))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   def test_prop_wrong_order(self):
     with self.assertRaisesRegexp(
@@ -290,8 +328,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual("prop_with_doc", _Object()._prop)
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_prop_no_doc(self, mock_warning):
@@ -320,8 +358,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual("prop_no_doc", _Object()._prop)
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
 
 class DeprecatedArgsTest(test.TestCase):
@@ -334,9 +372,7 @@ class DeprecatedArgsTest(test.TestCase):
   def test_deprecated_illegal_args(self):
     instructions = "This is how you update..."
     date = "2016-07-04"
-    with self.assertRaisesRegexp(ValueError, "date"):
-      deprecation.deprecated_args(None, instructions, "deprecated")
-    with self.assertRaisesRegexp(ValueError, "date"):
+    with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"):
       deprecation.deprecated_args("", instructions, "deprecated")
     with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"):
       deprecation.deprecated_args("07-04-2016", instructions, "deprecated")
@@ -401,8 +437,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2, True))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_static_fn_with_one_line_doc(self, mock_warning):
@@ -430,8 +466,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2, True))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_static_fn_no_doc(self, mock_warning):
@@ -459,8 +495,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2, True))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_varargs(self, mock_warning):
@@ -479,8 +515,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2, True, False))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_kwargs(self, mock_warning):
@@ -499,8 +535,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2, a=True, b=False))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_positional_and_named(self, mock_warning):
@@ -519,11 +555,13 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(2, _fn(1, None, 2, d2=False))
     self.assertEqual(2, mock_warning.call_count)
     (args1, _) = mock_warning.call_args_list[0]
-    self.assertRegexpMatches(args1[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions, "d1"]), set(args1[1:]))
+    self.assertRegexpMatches(args1[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions, "d1"]),
+                        set(args1[1:]))
     (args2, _) = mock_warning.call_args_list[1]
-    self.assertRegexpMatches(args1[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions, "d2"]), set(args2[1:]))
+    self.assertRegexpMatches(args2[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions, "d2"]),
+                        set(args2[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
   def test_positional_and_named_with_ok_vals(self, mock_warning):
@@ -543,11 +581,13 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(2, _fn(1, False, 2, d2=False))
     self.assertEqual(2, mock_warning.call_count)
     (args1, _) = mock_warning.call_args_list[0]
-    self.assertRegexpMatches(args1[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions, "d1"]), set(args1[1:]))
+    self.assertRegexpMatches(args1[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions, "d1"]),
+                        set(args1[1:]))
     (args2, _) = mock_warning.call_args_list[1]
-    self.assertRegexpMatches(args1[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions, "d2"]), set(args2[1:]))
+    self.assertRegexpMatches(args2[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions, "d2"]),
+                        set(args2[1:]))
 
     # Assert calls with the deprecated arguments dont log warnings if
     # the value matches the 'ok_val'.
@@ -565,9 +605,7 @@ class DeprecatedArgValuesTest(test.TestCase):
 
   def test_deprecated_illegal_args(self):
     instructions = "This is how you update..."
-    with self.assertRaisesRegexp(ValueError, "date"):
-      deprecation.deprecated_arg_values(None, instructions, deprecated=True)
-    with self.assertRaisesRegexp(ValueError, "date"):
+    with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"):
       deprecation.deprecated_arg_values("", instructions, deprecated=True)
     with self.assertRaisesRegexp(ValueError, "YYYY-MM-DD"):
       deprecation.deprecated_arg_values(
@@ -623,8 +661,8 @@ class DeprecatedArgValuesTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2, deprecated=True))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
     # Assert calling new fn with default deprecated value issues log warning.
     self.assertEqual(3, _fn(1, 2))
@@ -656,8 +694,8 @@ class DeprecatedArgValuesTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2, deprecated=True))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
     # Assert calling new fn with default deprecated value issues log warning.
     self.assertEqual(3, _fn(1, 2))
@@ -689,8 +727,8 @@ class DeprecatedArgValuesTest(test.TestCase):
     self.assertEqual(3, _fn(1, 2, deprecated=True))
     self.assertEqual(1, mock_warning.call_count)
     (args, _) = mock_warning.call_args
-    self.assertRegexpMatches(args[0], r"deprecated and will be removed after")
-    self._assert_subset(set([date, instructions]), set(args[1:]))
+    self.assertRegexpMatches(args[0], r"deprecated and will be removed")
+    self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
     # Assert calling new fn with default deprecated value issues log warning.
     self.assertEqual(3, _fn(1, 2))
diff --git a/tensorflow/python/util/example_parser_configuration.py b/tensorflow/python/util/example_parser_configuration.py
index 8843016a978cb1084b7e851b67d2f1dc2a190619..a3750851769a31466eebba5cfd5e665f4cbc4f9c 100644
--- a/tensorflow/python/util/example_parser_configuration.py
+++ b/tensorflow/python/util/example_parser_configuration.py
@@ -101,7 +101,7 @@ def extract_example_parser_configuration(parse_example_op, sess):
     fixed_config.shape.CopyFrom(
         tensor_shape.TensorShape(dense_shapes[i]).as_proto())
 
-    fixed_config.dtype = dense_types[i]
+    fixed_config.dtype = int(dense_types[i])
     # Get the output tensor name.
     fixed_config.values_output_tensor_name = parse_example_op.outputs[
         dense_values_start + i].name
@@ -111,7 +111,7 @@ def extract_example_parser_configuration(parse_example_op, sess):
     key = fetched[sparse_keys_start + i]
     feature_config = config.feature_map[key]
     var_len_feature = feature_config.var_len_feature
-    var_len_feature.dtype = sparse_types[i]
+    var_len_feature.dtype = int(sparse_types[i])
     var_len_feature.indices_output_tensor_name = parse_example_op.outputs[
         sparse_indices_start + i].name
     var_len_feature.values_output_tensor_name = parse_example_op.outputs[
diff --git a/tensorflow/python/util/lazy_loader.py b/tensorflow/python/util/lazy_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d2622b1c0472d14481f67e612c1bf276a5a16ab
--- /dev/null
+++ b/tensorflow/python/util/lazy_loader.py
@@ -0,0 +1,58 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""A LazyLoader class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import types
+
+
+class LazyLoader(types.ModuleType):
+  """Lazily import a module, mainly to avoid pulling in large dependencies.
+
+  `contrib`, and `ffmpeg` are examples of modules that are large and not always
+  needed, and this allows them to only be loaded when they are used.
+  """
+
+  # The lint error here is incorrect.
+  def __init__(self, local_name, parent_module_globals, name):  # pylint: disable=super-on-old-class
+    self._local_name = local_name
+    self._parent_module_globals = parent_module_globals
+
+    super(LazyLoader, self).__init__(name)
+
+  def _load(self):
+    # Import the target module and insert it into the parent's namespace
+    module = importlib.import_module(self.__name__)
+    self._parent_module_globals[self._local_name] = module
+
+    # Update this object's dict so that if someone keeps a reference to the
+    #   LazyLoader, lookups are efficient (__getattr__ is only called on lookups
+    #   that fail).
+    self.__dict__.update(module.__dict__)
+
+    return module
+
+  def __getattr__(self, item):
+    module = self._load()
+    return getattr(module, item)
+
+  def __dir__(self):
+    module = self._load()
+    return dir(module)
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index ad0fcd31e5f5da93c0beb52a9932a6fabd636733..1e22928b364f5c4a5e75cb4855eb63fb39d21206 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -15,11 +15,14 @@
 
 """## Functions for working with arbitrarily nested sequences of elements.
 
-This module is used to perform any operations on nested structures, which can be
-specified as sequences that contain non-sequence elements or other sequences.
-The utilities here assume (and do not check) that the nested structures form a
-'tree', i.e. no references in the structure of the input of these functions
-should be recursive.
+This module is used to perform any operations on nested structures. A nested
+structure is a Python sequence that contains non-sequence elements or other
+sequences. The utilities here assume (and do not check) that the nested
+structures form a 'tree', i.e. no references in the structure of the input of
+these functions should be recursive.
+
+Example structures: `((3, 4), 5, (6, 7, (9, 10), 8))`, `(np.array(0),
+  (np.array([3, 4]), tf.constant([3, 4])))`
 
 @@assert_same_structure
 @@is_sequence
@@ -99,7 +102,7 @@ def flatten(nest):
   return list(_yield_flat_nest(nest)) if is_sequence(nest) else [nest]
 
 
-def _recursive_assert_same_structure(nest1, nest2):
+def _recursive_assert_same_structure(nest1, nest2, check_types):
   is_sequence_nest1 = is_sequence(nest1)
   if is_sequence_nest1 != is_sequence(nest2):
     raise ValueError(
@@ -109,28 +112,31 @@ def _recursive_assert_same_structure(nest1, nest2):
   if is_sequence_nest1:
     type_nest1 = type(nest1)
     type_nest2 = type(nest2)
-    if type_nest1 != type_nest2:
+    if check_types and type_nest1 != type_nest2:
       raise TypeError(
           "The two structures don't have the same sequence type. First "
           "structure has type %s, while second structure has type %s."
           % (type_nest1, type_nest2))
 
     for n1, n2 in zip(nest1, nest2):
-      _recursive_assert_same_structure(n1, n2)
+      _recursive_assert_same_structure(n1, n2, check_types)
 
 
-def assert_same_structure(nest1, nest2):
+def assert_same_structure(nest1, nest2, check_types=True):
   """Asserts that two structures are nested in the same way.
 
   Args:
     nest1: an arbitrarily nested structure.
     nest2: an arbitrarily nested structure.
+    check_types: if `True` (default) types of sequences are checked as
+      well. If set to `False`, for example a list and a tuple of objects will
+      look same if they have the same size.
 
   Raises:
     ValueError: If the two structures do not have the same number of elements or
       if the two structures are not nested in the same way.
     TypeError: If the two structures differ in the type of sequence in any of
-      their substructures.
+      their substructures. Only possible if `check_types` is `True`.
   """
   len_nest1 = len(flatten(nest1)) if is_sequence(nest1) else 1
   len_nest2 = len(flatten(nest2)) if is_sequence(nest2) else 1
@@ -138,7 +144,7 @@ def assert_same_structure(nest1, nest2):
     raise ValueError("The two structures don't have the same number of "
                      "elements. First structure: %s, second structure: %s."
                      % (nest1, nest2))
-  _recursive_assert_same_structure(nest1, nest2)
+  _recursive_assert_same_structure(nest1, nest2, check_types)
 
 
 def flatten_dict_items(dictionary):
@@ -266,7 +272,7 @@ def pack_sequence_as(structure, flat_sequence):
   return _sequence_like(structure, packed)
 
 
-def map_structure(func, *structure):
+def map_structure(func, *structure, **check_types_dict):
   """Applies `func` to each entry in `structure` and returns a new structure.
 
   Applies `func(x[0], x[1], ...)` where x[i] is an entry in
@@ -277,17 +283,24 @@ def map_structure(func, *structure):
     func: A callable that acceps as many arguments are there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
       tuples/lists, or scalars.  Note: numpy arrays are considered scalars.
+    **check_types_dict: only valid keyword argument is `check_types`. If set to
+      `True` (default) the types of iterables within the  structures have to be
+      same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
+      exception). To allow this set this argument to `False`.
 
   Returns:
     A new structure with the same arity as `structure`, whose values correspond
     to `func(x[0], x[1], ...)` where `x[i]` is a value in the corresponding
-    location in `structure[i]`.
+    location in `structure[i]`. If there are different sequence types and
+    `check_types` is `False` the sequence types of the first structure will be
+    used.
 
   Raises:
     TypeError: If `func` is not callable or if the structures do not match
       each other by depth tree.
     ValueError: If no structure is provided or if the structures do not match
       each other by type.
+    ValueError: If wrong keyword arguments are provided.
   """
   if not callable(func):
     raise TypeError("func must be callable, got: %s" % func)
@@ -295,8 +308,15 @@ def map_structure(func, *structure):
   if not structure:
     raise ValueError("Must provide at least one structure")
 
+  if check_types_dict:
+    if "check_types" not in check_types_dict or len(check_types_dict) > 1:
+      raise ValueError("Only valid keyword argument is check_types")
+    check_types = check_types_dict["check_types"]
+  else:
+    check_types = True
+
   for other in structure[1:]:
-    assert_same_structure(structure[0], other)
+    assert_same_structure(structure[0], other, check_types=check_types)
 
   flat_structure = [flatten(s) for s in structure]
   entries = zip(*flat_structure)
@@ -315,7 +335,7 @@ def _yield_flat_up_to(shallow_tree, input_tree):
     yield input_tree
 
 
-def assert_shallow_structure(shallow_tree, input_tree):
+def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
   """Asserts that `shallow_tree` is a shallow structure of `input_tree`.
 
   That is, this function tests if the `input_tree` structure can be created from
@@ -341,11 +361,13 @@ def assert_shallow_structure(shallow_tree, input_tree):
   Args:
     shallow_tree: an arbitrarily nested structure.
     input_tree: an arbitrarily nested structure.
+    check_types: if `True` (default) the sequence types of `shallow_tree` and
+      `input_tree` have to be the same.
 
   Raises:
     TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
     TypeError: If the sequence types of `shallow_tree` are different from
-      `input_tree`.
+      `input_tree`. Only raised if `check_types` is `True`.
     ValueError: If the sequence lengths of `shallow_tree` are different from
       `input_tree`.
   """
@@ -355,7 +377,7 @@ def assert_shallow_structure(shallow_tree, input_tree):
           "If shallow structure is a sequence, input must also be a sequence. "
           "Input has type: %s." % type(input_tree))
 
-    if not isinstance(input_tree, type(shallow_tree)):
+    if check_types and not isinstance(input_tree, type(shallow_tree)):
       raise TypeError(
           "The two structures don't have the same sequence type. Input "
           "structure has type %s, while shallow structure has type %s."
@@ -368,7 +390,8 @@ def assert_shallow_structure(shallow_tree, input_tree):
           % (len(input_tree), len(shallow_tree)))
 
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
-      assert_shallow_structure(shallow_branch, input_branch)
+      assert_shallow_structure(shallow_branch, input_branch,
+                               check_types=check_types)
 
 
 def flatten_up_to(shallow_tree, input_tree):
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index f6a2d8b6631719e6e766ccd0eaef0cc62db0cec6..8a17d990da216538172936eff34025cd83772df9 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -139,6 +139,13 @@ class NestTest(test.TestCase):
                                  "don't have the same nested structure"):
       nest.assert_same_structure([[3], 4], [3, [4]])
 
+    structure1_list = [[[1, 2], 3], 4, [5, 6]]
+    with self.assertRaisesRegexp(TypeError,
+                                 "don't have the same sequence type"):
+      nest.assert_same_structure(structure1, structure1_list)
+    nest.assert_same_structure(structure1, structure2, check_types=False)
+    nest.assert_same_structure(structure1, structure1_list, check_types=False)
+
   def testMapStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = (((7, 8), 9), 10, (11, 12))
@@ -169,6 +176,23 @@ class NestTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "same nested structure"):
       nest.map_structure(lambda x, y: None, ((3, 4), 5), (3, (4, 5)))
 
+    structure1_list = [[[1, 2], 3], 4, [5, 6]]
+    with self.assertRaisesRegexp(TypeError, "same sequence type"):
+      nest.map_structure(lambda x, y: None, structure1, structure1_list)
+
+    nest.map_structure(lambda x, y: None, structure1, structure1_list,
+                       check_types=False)
+
+    with self.assertRaisesRegexp(ValueError, "same nested structure"):
+      nest.map_structure(lambda x, y: None, ((3, 4), 5), (3, (4, 5)),
+                         check_types=False)
+
+    with self.assertRaisesRegexp(ValueError, "Only valid keyword argument"):
+      nest.map_structure(lambda x: None, structure1, foo="a")
+
+    with self.assertRaisesRegexp(ValueError, "Only valid keyword argument"):
+      nest.map_structure(lambda x: None, structure1, check_types=False, foo="a")
+
   def testAssertShallowStructure(self):
     inp_ab = ["a", "b"]
     inp_abc = ["a", "b", "c"]
@@ -186,6 +210,7 @@ class NestTest(test.TestCase):
         "<(type|class) 'list'>.")
     with self.assertRaisesRegexp(TypeError, expected_message):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
+    nest.assert_shallow_structure(inp_ab2, inp_ab1, check_types=False)
 
   def testFlattenUpTo(self):
     input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
diff --git a/tensorflow/python/util/tf_contextlib.py b/tensorflow/python/util/tf_contextlib.py
new file mode 100644
index 0000000000000000000000000000000000000000..3830014d4acbd2948f07cd91ad656e0a65001ffa
--- /dev/null
+++ b/tensorflow/python/util/tf_contextlib.py
@@ -0,0 +1,36 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TFDecorator-aware replacements for the contextlib module."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib as _contextlib
+
+from tensorflow.python.util import tf_decorator
+
+
+def contextmanager(target):
+  """A tf_decorator-aware wrapper for `contextlib.contextmanager`.
+
+  Usage is identical to `contextlib.contextmanager`.
+
+  Args:
+    target: A callable to be wrapped in a contextmanager.
+  Returns:
+    A callable that can be used inside of a `with` statement.
+  """
+  context_manager = _contextlib.contextmanager(target)
+  return tf_decorator.make_decorator(target, context_manager, 'contextmanager')
diff --git a/tensorflow/python/util/tf_contextlib_test.py b/tensorflow/python/util/tf_contextlib_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5bf388a63597b07b52b33c3561a9df69bcd8d2
--- /dev/null
+++ b/tensorflow/python/util/tf_contextlib_test.py
@@ -0,0 +1,92 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for tf_contextlib."""
+
+# pylint: disable=unused-import
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import test
+from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+
+@tf_contextlib.contextmanager
+def test_yield_append_before_and_after_yield(x, before, after):
+  x.append(before)
+  yield
+  x.append(after)
+
+
+@tf_contextlib.contextmanager
+def test_yield_return_x_plus_1(x):
+  yield x + 1
+
+
+@tf_contextlib.contextmanager
+def test_params_and_defaults(a, b=2, c=True, d='hello'):
+  return [a, b, c, d]
+
+
+class TfContextlibTest(test.TestCase):
+
+  def testRunsCodeBeforeYield(self):
+    x = []
+    with test_yield_append_before_and_after_yield(x, 'before', ''):
+      self.assertEqual('before', x[-1])
+
+  def testRunsCodeAfterYield(self):
+    x = []
+    with test_yield_append_before_and_after_yield(x, '', 'after'):
+      pass
+    self.assertEqual('after', x[-1])
+
+  def testNestedWith(self):
+    x = []
+    with test_yield_append_before_and_after_yield(x, 'before', 'after'):
+      with test_yield_append_before_and_after_yield(x, 'inner', 'outer'):
+        with test_yield_return_x_plus_1(1) as var:
+          x.append(var)
+    self.assertEqual(['before', 'inner', 2, 'outer', 'after'], x)
+
+  def testMultipleCallsOfSeparateInstances(self):
+    x = []
+    with test_yield_append_before_and_after_yield(x, 1, 2):
+      pass
+    with test_yield_append_before_and_after_yield(x, 3, 4):
+      pass
+    self.assertEqual([1, 2, 3, 4], x)
+
+  def testReturnsResultFromYield(self):
+    with test_yield_return_x_plus_1(3) as result:
+      self.assertEqual(4, result)
+
+  def testUnwrapContextManager(self):
+    decorators, target = tf_decorator.unwrap(test_params_and_defaults)
+    self.assertEqual(1, len(decorators))
+    self.assertTrue(isinstance(decorators[0], tf_decorator.TFDecorator))
+    self.assertEqual('contextmanager', decorators[0].decorator_name)
+    self.assertFalse(isinstance(target, tf_decorator.TFDecorator))
+
+  def testGetArgSpecReturnsWrappedArgSpec(self):
+    argspec = tf_inspect.getargspec(test_params_and_defaults)
+    self.assertEqual(['a', 'b', 'c', 'd'], argspec.args)
+    self.assertEqual((2, True, 'hello'), argspec.defaults)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5d979e376c02fcc29cc2abb7c0c81d46f70d928
--- /dev/null
+++ b/tensorflow/python/util/tf_decorator.py
@@ -0,0 +1,167 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base TFDecorator class and utility functions for working with decorators.
+
+There are two ways to create decorators that TensorFlow can introspect into.
+This is important for documentation generation purposes, so that function
+signatures aren't obscured by the (*args, **kwds) signature that decorators
+often provide.
+
+1. Call `tf_decorator.make_decorator` on your wrapper function. If your
+decorator is stateless, or can capture all of the variables it needs to work
+with through lexical closure, this is the simplest option. Create your wrapper
+function as usual, but instead of returning it, return
+`tf_decorator.make_decorator(your_wrapper)`. This will attach some decorator
+introspection metadata onto your wrapper and return it.
+
+Example:
+
+  def print_hello_before_calling(target):
+    def wrapper(*args, **kwargs):
+      print('hello')
+      return target(*args, **kwargs)
+    return tf_decorator.make_decorator(wrapper)
+
+2. Derive from TFDecorator. If your decorator needs to be stateful, you can
+implement it in terms of a TFDecorator. Store whatever state you need in your
+derived class, and implement the `__call__` method to do your work before
+calling into your target. You can retrieve the target via
+`super(MyDecoratorClass, self).decorated_target`, and call it with whatever
+parameters it needs.
+
+Example:
+
+  class CallCounter(tf_decorator.TFDecorator):
+    def __init__(self, target):
+      super(CallCounter, self).__init__('count_calls', target)
+      self.call_count = 0
+
+    def __call__(self, *args, **kwargs):
+      self.call_count += 1
+      return super(CallCounter, self).decorated_target(*args, **kwargs)
+
+  def count_calls(target):
+    return CallCounter(target)
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools as _functools
+import inspect as _inspect
+
+
+def make_decorator(target,
+                   decorator_func,
+                   decorator_name=None,
+                   decorator_doc='',
+                   decorator_argspec=None):
+  """Make a decorator from a wrapper and a target.
+
+  Args:
+    target: The final callable to be wrapped.
+    decorator_func: The wrapper function.
+    decorator_name: The name of the decorator. If `None`, the name of the
+      function calling make_decorator.
+    decorator_doc: Documentation specific to this application of
+      `decorator_func` to `target`.
+    decorator_argspec: The new callable signature of this decorator.
+
+  Returns:
+    The `decorator_func` argument with new metadata attached.
+  """
+  if decorator_name is None:
+    decorator_name = _inspect.stack()[1][3]  # Caller's name.
+  decorator = TFDecorator(decorator_name, target, decorator_doc,
+                          decorator_argspec)
+  setattr(decorator_func, '_tf_decorator', decorator)
+  decorator_func.__name__ = target.__name__
+  decorator_func.__doc__ = decorator.__doc__
+  decorator_func.__wrapped__ = target
+  return decorator_func
+
+
+def unwrap(maybe_tf_decorator):
+  """Unwraps an object into a list of TFDecorators and a final target.
+
+  Args:
+    maybe_tf_decorator: Any callable object.
+
+  Returns:
+    A tuple whose first element is an list of TFDecorator-derived objects that
+    were applied to the final callable target, and whose second element is the
+    final undecorated callable target. If the `maybe_tf_decorator` parameter is
+    not decorated by any TFDecorators, the first tuple element will be an empty
+    list. The `TFDecorator` list is ordered from outermost to innermost
+    decorators.
+  """
+  decorators = []
+  cur = maybe_tf_decorator
+  while True:
+    if isinstance(cur, TFDecorator):
+      decorators.append(cur)
+    elif hasattr(cur, '_tf_decorator'):
+      decorators.append(getattr(cur, '_tf_decorator'))
+    else:
+      break
+    cur = decorators[-1].decorated_target
+  return decorators, cur
+
+
+class TFDecorator(object):
+  """Base class for all TensorFlow decorators.
+
+  TFDecorator captures and exposes the wrapped target, and provides details
+  about the current decorator.
+  """
+
+  def __init__(self,
+               decorator_name,
+               target,
+               decorator_doc='',
+               decorator_argspec=None):
+    self._decorated_target = target
+    self._decorator_name = decorator_name
+    self._decorator_doc = decorator_doc
+    self._decorator_argspec = decorator_argspec
+    self.__name__ = target.__name__
+    if self._decorator_doc:
+      self.__doc__ = self._decorator_doc
+    elif target.__doc__:
+      self.__doc__ = target.__doc__
+    else:
+      self.__doc__ = ''
+
+  def __get__(self, obj, objtype):
+    return _functools.partial(self.__call__, obj)
+
+  def __call__(self, *args, **kwargs):
+    return self._decorated_target(*args, **kwargs)
+
+  @property
+  def decorated_target(self):
+    return self._decorated_target
+
+  @property
+  def decorator_name(self):
+    return self._decorator_name
+
+  @property
+  def decorator_doc(self):
+    return self._decorator_doc
+
+  @property
+  def decorator_argspec(self):
+    return self._decorator_argspec
diff --git a/tensorflow/python/util/tf_decorator_test.py b/tensorflow/python/util/tf_decorator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f6a10b44081db2f5ce0d8ffb0333cd3c76fc269
--- /dev/null
+++ b/tensorflow/python/util/tf_decorator_test.py
@@ -0,0 +1,243 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for tf_decorator."""
+
+# pylint: disable=unused-import
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+
+def test_tfdecorator(decorator_name, decorator_doc=None):
+
+  def make_tf_decorator(target):
+    return tf_decorator.TFDecorator(decorator_name, target, decorator_doc)
+
+  return make_tf_decorator
+
+
+def test_decorator_increment_first_int_arg(target):
+  """This test decorator skips past `self` as args[0] in the bound case."""
+
+  def wrapper(*args, **kwargs):
+    new_args = []
+    found = False
+    for arg in args:
+      if not found and isinstance(arg, int):
+        new_args.append(arg + 1)
+        found = True
+      else:
+        new_args.append(arg)
+    return target(*new_args, **kwargs)
+
+  return tf_decorator.make_decorator(target, wrapper)
+
+
+def test_function(x):
+  """Test Function Docstring."""
+  return x + 1
+
+
+@test_tfdecorator('decorator 1')
+@test_decorator_increment_first_int_arg
+@test_tfdecorator('decorator 3', 'decorator 3 documentation')
+def test_decorated_function(x):
+  """Test Decorated Function Docstring."""
+  return x * 2
+
+
+@test_tfdecorator('decorator')
+class TestDecoratedClass(object):
+  """Test Decorated Class."""
+
+  def __init__(self, two_attr=2):
+    self.two_attr = two_attr
+
+  @property
+  def two_prop(self):
+    return 2
+
+  def two_func(self):
+    return 2
+
+  @test_decorator_increment_first_int_arg
+  def return_params(self, a, b, c):
+    """Return parameters."""
+    return [a, b, c]
+
+
+class TfDecoratorTest(test.TestCase):
+
+  def testInitCapturesTarget(self):
+    self.assertIs(test_function,
+                  tf_decorator.TFDecorator('', test_function).decorated_target)
+
+  def testInitCapturesDecoratorName(self):
+    self.assertEqual('decorator name',
+                     tf_decorator.TFDecorator('decorator name',
+                                              test_function).decorator_name)
+
+  def testInitCapturesDecoratorDoc(self):
+    self.assertEqual('decorator doc',
+                     tf_decorator.TFDecorator('', test_function,
+                                              'decorator doc').decorator_doc)
+
+  def testInitCapturesNonNoneArgspec(self):
+    argspec = tf_inspect.ArgSpec(
+        args=['a', 'b', 'c'],
+        varargs=None,
+        keywords=None,
+        defaults=(1, 'hello'))
+    self.assertIs(argspec,
+                  tf_decorator.TFDecorator('', test_function, '',
+                                           argspec).decorator_argspec)
+
+  def testInitSetsDecoratorNameToTargetName(self):
+    self.assertEqual('test_function',
+                     tf_decorator.TFDecorator('', test_function).__name__)
+
+  def testInitSetsDecoratorDocToTargetDoc(self):
+    self.assertEqual('Test Function Docstring.',
+                     tf_decorator.TFDecorator('', test_function).__doc__)
+
+  def testCallingATFDecoratorCallsTheTarget(self):
+    self.assertEqual(124, tf_decorator.TFDecorator('', test_function)(123))
+
+  def testCallingADecoratedFunctionCallsTheTarget(self):
+    self.assertEqual((2 + 1) * 2, test_decorated_function(2))
+
+  def testInitializingDecoratedClassWithInitParamsDoesntRaise(self):
+    try:
+      TestDecoratedClass(2)
+    except TypeError:
+      self.assertFail()
+
+  def testReadingClassAttributeOnDecoratedClass(self):
+    self.assertEqual(2, TestDecoratedClass().two_attr)
+
+  def testCallingClassMethodOnDecoratedClass(self):
+    self.assertEqual(2, TestDecoratedClass().two_func())
+
+  def testReadingClassPropertyOnDecoratedClass(self):
+    self.assertEqual(2, TestDecoratedClass().two_prop)
+
+  def testNameOnBoundProperty(self):
+    self.assertEqual('return_params',
+                     TestDecoratedClass().return_params.__name__)
+
+  def testDocstringOnBoundProperty(self):
+    self.assertEqual('Return parameters.',
+                     TestDecoratedClass().return_params.__doc__)
+
+
+def test_wrapper(*args, **kwargs):
+  return test_function(*args, **kwargs)
+
+
+class TfMakeDecoratorTest(test.TestCase):
+
+  def testAttachesATFDecoratorAttr(self):
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper)
+    decorator = getattr(decorated, '_tf_decorator')
+    self.assertIsInstance(decorator, tf_decorator.TFDecorator)
+
+  def testAttachesWrappedAttr(self):
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper)
+    wrapped_attr = getattr(decorated, '__wrapped__')
+    self.assertIs(test_function, wrapped_attr)
+
+  def testSetsTFDecoratorNameToDecoratorNameArg(self):
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper,
+                                            'test decorator name')
+    decorator = getattr(decorated, '_tf_decorator')
+    self.assertEqual('test decorator name', decorator.decorator_name)
+
+  def testSetsTFDecoratorDocToDecoratorDocArg(self):
+    decorated = tf_decorator.make_decorator(
+        test_function, test_wrapper, decorator_doc='test decorator doc')
+    decorator = getattr(decorated, '_tf_decorator')
+    self.assertEqual('test decorator doc', decorator.decorator_doc)
+
+  def testSetsTFDecoratorArgSpec(self):
+    argspec = tf_inspect.ArgSpec(
+        args=['a', 'b', 'c'],
+        varargs=None,
+        keywords=None,
+        defaults=(1, 'hello'))
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper, '', '',
+                                            argspec)
+    decorator = getattr(decorated, '_tf_decorator')
+    self.assertEqual(argspec, decorator.decorator_argspec)
+
+  def testSetsDecoratorNameToFunctionThatCallsMakeDecoratorIfAbsent(self):
+
+    def test_decorator_name(wrapper):
+      return tf_decorator.make_decorator(test_function, wrapper)
+
+    decorated = test_decorator_name(test_wrapper)
+    decorator = getattr(decorated, '_tf_decorator')
+    self.assertEqual('test_decorator_name', decorator.decorator_name)
+
+
+class TfDecoratorUnwrapTest(test.TestCase):
+
+  def testUnwrapReturnsEmptyArrayForUndecoratedFunction(self):
+    decorators, _ = tf_decorator.unwrap(test_function)
+    self.assertEqual(0, len(decorators))
+
+  def testUnwrapReturnsUndecoratedFunctionAsTarget(self):
+    _, target = tf_decorator.unwrap(test_function)
+    self.assertIs(test_function, target)
+
+  def testUnwrapReturnsFinalFunctionAsTarget(self):
+    self.assertEqual((4 + 1) * 2, test_decorated_function(4))
+    _, target = tf_decorator.unwrap(test_decorated_function)
+    self.assertTrue(tf_inspect.isfunction(target))
+    self.assertEqual(4 * 2, target(4))
+
+  def testUnwrapReturnsListOfUniqueTFDecorators(self):
+    decorators, _ = tf_decorator.unwrap(test_decorated_function)
+    self.assertEqual(3, len(decorators))
+    self.assertTrue(isinstance(decorators[0], tf_decorator.TFDecorator))
+    self.assertTrue(isinstance(decorators[1], tf_decorator.TFDecorator))
+    self.assertTrue(isinstance(decorators[2], tf_decorator.TFDecorator))
+    self.assertIsNot(decorators[0], decorators[1])
+    self.assertIsNot(decorators[1], decorators[2])
+    self.assertIsNot(decorators[2], decorators[0])
+
+  def testUnwrapReturnsDecoratorListFromOutermostToInnermost(self):
+    decorators, _ = tf_decorator.unwrap(test_decorated_function)
+    self.assertEqual('decorator 1', decorators[0].decorator_name)
+    self.assertEqual('test_decorator_increment_first_int_arg',
+                     decorators[1].decorator_name)
+    self.assertEqual('decorator 3', decorators[2].decorator_name)
+    self.assertEqual('decorator 3 documentation', decorators[2].decorator_doc)
+
+  def testUnwrapBoundMethods(self):
+    test_decorated_class = TestDecoratedClass()
+    self.assertEqual([2, 2, 3], test_decorated_class.return_params(1, 2, 3))
+    decorators, target = tf_decorator.unwrap(test_decorated_class.return_params)
+    self.assertEqual('test_decorator_increment_first_int_arg',
+                     decorators[0].decorator_name)
+    self.assertEqual([1, 2, 3], target(test_decorated_class, 1, 2, 3))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
new file mode 100644
index 0000000000000000000000000000000000000000..977b0df08b553fadf6ac2631dcd6cd4f3c2ad6c7
--- /dev/null
+++ b/tensorflow/python/util/tf_inspect.py
@@ -0,0 +1,141 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TFDecorator-aware replacements for the inspect module."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect as _inspect
+
+from tensorflow.python.util import tf_decorator
+
+ArgSpec = _inspect.ArgSpec
+
+
+def currentframe():
+  """TFDecorator-aware replacement for inspect.currentframe."""
+  return _inspect.stack()[1][0]
+
+
+def getargspec(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.getargspec.
+
+  Args:
+    object: A callable, possibly decorated.
+
+  Returns:
+    The `ArgSpec` that describes the signature of the outermost decorator that
+    changes the callable's signature. If the callable is not decorated,
+    `inspect.getargspec()` will be called directly on the callable.
+  """
+  decorators, target = tf_decorator.unwrap(object)
+  return next((d.decorator_argspec for d in decorators
+               if d.decorator_argspec is not None), _inspect.getargspec(target))
+
+
+def getcallargs(func, *positional, **named):
+  """TFDecorator-aware replacement for inspect.getcallargs.
+
+  Args:
+    func: A callable, possibly decorated
+    *positional: The positional arguments that would be passed to `func`.
+    **named: The named argument dictionary that would be passed to `func`.
+
+  Returns:
+    A dictionary mapping `func`'s named arguments to the values they would
+    receive if `func(*positional, **named)` were called.
+
+  `getcallargs` will use the argspec from the outermost decorator that provides
+  it. If no attached decorators modify argspec, the final unwrapped target's
+  argspec will be used.
+  """
+  argspec = getargspec(func)
+  call_args = named.copy()
+  this = getattr(func, 'im_self', None) or getattr(func, '__self__', None)
+  if ismethod(func) and this:
+    positional = (this,) + positional
+  remaining_positionals = [arg for arg in argspec.args if arg not in call_args]
+  call_args.update(dict(zip(remaining_positionals, positional)))
+  default_count = 0 if not argspec.defaults else len(argspec.defaults)
+  if default_count:
+    for arg, value in zip(argspec.args[-default_count:], argspec.defaults):
+      if arg not in call_args:
+        call_args[arg] = value
+  return call_args
+
+
+def getdoc(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.getdoc.
+
+  Args:
+    object: An object, possibly decorated.
+
+  Returns:
+    The docstring associated with the object.
+
+  The outermost-decorated object is intended to have the most complete
+  documentation, so the decorated parameter is not unwrapped.
+  """
+  return _inspect.getdoc(object)
+
+
+def getfile(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.getfile."""
+  return _inspect.getfile(tf_decorator.unwrap(object)[1])
+
+
+def getmembers(object, predicate=None):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.getmembers."""
+  return _inspect.getmembers(object, predicate)
+
+
+def getmro(cls):
+  """TFDecorator-aware replacement for inspect.getmro."""
+  return _inspect.getmro(cls)
+
+
+def getsource(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.getsource."""
+  return _inspect.getsource(tf_decorator.unwrap(object)[1])
+
+
+def isclass(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.isclass."""
+  return _inspect.isclass(tf_decorator.unwrap(object)[1])
+
+
+def isfunction(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.isfunction."""
+  return _inspect.isfunction(tf_decorator.unwrap(object)[1])
+
+
+def ismethod(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.ismethod."""
+  return _inspect.ismethod(tf_decorator.unwrap(object)[1])
+
+
+def ismodule(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.ismodule."""
+  return _inspect.ismodule(tf_decorator.unwrap(object)[1])
+
+
+def isroutine(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.isroutine."""
+  return _inspect.isroutine(tf_decorator.unwrap(object)[1])
+
+
+def stack(context=1):
+  """TFDecorator-aware replacement for inspect.stack."""
+  return _inspect.stack(context)[1:]
diff --git a/tensorflow/python/util/tf_inspect_test.py b/tensorflow/python/util/tf_inspect_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9e8ffb30c3392251c2bf7076e02aafd2338696b
--- /dev/null
+++ b/tensorflow/python/util/tf_inspect_test.py
@@ -0,0 +1,327 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for tf_inspect."""
+
+# pylint: disable=unused-import
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+
+def test_decorator(decorator_name, decorator_doc=None):
+
+  def make_tf_decorator(target):
+    return tf_decorator.TFDecorator(decorator_name, target, decorator_doc)
+
+  return make_tf_decorator
+
+
+def test_undecorated_function():
+  pass
+
+
+@test_decorator('decorator 1')
+@test_decorator('decorator 2')
+@test_decorator('decorator 3')
+def test_decorated_function(x):
+  """Test Decorated Function Docstring."""
+  return x * 2
+
+
+@test_decorator('decorator')
+def test_decorated_function_with_defaults(a, b=2, c='Hello'):
+  """Test Decorated Function With Defaults Docstring."""
+  return [a, b, c]
+
+
+@test_decorator('decorator')
+class TestDecoratedClass(object):
+  """Test Decorated Class."""
+
+  def __init__(self):
+    pass
+
+  def two(self):
+    return 2
+
+
+class TfInspectTest(test.TestCase):
+
+  def testCurrentFrame(self):
+    self.assertEqual(inspect.currentframe(), tf_inspect.currentframe())
+
+  def testGetArgSpecOnDecoratorsThatDontProvideArgspec(self):
+    argspec = tf_inspect.getargspec(test_decorated_function_with_defaults)
+    self.assertEqual(['a', 'b', 'c'], argspec.args)
+    self.assertEqual((2, 'Hello'), argspec.defaults)
+
+  def testGetArgSpecOnDecoratorThatChangesArgspec(self):
+    argspec = tf_inspect.ArgSpec(
+        args=['a', 'b', 'c'],
+        varargs=None,
+        keywords=None,
+        defaults=(1, 'hello'))
+
+    decorator = tf_decorator.TFDecorator('', test_undecorated_function, '',
+                                         argspec)
+    self.assertEqual(argspec, tf_inspect.getargspec(decorator))
+
+  def testGetArgSpecIgnoresDecoratorsThatDontProvideArgspec(self):
+    argspec = tf_inspect.ArgSpec(
+        args=['a', 'b', 'c'],
+        varargs=None,
+        keywords=None,
+        defaults=(1, 'hello'))
+
+    inner_decorator = tf_decorator.TFDecorator('', test_undecorated_function,
+                                               '', argspec)
+    outer_decorator = tf_decorator.TFDecorator('', inner_decorator)
+    self.assertEqual(argspec, tf_inspect.getargspec(outer_decorator))
+
+  def testGetArgSpecReturnsOutermostDecoratorThatChangesArgspec(self):
+    outer_argspec = tf_inspect.ArgSpec(
+        args=['a'], varargs=None, keywords=None, defaults=None)
+    inner_argspec = tf_inspect.ArgSpec(
+        args=['b'], varargs=None, keywords=None, defaults=None)
+
+    inner_decorator = tf_decorator.TFDecorator('', test_undecorated_function,
+                                               '', inner_argspec)
+    outer_decorator = tf_decorator.TFDecorator('', inner_decorator, '',
+                                               outer_argspec)
+    self.assertEqual(outer_argspec, tf_inspect.getargspec(outer_decorator))
+
+  def testGetDoc(self):
+    self.assertEqual('Test Decorated Function With Defaults Docstring.',
+                     tf_inspect.getdoc(test_decorated_function_with_defaults))
+
+  def testGetFile(self):
+    self.assertTrue('tf_inspect_test.py' in tf_inspect.getfile(
+        test_decorated_function_with_defaults))
+    self.assertTrue('tf_decorator.py' in tf_inspect.getfile(
+        test_decorator('decorator')(tf_decorator.unwrap)))
+
+  def testGetMembers(self):
+    self.assertEqual(
+        inspect.getmembers(TestDecoratedClass),
+        tf_inspect.getmembers(TestDecoratedClass))
+
+  def testGetSource(self):
+    expected = '''@test_decorator('decorator')
+def test_decorated_function_with_defaults(a, b=2, c='Hello'):
+  """Test Decorated Function With Defaults Docstring."""
+  return [a, b, c]
+'''
+    self.assertEqual(
+        expected, tf_inspect.getsource(test_decorated_function_with_defaults))
+
+  def testIsClass(self):
+    self.assertTrue(tf_inspect.isclass(TestDecoratedClass))
+    self.assertFalse(tf_inspect.isclass(test_decorated_function))
+
+  def testIsFunction(self):
+    self.assertTrue(tf_inspect.isfunction(test_decorated_function))
+    self.assertFalse(tf_inspect.isfunction(TestDecoratedClass))
+
+  def testIsMethod(self):
+    self.assertTrue(tf_inspect.ismethod(TestDecoratedClass().two))
+    self.assertFalse(tf_inspect.ismethod(test_decorated_function))
+
+  def testIsModule(self):
+    self.assertTrue(
+        tf_inspect.ismodule(inspect.getmodule(inspect.currentframe())))
+    self.assertFalse(tf_inspect.ismodule(test_decorated_function))
+
+  def testIsRoutine(self):
+    self.assertTrue(tf_inspect.isroutine(len))
+    self.assertFalse(tf_inspect.isroutine(TestDecoratedClass))
+
+  def testStack(self):
+    expected_stack = inspect.stack()
+    actual_stack = tf_inspect.stack()
+    self.assertEqual(len(expected_stack), len(actual_stack))
+    self.assertEqual(expected_stack[0][0], actual_stack[0][0])  # Frame object
+    self.assertEqual(expected_stack[0][1], actual_stack[0][1])  # Filename
+    self.assertEqual(expected_stack[0][2],
+                     actual_stack[0][2] - 1)  # Line number
+    self.assertEqual(expected_stack[0][3], actual_stack[0][3])  # Function name
+    self.assertEqual(expected_stack[1:], actual_stack[1:])
+
+
+class TfInspectGetCallArgsTest(test.TestCase):
+
+  def testReturnsEmptyWhenUnboundFuncHasNoParameters(self):
+
+    def empty():
+      pass
+
+    self.assertEqual({}, tf_inspect.getcallargs(empty))
+
+  def testUnboundFuncWithOneParamPositional(self):
+
+    def func(a):
+      return a
+
+    self.assertEqual({'a': 5}, tf_inspect.getcallargs(func, 5))
+
+  def testUnboundFuncWithTwoParamsPositional(self):
+
+    def func(a, b):
+      return (a, b)
+
+    self.assertEqual({'a': 10, 'b': 20}, tf_inspect.getcallargs(func, 10, 20))
+
+  def testUnboundFuncWithOneParamKeyword(self):
+
+    def func(a):
+      return a
+
+    self.assertEqual({'a': 5}, tf_inspect.getcallargs(func, a=5))
+
+  def testUnboundFuncWithTwoParamsKeyword(self):
+
+    def func(a, b):
+      return (a, b)
+
+    self.assertEqual({'a': 6, 'b': 7}, tf_inspect.getcallargs(func, a=6, b=7))
+
+  def testUnboundFuncWithOneParamDefault(self):
+
+    def func(a=13):
+      return a
+
+    self.assertEqual({'a': 13}, tf_inspect.getcallargs(func))
+
+  def testUnboundFuncWithOneParamDefaultOnePositional(self):
+
+    def func(a=0):
+      return a
+
+    self.assertEqual({'a': 1}, tf_inspect.getcallargs(func, 1))
+
+  def testUnboundFuncWithTwoParamsDefaultOnePositional(self):
+
+    def func(a=1, b=2):
+      return (a, b)
+
+    self.assertEqual({'a': 5, 'b': 2}, tf_inspect.getcallargs(func, 5))
+
+  def testUnboundFuncWithTwoParamsDefaultTwoPositional(self):
+
+    def func(a=1, b=2):
+      return (a, b)
+
+    self.assertEqual({'a': 3, 'b': 4}, tf_inspect.getcallargs(func, 3, 4))
+
+  def testUnboundFuncWithOneParamDefaultOneKeyword(self):
+
+    def func(a=1):
+      return a
+
+    self.assertEqual({'a': 3}, tf_inspect.getcallargs(func, a=3))
+
+  def testUnboundFuncWithTwoParamsDefaultOneKeywordFirst(self):
+
+    def func(a=1, b=2):
+      return (a, b)
+
+    self.assertEqual({'a': 3, 'b': 2}, tf_inspect.getcallargs(func, a=3))
+
+  def testUnboundFuncWithTwoParamsDefaultOneKeywordSecond(self):
+
+    def func(a=1, b=2):
+      return (a, b)
+
+    self.assertEqual({'a': 1, 'b': 4}, tf_inspect.getcallargs(func, b=4))
+
+  def testUnboundFuncWithTwoParamsDefaultTwoKeywords(self):
+
+    def func(a=1, b=2):
+      return (a, b)
+
+    self.assertEqual({'a': 3, 'b': 4}, tf_inspect.getcallargs(func, a=3, b=4))
+
+  def testBoundFuncWithOneParam(self):
+
+    class Test(object):
+
+      def bound(self):
+        pass
+
+    t = Test()
+    self.assertEqual({'self': t}, tf_inspect.getcallargs(t.bound))
+
+  def testBoundFuncWithManyParamsAndDefaults(self):
+
+    class Test(object):
+
+      def bound(self, a, b=2, c='Hello'):
+        return (a, b, c)
+
+    t = Test()
+    self.assertEqual({
+        'self': t,
+        'a': 3,
+        'b': 2,
+        'c': 'Goodbye'
+    }, tf_inspect.getcallargs(t.bound, 3, c='Goodbye'))
+
+  def testClassMethod(self):
+
+    class Test(object):
+
+      @classmethod
+      def test(cls, a, b=3, c='hello'):
+        return (a, b, c)
+
+    self.assertEqual({
+        'cls': Test,
+        'a': 5,
+        'b': 3,
+        'c': 'goodbye'
+    }, tf_inspect.getcallargs(Test.test, 5, c='goodbye'))
+
+  def testUsesOutermostDecoratorsArgSpec(self):
+
+    def func():
+      pass
+
+    def wrapper(*args, **kwargs):
+      return func(*args, **kwargs)
+
+    decorated = tf_decorator.make_decorator(
+        func,
+        wrapper,
+        decorator_argspec=tf_inspect.ArgSpec(
+            args=['a', 'b', 'c'],
+            varargs=None,
+            keywords=None,
+            defaults=(3, 'hello')))
+
+    self.assertEqual({
+        'a': 4,
+        'b': 3,
+        'c': 'goodbye'
+    }, tf_inspect.getcallargs(decorated, 4, c='goodbye'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
new file mode 100644
index 0000000000000000000000000000000000000000..88df3351e66711632dcf74bd8875d6d89fabf908
--- /dev/null
+++ b/tensorflow/python/util/tf_should_use.py
@@ -0,0 +1,174 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Decorator that provides a warning if the wrapped object is never used."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import traceback
+import types
+
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.util import tf_decorator
+
+
+def _add_should_use_warning(x, fatal_error=False):
+  """Wraps object x so that if it is never used, a warning is logged.
+
+  Args:
+    x: Python object.
+    fatal_error: Python bool.  If `True`, tf.logging.fatal is raised
+      if the returned value is never used.
+
+  Returns:
+    An instance of `TFShouldUseWarningWrapper` which subclasses `type(x)`
+    and is a very shallow wrapper for `x` which logs access into `x`.
+  """
+  if x is None:  # special corner case where x is None
+    return x
+  has_been_used = getattr(x, '_tf_object_has_been_used', None)
+  if has_been_used is not None:
+    x._tf_object_has_been_used = has_been_used  # pylint: disable=protected-access
+    return x
+
+  def override_method(method):
+    def fn(self, *args, **kwargs):
+      self._tf_object_has_been_used = True  # pylint: disable=protected-access
+      return method(self, *args, **kwargs)
+    return fn
+
+  class TFShouldUseWarningWrapper(type(x)):
+    """Wrapper for objects that keeps track of their use."""
+
+    def __init__(self, true_self):
+      self.__dict__ = true_self.__dict__
+      stack = [x.strip() for x in traceback.format_stack()]
+      # Remove top three stack entries from adding the wrapper
+      self._tf_object_creation_stack = '\n'.join(stack[:-3])
+      self._tf_object_has_been_used = False
+
+    # Not sure why this pylint warning is being used; this is not an
+    # old class form.
+    # pylint: disable=super-on-old-class
+    def __getattribute__(self, name):
+      if name != '_tf_object_has_been_used':
+        self._tf_object_has_been_used = True
+      return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
+
+    def __del__(self):
+      if not self._tf_object_has_been_used:
+        if fatal_error:
+          logger = tf_logging.fatal
+        else:
+          logger = tf_logging.error
+        logger(
+            '==================================\n'
+            'Object was never used (type %s):\n%s\nIf you want to mark it as '
+            'used call its "mark_used()" method.\nIt was originally created '
+            'here:\n%s\n'
+            '==================================' %
+            (type(x), x, self._tf_object_creation_stack))
+
+      if hasattr(super(TFShouldUseWarningWrapper, self), '__del__'):
+        return super(TFShouldUseWarningWrapper, self).__del__()
+
+    def mark_used(self, *args, **kwargs):
+      self._tf_object_has_been_used = True
+      if hasattr(super(TFShouldUseWarningWrapper, self), 'mark_used'):
+        return super(TFShouldUseWarningWrapper, self).mark_used(*args, **kwargs)
+    # pylint: enable=super-on-old-class
+
+  for name in dir(TFShouldUseWarningWrapper):
+    method = getattr(TFShouldUseWarningWrapper, name)
+    if not isinstance(method, types.FunctionType):
+      continue
+    if name in ('__init__', '__getattribute__', '__del__', 'mark_used'):
+      continue
+    setattr(TFShouldUseWarningWrapper, name,
+            functools.wraps(method)(override_method(method)))
+
+  wrapped = TFShouldUseWarningWrapper(x)
+  wrapped.__doc__ = x.__doc__  # functools.wraps fails on some objects.
+  wrapped._tf_object_has_been_used = False   # pylint: disable=protected-access
+  return wrapped
+
+
+def should_use_result(fn):
+  """Function wrapper that ensures the function's output is used.
+
+  If the output is not used, a `tf.logging.error` is logged.
+
+  An output is marked as used if any of its attributes are read, modified, or
+  updated.  Examples when the output is a `Tensor` include:
+
+  - Using it in any capacity (e.g. `y = t + 0`, `sess.run(t)`)
+  - Accessing a property (e.g. getting `t.name` or `t.op`).
+
+  Note, certain behaviors cannot be tracked - for these the object may not
+  be marked as used.  Examples include:
+
+  - `t != 0`.  In this case, comparison is done on types / ids.
+  - `isinstance(t, tf.Tensor)`.  Similar to above.
+
+  Args:
+    fn: The function to wrap.
+
+  Returns:
+    The wrapped function.
+  """
+  def wrapped(*args, **kwargs):
+    return _add_should_use_warning(fn(*args, **kwargs))
+  return tf_decorator.make_decorator(
+      fn, wrapped, 'should_use_result',
+      ((fn.__doc__ or '') +
+       ('\n\n  '
+        '**NOTE** The output of this function should be used.  If it is not, '
+        'a warning will be logged.  To mark the output as used, '
+        'call its .mark_used() method.')))
+
+
+def must_use_result_or_fatal(fn):
+  """Function wrapper that ensures the function's output is used.
+
+  If the output is not used, a `tf.logging.fatal` error is raised.
+
+  An output is marked as used if any of its attributes are read, modified, or
+  updated.  Examples when the output is a `Tensor` include:
+
+  - Using it in any capacity (e.g. `y = t + 0`, `sess.run(t)`)
+  - Accessing a property (e.g. getting `t.name` or `t.op`).
+
+  Note, certain behaviors cannot be tracked - for these the object may not
+  be marked as used.  Examples include:
+
+  - `t != 0`.  In this case, comparison is done on types / ids.
+  - `isinstance(t, tf.Tensor)`.  Similar to above.
+
+  Args:
+    fn: The function to wrap.
+
+  Returns:
+    The wrapped function.
+  """
+  def wrapped(*args, **kwargs):
+    return _add_should_use_warning(fn(*args, **kwargs), fatal_error=True)
+  return tf_decorator.make_decorator(
+      fn, wrapped, 'must_use_result_or_fatal',
+      ((fn.__doc__ or '') +
+       ('\n\n  '
+        '**NOTE** The output of this function must be used.  If it is not, '
+        'a fatal error will be raised.  To mark the output as used, '
+        'call its .mark_used() method.')))
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..71d48e3dde308c9af59b8dce6a06c4c7d587e24a
--- /dev/null
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -0,0 +1,108 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for tf_should_use."""
+
+# pylint: disable=unused-import
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import sys
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.util import tf_should_use
+
+
+@contextlib.contextmanager
+def reroute_error(captured):
+  """Temporarily reroute errors written to tf_logging.error into `captured`."""
+  del captured[:]
+  true_logger = tf_logging.error
+  def capture_errors(*args, **unused_kwargs):
+    captured.extend(args)
+  tf_logging.error = capture_errors
+  try:
+    yield
+  finally:
+    tf_logging.error = true_logger
+
+
+class TfShouldUseTest(test.TestCase):
+
+  def testAddShouldUseWarningWhenNotUsed(self):
+    c = constant_op.constant(0, name='blah')
+    captured = []
+    with reroute_error(captured):
+      def in_this_function():
+        h = tf_should_use._add_should_use_warning(c)
+        del h
+      in_this_function()
+    self.assertIn('Object was never used', '\n'.join(captured))
+    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('in_this_function', '\n'.join(captured))
+
+  def _testAddShouldUseWarningWhenUsed(self, fn):
+    c = constant_op.constant(0, name='blah')
+    captured = []
+    with reroute_error(captured):
+      h = tf_should_use._add_should_use_warning(c)
+      fn(h)
+      del h
+    self.assertNotIn('Object was never used', '\n'.join(captured))
+    self.assertNotIn('blah:0', '\n'.join(captured))
+
+  def testAddShouldUseWarningWhenUsedWithAdd(self):
+    def add(h):
+      _ = h + 1
+    self._testAddShouldUseWarningWhenUsed(add)
+
+  def testAddShouldUseWarningWhenUsedWithGetName(self):
+    def get_name(h):
+      _ = h.name
+    self._testAddShouldUseWarningWhenUsed(get_name)
+
+  def testShouldUseResult(self):
+    @tf_should_use.should_use_result
+    def return_const(value):
+      return constant_op.constant(value, name='blah')
+    captured = []
+    with reroute_error(captured):
+      return_const(0.0)
+    self.assertIn('Object was never used', '\n'.join(captured))
+    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('return_const', '\n'.join(captured))
+
+  def testShouldUseResultWhenNotReallyUsed(self):
+    @tf_should_use.should_use_result
+    def return_const(value):
+      return constant_op.constant(value, name='blah')
+    captured = []
+    with reroute_error(captured):
+      with self.test_session():
+        return_const(0.0)
+        # Creating another op and executing it does not mark the
+        # unused op as being "used".
+        v = constant_op.constant(1.0, name='meh')
+        v.eval()
+    self.assertIn('Object was never used', '\n'.join(captured))
+    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('return_const', '\n'.join(captured))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
index 5cce6b936568cc6ea3257b7ba591f8193d61f2c6..aa68321acc858c902d1a43600a14ac5d88edb0be 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.h
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
@@ -75,7 +75,7 @@ class Diagnostician {
 
   // Given the DSO version number and the driver version file contents, extracts
   // the driver version and compares, warning the user in the case of
-  // incompatability.
+  // incompatibility.
   //
   // This is solely used for more informative log messages when the user is
   // running on a machine that happens to have a libcuda/kernel driver mismatch.
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index eed0d43a3c11bcb01ceea6c3904c2fe243cddcc3..b6d841f3653f211af56affaa3618562ef2c188c3 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1202,7 +1202,8 @@ class CudnnRnnSequenceTensorDescriptor
     // Only the first one needs to be destroyed. All others are the same.
     cudnnStatus_t status =
         wrap::cudnnDestroyTensorDescriptor(parent_, handles_[0]);
-    CUDNN_RETURN_IF_FAIL(status, "Failed to destroy sequence tensor descriptor");
+    CUDNN_RETURN_IF_FAIL(status,
+                         "Failed to destroy sequence tensor descriptor");
   }
 
   const cudnnTensorDescriptor_t* handles() const {
@@ -3089,7 +3090,7 @@ bool CudnnSupport::DoPoolForward(
     DeviceMemory<double>* output_data) {
   mutex_lock lock{dnn_handle_mutex_};
   auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                        AsCUDAStreamValue(stream));
+                                     AsCUDAStreamValue(stream));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
@@ -3195,7 +3196,7 @@ bool CudnnSupport::DoPoolBackward(
     DeviceMemory<double>* output_diff_data) {
   mutex_lock lock{dnn_handle_mutex_};
   auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                        AsCUDAStreamValue(stream));
+                                     AsCUDAStreamValue(stream));
   if (status != CUDNN_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cudnn handle: " << ToString(status);
     return false;
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index e441321fc86da830a9f5212d9a6a89763d140344..76778dbeececdd476ce6dce1814c8d2845bfbfc8 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -15,10 +15,11 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 
-#include <map>
 #include <stdint.h>
 #include <stdlib.h>
+#include <map>
 #include <set>
+#include <utility>
 
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/lib/casts.h"
@@ -227,7 +228,7 @@ string ToString(CUresult result) {
 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
 // context behind our backs).
 CUcontext CurrentContext() {
-  CUcontext current  = CUDADriver::CurrentContextOrDie();
+  CUcontext current = CUDADriver::CurrentContextOrDie();
   if (current != nullptr && !CreatedContexts::Has(current)) {
     LOG(FATAL) << "current context was not created by the StreamExecutor "
                   "cuda_driver API: "
@@ -453,7 +454,8 @@ static port::Status InternalInit() {
   return true;
 }
 
-bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) {
+bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
+                                 int *flags) {
   static_assert(DeviceOptions::kMask == 0xf,
                 "needs update for new device options");
 
@@ -480,27 +482,56 @@ bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) {
     CUdevice device, DeviceOptions device_options, CudaContext** context) {
   *context = nullptr;
 
-  CUcontext former_context = CurrentContext();
-  if (former_context != nullptr) {
-    LOG(WARNING) << "creating context when one is currently active; existing: "
-                 << former_context;
-  }
-
   int flags = 0;
   if (!DeviceOptionsToContextFlags(device_options, &flags)) {
     LOG(WARNING) << "could not convert all device options into context flags";
   }
 
   CUresult res;
+  CUcontext former_context;
   CUcontext new_context;
   {
     // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
     // context creation: see http://b/13248943
 
 #if CUDA_VERSION >= 7000
-    res = cuDevicePrimaryCtxSetFlags(device, flags);
+    {
+      unsigned int former_primary_context_flags;
+      int former_primary_context_is_active;
+      CHECK_EQ(CUDA_SUCCESS,
+               cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
+                                          &former_primary_context_is_active));
+      if (former_primary_context_flags != flags) {
+        if (former_primary_context_is_active) {
+          LOG(ERROR)
+              << "The primary context is active and has a different flag set ("
+              << former_primary_context_flags << ") than the desired flag set ("
+              << flags << ").";
+        } else {
+          CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
+        }
+      }
+    }
+
+    former_context = CUDADriver::CurrentContextOrDie();
     res = cuDevicePrimaryCtxRetain(&new_context, device);
+    if (former_context != nullptr) {
+      if (former_context == new_context) {
+        VLOG(2) << "The primary context " << former_context
+                << " exists before initializing the StreamExecutor.";
+      } else {
+        LOG(WARNING) << "A non-primary context " << former_context
+                     << " exists before initializing the StreamExecutor. We "
+                        "haven't verified StreamExecutor works with that.";
+      }
+    }
 #else
+    former_context = CurrentContext();
+    if (former_context != nullptr) {
+      LOG(WARNING)
+          << "creating context when one is currently active; existing: "
+          << former_context;
+    }
     res = cuCtxCreate(&new_context, flags, device);
 #endif
   }
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index c5d7d8b32f3a4212676565b5ac133be59143dd83..68494aba6597c2cd1ee52a7b4cb411cd50fad77b 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -77,7 +77,7 @@ class CUDADriver {
 
   // Destroys a CUDA stream associated with the given context.
   // stream is owned by the caller, must not be null, and *stream is set to null
-  // if the stream is successfuly destroyed.
+  // if the stream is successfully destroyed.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
   static void DestroyStream(CudaContext* context, CUstream *stream);
 
diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h
index 46f0232b1ddd7c706b2de1cb6fae9a8ec496d2d0..56667e65d38199fd4c340147c4e40a17c5bb2b2d 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.h
+++ b/tensorflow/stream_executor/cuda/cuda_event.h
@@ -46,7 +46,7 @@ class CUDAEvent : public internal::EventInterface {
   // Polls the CUDA platform for the event's current status.
   Event::Status PollForStatus();
 
-  // The underyling CUDA event element.
+  // The underlying CUDA event element.
   const CUevent& cuda_event();
 
  private:
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 1bb90afd63e2eee3a51b057cebeff2c3cb2eac8f..43c707730af712d3e85b9e8c2dba27981edf41aa 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -67,14 +67,6 @@ limitations under the License.
 extern bool FLAGS_check_gpu_leaks;
 bool FLAGS_prefer_cubin_to_ptx = true;
 
-namespace perftools {
-namespace gputools {
-namespace rng {
-class RngSupport;
-}  // namespace rng
-}  // namespace gputools
-}  // namespace perftools
-
 namespace perftools {
 namespace gputools {
 namespace cuda {
@@ -855,7 +847,7 @@ void *CUDAExecutor::CudaContextHack() { return context_; }
 
 CudaContext* CUDAExecutor::cuda_context() { return context_; }
 
-// Attemps to read the NUMA node corresponding to the GPU device's PCI bus out
+// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
 // of SysFS. Returns -1 if it cannot.
 //
 // For anything more complicated/prod-focused than this, you'll likely want to
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 9d386b5ed9ef891751b59c560f3fa1696166d77e..6c5b9dca90b8be632d084aff46657132807b8ea5 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -35,17 +35,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-namespace perftools {
-namespace gputools {
-namespace blas {
-class BlasSupport;
-}
-namespace internal {
-class RngSupport;
-}  // namespace internal
-}  // namespace gputools
-}  // namespace perftools
-
 namespace perftools {
 namespace gputools {
 namespace cuda {
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 2cd4eda4119956154bc4563a6fb9e139e5f720ce..c5805064f3c164fcde163bcc6cf00c049af4e8d3 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -1276,23 +1276,29 @@ class DnnSupport {
   virtual bool DoPoolForward(Stream* stream,
                              const dnn::PoolingDescriptor& pooling_dimensions,
                              const dnn::BatchDescriptor& input_dimensions,
-                             const DeviceMemory<double>& input_data,
+                             const DeviceMemory<float>& input_data,
                              const dnn::BatchDescriptor& output_dimensions,
-                             DeviceMemory<double>* output_data) = 0;
+                             DeviceMemory<float>* output_data) = 0;
 
   virtual bool DoPoolForward(Stream* stream,
                              const dnn::PoolingDescriptor& pooling_dimensions,
                              const dnn::BatchDescriptor& input_dimensions,
-                             const DeviceMemory<float>& input_data,
+                             const DeviceMemory<double>& input_data,
                              const dnn::BatchDescriptor& output_dimensions,
-                             DeviceMemory<float>* output_data) = 0;
+                             DeviceMemory<double>* output_data) {
+    LOG(FATAL) << "DoPoolForward not implemented for double.";
+    return false;
+  }
 
   virtual bool DoPoolForward(Stream* stream,
                              const dnn::PoolingDescriptor& pooling_dimensions,
                              const dnn::BatchDescriptor& input_dimensions,
                              const DeviceMemory<Eigen::half>& input_data,
                              const dnn::BatchDescriptor& output_dimensions,
-                             DeviceMemory<Eigen::half>* output_data) = 0;
+                             DeviceMemory<Eigen::half>* output_data) {
+    LOG(FATAL) << "DoPoolForward not implemented for float16.";
+    return false;
+  }
 
   // Performs differentiation of the pooling operation.
   virtual bool DoPoolBackward(Stream* stream,
@@ -1302,7 +1308,10 @@ class DnnSupport {
                               const dnn::BatchDescriptor& output_dimensions,
                               const DeviceMemory<double>& output_data,
                               const DeviceMemory<double>& input_diff_data,
-                              DeviceMemory<double>* output_diff_data) = 0;
+                              DeviceMemory<double>* output_diff_data) {
+    LOG(FATAL) << "DoPoolBackward not implemented.";
+    return false;
+  }
 
   virtual bool DoPoolBackward(Stream* stream,
                               const dnn::PoolingDescriptor& pooling_dimensions,
@@ -1311,7 +1320,10 @@ class DnnSupport {
                               const dnn::BatchDescriptor& output_dimensions,
                               const DeviceMemory<float>& output_data,
                               const DeviceMemory<float>& input_diff_data,
-                              DeviceMemory<float>* output_diff_data) = 0;
+                              DeviceMemory<float>* output_diff_data) {
+    LOG(FATAL) << "DoPoolBackward not implemented.";
+    return false;
+  }
 
   virtual bool DoPoolBackward(Stream* stream,
                               const dnn::PoolingDescriptor& pooling_dimensions,
@@ -1320,7 +1332,10 @@ class DnnSupport {
                               const dnn::BatchDescriptor& output_dimensions,
                               const DeviceMemory<Eigen::half>& output_data,
                               const DeviceMemory<Eigen::half>& input_diff_data,
-                              DeviceMemory<Eigen::half>* output_diff_data) = 0;
+                              DeviceMemory<Eigen::half>* output_diff_data) {
+    LOG(FATAL) << "DoPoolBackward not implemented.";
+    return false;
+  }
 
   // Applies local response normalization to the values from
   // input_data and writes the result to output_data. See comments on
@@ -1900,4 +1915,3 @@ class DnnSupport {
 }  // namespace perftools
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
-
diff --git a/tensorflow/stream_executor/lib/status.h b/tensorflow/stream_executor/lib/status.h
index 0aec2917dc224b8e6ebb3f4486a81b72778a5c15..8c289e1927fcf9b49851389e44d3fa09fdfea3ae 100644
--- a/tensorflow/stream_executor/lib/status.h
+++ b/tensorflow/stream_executor/lib/status.h
@@ -29,8 +29,7 @@ namespace port {
 
 using Status = tensorflow::Status;
 
-#define SE_CHECK_OK(val) \
-  CHECK_EQ(::perftools::gputools::port::Status::OK(), (val))
+#define SE_CHECK_OK(val) TF_CHECK_OK(val)
 #define SE_ASSERT_OK(val) \
   ASSERT_EQ(::perftools::gputools::port::Status::OK(), (val))
 
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index 2a6df910abe5d4bf22203f90d345f0b133c4976b..bb423e390aa7ab32b3b388ae747a0e5d7856484a 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -135,7 +135,7 @@ class StatusOr {
   // operators, to support move-only types and avoid unnecessary copying.
   StatusOr(T&& value);  // NOLINT
 
-  // Move conversion operator to avoid unecessary copy.
+  // Move conversion operator to avoid unnecessary copy.
   // T must be assignable from U.
   // Not marked with explicit so the implicit conversion can happen.
   template <typename U>
@@ -202,13 +202,13 @@ StatusOr<T>::StatusOr(const T& value)
 
 template <typename T>
 const T& StatusOr<T>::ValueOrDie() const {
-  assert(status_.ok());
+  TF_CHECK_OK(status_);
   return value_;
 }
 
 template <typename T>
 T StatusOr<T>::ConsumeValueOrDie() {
-  assert(status_.ok());
+  TF_CHECK_OK(status_);
   return std::move(value_);
 }
 
diff --git a/tensorflow/stream_executor/plugin.h b/tensorflow/stream_executor/plugin.h
index b1db8b7cb8760afd7f16736991edc89b56f3b486..0b88b86e2b1cf8cbd3dddfa5ca3ae9cdc779a952 100644
--- a/tensorflow/stream_executor/plugin.h
+++ b/tensorflow/stream_executor/plugin.h
@@ -49,7 +49,7 @@ enum class PluginKind {
 //
 // A PluginConfig may be passed to the StreamExecutor constructor - the plugins
 // described therein will be used to provide BLAS, DNN, FFT, and RNG
-// functionality. Platform-approprate defaults will be used for any un-set
+// functionality. Platform-appropriate defaults will be used for any un-set
 // libraries. If a platform does not support a specified plugin (ex. cuBLAS on
 // an OpenCL executor), then an error will be logged and no plugin operations
 // will succeed.
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 751ccd3d0ef0c01d7bf84def148ae48a060d10f8..9d3ac4ed9ed9d3e2e01eaee791a040530c0494bf 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -319,7 +319,7 @@ class StreamExecutorInterface {
   // Creates a new DnnSupport object, ownership is transferred to the caller.
   // If SupportsDnn() is false, this will always return null.
   //
-  // If SupportsDnn() is true, this may return null, for example, if the RNG
+  // If SupportsDnn() is true, this may return null, for example, if the DNN
   // initialization fails.
   virtual dnn::DnnSupport *CreateDnn() { return nullptr; }
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 42fcd5867cae4f8306174afb32f439c3cebe13bf..fe5da12639fdb73b18e9b5526b00e101dd509e25 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
 #include <atomic>
+#include <utility>
 
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/fft.h"
@@ -204,7 +205,7 @@ StreamExecutor::~StreamExecutor() {
 port::Status StreamExecutor::Init(int device_ordinal,
                                   DeviceOptions device_options) {
   device_ordinal_ = device_ordinal;
-  return implementation_->Init(device_ordinal, device_options);
+  return implementation_->Init(device_ordinal, std::move(device_options));
 }
 
 port::Status StreamExecutor::Init() {
@@ -619,7 +620,7 @@ bool StreamExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
 
 bool StreamExecutor::HostCallback(Stream *stream,
                                   std::function<void()> callback) {
-  return implementation_->HostCallback(stream, callback);
+  return implementation_->HostCallback(stream, std::move(callback));
 }
 
 port::Status StreamExecutor::AllocateEvent(Event *event) {
@@ -689,7 +690,7 @@ bool StreamExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
 }
 
 void StreamExecutor::EnqueueOnBackgroundThread(std::function<void()> task) {
-  background_threads_->Schedule(task);
+  background_threads_->Schedule(std::move(task));
 }
 
 void StreamExecutor::CreateAllocRecord(void *opaque, uint64 bytes) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 5c52afa794474cdb7cdd156a6f1d101d3f168f6e..780d12c8dce695877d25d9e3ae0267d0bf81cb75 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -205,7 +205,7 @@ class StreamExecutor {
   // This should be done before deallocating the region with delete[]/free/etc.
   bool HostMemoryUnregister(void *location) SE_MUST_USE_RESULT;
 
-  // Synchronizes all activity occuring in the StreamExecutor's context (most
+  // Synchronizes all activity occurring in the StreamExecutor's context (most
   // likely a whole device).
   bool SynchronizeAllActivity() SE_MUST_USE_RESULT;
 
@@ -238,7 +238,7 @@ class StreamExecutor {
                                     DeviceMemoryBase *gpu_dst);
 
   // Alternative interface for memcpying from host to device that takes an
-  // array slice. Checks that the destination size can accomodate the host
+  // array slice. Checks that the destination size can accommodate the host
   // slice size.
   template <class T>
   port::Status SynchronousMemcpyH2D(port::ArraySlice<T> host_src,
@@ -253,7 +253,7 @@ class StreamExecutor {
                                     void *host_dst);
 
   // Alternative interface for memcpying from device to host that takes an
-  // array slice. Checks that the destination size can accomodate the host
+  // array slice. Checks that the destination size can accommodate the host
   // slice size.
   template <typename T>
   port::Status SynchronousMemcpyD2H(const DeviceMemory<T> &gpu_src,
diff --git a/tensorflow/tensorboard/.bowerrc b/tensorflow/tensorboard/.bowerrc
deleted file mode 100644
index 333544ec7a8316f772b3524f40e407d2b9117b84..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/.bowerrc
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "directory" : "components"
-}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/.gitignore b/tensorflow/tensorboard/.gitignore
deleted file mode 100644
index 98b964254550f146b752efbb0cb11a0e95e28525..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/.gitignore
+++ /dev/null
@@ -1,27 +0,0 @@
-node_modules/*
-typings/*
-build/*
-dist/tf-tensorboard-demo.html
-
-# Since bower components are stored in the same directory as
-# tensorboard components, we ignore everything under components
-# except our own components which start with tf-.
-components/*
-# This rule should always be in sync with TF_COMPONENTS_TYPESCRIPT_GLOB
-# in gulpfile.js
-!components/tf-*
-!components/tf_*
-!components/vz-*
-!components/vz_*
-!components/index.html
-!components/BUILD
-# Ignore the sample graph files since they are too large to
-# be in the repo.
-components/tf-graph/demo/tf_model_zoo/*
-
-# All standalone code for TensorBoard components should be written in
-# typescript, and the compiled javascript code should be ignored.
-components/tf-*/**/*.js
-components/tf_*/**/*.js
-components/vz-*/**/*.js
-components/vz_*/**/*.js
diff --git a/tensorflow/tensorboard/BUILD b/tensorflow/tensorboard/BUILD
index ea409a931241e9ac042f5c17f5b19909e5c44352..11715319a01a4e6c97f72f5820513c92ace8c873 100644
--- a/tensorflow/tensorboard/BUILD
+++ b/tensorflow/tensorboard/BUILD
@@ -1,58 +1,42 @@
 # Description:
 # TensorBoard, a dashboard for investigating TensorFlow
 
-package(
-    default_visibility = ["//tensorflow:internal"],
-    features = [
-        "-layering_check",
-        "-parse_headers",
-    ],
-)
+package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-filegroup(
-    name = "frontend",
-    srcs = [
-        "TAG",
-        "dist/bazel-html-imports.html",
-        "dist/index.html",
-        "dist/tf-tensorboard.html",
-        "//tensorflow/tensorboard/bower",
-        "//tensorflow/tensorboard/lib:all_files",
-    ],
-)
-
 py_binary(
     name = "tensorboard",
-    srcs = [
-        "__main__.py",
-        "tensorboard.py",
-    ],
-    data = [":frontend"],
+    srcs = ["tensorboard.py"],
+    data = [":assets"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:platform",
         "//tensorflow/tensorboard/backend:application",
         "//tensorflow/tensorboard/backend/event_processing:event_file_inspector",
+        "//tensorflow/tensorboard/plugins/projector:projector_plugin",
+        "//tensorflow/tensorboard/plugins/text:text_plugin",
         "@org_pocoo_werkzeug//:werkzeug",
     ],
 )
 
+filegroup(
+    name = "assets",
+    srcs = [
+        "TAG",
+        "//tensorflow/tensorboard/components:index.html",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
-        ["**/*"],
+        ["**"],
         exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "**/node_modules/**",
-            "**/typings/**",
+            "METADATA",
+            "OWNERS",
+            "tensorboard.google.bzl",
         ],
     ),
-    visibility = ["//tensorflow:__subpackages__"],
+    tags = ["notsan"],
 )
diff --git a/tensorflow/tensorboard/DEVELOPMENT.md b/tensorflow/tensorboard/DEVELOPMENT.md
index 0a35dec42fb632fcae3ac7d1f3af3c88831365ce..8e86bf04db400505d3d56f2de2c8165b6a6cc7e3 100644
--- a/tensorflow/tensorboard/DEVELOPMENT.md
+++ b/tensorflow/tensorboard/DEVELOPMENT.md
@@ -2,125 +2,24 @@
 
 ## Launching a Development Instance
 
-The first step is getting a TensorBoard development environment set up. You
-should start by making sure you have [nodejs](https://nodejs.org/en/) and
-[npm](https://www.npmjs.com/). On Ubuntu, `sudo apt-get install -y nodejs
-nodejs-legacy npm`. Ensure your npm version is >=3.0 by running
-'npm --version'. If the version is <3.0, run 'sudo npm install npm -g' to
-update to the latest version. You may need to open a new terminal window after
-updating in order to make use of the newly-installed version.
+Run the following to launch a demo of TensorBoard in raw sources mode:
 
-Next, you'll want to install [gulp](http://gulpjs.com/) and
-[bower](http://bower.io/), which are used for build tooling and dependency
-management respectively. Both must be installed globally: `sudo npm install -g
-gulp bower` will do that.
+```sh
+bazel run third_party/tensorflow/tensorboard/components/tf_tensorboard_d3v4:demo
+```
 
-Then, cd into the TensorBoard directory:
-
-`cd tensorflow/tensorboard`
-
-and install dependencies:
-
-`npm run prepare`
-
-Then, run gulp: `gulp`
-
-(Don't worry if there are some linter errors.)
-
-Now you can navigate to
-[http://localhost:8000/demo/index.html](http://localhost:8000/demo/index.html)
-and play with the demo TensorBoard instance. If you make changes to the source
-code, `gulp` should detect it, recompile (if Typescript), and reload your
-browser.
+Now you can navigate to <http://localhost:6006/demo/index.html> and play with
+the demo TensorBoard instance. This will have live source reloading.
 
 This demo TensorBoard will have a small amount of demo data generated by
 [generate_testdata.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/scripts/generate_testdata.py).
 You can use [serialize_tensorboard.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/scripts/serialize_tensorboard.py)
 to create a realistic demo directory from your own data files.
 
-## Launching TensorBoard with modified source
-
-If you are developing in open source, and have made some changes to TensorBoard
-that you'd like to try out on real data, then you need to regenerate
-`dist/tf-tensorboard.html`.
-
-Run `gulp regenerate`. That will recompile all of the TensorBoard assets, and
-produce a new tf-tensorboard.html with your changes.
-
-Now, you can use `bazel` to launch TensorBoard:
-
-`bazel run //tensorflow/tensorboard:tensorboard -- --logdir=/path/to/logs`.
-
-## Updating the vulcanized HTML file (for linux)
-
-The vulcanized HTML file `dist/tf-tensorboard.html.OPENSOURCE` is the version of
-Tensorboard started up by users who install TensorFlow via pip. Today, updating
-that file involves using gulp. Future efforts will streamline this process.
-
-First, `cd` into the `tensorflow/tensorboard` directory within a git repository
-(a piper client will not work). Run `npm run prepare`.
-
-Next, we build some third party JS dependencies via webfiles targets. Run
-
-    bazel build \
-        tensorflow/tensorboard/components/tf_imports:d3 \
-        tensorflow/tensorboard/components/tf_imports:lodash \
-        tensorflow/tensorboard/components/tf_imports:graphlib \
-        tensorflow/tensorboard/components/tf_imports:dagre \
-        tensorflow/tensorboard/components/tf_imports:plottable
-
-Users internal to Google should use the internal build tool instead. Move the
-output JS binaries into the tf_imports directory.
-
-Run `gulp vulcanize`. If compilation errors arise (such as those related to
-TypeScript), fix them and re-run. This step should update the contents of
-`dist/tf-tensorboard.html.OPENSOURCE`.
-
-Next, we perform some manual find-and-replaces on script `src` paths within
-`dist/tf-tensorboard.html.OPENSOURCE`. Manually replace:
-
-* `<script src="../tf-imports/d3.js"></script>` with `<script src="../d3/d3.js"></script>`
-* `<script src="../tf-imports/dagre.js"></script>` with `<script src="../dagre/dist/dagre.core.js"></script>`
-* `<script src="../tf-imports/graphlib.js"></script>` with `<script src="../graphlib/dist/graphlib.core.js"></script>`
-* `<script src="../tf-imports/lodash.js"></script>` with `<script src="../lodash/lodash.min.js"></script>`
-* `<script src="../tf-imports/plottable.js"></script>` with `<script src="../plottable/plottable.js"></script>`
-
-Also, remove duplicate instances of script includes. Each of those scripts
-should only be included once (the first time) within the vulcanized output.
-
-### Try out the vulcanized Tensorboard HTML output
-
-To test the vulcanized output, prepare a pip package within a virtualized
-environment, and run `tensorboard` after activating the environment.
-
-To do that, we first create and activate a virtual environment called say
-`tf_foo` (Pick your own name.).
-
-    virtualenv --system-site-packages ~/tf_foo
-    source ~/tf_foo/bin/activate
-
-Make sure that you have installed `pip` and `virtualenv` beforehand. If not, run
-
-    sudo easy_install pip
-    sudo pip install --upgrade virtualenv
-
-Next, we run this command from the `tensorflow directory`.
-
-    tools/google/make_tree.sh --pip_dir=/tmp/pip_dir
-
-to create a pip package. If you are running within Google, also provide the
-`--pending_cl` flag. That script will generate a wheel file (.whl) within
-`/tmp/pip_dir`. Lets say that it is
-`tensorflow-1.0.0rc2-cp27-none-linux_x86_64.whl`.
-
-Run
-
-    pip install --upgrade /tmp/pip_dir/tensorflow-1.0.0rc2-cp27-none-linux_x86_64.whl
+## Launching TensorBoard Proper
 
-to update the pip installation of TensorFlow within the virtual environment.
-Verify that the `tensorboard` command defers to the tensorboard instance
-installed within your virtual environment (`tf_foo`) by running
-`which tensorboard`. To run tensorboard, start it up as usual within the virtual
-environment:
+Running TensorBoard automatically asks Bazel to create a vulcanized HTML binary:
 
-    tensorboard --logdir=/tmp/my/logdir
+```sh
+bazel run //tensorflow/tensorboard:tensorboard -- --logdir=/path/to/logs
+```
diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md
index c9e997044c7d0858ed1434c927a290b550ccfdfe..20be8593cb3c96c732a377c0ad8a259e71514f1d 100644
--- a/tensorflow/tensorboard/README.md
+++ b/tensorflow/tensorboard/README.md
@@ -303,19 +303,14 @@ events. This behavior may be disabled with the flag
 
 ### How can I export data from TensorBoard?
 
-If you'd like to export data to visualize elsewhere (e.g. iPython Notebook),
-that's possible too. You can directly depend on the underlying classes that
-TensorBoard uses for loading data: `python/summary/event_accumulator.py` (for
-loading data from a single run) or `python/summary/event_multiplexer.py` (for
-loading data from multiple runs, and keeping it organized). These classes load
-groups of event files, discard data that was "orphaned" by TensorFlow crashes,
-and organize the data by tag.
-
-As another option, there is a script
-(`tensorboard/scripts/serialize_tensorboard.py`) which will load a logdir just
-like TensorBoard does, but write all of the data out to disk as json instead of
-starting a server. This script is setup to make "fake TensorBoard backends" for
-testing, so it is a bit rough around the edges.
+The Scalar Dashboard supports exporting data; you can click the "enable
+download links" option in the left-hand bar. Then, each plot will provide
+download links for the data it contains.
+
+If you need access to the full dataset, you can read the event files that
+TensorBoard consumes by using the [`summary_iterator`](https://github.com/tensorflow/tensorflow/blob/e7f333b5f8b3c53b21d149d8d14c0cebbde431aa/tensorflow/python/summary/summary_iterator.py#L313)
+method.
+
 
 ### Can I overlap multiple plots?
 
diff --git a/tensorflow/tensorboard/TAG b/tensorflow/tensorboard/TAG
index 82cced27d7be32719d009707139bd949ad6263c9..fb1e7bc86996a80d4a16529b990adda1d3434c92 100644
--- a/tensorflow/tensorboard/TAG
+++ b/tensorflow/tensorboard/TAG
@@ -1 +1 @@
-51
+54
diff --git a/tensorflow/tensorboard/backend/BUILD b/tensorflow/tensorboard/backend/BUILD
index b99e6c565598c51edfc853f114ef65aba71c5eb6..adbdea5f45776c64a8276230a310fb369830d4f7 100644
--- a/tensorflow/tensorboard/backend/BUILD
+++ b/tensorflow/tensorboard/backend/BUILD
@@ -57,7 +57,7 @@ py_test(
 py_library(
     name = "application",
     srcs = ["application.py"],
-    data = ["//tensorflow/tensorboard:frontend"],
+    data = ["//tensorflow/tensorboard:assets"],
     srcs_version = "PY2AND3",
     deps = [
         ":http_util",
@@ -65,9 +65,6 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
         "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "//tensorflow/tensorboard/plugins/debugger:debugger_plugin",
-        "//tensorflow/tensorboard/plugins/projector:projector_plugin",
-        "//tensorflow/tensorboard/plugins/text:text_plugin",
         "@org_pocoo_werkzeug//:werkzeug",
         "@six_archive//:six",
     ],
@@ -90,6 +87,7 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/tensorboard",
         "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
+        "//tensorflow/tensorboard/plugins:base_plugin",
         "@org_pocoo_werkzeug//:werkzeug",
     ],
 )
diff --git a/tensorflow/tensorboard/backend/application.py b/tensorflow/tensorboard/backend/application.py
index e812880bbdadfe9c37fdc3bd06dd97a1950349b2..c38ad92be6934a0250e8555d9de11c0786b838a9 100644
--- a/tensorflow/tensorboard/backend/application.py
+++ b/tensorflow/tensorboard/backend/application.py
@@ -24,7 +24,6 @@ from __future__ import print_function
 
 import csv
 import imghdr
-import mimetypes
 import os
 import re
 import threading
@@ -43,9 +42,6 @@ from tensorflow.tensorboard.backend import http_util
 from tensorflow.tensorboard.backend import process_graph
 from tensorflow.tensorboard.backend.event_processing import event_accumulator
 from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.debugger import debugger_plugin
-from tensorflow.tensorboard.plugins.projector import projector_plugin
-from tensorflow.tensorboard.plugins.text import text_plugin
 
 
 DEFAULT_SIZE_GUIDANCE = {
@@ -61,6 +57,7 @@ DATA_PREFIX = '/data'
 LOGDIR_ROUTE = '/logdir'
 RUNS_ROUTE = '/runs'
 PLUGIN_PREFIX = '/plugin'
+PLUGINS_LISTING_ROUTE = '/plugins_listing'
 SCALARS_ROUTE = '/' + event_accumulator.SCALARS
 IMAGES_ROUTE = '/' + event_accumulator.IMAGES
 AUDIO_ROUTE = '/' + event_accumulator.AUDIO
@@ -96,18 +93,27 @@ class _OutputFormat(object):
   CSV = 'csv'
 
 
-def standard_tensorboard_wsgi(logdir, purge_orphaned_data, reload_interval):
-  """Construct a TensorBoardWSGIApp with standard plugins and multiplexer."""
+def standard_tensorboard_wsgi(
+    logdir,
+    purge_orphaned_data,
+    reload_interval,
+    plugins):
+  """Construct a TensorBoardWSGIApp with standard plugins and multiplexer.
+
+  Args:
+    logdir: The path to the directory containing events files.
+    purge_orphaned_data: Whether to purge orphaned data.
+    reload_interval: The interval at which the backend reloads more data in
+        seconds.
+    plugins: A list of plugins for TensorBoard to initialize.
+
+  Returns:
+    The new TensorBoard WSGI application.
+  """
   multiplexer = event_multiplexer.EventMultiplexer(
       size_guidance=DEFAULT_SIZE_GUIDANCE,
       purge_orphaned_data=purge_orphaned_data)
 
-  plugins = {
-      debugger_plugin.PLUGIN_PREFIX_ROUTE: debugger_plugin.DebuggerPlugin(),
-      projector_plugin.PLUGIN_PREFIX_ROUTE: projector_plugin.ProjectorPlugin(),
-      text_plugin.PLUGIN_PREFIX_ROUTE: text_plugin.TextPlugin(),
-  }
-
   return TensorBoardWSGIApp(logdir, plugins, multiplexer, reload_interval)
 
 
@@ -128,12 +134,16 @@ class TensorBoardWSGIApp(object):
       logdir: the logdir spec that describes where data will be loaded.
         may be a directory, or comma,separated list of directories, or colons
         can be used to provide named directories
-      plugins: Map from plugin name to plugin application
+      plugins: List of plugins that extend tensorboard.plugins.BasePlugin
       multiplexer: The EventMultiplexer with TensorBoard data to serve
       reload_interval: How often (in seconds) to reload the Multiplexer
 
     Returns:
       A WSGI application that implements the TensorBoard backend.
+
+    Raises:
+      ValueError: If some plugin has no plugin_name
+      ValueError: If two plugins have the same plugin_name
     """
     self._logdir = logdir
     self._plugins = plugins
@@ -148,44 +158,53 @@ class TensorBoardWSGIApp(object):
       reload_multiplexer(self._multiplexer, path_to_run)
 
     self.data_applications = {
-        DATA_PREFIX + LOGDIR_ROUTE:
-            self._serve_logdir,
-        DATA_PREFIX + SCALARS_ROUTE:
-            self._serve_scalars,
+        DATA_PREFIX + AUDIO_ROUTE:
+            self._serve_audio,
+        DATA_PREFIX + COMPRESSED_HISTOGRAMS_ROUTE:
+            self._serve_compressed_histograms,
         DATA_PREFIX + GRAPH_ROUTE:
             self._serve_graph,
-        DATA_PREFIX + RUN_METADATA_ROUTE:
-            self._serve_run_metadata,
         DATA_PREFIX + HISTOGRAMS_ROUTE:
             self._serve_histograms,
-        DATA_PREFIX + COMPRESSED_HISTOGRAMS_ROUTE:
-            self._serve_compressed_histograms,
         DATA_PREFIX + IMAGES_ROUTE:
             self._serve_images,
-        DATA_PREFIX + INDIVIDUAL_IMAGE_ROUTE:
-            self._serve_image,
-        DATA_PREFIX + AUDIO_ROUTE:
-            self._serve_audio,
         DATA_PREFIX + INDIVIDUAL_AUDIO_ROUTE:
             self._serve_individual_audio,
+        DATA_PREFIX + INDIVIDUAL_IMAGE_ROUTE:
+            self._serve_image,
+        DATA_PREFIX + LOGDIR_ROUTE:
+            self._serve_logdir,
+        # TODO(chizeng): Delete this RPC once we have skylark rules that obviate
+        # the need for the frontend to determine which plugins are active.
+        DATA_PREFIX + PLUGINS_LISTING_ROUTE:
+            self._serve_plugins_listing,
+        DATA_PREFIX + RUN_METADATA_ROUTE:
+            self._serve_run_metadata,
         DATA_PREFIX + RUNS_ROUTE:
             self._serve_runs,
-        '/app.js':
-            self._serve_js
+        DATA_PREFIX + SCALARS_ROUTE:
+            self._serve_scalars,
     }
 
     # Serve the routes from the registered plugins using their name as the route
     # prefix. For example if plugin z has two routes /a and /b, they will be
     # served as /data/plugin/z/a and /data/plugin/z/b.
-    for name in self._plugins:
+    plugin_names_encountered = set()
+    for plugin in self._plugins:
+      if plugin.plugin_name is None:
+        raise ValueError('Plugin %s has no plugin_name' % plugin)
+      if plugin.plugin_name in plugin_names_encountered:
+        raise ValueError('Duplicate plugins for name %s' % plugin.plugin_name)
+      plugin_names_encountered.add(plugin.plugin_name)
+
       try:
-        plugin = self._plugins[name]
         plugin_apps = plugin.get_plugin_apps(self._multiplexer, self._logdir)
       except Exception as e:  # pylint: disable=broad-except
-        logging.warning('Plugin %s failed. Exception: %s', name, str(e))
+        logging.warning('Plugin %s failed. Exception: %s', plugin.plugin_name,
+                        str(e))
         continue
       for route, app in plugin_apps.items():
-        path = DATA_PREFIX + PLUGIN_PREFIX + '/' + name + route
+        path = DATA_PREFIX + PLUGIN_PREFIX + '/' + plugin.plugin_name + route
         self.data_applications[path] = app
 
   # We use underscore_names for consistency with inherited methods.
@@ -477,6 +496,21 @@ class TensorBoardWSGIApp(object):
     })
     return query_string
 
+  @wrappers.Request.application
+  def _serve_plugins_listing(self, request):
+    """Serves an object mapping plugin name to whether it is enabled.
+
+    Args:
+      request: The werkzeug.Request object.
+
+    Returns:
+      A werkzeug.Response object.
+    """
+    return http_util.Respond(
+        request,
+        {plugin.plugin_name: plugin.is_active() for plugin in self._plugins},
+        'application/json')
+
   @wrappers.Request.application
   def _serve_runs(self, request):
     """WSGI app serving a JSON object about runs and tags.
@@ -508,59 +542,9 @@ class TensorBoardWSGIApp(object):
   @wrappers.Request.application
   def _serve_index(self, request):
     """Serves the index page (i.e., the tensorboard app itself)."""
-    return self._serve_static_file(request, '/dist/index.html')
-
-  @wrappers.Request.application
-  def _serve_js(self, request):
-    """Serves the JavaScript for the index page."""
-    return self._serve_static_file(request, '/dist/app.js')
-
-  def _serve_static_file(self, request, path):
-    """Serves the static file located at the given path.
-
-    Args:
-      request: A werkzeug Request
-      path: The path of the static file, relative to the tensorboard/ directory.
-
-    Returns:
-      A werkzeug.Response application.
-    """
-    # Strip off the leading forward slash.
-    orig_path = path.lstrip('/')
-    if not self._path_is_safe(orig_path):
-      logging.warning('path not safe: %s', orig_path)
-      return http_util.Respond(request, 'Naughty naughty!', 'text/plain', 400)
-      # Resource loader wants a path relative to //WORKSPACE/tensorflow.
-    path = os.path.join('tensorboard', orig_path)
-    # Open the file and read it.
-    try:
-      contents = resource_loader.load_resource(path)
-    except IOError:
-      # For compatibility with latest version of Bazel, we renamed bower
-      # packages to use '_' rather than '-' in their package name.
-      # This means that the directory structure is changed too.
-      # So that all our recursive imports work, we need to modify incoming
-      # requests to map onto the new directory structure.
-      path = orig_path
-      components = path.split('/')
-      components[0] = components[0].replace('-', '_')
-      path = ('/').join(components)
-      # Bazel keeps all the external dependencies in //WORKSPACE/external.
-      # and resource loader wants a path relative to //WORKSPACE/tensorflow/.
-      path = os.path.join('../external', path)
-      try:
-        contents = resource_loader.load_resource(path)
-      except IOError:
-        logging.warning('path %s not found, sending 404', path)
-        return http_util.Respond(request, 'Not found', 'text/plain', code=404)
-    mimetype, content_encoding = mimetypes.guess_type(path)
-    mimetype = mimetype or 'application/octet-stream'
-    return http_util.Respond(
-        request,
-        contents,
-        mimetype,
-        expires=3600,
-        content_encoding=content_encoding)
+    contents = resource_loader.load_resource(
+        'tensorboard/components/index.html')
+    return http_util.Respond(request, contents, 'text/html', expires=3600)
 
   def __call__(self, environ, start_response):  # pylint: disable=invalid-name
     """Central entry point for the TensorBoard application.
@@ -591,8 +575,9 @@ class TensorBoardWSGIApp(object):
     elif clean_path in TAB_ROUTES:
       return self._serve_index(environ, start_response)
     else:
-      return self._serve_static_file(request, clean_path)(environ,
-                                                          start_response)
+      logging.warning('path %s not found, sending 404', clean_path)
+      return http_util.Respond(request, 'Not found', 'text/plain', code=404)(
+          environ, start_response)
     # pylint: enable=too-many-function-args
 
 
diff --git a/tensorflow/tensorboard/backend/application_test.py b/tensorflow/tensorboard/backend/application_test.py
index 234c541803e2d3593c19a07f5dbf6e643436ba6c..e8556c0c6118797a67a99d9ca9ee024ae71c2536 100644
--- a/tensorflow/tensorboard/backend/application_test.py
+++ b/tensorflow/tensorboard/backend/application_test.py
@@ -48,6 +48,44 @@ from tensorflow.python.summary.writer import writer as writer_lib
 from tensorflow.tensorboard import tensorboard
 from tensorflow.tensorboard.backend import application
 from tensorflow.tensorboard.backend.event_processing import event_multiplexer
+from tensorflow.tensorboard.plugins import base_plugin
+
+
+class FakePlugin(base_plugin.TBPlugin):
+  """A plugin with no functionality."""
+
+  def __init__(self, plugin_name, is_active_value, routes_mapping):
+    """Constructs a fake plugin.
+
+    Args:
+      plugin_name: The name of this plugin.
+      is_active_value: Whether the plugin is active.
+      routes_mapping: A dictionary mapping from route (string URL path) to the
+        method called when a user issues a request to that route.
+    """
+    self.plugin_name = plugin_name
+    self._is_active_value = is_active_value
+    self._routes_mapping = routes_mapping
+
+  def get_plugin_apps(self, multiplexer, logdir):
+    """Returns a mapping from routes to handlers offered by this plugin.
+
+    Args:
+      multiplexer: The event multiplexer.
+      logdir: The path to the directory containing logs.
+
+    Returns:
+      A dictionary mapping from routes to handlers offered by this plugin.
+    """
+    return self._routes_mapping
+
+  def is_active(self):
+    """Returns whether this plugin is active.
+
+    Returns:
+      A boolean. Whether this plugin is active.
+    """
+    return self._is_active_value
 
 
 class TensorboardServerTest(test.TestCase):
@@ -61,7 +99,10 @@ class TensorboardServerTest(test.TestCase):
     multiplexer = event_multiplexer.EventMultiplexer(
         size_guidance=application.DEFAULT_SIZE_GUIDANCE,
         purge_orphaned_data=True)
-    plugins = {}
+    plugins = [
+        FakePlugin(plugin_name='foo', is_active_value=True, routes_mapping={}),
+        FakePlugin(plugin_name='bar', is_active_value=False, routes_mapping={})
+    ]
     app = application.TensorBoardWSGIApp(
         self.temp_dir, plugins, multiplexer, reload_interval=0)
     try:
@@ -113,16 +154,17 @@ class TensorboardServerTest(test.TestCase):
     response = self._get('/asdf')
     self.assertEqual(response.status, 404)
 
-  def testDirectoryTraversal(self):
-    """Attempt a directory traversal attack."""
-    response = self._get('/..' * 30 + '/etc/passwd')
-    self.assertEqual(response.status, 400)
-
   def testLogdir(self):
     """Test the format of the data/logdir endpoint."""
     parsed_object = self._getJson('/data/logdir')
     self.assertEqual(parsed_object, {'logdir': self.temp_dir})
 
+  def testPluginsListing(self):
+    """Test the format of the data/plugins_listing endpoint."""
+    parsed_object = self._getJson('/data/plugins_listing')
+    # Plugin foo is active. Plugin bar is not.
+    self.assertEqual(parsed_object, {'foo': True, 'bar': False})
+
   def testRuns(self):
     """Test the format of the /data/runs endpoint."""
     run_json = self._getJson('/data/runs')
@@ -180,6 +222,19 @@ class TensorboardServerTest(test.TestCase):
       response.read()
       connection.close()
 
+  def testScalars(self):
+    """Test the format of /data/scalars."""
+    data = self._getJson('/data/scalars?run=run1&tag=simple_values')
+    self.assertEqual(len(data), self._SCALAR_COUNT)
+
+  def testScalarsCsv(self):
+    """Test the csv format of /data/scalars."""
+    data = self._get(
+        '/data/scalars?run=run1&tag=simple_values&format=csv').read()
+    line_count = data.count('\n')
+    self.assertEqual(line_count,
+                     self._SCALAR_COUNT + 1)  # include 1 more line for header
+
   def testHistograms(self):
     """Test the format of /data/histograms."""
     self.assertEqual(
@@ -432,10 +487,41 @@ class TensorBoardAssetsTest(test.TestCase):
   def testTagFound(self):
     tag = application.get_tensorboard_tag()
     self.assertTrue(tag)
-    app = application.standard_tensorboard_wsgi('', True, 60)
+    app = application.standard_tensorboard_wsgi('', True, 60, [])
     self.assertEqual(app.tag, tag)
 
 
+class TensorBoardPluginsTest(test.TestCase):
+
+  def testPluginsAdded(self):
+
+    def foo_handler():
+      pass
+
+    def bar_handler():
+      pass
+
+    plugins = [
+        FakePlugin(
+            plugin_name='foo',
+            is_active_value=True,
+            routes_mapping={'/foo_route': foo_handler}),
+        FakePlugin(
+            plugin_name='bar',
+            is_active_value=True,
+            routes_mapping={'/bar_route': bar_handler}),
+    ]
+
+    # The application should have added routes for both plugins.
+    app = application.standard_tensorboard_wsgi('', True, 60, plugins)
+
+    # The routes are prefixed with /data/plugin/[plugin name].
+    self.assertDictContainsSubset({
+        '/data/plugin/foo/foo_route': foo_handler,
+        '/data/plugin/bar/bar_route': bar_handler,
+    }, app.data_applications)
+
+
 class TensorboardSimpleServerConstructionTest(test.TestCase):
   """Tests that the default HTTP server is constructed without error.
 
@@ -480,5 +566,30 @@ class TensorboardSimpleServerConstructionTest(test.TestCase):
     self.assertTrue(one_passed)  # We expect either IPv4 or IPv6 to be supported
 
 
+class TensorBoardApplcationConstructionTest(test.TestCase):
+
+  def testExceptions(self):
+    logdir = '/fake/foo'
+    multiplexer = event_multiplexer.EventMultiplexer()
+
+    # Fails if there is an unnamed plugin
+    with self.assertRaises(ValueError):
+      # This plugin lacks a name.
+      plugins = [
+          FakePlugin(plugin_name=None, is_active_value=True, routes_mapping={})
+      ]
+      application.TensorBoardWSGIApp(logdir, plugins, multiplexer, 0)
+
+    # Fails if there are two plugins with same name
+    with self.assertRaises(ValueError):
+      plugins = [
+          FakePlugin(
+              plugin_name='foo', is_active_value=True, routes_mapping={}),
+          FakePlugin(
+              plugin_name='foo', is_active_value=True, routes_mapping={}),
+      ]
+      application.TensorBoardWSGIApp(logdir, plugins, multiplexer, 0)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator.py
index beba28da060db12b0a10252f7f4093b5c6a73a60..d5a91bbb6a26f5b8b1f6a3c91559c1f4c0f193f5 100644
--- a/tensorflow/tensorboard/backend/event_processing/event_accumulator.py
+++ b/tensorflow/tensorboard/backend/event_processing/event_accumulator.py
@@ -438,6 +438,14 @@ class EventAccumulator(object):
     """
     return self._health_pills.Items(node_name)
 
+  def GetOpsWithHealthPills(self):
+    """Determines which ops have at least 1 health pill event.
+
+    Returns:
+      A list of names of ops with at least 1 health pill event.
+    """
+    return self._health_pills.Keys()
+
   def Graph(self):
     """Return the graph definition, if there is one.
 
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
index 38a8cd915fe46e5f615428b7eb23c16891d0583b..f7d424cb912fcacb32899b97ecbfb2676307e8ef 100644
--- a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
@@ -225,6 +225,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertTagsEqual(x.Tags(), {})
 
   def testTags(self):
+    """Tags should be found in EventAccumulator after adding some events."""
     gen = _EventGenerator(self)
     gen.AddScalar('s1')
     gen.AddScalar('s2')
@@ -245,6 +246,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     })
 
   def testReload(self):
+    """EventAccumulator contains suitable tags after calling Reload."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     acc.Reload()
@@ -267,6 +269,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     })
 
   def testScalars(self):
+    """Tests whether EventAccumulator contains scalars after adding them."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     s1 = ea.ScalarEvent(wall_time=1, step=10, value=32)
@@ -293,12 +296,11 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
       self.assertEqual(expected_value, gotten_event.value[i])
 
   def testHealthPills(self):
+    """HealthPills should be properly inserted into EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     gen.AddHealthPill(13371337, 41, 'Add', 0, range(1, 13))
     gen.AddHealthPill(13381338, 42, 'Add', 1, range(42, 54))
-
-    acc = ea.EventAccumulator(gen)
     acc.Reload()
 
     # Retrieve the health pills for each node name.
@@ -321,7 +323,16 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
             value=range(42, 54)),
         gotten_events[1])
 
+  def testGetOpsWithHealthPills(self):
+    gen = _EventGenerator(self)
+    acc = ea.EventAccumulator(gen)
+    gen.AddHealthPill(13371337, 41, 'Add', 0, range(1, 13))
+    gen.AddHealthPill(13381338, 42, 'MatMul', 1, range(42, 54))
+    acc.Reload()
+    self.assertItemsEqual(['Add', 'MatMul'], acc.GetOpsWithHealthPills())
+
   def testHistograms(self):
+    """Tests whether histograms are inserted into EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
 
@@ -371,6 +382,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertEqual(acc.Histograms('hst2'), [hst2])
 
   def testCompressedHistograms(self):
+    """Tests compressed histograms inserted into EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen, compression_bps=(0, 2500, 5000, 7500, 10000))
 
@@ -422,6 +434,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertEqual(acc.CompressedHistograms('hst2'), [expected_cmphst2])
 
   def testCompressedHistogramsWithEmptyHistogram(self):
+    """Tests that empty histograms compressed properly in EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen, compression_bps=(0, 2500, 5000, 7500, 10000))
 
@@ -475,6 +488,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertAlmostEqual(vals[8].value, 1.0)
 
   def testImages(self):
+    """Tests 2 images inserted/accessed in EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     im1 = ea.ImageEvent(
@@ -508,6 +522,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertEqual(acc.Images('im2'), [im2])
 
   def testAudio(self):
+    """Tests 2 audio events inserted/accessed in EventAccumulator."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     snd1 = ea.AudioEvent(
@@ -545,6 +560,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
     self.assertEqual(acc.Audio('snd2'), [snd2])
 
   def testKeyError(self):
+    """KeyError should be raised when accessing non-existing keys."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     acc.Reload()
@@ -568,7 +584,7 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
       acc.Audio('hst1')
 
   def testNonValueEvents(self):
-    """Tests that non-value events in the generator don't cause early exits."""
+    """Non-value events in the generator don't cause early exits."""
     gen = _EventGenerator(self)
     acc = ea.EventAccumulator(gen)
     gen.AddScalar('s1', wall_time=1, step=10, value=20)
diff --git a/tensorflow/tensorboard/backend/event_processing/event_multiplexer.py b/tensorflow/tensorboard/backend/event_processing/event_multiplexer.py
index bbf958820a097bf1de2951166556c051056e9b7d..08e6dbb57d60239cf5dce8b19a78fa8770c40697 100644
--- a/tensorflow/tensorboard/backend/event_processing/event_multiplexer.py
+++ b/tensorflow/tensorboard/backend/event_processing/event_multiplexer.py
@@ -287,6 +287,21 @@ class EventMultiplexer(object):
     accumulator = self._GetAccumulator(run)
     return accumulator.HealthPills(node_name)
 
+  def GetOpsWithHealthPills(self, run):
+    """Determines which ops have at least 1 health pill event for a given run.
+
+    Args:
+      run: The name of the run.
+
+    Raises:
+      KeyError: If the run is not found, or the node name is not available for
+        the given run.
+
+    Returns:
+      The list of names of ops with health pill events.
+    """
+    return self._GetAccumulator(run).GetOpsWithHealthPills()
+
   def Graph(self, run):
     """Retrieve the graph associated with the provided run.
 
diff --git a/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py b/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py
index ed5cac4014f4c91dafcd76f34d99a5eeb0eb06c0..a97f39e87f85f04123f5f2ddddab7af84b960a52 100644
--- a/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/event_multiplexer_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 import os.path
 import shutil
@@ -45,10 +46,16 @@ def _CreateCleanDirectory(path):
 
 class _FakeAccumulator(object):
 
-  def __init__(self, path):
+  def __init__(self, path, health_pill_mapping=None):
+    """Constructs a fake accumulator with some fake events.
+
+    Args:
+      path: The path for the run that this accumulator is for.
+      health_pill_mapping: An optional mapping from Op to health pill strings.
+    """
     self._path = path
     self.reload_called = False
-    self._node_names_to_health_pills = {'Add': ['hp1', 'hp2']}
+    self._node_names_to_health_pills = health_pill_mapping or {}
 
   def Tags(self):
     return {event_accumulator.IMAGES: ['im1', 'im2'],
@@ -74,6 +81,9 @@ class _FakeAccumulator(object):
     health_pills = self._node_names_to_health_pills[node_name]
     return [self._path + '/' + health_pill for health_pill in health_pills]
 
+  def GetOpsWithHealthPills(self):
+    return self._node_names_to_health_pills.keys()
+
   def Histograms(self, tag_name):
     return self._TagHelper(tag_name, event_accumulator.HISTOGRAMS)
 
@@ -93,14 +103,13 @@ class _FakeAccumulator(object):
     self.reload_called = True
 
 
-# pylint: disable=unused-argument
-def _GetFakeAccumulator(
-    path,
-    size_guidance=None,
-    compression_bps=None,
-    purge_orphaned_data=None):
-  return _FakeAccumulator(path)
-# pylint: enable=unused-argument
+def _GetFakeAccumulator(path,
+                        size_guidance=None,
+                        compression_bps=None,
+                        purge_orphaned_data=None,
+                        health_pill_mapping=None):
+  del size_guidance, compression_bps, purge_orphaned_data  # Unused.
+  return _FakeAccumulator(path, health_pill_mapping=health_pill_mapping)
 
 
 class EventMultiplexerTest(test_util.TensorFlowTestCase):
@@ -115,16 +124,19 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.stubs.CleanUp()
 
   def testEmptyLoader(self):
+    """Tests empty EventMultiplexer creation."""
     x = event_multiplexer.EventMultiplexer()
     self.assertEqual(x.Runs(), {})
 
   def testRunNamesRespected(self):
+    """Tests two EventAccumulators inserted/accessed in EventMultiplexer."""
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
     self.assertItemsEqual(sorted(x.Runs().keys()), ['run1', 'run2'])
     self.assertEqual(x._GetAccumulator('run1')._path, 'path1')
     self.assertEqual(x._GetAccumulator('run2')._path, 'path2')
 
   def testReload(self):
+    """EventAccumulators should Reload after EventMultiplexer call it."""
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
     self.assertFalse(x._GetAccumulator('run1').reload_called)
     self.assertFalse(x._GetAccumulator('run2').reload_called)
@@ -133,6 +145,7 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.assertTrue(x._GetAccumulator('run2').reload_called)
 
   def testScalars(self):
+    """Tests Scalars function returns suitable values."""
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
 
     run1_actual = x.Scalars('run1', 'sv1')
@@ -141,15 +154,36 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.assertEqual(run1_expected, run1_actual)
 
   def testHealthPills(self):
+    """Tests HealthPills() returns events associated with run1/Add."""
+    self.stubs.Set(event_accumulator, 'EventAccumulator',
+                   functools.partial(
+                       _GetFakeAccumulator,
+                       health_pill_mapping={'Add': ['hp1', 'hp2']}))
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
     self.assertEqual(['path1/hp1', 'path1/hp2'], x.HealthPills('run1', 'Add'))
 
+  def testGetOpsWithHealthPillsWhenHealthPillsAreNotAvailable(self):
+    # The event accumulator lacks health pills for the run.
+    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
+    self.assertItemsEqual([], x.GetOpsWithHealthPills('run1'))
+
+  def testGetOpsWithHealthPillsWhenHealthPillsAreAvailable(self):
+    # The event accumulator has health pills for the run.
+    self.stubs.Set(event_accumulator, 'EventAccumulator',
+                   functools.partial(
+                       _GetFakeAccumulator,
+                       health_pill_mapping={'Add': ['hp1', 'hp2']}))
+    x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
+    self.assertItemsEqual(['Add'], x.GetOpsWithHealthPills('run1'))
+
   def testExceptions(self):
+    """KeyError should be raised when accessing non-existing keys."""
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
     with self.assertRaises(KeyError):
       x.Scalars('sv1', 'xxx')
 
   def testInitialization(self):
+    """Tests EventMultiplexer is created properly with its params."""
     x = event_multiplexer.EventMultiplexer()
     self.assertEqual(x.Runs(), {})
     x = event_multiplexer.EventMultiplexer({'run1': 'path1', 'run2': 'path2'})
@@ -158,6 +192,14 @@ class EventMultiplexerTest(test_util.TensorFlowTestCase):
     self.assertEqual(x._GetAccumulator('run2')._path, 'path2')
 
   def testAddRunsFromDirectory(self):
+    """Tests AddRunsFromDirectory function.
+
+    Tests the following scenarios:
+    - When the directory does not exist.
+    - When the directory is empty.
+    - When the directory has empty subdirectory.
+    - Contains proper EventAccumulators after adding events.
+    """
     x = event_multiplexer.EventMultiplexer()
     tmpdir = self.get_temp_dir()
     join = os.path.join
diff --git a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
index b9b951fd127875041a6a671ec81c918d83cc7321..34bfd34195fc9185e27dc1a524c99c2773c068da 100644
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
+++ b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util.py
@@ -30,6 +30,11 @@ def _IsDirectory(parent, item):
   return gfile.IsDirectory(os.path.join(parent, item))
 
 
+def PluginDirectory(logdir, plugin_name):
+  """Returns the plugin directory for plugin_name."""
+  return os.path.join(logdir, _PLUGINS_DIR, plugin_name)
+
+
 def ListPlugins(logdir):
   """List all the plugins that have registered assets in logdir.
 
@@ -61,7 +66,7 @@ def ListAssets(logdir, plugin_name):
     not exist (either because the logdir doesn't exist, or because the plugin
     didn't register) an empty list is returned.
   """
-  plugin_dir = os.path.join(logdir, _PLUGINS_DIR, plugin_name)
+  plugin_dir = PluginDirectory(logdir, plugin_name)
   if not gfile.IsDirectory(plugin_dir):
     return []
   entries = gfile.ListDirectory(plugin_dir)
@@ -83,7 +88,7 @@ def RetrieveAsset(logdir, plugin_name, asset_name):
     KeyError: if the asset does not exist.
   """
 
-  asset_path = os.path.join(logdir, _PLUGINS_DIR, plugin_name, asset_name)
+  asset_path = os.path.join(PluginDirectory(logdir, plugin_name), asset_name)
   try:
     with gfile.Open(asset_path, "r") as f:
       return f.read()
diff --git a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
index e74a0642d6efecfc31e92ca1ca0c473ba1bae6aa..cfc6857777c9d48ff98e87a4503f7736034b71e4 100644
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
@@ -50,6 +50,11 @@ class PluginGamma(GenericContentPlugin):
 
 class PluginAssetUtilitiesTest(test.TestCase):
 
+  def testGetPluginDirectory(self):
+    self.assertEqual(
+        os.path.join("logdir", "plugins", "x"),
+        plugin_asset_util.PluginDirectory("logdir", "x"))
+
   def testNonExistentDirectory(self):
     tempdir = self.get_temp_dir()
     fake_dir = os.path.join(tempdir, "nonexistent_dir")
diff --git a/tensorflow/tensorboard/bower.json b/tensorflow/tensorboard/bower.json
deleted file mode 100644
index 0a0fac45fd6afbe4628b6eae6d90c2c3fb8aef92..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/bower.json
+++ /dev/null
@@ -1,187 +0,0 @@
-{
-  "__autoadded_transitive_dep__": [
-    "font-roboto",
-    "iron-a11y-announcer",
-    "iron-a11y-keys-behavior",
-    "iron-autogrow-textarea",
-    "iron-checked-element-behavior",
-    "iron-dropdown",
-    "iron-fit-behavior",
-    "iron-flex-layout",
-    "iron-form-element-behavior",
-    "iron-icon",
-    "iron-iconset-svg",
-    "iron-input",
-    "iron-menu-behavior",
-    "iron-meta",
-    "iron-overlay-behavior",
-    "iron-range-behavior",
-    "iron-resizable-behavior",
-    "iron-scroll-target-behavior",
-    "iron-validatable-behavior",
-    "neon-animation",
-    "paper-dialog-behavior",
-    "paper-material",
-    "paper-menu-button",
-    "paper-ripple",
-    "promise-polyfill",
-    "web-animations-js",
-    "webcomponentsjs"
-  ],
-  "authors": [
-    "Google"
-  ],
-  "dependencies": {
-    "d3": "3.5.15",
-    "dagre": "0.7.4",
-    "es6-promise": "2.1.0",
-    "font-roboto": "PolymerElements/font-roboto#1.0.1",
-    "graphlib": "1.0.7",
-    "iron-a11y-announcer": "PolymerElements/iron-a11y-announcer#1.0.5",
-    "iron-a11y-keys-behavior": "PolymerElements/iron-a11y-keys-behavior#1.1.8",
-    "iron-ajax": "PolymerElements/iron-ajax#1.2.0",
-    "iron-autogrow-textarea": "PolymerElements/iron-autogrow-textarea#1.0.12",
-    "iron-behaviors": "PolymerElements/iron-behaviors#1.0.17",
-    "iron-checked-element-behavior": "PolymerElements/iron-checked-element-behavior#1.0.4",
-    "iron-collapse": "PolymerElements/iron-collapse#1.0.8",
-    "iron-dropdown": "PolymerElements/iron-dropdown#1.4.0",
-    "iron-fit-behavior": "PolymerElements/iron-fit-behavior#1.2.5",
-    "iron-flex-layout": "PolymerElements/iron-flex-layout#1.3.0",
-    "iron-form-element-behavior": "PolymerElements/iron-form-element-behavior#1.0.6",
-    "iron-icon": "PolymerElements/iron-icon#1.0.11",
-    "iron-icons": "PolymerElements/iron-icons#1.1.3",
-    "iron-iconset-svg": "PolymerElements/iron-iconset-svg#1.1.0",
-    "iron-input": "PolymerElements/iron-input#1.0.10",
-    "iron-list": "PolymerElements/iron-list#1.3.9",
-    "iron-menu-behavior": "PolymerElements/iron-menu-behavior#1.1.10",
-    "iron-meta": "PolymerElements/iron-meta#1.1.1",
-    "iron-overlay-behavior": "PolymerElements/iron-overlay-behavior#1.10.1",
-    "iron-range-behavior": "PolymerElements/iron-range-behavior#1.0.4",
-    "iron-resizable-behavior": "PolymerElements/iron-resizable-behavior#1.0.3",
-    "iron-scroll-target-behavior": "PolymerElements/iron-scroll-target-behavior#1.0.3",
-    "iron-selector": "PolymerElements/iron-selector#1.5.2",
-    "iron-validatable-behavior": "PolymerElements/iron-validatable-behavior#1.1.1",
-    "lodash": "3.8.0",
-    "neon-animation": "PolymerElements/neon-animation#1.2.2",
-    "numericjs": "1.2.6",
-    "paper-behaviors": "PolymerElements/paper-behaviors#1.0.12",
-    "paper-button": "PolymerElements/paper-button#1.0.11",
-    "paper-checkbox": "PolymerElements/paper-checkbox#1.4.0",
-    "paper-dialog": "PolymerElements/paper-dialog#1.0.4",
-    "paper-dialog-behavior": "PolymerElements/paper-dialog-behavior#1.2.5",
-    "paper-dialog-scrollable": "PolymerElements/paper-dialog-scrollable#1.1.5",
-    "paper-dropdown-menu": "PolymerElements/paper-dropdown-menu#1.4.0",
-    "paper-header-panel": "PolymerElements/paper-header-panel#1.1.4",
-    "paper-icon-button": "PolymerElements/paper-icon-button#1.1.3",
-    "paper-input": "PolymerElements/paper-input#1.1.18",
-    "paper-item": "PolymerElements/paper-item#1.1.4",
-    "paper-listbox": "PolymerElements/paper-listbox#1.1.2",
-    "paper-material": "PolymerElements/paper-material#1.0.6",
-    "paper-menu": "PolymerElements/paper-menu#1.2.2",
-    "paper-menu-button": "PolymerElements/paper-menu-button#1.5.1",
-    "paper-progress": "PolymerElements/paper-progress#1.0.9",
-    "paper-radio-button": "PolymerElements/paper-radio-button#1.1.2",
-    "paper-radio-group": "PolymerElements/paper-radio-group#1.0.9",
-    "paper-ripple": "PolymerElements/paper-ripple#1.0.5",
-    "paper-slider": "PolymerElements/paper-slider#1.0.10",
-    "paper-spinner": "PolymerElements/paper-spinner#1.1.1",
-    "paper-styles": "PolymerElements/paper-styles#1.1.4",
-    "paper-tabs": "PolymerElements/paper-tabs#1.7.0",
-    "paper-toast": "PolymerElements/paper-toast#1.3.0",
-    "paper-toggle-button": "PolymerElements/paper-toggle-button#1.2.0",
-    "paper-toolbar": "PolymerElements/paper-toolbar#1.1.4",
-    "paper-tooltip": "PolymerElements/paper-tooltip#1.1.2",
-    "plottable": "1.16.1",
-    "polymer": "1.7.0",
-    "promise-polyfill": "polymerlabs/promise-polyfill#1.0.0",
-    "three.js": "threejs#r77",
-    "web-animations-js": "web-animations/web-animations-js#2.2.1",
-    "webcomponentsjs": "webcomponents/webcomponentsjs#0.7.22",
-    "weblas": "0.9.0"
-  },
-  "description": "TensorBoard: Visualizations for TensorFlow",
-  "devDependencies": {
-    "iron-component-page": "PolymerElements/iron-component-page#^1.1.4",
-    "iron-demo-helpers": "PolymerElements/iron-demo-helpers#^1.2.3",
-    "web-component-tester": "Polymer/web-component-tester"
-  },
-  "ignore": [
-    "**/.*",
-    "node_modules",
-    "bower_components",
-    "test",
-    "tests"
-  ],
-  "license": "Apache-2.0",
-  "name": "tensorboard",
-  "private": true,
-  "resolutions": {
-    "d3": "3.5.15",
-    "dagre": "0.7.4",
-    "es6-promise": "2.1.0",
-    "font-roboto": "1.0.1",
-    "graphlib": "1.0.7",
-    "iron-a11y-announcer": "1.0.5",
-    "iron-a11y-keys-behavior": "1.1.8",
-    "iron-ajax": "1.2.0",
-    "iron-autogrow-textarea": "1.0.12",
-    "iron-behaviors": "1.0.17",
-    "iron-checked-element-behavior": "1.0.4",
-    "iron-collapse": "1.0.8",
-    "iron-dropdown": "1.4.0",
-    "iron-fit-behavior": "1.2.5",
-    "iron-flex-layout": "1.3.0",
-    "iron-form-element-behavior": "1.0.6",
-    "iron-icon": "1.0.11",
-    "iron-icons": "1.1.3",
-    "iron-iconset-svg": "1.1.0",
-    "iron-input": "1.0.10",
-    "iron-list": "1.3.9",
-    "iron-menu-behavior": "1.1.10",
-    "iron-meta": "1.1.1",
-    "iron-overlay-behavior": "1.10.1",
-    "iron-range-behavior": "1.0.4",
-    "iron-resizable-behavior": "1.0.3",
-    "iron-scroll-target-behavior": "1.0.3",
-    "iron-selector": "1.5.2",
-    "iron-validatable-behavior": "1.1.1",
-    "lodash": "3.8.0",
-    "neon-animation": "1.2.2",
-    "numericjs": "1.2.6",
-    "paper-behaviors": "1.0.12",
-    "paper-button": "1.0.11",
-    "paper-checkbox": "1.4.0",
-    "paper-dialog": "1.0.4",
-    "paper-dialog-behavior": "1.2.5",
-    "paper-dialog-scrollable": "1.1.5",
-    "paper-dropdown-menu": "1.4.0",
-    "paper-header-panel": "1.1.4",
-    "paper-icon-button": "1.1.3",
-    "paper-input": "1.1.18",
-    "paper-item": "1.1.4",
-    "paper-listbox": "1.1.2",
-    "paper-material": "1.0.6",
-    "paper-menu": "1.2.2",
-    "paper-menu-button": "1.5.1",
-    "paper-progress": "1.0.9",
-    "paper-radio-button": "1.1.2",
-    "paper-radio-group": "1.0.9",
-    "paper-ripple": "1.0.5",
-    "paper-slider": "1.0.10",
-    "paper-spinner": "1.1.1",
-    "paper-styles": "1.1.4",
-    "paper-tabs": "1.7.0",
-    "paper-toast": "1.3.0",
-    "paper-toggle-button": "1.2.0",
-    "paper-toolbar": "1.1.4",
-    "paper-tooltip": "1.1.2",
-    "plottable": "1.16.1",
-    "polymer": "1.7.0",
-    "promise-polyfill": "1.0.0",
-    "three.js": "threejs#r77",
-    "web-animations-js": "2.2.1",
-    "webcomponentsjs": "0.7.22",
-    "weblas": "0.9.0"
-  },
-  "version": "0.0.0"
-}
diff --git a/tensorflow/tensorboard/bower/BUILD b/tensorflow/tensorboard/bower/BUILD
deleted file mode 100644
index 2c2921d9880e3b1d2a9a413474cdb00d2249ce8d..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/bower/BUILD
+++ /dev/null
@@ -1,76 +0,0 @@
-# AUTOGENERATED FILE by tensorboard_bower_dependency_sync.py
-
-package(default_visibility = ["//visibility:public"])
-
-filegroup(
-    name = "bower",
-    srcs = [
-        "@d3//:d3",
-        "@dagre//:dagre",
-        "@es6_promise//:es6_promise",
-        "@font_roboto//:font_roboto",
-        "@graphlib//:graphlib",
-        "@iron_a11y_announcer//:iron_a11y_announcer",
-        "@iron_a11y_keys_behavior//:iron_a11y_keys_behavior",
-        "@iron_ajax//:iron_ajax",
-        "@iron_autogrow_textarea//:iron_autogrow_textarea",
-        "@iron_behaviors//:iron_behaviors",
-        "@iron_checked_element_behavior//:iron_checked_element_behavior",
-        "@iron_collapse//:iron_collapse",
-        "@iron_dropdown//:iron_dropdown",
-        "@iron_fit_behavior//:iron_fit_behavior",
-        "@iron_flex_layout//:iron_flex_layout",
-        "@iron_form_element_behavior//:iron_form_element_behavior",
-        "@iron_icon//:iron_icon",
-        "@iron_icons//:iron_icons",
-        "@iron_iconset_svg//:iron_iconset_svg",
-        "@iron_input//:iron_input",
-        "@iron_list//:iron_list",
-        "@iron_menu_behavior//:iron_menu_behavior",
-        "@iron_meta//:iron_meta",
-        "@iron_overlay_behavior//:iron_overlay_behavior",
-        "@iron_range_behavior//:iron_range_behavior",
-        "@iron_resizable_behavior//:iron_resizable_behavior",
-        "@iron_scroll_target_behavior//:iron_scroll_target_behavior",
-        "@iron_selector//:iron_selector",
-        "@iron_validatable_behavior//:iron_validatable_behavior",
-        "@lodash//:lodash",
-        "@neon_animation//:neon_animation",
-        "@numericjs_numeric_min_js//file",
-        "@paper_behaviors//:paper_behaviors",
-        "@paper_button//:paper_button",
-        "@paper_checkbox//:paper_checkbox",
-        "@paper_dialog//:paper_dialog",
-        "@paper_dialog_behavior//:paper_dialog_behavior",
-        "@paper_dialog_scrollable//:paper_dialog_scrollable",
-        "@paper_dropdown_menu//:paper_dropdown_menu",
-        "@paper_header_panel//:paper_header_panel",
-        "@paper_icon_button//:paper_icon_button",
-        "@paper_input//:paper_input",
-        "@paper_item//:paper_item",
-        "@paper_listbox//:paper_listbox",
-        "@paper_material//:paper_material",
-        "@paper_menu//:paper_menu",
-        "@paper_menu_button//:paper_menu_button",
-        "@paper_progress//:paper_progress",
-        "@paper_radio_button//:paper_radio_button",
-        "@paper_radio_group//:paper_radio_group",
-        "@paper_ripple//:paper_ripple",
-        "@paper_slider//:paper_slider",
-        "@paper_spinner//:paper_spinner",
-        "@paper_styles//:paper_styles",
-        "@paper_tabs//:paper_tabs",
-        "@paper_toast//:paper_toast",
-        "@paper_toggle_button//:paper_toggle_button",
-        "@paper_toolbar//:paper_toolbar",
-        "@paper_tooltip//:paper_tooltip",
-        "@plottable//:plottable",
-        "@polymer//:polymer",
-        "@promise_polyfill//:promise_polyfill",
-        "@three_js_orbitcontrols_js//file",
-        "@three_js_three_min_js//file",
-        "@web_animations_js//:web_animations_js",
-        "@webcomponentsjs//:webcomponentsjs",
-        "@weblas_weblas_js//file",
-    ],
-)
diff --git a/tensorflow/tensorboard/components/BUILD b/tensorflow/tensorboard/components/BUILD
index 301425d96d082324afd2f871874ec6397d75ad0f..2d2c2d2d7b0e711c3f32d0bdcf2085240f6abcd9 100644
--- a/tensorflow/tensorboard/components/BUILD
+++ b/tensorflow/tensorboard/components/BUILD
@@ -1,23 +1,32 @@
 package(default_visibility = ["//tensorflow:internal"])
 
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:vulcanize.bzl", "tensorboard_html_binary")
+
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
+web_library(
+    name = "tensorboard",
+    srcs = [
+        "analytics.html",
+        "tensorboard.html",
+    ],
+    path = "/",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_tensorboard_d3v4",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_html_binary(
+    name = "index",
+    input_path = "/tensorboard.html",
+    output_path = "/index.html",
+    deps = [":tensorboard"],
+)
 
 filegroup(
     name = "all_files",
-    srcs = glob(
-        [
-            "tf_*/**/*",
-            "vz_*/**/*",
-        ],
-        exclude = [
-            "**/tf_model_zoo/*",
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ) + [
-        "BUILD",
-    ],
-    visibility = ["//tensorflow:__subpackages__"],
+    srcs = glob(["**"]),
+    tags = ["notsan"],
 )
diff --git a/tensorflow/tensorboard/components/tf_imports_google/plottable.html b/tensorflow/tensorboard/components/analytics.html
similarity index 86%
rename from tensorflow/tensorboard/components/tf_imports_google/plottable.html
rename to tensorflow/tensorboard/components/analytics.html
index 6f9678f9cb21093b2ebc83ba261b56950c254e60..d319f576fc1e58296f52e006e7dfc6dda9d191b4 100644
--- a/tensorflow/tensorboard/components/tf_imports_google/plottable.html
+++ b/tensorflow/tensorboard/components/analytics.html
@@ -15,5 +15,4 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<link rel="import" href="d3.html">
-<link rel="import" href="../plottable-library/plottable.html">
+<!-- TODO(jart): Give users the ability to opt-in to analytics. -->
diff --git a/tensorflow/tensorboard/components/index.html b/tensorflow/tensorboard/components/index.html
deleted file mode 100644
index c790a76f75372e59ef5c83c37d3e7dd9c07b656c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/index.html
+++ /dev/null
@@ -1,39 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <meta charset="utf-8">
-    <script src="webcomponentsjs/webcomponents-lite.min.js"></script>
-    <style>
-      html, body {
-        margin: 0;
-        padding: 0;
-        height: 100%;
-        font-family: "RobotoDraft","Roboto",sans-serif;
-      }
-    </style>
-    <link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
-    <link rel="import" href="tf-tensorboard/tf-tensorboard.html">
-    <title>TensorBoard</title>
-  </head>
-  <body>
-    <tf-tensorboard use-hash></tf-tensorboard>
-    <script src="../app/analytics.js"></script>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tensorboard.html b/tensorflow/tensorboard/components/tensorboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..0652902cfac8adba652eba6bf15b20e13e270e19
--- /dev/null
+++ b/tensorflow/tensorboard/components/tensorboard.html
@@ -0,0 +1,27 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<title>TensorBoard</title>
+<link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
+<script src="webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="tf-tensorboard/style.html">
+<link rel="import" href="tf-tensorboard/tf-tensorboard.html">
+<link rel="import" href="analytics.html">
+<body>
+<tf-tensorboard use-hash></tf-tensorboard>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD
deleted file mode 100644
index aebf9152b139cfeeed94122da95c75c8ade55d1f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/BUILD
+++ /dev/null
@@ -1,61 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "tf_audio_dashboard",
-    srcs = [
-        "tf-audio-dashboard.html",
-        "tf-audio-grid.html",
-        "tf-audio-loader.html",
-    ],
-    path = "/tf-audio-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "@org_polymer",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "tf-audio-dashboard.html",
-        "tf-audio-grid.html",
-        "tf-audio-loader.html",
-    ],
-    destdir = "tf-audio-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//tensorflow/tensorboard/components/tf_backend:legacy",
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
-        "//third_party/javascript/polymer/v1/paper-styles:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [
-    ],
-    deps = ["//tensorflow/tensorboard/components:common_deps"],
-)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard/demo/BUILD
deleted file mode 100644
index 383ea8d1b65208ec897e14a10100cfb7c4033cb7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_audio_dashboard/demo
-webfiles(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-audio-dashboard/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_audio_dashboard",
-        "//tensorflow/tensorboard/components/tf_audio_dashboard/demo/data",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-        "@org_polymer_webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/BUILD
deleted file mode 100644
index c3824a923dd9f87437cf0e5f10b84f47230eee37..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "data",
-    srcs = glob(["*"]),
-    path = "/tf-audio-dashboard/demo/data",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/test/data/runs.json b/tensorflow/tensorboard/components/tf_audio_dashboard/test/data/runs.json
deleted file mode 100644
index 478f5ed833715f3f6300d5a1d81bef1ddc6ca69a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/test/data/runs.json
+++ /dev/null
@@ -1 +0,0 @@
-{"run1": {"audio": ["foo", "bar"]}, "run2": {"audio": ["bar", "zod"]}}
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c73207475461f06843392b6e3545434013c564a7
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/BUILD
@@ -0,0 +1,79 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_audio_dashboard_d3v4",
+    srcs = [
+        "tf-audio-dashboard.html",
+        "tf-audio-grid.html",
+        "tf-audio-loader.html",
+    ],
+    path = "/tf-audio-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-audio-dashboard/demo",
+    deps = [
+        ":tf_audio_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+web_library(
+    name = "test",
+    testonly = 1,
+    srcs = [
+        "audioDashboardTests.js",
+        "tests.html",
+    ] + glob(["data/**"]),
+    path = "/tf-audio-dashboard/test",
+    deps = [
+        ":tf_audio_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    testonly = 1,
+    srcs = ["audioDashboardTests.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "@org_definitelytyped//:sinon.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4:bundle.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/test/audioDashboardTests.ts b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/audioDashboardTests.ts
similarity index 93%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/test/audioDashboardTests.ts
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/audioDashboardTests.ts
index a8daed7dd4291b12c1e6d5db47116e31f852637c..f2bf68eb8de5fa9bf66e96e4d710f959aaaecaff 100644
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/test/audioDashboardTests.ts
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/audioDashboardTests.ts
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-declare function stub(el: string, obj: any): void;
-declare function fixture(id: string): void;
+
+// TODO(dandelion): Fix me.
+declare function fixture(id: string): any;
+declare function stub(x, y: any): void;
 
 describe(
     'audio dashboard tests', function() {
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/audio_run_run1_tag_au1_2Faudio_2F0.json b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run1_tag_au1_2Faudio_2F0.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/audio_run_run1_tag_au1_2Faudio_2F0.json
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run1_tag_au1_2Faudio_2F0.json
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/audio_run_run2_tag_au2_2Faudio_2F0.json b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run2_tag_au2_2Faudio_2F0.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/audio_run_run2_tag_au2_2Faudio_2F0.json
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/audio_run_run2_tag_au2_2Faudio_2F0.json
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav
similarity index 100%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au1_2Faudio_2F0_run_run1.wav
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav
similarity index 100%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_0_tag_au2_2Faudio_2F0_run_run2.wav
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/individualAudio_index_1_tag_au2_2Faudio_2F0_run_run2.wav b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_1_tag_au2_2Faudio_2F0_run_run2.wav
similarity index 100%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/individualAudio_index_1_tag_au2_2Faudio_2F0_run_run2.wav
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_1_tag_au2_2Faudio_2F0_run_run2.wav
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/individualAudio_index_2_tag_au2_2Faudio_2F0_run_run2.wav b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_2_tag_au2_2Faudio_2F0_run_run2.wav
similarity index 100%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/individualAudio_index_2_tag_au2_2Faudio_2F0_run_run2.wav
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/individualAudio_index_2_tag_au2_2Faudio_2F0_run_run2.wav
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/runs.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/demo/data/runs.json
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/data/runs.json
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/index.html
similarity index 96%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/demo/index.html
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/index.html
index f0a79b573ced88464e9f4aa2472e4811a156c387..8e0587084df2fef61a23848810b7efc246958894 100644
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/index.html
@@ -18,8 +18,8 @@ limitations under the License.
 
 <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
 <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../../paper-styles/typography.html">
 <link rel="import" href="../tf-audio-dashboard.html">
-    <link rel="import" href="../../paper-styles/typography.html">
 
 <title>Audio Dashboard Demo</title>
 <style>
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/index.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tests.html
similarity index 52%
rename from tensorflow/tensorboard/components/vz_histogram_timeseries/index.html
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tests.html
index 20a157ea0e94065a9069aa3a191ae8aa14288fce..891e8bf0c29f5cca7a4654b49dde81997c6d27d5 100644
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries/index.html
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tests.html
@@ -16,15 +16,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<html>
-  <head>
-    <title>vz-histogram-timeseries</title>
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <script src="../webcomponentsjs/webcomponents-lite.js"></script>
-    <link rel="import" href="../iron-component-page/iron-component-page.html">
-  </head>
-  <body>
-    <iron-component-page src="vz-histogram-timeseries.html"></iron-component-page>
-  </body>
-</html>
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<script src="../../web-component-tester/browser.js"></script>
+<link rel="import" href="../../tf-imports/d3.html">
+<link rel="import" href="../tf-audio-dashboard.html">
+<style>
+  html, body {
+    margin: 0;
+    padding: 0;
+    height: 100%;
+    font-family: "RobotoDraft","Roboto",sans-serif;
+  }
+</style>
+
+<test-fixture id="testElementFixture">
+  <template>
+    <tf-audio-dashboard></tf-audio-dashboard>
+  </template>
+</test-fixture>
+
+<script src="audioDashboardTests.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-dashboard.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-dashboard.html
similarity index 91%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-dashboard.html
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-dashboard.html
index ad879210d6f95c616f8841c05e990799de8942ed..0353c51628d934b4490e2657670286d4e00beb55 100644
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-dashboard.html
@@ -59,14 +59,18 @@ tf-audio-dashboard displays a dashboard that loads audio from a TensorFlow run.
     </style>
   </template>
   <script>
-    Polymer({
+    TF.Dashboard.TfAudioDashboard = Polymer({
       is: "tf-audio-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
       properties: {
         dataType: {value: "audio"},
       },
       behaviors: [
+        TF.Dashboard.DashboardBehavior("audio"),
         TF.Dashboard.ReloadBehavior("tf-audio-loader"),
-        TF.Backend.Behavior
+        TF.Backend.BackendBehavior
       ],
       attached: function() {
         this.async(function() {
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-grid.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-grid.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-grid.html
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-grid.html
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-loader.html
similarity index 99%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html
rename to tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-loader.html
index ed3b5efa07e0a27d1078d4f35aba9b0445a1daaa..71539537d0e55efcc6c1e07ed76f79ec5699ecf4 100644
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/tf-audio-loader.html
+++ b/tensorflow/tensorboard/components/tf_audio_dashboard_d3v4/tf-audio-loader.html
@@ -107,6 +107,8 @@ future for loading older clips.
     </template>
   </template>
   <script>
+    "use strict";
+
     Polymer({
       is: "tf-audio-loader",
       properties: {
diff --git a/tensorflow/tensorboard/components/tf_backend/BUILD b/tensorflow/tensorboard/components/tf_backend/BUILD
deleted file mode 100644
index 22ab1faff05926da71562281da35464d241a8ad2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/BUILD
+++ /dev/null
@@ -1,79 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# TODO(dandelion): Add webfiles support for the test code.
-
-webfiles(
-    name = "tf_backend",
-    srcs = [
-        "tf-backend.html",
-        ":ts",
-    ],
-    path = "/tf-backend",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/vz_sorting",
-        "@org_polymer",
-    ],
-)
-
-tensorboard_typescript_genrule(
-    name = "ts",
-    srcs = [
-        "backend.ts",
-        "behavior.ts",
-        "requestManager.ts",
-        "router.ts",
-        "urlPathHelpers.ts",
-    ],
-    typings = [
-        "@org_definitelytyped//:d3.d.ts",
-        "@org_definitelytyped//:lodash.d.ts",
-        "//tensorflow/tensorboard/components/vz_sorting:ts_typings",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "tf-backend.html",
-        ":legacy_ts",
-    ],
-    visibility = ["//visibility:public"],
-    destdir = "tf-backend",
-    deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [
-        "backend.ts",
-        "behavior.ts",
-        "requestManager.ts",
-        "router.ts",
-        "urlPathHelpers.ts",
-    ],
-    deps = [
-        "//tensorflow/tensorboard/components:common_deps",
-        "//tensorflow/tensorboard/components/vz_sorting:legacy_ts",
-    ],
-)
diff --git a/tensorflow/tensorboard/components/tf_backend/backend.ts b/tensorflow/tensorboard/components/tf_backend/backend.ts
deleted file mode 100644
index 54d89a6bbb1a3d5cb430163700f3aa3e9e720e07..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/backend.ts
+++ /dev/null
@@ -1,471 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-module TF.Backend {
-  export interface RunEnumeration {
-    histograms: string[];
-    compressedHistogramTuples: string[];
-    scalars: string[];
-    images: string[];
-    audio: string[];
-    graph: boolean;
-    run_metadata: string[];
-  }
-
-  export interface LogdirResponse { logdir: string; }
-
-  export interface RunsResponse { [runName: string]: RunEnumeration; }
-
-  export type RunToTag = {[run: string]: string[];};
-
-  export interface Datum {
-    wall_time: Date;
-    step: number;
-  }
-
-  export type ScalarDatum = Datum & Scalar;
-  export interface Scalar { scalar: number; }
-
-  export interface Text { text: string; }
-  export type TextDatum = Datum & Text;
-
-  export type HistogramDatum = Datum & Histogram;
-  export interface Histogram {
-    min: number;
-    max: number;
-    nItems?: number;
-    sum?: number;
-    sumSquares?: number;
-    bucketRightEdges: number[];
-    bucketCounts: number[];
-  }
-
-  export interface HistogramBin {
-    x: number;
-    dx: number;
-    y: number;
-  }
-  export type HistogramSeriesDatum = HistogramSeries & Datum;
-  export interface HistogramSeries { bins: HistogramBin[]; }
-
-  export type ImageDatum = Datum & Image;
-  export interface Image {
-    width: number;
-    height: number;
-    url: string;
-  }
-
-  export type AudioDatum = Datum & Audio;
-  export interface Audio {
-    content_type: string;
-    url: string;
-  }
-
-  // A health pill encapsulates an overview of tensor element values. The value
-  // field is a list of 12 numbers that shed light on the status of the tensor.
-  export interface HealthPill {
-    node_name: string;
-    output_slot: number;
-    value: number[];
-  };
-  // When updating this type, keep it consistent with the HealthPill interface
-  // in tf_graph_common/lib/scene/scene.ts.
-  export type HealthPillDatum = Datum & HealthPill;
-  // A health pill response is a mapping from node name to a list of health pill
-  // data entries.
-  export interface HealthPillsResponse { [key: string]: HealthPillDatum[]; };
-
-  export var TYPES = [
-    'scalar', 'histogram', 'compressedHistogram', 'graph', 'image', 'audio',
-    'runMetadata', 'text'
-  ];
-  /**
-   * The Backend class provides a convenient and typed interface to the backend.
-   *
-   * It provides methods corresponding to the different data sources on the
-   * TensorBoard backend. These methods return a promise containing the data
-   * from the backend. This class does some post-processing on the data; for
-   * example, converting data elements tuples into js objects so that they can
-   * be accessed in a more convenient and clearly-documented fashion.
-   */
-  export class Backend {
-    public router: Router;
-    public requestManager: RequestManager;
-
-    /**
-     * Construct a Backend instance.
-     * @param router the Router with info on what urls to get data from
-     * @param requestManager The RequestManager, overwritable so you may
-     * manually clear request queue, etc. Defaults to a new RequestManager.
-     */
-    constructor(router: Router, requestManager?: RequestManager) {
-      this.router = router;
-      this.requestManager = requestManager || new RequestManager();
-    }
-
-    /**
-     * Returns a promise for requesting the logdir string.
-     */
-    public logdir(): Promise<LogdirResponse> {
-      return this.requestManager.request(this.router.logdir());
-    }
-
-    /**
-     * Returns a listing of all the available data in the TensorBoard backend.
-     */
-    public runs(): Promise<RunsResponse> {
-      return this.requestManager.request(this.router.runs());
-    }
-
-    /**
-     * Return a promise showing the Run-to-Tag mapping for scalar data.
-     */
-    public scalarRuns(): Promise<RunToTag> {
-      return this.runs().then((x) => _.mapValues(x, 'scalars'));
-    }
-
-    /**
-     * Return a promise showing the Run-to-Tag mapping for histogram data.
-     */
-    public histogramRuns(): Promise<RunToTag> {
-      return this.runs().then((x) => _.mapValues(x, 'histograms'));
-    }
-
-    /**
-     * Return a promise showing the Run-to-Tag mapping for image data.
-     */
-    public imageRuns(): Promise<RunToTag> {
-      return this.runs().then((x) => _.mapValues(x, 'images'));
-    }
-
-    /**
-     * Return a promise showing the Run-to-Tag mapping for audio data.
-     */
-    public audioRuns(): Promise<RunToTag> {
-      return this.runs().then((x) => _.mapValues(x, 'audio'));
-    }
-
-    /**
-     * Return a promise showing the Run-to-Tag mapping for compressedHistogram
-     * data.
-     */
-    public compressedHistogramRuns(): Promise<RunToTag> {
-      return this.runs().then((x) => _.mapValues(x, 'compressedHistograms'));
-    }
-
-    /**
-     * Return a promise showing list of runs that contain graphs.
-     */
-    public graphRuns(): Promise<string[]> {
-      return this.runs().then(
-          (x) => { return _.keys(x).filter((k) => x[k].graph); });
-    }
-
-    /**
-     * Return a promise showing the Run-to-Tag mapping for run_metadata objects.
-     */
-    public runMetadataRuns(): Promise<RunToTag> {
-      return this.runs().then((x) => _.mapValues(x, 'run_metadata'));
-    }
-
-
-    /**
-     * Returns a promise showing the Run-to-Tag mapping for text data.
-     */
-    public textRuns(): Promise<RunToTag> {
-      return this.requestManager.request(this.router.textRuns());
-    }
-
-
-    /**
-     * Returns a promise containing TextDatums for given run and tag.
-     */
-    public text(tag: string, run: string): Promise<TextDatum[]> {
-      let url = this.router.text(tag, run);
-      // tslint:disable-next-line:no-any it's convenient and harmless here
-      return this.requestManager.request(url).then(map(function(x: any) {
-        x.wall_time = timeToDate(x.wall_time);
-        return x;
-      }));
-    }
-
-    /**
-     * Return a promise of a graph string from the backend.
-     */
-    public graph(
-        tag: string, limit_attr_size?: number,
-        large_attrs_key?: string): Promise<string> {
-      let url = this.router.graph(tag, limit_attr_size, large_attrs_key);
-      return this.requestManager.request(url);
-    }
-
-    /**
-     * Return a promise containing ScalarDatums for given run and tag.
-     */
-    public scalar(tag: string, run: string): Promise<Array<ScalarDatum>> {
-      let p: Promise<TupleData<number>[]>;
-      let url = this.router.scalars(tag, run);
-      p = this.requestManager.request(url);
-      return p.then(map(detupler(createScalar)));
-    }
-
-    /**
-     * Returns a promise for requesting the health pills for a list of nodes.
-     */
-    public healthPills(nodeNames: string[], step?: number):
-        Promise<HealthPillsResponse> {
-      let postData = {'node_names': JSON.stringify(nodeNames)};
-      if (step !== undefined) {
-        // The user requested health pills for a specific step. This request
-        // might be slow since the backend reads events sequentially from disk.
-        postData['step'] = step;
-      }
-      return this.requestManager.request(this.router.healthPills(), postData);
-    }
-
-    /**
-     * Return a promise containing HistogramDatums for given run and tag.
-     */
-    public histogram(tag: string, run: string):
-        Promise<Array<HistogramSeriesDatum>> {
-      let p: Promise<TupleData<HistogramTuple>[]>;
-      let url = this.router.histograms(tag, run);
-      p = this.requestManager.request(url);
-      return p.then(map(detupler(createHistogram))).then(function(histos) {
-        // Get the minimum and maximum values across all histograms so that the
-        // visualization is aligned for all timesteps.
-        let min = d3.min(histos, d => d.min);
-        let max = d3.max(histos, d => d.max);
-
-        return histos.map(function(histo, i) {
-          return {
-            wall_time: histo.wall_time,
-            step: histo.step,
-            bins: convertBins(histo, min, max)
-          };
-        });
-      });
-    }
-
-    /**
-     * Return a promise containing ImageDatums for given run and tag.
-     */
-    public image(tag: string, run: string): Promise<Array<ImageDatum>> {
-      let url = this.router.images(tag, run);
-      let p: Promise<ImageMetadata[]>;
-      p = this.requestManager.request(url);
-      return p.then(map(this.createImage.bind(this)));
-    }
-
-    /**
-     * Return a promise containing AudioDatums for given run and tag.
-     */
-    public audio(tag: string, run: string): Promise<Array<AudioDatum>> {
-      let url = this.router.audio(tag, run);
-      let p: Promise<AudioMetadata[]>;
-      p = this.requestManager.request(url);
-      return p.then(map(this.createAudio.bind(this)));
-    }
-
-    /**
-     * Returns a promise to load the string RunMetadata for given run/tag.
-     */
-    public runMetadata(tag: string, run: string): Promise<string> {
-      let url = this.router.runMetadata(tag, run);
-      return this.requestManager.request(url);
-    }
-
-    /**
-     * Get compressedHistogram data.
-     * Unlike other methods, don't bother reprocessing this data into a nicer
-     * format. This is because we will deprecate this route.
-     */
-    private compressedHistogram(tag: string, run: string):
-        Promise<Array<Datum&CompressedHistogramTuple>> {
-      let url = this.router.compressedHistograms(tag, run);
-      let p: Promise<TupleData<CompressedHistogramTuple>[]>;
-      p = this.requestManager.request(url);
-      return p.then(map(detupler((x) => x)));
-    }
-
-    private createImage(x: ImageMetadata): Image&Datum {
-      return {
-        width: x.width,
-        height: x.height,
-        wall_time: timeToDate(x.wall_time),
-        step: x.step,
-        url: this.router.individualImage(x.query, x.wall_time),
-      };
-    }
-
-    private createAudio(x: AudioMetadata): Audio&Datum {
-      return {
-        content_type: x.content_type,
-        wall_time: timeToDate(x.wall_time),
-        step: x.step,
-        url: this.router.individualAudio(x.query),
-      };
-    }
-  }
-
-  /** Given a RunToTag, return sorted array of all runs */
-  export function getRuns(r: RunToTag): string[] {
-    return _.keys(r).sort(VZ.Sorting.compareTagNames);
-  }
-
-  /** Given a RunToTag, return array of all tags (sorted + dedup'd) */
-  export function getTags(r: RunToTag): string[] {
-    return _.union.apply(null, _.values(r)).sort(VZ.Sorting.compareTagNames);
-  }
-
-  /**
-   * Given a RunToTag and an array of runs, return every tag that appears for
-   * at least one run.
-   * Sorted, deduplicated.
-   */
-  export function filterTags(r: RunToTag, runs: string[]): string[] {
-    var result = [];
-    runs.forEach((x) => result = result.concat(r[x]));
-    return _.uniq(result).sort(VZ.Sorting.compareTagNames);
-  }
-
-  function timeToDate(x: number): Date { return new Date(x * 1000); };
-
-  /**  Just a curryable map to make things cute and tidy. */
-  function map<T, U>(f: (x: T) => U): (arr: T[]) => U[] {
-    return function(arr: T[]): U[] { return arr.map(f); };
-  };
-
-  /**
-   * This is a higher order function that takes a function that transforms a
-   * T into a G, and returns a function that takes TupleData<T>s and converts
-   * them into the intersection of a G and a Datum.
-   */
-  function detupler<T, G>(xform: (x: T) => G): (t: TupleData<T>) => Datum & G {
-    return function(x: TupleData<T>): Datum & G {
-      // Create a G, assert it has type <G & Datum>
-      let obj = <G&Datum>xform(x[2]);
-      // ... patch in the properties of datum
-      obj.wall_time = timeToDate(x[0]);
-      obj.step = x[1];
-      return obj;
-    };
-  };
-
-  function createScalar(x: number): Scalar { return {scalar: x}; };
-
-  function createHistogram(x: HistogramTuple): Histogram {
-    return {
-      min: x[0],
-      max: x[1],
-      nItems: x[2],
-      sum: x[3],
-      sumSquares: x[4],
-      bucketRightEdges: x[5],
-      bucketCounts: x[6],
-    };
-  };
-
-  /**
-   * Takes histogram data as stored by tensorboard backend and converts it to
-   * the standard d3 histogram data format to make it more compatible and easier
-   * to visualize. When visualizing histograms, having the left edge and width
-   * makes things quite a bit easier. The bins are also converted to have an
-   * uniform width, what makes the visualization easier to understand.
-   *
-   * @param histogram A histogram from tensorboard backend.
-   * @param min The leftmost edge. The binning will start on it.
-   * @param max The rightmost edge. The binning will end on it.
-   * @param numBins The number of bins of the converted data. The default of 30
-   * is a sensible default, using more starts to get artifacts because the event
-   * data is stored in buckets, and you start being able to see the aliased
-   * borders between each bucket.
-   * @return A histogram bin. Each bin has an x (left edge), a dx (width),
-   *     and a y (count).
-   *
-   * If given rightedges are inclusive, then these left edges (x) are exclusive.
-   */
-  export function convertBins(
-      histogram: Histogram, min: number, max: number, numBins = 30) {
-    if (histogram.bucketRightEdges.length !== histogram.bucketCounts.length) {
-      throw(new Error('Edges and counts are of different lengths.'));
-    }
-
-    if (max === min) {
-      // Create bins even if all the data has a single value.
-      max = min * 1.1 + 1;
-      min = min / 1.1 - 1;
-    }
-    let binWidth = (max - min) / numBins;
-    let bucketLeft = min;  // Use the min as the starting point for the bins.
-    let bucketPos = 0;
-    return d3.range(min, max, binWidth).map(function(binLeft) {
-      let binRight = binLeft + binWidth;
-
-      // Take the count of each existing bucket, multiply it by the proportion
-      // of overlap with the new bin, then sum and store as the count for the
-      // new bin. If no overlap, will add to zero, if 100% overlap, will include
-      // the full count into new bin.
-      let binY = 0;
-      while (bucketPos < histogram.bucketRightEdges.length) {
-        // Clip the right edge because right-most edge can be infinite-sized.
-        let bucketRight = Math.min(max, histogram.bucketRightEdges[bucketPos]);
-
-        let intersect =
-            Math.min(bucketRight, binRight) - Math.max(bucketLeft, binLeft);
-        let count = (intersect / (bucketRight - bucketLeft)) *
-            histogram.bucketCounts[bucketPos];
-
-        binY += intersect > 0 ? count : 0;
-
-        // If bucketRight is bigger than binRight, than this bin is finished and
-        // there is data for the next bin, so don't increment bucketPos.
-        if (bucketRight > binRight) {
-          break;
-        }
-        bucketLeft = Math.max(min, bucketRight);
-        bucketPos++;
-      };
-
-      return {x: binLeft, dx: binWidth, y: binY};
-    });
-  }
-
-  /**
-   * The following interfaces (TupleData, HistogramTuple,
-   * CompressedHistogramTuple, ImageMetadata, and AudioMetadata) describe how
-   * the data is sent over from the backend.
-   */
-  type TupleData<T> = [number, number, T];  // wall_time, step
-
-  // Min, Max, nItems, Sum, Sum_Squares, right edges of buckets, nItems in
-  // buckets
-  type HistogramTuple =
-      [number, number, number, number, number, number[], number[]];
-  type CompressedHistogramTuple = [number, number][];  // percentile, value
-  interface ImageMetadata {
-    width: number;
-    height: number;
-    wall_time: number;
-    step: number;
-    query: string;
-  }
-  interface AudioMetadata {
-    content_type: string;
-    wall_time: number;
-    step: number;
-    query: string;
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/behavior.ts b/tensorflow/tensorboard/components/tf_backend/behavior.ts
deleted file mode 100644
index de6590456f7af3a7671a9ed01a7d1d18dc7749b2..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/behavior.ts
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-module TF.Backend {
-  export var Behavior = {
-    properties: {
-      /** *** Required properties *** */
-      /** Data type. One of TF.Backend.TYPES */
-      dataType: {
-        type: String,
-        observer: '_throwErrorOnUnrecognizedType',
-      },
-
-      /** TF.Backend.Backend for data loading. */
-      backend: {
-        type: Object,
-      },
-
-      /** Should it automatically load when configured ready? Default true. */
-      autoLoad: {
-        type: Boolean,
-        value: true,
-      },
-
-      /** *** Component-provided properties *** */
-      /** Every tag available for data type (sorted, dedpulicated) */
-      tags: {
-        type: Array,
-        readOnly: true,
-        notify: true,
-      },
-
-      /** Every run available for data type (sorted) */
-      runs: {
-        type: Array,
-        readOnly: true,
-        notify: true,
-      },
-
-      /** Mapping from runs to tags for the data type */
-      run2tag: {
-        type: Object,
-        readOnly: true,
-        notify: true,
-      },
-
-      /** Promise provider for the data. Useful for passing to subcomponents */
-      dataProvider:
-          {type: Function, computed: '_getDataProvider(dataType, backend)'},
-
-      /** Has the dashboard loaded yet? */
-      loadState: {
-        type: String,
-        value: 'noload',  // [noload, pending, loaded, failure]
-        readOnly: true,
-      },
-
-      /**
-       * True if dashboard has loaded, and no tags were found.
-       * Persists through subsequent reloads (ie. still true while
-       * next load is pending) so warning won't flash away every reload
-       * when there is no data.
-       */
-      dataNotFound: {
-        type: Boolean,
-        value: false,
-        readOnly: true,
-      }
-
-    },
-    observers: ['_do_autoLoad(dataType, backend, autoLoad)'],
-    /**
-     * Reloading works in two steps:
-     * Backend reload, which gets metadata on available runs, tags, etc from
-     *   the backend.
-     * Frontend reload, which loads new data for each chart or visual display.
-     * Backend reload logic is provided by this behaivor. The frontend reload
-     *   logic should be provided elsewhere, since it is component-specific.
-     * To keep things simple and consistent, we do the backend reload first,
-     *   and the frontend reload afterwards.
-     */
-    reload: function() {
-      return this.backendReload().then(
-          (x) => { return this.frontendReload(); });
-    },
-    /**
-     * Load data from backend and then set run2tag, tags, runs, and loadState.
-     * Returns a promise that resolves/rejects when data is loaded.
-     */
-    backendReload: function() {
-      if (this.dataType == null) {
-        throw new Error('TF.Backend.Behavior: Need a dataType to reload.');
-      }
-      if (this.backend == null) {
-        throw new Error('TF.Backend.Behavior: Need a backend to reload.');
-      }
-      var runsRoute = this.backend[this.dataType + 'Runs'].bind(this.backend);
-      this._setLoadState('pending');
-      return runsRoute().then(
-          (x) => {
-            this._setLoadState('loaded');
-            if (_.isEqual(x, this.run2tag)) {
-              // If x and run2tag are equal, let's avoid updating everything
-              // since that can needlessly trigger run changes, reloads, etc
-              return x;
-            }
-            this._setRun2tag(x);
-            var tags = TF.Backend.getTags(x);
-            this._setDataNotFound(tags.length === 0);
-            this._setTags(tags);
-            this._setRuns(TF.Backend.getRuns(x));
-            return x;
-          },
-          (fail) => {
-            this._setLoadState('failure');
-            return fail;
-          });
-    },
-    _do_autoLoad: function(type, backend, autoLoad) {
-      if (autoLoad) {
-        this.reload();
-      };
-    },
-    _getDataProvider: function(dataType, backend) {
-      return this.backend[this.dataType].bind(this.backend);
-    },
-    _throwErrorOnUnrecognizedType: function(dataType) {
-      if (TF.Backend.TYPES.indexOf(dataType) === -1) {
-        throw new Error('TF.Backend.Behavior: Unknown dataType ' + dataType);
-      }
-    },
-  };
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/requestManager.ts b/tensorflow/tensorboard/components/tf_backend/requestManager.ts
deleted file mode 100644
index 1dfc3348b59f0561054a96bef93872c1c17276a7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/requestManager.ts
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-module TF.Backend {
-  interface ResolveReject { resolve: Function; reject: Function; }
-  /**
-   * Manages many fetch requests. Launches up to nSimultaneousRequests
-   * simultaneously, and maintains a LIFO queue of requests to process when
-   * more urls are requested than can be handled at once. The queue can be
-   * cleared.
-   *
-   * When a request is made, a Promise is returned which resolves with the
-   * parsed JSON result from the request.
-   */
-  export class RequestCancellationError extends Error {
-    public name = 'RequestCancellationError';
-  }
-
-  export class RequestNetworkError extends Error {
-    public name: string;
-    public req: XMLHttpRequest;
-    public url: string;
-
-    constructor(req: XMLHttpRequest, url) {
-      super();
-      this.message = `RequestNetworkError: ${req.status} at ${url}`;
-      this.name = 'RequestNetworkError';
-      this.req = req;
-      this.url = url;
-    }
-  }
-
-  export class RequestManager {
-    private _queue: ResolveReject[];
-    private _maxRetries: number;
-    private _nActiveRequests: number;
-    private _nSimultaneousRequests: number;
-
-    constructor(nSimultaneousRequests = 10, maxRetries = 3) {
-      this._queue = [];
-      this._nActiveRequests = 0;
-      this._nSimultaneousRequests = nSimultaneousRequests;
-      this._maxRetries = maxRetries;
-    }
-
-    /**
-     * Gives a promise that loads assets from given url (respects queuing). If
-     * postData is provided, this request will use POST, not GET. This is an
-     * object mapping POST keys to string values.
-     */
-    public request(
-        url: string, postData?: {[key: string]: string}): Promise<any> {
-      var promise = new Promise((resolve, reject) => {
-                      var resolver = {resolve: resolve, reject: reject};
-                      this._queue.push(resolver);
-                      this.launchRequests();
-                    })
-                        .then(() => {
-                          return this.promiseWithRetries(
-                              url, this._maxRetries, postData);
-                        })
-                        .then(
-                            (response) => {
-                              // Success - Let's free space for another active
-                              // reqest, and launch it
-                              this._nActiveRequests--;
-                              this.launchRequests();
-                              return response;
-                            },
-                            (rejection) => {
-                              if (rejection.name === 'RequestNetworkError') {
-                                // If we failed due to network error, we should
-                                // decrement
-                                // _nActiveRequests because this request was
-                                // active
-                                this._nActiveRequests--;
-                                this.launchRequests();
-                              }
-                              return Promise.reject(rejection);
-                            });
-      return promise;
-    }
-
-    public clearQueue() {
-      while (this._queue.length > 0) {
-        this._queue.pop().reject(
-            new RequestCancellationError('Request cancelled by clearQueue'));
-      }
-    }
-
-    /* Return number of currently pending requests */
-    public activeRequests(): number {
-      return this._nActiveRequests;
-    }
-
-    /* Return total number of outstanding requests (includes queue) */
-    public outstandingRequests(): number {
-      return this._nActiveRequests + this._queue.length;
-    }
-
-    private launchRequests() {
-      while (this._nActiveRequests < this._nSimultaneousRequests &&
-             this._queue.length > 0) {
-        this._nActiveRequests++;
-        this._queue.pop().resolve();
-      }
-    }
-
-    /**
-     * Try to request a given URL using overwritable _promiseFromUrl method.
-     * If the request fails for any reason, we will retry up to maxRetries
-     * times. In practice, this will help us paper over transient network issues
-     * like '502 Bad Gateway'.
-     * By default, Chrome displays network errors in console, so
-     * the user will be able to tell when the requests are failing. I think this
-     * is a feature, if the request failures and retries are causing any
-     * pain to users, they can see it and file issues.
-     */
-    private promiseWithRetries(
-        url: string,
-        maxRetries: number,
-        postData?: {[key: string]: string}) {
-      var success = (x) =>  x;
-      var failure = (x) => {
-        if (maxRetries > 0) {
-          return this.promiseWithRetries(url, maxRetries - 1, postData);
-        } else {
-          return Promise.reject(x);
-        }
-      };
-      return this._promiseFromUrl(url, postData).then(success, failure);
-    }
-
-    /* Actually get promise from url using XMLHttpRequest */
-    protected _promiseFromUrl(url:string, postData?: {[key: string]: string}) {
-      return new Promise((resolve, reject) => {
-        let req = new XMLHttpRequest();
-        req.open(postData ? 'POST' : 'GET', url);
-
-        let formData;
-        if (postData) {
-          // We are to make a POST request.
-          formData = new FormData();
-          for (let postKey in postData) {
-            if (postKey) {
-              // The linter requires 'for in' loops to be filtered by an if
-              // condition.
-              formData.append(postKey, postData[postKey]);
-            }
-          }
-        }
-        req.onload = function() {
-          if (req.status === 200) {
-            resolve(JSON.parse(req.responseText));
-          } else {
-            reject(new RequestNetworkError(req, url));
-          }
-        };
-        req.onerror = function() {
-          reject(new RequestNetworkError(req, url));
-        };
-        req.send(formData);
-      });
-    }
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/router.ts b/tensorflow/tensorboard/components/tf_backend/router.ts
deleted file mode 100644
index d14216dcfc92cc58df77d8c43cfa4c5cc2772bbe..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/router.ts
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module TF.Backend {
-  export type RunTagUrlFn = (tag: string, run: string) => string;
-
-  export interface Router {
-    logdir: () => string;
-    runs: () => string;
-    scalars: RunTagUrlFn;
-    histograms: RunTagUrlFn;
-    compressedHistograms: RunTagUrlFn;
-    images: RunTagUrlFn;
-    individualImage: (query: string, wallTime: number) => string;
-    audio: RunTagUrlFn;
-    individualAudio: (query: string) => string;
-    graph: (run: string, limit_attr_size?: number, large_attrs_key?: string)
-        => string;
-    runMetadata: RunTagUrlFn;
-    textRuns: () => string;
-    text: RunTagUrlFn;
-    healthPills: () => string;
-  };
-
-  /**
-   * The standard router for communicating with the TensorBoard backend
-   * @param dataDir {string} The base prefix for finding data on server.
-   * @param demoMode {boolean} Whether to modify urls for filesystem demo usage.
-   */
-  export function router(dataDir = 'data', demoMode = false): Router {
-    var clean = demoMode ? demoify : (x) => x;
-    if (dataDir[dataDir.length - 1] === '/') {
-      dataDir = dataDir.slice(0, dataDir.length - 1);
-    }
-    function standardRoute(route: string, demoExtension = '.json'):
-        ((tag: string, run: string) => string) {
-      return function(tag: string, run: string): string {
-        var url =
-            dataDir + '/' + route + clean(queryEncoder({tag: tag, run: run}));
-        if (demoMode) {
-          url += demoExtension;
-        }
-        return url;
-      };
-    }
-    function individualImageUrl(query: string, wallTime: number) {
-      var url = dataDir + '/' + clean('individualImage?' + query);
-      // Include wall_time just to disambiguate the URL and force the browser
-      // to reload the image when the URL changes. The backend doesn't care
-      // about the value.
-      url += demoMode ? '.png' : '&ts=' + wallTime;
-      return url;
-    }
-    function individualAudioUrl(query: string) {
-      var url = dataDir + '/' + clean('individualAudio?' + query);
-      if (demoMode) {
-        url += '.wav';
-      }
-      return url;
-    }
-    function graphUrl(run: string, limit_attr_size?: number,
-        large_attrs_key?: string) {
-      let query_params = [['run', clean(run)]];
-      if (limit_attr_size != null && !demoMode) {
-        query_params.push(['limit_attr_size', String(limit_attr_size)]);
-      }
-      if (large_attrs_key != null && !demoMode) {
-        query_params.push(['large_attrs_key', large_attrs_key]);
-      }
-      let query = query_params
-                      .map(param => {
-                        return param[0] + '=' + encodeURIComponent(param[1]);
-                      })
-                      .join('&');
-      var url = dataDir + '/graph' + clean('?' + query);
-      if (demoMode) {
-        url += '.pbtxt';
-      }
-      return url;
-    }
-    return {
-      logdir: () => dataDir + '/logdir',
-      runs: () => dataDir + '/runs' + (demoMode ? '.json' : ''),
-      individualImage: individualImageUrl,
-      individualAudio: individualAudioUrl,
-      graph: graphUrl,
-      scalars: standardRoute('scalars'),
-      histograms: standardRoute('histograms'),
-      compressedHistograms: standardRoute('compressedHistograms'),
-      images: standardRoute('images'),
-      audio: standardRoute('audio'),
-      runMetadata: standardRoute('run_metadata', '.pbtxt'),
-      healthPills: () => dataDir + '/plugin/debugger/health_pills',
-      textRuns: () => dataDir + '/plugin/text/runs' + (demoMode ? '.json' : ''),
-      text: standardRoute('plugin/text/text'),
-    };
-  };
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts b/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts
deleted file mode 100644
index 67ee70ce8317db9b16856d696ce354ce19bc791a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/backendTests.ts
+++ /dev/null
@@ -1,302 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module TF.Backend {
-  describe('urlPathHelpers', function() {
-    let demoify = TF.Backend.demoify;
-    let encode = TF.Backend.queryEncoder;
-    it('demoify works as expected', function() {
-      let demoified = demoify(BAD_CHARACTERS);
-      let all_clean = '';
-      for (let i = 0; i < BAD_CHARACTERS.length; i++) {
-        all_clean += '_';
-      }
-      chai.assert.equal(
-          demoified, all_clean, 'cleaning the BAD_CHARACTERS works');
-      chai.assert.equal(
-          demoify('foozod'), 'foozod', 'doesnt change safe string');
-      chai.assert.equal(demoify('foo zod (2)'), 'foo_zod__2_', 'simple case');
-    });
-
-    it('queryEncoder works with demoify on spaces and parens', function() {
-      let params = {foo: 'something with spaces and (parens)'};
-      let actual = demoify(encode(params));
-      let expected = '_foo_something_with_spaces_and__28parens_29';
-      chai.assert.equal(actual, expected);
-    });
-  });
-
-  function assertIsDatum(x) {
-    chai.assert.isNumber(x.step);
-    chai.assert.instanceOf(x.wall_time, Date);
-  }
-
-  describe('backend tests', function() {
-    let backend: Backend;
-    let rm: RequestManager;
-    let base = 'data';
-    let demoRouter = TF.Backend.router(base, true);
-    beforeEach(function() {
-      // Construct a demo Backend (third param is true)
-      backend = new Backend(demoRouter);
-      rm = new RequestManager();
-    });
-
-    it('runs are loaded properly', function(done) {
-      let runsResponse = backend.runs();
-      let actualRuns = rm.request(demoRouter.runs());
-      Promise.all([runsResponse, actualRuns]).then((values) => {
-        chai.assert.deepEqual(values[0], values[1]);
-        done();
-      });
-    });
-
-    it('scalars are loaded properly', function(done) {
-      backend.scalar('cross_entropy (1)', 'run1').then((s) => {
-        // just check the data got reformatted properly
-        let aScalar = s[s.length - 1];
-        assertIsDatum(aScalar);
-        chai.assert.isNumber(aScalar.scalar);
-        // verify date conversion works
-        chai.assert.equal(aScalar.wall_time.valueOf(), 40000);
-        done();
-      });
-    });
-
-    it('histograms are loaded properly', function(done) {
-      backend.histogram('histo1', 'run1').then((histos) => {
-        let histo = histos[0];
-        assertIsDatum(histo);
-        chai.assert.instanceOf(histo.bins, Array);
-        done();
-      });
-    });
-
-    it('all registered types have handlers', function() {
-      TYPES.forEach((t: string) => {
-        chai.assert.isDefined(backend[t], t);
-        chai.assert.isDefined(backend[t + 'Runs'], t + 'Runs');
-      });
-    });
-
-    it('images are loaded properly', function(done) {
-      backend.image('im1', 'run1').then((images) => {
-        let image = images[0];
-        assertIsDatum(image);
-        chai.assert.isNumber(image.width);
-        chai.assert.isNumber(image.height);
-        let nonDemoQuery = 'index=0&tag=im1&run=run1';
-        let expectedUrl = demoRouter.individualImage(nonDemoQuery, 10.0);
-        chai.assert.equal(image.url, expectedUrl);
-        done();
-      });
-    });
-
-    it('audio is loaded properly', function(done) {
-      backend.audio('audio1', 'run1').then((audio_clips) => {
-        let audio = audio_clips[0];
-        assertIsDatum(audio);
-        chai.assert.equal(audio.content_type, 'audio/wav');
-        let nonDemoQuery = 'index=0&tag=audio1&run=run1';
-        let expectedUrl = demoRouter.individualAudio(nonDemoQuery);
-        chai.assert.equal(audio.url, expectedUrl);
-        done();
-      });
-    });
-
-    it('trailing slash removed from base route', function() {
-      let r = TF.Backend.router('foo/');
-      chai.assert.equal(r.runs(), 'foo/runs');
-    });
-
-    it('run helper methods work', function(done) {
-      let scalar = {run1: ['cross_entropy (1)'], fake_run_no_data: ['scalar2']};
-      let image = {run1: ['im1'], fake_run_no_data: ['im1', 'im2']};
-      let audio = {run1: ['audio1'], fake_run_no_data: ['audio1', 'audio2']};
-      let runMetadata = {run1: ['step99'], fake_run_no_data: ['step99']};
-      let graph = ['fake_run_no_data'];
-      let count = 0;
-      function next() {
-        count++;
-        if (count === 4) {
-          done();
-        }
-      }
-      backend.scalarRuns().then((x) => {
-        chai.assert.deepEqual(x, scalar);
-        next();
-      });
-      backend.imageRuns().then((x) => {
-        chai.assert.deepEqual(x, image);
-        next();
-      });
-      backend.audioRuns().then((x) => {
-        chai.assert.deepEqual(x, audio);
-        next();
-      });
-      backend.runMetadataRuns().then((x) => {
-        chai.assert.deepEqual(x, runMetadata);
-        next();
-      });
-      backend.graphRuns().then((x) => {
-        chai.assert.deepEqual(x, graph);
-        next();
-      });
-    });
-
-    it('runToTag helpers work', function() {
-      let r2t: RunToTag = {
-        run1: ['foo', 'bar', 'zod'],
-        run2: ['zod', 'zoink'],
-        a: ['foo', 'zod']
-      };
-      let empty1: RunToTag = {};
-      let empty2: RunToTag = {run1: [], run2: []};
-      chai.assert.deepEqual(getRuns(r2t), ['a', 'run1', 'run2']);
-      chai.assert.deepEqual(getTags(r2t), ['bar', 'foo', 'zod', 'zoink']);
-      chai.assert.deepEqual(filterTags(r2t, ['run1', 'run2']), getTags(r2t));
-      chai.assert.deepEqual(filterTags(r2t, ['run1']), ['bar', 'foo', 'zod']);
-      chai.assert.deepEqual(
-          filterTags(r2t, ['run2', 'a']), ['foo', 'zod', 'zoink']);
-
-      chai.assert.deepEqual(getRuns(empty1), []);
-      chai.assert.deepEqual(getTags(empty1), []);
-
-      chai.assert.deepEqual(getRuns(empty2), ['run1', 'run2']);
-      chai.assert.deepEqual(getTags(empty2), []);
-    });
-  });
-
-  describe('Verify that the histogram format conversion works.', function() {
-
-    function assertHistogramEquality(h1, h2) {
-      h1.forEach(function(b1, i) {
-        let b2 = h2[i];
-        chai.assert.closeTo(b1.x, b2.x, 1e-10);
-        chai.assert.closeTo(b1.dx, b2.dx, 1e-10);
-        chai.assert.closeTo(b1.y, b2.y, 1e-10);
-      });
-    }
-
-    it('Throws and error if the inputs are of different lengths', function() {
-      chai.assert.throws(function() {
-        convertBins(
-            {bucketRightEdges: [0], bucketCounts: [1, 2], min: 1, max: 2}, 1, 2,
-            2);
-      }, 'Edges and counts are of different lengths.');
-    });
-
-    it('Handles data with no bins', function() {
-      chai.assert.deepEqual(
-          convertBins(
-              {bucketRightEdges: [], bucketCounts: [], min: 0, max: 0}, 0, 0,
-              0),
-          []);
-    });
-
-    it('Handles data with one bin', function() {
-      let counts = [1];
-      let rightEdges = [1.21e-12];
-      let histogram = [{x: 1.1e-12, dx: 1.21e-12 - 1.1e-12, y: 1}];
-      let newHistogram = convertBins(
-          {
-            bucketRightEdges: rightEdges,
-            bucketCounts: counts,
-            min: 1.1e-12,
-            max: 1.21e-12
-          },
-          1.1e-12, 1.21e-12, 1);
-      assertHistogramEquality(newHistogram, histogram);
-    });
-
-    it('Handles data with two bins.', function() {
-      let counts = [1, 2];
-      let rightEdges = [1.1e-12, 1.21e-12];
-      let histogram = [
-        {x: 1.0e-12, dx: 1.05e-13, y: 1.09090909090909},
-        {x: 1.105e-12, dx: 1.05e-13, y: 1.9090909090909}
-      ];
-      let newHistogram = convertBins(
-          {
-            bucketRightEdges: rightEdges,
-            bucketCounts: counts,
-            min: 1.0e-12,
-            max: 1.21e-12
-          },
-          1.0e-12, 1.21e-12, 2);
-      assertHistogramEquality(newHistogram, histogram);
-    });
-
-    it('Handles a domain that crosses zero, but doesn\'t include zero as ' +
-           'an edge.',
-       function() {
-         let counts = [1, 2];
-         let rightEdges = [-1.0e-12, 1.0e-12];
-         let histogram = [
-           {x: -1.1e-12, dx: 1.05e-12, y: 1.95},
-           {x: -0.5e-13, dx: 1.05e-12, y: 1.05}
-         ];
-         let newHistogram = convertBins(
-             {
-               bucketRightEdges: rightEdges,
-               bucketCounts: counts,
-               min: -1.1e-12,
-               max: 1.0e-12
-             },
-             -1.1e-12, 1.0e-12, 2);
-         assertHistogramEquality(newHistogram, histogram);
-       });
-
-    it('Handles a histogram of all zeros', function() {
-      let h = {
-        min: 0,
-        max: 0,
-        nItems: 51200,
-        sum: 0,
-        sumSquares: 0,
-        bucketRightEdges: [0, 1e-12, 1.7976931348623157e+308],
-        bucketCounts: [0, 51200, 0],
-        wall_time: '2017-01-25T02:30:11.257Z',
-        step: 0
-      };
-      let newHistogram = convertBins(h, 0, 0, 5);
-      let expectedHistogram = [
-        {x: -1, dx: 0.4, y: 0}, {x: -0.6, dx: 0.4, y: 0},
-        {x: -0.2, dx: 0.4, y: 51200}, {x: 0.2, dx: 0.4, y: 0},
-        {x: 0.6, dx: 0.4, y: 0}
-      ];
-      assertHistogramEquality(newHistogram, expectedHistogram);
-    });
-
-    it('Handles a right-most right edge that extends to very large number.',
-       function() {
-         let counts = [1, 2, 3];
-         let rightEdges = [0, 1.0e-12, 1.0e14];
-         let histogram = [
-           {x: -1.0e-12, dx: 0.7e-12, y: 0.7},
-           {x: -0.3e-12, dx: 0.7e-12, y: 1.1},
-           {x: 0.4e-12, dx: 0.7e-12, y: 4.2}
-         ];
-         let newHistogram = convertBins(
-             {
-               bucketRightEdges: rightEdges,
-               bucketCounts: counts,
-               min: -1.0e-12,
-               max: 1.1e-12
-             },
-             -1.0e-12, 1.1e-12, 3);
-         assertHistogramEquality(newHistogram, histogram);
-       });
-  });
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts b/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts
deleted file mode 100644
index 99bf2504ca67cf2483a580e13d6ee653fe51437c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/behaviorTests.ts
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-declare function fixture(id: string): void;
-
-    module TF.Backend {
-      window.addEventListener('WebComponentsReady', function() {
-        Polymer({
-          is: 'test-element',
-          behaviors: [TF.Backend.Behavior],
-          frontendReload: function() {
-            // no-op
-          },
-        });
-      });
-
-      describe('data-behavior', function() {
-        var testElement;
-        var resolve;
-        var reject;
-        var fakeBackend = {
-          scalarRuns: function() {
-            return new Promise(function(_resolve, _reject) {
-              resolve = (x) => _resolve(x);
-              reject = (x) => _reject(x);
-            });
-          },
-          scalar: function(x) { return this; },
-        };
-        beforeEach(function() {
-          testElement = fixture('testElementFixture');
-          testElement.autoLoad = false;
-          testElement.backend = fakeBackend;
-          testElement.dataType = 'scalar';
-        });
-
-        it('load states work as expected', function(done) {
-          chai.assert.equal(testElement.loadState, 'noload');
-          var reloaded = testElement.reload();
-          chai.assert.equal(testElement.loadState, 'pending');
-          resolve();
-          reloaded
-              .then(function() {
-                chai.assert.equal(testElement.loadState, 'loaded');
-                var reloaded2 = testElement.reload();
-                chai.assert.equal(testElement.loadState, 'pending');
-                reject();
-                return reloaded2;
-              })
-              .then(function() {
-                chai.assert.equal(testElement.loadState, 'failure');
-                done();
-              });
-        });
-
-        it('data provider set appropriately', function() {
-          chai.assert.deepEqual(
-              testElement.dataProvider(), testElement.backend);
-        });
-
-        it('loads data as expected', function(done) {
-          var r2t: RunToTag = {
-            run1: ['foo', 'bar', 'zod'],
-            run2: ['zoink', 'zow'],
-            run3: ['.'],
-          };
-          var tags = TF.Backend.getTags(r2t);
-          var runs = TF.Backend.getRuns(r2t);
-          testElement.backend = fakeBackend;
-          testElement.dataType = 'scalar';
-          testElement.reload().then(function(x) {
-            chai.assert.deepEqual(testElement.run2tag, r2t);
-            chai.assert.deepEqual(testElement.runs, runs);
-            chai.assert.deepEqual(testElement.tags, tags);
-            done();
-          });
-          resolve(r2t);
-        });
-
-        it('errors thrown on bad data types', function() {
-          testElement.backend = undefined;
-          chai.assert.throws(function() {
-            testElement.dataType = 'foo';
-          });
-          testElement.dataType = 'scalar';
-          testElement.dataType = 'graph';
-          testElement.dataType = 'histogram';
-        });
-
-        it('dataNotFound flag works', function(done) {
-          chai.assert.isFalse(testElement.dataNotFound, 'initially false');
-          var next = testElement.reload();
-          chai.assert.isFalse(
-              testElement.dataNotFound, 'still false while pending');
-          resolve({foo: [], bar: []});
-          next.then(() => {
-            chai.assert.isTrue(testElement.dataNotFound, 'true on empty data');
-            var last = testElement.reload();
-            chai.assert.isTrue(
-                testElement.dataNotFound, 'still true while pending');
-            resolve({foo: ['bar'], bar: ['zod']});
-            last.then(() => {
-              chai.assert.isFalse(
-                  testElement.dataNotFound, 'false now that we have data');
-              done();
-            });
-          });
-        });
-
-        it('reloads as soon as setup, if autoReload is true', function(done) {
-          var r2t = {foo: [], bar: []};
-          var fakeBackend = {
-            scalarRuns: () => Promise.resolve(r2t),
-            scalar: () => null,
-          };
-          testElement = fixture('testElementFixture');
-          testElement.dataType = 'scalar';
-          testElement.backend = fakeBackend;
-          setTimeout(() => {
-            chai.assert.equal(testElement.run2tag, r2t);
-            done();
-          });
-        });
-
-        it('doesn\'t mutate props if backend returns same data', function(
-                                                                     done) {
-          var r2t_1 = {foo: ['1', '2'], bar: ['3', '4']};
-          var r2t_2 = {foo: ['1', '2'], bar: ['3', '4']};
-          var fakeBackend = {
-            scalarRuns: () => Promise.resolve(r2t_1),
-            scalar: () => null,
-          };
-          testElement.backend = fakeBackend;
-          testElement.reload().then(() => {
-            fakeBackend.scalarRuns = () => Promise.resolve(r2t_2);
-            var tags = testElement.tags;
-            testElement.reload().then(() => {
-              // shallow equality ensures it wasn't recomputed
-              chai.assert.equal(
-                  tags, testElement.tags, 'tags was not recomputed');
-              done();
-            });
-          });
-
-          it('reload calls frontendReload', function(done) {
-            testElement.frontendReload = function() { done(); };
-            testElement.reload();
-          });
-
-        });
-      });
-    }
diff --git a/tensorflow/tensorboard/components/tf_backend/test/index.html b/tensorflow/tensorboard/components/tf_backend/test/index.html
deleted file mode 100644
index 7f51861d25a365fcf7e365897c427f931004f1c5..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/index.html
+++ /dev/null
@@ -1,46 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <meta charset="utf-8">
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <script src="../../web-component-tester/browser.js"></script>
-  <link rel="import" href="../../polymer/polymer.html">
-  <link rel="import" href="../../tf-imports/d3.html">
-  <link rel="import" href="../../vz-sorting/vz-sorting.html">
-</head>
-<body>
-  <test-fixture id="testElementFixture">
-    <template>
-      <test-element
-        id="test"
-      ></test-element>
-    </template>
-  </test-fixture>
-    <script src="../requestManager.js"></script>
-    <script src="../urlPathHelpers.js"></script>
-    <script src="../router.js"></script>
-    <script src="../backend.js"></script>
-    <script src="../behavior.js"></script>
-
-    <script src="requestManagerTest.js"></script>
-    <script src="backendTests.js"></script>
-    <script src="behaviorTests.js"></script>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_backend/test/requestManagerTest.ts b/tensorflow/tensorboard/components/tf_backend/test/requestManagerTest.ts
deleted file mode 100644
index 9d5bd4d81968b97b37e51e2ec20d462cc1ac96ba..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/test/requestManagerTest.ts
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module TF.Backend {
-  interface MockRequest {
-    resolve: Function;
-    reject: Function;
-    id: number;
-    url: string;
-  }
-
-  class MockedRequestManager extends TF.Backend.RequestManager {
-    private resolvers: Function[];
-    private rejectors: Function[];
-    public requestsDispatched: number;
-
-    constructor(maxRequests = 10, maxRetries = 3) {
-      super(maxRequests, maxRetries);
-      this.resolvers = [];
-      this.rejectors = [];
-      this.requestsDispatched = 0;
-    }
-
-    protected _promiseFromUrl(url) {
-      return new Promise((resolve, reject) => {
-        var mockJSON = {
-          ok: true,
-          json: function() { return url; },
-          url: url,
-          status: 200,
-        };
-        var mockFailedRequest: any = {
-          ok: false,
-          url: url,
-          status: 502,
-        };
-        var mockFailure = new RequestNetworkError(mockFailedRequest, url);
-        this.resolvers.push(function() { resolve(mockJSON); });
-        this.rejectors.push(function() { reject(mockFailure); });
-        this.requestsDispatched++;
-      });
-    }
-
-    public resolveFakeRequest() {
-      this.resolvers.pop()();
-    }
-
-    public rejectFakeRequest() {
-      this.rejectors.pop()();
-    }
-
-    public dispatchAndResolve() {
-      // Wait for at least one request to be dispatched, then resolve it.
-      this.waitForDispatch(1).then(() => this.resolveFakeRequest());
-    }
-
-    public waitForDispatch(num) {
-      return waitForCondition(() => {return this.requestsDispatched >= num; });
-    }
-  }
-
-  /* Create a promise that returns when *check* returns true. */
-  // May cause a test timeout if check never becomes true.
-  function waitForCondition(check: () => boolean): Promise<any> {
-    return new Promise((resolve, reject) => {
-      var go = function() {
-        if (check()) {
-          resolve();
-        }
-        setTimeout(go, 2);
-      };
-      go();
-    });
-  }
-
-  describe('backend', () => {
-    describe('request manager', () => {
-      it('request loads JSON properly', (done) => {
-        var rm = new TF.Backend.RequestManager();
-        var promise = rm.request('data/example.json');
-        promise.then(
-            (response) => {
-              chai.assert.deepEqual(response, {foo: 3, bar: 'zoidberg'});
-              done();
-            },
-            (reject) => {
-              throw new Error(reject);
-            });
-      });
-
-      it('rejects on bad url', (done) => {
-        var rm = new TF.Backend.RequestManager(5, 0);
-        var bad_url = '_bad_url_which_doesnt_exist.json';
-        var promise = rm.request(bad_url);
-        promise.then(
-            (success) => {
-              done(new Error('the promise should have rejected'));
-            },
-            (reject: TF.Backend.RequestNetworkError) => {
-              chai.assert.instanceOf(reject, TF.Backend.RequestNetworkError);
-              chai.assert.include(reject.message, '404');
-              chai.assert.include(reject.message, bad_url);
-              chai.assert.equal(reject.req.status, 404);
-              done();
-            });
-      });
-
-      it('can retry if requests fail', (done) => {
-        var rm = new MockedRequestManager(3, 5);
-        var r = rm.request('foo');
-        rm.waitForDispatch(1).then(() => {
-          rm.rejectFakeRequest();
-          return rm.waitForDispatch(2);
-        }).then(() => rm.resolveFakeRequest());
-        r.then((success) => done());
-      });
-
-      it('retries at most maxRetries times', (done) => {
-        var MAX_RETRIES = 2;
-        var rm = new MockedRequestManager(3, MAX_RETRIES);
-        var r = rm.request('foo');
-        rm.waitForDispatch(1).then(() => {
-          rm.rejectFakeRequest();
-          return rm.waitForDispatch(2);
-        }).then(() => {
-          rm.rejectFakeRequest();
-          return rm.waitForDispatch(3);
-        }).then(() => {
-          rm.rejectFakeRequest();
-        });
-
-        r.then(
-            (success) => done(new Error('The reqest should have failed')),
-            (failure) => done());
-      });
-
-      it('requestManager only sends maxRequests requests at a time', (done) => {
-        var rm = new MockedRequestManager(3);
-        var requestsConcluded = 0;
-        var r0 = rm.request('1');
-        var r1 = rm.request('2');
-        var r2 = rm.request('3');
-        var r3 = rm.request('4');
-        chai.assert.equal(rm.activeRequests(), 3, 'three requests are active');
-        chai.assert.equal(
-            rm.outstandingRequests(), 4, 'four requests are pending');
-        rm.waitForDispatch(3)
-            .then(() => {
-              chai.assert.equal(
-                  rm.activeRequests(), 3,
-                  'three requests are still active (1)');
-              chai.assert.equal(
-                  rm.requestsDispatched, 3, 'three requests were dispatched');
-              rm.resolveFakeRequest();
-              return rm.waitForDispatch(4);
-            })
-            .then(() => {
-              chai.assert.equal(
-                  rm.activeRequests(), 3,
-                  'three requests are still active (2)');
-              chai.assert.equal(
-                  rm.requestsDispatched, 4, 'four requests were dispatched');
-              chai.assert.equal(
-                  rm.outstandingRequests(), 3, 'three requests are pending');
-              rm.resolveFakeRequest();
-              rm.resolveFakeRequest();
-              rm.resolveFakeRequest();
-              return r3;
-            })
-            .then(() => {
-              chai.assert.equal(
-                  rm.activeRequests(), 0, 'all requests finished');
-              chai.assert.equal(
-                  rm.outstandingRequests(), 0, 'no requests pending');
-              done();
-            });
-      });
-
-      it('queue continues after failures', (done) => {
-        var rm = new MockedRequestManager(1, 0);
-        var r0 = rm.request('1');
-        var r1 = rm.request('2');
-        rm.waitForDispatch(1).then(() => {
-          rm.rejectFakeRequest();
-        });
-
-        r0.then(
-              (success) => done(new Error('r0 should have failed')),
-              (failure) => 'unused_argument')
-            .then(() => rm.resolveFakeRequest());
-
-        // When the first request rejects, it should decrement nActiveRequests
-        // and then launch remaining requests in queue (i.e. this one)
-        r1.then((success) => done(),
-                (failure) => done(new Error(failure)));
-      });
-
-      it('queue is LIFO', (done) => {
-        /* This test is a bit tricky.
-        * We want to verify that the RequestManager queue has LIFO semantics.
-        * So we construct three requests off the bat: A, B, C.
-        * So LIFO semantics ensure these will resolve in order A, C, B.
-        * (Because the A request launches immediately when we create it, it's
-        * not in queue)
-        * Then after resolving A, C moves out of queue, and we create X.
-        * So expected final order is A, C, X, B.
-        * We verify this with an external var that counts how many requests were
-        * resolved.
-        */
-        var rm = new MockedRequestManager(1);
-        var nResolved = 0;
-        function assertResolutionOrder(expectedSpotInSequence) {
-          return function() {
-            nResolved++;
-            chai.assert.equal(expectedSpotInSequence, nResolved);
-          };
-        }
-
-        function launchThirdRequest() {
-          rm.request('started late but goes third')
-              .then(assertResolutionOrder(3))
-              .then(() => rm.dispatchAndResolve());
-        }
-
-        rm.request('first')
-            .then(assertResolutionOrder(
-                1))  // Assert that this one resolved first
-            .then(launchThirdRequest)
-            .then(() => rm.dispatchAndResolve());  // then trigger the next one
-
-        rm.request('this one goes fourth')  // created second, will go last
-            .then(assertResolutionOrder(
-                4))       // assert it was the fourth to get resolved
-            .then(done);  // finish the test
-
-        rm.request('second')
-            .then(assertResolutionOrder(2))
-            .then(() => rm.dispatchAndResolve());
-
-        rm.dispatchAndResolve();
-      });
-
-      it('requestManager can clear queue', (done) => {
-        var rm = new MockedRequestManager(1);
-        var requestsResolved = 0;
-        var requestsRejected = 0;
-        var success = () => requestsResolved++;
-        var failure = (err) => {
-          chai.assert.equal(err.name, 'RequestCancellationError');
-          requestsRejected++;
-        };
-        var finishTheTest = () => {
-          chai.assert.equal(rm.activeRequests(), 0, 'no requests still active');
-          chai.assert.equal(
-              rm.requestsDispatched, 1, 'only one req was ever dispatched');
-          chai.assert.equal(rm.outstandingRequests(), 0, 'no pending requests');
-          chai.assert.equal(requestsResolved, 1, 'one request got resolved');
-          chai.assert.equal(
-              requestsRejected, 4, 'four were cancelled and threw errors');
-          done();
-        };
-        rm.request('0').then(success, failure).then(finishTheTest);
-        rm.request('1').then(success, failure);
-        rm.request('2').then(success, failure);
-        rm.request('3').then(success, failure);
-        rm.request('4').then(success, failure);
-        chai.assert.equal(rm.activeRequests(), 1, 'one req is active');
-        rm.waitForDispatch(1).then(() => {
-          chai.assert.equal(rm.activeRequests(), 1, 'one req is active');
-          chai.assert.equal(rm.requestsDispatched, 1, 'one req was dispatched');
-          chai.assert.equal(
-              rm.outstandingRequests(), 5, 'five reqs outstanding');
-          rm.clearQueue();
-          rm.resolveFakeRequest();
-          // resolving the first request triggers finishTheTest
-        });
-      });
-    });
-  });
-}
diff --git a/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts b/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts
deleted file mode 100644
index 7c59eafb44894cd7ae90cfe74d561628174a8aec..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_backend/urlPathHelpers.ts
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-module TF.Backend {
-  export var BAD_CHARACTERS = '#%&{}\\/<>*? $!\'":@+`|=() ';
-  /** Cleanup a url so that it can be loaded from a filesystem. */
-  export function demoify(s) {
-    // for consistency with python's urllib.urlencode
-    s = s.replace(new RegExp('%20', 'g'), '+');
-    for (var i = 0; i < BAD_CHARACTERS.length; i++) {
-      var c = BAD_CHARACTERS[i];
-      s = s.replace(new RegExp('\\' + c, 'g'), '_');
-    }
-    return s;
-  }
-
-  export function queryEncoder(params?: any): string {
-    // It's important that the keys be sorted, so we always grab the right file
-    // if we are talking to the backend generated by serialze_tensorboard.py
-    if (params == null) {
-      return '';
-    }
-    var components = _.keys(params)
-                         .sort()
-                         .filter((k) => params[k] !== undefined)
-                         .map((k) => k + '=' + encodeURIComponent(params[k]));
-    var result = components.length ? '?' + components.join('&') : '';
-    // Replace parens for consistency with urllib.urlencode
-    return result.replace(/\(/g, '%28').replace(/\)/g, '%29');
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/BUILD b/tensorflow/tensorboard/components/tf_backend_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..179ca27cd26dac0885f66f16713d221338bb9306
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/BUILD
@@ -0,0 +1,58 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_backend_d3v4",
+    srcs = [
+        "bundle.js",
+        "tf-backend.html",
+    ],
+    path = "/tf-backend",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.Backend": [
+        "requestManager.ts",
+        "backend.ts",
+        "behavior.ts",
+        "urlPathHelpers.ts",
+        "router.ts",
+    ]},
+    namespace_symbol_aliases = {"TF.Backend": {
+        "compareTagNames": "VZ.Sorting.compareTagNames",
+    }},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/backend.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/backend.ts
new file mode 100644
index 0000000000000000000000000000000000000000..2e1282394bfebe6e8f80fb53777dda52076d8f1d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/backend.ts
@@ -0,0 +1,491 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+
+import {compareTagNames} from '../vz_sorting_d3v4/sorting';
+
+import {RequestManager} from './requestManager';
+import {Router} from './router';
+
+export interface RunEnumeration {
+  histograms: string[];
+  compressedHistogramTuples: string[];
+  scalars: string[];
+  images: string[];
+  audio: string[];
+  graph: boolean;
+  run_metadata: string[];
+}
+
+export interface LogdirResponse { logdir: string; }
+
+export interface RunsResponse { [runName: string]: RunEnumeration; }
+
+export type RunToTag = {
+  [run: string]: string[];
+};
+
+export interface Datum {
+  wall_time: Date;
+  step: number;
+}
+
+export type ScalarDatum = Datum & Scalar;
+export interface Scalar { scalar: number; }
+
+export interface Text { text: string; }
+export type TextDatum = Datum & Text;
+
+export type HistogramDatum = Datum & Histogram;
+export interface Histogram {
+  min: number;
+  max: number;
+  nItems?: number;
+  sum?: number;
+  sumSquares?: number;
+  bucketRightEdges: number[];
+  bucketCounts: number[];
+}
+
+export interface HistogramBin {
+  x: number;
+  dx: number;
+  y: number;
+}
+export type HistogramSeriesDatum = HistogramSeries & Datum;
+export interface HistogramSeries { bins: HistogramBin[]; }
+
+export type ImageDatum = Datum & Image;
+export interface Image {
+  width: number;
+  height: number;
+  url: string;
+}
+
+export type AudioDatum = Datum & Audio;
+export interface Audio {
+  content_type: string;
+  url: string;
+}
+
+// A health pill encapsulates an overview of tensor element values. The value
+// field is a list of 12 numbers that shed light on the status of the tensor.
+export interface HealthPill {
+  node_name: string;
+  output_slot: number;
+  value: number[];
+}
+
+// When updating this type, keep it consistent with the HealthPill interface
+// in tf_graph_common/lib/scene/scene.ts.
+export type HealthPillDatum = Datum & HealthPill;
+// A health pill response is a mapping from node name to a list of health pill
+// data entries.
+export interface HealthPillsResponse { [key: string]: HealthPillDatum[]; }
+
+
+export const TYPES = [
+  'scalar', 'histogram', 'compressedHistogram', 'graph', 'image', 'audio',
+  'runMetadata', 'text'
+];
+/**
+ * The Backend class provides a convenient and typed interface to the backend.
+ *
+ * It provides methods corresponding to the different data sources on the
+ * TensorBoard backend. These methods return a promise containing the data
+ * from the backend. This class does some post-processing on the data; for
+ * example, converting data elements tuples into js objects so that they can
+ * be accessed in a more convenient and clearly-documented fashion.
+ */
+export class Backend {
+  public router: Router;
+  public requestManager: RequestManager;
+
+  /**
+   * Construct a Backend instance.
+   * @param router the Router with info on what urls to get data from
+   * @param requestManager The RequestManager, overwritable so you may
+   * manually clear request queue, etc. Defaults to a new RequestManager.
+   */
+  constructor(router: Router, requestManager?: RequestManager) {
+    this.router = router;
+    this.requestManager = requestManager || new RequestManager();
+  }
+
+  /**
+   * Returns a promise for requesting the logdir string.
+   */
+  public logdir(): Promise<LogdirResponse> {
+    return this.requestManager.request(this.router.logdir());
+  }
+
+  /**
+   * Returns a listing of all the available data in the TensorBoard backend.
+   */
+  public runs(): Promise<RunsResponse> {
+    return this.requestManager.request(this.router.runs());
+  }
+
+  /**
+   * Return a promise showing the Run-to-Tag mapping for scalar data.
+   */
+  public scalarRuns(): Promise<RunToTag> {
+    return this.runs().then((x) => _.mapValues(x, 'scalars'));
+  }
+
+  /**
+   * Return a promise showing the Run-to-Tag mapping for histogram data.
+   */
+  public histogramRuns(): Promise<RunToTag> {
+    return this.runs().then((x) => _.mapValues(x, 'histograms'));
+  }
+
+  /**
+   * Return a promise showing the Run-to-Tag mapping for image data.
+   */
+  public imageRuns(): Promise<RunToTag> {
+    return this.runs().then((x) => _.mapValues(x, 'images'));
+  }
+
+  /**
+   * Return a promise showing the Run-to-Tag mapping for audio data.
+   */
+  public audioRuns(): Promise<RunToTag> {
+    return this.runs().then((x) => _.mapValues(x, 'audio'));
+  }
+
+  /**
+   * Return a promise showing the Run-to-Tag mapping for compressedHistogram
+   * data.
+   */
+  public compressedHistogramRuns(): Promise<RunToTag> {
+    return this.runs().then((x) => _.mapValues(x, 'compressedHistograms'));
+  }
+
+  /**
+   * Return a promise showing list of runs that contain graphs.
+   */
+  public graphRuns(): Promise<string[]> {
+    return this.runs().then((x) => {
+      return _.keys(x).filter((k) => x[k].graph);
+    });
+  }
+
+  /**
+   * Return a promise showing the Run-to-Tag mapping for run_metadata objects.
+   */
+  public runMetadataRuns(): Promise<RunToTag> {
+    return this.runs().then((x) => _.mapValues(x, 'run_metadata'));
+  }
+
+
+  /**
+   * Returns a promise showing the Run-to-Tag mapping for text data.
+   */
+  public textRuns(): Promise<RunToTag> {
+    return this.requestManager.request(this.router.textRuns());
+  }
+
+
+  /**
+   * Returns a promise containing TextDatums for given run and tag.
+   */
+  public text(tag: string, run: string): Promise<TextDatum[]> {
+    const url = this.router.text(tag, run);
+    // tslint:disable-next-line:no-any it's convenient and harmless here
+    return this.requestManager.request(url).then(map((x: any) => {
+      x.wall_time = timeToDate(x.wall_time);
+      return x;
+    }));
+  }
+
+  /**
+   * Return a promise of a graph string from the backend.
+   */
+  public graph(tag: string, limitAttrSize?: number, largeAttrKeys?: string):
+      Promise<string> {
+    const url = this.router.graph(tag, limitAttrSize, largeAttrKeys);
+    return this.requestManager.request(url);
+  }
+
+  /**
+   * Return a promise containing ScalarDatums for given run and tag.
+   */
+  public scalar(tag: string, run: string): Promise<Array<ScalarDatum>> {
+    let p: Promise<TupleData<number>[]>;
+    const url = this.router.scalars(tag, run);
+    p = this.requestManager.request(url);
+    return p.then(map(detupler(createScalar)));
+  }
+
+  /**
+   * Returns a promise for requesting the health pills for a list of nodes.
+   */
+  public healthPills(nodeNames: string[], step?: number):
+      Promise<HealthPillsResponse> {
+    const postData = {
+      'node_names': JSON.stringify(nodeNames),
+
+      // Events files with debugger data fall under this special run.
+      'run': '__debugger_data__',
+    };
+    if (step !== undefined) {
+      // The user requested health pills for a specific step. This request
+      // might be slow since the backend reads events sequentially from disk.
+      postData['step'] = step;
+    }
+    return this.requestManager.request(this.router.healthPills(), postData);
+  }
+
+  /**
+   * Return a promise containing HistogramDatums for given run and tag.
+   */
+  public histogram(tag: string, run: string):
+      Promise<Array<HistogramSeriesDatum>> {
+    let p: Promise<TupleData<HistogramTuple>[]>;
+    let url = this.router.histograms(tag, run);
+    p = this.requestManager.request(url);
+    return p.then(map(detupler(createHistogram))).then(function(histos) {
+      // Get the minimum and maximum values across all histograms so that the
+      // visualization is aligned for all timesteps.
+      const min = d3.min(histos, d => d.min);
+      const max = d3.max(histos, d => d.max);
+
+      return histos.map(function(histo, i) {
+        return {
+          wall_time: histo.wall_time,
+          step: histo.step,
+          bins: convertBins(histo, min, max)
+        };
+      });
+    });
+  }
+
+  /**
+   * Return a promise containing ImageDatums for given run and tag.
+   */
+  public image(tag: string, run: string): Promise<Array<ImageDatum>> {
+    const url = this.router.images(tag, run);
+    let p: Promise<ImageMetadata[]>;
+    p = this.requestManager.request(url);
+    return p.then(map(this.createImage.bind(this)));
+  }
+
+  /**
+   * Return a promise containing AudioDatums for given run and tag.
+   */
+  public audio(tag: string, run: string): Promise<Array<AudioDatum>> {
+    const url = this.router.audio(tag, run);
+    let p: Promise<AudioMetadata[]>;
+    p = this.requestManager.request(url);
+    return p.then(map(this.createAudio.bind(this)));
+  }
+
+  /**
+   * Returns a promise to load the string RunMetadata for given run/tag.
+   */
+  public runMetadata(tag: string, run: string): Promise<string> {
+    const url = this.router.runMetadata(tag, run);
+    return this.requestManager.request(url);
+  }
+
+  /**
+   * Get compressedHistogram data.
+   * Unlike other methods, don't bother reprocessing this data into a nicer
+   * format. This is because we will deprecate this route.
+   */
+  private compressedHistogram(tag: string, run: string):
+      Promise<Array<Datum&CompressedHistogramTuple>> {
+    const url = this.router.compressedHistograms(tag, run);
+    let p: Promise<TupleData<CompressedHistogramTuple>[]>;
+    p = this.requestManager.request(url);
+    return p.then(map(detupler((x) => x)));
+  }
+
+  private createImage(x: ImageMetadata): Image&Datum {
+    return {
+      width: x.width,
+      height: x.height,
+      wall_time: timeToDate(x.wall_time),
+      step: x.step,
+      url: this.router.individualImage(x.query, x.wall_time),
+    };
+  }
+
+  private createAudio(x: AudioMetadata): Audio&Datum {
+    return {
+      content_type: x.content_type,
+      wall_time: timeToDate(x.wall_time),
+      step: x.step,
+      url: this.router.individualAudio(x.query),
+    };
+  }
+}
+
+/** Given a RunToTag, return sorted array of all runs */
+export function getRuns(r: RunToTag): string[] {
+  return _.keys(r).sort(compareTagNames);
+}
+
+/** Given a RunToTag, return array of all tags (sorted + dedup'd) */
+export function getTags(r: RunToTag): string[] {
+  return _.union.apply(null, _.values(r)).sort(compareTagNames);
+}
+
+/**
+ * Given a RunToTag and an array of runs, return every tag that appears for
+ * at least one run.
+ * Sorted, deduplicated.
+ */
+export function filterTags(r: RunToTag, runs: string[]): string[] {
+  let result = [];
+  runs.forEach((x) => result = result.concat(r[x]));
+  return _.uniq(result).sort(compareTagNames);
+}
+
+function timeToDate(x: number): Date {
+  return new Date(x * 1000);
+};
+
+/**  Just a curryable map to make things cute and tidy. */
+function map<T, U>(f: (x: T) => U): (arr: T[]) => U[] {
+  return function(arr: T[]): U[] {
+    return arr.map(f);
+  };
+};
+
+/**
+ * This is a higher order function that takes a function that transforms a
+ * T into a G, and returns a function that takes TupleData<T>s and converts
+ * them into the intersection of a G and a Datum.
+ */
+function detupler<T, G>(xform: (x: T) => G): (t: TupleData<T>) => Datum & G {
+  return function(x: TupleData<T>): Datum & G {
+    // Create a G, assert it has type <G & Datum>
+    let obj = <G&Datum>xform(x[2]);
+    // ... patch in the properties of datum
+    obj.wall_time = timeToDate(x[0]);
+    obj.step = x[1];
+    return obj;
+  };
+};
+
+function createScalar(x: number): Scalar {
+  return {scalar: x};
+}
+
+function createHistogram(x: HistogramTuple): Histogram {
+  return {
+    min: x[0],
+    max: x[1],
+    nItems: x[2],
+    sum: x[3],
+    sumSquares: x[4],
+    bucketRightEdges: x[5],
+    bucketCounts: x[6],
+  };
+}
+
+/**
+ * Takes histogram data as stored by tensorboard backend and converts it to
+ * the standard d3 histogram data format to make it more compatible and easier
+ * to visualize. When visualizing histograms, having the left edge and width
+ * makes things quite a bit easier. The bins are also converted to have an
+ * uniform width, what makes the visualization easier to understand.
+ *
+ * @param histogram A histogram from tensorboard backend.
+ * @param min The leftmost edge. The binning will start on it.
+ * @param max The rightmost edge. The binning will end on it.
+ * @param numBins The number of bins of the converted data. The default of 30
+ * is a sensible default, using more starts to get artifacts because the event
+ * data is stored in buckets, and you start being able to see the aliased
+ * borders between each bucket.
+ * @return A histogram bin. Each bin has an x (left edge), a dx (width),
+ *     and a y (count).
+ *
+ * If given rightedges are inclusive, then these left edges (x) are exclusive.
+ */
+export function convertBins(
+    histogram: Histogram, min: number, max: number, numBins = 30) {
+  if (histogram.bucketRightEdges.length !== histogram.bucketCounts.length) {
+    throw(new Error('Edges and counts are of different lengths.'));
+  }
+
+  if (max === min) {
+    // Create bins even if all the data has a single value.
+    max = min * 1.1 + 1;
+    min = min / 1.1 - 1;
+  }
+  const binWidth = (max - min) / numBins;
+  let bucketLeft = min;  // Use the min as the starting point for the bins.
+  let bucketPos = 0;
+  return d3.range(min, max, binWidth).map((binLeft) => {
+    const binRight = binLeft + binWidth;
+
+    // Take the count of each existing bucket, multiply it by the proportion
+    // of overlap with the new bin, then sum and store as the count for the
+    // new bin. If no overlap, will add to zero, if 100% overlap, will include
+    // the full count into new bin.
+    let binY = 0;
+    while (bucketPos < histogram.bucketRightEdges.length) {
+      // Clip the right edge because right-most edge can be infinite-sized.
+      const bucketRight = Math.min(max, histogram.bucketRightEdges[bucketPos]);
+
+      const intersect =
+          Math.min(bucketRight, binRight) - Math.max(bucketLeft, binLeft);
+      const count = (intersect / (bucketRight - bucketLeft)) *
+          histogram.bucketCounts[bucketPos];
+
+      binY += intersect > 0 ? count : 0;
+
+      // If bucketRight is bigger than binRight, than this bin is finished and
+      // there is data for the next bin, so don't increment bucketPos.
+      if (bucketRight > binRight) {
+        break;
+      }
+      bucketLeft = Math.max(min, bucketRight);
+      bucketPos++;
+    }
+
+    return {x: binLeft, dx: binWidth, y: binY};
+  });
+}
+
+/**
+ * The following interfaces (TupleData, HistogramTuple,
+ * CompressedHistogramTuple, ImageMetadata, and AudioMetadata) describe how
+ * the data is sent over from the backend.
+ */
+type TupleData<T> = [number, number, T];  // wall_time, step
+
+// Min, Max, nItems, Sum, Sum_Squares, right edges of buckets, nItems in
+// buckets
+type HistogramTuple =
+    [number, number, number, number, number, number[], number[]];
+type CompressedHistogramTuple = [number, number][];  // percentile, value
+interface ImageMetadata {
+  width: number;
+  height: number;
+  wall_time: number;
+  step: number;
+  query: string;
+}
+interface AudioMetadata {
+  content_type: string;
+  wall_time: number;
+  step: number;
+  query: string;
+}
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/behavior.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/behavior.ts
new file mode 100644
index 0000000000000000000000000000000000000000..dc47df2a5c247ba86b707f24632490beea6fe99d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/behavior.ts
@@ -0,0 +1,145 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+import {getRuns, getTags, TYPES} from './backend';
+
+export const BackendBehavior = {
+  properties: {
+    /** *** Required properties *** */
+    /** Data type. One of Backend.TYPES */
+    dataType: {
+      type: String,
+      observer: '_throwErrorOnUnrecognizedType',
+    },
+
+    /** Backend for data loading. */
+    backend: {
+      type: Object,
+    },
+
+    /** Should it automatically load when configured ready? Default true. */
+    autoLoad: {
+      type: Boolean,
+      value: true,
+    },
+
+    /** *** Component-provided properties *** */
+    /** Every tag available for data type (sorted, dedpulicated) */
+    tags: {
+      type: Array,
+      readOnly: true,
+      notify: true,
+    },
+
+    /** Every run available for data type (sorted) */
+    runs: {
+      type: Array,
+      readOnly: true,
+      notify: true,
+    },
+
+    /** Mapping from runs to tags for the data type */
+    run2tag: {
+      type: Object,
+      readOnly: true,
+      notify: true,
+    },
+
+    /** Promise provider for the data. Useful for passing to subcomponents */
+    dataProvider:
+        {type: Function, computed: '_getDataProvider(dataType, backend)'},
+
+    /** Has the dashboard loaded yet? */
+    loadState: {
+      type: String,
+      value: 'noload',  // [noload, pending, loaded, failure]
+      readOnly: true,
+    },
+
+    /**
+     * True if dashboard has loaded, and no tags were found.
+     * Persists through subsequent reloads (ie. still true while
+     * next load is pending) so warning won't flash away every reload
+     * when there is no data.
+     */
+    dataNotFound: {
+      type: Boolean,
+      value: false,
+      readOnly: true,
+    }
+
+  },
+  observers: ['_do_autoLoad(dataType, backend, autoLoad)'],
+  /**
+   * Reloading works in two steps:
+   * Backend reload, which gets metadata on available runs, tags, etc from
+   *   the backend.
+   * Frontend reload, which loads new data for each chart or visual display.
+   * Backend reload logic is provided by this behavior. The frontend reload
+   *   logic should be provided elsewhere, since it is component-specific.
+   * To keep things simple and consistent, we do the backend reload first,
+   *   and the frontend reload afterwards.
+   */
+  reload() {
+    return this.backendReload().then((x) => {
+      return this.frontendReload();
+    });
+  },
+  /**
+   * Load data from backend and then set run2tag, tags, runs, and loadState.
+   * Returns a promise that resolves/rejects when data is loaded.
+   */
+  backendReload() {
+    if (this.dataType == null) {
+      throw new Error('BackendBehavior: Need a dataType to reload.');
+    }
+    if (this.backend == null) {
+      throw new Error('BackendBehavior: Need a backend to reload.');
+    }
+    const runsRoute = this.backend[this.dataType + 'Runs'].bind(this.backend);
+    this._setLoadState('pending');
+    return runsRoute().then(
+        (x) => {
+          this._setLoadState('loaded');
+          if (_.isEqual(x, this.run2tag)) {
+            // If x and run2tag are equal, let's avoid updating everything
+            // since that can needlessly trigger run changes, reloads, etc
+            return x;
+          }
+          this._setRun2tag(x);
+          const tags = getTags(x);
+          this._setDataNotFound(tags.length === 0);
+          this._setTags(tags);
+          this._setRuns(getRuns(x));
+          return x;
+        },
+        (fail) => {
+          this._setLoadState('failure');
+          return fail;
+        });
+  },
+  _do_autoLoad(type, backend, autoLoad) {
+    if (autoLoad) {
+      this.reload();
+    }
+  },
+  _getDataProvider(dataType, backend) {
+    return this.backend[this.dataType].bind(this.backend);
+  },
+  _throwErrorOnUnrecognizedType(dataType) {
+    if (TYPES.indexOf(dataType) === -1) {
+      throw new Error('BackendBehavior: Unknown dataType ' + dataType);
+    }
+  },
+};
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/requestManager.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/requestManager.ts
new file mode 100644
index 0000000000000000000000000000000000000000..c943268cec57213a3b6f240ce3114a0a8bc194b8
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/requestManager.ts
@@ -0,0 +1,177 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+interface ResolveReject {
+  resolve: Function;
+  reject: Function;
+}
+/**
+ * Manages many fetch requests. Launches up to nSimultaneousRequests
+ * simultaneously, and maintains a LIFO queue of requests to process when
+ * more urls are requested than can be handled at once. The queue can be
+ * cleared.
+ *
+ * When a request is made, a Promise is returned which resolves with the
+ * parsed JSON result from the request.
+ */
+export class RequestCancellationError extends Error {
+  public name = 'RequestCancellationError';
+}
+
+export class RequestNetworkError extends Error {
+  public name: string;
+  public req: XMLHttpRequest;
+  public url: string;
+
+  constructor(req: XMLHttpRequest, url) {
+    super();
+    this.message = `RequestNetworkError: ${req.status} at ${url}`;
+    this.name = 'RequestNetworkError';
+    this.req = req;
+    this.url = url;
+  }
+}
+
+export class RequestManager {
+  private _queue: ResolveReject[];
+  private _maxRetries: number;
+  private _nActiveRequests: number;
+  private _nSimultaneousRequests: number;
+
+  constructor(nSimultaneousRequests = 10, maxRetries = 3) {
+    this._queue = [];
+    this._nActiveRequests = 0;
+    this._nSimultaneousRequests = nSimultaneousRequests;
+    this._maxRetries = maxRetries;
+  }
+
+  /**
+   * Gives a promise that loads assets from given url (respects queuing). If
+   * postData is provided, this request will use POST, not GET. This is an
+   * object mapping POST keys to string values.
+   */
+  public request(url: string, postData?: {[key: string]: string}):
+      Promise<any> {
+    const promise =
+        new Promise((resolve, reject) => {
+          const resolver = {resolve: resolve, reject: reject};
+          this._queue.push(resolver);
+          this.launchRequests();
+        })
+            .then(() => {
+              return this.promiseWithRetries(url, this._maxRetries, postData);
+            })
+            .then(
+                (response) => {
+                  // Success - Let's free space for another active
+                  // reqest, and launch it
+                  this._nActiveRequests--;
+                  this.launchRequests();
+                  return response;
+                },
+                (rejection) => {
+                  if (rejection.name === 'RequestNetworkError') {
+                    // If we failed due to network error, we should
+                    // decrement
+                    // _nActiveRequests because this request was
+                    // active
+                    this._nActiveRequests--;
+                    this.launchRequests();
+                  }
+                  return Promise.reject(rejection);
+                });
+    return promise;
+  }
+
+  public clearQueue() {
+    while (this._queue.length > 0) {
+      this._queue.pop().reject(
+          new RequestCancellationError('Request cancelled by clearQueue'));
+    }
+  }
+
+  /* Return number of currently pending requests */
+  public activeRequests(): number {
+    return this._nActiveRequests;
+  }
+
+  /* Return total number of outstanding requests (includes queue) */
+  public outstandingRequests(): number {
+    return this._nActiveRequests + this._queue.length;
+  }
+
+  private launchRequests() {
+    while (this._nActiveRequests < this._nSimultaneousRequests &&
+           this._queue.length > 0) {
+      this._nActiveRequests++;
+      this._queue.pop().resolve();
+    }
+  }
+
+  /**
+   * Try to request a given URL using overwritable _promiseFromUrl method.
+   * If the request fails for any reason, we will retry up to maxRetries
+   * times. In practice, this will help us paper over transient network issues
+   * like '502 Bad Gateway'.
+   * By default, Chrome displays network errors in console, so
+   * the user will be able to tell when the requests are failing. I think this
+   * is a feature, if the request failures and retries are causing any
+   * pain to users, they can see it and file issues.
+   */
+  private promiseWithRetries(
+      url: string, maxRetries: number, postData?: {[key: string]: string}) {
+    var success = (x) => x;
+    var failure = (x) => {
+      if (maxRetries > 0) {
+        return this.promiseWithRetries(url, maxRetries - 1, postData);
+      } else {
+        return Promise.reject(x);
+      }
+    };
+    return this._promiseFromUrl(url, postData).then(success, failure);
+  }
+
+  /* Actually get promise from url using XMLHttpRequest */
+  protected _promiseFromUrl(url: string, postData?: {[key: string]: string}) {
+    return new Promise((resolve, reject) => {
+      let req = new XMLHttpRequest();
+      req.open(postData ? 'POST' : 'GET', url);
+
+      let formData;
+      if (postData) {
+        // We are to make a POST request.
+        formData = new FormData();
+        for (let postKey in postData) {
+          if (postKey) {
+            // The linter requires 'for in' loops to be filtered by an if
+            // condition.
+            formData.append(postKey, postData[postKey]);
+          }
+        }
+      }
+      req.onload = function() {
+        if (req.status === 200) {
+          resolve(JSON.parse(req.responseText));
+        } else {
+          reject(new RequestNetworkError(req, url));
+        }
+      };
+      req.onerror = function() {
+        reject(new RequestNetworkError(req, url));
+      };
+      req.send(formData);
+    });
+  }
+}
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/router.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/router.ts
new file mode 100644
index 0000000000000000000000000000000000000000..319514f3f6b8172e2aea435a2ef156974acb5150
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/router.ts
@@ -0,0 +1,112 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import {demoify, queryEncoder} from './urlPathHelpers'
+
+export type RunTagUrlFn = (tag: string, run: string) => string;
+
+export interface Router {
+  logdir: () => string;
+  runs: () => string;
+  scalars: RunTagUrlFn;
+  histograms: RunTagUrlFn;
+  compressedHistograms: RunTagUrlFn;
+  images: RunTagUrlFn;
+  individualImage: (query: string, wallTime: number) => string;
+  audio: RunTagUrlFn;
+  individualAudio: (query: string) => string;
+  graph:
+      (run: string, limit_attr_size?: number,
+       large_attrs_key?: string) => string;
+  runMetadata: RunTagUrlFn;
+  textRuns: () => string;
+  text: RunTagUrlFn;
+  healthPills: () => string;
+}
+;
+
+/**
+ * The standard router for communicating with the TensorBoard backend
+ * @param dataDir {string} The base prefix for finding data on server.
+ * @param demoMode {boolean} Whether to modify urls for filesystem demo usage.
+ */
+export function router(dataDir = 'data', demoMode = false): Router {
+  var clean = demoMode ? demoify : (x) => x;
+  if (dataDir[dataDir.length - 1] === '/') {
+    dataDir = dataDir.slice(0, dataDir.length - 1);
+  }
+  function standardRoute(route: string, demoExtension = '.json'):
+      ((tag: string, run: string) => string) {
+    return function(tag: string, run: string): string {
+      var url =
+          dataDir + '/' + route + clean(queryEncoder({tag: tag, run: run}));
+      if (demoMode) {
+        url += demoExtension;
+      }
+      return url;
+    };
+  }
+  function individualImageUrl(query: string, wallTime: number) {
+    var url = dataDir + '/' + clean('individualImage?' + query);
+    // Include wall_time just to disambiguate the URL and force the browser
+    // to reload the image when the URL changes. The backend doesn't care
+    // about the value.
+    url += demoMode ? '.png' : '&ts=' + wallTime;
+    return url;
+  }
+  function individualAudioUrl(query: string) {
+    var url = dataDir + '/' + clean('individualAudio?' + query);
+    if (demoMode) {
+      url += '.wav';
+    }
+    return url;
+  }
+  function graphUrl(
+      run: string, limit_attr_size?: number, large_attrs_key?: string) {
+    let query_params = [['run', clean(run)]];
+    if (limit_attr_size != null && !demoMode) {
+      query_params.push(['limit_attr_size', String(limit_attr_size)]);
+    }
+    if (large_attrs_key != null && !demoMode) {
+      query_params.push(['large_attrs_key', large_attrs_key]);
+    }
+    let query = query_params
+                    .map(param => {
+                      return param[0] + '=' + encodeURIComponent(param[1]);
+                    })
+                    .join('&');
+    var url = dataDir + '/graph' + clean('?' + query);
+    if (demoMode) {
+      url += '.pbtxt';
+    }
+    return url;
+  }
+  return {
+    logdir: () => dataDir + '/logdir',
+    runs: () => dataDir + '/runs' + (demoMode ? '.json' : ''),
+    individualImage: individualImageUrl,
+    individualAudio: individualAudioUrl,
+    graph: graphUrl,
+    scalars: standardRoute('scalars'),
+    histograms: standardRoute('histograms'),
+    compressedHistograms: standardRoute('compressedHistograms'),
+    images: standardRoute('images'),
+    audio: standardRoute('audio'),
+    runMetadata: standardRoute('run_metadata', '.pbtxt'),
+    healthPills: () => dataDir + '/plugin/debugger/health_pills',
+    textRuns: () => dataDir + '/plugin/text/runs' + (demoMode ? '.json' : ''),
+    text: standardRoute('plugin/text/text'),
+  };
+};
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/test/BUILD b/tensorflow/tensorboard/components/tf_backend_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..00b2be45eb4026ae54cc777dc43f3c1dea4c5361
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/BUILD
@@ -0,0 +1,56 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ] + glob(["data/**"]),
+    path = "/tf-backend/test",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4:bundle.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.Backend": [
+        "backendTests.ts",
+        "behaviorTests.ts",
+        "requestManagerTests.ts",
+    ]},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/test/backendTests.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/test/backendTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..648d175621ea337a559b7e22a0595086d9b6b50c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/backendTests.ts
@@ -0,0 +1,299 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+import {Backend, convertBins, filterTags, getRuns, getTags, RunToTag, TYPES} from '../backend';
+import {RequestManager} from '../requestManager';
+import {Router, router} from '../router';
+import {BAD_CHARACTERS, demoify, queryEncoder} from '../urlPathHelpers';
+
+describe('urlPathHelpers', () => {
+  it('demoify works as expected', () => {
+    const demoified = demoify(BAD_CHARACTERS);
+    let allClean = '';
+    for (let i = 0; i < BAD_CHARACTERS.length; i++) {
+      allClean += '_';
+    }
+    chai.assert.equal(demoified, allClean, 'cleaning the BAD_CHARACTERS works');
+    chai.assert.equal(demoify('foozod'), 'foozod', 'doesnt change safe string');
+    chai.assert.equal(demoify('foo zod (2)'), 'foo_zod__2_', 'simple case');
+  });
+
+  it('queryEncoder works with demoify on spaces and parens', () => {
+    const params = {foo: 'something with spaces and (parens)'};
+    const actual = demoify(queryEncoder(params));
+    const expected = '_foo_something_with_spaces_and__28parens_29';
+    chai.assert.equal(actual, expected);
+  });
+});
+
+function assertIsDatum(x) {
+  chai.assert.isNumber(x.step);
+  chai.assert.instanceOf(x.wall_time, Date);
+}
+
+describe('backend tests', () => {
+  let backend: Backend;
+  let rm: RequestManager;
+  const base = 'data';
+  const demoRouter = router(base, true);
+  beforeEach(() => {
+    // Construct a demo Backend (third param is true)
+    backend = new Backend(demoRouter);
+    rm = new RequestManager();
+  });
+
+  it('runs are loaded properly', (done) => {
+    const runsResponse = backend.runs();
+    const actualRuns = rm.request(demoRouter.runs());
+    Promise.all([runsResponse, actualRuns]).then((values) => {
+      chai.assert.deepEqual(values[0], values[1]);
+      done();
+    });
+  });
+
+  it('scalars are loaded properly', (done) => {
+    backend.scalar('cross_entropy (1)', 'run1').then((s) => {
+      // just check the data got reformatted properly
+      const aScalar = s[s.length - 1];
+      assertIsDatum(aScalar);
+      chai.assert.isNumber(aScalar.scalar);
+      // verify date conversion works
+      chai.assert.equal(aScalar.wall_time.valueOf(), 40000);
+      done();
+    });
+  });
+
+  it('histograms are loaded properly', (done) => {
+    backend.histogram('histo1', 'run1').then((histos) => {
+      const histo = histos[0];
+      assertIsDatum(histo);
+      chai.assert.instanceOf(histo.bins, Array);
+      done();
+    });
+  });
+
+  it('all registered types have handlers', () => {
+    TYPES.forEach((t: string) => {
+      chai.assert.isDefined(backend[t], t);
+      chai.assert.isDefined(backend[t + 'Runs'], t + 'Runs');
+    });
+  });
+
+  it('images are loaded properly', (done) => {
+    backend.image('im1', 'run1').then((images) => {
+      const image = images[0];
+      assertIsDatum(image);
+      chai.assert.isNumber(image.width);
+      chai.assert.isNumber(image.height);
+      const nonDemoQuery = 'index=0&tag=im1&run=run1';
+      const expectedUrl = demoRouter.individualImage(nonDemoQuery, 10.0);
+      chai.assert.equal(image.url, expectedUrl);
+      done();
+    });
+  });
+
+  it('audio is loaded properly', (done) => {
+    backend.audio('audio1', 'run1').then((audioClips) => {
+      const audio = audioClips[0];
+      assertIsDatum(audio);
+      chai.assert.equal(audio.content_type, 'audio/wav');
+      const nonDemoQuery = 'index=0&tag=audio1&run=run1';
+      const expectedUrl = demoRouter.individualAudio(nonDemoQuery);
+      chai.assert.equal(audio.url, expectedUrl);
+      done();
+    });
+  });
+
+  it('trailing slash removed from base route', () => {
+    const r = router('foo/');
+    chai.assert.equal(r.runs(), 'foo/runs');
+  });
+
+  it('run helper methods work', (done) => {
+    const scalar = {run1: ['cross_entropy (1)'], fake_run_no_data: ['scalar2']};
+    const image = {run1: ['im1'], fake_run_no_data: ['im1', 'im2']};
+    const audio = {run1: ['audio1'], fake_run_no_data: ['audio1', 'audio2']};
+    const runMetadata = {run1: ['step99'], fake_run_no_data: ['step99']};
+    const graph = ['fake_run_no_data'];
+    let count = 0;
+    function next() {
+      count++;
+      if (count === 4) {
+        done();
+      }
+    }
+    backend.scalarRuns().then((x) => {
+      chai.assert.deepEqual(x, scalar);
+      next();
+    });
+    backend.imageRuns().then((x) => {
+      chai.assert.deepEqual(x, image);
+      next();
+    });
+    backend.audioRuns().then((x) => {
+      chai.assert.deepEqual(x, audio);
+      next();
+    });
+    backend.runMetadataRuns().then((x) => {
+      chai.assert.deepEqual(x, runMetadata);
+      next();
+    });
+    backend.graphRuns().then((x) => {
+      chai.assert.deepEqual(x, graph);
+      next();
+    });
+  });
+
+  it('runToTag helpers work', () => {
+    const r2t: RunToTag = {
+      run1: ['foo', 'bar', 'zod'],
+      run2: ['zod', 'zoink'],
+      a: ['foo', 'zod']
+    };
+    const empty1: RunToTag = {};
+    const empty2: RunToTag = {run1: [], run2: []};
+    chai.assert.deepEqual(getRuns(r2t), ['a', 'run1', 'run2']);
+    chai.assert.deepEqual(getTags(r2t), ['bar', 'foo', 'zod', 'zoink']);
+    chai.assert.deepEqual(filterTags(r2t, ['run1', 'run2']), getTags(r2t));
+    chai.assert.deepEqual(filterTags(r2t, ['run1']), ['bar', 'foo', 'zod']);
+    chai.assert.deepEqual(
+        filterTags(r2t, ['run2', 'a']), ['foo', 'zod', 'zoink']);
+
+    chai.assert.deepEqual(getRuns(empty1), []);
+    chai.assert.deepEqual(getTags(empty1), []);
+
+    chai.assert.deepEqual(getRuns(empty2), ['run1', 'run2']);
+    chai.assert.deepEqual(getTags(empty2), []);
+  });
+});
+
+describe('Verify that the histogram format conversion works.', () => {
+
+  function assertHistogramEquality(h1, h2) {
+    h1.forEach((b1, i) => {
+      const b2 = h2[i];
+      chai.assert.closeTo(b1.x, b2.x, 1e-10);
+      chai.assert.closeTo(b1.dx, b2.dx, 1e-10);
+      chai.assert.closeTo(b1.y, b2.y, 1e-10);
+    });
+  }
+
+  it('Throws and error if the inputs are of different lengths', () => {
+    chai.assert.throws(() => {
+      convertBins(
+          {bucketRightEdges: [0], bucketCounts: [1, 2], min: 1, max: 2}, 1, 2,
+          2);
+    }, 'Edges and counts are of different lengths.');
+  });
+
+  it('Handles data with no bins', () => {
+    chai.assert.deepEqual(
+        convertBins(
+            {bucketRightEdges: [], bucketCounts: [], min: 0, max: 0}, 0, 0, 0),
+        []);
+  });
+
+  it('Handles data with one bin', () => {
+    const counts = [1];
+    const rightEdges = [1.21e-12];
+    const histogram = [{x: 1.1e-12, dx: 1.21e-12 - 1.1e-12, y: 1}];
+    const newHistogram = convertBins(
+        {
+          bucketRightEdges: rightEdges,
+          bucketCounts: counts,
+          min: 1.1e-12,
+          max: 1.21e-12
+        },
+        1.1e-12, 1.21e-12, 1);
+    assertHistogramEquality(newHistogram, histogram);
+  });
+
+  it('Handles data with two bins.', () => {
+    const counts = [1, 2];
+    const rightEdges = [1.1e-12, 1.21e-12];
+    const histogram = [
+      {x: 1.0e-12, dx: 1.05e-13, y: 1.09090909090909},
+      {x: 1.105e-12, dx: 1.05e-13, y: 1.9090909090909}
+    ];
+    const newHistogram = convertBins(
+        {
+          bucketRightEdges: rightEdges,
+          bucketCounts: counts,
+          min: 1.0e-12,
+          max: 1.21e-12
+        },
+        1.0e-12, 1.21e-12, 2);
+    assertHistogramEquality(newHistogram, histogram);
+  });
+
+  it('Handles a domain that crosses zero, but doesn\'t include zero as ' +
+         'an edge.',
+     () => {
+       const counts = [1, 2];
+       const rightEdges = [-1.0e-12, 1.0e-12];
+       const histogram = [
+         {x: -1.1e-12, dx: 1.05e-12, y: 1.95},
+         {x: -0.5e-13, dx: 1.05e-12, y: 1.05}
+       ];
+       const newHistogram = convertBins(
+           {
+             bucketRightEdges: rightEdges,
+             bucketCounts: counts,
+             min: -1.1e-12,
+             max: 1.0e-12
+           },
+           -1.1e-12, 1.0e-12, 2);
+       assertHistogramEquality(newHistogram, histogram);
+     });
+
+  it('Handles a histogram of all zeros', () => {
+    const h = {
+      min: 0,
+      max: 0,
+      nItems: 51200,
+      sum: 0,
+      sumSquares: 0,
+      bucketRightEdges: [0, 1e-12, 1.7976931348623157e+308],
+      bucketCounts: [0, 51200, 0],
+      wall_time: '2017-01-25T02:30:11.257Z',
+      step: 0
+    };
+    const newHistogram = convertBins(h, 0, 0, 5);
+    const expectedHistogram = [
+      {x: -1, dx: 0.4, y: 0}, {x: -0.6, dx: 0.4, y: 0},
+      {x: -0.2, dx: 0.4, y: 51200}, {x: 0.2, dx: 0.4, y: 0},
+      {x: 0.6, dx: 0.4, y: 0}
+    ];
+    assertHistogramEquality(newHistogram, expectedHistogram);
+  });
+
+  it('Handles a right-most right edge that extends to very large number.',
+     () => {
+       const counts = [1, 2, 3];
+       const rightEdges = [0, 1.0e-12, 1.0e14];
+       const histogram = [
+         {x: -1.0e-12, dx: 0.7e-12, y: 0.7}, {x: -0.3e-12, dx: 0.7e-12, y: 1.1},
+         {x: 0.4e-12, dx: 0.7e-12, y: 4.2}
+       ];
+       const newHistogram = convertBins(
+           {
+             bucketRightEdges: rightEdges,
+             bucketCounts: counts,
+             min: -1.0e-12,
+             max: 1.1e-12
+           },
+           -1.0e-12, 1.1e-12, 3);
+       assertHistogramEquality(newHistogram, histogram);
+     });
+});
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/test/behaviorTests.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/test/behaviorTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..4a74fe01c1b57deb911df878e9cba96d91ac0283
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/behaviorTests.ts
@@ -0,0 +1,165 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import {Backend, getRuns, getTags, RunToTag} from '../backend'
+import {BackendBehavior} from '../behavior'
+
+declare function fixture(id: string): void;
+
+window.addEventListener('WebComponentsReady', function() {
+  Polymer({
+    is: 'test-element',
+    behaviors: [BackendBehavior],
+    frontendReload: function() {
+      // no-op
+    },
+  });
+});
+
+describe('data-behavior', function() {
+  var testElement;
+  var resolve;
+  var reject;
+  var fakeBackend = {
+    scalarRuns: function() {
+      return new Promise(function(_resolve, _reject) {
+        resolve = (x) => _resolve(x);
+        reject = (x) => _reject(x);
+      });
+    },
+    scalar: function(x) {
+      return this;
+    },
+  };
+  beforeEach(function() {
+    testElement = fixture('testElementFixture');
+    testElement.autoLoad = false;
+    testElement.backend = fakeBackend;
+    testElement.dataType = 'scalar';
+  });
+
+  it('load states work as expected', function(done) {
+    chai.assert.equal(testElement.loadState, 'noload');
+    var reloaded = testElement.reload();
+    chai.assert.equal(testElement.loadState, 'pending');
+    resolve();
+    reloaded
+        .then(function() {
+          chai.assert.equal(testElement.loadState, 'loaded');
+          var reloaded2 = testElement.reload();
+          chai.assert.equal(testElement.loadState, 'pending');
+          reject();
+          return reloaded2;
+        })
+        .then(function() {
+          chai.assert.equal(testElement.loadState, 'failure');
+          done();
+        });
+  });
+
+  it('data provider set appropriately', function() {
+    chai.assert.deepEqual(testElement.dataProvider(), testElement.backend);
+  });
+
+  it('loads data as expected', function(done) {
+    var r2t: RunToTag = {
+      run1: ['foo', 'bar', 'zod'],
+      run2: ['zoink', 'zow'],
+      run3: ['.'],
+    };
+    var tags = getTags(r2t);
+    var runs = getRuns(r2t);
+    testElement.backend = fakeBackend;
+    testElement.dataType = 'scalar';
+    testElement.reload().then(function(x) {
+      chai.assert.deepEqual(testElement.run2tag, r2t);
+      chai.assert.deepEqual(testElement.runs, runs);
+      chai.assert.deepEqual(testElement.tags, tags);
+      done();
+    });
+    resolve(r2t);
+  });
+
+  it('errors thrown on bad data types', function() {
+    testElement.backend = undefined;
+    chai.assert.throws(function() {
+      testElement.dataType = 'foo';
+    });
+    testElement.dataType = 'scalar';
+    testElement.dataType = 'graph';
+    testElement.dataType = 'histogram';
+  });
+
+  it('dataNotFound flag works', function(done) {
+    chai.assert.isFalse(testElement.dataNotFound, 'initially false');
+    var next = testElement.reload();
+    chai.assert.isFalse(testElement.dataNotFound, 'still false while pending');
+    resolve({foo: [], bar: []});
+    next.then(() => {
+      chai.assert.isTrue(testElement.dataNotFound, 'true on empty data');
+      var last = testElement.reload();
+      chai.assert.isTrue(testElement.dataNotFound, 'still true while pending');
+      resolve({foo: ['bar'], bar: ['zod']});
+      last.then(() => {
+        chai.assert.isFalse(
+            testElement.dataNotFound, 'false now that we have data');
+        done();
+      });
+    });
+  });
+
+  it('reloads as soon as setup, if autoReload is true', function(done) {
+    var r2t = {foo: [], bar: []};
+    var fakeBackend = {
+      scalarRuns: () => Promise.resolve(r2t),
+      scalar: () => null,
+    };
+    testElement = fixture('testElementFixture');
+    testElement.dataType = 'scalar';
+    testElement.backend = fakeBackend;
+    setTimeout(() => {
+      chai.assert.equal(testElement.run2tag, r2t);
+      done();
+    });
+  });
+
+  it('doesn\'t mutate props if backend returns same data', function(done) {
+    var r2t_1 = {foo: ['1', '2'], bar: ['3', '4']};
+    var r2t_2 = {foo: ['1', '2'], bar: ['3', '4']};
+    var fakeBackend = {
+      scalarRuns: () => Promise.resolve(r2t_1),
+      scalar: () => null,
+    };
+    testElement.backend = fakeBackend;
+    testElement.reload().then(() => {
+      fakeBackend.scalarRuns = () => Promise.resolve(r2t_2);
+      var tags = testElement.tags;
+      testElement.reload().then(() => {
+        // shallow equality ensures it wasn't recomputed
+        chai.assert.equal(tags, testElement.tags, 'tags was not recomputed');
+        done();
+      });
+    });
+  });
+
+  // TODO(dandelion): Fix this test.
+  it('reload calls frontendReload', function(done) {
+    testElement.frontendReload = function() {
+      done();
+    };
+    testElement.reload();
+  });
+
+});
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/audio_run_run1_tag_audio1.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/audio_run_run1_tag_audio1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/audio_run_run1_tag_audio1.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/audio_run_run1_tag_audio1.json
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/compressedHistograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/compressedHistograms_run_run1_tag_histo1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/compressedHistograms_run_run1_tag_histo1.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/compressedHistograms_run_run1_tag_histo1.json
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/example.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/example.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/example.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/example.json
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/histograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/histograms_run_run1_tag_histo1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/histograms_run_run1_tag_histo1.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/histograms_run_run1_tag_histo1.json
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/images_run_run1_tag_im1.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/images_run_run1_tag_im1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/images_run_run1_tag_im1.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/images_run_run1_tag_im1.json
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/individualImage_index_0_tag_im1_run_run1.png b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/individualImage_index_0_tag_im1_run_run1.png
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/individualImage_index_0_tag_im1_run_run1.png
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/individualImage_index_0_tag_im1_run_run1.png
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/run_metadata_run_step99_tag_train.pbtxt b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/run_metadata_run_step99_tag_train.pbtxt
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/run_metadata_run_step99_tag_train.pbtxt
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/run_metadata_run_step99_tag_train.pbtxt
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/runs.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/runs.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/runs.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/runs.json
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/scalars.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/scalars.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/scalars.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/scalars.json
diff --git a/tensorflow/tensorboard/components/tf_backend/test/data/scalars_run_run1_tag_cross_entropy__281_29.json b/tensorflow/tensorboard/components/tf_backend_d3v4/test/data/scalars_run_run1_tag_cross_entropy__281_29.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_backend/test/data/scalars_run_run1_tag_cross_entropy__281_29.json
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/data/scalars_run_run1_tag_cross_entropy__281_29.json
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/test/requestManagerTests.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/test/requestManagerTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..23a4e8f6111b115875ec6d38a69d1f454acff7d3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/requestManagerTests.ts
@@ -0,0 +1,294 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import {RequestManager, RequestNetworkError} from '../requestManager';
+
+interface MockRequest {
+  resolve: Function;
+  reject: Function;
+  id: number;
+  url: string;
+}
+
+class MockedRequestManager extends RequestManager {
+  private resolvers: Function[];
+  private rejectors: Function[];
+  public requestsDispatched: number;
+  constructor(maxRequests = 10, maxRetries = 3) {
+    super(maxRequests, maxRetries);
+    this.resolvers = [];
+    this.rejectors = [];
+    this.requestsDispatched = 0;
+  }
+  protected _promiseFromUrl(url) {
+    return new Promise((resolve, reject) => {
+      const mockJSON = {
+        ok: true,
+        json() {
+          return url;
+        },
+        url,
+        status: 200,
+      };
+      const mockFailedRequest: any = {
+        ok: false,
+        url,
+        status: 502,
+      };
+      const mockFailure = new RequestNetworkError(mockFailedRequest, url);
+      this.resolvers.push(() => {
+        resolve(mockJSON);
+      });
+      this.rejectors.push(() => {
+        reject(mockFailure);
+      });
+      this.requestsDispatched++;
+    });
+  }
+  public resolveFakeRequest() {
+    this.resolvers.pop()();
+  }
+  public rejectFakeRequest() {
+    this.rejectors.pop()();
+  }
+  public dispatchAndResolve() {
+    // Wait for at least one request to be dispatched, then resolve it.
+    this.waitForDispatch(1).then(() => this.resolveFakeRequest());
+  }
+  public waitForDispatch(num) {
+    return waitForCondition(() => {
+      return this.requestsDispatched >= num;
+    });
+  }
+}
+
+/** Create a promise that returns when *check* returns true.
+ * May cause a test timeout if check never becomes true.
+ */
+
+function waitForCondition(check: () => boolean): Promise<any> {
+  return new Promise((resolve, reject) => {
+    const go = () => {
+      if (check()) {
+        resolve();
+      }
+      setTimeout(go, 2);
+    };
+    go();
+  });
+}
+
+describe('backend', () => {
+  describe('request manager', () => {
+    it('request loads JSON properly', (done) => {
+      const rm = new RequestManager();
+      const promise = rm.request('data/example.json');
+      promise.then(
+          (response) => {
+            chai.assert.deepEqual(response, {foo: 3, bar: 'zoidberg'});
+            done();
+          },
+          (reject) => {
+            throw new Error(reject);
+          });
+    });
+
+    it('rejects on bad url', (done) => {
+      const rm = new RequestManager(5, 0);
+      const badUrl = '_bad_url_which_doesnt_exist.json';
+      const promise = rm.request(badUrl);
+      promise.then(
+          (success) => {
+            done(new Error('the promise should have rejected'));
+          },
+          (reject: RequestNetworkError) => {
+            chai.assert.include(reject.message, '404');
+            chai.assert.include(reject.message, badUrl);
+            chai.assert.equal(reject.req.status, 404);
+            done();
+          });
+    });
+
+    it('can retry if requests fail', (done) => {
+      const rm = new MockedRequestManager(3, 5);
+      const r = rm.request('foo');
+      rm.waitForDispatch(1)
+          .then(() => {
+            rm.rejectFakeRequest();
+            return rm.waitForDispatch(2);
+          })
+          .then(() => rm.resolveFakeRequest());
+      r.then((success) => done());
+    });
+
+    it('retries at most maxRetries times', (done) => {
+      const MAX_RETRIES = 2;
+      const rm = new MockedRequestManager(3, MAX_RETRIES);
+      const r = rm.request('foo');
+      rm.waitForDispatch(1)
+          .then(() => {
+            rm.rejectFakeRequest();
+            return rm.waitForDispatch(2);
+          })
+          .then(() => {
+            rm.rejectFakeRequest();
+            return rm.waitForDispatch(3);
+          })
+          .then(() => {
+            rm.rejectFakeRequest();
+          });
+
+      r.then(
+          (success) => done(new Error('The reqest should have failed')),
+          (failure) => done());
+    });
+
+    it('requestManager only sends maxRequests requests at a time', (done) => {
+      const rm = new MockedRequestManager(3);
+      const r0 = rm.request('1');
+      const r1 = rm.request('2');
+      const r2 = rm.request('3');
+      const r3 = rm.request('4');
+      chai.assert.equal(rm.activeRequests(), 3, 'three requests are active');
+      chai.assert.equal(
+          rm.outstandingRequests(), 4, 'four requests are pending');
+      rm.waitForDispatch(3)
+          .then(() => {
+            chai.assert.equal(
+                rm.activeRequests(), 3, 'three requests are still active (1)');
+            chai.assert.equal(
+                rm.requestsDispatched, 3, 'three requests were dispatched');
+            rm.resolveFakeRequest();
+            return rm.waitForDispatch(4);
+          })
+          .then(() => {
+            chai.assert.equal(
+                rm.activeRequests(), 3, 'three requests are still active (2)');
+            chai.assert.equal(
+                rm.requestsDispatched, 4, 'four requests were dispatched');
+            chai.assert.equal(
+                rm.outstandingRequests(), 3, 'three requests are pending');
+            rm.resolveFakeRequest();
+            rm.resolveFakeRequest();
+            rm.resolveFakeRequest();
+            return r3;
+          })
+          .then(() => {
+            chai.assert.equal(rm.activeRequests(), 0, 'all requests finished');
+            chai.assert.equal(
+                rm.outstandingRequests(), 0, 'no requests pending');
+            done();
+          });
+    });
+
+    it('queue continues after failures', (done) => {
+      const rm = new MockedRequestManager(1, 0);
+      const r0 = rm.request('1');
+      const r1 = rm.request('2');
+      rm.waitForDispatch(1).then(() => {
+        rm.rejectFakeRequest();
+      });
+
+      r0.then(
+            (success) => done(new Error('r0 should have failed')),
+            (failure) => 'unused_argument')
+          .then(() => rm.resolveFakeRequest());
+
+      // When the first request rejects, it should decrement nActiveRequests
+      // and then launch remaining requests in queue (i.e. this one)
+      r1.then((success) => done(), (failure) => done(new Error(failure)));
+    });
+
+    it('queue is LIFO', (done) => {
+      /* This test is a bit tricky.
+       * We want to verify that the RequestManager queue has LIFO semantics.
+       * So we construct three requests off the bat: A, B, C.
+       * So LIFO semantics ensure these will resolve in order A, C, B.
+       * (Because the A request launches immediately when we create it, it's
+       * not in queue)
+       * Then after resolving A, C moves out of queue, and we create X.
+       * So expected final order is A, C, X, B.
+       * We verify this with an external var that counts how many requests were
+       * resolved.
+       */
+      const rm = new MockedRequestManager(1);
+      let nResolved = 0;
+      function assertResolutionOrder(expectedSpotInSequence) {
+        return () => {
+          nResolved++;
+          chai.assert.equal(expectedSpotInSequence, nResolved);
+        };
+      }
+
+      function launchThirdRequest() {
+        rm.request('started late but goes third')
+            .then(assertResolutionOrder(3))
+            .then(() => rm.dispatchAndResolve());
+      }
+
+      rm.request('first')
+          .then(
+              assertResolutionOrder(1))  // Assert that this one resolved first
+          .then(launchThirdRequest)
+          .then(() => rm.dispatchAndResolve());  // then trigger the next one
+
+      rm.request('this one goes fourth')  // created second, will go last
+          .then(assertResolutionOrder(
+              4))       // assert it was the fourth to get resolved
+          .then(done);  // finish the test
+
+      rm.request('second')
+          .then(assertResolutionOrder(2))
+          .then(() => rm.dispatchAndResolve());
+
+      rm.dispatchAndResolve();
+    });
+
+    it('requestManager can clear queue', (done) => {
+      const rm = new MockedRequestManager(1);
+      let requestsResolved = 0;
+      let requestsRejected = 0;
+      const success = () => requestsResolved++;
+      const failure = (err) => {
+        chai.assert.equal(err.name, 'RequestCancellationError');
+        requestsRejected++;
+      };
+      const finishTheTest = () => {
+        chai.assert.equal(rm.activeRequests(), 0, 'no requests still active');
+        chai.assert.equal(
+            rm.requestsDispatched, 1, 'only one req was ever dispatched');
+        chai.assert.equal(rm.outstandingRequests(), 0, 'no pending requests');
+        chai.assert.equal(requestsResolved, 1, 'one request got resolved');
+        chai.assert.equal(
+            requestsRejected, 4, 'four were cancelled and threw errors');
+        done();
+      };
+      rm.request('0').then(success, failure).then(finishTheTest);
+      rm.request('1').then(success, failure);
+      rm.request('2').then(success, failure);
+      rm.request('3').then(success, failure);
+      rm.request('4').then(success, failure);
+      chai.assert.equal(rm.activeRequests(), 1, 'one req is active');
+      rm.waitForDispatch(1).then(() => {
+        chai.assert.equal(rm.activeRequests(), 1, 'one req is active');
+        chai.assert.equal(rm.requestsDispatched, 1, 'one req was dispatched');
+        chai.assert.equal(rm.outstandingRequests(), 5, 'five reqs outstanding');
+        rm.clearQueue();
+        rm.resolveFakeRequest();
+        // resolving the first request triggers finishTheTest
+      });
+    });
+  });
+});
diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/index.html b/tensorflow/tensorboard/components/tf_backend_d3v4/test/tests.html
similarity index 75%
rename from tensorflow/tensorboard/components/tf_color_scale/test/index.html
rename to tensorflow/tensorboard/components/tf_backend_d3v4/test/tests.html
index 9a2a174349c1160ecc19913b560cc7e2ba00a47b..cdc17c2607e9b31ea530d49c75bd24e316b568ad 100644
--- a/tensorflow/tensorboard/components/tf_color_scale/test/index.html
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/test/tests.html
@@ -19,13 +19,17 @@ limitations under the License.
 <html>
 <head>
   <meta charset="utf-8">
-  <script src="../../web-component-tester/browser.js"></script>
   <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <link rel="import" href="../../tf-imports/d3.html">
+  <script src="../../web-component-tester/browser.js"></script>
+  <link rel="import" href="../../polymer/polymer.html">
+  <link rel="import" href="../tf-backend.html">
 </head>
 <body>
-    <script src="../colorScale.js"></script>
-    <script src="../palettes.js"></script>
-    <script src="colorScaleTests.js"></script>
+  <test-fixture id="testElementFixture">
+    <template>
+      <test-element id="test"></test-element>
+    </template>
+  </test-fixture>
+  <script src="bundle.js"></script>
 </body>
 </html>
diff --git a/tensorflow/tensorboard/components/tf_backend/tf-backend.html b/tensorflow/tensorboard/components/tf_backend_d3v4/tf-backend.html
similarity index 81%
rename from tensorflow/tensorboard/components/tf_backend/tf-backend.html
rename to tensorflow/tensorboard/components/tf_backend_d3v4/tf-backend.html
index 0e07c7fdb1ed3db2d24314f2db0f558cc3e23512..5bf266336285719965a7456fc6f894d62820c940 100644
--- a/tensorflow/tensorboard/components/tf_backend/tf-backend.html
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/tf-backend.html
@@ -20,8 +20,4 @@ limitations under the License.
 <link rel="import" href="../tf-imports/d3.html">
 <link rel="import" href="../vz-sorting/vz-sorting.html">
 
-<script src="requestManager.js"></script>
-<script src="urlPathHelpers.js"></script>
-<script src="router.js"></script>
-<script src="backend.js"></script>
-<script src="behavior.js"></script>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_backend_d3v4/urlPathHelpers.ts b/tensorflow/tensorboard/components/tf_backend_d3v4/urlPathHelpers.ts
new file mode 100644
index 0000000000000000000000000000000000000000..62519dac5ca73b4b62880319dc81c80a188b337e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_backend_d3v4/urlPathHelpers.ts
@@ -0,0 +1,40 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+export const BAD_CHARACTERS = '#%&{}\\/<>*? $!\'":@+`|=() ';
+/** Cleanup a url so that it can be loaded from a filesystem. */
+export function demoify(s) {
+  // for consistency with python's urllib.urlencode
+  s = s.replace(new RegExp('%20', 'g'), '+');
+  for (let i = 0; i < BAD_CHARACTERS.length; i++) {
+    const c = BAD_CHARACTERS[i];
+    s = s.replace(new RegExp('\\' + c, 'g'), '_');
+  }
+  return s;
+}
+
+export function queryEncoder(params?: any): string {
+  // It's important that the keys be sorted, so we always grab the right file
+  // if we are talking to the backend generated by serialze_tensorboard.py
+  if (params == null) {
+    return '';
+  }
+  const components = _.keys(params)
+                       .sort()
+                       .filter((k) => params[k] !== undefined)
+                       .map((k) => k + '=' + encodeURIComponent(params[k]));
+  const result = components.length ? '?' + components.join('&') : '';
+  // Replace parens for consistency with urllib.urlencode
+  return result.replace(/\(/g, '%28').replace(/\)/g, '%29');
+}
diff --git a/tensorflow/tensorboard/components/tf_color_scale/BUILD b/tensorflow/tensorboard/components/tf_color_scale/BUILD
deleted file mode 100644
index 75bf812fe549bb3b471c5adfedb0bbc366d17cd9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/BUILD
+++ /dev/null
@@ -1,63 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-# TODO(dandelion): Add webfiles support for the test code.
-
-webfiles(
-    name = "tf_color_scale",
-    srcs = [
-        "tf-color-scale.html",
-        ":ts",
-    ],
-    path = "/tf-color-scale",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "@org_polymer",
-    ],
-)
-
-tensorboard_typescript_genrule(
-    name = "ts",
-    srcs = [
-        "colorScale.ts",
-        "palettes.ts",
-    ],
-    typings = ["@org_definitelytyped//:d3.d.ts"],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "tf-color-scale.html",
-        ":legacy_ts",
-    ],
-    destdir = "tf-color-scale",
-    deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [
-        "colorScale.ts",
-        "palettes.ts",
-    ],
-    deps = ["//tensorflow/tensorboard/components:common_deps"],
-)
diff --git a/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts b/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts
deleted file mode 100644
index c05d9765335609e5c6678b93c9f53e519aef0524..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/colorScale.ts
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Example usage:
-// runs = ["train", "test", "test1", "test2"]
-// ccs = new TF.ColorScale();
-// ccs.domain(runs);
-// ccs.getColor("train");
-// ccs.getColor("test1");
-
-module TF {
-  export class ColorScale {
-    private palette: string[];
-    private identifiers = d3.map();
-
-    /**
-     * Creates a color scale with optional custom palette.
-     *  @param {string[]} [palette=TF.palettes.googleColorBlind] - The color
-     *                 palette you want as an Array of hex strings.
-     */
-    constructor(palette: string[] = TF.palettes.googleColorBlindAssist) {
-      this.palette = palette;
-    }
-
-    /**
-     * Set the domain of strings.
-     * @param {string[]} strings - An array of possible strings to use as the
-     *                             domain for your scale.
-     */
-    public domain(strings: string[]): this {
-      this.identifiers = d3.map();
-      strings.forEach((s, i) => {
-        this.identifiers.set(s, this.palette[i % this.palette.length]);
-      });
-      return this;
-    }
-
-    /**
-     * Use the color scale to transform an element in the domain into a color.
-     * @param {string} The input string to map to a color.
-     * @return {string} The color corresponding to that input string.
-     * @throws Will error if input string is not in the scale's domain.
-     */
-    public scale(s: string): string {
-      if (!this.identifiers.has(s)) {
-        throw new Error('String was not in the domain.');
-      }
-      return this.identifiers.get(s) as string;
-    }
-  }
-}
diff --git a/tensorflow/tensorboard/components/tf_color_scale/demo/BUILD b/tensorflow/tensorboard/components/tf_color_scale/demo/BUILD
deleted file mode 100644
index 00b8a033b8ab5609192f7dc5e917d43329ca0423..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/demo/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_color_scale/demo
-webfiles(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-color-scale/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_button",
-        "@org_polymer_paper_styles",
-        "@org_polymer_webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_color_scale/palettes.ts b/tensorflow/tensorboard/components/tf_color_scale/palettes.ts
deleted file mode 100644
index c53ed599ae9d34b2001cb277494b94dcf5148c65..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/palettes.ts
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-module TF {
-  export const palettes = {
-    googleStandard: [
-      '#db4437',  // google red 500
-      '#ff7043',  // deep orange 400
-      '#f4b400',  // google yellow 500
-      '#0f9d58',  // google green 500
-      '#00796b',  // teal 700
-      '#00acc1',  // cyan 600
-      '#4285f4',  // google blue 500
-      '#5c6bc0',  // indigo 400
-      '#ab47bc'   // purple 400
-    ],
-    googleCool: [
-      '#9e9d24',  // lime 800
-      '#0f9d58',  // google green 500
-      '#00796b',  // teal 700
-      '#00acc1',  // cyan 600
-      '#4285f4',  // google blue 500
-      '#5c6bc0',  // indigo 400
-      '#607d8b'   // blue gray 500
-    ],
-    googleWarm: [
-      '#795548',  // brown 500
-      '#ab47bc',  // purple 400
-      '#f06292',  // pink 300
-      '#c2185b',  // pink 700
-      '#db4437',  // google red 500
-      '#ff7043',  // deep orange 400
-      '#f4b400'   // google yellow 700
-    ],
-    googleColorBlindAssist: [
-      '#ff7043',  // orange
-      '#00ACC1',  // dark cyan
-      '#AB47BC',  // bright purple
-      '#2A56C6',  // dark blue
-      '#0b8043',  // green
-      '#F7CB4D',  // yellow
-      '#c0ca33',  // lime
-      '#5e35b1',  // purple
-      '#A52714',  // red
-    ],
-    // These palettes try to be better for color differentiation.
-    // https://personal.sron.nl/~pault/
-    colorBlindAssist1:
-        ['#4477aa', '#44aaaa', '#aaaa44', '#aa7744', '#aa4455', '#aa4488'],
-    colorBlindAssist2: [
-      '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77', '#cc6677',
-      '#882255', '#aa4499'
-    ],
-    colorBlindAssist3: [
-      '#332288', '#6699cc', '#88ccee', '#44aa99', '#117733', '#999933',
-      '#ddcc77', '#cc6677', '#aa4466', '#882255', '#661100', '#aa4499'
-    ],
-    // based on this palette: http://mkweb.bcgsc.ca/biovis2012/
-    colorBlindAssist4: [
-      '#FF6DB6', '#920000', '#924900', '#DBD100', '#24FF24', '#006DDB',
-      '#490092'
-    ],
-    mldash: [
-      '#E47EAD', '#F4640D', '#FAA300', '#F5E636', '#00A077', '#0077B8',
-      '#00B7ED'
-    ]
-  };
-}
diff --git a/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts b/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts
deleted file mode 100644
index 700a01848b675e6530d05dfa721b8d4efcd67843..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_color_scale/test/colorScaleTests.ts
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-module TF {
-  let assert = chai.assert;
-
-  describe('ColorScale', function() {
-    let ccs: ColorScale;
-
-    beforeEach(function() { ccs = new ColorScale(); });
-
-    it('Returns consistent colors', function() {
-      ccs.domain(['train', 'eval', 'test']);
-      let trainColor = ccs.scale('train');
-      let trainColor2 = ccs.scale('train');
-      assert.equal(trainColor, trainColor2);
-    });
-
-    it('Returns consistent colors after new domain', function() {
-      ccs.domain(['train', 'eval']);
-      let trainColor = ccs.scale('train');
-      ccs.domain(['train', 'eval', 'test']);
-      let trainColor2 = ccs.scale('train');
-      assert.equal(trainColor, trainColor2);
-    });
-
-    it('Throws an error if string is not in the domain', function() {
-      ccs.domain(['red', 'yellow', 'green']);
-      assert.throws(function() {
-        ccs.scale('not in domain');
-      }, 'String was not in the domain.');
-    });
-  });
-}
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/BUILD b/tensorflow/tensorboard/components/tf_color_scale_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5feea12fb47803ea30adbd4d32f880a4d035f176
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/BUILD
@@ -0,0 +1,59 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_color_scale_d3v4",
+    srcs = [
+        "bundle.js",
+        "tf-color-scale.html",
+    ],
+    path = "/tf-color-scale",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/tf-color-scale",
+    deps = [
+        ":tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF": [
+        "palettes.ts",
+        "colorScale.ts",
+    ]},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScale.ts b/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScale.ts
new file mode 100644
index 0000000000000000000000000000000000000000..ff90d46aa249d240250854b0ec631834286ee651
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/colorScale.ts
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Example usage:
+// runs = ["train", "test", "test1", "test2"]
+// ccs = new ColorScale();
+// ccs.domain(runs);
+// ccs.getColor("train");
+// ccs.getColor("test1");
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+import {palettes} from './palettes'
+
+
+export class ColorScale {
+  private palette: string[];
+  private identifiers = d3.map();
+
+  /**
+   * Creates a color scale with optional custom palette.
+   *  @param {string[]} [palette=palettes.googleColorBlind] - The color
+   *                 palette you want as an Array of hex strings.
+   */
+  constructor(palette: string[] = palettes.googleColorBlindAssist) {
+    this.palette = palette;
+  }
+
+  /**
+   * Set the domain of strings.
+   * @param {string[]} strings - An array of possible strings to use as the
+   *                             domain for your scale.
+   */
+  public domain(strings: string[]): this {
+    this.identifiers = d3.map();
+    strings.forEach((s, i) => {
+      this.identifiers.set(s, this.palette[i % this.palette.length]);
+    });
+    return this;
+  }
+
+  /**
+   * Use the color scale to transform an element in the domain into a color.
+   * @param {string} The input string to map to a color.
+   * @return {string} The color corresponding to that input string.
+   * @throws Will error if input string is not in the scale's domain.
+   */
+  public scale(s: string): string {
+    if (!this.identifiers.has(s)) {
+      throw new Error('String was not in the domain.');
+    }
+    return this.identifiers.get(s) as string;
+  }
+}
+
+Polymer({
+  is: 'tf-color-scale',
+  properties: {
+    runs: {
+      type: Array,
+    },
+    outColorScale: {
+      type: Object,
+      readOnly: true,
+      notify: true,
+      value: function() {
+        return new ColorScale();
+      },
+    },
+  },
+  observers: ['updateColorScale(runs.*)'],
+  updateColorScale: function(runsChange) {
+    this.outColorScale.domain(this.runs);
+  },
+});
diff --git a/tensorflow/tensorboard/components/tf_color_scale/demo/index.html b/tensorflow/tensorboard/components/tf_color_scale_d3v4/index.html
similarity index 90%
rename from tensorflow/tensorboard/components/tf_color_scale/demo/index.html
rename to tensorflow/tensorboard/components/tf_color_scale_d3v4/index.html
index ad9edbda98ea187435ec69f7032101a7bc3892ea..81dfab098c6d86dfc6b666aa26d0d39f4ad3ae8e 100644
--- a/tensorflow/tensorboard/components/tf_color_scale/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/index.html
@@ -20,11 +20,11 @@ limitations under the License.
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 <title>tf-color-scale demo</title>
 <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../tf-color-scale.html">
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../../paper-styles/typography.html">
-<link rel="import" href="../../paper-button/paper-button.html">
-<link rel="import" href="../../tf-imports/d3.html">
+<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../paper-button/paper-button.html">
+<link rel="import" href="../paper-styles/typography.html">
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="tf-color-scale.html">
 
 <style> body {font-family: "Roboto";}</style>
 <demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/palettes.ts b/tensorflow/tensorboard/components/tf_color_scale_d3v4/palettes.ts
new file mode 100644
index 0000000000000000000000000000000000000000..ce42a115458eb3d15bb6c3ac72cf7407f5a30afc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/palettes.ts
@@ -0,0 +1,76 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+export const palettes = {
+  googleStandard: [
+    '#db4437',  // google red 500
+    '#ff7043',  // deep orange 400
+    '#f4b400',  // google yellow 500
+    '#0f9d58',  // google green 500
+    '#00796b',  // teal 700
+    '#00acc1',  // cyan 600
+    '#4285f4',  // google blue 500
+    '#5c6bc0',  // indigo 400
+    '#ab47bc'   // purple 400
+  ],
+  googleCool: [
+    '#9e9d24',  // lime 800
+    '#0f9d58',  // google green 500
+    '#00796b',  // teal 700
+    '#00acc1',  // cyan 600
+    '#4285f4',  // google blue 500
+    '#5c6bc0',  // indigo 400
+    '#607d8b'   // blue gray 500
+  ],
+  googleWarm: [
+    '#795548',  // brown 500
+    '#ab47bc',  // purple 400
+    '#f06292',  // pink 300
+    '#c2185b',  // pink 700
+    '#db4437',  // google red 500
+    '#ff7043',  // deep orange 400
+    '#f4b400'   // google yellow 700
+  ],
+  googleColorBlindAssist: [
+    '#ff7043',  // orange
+    '#00ACC1',  // dark cyan
+    '#AB47BC',  // bright purple
+    '#2A56C6',  // dark blue
+    '#0b8043',  // green
+    '#F7CB4D',  // yellow
+    '#c0ca33',  // lime
+    '#5e35b1',  // purple
+    '#A52714',  // red
+  ],
+  // These palettes try to be better for color differentiation.
+  // https://personal.sron.nl/~pault/
+  colorBlindAssist1:
+      ['#4477aa', '#44aaaa', '#aaaa44', '#aa7744', '#aa4455', '#aa4488'],
+  colorBlindAssist2: [
+    '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77', '#cc6677', '#882255',
+    '#aa4499'
+  ],
+  colorBlindAssist3: [
+    '#332288', '#6699cc', '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77',
+    '#cc6677', '#aa4466', '#882255', '#661100', '#aa4499'
+  ],
+  // based on this palette: http://mkweb.bcgsc.ca/biovis2012/
+  colorBlindAssist4: [
+    '#FF6DB6', '#920000', '#924900', '#DBD100', '#24FF24', '#006DDB', '#490092'
+  ],
+  mldash: [
+    '#E47EAD', '#F4640D', '#FAA300', '#F5E636', '#00A077', '#0077B8', '#00B7ED'
+  ]
+};
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/BUILD b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ac22ab8218a0fa675aa7cf79a625af8e0292e87c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/BUILD
@@ -0,0 +1,48 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/tf-color-scale/test",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF": ["colorScaleTests.ts"]},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/colorScaleTests.ts b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/colorScaleTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..78824a772c3e6b68a4d1fa2f63b821b202bba0c8
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/colorScaleTests.ts
@@ -0,0 +1,48 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+let assert = chai.assert;
+
+import {ColorScale} from '../colorScale'
+
+describe('ColorScale', function() {
+  let ccs: ColorScale;
+
+  beforeEach(function() {
+    ccs = new ColorScale();
+  });
+
+  it('Returns consistent colors', function() {
+    ccs.domain(['train', 'eval', 'test']);
+    let trainColor = ccs.scale('train');
+    let trainColor2 = ccs.scale('train');
+    assert.equal(trainColor, trainColor2);
+  });
+
+  it('Returns consistent colors after new domain', function() {
+    ccs.domain(['train', 'eval']);
+    let trainColor = ccs.scale('train');
+    ccs.domain(['train', 'eval', 'test']);
+    let trainColor2 = ccs.scale('train');
+    assert.equal(trainColor, trainColor2);
+  });
+
+  it('Throws an error if string is not in the domain', function() {
+    ccs.domain(['red', 'yellow', 'green']);
+    assert.throws(function() {
+      ccs.scale('not in domain');
+    }, 'String was not in the domain.');
+  });
+});
diff --git a/tensorflow/tensorboard/dist/bazel-html-imports.html b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/tests.html
similarity index 65%
rename from tensorflow/tensorboard/dist/bazel-html-imports.html
rename to tensorflow/tensorboard/components/tf_color_scale_d3v4/test/tests.html
index 2268e6d7d4cd2720674e600848c2bcbda446f77a..eccc32cdec5547e1e54c9eb28fd9605ba629323c 100644
--- a/tensorflow/tensorboard/dist/bazel-html-imports.html
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/test/tests.html
@@ -16,8 +16,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<!-- TENSORBOARD_BOWER_AUTOGENERATED_BELOW_THIS_LINE_DO_NOT_EDIT -->
-<script src="../numericjs_numeric_min_js/file/numeric.min.js"></script>
-<script src="../three_js_three_min_js/file/three.min.js"></script>
-<script src="../three_js_orbitcontrols_js/file/OrbitControls.js"></script>
-<script src="../weblas_weblas_js/file/weblas.js"></script>
+<meta charset="utf-8">
+<script src="../../web-component-tester/browser.js"></script>
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-color-scale.html">
+<body>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html b/tensorflow/tensorboard/components/tf_color_scale_d3v4/tf-color-scale.html
similarity index 60%
rename from tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html
rename to tensorflow/tensorboard/components/tf_color_scale_d3v4/tf-color-scale.html
index 79bee6d957ac134927d258c89a131b13da614b27..3dedfaf1a1c10ca12f9119992d23fc7f67b44546 100644
--- a/tensorflow/tensorboard/components/tf_color_scale/tf-color-scale.html
+++ b/tensorflow/tensorboard/components/tf_color_scale_d3v4/tf-color-scale.html
@@ -26,30 +26,5 @@ a set of colors.
 @element tf-color-scale
 -->
 <dom-module id="tf-color-scale">
-  <script src="palettes.js"></script>
-  <script src="colorScale.js"></script>
-  <script>
-    (function() {
-      Polymer({
-        is: "tf-color-scale",
-        properties: {
-          runs: {
-            type: Array,
-          },
-          outColorScale: {
-            type: Object,
-            readOnly: true,
-            notify: true,
-            value: function() {
-              return new TF.ColorScale();
-            },
-          },
-        },
-        observers: ['updateColorScale(runs.*)'],
-        updateColorScale: function(runsChange) {
-          this.outColorScale.domain(this.runs);
-        },
-      });
-    })();
-  </script>
+  <script src="bundle.js"></script>
 </dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
index 0fb9f70eb972d4b62bd439f91bea7fe196e28918..5a9e941e467def3be03cd556c8c73a7f08be72f4 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/BUILD
@@ -1,13 +1,13 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
 
 licenses(["notice"])  # Apache 2.0
 
-webfiles(
+web_library(
     name = "tf_dashboard_common",
     srcs = glob(["*.html"]) + [
         ":ts",
@@ -41,10 +41,12 @@ tensorboard_typescript_genrule(
     name = "ts",
     srcs = [
         "categorizer.ts",
+        "dashboard-behavior.ts",
         "reload-behavior.ts",
     ],
     typings = [
         "@org_definitelytyped//:d3.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
         "//tensorflow/tensorboard/components/vz_sorting:ts_typings",
     ],
 )
@@ -63,7 +65,7 @@ tensorboard_webcomponent_library(
     srcs = glob(["*.html"]) + [":legacy_ts"],
     destdir = "tf-dashboard-common",
     deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
+        "//tensorflow/tensorboard/components/tf_imports_google:lib",
         "//tensorflow/tensorboard/components/tf_storage:legacy",
         "//tensorflow/tensorboard/components/vz_sorting:legacy",
         "//third_party/javascript/polymer/v1/iron-ajax:lib",
@@ -89,10 +91,21 @@ tensorboard_ts_library(
     name = "legacy_ts",
     srcs = [
         "categorizer.ts",
+        "dashboard-behavior.ts",
         "reload-behavior.ts",
     ],
+    deps_mgmt = "off",
+    runtime = "nodejs",
     deps = [
-        "//tensorflow/tensorboard/components:common_deps",
         "//tensorflow/tensorboard/components/vz_sorting:legacy_ts",
+        "//third_party/javascript/node_modules/typescript:es2015.promise",
+        "//third_party/javascript/plottable/v1:typings",
+        "//third_party/javascript/typings/chai",
+        "//third_party/javascript/typings/d3",
+        "//third_party/javascript/typings/lodash",
+        "//third_party/javascript/typings/mocha",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+        "//third_party/javascript/typings/sinon",
+        "//third_party/javascript/typings/webcomponents_js",
     ],
 )
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts b/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts
index 42e7cbcff4033843a30f32b11ad5f292e7c7187e..4c06462a981c09170472cfb5d02e23382ae4268a 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/categorizer.ts
@@ -72,24 +72,31 @@ module Categorizer {
       if (tags.length === 0) {
         return [];
       }
-      let sortedTags = tags.slice().sort(VZ.Sorting.compareTagNames);
-      let categories: Category[] = [];
-      let currentCategory = {
-        name: extractor(sortedTags[0]),
-        tags: [],
-      };
-      sortedTags.forEach((t: string) => {
-        let topLevel = extractor(t);
-        if (currentCategory.name !== topLevel) {
-          categories.push(currentCategory);
-          currentCategory = {
+
+      // Maps between top-level name and category. We use the mapping to avoid
+      // duplicating categories per run.
+      const categoryMapping: {[key: string]: Category} = {};
+
+      tags.forEach((t: string) => {
+        const topLevel = extractor(t);
+        if (!categoryMapping[topLevel]) {
+          const newCategory = {
             name: topLevel,
             tags: [],
           };
+          categoryMapping[topLevel] = newCategory;
         }
-        currentCategory.tags.push(t);
+
+        categoryMapping[topLevel].tags.push(t);
+      });
+
+      // Sort categories into alphabetical order.
+      const categories =
+          _.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
+      _.forEach(categories, (category) => {
+        // Sort the tags within each category.
+        category.tags.sort(VZ.Sorting.compareTagNames);
       });
-      categories.push(currentCategory);
       return categories;
     };
   }
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-behavior.ts b/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-behavior.ts
new file mode 100644
index 0000000000000000000000000000000000000000..269a62ccf97a03feb916574f1b656e654bbc5795
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/dashboard-behavior.ts
@@ -0,0 +1,40 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+module TF.Dashboard {
+  /**
+   * A behavior that TensorBoard dashboards must implement. This behavior serves
+   * the purpose of an interface.
+   */
+  export function DashboardBehavior(dashboardName) {
+    return {
+      properties: {
+        name: {
+          type: String,
+          value: dashboardName,
+          readOnly: true,
+        },
+      },
+      // This method is called when the dashboard reloads, either when the
+      // dashboard is first visited, periodically reloaded, or manually reloaded
+      // via the user clicking the button. Note that dashboard custom elements
+      // that use TF.Dashboard.ReloadBehavior already implement a reload method.
+      reload() {
+        throw Error(
+            'The ' + dashboardName + ' dashboard does not implement reload.');
+      },
+    };
+  }
+}
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/demo/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common/demo/BUILD
deleted file mode 100644
index 05cfe34e72cc8724b1f42356ab5f1723739473b7..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_dashboard_common/demo/BUILD
+++ /dev/null
@@ -1,31 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_dashboard_common/demo
-webfiles(
-    name = "demo",
-    srcs = [
-        "tf-categorizer-demo.html",
-        "tf-collapsable-pane-demo.html",
-        "tf-multi-checkbox-demo.html",
-        "tf-regex-group-demo.html",
-    ],
-    path = "/tf-dashboard-common/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "@org_polymer_iron_flex_layout",
-        "@org_polymer_paper_styles",
-        "@org_polymer_webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts b/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts
index ea149fda47a57fd5b891df22c5ff1eb15298a6b5..4e52b60f37f088b228ab98869abbcc02f460e11d 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/test/categorizerTest.ts
@@ -62,6 +62,18 @@ module Categorizer {
         assert.deepEqual(
             topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
       });
+
+      it('only create 1 category per run', () => {
+        // TensorBoard separates runs from tags using the / and _ characters
+        // *only* during sorting. The categorizer should group all tags under
+        // their correct categories - and create only 1 category per run.
+        const tags = ['foo/bar', 'foo_in_between_run/baz', 'foo/quux'];
+        const expected = [
+          {name: 'foo', tags: ['foo/bar', 'foo/quux']},
+          {name: 'foo_in_between_run', tags: ['foo_in_between_run/baz']},
+        ];
+        assert.deepEqual(topLevelNamespaceCategorizer(tags), expected);
+      });
     });
 
     describe('customCategorizer', () => {
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html
index e2530d597165b8481e6d9b38d03d3e14b5597920..b9d0a8c39ee4680fd2b0191ca2a40a4e390703ad 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-chart-scaffold.html
@@ -34,7 +34,7 @@ chart() - Returns the underlying chart element.
 reload() - Reloads the data and sends it to the underlying chart.
 
 This element should have a compatible chart plugin element as it's content. The
-plugin is requred to implement two functions:
+plugin is required to implement two functions:
 - setVisibleSeries(names: string[]): a function that receives an array of series
     names as the first parameter, responsible for changing the series currently
     being displayed to only the series in this array.
@@ -57,6 +57,8 @@ plugin is requred to implement two functions:
     </style>
   </template>
   <script>
+    "use strict";
+
     Polymer({
       is: "tf-chart-scaffold",
       properties: {
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard.html
index 4c7c91d1713293f78111a70df6e327092077156c..9e2f6b9589b3648a07899758285d03bef2aa8a9f 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-dashboard.html
@@ -22,4 +22,5 @@ limitations under the License.
 <link rel="import" href="tf-downloader.html">
 <link rel="import" href="tf-no-data-warning.html">
 
+<script src="dashboard-behavior.js"></script>
 <script src="reload-behavior.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html
index bc15312fc3aecd5a048938fc0b42b57c838605c3..e2c99772072b3f489eb64ad23189571b7734930d 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-multi-checkbox.html
@@ -183,6 +183,7 @@ handle these situations gracefully.
       // if undefined, default value (enable for first k runs, disable after).
         type: Object,
         value: TF.URIStorage.getObjectInitializer('runSelectionState', {}),
+        observer: "_storeRunToIsCheckedMapping",
       },
       // (Allows state to persist across regex filtering)
       outSelected: {
@@ -194,10 +195,11 @@ handle these situations gracefully.
         type: Object,
         observer: "synchronizeColors",
       }, // map from run name to css class
-      numRunsEnabledByDefault: {
-        // When TB first loads, first k runs are enabled, rest are disabled.
+      maxRunsToEnableByDefault: {
+        // When TB first loads, if it has k or fewer runs, they are all enabled
+        // by default. If there are more, then they are all disabled.
         type: Number,
-        value: 10,
+        value: 40,
       },
       _debouncedRegexChange: {
         type: Function,
@@ -230,7 +232,6 @@ handle these situations gracefully.
     },
     observers: [
       "_setIsolatorIcon(runSelectionState, names)",
-      "_storeRunToIsCheckedMapping(runSelectionState)",
     ],
     _storeRunToIsCheckedMapping: TF.URIStorage.getObjectObserver('runSelectionState', {}),
     _makeRegex: function(regex) {
@@ -261,9 +262,10 @@ handle these situations gracefully.
     },
     computeOutSelected: function(__, ___) {
       var runSelectionState = this.runSelectionState;
-      var num = this.numRunsEnabledByDefault;
+      var num = this.maxRunsToEnableByDefault;
+      var allEnabled = this.namesMatchingRegex.length <= num;
       return this.namesMatchingRegex.filter(function(n, i) {
-        return runSelectionState[n] == null ? i<num : runSelectionState[n];
+        return runSelectionState[n] == null ? allEnabled : runSelectionState[n];
       });
     },
     synchronizeColors: function(e) {
@@ -313,18 +315,24 @@ handle these situations gracefully.
     _regexInputObserver: TF.URIStorage.getStringObserver("regexInput", ""),
     toggleAll: function() {
       var _this = this;
-      var allToggledOn = this.namesMatchingRegex
-                    .every(function(n) {return _this.runSelectionState[n]});
+      var anyToggledOn = this.namesMatchingRegex
+                    .some(function(n) {return _this.runSelectionState[n]});
+
 
       var runSelectionStateIsDefault = Object.keys(this.runSelectionState).length == 0;
 
-      var numRuns = this.namesMatchingRegex.length;
+      var defaultOff = this.namesMatchingRegex.length > this.maxRunsToEnableByDefault;
+      // We have runs toggled either if some were explicitly toggled on, or if
+      // we are in the default state, and there are few enough that we default
+      // to toggling on.
+      anyToggledOn = anyToggledOn || runSelectionStateIsDefault && !defaultOff;
 
-      var shouldDisable = allToggledOn || runSelectionStateIsDefault;
+      // If any are toggled on, we turn everything off. Or, if none are toggled
+      // on, we turn everything on.
 
       var newRunsDisabled = {};
       this.names.forEach(function(n) {
-        newRunsDisabled[n] = !shouldDisable;
+        newRunsDisabled[n] = !anyToggledOn;
       })
       this.runSelectionState = newRunsDisabled;
     },
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-no-data-warning.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-no-data-warning.html
index dbc1dc5c5fa62ec17c9c9f8f5df85db6735acc6a..c90efac1d6b58debc6a39ae4ffafaeb3fb093da1 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-no-data-warning.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-no-data-warning.html
@@ -34,10 +34,9 @@ Display a warning when there is no data found.
             and pass the graph either via the constructor, or by calling its
             <code>add_graph()</code> method.
             You may want to check out the
-            <a href="https://www.tensorflow.org/versions/master/how_tos/graph_viz/index.html">
+            <a href="https://www.tensorflow.org/get_started/graph_viz">
               graph visualizer tutorial
-            </a>
-            .
+            </a>.
           </p>
         </template>
         <template is="dom-if" if="[[_isProjector(dataType)]]">
@@ -53,7 +52,7 @@ Display a warning when there is no data found.
               <li>
                 You are not saving any checkpoint. To save your model,
                 create a
-                <a href="https://www.tensorflow.org/versions/master/api_docs/python/state_ops.html#Saver">
+                <a href="https://www.tensorflow.org/api_docs/python/tf/train/Saver">
                   <code>tf.train.Saver</code>
                 </a>
                 and save your model periodically
@@ -86,7 +85,7 @@ Display a warning when there is no data found.
             README
           </a>
           and perhaps the
-          <a href="https://www.tensorflow.org/versions/master/how_tos/summaries_and_tensorboard/index.html">
+          <a href="https://www.tensorflow.org/get_started/summaries_and_tensorboard">
             TensorBoard tutorial
           </a>.
         </p>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/tf-run-selector.html b/tensorflow/tensorboard/components/tf_dashboard_common/tf-run-selector.html
index 8f2ea402e8991f1fd14d5365d7797d60f85fce72..81a72793b96347c2506ea636feffe64bd9d5696c 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/tf-run-selector.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common/tf-run-selector.html
@@ -139,17 +139,9 @@ Properties out:
       },
     },
     observers: [
+      "_onBackendUpdate(backend)",
       "_logdirSet(logdir)",
     ],
-    ready: function() {
-      // Populate the logdir.
-      this.backend.logdir().then(logdirObject => {
-        this.set('logdir', logdirObject.logdir);
-      }).catch(e => {
-        // Fetching the logdir failed. Prevent the exception from logging to
-        // console. The console already logs a 404 network event.
-      });
-    },
     _toggleAll: function() {
       this.$.multiCheckbox.toggleAll();
     },
@@ -157,8 +149,21 @@ Properties out:
     _breakString: function(originalString) {
       return originalString.replace(/([\/=\-_,])/g, "$1<wbr>");
     },
+    _onBackendUpdate: function(backend) {
+      if (backend === undefined) {
+        return;
+      }
+
+      // When the backend is set, the selector can request the logdir.
+      backend.logdir().then(logdirObject => {
+        this.set('logdir', logdirObject.logdir);
+      }).catch(e => {
+        // Fetching the logdir failed. Prevent the exception from logging to
+        // console. The console already logs a 404 network event.
+      });
+    },
     _logdirSet: function(logdir) {
-      if (!logdir) {
+      if (logdir === undefined) {
         // The logdir has not been set yet.
         return;
       }
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e27a84651b118c17ee183d50dd16f004cd8be23a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/BUILD
@@ -0,0 +1,200 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_dashboard_common_d3v4",
+    srcs = [
+        "dashboard-style.html",
+        "run-color-style.html",
+        "scrollbar-style.html",
+        "tensorboard-color.html",
+        "tf-categorizer.html",
+        "tf-categorizer-bundle.js",
+        "tf-chart-scaffold.html",
+        "tf-collapsable-pane.html",
+        "tf-dashboard.html",
+        "tf-dashboard.js",
+        "tf-dashboard-layout.html",
+        "tf-downloader.html",
+        "tf-multi-checkbox.html",
+        "tf-multi-checkbox-bundle.js",
+        "tf-no-data-warning.html",
+        "tf-option-selector.html",
+        "tf-panes-helper.html",
+        "tf-regex-group.html",
+        "tf-regex-group-bundle.js",
+        "tf-run-selector.html",
+        "tf-sidebar-helper.html",
+    ],
+    path = "/tf-dashboard-common",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_ajax",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+        "@org_polymer_paper_styles",
+        "@org_polymer_paper_toggle_button",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = [
+        "tf-categorizer-demo.html",
+        "tf-collapsable-pane-demo.html",
+        "tf-multi-checkbox-demo.html",
+        "tf-regex-group-demo.html",
+    ],
+    path = "/tf-dashboard-common",
+    deps = [
+        ":tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "tf_categorizer_bundle",
+    out = "tf-categorizer-bundle.ts",
+    namespace_srcs = {"TF.Dashboard.Categorizer": ["tf-categorizer.ts"]},
+    namespace_symbol_aliases = {"TF.Dashboard.Categorizer": {"compareTagNames": "VZ.Sorting.compareTagNames"}},
+)
+
+tensorboard_typescript_genrule(
+    name = "tf_categorizer_ts",
+    srcs = ["tf-categorizer-bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "tf_regex_group_bundle",
+    out = "tf-regex-group-bundle.ts",
+    namespace_srcs = {"TF.Dashboard.RegexGroup": ["tf-regex-group.ts"]},
+    namespace_symbol_aliases = {"TF.Dashboard.RegexGroup": {"storage": "TF.URIStorage"}},
+)
+
+tensorboard_typescript_genrule(
+    name = "tf_regex_group_ts",
+    srcs = ["tf-regex-group-bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "tf_multi_checkbox_bundle",
+    out = "tf-multi-checkbox-bundle.ts",
+    namespace_srcs = {"TF.Dashboard.MultiCheckbox": ["tf-multi-checkbox.ts"]},
+    namespace_symbol_aliases = {"TF.Dashboard.MultiCheckbox": {"storage": "TF.URIStorage"}},
+)
+
+tensorboard_typescript_genrule(
+    name = "tf_multi_checkbox_ts",
+    srcs = ["tf-multi-checkbox-bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "tf_dashboard_bundle",
+    out = "tf-dashboard.ts",
+    namespace_srcs = {
+        "TF.Dashboard": [
+            "dashboard-behavior.ts",
+            "reload-behavior.ts",
+        ],
+    },
+)
+
+tensorboard_typescript_genrule(
+    name = "tf_dashboard_ts",
+    srcs = ["tf-dashboard.ts"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = glob(["*.html"]) + [":legacy_ts"],
+    destdir = "tf-dashboard-common",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_google_d3v4:lib",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:legacy",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:legacy",
+        "//third_party/javascript/polymer/v1/iron-ajax:lib",
+        "//third_party/javascript/polymer/v1/iron-collapse:lib",
+        "//third_party/javascript/polymer/v1/iron-icons:lib",
+        "//third_party/javascript/polymer/v1/paper-button:lib",
+        "//third_party/javascript/polymer/v1/paper-checkbox:lib",
+        "//third_party/javascript/polymer/v1/paper-dialog:lib",
+        "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
+        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
+        "//third_party/javascript/polymer/v1/paper-input:lib",
+        "//third_party/javascript/polymer/v1/paper-item:lib",
+        "//third_party/javascript/polymer/v1/paper-menu:lib",
+        "//third_party/javascript/polymer/v1/paper-slider:lib",
+        "//third_party/javascript/polymer/v1/paper-spinner:lib",
+        "//third_party/javascript/polymer/v1/paper-styles:lib",
+        "//third_party/javascript/polymer/v1/paper-toggle-button:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [
+        "dashboard-behavior.ts",
+        "reload-behavior.ts",
+        "tf-categorizer.ts",
+    ],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:legacy_ts",
+        "//third_party/javascript/typings/d3_v4:bundle",
+        "//third_party/javascript/typings/lodash",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-behavior.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-behavior.ts
new file mode 100644
index 0000000000000000000000000000000000000000..3e40da14528dffb8abf9529eebb745ecdd575489
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-behavior.ts
@@ -0,0 +1,38 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * A behavior that TensorBoard dashboards must implement. This behavior serves
+ * the purpose of an interface.
+ */
+export function DashboardBehavior(dashboardName) {
+  return {
+    properties: {
+      name: {
+        type: String,
+        value: dashboardName,
+        readOnly: true,
+      },
+    },
+    // This method is called when the dashboard reloads, either when the
+    // dashboard is first visited, periodically reloaded, or manually reloaded
+    // via the user clicking the button. Note that dashboard custom elements
+    // that use TF.Dashboard.ReloadBehavior already implement a reload method.
+    reload() {
+      throw Error(
+          'The ' + dashboardName + ' dashboard does not implement reload.');
+    },
+  };
+}
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-style.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-style.html
new file mode 100644
index 0000000000000000000000000000000000000000..6629e5bfc2284770da8559145c88e451ae063a77
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/dashboard-style.html
@@ -0,0 +1,53 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../paper-styles/paper-styles.html">
+<link rel="import" href="tensorboard-color.html">
+
+<dom-module id="dashboard-style">
+  <template>
+    <style>
+      .sidebar {
+        display: flex;
+        flex-direction: column;
+        height: 100%;
+        margin-right: 20px;
+      }
+
+      .sidebar-section {
+        border-top: solid 1px rgba(0, 0, 0, 0.12);
+        padding: 15px 0px 15px 30px;
+      }
+
+      .sidebar-section:first-child {
+        border: none;
+      }
+
+      .sidebar-section:last-child {
+        flex-grow: 1;
+        display: flex;
+      }
+
+      paper-checkbox {
+        --paper-checkbox-checked-color: var(--tb-ui-dark-accent);
+        --paper-checkbox-unchecked-color: var(--tb-ui-dark-accent);
+        font-size: 14px;
+        margin-top: 5px;
+      }
+    </style>
+  </template>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/reload-behavior.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/reload-behavior.ts
new file mode 100644
index 0000000000000000000000000000000000000000..8b5ca120d609e26dea8dec57eede05ff39c518d0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/reload-behavior.ts
@@ -0,0 +1,39 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * ReloadBehavior: A simple behavior for dashboards where the
+ * frontendReload() function should find every child element with a
+ * given tag name (e.g. "tf-line-chart" or "tf-image-loader")
+ * and call a `reload` method on that child.
+ * May later extend it so it has more sophisticated logic, e.g. reloading
+ * only tags that are in view.
+ */
+export function ReloadBehavior(tagName) {
+  return {
+    properties: {
+      reloadTag: {
+        type: String,
+        value: tagName,
+      },
+    },
+    frontendReload: function() {
+      var elements = this.getElementsByTagName(this.reloadTag);
+      Array.prototype.forEach.call(elements, function(x) {
+        x.reload();
+      });
+    },
+  };
+}
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/run-color-style.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/run-color-style.html
new file mode 100644
index 0000000000000000000000000000000000000000..b15861694f57c1d801fe6d2c4cf3e5cb2410a611
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/run-color-style.html
@@ -0,0 +1,79 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../paper-styles/paper-styles.html">
+
+<dom-module id="run-color-style">
+  <template>
+    <style>
+    [color-class="light-blue"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-light-blue-500);
+      --paper-checkbox-checked-ink-color: var(--paper-light-blue-500);
+      --paper-checkbox-unchecked-color: var(--paper-light-blue-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-light-blue-900);
+    }
+    [color-class="red"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-red-500);
+      --paper-checkbox-checked-ink-color: var(--paper-red-500);
+      --paper-checkbox-unchecked-color: var(--paper-red-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-red-900);
+    }
+    [color-class="green"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-green-500);
+      --paper-checkbox-checked-ink-color: var(--paper-green-500);
+      --paper-checkbox-unchecked-color: var(--paper-green-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-green-900);
+    }
+    [color-class="purple"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-purple-500);
+      --paper-checkbox-checked-ink-color: var(--paper-purple-500);
+      --paper-checkbox-unchecked-color: var(--paper-purple-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-purple-900);
+    }
+    [color-class="teal"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-teal-500);
+      --paper-checkbox-checked-ink-color: var(--paper-teal-500);
+      --paper-checkbox-unchecked-color: var(--paper-teal-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-teal-900);
+    }
+    [color-class="pink"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-pink-500);
+      --paper-checkbox-checked-ink-color: var(--paper-pink-500);
+      --paper-checkbox-unchecked-color: var(--paper-pink-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-pink-900);
+    }
+    [color-class="orange"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-orange-500);
+      --paper-checkbox-checked-ink-color: var(--paper-orange-500);
+      --paper-checkbox-unchecked-color: var(--paper-orange-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-orange-900);
+    }
+    [color-class="brown"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-brown-500);
+      --paper-checkbox-checked-ink-color: var(--paper-brown-500);
+      --paper-checkbox-unchecked-color: var(--paper-brown-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-brown-900);
+    }
+    [color-class="indigo"] paper-checkbox {
+      --paper-checkbox-checked-color: var(--paper-indigo-500);
+      --paper-checkbox-checked-ink-color: var(--paper-indigo-500);
+      --paper-checkbox-unchecked-color: var(--paper-indigo-900);
+      --paper-checkbox-unchecked-ink-color: var(--paper-indigo-900);
+    }
+    </style>
+  </template>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/scrollbar-style.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/scrollbar-style.html
new file mode 100644
index 0000000000000000000000000000000000000000..bfd61f66191df29521ecb3958f3bc9cccd57821e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/scrollbar-style.html
@@ -0,0 +1,46 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-styles/paper-styles.html">
+
+<dom-module id="scrollbar-style">
+  <template>
+    <style>
+      .scrollbar::-webkit-scrollbar-track
+      {
+        visibility: hidden;
+      }
+
+      .scrollbar::-webkit-scrollbar
+      {
+        width: 10px;
+      }
+
+      .scrollbar::-webkit-scrollbar-thumb
+      {
+        border-radius: 10px;
+        -webkit-box-shadow: inset 0 0 2px rgba(0,0,0,.3);
+        background-color: var(--paper-grey-500);
+        color: var(--paper-grey-900);
+      }
+      .scrollbar {
+        box-sizing: border-box;
+      }
+    </style>
+  </template>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tensorboard-color.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tensorboard-color.html
new file mode 100644
index 0000000000000000000000000000000000000000..7f9ca6461485ad9b6356b05fac48544b4a995dfb
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tensorboard-color.html
@@ -0,0 +1,32 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+
+<style is="custom-style">
+
+  :root {
+    --tb-orange-weak: #ffa726;
+    --tb-orange-strong: #f57c00;
+    --tb-grey-darker: #e2e2e2;
+    --tb-grey-lighter: #f3f3f3;
+    --tb-ui-dark-accent: #757575;
+    --tb-ui-light-accent: #e0e0e0;
+    --tb-graph-faded: #e0d4b3;
+  }
+
+</style>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/BUILD b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..fc9912d54a4d64e206294a1caec8d9037132e739
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/BUILD
@@ -0,0 +1,49 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/tf-dashboard-common/test",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4:tf-categorizer-bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.Dashboard": ["tf-categorizer-tests.ts"]},
+    namespace_symbol_aliases = {"TF.Dashboard": {"cat": "TF.Dashboard.Categorizer"}},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tests.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..cd33cee47427fd4f7cce1deeb5932937aa810b8c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tests.html
@@ -0,0 +1,24 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<script src="../../web-component-tester/browser.js"></script>
+<link rel="import" href="../tf-categorizer.html">
+<body>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tf-categorizer-tests.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tf-categorizer-tests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..a786f39b4fb6f6c9560916e8ab863af8503780b9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/test/tf-categorizer-tests.ts
@@ -0,0 +1,144 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as cat from '../tf-categorizer';
+
+let assert = chai.assert;
+
+describe('categorizer', () => {
+  describe('topLevelNamespaceCategorizer', () => {
+    it('returns empty array on empty tags', () => {
+      assert.lengthOf(cat.topLevelNamespaceCategorizer([]), 0);
+    });
+
+    it('handles a simple case', () => {
+      let simple = [
+        'foo1/bar', 'foo1/zod', 'foo2/bar', 'foo2/zod', 'gosh/lod/mar',
+        'gosh/lod/ned'
+      ];
+      let expected = [
+        {name: 'foo1', tags: ['foo1/bar', 'foo1/zod']},
+        {name: 'foo2', tags: ['foo2/bar', 'foo2/zod']},
+        {name: 'gosh', tags: ['gosh/lod/mar', 'gosh/lod/ned']},
+      ];
+      assert.deepEqual(cat.topLevelNamespaceCategorizer(simple), expected);
+    });
+
+    it('orders the categories', () => {
+      let test = ['e', 'f', 'g', 'a', 'b', 'c'];
+      let expected = [
+        {name: 'a', tags: ['a']},
+        {name: 'b', tags: ['b']},
+        {name: 'c', tags: ['c']},
+        {name: 'e', tags: ['e']},
+        {name: 'f', tags: ['f']},
+        {name: 'g', tags: ['g']},
+      ];
+      assert.deepEqual(cat.topLevelNamespaceCategorizer(test), expected);
+    });
+
+    it('handles cases where category names overlap node names', () => {
+      let test = ['a', 'a/a', 'a/b', 'a/c', 'b', 'b/a'];
+      const actual = cat.topLevelNamespaceCategorizer(test);
+      let expected = [
+        {name: 'a', tags: ['a', 'a/a', 'a/b', 'a/c']},
+        {name: 'b', tags: ['b', 'b/a']},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('handles singleton case', () => {
+      assert.deepEqual(
+          cat.topLevelNamespaceCategorizer(['a']), [{name: 'a', tags: ['a']}]);
+    });
+  });
+
+  describe('customCategorizer', () => {
+    function noFallbackCategorizer(tags: string[]): cat.Category[] {
+      return [];
+    }
+
+    function testCategorizer(
+        defs: string[], fallback: cat.Categorizer,
+        tags: string[]): cat.Category[] {
+      const catDefs = defs.map(cat.defineCategory);
+      return cat._categorizer(catDefs, fallback)(tags);
+    }
+
+    it('categorizes by regular expression', () => {
+      let defs = ['foo..', 'bar..'];
+      let tags = ['fooab', 'fooxa', 'barts', 'barms'];
+      const actual = testCategorizer(defs, noFallbackCategorizer, tags);
+      let expected = [
+        {name: 'foo..', tags: ['fooab', 'fooxa']},
+        {name: 'bar..', tags: ['barms', 'barts']},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('matches non-exclusively', () => {
+      let tags = ['abc', 'bar', 'zod'];
+      const actual =
+          testCategorizer(['...', 'bar'], noFallbackCategorizer, tags);
+      let expected = [
+        {name: '...', tags: ['abc', 'bar', 'zod']},
+        {name: 'bar', tags: ['bar']},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('creates categories for unmatched rules', () => {
+      const actual =
+          testCategorizer(['a', 'b', 'c'], noFallbackCategorizer, []);
+      let expected = [
+        {name: 'a', tags: []},
+        {name: 'b', tags: []},
+        {name: 'c', tags: []},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('category regexs work with special characters', () => {
+      let defs = ['^\\w+$', '^\\d+$', '^\\/..$'];
+      let tags = ['foo', '3243', '/xa'];
+      const actual = testCategorizer(defs, noFallbackCategorizer, tags);
+      let expected = [
+        {name: '^\\w+$', tags: ['3243', 'foo']},
+        {name: '^\\d+$', tags: ['3243']},
+        {name: '^\\/..$', tags: ['/xa']},
+      ];
+      assert.deepEqual(actual, expected);
+    });
+
+    it('category tags are sorted', () => {
+      let tags = ['a', 'z', 'c', 'd', 'e', 'x', 'f', 'y', 'g'];
+      let sorted = tags.slice().sort();
+      let expected = [{name: '.*', tags: sorted}];
+      const actual = testCategorizer(['.*'], noFallbackCategorizer, tags);
+      assert.deepEqual(actual, expected);
+    });
+
+    it('if nonexclusive: all tags passed to fallback', () => {
+      let passedToDefault = null;
+      function defaultCategorizer(tags: string[]): cat.Category[] {
+        passedToDefault = tags;
+        return [];
+      }
+      let tags = ['foo', 'bar', 'foo123'];
+      testCategorizer(['foo'], defaultCategorizer, tags);
+      assert.deepEqual(passedToDefault, tags);
+    });
+  });
+});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-categorizer-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer-demo.html
similarity index 91%
rename from tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-categorizer-demo.html
rename to tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer-demo.html
index 285d36fa7752df5f421eadab82e59e211ba8a05b..23babaaecc4d2fe1b31fa0e930a608a41c307f90 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-categorizer-demo.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer-demo.html
@@ -18,11 +18,7 @@ limitations under the License.
 
 <html>
  <head>
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-  <link rel="import" href="../../tf-imports/d3.html">
-  <link rel="import" href="../tf-categorizer.html">
-  <link rel="import" href="../../iron-flex-layout/classes/iron-flex-layout.html">
-
+  <link rel="import" href="tf-categorizer.html">
  </head>
  <body>
   <style>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.html
new file mode 100644
index 0000000000000000000000000000000000000000..6388ab5e7d4ed490514ef180d2ad8b98494ab618
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.html
@@ -0,0 +1,63 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../vz-sorting/vz-sorting.html">
+<link rel="import" href="tf-regex-group.html">
+<link rel="import" href="tensorboard-color.html">
+
+<!--
+`tf-categorizer` turns an array of tags into an array of categories
+
+The transformation from tags to categories is controlled by the user, through
+interacting with the categorizer widget.
+
+(See type signatures in categorizer.ts)
+
+Example:
+  <tf-categorizer tags="[[tags]]" categories="{{categories}}"></tf-categorizer>
+
+Public Properties:
+`tags` - Array of strings that are the tags to categorize. Should be one-way bound downward.
+`categories` - Array of Categorizer.Category objects that are generated by the Categorizer.
+  Are readOnly and notify: True. Expected to be one-way bound upward.
+
+The categorizer provides inputs for adding regular expression rules and toggling whether
+categories are exclusive.
+-->
+<dom-module id="tf-categorizer">
+  <template>
+    <div class="inputs">
+      <tf-regex-group id="regexGroup" regexes="{{regexes}}"></tf-regex-group>
+    </div>
+    <style>
+      :host {
+        display: block;
+        padding-bottom: 5px;
+      }
+      paper-checkbox {
+        --paper-checkbox-checked-color: var(--paper-grey-600);
+        --paper-checkbox-unchecked-color: var(--paper-grey-600);
+        font-size: 14px;
+      }
+    </style>
+  </template>
+  <script src="tf-categorizer-bundle.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.ts
new file mode 100644
index 0000000000000000000000000000000000000000..7ff6a0627c6ec1e0d0fb7d692636b4f95b30b61f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-categorizer.ts
@@ -0,0 +1,192 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+import * as _ from 'lodash';
+
+import {compareTagNames} from '../vz_sorting_d3v4/sorting';
+
+/**
+ * This module contains methods that allow sorting tags into 'categories'.
+ * A category contains a name and a list of tags.
+ * The sorting strategy is defined by a 'CustomCategorization', which contains
+ * 'categoryDefinitions' which are regex rules used to construct a category.
+ * E.g. the regex rule 'xent' will create a category called 'xent' that
+ * contains values whose tags match the regex.
+ *
+ * After custom categories are evaluated, the tags are sorted by a hardcoded
+ * fallback categorizer, which may, for example, group tags into categories
+ * based on their top namespace.
+ */
+
+export interface Category {
+  // Categories that data is sorted into
+  name: string;
+  tags: string[];
+}
+
+export interface CustomCategorization {
+  // Defines a categorization strategy
+  categoryDefinitions: string[];
+  fallbackCategorizer: string;
+  /* {'TopLevelNamespaceCategorizer',
+      'LegacyUnderscoreCategorizer'} */
+}
+
+export interface Categorizer {
+  // Function that generates categories
+  (tags: string[]): Category[];
+}
+
+/* Canonical TensorFlow ops are namespaced using forward slashes.
+ * This fallback categorizer categorizes by the top-level namespace.
+ */
+export var topLevelNamespaceCategorizer: Categorizer = splitCategorizer(/\//);
+
+export function fallbackCategorizer(s: string): Categorizer {
+  switch (s) {
+    case 'TopLevelNamespaceCategorizer':
+      return topLevelNamespaceCategorizer;
+    default:
+      throw new Error('Unrecognized categorization strategy: ' + s);
+  }
+}
+
+/* An 'extractor' is a function that takes a tag name, and 'extracts' a
+ * category name.
+ * This function takes an extractor, and produces a categorizer.
+ * Currently, it is just used for the fallbackCategorizer, but we may want to
+ * refactor the general categorization logic to use the concept of extractors.
+ */
+function extractorToCategorizer(extractor: (s: string) => string): Categorizer {
+  return (tags: string[]): Category[] => {
+    if (tags.length === 0) {
+      return [];
+    }
+
+    // Maps between top-level name and category. We use the mapping to avoid
+    // duplicating categories per run.
+    const categoryMapping: {[key: string]: Category} = {};
+
+    tags.forEach((t: string) => {
+      const topLevel = extractor(t);
+      if (!categoryMapping[topLevel]) {
+        const newCategory = {
+          name: topLevel,
+          tags: [],
+        };
+        categoryMapping[topLevel] = newCategory;
+      }
+
+      categoryMapping[topLevel].tags.push(t);
+    });
+
+    // Sort categories into alphabetical order.
+    const categories =
+        _.map(_.keys(categoryMapping).sort(), key => categoryMapping[key]);
+    _.forEach(categories, (category) => {
+      // Sort the tags within each category.
+      category.tags.sort(compareTagNames);
+    });
+    return categories;
+  };
+}
+
+function splitCategorizer(r: RegExp): Categorizer {
+  let extractor = (t: string) => {
+    return t.split(r)[0];
+  };
+  return extractorToCategorizer(extractor);
+}
+
+export interface CategoryDefinition {
+  name: string;
+  matches: (t: string) => boolean;
+}
+
+export function defineCategory(ruledef: string): CategoryDefinition {
+  let r = new RegExp(ruledef);
+  let f = function(tag: string): boolean {
+    return r.test(tag);
+  };
+  return {name: ruledef, matches: f};
+}
+
+export function _categorizer(
+    rules: CategoryDefinition[], fallback: Categorizer) {
+  return function(tags: string[]): Category[] {
+    let remaining: d3.Set = d3.set(tags);
+    let userSpecified = rules.map((def: CategoryDefinition) => {
+      let tags: string[] = [];
+      remaining.each((t: string) => {
+        if (def.matches(t)) {
+          tags.push(t);
+        }
+      });
+      let cat = {name: def.name, tags: tags.sort(compareTagNames)};
+      return cat;
+    });
+    let defaultCategories = fallback(remaining.values());
+    return userSpecified.concat(defaultCategories);
+  };
+}
+
+export function categorizer(s: CustomCategorization): Categorizer {
+  let rules = s.categoryDefinitions.map(defineCategory);
+  let fallback = fallbackCategorizer(s.fallbackCategorizer);
+  return _categorizer(rules, fallback);
+};
+
+Polymer({
+  is: 'tf-categorizer',
+  properties: {
+    regexes: {type: Array},
+    tags: {type: Array},
+    categoriesAreExclusive: {type: Boolean, value: true},
+    fallbackCategorizer: {
+      type: String,
+      value: 'TopLevelNamespaceCategorizer',
+    },
+    categorizer: {
+      type: Object,
+      computed:
+          'computeCategorization(regexes.*, categoriesAreExclusive, fallbackCategorizer)',
+    },
+    categories: {
+      type: Array,
+      value: function() {
+        return [];
+      },
+      notify: true,
+      readOnly: true
+    },
+  },
+  observers: ['recategorize(tags.*, categorizer)'],
+  computeCategorization: function(
+      regexes, categoriesAreExclusive, fallbackCategorizer) {
+    var categorizationStrategy = {
+      categoryDefinitions: regexes.base,
+      categoriesAreExclusive: categoriesAreExclusive,
+      fallbackCategorizer: fallbackCategorizer,
+    };
+    return categorizer(categorizationStrategy);
+  },
+  recategorize: function() {
+    this.debounce('tf-categorizer-recategorize', function() {
+      var categories = this.categorizer(this.tags);
+      this._setCategories(categories);
+    })
+  },
+});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-chart-scaffold.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-chart-scaffold.html
new file mode 100644
index 0000000000000000000000000000000000000000..a39fb9462baf952688bef35372c2c3a70d1b1894
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-chart-scaffold.html
@@ -0,0 +1,152 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+
+<!--
+tf-chart-scaffold is responsible for providing data from TensorBoard to charts.
+It has the following settable properties:
+tag: (required, string) - the name of the tag to load for this chart
+visibleSeries: (required, string[]) - the names of the series the chart should
+    display.
+dataProvider: (required, VZ.ChartHelpers.DataFn) - function that takes (tag,
+    run) and returns a promise containing an array of VZ.ChartHelpers.Datum,
+    compatible with TF.Backend.Datum.
+
+It exposes the following methods:
+chart() - Returns the underlying chart element.
+reload() - Reloads the data and sends it to the underlying chart.
+
+This element should have a compatible chart plugin element as it's content. The
+plugin is required to implement two functions:
+- setVisibleSeries(names: string[]): a function that receives an array of series
+    names as the first parameter, responsible for changing the series currently
+    being displayed to only the series in this array.
+- setSeriesData(name: string, data: VZ.ChartHelpers.Datum[]): sets the data of
+    the series with the given name to the data given in the second parameter.
+-->
+<dom-module id="tf-chart-scaffold">
+  <template>
+    <content></content>
+    <style>
+      :host {
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        display: flex;
+        flex-direction: column;
+        flex-grow: 1;
+        flex-shrink: 1;
+        position: relative;
+      }
+    </style>
+  </template>
+  <script>
+    "use strict";
+
+    Polymer({
+      is: "tf-chart-scaffold",
+      properties: {
+        tag: String,
+        dataProvider: Function,
+        visibleSeries: Array,
+        _attached: {
+          type: Boolean,
+          value: false
+        },
+
+        // Storing the update ID of the previous request for data enables us to determine if a
+        // data response is outdated. We rely on an increasing ID instead of timestamp because
+        // successive updates often fire within the same millisecond.
+        _dataUpdateIdOfLastRequest: Number,
+        _nextAvailableDataUpdateId: {
+          type: Number,
+          value: 1,
+        },
+      },
+      observers: [
+        "reload(tag, dataProvider)",
+        "_changeSeries(visibleSeries.*)"
+      ],
+      ready: function() {
+        this.fire('ready');
+      },
+      attached: function() {
+        this._attached = true;
+        this._changeSeries();
+      },
+      detached: function() {
+        this._attached = false;
+      },
+      reload: function() {
+        if (!this._attached) {
+          return;
+        }
+        else if (!this.dataProvider) {
+          throw new Error('tf-chart-scaffold requires a dataProvider.');
+        }
+        else if (!this.tag) {
+          throw new Error('tf-chart-scaffold requires a tag.');
+        }
+
+        // TODO(chizeng): At this point, notify effective children that the previous data has been
+        // invalidated. For instance, the image dashboard may want to clear its images. Today, the
+        // chart scaffold only informs children when the new image URLs response finishes loading.
+
+        const dataUpdateId = this._nextAvailableDataUpdateId++;
+        this._dataUpdateIdOfLastRequest = dataUpdateId;
+
+        this.visibleSeries.forEach(function(name) {
+          this.dataProvider(this.tag, name).then(function(data) {
+            if (dataUpdateId != this._dataUpdateIdOfLastRequest) {
+              // This response is outdated. Ignore it.
+              // TODO(chizeng): Explore canceling an outdated request before we even receive its
+              // response. This involves creating hooks into the request manager and might introduce
+              // some complexity that may not be worth it; Tensorboard frankly does not seem
+              // bottlenecked by the network (It is often run in fast corp networks or locally.).
+              return;
+            }
+            this.chart().setSeriesData(name, data);
+          }.bind(this));
+        }.bind(this));
+      },
+      _changeSeries: function() {
+        if (!this._attached) {
+           return;
+        }
+        else if (!this.visibleSeries) {
+          throw new Error('tf-chart-scaffold requires a visibleSeries.');
+        }
+
+        this.chart().setVisibleSeries(this.visibleSeries);
+        this.reload();
+      },
+      chart: function() {
+        var children = this.getEffectiveChildren();
+        if (!children.length) {
+          throw new Error('tf-chart-scaffold has no children');
+        }
+
+        var child = children[0];
+        if (!child.setVisibleSeries || !child.setSeriesData) {
+          throw new Error("tf-chart-scaffold's content doesn't implement the " +
+              "required interface");
+        }
+        return child;
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-collapsable-pane-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane-demo.html
similarity index 83%
rename from tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-collapsable-pane-demo.html
rename to tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane-demo.html
index 15253a833139cb5e2acba1467b271950b5454474..efa990b11cfa45bc3396a65d33e3e07161dac80c 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-collapsable-pane-demo.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane-demo.html
@@ -18,8 +18,8 @@ limitations under the License.
 
 <html>
  <head>
-   <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-   <link rel="import" href="../tf-collapsable-pane.html">
+   <link rel="import" href="tf-collapsable-pane.html">
+   
  </head>
  <body>
   <style>
@@ -28,7 +28,4 @@ limitations under the License.
     <h1>This is content inside the pane.</h1>
   </tf-collapsable-pane>
  </body>
- <script>
-
- </script>
 </html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane.html
new file mode 100644
index 0000000000000000000000000000000000000000..e82540127fa5c765cde178dcc1d17014854990d2
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-collapsable-pane.html
@@ -0,0 +1,109 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../iron-collapse/iron-collapse.html">
+
+<dom-module id="tf-collapsable-pane">
+  <template>
+    <button
+      class="heading"
+      on-tap="togglePane"
+      open-button$="[[opened]]"
+    >
+    <span class="name">[[name]]</span>
+    <span class="count">
+      <span>[[count]]</span>
+    </span>
+  </button>
+    <iron-collapse opened="[[opened]]">
+      <div class="content">
+        <template is="dom-if" if="[[opened]]" restamp="[[restamp]]">
+          <content></content>
+        </template>
+      </div>
+    </iron-collapse>
+    <style>
+      :host {
+        display: block;
+        margin: 0 5px 1px 10px;
+      }
+
+      :host:first-of-type {
+        margin-top: 20px;
+      }
+      
+      :host:last-of-type {
+        margin-bottom: 20px;
+      }
+
+      .heading {
+        background-color: white;
+        border: none;
+        cursor: pointer;
+        width: 100%;
+        font-size: 15px;
+        line-height: 1;
+        box-shadow: 0 1px 5px rgba(0,0,0,0.2);
+        padding: 10px 15px;
+      }
+
+      .content {
+        padding: 15px;
+        border: 1px solid #dedede;
+        border-top: none;
+        border-bottom-left-radius: 2px;
+        border-bottom-right-radius: 2px;
+        background: white;
+      }
+
+      [open-button] {
+        border-bottom-left-radius: 0px !important;
+        border-bottom-right-radius: 0px !important;
+      }
+
+      .name {
+        float: left;
+      }
+
+      .count {
+        float: right;
+        margin-right: 5px;
+        font-size: 12px;
+        color: var(--paper-grey-500);
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-collapsable-pane",
+      properties: {
+        opened: {type: Boolean, value: false},
+        restamp: {type: Boolean, value: true},
+        name: {type: String, observer: "hide"},
+        count: {type: Number},
+      },
+      hide: function() {
+        this.opened = false;
+      },
+      togglePane: function() {
+        this.opened = !this.opened;
+      }
+    });
+  </script>
+
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard-layout.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard-layout.html
new file mode 100644
index 0000000000000000000000000000000000000000..e0e8a2b52c38965b78e254cf1c6c0bf4b5c0d4b3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard-layout.html
@@ -0,0 +1,67 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="scrollbar-style.html">
+<link rel="import" href="tensorboard-color.html">
+
+<!--
+Generic layout for a dashboard.
+-->
+<dom-module id="tf-dashboard-layout">
+  <template>
+    <div id="sidebar">
+      <content select=".sidebar"></content>
+    </div>
+
+    <div id="center" class="scrollbar">
+      <content select=".center"></content>
+    </div>
+    <style include="scrollbar-style"></style>
+    <style>
+      #sidebar {
+        width: inherit;
+        height: 100%;
+        overflow: ellipsis;
+        flex-grow: 0;
+        flex-shrink: 0;
+      }
+
+      #center {
+        height: 100%;
+        overflow-y: auto;
+        flex-grow: 1;
+        flex-shrink: 1;
+      }
+
+      .tf-graph-dashboard #center {
+        background: white;
+      }
+
+      :host {
+        display: flex;
+        flex-direction: row;
+        height: 100%;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-dashboard-layout",
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard.html
new file mode 100644
index 0000000000000000000000000000000000000000..475c2cef3bd6c358d15adb09ccdc7790af539fc9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-dashboard.html
@@ -0,0 +1,25 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="tf-dashboard-layout.html">
+<link rel="import" href="tensorboard-color.html">
+<link rel="import" href="dashboard-style.html">
+<link rel="import" href="tf-downloader.html">
+<link rel="import" href="tf-no-data-warning.html">
+
+<script src="tf-dashboard.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-downloader.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-downloader.html
new file mode 100644
index 0000000000000000000000000000000000000000..719142595984e2e529c2b569098efbe5258e6906
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-downloader.html
@@ -0,0 +1,99 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
+<link rel="import" href="../paper-menu/paper-menu.html">
+<link rel="import" href="../paper-item/paper-item.html">
+
+<dom-module id="tf-downloader">
+  <template>
+    <paper-dropdown-menu
+      no-label-float="true"
+      label="run to download"
+      selected-item-label="{{_run}}"
+    >
+      <paper-menu class="dropdown-content">
+        <template is="dom-repeat" items="[[runs]]">
+          <paper-item no-label-float=true>[[item]]</paper-item>
+        </template>
+      </paper-menu>
+    </paper-dropdown-menu>
+    <div class="center">
+      <span>
+        <a
+          download="[[_csvName(_run)]]"
+          href="[[_csvUrl(_run, urlFn)]]"
+          >CSV</a>
+        <a
+          download="[[_jsonName(_run)]]"
+          href="[[_jsonUrl(_run, urlFn)]]"
+          >JSON</a>
+      </span>
+    </div>
+    <style>
+      :host {
+        display: flex;
+        height: 32px;
+      }
+      .center {
+        display: flex;
+        align-self: center;
+      }
+      paper-dropdown-menu {
+        width: 100px;
+        --paper-input-container-label: {
+          font-size: 10px;
+        }
+        --paper-input-container-input: {
+          font-size: 10px;
+        }
+      }
+      a {
+        font-size: 10px;
+        border-radius: 3px;
+        border: 1px solid #EEE;
+      }
+      paper-input {
+        font-size: 22px;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-downloader",
+      properties: {
+        _run: String,
+        runs: Array,
+        tag: String,
+        urlFn: Function,
+      },
+      _csvUrl: function(_run, urlFn) {
+        return urlFn(this.tag, _run) + "&format=csv";
+      },
+      _jsonUrl: function(_run, urlFn) {
+        return urlFn(this.tag, _run);
+      },
+      _csvName: function(_run) {
+        return "run_" + _run + ",tag_" + this.tag + ".csv";
+      },
+      _jsonName: function(_run) {
+        return "run-" + _run + "-tag-" + this.tag + ".json";
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-multi-checkbox-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox-demo.html
similarity index 94%
rename from tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-multi-checkbox-demo.html
rename to tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox-demo.html
index 4c2fe55eba249f70233be1cc8631b3235d31807e..d0f5aa6f27d7cf5351c5c50fc3be693ce1bd39d4 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-multi-checkbox-demo.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox-demo.html
@@ -18,10 +18,8 @@ limitations under the License.
 
 <html>
 <head>
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../../tf-imports/d3.html">
-<link rel="import" href="../tf-multi-checkbox.html">
-<link rel="import" href="../../tf-color-scale/tf-color-scale.html">
+<link rel="import" href="../tf-color-scale/tf-color-scale.html">
+<link rel="import" href="tf-multi-checkbox.html">
 
 </head>
 <body>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.html
new file mode 100644
index 0000000000000000000000000000000000000000..8a56616f820f19c15a0097051abfaad929332d65
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.html
@@ -0,0 +1,160 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../iron-icons/iron-icons.html">
+<link rel="import" href="../paper-checkbox/paper-checkbox.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../paper-input/paper-input.html">
+<link rel="import" href="../tf-storage/tf-storage.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="scrollbar-style.html">
+<link rel="import" href="run-color-style.html">
+
+<!--
+tf-multi-checkbox creates a list of checkboxes that can be used to toggle on or off
+a large number of values. Each checkbox displays a name, and may also have an
+associated tooltip value. Checkboxes can be highlighted, hidden, and re-ordered.
+
+tf-multi-checkbox assumes that the names may be very long compared to the width
+of the checkbox, and the number of names may also be very large, and works to
+handle these situations gracefully.
+-->
+<dom-module id="tf-multi-checkbox">
+  <style include="scrollbar-style"></style>
+  <style include="run-color-style"></style>
+
+  <template>
+      <paper-input
+        id="runs-regex"
+        no-label-float
+        label="Write a regex to filter runs"
+        value="[[regexInput]]"
+        on-bind-value-changed="_debouncedRegexChange"
+      ></paper-input>
+    <div id="outer-container" class="scrollbar">
+      <template
+        is="dom-repeat"
+        items="[[namesMatchingRegex]]"
+      >
+        <div
+          class="run-row"
+        >
+          <div class="icon-container checkbox-container vertical-align-container">
+            <paper-checkbox
+              class="checkbox vertical-align-center"
+              name="[[item]]"
+              checked$="[[_isChecked(item, runSelectionState.*)]]"
+              on-change="_checkboxChange"
+            ></paper-checkbox>
+
+          </div>
+          <div class="icon-container isolator-container vertical-align-container">
+            <paper-icon-button
+              icon="radio-button-unchecked"
+              class="isolator vertical-align-center"
+              on-tap="_isolateRun"
+              name="[[item]]"
+            ></paper-icon-button>
+          </div>
+          <div class="item-label-container">
+            <span>[[item]]</span>
+          </div>
+        </div>
+      </template>
+    </div>
+  <style>
+    paper-input {
+      --paper-input-container-focus-color: var(--tb-orange-strong);
+      --paper-input-container-input: {
+        font-size: 14px;
+      };
+      --paper-input-container-label: {
+        font-size: 14px;
+      };
+    }
+    :host {
+      display: flex;
+      flex-direction: column;
+      height: 100%;
+    }
+    #outer-container {
+      overflow-y: auto;
+      overflow-x: hidden;
+      width: 100%;
+      height: 0; /* Quirk to make firefox add scrolling instead of expand div */
+      flex-grow: 1;
+      flex-shrink: 1;
+      word-wrap: break-word;
+    }
+    .run-row {
+      padding-top: 5px;
+      padding-bottom: 5px;
+      display: flex;
+      flex-direction: row;
+      font-size: 13px;
+    }
+    .icon-container {
+      flex-grow: 0;
+      flex-shrink: 0;
+      padding-left: 2px;
+    }
+    .checkbox {
+      padding-left: 2px;
+      width: 18px;
+      height: 18px;
+    }
+    .isolator {
+      width: 18px;
+      height: 18px;
+      padding: 0px;
+    }
+    .isolator-container {
+      padding-left: 6px;
+      padding-right: 3px;
+    }
+    .checkbox-container {
+      padding-left: 2px;
+    }
+    .item-label-container {
+      padding-left: 5px;
+      flex-grow: 1;
+      flex-shrink: 1;
+      width: 0px; /* hack to get the flex-grow to work properly */
+    }
+    .tooltip-value-container {
+      display: flex;
+      justify-content: center;
+      flex-grow: 0;
+      flex-shrink: 0;
+      text-align:right;
+      padding-left: 2px;
+    }
+    .vertical-align-container {
+      display: flex;
+      justify-content: center;
+    }
+    .vertical-align-container .vertical-align-center {
+      align-self: center;
+    }
+    .vertical-align-container .vertical-align-top {
+      align-self: start;
+    }
+  </style>
+  </template>
+  <script src="tf-multi-checkbox-bundle.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.ts
new file mode 100644
index 0000000000000000000000000000000000000000..44a14a21cfeb2ae75804d9803035b76fa8e29d68
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-multi-checkbox.ts
@@ -0,0 +1,206 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as _ from 'lodash';
+import * as storage from '../tf-storage/storage';
+
+Polymer({
+  is: 'tf-multi-checkbox',
+  properties: {
+    names: {
+      type: Array,
+      value: function() {
+        return [];
+      },
+    },  // All the runs in consideration
+    regexInput: {
+      type: String,
+      value: storage.getStringInitializer('regexInput', ''),
+      observer: '_regexInputObserver',
+    },  // Regex for filtering the runs
+    regex: {type: Object, computed: '_makeRegex(regexInput)'},
+    namesMatchingRegex: {
+      type: Array,
+      computed: 'computeNamesMatchingRegex(names.*, regex)'
+    },  // Runs that match the regex
+    runSelectionState: {
+      // if a run is explicitly enabled, True, if explicitly disabled, False.
+      // if undefined, default value (enable for first k runs, disable after).
+      type: Object,
+      value: storage.getObjectInitializer('runSelectionState', {}),
+      observer: '_storeRunToIsCheckedMapping',
+    },
+    // (Allows state to persist across regex filtering)
+    outSelected: {
+      type: Array,
+      notify: true,
+      computed: 'computeOutSelected(namesMatchingRegex.*, runSelectionState.*)'
+    },
+    colorScale: {
+      type: Object,
+      observer: 'synchronizeColors',
+    },  // map from run name to css class
+    maxRunsToEnableByDefault: {
+      // When TB first loads, if it has k or fewer runs, they are all enabled
+      // by default. If there are more, then they are all disabled.
+      type: Number,
+      value: 40,
+    },
+    _debouncedRegexChange: {
+      type: Object,
+      // Updating the regex can be slow, because it involves updating styles
+      // on a large number of Polymer paper-checkboxes. We don't want to do
+      // this while the user is typing, as it may make a bad, laggy UI.
+      // So we debounce the updates that come from user typing.
+      value: function() {
+        const _this = this;
+        var debounced = _.debounce(function(r) {
+          _this.regexInput = r;
+        }, 150, {leading: false});
+        return function() {
+          var r = this.$$('#runs-regex').value;
+          if (r == '') {
+            // If the user cleared the field, they may be done typing, so
+            // update more quickly.
+            this.async(function() {
+              _this.regexInput = r;
+            }, 30);
+          } else {
+            debounced(r);
+          };
+        };
+      },
+    },
+  },
+  listeners: {
+    'dom-change': 'synchronizeColors',
+  },
+  observers: [
+    '_setIsolatorIcon(runSelectionState, names)',
+  ],
+  _storeRunToIsCheckedMapping:
+      storage.getObjectObserver('runSelectionState', {}),
+  _makeRegex: function(regex) {
+    try {
+      return new RegExp(regex)
+    } catch (e) {
+      return null;
+    }
+  },
+  _setIsolatorIcon: function() {
+    var runMap = this.runSelectionState;
+    var numChecked = _.filter(_.values(runMap)).length;
+    var buttons =
+        Array.prototype.slice.call(this.querySelectorAll('.isolator'));
+
+    buttons.forEach(function(b) {
+      if (numChecked === 1 && runMap[b.name]) {
+        b.icon = 'radio-button-checked';
+      } else {
+        b.icon = 'radio-button-unchecked';
+      }
+    });
+  },
+  computeNamesMatchingRegex: function(__, ___) {
+    var regex = this.regex;
+    return this.names.filter(function(n) {
+      return regex == null || regex.test(n);
+    });
+  },
+  computeOutSelected: function(__, ___) {
+    var runSelectionState = this.runSelectionState;
+    var num = this.maxRunsToEnableByDefault;
+    var allEnabled = this.namesMatchingRegex.length <= num;
+    return this.namesMatchingRegex.filter(function(n, i) {
+      return runSelectionState[n] == null ? allEnabled : runSelectionState[n];
+    });
+  },
+  synchronizeColors: function(e) {
+    if (!this.colorScale) return;
+
+    this._setIsolatorIcon();
+
+    var checkboxes =
+        Array.prototype.slice.call(this.querySelectorAll('paper-checkbox'));
+    var scale = this.colorScale;
+    checkboxes.forEach(function(p) {
+      var color = scale.scale(p.name);
+      p.customStyle['--paper-checkbox-checked-color'] = color;
+      p.customStyle['--paper-checkbox-checked-ink-color'] = color;
+      p.customStyle['--paper-checkbox-unchecked-color'] = color;
+      p.customStyle['--paper-checkbox-unchecked-ink-color'] = color;
+    });
+    var buttons =
+        Array.prototype.slice.call(this.querySelectorAll('.isolator'));
+    buttons.forEach(function(p) {
+      var color = scale.scale(p.name);
+      p.style['color'] = color;
+    });
+    // The updateStyles call fails silently if the browser doesn't have focus,
+    // e.g. if TensorBoard was opened into a new tab that isn't visible.
+    // So we wait for requestAnimationFrame.
+    var _this = this;
+    window.requestAnimationFrame(function() {
+      _this.updateStyles();
+    });
+  },
+  _isolateRun: function(e) {
+    // If user clicks on the label for one run, enable it and disable all other
+    // runs.
+
+    var name = (Polymer.dom(e) as any).localTarget.name;
+    var selectionState = {};
+    this.names.forEach(function(n) {
+      selectionState[n] = n == name;
+    });
+    this.runSelectionState = selectionState;
+  },
+  _checkboxChange: function(e) {
+    var target = (Polymer.dom(e) as any).localTarget;
+    this.runSelectionState[target.name] = target.checked;
+    // n.b. notifyPath won't work because run names may have periods.
+    this.runSelectionState = _.clone(this.runSelectionState);
+  },
+  _isChecked: function(item, outSelectedChange) {
+    return this.outSelected.indexOf(item) != -1;
+  },
+  _regexInputObserver: storage.getStringObserver('regexInput', ''),
+  toggleAll: function() {
+    var _this = this;
+    var anyToggledOn = this.namesMatchingRegex.some(function(n) {
+      return _this.runSelectionState[n]
+    });
+
+
+    var runSelectionStateIsDefault =
+        Object.keys(this.runSelectionState).length == 0;
+
+    var defaultOff =
+        this.namesMatchingRegex.length > this.maxRunsToEnableByDefault;
+    // We have runs toggled either if some were explicitly toggled on, or if
+    // we are in the default state, and there are few enough that we default
+    // to toggling on.
+    anyToggledOn = anyToggledOn || runSelectionStateIsDefault && !defaultOff;
+
+    // If any are toggled on, we turn everything off. Or, if none are toggled
+    // on, we turn everything on.
+
+    var newRunsDisabled = {};
+    this.names.forEach(function(n) {
+      newRunsDisabled[n] = !anyToggledOn;
+    });
+    this.runSelectionState = newRunsDisabled;
+  },
+});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-no-data-warning.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-no-data-warning.html
new file mode 100644
index 0000000000000000000000000000000000000000..c90efac1d6b58debc6a39ae4ffafaeb3fb093da1
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-no-data-warning.html
@@ -0,0 +1,129 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+
+<!--
+Display a warning when there is no data found.
+-->
+<dom-module id="tf-no-data-warning">
+  <template>
+    <template is="dom-if" if="[[showWarning]]">
+      <div class="warning">
+        <template is="dom-if" if="[[_isGraph(dataType)]]">
+          <h3>
+            No graph definition files were found.
+          </h3>
+          <p>
+            To store a graph, create a
+            <code>tf.summary.FileWriter</code>
+            and pass the graph either via the constructor, or by calling its
+            <code>add_graph()</code> method.
+            You may want to check out the
+            <a href="https://www.tensorflow.org/get_started/graph_viz">
+              graph visualizer tutorial
+            </a>.
+          </p>
+        </template>
+        <template is="dom-if" if="[[_isProjector(dataType)]]">
+          <h3>
+            No checkpoint was found.
+          </h3>
+          <p>
+            Probable causes:
+            <ul>
+              <li>
+                No checkpoint has been saved yet. Please refresh the page periodically.
+              </li>
+              <li>
+                You are not saving any checkpoint. To save your model,
+                create a
+                <a href="https://www.tensorflow.org/api_docs/python/tf/train/Saver">
+                  <code>tf.train.Saver</code>
+                </a>
+                and save your model periodically
+                by calling <code>saver.save(session, LOG_DIR/model.ckpt, step)</code>.
+              </li>
+            </ul>
+          </p>
+        </template>
+        <template is="dom-if" if="[[_isOther(dataType)]]">
+          <h3>
+            No <span>[[dataType]]</span> data was found.
+          </h3>
+          <p>
+            Probable causes:
+            <ul>
+              <li>
+                You haven't written any <span>[[dataType]]</span> data
+                to your event files.
+              </li>
+              <li>
+                TensorBoard can't find your event files.
+              </li>
+            </ul>
+          </p>
+        </template>
+        <p>
+          If you're new to using TensorBoard, and want to find out how to add
+          data and set up your event files, check out the
+          <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md">
+            README
+          </a>
+          and perhaps the
+          <a href="https://www.tensorflow.org/get_started/summaries_and_tensorboard">
+            TensorBoard tutorial
+          </a>.
+        </p>
+
+        <p>
+          If you think TensorBoard is configured properly, please see the
+          <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md#my-tensorboard-isnt-showing-any-data-whats-wrong">
+            section of the README devoted to missing data problems
+          </a>
+          and consider filing an issue on GitHub.
+        </p>
+
+      </div>
+    </template>
+    <style>
+      .warning {
+        max-width: 540px;
+        margin: 80px auto 0 auto;
+      }
+    </style>
+  </template>
+
+  <script>
+    Polymer({
+      is: "tf-no-data-warning",
+      properties: {
+        dataType: String,
+        showWarning: Boolean
+      },
+      _isGraph: function(dataType) {
+        return dataType === "graph";
+      },
+      _isProjector: function(dataType) {
+        return dataType === "projector";
+      },
+      _isOther: function(dataType) {
+        return !this._isGraph(dataType) && !this._isProjector(dataType);
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-option-selector.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-option-selector.html
new file mode 100644
index 0000000000000000000000000000000000000000..547a558ad0b5da9305d88d2d678302be1f928f8b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-option-selector.html
@@ -0,0 +1,94 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="tensorboard-color.html">
+
+<!--
+tf-option-selector is a simple component that has buttons as content and
+provides a "selectedId" property that is one of the IDs of the buttons inside it.
+-->
+<dom-module id="tf-option-selector">
+  <template>
+    <div id="wrap">
+      <h3>[[name]]</h3>
+      <div class="content-wrapper"><content></content></div>
+    </div>
+    <style>
+      .content-wrapper ::content > * {
+        width: 30%;
+        font-size: 13px;
+        background: none;
+        margin-top: 10px;
+        color: var(--tb-ui-dark-accent);
+      }
+
+      .content-wrapper ::content :first-of-type {
+        margin-left: 0;
+      }
+
+      .content-wrapper ::content .selected {
+        background-color: var(--tb-ui-dark-accent);
+        color: white!important;
+      }
+
+      h3 {
+        color: var(--paper-grey-800);
+        margin: 0;
+        font-weight: normal;
+        font-size: 14px;
+        margin-bottom: 5px;
+        display: block;
+        pointer-events: none;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-option-selector",
+      properties: {
+        name: String,
+        selectedId: {
+          type: String,
+          notify: true,
+          observer: '_selectedIdChanged'
+        }
+      },
+      attached: function() {
+        this.async(function() {
+          this.getEffectiveChildren().forEach(function(node) {
+            this.listen(node, 'tap', '_selectTarget');
+          }.bind(this));
+        });
+      },
+      _selectTarget: function(e) {
+        this.selectedId = e.currentTarget.id;
+      },
+      _selectedIdChanged: function() {
+        var selected = this.queryEffectiveChildren('#' + this.selectedId);
+        if (!selected) {
+          return;
+        }
+
+        this.getEffectiveChildren().forEach(function(node) {
+          node.classList.remove("selected");
+        });
+        selected.classList.add("selected");
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-panes-helper.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-panes-helper.html
new file mode 100644
index 0000000000000000000000000000000000000000..155259d3294bd1caf5cc59f91c56f304d12091a0
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-panes-helper.html
@@ -0,0 +1,352 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="tf-collapsable-pane.html">
+<link rel="import" href="tf-no-data-warning.html">
+<link rel="import" href="tf-chart-scaffold.html">
+
+<!--
+tf-panes-helper is a component that renders the contents of TensorBoard pages.
+It renders a tf-collapsable-pane for each category. Inside each category, the
+provided content template is rendered repeatedly for each tag within that
+category.
+
+This helper also incorporates an expand button and data download utility for
+each card.
+
+To use it, just specify a template inside tf-panes-helper that contains the
+code that will be replicated for each tag.
+
+<tf-panes-helper
+  categories="[[categories]]"
+  data-type="type"
+  data-provider="[[provider]]"
+  run2tag="[[run2tag]]"
+  selected-runs="[[selectedRuns]]"
+  >
+  <template>
+    <Code instantiated for each card>
+  </template>
+</tf-panes-helper>
+
+If you want for the template to be replicated for each tag and run, not only for
+each tag, you can set the repeatForRuns property to true.
+
+You can also set the showDownloadLinks property, which will show a menu with
+options to download JSON and CSV data. For this, you must also set the
+downloadLinkUrlFunction property to an appropriate value.
+
+@element tf-panes-helper
+-->
+<dom-module id="tf-panes-helper">
+  <template>
+    <content></content> <!-- User template will be put here -->
+    <tf-no-data-warning
+      data-type="[[dataType]]"
+      show-warning="[[dataNotFound]]"
+      ></tf-no-data-warning>
+
+    <template is="dom-repeat" items="[[categories]]" as="category">
+      <tf-collapsable-pane
+        name="[[category.name]]"
+        count="[[_count(category.tags, selectedRuns.*)]]"
+        >
+        <div class="layout horizontal wrap">
+          <template is="dom-repeat" items="[[_categoryCards(category, selectedRuns.*, run2tag.*)]]">
+              <div class="card">
+                <div class="card-title-container" style="border-color: [[_titleBorderColor(item.run)]]">
+                  <div class="card-title" inner-h-t-m-l="[[_break(item.tag)]]"></div>
+                  <template is="dom-if" if="[[repeatForRuns]]">
+                    <div class="card-subtitle" title="[[item.run]]">[[item.run]]</div>
+                  </template>
+                </div>
+                <div class="card-content">
+                  <tf-chart-scaffold
+                    tag="[[item.tag]]"
+                    data-provider="[[dataProvider]]"
+                    visible-series="[[item.runs]]"
+                    on-ready="_instantiateTemplate"
+                    >
+                    <!-- Instantiated template will be put here -->
+                  </tf-chart-scaffold>
+                </div>
+                <div class="card-bottom-row">
+                  <paper-icon-button
+                    class="expand-button"
+                    icon="fullscreen"
+                    on-tap="_toggleExpanded"
+                    ></paper-icon-button>
+                  <template is="dom-if" if="[[showDownloadLinks]]">
+                    <tf-downloader
+                      runs="[[item.runs]]"
+                      tag="[[item.tag]]"
+                      url-fn="[[downloadLinkUrlFunction]]"
+                      >
+                    </tf-downloader>
+                  </template>
+                </div>
+              </div>
+          </template>
+        </div>
+      </tf-collapsable-pane>
+    </template>
+
+    <style>
+      .card {
+        height: var(--card-height, 200px);
+        width: var(--card-width, 300px);
+        display: flex;
+        flex-direction: column;
+        margin: 5px;
+        padding: var(--card-padding, 0 30px 35px 0);
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        position: relative;
+      }
+
+      .card-expanded {
+        height: var(--card-expanded-height, 400px);
+        width: var(--card-expanded-width, 100%);
+      }
+
+      .card-title, .card-subtitle {
+        flex-grow: 0;
+        flex-shrink: 0;
+        font-size: 14px;
+        text-overflow: ellipsis;
+        overflow: hidden;
+      }
+
+      .card-subtitle {
+        font-size: 12px;
+      }
+
+      .card-content {
+        flex-grow: 1;
+        flex-shrink: 1;
+        display: flex;
+        margin-top: 10px;
+      }
+
+      .card-bottom-row {
+        position: absolute;
+        left: 0px;
+        bottom: 0px;
+        width: 100%;
+        display: flex;
+        flex-direction: row;
+        justify-content: space-between;
+        pointer-events: none;
+      }
+
+      .card-title-container {
+        border-left: 4px solid;
+        padding-left: 5px;
+      }
+
+      .expand-button {
+        color: #2196F3;
+        width: 32px;
+        height: 32px;
+        padding: 4px;
+        border-radius: 100%;
+        pointer-events: auto;
+        display: var(--show-expand-button, block);
+      }
+
+      .card-expanded .expand-button {
+        background: var(--tb-ui-light-accent);
+      }
+
+      tf-downloader {
+        margin-right: 30px;
+        pointer-events: auto;
+      }
+
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-panes-helper",
+      properties: {
+        /**
+         * Categories that separate the template instances. Each category will
+         * be given its own collapsible pane. The category must be an array of
+         * objects, each with a 'name' property and a 'tags' array of strings.
+         */
+        categories: Array,
+
+        /**
+         * Input of the colors that are used for the user's runs.
+         */
+        colorScale: Object,
+
+        /**
+         * The name of the data type that is used by this dashboard. This will
+         * be used to display what is missing when there is no data available.
+         */
+        dataType: String,
+
+        /**
+         * The function that requests and returns a promise with the data of the
+         * required type for the templates from the backend.
+         */
+        dataProvider: Object,
+
+        /**
+         * If false, instantiates one template for each tag and calls
+         * setVisibleSeries on the first element of the template with all valid
+         * runs the tag has. If true, instantiates one template for each run of
+         * each tag, and calls setVisibleSeries of the first element of the
+         * instantiated template with just the one run.
+         */
+        repeatForRuns: {
+          type: Boolean,
+          value: false
+        },
+
+        /**
+         * Map from runs to the valid tags that have them.
+         */
+        run2tag: Object,
+
+        /**
+         * Array with the runs that are selected by the user (i.e. valid to be
+         * displayed).
+         */
+        selectedRuns: Array,
+
+        /**
+         * If true, shows a menu with download links for the template data.
+         * If this is set to true, urlFn must also be provided.
+         */
+        showDownloadLinks: Boolean,
+
+        /**
+         * Function that returns the route to get data to download. Must be
+         * provided if showDownloadLinks is enabled.
+         */
+        downloadLinkUrlFunction: Function,
+        _contentTemplate: {
+          type: Object,
+          value: null
+        },
+        _stampedTemplates: {
+          type: Array,
+          value: function() { return [] }
+        }
+      },
+      behaviors: [
+        Polymer.Templatizer,
+      ],
+
+      /**
+       * Initializes the Polymer.Templatizer behavior with the template supplied
+       * by the user. With this, all calls to this.stamp() will produce an
+       * instance of the user template.
+       */
+      _initTemplatizer: function() {
+        if (!this._contentTemplate) {
+          // First template is used as the content.
+          this._contentTemplate = Polymer.dom(this).querySelector('template');
+          this.templatize(this._contentTemplate);
+        }
+      },
+
+      /**
+       * Called every time a tf-chart-scaffold is ready, stamps the user
+       * template inside the scaffold element (before it is attached) and
+       * stores the stamped template in an array to use for data binding
+       * (forwardParentProp/Path).
+       */
+      _instantiateTemplate: function(e) {
+        var scaffold = e.target;
+        this._initTemplatizer();
+        var instance = this.stamp();
+        this._stampedTemplates.push(instance);
+        Polymer.dom(scaffold).appendChild(instance.root);
+      },
+      _toggleExpanded: function(e) {
+        var currentTarget = Polymer.dom(e.currentTarget);
+        var card = currentTarget.node.closest('.card');
+        var scaffold = card.querySelector('tf-chart-scaffold');
+        card.classList.toggle('card-expanded');
+        scaffold.chart().redraw();
+      },
+      _count: function(tags) {
+        if (!this.repeatForRuns) {
+          return tags.length;
+        }
+
+        var targetTags = d3.set(tags);
+        var count = 0;
+        this.selectedRuns.forEach(function(r) {
+          this.run2tag[r].forEach(function(t) {
+            if (targetTags.has(t)) {
+              count++;
+            }
+          });
+        }.bind(this));
+        return count;
+      },
+      _categoryCards: function(category) {
+        var cards = [];
+        category.tags.forEach(function(tag) {
+          var runs = this.selectedRuns.filter(function(r) {
+            return this.run2tag[r] && this.run2tag[r].indexOf(tag) !== -1;
+          }.bind(this));
+
+          if (this.repeatForRuns) {
+            runs.forEach(function(run) {
+              cards.push({tag: tag, run: run, runs: [run]});
+            });
+          } else {
+            cards.push({tag: tag, runs: runs});
+          }
+        }.bind(this));
+
+        return cards;
+      },
+      _titleBorderColor: function(run) {
+        return this.repeatForRuns ? this.colorScale.scale(run) : 'white';
+      },
+
+      /*
+       * Polymer data binding forwarding functions. Check the
+       * Polymer.Templatizer documentation for more information.
+       */
+
+      _forwardParentProp: function(property, value) {
+        this._stampedTemplates.forEach(function(instance) {
+          instance[property] = value;
+        });
+      },
+      _forwardParentPath: function(path, value) {
+        this._stampedTemplates.forEach(function(instance) {
+          instance.notifyPath(path, value, true);
+        });
+      },
+      // TODO(renatoutsch): implement the instance forwarding for two-way data
+      // binding.
+      // Add breaks to input so it will wrap nicely
+      _break: function(ipt) {
+        return ipt.replace(/([\/_-])/g, "$1<wbr>")
+      },
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-regex-group-demo.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group-demo.html
similarity index 88%
rename from tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-regex-group-demo.html
rename to tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group-demo.html
index eaf10999bfb65db9d0aa08a52a6c004674872b0b..3565fec17912437897ec6b3ec509d48fed10645a 100644
--- a/tensorflow/tensorboard/components/tf_dashboard_common/demo/tf-regex-group-demo.html
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group-demo.html
@@ -18,8 +18,7 @@ limitations under the License.
 
 <html>
  <head>
-   <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-   <link rel="import" href="../tf-regex-group.html">
+   <link rel="import" href="tf-regex-group.html">
  </head>
  <body>
   <style>
@@ -43,7 +42,4 @@ limitations under the License.
     </template>
   </template>
  </body>
- <script>
-
- </script>
 </html>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.html
new file mode 100644
index 0000000000000000000000000000000000000000..e68b306ee33b5e57a1125c7cd9d1b687ae16202e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.html
@@ -0,0 +1,99 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-icon-button/paper-icon-button.html">
+<link rel="import" href="../iron-icons/iron-icons.html">
+<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
+<link rel="import" href="../paper-input/paper-input.html">
+<link rel="import" href="../tf-storage/tf-storage.html">
+
+<!--
+`tf-regex-group` provides an input component for a group of regular expressions.
+
+Example:
+  <tf-regex-group regexes="{{regexes}}"></tf-regex-group>
+
+It contains a series of regular expression input fields. From this, it computes
+`regexes', an array in which every element is either a string representing a
+valid, nonempty regular expression, or the value `null`
+
+Public Properties:
+`regexes` a readonly, notifying array of strings, where each string is a regex
+
+It maintains an invariant that the final regex should always be an empty string,
+so the user can easily add more regular expressions. It does this by adding
+a new empty regex when the final one is nonempty.
+
+Pressing "enter" moves focus to the next regex (or just blurs if there are no
+more regexes).
+-->
+<dom-module id="tf-regex-group">
+  <template>
+    <div class="regex-list">
+      <template is="dom-repeat" items="{{rawRegexes}}">
+        <div class="regex-line">
+          <paper-input
+            id="text-input"
+            class="regex-input"
+            label="Write a regex to create a tag group"
+            no-label-float
+            value="{{item.regex}}"
+            invalid="[[!item.valid]]"
+            on-keyup="moveFocus"
+          ></paper-input>
+          <paper-icon-button
+            icon="close"
+            class="delete-button"
+            aria-label="Delete Regex"
+            tabindex="0"
+            on-tap="deleteRegex"
+          ></paper-icon-button>
+        </div>
+        <style>
+          .regex-input {
+            width: 250px;
+            display: inline-block;
+            margin-left: -3px;
+          }
+
+          .delete-button {
+            color: var(--paper-grey-700);
+            width: 40px;
+            height: 40px;
+            margin-right: -10px;
+          }
+
+          .regex-list {
+            margin-bottom: 10px;
+          }
+
+          paper-input {
+            --paper-input-container-focus-color: var(--tb-orange-strong);
+            --paper-input-container-input: {
+              font-size: 14px;
+            };
+            --paper-input-container-label: {
+              font-size: 14px;
+            };
+          }
+        </style>
+      </template>
+    </div>
+  </template>
+  <script src="tf-regex-group-bundle.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.ts b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.ts
new file mode 100644
index 0000000000000000000000000000000000000000..92a0eb6a0b9d0738369ff89356e3c49336e2fb27
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-regex-group.ts
@@ -0,0 +1,86 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import * as storage from '../tf-storage/storage';
+
+Polymer({
+  is: 'tf-regex-group',
+  properties: {
+    rawRegexes: {
+      type: Array,
+      value: storage.getObjectInitializer(
+          'rawRegexes', [{regex: '', valid: true}]),
+    },
+    regexes:
+        {type: Array, computed: 'usableRegexes(rawRegexes.*)', notify: true},
+  },
+  observers: [
+    'addNewRegexIfNeeded(rawRegexes.*)',
+    'checkValidity(rawRegexes.*)',
+    '_uriStoreRegexes(rawRegexes.*)',
+  ],
+  _uriStoreRegexes:
+      storage.getObjectObserver('rawRegexes', [{regex: '', valid: true}]),
+  checkValidity: function(x) {
+    var match = x.path.match(/rawRegexes\.(\d+)\.regex/);
+    if (match) {
+      var idx = match[1];
+      this.set('rawRegexes.' + idx + '.valid', this.isValid(x.value));
+    }
+  },
+  isValid: function(s) {
+    try {
+      new RegExp(s);
+      return true;
+    } catch (e) {
+      return false;
+    }
+  },
+  usableRegexes: function(regexes) {
+    var isValid = this.isValid;
+    return regexes.base
+        .filter(function(r) {
+          // Checking validity here (rather than using the data property)
+          // is necessary because otherwise we might send invalid regexes due
+          // to the fact that this function can call before the observer does
+          return r.regex !== '' && isValid(r.regex);
+        })
+        .map(function(r) {
+          return r.regex;
+        });
+  },
+  addNewRegexIfNeeded: function() {
+    var last = this.rawRegexes[this.rawRegexes.length - 1];
+    if (last.regex !== '') {
+      this.push('rawRegexes', {regex: '', valid: true});
+    }
+  },
+  deleteRegex: function(e) {
+    if (this.rawRegexes.length > 1) {
+      this.splice('rawRegexes', e.model.index, 1);
+    }
+  },
+  moveFocus: function(e) {
+    if (e.keyCode === 13) {
+      var idx = e.model.index;
+      var inputs = Polymer.dom(this.root).querySelectorAll('.regex-input');
+      if (idx < this.rawRegexes.length - 1) {
+        (inputs[idx + 1] as any).$.input.focus();
+      } else {
+        (document.activeElement as HTMLElement).blur();
+      }
+    }
+  }
+});
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-run-selector.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-run-selector.html
new file mode 100644
index 0000000000000000000000000000000000000000..e3d8a91fd0c2e64650ebbac0fcb6448ffadc9f52
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-run-selector.html
@@ -0,0 +1,188 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../paper-button/paper-button.html">
+<link rel="import" href="../paper-dialog/paper-dialog.html">
+<link rel="import" href="tf-multi-checkbox.html">
+<link rel="import" href="scrollbar-style.html">
+
+<!--
+tf-run-selector creates a set of checkboxes to display which runs are selected.
+It also displays tooltips.
+
+Properties in:
+- runs: Array of strings representing the runs that may be selected
+- colorScale: a TF.ColorScale mapping run names to colors
+
+Properties out:
+- outSelected: The array of run names that are currently checked by the user.
+
+-->
+<dom-module id="tf-run-selector">
+  <template>
+    <paper-dialog with-backdrop id="logdir-dialog">
+      <h2>logdir</h2>
+      <div inner-h-t-m-l="{{_breakString(logdir)}}"></div>
+    </paper-dialog>
+    <div id="top-text">
+      <h3 id="tooltip-help" class="tooltip-container">
+        Runs
+      </h3>
+    </div>
+    <tf-multi-checkbox
+      id="multiCheckbox"
+      names="[[runs]]"
+      out-selected="{{outSelected}}"
+      color-scale="[[colorScale]]"
+    ></tf-multi-checkbox>
+    <paper-button
+      class="x-button"
+      id="toggle-all"
+      on-tap="_toggleAll"
+    >
+    Toggle All Runs
+    </paper-button>
+    <template
+      is="dom-if"
+      if="[[logdir]]">
+      <div id="logdir">
+        <span id="clipped-logdir" inner-h-t-m-l="[[_clippedLogdir]]"></span><!--
+          We use HTML comments to remove spaces before the ellipsis.
+        --><template
+                     is="dom-if"
+                     if="[[_shouldShowExpandLogdirButton(logdir, _logdirClipLength)]]"><!--
+          --><a href="" on-click="_openLogdirDialog">…</a>
+        </template>
+      </div>
+    </template>
+    <style>
+      :host {
+        display: flex;
+        flex-direction: column;
+        padding-bottom: 10px;
+        box-sizing: border-box;
+      }
+      #top-text {
+        width: 100%;
+        flex-grow: 0;
+        flex-shrink: 0;
+        padding-right: 16px;
+        box-sizing: border-box;
+        color: var(--paper-grey-800);
+      }
+      tf-multi-checkbox {
+        display: flex;
+        flex-grow: 1;
+        flex-shrink: 1;
+      }
+      .x-button {
+        font-size: 13px;
+        background-color: var(--tb-ui-light-accent);
+        color: var(--tb-ui-dark-accent);
+      }
+      #tooltip-help {
+        color: var(--paper-grey-800);
+        margin: 0;
+        font-weight: normal;
+        font-size: 14px;
+        margin-bottom: 5px;
+      }
+      paper-button {
+        margin-left: 0;
+      }
+      #logdir {
+        color: var(--tb-ui-dark-accent);
+        font-size: 13px;
+        margin: 5px 0 0 0;
+        max-width: 288px;
+      }
+    </style>
+  </template>
+  <script>
+  Polymer({
+    is: "tf-run-selector",
+    properties: {
+      backend: Object,
+      outSelected: {type: Array, notify: true},
+      // runs: an array of strings, representing the run names that may be chosen
+      runs: Array,
+      colorScale: Object, // TF.ColorScale
+      logdir: {
+        type: String,
+        notify: true,
+      },
+      // This is the potentially clipped portion of the logdir we show at the bottom of the sidebar.
+      _clippedLogdir: {
+        type: String,
+      },
+      _logdirClipLength: {
+        type: Number,
+        value: 250,
+        readOnly: true,
+      },
+    },
+    observers: [
+      "_onBackendUpdate(backend)",
+      "_logdirSet(logdir)",
+    ],
+    _toggleAll: function() {
+      this.$.multiCheckbox.toggleAll();
+    },
+    // Break the string at natural points, including commas, equals, and slashes
+    _breakString: function(originalString) {
+      return originalString.replace(/([\/=\-_,])/g, "$1<wbr>");
+    },
+    _onBackendUpdate: function(backend) {
+      if (backend === undefined) {
+        return;
+      }
+
+      // When the backend is set, the selector can request the logdir.
+      backend.logdir().then(logdirObject => {
+        this.set('logdir', logdirObject.logdir);
+      }).catch(e => {
+        // Fetching the logdir failed. Prevent the exception from logging to
+        // console. The console already logs a 404 network event.
+      });
+    },
+    _logdirSet: function(logdir) {
+      if (logdir === undefined) {
+        // The logdir has not been set yet.
+        return;
+      }
+
+      var lineBrokenText;
+      if (logdir.length > this._logdirClipLength) {
+        // Clip the logdir to avoid blocking the runs selector. Let the user view a more full
+        // version of the logdir.
+        lineBrokenText = this._breakString(logdir.substring(0, this._logdirClipLength));
+      } else {
+        lineBrokenText = this._breakString(logdir);
+      }
+      this.set('_clippedLogdir', lineBrokenText);
+    },
+    _openLogdirDialog: function(event) {
+      event.preventDefault();
+      this.$$('#logdir-dialog').open();
+    },
+    _shouldShowExpandLogdirButton(logdir, _logdirClipLength) {
+      return logdir && logdir.length > _logdirClipLength;
+    },
+  });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-sidebar-helper.html b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-sidebar-helper.html
new file mode 100644
index 0000000000000000000000000000000000000000..5eb8537040ccef6e8fa76f31c80b85dea795dfdd
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_dashboard_common_d3v4/tf-sidebar-helper.html
@@ -0,0 +1,165 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="tf-categorizer.html">
+<link rel="import" href="tf-run-selector.html">
+
+<!--
+tf-sidebar-helper is a component that renders a sidebar for configuration
+components, like the tf-categorizer and the tf-run-selector. The component can
+also be extended with more options useful to the dashboards.
+
+To use it, create the tf-sidebar-helper with the required properties. To extend
+it with extra configuration components, add them to the element's component:
+
+<tf-sidebar-helper
+  backend: "[[backend]]",
+  categories: "{{outputCategories}}",
+  colorScale: "[[colorScale]]",
+  run2tag: "[[run2tag]]",
+  runs: "[[runs]]",
+  selectedRuns: "{{outSelectedRuns}}",
+  >
+  <div class="extend-first-section">
+    <my options>
+  </div>
+  <div class="sidebar-section">
+    <my options>
+  </div>
+  ...
+</tf-sidebar-helper>
+
+Elements inside the .extend-first-section div will be put on the first section
+of the sidebar, while the rest of the divs will be put after it and before the
+tf-run-selector.
+
+@element tf-sidebar-helper
+-->
+<dom-module id="tf-sidebar-helper">
+  <template>
+    <div class="sidebar-section">
+      <tf-categorizer
+        id="categorizer"
+        tags="[[tags]]"
+        categories="{{categories}}"
+        ></tf-categorizer>
+      <content select=".extend-first-section"></content>
+    </div>
+    <content></content>
+    <div class="sidebar-section">
+      <tf-run-selector
+        id="runSelector"
+        backend="[[backend]]"
+        runs="[[runs]]"
+        color-scale="[[colorScale]]"
+        out-selected="{{selectedRuns}}"
+        ></tf-run-selector>
+    </div>
+    <style include="dashboard-style"></style>
+    <style>
+      :host {
+        display: flex;
+        flex-direction: column;
+        height: 100%;
+      }
+
+      #categorizer {
+        flex-shrink: 0;
+      }
+
+      #runSelector {
+        flex-shrink: 1;
+        flex-grow: 1;
+      }
+
+      .sidebar-section {
+        border-top: solid 1px rgba(0, 0, 0, 0.12);
+        padding: 20px 0px 20px 30px;
+      }
+
+      .sidebar-section:first-child {
+        border: none;
+      }
+
+      .sidebar-section:last-child {
+        flex-grow: 1;
+        display: flex;
+      }
+
+      paper-checkbox {
+        --paper-checkbox-checked-color: var(--tb-ui-dark-accent);
+        --paper-checkbox-unchecked-color: var(--tb-ui-dark-accent);
+        font-size: 14px;
+      }
+    </style>
+  </template>
+  <script>
+    Polymer({
+      is: "tf-sidebar-helper",
+      properties: {
+        /**
+         * The backend object used to issue requests.
+         */
+        backend: Object,
+
+        /**
+         * This is an output of the categories that the user selected to
+         * separate the different tags. Each category here should be given its
+         * own collapsible pane.
+         */
+        categories: {
+          type: Array,
+          notify: true,
+        },
+
+        /**
+         * Input of the colors that are used for the user's runs.
+         */
+        colorScale: Object,
+
+        /**
+         * Map from runs to the valid tags that have them.
+         */
+        run2tag: Object,
+
+        /**
+         * Input of all valid runs that can be selected by the user.
+         */
+        runs: Array,
+
+        /**
+         * Outputs an array with the runs that are selected by the user (i.e.
+         * valid to be displayed).
+         */
+        selectedRuns: {
+          type: Array,
+          notify: true,
+        },
+
+        tags: {
+          type: Array,
+          computed: "_getTags(run2tag.*)"
+        },
+      },
+      _getTags: function() {
+        return _.union.apply(null, _.values(this.run2tag));
+      },
+    })
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD
deleted file mode 100644
index 726fdbca9302d1e6d302966c98321f51e4eac8e6..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/BUILD
+++ /dev/null
@@ -1,60 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "tf_distribution_dashboard",
-    srcs = [
-        "tf-distribution-dashboard.html",
-    ],
-    path = "/tf-distribution-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/vz_distribution_chart",
-        "@org_polymer",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "tf-distribution-dashboard.html",
-    ],
-    destdir = "tf-distribution-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//tensorflow/tensorboard/components/tf_backend:legacy",
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//tensorflow/tensorboard/components/vz_distribution_chart:legacy",
-        "//third_party/javascript/polymer/v1/iron-collapse:lib",
-        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
-        "//third_party/javascript/polymer/v1/paper-styles:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [
-    ],
-    deps = ["//tensorflow/tensorboard/components:common_deps"],
-)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/BUILD
deleted file mode 100644
index 238937c0c2587e5d82b05b5893500d304f05a9aa..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_distribution_dashboard/demo
-webfiles(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-distribution-dashboard/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_distribution_dashboard",
-        "//tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-        "@org_polymer_webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/BUILD
deleted file mode 100644
index 589c1980e41a871dc922175bcdc3995829ea37d1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "data",
-    srcs = glob(["*"]),
-    path = "/tf-distribution-dashboard/demo/data",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ace02adfba51f3397e52f5d4826f74129ffc9fce
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/BUILD
@@ -0,0 +1,41 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_distribution_dashboard_d3v4",
+    srcs = ["tf-distribution-dashboard.html"],
+    path = "/tf-distribution-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/vz_distribution_chart_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-distribution-dashboard",
+    deps = [
+        ":tf_distribution_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/compressedHistograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run1_tag_histo1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/compressedHistograms_run_run1_tag_histo1.json
rename to tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run1_tag_histo1.json
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/compressedHistograms_run_run2_tag_histo1.json b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/compressedHistograms_run_run2_tag_histo1.json
rename to tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo1.json
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/compressedHistograms_run_run2_tag_histo2.json b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo2.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/compressedHistograms_run_run2_tag_histo2.json
rename to tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/compressedHistograms_run_run2_tag_histo2.json
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/logdir
similarity index 100%
rename from tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/logdir
rename to tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/logdir
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/runs.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_distribution_dashboard/demo/data/runs.json
rename to tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/data/runs.json
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/index.html
similarity index 85%
rename from tensorflow/tensorboard/components/tf_distribution_dashboard/demo/index.html
rename to tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/index.html
index 16c9b124e4248c81f473f6f1f76a5b1316e20a0a..5e825f13f5c87f1cc8331575236eaf4deab1a2d8 100644
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/index.html
@@ -16,10 +16,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../tf-distribution-dashboard.html">
-<link rel="import" href="../../paper-styles/typography.html">
+<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../paper-styles/typography.html">
+<link rel="import" href="tf-distribution-dashboard.html">
 
 <title>Distribution Dashboard Demo</title>
 <style>
diff --git a/tensorflow/tensorboard/components/tf_distribution_dashboard/tf-distribution-dashboard.html b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/tf-distribution-dashboard.html
similarity index 94%
rename from tensorflow/tensorboard/components/tf_distribution_dashboard/tf-distribution-dashboard.html
rename to tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/tf-distribution-dashboard.html
index 2da848bd99e5a04a3af2a4a65ddd1fdeaf893063..063bd8d0993fef9d4121389cddcf0b314516cf29 100644
--- a/tensorflow/tensorboard/components/tf_distribution_dashboard/tf-distribution-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4/tf-distribution-dashboard.html
@@ -101,11 +101,15 @@ contains vz-distribution-charts embedded inside tf-panes-helper's.
   </template>
 
   <script>
-    Polymer({
+    TF.Dashboard.TfDistributionDashboard = Polymer({
       is: "tf-distribution-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
       behaviors: [
+        TF.Dashboard.DashboardBehavior("distributions"),
         TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-        TF.Backend.Behavior,
+        TF.Backend.BackendBehavior,
       ],
       properties: {
         backend: Object,
diff --git a/tensorflow/tensorboard/components/tf_globals/BUILD b/tensorflow/tensorboard/components/tf_globals/BUILD
index 21724aa26b5d91cadbfbdef1d86cfb0145eadd0a..7e81163e8010d55a4717d030196f055859bc4b2a 100644
--- a/tensorflow/tensorboard/components/tf_globals/BUILD
+++ b/tensorflow/tensorboard/components/tf_globals/BUILD
@@ -1,6 +1,6 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
@@ -9,7 +9,7 @@ licenses(["notice"])  # Apache 2.0
 
 # TODO(dandelion): Add webfiles support for the test code.
 
-webfiles(
+web_library(
     name = "tf_globals",
     srcs = [
         "tf-globals.html",
@@ -43,7 +43,7 @@ tensorboard_webcomponent_library(
 
 tensorboard_ts_library(
     name = "legacy_ts",
-    srcs = [
-        "globals.ts",
-    ],
+    srcs = ["globals.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
 )
diff --git a/tensorflow/tensorboard/components/tf_globals_d3v4/BUILD b/tensorflow/tensorboard/components/tf_globals_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..49c2a1e57e1ad8ada3d8ef8fb6e891f1d4985f8a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_globals_d3v4/BUILD
@@ -0,0 +1,55 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_globals_d3v4",
+    srcs = [
+        "bundle.js",
+        "tf-globals.html",
+    ],
+    path = "/tf-globals",
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.Globals": ["globals.ts"]},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-globals.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-globals",
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = ["globals.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_globals_d3v4/globals.ts b/tensorflow/tensorboard/components/tf_globals_d3v4/globals.ts
new file mode 100644
index 0000000000000000000000000000000000000000..42c73708cd7843fe5da697ede4db85f4f8296fb9
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_globals_d3v4/globals.ts
@@ -0,0 +1,38 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+
+// The names of TensorBoard tabs.
+export const TABS = [
+  'scalars', 'images', 'audio', 'graphs', 'distributions', 'histograms',
+  'embeddings', 'text'
+];
+
+// If true, TensorBoard stores its hash in the URI state.
+// If false, tab switching in TensorBoard will not update location hash,
+// because hash updates interfere with wct_tests.
+export let USE_HASH = false;
+
+let _fakeHash = '';
+
+export function setFakeHash(h: string) {
+  _fakeHash = h;
+}
+
+export function getFakeHash() {
+  return _fakeHash;
+}
+
diff --git a/tensorflow/tensorboard/components/tf_imports_google/d3.html b/tensorflow/tensorboard/components/tf_globals_d3v4/tf-globals.html
similarity index 93%
rename from tensorflow/tensorboard/components/tf_imports_google/d3.html
rename to tensorflow/tensorboard/components/tf_globals_d3v4/tf-globals.html
index dbfd11aa87e4e243fc73c624edfc0abf53ace248..b0fd74d4f20b680e2d55b3de4ed51a1d35a39882 100644
--- a/tensorflow/tensorboard/components/tf_imports_google/d3.html
+++ b/tensorflow/tensorboard/components/tf_globals_d3v4/tf-globals.html
@@ -15,4 +15,5 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="../d3-library/d3.js"></script>
+<script src="bundle.js"></script>
+
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json b/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json
deleted file mode 100644
index f5ca9aada79aee9929facd68b4737ce58de35378..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/demo/demo_datasets.json
+++ /dev/null
@@ -1,123 +0,0 @@
-[
-  {
-    "name": "Mnist Eval",
-    "path": "mnist_eval.pbtxt"
-  },
-  {
-    "name": "Mnist with summaries (+stats)",
-    "path": "mnist_with_summaries.pbtxt",
-    "runMetadata": [
-      {
-        "tag": "step100",
-        "path": "mnist_with_summaries_step100.pbtxt"
-      },
-      {
-        "tag": "step1000",
-        "path": "mnist_with_summaries_step1000.pbtxt"
-      }
-    ]
-  },
-  {
-    "name": "Mnist Train (with shapes)",
-    "path": "mnist_train_shapes.pbtxt"
-  },
-  {
-    "name": "Inception Train (huge)",
-    "path": "inception_train.pbtxt"
-  },
-  {
-    "name": "Inception Train Eval",
-    "path": "inception_train_eval.pbtxt"
-  },
-  {
-    "name": "Inception Test",
-    "path": "inception_test_eval.pbtxt"
-  },
-  {
-    "name": "PTB Word LSTM Train",
-    "path": "ptb_word_lstm_train.pbtxt"
-  },
-  {
-    "name": "PTB Word LSTM Train Eval",
-    "path": "ptb_word_lstm_train_eval.pbtxt"
-  },
-  {
-    "name": "PTB Word LSTM Test",
-    "path": "ptb_word_lstm_test_eval.pbtxt"
-  },
-  {
-    "name": "Cifar10 Train (+stats)",
-    "path": "cifar10_train.pbtxt",
-    "runMetadata": [
-      {
-        "tag": "step0",
-        "path": "cifar10_train_step0.pbtxt"
-      },
-      {
-        "tag": "step100",
-        "path": "cifar10_train_step100.pbtxt"
-      },
-      {
-        "tag": "step200",
-        "path": "cifar10_train_step200.pbtxt"
-      },
-      {
-        "tag": "step300",
-        "path": "cifar10_train_step300.pbtxt"
-      }
-    ]
-  },
-  {
-    "name": "Cifar10 Multi-GPU Train",
-    "path": "cifar10_multi_gpu_train.pbtxt"
-  },
-  {
-    "name": "Cifar10 Eval (+stats)",
-    "path": "cifar10_eval.pbtxt",
-    "runMetadata": [
-      {
-        "tag": "step0",
-        "path": "cifar10_eval_step0.pbtxt"
-      },
-      {
-        "tag": "step10",
-        "path": "cifar10_eval_step10.pbtxt"
-      },
-      {
-        "tag": "step20",
-        "path": "cifar10_eval_step20.pbtxt"
-      }
-    ]
-  },
-  {
-    "name": "Fatcat LSTM",
-    "path": "fatcat_lstm.pbtxt"
-  },
-  {
-    "name": "Legacy Inception Renamed",
-    "path": "legacy_inception_renamed.pbtxt"
-  },
-  {
-    "name": "Wolfe (Broken)",
-    "path": "wolfe1.pbtxt"
-  },
-  {
-    "name": "Wolfe (Fixed)",
-    "path": "wolfe2.pbtxt"
-  },
-  {
-    "id": "alex",
-    "name": "AlexNet",
-    "path": "alexnet.pbtxt"
-  },
-  {
-    "id": "alexprivate",
-    "name": "AlexNet Private",
-    "path": "alexnet.pbtxt",
-    "private": true
-  },
-  {
-    "name": "TestError404",
-    "path": "nofile"
-  }
-]
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/index.html b/tensorflow/tensorboard/components/tf_graph/demo/index.html
deleted file mode 100644
index c89490f44d429b6fb907a84ff1c6b7228f585ed4..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/demo/index.html
+++ /dev/null
@@ -1,46 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <meta charset="utf-8">
-    <meta name="viewport" content="width=device-width, minimum-scale=1.0, initial-scale=1.0, user-scalable=yes">
-    <title>tf-graph Demo</title>
-    <!-- Libraries that should be imported in TensorBoard when the Graph visualizer ports to TensorBoard -->
-    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <script src="../../es6-promise/promise.min.js"></script>
-    <link rel="import" href="tf-graph-demo.html">
-    <style>
-      html {
-        width: 100%;
-        height: 100%;
-      }
-
-      body {
-        margin: 0;
-        padding: 0;
-        width: 100%;
-        height: 100%;
-      }
-    </style>
-  </head>
-
-  <body unresolved>
-    <tf-graph-demo></tf-graph-demo>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_graph/demo/tf-graph-demo.html b/tensorflow/tensorboard/components/tf_graph/demo/tf-graph-demo.html
deleted file mode 100644
index d5fd41dfebeb61471183ab3de8a6cb239d86983b..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph/demo/tf-graph-demo.html
+++ /dev/null
@@ -1,202 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../../polymer/polymer.html">
-<link rel="import" href="../../tf-graph-board/tf-graph-board.html">
-<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../../tf-graph/tf-graph-controls.html">
-
-<!--
-Element for tf-graph demo page
-
-Example:
-
-<tf-graph-demo></tf-graph-demo>
--->
-<dom-module id="tf-graph-demo">
-<template>
-<style>
-
-:host /deep/ {
-  font-family: 'Roboto', sans-serif;
-}
-
-.main {
-  position: absolute;
-  right: 0;
-  left: 250px;
-  height: 100%;
-}
-
-.side {
-  position: absolute;
-  left: 0;
-  width: 250px;
-  height: 100%;
-  border: 1px solid black;
-  box-sizing: border-box;
-}
-
-.all {
-  position: relative;
-  width: 100%;
-  height: 100%
-}
-
-</style>
-<div class="all">
-  <div class="side">
-    <!-- The observatory header component is injected in during vulcanization
-         and an instance of it is initialized and filled here when the demo
-         app initializes. -->
-    <div id="observatory-header"></div>
-    <tf-graph-controls
-        devices-for-stats="{{_devicesForStats}}"
-        color-by-params="[[colorByParams]]"
-        stats="[[stats]]"
-        color-by="{{colorBy}}"
-        datasets="[[datasets]]"
-        render-hierarchy="[[_renderHierarchy]]"
-        selected-dataset="{{selectedDataset}}"
-        selected-file="{{selectedFile}}"
-        selected-metadata-tag="{{selectedMetadataTag}}"
-        show-session-runs-dropdown="[[showSessionRunsDropdown]]"
-        show-upload-button="[[showUploadButton]]"
-    ></tf-graph-controls>
-    <tf-graph-loader id="loader"
-        datasets="[[datasets]]"
-        selected-dataset="[[selectedDataset]]"
-        selected-metadata-tag="[[selectedMetadataTag]]"
-        selected-file="[[selectedFile]]"
-        out-graph-hierarchy="{{graphHierarchy}}"
-        out-graph="{{graph}}"
-        out-stats="{{stats}}"
-        progress="{{_progress}}"
-        out-hierarchy-params="{{_hierarchyParams}}"
-    ></tf-graph-loader>
-  </div>
-  <div class="main">
-    <tf-graph-board id="graphboard"
-        color-by="[[colorBy]]"
-        color-by-params="{{colorByParams}}"
-        devices-for-stats="[[_devicesForStats]]"
-        graph-hierarchy="[[graphHierarchy]]"
-        graph="[[graph]]"
-        hierarchy-params="[[_hierarchyParams]]"
-        progress="[[_progress]]"
-        render-hierarchy="{{_renderHierarchy}}"
-        stats="[[stats]]"
-    ></tf-graph-board>
-  </div>
-</div>
-</template>
-</dom-module>
-
-<script>
-(function(){
-
-Polymer({
-  is: 'tf-graph-demo',
-  properties: {
-    datasets: {
-      type: Object
-    },
-    selectedDataset: {
-      type: Number,
-      value: 0,
-      observer: '_selectedDatasetChanged'
-    },
-    _renderHierarchy: Object,
-    _progress: Object,
-    showSessionRunsDropdown: {
-      type: Boolean,
-      value: true
-    },
-    showUploadButton: {
-      type: Boolean,
-      value: true
-    }
-  },
-  created: function() {
-    let queryParams = tf.graph.util.getQueryParams(location.search);
-    let selectedDataset = 0;
-
-    if (typeof DEMO_DATASETS === 'undefined') {
-      DEMO_DATASETS = 'demo_datasets.json';
-    }
-
-    d3.json(DEMO_DATASETS, function (error, datasets) {
-        let publicDatasets = [];
-
-        if (error) {
-          console.log('Error loading demo datasets:');
-          console.log(error);
-          return;
-        }
-
-        if(typeof DEMO_DIR_PREFIX === 'undefined') {
-          DEMO_DIR_PREFIX = 'tf_model_zoo/';
-        }
-        _.each(datasets, function(dataset, index) {
-          if (queryParams['graphid'] && dataset.id == queryParams['graphid']) {
-            selectedDataset = index;
-          } else if (dataset['private']) {
-            return;
-          }
-
-          dataset.path = this._normalizePath(dataset.path);
-          if (dataset.runMetadata != null) {
-            _.each(dataset.runMetadata, function(metadata) {
-              metadata.path = this._normalizePath(metadata.path);
-            }, this);
-          }
-          publicDatasets.push(dataset);
-        }, this);
-        this.set('datasets', publicDatasets);
-        if (selectedDataset != 0) {
-          this.set('selectedDataset', selectedDataset);
-        }
-    }.bind(this));
-  },
-  ready: function() {
-    if (typeof IS_OBSERVATORY !== 'undefined' && IS_OBSERVATORY) {
-      // Create the header and add it to the DOM. This component is injected in
-      // during vulcanization.
-      document.getElementById('observatory-header').appendChild(
-          document.createElement('tf-graph-observatory-header'));
-
-      this.set('showSessionRunsDropdown', false);
-      this.set('showUploadButton', false);
-    }
-  },
-  _normalizePath: function(path) {
-    return this.resolveUrl(DEMO_DIR_PREFIX + path);
-  },
-  _selectedDatasetChanged: function() {
-    if (this.datasets) {
-      let dataset = this.datasets[this.selectedDataset];
-      let queryParams = '';
-      if (dataset['id']) {
-        queryParams = '?graphid=' + dataset['id'];
-      }
-      window.history.replaceState(
-          null, null, location.pathname + queryParams);
-    }
-  }
-});
-})();
-</script>
diff --git a/tensorflow/tensorboard/components/tf_graph_app/demo/index.html b/tensorflow/tensorboard/components/tf_graph_app/demo/index.html
deleted file mode 100644
index 0897cdd08bd440a18e7a4a7f39e4a53036a5e958..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_graph_app/demo/index.html
+++ /dev/null
@@ -1,45 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-<head>
-  <meta charset="utf-8">
-  <meta name="viewport" content="width=device-width, initial-scale=1.0">
-  <link rel="import" href="../tf-graph-app.html">
-    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-  <style>
-  body {
-    margin: 0;
-  }
-  </style>
-</head>
-<body>
-  <h3>Answer to the Ultimate Question of Life, the Universe, and Everything</h3>
-  <demo-snippet>
-    <template>
-      <tf-graph-app id="tfgraph"></tf-graph-app>
-      <script>
-        let g = document.querySelector("#tfgraph");
-        fetch("graph.pbtxt", {credentials: "include"}).then(r => r.text()).then(pbtxt => {
-          g.pbtxt = pbtxt;
-        });
-      </script>
-    </template>
-  </demo-snippet>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/components/tf_graph_app_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_app_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8964634b841e64f4ec9663b68927b447a663a4bf
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/BUILD
@@ -0,0 +1,59 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_app_d3v4",
+    srcs = [
+        "index.html",
+        "tf-graph-app.html",
+    ],
+    path = "/tf-graph-app",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_component_page",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "index.html",
+        "tf-graph-app.html",
+    ],
+    destdir = "tf-graph-app",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4:legacy",
+        "//third_party/javascript/polymer/v1/iron-list:lib",
+        "//third_party/javascript/polymer/v1/paper-radio-group:lib",
+        "//third_party/javascript/polymer/v1/paper-tooltip:lib",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5c7455ec12b2fdd6acd354626762420eb4e42137
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/BUILD
@@ -0,0 +1,23 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_app/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-app/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_app_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_app/demo/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/data/graph.pbtxt
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_app/demo/graph.pbtxt
rename to tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/data/graph.pbtxt
diff --git a/tensorflow/tensorboard/components/tf_audio_dashboard/test/index.html b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/index.html
similarity index 50%
rename from tensorflow/tensorboard/components/tf_audio_dashboard/test/index.html
rename to tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/index.html
index 421a6658b025967ef4a1c785849fea1a1f418635..f71feea390a958b447e046e815cb36ec2152a1aa 100644
--- a/tensorflow/tensorboard/components/tf_audio_dashboard/test/index.html
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/demo/index.html
@@ -15,21 +15,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
-
-<html>
-<head>
-  <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <script src="../../web-component-tester/browser.js"></script>
-    <link rel="import" href="../../tf-imports/d3.html">
-    <link rel="import" href="../tf-audio-dashboard.html">
-    <link rel="stylesheet" type="text/css" href="../../../lib/css/global.css">
-</head>
-<body>
-  <test-fixture id="testElementFixture">
-    <template>
-      <tf-audio-dashboard></tf-audio-dashboard>
-    </template>
-  </test-fixture>
-  <script src="audioDashboardTests.js"></script>
-</body>
-</html>
+<meta charset="utf-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<link rel="import" href="../tf-graph-app.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<style>
+  /** Make the graph app tall enough so the bottom legend does not overlap with the top. */
+  tf-graph-app, .container.tf-graph-app {
+    display: block;
+    height: 700px;
+  }
+</style>
+<h3>Answer to the Ultimate Question of Life, the Universe, and Everything</h3>
+<demo-snippet>
+  <template>
+    <tf-graph-app pbtxt-file-location="data/graph.pbtxt"></tf-graph-app>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_app/index.html b/tensorflow/tensorboard/components/tf_graph_app_d3v4/index.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_app/index.html
rename to tensorflow/tensorboard/components/tf_graph_app_d3v4/index.html
diff --git a/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html b/tensorflow/tensorboard/components/tf_graph_app_d3v4/tf-graph-app.html
similarity index 69%
rename from tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html
rename to tensorflow/tensorboard/components/tf_graph_app_d3v4/tf-graph-app.html
index 07308d38e41c3ec2d4cd7b6243809f200de56951..915b54a06a9efe5e2bcbd60edcd2021df3304ce3 100644
--- a/tensorflow/tensorboard/components/tf_graph_app/tf-graph-app.html
+++ b/tensorflow/tensorboard/components/tf_graph_app_d3v4/tf-graph-app.html
@@ -18,7 +18,7 @@ limitations under the License.
 <link rel="import" href="../polymer/polymer.html">
 <link rel="import" href="../tf-graph-board/tf-graph-board.html">
 <link rel="import" href="../tf-graph-loader/tf-graph-loader.html">
-<link rel="import" href="../tf-graph/tf-graph-controls.html">
+<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
 
 <!--
 Stand alone element of tf-graph for embedding.
@@ -111,17 +111,42 @@ Polymer({
   is: 'tf-graph-app',
   properties: {
     stats: Object,
+
+    // To use tf-graph-app, specify one of these 2 properties. Provide either
+    // 1. The path to a pbtxt file to load (pbtxtFileLocation). This option nicely makes the
+    //    progress bar include the time it takes to load the file across the network. The path could
+    //    be either a relative path or an absolute URL (of a resource that supports CORS).
+    // 2. The raw contents of a pbtxt file (pbtxt).
+    // Do not set both of these 2 properties.
+    pbtxtFileLocation: {
+      type: String,
+      observer: '_updateGraph',
+    },
     pbtxt: {
       type: String,
       observer: '_updateGraph',
     },
+
     _renderHierarchy: Object,
-    _progress: Object
+    _progress: Object,
   },
   _updateGraph: function() {
-    var blob = new Blob([this.pbtxt]);
-    this.$.loader._parseAndConstructHierarchicalGraph(null, blob);
-  }
+    if (this.pbtxtFileLocation) {
+      // Fetch a pbtxt file. The fetching will be part of the loading sequence.
+      this.$.loader.datasets = [{
+        // Just name the dataset based on the file location.
+        "name": this.pbtxtFileLocation,
+        "path": this.pbtxtFileLocation,
+      }];
+      this.$.loader.set('selectedDataset', 0);
+    } else if (this.pbtxt) {
+      // Render the provided pbtxt.
+      var blob = new Blob([this.pbtxt]);
+
+      // TODO(chizeng): Find out why we call a private method here and do away with the call.
+      this.$.loader._parseAndConstructHierarchicalGraph(null, blob);
+    }
+  },
 });
 })();
 </script>
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_board_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b4520fe3ebed14a6f07bb4440d565821ff62cd50
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/BUILD
@@ -0,0 +1,54 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_board_d3v4",
+    srcs = [
+        "tf-graph-board.html",
+    ],
+    path = "/tf-graph-board",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_info_d3v4",
+        "@org_polymer",
+        "@org_polymer_paper_progress",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-board.html",
+    ],
+    destdir = "tf-graph-board",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_info_d3v4:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c1ea1f115dbfc90254f2accf0ab9d6e5afe43597
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_board/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-board/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..2563e1595e9648fafea8d3632ece3af7732bf642
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/demo/index.html
@@ -0,0 +1,98 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-board.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Board Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+
+  /** Make the graph take up the entire height of the demo container. */
+  tf-graph-board-demo, #board, #board > div {
+    display: block;
+    height: 100%;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-board-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
+
+        <!-- We color ops in the graph by XLA cluster. -->
+        <tf-graph-board id="board" color-by="xla_cluster"></tf-graph-board>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-board-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Render the graph.
+              this.$.board.set('graph', slimGraph);
+              this.$.board.set('graphHierarchy', graphHierarchy);
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-board-demo></tf-graph-board-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html b/tensorflow/tensorboard/components/tf_graph_board_d3v4/tf-graph-board.html
similarity index 99%
rename from tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html
rename to tensorflow/tensorboard/components/tf_graph_board_d3v4/tf-graph-board.html
index 5909172fbe5ebf98cc0cb215a3548c3507c2e447..0ee694e1e6638f7ed8808f5d11a5c92d9ae6673f 100644
--- a/tensorflow/tensorboard/components/tf_graph_board/tf-graph-board.html
+++ b/tensorflow/tensorboard/components/tf_graph_board_d3v4/tf-graph-board.html
@@ -17,6 +17,7 @@ limitations under the License.
 
 <link rel="import" href="../polymer/polymer.html">
 <link rel="import" href="../tf-graph/tf-graph.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
 <link rel="import" href="../tf-graph-info/tf-graph-info.html">
 <link rel="import" href="../paper-progress/paper-progress.html">
 
diff --git a/tensorflow/tensorboard/components/tf_graph_common_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_common_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..21a5621b52010e22ac51ecc44b94b57507c3193e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/BUILD
@@ -0,0 +1,72 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_common_d3v4",
+    srcs = [
+        "tf-graph-common.html",
+        ":ts",
+    ],
+    path = "/tf-graph-common",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:dagre",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:graphlib",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "@org_polymer",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = glob(["*.ts"]),
+    typings = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-common.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-graph-common",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_google_d3v4:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = glob(["*.ts"]),
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//third_party/javascript/node_modules/typescript:es2015.promise",
+        "//third_party/javascript/typings/d3_v4:bundle",
+        "//third_party/javascript/typings/lodash",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+        "//third_party/javascript/typings/webcomponents_js",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/annotation.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/annotation.ts
similarity index 94%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/scene/annotation.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/annotation.ts
index ba308e6934edf048084d35785fe66f4d3a8e70f2..bde382977858d7a3a3a69ea233c801c41ab7b4f0 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/annotation.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/annotation.ts
@@ -75,9 +75,7 @@ module tf.graph.scene.annotation {
             addAnnotationLabel(
                 aGroup, a.node.name, a, Class.Annotation.ELLIPSIS);
           }
-        });
-
-    annotationGroups
+        }).merge(annotationGroups)
         .attr(
             'class',
             a => {
@@ -114,11 +112,10 @@ function annotationToClassName(annotationType: render.AnnotationType) {
 function buildShape(aGroup, a: render.Annotation) {
   if (a.annotationType === render.AnnotationType.SUMMARY) {
     let summary = selectOrCreateChild(aGroup, 'use');
-    summary.attr({
-      'class': 'summary',
-      'xlink:href': '#summary-icon',
-      'cursor': 'pointer'
-    });
+    summary
+      .attr('class', 'summary')
+      .attr('xlink:href', '#summary-icon')
+      .attr('cursor', 'pointer');
   } else {
     let shape = node.buildShape(aGroup, a, Class.Annotation.NODE);
     // add title tag to get native tooltips
@@ -203,20 +200,18 @@ function update(aGroup, d: render.RenderNodeInfo, a: render.Annotation,
   }
 
   // label position
-  aGroup.select('text.' + Class.Annotation.LABEL).transition().attr({
-    x: cx + a.dx + (a.isIn ? -1 : 1) * (a.width / 2 + a.labelOffset),
-    y: d.y + a.dy
-  });
+  aGroup.select('text.' + Class.Annotation.LABEL).transition()
+    .attr('x', cx + a.dx + (a.isIn ? -1 : 1) * (a.width / 2 + a.labelOffset))
+    .attr('y', d.y + a.dy);
 
   // Some annotations (such as summary) are represented using a 12x12 image tag.
   // Purposely omitted units (e.g. pixels) since the images are vector graphics.
   // If there is an image, we adjust the location of the image to be vertically
   // centered with the node and horizontally centered between the arrow and the
   // text label.
-  aGroup.select('use.summary').transition().attr({
-    x: cx + a.dx - 3,
-    y: d.y + a.dy - 6
-  });
+  aGroup.select('use.summary').transition()
+    .attr('x', cx + a.dx - 3)
+    .attr('y', d.y + a.dy - 6);
 
   // Node position (only one of the shape selection will be non-empty.)
   positionEllipse(
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/colors.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/colors.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/colors.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/colors.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/common.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/common.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/common.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/common.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/contextmenu.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/contextmenu.ts
similarity index 93%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/scene/contextmenu.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/contextmenu.ts
index 628e9ae56f27ababc4e07c3bfee671a938936b61..8121cf9f6dab97347efa33e388ecc8f2fb4e9d38 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/contextmenu.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/contextmenu.ts
@@ -48,11 +48,10 @@ export function getMenu(menu: ContextMenuItem[]) {
   return function(data, index: number): void {
     // Position and display the menu.
     let event = <MouseEvent>d3.event;
-    menuSelection.style({
-      'display': 'block',
-      'left': (event.layerX + 1) + 'px',
-      'top': (event.layerY + 1) + 'px'
-    });
+    menuSelection
+      .style('display', 'block')
+      .style('left', (event.layerX + 1) + 'px')
+      .style('top', (event.layerY + 1) + 'px');
 
     // Stop the event from propagating further.
     event.preventDefault();
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/edge.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/edge.ts
similarity index 91%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/scene/edge.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/edge.ts
index 868d47dc83aa74cc3adce3db1627c7975c73e4d6..3dc963cf34d6d490c05d6b5c8c3db720224d502e 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/edge.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/edge.ts
@@ -29,14 +29,14 @@ const EDGE_WIDTH_SCALE_EXPONENT = 0.3;
 /** The domain (min and max value) for the edge width. */
 const DOMAIN_EDGE_WIDTH_SCALE = [1, 5E6];
 
-export const EDGE_WIDTH_SCALE = d3.scale.pow()
+export const EDGE_WIDTH_SCALE: d3.ScalePower<number, number> = d3.scalePow()
       .exponent(EDGE_WIDTH_SCALE_EXPONENT)
       .domain(DOMAIN_EDGE_WIDTH_SCALE)
       .range([MIN_EDGE_WIDTH, MAX_EDGE_WIDTH])
       .clamp(true);
 
 let arrowheadMap =
-    d3.scale.quantize().domain([MIN_EDGE_WIDTH, MAX_EDGE_WIDTH]).range([
+    d3.scaleQuantize<String>().domain([MIN_EDGE_WIDTH, MAX_EDGE_WIDTH]).range([
       'small', 'medium', 'large', 'xlarge'
     ]);
 
@@ -87,12 +87,7 @@ export function buildGroup(sceneGroup,
 
   // Select all children and join with data.
   // (Note that all children of g.edges are g.edge)
-  let edgeGroups = container.selectAll(function() {
-    // using d3's selector function
-    // See https://github.com/mbostock/d3/releases/tag/v2.0.0
-    // (It's not listed in the d3 wiki.)
-    return this.childNodes;
-  }).data(edges, getEdgeKey);
+  let edgeGroups = (container as any).selectAll(function() {return this.childNodes;}).data(edges, getEdgeKey);
 
   // Make edges a group to support rendering multiple lines for metaedge
   edgeGroups.enter()
@@ -108,10 +103,10 @@ export function buildGroup(sceneGroup,
         // Add line during enter because we're assuming that type of line
         // normally does not change.
         appendEdge(edgeGroup, d, sceneElement);
-      });
-
-  edgeGroups.each(position);
-  edgeGroups.each(function(d) {
+      })
+      .merge(edgeGroups)
+      .each(position)
+      .each(function(d) {
     stylize(d3.select(this), d, sceneElement);
   });
 
@@ -168,8 +163,8 @@ export function getLabelForEdge(metaedge: Metaedge,
  * @return The new array of control points.
  */
 function adjustPathPointsForMarker(points: render.Point[],
-    marker: d3.Selection<any>, isStart: boolean): render.Point[] {
-  let lineFunc = d3.svg.line<render.Point>()
+    marker: d3.Selection<any, any, any, any>, isStart: boolean): render.Point[] {
+  let lineFunc = d3.line<render.Point>()
     .x(d => d.x)
     .y(d => d.y);
   let path =
@@ -232,11 +227,9 @@ export function appendEdge(edgeGroup, d: EdgeData,
   let strokeWidth = sceneElement.renderHierarchy.edgeWidthScale(size);
 
   let path = edgeGroup.append('path')
-                 .attr({
-                   'id': pathId,
-                   'class': edgeClass,
-                 })
-                 .style({'stroke-width': strokeWidth + 'px'});
+                 .attr('id', pathId)
+                 .attr('class', edgeClass)
+                 .style('stroke-width', strokeWidth + 'px');
 
   // Check if there is a reference edge and add an arrowhead of the right size.
   if (d.label && d.label.metaedge && d.label.metaedge.numRefEdges) {
@@ -264,17 +257,15 @@ export function appendEdge(edgeGroup, d: EdgeData,
 
   edgeGroup.append('text')
       .append('textPath')
-      .attr({
-        'xlink:href': '#' + pathId,
-        'startOffset': '50%',
-        'text-anchor': 'middle',
-        'dominant-baseline': 'central'
-      })
+        .attr('xlink:href', '#' + pathId)
+        .attr('startOffset', '50%')
+        .attr('text-anchor', 'middle')
+        .attr('dominant-baseline', 'central')
       .text(labelForEdge);
 };
 
-export let interpolate = d3.svg.line<{x: number, y: number}>()
-                             .interpolate('basis')
+export let interpolate: d3.Line<{x: number, y: number}> = d3.line<{x: number, y: number}>()
+                             .curve(d3.curveBasis)
                              .x((d) => { return d.x;})
                              .y((d) => { return d.y;});
 
@@ -333,7 +324,7 @@ function position(d) {
   d3.select(this)
       .select('path.' + Class.Edge.LINE)
       .transition()
-      .attrTween('d', getEdgePathInterpolator);
+      .attrTween('d', getEdgePathInterpolator as any);
 };
 
 /**
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/externs.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/externs.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/externs.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/externs.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/graph.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/graph.ts
similarity index 97%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/graph.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/graph.ts
index 13d5d2fc60efdb9f72d006251570541b84457f88..1b0abcfd85311e7c66481e76fa7f5351eaafded0 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/graph.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/graph.ts
@@ -42,6 +42,9 @@ export enum SeriesGroupingType {GROUP, UNGROUP};
 /** Attribute key reserved for the shapes of the output tensors. */
 const OUTPUT_SHAPES_KEY = '_output_shapes';
 
+/** Attribute key reserved for the XLA cluster that an op runs on. */
+const _XLA_CLUSTER_KEY = '_XlaCluster';
+
 /**
  * A BaseEdge is the label object (in the graphlib sense) for an edge in the
  * original, full graph produced after parsing. Subsequent graphs, like those
@@ -133,6 +136,7 @@ export type TensorShape = number[];
 
 export interface OpNode extends Node {
   op: string;
+  // The device on which the op ran. Null if it is unknown.
   device: string;
   attr: {key: string, value: any}[];
   inputs: NormalizedInput[];
@@ -154,6 +158,8 @@ export interface OpNode extends Node {
    *       of the middle dimension is unknown (encoded as -1).
    */
   outputShapes: TensorShape[];
+  // The XLA Cluster on which the op ran. Null if it is unknown.
+  xlaCluster: string;
 }
 
 export interface BridgeNode extends Node {
@@ -349,6 +355,7 @@ export class OpNodeImpl implements OpNode {
   owningSeries: string;
   outputShapes: TensorShape[];
   nodeAttributes: {[key: string]: any;};
+  xlaCluster: string;
 
   /**
    * Constructs a new Op node.
@@ -366,6 +373,7 @@ export class OpNodeImpl implements OpNode {
     // control dependency.
     this.inputs = normalizeInputs(rawNode.input);
     this.outputShapes = extractOutputShapes(rawNode.attr);
+    this.xlaCluster = extractXlaCluster(rawNode.attr);
     // additional properties
     this.type = NodeType.OP;
     this.isGroupNode = false;
@@ -501,16 +509,7 @@ export class NodeStats {
    * if it is a Group node.
    */
   totalBytes = 0;
-  /**
-   * Total number of compute time in microseconds used for the node.
-   * Sum of all children if it is a Group node. Null if it is unknown.
-   */
-  get totalMicros(): number {
-    if (this.startTime == null || this.endTime == null) {
-      return null;
-    }
-    return this.endTime - this.startTime;
-  }
+
   /**
    * The shape of each output tensors, if there are any.
    * Empty if it is a Group node.
@@ -526,10 +525,23 @@ export class NodeStats {
     if (stats.totalBytes != null) {
       this.totalBytes += stats.totalBytes;
     }
-    if (stats.totalMicros != null) {
+    if (stats.getTotalMicros() != null) {
       this.addExecutionTime(stats.startTime, stats.endTime);
     }
   }
+
+  /**
+   * Total number of compute time in microseconds used for the node.
+   * Sum of all children if it is a Group node. Null if it is unknown.
+   * This method can not be scaffolded under a getter attribute because
+   * ECMAScript 5 does not support getter attributes.
+   */
+  getTotalMicros(): number {
+    if (this.startTime == null || this.endTime == null) {
+      return null;
+    }
+    return this.endTime - this.startTime;
+  }
 }
 
 export class MetanodeImpl implements Metanode {
@@ -799,7 +811,9 @@ class SeriesNodeImpl implements SeriesNode {
  * Extracts the shapes of the output tensors from the attr property in the
  * node proto.
  */
-function extractOutputShapes(attr: {key: string, value: any}[]): TensorShape[] {
+// tslint:disable-next-line:no-any
+function extractOutputShapes(attr: Array<{key: string, value: any}>):
+    TensorShape[] {
   let result = null;
   // We don't know anything about the output tensors.
   if (!attr) {
@@ -843,6 +857,28 @@ function extractOutputShapes(attr: {key: string, value: any}[]): TensorShape[] {
   return null;
 }
 
+/**
+ * Extracts the XLA Cluster that an op runs on from the attrs of the OpNode.
+ * @param attr The attr property.
+ * @return A string that is the name of the cluster. Or null if it could not be
+ *     determined.
+ */
+// tslint:disable-next-line:no-any
+function extractXlaCluster(attr: Array<{key: string, value: any}>): string|
+    null {
+  if (!attr) {
+    return null;
+  }
+
+  // Find the attribute for XLA cluster if there is one.
+  for (let i = 0; i < attr.length; i++) {
+    if (attr[i].key === _XLA_CLUSTER_KEY) {
+      return attr[i].value['s'] || null;
+    }
+  }
+  return null;
+}
+
 /**
  * Normalizes the inputs and extracts associated metadata:
  * 1) Inputs can contain a colon followed by a number at the end
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/hierarchy.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/hierarchy.ts
similarity index 98%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/hierarchy.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/hierarchy.ts
index 52c809dff703a0d925584c2523ab62075fe741dd..889607ac5006bf75c698f7d121e1e0b6f9da6e8e 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/hierarchy.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/hierarchy.ts
@@ -30,6 +30,8 @@ export interface Hierarchy {
   templates: {[templateId: string]: string[]};
   /** List of all device names */
   devices: string[];
+  /** List of all XLA cluster names */
+  xlaClusters: string[];
   /** True if at least one tensor in the graph has shape information */
   hasShapeInfo: boolean;
   /** The maximum size across all meta edges. Used for scaling thickness. */
@@ -52,6 +54,7 @@ class HierarchyImpl implements Hierarchy {
   templates: {[templateId: string]: string[]};
   private index: {[nodeName: string]: GroupNode|OpNode};
   devices: string[];
+  xlaClusters: string[];
   hasShapeInfo = false;
   maxMetaEdgeSize = 1;
   orderings: { [nodeName: string]: { [childName: string]: number } };
@@ -346,7 +349,7 @@ class HierarchyImpl implements Hierarchy {
    */
   getTemplateIndex(): (string) => number {
     let templateNames = d3.keys(this.templates);
-    let templateIndex = d3.scale.ordinal()
+    let templateIndex = d3.scaleOrdinal()
         .domain(templateNames)
         .range(d3.range(0, templateNames.length));
     return (templateId: string) => <number>templateIndex(templateId);
@@ -395,14 +398,22 @@ export function build(graph: tf.graph.SlimGraph, params: HierarchyParams,
       .runAsyncTask(
           'Adding nodes', 20,
           () => {
-            // Get all the possible device names.
+            // Get all the possible device and XLA cluster names.
             let deviceNames = {};
+            let xlaClusterNames = {};
             _.each(graph.nodes, (node, nodeName) => {
-              if (node.device != null) {
+              if (node.device) {
                 deviceNames[node.device] = true;
               }
+
+              if (node.xlaCluster) {
+                xlaClusterNames[node.xlaCluster] = true;
+              }
             });
+
             h.devices = _.keys(deviceNames);
+            h.xlaClusters = _.keys(xlaClusterNames);
+
             addNodes(h, graph);
           },
           tracker)
@@ -426,7 +437,9 @@ export function build(graph: tf.graph.SlimGraph, params: HierarchyParams,
               h.templates = template.detect(h, params.verifyTemplate);
             }, tracker);
       })
-      .then(() => { return h; });
+      .then(() => {
+        return h;
+      });
 };
 
 export function joinAndAggregateStats(
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/layout.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/layout.ts
similarity index 98%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/layout.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/layout.ts
index 9f4e2f406bbeb3d3239919ae06c3da14e850c737..1019e4f2694d01e3bba71f8f91294cfe61d14a35 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/layout.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/layout.ts
@@ -103,7 +103,9 @@ export const PARAMS = {
       width: 60,
       maxLabelWidth: 52,
       /** A scale for the node's height based on number of nodes inside */
-      height: d3.scale.linear().domain([1, 200]).range([15, 60]).clamp(true),
+      // Hack - set this as an any type to avoid issues in exporting a type
+      // from an external module.
+      height: (d3 as any).scaleLinear().domain([1, 200]).range([15, 60]).clamp(true),
       /** The radius of the circle denoting the expand button. */
       expandButtonRadius: 3
     },
@@ -604,7 +606,7 @@ function layoutAnnotation(renderNodeInfo: render.RenderNodeInfo): void {
           inboxHeight / 2);
   inTouchHeight = inTouchHeight < 0 ? 0 : inTouchHeight;
 
-  let inY = d3.scale.linear()
+  let inY = d3.scaleLinear()
     .domain([0, inAnnotations.length - 1])
     .range([-inTouchHeight, inTouchHeight]);
 
@@ -633,7 +635,7 @@ function layoutAnnotation(renderNodeInfo: render.RenderNodeInfo): void {
       Math.min(renderNodeInfo.height / 2 - renderNodeInfo.radius,
           outboxHeight / 2);
   outTouchHeight = outTouchHeight < 0 ? 0 : outTouchHeight;
-  let outY = d3.scale.linear()
+  let outY = d3.scaleLinear()
     .domain([0, outAnnotations.length - 1])
     .range([-outTouchHeight, outTouchHeight]);
 
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/minimap.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/minimap.ts
similarity index 93%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/scene/minimap.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/minimap.ts
index 769984feb4af87ccae074c9b140e3144db032bc2..8129df3a4268803d5105ce6a8e31755c9e40f470 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/minimap.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/minimap.ts
@@ -43,7 +43,7 @@ export class Minimap {
   /** The svg group used for panning and zooming the main svg. */
   private zoomG: SVGGElement;
   /** The zoom behavior of the main svg. */
-  private mainZoom: d3.behavior.Zoom<any>;
+  private mainZoom: d3.ZoomBehavior<any, any>;
   /** The maximum width and height for the minimap. */
   private maxWandH: number;
   /** The last translation vector used in the main svg. */
@@ -67,7 +67,7 @@ export class Minimap {
    * @param labelPadding Padding in pixels due to the main graph labels.
    */
   constructor(svg: SVGSVGElement, zoomG: SVGGElement,
-      mainZoom: d3.behavior.Zoom<any>, minimap: HTMLElement,
+      mainZoom: d3.ZoomBehavior<any, any>, minimap: HTMLElement,
       maxWandH: number, labelPadding: number) {
     this.svg = svg;
     this.labelPadding = labelPadding;
@@ -87,8 +87,8 @@ export class Minimap {
       this.updateViewpoint();
     };
     this.viewpointCoord = {x: 0, y: 0};
-    let drag = d3.behavior.drag().origin(Object).on('drag', dragmove);
-    $viewpoint.datum(this.viewpointCoord).call(drag);
+    let drag = d3.drag().subject(Object).on('drag', dragmove);
+    $viewpoint.datum(this.viewpointCoord as any).call(drag);
 
     // Make the minimap clickable.
     $minimapSvg.on('click', () => {
@@ -99,7 +99,7 @@ export class Minimap {
       // Update the coordinates of the viewpoint.
       let width = Number($viewpoint.attr('width'));
       let height = Number($viewpoint.attr('height'));
-      let clickCoords = d3.mouse($minimapSvg.node());
+      let clickCoords = d3.mouse($minimapSvg.node() as any);
       this.viewpointCoord.x = clickCoords[0] - width / 2;
       this.viewpointCoord.y = clickCoords[1] - height / 2;
       this.updateViewpoint();
@@ -129,8 +129,9 @@ export class Minimap {
     // new viewpoint.
     let mainX = - this.viewpointCoord.x * this.scaleMain / this.scaleMinimap;
     let mainY = - this.viewpointCoord.y * this.scaleMain / this.scaleMinimap;
-    let zoomEvent = this.mainZoom.translate([mainX, mainY]).event;
-    d3.select(this.zoomG).call(zoomEvent);
+    d3.select(this.svg).call(
+        this.mainZoom.transform,
+        d3.zoomIdentity.translate(mainX, mainY).scale(this.scaleMain));
   }
 
   /**
@@ -198,10 +199,9 @@ export class Minimap {
     // Temporarily assign an explicit width/height to the main svg, since
     // it doesn't have one (uses flex-box), but we need it for the canvas
     // to work.
-    $svg.attr({
-      width: sceneSize.width,
-      height: sceneSize.height,
-    });
+    $svg
+      .attr('width', sceneSize.width)
+      .attr('height', sceneSize.height);
 
     // Since the content inside the svg changed (e.g. a node was expanded),
     // the aspect ratio have also changed. Thus, we need to update the scale
@@ -241,10 +241,8 @@ export class Minimap {
     // assigned styles, explicit width and height and bring back the pan/zoom
     // transform.
     svgStyle.remove();
-    $svg.attr({
-      width: null,
-      height: null
-    });
+    $svg.attr('width', null).attr('height', null);
+
     $zoomG.attr('transform', zoomTransform);
     let image = new Image();
     image.onload = () => {
@@ -283,14 +281,17 @@ export class Minimap {
    * @param translate The translate vector, or none to use the last used one.
    * @param scale The scaling factor, or none to use the last used one.
    */
-  zoom(translate?: [number, number], scale?: number): void {
+  zoom(transform?: d3.ZoomTransform): void {
     if (this.scaleMinimap == null) {
       // Scene is not ready yet.
       return;
     }
     // Update the new translate and scale params, only if specified.
-    this.translate = translate || this.translate;
-    this.scaleMain = scale || this.scaleMain;
+    if (transform) {
+      this.translate = [transform.x, transform.y];
+      this.scaleMain = transform.k;
+    }
+
     // Update the location of the viewpoint rectangle.
     let svgRect = this.svg.getBoundingClientRect();
     let $viewpoint = d3.select(this.viewpoint);
@@ -300,12 +301,11 @@ export class Minimap {
         this.scaleMain;
     let viewpointWidth = svgRect.width * this.scaleMinimap / this.scaleMain;
     let viewpointHeight = svgRect.height * this.scaleMinimap / this.scaleMain;
-    $viewpoint.attr({
-      x: this.viewpointCoord.x,
-      y: this.viewpointCoord.y,
-      width: viewpointWidth,
-      height: viewpointHeight
-    });
+    $viewpoint
+      .attr('x', this.viewpointCoord.x)
+      .attr('y', this.viewpointCoord.y)
+      .attr('width', viewpointWidth)
+      .attr('height', viewpointHeight);
     // Show/hide the minimap depending on the viewpoint area as fraction of the
     // whole minimap.
     let mapWidth = this.minimapSize.width;
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/node.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/node.ts
similarity index 97%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/scene/node.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/node.ts
index bbfee96e440f4e1ce5d2f4d3a1ad638a20d5829e..3e140a388b056b52d15e7329dcfaee5421c0a391 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/node.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/node.ts
@@ -66,13 +66,7 @@ module tf.graph.scene.node {
     // Select all children and join with data.
     // (Note that all children of g.nodes are g.node)
     let nodeGroups =
-        container
-            .selectAll(function() {
-              // using d3's selector function
-              // See https://github.com/mbostock/d3/releases/tag/v2.0.0
-              // (It's not listed in the d3 wiki.)
-              return this.childNodes;  // this here refers to container.node()
-            })
+        (container as any).selectAll(function() {return this.childNodes;})
             .data(nodeData, (d) => {
               // make sure that we don't have to swap shape type
               return d.node.name + ':' + d.node.type;
@@ -86,10 +80,9 @@ module tf.graph.scene.node {
           let nodeGroup = d3.select(this);
           // index node group for quick stylizing
           sceneElement.addNodeGroup(d.node.name, nodeGroup);
-        });
-
-    // UPDATE
-    nodeGroups
+        })
+        .merge(nodeGroups)
+        // ENTER + UPDATE
         .attr('class', d => { return Class.Node.GROUP + ' ' + nodeClass(d); })
         .each(function(d) {
           let nodeGroup = d3.select(this);
@@ -200,7 +193,7 @@ function addButton(selection, d: render.RenderNodeInfo, sceneElement) {
       .attr('d', 'M0,-2.2 V2.2 M-2.2,0 H2.2');
   scene.selectOrCreateChild(group, 'path', Class.Node.COLLAPSE_BUTTON)
       .attr('d', 'M-2.2,0 H2.2');
-  group.on('click', d => {
+  (group as any).on('click', (d: any) => {
     // Stop this event's propagation so that it isn't also considered a
     // node-select.
     (<Event>d3.event).stopPropagation();
@@ -368,7 +361,7 @@ function labelBuild(nodeGroup, renderNodeInfo: render.RenderNodeInfo,
     label.attr('font-size', scale(text.length) + 'px');
   }
 
-  let txtElement = <d3.Selection<any>>label.text(text);
+  let txtElement = <d3.Selection<any, any, any, any>>label.text(text);
   enforceLabelWidth(txtElement, renderNodeInfo.node.type, renderNodeInfo);
   return label;
 }
@@ -386,8 +379,8 @@ function labelBuild(nodeGroup, renderNodeInfo: render.RenderNodeInfo,
  * determine whether META nodes are collapsed or expanded.
  */
 export function enforceLabelWidth(
-    txtElementSelection: d3.Selection<any>, nodeType: NodeType | number,
-    renderNodeInfo?: render.RenderNodeInfo) {
+    txtElementSelection: d3.Selection<any, any, any, any>, nodeType: NodeType | number,
+    renderNodeInfo?: render.RenderNodeInfo): any {
   // Get text element itself and its on-screen width.
   let txtNode = <SVGTextElement>txtElementSelection.node();
   let computedTxtLength = txtNode.getComputedTextLength();
@@ -453,7 +446,7 @@ export function enforceLabelWidth(
 let fontScale = null;
 function getLabelFontScale(sceneElement) {
   if (!fontScale) {
-    fontScale = d3.scale.linear()
+    fontScale = d3.scaleLinear()
       .domain([sceneElement.maxMetanodeLabelLengthLargeFont,
         sceneElement.maxMetanodeLabelLength])
       .range([sceneElement.maxMetanodeLabelLengthFontSize,
@@ -482,7 +475,7 @@ function labelPosition(nodeGroup, cx: number, cy: number,
  * @param nodeClass class for the element.
  * @return Selection of the shape.
  */
-export function buildShape(nodeGroup, d, nodeClass: string) {
+export function buildShape(nodeGroup, d, nodeClass: string): d3.Selection<any, any, any, any> {
   // Create a group to house the underlying visual elements.
   let shapeGroup = scene.selectOrCreateChild(nodeGroup, 'g', nodeClass);
   // TODO(jimbo): DOM structure should be templated in HTML somewhere, not JS.
@@ -505,15 +498,15 @@ export function buildShape(nodeGroup, d, nodeClass: string) {
       scene.selectOrCreateChild(shapeGroup, 'use', classList)
           .attr('xlink:href', '#op-series-' + stampType + '-stamp');
       scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
-          .attr({rx: d.radius, ry: d.radius});
+          .attr('rx', d.radius).attr('ry', d.radius);
       break;
     case NodeType.BRIDGE:
       scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
-          .attr({rx: d.radius, ry: d.radius});
+          .attr('rx', d.radius).attr('ry', d.radius);
       break;
     case NodeType.META:
       scene.selectOrCreateChild(shapeGroup, 'rect', Class.Node.COLOR_TARGET)
-          .attr({rx: d.radius, ry: d.radius});
+          .attr('rx', d.radius).attr('ry', d.radius);
       break;
     default:
       throw Error('Unrecognized node type: ' + d.node.type);
@@ -591,7 +584,8 @@ function position(nodeGroup, d: render.RenderNodeInfo) {
 };
 
 /** Enum specifying the options to color nodes by */
-export enum ColorBy { STRUCTURE, DEVICE, COMPUTE_TIME, MEMORY };
+export enum ColorBy {STRUCTURE, DEVICE, XLA_CLUSTER, COMPUTE_TIME, MEMORY}
+;
 
 /**
  * Returns the fill color for the node given its state and the 'color by'
@@ -648,6 +642,9 @@ export function getFillForNode(templateIndex, colorBy,
         });
       }
       return isExpanded ? colorParams.EXPANDED_COLOR : `url(#${escapedId})`;
+    case ColorBy.XLA_CLUSTER:
+      return isExpanded ? colorParams.EXPANDED_COLOR :
+                          renderInfo.xlaClusterColor || colorParams.UNKNOWN;
     case ColorBy.COMPUTE_TIME:
       return isExpanded ?
         colorParams.EXPANDED_COLOR : renderInfo.computeTimeColor ||
@@ -740,13 +737,12 @@ export function traceInputs(renderGraphInfo: tf.graph.render.RenderGraphInfo) {
         traceAllInputsOfOpNode(renderGraphInfo, nodeInstance, allTracedNodes);
   });
 
-  d3.selectAll(selectedNodeSelectorString).classed({
+  d3.selectAll(selectedNodeSelectorString)
     // Remove the input-highlight from the selected node.
-    'input-highlight': false,
+    .classed('input-highlight', false)
     // Add input-highlight-selected class to selected node, which allows
     // treating the selected not as a special case of an input node.
-    'input-highlight-selected': true
-  });
+    .classed('input-highlight-selected', true)
 
   // Highlight all parent nodes of each OpNode as input parent to allow
   // specific highlighting.
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/parser.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/parser.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/parser.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/parser.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/proto.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/proto.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/proto.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/proto.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/render.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/render.ts
similarity index 97%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/render.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/render.ts
index ae5d1a7c571305727ec8cd04402136c0cce8a356..474e358ba95f3cd00ccadb1ce7a3535341030c1e 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/render.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/render.ts
@@ -45,7 +45,7 @@ export let MetanodeColors = {
    * Standard hue values for node color palette.
    */
   HUES: [220, 100, 180, 40, 20, 340, 260, 300, 140, 60],
-  STRUCTURE_PALETTE: function(id: number, lightened?: boolean) {
+  STRUCTURE_PALETTE(id: number, lightened?: boolean) {
     // The code below is a flexible way to computationally create a set
     // of colors that go well together.
     let hues = MetanodeColors.HUES;
@@ -56,8 +56,12 @@ export let MetanodeColors = {
     let light = lightened ? 95 : 80;
     return d3.hsl(hue, .01 * sat, .01 * light).toString();
   },
-  DEVICE_PALETTE: function(index: number):
-      string { return MetanodeColors.STRUCTURE_PALETTE(index);},
+  DEVICE_PALETTE(index: number): string {
+    return MetanodeColors.STRUCTURE_PALETTE(index);
+  },
+  XLA_CLUSTER_PALETTE(index: number): string {
+    return MetanodeColors.STRUCTURE_PALETTE(index);
+  },
   UNKNOWN: '#eee',
   GRADIENT_OUTLINE: '#888'
 };
@@ -158,12 +162,13 @@ export class RenderGraphInfo {
   private displayingStats: boolean;
   private index: {[nodeName: string]: RenderNodeInfo};
   private renderedOpNames: string[];
-  private deviceColorMap: d3.scale.Ordinal<string, string>;
-  private memoryUsageScale: d3.scale.Linear<string, string>;
-  private computeTimeScale: d3.scale.Linear<string, string>;
+  private deviceColorMap: d3.ScaleOrdinal<string, string>;
+  private xlaClusterColorMap: d3.ScaleOrdinal<string, string>;
+  private memoryUsageScale: d3.ScaleLinear<string, string>;
+  private computeTimeScale: d3.ScaleLinear<string, string>;
   /** Scale for the thickness of edges when there is no shape information. */
   edgeWidthScale:
-      d3.scale.Linear<number, number> | d3.scale.Pow<number, number>;
+      d3.ScaleLinear<number, number> | d3.ScalePower<number, number>;
   // Since the rendering information for each node is constructed lazily,
   // upon node's expansion by the user, we keep a map between the node's name
   // and whether the rendering information was already constructed for that
@@ -191,11 +196,18 @@ export class RenderGraphInfo {
   }
 
   computeScales() {
-    this.deviceColorMap = d3.scale.ordinal<string>()
+    this.deviceColorMap = d3.scaleOrdinal<string>()
         .domain(this.hierarchy.devices)
         .range(_.map(d3.range(this.hierarchy.devices.length),
                      MetanodeColors.DEVICE_PALETTE));
 
+    this.xlaClusterColorMap =
+        d3.scaleOrdinal<string>()
+            .domain(this.hierarchy.xlaClusters)
+            .range(_.map(
+                d3.range(this.hierarchy.xlaClusters.length),
+                MetanodeColors.XLA_CLUSTER_PALETTE));
+
     let topLevelGraph = this.hierarchy.root.metagraph;
     // Find the maximum and minimum memory usage.
     let memoryExtent = d3.extent(topLevelGraph.nodes(),
@@ -206,7 +218,7 @@ export class RenderGraphInfo {
         return node.stats.totalBytes;
       }
     });
-    this.memoryUsageScale = d3.scale.linear<string, string>()
+    this.memoryUsageScale = d3.scaleLinear<string, string>()
         .domain(memoryExtent)
         .range(PARAMS.minMaxColors);
 
@@ -216,16 +228,16 @@ export class RenderGraphInfo {
       let node = topLevelGraph.node(nodeName);
       // Some ops don't have stats at all.
       if (node.stats != null) {
-        return node.stats.totalMicros;
+        return node.stats.getTotalMicros();
       }
     });
-    this.computeTimeScale = d3.scale.linear<string, string>()
+    this.computeTimeScale = d3.scaleLinear<string, string>()
         .domain(computeTimeExtent)
         .range(PARAMS.minMaxColors);
 
     this.edgeWidthScale = this.hierarchy.hasShapeInfo ?
       scene.edge.EDGE_WIDTH_SCALE :
-      d3.scale.linear()
+      d3.scaleLinear()
         .domain([1, this.hierarchy.maxMetaEdgeSize])
         .range([scene.edge.MIN_EDGE_WIDTH, scene.edge.MAX_EDGE_WIDTH]);
   }
@@ -274,7 +286,14 @@ export class RenderGraphInfo {
     if (node.stats) {
       renderInfo.memoryColor = this.memoryUsageScale(node.stats.totalBytes);
       renderInfo.computeTimeColor =
-        this.computeTimeScale(node.stats.totalMicros);
+          this.computeTimeScale(node.stats.getTotalMicros());
+    }
+
+    if (!node.isGroupNode) {
+      let clusterName = (node as OpNode).xlaCluster;
+      if (clusterName) {
+        renderInfo.xlaClusterColor = this.xlaClusterColorMap(clusterName);
+      }
     }
 
     // We only fade nodes when we're displaying stats.
@@ -1031,7 +1050,12 @@ export class RenderNodeInfo {
    * its children. If this node is an op node, this list will have only one
    * color with proportion 1.0.
    */
-  deviceColors: {color: string, proportion: number}[];
+  deviceColors: Array<{color: string, proportion: number}>;
+
+  /**
+   * Color according to the XLA cluster of this node.
+   */
+  xlaClusterColor: string;
 
   /**
    * Color according to the memory usage of this node.
@@ -1135,7 +1159,7 @@ export class RenderMetaedgeInfo {
   /**
    * D3 selection of the group containing the path that displays this edge.
    */
-  edgeGroup: d3.Selection<RenderMetaedgeInfo>;
+  edgeGroup: d3.Selection<RenderMetaedgeInfo & any, any, any, any>;
 
   /** Id of the <marker> used as a start-marker for the edge path. */
   startMarkerId: string;
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/scene.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/scene.ts
similarity index 94%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/scene/scene.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/scene.ts
index e84a2b747c7a7b7fca1f7003a6d721992e1e6b4d..53baf14ec48fe4d456b9cff73c67941675b55ebc 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/scene/scene.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/scene.ts
@@ -151,19 +151,21 @@ module tf.graph.scene {
             svgRect.width / sceneSize.width, svgRect.height / sceneSize.height,
             2);
     let params = layout.PARAMS.graph;
-    let zoomEvent =
-        d3zoom.scale(scale)
-            .on('zoomend.fitted',
-                () => {
-                  // Remove the listener for the zoomend event,
-                  // so we don't get called at the end of regular zoom events,
-                  // just those that fit the graph to screen.
-                  d3zoom.on('zoomend.fitted', null);
-                  callback();
-                })
-            .translate([params.padding.paddingLeft, params.padding.paddingTop])
-            .event;
-    d3.select(zoomG).transition().duration(500).call(zoomEvent);
+    const transform = d3.zoomIdentity
+        .scale(scale)
+        .translate(params.padding.paddingLeft, params.padding.paddingTop);
+
+    d3.select(svg)
+        .transition()
+        .duration(500)
+        .call(d3zoom.transform, transform)
+        .on('end.fitted', () => {
+          // Remove the listener for the zoomend event,
+          // so we don't get called at the end of regular zoom events,
+          // just those that fit the graph to screen.
+          d3zoom.on('end.fitted', null);
+          callback();
+        });
 };
 
 /**
@@ -184,7 +186,7 @@ export function panToNode(nodeName: String, svg, zoomG, d3zoom): boolean {
   if (!node) {
     return false;
   }
-  let translate = d3zoom.translate();
+  let transform = d3.zoomTransform(node);
   // Check if the selected node is off-screen in either
   // X or Y dimension in either direction.
   let nodeBox = node.getBBox();
@@ -212,7 +214,7 @@ export function panToNode(nodeName: String, svg, zoomG, d3zoom): boolean {
     let centerY = (pointTL.y + pointBR.y) / 2;
     let dx = ((svgRect.width / 2) - centerX);
     let dy = ((svgRect.height / 2) - centerY);
-    let zoomEvent = d3zoom.translate([translate[0] + dx, translate[1] + dy])
+    let zoomEvent = d3zoom.translate([transform.x + dx, transform.y + dy])
         .event;
     d3.select(zoomG).transition().duration(500).call(zoomEvent);
     return true;
@@ -232,7 +234,7 @@ export function panToNode(nodeName: String, svg, zoomG, d3zoom): boolean {
  * @return selection of the element
  */
 export function selectOrCreateChild(
-    container, tagName: string, className?: string | string[], before?) {
+    container, tagName: string, className?: string | string[], before?): d3.Selection<any, any, any, any> {
   let child = selectChild(container, tagName, className);
   if (!child.empty()) {
     return child;
@@ -269,7 +271,7 @@ export function selectOrCreateChild(
  * @return selection of the element, or an empty selection
  */
 export function selectChild(
-    container, tagName: string, className?: string | string[]) {
+    container, tagName: string, className?: string | string[]): d3.Selection<any, any, any, any> {
   let children = container.node().childNodes;
   for (let i = 0; i < children.length; i++) {
     let child = children[i];
@@ -325,7 +327,7 @@ export function selectChild(
 export function buildGroup(container,
     renderNode: render.RenderGroupNodeInfo,
     sceneElement,
-    sceneClass: string) {
+    sceneClass: string): d3.Selection<any, any, any, any> {
   sceneClass = sceneClass || Class.Scene.GROUP;
   let isNewSceneGroup = selectChild(container, 'g', sceneClass).empty();
   let sceneGroup = selectOrCreateChild(container, 'g', sceneClass);
@@ -451,12 +453,11 @@ export function translate(selection, x0: number, y0: number) {
  */
 export function positionRect(rect, cx: number, cy: number, width: number,
     height: number) {
-  rect.transition().attr({
-    x: cx - width / 2,
-    y: cy - height / 2,
-    width: width,
-    height: height
-  });
+  rect.transition()
+    .attr('x', cx - width / 2)
+    .attr('y', cy - height / 2)
+    .attr('width', width)
+    .attr('height', height);
 };
 
 /**
@@ -497,12 +498,11 @@ export function positionButton(button, renderNode: render.RenderNodeInfo) {
  */
 export function positionEllipse(ellipse, cx: number, cy: number,
     width: number, height: number) {
-  ellipse.transition().attr({
-    cx: cx,
-    cy: cy,
-    rx: width / 2,
-    ry: height / 2
-  });
+  ellipse.transition()
+    .attr('cx', cx)
+    .attr('cy', cy)
+    .attr('rx', width / 2)
+    .attr('ry', height / 2);
 };
 
 /**
@@ -529,7 +529,7 @@ function _addHealthPill(
     nodeGroupElement: SVGElement, healthPill: HealthPill,
     nodeInfo: render.RenderNodeInfo) {
   // Check if text already exists at location.
-  d3.select(nodeGroupElement.parentNode).selectAll('.health-pill').remove();
+  d3.select(nodeGroupElement.parentNode as any).selectAll('.health-pill').remove();
 
   if (!nodeInfo || !healthPill) {
     return;
@@ -671,7 +671,7 @@ export function addHealthPills(
         // Only show health pill data for this node if it is available.
         let healthPills = nodeNamesToHealthPills[nodeInfo.node.name];
         let healthPill = healthPills ? healthPills[healthPillStepIndex] : null;
-        _addHealthPill(this, healthPill, nodeInfo);
+        _addHealthPill((this as SVGElement), healthPill, nodeInfo);
       });
 };
 
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/template.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/template.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/template.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/template.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/graph-test.ts
similarity index 97%
rename from tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/test/graph-test.ts
index c633a659784b0cd740bdbdf85a3f70538cb010a3..af3030197e0824aaa808a8ad5b77fadf0cc856f9 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/test/graph-test.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/graph-test.ts
@@ -75,7 +75,7 @@ suite('graph', () => {
 
             tf.graph.parser.parseStatsPbTxt(statsPbtxt).then(stepStats => {
               tf.graph.joinStatsInfoWithGraph(slimGraph, stepStats);
-              assert.equal(slimGraph.nodes['Q'].stats.totalMicros, 6);
+              assert.equal(slimGraph.nodes['Q'].stats.getTotalMicros(), 6);
               done();
             });
           });
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/hierarchy-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/hierarchy-test.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/test/hierarchy-test.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/test/hierarchy-test.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/index.html b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/index.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/test/index.html
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/test/index.html
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/layout-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/layout-test.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/test/layout-test.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/test/layout-test.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/parser-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/parser-test.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/test/parser-test.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/test/parser-test.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/util-test.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util-test.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/test/util-test.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util-test.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/test/util.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_common/test/util.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/test/util.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_common/tf-graph-common.html b/tensorflow/tensorboard/components/tf_graph_common_d3v4/tf-graph-common.html
similarity index 55%
rename from tensorflow/tensorboard/components/tf_graph_common/tf-graph-common.html
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/tf-graph-common.html
index f4f98c8b9ac452ddf7b9455c35a993d39f66a67a..a460072a38f3c0fcd868b70f8c2325320df95028 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/tf-graph-common.html
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/tf-graph-common.html
@@ -20,19 +20,19 @@ limitations under the License.
 <link rel="import" href="../tf-imports/graphlib.html">
 <link rel="import" href="../tf-imports/lodash.html">
 
-<script src="lib/colors.js"></script>
-<script src="lib/common.js"></script>
-<script src="lib/externs.js"></script>
-<script src="lib/graph.js"></script>
-<script src="lib/hierarchy.js"></script>
-<script src="lib/layout.js"></script>
-<script src="lib/parser.js"></script>
-<script src="lib/proto.js"></script>
-<script src="lib/render.js"></script>
-<script src="lib/scene/annotation.js"></script>
-<script src="lib/scene/contextmenu.js"></script>
-<script src="lib/scene/edge.js"></script>
-<script src="lib/scene/node.js"></script>
-<script src="lib/scene/scene.js"></script>
-<script src="lib/template.js"></script>
-<script src="lib/util.js"></script>
+<script src="colors.js"></script>
+<script src="common.js"></script>
+<script src="externs.js"></script>
+<script src="graph.js"></script>
+<script src="hierarchy.js"></script>
+<script src="layout.js"></script>
+<script src="parser.js"></script>
+<script src="proto.js"></script>
+<script src="render.js"></script>
+<script src="annotation.js"></script>
+<script src="contextmenu.js"></script>
+<script src="edge.js"></script>
+<script src="node.js"></script>
+<script src="scene.js"></script>
+<script src="template.js"></script>
+<script src="util.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_graph_common/lib/util.ts b/tensorflow/tensorboard/components/tf_graph_common_d3v4/util.ts
similarity index 98%
rename from tensorflow/tensorboard/components/tf_graph_common/lib/util.ts
rename to tensorflow/tensorboard/components/tf_graph_common_d3v4/util.ts
index 8a8a29ca088b7a70c426125eb1d85d7e0b1a7be3..d0be1d6ba5a4e73525e66b856563f916454a917b 100644
--- a/tensorflow/tensorboard/components/tf_graph_common/lib/util.ts
+++ b/tensorflow/tensorboard/components/tf_graph_common_d3v4/util.ts
@@ -68,7 +68,7 @@ module tf.graph.util {
    * progress
    * of the subtask and the subtask message. The parent task should pass a
    * subtracker to its subtasks. The subtask reports its own progress which
-   * becames relative to the main task.
+   * becomes relative to the main task.
    */
   export function getSubtaskTracker(
       parentTracker: ProgressTracker, impactOnTotalProgress: number,
@@ -228,7 +228,8 @@ module tf.graph.util {
 
   export function hasDisplayableNodeStats(stats: NodeStats) {
     if (stats &&
-        (stats.totalBytes > 0 || stats.totalMicros > 0 || stats.outputSize)) {
+        (stats.totalBytes > 0 || stats.getTotalMicros() > 0 ||
+         stats.outputSize)) {
       return true;
     }
     return false;
diff --git a/tensorflow/tensorboard/components/tf_graph_controls_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..21e876fd07af818ac5e0cfcd963c270a6e6be69b
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/BUILD
@@ -0,0 +1,57 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_controls_d3v4",
+    srcs = [
+        "tf-graph-controls.html",
+    ],
+    path = "/tf-graph-controls",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "@org_polymer",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_radio_group",
+        "@org_polymer_paper_toggle_button",
+        "@org_polymer_paper_tooltip",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-controls.html",
+    ],
+    destdir = "tf-graph-controls",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/vz_line_chart/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/BUILD
similarity index 57%
rename from tensorflow/tensorboard/components/vz_line_chart/demo/BUILD
rename to tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/BUILD
index 84699b67b6bdd550dc671c10d093b376a3ba81d5..f9ea3c1828578602ecee4940b8be20a7763aea0c 100644
--- a/tensorflow/tensorboard/components/vz_line_chart/demo/BUILD
+++ b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/BUILD
@@ -1,16 +1,16 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 
 licenses(["notice"])  # Apache 2.0
 
-# bazel run //third_party/tensorflow/tensorboard/components/vz_line_chart/demo
-webfiles(
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_controls/demo
+web_library(
     name = "demo",
     srcs = ["index.html"],
-    path = "/vz-line-chart/demo",
+    path = "/tf-graph-controls/demo",
     deps = [
-        "//tensorflow/tensorboard/components/vz_line_chart",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4",
         "@org_polymer_iron_demo_helpers",
         "@org_polymer_paper_styles",
         "@org_polymer_webcomponentsjs",
diff --git a/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..8b12641b28e328351bd7321c43959a91fba56dcc
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/demo/index.html
@@ -0,0 +1,49 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-controls.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Controls Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 700px;
+    position: relative;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-controls-demo">
+      <template>
+        <tf-graph-controls
+            id="controls"
+            color-by="structure"
+        ></tf-graph-controls>
+      </template>
+      <script>
+        Polymer({
+          is: "tf-graph-controls-demo",
+        });
+      </script>
+    </dom-module>
+    <div id="demo-container">
+      <tf-graph-controls-demo></tf-graph-controls-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph-controls.html b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/tf-graph-controls.html
similarity index 88%
rename from tensorflow/tensorboard/components/tf_graph/tf-graph-controls.html
rename to tensorflow/tensorboard/components/tf_graph_controls_d3v4/tf-graph-controls.html
index dce0708d0c4e34da1a36996bf616f544a4a7d099..10faf29bbccd5b2448d31cb4c8b9378bdfb46ff4 100644
--- a/tensorflow/tensorboard/components/tf_graph/tf-graph-controls.html
+++ b/tensorflow/tensorboard/components/tf_graph_controls_d3v4/tf-graph-controls.html
@@ -22,6 +22,7 @@ limitations under the License.
 <link rel="import" href="../paper-tooltip/paper-tooltip.html">
 <link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
 <link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
 
 <dom-module id="tf-graph-controls">
 <template>
@@ -145,7 +146,7 @@ svg.icon {
 .title small {
   font-weight: normal;
 }
-.deviceList {
+.deviceList, .xlaClusterList {
   max-height: 200px;
   overflow-y: auto;
 }
@@ -235,6 +236,12 @@ table.control-holder {
 table.tf-graph-controls td.input-element-table-data {
   padding: 0 0 0 20px;
 }
+
+/** Override inline styles that suppress pointer events for disabled buttons. Otherwise, the */
+/*  tooltips do not appear. */
+#color-by-radio-group paper-radio-button {
+  pointer-events: auto !important;
+}
 </style>
 <svg width="0" height="0">
   <defs>
@@ -317,13 +324,39 @@ table.tf-graph-controls td.input-element-table-data {
   </table>
   <div class="control-holder">
     <div class="title">Color</div>
-    <paper-radio-group selected="{{colorBy}}">
+    <paper-radio-group id="color-by-radio-group" selected="{{colorBy}}">
       <paper-radio-button name="structure">Structure</paper-radio-button>
+
       <paper-radio-button name="device">Device</paper-radio-button>
-      <template is="dom-if" if="[[_statsNotNull(stats)]]">
-        <paper-radio-button name="compute_time">Compute time</paper-radio-button>
-        <paper-radio-button name="memory">Memory</paper-radio-button>
-      </template>
+
+      <paper-radio-button id="xla-cluster-radio-button"
+                          name="xla_cluster"
+                          disabled="[[!_xlaClustersProvided(renderHierarchy)]]">
+        XLA Cluster
+      </paper-radio-button>
+      <paper-tooltip for="xla-cluster-radio-button" position="right">
+        Coloring by XLA cluster is only enabled if at least 1 op specifies an XLA cluster.
+      </paper-tooltip>
+
+      <paper-radio-button id="compute-time-radio-button"
+                          name="compute_time"
+                          disabled="[[!stats]]">
+        Compute time
+      </paper-radio-button>
+      <paper-tooltip for="compute-time-radio-button" position="right">
+        Coloring by compute time is only enabled if the RunMetadata proto is passed to the
+        FileWriter when a specific session is run.
+      </paper-tooltip>
+
+      <paper-radio-button id="memory-radio-button"
+                          name="memory"
+                          disabled="[[!stats]]">
+        Memory
+      </paper-radio-button>
+      <paper-tooltip for="memory-radio-button" position="right">
+        Coloring by memory is only enabled if the RunMetadata proto is passed to the
+        FileWriter when a specific session is run.
+      </paper-tooltip>
     </paper-radio-group>
   </div>
   <div>
@@ -410,6 +443,32 @@ table.tf-graph-controls td.input-element-table-data {
         </div>
       </div>
     </template>
+    <template is="dom-if" if="[[_equals(colorBy, 'xla_cluster')]]">
+      <div class="color-text">
+        <div class="xlaClusterList">
+          <table>
+          <template is="dom-repeat" items="[[colorByParams.xla_cluster]]">
+            <tr>
+              <td style$="[[_getBackgroundColor(item.color)]]">
+                <div class="colorBox"></div>
+              </td>
+              <td>
+                <div>[[item.xla_cluster]]</div>
+              </td>
+            </tr>
+          </template>
+          </table>
+        </div>
+        <br/>
+        <div class="color-legend-row">
+          <svg>
+            <use xmlns:xlink="http://www.w3.org/1999/xlink"
+                 xlink:href="#grey-rect" x="0" y="0"/>
+          </svg>
+          <span class="color-legend-value">unknown XLA cluster</span>
+        </div>
+      </div>
+    </template>
     <template is="dom-if" if="[[_statsNotNull(stats)]]">
       <div class="color-legend-row">
         <svg>
@@ -528,6 +587,8 @@ table.tf-graph-controls td.input-element-table-data {
   </template>
   </div>
 </template>
+</dom-module>
+
 <script>
 (function() { // Private scope.
 /**
@@ -620,8 +681,10 @@ Polymer({
     this.renderHierarchy.traceInputs = event.target.active;
     tf.graph.scene.node.traceInputs(this.renderHierarchy);
   },
-  _statsNotNull: function(stats) {
-    return stats != null;
+  _xlaClustersProvided: function(renderHierarchy) {
+    return renderHierarchy &&
+        renderHierarchy.hierarchy &&
+        renderHierarchy.hierarchy.xlaClusters.length > 0;
   },
   _statsChanged: function(stats) {
     if (stats == null) {
@@ -757,8 +820,10 @@ Polymer({
       graphPath = graphPath.substring(slashIndex + 1);
     }
     this.$.graphdownload.setAttribute('download', graphPath + '.png');
-  }
+  },
+  _statsNotNull: function(stats) {
+    return stats !== null;
+  },
 });
 })(); // Closing private scope.
 </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..031aa4be10ad4737b8b99572af0094261cca858c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/BUILD
@@ -0,0 +1,63 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_d3v4",
+    srcs = [
+        "tf-graph.html",
+        "tf-graph-minimap.html",
+        "tf-graph-scene.html",
+    ],
+    path = "/tf-graph",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_flex_layout",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_radio_group",
+        "@org_polymer_paper_toggle_button",
+        "@org_polymer_paper_tooltip",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph.html",
+        "tf-graph-minimap.html",
+        "tf-graph-scene.html",
+    ],
+    destdir = "tf-graph",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4b4004bec823f5ae480fdb7eeeee386d472df592
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..52e2f0b9340950ed5f873cba17c8bbf2aee62e6a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/demo/index.html
@@ -0,0 +1,92 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
+
+        <!-- We color ops in the graph by XLA cluster. -->
+        <tf-graph id="graph" color-by="xla_cluster"></tf-graph>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Render the graph.
+              this.$.graph.set('basicGraph', slimGraph);
+              this.$.graph.set('graphHierarchy', graphHierarchy);
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-demo></tf-graph-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph-minimap.html b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-minimap.html
similarity index 97%
rename from tensorflow/tensorboard/components/tf_graph/tf-graph-minimap.html
rename to tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-minimap.html
index cdbee3af5ed744fe4942ed86d6dd1d8bf69775c8..5fc16c05207fd082336717a6da2563e9eafc3985 100644
--- a/tensorflow/tensorboard/components/tf_graph/tf-graph-minimap.html
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-minimap.html
@@ -16,7 +16,7 @@ limitations under the License.
 -->
 
 <link rel="import" href="../polymer/polymer.html">
-<script src="../tf-graph-common/lib/scene/minimap.js"></script>
+<script src="../tf-graph-common/minimap.js"></script>
 
 <dom-module id="tf-graph-minimap">
 <template>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph-scene.html b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-scene.html
similarity index 92%
rename from tensorflow/tensorboard/components/tf_graph/tf-graph-scene.html
rename to tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-scene.html
index 09513699c30a87f88ec04a50e560e61ce224ec24..35705713b98cfc500a23c6df744e4805348e0ed3 100644
--- a/tensorflow/tensorboard/components/tf_graph/tf-graph-scene.html
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph-scene.html
@@ -322,7 +322,7 @@ limitations under the License.
 /* --- Annotation --- */
 
 /* only applied for annotations that are not summary or constant.
-(.summary, .constant gets overriden below) */
+(.summary, .constant gets overridden below) */
 ::content .annotation > .annotation-node > * {
   stroke-width: 0.5;
   stroke-dasharray: 1, 1;
@@ -619,6 +619,18 @@ Polymer({
     renderHierarchy: Object,
     name: String,
     colorBy: String,
+
+    // For each render hierarchy, we only fit it to the viewport once (when the scene is attached to
+    // the DOM). We do not fit the hierarchy again (unless the user clicks the reset button). For
+    // instance, if the user enters a certain view in the graph, switches to another dashboard, and
+    // returns to the graph dashboard, the user expects the previous view. These properties enable
+    // that behavior.
+
+    /** Whether the scene has fit the current render hierarchy (to the viewport) at least once. */
+    _hasRenderHierarchyBeenFitOnce: Boolean,
+    /** Whether this scene element is currently attached to a parent element. */
+    _isAttached: Boolean,
+
     /** @type {d3_zoom} d3 zoom object */
     _zoom: Object,
     highlightedNode: {
@@ -637,12 +649,12 @@ Polymer({
     },
     /** Keeps track of the starting coordinates of a graph zoom/pan */
     _zoomStartCoords: {
-      type: Array,
+      type: Object,
       value: null
     },
     /** Keeps track of the current coordinates of a graph zoom/pan */
-    _zoomCoords: {
-      type: Array,
+    _zoomTransform: {
+      type: Object,
       value: null
     },
     /** Maximum distance of a zoom event for it to be interpreted as a click */
@@ -723,7 +735,10 @@ Polymer({
   },
   observers: [
     '_colorByChanged(colorBy)',
-    '_buildAndFit(renderHierarchy)',
+    '_renderHierarchyChanged(renderHierarchy)',
+    // Animation and fitting must come after the observer for the hierarchy changing because we must
+    // first build the render hierarchy.
+    '_animateAndFit(_isAttached, renderHierarchy)',
     '_updateHealthPills(nodeNamesToHealthPills, healthPillStepIndex)',
   ],
   getNode: function(nodeName) {
@@ -772,8 +787,8 @@ Polymer({
     }.bind(this), tf.graph.layout.PARAMS.animation.duration);
   },
   ready: function() {
-    this._zoom = d3.behavior.zoom()
-      .on('zoomend', function() {
+    this._zoom = d3.zoom()
+      .on('end', function() {
         if (this._zoomStartCoords) {
           // Calculate the total distance dragged during the zoom event.
           // If it is sufficiently small, then fire an event indicating
@@ -782,8 +797,8 @@ Polymer({
           // is ignored (as this mouse click was part of a zooming, and should
           // not be used to indicate an actual click on the graph).
           var dragDistance = Math.sqrt(
-            Math.pow(this._zoomStartCoords[0] - this._zoomCoords[0], 2) +
-            Math.pow(this._zoomStartCoords[1] - this._zoomCoords[1], 2));
+            Math.pow(this._zoomStartCoords.x - this._zoomTransform.x, 2) +
+            Math.pow(this._zoomStartCoords.y - this._zoomTransform.y, 2));
           if (dragDistance < this._maxZoomDistanceForClick) {
             this._fireEnableClick();
           } else {
@@ -793,8 +808,8 @@ Polymer({
         this._zoomStartCoords = null;
       }.bind(this))
       .on('zoom', function() {
-        // Store the coordinates of the zoom event
-        this._zoomCoords = d3.event.translate;
+        // Store the coordinates of the zoom event.
+        this._zoomTransform = d3.event.transform;
 
         // If this is the first zoom event after a zoom-end, then
         // store the coordinates as the start coordinates as well,
@@ -803,15 +818,13 @@ Polymer({
         // event on mouse-down, even if there has been no dragging
         // done to translate the graph around.
         if (!this._zoomStartCoords) {
-          this._zoomStartCoords = this._zoomCoords.slice();
+          this._zoomStartCoords = this._zoomTransform;
           this.fire('disable-click');
         }
         this._zoomed = true;
-        d3.select(this.$.root).attr('transform',
-                    'translate(' + d3.event.translate + ')' +
-                    'scale(' + d3.event.scale + ')');
+        d3.select(this.$.root).attr('transform', d3.event.transform);
         // Notify the minimap.
-        this.minimap.zoom(d3.event.translate, d3.event.scale);
+        this.minimap.zoom(d3.event.transform);
       }.bind(this));
     d3.select(this.$.svg).call(this._zoom)
       .on('dblclick.zoom', null);
@@ -826,9 +839,24 @@ Polymer({
         tf.graph.layout.PARAMS.minimap.size,
         tf.graph.layout.PARAMS.subscene.meta.labelHeight);
   },
-  _buildAndFit: function(renderHierarchy) {
+  attached: function() {
+    this.set('_isAttached', true);
+  },
+  detached: function() {
+    this.set('_isAttached', false);
+  },
+  _renderHierarchyChanged: function(renderHierarchy) {
+    this._hasRenderHierarchyBeenFitOnce = false;
     this._resetState();
     this._build(renderHierarchy);
+  },
+  _animateAndFit: function(isAttached, renderHierarchy) {
+    if (this._hasRenderHierarchyBeenFitOnce || !isAttached) {
+      // Do not animate and fit if the scene has already fitted this render hierarchy once. Or if
+      // the graph dashboard is not attached (in which case the scene lacks DOM info for fitting).
+      return;
+    }
+
     // Fit to screen after the graph is done animating.
     setTimeout(this.fit.bind(this), tf.graph.layout.PARAMS.animation.duration);
   },
@@ -837,14 +865,14 @@ Polymer({
     var titleStyle = mainGraphTitleElement.style;
     var auxTitleStyle = this.getElementsByClassName('auxTitle')[0].style;
     var core = d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-      tf.graph.scene.Class.Scene.CORE)[0][0];
+      tf.graph.scene.Class.Scene.CORE).node();
     // Only show labels if the graph is fully loaded.
     if (showLabels && core && this.progress && this.progress.value === 100) {
       var aux =
         d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-          tf.graph.scene.Class.Scene.INEXTRACT)[0][0] ||
+          tf.graph.scene.Class.Scene.INEXTRACT).node() ||
         d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-          tf.graph.scene.Class.Scene.OUTEXTRACT)[0][0];
+          tf.graph.scene.Class.Scene.OUTEXTRACT).node();
       var coreX = core.getCTM().e;
       var auxX = aux ? aux.getCTM().e : null;
       titleStyle.display = 'inline';
@@ -881,6 +909,7 @@ Polymer({
     }
   },
   fit: function() {
+    this._hasRenderHierarchyBeenFitOnce = true;
     tf.graph.scene.fit(this.$.svg, this.$.root, this._zoom, function() {
       this._zoomed = false;
     }.bind(this));
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph.html b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph.html
similarity index 97%
rename from tensorflow/tensorboard/components/tf_graph/tf-graph.html
rename to tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph.html
index 8792f475c44c0011bfb959ab85f3f23eae884408..efbf065a40ac80d3a45f6fe304841c98ed51a02b 100644
--- a/tensorflow/tensorboard/components/tf_graph/tf-graph.html
+++ b/tensorflow/tensorboard/components/tf_graph_d3v4/tf-graph.html
@@ -172,7 +172,14 @@ Polymer({
             device: deviceName,
             color: renderGraph.deviceColorMap(deviceName)
           };
-        })
+        }),
+        xla_cluster: _.map(renderGraph.xlaClusterColorMap.domain(),
+            function(xlaClusterName) {
+          return {
+            xla_cluster: xlaClusterName,
+            color: renderGraph.xlaClusterColorMap(xlaClusterName)
+          };
+        }),
       });
       this._setRenderHierarchy(renderGraph);
       this.async(function() {
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..01e786d1b9c64add6e2af0e09fb3744d8401c468
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/BUILD
@@ -0,0 +1,58 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_graph_dashboard_d3v4",
+    srcs = [
+        "tf-graph-dashboard.html",
+    ],
+    path = "/tf-graph-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-graph-dashboard.html",
+    ],
+    destdir = "tf-graph-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_board_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_controls_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4:legacy",
+    ],
+)
+
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/BUILD
similarity index 50%
rename from tensorflow/tensorboard/components/vz_histogram_timeseries/demo/BUILD
rename to tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/BUILD
index 894de95be65775ce5c7411e4599dab121c662432..74238d78e2f2f97a054d3588abf7d3b08ef02867 100644
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries/demo/BUILD
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/BUILD
@@ -1,18 +1,17 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 
 licenses(["notice"])  # Apache 2.0
 
-# bazel run //third_party/tensorflow/tensorboard/components/vz_histogram_timeseries/demo
-webfiles(
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_dashboard/demo
+web_library(
     name = "demo",
-    srcs = ["index.html"],
-    path = "/vz-histogram-timeseries/demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-dashboard/demo",
     deps = [
-        "//tensorflow/tensorboard/components/vz_histogram_timeseries",
+        "//tensorflow/tensorboard/components/tf_graph_dashboard_d3v4",
         "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_button",
         "@org_polymer_paper_styles",
         "@org_polymer_webcomponentsjs",
     ],
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/graph_run_run1.pbtxt b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/graph_run_run1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/graph_run_run1.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/runs.json b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/runs.json
new file mode 100644
index 0000000000000000000000000000000000000000..0429aa71f8271a291450f898e2a4b73da738b267
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/data/runs.json
@@ -0,0 +1,6 @@
+{
+  "run1": {
+    "graph": true,
+    "scalars": ["foo/sin"]
+  }
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..67756cc1298a15818263b1825b3d8a381b38ac7a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/demo/index.html
@@ -0,0 +1,56 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../tf-graph-dashboard.html">
+<link rel="import" href="../../paper-styles/typography.html">
+
+<title>Graph Dashboard Demo</title>
+<style>
+  #demo-container {
+    display: block;
+    height: 900px;
+    position: relative;
+    width: 100%;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="graph-dashboard-demo">
+      <template>
+        <tf-graph-dashboard backend="[[backend]]"></tf-graph-dashboard>
+      </template>
+      <script>
+        Polymer({
+          is: "graph-dashboard-demo",
+          properties: {
+            backend: {
+              type: Object,
+              value: function() {
+                var router = new TF.Backend.router("data", true);
+                return new TF.Backend.Backend(router);
+              },
+            },
+          },
+        });
+      </script>
+    </dom-module>
+    <graph-dashboard-demo id="demo-container"></graph-dashboard-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/tf-graph-dashboard.html
similarity index 84%
rename from tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html
rename to tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/tf-graph-dashboard.html
index d62e4ccedc228979506d602415e1b19150ca35d0..891905e7c470aae627a03edf263d32ff8ed19c07 100644
--- a/tensorflow/tensorboard/components/tf_graph_dashboard/tf-graph-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_graph_dashboard_d3v4/tf-graph-dashboard.html
@@ -18,7 +18,7 @@ limitations under the License.
 <link rel="import" href="../polymer/polymer.html">
 <link rel="import" href="../tf-graph-loader/tf-graph-loader.html">
 <link rel="import" href="../tf-graph-board/tf-graph-board.html">
-<link rel="import" href="../tf-graph/tf-graph-controls.html">
+<link rel="import" href="../tf-graph-controls/tf-graph-controls.html">
 <link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
 <link rel="import" href="../tf-backend/tf-backend.html">
 
@@ -103,21 +103,31 @@ out-hierarchy-params="{{_hierarchyParams}}"
 </dom-module>
 
 <script>
+"use strict";
+
 (function() {
-Polymer({
+TF.Dashboard.TfGraphDashboard = Polymer({
   is: 'tf-graph-dashboard',
+  factoryImpl: function(backend, debuggerDataEnabled) {
+    this.backend = backend;
+    this.debuggerDataEnabled = debuggerDataEnabled;
+  },
   behaviors: [
+    TF.Dashboard.DashboardBehavior("graphs"),
     TF.Dashboard.ReloadBehavior("tf-graph-dashboard"),
     TF.Backend.Behavior,
   ],
   properties: {
     _datasets: Object,
     _renderHierarchy: {type: Object, observer: '_renderHierarchyChanged'},
-    backend: {type: Object, observer: '_backendChanged'},
+    backend: Object,
     debuggerDataEnabled: Boolean,
     allStepsModeEnabled: Boolean,
     specificHealthPillStep: {type: Number, value: 0},
     healthPillsToggledOn: {type: Boolean, value: true, observer: '_healthPillsToggledOnChanged'},
+    _isAttached: Boolean,
+    // Whether this dashboard is initialized. This dashboard should only be initialized once.
+    _initialized: Boolean,
     // Whether health pills are currently being loaded, in which case we may want to say show a
     // spinner.
     _areHealthPillsLoading: Boolean,
@@ -145,24 +155,37 @@ Polymer({
     'node-toggle-expand': '_handleNodeToggleExpand',
   },
   observers: [
-    '_maybeFetchHealthPillsAtSpecificStep(allStepsModeEnabled, specificHealthPillStep)',
+    '_maybeFetchHealthPills(allStepsModeEnabled, specificHealthPillStep)',
+    '_maybeInitializeDashboard(backend, _isAttached)',
   ],
+  attached: function() {
+    this.set('_isAttached', true);
+  },
+  detached: function() {
+    this.set('_isAttached', false);
+  },
   reload: function() {
-    if (!this.debuggerDataEnabled ||
-        !this.healthPillsToggledOn ||
-        !this._renderHierarchy ||
-        this._datasetsEmpty(this._datasets)) {
-      // Do not load debugger data if the feature is disabled, if the user toggled off the feature,
-      // or if the graph itself has not loaded yet. We need the graph to load so that we know which
-      // nodes to request health pills for.
+    this._maybeFetchHealthPills();
+  },
+  _shouldRequestHealthPills: function() {
+    // Do not load debugger data if the feature is disabled, if the user toggled off the feature,
+    // or if the graph itself has not loaded yet. We need the graph to load so that we know which
+    // nodes to request health pills for.
+    return this.debuggerDataEnabled &&
+        this.healthPillsToggledOn &&
+        this._renderHierarchy &&
+        !this._datasetsEmpty(this._datasets);
+  },
+  _maybeInitializeDashboard: function(backend, isAttached) {
+    if (this._initialized || !backend || !isAttached) {
+      // Either this dashboard is already initialized ... or we are not yet ready to initialize.
       return;
     }
-
-    // Request debugger data on graph reloads, but do not re-request the graph itself. The graph
-    // would not change across reloads.
-    this._requestHealthPills();
-  },
-  _backendChanged: function(backend) {
+    if (typeof ga !== 'undefined' && ga != null) {
+      ga('send', {hitType: 'pageview', page: '/v/graph'});
+    }
+    // Set this to true so we only initialize once.
+    this._initialized = true;
     Promise.all([backend.graphRuns(), backend.runMetadataRuns()])
       .then(function(result) {
         var runsWithGraph = result[0].sort(VZ.Sorting.compareTagNames);
@@ -253,7 +276,7 @@ Polymer({
   },
   _handleNodeToggleExpand: function() {
     // Nodes were toggled. We may need to request health pills for more nodes.
-    this._requestHealthPills();
+    this._maybeFetchHealthPills();
   },
   _healthPillsToggledOnChanged: function(healthPillsToggledOn) {
     if (healthPillsToggledOn) {
@@ -265,9 +288,8 @@ Polymer({
     }
   },
   // Fetch health pills for a specific step if applicable.
-  _maybeFetchHealthPillsAtSpecificStep: function(allStepsModeEnabled, specificHealthPillStep) {
-    if (!this._renderHierarchy) {
-      // The graph is not ready yet.
+  _maybeFetchHealthPills: function() {
+    if (!this._shouldRequestHealthPills()) {
       return;
     }
 
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/BUILD b/tensorflow/tensorboard/components/tf_graph_info_d3v4/BUILD
similarity index 51%
rename from tensorflow/tensorboard/components/tf_image_dashboard/BUILD
rename to tensorflow/tensorboard/components/tf_graph_info_d3v4/BUILD
index 58c7cf8cd5d11457579708a6f32492ffa89ebfa0..1235a51ae0de87aeeaeac99ecd42bfd8ba239786 100644
--- a/tensorflow/tensorboard/components/tf_image_dashboard/BUILD
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/BUILD
@@ -1,27 +1,26 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 
 licenses(["notice"])  # Apache 2.0
 
-webfiles(
-    name = "tf_image_dashboard",
+web_library(
+    name = "tf_graph_info_d3v4",
     srcs = [
-        "tf-image-dashboard.html",
-        "tf-image-loader.html",
+        "tf-graph-icon.html",
+        "tf-graph-info.html",
+        "tf-node-info.html",
+        "tf-node-list-item.html",
     ],
-    path = "/tf-image-dashboard",
+    path = "/tf-graph-info",
     deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
         "@org_polymer",
-        "@org_polymer_paper_dialog",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_iron_list",
         "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_item",
         "@org_polymer_paper_slider",
         "@org_polymer_paper_spinner",
     ],
@@ -36,21 +35,29 @@ filegroup(
 ################################################################################
 # MARKED FOR DELETION
 
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
 tensorboard_webcomponent_library(
     name = "legacy",
     srcs = [
-        "tf-image-dashboard.html",
-        "tf-image-loader.html",
+        "tf-graph-icon.html",
+        "tf-graph-info.html",
+        "tf-node-info.html",
+        "tf-node-list-item.html",
     ],
-    destdir = "tf-image-dashboard",
+    destdir = "tf-graph-info",
     deps = [
-        "//tensorflow/tensorboard/components/tf_backend:legacy",
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4:legacy",
     ],
 )
 
-# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
 tensorboard_ts_library(
     name = "legacy_ts",
     srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
 )
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9c8d78ee068ee76d35889a1cb9c3bce97d6ac97c
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/BUILD
@@ -0,0 +1,26 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_info/demo
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-info/demo",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_info_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..f7d2ef7ee5e56a870b1b49cfff3dd416953f3fa3
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/demo/index.html
@@ -0,0 +1,94 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-info.html">
+<link rel="import" href="../../tf-graph-common/tf-graph-common.html">
+<link rel="import" href="../../tf-graph-loader/tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Info Demo</title>
+<style>
+  #demo-container {
+    border: 2px solid #808080;
+    width: 1000px;
+    height: 600px;
+  }
+</style>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-info-demo">
+      <template>
+        <!-- We first use the graph loader to load and parse a pbtxt file into a graph object. -->
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            out-graph="{{_graph}}">
+        </tf-graph-loader>
+
+        <tf-graph-info id="info" title="selected"></tf-graph-info>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-info-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _graph: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_graphUpdated(_graph)',
+          ],
+          _graphUpdated: function(slimGraph) {
+            const tracker = tf.graph.util.getTracker(this.$.loader);
+            const hierarchyTracker = tf.graph.util.getSubtaskTracker(
+                tracker, 100, 'Namespace hierarchy');
+            const hierarchyOptions = {};
+            tf.graph.hierarchy.build(slimGraph, hierarchyOptions, hierarchyTracker).then(
+                function(graphHierarchy) {
+              // We have parsed and built the graph object from a pbtxt file. Show info.
+              this.$.info.set('graph', slimGraph);
+              this.$.info.set('graphHierarchy', graphHierarchy);
+
+              // Select a node within that graph.
+              this.$.info.set('selectedNode', 'GradientDescent/learning_rate');
+            }.bind(this));
+          },
+        });
+      </script>
+    </dom-module>
+    <div id='demo-container'>
+      <tf-graph-info-demo></tf-graph-info-demo>
+    </div>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph/tf-graph-icon.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-icon.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph/tf-graph-icon.html
rename to tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-icon.html
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-info.html
similarity index 99%
rename from tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html
rename to tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-info.html
index 45347fb1de5e91ca8e67e50013588d9c0e91f160..b33e1e00d04e4836322ce0975847aa88f6b0a5d2 100644
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-graph-info.html
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-graph-info.html
@@ -169,6 +169,8 @@ h2 {
 </template>
 </template>
 <script>
+"use strict";
+
 (function() {
   Polymer({
     is: 'tf-graph-info',
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-info.html
similarity index 97%
rename from tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html
rename to tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-info.html
index 1e60cda66adbf125f716987dc3932894e14f553d..f1455acaee2b9f9cc7c5ef30c0036b3301f378e3 100644
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-node-info.html
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-info.html
@@ -19,9 +19,10 @@ limitations under the License.
 <link rel="import" href="../iron-list/iron-list.html">
 <link rel="import" href="../polymer/polymer.html">
 <link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-item/all-imports.html">
+<link rel="import" href="../paper-item/paper-item.html">
+<link rel="import" href="../paper-item/paper-item-body.html">
 <link rel="import" href="../tf-graph-common/tf-graph-common.html">
-<link rel="import" href="../tf-graph/tf-graph-icon.html">
+<link rel="import" href="tf-graph-icon.html">
 <link rel="import" href="tf-node-list-item.html">
 
 <dom-module id="tf-node-info">
@@ -315,7 +316,7 @@ limitations under the License.
                   <div class="sub-list-table-cell">[[_nodeStatsFormattedBytes]]</div>
                 </div>
               </template>
-              <template is="dom-if" if="{{_nodeStats.totalMicros}}">
+              <template is="dom-if" if="{{_getTotalMicros(_nodeStats)}}">
                 <div class="sub-list-table-row">
                   <div class="sub-list-table-cell">Compute Time</div>
                   <div class="sub-list-table-cell">[[_nodeStatsFormattedComputeTime]]</div>
@@ -456,6 +457,9 @@ limitations under the License.
           }
           return null;
         },
+        _getTotalMicros: function(stats) {
+          return stats.getTotalMicros();
+        },
         _getHasDisplayableNodeStats: function(stats) {
           return tf.graph.util.hasDisplayableNodeStats(stats);
         },
@@ -468,12 +472,12 @@ limitations under the License.
               stats.totalBytes, tf.graph.util.MEMORY_UNITS);
         },
         _getNodeStatsFormattedComputeTime: function(stats) {
-          if (!stats || !stats.totalMicros) {
+          if (!stats || !stats.getTotalMicros()) {
             return;
           }
 
           return tf.graph.util.convertUnitsToHumanReadable(
-              stats.totalMicros, tf.graph.util.TIME_UNITS);
+              stats.getTotalMicros(), tf.graph.util.TIME_UNITS);
         },
         _getNodeStatsFormattedOutputSizes: function(stats) {
           if (!stats || !stats.outputSize || !stats.outputSize.length) {
diff --git a/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-list-item.html
similarity index 98%
rename from tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html
rename to tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-list-item.html
index 9e9bface5de22709406921ac4008c3781d7e81cd..c15478d126ccbb055a7bbb46f3a29c897321a648 100644
--- a/tensorflow/tensorboard/components/tf_graph_info/tf-node-list-item.html
+++ b/tensorflow/tensorboard/components/tf_graph_info_d3v4/tf-node-list-item.html
@@ -16,8 +16,8 @@ limitations under the License.
 -->
 
 <link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-graph/tf-graph-icon.html">
 <link rel="import" href="../tf-dashboard-common/tensorboard-color.html">
+<link rel="import" href="tf-graph-icon.html">
 
 <dom-module id="tf-node-list-item">
   <style>
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/BUILD
similarity index 52%
rename from tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD
rename to tensorflow/tensorboard/components/tf_graph_loader_d3v4/BUILD
index 49920b11a1a1dab3d36edda0c6dbf3e1038b7ba0..21eb4df1b46ceadc9dbf71d9a360faee5df5bfee 100644
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries/BUILD
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/BUILD
@@ -1,19 +1,17 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 
 licenses(["notice"])  # Apache 2.0
 
-webfiles(
-    name = "vz_histogram_timeseries",
+web_library(
+    name = "tf_graph_loader_d3v4",
     srcs = [
-        "vz-histogram-timeseries.html",
+        "tf-graph-loader.html",
     ],
-    path = "/vz-histogram-timeseries",
+    path = "/tf-graph-loader",
     deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4",
         "@org_polymer",
     ],
 )
@@ -27,21 +25,25 @@ filegroup(
 ################################################################################
 # MARKED FOR DELETION
 
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
 tensorboard_webcomponent_library(
     name = "legacy",
     srcs = [
-        "index.html",
-        "vz-histogram-timeseries.html",
-        ":legacy_ts",
+        "tf-graph-loader.html",
     ],
-    visibility = ["//visibility:public"],
-    destdir = "vz-histogram-timeseries",
+    destdir = "tf-graph-loader",
     deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//third_party/javascript/polymer/v1/polymer:lib",
+        "//tensorflow/tensorboard/components/tf_graph_common_d3v4:legacy",
     ],
 )
 
+# This is needed despite how this component lacks TypeScript files because
+# components/BUILD seeks a legacy_ts rule in this package.
 tensorboard_ts_library(
     name = "legacy_ts",
+    srcs = [],
+    deps_mgmt = "off",
+    runtime = "nodejs",
 )
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/demo/BUILD b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/BUILD
similarity index 52%
rename from tensorflow/tensorboard/components/vz_distribution_chart/demo/BUILD
rename to tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/BUILD
index 77f05aa2d4c3540953bcce3e9c8e6dbc9685cf79..b16ca418946aa8e52de371bef998c434202f59b4 100644
--- a/tensorflow/tensorboard/components/vz_distribution_chart/demo/BUILD
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/BUILD
@@ -1,16 +1,16 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 
 licenses(["notice"])  # Apache 2.0
 
-# bazel run //third_party/tensorflow/tensorboard/components/vz_distribution_chart/demo
-webfiles(
+# bazel run //third_party/tensorflow/tensorboard/components/tf_graph_loader/demo
+web_library(
     name = "demo",
-    srcs = ["index.html"],
-    path = "/vz-distribution-chart/demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-graph-loader/demo",
     deps = [
-        "//tensorflow/tensorboard/components/vz_distribution_chart",
+        "//tensorflow/tensorboard/components/tf_graph_loader_d3v4",
         "@org_polymer_iron_demo_helpers",
         "@org_polymer_paper_styles",
         "@org_polymer_webcomponentsjs",
diff --git a/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/data/graph.pbtxt b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/data/graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30b206453469801d31b46856c29cdda78164f18f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/data/graph.pbtxt
@@ -0,0 +1,4606 @@
+node {
+  name: "GradientDescent/learning_rate"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_3"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0.1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 100
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000d\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_grad/Shape"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/BroadcastGradientArgs"
+  op: "BroadcastGradientArgs"
+  input: "gradients/add_1_grad/Shape"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+        }
+        shape {
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod_1"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape_1"
+  input: "gradients/Mean_grad/Const_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Maximum"
+  op: "Maximum"
+  input: "gradients/Mean_grad/Prod_1"
+  input: "gradients/Mean_grad/Maximum/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Prod"
+  op: "Prod"
+  input: "gradients/Mean_grad/Shape"
+  input: "gradients/Mean_grad/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/floordiv"
+  op: "FloorDiv"
+  input: "gradients/Mean_grad/Prod"
+  input: "gradients/Mean_grad/Maximum"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Cast"
+  op: "Cast"
+  input: "gradients/Mean_grad/floordiv"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "DstT"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "SrcT"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile/multiples"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Fill"
+  op: "Fill"
+  input: "gradients/Shape"
+  input: "gradients/Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Fill"
+  input: "gradients/Mean_grad/Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/Tile"
+  op: "Tile"
+  input: "gradients/Mean_grad/Reshape"
+  input: "gradients/Mean_grad/Tile/multiples"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tmultiples"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Mean_grad/truediv"
+  op: "RealDiv"
+  input: "gradients/Mean_grad/Tile"
+  input: "gradients/Mean_grad/Cast"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_3_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/Mean_grad/truediv"
+  input: "gradients/Reshape_3_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  op: "ExpandDims"
+  input: "gradients/Reshape_3_grad/Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims/dim"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tdim"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/begin"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat_1/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_2"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_1"
+  op: "Sub"
+  input: "Rank_2"
+  input: "Sub_1/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_1/begin"
+  op: "Pack"
+  input: "Sub_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_1"
+  op: "Slice"
+  input: "Shape_2"
+  input: "Slice_1/begin"
+  input: "Slice_1/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat_1"
+  op: "ConcatV2"
+  input: "concat_1/values_0"
+  input: "Slice_1"
+  input: "concat_1/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat/axis"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "concat/values_0"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: -1
+      }
+    }
+  }
+}
+node {
+  name: "Slice/size"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Sub/y"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "Shape_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub"
+  op: "Sub"
+  input: "Rank_1"
+  input: "Sub/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice/begin"
+  op: "Pack"
+  input: "Sub"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice"
+  op: "Slice"
+  input: "Shape_1"
+  input: "Slice/begin"
+  input: "Slice/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "concat"
+  op: "ConcatV2"
+  input: "concat/values_0"
+  input: "Slice"
+  input: "concat/axis"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\n\000\000\000"
+      }
+    }
+  }
+}
+node {
+  name: "Rank"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 2
+      }
+    }
+  }
+}
+node {
+  name: "Sub_2"
+  op: "Sub"
+  input: "Rank"
+  input: "Sub_2/y"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Slice_2/size"
+  op: "Pack"
+  input: "Sub_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "Slice_2"
+  op: "Slice"
+  input: "Shape"
+  input: "Slice_2/begin"
+  input: "Slice_2/size"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "Index"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_biases/read"
+  op: "Identity"
+  input: "logits_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "logits_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "logits_weights/read"
+  op: "Identity"
+  input: "logits_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_biases"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_biases/read"
+  op: "Identity"
+  input: "hidden_biases"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "hidden_weights"
+  op: "VariableV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 784
+        }
+        dim {
+          size: 100
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "hidden_weights/read"
+  op: "Identity"
+  input: "hidden_weights"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape/shape"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 2
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+          dim {
+            size: 2
+          }
+        }
+        tensor_content: "\310\000\000\000\377\377\377\377"
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/depth"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/off_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot/on_value"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 200
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_1/random_shuffle_queue"
+  op: "RandomShuffleQueueV2"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "capacity"
+    value {
+      i: 20000
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "container"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "min_after_dequeue"
+    value {
+      i: 4000
+    }
+  }
+  attr {
+    key: "seed"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "seed2"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 28
+          }
+          dim {
+            size: 28
+          }
+          dim {
+            size: 1
+          }
+        }
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "shared_name"
+    value {
+      s: ""
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  op: "QueueDequeueManyV2"
+  input: "mnist_dataset_train_1/random_shuffle_queue"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany/n"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "component_types"
+    value {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    key: "timeout_ms"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany"
+  input: "Reshape/shape"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: -1
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul"
+  op: "MatMul"
+  input: "Reshape"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add"
+  op: "Add"
+  input: "MatMul"
+  input: "hidden_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Relu"
+  op: "Relu"
+  input: "add"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "add_1"
+  op: "Add"
+  input: "MatMul_1"
+  input: "logits_biases/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_1"
+  op: "Reshape"
+  input: "add_1"
+  input: "concat"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "mnist_dataset_train_2/one_hot"
+  op: "OneHot"
+  input: "mnist_dataset_train_2/random_shuffle_queue_DequeueMany:1"
+  input: "mnist_dataset_train_2/one_hot/depth"
+  input: "mnist_dataset_train_2/one_hot/on_value"
+  input: "mnist_dataset_train_2/one_hot/off_value"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "TI"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          unknown_rank: true
+        }
+      }
+    }
+  }
+  attr {
+    key: "axis"
+    value {
+      i: -1
+    }
+  }
+}
+node {
+  name: "Reshape_2"
+  op: "Reshape"
+  input: "mnist_dataset_train_2/one_hot"
+  input: "concat_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "SoftmaxCrossEntropyWithLogits"
+  op: "SoftmaxCrossEntropyWithLogits"
+  input: "Reshape_1"
+  input: "Reshape_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  op: "PreventGradient"
+  input: "SoftmaxCrossEntropyWithLogits:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "message"
+    value {
+      s: "Currently there is no way to take the second derivative of softmax_cross_entropy_with_logits due to the fused  implementation\'s interaction with tf.gradients()"
+    }
+  }
+}
+node {
+  name: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  op: "Mul"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/ExpandDims"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/PreventGradient"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Reshape_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/SoftmaxCrossEntropyWithLogits_grad/mul"
+  input: "gradients/Reshape_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum_1"
+  input: "gradients/add_1_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Sum"
+  op: "Sum"
+  input: "gradients/Reshape_1_grad/Reshape"
+  input: "gradients/add_1_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_1_grad/Sum"
+  input: "gradients/add_1_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape_1"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_1_grad/Reshape"
+  input: "^gradients/add_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_1_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul_1"
+  op: "MatMul"
+  input: "Relu"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_1_grad/tuple/control_dependency"
+  input: "logits_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul_1"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_logits_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "logits_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@logits_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+          dim {
+            size: 10
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_1_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/MatMul_1_grad/MatMul"
+  input: "^gradients/MatMul_1_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_1_grad/MatMul"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/Relu_grad/ReluGrad"
+  op: "ReluGrad"
+  input: "gradients/MatMul_1_grad/tuple/control_dependency"
+  input: "Relu"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum_1"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs:1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape_1"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum_1"
+  input: "gradients/add_grad/Shape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Sum"
+  op: "Sum"
+  input: "gradients/Relu_grad/ReluGrad"
+  input: "gradients/add_grad/BroadcastGradientArgs"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/Reshape"
+  op: "Reshape"
+  input: "gradients/add_grad/Sum"
+  input: "gradients/add_grad/Shape"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/Reshape_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape_1"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_biases"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/add_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_biases"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/add_grad/tuple/control_dependency"
+  op: "Identity"
+  input: "gradients/add_grad/Reshape"
+  input: "^gradients/add_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/add_grad/Reshape"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul_1"
+  op: "MatMul"
+  input: "Reshape"
+  input: "gradients/add_grad/tuple/control_dependency"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/MatMul"
+  op: "MatMul"
+  input: "gradients/add_grad/tuple/control_dependency"
+  input: "hidden_weights/read"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+          dim {
+            size: 784
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: true
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/group_deps"
+  op: "NoOp"
+  input: "^gradients/MatMul_grad/MatMul"
+  input: "^gradients/MatMul_grad/MatMul_1"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "gradients/MatMul_grad/tuple/control_dependency_1"
+  op: "Identity"
+  input: "gradients/MatMul_grad/MatMul_1"
+  input: "^gradients/MatMul_grad/tuple/group_deps"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@gradients/MatMul_grad/MatMul_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: -1
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  op: "ApplyGradientDescent"
+  input: "hidden_weights"
+  input: "GradientDescent/learning_rate"
+  input: "gradients/MatMul_grad/tuple/control_dependency_1"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@hidden_weights"
+      }
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 784
+          }
+          dim {
+            size: 100
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "use_locking"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "GradientDescent"
+  op: "NoOp"
+  input: "^GradientDescent/update_hidden_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_hidden_biases/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_weights/ApplyGradientDescent"
+  input: "^GradientDescent/update_logits_biases/ApplyGradientDescent"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_2"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+      }
+    }
+  }
+}
+node {
+  name: "Reshape_3"
+  op: "Reshape"
+  input: "SoftmaxCrossEntropyWithLogits"
+  input: "Slice_2"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tshape"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 200
+          }
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Mean"
+  op: "Mean"
+  input: "Reshape_3"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:XLA_CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_XlaCluster"
+    value {
+      s: "cluster_1"
+    }
+  }
+  attr {
+    key: "_output_shapes"
+    value {
+      list {
+        shape {
+        }
+      }
+    }
+  }
+  attr {
+    key: "keep_dims"
+    value {
+      b: false
+    }
+  }
+}
+node {
+  name: "_send_Mean_0"
+  op: "_Send"
+  input: "Mean"
+  device: "/job:localhost/replica:0/task:0/cpu:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/cpu:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: -5924635994370253548
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "Mean:0"
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 21
+}
diff --git a/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/index.html b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..2ffb2a1a59cba900252eec4169a93c4babbef094
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/demo/index.html
@@ -0,0 +1,75 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../tf-graph-loader.html">
+<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+<title>TF Graph Loader Demo</title>
+<demo-snippet>
+  <template>
+    <dom-module id="tf-graph-loader-demo">
+      <template>
+        <tf-graph-loader
+            id="loader"
+            datasets="[[_datasets]]"
+            selected-dataset="[[_selectedDataset]]"
+            progress="{{_progress}}">
+        </tf-graph-loader>
+      </template>
+      <script>
+        "use strict";
+
+        Polymer({
+          is: "tf-graph-loader-demo",
+          properties: {
+            // We tell the graph loader to load a specific pbtxt file.
+            _datasets: {
+              type: Array,
+              value: [{
+                "name": "Graph with XLA Clusters Specified",
+                "path": "data/graph.pbtxt"
+              }],
+            },
+            _selectedDataset: {
+              type: Number,
+              value: 0,
+            },
+
+            // This property will be updated by the graph loader.
+            _progress: {
+              type: Object,
+            },
+          },
+          observers: [
+            '_progressUpdated(_progress)',
+          ],
+          _progressUpdated(progress) {
+            // console.log the progress.
+            console.log('Progress updated.', progress);
+
+            // The graph has loaded. console.log it.
+            if (progress.value == 100) {
+              console.log('graph', this.$.loader.outGraph);
+            }
+          },
+        });
+      </script>
+    </dom-module>
+    <!-- The graph loader lacks visual elements. -->
+    <tf-graph-loader-demo></tf-graph-loader-demo>
+  </template>
+</demo-snippet>
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/test/index.html b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/index.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_loader/test/index.html
rename to tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/index.html
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/test/loader.ts b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/loader.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_graph_loader/test/loader.ts
rename to tensorflow/tensorboard/components/tf_graph_loader_d3v4/test/loader.ts
diff --git a/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/tf-graph-loader.html
similarity index 98%
rename from tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
rename to tensorflow/tensorboard/components/tf_graph_loader_d3v4/tf-graph-loader.html
index bdfb3aa2bfc4672eb2e5a8614ebf6fe14388e69a..8d59cbd2aacf4295fbfe3bfa12013b47c2c39285 100644
--- a/tensorflow/tensorboard/components/tf_graph_loader/tf-graph-loader.html
+++ b/tensorflow/tensorboard/components/tf_graph_loader_d3v4/tf-graph-loader.html
@@ -16,6 +16,7 @@ limitations under the License.
 -->
 
 <link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-graph-common/tf-graph-common.html">
 
 <!--
 An element which provides a filter parsing for pbtxt to graph output.
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD
deleted file mode 100644
index 2c7d0ea6e05b21d765da46cd9d4b49d658e859b3..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/BUILD
+++ /dev/null
@@ -1,60 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "tf_histogram_dashboard",
-    srcs = [
-        "tf-histogram-dashboard.html",
-    ],
-    path = "/tf-histogram-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/vz_histogram_timeseries",
-        "@org_polymer",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "tf-histogram-dashboard.html",
-    ],
-    destdir = "tf-histogram-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//tensorflow/tensorboard/components/tf_backend:legacy",
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//tensorflow/tensorboard/components/vz_histogram_timeseries:legacy",
-        "//third_party/javascript/polymer/v1/iron-collapse:lib",
-        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
-        "//third_party/javascript/polymer/v1/paper-styles:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [
-    ],
-    deps = ["//tensorflow/tensorboard/components:common_deps"],
-)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/BUILD
deleted file mode 100644
index 8350084874b1b769340c56727ee313b060152a6c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/BUILD
+++ /dev/null
@@ -1,26 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_histogram_dashboard/demo
-webfiles(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-histogram-dashboard/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_histogram_dashboard",
-        "//tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-        "@org_polymer_webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/BUILD
deleted file mode 100644
index d396efab73823be6e6a426049fbc0808558fa71a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "data",
-    srcs = glob(["*"]),
-    path = "/tf-histogram-dashboard/demo/data",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8ef115e6d5ad92afc8fab222cba136082134fc52
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/BUILD
@@ -0,0 +1,41 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_histogram_dashboard_d3v4",
+    srcs = ["tf-histogram-dashboard.html"],
+    path = "/tf-histogram-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-histogram-dashboard",
+    deps = [
+        ":tf_histogram_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/histograms_run_run1_tag_histo1.json b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run1_tag_histo1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/histograms_run_run1_tag_histo1.json
rename to tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run1_tag_histo1.json
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/histograms_run_run2_tag_histo1.json b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo1.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/histograms_run_run2_tag_histo1.json
rename to tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo1.json
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/histograms_run_run2_tag_histo2.json b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo2.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/histograms_run_run2_tag_histo2.json
rename to tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/histograms_run_run2_tag_histo2.json
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/logdir
similarity index 100%
rename from tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/logdir
rename to tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/logdir
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/runs.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_histogram_dashboard/demo/data/runs.json
rename to tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/data/runs.json
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/index.html
similarity index 85%
rename from tensorflow/tensorboard/components/tf_histogram_dashboard/demo/index.html
rename to tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/index.html
index de0fbf4be74355e4053340283285086e8726920f..c8d02f990d37eb625f50ef8c28753b5f491d508f 100644
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/index.html
@@ -16,10 +16,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-<link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-<link rel="import" href="../tf-histogram-dashboard.html">
-<link rel="import" href="../../paper-styles/typography.html">
+<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+<link rel="import" href="../paper-styles/typography.html">
+<link rel="import" href="tf-histogram-dashboard.html">
 
 <title>Distribution Dashboard Demo</title>
 <style>
diff --git a/tensorflow/tensorboard/components/tf_histogram_dashboard/tf-histogram-dashboard.html b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/tf-histogram-dashboard.html
similarity index 96%
rename from tensorflow/tensorboard/components/tf_histogram_dashboard/tf-histogram-dashboard.html
rename to tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/tf-histogram-dashboard.html
index f8c9cc4537f0cf9fdbb7a57d4b6dc08c7fd5f9e2..d9967961ebb8ec0d8dd1460df83e73cad21f0ddc 100644
--- a/tensorflow/tensorboard/components/tf_histogram_dashboard/tf-histogram-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4/tf-histogram-dashboard.html
@@ -121,11 +121,15 @@ contains vz-histogram-timeseries embedded inside tf-panes-helper's.
   </template>
 
   <script>
-    Polymer({
+    TF.Dashboard.TfHistogramDashboard = Polymer({
       is: "tf-histogram-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
       behaviors: [
+        TF.Dashboard.DashboardBehavior("histograms"),
         TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-        TF.Backend.Behavior,
+        TF.Backend.BackendBehavior,
       ],
       properties: {
         backend: Object,
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_image_dashboard/demo/BUILD
deleted file mode 100644
index 3a42342ca08b6cc126ec5d2ddf5fd3c7ba632b11..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/demo/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_image_dashboard/demo
-webfiles(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-image-dashboard/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_image_dashboard",
-        "//tensorflow/tensorboard/components/tf_image_dashboard/demo/data",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-        "@org_polymer_webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/BUILD
deleted file mode 100644
index a613ac66c70cd7e27b163bf2ce14cda18a611894..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "data",
-    srcs = glob(["*"]),
-    path = "/tf-image-dashboard/demo/data",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..fcb242d5da1c4372b2a05508202dd7c9a8b98e4e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/BUILD
@@ -0,0 +1,44 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_image_dashboard_d3v4",
+    srcs = [
+        "tf-image-dashboard.html",
+        "tf-image-loader.html",
+    ],
+    path = "/tf-image-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-image-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_image_dashboard_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/images_run_run1_tag_im1_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im1_2Fimage_2F0.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/data/images_run_run1_tag_im1_2Fimage_2F0.json
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im1_2Fimage_2F0.json
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/images_run_run1_tag_im2_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im2_2Fimage_2F0.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/data/images_run_run1_tag_im2_2Fimage_2F0.json
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run1_tag_im2_2Fimage_2F0.json
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/images_run_run2_tag_im1_2Fimage_2F0.json b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run2_tag_im1_2Fimage_2F0.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/data/images_run_run2_tag_im1_2Fimage_2F0.json
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/images_run_run2_tag_im1_2Fimage_2F0.json
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png
similarity index 100%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run1.png
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png
similarity index 100%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im1_2Fimage_2F0_index_0_run_run2.png
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png
similarity index 100%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/individualImage_tag_im2_2Fimage_2F0_index_0_run_run1.png
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/logdir
similarity index 100%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/data/logdir
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/logdir
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/runs.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/data/runs.json
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/data/runs.json
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/index.html
similarity index 89%
rename from tensorflow/tensorboard/components/tf_image_dashboard/demo/index.html
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/index.html
index 0165b24f47b648ab79b53a4188cc154c8f1c69ae..f9ea187952f0e55a9cd267a4395d8d55ddd820c8 100644
--- a/tensorflow/tensorboard/components/tf_image_dashboard/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/index.html
@@ -18,9 +18,9 @@ limitations under the License.
 
 <html>
   <head>
-    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../tf-image-dashboard.html">
-    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="tf-image-dashboard.html">
     <title>Image Dashboard Demo</title>
     <style>
       #container{
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-dashboard.html b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-dashboard.html
similarity index 94%
rename from tensorflow/tensorboard/components/tf_image_dashboard/tf-image-dashboard.html
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-dashboard.html
index 19c272a468328dc14908d33c5fd1cb40bdd98eca..0700a8c0e7622a35355315132511a8cd69a39ef1 100644
--- a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-dashboard.html
@@ -106,8 +106,11 @@ tf-image-dashboard displays a dashboard that loads images from a TensorFlow run.
     </style>
   </template>
   <script>
-    Polymer({
+    TF.Dashboard.TfImageDashboard = Polymer({
       is: "tf-image-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
       properties: {
         backend: Object,
         dataType: {
@@ -116,8 +119,9 @@ tf-image-dashboard displays a dashboard that loads images from a TensorFlow run.
         },
       },
       behaviors: [
-          TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-          TF.Backend.Behavior,
+        TF.Dashboard.DashboardBehavior("images"),
+        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
+        TF.Backend.BackendBehavior,
       ],
       attached: function() {
         this.async(function() {
diff --git a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-loader.html
similarity index 99%
rename from tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html
rename to tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-loader.html
index f667520fb5734949e35c15e0a3af541dd378c4ed..d9ba013dcea072af71ce5f792b200c8acd1cf0a2 100644
--- a/tensorflow/tensorboard/components/tf_image_dashboard/tf-image-loader.html
+++ b/tensorflow/tensorboard/components/tf_image_dashboard_d3v4/tf-image-loader.html
@@ -108,6 +108,8 @@ future for loading older images.
     </style>
   </template>
   <script>
+    "use strict";
+
     Polymer({
       is: "tf-image-loader",
       properties: {
diff --git a/tensorflow/tensorboard/components/tf_imports/BUILD b/tensorflow/tensorboard/components/tf_imports/BUILD
index 5655407b300279a8ed928c7a16ff4bbcd977f9d9..81d1234d66c3101676792090be2dda5ac346d35e 100644
--- a/tensorflow/tensorboard/components/tf_imports/BUILD
+++ b/tensorflow/tensorboard/components/tf_imports/BUILD
@@ -1,10 +1,10 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 
 licenses(["notice"])  # Apache 2.0
 
-webfiles(
+web_library(
     name = "d3",
     srcs = [
         "d3.html",
@@ -13,30 +13,28 @@ webfiles(
     path = "/tf-imports",
 )
 
-webfiles(
+web_library(
     name = "lodash",
-    srcs = [
-        "lodash.html",
-        "@com_lodash",
-    ],
+    srcs = ["lodash.html"],
     path = "/tf-imports",
+    deps = ["@com_lodash"],
 )
 
-webfiles(
+web_library(
     name = "graphlib",
     srcs = [
         "graphlib.html",
-        "@io_github_cpettitt_graphlib",
+        "@io_github_cpettitt_graphlib//:graphlib.core.js",
     ],
     path = "/tf-imports",
     deps = [":lodash"],
 )
 
-webfiles(
+web_library(
     name = "dagre",
     srcs = [
         "dagre.html",
-        "@io_github_cpettitt_dagre",
+        "@io_github_cpettitt_dagre//:dagre.core.js",
     ],
     path = "/tf-imports",
     deps = [
@@ -45,7 +43,7 @@ webfiles(
     ],
 )
 
-webfiles(
+web_library(
     name = "plottable",
     srcs = [
         "plottable.html",
@@ -55,3 +53,9 @@ webfiles(
     path = "/tf-imports",
     deps = [":d3"],
 )
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_imports/dagre.html b/tensorflow/tensorboard/components/tf_imports/dagre.html
index 48fe39da7936a77b6cb5801481c7a44109e44ba8..11164dc5042f068d50a5c4546c7c5fd659862cf7 100644
--- a/tensorflow/tensorboard/components/tf_imports/dagre.html
+++ b/tensorflow/tensorboard/components/tf_imports/dagre.html
@@ -15,10 +15,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<!--
-HTML imports are non-blocking thus getting the dependency 'graphlib'
-and 'lodash' via script imports instead.
--->
-<script src="lodash.js"></script>
-<script src="graphlib.core.js"></script>
+<link rel="import" href="lodash.html">
+<link rel="import" href="graphlib.html">
+
 <script src="dagre.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/graphlib.html b/tensorflow/tensorboard/components/tf_imports/graphlib.html
index 4e19f7b008fe876d89c8a88d1067c9b1fd5646e3..783e33be0a6ee7cb2d9f54de38bf434f938eed85 100644
--- a/tensorflow/tensorboard/components/tf_imports/graphlib.html
+++ b/tensorflow/tensorboard/components/tf_imports/graphlib.html
@@ -15,5 +15,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="lodash.js"></script>
+<link rel="import" href="lodash.html">
+
 <script src="graphlib.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports/lodash.html b/tensorflow/tensorboard/components/tf_imports/lodash.html
index f92aa8087999567e2f6c038b76e83dedafe05512..cbe35f10505686cb8527a92edc6aa95c164a9ec2 100644
--- a/tensorflow/tensorboard/components/tf_imports/lodash.html
+++ b/tensorflow/tensorboard/components/tf_imports/lodash.html
@@ -15,4 +15,4 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<script src="lodash.js"></script>
+<script src="../lodash/lodash.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/BUILD b/tensorflow/tensorboard/components/tf_imports_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..086d3219d7799377d4f32fd7d8efef31e67b33ba
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/BUILD
@@ -0,0 +1,440 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "lodash",
+    srcs = ["lodash.html"],
+    path = "/tf-imports",
+    visibility = ["//visibility:public"],
+    deps = ["@com_lodash"],
+)
+
+web_library(
+    name = "threejs",
+    srcs = [
+        "threejs.html",
+        "@org_threejs//:OrbitControls.js",
+        "@org_threejs//:three.js",
+    ],
+    path = "/tf-imports",
+    visibility = ["//visibility:public"],
+)
+
+web_library(
+    name = "numericjs",
+    srcs = [
+        "numericjs.html",
+        "@com_numericjs//:numeric.js",
+    ],
+    path = "/tf-imports",
+    visibility = ["//visibility:public"],
+)
+
+web_library(
+    name = "weblas",
+    srcs = [
+        "weblas.html",
+        "@io_github_waylonflinn_weblas//:weblas.js",
+    ],
+    path = "/tf-imports",
+    visibility = ["//visibility:public"],
+)
+
+web_library(
+    name = "graphlib",
+    srcs = [
+        "graphlib.html",
+        "@io_github_cpettitt_graphlib//:graphlib.core.js",
+    ],
+    path = "/tf-imports",
+    visibility = ["//visibility:public"],
+    deps = [":lodash"],
+)
+
+web_library(
+    name = "dagre",
+    srcs = [
+        "dagre.html",
+        "@io_github_cpettitt_dagre//:dagre.core.js",
+    ],
+    path = "/tf-imports",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graphlib",
+        ":lodash",
+    ],
+)
+
+web_library(
+    name = "d3",
+    srcs = [
+        "d3.html",
+        "@org_d3js_v4//:d3.js",
+    ],
+    path = "/tf-imports",
+    visibility = ["//visibility:public"],
+)
+
+web_library(
+    name = "plottable",
+    srcs = ["plottable.html"],
+    path = "/tf-imports",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":d3",
+        ":plottable_js_css",
+    ],
+)
+
+web_library(
+    name = "plottable_js_css",
+    srcs = [
+        "@com_palantir_plottable_v3//:package/plottable.css",
+        "@com_palantir_plottable_v3//:package/plottable.js",
+    ],
+    path = "/tf-imports",
+    strip_prefix = "package",
+    visibility = ["//visibility:private"],
+)
+
+# Generate single TypeScript typings file for d3.js with no ES6 imports.
+#
+# The DefinitelyTyped definition of d3 v4 was written under the assumption that
+# we want to use d3 in a modularized way. We don't want to do that because its
+# import statements use NodeJS namespaces, and the Web Compiler only supports
+# W3C, ECMA, and IETF standards.
+tensorboard_typescript_bundle(
+    name = "d3_typings",
+    out = "d3.d.ts",
+    namespace_srcs = {"d3": [
+        "d3-transition.d.ts",
+        "@org_definitelytyped_types_d3_path//:index.d.ts",
+        "@org_definitelytyped_types_d3_time//:index.d.ts",
+        "@org_definitelytyped_types_d3_dsv//:index.d.ts",
+        "@org_definitelytyped_types_d3_color//:index.d.ts",
+        "@org_definitelytyped_types_d3_selection//:index.d.ts",
+        "@org_definitelytyped_types_d3_shape//:index.d.ts",
+        "@org_definitelytyped_types_d3_scale//:index.d.ts",
+        "@org_definitelytyped_types_d3_request//:index.d.ts",
+        "@org_definitelytyped_types_d3_interpolate//:index.d.ts",
+        "@org_definitelytyped_types_d3_drag//:index.d.ts",
+        "@org_definitelytyped_types_d3_brush//:index.d.ts",
+        "@org_definitelytyped_types_d3_axis//:index.d.ts",
+        "@org_definitelytyped_types_d3_zoom//:index.d.ts",
+        "@org_definitelytyped_types_d3_array//:index.d.ts",
+        "@org_definitelytyped_types_d3_chord//:index.d.ts",
+        "@org_definitelytyped_types_d3_collection//:index.d.ts",
+        "@org_definitelytyped_types_d3_dispatch//:index.d.ts",
+        "@org_definitelytyped_types_d3_ease//:index.d.ts",
+        "@org_definitelytyped_types_d3_force//:index.d.ts",
+        "@org_definitelytyped_types_d3_format//:index.d.ts",
+        "@org_definitelytyped_types_d3_hierarchy//:index.d.ts",
+        "@org_definitelytyped_types_d3_polygon//:index.d.ts",
+        "@org_definitelytyped_types_d3_quadtree//:index.d.ts",
+        "@org_definitelytyped_types_d3_queue//:index.d.ts",
+        "@org_definitelytyped_types_d3_random//:index.d.ts",
+        "@org_definitelytyped_types_d3_timer//:index.d.ts",
+        "@org_definitelytyped_types_d3_voronoi//:index.d.ts",
+    ]},
+    visibility = ["//visibility:public"],
+)
+
+# It would be nice if Plottable released a .d.ts file for plottable.js like
+# they did for previous versions.
+tensorboard_typescript_bundle(
+    name = "plottable_typings",
+    out = "plottable.d.ts",
+    namespace_srcs = {
+        "Plottable": [
+            "@com_palantir_plottable_v3//:package/build/src/core/dataset.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/core/interfaces.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/core/version.d.ts",
+        ],
+        "Plottable.Animators": [
+            "@com_palantir_plottable_v3//:package/build/src/animators/animator.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/animators/easingAnimator.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/animators/nullAnimator.d.ts",
+        ],
+        "Plottable.Axes": [
+            "@com_palantir_plottable_v3//:package/build/src/axes/axis.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/axes/categoryAxis.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/axes/numericAxis.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/axes/timeAxis.d.ts",
+        ],
+        "Plottable.Components": [
+            "@com_palantir_plottable_v3//:package/build/src/components/component.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/componentContainer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/dragBoxLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/dragLineLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/gridlines.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/group.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/guideLineLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/interpolatedColorLegend.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/label.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/legend.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/plotGroup.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/selectionBoxLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/table.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/xDragBoxLayer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/components/yDragBoxLayer.d.ts",
+        ],
+        "Plottable.Configs": [
+            "@com_palantir_plottable_v3//:package/build/src/core/config.d.ts",
+        ],
+        "Plottable.Formatters": [
+            "@com_palantir_plottable_v3//:package/build/src/core/formatters.d.ts",
+        ],
+        "Plottable.RenderController": [
+            "@com_palantir_plottable_v3//:package/build/src/core/renderController.d.ts",
+        ],
+        "Plottable.RenderPolicies": [
+            "@com_palantir_plottable_v3//:package/build/src/core/renderPolicy.d.ts",
+        ],
+        "Plottable.SymbolFactories": [
+            "@com_palantir_plottable_v3//:package/build/src/core/symbolFactories.d.ts",
+        ],
+        "Plottable.Dispatchers": [
+            "@com_palantir_plottable_v3//:package/build/src/dispatchers/dispatcher.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/dispatchers/keyDispatcher.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/dispatchers/mouseDispatcher.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/dispatchers/touchDispatcher.d.ts",
+        ],
+        "Plottable.Drawers": [
+            "@com_palantir_plottable_v3//:package/build/src/drawers/arcDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/arcOutlineDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/areaDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/canvasBuffer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/canvasDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/drawStep.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/drawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/lineDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/rectangleDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/segmentDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/svgDrawer.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/drawers/symbolDrawer.d.ts",
+        ],
+        "Plottable.Interactions": [
+            "@com_palantir_plottable_v3//:package/build/src/interactions/clickInteraction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/dragInteraction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/interaction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/keyInteraction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/panZoomInteraction.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/interactions/pointerInteraction.d.ts",
+        ],
+        "Plottable.Plots": [
+            "@com_palantir_plottable_v3//:package/build/src/plots/areaPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/barPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/clusteredBarPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/commons.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/linePlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/piePlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/plot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/rectanglePlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/scatterPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/segmentPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/stackedAreaPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/stackedBarPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/waterfallPlot.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/plots/xyPlot.d.ts",
+        ],
+        "Plottable.Scales": [
+            "@com_palantir_plottable_v3//:package/build/src/scales/index.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/categoryScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/colorScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/interpolatedColorScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/linearScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/modifiedLogScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/quantitativeScale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/scale.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/scales/timeScale.d.ts",
+        ],
+        "Plottable.Scales.TickGenerators": [
+            "@com_palantir_plottable_v3//:package/build/src/scales/tickGenerators.d.ts",
+        ],
+        "Plottable.Utils": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/addD3SelectionMulti.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/bucket.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/callbackSet.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/coerceD3.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/entityStore.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/makeEnum.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/map.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/set.d.ts",
+            "@com_palantir_plottable_v3//:package/build/src/utils/transformAwareTranslator.d.ts",
+        ],
+        "Plottable.Utils.Array": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/arrayUtils.d.ts",
+        ],
+        "Plottable.Utils.Color": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/colorUtils.d.ts",
+        ],
+        "Plottable.Utils.DOM": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/domUtils.d.ts",
+        ],
+        "Plottable.Utils.Math": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/mathUtils.d.ts",
+        ],
+        "Plottable.Utils.Stacking": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/stackingUtils.d.ts",
+        ],
+        "Plottable.Utils.Window": [
+            "@com_palantir_plottable_v3//:package/build/src/utils/windowUtils.d.ts",
+        ],
+    },
+    namespace_symbol_aliases = {
+        "Plottable.Animators": {
+            "AttributeToAppliedProjector": "Plottable.AttributeToAppliedProjector",
+            "SimpleSelection": "Plottable.SimpleSelection",
+        },
+        "Plottable.Axes": {
+            "Component": "Plottable.Components.Component",
+            "Formatter": "Plottable.Formatters.Formatter",
+            "Point": "Plottable.Point",
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+            "Scale": "Plottable.Scales.Scale",
+            "Scales": "Plottable.Scales",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "SpaceRequest": "Plottable.SpaceRequest",
+        },
+        "Plottable.Components": {
+            "Bounds": "Plottable.Bounds",
+            "Formatter": "Plottable.Formatters.Formatter",
+            "IEntity": "Plottable.IEntity",
+            "Interactions": "Plottable.Interactions",
+            "Plots": "Plottable.Plots",
+            "Point": "Plottable.Point",
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+            "Scales": "Plottable.Scales",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "SpaceRequest": "Plottable.SpaceRequest",
+            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
+        },
+        "Plottable.RenderController": {
+            "Component": "Plottable.Components.Component",
+            "RenderPolicies": "Plottable.RenderPolicies",
+        },
+        "Plottable.SymbolFactories": {
+            "d3Shape": "d3",
+        },
+        "Plottable.Dispatchers": {
+            "Component": "Plottable.Components.Component",
+            "Dispatchers": "Plottable.Dispatchers",
+            "Point": "Plottable.Point",
+        },
+        "Plottable.Drawers": {
+            "AttributeToAppliedProjector": "Plottable.AttributeToAppliedProjector",
+            "AttributeToProjector": "Plottable.AttributeToProjector",
+            "Dataset": "Plottable.Dataset",
+            "IAccessor": "Plottable.IAccessor",
+            "IAnimator": "Plottable.Animators.IAnimator",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
+        },
+        "Plottable.Interactions": {
+            "Component": "Plottable.Components.Component",
+            "Point": "Plottable.Point",
+            "TransformableScale": "Plottable.Scales.TransformableScale",
+        },
+        "Plottable.Plots": {
+            "AppliedDrawStep": "Plottable.Drawers.AppliedDrawStep",
+            "AttributeToProjector": "Plottable.AttributeToProjector",
+            "Bounds": "Plottable.Bounds",
+            "Component": "Plottable.Components.Component",
+            "Dataset": "Plottable.Dataset",
+            "DrawStep": "Plottable.Drawers.DrawStep",
+            "Drawers": "Plottable.Drawers",
+            "Formatter": "Plottable.Formatters.Formatter",
+            "IAccessor": "Plottable.IAccessor",
+            "IAnimator": "Plottable.Animators.IAnimator",
+            "IDrawer": "Plottable.Drawers.IDrawer",
+            "IEntity": "Plottable.IEntity",
+            "IScaleCallback": "Plottable.Scales.IScaleCallback",
+            "Plots": "Plottable.Plots",
+            "Point": "Plottable.Point",
+            "Projector": "Plottable.Projector",
+            "ProxyDrawer": "Plottable.Drawers.ProxyDrawer",
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+            "Range": "Plottable.Range",
+            "Scale": "Plottable.Scales.Scale",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
+            "TransformableScale": "Plottable.Scales.TransformableScale",
+            "Utils": "Plottable.Utils",
+            "d3Shape": "d3",
+        },
+        "Plottable.Scales": {
+            "Dataset": "Plottable.Dataset",
+            "Scales": "Plottable.Scales",
+        },
+        "Plottable.Scales.TickGenerators": {
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+        },
+        "Plottable.Utils": {
+            "Bounds": "Plottable.Bounds",
+            "Component": "Plottable.Components.Component",
+            "Dataset": "Plottable.Dataset",
+            "IAccessor": "Plottable.IAccessor",
+            "Point": "Plottable.Point",
+            "Range": "Plottable.Range",
+            "SimpleSelection": "Plottable.SimpleSelection",
+            "Utils": "Plottable.Utils",
+        },
+    },
+    namespace_symbol_aliases_public = {
+        "Plottable": {
+            "Axis": "Plottable.Axes.Axis",
+            "AxisOrientation": "Plottable.Axes.AxisOrientation",
+            "ClickCallback": "Plottable.Interactions.ClickCallback",
+            "Component": "Plottable.Components.Component",
+            "ComponentCallback": "Plottable.Components.ComponentCallback",
+            "ComponentContainer": "Plottable.Components.ComponentContainer",
+            "Dispatcher": "Plottable.Dispatchers.Dispatcher",
+            "DragBoxCallback": "Plottable.Components.DragBoxCallback",
+            "DragCallback": "Plottable.Interactions.DragCallback",
+            "EaseFn": "Plottable.Animators.EaseFn",
+            "EaseName": "Plottable.Animators.EaseName",
+            "Easing": "Plottable.Animators.Easing",
+            "Formatter": "Plottable.Formatters.Formatter",
+            "IAnimator": "Plottable.Animators.IAnimator",
+            "IDragLineCallback": "Plottable.Components.IDragLineCallback",
+            "IDrawer": "Plottable.Drawers.IDrawer",
+            "IResizeHandler": "Plottable.Components.IResizeHandler",
+            "IScaleCallback": "Plottable.Scales.IScaleCallback",
+            "Interaction": "Plottable.Interactions.Interaction",
+            "Key": "Plottable.Interactions.Key",
+            "KeyCallback": "Plottable.Interactions.KeyCallback",
+            "Null": "Plottable.Animators.Null",
+            "Plot": "Plottable.Plots.Plot",
+            "PointerCallback": "Plottable.Interactions.PointerCallback",
+            "ProxyDrawer": "Plottable.Drawers.ProxyDrawer",
+            "QuantitativeScale": "Plottable.Scales.QuantitativeScale",
+            "Renderer": "Plottable.Plots.Renderer",
+            "Scale": "Plottable.Scales.Scale",
+            "SymbolFactory": "Plottable.SymbolFactories.SymbolFactory",
+            "TimeInterval": "Plottable.Axes.TimeInterval",
+            "TransformableScale": "Plottable.Scales.TransformableScale",
+            "XAlignment": "Plottable.Components.XAlignment",
+            "XYPlot": "Plottable.Plots.XYPlot",
+            "YAlignment": "Plottable.Components.YAlignment",
+        },
+    },
+    visibility = ["//visibility:public"],
+)
+
+# Removes the 'declare module' block inside this file, but keeps its content.
+genrule(
+    name = "kludge_d3_transition",
+    srcs = ["@org_definitelytyped_types_d3_transition//:index.d.ts"],
+    outs = ["d3-transition.d.ts"],
+    cmd = "sed '/^declare module/d' $< | awk '/^}$$/ && !p {p++;next}1' >$@",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/README.md b/tensorflow/tensorboard/components/tf_imports_d3v4/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b1cabc61b9be000350c165690652ab906f5c1b53
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/README.md
@@ -0,0 +1,2 @@
+This file acts as import routers for third party javascript libraries,
+e.g. Plottable and D3.
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/d3.html b/tensorflow/tensorboard/components/tf_imports_d3v4/d3.html
new file mode 100644
index 0000000000000000000000000000000000000000..2772db39a85d0aacddb17a6642fe48de9bd60e18
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/d3.html
@@ -0,0 +1,50 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+d3
+Copyright 2010-2017 Mike Bostock
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the author nor the names of contributors may be used to
+  endorse or promote products derived from this software without specific prior
+  written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+
+<script src="d3.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/dagre.html b/tensorflow/tensorboard/components/tf_imports_d3v4/dagre.html
new file mode 100644
index 0000000000000000000000000000000000000000..1e2f6ef9af63b513f3877ea6679a4a0b600924ca
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/dagre.html
@@ -0,0 +1,45 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+Dagre
+Copyright (c) 2012-2014 Chris Pettitt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
+<link rel="import" href="lodash.html">
+<link rel="import" href="graphlib.html">
+
+<script src="dagre.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_google/graphlib.html b/tensorflow/tensorboard/components/tf_imports_d3v4/graphlib.html
similarity index 88%
rename from tensorflow/tensorboard/components/tf_imports_google/graphlib.html
rename to tensorflow/tensorboard/components/tf_imports_d3v4/graphlib.html
index 56b37ebe4bb2e7fac011f921f37140aa884d2748..783e33be0a6ee7cb2d9f54de38bf434f938eed85 100644
--- a/tensorflow/tensorboard/components/tf_imports_google/graphlib.html
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/graphlib.html
@@ -15,4 +15,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<link rel="import" href="../graphlib-library/graphlib.html">
+<link rel="import" href="lodash.html">
+
+<script src="graphlib.core.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_google/lodash.html b/tensorflow/tensorboard/components/tf_imports_d3v4/lodash.html
similarity index 90%
rename from tensorflow/tensorboard/components/tf_imports_google/lodash.html
rename to tensorflow/tensorboard/components/tf_imports_d3v4/lodash.html
index eb8fef28831191902ba746c167a206bd901d19ac..cbe35f10505686cb8527a92edc6aa95c164a9ec2 100644
--- a/tensorflow/tensorboard/components/tf_imports_google/lodash.html
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/lodash.html
@@ -15,4 +15,4 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<link rel="import" href="../lodash-library/lodash-library.html">
+<script src="../lodash/lodash.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/numericjs.html b/tensorflow/tensorboard/components/tf_imports_d3v4/numericjs.html
new file mode 100644
index 0000000000000000000000000000000000000000..7559054aabaa008d8a97a41ede707a56703d4dbb
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/numericjs.html
@@ -0,0 +1,43 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+Numeric Javascript
+Copyright (C) 2011 by Sébastien Loisel
+Copyright (c) 2011 Alberto Santini <albertosantini@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
+<script src="numeric.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/plottable.html b/tensorflow/tensorboard/components/tf_imports_d3v4/plottable.html
new file mode 100644
index 0000000000000000000000000000000000000000..2c3e10a7c443ed1377783e35b41c393ae3dfbeb1
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/plottable.html
@@ -0,0 +1,44 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+Plottable.js
+Copyright (c) 2014-2017 Palantir Technologies, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
+<link rel="import" href="d3.html">
+<script src="plottable.js"></script>
+<link rel="stylesheet" href="plottable.css">
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/threejs.html b/tensorflow/tensorboard/components/tf_imports_d3v4/threejs.html
new file mode 100644
index 0000000000000000000000000000000000000000..d6adad43b034acf640ddeef3420feb2d483d92af
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/threejs.html
@@ -0,0 +1,43 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+three.js
+Copyright (c) 2010-2013 three.js authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
+<script src="three.js"></script>
+<script src="OrbitControls.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_d3v4/weblas.html b/tensorflow/tensorboard/components/tf_imports_d3v4/weblas.html
new file mode 100644
index 0000000000000000000000000000000000000000..054d04ea85e16cc31e8cf248d3db86cd5262ab2d
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_imports_d3v4/weblas.html
@@ -0,0 +1,42 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!--
+@license
+weblas
+Copyright (c) 2015 Waylon Flinn
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+-->
+
+<script src="weblas.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_google/README.md b/tensorflow/tensorboard/components/tf_imports_google/README.md
deleted file mode 100644
index 60d9cce777bfd53ee7088376b19eb900267ed641..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_imports_google/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This file acts as import routers for third party javascript libraries,
-e.g. Plottable and D3 from `g3/third_party`; it exists to facilitate development
-inside google.
diff --git a/tensorflow/tensorboard/components/tf_option_selector_d3v4/BUILD b/tensorflow/tensorboard/components/tf_option_selector_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..db5d07b0955b3f447be4fe2185509f148c0036d5
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_option_selector_d3v4/BUILD
@@ -0,0 +1,21 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_option_selector_d3v4",
+    srcs = ["tf-option-selector.html"],
+    path = "/tf-option-selector",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "@org_polymer",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_option_selector/tf-option-selector.html b/tensorflow/tensorboard/components/tf_option_selector_d3v4/tf-option-selector.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_option_selector/tf-option-selector.html
rename to tensorflow/tensorboard/components/tf_option_selector_d3v4/tf-option-selector.html
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD
deleted file mode 100644
index 57bad96af6d14786102b9728de1a9e9dc8d1499e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/BUILD
+++ /dev/null
@@ -1,75 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "tf_scalar_dashboard",
-    srcs = [
-        "tf-scalar-dashboard.html",
-        "tf-smoothing-input.html",
-    ],
-    path = "/tf-scalar-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/vz_line_chart",
-        "@org_polymer",
-        "@org_polymer_iron_collapse",
-        "@org_polymer_paper_checkbox",
-        "@org_polymer_paper_dropdown_menu",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_input",
-        "@org_polymer_paper_item",
-        "@org_polymer_paper_menu",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_styles",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "tf-scalar-dashboard.html",
-        "tf-smoothing-input.html",
-    ],
-    destdir = "tf-scalar-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//tensorflow/tensorboard/components/tf_backend:legacy",
-        "//tensorflow/tensorboard/components/tf_color_scale:legacy",
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//tensorflow/tensorboard/components/vz_line_chart:legacy",
-        "//third_party/javascript/polymer/v1/iron-collapse:lib",
-        "//third_party/javascript/polymer/v1/paper-checkbox:lib",
-        "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
-        "//third_party/javascript/polymer/v1/paper-icon-button:lib",
-        "//third_party/javascript/polymer/v1/paper-input:lib",
-        "//third_party/javascript/polymer/v1/paper-item:lib",
-        "//third_party/javascript/polymer/v1/paper-menu:lib",
-        "//third_party/javascript/polymer/v1/paper-slider:lib",
-        "//third_party/javascript/polymer/v1/paper-styles:lib",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [
-    ],
-    deps = ["//tensorflow/tensorboard/components:common_deps"],
-)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/BUILD
deleted file mode 100644
index 7f39d27f60724ca089f403b6d9a17a3d90effd1c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "data",
-    srcs = glob(["*"]),
-    path = "/tf-scalar-dashboard/demo/data",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3b9f38feabd974dd4ece871857e4e6cbbf18ce06
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/BUILD
@@ -0,0 +1,37 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_scalar_dashboard_d3v4",
+    srcs = [
+        "tf-scalar-dashboard.html",
+        "tf-smoothing-input.html",
+    ],
+    path = "/tf-scalar-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/vz_line_chart_d3v4",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_menu",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_styles",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/BUILD
similarity index 61%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/BUILD
index 218fda3fdb68ae2468710133917eca67c3192315..e3977205cbd1c44a1dbdbfa6d33396bec96f7953 100644
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/BUILD
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/BUILD
@@ -1,18 +1,19 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 
 licenses(["notice"])  # Apache 2.0
 
-# bazel run //third_party/tensorflow/tensorboard/components/tf_scalar_dashboard/demo
-webfiles(
+web_library(
     name = "demo",
     srcs = ["index.html"],
     path = "/tf-scalar-dashboard/demo",
     deps = [
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_scalar_dashboard",
-        "//tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4",
+        "//tensorflow/tensorboard/demo:demo_data",
+        "@org_polymer",
         "@org_polymer_iron_demo_helpers",
         "@org_polymer_paper_styles",
         "@org_polymer_webcomponentsjs",
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/logdir
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/logdir
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/logdir
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/runs.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/runs.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/runs.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_bar_2Fsquare.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fcos.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fcos.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fcos.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsin.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsin.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsin.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run1_tag_foo_2Fsquare.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_bar_2Fsquare.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fcos.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fcos.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fcos.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/data/scalars_run_run2_tag_foo_2Fsquare.json
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/index.html
similarity index 90%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/index.html
index 586ee2a47d65d9f985c9d0e6acb3cf8216099832..7429c87b873ec1d8fe2827c0f2215aa205a8f5c7 100644
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/demo/index.html
@@ -17,9 +17,11 @@ limitations under the License.
 -->
 
 <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../../polymer/polymer.html">
 <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
 <link rel="import" href="../tf-scalar-dashboard.html">
 <link rel="import" href="../../paper-styles/typography.html">
+<link rel="import" href="../../tf-backend/tf-backend.html">
 
 <title>Scalar Dashboard Demo</title>
 <style>
@@ -49,7 +51,7 @@ limitations under the License.
             backend: {
               type: Object,
               value: function() {
-                var router = new TF.Backend.router("data", true);
+                var router = new TF.Backend.router("/data", true);
                 return new TF.Backend.Backend(router);
               },
             },
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-scalar-dashboard.html
similarity index 97%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-scalar-dashboard.html
index 641573366a6ce469b1ff69aaf270b432a16f738e..b91cd90c0371e90d5f5abc4cf07ce297ba56c386 100644
--- a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-scalar-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-scalar-dashboard.html
@@ -190,11 +190,16 @@ contains vz-line-charts embedded inside tf-panes-helper's.
   </template>
 
   <script>
-    Polymer({
+    TF.Dashboard.TfScalarDashboard = Polymer({
       is: "tf-scalar-dashboard",
+      factoryImpl: function(backend, router) {
+        this.backend = backend;
+        this.router = router;
+      },
       behaviors: [
+        TF.Dashboard.DashboardBehavior("scalars"),
         TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-        TF.Backend.Behavior,
+        TF.Backend.BackendBehavior,
       ],
       properties: {
         backend: Object,
diff --git a/tensorflow/tensorboard/components/tf_scalar_dashboard/tf-smoothing-input.html b/tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-smoothing-input.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_scalar_dashboard/tf-smoothing-input.html
rename to tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4/tf-smoothing-input.html
diff --git a/tensorflow/tensorboard/components/tf_storage/BUILD b/tensorflow/tensorboard/components/tf_storage/BUILD
index b97c98ae4b9b5147100063d8ef524066a60f460b..940d09681d2bc14fe9f089b31487b118d5216385 100644
--- a/tensorflow/tensorboard/components/tf_storage/BUILD
+++ b/tensorflow/tensorboard/components/tf_storage/BUILD
@@ -1,6 +1,6 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
@@ -9,7 +9,7 @@ licenses(["notice"])  # Apache 2.0
 
 # TODO(dandelion): Add webfiles support for the test code.
 
-webfiles(
+web_library(
     name = "tf_storage",
     srcs = [
         "tf-storage.html",
@@ -52,19 +52,27 @@ tensorboard_webcomponent_library(
     visibility = ["//visibility:public"],
     destdir = "tf-storage",
     deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
         "//tensorflow/tensorboard/components/tf_globals:legacy",
+        "//tensorflow/tensorboard/components/tf_imports_google:lib",
         "//third_party/javascript/polymer/v1/polymer:lib",
     ],
 )
 
 tensorboard_ts_library(
     name = "legacy_ts",
-    srcs = [
-        "storage.ts",
-    ],
+    srcs = ["storage.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
     deps = [
-        "//tensorflow/tensorboard/components:common_deps",
         "//tensorflow/tensorboard/components/tf_globals:legacy_ts",
+        "//third_party/javascript/node_modules/typescript:es2015.promise",
+        "//third_party/javascript/plottable/v1:typings",
+        "//third_party/javascript/typings/chai",
+        "//third_party/javascript/typings/d3",
+        "//third_party/javascript/typings/lodash",
+        "//third_party/javascript/typings/mocha",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+        "//third_party/javascript/typings/sinon",
+        "//third_party/javascript/typings/webcomponents_js",
     ],
 )
diff --git a/tensorflow/tensorboard/components/tf_storage/storage.ts b/tensorflow/tensorboard/components/tf_storage/storage.ts
index 5516ae7ded3656d4fa82847c21fcb491b7dabc66..e8d5fa672fbd5a8ab04fb51ad801346affcd4f71 100644
--- a/tensorflow/tensorboard/components/tf_storage/storage.ts
+++ b/tensorflow/tensorboard/components/tf_storage/storage.ts
@@ -22,7 +22,7 @@ limitations under the License.
  * which TensorBoard uses after like localhost:8000/#events&runPrefix=train*
  * to store state in the URI.
  *
- * It also allows saving the values to localStorage for long-term persistance.
+ * It also allows saving the values to localStorage for long-term persistence.
  */
 module TF.URIStorage {
   type StringDict = {[key: string]: string};
@@ -36,7 +36,7 @@ module TF.URIStorage {
   /**
    * The name of the property for users to set on a Polymer component
    * in order for its stored properties to be stored in the URI unambiguously.
-   * (No need to set this if you want mutliple instances of the component to
+   * (No need to set this if you want multiple instances of the component to
    * share URI state)
    *
    * Example:
@@ -257,7 +257,7 @@ module TF.URIStorage {
    * Convert dictionary of strings into a URI Component.
    * All key value entries get added as key value pairs in the component,
    * with the exception of a key with the TAB value, which if present
-   * gets prepended to the URI Component string for backwards comptability
+   * gets prepended to the URI Component string for backwards compatibility
    * reasons.
    */
   function _dictToComponent(items: StringDict): string {
diff --git a/tensorflow/tensorboard/components/tf_storage/test/storageTests.ts b/tensorflow/tensorboard/components/tf_storage/test/storageTests.ts
index d036005caad3b14ce86fe2a3420640c19b4898e2..b1c4a5cf473c265ec1ff8fe102e568058813cef0 100644
--- a/tensorflow/tensorboard/components/tf_storage/test/storageTests.ts
+++ b/tensorflow/tensorboard/components/tf_storage/test/storageTests.ts
@@ -16,47 +16,47 @@ limitations under the License.
 module TF.URIStorage {
   describe('URIStorage', function() {
     it('get/setString', function() {
-      setString('key_a', 'hello');
-      setString('key_b', 'there');
-      chai.assert.equal('hello', getString('key_a'));
-      chai.assert.equal('there', getString('key_b'));
-      chai.assert.equal(null, getString('key_c'));
+      setString('key_a', 'hello', false);
+      setString('key_b', 'there', false);
+      chai.assert.equal('hello', getString('key_a', false));
+      chai.assert.equal('there', getString('key_b', false));
+      chai.assert.equal(null, getString('key_c', false));
     });
 
     it('get/setNumber', function() {
-      setNumber('key_a', 12);
-      setNumber('key_b', 3.4);
-      chai.assert.equal(12, getNumber('key_a'));
-      chai.assert.equal(3.4, getNumber('key_b'));
-      chai.assert.equal(null, getNumber('key_c'));
+      setNumber('key_a', 12, false);
+      setNumber('key_b', 3.4, false);
+      chai.assert.equal(12, getNumber('key_a', false));
+      chai.assert.equal(3.4, getNumber('key_b', false));
+      chai.assert.equal(null, getNumber('key_c', false));
     });
 
     it('get/setObject', function() {
       let obj = {'foo': 2.3, 'bar': 'barstr'};
-      setObject('key_a', obj);
-      chai.assert.deepEqual(obj, getObject('key_a'));
+      setObject('key_a', obj, false);
+      chai.assert.deepEqual(obj, getObject('key_a', false));
     });
 
     it('get/setWeirdValues', function() {
-      setNumber('key_a', NaN);
-      chai.assert.deepEqual(NaN, getNumber('key_a'));
+      setNumber('key_a', NaN, false);
+      chai.assert.deepEqual(NaN, getNumber('key_a', false));
 
-      setNumber('key_a', +Infinity);
-      chai.assert.equal(+Infinity, getNumber('key_a'));
+      setNumber('key_a', +Infinity, false);
+      chai.assert.equal(+Infinity, getNumber('key_a', false));
 
-      setNumber('key_a', -Infinity);
-      chai.assert.equal(-Infinity, getNumber('key_a'));
+      setNumber('key_a', -Infinity, false);
+      chai.assert.equal(-Infinity, getNumber('key_a', false));
 
-      setNumber('key_a', 1 / 3);
-      chai.assert.equal(1 / 3, getNumber('key_a'));
+      setNumber('key_a', 1 / 3, false);
+      chai.assert.equal(1 / 3, getNumber('key_a', false));
 
-      setNumber('key_a', -0);
-      chai.assert.equal(-0, getNumber('key_a'));
+      setNumber('key_a', -0, false);
+      chai.assert.equal(-0, getNumber('key_a', false));
     });
 
     it('set/getTab', function() {
-      setString(TAB, TF.Globals.TABS[0]);
-      chai.assert.equal(TF.Globals.TABS[0], getString(TAB));
+      setString(TAB, TF.Globals.TABS[0], false);
+      chai.assert.equal(TF.Globals.TABS[0], getString(TAB, false));
     });
   });
 }
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/BUILD b/tensorflow/tensorboard/components/tf_storage_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..76ac394213e7f00737e216feda464d964e17723e
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/BUILD
@@ -0,0 +1,82 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_storage_d3v4",
+    srcs = [
+        "bundle.js",
+        "tf-storage.html",
+    ],
+    path = "/tf-storage",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_globals_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:bundle.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.URIStorage": [
+        "storage.ts",
+    ]},
+    namespace_symbol_aliases = {"TF.URIStorage": {
+        "TABS": "TF.Globals.TABS",
+        "USE_HASH": "TF.Globals.USE_HASH",
+        "getFakeHash": "TF.Globals.getFakeHash",
+        "setFakeHash": "TF.Globals.setFakeHash",
+    }},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "tf-storage.html",
+        ":legacy_ts",
+    ],
+    destdir = "tf-storage",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:legacy",
+        "//tensorflow/tensorboard/components/tf_imports_google_d3v4:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = ["storage.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:legacy_ts",
+        "//third_party/javascript/typings/lodash",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/storage.ts b/tensorflow/tensorboard/components/tf_storage_d3v4/storage.ts
new file mode 100644
index 0000000000000000000000000000000000000000..1b39efc03a1a96494caa3c32e687934dc80631c8
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/storage.ts
@@ -0,0 +1,400 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+import {USE_HASH, setFakeHash, getFakeHash, TABS} from '../tf_globals_d3v4/globals';
+import * as _ from 'lodash';
+
+
+/* tslint:disable:no-namespace variable-name */
+/**
+ * The Storage Module provides storage for URL parameters, and an API for
+ * getting and setting TensorBoard's stateful URI.
+ *
+ * It generates URI components like: events&runPrefix=train*
+ * which TensorBoard uses after like localhost:8000/#events&runPrefix=train*
+ * to store state in the URI.
+ *
+ * It also allows saving the values to localStorage for long-term persistence.
+ */
+type StringDict = {[key: string]: string};
+
+/**
+ * A key that users cannot use, since TensorBoard uses this to store info
+ * about the active tab.
+ */
+export let TAB = '__tab__';
+
+/**
+ * The name of the property for users to set on a Polymer component
+ * in order for its stored properties to be stored in the URI unambiguously.
+ * (No need to set this if you want multiple instances of the component to
+ * share URI state)
+ *
+ * Example:
+ * <my-component disambiguator="0"></my-component>
+ *
+ * The disambiguator should be set to any unique value so that multiple
+ * instances of the component can store properties in URI storage.
+ *
+ * Because it's hard to dereference this variable in HTML property bindings,
+ * it is NOT safe to change the disambiguator string without find+replace
+ * across the codebase.
+ */
+export let DISAMBIGUATOR = 'disambiguator';
+
+/**
+ * Return a string stored in URI or localStorage.
+ * Undefined if not found.
+ */
+export function getString(key: string, useLocalStorage: boolean): string {
+  if (useLocalStorage) {
+    return window.localStorage.getItem(key);
+  } else {
+    return _componentToDict(_readComponent())[key];
+  }
+}
+
+/**
+ * Set a string in URI or localStorage.
+ */
+export function setString(
+    key: string, value: string, useLocalStorage: boolean) {
+  if (useLocalStorage) {
+    window.localStorage.setItem(key, value);
+  } else {
+    const items = _componentToDict(_readComponent());
+    items[key] = value;
+    _writeComponent(_dictToComponent(items));
+  }
+}
+
+/**
+ * Return a boolean stored in stored in URI or localStorage.
+ * Undefined if not found.
+ */
+export function getBoolean(key: string, useLocalStorage: boolean): boolean {
+  const item = getString(key, useLocalStorage);
+  return item === 'true' ? true : item === 'false' ? false : undefined;
+}
+
+/**
+ * Store a boolean in URI or localStorage.
+ */
+export function setBoolean(
+    key: string, value: boolean, useLocalStorage = false) {
+  setString(key, value.toString(), useLocalStorage);
+}
+
+/**
+ * Return a number stored in stored in URI or localStorage.
+ * Undefined if not found.
+ */
+export function getNumber(key: string, useLocalStorage: boolean): number {
+  const item = getString(key, useLocalStorage);
+  return item === undefined ? undefined : +item;
+}
+
+/**
+ * Store a number in URI or localStorage.
+ */
+export function setNumber(
+    key: string, value: number, useLocalStorage: boolean) {
+  setString(key, '' + value, useLocalStorage);
+}
+
+/**
+ * Return an object stored in stored in URI or localStorage.
+ * Undefined if not found.
+ */
+export function getObject(key: string, useLocalStorage: boolean): {} {
+  const item = getString(key, useLocalStorage);
+  return item === undefined ? undefined : JSON.parse(atob(item));
+}
+
+/**
+ * Store an object in URI or localStorage.
+ */
+export function setObject(key: string, value: {}, useLocalStorage: boolean) {
+  setString(key, btoa(JSON.stringify(value)), useLocalStorage);
+}
+
+/**
+ * Get a unique storage name for a (Polymer component, propertyName) tuple.
+ *
+ * DISAMBIGUATOR must be set on the component, if other components use the
+ * same propertyName.
+ */
+export function getURIStorageName(
+    component: {}, propertyName: string): string {
+  const d = component[DISAMBIGUATOR];
+  const components = d == null ? [propertyName] : [d, propertyName];
+  return components.join('.');
+}
+
+/**
+ * Return a function that:
+ * (1) Initializes a Polymer boolean property with a default value, if its
+ *     value is not already set
+ * (2) Sets up listener that updates Polymer property on hash change.
+ */
+export function getBooleanInitializer(
+    propertyName: string, defaultVal: boolean,
+    useLocalStorage = false): Function {
+  return _getInitializer(
+      getBoolean, propertyName, defaultVal, useLocalStorage);
+}
+
+/**
+ * Return a function that:
+ * (1) Initializes a Polymer string property with a default value, if its
+ *     value is not already set
+ * (2) Sets up listener that updates Polymer property on hash change.
+ */
+export function getStringInitializer(
+    propertyName: string, defaultVal: string,
+    useLocalStorage = false): Function {
+  return _getInitializer(
+      getString, propertyName, defaultVal, useLocalStorage);
+}
+
+/**
+ * Return a function that:
+ * (1) Initializes a Polymer number property with a default value, if its
+ *     value is not already set
+ * (2) Sets up listener that updates Polymer property on hash change.
+ */
+export function getNumberInitializer(
+    propertyName: string, defaultVal: number,
+    useLocalStorage = false): Function {
+  return _getInitializer(
+      getNumber, propertyName, defaultVal, useLocalStorage);
+}
+
+/**
+ * Return a function that:
+ * (1) Initializes a Polymer Object property with a default value, if its
+ *     value is not already set
+ * (2) Sets up listener that updates Polymer property on hash change.
+ *
+ * Generates a deep clone of the defaultVal to avoid mutation issues.
+ */
+export function getObjectInitializer(
+    propertyName: string, defaultVal: {}, useLocalStorage = false): Function {
+  return _getInitializer(
+      getObject, propertyName, defaultVal, useLocalStorage);
+}
+
+/**
+ * Return a function that updates URIStorage when a string property changes.
+ */
+export function getBooleanObserver(
+    propertyName: string, defaultVal: boolean,
+    useLocalStorage = false): Function {
+  return _getObserver(
+      getBoolean, setBoolean, propertyName, defaultVal, useLocalStorage);
+}
+
+/**
+ * Return a function that updates URIStorage when a string property changes.
+ */
+export function getStringObserver(
+    propertyName: string, defaultVal: string,
+    useLocalStorage = false): Function {
+  return _getObserver(
+      getString, setString, propertyName, defaultVal, useLocalStorage);
+}
+
+/**
+ * Return a function that updates URIStorage when a number property changes.
+ */
+export function getNumberObserver(
+    propertyName: string, defaultVal: number,
+    useLocalStorage = false): Function {
+  return _getObserver(
+      getNumber, setNumber, propertyName, defaultVal, useLocalStorage);
+}
+
+/**
+ * Return a function that updates URIStorage when an object property changes.
+ * Generates a deep clone of the defaultVal to avoid mutation issues.
+ */
+export function getObjectObserver(
+    propertyName: string, defaultVal: {}, useLocalStorage = false): Function {
+  const clone = _.cloneDeep(defaultVal);
+  return _getObserver(
+      getObject, setObject, propertyName, clone, useLocalStorage);
+}
+
+/**
+ * Read component from URI (e.g. returns "events&runPrefix=train*").
+ */
+function _readComponent(): string {
+  return USE_HASH ? window.location.hash.slice(1) : getFakeHash();
+}
+
+/**
+ * Write component to URI.
+ */
+function _writeComponent(component: string) {
+  if (USE_HASH) {
+    window.location.hash = component;
+  } else {
+    setFakeHash(component);
+  }
+}
+
+/**
+ * Convert dictionary of strings into a URI Component.
+ * All key value entries get added as key value pairs in the component,
+ * with the exception of a key with the TAB value, which if present
+ * gets prepended to the URI Component string for backwards compatibility
+ * reasons.
+ */
+function _dictToComponent(items: StringDict): string {
+  let component = '';
+
+  // Add the tab name e.g. 'events', 'images', 'histograms' as a prefix
+  // for backwards compatbility.
+  if (items[TAB] !== undefined) {
+    component += items[TAB];
+  }
+
+  // Join other strings with &key=value notation
+  const nonTab = _.pairs(items)
+                   .filter((pair) =>  pair[0] !== TAB)
+                   .map((pair) => {
+                     return encodeURIComponent(pair[0]) + '=' +
+                         encodeURIComponent(pair[1]);
+                   })
+                   .join('&');
+
+  return nonTab.length > 0 ? (component + '&' + nonTab) : component;
+}
+
+/**
+ * Convert a URI Component into a dictionary of strings.
+ * Component should consist of key-value pairs joined by a delimiter
+ * with the exception of the tabName.
+ * Returns dict consisting of all key-value pairs and
+ * dict[TAB] = tabName
+ */
+function _componentToDict(component: string): StringDict {
+  const items = {} as StringDict;
+
+  const tokens = component.split('&');
+  tokens.forEach((token) => {
+    const kv = token.split('=');
+    // Special backwards compatibility for URI components like #events
+    if (kv.length === 1 && _.contains(TABS, kv[0])) {
+      items[TAB] = kv[0];
+    } else if (kv.length === 2) {
+      items[decodeURIComponent(kv[0])] = decodeURIComponent(kv[1]);
+    }
+  });
+  return items;
+}
+
+/**
+ * Return a function that:
+ * (1) Initializes a Polymer property with a default value, if its
+ *     value is not already set
+ * (2) Sets up listener that updates Polymer property on hash change.
+ */
+function _getInitializer<T>(
+    get: (name: string, useLocalStorage: boolean) => T, propertyName: string,
+    defaultVal: T, useLocalStorage): Function {
+  return function() {
+    const URIStorageName = getURIStorageName(this, propertyName);
+    // setComponentValue will be called every time the hash changes, and is
+    // responsible for ensuring that new state in the hash will be propagated
+    // to the component with that property.
+    // It is important that this function does not re-assign needlessly,
+    // to avoid Polymer observer churn.
+    const setComponentValue = () => {
+      const uriValue = get(URIStorageName, false);
+      const currentValue = this[propertyName];
+      // if uriValue is undefined, we will ensure that the property has the
+      // default value
+      if (uriValue === undefined) {
+        let valueToSet: T;
+        // if we are using localStorage, we will set the value to the value
+        // from localStorage. Then, the corresponding observer will proxy
+        // the localStorage value into URI storage.
+        // in this way, localStorage takes precedence over the default val
+        // but not over the URI value.
+        if (useLocalStorage) {
+          const useLocalStorageValue = get(URIStorageName, true);
+          valueToSet = useLocalStorageValue === undefined ?
+              defaultVal :
+              useLocalStorageValue;
+        } else {
+          valueToSet = defaultVal;
+        }
+        if (!_.isEqual(currentValue, valueToSet)) {
+          // If we don't have an explicit URI value, then we need to ensure
+          // the property value is equal to the default value.
+          // We will assign a clone rather than the canonical default, because
+          // the component receiving this property may mutate it, and we need
+          // to keep a pristine copy of the default.
+          this[propertyName] = _.clone(valueToSet);
+        }
+        // In this case, we have an explicit URI value, so we will ensure that
+        // the component has an equivalent value.
+      } else {
+        if (!_.isEqual(uriValue, currentValue)) {
+          this[propertyName] = uriValue;
+        }
+      }
+    };
+    // Set the value on the property.
+    setComponentValue();
+    // Update it when the hashchanges.
+    window.addEventListener('hashchange', setComponentValue);
+  };
+}
+
+/**
+ * Return a function that updates URIStorage when a property changes.
+ */
+function _getObserver<T>(
+    get: (name: string, useLocalStorage: boolean) => T,
+    set: (name: string, newVal: T, useLocalStorage: boolean) => void,
+    propertyName: string, defaultVal: T, useLocalStorage: boolean): Function {
+  return function() {
+    const URIStorageName = getURIStorageName(this, propertyName);
+    const newVal = this[propertyName];
+    // if this is a localStorage property, we always synchronize the value
+    // in localStorage to match the one currently in the URI.
+    if (useLocalStorage) {
+      set(URIStorageName, newVal, true);
+    }
+    if (!_.isEqual(newVal, get(URIStorageName, false))) {
+      if (_.isEqual(newVal, defaultVal)) {
+        _unsetFromURI(URIStorageName);
+      } else {
+        set(URIStorageName, newVal, false);
+      }
+    }
+  };
+}
+
+/**
+ * Delete a key from the URI.
+ */
+function _unsetFromURI(key) {
+  const items = _componentToDict(_readComponent());
+  delete items[key];
+  _writeComponent(_dictToComponent(items));
+}
+
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/test/BUILD b/tensorflow/tensorboard/components/tf_storage_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..472976f0005f7cdc93250b6f9f7a8cf74190294f
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/test/BUILD
@@ -0,0 +1,50 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/tf-storage/test",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_storage_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4:bundle.d.ts",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"TF.URIStorage": ["storageTests.ts"]},
+    namespace_symbol_aliases = {"TF.URIStorage": {"TABS": "TF.Globals.TABS"}},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/test/storageTests.ts b/tensorflow/tensorboard/components/tf_storage_d3v4/test/storageTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..82dc51f05dade857f1c9cbd09bb6b215e148977a
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/test/storageTests.ts
@@ -0,0 +1,64 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the 'License');
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an 'AS IS' BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+import {TAB, getString, getNumber, getObject, setString, setNumber, setObject} from '../storage';
+import {TABS} from '../../tf-globals/globals';
+
+/* tslint:disable:no-namespace */
+describe('URIStorage', () => {
+  it('get/setString', () => {
+    setString('key_a', 'hello', false);
+    setString('key_b', 'there', false);
+    chai.assert.equal('hello', getString('key_a', false));
+    chai.assert.equal('there', getString('key_b', false));
+    chai.assert.equal(null, getString('key_c', false));
+  });
+
+  it('get/setNumber', () => {
+    setNumber('key_a', 12, false);
+    setNumber('key_b', 3.4, false);
+    chai.assert.equal(12, getNumber('key_a', false));
+    chai.assert.equal(3.4, getNumber('key_b', false));
+    chai.assert.equal(null, getNumber('key_c', false));
+  });
+
+  it('get/setObject', () => {
+    const obj = {'foo': 2.3, 'bar': 'barstr'};
+    setObject('key_a', obj, false);
+    chai.assert.deepEqual(obj, getObject('key_a', false));
+  });
+
+  it('get/setWeirdValues', () => {
+    setNumber('key_a', NaN, false);
+    chai.assert.deepEqual(NaN, getNumber('key_a', false));
+
+    setNumber('key_a', +Infinity, false);
+    chai.assert.equal(+Infinity, getNumber('key_a', false));
+
+    setNumber('key_a', -Infinity, false);
+    chai.assert.equal(-Infinity, getNumber('key_a', false));
+
+    setNumber('key_a', 1 / 3, false);
+    chai.assert.equal(1 / 3, getNumber('key_a', false));
+
+    setNumber('key_a', -0, false);
+    chai.assert.equal(-0, getNumber('key_a', false));
+  });
+
+  it('set/getTab', () => {
+    setString(TAB, TABS[0], false);
+    chai.assert.equal(TABS[0], getString(TAB, false));
+  });
+});
+
diff --git a/tensorflow/tensorboard/components/tf_storage_d3v4/test/tests.html b/tensorflow/tensorboard/components/tf_storage_d3v4/test/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..e0553c7d3c46443b640610d320fefdd3bab704af
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/test/tests.html
@@ -0,0 +1,25 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<script src="../../web-component-tester/browser.js"></script>
+<link rel="import" href="../../polymer/polymer.html">
+<link rel="import" href="../tf-storage.html">
+<body>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/app/analytics.js b/tensorflow/tensorboard/components/tf_storage_d3v4/tf-storage.html
similarity index 70%
rename from tensorflow/tensorboard/app/analytics.js
rename to tensorflow/tensorboard/components/tf_storage_d3v4/tf-storage.html
index 497c02ced8a5155830f7a6ff43cc01ad33bea37e..91b8976519d6fda482c96d7669dbbdbd0f2dba35 100644
--- a/tensorflow/tensorboard/app/analytics.js
+++ b/tensorflow/tensorboard/components/tf_storage_d3v4/tf-storage.html
@@ -1,4 +1,6 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,6 +13,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-==============================================================================*/
+-->
 
-// Nothing to see here. vulcanize doesn't like empty files.
+<link rel="import" href="../tf-globals/tf-globals.html">
+<link rel="import" href="../tf-imports/lodash.html">
+
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run1.pbtxt b/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run1.pbtxt
deleted file mode 100644
index 2a6af3284086b4d797ebf3598bffe286d74baddf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run1.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-node {
-  name: "a"
-  op: "matmul"
-}
-node {
-  name: "b"
-  op: "matmul"
-  input: "a:0"
-}
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run2.pbtxt b/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run2.pbtxt
deleted file mode 100644
index a5a4d65d5c61a7cf1c208b48f841a38a03847d60..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/data/graph_run_run2.pbtxt
+++ /dev/null
@@ -1,15 +0,0 @@
-node {
-  name: "a"
-  op: "matmul"
-}
-node {
-  name: "b"
-  op: "matmul"
-  input: "a:0"
-}
-node {
-  name: "c"
-  op: "matmul"
-  input: "a:0"
-  input: "b:0"
-}
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/data/runs.json b/tensorflow/tensorboard/components/tf_tensorboard/test/data/runs.json
deleted file mode 100644
index 10b2821b30b04b528b6476831a9ed59c3e3e094f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_tensorboard/test/data/runs.json
+++ /dev/null
@@ -1 +0,0 @@
-{"run2": {"graph": true, "histograms": [], "scalars": [], "compressedHistograms": [], "images": []}, "run1": {"graph": true, "histograms": [], "scalars": [], "compressedHistograms": [], "images": []}}
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e17a083cb1573b8b60786c6eda0a03477aa5f008
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/BUILD
@@ -0,0 +1,63 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_tensorboard_d3v4",
+    srcs = [
+        "style.html",
+        "tf-tensorboard.html",
+        ":ts",
+    ],
+    path = "/tf-tensorboard",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_audio_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_distribution_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_globals_d3v4",
+        "//tensorflow/tensorboard/components/tf_graph_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_histogram_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_image_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_scalar_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/tf_storage_d3v4",
+        "//tensorflow/tensorboard/components/tf_text_dashboard_d3v4",
+        "//tensorflow/tensorboard/components/vz_projector_d3v4",
+        "@org_polymer",
+        "@org_polymer_font_roboto",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_header_panel",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_tabs",
+        "@org_polymer_paper_toolbar",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["demo.html"],
+    path = "/tf-tensorboard",
+    deps = [
+        ":tf_tensorboard_d3v4",
+        "//tensorflow/tensorboard/demo:demo_data",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["autoReloadBehavior.ts"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/autoReloadBehavior.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/autoReloadBehavior.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/autoReloadBehavior.ts
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/autoReloadBehavior.ts
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/demo.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/demo.html
new file mode 100644
index 0000000000000000000000000000000000000000..c8a9238aef03b768d08cc1c33fccc4cafdf20d01
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/demo.html
@@ -0,0 +1,25 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<title>TensorBoard Demo</title>
+<script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="style.html">
+<link rel="import" href="tf-tensorboard.html">
+<body>
+<tf-tensorboard demo-dir="/data" use-hash></tf-tensorboard>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard_d3v4/style.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/style.html
new file mode 100644
index 0000000000000000000000000000000000000000..575e89e39828dda56067aa91e1145d45b7e87a18
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/style.html
@@ -0,0 +1,28 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../font-roboto/roboto.html">
+
+<style>
+  html,
+  body {
+    margin: 0;
+    padding: 0;
+    height: 100%;
+    font-family: "RobotoDraft", "Roboto", sans-serif;
+  }
+</style>
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/autoReloadTests.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/autoReloadTests.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/test/autoReloadTests.ts
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/autoReloadTests.ts
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.html
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.html
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/test/e2eTests.ts
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/e2eTests.ts
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.html
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.html
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/test/fastTabSwitch.ts
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/fastTabSwitch.ts
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/index.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/index.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/test/index.html
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/index.html
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.html
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.html
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.ts b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.ts
similarity index 100%
rename from tensorflow/tensorboard/components/tf_tensorboard/test/tensorboardTests.ts
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/test/tensorboardTests.ts
diff --git a/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/tf-tensorboard.html
similarity index 98%
rename from tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html
rename to tensorflow/tensorboard/components/tf_tensorboard_d3v4/tf-tensorboard.html
index b5b2e2d5a86aaa2a4b76f884b5279c1ba6fd7f3a..3a565354109b7c6e1e5c6bbbd44980173d792770 100644
--- a/tensorflow/tensorboard/components/tf_tensorboard/tf-tensorboard.html
+++ b/tensorflow/tensorboard/components/tf_tensorboard_d3v4/tf-tensorboard.html
@@ -36,6 +36,7 @@ limitations under the License.
 <link rel="import" href="../tf-backend/tf-backend.html">
 <link rel="import" href="../tf-storage/tf-storage.html">
 <link rel="import" href="../vz-projector/vz-projector-dashboard.html">
+<link rel="import" href="../vz-projector/bundle.html">
 
 <!--
 tf-tensorboard is the frontend entry point for TensorBoard.
@@ -210,6 +211,8 @@ allows the user to toggle between various dashboards.
     </style>
   </template>
   <script>
+    "use strict";
+
     Polymer({
       is: "tf-tensorboard",
       behaviors: [TF.TensorBoard.AutoReloadBehavior],
@@ -345,6 +348,5 @@ allows the user to toggle between various dashboards.
       },
     });
   </script>
-  <!-- Compiled bundle of all components using ES6 modules. -->
-  <script src="../bundle.js"></script>
+  <script src="autoReloadBehavior.js"></script>
 </dom-module>
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard/BUILD
deleted file mode 100644
index 9e4cd3614dcd83d0c0df20ff110dcd8a4340acbf..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/BUILD
+++ /dev/null
@@ -1,58 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "tf_text_dashboard",
-    srcs = [
-        "tf-text-dashboard.html",
-        "tf-text-loader.html",
-    ],
-    path = "/tf-text-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend",
-        "//tensorflow/tensorboard/components/tf_color_scale",
-        "//tensorflow/tensorboard/components/tf_dashboard_common",
-        "//tensorflow/tensorboard/components/tf_imports:d3",
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "@org_polymer",
-        "@org_polymer_paper_dialog",
-        "@org_polymer_paper_icon_button",
-        "@org_polymer_paper_material",
-        "@org_polymer_paper_slider",
-        "@org_polymer_paper_spinner",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "tf-text-dashboard.html",
-        "tf-text-loader.html",
-    ],
-    destdir = "tf-text-dashboard",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_backend:legacy",
-        "//tensorflow/tensorboard/components/tf_dashboard_common:legacy",
-        "//third_party/javascript/polymer/v1/paper-material:lib",
-    ],
-)
-
-# This is needed: components/BUILD seeks a legacy_ts rule in this package.
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [],
-)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard/demo/BUILD
deleted file mode 100644
index 6cd6702e4b77fcabbb4339a63855e26dce36022f..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/demo/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-# bazel run //third_party/tensorflow/tensorboard/components/tf_text_dashboard/demo
-webfiles(
-    name = "demo",
-    srcs = ["index.html"],
-    path = "/tf-text-dashboard/demo",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_text_dashboard",
-        "//tensorflow/tensorboard/components/tf_text_dashboard/demo/data",
-        "@org_polymer_iron_demo_helpers",
-        "@org_polymer_paper_styles",
-        "@org_polymer_webcomponentsjs",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/BUILD
deleted file mode 100644
index 8adf661396c3453d2b0318c1bc89986cd971cbad..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "data",
-    srcs = glob(["*"]),
-    path = "/tf-text-dashboard/demo/data/plugin/text",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/BUILD b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c15cf2fdc4a7b8c78a9c4ac80599402df458b7a1
--- /dev/null
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/BUILD
@@ -0,0 +1,45 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "tf_text_dashboard_d3v4",
+    srcs = [
+        "tf-text-dashboard.html",
+        "tf-text-loader.html",
+    ],
+    path = "/tf-text-dashboard",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_backend_d3v4",
+        "//tensorflow/tensorboard/components/tf_color_scale_d3v4",
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "@org_polymer",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_material",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"] + glob(["data/**"]),
+    path = "/tf-text-dashboard",
+    deps = [
+        ":tf_text_dashboard_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/logdir b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/logdir
similarity index 100%
rename from tensorflow/tensorboard/components/tf_text_dashboard/demo/data/logdir
rename to tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/logdir
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/runs.json b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/runs.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_text_dashboard/demo/data/runs.json
rename to tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/runs.json
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/text_run_fry_tag_markdown.json b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_markdown.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_text_dashboard/demo/data/text_run_fry_tag_markdown.json
rename to tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_markdown.json
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/text_run_fry_tag_message.json b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_message.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_text_dashboard/demo/data/text_run_fry_tag_message.json
rename to tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_fry_tag_message.json
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/data/text_run_leela_tag_message.json b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_leela_tag_message.json
similarity index 100%
rename from tensorflow/tensorboard/components/tf_text_dashboard/demo/data/text_run_leela_tag_message.json
rename to tensorflow/tensorboard/components/tf_text_dashboard_d3v4/data/text_run_leela_tag_message.json
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/demo/index.html b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/index.html
similarity index 89%
rename from tensorflow/tensorboard/components/tf_text_dashboard/demo/index.html
rename to tensorflow/tensorboard/components/tf_text_dashboard_d3v4/index.html
index 3ab6e8573874fed111c6168eaf0c65c7792f9127..77d19b948c9d1844f6e6c2990075f14e1a6e6347 100644
--- a/tensorflow/tensorboard/components/tf_text_dashboard/demo/index.html
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/index.html
@@ -18,9 +18,9 @@ limitations under the License.
 
 <html>
   <head>
-    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../tf-text-dashboard.html">
-    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="tf-text-dashboard.html">
     <title>text Dashboard Demo</title>
     <style>
       #container{
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-dashboard.html b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-dashboard.html
similarity index 92%
rename from tensorflow/tensorboard/components/tf_text_dashboard/tf-text-dashboard.html
rename to tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-dashboard.html
index d39c890a7aeb6bce9597f98c9b85a9796cdfc60c..4c0b34055d09f1b87e0eaf2faefc6d349c8c94c9 100644
--- a/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-dashboard.html
+++ b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-dashboard.html
@@ -82,8 +82,11 @@ tf-text-dashboard displays a dashboard that loads texts from a TensorFlow run.
     </style>
   </template>
   <script>
-    Polymer({
+    TF.Dashboard.TfTextDashboard = Polymer({
       is: "tf-text-dashboard",
+      factoryImpl: function(backend) {
+        this.backend = backend;
+      },
       properties: {
         backend: Object,
         dataType: {
@@ -92,15 +95,15 @@ tf-text-dashboard displays a dashboard that loads texts from a TensorFlow run.
         },
       },
       behaviors: [
-          TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-          TF.Backend.Behavior,
+        TF.Dashboard.DashboardBehavior("text"),
+        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
+        TF.Backend.BackendBehavior,
       ],
       attached: function() {
         this.async(function() {
           this.fire("rendered");
         });
       },
-
     });
   </script>
 </dom-module>
diff --git a/tensorflow/tensorboard/components/tf_text_dashboard/tf-text-loader.html b/tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-loader.html
similarity index 100%
rename from tensorflow/tensorboard/components/tf_text_dashboard/tf-text-loader.html
rename to tensorflow/tensorboard/components/tf_text_dashboard_d3v4/tf-text-loader.html
diff --git a/tensorflow/tensorboard/components/vz_data_summary/BUILD b/tensorflow/tensorboard/components/vz_data_summary/BUILD
index 9743d70d947c13edf455b9306e60757f8b104d68..a4ba0c089c94425d6613b334c05495d38772c601 100644
--- a/tensorflow/tensorboard/components/vz_data_summary/BUILD
+++ b/tensorflow/tensorboard/components/vz_data_summary/BUILD
@@ -1,34 +1,91 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-# Description:
-# Package for the data-summary vz-element.
-package(default_visibility = ["//tensorflow:internal"])
+package(default_visibility = ["//visibility:public"])
+
+load(
+    "//tensorflow/tensorboard:defs.bzl",
+    "tensorboard_ts_config",
+    "tensorboard_ts_declaration",
+    "tensorboard_ts_development_sources",
+    "tensorboard_ts_devserver",
+    "tensorboard_ts_library",
+    "tensorboard_webcomponent_library",
+)
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
+tensorboard_webcomponent_library(
+    name = "lib",
+    srcs = ["vz-data-summary.html"],
+    ts_lib_deps = [":ts_lib"],
+    destdir = "vz-data-summary",
+    deps = [
+        "//learning/vis/vz_elements:common",
+        "//third_party/javascript/d3/v3:lib",
+        "//third_party/javascript/polymer/v1/iron-demo-helpers:lib",
+        "//third_party/javascript/polymer/v1/iron-resizable-behavior:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "ts_lib",
+    srcs = ["vz-data-summary.ts"],
+    externs_list = [":externs"],
+    deps = [
+        ":typings",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+    ],
+)
+
+tensorboard_ts_declaration(
+    name = "typings",
+    srcs = ["typings.d.ts"],
+)
+
+# This build rule is used to run the demo.
+tensorboard_ts_devserver(
+    name = "dev_server",
+    manifest = ":dev_sources",
+    serving_path = "/demo_lib_out/vz-data-summary/vz-data-summary.js",
+    static_files = [":demo_lib"],
+    deps = [":tsconfig"],
+)
+
+tensorboard_webcomponent_library(
+    name = "demo_lib",
+    srcs = ["demo.html"],
+    destdir = "vz-data-summary",
+    deps = [
+        ":lib",
+        "//third_party/javascript/d3/v3:lib",
+        "//third_party/javascript/polymer/v1/iron-demo-helpers:lib",
+        "//third_party/javascript/polymer/v1/iron-resizable-behavior:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "demo_ts_lib",
+    srcs = ["demo.ts"],
+    externs_list = [":externs"],
+    deps = [
+        ":ts_lib",
+        "//third_party/javascript/typings/d3",
+    ],
+)
+
+tensorboard_ts_development_sources(
+    name = "dev_sources",
+    deps = [":demo_ts_lib"],
+)
+
+tensorboard_ts_config(
+    name = "tsconfig",
+    deps = [":ts_lib"],
+)
 
 filegroup(
     name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
+    srcs = glob(["**"]),
+    tags = ["notsan"],
     visibility = ["//tensorflow:__subpackages__"],
 )
diff --git a/tensorflow/tensorboard/components/vz_data_summary/vz-data-summary.ts b/tensorflow/tensorboard/components/vz_data_summary/vz-data-summary.ts
index 9a4e80c8a9830b7f8e839f900516ced5d7779b08..27faf35f6bb9c96055e97fa722e1adf7904456f4 100644
--- a/tensorflow/tensorboard/components/vz_data_summary/vz-data-summary.ts
+++ b/tensorflow/tensorboard/components/vz_data_summary/vz-data-summary.ts
@@ -118,7 +118,7 @@ function createRect(
   // Set dimensions.
   rect.setAttribute('width', sliceWidth.toString());
   rect.setAttribute('height', height.toString());
-  // Set colour.
+  // Set color.
   rect.setAttribute('fill', color);
 
   return rect;
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/BUILD b/tensorflow/tensorboard/components/vz_distribution_chart/BUILD
deleted file mode 100644
index 225afe79d66b1c409932ac93d19ac2d3be85d526..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_distribution_chart/BUILD
+++ /dev/null
@@ -1,71 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "vz_distribution_chart",
-    srcs = [
-        "vz-distribution-chart.html",
-        ":ts",
-    ],
-    path = "/vz-distribution-chart",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:plottable",
-        "//tensorflow/tensorboard/components/vz_line_chart",
-        "@org_polymer",
-    ],
-)
-
-tensorboard_typescript_genrule(
-    name = "ts",
-    srcs = [
-        "vz-distribution-chart.ts",
-    ],
-    typings = [
-        "@org_definitelytyped//:d3.d.ts",
-        "@com_palantir_plottable//:plottable.d.ts",
-        "@org_definitelytyped//:lodash.d.ts",
-        "//tensorflow/tensorboard/components/vz_line_chart:ts_typings",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "vz-distribution-chart.html",
-        ":legacy_ts",
-    ],
-    visibility = ["//visibility:public"],
-    destdir = "vz-distribution-chart",
-    deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//tensorflow/tensorboard/components/vz_sorting:legacy",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [
-        "vz-distribution-chart.ts",
-    ],
-    deps = [
-        "//tensorflow/tensorboard/components:common_deps",
-        "//tensorflow/tensorboard/components/vz_line_chart:legacy_ts",
-    ],
-)
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.html b/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.html
deleted file mode 100644
index 726e2216072885d39bfc3098d06d616d401bef7e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.html
+++ /dev/null
@@ -1,142 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/plottable.html">
-<link rel="import" href="../tf-imports/lodash.html">
-
-<dom-module id="vz-distribution-chart">
-  <template>
-    <svg id="chartsvg"></svg>
-    <style>
-      :host {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        display: flex;
-        flex-direction: column;
-        flex-grow: 1;
-        flex-shrink: 1;
-        position: relative;
-      }
-      svg {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-
-    </style>
-  </template>
-  <script src="vz-distribution-chart.js"></script>
-  <script src="../vz-line-chart/vz-chart-helpers.js"></script>
-  <script>
-    Polymer({
-      is: "vz-distribution-chart",
-      properties: {
-        /**
-         * Scale that maps series names to colors. The default colors are from
-         * d3.scale.category10() scale. Use this property to replace the default
-         * line colors with colors of your own choice.
-         * @type {Plottable.Scales.Color}
-         * @required
-         */
-        colorScale: {
-          type: Object,
-          value: function() {
-            return new Plottable.Scales.Color()
-                .range(d3.scale.category10().range());
-          }
-        },
-        /**
-         * The way to display the X values. Allows:
-         * - "step" - Linear scale using the  "step" property of the datum.
-         * - "wall_time" - Temporal scale using the "wall_time" property of the
-         * datum.
-         * - "relative" - Temporal scale using the "relative" property of the
-         * datum if it is present or calculating from "wall_time" if it isn't.
-         */
-        xType: {
-          type: String,
-          value: 'step'
-        },
-        _attached: Boolean,
-        _chart: Object,
-        _visibleSeriesCache: {
-          type: Array,
-          value: function() { return [] }
-        },
-        _seriesDataCache: {
-          type: Object,
-          value: function() { return {} }
-        },
-        _makeChartAsyncCallbackId: { type: Number, value: null }
-      },
-      observers: [
-        "_makeChart(xType, colorScale, _attached)",
-        "_reloadFromCache(_chart)",
-      ],
-      setVisibleSeries: function(names) {
-        this._visibleSeriesCache = names;
-        if (this._chart) {
-          this._chart.setVisibleSeries(names);
-          this.redraw();
-        }
-      },
-      setSeriesData: function(name, data) {
-        this._seriesDataCache[name] = data;
-        if (this._chart) {
-          this._chart.setSeriesData(name, data);
-        }
-      },
-      redraw: function() {
-        this._chart.redraw();
-      },
-      ready: function() {
-        this.scopeSubtree(this.$.chartsvg, true);
-      },
-      _makeChart: function(xType, colorScale, _attached) {
-        if (this._makeChartAsyncCallbackId === null) {
-          this.cancelAsync(this._makeChartAsyncCallbackId);
-        }
-
-        this._makeChartAsyncCallbackId = this.async(function() {
-          this._makeChartAsyncCallbackId = null;
-          if (!_attached) return;
-          if (this._chart) this._chart.destroy();
-          var chart = new VZ.DistributionChart(xType, colorScale);
-          var svg = d3.select(this.$.chartsvg);
-          chart.renderTo(svg);
-          this._chart = chart;
-        }, 350);
-      },
-      _reloadFromCache: function() {
-        if(this._chart) {
-          this._chart.setVisibleSeries(this._visibleSeriesCache);
-          this._visibleSeriesCache.forEach(function(name) {
-            this._chart.setSeriesData(name, this._seriesDataCache[name] || []);
-          }.bind(this));
-        }
-      },
-      attached: function() {
-        this._attached = true;
-      },
-      detached: function() {
-        this._attached = false;
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.ts b/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.ts
deleted file mode 100644
index 1c64eb4cd49d31761e6372dc7c14c3ff186355bc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_distribution_chart/vz-distribution-chart.ts
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-
-module VZ {
-  export class DistributionChart {
-    private run2datasets: {[run: string]: Plottable.Dataset};
-    protected runs: string[];
-
-    protected xAccessor: Plottable.Accessor<number|Date>;
-    protected xScale: Plottable.QuantitativeScale<number|Date>;
-    protected yScale: Plottable.QuantitativeScale<number>;
-    protected gridlines: Plottable.Components.Gridlines;
-    protected center: Plottable.Components.Group;
-    protected xAxis: Plottable.Axes.Numeric|Plottable.Axes.Time;
-    protected yAxis: Plottable.Axes.Numeric;
-    protected xLabel: Plottable.Components.AxisLabel;
-    protected yLabel: Plottable.Components.AxisLabel;
-    protected outer: Plottable.Components.Table;
-    protected colorScale: Plottable.Scales.Color;
-    private plots: Plottable.XYPlot<number|Date, number>[];
-
-    private targetSVG: d3.Selection<any>;
-
-    constructor(xType: string, colorScale: Plottable.Scales.Color) {
-      this.run2datasets = {};
-      this.colorScale = colorScale;
-      this.buildChart(xType);
-    }
-
-    protected getDataset(run: string) {
-      if (this.run2datasets[run] === undefined) {
-        this.run2datasets[run] = new Plottable.Dataset([], {run: run});
-      }
-      return this.run2datasets[run];
-    }
-
-    protected buildChart(xType: string) {
-      if (this.outer) {
-        this.outer.destroy();
-      }
-      let xComponents = VZ.ChartHelpers.getXComponents(xType);
-      this.xAccessor = xComponents.accessor;
-      this.xScale = xComponents.scale;
-      this.xAxis = xComponents.axis;
-      this.xAxis.margin(0).tickLabelPadding(3);
-      this.yScale = new Plottable.Scales.Linear();
-      this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
-      let yFormatter = VZ.ChartHelpers.multiscaleFormatter(
-          VZ.ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
-      this.yAxis.margin(0).tickLabelPadding(5).formatter(yFormatter);
-      this.yAxis.usesTextWidthApproximation(true);
-
-      let center = this.buildPlot(this.xAccessor, this.xScale, this.yScale);
-
-      this.gridlines =
-          new Plottable.Components.Gridlines(this.xScale, this.yScale);
-
-      this.center = new Plottable.Components.Group([this.gridlines, center]);
-      this.outer = new Plottable.Components.Table(
-          [[this.yAxis, this.center], [null, this.xAxis]]);
-    }
-
-    protected buildPlot(xAccessor, xScale, yScale): Plottable.Component {
-      let percents = [0, 228, 1587, 3085, 5000, 6915, 8413, 9772, 10000];
-      let opacities = _.range(percents.length - 1)
-                          .map((i) => (percents[i + 1] - percents[i]) / 2500);
-      let accessors = percents.map((p, i) => (datum) => datum[i][1]);
-      let median = 4;
-      let medianAccessor = accessors[median];
-
-      let plots = _.range(accessors.length - 1).map((i) => {
-        let p = new Plottable.Plots.Area<number|Date>();
-        p.x(xAccessor, xScale);
-
-        let y0 = i > median ? accessors[i] : accessors[i + 1];
-        let y = i > median ? accessors[i + 1] : accessors[i];
-        p.y(y, yScale);
-        p.y0(y0);
-        p.attr(
-            'fill', (d: any, i: number, dataset: Plottable.Dataset) =>
-                        this.colorScale.scale(dataset.metadata().run));
-        p.attr(
-            'stroke', (d: any, i: number, dataset: Plottable.Dataset) =>
-                          this.colorScale.scale(dataset.metadata().run));
-        p.attr('stroke-weight', (d: any, i: number, m: any) => '0.5px');
-        p.attr('stroke-opacity', () => opacities[i]);
-        p.attr('fill-opacity', () => opacities[i]);
-        return p;
-      });
-
-      let medianPlot = new Plottable.Plots.Line<number|Date>();
-      medianPlot.x(xAccessor, xScale);
-      medianPlot.y(medianAccessor, yScale);
-      medianPlot.attr(
-          'stroke',
-          (d: any, i: number, m: any) => this.colorScale.scale(m.run));
-
-      this.plots = plots;
-      return new Plottable.Components.Group(plots);
-    }
-
-    public setVisibleSeries(runs: string[]) {
-      this.runs = runs;
-      let datasets = runs.map((r) => this.getDataset(r));
-      this.plots.forEach((p) => p.datasets(datasets));
-    }
-
-    /**
-     * Set the data of a series on the chart.
-     */
-    public setSeriesData(name: string, data: any) {
-      this.getDataset(name).data(data);
-    }
-
-    public renderTo(targetSVG: d3.Selection<any>) {
-      this.targetSVG = targetSVG;
-      this.setViewBox();
-      this.outer.renderTo(targetSVG);
-    }
-
-    /** There's an issue in Chrome where the svg overflow is a bit
-     * "flickery". There is a border on the gridlines on the extreme edge of the
-     * chart, which behaves inconsistently and causes the screendiffing tests to
-     * flake. We can solve this by creating 1px effective margin for the svg by
-     * setting the viewBox on the containing svg.
-     */
-    private setViewBox() {
-      // There's an issue in Firefox where if we measure with the old viewbox
-      // set, we get horrible results.
-      this.targetSVG.attr('viewBox', null);
-
-      let parent = this.targetSVG.node().parentNode as HTMLElement;
-      let w = parent.clientWidth;
-      let h = parent.clientHeight;
-      this.targetSVG.attr({
-        'height': h,
-        'viewBox': `0 0 ${w + 1} ${h + 1}`,
-      });
-    }
-
-    public redraw() {
-      this.outer.redraw();
-      this.setViewBox();
-    }
-
-    protected destroy() { this.outer.destroy(); }
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/BUILD b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3ff60d5143a2495cff7259e8d55459c1f58305de
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/BUILD
@@ -0,0 +1,66 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_distribution_chart_d3v4",
+    srcs = [
+        "bundle.js",
+        "vz-distribution-chart.html",
+    ],
+    path = "/vz-distribution-chart",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable",
+        "//tensorflow/tensorboard/components/vz_line_chart_d3v4",
+        "@org_polymer",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-distribution-chart",
+    deps = [
+        ":vz_distribution_chart_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/vz_line_chart_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"VZ": [
+        "vz-distribution-chart.ts",
+    ]},
+    namespace_symbol_aliases = {"VZ": {
+        "Dataset": "Plottable.Dataset",
+        "ChartHelpers": "VZ.ChartHelpers",
+    }},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart/demo/index.html b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/index.html
similarity index 94%
rename from tensorflow/tensorboard/components/vz_distribution_chart/demo/index.html
rename to tensorflow/tensorboard/components/vz_distribution_chart_d3v4/index.html
index 5dca73aac4dfaf3ce44234a9a60e40423dcd9adf..39db09354bd527fa90bb05f0d7656991b1d2383a 100644
--- a/tensorflow/tensorboard/components/vz_distribution_chart/demo/index.html
+++ b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/index.html
@@ -21,16 +21,16 @@ limitations under the License.
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>vz-distribution chart demo</title>
-    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../vz-distribution-chart.html">
-    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="../../paper-styles/typography.html">
+    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+    <link rel="import" href="vz-distribution-chart.html">
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="../paper-styles/typography.html">
     <style type="text/css">
       body {
         font-family: "Roboto";
       }
 
-      vz-line-chart {
+      vz-distribution-chart {
         height: 400px;
       }
     </style>
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.html b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.html
new file mode 100644
index 0000000000000000000000000000000000000000..3c517bd164e8981251ec490410fb6d357221bbac
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.html
@@ -0,0 +1,45 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/plottable.html">
+<link rel="import" href="../vz-line-chart/vz-line-chart.html">
+
+<dom-module id="vz-distribution-chart">
+  <template>
+    <div id="chartdiv"></div>
+    <style>
+      :host {
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        display: flex;
+        flex-direction: column;
+        flex-grow: 1;
+        flex-shrink: 1;
+        position: relative;
+      }
+      #chartdiv {
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        flex-grow: 1;
+        flex-shrink: 1;
+      }
+
+    </style>
+  </template>
+  <script src="bundle.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.ts b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.ts
new file mode 100644
index 0000000000000000000000000000000000000000..01def4738391fec24c22bcc4ed98a70e7e356490
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_distribution_chart_d3v4/vz-distribution-chart.ts
@@ -0,0 +1,242 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+/* tslint:disable:no-namespace variable-name */
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+import * as _ from 'lodash'
+import * as Plottable from 'Plottable/plottable';  // from //third_party/javascript/plottable/v3
+import {Dataset} from 'Plottable/plottable';
+
+import * as ChartHelpers from '../vz_line_chart_d3v4/vz-chart-helpers'
+
+export class DistributionChart {
+  private run2datasets: {[run: string]: Plottable.Dataset};
+  protected runs: string[];
+
+  protected xAccessor: Plottable.IAccessor<number|Date>;
+  protected xScale: Plottable.QuantitativeScale<number|Date>;
+  protected yScale: Plottable.QuantitativeScale<number>;
+  protected gridlines: Plottable.Components.Gridlines;
+  protected center: Plottable.Components.Group;
+  protected xAxis: Plottable.Axes.Numeric|Plottable.Axes.Time;
+  protected yAxis: Plottable.Axes.Numeric;
+  protected xLabel: Plottable.Components.AxisLabel;
+  protected yLabel: Plottable.Components.AxisLabel;
+  protected outer: Plottable.Components.Table;
+  protected colorScale: Plottable.Scales.Color;
+  private plots: Plottable.XYPlot<number|Date, number>[];
+
+  private targetSVG: d3.Selection<any, any, any, any>;
+
+  constructor(xType: string, colorScale: Plottable.Scales.Color) {
+    this.run2datasets = {};
+    this.colorScale = colorScale;
+    this.buildChart(xType);
+  }
+
+  protected getDataset(run: string) {
+    if (this.run2datasets[run] === undefined) {
+      this.run2datasets[run] = new Plottable.Dataset([], {run: run});
+    }
+    return this.run2datasets[run];
+  }
+
+  protected buildChart(xType: string) {
+    if (this.outer) {
+      this.outer.destroy();
+    }
+    let xComponents = ChartHelpers.getXComponents(xType);
+    this.xAccessor = xComponents.accessor;
+    this.xScale = xComponents.scale;
+    this.xAxis = xComponents.axis;
+    this.xAxis.margin(0).tickLabelPadding(3);
+    this.yScale = new Plottable.Scales.Linear();
+    this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
+    let yFormatter = ChartHelpers.multiscaleFormatter(
+        ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
+    this.yAxis.margin(0).tickLabelPadding(5).formatter(yFormatter);
+    this.yAxis.usesTextWidthApproximation(true);
+
+    let center = this.buildPlot(this.xAccessor, this.xScale, this.yScale);
+
+    this.gridlines =
+        new Plottable.Components.Gridlines(this.xScale, this.yScale);
+
+    this.center = new Plottable.Components.Group([this.gridlines, center]);
+    this.outer = new Plottable.Components.Table(
+        [[this.yAxis, this.center], [null, this.xAxis]]);
+  }
+
+  protected buildPlot(xAccessor, xScale, yScale): Plottable.Component {
+    let percents = [0, 228, 1587, 3085, 5000, 6915, 8413, 9772, 10000];
+    let opacities = _.range(percents.length - 1)
+                        .map((i) => (percents[i + 1] - percents[i]) / 2500);
+    let accessors = percents.map((p, i) => (datum) => datum[i][1]);
+    let median = 4;
+    let medianAccessor = accessors[median];
+
+    let plots = _.range(accessors.length - 1).map((i) => {
+      let p = new Plottable.Plots.Area<number|Date>();
+      p.x(xAccessor, xScale);
+
+      let y0 = i > median ? accessors[i] : accessors[i + 1];
+      let y = i > median ? accessors[i + 1] : accessors[i];
+      p.y(y, yScale);
+      p.y0(y0);
+      p.attr(
+          'fill',
+          (d: any, i: number, dataset: Plottable.Dataset) =>
+              this.colorScale.scale(dataset.metadata().run));
+      p.attr(
+          'stroke',
+          (d: any, i: number, dataset: Plottable.Dataset) =>
+              this.colorScale.scale(dataset.metadata().run));
+      p.attr('stroke-weight', (d: any, i: number, m: any) => '0.5px');
+      p.attr('stroke-opacity', () => opacities[i]);
+      p.attr('fill-opacity', () => opacities[i]);
+      return p;
+    });
+
+    let medianPlot = new Plottable.Plots.Line<number|Date>();
+    medianPlot.x(xAccessor, xScale);
+    medianPlot.y(medianAccessor, yScale);
+    medianPlot.attr(
+        'stroke', (d: any, i: number, m: any) => this.colorScale.scale(m.run));
+
+    this.plots = plots;
+    return new Plottable.Components.Group(plots);
+  }
+
+  public setVisibleSeries(runs: string[]) {
+    this.runs = runs;
+    let datasets = runs.map((r) => this.getDataset(r));
+    this.plots.forEach((p) => p.datasets(datasets));
+  }
+
+  /**
+   * Set the data of a series on the chart.
+   */
+  public setSeriesData(name: string, data: any) {
+    this.getDataset(name).data(data);
+  }
+
+  public renderTo(targetSVG: d3.Selection<any, any, any, any>) {
+    this.targetSVG = targetSVG;
+    this.outer.renderTo(targetSVG);
+  }
+
+  public redraw() {
+    this.outer.redraw();
+  }
+
+  protected destroy() {
+    this.outer.destroy();
+  }
+}
+
+
+Polymer({
+  is: 'vz-distribution-chart',
+  properties: {
+    /**
+     * Scale that maps series names to colors. The default colors are from
+     * d3.d3.schemeCategory10. Use this property to replace the default
+     * line colors with colors of your own choice.
+     * @type {Plottable.Scales.Color}
+     * @required
+     */
+    colorScale: {
+      type: Object,
+      value: function() {
+        return new Plottable.Scales.Color().range(d3.schemeCategory10);
+      }
+    },
+    /**
+     * The way to display the X values. Allows:
+     * - "step" - Linear scale using the  "step" property of the datum.
+     * - "wall_time" - Temporal scale using the "wall_time" property of the
+     * datum.
+     * - "relative" - Temporal scale using the "relative" property of the
+     * datum if it is present or calculating from "wall_time" if it isn't.
+     */
+    xType: {type: String, value: 'step'},
+    _attached: Boolean,
+    _chart: Object,
+    _visibleSeriesCache: {
+      type: Array,
+      value: function() {
+        return []
+      }
+    },
+    _seriesDataCache: {
+      type: Object,
+      value: function() {
+        return {}
+      }
+    },
+    _makeChartAsyncCallbackId: {type: Number, value: null}
+  },
+  observers: [
+    '_makeChart(xType, colorScale, _attached)',
+    '_reloadFromCache(_chart)',
+  ],
+  setVisibleSeries: function(names) {
+    this._visibleSeriesCache = names;
+    if (this._chart) {
+      this._chart.setVisibleSeries(names);
+      this.redraw();
+    }
+  },
+  setSeriesData: function(name, data) {
+    this._seriesDataCache[name] = data;
+    if (this._chart) {
+      this._chart.setSeriesData(name, data);
+    }
+  },
+  redraw: function() {
+    this._chart.redraw();
+  },
+  ready: function() {
+    this.scopeSubtree(this.$.chartdiv, true);
+  },
+  _makeChart: function(xType, colorScale, _attached) {
+    if (this._makeChartAsyncCallbackId === null) {
+      this.cancelAsync(this._makeChartAsyncCallbackId);
+    }
+
+    this._makeChartAsyncCallbackId = this.async(function() {
+      this._makeChartAsyncCallbackId = null;
+      if (!_attached) return;
+      if (this._chart) this._chart.destroy();
+      var chart = new DistributionChart(xType, colorScale);
+      var svg = d3.select(this.$.chartdiv);
+      chart.renderTo(svg);
+      this._chart = chart;
+    }, 350);
+  },
+  _reloadFromCache: function() {
+    if (this._chart) {
+      this._chart.setVisibleSeries(this._visibleSeriesCache);
+      this._visibleSeriesCache.forEach(function(name) {
+        this._chart.setSeriesData(name, this._seriesDataCache[name] || []);
+      }.bind(this));
+    }
+  },
+  attached: function() {
+    this._attached = true;
+  },
+  detached: function() {
+    this._attached = false;
+  }
+});
diff --git a/tensorflow/tensorboard/components/vz_heatmap/BUILD b/tensorflow/tensorboard/components/vz_heatmap/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..eccc411f1b52bae193e7a8a79dc35a5bc5de88d7
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_heatmap/BUILD
@@ -0,0 +1,29 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+licenses(["notice"])  # Apache 2.0
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "demo/index.html",
+        "index.html",
+        "vz-heatmap.html",
+    ],
+    visibility = ["//visibility:public"],
+    destdir = "vz-heatmap",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_google:lib",
+        "//tensorflow/tensorboard/components/vz_sorting:legacy",
+        "//third_party/javascript/polymer/v1/iron-component-page:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+        "//third_party/javascript/polymer/v1/webcomponentsjs:lib",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_heatmap/demo/index.html b/tensorflow/tensorboard/components/vz_heatmap/demo/index.html
index 63dd04485fc9f37390bf8c56dd898e747bb0d6a0..0ef092fccc4a140a2b077086b670c088719def6c 100644
--- a/tensorflow/tensorboard/components/vz_heatmap/demo/index.html
+++ b/tensorflow/tensorboard/components/vz_heatmap/demo/index.html
@@ -22,6 +22,7 @@ limitations under the License.
   <title>Heatmap example</title>
   <link rel="import" href="../vz-heatmap.html">
   <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+  <link rel="import" href="../../tf-imports/d3.html">
 </head>
 <script>
   function generateRandomMatrix() {
@@ -108,7 +109,7 @@ limitations under the License.
     setTimeout(function () {
       var heatmapColor = document.getElementById('color_function');
       heatmapColor.colorFunction =
-          d3.scale.linear().range(['white', 'black']).domain([0, 5]);
+          d3.scaleLinear().range(['white', 'black']).domain([0, 5]);
     }, 1500);
   </script>
 </div>
diff --git a/tensorflow/tensorboard/components/vz_heatmap/vz-heatmap.html b/tensorflow/tensorboard/components/vz_heatmap/vz-heatmap.html
index 7acba47d5f5b9c42cac364fb745bd67e6b935763..09fa8579291ae9fea0d78cc5ac28e8d88a90cfce 100644
--- a/tensorflow/tensorboard/components/vz_heatmap/vz-heatmap.html
+++ b/tensorflow/tensorboard/components/vz_heatmap/vz-heatmap.html
@@ -20,7 +20,7 @@ limitations under the License.
 
 <!--
 `vz-heatmap` A simple heatmap to visualize 2D data using predefined or user
-defined colour scheme, with a dependency on d3.js. The heatmap automatically
+defined color scheme, with a dependency on d3.js. The heatmap automatically
 fits itself to the width of the container it is placed in.
 
 @element vz-heatmap
@@ -89,7 +89,7 @@ fits itself to the width of the container it is placed in.
         _generatedColorScale: {
           type: Object,
           value: (function () {
-            var retFunction = d3.scale.linear();
+            var retFunction = d3.scaleLinear();
             if (this.colors) {
               retFunction.range(this.colors)
             }
@@ -277,7 +277,7 @@ fits itself to the width of the container it is placed in.
        */
       _updateColorFunction: function () {
         if (Array.isArray(this.colors) && this.colors.length) {
-          this._generatedColorScale = d3.scale.linear().range(this.colors);
+          this._generatedColorScale = d3.scaleLinear().range(this.colors);
         }
 
         if (this.values) {
@@ -287,14 +287,14 @@ fits itself to the width of the container it is placed in.
             this._generatedColorScale.domain(this.values);
           }
 
-          // If values reset, and data field contains valid data, set colour scale
+          // If values reset, and data field contains valid data, set color scale
           // to use the data extent as domain.
         } else if (this._currentDataValid) {
           this._generatedColorScale.domain(this._dataExtent);
         }
       },
       _getLinearInterpolation: function (domain, range) {
-        return d3.scale.linear().domain(domain).range(range);
+        return d3.scaleLinear().domain(domain).range(range);
       },
       /**
        * Find side length of each tile in heat map by dividing current width
diff --git a/tensorflow/tensorboard/components/vz_heatmap_d3v4/BUILD b/tensorflow/tensorboard/components/vz_heatmap_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b62629fc55493269a4c080c20cd3aa5ec9ee2c05
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_heatmap_d3v4/BUILD
@@ -0,0 +1,41 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_heatmap_d3v4",
+    srcs = ["vz-heatmap.html"],
+    path = "/vz-heatmap",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer",
+        "@org_polymer_iron_resizable_behavior",
+    ],
+)
+
+web_library(
+    name = "index",
+    srcs = [
+        "demo/index.html",
+        "index.html",
+    ],
+    path = "/vz-heatmap",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":vz_heatmap_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer",
+        "@org_polymer_iron_component_page",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_heatmap_d3v4/demo/index.html b/tensorflow/tensorboard/components/vz_heatmap_d3v4/demo/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..0ef092fccc4a140a2b077086b670c088719def6c
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_heatmap_d3v4/demo/index.html
@@ -0,0 +1,161 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>Heatmap example</title>
+  <link rel="import" href="../vz-heatmap.html">
+  <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
+  <link rel="import" href="../../tf-imports/d3.html">
+</head>
+<script>
+  function generateRandomMatrix() {
+    var rows = getRandomArbitrary(10, 20);
+    var columns = getRandomArbitrary(10, 20);
+    var data = [];
+    // Generate new data array.
+    for (var row = 0; row < rows; row++) {
+      var currentRow = [];
+      for (var col = 0; col < columns; col++) {
+        currentRow[col] = getRandomArbitrary(0, 20);
+      }
+      data[row] = currentRow;
+    }
+    return data;
+  }
+
+  // Returns a random number between min (inclusive) and max (exclusive)
+  function getRandomArbitrary(min, max) {
+    return Math.random() * (max - min) + min;
+  }
+
+  function getRandomColorRange() {
+    return [getRandomColor(), getRandomColor()];
+  }
+
+  function getRandomColor() {
+    var letters = '0123456789ABCDEF';
+    var color = '#';
+    for (var i = 0; i < 6; i++) {
+      color += letters[Math.floor(Math.random() * 16)];
+    }
+    return color;
+  }
+</script>
+<body>
+<h1>Initialized with data</h1>
+<div style="width: 30%">
+  <demo-snippet>
+    <template>
+      <vz-heatmap data="[[1,2],[3,4]]"></vz-heatmap>
+    </template>
+  </demo-snippet>
+</div>
+
+<h1>Initialized with data and custom data range</h1>
+<div style="width: 30%">
+  <demo-snippet>
+    <template>
+      <vz-heatmap data="[[1,2],[3,4]]" values="[0,10]"></vz-heatmap>
+    </template>
+  </demo-snippet>
+</div>
+
+<h1>Initialized with data and colors</h1>
+<div style="width: 30%">
+  <demo-snippet>
+    <template>
+      <vz-heatmap data="[[1,2],[3,4]]" colors='["yellow", "red"]'></vz-heatmap>
+    </template>
+  </demo-snippet>
+</div>
+
+<h1>Initialized with data and colors and threshold values</h1>
+<div style="width: 30%">
+  <demo-snippet>
+    <template>
+      <vz-heatmap data="[[1,2],[3,4]]"
+                  values="[0, 10]"
+                  colors='["yellow", "red"]'
+      ></vz-heatmap>
+    </template>
+  </demo-snippet>
+</div>
+<h1>Initialized with data and color function</h1>
+<div style="width: 30%">
+  <demo-snippet>
+    <template>
+      <vz-heatmap id="color_function" data="[[1,2],[3,4]]"
+      ></vz-heatmap>
+    </template>
+  </demo-snippet>
+  <script>
+    setTimeout(function () {
+      var heatmapColor = document.getElementById('color_function');
+      heatmapColor.colorFunction =
+          d3.scaleLinear().range(['white', 'black']).domain([0, 5]);
+    }, 1500);
+  </script>
+</div>
+
+<h1>Initialized with data and updated data</h1>
+<h3>Click on the heatmap to create new random data.</h3>
+<div style="width: 30%">
+  <demo-snippet>
+    <template>
+      <vz-heatmap data="[[1,2],[3,4]]"
+                  onclick="this.data = generateRandomMatrix()"
+      >
+      </vz-heatmap>
+    </template>
+  </demo-snippet>
+</div>
+
+<h1>Initialized with data and updated color update</h1>
+<h3>Click on the heatmap to update the color scheme.</h3>
+<div style="width: 30%">
+  <demo-snippet>
+    <template>
+      <vz-heatmap id="data_color_update" data="[[1,2],[3,4]]"
+                  onclick="this.colors = getRandomColorRange();
+                       this.data = generateRandomMatrix()"
+      >
+      </vz-heatmap>
+    </template>
+  </demo-snippet>
+</div>
+
+<h2>Let's try to actually break it... *Puts on fedora*</h2>
+<p>Code below is not meant to be seen, but to ensure that no errors are
+  thrown when invalid data is passed into the Polymer element.</p>
+<demo-snippet>
+  <template>
+    <vz-heatmap id="break_the_heatmap" data="undefined"></vz-heatmap>
+  </template>
+</demo-snippet>
+<script>
+  var bth = document.getElementById('break_the_heatmap');
+  bth.data = []; // Empty 1D array.
+  bth.data = [[]]; // Empty 2D array.
+  bth.colors = ['yellow', 'blue', '']; // More than 2 elements in colors array.
+  bth.values = [1, 2, 3]; // More than 2 elements in values array.
+</script>
+
+</body>
+</html>
diff --git a/tensorflow/tensorboard/components/vz_line_chart/index.html b/tensorflow/tensorboard/components/vz_heatmap_d3v4/index.html
similarity index 89%
rename from tensorflow/tensorboard/components/vz_line_chart/index.html
rename to tensorflow/tensorboard/components/vz_heatmap_d3v4/index.html
index b7b399d3fc835e126a7fb0b8a48c5024610a84c7..656306499eb4199c057b5e4c22d4b29940e6ed53 100644
--- a/tensorflow/tensorboard/components/vz_line_chart/index.html
+++ b/tensorflow/tensorboard/components/vz_heatmap_d3v4/index.html
@@ -18,13 +18,14 @@ limitations under the License.
 
 <html>
   <head>
-    <title>vz-line-chart</title>
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <script src="../webcomponentsjs/webcomponents-lite.js"></script>
     <link rel="import" href="../iron-component-page/iron-component-page.html">
   </head>
   <body>
-    <iron-component-page src="vz-line-chart.html"></iron-component-page>
+
+    <iron-component-page src="vz-heatmap.html"></iron-component-page>
+
   </body>
 </html>
diff --git a/tensorflow/tensorboard/components/vz_heatmap_d3v4/vz-heatmap.html b/tensorflow/tensorboard/components/vz_heatmap_d3v4/vz-heatmap.html
new file mode 100644
index 0000000000000000000000000000000000000000..089692576fa72db40399e5f5374e4e770b6f5fa7
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_heatmap_d3v4/vz-heatmap.html
@@ -0,0 +1,360 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../iron-resizable-behavior/iron-resizable-behavior.html">
+<link rel="import" href="../tf-imports/d3.html">
+
+<!--
+`vz-heatmap` A simple heatmap to visualize 2D data using predefined or user
+defined color scheme, with a dependency on d3.js. The heatmap automatically
+fits itself to the width of the container it is placed in.
+
+@element vz-heatmap
+@demo demo/index.html
+-->
+<dom-module id="vz-heatmap">
+  <template>
+    <div>
+      <canvas id="heatmap"
+              style="width: 100%; image-rendering:pixelated"></canvas>
+    </div>
+  </template>
+  <script>
+    Polymer({
+      is: 'vz-heatmap',
+      properties: {
+        /**
+         * An 2-element array which defines the lower and upper bound of the
+         * automatically created color scale for the heatmap.
+         */
+        colors: {
+          type: Array,
+          value: ['white', 'steelblue'],
+          observer: '_onColorsChange'
+        },
+        /**
+         * A function which returns a hex-formatted color string
+         * given a number type.
+         * i.e. (input: number) => string
+         *
+         * Passing a colorFunction deactivates the color
+         * function generated by the `colors` and `values` fields.
+         */
+        colorFunction: {
+          type: Object,
+          observer: '_onColorFunctionChange'
+        },
+        /**
+         * A 2D data array containing the data to be visualized.
+         */
+        data: {
+          type: Array,
+          value: [],
+          observer: '_onDataChange'
+        },
+        /**
+         * A 2-element array containing the domain of values which
+         * maps to the color range defined by the `colors` array.
+         */
+        values: {
+          type: Array,
+          observer: '_onValuesChange'
+        },
+        _currentDataValid: {
+          type: Boolean,
+          value: false
+        },
+        _dataExtent: {
+          type: Array,
+          value: [0, 1]
+        },
+        _debug: {
+          type: Boolean,
+          value: false
+        },
+        _generatedColorScale: {
+          type: Object,
+          value: (function () {
+            var retFunction = d3.scaleLinear();
+            if (this.colors) {
+              retFunction.range(this.colors)
+            }
+
+            return retFunction;
+          })()
+        },
+        _isReady: {
+          type: Boolean,
+          value: false
+        },
+        _height: {
+          type: Number,
+          value: 0
+        },
+        _width: {
+          type: Number,
+          value: 0
+        },
+        _useUserColorPickerFunction: {
+          type: Boolean,
+          value: false
+        }
+      },
+      behaviors: [
+        Polymer.IronResizableBehavior
+      ],
+      listeners: {
+        'iron-resize': '_onWidthChange'
+      },
+      _onWidthChange: function () {
+        // Re-render the chart if the data has already been set.
+        if (this._currentDataValid) {
+          this._renderData();
+        }
+      },
+      ready: function () {
+        this._isReady = true;
+        this._onWidthChange();
+      },
+      attached: function () {
+        this._onWidthChange();
+      },
+      /**
+       * Data change handler, if new data is valid, sets flag to indicate data
+       * now valid, updates the data extent and renders the new data.
+       */
+      _onDataChange: function () {
+        // Validate new data.
+        if (!this._isDataValid(this.data)) {
+          return;
+        }
+
+        // Set flag to indicate data is now valid.
+        this._currentDataValid = true;
+
+        // Calculate new data extent.
+        this._updateDataExtent();
+
+        if (this._isReady) {
+          this._renderData();
+        }
+      },
+      /**
+       * (Re-)Renders the heatmap. Called when data, color mapping, or width
+       * changes.
+       * @private
+       */
+      _renderData: function () {
+        // Ensure dimensions are up-to-date and valid before starting rendering.
+        this._updateDimensions();
+        if (!this._width || !this._height) return;
+
+        // Ensure the color function is up-to-date.
+        this._updateColorFunction();
+        var width = this._width;
+        var rows = this.data.length;
+        var columns = this.data[0].length;
+
+        // Calculate side length of each tile.
+        var sideLength = width / columns;
+
+        // Set domain and range of the axis.
+        var colorScale = (this._useUserColorPickerFunction) ?
+            this.colorFunction : this._generatedColorScale;
+
+        // Clear the canvas.
+        this.resetCanvas();
+
+        // Render heatmap.
+        var ctx = this.$.heatmap.getContext("2d");
+
+        for (var row = 0; row < rows; row++) {
+          for (var column = 0; column < columns; column++) {
+            var value = this.data[row][column];
+            // Set location and dimensions.
+            ctx.fillStyle = colorScale(value);
+            // Preferred way to set color of individual pixels.
+            ctx.fillRect(column, row, 1, 1);
+          }
+        }
+      },
+      /**
+       * Observer for this.colors.
+       * @private
+       */
+      _onColorsChange: function () {
+        if (Array.isArray(this.colors) && this.colors.length == 2) {
+          this._updateColorFunction();
+        }
+        this._useUserColorPickerFunction = false;
+        this._renderData();
+      },
+      /**
+       * Observer for this.values. Updates the
+       * internal color function. If data invalid, internal color scale uses
+       * the extent of the data as domain.
+       * @private
+       */
+      _onValuesChange: function () {
+        // Verify that validity of the new content of values.
+        // If data not valid, set this.values to undefined, as
+        // _updateColorFunction will then use the data's extent as the color
+        // scale's domain.
+        if (!(Array.isArray(this.values) && this.values.length == 2)) {
+          this.values = undefined;
+        }
+
+        this._updateColorFunction();
+        this._useUserColorPickerFunction = false;
+        this._renderData();
+      },
+      /**
+       * Observer for this.colorFunction. This field allows the user to
+       * provide a custom color scale. Updates to this value are ignored, if the new
+       * value is not a function object.
+       * @private
+       */
+      _onColorFunctionChange: function () {
+        if (typeof this.colorFunction === 'function') {
+          this._useUserColorPickerFunction = true;
+          this._renderData();
+        } else if (this._debug) {
+          console.log('The colorFunction provided is not of function type, and was not set as default.')
+        }
+      },
+      /**
+       * Calculates the extent of the data if it is required by the
+       * current color function. This function is called when
+       * properties change which would result in the color scale
+       * changing, or when the computed color scale is started to be used.
+       * @private
+       */
+      _updateDataExtent: function () {
+        if (this._currentDataValid) {
+          var rows = this.data.length;
+          var columns = this.data[0].length;
+
+          var min = this.data[0][0];
+          var max = this.data[0][0];
+
+          for (var row = 0; row < rows; row++) {
+            for (var col = 0; col < columns; col++) {
+              var currentElement = this.data[row][col];
+
+              if (currentElement < min) {
+                min = currentElement;
+              } else if (currentElement > max) {
+                max = currentElement;
+              }
+            }
+          }
+
+          this._dataExtent = [min, max];
+        } else if (!this._dataExtent) {
+          this._dataExtent = [0, 1];
+        }
+      },
+      /**
+       * Updates the internal color function only when the number of
+       * elements in this.colors is equal to the number of elements in
+       * this.values or if this.values is empty, in which case the
+       * extent of the data is used.
+       * @private
+       */
+      _updateColorFunction: function () {
+        if (Array.isArray(this.colors) && this.colors.length) {
+          this._generatedColorScale = d3.scaleLinear().range(this.colors);
+        }
+
+        if (this.values) {
+          // Check that the number of elements in this.values and
+          // this.colors have an identical number of elements.
+          if (this.colors.length === this.values.length) {
+            this._generatedColorScale.domain(this.values);
+          }
+
+          // If values reset, and data field contains valid data, set color scale
+          // to use the data extent as domain.
+        } else if (this._currentDataValid) {
+          this._generatedColorScale.domain(this._dataExtent);
+        }
+      },
+      _getLinearInterpolation: function (domain, range) {
+        return d3.scaleLinear().domain(domain).range(range);
+      },
+      /**
+       * Find side length of each tile in heat map by dividing current width
+       * by number of columns
+       */
+      _findTileSideLength: function () {
+        if (this._currentDataValid) {
+          var numOfColumns = this.data[0].length;
+
+          // Make sure value is finite.
+          if (numOfColumns === 0) {
+            var returnValue = 0;
+          } else {
+            returnValue = this._width / numOfColumns;
+          }
+          return returnValue;
+
+        } else {
+          if (this._debug == true) {
+            console.log("WARNING: vz-heatmap is reverting to zero height as passed data is invalid.")
+          }
+          return 0; // Fall back to 0 height, if data is invalid.
+        }
+      },
+      /**
+       * Verifies whether input data is valid.
+       * @param newData Number[][]
+       * @private
+       */
+      _isDataValid: function (newData) {
+        return Array.isArray(newData) && newData.length &&
+            Array.isArray(newData[0]);
+      },
+      _updateDimensions: function () {
+        var canvasElement = this.$.heatmap;
+
+        var rows = 0;
+        var columns = 0;
+        if (this._currentDataValid) {
+          rows = this.data.length;
+          columns = this.data[0].length;
+        }
+
+        // Recalculate height and width, and ensure data can be rendered.
+        this._width = canvasElement.parentNode.clientWidth;
+        this._height = this._findTileSideLength() * rows;
+        // Update the attribute height and width.
+        canvasElement.setAttribute('height', rows);
+        canvasElement.setAttribute('width', columns);
+      },
+      /**
+       * Function which clears the canvas.
+       */
+      resetCanvas: function () {
+        // Reset the canvas.
+        var canvas = this.$.heatmap;
+        var ctx = canvas.getContext("2d");
+        ctx.clearRect(0, 0, canvas.width, canvas.height);
+      }
+    });
+  </script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/BUILD b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a6efbfba2b355c3534cdc2a54d5e44a556acc601
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/BUILD
@@ -0,0 +1,61 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_histogram_timeseries_d3v4",
+    srcs = ["vz-histogram-timeseries.html"],
+    path = "/vz-histogram-timeseries",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "@org_polymer",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-histogram-timeseries",
+    deps = [
+        ":vz_histogram_timeseries_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# LEGACY - MARKED FOR DELETION
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "index.html",
+        "vz-histogram-timeseries.html",
+        ":legacy_ts",
+    ],
+    visibility = ["//learning/vis/vz_elements/catalog:__pkg__"],
+    destdir = "vz-histogram-timeseries",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_google:lib",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+# This is needed: components/BUILD seeks a legacy_ts rule in this package.
+tensorboard_ts_library(
+    name = "legacy_ts",
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/demo/index.html b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/index.html
similarity index 98%
rename from tensorflow/tensorboard/components/vz_histogram_timeseries/demo/index.html
rename to tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/index.html
index 56543fb66b294f7c124b560ff3a9b65bb7ef0d70..42efa83eb07d2da9993bb410a50c9503df8c582c 100644
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries/demo/index.html
+++ b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/index.html
@@ -22,10 +22,10 @@ limitations under the License.
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>vz-histogram-timeseries demo</title>
     <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../vz-histogram-timeseries.html">
-    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="../../paper-styles/typography.html">
-    <link rel="import" href="../../paper-button/paper-button.html">
+    <link rel="import" href="vz-histogram-timeseries.html">
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="../paper-styles/typography.html">
+    <link rel="import" href="../paper-button/paper-button.html">
     <style type="text/css">
       body {
         font-family: "Roboto";
diff --git a/tensorflow/tensorboard/components/vz_histogram_timeseries/vz-histogram-timeseries.html b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/vz-histogram-timeseries.html
similarity index 94%
rename from tensorflow/tensorboard/components/vz_histogram_timeseries/vz-histogram-timeseries.html
rename to tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/vz-histogram-timeseries.html
index 545a762cd7d1c69e880c8159abf339c7b7831081..bdba230077d48d3602be2213ea0aa18f7d60a5b4 100644
--- a/tensorflow/tensorboard/components/vz_histogram_timeseries/vz-histogram-timeseries.html
+++ b/tensorflow/tensorboard/components/vz_histogram_timeseries_d3v4/vz-histogram-timeseries.html
@@ -158,8 +158,7 @@ visualization.
         }
 
         .axis {
-          font-size: 10px;
-          fill: #aaa;
+          font-size: 11px;
         }
 
         .axis path.domain {
@@ -253,13 +252,13 @@ visualization.
 
         /**
          * Scale that maps series names to colors. The default colors are from
-         * d3.scale.category10() scale. Use this property to replace the default
+         * d3.schemeCategory10() scale. Use this property to replace the default
          * line colors with colors of your own choice.
          */
         colorScale: {
           type: Object,
           value: function() {
-            return d3.scale.category10();
+            return d3.scaleOrdinal(d3.schemeCategory10);
           }
         },
 
@@ -378,7 +377,7 @@ visualization.
         var yAxisFormat = d3.format(".0f");
 
         if (timeProp === "wall_time") {
-          yAxisFormat = d3.time.format("%m/%d %X");
+          yAxisFormat = d3.timeFormat("%m/%d %X");
         } else if (timeProp === "relative") {
           yAxisFormat = function(d) {
             return d3.format(".1r")(d / 3.6e6) + 'h'; // Convert to hours.
@@ -405,19 +404,19 @@ visualization.
 
         var extent = d3.extent(data, timeAccessor);
 
-        var yScale = (timeProp === "wall_time" ? d3.time.scale() : d3.scale.linear())
+        var yScale = (timeProp === "wall_time" ? d3.scaleTime() : d3.scaleLinear())
             .domain(extent)
             .range([0, (mode === "offset" ? height : 0)]);
 
-        var ySliceScale = d3.scale.linear()
+        var ySliceScale = d3.scaleLinear()
             .domain([0, d3.max(data, function(d, i) { return yExtents[i][1]; })])
             .range([sliceHeight, 0]);
 
-        var yLineScale = d3.scale.linear()
+        var yLineScale = d3.scaleLinear()
             .domain(ySliceScale.domain())
             .range([outlineCanvasSize, 0]);
 
-        var xScale = d3.scale.linear()
+        var xScale = d3.scaleLinear()
             .domain([
               d3.min(data, function(d, i) { return xExtents[i][0]; }),
               d3.max(data, function(d, i) { return xExtents[i][1]; })
@@ -425,39 +424,33 @@ visualization.
             .nice()
             .range([0, width]);
 
-        var xLineScale = d3.scale.linear()
+        var xLineScale = d3.scaleLinear()
             .domain(xScale.domain())
             .range([0, outlineCanvasSize]);
 
-        var outlineColor = d3.scale.linear()
+        var outlineColor = d3.scaleLinear()
             .domain(d3.extent(data, timeAccessor))
             .range([color.darker(), color.brighter()])
             .interpolate(d3.interpolateHcl);
 
-        var xAxis = d3.svg.axis()
-            .scale(xScale)
-            .ticks(Math.max(2, width / 20))
-            .orient("bottom");
+        var xAxis = d3.axisBottom(xScale).ticks(Math.max(2, width / 20));
 
-        var yAxis = d3.svg.axis()
-            .scale(yScale)
+        var yAxis = d3.axisRight(yScale)
             .ticks(Math.max(2, height / 15))
-            .tickFormat(yAxisFormat)
-            .orient("right");
+            .tickFormat(yAxisFormat);
+
+
 
-        var ySliceAxis = d3.svg.axis()
-            .scale(ySliceScale)
+        var ySliceAxis = d3.axisRight(ySliceScale)
             .ticks(Math.max(2, height / 15))
             .tickSize(width + 5)
-            .tickFormat(format)
-            .orient("right");
+            .tickFormat(format);
 
         var xBinCentroid = function(d) {
           return d[xProp] + d[dxProp] / 2;
         };
 
-        var linePath = d3.svg.line()
-            .interpolate("linear")
+        var linePath = d3.line()
             .x(function(d) { return xLineScale(xBinCentroid(d)); })
             .y(function(d) { return yLineScale(d[yProp]); });
 
@@ -513,7 +506,7 @@ visualization.
         var histogram = stage.selectAll(".histogram").data(data),
             histogramExit = histogram.exit().remove(),
             histogramEnter = histogram.enter().append("g").attr("class", "histogram"),
-            histogramUpdate = histogram
+            histogramUpdate = histogramEnter.merge(histogram)
                 .sort(function(a, b) { return timeAccessor(a) - timeAccessor(b); }),
             histogramTransition = gTransition.selectAll(".histogram")
                 .attr("transform", function(d) {
@@ -556,7 +549,7 @@ visualization.
 
         var xAxisHover = g.select(".x-axis-hover").selectAll(".label").data(["x"]),
             xAxisHoverEnter = xAxisHover.enter().append("g").attr("class", "label"),
-            xAxisHoverUpdate = xAxisHover;
+            xAxisHoverUpdate = xAxisHover.merge(xAxisHoverEnter);
 
         xAxisHoverEnter.append("rect")
             .attr("x", -20)
@@ -575,7 +568,7 @@ visualization.
 
         var yAxisHover = g.select(".y-axis-hover").selectAll(".label").data(["y"]),
             yAxisHoverEnter = yAxisHover.enter().append("g").attr("class", "label"),
-            yAxisHoverUpdate = yAxisHover;
+            yAxisHoverUpdate = yAxisHover.merge(yAxisHoverEnter);
 
         yAxisHoverEnter.append("rect")
             .attr("x", 8)
@@ -595,7 +588,7 @@ visualization.
 
         var ySliceAxisHover = g.select(".y-slice-axis-hover").selectAll(".label").data(["y"]),
             ySliceAxisHoverEnter = ySliceAxisHover.enter().append("g").attr("class", "label"),
-            ySliceAxisHoverUpdate = ySliceAxisHover;
+            ySliceAxisHoverUpdate = ySliceAxisHover.merge(ySliceAxisHoverEnter);
 
         ySliceAxisHoverEnter.append("rect")
             .attr("x", 8)
@@ -627,6 +620,11 @@ visualization.
             .attr("transform", "translate(" + width + ", " + (mode === "offset" ? 0 : height) + ")")
             .call(yAxis);
 
+        gTransition.selectAll(".tick text")
+            .attr("fill", "#aaa");
+        gTransition.selectAll(".axis path.domain").attr("stroke", "none");
+
+
         function onMouseMove() {
           var m = d3.mouse(this),
               v = xScale.invert(m[0]),
diff --git a/tensorflow/tensorboard/components/vz_line_chart/BUILD b/tensorflow/tensorboard/components/vz_line_chart/BUILD
deleted file mode 100644
index 967d86596f176c9c91f3883021bf1ba55012374a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/BUILD
+++ /dev/null
@@ -1,71 +0,0 @@
-package(default_visibility = ["//tensorflow:internal"])
-
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
-load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
-
-licenses(["notice"])  # Apache 2.0
-
-webfiles(
-    name = "vz_line_chart",
-    srcs = [
-        "vz-line-chart.html",
-        ":ts",
-    ],
-    path = "/vz-line-chart",
-    deps = [
-        "//tensorflow/tensorboard/components/tf_imports:lodash",
-        "//tensorflow/tensorboard/components/tf_imports:plottable",
-        "@org_polymer",
-    ],
-)
-
-tensorboard_typescript_genrule(
-    name = "ts",
-    srcs = [
-        "dragZoomInteraction.ts",
-        "vz-chart-helpers.ts",
-        "vz-line-chart.ts",
-    ],
-    typings = [
-        "@org_definitelytyped//:d3.d.ts",
-        "@com_palantir_plottable//:plottable.d.ts",
-        "@org_definitelytyped//:lodash.d.ts",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**"]),
-    tags = ["notsan"],
-)
-
-################################################################################
-# MARKED FOR DELETION
-
-tensorboard_webcomponent_library(
-    name = "legacy",
-    srcs = [
-        "index.html",
-        "vz-line-chart.html",
-        ":legacy_ts",
-    ],
-    visibility = ["//visibility:public"],
-    destdir = "vz-line-chart",
-    deps = [
-        "//tensorflow/tensorboard/components:tf_imports",
-        "//tensorflow/tensorboard/components/vz_sorting:legacy",
-        "//third_party/javascript/polymer/v1/polymer:lib",
-    ],
-)
-
-tensorboard_ts_library(
-    name = "legacy_ts",
-    srcs = [
-        "dragZoomInteraction.ts",
-        "vz-chart-helpers.ts",
-        "vz-line-chart.ts",
-    ],
-    deps = ["//tensorflow/tensorboard/components:common_deps"],
-)
diff --git a/tensorflow/tensorboard/components/vz_line_chart/vz-chart-helpers.ts b/tensorflow/tensorboard/components/vz_line_chart/vz-chart-helpers.ts
deleted file mode 100644
index 70defc4d3b5f357846f88c88d6415d6967f25fd1..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/vz-chart-helpers.ts
+++ /dev/null
@@ -1,217 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-module VZ.ChartHelpers {
-  export interface Datum {
-    wall_time: Date;
-    step: number;
-  }
-
-  export interface Scalar {
-    scalar: number;
-    smoothed: number;
-  }
-
-  export type ScalarDatum = Datum & Scalar;
-
-  export type DataFn = (run: string, tag: string) =>
-      Promise<Array<Datum>>;
-
-  export let Y_TOOLTIP_FORMATTER_PRECISION = 4;
-  export let STEP_FORMATTER_PRECISION = 4;
-  export let Y_AXIS_FORMATTER_PRECISION = 3;
-  export let TOOLTIP_Y_PIXEL_OFFSET = 20;
-  export let TOOLTIP_CIRCLE_SIZE = 4;
-  export let NAN_SYMBOL_SIZE = 6;
-
-  export interface Point {
-    x: number;  // pixel space
-    y: number;  // pixel space
-    datum: ScalarDatum;
-    dataset: Plottable.Dataset;
-  }
-
-  /* Create a formatter function that will switch between exponential and
-   * regular display depending on the scale of the number being formatted,
-   * and show `digits` significant digits.
-   */
-  export function multiscaleFormatter(digits: number): ((v: number) => string) {
-    return (v: number) => {
-      let absv = Math.abs(v);
-      if (absv < 1E-15) {
-        // Sometimes zero-like values get an annoying representation
-        absv = 0;
-      }
-      let f: (x: number) => string;
-      if (absv >= 1E4) {
-        f = d3.format('.' + digits + 'e');
-      } else if (absv > 0 && absv < 0.01) {
-        f = d3.format('.' + digits + 'e');
-      } else {
-        f = d3.format('.' + digits + 'g');
-      }
-      return f(v);
-    };
-  }
-
-  /* Compute an appropriate domain given an array of all the values that are
-   * going to be displayed. If ignoreOutliers is true, it will ignore the
-   * lowest 10% and highest 10% of the data when computing a domain.
-   * It has n log n performance when ignoreOutliers is true, as it needs to
-   * sort the data.
-   */
-  export function computeDomain(values: number[], ignoreOutliers: boolean) {
-    if (values.length === 0) {
-      return [-0.1, 1.1];
-    }
-    let a: number;
-    let b: number;
-    if (ignoreOutliers) {
-      let sorted = _.sortBy(values);
-      a = d3.quantile(sorted, 0.05);
-      b = d3.quantile(sorted, 0.95);
-    } else {
-      a = d3.min(values);
-      b = d3.max(values);
-    }
-
-    let padding: number;
-    let span = b - a;
-    if (span === 0) {
-      // If b===a, we would create an empty range. We instead select the range
-      // [0, 2*a] if a > 0, or [-2*a, 0] if a < 0, plus a little bit of
-      // extra padding on the top and bottom of the plot.
-      padding = Math.abs(a) * 1.1;
-    } else {
-      padding = span * 0.2;
-    }
-
-    let lower: number;
-    if (a >= 0 && a < span) {
-      // We include the intercept (y = 0) if doing so less than doubles the span
-      // of the y-axis. (We actually select a lower bound that's slightly less
-      // than 0 so that 0.00 will clearly be written on the lower edge of the
-      // chart. The label on the lowest tick is often filtered out.)
-      lower = -0.1 * b;
-    } else {
-      lower = a - padding;
-    }
-
-
-    let domain = [lower, b + padding];
-    domain = d3.scale.linear().domain(domain).nice().domain();
-    return domain;
-  }
-
-  export function accessorize(key: string): Plottable.Accessor<number> {
-    return (d: any, index: number, dataset: Plottable.Dataset) => d[key];
-  }
-
-  export interface XComponents {
-    /* tslint:disable */
-    scale: Plottable.Scales.Linear|Plottable.Scales.Time,
-        axis: Plottable.Axes.Numeric|Plottable.Axes.Time,
-        accessor: Plottable.Accessor<number|Date>,
-    /* tslint:enable */
-  }
-
-  export let stepFormatter =
-      Plottable.Formatters.siSuffix(STEP_FORMATTER_PRECISION);
-  export function stepX(): XComponents {
-    let scale = new Plottable.Scales.Linear();
-    let axis = new Plottable.Axes.Numeric(scale, 'bottom');
-    axis.formatter(stepFormatter);
-    return {
-      scale: scale,
-      axis: axis,
-      accessor: (d: Datum) => d.step,
-    };
-  }
-
-  export let timeFormatter = Plottable.Formatters.time('%a %b %e, %H:%M:%S');
-
-  export function wallX(): XComponents {
-    let scale = new Plottable.Scales.Time();
-    return {
-      scale: scale,
-      axis: new Plottable.Axes.Time(scale, 'bottom'),
-      accessor: (d: Datum) => d.wall_time,
-    };
-  }
-  export let relativeAccessor =
-      (d: any, index: number, dataset: Plottable.Dataset) => {
-        // We may be rendering the final-point datum for scatterplot.
-        // If so, we will have already provided the 'relative' property
-        if (d.relative != null) {
-          return d.relative;
-        }
-        let data = dataset.data();
-        // I can't imagine how this function would be called when the data is
-        // empty (after all, it iterates over the data), but lets guard just
-        // to be safe.
-        let first = data.length > 0 ? +data[0].wall_time : 0;
-        return (+d.wall_time - first) / (60 * 60 * 1000);  // ms to hours
-      };
-
-  export let relativeFormatter = (n: number) => {
-    // we will always show 2 units of precision, e.g days and hours, or
-    // minutes and seconds, but not hours and minutes and seconds
-    let ret = '';
-    let days = Math.floor(n / 24);
-    n -= (days * 24);
-    if (days) {
-      ret += days + 'd ';
-    }
-    let hours = Math.floor(n);
-    n -= hours;
-    n *= 60;
-    if (hours || days) {
-      ret += hours + 'h ';
-    }
-    let minutes = Math.floor(n);
-    n -= minutes;
-    n *= 60;
-    if (minutes || hours || days) {
-      ret += minutes + 'm ';
-    }
-    let seconds = Math.floor(n);
-    return ret + seconds + 's';
-  };
-  export function relativeX(): XComponents {
-    let scale = new Plottable.Scales.Linear();
-    return {
-      scale: scale,
-      axis: new Plottable.Axes.Numeric(scale, 'bottom'),
-      accessor: relativeAccessor,
-    };
-  }
-
-  // a very literal definition of NaN: true for NaN for a non-number type
-  // or null, etc. False for Infinity or -Infinity
-  export let isNaN = (x) => +x !== x;
-
-  export function getXComponents(xType: string): XComponents {
-    switch (xType) {
-      case 'step':
-        return stepX();
-      case 'wall_time':
-        return wallX();
-      case 'relative':
-        return relativeX();
-      default:
-        throw new Error('invalid xType: ' + xType);
-    }
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.html b/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.html
deleted file mode 100644
index ccd74d6e3d8997ec3a79f6779c7367447f1f4dd9..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.html
+++ /dev/null
@@ -1,369 +0,0 @@
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../tf-imports/plottable.html">
-<link rel="import" href="../tf-imports/lodash.html">
-
-<!--
-vz-line-chart creates an element that draws a line chart for
-displaying event values.
-
-This line chart supports drawing multiple lines at the same time, with features
-such as different X scales (linear and temporal), tooltips and smoothing.
-
-@element vz-line-chart
-@demo demo/index.html
--->
-<dom-module id="vz-line-chart">
-  <template>
-    <div id="tooltip">
-      <table>
-        <thead>
-          <tr>
-            <th></th>
-            <th>Name</th>
-            <template is="dom-if" if="{{smoothingEnabled}}">
-              <th>Smoothed</th>
-            </template>
-            <th>Value</th>
-            <th>Step</th>
-            <th>Time</th>
-            <th>Relative</th>
-          </tr>
-        </thead>
-        <tbody>
-        </tbody>
-      </table>
-    </div>
-    <svg id="chartsvg"></svg>
-    <style>
-      :host {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        display: flex;
-        flex-direction: column;
-        flex-grow: 1;
-        flex-shrink: 1;
-        position: relative;
-        outline: none;
-      }
-      svg {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-      td {
-        padding-left: 5px;
-        padding-right: 5px;
-        font-size: 13px;
-        opacity: 1;
-      }
-      #tooltip {
-        pointer-events: none;
-        position: absolute;
-        opacity: 0;
-        box-shadow: 0 1px 4px rgba(0, 0, 0, 0.3);
-        font-size: 14px;
-        background: rgba(0, 0, 0, 0.8);
-        color: white;
-        border-radius: 4px;
-        line-height: 1.4em;
-        padding: 8px;
-        z-index: 5;
-        cursor: none;
-        margin-top: 10px;
-      }
-      .swatch {
-        border-radius: 50%;
-        width: 14px;
-        height: 14px;
-        display: block;
-        border: 2px solid rgba(0,0,0,0);
-      }
-      .closest .swatch {
-        border: 2px solid white;
-      }
-      th {
-        padding-left: 5px;
-        padding-right: 5px;
-        text-align: left;
-      }
-      .distant td {
-        opacity: 0.8;
-      }
-
-      .distant td.swatch {
-        opacity: 1;
-      }
-
-      .ghost {
-        opacity: 0.2;
-        stroke-width: 1px;
-      }
-
-      #chartsvg line.guide-line {
-        stroke: #999;
-        stroke-width: 1.5px;
-      }
-
-    </style>
-  </template>
-  <script src="dragZoomInteraction.js"></script>
-  <script src="vz-line-chart.js"></script>
-  <script src="vz-chart-helpers.js"></script>
-  <script>
-    Polymer({
-      is: "vz-line-chart",
-      properties: {
-        /**
-         * Scale that maps series names to colors. The default colors are from
-         * d3.scale.category10() scale. Use this property to replace the default
-         * line colors with colors of your own choice.
-         * @type {Plottable.Scales.Color}
-         * @required
-         */
-        colorScale: {
-          type: Object,
-          value: function() {
-            return new Plottable.Scales.Color()
-                .range(d3.scale.category10().range());
-          }
-        },
-
-        /**
-         * Whether smoothing is enabled or not. If true, smoothed lines will be
-         * plotted in the chart while the unsmoothed lines will be ghosted in
-         * the background.
-         *
-         * The smoothing algorithm is a simple moving average, which, given a
-         * point p and a window w, replaces p with a simple average of the
-         * points in the [p - floor(w/2), p + floor(w/2)] range.  If there
-         * aren't enough points to cover the entire window to the left, the
-         * window is reduced to fit exactly the amount of elements available.
-         * This means that the smoothed line will be less in and gradually
-         * become more smooth until the desired window is reached. However when
-         * there aren't enough points on the right, the line stops being
-         * rendered at all.
-         */
-        smoothingEnabled: {
-          type: Boolean,
-          value: false
-        },
-
-        /**
-         * Weight (between 0.0 and 1.0) of the smoothing. This weight controls
-         * the window size, and a weight of 1.0 means using 50% of the entire
-         * dataset as the window, while a weight of 0.0 means using a window of
-         * 0 (and thus replacing each point with themselves).
-         *
-         * The growth between 0.0 and 1.0 is not linear though. Because
-         * changing the window from 0% to 30% of the dataset smooths the line a
-         * lot more than changing the window from 70% to 100%, an exponential
-         * function is used instead: http://i.imgur.com/bDrhEZU.png. This
-         * function increases the size of the window slowly at the beginning
-         * and gradually speeds up the growth, but 0.0 still means a window of
-         * 0 and 1.0 still means a window of the dataset's length.
-         */
-        smoothingWeight: {
-          type: Number,
-          value: 0.6
-        },
-
-        /**
-         * The way to display the X values. Allows:
-         * - "step" - Linear scale using the  "step" property of the datum.
-         * - "wall_time" - Temporal scale using the "wall_time" property of the
-         * datum.
-         * - "relative" - Temporal scale using the "relative" property of the
-         * datum if it is present or calculating from "wall_time" if it isn't.
-         */
-        xType: {
-          type: String,
-          value: 'step'
-        },
-
-        /**
-         * The scale for the y-axis. Allows:
-         * - "linear" - linear scale (Plottable.Scales.Linear)
-         * - "log" - modified-log scale (Plottable.Scales.ModifiedLog)
-         */
-        yScaleType: {
-          type: String,
-          value: 'linear'
-        },
-
-        /**
-         * Whether to ignore outlier data when computing the yScale domain.
-         */
-
-        ignoreYOutliers: {
-          type: Boolean,
-          value: false,
-        },
-
-        /**
-         * Change how the tooltip is sorted. Allows:
-         * - "default" - Sort the tooltip by input order.
-         * - "ascending" - Sort the tooltip by ascending value.
-         * - "descending" - Sort the tooltip by descending value.
-         * - "nearest" - Sort the tooltip by closest to cursor.
-         */
-        tooltipSortingMethod: {
-          type: String,
-          value: 'default'
-        },
-
-        /**
-         * Change how the tooltip is positioned. Allows:
-         * - "bottom" - Position the tooltip on the bottom of the chart.
-         * - "right" - Position the tooltip to the right of the chart.
-         */
-        tooltipPosition: {
-          type: String,
-          value: 'bottom'
-        },
-
-        _attached: Boolean,
-        _chart: Object,
-        _visibleSeriesCache: {
-          type: Array,
-          value: function() { return [] }
-        },
-        _seriesDataCache: {
-          type: Object,
-          value: function() { return {} }
-        },
-        _makeChartAsyncCallbackId: {
-          type: Number,
-          value: null
-        }
-      },
-      observers: [
-        "_makeChart(xType, yScaleType, colorScale, _attached)",
-        "_reloadFromCache(_chart)",
-        "_smoothingChanged(smoothingEnabled, smoothingWeight, _chart)",
-        "_tooltipSortingMethodChanged(tooltipSortingMethod, _chart)",
-        "_tooltipPositionChanged(tooltipPosition, _chart)",
-        "_outliersChanged(ignoreYOutliers, _chart)"
-      ],
-
-      /**
-       * Sets the series that the chart displays. Series with other names will
-       * not be displayed.
-       *
-       * @param {String[]} names Array with the names of the series to
-       * display.
-       */
-      setVisibleSeries: function(names) {
-        this._visibleSeriesCache = names;
-        if (this._chart) {
-          this._chart.setVisibleSeries(names);
-          this.redraw();
-        }
-      },
-
-      /**
-       * Sets the data of one of the series. Note that to display this series
-       * its name must be in the setVisibleSeries() array.
-       *
-       * @param {String} name Name of the series.
-       * @param {VZ.ChartHelpers.ScalarDatum[]} data Data of the series. This is
-       * an array of objects with at least the following properties:
-       * - step: (Number) - index of the datum.
-       * - wall_time: (Date) - Date object with the datum's time.
-       * - scalar: (Number) - Value of the datum.
-       */
-      setSeriesData: function(name, data) {
-        this._seriesDataCache[name] = data;
-        if (this._chart) {
-          this._chart.setSeriesData(name, data);
-        }
-      },
-
-      /**
-       * Re-renders the chart. Useful if e.g. the container size changed.
-       */
-      redraw: function() {
-        this._chart.redraw();
-      },
-      attached: function() {
-        this._attached = true;
-      },
-      detached: function() {
-        this._attached = false;
-      },
-      ready: function() {
-        this.scopeSubtree(this.$.tooltip, true);
-        this.scopeSubtree(this.$.chartsvg, true);
-      },
-      _makeChart: function(xType, yScaleType, colorScale, _attached) {
-        if (this._makeChartAsyncCallbackId !== null) {
-          this.cancelAsync(this._makeChartAsyncCallbackId);
-          this._makeChartAsyncCallbackId = null;
-        }
-
-        this._makeChartAsyncCallbackId = this.async(function() {
-          this._makeChartAsyncCallbackId = null;
-          if (!this._attached) return;
-          if (this._chart) this._chart.destroy();
-          var tooltip = d3.select(this.$.tooltip);
-          var chart = new VZ.LineChart(xType, yScaleType, colorScale, tooltip);
-          var svg = d3.select(this.$.chartsvg);
-          chart.renderTo(svg);
-          this._chart = chart;
-        }, 350);
-      },
-      _reloadFromCache: function() {
-        if(this._chart) {
-          this._chart.setVisibleSeries(this._visibleSeriesCache);
-          this._visibleSeriesCache.forEach(function(name) {
-            this._chart.setSeriesData(name, this._seriesDataCache[name] || []);
-          }.bind(this));
-        }
-      },
-      _smoothingChanged: function() {
-        if(!this._chart) {
-          return;
-        }
-        if(this.smoothingEnabled) {
-          this._chart.smoothingUpdate(this.smoothingWeight);
-        }
-        else {
-          this._chart.smoothingDisable();
-        }
-      },
-      _outliersChanged: function() {
-        if (!this._chart) {
-          return;
-        }
-        this._chart.ignoreYOutliers(this.ignoreYOutliers);
-      },
-      _tooltipSortingMethodChanged: function() {
-        if(this._chart) {
-          this._chart.setTooltipSortingMethod(this.tooltipSortingMethod);
-        }
-      },
-      _tooltipPositionChanged: function() {
-        if (this._chart) {
-          this._chart.setTooltipPosition(this.tooltipPosition);
-        }
-      }
-    });
-  </script>
-</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts b/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts
deleted file mode 100644
index 59da03d455f6fa84bcc959d3790bd84d244860ca..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/components/vz_line_chart/vz-line-chart.ts
+++ /dev/null
@@ -1,581 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-
-module VZ {
-  export class LineChart {
-    private name2datasets: {[name: string]: Plottable.Dataset};
-    private seriesNames: string[];
-
-    private xAccessor: Plottable.Accessor<number|Date>;
-    private xScale: Plottable.QuantitativeScale<number|Date>;
-    private yScale: Plottable.QuantitativeScale<number>;
-    private gridlines: Plottable.Components.Gridlines;
-    private center: Plottable.Components.Group;
-    private xAxis: Plottable.Axes.Numeric|Plottable.Axes.Time;
-    private yAxis: Plottable.Axes.Numeric;
-    private outer: Plottable.Components.Table;
-    private colorScale: Plottable.Scales.Color;
-    private tooltip: d3.Selection<any>;
-    private dzl: Plottable.DragZoomLayer;
-
-    private linePlot: Plottable.Plots.Line<number|Date>;
-    private smoothLinePlot: Plottable.Plots.Line<number|Date>;
-    private scatterPlot: Plottable.Plots.Scatter<number|Date, Number>;
-    private nanDisplay: Plottable.Plots.Scatter<number|Date, Number>;
-    private scalarAccessor: Plottable.Accessor<number>;
-    private smoothedAccessor: Plottable.Accessor<number>;
-    private lastPointsDataset: Plottable.Dataset;
-    private datasets: Plottable.Dataset[];
-    private onDatasetChanged: (dataset: Plottable.Dataset) => void;
-    private nanDataset: Plottable.Dataset;
-    private smoothingWeight: number;
-    private smoothingEnabled: Boolean;
-    private tooltipSortingMethod: string;
-    private tooltipPosition: string;
-    private _ignoreYOutliers: boolean;
-
-    private targetSVG: d3.Selection<any>;
-
-    constructor(
-        xType: string, yScaleType: string, colorScale: Plottable.Scales.Color,
-        tooltip: d3.Selection<any>) {
-      this.seriesNames = [];
-      this.name2datasets = {};
-      this.colorScale = colorScale;
-      this.tooltip = tooltip;
-      this.datasets = [];
-      this._ignoreYOutliers = false;
-      // lastPointDataset is a dataset that contains just the last point of
-      // every dataset we're currently drawing.
-      this.lastPointsDataset = new Plottable.Dataset();
-      this.nanDataset = new Plottable.Dataset();
-      // need to do a single bind, so we can deregister the callback from
-      // old Plottable.Datasets. (Deregistration is done by identity checks.)
-      this.onDatasetChanged = this._onDatasetChanged.bind(this);
-      this.buildChart(xType, yScaleType);
-    }
-
-    private buildChart(xType: string, yScaleType: string) {
-      if (this.outer) {
-        this.outer.destroy();
-      }
-      let xComponents = VZ.ChartHelpers.getXComponents(xType);
-      this.xAccessor = xComponents.accessor;
-      this.xScale = xComponents.scale;
-      this.xAxis = xComponents.axis;
-      this.xAxis.margin(0).tickLabelPadding(3);
-      this.yScale = LineChart.getYScaleFromType(yScaleType);
-      this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
-      let yFormatter = VZ.ChartHelpers.multiscaleFormatter(
-          VZ.ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
-      this.yAxis.margin(0).tickLabelPadding(5).formatter(yFormatter);
-      this.yAxis.usesTextWidthApproximation(true);
-
-      this.dzl = new Plottable.DragZoomLayer(
-          this.xScale, this.yScale, this.updateSpecialDatasets.bind(this));
-
-      let center = this.buildPlot(this.xAccessor, this.xScale, this.yScale);
-
-      this.gridlines =
-          new Plottable.Components.Gridlines(this.xScale, this.yScale);
-
-      let xZeroLine = new Plottable.Components.GuideLineLayer('horizontal');
-      xZeroLine.scale(this.yScale).value(0);
-      let yZeroLine = new Plottable.Components.GuideLineLayer('vertical');
-      yZeroLine.scale(this.xScale).value(0);
-
-      this.center = new Plottable.Components.Group(
-          [this.gridlines, xZeroLine, yZeroLine, center, this.dzl]);
-      this.outer =  new Plottable.Components.Table([
-                                                   [this.yAxis, this.center],
-                                                   [null, this.xAxis]
-                                                  ]);
-    }
-
-    private buildPlot(xAccessor, xScale, yScale): Plottable.Component {
-      this.scalarAccessor = (d: VZ.ChartHelpers.ScalarDatum) => d.scalar;
-      this.smoothedAccessor = (d: VZ.ChartHelpers.ScalarDatum) => d.smoothed;
-      let linePlot = new Plottable.Plots.Line<number|Date>();
-      linePlot.x(xAccessor, xScale);
-      linePlot.y(this.scalarAccessor, yScale);
-      linePlot.attr(
-          'stroke', (d: VZ.ChartHelpers.Datum, i: number,
-                     dataset: Plottable.Dataset) =>
-                        this.colorScale.scale(dataset.metadata().name));
-      this.linePlot = linePlot;
-      let group = this.setupTooltips(linePlot);
-
-      let smoothLinePlot = new Plottable.Plots.Line<number|Date>();
-      smoothLinePlot.x(xAccessor, xScale);
-      smoothLinePlot.y(this.smoothedAccessor, yScale);
-      smoothLinePlot.attr(
-          'stroke', (d: VZ.ChartHelpers.Datum, i: number,
-                     dataset: Plottable.Dataset) =>
-                        this.colorScale.scale(dataset.metadata().name));
-      this.smoothLinePlot = smoothLinePlot;
-
-      // The scatterPlot will display the last point for each dataset.
-      // This way, if there is only one datum for the series, it is still
-      // visible. We hide it when tooltips are active to keep things clean.
-      let scatterPlot = new Plottable.Plots.Scatter<number|Date, number>();
-      scatterPlot.x(xAccessor, xScale);
-      scatterPlot.y(this.scalarAccessor, yScale);
-      scatterPlot.attr('fill', (d: any) => this.colorScale.scale(d.name));
-      scatterPlot.attr('opacity', 1);
-      scatterPlot.size(VZ.ChartHelpers.TOOLTIP_CIRCLE_SIZE * 2);
-      scatterPlot.datasets([this.lastPointsDataset]);
-      this.scatterPlot = scatterPlot;
-
-      let nanDisplay = new Plottable.Plots.Scatter<number|Date, number>();
-      nanDisplay.x(xAccessor, xScale);
-      nanDisplay.y((x) => x.displayY, yScale);
-      nanDisplay.attr('fill', (d: any) => this.colorScale.scale(d.name));
-      nanDisplay.attr('opacity', 1);
-      nanDisplay.size(VZ.ChartHelpers.NAN_SYMBOL_SIZE * 2);
-      nanDisplay.datasets([this.nanDataset]);
-      nanDisplay.symbol(Plottable.SymbolFactories.triangleUp);
-      this.nanDisplay = nanDisplay;
-
-      return new Plottable.Components.Group(
-          [nanDisplay, scatterPlot, smoothLinePlot, group]);
-    }
-
-    /** Updates the chart when a dataset changes. Called every time the data of
-     * a dataset changes to update the charts.
-     */
-    private _onDatasetChanged(dataset: Plottable.Dataset) {
-      if (this.smoothingEnabled) {
-        this.resmoothDataset(dataset);
-      }
-      this.updateSpecialDatasets();
-    }
-
-    public ignoreYOutliers(ignoreYOutliers: boolean) {
-      if (ignoreYOutliers !== this._ignoreYOutliers) {
-        this._ignoreYOutliers = ignoreYOutliers;
-        this.updateSpecialDatasets();
-      }
-    }
-
-    private updateSpecialDatasets() {
-      if (this.smoothingEnabled) {
-        this.updateSpecialDatasetsWithAccessor(this.smoothedAccessor);
-      } else {
-        this.updateSpecialDatasetsWithAccessor(this.scalarAccessor);
-      }
-    }
-
-    /** Constructs special datasets. Each special dataset contains exceptional
-     * values from all of the regular datasets, e.g. last points in series, or
-     * NaN values. Those points will have a `name` and `relative` property added
-     * (since usually those are context in the surrounding dataset).
-     * The accessor will point to the correct data to access.
-     */
-    private updateSpecialDatasetsWithAccessor(accessor:
-                                                  Plottable.Accessor<number>) {
-      let lastPointsData =
-          this.datasets
-              .map((d) => {
-                let datum = null;
-                // filter out NaNs to ensure last point is a clean one
-                let nonNanData =
-                    d.data().filter((x) => !isNaN(accessor(x, -1, d)));
-                if (nonNanData.length > 0) {
-                  let idx = nonNanData.length - 1;
-                  datum = nonNanData[idx];
-                  datum.name = d.metadata().name;
-                  datum.relative =
-                      VZ.ChartHelpers.relativeAccessor(datum, -1, d);
-                }
-                return datum;
-              })
-              .filter((x) => x != null);
-      this.lastPointsDataset.data(lastPointsData);
-
-      // Take a dataset, return an array of NaN data points
-      // the NaN points will have a "displayY" property which is the
-      // y-value of a nearby point that was not NaN (0 if all points are NaN)
-      let datasetToNaNData = (d: Plottable.Dataset) => {
-        let displayY = null;
-        let data = d.data();
-        let i = 0;
-        while (i < data.length && displayY == null) {
-          if (!isNaN(accessor(data[i], -1, d))) {
-            displayY = accessor(data[i], -1, d);
-          }
-          i++;
-        }
-        if (displayY == null) {
-          displayY = 0;
-        }
-        let nanData = [];
-        for (i = 0; i < data.length; i++) {
-          if (!isNaN(accessor(data[i], -1, d))) {
-            displayY = accessor(data[i], -1, d);
-          } else {
-            data[i].name = d.metadata().name;
-            data[i].displayY = displayY;
-            data[i].relative = VZ.ChartHelpers.relativeAccessor(data[i], -1, d);
-            nanData.push(data[i]);
-          }
-        }
-        return nanData;
-      };
-      let nanData = _.flatten(this.datasets.map(datasetToNaNData));
-      this.nanDataset.data(nanData);
-
-      let datasetToValues: (d: Plottable.Dataset) => number[] = (d) => {
-        return d.data().map((x) => accessor(x, -1, d));
-      };
-      let vals = _.flatten(this.datasets.map(datasetToValues));
-      vals = vals.filter((x) => x === x && x !== Infinity && x !== -Infinity);
-      let domain = VZ.ChartHelpers.computeDomain(vals, this._ignoreYOutliers);
-      this.yScale.domain(domain);
-    }
-
-    private setupTooltips(plot: Plottable.XYPlot<number|Date, number>):
-        Plottable.Components.Group {
-      let pi = new Plottable.Interactions.Pointer();
-      pi.attachTo(plot);
-      // PointsComponent is a Plottable Component that will hold the little
-      // circles we draw over the closest data points
-      let pointsComponent = new Plottable.Component();
-      let group = new Plottable.Components.Group([plot, pointsComponent]);
-
-      let hideTooltips = () => {
-        this.tooltip.style('opacity', 0);
-        this.scatterPlot.attr('opacity', 1);
-        pointsComponent.content().selectAll('.point').remove();
-      };
-
-      let enabled = true;
-      let disableTooltips = () => {
-        enabled = false;
-        hideTooltips();
-      };
-      let enableTooltips = () => { enabled = true; };
-
-      this.dzl.interactionStart(disableTooltips);
-      this.dzl.interactionEnd(enableTooltips);
-
-      pi.onPointerMove((p: Plottable.Point) => {
-        if (!enabled) {
-          return;
-        }
-        let target: VZ.ChartHelpers.Point = {
-          x: p.x,
-          y: p.y,
-          datum: null,
-          dataset: null,
-        };
-
-
-        let bbox: SVGRect = (<any>this.gridlines.content().node()).getBBox();
-
-        // pts is the closets point to the tooltip for each dataset
-        let pts = plot.datasets()
-                      .map((dataset) => this.findClosestPoint(target, dataset))
-                      .filter(x => x != null);
-        let intersectsBBox = Plottable.Utils.DOM.intersectsBBox;
-        // We draw tooltips for points that are NaN, or are currently visible
-        let ptsForTooltips = pts.filter(
-            (p) => intersectsBBox(p.x, p.y, bbox) || isNaN(p.datum.scalar));
-        // Only draw little indicator circles for the non-NaN points
-        let ptsToCircle = ptsForTooltips.filter((p) => !isNaN(p.datum.scalar));
-
-        let ptsSelection: any =
-            pointsComponent.content().selectAll('.point').data(
-                ptsToCircle,
-                (p: VZ.ChartHelpers.Point) => p.dataset.metadata().name);
-        if (pts.length !== 0) {
-          ptsSelection.enter().append('circle').classed('point', true);
-          ptsSelection.attr('r', VZ.ChartHelpers.TOOLTIP_CIRCLE_SIZE)
-              .attr('cx', (p) => p.x)
-              .attr('cy', (p) => p.y)
-              .style('stroke', 'none')
-              .attr(
-                  'fill',
-                  (p) => this.colorScale.scale(p.dataset.metadata().name));
-          ptsSelection.exit().remove();
-          this.drawTooltips(ptsForTooltips, target);
-        } else {
-          hideTooltips();
-        }
-      });
-
-      pi.onPointerExit(hideTooltips);
-
-      return group;
-    }
-
-    private drawTooltips(
-        points: VZ.ChartHelpers.Point[], target: VZ.ChartHelpers.Point) {
-      // Formatters for value, step, and wall_time
-      this.scatterPlot.attr('opacity', 0);
-      let valueFormatter = VZ.ChartHelpers.multiscaleFormatter(
-          VZ.ChartHelpers.Y_TOOLTIP_FORMATTER_PRECISION);
-
-      let dist = (p: VZ.ChartHelpers.Point) =>
-          Math.pow(p.x - target.x, 2) + Math.pow(p.y - target.y, 2);
-      let closestDist = _.min(points.map(dist));
-
-      let valueSortMethod = this.scalarAccessor;
-      if (this.smoothingEnabled) {
-        valueSortMethod = this.smoothedAccessor;
-      }
-
-      if (this.tooltipSortingMethod === 'ascending') {
-        points =
-            _.sortBy(points, (d) => valueSortMethod(d.datum, -1, d.dataset));
-      } else if (this.tooltipSortingMethod === 'descending') {
-        points =
-            _.sortBy(points, (d) => valueSortMethod(d.datum, -1, d.dataset))
-                .reverse();
-      } else if (this.tooltipSortingMethod === 'nearest') {
-        points = _.sortBy(points, dist);
-      } else {
-        // The 'default' sorting method maintains the order of names passed to
-        // setVisibleSeries(). However we reverse that order when defining the
-        // datasets. So we must call reverse again to restore the order.
-        points = points.slice(0).reverse();
-      }
-
-      let rows = this.tooltip.select('tbody')
-                     .html('')
-                     .selectAll('tr')
-                     .data(points)
-                     .enter()
-                     .append('tr');
-      // Grey out the point if any of the following are true:
-      // - The cursor is outside of the x-extent of the dataset
-      // - The point's y value is NaN
-      rows.classed('distant', (d) => {
-        let firstPoint = d.dataset.data()[0];
-        let lastPoint = _.last(d.dataset.data());
-        let firstX =
-            this.xScale.scale(this.xAccessor(firstPoint, 0, d.dataset));
-        let lastX = this.xScale.scale(this.xAccessor(lastPoint, 0, d.dataset));
-        let s = this.smoothingEnabled ? d.datum.smoothed : d.datum.scalar;
-        return target.x < firstX || target.x > lastX || isNaN(s);
-      });
-      rows.classed('closest', (p) => dist(p) === closestDist);
-      // It is a bit hacky that we are manually applying the width to the swatch
-      // and the nowrap property to the text here. The reason is as follows:
-      // the style gets updated asynchronously by Polymer scopeSubtree observer.
-      // Which means we would get incorrect sizing information since the text
-      // would wrap by default. However, we need correct measurements so that
-      // we can stop the text from falling off the edge of the screen.
-      // therefore, we apply the size-critical styles directly.
-      rows.style('white-space', 'nowrap');
-      rows.append('td')
-          .append('span')
-          .classed('swatch', true)
-          .style(
-              'background-color',
-              (d) => this.colorScale.scale(d.dataset.metadata().name));
-      rows.append('td').text((d) => d.dataset.metadata().name);
-      if (this.smoothingEnabled) {
-        rows.append('td').text(
-            (d) => isNaN(d.datum.smoothed) ? 'NaN' :
-                                             valueFormatter(d.datum.smoothed));
-      }
-      rows.append('td').text(
-          (d) =>
-              isNaN(d.datum.scalar) ? 'NaN' : valueFormatter(d.datum.scalar));
-      rows.append('td').text(
-          (d) => VZ.ChartHelpers.stepFormatter(d.datum.step));
-      rows.append('td').text(
-          (d) => VZ.ChartHelpers.timeFormatter(d.datum.wall_time));
-      rows.append('td').text(
-          (d) => VZ.ChartHelpers.relativeFormatter(
-              VZ.ChartHelpers.relativeAccessor(d.datum, -1, d.dataset)));
-
-      // compute left position
-      let documentWidth = document.body.clientWidth;
-      let node: any = this.tooltip.node();
-      let parentRect = node.parentElement.getBoundingClientRect();
-      let nodeRect = node.getBoundingClientRect();
-      // prevent it from falling off the right side of the screen
-      let left = documentWidth - parentRect.left - nodeRect.width - 60, top = 0;
-
-      if (this.tooltipPosition === 'right') {
-        left = Math.min(parentRect.width, left);
-      } else {  // 'bottom'
-        left = Math.min(0, left);
-        top = parentRect.height + VZ.ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET;
-      }
-
-      this.tooltip.style(
-          'transform', 'translate(' + left + 'px,' + top + 'px)');
-      this.tooltip.style('opacity', 1);
-    }
-
-    private findClosestPoint(
-        target: VZ.ChartHelpers.Point,
-        dataset: Plottable.Dataset): VZ.ChartHelpers.Point {
-      let points: VZ.ChartHelpers.Point[] = dataset.data().map((d, i) => {
-        let x = this.xAccessor(d, i, dataset);
-        let y = this.smoothingEnabled ? this.smoothedAccessor(d, i, dataset) :
-                                        this.scalarAccessor(d, i, dataset);
-        return {
-          x: this.xScale.scale(x),
-          y: this.yScale.scale(y),
-          datum: d,
-          dataset: dataset,
-        };
-      });
-      let idx: number =
-          _.sortedIndex(points, target, (p: VZ.ChartHelpers.Point) => p.x);
-      if (idx === points.length) {
-        return points[points.length - 1];
-      } else if (idx === 0) {
-        return points[0];
-      } else {
-        let prev = points[idx - 1];
-        let next = points[idx];
-        let prevDist = Math.abs(prev.x - target.x);
-        let nextDist = Math.abs(next.x - target.x);
-        return prevDist < nextDist ? prev : next;
-      }
-    }
-
-    private resmoothDataset(dataset: Plottable.Dataset) {
-      var data = dataset.data();
-      var smoothingWeight = this.smoothingWeight;
-      let last = data.length > 0 ? data[0].scalar : NaN;
-      data.forEach((d) => {
-        if (!_.isFinite(last)) {
-          d.smoothed = d.scalar;
-        } else {
-          // 1st-order IIR low-pass filter to attenuate the higher-
-          // frequency components of the time-series.
-          d.smoothed = last * smoothingWeight +
-                       (1 - smoothingWeight) * d.scalar;
-        }
-        last = d.smoothed;
-      });
-    }
-
-    private getDataset(name: string) {
-      if (this.name2datasets[name] === undefined) {
-        this.name2datasets[name] = new Plottable.Dataset([], {name: name});
-      }
-      return this.name2datasets[name];
-    }
-
-    static getYScaleFromType(yScaleType: string):
-        Plottable.QuantitativeScale<number> {
-      if (yScaleType === 'log') {
-        return new Plottable.Scales.ModifiedLog();
-      } else if (yScaleType === 'linear') {
-        return new Plottable.Scales.Linear();
-      } else {
-        throw new Error('Unrecognized yScale type ' + yScaleType);
-      }
-    }
-
-    /**
-     * Update the selected series on the chart.
-     */
-    public setVisibleSeries(names: string[]) {
-      names = names.sort();
-      this.seriesNames = names;
-
-      names.reverse();  // draw first series on top
-      this.datasets.forEach((d) => d.offUpdate(this.onDatasetChanged));
-      this.datasets = names.map((r) => this.getDataset(r));
-      this.datasets.forEach((d) => d.onUpdate(this.onDatasetChanged));
-      this.linePlot.datasets(this.datasets);
-
-      if (this.smoothingEnabled) {
-        this.smoothLinePlot.datasets(this.datasets);
-      }
-      this.updateSpecialDatasets();
-    }
-
-    /**
-     * Set the data of a series on the chart.
-     */
-    public setSeriesData(name: string, data: VZ.ChartHelpers.ScalarDatum[]) {
-      this.getDataset(name).data(data);
-    }
-
-    public smoothingUpdate(weight: number) {
-      this.smoothingWeight = weight;
-      this.datasets.forEach((d) => this.resmoothDataset(d));
-
-      if (!this.smoothingEnabled) {
-        this.linePlot.addClass('ghost');
-        this.scatterPlot.y(this.smoothedAccessor, this.yScale);
-        this.smoothingEnabled = true;
-        this.smoothLinePlot.datasets(this.datasets);
-      }
-
-      this.updateSpecialDatasetsWithAccessor(this.smoothedAccessor);
-    }
-
-    public smoothingDisable() {
-      if (this.smoothingEnabled) {
-        this.linePlot.removeClass('ghost');
-        this.scatterPlot.y(this.scalarAccessor, this.yScale);
-        this.smoothLinePlot.datasets([]);
-        this.smoothingEnabled = false;
-        this.updateSpecialDatasetsWithAccessor(this.scalarAccessor);
-      }
-    }
-
-    public setTooltipSortingMethod(method: string) {
-      this.tooltipSortingMethod = method;
-    }
-
-    public setTooltipPosition(position: string) {
-      this.tooltipPosition = position;
-    }
-
-    public renderTo(targetSVG: d3.Selection<any>) {
-      this.targetSVG = targetSVG;
-      this.setViewBox();
-      this.outer.renderTo(targetSVG);
-    }
-
-    /** There's an issue in Chrome where the svg overflow is a bit
-     * "flickery". There is a border on the gridlines on the extreme edge of the
-     * chart, which behaves inconsistently and causes the screendiffing tests to
-     * flake. We can solve this by creating 1px effective margin for the svg by
-     * setting the viewBox on the containing svg.
-     */
-    private setViewBox() {
-      // There's an issue in Firefox where if we measure with the old viewbox
-      // set, we get horrible results.
-      this.targetSVG.attr('viewBox', null);
-
-      let parent = this.targetSVG.node().parentNode as HTMLElement;
-      let w = parent.clientWidth;
-      let h = parent.clientHeight;
-      this.targetSVG.attr({
-        'height': h,
-        'viewBox': `0 0 ${w + 1} ${h + 1}`,
-      });
-    }
-
-    public redraw() {
-      this.outer.redraw();
-      this.setViewBox();
-    }
-
-    public destroy() { this.outer.destroy(); }
-  }
-}
diff --git a/tensorflow/tensorboard/components/vz_line_chart_d3v4/BUILD b/tensorflow/tensorboard/components/vz_line_chart_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..948e04cd8e80f3a347f1e7e63c88aaa5f9c0ca73
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/BUILD
@@ -0,0 +1,116 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_line_chart_d3v4",
+    srcs = [
+        "bundle.js",
+        "vz-line-chart.html",
+    ],
+    path = "/vz-line-chart",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:lodash",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable",
+        "@org_polymer",
+    ],
+)
+
+web_library(
+    name = "demo",
+    srcs = ["index.html"],
+    path = "/vz-line-chart",
+    deps = [
+        ":vz_line_chart_d3v4",
+        "@org_polymer_iron_demo_helpers",
+        "@org_polymer_paper_styles",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:lodash.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {
+        "VZ.ChartHelpers": [
+            "vz-chart-helpers.ts",
+        ],
+        "VZ": [
+            "vz-line-chart.ts",
+            "dragZoomInteraction.ts",
+        ],
+    },
+    namespace_symbol_aliases = {
+        "VZ.ChartHelpers": {
+            "Dataset": "Plottable.Dataset",
+        },
+    },
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "index.html",
+        "vz-line-chart.html",
+        ":legacy_ts",
+    ],
+    visibility = ["//learning/vis/vz_elements/catalog:__pkg__"],
+    destdir = "vz-line-chart",
+    deps = [
+        "//tensorflow/tensorboard/components/tf_imports_google:lib",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:legacy",
+        "//third_party/javascript/polymer/v1/polymer:lib",
+    ],
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = [
+        "dragZoomInteraction.ts",
+        "vz-chart-helpers.ts",
+        "vz-line-chart.ts",
+    ],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+    deps = [
+        "//third_party/javascript/node_modules/typescript:es2015.promise",
+        "//third_party/javascript/plottable/v3:bundle",
+        "//third_party/javascript/typings/chai",
+        "//third_party/javascript/typings/d3_v4:bundle",
+        "//third_party/javascript/typings/lodash",
+        "//third_party/javascript/typings/mocha",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+        "//third_party/javascript/typings/sinon",
+        "//third_party/javascript/typings/webcomponents_js",
+    ],
+)
diff --git a/tensorflow/tensorboard/components/vz_line_chart/dragZoomInteraction.ts b/tensorflow/tensorboard/components/vz_line_chart_d3v4/dragZoomInteraction.ts
similarity index 84%
rename from tensorflow/tensorboard/components/vz_line_chart/dragZoomInteraction.ts
rename to tensorflow/tensorboard/components/vz_line_chart_d3v4/dragZoomInteraction.ts
index 0e5a7c1d5b2b6a7f58000b2f2dde953eb7506674..d374a71a88551b0963943bd292e28e67df22bfc4 100644
--- a/tensorflow/tensorboard/components/vz_line_chart/dragZoomInteraction.ts
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/dragZoomInteraction.ts
@@ -13,12 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-module Plottable {
-export class DragZoomLayer extends Components.SelectionBoxLayer {
-  private _dragInteraction: Interactions.Drag;
-  private _doubleClickInteraction: Interactions.DoubleClick;
+
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+import * as Plottable from 'Plottable/plottable';  // from //third_party/javascript/plottable/v3
+
+
+export class DragZoomLayer extends Plottable.Components.SelectionBoxLayer {
+  private _dragInteraction: Plottable.Interactions.Drag;
+  private _doubleClickInteraction: Plottable.Interactions.Click;
   private isZoomed = false;
-  private easeFn: (t: number) => number = d3.ease('cubic-in-out');
+  private easeFn: (t: number) => number = d3.easeCubicInOut;
   private _animationTime = 750;
   private onStart: Function;
   private onEnd: Function;
@@ -37,15 +41,15 @@ export class DragZoomLayer extends Components.SelectionBoxLayer {
    * TODO(danmane) - merge this into Plottable
    */
   constructor(
-      xScale: QuantitativeScale<number|{valueOf(): number}>,
-      yScale: QuantitativeScale<number|{valueOf(): number}>,
+      xScale: Plottable.QuantitativeScale<number|{valueOf(): number}>,
+      yScale: Plottable.QuantitativeScale<number|{valueOf(): number}>,
       unzoomMethod: Function) {
     super();
     this.xScale(xScale);
     this.yScale(yScale);
-    this._dragInteraction = new Interactions.Drag();
+    this._dragInteraction = new Plottable.Interactions.Drag();
     this._dragInteraction.attachTo(this);
-    this._doubleClickInteraction = new Interactions.DoubleClick();
+    this._doubleClickInteraction = new Plottable.Interactions.Click();
     this._doubleClickInteraction.attachTo(this);
     this.setupCallbacks();
     this.unzoomMethod = unzoomMethod;
@@ -54,16 +58,20 @@ export class DragZoomLayer extends Components.SelectionBoxLayer {
   /**
    * Register a method that calls when the DragZoom interaction starts.
    */
-  public interactionStart(cb: Function) { this.onStart = cb; }
+  public interactionStart(cb: Function) {
+    this.onStart = cb;
+  }
 
   /**
    * Register a method that calls when the DragZoom interaction ends.
    */
-  public interactionEnd(cb: Function) { this.onEnd = cb; }
+  public interactionEnd(cb: Function) {
+    this.onEnd = cb;
+  }
 
   private setupCallbacks() {
     let dragging = false;
-    this._dragInteraction.onDragStart((startPoint: Point) => {
+    this._dragInteraction.onDragStart((startPoint: Plottable.Point) => {
       this.bounds({
         topLeft: startPoint,
         bottomRight: startPoint,
@@ -114,7 +122,7 @@ export class DragZoomLayer extends Components.SelectionBoxLayer {
       throw new Error('ease function must be a function');
     }
     if (fn(0) !== 0 || fn(1) !== 1) {
-      Utils.Window.warn(
+      Plottable.Utils.Window.warn(
           'Easing function does not maintain invariant ' +
           'f(0)==0 && f(1)==1. Bad behavior may result.');
     }
@@ -186,7 +194,7 @@ export class DragZoomLayer extends Components.SelectionBoxLayer {
       this.xScale().domain([x0, x1]);
       this.yScale().domain([y0, y1]);
       if (p < 1) {
-        Utils.DOM.requestAnimationFramePolyfill(draw);
+        Plottable.Utils.DOM.requestAnimationFramePolyfill(draw);
       } else {
         this.onEnd();
         this.isZooming(false);
@@ -195,4 +203,3 @@ export class DragZoomLayer extends Components.SelectionBoxLayer {
     draw();
   }
 }
-}
diff --git a/tensorflow/tensorboard/components/vz_line_chart/demo/index.html b/tensorflow/tensorboard/components/vz_line_chart_d3v4/index.html
similarity index 99%
rename from tensorflow/tensorboard/components/vz_line_chart/demo/index.html
rename to tensorflow/tensorboard/components/vz_line_chart_d3v4/index.html
index fec8e8bed88ca0eeb564c50f5a704347eca551b9..fb571a518370c343156ea158657ec0b68bfe1da2 100644
--- a/tensorflow/tensorboard/components/vz_line_chart/demo/index.html
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/index.html
@@ -21,10 +21,10 @@ limitations under the License.
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>vz-line-chart demo</title>
-    <script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../vz-line-chart.html">
-    <link rel="import" href="../../iron-demo-helpers/demo-snippet.html">
-    <link rel="import" href="../../paper-styles/typography.html">
+    <script src="../webcomponentsjs/webcomponents-lite.min.js"></script>
+    <link rel="import" href="vz-line-chart.html">
+    <link rel="import" href="../iron-demo-helpers/demo-snippet.html">
+    <link rel="import" href="../paper-styles/typography.html">
     <style type="text/css">
       body {
         font-family: "Roboto";
diff --git a/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-chart-helpers.ts b/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-chart-helpers.ts
new file mode 100644
index 0000000000000000000000000000000000000000..84e2d528d7d97aff943ac0109c3ca0b6af0e8041
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-chart-helpers.ts
@@ -0,0 +1,219 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+/* tslint:disable:no-namespace variable-name */
+
+
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+import * as Plottable from 'Plottable/plottable';  // from //third_party/javascript/plottable/v3
+import {Dataset} from 'Plottable/plottable';
+
+export interface Datum {
+  wall_time: Date;
+  step: number;
+}
+
+export interface Scalar {
+  scalar: number;
+  smoothed: number;
+}
+
+export type ScalarDatum = Datum & Scalar;
+
+export type DataFn = (run: string, tag: string) => Promise<Array<Datum>>;
+
+export let Y_TOOLTIP_FORMATTER_PRECISION = 4;
+export let STEP_FORMATTER_PRECISION = 4;
+export let Y_AXIS_FORMATTER_PRECISION = 3;
+export let TOOLTIP_Y_PIXEL_OFFSET = 20;
+export let TOOLTIP_CIRCLE_SIZE = 4;
+export let NAN_SYMBOL_SIZE = 6;
+
+export interface Point {
+  x: number;  // pixel space
+  y: number;  // pixel space
+  datum: ScalarDatum;
+  dataset: Plottable.Dataset;
+}
+
+/* Create a formatter function that will switch between exponential and
+ * regular display depending on the scale of the number being formatted,
+ * and show `digits` significant digits.
+ */
+export function multiscaleFormatter(digits: number): ((v: number) => string) {
+  return (v: number) => {
+    let absv = Math.abs(v);
+    if (absv < 1E-15) {
+      // Sometimes zero-like values get an annoying representation
+      absv = 0;
+    }
+    let f: (x: number) => string;
+    if (absv >= 1E4) {
+      f = d3.format('.' + digits + 'e');
+    } else if (absv > 0 && absv < 0.01) {
+      f = d3.format('.' + digits + 'e');
+    } else {
+      f = d3.format('.' + digits + 'g');
+    }
+    return f(v);
+  };
+}
+
+/* Compute an appropriate domain given an array of all the values that are
+ * going to be displayed. If ignoreOutliers is true, it will ignore the
+ * lowest 10% and highest 10% of the data when computing a domain.
+ * It has n log n performance when ignoreOutliers is true, as it needs to
+ * sort the data.
+ */
+export function computeDomain(values: number[], ignoreOutliers: boolean) {
+  if (values.length === 0) {
+    return [-0.1, 1.1];
+  }
+  let a: number;
+  let b: number;
+  if (ignoreOutliers) {
+    let sorted = _.sortBy(values);
+    a = d3.quantile(sorted, 0.05);
+    b = d3.quantile(sorted, 0.95);
+  } else {
+    a = d3.min(values);
+    b = d3.max(values);
+  }
+
+  let padding: number;
+  let span = b - a;
+  if (span === 0) {
+    // If b===a, we would create an empty range. We instead select the range
+    // [0, 2*a] if a > 0, or [-2*a, 0] if a < 0, plus a little bit of
+    // extra padding on the top and bottom of the plot.
+    padding = Math.abs(a) * 1.1 + 1.1;
+  } else {
+    padding = span * 0.2;
+  }
+
+  let lower: number;
+  if (a >= 0 && a < span) {
+    // We include the intercept (y = 0) if doing so less than doubles the span
+    // of the y-axis. (We actually select a lower bound that's slightly less
+    // than 0 so that 0.00 will clearly be written on the lower edge of the
+    // chart. The label on the lowest tick is often filtered out.)
+    lower = -0.1 * b;
+  } else {
+    lower = a - padding;
+  }
+
+
+  let domain = [lower, b + padding];
+  domain = d3.scaleLinear().domain(domain).nice().domain();
+  return domain;
+}
+
+export function accessorize(key: string): Plottable.IAccessor<number> {
+  return (d: any, index: number, dataset: Plottable.Dataset) => d[key];
+}
+
+export interface XComponents {
+  /* tslint:disable */
+  scale: Plottable.Scales.Linear|Plottable.Scales.Time,
+      axis: Plottable.Axes.Numeric|Plottable.Axes.Time,
+      accessor: Plottable.IAccessor<number|Date>,
+  /* tslint:enable */
+}
+
+export let stepFormatter =
+    Plottable.Formatters.siSuffix(STEP_FORMATTER_PRECISION);
+export function stepX(): XComponents {
+  let scale = new Plottable.Scales.Linear();
+  let axis = new Plottable.Axes.Numeric(scale, 'bottom');
+  axis.formatter(stepFormatter);
+  return {
+    scale: scale,
+    axis: axis,
+    accessor: (d: Datum) => d.step,
+  };
+}
+
+export let timeFormatter = Plottable.Formatters.time('%a %b %e, %H:%M:%S');
+
+export function wallX(): XComponents {
+  let scale = new Plottable.Scales.Time();
+  return {
+    scale: scale,
+    axis: new Plottable.Axes.Time(scale, 'bottom'),
+    accessor: (d: Datum) => d.wall_time,
+  };
+}
+export let relativeAccessor = (d: any, index: number, dataset: Dataset) => {
+  // We may be rendering the final-point datum for scatterplot.
+  // If so, we will have already provided the 'relative' property
+  if (d.relative != null) {
+    return d.relative;
+  }
+  let data = dataset.data();
+  // I can't imagine how this function would be called when the data is
+  // empty (after all, it iterates over the data), but lets guard just
+  // to be safe.
+  let first = data.length > 0 ? +data[0].wall_time : 0;
+  return (+d.wall_time - first) / (60 * 60 * 1000);  // ms to hours
+};
+
+export let relativeFormatter = (n: number) => {
+  // we will always show 2 units of precision, e.g days and hours, or
+  // minutes and seconds, but not hours and minutes and seconds
+  let ret = '';
+  let days = Math.floor(n / 24);
+  n -= (days * 24);
+  if (days) {
+    ret += days + 'd ';
+  }
+  let hours = Math.floor(n);
+  n -= hours;
+  n *= 60;
+  if (hours || days) {
+    ret += hours + 'h ';
+  }
+  let minutes = Math.floor(n);
+  n -= minutes;
+  n *= 60;
+  if (minutes || hours || days) {
+    ret += minutes + 'm ';
+  }
+  let seconds = Math.floor(n);
+  return ret + seconds + 's';
+};
+export function relativeX(): XComponents {
+  let scale = new Plottable.Scales.Linear();
+  return {
+    scale: scale,
+    axis: new Plottable.Axes.Numeric(scale, 'bottom'),
+    accessor: relativeAccessor,
+  };
+}
+
+// a very literal definition of NaN: true for NaN for a non-number type
+// or null, etc. False for Infinity or -Infinity
+export let isNaN = (x) => +x !== x;
+
+export function getXComponents(xType: string): XComponents {
+  switch (xType) {
+    case 'step':
+      return stepX();
+    case 'wall_time':
+      return wallX();
+    case 'relative':
+      return relativeX();
+    default:
+      throw new Error('invalid xType: ' + xType);
+  }
+}
diff --git a/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.html b/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.html
new file mode 100644
index 0000000000000000000000000000000000000000..85e24ae4be0320330ec1567d8522c0d825bddcc7
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.html
@@ -0,0 +1,129 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="../tf-imports/lodash.html">
+<link rel="import" href="../tf-imports/plottable.html">
+
+<!--
+vz-line-chart creates an element that draws a line chart for
+displaying event values.
+
+This line chart supports drawing multiple lines at the same time, with features
+such as different X scales (linear and temporal), tooltips and smoothing.
+
+@element vz-line-chart
+@demo demo/index.html
+-->
+<dom-module id="vz-line-chart">
+  <template>
+    <div id="tooltip">
+      <table>
+        <thead>
+          <tr>
+            <th></th>
+            <th>Name</th>
+            <template is="dom-if" if="{{smoothingEnabled}}">
+              <th>Smoothed</th>
+            </template>
+            <th>Value</th>
+            <th>Step</th>
+            <th>Time</th>
+            <th>Relative</th>
+          </tr>
+        </thead>
+        <tbody>
+        </tbody>
+      </table>
+    </div>
+    <div id="chartdiv"></div>
+    <style>
+      :host {
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        display: flex;
+        flex-direction: column;
+        flex-grow: 1;
+        flex-shrink: 1;
+        position: relative;
+        outline: none;
+      }
+      div {
+        -webkit-user-select: none;
+        -moz-user-select: none;
+        flex-grow: 1;
+        flex-shrink: 1;
+      }
+      td {
+        padding-left: 5px;
+        padding-right: 5px;
+        font-size: 13px;
+        opacity: 1;
+      }
+      #tooltip {
+        pointer-events: none;
+        position: absolute;
+        opacity: 0;
+        box-shadow: 0 1px 4px rgba(0, 0, 0, 0.3);
+        font-size: 14px;
+        background: rgba(0, 0, 0, 0.8);
+        color: white;
+        border-radius: 4px;
+        line-height: 1.4em;
+        padding: 8px;
+        z-index: 5;
+        cursor: none;
+        margin-top: 10px;
+      }
+      .swatch {
+        border-radius: 50%;
+        width: 14px;
+        height: 14px;
+        display: block;
+        border: 2px solid rgba(0,0,0,0);
+      }
+      .closest .swatch {
+        border: 2px solid white;
+      }
+      th {
+        padding-left: 5px;
+        padding-right: 5px;
+        text-align: left;
+      }
+      .distant td {
+        opacity: 0.8;
+      }
+
+      .distant td.swatch {
+        opacity: 1;
+      }
+
+      .ghost {
+        opacity: 0.2;
+        stroke-width: 1px;
+      }
+
+      #chartdiv line.guide-line {
+        stroke: #999;
+        stroke-width: 1.5px;
+      }
+
+    </style>
+  </template>
+  <script src="bundle.js"></script>
+</dom-module>
diff --git a/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.ts b/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.ts
new file mode 100644
index 0000000000000000000000000000000000000000..1bce0e33f003e29a8f12b659ab4837a187792d7e
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_line_chart_d3v4/vz-line-chart.ts
@@ -0,0 +1,777 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+/* tslint:disable:no-namespace variable-name */
+
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+import * as _ from 'lodash'
+import * as Plottable from 'Plottable/plottable';  // from //third_party/javascript/plottable/v3
+
+import {DragZoomLayer} from './dragZoomInteraction'
+import * as ChartHelpers from './vz-chart-helpers'
+
+Polymer({
+  is: 'vz-line-chart',
+  properties: {
+    /**
+     * Scale that maps series names to colors. The default colors are from
+     * d3.schemeCategory10. Use this property to replace the default line
+     * colors with colors of your own choice.
+     * @type {Plottable.Scales.Color}
+     * @required
+     */
+    colorScale: {
+      type: Object,
+      value: function() {
+        return new Plottable.Scales.Color().range(d3.schemeCategory10);
+      }
+    },
+
+    /**
+     * Whether smoothing is enabled or not. If true, smoothed lines will be
+     * plotted in the chart while the unsmoothed lines will be ghosted in
+     * the background.
+     *
+     * The smoothing algorithm is a simple moving average, which, given a
+     * point p and a window w, replaces p with a simple average of the
+     * points in the [p - floor(w/2), p + floor(w/2)] range.  If there
+     * aren't enough points to cover the entire window to the left, the
+     * window is reduced to fit exactly the amount of elements available.
+     * This means that the smoothed line will be less in and gradually
+     * become more smooth until the desired window is reached. However when
+     * there aren't enough points on the right, the line stops being
+     * rendered at all.
+     */
+    smoothingEnabled: {type: Boolean, value: false},
+
+    /**
+     * Weight (between 0.0 and 1.0) of the smoothing. This weight controls
+     * the window size, and a weight of 1.0 means using 50% of the entire
+     * dataset as the window, while a weight of 0.0 means using a window of
+     * 0 (and thus replacing each point with themselves).
+     *
+     * The growth between 0.0 and 1.0 is not linear though. Because
+     * changing the window from 0% to 30% of the dataset smooths the line a
+     * lot more than changing the window from 70% to 100%, an exponential
+     * function is used instead: http://i.imgur.com/bDrhEZU.png. This
+     * function increases the size of the window slowly at the beginning
+     * and gradually speeds up the growth, but 0.0 still means a window of
+     * 0 and 1.0 still means a window of the dataset's length.
+     */
+    smoothingWeight: {type: Number, value: 0.6},
+
+    /**
+     * The way to display the X values. Allows:
+     * - "step" - Linear scale using the  "step" property of the datum.
+     * - "wall_time" - Temporal scale using the "wall_time" property of the
+     * datum.
+     * - "relative" - Temporal scale using the "relative" property of the
+     * datum if it is present or calculating from "wall_time" if it isn't.
+     */
+    xType: {type: String, value: 'step'},
+
+    /**
+     * The scale for the y-axis. Allows:
+     * - "linear" - linear scale (Plottable.Scales.Linear)
+     * - "log" - modified-log scale (Plottable.Scales.ModifiedLog)
+     */
+    yScaleType: {type: String, value: 'linear'},
+
+    /**
+     * Whether to ignore outlier data when computing the yScale domain.
+     */
+
+    ignoreYOutliers: {
+      type: Boolean,
+      value: false,
+    },
+
+    /**
+     * Change how the tooltip is sorted. Allows:
+     * - "default" - Sort the tooltip by input order.
+     * - "ascending" - Sort the tooltip by ascending value.
+     * - "descending" - Sort the tooltip by descending value.
+     * - "nearest" - Sort the tooltip by closest to cursor.
+     */
+    tooltipSortingMethod: {type: String, value: 'default'},
+
+    /**
+     * Change how the tooltip is positioned. Allows:
+     * - "bottom" - Position the tooltip on the bottom of the chart.
+     * - "right" - Position the tooltip to the right of the chart.
+     */
+    tooltipPosition: {type: String, value: 'bottom'},
+
+    _attached: Boolean,
+    _chart: Object,
+    _visibleSeriesCache: {
+      type: Array,
+      value: function() {
+        return []
+      }
+    },
+    _seriesDataCache: {
+      type: Object,
+      value: function() {
+        return {}
+      }
+    },
+    _makeChartAsyncCallbackId: {type: Number, value: null}
+  },
+  observers: [
+    '_makeChart(xType, yScaleType, colorScale, _attached)',
+    '_reloadFromCache(_chart)',
+    '_smoothingChanged(smoothingEnabled, smoothingWeight, _chart)',
+    '_tooltipSortingMethodChanged(tooltipSortingMethod, _chart)',
+    '_tooltipPositionChanged(tooltipPosition, _chart)',
+    '_outliersChanged(ignoreYOutliers, _chart)'
+  ],
+
+  /**
+   * Sets the series that the chart displays. Series with other names will
+   * not be displayed.
+   *
+   * @param {String[]} names Array with the names of the series to
+   * display.
+   */
+  setVisibleSeries: function(names) {
+    this._visibleSeriesCache = names;
+    if (this._chart) {
+      this._chart.setVisibleSeries(names);
+      this.redraw();
+    }
+  },
+
+  /**
+   * Sets the data of one of the series. Note that to display this series
+   * its name must be in the setVisibleSeries() array.
+   *
+   * @param {String} name Name of the series.
+   * @param {VZ.ChartHelpers.ScalarDatum[]} data Data of the series. This is
+   * an array of objects with at least the following properties:
+   * - step: (Number) - index of the datum.
+   * - wall_time: (Date) - Date object with the datum's time.
+   * - scalar: (Number) - Value of the datum.
+   */
+  setSeriesData: function(name, data) {
+    this._seriesDataCache[name] = data;
+    if (this._chart) {
+      this._chart.setSeriesData(name, data);
+    }
+  },
+
+  /**
+   * Re-renders the chart. Useful if e.g. the container size changed.
+   */
+  redraw: function() {
+    this._chart.redraw();
+  },
+  attached: function() {
+    this._attached = true;
+  },
+  detached: function() {
+    this._attached = false;
+  },
+  ready: function() {
+    this.scopeSubtree(this.$.tooltip, true);
+    this.scopeSubtree(this.$.chartdiv, true);
+  },
+  _makeChart: function(xType, yScaleType, colorScale, _attached) {
+    if (this._makeChartAsyncCallbackId !== null) {
+      this.cancelAsync(this._makeChartAsyncCallbackId);
+      this._makeChartAsyncCallbackId = null;
+    }
+
+    this._makeChartAsyncCallbackId = this.async(function() {
+      this._makeChartAsyncCallbackId = null;
+      if (!this._attached) return;
+      if (this._chart) this._chart.destroy();
+      var tooltip = d3.select(this.$.tooltip);
+      var chart = new LineChart(xType, yScaleType, colorScale, tooltip);
+      var div = d3.select(this.$.chartdiv);
+      chart.renderTo(div);
+      this._chart = chart;
+    }, 350);
+  },
+  _reloadFromCache: function() {
+    if (this._chart) {
+      this._chart.setVisibleSeries(this._visibleSeriesCache);
+      this._visibleSeriesCache.forEach(function(name) {
+        this._chart.setSeriesData(name, this._seriesDataCache[name] || []);
+      }.bind(this));
+    }
+  },
+  _smoothingChanged: function() {
+    if (!this._chart) {
+      return;
+    }
+    if (this.smoothingEnabled) {
+      this._chart.smoothingUpdate(this.smoothingWeight);
+    } else {
+      this._chart.smoothingDisable();
+    }
+  },
+  _outliersChanged: function() {
+    if (!this._chart) {
+      return;
+    }
+    this._chart.ignoreYOutliers(this.ignoreYOutliers);
+  },
+  _tooltipSortingMethodChanged: function() {
+    if (this._chart) {
+      this._chart.setTooltipSortingMethod(this.tooltipSortingMethod);
+    }
+  },
+  _tooltipPositionChanged: function() {
+    if (this._chart) {
+      this._chart.setTooltipPosition(this.tooltipPosition);
+    }
+  }
+});
+
+class LineChart {
+  private name2datasets: {[name: string]: Plottable.Dataset};
+  private seriesNames: string[];
+
+  private xAccessor: Plottable.IAccessor<number|Date>;
+  private xScale: Plottable.QuantitativeScale<number|Date>;
+  private yScale: Plottable.QuantitativeScale<number>;
+  private gridlines: Plottable.Components.Gridlines;
+  private center: Plottable.Components.Group;
+  private xAxis: Plottable.Axes.Numeric|Plottable.Axes.Time;
+  private yAxis: Plottable.Axes.Numeric;
+  private outer: Plottable.Components.Table;
+  private colorScale: Plottable.Scales.Color;
+  private tooltip: d3.Selection<any, any, any, any>;
+  private dzl: DragZoomLayer;
+
+  private linePlot: Plottable.Plots.Line<number|Date>;
+  private smoothLinePlot: Plottable.Plots.Line<number|Date>;
+  private scatterPlot: Plottable.Plots.Scatter<number|Date, Number>;
+  private nanDisplay: Plottable.Plots.Scatter<number|Date, Number>;
+  private scalarAccessor: Plottable.IAccessor<number>;
+  private smoothedAccessor: Plottable.IAccessor<number>;
+  private lastPointsDataset: Plottable.Dataset;
+  private datasets: Plottable.Dataset[];
+  private onDatasetChanged: (dataset: Plottable.Dataset) => void;
+  private nanDataset: Plottable.Dataset;
+  private smoothingWeight: number;
+  private smoothingEnabled: Boolean;
+  private tooltipSortingMethod: string;
+  private tooltipPosition: string;
+  private _ignoreYOutliers: boolean;
+
+  private targetSVG: d3.Selection<any, any, any, any>;
+
+  constructor(
+      xType: string, yScaleType: string, colorScale: Plottable.Scales.Color,
+      tooltip: d3.Selection<any, any, any, any>) {
+    this.seriesNames = [];
+    this.name2datasets = {};
+    this.colorScale = colorScale;
+    this.tooltip = tooltip;
+    this.datasets = [];
+    this._ignoreYOutliers = false;
+    // lastPointDataset is a dataset that contains just the last point of
+    // every dataset we're currently drawing.
+    this.lastPointsDataset = new Plottable.Dataset();
+    this.nanDataset = new Plottable.Dataset();
+    // need to do a single bind, so we can deregister the callback from
+    // old Plottable.Datasets. (Deregistration is done by identity checks.)
+    this.onDatasetChanged = this._onDatasetChanged.bind(this);
+    this.buildChart(xType, yScaleType);
+  }
+
+  private buildChart(xType: string, yScaleType: string) {
+    if (this.outer) {
+      this.outer.destroy();
+    }
+    let xComponents = ChartHelpers.getXComponents(xType);
+    this.xAccessor = xComponents.accessor;
+    this.xScale = xComponents.scale;
+    this.xAxis = xComponents.axis;
+    this.xAxis.margin(0).tickLabelPadding(3);
+    this.yScale = LineChart.getYScaleFromType(yScaleType);
+    this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
+    let yFormatter = ChartHelpers.multiscaleFormatter(
+        ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
+    this.yAxis.margin(0).tickLabelPadding(5).formatter(yFormatter);
+    this.yAxis.usesTextWidthApproximation(true);
+
+    this.dzl = new DragZoomLayer(
+        this.xScale, this.yScale, this.updateSpecialDatasets.bind(this));
+
+    let center = this.buildPlot(this.xAccessor, this.xScale, this.yScale);
+
+    this.gridlines =
+        new Plottable.Components.Gridlines(this.xScale, this.yScale);
+
+    let xZeroLine = new Plottable.Components.GuideLineLayer('horizontal');
+    xZeroLine.scale(this.yScale).value(0);
+    let yZeroLine = new Plottable.Components.GuideLineLayer('vertical');
+    yZeroLine.scale(this.xScale).value(0);
+
+    this.center = new Plottable.Components.Group(
+        [this.gridlines, xZeroLine, yZeroLine, center, this.dzl]);
+    this.outer = new Plottable.Components.Table(
+        [[this.yAxis, this.center], [null, this.xAxis]]);
+  }
+
+  private buildPlot(xAccessor, xScale, yScale): Plottable.Component {
+    this.scalarAccessor = (d: ChartHelpers.ScalarDatum) => d.scalar;
+    this.smoothedAccessor = (d: ChartHelpers.ScalarDatum) => d.smoothed;
+    let linePlot = new Plottable.Plots.Line<number|Date>();
+    linePlot.x(xAccessor, xScale);
+    linePlot.y(this.scalarAccessor, yScale);
+    linePlot.attr(
+        'stroke',
+        (d: ChartHelpers.Datum, i: number, dataset: Plottable.Dataset) =>
+            this.colorScale.scale(dataset.metadata().name));
+    this.linePlot = linePlot;
+    let group = this.setupTooltips(linePlot);
+
+    let smoothLinePlot = new Plottable.Plots.Line<number|Date>();
+    smoothLinePlot.x(xAccessor, xScale);
+    smoothLinePlot.y(this.smoothedAccessor, yScale);
+    smoothLinePlot.attr(
+        'stroke',
+        (d: ChartHelpers.Datum, i: number, dataset: Plottable.Dataset) =>
+            this.colorScale.scale(dataset.metadata().name));
+    this.smoothLinePlot = smoothLinePlot;
+
+    // The scatterPlot will display the last point for each dataset.
+    // This way, if there is only one datum for the series, it is still
+    // visible. We hide it when tooltips are active to keep things clean.
+    let scatterPlot = new Plottable.Plots.Scatter<number|Date, number>();
+    scatterPlot.x(xAccessor, xScale);
+    scatterPlot.y(this.scalarAccessor, yScale);
+    scatterPlot.attr('fill', (d: any) => this.colorScale.scale(d.name));
+    scatterPlot.attr('opacity', 1);
+    scatterPlot.size(ChartHelpers.TOOLTIP_CIRCLE_SIZE * 2);
+    scatterPlot.datasets([this.lastPointsDataset]);
+    this.scatterPlot = scatterPlot;
+
+    let nanDisplay = new Plottable.Plots.Scatter<number|Date, number>();
+    nanDisplay.x(xAccessor, xScale);
+    nanDisplay.y((x) => x.displayY, yScale);
+    nanDisplay.attr('fill', (d: any) => this.colorScale.scale(d.name));
+    nanDisplay.attr('opacity', 1);
+    nanDisplay.size(ChartHelpers.NAN_SYMBOL_SIZE * 2);
+    nanDisplay.datasets([this.nanDataset]);
+    nanDisplay.symbol(Plottable.SymbolFactories.triangle);
+    this.nanDisplay = nanDisplay;
+
+    return new Plottable.Components.Group(
+        [nanDisplay, scatterPlot, smoothLinePlot, group]);
+  }
+
+  /** Updates the chart when a dataset changes. Called every time the data of
+   * a dataset changes to update the charts.
+   */
+  private _onDatasetChanged(dataset: Plottable.Dataset) {
+    if (this.smoothingEnabled) {
+      this.resmoothDataset(dataset);
+    }
+    this.updateSpecialDatasets();
+  }
+
+  public ignoreYOutliers(ignoreYOutliers: boolean) {
+    if (ignoreYOutliers !== this._ignoreYOutliers) {
+      this._ignoreYOutliers = ignoreYOutliers;
+      this.updateSpecialDatasets();
+    }
+  }
+
+  private updateSpecialDatasets() {
+    if (this.smoothingEnabled) {
+      this.updateSpecialDatasetsWithAccessor(this.smoothedAccessor);
+    } else {
+      this.updateSpecialDatasetsWithAccessor(this.scalarAccessor);
+    }
+  }
+
+  /** Constructs special datasets. Each special dataset contains exceptional
+   * values from all of the regular datasets, e.g. last points in series, or
+   * NaN values. Those points will have a `name` and `relative` property added
+   * (since usually those are context in the surrounding dataset).
+   * The accessor will point to the correct data to access.
+   */
+  private updateSpecialDatasetsWithAccessor(accessor:
+                                                Plottable.IAccessor<number>) {
+    let lastPointsData =
+        this.datasets
+            .map((d) => {
+              let datum = null;
+              // filter out NaNs to ensure last point is a clean one
+              let nonNanData =
+                  d.data().filter((x) => !isNaN(accessor(x, -1, d)));
+              if (nonNanData.length > 0) {
+                let idx = nonNanData.length - 1;
+                datum = nonNanData[idx];
+                datum.name = d.metadata().name;
+                datum.relative = ChartHelpers.relativeAccessor(datum, -1, d);
+              }
+              return datum;
+            })
+            .filter((x) => x != null);
+    this.lastPointsDataset.data(lastPointsData);
+
+    // Take a dataset, return an array of NaN data points
+    // the NaN points will have a "displayY" property which is the
+    // y-value of a nearby point that was not NaN (0 if all points are NaN)
+    let datasetToNaNData = (d: Plottable.Dataset) => {
+      let displayY = null;
+      let data = d.data();
+      let i = 0;
+      while (i < data.length && displayY == null) {
+        if (!isNaN(accessor(data[i], -1, d))) {
+          displayY = accessor(data[i], -1, d);
+        }
+        i++;
+      }
+      if (displayY == null) {
+        displayY = 0;
+      }
+      let nanData = [];
+      for (i = 0; i < data.length; i++) {
+        if (!isNaN(accessor(data[i], -1, d))) {
+          displayY = accessor(data[i], -1, d);
+        } else {
+          data[i].name = d.metadata().name;
+          data[i].displayY = displayY;
+          data[i].relative = ChartHelpers.relativeAccessor(data[i], -1, d);
+          nanData.push(data[i]);
+        }
+      }
+      return nanData;
+    };
+    let nanData = _.flatten(this.datasets.map(datasetToNaNData));
+    this.nanDataset.data(nanData);
+
+    let datasetToValues: (d: Plottable.Dataset) => number[] = (d) => {
+      return d.data().map((x) => accessor(x, -1, d));
+    };
+    let vals = _.flatten(this.datasets.map(datasetToValues));
+    vals = vals.filter((x) => x === x && x !== Infinity && x !== -Infinity);
+    let domain = ChartHelpers.computeDomain(vals, this._ignoreYOutliers);
+    this.yScale.domain(domain);
+  }
+
+  private setupTooltips(plot: Plottable.XYPlot<number|Date, number>):
+      Plottable.Components.Group {
+    let pi = new Plottable.Interactions.Pointer();
+    pi.attachTo(plot);
+    // PointsComponent is a Plottable Component that will hold the little
+    // circles we draw over the closest data points
+    let pointsComponent = new Plottable.Component();
+    let group = new Plottable.Components.Group([plot, pointsComponent]);
+
+    let hideTooltips = () => {
+      this.tooltip.style('opacity', 0);
+      this.scatterPlot.attr('opacity', 1);
+      pointsComponent.content().selectAll('.point').remove();
+    };
+
+    let enabled = true;
+    let disableTooltips = () => {
+      enabled = false;
+      hideTooltips();
+    };
+    let enableTooltips = () => {
+      enabled = true;
+    };
+
+    this.dzl.interactionStart(disableTooltips);
+    this.dzl.interactionEnd(enableTooltips);
+
+    pi.onPointerMove((p: Plottable.Point) => {
+      if (!enabled) {
+        return;
+      }
+      let target: ChartHelpers.Point = {
+        x: p.x,
+        y: p.y,
+        datum: null,
+        dataset: null,
+      };
+
+
+      let bbox: SVGRect = (<any>this.gridlines.content().node()).getBBox();
+
+      // pts is the closets point to the tooltip for each dataset
+      let pts = plot.datasets()
+                    .map((dataset) => this.findClosestPoint(target, dataset))
+                    .filter(x => x != null);
+      let intersectsBBox = Plottable.Utils.DOM.intersectsBBox;
+      // We draw tooltips for points that are NaN, or are currently visible
+      let ptsForTooltips = pts.filter(
+          (p) => intersectsBBox(p.x, p.y, bbox) || isNaN(p.datum.scalar));
+      // Only draw little indicator circles for the non-NaN points
+      let ptsToCircle = ptsForTooltips.filter((p) => !isNaN(p.datum.scalar));
+
+      let ptsSelection: any =
+          pointsComponent.content().selectAll('.point').data(
+              ptsToCircle,
+              (p: ChartHelpers.Point) => p.dataset.metadata().name);
+      if (pts.length !== 0) {
+        ptsSelection.enter().append('circle').classed('point', true);
+        ptsSelection.attr('r', ChartHelpers.TOOLTIP_CIRCLE_SIZE)
+            .attr('cx', (p) => p.x)
+            .attr('cy', (p) => p.y)
+            .style('stroke', 'none')
+            .attr(
+                'fill',
+                (p) => this.colorScale.scale(p.dataset.metadata().name));
+        ptsSelection.exit().remove();
+        this.drawTooltips(ptsForTooltips, target);
+      } else {
+        hideTooltips();
+      }
+    });
+
+    pi.onPointerExit(hideTooltips);
+
+    return group;
+  }
+
+  private drawTooltips(
+      points: ChartHelpers.Point[], target: ChartHelpers.Point) {
+    // Formatters for value, step, and wall_time
+    this.scatterPlot.attr('opacity', 0);
+    let valueFormatter = ChartHelpers.multiscaleFormatter(
+        ChartHelpers.Y_TOOLTIP_FORMATTER_PRECISION);
+
+    let dist = (p: ChartHelpers.Point) =>
+        Math.pow(p.x - target.x, 2) + Math.pow(p.y - target.y, 2);
+    let closestDist = _.min(points.map(dist));
+
+    let valueSortMethod = this.scalarAccessor;
+    if (this.smoothingEnabled) {
+      valueSortMethod = this.smoothedAccessor;
+    }
+
+    if (this.tooltipSortingMethod === 'ascending') {
+      points = _.sortBy(points, (d) => valueSortMethod(d.datum, -1, d.dataset));
+    } else if (this.tooltipSortingMethod === 'descending') {
+      points = _.sortBy(points, (d) => valueSortMethod(d.datum, -1, d.dataset))
+                   .reverse();
+    } else if (this.tooltipSortingMethod === 'nearest') {
+      points = _.sortBy(points, dist);
+    } else {
+      // The 'default' sorting method maintains the order of names passed to
+      // setVisibleSeries(). However we reverse that order when defining the
+      // datasets. So we must call reverse again to restore the order.
+      points = points.slice(0).reverse();
+    }
+
+    let rows = this.tooltip.select('tbody')
+                   .html('')
+                   .selectAll('tr')
+                   .data(points)
+                   .enter()
+                   .append('tr');
+    // Grey out the point if any of the following are true:
+    // - The cursor is outside of the x-extent of the dataset
+    // - The point's y value is NaN
+    rows.classed('distant', (d) => {
+      let firstPoint = d.dataset.data()[0];
+      let lastPoint = _.last(d.dataset.data());
+      let firstX = this.xScale.scale(this.xAccessor(firstPoint, 0, d.dataset));
+      let lastX = this.xScale.scale(this.xAccessor(lastPoint, 0, d.dataset));
+      let s = this.smoothingEnabled ? d.datum.smoothed : d.datum.scalar;
+      return target.x < firstX || target.x > lastX || isNaN(s);
+    });
+    rows.classed('closest', (p) => dist(p) === closestDist);
+    // It is a bit hacky that we are manually applying the width to the swatch
+    // and the nowrap property to the text here. The reason is as follows:
+    // the style gets updated asynchronously by Polymer scopeSubtree observer.
+    // Which means we would get incorrect sizing information since the text
+    // would wrap by default. However, we need correct measurements so that
+    // we can stop the text from falling off the edge of the screen.
+    // therefore, we apply the size-critical styles directly.
+    rows.style('white-space', 'nowrap');
+    rows.append('td')
+        .append('span')
+        .classed('swatch', true)
+        .style(
+            'background-color',
+            (d) => this.colorScale.scale(d.dataset.metadata().name));
+    rows.append('td').text((d) => d.dataset.metadata().name);
+    if (this.smoothingEnabled) {
+      rows.append('td').text(
+          (d) => isNaN(d.datum.smoothed) ? 'NaN' :
+                                           valueFormatter(d.datum.smoothed));
+    }
+    rows.append('td').text(
+        (d) => isNaN(d.datum.scalar) ? 'NaN' : valueFormatter(d.datum.scalar));
+    rows.append('td').text((d) => ChartHelpers.stepFormatter(d.datum.step));
+    rows.append('td').text(
+        (d) => ChartHelpers.timeFormatter(d.datum.wall_time));
+    rows.append('td').text(
+        (d) => ChartHelpers.relativeFormatter(
+            ChartHelpers.relativeAccessor(d.datum, -1, d.dataset)));
+
+    // compute left position
+    let documentWidth = document.body.clientWidth;
+    let node: any = this.tooltip.node();
+    let parentRect = node.parentElement.getBoundingClientRect();
+    let nodeRect = node.getBoundingClientRect();
+    // prevent it from falling off the right side of the screen
+    let left = documentWidth - parentRect.left - nodeRect.width - 60, top = 0;
+
+    if (this.tooltipPosition === 'right') {
+      left = Math.min(parentRect.width, left);
+    } else {  // 'bottom'
+      left = Math.min(0, left);
+      top = parentRect.height + ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET;
+    }
+
+    this.tooltip.style('transform', 'translate(' + left + 'px,' + top + 'px)');
+    this.tooltip.style('opacity', 1);
+  }
+
+  private findClosestPoint(
+      target: ChartHelpers.Point,
+      dataset: Plottable.Dataset): ChartHelpers.Point {
+    let points: ChartHelpers.Point[] = dataset.data().map((d, i) => {
+      let x = this.xAccessor(d, i, dataset);
+      let y = this.smoothingEnabled ? this.smoothedAccessor(d, i, dataset) :
+                                      this.scalarAccessor(d, i, dataset);
+      return {
+        x: this.xScale.scale(x),
+        y: this.yScale.scale(y),
+        datum: d,
+        dataset: dataset,
+      };
+    });
+    let idx: number =
+        _.sortedIndex(points, target, (p: ChartHelpers.Point) => p.x);
+    if (idx === points.length) {
+      return points[points.length - 1];
+    } else if (idx === 0) {
+      return points[0];
+    } else {
+      let prev = points[idx - 1];
+      let next = points[idx];
+      let prevDist = Math.abs(prev.x - target.x);
+      let nextDist = Math.abs(next.x - target.x);
+      return prevDist < nextDist ? prev : next;
+    }
+  }
+
+  private resmoothDataset(dataset: Plottable.Dataset) {
+    let data = dataset.data();
+    const smoothingWeight = this.smoothingWeight;
+    let last = data.length > 0 ? data[0].scalar : NaN;
+    data.forEach((d) => {
+      if (!_.isFinite(last)) {
+        d.smoothed = d.scalar;
+      } else {
+        // 1st-order IIR low-pass filter to attenuate the higher-
+        // frequency components of the time-series.
+        d.smoothed = last * smoothingWeight + (1 - smoothingWeight) * d.scalar;
+      }
+      last = d.smoothed;
+    });
+  }
+
+  private getDataset(name: string) {
+    if (this.name2datasets[name] === undefined) {
+      this.name2datasets[name] = new Plottable.Dataset([], {name: name});
+    }
+    return this.name2datasets[name];
+  }
+
+  static getYScaleFromType(yScaleType: string):
+      Plottable.QuantitativeScale<number> {
+    if (yScaleType === 'log') {
+      return new Plottable.Scales.ModifiedLog();
+    } else if (yScaleType === 'linear') {
+      return new Plottable.Scales.Linear();
+    } else {
+      throw new Error('Unrecognized yScale type ' + yScaleType);
+    }
+  }
+
+  /**
+   * Update the selected series on the chart.
+   */
+  public setVisibleSeries(names: string[]) {
+    names = names.sort();
+    this.seriesNames = names;
+
+    names.reverse();  // draw first series on top
+    this.datasets.forEach((d) => d.offUpdate(this.onDatasetChanged));
+    this.datasets = names.map((r) => this.getDataset(r));
+    this.datasets.forEach((d) => d.onUpdate(this.onDatasetChanged));
+    this.linePlot.datasets(this.datasets);
+
+    if (this.smoothingEnabled) {
+      this.smoothLinePlot.datasets(this.datasets);
+    }
+    this.updateSpecialDatasets();
+  }
+
+  /**
+   * Set the data of a series on the chart.
+   */
+  public setSeriesData(name: string, data: ChartHelpers.ScalarDatum[]) {
+    this.getDataset(name).data(data);
+  }
+
+  public smoothingUpdate(weight: number) {
+    this.smoothingWeight = weight;
+    this.datasets.forEach((d) => this.resmoothDataset(d));
+
+    if (!this.smoothingEnabled) {
+      this.linePlot.addClass('ghost');
+      this.scatterPlot.y(this.smoothedAccessor, this.yScale);
+      this.smoothingEnabled = true;
+      this.smoothLinePlot.datasets(this.datasets);
+    }
+
+    this.updateSpecialDatasetsWithAccessor(this.smoothedAccessor);
+  }
+
+  public smoothingDisable() {
+    if (this.smoothingEnabled) {
+      this.linePlot.removeClass('ghost');
+      this.scatterPlot.y(this.scalarAccessor, this.yScale);
+      this.smoothLinePlot.datasets([]);
+      this.smoothingEnabled = false;
+      this.updateSpecialDatasetsWithAccessor(this.scalarAccessor);
+    }
+  }
+
+  public setTooltipSortingMethod(method: string) {
+    this.tooltipSortingMethod = method;
+  }
+
+  public setTooltipPosition(position: string) {
+    this.tooltipPosition = position;
+  }
+
+  public renderTo(targetSVG: d3.Selection<any, any, any, any>) {
+    this.targetSVG = targetSVG;
+    this.outer.renderTo(targetSVG);
+  }
+
+  public redraw() {
+    this.outer.redraw();
+  }
+
+  public destroy() {
+    this.outer.destroy();
+  }
+}
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/BUILD b/tensorflow/tensorboard/components/vz_projector_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a1848d95448cb29cfe2f2c12c1ede58052a2c21f
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/BUILD
@@ -0,0 +1,448 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_projector_d3v4",
+    srcs = [
+        "bundle.html",
+        "bundle.js",
+        "styles.html",
+        "vz-projector.html",
+        "vz-projector-app.html",
+        "vz-projector-bookmark-panel.html",
+        "vz-projector-colab.html",
+        "vz-projector-dashboard.html",
+        "vz-projector-data-panel.html",
+        "vz-projector-input.html",
+        "vz-projector-inspector-panel.html",
+        "vz-projector-legend.html",
+        "vz-projector-metadata-card.html",
+        "vz-projector-projections-panel.html",
+    ],
+    path = "/vz-projector",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/tensorboard/components/tf_dashboard_common_d3v4",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:numericjs",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:threejs",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:weblas",
+        "@org_polymer",
+        "@org_polymer_iron_collapse",
+        "@org_polymer_iron_icons",
+        "@org_polymer_paper_button",
+        "@org_polymer_paper_checkbox",
+        "@org_polymer_paper_dialog",
+        "@org_polymer_paper_dialog_scrollable",
+        "@org_polymer_paper_dropdown_menu",
+        "@org_polymer_paper_icon_button",
+        "@org_polymer_paper_input",
+        "@org_polymer_paper_item",
+        "@org_polymer_paper_listbox",
+        "@org_polymer_paper_slider",
+        "@org_polymer_paper_spinner",
+        "@org_polymer_paper_styles",
+        "@org_polymer_paper_toast",
+        "@org_polymer_paper_toggle_button",
+        "@org_polymer_paper_tooltip",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "external.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:three.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {
+        "VZ.Projector.Heap": ["heap.ts"],
+        "VZ.Projector.Label": ["label.ts"],
+        "VZ.Projector.SPTree": ["sptree.ts"],
+        "VZ.Projector.BhTsne": ["bh_tsne.ts"],
+        "VZ.Projector.Logging": ["logging.ts"],
+        "VZ.Projector.RenderContext": ["renderContext.ts"],
+        "VZ.Projector.ScatterPlotRectangleSelector": ["scatterPlotRectangleSelector.ts"],
+        "VZ.Projector.AnalyticsLogger": ["analyticsLogger.ts"],
+        "VZ.Projector.Util": ["util.ts"],
+        "VZ.Projector.Data": ["data.ts"],
+        "VZ.Projector.DataProvider": ["data-provider.ts"],
+        "VZ.Projector.DataProviderDemo": ["data-provider-demo.ts"],
+        "VZ.Projector.DataProviderProto": ["data-provider-proto.ts"],
+        "VZ.Projector.DataProviderServer": ["data-provider-server.ts"],
+        "VZ.Projector.Knn": ["knn.ts"],
+        "VZ.Projector.ProjectorEventContext": ["projectorEventContext.ts"],
+        "VZ.Projector.ScatterPlot": ["scatterPlot.ts"],
+        "VZ.Projector.ScatterPlotVisualizer3DLabels": ["scatterPlotVisualizer3DLabels.ts"],
+        "VZ.Projector.ScatterPlotVisualizerCanvasLabels": ["scatterPlotVisualizerCanvasLabels.ts"],
+        "VZ.Projector.ScatterPlotVisualizerPolylines": ["scatterPlotVisualizerPolylines.ts"],
+        "VZ.Projector.ScatterPlotVisualizerSprites": ["scatterPlotVisualizerSprites.ts"],
+        "VZ.Projector.ScatterPlotVisualizer": ["scatterPlotVisualizer.ts"],
+        "VZ.Projector.ProjectorScatterPlotAdapter": ["projectorScatterPlotAdapter.ts"],
+        "VZ.Projector.Vector": ["vector.ts"],
+        "VZ.Projector.ProjectorUtil": ["vz-projector-util.ts"],
+        "VZ.Projector.ProjectorBookmarkPanel": ["vz-projector-bookmark-panel.ts"],
+        "VZ.Projector.ProjectorDataPanel": ["vz-projector-data-panel.ts"],
+        "VZ.Projector.ProjectorInput": ["vz-projector-input.ts"],
+        "VZ.Projector.ProjectorInspectorPanel": ["vz-projector-inspector-panel.ts"],
+        "VZ.Projector.ProjectorLegend": ["vz-projector-legend.ts"],
+        "VZ.Projector.ProjectorMetadataCard": ["vz-projector-metadata-card.ts"],
+        "VZ.Projector.ProjectorProjectionsPanel": ["vz-projector-projections-panel.ts"],
+        "VZ.Projector": ["vz-projector.ts"],
+    },
+    namespace_symbol_aliases = {
+        "VZ.Projector.AnalyticsLogger": {
+            "ProjectionType": "VZ.Projector.Data.ProjectionType",
+        },
+        "VZ.Projector.BhTsne": {
+            "SPNode": "VZ.Projector.SPTree.SPNode",
+            "SPTree": "VZ.Projector.SPTree.SPTree",
+        },
+        "VZ.Projector.DataProviderDemo": {
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "ProjectorConfig": "VZ.Projector.DataProvider.ProjectorConfig",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "TENSORS_MSG_ID": "VZ.Projector.DataProvider.TENSORS_MSG_ID",
+            "dataProvider": "VZ.Projector.DataProvider",
+            "logging": "VZ.Projector.Logging",
+        },
+        "VZ.Projector.DataProviderProto": {
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "DataProto": "VZ.Projector.Data.DataProto",
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "PointMetadata": "VZ.Projector.Data.PointMetadata",
+            "ProjectorConfig": "VZ.Projector.DataProvider.ProjectorConfig",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "analyzeMetadata": "VZ.Projector.DataProvider.analyzeMetadata",
+        },
+        "VZ.Projector.DataProviderServer": {
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "ProjectorConfig": "VZ.Projector.DataProvider.ProjectorConfig",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "dataProvider": "VZ.Projector.DataProvider",
+            "logging": "VZ.Projector.Logging",
+        },
+        "VZ.Projector.DataProvider": {
+            "ColumnStats": "VZ.Projector.Data.ColumnStats",
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "PointMetadata": "VZ.Projector.Data.PointMetadata",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "logging": "VZ.Projector.Logging",
+            "runAsyncTask": "VZ.Projector.Util.runAsyncTask",
+        },
+        "VZ.Projector.Data": {
+            "SpriteMetadata": "VZ.Projector.DataProvider.SpriteMetadata",
+            "TSNE": "VZ.Projector.BhTsne.TSNE",
+            "knn": "VZ.Projector.Knn",
+            "logging": "VZ.Projector.Logging",
+            "scatterPlot": "VZ.Projector.ScatterPlot",
+            "util": "VZ.Projector.Util",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector.Knn": {
+            "KMin": "VZ.Projector.Heap.KMin",
+            "Vector": "VZ.Projector.Vector.Vector",
+            "logging": "VZ.Projector.Logging",
+            "runAsyncTask": "VZ.Projector.Util.runAsyncTask",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector.ProjectorEventContext": {
+            "DistanceFunction": "VZ.Projector.Data.DistanceFunction",
+            "NearestEntry": "VZ.Projector.Knn.NearestEntry",
+            "Projection": "VZ.Projector.Data.Projection",
+        },
+        "VZ.Projector.ProjectorScatterPlotAdapter": {
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "DistanceFunction": "VZ.Projector.Data.DistanceFunction",
+            "LabelRenderParams": "VZ.Projector.RenderContext.LabelRenderParams",
+            "NearestEntry": "VZ.Projector.Knn.NearestEntry",
+            "Projection": "VZ.Projector.Data.Projection",
+            "ProjectionComponents3D": "VZ.Projector.Data.ProjectionComponents3D",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "ScatterPlot": "VZ.Projector.ScatterPlot.ScatterPlot",
+            "ScatterPlotVisualizer3DLabels": "VZ.Projector.ScatterPlotVisualizer3DLabels.ScatterPlotVisualizer3DLabels",
+            "ScatterPlotVisualizerCanvasLabels": "VZ.Projector.ScatterPlotVisualizerCanvasLabels.ScatterPlotVisualizerCanvasLabels",
+            "ScatterPlotVisualizerPolylines": "VZ.Projector.ScatterPlotVisualizerPolylines.ScatterPlotVisualizerPolylines",
+            "ScatterPlotVisualizerSprites": "VZ.Projector.ScatterPlotVisualizerSprites.ScatterPlotVisualizerSprites",
+            "State": "VZ.Projector.Data.State",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector.ScatterPlot": {
+            "BoundingBox": "VZ.Projector.ScatterPlotRectangleSelector.BoundingBox",
+            "CameraType": "VZ.Projector.RenderContext.CameraType",
+            "LabelRenderParams": "VZ.Projector.RenderContext.LabelRenderParams",
+            "Point2D": "VZ.Projector.Vector.Point2D",
+            "Point3D": "VZ.Projector.Vector.Point3D",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotRectangleSelector": "VZ.Projector.ScatterPlotRectangleSelector.ScatterPlotRectangleSelector",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizer3DLabels": {
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizerCanvasLabels": {
+            "BoundingBox": "VZ.Projector.Label.BoundingBox",
+            "CameraType": "VZ.Projector.RenderContext.CameraType",
+            "CollisionGrid": "VZ.Projector.Label.CollisionGrid",
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizerPolylines": {
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizerSprites": {
+            "CameraType": "VZ.Projector.RenderContext.CameraType",
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+            "ScatterPlotVisualizer": "VZ.Projector.ScatterPlotVisualizer.ScatterPlotVisualizer",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ScatterPlotVisualizer": {
+            "RenderContext": "VZ.Projector.RenderContext.RenderContext",
+        },
+        "VZ.Projector.Util": {
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "Point2D": "VZ.Projector.Vector.Point2D",
+            "logging": "VZ.Projector.Logging",
+        },
+        "VZ.Projector.Vector": {
+            "assert": "VZ.Projector.Util.assert",
+        },
+        "VZ.Projector.ProjectorBookmarkPanel": {
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projector": "VZ.Projector.Projector",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "State": "VZ.Projector.Data.State",
+            "logging": "VZ.Projector.Logging",
+        },
+        "VZ.Projector.ProjectorDataPanel": {
+            "ColorLegendRenderInfo": "VZ.Projector.ProjectorLegend.ColorLegendRenderInfo",
+            "ColorLegendThreshold": "VZ.Projector.ProjectorLegend.ColorLegendThreshold",
+            "ColorOption": "VZ.Projector.Data.ColorOption",
+            "ColumnStats": "VZ.Projector.Data.ColumnStats",
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projector": "VZ.Projector.Projector",
+            "ProjectorConfig": "VZ.Projector.DataProvider.ProjectorConfig",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "parseRawMetadata": "VZ.Projector.DataProvider.parseRawMetadata",
+            "parseRawTensors": "VZ.Projector.DataProvider.parseRawTensors",
+            "util": "VZ.Projector.Util",
+        },
+        "VZ.Projector.ProjectorInput": {
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+        },
+        "VZ.Projector.ProjectorInspectorPanel": {
+            "DistanceFunction": "VZ.Projector.Data.DistanceFunction",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projector": "VZ.Projector.Projector",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "ProjectorInput": "VZ.Projector.ProjectorInput.ProjectorInput",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "adapter": "VZ.Projector.ProjectorScatterPlotAdapter",
+            "knn": "VZ.Projector.Knn",
+            "util": "VZ.Projector.Util",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector.ProjectorLegend": {
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+        },
+        "VZ.Projector.ProjectorMetadataCard": {
+            "PointMetadata": "VZ.Projector.Data.PointMetadata",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+        },
+        "VZ.Projector.ProjectorProjectionsPanel": {
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projection": "VZ.Projector.Data.Projection",
+            "ProjectionType": "VZ.Projector.Data.ProjectionType",
+            "Projector": "VZ.Projector.Projector",
+            "ProjectorInput": "VZ.Projector.ProjectorInput.ProjectorInput",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "Vector": "VZ.Projector.Vector.Vector",
+            "data": "VZ.Projector.Data",
+            "util": "VZ.Projector.Util",
+            "vector": "VZ.Projector.Vector",
+        },
+        "VZ.Projector": {
+            "AnalyticsLogger": "VZ.Projector.AnalyticsLogger.AnalyticsLogger",
+            "BookmarkPanel": "VZ.Projector.ProjectorBookmarkPanel.BookmarkPanel",
+            "ColorOption": "VZ.Projector.Data.ColorOption",
+            "ColumnStats": "VZ.Projector.Data.ColumnStats",
+            "DataPanel": "VZ.Projector.ProjectorDataPanel.DataPanel",
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "DataProto": "VZ.Projector.Data.DataProto",
+            "DataProvider": "VZ.Projector.DataProvider.DataProvider",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "DemoDataProvider": "VZ.Projector.DataProviderDemo.DemoDataProvider",
+            "DistanceFunction": "VZ.Projector.Data.DistanceFunction",
+            "DistanceMetricChangedListener": "VZ.Projector.ProjectorEventContext.DistanceMetricChangedListener",
+            "EmbeddingInfo": "VZ.Projector.DataProvider.EmbeddingInfo",
+            "HoverListener": "VZ.Projector.ProjectorEventContext.HoverListener",
+            "InspectorPanel": "VZ.Projector.ProjectorInspectorPanel.InspectorPanel",
+            "MetadataCard": "VZ.Projector.ProjectorMetadataCard.MetadataCard",
+            "MouseMode": "VZ.Projector.ScatterPlot.MouseMode",
+            "PointMetadata": "VZ.Projector.Data.PointMetadata",
+            "PolymerElement": "VZ.Projector.ProjectorUtil.PolymerElement",
+            "PolymerHTMLElement": "VZ.Projector.ProjectorUtil.PolymerHTMLElement",
+            "Projection": "VZ.Projector.Data.Projection",
+            "ProjectionChangedListener": "VZ.Projector.ProjectorEventContext.ProjectionChangedListener",
+            "ProjectionsPanel": "VZ.Projector.ProjectorProjectionsPanel.ProjectionsPanel",
+            "ProjectorEventContext": "VZ.Projector.ProjectorEventContext.ProjectorEventContext",
+            "ProjectorScatterPlotAdapter": "VZ.Projector.ProjectorScatterPlotAdapter.ProjectorScatterPlotAdapter",
+            "ProtoDataProvider": "VZ.Projector.DataProviderProto.ProtoDataProvider",
+            "SelectionChangedListener": "VZ.Projector.ProjectorEventContext.SelectionChangedListener",
+            "ServerDataProvider": "VZ.Projector.DataProviderServer.ServerDataProvider",
+            "ServingMode": "VZ.Projector.DataProvider.ServingMode",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "data": "VZ.Projector.Data",
+            "knn": "VZ.Projector.Knn",
+            "logging": "VZ.Projector.Logging",
+            "stateGetAccessorDimensions": "VZ.Projector.Data.stateGetAccessorDimensions",
+            "util": "VZ.Projector.Util",
+        },
+    },
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+#### Legacy for other consumers
+load(
+    "//tensorflow/tensorboard:defs.bzl",
+    "tensorboard_webcomponent_library",
+    "tensorboard_ts_library",
+    "tensorboard_ts_declaration",
+)
+
+# Standalone embedding projector demos should depend on this target. We
+# exclude the HTML file for the dashboard itself. Demos do not need that
+# HTML file. This was introduced because standalone demos as of today
+# have an additional Closure pass that uses a compilation configuration
+# stricter than that of TensorBoard.
+
+_PROJECTOR_LIB_TS_LIB_DEPS = [
+    ":ts_lib",
+    ":tsne_ts_lib",
+]
+
+_PROJECTOR_DESTDIR = "vz-projector"
+
+_PROJECTOR_LIB_DEPS = [
+    "//third_party/javascript/polymer/v1/iron-collapse:lib",
+    "//third_party/javascript/polymer/v1/iron-icons:lib",
+    "//third_party/javascript/polymer/v1/paper-button:lib",
+    "//third_party/javascript/polymer/v1/paper-checkbox:lib",
+    "//third_party/javascript/polymer/v1/paper-dialog:lib",
+    "//third_party/javascript/polymer/v1/paper-dialog-scrollable:lib",
+    "//third_party/javascript/polymer/v1/paper-dropdown-menu:lib",
+    "//third_party/javascript/polymer/v1/paper-icon-button:lib",
+    "//third_party/javascript/polymer/v1/paper-input:lib",
+    "//third_party/javascript/polymer/v1/paper-item:lib",
+    "//third_party/javascript/polymer/v1/paper-listbox:lib",
+    "//third_party/javascript/polymer/v1/paper-slider:lib",
+    "//third_party/javascript/polymer/v1/paper-spinner:lib",
+    "//third_party/javascript/polymer/v1/paper-toast:lib",
+    "//third_party/javascript/polymer/v1/paper-toggle-button:lib",
+    "//third_party/javascript/polymer/v1/paper-tooltip:lib",
+    "//third_party/javascript/polymer/v1/polymer:lib",
+]
+
+tensorboard_ts_library(
+    name = "tsne_ts_lib",
+    srcs = [
+        "bh_tsne.ts",
+        "sptree.ts",
+    ],
+)
+
+tensorboard_ts_declaration(
+    name = "external",
+    srcs = ["external.d.ts"],
+)
+
+tensorboard_ts_library(
+    name = "ts_lib",
+    srcs = glob(
+        ["*.ts"],
+        exclude = [
+            "*.d.ts",
+            "*_test.ts",
+            "bh_tsne.ts",
+            "sptree.ts",
+        ],
+    ),
+    runtime_deps = [
+        "//third_party/javascript/d3/v4:d3",
+        "//third_party/javascript/numericjs",
+        "//third_party/javascript/threejs/r77:threejs",
+        "//third_party/javascript/threejs/r77/examples/js/controls:orbitcontrols",
+        "//third_party/javascript/weblas",
+    ],
+    deps = [
+        ":external",
+        ":tsne_ts_lib",
+        "//third_party/javascript/node_modules/typescript:es2015.promise",
+        "//third_party/javascript/typings/d3_v4:bundle",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+        "//third_party/javascript/typings/threejs:three",
+        "//third_party/javascript/typings/webcomponents_js",
+    ],
+)
+
+tensorboard_webcomponent_library(
+    name = "lib",
+    srcs = glob(
+        ["*.html"],
+        exclude = ["vz-projector-dashboard.html"],
+    ),
+    ts_lib_deps = _PROJECTOR_LIB_TS_LIB_DEPS,
+    destdir = _PROJECTOR_DESTDIR,
+    deps = _PROJECTOR_LIB_DEPS,
+)
diff --git a/tensorflow/tensorboard/components/vz_projector/analyticsLogger.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/analyticsLogger.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/analyticsLogger.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/analyticsLogger.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/bh_tsne.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/bh_tsne.ts
similarity index 99%
rename from tensorflow/tensorboard/components/vz_projector/bh_tsne.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/bh_tsne.ts
index 9d2df65f56088b0f6a9a08d36f37f51cd96ac99b..063d57ec401d196827ce978dc64d4121a9c5edb3 100644
--- a/tensorflow/tensorboard/components/vz_projector/bh_tsne.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/bh_tsne.ts
@@ -22,6 +22,7 @@ limitations under the License.
  */
 
 /**
+ * @license
  * The MIT License (MIT)
  * Copyright (c) 2015 Andrej Karpathy
  * Permission is hereby granted, free of charge, to any person obtaining a copy
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/bundle.html b/tensorflow/tensorboard/components/vz_projector_d3v4/bundle.html
new file mode 100644
index 0000000000000000000000000000000000000000..2837fed870832c20310945914f41f6c5047f5f5f
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/bundle.html
@@ -0,0 +1,24 @@
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-imports/d3.html">
+<link rel="import" href="../tf-imports/numericjs.html">
+<link rel="import" href="../tf-imports/threejs.html">
+<link rel="import" href="../tf-imports/weblas.html">
+
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider-demo.ts
similarity index 78%
rename from tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/data-provider-demo.ts
index 57e549c2f231a9798d0eb24e2d0a5a894ee45d75..1410a84a8e4ee844eab76eb0c7d55aec70b9774f 100644
--- a/tensorflow/tensorboard/components/vz_projector/data-provider-demo.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider-demo.ts
@@ -46,23 +46,27 @@ export class DemoDataProvider implements DataProvider {
 
   retrieveProjectorConfig(run: string, callback: (d: ProjectorConfig) => void)
       : void {
-    let msgId = logging.setModalMessage('Fetching projector config...');
-    d3.json(this.projectorConfigPath, (err, projectorConfig) => {
-      if (err) {
-        let errorMessage = err;
-        // If the error is a valid XMLHttpResponse, it's possible this is a
-        // cross-origin error.
-        if (err.responseText != null) {
-          errorMessage = 'Cannot fetch projector config, possibly a ' +
-              'Cross-Origin request error.';
-        }
-        logging.setErrorMessage(errorMessage, 'fetching projector config');
-        return;
+    const msgId = logging.setModalMessage('Fetching projector config...');
+
+    const xhr = new XMLHttpRequest();
+    xhr.open('GET', this.projectorConfigPath);
+    xhr.onerror = (err) => {
+      let errorMessage = err.message;
+      // If the error is a valid XMLHttpResponse, it's possible this is a
+      // cross-origin error.
+      if (xhr.responseText != null) {
+        errorMessage = 'Cannot fetch projector config, possibly a ' +
+            'Cross-Origin request error.';
       }
+      logging.setErrorMessage(errorMessage, 'fetching projector config');
+    };
+    xhr.onload = () => {
+      const projectorConfig = JSON.parse(xhr.responseText) as ProjectorConfig;
       logging.setModalMessage(null, msgId);
       this.projectorConfig = projectorConfig;
       callback(projectorConfig);
-    });
+    };
+    xhr.send();
   }
 
   retrieveTensor(run: string, tensorName: string,
@@ -107,14 +111,17 @@ export class DemoDataProvider implements DataProvider {
       run: string, tensorName: string, callback: (r: State[]) => void) {
     let embedding = this.getEmbeddingInfo(tensorName);
     let msgId = logging.setModalMessage('Fetching bookmarks...');
-    d3.json(embedding.bookmarksPath, (err, bookmarks: State[]) => {
-      if (err) {
-        logging.setErrorMessage(err.responseText);
-        return;
-      }
 
+    const xhr = new XMLHttpRequest();
+    xhr.open('GET', embedding.bookmarksPath);
+    xhr.onerror = (err) => {
+      logging.setErrorMessage(xhr.responseText);
+    };
+    xhr.onload = () => {
+      const bookmarks = JSON.parse(xhr.responseText) as State[];
       logging.setModalMessage(null, msgId);
       callback(bookmarks);
-    });
+    };
+    xhr.send();
   }
 }
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider-proto.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider-proto.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/data-provider-proto.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/data-provider-proto.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider-server.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider-server.ts
similarity index 72%
rename from tensorflow/tensorboard/components/vz_projector/data-provider-server.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/data-provider-server.ts
index ff535468de708effc62d115a3b2fa5aed3cb5cb5..02720ebf6a7cbbf68de64070fd9e9293dbc7300a 100644
--- a/tensorflow/tensorboard/components/vz_projector/data-provider-server.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider-server.ts
@@ -36,9 +36,9 @@ export class ServerDataProvider implements DataProvider {
   private getEmbeddingInfo(run: string, tensorName: string,
       callback: (e: EmbeddingInfo) => void): void {
     this.retrieveProjectorConfig(run, config => {
-      let embeddings = config.embeddings;
+      const embeddings = config.embeddings;
       for (let i = 0; i < embeddings.length; i++) {
-        let embedding = embeddings[i];
+        const embedding = embeddings[i];
         if (embedding.tensorName === tensorName) {
           callback(embedding);
           return;
@@ -49,15 +49,19 @@ export class ServerDataProvider implements DataProvider {
   }
 
   retrieveRuns(callback: (runs: string[]) => void): void {
-    let msgId = logging.setModalMessage('Fetching runs...');
-    d3.json(`${this.routePrefix}/runs`, (err, runs: string[]) => {
-      if (err) {
-        logging.setErrorMessage(err.responseText, 'fetching runs');
-        return;
-      }
+    const msgId = logging.setModalMessage('Fetching runs...');
+
+    const xhr = new XMLHttpRequest();
+    xhr.open('GET', `${this.routePrefix}/runs`);
+    xhr.onerror = (err) => {
+      logging.setErrorMessage(xhr.responseText, 'fetching runs');
+    };
+    xhr.onload = () => {
+      const runs = JSON.parse(xhr.responseText);
       logging.setModalMessage(null, msgId);
       callback(runs);
-    });
+    };
+    xhr.send();
   }
 
   retrieveProjectorConfig(run: string, callback: (d: ProjectorConfig) => void)
@@ -67,17 +71,20 @@ export class ServerDataProvider implements DataProvider {
       return;
     }
 
-    let msgId = logging.setModalMessage('Fetching projector config...');
-    d3.json(`${this.routePrefix}/info?run=${run}`, (err,
-        config: ProjectorConfig) => {
-      if (err) {
-        logging.setErrorMessage(err.responseText, 'fetching projector config');
-        return;
-      }
+    const msgId = logging.setModalMessage('Fetching projector config...');
+
+    const xhr = new XMLHttpRequest();
+    xhr.open('GET', `${this.routePrefix}/info?run=${run}`);
+    xhr.onerror = (err) => {
+      logging.setErrorMessage(xhr.responseText, 'fetching projector config');
+    };
+    xhr.onload = () => {
+      const config = JSON.parse(xhr.responseText) as ProjectorConfig;
       logging.setModalMessage(null, msgId);
       this.runProjectorConfigCache[run] = config;
       callback(config);
-    });
+    };
+    xhr.send();
   }
 
   retrieveTensor(run: string, tensorName: string,
@@ -112,14 +119,19 @@ export class ServerDataProvider implements DataProvider {
 
   getBookmarks(
       run: string, tensorName: string, callback: (r: State[]) => void) {
-    let msgId = logging.setModalMessage('Fetching bookmarks...');
-    d3.json(
-        `${this.routePrefix}/bookmarks?run=${run}&name=${tensorName}`,
-        (err, bookmarks: State[]) => {
-          logging.setModalMessage(null, msgId);
-          if (!err) {
-            callback(bookmarks);
-          }
-        });
+    const msgId = logging.setModalMessage('Fetching bookmarks...');
+
+    const xhr = new XMLHttpRequest();
+    xhr.open(
+        'GET', `${this.routePrefix}/bookmarks?run=${run}&name=${tensorName}`);
+    xhr.onerror = (err) => {
+      logging.setErrorMessage(xhr.responseText, 'fetching bookmarks');
+    };
+    xhr.onload = () => {
+      logging.setModalMessage(null, msgId);
+      const bookmarks = JSON.parse(xhr.responseText);
+      callback(bookmarks);
+    };
+    xhr.send();
   }
 }
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider.ts
similarity index 94%
rename from tensorflow/tensorboard/components/vz_projector/data-provider.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/data-provider.ts
index 3acc5a4374a5ff0f754749bbfcfa74450de9b976..c8eede798c670372e334e4a89e677162055d397e 100644
--- a/tensorflow/tensorboard/components/vz_projector/data-provider.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/data-provider.ts
@@ -203,7 +203,7 @@ export function parseTensors(
   logging.setModalMessage('Parsing tensors...', TENSORS_MSG_ID);
 
   return new Promise<DataPoint[]>((resolve, reject) => {
-    let data: DataPoint[] = [];
+    const data: DataPoint[] = [];
     let numDim: number;
 
     streamParse(content, (line: string) => {
@@ -211,8 +211,8 @@ export function parseTensors(
       if (line === '') {
         return;
       }
-      let row = line.split(valueDelim);
-      let dataPoint: DataPoint = {
+      const row = line.split(valueDelim);
+      const dataPoint: DataPoint = {
         metadata: {},
         vector: null,
         index: data.length,
@@ -250,8 +250,8 @@ export function parseTensors(
 export function parseTensorsFromFloat32Array(data: Float32Array,
     dim: number): Promise<DataPoint[]> {
   return runAsyncTask('Parsing tensors...', () => {
-    let N = data.length / dim;
-    let dataPoints: DataPoint[] = [];
+    const N = data.length / dim;
+    const dataPoints: DataPoint[] = [];
     let offset = 0;
     for (let i = 0; i < N; ++i) {
       dataPoints.push({
@@ -271,7 +271,7 @@ export function parseTensorsFromFloat32Array(data: Float32Array,
 
 export function analyzeMetadata(
     columnNames, pointsMetadata: PointMetadata[]): ColumnStats[] {
-  let columnStats: ColumnStats[] = columnNames.map(name => {
+  const columnStats: ColumnStats[] = columnNames.map(name => {
     return {
       name: name,
       isNumeric: true,
@@ -280,12 +280,15 @@ export function analyzeMetadata(
       max: Number.NEGATIVE_INFINITY
     };
   });
-  let mapOfValues = columnNames.map(() => d3.map<number>());
+
+  const mapOfValues: [{[value: string]: number}] =
+      columnNames.map(() => new Object());
+
   pointsMetadata.forEach(metadata => {
     columnNames.forEach((name: string, colIndex: number) => {
-      let stats = columnStats[colIndex];
-      let map = mapOfValues[colIndex];
-      let value = metadata[name];
+      const stats = columnStats[colIndex];
+      const map = mapOfValues[colIndex];
+      const value = metadata[name];
 
       // Skip missing values.
       if (value == null) {
@@ -293,12 +296,12 @@ export function analyzeMetadata(
       }
 
       if (!stats.tooManyUniqueValues) {
-        if (map.has(value)) {
-          map.set(value, map.get(value) + 1);
+        if (value in map) {
+          map[value]++;
         } else {
-          map.set(value, 1);
+          map[value] = 1;
         }
-        if (map.size() > NUM_COLORS_COLOR_MAP) {
+        if (Object.keys(map).length > NUM_COLORS_COLOR_MAP) {
           stats.tooManyUniqueValues = true;
         }
       }
@@ -312,8 +315,8 @@ export function analyzeMetadata(
     });
   });
   columnStats.forEach((stats, colIndex) => {
-    stats.uniqueEntries = mapOfValues[colIndex].entries().map(e => {
-      return {label: e.key, count: e.value};
+    stats.uniqueEntries = Object.keys(mapOfValues[colIndex]).map(label => {
+      return {label, count: mapOfValues[colIndex][label]};
     });
   });
   return columnStats;
@@ -409,7 +412,7 @@ export function retrieveSpriteAndMetadataInfo(metadataPath: string,
     if (spriteMsgId) {
       logging.setModalMessage(null, spriteMsgId);
     }
-    let [metadata, spriteImage] = values;
+    const [metadata, spriteImage] = values;
 
     if (spriteImage && (spriteImage.height > MAX_SPRITE_IMAGE_SIZE_PX ||
                         spriteImage.width > MAX_SPRITE_IMAGE_SIZE_PX)) {
diff --git a/tensorflow/tensorboard/components/vz_projector/data.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/data.ts
similarity index 98%
rename from tensorflow/tensorboard/components/vz_projector/data.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/data.ts
index 701e28274b3a2842f84156ad3487f0d28067e225..c4e81985fc84ee17d6daaa337b589a32cec8cfc6 100644
--- a/tensorflow/tensorboard/components/vz_projector/data.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/data.ts
@@ -125,7 +125,7 @@ export class DataSet {
    * This keeps a list of all current projections so you can easily test to see
    * if it's been calculated already.
    */
-  projections = d3.set();
+  projections: {[projection: string]: boolean} = {};
   nearest: knn.NearestEntry[][];
   nearestK: number;
   tSNEIteration: number = 0;
@@ -141,7 +141,7 @@ export class DataSet {
   constructor(
       points: DataPoint[], spriteAndMetadataInfo?: SpriteAndMetadataInfo) {
     this.points = points;
-    this.shuffledDataIndices = util.shuffle(d3.range(this.points.length));
+    this.shuffledDataIndices = util.shuffle(util.range(this.points.length));
     this.sequences = this.computeSequences(points);
     this.dim = [this.points.length, this.points[0].vector.length];
     this.spriteAndMetadataInfo = spriteAndMetadataInfo;
@@ -241,7 +241,7 @@ export class DataSet {
 
   /** Projects the dataset onto a given vector and caches the result. */
   projectLinear(dir: vector.Vector, label: string) {
-    this.projections.add(label);
+    this.projections[label] = true;
     this.points.forEach(dataPoint => {
       dataPoint.projections[label] = vector.dot(dataPoint.vector, dir);
     });
@@ -249,7 +249,7 @@ export class DataSet {
 
   /** Projects the dataset along the top 10 principal components. */
   projectPCA(): Promise<void> {
-    if (this.projections.has('pca-0')) {
+    if (this.projections['pca-0'] != null) {
       return Promise.resolve<void>(null);
     }
     return util.runAsyncTask('Computing PCA...', () => {
@@ -290,7 +290,7 @@ export class DataSet {
       });
       for (let d = 0; d < NUM_PCA_COMPONENTS; d++) {
         let label = 'pca-' + d;
-        this.projections.add(label);
+        this.projections[label] = true;
         for (let i = 0; i < pcaVectors.length; i++) {
           let pointIndex = this.shuffledDataIndices[i];
           this.points[pointIndex].projections[label] = pcaVectors[i][d];
diff --git a/tensorflow/tensorboard/components/vz_projector/external.d.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/external.d.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/external.d.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/external.d.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/heap.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/heap.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/heap.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/heap.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/knn.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/knn.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/knn.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/knn.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/label.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/label.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/label.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/label.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/logging.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/logging.ts
similarity index 84%
rename from tensorflow/tensorboard/components/vz_projector/logging.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/logging.ts
index b51b70265305e47951b8730ea188cad7a8634924..59f3720601236453134f0e4cdf0448b6cb72d644 100644
--- a/tensorflow/tensorboard/components/vz_projector/logging.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/logging.ts
@@ -53,19 +53,22 @@ export function setModalMessage(
   dialog.querySelector('#notification-title').innerHTML = title;
   let msgsContainer = dialog.querySelector('#notify-msgs') as HTMLElement;
   if (isErrorMsg) {
-    d3.select(msgsContainer).html('');
+    msgsContainer.innerHTML = '';
   } else {
-    d3.select(msgsContainer).selectAll('.error').remove();
+    const errors = msgsContainer.querySelectorAll('.error');
+    for (let i = 0; i < errors.length; i++) {
+      msgsContainer.removeChild(errors[i]);
+    }
   }
   let divId = `notify-msg-${id}`;
-  let msgDiv = d3.select(dialog.querySelector('#' + divId));
-  let exists = msgDiv.size() > 0;
-  if (!exists) {
-    msgDiv = d3.select(msgsContainer)
-                 .insert('div', ':first-child')
-                 .attr('class', 'notify-msg')
-                 .classed('error', isErrorMsg)
-                 .attr('id', divId);
+  let msgDiv = dialog.querySelector('#' + divId) as HTMLDivElement;
+  if (msgDiv == null) {
+    msgDiv = document.createElement('div');
+    msgDiv.className = 'notify-msg ' + (isErrorMsg ? 'error' : '');
+    msgDiv.id = divId;
+
+    msgsContainer.insertBefore(msgDiv, msgsContainer.firstChild);
+
     if (!isErrorMsg) {
       numActiveMessages++;
     } else {
@@ -79,7 +82,7 @@ export function setModalMessage(
     }
     msgDiv.remove();
   } else {
-    msgDiv.text(msg);
+    msgDiv.innerText = msg;
     dialog.open();
   }
   return id;
diff --git a/tensorflow/tensorboard/components/vz_projector/projectorEventContext.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/projectorEventContext.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/projectorEventContext.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/projectorEventContext.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/projectorScatterPlotAdapter.ts
similarity index 97%
rename from tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/projectorScatterPlotAdapter.ts
index 2bf63eebddfd4447b545f7258f08f3fd05069d25..9d6df953d65c57b60c88c3b744342123b61fe5bf 100644
--- a/tensorflow/tensorboard/components/vz_projector/projectorScatterPlotAdapter.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/projectorScatterPlotAdapter.ts
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
+
 import {DataSet, DistanceFunction, Projection, ProjectionComponents3D, State} from './data';
 import {NearestEntry} from './knn';
 import {ProjectorEventContext} from './projectorEventContext';
@@ -65,7 +67,7 @@ const SCATTER_PLOT_CUBE_LENGTH = 2;
 
 /** Color scale for nearest neighbors. */
 const NN_COLOR_SCALE =
-    d3.scale.linear<string>()
+    d3.scaleLinear<string, string>()
         .domain([1, 0.7, 0.4])
         .range(['hsl(285, 80%, 40%)', 'hsl(0, 80%, 65%)', 'hsl(40, 70%, 60%)'])
         .clamp(true);
@@ -76,7 +78,6 @@ const NN_COLOR_SCALE =
  */
 export class ProjectorScatterPlotAdapter {
   public scatterPlot: ScatterPlot;
-  private scatterPlotContainer: d3.Selection<any>;
   private projection: Projection;
   private hoverPointIndex: number;
   private selectedPointIndices: number[];
@@ -92,11 +93,10 @@ export class ProjectorScatterPlotAdapter {
   private polylineVisualizer: ScatterPlotVisualizerPolylines;
 
   constructor(
-      scatterPlotContainer: d3.Selection<any>,
+      private scatterPlotContainer: HTMLElement,
       projectorEventContext: ProjectorEventContext) {
     this.scatterPlot =
         new ScatterPlot(scatterPlotContainer, projectorEventContext);
-    this.scatterPlotContainer = scatterPlotContainer;
     projectorEventContext.registerProjectionChangedListener(projection => {
       this.projection = projection;
       this.updateScatterPlotWithNewProjection(projection);
@@ -247,9 +247,9 @@ export class ProjectorScatterPlotAdapter {
       return null;
     }
 
-    const xScaler: d3.scale.Linear<number, number> = d3.scale.linear();
-    const yScaler: d3.scale.Linear<number, number> = d3.scale.linear();
-    let zScaler: d3.scale.Linear<number, number> = null;
+    const xScaler = d3.scaleLinear();
+    const yScaler = d3.scaleLinear();
+    let zScaler = null;
     {
       // Determine max and min of each axis of our data.
       const xExtent = d3.extent(
@@ -269,7 +269,7 @@ export class ProjectorScatterPlotAdapter {
         const zExtent = d3.extent(
             ds.points,
             (p, i) => ds.points[i].projections[projectionComponents[2]]);
-        zScaler = d3.scale.linear();
+        zScaler = d3.scaleLinear();
         zScaler.domain(zExtent).range(range);
       }
     }
@@ -378,8 +378,8 @@ export class ProjectorScatterPlotAdapter {
     }
 
     return new LabelRenderParams(
-        visibleLabels, labelStrings, scale, opacityFlags, LABEL_FONT_SIZE,
-        fillColors, strokeColors);
+        new Float32Array(visibleLabels), labelStrings, scale, opacityFlags,
+        LABEL_FONT_SIZE, fillColors, strokeColors);
   }
 
   generatePointScaleFactorArray(
diff --git a/tensorflow/tensorboard/components/vz_projector/renderContext.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/renderContext.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/renderContext.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/renderContext.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlot.ts
similarity index 95%
rename from tensorflow/tensorboard/components/vz_projector/scatterPlot.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlot.ts
index b40863cead1555c7d397caada96ba742b36c15c1..283b608e836b934c81ae25891ac257b0ad7c1193 100644
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlot.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlot.ts
@@ -72,9 +72,6 @@ export class CameraDef {
  * array of visualizers and dispatches application events to them.
  */
 export class ScatterPlot {
-  private projectorEventContext: ProjectorEventContext;
-
-  private containerNode: HTMLElement;
   private visualizers: ScatterPlotVisualizer[] = [];
 
   private onCameraMoveListeners: OnCameraMoveListener[] = [];
@@ -113,17 +110,15 @@ export class ScatterPlot {
   private rectangleSelector: ScatterPlotRectangleSelector;
 
   constructor(
-      container: d3.Selection<any>,
-      projectorEventContext: ProjectorEventContext) {
-    this.containerNode = container.node() as HTMLElement;
-    this.projectorEventContext = projectorEventContext;
+      private container: HTMLElement,
+      private projectorEventContext: ProjectorEventContext) {
     this.getLayoutValues();
 
     this.scene = new THREE.Scene();
     this.renderer = new THREE.WebGLRenderer(
         {alpha: true, premultipliedAlpha: false, antialias: false});
     this.renderer.setClearColor(BACKGROUND_COLOR, 1);
-    this.containerNode.appendChild(this.renderer.domElement);
+    this.container.appendChild(this.renderer.domElement);
     this.light = new THREE.PointLight(0xFFECBF, 1, 0);
     this.scene.add(this.light);
 
@@ -132,18 +127,16 @@ export class ScatterPlot {
     this.renderer.render(this.scene, this.camera);
 
     this.rectangleSelector = new ScatterPlotRectangleSelector(
-        this.containerNode,
+        this.container,
         (boundingBox: BoundingBox) => this.selectBoundingBox(boundingBox));
     this.addInteractionListeners();
   }
 
   private addInteractionListeners() {
-    this.containerNode.addEventListener(
-        'mousemove', this.onMouseMove.bind(this));
-    this.containerNode.addEventListener(
-        'mousedown', this.onMouseDown.bind(this));
-    this.containerNode.addEventListener('mouseup', this.onMouseUp.bind(this));
-    this.containerNode.addEventListener('click', this.onClick.bind(this));
+    this.container.addEventListener('mousemove', this.onMouseMove.bind(this));
+    this.container.addEventListener('mousedown', this.onMouseDown.bind(this));
+    this.container.addEventListener('mouseup', this.onMouseUp.bind(this));
+    this.container.addEventListener('click', this.onClick.bind(this));
     window.addEventListener('keydown', this.onKeyDown.bind(this), false);
     window.addEventListener('keyup', this.onKeyUp.bind(this), false);
   }
@@ -356,7 +349,7 @@ export class ScatterPlot {
     // If shift is pressed, start selecting
     if (e.keyCode === SHIFT_KEY) {
       this.selecting = true;
-      this.containerNode.style.cursor = 'crosshair';
+      this.container.style.cursor = 'crosshair';
     }
   }
 
@@ -371,7 +364,7 @@ export class ScatterPlot {
     if (e.keyCode === SHIFT_KEY) {
       this.selecting = (this.getMouseMode() === MouseMode.AREA_SELECT);
       if (!this.selecting) {
-        this.containerNode.style.cursor = 'default';
+        this.container.style.cursor = 'default';
       }
       this.render();
     }
@@ -441,8 +434,8 @@ export class ScatterPlot {
   }
 
   private getLayoutValues(): Point2D {
-    this.width = this.containerNode.offsetWidth;
-    this.height = Math.max(1, this.containerNode.offsetHeight);
+    this.width = this.container.offsetWidth;
+    this.height = Math.max(1, this.container.offsetHeight);
     return [this.width, this.height];
   }
 
@@ -623,10 +616,10 @@ export class ScatterPlot {
     this.mouseMode = mouseMode;
     if (mouseMode === MouseMode.AREA_SELECT) {
       this.selecting = true;
-      this.containerNode.style.cursor = 'crosshair';
+      this.container.style.cursor = 'crosshair';
     } else {
       this.selecting = false;
-      this.containerNode.style.cursor = 'default';
+      this.container.style.cursor = 'default';
     }
   }
 
@@ -668,9 +661,11 @@ export class ScatterPlot {
   }
 
   setDayNightMode(isNight: boolean) {
-    d3.select(this.containerNode)
-        .selectAll('canvas')
-        .style('filter', isNight ? 'invert(100%)' : null);
+    const canvases = this.container.querySelectorAll('canvas');
+    const filterValue = isNight ? 'invert(100%)' : null;
+    for (let i = 0; i < canvases.length; i++) {
+      canvases[i].style.filter = filterValue;
+    }
   }
 
   resize(render = true) {
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotRectangleSelector.ts
similarity index 70%
rename from tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotRectangleSelector.ts
index a2dba9dd257007184402306c7333e1aa092bb097..a781877014edfcf40746c7bd72b4e8fc0cfd2e47 100644
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotRectangleSelector.ts
@@ -32,8 +32,8 @@ export interface BoundingBox {
  * A class that manages and renders a data selection rectangle.
  */
 export class ScatterPlotRectangleSelector {
-  private svgElement: d3.Selection<any>;
-  private rectElement: d3.Selection<any>;
+  private svgElement: SVGElement;
+  private rectElement: SVGRectElement;
 
   private isMouseDown: boolean;
   private startCoordinates: [number, number];
@@ -51,20 +51,23 @@ export class ScatterPlotRectangleSelector {
   constructor(
       container: HTMLElement,
       selectionCallback: (boundingBox: BoundingBox) => void) {
-    this.svgElement = d3.select(container).select('#selector');
-    this.rectElement = this.svgElement.append('rect')
-                           .style('stroke', STROKE)
-                           .style('stroke-dasharray', STROKE_DASHARRAY)
-                           .style('stroke-width', STROKE_WIDTH)
-                           .style('fill', FILL)
-                           .style('fill-opacity', FILL_OPACITY);
+    this.svgElement = container.querySelector('#selector') as SVGElement;
+    this.rectElement =
+        document.createElementNS('http://www.w3.org/2000/svg', 'rect');
+    this.rectElement.style.stroke = STROKE;
+    this.rectElement.style.strokeDasharray = STROKE_DASHARRAY;
+    this.rectElement.style.strokeWidth = '' + STROKE_WIDTH;
+    this.rectElement.style.fill = FILL;
+    this.rectElement.style.fillOpacity = '' + FILL_OPACITY;
+    this.svgElement.appendChild(this.rectElement);
+
     this.selectionCallback = selectionCallback;
     this.isMouseDown = false;
   }
 
   onMouseDown(offsetX: number, offsetY: number) {
     this.isMouseDown = true;
-    this.svgElement.style('display', 'block');
+    this.rectElement.style.display = 'block';
 
     this.startCoordinates = [offsetX, offsetY];
     this.lastBoundingBox = {
@@ -87,19 +90,18 @@ export class ScatterPlotRectangleSelector {
     this.lastBoundingBox.height =
         this.lastBoundingBox.y - Math.min(offsetY, this.startCoordinates[1]);
 
-    this.rectElement.attr({
-      x: this.lastBoundingBox.x,
-      y: this.lastBoundingBox.y - this.lastBoundingBox.height,
-      width: this.lastBoundingBox.width,
-      height: this.lastBoundingBox.height
-    });
+    this.rectElement.setAttribute('x', '' + this.lastBoundingBox.x);
+    this.rectElement.setAttribute(
+        'y', '' + (this.lastBoundingBox.y - this.lastBoundingBox.height));
+    this.rectElement.setAttribute('width', '' + this.lastBoundingBox.width);
+    this.rectElement.setAttribute('height', '' + this.lastBoundingBox.height);
   }
 
   onMouseUp() {
     this.isMouseDown = false;
-    this.svgElement.style('display', 'none');
-    this.rectElement.attr('width', 0);
-    this.rectElement.attr('height', 0);
+    this.rectElement.style.display = 'none';
+    this.rectElement.setAttribute('width', '0');
+    this.rectElement.setAttribute('height', '0');
     this.selectionCallback(this.lastBoundingBox);
   }
 }
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizer.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizer.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizer3DLabels.ts
similarity index 99%
rename from tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizer3DLabels.ts
index cbd9785e2f6eb5668741e4220de2b573b835052f..7820af0d48dd0e5876644d27ae6c3092134dbe62 100644
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizer3DLabels.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizer3DLabels.ts
@@ -38,7 +38,7 @@ const VERTICES_PER_GLYPH = 2 * 3;  // 2 triangles, 3 verts per triangle
  *            bottom center of the word is positioned at (0, 0);
  *    position: The position of the label in worldspace.
  *    vUv: The (u, v) coordinates that index into the glyphs sheet (range 0, 1.)
- *    color: The color of the label (matches the cooresponding point's color.)
+ *    color: The color of the label (matches the corresponding point's color.)
  *    wordShown: Boolean. Whether or not the label is visible.
  */
 
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerCanvasLabels.ts
similarity index 91%
rename from tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerCanvasLabels.ts
index 04603e0fba8dd63c033972ec65646d94eb77b005..ece4d84ef28334d93b69925cecc193a98a713851 100644
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerCanvasLabels.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerCanvasLabels.ts
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
 import {BoundingBox, CollisionGrid} from './label';
 import {CameraType, RenderContext} from './renderContext';
 import {ScatterPlotVisualizer} from './scatterPlotVisualizer';
@@ -33,10 +34,14 @@ export class ScatterPlotVisualizerCanvasLabels implements
   private canvas: HTMLCanvasElement;
   private labelsActive: boolean = true;
 
-  constructor(container: d3.Selection<any>) {
-    this.canvas = container.append('canvas').node() as HTMLCanvasElement;
+  constructor(container: HTMLElement) {
+    this.canvas = document.createElement('canvas');
+    container.appendChild(this.canvas);
+
     this.gc = this.canvas.getContext('2d');
-    d3.select(this.canvas).style({position: 'absolute', left: 0, top: 0});
+    this.canvas.style.position = 'absolute';
+    this.canvas.style.left = '0';
+    this.canvas.style.top = '0';
     this.canvas.style.pointerEvents = 'none';
   }
 
@@ -69,7 +74,7 @@ export class ScatterPlotVisualizerCanvasLabels implements
     }
 
     let opacityMap =
-        d3.scale.pow()
+        d3.scalePow()
             .exponent(Math.E)
             .domain([rc.farthestCameraSpacePointZ, rc.nearestCameraSpacePointZ])
             .range([0.1, 1]);
@@ -151,10 +156,10 @@ export class ScatterPlotVisualizerCanvasLabels implements
 
   onResize(newWidth: number, newHeight: number) {
     let dpr = window.devicePixelRatio;
-    d3.select(this.canvas)
-        .attr('width', newWidth * dpr)
-        .attr('height', newHeight * dpr)
-        .style({width: newWidth + 'px', height: newHeight + 'px'});
+    this.canvas.width = newWidth * dpr;
+    this.canvas.height = newHeight * dpr;
+    this.canvas.style.width = newWidth + 'px';
+    this.canvas.style.height = newHeight + 'px';
   }
 
   dispose() {
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerPolylines.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerPolylines.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerPolylines.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerPolylines.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerSprites.ts
similarity index 99%
rename from tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerSprites.ts
index 8adc9a9bd234cd905df8f9b26b3ea9a419b72097..be9c1703c727f11381d7836de86dad1c1c294cc0 100644
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotVisualizerSprites.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/scatterPlotVisualizerSprites.ts
@@ -330,7 +330,7 @@ export class ScatterPlotVisualizerSprites implements ScatterPlotVisualizer {
 
   setSpriteAtlas(
       spriteImage: HTMLImageElement, spriteDimensions: [number, number],
-      spriteIndices: Uint8Array) {
+      spriteIndices: Float32Array) {
     this.disposeTextureAtlas();
     this.createTextureFromSpriteAtlas(
         spriteImage, spriteDimensions, spriteIndices);
diff --git a/tensorflow/tensorboard/components/vz_projector/sptree.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/sptree.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/sptree.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/sptree.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/styles.html b/tensorflow/tensorboard/components/vz_projector_d3v4/styles.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/styles.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/styles.html
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/test/BUILD b/tensorflow/tensorboard/components/vz_projector_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e76c84e8f3372ae3d06ecec7413f64e8f3732cac
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/BUILD
@@ -0,0 +1,81 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/vz-projector/test",
+    deps = [
+        "//tensorflow/tensorboard/components/vz_projector_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+        "@org_polymer",
+        "@org_polymer_webcomponentsjs",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:chai.d.ts",
+        "@org_definitelytyped//:mocha.d.ts",
+        "@org_definitelytyped//:polymer.d.ts",
+        "@org_definitelytyped//:three.d.ts",
+        "@org_definitelytyped//:webcomponents.js.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:d3.d.ts",
+        "//tensorflow/tensorboard/components/tf_imports_d3v4:plottable.d.ts",
+        "//tensorflow/tensorboard/components/vz_projector_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {
+        "VZ.Projector.Test": [
+            "assert.ts",
+            "sptree_test.ts",
+            "data_test.ts",
+            "data-provider_test.ts",
+            "util_test.ts",
+
+            # TODO(smilkov): Migrate these away from jasmine.
+            # "scatterPlotRectangleSelector_test.ts",
+            # "vz-projector-projections-panel_test.ts",
+        ],
+    },
+    namespace_symbol_aliases = {
+        "VZ.Projector.Test": {
+            "BoundingBox": "VZ.Projector.ScatterPlotRectangleSelector.BoundingBox",
+            "DataPoint": "VZ.Projector.Data.DataPoint",
+            "DataSet": "VZ.Projector.Data.DataSet",
+            "ProjectionsPanel": "VZ.Projector.ProjectorProjectionsPanel.ProjectionsPanel",
+            "SPTree": "VZ.Projector.SPTree.SPTree",
+            "ScatterPlotRectangleSelector": "VZ.Projector.ScatterPlotRectangleSelector.ScatterPlotRectangleSelector",
+            "SpriteAndMetadataInfo": "VZ.Projector.Data.SpriteAndMetadataInfo",
+            "State": "VZ.Projector.Data.State",
+            "State": "VZ.Projector.Data.State",
+            "data_provider": "VZ.Projector.DataProvider",
+            "stateGetAccessorDimensions": "VZ.Projector.Data.stateGetAccessorDimensions",
+            "util": "VZ.Projector.Util",
+        },
+    },
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/lib/css/global.css b/tensorflow/tensorboard/components/vz_projector_d3v4/test/assert.ts
similarity index 86%
rename from tensorflow/tensorboard/lib/css/global.css
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/assert.ts
index cb6e966fddaf4781703ad53478542248c10fb34b..f489517a7f23f36ecb91875638e464e3c7312926 100644
--- a/tensorflow/tensorboard/lib/css/global.css
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/assert.ts
@@ -13,9 +13,4 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-html, body {
-  margin: 0;
-  padding: 0;
-  height: 100%;
-  font-family: "RobotoDraft","Roboto",sans-serif;
-}
+const assert = chai.assert;
diff --git a/tensorflow/tensorboard/components/vz_projector/data-provider_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/data-provider_test.ts
similarity index 53%
rename from tensorflow/tensorboard/components/vz_projector/data-provider_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/data-provider_test.ts
index 01b89ca700169c763845a8c1bf41706c3d08c6bb..59a42ffbfd84d7a6731af504081b7f0c64d17592 100644
--- a/tensorflow/tensorboard/components/vz_projector/data-provider_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/data-provider_test.ts
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {DataPoint, SpriteAndMetadataInfo} from './data';
-import * as data_provider from './data-provider';
+import {DataPoint, SpriteAndMetadataInfo} from '../data';
+import * as data_provider from '../data-provider';
 
 /**
  * Converts a string to an ArrayBuffer.
@@ -48,15 +48,15 @@ describe('parse tensors', () => {
         .then((tensorsArrayBuffer: ArrayBuffer) => {
           data_provider.parseTensors(tensorsArrayBuffer)
               .then((data: DataPoint[]) => {
-                expect(data.length).toBe(2);
+                assert.equal(2, data.length);
 
-                expect(data[0].vector).toEqual(new Float32Array(tensors[0]));
-                expect(data[0].index).toEqual(0);
-                expect(data[0].projections).toBeNull();
+                assert.deepEqual(new Float32Array(tensors[0]), data[0].vector);
+                assert.equal(0, data[0].index);
+                assert.isNull(data[0].projections);
 
-                expect(data[1].vector).toEqual(new Float32Array(tensors[1]));
-                expect(data[1].index).toEqual(1);
-                expect(data[1].projections).toBeNull();
+                assert.deepEqual(new Float32Array(tensors[1]), data[1].vector);
+                assert.equal(1, data[1].index);
+                assert.isNull(data[1].projections);
                 doneFn();
               });
         });
@@ -68,27 +68,27 @@ describe('parse tensors', () => {
         .then((metadataArrayBuffer: ArrayBuffer) => {
           data_provider.parseMetadata(metadataArrayBuffer)
               .then((spriteAndMetadataInfo: SpriteAndMetadataInfo) => {
-                expect(spriteAndMetadataInfo.stats.length).toBe(2);
-                expect(spriteAndMetadataInfo.stats[0].name)
-                    .toBe(metadata[0][0]);
-                expect(spriteAndMetadataInfo.stats[0].isNumeric).toBe(false);
-                expect(spriteAndMetadataInfo.stats[0].tooManyUniqueValues)
-                    .toBe(false);
-                expect(spriteAndMetadataInfo.stats[1].name)
-                    .toBe(metadata[0][1]);
-                expect(spriteAndMetadataInfo.stats[1].isNumeric).toBe(true);
-                expect(spriteAndMetadataInfo.stats[1].tooManyUniqueValues)
-                    .toBe(false);
+                assert.equal(2, spriteAndMetadataInfo.stats.length);
+                assert.equal(metadata[0][0],
+                             spriteAndMetadataInfo.stats[0].name);
+                assert.isFalse(spriteAndMetadataInfo.stats[0].isNumeric);
+                assert.isFalse(
+                    spriteAndMetadataInfo.stats[0].tooManyUniqueValues);
+                assert.equal(metadata[0][1],
+                             spriteAndMetadataInfo.stats[1].name);
+                assert.isTrue(spriteAndMetadataInfo.stats[1].isNumeric);
+                assert.isFalse(
+                    spriteAndMetadataInfo.stats[1].tooManyUniqueValues);
 
-                expect(spriteAndMetadataInfo.pointsInfo.length).toBe(2);
-                expect(spriteAndMetadataInfo.pointsInfo[0]['label'])
-                    .toBe(metadata[1][0]);
-                expect(spriteAndMetadataInfo.pointsInfo[0]['fakecol'])
-                    .toBe(+metadata[1][1]);
-                expect(spriteAndMetadataInfo.pointsInfo[1]['label'])
-                    .toBe(metadata[2][0]);
-                expect(spriteAndMetadataInfo.pointsInfo[1]['fakecol'])
-                    .toBe(+metadata[2][1]);
+                assert.equal(2, spriteAndMetadataInfo.pointsInfo.length);
+                assert.equal(metadata[1][0],
+                             spriteAndMetadataInfo.pointsInfo[0]['label']);
+                assert.equal(+metadata[1][1],
+                             spriteAndMetadataInfo.pointsInfo[0]['fakecol']);
+                assert.equal(metadata[2][0],
+                             spriteAndMetadataInfo.pointsInfo[1]['label']);
+                assert.equal(+metadata[2][1],
+                             spriteAndMetadataInfo.pointsInfo[1]['fakecol']);
                 doneFn();
               });
         });
diff --git a/tensorflow/tensorboard/components/vz_projector/data_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/data_test.ts
similarity index 80%
rename from tensorflow/tensorboard/components/vz_projector/data_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/data_test.ts
index 924ae3a929f568efd69feb3af6bf104660a24969..5e47c091c5b5565ed084612b178201ee5ba19386 100644
--- a/tensorflow/tensorboard/components/vz_projector/data_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/data_test.ts
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {DataPoint, DataSet, State, stateGetAccessorDimensions} from './data';
+import {DataPoint, DataSet, State, stateGetAccessorDimensions} from '../data';
 
 /**
  * Helper method that makes a list of points given an array of
@@ -44,8 +44,8 @@ describe('constructor_with_sequences', () => {
     // one sequence 0->2->3.
     const points = makePointsWithSequences([2, -1, 3, -1]);
     let dataset = new DataSet(points);
-    expect(dataset.sequences.length).toEqual(1);
-    expect(dataset.sequences[0].pointIndices).toEqual([0, 2, 3]);
+    assert.equal(1, dataset.sequences.length);
+    assert.deepEqual([0, 2, 3], dataset.sequences[0].pointIndices);
   });
 
   it('Simple forward pointing sequences, __next__ metadata format', () => {
@@ -53,14 +53,14 @@ describe('constructor_with_sequences', () => {
     // one sequence 0->2->3.
     const points = makePointsWithSequences([2, -1, 3, -1], '__next__');
     let dataset = new DataSet(points);
-    expect(dataset.sequences.length).toEqual(1);
-    expect(dataset.sequences[0].pointIndices).toEqual([0, 2, 3]);
+    assert.equal(1, dataset.sequences.length);
+    assert.deepEqual([0, 2, 3], dataset.sequences[0].pointIndices);
   });
 
   it('No sequences', () => {
     let points = makePointsWithSequences([-1, -1, -1, -1]);
     let dataset = new DataSet(points);
-    expect(dataset.sequences.length).toEqual(0);
+    assert.equal(0, dataset.sequences.length);
   });
 
   it('A sequence that goes backwards and forward in the array', () => {
@@ -68,8 +68,8 @@ describe('constructor_with_sequences', () => {
     // one sequence 3->1->0->2.
     let points = makePointsWithSequences([2, 0, -1, 1]);
     let dataset = new DataSet(points);
-    expect(dataset.sequences.length).toEqual(1);
-    expect(dataset.sequences[0].pointIndices).toEqual([3, 1, 0, 2]);
+    assert.equal(1, dataset.sequences.length);
+    assert.deepEqual([3, 1, 0, 2], dataset.sequences[0].pointIndices);
   });
 });
 
@@ -78,27 +78,27 @@ describe('stateGetAccessorDimensions', () => {
     const state = new State();
     state.selectedProjection = 'tsne';
     state.tSNEis3d = false;
-    expect(stateGetAccessorDimensions(state)).toEqual([0, 1]);
+    assert.deepEqual([0, 1], stateGetAccessorDimensions(state));
   });
 
   it('returns [0, 1, 2] for 3d t-SNE', () => {
     const state = new State();
     state.selectedProjection = 'tsne';
     state.tSNEis3d = true;
-    expect(stateGetAccessorDimensions(state)).toEqual([0, 1, 2]);
+    assert.deepEqual([0, 1, 2], stateGetAccessorDimensions(state));
   });
 
   it('returns pca component dimensions array for pca', () => {
     const state = new State();
     state.selectedProjection = 'pca';
     state.pcaComponentDimensions = [13, 12, 11, 10];
-    expect(stateGetAccessorDimensions(state))
-        .toEqual(state.pcaComponentDimensions);
+    assert.deepEqual(state.pcaComponentDimensions,
+                     stateGetAccessorDimensions(state));
   });
 
   it('returns ["x", "y"] for custom projections', () => {
     const state = new State();
     state.selectedProjection = 'custom';
-    expect(stateGetAccessorDimensions(state)).toEqual(['x', 'y']);
+    assert.deepEqual(['x', 'y'], stateGetAccessorDimensions(state));
   });
 });
diff --git a/tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/scatterPlotRectangleSelector_test.ts
similarity index 90%
rename from tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/scatterPlotRectangleSelector_test.ts
index a93aca74a2039d7accd0978fde04eed919cddc84..0ee6cf620df8bb082adf424a66548b832346597d 100644
--- a/tensorflow/tensorboard/components/vz_projector/scatterPlotRectangleSelector_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/scatterPlotRectangleSelector_test.ts
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {BoundingBox, ScatterPlotRectangleSelector} from './scatterPlotRectangleSelector';
+import {BoundingBox, ScatterPlotRectangleSelector} from '../scatterPlotRectangleSelector';
 
 describe('selector callbacks make bounding box start bottom left', () => {
   let containerElement: HTMLElement;
@@ -22,6 +22,10 @@ describe('selector callbacks make bounding box start bottom left', () => {
 
   beforeEach(() => {
     containerElement = document.createElement('div');
+    const selector = document.createElement('svg');
+    selector.id = 'selector';
+    containerElement.appendChild(selector);
+
     selectionCallback = jasmine.createSpy('selectionCallback');
     selection =
         new ScatterPlotRectangleSelector(containerElement, selectionCallback);
diff --git a/tensorflow/tensorboard/components/vz_projector/sptree_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/sptree_test.ts
similarity index 97%
rename from tensorflow/tensorboard/components/vz_projector/sptree_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/sptree_test.ts
index 440680bdf1eb4a0d2a478e3480772686ab875af0..7e340ea62f5d1146e11b8321f4668dc97d14e0c8 100644
--- a/tensorflow/tensorboard/components/vz_projector/sptree_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/sptree_test.ts
@@ -13,9 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-import {SPTree} from './sptree';
-
-const assert = chai.assert;
+import {SPTree} from '../sptree';
 
 it('simple 2D data', () => {
   let data = [
diff --git a/tensorflow/tensorboard/components/vz_projector_d3v4/test/tests.html b/tensorflow/tensorboard/components/vz_projector_d3v4/test/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..dd43079bde1f827a42893f2166d8c95645d93f99
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/tests.html
@@ -0,0 +1,24 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<script src="../../web-component-tester/browser.js"></script>
+<script src="../../webcomponentsjs/webcomponents-lite.min.js"></script>
+<link rel="import" href="../bundle.html">
+<body>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/vz_projector/util_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/util_test.ts
similarity index 79%
rename from tensorflow/tensorboard/components/vz_projector/util_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/util_test.ts
index f7c0027c81bf307bfa4f08148e02e30fe4734046..c18db95eed706a3eacd09486fca2b67b5e01f595 100644
--- a/tensorflow/tensorboard/components/vz_projector/util_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/util_test.ts
@@ -12,31 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-import * as util from './util';
+import * as util from '../util';
 
 describe('getURLParams', () => {
   it('search query with valid param returns correct object', () => {
     let urlParams = util.getURLParams('?config=http://google.com/');
-    expect(urlParams).toEqual({'config': 'http://google.com/'});
+    assert.deepEqual({'config': 'http://google.com/'}, urlParams);
   });
 
   it('search query with multiple valid params returns correct object', () => {
     let urlParams = util.getURLParams('?config=http://google.com/&foo=bar');
-    expect(urlParams).toEqual({'config': 'http://google.com/', 'foo': 'bar'});
+    assert.deepEqual({'config': 'http://google.com/', 'foo': 'bar'}, urlParams);
   });
 
   it('search query with valid param with URL encoded characters', () => {
     let urlParams = util.getURLParams('?config=http://google.com/%20search');
-    expect(urlParams).toEqual({'config': 'http://google.com/ search'});
+    assert.deepEqual({'config': 'http://google.com/ search'}, urlParams);
   });
 
   it('search query with pound sign', () => {
     let urlParams = util.getURLParams('?config=http://google.com/#foo');
-    expect(urlParams).toEqual({'config': 'http://google.com/'});
+    assert.deepEqual({'config': 'http://google.com/'}, urlParams);
   });
 
   it('no search query returns empty object', () => {
     let urlParams = util.getURLParams('');
-    expect(urlParams).toEqual({});
+    assert.deepEqual({}, urlParams);
   });
 });
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel_test.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/test/vz-projector-projections-panel_test.ts
similarity index 55%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel_test.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/test/vz-projector-projections-panel_test.ts
index 3ce35afb7433a1447263df1b247542b3a2836e8b..2bf0c6eb48f019e2467d7c9451748696bb6ed54d 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel_test.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/test/vz-projector-projections-panel_test.ts
@@ -12,16 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-import {State} from './data';
-import {ProjectionsPanel} from './vz-projector-projections-panel';
-
-const assert = chai.assert;
+import {State} from '../data';
+import {ProjectionsPanel} from '../vz-projector-projections-panel';
 
 describe('restoreUIFromBookmark', () => {
-  it('sets the pcaX/Y properties when setting 2D component values', () => {
-    let projectionsPanel = document.createElement(
-        ProjectionsPanel.prototype.is) as ProjectionsPanel;
+  let projectionsPanel: ProjectionsPanel;
+  beforeEach(() => {
+    projectionsPanel = document.createElement(ProjectionsPanel.prototype.is) as
+        ProjectionsPanel;
+
+    // Set up some of the UI so the elements are found in the production code.
+    const tsnePerplexityContainer = document.createElement('div');
+    tsnePerplexityContainer.className = 'tsne-perplexity';
+    const tsnePerplexity = document.createElement('span');
+    tsnePerplexityContainer.appendChild(tsnePerplexity);
+    projectionsPanel.appendChild(tsnePerplexityContainer);
+
+    const tsneLearningRateContainer = document.createElement('div');
+    tsneLearningRateContainer.className = 'tsne-learning-rate';
+    const tsneLearningRate = document.createElement('span');
+    tsneLearningRateContainer.appendChild(tsneLearningRate);
+    projectionsPanel.appendChild(tsneLearningRateContainer);
+  });
 
+  it('sets the pcaX/Y properties when setting 2D component values', () => {
     spyOn(projectionsPanel, 'setZDropdownEnabled');
 
     const s = new State();
@@ -35,9 +49,6 @@ describe('restoreUIFromBookmark', () => {
   });
 
   it('sets the pcaX/Y properties when setting 3D component values', () => {
-    let projectionsPanel = document.createElement(
-        ProjectionsPanel.prototype.is) as ProjectionsPanel;
-
     spyOn(projectionsPanel, 'setZDropdownEnabled');
 
     const s = new State();
@@ -53,10 +64,27 @@ describe('restoreUIFromBookmark', () => {
 });
 
 describe('populateBookmarkFromUI', () => {
-  it('gets the PCA component UI values from a 2D PCA projection', () => {
-    let projectionsPanel = document.createElement(
-        ProjectionsPanel.prototype.is) as ProjectionsPanel;
+  let projectionsPanel: ProjectionsPanel;
+
+  beforeEach(() => {
+    projectionsPanel = document.createElement(ProjectionsPanel.prototype.is) as
+        ProjectionsPanel;
+
+    // Set up some of the UI so the elements are found in the production code.
+    const tsnePerplexityContainer = document.createElement('div');
+    tsnePerplexityContainer.className = 'tsne-perplexity';
+    const tsnePerplexity = document.createElement('span');
+    tsnePerplexityContainer.appendChild(tsnePerplexity);
+    projectionsPanel.appendChild(tsnePerplexityContainer);
+
+    const tsneLearningRateContainer = document.createElement('div');
+    tsneLearningRateContainer.className = 'tsne-learning-rate';
+    const tsneLearningRate = document.createElement('span');
+    tsneLearningRateContainer.appendChild(tsneLearningRate);
+    projectionsPanel.appendChild(tsneLearningRateContainer);
+  });
 
+  it('gets the PCA component UI values from a 2D PCA projection', () => {
     projectionsPanel.pcaX = 0;
     projectionsPanel.pcaY = 1;
     projectionsPanel.pcaIs3d = false;
@@ -67,9 +95,6 @@ describe('populateBookmarkFromUI', () => {
   });
 
   it('gets the PCA component UI values from a 3D PCA projection', () => {
-    let projectionsPanel = document.createElement(
-        ProjectionsPanel.prototype.is) as ProjectionsPanel;
-
     projectionsPanel.pcaX = 0;
     projectionsPanel.pcaY = 1;
     projectionsPanel.pcaZ = 2;
diff --git a/tensorflow/tensorboard/components/vz_projector/util.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/util.ts
similarity index 91%
rename from tensorflow/tensorboard/components/vz_projector/util.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/util.ts
index b2400bac83e7abeb72b94773553be3a226610dc8..bd6df68b1a5965d7289db1eb8ecda528938908bb 100644
--- a/tensorflow/tensorboard/components/vz_projector/util.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/util.ts
@@ -42,6 +42,33 @@ export function shuffle<T>(array: T[]): T[] {
   return array;
 }
 
+export function range(count: number): number[] {
+  const rangeOutput: number[] = [];
+  for (let i = 0; i < count; i++) {
+    rangeOutput.push(i);
+  }
+  return rangeOutput;
+}
+
+export function classed(
+    element: HTMLElement, className: string, enabled: boolean) {
+  const classNames = element.className.split(' ');
+  if (enabled) {
+    if (className in classNames) {
+      return;
+    } else {
+      classNames.push(className);
+    }
+  } else {
+    const index = classNames.indexOf(className);
+    if (index === -1) {
+      return;
+    }
+    classNames.splice(index, 1);
+  }
+  element.className = classNames.join(' ');
+}
+
 /** Projects a 3d point into screen space */
 export function vector3DToScreenCoords(
     cam: THREE.Camera, w: number, h: number, v: THREE.Vector3): Point2D {
@@ -153,7 +180,7 @@ export function runAsyncTask<T>(
   let autoClear = (msgId == null);
   msgId = logging.setModalMessage(message, msgId);
   return new Promise<T>((resolve, reject) => {
-    d3.timer(() => {
+    setTimeout(() => {
       try {
         let result = task();
         // Clearing the old message.
diff --git a/tensorflow/tensorboard/components/vz_projector/vector.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vector.ts
similarity index 98%
rename from tensorflow/tensorboard/components/vz_projector/vector.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vector.ts
index 88ca24b25ddf25dd8a61858477db169c1079ba8d..0de78ad85df6aaf0eb02e89ddfdfeabab3f451b4 100644
--- a/tensorflow/tensorboard/components/vz_projector/vector.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vector.ts
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
 import {assert} from './util';
 
 /**
@@ -203,7 +204,7 @@ export function centroid<T>(dataPoints: T[], accessor?: (a: T) => Vector):
  * a random (0, 1) gaussian distribution.
  */
 export function rn(size: number): Float32Array {
-  let normal = d3.random.normal();
+  const normal = d3.randomNormal();
   let result = new Float32Array(size);
   for (let i = 0; i < size; ++i) {
     result[i] = normal();
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-app.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-app.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-app.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-app.html
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-bookmark-panel.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-bookmark-panel.html
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-bookmark-panel.ts
similarity index 91%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-bookmark-panel.ts
index d22904337ab48472c244dc87457f51aebf16783d..53195fa47c05132102943d7052378ebf136973c8 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-bookmark-panel.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-bookmark-panel.ts
@@ -41,13 +41,17 @@ export class BookmarkPanel extends BookmarkPanelPolymer {
   private selectedState: number;
   private ignoreNextProjectionEvent: boolean;
 
-  private dom: d3.Selection<any>;
+  private expandLessButton: HTMLButtonElement;
+  private expandMoreButton: HTMLButtonElement;
 
   ready() {
-    this.dom = d3.select(this);
     this.savedStates = [];
     this.setupUploadButton();
     this.ignoreNextProjectionEvent = false;
+    this.expandLessButton =
+        this.querySelector('#expand-less') as HTMLButtonElement;
+    this.expandMoreButton =
+        this.querySelector('#expand-more') as HTMLButtonElement;
   }
 
   initialize(
@@ -80,15 +84,15 @@ export class BookmarkPanel extends BookmarkPanelPolymer {
   /** Handles a click on show bookmarks tray button. */
   _expandMore() {
     this.$.panel.show();
-    this.dom.select('#expand-more').style('display', 'none');
-    this.dom.select('#expand-less').style('display', '');
+    this.expandMoreButton.style.display = 'none';
+    this.expandLessButton.style.display = '';
   }
 
   /** Handles a click on hide bookmarks tray button. */
   _expandLess() {
     this.$.panel.hide();
-    this.dom.select('#expand-more').style('display', '');
-    this.dom.select('#expand-less').style('display', 'none');
+    this.expandMoreButton.style.display = '';
+    this.expandLessButton.style.display = 'none';
   }
 
   /** Handles a click on the add bookmark button. */
@@ -136,16 +140,16 @@ export class BookmarkPanel extends BookmarkPanelPolymer {
 
   private setupUploadButton() {
     // Show and setup the load view button.
-    let fileInput = this.dom.select('#state-file');
-    fileInput.on('change', () => {
-      let file: File = (d3.event as any).target.files[0];
+    const fileInput = this.querySelector('#state-file') as HTMLInputElement;
+    fileInput.onchange = () => {
+      const file: File = fileInput.files[0];
       // Clear out the value of the file chooser. This ensures that if the user
       // selects the same file, we'll re-read it.
-      (d3.event as any).target.value = '';
-      let fileReader = new FileReader();
+      fileInput.value = '';
+      const fileReader = new FileReader();
       fileReader.onload = (evt) => {
-        let str: string = (evt.target as any).result;
-        let savedStates = JSON.parse(str);
+        const str: string = fileReader.result;
+        const savedStates = JSON.parse(str);
 
         // Verify the bookmarks match.
         if (this.savedStatesValid(savedStates)) {
@@ -158,7 +162,7 @@ export class BookmarkPanel extends BookmarkPanelPolymer {
         }
       };
       fileReader.readAsText(file);
-    });
+    };
   }
 
   addStates(savedStates?: State[]) {
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-colab.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-colab.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-colab.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-colab.html
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-dashboard.html
similarity index 56%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-dashboard.html
index b641bb0f2937e7277af574c4438ca907bafeeaa0..55c15da5ed73360b486cd65be3a05cdde68e91c5 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-dashboard.html
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-dashboard.html
@@ -16,6 +16,7 @@ limitations under the License.
 -->
 
 <link rel="import" href="../polymer/polymer.html">
+<link rel="import" href="../tf-dashboard-common/tf-dashboard.html">
 <link rel="import" href="../tf-dashboard-common/tf-no-data-warning.html">
 <link rel="import" href="vz-projector.html">
 
@@ -36,19 +37,44 @@ limitations under the License.
   </template>
 </template>
 <script>
+"use strict";
+
 (function() {
-Polymer({
+TF.Dashboard.VzProjectorDashboard = Polymer({
   is: 'vz-projector-dashboard',
+  factoryImpl: function(routePrefix) {
+    this.routePrefix = routePrefix;
+  },
   properties: {
     dataNotFound: Boolean,
-    routePrefix: String
+    routePrefix: String,
+    // Whether this dashboard is initialized. This dashboard should only be initialized once.
+    _initialized: Boolean,
+  },
+  behaviors: [
+    TF.Dashboard.DashboardBehavior("embeddings"),
+  ],
+  reload: function() {
+    // Do not reload the embedding projector. Reloading could take a long time.
+  },
+  attached: function() {
+    if (this._initialized) {
+      return;
+    }
+    let xhr = new XMLHttpRequest();
+    xhr.open('GET', this.routePrefix + '/runs');
+    xhr.onload = () => {
+      // Set this to true so we only initialize once.
+      this._initialized = true;
+
+      let runs = JSON.parse(xhr.responseText);
+      this.set('dataNotFound', runs.length === 0);
+    };
+    xhr.onerror = () => {
+      this.set('dataNotFound', false);
+    };
+    xhr.send();
   },
-  ready() {
-    var self = this;
-    d3.json(this.routePrefix + '/runs', function(err, runs) {
-      self.dataNotFound = (runs.length === 0);
-    });
-  }
 });
 })();
 </script>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-data-panel.html
similarity index 99%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-data-panel.html
index f7ef0593023d9f6bbc50f5ad1419daa8e7b18607..607d4467892b5918e26a3271d50cddcb8dfa7578 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.html
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-data-panel.html
@@ -347,6 +347,7 @@ paper-dropdown-menu paper-item {
           </p>
           <p>
             One option is using a <a target=_blank href="https://gist.github.com/">github gist</a>.
+            If you choose this approach, make sure to link directly to the raw file.
           </p>
         </div>
         <div>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-data-panel.ts
similarity index 82%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-data-panel.ts
index d52f024e3f6d142ba5ab6928ca1a1246003e58d7..a6847ed3c87db95c9a1048a0e40818d0a2be5c22 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-data-panel.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-data-panel.ts
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+import * as d3 from 'd3';  // from //third_party/javascript/typings/d3_v4
 import {ColorOption, ColumnStats, SpriteAndMetadataInfo} from './data';
 import {DataProvider, EmbeddingInfo, parseRawMetadata, parseRawTensors, ProjectorConfig} from './data-provider';
 import * as util from './util';
@@ -47,7 +48,6 @@ export class DataPanel extends DataPanelPolymer {
   private labelOptions: string[];
   private colorOptions: ColorOption[];
   forceCategoricalColoring: boolean = false;
-  private dom: d3.Selection<any>;
 
   private selectedTensor: string;
   private selectedRun: string;
@@ -61,7 +61,6 @@ export class DataPanel extends DataPanelPolymer {
   private metadataFile: string;
 
   ready() {
-    this.dom = d3.select(this);
     this.normalizeData = true;
   }
 
@@ -129,9 +128,11 @@ export class DataPanel extends DataPanelPolymer {
   }
 
   private updateMetadataUI(columnStats: ColumnStats[], metadataFile: string) {
-    this.dom.select('#metadata-file')
-        .html(this.addWordBreaks(metadataFile))
-        .attr('title', metadataFile);
+    const metadataFileElement =
+        this.querySelector('#metadata-file') as HTMLSpanElement;
+    metadataFileElement.innerHTML = this.addWordBreaks(metadataFile);
+    metadataFileElement.title = metadataFile;
+
     // Label by options.
     let labelIndex = -1;
     this.labelOptions = columnStats.map((stats, i) => {
@@ -144,25 +145,25 @@ export class DataPanel extends DataPanelPolymer {
     this.selectedLabelOption = this.labelOptions[Math.max(0, labelIndex)];
 
     // Color by options.
-    let standardColorOption: ColorOption[] = [
+    const standardColorOption: ColorOption[] = [
       {name: 'No color map'},
       // TODO(smilkov): Implement this.
       // {name: 'Distance of neighbors',
       //    desc: 'How far is each point from its neighbors'}
     ];
-    let metadataColorOption: ColorOption[] =
+    const metadataColorOption: ColorOption[] =
         columnStats
             .filter(stats => {
               return !stats.tooManyUniqueValues || stats.isNumeric;
             })
             .map(stats => {
-              let map: (v: string|number) => string;
+              let map;
               let items: {label: string, count: number}[];
               let thresholds: ColorLegendThreshold[];
               let isCategorical =
                   this.forceCategoricalColoring || !stats.tooManyUniqueValues;
               if (isCategorical) {
-                let scale = d3.scale.category20();
+                const scale = d3.scaleOrdinal(d3.schemeCategory20);
                 let range = scale.range();
                 // Re-order the range.
                 let newRange = range.map((color, i) => {
@@ -177,7 +178,7 @@ export class DataPanel extends DataPanelPolymer {
                   {color: '#ffffdd', value: stats.min},
                   {color: '#1f2d86', value: stats.max}
                 ];
-                map = d3.scale.linear<string>()
+                map = d3.scaleLinear<string, string>()
                           .domain(thresholds.map(t => t.value))
                           .range(thresholds.map(t => t.color));
               }
@@ -263,13 +264,15 @@ export class DataPanel extends DataPanelPolymer {
       this.tensorNames = names.map(name => {
         return {name, shape: this.getEmbeddingInfoByName(name).tensorShape};
       });
-      let wordBreakablePath =
+      const wordBreakablePath =
           this.addWordBreaks(this.projectorConfig.modelCheckpointPath);
-      this.dom.select('#checkpoint-file')
-          .html(wordBreakablePath)
-          .attr('title', this.projectorConfig.modelCheckpointPath);
+      const checkpointFile =
+          this.querySelector('#checkpoint-file') as HTMLSpanElement;
+      checkpointFile.innerHTML = wordBreakablePath;
+      checkpointFile.title = this.projectorConfig.modelCheckpointPath;
+
       // If in demo mode, let the order decide which tensor to load by default.
-      let defaultTensor = this.projector.servingMode === 'demo' ?
+      const defaultTensor = this.projector.servingMode === 'demo' ?
           this.projectorConfig.embeddings[0].tensorName :
           names[0];
       if (this.selectedTensor === defaultTensor) {
@@ -322,9 +325,10 @@ export class DataPanel extends DataPanelPolymer {
 
   private tensorWasReadFromFile(rawContents: ArrayBuffer, fileName: string) {
     parseRawTensors(rawContents, ds => {
-      this.dom.select('#checkpoint-file')
-          .text(fileName)
-          .attr('title', fileName);
+      const checkpointFile =
+          this.querySelector('#checkpoint-file') as HTMLSpanElement;
+      checkpointFile.innerText = fileName;
+      checkpointFile.title = fileName;
       this.projector.updateDataSet(ds);
     });
   }
@@ -337,7 +341,7 @@ export class DataPanel extends DataPanelPolymer {
 
   private getEmbeddingInfoByName(tensorName: string): EmbeddingInfo {
     for (let i = 0; i < this.projectorConfig.embeddings.length; i++) {
-      let e = this.projectorConfig.embeddings[i];
+      const e = this.projectorConfig.embeddings[i];
       if (e.tensorName === tensorName) {
         return e;
       }
@@ -346,44 +350,47 @@ export class DataPanel extends DataPanelPolymer {
 
   private setupUploadButtons() {
     // Show and setup the upload button.
-    let fileInput = this.dom.select('#file');
-    fileInput.on('change', () => {
-      let file: File = (d3.event as any).target.files[0];
+    const fileInput = this.querySelector('#file') as HTMLInputElement;
+    fileInput.onchange = () => {
+      const file: File = fileInput.files[0];
       // Clear out the value of the file chooser. This ensures that if the user
       // selects the same file, we'll re-read it.
-      (d3.event as any).target.value = '';
-      let fileReader = new FileReader();
+      fileInput.value = '';
+      const fileReader = new FileReader();
       fileReader.onload = evt => {
-        let content: ArrayBuffer = (evt.target as any).result;
+        const content: ArrayBuffer = fileReader.result;
         this.tensorWasReadFromFile(content, file.name);
       };
       fileReader.readAsArrayBuffer(file);
-    });
+    };
 
-    let uploadButton = this.dom.select('#upload-tensors');
-    uploadButton.on('click', () => {
-      (fileInput.node() as HTMLInputElement).click();
-    });
+    const uploadButton =
+        this.querySelector('#upload-tensors') as HTMLButtonElement;
+    uploadButton.onclick = () => {
+      fileInput.click();
+    };
 
     // Show and setup the upload metadata button.
-    let fileMetadataInput = this.dom.select('#file-metadata');
-    fileMetadataInput.on('change', () => {
-      let file: File = (d3.event as any).target.files[0];
+    const fileMetadataInput =
+        this.querySelector('#file-metadata') as HTMLInputElement;
+    fileMetadataInput.onchange = () => {
+      const file: File = fileMetadataInput.files[0];
       // Clear out the value of the file chooser. This ensures that if the user
       // selects the same file, we'll re-read it.
-      (d3.event as any).target.value = '';
-      let fileReader = new FileReader();
+      fileMetadataInput.value = '';
+      const fileReader = new FileReader();
       fileReader.onload = evt => {
-        let contents: ArrayBuffer = (evt.target as any).result;
+        const contents: ArrayBuffer = fileReader.result;
         this.metadataWasReadFromFile(contents, file.name);
       };
       fileReader.readAsArrayBuffer(file);
-    });
+    };
 
-    let uploadMetadataButton = this.dom.select('#upload-metadata');
-    uploadMetadataButton.on('click', () => {
-      (fileMetadataInput.node() as HTMLInputElement).click();
-    });
+    const uploadMetadataButton =
+        this.querySelector('#upload-metadata') as HTMLButtonElement;
+    uploadMetadataButton.onclick = () => {
+      fileMetadataInput.click();
+    };
 
     if (this.projector.servingMode !== 'demo') {
       (this.$$('#publish-container') as HTMLElement).style.display = 'none';
@@ -396,22 +403,24 @@ export class DataPanel extends DataPanelPolymer {
         'block';
 
     // Fill out the projector config.
-    let projectorConfigTemplate =
+    const projectorConfigTemplate =
         this.$$('#projector-config-template') as HTMLTextAreaElement;
-    let projectorConfigTemplateJson: ProjectorConfig = {
+    const projectorConfigTemplateJson: ProjectorConfig = {
       embeddings: [{
         tensorName: 'My tensor',
         tensorShape: [1000, 50],
-        tensorPath: 'https://gist.github.com/.../tensors.tsv',
-        metadataPath: 'https://gist.github.com/.../optional.metadata.tsv',
+        tensorPath: 'https://raw.githubusercontent.com/.../tensors.tsv',
+        metadataPath:
+            'https://raw.githubusercontent.com/.../optional.metadata.tsv',
       }],
     };
     this.setProjectorConfigTemplateJson(
         projectorConfigTemplate, projectorConfigTemplateJson);
 
     // Set up optional field checkboxes.
-    let spriteFieldCheckbox = this.$$('#config-sprite-checkbox');
-    spriteFieldCheckbox.addEventListener('change', () => {
+    const spriteFieldCheckbox =
+        this.$$('#config-sprite-checkbox') as HTMLInputElement;
+    spriteFieldCheckbox.onchange = () => {
       if ((spriteFieldCheckbox as any).checked) {
         projectorConfigTemplateJson.embeddings[0].sprite = {
           imagePath: 'https://github.com/.../optional.sprite.png',
@@ -422,35 +431,38 @@ export class DataPanel extends DataPanelPolymer {
       }
       this.setProjectorConfigTemplateJson(
           projectorConfigTemplate, projectorConfigTemplateJson);
-    });
-    let bookmarksFieldCheckbox = this.$$('#config-bookmarks-checkbox');
-    bookmarksFieldCheckbox.addEventListener('change', () => {
+    };
+    const bookmarksFieldCheckbox =
+        this.$$('#config-bookmarks-checkbox') as HTMLInputElement;
+    bookmarksFieldCheckbox.onchange = () => {
       if ((bookmarksFieldCheckbox as any).checked) {
         projectorConfigTemplateJson.embeddings[0].bookmarksPath =
-            'https://gist.github.com/.../bookmarks.txt';
+            'https://raw.githubusercontent.com/.../bookmarks.txt';
       } else {
         delete projectorConfigTemplateJson.embeddings[0].bookmarksPath;
       }
       this.setProjectorConfigTemplateJson(
           projectorConfigTemplate, projectorConfigTemplateJson);
-    });
-    let metadataFieldCheckbox = this.$$('#config-metadata-checkbox');
-    metadataFieldCheckbox.addEventListener('change', () => {
+    };
+    const metadataFieldCheckbox =
+        this.$$('#config-metadata-checkbox') as HTMLInputElement;
+    metadataFieldCheckbox.onchange = () => {
       if ((metadataFieldCheckbox as HTMLInputElement).checked) {
         projectorConfigTemplateJson.embeddings[0].metadataPath =
-            'https://gist.github.com/.../optional.metadata.tsv';
+            'https://raw.githubusercontent.com/.../optional.metadata.tsv';
       } else {
         delete projectorConfigTemplateJson.embeddings[0].metadataPath;
       }
       this.setProjectorConfigTemplateJson(
           projectorConfigTemplate, projectorConfigTemplateJson);
-    });
+    };
 
     // Update the link and the readonly shareable URL.
-    let projectorConfigUrlInput = this.$$('#projector-config-url');
-    let projectorConfigDemoUrlInput = this.$$('#projector-share-url');
-    let projectorConfigDemoUrlLink = this.$$('#projector-share-url-link');
-    projectorConfigUrlInput.addEventListener('input', () => {
+    const projectorConfigUrlInput =
+        this.$$('#projector-config-url') as HTMLInputElement;
+    const projectorConfigDemoUrlInput = this.$$('#projector-share-url');
+    const projectorConfigDemoUrlLink = this.$$('#projector-share-url-link');
+    projectorConfigUrlInput.onchange = () => {
       let projectorDemoUrl = location.protocol + '//' + location.host +
           location.pathname +
           '?config=' + (projectorConfigUrlInput as HTMLInputElement).value;
@@ -458,7 +470,7 @@ export class DataPanel extends DataPanelPolymer {
       (projectorConfigDemoUrlInput as HTMLInputElement).value =
           projectorDemoUrl;
       (projectorConfigDemoUrlLink as HTMLLinkElement).href = projectorDemoUrl;
-    });
+    };
   }
 
   private setProjectorConfigTemplateJson(
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-input.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-input.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-input.html
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-input.ts
similarity index 93%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-input.ts
index 6270185dd4a29a68bec2c618ab18e902a1616ec6..e11346d327ff7bc12e5b3c84f32c15a86cfec975 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-input.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-input.ts
@@ -26,7 +26,6 @@ export interface InputChangedListener {
 
 /** Input control with custom capabilities (e.g. regex). */
 export class ProjectorInput extends PolymerClass {
-  private dom: d3.Selection<HTMLElement>;
   private textChangedListeners: InputChangedListener[];
   private paperInput: HTMLInputElement;
   private inRegexModeButton: HTMLButtonElement;
@@ -43,7 +42,6 @@ export class ProjectorInput extends PolymerClass {
   ready() {
     this.inRegexMode = false;
     this.textChangedListeners = [];
-    this.dom = d3.select(this);
     this.paperInput = this.querySelector('paper-input') as HTMLInputElement;
     this.inRegexModeButton =
         this.querySelector('paper-button') as HTMLButtonElement;
@@ -89,9 +87,12 @@ export class ProjectorInput extends PolymerClass {
   }
 
   private updateRegexModeDisplaySlashes() {
-    d3.select(this.paperInput)
-        .selectAll('.slash')
-        .style('display', this.inRegexMode ? null : 'none');
+    const slashes = this.paperInput.querySelectorAll('.slash');
+    const display = this.inRegexMode ? '' : 'none';
+
+    for (let i = 0; i < slashes.length; i++) {
+      (slashes[i] as HTMLDivElement).style.display = display;
+    }
   }
 
   getValue(): string {
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-inspector-panel.html
similarity index 99%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-inspector-panel.html
index 7554c322cef0607a569566a63a5c07ea3e7a05a7..f5087a99967ca894dcfd9b428dcf4a55b82fe9ae 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.html
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-inspector-panel.html
@@ -223,7 +223,7 @@ limitations under the License.
         <span class="option-label">distance</span>
         <div class="options">
           <a class="selected cosine" href="javascript:void(0);">COSINE</a>
-          <a class="euclidean" href="javascript:void(0);">EUCLIDIAN</a>
+          <a class="euclidean" href="javascript:void(0);">EUCLIDEAN</a>
         </div>
       </div>
     </div>
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-inspector-panel.ts
similarity index 51%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-inspector-panel.ts
index 20dc67167f9fca322feb28bafe16558842e4149c..3ee2c2165f218f4a690b40569314611ebcf58fd1 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-inspector-panel.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-inspector-panel.ts
@@ -17,6 +17,7 @@ import {DistanceFunction, SpriteAndMetadataInfo, State} from './data';
 import * as knn from './knn';
 import {ProjectorEventContext} from './projectorEventContext';
 import * as adapter from './projectorScatterPlotAdapter';
+import * as util from './util';
 import * as vector from './vector';
 import {Projector} from './vz-projector';
 import {ProjectorInput} from './vz-projector-input';
@@ -40,23 +41,24 @@ export class InspectorPanel extends PolymerClass {
 
   private selectedMetadataField: string;
   private metadataFields: string[];
-  private dom: d3.Selection<HTMLElement>;
   private projector: Projector;
   private selectedPointIndices: number[];
   private neighborsOfFirstPoint: knn.NearestEntry[];
   private searchBox: ProjectorInput;
 
-  private resetFilterButton: d3.Selection<HTMLElement>;
-  private setFilterButton: d3.Selection<HTMLElement>;
-  private clearSelectionButton: d3.Selection<HTMLElement>;
-  private limitMessage: d3.Selection<HTMLElement>;
+  private resetFilterButton: HTMLButtonElement;
+  private setFilterButton: HTMLButtonElement;
+  private clearSelectionButton: HTMLButtonElement;
+  private limitMessage: HTMLDivElement;
 
   ready() {
-    this.dom = d3.select(this);
-    this.resetFilterButton = this.dom.select('.reset-filter');
-    this.setFilterButton = this.dom.select('.set-filter');
-    this.clearSelectionButton = this.dom.select('.clear-selection');
-    this.limitMessage = this.dom.select('.limit-msg');
+    this.resetFilterButton =
+        this.querySelector('.reset-filter') as HTMLButtonElement;
+    this.setFilterButton =
+        this.querySelector('.set-filter') as HTMLButtonElement;
+    this.clearSelectionButton =
+        this.querySelector('.clear-selection') as HTMLButtonElement;
+    this.limitMessage = this.querySelector('.limit-msg') as HTMLDivElement;
     this.searchBox = this.querySelector('#search-box') as ProjectorInput;
     // https://www.polymer-project.org/1.0/docs/devguide/styling#scope-subtree
     this.scopeSubtree(this, true);
@@ -88,7 +90,7 @@ export class InspectorPanel extends PolymerClass {
   }
 
   private enableResetFilterButton(enabled: boolean) {
-    this.resetFilterButton.attr('disabled', enabled ? null : true);
+    this.resetFilterButton.disabled = !enabled;
   }
 
   restoreUIFromBookmark(bookmark: State) {
@@ -113,143 +115,178 @@ export class InspectorPanel extends PolymerClass {
   }
 
   private updateSearchResults(indices: number[]) {
-    let container = this.dom.select('.matches-list');
-    container.style('display', indices.length ? null : 'none');
-    let list = container.select('.list');
-    list.html('');
+    const container = this.querySelector('.matches-list') as HTMLDivElement;
+    container.style.display = indices.length ? null : 'none';
+    const list = container.querySelector('.list') as HTMLDivElement;
+    list.innerHTML = '';
     if (indices.length === 0) {
       return;
     }
-    this.limitMessage.style(
-        'display', indices.length <= LIMIT_RESULTS ? 'none' : null);
+
+    this.limitMessage.style.display =
+        indices.length <= LIMIT_RESULTS ? 'none' : null;
     indices = indices.slice(0, LIMIT_RESULTS);
-    let rows = list.selectAll('.row').data(indices).enter().append('div').attr(
-        'class', 'row');
-    rows.append('a')
-        .attr('class', 'label')
-        .attr('title', index => this.getLabelFromIndex(index))
-        .text(index => this.getLabelFromIndex(index));
-    rows.on('mouseenter', index => {
-      this.projectorEventContext.notifyHoverOverPoint(index);
-    });
-    rows.on('mouseleave', () => {
-      this.projectorEventContext.notifyHoverOverPoint(null);
-    });
-    rows.on('click', index => {
-      this.projectorEventContext.notifySelectionChanged([index]);
-    });
+
+    for (let i = 0; i < indices.length; i++) {
+      const index = indices[i];
+
+      const row = document.createElement('div');
+      row.className = 'row';
+
+      const label = this.getLabelFromIndex(index);
+      const rowLink = document.createElement('a');
+      rowLink.className = 'label';
+      rowLink.title = label;
+      rowLink.innerText = label;
+
+      rowLink.onmouseenter = () => {
+        this.projectorEventContext.notifyHoverOverPoint(index);
+      };
+      rowLink.onmouseleave = () => {
+        this.projectorEventContext.notifyHoverOverPoint(null);
+      };
+      rowLink.onclick = () => {
+        this.projectorEventContext.notifySelectionChanged([index]);
+      };
+
+      row.appendChild(rowLink);
+      list.appendChild(row);
+    }
   }
 
   private getLabelFromIndex(pointIndex: number): string {
-    let point = this.projector.dataSet.points[pointIndex];
+    const point = this.projector.dataSet.points[pointIndex];
     return point.metadata[this.selectedMetadataField].toString();
   }
 
   private updateNeighborsList(neighbors: knn.NearestEntry[]) {
-    let nnlist = this.dom.select('.nn-list');
-    nnlist.html('');
-    this.dom.select('.nn').style('display', neighbors.length ? null : 'none');
+    const nnlist = this.querySelector('.nn-list') as HTMLDivElement;
+    nnlist.innerHTML = '';
+
+    (this.querySelector('.nn') as HTMLDivElement).style.display =
+        neighbors.length ? null : 'none';
 
     if (neighbors.length === 0) {
       return;
     }
 
     this.searchBox.message = '';
-    let minDist = neighbors.length > 0 ? neighbors[0].dist : 0;
-    let n = nnlist.selectAll('.neighbor')
-                .data(neighbors)
-                .enter()
-                .append('div')
-                .attr('class', 'neighbor')
-                .append('a')
-                .attr('class', 'neighbor-link')
-                .attr('title', d => this.getLabelFromIndex(d.index));
-
-
-    let labelValue = n.append('div').attr('class', 'label-and-value');
-    labelValue.append('div')
-        .attr('class', 'label')
-        .style('color', d => adapter.dist2color(this.distFunc, d.dist, minDist))
-        .text(d => this.getLabelFromIndex(d.index));
-
-    labelValue.append('div')
-        .attr('class', 'value')
-        .text(d => d.dist.toFixed(3));
-
-    let bar = n.append('div').attr('class', 'bar');
-
-    bar.append('div')
-        .attr('class', 'fill')
-        .style(
-            'border-top-color',
-            d => {
-              return adapter.dist2color(this.distFunc, d.dist, minDist);
-            })
-        .style(
-            'width',
-            d => adapter.normalizeDist(this.distFunc, d.dist, minDist) * 100 +
-                '%');
-
-    bar.selectAll('.tick')
-        .data(d3.range(1, 4))
-        .enter()
-        .append('div')
-        .attr('class', 'tick')
-        .style('left', d => d * 100 / 4 + '%');
-    n.on('mouseenter', d => {
-      this.projectorEventContext.notifyHoverOverPoint(d.index);
-    });
-    n.on('mouseleave', () => {
-      this.projectorEventContext.notifyHoverOverPoint(null);
-    });
-    n.on('click', d => {
-      this.projectorEventContext.notifySelectionChanged([d.index]);
-    });
+    const minDist = neighbors.length > 0 ? neighbors[0].dist : 0;
+
+    for (let i = 0; i < neighbors.length; i++) {
+      const neighbor = neighbors[i];
+
+      const neighborElement = document.createElement('div');
+      neighborElement.className = 'neighbor';
+
+      const neighborElementLink = document.createElement('a');
+      neighborElementLink.className = 'neighbor-link';
+      neighborElementLink.title = this.getLabelFromIndex(neighbor.index);
+
+      const labelValueElement = document.createElement('div');
+      labelValueElement.className = 'label-and-value';
+
+      const labelElement = document.createElement('div');
+      labelElement.className = 'label';
+      labelElement.style.color =
+          adapter.dist2color(this.distFunc, neighbor.dist, minDist);
+      labelElement.innerText = this.getLabelFromIndex(neighbor.index);
+
+      const valueElement = document.createElement('div');
+      valueElement.className = 'value';
+      valueElement.innerText = neighbor.dist.toFixed(3);
+
+      labelValueElement.appendChild(labelElement);
+      labelValueElement.appendChild(valueElement);
+
+      const barElement = document.createElement('div');
+      barElement.className = 'bar';
+
+      const barFillElement = document.createElement('div');
+      barFillElement.className = 'fill';
+      barFillElement.style.borderTopColor =
+          adapter.dist2color(this.distFunc, neighbor.dist, minDist);
+      barFillElement.style.width =
+          adapter.normalizeDist(this.distFunc, neighbor.dist, minDist) * 100 +
+          '%';
+      barElement.appendChild(barFillElement);
+
+      for (let j = 1; j < 4; j++) {
+        const tickElement = document.createElement('div');
+        tickElement.className = 'tick';
+        tickElement.style.left = j * 100 / 4 + '%';
+        barElement.appendChild(tickElement);
+      }
+
+      neighborElementLink.appendChild(labelValueElement);
+      neighborElementLink.appendChild(barElement);
+      neighborElement.appendChild(neighborElementLink);
+      nnlist.appendChild(neighborElement);
+
+      neighborElementLink.onmouseenter = () => {
+        this.projectorEventContext.notifyHoverOverPoint(neighbor.index);
+      };
+      neighborElementLink.onmouseleave = () => {
+        this.projectorEventContext.notifyHoverOverPoint(null);
+      };
+      neighborElementLink.onclick = () => {
+        this.projectorEventContext.notifySelectionChanged([neighbor.index]);
+      };
+    }
   }
 
   private updateFilterButtons(numPoints: number) {
     if (numPoints > 1) {
-      this.setFilterButton.text(`Isolate ${numPoints} points`)
-          .attr('disabled', null);
-      this.clearSelectionButton.attr('disabled', null);
+      this.setFilterButton.innerText = `Isolate ${numPoints} points`;
+      this.setFilterButton.disabled = null;
+      this.clearSelectionButton.disabled = null;
     } else {
-      this.setFilterButton.attr('disabled', true);
-      this.clearSelectionButton.attr('disabled', true);
+      this.setFilterButton.disabled = true;
+      this.clearSelectionButton.disabled = true;
     }
   }
 
   private setupUI(projector: Projector) {
     this.distFunc = vector.cosDist;
-    let eucDist = this.dom.select('.distance a.euclidean');
-    eucDist.on('click', () => {
-      this.dom.selectAll('.distance a').classed('selected', false);
-      eucDist.classed('selected', true);
+    const eucDist =
+        this.querySelector('.distance a.euclidean') as HTMLLinkElement;
+    eucDist.onclick = () => {
+      const links = this.querySelectorAll('.distance a');
+      for (let i = 0; i < links.length; i++) {
+        util.classed(links[i] as HTMLElement, 'selected', false);
+      }
+      util.classed(eucDist as HTMLElement, 'selected', true);
+
       this.distFunc = vector.dist;
       this.projectorEventContext.notifyDistanceMetricChanged(this.distFunc);
-      let neighbors = projector.dataSet.findNeighbors(
+      const neighbors = projector.dataSet.findNeighbors(
           this.selectedPointIndices[0], this.distFunc, this.numNN);
       this.updateNeighborsList(neighbors);
-    });
+    };
+
+    const cosDist = this.querySelector('.distance a.cosine') as HTMLLinkElement;
+    cosDist.onclick = () => {
+      const links = this.querySelectorAll('.distance a');
+      for (let i = 0; i < links.length; i++) {
+        util.classed(links[i] as HTMLElement, 'selected', false);
+      }
+      util.classed(cosDist, 'selected', true);
 
-    let cosDist = this.dom.select('.distance a.cosine');
-    cosDist.on('click', () => {
-      this.dom.selectAll('.distance a').classed('selected', false);
-      cosDist.classed('selected', true);
       this.distFunc = vector.cosDist;
       this.projectorEventContext.notifyDistanceMetricChanged(this.distFunc);
-      let neighbors = projector.dataSet.findNeighbors(
+      const neighbors = projector.dataSet.findNeighbors(
           this.selectedPointIndices[0], this.distFunc, this.numNN);
       this.updateNeighborsList(neighbors);
-    });
+    };
 
     // Called whenever the search text input changes.
-    let updateInput = (value: string, inRegexMode: boolean) => {
+    const updateInput = (value: string, inRegexMode: boolean) => {
       if (value == null || value.trim() === '') {
         this.searchBox.message = '';
         this.projectorEventContext.notifySelectionChanged([]);
         return;
       }
-      let indices = projector.dataSet.query(
+      const indices = projector.dataSet.query(
           value, inRegexMode, this.selectedMetadataField);
       if (indices.length === 0) {
         this.searchBox.message = '0 matches.';
@@ -263,10 +300,11 @@ export class InspectorPanel extends PolymerClass {
     });
 
     // Nearest neighbors controls.
-    let numNNInput = this.$$('#nn-slider') as HTMLInputElement;
-    let updateNumNN = () => {
+    const numNNInput = this.$$('#nn-slider') as HTMLInputElement;
+    const updateNumNN = () => {
       this.numNN = +numNNInput.value;
-      this.dom.select('.num-nn .nn-count').text(this.numNN);
+      (this.querySelector('.num-nn .nn-count') as HTMLSpanElement).innerText =
+          '' + this.numNN;
       if (this.selectedPointIndices != null) {
         this.projectorEventContext.notifySelectionChanged(
             [this.selectedPointIndices[0]]);
@@ -276,22 +314,22 @@ export class InspectorPanel extends PolymerClass {
     updateNumNN();
 
     // Filtering dataset.
-    this.setFilterButton.on('click', () => {
+    this.setFilterButton.onclick = () => {
       const indices = this.selectedPointIndices.concat(
           this.neighborsOfFirstPoint.map(n => n.index));
       projector.filterDataset(indices);
       this.enableResetFilterButton(true);
       this.updateFilterButtons(0);
-    });
+    };
 
-    this.resetFilterButton.on('click', () => {
+    this.resetFilterButton.onclick = () => {
       projector.resetFilterDataset();
       this.enableResetFilterButton(false);
-    });
+    };
 
-    this.clearSelectionButton.on('click', () => {
+    this.clearSelectionButton.onclick = () => {
       projector.adjustSelectionAndHover([]);
-    });
+    };
     this.enableResetFilterButton(false);
   }
 }
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-legend.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-legend.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-legend.html
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-legend.ts
similarity index 80%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-legend.ts
index d30a95548058c608a11c5990bbdce0da1d238447..1c4ddf940dc06c1eb6c4a523d18c2da673707934 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-legend.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-legend.ts
@@ -44,11 +44,6 @@ export interface ColorLegendThreshold {
 
 export class Legend extends LegendPolymer {
   renderInfo: ColorLegendRenderInfo;
-  dom: d3.Selection<HTMLElement>;
-
-  ready() {
-    this.dom = d3.select(this);
-  }
 
   _renderInfoChanged() {
     if (this.renderInfo == null) {
@@ -70,29 +65,32 @@ export class Legend extends LegendPolymer {
   }
 
   private getOffset(value: number): string {
-    let min = this.renderInfo.thresholds[0].value;
-    let max =
+    const min = this.renderInfo.thresholds[0].value;
+    const max =
         this.renderInfo.thresholds[this.renderInfo.thresholds.length - 1].value;
     return (100 * (value - min) / (max - min)).toFixed(2) + '%';
   }
 
   private setupLinearGradient() {
-    let linearGradient = this.dom.select('#gradient');
+    const linearGradient =
+        this.querySelector('#gradient') as SVGLinearGradientElement;
 
-    let width =
-        (this.dom.select('svg.gradient').node() as SVGElement).clientWidth;
+    const width =
+        (this.querySelector('svg.gradient') as SVGElement).clientWidth;
 
     // Set the svg <rect> to be the width of its <svg> parent.
-    this.dom.select('svg.gradient rect').attr('width', width);
+    (this.querySelector('svg.gradient rect') as SVGRectElement).style.width =
+        width + 'px';
 
     // Remove all <stop> children from before.
-    linearGradient.selectAll('*').remove();
+    linearGradient.innerHTML = '';
 
     // Add a <stop> child in <linearGradient> for each gradient threshold.
     this.renderInfo.thresholds.forEach(t => {
-      linearGradient.append('stop')
-          .attr('offset', this.getOffset(t.value))
-          .attr('stop-color', t.color);
+      const stopElement =
+          document.createElementNS('http://www.w3.org/2000/svg', 'stop');
+      stopElement.setAttribute('offset', this.getOffset(t.value));
+      stopElement.setAttribute('stop-color', t.color);
     });
   }
 }
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-metadata-card.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-metadata-card.html
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-metadata-card.ts
similarity index 83%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-metadata-card.ts
index 17a4700bb5c60dc14e8a97020079f1b3380d80a8..939300f3878e6c09551c77062a94a92d3cc07000 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-metadata-card.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-metadata-card.ts
@@ -28,8 +28,6 @@ export let MetadataCardPolymer = PolymerElement({
 });
 
 export class MetadataCard extends MetadataCardPolymer {
-  private dom: d3.Selection<any>;
-
   hasMetadata: boolean;
   metadata: Array<{key: string, value: string}>;
   label: string;
@@ -37,22 +35,28 @@ export class MetadataCard extends MetadataCardPolymer {
   private labelOption: string;
   private pointMetadata: PointMetadata;
 
+  private expandLessButton: HTMLButtonElement;
+  private expandMoreButton: HTMLButtonElement;
+
   ready() {
-    this.dom = d3.select(this);
+    this.expandLessButton =
+        this.querySelector('#expand-less') as HTMLButtonElement;
+    this.expandMoreButton =
+        this.querySelector('#expand-more') as HTMLButtonElement;
   }
-
   /** Handles a click on the expand more icon. */
   _expandMore() {
     (this.$$('#metadata-container') as any).toggle();
-    this.dom.select('#expand-more').style('display', 'none');
-    this.dom.select('#expand-less').style('display', '');
+
+    this.expandMoreButton.style.display = 'none';
+    this.expandLessButton.style.display = '';
   }
 
   /** Handles a click on the expand less icon. */
   _expandLess() {
     (this.$$('#metadata-container') as any).toggle();
-    this.dom.select('#expand-more').style('display', '');
-    this.dom.select('#expand-less').style('display', 'none');
+    this.expandMoreButton.style.display = '';
+    this.expandLessButton.style.display = 'none';
   }
 
   updateMetadata(pointMetadata?: PointMetadata) {
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-projections-panel.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-projections-panel.html
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-projections-panel.ts
similarity index 84%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-projections-panel.ts
index 9df182ed489afd6cf905706094949288bfb25a90..377c6c11ad5d19343682540bdadc3319b5d0ee3c 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector-projections-panel.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-projections-panel.ts
@@ -15,6 +15,7 @@ limitations under the License.
 
 import * as data from './data';
 import {DataSet, Projection, ProjectionType, SpriteAndMetadataInfo, State} from './data';
+import * as util from './util';
 import * as vector from './vector';
 import {Vector} from './vector';
 import {Projector} from './vz-projector';
@@ -92,13 +93,12 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
   public customSelectedSearchByMetadataOption: string;
 
   /** Polymer elements. */
-  private dom: d3.Selection<any>;
-  private runTsneButton: d3.Selection<HTMLButtonElement>;
-  private stopTsneButton: d3.Selection<HTMLButtonElement>;
+  private runTsneButton: HTMLButtonElement;
+  private stopTsneButton: HTMLButtonElement;
   private perplexitySlider: HTMLInputElement;
   private learningRateInput: HTMLInputElement;
-  private zDropdown: d3.Selection<HTMLElement>;
-  private iterationLabel: d3.Selection<HTMLElement>;
+  private zDropdown: HTMLElement;
+  private iterationLabel: HTMLElement;
 
   private customProjectionXLeftInput: ProjectorInput;
   private customProjectionXRightInput: ProjectorInput;
@@ -121,14 +121,14 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
   }
 
   ready() {
-    this.dom = d3.select(this);
-    this.zDropdown = this.dom.select('#z-dropdown');
-    this.runTsneButton = this.dom.select('.run-tsne');
-    this.stopTsneButton = this.dom.select('.stop-tsne');
-    this.perplexitySlider = this.$$('#perplexity-slider') as HTMLInputElement;
+    this.zDropdown = this.querySelector('#z-dropdown') as HTMLElement;
+    this.runTsneButton = this.querySelector('.run-tsne') as HTMLButtonElement;
+    this.stopTsneButton = this.querySelector('.stop-tsne') as HTMLButtonElement;
+    this.perplexitySlider =
+        this.querySelector('#perplexity-slider') as HTMLInputElement;
     this.learningRateInput =
-        this.$$('#learning-rate-slider') as HTMLInputElement;
-    this.iterationLabel = this.dom.select('.run-tsne-iter');
+        this.querySelector('#learning-rate-slider') as HTMLInputElement;
+    this.iterationLabel = this.querySelector('.run-tsne-iter') as HTMLElement;
   }
 
   disablePolymerChangesTriggerReprojection() {
@@ -143,27 +143,33 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
     if (this.perplexitySlider) {
       this.perplexity = +this.perplexitySlider.value;
     }
-    this.dom.select('.tsne-perplexity span').text(this.perplexity);
+    (this.querySelector('.tsne-perplexity span') as HTMLSpanElement).innerText =
+        '' + this.perplexity;
   }
 
   private updateTSNELearningRateFromUIChange() {
     if (this.learningRateInput) {
       this.learningRate = Math.pow(10, +this.learningRateInput.value);
     }
-    this.dom.select('.tsne-learning-rate span').text(this.learningRate);
+    (this.querySelector('.tsne-learning-rate span') as HTMLSpanElement)
+        .innerText = '' + this.learningRate;
   }
 
   private setupUIControls() {
     {
       const self = this;
-      this.dom.selectAll('.ink-tab').on('click', function() {
-        let id = this.getAttribute('data-tab');
-        self.showTab(id);
-      });
+      const inkTabs = this.querySelectorAll('.ink-tab');
+      for (let i = 0; i < inkTabs.length; i++) {
+        inkTabs[i].addEventListener('click', function() {
+          let id = this.getAttribute('data-tab');
+          self.showTab(id);
+        });
+      }
     }
 
-    this.runTsneButton.on('click', () => this.runTSNE());
-    this.stopTsneButton.on('click', () => this.dataSet.stopTSNE());
+    this.runTsneButton.addEventListener('click', () => this.runTSNE());
+    this.stopTsneButton.addEventListener(
+        'click', () => this.dataSet.stopTSNE());
 
     this.perplexitySlider.value = this.perplexity.toString();
     this.perplexitySlider.addEventListener(
@@ -177,8 +183,11 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
     this.setupCustomProjectionInputFields();
     // TODO: figure out why `--paper-input-container-input` css mixin didn't
     // work.
-    this.dom.selectAll('paper-dropdown-menu paper-input input')
-        .style('font-size', '14px');
+    const inputs =
+        this.querySelectorAll('paper-dropdown-menu paper-input input');
+    for (let i = 0; i < inputs.length; i++) {
+      (inputs[i] as HTMLElement).style.fontSize = '14px';
+    }
   }
 
   restoreUIFromBookmark(bookmark: State) {
@@ -226,9 +235,11 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
     this.updateTSNEPerplexityFromSliderChange();
     this.updateTSNELearningRateFromUIChange();
     if (this.iterationLabel) {
-      this.iterationLabel.text(bookmark.tSNEIteration.toString());
+      this.iterationLabel.innerText = bookmark.tSNEIteration.toString();
+    }
+    if (bookmark.selectedProjection != null) {
+      this.showTab(bookmark.selectedProjection);
     }
-    this.showTab(bookmark.selectedProjection);
     this.enablePolymerChangesTriggerReprojection();
   }
 
@@ -282,7 +293,11 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
   // and the DOM.
   setZDropdownEnabled(enabled: boolean) {
     if (this.zDropdown) {
-      this.zDropdown.attr('disabled', enabled ? null : true);
+      if (enabled) {
+        this.zDropdown.removeAttribute('disabled');
+      } else {
+        this.zDropdown.setAttribute('disabled', 'true');
+      }
     }
   }
 
@@ -296,13 +311,13 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
     this.updateTSNEPerplexityFromSliderChange();
     this.clearCentroids();
 
-    this.dom.select('#tsne-sampling')
-        .style('display', pointCount > data.TSNE_SAMPLE_SIZE ? null : 'none');
+    (this.querySelector('#tsne-sampling') as HTMLElement).style.display =
+        pointCount > data.TSNE_SAMPLE_SIZE ? null : 'none';
     const wasSampled =
         (dataSet == null) ? false : (dataSet.dim[0] > data.PCA_SAMPLE_DIM ||
                                      dataSet.dim[1] > data.PCA_SAMPLE_DIM);
-    this.dom.select('#pca-sampling')
-        .style('display', wasSampled ? null : 'none');
+    (this.querySelector('#pca-sampling') as HTMLElement).style.display =
+        wasSampled ? null : 'none';
     this.showTab('pca');
   }
 
@@ -332,12 +347,24 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
   public showTab(id: ProjectionType) {
     this.currentProjection = id;
 
-    let tab = this.dom.select('.ink-tab[data-tab="' + id + '"]');
-    this.dom.selectAll('.ink-tab').classed('active', false);
-    tab.classed('active', true);
-    this.dom.selectAll('.ink-panel-content').classed('active', false);
-    this.dom.select('.ink-panel-content[data-panel="' + id + '"]')
-        .classed('active', true);
+    const tab =
+        this.querySelector('.ink-tab[data-tab="' + id + '"]') as HTMLElement;
+    const allTabs = this.querySelectorAll('.ink-tab');
+    for (let i = 0; i < allTabs.length; i++) {
+      util.classed(allTabs[i] as HTMLElement, 'active', false);
+    }
+
+    util.classed(tab, 'active', true);
+
+    const allTabContent = this.querySelectorAll('.ink-panel-content');
+    for (let i = 0; i < allTabContent.length; i++) {
+      util.classed(allTabContent[i] as HTMLElement, 'active', false);
+    }
+
+    util.classed(
+        this.querySelector('.ink-panel-content[data-panel="' + id + '"]') as
+            HTMLElement,
+        'active', true);
 
     // guard for unit tests, where polymer isn't attached and $ doesn't exist.
     if (this.$ != null) {
@@ -392,17 +419,17 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
   }
 
   private runTSNE() {
-    this.runTsneButton.attr('disabled', true);
-    this.stopTsneButton.attr('disabled', null);
+    this.runTsneButton.disabled = true;
+    this.stopTsneButton.disabled = null;
     this.dataSet.projectTSNE(
         this.perplexity, this.learningRate, this.tSNEis3d ? 3 : 2,
         (iteration: number) => {
           if (iteration != null) {
-            this.iterationLabel.text(iteration);
+            this.iterationLabel.innerText = '' + iteration;
             this.projector.notifyProjectionPositionsUpdated();
           } else {
-            this.runTsneButton.attr('disabled', null);
-            this.stopTsneButton.attr('disabled', true);
+            this.runTsneButton.disabled = null;
+            this.stopTsneButton.disabled = true;
           }
         });
   }
@@ -422,7 +449,7 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
       totalVariance += variances[this.pcaZ];
     }
     msg += (totalVariance * 100).toFixed(1) + '%.';
-    this.dom.select('#total-variance').html(msg);
+    (this.querySelector('#total-variance') as HTMLElement).innerHTML = msg;
   }
 
   private showPCA() {
@@ -440,7 +467,7 @@ export class ProjectionsPanel extends ProjectionsPanelPolymer {
       this.projector.setProjection(projection);
       let numComponents = Math.min(NUM_PCA_COMPONENTS, this.dataSet.dim[1]);
       this.updateTotalVarianceMessage();
-      this.pcaComponents = d3.range(0, numComponents).map(i => {
+      this.pcaComponents = util.range(numComponents).map(i => {
         let fracVariance = this.dataSet.fracVariancesExplained[i];
         return {
           id: i,
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector-util.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-util.ts
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector-util.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector-util.ts
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector.html b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector.html
similarity index 100%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector.html
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector.html
diff --git a/tensorflow/tensorboard/components/vz_projector/vz-projector.ts b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector.ts
similarity index 94%
rename from tensorflow/tensorboard/components/vz_projector/vz-projector.ts
rename to tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector.ts
index ba0f669e56fe4e97119218d5124c588d515d2ebf..bf98a4d478599f7b859e893e7a17567f22fd5114 100644
--- a/tensorflow/tensorboard/components/vz_projector/vz-projector.ts
+++ b/tensorflow/tensorboard/components/vz_projector_d3v4/vz-projector.ts
@@ -70,7 +70,6 @@ export class Projector extends ProjectorPolymer implements
 
   private originalDataSet: DataSet;
   private dataSetBeforeFilter: DataSet;
-  private dom: d3.Selection<any>;
   private projectorScatterPlotAdapter: ProjectorScatterPlotAdapter;
   private dim: number;
 
@@ -94,13 +93,12 @@ export class Projector extends ProjectorPolymer implements
   private projectionsPanel: ProjectionsPanel;
   private metadataCard: MetadataCard;
 
-  private statusBar: d3.Selection<HTMLElement>;
+  private statusBar: HTMLDivElement;
   private analyticsLogger: AnalyticsLogger;
   private eventLogging: boolean;
   private pageViewLogging: boolean;
 
   ready() {
-    this.dom = d3.select(this);
     logging.setDomContainer(this);
 
     this.analyticsLogger =
@@ -130,7 +128,7 @@ export class Projector extends ProjectorPolymer implements
     this.bookmarkPanel = this.$['bookmark-panel'] as BookmarkPanel;
     this.bookmarkPanel.initialize(this, this as ProjectorEventContext);
     this.metadataCard = this.$['metadata-card'] as MetadataCard;
-    this.statusBar = this.dom.select('#status-bar');
+    this.statusBar = this.querySelector('#status-bar') as HTMLDivElement;
     this.scopeSubtree(this.$$('#notification-dialog'), true);
     this.setupUIControls();
     this.initializeDataProvider();
@@ -199,8 +197,8 @@ export class Projector extends ProjectorPolymer implements
       this.dataPanel.metadataChanged(spriteAndMetadata, metadataFile);
       // Set the container to a fixed height, otherwise in Colab the
       // height can grow indefinitely.
-      let container = this.dom.select('#container');
-      container.style('height', container.property('clientHeight') + 'px');
+      const container = this.querySelector('#container') as HTMLDivElement;
+      container.style.height = container.clientHeight + 'px';
     } else {
       this.setCurrentDataSet(null);
     }
@@ -226,7 +224,7 @@ export class Projector extends ProjectorPolymer implements
     this.dataSetFilterIndices = pointIndices;
     this.projectorScatterPlotAdapter.updateScatterPlotPositions();
     this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-    this.adjustSelectionAndHover(d3.range(selectionSize));
+    this.adjustSelectionAndHover(util.range(selectionSize));
   }
 
   resetFilterDataset() {
@@ -387,8 +385,10 @@ export class Projector extends ProjectorPolymer implements
       ds.normalize();
     }
     this.dim = (ds == null) ? 0 : ds.dim[1];
-    this.dom.select('span.numDataPoints').text((ds == null) ? '0' : ds.dim[0]);
-    this.dom.select('span.dim').text((ds == null) ? '0' : ds.dim[1]);
+    (this.querySelector('span.numDataPoints') as HTMLSpanElement).innerText =
+        (ds == null) ? '0' : '' + ds.dim[0];
+    (this.querySelector('span.dim') as HTMLSpanElement).innerText =
+        (ds == null) ? '0' : '' + ds.dim[1];
 
     this.dataSet = ds;
 
@@ -425,10 +425,9 @@ export class Projector extends ProjectorPolymer implements
     });
 
     window.addEventListener('resize', () => {
-      let container = this.dom.select('#container');
-      let parentHeight =
-          (container.node().parentNode as HTMLElement).clientHeight;
-      container.style('height', parentHeight + 'px');
+      const container = this.querySelector('#container') as HTMLDivElement;
+      const parentHeight = (container.parentNode as HTMLElement).clientHeight;
+      container.style.height = parentHeight + 'px';
       this.projectorScatterPlotAdapter.resize();
     });
 
@@ -463,13 +462,13 @@ export class Projector extends ProjectorPolymer implements
       }
     }
     if (this.selectedPointIndices.length === 0) {
-      this.statusBar.style('display', hoverText ? null : 'none');
-      this.statusBar.text(hoverText);
+      this.statusBar.style.display = hoverText ? null : 'none';
+      this.statusBar.innerText = hoverText;
     }
   }
 
-  private getScatterContainer(): d3.Selection<any> {
-    return this.dom.select('#scatter');
+  private getScatterContainer(): HTMLDivElement {
+    return this.querySelector('#scatter') as HTMLDivElement;
   }
 
   private onSelectionChanged(
@@ -479,8 +478,8 @@ export class Projector extends ProjectorPolymer implements
     this.neighborsOfFirstPoint = neighborsOfFirstPoint;
     let totalNumPoints =
         this.selectedPointIndices.length + neighborsOfFirstPoint.length;
-    this.statusBar.text(`Selected ${totalNumPoints} points`)
-        .style('display', totalNumPoints > 0 ? null : 'none');
+    this.statusBar.innerText = `Selected ${totalNumPoints} points`;
+    this.statusBar.style.display = totalNumPoints > 0 ? null : 'none';
   }
 
   setProjection(projection: Projection) {
diff --git a/tensorflow/tensorboard/components/vz_sorting/BUILD b/tensorflow/tensorboard/components/vz_sorting/BUILD
index ae3f6e27774acedad155b73d28676c1a2f2ab3ae..97b727eb5d6f4feaa2b4c155ad6f586e374d15df 100644
--- a/tensorflow/tensorboard/components/vz_sorting/BUILD
+++ b/tensorflow/tensorboard/components/vz_sorting/BUILD
@@ -1,13 +1,13 @@
 package(default_visibility = ["//tensorflow:internal"])
 
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
 load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
 
 licenses(["notice"])  # Apache 2.0
 
-webfiles(
+web_library(
     name = "vz_sorting",
     srcs = [
         "vz-sorting.html",
@@ -37,12 +37,12 @@ tensorboard_webcomponent_library(
         "vz-sorting.html",
         ":legacy_ts",
     ],
-    visibility = ["//visibility:public"],
     destdir = "vz-sorting",
 )
 
 tensorboard_ts_library(
     name = "legacy_ts",
     srcs = ["sorting.ts"],
-    deps = ["//tensorflow/tensorboard/components:common_deps"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
 )
diff --git a/tensorflow/tensorboard/components/vz_sorting/test/BUILD b/tensorflow/tensorboard/components/vz_sorting/test/BUILD
index f8b01b61f29301e90f883d192d320acddedb4d78..649bfc5c6e9f99561211d79ad5954244ebc7b61c 100644
--- a/tensorflow/tensorboard/components/vz_sorting/test/BUILD
+++ b/tensorflow/tensorboard/components/vz_sorting/test/BUILD
@@ -17,7 +17,10 @@ filegroup(
 tensorboard_wct_test_suite(
     name = "legacy_test",
     size = "medium",
-    srcs = ["index.html"],
+    srcs = [
+        "index.html",
+        ":legacy_ts",
+    ],
     deps = [
         "//tensorflow/tensorboard/components/vz_sorting:legacy",
         "//third_party/javascript/polymer/v1/webcomponentsjs:lib",
@@ -28,8 +31,18 @@ tensorboard_ts_library(
     name = "legacy_ts",
     testonly = 1,
     srcs = ["sortingTests.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
     deps = [
-        "//tensorflow/tensorboard/components:common_deps",
         "//tensorflow/tensorboard/components/vz_sorting:legacy_ts",
+        "//third_party/javascript/node_modules/typescript:es2015.promise",
+        "//third_party/javascript/plottable/v1:typings",
+        "//third_party/javascript/typings/chai",
+        "//third_party/javascript/typings/d3",
+        "//third_party/javascript/typings/lodash",
+        "//third_party/javascript/typings/mocha",
+        "//third_party/javascript/typings/polymer:polymer_without_externs",
+        "//third_party/javascript/typings/sinon",
+        "//third_party/javascript/typings/webcomponents_js",
     ],
 )
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/BUILD b/tensorflow/tensorboard/components/vz_sorting_d3v4/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..22775248d7f35d168fcc7023d85b215dc2b30be3
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/BUILD
@@ -0,0 +1,56 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "vz_sorting_d3v4",
+    srcs = [
+        "bundle.js",
+        "vz-sorting.html",
+    ],
+    path = "/vz-sorting",
+    visibility = ["//visibility:public"],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"VZ.Sorting": ["sorting.ts"]},
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
+
+################################################################################
+# MARKED FOR DELETION
+
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_ts_library")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_webcomponent_library")
+
+tensorboard_webcomponent_library(
+    name = "legacy",
+    srcs = [
+        "vz-sorting.html",
+        ":legacy_ts",
+    ],
+    destdir = "vz-sorting",
+)
+
+tensorboard_ts_library(
+    name = "legacy_ts",
+    srcs = ["sorting.ts"],
+    deps_mgmt = "off",
+    runtime = "nodejs",
+)
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/sorting.ts b/tensorflow/tensorboard/components/vz_sorting_d3v4/sorting.ts
new file mode 100644
index 0000000000000000000000000000000000000000..061184d24bf30623e05834269b32acf745a56299
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/sorting.ts
@@ -0,0 +1,107 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/**
+ * Compares tag names asciinumerically broken into components.
+ *
+ * <p>This is the comparison function used for sorting most string values in
+ * TensorBoard. Unlike the standard asciibetical comparator, this function
+ * knows that 'a10b' > 'a2b'. Fixed point and engineering notation are
+ * supported. This function also splits the input by slash and underscore to
+ * perform array comparison. Therefore it knows that 'a/a' < 'a+/a' even
+ * though '+' < '/' in the ASCII table.
+ */
+export function compareTagNames(a, b: string): number {
+  let ai = 0;
+  let bi = 0;
+  while (true) {
+    if (ai === a.length) {
+      return bi === b.length ? 0 : -1;
+    }
+    if (bi === b.length) {
+      return 1;
+    }
+    if (isDigit(a[ai]) && isDigit(b[bi])) {
+      const ais = ai;
+      const bis = bi;
+      ai = consumeNumber(a, ai + 1);
+      bi = consumeNumber(b, bi + 1);
+      const an = parseFloat(a.slice(ais, ai));
+      const bn = parseFloat(b.slice(bis, bi));
+      if (an < bn) {
+        return -1;
+      }
+      if (an > bn) {
+        return 1;
+      }
+      continue;
+    }
+    if (isBreak(a[ai])) {
+      if (!isBreak(b[bi])) {
+        return -1;
+      }
+    } else if (isBreak(b[bi])) {
+      return 1;
+    } else if (a[ai] < b[bi]) {
+      return -1;
+    } else if (a[ai] > b[bi]) {
+      return 1;
+    }
+    ai++;
+    bi++;
+  }
+}
+
+function consumeNumber(s: string, i: number): number {
+  enum State { NATURAL, REAL, EXPONENT_SIGN, EXPONENT }
+  let state = State.NATURAL;
+  for (; i < s.length; i++) {
+    if (state === State.NATURAL) {
+      if (s[i] === '.') {
+        state = State.REAL;
+      } else if (s[i] === 'e' || s[i] === 'E') {
+        state = State.EXPONENT_SIGN;
+      } else if (!isDigit(s[i])) {
+        break;
+      }
+    } else if (state === State.REAL) {
+      if (s[i] === 'e' || s[i] === 'E') {
+        state = State.EXPONENT_SIGN;
+      } else if (!isDigit(s[i])) {
+        break;
+      }
+    } else if (state === State.EXPONENT_SIGN) {
+      if (isDigit(s[i]) || s[i] === '+' || s[i] === '-') {
+        state = State.EXPONENT;
+      } else {
+        break;
+      }
+    } else if (state === State.EXPONENT) {
+      if (!isDigit(s[i])) {
+        break;
+      }
+    }
+  }
+  return i;
+}
+
+function isDigit(c: string): boolean {
+  return '0' <= c && c <= '9';
+}
+
+function isBreak(c: string): boolean {
+  // TODO(jart): Remove underscore when people stop using it like a slash.
+  return c === '/' || c === '_' || isDigit(c);
+}
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/test/BUILD b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4993cf841dcc9a510e6b16f4942d31da3f5903e1
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/BUILD
@@ -0,0 +1,46 @@
+package(
+    default_testonly = True,
+    default_visibility = ["//tensorflow:internal"],
+)
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+load("//tensorflow/tensorboard:hacks.bzl", "tensorboard_typescript_bundle")
+load("//tensorflow/tensorboard:defs.bzl", "tensorboard_typescript_genrule")
+
+licenses(["notice"])  # Apache 2.0
+
+web_library(
+    name = "test",
+    srcs = [
+        "bundle.js",
+        "tests.html",
+    ],
+    path = "/vz-sorting/test",
+    deps = [
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4",
+        "@org_npmjs_registry_web_component_tester",
+    ],
+)
+
+tensorboard_typescript_genrule(
+    name = "ts",
+    srcs = ["bundle.ts"],
+    typings = [
+        "@org_definitelytyped//:mocha.d.ts",
+        "@org_definitelytyped//:chai.d.ts",
+        "//tensorflow/tensorboard/components/vz_sorting_d3v4:bundle.d.ts",
+    ],
+)
+
+tensorboard_typescript_bundle(
+    name = "bundle",
+    out = "bundle.ts",
+    namespace_srcs = {"VZ.Sorting": ["sortingTests.ts"]},
+)
+
+filegroup(
+    name = "all_files",
+    testonly = 0,
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/test/sortingTests.ts b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/sortingTests.ts
new file mode 100644
index 0000000000000000000000000000000000000000..510685cb4b5e42ca19e56acef6b1f87347811c99
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/sortingTests.ts
@@ -0,0 +1,77 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+import {compareTagNames} from '../sorting';
+
+describe('compareTagNames', () => {
+
+  const assert = chai.assert;
+  const sortTagNames = (a) => a.sort(compareTagNames);
+
+  it('is asciibetical', () => {
+    assert.deepEqual(sortTagNames(['a', 'b']), ['a', 'b']);
+    assert.deepEqual(sortTagNames(['a', 'B']), ['B', 'a']);
+  });
+
+  it('sorts integer portions', () => {
+    assert.deepEqual(['03', '1'].sort(), ['03', '1']);
+    assert.deepEqual(sortTagNames(['03', '1']), ['1', '03']);
+    assert.deepEqual(sortTagNames(['a03', 'a1']), ['a1', 'a03']);
+    assert.deepEqual(sortTagNames(['a03', 'b1']), ['a03', 'b1']);
+    assert.deepEqual(sortTagNames(['x0a03', 'x0a1']), ['x0a1', 'x0a03']);
+    assert.deepEqual(sortTagNames(['a/b/03', 'a/b/1']), ['a/b/1', 'a/b/03']);
+  });
+
+  it('sorts fixed point numbers', () => {
+    assert.deepEqual(sortTagNames(['a0.1', 'a0.01']), ['a0.01', 'a0.1']);
+  });
+
+  it('sorts engineering notation', () => {
+    assert.deepEqual(sortTagNames(['a1e9', 'a9e8']), ['a9e8', 'a1e9']);
+    assert.deepEqual(sortTagNames(['a1e+9', 'a9e+8']), ['a9e+8', 'a1e+9']);
+    assert.deepEqual(sortTagNames(['a1e+5', 'a9e-6']), ['a9e-6', 'a1e+5']);
+    assert.deepEqual(sortTagNames(['a1.0e9', 'a9.0e8']), ['a9.0e8', 'a1.0e9']);
+    assert.deepEqual(
+        sortTagNames(['a1.0e+9', 'a9.0e+8']), ['a9.0e+8', 'a1.0e+9']);
+  });
+
+  it('is componentized by slash', () => {
+    assert.deepEqual(['a+/a', 'a/a', 'ab/a'].sort(), ['a+/a', 'a/a', 'ab/a']);
+    assert.deepEqual(
+        sortTagNames(['a+/a', 'a/a', 'ab/a']), ['a/a', 'a+/a', 'ab/a']);
+  });
+
+  it('is componentized by underscore', () => {
+    assert.deepEqual(
+        sortTagNames(['a+_a', 'a_a', 'ab_a']), ['a_a', 'a+_a', 'ab_a']);
+    assert.deepEqual(
+        sortTagNames(['a+/a', 'a_a', 'ab_a']), ['a_a', 'a+/a', 'ab_a']);
+  });
+
+  it('is componentized by number boundaries', () => {
+    assert.deepEqual(
+        sortTagNames(['a+0a', 'a0a', 'ab0a']), ['a0a', 'a+0a', 'ab0a']);
+  });
+
+  it('empty comes first', () => {
+    assert.deepEqual(sortTagNames(['a', '//', '/', '']), ['', '/', '//', 'a']);
+  });
+
+  it('decimal parsed correctly', () => {
+    assert.deepEqual(sortTagNames(['0.2', '0.03']), ['0.03', '0.2']);
+    assert.deepEqual(sortTagNames(['0..2', '0..03']), ['0..2', '0..03']);
+    assert.deepEqual(sortTagNames(['.2', '.03']), ['.2', '.03']);
+  });
+});
diff --git a/tensorflow/tensorboard/components/vz_sorting_d3v4/test/tests.html b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/tests.html
new file mode 100644
index 0000000000000000000000000000000000000000..d1b4a1db31ccaa1dfbc0838cbe79709b5f1cbedd
--- /dev/null
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/test/tests.html
@@ -0,0 +1,23 @@
+<!doctype html>
+<!--
+@license
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<meta charset="utf-8">
+<script src="../../web-component-tester/browser.js"></script>
+<body>
+<script src="../bundle.js"></script>
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/components/tf_imports_google/dagre.html b/tensorflow/tensorboard/components/vz_sorting_d3v4/vz-sorting.html
similarity index 91%
rename from tensorflow/tensorboard/components/tf_imports_google/dagre.html
rename to tensorflow/tensorboard/components/vz_sorting_d3v4/vz-sorting.html
index 5b8b9817410d833fa63081b2b2117e869e700b5d..9f925951cb2db13638dd8a9df8c4e9adb8fda5f2 100644
--- a/tensorflow/tensorboard/components/tf_imports_google/dagre.html
+++ b/tensorflow/tensorboard/components/vz_sorting_d3v4/vz-sorting.html
@@ -15,4 +15,4 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-<link rel="import" href="../dagre-library/dagre.html">
+<script src="bundle.js"></script>
diff --git a/tensorflow/tensorboard/defs.bzl b/tensorflow/tensorboard/defs.bzl
index bae7078c5b5ba741ff0292ca25a11c66d5088264..5d88baa5be31d7d2b87c688756d1921c158db1f2 100644
--- a/tensorflow/tensorboard/defs.bzl
+++ b/tensorflow/tensorboard/defs.bzl
@@ -46,6 +46,8 @@ def tensorboard_typescript_genrule(name, srcs, typings=[], **kwargs):
       cmd = "$(location @com_microsoft_typescript//:tsc.sh)" +
             " --inlineSourceMap" +
             " --inlineSources" +
+            # Do not follow triple slash references within typings.
+            " --noResolve" +
             " --declaration" +
             " --outDir $(@D) " +
             " ".join(["$(locations %s)" % i for i in inputs]),
@@ -58,6 +60,26 @@ def tensorboard_typescript_genrule(name, srcs, typings=[], **kwargs):
       **kwargs
   )
 
+def tensorboard_karma_web_test_suite(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
+def tensorboard_ts_config(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
+def tensorboard_ts_declaration(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
+def tensorboard_ts_development_sources(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
+def tensorboard_ts_devserver(**kwargs):
+  """Rules referencing this will be deleted from the codebase soon."""
+  pass
+
 def tensorboard_ts_library(**kwargs):
   """Rules referencing this will be deleted from the codebase soon."""
   pass
diff --git a/tensorflow/tensorboard/demo/BUILD b/tensorflow/tensorboard/demo/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..66065af0441299dae274996deab9e004c104df26
--- /dev/null
+++ b/tensorflow/tensorboard/demo/BUILD
@@ -0,0 +1,20 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library")
+
+licenses(["notice"])  # Apache 2.0
+
+# THIS PACKAGE HAS MOVED
+# See tensorflow/tensorboard/components/tf_tensorboard_d3v4:demo
+
+web_library(
+    name = "demo_data",
+    srcs = glob(["data/**"]),
+    path = "/",
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/demo/data/logdir b/tensorflow/tensorboard/demo/data/logdir
new file mode 100644
index 0000000000000000000000000000000000000000..b6362b45d777266d6204b23884222a080f789f71
--- /dev/null
+++ b/tensorflow/tensorboard/demo/data/logdir
@@ -0,0 +1 @@
+{"logdir": "/foo/some/fake/logdir"}
\ No newline at end of file
diff --git a/tensorflow/tensorboard/demo/index.html b/tensorflow/tensorboard/demo/index.html
deleted file mode 100644
index 581f8a27235ba8b67bf95f0e9afac9d3abe4b20e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/demo/index.html
+++ /dev/null
@@ -1,31 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<!-- This demo index file serves statically serialized TensorBoard json.
-It is essentially a mocked version of the TensorBoard backend. -->
-<html>
-<head>
-  <script src="../components/webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="import" href="../components/tf_tensorboard/tf-tensorboard.html">
-    <link rel="stylesheet" type="text/css" href="../lib/css/global.css">
-    <link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
-</head>
-<body>
-  <tf-tensorboard demo-dir="data/" use-hash></tf-tensorboard>
-</body>
-</html>
diff --git a/tensorflow/tensorboard/dist/index.html b/tensorflow/tensorboard/dist/index.html
deleted file mode 100644
index 66fce9fe9af1db0bd6ac66a234370f85978c8067..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/dist/index.html
+++ /dev/null
@@ -1,32 +0,0 @@
-<!doctype html>
-<!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-<html>
-  <head>
-    <title>TensorBoard</title>
-    <script src="webcomponentsjs/webcomponents-lite.min.js"></script>
-    <link rel="stylesheet" type="text/css" href="lib/css/global.css">
-    <link rel="stylesheet" type="text/css" href="plottable/plottable.css">
-    <link rel="shortcut icon" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMQAAADECAMAAAD3eH5ZAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAD/UExURfFlKfaELvFmKfNyK/67NvWALf68Nv69NvNxK/20NfyyNP22NfN0K/JrKvqhMv2zNf25Nf24Nf23NfeOL/yzNPyvNPJoKviWMPmeMfN1K/WBLfePL/FnKfeML/qlMvR7LPmcMfeLL/aJLvR5LPFoKfJuKvR3LP66NvywNPeNL/V/LfaILv21Nf26NfNzK/NvK/R6LPmaMfyxNPqfMvV+LfurM/iSMPmbMfJvKvmdMfumM/qiMvmZMfytNPJqKvysNPN2K/iYMPNwK/upM/JtKvJsKviVMPaHLvaGLvJpKvR8LPaKLvqkMvuqM/aFLvR4LPuoM/iTMPWDLfiRMPmYMXS0ngkAAALoSURBVHja7drnctpAFIbhFUISSKJ3MKYa0+y4xTW9937/15JkJhlTjhrSrHRmvuf/as6L0YLFCgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBJ6njenqspzgnPrsrGX9Zpi2tCrmnc6+dYNthVY5WpMmxQLWPdMsOuYVwzNj3ei2t3mQwaV43BJPDCS2NbJ5aEeuX/+9qcjQOtfFIkIkrvY2g4MVcmOBsFWbowKO/kNyj62gRpJcDaPBlxLr1B0zdG0C/8LzbJiJrshuvy1gzlA9+rD8mIkuyIJjFE3/dqnYwoSm7IUEPoD/wut8iIguSIDjlFxe/yfXL5vuSI21BTZLLhXoOILMO8Hxwa/L8bI0LfmUdhGowb2ZvT0e57pFNDgB06IlVyjmmIBl2T/nl9Rw6SD9GgSG/Q0uQkaW3XhmovKQ3eFQ4N2Uo9OQ1eFZsNerf7vP+rO4rhmY1Lg3vFVoP8+8BXg1sFnwbnCk4NThW8GuiKBDdkVVtTNFvNelVsNqTbyWnIOM2oeTRoyWvwmpJHg/ucXBrcJuXT4DwrpwZi2vy0VCx8YtXg/D2bU4OfiuQ3eFfE2KD4bfCqiLNB993gXsGlwa2CT4NzBacGIVQ6YsipQdh0xEdODUKjIxrSp88onZ8zbbFLg1DoiFO5BXvDGv2My9/JhUT8JUZTI0yDaNHLBzIbvqTDNYhUiVw/kdjQ1kM2CHFDPjKW+KzyRTF0g/ga9w9y+fANQpxvX8CU+Ny7FUWDeF3Y+g3lROIf4k0UDX9eCyvO531PyYhHga9zvPZJU5b73Y/eXj8Hv9D48n6HaF5LbcjRt8TZTtda5M1DfXnbkX1C0SHCFKzQB5Fe8op4GNGNHavvZESbVwT5r6W1xyuCPBY3Y9YgDqzknH/e3YfNzzuL30l0IebrZ5kKtuDIXt1n868ET6kf3/49tLvrCcZyF8Pu215dAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAcPIbNrBhOaBXucoAAAAASUVORK5CYII=">
-    <link rel="import" href="dist/bazel-html-imports.html">
-    <link rel="import" href="dist/tf-tensorboard.html">
-  </head>
-  <body>
-    <tf-tensorboard></tf-tensorboard>
-  </body>
-</html>
diff --git a/tensorflow/tensorboard/dist/tf-tensorboard.html b/tensorflow/tensorboard/dist/tf-tensorboard.html
deleted file mode 100644
index 7922c69793d7c5051b21abe526727720718aebaa..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/dist/tf-tensorboard.html
+++ /dev/null
@@ -1,26188 +0,0 @@
-<!-- Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-============================================================================
-
-This file is generated by `gulp` & `vulcanize`. Do not directly change it.
-Instead, use `gulp regenerate` to create a new version with your changes.
--->
-
-<html><head><!--
-@license
-Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
---><meta charset="UTF-8"><link rel="import" href="../polymer/polymer.html">
-<link rel="import" href="../iron-icons/iron-icons.html">
-<link rel="import" href="../paper-tabs/paper-tabs.html">
-<link rel="import" href="../paper-dialog/paper-dialog.html">
-<link rel="import" href="../paper-checkbox/paper-checkbox.html">
-<link rel="import" href="../paper-toolbar/paper-toolbar.html">
-<link rel="import" href="../paper-button/paper-button.html">
-<link rel="import" href="../paper-icon-button/paper-icon-button.html">
-<link rel="import" href="../paper-header-panel/paper-header-panel.html">
-
-
-</head><body><div hidden="" by-vulcanize=""><dom-module id="tf-globals" assetpath="../tf-globals/">
-  <script>/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace */
-var TF;
-(function (TF) {
-    var Globals;
-    (function (Globals) {
-        // The names of TensorBoard tabs.
-        Globals.TABS = [
-            'scalars', 'images', 'audio', 'graphs', 'distributions', 'histograms',
-            'embeddings'
-        ];
-        // If true, TensorBoard stores its hash in the URI state.
-        // If false, tab switching in TensorBoard will not update location hash,
-        // because hash updates interfere with wct_tests.
-        Globals.USE_HASH = false;
-        // If USE_HASH is false, FAKE_HASH holds the hash contents.
-        Globals.FAKE_HASH = '';
-    })(Globals = TF.Globals || (TF.Globals = {}));
-})(TF || (TF = {}));
-</script>
-</dom-module>
-
-<script src="../lodash/lodash.min.js"></script>
-<link rel="import" href="../paper-slider/paper-slider.html">
-<link rel="import" href="../paper-input/paper-input.html">
-
-<dom-module id="tf-smoothing-input" assetpath="../tf-scalar-dashboard/">
-  <template>
-    <h3 class="title">Smoothing</h3>
-    <div class="smoothing-block">
-      <paper-slider id="slider" value="{{weight}}" immediate-value="{{_immediateWeightNumberForPaperSlider}}" type="number" step="[[step]]" min="[[min]]" max="[[max]]"></paper-slider>
-      <paper-input id="input" label="weight" no-label-float="" value="{{_inputWeightStringForPaperInput}}" type="number" step="[[step]]" min="[[min]]" max="[[max]]"></paper-input>
-    </div>
-    <style>
-      .title {
-        color: var(--paper-grey-800);
-        margin: 0;
-        font-weight: normal;
-        font-size: 14px;
-        margin-bottom: 5px;
-      }
-
-      .smoothing-block {
-        display: flex;
-      }
-
-      paper-slider {
-        margin-left: 12px;
-        --paper-slider-knob-color: var(--tb-orange-strong);
-        --paper-slider-active-color: var(--tb-orange-strong);
-        flex-grow: 2;
-      }
-
-      paper-input {
-        --paper-input-container-focus-color: var(--tb-orange-strong);
-        --paper-input-container-input: {
-          font-size: 14px;
-        };
-        --paper-input-container-label: {
-          font-size: 14px;
-        };
-        width: 60px;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-smoothing-input",
-
-      properties: {
-        step: Number,
-        max: Number,
-        min: Number,
-
-        weight: {
-          type: Number,
-          value: 0.6,
-          notify: true
-        },
-
-        _immediateWeightNumberForPaperSlider: {
-          type: Number,
-          notify: true,
-          observer: '_immediateWeightNumberForPaperSliderChanged'
-        },
-
-        // Paper input treats values as strings even if you specify them as
-        // numbers.
-        _inputWeightStringForPaperInput: {
-          type: String,
-          notify: true,
-          observer: '_inputWeightStringForPaperInputChanged'
-        }
-      },
-
-      _updateWeight: _.debounce(function(val) {
-        this.weight = val;
-      }, 250),
-
-      _immediateWeightNumberForPaperSliderChanged: function() {
-        this._inputWeightStringForPaperInput =
-            this._immediateWeightNumberForPaperSlider.toString();
-        this._updateWeight.call(this, this._immediateWeightNumberForPaperSlider);
-      },
-
-      _inputWeightStringForPaperInputChanged: function() {
-        if (+this._inputWeightStringForPaperInput < 0) {
-          this._inputWeightStringForPaperInput = '0';
-        }
-        else if (+this._inputWeightStringForPaperInput > 1) {
-          this._inputWeightStringForPaperInput = '1';
-        }
-
-        var d = +this._inputWeightStringForPaperInput;
-        if (!isNaN(d)) {
-          this._updateWeight.call(this, d);
-        }
-      }
-    });
-  </script>
-</dom-module>
-<script src="../d3/d3.js"></script>
-<script>/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var VZ;
-(function (VZ) {
-    var Sorting;
-    (function (Sorting) {
-        /**
-         * Compares tag names asciinumerically broken into components.
-         *
-         * <p>This is the comparison function used for sorting most string values in
-         * TensorBoard. Unlike the standard asciibetical comparator, this function
-         * knows that 'a10b' > 'a2b'. Fixed point and engineering notation are
-         * supported. This function also splits the input by slash and underscore to
-         * perform array comparison. Therefore it knows that 'a/a' < 'a+/a' even
-         * though '+' < '/' in the ASCII table.
-         */
-        function compareTagNames(a, b) {
-            var ai = 0;
-            var bi = 0;
-            while (true) {
-                if (ai === a.length) {
-                    return bi === b.length ? 0 : -1;
-                }
-                if (bi === b.length) {
-                    return 1;
-                }
-                if (isDigit(a[ai]) && isDigit(b[bi])) {
-                    var ais = ai;
-                    var bis = bi;
-                    ai = consumeNumber(a, ai + 1);
-                    bi = consumeNumber(b, bi + 1);
-                    var an = parseFloat(a.slice(ais, ai));
-                    var bn = parseFloat(b.slice(bis, bi));
-                    if (an < bn) {
-                        return -1;
-                    }
-                    if (an > bn) {
-                        return 1;
-                    }
-                    continue;
-                }
-                if (isBreak(a[ai])) {
-                    if (!isBreak(b[bi])) {
-                        return -1;
-                    }
-                }
-                else if (isBreak(b[bi])) {
-                    return 1;
-                }
-                else if (a[ai] < b[bi]) {
-                    return -1;
-                }
-                else if (a[ai] > b[bi]) {
-                    return 1;
-                }
-                ai++;
-                bi++;
-            }
-        }
-        Sorting.compareTagNames = compareTagNames;
-        function consumeNumber(s, i) {
-            var State;
-            (function (State) {
-                State[State["NATURAL"] = 0] = "NATURAL";
-                State[State["REAL"] = 1] = "REAL";
-                State[State["EXPONENT_SIGN"] = 2] = "EXPONENT_SIGN";
-                State[State["EXPONENT"] = 3] = "EXPONENT";
-            })(State || (State = {}));
-            var state = State.NATURAL;
-            for (; i < s.length; i++) {
-                if (state === State.NATURAL) {
-                    if (s[i] === '.') {
-                        state = State.REAL;
-                    }
-                    else if (s[i] === 'e' || s[i] === 'E') {
-                        state = State.EXPONENT_SIGN;
-                    }
-                    else if (!isDigit(s[i])) {
-                        break;
-                    }
-                }
-                else if (state === State.REAL) {
-                    if (s[i] === 'e' || s[i] === 'E') {
-                        state = State.EXPONENT_SIGN;
-                    }
-                    else if (!isDigit(s[i])) {
-                        break;
-                    }
-                }
-                else if (state === State.EXPONENT_SIGN) {
-                    if (isDigit(s[i]) || s[i] === '+' || s[i] === '-') {
-                        state = State.EXPONENT;
-                    }
-                    else {
-                        break;
-                    }
-                }
-                else if (state === State.EXPONENT) {
-                    if (!isDigit(s[i])) {
-                        break;
-                    }
-                }
-            }
-            return i;
-        }
-        function isDigit(c) { return '0' <= c && c <= '9'; }
-        function isBreak(c) {
-            // TODO(jart): Remove underscore when people stop using it like a slash.
-            return c === '/' || c === '_' || isDigit(c);
-        }
-    })(Sorting = VZ.Sorting || (VZ.Sorting = {}));
-})(VZ || (VZ = {}));
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-var TF;
-(function (TF) {
-    var Backend;
-    (function (Backend) {
-        /**
-         * Manages many fetch requests. Launches up to nSimultaneousRequests
-         * simultaneously, and maintains a LIFO queue of requests to process when
-         * more urls are requested than can be handled at once. The queue can be
-         * cleared.
-         *
-         * When a request is made, a Promise is returned which resolves with the
-         * parsed JSON result from the request.
-         */
-        var RequestCancellationError = (function (_super) {
-            __extends(RequestCancellationError, _super);
-            function RequestCancellationError() {
-                var _this = _super !== null && _super.apply(this, arguments) || this;
-                _this.name = 'RequestCancellationError';
-                return _this;
-            }
-            return RequestCancellationError;
-        }(Error));
-        Backend.RequestCancellationError = RequestCancellationError;
-        var RequestNetworkError = (function (_super) {
-            __extends(RequestNetworkError, _super);
-            function RequestNetworkError(req, url) {
-                var _this = _super.call(this) || this;
-                _this.message = "RequestNetworkError: " + req.status + " at " + url;
-                _this.name = 'RequestNetworkError';
-                _this.req = req;
-                _this.url = url;
-                return _this;
-            }
-            return RequestNetworkError;
-        }(Error));
-        Backend.RequestNetworkError = RequestNetworkError;
-        var RequestManager = (function () {
-            function RequestManager(nSimultaneousRequests, maxRetries) {
-                if (nSimultaneousRequests === void 0) { nSimultaneousRequests = 10; }
-                if (maxRetries === void 0) { maxRetries = 3; }
-                this._queue = [];
-                this._nActiveRequests = 0;
-                this._nSimultaneousRequests = nSimultaneousRequests;
-                this._maxRetries = maxRetries;
-            }
-            /**
-             * Gives a promise that loads assets from given url (respects queuing). If
-             * postData is provided, this request will use POST, not GET. This is an
-             * object mapping POST keys to string values.
-             */
-            RequestManager.prototype.request = function (url, postData) {
-                var _this = this;
-                var promise = new Promise(function (resolve, reject) {
-                    var resolver = { resolve: resolve, reject: reject };
-                    _this._queue.push(resolver);
-                    _this.launchRequests();
-                })
-                    .then(function () {
-                    return _this.promiseWithRetries(url, _this._maxRetries, postData);
-                })
-                    .then(function (response) {
-                    // Success - Let's free space for another active
-                    // reqest, and launch it
-                    _this._nActiveRequests--;
-                    _this.launchRequests();
-                    return response;
-                }, function (rejection) {
-                    if (rejection.name === 'RequestNetworkError') {
-                        // If we failed due to network error, we should
-                        // decrement
-                        // _nActiveRequests because this request was
-                        // active
-                        _this._nActiveRequests--;
-                        _this.launchRequests();
-                    }
-                    return Promise.reject(rejection);
-                });
-                return promise;
-            };
-            RequestManager.prototype.clearQueue = function () {
-                while (this._queue.length > 0) {
-                    this._queue.pop().reject(new RequestCancellationError('Request cancelled by clearQueue'));
-                }
-            };
-            /* Return number of currently pending requests */
-            RequestManager.prototype.activeRequests = function () {
-                return this._nActiveRequests;
-            };
-            /* Return total number of outstanding requests (includes queue) */
-            RequestManager.prototype.outstandingRequests = function () {
-                return this._nActiveRequests + this._queue.length;
-            };
-            RequestManager.prototype.launchRequests = function () {
-                while (this._nActiveRequests < this._nSimultaneousRequests &&
-                    this._queue.length > 0) {
-                    this._nActiveRequests++;
-                    this._queue.pop().resolve();
-                }
-            };
-            /**
-             * Try to request a given URL using overwritable _promiseFromUrl method.
-             * If the request fails for any reason, we will retry up to maxRetries
-             * times. In practice, this will help us paper over transient network issues
-             * like '502 Bad Gateway'.
-             * By default, Chrome displays network errors in console, so
-             * the user will be able to tell when the requests are failing. I think this
-             * is a feature, if the request failures and retries are causing any
-             * pain to users, they can see it and file issues.
-             */
-            RequestManager.prototype.promiseWithRetries = function (url, maxRetries, postData) {
-                var _this = this;
-                var success = function (x) { return x; };
-                var failure = function (x) {
-                    if (maxRetries > 0) {
-                        return _this.promiseWithRetries(url, maxRetries - 1, postData);
-                    }
-                    else {
-                        return Promise.reject(x);
-                    }
-                };
-                return this._promiseFromUrl(url, postData).then(success, failure);
-            };
-            /* Actually get promise from url using XMLHttpRequest */
-            RequestManager.prototype._promiseFromUrl = function (url, postData) {
-                return new Promise(function (resolve, reject) {
-                    var req = new XMLHttpRequest();
-                    req.open(postData ? 'POST' : 'GET', url);
-                    var formData;
-                    if (postData) {
-                        // We are to make a POST request.
-                        formData = new FormData();
-                        for (var postKey in postData) {
-                            if (postKey) {
-                                // The linter requires 'for in' loops to be filtered by an if
-                                // condition.
-                                formData.append(postKey, postData[postKey]);
-                            }
-                        }
-                    }
-                    req.onload = function () {
-                        if (req.status === 200) {
-                            resolve(JSON.parse(req.responseText));
-                        }
-                        else {
-                            reject(new RequestNetworkError(req, url));
-                        }
-                    };
-                    req.onerror = function () {
-                        reject(new RequestNetworkError(req, url));
-                    };
-                    req.send(formData);
-                });
-            };
-            return RequestManager;
-        }());
-        Backend.RequestManager = RequestManager;
-    })(Backend = TF.Backend || (TF.Backend = {}));
-})(TF || (TF = {}));
-</script><script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var TF;
-(function (TF) {
-    var Backend;
-    (function (Backend) {
-        Backend.BAD_CHARACTERS = '#%&{}\\/<>*? $!\'":@+`|=() ';
-        /** Cleanup a url so that it can be loaded from a filesystem. */
-        function demoify(s) {
-            // for consistency with python's urllib.urlencode
-            s = s.replace(new RegExp('%20', 'g'), '+');
-            for (var i = 0; i < Backend.BAD_CHARACTERS.length; i++) {
-                var c = Backend.BAD_CHARACTERS[i];
-                s = s.replace(new RegExp('\\' + c, 'g'), '_');
-            }
-            return s;
-        }
-        Backend.demoify = demoify;
-        function queryEncoder(params) {
-            // It's important that the keys be sorted, so we always grab the right file
-            // if we are talking to the backend generated by serialze_tensorboard.py
-            if (params == null) {
-                return '';
-            }
-            var components = _.keys(params)
-                .sort()
-                .filter(function (k) { return params[k] !== undefined; })
-                .map(function (k) { return k + '=' + encodeURIComponent(params[k]); });
-            var result = components.length ? '?' + components.join('&') : '';
-            // Replace parens for consistency with urllib.urlencode
-            return result.replace(/\(/g, '%28').replace(/\)/g, '%29');
-        }
-        Backend.queryEncoder = queryEncoder;
-    })(Backend = TF.Backend || (TF.Backend = {}));
-})(TF || (TF = {}));
-</script><script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var TF;
-(function (TF) {
-    var Backend;
-    (function (Backend) {
-        ;
-        /**
-         * The standard router for communicating with the TensorBoard backend
-         * @param dataDir {string} The base prefix for finding data on server.
-         * @param demoMode {boolean} Whether to modify urls for filesystem demo usage.
-         */
-        function router(dataDir, demoMode) {
-            if (dataDir === void 0) { dataDir = 'data'; }
-            if (demoMode === void 0) { demoMode = false; }
-            var clean = demoMode ? Backend.demoify : function (x) { return x; };
-            if (dataDir[dataDir.length - 1] === '/') {
-                dataDir = dataDir.slice(0, dataDir.length - 1);
-            }
-            function standardRoute(route, demoExtension) {
-                if (demoExtension === void 0) { demoExtension = '.json'; }
-                return function (tag, run) {
-                    var url = dataDir + '/' + route + clean(Backend.queryEncoder({ tag: tag, run: run }));
-                    if (demoMode) {
-                        url += demoExtension;
-                    }
-                    return url;
-                };
-            }
-            function individualImageUrl(query, wallTime) {
-                var url = dataDir + '/' + clean('individualImage?' + query);
-                // Include wall_time just to disambiguate the URL and force the browser
-                // to reload the image when the URL changes. The backend doesn't care
-                // about the value.
-                url += demoMode ? '.png' : '&ts=' + wallTime;
-                return url;
-            }
-            function individualAudioUrl(query) {
-                var url = dataDir + '/' + clean('individualAudio?' + query);
-                if (demoMode) {
-                    url += '.wav';
-                }
-                return url;
-            }
-            function graphUrl(run, limit_attr_size, large_attrs_key) {
-                var query_params = [['run', clean(run)]];
-                if (limit_attr_size != null && !demoMode) {
-                    query_params.push(['limit_attr_size', String(limit_attr_size)]);
-                }
-                if (large_attrs_key != null && !demoMode) {
-                    query_params.push(['large_attrs_key', large_attrs_key]);
-                }
-                var query = query_params
-                    .map(function (param) {
-                    return param[0] + '=' + encodeURIComponent(param[1]);
-                })
-                    .join('&');
-                var url = dataDir + '/graph' + clean('?' + query);
-                if (demoMode) {
-                    url += '.pbtxt';
-                }
-                return url;
-            }
-            return {
-                logdir: function () { return dataDir + '/logdir'; },
-                runs: function () { return dataDir + '/runs' + (demoMode ? '.json' : ''); },
-                individualImage: individualImageUrl,
-                individualAudio: individualAudioUrl,
-                graph: graphUrl,
-                scalars: standardRoute('scalars'),
-                histograms: standardRoute('histograms'),
-                compressedHistograms: standardRoute('compressedHistograms'),
-                images: standardRoute('images'),
-                audio: standardRoute('audio'),
-                runMetadata: standardRoute('run_metadata', '.pbtxt'),
-                healthPills: function () { return dataDir + '/plugin/debugger/health_pills'; },
-            };
-        }
-        Backend.router = router;
-        ;
-    })(Backend = TF.Backend || (TF.Backend = {}));
-})(TF || (TF = {}));
-</script><script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var TF;
-(function (TF) {
-    var Backend;
-    (function (Backend_1) {
-        ;
-        ;
-        Backend_1.TYPES = [
-            'scalar', 'histogram', 'compressedHistogram', 'graph', 'image', 'audio',
-            'runMetadata'
-        ];
-        /**
-         * The Backend class provides a convenient and typed interface to the backend.
-         *
-         * It provides methods corresponding to the different data sources on the
-         * TensorBoard backend. These methods return a promise containing the data
-         * from the backend. This class does some post-processing on the data; for
-         * example, converting data elements tuples into js objects so that they can
-         * be accessed in a more convenient and clearly-documented fashion.
-         */
-        var Backend = (function () {
-            /**
-             * Construct a Backend instance.
-             * @param router the Router with info on what urls to get data from
-             * @param requestManager The RequestManager, overwritable so you may
-             * manually clear request queue, etc. Defaults to a new RequestManager.
-             */
-            function Backend(router, requestManager) {
-                this.router = router;
-                this.requestManager = requestManager || new Backend_1.RequestManager();
-            }
-            /**
-             * Returns a promise for requesting the logdir string.
-             */
-            Backend.prototype.logdir = function () {
-                return this.requestManager.request(this.router.logdir());
-            };
-            /**
-             * Returns a listing of all the available data in the TensorBoard backend.
-             */
-            Backend.prototype.runs = function () {
-                return this.requestManager.request(this.router.runs());
-            };
-            /**
-             * Return a promise showing the Run-to-Tag mapping for scalar data.
-             */
-            Backend.prototype.scalarRuns = function () {
-                return this.runs().then(function (x) { return _.mapValues(x, 'scalars'); });
-            };
-            /**
-             * Return a promise showing the Run-to-Tag mapping for histogram data.
-             */
-            Backend.prototype.histogramRuns = function () {
-                return this.runs().then(function (x) { return _.mapValues(x, 'histograms'); });
-            };
-            /**
-             * Return a promise showing the Run-to-Tag mapping for image data.
-             */
-            Backend.prototype.imageRuns = function () {
-                return this.runs().then(function (x) { return _.mapValues(x, 'images'); });
-            };
-            /**
-             * Return a promise showing the Run-to-Tag mapping for audio data.
-             */
-            Backend.prototype.audioRuns = function () {
-                return this.runs().then(function (x) { return _.mapValues(x, 'audio'); });
-            };
-            /**
-             * Return a promise showing the Run-to-Tag mapping for compressedHistogram
-             * data.
-             */
-            Backend.prototype.compressedHistogramRuns = function () {
-                return this.runs().then(function (x) { return _.mapValues(x, 'compressedHistograms'); });
-            };
-            /**
-             * Return a promise showing list of runs that contain graphs.
-             */
-            Backend.prototype.graphRuns = function () {
-                return this.runs().then(function (x) { return _.keys(x).filter(function (k) { return x[k].graph; }); });
-            };
-            /**
-             * Return a promise showing the Run-to-Tag mapping for run_metadata objects.
-             */
-            Backend.prototype.runMetadataRuns = function () {
-                return this.runs().then(function (x) { return _.mapValues(x, 'run_metadata'); });
-            };
-            /**
-             * Return a promise of a graph string from the backend.
-             */
-            Backend.prototype.graph = function (tag, limit_attr_size, large_attrs_key) {
-                var url = this.router.graph(tag, limit_attr_size, large_attrs_key);
-                return this.requestManager.request(url);
-            };
-            /**
-             * Return a promise containing ScalarDatums for given run and tag.
-             */
-            Backend.prototype.scalar = function (tag, run) {
-                var p;
-                var url = this.router.scalars(tag, run);
-                p = this.requestManager.request(url);
-                return p.then(map(detupler(createScalar)));
-            };
-            /**
-             * Returns a promise for requesting the health pills for a list of nodes.
-             */
-            Backend.prototype.healthPills = function (nodeNames) {
-                var postData = { 'node_names': JSON.stringify(nodeNames) };
-                return this.requestManager.request(this.router.healthPills(), postData);
-            };
-            /**
-             * Return a promise containing HistogramDatums for given run and tag.
-             */
-            Backend.prototype.histogram = function (tag, run) {
-                var p;
-                var url = this.router.histograms(tag, run);
-                p = this.requestManager.request(url);
-                return p.then(map(detupler(createHistogram))).then(function (histos) {
-                    // Get the minimum and maximum values across all histograms so that the
-                    // visualization is aligned for all timesteps.
-                    var min = d3.min(histos, function (d) { return d.min; });
-                    var max = d3.max(histos, function (d) { return d.max; });
-                    return histos.map(function (histo, i) {
-                        return {
-                            wall_time: histo.wall_time,
-                            step: histo.step,
-                            bins: convertBins(histo, min, max)
-                        };
-                    });
-                });
-            };
-            /**
-             * Return a promise containing ImageDatums for given run and tag.
-             */
-            Backend.prototype.image = function (tag, run) {
-                var url = this.router.images(tag, run);
-                var p;
-                p = this.requestManager.request(url);
-                return p.then(map(this.createImage.bind(this)));
-            };
-            /**
-             * Return a promise containing AudioDatums for given run and tag.
-             */
-            Backend.prototype.audio = function (tag, run) {
-                var url = this.router.audio(tag, run);
-                var p;
-                p = this.requestManager.request(url);
-                return p.then(map(this.createAudio.bind(this)));
-            };
-            /**
-             * Returns a promise to load the string RunMetadata for given run/tag.
-             */
-            Backend.prototype.runMetadata = function (tag, run) {
-                var url = this.router.runMetadata(tag, run);
-                return this.requestManager.request(url);
-            };
-            /**
-             * Get compressedHistogram data.
-             * Unlike other methods, don't bother reprocessing this data into a nicer
-             * format. This is because we will deprecate this route.
-             */
-            Backend.prototype.compressedHistogram = function (tag, run) {
-                var url = this.router.compressedHistograms(tag, run);
-                var p;
-                p = this.requestManager.request(url);
-                return p.then(map(detupler(function (x) { return x; })));
-            };
-            Backend.prototype.createImage = function (x) {
-                return {
-                    width: x.width,
-                    height: x.height,
-                    wall_time: timeToDate(x.wall_time),
-                    step: x.step,
-                    url: this.router.individualImage(x.query, x.wall_time),
-                };
-            };
-            Backend.prototype.createAudio = function (x) {
-                return {
-                    content_type: x.content_type,
-                    wall_time: timeToDate(x.wall_time),
-                    step: x.step,
-                    url: this.router.individualAudio(x.query),
-                };
-            };
-            return Backend;
-        }());
-        Backend_1.Backend = Backend;
-        /** Given a RunToTag, return sorted array of all runs */
-        function getRuns(r) {
-            return _.keys(r).sort(VZ.Sorting.compareTagNames);
-        }
-        Backend_1.getRuns = getRuns;
-        /** Given a RunToTag, return array of all tags (sorted + dedup'd) */
-        function getTags(r) {
-            return _.union.apply(null, _.values(r)).sort(VZ.Sorting.compareTagNames);
-        }
-        Backend_1.getTags = getTags;
-        /**
-         * Given a RunToTag and an array of runs, return every tag that appears for
-         * at least one run.
-         * Sorted, deduplicated.
-         */
-        function filterTags(r, runs) {
-            var result = [];
-            runs.forEach(function (x) { return result = result.concat(r[x]); });
-            return _.uniq(result).sort(VZ.Sorting.compareTagNames);
-        }
-        Backend_1.filterTags = filterTags;
-        function timeToDate(x) { return new Date(x * 1000); }
-        ;
-        /**  Just a curryable map to make things cute and tidy. */
-        function map(f) {
-            return function (arr) { return arr.map(f); };
-        }
-        ;
-        /**
-         * This is a higher order function that takes a function that transforms a
-         * T into a G, and returns a function that takes TupleData<T>s and converts
-         * them into the intersection of a G and a Datum.
-         */
-        function detupler(xform) {
-            return function (x) {
-                // Create a G, assert it has type <G & Datum>
-                var obj = xform(x[2]);
-                // ... patch in the properties of datum
-                obj.wall_time = timeToDate(x[0]);
-                obj.step = x[1];
-                return obj;
-            };
-        }
-        ;
-        function createScalar(x) { return { scalar: x }; }
-        ;
-        function createHistogram(x) {
-            return {
-                min: x[0],
-                max: x[1],
-                nItems: x[2],
-                sum: x[3],
-                sumSquares: x[4],
-                bucketRightEdges: x[5],
-                bucketCounts: x[6],
-            };
-        }
-        ;
-        /**
-         * Takes histogram data as stored by tensorboard backend and converts it to
-         * the standard d3 histogram data format to make it more compatible and easier
-         * to visualize. When visualizing histograms, having the left edge and width
-         * makes things quite a bit easier. The bins are also converted to have an
-         * uniform width, what makes the visualization easier to understand.
-         *
-         * @param histogram A histogram from tensorboard backend.
-         * @param min The leftmost edge. The binning will start on it.
-         * @param max The rightmost edge. The binning will end on it.
-         * @param numBins The number of bins of the converted data. The default of 30
-         * is a sensible default, using more starts to get artifacts because the event
-         * data is stored in buckets, and you start being able to see the aliased
-         * borders between each bucket.
-         * @return A histogram bin. Each bin has an x (left edge), a dx (width),
-         *     and a y (count).
-         *
-         * If given rightedges are inclusive, then these left edges (x) are exclusive.
-         */
-        function convertBins(histogram, min, max, numBins) {
-            if (numBins === void 0) { numBins = 30; }
-            if (histogram.bucketRightEdges.length !== histogram.bucketCounts.length) {
-                throw (new Error('Edges and counts are of different lengths.'));
-            }
-            if (max === min) {
-                // Create bins even if all the data has a single value.
-                max = min * 1.1 + 1;
-                min = min / 1.1 - 1;
-            }
-            var binWidth = (max - min) / numBins;
-            var bucketLeft = min; // Use the min as the starting point for the bins.
-            var bucketPos = 0;
-            return d3.range(min, max, binWidth).map(function (binLeft) {
-                var binRight = binLeft + binWidth;
-                // Take the count of each existing bucket, multiply it by the proportion
-                // of overlap with the new bin, then sum and store as the count for the
-                // new bin. If no overlap, will add to zero, if 100% overlap, will include
-                // the full count into new bin.
-                var binY = 0;
-                while (bucketPos < histogram.bucketRightEdges.length) {
-                    // Clip the right edge because right-most edge can be infinite-sized.
-                    var bucketRight = Math.min(max, histogram.bucketRightEdges[bucketPos]);
-                    var intersect = Math.min(bucketRight, binRight) - Math.max(bucketLeft, binLeft);
-                    var count = (intersect / (bucketRight - bucketLeft)) *
-                        histogram.bucketCounts[bucketPos];
-                    binY += intersect > 0 ? count : 0;
-                    // If bucketRight is bigger than binRight, than this bin is finished and
-                    // there is data for the next bin, so don't increment bucketPos.
-                    if (bucketRight > binRight) {
-                        break;
-                    }
-                    bucketLeft = Math.max(min, bucketRight);
-                    bucketPos++;
-                }
-                ;
-                return { x: binLeft, dx: binWidth, y: binY };
-            });
-        }
-        Backend_1.convertBins = convertBins;
-    })(Backend = TF.Backend || (TF.Backend = {}));
-})(TF || (TF = {}));
-</script><script>/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var TF;
-(function (TF) {
-    var Backend;
-    (function (Backend) {
-        Backend.Behavior = {
-            properties: {
-                /** *** Required properties *** */
-                /** Data type. One of TF.Backend.TYPES */
-                dataType: {
-                    type: String,
-                    observer: '_throwErrorOnUnrecognizedType',
-                },
-                /** TF.Backend.Backend for data loading. */
-                backend: {
-                    type: Object,
-                },
-                /** Should it automatically load when configured ready? Default true. */
-                autoLoad: {
-                    type: Boolean,
-                    value: true,
-                },
-                /** *** Component-provided properties *** */
-                /** Every tag available for data type (sorted, dedpulicated) */
-                tags: {
-                    type: Array,
-                    readOnly: true,
-                    notify: true,
-                },
-                /** Every run available for data type (sorted) */
-                runs: {
-                    type: Array,
-                    readOnly: true,
-                    notify: true,
-                },
-                /** Mapping from runs to tags for the data type */
-                run2tag: {
-                    type: Object,
-                    readOnly: true,
-                    notify: true,
-                },
-                /** Promise provider for the data. Useful for passing to subcomponents */
-                dataProvider: { type: Function, computed: '_getDataProvider(dataType, backend)' },
-                /** Has the dashboard loaded yet? */
-                loadState: {
-                    type: String,
-                    value: 'noload',
-                    readOnly: true,
-                },
-                /**
-                 * True if dashboard has loaded, and no tags were found.
-                 * Persists through subsequent reloads (ie. still true while
-                 * next load is pending) so warning won't flash away every reload
-                 * when there is no data.
-                 */
-                dataNotFound: {
-                    type: Boolean,
-                    value: false,
-                    readOnly: true,
-                }
-            },
-            observers: ['_do_autoLoad(dataType, backend, autoLoad)'],
-            /**
-             * Reloading works in two steps:
-             * Backend reload, which gets metadata on available runs, tags, etc from
-             *   the backend.
-             * Frontend reload, which loads new data for each chart or visual display.
-             * Backend reload logic is provided by this behaivor. The frontend reload
-             *   logic should be provided elsewhere, since it is component-specific.
-             * To keep things simple and consistent, we do the backend reload first,
-             *   and the frontend reload afterwards.
-             */
-            reload: function () {
-                var _this = this;
-                return this.backendReload().then(function (x) { return _this.frontendReload(); });
-            },
-            /**
-             * Load data from backend and then set run2tag, tags, runs, and loadState.
-             * Returns a promise that resolves/rejects when data is loaded.
-             */
-            backendReload: function () {
-                var _this = this;
-                if (this.dataType == null) {
-                    throw new Error('TF.Backend.Behavior: Need a dataType to reload.');
-                }
-                if (this.backend == null) {
-                    throw new Error('TF.Backend.Behavior: Need a backend to reload.');
-                }
-                var runsRoute = this.backend[this.dataType + 'Runs'].bind(this.backend);
-                this._setLoadState('pending');
-                return runsRoute().then(function (x) {
-                    _this._setLoadState('loaded');
-                    if (_.isEqual(x, _this.run2tag)) {
-                        // If x and run2tag are equal, let's avoid updating everything
-                        // since that can needlessly trigger run changes, reloads, etc
-                        return x;
-                    }
-                    _this._setRun2tag(x);
-                    var tags = TF.Backend.getTags(x);
-                    _this._setDataNotFound(tags.length === 0);
-                    _this._setTags(tags);
-                    _this._setRuns(TF.Backend.getRuns(x));
-                    return x;
-                }, function (fail) {
-                    _this._setLoadState('failure');
-                    return fail;
-                });
-            },
-            _do_autoLoad: function (type, backend, autoLoad) {
-                if (autoLoad) {
-                    this.reload();
-                }
-                ;
-            },
-            _getDataProvider: function (dataType, backend) {
-                return this.backend[this.dataType].bind(this.backend);
-            },
-            _throwErrorOnUnrecognizedType: function (dataType) {
-                if (TF.Backend.TYPES.indexOf(dataType) === -1) {
-                    throw new Error('TF.Backend.Behavior: Unknown dataType ' + dataType);
-                }
-            },
-        };
-    })(Backend = TF.Backend || (TF.Backend = {}));
-})(TF || (TF = {}));
-</script>
-
-
-
-
-
-<dom-module id="tf-color-scale" assetpath="../tf-color-scale/">
-  <script>/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var TF;
-(function (TF) {
-    TF.palettes = {
-        googleStandard: [
-            '#db4437',
-            '#ff7043',
-            '#f4b400',
-            '#0f9d58',
-            '#00796b',
-            '#00acc1',
-            '#4285f4',
-            '#5c6bc0',
-            '#ab47bc' // purple 400
-        ],
-        googleCool: [
-            '#9e9d24',
-            '#0f9d58',
-            '#00796b',
-            '#00acc1',
-            '#4285f4',
-            '#5c6bc0',
-            '#607d8b' // blue gray 500
-        ],
-        googleWarm: [
-            '#795548',
-            '#ab47bc',
-            '#f06292',
-            '#c2185b',
-            '#db4437',
-            '#ff7043',
-            '#f4b400' // google yellow 700
-        ],
-        googleColorBlindAssist: [
-            '#ff7043',
-            '#00ACC1',
-            '#AB47BC',
-            '#2A56C6',
-            '#0b8043',
-            '#F7CB4D',
-            '#c0ca33',
-            '#5e35b1',
-            '#A52714',
-        ],
-        // These palettes try to be better for color differentiation.
-        // https://personal.sron.nl/~pault/
-        colorBlindAssist1: ['#4477aa', '#44aaaa', '#aaaa44', '#aa7744', '#aa4455', '#aa4488'],
-        colorBlindAssist2: [
-            '#88ccee', '#44aa99', '#117733', '#999933', '#ddcc77', '#cc6677',
-            '#882255', '#aa4499'
-        ],
-        colorBlindAssist3: [
-            '#332288', '#6699cc', '#88ccee', '#44aa99', '#117733', '#999933',
-            '#ddcc77', '#cc6677', '#aa4466', '#882255', '#661100', '#aa4499'
-        ],
-        // based on this palette: http://mkweb.bcgsc.ca/biovis2012/
-        colorBlindAssist4: [
-            '#FF6DB6', '#920000', '#924900', '#DBD100', '#24FF24', '#006DDB',
-            '#490092'
-        ],
-        mldash: [
-            '#E47EAD', '#F4640D', '#FAA300', '#F5E636', '#00A077', '#0077B8',
-            '#00B7ED'
-        ]
-    };
-})(TF || (TF = {}));
-</script>
-  <script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Example usage:
-// runs = ["train", "test", "test1", "test2"]
-// ccs = new TF.ColorScale();
-// ccs.domain(runs);
-// ccs.getColor("train");
-// ccs.getColor("test1");
-var TF;
-(function (TF) {
-    var ColorScale = (function () {
-        /**
-         * Creates a color scale with optional custom palette.
-         *  @param {string[]} [palette=TF.palettes.googleColorBlind] - The color
-         *                 palette you want as an Array of hex strings.
-         */
-        function ColorScale(palette) {
-            if (palette === void 0) { palette = TF.palettes.googleColorBlindAssist; }
-            this.identifiers = d3.map();
-            this.palette = palette;
-        }
-        /**
-         * Set the domain of strings.
-         * @param {string[]} strings - An array of possible strings to use as the
-         *                             domain for your scale.
-         */
-        ColorScale.prototype.domain = function (strings) {
-            var _this = this;
-            this.identifiers = d3.map();
-            strings.forEach(function (s, i) {
-                _this.identifiers.set(s, _this.palette[i % _this.palette.length]);
-            });
-            return this;
-        };
-        /**
-         * Use the color scale to transform an element in the domain into a color.
-         * @param {string} The input string to map to a color.
-         * @return {string} The color corresponding to that input string.
-         * @throws Will error if input string is not in the scale's domain.
-         */
-        ColorScale.prototype.scale = function (s) {
-            if (!this.identifiers.has(s)) {
-                throw new Error('String was not in the domain.');
-            }
-            return this.identifiers.get(s);
-        };
-        return ColorScale;
-    }());
-    TF.ColorScale = ColorScale;
-})(TF || (TF = {}));
-</script>
-  <script>
-    (function() {
-      Polymer({
-        is: "tf-color-scale",
-        properties: {
-          runs: {
-            type: Array,
-          },
-          outColorScale: {
-            type: Object,
-            readOnly: true,
-            notify: true,
-            value: function() {
-              return new TF.ColorScale();
-            },
-          },
-        },
-        observers: ['updateColorScale(runs.*)'],
-        updateColorScale: function(runsChange) {
-          this.outColorScale.domain(this.runs);
-        },
-      });
-    })();
-  </script>
-</dom-module>
-<link rel="import" href="../paper-styles/paper-styles.html">
-
-<dom-module id="scrollbar-style" assetpath="../tf-dashboard-common/">
-  <template>
-    <style>
-      .scrollbar::-webkit-scrollbar-track
-      {
-        visibility: hidden;
-      }
-
-      .scrollbar::-webkit-scrollbar
-      {
-        width: 10px;
-      }
-
-      .scrollbar::-webkit-scrollbar-thumb
-      {
-        border-radius: 10px;
-        -webkit-box-shadow: inset 0 0 2px rgba(0,0,0,.3);
-        background-color: var(--paper-grey-500);
-        color: var(--paper-grey-900);
-      }
-      .scrollbar {
-        box-sizing: border-box;
-      }
-    </style>
-  </template>
-</dom-module>
-<style is="custom-style">
-
-  :root {
-    --tb-orange-weak: #ffa726;
-    --tb-orange-strong: #f57c00;
-    --tb-grey-darker: #e2e2e2;
-    --tb-grey-lighter: #f3f3f3;
-    --tb-ui-dark-accent: #757575;
-    --tb-ui-light-accent: #e0e0e0;
-    --tb-graph-faded: #e0d4b3;
-  }
-
-</style>
-
-<dom-module id="tf-dashboard-layout" assetpath="../tf-dashboard-common/">
-  <template>
-    <div id="sidebar">
-      <content select=".sidebar"></content>
-    </div>
-
-    <div id="center" class="scrollbar">
-      <content select=".center"></content>
-    </div>
-    <style include="scrollbar-style"></style>
-    <style>
-      #sidebar {
-        width: inherit;
-        height: 100%;
-        overflow: ellipsis;
-        flex-grow: 0;
-        flex-shrink: 0;
-      }
-
-      #center {
-        height: 100%;
-        overflow-y: auto;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-
-      .tf-graph-dashboard #center {
-        background: white;
-      }
-
-      :host {
-        display: flex;
-        flex-direction: row;
-        height: 100%;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-dashboard-layout",
-    });
-  </script>
-</dom-module>
-<dom-module id="dashboard-style" assetpath="../tf-dashboard-common/">
-  <template>
-    <style>
-      .sidebar {
-        display: flex;
-        flex-direction: column;
-        height: 100%;
-        margin-right: 20px;
-      }
-
-      .sidebar-section {
-        border-top: solid 1px rgba(0, 0, 0, 0.12);
-        padding: 15px 0px 15px 30px;
-      }
-
-      .sidebar-section:first-child {
-        border: none;
-      }
-
-      .sidebar-section:last-child {
-        flex-grow: 1;
-        display: flex;
-      }
-
-      paper-checkbox {
-        --paper-checkbox-checked-color: var(--tb-ui-dark-accent);
-        --paper-checkbox-unchecked-color: var(--tb-ui-dark-accent);
-        font-size: 14px;
-        margin-top: 5px;
-      }
-    </style>
-  </template>
-</dom-module>
-<link rel="import" href="../paper-dropdown-menu/paper-dropdown-menu.html">
-<link rel="import" href="../paper-menu/paper-menu.html">
-<link rel="import" href="../paper-item/paper-item.html">
-
-<dom-module id="tf-downloader" assetpath="../tf-dashboard-common/">
-  <template>
-    <paper-dropdown-menu no-label-float="true" label="run to download" selected-item-label="{{_run}}">
-      <paper-menu class="dropdown-content">
-        <template is="dom-repeat" items="[[runs]]">
-          <paper-item no-label-float="true">[[item]]</paper-item>
-        </template>
-      </paper-menu>
-    </paper-dropdown-menu>
-    <div class="center">
-      <span>
-        <a download="[[_csvName(_run)]]" href="[[_csvUrl(_run, urlFn)]]">CSV</a>
-        <a download="[[_jsonName(_run)]]" href="[[_jsonUrl(_run, urlFn)]]">JSON</a>
-      </span>
-    </div>
-    <style>
-      :host {
-        display: flex;
-        height: 32px;
-      }
-      .center {
-        display: flex;
-        align-self: center;
-      }
-      paper-dropdown-menu {
-        width: 100px;
-        --paper-input-container-label: {
-          font-size: 10px;
-        }
-        --paper-input-container-input: {
-          font-size: 10px;
-        }
-      }
-      a {
-        font-size: 10px;
-        border-radius: 3px;
-        border: 1px solid #EEE;
-      }
-      paper-input {
-        font-size: 22px;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-downloader",
-      properties: {
-        _run: String,
-        runs: Array,
-        tag: String,
-        urlFn: Function,
-      },
-      _csvUrl: function(_run, urlFn) {
-        return urlFn(this.tag, _run) + "&format=csv";
-      },
-      _jsonUrl: function(_run, urlFn) {
-        return urlFn(this.tag, _run);
-      },
-      _csvName: function(_run) {
-        return "run_" + _run + ",tag_" + this.tag + ".csv";
-      },
-      _jsonName: function(_run) {
-        return "run-" + _run + "-tag-" + this.tag + ".json";
-      },
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-no-data-warning" assetpath="../tf-dashboard-common/">
-  <template>
-    <template is="dom-if" if="[[showWarning]]">
-      <div class="warning">
-        <template is="dom-if" if="[[_isGraph(dataType)]]">
-          <h3>
-            No graph definition files were found.
-          </h3>
-          <p>
-            To store a graph, create a
-            <code>tf.summary.FileWriter</code>
-            and pass the graph either via the constructor, or by calling its
-            <code>add_graph()</code> method.
-            You may want to check out the
-            <a href="https://www.tensorflow.org/versions/master/how_tos/graph_viz/index.html">
-              graph visualizer tutorial
-            </a>
-            .
-          </p>
-        </template>
-        <template is="dom-if" if="[[_isProjector(dataType)]]">
-          <h3>
-            No checkpoint was found.
-          </h3>
-          <p>
-            Probable causes:
-            </p><ul>
-              <li>
-                No checkpoint has been saved yet. Please refresh the page periodically.
-              </li>
-              <li>
-                You are not saving any checkpoint. To save your model,
-                create a
-                <a href="https://www.tensorflow.org/versions/master/api_docs/python/state_ops.html#Saver">
-                  <code>tf.train.Saver</code>
-                </a>
-                and save your model periodically
-                by calling <code>saver.save(session, LOG_DIR/model.ckpt, step)</code>.
-              </li>
-            </ul>
-          <p></p>
-        </template>
-        <template is="dom-if" if="[[_isOther(dataType)]]">
-          <h3>
-            No <span>[[dataType]]</span> data was found.
-          </h3>
-          <p>
-            Probable causes:
-            </p><ul>
-              <li>
-                You haven't written any <span>[[dataType]]</span> data
-                to your event files.
-              </li>
-              <li>
-                TensorBoard can't find your event files.
-              </li>
-            </ul>
-          <p></p>
-        </template>
-        <p>
-          If you're new to using TensorBoard, and want to find out how to add
-          data and set up your event files, check out the
-          <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md">
-            README
-          </a>
-          and perhaps the
-          <a href="https://www.tensorflow.org/versions/master/how_tos/summaries_and_tensorboard/index.html">
-            TensorBoard tutorial
-          </a>.
-        </p>
-
-        <p>
-          If you think TensorBoard is configured properly, please see the
-          <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md#my-tensorboard-isnt-showing-any-data-whats-wrong">
-            section of the README devoted to missing data problems
-          </a>
-          and consider filing an issue on GitHub.
-        </p>
-
-      </div>
-    </template>
-    <style>
-      .warning {
-        max-width: 540px;
-        margin: 80px auto 0 auto;
-      }
-    </style>
-  </template>
-
-  <script>
-    Polymer({
-      is: "tf-no-data-warning",
-      properties: {
-        dataType: String,
-        showWarning: Boolean
-      },
-      _isGraph: function(dataType) {
-        return dataType === "graph";
-      },
-      _isProjector: function(dataType) {
-        return dataType === "projector";
-      },
-      _isOther: function(dataType) {
-        return !this._isGraph(dataType) && !this._isProjector(dataType);
-      }
-    });
-  </script>
-</dom-module>
-<script>/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var TF;
-(function (TF) {
-    var Dashboard;
-    (function (Dashboard) {
-        /**
-         * ReloadBehavior: A simple behavior for dashboards where the
-         * frontendReload() function should find every child element with a
-         * given tag name (e.g. "tf-line-chart" or "tf-image-loader")
-         * and call a `reload` method on that child.
-         * May later extend it so it has more sophisticated logic, e.g. reloading
-         * only tags that are in view.
-         */
-        function ReloadBehavior(tagName) {
-            return {
-                properties: {
-                    reloadTag: {
-                        type: String,
-                        value: tagName,
-                    },
-                },
-                frontendReload: function () {
-                    var elements = this.getElementsByTagName(this.reloadTag);
-                    Array.prototype.forEach.call(elements, function (x) { x.reload(); });
-                },
-            };
-        }
-        Dashboard.ReloadBehavior = ReloadBehavior;
-    })(Dashboard = TF.Dashboard || (TF.Dashboard = {}));
-})(TF || (TF = {}));
-</script>
-
-<dom-module id="tf-option-selector" assetpath="../tf-dashboard-common/">
-  <template>
-    <div id="wrap">
-      <h3>[[name]]</h3>
-      <div class="content-wrapper"><content></content></div>
-    </div>
-    <style>
-      .content-wrapper ::content > * {
-        width: 30%;
-        font-size: 13px;
-        background: none;
-        margin-top: 10px;
-        color: var(--tb-ui-dark-accent);
-      }
-
-      .content-wrapper ::content :first-of-type {
-        margin-left: 0;
-      }
-
-      .content-wrapper ::content .selected {
-        background-color: var(--tb-ui-dark-accent);
-        color: white!important;
-      }
-
-      h3 {
-        color: var(--paper-grey-800);
-        margin: 0;
-        font-weight: normal;
-        font-size: 14px;
-        margin-bottom: 5px;
-        display: block;
-        pointer-events: none;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-option-selector",
-      properties: {
-        name: String,
-        selectedId: {
-          type: String,
-          notify: true,
-          observer: '_selectedIdChanged'
-        }
-      },
-      attached: function() {
-        this.async(function() {
-          this.getEffectiveChildren().forEach(function(node) {
-            this.listen(node, 'tap', '_selectTarget');
-          }.bind(this));
-        });
-      },
-      _selectTarget: function(e) {
-        this.selectedId = e.currentTarget.id;
-      },
-      _selectedIdChanged: function() {
-        var selected = this.queryEffectiveChildren('#' + this.selectedId);
-        if (!selected) {
-          return;
-        }
-
-        this.getEffectiveChildren().forEach(function(node) {
-          node.classList.remove("selected");
-        });
-        selected.classList.add("selected");
-      }
-    });
-  </script>
-</dom-module>
-<link rel="import" href="../iron-collapse/iron-collapse.html">
-
-<dom-module id="tf-collapsable-pane" assetpath="../tf-dashboard-common/">
-  <template>
-    <button class="heading" on-tap="togglePane" open-button$="[[opened]]">
-    <span class="name">[[name]]</span>
-    <span class="count">
-      <span>[[count]]</span>
-    </span>
-  </button>
-    <iron-collapse opened="[[opened]]">
-      <div class="content">
-        <template is="dom-if" if="[[opened]]" restamp="[[restamp]]">
-          <content></content>
-        </template>
-      </div>
-    </iron-collapse>
-    <style>
-      :host {
-        display: block;
-        margin: 0 5px 1px 10px;
-      }
-
-      :host:first-of-type {
-        margin-top: 20px;
-      }
-
-      :host:last-of-type {
-        margin-bottom: 20px;
-      }
-
-      .heading {
-        background-color: white;
-        border: none;
-        cursor: pointer;
-        width: 100%;
-        font-size: 15px;
-        line-height: 1;
-        box-shadow: 0 1px 5px rgba(0,0,0,0.2);
-        padding: 10px 15px;
-      }
-
-      .content {
-        padding: 15px;
-        border: 1px solid #dedede;
-        border-top: none;
-        border-bottom-left-radius: 2px;
-        border-bottom-right-radius: 2px;
-        background: white;
-      }
-
-      [open-button] {
-        border-bottom-left-radius: 0px !important;
-        border-bottom-right-radius: 0px !important;
-      }
-
-      .name {
-        float: left;
-      }
-
-      .count {
-        float: right;
-        margin-right: 5px;
-        font-size: 12px;
-        color: var(--paper-grey-500);
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-collapsable-pane",
-      properties: {
-        opened: {type: Boolean, value: false},
-        restamp: {type: Boolean, value: true},
-        name: {type: String, observer: "hide"},
-        count: {type: Number},
-      },
-      hide: function() {
-        this.opened = false;
-      },
-      togglePane: function() {
-        this.opened = !this.opened;
-      }
-    });
-  </script>
-
-</dom-module>
-<script src="../plottable/plottable.js"></script>
-<style>
-/**
- * @license
- * The MIT License (MIT)
- *
- * Copyright (c) 2014-2015 Palantir Technologies, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- * Plottable 1.16.1 (https://github.com/palantir/plottable)
- * Copyright 2014-2015 Palantir Technologies
- * Licensed under MIT (https://github.com/palantir/plottable/blob/master/LICENSE)
-*/
-.plottable-colors-0 {
-  background-color: #5279c7; /* INDIGO */
-}
-
-.plottable-colors-1 {
-  background-color: #fd373e; /* CORAL_RED */
-}
-
-.plottable-colors-2 {
-  background-color: #63c261; /* FERN */
-}
-
-.plottable-colors-3 {
-  background-color: #fad419; /* BRIGHT_SUN */
-}
-
-.plottable-colors-4 {
-  background-color: #2c2b6f; /* JACARTA */
-}
-
-.plottable-colors-5 {
-  background-color: #ff7939; /* BURNING_ORANGE */
-}
-
-.plottable-colors-6 {
-  background-color: #db2e65; /* CERISE_RED */
-}
-
-.plottable-colors-7 {
-  background-color: #99ce50; /* CONIFER */
-}
-
-.plottable-colors-8 {
-  background-color: #962565; /* ROYAL_HEATH */
-}
-
-.plottable-colors-9 {
-  background-color: #06cccc; /* ROBINS_EGG_BLUE */
-}
-
-svg.plottable {
-  display : block; /* SVGs must be block elements for width/height calculations to work in Firefox. */
-  pointer-events: visibleFill;
-}
-
-.plottable .background-fill {
-  fill: none;
-  pointer-events: none;
-}
-
-.plottable .bounding-box {
-  /* Invisible pink bounding-box to allow for collision testing */
-  fill: pink;
-  visibility: hidden;
-}
-
-.plottable .label text {
-  font-family: "Helvetica Neue", sans-serif;
-  fill: #32313F;
-}
-
-.plottable .bar-label-text-area text {
-  font-family: "Helvetica Neue", sans-serif;
-  font-size: 14px;
-}
-
-.plottable .label-area text {
-  fill: #32313F;
-  font-family: "Helvetica Neue", sans-serif;
-  font-size: 14px;
-}
-
-.plottable .light-label text {
-  fill: white;
-}
-
-.plottable .dark-label text {
-  fill: #32313F;
-}
-
-.plottable .off-bar-label text {
-  fill: #32313F;
-}
-
-.plottable .stacked-bar-plot .off-bar-label {
-  /* HACKHACK #2795: correct off-bar label logic to be implemented on StackedBar */
-  visibility: hidden !important;
-}
-
-.plottable .axis-label text {
-  font-size: 10px;
-  font-weight: bold;
-  letter-spacing: 1px;
-  line-height: normal;
-  text-transform: uppercase;
-}
-
-.plottable .title-label text {
-  font-size: 20px;
-  font-weight: bold;
-}
-
-.plottable .axis line.baseline {
-  stroke: #CCC;
-  stroke-width: 1px;
-}
-
-.plottable .axis line.tick-mark {
-  stroke: #CCC;
-  stroke-width: 1px;
-}
-
-.plottable .axis text {
-  fill: #32313F;
-  font-family: "Helvetica Neue", sans-serif;
-  font-size: 12px;
-  font-weight: 200;
-  line-height: normal;
-}
-
-.plottable .axis .annotation-circle {
-  fill: white;
-  stroke-width: 1px;
-  stroke: #CCC;
-}
-
-.plottable .axis .annotation-line {
-  stroke: #CCC;
-  stroke-width: 1px;
-}
-
-.plottable .axis .annotation-rect {
-  stroke: #CCC;
-  stroke-width: 1px;
-  fill: white;
-}
-
-.plottable .bar-plot .baseline {
-  stroke: #999;
-}
-
-.plottable .gridlines line {
-  stroke: #3C3C3C; /* hackhack: gridlines should be solid; see #820 */
-  opacity: 0.25;
-  stroke-width: 1px;
-}
-
-.plottable .selection-box-layer .selection-area {
-  fill: black;
-  fill-opacity: 0.03;
-  stroke: #CCC;
-}
-/* DragBoxLayer */
-.plottable .drag-box-layer.x-resizable .drag-edge-lr {
-  cursor: ew-resize;
-}
-.plottable .drag-box-layer.y-resizable .drag-edge-tb {
-  cursor: ns-resize;
-}
-
-.plottable .drag-box-layer.x-resizable.y-resizable .drag-corner-tl {
-  cursor: nwse-resize;
-}
-.plottable .drag-box-layer.x-resizable.y-resizable .drag-corner-tr {
-  cursor: nesw-resize;
-}
-.plottable .drag-box-layer.x-resizable.y-resizable .drag-corner-bl {
-  cursor: nesw-resize;
-}
-.plottable .drag-box-layer.x-resizable.y-resizable .drag-corner-br {
-  cursor: nwse-resize;
-}
-
-.plottable .drag-box-layer.movable .selection-area {
-  cursor: move; /* IE fallback */
-  cursor: -moz-grab;
-  cursor: -webkit-grab;
-  cursor: grab;
-}
-
-.plottable .drag-box-layer.movable .selection-area:active {
-  cursor: -moz-grabbing;
-  cursor: -webkit-grabbing;
-  cursor: grabbing;
-}
-/* /DragBoxLayer */
-
-.plottable .guide-line-layer line.guide-line {
-  stroke: #CCC;
-  stroke-width: 1px;
-}
-
-.plottable .drag-line-layer.enabled.vertical line.drag-edge {
-  cursor: ew-resize;
-}
-
-.plottable .drag-line-layer.enabled.horizontal line.drag-edge {
-  cursor: ns-resize;
-}
-
-.plottable .legend text {
-  fill: #32313F;
-  font-family: "Helvetica Neue", sans-serif;
-  font-size: 12px;
-  font-weight: bold;
-  line-height: normal;
-}
-
-.plottable .interpolated-color-legend rect.swatch-bounding-box {
-  fill: none;
-  stroke: #CCC;
-  stroke-width: 1px;
-  pointer-events: none;
-}
-
-.plottable .waterfall-plot line.connector {
-  stroke: #CCC;
-  stroke-width: 1px;
-}
-
-.plottable .pie-plot .arc.outline {
-  stroke-linejoin: round;
-}
-
-</style>
-
-<dom-module id="tf-chart-scaffold" assetpath="../tf-dashboard-common/">
-  <template>
-    <content></content>
-    <style>
-      :host {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        display: flex;
-        flex-direction: column;
-        flex-grow: 1;
-        flex-shrink: 1;
-        position: relative;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-chart-scaffold",
-      properties: {
-        tag: String,
-        dataProvider: Function,
-        visibleSeries: Array,
-        _attached: {
-          type: Boolean,
-          value: false
-        },
-
-        // Storing the update ID of the previous request for data enables us to determine if a
-        // data response is outdated. We rely on an increasing ID instead of timestamp because
-        // successive updates often fire within the same millisecond.
-        _dataUpdateIdOfLastRequest: Number,
-        _nextAvailableDataUpdateId: {
-          type: Number,
-          value: 1,
-        },
-      },
-      observers: [
-        "reload(tag, dataProvider)",
-        "_changeSeries(visibleSeries.*)"
-      ],
-      ready: function() {
-        this.fire('ready');
-      },
-      attached: function() {
-        this._attached = true;
-        this._changeSeries();
-      },
-      detached: function() {
-        this._attached = false;
-      },
-      reload: function() {
-        if (!this._attached) {
-          return;
-        }
-        else if (!this.dataProvider) {
-          throw new Error('tf-chart-scaffold requires a dataProvider.');
-        }
-        else if (!this.tag) {
-          throw new Error('tf-chart-scaffold requires a tag.');
-        }
-
-        // TODO(chizeng): At this point, notify effective children that the previous data has been
-        // invalidated. For instance, the image dashboard may want to clear its images. Today, the
-        // chart scaffold only informs children when the new image URLs response finishes loading.
-
-        const dataUpdateId = this._nextAvailableDataUpdateId++;
-        this._dataUpdateIdOfLastRequest = dataUpdateId;
-
-        this.visibleSeries.forEach(function(name) {
-          this.dataProvider(this.tag, name).then(function(data) {
-            if (dataUpdateId != this._dataUpdateIdOfLastRequest) {
-              // This response is outdated. Ignore it.
-              // TODO(chizeng): Explore canceling an outdated request before we even receive its
-              // response. This involves creating hooks into the request manager and might introduce
-              // some complexity that may not be worth it; Tensorboard frankly does not seem
-              // bottlenecked by the network (It is often run in fast corp networks or locally.).
-              return;
-            }
-            this.chart().setSeriesData(name, data);
-          }.bind(this));
-        }.bind(this));
-      },
-      _changeSeries: function() {
-        if (!this._attached) {
-           return;
-        }
-        else if (!this.visibleSeries) {
-          throw new Error('tf-chart-scaffold requires a visibleSeries.');
-        }
-
-        this.chart().setVisibleSeries(this.visibleSeries);
-        this.reload();
-      },
-      chart: function() {
-        var children = this.getEffectiveChildren();
-        if (!children.length) {
-          throw new Error('tf-chart-scaffold has no children');
-        }
-
-        var child = children[0];
-        if (!child.setVisibleSeries || !child.setSeriesData) {
-          throw new Error("tf-chart-scaffold's content doesn't implement the " +
-              "required interface");
-        }
-        return child;
-      }
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-panes-helper" assetpath="../tf-dashboard-common/">
-  <template>
-    <content></content>
-    <tf-no-data-warning data-type="[[dataType]]" show-warning="[[dataNotFound]]"></tf-no-data-warning>
-
-    <template is="dom-repeat" items="[[categories]]" as="category">
-      <tf-collapsable-pane name="[[category.name]]" count="[[_count(category.tags, selectedRuns.*)]]">
-        <div class="layout horizontal wrap">
-          <template is="dom-repeat" items="[[_categoryCards(category, selectedRuns.*, run2tag.*)]]">
-              <div class="card">
-                <div class="card-title-container" style="border-color: [[_titleBorderColor(item.run)]]">
-                  <div class="card-title" inner-h-t-m-l="[[_break(item.tag)]]"></div>
-                  <template is="dom-if" if="[[repeatForRuns]]">
-                    <div class="card-subtitle" title="[[item.run]]">[[item.run]]</div>
-                  </template>
-                </div>
-                <div class="card-content">
-                  <tf-chart-scaffold tag="[[item.tag]]" data-provider="[[dataProvider]]" visible-series="[[item.runs]]" on-ready="_instantiateTemplate">
-
-                  </tf-chart-scaffold>
-                </div>
-                <div class="card-bottom-row">
-                  <paper-icon-button class="expand-button" icon="fullscreen" on-tap="_toggleExpanded"></paper-icon-button>
-                  <template is="dom-if" if="[[showDownloadLinks]]">
-                    <tf-downloader runs="[[item.runs]]" tag="[[item.tag]]" url-fn="[[downloadLinkUrlFunction]]">
-                    </tf-downloader>
-                  </template>
-                </div>
-              </div>
-          </template>
-        </div>
-      </tf-collapsable-pane>
-    </template>
-
-    <style>
-      .card {
-        height: var(--card-height, 200px);
-        width: var(--card-width, 300px);
-        display: flex;
-        flex-direction: column;
-        margin: 5px;
-        padding: 0 30px 35px 0;
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        position: relative;
-      }
-
-      .card-expanded {
-        height: var(--card-expanded-height, 400px);
-        width: var(--card-expanded-width, 100%);
-      }
-
-      .card-title, .card-subtitle {
-        flex-grow: 0;
-        flex-shrink: 0;
-        font-size: 14px;
-        text-overflow: ellipsis;
-        overflow: hidden;
-      }
-
-      .card-subtitle {
-        font-size: 12px;
-      }
-
-      .card-content {
-        flex-grow: 1;
-        flex-shrink: 1;
-        display: flex;
-        margin-top: 10px;
-      }
-
-      .card-bottom-row {
-        position: absolute;
-        left: 0px;
-        bottom: 0px;
-        width: 100%;
-        display: flex;
-        flex-direction: row;
-        justify-content: space-between;
-        pointer-events: none;
-      }
-
-      .card-title-container {
-        border-left: 4px solid;
-        padding-left: 5px;
-      }
-
-      .expand-button {
-        color: #2196F3;
-        width: 32px;
-        height: 32px;
-        padding: 4px;
-        border-radius: 100%;
-        pointer-events: auto;
-      }
-
-      .card-expanded .expand-button {
-        background: var(--tb-ui-light-accent);
-      }
-
-      tf-downloader {
-        margin-right: 30px;
-        pointer-events: auto;
-      }
-
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-panes-helper",
-      properties: {
-        /**
-         * Categories that separate the template instances. Each category will
-         * be given its own collapsible pane. The category must be an array of
-         * objects, each with a 'name' property and a 'tags' array of strings.
-         */
-        categories: Array,
-
-        /**
-         * Input of the colors that are used for the user's runs.
-         */
-        colorScale: Object,
-
-        /**
-         * The name of the data type that is used by this dashboard. This will
-         * be used to display what is missing when there is no data available.
-         */
-        dataType: String,
-
-        /**
-         * The function that requests and returns a promise with the data of the
-         * required type for the templates from the backend.
-         */
-        dataProvider: Object,
-
-        /**
-         * If false, instantiates one template for each tag and calls
-         * setVisibleSeries on the first element of the template with all valid
-         * runs the tag has. If true, instantiates one template for each run of
-         * each tag, and calls setVisibleSeries of the first element of the
-         * instantiated template with just the one run.
-         */
-        repeatForRuns: {
-          type: Boolean,
-          value: false
-        },
-
-        /**
-         * Map from runs to the valid tags that have them.
-         */
-        run2tag: Object,
-
-        /**
-         * Array with the runs that are selected by the user (i.e. valid to be
-         * displayed).
-         */
-        selectedRuns: Array,
-
-        /**
-         * If true, shows a menu with download links for the template data.
-         * If this is set to true, urlFn must also be provided.
-         */
-        showDownloadLinks: Boolean,
-
-        /**
-         * Function that returns the route to get data to download. Must be
-         * provided if showDownloadLinks is enabled.
-         */
-        downloadLinkUrlFunction: Function,
-        _contentTemplate: {
-          type: Object,
-          value: null
-        },
-        _stampedTemplates: {
-          type: Array,
-          value: function() { return [] }
-        }
-      },
-      behaviors: [
-        Polymer.Templatizer,
-      ],
-
-      /**
-       * Initializes the Polymer.Templatizer behavior with the template supplied
-       * by the user. With this, all calls to this.stamp() will produce an
-       * instance of the user template.
-       */
-      _initTemplatizer: function() {
-        if (!this._contentTemplate) {
-          // First template is used as the content.
-          this._contentTemplate = Polymer.dom(this).querySelector('template');
-          this.templatize(this._contentTemplate);
-        }
-      },
-
-      /**
-       * Called every time a tf-chart-scaffold is ready, stamps the user
-       * template inside the scaffold element (before it is attached) and
-       * stores the stamped template in an array to use for data binding
-       * (forwardParentProp/Path).
-       */
-      _instantiateTemplate: function(e) {
-        var scaffold = e.target;
-        this._initTemplatizer();
-        var instance = this.stamp();
-        this._stampedTemplates.push(instance);
-        Polymer.dom(scaffold).appendChild(instance.root);
-      },
-      _toggleExpanded: function(e) {
-        var currentTarget = Polymer.dom(e.currentTarget);
-        var card = currentTarget.node.closest('.card');
-        var scaffold = card.querySelector('tf-chart-scaffold');
-        card.classList.toggle('card-expanded');
-        scaffold.chart().redraw();
-      },
-      _count: function(tags) {
-        if (!this.repeatForRuns) {
-          return tags.length;
-        }
-
-        var targetTags = d3.set(tags);
-        var count = 0;
-        this.selectedRuns.forEach(function(r) {
-          this.run2tag[r].forEach(function(t) {
-            if (targetTags.has(t)) {
-              count++;
-            }
-          });
-        }.bind(this));
-        return count;
-      },
-      _categoryCards: function(category) {
-        var cards = [];
-        category.tags.forEach(function(tag) {
-          var runs = this.selectedRuns.filter(function(r) {
-            return this.run2tag[r] && this.run2tag[r].indexOf(tag) !== -1;
-          }.bind(this));
-
-          if (this.repeatForRuns) {
-            runs.forEach(function(run) {
-              cards.push({tag: tag, run: run, runs: [run]});
-            });
-          } else {
-            cards.push({tag: tag, runs: runs});
-          }
-        }.bind(this));
-
-        return cards;
-      },
-      _titleBorderColor: function(run) {
-        return this.repeatForRuns ? this.colorScale.scale(run) : 'white';
-      },
-
-      /*
-       * Polymer data binding forwarding functions. Check the
-       * Polymer.Templatizer documentation for more information.
-       */
-
-      _forwardParentProp: function(property, value) {
-        this._stampedTemplates.forEach(function(instance) {
-          instance[property] = value;
-        });
-      },
-      _forwardParentPath: function(path, value) {
-        this._stampedTemplates.forEach(function(instance) {
-          instance.notifyPath(path, value, true);
-        });
-      },
-      // TODO(renatoutsch): implement the instance forwarding for two-way data
-      // binding.
-      // Add breaks to input so it will wrap nicely
-      _break: function(ipt) {
-        return ipt.replace(/([\/_-])/g, "$1<wbr>")
-      },
-    });
-  </script>
-</dom-module>
-<dom-module id="tf-storage" assetpath="../tf-storage/">
- <script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-/**
- * The Storage Module provides storage for URL parameters, and an API for
- * getting and setting TensorBoard's stateful URI.
- *
- * It generates URI components like: events&runPrefix=train*
- * which TensorBoard uses after like localhost:8000/#events&runPrefix=train*
- * to store state in the URI.
- */
-var TF;
-(function (TF) {
-    var URIStorage;
-    (function (URIStorage) {
-        /**
-         * A key that users cannot use, since TensorBoard uses this to store info
-         * about the active tab.
-         */
-        URIStorage.TAB = '__tab__';
-        /**
-         * The name of the property for users to set on a Polymer component
-         * in order for its stored properties to be stored in the URI unambiguously.
-         * (No need to set this if you want mutliple instances of the component to
-         * share URI state)
-         *
-         * Example:
-         * <my-component disambiguator="0"></my-component>
-         *
-         * The disambiguator should be set to any unique value so that multiple
-         * instances of the component can store properties in URI storage.
-         *
-         * Because it's hard to dereference this variable in HTML property bindings,
-         * it is NOT safe to change the disambiguator string without find+replace
-         * across the codebase.
-         */
-        URIStorage.DISAMBIGUATOR = 'disambiguator';
-        /**
-         * Return a boolean stored in the URI, given a corresponding key.
-         * Undefined if not found.
-         */
-        function getBoolean(key) {
-            var items = _componentToDict(_readComponent());
-            var item = items[key];
-            return item === 'true' ? true : item === 'false' ? false : undefined;
-        }
-        URIStorage.getBoolean = getBoolean;
-        /**
-         * Store a boolean in the URI, with a corresponding key.
-         */
-        function setBoolean(key, value) {
-            var items = _componentToDict(_readComponent());
-            items[key] = value.toString();
-            _writeComponent(_dictToComponent(items));
-        }
-        URIStorage.setBoolean = setBoolean;
-        /**
-         * Return a string stored in the URI, given a corresponding key.
-         * Undefined if not found.
-         */
-        function getString(key) {
-            var items = _componentToDict(_readComponent());
-            return items[key];
-        }
-        URIStorage.getString = getString;
-        /**
-         * Store a string in the URI, with a corresponding key.
-         */
-        function setString(key, value) {
-            var items = _componentToDict(_readComponent());
-            items[key] = value;
-            _writeComponent(_dictToComponent(items));
-        }
-        URIStorage.setString = setString;
-        /**
-         * Return a number stored in the URI, given a corresponding key.
-         * Undefined if not found.
-         */
-        function getNumber(key) {
-            var items = _componentToDict(_readComponent());
-            return items[key] === undefined ? undefined : +items[key];
-        }
-        URIStorage.getNumber = getNumber;
-        /**
-         * Store a number in the URI, with a corresponding key.
-         */
-        function setNumber(key, value) {
-            var items = _componentToDict(_readComponent());
-            items[key] = '' + value;
-            _writeComponent(_dictToComponent(items));
-        }
-        URIStorage.setNumber = setNumber;
-        /**
-         * Return an object stored in the URI, given a corresponding key.
-         * Undefined if not found.
-         */
-        function getObject(key) {
-            var items = _componentToDict(_readComponent());
-            return items[key] === undefined ? undefined : JSON.parse(atob(items[key]));
-        }
-        URIStorage.getObject = getObject;
-        /**
-         * Store an object in the URI, with a corresponding key.
-         */
-        function setObject(key, value) {
-            var items = _componentToDict(_readComponent());
-            items[key] = btoa(JSON.stringify(value));
-            _writeComponent(_dictToComponent(items));
-        }
-        URIStorage.setObject = setObject;
-        /**
-         * Get a unique storage name for a (Polymer component, propertyName) tuple.
-         *
-         * DISAMBIGUATOR must be set on the component, if other components use the
-         * same propertyName.
-         */
-        function getURIStorageName(component, propertyName) {
-            var d = component[URIStorage.DISAMBIGUATOR];
-            var components = d == null ? [propertyName] : [d, propertyName];
-            return components.join('.');
-        }
-        URIStorage.getURIStorageName = getURIStorageName;
-        /**
-         * Return a function that:
-         * (1) Initializes a Polymer boolean property with a default value, if its
-         *     value is not already set
-         * (2) Sets up listener that updates Polymer property on hash change.
-         */
-        function getBooleanInitializer(propertyName, defaultVal) {
-            return _getInitializer(getBoolean, propertyName, defaultVal);
-        }
-        URIStorage.getBooleanInitializer = getBooleanInitializer;
-        /**
-         * Return a function that:
-         * (1) Initializes a Polymer string property with a default value, if its
-         *     value is not already set
-         * (2) Sets up listener that updates Polymer property on hash change.
-         */
-        function getStringInitializer(propertyName, defaultVal) {
-            return _getInitializer(getString, propertyName, defaultVal);
-        }
-        URIStorage.getStringInitializer = getStringInitializer;
-        /**
-         * Return a function that:
-         * (1) Initializes a Polymer number property with a default value, if its
-         *     value is not already set
-         * (2) Sets up listener that updates Polymer property on hash change.
-         */
-        function getNumberInitializer(propertyName, defaultVal) {
-            return _getInitializer(getNumber, propertyName, defaultVal);
-        }
-        URIStorage.getNumberInitializer = getNumberInitializer;
-        /**
-         * Return a function that:
-         * (1) Initializes a Polymer Object property with a default value, if its
-         *     value is not already set
-         * (2) Sets up listener that updates Polymer property on hash change.
-         *
-         * Generates a deep clone of the defaultVal to avoid mutation issues.
-         */
-        function getObjectInitializer(propertyName, defaultVal) {
-            return _getInitializer(getObject, propertyName, defaultVal);
-        }
-        URIStorage.getObjectInitializer = getObjectInitializer;
-        /**
-         * Return a function that updates URIStorage when a string property changes.
-         */
-        function getBooleanObserver(propertyName, defaultVal) {
-            return _getObserver(getBoolean, setBoolean, propertyName, defaultVal);
-        }
-        URIStorage.getBooleanObserver = getBooleanObserver;
-        /**
-         * Return a function that updates URIStorage when a string property changes.
-         */
-        function getStringObserver(propertyName, defaultVal) {
-            return _getObserver(getString, setString, propertyName, defaultVal);
-        }
-        URIStorage.getStringObserver = getStringObserver;
-        /**
-         * Return a function that updates URIStorage when a number property changes.
-         */
-        function getNumberObserver(propertyName, defaultVal) {
-            return _getObserver(getNumber, setNumber, propertyName, defaultVal);
-        }
-        URIStorage.getNumberObserver = getNumberObserver;
-        /**
-         * Return a function that updates URIStorage when an object property changes.
-         * Generates a deep clone of the defaultVal to avoid mutation issues.
-         */
-        function getObjectObserver(propertyName, defaultVal) {
-            var clone = _.cloneDeep(defaultVal);
-            return _getObserver(getObject, setObject, propertyName, clone);
-        }
-        URIStorage.getObjectObserver = getObjectObserver;
-        /**
-         * Read component from URI (e.g. returns "events&runPrefix=train*").
-         */
-        function _readComponent() {
-            return TF.Globals.USE_HASH ? window.location.hash.slice(1) :
-                TF.Globals.FAKE_HASH;
-        }
-        /**
-         * Write component to URI.
-         */
-        function _writeComponent(component) {
-            if (TF.Globals.USE_HASH) {
-                window.location.hash = component;
-            }
-            else {
-                TF.Globals.FAKE_HASH = component;
-            }
-        }
-        /**
-         * Convert dictionary of strings into a URI Component.
-         * All key value entries get added as key value pairs in the component,
-         * with the exception of a key with the TAB value, which if present
-         * gets prepended to the URI Component string for backwards comptability
-         * reasons.
-         */
-        function _dictToComponent(items) {
-            var component = '';
-            // Add the tab name e.g. 'events', 'images', 'histograms' as a prefix
-            // for backwards compatbility.
-            if (items[URIStorage.TAB] !== undefined) {
-                component += items[URIStorage.TAB];
-            }
-            // Join other strings with &key=value notation
-            var nonTab = _.pairs(items)
-                .filter(function (pair) { return pair[0] !== URIStorage.TAB; })
-                .map(function (pair) {
-                return encodeURIComponent(pair[0]) + '=' +
-                    encodeURIComponent(pair[1]);
-            })
-                .join('&');
-            return nonTab.length > 0 ? (component + '&' + nonTab) : component;
-        }
-        /**
-         * Convert a URI Component into a dictionary of strings.
-         * Component should consist of key-value pairs joined by a delimiter
-         * with the exception of the tabName.
-         * Returns dict consisting of all key-value pairs and
-         * dict[TAB] = tabName
-         */
-        function _componentToDict(component) {
-            var items = {};
-            var tokens = component.split('&');
-            tokens.forEach(function (token) {
-                var kv = token.split('=');
-                // Special backwards compatibility for URI components like #events
-                if (kv.length === 1 && _.contains(TF.Globals.TABS, kv[0])) {
-                    items[URIStorage.TAB] = kv[0];
-                }
-                else if (kv.length === 2) {
-                    items[decodeURIComponent(kv[0])] = decodeURIComponent(kv[1]);
-                }
-            });
-            return items;
-        }
-        /**
-         * Return a function that:
-         * (1) Initializes a Polymer property with a default value, if its
-         *     value is not already set
-         * (2) Sets up listener that updates Polymer property on hash change.
-         */
-        function _getInitializer(get, propertyName, defaultVal) {
-            return function () {
-                var _this = this;
-                var URIStorageName = getURIStorageName(this, propertyName);
-                // setComponentValue will be called every time the hash changes, and is
-                // responsible for ensuring that new state in the hash will be propagated
-                // to the component with that property.
-                // It is important that this function does not re-assign needlessly,
-                // to avoid Polymer observer churn.
-                var setComponentValue = function () {
-                    var uriValue = get(URIStorageName);
-                    var currentValue = _this[propertyName];
-                    // if uriValue is undefined, we will ensure that the property has the
-                    // default value
-                    if (uriValue === undefined) {
-                        if (!_.isEqual(currentValue, defaultVal)) {
-                            // If we don't have an explicit URI value, then we need to ensure
-                            // the property value is equal to the default value.
-                            // We will assign a clone rather than the canonical default, because
-                            // the component receiving this property may mutate it, and we need
-                            // to keep a pristine copy of the default.
-                            _this[propertyName] = _.clone(defaultVal);
-                        }
-                    }
-                    else if (!_.isEqual(uriValue, currentValue)) {
-                        _this[propertyName] = uriValue;
-                    }
-                };
-                // Set the value on the property.
-                setComponentValue();
-                // Update it when the hashchanges.
-                window.addEventListener('hashchange', setComponentValue);
-            };
-        }
-        /**
-         * Return a function that updates URIStorage when a property changes.
-         */
-        function _getObserver(get, set, propertyName, defaultVal) {
-            return function () {
-                var URIStorageName = getURIStorageName(this, propertyName);
-                var newVal = this[propertyName];
-                if (!_.isEqual(newVal, get(URIStorageName))) {
-                    if (_.isEqual(newVal, defaultVal)) {
-                        _unset(URIStorageName);
-                    }
-                    else {
-                        set(URIStorageName, newVal);
-                    }
-                }
-            };
-        }
-        /**
-         * Delete a key from the URI.
-         */
-        function _unset(key) {
-            var items = _componentToDict(_readComponent());
-            delete items[key];
-            _writeComponent(_dictToComponent(items));
-        }
-    })(URIStorage = TF.URIStorage || (TF.URIStorage = {}));
-})(TF || (TF = {}));
-</script>
-</dom-module>
-
-<dom-module id="tf-regex-group" assetpath="../tf-dashboard-common/">
-  <template>
-    <div class="regex-list">
-      <template is="dom-repeat" items="{{rawRegexes}}">
-        <div class="regex-line">
-          <paper-input id="text-input" class="regex-input" label="Write a regex to create a tag group" no-label-float="" value="{{item.regex}}" invalid="[[!item.valid]]" on-keyup="moveFocus"></paper-input>
-          <paper-icon-button icon="close" class="delete-button" aria-label="Delete Regex" tabindex="0" on-tap="deleteRegex"></paper-icon-button>
-        </div>
-        <style>
-          .regex-input {
-            width: 250px;
-            display: inline-block;
-            margin-left: -3px;
-          }
-
-          .delete-button {
-            color: var(--paper-grey-700);
-            width: 40px;
-            height: 40px;
-            margin-right: -10px;
-          }
-
-          .regex-list {
-            margin-bottom: 10px;
-          }
-
-          paper-input {
-            --paper-input-container-focus-color: var(--tb-orange-strong);
-            --paper-input-container-input: {
-              font-size: 14px;
-            };
-            --paper-input-container-label: {
-              font-size: 14px;
-            };
-          }
-        </style>
-      </template>
-    </div>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-regex-group",
-      properties: {
-        rawRegexes: {
-          type: Array,
-          value: TF.URIStorage.getObjectInitializer('rawRegexes', [{regex: "", valid: true}]),
-        },
-        regexes: {type: Array, computed: "usableRegexes(rawRegexes.*)", notify: true},
-      },
-      observers: [
-        "addNewRegexIfNeeded(rawRegexes.*)",
-        "checkValidity(rawRegexes.*)",
-        "_uriStoreRegexes(rawRegexes.*)",
-      ],
-      _uriStoreRegexes: TF.URIStorage.getObjectObserver('rawRegexes', [{regex: "", valid: true}]),
-      checkValidity: function(x) {
-        var match = x.path.match(/rawRegexes\.(\d+)\.regex/);
-        if (match) {
-          var idx = match[1];
-          this.set("rawRegexes." + idx + ".valid", this.isValid(x.value));
-        }
-      },
-      isValid: function(s) {
-        try {
-          new RegExp(s);
-          return true;
-        } catch (e) {
-          return false;
-        }
-      },
-      usableRegexes: function(regexes) {
-        var isValid = this.isValid;
-        return regexes.base.filter(function (r) {
-          // Checking validity here (rather than using the data property)
-          // is necessary because otherwise we might send invalid regexes due
-          // to the fact that this function can call before the observer does
-          return r.regex !== "" && isValid(r.regex);
-        }).map(function(r) {
-          return r.regex;
-        });
-      },
-      addNewRegexIfNeeded: function() {
-        var last = this.rawRegexes[this.rawRegexes.length - 1];
-        if (last.regex !== "") {
-          this.push("rawRegexes", {regex: "", valid: true});
-        }
-      },
-      deleteRegex: function(e) {
-        if (this.rawRegexes.length > 1) {
-          this.splice("rawRegexes", e.model.index, 1);
-        }
-      },
-      moveFocus: function(e) {
-        if (e.keyCode === 13) {
-          var idx = e.model.index;
-          var inputs = Polymer.dom(this.root).querySelectorAll(".regex-input");
-          if (idx < this.rawRegexes.length - 1) {
-            inputs[idx+1].$.input.focus();
-          } else {
-            document.activeElement.blur();
-          }
-        }
-      }
-    });
-  </script>
-</dom-module>
-<link rel="import" href="../paper-toggle-button/paper-toggle-button.html">
-
-<dom-module id="tf-categorizer" assetpath="../tf-dashboard-common/">
-  <template>
-    <div class="inputs">
-      <tf-regex-group id="regexGroup" regexes="{{regexes}}"></tf-regex-group>
-    </div>
-    <style>
-      :host {
-        display: block;
-        padding-bottom: 5px;
-      }
-      paper-checkbox {
-        --paper-checkbox-checked-color: var(--paper-grey-600);
-        --paper-checkbox-unchecked-color: var(--paper-grey-600);
-        font-size: 14px;
-      }
-    </style>
-  </template>
-  <script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var Categorizer;
-(function (Categorizer) {
-    /**
-     * This module contains methods that allow sorting tags into 'categories'.
-     * A category contains a name and a list of tags.
-     * The sorting strategy is defined by a 'CustomCategorization', which contains
-     * 'categoryDefinitions' which are regex rules used to construct a category.
-     * E.g. the regex rule 'xent' will create a category called 'xent' that
-     * contains values whose tags match the regex.
-     *
-     * After custom categories are evaluated, the tags are sorted by a hardcoded
-     * fallback categorizer, which may, for example, group tags into categories
-     * based on their top namespace.
-     */
-    /* Canonical TensorFlow ops are namespaced using forward slashes.
-     * This fallback categorizer categorizes by the top-level namespace.
-     */
-    Categorizer.topLevelNamespaceCategorizer = splitCategorizer(/\//);
-    function fallbackCategorizer(s) {
-        switch (s) {
-            case 'TopLevelNamespaceCategorizer':
-                return Categorizer.topLevelNamespaceCategorizer;
-            default:
-                throw new Error('Unrecognized categorization strategy: ' + s);
-        }
-    }
-    Categorizer.fallbackCategorizer = fallbackCategorizer;
-    /* An 'extractor' is a function that takes a tag name, and 'extracts' a
-     * category name.
-     * This function takes an extractor, and produces a categorizer.
-     * Currently, it is just used for the fallbackCategorizer, but we may want to
-     * refactor the general categorization logic to use the concept of extractors.
-     */
-    function extractorToCategorizer(extractor) {
-        return function (tags) {
-            if (tags.length === 0) {
-                return [];
-            }
-            var sortedTags = tags.slice().sort(VZ.Sorting.compareTagNames);
-            var categories = [];
-            var currentCategory = {
-                name: extractor(sortedTags[0]),
-                tags: [],
-            };
-            sortedTags.forEach(function (t) {
-                var topLevel = extractor(t);
-                if (currentCategory.name !== topLevel) {
-                    categories.push(currentCategory);
-                    currentCategory = {
-                        name: topLevel,
-                        tags: [],
-                    };
-                }
-                currentCategory.tags.push(t);
-            });
-            categories.push(currentCategory);
-            return categories;
-        };
-    }
-    function splitCategorizer(r) {
-        var extractor = function (t) {
-            return t.split(r)[0];
-        };
-        return extractorToCategorizer(extractor);
-    }
-    function defineCategory(ruledef) {
-        var r = new RegExp(ruledef);
-        var f = function (tag) {
-            return r.test(tag);
-        };
-        return { name: ruledef, matches: f };
-    }
-    Categorizer.defineCategory = defineCategory;
-    function _categorizer(rules, fallback) {
-        return function (tags) {
-            var remaining = d3.set(tags);
-            var userSpecified = rules.map(function (def) {
-                var tags = [];
-                remaining.forEach(function (t) {
-                    if (def.matches(t)) {
-                        tags.push(t);
-                    }
-                });
-                var cat = { name: def.name, tags: tags.sort(VZ.Sorting.compareTagNames) };
-                return cat;
-            });
-            var defaultCategories = fallback(remaining.values());
-            return userSpecified.concat(defaultCategories);
-        };
-    }
-    Categorizer._categorizer = _categorizer;
-    function categorizer(s) {
-        var rules = s.categoryDefinitions.map(defineCategory);
-        var fallback = fallbackCategorizer(s.fallbackCategorizer);
-        return _categorizer(rules, fallback);
-    }
-    Categorizer.categorizer = categorizer;
-    ;
-})(Categorizer || (Categorizer = {}));
-</script>
-  <script>
-    Polymer({
-      is: "tf-categorizer",
-      properties: {
-        regexes: {type: Array},
-        tags: {type: Array},
-        categoriesAreExclusive: {type: Boolean, value: true},
-        fallbackCategorizer: {
-          type: String,
-          value: "TopLevelNamespaceCategorizer",
-        },
-        categorizer: {
-          type: Object,
-          computed: "computeCategorization(regexes.*, categoriesAreExclusive, fallbackCategorizer)",
-        },
-        categories: {type: Array, value: function() {return [];}, notify: true, readOnly: true},
-      },
-      observers: ['recategorize(tags.*, categorizer)'],
-      computeCategorization: function(regexes, categoriesAreExclusive, fallbackCategorizer) {
-        var categorizationStrategy = {
-          categoryDefinitions: regexes.base,
-          categoriesAreExclusive: categoriesAreExclusive,
-          fallbackCategorizer: fallbackCategorizer,
-        };
-        return Categorizer.categorizer(categorizationStrategy);
-      },
-      recategorize: function() {
-        this.debounce("tf-categorizer-recategorize", function (){
-          var categories = this.categorizer(this.tags);
-          this._setCategories(categories);
-        })
-      },
-    });
-  </script>
-</dom-module>
-<dom-module id="run-color-style" assetpath="../tf-dashboard-common/">
-  <template>
-    <style>
-    [color-class="light-blue"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-light-blue-500);
-      --paper-checkbox-checked-ink-color: var(--paper-light-blue-500);
-      --paper-checkbox-unchecked-color: var(--paper-light-blue-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-light-blue-900);
-    }
-    [color-class="red"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-red-500);
-      --paper-checkbox-checked-ink-color: var(--paper-red-500);
-      --paper-checkbox-unchecked-color: var(--paper-red-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-red-900);
-    }
-    [color-class="green"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-green-500);
-      --paper-checkbox-checked-ink-color: var(--paper-green-500);
-      --paper-checkbox-unchecked-color: var(--paper-green-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-green-900);
-    }
-    [color-class="purple"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-purple-500);
-      --paper-checkbox-checked-ink-color: var(--paper-purple-500);
-      --paper-checkbox-unchecked-color: var(--paper-purple-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-purple-900);
-    }
-    [color-class="teal"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-teal-500);
-      --paper-checkbox-checked-ink-color: var(--paper-teal-500);
-      --paper-checkbox-unchecked-color: var(--paper-teal-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-teal-900);
-    }
-    [color-class="pink"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-pink-500);
-      --paper-checkbox-checked-ink-color: var(--paper-pink-500);
-      --paper-checkbox-unchecked-color: var(--paper-pink-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-pink-900);
-    }
-    [color-class="orange"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-orange-500);
-      --paper-checkbox-checked-ink-color: var(--paper-orange-500);
-      --paper-checkbox-unchecked-color: var(--paper-orange-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-orange-900);
-    }
-    [color-class="brown"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-brown-500);
-      --paper-checkbox-checked-ink-color: var(--paper-brown-500);
-      --paper-checkbox-unchecked-color: var(--paper-brown-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-brown-900);
-    }
-    [color-class="indigo"] paper-checkbox {
-      --paper-checkbox-checked-color: var(--paper-indigo-500);
-      --paper-checkbox-checked-ink-color: var(--paper-indigo-500);
-      --paper-checkbox-unchecked-color: var(--paper-indigo-900);
-      --paper-checkbox-unchecked-ink-color: var(--paper-indigo-900);
-    }
-    </style>
-  </template>
-</dom-module>
-
-<dom-module id="tf-multi-checkbox" assetpath="../tf-dashboard-common/">
-  <style include="scrollbar-style"></style>
-  <style include="run-color-style"></style>
-
-  <template>
-      <paper-input id="runs-regex" no-label-float="" label="Write a regex to filter runs" value="[[regexInput]]" on-bind-value-changed="_debouncedRegexChange"></paper-input>
-    <div id="outer-container" class="scrollbar">
-      <template is="dom-repeat" items="[[namesMatchingRegex]]">
-        <div class="run-row">
-          <div class="icon-container checkbox-container vertical-align-container">
-            <paper-checkbox class="checkbox vertical-align-center" name="[[item]]" checked$="[[_isChecked(item, runSelectionState.*)]]" on-change="_checkboxChange"></paper-checkbox>
-
-          </div>
-          <div class="icon-container isolator-container vertical-align-container">
-            <paper-icon-button icon="radio-button-unchecked" class="isolator vertical-align-center" on-tap="_isolateRun" name="[[item]]"></paper-icon-button>
-          </div>
-          <div class="item-label-container">
-            <span>[[item]]</span>
-          </div>
-        </div>
-      </template>
-    </div>
-  <style>
-    paper-input {
-      --paper-input-container-focus-color: var(--tb-orange-strong);
-      --paper-input-container-input: {
-        font-size: 14px;
-      };
-      --paper-input-container-label: {
-        font-size: 14px;
-      };
-    }
-    :host {
-      display: flex;
-      flex-direction: column;
-      height: 100%;
-    }
-    #outer-container {
-      overflow-y: auto;
-      overflow-x: hidden;
-      width: 100%;
-      height: 0; /* Quirk to make firefox add scrolling instead of expand div */
-      flex-grow: 1;
-      flex-shrink: 1;
-      word-wrap: break-word;
-    }
-    .run-row {
-      padding-top: 5px;
-      padding-bottom: 5px;
-      display: flex;
-      flex-direction: row;
-      font-size: 13px;
-    }
-    .icon-container {
-      flex-grow: 0;
-      flex-shrink: 0;
-      padding-left: 2px;
-    }
-    .checkbox {
-      padding-left: 2px;
-      width: 18px;
-      height: 18px;
-    }
-    .isolator {
-      width: 18px;
-      height: 18px;
-      padding: 0px;
-    }
-    .isolator-container {
-      padding-left: 6px;
-      padding-right: 3px;
-    }
-    .checkbox-container {
-      padding-left: 2px;
-    }
-    .item-label-container {
-      padding-left: 5px;
-      flex-grow: 1;
-      flex-shrink: 1;
-      width: 0px; /* hack to get the flex-grow to work properly */
-    }
-    .tooltip-value-container {
-      display: flex;
-      justify-content: center;
-      flex-grow: 0;
-      flex-shrink: 0;
-      text-align:right;
-      padding-left: 2px;
-    }
-    .vertical-align-container {
-      display: flex;
-      justify-content: center;
-    }
-    .vertical-align-container .vertical-align-center {
-      align-self: center;
-    }
-    .vertical-align-container .vertical-align-top {
-      align-self: start;
-    }
-  </style>
-  </template>
-
-  <script>
-  Polymer({
-    is: "tf-multi-checkbox",
-    properties: {
-      names: {
-        type: Array,
-        value: function() {return [];},
-      }, // All the runs in consideration
-      regexInput: {
-        type: String,
-        value: TF.URIStorage.getStringInitializer("regexInput", ""),
-        observer: "_regexInputObserver",
-      }, // Regex for filtering the runs
-      regex: {
-        type: Object,
-        computed: "_makeRegex(regexInput)"
-      },
-      namesMatchingRegex: {
-        type: Array,
-        computed: "computeNamesMatchingRegex(names.*, regex)"
-      }, // Runs that match the regex
-      runSelectionState: {
-      // if a run is explicitly enabled, True, if explicitly disabled, False.
-      // if undefined, default value (enable for first k runs, disable after).
-        type: Object,
-        value: TF.URIStorage.getObjectInitializer('runSelectionState', {}),
-      },
-      // (Allows state to persist across regex filtering)
-      outSelected: {
-        type: Array,
-        notify: true,
-        computed: 'computeOutSelected(namesMatchingRegex.*, runSelectionState.*)'
-      },
-      colorScale: {
-        type: Object,
-        observer: "synchronizeColors",
-      }, // map from run name to css class
-      numRunsEnabledByDefault: {
-        // When TB first loads, first k runs are enabled, rest are disabled.
-        type: Number,
-        value: 10,
-      },
-      _debouncedRegexChange: {
-        type: Function,
-        // Updating the regex can be slow, because it involves updating styles
-        // on a large number of Polymer paper-checkboxes. We don't want to do
-        // this while the user is typing, as it may make a bad, laggy UI.
-        // So we debounce the updates that come from user typing.
-        value: function() {
-          _this = this;
-          var debounced = _.debounce(function(r) {
-            _this.regexInput = r;
-          }, 150, {leading: false});
-          return function() {
-            var r = this.$$("#runs-regex").value;
-            if (r == "") {
-              // If the user cleared the field, they may be done typing, so
-              // update more quickly.
-              this.async(function() {
-                _this.regexInput = r;
-              }, 30);
-            } else {
-              debounced(r);
-            };
-          };
-        },
-      },
-    },
-    listeners: {
-      'dom-change': 'synchronizeColors',
-    },
-    observers: [
-      "_setIsolatorIcon(runSelectionState, names)",
-      "_storeRunToIsCheckedMapping(runSelectionState)",
-    ],
-    _storeRunToIsCheckedMapping: TF.URIStorage.getObjectObserver('runSelectionState', {}),
-    _makeRegex: function(regex) {
-      try {
-        return new RegExp(regex)
-      } catch (e) {
-        return null;
-      }
-    },
-    _setIsolatorIcon: function() {
-      var runMap = this.runSelectionState;
-      var numChecked = _.filter(_.values(runMap)).length;
-      var buttons = Array.prototype.slice.call(this.querySelectorAll(".isolator"));
-
-      buttons.forEach(function(b) {
-        if (numChecked === 1 && runMap[b.name]) {
-          b.icon = "radio-button-checked";
-        } else {
-          b.icon = "radio-button-unchecked";
-        }
-      });
-    },
-    computeNamesMatchingRegex: function(__, ___) {
-      var regex = this.regex;
-      return this.names.filter(function(n) {
-        return regex == null || regex.test(n);
-      });
-    },
-    computeOutSelected: function(__, ___) {
-      var runSelectionState = this.runSelectionState;
-      var num = this.numRunsEnabledByDefault;
-      return this.namesMatchingRegex.filter(function(n, i) {
-        return runSelectionState[n] == null ? i<num : runSelectionState[n];
-      });
-    },
-    synchronizeColors: function(e) {
-      if (!this.colorScale) return;
-
-      this._setIsolatorIcon();
-
-      var checkboxes = Array.prototype.slice.call(this.querySelectorAll("paper-checkbox"));
-      var scale = this.colorScale;
-      checkboxes.forEach(function(p) {
-        var color = scale.scale(p.name);
-        p.customStyle['--paper-checkbox-checked-color'] = color;
-        p.customStyle['--paper-checkbox-checked-ink-color'] = color;
-        p.customStyle['--paper-checkbox-unchecked-color'] = color;
-        p.customStyle['--paper-checkbox-unchecked-ink-color'] = color;
-      });
-      var buttons = Array.prototype.slice.call(this.querySelectorAll(".isolator"));
-      buttons.forEach(function(p) {
-        var color = scale.scale(p.name);
-        p.style['color'] = color;
-      });
-      // The updateStyles call fails silently if the browser doesn't have focus,
-      // e.g. if TensorBoard was opened into a new tab that isn't visible.
-      // So we wait for requestAnimationFrame.
-      var _this = this;
-      window.requestAnimationFrame(function() {_this.updateStyles();});
-    },
-    _isolateRun: function(e) {
-      // If user clicks on the label for one run, enable it and disable all other runs.
-
-      var name = Polymer.dom(e).localTarget.name;
-      var selectionState = {};
-      this.names.forEach(function(n) {
-        selectionState[n] = n == name;
-      })
-      this.runSelectionState = selectionState;
-    },
-    _checkboxChange: function(e) {
-      var target = Polymer.dom(e).localTarget;
-      this.runSelectionState[target.name] = target.checked;
-      // n.b. notifyPath won't work because run names may have periods.
-      this.runSelectionState = _.clone(this.runSelectionState);
-    },
-    _isChecked: function(item, outSelectedChange) {
-      return this.outSelected.indexOf(item) != -1;
-    },
-    _regexInputObserver: TF.URIStorage.getStringObserver("regexInput", ""),
-    toggleAll: function() {
-      var _this = this;
-      var allToggledOn = this.namesMatchingRegex
-                    .every(function(n) {return _this.runSelectionState[n]});
-
-      var runSelectionStateIsDefault = Object.keys(this.runSelectionState).length == 0;
-
-      var numRuns = this.namesMatchingRegex.length;
-
-      var shouldDisable = allToggledOn || runSelectionStateIsDefault;
-
-      var newRunsDisabled = {};
-      this.names.forEach(function(n) {
-        newRunsDisabled[n] = !shouldDisable;
-      })
-      this.runSelectionState = newRunsDisabled;
-    },
-  });
-  </script>
-</dom-module>
-
-<dom-module id="tf-run-selector" assetpath="../tf-dashboard-common/">
-  <template>
-    <paper-dialog with-backdrop="" id="logdir-dialog">
-      <h2>logdir</h2>
-      <div inner-h-t-m-l="{{_breakString(logdir)}}"></div>
-    </paper-dialog>
-    <div id="top-text">
-      <h3 id="tooltip-help" class="tooltip-container">
-        Runs
-      </h3>
-    </div>
-    <tf-multi-checkbox id="multiCheckbox" names="[[runs]]" out-selected="{{outSelected}}" color-scale="[[colorScale]]"></tf-multi-checkbox>
-    <paper-button class="x-button" id="toggle-all" on-tap="_toggleAll">
-    Toggle All Runs
-    </paper-button>
-    <template is="dom-if" if="[[logdir]]">
-      <div id="logdir">
-        <span id="clipped-logdir" inner-h-t-m-l="[[_clippedLogdir]]"></span><template is="dom-if" if="[[_shouldShowExpandLogdirButton(logdir, _logdirClipLength)]]"><a href="" on-click="_openLogdirDialog">…</a>
-        </template>
-      </div>
-    </template>
-    <style>
-      :host {
-        display: flex;
-        flex-direction: column;
-        padding-bottom: 10px;
-        box-sizing: border-box;
-      }
-      #top-text {
-        width: 100%;
-        flex-grow: 0;
-        flex-shrink: 0;
-        padding-right: 16px;
-        box-sizing: border-box;
-        color: var(--paper-grey-800);
-      }
-      tf-multi-checkbox {
-        display: flex;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-      .x-button {
-        font-size: 13px;
-        background-color: var(--tb-ui-light-accent);
-        color: var(--tb-ui-dark-accent);
-      }
-      #tooltip-help {
-        color: var(--paper-grey-800);
-        margin: 0;
-        font-weight: normal;
-        font-size: 14px;
-        margin-bottom: 5px;
-      }
-      paper-button {
-        margin-left: 0;
-      }
-      #logdir {
-        color: var(--tb-ui-dark-accent);
-        font-size: 13px;
-        margin: 5px 0 0 0;
-        max-width: 288px;
-      }
-    </style>
-  </template>
-  <script>
-  Polymer({
-    is: "tf-run-selector",
-    properties: {
-      backend: Object,
-      outSelected: {type: Array, notify: true},
-      // runs: an array of strings, representing the run names that may be chosen
-      runs: Array,
-      colorScale: Object, // TF.ColorScale
-      logdir: {
-        type: String,
-        notify: true,
-      },
-      // This is the potentially clipped portion of the logdir we show at the bottom of the sidebar.
-      _clippedLogdir: {
-        type: String,
-      },
-      _logdirClipLength: {
-        type: Number,
-        value: 250,
-        readOnly: true,
-      },
-    },
-    observers: [
-      "_logdirSet(logdir)",
-    ],
-    ready: function() {
-      // Populate the logdir.
-      this.backend.logdir().then(logdirObject => {
-        this.set('logdir', logdirObject.logdir);
-      }).catch(e => {
-        // Fetching the logdir failed. Prevent the exception from logging to
-        // console. The console already logs a 404 network event.
-      });
-    },
-    _toggleAll: function() {
-      this.$.multiCheckbox.toggleAll();
-    },
-    // Break the string at natural points, including commas, equals, and slashes
-    _breakString: function(originalString) {
-      return originalString.replace(/([\/=\-_,])/g, "$1<wbr>");
-    },
-    _logdirSet: function(logdir) {
-      if (!logdir) {
-        // The logdir has not been set yet.
-        return;
-      }
-
-      var lineBrokenText;
-      if (logdir.length > this._logdirClipLength) {
-        // Clip the logdir to avoid blocking the runs selector. Let the user view a more full
-        // version of the logdir.
-        lineBrokenText = this._breakString(logdir.substring(0, this._logdirClipLength));
-      } else {
-        lineBrokenText = this._breakString(logdir);
-      }
-      this.set('_clippedLogdir', lineBrokenText);
-    },
-    _openLogdirDialog: function(event) {
-      event.preventDefault();
-      this.$$('#logdir-dialog').open();
-    },
-    _shouldShowExpandLogdirButton(logdir, _logdirClipLength) {
-      return logdir && logdir.length > _logdirClipLength;
-    },
-  });
-  </script>
-</dom-module>
-
-<dom-module id="tf-sidebar-helper" assetpath="../tf-dashboard-common/">
-  <template>
-    <div class="sidebar-section">
-      <tf-categorizer id="categorizer" tags="[[tags]]" categories="{{categories}}"></tf-categorizer>
-      <content select=".extend-first-section"></content>
-    </div>
-    <content></content>
-    <div class="sidebar-section">
-      <tf-run-selector id="runSelector" backend="[[backend]]" runs="[[runs]]" color-scale="[[colorScale]]" out-selected="{{selectedRuns}}"></tf-run-selector>
-    </div>
-    <style include="dashboard-style"></style>
-    <style>
-      :host {
-        display: flex;
-        flex-direction: column;
-        height: 100%;
-      }
-
-      #categorizer {
-        flex-shrink: 0;
-      }
-
-      #runSelector {
-        flex-shrink: 1;
-        flex-grow: 1;
-      }
-
-      .sidebar-section {
-        border-top: solid 1px rgba(0, 0, 0, 0.12);
-        padding: 20px 0px 20px 30px;
-      }
-
-      .sidebar-section:first-child {
-        border: none;
-      }
-
-      .sidebar-section:last-child {
-        flex-grow: 1;
-        display: flex;
-      }
-
-      paper-checkbox {
-        --paper-checkbox-checked-color: var(--tb-ui-dark-accent);
-        --paper-checkbox-unchecked-color: var(--tb-ui-dark-accent);
-        font-size: 14px;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-sidebar-helper",
-      properties: {
-        /**
-         * The backend object used to issue requests.
-         */
-        backend: Object,
-
-        /**
-         * This is an output of the categories that the user selected to
-         * separate the different tags. Each category here should be given its
-         * own collapsible pane.
-         */
-        categories: {
-          type: Array,
-          notify: true,
-        },
-
-        /**
-         * Input of the colors that are used for the user's runs.
-         */
-        colorScale: Object,
-
-        /**
-         * Map from runs to the valid tags that have them.
-         */
-        run2tag: Object,
-
-        /**
-         * Input of all valid runs that can be selected by the user.
-         */
-        runs: Array,
-
-        /**
-         * Outputs an array with the runs that are selected by the user (i.e.
-         * valid to be displayed).
-         */
-        selectedRuns: {
-          type: Array,
-          notify: true,
-        },
-
-        tags: {
-          type: Array,
-          computed: "_getTags(run2tag.*)"
-        },
-      },
-      _getTags: function() {
-        return _.union.apply(null, _.values(this.run2tag));
-      },
-    })
-  </script>
-</dom-module>
-
-<dom-module id="vz-line-chart" assetpath="../vz-line-chart/">
-  <template>
-    <div id="tooltip">
-      <table>
-        <thead>
-          <tr>
-            <th></th>
-            <th>Name</th>
-            <template is="dom-if" if="{{smoothingEnabled}}">
-              <th>Smoothed</th>
-            </template>
-            <th>Value</th>
-            <th>Step</th>
-            <th>Time</th>
-            <th>Relative</th>
-          </tr>
-        </thead>
-        <tbody>
-        </tbody>
-      </table>
-    </div>
-    <svg id="chartsvg"></svg>
-    <style>
-      :host {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        display: flex;
-        flex-direction: column;
-        flex-grow: 1;
-        flex-shrink: 1;
-        position: relative;
-        outline: none;
-      }
-      svg {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-      td {
-        padding-left: 5px;
-        padding-right: 5px;
-        font-size: 13px;
-        opacity: 1;
-      }
-      #tooltip {
-        pointer-events: none;
-        position: absolute;
-        opacity: 0;
-        box-shadow: 0 1px 4px rgba(0, 0, 0, 0.3);
-        font-size: 14px;
-        background: rgba(0, 0, 0, 0.8);
-        color: white;
-        border-radius: 4px;
-        line-height: 1.4em;
-        padding: 8px;
-        z-index: 5;
-        cursor: none;
-        margin-top: 10px;
-      }
-      .swatch {
-        border-radius: 50%;
-        width: 14px;
-        height: 14px;
-        display: block;
-        border: 2px solid rgba(0,0,0,0);
-      }
-      .closest .swatch {
-        border: 2px solid white;
-      }
-      th {
-        padding-left: 5px;
-        padding-right: 5px;
-        text-align: left;
-      }
-      .distant td {
-        opacity: 0.8;
-      }
-
-      .distant td.swatch {
-        opacity: 1;
-      }
-
-      .ghost {
-        opacity: 0.2;
-        stroke-width: 1px;
-      }
-
-      #chartsvg line.guide-line {
-        stroke: #999;
-        stroke-width: 1.5px;
-      }
-
-    </style>
-  </template>
-  <script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-var Plottable;
-(function (Plottable) {
-    var DragZoomLayer = (function (_super) {
-        __extends(DragZoomLayer, _super);
-        /**
-         * Constructs a SelectionBoxLayer with an attached DragInteraction and
-         * ClickInteraction. On drag, it triggers an animated zoom into the box
-         * that was dragged. On double click, it zooms back out to the original
-         * view, before any zooming.
-         * The zoom animation uses an easing function (default
-         * d3.ease('cubic-in-out')) and is customizable.
-         * Usage: Construct the selection box layer and attach x and y scales,
-         * and then add the layer over the plot you are zooming on using a
-         * Component Group.
-         * TODO(danmane) - merge this into Plottable
-         */
-        function DragZoomLayer(xScale, yScale, unzoomMethod) {
-            var _this = _super.call(this) || this;
-            _this.isZoomed = false;
-            _this.easeFn = d3.ease('cubic-in-out');
-            _this._animationTime = 750;
-            _this.xScale(xScale);
-            _this.yScale(yScale);
-            _this._dragInteraction = new Plottable.Interactions.Drag();
-            _this._dragInteraction.attachTo(_this);
-            _this._doubleClickInteraction = new Plottable.Interactions.DoubleClick();
-            _this._doubleClickInteraction.attachTo(_this);
-            _this.setupCallbacks();
-            _this.unzoomMethod = unzoomMethod;
-            return _this;
-        }
-        /**
-         * Register a method that calls when the DragZoom interaction starts.
-         */
-        DragZoomLayer.prototype.interactionStart = function (cb) { this.onStart = cb; };
-        /**
-         * Register a method that calls when the DragZoom interaction ends.
-         */
-        DragZoomLayer.prototype.interactionEnd = function (cb) { this.onEnd = cb; };
-        DragZoomLayer.prototype.setupCallbacks = function () {
-            var _this = this;
-            var dragging = false;
-            this._dragInteraction.onDragStart(function (startPoint) {
-                _this.bounds({
-                    topLeft: startPoint,
-                    bottomRight: startPoint,
-                });
-                _this.onStart();
-            });
-            this._dragInteraction.onDrag(function (startPoint, endPoint) {
-                _this.bounds({ topLeft: startPoint, bottomRight: endPoint });
-                _this.boxVisible(true);
-                dragging = true;
-            });
-            this._dragInteraction.onDragEnd(function (startPoint, endPoint) {
-                _this.boxVisible(false);
-                _this.bounds({ topLeft: startPoint, bottomRight: endPoint });
-                if (dragging) {
-                    _this.zoom();
-                }
-                else {
-                    _this.onEnd();
-                }
-                dragging = false;
-            });
-            this._doubleClickInteraction.onDoubleClick(this.unzoom.bind(this));
-        };
-        DragZoomLayer.prototype.animationTime = function (animationTime) {
-            if (animationTime == null) {
-                return this._animationTime;
-            }
-            if (animationTime < 0) {
-                throw new Error('animationTime cannot be negative');
-            }
-            this._animationTime = animationTime;
-            return this;
-        };
-        /**
-         * Set the easing function, which determines how the zoom interpolates
-         * over time.
-         */
-        DragZoomLayer.prototype.ease = function (fn) {
-            if (typeof (fn) !== 'function') {
-                throw new Error('ease function must be a function');
-            }
-            if (fn(0) !== 0 || fn(1) !== 1) {
-                Plottable.Utils.Window.warn('Easing function does not maintain invariant ' +
-                    'f(0)==0 && f(1)==1. Bad behavior may result.');
-            }
-            this.easeFn = fn;
-            return this;
-        };
-        // Zoom into extent of the selection box bounds
-        DragZoomLayer.prototype.zoom = function () {
-            var x0 = this.xExtent()[0].valueOf();
-            var x1 = this.xExtent()[1].valueOf();
-            var y0 = this.yExtent()[1].valueOf();
-            var y1 = this.yExtent()[0].valueOf();
-            if (x0 === x1 || y0 === y1) {
-                return;
-            }
-            if (!this.isZoomed) {
-                this.isZoomed = true;
-            }
-            this.interpolateZoom(x0, x1, y0, y1);
-        };
-        // Restore the scales to their state before any zoom
-        DragZoomLayer.prototype.unzoom = function () {
-            if (!this.isZoomed) {
-                return;
-            }
-            this.isZoomed = false;
-            var xScale = this.xScale();
-            xScale._domainMin = null;
-            xScale._domainMax = null;
-            var xDomain = xScale._getExtent();
-            this.xScale().domain(xDomain);
-            this.unzoomMethod();
-        };
-        // If we are zooming, disable interactions, to avoid contention
-        DragZoomLayer.prototype.isZooming = function (isZooming) {
-            this._dragInteraction.enabled(!isZooming);
-            this._doubleClickInteraction.enabled(!isZooming);
-        };
-        DragZoomLayer.prototype.interpolateZoom = function (x0f, x1f, y0f, y1f) {
-            var _this = this;
-            var x0s = this.xScale().domain()[0].valueOf();
-            var x1s = this.xScale().domain()[1].valueOf();
-            var y0s = this.yScale().domain()[0].valueOf();
-            var y1s = this.yScale().domain()[1].valueOf();
-            // Copy a ref to the ease fn, so that changing ease wont affect zooms in
-            // progress.
-            var ease = this.easeFn;
-            var interpolator = function (a, b, p) {
-                return d3.interpolateNumber(a, b)(ease(p));
-            };
-            this.isZooming(true);
-            var start = Date.now();
-            var draw = function () {
-                var now = Date.now();
-                var passed = now - start;
-                var p = _this._animationTime === 0 ?
-                    1 :
-                    Math.min(1, passed / _this._animationTime);
-                var x0 = interpolator(x0s, x0f, p);
-                var x1 = interpolator(x1s, x1f, p);
-                var y0 = interpolator(y0s, y0f, p);
-                var y1 = interpolator(y1s, y1f, p);
-                _this.xScale().domain([x0, x1]);
-                _this.yScale().domain([y0, y1]);
-                if (p < 1) {
-                    Plottable.Utils.DOM.requestAnimationFramePolyfill(draw);
-                }
-                else {
-                    _this.onEnd();
-                    _this.isZooming(false);
-                }
-            };
-            draw();
-        };
-        return DragZoomLayer;
-    }(Plottable.Components.SelectionBoxLayer));
-    Plottable.DragZoomLayer = DragZoomLayer;
-})(Plottable || (Plottable = {}));
-</script>
-  <script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-var VZ;
-(function (VZ) {
-    var LineChart = (function () {
-        function LineChart(xType, yScaleType, colorScale, tooltip) {
-            this.seriesNames = [];
-            this.name2datasets = {};
-            this.colorScale = colorScale;
-            this.tooltip = tooltip;
-            this.datasets = [];
-            this._ignoreYOutliers = true;
-            // lastPointDataset is a dataset that contains just the last point of
-            // every dataset we're currently drawing.
-            this.lastPointsDataset = new Plottable.Dataset();
-            this.nanDataset = new Plottable.Dataset();
-            // need to do a single bind, so we can deregister the callback from
-            // old Plottable.Datasets. (Deregistration is done by identity checks.)
-            this.onDatasetChanged = this._onDatasetChanged.bind(this);
-            this.buildChart(xType, yScaleType);
-        }
-        LineChart.prototype.buildChart = function (xType, yScaleType) {
-            if (this.outer) {
-                this.outer.destroy();
-            }
-            var xComponents = VZ.ChartHelpers.getXComponents(xType);
-            this.xAccessor = xComponents.accessor;
-            this.xScale = xComponents.scale;
-            this.xAxis = xComponents.axis;
-            this.xAxis.margin(0).tickLabelPadding(3);
-            this.yScale = LineChart.getYScaleFromType(yScaleType);
-            this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
-            var yFormatter = VZ.ChartHelpers.multiscaleFormatter(VZ.ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
-            this.yAxis.margin(0).tickLabelPadding(5).formatter(yFormatter);
-            this.yAxis.usesTextWidthApproximation(true);
-            this.dzl = new Plottable.DragZoomLayer(this.xScale, this.yScale, this.updateSpecialDatasets.bind(this));
-            var center = this.buildPlot(this.xAccessor, this.xScale, this.yScale);
-            this.gridlines =
-                new Plottable.Components.Gridlines(this.xScale, this.yScale);
-            var xZeroLine = new Plottable.Components.GuideLineLayer('horizontal');
-            xZeroLine.scale(this.yScale).value(0);
-            var yZeroLine = new Plottable.Components.GuideLineLayer('vertical');
-            yZeroLine.scale(this.xScale).value(0);
-            this.center = new Plottable.Components.Group([this.gridlines, xZeroLine, yZeroLine, center, this.dzl]);
-            this.outer = new Plottable.Components.Table([
-                [this.yAxis, this.center],
-                [null, this.xAxis]
-            ]);
-        };
-        LineChart.prototype.buildPlot = function (xAccessor, xScale, yScale) {
-            var _this = this;
-            this.scalarAccessor = function (d) { return d.scalar; };
-            this.smoothedAccessor = function (d) { return d.smoothed; };
-            var linePlot = new Plottable.Plots.Line();
-            linePlot.x(xAccessor, xScale);
-            linePlot.y(this.scalarAccessor, yScale);
-            linePlot.attr('stroke', function (d, i, dataset) {
-                return _this.colorScale.scale(dataset.metadata().name);
-            });
-            this.linePlot = linePlot;
-            var group = this.setupTooltips(linePlot);
-            var smoothLinePlot = new Plottable.Plots.Line();
-            smoothLinePlot.x(xAccessor, xScale);
-            smoothLinePlot.y(this.smoothedAccessor, yScale);
-            smoothLinePlot.attr('stroke', function (d, i, dataset) {
-                return _this.colorScale.scale(dataset.metadata().name);
-            });
-            this.smoothLinePlot = smoothLinePlot;
-            // The scatterPlot will display the last point for each dataset.
-            // This way, if there is only one datum for the series, it is still
-            // visible. We hide it when tooltips are active to keep things clean.
-            var scatterPlot = new Plottable.Plots.Scatter();
-            scatterPlot.x(xAccessor, xScale);
-            scatterPlot.y(this.scalarAccessor, yScale);
-            scatterPlot.attr('fill', function (d) { return _this.colorScale.scale(d.name); });
-            scatterPlot.attr('opacity', 1);
-            scatterPlot.size(VZ.ChartHelpers.TOOLTIP_CIRCLE_SIZE * 2);
-            scatterPlot.datasets([this.lastPointsDataset]);
-            this.scatterPlot = scatterPlot;
-            var nanDisplay = new Plottable.Plots.Scatter();
-            nanDisplay.x(xAccessor, xScale);
-            nanDisplay.y(function (x) { return x.displayY; }, yScale);
-            nanDisplay.attr('fill', function (d) { return _this.colorScale.scale(d.name); });
-            nanDisplay.attr('opacity', 1);
-            nanDisplay.size(VZ.ChartHelpers.NAN_SYMBOL_SIZE * 2);
-            nanDisplay.datasets([this.nanDataset]);
-            nanDisplay.symbol(Plottable.SymbolFactories.triangleUp);
-            this.nanDisplay = nanDisplay;
-            return new Plottable.Components.Group([nanDisplay, scatterPlot, smoothLinePlot, group]);
-        };
-        /** Updates the chart when a dataset changes. Called every time the data of
-         * a dataset changes to update the charts.
-         */
-        LineChart.prototype._onDatasetChanged = function (dataset) {
-            if (this.smoothingEnabled) {
-                this.resmoothDataset(dataset);
-            }
-            this.updateSpecialDatasets();
-        };
-        LineChart.prototype.ignoreYOutliers = function (ignoreYOutliers) {
-            if (ignoreYOutliers !== this._ignoreYOutliers) {
-                this._ignoreYOutliers = ignoreYOutliers;
-                this.updateSpecialDatasets();
-            }
-        };
-        LineChart.prototype.updateSpecialDatasets = function () {
-            if (this.smoothingEnabled) {
-                this.updateSpecialDatasetsWithAccessor(this.smoothedAccessor);
-            }
-            else {
-                this.updateSpecialDatasetsWithAccessor(this.scalarAccessor);
-            }
-        };
-        /** Constructs special datasets. Each special dataset contains exceptional
-         * values from all of the regular datasets, e.g. last points in series, or
-         * NaN values. Those points will have a `name` and `relative` property added
-         * (since usually those are context in the surrounding dataset).
-         * The accessor will point to the correct data to access.
-         */
-        LineChart.prototype.updateSpecialDatasetsWithAccessor = function (accessor) {
-            var lastPointsData = this.datasets
-                .map(function (d) {
-                var datum = null;
-                // filter out NaNs to ensure last point is a clean one
-                var nonNanData = d.data().filter(function (x) { return !isNaN(accessor(x, -1, d)); });
-                if (nonNanData.length > 0) {
-                    var idx = nonNanData.length - 1;
-                    datum = nonNanData[idx];
-                    datum.name = d.metadata().name;
-                    datum.relative =
-                        VZ.ChartHelpers.relativeAccessor(datum, -1, d);
-                }
-                return datum;
-            })
-                .filter(function (x) { return x != null; });
-            this.lastPointsDataset.data(lastPointsData);
-            // Take a dataset, return an array of NaN data points
-            // the NaN points will have a "displayY" property which is the
-            // y-value of a nearby point that was not NaN (0 if all points are NaN)
-            var datasetToNaNData = function (d) {
-                var displayY = null;
-                var data = d.data();
-                var i = 0;
-                while (i < data.length && displayY == null) {
-                    if (!isNaN(accessor(data[i], -1, d))) {
-                        displayY = accessor(data[i], -1, d);
-                    }
-                    i++;
-                }
-                if (displayY == null) {
-                    displayY = 0;
-                }
-                var nanData = [];
-                for (i = 0; i < data.length; i++) {
-                    if (!isNaN(accessor(data[i], -1, d))) {
-                        displayY = accessor(data[i], -1, d);
-                    }
-                    else {
-                        data[i].name = d.metadata().name;
-                        data[i].displayY = displayY;
-                        data[i].relative = VZ.ChartHelpers.relativeAccessor(data[i], -1, d);
-                        nanData.push(data[i]);
-                    }
-                }
-                return nanData;
-            };
-            var nanData = _.flatten(this.datasets.map(datasetToNaNData));
-            this.nanDataset.data(nanData);
-            var datasetToValues = function (d) {
-                return d.data().map(function (x) { return accessor(x, -1, d); });
-            };
-            var vals = _.flatten(this.datasets.map(datasetToValues));
-            vals = vals.filter(function (x) { return x === x && x !== Infinity && x !== -Infinity; });
-            var domain = VZ.ChartHelpers.computeDomain(vals, this._ignoreYOutliers);
-            this.yScale.domain(domain);
-        };
-        LineChart.prototype.setupTooltips = function (plot) {
-            var _this = this;
-            var pi = new Plottable.Interactions.Pointer();
-            pi.attachTo(plot);
-            // PointsComponent is a Plottable Component that will hold the little
-            // circles we draw over the closest data points
-            var pointsComponent = new Plottable.Component();
-            var group = new Plottable.Components.Group([plot, pointsComponent]);
-            var hideTooltips = function () {
-                _this.tooltip.style('opacity', 0);
-                _this.scatterPlot.attr('opacity', 1);
-                pointsComponent.content().selectAll('.point').remove();
-            };
-            var enabled = true;
-            var disableTooltips = function () {
-                enabled = false;
-                hideTooltips();
-            };
-            var enableTooltips = function () { enabled = true; };
-            this.dzl.interactionStart(disableTooltips);
-            this.dzl.interactionEnd(enableTooltips);
-            pi.onPointerMove(function (p) {
-                if (!enabled) {
-                    return;
-                }
-                var target = {
-                    x: p.x,
-                    y: p.y,
-                    datum: null,
-                    dataset: null,
-                };
-                var bbox = _this.gridlines.content().node().getBBox();
-                // pts is the closets point to the tooltip for each dataset
-                var pts = plot.datasets()
-                    .map(function (dataset) { return _this.findClosestPoint(target, dataset); })
-                    .filter(function (x) { return x != null; });
-                var intersectsBBox = Plottable.Utils.DOM.intersectsBBox;
-                // We draw tooltips for points that are NaN, or are currently visible
-                var ptsForTooltips = pts.filter(function (p) { return intersectsBBox(p.x, p.y, bbox) || isNaN(p.datum.scalar); });
-                // Only draw little indicator circles for the non-NaN points
-                var ptsToCircle = ptsForTooltips.filter(function (p) { return !isNaN(p.datum.scalar); });
-                var ptsSelection = pointsComponent.content().selectAll('.point').data(ptsToCircle, function (p) { return p.dataset.metadata().name; });
-                if (pts.length !== 0) {
-                    ptsSelection.enter().append('circle').classed('point', true);
-                    ptsSelection.attr('r', VZ.ChartHelpers.TOOLTIP_CIRCLE_SIZE)
-                        .attr('cx', function (p) { return p.x; })
-                        .attr('cy', function (p) { return p.y; })
-                        .style('stroke', 'none')
-                        .attr('fill', function (p) { return _this.colorScale.scale(p.dataset.metadata().name); });
-                    ptsSelection.exit().remove();
-                    _this.drawTooltips(ptsForTooltips, target);
-                }
-                else {
-                    hideTooltips();
-                }
-            });
-            pi.onPointerExit(hideTooltips);
-            return group;
-        };
-        LineChart.prototype.drawTooltips = function (points, target) {
-            var _this = this;
-            // Formatters for value, step, and wall_time
-            this.scatterPlot.attr('opacity', 0);
-            var valueFormatter = VZ.ChartHelpers.multiscaleFormatter(VZ.ChartHelpers.Y_TOOLTIP_FORMATTER_PRECISION);
-            var dist = function (p) {
-                return Math.pow(p.x - target.x, 2) + Math.pow(p.y - target.y, 2);
-            };
-            var closestDist = _.min(points.map(dist));
-            var valueSortMethod = this.scalarAccessor;
-            if (this.smoothingEnabled) {
-                valueSortMethod = this.smoothedAccessor;
-            }
-            if (this.tooltipSortingMethod === 'ascending') {
-                points =
-                    _.sortBy(points, function (d) { return valueSortMethod(d.datum, -1, d.dataset); });
-            }
-            else if (this.tooltipSortingMethod === 'descending') {
-                points =
-                    _.sortBy(points, function (d) { return valueSortMethod(d.datum, -1, d.dataset); })
-                        .reverse();
-            }
-            else if (this.tooltipSortingMethod === 'nearest') {
-                points = _.sortBy(points, dist);
-            }
-            else {
-                // The 'default' sorting method maintains the order of names passed to
-                // setVisibleSeries(). However we reverse that order when defining the
-                // datasets. So we must call reverse again to restore the order.
-                points = points.slice(0).reverse();
-            }
-            var rows = this.tooltip.select('tbody')
-                .html('')
-                .selectAll('tr')
-                .data(points)
-                .enter()
-                .append('tr');
-            // Grey out the point if any of the following are true:
-            // - The cursor is outside of the x-extent of the dataset
-            // - The point's y value is NaN
-            rows.classed('distant', function (d) {
-                var firstPoint = d.dataset.data()[0];
-                var lastPoint = _.last(d.dataset.data());
-                var firstX = _this.xScale.scale(_this.xAccessor(firstPoint, 0, d.dataset));
-                var lastX = _this.xScale.scale(_this.xAccessor(lastPoint, 0, d.dataset));
-                var s = _this.smoothingEnabled ? d.datum.smoothed : d.datum.scalar;
-                return target.x < firstX || target.x > lastX || isNaN(s);
-            });
-            rows.classed('closest', function (p) { return dist(p) === closestDist; });
-            // It is a bit hacky that we are manually applying the width to the swatch
-            // and the nowrap property to the text here. The reason is as follows:
-            // the style gets updated asynchronously by Polymer scopeSubtree observer.
-            // Which means we would get incorrect sizing information since the text
-            // would wrap by default. However, we need correct measurements so that
-            // we can stop the text from falling off the edge of the screen.
-            // therefore, we apply the size-critical styles directly.
-            rows.style('white-space', 'nowrap');
-            rows.append('td')
-                .append('span')
-                .classed('swatch', true)
-                .style('background-color', function (d) { return _this.colorScale.scale(d.dataset.metadata().name); });
-            rows.append('td').text(function (d) { return d.dataset.metadata().name; });
-            if (this.smoothingEnabled) {
-                rows.append('td').text(function (d) { return isNaN(d.datum.smoothed) ? 'NaN' :
-                    valueFormatter(d.datum.smoothed); });
-            }
-            rows.append('td').text(function (d) {
-                return isNaN(d.datum.scalar) ? 'NaN' : valueFormatter(d.datum.scalar);
-            });
-            rows.append('td').text(function (d) { return VZ.ChartHelpers.stepFormatter(d.datum.step); });
-            rows.append('td').text(function (d) { return VZ.ChartHelpers.timeFormatter(d.datum.wall_time); });
-            rows.append('td').text(function (d) { return VZ.ChartHelpers.relativeFormatter(VZ.ChartHelpers.relativeAccessor(d.datum, -1, d.dataset)); });
-            // compute left position
-            var documentWidth = document.body.clientWidth;
-            var node = this.tooltip.node();
-            var parentRect = node.parentElement.getBoundingClientRect();
-            var nodeRect = node.getBoundingClientRect();
-            // prevent it from falling off the right side of the screen
-            var left = documentWidth - parentRect.left - nodeRect.width - 60, top = 0;
-            if (this.tooltipPosition === 'right') {
-                left = Math.min(parentRect.width, left);
-            }
-            else {
-                left = Math.min(0, left);
-                top = parentRect.height + VZ.ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET;
-            }
-            this.tooltip.style('transform', 'translate(' + left + 'px,' + top + 'px)');
-            this.tooltip.style('opacity', 1);
-        };
-        LineChart.prototype.findClosestPoint = function (target, dataset) {
-            var _this = this;
-            var points = dataset.data().map(function (d, i) {
-                var x = _this.xAccessor(d, i, dataset);
-                var y = _this.smoothingEnabled ? _this.smoothedAccessor(d, i, dataset) :
-                    _this.scalarAccessor(d, i, dataset);
-                return {
-                    x: _this.xScale.scale(x),
-                    y: _this.yScale.scale(y),
-                    datum: d,
-                    dataset: dataset,
-                };
-            });
-            var idx = _.sortedIndex(points, target, function (p) { return p.x; });
-            if (idx === points.length) {
-                return points[points.length - 1];
-            }
-            else if (idx === 0) {
-                return points[0];
-            }
-            else {
-                var prev = points[idx - 1];
-                var next = points[idx];
-                var prevDist = Math.abs(prev.x - target.x);
-                var nextDist = Math.abs(next.x - target.x);
-                return prevDist < nextDist ? prev : next;
-            }
-        };
-        LineChart.prototype.resmoothDataset = function (dataset) {
-            // When increasing the smoothing window, it smoothes a lot with the first
-            // few points and then starts to gradually smooth slower, so using an
-            // exponential function makes the slider more consistent. 1000^x has a
-            // range of [1, 1000], so subtracting 1 and dividing by 999 results in a
-            // range of [0, 1], which can be used as the percentage of the data, so
-            // that the kernel size can be specified as a percentage instead of a
-            // hardcoded number, what would be bad with multiple series.
-            var factor = (Math.pow(1000, this.smoothingWeight) - 1) / 999;
-            var data = dataset.data();
-            var kernelRadius = Math.floor(data.length * factor / 2);
-            data.forEach(function (d, i) {
-                var actualKernelRadius = Math.min(kernelRadius, i);
-                var start = i - actualKernelRadius;
-                var end = i + actualKernelRadius + 1;
-                if (end >= data.length) {
-                    // In the beginning, it's OK for the smoothing window to be small,
-                    // but this is not desirable towards the end. Rather than shrinking
-                    // the window, or extrapolating data to fill the gap, we're simply
-                    // not going to display the smoothed line towards the end.
-                    d.smoothed = Infinity;
-                }
-                else if (!_.isFinite(d.scalar)) {
-                    // Only smooth finite numbers.
-                    d.smoothed = d.scalar;
-                }
-                else {
-                    d.smoothed = d3.mean(data.slice(start, end).filter(function (d) { return _.isFinite(d.scalar); }), function (d) { return d.scalar; });
-                }
-            });
-        };
-        LineChart.prototype.getDataset = function (name) {
-            if (this.name2datasets[name] === undefined) {
-                this.name2datasets[name] = new Plottable.Dataset([], { name: name });
-            }
-            return this.name2datasets[name];
-        };
-        LineChart.getYScaleFromType = function (yScaleType) {
-            if (yScaleType === 'log') {
-                return new Plottable.Scales.ModifiedLog();
-            }
-            else if (yScaleType === 'linear') {
-                return new Plottable.Scales.Linear();
-            }
-            else {
-                throw new Error('Unrecognized yScale type ' + yScaleType);
-            }
-        };
-        /**
-         * Update the selected series on the chart.
-         */
-        LineChart.prototype.setVisibleSeries = function (names) {
-            var _this = this;
-            names = names.sort();
-            this.seriesNames = names;
-            names.reverse(); // draw first series on top
-            this.datasets.forEach(function (d) { return d.offUpdate(_this.onDatasetChanged); });
-            this.datasets = names.map(function (r) { return _this.getDataset(r); });
-            this.datasets.forEach(function (d) { return d.onUpdate(_this.onDatasetChanged); });
-            this.linePlot.datasets(this.datasets);
-            if (this.smoothingEnabled) {
-                this.smoothLinePlot.datasets(this.datasets);
-            }
-            this.updateSpecialDatasets();
-        };
-        /**
-         * Set the data of a series on the chart.
-         */
-        LineChart.prototype.setSeriesData = function (name, data) {
-            this.getDataset(name).data(data);
-        };
-        LineChart.prototype.smoothingUpdate = function (weight) {
-            var _this = this;
-            this.smoothingWeight = weight;
-            this.datasets.forEach(function (d) { return _this.resmoothDataset(d); });
-            if (!this.smoothingEnabled) {
-                this.linePlot.addClass('ghost');
-                this.scatterPlot.y(this.smoothedAccessor, this.yScale);
-                this.smoothingEnabled = true;
-                this.smoothLinePlot.datasets(this.datasets);
-            }
-            this.updateSpecialDatasetsWithAccessor(this.smoothedAccessor);
-        };
-        LineChart.prototype.smoothingDisable = function () {
-            if (this.smoothingEnabled) {
-                this.linePlot.removeClass('ghost');
-                this.scatterPlot.y(this.scalarAccessor, this.yScale);
-                this.smoothLinePlot.datasets([]);
-                this.smoothingEnabled = false;
-                this.updateSpecialDatasetsWithAccessor(this.scalarAccessor);
-            }
-        };
-        LineChart.prototype.setTooltipSortingMethod = function (method) {
-            this.tooltipSortingMethod = method;
-        };
-        LineChart.prototype.setTooltipPosition = function (position) {
-            this.tooltipPosition = position;
-        };
-        LineChart.prototype.renderTo = function (targetSVG) {
-            this.targetSVG = targetSVG;
-            this.setViewBox();
-            this.outer.renderTo(targetSVG);
-        };
-        /** There's an issue in Chrome where the svg overflow is a bit
-         * "flickery". There is a border on the gridlines on the extreme edge of the
-         * chart, which behaves inconsistently and causes the screendiffing tests to
-         * flake. We can solve this by creating 1px effective margin for the svg by
-         * setting the viewBox on the containing svg.
-         */
-        LineChart.prototype.setViewBox = function () {
-            // There's an issue in Firefox where if we measure with the old viewbox
-            // set, we get horrible results.
-            this.targetSVG.attr('viewBox', null);
-            var parent = this.targetSVG.node().parentNode;
-            var w = parent.clientWidth;
-            var h = parent.clientHeight;
-            this.targetSVG.attr({
-                'height': h,
-                'viewBox': "0 0 " + (w + 1) + " " + (h + 1),
-            });
-        };
-        LineChart.prototype.redraw = function () {
-            this.outer.redraw();
-            this.setViewBox();
-        };
-        LineChart.prototype.destroy = function () { this.outer.destroy(); };
-        return LineChart;
-    }());
-    VZ.LineChart = LineChart;
-})(VZ || (VZ = {}));
-</script>
-  <script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-var VZ;
-(function (VZ) {
-    var ChartHelpers;
-    (function (ChartHelpers) {
-        ChartHelpers.Y_TOOLTIP_FORMATTER_PRECISION = 4;
-        ChartHelpers.STEP_FORMATTER_PRECISION = 4;
-        ChartHelpers.Y_AXIS_FORMATTER_PRECISION = 3;
-        ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET = 20;
-        ChartHelpers.TOOLTIP_CIRCLE_SIZE = 4;
-        ChartHelpers.NAN_SYMBOL_SIZE = 6;
-        /* Create a formatter function that will switch between exponential and
-         * regular display depending on the scale of the number being formatted,
-         * and show `digits` significant digits.
-         */
-        function multiscaleFormatter(digits) {
-            return function (v) {
-                var absv = Math.abs(v);
-                if (absv < 1E-15) {
-                    // Sometimes zero-like values get an annoying representation
-                    absv = 0;
-                }
-                var f;
-                if (absv >= 1E4) {
-                    f = d3.format('.' + digits + 'e');
-                }
-                else if (absv > 0 && absv < 0.01) {
-                    f = d3.format('.' + digits + 'e');
-                }
-                else {
-                    f = d3.format('.' + digits + 'g');
-                }
-                return f(v);
-            };
-        }
-        ChartHelpers.multiscaleFormatter = multiscaleFormatter;
-        /* Compute an appropriate domain given an array of all the values that are
-         * going to be displayed. If ignoreOutliers is true, it will ignore the
-         * lowest 10% and highest 10% of the data when computing a domain.
-         * It has n log n performance when ignoreOutliers is true, as it needs to
-         * sort the data.
-         */
-        function computeDomain(values, ignoreOutliers) {
-            if (values.length === 0) {
-                return [-0.1, 1.1];
-            }
-            var a;
-            var b;
-            if (ignoreOutliers) {
-                var sorted = _.sortBy(values);
-                a = d3.quantile(sorted, 0.10);
-                b = d3.quantile(sorted, 0.90);
-            }
-            else {
-                a = d3.min(values);
-                b = d3.max(values);
-            }
-            // When the data all fits into the unit interval, we switch to a consistent
-            // domain for unit data. This is helpful for proportional parameters like
-            // error rates or % of queue that is full. This way, users can meaningfully
-            // compare charts and see information at a glance (if the value is always
-            // 1, it appears at top of the chart, 0 is bottom, etc.)
-            if (a >= 0 && b <= 1) {
-                return [-0.1, 1.1];
-            }
-            var padding = (b - a) * 0.20;
-            var domain = [a - padding, b + padding];
-            domain = d3.scale.linear().domain(domain).nice().domain();
-            return domain;
-        }
-        ChartHelpers.computeDomain = computeDomain;
-        function accessorize(key) {
-            return function (d, index, dataset) { return d[key]; };
-        }
-        ChartHelpers.accessorize = accessorize;
-        ChartHelpers.stepFormatter = Plottable.Formatters.siSuffix(ChartHelpers.STEP_FORMATTER_PRECISION);
-        function stepX() {
-            var scale = new Plottable.Scales.Linear();
-            var axis = new Plottable.Axes.Numeric(scale, 'bottom');
-            axis.formatter(ChartHelpers.stepFormatter);
-            return {
-                scale: scale,
-                axis: axis,
-                accessor: function (d) { return d.step; },
-            };
-        }
-        ChartHelpers.stepX = stepX;
-        ChartHelpers.timeFormatter = Plottable.Formatters.time('%a %b %e, %H:%M:%S');
-        function wallX() {
-            var scale = new Plottable.Scales.Time();
-            return {
-                scale: scale,
-                axis: new Plottable.Axes.Time(scale, 'bottom'),
-                accessor: function (d) { return d.wall_time; },
-            };
-        }
-        ChartHelpers.wallX = wallX;
-        ChartHelpers.relativeAccessor = function (d, index, dataset) {
-            // We may be rendering the final-point datum for scatterplot.
-            // If so, we will have already provided the 'relative' property
-            if (d.relative != null) {
-                return d.relative;
-            }
-            var data = dataset.data();
-            // I can't imagine how this function would be called when the data is
-            // empty (after all, it iterates over the data), but lets guard just
-            // to be safe.
-            var first = data.length > 0 ? +data[0].wall_time : 0;
-            return (+d.wall_time - first) / (60 * 60 * 1000); // ms to hours
-        };
-        ChartHelpers.relativeFormatter = function (n) {
-            // we will always show 2 units of precision, e.g days and hours, or
-            // minutes and seconds, but not hours and minutes and seconds
-            var ret = '';
-            var days = Math.floor(n / 24);
-            n -= (days * 24);
-            if (days) {
-                ret += days + 'd ';
-            }
-            var hours = Math.floor(n);
-            n -= hours;
-            n *= 60;
-            if (hours || days) {
-                ret += hours + 'h ';
-            }
-            var minutes = Math.floor(n);
-            n -= minutes;
-            n *= 60;
-            if (minutes || hours || days) {
-                ret += minutes + 'm ';
-            }
-            var seconds = Math.floor(n);
-            return ret + seconds + 's';
-        };
-        function relativeX() {
-            var scale = new Plottable.Scales.Linear();
-            return {
-                scale: scale,
-                axis: new Plottable.Axes.Numeric(scale, 'bottom'),
-                accessor: ChartHelpers.relativeAccessor,
-            };
-        }
-        ChartHelpers.relativeX = relativeX;
-        // a very literal definition of NaN: true for NaN for a non-number type
-        // or null, etc. False for Infinity or -Infinity
-        ChartHelpers.isNaN = function (x) { return +x !== x; };
-        function getXComponents(xType) {
-            switch (xType) {
-                case 'step':
-                    return stepX();
-                case 'wall_time':
-                    return wallX();
-                case 'relative':
-                    return relativeX();
-                default:
-                    throw new Error('invalid xType: ' + xType);
-            }
-        }
-        ChartHelpers.getXComponents = getXComponents;
-    })(ChartHelpers = VZ.ChartHelpers || (VZ.ChartHelpers = {}));
-})(VZ || (VZ = {}));
-</script>
-  <script>
-    Polymer({
-      is: "vz-line-chart",
-      properties: {
-        /**
-         * Scale that maps series names to colors. The default colors are from
-         * d3.scale.category10() scale. Use this property to replace the default
-         * line colors with colors of your own choice.
-         * @type {Plottable.Scales.Color}
-         * @required
-         */
-        colorScale: {
-          type: Object,
-          value: function() {
-            return new Plottable.Scales.Color()
-                .range(d3.scale.category10().range());
-          }
-        },
-
-        /**
-         * Whether smoothing is enabled or not. If true, smoothed lines will be
-         * plotted in the chart while the unsmoothed lines will be ghosted in
-         * the background.
-         *
-         * The smoothing algorithm is a simple moving average, which, given a
-         * point p and a window w, replaces p with a simple average of the
-         * points in the [p - floor(w/2), p + floor(w/2)] range.  If there
-         * aren't enough points to cover the entire window to the left, the
-         * window is reduced to fit exactly the amount of elements available.
-         * This means that the smoothed line will be less in and gradually
-         * become more smooth until the desired window is reached. However when
-         * there aren't enough points on the right, the line stops being
-         * rendered at all.
-         */
-        smoothingEnabled: {
-          type: Boolean,
-          value: false
-        },
-
-        /**
-         * Weight (between 0.0 and 1.0) of the smoothing. This weight controls
-         * the window size, and a weight of 1.0 means using 50% of the entire
-         * dataset as the window, while a weight of 0.0 means using a window of
-         * 0 (and thus replacing each point with themselves).
-         *
-         * The growth between 0.0 and 1.0 is not linear though. Because
-         * changing the window from 0% to 30% of the dataset smooths the line a
-         * lot more than changing the window from 70% to 100%, an exponential
-         * function is used instead: http://i.imgur.com/bDrhEZU.png. This
-         * function increases the size of the window slowly at the beginning
-         * and gradually speeds up the growth, but 0.0 still means a window of
-         * 0 and 1.0 still means a window of the dataset's length.
-         */
-        smoothingWeight: {
-          type: Number,
-          value: 0.6
-        },
-
-        /**
-         * The way to display the X values. Allows:
-         * - "step" - Linear scale using the  "step" property of the datum.
-         * - "wall_time" - Temporal scale using the "wall_time" property of the
-         * datum.
-         * - "relative" - Temporal scale using the "relative" property of the
-         * datum if it is present or calculating from "wall_time" if it isn't.
-         */
-        xType: {
-          type: String,
-          value: 'step'
-        },
-
-        /**
-         * The scale for the y-axis. Allows:
-         * - "linear" - linear scale (Plottable.Scales.Linear)
-         * - "log" - modified-log scale (Plottable.Scales.ModifiedLog)
-         */
-        yScaleType: {
-          type: String,
-          value: 'linear'
-        },
-
-        /**
-         * Whether to ignore outlier data when computing the yScale domain.
-         */
-
-        ignoreYOutliers: {
-          type: Boolean,
-          value: true,
-        },
-
-        /**
-         * Change how the tooltip is sorted. Allows:
-         * - "default" - Sort the tooltip by input order.
-         * - "ascending" - Sort the tooltip by ascending value.
-         * - "descending" - Sort the tooltip by descending value.
-         * - "nearest" - Sort the tooltip by closest to cursor.
-         */
-        tooltipSortingMethod: {
-          type: String,
-          value: 'default'
-        },
-
-        /**
-         * Change how the tooltip is positioned. Allows:
-         * - "bottom" - Position the tooltip on the bottom of the chart.
-         * - "right" - Position the tooltip to the right of the chart.
-         */
-        tooltipPosition: {
-          type: String,
-          value: 'bottom'
-        },
-
-        _attached: Boolean,
-        _chart: Object,
-        _visibleSeriesCache: {
-          type: Array,
-          value: function() { return [] }
-        },
-        _seriesDataCache: {
-          type: Object,
-          value: function() { return {} }
-        },
-        _makeChartAsyncCallbackId: {
-          type: Number,
-          value: null
-        }
-      },
-      observers: [
-        "_makeChart(xType, yScaleType, colorScale, _attached)",
-        "_reloadFromCache(_chart)",
-        "_smoothingChanged(smoothingEnabled, smoothingWeight, _chart)",
-        "_tooltipSortingMethodChanged(tooltipSortingMethod, _chart)",
-        "_tooltipPositionChanged(tooltipPosition, _chart)",
-        "_outliersChanged(ignoreYOutliers, _chart)"
-      ],
-
-      /**
-       * Sets the series that the chart displays. Series with other names will
-       * not be displayed.
-       *
-       * @param {String[]} names Array with the names of the series to
-       * display.
-       */
-      setVisibleSeries: function(names) {
-        this._visibleSeriesCache = names;
-        if (this._chart) {
-          this._chart.setVisibleSeries(names);
-          this.redraw();
-        }
-      },
-
-      /**
-       * Sets the data of one of the series. Note that to display this series
-       * its name must be in the setVisibleSeries() array.
-       *
-       * @param {String} name Name of the series.
-       * @param {VZ.ChartHelpers.ScalarDatum[]} data Data of the series. This is
-       * an array of objects with at least the following properties:
-       * - step: (Number) - index of the datum.
-       * - wall_time: (Date) - Date object with the datum's time.
-       * - scalar: (Number) - Value of the datum.
-       */
-      setSeriesData: function(name, data) {
-        this._seriesDataCache[name] = data;
-        if (this._chart) {
-          this._chart.setSeriesData(name, data);
-        }
-      },
-
-      /**
-       * Re-renders the chart. Useful if e.g. the container size changed.
-       */
-      redraw: function() {
-        this._chart.redraw();
-      },
-      attached: function() {
-        this._attached = true;
-      },
-      detached: function() {
-        this._attached = false;
-      },
-      ready: function() {
-        this.scopeSubtree(this.$.tooltip, true);
-        this.scopeSubtree(this.$.chartsvg, true);
-      },
-      _makeChart: function(xType, yScaleType, colorScale, _attached) {
-        if (this._makeChartAsyncCallbackId !== null) {
-          this.cancelAsync(this._makeChartAsyncCallbackId);
-          this._makeChartAsyncCallbackId = null;
-        }
-
-        this._makeChartAsyncCallbackId = this.async(function() {
-          this._makeChartAsyncCallbackId = null;
-          if (!this._attached) return;
-          if (this._chart) this._chart.destroy();
-          var tooltip = d3.select(this.$.tooltip);
-          var chart = new VZ.LineChart(xType, yScaleType, colorScale, tooltip);
-          var svg = d3.select(this.$.chartsvg);
-          chart.renderTo(svg);
-          this._chart = chart;
-        }, 350);
-      },
-      _reloadFromCache: function() {
-        if(this._chart) {
-          this._chart.setVisibleSeries(this._visibleSeriesCache);
-          this._visibleSeriesCache.forEach(function(name) {
-            this._chart.setSeriesData(name, this._seriesDataCache[name] || []);
-          }.bind(this));
-        }
-      },
-      _smoothingChanged: function() {
-        if(!this._chart) {
-          return;
-        }
-        if(this.smoothingEnabled) {
-          this._chart.smoothingUpdate(this.smoothingWeight);
-        }
-        else {
-          this._chart.smoothingDisable();
-        }
-      },
-      _outliersChanged: function() {
-        if (!this._chart) {
-          return;
-        }
-        this._chart.ignoreYOutliers(this.ignoreYOutliers);
-      },
-      _tooltipSortingMethodChanged: function() {
-        if(this._chart) {
-          this._chart.setTooltipSortingMethod(this.tooltipSortingMethod);
-        }
-      },
-      _tooltipPositionChanged: function() {
-        if (this._chart) {
-          this._chart.setTooltipPosition(this.tooltipPosition);
-        }
-      }
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-scalar-dashboard" assetpath="../tf-scalar-dashboard/">
-  <template>
-    <div id="plumbing">
-      <tf-color-scale id="colorScale" runs="[[runs]]" out-color-scale="{{_colorScale}}"></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <tf-sidebar-helper backend="[[backend]]" categories="{{_categories}}" color-scale="[[_colorScale]]" run2tag="[[run2tag]]" runs="[[runs]]" selected-runs="{{_selectedRuns}}">
-          <div class="extend-first-section">
-            <div class="line-item">
-              <paper-checkbox id="download-option" checked="{{_showDownloadLinks}}">Show data download links</paper-checkbox>
-            </div>
-              <div class="line-item">
-                <paper-checkbox id="outliersCheckbox" checked="{{_ignoreYOutliers}}">Ignore outliers in chart scaling</paper-checkbox>
-              </div>
-            <div id="tooltip-sorting">
-              <div id="tooltip-sorting-label">Tooltip sorting method:</div>
-              <paper-dropdown-menu no-label-float="" selected-item-label="{{_tooltipSortingMethod}}">
-                <paper-menu class="dropdown-content" selected="0">
-                  <paper-item>default</paper-item>
-                  <paper-item>descending</paper-item>
-                  <paper-item>ascending</paper-item>
-                  <paper-item>nearest</paper-item>
-                </paper-menu>
-              </paper-dropdown-menu>
-            </div>
-          </div>
-          <div class="sidebar-section">
-            <tf-smoothing-input weight="{{_smoothingWeight}}" step="0.001" min="0" max="1"></tf-smoothing-input>
-          </div>
-          <div class="sidebar-section">
-            <tf-option-selector id="xTypeSelector" name="Horizontal Axis" selected-id="{{_xType}}">
-              <paper-button id="step">step</paper-button>
-              <paper-button id="relative">relative</paper-button>
-              <paper-button id="wall_time">wall</paper-button>
-            </tf-option-selector>
-          </div>
-        </tf-sidebar-helper>
-      </div>
-      <div class="center">
-        <tf-panes-helper categories="[[_categories]]" color-scale="[[_colorScale]]" data-type="[[dataType]]" data-provider="[[dataProvider]]" run2tag="[[run2tag]]" selected-runs="[[_selectedRuns]]" show-download-links="[[_showDownloadLinks]]" download-link-url-function="[[scalarUrl]]">
-          <template>
-            <vz-line-chart x-type="[[_xType]]" color-scale="[[_colorScale]]" smoothing-enabled="[[_smoothingEnabled]]" smoothing-weight="[[_smoothingWeight]]" tooltip-sorting-method="[[_tooltipSortingMethod]]" ignore-y-outliers="[[_ignoreYOutliers]]"></vz-line-chart>
-            <paper-icon-button class="log-button" icon="line-weight" on-tap="toggleLogScale" title="Toggle y-axis log scale"></paper-icon-button>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-
-    <style include="dashboard-style"></style>
-    <style>
-      .log-button {
-        position: absolute;
-        left: 35px;
-        bottom: -35px;
-        color: #2196F3;
-        background: #fff;
-        width: 32px;
-        height: 32px;
-        padding: 4px;
-        border-radius: 100%;
-      }
-
-      .log-button-selected {
-        background: var(--tb-ui-light-accent);
-      }
-
-      #tooltip-sorting {
-        display: flex;
-        font-size: 14px;
-        margin-top: 5px;
-      }
-
-      #tooltip-sorting-label {
-        margin-top: 13px;
-        margin-left: 28px;
-      }
-
-      #tooltip-sorting paper-dropdown-menu {
-        margin-left: 10px;
-        --paper-input-container-focus-color: var(--tb-orange-strong);
-        width: 105px;
-      }
-      .line-item {
-        display: block;
-        padding-top: 5px;
-      }
-    </style>
-
-  </template>
-
-  <script>
-    Polymer({
-      is: "tf-scalar-dashboard",
-      behaviors: [
-        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-        TF.Backend.Behavior,
-      ],
-      properties: {
-        backend: Object,
-        dataType: {
-          type: String,
-          value: "scalar"
-        },
-        router: Object,
-        scalarUrl: {
-          type: Function,
-          computed: "_getScalarUrl(router)"
-        },
-        _showDownloadLinks: {
-          type: Boolean,
-          notify: true,
-          value: TF.URIStorage.getBooleanInitializer('_showDownloadLinks',
-              false),
-          observer: '_showDownloadLinksObserver'
-        },
-        _smoothingWeight: {
-          type: Number,
-          notify: true,
-          value: TF.URIStorage.getNumberInitializer('_smoothingWeight', 0.6),
-          observer: '_smoothingWeightObserver'
-        },
-        _smoothingEnabled: {
-          type: Boolean,
-          computed: '_computeSmoothingEnabled(_smoothingWeight)'
-        },
-        _ignoreYOutliers: {
-          type: Boolean,
-          value: true,
-        },
-        _xType: {
-          type: String,
-          value: "step"
-        }
-      },
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-      _getScalarUrl: function() {
-        return this.router.scalars;
-      },
-      _showDownloadLinksObserver: TF.URIStorage.getBooleanObserver(
-          '_showDownloadLinks', false),
-      _smoothingWeightObserver: TF.URIStorage.getNumberObserver(
-          '_smoothingWeight', 0.6),
-      _computeSmoothingEnabled: function(_smoothingWeight) {
-        return _smoothingWeight > 0;
-      },
-      toggleLogScale: function(e) {
-        var currentTarget = Polymer.dom(e.currentTarget);
-        var button = currentTarget.parentNode.querySelector('.log-button');
-        var chart = currentTarget.parentNode.querySelector('vz-line-chart');
-
-        button.classList.toggle("log-button-selected");
-        chart.yScaleType = chart.yScaleType === 'log' ? 'linear' : 'log';
-        chart.redraw();
-      },
-    });
-  </script>
-</dom-module>
-<dom-module id="vz-distribution-chart" assetpath="../vz-distribution-chart/">
-  <template>
-    <svg id="chartsvg"></svg>
-    <style>
-      :host {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        display: flex;
-        flex-direction: column;
-        flex-grow: 1;
-        flex-shrink: 1;
-        position: relative;
-      }
-      svg {
-        -webkit-user-select: none;
-        -moz-user-select: none;
-        flex-grow: 1;
-        flex-shrink: 1;
-      }
-
-    </style>
-  </template>
-  <script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-var VZ;
-(function (VZ) {
-    var DistributionChart = (function () {
-        function DistributionChart(xType, colorScale) {
-            this.run2datasets = {};
-            this.colorScale = colorScale;
-            this.buildChart(xType);
-        }
-        DistributionChart.prototype.getDataset = function (run) {
-            if (this.run2datasets[run] === undefined) {
-                this.run2datasets[run] = new Plottable.Dataset([], { run: run });
-            }
-            return this.run2datasets[run];
-        };
-        DistributionChart.prototype.buildChart = function (xType) {
-            if (this.outer) {
-                this.outer.destroy();
-            }
-            var xComponents = VZ.ChartHelpers.getXComponents(xType);
-            this.xAccessor = xComponents.accessor;
-            this.xScale = xComponents.scale;
-            this.xAxis = xComponents.axis;
-            this.xAxis.margin(0).tickLabelPadding(3);
-            this.yScale = new Plottable.Scales.Linear();
-            this.yAxis = new Plottable.Axes.Numeric(this.yScale, 'left');
-            var yFormatter = VZ.ChartHelpers.multiscaleFormatter(VZ.ChartHelpers.Y_AXIS_FORMATTER_PRECISION);
-            this.yAxis.margin(0).tickLabelPadding(5).formatter(yFormatter);
-            this.yAxis.usesTextWidthApproximation(true);
-            var center = this.buildPlot(this.xAccessor, this.xScale, this.yScale);
-            this.gridlines =
-                new Plottable.Components.Gridlines(this.xScale, this.yScale);
-            this.center = new Plottable.Components.Group([this.gridlines, center]);
-            this.outer = new Plottable.Components.Table([[this.yAxis, this.center], [null, this.xAxis]]);
-        };
-        DistributionChart.prototype.buildPlot = function (xAccessor, xScale, yScale) {
-            var _this = this;
-            var percents = [0, 228, 1587, 3085, 5000, 6915, 8413, 9772, 10000];
-            var opacities = _.range(percents.length - 1)
-                .map(function (i) { return (percents[i + 1] - percents[i]) / 2500; });
-            var accessors = percents.map(function (p, i) { return function (datum) { return datum[i][1]; }; });
-            var median = 4;
-            var medianAccessor = accessors[median];
-            var plots = _.range(accessors.length - 1).map(function (i) {
-                var p = new Plottable.Plots.Area();
-                p.x(xAccessor, xScale);
-                var y0 = i > median ? accessors[i] : accessors[i + 1];
-                var y = i > median ? accessors[i + 1] : accessors[i];
-                p.y(y, yScale);
-                p.y0(y0);
-                p.attr('fill', function (d, i, dataset) {
-                    return _this.colorScale.scale(dataset.metadata().run);
-                });
-                p.attr('stroke', function (d, i, dataset) {
-                    return _this.colorScale.scale(dataset.metadata().run);
-                });
-                p.attr('stroke-weight', function (d, i, m) { return '0.5px'; });
-                p.attr('stroke-opacity', function () { return opacities[i]; });
-                p.attr('fill-opacity', function () { return opacities[i]; });
-                return p;
-            });
-            var medianPlot = new Plottable.Plots.Line();
-            medianPlot.x(xAccessor, xScale);
-            medianPlot.y(medianAccessor, yScale);
-            medianPlot.attr('stroke', function (d, i, m) { return _this.colorScale.scale(m.run); });
-            this.plots = plots;
-            return new Plottable.Components.Group(plots);
-        };
-        DistributionChart.prototype.setVisibleSeries = function (runs) {
-            var _this = this;
-            this.runs = runs;
-            var datasets = runs.map(function (r) { return _this.getDataset(r); });
-            this.plots.forEach(function (p) { return p.datasets(datasets); });
-        };
-        /**
-         * Set the data of a series on the chart.
-         */
-        DistributionChart.prototype.setSeriesData = function (name, data) {
-            this.getDataset(name).data(data);
-        };
-        DistributionChart.prototype.renderTo = function (targetSVG) {
-            this.targetSVG = targetSVG;
-            this.setViewBox();
-            this.outer.renderTo(targetSVG);
-        };
-        /** There's an issue in Chrome where the svg overflow is a bit
-         * "flickery". There is a border on the gridlines on the extreme edge of the
-         * chart, which behaves inconsistently and causes the screendiffing tests to
-         * flake. We can solve this by creating 1px effective margin for the svg by
-         * setting the viewBox on the containing svg.
-         */
-        DistributionChart.prototype.setViewBox = function () {
-            // There's an issue in Firefox where if we measure with the old viewbox
-            // set, we get horrible results.
-            this.targetSVG.attr('viewBox', null);
-            var parent = this.targetSVG.node().parentNode;
-            var w = parent.clientWidth;
-            var h = parent.clientHeight;
-            this.targetSVG.attr({
-                'height': h,
-                'viewBox': "0 0 " + (w + 1) + " " + (h + 1),
-            });
-        };
-        DistributionChart.prototype.redraw = function () {
-            this.outer.redraw();
-            this.setViewBox();
-        };
-        DistributionChart.prototype.destroy = function () { this.outer.destroy(); };
-        return DistributionChart;
-    }());
-    VZ.DistributionChart = DistributionChart;
-})(VZ || (VZ = {}));
-</script>
-  <script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/* tslint:disable:no-namespace variable-name */
-var VZ;
-(function (VZ) {
-    var ChartHelpers;
-    (function (ChartHelpers) {
-        ChartHelpers.Y_TOOLTIP_FORMATTER_PRECISION = 4;
-        ChartHelpers.STEP_FORMATTER_PRECISION = 4;
-        ChartHelpers.Y_AXIS_FORMATTER_PRECISION = 3;
-        ChartHelpers.TOOLTIP_Y_PIXEL_OFFSET = 20;
-        ChartHelpers.TOOLTIP_CIRCLE_SIZE = 4;
-        ChartHelpers.NAN_SYMBOL_SIZE = 6;
-        /* Create a formatter function that will switch between exponential and
-         * regular display depending on the scale of the number being formatted,
-         * and show `digits` significant digits.
-         */
-        function multiscaleFormatter(digits) {
-            return function (v) {
-                var absv = Math.abs(v);
-                if (absv < 1E-15) {
-                    // Sometimes zero-like values get an annoying representation
-                    absv = 0;
-                }
-                var f;
-                if (absv >= 1E4) {
-                    f = d3.format('.' + digits + 'e');
-                }
-                else if (absv > 0 && absv < 0.01) {
-                    f = d3.format('.' + digits + 'e');
-                }
-                else {
-                    f = d3.format('.' + digits + 'g');
-                }
-                return f(v);
-            };
-        }
-        ChartHelpers.multiscaleFormatter = multiscaleFormatter;
-        /* Compute an appropriate domain given an array of all the values that are
-         * going to be displayed. If ignoreOutliers is true, it will ignore the
-         * lowest 10% and highest 10% of the data when computing a domain.
-         * It has n log n performance when ignoreOutliers is true, as it needs to
-         * sort the data.
-         */
-        function computeDomain(values, ignoreOutliers) {
-            if (values.length === 0) {
-                return [-0.1, 1.1];
-            }
-            var a;
-            var b;
-            if (ignoreOutliers) {
-                var sorted = _.sortBy(values);
-                a = d3.quantile(sorted, 0.10);
-                b = d3.quantile(sorted, 0.90);
-            }
-            else {
-                a = d3.min(values);
-                b = d3.max(values);
-            }
-            // When the data all fits into the unit interval, we switch to a consistent
-            // domain for unit data. This is helpful for proportional parameters like
-            // error rates or % of queue that is full. This way, users can meaningfully
-            // compare charts and see information at a glance (if the value is always
-            // 1, it appears at top of the chart, 0 is bottom, etc.)
-            if (a >= 0 && b <= 1) {
-                return [-0.1, 1.1];
-            }
-            var padding = (b - a) * 0.20;
-            var domain = [a - padding, b + padding];
-            domain = d3.scale.linear().domain(domain).nice().domain();
-            return domain;
-        }
-        ChartHelpers.computeDomain = computeDomain;
-        function accessorize(key) {
-            return function (d, index, dataset) { return d[key]; };
-        }
-        ChartHelpers.accessorize = accessorize;
-        ChartHelpers.stepFormatter = Plottable.Formatters.siSuffix(ChartHelpers.STEP_FORMATTER_PRECISION);
-        function stepX() {
-            var scale = new Plottable.Scales.Linear();
-            var axis = new Plottable.Axes.Numeric(scale, 'bottom');
-            axis.formatter(ChartHelpers.stepFormatter);
-            return {
-                scale: scale,
-                axis: axis,
-                accessor: function (d) { return d.step; },
-            };
-        }
-        ChartHelpers.stepX = stepX;
-        ChartHelpers.timeFormatter = Plottable.Formatters.time('%a %b %e, %H:%M:%S');
-        function wallX() {
-            var scale = new Plottable.Scales.Time();
-            return {
-                scale: scale,
-                axis: new Plottable.Axes.Time(scale, 'bottom'),
-                accessor: function (d) { return d.wall_time; },
-            };
-        }
-        ChartHelpers.wallX = wallX;
-        ChartHelpers.relativeAccessor = function (d, index, dataset) {
-            // We may be rendering the final-point datum for scatterplot.
-            // If so, we will have already provided the 'relative' property
-            if (d.relative != null) {
-                return d.relative;
-            }
-            var data = dataset.data();
-            // I can't imagine how this function would be called when the data is
-            // empty (after all, it iterates over the data), but lets guard just
-            // to be safe.
-            var first = data.length > 0 ? +data[0].wall_time : 0;
-            return (+d.wall_time - first) / (60 * 60 * 1000); // ms to hours
-        };
-        ChartHelpers.relativeFormatter = function (n) {
-            // we will always show 2 units of precision, e.g days and hours, or
-            // minutes and seconds, but not hours and minutes and seconds
-            var ret = '';
-            var days = Math.floor(n / 24);
-            n -= (days * 24);
-            if (days) {
-                ret += days + 'd ';
-            }
-            var hours = Math.floor(n);
-            n -= hours;
-            n *= 60;
-            if (hours || days) {
-                ret += hours + 'h ';
-            }
-            var minutes = Math.floor(n);
-            n -= minutes;
-            n *= 60;
-            if (minutes || hours || days) {
-                ret += minutes + 'm ';
-            }
-            var seconds = Math.floor(n);
-            return ret + seconds + 's';
-        };
-        function relativeX() {
-            var scale = new Plottable.Scales.Linear();
-            return {
-                scale: scale,
-                axis: new Plottable.Axes.Numeric(scale, 'bottom'),
-                accessor: ChartHelpers.relativeAccessor,
-            };
-        }
-        ChartHelpers.relativeX = relativeX;
-        // a very literal definition of NaN: true for NaN for a non-number type
-        // or null, etc. False for Infinity or -Infinity
-        ChartHelpers.isNaN = function (x) { return +x !== x; };
-        function getXComponents(xType) {
-            switch (xType) {
-                case 'step':
-                    return stepX();
-                case 'wall_time':
-                    return wallX();
-                case 'relative':
-                    return relativeX();
-                default:
-                    throw new Error('invalid xType: ' + xType);
-            }
-        }
-        ChartHelpers.getXComponents = getXComponents;
-    })(ChartHelpers = VZ.ChartHelpers || (VZ.ChartHelpers = {}));
-})(VZ || (VZ = {}));
-</script>
-  <script>
-    Polymer({
-      is: "vz-distribution-chart",
-      properties: {
-        /**
-         * Scale that maps series names to colors. The default colors are from
-         * d3.scale.category10() scale. Use this property to replace the default
-         * line colors with colors of your own choice.
-         * @type {Plottable.Scales.Color}
-         * @required
-         */
-        colorScale: {
-          type: Object,
-          value: function() {
-            return new Plottable.Scales.Color()
-                .range(d3.scale.category10().range());
-          }
-        },
-        /**
-         * The way to display the X values. Allows:
-         * - "step" - Linear scale using the  "step" property of the datum.
-         * - "wall_time" - Temporal scale using the "wall_time" property of the
-         * datum.
-         * - "relative" - Temporal scale using the "relative" property of the
-         * datum if it is present or calculating from "wall_time" if it isn't.
-         */
-        xType: {
-          type: String,
-          value: 'step'
-        },
-        _attached: Boolean,
-        _chart: Object,
-        _visibleSeriesCache: {
-          type: Array,
-          value: function() { return [] }
-        },
-        _seriesDataCache: {
-          type: Object,
-          value: function() { return {} }
-        },
-        _makeChartAsyncCallbackId: { type: Number, value: null }
-      },
-      observers: [
-        "_makeChart(xType, colorScale, _attached)",
-        "_reloadFromCache(_chart)",
-      ],
-      setVisibleSeries: function(names) {
-        this._visibleSeriesCache = names;
-        if (this._chart) {
-          this._chart.setVisibleSeries(names);
-          this.redraw();
-        }
-      },
-      setSeriesData: function(name, data) {
-        this._seriesDataCache[name] = data;
-        if (this._chart) {
-          this._chart.setSeriesData(name, data);
-        }
-      },
-      redraw: function() {
-        this._chart.redraw();
-      },
-      ready: function() {
-        this.scopeSubtree(this.$.chartsvg, true);
-      },
-      _makeChart: function(xType, colorScale, _attached) {
-        if (this._makeChartAsyncCallbackId === null) {
-          this.cancelAsync(this._makeChartAsyncCallbackId);
-        }
-
-        this._makeChartAsyncCallbackId = this.async(function() {
-          this._makeChartAsyncCallbackId = null;
-          if (!_attached) return;
-          if (this._chart) this._chart.destroy();
-          var chart = new VZ.DistributionChart(xType, colorScale);
-          var svg = d3.select(this.$.chartsvg);
-          chart.renderTo(svg);
-          this._chart = chart;
-        }, 350);
-      },
-      _reloadFromCache: function() {
-        if(this._chart) {
-          this._chart.setVisibleSeries(this._visibleSeriesCache);
-          this._visibleSeriesCache.forEach(function(name) {
-            this._chart.setSeriesData(name, this._seriesDataCache[name] || []);
-          }.bind(this));
-        }
-      },
-      attached: function() {
-        this._attached = true;
-      },
-      detached: function() {
-        this._attached = false;
-      }
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-distribution-dashboard" assetpath="../tf-distribution-dashboard/">
-  <template>
-    <div id="plumbing">
-      <tf-color-scale id="colorScale" runs="[[runs]]" out-color-scale="{{_colorScale}}"></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <tf-sidebar-helper backend="[[backend]]" categories="{{_categories}}" color-scale="[[_colorScale]]" run2tag="[[run2tag]]" runs="[[runs]]" selected-runs="{{_selectedRuns}}">
-        <div class="sidebar-section">
-          <tf-option-selector id="xTypeSelector" name="Horizontal Axis" selected-id="{{_xType}}">
-            <paper-button id="step">step</paper-button>
-            <paper-button id="relative">relative</paper-button>
-            <paper-button id="wall_time">wall</paper-button>
-          </tf-option-selector>
-        </div>
-        </tf-sidebar-helper>
-      </div>
-
-      <div class="center">
-        <tf-panes-helper categories="[[_categories]]" color-scale="[[_colorScale]]" data-type="[[dataType]]" data-provider="[[dataProvider]]" run2tag="[[run2tag]]" selected-runs="[[_selectedRuns]]" repeat-for-runs="">
-          <template>
-            <vz-distribution-chart x-type="[[_xType]]" color-scale="[[_colorScale]]"></vz-distribution-chart>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-
-    <style include="dashboard-style"></style>
-  </template>
-
-  <script>
-    Polymer({
-      is: "tf-distribution-dashboard",
-      behaviors: [
-        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-        TF.Backend.Behavior,
-      ],
-      properties: {
-        backend: Object,
-        _xType: {
-          type: String,
-          value: "step"
-        },
-        dataType: {value: "compressedHistogram"},
-      },
-    });
-  </script>
-</dom-module>
-
-<dom-module id="vz-histogram-timeseries" assetpath="../vz-histogram-timeseries/">
-    <template>
-      <div id="tooltip"><span></span></div>
-      <svg id="svg">
-        <g>
-          <g class="axis x"></g>
-          <g class="axis y"></g>
-          <g class="axis y slice"></g>
-          <g class="stage">
-            <rect class="background"></rect>
-          </g>
-          <g class="x-axis-hover"></g>
-          <g class="y-axis-hover"></g>
-          <g class="y-slice-axis-hover"></g>
-        </g>
-      </svg>
-
-      <style>
-        :host {
-          display: flex;
-          flex-direction: column;
-          flex-grow: 1;
-          flex-shrink: 1;
-          position: relative;
-        }
-
-        svg {
-          font-family: roboto, sans-serif;
-          overflow: visible;
-          display: block;
-          width: 100%;
-          flex-grow: 1;
-          flex-shrink: 1;
-        }
-
-        #tooltip {
-          position: absolute;
-          display: block;
-          opacity: 0;
-          font-weight: bold;
-          font-size: 11px;
-        }
-
-        .background {
-          fill-opacity: 0;
-          fill: red;
-        }
-
-        .histogram {
-          pointer-events: none;
-        }
-
-        .hover {
-          font-size: 9px;
-          dominant-baseline: middle;
-          opacity: 0;
-        }
-
-        .hover circle {
-          stroke: white;
-          stroke-opacity: 0.5;
-          stroke-width: 1px;
-        }
-
-        .hover text {
-          fill: black;
-          opacity: 0;
-        }
-
-        .hover.hover-closest circle {
-          fill: black!important;
-        }
-
-        .hover.hover-closest text {
-          opacity: 1;
-        }
-
-        .baseline {
-          stroke: black;
-          stroke-opacity: 0.1;
-        }
-
-        .outline {
-          fill: none;
-          stroke: white;
-          stroke-opacity: 0.5;
-        }
-
-        .outline.outline-hover {
-          stroke: black!important;
-          stroke-opacity: 1;
-        }
-
-        .x-axis-hover,
-        .y-axis-hover,
-        .y-slice-axis-hover {
-          pointer-events: none;
-        }
-
-        .x-axis-hover .label,
-        .y-axis-hover .label,
-        .y-slice-axis-hover .label {
-          opacity: 0;
-          font-weight: bold;
-          font-size: 11px;
-          text-anchor: end;
-        }
-
-        .x-axis-hover text {
-          text-anchor: middle;
-        }
-
-        .y-axis-hover text,
-        .y-slice-axis-hover text {
-          text-anchor: start;
-        }
-
-        .x-axis-hover line,
-        .y-axis-hover line,
-        .y-slice-axis-hover line {
-          stroke: black;
-        }
-
-        .x-axis-hover rect,
-        .y-axis-hover rect,
-        .y-slice-axis-hover rect {
-          fill: white;
-        }
-
-        .axis {
-          font-size: 10px;
-          fill: #aaa;
-        }
-
-        .axis path.domain {
-          fill: none;
-        }
-
-        .axis .tick line {
-          stroke: #ddd;
-        }
-
-        .axis.slice {
-          opacity: 0;
-        }
-
-        .axis.slice .tick line {
-          stroke-dasharray: 2;
-        }
-
-        .small .axis text { display: none; }
-        .small .axis .tick:first-of-type text { display: block; }
-        .small .axis .tick:last-of-type text { display: block; }
-        .medium .axis text { display: none; }
-        .medium .axis .tick:nth-child(2n + 1) text { display: block; }
-        .large .axis text { display: none; }
-        .large .axis .tick:nth-child(2n + 1) text { display: block; }
-
-      </style>
-    </template>
-
-    <script>
-    Polymer({
-      is: "vz-histogram-timeseries",
-      properties: {
-        /**
-         * Defines which view mode is being used by the chart. Supported values
-         * are:
-         * - "offset" - Offset view of the data showing all timesteps.
-         * - "overlay" - Overlays all timesteps into one 2D view, with the
-         * brighter lines representing the newer timesteps.
-         */
-        mode: {
-          type: String,
-          value: "offset"
-        },
-
-        /*
-         * The name of the datum's property that contains the time values.
-         * Allows:
-         * - "step" - Linear scale using the "step" property of the datum.
-         * - "wall_time" - Temporal scale using the "wall_time" property of the
-         * datum.
-         * - "relative" - Temporal scale starting at 0 created by using
-         * the "wall_time" property of the datum.
-         */
-        timeProperty: {
-          type: String,
-          value: "step"
-        },
-
-        /**
-         * The name of the data's property that contains the bins.
-         */
-        bins: {
-          type: String,
-          value: "bins"
-        },
-
-        /**
-         * The name of the datum's property that contains the x values.
-         */
-        x: {
-          type: String,
-          value: "x"
-        },
-
-        /**
-         * The name of the datum's property that contains the bin width values.
-         */
-        dx: {
-          type: String,
-          value: "dx"
-        },
-
-        /**
-         * The name of the datum's property that contains the bin height.
-         */
-        y: {
-          type: String,
-          value: "y"
-        },
-
-        /**
-         * Scale that maps series names to colors. The default colors are from
-         * d3.scale.category10() scale. Use this property to replace the default
-         * line colors with colors of your own choice.
-         */
-        colorScale: {
-          type: Object,
-          value: function() {
-            return d3.scale.category10();
-          }
-        },
-
-        /**
-         * Duration of the transition between histogram modes.
-         */
-        modeTransitionDuration: {
-          type: Number,
-          value: 500
-        },
-
-        _attached: Boolean,
-        _name: { type: String, value: null },
-        _data: { type: Array, value: null },
-      },
-      observers: [
-        'redraw(timeProperty, _attached)',
-        '_modeRedraw(mode)'
-      ],
-      ready: function() {
-        // Polymer's way of scoping styles on nodes that d3 created
-        this.scopeSubtree(this.$.svg, true);
-      },
-      attached: function() {
-        this._attached = true;
-      },
-      detached: function() {
-        this._attached = false;
-      },
-      setVisibleSeries: function(names) {
-        // Do nothing.
-      },
-      setSeriesData: function(name, data) {
-        this._name = name;
-        this._data = data;
-        this.redraw();
-      },
-
-      /**
-       * Redraws the chart. This is only called if the chart is attached to the
-       * screen and if the chart has data.
-       */
-      redraw: function() {
-        this._draw(0);
-      },
-
-      _modeRedraw: function() {
-        this._draw(this.modeTransitionDuration);
-      },
-
-      _draw: function(duration) {
-        if (!this._attached || !this._data) {
-          return;
-        }
-
-        //
-        // Data verification
-        //
-        if (duration === undefined) throw(new Error("vz-histogram-timeseries _draw needs duration"));
-        if (this._data.length <= 0) throw(new Error("Not enough steps in the data"));
-        if (!this._data[0].hasOwnProperty(this.bins)) throw(new Error("No bins property of '" + this.bins + "' in data"));
-        if (this._data[0][this.bins].length <= 0) throw(new Error("Must have at least one bin in bins in data"));
-        if (!this._data[0][this.bins][0].hasOwnProperty(this.x)) throw(new Error("No x property '" + this.x + "' on bins data"));
-        if (!this._data[0][this.bins][0].hasOwnProperty(this.dx)) throw(new Error("No dx property '" + this.dx + "' on bins data"));
-        if (!this._data[0][this.bins][0].hasOwnProperty(this.y)) throw(new Error("No y property '" + this.y + "' on bins data"));
-
-        //
-        // Initialization
-        //
-        var timeProp = this.timeProperty;
-        var xProp = this.x;
-        var binsProp = this.bins;
-        var dxProp = this.dx;
-        var yProp = this.y;
-
-        var data = this._data;
-        var name = this._name;
-        var mode = this.mode;
-        var color = d3.hcl(this.colorScale(name));
-        var tooltip = d3.select(this.$.tooltip);
-
-        var xAccessor = function(d) { return d[xProp] };
-        var yAccessor = function(d) { return d[yProp] };
-        var dxAccessor = function(d) { return d[dxProp] };
-        var xRightAccessor = function(d) { return d[xProp] + d[dxProp] };
-        var timeAccessor = function(d) { return d[timeProp] };
-
-        if (timeProp === "relative") {
-          timeAccessor = function(d) { return d.wall_time - data[0].wall_time };
-        }
-
-        var brect = this.$.svg.getBoundingClientRect();
-        var outerWidth = brect.width,
-            outerHeight = brect.height;
-
-        var sliceHeight,
-            margin = {top: 5, right: 60, bottom: 20, left: 24};
-
-        if (mode === "offset") {
-          sliceHeight = outerHeight / 2.5;
-          margin.top = sliceHeight + 5;
-        } else {
-          sliceHeight = outerHeight - margin.top - margin.bottom;
-        }
-
-        var width = outerWidth - margin.left - margin.right,
-            height = outerHeight - margin.top - margin.bottom;
-
-        var leftMin = d3.min(data, xAccessor),
-            rightMax = d3.max(data, xRightAccessor);
-
-        //
-        // Text formatters
-        //
-        var format = d3.format(".3n");
-        var yAxisFormat = d3.format(".0f");
-
-        if (timeProp === "wall_time") {
-          yAxisFormat = d3.time.format("%m/%d %X");
-        } else if (timeProp === "relative") {
-          yAxisFormat = function(d) {
-            return d3.format(".1r")(d / 3.6e6) + 'h'; // Convert to hours.
-          };
-        }
-
-        //
-        // Calculate the extents
-        //
-        var xExtents = data.map(function(d, i) {
-          return [
-            d3.min(d[binsProp], xAccessor),
-            d3.max(d[binsProp], xRightAccessor)
-          ];
-        });
-        var yExtents = data.map(function(d) {
-          return d3.extent(d[binsProp], yAccessor);
-        });
-
-        //
-        // Scales and axis
-        //
-        var outlineCanvasSize = 500;
-
-        var extent = d3.extent(data, timeAccessor);
-
-        var yScale = (timeProp === "wall_time" ? d3.time.scale() : d3.scale.linear())
-            .domain(extent)
-            .range([0, (mode === "offset" ? height : 0)]);
-
-        var ySliceScale = d3.scale.linear()
-            .domain([0, d3.max(data, function(d, i) { return yExtents[i][1]; })])
-            .range([sliceHeight, 0]);
-
-        var yLineScale = d3.scale.linear()
-            .domain(ySliceScale.domain())
-            .range([outlineCanvasSize, 0]);
-
-        var xScale = d3.scale.linear()
-            .domain([
-              d3.min(data, function(d, i) { return xExtents[i][0]; }),
-              d3.max(data, function(d, i) { return xExtents[i][1]; })
-            ])
-            .nice()
-            .range([0, width]);
-
-        var xLineScale = d3.scale.linear()
-            .domain(xScale.domain())
-            .range([0, outlineCanvasSize]);
-
-        var outlineColor = d3.scale.linear()
-            .domain(d3.extent(data, timeAccessor))
-            .range([color.darker(), color.brighter()])
-            .interpolate(d3.interpolateHcl);
-
-        var xAxis = d3.svg.axis()
-            .scale(xScale)
-            .ticks(Math.max(2, width / 20))
-            .orient("bottom");
-
-        var yAxis = d3.svg.axis()
-            .scale(yScale)
-            .ticks(Math.max(2, height / 15))
-            .tickFormat(yAxisFormat)
-            .orient("right");
-
-        var ySliceAxis = d3.svg.axis()
-            .scale(ySliceScale)
-            .ticks(Math.max(2, height / 15))
-            .tickSize(width + 5)
-            .tickFormat(format)
-            .orient("right");
-
-        var xBinCentroid = function(d) {
-          return d[xProp] + d[dxProp] / 2;
-        };
-
-        var linePath = d3.svg.line()
-            .interpolate("linear")
-            .x(function(d) { return xLineScale(xBinCentroid(d)); })
-            .y(function(d) { return yLineScale(d[yProp]); });
-
-        var path = function(d) {
-          // Draw a line from 0 to the first point and from the last point to 0.
-          return 'M' + xLineScale(xBinCentroid(d[0])) + ',' + yLineScale(0) +
-              'L' + linePath(d).slice(1) +
-              "L" + xLineScale(xBinCentroid(d[d.length - 1])) + "," + yLineScale(0);
-        };
-
-        //
-        // Render
-        //
-        var svgNode = this.$.svg;
-
-        var svg = d3.select(svgNode)
-
-        var svgTransition = svg.transition().duration(duration);
-
-        var g = svg.select("g")
-            .classed("small", function() { return (width > 0 && width <= 150); })
-            .classed("medium", function() { return (width > 150 && width <= 300); })
-            .classed("large", function() { return (width > 300); })
-
-        var gTransition = svgTransition.select("g")
-            .attr("transform", "translate(" + margin.left + "," + margin.top + ")");
-
-        var bisect = d3.bisector(xRightAccessor).left;
-        var stage = g.select(".stage")
-            .on("mouseover", function() {
-              hoverUpdate.style("opacity", 1);
-              xAxisHoverUpdate.style("opacity", 1);
-              yAxisHoverUpdate.style("opacity", 1);
-              ySliceAxisHoverUpdate.style("opacity", 1);
-              tooltip.style("opacity", 1);
-            })
-            .on("mouseout", function() {
-              hoverUpdate.style("opacity", 0);
-              xAxisHoverUpdate.style("opacity", 0);
-              yAxisHoverUpdate.style("opacity", 0);
-              ySliceAxisHoverUpdate.style("opacity", 0);
-              hoverUpdate.classed("hover-closest", false);
-              outlineUpdate.classed("outline-hover", false);
-              tooltip.style("opacity", 0);
-            })
-            .on("mousemove", onMouseMove);
-
-        var background = stage.select(".background")
-            .attr("transform", "translate(" + -margin.left + "," + -margin.top + ")")
-            .attr("width", outerWidth)
-            .attr("height", outerHeight);
-
-        var histogram = stage.selectAll(".histogram").data(data),
-            histogramExit = histogram.exit().remove(),
-            histogramEnter = histogram.enter().append("g").attr("class", "histogram"),
-            histogramUpdate = histogram
-                .sort(function(a, b) { return timeAccessor(a) - timeAccessor(b); }),
-            histogramTransition = gTransition.selectAll(".histogram")
-                .attr("transform", function(d) {
-                  return "translate(0, " +
-                    (mode === "offset" ? (yScale(timeAccessor(d)) - sliceHeight) : 0) + ")";
-                });
-
-        var baselineEnter = histogramEnter.append("line").attr("class", "baseline"),
-            baselineUpdate = histogramTransition.select(".baseline")
-                .style("stroke-opacity", function(d) { return (mode === "offset" ? 0.1 : 0); })
-                .attr("y1", sliceHeight)
-                .attr("y2", sliceHeight)
-                .attr("x2", width);
-
-        var outlineEnter = histogramEnter.append("path").attr("class", "outline"),
-            outlineUpdate = histogramUpdate.select(".outline")
-                .attr("vector-effect", "non-scaling-stroke")
-                .attr("d", function(d) { return path(d[binsProp]); })
-                .style("stroke-width", 1),
-            outlineTransition = histogramTransition.select(".outline")
-                .attr("transform", "scale(" + width / outlineCanvasSize + ", " +
-                      sliceHeight / outlineCanvasSize + ")")
-                .style("stroke", function(d) {
-                  return (mode === "offset" ? "white" : outlineColor(timeAccessor(d)));
-                })
-                .style("fill-opacity", function(d) { return (mode === "offset" ? 1 : 0); })
-                .style("fill", function(d) { return outlineColor(timeAccessor(d)); });
-
-        var hoverEnter = histogramEnter.append("g")
-                .attr("class", "hover")
-                .style("fill", function(d) { return outlineColor(timeAccessor(d)); }),
-            hoverUpdate = histogramUpdate.select(".hover");
-
-        hoverEnter.append("circle")
-            .attr("r", 2);
-
-        hoverEnter.append("text")
-            .style("display", "none")
-            .attr("dx", 4);
-
-        var xAxisHover = g.select(".x-axis-hover").selectAll(".label").data(["x"]),
-            xAxisHoverEnter = xAxisHover.enter().append("g").attr("class", "label"),
-            xAxisHoverUpdate = xAxisHover;
-
-        xAxisHoverEnter.append("rect")
-            .attr("x", -20)
-            .attr("y", 6)
-            .attr("width", 40)
-            .attr("height", 14)
-
-        xAxisHoverEnter.append("line")
-            .attr("x1", 0)
-            .attr("x2", 0)
-            .attr("y1", 0)
-            .attr("y2", 6);
-
-        xAxisHoverEnter.append("text")
-            .attr("dy", 18);
-
-        var yAxisHover = g.select(".y-axis-hover").selectAll(".label").data(["y"]),
-            yAxisHoverEnter = yAxisHover.enter().append("g").attr("class", "label"),
-            yAxisHoverUpdate = yAxisHover;
-
-        yAxisHoverEnter.append("rect")
-            .attr("x", 8)
-            .attr("y", -6)
-            .attr("width", 40)
-            .attr("height", 14)
-
-        yAxisHoverEnter.append("line")
-            .attr("x1", 0)
-            .attr("x2", 6)
-            .attr("y1", 0)
-            .attr("y2", 0);
-
-        yAxisHoverEnter.append("text")
-            .attr("dx", 8)
-            .attr("dy", 4);
-
-        var ySliceAxisHover = g.select(".y-slice-axis-hover").selectAll(".label").data(["y"]),
-            ySliceAxisHoverEnter = ySliceAxisHover.enter().append("g").attr("class", "label"),
-            ySliceAxisHoverUpdate = ySliceAxisHover;
-
-        ySliceAxisHoverEnter.append("rect")
-            .attr("x", 8)
-            .attr("y", -6)
-            .attr("width", 40)
-            .attr("height", 14)
-
-        ySliceAxisHoverEnter.append("line")
-            .attr("x1", 0)
-            .attr("x2", 6)
-            .attr("y1", 0)
-            .attr("y2", 0);
-
-        ySliceAxisHoverEnter.append("text")
-            .attr("dx", 8)
-            .attr("dy", 4);
-
-        gTransition.select(".y.axis.slice")
-            .style("opacity", mode === "offset" ? 0 : 1)
-            .attr("transform", "translate(0, " + (mode === "offset" ? -sliceHeight : 0) + ")")
-            .call(ySliceAxis);
-
-        gTransition.select(".x.axis")
-            .attr("transform", "translate(0, " + height + ")")
-            .call(xAxis);
-
-        gTransition.select(".y.axis")
-            .style("opacity", mode === "offset" ? 1 : 0)
-            .attr("transform", "translate(" + width + ", " + (mode === "offset" ? 0 : height) + ")")
-            .call(yAxis);
-
-        function onMouseMove() {
-          var m = d3.mouse(this),
-              v = xScale.invert(m[0]),
-              t = yScale.invert(m[1]);
-
-          function hoverXIndex(d) {
-            return Math.min(d[binsProp].length - 1, bisect(d[binsProp], v));
-          }
-          var closestSliceData;
-          var closestSliceDistance = Infinity;
-          var lastSliceData;
-          hoverUpdate
-            .attr("transform", function(d, i) {
-              var index = hoverXIndex(d);
-              lastSliceData = d;
-              var x = xScale(d[binsProp][index][xProp] + d[binsProp][index][dxProp] / 2);
-              var y = ySliceScale(d[binsProp][index][yProp]);
-              var globalY = (mode === "offset" ? yScale(timeAccessor(d)) - (sliceHeight - y) : y);
-              var dist = Math.abs(m[1] - globalY);
-              if (dist < closestSliceDistance) {
-                closestSliceDistance = dist;
-                closestSliceData = d;
-              }
-              return "translate(" + x + "," + y + ")";
-            });
-          hoverUpdate.select("text").text(function(d) {
-            var index = hoverXIndex(d);
-            return d[binsProp][index][yProp];
-          })
-          hoverUpdate.classed("hover-closest", function(d) { return d === closestSliceData; });
-          outlineUpdate.classed("outline-hover", function(d) { return d === closestSliceData; });
-
-          var index = hoverXIndex(lastSliceData);
-
-          xAxisHoverUpdate
-              .attr("transform", function(d) {
-                return "translate(" +
-                  xScale(lastSliceData[binsProp][index][xProp] +
-                         lastSliceData[binsProp][index][dxProp] / 2) + ", " +
-                  height + ")";
-              })
-            .select("text")
-              .text(function(d) { return format(lastSliceData[binsProp][index][xProp] +
-                                                lastSliceData[binsProp][index][dxProp] / 2); });
-
-          var fy = yAxis.tickFormat();
-          yAxisHoverUpdate
-              .attr("transform", function(d) {
-                return "translate(" + width + ", " +
-                  (mode === "offset" ? yScale(timeAccessor(closestSliceData)) : 0) + ")";
-              })
-              .style("display", mode === "offset" ? "" : "none")
-            .select("text")
-              .text(function(d) { return fy(timeAccessor(closestSliceData));});
-
-          var fsy = ySliceAxis.tickFormat();
-          ySliceAxisHoverUpdate
-              .attr("transform", function(d) {
-                return "translate(" + width + ", " +
-                  (mode === "offset" ? 0 : ySliceScale(closestSliceData[binsProp][index][yProp])) +
-                  ")";
-              })
-              .style("display", mode === "offset" ? "none" : "")
-            .select("text")
-              .text(function(d) { return fsy(closestSliceData[binsProp][index][yProp]); });
-
-          var svgMouse = d3.mouse(svgNode);
-          tooltip.style("transform", "translate(" + (svgMouse[0] + 15) + "px," +
-              (svgMouse[1] - 15) + "px)")
-            .select('span')
-            .text(mode === "offset" ?
-                fsy(closestSliceData[binsProp][index][yProp]) :
-                (timeProp === "step" ? "step " : "") +
-                fy(timeAccessor(closestSliceData)));
-        }
-      }
-    });
-    </script>
-
-  </dom-module>
-
-<dom-module id="tf-histogram-dashboard" assetpath="../tf-histogram-dashboard/">
-  <template>
-    <div id="plumbing">
-      <tf-color-scale id="colorScale" runs="[[runs]]" out-color-scale="{{_colorScale}}"></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <tf-sidebar-helper backend="[[backend]]" categories="{{_categories}}" color-scale="[[_colorScale]]" run2tag="[[run2tag]]" runs="[[runs]]" selected-runs="{{_selectedRuns}}" show-download-links="{{_showDownloadLinks}}">
-          <div class="sidebar-section">
-            <tf-option-selector id="histogramModeSelector" name="Histogram Mode" selected-id="{{_histogramMode}}">
-              <paper-button id="overlay">overlay</paper-button>
-              <paper-button id="offset">offset</paper-button>
-            </tf-option-selector>
-          </div>
-          <div class="sidebar-section">
-            <tf-option-selector id="timePropertySelector" name="Offset Time Axis" selected-id="{{_timeProperty}}">
-              <paper-button id="step">step</paper-button>
-              <paper-button id="relative">relative</paper-button>
-              <paper-button id="wall_time">wall</paper-button>
-            </tf-option-selector>
-
-       </div>
-      </tf-sidebar-helper></div>
-
-      <div class="center">
-        <tf-panes-helper categories="[[_categories]]" color-scale="[[_colorScale]]" data-type="[[dataType]]" data-provider="[[dataProvider]]" run2tag="[[run2tag]]" selected-runs="[[_selectedRuns]]" repeat-for-runs="">
-          <template>
-            <vz-histogram-timeseries time-property="[[_timeProperty]]" mode="[[_histogramMode]]" color-scale="[[_colorScaleFunction]]"></vz-histogram-timeseries>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-
-    <style include="dashboard-style"></style>
-    <style>
-      tf-panes-helper {
-        --card-expanded-height: 500px;
-        --card-expanded-width: 700px;
-      }
-    </style>
-  </template>
-
-  <script>
-    Polymer({
-      is: "tf-histogram-dashboard",
-      behaviors: [
-        TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-        TF.Backend.Behavior,
-      ],
-      properties: {
-        backend: Object,
-        dataType: {
-          type: String,
-          value: "histogram"
-        },
-        _histogramMode: {
-          type: String,
-          value: "offset"
-        },
-        _timeProperty: {
-          type: String,
-          value: "step"
-        },
-        _colorScaleFunction: {
-          type: Function,
-          computed: "_getColorScaleFunction(_colorScale)"
-        },
-      },
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-      _getColorScaleFunction: function() {
-        return this._colorScale.scale.bind(this._colorScale);
-      },
-    });
-  </script>
-</dom-module>
-<link rel="import" href="../paper-spinner/paper-spinner-lite.html">
-
-<dom-module id="tf-image-loader" assetpath="../tf-image-dashboard/">
-  <template>
-    <div id="image-annotation">
-      <template is="dom-if" if="[[_hasAtLeastOneStep]]">
-        step
-        <span class="step-value">
-          [[_stepValue]]
-        </span>
-        <template is="dom-if" if="[[_currentWallTime]]">
-          ([[_currentWallTime]])
-        </template>
-        <paper-spinner-lite active="" hidden$="[[!_isImageLoading]]"></paper-spinner-lite>
-      </template>
-      <template is="dom-if" if="[[_hasMultipleSteps]]">
-        <paper-slider id="steps" immediate-value="{{_stepIndex}}" max="[[_maxStepIndex]]" max-markers="[[_maxStepIndex]]" snaps="" step="1" value="{{_stepIndex}}"></paper-slider>
-      </template>
-    </div>
-
-    <div id="main-image-container"></div>
-
-    <style>
-      :host {
-        display: block;
-        width: 100%;
-        height: auto;
-        position: relative;
-        --step-slider-knob-color: #424242;
-      }
-
-      #image-annotation {
-        border-left: 4px solid;
-        padding-left: 5px;
-        font-size: 12px;
-        margin: -10px 0 10px 0;
-      }
-
-      #image-annotation .step-value {
-        font-weight: bold;
-      }
-
-      #image-annotation paper-spinner-lite {
-        width: 14px;
-        height: 14px;
-        vertical-align: text-bottom;
-        --paper-spinner-color: var(--tb-orange-strong)
-      }
-
-      #steps {
-        height: 15px;
-        margin: 0 0 0 -15px;
-        /* 31 comes from adding a padding of 15px from both sides of the paper-slider, subtracting
-         * 1px so that the slider width aligns with the image (the last slider marker takes up 1px),
-         * and adding 2px to account for a border of 1px on both sides of the image. 30 - 1 + 2. */
-        width: calc(100% + 31px);
-        --paper-slider-active-color: var(--step-slider-knob-color);
-        --paper-slider-knob-color: var(--step-slider-knob-color);
-        --paper-slider-pin-color: var(--step-slider-knob-color);
-        --paper-slider-knob-start-color: var(--step-slider-knob-color);
-        --paper-slider-knob-start-border-color: var(--step-slider-knob-color);
-        --paper-slider-pin-start-color: var(--step-slider-knob-color);
-      }
-
-      #main-image-container img {
-        border: 1px solid #f5f5f5;
-        image-rendering: -moz-crisp-edges;
-        image-rendering: pixelated;
-        display: block;
-        width: 100%;
-        height: auto;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-image-loader",
-      properties: {
-        colorScale: Object,
-        run: String,
-        // This is an array of Tensorboard Image&Datum objects (See backend.ts for details). The
-        // properties of objects in this array are
-        // {
-        //   width: number,
-        //   height: number,
-        //   wall_time: Date,
-        //   step: number,
-        //   url: string,
-        // }
-        _steps: {
-          type: Array,
-          value: [],
-          notify: true,
-        },
-        _stepIndex: {
-          type: Number,
-          notify: true,
-        },
-        _hasAtLeastOneStep: {
-          type: Boolean,
-          computed: "_computeHasAtLeastOneStep(_steps)",
-        },
-        _hasMultipleSteps: {
-          type: Boolean,
-          computed: "_computeHasMultipleSteps(_steps)",
-        },
-        _stepValue: {
-          type: Number,
-          computed: "_computeStepValue(_stepIndex)",
-        },
-        _currentWallTime: {
-          type: Number,
-          computed: "_computeCurrentWallTime(_stepIndex)",
-        },
-        _maxStepIndex: {
-          type: Number,
-          computed: "_computeMaxStepIndex(_steps)",
-        },
-        // We use a strictly increasing index to make sure that we don't settle on a stale image.
-        _currentImageLoadIndex: {
-          type: Number,
-          value: 1,
-        },
-        _isImageLoading: {
-          type: Boolean,
-          value: false,
-        },
-      },
-      observers: [
-        "_updateImageUrl(_steps, _stepIndex)",
-      ],
-      redraw: function() {
-        // Other dashboards logic requires a redraw method to be defined. redraw is called at
-        // various places such as when the image is expanded.
-        this.setSeriesData(this.run, this._steps);
-      },
-      setVisibleSeries: function(runs) {
-        // Do nothing.
-      },
-      setSeriesData: function(run, steps) {
-        this.set("run", run);
-        this.set("_steps", steps);
-        this.set("_stepIndex", steps.length - 1);
-
-        // Update the border color based on the run.
-        var color = this.colorScale.scale(run);
-        this.$$("#image-annotation").style.borderColor = color;
-      },
-      _updateImageUrl: function(steps, stepIndex) {
-        // We manually change the image URL (instead of binding to the image's src attribute)
-        // because we would like to manage what happens when the image starts to / finishes loading.
-        if (!steps.length) {
-          return;
-        }
-
-        let img = new Image();
-        img.id = "img"; // '#img' used to select the image in tf-image-dashboard.
-
-        const loadIndex = ++this._currentImageLoadIndex;
-        img.onload = img.onerror = (function() {
-          if (loadIndex != this._currentImageLoadIndex) {
-            // This load is no longer relevant.
-            return;
-          }
-
-          // The new image has finished loading. Remove the old image. Add the new one.
-          let mainImageContainer = this.$$("#main-image-container");
-          mainImageContainer.innerHTML = "";
-          Polymer.dom(mainImageContainer).appendChild(img);
-
-          // The image has finished loading (or has erred and failed to load).
-          this.set("_isImageLoading", false);
-        }).bind(this);
-
-        // Load the new image.
-        this.set("_isImageLoading", true);
-        img.src = steps[stepIndex].url;
-      },
-      _computeHasAtLeastOneStep: function(steps) {
-        return !!steps && steps.length > 0;
-      },
-      _computeHasMultipleSteps: function(steps) {
-        return !!steps && steps.length > 1;
-      },
-      _computeStepValue: function(stepIndex) {
-        return this._steps[stepIndex].step;
-      },
-      _computeCurrentWallTime: function(stepIndex) {
-        return this._steps[stepIndex].wall_time.toString();
-      },
-      _computeMaxStepIndex: function(steps) {
-        return steps.length - 1;
-      },
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-image-dashboard" assetpath="../tf-image-dashboard/">
-  <template>
-    <paper-dialog with-backdrop="" id="actual-image-size-dialog"></paper-dialog>
-    <div id="plumbing">
-      <tf-color-scale id="colorScale" runs="[[runs]]" out-color-scale="{{_colorScale}}"></tf-color-scale>
-    </div>
-
-    <tf-dashboard-layout>
-      <div class="sidebar">
-        <tf-sidebar-helper backend="[[backend]]" categories="{{_categories}}" color-scale="[[_colorScale]]" run2tag="[[run2tag]]" runs="[[runs]]" selected-runs="{{_selectedRuns}}">
-        </tf-sidebar-helper>
-      </div>
-      <div class="center">
-        <tf-panes-helper categories="[[_categories]]" color-scale="[[_colorScale]]" data-type="[[dataType]]" data-provider="[[dataProvider]]" run2tag="[[run2tag]]" selected-runs="[[_selectedRuns]]" repeat-for-runs="">
-          <template>
-            <tf-image-loader color-scale="[[_colorScale]]"></tf-image-loader>
-            <paper-icon-button class="actual-size-button" icon="aspect-ratio" on-tap="_showActualSize" title="Show the image at its true pixel size"></paper-icon-button>
-          </template>
-        </tf-panes-helper>
-      </div>
-    </tf-dashboard-layout>
-    <style include="dashboard-style"></style>
-    <style>
-      tf-panes-helper {
-        --card-width: 340px;
-        --card-height: auto;
-        --card-expanded-width: 700px;
-        --card-expanded-height: auto;
-      }
-
-      .actual-size-button {
-        background: #fff;
-        border-radius: 100%;
-        bottom: -35px;
-        color: #2196f3;
-        height: 32px;
-        left: 35px;
-        padding: 4px;
-        pointer-events: auto;
-        position: absolute;
-        width: 32px;
-      }
-
-      .actual-size-button-selected {
-        background: var(--tb-ui-light-accent);
-      }
-
-      #actual-image-size-dialog {
-        overflow: auto;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-image-dashboard",
-      properties: {
-        backend: Object,
-        dataType: {
-          type: String,
-          value: "image"
-        },
-      },
-      behaviors: [
-          TF.Dashboard.ReloadBehavior("tf-chart-scaffold"),
-          TF.Backend.Behavior,
-      ],
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-      _showActualSize: function(e) {
-        var currentTarget = Polymer.dom(e.currentTarget);
-        var card = currentTarget.node.closest('.card');
-
-        // Create a full-size copy of the image.
-        var newImage = card.querySelector('#img').cloneNode();
-        newImage.style.height = 'auto';
-        newImage.style.width = 'auto';
-        newImage.style.margin = 0;
-        newImage.style.padding = 0;
-        newImage.classList.add("actual-size-image");
-
-        // When the user clicks on the image, empty and close the dialog.
-        var dialog = this.$$('#actual-image-size-dialog');
-        newImage.addEventListener('click', function() {
-          dialog.close();
-        });
-
-        // Update dialog content. Show the dialog.
-        dialog.innerHTML = '';
-        dialog.appendChild(newImage);
-        dialog.open();
-      }
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-audio-loader" assetpath="../tf-audio-dashboard/">
-  <style>
-  :host {
-    display: block;
-  }
-  img {
-    width: 100%;
-    height: 100%;
-    image-rendering: pixelated;
-  }
-  </style>
-  <template>
-    <template is="dom-if" if="[[audioUrl]]">
-      <audio controls="" loop="">
-        <source src="[[audioUrl]]" type="[[audioContentType]]">
-      </audio>
-    </template>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-audio-loader",
-      properties: {
-        run: String,
-        tag: String,
-        audioGenerator: Function,
-        audioUrl: String,
-        audioContentType: String
-      },
-      reload: function() {
-        var _this = this;
-        this.audioUrl = ""; // force reload
-        this.audioContentType = "";
-        this.audioGenerator(this.tag, this.run).then(function(metadatas) {
-          var last_metadata = _.last(metadatas);
-          _this.audioUrl = last_metadata.url;
-          _this.audioContentType = last_metadata.content_type;
-        })
-      },
-      ready: function() {
-        // Need to test so that it will not error if it is constructed w/o
-        // all properties (so that it's possible to use stub to mock it out)
-        if (this.run != null && this.tag != null && this.audioGenerator != null) {
-          this.reload();
-        }
-      },
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-audio-grid" assetpath="../tf-audio-dashboard/">
-  <template>
-    <style include="scrollbar-style"></style>
-    <div id="fullContainer" class="container scrollbar">
-      <div id="topRow" class="container">
-        <div class="noshrink" id="paddingCell"></div>
-        <template is="dom-repeat" items="[[runs]]" as="run">
-        <div class="run-name-cell noshrink">
-          <span>[[run]]</span>
-        </div>
-      </template>
-      </div>
-      <div id="bottomContainer" class="container">
-        <template is="dom-repeat" items="[[tags]]" as="tag">
-          <div class="audio-row container noshrink">
-            <div class="tag-name-cell noshrink">
-              <span class="tag-name">[[tag]]</span>
-            </div>
-            <template is="dom-repeat" items="[[runs]]" as="run">
-              <div class="audio-cell noshrink">
-                <template is="dom-if" if="[[_exists(run, tag, runToAudio.*)]]">
-                  <tf-audio-loader id="loader" run="[[run]]" tag="[[tag]]" audio-generator="[[audioGenerator]]">
-                  </tf-audio-loader>
-                </template>
-              </div>
-            </template>
-          </div>
-        </template>
-      </div>
-    </div>
-    <style>
-      :host {
-        display: block;
-        height: 100%;
-      }
-      .container {
-        display: flex;
-        flex-wrap: nowrap;
-      }
-      #fullContainer {
-        width: 100%;
-        height: 100%;
-        flex-direction: column;
-        padding-top: 20px;
-        overflow: auto;
-        -webkit-box-sizing: border-box;
-        -moz-box-sizing: border-box;
-        box-sizing: border-box;
-      }
-      #topRow {
-        flex-direction: row;
-      }
-      #bottomContainer {
-        flex-direction: column;
-        height: 100%;
-        width: 100%;
-      }
-      .audio-row {
-        flex-direction: row;
-        padding-top: 5px;
-      }
-      .audio-cell {
-        width: 300px;
-        height: 36px;
-        border: 1px solid black;
-        margin-right: 3px;
-      }
-      .tag-name-cell {
-        width: 300px;
-        height: 36px;
-        display:flex;
-        flex-direction: column;
-        justify-content: center;
-      }
-      .tag-name {
-        word-wrap: break-word;
-        text-align: center;
-        white-space: nowrap;
-      }
-      .run-name-cell {
-        width: 300px;
-        height: 36px;
-        text-align: center;
-        margin-right: 5px;
-      }
-      .noshrink {
-        flex-shrink: 0;
-      }
-      #paddingCell {
-        width: 300px;
-        height: 36px;
-      }
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-audio-grid",
-      properties: {
-        runToAudio: Object,
-        tags: Array,
-        runs: Array,
-        audioGenerator: Function,
-      },
-      _exists: function (run, tag) {
-        return this.runToAudio[run].indexOf(tag) !== -1;
-      },
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-audio-dashboard" assetpath="../tf-audio-dashboard/">
-  <template>
-    <div class="center">
-      <tf-no-data-warning data-type="audio" show-warning="[[dataNotFound]]"></tf-no-data-warning>
-      <tf-audio-grid id="audioGrid" run-to-audio="[[run2tag]]" audio-generator="[[dataProvider]]" tags="[[tags]]" runs="[[runs]]"></tf-audio-grid>
-    </div>
-
-    <style>
-      .center {
-        height: 100%;
-        width: 100%;
-        -webkit-box-sizing: border-box;
-        -moz-box-sizing: border-box;
-        box-sizing: border-box;
-      }
-      :host {
-        height: 100%;
-        display: block;
-      }
-
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-audio-dashboard",
-      properties: {
-        dataType: {value: "audio"},
-      },
-      behaviors: [
-        TF.Dashboard.ReloadBehavior("tf-audio-loader"),
-        TF.Backend.Behavior
-      ],
-      attached: function() {
-        this.async(function() {
-          this.fire("rendered");
-        });
-      },
-      _hasAudio: function(runToAudioChange) {
-        return _.values(runToAudioChange.base).some(function(arr) {
-          return arr.length > 0;
-        });
-      },
-    });
-  </script>
-</dom-module>
-
-<dom-module id="tf-graph-loader" assetpath="../tf-graph-loader/">
-</dom-module>
-
-<script>
-Polymer({
-
-  is: 'tf-graph-loader',
-
-  properties: {
-    /**
-     * @type {value: number, msg: string}
-     *
-     * A number between 0 and 100 denoting the % of progress
-     * for the progress bar and the displayed message.
-     */
-    progress: {
-      type: Object,
-      notify: true,
-    },
-    datasets: Array,
-    selectedDataset: Number,
-    selectedFile: {
-      type: Object,
-      observer: '_selectedFileChanged'
-    },
-    outGraphHierarchy: {
-      type: Object,
-      readOnly: true, //readonly so outsider can't change this via binding
-      notify: true
-    },
-    outGraph: {
-      type: Object,
-      readOnly: true, //readonly so outsider can't change this via binding
-      notify: true
-    },
-    outHierarchyParams: {
-      type: Object,
-      readOnly: true,
-      notify: true
-    },
-    outStats: {
-      type: Object,
-      readOnly: true, // This property produces data.
-      notify: true
-    }
-  },
-  observers: [
-    '_selectedDatasetChanged(selectedDataset, datasets)',
-    '_readAndParseMetadata(selectedMetadataTag)'
-  ],
-  _readAndParseMetadata: function(metadataIndex) {
-    if (metadataIndex == -1 || this.datasets[this.selectedDataset] == null ||
-        this.datasets[this.selectedDataset].runMetadata == null ||
-        this.datasets[this.selectedDataset].runMetadata[metadataIndex] == null) {
-      this._setOutStats(null);
-      return;
-    }
-    var path = this.datasets[this.selectedDataset].runMetadata[metadataIndex].path;
-    // Reset the progress bar to 0.
-    this.set('progress', {
-      value: 0,
-      msg: ''
-    });
-    var tracker = tf.graph.util.getTracker(this);
-    tf.graph.parser.fetchAndParseMetadata(path, tracker)
-    .then(function(stats) {
-      this._setOutStats(stats);
-    }.bind(this));
-  },
-  _parseAndConstructHierarchicalGraph: function(path, pbTxtFile) {
-    // Reset the progress bar to 0.
-    this.set('progress', {
-      value: 0,
-      msg: ''
-    });
-    var tracker = tf.graph.util.getTracker(this);
-    var hierarchyParams = {
-      verifyTemplate: true,
-      // If a set of numbered op nodes has at least this number of nodes
-      // then group them into a series node.
-      seriesNodeMinSize: 5,
-      // A map of series node names to series grouping settings, to indicate
-      // if a series is to be rendered as grouped or ungrouped.
-      // Starts out empty which allows the renderer to decide which series
-      // are initially rendered grouped and which aren't.
-      seriesMap: {},
-    };
-    this._setOutHierarchyParams(hierarchyParams);
-    var dataTracker = tf.graph.util.getSubtaskTracker(tracker, 30, 'Data');
-    tf.graph.parser.fetchAndParseGraphData(path, pbTxtFile, dataTracker)
-    .then(function(graph) {
-      if (!graph) {
-        throw 'The graph is empty. Make sure that the graph is passed to the ' +
-            'SummaryWriter after the graph is defined.';
-      }
-
-      // Build the flat graph (consists only of Op nodes).
-
-      // This is the whitelist of inputs on op types that are considered
-      // reference edges. "Assign 0" indicates that the first input to
-      // an OpNode with operation type "Assign" is a reference edge.
-      var refEdges = {};
-      refEdges["Assign 0"] = true;
-      refEdges["AssignAdd 0"] = true;
-      refEdges["AssignSub 0"] = true;
-      refEdges["assign 0"] = true;
-      refEdges["assign_add 0"] = true;
-      refEdges["assign_sub 0"] = true;
-      refEdges["count_up_to 0"] = true;
-      refEdges["ScatterAdd 0"] = true;
-      refEdges["ScatterSub 0"] = true;
-      refEdges["ScatterUpdate 0"] = true;
-      refEdges["scatter_add 0"] = true;
-      refEdges["scatter_sub 0"] = true;
-      refEdges["scatter_update 0"] = true;
-      var buildParams = {
-        enableEmbedding: true,
-        inEmbeddingTypes: ['Const'],
-        outEmbeddingTypes: ['^[a-zA-Z]+Summary$'],
-        refEdges: refEdges
-      };
-      var graphTracker = tf.graph.util.getSubtaskTracker(tracker, 20, 'Graph');
-      return tf.graph.build(graph, buildParams, graphTracker);
-    })
-    .then(function(graph) {
-      this._setOutGraph(graph);
-      var hierarchyTracker = tf.graph.util.getSubtaskTracker(tracker, 50,
-          'Namespace hierarchy');
-      return tf.graph.hierarchy.build(graph, hierarchyParams, hierarchyTracker);
-    }.bind(this))
-    .then(function(graphHierarchy) {
-      // Update the properties which notify the parent with the
-      // graph hierarchy and whether the data has live stats or not.
-      this._setOutGraphHierarchy(graphHierarchy);
-    }.bind(this))
-    .catch(function(e) {
-      // Generic error catch, for errors that happened outside
-      // asynchronous tasks.
-      tracker.reportError("Graph visualization failed: " + e, e);
-    });
-  },
-  _selectedDatasetChanged: function(datasetIndex, datasets) {
-    this._parseAndConstructHierarchicalGraph(datasets[datasetIndex].path);
-  },
-  _selectedFileChanged: function(e) {
-    if (!e) {
-      return;
-    }
-    var file = e.target.files[0];
-    if (!file) {
-      return;
-    }
-
-    // Clear out the value of the file chooser. This ensures that if the user
-    // selects the same file, we'll re-read it.
-    e.target.value = '';
-
-    this._parseAndConstructHierarchicalGraph(null, file);
-  }
-});
-</script>
-<script src="../lodash/lodash.min.js"></script>
-<script src="../graphlib/dist/graphlib.core.js"></script>
-<script src="../dagre/dist/dagre.core.js"></script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    /**
-     * Mapping from color palette name to color palette, which contains
-     * exact colors for multiple states of a single color palette.
-     */
-    tf.COLORS = [
-        {
-            'name': 'Google Blue',
-            'color': '#4184f3',
-            'active': '#3a53c5',
-            'disabled': '#cad8fc'
-        },
-        {
-            'name': 'Google Red',
-            'color': '#db4437',
-            'active': '#8f2a0c',
-            'disabled': '#e8c6c1'
-        },
-        {
-            'name': 'Google Yellow',
-            'color': '#f4b400',
-            'active': '#db9200',
-            'disabled': '#f7e8b0'
-        },
-        {
-            'name': 'Google Green',
-            'color': '#0f9d58',
-            'active': '#488046',
-            'disabled': '#c2e1cc'
-        },
-        {
-            'name': 'Purple',
-            'color': '#aa46bb',
-            'active': '#5c1398',
-            'disabled': '#d7bce6'
-        },
-        {
-            'name': 'Teal',
-            'color': '#00abc0',
-            'active': '#47828e',
-            'disabled': '#c2eaf2'
-        },
-        {
-            'name': 'Deep Orange',
-            'color': '#ff6f42',
-            'active': '#ca4a06',
-            'disabled': '#f2cbba'
-        },
-        {
-            'name': 'Lime',
-            'color': '#9d9c23',
-            'active': '#7f771d',
-            'disabled': '#f1f4c2'
-        },
-        {
-            'name': 'Indigo',
-            'color': '#5b6abf',
-            'active': '#3e47a9',
-            'disabled': '#c5c8e8'
-        },
-        {
-            'name': 'Pink',
-            'color': '#ef6191',
-            'active': '#ca1c60',
-            'disabled': '#e9b9ce'
-        },
-        {
-            'name': 'Deep Teal',
-            'color': '#00786a',
-            'active': '#2b4f43',
-            'disabled': '#bededa'
-        },
-        {
-            'name': 'Deep Pink',
-            'color': '#c1175a',
-            'active': '#75084f',
-            'disabled': '#de8cae'
-        },
-        {
-            'name': 'Gray',
-            'color': '#9E9E9E',
-            'active': '#424242',
-            'disabled': 'F5F5F5' // 100
-        }
-    ].reduce(function (m, c) {
-        m[c.name] = c;
-        return m;
-    }, {});
-    /**
-     * Mapping from op category to color palette name
-     * e.g.,  OP_GROUP_COLORS['state_ops'] = 'Google Blue';
-     */
-    tf.OP_GROUP_COLORS = [
-        {
-            color: 'Google Red',
-            groups: [
-                'gen_legacy_ops', 'legacy_ops', 'legacy_flogs_input',
-                'legacy_image_input', 'legacy_input_example_input',
-                'legacy_sequence_input', 'legacy_seti_input_input'
-            ]
-        },
-        { color: 'Deep Orange', groups: ['constant_ops'] },
-        { color: 'Indigo', groups: ['state_ops'] },
-        { color: 'Purple', groups: ['nn_ops', 'nn'] },
-        { color: 'Google Green', groups: ['math_ops'] },
-        { color: 'Lime', groups: ['array_ops'] },
-        { color: 'Teal', groups: ['control_flow_ops', 'data_flow_ops'] },
-        { color: 'Pink', groups: ['summary_ops'] },
-        { color: 'Deep Pink', groups: ['io_ops'] }
-    ].reduce(function (m, c) {
-        c.groups.forEach(function (group) { m[group] = c.color; });
-        return m;
-    }, {});
-})(tf || (tf = {}));
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph_1) {
-        /** Delimiter used in node names to denote namespaces. */
-        graph_1.NAMESPACE_DELIM = '/';
-        graph_1.ROOT_NAME = '__root__';
-        /** Attribute key used for storing attributes that are too large. */
-        graph_1.LARGE_ATTRS_KEY = '_too_large_attrs';
-        /**
-         * Maximum allowed size in bytes, before the attribute is considered large
-         * and filtered out of the graph.
-         */
-        graph_1.LIMIT_ATTR_SIZE = 1024;
-        // Separator between the source and the destination name of the edge.
-        graph_1.EDGE_KEY_DELIM = '--';
-        var GraphType;
-        (function (GraphType) {
-            GraphType[GraphType["FULL"] = 0] = "FULL";
-            GraphType[GraphType["EMBEDDED"] = 1] = "EMBEDDED";
-            GraphType[GraphType["META"] = 2] = "META";
-            GraphType[GraphType["SERIES"] = 3] = "SERIES";
-            GraphType[GraphType["CORE"] = 4] = "CORE";
-            GraphType[GraphType["SHADOW"] = 5] = "SHADOW";
-            GraphType[GraphType["BRIDGE"] = 6] = "BRIDGE";
-            GraphType[GraphType["EDGE"] = 7] = "EDGE";
-        })(GraphType = graph_1.GraphType || (graph_1.GraphType = {}));
-        ;
-        var NodeType;
-        (function (NodeType) {
-            NodeType[NodeType["META"] = 0] = "META";
-            NodeType[NodeType["OP"] = 1] = "OP";
-            NodeType[NodeType["SERIES"] = 2] = "SERIES";
-            NodeType[NodeType["BRIDGE"] = 3] = "BRIDGE";
-            NodeType[NodeType["ELLIPSIS"] = 4] = "ELLIPSIS";
-        })(NodeType = graph_1.NodeType || (graph_1.NodeType = {}));
-        ;
-        /** Indicates if a node is to be included in the main graph when rendered. */
-        var InclusionType;
-        (function (InclusionType) {
-            InclusionType[InclusionType["INCLUDE"] = 0] = "INCLUDE";
-            InclusionType[InclusionType["EXCLUDE"] = 1] = "EXCLUDE";
-            InclusionType[InclusionType["UNSPECIFIED"] = 2] = "UNSPECIFIED";
-        })(InclusionType = graph_1.InclusionType || (graph_1.InclusionType = {}));
-        ;
-        /** Indicates if a series is to be grouped in the graph when rendered. */
-        var SeriesGroupingType;
-        (function (SeriesGroupingType) {
-            SeriesGroupingType[SeriesGroupingType["GROUP"] = 0] = "GROUP";
-            SeriesGroupingType[SeriesGroupingType["UNGROUP"] = 1] = "UNGROUP";
-        })(SeriesGroupingType = graph_1.SeriesGroupingType || (graph_1.SeriesGroupingType = {}));
-        ;
-        /** Attribute key reserved for the shapes of the output tensors. */
-        var OUTPUT_SHAPES_KEY = '_output_shapes';
-        /**
-         * A SlimGraph is inspired by graphlib.Graph, but having only the functionality
-         * that we need.
-         */
-        var SlimGraph = (function () {
-            function SlimGraph() {
-                this.nodes = {};
-                this.edges = [];
-            }
-            return SlimGraph;
-        }());
-        graph_1.SlimGraph = SlimGraph;
-        var EllipsisNodeImpl = (function () {
-            /**
-             * Constructs a new ellipsis annotation node.
-             *
-             * @param numNodes The number of additional annotations this node represents.
-             */
-            function EllipsisNodeImpl(numNodes) {
-                this.type = NodeType.ELLIPSIS;
-                this.isGroupNode = false;
-                this.cardinality = 1;
-                this.parentNode = null;
-                this.stats = null;
-                this.setNumMoreNodes(numNodes);
-                this.include = InclusionType.UNSPECIFIED;
-            }
-            EllipsisNodeImpl.prototype.setNumMoreNodes = function (numNodes) {
-                this.numMoreNodes = numNodes;
-                this.name = '... ' + numNodes + ' more';
-            };
-            return EllipsisNodeImpl;
-        }());
-        graph_1.EllipsisNodeImpl = EllipsisNodeImpl;
-        ;
-        /**
-         * A label object for nodes in the full graph and leaf nodes in the render
-         * graph.
-         */
-        var OpNodeImpl = (function () {
-            /**
-             * Constructs a new Op node.
-             *
-             * @param rawNode The raw node.
-             */
-            function OpNodeImpl(rawNode) {
-                this.op = rawNode.op;
-                this.name = rawNode.name;
-                this.device = rawNode.device;
-                this.attr = rawNode.attr;
-                // An array of normalized inputs that denote the incoming edges to
-                // the current node. Each input contains the normalized name of the
-                // source node, whether it has a number part and whether it is a
-                // control dependency.
-                this.inputs = normalizeInputs(rawNode.input);
-                this.outputShapes = extractOutputShapes(rawNode.attr);
-                // additional properties
-                this.type = NodeType.OP;
-                this.isGroupNode = false;
-                this.cardinality = 1;
-                this.inEmbeddings = [];
-                this.outEmbeddings = [];
-                this.parentNode = null;
-                this.include = InclusionType.UNSPECIFIED;
-                this.owningSeries = null;
-            }
-            return OpNodeImpl;
-        }());
-        graph_1.OpNodeImpl = OpNodeImpl;
-        ;
-        function createMetanode(name, opt) {
-            if (opt === void 0) { opt = {}; }
-            return new MetanodeImpl(name, opt);
-        }
-        graph_1.createMetanode = createMetanode;
-        /**
-         * Joins the information from the stats file (memory, compute time) with the
-         * graph information.
-         */
-        function joinStatsInfoWithGraph(graph, stats, devicesForStats) {
-            // Reset stats for each node.
-            _.each(graph.nodes, function (node) { node.stats = null; });
-            _.each(stats.dev_stats, function (devStats) {
-                // Ignore devices that are not selected.
-                if (devicesForStats && !devicesForStats[devStats.device]) {
-                    return;
-                }
-                _.each(devStats.node_stats, function (nodeStats) {
-                    // Lookup the node in the graph by its original name, e.g. A. If not
-                    // found, lookup by the rewritten name A/(A) in case the name is both
-                    // a namespace and a node name.
-                    var nodeName = nodeStats.node_name in graph.nodes ? nodeStats.node_name :
-                        nodeStats.node_name +
-                            graph_1.NAMESPACE_DELIM + '(' + nodeStats.node_name + ')';
-                    // Couldn't find a matching node.
-                    if (!(nodeName in graph.nodes)) {
-                        return;
-                    }
-                    // Compute the total bytes used.
-                    var totalBytes = 0;
-                    if (nodeStats.memory) {
-                        _.each(nodeStats.memory, function (alloc) {
-                            if (alloc.total_bytes) {
-                                if (alloc.total_bytes > 0) {
-                                    totalBytes += Number(alloc.total_bytes);
-                                }
-                                else {
-                                    /* tslint:disable */
-                                    console.log('ignoring negative memory allocation for ' + nodeName);
-                                }
-                            }
-                        });
-                    }
-                    var outputSize = null;
-                    if (nodeStats.output) {
-                        outputSize = _.map(nodeStats.output, function (output) {
-                            return _.map(output.tensor_description.shape.dim, function (dim) { return Number(dim.size); });
-                        });
-                    }
-                    graph.nodes[nodeName].device = devStats.device;
-                    if (graph.nodes[nodeName].stats == null) {
-                        graph.nodes[nodeName].stats = new NodeStats(outputSize);
-                    }
-                    graph.nodes[nodeName].stats.addBytesAllocation(totalBytes);
-                    if (nodeStats.all_end_rel_micros) {
-                        if (nodeStats.all_end_rel_micros > 0) {
-                            graph.nodes[nodeName].stats.addExecutionTime(nodeStats.all_start_micros, nodeStats.all_start_micros + nodeStats.all_end_rel_micros);
-                        }
-                        else {
-                            /* tslint:disable */
-                            console.log('ignoring negative runtime for ' + nodeName);
-                        }
-                    }
-                });
-            });
-        }
-        graph_1.joinStatsInfoWithGraph = joinStatsInfoWithGraph;
-        /**
-         * Execution stats for the node.
-         */
-        var NodeStats = (function () {
-            function NodeStats(outputSize) {
-                /**
-                 * Total number of bytes used for the node. Sum of all children
-                 * if it is a Group node.
-                 */
-                this.totalBytes = 0;
-                this.outputSize = outputSize;
-            }
-            /**
-             * Add the start and end time for a particular kernel execution of this op.
-             * Ops can have multiple kernel executions within the same session run.
-             */
-            NodeStats.prototype.addExecutionTime = function (startTime, endTime) {
-                if (this.startTime != null) {
-                    this.startTime = Math.min(this.startTime, startTime);
-                }
-                else {
-                    this.startTime = startTime;
-                }
-                if (this.endTime != null) {
-                    this.endTime = Math.max(this.endTime, endTime);
-                }
-                else {
-                    this.endTime = endTime;
-                }
-            };
-            /**
-             * Add the bytes allocated for a particular kernel execution of this op.
-             * Ops can have multiple kernel executions within the same session run.
-             */
-            NodeStats.prototype.addBytesAllocation = function (totalBytes) {
-                if (this.totalBytes != null) {
-                    this.totalBytes = Math.max(this.totalBytes, totalBytes);
-                }
-                else {
-                    this.totalBytes = totalBytes;
-                }
-            };
-            Object.defineProperty(NodeStats.prototype, "totalMicros", {
-                /**
-                 * Total number of compute time in microseconds used for the node.
-                 * Sum of all children if it is a Group node. Null if it is unknown.
-                 */
-                get: function () {
-                    if (this.startTime == null || this.endTime == null) {
-                        return null;
-                    }
-                    return this.endTime - this.startTime;
-                },
-                enumerable: true,
-                configurable: true
-            });
-            /**
-             * Combines the specified stats with the current stats.
-             * Modifies the current object. This method is used to
-             * compute aggregate stats for group nodes.
-             */
-            NodeStats.prototype.combine = function (stats) {
-                if (stats.totalBytes != null) {
-                    this.totalBytes += stats.totalBytes;
-                }
-                if (stats.totalMicros != null) {
-                    this.addExecutionTime(stats.startTime, stats.endTime);
-                }
-            };
-            return NodeStats;
-        }());
-        graph_1.NodeStats = NodeStats;
-        var MetanodeImpl = (function () {
-            /** A label object for meta-nodes in the graph hierarchy */
-            function MetanodeImpl(name, opt) {
-                if (opt === void 0) { opt = {}; }
-                this.name = name;
-                this.type = NodeType.META;
-                /** number of levels under this group */
-                this.depth = 1;
-                this.isGroupNode = true;
-                /** # of leaf nodes (including embedded ones) */
-                this.cardinality = 0;
-                /** graph contains metanodes, nodes, edges
-                 * and metaedges for main items within this metanode
-                 */
-                this.metagraph =
-                    createGraph(name, GraphType.META, opt);
-                /** bridgegraph must be constructed lazily-see hierarchy.getBridgegraph() */
-                this.bridgegraph = null;
-                /**
-                 * A dictionary that count ops type of nodes in this metanode
-                 * (op type => count).
-                 */
-                this.opHistogram = {};
-                this.deviceHistogram = {};
-                /** unique id for a metanode of similar subgraph */
-                this.templateId = null;
-                /** Metanode which contains this node, if any */
-                this.parentNode = null;
-                this.hasNonControlEdges = false;
-                this.include = InclusionType.UNSPECIFIED;
-            }
-            MetanodeImpl.prototype.getFirstChild = function () {
-                return this.metagraph.node(this.metagraph.nodes()[0]);
-            };
-            /**
-             * Returns the op node associated with the metanode.
-             * For example, if the metanode is 'sgd', the associated
-             * op node is sgd/(sgd).
-             */
-            MetanodeImpl.prototype.getRootOp = function () {
-                var nameSplit = this.name.split('/');
-                var rootOpName = this.name + '/(' + nameSplit[nameSplit.length - 1] + ')';
-                return this.metagraph.node(rootOpName);
-            };
-            /**
-             * Return an array of the names of all the leaves (non-GroupNodes) inside
-             * this metanode. This performs a breadth-first search of the tree, so
-             * immediate child leaves will appear earlier in the output array than
-             * descendant leaves.
-             */
-            MetanodeImpl.prototype.leaves = function () {
-                var leaves = [];
-                var queue = [this];
-                var metagraph; // Defined here due to a limitation of ES6->5 compilation.
-                while (queue.length) {
-                    var node = queue.shift();
-                    if (node.isGroupNode) {
-                        metagraph = node.metagraph;
-                        _.each(metagraph.nodes(), function (name) { return queue.push(metagraph.node(name)); });
-                    }
-                    else {
-                        leaves.push(node.name);
-                    }
-                }
-                return leaves;
-            };
-            return MetanodeImpl;
-        }());
-        graph_1.MetanodeImpl = MetanodeImpl;
-        ;
-        function createMetaedge(v, w) {
-            return new MetaedgeImpl(v, w);
-        }
-        graph_1.createMetaedge = createMetaedge;
-        /**
-         * A label object for edges between metanodes of subgraphs in the render graph.
-         */
-        var MetaedgeImpl = (function () {
-            function MetaedgeImpl(v, w) {
-                this.v = v;
-                this.w = w;
-                this.baseEdgeList = [];
-                this.inbound = null;
-                this.numRegularEdges = 0;
-                this.numControlEdges = 0;
-                this.numRefEdges = 0;
-                this.totalSize = 0;
-            }
-            MetaedgeImpl.prototype.addBaseEdge = function (edge, h) {
-                this.baseEdgeList.push(edge);
-                if (edge.isControlDependency) {
-                    this.numControlEdges += 1;
-                }
-                else {
-                    this.numRegularEdges += 1;
-                }
-                if (edge.isReferenceEdge) {
-                    this.numRefEdges += 1;
-                }
-                // Compute the size of the tensor flowing through this
-                // base edge.
-                this.totalSize += MetaedgeImpl.computeSizeOfEdge(edge, h);
-                h.maxMetaEdgeSize = Math.max(h.maxMetaEdgeSize, this.totalSize);
-            };
-            MetaedgeImpl.computeSizeOfEdge = function (edge, h) {
-                var opNode = h.node(edge.v);
-                if (opNode.outputShapes == null) {
-                    // No shape information. Asssume a single number. This gives
-                    // a lower bound for the total size.
-                    return 1;
-                }
-                h.hasShapeInfo = true;
-                // Sum the sizes of all output tensors.
-                return _(opNode.outputShapes).map(function (shape) {
-                    // If the shape is unknown, treat it as 1 when computing
-                    // total size. This gives a lower bound for the total size.
-                    if (shape == null) {
-                        return 1;
-                    }
-                    // Multiply all shapes to get the total size of the tensor.
-                    // E.g. The total size of [4, 2, 1] is 4 * 2 * 1.
-                    return _(shape).reduce(function (accumulated, currSize) {
-                        // If this particular dimension is unknown, treat
-                        // it as 1 when computing total size. This gives a lower bound
-                        // for the total size.
-                        if (currSize === -1) {
-                            currSize = 1;
-                        }
-                        return accumulated * currSize;
-                    }, 1);
-                }).sum();
-            };
-            return MetaedgeImpl;
-        }());
-        graph_1.MetaedgeImpl = MetaedgeImpl;
-        function createSeriesNode(prefix, suffix, parent, clusterId, name) {
-            return new SeriesNodeImpl(prefix, suffix, parent, clusterId, name);
-        }
-        graph_1.createSeriesNode = createSeriesNode;
-        function getSeriesNodeName(prefix, suffix, parent, startId, endId) {
-            var numRepresentation = (typeof startId !== 'undefined' && typeof endId !== 'undefined') ?
-                '[' + startId + '-' + endId + ']' :
-                '#';
-            var pattern = prefix + numRepresentation + suffix;
-            return (parent ? parent + '/' : '') + pattern;
-        }
-        graph_1.getSeriesNodeName = getSeriesNodeName;
-        var SeriesNodeImpl = (function () {
-            function SeriesNodeImpl(prefix, suffix, parent, clusterId, name) {
-                this.name = name || getSeriesNodeName(prefix, suffix, parent);
-                this.type = NodeType.SERIES;
-                this.hasLoop = false;
-                this.prefix = prefix;
-                this.suffix = suffix;
-                this.clusterId = clusterId;
-                this.ids = [];
-                this.parent = parent;
-                this.isGroupNode = true;
-                this.cardinality = 0;
-                this.metagraph = createGraph(name, GraphType.SERIES);
-                // bridgegraph must be constructed lazily-see hierarchy.getBridgegraph()
-                this.bridgegraph = null;
-                this.parentNode = null;
-                this.deviceHistogram = {};
-                this.hasNonControlEdges = false;
-                this.include = InclusionType.UNSPECIFIED;
-            }
-            return SeriesNodeImpl;
-        }());
-        /**
-         * Extracts the shapes of the output tensors from the attr property in the
-         * node proto.
-         */
-        function extractOutputShapes(attr) {
-            var result = null;
-            // We don't know anything about the output tensors.
-            if (!attr) {
-                return null;
-            }
-            for (var i = 0; i < attr.length; i++) {
-                var _a = attr[i], key = _a.key, value = _a.value;
-                if (key === OUTPUT_SHAPES_KEY) {
-                    if (!value.list.shape) {
-                        // The OUTPUT_SHAPES_KEY lacks a value. We know nothing about the shape.
-                        return null;
-                    }
-                    // Map all output tensors into array of numbers denoting their shape.
-                    var result_1 = value.list.shape.map(function (shape) {
-                        if (shape.unknown_rank) {
-                            // This output tensor is of unknown rank. We don't know if it is a
-                            // scalar, or a tensor, or of what shape it is.
-                            return null;
-                        }
-                        if (shape.dim == null ||
-                            (shape.dim.length === 1 && shape.dim[0].size == null)) {
-                            // This output tensor is a scalar.
-                            return [];
-                        }
-                        // This output tensor has a known rank. Map each dimension size
-                        // into a number.
-                        return shape.dim.map(function (dim) {
-                            // Size can be -1 if this particular dimension is unknown.
-                            return dim.size;
-                        });
-                    });
-                    // Since we already processed it, remove the entry from the attribute
-                    // list (saves memory).
-                    attr.splice(i, 1);
-                    return result_1;
-                }
-            }
-            // We didn't find OUTPUT_SHAPES_KEY in attributes, so we don't know anything
-            // about the output tensors.
-            return null;
-        }
-        /**
-         * Normalizes the inputs and extracts associated metadata:
-         * 1) Inputs can contain a colon followed by a number at the end
-         *    (e.g. inputName:1) and we remove this from the input name, and take note
-         *    that the input was numbered.
-         * 2) Control dependency inputs contain caret at the beginning and we
-         *    remove this and annotate the edge as a control dependency.
-         * @param inputs Array of unnormalized names of input nodes.
-         */
-        function normalizeInputs(inputs) {
-            var normalizedInputs = [];
-            _.each(inputs, function (inputName) {
-                var start = inputName[0] === '^';
-                var colon = inputName.lastIndexOf(':');
-                var end = colon !== -1 &&
-                    inputName.length - colon > 1 &&
-                    !(/\D/).test(inputName.substring(colon + 1)) ?
-                    colon : inputName.length;
-                var name = inputName.substring(start ? 1 : 0, end);
-                if (normalizedInputs.length === 0 ||
-                    name !== normalizedInputs[normalizedInputs.length - 1].name) {
-                    normalizedInputs.push({
-                        name: name,
-                        outputTensorIndex: end === inputName.length ? 0 : Number(inputName.slice(colon + 1)),
-                        isControlDependency: start
-                    });
-                }
-            });
-            return normalizedInputs;
-        }
-        function addEdgeToGraph(graph, inputName, outputNode, input, params, index) {
-            // Don't allow loops in the graph.
-            if (inputName === outputNode.name) {
-                return;
-            }
-            // Check if this op type and input number corresponds to a
-            // reference edge using the refEdges dictionary in the params.
-            var isRefEdge = params.refEdges[outputNode.op + ' ' + index] === true;
-            graph.edges.push({
-                v: inputName,
-                w: outputNode.name,
-                outputTensorIndex: input.outputTensorIndex,
-                isControlDependency: input.isControlDependency,
-                isReferenceEdge: isRefEdge
-            });
-        }
-        function build(rawNodes, params, tracker) {
-            /**
-             * A dictionary that maps each in-embedding node name to the node
-             * object.
-             */
-            var inEmbedding = {};
-            /**
-             * A dictionary that maps each out-embedding node name to the node
-             * object.
-             */
-            var outEmbedding = {};
-            /**
-             * A dictionary that maps each node name to an array of the node's
-             * out-embedding node label objects.
-             */
-            var outEmbeddings = {};
-            var isInEmbeddedPred = getEmbedPredicate(params.inEmbeddingTypes);
-            var isOutEmbeddedPred = getEmbedPredicate(params.outEmbeddingTypes);
-            var embeddingNodeNames = [];
-            /**
-             * A list of all the non-embedding node names which appear in the processed
-             * list of raw nodes. Here we pre-allocate enough room for all the rawNodes,
-             * even though there will some number of embeddings. The excess array length
-             * is spliced off later.
-             *
-             * Experimentation shows that around 30% of the array will go unused, and
-             * even for very large networks that amounts to less than 10k spaces.
-             */
-            var nodeNames = new Array(rawNodes.length);
-            return tf.graph.util
-                .runAsyncTask('Normalizing names', 30, function () {
-                var opNodes = new Array(rawNodes.length);
-                var index = 0;
-                _.each(rawNodes, function (rawNode) {
-                    var opNode = new OpNodeImpl(rawNode);
-                    if (isInEmbeddedPred(opNode)) {
-                        embeddingNodeNames.push(opNode.name);
-                        inEmbedding[opNode.name] = opNode;
-                        return;
-                    }
-                    if (isOutEmbeddedPred(opNode)) {
-                        embeddingNodeNames.push(opNode.name);
-                        outEmbedding[opNode.name] = opNode;
-                        _.each(opNode.inputs, function (input) {
-                            var inputName = input.name;
-                            outEmbeddings[inputName] = outEmbeddings[inputName] || [];
-                            outEmbeddings[inputName].push(opNode);
-                        });
-                        return;
-                    }
-                    // The node is not an embedding, so add it to the names and nodes
-                    // lists.
-                    opNodes[index] = opNode;
-                    nodeNames[index] = opNode.name;
-                    index++;
-                });
-                opNodes.splice(index);
-                nodeNames.splice(index);
-                return opNodes;
-            }, tracker)
-                .then(function (opNodes) {
-                // Create the graph data structure from the graphlib library.
-                return tf.graph.util.runAsyncTask('Building the data structure', 70, function () {
-                    var normalizedNameDict = mapStrictHierarchy(nodeNames, embeddingNodeNames);
-                    var graph = new SlimGraph;
-                    // Add the nodes to the graph.
-                    _.each(opNodes, function (opNode) {
-                        var normalizedName = normalizedNameDict[opNode.name] || opNode.name;
-                        graph.nodes[normalizedName] = opNode;
-                        // Check if the node has out-embeddings. If yes, add them to the
-                        // node.
-                        if (opNode.name in outEmbeddings) {
-                            opNode.outEmbeddings = outEmbeddings[opNode.name];
-                            // Normalize the names of the out-embeddings.
-                            _.each(opNode.outEmbeddings, function (node) {
-                                node.name = normalizedNameDict[node.name] || node.name;
-                            });
-                        }
-                        // Update the name of the node.
-                        opNode.name = normalizedName;
-                    });
-                    // Visit each node's inputs to add the edges to the graph. If the
-                    // input
-                    // is an in-embedding, then add it to the node's in-embeddings
-                    // instead.
-                    _.each(opNodes, function (opNode) {
-                        _.each(opNode.inputs, function (input, i) {
-                            var inputName = input.name;
-                            if (inputName in inEmbedding) {
-                                var inEmbedNode = inEmbedding[inputName];
-                                opNode.inEmbeddings.push(inEmbedNode);
-                                // Move the inputs of the in-embedding node into incoming
-                                // edges of
-                                // the main node. E.g. the control dependency of a constant
-                                // node
-                                // should be moved to the op node where the constant is
-                                // embedded.
-                                for (var _i = 0, _a = inEmbedNode.inputs; _i < _a.length; _i++) {
-                                    var embedInput = _a[_i];
-                                    addEdgeToGraph(graph, normalizedNameDict[embedInput.name] ||
-                                        embedInput.name, opNode, embedInput, params, i);
-                                }
-                            }
-                            else if (inputName in outEmbedding) {
-                                // Move the inputs of the out-embedding node into inputs of
-                                // the main node where the out-embedding points to.
-                                var outEmbedNode = outEmbedding[inputName];
-                                for (var _b = 0, _c = outEmbedNode.inputs; _b < _c.length; _b++) {
-                                    var embedInput = _c[_b];
-                                    addEdgeToGraph(graph, normalizedNameDict[embedInput.name] ||
-                                        embedInput.name, opNode, input, params, i);
-                                }
-                            }
-                            else {
-                                addEdgeToGraph(graph, normalizedNameDict[inputName] || inputName, opNode, input, params, i);
-                            }
-                        });
-                    });
-                    // Normalize the names of in-embeddings.
-                    _.each(inEmbedding, function (node, name) {
-                        node.name = normalizedNameDict[node.name] || node.name;
-                    });
-                    return graph;
-                }, tracker);
-            });
-        }
-        graph_1.build = build;
-        ;
-        /**
-         * Create a new graphlib.Graph() instance with default parameters
-         */
-        function createGraph(name, type, opt) {
-            if (opt === void 0) { opt = {}; }
-            var graph = new graphlib.Graph(opt);
-            graph.setGraph({
-                name: name,
-                rankdir: 'BT',
-                type: type
-            });
-            return graph;
-        }
-        graph_1.createGraph = createGraph;
-        ;
-        /**
-         * Create a predicate for checking whether a node should be embedded based on
-         * the specified types.
-         */
-        function getEmbedPredicate(types) {
-            return function (node) {
-                // check types
-                for (var i = 0; i < types.length; i++) {
-                    var regExp = new RegExp(types[i]);
-                    if (node.op.match(regExp)) {
-                        return true;
-                    }
-                }
-                return false;
-            };
-        }
-        ;
-        /**
-         * Returns a strict node name (name => name/(name)) to avoid conflicts
-         * where the node name is also a namespace.
-         */
-        function getStrictName(name) {
-            var parts = name.split(graph_1.NAMESPACE_DELIM);
-            return name + graph_1.NAMESPACE_DELIM + '(' + parts[parts.length - 1] + ')';
-        }
-        graph_1.getStrictName = getStrictName;
-        /**
-         * For each op node (embedding or non-embedding), rename it if there is a
-         * non-embedding node under its namespace. For example, assume node name 'A'.
-         * If there is a non-embedding node under its namespace (e.g. 'A/B'), 'A' will
-         * be renamed to 'A/(A)'. Then the namespace 'A' will contain 2 nodes: '(A)'
-         * and 'B'. If all the nodes under 'A' are embedding nodes (e.g. constant and
-         * summary), keep 'A' as an Op node and don't create a namespace.
-         *
-         * @param nodeNames An array of regular (non-embedding) node names.
-         * @param embeddingNodeNames An array of embedding node names.
-         * @return Dictionary object mapping names that need to be renamed to
-         *     new names.
-         */
-        function mapStrictHierarchy(nodeNames, embeddingNodeNames) {
-            /** Dictionary that maps the old new to the new name */
-            var newNameDictionary = {};
-            /** Set used to store all namespaces. */
-            var namespaceSet = {};
-            // sort the nodes to make prefix check faster
-            nodeNames.sort();
-            // look for nodes with a prefix a,a/b -> a/(a),a/b
-            for (var i = 0; i < nodeNames.length - 1; ++i) {
-                var a = nodeNames[i];
-                // Get all the parent namespaces of the current node
-                // and add them in the namespace set.
-                _.each(getHierarchicalPath(a).slice(0, -1), function (ns) {
-                    namespaceSet[ns] = true;
-                });
-                for (var j = i + 1; j < nodeNames.length; ++j) {
-                    var b = nodeNames[j];
-                    if (_.startsWith(b, a)) {
-                        if (b.length > a.length && b.charAt(a.length) === graph_1.NAMESPACE_DELIM) {
-                            newNameDictionary[a] = getStrictName(a);
-                            break;
-                        }
-                    }
-                    else {
-                        break;
-                    }
-                }
-            }
-            // Go through all the embedding node names and rename them in case they
-            // collide with namespaces.
-            _.each(embeddingNodeNames, function (embeddingName) {
-                if (embeddingName in namespaceSet) {
-                    // Rename to follow strict hierarchy.
-                    newNameDictionary[embeddingName] = getStrictName(embeddingName);
-                }
-            });
-            return newNameDictionary;
-        }
-        ;
-        /**
-         * Returns a list of the degrees of each node in the graph.
-         */
-        function degreeSequence(graph) {
-            var degrees = graph.nodes().map(function (name) {
-                return graph.neighbors(name).length;
-            });
-            degrees.sort();
-            return degrees;
-        }
-        ;
-        /**
-         * Returns if the degree sequence of the two graphs is the same.
-         */
-        function hasSimilarDegreeSequence(graph1, graph2) {
-            var dg1 = degreeSequence(graph1);
-            var dg2 = degreeSequence(graph2);
-            for (var i = 0; i < dg1.length; i++) {
-                if (dg1[i] !== dg2[i]) {
-                    return false;
-                }
-            }
-            return true;
-        }
-        graph_1.hasSimilarDegreeSequence = hasSimilarDegreeSequence;
-        ;
-        /**
-         * Returns the hierarchical path of the current node, based on the node's name.
-         * For example, if the name is 'a/b/c', the returned path is
-         * ['a', 'a/b', 'a/b/c'].
-         */
-        function getHierarchicalPath(name, seriesNames) {
-            var path = [];
-            var i = name.indexOf(graph_1.NAMESPACE_DELIM);
-            // Push all parent portions of the path.
-            while (i >= 0) {
-                path.push(name.substring(0, i));
-                i = name.indexOf(graph_1.NAMESPACE_DELIM, i + 1);
-            }
-            // If the node's path is under a series, then add the series node name to the
-            // hierarchical path as the parent of the leaf.
-            if (seriesNames) {
-                var seriesName = seriesNames[name];
-                if (seriesName) {
-                    path.push(seriesName);
-                }
-            }
-            // Push the leaf of the path.
-            path.push(name);
-            return path;
-        }
-        graph_1.getHierarchicalPath = getHierarchicalPath;
-        ;
-        /**
-         * Returns the string for the node inclusion toggle button, dependant
-         * on the provided current InclusionType.
-         */
-        function getIncludeNodeButtonString(include) {
-            if (include === tf.graph.InclusionType.EXCLUDE) {
-                return 'Add to main graph';
-            }
-            else {
-                return 'Remove from main graph';
-            }
-        }
-        graph_1.getIncludeNodeButtonString = getIncludeNodeButtonString;
-        ;
-        /**
-         * Returns the string for the series node grouping toggle button, dependant
-         * on the provided current SeriesGroupingType.
-         */
-        function getGroupSeriesNodeButtonString(group) {
-            if (group === tf.graph.SeriesGroupingType.GROUP) {
-                return 'Ungroup this series of nodes';
-            }
-            else {
-                return 'Group this series of nodes';
-            }
-        }
-        graph_1.getGroupSeriesNodeButtonString = getGroupSeriesNodeButtonString;
-        ;
-        /**
-         * Toggle the node series grouping option in the provided map, setting it
-         * to ungroup if the series is not already in the map.
-         */
-        function toggleNodeSeriesGroup(map, name) {
-            if (!(name in map) || map[name] === tf.graph.SeriesGroupingType.GROUP) {
-                map[name] = tf.graph.SeriesGroupingType.UNGROUP;
-            }
-            else {
-                map[name] = tf.graph.SeriesGroupingType.GROUP;
-            }
-        }
-        graph_1.toggleNodeSeriesGroup = toggleNodeSeriesGroup;
-        ;
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // close module tf.graph
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/**
- * Package for the Graph Hierarchy for TensorFlow graph.
- */
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph_1) {
-        var hierarchy;
-        (function (hierarchy_1) {
-            /**
-             * Class for the Graph Hierarchy for TensorFlow graph.
-             */
-            var HierarchyImpl = (function () {
-                function HierarchyImpl() {
-                    this.hasShapeInfo = false;
-                    this.maxMetaEdgeSize = 1;
-                    this.root = graph_1.createMetanode(graph_1.ROOT_NAME, { compound: true });
-                    this.templates = null;
-                    this.devices = null;
-                    /**
-                     * @type {Object} Dictionary object that maps node name to the node
-                     * (could be op-node, metanode, or series-node)
-                     */
-                    this.index = {};
-                    this.index[graph_1.ROOT_NAME] = this.root;
-                    this.orderings = {};
-                }
-                HierarchyImpl.prototype.getNodeMap = function () {
-                    return this.index;
-                };
-                HierarchyImpl.prototype.node = function (name) {
-                    return this.index[name];
-                };
-                HierarchyImpl.prototype.setNode = function (name, node) {
-                    this.index[name] = node;
-                };
-                /**
-                 * Given the name of a node in this hierarchy, get its bridgegraph, creating
-                 * it on the fly if necessary. If the node is not a GroupNode, then this
-                 * method returns null. If the provided name does not map to a node in the
-                 * hierarchy, an error will be thrown.
-                 */
-                HierarchyImpl.prototype.getBridgegraph = function (nodeName) {
-                    var _this = this;
-                    var node = this.index[nodeName];
-                    if (!node) {
-                        throw Error('Could not find node in hierarchy: ' + nodeName);
-                    }
-                    if (!('metagraph' in node)) {
-                        return null;
-                    }
-                    var groupNode = node;
-                    if (groupNode.bridgegraph) {
-                        return groupNode.bridgegraph;
-                    }
-                    var bridgegraph = groupNode.bridgegraph =
-                        graph_1.createGraph('BRIDGEGRAPH', graph_1.GraphType.BRIDGE);
-                    if (!node.parentNode || !('metagraph' in node.parentNode)) {
-                        return bridgegraph;
-                    }
-                    var parentNode = node.parentNode;
-                    var parentMetagraph = parentNode.metagraph;
-                    var parentBridgegraph = this.getBridgegraph(parentNode.name);
-                    // For each of the parent node's two Metaedge containing graphs, process
-                    // each Metaedge involving this node.
-                    _.each([parentMetagraph, parentBridgegraph], function (parentGraph) {
-                        _(parentGraph.edges())
-                            .filter(function (e) { return e.v === nodeName || e.w === nodeName; })
-                            .each(function (parentEdgeObj) {
-                            var inbound = parentEdgeObj.w === nodeName;
-                            var parentMetaedge = parentGraph.edge(parentEdgeObj);
-                            // The parent's Metaedge represents some number of underlying
-                            // BaseEdges from the original full graph. For each of those, we need
-                            // to determine which immediate child is involved and make sure
-                            // there's a Metaedge in the bridgegraph that covers it.
-                            _.each(parentMetaedge.baseEdgeList, function (baseEdge) {
-                                // Based on the direction, figure out which is the descendant node
-                                // and which is the 'other' node (sibling of parent or ancestor).
-                                var _a = inbound ?
-                                    [baseEdge.w, parentEdgeObj.v] :
-                                    [baseEdge.v, parentEdgeObj.w], descendantName = _a[0], otherName = _a[1];
-                                // Determine the immediate child containing this descendant node.
-                                var childName = _this.getChildName(nodeName, descendantName);
-                                // Look for an existing Metaedge in the bridgegraph (or create a
-                                // new one) that covers the relationship between child and other.
-                                var bridgeEdgeObj = {
-                                    v: inbound ? otherName : childName,
-                                    w: inbound ? childName : otherName,
-                                };
-                                var bridgeMetaedge = bridgegraph.edge(bridgeEdgeObj);
-                                if (!bridgeMetaedge) {
-                                    bridgeMetaedge = graph_1.createMetaedge(bridgeEdgeObj.v, bridgeEdgeObj.w);
-                                    bridgeMetaedge.inbound = inbound;
-                                    bridgegraph.setEdge(bridgeEdgeObj.v, bridgeEdgeObj.w, bridgeMetaedge);
-                                }
-                                // Copy the BaseEdge from the parent's Metaedge into this
-                                // bridgegraph Metaedge.
-                                bridgeMetaedge.addBaseEdge(baseEdge, _this);
-                            });
-                        })
-                            .value(); // force lodash chain execution.
-                    });
-                    return bridgegraph;
-                };
-                /**
-                 * Utility function for determining the name of the immediate child under a
-                 * node for a given descendant path. If the descendant corresponds to no
-                 * immediate child, an error is thrown.
-                 */
-                HierarchyImpl.prototype.getChildName = function (nodeName, descendantName) {
-                    // Walk up the hierarchy from the descendant to find the child.
-                    var currentNode = this.index[descendantName];
-                    while (currentNode) {
-                        if (currentNode.parentNode && currentNode.parentNode.name === nodeName) {
-                            return currentNode.name;
-                        }
-                        currentNode = currentNode.parentNode;
-                    }
-                    throw Error('Could not find immediate child for descendant: ' + descendantName);
-                };
-                ;
-                /** Given the name of a node, return its incoming metaedges. */
-                HierarchyImpl.prototype.getPredecessors = function (nodeName) {
-                    var _this = this;
-                    var node = this.index[nodeName];
-                    if (!node) {
-                        throw Error('Could not find node with name: ' + nodeName);
-                    }
-                    var predecessors = this.getOneWayEdges(node, true);
-                    // Add embedded predecessors, such as constants.
-                    if (!node.isGroupNode) {
-                        _.each(node.inEmbeddings, function (embeddedNode) {
-                            _.each(node.inputs, function (input) {
-                                if (input.name === embeddedNode.name) {
-                                    // Make a new metaedge holding the edge between the
-                                    // node and the in-embedding.
-                                    var metaedge = new graph_1.MetaedgeImpl(embeddedNode.name, nodeName);
-                                    metaedge.addBaseEdge({
-                                        isControlDependency: input.isControlDependency,
-                                        outputTensorIndex: input.outputTensorIndex,
-                                        isReferenceEdge: false,
-                                        v: embeddedNode.name,
-                                        w: nodeName
-                                    }, _this);
-                                    predecessors.regular.push(metaedge);
-                                }
-                            });
-                        });
-                    }
-                    return predecessors;
-                };
-                /**
-                 * Given the name of a node, return its outgoing metaedges.
-                 *
-                 * This is the inverse of getPredecessors(). See that method's documentation
-                 * for an in-depth example.
-                 */
-                HierarchyImpl.prototype.getSuccessors = function (nodeName) {
-                    var _this = this;
-                    var node = this.index[nodeName];
-                    if (!node) {
-                        throw Error('Could not find node with name: ' + nodeName);
-                    }
-                    var successors = this.getOneWayEdges(node, false);
-                    // Add embedded successors, such as summaries.
-                    if (!node.isGroupNode) {
-                        _.each(node.outEmbeddings, function (embeddedNode) {
-                            _.each(embeddedNode.inputs, function (input) {
-                                if (input.name === nodeName) {
-                                    // Make a new metaedge holding the edge between the
-                                    // node and the out-embedding.
-                                    var metaedge = new graph_1.MetaedgeImpl(nodeName, embeddedNode.name);
-                                    metaedge.addBaseEdge({
-                                        isControlDependency: input.isControlDependency,
-                                        outputTensorIndex: input.outputTensorIndex,
-                                        isReferenceEdge: false,
-                                        v: nodeName,
-                                        w: embeddedNode.name
-                                    }, _this);
-                                    successors.regular.push(metaedge);
-                                }
-                            });
-                        });
-                    }
-                    return successors;
-                };
-                /** Helper method for getPredecessors and getSuccessors */
-                HierarchyImpl.prototype.getOneWayEdges = function (node, inEdges) {
-                    var edges = { control: [], regular: [] };
-                    // A node with no parent cannot have any edges.
-                    if (!node.parentNode || !node.parentNode.isGroupNode) {
-                        return edges;
-                    }
-                    var parentNode = node.parentNode;
-                    var metagraph = parentNode.metagraph;
-                    var bridgegraph = this.getBridgegraph(parentNode.name);
-                    findEdgeTargetsInGraph(metagraph, node, inEdges, edges);
-                    findEdgeTargetsInGraph(bridgegraph, node, inEdges, edges);
-                    return edges;
-                };
-                /**
-                 * For a given GroupNode, get or calculate an object which describes a
-                 * topological ordering of child nodes within that GroupNode's metagraph.
-                 *
-                 * This ordering is used when rendering bridge control edges which are
-                 * sometimes backwards relative to the dataflow.
-                 *
-                 * For example, say we have a graph with two edges A->B and A->C, and we're
-                 * interested in the ordering under ROOT. In this case, any of the following
-                 * would be legitimate return values:
-                 *
-                 *  - { 'A': 0, 'B': 1, 'C': 2 } -- most likely
-                 *  - { 'A': 0, 'B': 2, 'C': 1 } -- less likely
-                 *  - { 'A': 12, 'B': 100, 'C': 99 } -- unlikely, but still OK
-                 *
-                 * The algorithm does not guarantee that all numbers from 0-N (where N is
-                 * the number of nodes) appear exactly once. Rather it guarantees that if
-                 * there is a path between two nodes, the earlier one will have a lower
-                 * number in the ordering hash.
-                 *
-                 * When generating the ordering, we ignore control Metaedges (those which
-                 * represent only BaseEdges that have isControlDependency set to true).
-                 *
-                 * If there is no node with the specified name, an error is thrown. If the
-                 * node with the specified name is not a group node, null is returned.
-                 */
-                HierarchyImpl.prototype.getTopologicalOrdering = function (nodeName) {
-                    var node = this.index[nodeName];
-                    if (!node) {
-                        throw Error('Could not find node with name: ' + nodeName);
-                    }
-                    if (!node.isGroupNode) {
-                        return null;
-                    }
-                    if (nodeName in this.orderings) {
-                        return this.orderings[nodeName];
-                    }
-                    // Mapping of a child node names to lists of their successors.
-                    var successors = {};
-                    // Set of node names which have appeared as a destination.
-                    var destinations = {};
-                    var metagraph = node.metagraph;
-                    _.each(metagraph.edges(), function (e) {
-                        if (!metagraph.edge(e).numRegularEdges) {
-                            return; // Skip control edges.
-                        }
-                        // Keep track of successors and destinations.
-                        if (!(e.v in successors)) {
-                            successors[e.v] = [];
-                        }
-                        successors[e.v].push(e.w);
-                        destinations[e.w] = true;
-                    });
-                    // Seed the queue with true sources (those that are not destinations).
-                    var queue = _.difference(_.keys(successors), _.keys(destinations));
-                    // Produce an ordering by traversing the graph breadth first.
-                    var ordering = this.orderings[nodeName] = {};
-                    var index = 0;
-                    while (queue.length) {
-                        var childName = queue.shift();
-                        ordering[childName] = index++;
-                        _.each(successors[childName], function (succName) { return queue.push(succName); });
-                        delete successors[childName]; // Prevent cycles from infinite looping.
-                    }
-                    return ordering;
-                };
-                /**
-                 * Returns a d3 Ordinal function that can be used to look up the index of
-                 * a node based on its template id.
-                 */
-                HierarchyImpl.prototype.getTemplateIndex = function () {
-                    var templateNames = d3.keys(this.templates);
-                    var templateIndex = d3.scale.ordinal()
-                        .domain(templateNames)
-                        .range(d3.range(0, templateNames.length));
-                    return function (templateId) { return templateIndex(templateId); };
-                };
-                return HierarchyImpl;
-            }());
-            /**
-             * Internal utility function - given a graph (should be either a metagraph or a
-             * bridgegraph) and a node which is known to be in that graph, determine
-             * the other ends of edges that involve that node in the direction specified
-             * by whether it's inbound.
-             *
-             * For example if you wanted to find the predecessors of a node, you'd call
-             * this method for the parent's metagraph and bridgegraph, specifying inbound
-             * as true (look at the source of inbound edges to the specified node).
-             *
-             * Discovered target names are appended to the targets array.
-             */
-            function findEdgeTargetsInGraph(graph, node, inbound, targets) {
-                var edges = inbound ? graph.inEdges(node.name) : graph.outEdges(node.name);
-                _.each(edges, function (e) {
-                    var metaedge = graph.edge(e);
-                    var targetList = metaedge.numRegularEdges ? targets.regular : targets.control;
-                    targetList.push(metaedge);
-                });
-            }
-            /**
-             * @param graph The raw graph.
-             * @param params Parameters used when building a hierarchy.
-             */
-            function build(graph, params, tracker) {
-                var h = new HierarchyImpl();
-                var seriesNames = {};
-                return tf.graph.util
-                    .runAsyncTask('Adding nodes', 20, function () {
-                    // Get all the possible device names.
-                    var deviceNames = {};
-                    _.each(graph.nodes, function (node, nodeName) {
-                        if (node.device != null) {
-                            deviceNames[node.device] = true;
-                        }
-                    });
-                    h.devices = _.keys(deviceNames);
-                    addNodes(h, graph);
-                }, tracker)
-                    .then(function () {
-                    return tf.graph.util.runAsyncTask('Detect series', 20, function () {
-                        if (params.seriesNodeMinSize > 0) {
-                            groupSeries(h.root, h, seriesNames, params.seriesNodeMinSize, params.seriesMap);
-                        }
-                    }, tracker);
-                })
-                    .then(function () {
-                    return tf.graph.util.runAsyncTask('Adding edges', 30, function () {
-                        addEdges(h, graph, seriesNames);
-                    }, tracker);
-                })
-                    .then(function () {
-                    return tf.graph.util.runAsyncTask('Finding similar subgraphs', 30, function () {
-                        h.templates = graph_1.template.detect(h, params.verifyTemplate);
-                    }, tracker);
-                })
-                    .then(function () { return h; });
-            }
-            hierarchy_1.build = build;
-            ;
-            function joinAndAggregateStats(h, stats) {
-                // Get all the possible device names.
-                var deviceNames = {};
-                _.each(h.root.leaves(), function (nodeName) {
-                    var leaf = h.node(nodeName);
-                    if (leaf.device != null) {
-                        deviceNames[leaf.device] = true;
-                    }
-                });
-                h.devices = _.keys(deviceNames);
-                // Reset stats for each group node.
-                _.each(h.getNodeMap(), function (node, nodeName) {
-                    if (node.isGroupNode) {
-                        node.stats = new graph_1.NodeStats(null);
-                        node.deviceHistogram = {};
-                    }
-                });
-                // Bubble-up the stats and device distribution from leaves to parents.
-                _.each(h.root.leaves(), function (nodeName) {
-                    var leaf = h.node(nodeName);
-                    var node = leaf;
-                    while (node.parentNode != null) {
-                        if (leaf.device != null) {
-                            var deviceHistogram = node.parentNode.deviceHistogram;
-                            deviceHistogram[leaf.device] = (deviceHistogram[leaf.device] || 0) + 1;
-                        }
-                        if (leaf.stats != null) {
-                            node.parentNode.stats.combine(leaf.stats);
-                        }
-                        node = node.parentNode;
-                    }
-                });
-            }
-            hierarchy_1.joinAndAggregateStats = joinAndAggregateStats;
-            /**
-             * Creates the metanodes in the hierarchical graph and assigns parent-child
-             * relationship between them.
-             */
-            function addNodes(h, graph) {
-                _.each(graph.nodes, function (node, nodeName) {
-                    var path = graph_1.getHierarchicalPath(node.name);
-                    var parent = h.root;
-                    parent.depth = Math.max(path.length, parent.depth);
-                    // Create parent metanodes for each depth. For example if the node name
-                    // is 'a/b/c', then create metanodes 'a' and 'a/b', where 'a/b' is a child
-                    // of a.
-                    for (var i = 0; i < path.length; i++) {
-                        parent.depth = Math.max(parent.depth, path.length - i);
-                        parent.cardinality += node.cardinality;
-                        parent.opHistogram[node.op] = (parent.opHistogram[node.op] || 0) + 1;
-                        if (node.device != null) {
-                            parent.deviceHistogram[node.device] =
-                                (parent.deviceHistogram[node.device] || 0) + 1;
-                        }
-                        if (i === path.length - 1) {
-                            break;
-                        }
-                        var name_1 = path[i];
-                        var child = h.node(name_1);
-                        if (!child) {
-                            child = graph_1.createMetanode(name_1);
-                            child.parentNode = parent;
-                            h.setNode(name_1, child);
-                            parent.metagraph.setNode(name_1, child);
-                        }
-                        parent = child;
-                    }
-                    // Assuming node name is 'a/b/c', assign the OpNode as a child of the
-                    // metanode 'a/b'.
-                    h.setNode(node.name, node);
-                    node.parentNode = parent;
-                    parent.metagraph.setNode(node.name, node);
-                    // Add each of the in-embeddings and out-embeddings in the hierarchy.
-                    _.each(node.inEmbeddings, function (embedding) {
-                        h.setNode(embedding.name, embedding);
-                        embedding.parentNode = node;
-                    });
-                    _.each(node.outEmbeddings, function (embedding) {
-                        h.setNode(embedding.name, embedding);
-                        embedding.parentNode = node;
-                    });
-                });
-            }
-            ;
-            /**
-             * For each metanode in the hierarchical graph, this method adds:
-             * the edges in the metagraph. These are edges between nodes
-             * that share the same parent.
-             */
-            function addEdges(h, graph, seriesNames) {
-                var nodeIndex = h.getNodeMap();
-                // Ancestor paths for the source and destination nodes of an edge. These are
-                // reused for each edge rather than allocating new ones. It's about 10% faster
-                // than allocating new ones on each pass through the loop.
-                var sourcePath = [];
-                var destPath = [];
-                // Insert the ancestor path for a node into the provided array, including the
-                // node itself. Return the index of the last node inserted (always ROOT).
-                var getPath = function (node, path) {
-                    var i = 0;
-                    while (node) {
-                        path[i++] = node.name;
-                        node = node.parentNode;
-                    }
-                    return i - 1;
-                };
-                _.each(graph.edges, function (baseEdge) {
-                    // Get the hierarchical paths for the source and destination of the edge.
-                    var sourceAncestorIndex = getPath(graph.nodes[baseEdge.v], sourcePath);
-                    var destAncestorIndex = getPath(graph.nodes[baseEdge.w], destPath);
-                    // If the hierarchical path cannot be found for either endpoint, then we
-                    // cannot create the edge. This happens for example when a node has a
-                    // control dependency on a summary node, which are embedded.
-                    if (sourceAncestorIndex === -1 || destAncestorIndex === -1) {
-                        return;
-                    }
-                    // Find the lowest shared ancestor between source and dest by looking for
-                    // the highest nodes that differ between their ancestor paths.
-                    while (sourcePath[sourceAncestorIndex] === destPath[destAncestorIndex]) {
-                        sourceAncestorIndex--;
-                        destAncestorIndex--;
-                        if (sourceAncestorIndex < 0 || destAncestorIndex < 0) {
-                            // This would only occur if the two nodes were the same (a cycle in the
-                            // graph), or if one endpoint was a strict ancestor of the other. The
-                            // latter shouldn't happen because we rename nodes which are both
-                            // metanodes and op nodes. E.g. 'A/B' becomes 'A/B/(B)'.
-                            throw Error('No difference found between ancestor paths.');
-                        }
-                    }
-                    var sharedAncestorNode = nodeIndex[sourcePath[sourceAncestorIndex + 1]];
-                    var sourceAncestorName = sourcePath[sourceAncestorIndex];
-                    var destAncestorName = destPath[destAncestorIndex];
-                    // Find or create the Metaedge which should contain this BaseEdge inside
-                    // the shared ancestor.
-                    var metaedge = sharedAncestorNode.metagraph.edge(sourceAncestorName, destAncestorName);
-                    if (!metaedge) {
-                        metaedge = graph_1.createMetaedge(sourceAncestorName, destAncestorName);
-                        sharedAncestorNode.metagraph
-                            .setEdge(sourceAncestorName, destAncestorName, metaedge);
-                    }
-                    if (!sharedAncestorNode.hasNonControlEdges &&
-                        !baseEdge.isControlDependency) {
-                        sharedAncestorNode.hasNonControlEdges = true;
-                    }
-                    metaedge.addBaseEdge(baseEdge, h);
-                });
-            }
-            ;
-            /**
-             * Using the hierarchy template information, detect series in the provided
-             * metanode.  For each detected series, create a new SeriesNode
-             * and remove series members from the metanode's metagraph and move them to
-             * the new series node's metagraph.
-             *
-             * @param metanode
-             * @param hierarchy
-             * @param seriesNames Map of node names to their series they are contained in.
-             *     This should be provided empty and is populated by this method.
-             * @param threshold If the series has this many nodes or more, then group them
-             *     into a series.
-             * @param map Map of series names to their series grouping type, if one has
-             *     been set.
-             * @return A dictionary from node name to series node name that contains the
-             *     node.
-             */
-            function groupSeries(metanode, hierarchy, seriesNames, threshold, map) {
-                var metagraph = metanode.metagraph;
-                _.each(metagraph.nodes(), function (n) {
-                    var child = metagraph.node(n);
-                    if (child.type === tf.graph.NodeType.META) {
-                        groupSeries(child, hierarchy, seriesNames, threshold, map);
-                    }
-                });
-                var clusters = clusterNodes(metagraph);
-                var seriesDict = detectSeries(clusters, metagraph);
-                // Add each series node to the graph and add its grouped children to its own
-                // metagraph.
-                _.each(seriesDict, function (seriesNode, seriesName) {
-                    var nodeMemberNames = seriesNode.metagraph.nodes();
-                    _.each(nodeMemberNames, function (n) {
-                        var child = metagraph.node(n);
-                        if (!child.owningSeries) {
-                            child.owningSeries = seriesName;
-                        }
-                    });
-                    // If the series contains less than the threshold number of nodes and
-                    // this series has not been adding to the series map, then set this
-                    // series to be shown ungrouped in the map.
-                    if (nodeMemberNames.length < threshold && !(seriesNode.name in map)) {
-                        map[seriesNode.name] = tf.graph.SeriesGroupingType.UNGROUP;
-                    }
-                    // If the series is in the map as ungrouped then do not group the series.
-                    if (seriesNode.name in map
-                        && map[seriesNode.name] === tf.graph.SeriesGroupingType.UNGROUP) {
-                        return;
-                    }
-                    hierarchy.setNode(seriesName, seriesNode); // add to the index
-                    metagraph.setNode(seriesName, seriesNode);
-                    _.each(nodeMemberNames, function (n) {
-                        var child = metagraph.node(n);
-                        seriesNode.metagraph.setNode(n, child);
-                        seriesNode.parentNode = child.parentNode;
-                        seriesNode.cardinality++;
-                        if (child.device != null) {
-                            seriesNode.deviceHistogram[child.device] =
-                                (seriesNode.deviceHistogram[child.device] || 0) + 1;
-                        }
-                        child.parentNode = seriesNode;
-                        seriesNames[n] = seriesName;
-                        // Remove now-grouped node from its original parent's metagraph.
-                        metagraph.removeNode(n);
-                    });
-                });
-            }
-            ;
-            /** cluster op-nodes with similar op */
-            function clusterNodes(metagraph) {
-                var result = {};
-                return _.reduce(metagraph.nodes(), function (clusters, n) {
-                    var child = metagraph.node(n);
-                    if (child.type === graph_1.NodeType.META) {
-                        // skip metanodes
-                        return clusters;
-                    }
-                    var template = child.op;
-                    if (template) {
-                        clusters[template] = clusters[template] || [];
-                        clusters[template].push(child.name);
-                    }
-                    return clusters;
-                }, result);
-            }
-            /**
-             * For each cluster of op-nodes based op type, try to detect groupings.
-             * Infer series name using by trying to find pattern '<number>' in the node
-             * name.
-             *
-             * @param clusters Dictionary output from clusterNodes().
-             * @param metagraph
-             * @return A dictionary from series name => seriesNode
-             */
-            function detectSeries(clusters, metagraph) {
-                var seriesDict = {};
-                _.each(clusters, function (members, clusterId) {
-                    if (members.length <= 1) {
-                        return;
-                    } // isolated clusters can't make series
-                    /** @type {Object}  A dictionary mapping seriesName to seriesInfoArray,
-                     * which is an array that contains objects with name, id, prefix, suffix,
-                     * and parent properties.
-                     */
-                    var candidatesDict = {};
-                    // Group all nodes that have the same name, with the exception of a
-                    // number at the end of the name after an underscore, which is allowed to
-                    // vary.
-                    _.each(members, function (name) {
-                        var isGroup = name.charAt(name.length - 1) === '*';
-                        var namepath = name.split('/');
-                        var leaf = namepath[namepath.length - 1];
-                        var parent = namepath.slice(0, namepath.length - 1).join('/');
-                        var matches = leaf.match(/^(\D*)_(\d+)$/);
-                        var prefix;
-                        var id;
-                        var suffix = '';
-                        if (matches) {
-                            prefix = matches[1]; // the front non-numeric characters
-                            id = matches[2]; // the digits
-                        }
-                        else {
-                            prefix = isGroup ? leaf.substr(0, leaf.length - 1) : leaf;
-                            id = 0;
-                            suffix = isGroup ? '*' : '';
-                        }
-                        var seriesName = graph_1.getSeriesNodeName(prefix, suffix, parent);
-                        candidatesDict[seriesName] = candidatesDict[seriesName] || [];
-                        var seriesNode = graph_1.createSeriesNode(prefix, suffix, parent, +id, name);
-                        candidatesDict[seriesName].push(seriesNode);
-                    });
-                    // In each group of nodes, group nodes in bunches that have monotonically
-                    // increasing numbers in their names.  Each of these bunches is a series.
-                    _.each(candidatesDict, function (seriesInfoArray, seriesName) {
-                        if (seriesInfoArray.length < 2) {
-                            return;
-                        }
-                        seriesInfoArray.sort(function (a, b) {
-                            return (+a.clusterId) - (+b.clusterId);
-                        });
-                        // Loop through the nodes sorted by its detected series number, grouping
-                        // all nodes with monotonically-increasing series numbers.
-                        var seriesNodes = [seriesInfoArray[0]];
-                        for (var index = 1; index < seriesInfoArray.length; index++) {
-                            var nextNode = seriesInfoArray[index];
-                            if (nextNode.clusterId === seriesNodes[seriesNodes.length - 1].clusterId
-                                + 1) {
-                                seriesNodes.push(nextNode);
-                                continue;
-                            }
-                            addSeriesToDict(seriesNodes, seriesDict, +clusterId, metagraph);
-                            seriesNodes = [nextNode];
-                        }
-                        addSeriesToDict(seriesNodes, seriesDict, +clusterId, metagraph);
-                    });
-                });
-                return seriesDict;
-            }
-            /**
-             * Add a series to the provided dictionary mapping series names to series.
-             *
-             * @param seriesNodes the nodes in the series. Contains
-             *     name, id, prefix, suffix and parent properties of the node.
-             * @param seriesDict the dictionary of series
-             * @param clusterId ID of the template of the nodes of the series
-             * @param metagraph
-             */
-            function addSeriesToDict(seriesNodes, seriesDict, clusterId, metagraph) {
-                if (seriesNodes.length > 1) {
-                    var curSeriesName = graph_1.getSeriesNodeName(seriesNodes[0].prefix, seriesNodes[0].suffix, seriesNodes[0].parent, seriesNodes[0].clusterId, seriesNodes[seriesNodes.length - 1].clusterId);
-                    var curSeriesNode_1 = graph_1.createSeriesNode(seriesNodes[0].prefix, seriesNodes[0].suffix, seriesNodes[0].parent, clusterId, curSeriesName);
-                    _.each(seriesNodes, function (node) {
-                        curSeriesNode_1.ids.push(node.clusterId);
-                        curSeriesNode_1.metagraph.setNode(node.name, metagraph.node(node.name));
-                    });
-                    seriesDict[curSeriesName] = curSeriesNode_1;
-                }
-            }
-        })(hierarchy = graph_1.hierarchy || (graph_1.hierarchy = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // close module tf.graph.hierarchy
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph_1) {
-        var layout;
-        (function (layout) {
-            /** Set of parameters that define the look and feel of the graph. */
-            layout.PARAMS = {
-                animation: {
-                    /** Default duration for graph animations in ms. */
-                    duration: 250
-                },
-                graph: {
-                    /** Graph parameter for metanode. */
-                    meta: {
-                        /**
-                         * Dagre's nodesep param - number of pixels that
-                         * separate nodes horizontally in the layout.
-                         *
-                         * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
-                         */
-                        nodeSep: 5,
-                        /**
-                         * Dagre's ranksep param - number of pixels
-                         * between each rank in the layout.
-                         *
-                         * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
-                         */
-                        rankSep: 25,
-                        /**
-                         * Dagre's edgesep param - number of pixels that separate
-                         * edges horizontally in the layout.
-                         */
-                        edgeSep: 5,
-                    },
-                    /** Graph parameter for metanode. */
-                    series: {
-                        /**
-                         * Dagre's nodesep param - number of pixels that
-                         * separate nodes horizontally in the layout.
-                         *
-                         * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
-                         */
-                        nodeSep: 5,
-                        /**
-                         * Dagre's ranksep param - number of pixels
-                         * between each rank in the layout.
-                         *
-                         * See https://github.com/cpettitt/dagre/wiki#configuring-the-layout
-                         */
-                        rankSep: 25,
-                        /**
-                         * Dagre's edgesep param - number of pixels that separate
-                         * edges horizontally in the layout.
-                         */
-                        edgeSep: 5
-                    },
-                    /**
-                     * Padding is used to correctly position the graph SVG inside of its parent
-                     * element. The padding amounts are applied using an SVG transform of X and
-                     * Y coordinates.
-                     */
-                    padding: { paddingTop: 40, paddingLeft: 20 }
-                },
-                subscene: {
-                    meta: {
-                        paddingTop: 10,
-                        paddingBottom: 10,
-                        paddingLeft: 10,
-                        paddingRight: 10,
-                        /**
-                         * Used to leave room for the label on top of the highest node in
-                         * the core graph.
-                         */
-                        labelHeight: 20,
-                        /** X-space between each extracted node and the core graph. */
-                        extractXOffset: 15,
-                        /** Y-space between each extracted node. */
-                        extractYOffset: 20
-                    },
-                    series: {
-                        paddingTop: 10,
-                        paddingBottom: 10,
-                        paddingLeft: 10,
-                        paddingRight: 10,
-                        labelHeight: 10
-                    }
-                },
-                nodeSize: {
-                    /** Size of meta nodes. */
-                    meta: {
-                        radius: 5,
-                        width: 60,
-                        maxLabelWidth: 52,
-                        /** A scale for the node's height based on number of nodes inside */
-                        height: d3.scale.linear().domain([1, 200]).range([15, 60]).clamp(true),
-                        /** The radius of the circle denoting the expand button. */
-                        expandButtonRadius: 3
-                    },
-                    /** Size of op nodes. */
-                    op: {
-                        width: 15,
-                        height: 6,
-                        radius: 3,
-                        labelOffset: -8,
-                        maxLabelWidth: 30
-                    },
-                    /** Size of series nodes. */
-                    series: {
-                        expanded: {
-                            // For expanded series nodes, width and height will be
-                            // computed to account for the subscene.
-                            radius: 10,
-                            labelOffset: 0,
-                        },
-                        vertical: {
-                            // When unexpanded, series whose underlying metagraphs contain
-                            // one or more non-control edges will show as a vertical stack
-                            // of ellipses.
-                            width: 16,
-                            height: 13,
-                            labelOffset: -13,
-                        },
-                        horizontal: {
-                            // When unexpanded, series whose underlying metagraphs contain
-                            // no non-control edges will show as a horizontal stack of
-                            // ellipses.
-                            width: 24,
-                            height: 8,
-                            radius: 10,
-                            labelOffset: -10,
-                        },
-                    },
-                    /** Size of bridge nodes. */
-                    bridge: {
-                        // NOTE: bridge nodes will normally be invisible, but they must
-                        // take up some space so that the layout step leaves room for
-                        // their edges.
-                        width: 20,
-                        height: 20,
-                        radius: 2,
-                        labelOffset: 0
-                    }
-                },
-                shortcutSize: {
-                    /** Size of shortcuts for op nodes */
-                    op: { width: 10, height: 4 },
-                    /** Size of shortcuts for meta nodes */
-                    meta: { width: 12, height: 4, radius: 1 },
-                    /** Size of shortcuts for series nodes */
-                    series: {
-                        width: 14,
-                        height: 4,
-                    }
-                },
-                annotations: {
-                    /** Maximum possible width of the bounding box for in annotations */
-                    inboxWidth: 50,
-                    /** Maximum possible width of the bounding box for out annotations */
-                    outboxWidth: 50,
-                    /** X-space between the shape and each annotation-node. */
-                    xOffset: 10,
-                    /** Y-space between each annotation-node. */
-                    yOffset: 3,
-                    /** X-space between each annotation-node and its label. */
-                    labelOffset: 2,
-                    /** Defines the max width for annotation label */
-                    maxLabelWidth: 120
-                },
-                constant: { size: { width: 4, height: 4 } },
-                series: {
-                    /** Maximum number of repeated item for unexpanded series node. */
-                    maxStackCount: 3,
-                    /**
-                     * Positioning offset ratio for collapsed stack
-                     * of parallel series (series without edges between its members).
-                     */
-                    parallelStackOffsetRatio: 0.2,
-                    /**
-                     * Positioning offset ratio for collapsed stack
-                     * of tower series (series with edges between its members).
-                     */
-                    towerStackOffsetRatio: 0.5
-                },
-                minimap: {
-                    /** The maximum width/height the minimap can have. */
-                    size: 150
-                }
-            };
-            /** Calculate layout for a scene of a group node. */
-            function layoutScene(renderNodeInfo) {
-                // Update layout, size, and annotations of its children nodes and edges.
-                if (renderNodeInfo.node.isGroupNode) {
-                    layoutChildren(renderNodeInfo);
-                }
-                // Update position of its children nodes and edges
-                if (renderNodeInfo.node.type === graph_1.NodeType.META) {
-                    layoutMetanode(renderNodeInfo);
-                }
-                else if (renderNodeInfo.node.type === graph_1.NodeType.SERIES) {
-                    layoutSeriesNode(renderNodeInfo);
-                }
-            }
-            layout.layoutScene = layoutScene;
-            ;
-            /**
-             * Updates the total width of an unexpanded node which includes the size of its
-             * in and out annotations.
-             */
-            function updateTotalWidthOfNode(renderInfo) {
-                renderInfo.inboxWidth = renderInfo.inAnnotations.list.length > 0 ?
-                    layout.PARAMS.annotations.inboxWidth : 0;
-                renderInfo.outboxWidth = renderInfo.outAnnotations.list.length > 0 ?
-                    layout.PARAMS.annotations.outboxWidth : 0;
-                // Assign the width of the core box (the main shape of the node).
-                renderInfo.coreBox.width = renderInfo.width;
-                renderInfo.coreBox.height = renderInfo.height;
-                // TODO(jimbo): Account for font width rather than using a magic number.
-                var labelLength = renderInfo.node.name.length -
-                    renderInfo.node.name.lastIndexOf(graph_1.NAMESPACE_DELIM) - 1;
-                var charWidth = 3; // 3 pixels per character.
-                // Compute the total width of the node.
-                renderInfo.width = Math.max(renderInfo.coreBox.width +
-                    renderInfo.inboxWidth + renderInfo.outboxWidth, labelLength * charWidth);
-            }
-            /**
-             * Update layout, size, and annotations of its children nodes and edges.
-             */
-            function layoutChildren(renderNodeInfo) {
-                var children = renderNodeInfo.coreGraph.nodes().map(function (n) {
-                    return renderNodeInfo.coreGraph.node(n);
-                }).concat(renderNodeInfo.isolatedInExtract, renderNodeInfo.isolatedOutExtract);
-                _.each(children, function (childNodeInfo) {
-                    // Set size of each child
-                    switch (childNodeInfo.node.type) {
-                        case graph_1.NodeType.OP:
-                            _.extend(childNodeInfo, layout.PARAMS.nodeSize.op);
-                            break;
-                        case graph_1.NodeType.BRIDGE:
-                            _.extend(childNodeInfo, layout.PARAMS.nodeSize.bridge);
-                            break;
-                        case graph_1.NodeType.META:
-                            if (!childNodeInfo.expanded) {
-                                // Set fixed width and scalable height based on cardinality
-                                _.extend(childNodeInfo, layout.PARAMS.nodeSize.meta);
-                                childNodeInfo.height =
-                                    layout.PARAMS.nodeSize.meta.height(childNodeInfo.node.cardinality);
-                            }
-                            else {
-                                var childGroupNodeInfo = childNodeInfo;
-                                layoutScene(childGroupNodeInfo); // Recursively layout its subscene.
-                            }
-                            break;
-                        case graph_1.NodeType.SERIES:
-                            if (childNodeInfo.expanded) {
-                                _.extend(childNodeInfo, layout.PARAMS.nodeSize.series.expanded);
-                                var childGroupNodeInfo = childNodeInfo;
-                                layoutScene(childGroupNodeInfo); // Recursively layout its subscene.
-                            }
-                            else {
-                                var childGroupNodeInfo = childNodeInfo;
-                                var seriesParams = childGroupNodeInfo.node.hasNonControlEdges ?
-                                    layout.PARAMS.nodeSize.series.vertical :
-                                    layout.PARAMS.nodeSize.series.horizontal;
-                                _.extend(childNodeInfo, seriesParams);
-                            }
-                            break;
-                        default:
-                            throw Error('Unrecognized node type: ' + childNodeInfo.node.type);
-                    }
-                    // Compute total width of un-expanded nodes. Width of expanded nodes
-                    // has already been computed.
-                    if (!childNodeInfo.expanded) {
-                        updateTotalWidthOfNode(childNodeInfo);
-                    }
-                    // Layout each child's annotations
-                    layoutAnnotation(childNodeInfo);
-                });
-            }
-            /**
-             * Calculate layout for a graph using dagre
-             * @param graph the graph to be laid out
-             * @param params layout parameters
-             * @return width and height of the core graph
-             */
-            function dagreLayout(graph, params) {
-                _.extend(graph.graph(), {
-                    nodesep: params.nodeSep,
-                    ranksep: params.rankSep,
-                    edgesep: params.edgeSep
-                });
-                var bridgeNodeNames = [];
-                var nonBridgeNodeNames = [];
-                // Split out nodes into bridge and non-bridge nodes, and calculate the total
-                // width we should use for bridge nodes.
-                _.each(graph.nodes(), function (nodeName) {
-                    var nodeInfo = graph.node(nodeName);
-                    if (nodeInfo.node.type === graph_1.NodeType.BRIDGE) {
-                        bridgeNodeNames.push(nodeName);
-                    }
-                    else {
-                        nonBridgeNodeNames.push(nodeName);
-                    }
-                });
-                // If there are no non-bridge nodes, then the graph has zero size.
-                if (!nonBridgeNodeNames.length) {
-                    return {
-                        width: 0,
-                        height: 0,
-                    };
-                }
-                dagre.layout(graph);
-                // Calculate the true bounding box of the graph by iterating over nodes and
-                // edges rather than accepting dagre's word for it. In particular, we should
-                // ignore the extra-wide bridge nodes and bridge edges, and allow for
-                // annotation boxes and labels.
-                var minX = Infinity;
-                var minY = Infinity;
-                var maxX = -Infinity;
-                var maxY = -Infinity;
-                _.each(nonBridgeNodeNames, function (nodeName) {
-                    var nodeInfo = graph.node(nodeName);
-                    var w = 0.5 * nodeInfo.width;
-                    var x1 = nodeInfo.x - w;
-                    var x2 = nodeInfo.x + w;
-                    minX = x1 < minX ? x1 : minX;
-                    maxX = x2 > maxX ? x2 : maxX;
-                    // TODO(jimbo): Account for the height of labels above op nodes here.
-                    var h = 0.5 * nodeInfo.height;
-                    var y1 = nodeInfo.y - h;
-                    var y2 = nodeInfo.y + h;
-                    minY = y1 < minY ? y1 : minY;
-                    maxY = y2 > maxY ? y2 : maxY;
-                });
-                _.each(graph.edges(), function (edgeObj) {
-                    var edgeInfo = graph.edge(edgeObj);
-                    if (edgeInfo.structural) {
-                        return; // Skip structural edges from min/max calculations.
-                    }
-                    // Since the node size passed to dagre includes the in and out
-                    // annotations, the endpoints of the edge produced by dagre may not
-                    // point to the actual node shape (rectangle, ellipse). We correct the
-                    // end-points by finding the intersection of a line between the
-                    // next-to-last (next-to-first) point and the destination (source)
-                    // rectangle.
-                    var sourceNode = graph.node(edgeInfo.metaedge.v);
-                    var destNode = graph.node(edgeInfo.metaedge.w);
-                    // Straight 3-points edges are special case, since they are curved after
-                    // our default correction. To keep them straight, we remove the mid point
-                    // and correct the first and the last point to be the center of the
-                    // source and destination node respectively.
-                    if (edgeInfo.points.length === 3 && isStraightLine(edgeInfo.points)) {
-                        if (sourceNode != null) {
-                            var cxSource = sourceNode.expanded ?
-                                sourceNode.x : computeCXPositionOfNodeShape(sourceNode);
-                            edgeInfo.points[0].x = cxSource;
-                        }
-                        if (destNode != null) {
-                            var cxDest = destNode.expanded ?
-                                destNode.x : computeCXPositionOfNodeShape(destNode);
-                            edgeInfo.points[2].x = cxDest;
-                        }
-                        // Remove the middle point so the edge doesn't curve.
-                        edgeInfo.points = [edgeInfo.points[0], edgeInfo.points[1]];
-                    }
-                    // Correct the destination endpoint of the edge.
-                    var nextToLastPoint = edgeInfo.points[edgeInfo.points.length - 2];
-                    // The destination node might be null if this is a bridge edge.
-                    if (destNode != null) {
-                        edgeInfo.points[edgeInfo.points.length - 1] =
-                            intersectPointAndNode(nextToLastPoint, destNode);
-                    }
-                    // Correct the source endpoint of the edge.
-                    var secondPoint = edgeInfo.points[1];
-                    // The source might be null if this is a bridge edge.
-                    if (sourceNode != null) {
-                        edgeInfo.points[0] = intersectPointAndNode(secondPoint, sourceNode);
-                    }
-                    _.each(edgeInfo.points, function (point) {
-                        minX = point.x < minX ? point.x : minX;
-                        maxX = point.x > maxX ? point.x : maxX;
-                        minY = point.y < minY ? point.y : minY;
-                        maxY = point.y > maxY ? point.y : maxY;
-                    });
-                });
-                // Shift all nodes and edge points to account for the left-padding amount,
-                // and the invisible bridge nodes.
-                _.each(graph.nodes(), function (nodeName) {
-                    var nodeInfo = graph.node(nodeName);
-                    nodeInfo.x -= minX;
-                    nodeInfo.y -= minY;
-                });
-                _.each(graph.edges(), function (edgeObj) {
-                    _.each(graph.edge(edgeObj).points, function (point) {
-                        point.x -= minX;
-                        point.y -= minY;
-                    });
-                });
-                return {
-                    width: maxX - minX,
-                    height: maxY - minY
-                };
-            }
-            /** Layout a metanode. Only called for an expanded node. */
-            function layoutMetanode(renderNodeInfo) {
-                // First, copy params specific to meta nodes onto this render info object.
-                var params = layout.PARAMS.subscene.meta;
-                _.extend(renderNodeInfo, params);
-                // Invoke dagre.layout() on the core graph and record the bounding box
-                // dimensions.
-                _.extend(renderNodeInfo.coreBox, dagreLayout(renderNodeInfo.coreGraph, layout.PARAMS.graph.meta));
-                // Calculate the position of nodes in isolatedInExtract relative to the
-                // top-left corner of inExtractBox (the bounding box for all inExtract nodes)
-                // and calculate the size of the inExtractBox.
-                var maxInExtractWidth = _.max(renderNodeInfo.isolatedInExtract, function (renderNode) { return renderNode.width; }).width;
-                renderNodeInfo.inExtractBox.width = maxInExtractWidth != null ?
-                    maxInExtractWidth : 0;
-                renderNodeInfo.inExtractBox.height =
-                    _.reduce(renderNodeInfo.isolatedInExtract, function (height, child, i) {
-                        var yOffset = i > 0 ? params.extractYOffset : 0;
-                        // use width/height here to avoid overlaps between extracts
-                        child.x = 0;
-                        child.y = height + yOffset + child.height / 2;
-                        return height + yOffset + child.height;
-                    }, 0);
-                // Calculate the position of nodes in isolatedOutExtract relative to the
-                // top-left corner of outExtractBox (the bounding box for all outExtract
-                // nodes) and calculate the size of the outExtractBox.
-                var maxOutExtractWidth = _.max(renderNodeInfo.isolatedOutExtract, function (renderNode) { return renderNode.width; }).width;
-                renderNodeInfo.outExtractBox.width = maxOutExtractWidth != null ?
-                    maxOutExtractWidth : 0;
-                renderNodeInfo.outExtractBox.height =
-                    _.reduce(renderNodeInfo.isolatedOutExtract, function (height, child, i) {
-                        var yOffset = i > 0 ? params.extractYOffset : 0;
-                        // use width/height here to avoid overlaps between extracts
-                        child.x = 0;
-                        child.y = height + yOffset + child.height / 2;
-                        return height + yOffset + child.height;
-                    }, 0);
-                // Compute the total padding between the core graph, in-extract and
-                // out-extract boxes.
-                var numParts = 0;
-                if (renderNodeInfo.isolatedInExtract.length > 0) {
-                    numParts++;
-                }
-                if (renderNodeInfo.isolatedOutExtract.length > 0) {
-                    numParts++;
-                }
-                if (renderNodeInfo.coreGraph.nodeCount() > 0) {
-                    numParts++;
-                }
-                var offset = layout.PARAMS.subscene.meta.extractXOffset;
-                var padding = numParts <= 1 ? 0 : (numParts <= 2 ? offset : 2 * offset);
-                // Add the in-extract and out-extract width to the core box width.
-                renderNodeInfo.coreBox.width += renderNodeInfo.inExtractBox.width +
-                    renderNodeInfo.outExtractBox.width + padding;
-                renderNodeInfo.coreBox.height =
-                    params.labelHeight +
-                        Math.max(renderNodeInfo.inExtractBox.height, renderNodeInfo.coreBox.height, renderNodeInfo.outExtractBox.height);
-                // Determine the whole metanode's width (from left to right).
-                renderNodeInfo.width = renderNodeInfo.coreBox.width +
-                    params.paddingLeft + params.paddingRight;
-                // Determine the whole metanode's height (from top to bottom).
-                renderNodeInfo.height =
-                    renderNodeInfo.paddingTop +
-                        renderNodeInfo.coreBox.height +
-                        renderNodeInfo.paddingBottom;
-            }
-            /**
-             * Calculate layout for series node's core graph. Only called for an expanded
-             * series.
-             */
-            function layoutSeriesNode(node) {
-                var graph = node.coreGraph;
-                var params = layout.PARAMS.subscene.series;
-                _.extend(node, params);
-                // Layout the core.
-                _.extend(node.coreBox, dagreLayout(node.coreGraph, layout.PARAMS.graph.series));
-                _.each(graph.nodes(), function (nodeName) {
-                    graph.node(nodeName).excluded = false;
-                });
-                // Series do not have in/outExtractBox so no need to include them here.
-                node.width = node.coreBox.width + params.paddingLeft + params.paddingRight;
-                node.height = node.coreBox.height + params.paddingTop + params.paddingBottom;
-            }
-            /**
-             * Calculate layout for annotations of a given node.
-             * This will modify positions of the given node and its annotations.
-             *
-             * @see tf.graph.render.Node and tf.graph.render.Annotation
-             * for description of each property of each render node.
-             *
-             */
-            function layoutAnnotation(renderNodeInfo) {
-                // If the render node is an expanded metanode, then its annotations will not
-                // be visible and we should skip the annotation calculations.
-                if (renderNodeInfo.expanded) {
-                    return;
-                }
-                var inAnnotations = renderNodeInfo.inAnnotations.list;
-                var outAnnotations = renderNodeInfo.outAnnotations.list;
-                // Calculate size for in-annotations
-                _.each(inAnnotations, function (a) { return sizeAnnotation(a); });
-                // Calculate size for out-annotations
-                _.each(outAnnotations, function (a) { return sizeAnnotation(a); });
-                var params = layout.PARAMS.annotations;
-                // Calculate annotation node position (a.dx, a.dy)
-                // and total height for in-annotations
-                // After this chunk of code:
-                // inboxHeight = sum of annotation heights+ (annotation.length - 1 * yOffset)
-                var inboxHeight = _.reduce(inAnnotations, function (height, a, i) {
-                    var yOffset = i > 0 ? params.yOffset : 0;
-                    a.dx = -(renderNodeInfo.coreBox.width + a.width) / 2 - params.xOffset;
-                    a.dy = height + yOffset + a.height / 2;
-                    return height + yOffset + a.height;
-                }, 0);
-                _.each(inAnnotations, function (a) {
-                    a.dy -= inboxHeight / 2;
-                    a.labelOffset = params.labelOffset;
-                });
-                // Calculate annotation node position (a.dx, a.dy)
-                // and total height for out-annotations
-                // After this chunk of code:
-                // outboxHeight = sum of annotation heights +
-                //                (annotation.length - 1 * yOffset)
-                var outboxHeight = _.reduce(outAnnotations, function (height, a, i) {
-                    var yOffset = i > 0 ? params.yOffset : 0;
-                    a.dx = (renderNodeInfo.coreBox.width + a.width) / 2 + params.xOffset;
-                    a.dy = height + yOffset + a.height / 2;
-                    return height + yOffset + a.height;
-                }, 0);
-                _.each(outAnnotations, function (a) {
-                    // adjust by (half of ) the total height
-                    // so dy is relative to the host node's center.
-                    a.dy -= outboxHeight / 2;
-                    a.labelOffset = params.labelOffset;
-                });
-                // Creating scales for touch point between the in-annotation edges
-                // and their hosts.
-                var inTouchHeight = Math.min(renderNodeInfo.height / 2 - renderNodeInfo.radius, inboxHeight / 2);
-                inTouchHeight = inTouchHeight < 0 ? 0 : inTouchHeight;
-                var inY = d3.scale.linear()
-                    .domain([0, inAnnotations.length - 1])
-                    .range([-inTouchHeight, inTouchHeight]);
-                // Calculate annotation edge position
-                _.each(inAnnotations, function (a, i) {
-                    a.points = [
-                        // The annotation node end
-                        {
-                            dx: a.dx + a.width / 2,
-                            dy: a.dy
-                        },
-                        // The host node end
-                        {
-                            dx: -renderNodeInfo.coreBox.width / 2,
-                            // only use scale if there are more than one,
-                            // otherwise center it vertically
-                            dy: inAnnotations.length > 1 ? inY(i) : 0
-                        }
-                    ];
-                });
-                // Creating scales for touch point between the out-annotation edges
-                // and their hosts.
-                var outTouchHeight = Math.min(renderNodeInfo.height / 2 - renderNodeInfo.radius, outboxHeight / 2);
-                outTouchHeight = outTouchHeight < 0 ? 0 : outTouchHeight;
-                var outY = d3.scale.linear()
-                    .domain([0, outAnnotations.length - 1])
-                    .range([-outTouchHeight, outTouchHeight]);
-                _.each(outAnnotations, function (a, i) {
-                    // Add point from the border of the annotation node
-                    a.points = [
-                        // The host node end
-                        {
-                            dx: renderNodeInfo.coreBox.width / 2,
-                            // only use scale if there are more than one,
-                            // otherwise center it vertically
-                            dy: outAnnotations.length > 1 ? outY(i) : 0
-                        },
-                        // The annotation node end
-                        {
-                            dx: a.dx - a.width / 2,
-                            dy: a.dy
-                        }
-                    ];
-                });
-                renderNodeInfo.height =
-                    Math.max(renderNodeInfo.height, inboxHeight, outboxHeight);
-            }
-            /**
-             * Set size of an annotation node.
-             */
-            function sizeAnnotation(a) {
-                switch (a.annotationType) {
-                    case graph_1.render.AnnotationType.CONSTANT:
-                        _.extend(a, layout.PARAMS.constant.size);
-                        break;
-                    case graph_1.render.AnnotationType.SHORTCUT:
-                        if (a.node.type === graph_1.NodeType.OP) {
-                            _.extend(a, layout.PARAMS.shortcutSize.op);
-                        }
-                        else if (a.node.type === graph_1.NodeType.META) {
-                            _.extend(a, layout.PARAMS.shortcutSize.meta);
-                        }
-                        else if (a.node.type === graph_1.NodeType.SERIES) {
-                            _.extend(a, layout.PARAMS.shortcutSize.series);
-                        }
-                        else {
-                            throw Error('Invalid node type: ' + a.node.type);
-                        }
-                        break;
-                    case graph_1.render.AnnotationType.SUMMARY:
-                        _.extend(a, layout.PARAMS.constant.size);
-                        break;
-                }
-            }
-            /**
-             * Determines the center position of the node's shape. The position depends
-             * on if the node has in and out-annotations.
-             */
-            function computeCXPositionOfNodeShape(renderInfo) {
-                if (renderInfo.expanded) {
-                    return renderInfo.x;
-                }
-                var dx = renderInfo.inAnnotations.list.length ? renderInfo.inboxWidth : 0;
-                return renderInfo.x - renderInfo.width / 2 + dx +
-                    renderInfo.coreBox.width / 2;
-            }
-            layout.computeCXPositionOfNodeShape = computeCXPositionOfNodeShape;
-            /** Returns the angle (in degrees) between two points. */
-            function angleBetweenTwoPoints(a, b) {
-                var dx = b.x - a.x;
-                var dy = b.y - a.y;
-                return 180 * Math.atan(dy / dx) / Math.PI;
-            }
-            /**
-             * Returns if a line going through the specified points is a straight line.
-             */
-            function isStraightLine(points) {
-                var angle = angleBetweenTwoPoints(points[0], points[1]);
-                for (var i = 1; i < points.length - 1; i++) {
-                    var newAngle = angleBetweenTwoPoints(points[i], points[i + 1]);
-                    // Have a tolerance of 1 degree.
-                    if (Math.abs(newAngle - angle) > 1) {
-                        return false;
-                    }
-                    angle = newAngle;
-                }
-                return true;
-            }
-            /**
-             * Returns the intersection of a line between the provided point
-             * and the provided rectangle.
-             */
-            function intersectPointAndNode(point, node) {
-                // cx and cy are the center of the rectangle.
-                var cx = node.expanded ?
-                    node.x : computeCXPositionOfNodeShape(node);
-                var cy = node.y;
-                // Calculate the slope
-                var dx = point.x - cx;
-                var dy = point.y - cy;
-                var w = node.expanded ? node.width : node.coreBox.width;
-                var h = node.expanded ? node.height : node.coreBox.height;
-                var deltaX, deltaY;
-                if (Math.abs(dy) * w / 2 > Math.abs(dx) * h / 2) {
-                    // The intersection is above or below the rectangle.
-                    if (dy < 0) {
-                        h = -h;
-                    }
-                    deltaX = dy === 0 ? 0 : h / 2 * dx / dy;
-                    deltaY = h / 2;
-                }
-                else {
-                    // The intersection is left or right of the rectangle.
-                    if (dx < 0) {
-                        w = -w;
-                    }
-                    deltaX = w / 2;
-                    deltaY = dx === 0 ? 0 : w / 2 * dy / dx;
-                }
-                return { x: cx + deltaX, y: cy + deltaY };
-            }
-        })(layout = graph_1.layout || (graph_1.layout = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // close module
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph) {
-        var parser;
-        (function (parser) {
-            /**
-             * Parses a native js value, which can be either a string, boolean or number.
-             *
-             * @param value The value to be parsed.
-             */
-            function parseValue(value) {
-                if (value === 'true') {
-                    return true;
-                }
-                if (value === 'false') {
-                    return false;
-                }
-                var firstChar = value[0];
-                if (firstChar === '"') {
-                    return value.substring(1, value.length - 1);
-                }
-                var num = parseFloat(value);
-                return isNaN(num) ? value : num;
-            }
-            /**
-             * Fetches a text file and returns a promise of the result.
-             */
-            function fetchPbTxt(filepath) {
-                return new Promise(function (resolve, reject) {
-                    var request = new XMLHttpRequest();
-                    request.open('GET', filepath);
-                    request.responseType = 'arraybuffer';
-                    request.onerror = function () { return reject(request.status); };
-                    request.onload = function () { return resolve(request.response); };
-                    request.send(null);
-                });
-            }
-            parser.fetchPbTxt = fetchPbTxt;
-            /**
-             * Fetches the metadata file, parses it and returns a promise of the result.
-             */
-            function fetchAndParseMetadata(path, tracker) {
-                return tf.graph.util
-                    .runTask('Reading metadata pbtxt', 40, function () {
-                    if (path == null) {
-                        return Promise.resolve(null);
-                    }
-                    return fetchPbTxt(path);
-                }, tracker)
-                    .then(function (arrayBuffer) {
-                    return tf.graph.util.runAsyncPromiseTask('Parsing metadata.pbtxt', 60, function () {
-                        return arrayBuffer != null ? parseStatsPbTxt(arrayBuffer) :
-                            Promise.resolve(null);
-                    }, tracker);
-                });
-            }
-            parser.fetchAndParseMetadata = fetchAndParseMetadata;
-            /**
-             * Fetches the graph file, parses it and returns a promise of the result. The
-             * result will be undefined if the graph is empty.
-             */
-            function fetchAndParseGraphData(path, pbTxtFile, tracker) {
-                return tf.graph.util
-                    .runTask('Reading graph pbtxt', 40, function () {
-                    if (pbTxtFile) {
-                        return new Promise(function (resolve, reject) {
-                            var fileReader = new FileReader();
-                            fileReader.onload = function () { return resolve(fileReader.result); };
-                            fileReader.onerror = function () { return reject(fileReader.error); };
-                            fileReader.readAsArrayBuffer(pbTxtFile);
-                        });
-                    }
-                    else {
-                        return fetchPbTxt(path);
-                    }
-                }, tracker)
-                    .then(function (arrayBuffer) {
-                    return tf.graph.util.runTask('Parsing graph.pbtxt', 60, function () {
-                        return parseGraphPbTxt(arrayBuffer);
-                    }, tracker);
-                });
-            }
-            parser.fetchAndParseGraphData = fetchAndParseGraphData;
-            /**
-             * Parse a file object in a streaming fashion line by line (or custom delim).
-             * Can handle very large files.
-             * @param input The file object as an array buffer.
-             * @param callback The callback called on each line
-             * @param chunkSize The size of each read chunk. (optional)
-             * @param delim The delimiter used to split a line. (optional)
-             * @returns A promise for when it is finished.
-             */
-            function streamParse(arrayBuffer, callback, chunkSize, delim) {
-                if (chunkSize === void 0) { chunkSize = 1000000; }
-                if (delim === void 0) { delim = '\n'; }
-                return new Promise(function (resolve, reject) {
-                    var offset = 0;
-                    var bufferSize = arrayBuffer.byteLength - 1;
-                    var data = '';
-                    function readHandler(str) {
-                        offset += chunkSize;
-                        var parts = str.split(delim);
-                        var first = data + parts[0];
-                        if (parts.length === 1) {
-                            data = first;
-                            readChunk(offset, chunkSize);
-                            return;
-                        }
-                        data = parts[parts.length - 1];
-                        callback(first);
-                        for (var i = 1; i < parts.length - 1; i++) {
-                            callback(parts[i]);
-                        }
-                        if (offset >= bufferSize) {
-                            if (data) {
-                                callback(data);
-                            }
-                            resolve(true);
-                            return;
-                        }
-                        readChunk(offset, chunkSize);
-                    }
-                    function readChunk(offset, size) {
-                        var arrayBufferChunk = arrayBuffer.slice(offset, offset + size);
-                        var blob = new Blob([arrayBufferChunk]);
-                        var file = new FileReader();
-                        file.onload = function (e) { return readHandler(e.target.result); };
-                        file.readAsText(blob);
-                    }
-                    readChunk(offset, chunkSize);
-                });
-            }
-            parser.streamParse = streamParse;
-            /**
-             * Since proto-txt doesn't explicitly say whether an attribute is repeated
-             * (an array) or not, we keep a hard-coded list of attributes that are known
-             * to be repeated. This list is used in parsing time to convert repeated
-             * attributes into arrays even when the attribute only shows up once in the
-             * object.
-             */
-            var GRAPH_REPEATED_FIELDS = {
-                'node': true,
-                'node.input': true,
-                'node.attr': true,
-                'node.attr.value.list.type': true,
-                'node.attr.value.shape.dim': true,
-                'node.attr.value.tensor.string_val': true,
-                'node.attr.value.tensor.tensor_shape.dim': true,
-                'node.attr.value.list.shape': true,
-                'node.attr.value.list.shape.dim': true,
-                'node.attr.value.list.s': true
-            };
-            var METADATA_REPEATED_FIELDS = {
-                'step_stats.dev_stats': true,
-                'step_stats.dev_stats.node_stats': true,
-                'step_stats.dev_stats.node_stats.output': true,
-                'step_stats.dev_stats.node_stats.memory': true,
-                'step_stats.dev_stats.node_stats.output.tensor_description.shape.dim': true
-            };
-            /**
-             * Parses an ArrayBuffer of a proto txt file into a raw Graph object.
-             */
-            function parseGraphPbTxt(input) {
-                return parsePbtxtFile(input, GRAPH_REPEATED_FIELDS).then(function (obj) { return obj['node']; });
-            }
-            parser.parseGraphPbTxt = parseGraphPbTxt;
-            /**
-             * Parses an ArrayBuffer of a proto txt file into a StepStats object.
-             */
-            function parseStatsPbTxt(input) {
-                return parsePbtxtFile(input, METADATA_REPEATED_FIELDS)
-                    .then(function (obj) { return obj['step_stats']; });
-            }
-            parser.parseStatsPbTxt = parseStatsPbTxt;
-            /**
-             * Parses a ArrayBuffer of a proto txt file into javascript object.
-             *
-             * @param input The ArrayBuffer or file object implementing slice.
-             * @param repeatedFields Map (Set) of all the repeated fields, since you can't
-             *   tell directly from the pbtxt if a field is repeated or not.
-             * @returns The parsed object.
-             */
-            function parsePbtxtFile(input, repeatedFields) {
-                var output = {};
-                var stack = [];
-                var path = [];
-                var current = output;
-                function splitNameAndValueInAttribute(line) {
-                    var colonIndex = line.indexOf(':');
-                    var name = line.substring(0, colonIndex).trim();
-                    var value = parseValue(line.substring(colonIndex + 2).trim());
-                    return {
-                        name: name,
-                        value: value
-                    };
-                }
-                /**
-                 * Adds a value, given the attribute name and the host object. If the
-                 * attribute already exists, but is not an array, it will convert it to an
-                 * array of values.
-                 *
-                 * @param obj The host object that holds the attribute.
-                 * @param name The attribute name (key).
-                 * @param value The attribute value.
-                 * @param path A path that identifies the attribute. Used to check if
-                 *     an attribute is an array or not.
-                 */
-                function addAttribute(obj, name, value, path) {
-                    // We treat 'node' specially since it is done so often.
-                    var existingValue = obj[name];
-                    if (existingValue == null) {
-                        obj[name] = path.join('.') in repeatedFields ? [value] : value;
-                    }
-                    else if (Array.isArray(existingValue)) {
-                        existingValue.push(value);
-                    }
-                    else {
-                        obj[name] = [existingValue, value];
-                    }
-                }
-                // Run through the file a line at a time.
-                return streamParse(input, function (line) {
-                    if (!line) {
-                        return;
-                    }
-                    line = line.trim();
-                    switch (line[line.length - 1]) {
-                        case '{':
-                            var name_1 = line.substring(0, line.length - 2).trim();
-                            var newValue = {};
-                            stack.push(current);
-                            path.push(name_1);
-                            addAttribute(current, name_1, newValue, path);
-                            current = newValue;
-                            break;
-                        case '}':
-                            current = stack.pop();
-                            path.pop();
-                            break;
-                        default:
-                            var x = splitNameAndValueInAttribute(line);
-                            addAttribute(current, x.name, x.value, path.concat(x.name));
-                            break;
-                    }
-                }).then(function () {
-                    return output;
-                });
-            }
-        })(parser = graph.parser || (graph.parser = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // Close module tf.graph.parser.
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-</script>
-<script>var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/**
- * Package for the Render Hierarchy for TensorFlow graph.
- */
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph_1) {
-        var render;
-        (function (render) {
-            /**
-             * Color parameters for op nodes.
-             */
-            render.OpNodeColors = { DEFAULT_FILL: 'white', DEFAULT_STROKE: '#b2b2b2' };
-            /**
-             * Color parameters for node encoding.
-             * @type {Object}
-             */
-            render.MetanodeColors = {
-                /**
-                 * Default fill and stroke to use when no other information is available.
-                 */
-                DEFAULT_FILL: '#d9d9d9',
-                DEFAULT_STROKE: '#a6a6a6',
-                SATURATION: 0.6,
-                LIGHTNESS: 0.85,
-                /**
-                 * Neutral color to use when the node is expanded (used when coloring by
-                 * compute time, memory and device).
-                 */
-                EXPANDED_COLOR: '#f0f0f0',
-                /**
-                 * Standard hue values for node color palette.
-                 */
-                HUES: [220, 100, 180, 40, 20, 340, 260, 300, 140, 60],
-                STRUCTURE_PALETTE: function (id, lightened) {
-                    // The code below is a flexible way to computationally create a set
-                    // of colors that go well together.
-                    var hues = render.MetanodeColors.HUES;
-                    var n = hues.length;
-                    var hue = hues[id % n];
-                    var m = Math.sin(hue * Math.PI / 360);
-                    var sat = lightened ? 30 : 90 - 60 * m;
-                    var light = lightened ? 95 : 80;
-                    return d3.hsl(hue, .01 * sat, .01 * light).toString();
-                },
-                DEVICE_PALETTE: function (index) { return render.MetanodeColors.STRUCTURE_PALETTE(index); },
-                UNKNOWN: '#eee',
-                GRADIENT_OUTLINE: '#888'
-            };
-            /**
-             * Color parameters for op nodes.
-             */
-            render.SeriesNodeColors = {
-                DEFAULT_FILL: 'white',
-                DEFAULT_STROKE: '#b2b2b2'
-            };
-            /**
-             * Parameters that affect how the graph is rendered on the screen.
-             */
-            var PARAMS = {
-                /**
-                 * Whether to extract high degree nodes from the core part of the graph.
-                 */
-                enableExtraction: true,
-                /**
-                 * The minimum number of nodes for a graph to have in order for high in and
-                 * out degree nodes to be extracted in auxiliary. The aim here is to prevent
-                 * nodes from being extracted from small graphs.
-                 */
-                minNodeCountForExtraction: 15,
-                /**
-                 * The minimum in or out degree a node must have in order to be possibly
-                 * extracted.
-                 */
-                minDegreeForExtraction: 5,
-                /**
-                 * Maximum number of control edges a node can have before they aren't
-                 * displayed.
-                 */
-                maxControlDegree: 4,
-                /**
-                 * Maximum in (for outbound bridge paths) or out (for inbound bridge paths)
-                 * degree of a node allowed for a bridge path to be rendered to it from a
-                 * subhierarchy of nodes. Having a max prevents having too many nodes emanate
-                 * from a subhierarchy and crowding up.
-                 */
-                maxBridgePathDegree: 4,
-                /**
-                 * Types patterns for predefined out-extract nodes, which are
-                 * sink-like nodes that will be extracted from the main graph.
-                 */
-                outExtractTypes: [
-                    'NoOp' // NoOps are sink-like used for managing control dependencies.
-                ],
-                /**
-                 * Types patterns for predefined in-extract nodes, which are
-                 * source-like nodes that will be extracted from the main graph.
-                 */
-                inExtractTypes: [],
-                /**
-                 * When removing edges from a high degree node, remove all of its edges if
-                 * detachAllEdgesForHighDegree is true.  Otherwise remove all in-edges if
-                 * the node has high in-degree, or all out-edges if the node has high
-                 * out-degree.
-                 */
-                detachAllEdgesForHighDegree: true,
-                /**
-                 * After extracting high in/out degree nodes and predefined
-                 * source-like/sink-like, extract isolated nodes to the side
-                 * if this extractIsolatedNodesWithAnnotationsOnOneSide is true.
-                 */
-                extractIsolatedNodesWithAnnotationsOnOneSide: true,
-                /**
-                 * Whether to add bridge nodes and edges to the core when building the
-                 * subhierarchy of an expanded metanode. See buildSubhierarchy().
-                 */
-                enableBridgegraph: true,
-                /**
-                 * 2 colors, for the minimum and maximum value respectively, whenever we
-                 * have a gradient scale.
-                 */
-                minMaxColors: ['#fff5f0', '#fb6a4a'],
-                /**
-                 * Maximum number of annotations to be displayed on a node before an
-                 * ellipsis is used.
-                 */
-                maxAnnotations: 5
-            };
-            /**
-             * Stores the rendering information, such as x and y coordinates,
-             * for each node in the graph.
-             */
-            var RenderGraphInfo = (function () {
-                function RenderGraphInfo(hierarchy, displayingStats) {
-                    this.hierarchy = hierarchy;
-                    this.displayingStats = displayingStats;
-                    this.index = {};
-                    this.renderedOpNames = [];
-                    this.computeScales();
-                    // Maps node name to whether the rendering hierarchy was already
-                    // constructed.
-                    this.hasSubhierarchy = {};
-                    this.root = new RenderGroupNodeInfo(hierarchy.root);
-                    this.index[hierarchy.root.name] = this.root;
-                    this.renderedOpNames.push(hierarchy.root.name);
-                    this.buildSubhierarchy(hierarchy.root.name);
-                    this.root.expanded = true;
-                    this.traceInputs = false;
-                }
-                RenderGraphInfo.prototype.computeScales = function () {
-                    this.deviceColorMap = d3.scale.ordinal()
-                        .domain(this.hierarchy.devices)
-                        .range(_.map(d3.range(this.hierarchy.devices.length), render.MetanodeColors.DEVICE_PALETTE));
-                    var topLevelGraph = this.hierarchy.root.metagraph;
-                    // Find the maximum and minimum memory usage.
-                    var memoryExtent = d3.extent(topLevelGraph.nodes(), function (nodeName, index) {
-                        var node = topLevelGraph.node(nodeName);
-                        // Some ops don't have stats at all.
-                        if (node.stats != null) {
-                            return node.stats.totalBytes;
-                        }
-                    });
-                    this.memoryUsageScale = d3.scale.linear()
-                        .domain(memoryExtent)
-                        .range(PARAMS.minMaxColors);
-                    // Find also the minimum and maximum compute time.
-                    var computeTimeExtent = d3.extent(topLevelGraph.nodes(), function (nodeName, index) {
-                        var node = topLevelGraph.node(nodeName);
-                        // Some ops don't have stats at all.
-                        if (node.stats != null) {
-                            return node.stats.totalMicros;
-                        }
-                    });
-                    this.computeTimeScale = d3.scale.linear()
-                        .domain(computeTimeExtent)
-                        .range(PARAMS.minMaxColors);
-                    this.edgeWidthScale = this.hierarchy.hasShapeInfo ?
-                        graph_1.scene.edge.EDGE_WIDTH_SCALE :
-                        d3.scale.linear()
-                            .domain([1, this.hierarchy.maxMetaEdgeSize])
-                            .range([graph_1.scene.edge.MIN_EDGE_WIDTH, graph_1.scene.edge.MAX_EDGE_WIDTH]);
-                };
-                /**
-                 * Get a previously created RenderNodeInfo by its node name.
-                 */
-                RenderGraphInfo.prototype.getRenderNodeByName = function (nodeName) {
-                    return this.index[nodeName];
-                };
-                /**
-                 * Get the underlying node in the hierarchical graph by its name.
-                 */
-                RenderGraphInfo.prototype.getNodeByName = function (nodeName) {
-                    return this.hierarchy.node(nodeName);
-                };
-                /**
-                 * Get a previously created RenderNodeInfo for the specified node name,
-                 * or create one if it hasn't been created yet.
-                 */
-                RenderGraphInfo.prototype.getOrCreateRenderNodeByName = function (nodeName) {
-                    var _this = this;
-                    // Polymer may invoke this with null.
-                    if (!nodeName) {
-                        return null;
-                    }
-                    if (nodeName in this.index) {
-                        return this.index[nodeName];
-                    }
-                    var node = this.hierarchy.node(nodeName);
-                    // Exit early if the node does not exist in the hierarchy. This can happen
-                    // when a graph is reloaded while the infocard points to a node not visible
-                    // at the top-level.
-                    if (!node) {
-                        return null;
-                    }
-                    var renderInfo = node.isGroupNode ?
-                        new RenderGroupNodeInfo(node) :
-                        new RenderNodeInfo(node);
-                    this.index[nodeName] = renderInfo;
-                    this.renderedOpNames.push(nodeName);
-                    if (node.stats) {
-                        renderInfo.memoryColor = this.memoryUsageScale(node.stats.totalBytes);
-                        renderInfo.computeTimeColor =
-                            this.computeTimeScale(node.stats.totalMicros);
-                    }
-                    // We only fade nodes when we're displaying stats.
-                    renderInfo.isFadedOut = this.displayingStats &&
-                        !tf.graph.util.hasDisplayableNodeStats(node.stats);
-                    if (node.isGroupNode) {
-                        // Make a list of tuples (device, proportion), where proportion
-                        // is the fraction of op nodes that have that device.
-                        var pairs = _.pairs(node.deviceHistogram);
-                        if (pairs.length > 0) {
-                            // Compute the total # of devices.
-                            var numDevices_1 = _.sum(pairs, _.last);
-                            renderInfo.deviceColors = _.map(pairs, function (pair) { return ({
-                                color: _this.deviceColorMap(pair[0]),
-                                // Normalize to a proportion of total # of devices.
-                                proportion: pair[1] / numDevices_1
-                            }); });
-                        }
-                    }
-                    else {
-                        var device = renderInfo.node.device;
-                        if (device) {
-                            renderInfo.deviceColors = [{
-                                    color: this.deviceColorMap(device),
-                                    proportion: 1.0
-                                }];
-                        }
-                    }
-                    return this.index[nodeName];
-                };
-                /**
-                 * Return the nearest ancestor node, including itself, that is visible
-                 * in the visualization. This method is used so that we can select
-                 * (highlight) a node that isn't drawn yet, by selecting (highlighting)
-                 * its nearest ancestor that has been drawn.
-                 */
-                RenderGraphInfo.prototype.getNearestVisibleAncestor = function (name) {
-                    var path = graph_1.getHierarchicalPath(name);
-                    for (var i = 0; i < path.length; i++) {
-                        var nodeName = path[i];
-                        // Op nodes have expanded set to false by default.
-                        if (!this.getRenderNodeByName(nodeName).expanded) {
-                            return nodeName;
-                        }
-                    }
-                    // Fallthrough. If everything was expanded return the node.
-                    return name;
-                };
-                // TODO(jimbo): Delete this an any code it touches (all deprecated).
-                RenderGraphInfo.prototype.setDepth = function (depth) {
-                    setGroupNodeDepth(this.root, +depth);
-                };
-                /**
-                 * Returns true if the renderNode is an isolated node within its parent node.
-                 */
-                RenderGraphInfo.prototype.isNodeAuxiliary = function (renderNode) {
-                    var parentNode = this.getRenderNodeByName(renderNode.node.parentNode.name);
-                    var found = _.find(parentNode.isolatedInExtract, function (node) {
-                        return node.node.name === renderNode.node.name;
-                    });
-                    if (found) {
-                        return true;
-                    }
-                    found = _.find(parentNode.isolatedOutExtract, function (node) {
-                        return node.node.name === renderNode.node.name;
-                    });
-                    return !!found;
-                };
-                /**
-                 * Returns a list of ops that have been rendered so far for this graph. More
-                 * ops may later be rendered if the user expands nodes for instance. The list
-                 * returned here can only stay the same size or grow on successive calls.
-                 */
-                RenderGraphInfo.prototype.getNamesOfRenderedOps = function () {
-                    return this.renderedOpNames;
-                };
-                RenderGraphInfo.prototype.buildSubhierarchy = function (nodeName) {
-                    var _this = this;
-                    // Terminate if the rendering hierarchy was already constructed
-                    // for this node.
-                    if (nodeName in this.hasSubhierarchy) {
-                        return;
-                    }
-                    var renderNodeInfo = this.index[nodeName];
-                    // If it is not a meta node or a series node, don't do anything.
-                    if (renderNodeInfo.node.type !== graph_1.NodeType.META &&
-                        renderNodeInfo.node.type !== graph_1.NodeType.SERIES) {
-                        return;
-                    }
-                    // At this point we know the rendering information is about a group node.
-                    var renderGroupNodeInfo = renderNodeInfo;
-                    var metagraph = renderGroupNodeInfo.node.metagraph;
-                    var coreGraph = renderGroupNodeInfo.coreGraph;
-                    // Create render nodes to represent each child from the metagraph. Although
-                    // these will initially be added to the coreGraph, they may later be
-                    // extracted. Also, due to extraction, the coreGraph may contain disjoint
-                    // groups between which there is no visible path (other than annotations).
-                    _.each(metagraph.nodes(), function (childName) {
-                        var childRenderInfo = _this.getOrCreateRenderNodeByName(childName);
-                        var childNode = childRenderInfo.node;
-                        coreGraph.setNode(childName, childRenderInfo);
-                        if (!childNode.isGroupNode) {
-                            _.each(childNode.inEmbeddings, function (embedding) {
-                                var renderMetaedgeInfo = new RenderMetaedgeInfo(null);
-                                addInAnnotation(childRenderInfo, embedding, null, renderMetaedgeInfo, AnnotationType.CONSTANT);
-                                _this.index[embedding.name] = new RenderNodeInfo(embedding);
-                            });
-                            _.each(childNode.outEmbeddings, function (embedding) {
-                                var renderMetaedgeInfo = new RenderMetaedgeInfo(null);
-                                addOutAnnotation(childRenderInfo, embedding, null, renderMetaedgeInfo, AnnotationType.SUMMARY);
-                                _this.index[embedding.name] = new RenderNodeInfo(embedding);
-                            });
-                        }
-                    });
-                    // Add render metaedge info for edges in the metagraph.
-                    _.each(metagraph.edges(), function (edgeObj) {
-                        var metaedge = metagraph.edge(edgeObj);
-                        var renderMetaedgeInfo = new RenderMetaedgeInfo(metaedge);
-                        renderMetaedgeInfo.isFadedOut =
-                            _this.index[edgeObj.v].isFadedOut || _this.index[edgeObj.w].isFadedOut;
-                        coreGraph.setEdge(edgeObj.v, edgeObj.w, renderMetaedgeInfo);
-                    });
-                    if (PARAMS.enableExtraction &&
-                        renderGroupNodeInfo.node.type === graph_1.NodeType.META) {
-                        extractHighDegrees(renderGroupNodeInfo);
-                    }
-                    // Record that we constructed the rendering hierarchy for this node, so we
-                    // don't construct it another time.
-                    this.hasSubhierarchy[nodeName] = true;
-                    // Look up the parent node's render information and short circuit if none.
-                    var parentNode = renderGroupNodeInfo.node.parentNode;
-                    if (!parentNode) {
-                        return;
-                    }
-                    var parentNodeInfo = this.index[parentNode.name];
-                    // Utility function for computing the name of a bridge node.
-                    var getBridgeNodeName = function (inbound) {
-                        var rest = [];
-                        for (var _i = 1; _i < arguments.length; _i++) {
-                            rest[_i - 1] = arguments[_i];
-                        }
-                        return rest.concat([inbound ? 'IN' : 'OUT']).join('~~');
-                    };
-                    // Build out the bridgegraph.
-                    var bridgegraph = this.hierarchy.getBridgegraph(nodeName);
-                    // Look for popular nodes so we can make annotations instead of paths.
-                    var otherCounts = {
-                        // Counts of edges coming INTO other nodes by name (outgoing from self).
-                        in: {},
-                        // Counts of edges going OUT from other nodes by name (coming into self).
-                        out: {},
-                        // Counts of all control edges involving other nodes by name.
-                        control: {},
-                    };
-                    _.each(bridgegraph.edges(), function (e) {
-                        // An edge is inbound if its destination node is in the metagraph.
-                        var inbound = !!metagraph.node(e.w);
-                        var otherName = inbound ? e.v : e.w;
-                        var metaedge = bridgegraph.edge(e);
-                        if (!metaedge.numRegularEdges) {
-                            otherCounts.control[otherName] =
-                                (otherCounts.control[otherName] || 0) + 1;
-                        }
-                        else if (inbound) {
-                            otherCounts.out[otherName] = (otherCounts.out[otherName] || 0) + 1;
-                        }
-                        else {
-                            otherCounts.in[otherName] = (otherCounts.in[otherName] || 0) + 1;
-                        }
-                    });
-                    // Add annotations and edges for bridgegraph relationships.
-                    var hierarchyNodeMap = this.hierarchy.getNodeMap();
-                    _.each(bridgegraph.edges(), function (bridgeEdgeObj) {
-                        var bridgeMetaedge = bridgegraph.edge(bridgeEdgeObj);
-                        // Determine whether this bridge edge is incoming by checking the
-                        // metagraph for a node that matches the destination end.
-                        var inbound = !!metagraph.node(bridgeEdgeObj.w);
-                        // Based on the direction of the edge, one endpoint will be an immediate
-                        // child of this renderNodeInfo, and the other endpoint will be a sibling
-                        // of the parent (or an ancestor further up).
-                        var _a = inbound ?
-                            [bridgeEdgeObj.w, bridgeEdgeObj.v] :
-                            [bridgeEdgeObj.v, bridgeEdgeObj.w], childName = _a[0], otherName = _a[1];
-                        var childRenderInfo = _this.index[childName];
-                        var otherRenderInfo = _this.index[otherName];
-                        var otherNode = otherRenderInfo ?
-                            otherRenderInfo.node :
-                            hierarchyNodeMap[otherName];
-                        // Determine whether this edge is a control edge between nodes where
-                        // either node is high-degree with respect to control edges. This will
-                        // be a signal to show it as an annotation instead of a bridge edge.
-                        var isHighDegreeControlEdge = !bridgeMetaedge.numRegularEdges &&
-                            otherCounts.control[otherName] > PARAMS.maxControlDegree;
-                        var _b = inbound ?
-                            [renderNodeInfo.inAnnotations, childRenderInfo.inAnnotations] :
-                            [renderNodeInfo.outAnnotations, childRenderInfo.outAnnotations], childAnnotations = _b[1];
-                        // Don't render a bridge path if the other node has in or out degree above
-                        // a threshold, lest bridge paths emanating out of a metagraph crowd up,
-                        // as was the case for the Fatcat LSTM lstm_1 > lstm_1 metagraph.
-                        var otherDegreeCount = (inbound ? otherCounts.out : otherCounts.in)[otherName];
-                        var isOtherHighDegree = otherDegreeCount > PARAMS.maxBridgePathDegree;
-                        // The adjoining render metaedge info from the parent's coreGraph, if any.
-                        // It will either be a Metaedge involving this node directly, if it
-                        // previously came from a metagraph, or it'll be a Metaedge involving
-                        // a previously created bridge node standing in for the other node.
-                        var adjoiningMetaedge = null;
-                        // We can only hope to render a bridge path if:
-                        //  - bridgegraph paths are enabled,
-                        //  - the other node is not too high-degree,
-                        //  - the child is in the core (not extracted for being high-degree), and
-                        //  - there's a path (in the traversal sense) between child and other.
-                        var canDrawBridgePath = false;
-                        if (PARAMS.enableBridgegraph &&
-                            !isOtherHighDegree &&
-                            !isHighDegreeControlEdge &&
-                            childRenderInfo.isInCore()) {
-                            // Utility function for finding an adjoining metaedge.
-                            var findAdjoiningMetaedge = function (targetName) {
-                                var adjoiningEdgeObj = inbound ?
-                                    { v: targetName, w: nodeName } :
-                                    { v: nodeName, w: targetName };
-                                return parentNodeInfo.coreGraph.edge(adjoiningEdgeObj);
-                            };
-                            adjoiningMetaedge = findAdjoiningMetaedge(otherName);
-                            if (!adjoiningMetaedge) {
-                                adjoiningMetaedge = findAdjoiningMetaedge(getBridgeNodeName(inbound, otherName, parentNode.name));
-                            }
-                            canDrawBridgePath = !!adjoiningMetaedge;
-                        }
-                        // Although dataflow edges are acyclic, control dependency edges may
-                        // actually point 'backwards' in the graph. If this bridgeMetaedge is
-                        // a control dependency, we need to determine whether it's backwards
-                        // pointing so that we render it appropriately.
-                        //
-                        // For instance, say we're rendering a graph with nodes named A/B and Z/Y,
-                        // and we're currently rendering the bridgegraph for A. Further, let's say
-                        // that there was an original BaseEdge from A/B->Z/Y and a CONTROL EDGE
-                        // from Z/Y=>A/B.
-                        //
-                        //     +----------------+
-                        //     | A              |
-                        //     |  +-----+       |         +------+
-                        //     |  | B   |>-----\x3e|>-------\x3e| Z    |
-                        //     |  |     |       |         |      |
-                        //     |  |     |   *   |         |      |
-                        //     |  |     |<=====<|<=======<|      |
-                        //     |  +-----+       |         +------+
-                        //     +----------------+
-                        //
-                        // When we render the subhierarchy for Metanode A, we'll come across a
-                        // control-only Metaedge in the bridgegraph from Z=>A/B (*). The question
-                        // is whether this edge is backwards.
-                        //
-                        // To answer that question, we follow the chain of adjoining metaedges
-                        // until we reach the topmost one. In this case, that's the control-only
-                        // Metaedge Z=>A in the ROOT's metagraph. We determine that this edge
-                        // is backwards by looking at the topological ordering of ROOT's metagraph
-                        // (which ignores control edges) and seeing that Z comes AFTER A.
-                        //
-                        // The property of being backwards is independent of whether the edge
-                        // is inbound or outbound. In the preceding example, if we were building
-                        // the subhierarchy for Z, we'd find bridge edge Z/Y=>A, walk to its
-                        // topmost adjoining metaedge Z=>A and discover that it's backwards.
-                        var backwards = false;
-                        if (adjoiningMetaedge && !bridgeMetaedge.numRegularEdges) {
-                            // Find the top-most adjoining render metaedge information, and the
-                            // GroupNode whose metagraph must contain the associated metaedge.
-                            var topAdjoiningMetaedge = adjoiningMetaedge;
-                            var topGroupNode = parentNodeInfo.node;
-                            while (topAdjoiningMetaedge.adjoiningMetaedge) {
-                                topAdjoiningMetaedge = topAdjoiningMetaedge.adjoiningMetaedge;
-                                topGroupNode = topGroupNode.parentNode;
-                            }
-                            // Check against the topological ordering for the top node. The current
-                            // bridge metaedge we're evaluating is backwards if its source comes
-                            // after its destination.
-                            var ordering = _this.hierarchy.getTopologicalOrdering(topGroupNode.name);
-                            var e = topAdjoiningMetaedge.metaedge;
-                            backwards = ordering[e.v] > ordering[e.w];
-                        }
-                        // Render backwards control edges as annotations.
-                        canDrawBridgePath = canDrawBridgePath && !backwards;
-                        // If we can't make a bridge path for any reason, then we add an
-                        // annotation instead.
-                        if (!canDrawBridgePath) {
-                            childAnnotations.push(new Annotation(otherNode, otherRenderInfo, new RenderMetaedgeInfo(bridgeMetaedge), AnnotationType.SHORTCUT, inbound));
-                            return;
-                        }
-                        // At this point, all conditions have been met for drawing a bridge path.
-                        // Find or create the IN/OUT node representing otherNode.
-                        var bridgeContainerName = getBridgeNodeName(inbound, nodeName);
-                        var bridgeNodeName = getBridgeNodeName(inbound, otherName, nodeName);
-                        var bridgeNodeRenderInfo = coreGraph.node(bridgeNodeName);
-                        if (!bridgeNodeRenderInfo) {
-                            // Find or create the directional container for the bridge node.
-                            var bridgeContainerInfo = coreGraph.node(bridgeContainerName);
-                            if (!bridgeContainerInfo) {
-                                var bridgeContainerNode = {
-                                    // Important node properties.
-                                    name: bridgeContainerName,
-                                    type: graph_1.NodeType.BRIDGE,
-                                    // Unused node properties.
-                                    isGroupNode: false,
-                                    cardinality: 0,
-                                    parentNode: null,
-                                    stats: null,
-                                    include: graph_1.InclusionType.UNSPECIFIED,
-                                    // BridgeNode properties.
-                                    inbound: inbound,
-                                    nodeAttributes: {},
-                                };
-                                bridgeContainerInfo =
-                                    new RenderNodeInfo(bridgeContainerNode);
-                                _this.index[bridgeContainerName] = bridgeContainerInfo;
-                                coreGraph.setNode(bridgeContainerName, bridgeContainerInfo);
-                            }
-                            var bridgeNode = {
-                                // Important node properties.
-                                name: bridgeNodeName,
-                                type: graph_1.NodeType.BRIDGE,
-                                // Unimportant node properties.
-                                isGroupNode: false,
-                                cardinality: 1,
-                                parentNode: null,
-                                stats: null,
-                                include: graph_1.InclusionType.UNSPECIFIED,
-                                // BridgeNode properties.
-                                inbound: inbound,
-                                nodeAttributes: {},
-                            };
-                            bridgeNodeRenderInfo = new RenderNodeInfo(bridgeNode);
-                            _this.index[bridgeNodeName] = bridgeNodeRenderInfo;
-                            coreGraph.setNode(bridgeNodeName, bridgeNodeRenderInfo);
-                            // Set bridgeNode to be a graphlib child of the container node.
-                            coreGraph.setParent(bridgeNodeName, bridgeContainerName);
-                            bridgeContainerInfo.node.cardinality++;
-                        }
-                        // Create and add a bridge render metaedge.
-                        var bridgeRenderMetaedge = new RenderMetaedgeInfo(bridgeMetaedge);
-                        bridgeRenderMetaedge.adjoiningMetaedge = adjoiningMetaedge;
-                        inbound ?
-                            coreGraph.setEdge(bridgeNodeName, childName, bridgeRenderMetaedge) :
-                            coreGraph.setEdge(childName, bridgeNodeName, bridgeRenderMetaedge);
-                    }); // End _.each(bridgegraph.edges).
-                    // For each bridge container (IN and/or OUT), add structural edges between
-                    // terminal nodes and that container. A terminal node is one which has no
-                    // non-bridge edges in the direction of the container.
-                    //
-                    // For example, consider a Metanode A which contains two child nodes A/B
-                    // and A/C. Let's say it has one edge in the metagraph from A/B->A/C, and
-                    // one edge in the bridgegraph from Z->A/C.
-                    //
-                    // At this point, we've added a container bridge node IN to house all
-                    // incoming bridge nodes. We've also added a bridge node Z' (with parent IN)
-                    // to A, and a bridge edge from Z'->C.
-                    //
-                    //     +----------------------+
-                    //     | A          +---+     |
-                    //     |    +------\x3e| C |     |
-                    //     |    |       +---+     |
-                    //     |    |         ^       |
-                    //     |    |         |       |
-                    //     |    |    +----|----+  |
-                    //     |    |    | IN |    |  |
-                    //     |  +---+  |  +---+  |  |
-                    //     |  | B |  |  | Z'|  |  |
-                    //     |  +---+  |  +---+  |  |
-                    //     |         +---------+  |
-                    //     +----------------------+
-                    //
-                    // With no other help, dagre would lay out B and Z' on the same level,
-                    // because both of them have no incoming edges. In other words, B is a
-                    // terminal node in the INCOMING direction.
-                    //
-                    // But we want to force dagre to lay out Z' (and everything in IN) lower
-                    // than all non-bridge nodes, so that there's enough room for the bridge
-                    // edges after they've been adjusted to meet up with paths coming in from
-                    // outside.
-                    //
-                    // To force Z' (and all other bridge nodes) to be lowest in the graph, we
-                    // identify terminal nodes like B and give them structural edges to
-                    // a new structural bridge node S which we add to IN.
-                    //
-                    //     +----------------------+
-                    //     | A          +---+     |
-                    //     |       +---\x3e| C |     |
-                    //     |       |    +---+     |
-                    //     |     +---+    ^       |
-                    //     |     | B |    |       |
-                    //     |     +---+    |       |
-                    //     |       ^      |       |
-                    //     |       |      |       |
-                    //     |  +----|------|----+  |
-                    //     |  |IN  |      |    |  |
-                    //     |  |  +---+  +---+  |  |
-                    //     |  |  | S |  | Z'|  |  |
-                    //     |  |  +---+  +---+  |  |
-                    //     |  +----------------+  |
-                    //     +----------------------+
-                    //
-                    // This ensures that dagre will lay out the bridge containers strictly at
-                    // the ends of the graph. The structural edges will never be seen in the
-                    // visualization except as a debugging aid.
-                    _.each([true, false], function (inbound) {
-                        var bridgeContainerName = getBridgeNodeName(inbound, nodeName);
-                        var bridgeContainerInfo = coreGraph.node(bridgeContainerName);
-                        if (!bridgeContainerInfo) {
-                            return;
-                        }
-                        _.each(coreGraph.nodes(), function (childName) {
-                            // Short-circuit if this child is a bridge node or it's not a terminal
-                            // node in the direction we're interested in.
-                            var childNodeInfo = coreGraph.node(childName);
-                            if (childNodeInfo.node.type === graph_1.NodeType.BRIDGE) {
-                                return;
-                            }
-                            var isTerminal = inbound ?
-                                !coreGraph.predecessors(childName).length :
-                                !coreGraph.successors(childName).length;
-                            if (!isTerminal) {
-                                return;
-                            }
-                            // Find or create a bridge node in the container for all structural
-                            // metaedges. It would have been nice to skip this step and simply
-                            // set a metaedge between the terminal node and the container node, but
-                            // in that case, something about the graph upsets dagre.layout()'s
-                            // longestPath algorithm (was getting errors due to an undefined).
-                            var structuralNodeName = getBridgeNodeName(inbound, nodeName, 'STRUCTURAL_TARGET');
-                            var structuralRenderInfo = coreGraph.node(structuralNodeName);
-                            if (!structuralRenderInfo) {
-                                var bridgeNode = {
-                                    // Important Node properties.
-                                    name: structuralNodeName,
-                                    type: graph_1.NodeType.BRIDGE,
-                                    // Unimportant Node properties.
-                                    isGroupNode: false,
-                                    cardinality: 1,
-                                    parentNode: null,
-                                    stats: null,
-                                    include: graph_1.InclusionType.UNSPECIFIED,
-                                    // BridgeNode properties.
-                                    inbound: inbound,
-                                    nodeAttributes: {},
-                                };
-                                structuralRenderInfo = new RenderNodeInfo(bridgeNode);
-                                structuralRenderInfo.structural = true;
-                                _this.index[structuralNodeName] = structuralRenderInfo;
-                                coreGraph.setNode(structuralNodeName, structuralRenderInfo);
-                                bridgeContainerInfo.node.cardinality++;
-                                coreGraph.setParent(structuralNodeName, bridgeContainerName);
-                            }
-                            // Create the structural Metaedge and insert it.
-                            var structuralMetaedgeInfo = new RenderMetaedgeInfo(null);
-                            structuralMetaedgeInfo.structural = true;
-                            structuralMetaedgeInfo.weight--; // Reduce weight for dagre layout.
-                            inbound ?
-                                coreGraph.setEdge(structuralNodeName, childName, structuralMetaedgeInfo) :
-                                coreGraph.setEdge(childName, structuralNodeName, structuralMetaedgeInfo);
-                        });
-                    });
-                };
-                return RenderGraphInfo;
-            }());
-            render.RenderGraphInfo = RenderGraphInfo;
-            /**
-             * A class for rendering annotation object which contains label
-             * about the node embedded as annotation, type of annotation and the location
-             * of both the annotation's node and edge.
-             *
-             * Annotation objects include embedded constants, embedded summary, and
-             * edge shortcuts.
-             */
-            var Annotation = (function () {
-                /**
-                 * Creates a new Annotation.
-                 *
-                 * @param node The underlying node this annotation points to.
-                 * @param renderNodeInfo The render information for the underlying node
-                 *     this annotation points to. This can be null if the annotation
-                 *     denotes an embedding (constant, summary), in which case we
-                 *     use the node property.
-                 * @param renderMetaedgeInfo The render information for the edge associated
-                 *     with the annotation.
-                 * @param type The type of the annotation.
-                 * @param isIn True if it is an in-annotation. False if it is an
-                 *     out-annotation.
-                 */
-                function Annotation(node, renderNodeInfo, renderMetaedgeInfo, type, isIn) {
-                    this.node = node;
-                    this.renderNodeInfo = renderNodeInfo;
-                    this.renderMetaedgeInfo = renderMetaedgeInfo;
-                    this.annotationType = type;
-                    // Properties specified by layout
-                    this.dx = 0;
-                    this.dy = 0;
-                    this.width = 0;
-                    this.height = 0;
-                    // Properties needed for generating an ID for the edge's path element if
-                    // this annotation is associated with a metaedge.
-                    if (renderMetaedgeInfo && renderMetaedgeInfo.metaedge) {
-                        this.v = renderMetaedgeInfo.metaedge.v;
-                        this.w = renderMetaedgeInfo.metaedge.w;
-                    }
-                    this.isIn = isIn;
-                    this.points = [];
-                }
-                return Annotation;
-            }());
-            render.Annotation = Annotation;
-            ;
-            var AnnotationType;
-            (function (AnnotationType) {
-                AnnotationType[AnnotationType["SHORTCUT"] = 0] = "SHORTCUT";
-                AnnotationType[AnnotationType["CONSTANT"] = 1] = "CONSTANT";
-                AnnotationType[AnnotationType["SUMMARY"] = 2] = "SUMMARY";
-                AnnotationType[AnnotationType["ELLIPSIS"] = 3] = "ELLIPSIS";
-            })(AnnotationType = render.AnnotationType || (render.AnnotationType = {}));
-            ;
-            /**
-             * Manages a list of annotations. Two will be used for each
-             * RenderNodeInfo, one for in annotations and one for out annotations.
-             */
-            var AnnotationList = (function () {
-                function AnnotationList() {
-                    this.list = [];
-                    this.nodeNames = {};
-                }
-                /**
-                 * Append an annotation to the list, or a stand-in ellipsis annotation instead
-                 * if this would make it too many.
-                 */
-                AnnotationList.prototype.push = function (annotation) {
-                    if (annotation.node.name in this.nodeNames) {
-                        return; // Skip duplicate annotation.
-                    }
-                    this.nodeNames[annotation.node.name] = true;
-                    if (this.list.length < PARAMS.maxAnnotations) {
-                        this.list.push(annotation);
-                        return;
-                    }
-                    var lastAnnotation = this.list[this.list.length - 1];
-                    if (lastAnnotation.annotationType === AnnotationType.ELLIPSIS) {
-                        var ellipsisNode_1 = lastAnnotation.node;
-                        ellipsisNode_1.setNumMoreNodes(++ellipsisNode_1.numMoreNodes);
-                        return;
-                    }
-                    var ellipsisNode = new tf.graph.EllipsisNodeImpl(1);
-                    this.list.push(new Annotation(ellipsisNode, new RenderNodeInfo(ellipsisNode), null, AnnotationType.ELLIPSIS, annotation.isIn));
-                };
-                return AnnotationList;
-            }());
-            render.AnnotationList = AnnotationList;
-            /**
-             * Contains rendering information about a node in the hierarchical graph.
-             */
-            var RenderNodeInfo = (function () {
-                function RenderNodeInfo(node) {
-                    this.node = node;
-                    this.expanded = false;
-                    this.inAnnotations = new AnnotationList();
-                    this.outAnnotations = new AnnotationList();
-                    // Params specified by layout
-                    this.x = 0;
-                    this.y = 0;
-                    this.width = 0;
-                    this.height = 0;
-                    this.inboxWidth = 0;
-                    this.outboxWidth = 0;
-                    this.excluded = false;
-                    // Params for bridge paths.
-                    this.structural = false;
-                    // Params for node box.
-                    this.labelOffset = 0;
-                    this.radius = 0;
-                    // Params for expanded node
-                    this.labelHeight = 0;
-                    this.paddingTop = 0;
-                    this.paddingLeft = 0;
-                    this.paddingRight = 0;
-                    this.paddingBottom = 0;
-                    this.isInExtract = false;
-                    this.isOutExtract = false;
-                    this.coreBox = { width: 0, height: 0 };
-                    // By default, we don't fade nodes out. Default to false for safety.
-                    this.isFadedOut = false;
-                }
-                RenderNodeInfo.prototype.isInCore = function () {
-                    return !this.isInExtract && !this.isOutExtract;
-                };
-                return RenderNodeInfo;
-            }());
-            render.RenderNodeInfo = RenderNodeInfo;
-            /**
-             * Contains rendering information about a Metaedge from the underlying
-             * hierarchical graph. It may be from either a metagraph or a bridgegraph.
-             */
-            var RenderMetaedgeInfo = (function () {
-                function RenderMetaedgeInfo(metaedge) {
-                    this.metaedge = metaedge;
-                    this.adjoiningMetaedge = null;
-                    this.structural = false;
-                    this.weight = 1;
-                    this.isFadedOut = false;
-                }
-                return RenderMetaedgeInfo;
-            }());
-            render.RenderMetaedgeInfo = RenderMetaedgeInfo;
-            function addInAnnotation(node, predecessor, predecessorRenderInfo, edge, type) {
-                var annotation = new Annotation(predecessor, predecessorRenderInfo, edge, type, true);
-                node.inAnnotations.push(annotation);
-            }
-            function addOutAnnotation(node, successor, successorRenderInfo, edge, type) {
-                var annotation = new Annotation(successor, successorRenderInfo, edge, type, false);
-                node.outAnnotations.push(annotation);
-            }
-            function setGraphDepth(graph, depth) {
-                _.each(graph.nodes(), function (nodeName) {
-                    var child = graph.node(nodeName);
-                    child.expanded = depth > 1; // set all child of depth 1 to collapsed
-                    if (depth > 0) {
-                        switch (child.node.type) {
-                            case graph_1.NodeType.META:
-                            case graph_1.NodeType.SERIES:
-                                setGroupNodeDepth(child, depth - 1);
-                                break;
-                        }
-                    }
-                });
-            }
-            ;
-            var RenderGroupNodeInfo = (function (_super) {
-                __extends(RenderGroupNodeInfo, _super);
-                function RenderGroupNodeInfo(groupNode) {
-                    var _this = _super.call(this, groupNode) || this;
-                    var metagraph = groupNode.metagraph;
-                    var gl = metagraph.graph();
-                    _this.coreGraph =
-                        graph_1.createGraph(gl.name, graph_1.GraphType.CORE, { compound: true });
-                    _this.inExtractBox = { width: 0, height: 0 };
-                    _this.outExtractBox = { width: 0, height: 0 };
-                    _this.isolatedInExtract = [];
-                    _this.isolatedOutExtract = [];
-                    return _this;
-                }
-                return RenderGroupNodeInfo;
-            }(RenderNodeInfo));
-            render.RenderGroupNodeInfo = RenderGroupNodeInfo;
-            function setGroupNodeDepth(renderInfo, depth) {
-                if (renderInfo.coreGraph) {
-                    setGraphDepth(renderInfo.coreGraph, depth);
-                }
-            }
-            /**
-             * Remove an edge from the graph and add annotations to both ends of the edge.
-             *
-             * @param The core graph.
-             * @param v Source name.
-             * @param w Sink name.
-             */
-            function createShortcut(graph, v, w) {
-                var src = graph.node(v);
-                var sink = graph.node(w);
-                var edge = graph.edge(v, w);
-                // If either of the nodes is explicitly included in the main graph and
-                // both nodes are in the main graph then do not create the shortcut
-                // and instead keep the real edge.
-                if ((src.node.include === graph_1.InclusionType.INCLUDE ||
-                    sink.node.include === graph_1.InclusionType.INCLUDE) &&
-                    src.node.include !== graph_1.InclusionType.EXCLUDE &&
-                    sink.node.include !== graph_1.InclusionType.EXCLUDE) {
-                    return;
-                }
-                // Add each annotation.
-                addOutAnnotation(src, sink.node, sink, edge, AnnotationType.SHORTCUT);
-                addInAnnotation(sink, src.node, src, edge, AnnotationType.SHORTCUT);
-                // Remove the edge from the core graph.
-                graph.removeEdge(v, w);
-            }
-            /**
-             * Remove edges from a node, and set its isOutExtract property to true,
-             * and remove the node and move it to isolatedOutExtract.
-             *
-             * If detachAllEdgesForHighDegree or forceDetach is true, extract all of its
-             * edges. Otherwise, only extract all in-edges.
-             */
-            function makeOutExtract(renderNode, n, forceDetach) {
-                var graph = renderNode.coreGraph;
-                var child = graph.node(n);
-                child.isOutExtract = true;
-                _.each(graph.predecessors(n), function (p, index) {
-                    createShortcut(graph, p, n);
-                });
-                if (PARAMS.detachAllEdgesForHighDegree || forceDetach) {
-                    _.each(graph.successors(n), function (s, index) {
-                        createShortcut(graph, n, s);
-                    });
-                }
-                // Remove the node from the core graph if it no longer has neighbors.
-                if (graph.neighbors(n).length === 0) {
-                    child.node.include = graph_1.InclusionType.EXCLUDE;
-                    renderNode.isolatedOutExtract.push(child);
-                    graph.removeNode(n);
-                }
-            }
-            /**
-             * Remove edges from a node, set its isInExtract property to true,
-             * and remove the node and move it to isolatedInExtract.
-             *
-             * If detachAllEdgesForHighDegree or forceDetach is true, extract all of its
-             * edges. Otherwise, only remove all out-edges.
-             */
-            function makeInExtract(renderNode, n, forceDetach) {
-                var graph = renderNode.coreGraph;
-                var child = graph.node(n);
-                child.isInExtract = true;
-                _.each(graph.successors(n), function (s, index) {
-                    createShortcut(graph, n, s);
-                });
-                if (PARAMS.detachAllEdgesForHighDegree || forceDetach) {
-                    _.each(graph.predecessors(n), function (p, index) {
-                        createShortcut(graph, p, n);
-                    });
-                }
-                // Remove the node from the core graph if it no longer has neighbors.
-                if (graph.neighbors(n).length === 0) {
-                    child.node.include = graph_1.InclusionType.EXCLUDE;
-                    renderNode.isolatedInExtract.push(child);
-                    graph.removeNode(n);
-                }
-            }
-            render.makeInExtract = makeInExtract;
-            /**
-             * Check whether the node's type is a member of the given list of types.
-             *
-             * @param node Node.
-             * @param types List of type to match.
-             */
-            function hasTypeIn(node, types) {
-                if (node.type === graph_1.NodeType.OP) {
-                    for (var i = 0; i < types.length; i++) {
-                        if (node.op === types[i]) {
-                            return true;
-                        }
-                    }
-                }
-                else if (node.type === graph_1.NodeType.META) {
-                    var rootOpNode = node.getRootOp();
-                    if (rootOpNode) {
-                        for (var i = 0; i < types.length; i++) {
-                            if (rootOpNode.op === types[i]) {
-                                return true;
-                            }
-                        }
-                    }
-                }
-                return false;
-            }
-            /** Move nodes that are specified to be excluded out of the core graph. */
-            function extractSpecifiedNodes(renderNode) {
-                var graph = renderNode.coreGraph;
-                _.each(graph.nodes(), function (n) {
-                    var renderInfo = graph.node(n);
-                    if (renderInfo.node.include === graph_1.InclusionType.EXCLUDE) {
-                        if (renderNode.coreGraph.outEdges(n).length >
-                            renderNode.coreGraph.inEdges(n).length) {
-                            makeOutExtract(renderNode, n, true);
-                        }
-                        else {
-                            makeInExtract(renderNode, n, true);
-                        }
-                    }
-                });
-            }
-            /** Remove edges from pre-defined out-extract patterns */
-            function extractPredefinedSink(renderNode) {
-                var graph = renderNode.coreGraph;
-                _.each(graph.nodes(), function (n) {
-                    var renderInfo = graph.node(n);
-                    if (renderInfo.node.include !== graph_1.InclusionType.UNSPECIFIED) {
-                        return;
-                    }
-                    if (hasTypeIn(renderInfo.node, PARAMS.outExtractTypes)) {
-                        makeOutExtract(renderNode, n);
-                    }
-                });
-            }
-            /** Remove edges from pre-defined in-extract patterns */
-            function extractPredefinedSource(renderNode) {
-                var graph = renderNode.coreGraph;
-                _.each(graph.nodes(), function (n) {
-                    var renderInfo = graph.node(n);
-                    if (renderInfo.node.include !== graph_1.InclusionType.UNSPECIFIED) {
-                        return;
-                    }
-                    if (hasTypeIn(renderInfo.node, PARAMS.inExtractTypes)) {
-                        makeInExtract(renderNode, n);
-                    }
-                });
-            }
-            /** Extract nodes deemed to have either high in-degree or high out-degree. */
-            function extractHighInOrOutDegree(renderNode) {
-                var graph = renderNode.coreGraph;
-                // Create mappings from node to in and out degrees. Count the number of valid
-                // nodes along the way.
-                var nodeToInDegree = {};
-                var nodeToOutDegree = {};
-                var validNodeCount = 0;
-                _.each(graph.nodes(), function (currentNode) {
-                    if (graph.node(currentNode).node.include !== graph_1.InclusionType.UNSPECIFIED) {
-                        // This node is not included in the first place.
-                        return;
-                    }
-                    // Count the in and out degrees based on only regular edges, unless there
-                    // are no regular edges, in which case use the number of control edges.
-                    // This is done so that control edges don't affect if nodes are extracted
-                    // from the core graph, unless the node is only used for control.
-                    var inDegree = _.reduce(graph.predecessors(currentNode), function (inDegree, pred) {
-                        var metaedge = graph.edge(pred, currentNode).metaedge;
-                        return inDegree + (metaedge.numRegularEdges ? 1 : 0);
-                    }, 0);
-                    if (inDegree === 0 && graph.predecessors(currentNode).length > 0) {
-                        inDegree = graph.predecessors(currentNode).length;
-                    }
-                    var outDegree = _.reduce(graph.successors(currentNode), function (outDegree, succ) {
-                        var metaedge = graph.edge(currentNode, succ).metaedge;
-                        return outDegree + (metaedge.numRegularEdges ? 1 : 0);
-                    }, 0);
-                    if (outDegree === 0 && graph.successors(currentNode).length > 0) {
-                        outDegree = graph.successors(currentNode).length;
-                    }
-                    // Store the in and out degrees of this node to avoid recomputing.
-                    nodeToInDegree[currentNode] = inDegree;
-                    nodeToOutDegree[currentNode] = outDegree;
-                    validNodeCount++;
-                });
-                if (validNodeCount < PARAMS.minNodeCountForExtraction) {
-                    // This graph has few nodes. Do not extract any nodes.
-                    return;
-                }
-                // We only extract if the node has a min in or out degree greater than this.
-                var minUpperBound = PARAMS.minDegreeForExtraction - 1;
-                // Mark for extraction nodes with in-degree > Q3 + (Q3 - Q1).
-                var q3Index = Math.round(validNodeCount * 0.75);
-                var q1Index = Math.round(validNodeCount * 0.25);
-                var sortedByInDegree = Object.keys(nodeToInDegree).sort(function (node0, node1) {
-                    return nodeToInDegree[node0] - nodeToInDegree[node1];
-                });
-                var inDegreeQ3 = nodeToInDegree[sortedByInDegree[q3Index]];
-                var inDegreeQ1 = nodeToInDegree[sortedByInDegree[q1Index]];
-                var inDegreeUpperBound = inDegreeQ3 + inDegreeQ3 - inDegreeQ1;
-                // Only extract if the upper bound is high enough.
-                inDegreeUpperBound = Math.max(inDegreeUpperBound, minUpperBound);
-                for (var i = validNodeCount - 1; nodeToInDegree[sortedByInDegree[i]] > inDegreeUpperBound; i--) {
-                    // Extract a high in-degree node.
-                    makeInExtract(renderNode, sortedByInDegree[i]);
-                }
-                // Mark for extraction nodes with out-degree > Q3 + (Q3 - Q1) * 4.
-                var sortedByOutDegree = Object.keys(nodeToOutDegree).sort(function (node0, node1) {
-                    return nodeToOutDegree[node0] - nodeToOutDegree[node1];
-                });
-                var outDegreeQ3 = nodeToOutDegree[sortedByOutDegree[q3Index]];
-                var outDegreeQ1 = nodeToOutDegree[sortedByOutDegree[q1Index]];
-                // The upper bound for extracting out-degree nodes is higher than that for
-                // extracting in-degree ones (Note the "* 4") because, in practice, some
-                // graphs look worse with a smaller out-degree bound. For instance, a smaller
-                // out-degree bound removes the convolution nodes from cifar 10 train's graph.
-                var outDegreeUpperBound = outDegreeQ3 + (outDegreeQ3 - outDegreeQ1) * 4;
-                // Only extract if the upper bound is high enough.
-                outDegreeUpperBound = Math.max(outDegreeUpperBound, minUpperBound);
-                for (var i = validNodeCount - 1; nodeToOutDegree[sortedByOutDegree[i]] > outDegreeUpperBound; i--) {
-                    var node = graph.node(sortedByOutDegree[i]);
-                    if (!node || node.isInExtract) {
-                        // This node has already been extracted due to high in-degree. It might
-                        // have been removed from the graph in general (during in-degree
-                        // extraction) due to a lack of neighbors. Do not extract this node twice.
-                        continue;
-                    }
-                    // Extract a high out-degree node that has not already been extracted.
-                    makeOutExtract(renderNode, sortedByOutDegree[i]);
-                }
-            }
-            /** Remove control edges from nodes that have too many control edges */
-            function removeControlEdges(renderNode) {
-                var graph = renderNode.coreGraph;
-                // Collect control edges into a map by node name.
-                var map = {};
-                _.each(graph.edges(), function (e) {
-                    if (!graph.edge(e).metaedge.numRegularEdges) {
-                        (map[e.v] = map[e.v] || []).push(e);
-                        (map[e.w] = map[e.w] || []).push(e);
-                    }
-                });
-                // For each node with too many control edges, turn them into annotations.
-                _.each(map, function (edges, nodeName) {
-                    if (edges.length > PARAMS.maxControlDegree) {
-                        _.each(edges, function (e) { return createShortcut(graph, e.v, e.w); });
-                    }
-                });
-            }
-            /**
-             * Given an integer, picks a hue that is far apart from other colors.
-             * The formula for picking color that avoid collision is:
-             *     hue = (color range * golden ratio * index) % color range
-             */
-            function mapIndexToHue(id) {
-                var GOLDEN_RATIO = 1.61803398875;
-                // Hue of 0 is reserved for the gray nodes.
-                var MIN_HUE = 1;
-                var MAX_HUE = 359;
-                var COLOR_RANGE = MAX_HUE - MIN_HUE;
-                return MIN_HUE + ((COLOR_RANGE * GOLDEN_RATIO * id) % COLOR_RANGE);
-            }
-            render.mapIndexToHue = mapIndexToHue;
-            ;
-            /**
-             * Remove edges and add to annotation instead.
-             *
-             * For root node, consider predefined types for source and sink.
-             * We do not extract predefined type from non-root so that Variables and the
-             * sgd node (op type = 'NoOp') do not get extract from inside own group.
-             *
-             * The order of extraction is important here as swapping the order can totally
-             * screw up the graph layout.
-             *
-             * @param {Render.Node} renderNode Node to manipulate.
-             */
-            function extractHighDegrees(renderNode) {
-                extractSpecifiedNodes(renderNode);
-                if (PARAMS.outExtractTypes) {
-                    extractPredefinedSink(renderNode);
-                }
-                // This has to come before extract high in-degree to protect the core part
-                // that takes many variables.
-                if (PARAMS.inExtractTypes) {
-                    extractPredefinedSource(renderNode);
-                }
-                extractHighInOrOutDegree(renderNode);
-                if (PARAMS.maxControlDegree) {
-                    removeControlEdges(renderNode);
-                }
-                // Extract isolated nodes, which can be
-                // (1) source-like and sink-like nodes that are not originally isolated but
-                //     become isolated after further removal.
-                // (2) isolated nodes with annotations on one-side.  These might be either
-                //     - nodes that originally have high out-degree but because we remove
-                //       high in-degree nodes first, they no longer have high in-degree when
-                //       we check.  (Detecting all high-degree before removing also leads to
-                //       another problem.)
-                //     - nodes that do not have high degree, but their neighbors are all
-                //       extracted, so it might make sense to extract them too.
-                var graph = renderNode.coreGraph;
-                _.each(graph.nodes(), function (n) {
-                    var child = graph.node(n);
-                    var degree = graph.neighbors(n).length;
-                    if (child.node.include !== graph_1.InclusionType.UNSPECIFIED) {
-                        return;
-                    }
-                    if (degree === 0) {
-                        var hasOutAnnotations = child.outAnnotations.list.length > 0;
-                        var hasInAnnotations = child.inAnnotations.list.length > 0;
-                        if (child.isInExtract) {
-                            // This case only happens if detachAllEdgesForHighDegree is false.
-                            // (Otherwise all source-like nodes are all isolated already.)
-                            renderNode.isolatedInExtract.push(child);
-                            child.node.include = graph_1.InclusionType.EXCLUDE;
-                            graph.removeNode(n);
-                        }
-                        else if (child.isOutExtract) {
-                            // This case only happens if detachAllEdgesForHighDegree is false.
-                            // // (Otherwise all sink-like nodes are all isolated already.)
-                            renderNode.isolatedOutExtract.push(child);
-                            child.node.include = graph_1.InclusionType.EXCLUDE;
-                            graph.removeNode(n);
-                        }
-                        else if (PARAMS.extractIsolatedNodesWithAnnotationsOnOneSide) {
-                            if (hasOutAnnotations && !hasInAnnotations) {
-                                child.isInExtract = true; // for ones with high out-annotations
-                                renderNode.isolatedInExtract.push(child);
-                                child.node.include = graph_1.InclusionType.EXCLUDE;
-                                graph.removeNode(n);
-                            }
-                            else if (hasInAnnotations && !hasOutAnnotations) {
-                                child.isOutExtract = true; // for ones with high in-annotations
-                                renderNode.isolatedOutExtract.push(child);
-                                child.node.include = graph_1.InclusionType.EXCLUDE;
-                                graph.removeNode(n);
-                            }
-                            else {
-                            }
-                        }
-                    }
-                });
-            }
-        })(render = graph_1.render || (graph_1.render = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // close module tf.graph.render
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph) {
-        var scene;
-        (function (scene) {
-            var annotation;
-            (function (annotation_1) {
-                /**
-                 * Populate a given annotation container group
-                 *
-                 *     <g class='{in|out}-annotations'></g>
-                 *
-                 * with annotation group of the following structure:
-                 *
-                 * <g class='annotation'>
-                 *   <g class='annotation-node'>
-                 *   \x3c!--
-                 *   Content here determined by Scene.node.buildGroup.
-                 *   --\x3e
-                 *   </g>
-                 * </g>
-                 *
-                 * @param container selection of the container.
-                 * @param annotationData node.{in|out}Annotations
-                 * @param d node to build group for.
-                 * @param sceneElement <tf-graph-scene> polymer element.
-                 * @return selection of appended objects
-                 */
-                function buildGroup(container, annotationData, d, sceneElement) {
-                    // Select all children and join with data.
-                    var annotationGroups = container
-                        .selectAll(function () {
-                        // using d3's selector function
-                        // See https://github.com/mbostock/d3/releases/tag/v2.0.0
-                        // (It's not listed in the d3 wiki.)
-                        return this.childNodes;
-                    })
-                        .data(annotationData.list, function (d) { return d.node.name; });
-                    annotationGroups.enter()
-                        .append('g')
-                        .attr('data-name', function (a) { return a.node.name; })
-                        .each(function (a) {
-                        var aGroup = d3.select(this);
-                        // Add annotation to the index in the scene
-                        sceneElement.addAnnotationGroup(a, d, aGroup);
-                        // Append annotation edge
-                        var edgeType = scene.Class.Annotation.EDGE;
-                        var metaedge = a.renderMetaedgeInfo && a.renderMetaedgeInfo.metaedge;
-                        if (metaedge && !metaedge.numRegularEdges) {
-                            edgeType += ' ' + scene.Class.Annotation.CONTROL_EDGE;
-                        }
-                        // If any edges are reference edges, add the reference edge class.
-                        if (metaedge && metaedge.numRefEdges) {
-                            edgeType += ' ' + scene.Class.Edge.REF_LINE;
-                        }
-                        scene.edge.appendEdge(aGroup, a, sceneElement, edgeType);
-                        if (a.annotationType !== graph.render.AnnotationType.ELLIPSIS) {
-                            addAnnotationLabelFromNode(aGroup, a);
-                            buildShape(aGroup, a);
-                        }
-                        else {
-                            addAnnotationLabel(aGroup, a.node.name, a, scene.Class.Annotation.ELLIPSIS);
-                        }
-                    });
-                    annotationGroups
-                        .attr('class', function (a) {
-                        return scene.Class.Annotation.GROUP + ' ' +
-                            annotationToClassName(a.annotationType) + ' ' +
-                            scene.node.nodeClass(a);
-                    })
-                        .each(function (a) {
-                        var aGroup = d3.select(this);
-                        update(aGroup, d, a, sceneElement);
-                        if (a.annotationType !== graph.render.AnnotationType.ELLIPSIS) {
-                            addInteraction(aGroup, d, a, sceneElement);
-                        }
-                    });
-                    annotationGroups.exit()
-                        .each(function (a) {
-                        var aGroup = d3.select(this);
-                        // Remove annotation from the index in the scene
-                        sceneElement.removeAnnotationGroup(a, d, aGroup);
-                    })
-                        .remove();
-                    return annotationGroups;
-                }
-                annotation_1.buildGroup = buildGroup;
-                ;
-                /**
-                 * Maps an annotation enum to a class name used in css rules.
-                 */
-                function annotationToClassName(annotationType) {
-                    return (graph.render.AnnotationType[annotationType] || '').toLowerCase() || null;
-                }
-                function buildShape(aGroup, a) {
-                    if (a.annotationType === graph.render.AnnotationType.SUMMARY) {
-                        var summary = scene.selectOrCreateChild(aGroup, 'use');
-                        summary.attr({
-                            'class': 'summary',
-                            'xlink:href': '#summary-icon',
-                            'cursor': 'pointer'
-                        });
-                    }
-                    else {
-                        var shape = scene.node.buildShape(aGroup, a, scene.Class.Annotation.NODE);
-                        // add title tag to get native tooltips
-                        scene.selectOrCreateChild(shape, 'title').text(a.node.name);
-                    }
-                }
-                function addAnnotationLabelFromNode(aGroup, a) {
-                    var namePath = a.node.name.split('/');
-                    var text = namePath[namePath.length - 1];
-                    return addAnnotationLabel(aGroup, text, a, null);
-                }
-                function addAnnotationLabel(aGroup, label, a, additionalClassNames) {
-                    var classNames = scene.Class.Annotation.LABEL;
-                    if (additionalClassNames) {
-                        classNames += ' ' + additionalClassNames;
-                    }
-                    var txtElement = aGroup.append('text')
-                        .attr('class', classNames)
-                        .attr('dy', '.35em')
-                        .attr('text-anchor', a.isIn ? 'end' : 'start')
-                        .text(label);
-                    return tf.graph.scene.node.enforceLabelWidth(txtElement, -1);
-                }
-                function addInteraction(selection, d, annotation, sceneElement) {
-                    selection
-                        .on('mouseover', function (a) {
-                        sceneElement.fire('annotation-highlight', { name: a.node.name, hostName: d.node.name });
-                    })
-                        .on('mouseout', function (a) {
-                        sceneElement.fire('annotation-unhighlight', { name: a.node.name, hostName: d.node.name });
-                    })
-                        .on('click', function (a) {
-                        // Stop this event's propagation so that it isn't also considered a
-                        // graph-select.
-                        d3.event.stopPropagation();
-                        sceneElement.fire('annotation-select', { name: a.node.name, hostName: d.node.name });
-                    });
-                    if (annotation.annotationType !== graph.render.AnnotationType.SUMMARY &&
-                        annotation.annotationType !== graph.render.AnnotationType.CONSTANT) {
-                        selection.on('contextmenu', scene.contextmenu.getMenu(scene.node.getContextMenu(annotation.node, sceneElement)));
-                    }
-                }
-                ;
-                /**
-                 * Adjust annotation's position.
-                 *
-                 * @param aGroup selection of a 'g.annotation' element.
-                 * @param d Host node data.
-                 * @param a annotation node data.
-                 * @param sceneElement <tf-graph-scene> polymer element.
-                 */
-                function update(aGroup, d, a, sceneElement) {
-                    var cx = graph.layout.computeCXPositionOfNodeShape(d);
-                    // Annotations that point to embedded nodes (constants,summary)
-                    // don't have a render information attached so we don't stylize these.
-                    // Also we don't stylize ellipsis annotations (the string '... and X more').
-                    if (a.renderNodeInfo &&
-                        a.annotationType !== graph.render.AnnotationType.ELLIPSIS) {
-                        scene.node.stylize(aGroup, a.renderNodeInfo, sceneElement, scene.Class.Annotation.NODE);
-                    }
-                    if (a.annotationType === graph.render.AnnotationType.SUMMARY) {
-                        // Update the width of the annotation to give space for the image.
-                        a.width += 10;
-                    }
-                    // label position
-                    aGroup.select('text.' + scene.Class.Annotation.LABEL).transition().attr({
-                        x: cx + a.dx + (a.isIn ? -1 : 1) * (a.width / 2 + a.labelOffset),
-                        y: d.y + a.dy
-                    });
-                    // Some annotations (such as summary) are represented using a 12x12 image tag.
-                    // Purposely omitted units (e.g. pixels) since the images are vector graphics.
-                    // If there is an image, we adjust the location of the image to be vertically
-                    // centered with the node and horizontally centered between the arrow and the
-                    // text label.
-                    aGroup.select('use.summary').transition().attr({
-                        x: cx + a.dx - 3,
-                        y: d.y + a.dy - 6
-                    });
-                    // Node position (only one of the shape selection will be non-empty.)
-                    scene.positionEllipse(aGroup.select('.' + scene.Class.Annotation.NODE + ' ellipse'), cx + a.dx, d.y + a.dy, a.width, a.height);
-                    scene.positionRect(aGroup.select('.' + scene.Class.Annotation.NODE + ' rect'), cx + a.dx, d.y + a.dy, a.width, a.height);
-                    scene.positionRect(aGroup.select('.' + scene.Class.Annotation.NODE + ' use'), cx + a.dx, d.y + a.dy, a.width, a.height);
-                    // Edge position
-                    aGroup.select('path.' + scene.Class.Annotation.EDGE).transition().attr('d', function (a) {
-                        // map relative position to absolute position
-                        var points = a.points.map(function (p) { return { x: p.dx + cx, y: p.dy + d.y }; });
-                        return scene.edge.interpolate(points);
-                    });
-                }
-                ;
-            })(annotation = scene.annotation || (scene.annotation = {}));
-        })(scene = graph.scene || (graph.scene = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // close module
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph) {
-        var scene;
-        (function (scene) {
-            var contextmenu;
-            (function (contextmenu) {
-                /**
-                 * Returns the event listener, which can be used as an argument for the d3
-                 * selection.on function. Renders the context menu that is to be displayed
-                 * in response to the event.
-                 */
-                function getMenu(menu) {
-                    var menuSelection = d3.select('.context-menu');
-                    // Close the menu when anything else is clicked.
-                    d3.select('body').on('click.context', function () { menuSelection.style('display', 'none'); });
-                    // Function called to populate the context menu.
-                    return function (data, index) {
-                        var _this = this;
-                        // Position and display the menu.
-                        var event = d3.event;
-                        menuSelection.style({
-                            'display': 'block',
-                            'left': (event.layerX + 1) + 'px',
-                            'top': (event.layerY + 1) + 'px'
-                        });
-                        // Stop the event from propagating further.
-                        event.preventDefault();
-                        event.stopPropagation();
-                        // Add provided items to the context menu.
-                        menuSelection.html('');
-                        var list = menuSelection.append('ul');
-                        list.selectAll('li')
-                            .data(menu)
-                            .enter()
-                            .append('li')
-                            .html(function (d) { return d.title(data); })
-                            .on('click', function (d, i) {
-                            d.action(_this, data, index);
-                            menuSelection.style('display', 'none');
-                        });
-                    };
-                }
-                contextmenu.getMenu = getMenu;
-                ;
-            })(contextmenu = scene.contextmenu || (scene.contextmenu = {}));
-        })(scene = graph.scene || (graph.scene = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // close module
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph_1) {
-        var scene;
-        (function (scene) {
-            var edge;
-            (function (edge) {
-                /** Delimiter between dimensions when showing sizes of tensors. */
-                var TENSOR_SHAPE_DELIM = '×';
-                /** The minimum stroke width of an edge. */
-                edge.MIN_EDGE_WIDTH = 0.75;
-                /** The maximum stroke width of an edge. */
-                edge.MAX_EDGE_WIDTH = 12;
-                /** The exponent used in the power scale for edge thickness. */
-                var EDGE_WIDTH_SCALE_EXPONENT = 0.3;
-                /** The domain (min and max value) for the edge width. */
-                var DOMAIN_EDGE_WIDTH_SCALE = [1, 5E6];
-                edge.EDGE_WIDTH_SCALE = d3.scale.pow()
-                    .exponent(EDGE_WIDTH_SCALE_EXPONENT)
-                    .domain(DOMAIN_EDGE_WIDTH_SCALE)
-                    .range([edge.MIN_EDGE_WIDTH, edge.MAX_EDGE_WIDTH])
-                    .clamp(true);
-                var arrowheadMap = d3.scale.quantize().domain([edge.MIN_EDGE_WIDTH, edge.MAX_EDGE_WIDTH]).range([
-                    'small', 'medium', 'large', 'xlarge'
-                ]);
-                /** Minimum stroke width to put edge labels in the middle of edges */
-                var CENTER_EDGE_LABEL_MIN_STROKE_WIDTH = 2.5;
-                function getEdgeKey(edgeObj) {
-                    return edgeObj.v + graph_1.EDGE_KEY_DELIM + edgeObj.w;
-                }
-                edge.getEdgeKey = getEdgeKey;
-                /**
-                 * Select or Create a 'g.edges' group to a given sceneGroup
-                 * and builds a number of 'g.edge' groups inside the group.
-                 *
-                 * Structure Pattern:
-                 *
-                 * <g class='edges'>
-                 *   <g class='edge'>
-                 *     <path class='edgeline'/>
-                 *   </g>
-                 *   ...
-                 * </g>
-                 *
-                 *
-                 * @param sceneGroup container
-                 * @param graph
-                 * @param sceneElement <tf-graph-scene> polymer element.
-                 * @return selection of the created nodeGroups
-                 */
-                function buildGroup(sceneGroup, graph, sceneElement) {
-                    var edges = [];
-                    edges = _.reduce(graph.edges(), function (edges, edgeObj) {
-                        var edgeLabel = graph.edge(edgeObj);
-                        edges.push({
-                            v: edgeObj.v,
-                            w: edgeObj.w,
-                            label: edgeLabel
-                        });
-                        return edges;
-                    }, edges);
-                    var container = scene.selectOrCreateChild(sceneGroup, 'g', scene.Class.Edge.CONTAINER);
-                    // Select all children and join with data.
-                    // (Note that all children of g.edges are g.edge)
-                    var edgeGroups = container.selectAll(function () {
-                        // using d3's selector function
-                        // See https://github.com/mbostock/d3/releases/tag/v2.0.0
-                        // (It's not listed in the d3 wiki.)
-                        return this.childNodes;
-                    }).data(edges, getEdgeKey);
-                    // Make edges a group to support rendering multiple lines for metaedge
-                    edgeGroups.enter()
-                        .append('g')
-                        .attr('class', scene.Class.Edge.GROUP)
-                        .attr('data-edge', getEdgeKey)
-                        .each(function (d) {
-                        var edgeGroup = d3.select(this);
-                        d.label.edgeGroup = edgeGroup;
-                        // index node group for quick highlighting
-                        sceneElement._edgeGroupIndex[getEdgeKey(d)] = edgeGroup;
-                        // Add line during enter because we're assuming that type of line
-                        // normally does not change.
-                        appendEdge(edgeGroup, d, sceneElement);
-                    });
-                    edgeGroups.each(position);
-                    edgeGroups.each(function (d) {
-                        stylize(d3.select(this), d, sceneElement);
-                    });
-                    edgeGroups.exit()
-                        .each(function (d) {
-                        delete sceneElement._edgeGroupIndex[getEdgeKey(d)];
-                    })
-                        .remove();
-                    return edgeGroups;
-                }
-                edge.buildGroup = buildGroup;
-                ;
-                /**
-                 * Returns the label for the given base edge.
-                 * The label is the shape of the underlying tensor.
-                 */
-                function getLabelForBaseEdge(baseEdge, renderInfo) {
-                    var node = renderInfo.getNodeByName(baseEdge.v);
-                    if (node.outputShapes == null || node.outputShapes.length === 0) {
-                        return null;
-                    }
-                    var shape = node.outputShapes[baseEdge.outputTensorIndex];
-                    if (shape == null) {
-                        return null;
-                    }
-                    if (shape.length === 0) {
-                        return 'scalar';
-                    }
-                    return shape.map(function (size) { return size === -1 ? '?' : size; })
-                        .join(TENSOR_SHAPE_DELIM);
-                }
-                edge.getLabelForBaseEdge = getLabelForBaseEdge;
-                /**
-                 * Creates the label for the given metaedge. If the metaedge consists
-                 * of only 1 tensor, and it's shape is known, the label will contain that
-                 * shape. Otherwise, the label will say the number of tensors in the metaedge.
-                 */
-                function getLabelForEdge(metaedge, renderInfo) {
-                    var isMultiEdge = metaedge.baseEdgeList.length > 1;
-                    return isMultiEdge ?
-                        metaedge.baseEdgeList.length + ' tensors' :
-                        getLabelForBaseEdge(metaedge.baseEdgeList[0], renderInfo);
-                }
-                edge.getLabelForEdge = getLabelForEdge;
-                /**
-                 * Shortens the path enought such that the tip of the start/end marker will
-                 * point to the start/end of the path. The marker can be of arbitrary size.
-                 *
-                 * @param points Array of path control points.
-                 * @param marker D3 selection of the <marker> svg element.
-                 * @param isStart Is the marker a `start-marker`. If false, the marker is
-                 *     an `end-marker`.
-                 * @return The new array of control points.
-                 */
-                function adjustPathPointsForMarker(points, marker, isStart) {
-                    var lineFunc = d3.svg.line()
-                        .x(function (d) { return d.x; })
-                        .y(function (d) { return d.y; });
-                    var path = d3.select(document.createElementNS('http://www.w3.org/2000/svg', 'path'))
-                        .attr('d', lineFunc(points));
-                    var markerWidth = +marker.attr('markerWidth');
-                    var viewBox = marker.attr('viewBox').split(' ').map(Number);
-                    var viewBoxWidth = viewBox[2] - viewBox[0];
-                    var refX = +marker.attr('refX');
-                    var pathNode = path.node();
-                    if (isStart) {
-                        var fractionStickingOut = refX / viewBoxWidth;
-                        var length_1 = markerWidth * fractionStickingOut;
-                        var point = pathNode.getPointAtLength(length_1);
-                        // Figure out how many segments of the path we need to remove in order
-                        // to shorten the path.
-                        var segIndex = pathNode.getPathSegAtLength(length_1);
-                        // Update the very first segment.
-                        points[segIndex - 1] = { x: point.x, y: point.y };
-                        // Ignore every point before segIndex - 1.
-                        return points.slice(segIndex - 1);
-                    }
-                    else {
-                        var fractionStickingOut = 1 - refX / viewBoxWidth;
-                        var length_2 = pathNode.getTotalLength() - markerWidth * fractionStickingOut;
-                        var point = pathNode.getPointAtLength(length_2);
-                        // Figure out how many segments of the path we need to remove in order
-                        // to shorten the path.
-                        var segIndex = pathNode.getPathSegAtLength(length_2);
-                        // Update the very last segment.
-                        points[segIndex] = { x: point.x, y: point.y };
-                        // Ignore every point after segIndex.
-                        return points.slice(0, segIndex + 1);
-                    }
-                }
-                /**
-                 * For a given d3 selection and data object, create a path to represent the
-                 * edge described in d.label.
-                 *
-                 * If d.label is defined, it will be a RenderMetaedgeInfo instance. It
-                 * will sometimes be undefined, for example for some Annotation edges for which
-                 * there is no underlying Metaedge in the hierarchical graph.
-                 */
-                function appendEdge(edgeGroup, d, sceneElement, edgeClass) {
-                    var size = 1;
-                    if (d.label != null && d.label.metaedge != null) {
-                        // There is an underlying Metaedge.
-                        size = d.label.metaedge.totalSize;
-                    }
-                    edgeClass = edgeClass || scene.Class.Edge.LINE; // set default type
-                    if (d.label && d.label.structural) {
-                        edgeClass += ' ' + scene.Class.Edge.STRUCTURAL;
-                    }
-                    // Give the path a unique id, which will be used to link
-                    // the textPath (edge label) to this path.
-                    var pathId = 'path_' + getEdgeKey(d);
-                    var strokeWidth = sceneElement.renderHierarchy.edgeWidthScale(size);
-                    var path = edgeGroup.append('path')
-                        .attr({
-                        'id': pathId,
-                        'class': edgeClass,
-                    })
-                        .style({ 'stroke-width': strokeWidth + 'px' });
-                    // Check if there is a reference edge and add an arrowhead of the right size.
-                    if (d.label && d.label.metaedge && d.label.metaedge.numRefEdges) {
-                        var markerId = "ref-arrowhead-" + arrowheadMap(strokeWidth);
-                        path.style('marker-start', "url(#" + markerId + ")");
-                        d.label.startMarkerId = markerId;
-                    }
-                    if (d.label == null || d.label.metaedge == null) {
-                        // There is no associated metaedge, thus no text.
-                        // This happens for annotation edges.
-                        return;
-                    }
-                    var labelForEdge = getLabelForEdge(d.label.metaedge, sceneElement.renderHierarchy);
-                    if (labelForEdge == null) {
-                        // We have no information to show on this edge.
-                        return;
-                    }
-                    // Put edge label in the middle of edge only if the edge is thick enough.
-                    var baseline = strokeWidth > CENTER_EDGE_LABEL_MIN_STROKE_WIDTH ?
-                        'central' :
-                        'text-after-edge';
-                    edgeGroup.append('text')
-                        .append('textPath')
-                        .attr({
-                        'xlink:href': '#' + pathId,
-                        'startOffset': '50%',
-                        'text-anchor': 'middle',
-                        'dominant-baseline': 'central'
-                    })
-                        .text(labelForEdge);
-                }
-                edge.appendEdge = appendEdge;
-                ;
-                edge.interpolate = d3.svg.line()
-                    .interpolate('basis')
-                    .x(function (d) { return d.x; })
-                    .y(function (d) { return d.y; });
-                /**
-                 * Returns a tween interpolator for the endpoint of an edge path.
-                 */
-                function getEdgePathInterpolator(d, i, a) {
-                    var renderMetaedgeInfo = d.label;
-                    var adjoiningMetaedge = renderMetaedgeInfo.adjoiningMetaedge;
-                    var points = renderMetaedgeInfo.points;
-                    // Adjust the path so that start/end markers point to the end
-                    // of the path.
-                    if (d.label.startMarkerId) {
-                        points = adjustPathPointsForMarker(points, d3.select('#' + d.label.startMarkerId), true);
-                    }
-                    if (d.label.endMarkerId) {
-                        points = adjustPathPointsForMarker(points, d3.select('#' + d.label.endMarkerId), false);
-                    }
-                    if (!adjoiningMetaedge) {
-                        return d3.interpolate(a, edge.interpolate(points));
-                    }
-                    var renderPath = this;
-                    // Get the adjoining path that matches the adjoining metaedge.
-                    var adjoiningPath = (adjoiningMetaedge.edgeGroup.node()
-                        .firstChild);
-                    // Find the desired SVGPoint along the adjoining path, then convert those
-                    // coordinates into the space of the renderPath using its Current
-                    // Transformation Matrix (CTM).
-                    var inbound = renderMetaedgeInfo.metaedge.inbound;
-                    return function (t) {
-                        var adjoiningPoint = adjoiningPath
-                            .getPointAtLength(inbound ? adjoiningPath.getTotalLength() : 0)
-                            .matrixTransform(adjoiningPath.getCTM())
-                            .matrixTransform(renderPath.getCTM().inverse());
-                        // Update the relevant point in the renderMetaedgeInfo's points list, then
-                        // re-interpolate the path.
-                        var index = inbound ? 0 : points.length - 1;
-                        points[index].x = adjoiningPoint.x;
-                        points[index].y = adjoiningPoint.y;
-                        var dPath = edge.interpolate(points);
-                        return dPath;
-                    };
-                }
-                function position(d) {
-                    d3.select(this)
-                        .select('path.' + scene.Class.Edge.LINE)
-                        .transition()
-                        .attrTween('d', getEdgePathInterpolator);
-                }
-                ;
-                /**
-                 * For a given d3 selection and data object, mark the edge as a control
-                 * dependency if it contains only control edges.
-                 *
-                 * d's label property will be a RenderMetaedgeInfo object.
-                 */
-                function stylize(edgeGroup, d, stylize) {
-                    edgeGroup.classed('faded', d.label.isFadedOut);
-                    var metaedge = d.label.metaedge;
-                    edgeGroup.select('path.' + scene.Class.Edge.LINE)
-                        .classed('control-dep', metaedge && !metaedge.numRegularEdges);
-                }
-                ;
-            })(edge = scene.edge || (scene.edge = {}));
-        })(scene = graph_1.scene || (graph_1.scene = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // close module
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph) {
-        var scene;
-        (function (scene) {
-            var node;
-            (function (node_1) {
-                /**
-                 * Select or Create a 'g.nodes' group to a given sceneGroup
-                 * and builds a number of 'g.node' groups inside the group.
-                 *
-                 * Structure Pattern:
-                 *
-                 * <g class='nodes'>
-                 *   <g class='node'>
-                 *     <g class='in-annotations'>
-                 *       ...
-                 *     </g>
-                 *     <g class='out-annotations'>
-                 *       ...
-                 *     </g>
-                 *     <g class='nodeshape'>
-                 *      \x3c!--
-                 *      Content of the node shape should be for the node itself. For example a
-                 *      Metanode would have a <rect> with rounded edges, an op would have an
-                 *      <ellipse>. More complex nodes like series may contain multiple
-                 *      elements which are conditionally visible based on whether the node is
-                 *      expanded.
-                 *      --\x3e
-                 *     </g>
-                 *     <text class='label'>node name</text>
-                 *     <g class='subscene'>
-                 *       \x3c!--
-                 *       Content of  the subscene (only for metanode and series node).
-                 *
-                 *       Subscene is a svg group that contains content of the
-                 *       metanode's metagraph that is recursively generated by Scene.build().
-                 *
-                 *       When the graph is expanded multiple times, a subscene can contain
-                 *       nested subscenes inside.
-                 *       --\x3e
-                 *     </g>
-                 *   </g>
-                 *   ...
-                 * </g>
-                 *
-                 *
-                 * @param sceneGroup selection of the container
-                 * @param nodeData array of render node information to map
-                 * @param sceneElement <tf-graph-scene> polymer element
-                 * @return selection of the created nodeGroups
-                 */
-                function buildGroup(sceneGroup, nodeData, sceneElement) {
-                    var container = scene.selectOrCreateChild(sceneGroup, 'g', scene.Class.Node.CONTAINER);
-                    // Select all children and join with data.
-                    // (Note that all children of g.nodes are g.node)
-                    var nodeGroups = container
-                        .selectAll(function () {
-                        // using d3's selector function
-                        // See https://github.com/mbostock/d3/releases/tag/v2.0.0
-                        // (It's not listed in the d3 wiki.)
-                        return this.childNodes; // this here refers to container.node()
-                    })
-                        .data(nodeData, function (d) {
-                        // make sure that we don't have to swap shape type
-                        return d.node.name + ':' + d.node.type;
-                    });
-                    // ENTER
-                    nodeGroups.enter()
-                        .append('g')
-                        .attr('data-name', function (d) { return d.node.name; })
-                        .each(function (d) {
-                        var nodeGroup = d3.select(this);
-                        // index node group for quick stylizing
-                        sceneElement.addNodeGroup(d.node.name, nodeGroup);
-                    });
-                    // UPDATE
-                    nodeGroups
-                        .attr('class', function (d) { return scene.Class.Node.GROUP + ' ' + nodeClass(d); })
-                        .each(function (d) {
-                        var nodeGroup = d3.select(this);
-                        // Add g.in-annotations (always add -- to keep layer order
-                        // consistent.)
-                        var inAnnotationBox = scene.selectOrCreateChild(nodeGroup, 'g', scene.Class.Annotation.INBOX);
-                        scene.annotation.buildGroup(inAnnotationBox, d.inAnnotations, d, sceneElement);
-                        // Add g.out-annotations  (always add -- to keep layer order
-                        // consistent.)
-                        var outAnnotationBox = scene.selectOrCreateChild(nodeGroup, 'g', scene.Class.Annotation.OUTBOX);
-                        scene.annotation.buildGroup(outAnnotationBox, d.outAnnotations, d, sceneElement);
-                        // Build .shape first (background of the node).
-                        var shape = buildShape(nodeGroup, d, scene.Class.Node.SHAPE);
-                        if (d.node.isGroupNode) {
-                            addButton(shape, d, sceneElement);
-                        }
-                        addInteraction(shape, d, sceneElement);
-                        // Build subscene on the top.
-                        subsceneBuild(nodeGroup, d, sceneElement);
-                        // Build label last. Should be on top of everything else.
-                        var label = labelBuild(nodeGroup, d, sceneElement);
-                        // Do not add interaction to metanode labels as they live inside the
-                        // metanode shape which already has the same interactions.
-                        addInteraction(label, d, sceneElement, d.node.type === graph.NodeType.META);
-                        stylize(nodeGroup, d, sceneElement);
-                        position(nodeGroup, d);
-                    });
-                    // EXIT
-                    nodeGroups.exit()
-                        .each(function (d) {
-                        // remove all indices on remove
-                        sceneElement.removeNodeGroup(d.node.name);
-                        var nodeGroup = d3.select(this);
-                        if (d.inAnnotations.list.length > 0) {
-                            nodeGroup.select('.' + scene.Class.Annotation.INBOX)
-                                .selectAll('.' + scene.Class.Annotation.GROUP)
-                                .each(function (a) { sceneElement.removeAnnotationGroup(a, d); });
-                        }
-                        if (d.outAnnotations.list.length > 0) {
-                            nodeGroup.select('.' + scene.Class.Annotation.OUTBOX)
-                                .selectAll('.' + scene.Class.Annotation.GROUP)
-                                .each(function (a) { sceneElement.removeAnnotationGroup(a, d); });
-                        }
-                    })
-                        .remove();
-                    return nodeGroups;
-                }
-                node_1.buildGroup = buildGroup;
-                ;
-                /**
-                 * Update or remove the subscene of a render group node depending on whether it
-                 * is a expanded. If the node is not a group node, this method has no effect.
-                 *
-                 * @param nodeGroup selection of the container
-                 * @param renderNodeInfo the render information for the node.
-                 * @param sceneElement <tf-graph-scene> polymer element.
-                 * @return Selection of the subscene group, or null if node group does not have
-                 *        a subscene. Op nodes, bridge nodes and unexpanded group nodes will
-                 *        not have a subscene.
-                 */
-                function subsceneBuild(nodeGroup, renderNodeInfo, sceneElement) {
-                    if (renderNodeInfo.node.isGroupNode) {
-                        if (renderNodeInfo.expanded) {
-                            // Recursively build the subscene.
-                            return scene.buildGroup(nodeGroup, renderNodeInfo, sceneElement, scene.Class.Subscene.GROUP);
-                        }
-                        // Clean out existing subscene if the node is not expanded.
-                        scene.selectChild(nodeGroup, 'g', scene.Class.Subscene.GROUP).remove();
-                    }
-                    return null;
-                }
-                ;
-                /**
-                 * Translate the subscene of the given node group
-                 */
-                function subscenePosition(nodeGroup, d) {
-                    var x0 = d.x - d.width / 2.0 + d.paddingLeft;
-                    var y0 = d.y - d.height / 2.0 + d.paddingTop;
-                    var subscene = scene.selectChild(nodeGroup, 'g', scene.Class.Subscene.GROUP);
-                    scene.translate(subscene, x0, y0);
-                }
-                ;
-                /**
-                 * Add an expand/collapse button to a group node
-                 *
-                 * @param selection The group node selection.
-                 * @param d Info about the node being rendered.
-                 * @param sceneElement <tf-graph-scene> polymer element.
-                 */
-                function addButton(selection, d, sceneElement) {
-                    var group = scene.selectOrCreateChild(selection, 'g', scene.Class.Node.BUTTON_CONTAINER);
-                    scene.selectOrCreateChild(group, 'circle', scene.Class.Node.BUTTON_CIRCLE);
-                    scene.selectOrCreateChild(group, 'path', scene.Class.Node.EXPAND_BUTTON)
-                        .attr('d', 'M0,-2.2 V2.2 M-2.2,0 H2.2');
-                    scene.selectOrCreateChild(group, 'path', scene.Class.Node.COLLAPSE_BUTTON)
-                        .attr('d', 'M-2.2,0 H2.2');
-                    group.on('click', function (d) {
-                        // Stop this event's propagation so that it isn't also considered a
-                        // node-select.
-                        d3.event.stopPropagation();
-                        sceneElement.fire('node-toggle-expand', { name: d.node.name });
-                    });
-                    scene.positionButton(group, d);
-                }
-                ;
-                /**
-                 * Fire node-* events when the selection is interacted.
-                 *
-                 * @param disableInteraction When true, have the provided selection
-                 * ignore all pointer events. Used for text labels inside of metanodes, which
-                 * don't need interaction as their surrounding shape has interaction, and if
-                 * given interaction would cause conflicts with the expand/collapse button.
-                 */
-                function addInteraction(selection, d, sceneElement, disableInteraction) {
-                    if (disableInteraction) {
-                        selection.attr('pointer-events', 'none');
-                        return;
-                    }
-                    var contextMenuFunction = scene.contextmenu.getMenu(getContextMenu(d.node, sceneElement));
-                    selection
-                        .on('dblclick', function (d) {
-                        sceneElement.fire('node-toggle-expand', { name: d.node.name });
-                    })
-                        .on('mouseover', function (d) {
-                        // don't send mouseover over expanded group,
-                        // otherwise it is causing too much glitches
-                        if (sceneElement.isNodeExpanded(d)) {
-                            return;
-                        }
-                        sceneElement.fire('node-highlight', { name: d.node.name });
-                    })
-                        .on('mouseout', function (d) {
-                        // don't send mouseover over expanded group,
-                        // otherwise it is causing too much glitches
-                        if (sceneElement.isNodeExpanded(d)) {
-                            return;
-                        }
-                        sceneElement.fire('node-unhighlight', { name: d.node.name });
-                    })
-                        .on('click', function (d) {
-                        // Stop this event's propagation so that it isn't also considered
-                        // a graph-select.
-                        d3.event.stopPropagation();
-                        sceneElement.fire('node-select', { name: d.node.name });
-                    })
-                        .on('contextmenu', function (d, i) {
-                        sceneElement.fire('node-select', { name: d.node.name });
-                        contextMenuFunction.call(d, i);
-                    });
-                }
-                ;
-                /**
-                 * Returns the d3 context menu specification for the provided node.
-                 */
-                function getContextMenu(node, sceneElement) {
-                    var menu = [{
-                            title: function (d) {
-                                return graph.getIncludeNodeButtonString(node.include);
-                            },
-                            action: function (elm, d, i) {
-                                sceneElement.fire('node-toggle-extract', { name: node.name });
-                            }
-                        }];
-                    if (canBeInSeries(node)) {
-                        menu.push({
-                            title: function (d) { return getGroupSettingLabel(node); },
-                            action: function (elm, d, i) {
-                                sceneElement.fire('node-toggle-seriesgroup', { name: getSeriesName(node) });
-                            }
-                        });
-                    }
-                    return menu;
-                }
-                node_1.getContextMenu = getContextMenu;
-                /** Returns if a node can be part of a grouped series */
-                function canBeInSeries(node) {
-                    return getSeriesName(node) !== null;
-                }
-                node_1.canBeInSeries = canBeInSeries;
-                /**
-                 * Returns the name of the possible grouped series containing this node.
-                 * Returns null if the node cannot be part of a grouped series of nodes.
-                 */
-                function getSeriesName(node) {
-                    if (!node) {
-                        return null;
-                    }
-                    if (node.type === graph.NodeType.SERIES) {
-                        return node.name;
-                    }
-                    if (node.type === graph.NodeType.OP) {
-                        var op = node;
-                        return op.owningSeries;
-                    }
-                    return null;
-                }
-                node_1.getSeriesName = getSeriesName;
-                /**
-                 * Returns the SeriesNode that represents the series that the provided node
-                 * is contained in (or itself if the provided node is itself a SeriesNode).
-                 * Returns null if the node is not rendered as part of a series.
-                 */
-                function getContainingSeries(node) {
-                    var s = null;
-                    if (!node) {
-                        return null;
-                    }
-                    else if (node.type === graph.NodeType.SERIES) {
-                        s = node;
-                    }
-                    else if (node.parentNode && node.parentNode.type === graph.NodeType.SERIES) {
-                        s = node.parentNode;
-                    }
-                    return s;
-                }
-                /**
-                 * Returns the label for a button to toggle the group setting of the provided
-                 * node.
-                 */
-                function getGroupSettingLabel(node) {
-                    return tf.graph.getGroupSeriesNodeButtonString(getContainingSeries(node) !== null ? tf.graph.SeriesGroupingType.GROUP :
-                        tf.graph.SeriesGroupingType.UNGROUP);
-                }
-                node_1.getGroupSettingLabel = getGroupSettingLabel;
-                /**
-                 * Append svg text for label and assign data.
-                 * @param nodeGroup
-                 * @param renderNodeInfo The render node information for the label.
-                 * @param sceneElement <tf-graph-scene> polymer element.
-                 */
-                function labelBuild(nodeGroup, renderNodeInfo, sceneElement) {
-                    var namePath = renderNodeInfo.node.name.split('/');
-                    var text = namePath[namePath.length - 1];
-                    // Truncate long labels for unexpanded Metanodes.
-                    var useFontScale = renderNodeInfo.node.type === graph.NodeType.META &&
-                        !renderNodeInfo.expanded;
-                    var label = scene.selectOrCreateChild(nodeGroup, 'text', scene.Class.Node.LABEL);
-                    // Make sure the label is visually on top among its siblings.
-                    var labelNode = label.node();
-                    labelNode.parentNode.appendChild(labelNode);
-                    label.attr('dy', '.35em').attr('text-anchor', 'middle');
-                    if (useFontScale) {
-                        if (text.length > sceneElement.maxMetanodeLabelLength) {
-                            text = text.substr(0, sceneElement.maxMetanodeLabelLength - 2) + '...';
-                        }
-                        var scale = getLabelFontScale(sceneElement);
-                        label.attr('font-size', scale(text.length) + 'px');
-                    }
-                    var txtElement = label.text(text);
-                    enforceLabelWidth(txtElement, renderNodeInfo.node.type, renderNodeInfo);
-                    return label;
-                }
-                /**
-                 * This function shortens text which would exceed the maximum pixel width of
-                 * a label.
-                 *
-                 * @param txtElementSelection The text element containing the label's text as d3
-                 * selection.
-                 * @param nodeType The type of the node the label belongs to. If the node is
-                 * an annotation, the value is -1. Label widths are defined in
-                 * layout.PARAMS.nodeSize.{meta|op|...}.maxLabelWidth for nodes and
-                 * layout.PARAMS.annotations.labelWidth for annotations.
-                 * @param renderNodeInfo The render information about the node, required to
-                 * determine whether META nodes are collapsed or expanded.
-                 */
-                function enforceLabelWidth(txtElementSelection, nodeType, renderNodeInfo) {
-                    // Get text element itself and its on-screen width.
-                    var txtNode = txtElementSelection.node();
-                    var computedTxtLength = txtNode.getComputedTextLength();
-                    var labelContent = txtNode.textContent;
-                    // Get maximum length from settings.
-                    var maxLength = null;
-                    switch (nodeType) {
-                        case graph.NodeType.META:
-                            if (renderNodeInfo && !renderNodeInfo.expanded) {
-                                // node expanded.
-                                maxLength = graph.layout.PARAMS.nodeSize.meta.maxLabelWidth;
-                            }
-                            break;
-                        case graph.NodeType.OP:
-                            maxLength = graph.layout.PARAMS.nodeSize.op.maxLabelWidth;
-                            break;
-                        case -1:
-                            maxLength = graph.layout.PARAMS.annotations.maxLabelWidth;
-                            break;
-                        default:
-                            break;
-                    }
-                    // Return if no max length provided for node type, or current label length is
-                    // less than or equal to the provided length limit.
-                    if (maxLength === null || computedTxtLength <= maxLength) {
-                        return;
-                    }
-                    // Find the index of the character which exceeds the width.
-                    // getSubStringLength performs far better than getComputedTextLength, and
-                    // results in a 3x speed-up on average.
-                    var index = 1;
-                    while (txtNode.getSubStringLength(0, index) < maxLength) {
-                        index++;
-                    }
-                    // Shorten the label starting at the string length known to be one
-                    // character above max pixel length.
-                    // When shortened the original label's substring is concatenated with
-                    // '...', baseText contains the substring not including the '...'.
-                    var baseText = txtNode.textContent.substr(0, index);
-                    do {
-                        baseText = baseText.substr(0, baseText.length - 1);
-                        // Recompute text length.
-                        txtNode.textContent = baseText + '...';
-                        computedTxtLength = txtNode.getComputedTextLength();
-                    } while (computedTxtLength > maxLength && baseText.length > 0);
-                    // Add tooltip with full name and return.
-                    return txtElementSelection.append('title').text(labelContent);
-                }
-                node_1.enforceLabelWidth = enforceLabelWidth;
-                /**
-                 * d3 scale used for sizing font of labels, used by labelBuild,
-                 * initialized once by getLabelFontScale.
-                 */
-                var fontScale = null;
-                function getLabelFontScale(sceneElement) {
-                    if (!fontScale) {
-                        fontScale = d3.scale.linear()
-                            .domain([sceneElement.maxMetanodeLabelLengthLargeFont,
-                            sceneElement.maxMetanodeLabelLength])
-                            .range([sceneElement.maxMetanodeLabelLengthFontSize,
-                            sceneElement.minMetanodeLabelLengthFontSize]).clamp(true);
-                    }
-                    return fontScale;
-                }
-                /**
-                 * Set label position of a given node group
-                 */
-                function labelPosition(nodeGroup, cx, cy, yOffset) {
-                    scene.selectChild(nodeGroup, 'text', scene.Class.Node.LABEL)
-                        .transition()
-                        .attr('x', cx)
-                        .attr('y', cy + yOffset);
-                }
-                ;
-                /**
-                 * Select or append/insert shape for a node and assign renderNode
-                 * as the shape's data.
-                 *
-                 * @param nodeGroup
-                 * @param d Render node information.
-                 * @param nodeClass class for the element.
-                 * @return Selection of the shape.
-                 */
-                function buildShape(nodeGroup, d, nodeClass) {
-                    // Create a group to house the underlying visual elements.
-                    var shapeGroup = scene.selectOrCreateChild(nodeGroup, 'g', nodeClass);
-                    // TODO(jimbo): DOM structure should be templated in HTML somewhere, not JS.
-                    switch (d.node.type) {
-                        case graph.NodeType.OP:
-                            scene.selectOrCreateChild(shapeGroup, 'ellipse', scene.Class.Node.COLOR_TARGET);
-                            break;
-                        case graph.NodeType.SERIES:
-                            // Choose the correct stamp to use to represent this series.
-                            var stampType = 'annotation';
-                            var groupNodeInfo = d;
-                            if (groupNodeInfo.coreGraph) {
-                                stampType =
-                                    groupNodeInfo.node.hasNonControlEdges ? 'vertical' : 'horizontal';
-                            }
-                            var classList = [scene.Class.Node.COLOR_TARGET];
-                            if (groupNodeInfo.isFadedOut) {
-                                classList.push('faded-ellipse');
-                            }
-                            scene.selectOrCreateChild(shapeGroup, 'use', classList)
-                                .attr('xlink:href', '#op-series-' + stampType + '-stamp');
-                            scene.selectOrCreateChild(shapeGroup, 'rect', scene.Class.Node.COLOR_TARGET)
-                                .attr({ rx: d.radius, ry: d.radius });
-                            break;
-                        case graph.NodeType.BRIDGE:
-                            scene.selectOrCreateChild(shapeGroup, 'rect', scene.Class.Node.COLOR_TARGET)
-                                .attr({ rx: d.radius, ry: d.radius });
-                            break;
-                        case graph.NodeType.META:
-                            scene.selectOrCreateChild(shapeGroup, 'rect', scene.Class.Node.COLOR_TARGET)
-                                .attr({ rx: d.radius, ry: d.radius });
-                            break;
-                        default:
-                            throw Error('Unrecognized node type: ' + d.node.type);
-                    }
-                    return shapeGroup;
-                }
-                node_1.buildShape = buildShape;
-                ;
-                function nodeClass(d) {
-                    switch (d.node.type) {
-                        case graph.NodeType.OP:
-                            return scene.Class.OPNODE;
-                        case graph.NodeType.META:
-                            return scene.Class.METANODE;
-                        case graph.NodeType.SERIES:
-                            return scene.Class.SERIESNODE;
-                        case graph.NodeType.BRIDGE:
-                            return scene.Class.BRIDGENODE;
-                        case graph.NodeType.ELLIPSIS:
-                            return scene.Class.ELLIPSISNODE;
-                    }
-                    ;
-                    throw Error('Unrecognized node type: ' + d.node.type);
-                }
-                node_1.nodeClass = nodeClass;
-                ;
-                /** Modify node and its subscene and its label's positional attributes */
-                function position(nodeGroup, d) {
-                    var shapeGroup = scene.selectChild(nodeGroup, 'g', scene.Class.Node.SHAPE);
-                    var cx = graph.layout.computeCXPositionOfNodeShape(d);
-                    switch (d.node.type) {
-                        case graph.NodeType.OP: {
-                            // position shape
-                            var shape = scene.selectChild(shapeGroup, 'ellipse');
-                            scene.positionEllipse(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
-                            labelPosition(nodeGroup, cx, d.y, d.labelOffset);
-                            break;
-                        }
-                        case graph.NodeType.META: {
-                            // position shape
-                            var shape = scene.selectChild(shapeGroup, 'rect');
-                            if (d.expanded) {
-                                scene.positionRect(shape, d.x, d.y, d.width, d.height);
-                                subscenePosition(nodeGroup, d);
-                                // put label on top
-                                labelPosition(nodeGroup, cx, d.y, -d.height / 2 + d.labelHeight / 2);
-                            }
-                            else {
-                                scene.positionRect(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
-                                labelPosition(nodeGroup, cx, d.y, 0);
-                            }
-                            break;
-                        }
-                        case graph.NodeType.SERIES: {
-                            var shape = scene.selectChild(shapeGroup, 'use');
-                            if (d.expanded) {
-                                scene.positionRect(shape, d.x, d.y, d.width, d.height);
-                                subscenePosition(nodeGroup, d);
-                                // put label on top
-                                labelPosition(nodeGroup, cx, d.y, -d.height / 2 + d.labelHeight / 2);
-                            }
-                            else {
-                                scene.positionRect(shape, cx, d.y, d.coreBox.width, d.coreBox.height);
-                                labelPosition(nodeGroup, cx, d.y, d.labelOffset);
-                            }
-                            break;
-                        }
-                        case graph.NodeType.BRIDGE: {
-                            // position shape
-                            // NOTE: In reality, these will not be visible, but it helps to put them
-                            // in the correct position for debugging purposes.
-                            var shape = scene.selectChild(shapeGroup, 'rect');
-                            scene.positionRect(shape, d.x, d.y, d.width, d.height);
-                            break;
-                        }
-                        default: {
-                            throw Error('Unrecognized node type: ' + d.node.type);
-                        }
-                    }
-                }
-                ;
-                /** Enum specifying the options to color nodes by */
-                var ColorBy;
-                (function (ColorBy) {
-                    ColorBy[ColorBy["STRUCTURE"] = 0] = "STRUCTURE";
-                    ColorBy[ColorBy["DEVICE"] = 1] = "DEVICE";
-                    ColorBy[ColorBy["COMPUTE_TIME"] = 2] = "COMPUTE_TIME";
-                    ColorBy[ColorBy["MEMORY"] = 3] = "MEMORY";
-                })(ColorBy = node_1.ColorBy || (node_1.ColorBy = {}));
-                ;
-                /**
-                 * Returns the fill color for the node given its state and the 'color by'
-                 * option.
-                 */
-                function getFillForNode(templateIndex, colorBy, renderInfo, isExpanded) {
-                    var colorParams = graph.render.MetanodeColors;
-                    switch (colorBy) {
-                        case ColorBy.STRUCTURE:
-                            if (renderInfo.node.type === graph.NodeType.META) {
-                                var tid = renderInfo.node.templateId;
-                                return tid === null ?
-                                    colorParams.UNKNOWN :
-                                    colorParams.STRUCTURE_PALETTE(templateIndex(tid), isExpanded);
-                            }
-                            else if (renderInfo.node.type === graph.NodeType.SERIES) {
-                                // If expanded, we're showing the background rect, which we want to
-                                // appear gray. Otherwise we're showing a stack of ellipses which we
-                                // want to show white.
-                                return isExpanded ? colorParams.EXPANDED_COLOR : 'white';
-                            }
-                            else if (renderInfo.node.type === graph.NodeType.BRIDGE) {
-                                return renderInfo.structural ?
-                                    '#f0e' :
-                                    renderInfo.node.inbound ? '#0ef' : '#fe0';
-                            }
-                            else {
-                                // Op nodes are white.
-                                return 'white';
-                            }
-                        case ColorBy.DEVICE:
-                            if (renderInfo.deviceColors == null) {
-                                // Return the hue for unknown device.
-                                return colorParams.UNKNOWN;
-                            }
-                            var id = renderInfo.node.name;
-                            var escapedId = tf.graph.util.escapeQuerySelector(id);
-                            var gradientDefs = d3.select('svg#svg defs #linearGradients');
-                            var linearGradient_1 = gradientDefs.select('linearGradient#' + escapedId);
-                            // If the linear gradient is not there yet, create it.
-                            if (linearGradient_1.size() === 0) {
-                                linearGradient_1 = gradientDefs.append('linearGradient').attr('id', id);
-                                // Re-create the stops of the linear gradient.
-                                linearGradient_1.selectAll('*').remove();
-                                var cumulativeProportion_1 = 0;
-                                // For each device, create a stop using the proportion of that device.
-                                _.each(renderInfo.deviceColors, function (d) {
-                                    var color = d.color;
-                                    linearGradient_1.append('stop')
-                                        .attr('offset', cumulativeProportion_1)
-                                        .attr('stop-color', color);
-                                    linearGradient_1.append('stop')
-                                        .attr('offset', cumulativeProportion_1 + d.proportion)
-                                        .attr('stop-color', color);
-                                    cumulativeProportion_1 += d.proportion;
-                                });
-                            }
-                            return isExpanded ? colorParams.EXPANDED_COLOR : "url(#" + escapedId + ")";
-                        case ColorBy.COMPUTE_TIME:
-                            return isExpanded ?
-                                colorParams.EXPANDED_COLOR : renderInfo.computeTimeColor ||
-                                colorParams.UNKNOWN;
-                        case ColorBy.MEMORY:
-                            return isExpanded ?
-                                colorParams.EXPANDED_COLOR : renderInfo.memoryColor ||
-                                colorParams.UNKNOWN;
-                        default:
-                            throw new Error('Unknown case to color nodes by');
-                    }
-                }
-                node_1.getFillForNode = getFillForNode;
-                /**
-                 * Modify node style by toggling class and assign attributes (only for things
-                 * that can't be done in css).
-                 */
-                function stylize(nodeGroup, renderInfo, sceneElement, nodeClass) {
-                    nodeClass = nodeClass || scene.Class.Node.SHAPE;
-                    var isHighlighted = sceneElement.isNodeHighlighted(renderInfo.node.name);
-                    var isSelected = sceneElement.isNodeSelected(renderInfo.node.name);
-                    var isExtract = renderInfo.isInExtract || renderInfo.isOutExtract;
-                    var isExpanded = renderInfo.expanded;
-                    var isFadedOut = renderInfo.isFadedOut;
-                    nodeGroup.classed('highlighted', isHighlighted);
-                    nodeGroup.classed('selected', isSelected);
-                    nodeGroup.classed('extract', isExtract);
-                    nodeGroup.classed('expanded', isExpanded);
-                    nodeGroup.classed('faded', isFadedOut);
-                    // Main node always exists here and it will be reached before subscene,
-                    // so d3 selection is fine here.
-                    var node = nodeGroup.select('.' + nodeClass + ' .' + scene.Class.Node.COLOR_TARGET);
-                    var fillColor = getFillForNode(sceneElement.templateIndex, ColorBy[sceneElement.colorBy.toUpperCase()], renderInfo, isExpanded);
-                    node.style('fill', fillColor);
-                    // Choose outline to be darker version of node color if the node is a single
-                    // color and is not selected.
-                    node.style('stroke', isSelected ? null : getStrokeForFill(fillColor));
-                }
-                node_1.stylize = stylize;
-                ;
-                /**
-                 * Given a node's fill color/gradient, determine the stroke for the node.
-                 */
-                function getStrokeForFill(fill) {
-                    // If node is colored by a gradient, then use a dark gray outline.
-                    return fill.substring(0, 3) === 'url' ?
-                        graph.render.MetanodeColors.GRADIENT_OUTLINE :
-                        d3.rgb(fill).darker().toString();
-                }
-                node_1.getStrokeForFill = getStrokeForFill;
-                /**
-                 * Finds selected node and highlights all nodes which are providing direct
-                 * or indirect input to the node and all edges connecting these nodes
-                 * together and to the selected node.
-                 *
-                 * @param renderGraphInfo Information on the rendered state of the graph.
-                 */
-                function traceInputs(renderGraphInfo) {
-                    // Reset all styling.
-                    d3.selectAll('.input-highlight').classed('input-highlight', false);
-                    d3.selectAll('.non-input').classed('non-input', false);
-                    d3.selectAll('.input-parent').classed('input-parent', false);
-                    d3.selectAll('.input-child').classed('input-child', false);
-                    d3.selectAll('.input-edge-highlight').classed('input-edge-highlight', false);
-                    d3.selectAll('.non-input-edge-highlight')
-                        .classed('non-input-edge-highlight', false);
-                    d3.selectAll('.input-highlight-selected')
-                        .classed('input-highlight-selected', false);
-                    // Extract currently selected node. Return if input tracing disabled or no
-                    // node is selected.
-                    var selectedNodeSelectorString = 'g.node.selected,g.op.selected';
-                    var node = d3.select(selectedNodeSelectorString);
-                    var currentNode = undefined;
-                    if (renderGraphInfo && renderGraphInfo.traceInputs && node && node[0] &&
-                        node[0][0]) {
-                        currentNode = node[0][0];
-                    }
-                    else {
-                        return;
-                    }
-                    var nodeName = currentNode.getAttribute('data-name');
-                    var opNodes = _getAllContainedOpNodes(nodeName, renderGraphInfo);
-                    var allTracedNodes = {};
-                    _.each(opNodes, function (nodeInstance) {
-                        allTracedNodes =
-                            traceAllInputsOfOpNode(renderGraphInfo, nodeInstance, allTracedNodes);
-                    });
-                    d3.selectAll(selectedNodeSelectorString).classed({
-                        // Remove the input-highlight from the selected node.
-                        'input-highlight': false,
-                        // Add input-highlight-selected class to selected node, which allows
-                        // treating the selected not as a special case of an input node.
-                        'input-highlight-selected': true
-                    });
-                    // Highlight all parent nodes of each OpNode as input parent to allow
-                    // specific highlighting.
-                    var highlightedNodes = Object.keys(allTracedNodes);
-                    var visibleNodes = _findVisibleParentsFromOpNodes(renderGraphInfo, highlightedNodes);
-                    _markParentsOfNodes(visibleNodes);
-                    // Attach class to all non-input nodes and edges for styling.
-                    d3.selectAll('g.node:not(.selected):not(.input-highlight)' +
-                        ':not(.input-parent):not(.input-children)')
-                        .classed('non-input', true)
-                        .each(function (d) {
-                        // Mark all nodes with the specified name as non-inputs. This
-                        // results in Annotation nodes which are attached to inputs to be
-                        // tagged as well.
-                        var nodeName = d.node.name;
-                        d3.selectAll("[data-name=\"" + nodeName + "\"]").classed('non-input', true);
-                    });
-                    d3.selectAll('g.edge:not(.input-edge-highlight)')
-                        .classed('non-input-edge-highlight', true);
-                }
-                node_1.traceInputs = traceInputs;
-                /**
-                 * Recursively find all op nodes contained by the node identified by the
-                 * provided name.
-                 * @param nodeName The meta or op node of which the OpNode instances are
-                 * required.
-                 * @param renderGraphInfo The rendered graph information object.
-                 * @returns {Array} An array of OpNodeImpl instances.
-                 */
-                function _getAllContainedOpNodes(nodeName, renderGraphInfo) {
-                    var opNodes = [];
-                    // Get current node.
-                    var node = renderGraphInfo.getNodeByName(nodeName);
-                    // If node is already OpNode then return the node plus its input embeddings.
-                    if (node instanceof tf.graph.OpNodeImpl) {
-                        return [node].concat(node.inEmbeddings);
-                    }
-                    // Otherwise, make recursive call for each node contained by the GroupNode.
-                    var childNodeNames = node.metagraph.nodes();
-                    _.each(childNodeNames, function (childNodeName) {
-                        opNodes =
-                            opNodes.concat(_getAllContainedOpNodes(childNodeName, renderGraphInfo));
-                    });
-                    return opNodes;
-                }
-                node_1._getAllContainedOpNodes = _getAllContainedOpNodes;
-                function traceAllInputsOfOpNode(renderGraphInfo, startNode, allTracedNodes) {
-                    // To prevent infinite loops due to cyclical relationships and improving
-                    // performance by tracing OpNode which is input to 2+ nodes only once.
-                    if (allTracedNodes[startNode.name]) {
-                        return allTracedNodes;
-                    }
-                    else {
-                        allTracedNodes[startNode.name] = true;
-                    }
-                    // Extract the inputs.
-                    var inputs = startNode.inputs;
-                    // Get visible parent.
-                    var currentVisibleParent = getVisibleParent(renderGraphInfo, startNode);
-                    // Mark as input node.
-                    d3.select(".node[data-name=\"" + currentVisibleParent.name + "\"]")
-                        .classed('input-highlight', true);
-                    // Find the visible parent of each input.
-                    var visibleInputs = {};
-                    _.each(inputs, function (nodeInstance) {
-                        var resolvedNode = renderGraphInfo.getNodeByName(nodeInstance.name);
-                        if (resolvedNode === undefined) {
-                            // Node could not be found in rendered Hierarchy, which happens when
-                            // tracing inputs of a SummaryNode.
-                            return;
-                        }
-                        // Ensure node is resolved to OpNode if name collision with Metanode exists.
-                        if (resolvedNode instanceof graph.MetanodeImpl) {
-                            var resolvedNodeName = tf.graph.getStrictName(resolvedNode.name);
-                            resolvedNode = renderGraphInfo.getNodeByName(resolvedNodeName);
-                        }
-                        var visibleParent = getVisibleParent(renderGraphInfo, resolvedNode);
-                        // Append OpNode to visible parent entry.
-                        var visibleInputsEntry = visibleInputs[visibleParent.name];
-                        if (visibleInputsEntry) {
-                            visibleInputsEntry.opNodes.push(resolvedNode);
-                        }
-                        else {
-                            visibleInputs[visibleParent.name] = {
-                                visibleParent: visibleParent,
-                                opNodes: [resolvedNode]
-                            };
-                        }
-                    });
-                    // Find all parents of the start node.
-                    var startNodeParents = {};
-                    var indexedStartNodeParents = [currentVisibleParent];
-                    startNodeParents[currentVisibleParent.name] = {
-                        traced: false,
-                        index: 0,
-                        connectionEndpoints: []
-                    };
-                    var currentNode = currentVisibleParent;
-                    for (var index = 1; currentNode.name !== tf.graph.ROOT_NAME; index++) {
-                        currentNode = currentNode.parentNode;
-                        startNodeParents[currentNode.name] = {
-                            traced: false,
-                            index: index,
-                            connectionEndpoints: []
-                        };
-                        indexedStartNodeParents[index] = currentNode;
-                    }
-                    // Find first mutual parent of each input node and highlight connection.
-                    _.forOwn(visibleInputs, function (visibleParentInfo, key) {
-                        var nodeInstance = visibleParentInfo.visibleParent;
-                        // Make recursive call for each input-OpNode contained by the visible
-                        // parent.
-                        _.each(visibleParentInfo.opNodes, function (opNode) {
-                            allTracedNodes =
-                                traceAllInputsOfOpNode(renderGraphInfo, opNode, allTracedNodes);
-                        });
-                        if (nodeInstance.name !== currentVisibleParent.name) {
-                            _createVisibleTrace(nodeInstance, startNodeParents, indexedStartNodeParents);
-                        }
-                    });
-                    return allTracedNodes;
-                }
-                node_1.traceAllInputsOfOpNode = traceAllInputsOfOpNode;
-                /**
-                 * Colors the edges to connect the passed node to the start node. This is
-                 * done by:
-                 *
-                 * a) Finding the first (visible) common parent in the rendered
-                 * hierarchy.
-                 * NB: There are 2 types of connections:
-                 * 1) Direct connections between node A
-                 * and B, marked below as II,
-                 * 2) Connections from any node A to its parent, A'. Marked below as I and III.
-                 * For type 2 connection you need to know the inner-nested node, the
-                 * direct parent, and the ultimate destination of the connection.
-                 *
-                 *  A_parent      B_parent
-                 * +--------+    +---------+
-                 * |        |    |         |
-                 * |  +--+ I| II |III+--+  |
-                 * |  |A +----------\x3e+B |  |
-                 * |  +--+  |    |   +--+  |
-                 * |        |    |         |
-                 * +--------+    +---------+
-                 *
-                 *
-                 * b) Highlighting the direct connection between the parents of A and B,
-                 * called A_parent and B_parent, s.t. A_parent and B_parent are children of the
-                 * mutual parent of A and B found in a), marked above as II.
-                 *
-                 * c) Highlighting the connection from A to A_parent and B to B_parent
-                 * (through all layers of parents between A and A_parent and B and B_parent,
-                 * respectively). Marked above as I and III.
-                 *
-                 * @param nodeInstance The instance of the node to use as destination node, B.
-                 * @param startNodeParents Map of startNodeParent names to information objects
-                 * about the parent.
-                 * @param indexedStartNodeParents An array of all parents of the start node.
-                 * This is required to find the child of the mutual parent which is a parent
-                 * of the start node.
-                 * @private
-                 */
-                function _createVisibleTrace(nodeInstance, startNodeParents, indexedStartNodeParents) {
-                    var currentNode = nodeInstance;
-                    var previousNode = nodeInstance;
-                    // Ascend through parents until a mutual parent is found with the start
-                    // node.
-                    var destinationParentPairs = [];
-                    while (!startNodeParents[currentNode.name]) {
-                        if (previousNode.name !== currentNode.name) {
-                            destinationParentPairs.push([previousNode, currentNode]);
-                        }
-                        previousNode = currentNode;
-                        currentNode = currentNode.parentNode;
-                    }
-                    // Connection between nodes is drawn between the parents of each
-                    // respective node, both of which share the mutual parent.
-                    var startNodeIndex = startNodeParents[currentNode.name].index;
-                    var startNodeName = indexedStartNodeParents[Math.max(startNodeIndex - 1, 0)].name;
-                    var startNodeTopParentName = startNodeName;
-                    var targetNodeTopParentName = previousNode.name;
-                    var endNodeName = previousNode.name;
-                    d3.selectAll("[data-edge=\"" + endNodeName + "--" + startNodeName + "\"]")
-                        .classed('input-edge-highlight', true);
-                    // Trace up the parents of the input.
-                    _.each(destinationParentPairs, function (value) {
-                        var inner = value[0];
-                        var outer = value[1];
-                        var edgeSelector = "[data-edge=\"" + inner.name + "--" + startNodeTopParentName +
-                            ("~~" + outer.name + "~~OUT\"]");
-                        d3.selectAll(edgeSelector).classed('input-edge-highlight', true);
-                    });
-                    // Trace up the parents of the start node.
-                    for (var index = 1; index < startNodeIndex; index++) {
-                        var inner = indexedStartNodeParents[index - 1];
-                        var outer = indexedStartNodeParents[index];
-                        var edgeSelector = "[data-edge=\"" + targetNodeTopParentName + "~~" + outer.name +
-                            ("~~IN--" + inner.name + "\"]");
-                        d3.selectAll(edgeSelector).classed('input-edge-highlight', true);
-                    }
-                }
-                /**
-                 * Creates map { [name: string] -> Node } of all visible / rendered parents
-                 * of the nodes identified by the node names passed in.
-                 *
-                 * @param renderGraphInfo The information on the rendered graph.
-                 * @param nodeNames String array of node names.
-                 * @returns {[nodeName: string]: Node}
-                 * @private
-                 */
-                function _findVisibleParentsFromOpNodes(renderGraphInfo, nodeNames) {
-                    var visibleParents = {};
-                    _.each(nodeNames, function (nodeName) {
-                        var currentNode = renderGraphInfo.getNodeByName(nodeName);
-                        var visibleParent = getVisibleParent(renderGraphInfo, currentNode);
-                        visibleParents[visibleParent.name] = visibleParent;
-                    });
-                    return visibleParents;
-                }
-                /**
-                 * Traverse through the parents of all nodes in the list and mark each
-                 * encountered node as input-parent.
-                 * @param visibleNodes Map of input nodes, have to be visible/rendered when
-                 * called.
-                 * @private
-                 */
-                function _markParentsOfNodes(visibleNodes) {
-                    _.forOwn(visibleNodes, function (nodeInstance) {
-                        // Mark all parents of the node as input-parents.
-                        var currentNode = nodeInstance;
-                        while (currentNode.name !== tf.graph.ROOT_NAME) {
-                            var renderedElement = d3.select(".node[data-name=\"" + currentNode.name + "\"]");
-                            // Only mark the element as a parent node to an input if it is not
-                            // marked as input node itself.
-                            if (renderedElement[0][0] &&
-                                !renderedElement.classed('input-highlight') &&
-                                !renderedElement.classed('selected') &&
-                                // OpNode only parent if start node is embedded node, in which case
-                                // the OpNode should be faded as well.
-                                !renderedElement.classed('op')) {
-                                renderedElement.classed('input-parent', true);
-                            }
-                            currentNode = currentNode.parentNode;
-                        }
-                    });
-                }
-                /**
-                 * Find the parent of the passed in op node which is expanded. This is done
-                 * by going through all parents until the parent's parent is expanded, thus
-                 * finding the first unexpanded parent which is rendered on the screen.
-                 * @param renderGraphInfo The graph info object used to gain access to the
-                 * render info of the parents.
-                 * @param currentNode The node whose parent is to be found.
-                 * @returns Node
-                 */
-                function getVisibleParent(renderGraphInfo, currentNode) {
-                    var found = false;
-                    var currentParent = currentNode;
-                    while (!found) {
-                        // Get parent element, to extract name.
-                        currentNode = currentParent;
-                        currentParent = currentNode.parentNode;
-                        if (currentParent === undefined) {
-                            found = true;
-                        }
-                        else {
-                            var renderNode = renderGraphInfo.getRenderNodeByName(currentParent.name);
-                            // Found if node is rendered on the screen (renderNode truthy), and
-                            // the parent is either expanded (i.e. it is a metanode or seriesnode)
-                            // or the parent is an OpNode in which case currentNode is an embedded
-                            // node which has another OpNode as parent.
-                            if (renderNode &&
-                                (renderNode.expanded || currentParent instanceof graph.OpNodeImpl)) {
-                                found = true;
-                            }
-                        }
-                    } // Close while loop.
-                    return currentNode;
-                }
-                node_1.getVisibleParent = getVisibleParent;
-            })(node = scene.node || (scene.node = {}));
-        })(scene = graph.scene || (graph.scene = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // Close module.
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph) {
-        var scene;
-        (function (scene) {
-            var svgNamespace = 'http://www.w3.org/2000/svg';
-            /** Enums element class of objects in the scene */
-            scene.Class = {
-                Node: {
-                    // <g> element that contains nodes.
-                    CONTAINER: 'nodes',
-                    // <g> element that contains detail about a node.
-                    GROUP: 'node',
-                    // <g> element that contains visual elements (like rect, ellipse).
-                    SHAPE: 'nodeshape',
-                    // <*> element(s) under SHAPE that should receive color updates.
-                    COLOR_TARGET: 'nodecolortarget',
-                    // <text> element showing the node's label.
-                    LABEL: 'nodelabel',
-                    // <g> element that contains all visuals for the expand/collapse
-                    // button for expandable group nodes.
-                    BUTTON_CONTAINER: 'buttoncontainer',
-                    // <circle> element that surrounds expand/collapse buttons.
-                    BUTTON_CIRCLE: 'buttoncircle',
-                    // <path> element of the expand button.
-                    EXPAND_BUTTON: 'expandbutton',
-                    // <path> element of the collapse button.
-                    COLLAPSE_BUTTON: 'collapsebutton'
-                },
-                Edge: {
-                    CONTAINER: 'edges',
-                    GROUP: 'edge',
-                    LINE: 'edgeline',
-                    REF_LINE: 'refline',
-                    STRUCTURAL: 'structural'
-                },
-                Annotation: {
-                    OUTBOX: 'out-annotations',
-                    INBOX: 'in-annotations',
-                    GROUP: 'annotation',
-                    NODE: 'annotation-node',
-                    EDGE: 'annotation-edge',
-                    CONTROL_EDGE: 'annotation-control-edge',
-                    LABEL: 'annotation-label',
-                    ELLIPSIS: 'annotation-ellipsis'
-                },
-                Scene: {
-                    GROUP: 'scene',
-                    CORE: 'core',
-                    INEXTRACT: 'in-extract',
-                    OUTEXTRACT: 'out-extract'
-                },
-                Subscene: { GROUP: 'subscene' },
-                OPNODE: 'op',
-                METANODE: 'meta',
-                SERIESNODE: 'series',
-                BRIDGENODE: 'bridge',
-                ELLIPSISNODE: 'ellipsis'
-            };
-            ;
-            ;
-            scene.healthPillEntries = [
-                {
-                    background_color: '#CC2F2C',
-                    label: 'NaN',
-                },
-                {
-                    background_color: '#FF8D00',
-                    label: '- ∞',
-                },
-                {
-                    background_color: '#EAEAEA',
-                    label: '-',
-                },
-                {
-                    background_color: '#A5A5A5',
-                    label: '0',
-                },
-                {
-                    background_color: '#262626',
-                    label: '+',
-                },
-                {
-                    background_color: '#003ED4',
-                    label: '+ ∞',
-                },
-            ];
-            /**
-             * Helper method for fitting the graph in the svg view.
-             *
-             * @param svg The main svg.
-             * @param zoomG The svg group used for panning and zooming.
-             * @param d3zoom The zoom behavior.
-             * @param callback Called when the fitting is done.
-             */
-            function fit(svg, zoomG, d3zoom, callback) {
-                var svgRect = svg.getBoundingClientRect();
-                var sceneSize = null;
-                try {
-                    sceneSize = zoomG.getBBox();
-                    if (sceneSize.width === 0) {
-                        // There is no scene anymore. We have been detached from the dom.
-                        return;
-                    }
-                }
-                catch (e) {
-                    // Firefox produced NS_ERROR_FAILURE if we have been
-                    // detached from the dom.
-                    return;
-                }
-                var scale = 0.9 *
-                    Math.min(svgRect.width / sceneSize.width, svgRect.height / sceneSize.height, 2);
-                var params = graph.layout.PARAMS.graph;
-                var zoomEvent = d3zoom.scale(scale)
-                    .on('zoomend.fitted', function () {
-                    // Remove the listener for the zoomend event,
-                    // so we don't get called at the end of regular zoom events,
-                    // just those that fit the graph to screen.
-                    d3zoom.on('zoomend.fitted', null);
-                    callback();
-                })
-                    .translate([params.padding.paddingLeft, params.padding.paddingTop])
-                    .event;
-                d3.select(zoomG).transition().duration(500).call(zoomEvent);
-            }
-            scene.fit = fit;
-            ;
-            /**
-             * Helper method for panning the graph to center on the provided node,
-             * if the node is currently off-screen.
-             *
-             * @param nodeName The node to center the graph on
-             * @param svg The root SVG element for the graph
-             * @param zoomG The svg group used for panning and zooming.
-             * @param d3zoom The zoom behavior.
-             * @return True if the graph had to be panned to display the
-             *            provided node.
-             */
-            function panToNode(nodeName, svg, zoomG, d3zoom) {
-                var node = d3
-                    .select('[data-name="' + nodeName + '"].' + scene.Class.Node.GROUP)
-                    .node();
-                if (!node) {
-                    return false;
-                }
-                var translate = d3zoom.translate();
-                // Check if the selected node is off-screen in either
-                // X or Y dimension in either direction.
-                var nodeBox = node.getBBox();
-                var nodeCtm = node.getScreenCTM();
-                var pointTL = svg.createSVGPoint();
-                var pointBR = svg.createSVGPoint();
-                pointTL.x = nodeBox.x;
-                pointTL.y = nodeBox.y;
-                pointBR.x = nodeBox.x + nodeBox.width;
-                pointBR.y = nodeBox.y + nodeBox.height;
-                pointTL = pointTL.matrixTransform(nodeCtm);
-                pointBR = pointBR.matrixTransform(nodeCtm);
-                var isOutsideOfBounds = function (start, end, bound) {
-                    return end < 0 || start > bound;
-                };
-                var svgRect = svg.getBoundingClientRect();
-                if (isOutsideOfBounds(pointTL.x, pointBR.x, svgRect.width) ||
-                    isOutsideOfBounds(pointTL.y, pointBR.y, svgRect.height)) {
-                    // Determine the amount to transform the graph in both X and Y
-                    // dimensions in order to center the selected node. This takes into
-                    // acount the position of the node, the size of the svg scene, the
-                    // amount the scene has been scaled by through zooming, and any previous
-                    // transform already performed by this logic.
-                    var centerX = (pointTL.x + pointBR.x) / 2;
-                    var centerY = (pointTL.y + pointBR.y) / 2;
-                    var dx = ((svgRect.width / 2) - centerX);
-                    var dy = ((svgRect.height / 2) - centerY);
-                    var zoomEvent = d3zoom.translate([translate[0] + dx, translate[1] + dy])
-                        .event;
-                    d3.select(zoomG).transition().duration(500).call(zoomEvent);
-                    return true;
-                }
-                return false;
-            }
-            scene.panToNode = panToNode;
-            ;
-            /**
-             * Given a container d3 selection, select a child svg element of a given tag
-             * and class if exists or append / insert one otherwise.  If multiple children
-             * matches the tag and class name, returns only the first one.
-             *
-             * @param container
-             * @param tagName tag name.
-             * @param className (optional) Class name or a list of class names.
-             * @param before (optional) reference DOM node for insertion.
-             * @return selection of the element
-             */
-            function selectOrCreateChild(container, tagName, className, before) {
-                var child = selectChild(container, tagName, className);
-                if (!child.empty()) {
-                    return child;
-                }
-                var newElement = document.createElementNS('http://www.w3.org/2000/svg', tagName);
-                if (className instanceof Array) {
-                    for (var i = 0; i < className.length; i++) {
-                        newElement.classList.add(className[i]);
-                    }
-                }
-                else {
-                    newElement.classList.add(className);
-                }
-                if (before) {
-                    container.node().insertBefore(newElement, before);
-                }
-                else {
-                    container.node().appendChild(newElement);
-                }
-                return d3.select(newElement)
-                    .datum(container.datum());
-            }
-            scene.selectOrCreateChild = selectOrCreateChild;
-            ;
-            /**
-             * Given a container d3 selection, select a child element of a given tag and
-             * class. If multiple children matches the tag and class name, returns only
-             * the first one.
-             *
-             * @param container
-             * @param tagName tag name.
-             * @param className (optional) Class name or list of class names.
-             * @return selection of the element, or an empty selection
-             */
-            function selectChild(container, tagName, className) {
-                var children = container.node().childNodes;
-                for (var i = 0; i < children.length; i++) {
-                    var child = children[i];
-                    if (child.tagName === tagName) {
-                        if (className instanceof Array) {
-                            var hasAllClasses = true;
-                            for (var j = 0; j < className.length; j++) {
-                                hasAllClasses =
-                                    hasAllClasses && child.classList.contains(className[j]);
-                            }
-                            if (hasAllClasses) {
-                                return d3.select(child);
-                            }
-                        }
-                        else if ((!className || child.classList.contains(className))) {
-                            return d3.select(child);
-                        }
-                    }
-                }
-                return d3.select(null);
-            }
-            scene.selectChild = selectChild;
-            ;
-            /**
-             * Select or create a sceneGroup and build/update its nodes and edges.
-             *
-             * Structure Pattern:
-             *
-             * <g class='scene'>
-             *   <g class='core'>
-             *     <g class='edges'>
-             *       ... stuff from tf.graph.scene.edges.build ...
-             *     </g>
-             *     <g class='nodes'>
-             *       ... stuff from tf.graph.scene.nodes.build ...
-             *     </g>
-             *   </g>
-             *   <g class='in-extract'>
-             *     <g class='nodes'>
-             *       ... stuff from tf.graph.scene.nodes.build ...
-             *     </g>
-             *   </g>
-             *   <g class='out-extract'>
-             *     <g class='nodes'>
-             *       ... stuff from tf.graph.scene.nodes.build ...
-             *     </g>
-             *   </g>
-             * </g>
-             *
-             * @param container D3 selection of the parent.
-             * @param renderNode render node of a metanode or series node.
-             * @param sceneElement <tf-graph-scene> polymer element.
-             * @param sceneClass class attribute of the scene (default='scene').
-             */
-            function buildGroup(container, renderNode, sceneElement, sceneClass) {
-                sceneClass = sceneClass || scene.Class.Scene.GROUP;
-                var isNewSceneGroup = selectChild(container, 'g', sceneClass).empty();
-                var sceneGroup = selectOrCreateChild(container, 'g', sceneClass);
-                // core
-                var coreGroup = selectOrCreateChild(sceneGroup, 'g', scene.Class.Scene.CORE);
-                var coreNodes = _.reduce(renderNode.coreGraph.nodes(), function (nodes, name) {
-                    var node = renderNode.coreGraph.node(name);
-                    if (!node.excluded) {
-                        nodes.push(node);
-                    }
-                    return nodes;
-                }, []);
-                if (renderNode.node.type === graph.NodeType.SERIES) {
-                    // For series, we want the first item on top, so reverse the array so
-                    // the first item in the series becomes last item in the top, and thus
-                    // is rendered on the top.
-                    coreNodes.reverse();
-                }
-                // Create the layer of edges for this scene (paths).
-                scene.edge.buildGroup(coreGroup, renderNode.coreGraph, sceneElement);
-                // Create the layer of nodes for this scene (ellipses, rects etc).
-                scene.node.buildGroup(coreGroup, coreNodes, sceneElement);
-                // In-extract
-                if (renderNode.isolatedInExtract.length > 0) {
-                    var inExtractGroup = selectOrCreateChild(sceneGroup, 'g', scene.Class.Scene.INEXTRACT);
-                    scene.node.buildGroup(inExtractGroup, renderNode.isolatedInExtract, sceneElement);
-                }
-                else {
-                    selectChild(sceneGroup, 'g', scene.Class.Scene.INEXTRACT).remove();
-                }
-                // Out-extract
-                if (renderNode.isolatedOutExtract.length > 0) {
-                    var outExtractGroup = selectOrCreateChild(sceneGroup, 'g', scene.Class.Scene.OUTEXTRACT);
-                    scene.node.buildGroup(outExtractGroup, renderNode.isolatedOutExtract, sceneElement);
-                }
-                else {
-                    selectChild(sceneGroup, 'g', scene.Class.Scene.OUTEXTRACT).remove();
-                }
-                position(sceneGroup, renderNode);
-                // Fade in the scene group if it didn't already exist.
-                if (isNewSceneGroup) {
-                    sceneGroup.attr('opacity', 0).transition().attr('opacity', 1);
-                }
-                return sceneGroup;
-            }
-            scene.buildGroup = buildGroup;
-            ;
-            /**
-             * Given a scene's svg group, set  g.in-extract, g.coreGraph, g.out-extract svg
-             * groups' position relative to the scene.
-             *
-             * @param sceneGroup
-             * @param renderNode render node of a metanode or series node.
-             */
-            function position(sceneGroup, renderNode) {
-                // Translate scenes down by the label height so that when showing graphs in
-                // expanded metanodes, the graphs are below the labels.  Do not shift them
-                // down for series nodes as series nodes don't have labels inside of their
-                // bounding boxes.
-                var yTranslate = renderNode.node.type === graph.NodeType.SERIES ?
-                    0 : graph.layout.PARAMS.subscene.meta.labelHeight;
-                // core
-                translate(selectChild(sceneGroup, 'g', scene.Class.Scene.CORE), 0, yTranslate);
-                // in-extract
-                var hasInExtract = renderNode.isolatedInExtract.length > 0;
-                var hasOutExtract = renderNode.isolatedOutExtract.length > 0;
-                if (hasInExtract) {
-                    var offset = graph.layout.PARAMS.subscene.meta.extractXOffset;
-                    var inExtractX = renderNode.coreBox.width -
-                        renderNode.inExtractBox.width / 2 - renderNode.outExtractBox.width -
-                        (hasOutExtract ? offset : 0);
-                    translate(selectChild(sceneGroup, 'g', scene.Class.Scene.INEXTRACT), inExtractX, yTranslate);
-                }
-                // out-extract
-                if (hasOutExtract) {
-                    var outExtractX = renderNode.coreBox.width -
-                        renderNode.outExtractBox.width / 2;
-                    translate(selectChild(sceneGroup, 'g', scene.Class.Scene.OUTEXTRACT), outExtractX, yTranslate);
-                }
-            }
-            ;
-            /** Adds a click listener to a group that fires a graph-select event */
-            function addGraphClickListener(graphGroup, sceneElement) {
-                d3.select(graphGroup).on('click', function () {
-                    sceneElement.fire('graph-select');
-                });
-            }
-            scene.addGraphClickListener = addGraphClickListener;
-            ;
-            /** Helper for adding transform: translate(x0, y0) */
-            function translate(selection, x0, y0) {
-                // If it is already placed on the screen, make it a transition.
-                if (selection.attr('transform') != null) {
-                    selection = selection.transition('position');
-                }
-                selection.attr('transform', 'translate(' + x0 + ',' + y0 + ')');
-            }
-            scene.translate = translate;
-            ;
-            /**
-             * Helper for setting position of a svg rect
-             * @param rect rect to set position of.
-             * @param cx Center x.
-             * @param cy Center x.
-             * @param width Width to set.
-             * @param height Height to set.
-             */
-            function positionRect(rect, cx, cy, width, height) {
-                rect.transition().attr({
-                    x: cx - width / 2,
-                    y: cy - height / 2,
-                    width: width,
-                    height: height
-                });
-            }
-            scene.positionRect = positionRect;
-            ;
-            /**
-             * Helper for setting position of a svg expand/collapse button
-             * @param button container group
-             * @param renderNode the render node of the group node to position
-             *        the button on.
-             */
-            function positionButton(button, renderNode) {
-                var cx = graph.layout.computeCXPositionOfNodeShape(renderNode);
-                // Position the button in the top-right corner of the group node,
-                // with space given the draw the button inside of the corner.
-                var width = renderNode.expanded ?
-                    renderNode.width : renderNode.coreBox.width;
-                var height = renderNode.expanded ?
-                    renderNode.height : renderNode.coreBox.height;
-                var x = cx + width / 2 - 6;
-                var y = renderNode.y - height / 2 + 6;
-                // For unexpanded series nodes, the button has special placement due
-                // to the unique visuals of this group node.
-                if (renderNode.node.type === graph.NodeType.SERIES && !renderNode.expanded) {
-                    x += 10;
-                    y -= 2;
-                }
-                var translateStr = 'translate(' + x + ',' + y + ')';
-                button.selectAll('path').transition().attr('transform', translateStr);
-                button.select('circle').transition().attr({ cx: x, cy: y, r: graph.layout.PARAMS.nodeSize.meta.expandButtonRadius });
-            }
-            scene.positionButton = positionButton;
-            ;
-            /**
-             * Helper for setting position of a svg ellipse
-             * @param ellipse ellipse to set position of.
-             * @param cx Center x.
-             * @param cy Center x.
-             * @param width Width to set.
-             * @param height Height to set.
-             */
-            function positionEllipse(ellipse, cx, cy, width, height) {
-                ellipse.transition().attr({
-                    cx: cx,
-                    cy: cy,
-                    rx: width / 2,
-                    ry: height / 2
-                });
-            }
-            scene.positionEllipse = positionEllipse;
-            ;
-            /**
-             * @param {number} stat A stat for a health pill (such as mean or variance).
-             * @param {boolean} shouldRoundOnesDigit Whether to round this number to the
-             *     ones digit. Useful for say int, uint, and bool output types.
-             * @return {string} A human-friendly string representation of that stat.
-             */
-            function humanizeHealthPillStat(stat, shouldRoundOnesDigit) {
-                if (shouldRoundOnesDigit) {
-                    return stat.toFixed(0);
-                }
-                if (Math.abs(stat) >= 1) {
-                    return stat.toFixed(1);
-                }
-                return stat.toExponential(1);
-            }
-            scene.humanizeHealthPillStat = humanizeHealthPillStat;
-            /**
-             * Renders a health pill for an op atop a node.
-             */
-            function _addHealthPill(nodeGroupElement, healthPill, nodeInfo) {
-                // Check if text already exists at location.
-                d3.select(nodeGroupElement.parentNode).selectAll('.health-pill').remove();
-                if (!nodeInfo || !healthPill) {
-                    return;
-                }
-                var lastHealthPillData = healthPill.value;
-                // For now, we only visualize the 6 values that summarize counts of tensor
-                // elements of various categories: -Inf, negative, 0, positive, Inf, and NaN.
-                var lastHealthPillOverview = lastHealthPillData.slice(2, 8);
-                var totalCount = lastHealthPillData[1];
-                var healthPillWidth = 60;
-                var healthPillHeight = 10;
-                if (nodeInfo.node.type === tf.graph.NodeType.OP) {
-                    // Use a smaller health pill for op nodes (rendered as smaller ellipses).
-                    healthPillWidth /= 2;
-                    healthPillHeight /= 2;
-                }
-                var healthPillGroup = document.createElementNS(svgNamespace, 'g');
-                healthPillGroup.classList.add('health-pill');
-                // Define the gradient for the health pill.
-                var healthPillDefs = document.createElementNS(svgNamespace, 'defs');
-                healthPillGroup.appendChild(healthPillDefs);
-                var healthPillGradient = document.createElementNS(svgNamespace, 'linearGradient');
-                var healthPillGradientId = 'health-pill-gradient';
-                healthPillGradient.setAttribute('id', healthPillGradientId);
-                var titleOnHoverTextEntries = [];
-                var cumulativeCount = 0;
-                var previousOffset = '0%';
-                for (var i = 0; i < lastHealthPillOverview.length; i++) {
-                    if (!lastHealthPillOverview[i]) {
-                        // Exclude empty categories.
-                        continue;
-                    }
-                    cumulativeCount += lastHealthPillOverview[i];
-                    // Create a color interval using 2 stop elements.
-                    var stopElement0 = document.createElementNS(svgNamespace, 'stop');
-                    stopElement0.setAttribute('offset', previousOffset);
-                    stopElement0.setAttribute('stop-color', scene.healthPillEntries[i].background_color);
-                    healthPillGradient.appendChild(stopElement0);
-                    var stopElement1 = document.createElementNS(svgNamespace, 'stop');
-                    var percent = (cumulativeCount * 100 / totalCount) + '%';
-                    stopElement1.setAttribute('offset', percent);
-                    stopElement1.setAttribute('stop-color', scene.healthPillEntries[i].background_color);
-                    healthPillGradient.appendChild(stopElement1);
-                    previousOffset = percent;
-                    // Include this number in the title that appears on hover.
-                    titleOnHoverTextEntries.push(scene.healthPillEntries[i].label + ': ' + lastHealthPillOverview[i]);
-                }
-                healthPillDefs.appendChild(healthPillGradient);
-                // Create the rectangle for the health pill.
-                var rect = document.createElementNS(svgNamespace, 'rect');
-                rect.setAttribute('fill', 'url(#' + healthPillGradientId + ')');
-                rect.setAttribute('width', String(healthPillWidth));
-                rect.setAttribute('height', String(healthPillHeight));
-                healthPillGroup.appendChild(rect);
-                // Show a title with specific counts on hover.
-                var titleSvg = document.createElementNS(svgNamespace, 'title');
-                titleSvg.textContent = titleOnHoverTextEntries.join(', ');
-                healthPillGroup.appendChild(titleSvg);
-                // Center this health pill just right above the node for the op.
-                var healthPillX = nodeInfo.x - healthPillWidth / 2;
-                var healthPillY = nodeInfo.y - healthPillHeight - nodeInfo.height / 2 - 2;
-                if (nodeInfo.labelOffset < 0) {
-                    // The label is positioned above the node. Do not occlude the label.
-                    healthPillY += nodeInfo.labelOffset;
-                }
-                if (lastHealthPillOverview[2] || lastHealthPillOverview[3] ||
-                    lastHealthPillOverview[4]) {
-                    // At least 1 "non-Inf and non-NaN" value exists (a -, 0, or + value). Show
-                    // stats on tensor values.
-                    // Determine if we should display the output range as integers.
-                    var shouldRoundOnesDigit = false;
-                    var node_1 = nodeInfo.node;
-                    var attributes = node_1.attr;
-                    if (attributes && attributes.length) {
-                        // Find the attribute for output type if there is one.
-                        for (var i = 0; i < attributes.length; i++) {
-                            if (attributes[i].key === 'T') {
-                                // Note whether the output type is an integer.
-                                var outputType = attributes[i].value['type'];
-                                shouldRoundOnesDigit =
-                                    outputType && /^DT_(BOOL|INT|UINT)/.test(outputType);
-                                break;
-                            }
-                        }
-                    }
-                    var statsSvg = document.createElementNS(svgNamespace, 'text');
-                    var minString = humanizeHealthPillStat(lastHealthPillData[8], shouldRoundOnesDigit);
-                    var maxString = humanizeHealthPillStat(lastHealthPillData[9], shouldRoundOnesDigit);
-                    statsSvg.textContent = minString + ' ~ ' + maxString;
-                    statsSvg.classList.add('health-pill-stats');
-                    statsSvg.setAttribute('x', String(healthPillWidth / 2));
-                    statsSvg.setAttribute('y', '-2');
-                    healthPillGroup.appendChild(statsSvg);
-                }
-                healthPillGroup.setAttribute('transform', 'translate(' + healthPillX + ', ' + healthPillY + ')');
-                Polymer.dom(nodeGroupElement.parentNode).appendChild(healthPillGroup);
-            }
-            /**
-             * Adds health pills (which visualize tensor summaries) to a graph group.
-             * @param svgRoot The root SVG element of the graph to add heath pills to.
-             * @param nodeNamesToHealthPills An object mapping node name to health pill.
-             * @param colors A list of colors to use.
-             */
-            function addHealthPills(svgRoot, nodeNamesToHealthPills, healthPillStepIndex) {
-                if (!nodeNamesToHealthPills) {
-                    // No health pill information available.
-                    return;
-                }
-                var svgRootSelection = d3.select(svgRoot);
-                svgRootSelection.selectAll('g.nodeshape')
-                    .each(function (nodeInfo) {
-                    // Only show health pill data for this node if it is available.
-                    var healthPills = nodeNamesToHealthPills[nodeInfo.node.name];
-                    var healthPill = healthPills ? healthPills[healthPillStepIndex] : null;
-                    _addHealthPill(this, healthPill, nodeInfo);
-                });
-            }
-            scene.addHealthPills = addHealthPills;
-            ;
-        })(scene = graph.scene || (graph.scene = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {})); // close module
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph_1) {
-        var template;
-        (function (template) {
-            /**
-             * Detect repeating patterns of subgraphs.
-             * Assign templateId to each subgraph if it belongs to a template.
-             * Returns clusters of similar subgraphs .
-             *
-             * @param graph
-             * @param verifyTemplate whether to run the template verification algorithm
-             * @return a dict (template id => Array of node names)
-             */
-            function detect(h, verifyTemplate) {
-                // In any particular subgraph, there are either
-                // - leaf nodes (which do not have subgraph)
-                // - metanode nodes - some of them have only one member (singular metanode)
-                //                    and some have multiple members (non-singular metanode)
-                // First, generate a nearest neighbor hash of metanode nodes.
-                var nnGroups = clusterSimilarSubgraphs(h);
-                // For each metanode, compare its subgraph (starting from shallower groups)
-                // and assign template id.
-                var templates = groupTemplateAndAssignId(nnGroups, verifyTemplate);
-                // Sort the templates by minimum level in the graph at which they appear,
-                // as this leads to optimal setting of the colors of each template for
-                // maximum differentiation.
-                return _(templates)
-                    .pairs()
-                    .sortBy(function (pair) {
-                    return pair[1].level;
-                })
-                    .map(function (pair) {
-                    return [pair[0], pair[1].nodes];
-                })
-                    .object()
-                    .value();
-            }
-            template.detect = detect;
-            ;
-            /**
-             * @return Unique string for a metanode based on depth, |V|, |E| and
-             * op type histogram.
-             */
-            function getSignature(metanode) {
-                // depth=<number> |V|=<number> |E|=<number>
-                var props = _.map({
-                    'depth': metanode.depth,
-                    '|V|': metanode.metagraph.nodes().length,
-                    '|E|': metanode.metagraph.edges().length
-                }, function (v, k) { return k + '=' + v; })
-                    .join(' ');
-                // optype1=count1,optype2=count2
-                var ops = _.map(metanode.opHistogram, function (count, op) {
-                    return op + '=' + count;
-                }).join(',');
-                return props + ' [ops] ' + ops;
-            }
-            /**
-             * Generate a nearest neighbor hash of metanodes
-             * based on depth, |V|, |E|, and opHistogram of their subgraph
-             * (excluding leaf nodes and singular metanodes).
-             * @param graph The graph
-             * @return Array of pairs of [signature,
-             *   Object with min level of the template and an Array of tf.graph.Group]
-             *   sort by ascending order of minimum depth at which metanode appears.
-             */
-            function clusterSimilarSubgraphs(h) {
-                /** a dict from metanode.signature() => Array of tf.graph.Groups */
-                var hashDict = _(h.getNodeMap()).reduce(function (hash, node, name) {
-                    if (node.type !== graph_1.NodeType.META) {
-                        return hash;
-                    }
-                    var levelOfMetaNode = name.split('/').length - 1;
-                    var signature = getSignature(node);
-                    var templateInfo = hash[signature] ||
-                        { nodes: [], level: levelOfMetaNode };
-                    hash[signature] = templateInfo;
-                    templateInfo.nodes.push(node);
-                    if (templateInfo.level > levelOfMetaNode) {
-                        templateInfo.level = levelOfMetaNode;
-                    }
-                    return hash;
-                }, {});
-                return _(hashDict)
-                    .pairs()
-                    .filter(function (pair) {
-                    return pair[1].nodes.length > 1;
-                })
-                    .sortBy(function (pair) {
-                    // sort by depth
-                    // (all members in the same nnGroup has equal depth)
-                    return pair[1].nodes[0].depth;
-                })
-                    .value();
-            }
-            function groupTemplateAndAssignId(nnGroups, verifyTemplate) {
-                // For each metanode, compare its subgraph (starting from shallower groups)
-                // and assign template id.
-                var result = {};
-                return _.reduce(nnGroups, function (templates, nnGroupPair) {
-                    var signature = nnGroupPair[0], nnGroup = nnGroupPair[1].nodes, clusters = [];
-                    nnGroup.forEach(function (metanode) {
-                        // check with each existing cluster
-                        for (var i = 0; i < clusters.length; i++) {
-                            var similar = !verifyTemplate ||
-                                isSimilarSubgraph(clusters[i].metanode.metagraph, metanode.metagraph);
-                            // if similar, just add this metanode to the cluster
-                            if (similar) {
-                                // get template from the first one
-                                metanode.templateId = clusters[i].metanode.templateId;
-                                clusters[i].members.push(metanode.name);
-                                return;
-                            }
-                        }
-                        // otherwise create a new cluster with id 'signature [count] '
-                        metanode.templateId = signature + '[' + clusters.length + ']';
-                        clusters.push({
-                            metanode: metanode,
-                            members: [metanode.name]
-                        });
-                    });
-                    clusters.forEach(function (c) {
-                        templates[c.metanode.templateId] = {
-                            level: nnGroupPair[1].level,
-                            nodes: c.members
-                        };
-                    });
-                    return templates;
-                }, result);
-            }
-            function sortNodes(names, graph, prefix) {
-                return _.sortByAll(names, function (name) {
-                    var node = graph.node(name);
-                    return node.op;
-                }, function (name) {
-                    var node = graph.node(name);
-                    return node.templateId;
-                }, function (name) {
-                    return graph.neighbors(name).length;
-                }, function (name) {
-                    return graph.predecessors(name).length;
-                }, function (name) {
-                    return graph.successors(name).length;
-                }, function (name) {
-                    return name.substr(prefix.length);
-                });
-            }
-            function isSimilarSubgraph(g1, g2) {
-                if (!tf.graph.hasSimilarDegreeSequence(g1, g2)) {
-                    return false;
-                }
-                // if we want to skip, just return true here.
-                // return true;
-                // Verify sequence by running DFS
-                var g1prefix = g1.graph().name;
-                var g2prefix = g2.graph().name;
-                var visited1 = {};
-                var visited2 = {};
-                var stack = [];
-                /**
-                 * push sources or successors into the stack
-                 * if the visiting pattern has been similar.
-                 */
-                function stackPushIfNotDifferent(n1, n2) {
-                    var sub1 = n1.substr(g1prefix.length), sub2 = n2.substr(g2prefix.length);
-                    /* tslint:disable */
-                    if (visited1[sub1] ^ visited2[sub1]) {
-                        console.warn('different visit pattern', '[' + g1prefix + ']', sub1, '[' + g2prefix + ']', sub2);
-                        return true;
-                    }
-                    /* tslint:enable */
-                    if (!visited1[sub1]) {
-                        visited1[sub1] = visited2[sub2] = true;
-                        stack.push({ n1: n1, n2: n2 });
-                    }
-                    return false;
-                }
-                // check if have same # of sources then sort and push
-                var sources1 = g1.sources();
-                var sources2 = g2.sources();
-                if (sources1.length !== sources2.length) {
-                    /* tslint:disable */
-                    console.log('different source length');
-                    /* tslint:enable */
-                    return false;
-                }
-                sources1 = sortNodes(sources1, g1, g1prefix);
-                sources2 = sortNodes(sources2, g2, g2prefix);
-                for (var i = 0; i < sources1.length; i++) {
-                    var different = stackPushIfNotDifferent(sources1[i], sources2[i]);
-                    if (different) {
-                        return false;
-                    }
-                }
-                while (stack.length > 0) {
-                    var cur = stack.pop();
-                    // check node
-                    var similar = isSimilarNode(g1.node(cur.n1), g2.node(cur.n2));
-                    if (!similar) {
-                        return false;
-                    }
-                    // check if have same # of successors then sort and push
-                    var succ1 = g1.successors(cur.n1), succ2 = g2.successors(cur.n2);
-                    if (succ1.length !== succ2.length) {
-                        /* tslint:disable */
-                        console.log('# of successors mismatch', succ1, succ2);
-                        /* tslint:enable */
-                        return false;
-                    }
-                    succ1 = sortNodes(succ1, g1, g1prefix);
-                    succ2 = sortNodes(succ2, g2, g2prefix);
-                    for (var j = 0; j < succ1.length; j++) {
-                        var different = stackPushIfNotDifferent(succ1[j], succ2[j]);
-                        if (different) {
-                            return false;
-                        }
-                    }
-                }
-                return true;
-            }
-            /**
-             * Returns if two nodes have identical structure.
-             */
-            function isSimilarNode(n1, n2) {
-                if (n1.type === graph_1.NodeType.META) {
-                    // compare metanode
-                    var metanode1 = n1;
-                    var metanode2 = n2;
-                    return metanode1.templateId && metanode2.templateId &&
-                        metanode1.templateId === metanode2.templateId;
-                }
-                else if (n1.type === graph_1.NodeType.OP && n2.type === graph_1.NodeType.OP) {
-                    // compare leaf node
-                    return n1.op === n2.op;
-                }
-                else if (n1.type === graph_1.NodeType.SERIES && n2.type === graph_1.NodeType.SERIES) {
-                    // compare series node sizes and operations
-                    // (only need to check one op as all op nodes are identical in series)
-                    var sn1 = n1;
-                    var sn2 = n2;
-                    var seriesnode1Count = sn1.metagraph.nodeCount();
-                    return (seriesnode1Count === sn2.metagraph.nodeCount() &&
-                        (seriesnode1Count === 0 ||
-                            (sn1.metagraph.node(sn1.metagraph.nodes()[0]).op ===
-                                sn2.metagraph.node(sn2.metagraph.nodes()[0]).op)));
-                }
-                return false;
-            }
-        })(template = graph_1.template || (graph_1.template = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {}));
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-/**
- * @fileoverview Utility functions for the tensorflow graph visualizer.
- */
-var tf;
-(function (tf) {
-    var graph;
-    (function (graph) {
-        var util;
-        (function (util) {
-            /**
-             * Recommended delay (ms) when running an expensive task asynchronously
-             * that gives enough time for the progress bar to update its UI.
-             */
-            var ASYNC_TASK_DELAY = 20;
-            function time(msg, task) {
-                var start = Date.now();
-                var result = task();
-                /* tslint:disable */
-                console.log(msg, ':', Date.now() - start, 'ms');
-                /* tslint:enable */
-                return result;
-            }
-            util.time = time;
-            /**
-             * Creates a tracker that sets the progress property of the
-             * provided polymer component. The provided component must have
-             * a property called 'progress' that is not read-only. The progress
-             * property is an object with a numerical 'value' property and a
-             * string 'msg' property.
-             */
-            function getTracker(polymerComponent) {
-                return {
-                    setMessage: function (msg) {
-                        polymerComponent.set('progress', { value: polymerComponent.progress.value, msg: msg });
-                    },
-                    updateProgress: function (value) {
-                        polymerComponent.set('progress', {
-                            value: polymerComponent.progress.value + value,
-                            msg: polymerComponent.progress.msg
-                        });
-                    },
-                    reportError: function (msg, err) {
-                        // Log the stack trace in the console.
-                        console.error(err.stack);
-                        // And send a user-friendly message to the UI.
-                        polymerComponent.set('progress', { value: polymerComponent.progress.value, msg: msg, error: true });
-                    },
-                };
-            }
-            util.getTracker = getTracker;
-            /**
-             * Creates a tracker for a subtask given the parent tracker, the total
-             * progress
-             * of the subtask and the subtask message. The parent task should pass a
-             * subtracker to its subtasks. The subtask reports its own progress which
-             * becames relative to the main task.
-             */
-            function getSubtaskTracker(parentTracker, impactOnTotalProgress, subtaskMsg) {
-                return {
-                    setMessage: function (progressMsg) {
-                        // The parent should show a concatenation of its message along with
-                        // its subtask tracker message.
-                        parentTracker.setMessage(subtaskMsg + ': ' + progressMsg);
-                    },
-                    updateProgress: function (incrementValue) {
-                        // Update the parent progress relative to the child progress.
-                        // For example, if the sub-task progresses by 30%, and the impact on the
-                        // total progress is 50%, then the task progresses by 30% * 50% = 15%.
-                        parentTracker.updateProgress(incrementValue * impactOnTotalProgress / 100);
-                    },
-                    reportError: function (msg, err) {
-                        // The parent should show a concatenation of its message along with
-                        // its subtask error message.
-                        parentTracker.reportError(subtaskMsg + ': ' + msg, err);
-                    }
-                };
-            }
-            util.getSubtaskTracker = getSubtaskTracker;
-            /**
-             * Runs an expensive task and return the result.
-             */
-            function runTask(msg, incProgressValue, task, tracker) {
-                // Update the progress message to say the current running task.
-                tracker.setMessage(msg);
-                // Run the expensive task with a delay that gives enough time for the
-                // UI to update.
-                try {
-                    var result = tf.graph.util.time(msg, task);
-                    // Update the progress value.
-                    tracker.updateProgress(incProgressValue);
-                    // Return the result to be used by other tasks.
-                    return result;
-                }
-                catch (e) {
-                    // Errors that happen inside asynchronous tasks are
-                    // reported to the tracker using a user-friendly message.
-                    tracker.reportError('Failed ' + msg, e);
-                }
-            }
-            util.runTask = runTask;
-            /**
-             * Runs an expensive task asynchronously and returns a promise of the result.
-             */
-            function runAsyncTask(msg, incProgressValue, task, tracker) {
-                return new Promise(function (resolve, reject) {
-                    // Update the progress message to say the current running task.
-                    tracker.setMessage(msg);
-                    // Run the expensive task with a delay that gives enough time for the
-                    // UI to update.
-                    setTimeout(function () {
-                        try {
-                            var result = tf.graph.util.time(msg, task);
-                            // Update the progress value.
-                            tracker.updateProgress(incProgressValue);
-                            // Return the result to be used by other tasks.
-                            resolve(result);
-                        }
-                        catch (e) {
-                            // Errors that happen inside asynchronous tasks are
-                            // reported to the tracker using a user-friendly message.
-                            tracker.reportError('Failed ' + msg, e);
-                        }
-                    }, ASYNC_TASK_DELAY);
-                });
-            }
-            util.runAsyncTask = runAsyncTask;
-            /**
-             * Asynchronously runs an expensive task that returns a promise. Updates the
-             * tracker's progress after the promise resolves. Returns a new promise that
-             * resolves after the progress is updated.
-             */
-            function runAsyncPromiseTask(msg, incProgressValue, task, tracker) {
-                return new Promise(function (resolve, reject) {
-                    var handleError = function (e) {
-                        // Errors that happen inside asynchronous tasks are
-                        // reported to the tracker using a user-friendly message.
-                        tracker.reportError('Failed ' + msg, e);
-                        reject(e);
-                    };
-                    // Update the progress message to say the current running task.
-                    tracker.setMessage(msg);
-                    // Run the expensive task with a delay that gives enough time for the
-                    // UI to update.
-                    setTimeout(function () {
-                        try {
-                            var start_1 = Date.now();
-                            task()
-                                .then(function (value) {
-                                /* tslint:disable */
-                                console.log(msg, ':', Date.now() - start_1, 'ms');
-                                // Update the progress value.
-                                tracker.updateProgress(incProgressValue);
-                                // Return the result to be used by other tasks.
-                                resolve(value);
-                            })
-                                .catch(handleError);
-                        }
-                        catch (e) {
-                            handleError(e);
-                        }
-                    }, ASYNC_TASK_DELAY);
-                });
-            }
-            util.runAsyncPromiseTask = runAsyncPromiseTask;
-            /**
-             * Returns a query selector with escaped special characters that are not
-             * allowed in a query selector.
-             */
-            function escapeQuerySelector(querySelector) {
-                return querySelector.replace(/([:.\[\],/\\\(\)])/g, '\\$1');
-            }
-            util.escapeQuerySelector = escapeQuerySelector;
-            // For unit conversion.
-            util.MEMORY_UNITS = [
-                // Atomic unit.
-                { symbol: 'B' },
-                // numUnits specifies how many previous units this unit contains.
-                { symbol: 'KB', numUnits: 1024 }, { symbol: 'MB', numUnits: 1024 },
-                { symbol: 'GB', numUnits: 1024 }, { symbol: 'TB', numUnits: 1024 },
-                { symbol: 'PB', numUnits: 1024 }
-            ];
-            util.TIME_UNITS = [
-                // Atomic unit. Finest granularity in TensorFlow stat collection.
-                { symbol: 'µs' },
-                // numUnits specifies how many previous units this unit contains.
-                { symbol: 'ms', numUnits: 1000 }, { symbol: 's', numUnits: 1000 },
-                { symbol: 'min', numUnits: 60 }, { symbol: 'hr', numUnits: 60 },
-                { symbol: 'days', numUnits: 24 }
-            ];
-            /**
-             * Returns the human readable version of the unit.
-             * (e.g. 1.35 GB, 23 MB, 34 ms, 6.53 min etc).
-             */
-            function convertUnitsToHumanReadable(value, units, unitIndex) {
-                unitIndex = unitIndex == null ? 0 : unitIndex;
-                if (unitIndex + 1 < units.length &&
-                    value >= units[unitIndex + 1].numUnits) {
-                    return tf.graph.util.convertUnitsToHumanReadable(value / units[unitIndex + 1].numUnits, units, unitIndex + 1);
-                }
-                // toPrecision() has the tendency to return a number in scientific
-                // notation and (number - 0) brings it back to normal notation.
-                return (value.toPrecision(3) - 0) + ' ' + units[unitIndex].symbol;
-            }
-            util.convertUnitsToHumanReadable = convertUnitsToHumanReadable;
-            function hasDisplayableNodeStats(stats) {
-                if (stats &&
-                    (stats.totalBytes > 0 || stats.totalMicros > 0 || stats.outputSize)) {
-                    return true;
-                }
-                return false;
-            }
-            util.hasDisplayableNodeStats = hasDisplayableNodeStats;
-            /**
-             * Given a list of strings, it returns a new list of strings with the longest
-             * common prefix removed. If the common prefix is one of the strings in the
-             * list, it returns the original strings.
-             */
-            function removeCommonPrefix(strings) {
-                if (strings.length < 2) {
-                    return strings;
-                }
-                var index = 0;
-                var largestIndex = 0;
-                // Find the shortest name across all strings.
-                var minLength = _.min(_.map(strings, function (str) { return str.length; }));
-                var _loop_1 = function () {
-                    index++;
-                    var prefixes = _.map(strings, function (str) { return str.substring(0, index); });
-                    var allTheSame = prefixes.every(function (prefix, i) {
-                        return (i === 0 ? true : prefix === prefixes[i - 1]);
-                    });
-                    if (allTheSame) {
-                        if (index >= minLength) {
-                            return { value: strings };
-                        }
-                        largestIndex = index;
-                    }
-                    else {
-                        return "break";
-                    }
-                };
-                while (true) {
-                    var state_1 = _loop_1();
-                    if (typeof state_1 === "object")
-                        return state_1.value;
-                    if (state_1 === "break")
-                        break;
-                }
-                return _.map(strings, function (str) { return str.substring(largestIndex); });
-            }
-            util.removeCommonPrefix = removeCommonPrefix;
-            /**
-             * Given a queryString, aka ?foo=1&bar=2, return the object representation.
-             */
-            function getQueryParams(queryString) {
-                if (queryString.charAt(0) === '?') {
-                    queryString = queryString.slice(1);
-                }
-                var queryParams = _.chain(queryString.split('&'))
-                    .map(function (item) {
-                    if (item) {
-                        return item.split('=');
-                    }
-                })
-                    .compact()
-                    .value();
-                return _.object(queryParams);
-            }
-            util.getQueryParams = getQueryParams;
-        })(util = graph.util || (graph.util = {}));
-    })(graph = tf.graph || (tf.graph = {}));
-})(tf || (tf = {}));
-</script>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the 'License');
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an 'AS IS' BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var tf;
-(function (tf) {
-    var scene;
-    (function (scene) {
-        /** Show minimap when the viewpoint area is less than X% of the whole area. */
-        var FRAC_VIEWPOINT_AREA = 0.8;
-        var Minimap = (function () {
-            /**
-             * Constructs a new minimap.
-             *
-             * @param svg The main svg element.
-             * @param zoomG The svg group used for panning and zooming the main svg.
-             * @param mainZoom The main zoom behavior.
-             * @param minimap The minimap container.
-             * @param maxWandH The maximum width/height for the minimap.
-             * @param labelPadding Padding in pixels due to the main graph labels.
-             */
-            function Minimap(svg, zoomG, mainZoom, minimap, maxWandH, labelPadding) {
-                var _this = this;
-                this.svg = svg;
-                this.labelPadding = labelPadding;
-                this.zoomG = zoomG;
-                this.mainZoom = mainZoom;
-                this.maxWandH = maxWandH;
-                var $minimap = d3.select(minimap);
-                // The minimap will have 2 main components: the canvas showing the content
-                // and an svg showing a rectangle of the currently zoomed/panned viewpoint.
-                var $minimapSvg = $minimap.select('svg');
-                // Make the viewpoint rectangle draggable.
-                var $viewpoint = $minimapSvg.select('rect');
-                var dragmove = function (d) {
-                    _this.viewpointCoord.x = d3.event.x;
-                    _this.viewpointCoord.y = d3.event.y;
-                    _this.updateViewpoint();
-                };
-                this.viewpointCoord = { x: 0, y: 0 };
-                var drag = d3.behavior.drag().origin(Object).on('drag', dragmove);
-                $viewpoint.datum(this.viewpointCoord).call(drag);
-                // Make the minimap clickable.
-                $minimapSvg.on('click', function () {
-                    if (d3.event.defaultPrevented) {
-                        // This click was part of a drag event, so suppress it.
-                        return;
-                    }
-                    // Update the coordinates of the viewpoint.
-                    var width = Number($viewpoint.attr('width'));
-                    var height = Number($viewpoint.attr('height'));
-                    var clickCoords = d3.mouse($minimapSvg.node());
-                    _this.viewpointCoord.x = clickCoords[0] - width / 2;
-                    _this.viewpointCoord.y = clickCoords[1] - height / 2;
-                    _this.updateViewpoint();
-                });
-                this.viewpoint = $viewpoint.node();
-                this.minimapSvg = $minimapSvg.node();
-                this.minimap = minimap;
-                this.canvas = $minimap.select('canvas.first').node();
-                this.canvasBuffer =
-                    $minimap.select('canvas.second').node();
-                this.downloadCanvas =
-                    $minimap.select('canvas.download').node();
-                d3.select(this.downloadCanvas).style('display', 'none');
-                this.update();
-            }
-            /**
-             * Updates the position and the size of the viewpoint rectangle.
-             * It also notifies the main svg about the new panned position.
-             */
-            Minimap.prototype.updateViewpoint = function () {
-                // Update the coordinates of the viewpoint rectangle.
-                d3.select(this.viewpoint)
-                    .attr('x', this.viewpointCoord.x)
-                    .attr('y', this.viewpointCoord.y);
-                // Update the translation vector of the main svg to reflect the
-                // new viewpoint.
-                var mainX = -this.viewpointCoord.x * this.scaleMain / this.scaleMinimap;
-                var mainY = -this.viewpointCoord.y * this.scaleMain / this.scaleMinimap;
-                var zoomEvent = this.mainZoom.translate([mainX, mainY]).event;
-                d3.select(this.zoomG).call(zoomEvent);
-            };
-            /**
-             * Redraws the minimap. Should be called whenever the main svg
-             * was updated (e.g. when a node was expanded).
-             */
-            Minimap.prototype.update = function () {
-                var _this = this;
-                var sceneSize = null;
-                try {
-                    // Get the size of the entire scene.
-                    sceneSize = this.zoomG.getBBox();
-                    if (sceneSize.width === 0) {
-                        // There is no scene anymore. We have been detached from the dom.
-                        return;
-                    }
-                }
-                catch (e) {
-                    // Firefox produced NS_ERROR_FAILURE if we have been
-                    // detached from the dom.
-                    return;
-                }
-                var $download = d3.select('#graphdownload');
-                this.download = $download.node();
-                $download.on('click', function (d) {
-                    _this.download.href = _this.downloadCanvas.toDataURL('image/png');
-                });
-                var $svg = d3.select(this.svg);
-                // Read all the style rules in the document and embed them into the svg.
-                // The svg needs to be self contained, i.e. all the style rules need to be
-                // embedded so the canvas output matches the origin.
-                var stylesText = '';
-                for (var k = 0; k < document.styleSheets.length; k++) {
-                    try {
-                        var cssRules = document.styleSheets[k].cssRules ||
-                            document.styleSheets[k].rules;
-                        if (cssRules == null) {
-                            continue;
-                        }
-                        for (var i = 0; i < cssRules.length; i++) {
-                            // Remove tf-* selectors from the styles.
-                            stylesText +=
-                                cssRules[i].cssText.replace(/ ?tf-[\w-]+ ?/g, '') + '\n';
-                        }
-                    }
-                    catch (e) {
-                        if (e.name !== 'SecurityError') {
-                            throw e;
-                        }
-                    }
-                }
-                // Temporarily add the css rules to the main svg.
-                var svgStyle = $svg.append('style');
-                svgStyle.text(stylesText);
-                // Temporarily remove the zoom/pan transform from the main svg since we
-                // want the minimap to show a zoomed-out and centered view.
-                var $zoomG = d3.select(this.zoomG);
-                var zoomTransform = $zoomG.attr('transform');
-                $zoomG.attr('transform', null);
-                // Since we add padding, account for that here.
-                sceneSize.height += this.labelPadding * 2;
-                sceneSize.width += this.labelPadding * 2;
-                // Temporarily assign an explicit width/height to the main svg, since
-                // it doesn't have one (uses flex-box), but we need it for the canvas
-                // to work.
-                $svg.attr({
-                    width: sceneSize.width,
-                    height: sceneSize.height,
-                });
-                // Since the content inside the svg changed (e.g. a node was expanded),
-                // the aspect ratio have also changed. Thus, we need to update the scale
-                // factor of the minimap. The scale factor is determined such that both
-                // the width and height of the minimap are <= maximum specified w/h.
-                this.scaleMinimap =
-                    this.maxWandH / Math.max(sceneSize.width, sceneSize.height);
-                this.minimapSize = {
-                    width: sceneSize.width * this.scaleMinimap,
-                    height: sceneSize.height * this.scaleMinimap
-                };
-                // Update the size of the minimap's svg, the buffer canvas and the
-                // viewpoint rect.
-                d3.select(this.minimapSvg).attr(this.minimapSize);
-                d3.select(this.canvasBuffer).attr(this.minimapSize);
-                // Download canvas width and height are multiples of the style width and
-                // height in order to increase pixel density of the PNG for clarity.
-                d3.select(this.downloadCanvas).style({ width: sceneSize.width, height: sceneSize.height });
-                d3.select(this.downloadCanvas).attr({ width: sceneSize.width * 3, height: sceneSize.height * 3 });
-                if (this.translate != null && this.zoom != null) {
-                    // Update the viewpoint rectangle shape since the aspect ratio of the
-                    // map has changed.
-                    requestAnimationFrame(function () { return _this.zoom(); });
-                }
-                // Serialize the main svg to a string which will be used as the rendering
-                // content for the canvas.
-                var svgXml = (new XMLSerializer()).serializeToString(this.svg);
-                // Now that the svg is serialized for rendering, remove the temporarily
-                // assigned styles, explicit width and height and bring back the pan/zoom
-                // transform.
-                svgStyle.remove();
-                $svg.attr({
-                    width: null,
-                    height: null
-                });
-                $zoomG.attr('transform', zoomTransform);
-                var image = new Image();
-                image.onload = function () {
-                    // Draw the svg content onto the buffer canvas.
-                    var context = _this.canvasBuffer.getContext('2d');
-                    context.clearRect(0, 0, _this.canvasBuffer.width, _this.canvasBuffer.height);
-                    context.drawImage(image, 0, 0, _this.minimapSize.width, _this.minimapSize.height);
-                    requestAnimationFrame(function () {
-                        // Hide the old canvas and show the new buffer canvas.
-                        d3.select(_this.canvasBuffer).style('display', null);
-                        d3.select(_this.canvas).style('display', 'none');
-                        // Swap the two canvases.
-                        _a = [_this.canvasBuffer, _this.canvas], _this.canvas = _a[0], _this.canvasBuffer = _a[1];
-                        var _a;
-                    });
-                    var downloadContext = _this.downloadCanvas.getContext('2d');
-                    downloadContext.clearRect(0, 0, _this.downloadCanvas.width, _this.downloadCanvas.height);
-                    downloadContext.drawImage(image, 0, 0, _this.downloadCanvas.width, _this.downloadCanvas.height);
-                };
-                image.onerror = function () {
-                    var blob = new Blob([svgXml], { type: 'image/svg+xml;charset=utf-8' });
-                    image.src = URL.createObjectURL(blob);
-                };
-                image.src =
-                    'data:image/svg+xml;charset=utf-8,' + encodeURIComponent(svgXml);
-            };
-            /**
-             * Handles changes in zooming/panning. Should be called from the main svg
-             * to notify that a zoom/pan was performed and this minimap will update it's
-             * viewpoint rectangle.
-             *
-             * @param translate The translate vector, or none to use the last used one.
-             * @param scale The scaling factor, or none to use the last used one.
-             */
-            Minimap.prototype.zoom = function (translate, scale) {
-                if (this.scaleMinimap == null) {
-                    // Scene is not ready yet.
-                    return;
-                }
-                // Update the new translate and scale params, only if specified.
-                this.translate = translate || this.translate;
-                this.scaleMain = scale || this.scaleMain;
-                // Update the location of the viewpoint rectangle.
-                var svgRect = this.svg.getBoundingClientRect();
-                var $viewpoint = d3.select(this.viewpoint);
-                this.viewpointCoord.x = -this.translate[0] * this.scaleMinimap /
-                    this.scaleMain;
-                this.viewpointCoord.y = -this.translate[1] * this.scaleMinimap /
-                    this.scaleMain;
-                var viewpointWidth = svgRect.width * this.scaleMinimap / this.scaleMain;
-                var viewpointHeight = svgRect.height * this.scaleMinimap / this.scaleMain;
-                $viewpoint.attr({
-                    x: this.viewpointCoord.x,
-                    y: this.viewpointCoord.y,
-                    width: viewpointWidth,
-                    height: viewpointHeight
-                });
-                // Show/hide the minimap depending on the viewpoint area as fraction of the
-                // whole minimap.
-                var mapWidth = this.minimapSize.width;
-                var mapHeight = this.minimapSize.height;
-                var x = this.viewpointCoord.x;
-                var y = this.viewpointCoord.y;
-                var w = Math.min(Math.max(0, x + viewpointWidth), mapWidth) -
-                    Math.min(Math.max(0, x), mapWidth);
-                var h = Math.min(Math.max(0, y + viewpointHeight), mapHeight) -
-                    Math.min(Math.max(0, y), mapHeight);
-                var fracIntersect = (w * h) / (mapWidth * mapHeight);
-                if (fracIntersect < FRAC_VIEWPOINT_AREA) {
-                    this.minimap.classList.remove('hidden');
-                }
-                else {
-                    this.minimap.classList.add('hidden');
-                }
-            };
-            return Minimap;
-        }());
-        scene.Minimap = Minimap;
-    })(scene = tf.scene || (tf.scene = {}));
-})(tf || (tf = {})); // close module tf.scene
-</script>
-
-<dom-module id="tf-graph-minimap" assetpath="../tf-graph/">
-<template>
-<style>
-:host {
-  background-color:white;
-  transition: opacity .3s linear;
-  pointer-events: auto;
-}
-
-:host.hidden {
-  opacity: 0;
-  pointer-events: none;
-}
-
-canvas {
-  border: 1px solid #999;
-}
-
-rect {
-  fill: white;
-  stroke: #111111;
-  stroke-width: 1px;
-  fill-opacity: 0;
-  filter: url("#minimapDropShadow");
-  cursor: move;
-}
-
-svg {
-  position: absolute;
-}
-</style>
-<svg>
-  <defs>
-    <filter id="minimapDropShadow" x="-20%" y="-20%" width="150%" height="150%">
-      <feOffset result="offOut" in="SourceGraphic" dx="1" dy="1"></feOffset>
-      <feColorMatrix result="matrixOut" in="offOut" type="matrix" values="0.1 0 0 0 0 0 0.1 0 0 0 0 0 0.1 0 0 0 0 0 0.5 0"></feColorMatrix>
-      <feGaussianBlur result="blurOut" in="matrixOut" stdDeviation="2"></feGaussianBlur>
-      <feBlend in="SourceGraphic" in2="blurOut" mode="normal"></feBlend>
-    </filter>
-  </defs>
-  <rect></rect>
-</svg>
-<canvas class="first"></canvas>
-
-<canvas class="second"></canvas>
-<canvas class="download"></canvas>
-</template>
-<script>
-Polymer({
-  is: 'tf-graph-minimap',
-
-  /**
-   * Initializes the minimap and returns a minimap object to notify when
-   * things update.
-   *
-   * @param svg The main svg element.
-   * @param zoomG The svg group used for panning and zooming the main svg.
-   * @param mainZoom The main zoom behavior.
-   * @param maxWandH The maximum width/height for the minimap.
-   * @param labelPadding Padding in pixels due to the main graph labels.
-   */
-  init: function(svg, zoomG, mainZoom, maxWAndH, labelPadding) {
-    return new tf.scene.Minimap(svg, zoomG, mainZoom, this, maxWAndH,
-        labelPadding);
-  }
-});
-</script>
-</dom-module>
-
-<dom-module id="tf-graph-scene" assetpath="../tf-graph/">
-<template>
-<style>
-:host {
-  display: flex;
-  width: 100%;
-  font-size: 20px;
-}
-
-::content #svg {
-  overflow: hidden;
-  flex: 1;
-  height: 100%;
-  width: 100%;
-}
-
-::content #hidden {
-  position: fixed;
-  top: 0px;
-  visibility: hidden;
-}
-
-/* --- Node and annotation-node for Metanode --- */
-
-::content .meta > .nodeshape > rect,
-::content .meta > .annotation-node > rect {
-  cursor: pointer;
-  fill: hsl(0, 0%, 70%);
-}
-
-::content .node.meta.highlighted > .nodeshape > rect,
-::content .node.meta.highlighted > .annotation-node > rect {
-  stroke-width: 2;
-}
-
-::content .annotation.meta.highlighted > .nodeshape > rect,
-::content .annotation.meta.highlighted > .annotation-node > rect {
-  stroke-width: 1;
-}
-
-::content .meta.selected > .nodeshape > rect,
-::content .meta.selected > .annotation-node > rect {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .node.meta.selected.expanded > .nodeshape > rect,
-::content .node.meta.selected.expanded > .annotation-node > rect {
-  stroke: red;
-  stroke-width: 3;
-}
-
-::content .annotation.meta.selected > .nodeshape > rect,
-::content .annotation.meta.selected > .annotation-node > rect {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .node.meta.selected.expanded.highlighted > .nodeshape > rect,
-::content .node.meta.selected.expanded.highlighted > .annotation-node > rect {
-  stroke: red;
-  stroke-width: 4;
-}
-
-::content .faded,
-::content .faded rect,
-::content .faded ellipse,
-::content .faded path,
-::content .faded use,
-::content #rectHatch line,
-::content #ellipseHatch line {
-  color: #e0d4b3 !important;
-  fill: white;
-  stroke: #e0d4b3 !important;
-}
-
-
-::content .faded path {
-  stroke-width: 1px !important;
-}
-
-::content .faded rect {
-  fill: url("#rectHatch") !important;
-}
-
-::content .faded ellipse,
-::content .faded use {
-  fill: url("#ellipseHatch") !important;
-}
-
-::content .faded text {
-  opacity: 0;
-}
-
-/* Rules used for input-tracing. */
-::content .input-highlight > * > rect,
-::content .input-highlight > * > ellipse,
-::content .input-highlight > * > use
-{
-  fill: white;
-  stroke: #ff9800 !important;
-}
-
-/*  - Faded non-input styling */
-::content .non-input > * > rect,
-::content .non-input > * > ellipse,
-::content .non-input > * > use,
-/* For Const nodes. */
-::content .non-input > * > .constant:not([class*="input-highlight"]) >
-  .annotation-node > ellipse,
-/* For styling of annotation nodes of non-input nodes. */
-::content .non-input > g > .annotation > .annotation-node > rect {
-  stroke: #e0d4b3 !important;
-  stroke-width: inherit;
-  stroke-dasharray: inherit;
-}
-
-
-::content .non-input path {
-  visibility: hidden;
-}
-
-::content .non-input > .nodeshape > rect,
-::content .non-input > .annotation-node > rect,
-/* For styling of annotation nodes of non-input nodes. */
-::content .non-input > g > .annotation > .annotation-node > rect
-{
-  fill: url("#rectHatch") !important;
-}
-
-::content .non-input ellipse,
-::content .non-input use {
-  fill: url("#ellipseHatch") !important;
-}
-
-::content .non-input > text {
-  opacity: 0;
-}
-
-::content .non-input .annotation > .annotation-edge {
-  marker-end: url("#annotation-arrowhead-faded");
-}
-
-::content .non-input .annotation > .annotation-edge.refline {
-  marker-start: url("#ref-annotation-arrowhead-faded");
-}
-
-/* Input edges. */
-::content .input-edge-highlight > text {
-  fill: black !important;
-}
-::content .input-edge-highlight > path,
-::content .input-highlight > .in-annotations > .annotation > .annotation-edge,
-::content .input-highlight-selected > .in-annotations > .annotation >
-.annotation-edge {
-  stroke: #999 !important;
-}
-
-/* Non-input edges. */
-::content .non-input-edge-highlight,
-::content .non-input > g > .annotation > path,
-/* Annotation styles (label and edges respectively). */
-::content .non-input > g >
-.annotation:not(.input-highlight):not(.input-highlight-selected) >
-.annotation-label
-/*.annotation-edge*/
-{
-  visibility: hidden;
-}
-
-/* --- Op Node --- */
-
-::content .op > .nodeshape > ellipse,
-::content .op > .annotation-node > ellipse {
-  cursor: pointer;
-  fill: #fff;
-  stroke: #ccc;
-}
-
-::content .op.selected > .nodeshape > ellipse,
-::content .op.selected > .annotation-node > ellipse {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .op.highlighted > .nodeshape > ellipse,
-::content .op.highlighted > .annotation-node > ellipse {
-  stroke-width: 2;
-}
-
-/* --- Series Node --- */
-
-/* By default, don't show the series background <rect>. */
-::content .series > .nodeshape > rect {
-  fill: hsl(0, 0%, 70%);
-  fill-opacity: 0;
-  stroke-dasharray: 5, 5;
-  stroke-opacity: 0;
-  cursor: pointer;
-}
-
-/* Once expanded, show the series background <rect> and hide the <use>. */
-::content .series.expanded > .nodeshape > rect {
-  fill-opacity: 0.15;
-  stroke: hsl(0, 0%, 70%);
-  stroke-opacity: 1;
-}
-::content .series.expanded > .nodeshape > use {
-  visibility: hidden;
-}
-
-/**
- * TODO(jimbo): Simplify this by applying a stable class name to all <g>
- * elements that currently have either the nodeshape or annotation-node classes.
- */
-::content .series > .nodeshape > use ,
-::content .series > .annotation-node > use {
-  stroke: #ccc;
-}
-::content .series.highlighted > .nodeshape > use ,
-::content .series.highlighted > .annotation-node > use {
-  stroke-width: 2;
-}
-::content .series.selected > .nodeshape > use ,
-::content .series.selected > .annotation-node > use {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .series.selected > .nodeshape > rect {
-  stroke: red;
-  stroke-width: 2;
-}
-
-::content .annotation.series.selected > .annotation-node > use {
-  stroke: red;
-  stroke-width: 2;
-}
-
-/* --- Bridge Node --- */
-::content .bridge > .nodeshape > rect {
-  stroke: #f0f;
-  opacity: 0.2;
-  display: none;
-}
-
-/* --- Structural Elements --- */
-::content .edge > path.edgeline.structural {
-  stroke: #f0f;
-  opacity: 0.2;
-  display: none;
-}
-
-/* --- Series Nodes --- */
-
-/* Hide the rect for a series' annotation. */
-::content .series > .annotation-node > rect {
-  display: none;
-}
-
-/* --- Node label --- */
-
-
-::content .node > text.nodelabel {
-  cursor: pointer;
-  fill: #444;
-}
-
-::content .meta.expanded > text.nodelabel {
-  font-size: 9px;
-}
-
-::content .series > text.nodelabel {
-  font-size: 8px;
-}
-
-::content .op > text.nodelabel {
-  font-size: 6px;
-}
-
-::content .bridge > text.nodelabel {
-  display: none;
-}
-
-::content .node.meta.expanded > text.nodelabel{
-  cursor: normal;
-}
-
-::content .annotation.meta.highlighted > text.annotation-label {
-  fill: #50A3F7;
-}
-
-::content .annotation.meta.selected > text.annotation-label {
-  fill: #4285F4;
-}
-
-/* --- Annotation --- */
-
-/* only applied for annotations that are not summary or constant.
-(.summary, .constant gets overriden below) */
-::content .annotation > .annotation-node > * {
-  stroke-width: 0.5;
-  stroke-dasharray: 1, 1;
-}
-
-::content .annotation.summary > .annotation-node > *,
-::content .annotation.constant > .annotation-node > * {
-  stroke-width: 1;
-  stroke-dasharray: none;
-}
-
-::content .annotation > .annotation-edge {
-  fill: none;
-  stroke: #aaa;
-  stroke-width: 0.5;
-  marker-end: url("#annotation-arrowhead");
-}
-
-::content .faded .annotation > .annotation-edge {
-  marker-end: url("#annotation-arrowhead-faded");
-}
-
-::content .annotation > .annotation-edge.refline {
-  marker-start: url("#ref-annotation-arrowhead");
-}
-
-::content .faded .annotation > .annotation-edge.refline {
-  marker-start: url("#ref-annotation-arrowhead-faded");
-}
-
-::content .annotation > .annotation-control-edge {
-  stroke-dasharray: 1, 1;
-}
-
-::content #annotation-arrowhead {
-  fill: #aaa;
-}
-
-::content #annotation-arrowhead-faded {
-  fill: #e0d4b3;
-}
-
-::content #ref-annotation-arrowhead {
-  fill: #aaa;
-}
-
-::content #ref-annotation-arrowhead-faded {
-  fill: #e0d4b3;
-}
-
-::content .annotation > .annotation-label {
-  font-size: 5px;
-  cursor: pointer;
-}
-::content .annotation > .annotation-label.annotation-ellipsis {
-  cursor: default;
-}
-
-/* Hide annotations on expanded meta nodes since they're redundant. */
-::content .expanded > .in-annotations,
-::content .expanded > .out-annotations {
-  display: none;
-}
-
-/* --- Annotation: Constant --- */
-
-::content .constant > .annotation-node > ellipse {
-  cursor: pointer;
-  fill: white;
-  stroke: #848484;
-}
-
-::content .constant.selected > .annotation-node > ellipse {
-  fill: white;
-  stroke: red;
-}
-
-::content .constant.highlighted > .annotation-node > ellipse {
-  stroke-width: 1.5;
-}
-
-/* --- Annotation: Summary --- */
-
-::content .summary > .annotation-node > ellipse {
-  cursor: pointer;
-  fill: #DB4437;
-  stroke: #DB4437;
-}
-
-::content .summary.selected > .annotation-node > ellipse {
-  fill: #A52714;
-  stroke: #A52714;
-}
-
-::content .summary.highlighted > .annotation-node > ellipse {
-  stroke-width: 1.5;
-}
-
-/* --- Edge --- */
-
-::content .edge > path.edgeline {
-  fill: none;
-  stroke: #bbb;
-  stroke-linecap: round;
-  stroke-width: 0.75;
-}
-
-/* Labels showing tensor shapes on edges */
-::content .edge > text {
-  font-size: 3.5px;
-  fill: #666;
-}
-
-::content .ref-arrowhead {
-  fill: #bbb;
-}
-
-::content .edge .control-dep {
-  stroke-dasharray: 2, 2;
-}
-
-/* --- Group node expand/collapse button --- */
-
-/* Hides expand/collapse buttons when a node isn't expanded or highlighted. Using
-   incredibly small opacity so that the bounding box of the <g> parent still takes
-   this container into account even when it isn't visible */
-::content .node:not(.highlighted):not(.expanded) > .nodeshape > .buttoncontainer {
-  opacity: 0.01;
-}
-::content .node.highlighted > .nodeshape > .buttoncontainer {
-  cursor: pointer;
-}
-::content .buttoncircle {
-  fill: #E7811D;
-}
-::content .buttoncircle:hover {
-  fill: #B96717;
-}
-::content .expandbutton,
-::content .collapsebutton {
-  stroke: white;
-}
-/* Do not let the path elements in the button take pointer focus */
-::content .node > .nodeshape > .buttoncontainer > .expandbutton,
-::content .node > .nodeshape > .buttoncontainer > .collapsebutton {
-  pointer-events: none;
-}
-/* Only show the expand button when a node is collapsed and only show the
-   collapse button when a node is expanded. */
-::content .node.expanded > .nodeshape > .buttoncontainer > .expandbutton {
-  display: none;
-}
-::content .node:not(.expanded) > .nodeshape > .buttoncontainer > .collapsebutton {
-  display: none;
-}
-
-::content .health-pill-stats {
-  font-size: 4px;
-  text-anchor: middle;
-}
-
-::content .health-pill rect {
-  filter: url("#health-pill-shadow");
-  rx: 3;
-  ry: 3;
-}
-
-.titleContainer {
-  position: relative;
-  top: 20px;
-}
-
-.title {
-  position: absolute;
-}
-
-.auxTitle {
-  position: absolute;
-}
-
-#minimap {
-  position: absolute;
-  right: 20px;
-  bottom: 20px;
-}
-</style>
-<div class="titleContainer">
-  <div id="title" class="title">Main Graph</div>
-  <div id="auxTitle" class="auxTitle">Auxiliary Nodes</div>
-</div>
-<svg id="svg">
-  <defs>
-
-
-    <path id="ref-arrowhead-path" d="M 10,0 L 0,5 L 10,10 C 7,7 7,3 10,0"></path>
-    <marker class="ref-arrowhead" id="ref-arrowhead-small" viewBox="0 0 10 10" markerWidth="10" markerHeight="10" refX="8" refY="5" orient="auto" markerUnits="userSpaceOnUse">
-      <use xlink:href="#ref-arrowhead-path"></use>
-    </marker>
-    <marker class="ref-arrowhead" id="ref-arrowhead-medium" viewBox="0 0 10 10" markerWidth="13" markerHeight="13" refX="8" refY="5" orient="auto" markerUnits="userSpaceOnUse">
-      <use xlink:href="#ref-arrowhead-path"></use>
-    </marker>
-    <marker class="ref-arrowhead" id="ref-arrowhead-large" viewBox="0 0 10 10" markerWidth="16" markerHeight="16" refX="8" refY="5" orient="auto" markerUnits="userSpaceOnUse">
-      <use xlink:href="#ref-arrowhead-path"></use>
-    </marker>
-    <marker class="ref-arrowhead" id="ref-arrowhead-xlarge" viewBox="0 0 10 10" markerWidth="20" markerHeight="20" refX="8" refY="5" orient="auto" markerUnits="userSpaceOnUse">
-      <use xlink:href="#ref-arrowhead-path"></use>
-    </marker>
-
-
-    <marker id="annotation-arrowhead" markerWidth="5" markerHeight="5" refX="5" refY="2.5" orient="auto">
-      <path d="M 0,0 L 5,2.5 L 0,5 L 0,0"></path>
-    </marker>
-    <marker id="annotation-arrowhead-faded" markerWidth="5" markerHeight="5" refX="5" refY="2.5" orient="auto">
-      <path d="M 0,0 L 5,2.5 L 0,5 L 0,0"></path>
-    </marker>
-    <marker id="ref-annotation-arrowhead" markerWidth="5" markerHeight="5" refX="0" refY="2.5" orient="auto">
-      <path d="M 5,0 L 0,2.5 L 5,5 L 5,0"></path>
-    </marker>
-    <marker id="ref-annotation-arrowhead-faded" markerWidth="5" markerHeight="5" refX="0" refY="2.5" orient="auto">
-      <path d="M 5,0 L 0,2.5 L 5,5 L 5,0"></path>
-    </marker>
-
-    <ellipse id="op-node-stamp" rx="7.5" ry="3" stroke="inherit" fill="inherit"></ellipse>
-
-    <ellipse id="op-node-annotation-stamp" rx="5" ry="2" stroke="inherit" fill="inherit"></ellipse>
-
-    <g id="op-series-vertical-stamp">
-      <use xlink:href="#op-node-stamp" x="8" y="9"></use>
-      <use xlink:href="#op-node-stamp" x="8" y="6"></use>
-      <use xlink:href="#op-node-stamp" x="8" y="3"></use>
-    </g>
-
-    <g id="op-series-horizontal-stamp">
-      <use xlink:href="#op-node-stamp" x="16" y="4"></use>
-      <use xlink:href="#op-node-stamp" x="12" y="4"></use>
-      <use xlink:href="#op-node-stamp" x="8" y="4"></use>
-    </g>
-
-    <g id="op-series-annotation-stamp">
-      <use xlink:href="#op-node-annotation-stamp" x="9" y="2"></use>
-      <use xlink:href="#op-node-annotation-stamp" x="7" y="2"></use>
-      <use xlink:href="#op-node-annotation-stamp" x="5" y="2"></use>
-    </g>
-    <svg id="summary-icon" fill="#848484" height="12" viewBox="0 0 24 24" width="12">
-      <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"></path>
-    </svg>
-
-    <g id="linearGradients"></g>
-
-
-    <pattern id="rectHatch" patternTransform="rotate(45 0 0)" width="5" height="5" patternUnits="userSpaceOnUse">
-      <line x1="0" y1="0" x2="0" y2="5" style="stroke-width: 1"></line>
-    </pattern>
-    <pattern id="ellipseHatch" patternTransform="rotate(45 0 0)" width="2" height="2" patternUnits="userSpaceOnUse">
-      <line x1="0" y1="0" x2="0" y2="2" style="stroke-width: 1"></line>
-    </pattern>
-
-
-    <filter id="health-pill-shadow" x="-40%" y="-40%" width="180%" height="180%">
-      <feGaussianBlur in="SourceAlpha" stdDeviation="0.8"></feGaussianBlur>
-      <feOffset dx="0" dy="0" result="offsetblur"></feOffset>
-      <feFlood flood-color="#000000"></feFlood>
-      <feComposite in2="offsetblur" operator="in"></feComposite>
-      <feMerge>
-        <feMergeNode></feMergeNode>
-        <feMergeNode in="SourceGraphic"></feMergeNode>
-      </feMerge>
-    </filter>
-  </defs>
-
-  <rect fill="white" width="10000" height="10000"></rect>
-  <g id="root"></g>
-</svg>
-<tf-graph-minimap id="minimap"></tf-graph-minimap>
-</template>
-<script>
-Polymer({
-  is: 'tf-graph-scene',
-  properties: {
-    renderHierarchy: Object,
-    name: String,
-    colorBy: String,
-    /** @type {d3_zoom} d3 zoom object */
-    _zoom: Object,
-    highlightedNode: {
-      type: String,
-      observer: '_highlightedNodeChanged'
-    },
-    selectedNode: {
-      type: String,
-      observer: '_selectedNodeChanged'
-    },
-    /** Keeps track of if the graph has been zoomed/panned since loading */
-    _zoomed: {
-      type: Boolean,
-      observer: '_onZoomChanged',
-      value: false
-    },
-    /** Keeps track of the starting coordinates of a graph zoom/pan */
-    _zoomStartCoords: {
-      type: Array,
-      value: null
-    },
-    /** Keeps track of the current coordinates of a graph zoom/pan */
-    _zoomCoords: {
-      type: Array,
-      value: null
-    },
-    /** Maximum distance of a zoom event for it to be interpreted as a click */
-    _maxZoomDistanceForClick: {
-      type: Number,
-      value: 20
-    },
-    /**
-     * @type {d3.scale.ordinal}
-     * Scale mapping from template name to a number between 0 and N-1
-     * where N is the number of different template names. Used by
-     * tf.graph.scene.node when computing node color by structure.
-     */
-    templateIndex: Function,
-    /**
-     * @type {tf.scene.Minimap}
-     * A minimap object to notify for zoom events.
-     */
-    minimap: Object,
-    /*
-     * Dictionary for easily stylizing nodes when state changes.
-     * _nodeGroupIndex[nodeName] = d3_selection of the nodeGroup
-     */
-    _nodeGroupIndex: {
-      type: Object,
-      value: function() { return {}; }
-    },
-    /*
-     * Dictionary for easily stylizing annotation nodes when state changes.
-     * _annotationGroupIndex[nodeName][hostNodeName] =
-     *   d3_selection of the annotationGroup
-     */
-    _annotationGroupIndex: {
-      type: Object,
-      value: function() { return {}; }
-    },
-    /*
-     * Dictionary for easily stylizing edges when state changes.
-     * _edgeGroupIndex[edgeName] = d3_selection of the edgeGroup
-     */
-    _edgeGroupIndex: {
-      type: Object,
-      value: function() { return {}; }
-    },
-    /**
-     * Max font size for metanode label strings.
-     */
-    maxMetanodeLabelLengthFontSize: {
-      type: Number,
-      value: 9
-    },
-    /**
-     * Min font size for metanode label strings.
-     */
-    minMetanodeLabelLengthFontSize: {
-      type: Number,
-      value: 6
-    },
-    /**
-     * Metanode label strings longer than this are given smaller fonts.
-     */
-    maxMetanodeLabelLengthLargeFont: {
-      type: Number,
-      value: 11
-    },
-    /**
-     * Metanode label strings longer than this are truncated with ellipses.
-     */
-    maxMetanodeLabelLength: {
-      type: Number,
-      value: 18
-    },
-    progress: Object,
-    // A mapping between node name to the tf.graph.scene.HealthPill to render.
-    nodeNamesToHealthPills: Object,
-    // The step of health pills to show throughout the graph.
-    healthPillStepIndex: Number,
-  },
-  observers: [
-    '_colorByChanged(colorBy)',
-    '_buildAndFit(renderHierarchy)',
-    '_updateHealthPills(nodeNamesToHealthPills, healthPillStepIndex)',
-  ],
-  getNode: function(nodeName) {
-    return this.renderHierarchy.getRenderNodeByName(nodeName);
-  },
-  isNodeExpanded: function(node) {
-    return node.expanded;
-  },
-  setNodeExpanded: function(renderNode) {
-    this._build(this.renderHierarchy);
-    this._updateLabels(!this._zoomed);
-  },
-  /**
-   * Resets the state of the component. Called whenever the whole graph
-   * (dataset) changes.
-   */
-  _resetState: function() {
-    // Reset the state of the component.
-    this._nodeGroupIndex = {};
-    this._annotationGroupIndex = {};
-    this._edgeGroupIndex = {};
-    this._updateLabels(false);
-    // Remove all svg elements under the 'root' svg group.
-    d3.select(this.$.svg).select('#root').selectAll('*').remove();
-    // And the defs.
-    d3.select(this.$.svg).select('defs #linearGradients')
-        .selectAll('*').remove();
-  },
-  /** Main method for building the scene */
-  _build: function(renderHierarchy) {
-    this.templateIndex = renderHierarchy.hierarchy.getTemplateIndex();
-    tf.graph.util.time('tf-graph-scene (layout):', function() {
-      // layout the scene for this meta / series node
-      tf.graph.layout.layoutScene(renderHierarchy.root, this);
-    }.bind(this));
-
-    tf.graph.util.time('tf-graph-scene (build scene):', function() {
-      tf.graph.scene.buildGroup(d3.select(this.$.root), renderHierarchy.root, this);
-      tf.graph.scene.addGraphClickListener(this.$.svg, this);
-      tf.graph.scene.node.traceInputs(renderHierarchy);
-    }.bind(this));
-    // Update the minimap again when the graph is done animating.
-    setTimeout(function() {
-      this._updateHealthPills(this.nodeNamesToHealthPills, this.healthPillStepIndex);
-      this.minimap.update();
-    }.bind(this), tf.graph.layout.PARAMS.animation.duration);
-  },
-  ready: function() {
-    this._zoom = d3.behavior.zoom()
-      .on('zoomend', function() {
-        if (this._zoomStartCoords) {
-          // Calculate the total distance dragged during the zoom event.
-          // If it is sufficiently small, then fire an event indicating
-          // that zooming has ended. Otherwise wait to fire the zoom end
-          // event, so that a mouse click registered as part of this zooming
-          // is ignored (as this mouse click was part of a zooming, and should
-          // not be used to indicate an actual click on the graph).
-          var dragDistance = Math.sqrt(
-            Math.pow(this._zoomStartCoords[0] - this._zoomCoords[0], 2) +
-            Math.pow(this._zoomStartCoords[1] - this._zoomCoords[1], 2));
-          if (dragDistance < this._maxZoomDistanceForClick) {
-            this._fireEnableClick();
-          } else {
-            setTimeout(this._fireEnableClick.bind(this), 50);
-          }
-        }
-        this._zoomStartCoords = null;
-      }.bind(this))
-      .on('zoom', function() {
-        // Store the coordinates of the zoom event
-        this._zoomCoords = d3.event.translate;
-
-        // If this is the first zoom event after a zoom-end, then
-        // store the coordinates as the start coordinates as well,
-        // and fire an event to indicate that zooming has started.
-        // This doesn't use the zoomstart event, as d3 sends this
-        // event on mouse-down, even if there has been no dragging
-        // done to translate the graph around.
-        if (!this._zoomStartCoords) {
-          this._zoomStartCoords = this._zoomCoords.slice();
-          this.fire('disable-click');
-        }
-        this._zoomed = true;
-        d3.select(this.$.root).attr('transform',
-                    'translate(' + d3.event.translate + ')' +
-                    'scale(' + d3.event.scale + ')');
-        // Notify the minimap.
-        this.minimap.zoom(d3.event.translate, d3.event.scale);
-      }.bind(this));
-    d3.select(this.$.svg).call(this._zoom)
-      .on('dblclick.zoom', null);
-    d3.select(window).on('resize', function() {
-      // Notify the minimap that the user's window was resized.
-      // The minimap will figure out the new dimensions of the main svg
-      // and will use the existing translate and scale params.
-      this.minimap.zoom();
-    }.bind(this));
-    // Initialize the minimap.
-    this.minimap = this.$.minimap.init(this.$.svg, this.$.root, this._zoom,
-        tf.graph.layout.PARAMS.minimap.size,
-        tf.graph.layout.PARAMS.subscene.meta.labelHeight);
-  },
-  _buildAndFit: function(renderHierarchy) {
-    this._resetState();
-    this._build(renderHierarchy);
-    // Fit to screen after the graph is done animating.
-    setTimeout(this.fit.bind(this), tf.graph.layout.PARAMS.animation.duration);
-  },
-  _updateLabels: function(showLabels) {
-    var mainGraphTitleElement = this.getElementsByClassName('title')[0];
-    var titleStyle = mainGraphTitleElement.style;
-    var auxTitleStyle = this.getElementsByClassName('auxTitle')[0].style;
-    var core = d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-      tf.graph.scene.Class.Scene.CORE)[0][0];
-    // Only show labels if the graph is fully loaded.
-    if (showLabels && core && this.progress && this.progress.value === 100) {
-      var aux =
-        d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-          tf.graph.scene.Class.Scene.INEXTRACT)[0][0] ||
-        d3.select("." + tf.graph.scene.Class.Scene.GROUP + ">." +
-          tf.graph.scene.Class.Scene.OUTEXTRACT)[0][0];
-      var coreX = core.getCTM().e;
-      var auxX = aux ? aux.getCTM().e : null;
-      titleStyle.display = 'inline';
-      titleStyle.left = coreX + 'px';
-      if (auxX !== null && auxX !== coreX) {
-        auxTitleStyle.display = 'inline';
-
-        // Make sure that the aux title is positioned rightwards enough so as to
-        // prevent overlap with the main graph title.
-        auxX = Math.max(
-            coreX + mainGraphTitleElement.getBoundingClientRect().width, auxX);
-
-        auxTitleStyle.left = auxX + 'px';
-      } else {
-        auxTitleStyle.display = 'none';
-      }
-    } else {
-      titleStyle.display='none';
-      auxTitleStyle.display = 'none';
-    }
-  },
-  /**
-    * Called whenever the user changed the 'color by' option in the
-    * UI controls.
-    */
-  _colorByChanged: function() {
-    if (this.renderHierarchy != null) {
-      // We iterate through each svg node and update its state.
-      _.each(this._nodeGroupIndex, function(nodeGroup, nodeName) {
-        this._updateNodeState(nodeName);
-      }, this);
-      // Notify also the minimap.
-      this.minimap.update();
-    }
-  },
-  fit: function() {
-    tf.graph.scene.fit(this.$.svg, this.$.root, this._zoom, function() {
-      this._zoomed = false;
-    }.bind(this));
-  },
-  isNodeSelected: function(n) {
-    return n === this.selectedNode;
-  },
-  isNodeHighlighted: function(n) {
-    return n === this.highlightedNode;
-  },
-  addAnnotationGroup: function(a, d, selection) {
-    var an = a.node.name;
-    this._annotationGroupIndex[an] = this._annotationGroupIndex[an] || {};
-    this._annotationGroupIndex[an][d.node.name] = selection;
-  },
-  getAnnotationGroupsIndex: function(a) {
-    return this._annotationGroupIndex[a];
-  },
-  removeAnnotationGroup: function(a, d) {
-    delete this._annotationGroupIndex[a.node.name][d.node.name];
-  },
-  addNodeGroup: function(n, selection) {
-    this._nodeGroupIndex[n] = selection;
-  },
-  getNodeGroup: function(n) {
-    return this._nodeGroupIndex[n];
-  },
-  removeNodeGroup: function(n) {
-    delete this._nodeGroupIndex[n];
-  },
-  addEdgeGroup: function(n, selection) {
-    this._edgeGroupIndex[e] = selection;
-  },
-  getEdgeGroup: function(e) {
-    return this._edgeGroupIndex[e];
-  },
-  _updateHealthPills: function(nodeNamesToHealthPills, healthPillStepIndex) {
-    tf.graph.scene.addHealthPills(
-        this.$.svg, nodeNamesToHealthPills, healthPillStepIndex);
-  },
-  /**
-   * Update node and annotation node of the given name.
-   * @param  {String} n node name
-   */
-  _updateNodeState: function(n) {
-    var node = this.getNode(n);
-    var nodeGroup = this.getNodeGroup(n);
-
-    if (nodeGroup) {
-      tf.graph.scene.node.stylize(nodeGroup, node, this);
-    }
-
-    var annotationGroupIndex = this.getAnnotationGroupsIndex(n);
-    _.each(annotationGroupIndex, function(aGroup, hostName) {
-      tf.graph.scene.node.stylize(aGroup, node, this,
-          tf.graph.scene.Class.Annotation.NODE);
-    }, this);
-  },
-
-  /**
-   * Handles new node selection. 1) Updates the selected-state of each node,
-   * 2) triggers input tracing.
-   * @param selectedNode {string} The name of the newly selected node.
-   * @param oldSelectedNode {string} The name of the previously selected node.
-   * @private
-   */
-  _selectedNodeChanged: function(selectedNode, oldSelectedNode) {
-    if (selectedNode === oldSelectedNode) {
-      return;
-    }
-
-    if (selectedNode) {
-      this._updateNodeState(selectedNode);
-    }
-    if (oldSelectedNode) {
-      this._updateNodeState(oldSelectedNode);
-    }
-
-    tf.graph.scene.node.traceInputs(this.renderHierarchy);
-
-    if (!selectedNode) {
-      return;
-    }
-
-
-    // Update the minimap to reflect the highlighted (selected) node.
-    this.minimap.update();
-    var node = this.renderHierarchy.hierarchy.node(selectedNode);
-    var nodeParents = [];
-    // Create list of all metanode parents of the selected node.
-    while (node.parentNode != null
-        && node.parentNode.name != tf.graph.ROOT_NAME) {
-      node = node.parentNode;
-      nodeParents.push(node.name);
-    }
-    // Ensure each parent metanode is built and expanded.
-    var topParentNodeToBeExpanded;
-    _.forEachRight(nodeParents, function(parentName) {
-      this.renderHierarchy.buildSubhierarchy(parentName);
-      var renderNode = this.renderHierarchy.getRenderNodeByName(parentName);
-      if (renderNode.node.isGroupNode && !renderNode.expanded) {
-        renderNode.expanded = true;
-        if (!topParentNodeToBeExpanded) {
-          topParentNodeToBeExpanded = renderNode;
-        }
-      }
-    }, this);
-    // If any expansion was needed to display this selected node, then
-    // inform the scene of the top-most expansion.
-    if (topParentNodeToBeExpanded) {
-      this.setNodeExpanded(topParentNodeToBeExpanded);
-      this._zoomed = true;
-    }
-
-    if (tf.graph.scene.panToNode(selectedNode, this.$.svg, this.$.root,
-        this._zoom)) {
-      this._zoomed = true;
-    }
-  },
-  _highlightedNodeChanged: function(highlightedNode, oldHighlightedNode) {
-    if (highlightedNode === oldHighlightedNode) {
-      return;
-    }
-
-    if (highlightedNode) {
-      this._updateNodeState(highlightedNode);
-    }
-    if (oldHighlightedNode) {
-      this._updateNodeState(oldHighlightedNode);
-    }
-  },
-  _onZoomChanged: function() {
-    this._updateLabels(!this._zoomed);
-  },
-  _fireEnableClick: function() {
-    this.fire('enable-click');
-  },
-});
-</script>
-</dom-module>
-<link rel="import" href="../iron-flex-layout/iron-flex-layout.html">
-<dom-module id="tf-graph" assetpath="../tf-graph/">
-<template>
-<style>
-.container {
-  width: 100%;
-  height: 100%;
-  background: white;
-  box-shadow: 0 1px 5px rgba(0,0,0,0.2);
-}
-
-.vertical {
-  width:100%;
-  height:100%;
-  @apply(--layout-vertical);
-}
-
-.auto {
-  @apply(--layout-flex-auto);
-  @apply(--layout-vertical);
-}
-
-h2 {
-  text-align: center;
-}
-
-paper-button {
-  text-transform: none;
-}
-</style>
-<div class="container">
-  <div class="vertical">
-    <template is="dom-if" if="[[title]]">
-      <h2>[[title]]</h2>
-    </template>
-    <tf-graph-scene id="scene" class="auto" render-hierarchy="[[renderHierarchy]]" highlighted-node="[[_getVisible(highlightedNode)]]" selected-node="{{selectedNode}}" color-by="[[colorBy]]" progress="[[progress]]" node-names-to-health-pills="[[nodeNamesToHealthPills]]" health-pill-step-index="{{healthPillStepIndex}}"></tf-graph-scene>
-  </div>
-</div>
-</template>
-</dom-module>
-
-<script>
-Polymer({
-
-  is: 'tf-graph',
-
-  properties: {
-    graphHierarchy: {
-      type: Object,
-      notify: true,
-      observer: '_graphChanged'
-    },
-    basicGraph: Object,
-    stats: Object,
-    devicesForStats: Object,
-    hierarchyParams: Object,
-    progress: {
-      type: Object,
-      notify: true,
-    },
-    title: String,
-    selectedNode: {
-      type: String,
-      notify: true,
-    },
-    highlightedNode: {
-      type: String,
-      notify: true
-    },
-    /** What to color the nodes by (compute time, memory, device etc.) */
-    colorBy: String,
-    colorByParams: {
-      type: Object,
-      notify: true,
-      readOnly: true, // Produces and doesn't consume.
-    },
-    renderHierarchy: {
-      type: Object,
-      readOnly: true,
-      notify: true,
-    },
-    _renderDepth: {
-      type: Number,
-      value: 1
-    },
-    _allowGraphSelect: {
-      type: Boolean,
-      value: true
-    },
-    // A mapping between node name to the tf.graph.scene.HealthPill to render.
-    nodeNamesToHealthPills: Object,
-    // The step of health pills to show throughout the graph.
-    healthPillStepIndex: Number,
-  },
-  observers: [
-    '_statsChanged(stats, devicesForStats)',
-    '_buildRenderHierarchy(graphHierarchy)'
-  ],
-  _statsChanged: function(stats, devicesForStats) {
-    if (this.graphHierarchy) {
-      if (stats && devicesForStats) {
-        tf.graph.joinStatsInfoWithGraph(this.basicGraph, stats, devicesForStats);
-        tf.graph.hierarchy.joinAndAggregateStats(this.graphHierarchy, stats);
-      }
-      // Recompute the rendering information.
-      this._buildRenderHierarchy(this.graphHierarchy);
-    }
-  },
-  _buildRenderHierarchy: function(graphHierarchy) {
-    tf.graph.util.time('new tf.graph.render.Hierarchy', function() {
-      if (graphHierarchy.root.type !== tf.graph.NodeType.META) {
-        // root must be metanode but sometimes Polymer's dom-if has not
-        // remove tf-graph element yet in <tf-node-info>
-        // and thus mistakenly pass non-metanode to this module.
-        return;
-      }
-      var renderGraph = new tf.graph.render.RenderGraphInfo(
-          graphHierarchy, !!this.stats /** displayingStats */);
-      // Producing the 'color by' parameters to be consumed
-      // by the tf-graph-controls panel. It contains information about the
-      // min and max values and their respective colors, as well as list
-      // of devices with their respective colors.
-
-      function getColorParamsFromScale(scale) {
-        return {
-          minValue: scale.domain()[0],
-          maxValue: scale.domain()[1],
-          startColor: scale.range()[0],
-          endColor: scale.range()[1]
-        };
-      }
-
-      this._setColorByParams({
-        compute_time: getColorParamsFromScale(renderGraph.computeTimeScale),
-        memory: getColorParamsFromScale(renderGraph.memoryUsageScale),
-        device: _.map(renderGraph.deviceColorMap.domain(),
-            function(deviceName) {
-          return {
-            device: deviceName,
-            color: renderGraph.deviceColorMap(deviceName)
-          };
-        })
-      });
-      this._setRenderHierarchy(renderGraph);
-      this.async(function() {
-        this.fire("rendered");
-      });
-    }.bind(this));
-  },
-  _getVisible: function(name) {
-    if (!name) {
-      return name;
-    }
-    return this.renderHierarchy.getNearestVisibleAncestor(name);
-  },
-  listeners: {
-    'graph-select': '_graphSelected',
-    'disable-click': '_disableClick',
-    'enable-click': '_enableClick',
-    // Nodes
-    'node-toggle-expand': '_nodeToggleExpand',
-    'node-select': '_nodeSelected',
-    'node-highlight': '_nodeHighlighted',
-    'node-unhighlight': '_nodeUnhighlighted',
-    'node-toggle-extract': '_nodeToggleExtract',
-    'node-toggle-seriesgroup': '_nodeToggleSeriesGroup',
-
-    // Annotations
-
-    /* Note: currently highlighting/selecting annotation node has the same
-      * behavior as highlighting/selecting actual node so we point to the same
-      * set of event listeners.  However, we might redesign this to be a bit
-      * different.
-      */
-    'annotation-select': '_nodeSelected',
-    'annotation-highlight': '_nodeHighlighted',
-    'annotation-unhighlight': '_nodeUnhighlighted',
-  },
-  _graphChanged: function() {
-    // When a new graph is loaded, fire this event so that there is no
-    // info-card being displayed for the previously-loaded graph.
-    this.fire('graph-select');
-  },
-  _graphSelected: function(event) {
-    // Graph selection is not allowed during an active zoom event, as the
-    // click seen during a zoom/pan is part of the zooming and does not
-    // indicate a user desire to click on a specific section of the graph.
-    if (this._allowGraphSelect) {
-      this.set('selectedNode', null);
-    }
-    // Reset this variable as a bug in d3 zoom behavior can cause zoomend
-    // callback not to be called if a right-click happens during a zoom event.
-    this._allowGraphSelect = true;
-  },
-  _disableClick: function(event) {
-    this._allowGraphSelect = false;
-  },
-  _enableClick: function(event) {
-    this._allowGraphSelect = true;
-  },
-  _nodeSelected: function(event) {
-    if (this._allowGraphSelect) {
-      this.set('selectedNode', event.detail.name);
-    }
-    // Reset this variable as a bug in d3 zoom behavior can cause zoomend
-    // callback not to be called if a right-click happens during a zoom event.
-    this._allowGraphSelect = true;
-  },
-  _nodeHighlighted: function(event) {
-    this.set('highlightedNode', event.detail.name);
-  },
-  _nodeUnhighlighted: function(event) {
-    this.set('highlightedNode', null);
-  },
-  _nodeToggleExpand: function(event) {
-    // Immediately select the node that is about to be expanded.
-    this._nodeSelected(event);
-
-    // Compute the sub-hierarchy scene.
-    var nodeName = event.detail.name;
-    var renderNode = this.renderHierarchy.getRenderNodeByName(nodeName);
-    // Op nodes are not expandable.
-    if (renderNode.node.type === tf.graph.NodeType.OP) {
-      return;
-    }
-    this.renderHierarchy.buildSubhierarchy(nodeName);
-    renderNode.expanded = !renderNode.expanded;
-
-    // Expand the node with some delay so that the user can immediately see
-    // the visual effect of selecting that node, before the expansion is
-    // done.
-    this.async(function() {
-      this.querySelector('#scene').setNodeExpanded(renderNode);
-    }, 75);
-  },
-  _nodeToggleExtract: function(event) {
-    // Toggle the include setting of the specified node appropriately.
-    var nodeName = event.detail.name;
-    var renderNode = this.renderHierarchy.getRenderNodeByName(nodeName);
-    if (renderNode.node.include == tf.graph.InclusionType.INCLUDE) {
-      renderNode.node.include = tf.graph.InclusionType.EXCLUDE;
-    } else if (renderNode.node.include == tf.graph.InclusionType.EXCLUDE) {
-      renderNode.node.include = tf.graph.InclusionType.INCLUDE;
-    } else {
-      renderNode.node.include =
-       this.renderHierarchy.isNodeAuxiliary(renderNode)
-          ? tf.graph.InclusionType.INCLUDE : tf.graph.InclusionType.EXCLUDE;
-    }
-
-    // Rebuild the render hierarchy.
-    this._buildRenderHierarchy(this.graphHierarchy);
-  },
-  _nodeToggleSeriesGroup: function(event) {
-    // Toggle the group setting of the specified node appropriately.
-    var nodeName = event.detail.name;
-    tf.graph.toggleNodeSeriesGroup(this.hierarchyParams.seriesMap, nodeName);
-
-    // Rebuild the render hierarchy with the updated series grouping map.
-    this.set('progress', {
-      value: 0,
-      msg: ''
-    });
-    var tracker = tf.graph.util.getTracker(this);
-    var hierarchyTracker = tf.graph.util.getSubtaskTracker(tracker, 100,
-          'Namespace hierarchy');
-    tf.graph.hierarchy.build(this.basicGraph, this.hierarchyParams, hierarchyTracker)
-    .then(function(graphHierarchy) {
-      this.set('graphHierarchy', graphHierarchy);
-      this._buildRenderHierarchy(this.graphHierarchy);
-    }.bind(this));
-  },
-  not: function(x) {
-    return !x;
-  }
-});
-</script>
-<dom-module id="tf-graph-icon" assetpath="../tf-graph/">
-  <style>
-    .faded-rect {
-      fill: url("#rectHatch");
-    }
-
-    .faded-ellipse {
-      fill: url("#ellipseHatch");
-    }
-
-    .faded-rect, .faded-ellipse, .faded-series {
-      stroke:   var(--tb-graph-faded) !important;
-    }
-  </style>
-  <template>
-    <template is="dom-if" if="[[_isType(node, type, 'OP')]]">
-      <template is="dom-if" if="[[_isConst(node, const)]]">
-        <svg height$="[[height]]" preserveAspectRatio="xMinYMid meet" viewBox="0 0 10 10">
-          <circle cx="5" cy="5" r="3" fill$="[[_getFill(_computedFill, 'OP')]]" stroke$="[[_getStroke(_computedFill, 'OP')]]"></circle>
-        </svg>
-      </template>
-      <template is="dom-if" if="[[_isSummary(node, summary)]]">
-        <svg width$="[[height]]" height$="[[height]]" viewBox="0 0 12 12">
-          <use x="0" y="0" xlink:href="#summary-icon"></use>
-        </svg>
-      </template>
-      <template is="dom-if" if="[[_isRegularOp(node, const, summary)]]">
-        <svg height$="[[height]]" preserveAspectRatio="xMinYMid meet" viewBox="0 0 16 8">
-          <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#op-node-stamp" fill$="[[_getFill(_computedFill, 'OP')]]" stroke$="[[_getStroke(_computedFill, 'OP')]]" class$="{{_fadedClass(renderInfo, 'ellipse')}}" x="8" y="4"></use>
-        </svg>
-      </template>
-    </template>
-    <template is="dom-if" if="[[_isType(node, type, 'META')]]">
-      <svg height$="[[height]]" preserveAspectRatio="xMinYMid meet" viewBox="0 0 37 16">
-        <rect x="1" y="1" fill$="[[_getFill(_computedFill, 'META')]]" stroke$="[[_getStroke(_computedFill, 'META')]]" class$="{{_fadedClass(renderInfo, 'rect')}}" stroke-width="2px" height="14" width="35" rx="5" ry="5"></rect>
-      </svg>
-    </template>
-    <template is="dom-if" if="[[_isType(node, type, 'SERIES')]]">
-      <template is="dom-if" if="[[_isVertical(node, vertical)]]">
-        <svg height$="[[height]]" preserveAspectRatio="xMinYMid meet" viewBox="0 0 16 15">
-          <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#op-series-vertical-stamp" fill$="[[_getFill(_computedFill, 'SERIES')]]" stroke$="[[_getStroke(_computedFill, 'SERIES')]]" class$="{{_fadedClass(renderInfo, 'series')}}" x="0" y="2"></use>
-        </svg>
-      </template>
-      <template is="dom-if" if="[[!_isVertical(node, vertical)]]">
-        <svg height$="[[height]]" preserveAspectRatio="xMinYMid meet" viewBox="0 0 24 10">
-          <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#op-series-horizontal-stamp" fill$="[[_getFill(_computedFill, 'SERIES')]]" stroke$="[[_getStroke(_computedFill, 'SERIES')]]" class$="{{_fadedClass(renderInfo, 'series')}}" x="0" y="1"></use>
-        </svg>
-      </template>
-    </template>
-  </template>
-
-  <script>
-    (function() {
-      Polymer({
-        is: 'tf-graph-icon',
-
-        properties: {
-          /**
-           * Node to represent with an icon. Optional, but if specified, its
-           * properties override those defined in the type, vertical, const and
-           * summary properties.
-           * @type {tf.graph.Node}
-           */
-          node: {
-            type: Object,
-            value: null
-          },
-
-          /**
-           * Render node information associated with this node. Optional. If
-           * specified, this is only used when computing the fill of the icon
-           * element.
-           * @type {tf.graph.render.RenderNodeInfo}
-           */
-          renderInfo: {
-            type: Object,
-            value: null
-          },
-
-          /**
-           * String indicating the type of coloring to use for this node, used
-           * only for determining the fill.
-           */
-          colorBy: {
-            type: Object,
-            value: "structural"
-          },
-
-          /**
-           * Function used by structural coloring algorithm to determine which
-           * color to use based on the template ID of the node. Optional.
-           */
-          templateIndex: {
-            type: Function,
-            value: null
-          },
-
-          /** Type of node to draw (ignored if node is set). */
-          type: {
-            type: String,
-            value: null
-          },
-
-          /** Direction for series (ignored for other types). */
-          vertical: {
-            type: Boolean,
-            value: false
-          },
-
-          /** Whether the op is Const (ignored for non-ops). */
-          const: {
-            type: Boolean,
-            value: false
-          },
-
-          /** Whether the op is a Summary (ignored for non-ops). */
-          summary: {
-            type: Boolean,
-            value: false
-          },
-
-          /**
-           * Fill for the icon, optional. If fill is specified and node is not
-           * specified, then this value will override the default for the
-           * element. However, if node is specified, this value will be ignored.
-           */
-          fill: {
-            type: String,
-            value: null
-          },
-
-          /** Height of the SVG element in pixels, used for scaling. */
-          height: {
-            type: Number,
-            value: 20
-          },
-
-          /** The computed fill for the node. **/
-          _computedFill: {
-            type: String,
-            computed:
-              "_getComputedFill(node, renderInfo, colorBy, templateIndex, fill)"
-          }
-
-        },
-
-        /**
-         * Get the computed fill value for the element.
-         */
-        _getComputedFill: function(inputNode, inputRenderInfo, inputColorBy,
-            inputTemplateIndex, inputFill) {
-          if (inputNode && inputRenderInfo &&
-              inputColorBy && inputTemplateIndex) {
-            var ns = tf.graph.scene.node;
-            var colorBy = ns.ColorBy[inputColorBy.toUpperCase()];
-            return ns.getFillForNode(inputTemplateIndex, colorBy,
-                inputRenderInfo, false);
-          }
-          return inputFill;
-        },
-
-        /**
-         * Get the fill value for the element, or if that's not possible, return
-         * the default fill value for the node type.
-         */
-        _getFill: function(inputComputedFill, inputNodeType) {
-          return inputComputedFill || ({
-            OP: tf.graph.render.OpNodeColors.DEFAULT_FILL,
-            META: tf.graph.render.MetanodeColors.DEFAULT_FILL,
-            SERIES: tf.graph.render.SeriesNodeColors.DEFAULT_FILL
-          })[inputNodeType];
-        },
-
-        /**
-         * Get the stroke value for the element, or if that's not possible,
-         * return the default stroke value for the node type.
-         */
-        _getStroke: function(inputComputedFill, inputNodeType) {
-          return inputComputedFill ?
-            tf.graph.scene.node.getStrokeForFill(inputComputedFill) :
-            ({
-              OP: tf.graph.render.OpNodeColors.DEFAULT_STROKE,
-              META: tf.graph.render.MetanodeColors.DEFAULT_STROKE,
-              SERIES: tf.graph.render.SeriesNodeColors.DEFAULT_STROKE
-            })[inputNodeType];
-        },
-
-        /**
-         * Test whether the specified node's type, or the literal type string,
-         * match a particular other type.
-         */
-        _isType: function(inputNode, inputType, targetType) {
-          if (inputNode) {
-            return tf.graph.NodeType[inputNode.type] === targetType;
-          }
-          return inputType === targetType;
-        },
-
-        /**
-         * Test whether the specified node should be represented as a vertical
-         * series. Defaults to the value of the vertical property if node is
-         * not specified.
-         */
-        _isVertical: function(inputNode, inputVertical) {
-          if (inputNode) {
-            return inputNode.hasNonControlEdges;
-          }
-          return !!inputVertical;
-        },
-
-        /**
-         * Test whether the specified node is a constant. Defaults to the value
-         * of the const property if node is not specified.
-         */
-        _isConst: function(inputNode, inputConst) {
-          if (inputNode) {
-            return inputNode.op === 'Const';
-          }
-          return !!inputConst;
-        },
-
-        /**
-         * Test whether the specified node is a summary. Defaults to the value
-         * of the summary property if node is not specified.
-         */
-        _isSummary: function(inputNode, inputSummary) {
-          if (inputNode) {
-            return this._isType(inputNode, null, 'OP') &&
-                inputNode.op.substr(-7) === 'Summary';
-          }
-          return !!inputSummary;
-        },
-
-        /**
-         * Test whether the op node is a regular non-summary non-const node.
-         */
-        _isRegularOp: function(inputNode, inputConst, inputSummary) {
-          return !this._isConst(inputNode, inputConst) &&
-              !this._isSummary(inputNode, inputSummary);
-        },
-
-        _fadedClass: function(itemRenderInfo, shape) {
-          return itemRenderInfo && itemRenderInfo.isFadedOut ? 'faded-' + shape : '';
-        }
-      });
-    })();
-  </script>
-</dom-module>
-<dom-module id="tf-node-list-item" assetpath="../tf-graph-info/">
-  <style>
-  #list-item {
-    width: 100%;
-    color: #565656;
-    font-size: 11pt;
-    font-weight: 400;
-    position: relative;
-    display: inline-block;
-  }
-
-  #list-item:hover {
-    background-color: var(--google-yellow-100);
-  }
-
-  .clickable {
-    cursor: pointer;
-  }
-
-  #list-item span {
-    margin-left: 40px;
-  }
-
-  #list-item.excluded span {
-    color: #999;
-  }
-
-  #list-item span.edge-label {
-    float: right;
-    font-size: 10px;
-    margin-left: 3px;
-    margin-right: 5px;
-  }
-
-  .node-icon {
-    position: absolute;
-    top: 1px;
-    left: 2px;
-  }
-
-  .faded span {
-    color: var(--tb-graph-faded);
-  }
-  </style>
-  <template>
-    <div id="list-item" on-mouseover="_nodeListener" on-mouseout="_nodeListener" on-click="_nodeListener">
-      <div class$="{{_fadedClass(itemRenderInfo)}}">
-        <tf-graph-icon class="node-icon" height="12" color-by="[[colorBy]]" color-by-params="[[colorByParams]]" node="[[itemNode]]" render-info="[[itemRenderInfo]]" template-index="[[templateIndex]]"></tf-graph-icon>
-        <span title$="[[name]]">[[name]]</span>
-        <span class="edge-label">[[edgeLabel]]</span>
-      </div>
-    </div>
-  </template>
-
-  <script>
-    (function() {
-      Polymer({
-        is: 'tf-node-list-item',
-
-        properties: {
-          /**
-           * The Node for the card itself, on which this item is being drawn.
-           * @type {tf.graph.Node}
-           */
-          cardNode: Object,
-          /**
-           * The Node for the item within the card, somehow related to cardNode.
-           * @type {tf.graph.Node}
-           */
-          itemNode: Object,
-          /** The edge label associated with this item. */
-          edgeLabel: String,
-          /**
-           * The render node information for the item node. Used by the graph
-           * icon in determining fill color.
-           */
-          itemRenderInfo: Object,
-          name: String,
-          itemType: {
-            type: String,
-            observer: '_itemTypeChanged'
-          },
-          colorBy: String,
-          colorByParams: Object,
-          templateIndex: Function
-        },
-
-        _itemTypeChanged: function() {
-          if (this.itemType !== 'subnode') {
-            this.$['list-item'].classList.add('clickable');
-          } else {
-            this.$['list-item'].classList.remove('clickable');
-          }
-        },
-
-        _nodeListener: function(event) {
-          // fire node.click/mouseover/mouseout
-          this.fire('node-list-item-' + event.type, {
-            cardNode: this.cardNode.name,
-            nodeName: this.name,
-            type: this.itemType
-          });
-        },
-
-        _fadedClass: function(itemRenderInfo) {
-          return itemRenderInfo && itemRenderInfo.isFadedOut ? 'faded' : '';
-        }
-      });
-    })();
-  </script>
-</dom-module>
-<link rel="import" href="../iron-list/iron-list.html">
-<link rel="import" href="../paper-item/all-imports.html">
-<dom-module id="tf-node-info" assetpath="../tf-graph-info/">
-  <style>
-  .sub-list-group {
-    font-weight: 500;
-    font-size: 12pt;
-    padding-bottom: 8px;
-    width: 100%;
-  }
-
-  .sub-list {
-    max-height: 300px;
-    overflow-y: scroll;
-  }
-
-  .attr-left {
-    float: left;
-    width: 30%;
-    word-wrap: break-word;
-    color: #565656;
-    font-size: 11pt;
-    font-weight: 400;
-  }
-
-  .attr-right {
-    margin-left: 30%;
-    word-wrap: break-word;
-    color: #565656;
-    font-weight: 400;
-  }
-
-  .sub-list-table {
-    display: table;
-    width: 100%;
-  }
-
-  .sub-list-table-row {
-    display: table-row;
-  }
-
-  .sub-list-table-row .sub-list-table-cell:last-child {
-    text-align: right;
-  }
-
-  .sub-list-table-cell {
-    color: #565656;
-    display: table-cell;
-    font-size: 11pt;
-    font-weight: 400;
-    max-width: 200px;
-    padding: 0 4px;
-  }
-
-  paper-item {
-    padding: 0;
-    background: #e9e9e9;
-  }
-
-  paper-item-body[two-line] {
-    min-height: 0;
-    padding: 8px 12px 4px;
-  }
-
-  .expandedInfo {
-    padding: 8px 12px;
-  }
-
-  .controlDeps {
-    padding: 0 0 0 8px;
-  }
-
-  .node-name {
-    white-space: normal;
-    word-wrap: break-word;
-    font-size: 14pt;
-    font-weight: 500;
-  }
-
-  .node-icon {
-    float: right;
-  }
-
-  .subtitle {
-    font-size: 12pt;
-    color: #5e5e5e;
-  }
-
-  .controlLine {
-    font-size: 11pt;
-    font-weight: 400;
-  }
-
-  .toggle-button {
-    float: right;
-    max-height: 20px;
-    max-width: 20px;
-    padding: 0;
-  }
-
-  .control-toggle-button {
-    float: left;
-    max-height: 20px;
-    max-width: 20px;
-    padding: 0;
-  }
-
-  .toggle-include-group {
-    padding-top: 4px;
-  }
-
-  .toggle-include {
-    margin: 5px 6px;
-    text-transform: none;
-    padding: 4px 6px;
-    font-size: 10pt;
-    background-color: #fafafa;
-    color: #666;
-  }
-
-  .toggle-include:hover {
-    background-color: var(--google-yellow-100);
-  }
-
-  .non-control-list-item {
-    padding-left: 10px;
-  }
-  </style>
-  <template>
-    <paper-item>
-      <paper-item-body two-line="">
-        <div>
-          <paper-icon-button icon="{{_getToggleIcon(_expanded)}}" on-click="_toggleExpanded" class="toggle-button">
-          </paper-icon-button>
-          <div class="node-name" id="nodetitle"></div>
-        </div>
-        <div secondary="">
-          <tf-graph-icon class="node-icon" node="[[_node]]" render-info="[[_getRenderInfo(nodeName, renderHierarchy)]]" color-by="[[colorBy]]" template-index="[[_templateIndex]]"></tf-graph-icon>
-          <template is="dom-if" if="{{_node.op}}">
-            <div class="subtitle">
-              Operation:
-              <span>[[_node.op]]</span>
-            </div>
-          </template>
-          <template is="dom-if" if="{{_node.metagraph}}">
-            <div class="subtitle">
-              Subgraph:
-              <span>[[_node.cardinality]]</span> nodes
-            </div>
-          </template>
-        </div>
-      </paper-item-body>
-    </paper-item>
-    <iron-collapse opened="{{_expanded}}">
-    <template is="dom-if" if="{{_expanded}}" restamp="true">
-      <div class="expandedInfo">
-        <div class="sub-list-group attributes">
-          Attributes
-          (<span>[[_attributes.length]]</span>)
-          <iron-list class="sub-list" id="attributesList" items="[[_attributes]]">
-            <template>
-              <div>
-                <div class="attr-left">[[item.key]]</div>
-                <div class="attr-right">[[item.value]]</div>
-              </div>
-            </template>
-          </iron-list>
-        </div>
-
-        <template is="dom-if" if="{{_device}}">
-          <div class="sub-list-group device">
-            <div class="attr-left">Device</div>
-            <div class="attr-right">[[_device]]</div>
-          </div>
-        </template>
-
-        <div class="sub-list-group predecessors">
-          Inputs
-          (<span>[[_totalPredecessors]]</span>)
-          <iron-list class="sub-list" id="inputsList" items="[[_predecessors.regular]]">
-            <template>
-              <tf-node-list-item class="non-control-list-item" card-node="[[_node]]" item-node="[[item.node]]" edge-label="[[item.edgeLabel]]" item-render-info="[[item.renderInfo]]" name="[[item.name]]" item-type="predecessors" color-by="[[colorBy]]" template-index="[[_templateIndex]]">
-              </tf-node-list-item>
-            </template>
-          </iron-list>
-          <template is="dom-if" if="[[_predecessors.control.length]]">
-            <div class="controlDeps">
-              <div class="controlLine">
-                <paper-icon-button icon="{{_getToggleIcon(_openedControlPred)}}" on-click="_toggleControlPred" class="control-toggle-button">
-                </paper-icon-button>
-                Control dependencies
-              </div>
-              <iron-collapse opened="{{_openedControlPred}}" no-animation="">
-                <template is="dom-if" if="{{_openedControlPred}}" restamp="true">
-                  <iron-list class="sub-list" items="[[_predecessors.control]]">
-                    <template>
-                      <tf-node-list-item card-node="[[_node]]" item-node="[[item.node]]" item-render-info="[[item.renderInfo]]" name="[[item.name]]" item-type="predecessors" color-by="[[colorBy]]" template-index="[[_templateIndex]]">
-                      </tf-node-list-item>
-                    </template>
-                  </iron-list>
-                </template>
-              </iron-collapse>
-            </div>
-          </template>
-        </div>
-
-        <div class="sub-list-group successors">
-          Outputs
-          (<span>[[_totalSuccessors]]</span>)
-          <iron-list class="sub-list" id="outputsList" items="[[_successors.regular]]">
-            <template>
-              <tf-node-list-item class="non-control-list-item" card-node="[[_node]]" item-node="[[item.node]]" edge-label="[[item.edgeLabel]]" item-render-info="[[item.renderInfo]]" name="[[item.name]]" item-type="successor" color-by="[[colorBy]]" template-index="[[_templateIndex]]">
-              </tf-node-list-item>
-            </template>
-          </iron-list>
-          <template is="dom-if" if="[[_successors.control.length]]">
-            <div class="controlDeps">
-              <div class="controlLine">
-                <paper-icon-button icon="{{_getToggleIcon(_openedControlSucc)}}" on-click="_toggleControlSucc" class="control-toggle-button">
-                </paper-icon-button>
-                Control dependencies
-              </div>
-              <iron-collapse opened="{{_openedControlSucc}}" no-animation="">
-                <template is="dom-if" if="{{_openedControlSucc}}" restamp="true">
-                  <iron-list class="sub-list" items="[[_successors.control]]">
-                    <template>
-                      <tf-node-list-item card-node="[[_node]]" item-node="[[item.node]]" item-render-info="[[item.renderInfo]]" name="[[item.name]]" item-type="successors" color-by="[[colorBy]]" template-index="[[_templateIndex]]">
-                      </tf-node-list-item>
-                    </template>
-                  </iron-list>
-                </template>
-              </iron-collapse>
-            </div>
-          </template>
-        </div>
-        <template is="dom-if" if="{{_hasDisplayableNodeStats}}">
-          <div class="sub-list-group node-stats">
-            Node Stats
-            <div class="sub-list-table">
-              <template is="dom-if" if="{{_nodeStats.totalBytes}}">
-                <div class="sub-list-table-row">
-                  <div class="sub-list-table-cell">Memory</div>
-                  <div class="sub-list-table-cell">[[_nodeStatsFormattedBytes]]</div>
-                </div>
-              </template>
-              <template is="dom-if" if="{{_nodeStats.totalMicros}}">
-                <div class="sub-list-table-row">
-                  <div class="sub-list-table-cell">Compute Time</div>
-                  <div class="sub-list-table-cell">[[_nodeStatsFormattedComputeTime]]</div>
-                </div>
-              </template>
-              <template is="dom-if" if="{{_nodeStats.outputSize}}">
-                <div class="sub-list-table-row">
-                  <div class="sub-list-table-cell">Tensor Output Sizes</div>
-                  <div class="sub-list-table-cell">
-                    <template is="dom-repeat" items="{{_nodeStatsFormattedOutputSizes}}">
-                      [[item]] <br>
-                    </template>
-                  </div>
-                </div>
-              </template>
-            </div>
-          </div>
-        </template>
-        <div class="toggle-include-group">
-          <paper-button raised="" class="toggle-include" on-click="_toggleInclude">
-            <span>[[_auxButtonText]]</span>
-          </paper-button>
-        </div>
-        <template is="dom-if" if="{{_isInSeries(_node)}}">
-          <div class="toggle-include-group">
-            <paper-button raised="" class="toggle-include" on-click="_toggleGroup">
-              <span>[[_groupButtonText]]</span>
-            </paper-button>
-          </div>
-        </template>
-      </div>
-    </template>
-    </iron-collapse>
-  </template>
-
-  <script>
-    (function() {
-      Polymer({
-        is: 'tf-node-info',
-
-        properties: {
-          nodeName: String,
-          graphHierarchy: Object,
-          renderHierarchy: Object,
-          /** What to color the nodes by (compute time, memory, device etc.) */
-          colorBy: String,
-          _templateIndex: {
-            type: Function,
-            computed: '_getTemplateIndex(graphHierarchy)'
-          },
-          _node: {
-            type: Object,
-            computed: '_getNode(nodeName, graphHierarchy)',
-            observer: '_resetState'
-          },
-          _nodeStats: {
-            type: Object,
-            computed: '_getNodeStats(nodeName, graphHierarchy)',
-            observer: '_resetState'
-          },
-          _hasDisplayableNodeStats: {
-            type: Object,
-            computed: '_getHasDisplayableNodeStats(_nodeStats)',
-          },
-          _nodeStatsFormattedBytes: {
-            type: String,
-            computed: '_getNodeStatsFormattedBytes(_nodeStats)',
-          },
-          _nodeStatsFormattedComputeTime: {
-            type: String,
-            computed: '_getNodeStatsFormattedComputeTime(_nodeStats)',
-          },
-          _nodeStatsFormattedOutputSizes: {
-            type: Array,
-            computed: '_getNodeStatsFormattedOutputSizes(_nodeStats)',
-          },
-          // The enum value of the include property of the selected node.
-          nodeInclude: {
-            type: Number,
-            observer: '_nodeIncludeStateChanged'
-          },
-          _attributes: {
-            type: Array,
-            computed: '_getAttributes(_node)'
-          },
-          _device: {
-            type: String,
-            computed: '_getDevice(_node)'
-          },
-          _successors: {
-            type: Object,
-            computed: '_getSuccessors(_node, graphHierarchy)'
-          },
-          _predecessors: {
-            type: Object,
-            computed: '_getPredecessors(_node, graphHierarchy)'
-          },
-          _subnodes: {
-            type: Array,
-            computed: '_getSubnodes(_node)'
-          },
-          _expanded: {
-            type: Boolean,
-            value: true
-          },
-          _totalPredecessors: {
-            type: Number,
-            computed: '_getTotalPred(_predecessors)'
-          },
-          _totalSuccessors: {
-            type: Number,
-            computed: '_getTotalSucc(_successors)'
-          },
-          _openedControlPred: {
-            type: Boolean,
-            value: false
-          },
-          _openedControlSucc: {
-            type: Boolean,
-            value: false
-          },
-          _auxButtonText: String,
-          _groupButtonText: String
-        },
-        expandNode: function() {
-          this.fire('_node.expand', this.node);
-        },
-        _getTemplateIndex: function(graphHierarchy) {
-          return graphHierarchy.getTemplateIndex();
-        },
-        _getNode: function(nodeName, graphHierarchy) {
-          return graphHierarchy.node(nodeName);
-        },
-        _getNodeStats: function(nodeName, graphHierarchy) {
-          var node = this._getNode(nodeName, graphHierarchy);
-          if (node) {
-            return node.stats;
-          }
-          return null;
-        },
-        _getHasDisplayableNodeStats: function(stats) {
-          return tf.graph.util.hasDisplayableNodeStats(stats);
-        },
-        _getNodeStatsFormattedBytes: function(stats) {
-          if (!stats || !stats.totalBytes) {
-            return;
-          }
-
-          return tf.graph.util.convertUnitsToHumanReadable(
-              stats.totalBytes, tf.graph.util.MEMORY_UNITS);
-        },
-        _getNodeStatsFormattedComputeTime: function(stats) {
-          if (!stats || !stats.totalMicros) {
-            return;
-          }
-
-          return tf.graph.util.convertUnitsToHumanReadable(
-              stats.totalMicros, tf.graph.util.TIME_UNITS);
-        },
-        _getNodeStatsFormattedOutputSizes: function(stats) {
-          if (!stats || !stats.outputSize || !stats.outputSize.length) {
-            return;
-          }
-
-          return _.map(stats.outputSize, function(shape) {
-            if (shape.length === 0) {
-              return "scalar";
-            }
-            return "[" + shape.join(", ") + "]";
-          });
-        },
-        _getPrintableHTMLNodeName: function(nodeName) {
-          // Insert an optional line break before each slash so that
-          // long node names wrap cleanly at path boundaries.
-          return (nodeName || '').replace(/\//g, '<wbr>/');
-        },
-        _getRenderInfo: function(nodeName, renderHierarchy) {
-          return this.renderHierarchy.getOrCreateRenderNodeByName(nodeName);
-        },
-        _getAttributes: function(node) {
-          this.async(this._resizeList.bind(this, "#attributesList"));
-          if (!node || !node.attr) {
-            return [];
-          }
-          var attrs = [];
-          _.each(node.attr, function(entry) {
-            // Unpack the "too large" attributes into separate attributes
-            // in the info card, with values "too large to show".
-            if (entry.key === tf.graph.LARGE_ATTRS_KEY) {
-              attrs = attrs.concat(entry.value.list.s.map(function(key) {
-                return {key: key, value: "Too large to show..."};
-              }));
-            } else {
-              attrs.push({
-                key: entry.key,
-                value: JSON.stringify(entry.value)
-              });
-            }
-          });
-          return attrs;
-        },
-        _getDevice: function(node) {
-          return node ? node.device : null;
-        },
-        _getSuccessors: function(node, hierarchy) {
-          this.async(this._resizeList.bind(this, "#inputsList"));
-          if (!node) {
-            return {regular: [], control: []}
-          }
-          return this._convertEdgeListToEdgeInfoList(
-            hierarchy.getSuccessors(node.name), false, node.isGroupNode);
-        },
-        _getPredecessors: function(node, hierarchy) {
-          this.async(this._resizeList.bind(this, "#outputsList"));
-          if (!node) {
-            return {regular: [], control: []}
-          }
-          return this._convertEdgeListToEdgeInfoList(
-            hierarchy.getPredecessors(node.name), true, node.isGroupNode);
-        },
-        _convertEdgeListToEdgeInfoList: function(list, isPredecessor, isGroupNode) {
-
-          /**
-           * Unpacks the metaedge into a list of base edge information
-           * that can be rendered.
-           */
-          var unpackMetaedge = function(metaedge) {
-            return _.map(metaedge.baseEdgeList, function(baseEdge) {
-              name = isPredecessor ? baseEdge.v : baseEdge.w;
-              return {
-                name: name,
-                node: this._getNode(name, this.graphHierarchy),
-                edgeLabel: tf.graph.scene.edge.getLabelForBaseEdge(baseEdge,
-                    this.renderHierarchy),
-                renderInfo: this._getRenderInfo(name, this.renderHierarchy)
-              };
-            }, this);
-          }.bind(this);
-
-          /**
-           * Converts a list of metaedges to a list of edge information
-           * that can be rendered.
-           */
-          var toEdgeInfoList = function(edges) {
-            var edgeInfoList = [];
-            _.each(edges, function(metaedge) {
-              var name = isPredecessor ? metaedge.v : metaedge.w;
-              // Enumerate all the base edges if the node is an OpNode, or the
-              // metaedge has only 1 edge in it.
-              if (!isGroupNode || metaedge.baseEdgeList.length == 1) {
-                edgeInfoList = edgeInfoList.concat(unpackMetaedge(metaedge));
-              } else {
-                edgeInfoList.push({
-                  name: name,
-                  node: this._getNode(name, this.graphHierarchy),
-                  edgeLabel: tf.graph.scene.edge.getLabelForEdge(metaedge,
-                      this.renderHierarchy),
-                  renderInfo: this._getRenderInfo(name, this.renderHierarchy)
-                });
-              }
-            }, this);
-            return edgeInfoList;
-          }.bind(this);
-
-          return {
-            regular: toEdgeInfoList(list.regular),
-            control: toEdgeInfoList(list.control)
-          };
-        },
-        _getSubnodes: function(node) {
-          return node && node.metagraph ? node.metagraph.nodes() : null;
-        },
-        _getTotalPred: function(predecessors) {
-          return predecessors.regular.length + predecessors.control.length;
-        },
-        _getTotalSucc: function(successors) {
-          return successors.regular.length + successors.control.length;
-        },
-        _toggleControlPred: function() {
-          this._openedControlPred = !this._openedControlPred;
-        },
-        _toggleControlSucc: function() {
-          this._openedControlSucc = !this._openedControlSucc;
-        },
-        _toggleExpanded: function() {
-          this._expanded = !this._expanded;
-        },
-        _getToggleIcon: function(expanded) {
-          return expanded ? "expand-less" : "expand-more";
-        },
-        _resetState: function() {
-          this._openedControlPred = false;
-          this._openedControlSucc = false;
-
-          this.set("_groupButtonText",
-            tf.graph.scene.node.getGroupSettingLabel(this._node));
-
-          if (this._node) {
-            Polymer.dom(this.$.nodetitle).innerHTML =
-              this._getPrintableHTMLNodeName(this._node.name);
-          }
-        },
-        _resizeList: function(selector) {
-          var list = document.querySelector(selector);
-          if (list) {
-            list.fire('iron-resize');
-          }
-        },
-        _toggleInclude: function() {
-          var graphElem = document.querySelector("#graph");
-          graphElem.fire("node-toggle-extract", { name: this.nodeName });
-          var graphBoardElem = document.querySelector("#graphboard");
-          graphBoardElem.fire("node-toggle-extract");
-        },
-        _nodeIncludeStateChanged: function(include, oldInclude) {
-          this.set("_auxButtonText",
-            tf.graph.getIncludeNodeButtonString(include));
-        },
-        _toggleGroup: function() {
-          var graphElem = document.querySelector("#graph");
-          var seriesName = tf.graph.scene.node.getSeriesName(this._node);
-          graphElem.fire("node-toggle-seriesgroup", { name: seriesName });
-        },
-        _isInSeries: function(node) {
-          return tf.graph.scene.node.canBeInSeries(node);
-        }
-      });
-    })();
-  </script>
-</dom-module>
-<dom-module id="tf-graph-info" assetpath="../tf-graph-info/">
-<template>
-<style>
-:host {
-  font-size: 12px;
-  margin: 0;
-  padding: 0;
-  display: block;
-}
-
-h2 {
-  padding: 0;
-  text-align: center;
-  margin: 0;
-}
-
-.health-pill-legend {
-  padding: 15px;
-}
-
-.health-pill-legend h2 {
-  text-align: left;
-}
-
-.health-pill-entry {
-  margin: 10px 10px 10px 0;
-}
-
-.health-pill-entry .color-preview {
-  width: 26px;
-  height: 26px;
-  border-radius: 3px;
-  display: inline-block;
-  margin: 0 10px 0 0;
-}
-
-.health-pill-entry .color-label, .health-pill-entry .tensor-count {
-  color: #777;
-  display: inline-block;
-  height: 26px;
-  font-size: 22px;
-  line-height: 26px;
-  vertical-align: top;
-}
-
-.health-pill-entry .tensor-count {
-  float: right;
-}
-
-#health-pill-step-slider {
-  width: 100%;
-  margin: 0 0 0 -15px;
-  /* 31 comes from adding a padding of 15px from both sides of the paper-slider, subtracting
-   * 1px so that the slider width aligns with the image (the last slider marker takes up 1px),
-   * and adding 2px to account for a border of 1px on both sides of the image. 30 - 1 + 2.
-   * Apparently, the paper-slider lacks a mixin for those padding values. */
-  width: calc(100% + 31px);
-}
-</style>
-<template is="dom-if" if="{{selectedNode}}">
-  <paper-material elevation="1" class="card">
-    <tf-node-info graph-hierarchy="[[graphHierarchy]]" render-hierarchy="[[renderHierarchy]]" flat-graph="[[graph]]" node-name="[[selectedNode]]" node-include="[[selectedNodeInclude]]" highlighted-node="{{highlightedNode}}" color-by="[[colorBy]]">
-    </tf-node-info>
-  </paper-material>
-</template>
-<template is="dom-if" if="[[_healthPillsAvailable(nodeNamesToHealthPills)]]">
-  <paper-material elevation="1" class="card health-pill-legend">
-    <template is="dom-if" if="[[_maxStepIndex]]">
-      <h2>
-        Step of Health Pills: [[_currentStepDisplayValue]]
-      </h2>
-      <paper-slider id="health-pill-step-slider" immediate-value="{{healthPillStepIndex}}" max="[[_maxStepIndex]]" snaps="" step="1" value="{{healthPillStepIndex}}"></paper-slider>
-    </template>
-    <h2>
-      Health Pill
-      <template is="dom-if" if="[[healthPillValuesForSelectedNode]]">
-        Counts for Selected Node
-      </template>
-      <template is="dom-if" if="[[!healthPillValuesForSelectedNode]]">
-        Legend
-      </template>
-    </h2>
-    <template is="dom-repeat" items="[[healthPillEntries]]">
-      <div class="health-pill-entry">
-        <div class="color-preview" style="background:[[item.background_color]]"></div>
-        <div class="color-label">[[item.label]]</div>
-        <div class="tensor-count">
-          [[_computeTensorCountString(healthPillValuesForSelectedNode, index)]]
-        </div>
-      </div>
-    </template>
-  </paper-material>
-</template>
-</template>
-<script>
-(function() {
-  Polymer({
-    is: 'tf-graph-info',
-
-    properties: {
-      title: String,
-      graphHierarchy: Object,
-      graph: Object,
-      renderHierarchy: Object,
-      nodeNamesToHealthPills: Object,
-      healthPillStepIndex: {
-        type: Number,
-        notify: true,
-      },
-      colorBy: String,
-      // Two-ways
-      selectedNode: {
-        type: String,
-        notify: true
-      },
-      highlightedNode: {
-        type: String,
-        notify: true
-      },
-      // The enum value of the include property of the selected node.
-      selectedNodeInclude: {
-        type: Number,
-        notify: true
-      },
-      healthPillEntries: {
-        type: Array,
-        value: tf.graph.scene.healthPillEntries,
-        readOnly: true,
-      },
-      healthPillValuesForSelectedNode: {
-        type: Array,
-        computed: '_computeHealthPillForNode(nodeNamesToHealthPills, healthPillStepIndex, selectedNode)',
-      },
-      _maxStepIndex: {
-        type: Number,
-        computed: '_computeMaxStepIndex(nodeNamesToHealthPills)',
-      },
-      _currentStepDisplayValue: {
-        type: String,
-        computed: '_computeCurrentStepDisplayValue(nodeNamesToHealthPills, healthPillStepIndex)',
-      },
-    },
-    listeners: {
-      'node-list-item-click': '_nodeListItemClicked',
-      'node-list-item-mouseover': '_nodeListItemMouseover',
-      'node-list-item-mouseout': '_nodeListItemMouseout'
-    },
-    _nodeListItemClicked: function(event) {
-      this.selectedNode = event.detail.nodeName;
-    },
-    _nodeListItemMouseover: function(event) {
-      this.highlightedNode = event.detail.nodeName;
-    },
-    _nodeListItemMouseout: function() {
-      this.highlightedNode = null;
-    },
-    _healthPillsAvailable: function(nodeNamesToHealthPills) {
-      let count = 0;
-      for (let nodeName in nodeNamesToHealthPills) {
-        return true;
-      }
-      return false;
-    },
-    _computeTensorCountString: function(healthPillValuesForSelectedNode, valueIndex) {
-      if (!healthPillValuesForSelectedNode) {
-        // No health pill data is available.
-        return '';
-      }
-
-      return healthPillValuesForSelectedNode[valueIndex].toFixed(0);
-    },
-    _computeHealthPillForNode: function(
-        nodeNamesToHealthPills, healthPillStepIndex, selectedNode) {
-      if (!selectedNode) {
-        // No node is selected.
-        return null;
-      }
-
-      const healthPills = nodeNamesToHealthPills[selectedNode];
-      if (!healthPills) {
-        // This node lacks a health pill.
-        return null;
-      }
-
-      const healthPill = healthPills[healthPillStepIndex];
-      if (!healthPill) {
-        // This node lacks a health pill at the current step.
-        return null;
-      }
-
-      // The health pill count values start at 2. Each health pill contains 6 values.
-      return healthPill.value.slice(2, 8);
-    },
-    _computeCurrentStepDisplayValue: function(nodeNamesToHealthPills, healthPillStepIndex) {
-      for (let nodeName in nodeNamesToHealthPills) {
-        // All nodes have the same number of steps stored, so only examine 1 node. We cannot
-        // directly index into the nodeNamesToHealthPills object because we do not have a key.
-        return nodeNamesToHealthPills[nodeName][healthPillStepIndex].step.toFixed(0);
-      }
-
-      // The current step could not be computed.
-      return 0;
-    },
-    _computeMaxStepIndex: function(nodeNamesToHealthPills) {
-      for (let nodeName in nodeNamesToHealthPills) {
-        // All nodes have the same number of steps stored, so only examine 1 node.
-        // The index is 1 less than the count. Tensorboard backend logic guarantees that the length
-        // of the array will be greater than 1.
-        return nodeNamesToHealthPills[nodeName].length - 1;
-      }
-
-      // Return a falsy value. The slider should be hidden.
-      return 0;
-    },
-  });
-})();
-</script>
-</dom-module>
-<link rel="import" href="../paper-progress/paper-progress.html">
-
-
-<dom-module id="tf-graph-board" assetpath="../tf-graph-board/">
-<template>
-<style>
-::host {
-  display: block;
-}
-
-/deep/ .close {
-  position: absolute;
-  cursor: pointer;
-  left: 15px;
-  bottom: 15px;
-}
-
-.container {
-  width: 100%;
-  height: 100%;
-  opacity: 1;
-}
-
-.container.loading {
-  cursor: progress;
-  opacity: 0.1;
-}
-
-.container.loading.error {
-  cursor: auto;
-}
-
-#info {
-  position: absolute;
-  right: 5px;
-  top: 5px;
-  padding: 0px;
-  max-width: 380px;
-  min-width: 320px;
-  background-color: rgba(255,255,255,0.9);
-  @apply(--shadow-elevation-2dp);
-}
-
-#main {
-  width: 100%;
-  height: 100%;
-}
-
-#progress-bar {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  justify-content: center;
-  width: 100%;
-  position: absolute;
-  top: 40px;
-  left: 0;
-  font-size: 13px;
-}
-
-#progress-msg {
-  width: 400px;
-  margin-bottom: 5px;
-}
-
-paper-progress {
-  width: 400px;
-  --paper-progress-height: 6px;
-  --paper-progress-active-color: #f3913e;
-}
-
-.context-menu {
-  position: absolute;
-  display: none;
-  background-color: #e2e2e2;
-  border-radius: 2px;
-  font-size: 14px;
-  min-width: 150px;
-  border: 1px solid #d4d4d4;
-}
-
-/deep/ .context-menu ul {
-  list-style-type: none;
-  margin: 0;
-  padding: 0;
-  cursor: default;
-}
-
-/deep/ .context-menu ul li {
-  padding: 4px 16px;
-}
-
-/deep/ .context-menu ul li:hover {
-  background-color: #f3913e;
-  color: white;
-}
-</style>
-<template is="dom-if" if="[[_isNotComplete(progress)]]">
-  <div id="progress-bar">
-    <div id="progress-msg">[[progress.msg]]</div>
-    <paper-progress value="[[progress.value]]"></paper-progress>
-  </div>
-</template>
-<div class$="[[_getContainerClass(progress)]]">
-  <div id="main">
-    <tf-graph id="graph" graph-hierarchy="{{graphHierarchy}}" basic-graph="[[graph]]" hierarchy-params="[[hierarchyParams]]" render-hierarchy="{{renderHierarchy}}" devices-for-stats="[[devicesForStats]]" stats="[[stats]]" selected-node="{{_selectedNode}}" highlighted-node="{{_highlightedNode}}" color-by="[[colorBy]]" color-by-params="{{colorByParams}}" progress="{{progress}}" node-names-to-health-pills="[[nodeNamesToHealthPills]]" health-pill-step-index="[[healthPillStepIndex]]"></tf-graph>
-  </div>
-  <div id="info">
-    <tf-graph-info id="graph-info" title="selected" graph-hierarchy="[[graphHierarchy]]" render-hierarchy="[[renderHierarchy]]" graph="[[graph]]" selected-node="{{_selectedNode}}" selected-node-include="{{_selectedNodeInclude}}" highlighted-node="{{_highlightedNode}}" color-by="[[colorBy]]" color-by-params="[[colorByParams]]" node-names-to-health-pills="[[nodeNamesToHealthPills]]" health-pill-step-index="{{healthPillStepIndex}}"></tf-graph-info>
-  </div>
-  <div class="context-menu"></div>
-</div>
-</template>
-</dom-module>
-
-<script>
-Polymer({
-  is: 'tf-graph-board',
-  properties: {
-    // Public API.
-    graphHierarchy: Object,
-    graph: Object,
-    stats: Object,
-    /**
-     * @type {value: number, msg: string}
-     *
-     * A number between 0 and 100 denoting the % of progress
-     * for the progress bar and the displayed message.
-     */
-    progress: Object,
-    colorBy: String,
-    colorByParams: {
-      type: Object,
-      notify: true
-    },
-    renderHierarchy: {
-      type: Object,
-      notify: true
-    },
-    // A mapping between node name to the tf.graph.scene.HealthPill to render.
-    nodeNamesToHealthPills: Object,
-    // The step of health pills to show throughout the graph.
-    healthPillStepIndex: Number,
-    // Private API: Data routing between child components.
-    _selectedNode: String,
-    // The enum value of the include property of the selected node.
-    _selectedNodeInclude: Number,
-    _highlightedNode: String
-  },
-  listeners: {
-    'node-toggle-extract': '_nodeToggleExtract'
-  },
-  observers: [
-    '_updateNodeInclude(_selectedNode)'
-  ],
-  /** True if the progress is not complete yet (< 100 %). */
-  _isNotComplete: function(progress) {
-    return progress.value < 100;
-  },
-  _getContainerClass: function(progress) {
-    var result = 'container';
-    if (progress.error) {
-      result += ' error';
-    }
-    if (this._isNotComplete(progress)) {
-      result += ' loading';
-    }
-    return result;
-  },
-  _updateNodeInclude: function(nodeName) {
-    var node = this.graphHierarchy.node(nodeName);
-    this.set("_selectedNodeInclude",
-      node ? node.include : tf.graph.InclusionType.UNSPECIFIED);
-  },
-  _nodeToggleExtract: function() {
-    this._updateNodeInclude(this._selectedNode);
-  }
-});
-</script>
-<link rel="import" href="../paper-radio-group/paper-radio-group.html">
-<link rel="import" href="../paper-tooltip/paper-tooltip.html">
-<dom-module id="tf-graph-controls" assetpath="../tf-graph/">
-<template>
-<style>
-:host {
-  font-size: 12px;
-  color: gray;
-  --paper-font-subhead: {
-    font-size: 14px;
-    color: gray;
-  };
-  --paper-dropdown-menu-icon: {
-    width: 15px;
-    height: 15px;
-  };
-  --paper-dropdown-menu-button: {
-    padding: 0;
-  };
-  --paper-dropdown-menu-input: {
-    padding: 0;
-  };
-  --paper-item-min-height: 30px;
-}
-
-paper-button[raised].keyboard-focus {
-  font-weight: normal;
-}
-
-.run-dropdown {
-  --paper-input-container: {
-    padding: 9px 0 0 25px;
-  };
-}
-
-.color-dropdown {
-  --paper-input-container: {
-    padding: 9px 0 0 13px;
-  };
-}
-
-table {
-  border-collapse: collapse;
-  border-spacing: 0;
-}
-
-table td {
-  padding: 0;
-  margin: 0;
-}
-
-.allcontrols {
-  width: 188px;
-  padding: 0 30px;
-}
-
-.legend-holder {
-  position: absolute;
-  bottom: 0;
-  padding-bottom: 10px;
-}
-
-paper-radio-button {
-  display: block;
-  padding: 5px;
-}
-svg.icon {
-  width: 60px;
-  height: 18px;
-}
-.icon ellipse {
-  rx: 10px;
-  ry: 5px;
-  stroke: #CCC;
-  stroke-width: 1px;
-  fill: #FFFFFF;
-  cy: 10px;
-}
-.icon rect {
-  height: 14px;
-  width: 35px;
-  rx: 5px;
-  ry: 5px;
-  stroke: #CCC;
-  stroke-width: 2px;
-  fill: #D9D9D9;
-}
-.domainValues {
-  margin-bottom: 10px;
-  width: 165px;
-}
-.domainStart {
-  float: left;
-}
-.domainEnd {
-  float: right;
-}
-.colorBox {
-  width: 20px;
-}
-
-.image-icon {
-  width: 24px;
-  height: 24px;
-}
-
-.help-icon {
-  height: 15px;
-  margin: 0;
-  padding: 0;
-}
-
-.gray {
-  color: #666;
-}
-
-.title {
-  font-size: 16px;
-  margin: 8px 5px 8px 0;
-  color: black;
-}
-.title small {
-  font-weight: normal;
-}
-.deviceList {
-  max-height: 200px;
-  overflow-y: auto;
-}
-
-#file {
-  padding: 8px 0;
-}
-
-.color-legend-row {
-  clear: both;
-  height: 20px;
-  margin-top: 5px;
-  position: relative;
-}
-
-.color-legend-row svg {
-  position: absolute;
-  top: -1px;
-  width: 40px;
-}
-
-.color-legend-row span.color-legend-value {
-  margin-left: 60px;
-}
-
-#grey-rect {
-  fill: #eee;
-  stroke: #a6a6a6;
-}
-
-#faded-rect {
-  fill: url("#rectHatch");
-  stroke: var(--tb-graph-faded);
-}
-
-.button-text {
-  text-transform: none;
-  padding: 8px 18px 0 18px;
-  font-size: 14px
-}
-
-.upload-button {
-  width: 165px;
-  height: 25px;
-  text-transform: none;
-  margin-top: 4px;
-}
-
-.iconbutton {
-  padding: 2px;
-  width: 30px;
-  height: 30px;
-  color: var(--paper-orange-500);
-}
-
-.hidden-input {
-  height: 0px;
-  width: 0px;
-  overflow:hidden;
-}
-
-.allcontrols .control-holder {
-  display: flex;
-  clear: both;
-}
-
-.allcontrols .control-holder paper-radio-group {
-  margin-top: 5px;
-}
-
-span.counter {
-  font-size: 13px;
-  color: gray;
-}
-
-.runs paper-item {
-  --paper-item: {
-    white-space: nowrap;
-  }
-}
-
-table.control-holder {
-  border: 0;
-  border-collapse: collapse;
-}
-
-table.tf-graph-controls td.input-element-table-data {
-  padding: 0 0 0 20px;
-}
-</style>
-<svg width="0" height="0">
-  <defs>
-    <g id="legend-rect">
-      <rect x="1" y="1" stroke-width="2px" height="14" width="35" rx="5" ry="5"></rect>
-    </g>
-    <g id="grey-rect">
-       <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#legend-rect"></use>
-     </g>
-     <g id="faded-rect">
-       <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#legend-rect"></use>
-     </g>
-  </defs>
-</svg>
-<div class="allcontrols">
-  <div class="control-holder">
-    <paper-icon-button icon="aspect-ratio" class="iconbutton" on-click="fit" alt="Fit to screen">
-    </paper-icon-button>
-    <paper-button class="button-text" on-click="fit">Fit to screen
-    </paper-button>
-  </div>
-  <div class="control-holder">
-    <paper-icon-button icon="file-download" class="iconbutton" on-click="download" alt="Download PNG">
-    </paper-icon-button>
-    <paper-button class="button-text" on-click="download">Download PNG
-    </paper-button>
-    <a href="#" id="graphdownload" class="title" download="graph.png">
-    </a>
-  </div>
-  <div class="control-holder runs">
-    <div class="title">Run <span class="counter">([[datasets.length]])</span></div>
-    <paper-dropdown-menu no-label-float="" no-animations="" noink="" class="run-dropdown">
-      <paper-menu id="select" class="dropdown-content" selected="{{selectedDataset}}">
-        <template is="dom-repeat" items="[[datasets]]">
-          <paper-item>[[item.name]]</paper-item>
-        </template>
-      </paper-menu>
-    </paper-dropdown-menu>
-  </div>
-  <template is="dom-if" if="[[showSessionRunsDropdown]]">
-    <div class="control-holder">
-      <div class="title">Session runs <span class="counter">([[_numSessionRuns(metadataTags)]])</span></div>
-      <paper-dropdown-menu no-label-float="" no-animations="" noink="" class="run-dropdown">
-        <paper-menu id="select" class="dropdown-content" selected="{{selectedMetadataTag}}">
-          <template is="dom-repeat" items="[[metadataTags]]">
-            <paper-item>[[item.tag]]</paper-item>
-          </template>
-          <paper-item>None</paper-item>
-        </paper-menu>
-      </paper-dropdown-menu>
-    </div>
-  </template>
-  <template is="dom-if" if="[[showUploadButton]]">
-    <div class="control-holder">
-      <div class="title">Upload</div>
-      <paper-button raised="" class="text-button upload-button" on-click="_getFile">Choose File</paper-button>
-      <div class="hidden-input">
-        <input type="file" id="file" name="file" on-change="_updateFileInput">
-      </div>
-    </div>
-  </template>
-  <table class="control-holder">
-    <tbody><tr>
-      <td class="title">Trace inputs</td>
-      <td class="input-element-table-data">
-        <paper-toggle-button id="trace-inputs"></paper-toggle-button>
-      </td>
-    </tr>
-    <template is="dom-if" if="[[healthPillsFeatureEnabled]]">
-      <tr>
-        <td class="title">Show health pills</td>
-        <td class="input-element-table-data">
-          <paper-toggle-button checked="{{healthPillsToggledOn}}"></paper-toggle-button>
-        </td>
-      </tr>
-    </template>
-  </tbody></table>
-  <div class="control-holder">
-    <div class="title">Color</div>
-    <paper-radio-group selected="{{colorBy}}">
-      <paper-radio-button name="structure">Structure</paper-radio-button>
-      <paper-radio-button name="device">Device</paper-radio-button>
-      <template is="dom-if" if="[[_statsNotNull(stats)]]">
-        <paper-radio-button name="compute_time">Compute time</paper-radio-button>
-        <paper-radio-button name="memory">Memory</paper-radio-button>
-      </template>
-    </paper-radio-group>
-  </div>
-  <div>
-    <template is="dom-if" if="[[_isGradientColoring(stats, colorBy)]]">
-      <svg width="140" height="20" style="margin: 0 5px" class="color-text">
-        <defs>
-          <linearGradient id="linearGradient" x1="0%" y1="0%" x2="100%" y2="0%">
-            <stop class="start" offset="0%" stop-color$="[[_currentGradientParams.startColor]]"></stop>
-            <stop class="end" offset="100%" stop-color$="[[_currentGradientParams.endColor]]"></stop>
-          </linearGradient>
-        </defs>
-        <rect x="0" y="0" width="135" height="20" fill="url(#linearGradient)" stroke="black"></rect>
-      </svg>
-      <div class="domainValues color-text">
-        <div class="domainStart">[[_currentGradientParams.minValue]]</div>
-        <div class="domainEnd">[[_currentGradientParams.maxValue]]</div>
-      </div>
-      <br style="clear: both">
-      <div>Devices included in stats:</div>
-      <div class="deviceList">
-        <table>
-        <template is="dom-repeat" items="[[_getDevices(devicesForStats)]]">
-          <tr>
-            <td>
-              <input type="checkbox" value$="[[item.device]]" checked$="[[item.used]]" on-click="_deviceCheckboxClicked">
-            </td>
-            <td>
-              <div>
-                <span>[[item.suffix]]</span>
-                <template is="dom-if" if="[[item.ignoredMsg]]">
-                  <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-                  <paper-tooltip position="right" animation-delay="0">[[item.ignoredMsg]]</paper-tooltip>
-                </template>
-              </div>
-            </td>
-          </tr>
-        </template>
-        </table>
-      </div>
-    </template>
-    <template is="dom-if" if="[[_equals(colorBy, 'structure')]]">
-      <div class="color-text">
-        <div class="color-legend-row">
-          <div style="position: absolute;">
-            colors
-          </div>
-          <span class="color-legend-value">same substructure</span>
-        </div>
-        <div class="color-legend-row">
-          <svg>
-            <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#grey-rect" x="0" y="0"></use>
-          </svg>
-          <span class="color-legend-value">unique substructure</span>
-        </div>
-      </div>
-    </template>
-    <template is="dom-if" if="[[_equals(colorBy, 'device')]]">
-      <div class="color-text">
-        <div class="deviceList">
-          <table>
-          <template is="dom-repeat" items="[[colorByParams.device]]">
-            <tr>
-              <td style$="[[_getBackgroundColor(item.color)]]">
-                <div class="colorBox"></div>
-              </td>
-              <td>
-                <div>[[item.device]]</div>
-              </td>
-            </tr>
-          </template>
-          </table>
-        </div>
-        <br>
-        <div class="color-legend-row">
-          <svg>
-            <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#grey-rect" x="0" y="0"></use>
-          </svg>
-          <span class="color-legend-value">unknown device</span>
-        </div>
-      </div>
-    </template>
-    <template is="dom-if" if="[[_statsNotNull(stats)]]">
-      <div class="color-legend-row">
-        <svg>
-          <use xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#faded-rect" x="0" y="0"></use>
-        </svg>
-        <span class="color-legend-value">unused substructure</span>
-      </div>
-    </template>
-  </div>
-
-  <template is="dom-if" if="[[!_isGradientColoring(stats, colorBy)]]">
-    <div class="legend-holder">
-      <table>
-        <tbody><tr>
-          <td><div class="title">Graph</div></td>
-          <td>(* = expandable)</td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon">
-              <rect transform="translate(3, 1)" height="14" width="35" rx="5" ry="5"></rect>
-            </svg>
-          </td>
-          <td>Namespace<span class="gray">*</span></td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" preserveAspectRatio="xMinYMid meet" viewBox="0 0 10 10">
-              <use xlink:href="#op-node-stamp" fill="white" stroke="#ccc" x="9.5" y="6"></use>
-            </svg>
-          </td>
-          <td>OpNode</td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px" preserveAspectRatio="xMinYMid meet" viewBox="0 0 12 12">
-              <use xlink:href="#op-series-horizontal-stamp" fill="white" stroke="#ccc" x="2" y="2"></use>
-            </svg>
-          </td>
-          <td>Unconnected series<span class="gray">*</span></td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px" preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
-              <use xlink:href="#op-series-vertical-stamp" fill="white" stroke="#ccc" x="2" y="2"></use>
-            </svg>
-          </td>
-          <td>Connected series<span class="gray">*</span></td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon">
-              <circle fill="white" stroke="#848484" cx="10" cy="10" r="5"></circle>
-            </svg>
-          </td>
-          <td>Constant</td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="image-icon" viewBox="0 0 12 12" width="24" height="24">
-              <use x="0" y="0" class="image-icon" xlink:href="#summary-icon"></use>
-            </svg>
-          </td>
-          <td>Summary</td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px" preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
-              <defs>
-                <marker id="ref-arrowhead-legend" fill="#bbb" markerWidth="10" markerHeight="10" refX="1" refY="5" orient="auto">
-                  <path d="M 10,0 L 0,5 L 10,10 C 7,7 7,3 10,0"></path>
-                </marker>
-              </defs>
-              <path stroke="#bbb" d="M2 9 l 23 0" stroke-linecap="round"></path>
-            </svg>
-          </td>
-          <td>Dataflow edge</td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px" preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
-              <path stroke="#bbb" d="M2 9 l 23 0" stroke-linecap="round" stroke-dasharray="2, 2"></path>
-            </svg>
-          </td>
-          <td>Control dependency edge</td>
-        </tr>
-        <tr>
-          <td>
-            <svg class="icon" height="15px" preserveAspectRatio="xMinYMid meet" viewBox="0 0 15 15">
-              <path marker-start="url(#ref-arrowhead-legend)" stroke="#bbb" d="M2 9 l 23 0" stroke-linecap="round"></path>
-            </svg>
-          </td>
-          <td>Reference edge</td>
-        </tr>
-      </tbody></table>
-    </div>
-  </template>
-  </div>
-</template>
-<script>
-(function() { // Private scope.
-/**
- * Stats from device names that match these regexes will be excluded by default.
- * The user can still turn on a device by selecting the checkbox in the device list.
- * See b/29089982 for context.
- */
-var DEVICE_NAMES_EXCLUDE = [
-  {
-    regex: /gpu:[0-9]+$/,
-    msg: 'Excluded by default since this is a CPU thread setting up GPU kernels.'
-  }
-];
-
-Polymer({
-  is: 'tf-graph-controls',
-  properties: {
-    // Public API.
-    stats: {
-      value: null,
-      type: Object,
-      observer: '_statsChanged'
-    },
-    devicesForStats: {
-      value: null,
-      type: Object,
-      notify: true,
-      readonly: true,
-    },
-    colorBy: {
-      type: String,
-      value: 'structure',
-      notify: true,
-      readonly: true
-    },
-    colorByParams: Object,
-    datasets: {
-      type: Array,
-      observer: '_datasetsChanged'
-    },
-    renderHierarchy: {
-      type: Object,
-      notify: true,
-    },
-    metadataTags: {
-      type: Array,
-      computed: '_getMetadataTags(selectedDataset, datasets)'
-    },
-    selectedDataset: {
-      type: Number,
-      notify: true,
-      value: 0,
-      observer: '_selectedDatasetChanged'
-    },
-    selectedFile: {
-      type: Object,
-      notify: true
-    },
-    selectedMetadataTag: {
-      type: Number,
-      notify: true,
-      value: -1
-    },
-    _currentGradientParams: {
-      type: Object,
-      computed: '_getCurrentGradientParams(colorByParams, colorBy)'
-    },
-    showSessionRunsDropdown: {
-      type: Boolean,
-      value: true
-    },
-    showUploadButton: {
-      type: Boolean,
-      value: true
-    },
-    // This stores whether the feature for showing health pills is enabled in the first place.
-    healthPillsFeatureEnabled: Boolean,
-    // This stores whether to show health pills. Only relevant if healthPillsFeatureEnabled. The
-    // user can toggle this value.
-    healthPillsToggledOn: {
-      type: Boolean,
-      notify: true,
-    },
-  },
-  listeners: {
-    'trace-inputs.change': '_traceInputToggleChanged'
-  },
-  _traceInputToggleChanged: function(event) {
-    // Flip the state of the trace inputs flag.
-    this.renderHierarchy.traceInputs = event.target.active;
-    tf.graph.scene.node.traceInputs(this.renderHierarchy);
-  },
-  _statsNotNull: function(stats) {
-    return stats != null;
-  },
-  _statsChanged: function(stats) {
-    if (stats == null) {
-      return;
-    }
-    var devicesForStats = {};
-    var devices = _.each(stats.dev_stats, function(d) {
-      // Avoid device names that are ignored by default.
-      var exclude = _.some(DEVICE_NAMES_EXCLUDE, function(rule) {
-        return rule.regex.test(d.device);
-      });
-      if (!exclude) {
-        devicesForStats[d.device] = true;
-      }
-    });
-    this.set('devicesForStats', devicesForStats);
-  },
-  _getDevices: function(devicesForStats) {
-    var devices = _.map(this.stats.dev_stats, function(d) {
-      return d.device;
-    });
-    // Devices names can be long so we remove the longest common prefix
-    // before showing the devices in a list.
-    var suffixes = tf.graph.util.removeCommonPrefix(devices);
-    return _.map(devices, function(device, i) {
-      var ignoredMsg = null;
-      _.each(DEVICE_NAMES_EXCLUDE, function(rule) {
-        if (rule.regex.test(device)) {
-          ignoredMsg = rule.msg;
-        }
-      });
-      return {
-        device: device,
-        suffix: suffixes[i],
-        used: devicesForStats[device],
-        ignoredMsg: ignoredMsg
-      };
-    });
-  },
-  _deviceCheckboxClicked: function(checkbox) {
-    // Update the device map.
-    var devicesForStats = _.extend({}, this.devicesForStats);
-    var device = checkbox.target.value;
-    if (checkbox.target.checked) {
-      devicesForStats[device] = true;
-    } else {
-      delete devicesForStats[device];
-    }
-    this.set('devicesForStats', devicesForStats);
-  },
-  _numSessionRuns: function(metadataTags) {
-    return metadataTags != null ? metadataTags.length : 0;
-  },
-  _getBackgroundColor: function(color) {
-    return 'background-color:' + color;
-  },
-  fit: function() {
-    document.querySelector('#scene').fit();
-  },
-  _isGradientColoring: function(stats, colorBy) {
-    return ["compute_time", "memory"].indexOf(colorBy) !== -1
-        && stats != null;
-  },
-  _equals: function(a, b) {
-    return a === b;
-  },
-  _getCurrentGradientParams: function(colorByParams, colorBy) {
-    if (!this._isGradientColoring(this.stats, colorBy)) {
-      return;
-    }
-    var params = colorByParams[colorBy];
-    var minValue = params.minValue;
-    var maxValue = params.maxValue;
-    if (colorBy === 'memory') {
-      minValue = tf.graph.util.convertUnitsToHumanReadable(
-          minValue, tf.graph.util.MEMORY_UNITS);
-      maxValue = tf.graph.util.convertUnitsToHumanReadable(
-          maxValue, tf.graph.util.MEMORY_UNITS);
-    } else if (colorBy === 'compute_time') {
-      minValue = tf.graph.util.convertUnitsToHumanReadable(
-          minValue, tf.graph.util.TIME_UNITS);
-      maxValue = tf.graph.util.convertUnitsToHumanReadable(
-          maxValue, tf.graph.util.TIME_UNITS);
-    }
-    return {
-      minValue: minValue,
-      maxValue: maxValue,
-      startColor: params.startColor,
-      endColor: params.endColor
-    };
-  },
-  download: function() {
-    this.$.graphdownload.click();
-  },
-  _updateFileInput: function(e) {
-    var file = e.target.files[0];
-    if (!file) {
-      return;
-    }
-    this._setDownloadFilename(file.name);
-    this.set('selectedFile', e);
-  },
-  _datasetsChanged: function(newDatasets, oldDatasets) {
-    if (oldDatasets != null || this.selected == null) {
-      // Select the first dataset by default.
-      this.set('selectedDataset', 0);
-      this._setDownloadFilename(this.datasets[this.selectedDataset].path);
-    }
-  },
-  _getMetadataTags: function(selectedDataset, datasets) {
-    return this.datasets[selectedDataset].runMetadata;
-  },
-  _selectedDatasetChanged: function(newDataset, oldDataset) {
-    if (this.datasets) {
-      this.set('selectedMetadataTag', -1);
-      this.set('colorBy', 'structure');
-      this.$['trace-inputs'].active = false; // Set trace input to off-state.
-      this._setDownloadFilename(this.datasets[newDataset].path);
-    }
-  },
-  _getFile: function() {
-    this.$$("#file").click();
-  },
-  _setDownloadFilename: function(graphPath) {
-    // Strip off everything before the last "/" and strip off the file
-    // extension in order to get the name of the PNG for the graph.
-    var dotIndex = graphPath.lastIndexOf('.');
-    if (dotIndex) {
-      graphPath = graphPath.substring(0, dotIndex);
-    }
-    var slashIndex = graphPath.lastIndexOf('/');
-    if (slashIndex) {
-      graphPath = graphPath.substring(slashIndex + 1);
-    }
-    this.$.graphdownload.setAttribute('download', graphPath + '.png');
-  }
-});
-})(); // Closing private scope.
-</script>
-</dom-module>
-
-<dom-module id="tf-graph-dashboard" assetpath="../tf-graph-dashboard/">
-<template>
-<tf-no-data-warning data-type="graph" show-warning="[[_datasetsEmpty(_datasets)]]"></tf-no-data-warning>
-<template is="dom-if" if="[[!_datasetsEmpty(_datasets)]]">
-<tf-dashboard-layout>
-<div class="sidebar">
-  <tf-graph-controls id="controls" devices-for-stats="{{_devicesForStats}}" color-by-params="[[_colorByParams]]" stats="[[_stats]]" color-by="{{_colorBy}}" datasets="[[_datasets]]" render-hierarchy="[[_renderHierarchy]]" selected-dataset="{{_selectedDataset}}" selected-file="{{_selectedFile}}" selected-metadata-tag="{{_selectedMetadataTag}}" health-pills-feature-enabled="[[debuggerDataEnabled]]" health-pills-toggled-on="{{healthPillsToggledOn}}"></tf-graph-controls>
-  <tf-graph-loader id="loader" datasets="[[_datasets]]" selected-dataset="[[_selectedDataset]]" selected-metadata-tag="[[_selectedMetadataTag]]" selected-file="[[_selectedFile]]" out-graph-hierarchy="{{_graphHierarchy}}" out-graph="{{_graph}}" out-stats="{{_stats}}" progress="{{_progress}}" out-hierarchy-params="{{_hierarchyParams}}"></tf-graph-loader>
-</div>
-<div class="center">
-    <tf-graph-board id="graphboard" devices-for-stats="[[_devicesForStats]]" color-by="[[_colorBy]]" color-by-params="{{_colorByParams}}" graph-hierarchy="[[_graphHierarchy]]" graph="[[_graph]]" hierarchy-params="[[_hierarchyParams]]" progress="[[_progress]]" node-names-to-health-pills="[[_nodeNamesToHealthPills]]" health-pill-step-index="[[_healthPillStepIndex]]" render-hierarchy="{{_renderHierarchy}}" stats="[[_stats]]"></tf-graph-board>
-</div>
-</tf-dashboard-layout>
-</template>
-<style>
-
-:host /deep/ {
-  font-family: 'Roboto', sans-serif;
-}
-
-.center {
-  position: relative;
-  height: 100%;
-}
-
-</style>
-</template>
-</dom-module>
-
-<script>
-(function() {
-Polymer({
-  is: 'tf-graph-dashboard',
-  behaviors: [
-    TF.Dashboard.ReloadBehavior("tf-graph-dashboard"),
-    TF.Backend.Behavior,
-  ],
-  properties: {
-    _datasets: Object,
-    _renderHierarchy: {type: Object, observer: '_renderHierarchyChanged'},
-    backend: {type: Object, observer: '_backendChanged'},
-    debuggerDataEnabled: Boolean,
-    healthPillsToggledOn: {type: Boolean, value: true, observer: '_healthPillsToggledOnChanged'},
-    // Maps the names of nodes to an array of health pills (HealthPillDatums).
-    _nodeNamesToHealthPills: {
-      type: Object,
-      value: {},
-    },
-    _healthPillStepIndex: Number,
-    runs: Array
-  },
-  listeners: {
-    'node-toggle-expand': '_handleNodeToggleExpand',
-  },
-  reload: function() {
-    if (!this.debuggerDataEnabled ||
-        !this.healthPillsToggledOn ||
-        !this._renderHierarchy ||
-        this._datasetsEmpty(this._datasets)) {
-      // Do not load debugger data if the feature is disabled, if the user toggled off the feature,
-      // or if the graph itself has not loaded yet. We need the graph to load so that we know which
-      // nodes to request health pills for.
-      return;
-    }
-
-    // Request debugger data on graph reloads, but do not re-request the graph itself. The graph
-    // would not change across reloads.
-    this._requestHealthPills();
-  },
-  _backendChanged: function(backend) {
-    Promise.all([backend.graphRuns(), backend.runMetadataRuns()])
-      .then(function(result) {
-        var runsWithGraph = result[0].sort(VZ.Sorting.compareTagNames);
-        var runToMetadata = result[1];
-        var datasets = _.map(runsWithGraph, function(runName) {
-          return {
-            name: runName,
-            path: backend.router.graph(
-                runName, tf.graph.LIMIT_ATTR_SIZE, tf.graph.LARGE_ATTRS_KEY),
-            runMetadata: runToMetadata[runName] ? _.map(
-              runToMetadata[runName].sort(VZ.Sorting.compareTagNames), function(tag) {
-                return {
-                  tag: tag,
-                  path: backend.router.runMetadata(tag, runName)
-                };
-              }, this) : []
-          };
-        }, this);
-        this.set('_datasets', datasets);
-      }.bind(this));
-  },
-  _requestHealthPills: function() {
-    this.backend.healthPills(this._renderHierarchy.getNamesOfRenderedOps()).then(function(result) {
-      if (!this.healthPillsToggledOn) {
-        // The user has opted to hide health pills via the toggle button.
-        return;
-      }
-
-      // Set the index for which step to show for the health pills. By default, show the last step.
-      // A precondition we assume (that Tensorboard's reservoir sampling guarantees) is that all
-      // node names should be mapped to the same number of steps.
-      for (let nodeName in result) {
-        this.set('_healthPillStepIndex', result[nodeName].length - 1);
-        break;
-      }
-
-      this.set('_nodeNamesToHealthPills', result);
-    }.bind(this));
-  },
-  _datasetsEmpty: function(datasets) {
-    return !datasets || !datasets.length;
-  },
-  _renderHierarchyChanged: function(renderHierarchy) {
-    // Reload any data on the graph when the render hierarchy (which determines which nodes are
-    // rendered) changes.
-    this.reload();
-  },
-  _handleNodeToggleExpand: function() {
-    // Nodes were toggled. We may need to request health pills for more nodes.
-    this._requestHealthPills();
-  },
-  _healthPillsToggledOnChanged: function(healthPillsToggledOn) {
-    if (healthPillsToggledOn) {
-      // Load health pills.
-      this.reload();
-    } else {
-      // Remove all health pills by setting an empty mapping.
-      this.set('_nodeNamesToHealthPills', {});
-    }
-  },
-});
-})();
-</script>
-<dom-module id="vz-projector-styles" assetpath="../vz-projector/">
-<template>
-<style>
-:host {
-  --paper-input-container-label: {
-    font-size: 14px;
-  };
-  --paper-input-container-input: {
-    font-size: 14px;
-  };
-  /* TODO: Figure out why this doesn't work */
-  --paper-dropdown-menu-input: {
-    font-size: 14px;
-  };
-}
-
-paper-button {
-  background: #e3e3e3;
-  margin-left: 0;
-  text-transform: none;
-}
-
-paper-dropdown-menu paper-item {
-  font-size: 13px;
-}
-
-paper-tooltip {
-  max-width: 200px;
-  --paper-tooltip: {
-    font-size: 12px;
-  };
-}
-
-paper-checkbox {
-  --paper-checkbox-checked-color: #880E4F;
-}
-
-paper-toggle-button {
-  --paper-toggle-button-checked-bar-color:  #880E4F;
-  --paper-toggle-button-checked-button-color:  #880E4F;
-  --paper-toggle-button-checked-ink-color: #880E4F;
-}
-
-paper-icon-button {
-  border-radius: 50%;
-}
-
-paper-icon-button[active] {
-  color: white;
-  background-color: #880E4F;
-}
-
-.slider {
-  display: flex;
-  align-items: center;
-  margin-bottom: 10px;
-  justify-content: space-between;
-}
-
-.slider span {
-  width: 35px;
-  text-align: right;
-}
-
-.slider label {
-  align-items: center;
-  display: flex;
-}
-
-.help-icon {
-  height: 15px;
-  left: 2px;
-  min-width: 15px;
-  min-height: 15px;
-  margin: 0;
-  padding: 0;
-  top: -2px;
-  width: 15px;
-}
-
-.ink-panel {
-  display: flex;
-  flex-direction: column;
-  font-size: 14px;
-}
-
-.ink-panel h4 {
-  border-bottom: 1px solid #ddd;
-  font-size: 14px;
-  font-weight: 500;
-  margin: 0;
-  margin-bottom: 10px;
-  padding-bottom: 5px;
-}
-
-.ink-panel-header {
-  border-bottom: 1px solid rgba(0, 0, 0, 0.1);
-  border-top: 1px solid rgba(0, 0, 0, 0.1);
-  height: 50px;
-}
-
-.ink-panel-content {
-  display: none;
-  height: 100%;
-}
-
-.ink-panel-content.active {
-  display: block;
-}
-
-.ink-panel-content h3 {
-  font-weight: 500;
-  font-size: 14px;
-  margin-top: 20px;
-  margin-bottom: 5px;
-  text-transform: uppercase;
-}
-
-.ink-panel-header h3 {
-  font-weight: 500;
-  font-size: 14px;
-  margin: 0;
-  padding: 0 24px;
-  text-transform: uppercase;
-}
-
-
-/* - Tabs */
-.ink-tab-group {
-  align-items: center;
-  box-sizing: border-box;
-  display: flex;
-  height: 100%;
-  justify-content: space-around;
-}
-
-.ink-tab-group .projection-tab {
-  color: rgba(0, 0, 0, 0.5);
-  cursor: pointer;
-  font-weight: 300;
-  line-height: 49px;
-  padding: 0 12px;
-  text-align: center;
-  text-transform: uppercase;
-}
-
-.ink-tab-group .projection-tab:hover {
-  color: black;
-}
-
-.ink-tab-group .projection-tab.active {
-  border-bottom: 2px solid black;
-  color: black;
-  font-weight: 500;
-}
-
-h4 {
-  margin: 30px 0 10px 0;
-}
-
-.dismiss-dialog-note {
-  margin-top: 25px;
-  font-size: 11px;
-  text-align: right;
-}
-</style>
-</template>
-</dom-module>
-<link rel="import" href="../paper-input/paper-textarea.html">
-<dom-module id="vz-projector-bookmark-panel" assetpath="../vz-projector/">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-#title {
-  background-color: #fafafa;
-  color: black;
-  font-weight: 500;
-  left: 0;
-  line-height: 60px;
-  padding-left: 24px;
-  position: absolute;
-  width: 276px;
-}
-#bookmark-container {
-  background-color: #fafafa;
-}
-#icon-container {
-  line-height: 60px;
-  position: absolute;
-  right: 0;
-}
-#header {
-  border-top: 1px solid rgba(0, 0, 0, 0.1);
-  position: relative;
-}
-#panel {
-  background-color: #fafafa;
-  position: relative;
-  overflow-y: scroll;
-  top: 60px;
-  max-height: 50vh;
-}
-
-#save-container {
-  text-align: center;
-}
-
-.state-radio {
-  display: table-cell;
-  vertical-align: middle;
-  padding-top: 16px;
-}
-
-.state-label {
-  display: table-cell;
-  vertical-align: middle;
-  top: 14px;
-}
-
-.state-label-input {
-  width: 194px;
-}
-
-.state-clear {
-  display: table-cell;
-  vertical-align: middle;
-  padding-top: 20px;
-}
-#state-file {
-  display: none;
-}
-#no-bookmarks {
-  padding: 0 24px;
-}
-#action-buttons-container .add-icon-button {
-  background-color: #03a9f4;
-  color: white;
-  margin: 0 4px 4px auto;
-  right: 7px;
-  top: -4px;
-}
-.upload-download-icon-button {
-  padding: 0;
-}
-#action-buttons-container {
-  display: flex;
-  margin-left: 34px;
-  margin-top: 6px;
-}
-.ink-fab {
-  border-radius: 50%;
-  background: white;
-  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.3);
-}
-paper-textarea {
-  --paper-input-container-input: {
-    font-size: 12px;
-  }
-  --paper-font-caption: {
-    display: none
-  }
-}
-</style>
-
-
-<div id="bookmark-container">
-  <div id="header">
-    <div id="title">
-      BOOKMARKS ([[savedStates.length]])
-      <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-      <paper-tooltip animation-delay="0" position="top" offset="0">
-        Open this drawer to save a set of views of the projection, including
-        selected points. A file containing the bookmarks can then be saved and
-        later loaded to view them.
-      </paper-tooltip>
-    </div>
-    <div id="icon-container">
-
-      <paper-icon-button id="expand-more" icon="expand-less" on-tap="_expandMore"></paper-icon-button>
-      <paper-icon-button id="expand-less" style="display: none" icon="expand-more" on-tap="_expandLess"></paper-icon-button>
-    </div>
-  </div>
-  <iron-collapse id="panel">
-
-    <div id="state-section">
-      <template is="dom-if" if="[[!savedStates.length]]">
-        <p id="no-bookmarks">
-            No bookmarks yet, upload a bookmarks file or add a new bookmark by clicking the "+" below.
-        </p>
-      </template>
-
-      <template is="dom-repeat" items="{{savedStates}}">
-        <div class="state-row">
-          <div class="state-radio">
-            <template is="dom-if" if="{{item.isSelected}}">
-              <paper-icon-button icon="radio-button-checked"></paper-icon-button>
-            </template>
-            <template is="dom-if" if="{{!item.isSelected}}">
-              <paper-icon-button icon="radio-button-unchecked" data-index$="{{index}}" on-tap="_radioButtonHandler"></paper-icon-button>
-            </template>
-          </div>
-          <div class="state-label">
-            <paper-textarea value="[[item.label]]" class="state-label-input" on-keyup="_labelChange" data-index$="[[index]]" autoresizing="">
-          </paper-textarea></div>
-          <div class="state-clear">
-            <paper-icon-button icon="clear" data-index$="{{index}}" on-tap="_clearButtonHandler"></paper-icon-button>
-          </div>
-        </div>
-      </template>
-
-      <div id="action-buttons-container">
-        <paper-icon-button class="upload-download-icon-button" icon="save" title="Save bookmarks" disabled="[[!hasStates]]" on-tap="_downloadFile"></paper-icon-button>
-        <paper-icon-button class="upload-download-icon-button" icon="file-upload" title="Load bookmarks" on-tap="_uploadFile"></paper-icon-button>
-        <paper-icon-button class="add-icon-button ink-fab" icon="add" title="Add bookmark" on-tap="_addBookmark"></paper-icon-button>
-        <input type="file" id="state-file" name="state-file">
-      </div>
-    </div>
-  </iron-collapse>
-</div>
-
-</template>
-</dom-module>
-<dom-module id="vz-projector-legend" assetpath="../vz-projector/">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-.item {
-  display: flex;
-  align-items: flex-start;
-  margin-bottom: 10px;
-}
-
-.shape {
-  width: 10px;
-  height: 10px;
-  margin-right: 10px;
-  margin-top: 5px;
-  border-radius: 50%;
-}
-
-.label {
-  flex-grow: 1;
-}
-
-.gradient {
-  width: 100%;
-  height: 10px;
-}
-
-.gradient-boundaries {
-  display: flex;
-  justify-content: space-between;
-}
-</style>
-
-<template is="dom-repeat" items="[[renderInfo.items]]">
-  <div class="item">
-    <div class="shape" style="background-color: [[item.color]];"></div>
-    <div class="label">[[item.label]]</div>
-    <div class="info" style="color: [[item.color]];">[[item.count]]</div>
-  </div>
-</template>
-
-<template is="dom-if" if="[[renderInfo.thresholds]]">
-  <svg class="gradient">
-    <defs>
-      <linearGradient id="gradient" x1="0%" y1="100%" x2="100%" y2="100%"></linearGradient>
-    </defs>
-    <rect height="10" style="fill: url(&quot;#gradient&quot;);"></rect>
-  </svg>
-  <div class="gradient-boundaries">
-    <div>[[renderInfo.thresholds.0.value]]</div>
-    <div>[[_getLastThreshold(renderInfo.thresholds)]]</div>
-  </div>
-</template>
-
-</template>
-</dom-module><dom-module id="vz-projector-data-panel" assetpath="../vz-projector/">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-.container {
-  padding: 10px 20px 20px 20px;
-}
-
-input[type=file] {
-  display: none;
-}
-
-.file-name {
-  margin-right: 10px;
-}
-
-.dirs {
-  color: rgba(0, 0, 0, 0.7);
-  font-size: 12px;
-}
-
-.dirs table tr {
-  vertical-align: top;
-}
-
-.dirs table tr td {
-  padding-bottom: 10px;
-}
-
-paper-item {
-  --paper-item-disabled: {
-    border-bottom: 1px solid black;
-    justify-content: center;
-    font-size: 12px;
-    line-height: normal;
-    min-height: 0px;
-  };
-}
-
-.item-details {
-  margin-left: 5px;
-  color: gray;
-  font-size: 12px;
-}
-
-paper-dropdown-menu {
-  width: 100%;
-}
-
-paper-dropdown-menu paper-item {
-  justify-content: space-between;
-}
-
-.title {
-  align-items: center;
-  border-bottom: 1px solid rgba(0, 0, 0, 0.1);
-  color: black;
-  display: flex;
-  font-weight: 500;
-  height: 59px;
-  padding-left: 20px;
-}
-
-#normalize-data-checkbox {
-  margin: 10px 0;
-}
-
-#projector-config-template {
-  --paper-input-container-input: {
-    line-height: 13px;
-    font-family: monospace;
-    font-size: 12px;
-  };
-}
-
-#generate-share-url {
-  padding: 16px;
-  margin-left: 24px;
-}
-
-#projector-share-button-container {
-  margin: 10px 0;
-}
-
-.config-checkbox {
-  display: inline-block;
-  font-size: 11px;
-  margin-left: 10px;
-}
-
-.projector-config-options {
-  margin-top: 12px;
-}
-
-.projector-config-dialog-container {
-  padding: 24px;
-}
-
-.code {
-  background-color: #f7f7f7;
-  display: table;
-  font-family: monospace;
-  margin-top: 7px;
-  padding: 15px;
-}
-
-.delimiter {
-  color: #B71C1C;
-}
-
-.upload-step {
-  display: flex;
-  justify-content: space-between;
-  margin-bottom: 6px;
-}
-
-.upload-step paper-button {
-  margin-left: 30px;
-}
-
-.step-label {
-  color: rgb(38, 180, 226);
-}
-
-.scrollable-container {
-  margin-top: 0;
-  min-width: 400px;
-}
-
-#projectorConfigDialog p {
-  margin: 8px 0 8px;
-}
-
-.data-step {
-  margin-top: 40px;
-}
-
-.data-step-contents {
-  display: table;
-  width: 100%;
-}
-
-.data-step-contents-contents {
-  display: table-cell;
-  margin-top: 6px;
-}
-
-.data-step-contents-upload {
-  display: table-cell;
-  text-align: right;
-  vertical-align: bottom;
-}
-
-#demo-data-buttons-container {
-  display: none;
-}
-
-.colorby-container {
-  margin-bottom: 10px;
-}
-</style>
-<div class="title">DATA</div>
-<div class="container">
-
-  <template is="dom-if" if="[[_hasChoices(runNames)]]">
-    <paper-dropdown-menu no-animations="" label="[[_getNumRunsLabel(runNames)]] found">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedRun}}">
-        <template is="dom-repeat" items="[[runNames]]">
-          <paper-item value="[[item]]" label="[[item]]">
-            [[item]]
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-  </template>
-
-  <template is="dom-if" if="[[tensorNames]]">
-
-    <paper-dropdown-menu no-animations="" label="[[_getNumTensorsLabel(tensorNames)]] found">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedTensor}}">
-        <template is="dom-repeat" items="[[tensorNames]]">
-          <paper-item value="[[item.name]]" label="[[item.name]]">
-            [[item.name]]
-            <span class="item-details">
-              [[item.shape.0]]x[[item.shape.1]]
-            </span>
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-  </template>
-
-  <template is="dom-if" if="[[_hasChoices(labelOptions)]]">
-    <paper-dropdown-menu no-animations="" label="Label by">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedLabelOption}}">
-        <template is="dom-repeat" items="[[labelOptions]]">
-          <paper-item value="[[item]]" label="[[item]]">
-            [[item]]
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-  </template>
-
-
-  <div hidden$="[[!_hasChoices(colorOptions)]]" class="colorby-container">
-    <paper-dropdown-menu id="colorby" no-animations="" label="Color by">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedColorOptionName}}">
-        <template is="dom-repeat" items="[[colorOptions]]">
-          <paper-item class$="[[getSeparatorClass(item.isSeparator)]]" value="[[item.name]]" label="[[item.name]]" disabled="[[item.isSeparator]]">
-            [[item.name]]
-            <span class="item-details">[[item.desc]]</span>
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-    <div hidden$="[[!showForceCategoricalColorsCheckbox]]">
-      <paper-checkbox id="force-categorical-checkbox"></paper-checkbox>
-      Use categorical coloring
-      <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-      <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds="">
-        For metadata fields that have many unique values we use a gradient color map
-        by default. This checkbox allows you to force categorical coloring by a given
-        metadata field.
-      </paper-tooltip>
-    </div>
-    <template dom-if="[[colorLegendRenderInfo]]">
-      <vz-projector-legend render-info="[[colorLegendRenderInfo]]"></vz-projector-legend>
-    </template>
-  </div>
-  <paper-checkbox id="normalize-data-checkbox" checked="{{normalizeData}}">
-    Sphereize data
-    <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-    <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds="">
-      The data is normalized by shifting each point by the centroid and making
-      it unit norm.
-    </paper-tooltip>
-  </paper-checkbox>
-  <p id="demo-data-buttons-container">
-    <span>
-      <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds="">
-        Load data from your computer
-      </paper-tooltip>
-      <paper-button id="upload" class="ink-button" onclick="dataDialog.open()">Load data</paper-button>
-    </span>
-    <span>
-      <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds="">
-        Publish your embedding visualization and data
-      </paper-tooltip>
-      <paper-button id="host-embedding" class="ink-button" onclick="projectorConfigDialog.open()">Publish</paper-button>
-    </span>
-  </p>
-  <div>
-    <paper-dialog id="dataDialog" with-backdrop="">
-      <h2>Load data from your computer</h2>
-      <paper-dialog-scrollable class="scrollable-container">
-        <div class="data-step">
-          <div class="upload-step">
-            <div>
-                <b><span class="step-label">Step 1:</span> Load a TSV file of vectors.</b>
-            </div>
-          </div>
-          <div class="data-step-contents">
-            <div class="data-step-contents-contents">
-              Example of 3 vectors with dimension 4:
-              <div class="code">
-                0.1<span class="delimiter">\t</span>0.2<span class="delimiter">\t</span>0.5<span class="delimiter">\t</span>0.9<br>
-                0.2<span class="delimiter">\t</span>0.1<span class="delimiter">\t</span>5.0<span class="delimiter">\t</span>0.2<br>
-                0.4<span class="delimiter">\t</span>0.1<span class="delimiter">\t</span>7.0<span class="delimiter">\t</span>0.8
-              </div>
-            </div>
-            <div class="data-step-contents-upload">
-              <paper-button id="upload-tensors" title="Choose a TSV tensor file">Choose file</paper-button>
-              <input type="file" id="file" name="file">
-            </div>
-          </div>
-        </div>
-        <div class="data-step">
-          <div class="upload-step">
-            <div>
-                <span class="step-label"><b>Step 2</b> (optional):</span> <b>Load a TSV file of metadata.</b>
-            </div>
-          </div>
-          <div class="data-step-contents">
-            <div class="data-step-contents-contents">
-              Example of 3 data points and 2 columns.<br>
-              <i>Note: If there is more than one column, the first row will be parsed as column labels.</i>
-              <div class="code">
-                <b>Pokémon<span class="delimiter">\t</span>Species</b><br>
-                Wartortle<span class="delimiter">\t</span>Turtle<br>
-                Venusaur<span class="delimiter">\t</span>Seed<br>
-                Charmeleon<span class="delimiter">\t</span>Flame
-              </div>
-            </div>
-            <div class="data-step-contents-upload">
-              <paper-button id="upload-metadata" title="Choose a TSV metadata file" class="ink-button">Choose file</paper-button>
-              <input type="file" id="file-metadata" name="file-metadata">
-            </div>
-          </div>
-        </div>
-      </paper-dialog-scrollable>
-      <div class="dismiss-dialog-note">Click outside to dismiss.</div>
-    </paper-dialog>
-    <paper-dialog id="projectorConfigDialog" with-backdrop="">
-      <h2>Publish your embedding visualization and data</h2>
-      <paper-dialog-scrollable class="scrollable-container">
-        <div>
-          <p>
-            If you'd like to share your visualization with the world, follow these simple steps.
-            See <a target="_blank" href="https://www.tensorflow.org/versions/master/how_tos/embedding_viz/index.md">this tutorial</a> for more.
-          </p>
-          <h4><span class="step-label">Step 1:</span> Make data public</h4>
-          <p>
-            Host tensors, metadata, sprite image, and bookmarks TSV files <i>publicly</i> on the web.
-          </p>
-          <p>
-            One option is using a <a target="_blank" href="https://gist.github.com/">github gist</a>.
-          </p>
-        </div>
-        <div>
-          <h4><span class="step-label">Step 2:</span> Projector config</h4>
-          <div class="projector-config-options">
-            <i>Optional:</i>
-            <div class="config-checkbox">
-              <paper-checkbox id="config-metadata-checkbox" checked="">Metadata</paper-checkbox>
-            </div>
-            <div class="config-checkbox">
-              <paper-checkbox id="config-sprite-checkbox">Sprite</paper-checkbox>
-            </div>
-            <div class="config-checkbox">
-              <paper-checkbox id="config-bookmarks-checkbox">Bookmarks</paper-checkbox>
-            </div>
-          </div>
-        </div>
-        <paper-textarea id="projector-config-template" label="template_projector_config.json"></paper-textarea>
-        <div>
-          <h4><span class="step-label">Step 3:</span> Host projector config</h4>
-          After you have hosted the projector config JSON file you built above, paste the URL to the config below.
-        </div>
-        <paper-input id="projector-config-url" label="Path to projector config"></paper-input>
-        <paper-input id="projector-share-url" label="Your shareable URL" readonly=""></paper-input>
-        <div id="projector-share-button-container">
-          <a target="_blank" id="projector-share-url-link">
-            <paper-button title="Test your shareable URL" class="ink-button">Test your shareable URL</paper-button>
-          </a>
-        </div>
-      </paper-dialog-scrollable>
-      <div class="dismiss-dialog-note">Click outside to dismiss.</div>
-    </paper-dialog>
-  </div>
-  <div class="dirs">
-    <table>
-      <tbody><tr>
-        <td>Checkpoint:</td>
-        <td><span id="checkpoint-file"></span></td>
-      </tr>
-      <tr>
-        <td>Metadata:</td>
-        <td><span id="metadata-file"></span></td>
-      </tr>
-    </tbody></table>
-  </div>
-</div>
-
-</template>
-</dom-module>
-<dom-module id="vz-projector-input" assetpath="../vz-projector/">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-.info {
-  color: rgba(0, 0, 0, 0.5);
-  display: block;
-  font-size: 11px;
-}
-
-.toggle {
-  font-size: 12px;
-  height: 21px;
-  margin: 0px;
-  min-width: 0px;
-  min-height: 0px;
-  padding: 0;
-  width: 17px;
-}
-
-.toggle[active] {
-  background-color: #880E4F;
-  color: white;
-}
-</style>
-
-<paper-input label="[[label]]">
-  <div class="slash" prefix="">/</div>
-  <div class="slash" suffix="">/</div>
-  <div suffix="">
-    <paper-button id="regex" toggles="" class="toggle">.*</paper-button>
-  </div>
-</paper-input>
-<paper-tooltip for="regex" position="bottom" animation-delay="0" fit-to-visible-bounds="">
-  Enable/disable regex mode.
-</paper-tooltip>
-<span class="info">[[message]]</span>
-
-
-</template>
-</dom-module><dom-module id="vz-projector-inspector-panel" assetpath="../vz-projector/">
-<style include="vz-projector-styles"></style>
-<style>
-:host {
-   display: flex;
-   flex-direction: column;
-   /* Account for the bookmark pane at the bottom */
-   height: calc(100% - 55px);
-}
-
-.container {
-  display: block;
-  padding: 10px 20px 0 20px;
-}
-
-.buttons {
-  display: flex;
-  height: 60px;
-}
-
-.button {
-  margin-right: 10px;
-  border: none;
-  border-radius: 7px;
-  font-size: 13px;
-  padding: 10px;
-  background: #e3e3e3;
-}
-
-.button:last-child {
-  margin-right: 0;
-}
-
-.nn {
-  display: flex;
-  flex-direction: column;
-}
-
-.nn > * {
-  padding: 0 20px;
-}
-
-.nn-list {
-  overflow-y: auto;
-}
-
-.nn-list .neighbor {
-  font-size: 12px;
-  margin-bottom: 8px;
-}
-
-.nn-list .label-and-value {
-  display: flex;
-  justify-content: space-between;
-}
-
-.label {
-  overflow: hidden;
-  text-overflow: ellipsis;
-  white-space: nowrap;
-}
-
-.nn-list .value {
-  color: #666;
-  float: right;
-  font-weight: 300;
-  margin-left: 8px;
-}
-
-.nn-list .bar {
-  position: relative;
-  border-top: 1px solid rgba(0, 0, 0, 0.15);
-  margin: 2px 0;
-}
-
-.nn-list .bar .fill {
-  position: absolute;
-  top: -1px;
-  border-top: 1px solid white;
-}
-
-.nn-list .tick {
-  position: absolute;
-  top: 0px;
-  height: 3px;
-  border-left: 1px solid rgba(0, 0, 0, 0.15);
-}
-
-.nn-list .neighbor-link:hover {
-  cursor: pointer;
-}
-
-.search-by {
-  display: flex;
-}
-
-.search-by vz-projector-input {
-  width: 100%;
-}
-
-.search-by paper-dropdown-menu {
-  margin-left: 10px;
-  width: 100px;
-}
-
-.distance .options {
-  float: right;
-}
-
-.options a {
-  color: #727272;
-  font-size: 13px;
-  margin-left: 12px;
-  text-decoration: none;
-}
-
-.options a.selected {
-  color: #009EFE;
-}
-
-.neighbors {
-  margin-bottom: 30px;
-}
-
-.neighbors-options {
-  margin-top: 6px;
-}
-
-.neighbors-options .option-label, .distance .option-label {
-  color: #727272;
-  margin-right: 2px;
-  width: auto;
-}
-
-.num-neighbors-container {
-  display: inline-block;
-}
-
-#nn-slider {
-  margin: 0 -12px 0 10px;
-}
-
-.euclidian {
-  margin-right: 10px;
-}
-
-.matches-list {
-  padding: 0 20px;
-}
-
-.matches-list .row {
-  border-bottom: 1px solid #ddd;
-  cursor: pointer;
-  display: flex;
-  font-size: 12px;
-  margin: 5px 0;
-  padding: 4px 0;
-}
-
-.results {
-  display: flex;
-  flex-direction: column;
-}
-</style>
-<template>
-<div class="container">
-  <div class="buttons">
-    <button class="button reset-filter">Show All Data</button>
-    <button class="button set-filter">Isolate selection</button>
-    <button class="button clear-selection">Clear selection</button>
-  </div>
-  <div class="search-by">
-    <vz-projector-input id="search-box" label="Search"></vz-projector-input>
-    <paper-dropdown-menu no-animations="" label="by">
-      <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{selectedMetadataField}}">
-        <template is="dom-repeat" items="[[metadataFields]]">
-          <paper-item value="[[item]]" label="[[item]]">
-            [[item]]
-          </paper-item>
-        </template>
-      </paper-listbox>
-    </paper-dropdown-menu>
-  </div>
-</div>
-<div class="results">
-  <div class="nn" style="display: none">
-    <div class="neighbors">
-      <div class="neighbors-options">
-        <div class="slider num-nn">
-          <span class="option-label">neighbors</span>
-          <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-          <paper-tooltip position="bottom" animation-delay="0" fit-to-visible-bounds="">
-            The number of neighbors (in the original space) to show when clicking on a point.
-          </paper-tooltip>
-          <paper-slider id="nn-slider" pin="" min="5" max="1000" value="100"></paper-slider>
-          <span class="nn-count"></span>
-        </div>
-      </div>
-      <div class="distance">
-        <span class="option-label">distance</span>
-        <div class="options">
-          <a class="selected cosine" href="javascript:void(0);">COSINE</a>
-          <a class="euclidean" href="javascript:void(0);">EUCLIDIAN</a>
-        </div>
-      </div>
-    </div>
-    <p>Nearest points in the original space:
-    </p><div class="nn-list"></div>
-  </div>
-  <div class="matches-list" style="display: none">
-    <div class="list"></div>
-    <div class="limit-msg">Showing only the first 100 results...</div>
-  </div>
-</div>
-
-</template>
-</dom-module>
-<dom-module id="vz-projector-metadata-card" assetpath="../vz-projector/">
-<template>
-<style>
-#metadata-card {
-  background-color: rgba(255,255,255,0.9);
-  box-shadow: 0 2px 2px 0 rgba(0, 0, 0, 0.14),
-      0 1px 5px 0 rgba(0, 0, 0, 0.12), 0 3px 1px -2px rgba(0, 0, 0, 0.2);
-  width: 280px;
-}
-
-#header {
-  background: #e9e9e9;
-}
-
-#icon-container {
-  position: absolute;
-  right: 0;
-  top: 4px;
-}
-
-#metadata-label {
-  font-weight: 400;
-  font-size: 14px;
-  line-height: 24px;
-  padding: 12px 12px 8px;
-  width: 230px;
-}
-
-#metadata-table {
-  display: table;
-  padding: 8px 12px 4px;
-}
-
-.metadata-row {
-  display: table-row;
-}
-
-.metadata-key {
-  font-weight: bold;
-}
-
-.metadata-key, .metadata-value {
-  display: table-cell;
-  font-size: 12px;
-  padding: 3px 3px;
-}
-</style>
-
-<template is="dom-if" if="[[hasMetadata]]">
-  <div id="metadata-card">
-    <div id="icon-container">
-      <paper-icon-button id="expand-more" style="display: none" icon="expand-more" on-tap="_expandMore"></paper-icon-button>
-      <paper-icon-button id="expand-less" on-tap="_expandLess" icon="expand-less"></paper-icon-button>
-    </div>
-    <div id="header">
-      <div id="metadata-label">[[label]]</div>
-    </div>
-    <iron-collapse id="metadata-container" opened="">
-      <div id="metadata-table">
-        <template is="dom-repeat" items="[[metadata]]">
-          <div class="metadata-row">
-            <div class="metadata-key">[[item.key]]</div>
-            <div class="metadata-value">[[item.value]]</div>
-          </div>
-        </template>
-      </div>
-    </iron-collapse>
-  </div>
-</template>
-</template>
-</dom-module>
-<dom-module id="vz-projector-projections-panel" assetpath="../vz-projector/">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-:host {
-  transition: height 0.2s;
-}
-
-.ink-button, ::shadow .ink-button {
-  border: none;
-  border-radius: 2px;
-  font-size: 13px;
-  padding: 10px;
-  min-width: 100px;
-  flex-shrink: 0;
-  background: #e3e3e3;
-}
-
-.ink-panel-buttons {
-  margin-bottom: 10px;
-}
-
-.two-way-toggle {
-  display: flex;
-  flex-direction: row;
-}
-
-.two-way-toggle span {
-  padding-right: 7px;
-}
-
-.has-border {
-  border: 1px solid rgba(0, 0, 0, 0.1);
-}
-
-.toggle {
-  min-width: 0px;
-  font-size: 12px;
-  width: 17px;
-  min-height: 0px;
-  height: 21px;
-  padding: 0;
-  margin: 0px;
-}
-
-.toggle[active] {
-  background-color: #880E4F;
-  color: white;
-}
-
-.two-columns {
-  display:flex;
-  justify-content: space-between;
-}
-
-.two-columns > :first-child {
-  margin-right: 15px;
-}
-
-.two-columns > div {
-  width: 50%;
-}
-
-.dropdown-item {
-  justify-content: space-between;
-  min-height: 35px;
-}
-
-#z-container {
-  display: flex;
-  align-items: center;
-  width: 50%;
-}
-
-#z-checkbox {
-  margin: 27px 0 0 5px;
-  width: 18px;
-}
-
-#z-dropdown {
-  flex-grow: 1;
-}
-
-.notice {
-  color: #880E4F;
-}
-
-.container {
-  padding: 20px;
-}
-
-.book-icon {
-  height: 20px;
-  color: rgba(0, 0, 0, 0.7);
-}
-
-.item-details {
-  color: gray;
-  font-size: 12px;
-  margin-left: 5px;
-}
-
-.pca-dropdown {
-  width: 100%;
-}
-
-.pca-dropdown paper-listbox {
-  width: 135px;
-}
-
-.dropdown-item.header {
-  border-bottom: 1px solid #aaa;
-  color: #333;
-  font-weight: bold;
-}
-
-#total-variance {
-  color: rgba(0, 0, 0, 0.7);
-}
-</style>
-<div id="main">
-  <div class="ink-panel-header">
-    <div class="ink-tab-group">
-
-      <div data-tab="tsne" id="tsne-tab" class="ink-tab projection-tab">t-SNE</div>
-      <paper-tooltip for="tsne-tab" position="bottom" animation-delay="0" fit-to-visible-bounds="">
-        t-distributed stochastic neighbor embedding
-      </paper-tooltip>
-
-      <div data-tab="pca" id="pca-tab" class="ink-tab projection-tab">PCA</div>
-      <paper-tooltip for="pca-tab" position="bottom" animation-delay="0" fit-to-visible-bounds="">
-        Principal component analysis
-      </paper-tooltip>
-
-      <div data-tab="custom" id="custom-tab" class="ink-tab projection-tab" title="Linear projection of two custom vectors">Custom</div>
-      <paper-tooltip for="custom-tab" position="bottom" animation-delay="0" fit-to-visible-bounds="">
-        Search for two vectors upon which to project all points.
-      </paper-tooltip>
-
-    </div>
-  </div>
-  <div class="container">
-
-    <div data-panel="tsne" class="ink-panel-content">
-      <div class="slider">
-        <label>Dimension</label>
-        <div class="two-way-toggle">
-          <span>2D</span>
-          <paper-toggle-button id="tsne-toggle" checked="{{tSNEis3d}}">3D</paper-toggle-button>
-        </div>
-      </div>
-      <div class="slider tsne-perplexity">
-        <label>
-          Perplexity
-          <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-          <paper-tooltip position="right" animation-delay="0" fit-to-visible-bounds="">
-            The most appropriate perplexity value depends on the density of the
-            data. Loosely speaking, a larger / denser dataset
-            requires a larger perplexity. Typical values for perplexity range
-            between 5 and 50.
-          </paper-tooltip>
-        </label>
-        <paper-slider id="perplexity-slider" pin="" min="2" max="100" value="30"></paper-slider>
-        <span></span>
-      </div>
-      <div class="slider tsne-learning-rate">
-        <label>
-          Learning rate
-          <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-          <paper-tooltip position="right" animation-delay="0" fit-to-visible-bounds="">
-            The ideal learning rate often depends on the size of the data,
-            with smaller datasets requiring smaller learning rates.
-          </paper-tooltip>
-        </label>
-        <paper-slider id="learning-rate-slider" snaps="" min="-3" max="2" step="1" value="1" max-markers="6">
-        </paper-slider>
-        <span></span>
-      </div>
-      <p>
-        <button class="run-tsne ink-button" title="Re-run t-SNE">Re-run</button>
-        <button class="stop-tsne ink-button" title="Stop t-SNE">Stop</button>
-      </p>
-      <p>Iteration: <span class="run-tsne-iter">0</span></p>
-      <p id="tsne-sampling" class="notice">
-        For fast results, the data will be sampled down to [[getTsneSampleSizeText()]] points.
-      </p>
-      <p>
-        <iron-icon icon="book" class="book-icon"></iron-icon>
-        <a target="_blank" href="http://distill.pub/2016/misread-tsne/">
-          How to use t-SNE effectively.
-        </a>
-      </p>
-    </div>
-
-    <div data-panel="pca" class="ink-panel-content">
-      <div class="two-columns">
-        <div>
-          <paper-dropdown-menu class="pca-dropdown" vertical-align="bottom" no-animations="" label="X">
-            <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{pcaX}}">
-              <paper-item disabled="" class="dropdown-item header">
-                  <div>#</div>
-                  <div>Variance (%)</div>
-              </paper-item>
-              <template is="dom-repeat" items="[[pcaComponents]]">
-                <paper-item class="dropdown-item" value="[[item.id]]" label="Component #[[item.componentNumber]]">
-                  <div>[[item.componentNumber]]</div>
-                  <div class="item-details">[[item.percVariance]]</div>
-                </paper-item>
-              </template>
-            </paper-listbox>
-          </paper-dropdown-menu>
-          <paper-dropdown-menu class="pca-dropdown" no-animations="" vertical-align="bottom" label="Z" disabled="[[!hasPcaZ]]" id="z-dropdown">
-            <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{pcaZ}}">
-              <paper-item disabled="" class="dropdown-item header">
-                  <div>#</div>
-                  <div>Variance (%)</div>
-              </paper-item>
-              <template is="dom-repeat" items="[[pcaComponents]]">
-                <paper-item class="dropdown-item" value="[[item.id]]" label="Component #[[item.componentNumber]]">
-                  <div>[[item.componentNumber]]</div>
-                  <div class="item-details">[[item.percVariance]]</div>
-                </paper-item>
-              </template>
-            </paper-listbox>
-          </paper-dropdown-menu>
-        </div>
-        <div>
-          <paper-dropdown-menu class="pca-dropdown" vertical-align="bottom" no-animations="" label="Y">
-            <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{pcaY}}">
-              <paper-item disabled="" class="dropdown-item header">
-                  <div>#</div>
-                  <div>Variance (%)</div>
-              </paper-item>
-              <template is="dom-repeat" items="[[pcaComponents]]">
-                <paper-item class="dropdown-item" value="[[item.id]]" label="Component #[[item.componentNumber]]">
-                  <div>[[item.componentNumber]]</div>
-                  <div class="item-details">[[item.percVariance]]</div>
-                </paper-item>
-              </template>
-            </paper-listbox>
-          </paper-dropdown-menu>
-          <paper-checkbox id="z-checkbox" checked="{{pcaIs3d}}"></paper-checkbox>
-        </div>
-      </div>
-      <p id="pca-sampling" class="notice">
-        PCA is approximate.
-        <paper-icon-button icon="help" class="help-icon"></paper-icon-button>
-      </p>
-      <div id="total-variance">Total variance</div>
-      <paper-tooltip for="pca-sampling" position="top" animation-delay="0" fit-to-visible-bounds="">
-        For fast results, the data was sampled to [[getPcaSampleSizeText()]] points and randomly projected down to [[getPcaSampledDimText()]] dimensions.
-      </paper-tooltip>
-    </div>
-
-    <div data-panel="custom" class="ink-panel-content">
-      <paper-dropdown-menu style="width: 100%" no-animations="" label="Search by">
-        <paper-listbox attr-for-selected="value" class="dropdown-content" selected="{{customSelectedSearchByMetadataOption}}">
-          <template is="dom-repeat" items="[[searchByMetadataOptions]]">
-            <paper-item class="dropdown-item" value="[[item]]" label="[[item]]">
-              [[item]]
-            </paper-item>
-          </template>
-        </paper-listbox>
-      </paper-dropdown-menu>
-      <div class="two-columns">
-        <vz-projector-input id="xLeft" label="Left"></vz-projector-input>
-        <vz-projector-input id="xRight" label="Right"></vz-projector-input>
-      </div>
-      <div class="two-columns">
-        <vz-projector-input id="yUp" label="Up"></vz-projector-input>
-        <vz-projector-input id="yDown" label="Down"></vz-projector-input>
-      </div>
-    </div>
-  </div>
-</div>
-</template>
-</dom-module>
-<link rel="import" href="../paper-listbox/paper-listbox.html">
-<link rel="import" href="../iron-icons/image-icons.html">
-<link rel="import" href="../paper-toast/paper-toast.html">
-<link rel="import" href="../paper-styles/typography.html">
-<link rel="import" href="../paper-dialog-scrollable/paper-dialog-scrollable.html">
-
-<dom-module id="vz-projector" assetpath="../vz-projector/">
-<template>
-<style include="vz-projector-styles"></style>
-<style>
-:host {
-  display: flex;
-  width: 100%;
-  height: 100%;
-}
-
-#container {
-  display: flex;
-  width: 100%;
-  height: 100%;
-  overflow: hidden;
-}
-
-.hidden {
-  display: none !important;
-}
-
-/* Main */
-
-#main {
-  position: relative;
-  flex-grow: 2;
-}
-
-#main .stage {
-  position: relative;
-  flex-grow: 2;
-}
-
-#scatter {
-  position: absolute;
-  top: 0;
-  left: 0;
-  right: 0;
-  bottom: 0;
-}
-
-#selector {
-  display: none;
-  height: 100%;
-  position: absolute;
-  width: 100%;
-}
-
-#left-pane {
-  display: flex;
-  flex-direction: column;
-  justify-content: space-between;
-  min-width: 312px;
-  width: 312px;
-  border-right: 1px solid rgba(0, 0, 0, 0.1);
-  background: #fafafa;
-}
-
-#right-pane {
-  border-left: 1px solid rgba(0, 0, 0, 0.1);
-  background: #fafafa;
-  display: flex;
-  height: 100%;
-  min-width: 300px;
-  width: 300px;
-}
-
-.file-name {
-  margin-right: 5px;
-}
-
-.control input[type=text]:focus {
-  outline: none;
-  border-bottom: 1px solid rgba(0, 0, 0, 1);
-}
-
-.control {
-  display: inline-block;
-  width: 45%;
-  vertical-align: top;
-  margin-right: 10px;
-  overflow-x: hidden;
-}
-
-.control.last {
-  margin-right: 0;
-}
-
-#notification-dialog {
-  width: 400px;
-  padding-bottom: 20px;
-}
-
-#notification-dialog paper-button {
-  background: none;
-  text-transform: uppercase;
-}
-
-#notification-dialog .progress {
-  --paper-spinner-color: #880E4F;
-  --paper-spinner-stroke-width: 2px;
-}
-
-#notify-msgs {
-  text-align: center;
-  display: block;
-}
-
-.notify-msg {
-  font-weight: 500;
-  margin: 0;
-  padding: 0;
-}
-
-.notify-msg.error {
-  text-align: left;
-}
-
-.brush .extent {
-  stroke: #fff;
-  fill-opacity: .125;
-  shape-rendering: crispEdges;
-}
-
-.origin text {
-  font-size: 12px;
-  font-weight: 500;
-}
-
-.origin line {
-  stroke: black;
-  stroke-opacity: 0.2;
-}
-
-/* Ink Framework */
-
-/* - Buttons */
-.ink-button, ::shadow .ink-button {
-  border: none;
-  border-radius: 2px;
-  font-size: 13px;
-  padding: 10px;
-  min-width: 100px;
-  flex-shrink: 0;
-  background: #e3e3e3;
-}
-
-.status-bar-panel {
-  display: flex;
-  align-items: center;
-}
-
-.status-bar-entry {
-  border-left: 1px solid rgba(0, 0, 0, 0.5);
-  margin-left: 5px;
-  padding-left: 5px;
-}
-
-/* - Menubar */
-
-.ink-panel-menubar {
-  align-items: center;
-  position: relative;
-  height: 60px;
-  border-bottom: solid 1px #eee;
-  padding: 0 24px;
-  display: flex;
-}
-
-.ink-panel-menubar .ink-fabs {
-  position: absolute;
-  right: 12px;
-  top: 40px;
-  z-index: 1;
-}
-
-#bookmark-panel {
-  bottom: 0;
-  position: absolute;
-  width: 300px;
-}
-#bookmark-panel-container {
-  bottom: 60px;
-  position: absolute;
-}
-
-.ink-fab {
-  margin-left: 8px;
-  border: 1px solid rgba(0, 0, 0, 0.02);
-  background: white;
-  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.3);
-}
-
-#metadata-card {
-  position: absolute;
-  right: 5px;
-  top: 25px;
-}
-
-#help-3d-icon {
-  position: absolute;
-  top: 20px;
-  left: 20px;
-}
-
-#help3dDialog .main {
-  margin: 0;
-  padding: 20px;
-}
-
-#help3dDialog h3 {
-  margin-top: 20px;
-  margin-bottom: 5px;
-}
-
-#help3dDialog h3:first-child {
-  margin-top: 0;
-}
-
-#data-panel {
-  border-top: 1px solid rgba(0, 0, 0, 0.1);
-  overflow-y: auto;
-}
-
-#toast {
-  display: flex;
-  align-items: center;
-  --paper-toast-color: #eeff41;
-}
-</style>
-<paper-dialog id="notification-dialog" modal="">
-  <h2 id="notification-title"></h2>
-  <paper-dialog-scrollable>
-    <div id="notify-msgs"></div>
-  </paper-dialog-scrollable>
-  <div style="text-align: center;"><paper-spinner-lite active="" class="progress"></paper-spinner-lite></div>
-  <div class="buttons">
-    <paper-button class="close-button" dialog-confirm="" autofocus="">Close</paper-button>
-  </div>
-</paper-dialog>
-<div id="container">
-  <div id="left-pane" class="ink-panel">
-    <vz-projector-data-panel id="data-panel"></vz-projector-data-panel>
-    <vz-projector-projections-panel id="projections-panel"></vz-projector-projections-panel>
-  </div>
-  <div id="main" class="ink-panel">
-    <div class="ink-panel-menubar">
-      <paper-icon-button id="selectMode" alt="Bounding box selection" toggles="" icon="image:photo-size-select-small"></paper-icon-button>
-      <paper-tooltip for="selectMode" position="bottom" animation-delay="0" fit-to-visible-bounds="">Bounding box selection</paper-tooltip>
-
-      <paper-icon-button id="nightDayMode" alt="Enable/disable night mode" toggles="" icon="image:brightness-2"></paper-icon-button>
-      <paper-tooltip for="nightDayMode" position="bottom" animation-delay="0" fit-to-visible-bounds="">Enable/disable night mode</paper-tooltip>
-
-      <paper-icon-button id="labels3DMode" alt="Enable/disable 3D labels mode" toggles="" icon="font-download"></paper-icon-button>
-      <paper-tooltip for="labels3DMode" position="bottom" animation-delay="0" fit-to-visible-bounds="">Enable/disable 3D labels mode</paper-tooltip>
-      <div class="status-bar-panel">
-        <div class="status-bar-entry">Points: <span class="numDataPoints">Loading...</span></div>
-        <div class="status-bar-entry">Dimension: <span class="dim">Loading...</span></div>
-        <div id="status-bar" class="status-bar-entry" style="display: none;"></div>
-      </div>
-      <div class="ink-fabs">
-        <paper-icon-button id="reset-zoom" class="ink-fab" alt="Reset zoom to fit all points" icon="home"></paper-icon-button>
-        <paper-tooltip for="reset-zoom" position="left" animation-delay="0">Reset zoom to fit all points</paper-tooltip>
-      </div>
-    </div>
-    <div class="stage">
-      <div id="scatter">
-        <svg id="selector"></svg>
-      </div>
-      <vz-projector-metadata-card id="metadata-card"></vz-projector-metadata-card>
-      <paper-icon-button raised="" onclick="help3dDialog.open()" icon="help-outline" id="help-3d-icon"></paper-icon-button>
-      <paper-tooltip animation-delay="0" for="help-3d-icon">Help with interaction controls.</paper-tooltip>
-      <paper-dialog id="help3dDialog" with-backdrop="">
-        <div class="main" dialog-confirm="" autofocus="">
-          <h3>3D controls</h3>
-            <b>Rotate</b> Mouse left click.<br>
-            <b>Pan</b> Mouse right click.<br>
-            <b>Zoom</b> Mouse wheel.<br>
-            Holding <b>ctrl</b> reverses the mouse clicks.
-          <h3>2D controls</h3>
-            <b>Pan</b> Mouse left click.<br>
-            <b>Zoom</b> Mouse wheel.
-          <div class="dismiss-dialog-note"> Click anywhere to dismiss.</div>
-        </div>
-      </paper-dialog>
-    </div>
-  </div>
-  <div id="right-pane" class="ink-panel">
-    <div class="ink-panel-content active">
-      <vz-projector-inspector-panel id="inspector-panel"></vz-projector-inspector-panel>
-    </div>
-    <div id="bookmark-panel-container">
-      <vz-projector-bookmark-panel id="bookmark-panel"></vz-projector-bookmark-panel>
-    </div>
-  </div>
-</div>
-<paper-toast id="toast" always-on-top=""></paper-toast>
-
-</template>
-</dom-module>
-<dom-module id="vz-projector-dashboard" assetpath="../vz-projector/">
-<template>
-  <tf-no-data-warning data-type="projector" show-warning="[[dataNotFound]]"></tf-no-data-warning>
-  <template is="dom-if" if="[[!dataNotFound]]">
-    <vz-projector id="projector" route-prefix="[[routePrefix]]" serving-mode="server" page-view-logging="" event-logging=""></vz-projector>
-  </template>
-</template>
-<script>
-(function() {
-Polymer({
-  is: 'vz-projector-dashboard',
-  properties: {
-    dataNotFound: Boolean,
-    routePrefix: String
-  },
-  ready() {
-    var self = this;
-    d3.json(this.routePrefix + '/runs', function(err, runs) {
-      self.dataNotFound = (runs.length === 0);
-    });
-  }
-});
-})();
-</script>
-</dom-module>
-<script>/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-var TF;
-(function (TF) {
-    var TensorBoard;
-    (function (TensorBoard) {
-        TensorBoard.AUTORELOAD_LOCALSTORAGE_KEY = 'TF.TensorBoard.autoReloadEnabled';
-        var getAutoReloadFromLocalStorage = function () {
-            var val = window.localStorage.getItem(TensorBoard.AUTORELOAD_LOCALSTORAGE_KEY);
-            return val === 'true' || val == null; // defaults to true
-        };
-        TensorBoard.AutoReloadBehavior = {
-            properties: {
-                autoReloadEnabled: {
-                    type: Boolean,
-                    observer: '_autoReloadObserver',
-                    value: getAutoReloadFromLocalStorage,
-                },
-                _autoReloadId: {
-                    type: Number,
-                },
-                autoReloadIntervalSecs: {
-                    type: Number,
-                    value: 30,
-                },
-            },
-            detached: function () {
-                window.clearTimeout(this._autoReloadId);
-            },
-            _autoReloadObserver: function (autoReload) {
-                window.localStorage.setItem(TensorBoard.AUTORELOAD_LOCALSTORAGE_KEY, autoReload);
-                if (autoReload) {
-                    var _this = this;
-                    this._autoReloadId = window.setTimeout(this._doAutoReload.bind(this), this.autoReloadIntervalSecs * 1000);
-                }
-                else {
-                    window.clearTimeout(this._autoReloadId);
-                }
-            },
-            _doAutoReload: function () {
-                if (this.reload == null) {
-                    throw new Error('AutoReloadBehavior requires a reload method');
-                }
-                this.reload();
-                this._autoReloadId = window.setTimeout(this._doAutoReload.bind(this), this.autoReloadIntervalSecs * 1000);
-            }
-        };
-    })(TensorBoard = TF.TensorBoard || (TF.TensorBoard = {}));
-})(TF || (TF = {}));
-</script></div><dom-module id="tf-tensorboard">
-  <template>
-    <paper-dialog with-backdrop="" id="settings">
-      <h2>Settings</h2>
-      <paper-checkbox id="auto-reload-checkbox" checked="{{autoReloadEnabled}}">
-        Reload data every <span>[[autoReloadIntervalSecs]]</span>s.
-      </paper-checkbox>
-    </paper-dialog>
-    <paper-header-panel>
-      <paper-toolbar id="toolbar">
-        <div id="toolbar-content">
-          <div class="toolbar-title">TensorBoard</div>
-          <paper-tabs selected="{{modeIndex}}" noink="" class="tabs" id="tabs">
-            <template is="dom-repeat" items="[[tabs]]">
-              <template is="dom-if" if="[[_isTabEnabled(item)]]">
-                <paper-tab data-mode="[[item]]">[[item]]</paper-tab>
-              </template>
-            </template>
-          </paper-tabs>
-          <div class="global-actions">
-            <paper-icon-button icon="refresh" on-tap="reload" disabled$="[[_isReloadDisabled(mode)]]" id="reload-button"></paper-icon-button>
-            <paper-icon-button icon="settings" on-tap="openSettings" id="settings-button"></paper-icon-button>
-            <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/README.md" tabindex="-1">
-              <paper-icon-button icon="help-outline"></paper-icon-button>
-            </a>
-          </div>
-        </div>
-      </paper-toolbar>
-
-      <div id="content" class="fit">
-        <content id="injected-overview"></content>
-
-        <template is="dom-if" if="[[_modeIsScalars(mode)]]">
-          <tf-scalar-dashboard id="scalars" backend="[[_backend]]" router="[[router]]"></tf-scalar-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsImages(mode)]]">
-          <tf-image-dashboard id="images" backend="[[_backend]]"></tf-image-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsAudio(mode)]]">
-          <tf-audio-dashboard id="audio" backend="[[_backend]]"></tf-audio-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsGraphs(mode)]]">
-          <tf-graph-dashboard id="graphs" backend="[[_backend]]" debugger-data-enabled="[[_debuggerDataEnabled]]"></tf-graph-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsDistributions(mode)]]">
-          <tf-distribution-dashboard id="distributions" backend="[[_backend]]"></tf-distribution-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsHistograms(mode)]]">
-          <tf-histogram-dashboard id="histograms" backend="[[_backend]]"></tf-histogram-dashboard>
-        </template>
-
-        <template is="dom-if" if="[[_modeIsEmbeddings(mode)]]">
-          <vz-projector-dashboard id="projector" route-prefix="/data/plugin/projector">
-          </vz-projector-dashboard>
-        </template>
-      </div>
-    </paper-header-panel>
-
-    <style>
-      :host {
-        height: 100%;
-        display: block;
-        background-color: var(--paper-grey-100);
-      }
-
-      #toolbar {
-        background-color: var(--tb-toolbar-background-color, --tb-orange-strong);
-        -webkit-font-smoothing: antialiased;
-      }
-
-      .toolbar-title {
-        font-size: 20px;
-        margin-left: 10px;
-        text-rendering: optimizeLegibility;
-        letter-spacing: -0.025em;
-        font-weight: 500;
-        flex-grow: 2;
-        display: var(--tb-toolbar-title-display, block);
-      }
-
-      .tabs {
-        flex-grow: 1;
-        text-transform: uppercase;
-        height: 100%;
-      }
-
-      paper-tabs {
-        --paper-tabs-selection-bar-color: white;
-      }
-
-      .global-actions {
-        flex-grow: 2;
-        display: inline-flex; /* Ensure that icons stay aligned */
-        justify-content: flex-end;
-        text-align: right;
-        color: white;
-      }
-
-      .global-actions a {
-        color: white;
-      }
-
-      #toolbar-content {
-        width: 100%;
-        height: 100%;
-        display: flex;
-        flex-direction: row;
-        justify-content: space-between;
-        align-items: center;
-      }
-
-      #content {
-        height: 100%;
-      }
-
-      [disabled] {
-        opacity: 0.2;
-        color: white;
-      }
-
-    </style>
-  </template>
-  <script>
-    Polymer({
-      is: "tf-tensorboard",
-      behaviors: [TF.TensorBoard.AutoReloadBehavior],
-      properties: {
-        router: {
-          type: Object,
-          value: function() {
-            return TF.Backend.router();
-          },
-        },
-        _backend: {
-          type: Object,
-          computed: "_makeBackend(router, demoDir)",
-        },
-        _debuggerDataEnabled: {
-          type: Boolean,
-          value: function() {
-            // For now, Tensorboard only shows debugger data if the debugger_data GET param is set
-            // to enabled.
-            let match = window.location.href.match(/[&\?]debugger_data=enabled/);
-            return match && match.length == 1;
-          },
-        },
-        // Which tab is selected (scalars, graph, images etc).
-        mode: {
-          type: String,
-          computed: '_getModeFromIndex(modeIndex)',
-          notify: true,
-        },
-        tabs: {
-          type: Array,
-          readOnly: true,
-          value: TF.Globals.TABS,
-        },
-        // If this is set to a string, TensorBoard will switch to "demo mode"
-        // and attempt to load serialized json data from that directory. You can
-        // generate conformant json using
-        // tensorboard/scripts/serialize_tensorboard.py
-        demoDir: {
-          type: String,
-          value: null,
-        },
-        // Set this to true to store state in URI hash. Should be true for all non-test purposes.
-        useHash: {
-          type: Boolean,
-          value: false,
-        },
-        disabledTabs: String,
-      },
-      _isTabEnabled: function(tab) {
-        if (this.disabledTabs != null &&
-            this.disabledTabs.split(',').indexOf(tab) >= 0) {
-          return false;
-        }
-        return true;
-      },
-      _getModeFromIndex: function(modeIndex) {
-        var mode = this.tabs[modeIndex];
-        TF.URIStorage.setString(TF.URIStorage.TAB, mode);
-        return mode;
-      },
-      _makeBackend: function(router, demoDir) {
-        // use the demoDir if it is set, otherwise use the provided router
-        if (demoDir != null) {
-          router = TF.Backend.router(demoDir, true);
-        }
-        return new TF.Backend.Backend(router);
-      },
-      _isReloadDisabled: function(mode) {
-        return !this._debuggerDataEnabled && this._modeIsGraphs(mode);
-      },
-      _modeIsScalars: function(mode) {
-        return mode === "scalars";
-      },
-      _modeIsImages: function(mode) {
-        return mode === "images";
-      },
-      _modeIsAudio: function(mode) {
-        return mode === "audio";
-      },
-      _modeIsGraphs: function(mode) {
-        return mode === "graphs";
-      },
-      _modeIsEmbeddings: function(mode) {
-        return mode === "embeddings";
-      },
-      _modeIsDistributions: function(mode) {
-        return mode === "distributions";
-      },
-      _modeIsHistograms: function(mode) {
-        return mode === "histograms";
-      },
-      selectedDashboard: function() {
-        var dashboard = this.$$("#" + this.mode);
-        if (dashboard == null) {
-          throw new Error(`Unable to find dashboard for mode: ${this.mode}`);
-        }
-        return dashboard;
-      },
-      ready: function() {
-        TF.Globals.USE_HASH = this.useHash;
-
-        this._getModeFromHash();
-        window.addEventListener('hashchange', function() {
-          this._getModeFromHash();
-        }.bind(this));
-      },
-      _getModeFromHash: function() {
-        var tabName = TF.URIStorage.getString(TF.URIStorage.TAB);
-        var modeIndex = this.tabs.indexOf(tabName);
-        if (modeIndex == -1 && this.modeIndex == null) {
-          // Select the first tab as default.
-          this.set('modeIndex', 0);
-        }
-        if (modeIndex != -1 && modeIndex != this.modeIndex) {
-          this.set('modeIndex', modeIndex);
-        }
-      },
-      reload: function() {
-        if (this._modeIsEmbeddings(this.mode)) {
-          return;
-        }
-        if (!this._debuggerDataEnabled && this._modeIsGraphs(this.mode)) {
-          return;
-        }
-        this.selectedDashboard().reload();
-      },
-      openSettings: function() {
-        this.$.settings.open();
-      },
-    });
-  </script>
-
-  <script>(function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){
-"use strict";
-var AnalyticsLogger = (function () {
-    /**
-     * Constructs an event logger using Google Analytics. It assumes there is a
-     * Google Analytics script added to the page elsewhere. If there is no such
-     * script, the logger acts as a no-op.
-     *
-     * @param pageViewLogging Whether to log page views.
-     * @param eventLogging Whether to log user interaction.
-     */
-    function AnalyticsLogger(pageViewLogging, eventLogging) {
-        if (typeof ga === 'undefined' || ga == null) {
-            this.eventLogging = false;
-            this.pageViewLogging = false;
-            return;
-        }
-        this.eventLogging = eventLogging;
-        this.pageViewLogging = pageViewLogging;
-    }
-    AnalyticsLogger.prototype.logPageView = function (pageTitle) {
-        if (this.pageViewLogging) {
-            // Always send a page view.
-            ga('send', { hitType: 'pageview', page: "/v/" + pageTitle });
-        }
-    };
-    AnalyticsLogger.prototype.logProjectionChanged = function (projection) {
-        if (this.eventLogging) {
-            ga('send', {
-                hitType: 'event',
-                eventCategory: 'Projection',
-                eventAction: 'click',
-                eventLabel: projection
-            });
-        }
-    };
-    AnalyticsLogger.prototype.logWebGLDisabled = function () {
-        if (this.eventLogging) {
-            ga('send', {
-                hitType: 'event',
-                eventCategory: 'Error',
-                eventAction: 'PageLoad',
-                eventLabel: 'WebGL_disabled'
-            });
-        }
-    };
-    return AnalyticsLogger;
-}());
-exports.AnalyticsLogger = AnalyticsLogger;
-
-},{}],2:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-/**
- * This is a fork of the Karpathy's TSNE.js (original license below).
- * This fork implements Barnes-Hut approximation and runs in O(NlogN)
- * time, as opposed to the Karpathy's O(N^2) version.
- *
- * @author smilkov@google.com (Daniel Smilkov)
- */
-/**
- * The MIT License (MIT)
- * Copyright (c) 2015 Andrej Karpathy
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-var sptree_1 = require('./sptree');
-/**
- * Barnes-hut approximation level. Higher means more approximation and faster
- * results. Recommended value mentioned in the paper is 0.8.
- */
-var THETA = 0.8;
-var MIN_POSSIBLE_PROB = 1E-9;
-// Variables used for memorizing the second random number since running
-// gaussRandom() generates two random numbers at the cost of 1 atomic
-// computation. This optimization results in 2X speed-up of the generator.
-var return_v = false;
-var v_val = 0.0;
-/** Returns the square euclidean distance between two vectors. */
-function dist2(a, b) {
-    if (a.length !== b.length) {
-        throw new Error('Vectors a and b must be of same length');
-    }
-    var result = 0;
-    for (var i = 0; i < a.length; ++i) {
-        var diff = a[i] - b[i];
-        result += diff * diff;
-    }
-    return result;
-}
-exports.dist2 = dist2;
-/** Returns the square euclidean distance between two 2D points. */
-function dist2_2D(a, b) {
-    var dX = a[0] - b[0];
-    var dY = a[1] - b[1];
-    return dX * dX + dY * dY;
-}
-exports.dist2_2D = dist2_2D;
-/** Returns the square euclidean distance between two 3D points. */
-function dist2_3D(a, b) {
-    var dX = a[0] - b[0];
-    var dY = a[1] - b[1];
-    var dZ = a[2] - b[2];
-    return dX * dX + dY * dY + dZ * dZ;
-}
-exports.dist2_3D = dist2_3D;
-function gaussRandom(rng) {
-    if (return_v) {
-        return_v = false;
-        return v_val;
-    }
-    var u = 2 * rng() - 1;
-    var v = 2 * rng() - 1;
-    var r = u * u + v * v;
-    if (r === 0 || r > 1) {
-        return gaussRandom(rng);
-    }
-    var c = Math.sqrt(-2 * Math.log(r) / r);
-    v_val = v * c; // cache this for next function call for efficiency
-    return_v = true;
-    return u * c;
-}
-;
-// return random normal number
-function randn(rng, mu, std) {
-    return mu + gaussRandom(rng) * std;
-}
-;
-// utilitity that creates contiguous vector of zeros of size n
-function zeros(n) {
-    return new Float64Array(n);
-}
-;
-// utility that returns a matrix filled with random numbers
-// generated by the provided generator.
-function randnMatrix(n, d, rng) {
-    var nd = n * d;
-    var x = zeros(nd);
-    for (var i = 0; i < nd; ++i) {
-        x[i] = randn(rng, 0.0, 1E-4);
-    }
-    return x;
-}
-;
-// utility that returns a matrix filled with the provided value.
-function arrayofs(n, d, val) {
-    var x = [];
-    for (var i = 0; i < n; ++i) {
-        x.push(d === 3 ? [val, val, val] : [val, val]);
-    }
-    return x;
-}
-;
-// compute (p_{i|j} + p_{j|i})/(2n)
-function nearest2P(nearest, perplexity, tol) {
-    var N = nearest.length;
-    var Htarget = Math.log(perplexity); // target entropy of distribution
-    var P = zeros(N * N); // temporary probability matrix
-    var K = nearest[0].length;
-    var pRow = new Array(K); // pij[].
-    for (var i = 0; i < N; ++i) {
-        var neighbors = nearest[i];
-        var betaMin = -Infinity;
-        var betaMax = Infinity;
-        var beta = 1; // initial value of precision
-        var maxTries = 50;
-        // perform binary search to find a suitable precision beta
-        // so that the entropy of the distribution is appropriate
-        var numTries = 0;
-        while (true) {
-            // compute entropy and kernel row with beta precision
-            var psum = 0.0;
-            for (var k = 0; k < neighbors.length; ++k) {
-                var neighbor = neighbors[k];
-                var pij = (i === neighbor.index) ? 0 : Math.exp(-neighbor.dist * beta);
-                pij = Math.max(pij, MIN_POSSIBLE_PROB);
-                pRow[k] = pij;
-                psum += pij;
-            }
-            // normalize p and compute entropy
-            var Hhere = 0.0;
-            for (var k = 0; k < pRow.length; ++k) {
-                pRow[k] /= psum;
-                var pij = pRow[k];
-                if (pij > 1E-7) {
-                    Hhere -= pij * Math.log(pij);
-                }
-                ;
-            }
-            // adjust beta based on result
-            if (Hhere > Htarget) {
-                // entropy was too high (distribution too diffuse)
-                // so we need to increase the precision for more peaky distribution
-                betaMin = beta; // move up the bounds
-                if (betaMax === Infinity) {
-                    beta = beta * 2;
-                }
-                else {
-                    beta = (beta + betaMax) / 2;
-                }
-            }
-            else {
-                // converse case. make distrubtion less peaky
-                betaMax = beta;
-                if (betaMin === -Infinity) {
-                    beta = beta / 2;
-                }
-                else {
-                    beta = (beta + betaMin) / 2;
-                }
-            }
-            numTries++;
-            // stopping conditions: too many tries or got a good precision
-            if (numTries >= maxTries || Math.abs(Hhere - Htarget) < tol) {
-                break;
-            }
-        }
-        // copy over the final prow to P at row i
-        for (var k = 0; k < pRow.length; ++k) {
-            var pij = pRow[k];
-            var j = neighbors[k].index;
-            P[i * N + j] = pij;
-        }
-    } // end loop over examples i
-    // symmetrize P and normalize it to sum to 1 over all ij
-    var N2 = N * 2;
-    for (var i = 0; i < N; ++i) {
-        for (var j = i + 1; j < N; ++j) {
-            var i_j = i * N + j;
-            var j_i = j * N + i;
-            var value = (P[i_j] + P[j_i]) / N2;
-            P[i_j] = value;
-            P[j_i] = value;
-        }
-    }
-    return P;
-}
-;
-// helper function
-function sign(x) {
-    return x > 0 ? 1 : x < 0 ? -1 : 0;
-}
-function computeForce_2d(force, mult, pointA, pointB) {
-    force[0] += mult * (pointA[0] - pointB[0]);
-    force[1] += mult * (pointA[1] - pointB[1]);
-}
-function computeForce_3d(force, mult, pointA, pointB) {
-    force[0] += mult * (pointA[0] - pointB[0]);
-    force[1] += mult * (pointA[1] - pointB[1]);
-    force[2] += mult * (pointA[2] - pointB[2]);
-}
-var TSNE = (function () {
-    function TSNE(opt) {
-        this.iter = 0;
-        opt = opt || { dim: 2 };
-        this.perplexity = opt.perplexity || 30;
-        this.epsilon = opt.epsilon || 10;
-        this.rng = opt.rng || Math.random;
-        this.dim = opt.dim;
-        if (opt.dim === 2) {
-            this.dist2 = dist2_2D;
-            this.computeForce = computeForce_2d;
-        }
-        else if (opt.dim === 3) {
-            this.dist2 = dist2_3D;
-            this.computeForce = computeForce_3d;
-        }
-        else {
-            throw new Error('Only 2D and 3D is supported');
-        }
-    }
-    // this function takes a fattened distance matrix and creates
-    // matrix P from them.
-    // D is assumed to be provided as an array of size N^2.
-    TSNE.prototype.initDataDist = function (nearest) {
-        var N = nearest.length;
-        this.nearest = nearest;
-        this.P = nearest2P(nearest, this.perplexity, 1E-4);
-        this.N = N;
-        this.initSolution(); // refresh this
-    };
-    // (re)initializes the solution to random
-    TSNE.prototype.initSolution = function () {
-        // generate random solution to t-SNE
-        this.Y = randnMatrix(this.N, this.dim, this.rng); // the solution
-        this.gains = arrayofs(this.N, this.dim, 1.0); // step gains
-        // to accelerate progress in unchanging directions
-        this.ystep = arrayofs(this.N, this.dim, 0.0); // momentum accumulator
-        this.iter = 0;
-    };
-    // return pointer to current solution
-    TSNE.prototype.getSolution = function () { return this.Y; };
-    // perform a single step of optimization to improve the embedding
-    TSNE.prototype.step = function () {
-        this.iter += 1;
-        var N = this.N;
-        var grad = this.costGrad(this.Y); // evaluate gradient
-        // perform gradient step
-        var ymean = this.dim === 3 ? [0, 0, 0] : [0, 0];
-        for (var i = 0; i < N; ++i) {
-            for (var d = 0; d < this.dim; ++d) {
-                var gid = grad[i][d];
-                var sid = this.ystep[i][d];
-                var gainid = this.gains[i][d];
-                // compute gain update
-                var newgain = sign(gid) === sign(sid) ? gainid * 0.8 : gainid + 0.2;
-                if (newgain < 0.01) {
-                    newgain = 0.01; // clamp
-                }
-                this.gains[i][d] = newgain; // store for next turn
-                // compute momentum step direction
-                var momval = this.iter < 250 ? 0.5 : 0.8;
-                var newsid = momval * sid - this.epsilon * newgain * grad[i][d];
-                this.ystep[i][d] = newsid; // remember the step we took
-                // step!
-                var i_d = i * this.dim + d;
-                this.Y[i_d] += newsid;
-                ymean[d] += this.Y[i_d]; // accumulate mean so that we
-            }
-        }
-        // reproject Y to be zero mean
-        for (var i = 0; i < N; ++i) {
-            for (var d = 0; d < this.dim; ++d) {
-                this.Y[i * this.dim + d] -= ymean[d] / N;
-            }
-        }
-    };
-    // return cost and gradient, given an arrangement
-    TSNE.prototype.costGrad = function (Y) {
-        var _this = this;
-        var N = this.N;
-        var P = this.P;
-        // Trick that helps with local optima.
-        var alpha = this.iter < 100 ? 4 : 1;
-        // Make data for the SP tree.
-        var points = new Array(N); // (x, y)[]
-        for (var i = 0; i < N; ++i) {
-            var iTimesD = i * this.dim;
-            var row = new Array(this.dim);
-            for (var d = 0; d < this.dim; ++d) {
-                row[d] = Y[iTimesD + d];
-            }
-            points[i] = row;
-        }
-        // Make a tree.
-        var tree = new sptree_1.SPTree(points);
-        var root = tree.root;
-        // Annotate the tree.
-        var annotateTree = function (node) {
-            var numCells = 1;
-            if (node.children == null) {
-                // Update the current node and tell the parent.
-                node.numCells = numCells;
-                node.yCell = node.point;
-                return { numCells: numCells, yCell: node.yCell };
-            }
-            // node.point is a 2 or 3-dim number[], so slice() makes a copy.
-            var yCell = node.point.slice();
-            for (var i = 0; i < node.children.length; ++i) {
-                var child = node.children[i];
-                if (child == null) {
-                    continue;
-                }
-                var result = annotateTree(child);
-                numCells += result.numCells;
-                for (var d = 0; d < _this.dim; ++d) {
-                    yCell[d] += result.yCell[d];
-                }
-            }
-            // Update the node and tell the parent.
-            node.numCells = numCells;
-            node.yCell = yCell.map(function (v) { return v / numCells; });
-            return { numCells: numCells, yCell: yCell };
-        };
-        // Augment the tree with more info.
-        annotateTree(root);
-        tree.visit(function (node, low, high) {
-            node.rCell = high[0] - low[0];
-            return false;
-        });
-        // compute current Q distribution, unnormalized first
-        var grad = [];
-        var Z = 0;
-        var forces = new Array(N);
-        var _loop_1 = function(i) {
-            var pointI = points[i];
-            // Compute the positive forces for the i-th node.
-            var Fpos = this_1.dim === 3 ? [0, 0, 0] : [0, 0];
-            var neighbors = this_1.nearest[i];
-            for (var k = 0; k < neighbors.length; ++k) {
-                var j = neighbors[k].index;
-                var pij = P[i * N + j];
-                var pointJ = points[j];
-                var squaredDistItoJ = this_1.dist2(pointI, pointJ);
-                var premult = pij / (1 + squaredDistItoJ);
-                this_1.computeForce(Fpos, premult, pointI, pointJ);
-            }
-            // Compute the negative forces for the i-th node.
-            var FnegZ = this_1.dim === 3 ? [0, 0, 0] : [0, 0];
-            tree.visit(function (node) {
-                var squaredDistToCell = _this.dist2(pointI, node.yCell);
-                // Squared distance from point i to cell.
-                if (node.children == null ||
-                    (squaredDistToCell > 0 &&
-                        node.rCell / Math.sqrt(squaredDistToCell) < THETA)) {
-                    var qijZ_1 = 1 / (1 + squaredDistToCell);
-                    var dZ = node.numCells * qijZ_1;
-                    Z += dZ;
-                    dZ *= qijZ_1;
-                    _this.computeForce(FnegZ, dZ, pointI, node.yCell);
-                    return true;
-                }
-                // Cell is too close to approximate.
-                var squaredDistToPoint = _this.dist2(pointI, node.point);
-                var qijZ = 1 / (1 + squaredDistToPoint);
-                Z += qijZ;
-                qijZ *= qijZ;
-                _this.computeForce(FnegZ, qijZ, pointI, node.point);
-                return false;
-            }, true);
-            forces[i] = [Fpos, FnegZ];
-        };
-        var this_1 = this;
-        for (var i = 0; i < N; ++i) {
-            _loop_1(i);
-        }
-        // Normalize the negative forces and compute the gradient.
-        var A = 4 * alpha;
-        var B = 4 / Z;
-        for (var i = 0; i < N; ++i) {
-            var _a = forces[i], FPos = _a[0], FNegZ = _a[1];
-            var gsum = new Array(this.dim);
-            for (var d = 0; d < this.dim; ++d) {
-                gsum[d] = A * FPos[d] - B * FNegZ[d];
-            }
-            grad.push(gsum);
-        }
-        return grad;
-    };
-    return TSNE;
-}());
-exports.TSNE = TSNE;
-
-},{"./sptree":23}],3:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var data_1 = require('./data');
-var data_provider_1 = require('./data-provider');
-var dataProvider = require('./data-provider');
-var logging = require('./logging');
-var BYTES_EXTENSION = '.bytes';
-/** Data provider that loads data from a demo folder. */
-var DemoDataProvider = (function () {
-    function DemoDataProvider(projectorConfigPath) {
-        this.projectorConfigPath = projectorConfigPath;
-    }
-    DemoDataProvider.prototype.getEmbeddingInfo = function (tensorName) {
-        var embeddings = this.projectorConfig.embeddings;
-        for (var i = 0; i < embeddings.length; i++) {
-            var embedding = embeddings[i];
-            if (embedding.tensorName === tensorName) {
-                return embedding;
-            }
-        }
-        return null;
-    };
-    DemoDataProvider.prototype.retrieveRuns = function (callback) {
-        callback(['Demo']);
-    };
-    DemoDataProvider.prototype.retrieveProjectorConfig = function (run, callback) {
-        var _this = this;
-        var msgId = logging.setModalMessage('Fetching projector config...');
-        d3.json(this.projectorConfigPath, function (err, projectorConfig) {
-            if (err) {
-                var errorMessage = err;
-                // If the error is a valid XMLHttpResponse, it's possible this is a
-                // cross-origin error.
-                if (err.responseText != null) {
-                    errorMessage = 'Cannot fetch projector config, possibly a ' +
-                        'Cross-Origin request error.';
-                }
-                logging.setErrorMessage(errorMessage, 'fetching projector config');
-                return;
-            }
-            logging.setModalMessage(null, msgId);
-            _this.projectorConfig = projectorConfig;
-            callback(projectorConfig);
-        });
-    };
-    DemoDataProvider.prototype.retrieveTensor = function (run, tensorName, callback) {
-        var embedding = this.getEmbeddingInfo(tensorName);
-        var url = "" + embedding.tensorPath;
-        if (embedding.tensorPath.substr(-1 * BYTES_EXTENSION.length) ===
-            BYTES_EXTENSION) {
-            dataProvider.retrieveTensorAsBytes(this, this.getEmbeddingInfo(tensorName), run, tensorName, url, callback);
-        }
-        else {
-            logging.setModalMessage('Fetching tensors...', data_provider_1.TENSORS_MSG_ID);
-            d3.text(url, function (error, dataString) {
-                if (error) {
-                    logging.setErrorMessage(error.responseText, 'fetching tensors');
-                    return;
-                }
-                dataProvider.parseTensors(dataString).then(function (points) {
-                    callback(new data_1.DataSet(points));
-                });
-            });
-        }
-    };
-    DemoDataProvider.prototype.retrieveSpriteAndMetadata = function (run, tensorName, callback) {
-        var embedding = this.getEmbeddingInfo(tensorName);
-        var spriteImagePath = null;
-        if (embedding.sprite && embedding.sprite.imagePath) {
-            spriteImagePath = embedding.sprite.imagePath;
-        }
-        dataProvider.retrieveSpriteAndMetadataInfo(embedding.metadataPath, spriteImagePath, embedding.sprite, callback);
-    };
-    DemoDataProvider.prototype.getBookmarks = function (run, tensorName, callback) {
-        var embedding = this.getEmbeddingInfo(tensorName);
-        var msgId = logging.setModalMessage('Fetching bookmarks...');
-        d3.json(embedding.bookmarksPath, function (err, bookmarks) {
-            if (err) {
-                logging.setErrorMessage(err.responseText);
-                return;
-            }
-            logging.setModalMessage(null, msgId);
-            callback(bookmarks);
-        });
-    };
-    return DemoDataProvider;
-}());
-exports.DemoDataProvider = DemoDataProvider;
-
-},{"./data":7,"./data-provider":6,"./logging":12}],4:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var data_1 = require('./data');
-var data_provider_1 = require('./data-provider');
-var ProtoDataProvider = (function () {
-    function ProtoDataProvider(dataProto) {
-        this.dataProto = dataProto;
-    }
-    ProtoDataProvider.prototype.retrieveRuns = function (callback) {
-        callback(['proto']);
-    };
-    ProtoDataProvider.prototype.retrieveProjectorConfig = function (run, callback) {
-        callback({
-            modelCheckpointPath: 'proto',
-            embeddings: [{
-                    tensorName: 'proto',
-                    tensorShape: this.dataProto.shape,
-                    metadataPath: 'proto'
-                }]
-        });
-    };
-    ProtoDataProvider.prototype.retrieveTensor = function (run, tensorName, callback) {
-        callback(this.flatArrayToDataset(this.dataProto.tensor));
-    };
-    ProtoDataProvider.prototype.retrieveSpriteAndMetadata = function (run, tensorName, callback) {
-        var columnNames = this.dataProto.metadata.columns.map(function (c) { return c.name; });
-        var n = this.dataProto.shape[0];
-        var pointsMetadata = new Array(n);
-        this.dataProto.metadata.columns.forEach(function (c) {
-            var values = c.numericValues || c.stringValues;
-            for (var i = 0; i < n; i++) {
-                pointsMetadata[i] = pointsMetadata[i] || {};
-                pointsMetadata[i][c.name] = values[i];
-            }
-        });
-        callback({
-            stats: data_provider_1.analyzeMetadata(columnNames, pointsMetadata),
-            pointsInfo: pointsMetadata
-        });
-    };
-    ProtoDataProvider.prototype.getBookmarks = function (run, tensorName, callback) {
-        return callback([]);
-    };
-    ProtoDataProvider.prototype.flatArrayToDataset = function (tensor) {
-        var points = [];
-        var n = this.dataProto.shape[0];
-        var d = this.dataProto.shape[1];
-        if (n * d !== tensor.length) {
-            throw 'The shape doesn\'t match the length of the flattened array';
-        }
-        for (var i = 0; i < n; i++) {
-            var offset = i * d;
-            points.push({
-                vector: new Float32Array(tensor.slice(offset, offset + d)),
-                metadata: {},
-                projections: null,
-                index: i
-            });
-        }
-        return new data_1.DataSet(points);
-    };
-    return ProtoDataProvider;
-}());
-exports.ProtoDataProvider = ProtoDataProvider;
-
-},{"./data":7,"./data-provider":6}],5:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var dataProvider = require('./data-provider');
-var logging = require('./logging');
-// Limit for the number of data points we receive from the server.
-exports.LIMIT_NUM_POINTS = 100000;
-/**
- * Data provider that loads data provided by a python server (usually backed
- * by a checkpoint file).
- */
-var ServerDataProvider = (function () {
-    function ServerDataProvider(routePrefix) {
-        this.runProjectorConfigCache = {};
-        this.routePrefix = routePrefix;
-    }
-    ServerDataProvider.prototype.getEmbeddingInfo = function (run, tensorName, callback) {
-        this.retrieveProjectorConfig(run, function (config) {
-            var embeddings = config.embeddings;
-            for (var i = 0; i < embeddings.length; i++) {
-                var embedding = embeddings[i];
-                if (embedding.tensorName === tensorName) {
-                    callback(embedding);
-                    return;
-                }
-            }
-            callback(null);
-        });
-    };
-    ServerDataProvider.prototype.retrieveRuns = function (callback) {
-        var msgId = logging.setModalMessage('Fetching runs...');
-        d3.json(this.routePrefix + "/runs", function (err, runs) {
-            if (err) {
-                logging.setErrorMessage(err.responseText, 'fetching runs');
-                return;
-            }
-            logging.setModalMessage(null, msgId);
-            callback(runs);
-        });
-    };
-    ServerDataProvider.prototype.retrieveProjectorConfig = function (run, callback) {
-        var _this = this;
-        if (run in this.runProjectorConfigCache) {
-            callback(this.runProjectorConfigCache[run]);
-            return;
-        }
-        var msgId = logging.setModalMessage('Fetching projector config...');
-        d3.json(this.routePrefix + "/info?run=" + run, function (err, config) {
-            if (err) {
-                logging.setErrorMessage(err.responseText, 'fetching projector config');
-                return;
-            }
-            logging.setModalMessage(null, msgId);
-            _this.runProjectorConfigCache[run] = config;
-            callback(config);
-        });
-    };
-    ServerDataProvider.prototype.retrieveTensor = function (run, tensorName, callback) {
-        var _this = this;
-        this.getEmbeddingInfo(run, tensorName, function (embedding) {
-            dataProvider.retrieveTensorAsBytes(_this, embedding, run, tensorName, (_this.routePrefix + "/tensor?run=" + run + "&name=" + tensorName) +
-                ("&num_rows=" + exports.LIMIT_NUM_POINTS), callback);
-        });
-    };
-    ServerDataProvider.prototype.retrieveSpriteAndMetadata = function (run, tensorName, callback) {
-        var _this = this;
-        this.getEmbeddingInfo(run, tensorName, function (embedding) {
-            var metadataPath = null;
-            if (embedding.metadataPath) {
-                metadataPath =
-                    (_this.routePrefix + "/metadata?") +
-                        ("run=" + run + "&name=" + tensorName + "&num_rows=" + exports.LIMIT_NUM_POINTS);
-            }
-            var spriteImagePath = null;
-            if (embedding.sprite && embedding.sprite.imagePath) {
-                spriteImagePath =
-                    _this.routePrefix + "/sprite_image?run=" + run + "&name=" + tensorName;
-            }
-            dataProvider.retrieveSpriteAndMetadataInfo(metadataPath, spriteImagePath, embedding.sprite, callback);
-        });
-    };
-    ServerDataProvider.prototype.getBookmarks = function (run, tensorName, callback) {
-        var msgId = logging.setModalMessage('Fetching bookmarks...');
-        d3.json(this.routePrefix + "/bookmarks?run=" + run + "&name=" + tensorName, function (err, bookmarks) {
-            logging.setModalMessage(null, msgId);
-            if (!err) {
-                callback(bookmarks);
-            }
-        });
-    };
-    return ServerDataProvider;
-}());
-exports.ServerDataProvider = ServerDataProvider;
-
-},{"./data-provider":6,"./logging":12}],6:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var data_1 = require('./data');
-var logging = require('./logging');
-var util_1 = require('./util');
-/** Maximum number of colors supported in the color map. */
-var NUM_COLORS_COLOR_MAP = 50;
-var MAX_SPRITE_IMAGE_SIZE_PX = 8192;
-exports.METADATA_MSG_ID = 'metadata';
-exports.TENSORS_MSG_ID = 'tensors';
-function retrieveTensorAsBytes(dp, embedding, run, tensorName, tensorsPath, callback) {
-    // Get the tensor.
-    logging.setModalMessage('Fetching tensor values...', exports.TENSORS_MSG_ID);
-    var xhr = new XMLHttpRequest();
-    xhr.open('GET', tensorsPath);
-    xhr.responseType = 'arraybuffer';
-    xhr.onprogress = function (ev) {
-        if (ev.lengthComputable) {
-            var percent = (ev.loaded * 100 / ev.total).toFixed(1);
-            logging.setModalMessage('Fetching tensor values: ' + percent + '%', exports.TENSORS_MSG_ID);
-        }
-    };
-    xhr.onload = function () {
-        if (xhr.status !== 200) {
-            var msg = String.fromCharCode.apply(null, new Uint8Array(xhr.response));
-            logging.setErrorMessage(msg, 'fetching tensors');
-            return;
-        }
-        var data;
-        try {
-            data = new Float32Array(xhr.response);
-        }
-        catch (e) {
-            logging.setErrorMessage(e, 'parsing tensor bytes');
-            return;
-        }
-        var dim = embedding.tensorShape[1];
-        var N = data.length / dim;
-        if (embedding.tensorShape[0] > N) {
-            logging.setWarningMessage(("Showing the first " + N.toLocaleString()) +
-                (" of " + embedding.tensorShape[0].toLocaleString() + " data points"));
-        }
-        parseTensorsFromFloat32Array(data, dim).then(function (dataPoints) {
-            callback(new data_1.DataSet(dataPoints));
-        });
-    };
-    xhr.send();
-}
-exports.retrieveTensorAsBytes = retrieveTensorAsBytes;
-function parseRawTensors(content, callback) {
-    parseTensors(content).then(function (data) {
-        callback(new data_1.DataSet(data));
-    });
-}
-exports.parseRawTensors = parseRawTensors;
-function parseRawMetadata(contents, callback) {
-    parseMetadata(contents).then(function (result) { return callback(result); });
-}
-exports.parseRawMetadata = parseRawMetadata;
-/** Parses a tsv text file. */
-function parseTensors(content, delim) {
-    if (delim === void 0) { delim = '\t'; }
-    var data = [];
-    var numDim;
-    return util_1.runAsyncTask('Parsing tensors...', function () {
-        var lines = content.split('\n');
-        lines.forEach(function (line) {
-            line = line.trim();
-            if (line === '') {
-                return;
-            }
-            var row = line.split(delim);
-            var dataPoint = {
-                metadata: {},
-                vector: null,
-                index: data.length,
-                projections: null,
-            };
-            // If the first label is not a number, take it as the label.
-            if (isNaN(row[0]) || numDim === row.length - 1) {
-                dataPoint.metadata['label'] = row[0];
-                dataPoint.vector = new Float32Array(row.slice(1).map(Number));
-            }
-            else {
-                dataPoint.vector = new Float32Array(row.map(Number));
-            }
-            data.push(dataPoint);
-            if (numDim == null) {
-                numDim = dataPoint.vector.length;
-            }
-            if (numDim !== dataPoint.vector.length) {
-                logging.setModalMessage('Parsing failed. Vector dimensions do not match');
-                throw Error('Parsing failed');
-            }
-            if (numDim <= 1) {
-                logging.setModalMessage('Parsing failed. Found a vector with only one dimension?');
-                throw Error('Parsing failed');
-            }
-        });
-        return data;
-    }, exports.TENSORS_MSG_ID).then(function (dataPoints) {
-        logging.setModalMessage(null, exports.TENSORS_MSG_ID);
-        return dataPoints;
-    });
-}
-exports.parseTensors = parseTensors;
-/** Parses a tsv text file. */
-function parseTensorsFromFloat32Array(data, dim) {
-    return util_1.runAsyncTask('Parsing tensors...', function () {
-        var N = data.length / dim;
-        var dataPoints = [];
-        var offset = 0;
-        for (var i = 0; i < N; ++i) {
-            dataPoints.push({
-                metadata: {},
-                vector: data.subarray(offset, offset + dim),
-                index: i,
-                projections: null,
-            });
-            offset += dim;
-        }
-        return dataPoints;
-    }, exports.TENSORS_MSG_ID).then(function (dataPoints) {
-        logging.setModalMessage(null, exports.TENSORS_MSG_ID);
-        return dataPoints;
-    });
-}
-exports.parseTensorsFromFloat32Array = parseTensorsFromFloat32Array;
-function analyzeMetadata(columnNames, pointsMetadata) {
-    var columnStats = columnNames.map(function (name) {
-        return {
-            name: name,
-            isNumeric: true,
-            tooManyUniqueValues: false,
-            min: Number.POSITIVE_INFINITY,
-            max: Number.NEGATIVE_INFINITY
-        };
-    });
-    var mapOfValues = columnNames.map(function () { return d3.map(); });
-    pointsMetadata.forEach(function (metadata) {
-        columnNames.forEach(function (name, colIndex) {
-            var stats = columnStats[colIndex];
-            var map = mapOfValues[colIndex];
-            var value = metadata[name];
-            // Skip missing values.
-            if (value == null) {
-                return;
-            }
-            if (!stats.tooManyUniqueValues) {
-                if (map.has(value)) {
-                    map.set(value, map.get(value) + 1);
-                }
-                else {
-                    map.set(value, 1);
-                }
-                if (map.size() > NUM_COLORS_COLOR_MAP) {
-                    stats.tooManyUniqueValues = true;
-                }
-            }
-            if (isNaN(value)) {
-                stats.isNumeric = false;
-            }
-            else {
-                metadata[name] = +value;
-                stats.min = Math.min(stats.min, +value);
-                stats.max = Math.max(stats.max, +value);
-            }
-        });
-    });
-    columnStats.forEach(function (stats, colIndex) {
-        stats.uniqueEntries = mapOfValues[colIndex].entries().map(function (e) {
-            return { label: e.key, count: e.value };
-        });
-    });
-    return columnStats;
-}
-exports.analyzeMetadata = analyzeMetadata;
-function parseMetadata(content) {
-    return util_1.runAsyncTask('Parsing metadata...', function () {
-        var lines = content.split('\n').filter(function (line) { return line.trim().length > 0; });
-        var hasHeader = lines[0].indexOf('\t') >= 0;
-        var pointsMetadata = [];
-        // If the first row doesn't contain metadata keys, we assume that the values
-        // are labels.
-        var columnNames = ['label'];
-        if (hasHeader) {
-            columnNames = lines[0].split('\t');
-            lines = lines.slice(1);
-        }
-        lines.forEach(function (line) {
-            var rowValues = line.split('\t');
-            var metadata = {};
-            pointsMetadata.push(metadata);
-            columnNames.forEach(function (name, colIndex) {
-                var value = rowValues[colIndex];
-                // Normalize missing values.
-                value = (value === '' ? null : value);
-                metadata[name] = value;
-            });
-        });
-        return {
-            stats: analyzeMetadata(columnNames, pointsMetadata),
-            pointsInfo: pointsMetadata
-        };
-    }, exports.METADATA_MSG_ID).then(function (metadata) {
-        logging.setModalMessage(null, exports.METADATA_MSG_ID);
-        return metadata;
-    });
-}
-exports.parseMetadata = parseMetadata;
-function fetchImage(url) {
-    return new Promise(function (resolve, reject) {
-        var image = new Image();
-        image.onload = function () { return resolve(image); };
-        image.onerror = function (err) { return reject(err); };
-        image.crossOrigin = '';
-        image.src = url;
-    });
-}
-exports.fetchImage = fetchImage;
-function retrieveSpriteAndMetadataInfo(metadataPath, spriteImagePath, spriteMetadata, callback) {
-    var metadataPromise = Promise.resolve({});
-    if (metadataPath) {
-        metadataPromise = new Promise(function (resolve, reject) {
-            logging.setModalMessage('Fetching metadata...', exports.METADATA_MSG_ID);
-            d3.text(metadataPath, function (err, rawMetadata) {
-                if (err) {
-                    logging.setErrorMessage(err.responseText, 'fetching metadata');
-                    reject(err);
-                    return;
-                }
-                resolve(parseMetadata(rawMetadata));
-            });
-        });
-    }
-    var spriteMsgId = null;
-    var spritesPromise = null;
-    if (spriteImagePath) {
-        spriteMsgId = logging.setModalMessage('Fetching sprite image...');
-        spritesPromise = fetchImage(spriteImagePath);
-    }
-    // Fetch the metadata and the image in parallel.
-    Promise.all([metadataPromise, spritesPromise]).then(function (values) {
-        if (spriteMsgId) {
-            logging.setModalMessage(null, spriteMsgId);
-        }
-        var metadata = values[0], spriteImage = values[1];
-        if (spriteImage && (spriteImage.height > MAX_SPRITE_IMAGE_SIZE_PX ||
-            spriteImage.width > MAX_SPRITE_IMAGE_SIZE_PX)) {
-            logging.setModalMessage(("Error: Sprite image of dimensions " + spriteImage.width + "px x ") +
-                (spriteImage.height + "px exceeds maximum dimensions ") +
-                (MAX_SPRITE_IMAGE_SIZE_PX + "px x " + MAX_SPRITE_IMAGE_SIZE_PX + "px"));
-        }
-        else {
-            metadata.spriteImage = spriteImage;
-            metadata.spriteMetadata = spriteMetadata;
-            callback(metadata);
-        }
-    });
-}
-exports.retrieveSpriteAndMetadataInfo = retrieveSpriteAndMetadataInfo;
-
-},{"./data":7,"./logging":12,"./util":24}],7:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var bh_tsne_1 = require('./bh_tsne');
-var knn = require('./knn');
-var logging = require('./logging');
-var util = require('./util');
-var vector = require('./vector');
-var IS_FIREFOX = navigator.userAgent.toLowerCase().indexOf('firefox') >= 0;
-/** Controls whether nearest neighbors computation is done on the GPU or CPU. */
-var KNN_GPU_ENABLED = util.hasWebGLSupport() && !IS_FIREFOX;
-exports.TSNE_SAMPLE_SIZE = 10000;
-exports.PCA_SAMPLE_SIZE = 50000;
-/** Number of dimensions to sample when doing approximate PCA. */
-exports.PCA_SAMPLE_DIM = 200;
-/** Number of pca components to compute. */
-var NUM_PCA_COMPONENTS = 10;
-/** Reserved metadata attribute used for trace information. */
-var TRACE_METADATA_ATTR = '__next__';
-/**
- * Dataset contains a DataPoints array that should be treated as immutable. This
- * acts as a working subset of the original data, with cached properties
- * from computationally expensive operations. Because creating a subset
- * requires normalizing and shifting the vector space, we make a copy of the
- * data so we can still always create new subsets based on the original data.
- */
-var DataSet = (function () {
-    /** Creates a new Dataset */
-    function DataSet(points, spriteAndMetadataInfo) {
-        this.shuffledDataIndices = [];
-        /**
-         * This keeps a list of all current projections so you can easily test to see
-         * if it's been calculated already.
-         */
-        this.projections = d3.set();
-        this.tSNEIteration = 0;
-        this.tSNEShouldStop = true;
-        this.dim = [0, 0];
-        this.hasTSNERun = false;
-        this.points = points;
-        this.shuffledDataIndices = util.shuffle(d3.range(this.points.length));
-        this.traces = this.computeTraces(points);
-        this.dim = [this.points.length, this.points[0].vector.length];
-        this.spriteAndMetadataInfo = spriteAndMetadataInfo;
-    }
-    DataSet.prototype.computeTraces = function (points) {
-        // Keep a list of indices seen so we don't compute traces for a given
-        // point twice.
-        var indicesSeen = new Int8Array(points.length);
-        // Compute traces.
-        var indexToTrace = {};
-        var traces = [];
-        for (var i = 0; i < points.length; i++) {
-            if (indicesSeen[i]) {
-                continue;
-            }
-            indicesSeen[i] = 1;
-            // Ignore points without a trace attribute.
-            var next = points[i].metadata[TRACE_METADATA_ATTR];
-            if (next == null || next === '') {
-                continue;
-            }
-            if (next in indexToTrace) {
-                var existingTrace = indexToTrace[+next];
-                // Pushing at the beginning of the array.
-                existingTrace.pointIndices.unshift(i);
-                indexToTrace[i] = existingTrace;
-                continue;
-            }
-            // The current point is pointing to a new/unseen trace.
-            var newTrace = { pointIndices: [] };
-            indexToTrace[i] = newTrace;
-            traces.push(newTrace);
-            var currentIndex = i;
-            while (points[currentIndex]) {
-                newTrace.pointIndices.push(currentIndex);
-                var next_1 = points[currentIndex].metadata[TRACE_METADATA_ATTR];
-                if (next_1 != null && next_1 !== '') {
-                    indicesSeen[+next_1] = 1;
-                    currentIndex = +next_1;
-                }
-                else {
-                    currentIndex = -1;
-                }
-            }
-        }
-        return traces;
-    };
-    DataSet.prototype.projectionCanBeRendered = function (projection) {
-        if (projection !== 'tsne') {
-            return true;
-        }
-        return this.tSNEIteration > 0;
-    };
-    /**
-     * Returns a new subset dataset by copying out data. We make a copy because
-     * we have to modify the vectors by normalizing them.
-     *
-     * @param subset Array of indices of points that we want in the subset.
-     *
-     * @return A subset of the original dataset.
-     */
-    DataSet.prototype.getSubset = function (subset) {
-        var _this = this;
-        var pointsSubset = ((subset != null) && (subset.length > 0)) ?
-            subset.map(function (i) { return _this.points[i]; }) :
-            this.points;
-        var points = pointsSubset.map(function (dp) {
-            return {
-                metadata: dp.metadata,
-                index: dp.index,
-                vector: dp.vector.slice(),
-                projections: {}
-            };
-        });
-        return new DataSet(points, this.spriteAndMetadataInfo);
-    };
-    /**
-     * Computes the centroid, shifts all points to that centroid,
-     * then makes them all unit norm.
-     */
-    DataSet.prototype.normalize = function () {
-        // Compute the centroid of all data points.
-        var centroid = vector.centroid(this.points, function (a) { return a.vector; });
-        if (centroid == null) {
-            throw Error('centroid should not be null');
-        }
-        // Shift all points by the centroid and make them unit norm.
-        for (var id = 0; id < this.points.length; ++id) {
-            var dataPoint = this.points[id];
-            dataPoint.vector = vector.sub(dataPoint.vector, centroid);
-            vector.unit(dataPoint.vector);
-        }
-    };
-    /** Projects the dataset onto a given vector and caches the result. */
-    DataSet.prototype.projectLinear = function (dir, label) {
-        this.projections.add(label);
-        this.points.forEach(function (dataPoint) {
-            dataPoint.projections[label] = vector.dot(dataPoint.vector, dir);
-        });
-    };
-    /** Projects the dataset along the top 10 principal components. */
-    DataSet.prototype.projectPCA = function () {
-        var _this = this;
-        if (this.projections.has('pca-0')) {
-            return Promise.resolve(null);
-        }
-        return util.runAsyncTask('Computing PCA...', function () {
-            // Approximate pca vectors by sampling the dimensions.
-            var dim = _this.points[0].vector.length;
-            var vectors = _this.shuffledDataIndices.map(function (i) { return _this.points[i].vector; });
-            if (dim > exports.PCA_SAMPLE_DIM) {
-                vectors = vector.projectRandom(vectors, exports.PCA_SAMPLE_DIM);
-            }
-            var sampledVectors = vectors.slice(0, exports.PCA_SAMPLE_SIZE);
-            var sigma = numeric.div(numeric.dot(numeric.transpose(sampledVectors), sampledVectors), sampledVectors.length);
-            var svd = numeric.svd(sigma);
-            var variances = svd.S;
-            var totalVariance = 0;
-            for (var i = 0; i < variances.length; ++i) {
-                totalVariance += variances[i];
-            }
-            for (var i = 0; i < variances.length; ++i) {
-                variances[i] /= totalVariance;
-            }
-            _this.fracVariancesExplained = variances;
-            var U = svd.U;
-            var pcaVectors = vectors.map(function (vector) {
-                var newV = new Float32Array(NUM_PCA_COMPONENTS);
-                for (var newDim = 0; newDim < NUM_PCA_COMPONENTS; newDim++) {
-                    var dot = 0;
-                    for (var oldDim = 0; oldDim < vector.length; oldDim++) {
-                        dot += vector[oldDim] * U[oldDim][newDim];
-                    }
-                    newV[newDim] = dot;
-                }
-                return newV;
-            });
-            for (var d = 0; d < NUM_PCA_COMPONENTS; d++) {
-                var label = 'pca-' + d;
-                _this.projections.add(label);
-                for (var i = 0; i < pcaVectors.length; i++) {
-                    var pointIndex = _this.shuffledDataIndices[i];
-                    _this.points[pointIndex].projections[label] = pcaVectors[i][d];
-                }
-            }
-        });
-    };
-    /** Runs tsne on the data. */
-    DataSet.prototype.projectTSNE = function (perplexity, learningRate, tsneDim, stepCallback) {
-        var _this = this;
-        this.hasTSNERun = true;
-        var k = Math.floor(3 * perplexity);
-        var opt = { epsilon: learningRate, perplexity: perplexity, dim: tsneDim };
-        this.tsne = new bh_tsne_1.TSNE(opt);
-        this.tSNEShouldStop = false;
-        this.tSNEIteration = 0;
-        var sampledIndices = this.shuffledDataIndices.slice(0, exports.TSNE_SAMPLE_SIZE);
-        var step = function () {
-            if (_this.tSNEShouldStop) {
-                stepCallback(null);
-                _this.tsne = null;
-                return;
-            }
-            _this.tsne.step();
-            var result = _this.tsne.getSolution();
-            sampledIndices.forEach(function (index, i) {
-                var dataPoint = _this.points[index];
-                dataPoint.projections['tsne-0'] = result[i * tsneDim + 0];
-                dataPoint.projections['tsne-1'] = result[i * tsneDim + 1];
-                if (tsneDim === 3) {
-                    dataPoint.projections['tsne-2'] = result[i * tsneDim + 2];
-                }
-            });
-            _this.tSNEIteration++;
-            stepCallback(_this.tSNEIteration);
-            requestAnimationFrame(step);
-        };
-        // Nearest neighbors calculations.
-        var knnComputation;
-        if (this.nearest != null && k === this.nearestK) {
-            // We found the nearest neighbors before and will reuse them.
-            knnComputation = Promise.resolve(this.nearest);
-        }
-        else {
-            var sampledData = sampledIndices.map(function (i) { return _this.points[i]; });
-            this.nearestK = k;
-            knnComputation = KNN_GPU_ENABLED ?
-                knn.findKNNGPUCosine(sampledData, k, (function (d) { return d.vector; })) :
-                knn.findKNN(sampledData, k, (function (d) { return d.vector; }), function (a, b, limit) { return vector.cosDistNorm(a, b); });
-        }
-        knnComputation.then(function (nearest) {
-            _this.nearest = nearest;
-            util.runAsyncTask('Initializing T-SNE...', function () {
-                _this.tsne.initDataDist(_this.nearest);
-            }).then(step);
-        });
-    };
-    /**
-     * Merges metadata to the dataset and returns whether it succeeded.
-     */
-    DataSet.prototype.mergeMetadata = function (metadata) {
-        var _this = this;
-        if (metadata.pointsInfo.length !== this.points.length) {
-            var errorMessage = ("Number of tensors (" + this.points.length + ") do not") +
-                " match the number of lines in metadata" +
-                (" (" + metadata.pointsInfo.length + ").");
-            if (metadata.stats.length === 1 &&
-                this.points.length + 1 === metadata.pointsInfo.length) {
-                // If there is only one column of metadata and the number of points is
-                // exactly one less than the number of metadata lines, this is due to an
-                // unnecessary header line in the metadata and we can show a meaningful
-                // error.
-                logging.setErrorMessage(errorMessage + ' Single column metadata should not have a header ' +
-                    'row.', 'merging metadata');
-                return false;
-            }
-            else if (metadata.stats.length > 1 &&
-                this.points.length - 1 === metadata.pointsInfo.length) {
-                // If there are multiple columns of metadata and the number of points is
-                // exactly one greater than the number of lines in the metadata, this
-                // means there is a missing metadata header.
-                logging.setErrorMessage(errorMessage + ' Multi-column metadata should have a header ' +
-                    'row with column labels.', 'merging metadata');
-                return false;
-            }
-            logging.setWarningMessage(errorMessage);
-        }
-        this.spriteAndMetadataInfo = metadata;
-        metadata.pointsInfo.slice(0, this.points.length)
-            .forEach(function (m, i) { return _this.points[i].metadata = m; });
-        return true;
-    };
-    DataSet.prototype.stopTSNE = function () {
-        this.tSNEShouldStop = true;
-    };
-    /**
-     * Finds the nearest neighbors of the query point using a
-     * user-specified distance metric.
-     */
-    DataSet.prototype.findNeighbors = function (pointIndex, distFunc, numNN) {
-        // Find the nearest neighbors of a particular point.
-        var neighbors = knn.findKNNofPoint(this.points, pointIndex, numNN, (function (d) { return d.vector; }), distFunc);
-        // TODO(smilkov): Figure out why we slice.
-        var result = neighbors.slice(0, numNN);
-        return result;
-    };
-    /**
-     * Search the dataset based on a metadata field.
-     */
-    DataSet.prototype.query = function (query, inRegexMode, fieldName) {
-        var predicate = util.getSearchPredicate(query, inRegexMode, fieldName);
-        var matches = [];
-        this.points.forEach(function (point, id) {
-            if (predicate(point)) {
-                matches.push(id);
-            }
-        });
-        return matches;
-    };
-    return DataSet;
-}());
-exports.DataSet = DataSet;
-var Projection = (function () {
-    function Projection(projectionType, projectionComponents, dimensionality, dataSet) {
-        this.projectionType = projectionType;
-        this.projectionComponents = projectionComponents;
-        this.dimensionality = dimensionality;
-        this.dataSet = dataSet;
-    }
-    return Projection;
-}());
-exports.Projection = Projection;
-/**
- * An interface that holds all the data for serializing the current state of
- * the world.
- */
-var State = (function () {
-    function State() {
-        /** A label identifying this state. */
-        this.label = '';
-        /** Whether this State is selected in the bookmarks pane. */
-        this.isSelected = false;
-        /** t-SNE parameters */
-        this.tSNEIteration = 0;
-        this.tSNEPerplexity = 0;
-        this.tSNELearningRate = 0;
-        this.tSNEis3d = true;
-        /** PCA projection component dimensions */
-        this.pcaComponentDimensions = [];
-        /** The computed projections of the tensors. */
-        this.projections = [];
-        /** The indices of selected points. */
-        this.selectedPoints = [];
-    }
-    return State;
-}());
-exports.State = State;
-function getProjectionComponents(projection, components) {
-    if (components.length > 3) {
-        throw new RangeError('components length must be <= 3');
-    }
-    var projectionComponents = [null, null, null];
-    var prefix = (projection === 'custom') ? 'linear' : projection;
-    for (var i = 0; i < components.length; ++i) {
-        if (components[i] == null) {
-            continue;
-        }
-        projectionComponents[i] = prefix + '-' + components[i];
-    }
-    return projectionComponents;
-}
-exports.getProjectionComponents = getProjectionComponents;
-function stateGetAccessorDimensions(state) {
-    var dimensions;
-    switch (state.selectedProjection) {
-        case 'pca':
-            dimensions = state.pcaComponentDimensions.slice();
-            break;
-        case 'tsne':
-            dimensions = [0, 1];
-            if (state.tSNEis3d) {
-                dimensions.push(2);
-            }
-            break;
-        case 'custom':
-            dimensions = ['x', 'y'];
-            break;
-        default:
-            throw new Error('Unexpected fallthrough');
-    }
-    return dimensions;
-}
-exports.stateGetAccessorDimensions = stateGetAccessorDimensions;
-
-},{"./bh_tsne":2,"./knn":10,"./logging":12,"./util":24,"./vector":25}],8:[function(require,module,exports){
-
-},{}],9:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-/**
- * Min-heap data structure. Provides O(1) for peek, returning the smallest key.
- */
-// TODO(jart): Rename to Heap and use Comparator.
-var MinHeap = (function () {
-    function MinHeap() {
-        this.arr = [];
-    }
-    /** Push an element with the provided key. */
-    MinHeap.prototype.push = function (key, value) {
-        this.arr.push({ key: key, value: value });
-        this.bubbleUp(this.arr.length - 1);
-    };
-    /** Pop the element with the smallest key. */
-    MinHeap.prototype.pop = function () {
-        if (this.arr.length === 0) {
-            throw new Error('pop() called on empty binary heap');
-        }
-        var item = this.arr[0];
-        var last = this.arr.length - 1;
-        this.arr[0] = this.arr[last];
-        this.arr.pop();
-        if (last > 0) {
-            this.bubbleDown(0);
-        }
-        return item;
-    };
-    ;
-    /** Returns, but doesn't remove the element with the smallest key */
-    MinHeap.prototype.peek = function () { return this.arr[0]; };
-    /**
-     * Pops the element with the smallest key and at the same time
-     * adds the newly provided element. This is faster than calling
-     * pop() and push() separately.
-     */
-    MinHeap.prototype.popPush = function (key, value) {
-        if (this.arr.length === 0) {
-            throw new Error('pop() called on empty binary heap');
-        }
-        var item = this.arr[0];
-        this.arr[0] = { key: key, value: value };
-        if (this.arr.length > 0) {
-            this.bubbleDown(0);
-        }
-        return item;
-    };
-    /** Returns the number of elements in the heap. */
-    MinHeap.prototype.size = function () { return this.arr.length; };
-    /** Returns all the items in the heap. */
-    MinHeap.prototype.items = function () { return this.arr; };
-    MinHeap.prototype.swap = function (a, b) {
-        var temp = this.arr[a];
-        this.arr[a] = this.arr[b];
-        this.arr[b] = temp;
-    };
-    MinHeap.prototype.bubbleDown = function (pos) {
-        var left = (pos << 1) + 1;
-        var right = left + 1;
-        var largest = pos;
-        if (left < this.arr.length && this.arr[left].key < this.arr[largest].key) {
-            largest = left;
-        }
-        if (right < this.arr.length &&
-            this.arr[right].key < this.arr[largest].key) {
-            largest = right;
-        }
-        if (largest !== pos) {
-            this.swap(largest, pos);
-            this.bubbleDown(largest);
-        }
-    };
-    MinHeap.prototype.bubbleUp = function (pos) {
-        if (pos <= 0) {
-            return;
-        }
-        var parent = ((pos - 1) >> 1);
-        if (this.arr[pos].key < this.arr[parent].key) {
-            this.swap(pos, parent);
-            this.bubbleUp(parent);
-        }
-    };
-    return MinHeap;
-}());
-exports.MinHeap = MinHeap;
-/** List that keeps the K elements with the smallest keys. */
-var KMin = (function () {
-    /** Constructs a new k-min data structure with the provided k. */
-    function KMin(k) {
-        this.maxHeap = new MinHeap();
-        this.k = k;
-    }
-    /** Adds an element to the list. */
-    KMin.prototype.add = function (key, value) {
-        if (this.maxHeap.size() < this.k) {
-            this.maxHeap.push(-key, value);
-            return;
-        }
-        var largest = this.maxHeap.peek();
-        // If the new element is smaller, replace the largest with the new element.
-        if (key < -largest.key) {
-            this.maxHeap.popPush(-key, value);
-        }
-    };
-    /** Returns the k items with the smallest keys. */
-    KMin.prototype.getMinKItems = function () {
-        var items = this.maxHeap.items();
-        items.sort(function (a, b) { return b.key - a.key; });
-        return items.map(function (a) { return a.value; });
-    };
-    /** Returns the size of the list. */
-    KMin.prototype.getSize = function () { return this.maxHeap.size(); };
-    /** Returns the largest key in the list. */
-    KMin.prototype.getLargestKey = function () {
-        return this.maxHeap.size() === 0 ? null : -this.maxHeap.peek().key;
-    };
-    return KMin;
-}());
-exports.KMin = KMin;
-
-},{}],10:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var util_1 = require('./util');
-var logging = require('./logging');
-var heap_1 = require('./heap');
-var vector = require('./vector');
-/**
- * Optimal size for the height of the matrix when doing computation on the GPU
- * using WebGL. This was found experimentally.
- *
- * This also guarantees that for computing pair-wise distance for up to 10K
- * vectors, no more than 40MB will be allocated in the GPU. Without the
- * allocation limit, we can freeze the graphics of the whole OS.
- */
-var OPTIMAL_GPU_BLOCK_SIZE = 256;
-/** Id of message box used for knn gpu progress bar. */
-var KNN_GPU_MSG_ID = 'knn-gpu';
-/**
- * Returns the K nearest neighbors for each vector where the distance
- * computation is done on the GPU (WebGL) using cosine distance.
- *
- * @param dataPoints List of data points, where each data point holds an
- *   n-dimensional vector.
- * @param k Number of nearest neighbors to find.
- * @param accessor A method that returns the vector, given the data point.
- */
-function findKNNGPUCosine(dataPoints, k, accessor) {
-    var N = dataPoints.length;
-    var dim = accessor(dataPoints[0]).length;
-    // The goal is to compute a large matrix multiplication A*A.T where A is of
-    // size NxD and A.T is its transpose. This results in a NxN matrix which
-    // could be too big to store on the GPU memory. To avoid memory overflow, we
-    // compute multiple A*partial_A.T where partial_A is of size BxD (B is much
-    // smaller than N). This results in storing only NxB size matrices on the GPU
-    // at a given time.
-    // A*A.T will give us NxN matrix holding the cosine distance between every
-    // pair of points, which we sort using KMin data structure to obtain the
-    // K nearest neighbors for each point.
-    var typedArray = vector.toTypedArray(dataPoints, accessor);
-    var bigMatrix = new weblas.pipeline.Tensor([N, dim], typedArray);
-    var nearest = new Array(N);
-    var numPieces = Math.ceil(N / OPTIMAL_GPU_BLOCK_SIZE);
-    var M = Math.floor(N / numPieces);
-    var modulo = N % numPieces;
-    var offset = 0;
-    var progress = 0;
-    var progressDiff = 1 / (2 * numPieces);
-    var piece = 0;
-    function step(resolve) {
-        var progressMsg = 'Finding nearest neighbors: ' + (progress * 100).toFixed() + '%';
-        util_1.runAsyncTask(progressMsg, function () {
-            var B = piece < modulo ? M + 1 : M;
-            var typedB = new Float32Array(B * dim);
-            for (var i = 0; i < B; ++i) {
-                var vector_1 = accessor(dataPoints[offset + i]);
-                for (var d = 0; d < dim; ++d) {
-                    typedB[i * dim + d] = vector_1[d];
-                }
-            }
-            var partialMatrix = new weblas.pipeline.Tensor([B, dim], typedB);
-            // Result is N x B matrix.
-            var result = weblas.pipeline.sgemm(1, bigMatrix, partialMatrix, null, null);
-            var partial = result.transfer();
-            partialMatrix.delete();
-            result.delete();
-            progress += progressDiff;
-            for (var i = 0; i < B; i++) {
-                var kMin = new heap_1.KMin(k);
-                var iReal = offset + i;
-                for (var j = 0; j < N; j++) {
-                    if (j === iReal) {
-                        continue;
-                    }
-                    var cosDist = 1 - partial[j * B + i]; // [j, i];
-                    kMin.add(cosDist, { index: j, dist: cosDist });
-                }
-                nearest[iReal] = kMin.getMinKItems();
-            }
-            progress += progressDiff;
-            offset += B;
-            piece++;
-        }, KNN_GPU_MSG_ID).then(function () {
-            if (piece < numPieces) {
-                step(resolve);
-            }
-            else {
-                logging.setModalMessage(null, KNN_GPU_MSG_ID);
-                bigMatrix.delete();
-                resolve(nearest);
-            }
-        }, function (error) {
-            // GPU failed. Reverting back to CPU.
-            logging.setModalMessage(null, KNN_GPU_MSG_ID);
-            var distFunc = function (a, b, limit) { return vector.cosDistNorm(a, b); };
-            findKNN(dataPoints, k, accessor, distFunc).then(function (nearest) {
-                resolve(nearest);
-            });
-        });
-    }
-    return new Promise(function (resolve) { return step(resolve); });
-}
-exports.findKNNGPUCosine = findKNNGPUCosine;
-/**
- * Returns the K nearest neighbors for each vector where the distance
- * computation is done on the CPU using a user-specified distance method.
- *
- * @param dataPoints List of data points, where each data point holds an
- *   n-dimensional vector.
- * @param k Number of nearest neighbors to find.
- * @param accessor A method that returns the vector, given the data point.
- * @param dist Method that takes two vectors and a limit, and computes the
- *   distance between two vectors, with the ability to stop early if the
- *   distance is above the limit.
- */
-function findKNN(dataPoints, k, accessor, dist) {
-    return util_1.runAsyncTask('Finding nearest neighbors...', function () {
-        var N = dataPoints.length;
-        var nearest = new Array(N);
-        // Find the distances from node i.
-        var kMin = new Array(N);
-        for (var i = 0; i < N; i++) {
-            kMin[i] = new heap_1.KMin(k);
-        }
-        for (var i = 0; i < N; i++) {
-            var a = accessor(dataPoints[i]);
-            var kMinA = kMin[i];
-            for (var j = i + 1; j < N; j++) {
-                var kMinB = kMin[j];
-                var limitI = kMinA.getSize() === k ?
-                    kMinA.getLargestKey() || Number.MAX_VALUE :
-                    Number.MAX_VALUE;
-                var limitJ = kMinB.getSize() === k ?
-                    kMinB.getLargestKey() || Number.MAX_VALUE :
-                    Number.MAX_VALUE;
-                var limit = Math.max(limitI, limitJ);
-                var dist2ItoJ = dist(a, accessor(dataPoints[j]), limit);
-                if (dist2ItoJ >= 0) {
-                    kMinA.add(dist2ItoJ, { index: j, dist: dist2ItoJ });
-                    kMinB.add(dist2ItoJ, { index: i, dist: dist2ItoJ });
-                }
-            }
-        }
-        for (var i = 0; i < N; i++) {
-            nearest[i] = kMin[i].getMinKItems();
-        }
-        return nearest;
-    });
-}
-exports.findKNN = findKNN;
-/** Calculates the minimum distance between a search point and a rectangle. */
-function minDist(point, x1, y1, x2, y2) {
-    var x = point[0];
-    var y = point[1];
-    var dx1 = x - x1;
-    var dx2 = x - x2;
-    var dy1 = y - y1;
-    var dy2 = y - y2;
-    if (dx1 * dx2 <= 0) {
-        if (dy1 * dy2 <= 0) {
-            return 0; // return 0 as point is in rect
-        }
-        return Math.min(Math.abs(dy1), Math.abs(dy2));
-    }
-    if (dy1 * dy2 <= 0) {
-        // We know it is already inside the rectangle
-        return Math.min(Math.abs(dx1), Math.abs(dx2));
-    }
-    var corner;
-    if (x > x2) {
-        // Upper-right vs lower-right.
-        corner = y > y2 ? [x2, y2] : [x2, y1];
-    }
-    else {
-        // Upper-left vs lower-left.
-        corner = y > y2 ? [x1, y2] : [x1, y1];
-    }
-    return Math.sqrt(vector.dist22D([x, y], corner));
-}
-/**
- * Returns the nearest neighbors of a particular point.
- *
- * @param dataPoints List of data points.
- * @param pointIndex The index of the point we need the nearest neighbors of.
- * @param k Number of nearest neighbors to search for.
- * @param accessor Method that maps a data point => vector (array of numbers).
- * @param distance Method that takes two vectors and returns their distance.
- */
-function findKNNofPoint(dataPoints, pointIndex, k, accessor, distance) {
-    var kMin = new heap_1.KMin(k);
-    var a = accessor(dataPoints[pointIndex]);
-    for (var i = 0; i < dataPoints.length; ++i) {
-        if (i === pointIndex) {
-            continue;
-        }
-        var b = accessor(dataPoints[i]);
-        var dist = distance(a, b);
-        kMin.add(dist, { index: i, dist: dist });
-    }
-    return kMin.getMinKItems();
-}
-exports.findKNNofPoint = findKNNofPoint;
-
-},{"./heap":9,"./logging":12,"./util":24,"./vector":25}],11:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-/**
- * Accelerates label placement by dividing the view into a uniform grid.
- * Labels only need to be tested for collision with other labels that overlap
- * the same grid cells. This is a fork of {@code amoeba.CollisionGrid}.
- */
-var CollisionGrid = (function () {
-    /**
-     * Constructs a new Collision grid.
-     *
-     * @param bound The bound of the grid. Labels out of bounds will be rejected.
-     * @param cellWidth Width of a cell in the grid.
-     * @param cellHeight Height of a cell in the grid.
-     */
-    function CollisionGrid(bound, cellWidth, cellHeight) {
-        /** The bound of the grid. Labels out of bounds will be rejected. */
-        this.bound = bound;
-        /** Width of a cell in the grid. */
-        this.cellWidth = cellWidth;
-        /** Height of a cell in the grid. */
-        this.cellHeight = cellHeight;
-        /** Number of grid cells along the x axis. */
-        this.numHorizCells = Math.ceil(this.boundWidth(bound) / cellWidth);
-        /** Number of grid cells along the y axis. */
-        this.numVertCells = Math.ceil(this.boundHeight(bound) / cellHeight);
-        /**
-         * The 2d grid (stored as a 1d array.) Each cell consists of an array of
-         * BoundingBoxes for objects that are in the cell.
-         */
-        this.grid = new Array(this.numHorizCells * this.numVertCells);
-    }
-    CollisionGrid.prototype.boundWidth = function (bound) { return bound.hiX - bound.loX; };
-    CollisionGrid.prototype.boundHeight = function (bound) { return bound.hiY - bound.loY; };
-    CollisionGrid.prototype.boundsIntersect = function (a, b) {
-        return !(a.loX > b.hiX || a.loY > b.hiY || a.hiX < b.loX || a.hiY < b.loY);
-    };
-    /**
-     * Checks if a given bounding box has any conflicts in the grid and inserts it
-     * if none are found.
-     *
-     * @param bound The bound to insert.
-     * @param justTest If true, just test if it conflicts, without inserting.
-     * @return True if the bound was successfully inserted; false if it
-     *         could not be inserted due to a conflict.
-     */
-    CollisionGrid.prototype.insert = function (bound, justTest) {
-        if (justTest === void 0) { justTest = false; }
-        // Reject if the label is out of bounds.
-        if ((bound.hiX < this.bound.loX) || (bound.loX > this.bound.hiX) ||
-            (bound.hiY < this.bound.loY) || (bound.loY > this.bound.hiY)) {
-            return false;
-        }
-        var minCellX = this.getCellX(bound.loX);
-        var maxCellX = this.getCellX(bound.hiX);
-        var minCellY = this.getCellY(bound.loY);
-        var maxCellY = this.getCellY(bound.hiY);
-        // Check all overlapped cells to verify that we can insert.
-        var baseIdx = minCellY * this.numHorizCells + minCellX;
-        var idx = baseIdx;
-        for (var j = minCellY; j <= maxCellY; j++) {
-            for (var i = minCellX; i <= maxCellX; i++) {
-                var cell = this.grid[idx++];
-                if (cell) {
-                    for (var k = 0; k < cell.length; k++) {
-                        if (this.boundsIntersect(bound, cell[k])) {
-                            return false;
-                        }
-                    }
-                }
-            }
-            idx += this.numHorizCells - (maxCellX - minCellX + 1);
-        }
-        if (justTest) {
-            return true;
-        }
-        // Insert into the overlapped cells.
-        idx = baseIdx;
-        for (var j = minCellY; j <= maxCellY; j++) {
-            for (var i = minCellX; i <= maxCellX; i++) {
-                if (!this.grid[idx]) {
-                    this.grid[idx] = [bound];
-                }
-                else {
-                    this.grid[idx].push(bound);
-                }
-                idx++;
-            }
-            idx += this.numHorizCells - (maxCellX - minCellX + 1);
-        }
-        return true;
-    };
-    /**
-     * Returns the x index of the grid cell where the given x coordinate falls.
-     *
-     * @param x the coordinate, in world space.
-     * @return the x index of the cell.
-     */
-    CollisionGrid.prototype.getCellX = function (x) {
-        return Math.floor((x - this.bound.loX) / this.cellWidth);
-    };
-    ;
-    /**
-     * Returns the y index of the grid cell where the given y coordinate falls.
-     *
-     * @param y the coordinate, in world space.
-     * @return the y index of the cell.
-     */
-    CollisionGrid.prototype.getCellY = function (y) {
-        return Math.floor((y - this.bound.loY) / this.cellHeight);
-    };
-    ;
-    return CollisionGrid;
-}());
-exports.CollisionGrid = CollisionGrid;
-
-},{}],12:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-/** Duration in ms for showing warning messages to the user */
-var WARNING_DURATION_MS = 10000;
-var dom = null;
-var msgId = 0;
-var numActiveMessages = 0;
-function setDomContainer(domElement) {
-    dom = domElement;
-}
-exports.setDomContainer = setDomContainer;
-/**
- * Updates the user message with the provided id.
- *
- * @param msg The message shown to the user. If null, the message is removed.
- * @param id The id of an existing message. If no id is provided, a unique id
- *     is assigned.
- * @param title The title of the notification.
- * @param isErrorMsg If true, the message is error and the dialog will have a
- *                   close button.
- * @return The id of the message.
- */
-function setModalMessage(msg, id, title, isErrorMsg) {
-    if (id === void 0) { id = null; }
-    if (title === void 0) { title = null; }
-    if (isErrorMsg === void 0) { isErrorMsg = false; }
-    if (dom == null) {
-        console.warn('Can\'t show modal message before the dom is initialized');
-        return;
-    }
-    if (id == null) {
-        id = (msgId++).toString();
-    }
-    var dialog = dom.querySelector('#notification-dialog');
-    dialog.querySelector('.close-button').style.display =
-        isErrorMsg ? null : 'none';
-    var spinner = dialog.querySelector('.progress');
-    spinner.style.display = isErrorMsg ? 'none' : null;
-    spinner.active = isErrorMsg ? null : true;
-    dialog.querySelector('#notification-title').innerHTML = title;
-    var msgsContainer = dialog.querySelector('#notify-msgs');
-    if (isErrorMsg) {
-        d3.select(msgsContainer).html('');
-    }
-    else {
-        d3.select(msgsContainer).selectAll('.error').remove();
-    }
-    var divId = "notify-msg-" + id;
-    var msgDiv = d3.select(dialog.querySelector('#' + divId));
-    var exists = msgDiv.size() > 0;
-    if (!exists) {
-        msgDiv = d3.select(msgsContainer)
-            .insert('div', ':first-child')
-            .attr('class', 'notify-msg')
-            .classed('error', isErrorMsg)
-            .attr('id', divId);
-        if (!isErrorMsg) {
-            numActiveMessages++;
-        }
-        else {
-            numActiveMessages = 0;
-        }
-    }
-    if (msg == null) {
-        numActiveMessages--;
-        if (numActiveMessages === 0) {
-            dialog.close();
-        }
-        msgDiv.remove();
-    }
-    else {
-        msgDiv.text(msg);
-        dialog.open();
-    }
-    return id;
-}
-exports.setModalMessage = setModalMessage;
-function setErrorMessage(errMsg, task) {
-    setModalMessage(errMsg, null, 'Error ' + (task != null ? task : ''), true);
-}
-exports.setErrorMessage = setErrorMessage;
-/**
- * Shows a warning message to the user for a certain amount of time.
- */
-function setWarningMessage(msg) {
-    var toast = dom.querySelector('#toast');
-    toast.text = msg;
-    toast.duration = WARNING_DURATION_MS;
-    toast.open();
-}
-exports.setWarningMessage = setWarningMessage;
-
-},{}],13:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-
-},{}],14:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var renderContext_1 = require('./renderContext');
-var scatterPlot_1 = require('./scatterPlot');
-var scatterPlotVisualizer3DLabels_1 = require('./scatterPlotVisualizer3DLabels');
-var scatterPlotVisualizerCanvasLabels_1 = require('./scatterPlotVisualizerCanvasLabels');
-var scatterPlotVisualizerSprites_1 = require('./scatterPlotVisualizerSprites');
-var scatterPlotVisualizerTraces_1 = require('./scatterPlotVisualizerTraces');
-var vector = require('./vector');
-var LABEL_FONT_SIZE = 10;
-var LABEL_SCALE_DEFAULT = 1.0;
-var LABEL_SCALE_LARGE = 2;
-var LABEL_FILL_COLOR_SELECTED = 0x000000;
-var LABEL_FILL_COLOR_HOVER = 0x000000;
-var LABEL_FILL_COLOR_NEIGHBOR = 0x000000;
-var LABEL_STROKE_COLOR_SELECTED = 0xFFFFFF;
-var LABEL_STROKE_COLOR_HOVER = 0xFFFFFF;
-var LABEL_STROKE_COLOR_NEIGHBOR = 0xFFFFFF;
-var POINT_COLOR_UNSELECTED = 0xE3E3E3;
-var POINT_COLOR_NO_SELECTION = 0x7575D9;
-var POINT_COLOR_SELECTED = 0xFA6666;
-var POINT_COLOR_HOVER = 0x760B4F;
-var POINT_SCALE_DEFAULT = 1.0;
-var POINT_SCALE_SELECTED = 1.2;
-var POINT_SCALE_NEIGHBOR = 1.2;
-var POINT_SCALE_HOVER = 1.2;
-var LABELS_3D_COLOR_UNSELECTED = 0xFFFFFF;
-var LABELS_3D_COLOR_NO_SELECTION = 0xFFFFFF;
-var SPRITE_IMAGE_COLOR_UNSELECTED = 0xFFFFFF;
-var SPRITE_IMAGE_COLOR_NO_SELECTION = 0xFFFFFF;
-var TRACE_START_HUE = 60;
-var TRACE_END_HUE = 360;
-var TRACE_SATURATION = 1;
-var TRACE_LIGHTNESS = .3;
-var TRACE_DEFAULT_OPACITY = .2;
-var TRACE_DEFAULT_LINEWIDTH = 2;
-var TRACE_SELECTED_OPACITY = .9;
-var TRACE_SELECTED_LINEWIDTH = 3;
-var TRACE_DESELECTED_OPACITY = .05;
-var SCATTER_PLOT_CUBE_LENGTH = 2;
-/** Color scale for nearest neighbors. */
-var NN_COLOR_SCALE = d3.scale.linear()
-    .domain([1, 0.7, 0.4])
-    .range(['hsl(285, 80%, 40%)', 'hsl(0, 80%, 65%)', 'hsl(40, 70%, 60%)'])
-    .clamp(true);
-/**
- * Interprets projector events and assembes the arrays and commands necessary
- * to use the ScatterPlot to render the current projected data set.
- */
-var ProjectorScatterPlotAdapter = (function () {
-    function ProjectorScatterPlotAdapter(scatterPlotContainer, projectorEventContext) {
-        var _this = this;
-        this.renderLabelsIn3D = false;
-        this.scatterPlot =
-            new scatterPlot_1.ScatterPlot(scatterPlotContainer, projectorEventContext);
-        this.scatterPlotContainer = scatterPlotContainer;
-        projectorEventContext.registerProjectionChangedListener(function (projection) {
-            _this.projection = projection;
-            _this.updateScatterPlotWithNewProjection(projection);
-        });
-        projectorEventContext.registerSelectionChangedListener(function (selectedPointIndices, neighbors) {
-            _this.selectedPointIndices = selectedPointIndices;
-            _this.neighborsOfFirstSelectedPoint = neighbors;
-            _this.updateScatterPlotPositions();
-            _this.updateScatterPlotAttributes();
-            _this.scatterPlot.render();
-        });
-        projectorEventContext.registerHoverListener(function (hoverPointIndex) {
-            _this.hoverPointIndex = hoverPointIndex;
-            _this.updateScatterPlotAttributes();
-            _this.scatterPlot.render();
-        });
-        projectorEventContext.registerDistanceMetricChangedListener(function (distanceMetric) {
-            _this.distanceMetric = distanceMetric;
-            _this.updateScatterPlotAttributes();
-            _this.scatterPlot.render();
-        });
-        this.createVisualizers(false);
-    }
-    ProjectorScatterPlotAdapter.prototype.notifyProjectionPositionsUpdated = function () {
-        this.updateScatterPlotPositions();
-        this.scatterPlot.render();
-    };
-    ProjectorScatterPlotAdapter.prototype.setDataSet = function (dataSet) {
-        if (this.projection != null) {
-            // TODO(nicholsonc): setDataSet needs to go away, the projection is the
-            // atomic unit of update.
-            this.projection.dataSet = dataSet;
-        }
-        if (this.traceVisualizer != null) {
-            this.traceVisualizer.setDataSet(dataSet);
-        }
-        if (this.labels3DVisualizer != null) {
-            this.labels3DVisualizer.setLabelStrings(this.generate3DLabelsArray(dataSet, this.labelPointAccessor));
-        }
-        if (this.spriteVisualizer == null) {
-            return;
-        }
-        this.spriteVisualizer.clearSpriteAtlas();
-        if ((dataSet == null) || (dataSet.spriteAndMetadataInfo == null)) {
-            return;
-        }
-        var metadata = dataSet.spriteAndMetadataInfo;
-        if ((metadata.spriteImage == null) || (metadata.spriteMetadata == null)) {
-            return;
-        }
-        var n = dataSet.points.length;
-        var spriteIndices = new Float32Array(n);
-        for (var i = 0; i < n; ++i) {
-            spriteIndices[i] = dataSet.points[i].index;
-        }
-        this.spriteVisualizer.setSpriteAtlas(metadata.spriteImage, metadata.spriteMetadata.singleImageDim, spriteIndices);
-    };
-    ProjectorScatterPlotAdapter.prototype.set3DLabelMode = function (renderLabelsIn3D) {
-        this.renderLabelsIn3D = renderLabelsIn3D;
-        this.createVisualizers(renderLabelsIn3D);
-        this.updateScatterPlotAttributes();
-        this.scatterPlot.render();
-    };
-    ProjectorScatterPlotAdapter.prototype.setLegendPointColorer = function (legendPointColorer) {
-        this.legendPointColorer = legendPointColorer;
-    };
-    ProjectorScatterPlotAdapter.prototype.setLabelPointAccessor = function (labelPointAccessor) {
-        this.labelPointAccessor = labelPointAccessor;
-        if (this.labels3DVisualizer != null) {
-            var ds = (this.projection == null) ? null : this.projection.dataSet;
-            this.labels3DVisualizer.setLabelStrings(this.generate3DLabelsArray(ds, labelPointAccessor));
-        }
-    };
-    ProjectorScatterPlotAdapter.prototype.resize = function () {
-        this.scatterPlot.resize();
-    };
-    ProjectorScatterPlotAdapter.prototype.populateBookmarkFromUI = function (state) {
-        state.cameraDef = this.scatterPlot.getCameraDef();
-    };
-    ProjectorScatterPlotAdapter.prototype.restoreUIFromBookmark = function (state) {
-        this.scatterPlot.setCameraParametersForNextCameraCreation(state.cameraDef, false);
-    };
-    ProjectorScatterPlotAdapter.prototype.updateScatterPlotPositions = function () {
-        var ds = (this.projection == null) ? null : this.projection.dataSet;
-        var projectionComponents = (this.projection == null) ? null : this.projection.projectionComponents;
-        var newPositions = this.generatePointPositionArray(ds, projectionComponents);
-        this.scatterPlot.setPointPositions(newPositions);
-    };
-    ProjectorScatterPlotAdapter.prototype.updateScatterPlotAttributes = function () {
-        if (this.projection == null) {
-            return;
-        }
-        var dataSet = this.projection.dataSet;
-        var selectedSet = this.selectedPointIndices;
-        var hoverIndex = this.hoverPointIndex;
-        var neighbors = this.neighborsOfFirstSelectedPoint;
-        var pointColorer = this.legendPointColorer;
-        var pointColors = this.generatePointColorArray(dataSet, pointColorer, this.distanceMetric, selectedSet, neighbors, hoverIndex, this.renderLabelsIn3D, this.getSpriteImageMode());
-        var pointScaleFactors = this.generatePointScaleFactorArray(dataSet, selectedSet, neighbors, hoverIndex);
-        var labels = this.generateVisibleLabelRenderParams(dataSet, selectedSet, neighbors, hoverIndex);
-        var traceColors = this.generateLineSegmentColorMap(dataSet, pointColorer);
-        var traceOpacities = this.generateLineSegmentOpacityArray(dataSet, selectedSet);
-        var traceWidths = this.generateLineSegmentWidthArray(dataSet, selectedSet);
-        this.scatterPlot.setPointColors(pointColors);
-        this.scatterPlot.setPointScaleFactors(pointScaleFactors);
-        this.scatterPlot.setLabels(labels);
-        this.scatterPlot.setTraceColors(traceColors);
-        this.scatterPlot.setTraceOpacities(traceOpacities);
-        this.scatterPlot.setTraceWidths(traceWidths);
-    };
-    ProjectorScatterPlotAdapter.prototype.render = function () {
-        this.scatterPlot.render();
-    };
-    ProjectorScatterPlotAdapter.prototype.generatePointPositionArray = function (ds, projectionComponents) {
-        if (ds == null) {
-            return null;
-        }
-        var xScaler = d3.scale.linear();
-        var yScaler = d3.scale.linear();
-        var zScaler = null;
-        {
-            // Determine max and min of each axis of our data.
-            var xExtent = d3.extent(ds.points, function (p, i) { return ds.points[i].projections[projectionComponents[0]]; });
-            var yExtent = d3.extent(ds.points, function (p, i) { return ds.points[i].projections[projectionComponents[1]]; });
-            var range = [-SCATTER_PLOT_CUBE_LENGTH / 2, SCATTER_PLOT_CUBE_LENGTH / 2];
-            xScaler.domain(xExtent).range(range);
-            yScaler.domain(yExtent).range(range);
-            if (projectionComponents[2] != null) {
-                var zExtent = d3.extent(ds.points, function (p, i) { return ds.points[i].projections[projectionComponents[2]]; });
-                zScaler = d3.scale.linear();
-                zScaler.domain(zExtent).range(range);
-            }
-        }
-        var positions = new Float32Array(ds.points.length * 3);
-        var dst = 0;
-        ds.points.forEach(function (d, i) {
-            positions[dst++] =
-                xScaler(ds.points[i].projections[projectionComponents[0]]);
-            positions[dst++] =
-                yScaler(ds.points[i].projections[projectionComponents[1]]);
-            positions[dst++] = 0.0;
-        });
-        if (zScaler) {
-            dst = 2;
-            ds.points.forEach(function (d, i) {
-                positions[dst] =
-                    zScaler(ds.points[i].projections[projectionComponents[2]]);
-                dst += 3;
-            });
-        }
-        return positions;
-    };
-    ProjectorScatterPlotAdapter.prototype.generateVisibleLabelRenderParams = function (ds, selectedPointIndices, neighborsOfFirstPoint, hoverPointIndex) {
-        if (ds == null) {
-            return null;
-        }
-        var selectedPointCount = (selectedPointIndices == null) ? 0 : selectedPointIndices.length;
-        var neighborCount = (neighborsOfFirstPoint == null) ? 0 : neighborsOfFirstPoint.length;
-        var n = selectedPointCount + neighborCount +
-            ((hoverPointIndex != null) ? 1 : 0);
-        var visibleLabels = new Uint32Array(n);
-        var scale = new Float32Array(n);
-        var opacityFlags = new Int8Array(n);
-        var fillColors = new Uint8Array(n * 3);
-        var strokeColors = new Uint8Array(n * 3);
-        var labelStrings = [];
-        scale.fill(LABEL_SCALE_DEFAULT);
-        opacityFlags.fill(1);
-        var dst = 0;
-        if (hoverPointIndex != null) {
-            labelStrings.push(this.getLabelText(ds, hoverPointIndex, this.labelPointAccessor));
-            visibleLabels[dst] = hoverPointIndex;
-            scale[dst] = LABEL_SCALE_LARGE;
-            opacityFlags[dst] = 0;
-            var fillRgb = styleRgbFromHexColor(LABEL_FILL_COLOR_HOVER);
-            packRgbIntoUint8Array(fillColors, dst, fillRgb[0], fillRgb[1], fillRgb[2]);
-            var strokeRgb = styleRgbFromHexColor(LABEL_STROKE_COLOR_HOVER);
-            packRgbIntoUint8Array(strokeColors, dst, strokeRgb[0], strokeRgb[1], strokeRgb[1]);
-            ++dst;
-        }
-        // Selected points
-        {
-            var n_1 = selectedPointCount;
-            var fillRgb = styleRgbFromHexColor(LABEL_FILL_COLOR_SELECTED);
-            var strokeRgb = styleRgbFromHexColor(LABEL_STROKE_COLOR_SELECTED);
-            for (var i = 0; i < n_1; ++i) {
-                var labelIndex = selectedPointIndices[i];
-                labelStrings.push(this.getLabelText(ds, labelIndex, this.labelPointAccessor));
-                visibleLabels[dst] = labelIndex;
-                scale[dst] = LABEL_SCALE_LARGE;
-                opacityFlags[dst] = (n_1 === 1) ? 0 : 1;
-                packRgbIntoUint8Array(fillColors, dst, fillRgb[0], fillRgb[1], fillRgb[2]);
-                packRgbIntoUint8Array(strokeColors, dst, strokeRgb[0], strokeRgb[1], strokeRgb[2]);
-                ++dst;
-            }
-        }
-        // Neighbors
-        {
-            var n_2 = neighborCount;
-            var fillRgb = styleRgbFromHexColor(LABEL_FILL_COLOR_NEIGHBOR);
-            var strokeRgb = styleRgbFromHexColor(LABEL_STROKE_COLOR_NEIGHBOR);
-            for (var i = 0; i < n_2; ++i) {
-                var labelIndex = neighborsOfFirstPoint[i].index;
-                labelStrings.push(this.getLabelText(ds, labelIndex, this.labelPointAccessor));
-                visibleLabels[dst] = labelIndex;
-                packRgbIntoUint8Array(fillColors, dst, fillRgb[0], fillRgb[1], fillRgb[2]);
-                packRgbIntoUint8Array(strokeColors, dst, strokeRgb[0], strokeRgb[1], strokeRgb[2]);
-                ++dst;
-            }
-        }
-        return new renderContext_1.LabelRenderParams(visibleLabels, labelStrings, scale, opacityFlags, LABEL_FONT_SIZE, fillColors, strokeColors);
-    };
-    ProjectorScatterPlotAdapter.prototype.generatePointScaleFactorArray = function (ds, selectedPointIndices, neighborsOfFirstPoint, hoverPointIndex) {
-        if (ds == null) {
-            return new Float32Array(0);
-        }
-        var scale = new Float32Array(ds.points.length);
-        scale.fill(POINT_SCALE_DEFAULT);
-        var selectedPointCount = (selectedPointIndices == null) ? 0 : selectedPointIndices.length;
-        var neighborCount = (neighborsOfFirstPoint == null) ? 0 : neighborsOfFirstPoint.length;
-        // Scale up all selected points.
-        {
-            var n = selectedPointCount;
-            for (var i = 0; i < n; ++i) {
-                var p = selectedPointIndices[i];
-                scale[p] = POINT_SCALE_SELECTED;
-            }
-        }
-        // Scale up the neighbor points.
-        {
-            var n = neighborCount;
-            for (var i = 0; i < n; ++i) {
-                var p = neighborsOfFirstPoint[i].index;
-                scale[p] = POINT_SCALE_NEIGHBOR;
-            }
-        }
-        // Scale up the hover point.
-        if (hoverPointIndex != null) {
-            scale[hoverPointIndex] = POINT_SCALE_HOVER;
-        }
-        return scale;
-    };
-    ProjectorScatterPlotAdapter.prototype.generateLineSegmentColorMap = function (ds, legendPointColorer) {
-        var traceColorArrayMap = {};
-        if (ds == null) {
-            return traceColorArrayMap;
-        }
-        for (var i = 0; i < ds.traces.length; i++) {
-            var dataTrace = ds.traces[i];
-            var colors = new Float32Array(2 * (dataTrace.pointIndices.length - 1) * 3);
-            var colorIndex = 0;
-            if (legendPointColorer) {
-                for (var j = 0; j < dataTrace.pointIndices.length - 1; j++) {
-                    var c1 = new THREE.Color(legendPointColorer(ds, dataTrace.pointIndices[j]));
-                    var c2 = new THREE.Color(legendPointColorer(ds, dataTrace.pointIndices[j + 1]));
-                    colors[colorIndex++] = c1.r;
-                    colors[colorIndex++] = c1.g;
-                    colors[colorIndex++] = c1.b;
-                    colors[colorIndex++] = c2.r;
-                    colors[colorIndex++] = c2.g;
-                    colors[colorIndex++] = c2.b;
-                }
-            }
-            else {
-                for (var j = 0; j < dataTrace.pointIndices.length - 1; j++) {
-                    var c1 = getDefaultPointInTraceColor(j, dataTrace.pointIndices.length);
-                    var c2 = getDefaultPointInTraceColor(j + 1, dataTrace.pointIndices.length);
-                    colors[colorIndex++] = c1.r;
-                    colors[colorIndex++] = c1.g;
-                    colors[colorIndex++] = c1.b;
-                    colors[colorIndex++] = c2.r;
-                    colors[colorIndex++] = c2.g;
-                    colors[colorIndex++] = c2.b;
-                }
-            }
-            traceColorArrayMap[i] = colors;
-        }
-        return traceColorArrayMap;
-    };
-    ProjectorScatterPlotAdapter.prototype.generateLineSegmentOpacityArray = function (ds, selectedPoints) {
-        if (ds == null) {
-            return new Float32Array(0);
-        }
-        var opacities = new Float32Array(ds.traces.length);
-        var selectedPointCount = (selectedPoints == null) ? 0 : selectedPoints.length;
-        if (selectedPointCount > 0) {
-            opacities.fill(TRACE_DESELECTED_OPACITY);
-            var i = ds.points[selectedPoints[0]].traceIndex;
-            opacities[i] = TRACE_SELECTED_OPACITY;
-        }
-        else {
-            opacities.fill(TRACE_DEFAULT_OPACITY);
-        }
-        return opacities;
-    };
-    ProjectorScatterPlotAdapter.prototype.generateLineSegmentWidthArray = function (ds, selectedPoints) {
-        if (ds == null) {
-            return new Float32Array(0);
-        }
-        var widths = new Float32Array(ds.traces.length);
-        widths.fill(TRACE_DEFAULT_LINEWIDTH);
-        var selectedPointCount = (selectedPoints == null) ? 0 : selectedPoints.length;
-        if (selectedPointCount > 0) {
-            var i = ds.points[selectedPoints[0]].traceIndex;
-            widths[i] = TRACE_SELECTED_LINEWIDTH;
-        }
-        return widths;
-    };
-    ProjectorScatterPlotAdapter.prototype.generatePointColorArray = function (ds, legendPointColorer, distFunc, selectedPointIndices, neighborsOfFirstPoint, hoverPointIndex, label3dMode, spriteImageMode) {
-        if (ds == null) {
-            return new Float32Array(0);
-        }
-        var selectedPointCount = (selectedPointIndices == null) ? 0 : selectedPointIndices.length;
-        var neighborCount = (neighborsOfFirstPoint == null) ? 0 : neighborsOfFirstPoint.length;
-        var colors = new Float32Array(ds.points.length * 3);
-        var unselectedColor = POINT_COLOR_UNSELECTED;
-        var noSelectionColor = POINT_COLOR_NO_SELECTION;
-        if (label3dMode) {
-            unselectedColor = LABELS_3D_COLOR_UNSELECTED;
-            noSelectionColor = LABELS_3D_COLOR_NO_SELECTION;
-        }
-        if (spriteImageMode) {
-            unselectedColor = SPRITE_IMAGE_COLOR_UNSELECTED;
-            noSelectionColor = SPRITE_IMAGE_COLOR_NO_SELECTION;
-        }
-        // Give all points the unselected color.
-        {
-            var n = ds.points.length;
-            var dst = 0;
-            if (selectedPointCount > 0) {
-                var c = new THREE.Color(unselectedColor);
-                for (var i = 0; i < n; ++i) {
-                    colors[dst++] = c.r;
-                    colors[dst++] = c.g;
-                    colors[dst++] = c.b;
-                }
-            }
-            else {
-                if (legendPointColorer != null) {
-                    for (var i = 0; i < n; ++i) {
-                        var c = new THREE.Color(legendPointColorer(ds, i));
-                        colors[dst++] = c.r;
-                        colors[dst++] = c.g;
-                        colors[dst++] = c.b;
-                    }
-                }
-                else {
-                    var c = new THREE.Color(noSelectionColor);
-                    for (var i = 0; i < n; ++i) {
-                        colors[dst++] = c.r;
-                        colors[dst++] = c.g;
-                        colors[dst++] = c.b;
-                    }
-                }
-            }
-        }
-        // Color the selected points.
-        {
-            var n = selectedPointCount;
-            var c = new THREE.Color(POINT_COLOR_SELECTED);
-            for (var i = 0; i < n; ++i) {
-                var dst = selectedPointIndices[i] * 3;
-                colors[dst++] = c.r;
-                colors[dst++] = c.g;
-                colors[dst++] = c.b;
-            }
-        }
-        // Color the neighbors.
-        {
-            var n = neighborCount;
-            var minDist = n > 0 ? neighborsOfFirstPoint[0].dist : 0;
-            for (var i = 0; i < n; ++i) {
-                var c = new THREE.Color(dist2color(distFunc, neighborsOfFirstPoint[i].dist, minDist));
-                var dst = neighborsOfFirstPoint[i].index * 3;
-                colors[dst++] = c.r;
-                colors[dst++] = c.g;
-                colors[dst++] = c.b;
-            }
-        }
-        // Color the hover point.
-        if (hoverPointIndex != null) {
-            var c = new THREE.Color(POINT_COLOR_HOVER);
-            var dst = hoverPointIndex * 3;
-            colors[dst++] = c.r;
-            colors[dst++] = c.g;
-            colors[dst++] = c.b;
-        }
-        return colors;
-    };
-    ProjectorScatterPlotAdapter.prototype.generate3DLabelsArray = function (ds, accessor) {
-        if ((ds == null) || (accessor == null)) {
-            return null;
-        }
-        var labels = [];
-        var n = ds.points.length;
-        for (var i = 0; i < n; ++i) {
-            labels.push(this.getLabelText(ds, i, accessor));
-        }
-        return labels;
-    };
-    ProjectorScatterPlotAdapter.prototype.getLabelText = function (ds, i, accessor) {
-        return ds.points[i].metadata[accessor].toString();
-    };
-    ProjectorScatterPlotAdapter.prototype.updateScatterPlotWithNewProjection = function (projection) {
-        if (projection == null) {
-            this.createVisualizers(this.renderLabelsIn3D);
-            this.scatterPlot.render();
-            return;
-        }
-        this.setDataSet(projection.dataSet);
-        this.scatterPlot.setDimensions(projection.dimensionality);
-        if (projection.dataSet.projectionCanBeRendered(projection.projectionType)) {
-            this.updateScatterPlotAttributes();
-            this.notifyProjectionPositionsUpdated();
-        }
-        this.scatterPlot.setCameraParametersForNextCameraCreation(null, false);
-    };
-    ProjectorScatterPlotAdapter.prototype.createVisualizers = function (inLabels3DMode) {
-        var ds = (this.projection == null) ? null : this.projection.dataSet;
-        var scatterPlot = this.scatterPlot;
-        scatterPlot.removeAllVisualizers();
-        this.labels3DVisualizer = null;
-        this.canvasLabelsVisualizer = null;
-        this.spriteVisualizer = null;
-        this.traceVisualizer = null;
-        if (inLabels3DMode) {
-            this.labels3DVisualizer = new scatterPlotVisualizer3DLabels_1.ScatterPlotVisualizer3DLabels();
-            this.labels3DVisualizer.setLabelStrings(this.generate3DLabelsArray(ds, this.labelPointAccessor));
-        }
-        else {
-            this.spriteVisualizer = new scatterPlotVisualizerSprites_1.ScatterPlotVisualizerSprites();
-            scatterPlot.addVisualizer(this.spriteVisualizer);
-            this.canvasLabelsVisualizer =
-                new scatterPlotVisualizerCanvasLabels_1.ScatterPlotVisualizerCanvasLabels(this.scatterPlotContainer);
-        }
-        this.traceVisualizer = new scatterPlotVisualizerTraces_1.ScatterPlotVisualizerTraces();
-        this.setDataSet(ds);
-        if (this.spriteVisualizer) {
-            scatterPlot.addVisualizer(this.spriteVisualizer);
-        }
-        if (this.labels3DVisualizer) {
-            scatterPlot.addVisualizer(this.labels3DVisualizer);
-        }
-        if (this.canvasLabelsVisualizer) {
-            scatterPlot.addVisualizer(this.canvasLabelsVisualizer);
-        }
-        scatterPlot.addVisualizer(this.traceVisualizer);
-    };
-    ProjectorScatterPlotAdapter.prototype.getSpriteImageMode = function () {
-        if (this.projection == null) {
-            return false;
-        }
-        var ds = this.projection.dataSet;
-        if ((ds == null) || (ds.spriteAndMetadataInfo == null)) {
-            return false;
-        }
-        return ds.spriteAndMetadataInfo.spriteImage != null;
-    };
-    return ProjectorScatterPlotAdapter;
-}());
-exports.ProjectorScatterPlotAdapter = ProjectorScatterPlotAdapter;
-function packRgbIntoUint8Array(rgbArray, labelIndex, r, g, b) {
-    rgbArray[labelIndex * 3] = r;
-    rgbArray[labelIndex * 3 + 1] = g;
-    rgbArray[labelIndex * 3 + 2] = b;
-}
-function styleRgbFromHexColor(hex) {
-    var c = new THREE.Color(hex);
-    return [(c.r * 255) | 0, (c.g * 255) | 0, (c.b * 255) | 0];
-}
-function getDefaultPointInTraceColor(index, totalPoints) {
-    var hue = TRACE_START_HUE + (TRACE_END_HUE - TRACE_START_HUE) * index / totalPoints;
-    var rgb = d3.hsl(hue, TRACE_SATURATION, TRACE_LIGHTNESS).rgb();
-    return new THREE.Color(rgb.r / 255, rgb.g / 255, rgb.b / 255);
-}
-/**
- * Normalizes the distance so it can be visually encoded with color.
- * The normalization depends on the distance metric (cosine vs euclidean).
- */
-function normalizeDist(distFunc, d, minDist) {
-    return (distFunc === vector.dist) ? (minDist / d) : (1 - d);
-}
-exports.normalizeDist = normalizeDist;
-/** Normalizes and encodes the provided distance with color. */
-function dist2color(distFunc, d, minDist) {
-    return NN_COLOR_SCALE(normalizeDist(distFunc, d, minDist));
-}
-exports.dist2color = dist2color;
-
-},{"./renderContext":15,"./scatterPlot":16,"./scatterPlotVisualizer3DLabels":19,"./scatterPlotVisualizerCanvasLabels":20,"./scatterPlotVisualizerSprites":21,"./scatterPlotVisualizerTraces":22,"./vector":25}],15:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http:www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-/**
- * LabelRenderParams describes the set of points that should have labels
- * rendered next to them.
- */
-var LabelRenderParams = (function () {
-    function LabelRenderParams(pointIndices, labelStrings, scaleFactors, useSceneOpacityFlags, defaultFontSize, fillColors, strokeColors) {
-        this.pointIndices = pointIndices;
-        this.labelStrings = labelStrings;
-        this.scaleFactors = scaleFactors;
-        this.useSceneOpacityFlags = useSceneOpacityFlags;
-        this.defaultFontSize = defaultFontSize;
-        this.fillColors = fillColors;
-        this.strokeColors = strokeColors;
-    }
-    return LabelRenderParams;
-}());
-exports.LabelRenderParams = LabelRenderParams;
-/** Details about the camera projection being used to render the scene. */
-(function (CameraType) {
-    CameraType[CameraType["Perspective"] = 0] = "Perspective";
-    CameraType[CameraType["Orthographic"] = 1] = "Orthographic";
-})(exports.CameraType || (exports.CameraType = {}));
-var CameraType = exports.CameraType;
-/**
- * RenderContext contains all of the state required to color and render the data
- * set. ScatterPlot passes this to every attached visualizer as part of the
- * render callback.
- * TODO(nicholsonc): This should only contain the data that's changed between
- * each frame. Data like colors / scale factors / labels should be reapplied
- * only when they change.
- */
-var RenderContext = (function () {
-    function RenderContext(camera, cameraType, cameraTarget, screenWidth, screenHeight, nearestCameraSpacePointZ, farthestCameraSpacePointZ, backgroundColor, pointColors, pointScaleFactors, labels, traceColors, traceOpacities, traceWidths) {
-        this.camera = camera;
-        this.cameraType = cameraType;
-        this.cameraTarget = cameraTarget;
-        this.screenWidth = screenWidth;
-        this.screenHeight = screenHeight;
-        this.nearestCameraSpacePointZ = nearestCameraSpacePointZ;
-        this.farthestCameraSpacePointZ = farthestCameraSpacePointZ;
-        this.backgroundColor = backgroundColor;
-        this.pointColors = pointColors;
-        this.pointScaleFactors = pointScaleFactors;
-        this.labels = labels;
-        this.traceColors = traceColors;
-        this.traceOpacities = traceOpacities;
-        this.traceWidths = traceWidths;
-    }
-    return RenderContext;
-}());
-exports.RenderContext = RenderContext;
-
-},{}],16:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var renderContext_1 = require('./renderContext');
-var scatterPlotRectangleSelector_1 = require('./scatterPlotRectangleSelector');
-var util = require('./util');
-var BACKGROUND_COLOR = 0xffffff;
-/**
- * The length of the cube (diameter of the circumscribing sphere) where all the
- * points live.
- */
-var CUBE_LENGTH = 2;
-var MAX_ZOOM = 5 * CUBE_LENGTH;
-var MIN_ZOOM = 0.025 * CUBE_LENGTH;
-// Constants relating to the camera parameters.
-var PERSP_CAMERA_FOV_VERTICAL = 70;
-var PERSP_CAMERA_NEAR_CLIP_PLANE = 0.01;
-var PERSP_CAMERA_FAR_CLIP_PLANE = 100;
-var ORTHO_CAMERA_FRUSTUM_HALF_EXTENT = 1.2;
-// Key presses.
-var SHIFT_KEY = 16;
-var CTRL_KEY = 17;
-var START_CAMERA_POS_3D = new THREE.Vector3(0.45, 0.9, 1.6);
-var START_CAMERA_TARGET_3D = new THREE.Vector3(0, 0, 0);
-var START_CAMERA_POS_2D = new THREE.Vector3(0, 0, 4);
-var START_CAMERA_TARGET_2D = new THREE.Vector3(0, 0, 0);
-var ORBIT_MOUSE_ROTATION_SPEED = 1;
-var ORBIT_ANIMATION_ROTATION_CYCLE_IN_SECONDS = 7;
-/** Supported modes of interaction. */
-(function (MouseMode) {
-    MouseMode[MouseMode["AREA_SELECT"] = 0] = "AREA_SELECT";
-    MouseMode[MouseMode["CAMERA_AND_CLICK_SELECT"] = 1] = "CAMERA_AND_CLICK_SELECT";
-})(exports.MouseMode || (exports.MouseMode = {}));
-var MouseMode = exports.MouseMode;
-/** Defines a camera, suitable for serialization. */
-var CameraDef = (function () {
-    function CameraDef() {
-        this.orthographic = false;
-    }
-    return CameraDef;
-}());
-exports.CameraDef = CameraDef;
-/**
- * Maintains a three.js instantiation and context,
- * animation state, and all other logic that's
- * independent of how a 3D scatter plot is actually rendered. Also holds an
- * array of visualizers and dispatches application events to them.
- */
-var ScatterPlot = (function () {
-    function ScatterPlot(container, projectorEventContext) {
-        var _this = this;
-        this.visualizers = [];
-        this.onCameraMoveListeners = [];
-        this.backgroundColor = BACKGROUND_COLOR;
-        this.dimensionality = 3;
-        this.cameraDef = null;
-        this.orbitAnimationOnNextCameraCreation = false;
-        this.selecting = false;
-        this.mouseIsDown = false;
-        this.isDragSequence = false;
-        this.containerNode = container.node();
-        this.projectorEventContext = projectorEventContext;
-        this.getLayoutValues();
-        this.scene = new THREE.Scene();
-        this.renderer = new THREE.WebGLRenderer({ alpha: true, premultipliedAlpha: false, antialias: false });
-        this.renderer.setClearColor(BACKGROUND_COLOR, 1);
-        this.containerNode.appendChild(this.renderer.domElement);
-        this.light = new THREE.PointLight(0xFFECBF, 1, 0);
-        this.scene.add(this.light);
-        this.setDimensions(3);
-        this.recreateCamera(this.makeDefaultCameraDef(this.dimensionality));
-        this.renderer.render(this.scene, this.camera);
-        this.rectangleSelector = new scatterPlotRectangleSelector_1.ScatterPlotRectangleSelector(this.containerNode, function (boundingBox) { return _this.selectBoundingBox(boundingBox); });
-        this.addInteractionListeners();
-    }
-    ScatterPlot.prototype.addInteractionListeners = function () {
-        this.containerNode.addEventListener('mousemove', this.onMouseMove.bind(this));
-        this.containerNode.addEventListener('mousedown', this.onMouseDown.bind(this));
-        this.containerNode.addEventListener('mouseup', this.onMouseUp.bind(this));
-        this.containerNode.addEventListener('click', this.onClick.bind(this));
-        window.addEventListener('keydown', this.onKeyDown.bind(this), false);
-        window.addEventListener('keyup', this.onKeyUp.bind(this), false);
-    };
-    ScatterPlot.prototype.addCameraControlsEventListeners = function (cameraControls) {
-        var _this = this;
-        // Start is called when the user stars interacting with
-        // controls.
-        cameraControls.addEventListener('start', function () {
-            _this.stopOrbitAnimation();
-            _this.onCameraMoveListeners.forEach(function (l) { return l(_this.camera.position, cameraControls.target); });
-        });
-        // Change is called everytime the user interacts with the controls.
-        cameraControls.addEventListener('change', function () {
-            _this.render();
-        });
-        // End is called when the user stops interacting with the
-        // controls (e.g. on mouse up, after dragging).
-        cameraControls.addEventListener('end', function () { });
-    };
-    ScatterPlot.prototype.makeOrbitControls = function (camera, cameraDef, cameraIs3D) {
-        if (this.orbitCameraControls != null) {
-            this.orbitCameraControls.dispose();
-        }
-        var occ = new THREE.OrbitControls(camera, this.renderer.domElement);
-        occ.target0 = new THREE.Vector3(cameraDef.target[0], cameraDef.target[1], cameraDef.target[2]);
-        occ.position0 = new THREE.Vector3().copy(camera.position);
-        occ.zoom0 = cameraDef.zoom;
-        occ.enableRotate = cameraIs3D;
-        occ.autoRotate = false;
-        occ.rotateSpeed = ORBIT_MOUSE_ROTATION_SPEED;
-        if (cameraIs3D) {
-            occ.mouseButtons.ORBIT = THREE.MOUSE.LEFT;
-            occ.mouseButtons.PAN = THREE.MOUSE.RIGHT;
-        }
-        else {
-            occ.mouseButtons.ORBIT = null;
-            occ.mouseButtons.PAN = THREE.MOUSE.LEFT;
-        }
-        occ.reset();
-        this.camera = camera;
-        this.orbitCameraControls = occ;
-        this.addCameraControlsEventListeners(this.orbitCameraControls);
-    };
-    ScatterPlot.prototype.makeCamera3D = function (cameraDef, w, h) {
-        var camera;
-        {
-            var aspectRatio = w / h;
-            camera = new THREE.PerspectiveCamera(PERSP_CAMERA_FOV_VERTICAL, aspectRatio, PERSP_CAMERA_NEAR_CLIP_PLANE, PERSP_CAMERA_FAR_CLIP_PLANE);
-            camera.position.set(cameraDef.position[0], cameraDef.position[1], cameraDef.position[2]);
-            var at = new THREE.Vector3(cameraDef.target[0], cameraDef.target[1], cameraDef.target[2]);
-            camera.lookAt(at);
-            camera.zoom = cameraDef.zoom;
-            camera.updateProjectionMatrix();
-        }
-        this.camera = camera;
-        this.makeOrbitControls(camera, cameraDef, true);
-    };
-    ScatterPlot.prototype.makeCamera2D = function (cameraDef, w, h) {
-        var camera;
-        var target = new THREE.Vector3(cameraDef.target[0], cameraDef.target[1], cameraDef.target[2]);
-        {
-            var aspectRatio = w / h;
-            var left = -ORTHO_CAMERA_FRUSTUM_HALF_EXTENT;
-            var right = ORTHO_CAMERA_FRUSTUM_HALF_EXTENT;
-            var bottom = -ORTHO_CAMERA_FRUSTUM_HALF_EXTENT;
-            var top_1 = ORTHO_CAMERA_FRUSTUM_HALF_EXTENT;
-            // Scale up the larger of (w, h) to match the aspect ratio.
-            if (aspectRatio > 1) {
-                left *= aspectRatio;
-                right *= aspectRatio;
-            }
-            else {
-                top_1 /= aspectRatio;
-                bottom /= aspectRatio;
-            }
-            camera =
-                new THREE.OrthographicCamera(left, right, top_1, bottom, -1000, 1000);
-            camera.position.set(cameraDef.position[0], cameraDef.position[1], cameraDef.position[2]);
-            camera.up = new THREE.Vector3(0, 1, 0);
-            camera.lookAt(target);
-            camera.zoom = cameraDef.zoom;
-            camera.updateProjectionMatrix();
-        }
-        this.camera = camera;
-        this.makeOrbitControls(camera, cameraDef, false);
-    };
-    ScatterPlot.prototype.makeDefaultCameraDef = function (dimensionality) {
-        var def = new CameraDef();
-        def.orthographic = (dimensionality === 2);
-        def.zoom = 1.0;
-        if (def.orthographic) {
-            def.position =
-                [START_CAMERA_POS_2D.x, START_CAMERA_POS_2D.y, START_CAMERA_POS_2D.z];
-            def.target = [
-                START_CAMERA_TARGET_2D.x, START_CAMERA_TARGET_2D.y,
-                START_CAMERA_TARGET_2D.z
-            ];
-        }
-        else {
-            def.position =
-                [START_CAMERA_POS_3D.x, START_CAMERA_POS_3D.y, START_CAMERA_POS_3D.z];
-            def.target = [
-                START_CAMERA_TARGET_3D.x, START_CAMERA_TARGET_3D.y,
-                START_CAMERA_TARGET_3D.z
-            ];
-        }
-        return def;
-    };
-    /** Recreate the scatter plot camera from a definition structure. */
-    ScatterPlot.prototype.recreateCamera = function (cameraDef) {
-        if (cameraDef.orthographic) {
-            this.makeCamera2D(cameraDef, this.width, this.height);
-        }
-        else {
-            this.makeCamera3D(cameraDef, this.width, this.height);
-        }
-        this.orbitCameraControls.minDistance = MIN_ZOOM;
-        this.orbitCameraControls.maxDistance = MAX_ZOOM;
-        this.orbitCameraControls.update();
-        if (this.orbitAnimationOnNextCameraCreation) {
-            this.startOrbitAnimation();
-        }
-    };
-    ScatterPlot.prototype.onClick = function (e, notify) {
-        if (notify === void 0) { notify = true; }
-        if (e && this.selecting) {
-            return;
-        }
-        // Only call event handlers if the click originated from the scatter plot.
-        if (!this.isDragSequence && notify) {
-            var selection = (this.nearestPoint != null) ? [this.nearestPoint] : [];
-            this.projectorEventContext.notifySelectionChanged(selection);
-        }
-        this.isDragSequence = false;
-        this.render();
-    };
-    ScatterPlot.prototype.onMouseDown = function (e) {
-        this.isDragSequence = false;
-        this.mouseIsDown = true;
-        if (this.selecting) {
-            this.orbitCameraControls.enabled = false;
-            this.rectangleSelector.onMouseDown(e.offsetX, e.offsetY);
-            this.setNearestPointToMouse(e);
-        }
-        else if (!e.ctrlKey && this.sceneIs3D() &&
-            this.orbitCameraControls.mouseButtons.ORBIT === THREE.MOUSE.RIGHT) {
-            // The user happened to press the ctrl key when the tab was active,
-            // unpressed the ctrl when the tab was inactive, and now he/she
-            // is back to the projector tab.
-            this.orbitCameraControls.mouseButtons.ORBIT = THREE.MOUSE.LEFT;
-            this.orbitCameraControls.mouseButtons.PAN = THREE.MOUSE.RIGHT;
-        }
-        else if (e.ctrlKey && this.sceneIs3D() &&
-            this.orbitCameraControls.mouseButtons.ORBIT === THREE.MOUSE.LEFT) {
-            // Similarly to the situation above.
-            this.orbitCameraControls.mouseButtons.ORBIT = THREE.MOUSE.RIGHT;
-            this.orbitCameraControls.mouseButtons.PAN = THREE.MOUSE.LEFT;
-        }
-    };
-    /** When we stop dragging/zooming, return to normal behavior. */
-    ScatterPlot.prototype.onMouseUp = function (e) {
-        if (this.selecting) {
-            this.orbitCameraControls.enabled = true;
-            this.rectangleSelector.onMouseUp();
-            this.render();
-        }
-        this.mouseIsDown = false;
-    };
-    /**
-     * When the mouse moves, find the nearest point (if any) and send it to the
-     * hoverlisteners (usually called from embedding.ts)
-     */
-    ScatterPlot.prototype.onMouseMove = function (e) {
-        this.isDragSequence = this.mouseIsDown;
-        // Depending if we're selecting or just navigating, handle accordingly.
-        if (this.selecting && this.mouseIsDown) {
-            this.rectangleSelector.onMouseMove(e.offsetX, e.offsetY);
-            this.render();
-        }
-        else if (!this.mouseIsDown) {
-            this.setNearestPointToMouse(e);
-            this.projectorEventContext.notifyHoverOverPoint(this.nearestPoint);
-        }
-    };
-    /** For using ctrl + left click as right click, and for circle select */
-    ScatterPlot.prototype.onKeyDown = function (e) {
-        // If ctrl is pressed, use left click to orbit
-        if (e.keyCode === CTRL_KEY && this.sceneIs3D()) {
-            this.orbitCameraControls.mouseButtons.ORBIT = THREE.MOUSE.RIGHT;
-            this.orbitCameraControls.mouseButtons.PAN = THREE.MOUSE.LEFT;
-        }
-        // If shift is pressed, start selecting
-        if (e.keyCode === SHIFT_KEY) {
-            this.selecting = true;
-            this.containerNode.style.cursor = 'crosshair';
-        }
-    };
-    /** For using ctrl + left click as right click, and for circle select */
-    ScatterPlot.prototype.onKeyUp = function (e) {
-        if (e.keyCode === CTRL_KEY && this.sceneIs3D()) {
-            this.orbitCameraControls.mouseButtons.ORBIT = THREE.MOUSE.LEFT;
-            this.orbitCameraControls.mouseButtons.PAN = THREE.MOUSE.RIGHT;
-        }
-        // If shift is released, stop selecting
-        if (e.keyCode === SHIFT_KEY) {
-            this.selecting = (this.getMouseMode() === MouseMode.AREA_SELECT);
-            if (!this.selecting) {
-                this.containerNode.style.cursor = 'default';
-            }
-            this.render();
-        }
-    };
-    /**
-     * Returns a list of indices of points in a bounding box from the picking
-     * texture.
-     * @param boundingBox The bounding box to select from.
-     */
-    ScatterPlot.prototype.getPointIndicesFromPickingTexture = function (boundingBox) {
-        if (this.worldSpacePointPositions == null) {
-            return null;
-        }
-        var pointCount = this.worldSpacePointPositions.length / 3;
-        var dpr = window.devicePixelRatio || 1;
-        var x = Math.floor(boundingBox.x * dpr);
-        var y = Math.floor(boundingBox.y * dpr);
-        var width = Math.floor(boundingBox.width * dpr);
-        var height = Math.floor(boundingBox.height * dpr);
-        // Create buffer for reading all of the pixels from the texture.
-        var pixelBuffer = new Uint8Array(width * height * 4);
-        // Read the pixels from the bounding box.
-        this.renderer.readRenderTargetPixels(this.pickingTexture, x, this.pickingTexture.height - y, width, height, pixelBuffer);
-        // Keep a flat list of each point and whether they are selected or not. This
-        // approach is more efficient than using an object keyed by the index.
-        var pointIndicesSelection = new Uint8Array(this.worldSpacePointPositions.length);
-        for (var i = 0; i < width * height; i++) {
-            var id = (pixelBuffer[i * 4] << 16) | (pixelBuffer[i * 4 + 1] << 8) |
-                pixelBuffer[i * 4 + 2];
-            if (id !== 0xffffff && (id < pointCount)) {
-                pointIndicesSelection[id] = 1;
-            }
-        }
-        var pointIndices = [];
-        for (var i = 0; i < pointIndicesSelection.length; i++) {
-            if (pointIndicesSelection[i] === 1) {
-                pointIndices.push(i);
-            }
-        }
-        return pointIndices;
-    };
-    ScatterPlot.prototype.selectBoundingBox = function (boundingBox) {
-        var pointIndices = this.getPointIndicesFromPickingTexture(boundingBox);
-        this.projectorEventContext.notifySelectionChanged(pointIndices);
-    };
-    ScatterPlot.prototype.setNearestPointToMouse = function (e) {
-        if (this.pickingTexture == null) {
-            this.nearestPoint = null;
-            return;
-        }
-        var boundingBox = { x: e.offsetX, y: e.offsetY, width: 1, height: 1 };
-        var pointIndices = this.getPointIndicesFromPickingTexture(boundingBox);
-        this.nearestPoint = (pointIndices != null) ? pointIndices[0] : null;
-    };
-    ScatterPlot.prototype.getLayoutValues = function () {
-        this.width = this.containerNode.offsetWidth;
-        this.height = Math.max(1, this.containerNode.offsetHeight);
-        return [this.width, this.height];
-    };
-    ScatterPlot.prototype.sceneIs3D = function () {
-        return this.dimensionality === 3;
-    };
-    ScatterPlot.prototype.remove3dAxisFromScene = function () {
-        var axes = this.scene.getObjectByName('axes');
-        if (axes != null) {
-            this.scene.remove(axes);
-        }
-        return axes;
-    };
-    ScatterPlot.prototype.add3dAxis = function () {
-        var axes = new THREE.AxisHelper();
-        axes.name = 'axes';
-        this.scene.add(axes);
-    };
-    /** Set 2d vs 3d mode. */
-    ScatterPlot.prototype.setDimensions = function (dimensionality) {
-        if ((dimensionality !== 2) && (dimensionality !== 3)) {
-            throw new RangeError('dimensionality must be 2 or 3');
-        }
-        this.dimensionality = dimensionality;
-        var def = this.cameraDef || this.makeDefaultCameraDef(dimensionality);
-        this.recreateCamera(def);
-        this.remove3dAxisFromScene();
-        if (dimensionality === 3) {
-            this.add3dAxis();
-        }
-    };
-    /** Gets the current camera information, suitable for serialization. */
-    ScatterPlot.prototype.getCameraDef = function () {
-        var def = new CameraDef();
-        var pos = this.camera.position;
-        var tgt = this.orbitCameraControls.target;
-        def.orthographic = !this.sceneIs3D();
-        def.position = [pos.x, pos.y, pos.z];
-        def.target = [tgt.x, tgt.y, tgt.z];
-        def.zoom = this.camera.zoom;
-        return def;
-    };
-    /** Sets parameters for the next camera recreation. */
-    ScatterPlot.prototype.setCameraParametersForNextCameraCreation = function (def, orbitAnimation) {
-        this.cameraDef = def;
-        this.orbitAnimationOnNextCameraCreation = orbitAnimation;
-    };
-    /** Gets the current camera position. */
-    ScatterPlot.prototype.getCameraPosition = function () {
-        var currPos = this.camera.position;
-        return [currPos.x, currPos.y, currPos.z];
-    };
-    /** Gets the current camera target. */
-    ScatterPlot.prototype.getCameraTarget = function () {
-        var currTarget = this.orbitCameraControls.target;
-        return [currTarget.x, currTarget.y, currTarget.z];
-    };
-    /** Sets up the camera from given position and target coordinates. */
-    ScatterPlot.prototype.setCameraPositionAndTarget = function (position, target) {
-        this.stopOrbitAnimation();
-        this.camera.position.set(position[0], position[1], position[2]);
-        this.orbitCameraControls.target.set(target[0], target[1], target[2]);
-        this.orbitCameraControls.update();
-        this.render();
-    };
-    /** Starts orbiting the camera around its current lookat target. */
-    ScatterPlot.prototype.startOrbitAnimation = function () {
-        if (!this.sceneIs3D()) {
-            return;
-        }
-        if (this.orbitAnimationId != null) {
-            this.stopOrbitAnimation();
-        }
-        this.orbitCameraControls.autoRotate = true;
-        this.orbitCameraControls.rotateSpeed =
-            ORBIT_ANIMATION_ROTATION_CYCLE_IN_SECONDS;
-        this.updateOrbitAnimation();
-    };
-    ScatterPlot.prototype.updateOrbitAnimation = function () {
-        var _this = this;
-        this.orbitCameraControls.update();
-        this.orbitAnimationId =
-            requestAnimationFrame(function () { return _this.updateOrbitAnimation(); });
-    };
-    /** Stops the orbiting animation on the camera. */
-    ScatterPlot.prototype.stopOrbitAnimation = function () {
-        this.orbitCameraControls.autoRotate = false;
-        this.orbitCameraControls.rotateSpeed = ORBIT_MOUSE_ROTATION_SPEED;
-        if (this.orbitAnimationId != null) {
-            cancelAnimationFrame(this.orbitAnimationId);
-            this.orbitAnimationId = null;
-        }
-    };
-    /** Adds a visualizer to the set, will start dispatching events to it */
-    ScatterPlot.prototype.addVisualizer = function (visualizer) {
-        if (this.scene) {
-            visualizer.setScene(this.scene);
-        }
-        visualizer.onResize(this.width, this.height);
-        visualizer.onPointPositionsChanged(this.worldSpacePointPositions);
-        this.visualizers.push(visualizer);
-    };
-    /** Removes all visualizers attached to this scatter plot. */
-    ScatterPlot.prototype.removeAllVisualizers = function () {
-        this.visualizers.forEach(function (v) { return v.dispose(); });
-        this.visualizers = [];
-    };
-    /** Update scatter plot with a new array of packed xyz point positions. */
-    ScatterPlot.prototype.setPointPositions = function (worldSpacePointPositions) {
-        this.worldSpacePointPositions = worldSpacePointPositions;
-        this.visualizers.forEach(function (v) { return v.onPointPositionsChanged(worldSpacePointPositions); });
-    };
-    ScatterPlot.prototype.render = function () {
-        {
-            var lightPos = this.camera.position.clone();
-            lightPos.x += 1;
-            lightPos.y += 1;
-            this.light.position.set(lightPos.x, lightPos.y, lightPos.z);
-        }
-        var cameraType = (this.camera instanceof THREE.PerspectiveCamera) ?
-            renderContext_1.CameraType.Perspective :
-            renderContext_1.CameraType.Orthographic;
-        var cameraSpacePointExtents = [0, 0];
-        if (this.worldSpacePointPositions != null) {
-            cameraSpacePointExtents = util.getNearFarPoints(this.worldSpacePointPositions, this.camera.position, this.orbitCameraControls.target);
-        }
-        var rc = new renderContext_1.RenderContext(this.camera, cameraType, this.orbitCameraControls.target, this.width, this.height, cameraSpacePointExtents[0], cameraSpacePointExtents[1], this.backgroundColor, this.pointColors, this.pointScaleFactors, this.labels, this.traceColors, this.traceOpacities, this.traceWidths);
-        // Render first pass to picking target. This render fills pickingTexture
-        // with colors that are actually point ids, so that sampling the texture at
-        // the mouse's current x,y coordinates will reveal the data point that the
-        // mouse is over.
-        this.visualizers.forEach(function (v) { return v.onPickingRender(rc); });
-        {
-            var axes = this.remove3dAxisFromScene();
-            this.renderer.render(this.scene, this.camera, this.pickingTexture);
-            if (axes != null) {
-                this.scene.add(axes);
-            }
-        }
-        // Render second pass to color buffer, to be displayed on the canvas.
-        this.visualizers.forEach(function (v) { return v.onRender(rc); });
-        this.renderer.render(this.scene, this.camera);
-    };
-    ScatterPlot.prototype.setMouseMode = function (mouseMode) {
-        this.mouseMode = mouseMode;
-        if (mouseMode === MouseMode.AREA_SELECT) {
-            this.selecting = true;
-            this.containerNode.style.cursor = 'crosshair';
-        }
-        else {
-            this.selecting = false;
-            this.containerNode.style.cursor = 'default';
-        }
-    };
-    /** Set the colors for every data point. (RGB triplets) */
-    ScatterPlot.prototype.setPointColors = function (colors) {
-        this.pointColors = colors;
-    };
-    /** Set the scale factors for every data point. (scalars) */
-    ScatterPlot.prototype.setPointScaleFactors = function (scaleFactors) {
-        this.pointScaleFactors = scaleFactors;
-    };
-    /** Set the labels to rendered */
-    ScatterPlot.prototype.setLabels = function (labels) {
-        this.labels = labels;
-    };
-    /** Set the colors for every data trace. (RGB triplets) */
-    ScatterPlot.prototype.setTraceColors = function (colors) {
-        this.traceColors = colors;
-    };
-    ScatterPlot.prototype.setTraceOpacities = function (opacities) {
-        this.traceOpacities = opacities;
-    };
-    ScatterPlot.prototype.setTraceWidths = function (widths) {
-        this.traceWidths = widths;
-    };
-    ScatterPlot.prototype.getMouseMode = function () {
-        return this.mouseMode;
-    };
-    ScatterPlot.prototype.resetZoom = function () {
-        this.recreateCamera(this.makeDefaultCameraDef(this.dimensionality));
-        this.render();
-    };
-    ScatterPlot.prototype.setDayNightMode = function (isNight) {
-        d3.select(this.containerNode)
-            .selectAll('canvas')
-            .style('filter', isNight ? 'invert(100%)' : null);
-    };
-    ScatterPlot.prototype.resize = function (render) {
-        if (render === void 0) { render = true; }
-        var _a = [this.width, this.height], oldW = _a[0], oldH = _a[1];
-        var _b = this.getLayoutValues(), newW = _b[0], newH = _b[1];
-        if (this.dimensionality === 3) {
-            var camera = this.camera;
-            camera.aspect = newW / newH;
-            camera.updateProjectionMatrix();
-        }
-        else {
-            var camera = this.camera;
-            // Scale the ortho frustum by however much the window changed.
-            var scaleW = newW / oldW;
-            var scaleH = newH / oldH;
-            var newCamHalfWidth = ((camera.right - camera.left) * scaleW) / 2;
-            var newCamHalfHeight = ((camera.top - camera.bottom) * scaleH) / 2;
-            camera.top = newCamHalfHeight;
-            camera.bottom = -newCamHalfHeight;
-            camera.left = -newCamHalfWidth;
-            camera.right = newCamHalfWidth;
-            camera.updateProjectionMatrix();
-        }
-        // Accouting for retina displays.
-        var dpr = window.devicePixelRatio || 1;
-        this.renderer.setPixelRatio(dpr);
-        this.renderer.setSize(newW, newH);
-        // the picking texture needs to be exactly the same as the render texture.
-        {
-            var renderCanvasSize = this.renderer.getSize();
-            var pixelRatio = this.renderer.getPixelRatio();
-            this.pickingTexture = new THREE.WebGLRenderTarget(renderCanvasSize.width * pixelRatio, renderCanvasSize.height * pixelRatio);
-            this.pickingTexture.texture.minFilter = THREE.LinearFilter;
-        }
-        this.visualizers.forEach(function (v) { return v.onResize(newW, newH); });
-        if (render) {
-            this.render();
-        }
-        ;
-    };
-    ScatterPlot.prototype.onCameraMove = function (listener) {
-        this.onCameraMoveListeners.push(listener);
-    };
-    ScatterPlot.prototype.clickOnPoint = function (pointIndex) {
-        this.nearestPoint = pointIndex;
-        this.onClick(null, false);
-    };
-    return ScatterPlot;
-}());
-exports.ScatterPlot = ScatterPlot;
-
-},{"./renderContext":15,"./scatterPlotRectangleSelector":17,"./util":24}],17:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var FILL = '#dddddd';
-var FILL_OPACITY = .2;
-var STROKE = '#aaaaaa';
-var STROKE_WIDTH = 2;
-var STROKE_DASHARRAY = '10 5';
-/**
- * A class that manages and renders a data selection rectangle.
- */
-var ScatterPlotRectangleSelector = (function () {
-    /**
-     * @param container The container HTML element that the selection SVG rect
-     *     will be a child of.
-     * @param selectionCallback The callback that accepts a bounding box to be
-     *     called when selection changes. Currently, we only call the callback on
-     *     mouseUp.
-     */
-    function ScatterPlotRectangleSelector(container, selectionCallback) {
-        this.svgElement = d3.select(container).select('#selector');
-        this.rectElement = this.svgElement.append('rect')
-            .style('stroke', STROKE)
-            .style('stroke-dasharray', STROKE_DASHARRAY)
-            .style('stroke-width', STROKE_WIDTH)
-            .style('fill', FILL)
-            .style('fill-opacity', FILL_OPACITY);
-        this.selectionCallback = selectionCallback;
-        this.isMouseDown = false;
-    }
-    ScatterPlotRectangleSelector.prototype.onMouseDown = function (offsetX, offsetY) {
-        this.isMouseDown = true;
-        this.svgElement.style('display', 'block');
-        this.startCoordinates = [offsetX, offsetY];
-        this.lastBoundingBox = {
-            x: this.startCoordinates[0],
-            y: this.startCoordinates[1],
-            width: 1,
-            height: 1
-        };
-    };
-    ScatterPlotRectangleSelector.prototype.onMouseMove = function (offsetX, offsetY) {
-        if (!this.isMouseDown) {
-            return;
-        }
-        this.lastBoundingBox.x = Math.min(offsetX, this.startCoordinates[0]);
-        this.lastBoundingBox.y = Math.max(offsetY, this.startCoordinates[1]);
-        this.lastBoundingBox.width =
-            Math.max(offsetX, this.startCoordinates[0]) - this.lastBoundingBox.x;
-        this.lastBoundingBox.height =
-            this.lastBoundingBox.y - Math.min(offsetY, this.startCoordinates[1]);
-        this.rectElement.attr({
-            x: this.lastBoundingBox.x,
-            y: this.lastBoundingBox.y - this.lastBoundingBox.height,
-            width: this.lastBoundingBox.width,
-            height: this.lastBoundingBox.height
-        });
-    };
-    ScatterPlotRectangleSelector.prototype.onMouseUp = function () {
-        this.isMouseDown = false;
-        this.svgElement.style('display', 'none');
-        this.rectElement.attr('width', 0);
-        this.rectElement.attr('height', 0);
-        this.selectionCallback(this.lastBoundingBox);
-    };
-    return ScatterPlotRectangleSelector;
-}());
-exports.ScatterPlotRectangleSelector = ScatterPlotRectangleSelector;
-
-},{}],18:[function(require,module,exports){
-arguments[4][13][0].apply(exports,arguments)
-},{"dup":13}],19:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var util = require('./util');
-var FONT_SIZE = 80;
-var ONE_OVER_FONT_SIZE = 1 / FONT_SIZE;
-var LABEL_SCALE = 2.2; // at 1:1 texel/pixel ratio
-var LABEL_COLOR = 'black';
-var LABEL_BACKGROUND = 'white';
-var MAX_CANVAS_DIMENSION = 8192;
-var NUM_GLYPHS = 256;
-var RGB_ELEMENTS_PER_ENTRY = 3;
-var XYZ_ELEMENTS_PER_ENTRY = 3;
-var UV_ELEMENTS_PER_ENTRY = 2;
-var VERTICES_PER_GLYPH = 2 * 3; // 2 triangles, 3 verts per triangle
-/**
- * Each label is made up of triangles (two per letter.) Each vertex, then, is
- * the corner of one of these triangles (and thus the corner of a letter
- * rectangle.)
- * Each has the following attributes:
- *    posObj: The (x, y) position of the vertex within the label, where the
- *            bottom center of the word is positioned at (0, 0);
- *    position: The position of the label in worldspace.
- *    vUv: The (u, v) coordinates that index into the glyphs sheet (range 0, 1.)
- *    color: The color of the label (matches the cooresponding point's color.)
- *    wordShown: Boolean. Whether or not the label is visible.
- */
-var VERTEX_SHADER = "\n    attribute vec2 posObj;\n    attribute vec3 color;\n    varying vec2 vUv;\n    varying vec3 vColor;\n\n    void main() {\n      vUv = uv;\n      vColor = color;\n\n      // Rotate label to face camera.\n\n      vec4 vRight = vec4(\n        modelViewMatrix[0][0], modelViewMatrix[1][0], modelViewMatrix[2][0], 0);\n\n      vec4 vUp = vec4(\n        modelViewMatrix[0][1], modelViewMatrix[1][1], modelViewMatrix[2][1], 0);\n\n      vec4 vAt = -vec4(\n        modelViewMatrix[0][2], modelViewMatrix[1][2], modelViewMatrix[2][2], 0);\n\n      mat4 pointToCamera = mat4(vRight, vUp, vAt, vec4(0, 0, 0, 1));\n\n      vec2 scaledPos = posObj * " + ONE_OVER_FONT_SIZE + " * " + LABEL_SCALE + ";\n\n      vec4 posRotated = pointToCamera * vec4(scaledPos, 0, 1);\n      vec4 mvPosition = modelViewMatrix * (vec4(position, 0) + posRotated);\n      gl_Position = projectionMatrix * mvPosition;\n    }";
-var FRAGMENT_SHADER = "\n    uniform sampler2D texture;\n    uniform bool picking;\n    varying vec2 vUv;\n    varying vec3 vColor;\n\n    void main() {\n      if (picking) {\n        gl_FragColor = vec4(vColor, 1.0);\n      } else {\n        vec4 fromTexture = texture2D(texture, vUv);\n        gl_FragColor = vec4(vColor, 1.0) * fromTexture;\n      }\n    }";
-/**
- * Renders the text labels as 3d geometry in the world.
- */
-var ScatterPlotVisualizer3DLabels = (function () {
-    function ScatterPlotVisualizer3DLabels() {
-    }
-    ScatterPlotVisualizer3DLabels.prototype.createGlyphTexture = function () {
-        var canvas = document.createElement('canvas');
-        canvas.width = MAX_CANVAS_DIMENSION;
-        canvas.height = FONT_SIZE;
-        var ctx = canvas.getContext('2d');
-        ctx.font = 'bold ' + FONT_SIZE * 0.75 + 'px roboto';
-        ctx.textBaseline = 'top';
-        ctx.fillStyle = LABEL_BACKGROUND;
-        ctx.rect(0, 0, canvas.width, canvas.height);
-        ctx.fill();
-        ctx.fillStyle = LABEL_COLOR;
-        var spaceOffset = ctx.measureText(' ').width;
-        // For each letter, store length, position at the encoded index.
-        var glyphLengths = new Float32Array(NUM_GLYPHS);
-        var glyphOffset = new Float32Array(NUM_GLYPHS);
-        var leftCoord = 0;
-        for (var i = 0; i < NUM_GLYPHS; i++) {
-            var text = ' ' + String.fromCharCode(i);
-            var textLength = ctx.measureText(text).width;
-            glyphLengths[i] = textLength - spaceOffset;
-            glyphOffset[i] = leftCoord;
-            ctx.fillText(text, leftCoord - spaceOffset, 0);
-            leftCoord += textLength;
-        }
-        var tex = util.createTexture(canvas);
-        return { texture: tex, lengths: glyphLengths, offsets: glyphOffset };
-    };
-    ScatterPlotVisualizer3DLabels.prototype.processLabelVerts = function (pointCount) {
-        var numTotalLetters = 0;
-        this.labelVertexMap = [];
-        for (var i = 0; i < pointCount; i++) {
-            var label = this.labelStrings[i];
-            var vertsArray = [];
-            for (var j = 0; j < label.length; j++) {
-                for (var k = 0; k < VERTICES_PER_GLYPH; k++) {
-                    vertsArray.push(numTotalLetters * VERTICES_PER_GLYPH + k);
-                }
-                numTotalLetters++;
-            }
-            this.labelVertexMap.push(vertsArray);
-        }
-        this.totalVertexCount = numTotalLetters * VERTICES_PER_GLYPH;
-    };
-    ScatterPlotVisualizer3DLabels.prototype.createColorBuffers = function (pointCount) {
-        var _this = this;
-        this.pickingColors =
-            new Float32Array(this.totalVertexCount * RGB_ELEMENTS_PER_ENTRY);
-        this.renderColors =
-            new Float32Array(this.totalVertexCount * RGB_ELEMENTS_PER_ENTRY);
-        var _loop_1 = function(i) {
-            var color = new THREE.Color(i);
-            this_1.labelVertexMap[i].forEach(function (j) {
-                _this.pickingColors[RGB_ELEMENTS_PER_ENTRY * j] = color.r;
-                _this.pickingColors[RGB_ELEMENTS_PER_ENTRY * j + 1] = color.g;
-                _this.pickingColors[RGB_ELEMENTS_PER_ENTRY * j + 2] = color.b;
-                _this.renderColors[RGB_ELEMENTS_PER_ENTRY * j] = 1.0;
-                _this.renderColors[RGB_ELEMENTS_PER_ENTRY * j + 1] = 1.0;
-                _this.renderColors[RGB_ELEMENTS_PER_ENTRY * j + 2] = 1.0;
-            });
-        };
-        var this_1 = this;
-        for (var i = 0; i < pointCount; i++) {
-            _loop_1(i);
-        }
-    };
-    ScatterPlotVisualizer3DLabels.prototype.createLabels = function () {
-        var _this = this;
-        if ((this.labelStrings == null) ||
-            (this.worldSpacePointPositions == null)) {
-            return;
-        }
-        var pointCount = this.worldSpacePointPositions.length / XYZ_ELEMENTS_PER_ENTRY;
-        if (pointCount !== this.labelStrings.length) {
-            return;
-        }
-        this.glyphTexture = this.createGlyphTexture();
-        this.uniforms = {
-            texture: { type: 't' },
-            picking: { type: 'bool' },
-        };
-        this.material = new THREE.ShaderMaterial({
-            uniforms: this.uniforms,
-            transparent: true,
-            vertexShader: VERTEX_SHADER,
-            fragmentShader: FRAGMENT_SHADER,
-        });
-        this.processLabelVerts(pointCount);
-        this.createColorBuffers(pointCount);
-        var positionArray = new Float32Array(this.totalVertexCount * XYZ_ELEMENTS_PER_ENTRY);
-        this.positions =
-            new THREE.BufferAttribute(positionArray, XYZ_ELEMENTS_PER_ENTRY);
-        var posArray = new Float32Array(this.totalVertexCount * XYZ_ELEMENTS_PER_ENTRY);
-        var uvArray = new Float32Array(this.totalVertexCount * UV_ELEMENTS_PER_ENTRY);
-        var colorsArray = new Float32Array(this.totalVertexCount * RGB_ELEMENTS_PER_ENTRY);
-        var positionObject = new THREE.BufferAttribute(posArray, 2);
-        var uv = new THREE.BufferAttribute(uvArray, UV_ELEMENTS_PER_ENTRY);
-        var colors = new THREE.BufferAttribute(colorsArray, RGB_ELEMENTS_PER_ENTRY);
-        this.geometry = new THREE.BufferGeometry();
-        this.geometry.addAttribute('posObj', positionObject);
-        this.geometry.addAttribute('position', this.positions);
-        this.geometry.addAttribute('uv', uv);
-        this.geometry.addAttribute('color', colors);
-        var lettersSoFar = 0;
-        for (var i = 0; i < pointCount; i++) {
-            var label = this.labelStrings[i];
-            var leftOffset = 0;
-            // Determine length of word in pixels.
-            for (var j = 0; j < label.length; j++) {
-                var letterCode = label.charCodeAt(j);
-                leftOffset += this.glyphTexture.lengths[letterCode];
-            }
-            leftOffset /= -2; // centers text horizontally around the origin
-            for (var j = 0; j < label.length; j++) {
-                var letterCode = label.charCodeAt(j);
-                var letterWidth = this.glyphTexture.lengths[letterCode];
-                var scale = FONT_SIZE;
-                var right = (leftOffset + letterWidth) / scale;
-                var left = (leftOffset) / scale;
-                var top_1 = FONT_SIZE / scale;
-                // First triangle
-                positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 0, left, 0);
-                positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 1, right, 0);
-                positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 2, left, top_1);
-                // Second triangle
-                positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 3, left, top_1);
-                positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 4, right, 0);
-                positionObject.setXY(lettersSoFar * VERTICES_PER_GLYPH + 5, right, top_1);
-                // Set UVs based on letter.
-                var uLeft = (this.glyphTexture.offsets[letterCode]);
-                var uRight = (this.glyphTexture.offsets[letterCode] + letterWidth);
-                // Scale so that uvs lie between 0 and 1 on the texture.
-                uLeft /= MAX_CANVAS_DIMENSION;
-                uRight /= MAX_CANVAS_DIMENSION;
-                var vTop = 1;
-                var vBottom = 0;
-                uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 0, uLeft, vTop);
-                uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 1, uRight, vTop);
-                uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 2, uLeft, vBottom);
-                uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 3, uLeft, vBottom);
-                uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 4, uRight, vTop);
-                uv.setXY(lettersSoFar * VERTICES_PER_GLYPH + 5, uRight, vBottom);
-                lettersSoFar++;
-                leftOffset += letterWidth;
-            }
-        }
-        var _loop_2 = function(i) {
-            var p = util.vector3FromPackedArray(this_2.worldSpacePointPositions, i);
-            this_2.labelVertexMap[i].forEach(function (j) {
-                _this.positions.setXYZ(j, p.x, p.y, p.z);
-            });
-        };
-        var this_2 = this;
-        for (var i = 0; i < pointCount; i++) {
-            _loop_2(i);
-        }
-        ;
-        this.labelsMesh = new THREE.Mesh(this.geometry, this.material);
-        this.labelsMesh.frustumCulled = false;
-        this.scene.add(this.labelsMesh);
-    };
-    ScatterPlotVisualizer3DLabels.prototype.colorLabels = function (pointColors) {
-        if (this.labelStrings == null || this.geometry == null ||
-            pointColors == null) {
-            return;
-        }
-        var colors = this.geometry.getAttribute('color');
-        colors.array = this.renderColors;
-        var n = pointColors.length / XYZ_ELEMENTS_PER_ENTRY;
-        var src = 0;
-        for (var i = 0; i < n; ++i) {
-            var c = new THREE.Color(pointColors[src], pointColors[src + 1], pointColors[src + 2]);
-            var m = this.labelVertexMap[i].length;
-            for (var j = 0; j < m; ++j) {
-                colors.setXYZ(this.labelVertexMap[i][j], c.r, c.g, c.b);
-            }
-            src += RGB_ELEMENTS_PER_ENTRY;
-        }
-        colors.needsUpdate = true;
-    };
-    ScatterPlotVisualizer3DLabels.prototype.setScene = function (scene) {
-        this.scene = scene;
-    };
-    ScatterPlotVisualizer3DLabels.prototype.dispose = function () {
-        if (this.labelsMesh) {
-            if (this.scene) {
-                this.scene.remove(this.labelsMesh);
-            }
-            this.labelsMesh = null;
-        }
-        if (this.geometry) {
-            this.geometry.dispose();
-            this.geometry = null;
-        }
-        if ((this.glyphTexture != null) && (this.glyphTexture.texture != null)) {
-            this.glyphTexture.texture.dispose();
-            this.glyphTexture.texture = null;
-        }
-    };
-    ScatterPlotVisualizer3DLabels.prototype.onPickingRender = function (rc) {
-        if (this.geometry == null) {
-            this.createLabels();
-        }
-        if (this.geometry == null) {
-            return;
-        }
-        this.material.uniforms.texture.value = this.glyphTexture.texture;
-        this.material.uniforms.picking.value = true;
-        var colors = this.geometry.getAttribute('color');
-        colors.array = this.pickingColors;
-        colors.needsUpdate = true;
-    };
-    ScatterPlotVisualizer3DLabels.prototype.onRender = function (rc) {
-        if (this.geometry == null) {
-            this.createLabels();
-        }
-        if (this.geometry == null) {
-            return;
-        }
-        this.colorLabels(rc.pointColors);
-        this.material.uniforms.texture.value = this.glyphTexture.texture;
-        this.material.uniforms.picking.value = false;
-        var colors = this.geometry.getAttribute('color');
-        colors.array = this.renderColors;
-        colors.needsUpdate = true;
-    };
-    ScatterPlotVisualizer3DLabels.prototype.onPointPositionsChanged = function (newPositions) {
-        this.worldSpacePointPositions = newPositions;
-        this.dispose();
-    };
-    ScatterPlotVisualizer3DLabels.prototype.setLabelStrings = function (labelStrings) {
-        this.labelStrings = labelStrings;
-        this.dispose();
-    };
-    ScatterPlotVisualizer3DLabels.prototype.onResize = function (newWidth, newHeight) { };
-    return ScatterPlotVisualizer3DLabels;
-}());
-exports.ScatterPlotVisualizer3DLabels = ScatterPlotVisualizer3DLabels;
-
-},{"./util":24}],20:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var label_1 = require('./label');
-var renderContext_1 = require('./renderContext');
-var util = require('./util');
-var MAX_LABELS_ON_SCREEN = 10000;
-var LABEL_STROKE_WIDTH = 3;
-var LABEL_FILL_WIDTH = 6;
-/**
- * Creates and maintains a 2d canvas on top of the GL canvas. All labels, when
- * active, are rendered to the 2d canvas as part of the visible render pass.
- */
-var ScatterPlotVisualizerCanvasLabels = (function () {
-    function ScatterPlotVisualizerCanvasLabels(container) {
-        this.labelsActive = true;
-        this.canvas = container.append('canvas').node();
-        this.gc = this.canvas.getContext('2d');
-        d3.select(this.canvas).style({ position: 'absolute', left: 0, top: 0 });
-        this.canvas.style.pointerEvents = 'none';
-    }
-    ScatterPlotVisualizerCanvasLabels.prototype.removeAllLabels = function () {
-        var pixelWidth = this.canvas.width * window.devicePixelRatio;
-        var pixelHeight = this.canvas.height * window.devicePixelRatio;
-        this.gc.clearRect(0, 0, pixelWidth, pixelHeight);
-    };
-    /** Render all of the non-overlapping visible labels to the canvas. */
-    ScatterPlotVisualizerCanvasLabels.prototype.makeLabels = function (rc) {
-        if ((rc.labels == null) || (rc.labels.pointIndices.length === 0)) {
-            return;
-        }
-        if (this.worldSpacePointPositions == null) {
-            return;
-        }
-        var lrc = rc.labels;
-        var sceneIs3D = (rc.cameraType === renderContext_1.CameraType.Perspective);
-        var labelHeight = parseInt(this.gc.font, 10);
-        var dpr = window.devicePixelRatio;
-        var grid;
-        {
-            var pixw = this.canvas.width * dpr;
-            var pixh = this.canvas.height * dpr;
-            var bb = { loX: 0, hiX: pixw, loY: 0, hiY: pixh };
-            grid = new label_1.CollisionGrid(bb, pixw / 25, pixh / 50);
-        }
-        var opacityMap = d3.scale.pow()
-            .exponent(Math.E)
-            .domain([rc.farthestCameraSpacePointZ, rc.nearestCameraSpacePointZ])
-            .range([0.1, 1]);
-        var camPos = rc.camera.position;
-        var camToTarget = camPos.clone().sub(rc.cameraTarget);
-        var camToPoint = new THREE.Vector3();
-        this.gc.textBaseline = 'middle';
-        this.gc.miterLimit = 2;
-        // Have extra space between neighboring labels. Don't pack too tightly.
-        var labelMargin = 2;
-        // Shift the label to the right of the point circle.
-        var xShift = 4;
-        var n = Math.min(MAX_LABELS_ON_SCREEN, lrc.pointIndices.length);
-        for (var i = 0; i < n; ++i) {
-            var point = void 0;
-            {
-                var pi = lrc.pointIndices[i];
-                point = util.vector3FromPackedArray(this.worldSpacePointPositions, pi);
-            }
-            // discard points that are behind the camera
-            camToPoint.copy(camPos).sub(point);
-            if (camToTarget.dot(camToPoint) < 0) {
-                continue;
-            }
-            var _a = util.vector3DToScreenCoords(rc.camera, rc.screenWidth, rc.screenHeight, point), x = _a[0], y = _a[1];
-            x += xShift;
-            // Computing the width of the font is expensive,
-            // so we assume width of 1 at first. Then, if the label doesn't
-            // conflict with other labels, we measure the actual width.
-            var textBoundingBox = {
-                loX: x - labelMargin,
-                hiX: x + 1 + labelMargin,
-                loY: y - labelHeight / 2 - labelMargin,
-                hiY: y + labelHeight / 2 + labelMargin
-            };
-            if (grid.insert(textBoundingBox, true)) {
-                var text = lrc.labelStrings[i];
-                var fontSize = lrc.defaultFontSize * lrc.scaleFactors[i] * dpr;
-                this.gc.font = fontSize + 'px roboto';
-                // Now, check with properly computed width.
-                textBoundingBox.hiX += this.gc.measureText(text).width - 1;
-                if (grid.insert(textBoundingBox)) {
-                    var opacity = 1;
-                    if (sceneIs3D && (lrc.useSceneOpacityFlags[i] === 1)) {
-                        opacity = opacityMap(camToPoint.length());
-                    }
-                    this.gc.fillStyle =
-                        this.styleStringFromPackedRgba(lrc.fillColors, i, opacity);
-                    this.gc.strokeStyle =
-                        this.styleStringFromPackedRgba(lrc.strokeColors, i, opacity);
-                    this.gc.lineWidth = LABEL_STROKE_WIDTH;
-                    this.gc.strokeText(text, x, y);
-                    this.gc.lineWidth = LABEL_FILL_WIDTH;
-                    this.gc.fillText(text, x, y);
-                }
-            }
-        }
-    };
-    ScatterPlotVisualizerCanvasLabels.prototype.styleStringFromPackedRgba = function (packedRgbaArray, colorIndex, opacity) {
-        var offset = colorIndex * 3;
-        var r = packedRgbaArray[offset];
-        var g = packedRgbaArray[offset + 1];
-        var b = packedRgbaArray[offset + 2];
-        return 'rgba(' + r + ',' + g + ',' + b + ',' + opacity + ')';
-    };
-    ScatterPlotVisualizerCanvasLabels.prototype.onResize = function (newWidth, newHeight) {
-        var dpr = window.devicePixelRatio;
-        d3.select(this.canvas)
-            .attr('width', newWidth * dpr)
-            .attr('height', newHeight * dpr)
-            .style({ width: newWidth + 'px', height: newHeight + 'px' });
-    };
-    ScatterPlotVisualizerCanvasLabels.prototype.dispose = function () {
-        this.removeAllLabels();
-        this.canvas = null;
-        this.gc = null;
-    };
-    ScatterPlotVisualizerCanvasLabels.prototype.onPointPositionsChanged = function (newPositions) {
-        this.worldSpacePointPositions = newPositions;
-        this.removeAllLabels();
-    };
-    ScatterPlotVisualizerCanvasLabels.prototype.onRender = function (rc) {
-        if (!this.labelsActive) {
-            return;
-        }
-        this.removeAllLabels();
-        this.makeLabels(rc);
-    };
-    ScatterPlotVisualizerCanvasLabels.prototype.setScene = function (scene) { };
-    ScatterPlotVisualizerCanvasLabels.prototype.onPickingRender = function (renderContext) { };
-    return ScatterPlotVisualizerCanvasLabels;
-}());
-exports.ScatterPlotVisualizerCanvasLabels = ScatterPlotVisualizerCanvasLabels;
-
-},{"./label":11,"./renderContext":15,"./util":24}],21:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var renderContext_1 = require('./renderContext');
-var util = require('./util');
-var NUM_POINTS_FOG_THRESHOLD = 5000;
-var MIN_POINT_SIZE = 5.0;
-var IMAGE_SIZE = 30;
-// Constants relating to the indices of buffer arrays.
-var RGB_NUM_ELEMENTS = 3;
-var INDEX_NUM_ELEMENTS = 1;
-var XYZ_NUM_ELEMENTS = 3;
-var VERTEX_SHADER = "\n  // Index of the specific vertex (passed in as bufferAttribute), and the\n  // variable that will be used to pass it to the fragment shader.\n  attribute float spriteIndex;\n  attribute vec3 color;\n  attribute float scaleFactor;\n\n  varying vec2 xyIndex;\n  varying vec3 vColor;\n\n  uniform bool sizeAttenuation;\n  uniform float pointSize;\n  uniform float spritesPerRow;\n  uniform float spritesPerColumn;\n\n  void main() {\n    // Pass index and color values to fragment shader.\n    vColor = color;\n    xyIndex = vec2(mod(spriteIndex, spritesPerRow),\n              floor(spriteIndex / spritesPerColumn));\n\n    // Transform current vertex by modelViewMatrix (model world position and\n    // camera world position matrix).\n    vec4 cameraSpacePos = modelViewMatrix * vec4(position, 1.0);\n\n    // Project vertex in camera-space to screen coordinates using the camera's\n    // projection matrix.\n    gl_Position = projectionMatrix * cameraSpacePos;\n\n    // Create size attenuation (if we're in 3D mode) by making the size of\n    // each point inversly proportional to its distance to the camera.\n    float outputPointSize = pointSize;\n    if (sizeAttenuation) {\n      outputPointSize = -pointSize / cameraSpacePos.z;\n    }\n\n    gl_PointSize =\n      max(outputPointSize * scaleFactor, " + MIN_POINT_SIZE.toFixed(1) + ");\n  }";
-var FRAGMENT_SHADER_POINT_TEST_CHUNK = "\n  bool point_in_unit_circle(vec2 spriteCoord) {\n    vec2 centerToP = spriteCoord - vec2(0.5, 0.5);\n    return dot(centerToP, centerToP) < (0.5 * 0.5);\n  }\n\n  bool point_in_unit_equilateral_triangle(vec2 spriteCoord) {\n    vec3 v0 = vec3(0, 1, 0);\n    vec3 v1 = vec3(0.5, 0, 0);\n    vec3 v2 = vec3(1, 1, 0);\n    vec3 p = vec3(spriteCoord, 0);\n    float p_in_v0_v1 = cross(v1 - v0, p - v0).z;\n    float p_in_v1_v2 = cross(v2 - v1, p - v1).z;\n    return (p_in_v0_v1 > 0.0) && (p_in_v1_v2 > 0.0);\n  }\n\n  bool point_in_unit_square(vec2 spriteCoord) {\n    return true;\n  }\n";
-var FRAGMENT_SHADER = "\n  varying vec2 xyIndex;\n  varying vec3 vColor;\n\n  uniform sampler2D texture;\n  uniform float spritesPerRow;\n  uniform float spritesPerColumn;\n  uniform bool isImage;\n\n  " + THREE.ShaderChunk['common'] + "\n  " + THREE.ShaderChunk['fog_pars_fragment'] + "\n  " + FRAGMENT_SHADER_POINT_TEST_CHUNK + "\n\n  void main() {\n    if (isImage) {\n      // Coordinates of the vertex within the entire sprite image.\n      vec2 coords =\n        (gl_PointCoord + xyIndex) / vec2(spritesPerRow, spritesPerColumn);\n      gl_FragColor = vec4(vColor, 1.0) * texture2D(texture, coords);\n    } else {\n      bool inside = point_in_unit_circle(gl_PointCoord);\n      if (!inside) {\n        discard;\n      }\n      gl_FragColor = vec4(vColor, 1);\n    }\n    " + THREE.ShaderChunk['fog_fragment'] + "\n  }";
-var FRAGMENT_SHADER_PICKING = "\n  varying vec2 xyIndex;\n  varying vec3 vColor;\n  uniform bool isImage;\n\n  " + FRAGMENT_SHADER_POINT_TEST_CHUNK + "\n\n  void main() {\n    xyIndex; // Silence 'unused variable' warning.\n    if (isImage) {\n      gl_FragColor = vec4(vColor, 1);\n    } else {\n      bool inside = point_in_unit_circle(gl_PointCoord);\n      if (!inside) {\n        discard;\n      }\n      gl_FragColor = vec4(vColor, 1);\n    }\n  }";
-/**
- * Uses GL point sprites to render the dataset.
- */
-var ScatterPlotVisualizerSprites = (function () {
-    function ScatterPlotVisualizerSprites() {
-        this.texture = null;
-        this.standinTextureForPoints =
-            util.createTexture(document.createElement('canvas'));
-        this.renderMaterial = this.createRenderMaterial(false);
-        this.pickingMaterial = this.createPickingMaterial(false);
-    }
-    ScatterPlotVisualizerSprites.prototype.createTextureFromSpriteAtlas = function (spriteAtlas, spriteDimensions, spriteIndices) {
-        this.texture = util.createTexture(spriteAtlas);
-        this.spritesPerRow = spriteAtlas.width / spriteDimensions[0];
-        this.spritesPerColumn = spriteAtlas.height / spriteDimensions[1];
-        this.spriteDimensions = spriteDimensions;
-        this.spriteIndexBufferAttribute =
-            new THREE.BufferAttribute(spriteIndices, INDEX_NUM_ELEMENTS);
-        if (this.points != null) {
-            this.points.geometry
-                .addAttribute('spriteIndex', this.spriteIndexBufferAttribute);
-        }
-    };
-    ScatterPlotVisualizerSprites.prototype.createUniforms = function () {
-        return {
-            texture: { type: 't' },
-            spritesPerRow: { type: 'f' },
-            spritesPerColumn: { type: 'f' },
-            fogColor: { type: 'c' },
-            fogNear: { type: 'f' },
-            fogFar: { type: 'f' },
-            isImage: { type: 'bool' },
-            sizeAttenuation: { type: 'bool' },
-            pointSize: { type: 'f' }
-        };
-    };
-    ScatterPlotVisualizerSprites.prototype.createRenderMaterial = function (haveImage) {
-        var uniforms = this.createUniforms();
-        return new THREE.ShaderMaterial({
-            uniforms: uniforms,
-            vertexShader: VERTEX_SHADER,
-            fragmentShader: FRAGMENT_SHADER,
-            transparent: !haveImage,
-            depthTest: haveImage,
-            depthWrite: haveImage,
-            fog: true,
-            blending: THREE.MultiplyBlending,
-        });
-    };
-    ScatterPlotVisualizerSprites.prototype.createPickingMaterial = function (haveImage) {
-        var uniforms = this.createUniforms();
-        return new THREE.ShaderMaterial({
-            uniforms: uniforms,
-            vertexShader: VERTEX_SHADER,
-            fragmentShader: FRAGMENT_SHADER_PICKING,
-            transparent: true,
-            depthTest: true,
-            depthWrite: true,
-            fog: false,
-            blending: THREE.NormalBlending,
-        });
-    };
-    /**
-     * Create points, set their locations and actually instantiate the
-     * geometry.
-     */
-    ScatterPlotVisualizerSprites.prototype.createPointSprites = function (scene, positions) {
-        var pointCount = (positions != null) ? (positions.length / XYZ_NUM_ELEMENTS) : 0;
-        var geometry = this.createGeometry(pointCount);
-        this.fog = new THREE.Fog(0xFFFFFF); // unused value, gets overwritten.
-        this.points = new THREE.Points(geometry, this.renderMaterial);
-        this.points.frustumCulled = false;
-        if (this.spriteIndexBufferAttribute != null) {
-            this.points.geometry
-                .addAttribute('spriteIndex', this.spriteIndexBufferAttribute);
-        }
-        scene.add(this.points);
-    };
-    ScatterPlotVisualizerSprites.prototype.calculatePointSize = function (sceneIs3D) {
-        if (this.texture != null) {
-            return sceneIs3D ? IMAGE_SIZE : this.spriteDimensions[0];
-        }
-        var n = (this.worldSpacePointPositions != null) ?
-            (this.worldSpacePointPositions.length / XYZ_NUM_ELEMENTS) :
-            1;
-        var SCALE = 200;
-        var LOG_BASE = 8;
-        var DIVISOR = 1.5;
-        // Scale point size inverse-logarithmically to the number of points.
-        var pointSize = SCALE / Math.log(n) / Math.log(LOG_BASE);
-        return sceneIs3D ? pointSize : (pointSize / DIVISOR);
-    };
-    /**
-     * Set up buffer attributes to be used for the points/images.
-     */
-    ScatterPlotVisualizerSprites.prototype.createGeometry = function (pointCount) {
-        var n = pointCount;
-        // Fill pickingColors with each point's unique id as its color.
-        this.pickingColors = new Float32Array(n * RGB_NUM_ELEMENTS);
-        {
-            var dst = 0;
-            for (var i = 0; i < n; i++) {
-                var c = new THREE.Color(i);
-                this.pickingColors[dst++] = c.r;
-                this.pickingColors[dst++] = c.g;
-                this.pickingColors[dst++] = c.b;
-            }
-        }
-        var geometry = new THREE.BufferGeometry();
-        geometry.addAttribute('position', new THREE.BufferAttribute(null, XYZ_NUM_ELEMENTS));
-        geometry.addAttribute('color', new THREE.BufferAttribute(null, RGB_NUM_ELEMENTS));
-        geometry.addAttribute('scaleFactor', new THREE.BufferAttribute(null, INDEX_NUM_ELEMENTS));
-        return geometry;
-    };
-    ScatterPlotVisualizerSprites.prototype.setFogDistances = function (sceneIs3D, nearestPointZ, farthestPointZ) {
-        if (sceneIs3D) {
-            var n = this.worldSpacePointPositions.length / XYZ_NUM_ELEMENTS;
-            this.fog.near = nearestPointZ;
-            // If there are fewer points we want less fog. We do this
-            // by making the "far" value (that is, the distance from the camera to the
-            // far edge of the fog) proportional to the number of points.
-            var multiplier = 2 - Math.min(n, NUM_POINTS_FOG_THRESHOLD) / NUM_POINTS_FOG_THRESHOLD;
-            this.fog.far = farthestPointZ * multiplier;
-        }
-        else {
-            this.fog.near = Infinity;
-            this.fog.far = Infinity;
-        }
-    };
-    ScatterPlotVisualizerSprites.prototype.dispose = function () {
-        this.disposeGeometry();
-        this.disposeTextureAtlas();
-    };
-    ScatterPlotVisualizerSprites.prototype.disposeGeometry = function () {
-        if (this.points != null) {
-            this.scene.remove(this.points);
-            this.points.geometry.dispose();
-            this.points = null;
-            this.worldSpacePointPositions = null;
-        }
-    };
-    ScatterPlotVisualizerSprites.prototype.disposeTextureAtlas = function () {
-        if (this.texture != null) {
-            this.texture.dispose();
-        }
-        this.texture = null;
-        this.renderMaterial = null;
-        this.pickingMaterial = null;
-    };
-    ScatterPlotVisualizerSprites.prototype.setScene = function (scene) {
-        this.scene = scene;
-    };
-    ScatterPlotVisualizerSprites.prototype.setSpriteAtlas = function (spriteImage, spriteDimensions, spriteIndices) {
-        this.disposeTextureAtlas();
-        this.createTextureFromSpriteAtlas(spriteImage, spriteDimensions, spriteIndices);
-        this.renderMaterial = this.createRenderMaterial(true);
-        this.pickingMaterial = this.createPickingMaterial(true);
-    };
-    ScatterPlotVisualizerSprites.prototype.clearSpriteAtlas = function () {
-        this.disposeTextureAtlas();
-        this.renderMaterial = this.createRenderMaterial(false);
-        this.pickingMaterial = this.createPickingMaterial(false);
-    };
-    ScatterPlotVisualizerSprites.prototype.onPointPositionsChanged = function (newPositions) {
-        if ((newPositions == null) || (newPositions.length === 0)) {
-            this.dispose();
-            return;
-        }
-        if (this.points != null) {
-            if (this.worldSpacePointPositions.length !== newPositions.length) {
-                this.disposeGeometry();
-            }
-        }
-        this.worldSpacePointPositions = newPositions;
-        if (this.points == null) {
-            this.createPointSprites(this.scene, newPositions);
-        }
-        var positions = this.points.geometry
-            .getAttribute('position');
-        positions.array = newPositions;
-        positions.needsUpdate = true;
-    };
-    ScatterPlotVisualizerSprites.prototype.onPickingRender = function (rc) {
-        if (this.points == null) {
-            return;
-        }
-        var sceneIs3D = (rc.cameraType === renderContext_1.CameraType.Perspective);
-        this.pickingMaterial.uniforms.spritesPerRow.value = this.spritesPerRow;
-        this.pickingMaterial.uniforms.spritesPerRow.value = this.spritesPerColumn;
-        this.pickingMaterial.uniforms.sizeAttenuation.value = sceneIs3D;
-        this.pickingMaterial.uniforms.pointSize.value =
-            this.calculatePointSize(sceneIs3D);
-        this.points.material = this.pickingMaterial;
-        var colors = this.points.geometry
-            .getAttribute('color');
-        colors.array = this.pickingColors;
-        colors.needsUpdate = true;
-        var scaleFactors = this.points.geometry
-            .getAttribute('scaleFactor');
-        scaleFactors.array = rc.pointScaleFactors;
-        scaleFactors.needsUpdate = true;
-    };
-    ScatterPlotVisualizerSprites.prototype.onRender = function (rc) {
-        if (!this.points) {
-            return;
-        }
-        var sceneIs3D = (rc.camera instanceof THREE.PerspectiveCamera);
-        this.setFogDistances(sceneIs3D, rc.nearestCameraSpacePointZ, rc.farthestCameraSpacePointZ);
-        this.scene.fog = this.fog;
-        this.scene.fog.color = new THREE.Color(rc.backgroundColor);
-        this.renderMaterial.uniforms.fogColor.value = this.scene.fog.color;
-        this.renderMaterial.uniforms.fogNear.value = this.fog.near;
-        this.renderMaterial.uniforms.fogFar.value = this.fog.far;
-        this.renderMaterial.uniforms.spritesPerRow.value = this.spritesPerRow;
-        this.renderMaterial.uniforms.spritesPerColumn.value = this.spritesPerColumn;
-        this.renderMaterial.uniforms.isImage.value = (this.texture != null);
-        this.renderMaterial.uniforms.texture.value =
-            (this.texture != null) ? this.texture : this.standinTextureForPoints;
-        this.renderMaterial.uniforms.sizeAttenuation.value = sceneIs3D;
-        this.renderMaterial.uniforms.pointSize.value =
-            this.calculatePointSize(sceneIs3D);
-        this.points.material = this.renderMaterial;
-        var colors = this.points.geometry
-            .getAttribute('color');
-        this.renderColors = rc.pointColors;
-        colors.array = this.renderColors;
-        colors.needsUpdate = true;
-        var scaleFactors = this.points.geometry
-            .getAttribute('scaleFactor');
-        scaleFactors.array = rc.pointScaleFactors;
-        scaleFactors.needsUpdate = true;
-    };
-    ScatterPlotVisualizerSprites.prototype.onResize = function (newWidth, newHeight) { };
-    return ScatterPlotVisualizerSprites;
-}());
-exports.ScatterPlotVisualizerSprites = ScatterPlotVisualizerSprites;
-
-},{"./renderContext":15,"./util":24}],22:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var util = require('./util');
-var RGB_NUM_ELEMENTS = 3;
-var XYZ_NUM_ELEMENTS = 3;
-/**
- * Renders 'traces' (polylines) that connect multiple points in the dataset
- */
-var ScatterPlotVisualizerTraces = (function () {
-    function ScatterPlotVisualizerTraces() {
-        this.tracePositionBuffer = {};
-        this.traceColorBuffer = {};
-    }
-    ScatterPlotVisualizerTraces.prototype.updateTraceIndicesInDataSet = function (ds) {
-        for (var i = 0; i < ds.traces.length; i++) {
-            var trace = ds.traces[i];
-            for (var j = 0; j < trace.pointIndices.length - 1; j++) {
-                ds.points[trace.pointIndices[j]].traceIndex = i;
-                ds.points[trace.pointIndices[j + 1]].traceIndex = i;
-            }
-        }
-    };
-    ScatterPlotVisualizerTraces.prototype.createTraces = function (scene) {
-        if (!this.dataSet || !this.dataSet.traces) {
-            return;
-        }
-        this.updateTraceIndicesInDataSet(this.dataSet);
-        this.traces = [];
-        for (var i = 0; i < this.dataSet.traces.length; i++) {
-            var geometry = new THREE.BufferGeometry();
-            geometry.addAttribute('position', this.tracePositionBuffer[i]);
-            geometry.addAttribute('color', this.traceColorBuffer[i]);
-            var material = new THREE.LineBasicMaterial({
-                linewidth: 1,
-                opacity: 1.0,
-                transparent: true,
-                vertexColors: THREE.VertexColors
-            });
-            var trace = new THREE.LineSegments(geometry, material);
-            trace.frustumCulled = false;
-            this.traces.push(trace);
-            scene.add(trace);
-        }
-    };
-    ScatterPlotVisualizerTraces.prototype.dispose = function () {
-        if (this.traces == null) {
-            return;
-        }
-        for (var i = 0; i < this.traces.length; i++) {
-            this.scene.remove(this.traces[i]);
-            this.traces[i].geometry.dispose();
-        }
-        this.traces = null;
-        this.tracePositionBuffer = {};
-        this.traceColorBuffer = {};
-    };
-    ScatterPlotVisualizerTraces.prototype.setScene = function (scene) {
-        this.scene = scene;
-    };
-    ScatterPlotVisualizerTraces.prototype.setDataSet = function (dataSet) {
-        this.dataSet = dataSet;
-    };
-    ScatterPlotVisualizerTraces.prototype.onPointPositionsChanged = function (newPositions) {
-        if ((newPositions == null) || (this.traces != null)) {
-            this.dispose();
-        }
-        if ((newPositions == null) || (this.dataSet == null)) {
-            return;
-        }
-        // Set up the position buffer arrays for each trace.
-        for (var i = 0; i < this.dataSet.traces.length; i++) {
-            var dataTrace = this.dataSet.traces[i];
-            var vertexCount = 2 * (dataTrace.pointIndices.length - 1);
-            var traces = new Float32Array(vertexCount * XYZ_NUM_ELEMENTS);
-            this.tracePositionBuffer[i] =
-                new THREE.BufferAttribute(traces, XYZ_NUM_ELEMENTS);
-            var colors = new Float32Array(vertexCount * RGB_NUM_ELEMENTS);
-            this.traceColorBuffer[i] =
-                new THREE.BufferAttribute(colors, RGB_NUM_ELEMENTS);
-        }
-        for (var i = 0; i < this.dataSet.traces.length; i++) {
-            var dataTrace = this.dataSet.traces[i];
-            var src = 0;
-            for (var j = 0; j < dataTrace.pointIndices.length - 1; j++) {
-                var p1Index = dataTrace.pointIndices[j];
-                var p2Index = dataTrace.pointIndices[j + 1];
-                var p1 = util.vector3FromPackedArray(newPositions, p1Index);
-                var p2 = util.vector3FromPackedArray(newPositions, p2Index);
-                this.tracePositionBuffer[i].setXYZ(src, p1.x, p1.y, p1.z);
-                this.tracePositionBuffer[i].setXYZ(src + 1, p2.x, p2.y, p2.z);
-                src += 2;
-            }
-            this.tracePositionBuffer[i].needsUpdate = true;
-        }
-        if (this.traces == null) {
-            this.createTraces(this.scene);
-        }
-    };
-    ScatterPlotVisualizerTraces.prototype.onRender = function (renderContext) {
-        if (this.traces == null) {
-            return;
-        }
-        for (var i = 0; i < this.traces.length; i++) {
-            this.traces[i].material.opacity = renderContext.traceOpacities[i];
-            this.traces[i].material.linewidth =
-                renderContext.traceWidths[i];
-            this.traceColorBuffer[i].array = renderContext.traceColors[i];
-            this.traceColorBuffer[i].needsUpdate = true;
-        }
-    };
-    ScatterPlotVisualizerTraces.prototype.onPickingRender = function (renderContext) { };
-    ScatterPlotVisualizerTraces.prototype.onResize = function (newWidth, newHeight) { };
-    return ScatterPlotVisualizerTraces;
-}());
-exports.ScatterPlotVisualizerTraces = ScatterPlotVisualizerTraces;
-
-},{"./util":24}],23:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-/**
- * A Space-partitioning tree (https://en.wikipedia.org/wiki/Space_partitioning)
- * that recursively divides the space into regions of equal sizes. This data
- * structure can act both as a Quad tree and an Octree when the data is 2 or
- * 3 dimensional respectively. One usage is in t-SNE in order to do Barnes-Hut
- * approximation.
- */
-var SPTree = (function () {
-    /**
-     * Constructs a new tree with the provided data.
-     *
-     * @param data List of n-dimensional data points.
-     * @param capacity Number of data points to store in a single node.
-     */
-    function SPTree(data) {
-        if (data.length < 1) {
-            throw new Error('There should be at least 1 data point');
-        }
-        // Make a bounding box based on the extent of the data.
-        this.dim = data[0].length;
-        // Each node has 2^d children, where d is the dimension of the space.
-        // Binary masks (e.g. 000, 001, ... 111 in 3D) are used to determine in
-        // which child (e.g. quadron in 2D) the new point is going to be assigned.
-        // For more details, see the insert() method and its comments.
-        this.masks = new Array(Math.pow(2, this.dim));
-        for (var d = 0; d < this.masks.length; ++d) {
-            this.masks[d] = (1 << d);
-        }
-        var min = new Array(this.dim);
-        fillArray(min, Number.POSITIVE_INFINITY);
-        var max = new Array(this.dim);
-        fillArray(max, Number.NEGATIVE_INFINITY);
-        for (var i = 0; i < data.length; ++i) {
-            // For each dim get the min and max.
-            // E.g. For 2-D, get the x_min, x_max, y_min, y_max.
-            for (var d = 0; d < this.dim; ++d) {
-                min[d] = Math.min(min[d], data[i][d]);
-                max[d] = Math.max(max[d], data[i][d]);
-            }
-        }
-        // Create a bounding box with the center of the largest span.
-        var center = new Array(this.dim);
-        var halfDim = 0;
-        for (var d = 0; d < this.dim; ++d) {
-            var span = max[d] - min[d];
-            center[d] = min[d] + span / 2;
-            halfDim = Math.max(halfDim, span / 2);
-        }
-        this.root = { box: { center: center, halfDim: halfDim }, point: data[0] };
-        for (var i = 1; i < data.length; ++i) {
-            this.insert(this.root, data[i]);
-        }
-    }
-    /**
-     * Visits every node in the tree. Each node can store 1 or more points,
-     * depending on the node capacity provided in the constructor.
-     *
-     * @param accessor Method that takes the currently visited node, and the
-     * low and high point of the region that this node occupies. E.g. in 2D,
-     * the low and high points will be the lower-left corner and the upper-right
-     * corner.
-     */
-    SPTree.prototype.visit = function (accessor, noBox) {
-        if (noBox === void 0) { noBox = false; }
-        this.visitNode(this.root, accessor, noBox);
-    };
-    SPTree.prototype.visitNode = function (node, accessor, noBox) {
-        var skipChildren;
-        if (noBox) {
-            skipChildren = accessor(node);
-        }
-        else {
-            var lowPoint = new Array(this.dim);
-            var highPoint = new Array(this.dim);
-            for (var d = 0; d < this.dim; ++d) {
-                lowPoint[d] = node.box.center[d] - node.box.halfDim;
-                highPoint[d] = node.box.center[d] + node.box.halfDim;
-            }
-            skipChildren = accessor(node, lowPoint, highPoint);
-        }
-        if (!node.children || skipChildren) {
-            return;
-        }
-        for (var i = 0; i < node.children.length; ++i) {
-            var child = node.children[i];
-            if (child) {
-                this.visitNode(child, accessor, noBox);
-            }
-        }
-    };
-    SPTree.prototype.insert = function (node, p) {
-        // Subdivide and then add the point to whichever node will accept it.
-        if (node.children == null) {
-            node.children = new Array(this.masks.length);
-        }
-        // Decide which child will get the new point by constructing a D-bits binary
-        // signature (D=3 for 3D) where the k-th bit is 1 if the point's k-th
-        // coordinate is greater than the node's k-th coordinate, 0 otherwise.
-        // Then the binary signature in decimal system gives us the index of the
-        // child where the new point should be.
-        var index = 0;
-        for (var d = 0; d < this.dim; ++d) {
-            if (p[d] > node.box.center[d]) {
-                index |= this.masks[d];
-            }
-        }
-        if (node.children[index] == null) {
-            this.makeChild(node, index, p);
-        }
-        else {
-            this.insert(node.children[index], p);
-        }
-    };
-    SPTree.prototype.makeChild = function (node, index, p) {
-        var oldC = node.box.center;
-        var h = node.box.halfDim / 2;
-        var newC = new Array(this.dim);
-        for (var d = 0; d < this.dim; ++d) {
-            newC[d] = (index & (1 << d)) ? oldC[d] + h : oldC[d] - h;
-        }
-        node.children[index] = { box: { center: newC, halfDim: h }, point: p };
-    };
-    return SPTree;
-}());
-exports.SPTree = SPTree;
-function fillArray(arr, value) {
-    for (var i = 0; i < arr.length; ++i) {
-        arr[i] = value;
-    }
-}
-
-},{}],24:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var logging = require('./logging');
-/**
- * Delay for running expensive tasks, in milliseconds.
- * The duration was empirically found so that it leaves enough time for the
- * browser to update its UI state before starting an expensive UI-blocking task.
- */
-var TASK_DELAY_MS = 200;
-/** Shuffles the array in-place in O(n) time using Fisher-Yates algorithm. */
-function shuffle(array) {
-    var m = array.length;
-    var t;
-    var i;
-    // While there remain elements to shuffle.
-    while (m) {
-        // Pick a remaining element
-        i = Math.floor(Math.random() * m--);
-        // And swap it with the current element.
-        t = array[m];
-        array[m] = array[i];
-        array[i] = t;
-    }
-    return array;
-}
-exports.shuffle = shuffle;
-/** Projects a 3d point into screen space */
-function vector3DToScreenCoords(cam, w, h, v) {
-    var dpr = window.devicePixelRatio;
-    var pv = new THREE.Vector3().copy(v).project(cam);
-    // The screen-space origin is at the middle of the screen, with +y up.
-    var coords = [((pv.x + 1) / 2 * w) * dpr, -((pv.y - 1) / 2 * h) * dpr];
-    return coords;
-}
-exports.vector3DToScreenCoords = vector3DToScreenCoords;
-/** Loads 3 contiguous elements from a packed xyz array into a Vector3. */
-function vector3FromPackedArray(a, pointIndex) {
-    var offset = pointIndex * 3;
-    return new THREE.Vector3(a[offset], a[offset + 1], a[offset + 2]);
-}
-exports.vector3FromPackedArray = vector3FromPackedArray;
-/**
- * Gets the camera-space z coordinates of the nearest and farthest points.
- * Ignores points that are behind the camera.
- */
-function getNearFarPoints(worldSpacePoints, cameraPos, cameraTarget) {
-    var shortestDist = Infinity;
-    var furthestDist = 0;
-    var camToTarget = new THREE.Vector3().copy(cameraTarget).sub(cameraPos);
-    var camPlaneNormal = new THREE.Vector3().copy(camToTarget).normalize();
-    var n = worldSpacePoints.length / 3;
-    var src = 0;
-    var p = new THREE.Vector3();
-    var camToPoint = new THREE.Vector3();
-    for (var i = 0; i < n; i++) {
-        p.x = worldSpacePoints[src];
-        p.y = worldSpacePoints[src + 1];
-        p.z = worldSpacePoints[src + 2];
-        src += 3;
-        camToPoint.copy(p).sub(cameraPos);
-        var dist = camPlaneNormal.dot(camToPoint);
-        if (dist < 0) {
-            continue;
-        }
-        furthestDist = (dist > furthestDist) ? dist : furthestDist;
-        shortestDist = (dist < shortestDist) ? dist : shortestDist;
-    }
-    return [shortestDist, furthestDist];
-}
-exports.getNearFarPoints = getNearFarPoints;
-/**
- * Generate a texture for the points/images and sets some initial params
- */
-function createTexture(image) {
-    var tex = new THREE.Texture(image);
-    tex.needsUpdate = true;
-    // Used if the texture isn't a power of 2.
-    tex.minFilter = THREE.LinearFilter;
-    tex.generateMipmaps = false;
-    tex.flipY = false;
-    return tex;
-}
-exports.createTexture = createTexture;
-/**
- * Assert that the condition is satisfied; if not, log user-specified message
- * to the console.
- */
-function assert(condition, message) {
-    if (!condition) {
-        message = message || 'Assertion failed';
-        throw new Error(message);
-    }
-}
-exports.assert = assert;
-function getSearchPredicate(query, inRegexMode, fieldName) {
-    var predicate;
-    if (inRegexMode) {
-        var regExp_1 = new RegExp(query, 'i');
-        predicate = function (p) { return regExp_1.test(p.metadata[fieldName].toString()); };
-    }
-    else {
-        // Doing a case insensitive substring match.
-        query = query.toLowerCase();
-        predicate = function (p) {
-            var label = p.metadata[fieldName].toString().toLowerCase();
-            return label.indexOf(query) >= 0;
-        };
-    }
-    return predicate;
-}
-exports.getSearchPredicate = getSearchPredicate;
-/**
- * Runs an expensive task asynchronously with some delay
- * so that it doesn't block the UI thread immediately.
- *
- * @param message The message to display to the user.
- * @param task The expensive task to run.
- * @param msgId Optional. ID of an existing message. If provided, will overwrite
- *     an existing message and won't automatically clear the message when the
- *     task is done.
- * @return The value returned by the task.
- */
-function runAsyncTask(message, task, msgId) {
-    if (msgId === void 0) { msgId = null; }
-    var autoClear = (msgId == null);
-    msgId = logging.setModalMessage(message, msgId);
-    return new Promise(function (resolve, reject) {
-        d3.timer(function () {
-            try {
-                var result = task();
-                // Clearing the old message.
-                if (autoClear) {
-                    logging.setModalMessage(null, msgId);
-                }
-                resolve(result);
-            }
-            catch (ex) {
-                reject(ex);
-            }
-            return true;
-        }, TASK_DELAY_MS);
-    });
-}
-exports.runAsyncTask = runAsyncTask;
-/**
- * Parses the URL for query parameters, e.g. ?foo=1&bar=2 will return
- *   {'foo': '1', 'bar': '2'}.
- * @param url The URL to parse.
- * @return A map of queryParam key to its value.
- */
-function getURLParams(url) {
-    if (!url) {
-        return {};
-    }
-    var queryString = url.indexOf('?') !== -1 ? url.split('?')[1] : url;
-    if (queryString.indexOf('#')) {
-        queryString = queryString.split('#')[0];
-    }
-    var queryEntries = queryString.split('&');
-    var queryParams = {};
-    for (var i = 0; i < queryEntries.length; i++) {
-        var queryEntryComponents = queryEntries[i].split('=');
-        queryParams[queryEntryComponents[0].toLowerCase()] =
-            decodeURIComponent(queryEntryComponents[1]);
-    }
-    return queryParams;
-}
-exports.getURLParams = getURLParams;
-/** List of substrings that auto generated tensors have in their name. */
-var SUBSTR_GEN_TENSORS = ['/Adagrad'];
-/** Returns true if the tensor was automatically generated by TF API calls. */
-function tensorIsGenerated(tensorName) {
-    for (var i = 0; i < SUBSTR_GEN_TENSORS.length; i++) {
-        if (tensorName.indexOf(SUBSTR_GEN_TENSORS[i]) >= 0) {
-            return true;
-        }
-    }
-    return false;
-}
-exports.tensorIsGenerated = tensorIsGenerated;
-function xor(cond1, cond2) {
-    return (cond1 || cond2) && !(cond1 && cond2);
-}
-exports.xor = xor;
-/** Checks to see if the browser supports webgl. */
-function hasWebGLSupport() {
-    try {
-        var c = document.createElement('canvas');
-        var gl = c.getContext('webgl') || c.getContext('experimental-webgl');
-        return gl != null && typeof weblas !== 'undefined';
-    }
-    catch (e) {
-        return false;
-    }
-}
-exports.hasWebGLSupport = hasWebGLSupport;
-
-},{"./logging":12}],25:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var util_1 = require('./util');
-/** Returns the dot product of two vectors. */
-function dot(a, b) {
-    util_1.assert(a.length === b.length, 'Vectors a and b must be of same length');
-    var result = 0;
-    for (var i = 0; i < a.length; ++i) {
-        result += a[i] * b[i];
-    }
-    return result;
-}
-exports.dot = dot;
-/** Sums all the elements in the vector */
-function sum(a) {
-    var result = 0;
-    for (var i = 0; i < a.length; ++i) {
-        result += a[i];
-    }
-    return result;
-}
-exports.sum = sum;
-/** Returns the sum of two vectors, i.e. a + b */
-function add(a, b) {
-    util_1.assert(a.length === b.length, 'Vectors a and b must be of same length');
-    var result = new Float32Array(a.length);
-    for (var i = 0; i < a.length; ++i) {
-        result[i] = a[i] + b[i];
-    }
-    return result;
-}
-exports.add = add;
-/** Subtracts vector b from vector a, i.e. returns a - b */
-function sub(a, b) {
-    util_1.assert(a.length === b.length, 'Vectors a and b must be of same length');
-    var result = new Float32Array(a.length);
-    for (var i = 0; i < a.length; ++i) {
-        result[i] = a[i] - b[i];
-    }
-    return result;
-}
-exports.sub = sub;
-/** Returns the square norm of the vector */
-function norm2(a) {
-    var result = 0;
-    for (var i = 0; i < a.length; ++i) {
-        result += a[i] * a[i];
-    }
-    return result;
-}
-exports.norm2 = norm2;
-/** Returns the euclidean distance between two vectors. */
-function dist(a, b) {
-    return Math.sqrt(dist2(a, b));
-}
-exports.dist = dist;
-/** Returns the square euclidean distance between two vectors. */
-function dist2(a, b) {
-    util_1.assert(a.length === b.length, 'Vectors a and b must be of same length');
-    var result = 0;
-    for (var i = 0; i < a.length; ++i) {
-        var diff = a[i] - b[i];
-        result += diff * diff;
-    }
-    return result;
-}
-exports.dist2 = dist2;
-/** Returns the square euclidean distance between two 2D points. */
-function dist2_2D(a, b) {
-    var dX = a[0] - b[0];
-    var dY = a[1] - b[1];
-    return dX * dX + dY * dY;
-}
-exports.dist2_2D = dist2_2D;
-/** Returns the square euclidean distance between two 3D points. */
-function dist2_3D(a, b) {
-    var dX = a[0] - b[0];
-    var dY = a[1] - b[1];
-    var dZ = a[2] - b[2];
-    return dX * dX + dY * dY + dZ * dZ;
-}
-exports.dist2_3D = dist2_3D;
-/** Returns the euclidean distance between 2 3D points. */
-function dist_3D(a, b) {
-    return Math.sqrt(dist2_3D(a, b));
-}
-exports.dist_3D = dist_3D;
-/**
- * Returns the square euclidean distance between two vectors, with an early
- * exit (returns -1) if the distance is >= to the provided limit.
- */
-function dist2WithLimit(a, b, limit) {
-    util_1.assert(a.length === b.length, 'Vectors a and b must be of same length');
-    var result = 0;
-    for (var i = 0; i < a.length; ++i) {
-        var diff = a[i] - b[i];
-        result += diff * diff;
-        if (result >= limit) {
-            return -1;
-        }
-    }
-    return result;
-}
-exports.dist2WithLimit = dist2WithLimit;
-/** Returns the square euclidean distance between two 2D points. */
-function dist22D(a, b) {
-    var dX = a[0] - b[0];
-    var dY = a[1] - b[1];
-    return dX * dX + dY * dY;
-}
-exports.dist22D = dist22D;
-/** Modifies the vector in-place to have unit norm. */
-function unit(a) {
-    var norm = Math.sqrt(norm2(a));
-    util_1.assert(norm >= 0, 'Norm of the vector must be > 0');
-    for (var i = 0; i < a.length; ++i) {
-        a[i] /= norm;
-    }
-}
-exports.unit = unit;
-/**
- *  Projects the vectors to a lower dimension
- *
- * @param vectors Array of vectors to be projected.
- * @param newDim The resulting dimension of the vectors.
- */
-function projectRandom(vectors, newDim) {
-    var dim = vectors[0].length;
-    var N = vectors.length;
-    var newVectors = new Array(N);
-    for (var i = 0; i < N; ++i) {
-        newVectors[i] = new Float32Array(newDim);
-    }
-    // Make nDim projections.
-    for (var k = 0; k < newDim; ++k) {
-        var randomVector = rn(dim);
-        for (var i = 0; i < N; ++i) {
-            newVectors[i][k] = dot(vectors[i], randomVector);
-        }
-    }
-    return newVectors;
-}
-exports.projectRandom = projectRandom;
-/**
- * Projects a vector onto a 2D plane specified by the two direction vectors.
- */
-function project2d(a, dir1, dir2) {
-    return [dot(a, dir1), dot(a, dir2)];
-}
-exports.project2d = project2d;
-/**
- * Computes the centroid of the data points. If the provided data points are not
- * vectors, an accessor function needs to be provided.
- */
-function centroid(dataPoints, accessor) {
-    if (dataPoints.length === 0) {
-        return null;
-    }
-    if (accessor == null) {
-        accessor = function (a) { return a; };
-    }
-    util_1.assert(dataPoints.length >= 0, '`vectors` must be of length >= 1');
-    var centroid = new Float32Array(accessor(dataPoints[0]).length);
-    for (var i = 0; i < dataPoints.length; ++i) {
-        var dataPoint = dataPoints[i];
-        var vector = accessor(dataPoint);
-        for (var j = 0; j < centroid.length; ++j) {
-            centroid[j] += vector[j];
-        }
-    }
-    for (var j = 0; j < centroid.length; ++j) {
-        centroid[j] /= dataPoints.length;
-    }
-    return centroid;
-}
-exports.centroid = centroid;
-/**
- * Generates a vector of the specified size where each component is drawn from
- * a random (0, 1) gaussian distribution.
- */
-function rn(size) {
-    var normal = d3.random.normal();
-    var result = new Float32Array(size);
-    for (var i = 0; i < size; ++i) {
-        result[i] = normal();
-    }
-    return result;
-}
-exports.rn = rn;
-/**
- * Returns the cosine distance ([0, 2]) between two vectors
- * that have been normalized to unit norm.
- */
-function cosDistNorm(a, b) {
-    return 1 - dot(a, b);
-}
-exports.cosDistNorm = cosDistNorm;
-/**
- * Returns the cosine distance ([0, 2]) between two vectors.
- */
-function cosDist(a, b) {
-    return 1 - cosSim(a, b);
-}
-exports.cosDist = cosDist;
-/** Returns the cosine similarity ([-1, 1]) between two vectors. */
-function cosSim(a, b) {
-    return dot(a, b) / Math.sqrt(norm2(a) * norm2(b));
-}
-exports.cosSim = cosSim;
-/**
- * Converts list of vectors (matrix) into a 1-dimensional
- * typed array with row-first order.
- */
-function toTypedArray(dataPoints, accessor) {
-    var N = dataPoints.length;
-    var dim = accessor(dataPoints[0]).length;
-    var result = new Float32Array(N * dim);
-    for (var i = 0; i < N; ++i) {
-        var vector = accessor(dataPoints[i]);
-        for (var d = 0; d < dim; ++d) {
-            result[i * dim + d] = vector[d];
-        }
-    }
-    return result;
-}
-exports.toTypedArray = toTypedArray;
-/**
- * Transposes an RxC matrix represented as a flat typed array
- * into a CxR matrix, again represented as a flat typed array.
- */
-function transposeTypedArray(r, c, typedArray) {
-    var result = new Float32Array(r * c);
-    for (var i = 0; i < r; ++i) {
-        for (var j = 0; j < c; ++j) {
-            result[j * r + i] = typedArray[i * c + j];
-        }
-    }
-    return result;
-}
-exports.transposeTypedArray = transposeTypedArray;
-
-},{"./util":24}],26:[function(require,module,exports){
-"use strict";
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-var logging = require('./logging');
-// tslint:disable-next-line:no-unused-variable
-var vz_projector_util_1 = require('./vz-projector-util');
-// tslint:disable-next-line
-exports.BookmarkPanelPolymer = vz_projector_util_1.PolymerElement({
-    is: 'vz-projector-bookmark-panel',
-    properties: {
-        savedStates: Object,
-        // Keep a separate polymer property because the savedStates doesn't change
-        // when adding and removing states.
-        hasStates: { type: Boolean, value: false },
-        selectedState: Number
-    }
-});
-var BookmarkPanel = (function (_super) {
-    __extends(BookmarkPanel, _super);
-    function BookmarkPanel() {
-        _super.apply(this, arguments);
-        this.hasStates = false;
-    }
-    BookmarkPanel.prototype.ready = function () {
-        this.dom = d3.select(this);
-        this.savedStates = [];
-        this.setupUploadButton();
-        this.ignoreNextProjectionEvent = false;
-    };
-    BookmarkPanel.prototype.initialize = function (projector, projectorEventContext) {
-        var _this = this;
-        this.projector = projector;
-        projectorEventContext.registerProjectionChangedListener(function () {
-            if (_this.ignoreNextProjectionEvent) {
-                _this.ignoreNextProjectionEvent = false;
-            }
-            else {
-                _this.clearStateSelection();
-            }
-        });
-    };
-    BookmarkPanel.prototype.setSelectedTensor = function (run, tensorInfo, dataProvider) {
-        var _this = this;
-        // Clear any existing bookmarks.
-        this.addStates(null);
-        if (tensorInfo && tensorInfo.bookmarksPath) {
-            // Get any bookmarks that may come when the projector starts up.
-            dataProvider.getBookmarks(run, tensorInfo.tensorName, function (bookmarks) {
-                _this.addStates(bookmarks);
-                _this._expandMore();
-            });
-        }
-        else {
-            this._expandLess();
-        }
-    };
-    /** Handles a click on show bookmarks tray button. */
-    BookmarkPanel.prototype._expandMore = function () {
-        this.$.panel.show();
-        this.dom.select('#expand-more').style('display', 'none');
-        this.dom.select('#expand-less').style('display', '');
-    };
-    /** Handles a click on hide bookmarks tray button. */
-    BookmarkPanel.prototype._expandLess = function () {
-        this.$.panel.hide();
-        this.dom.select('#expand-more').style('display', '');
-        this.dom.select('#expand-less').style('display', 'none');
-    };
-    /** Handles a click on the add bookmark button. */
-    BookmarkPanel.prototype._addBookmark = function () {
-        var currentState = this.projector.getCurrentState();
-        currentState.label = 'State ' + this.savedStates.length;
-        currentState.isSelected = true;
-        this.selectedState = this.savedStates.length;
-        for (var i = 0; i < this.savedStates.length; i++) {
-            this.savedStates[i].isSelected = false;
-            // We have to call notifyPath so that polymer knows this element was
-            // updated.
-            this.notifyPath('savedStates.' + i + '.isSelected', false, false);
-        }
-        this.push('savedStates', currentState);
-        this.updateHasStates();
-    };
-    /** Handles a click on the download bookmarks button. */
-    BookmarkPanel.prototype._downloadFile = function () {
-        var serializedState = this.serializeAllSavedStates();
-        var blob = new Blob([serializedState], { type: 'text/plain' });
-        var textFile = window.URL.createObjectURL(blob);
-        // Force a download.
-        var a = document.createElement('a');
-        document.body.appendChild(a);
-        a.style.display = 'none';
-        a.href = textFile;
-        a.download = 'state';
-        a.click();
-        document.body.removeChild(a);
-        window.URL.revokeObjectURL(textFile);
-    };
-    /** Handles a click on the upload bookmarks button. */
-    BookmarkPanel.prototype._uploadFile = function () {
-        var fileInput = this.dom.select('#state-file');
-        fileInput.node().click();
-    };
-    BookmarkPanel.prototype.setupUploadButton = function () {
-        var _this = this;
-        // Show and setup the load view button.
-        var fileInput = this.dom.select('#state-file');
-        fileInput.on('change', function () {
-            var file = d3.event.target.files[0];
-            // Clear out the value of the file chooser. This ensures that if the user
-            // selects the same file, we'll re-read it.
-            d3.event.target.value = '';
-            var fileReader = new FileReader();
-            fileReader.onload = function (evt) {
-                var str = evt.target.result;
-                var savedStates = JSON.parse(str);
-                // Verify the bookmarks match.
-                if (_this.savedStatesValid(savedStates)) {
-                    _this.addStates(savedStates);
-                    _this.loadSavedState(0);
-                }
-                else {
-                    logging.setWarningMessage("Unable to load bookmarks: wrong dataset, expected dataset " +
-                        ("with shape (" + savedStates[0].dataSetDimensions + ")."));
-                }
-            };
-            fileReader.readAsText(file);
-        });
-    };
-    BookmarkPanel.prototype.addStates = function (savedStates) {
-        if (savedStates == null) {
-            this.savedStates = [];
-        }
-        else {
-            for (var i = 0; i < savedStates.length; i++) {
-                savedStates[i].isSelected = false;
-                this.push('savedStates', savedStates[i]);
-            }
-        }
-        this.updateHasStates();
-    };
-    /** Deselects any selected state selection. */
-    BookmarkPanel.prototype.clearStateSelection = function () {
-        for (var i = 0; i < this.savedStates.length; i++) {
-            this.setSelectionState(i, false);
-        }
-    };
-    /** Handles a radio button click on a saved state. */
-    BookmarkPanel.prototype._radioButtonHandler = function (evt) {
-        var index = this.getParentDataIndex(evt);
-        this.loadSavedState(index);
-        this.setSelectionState(index, true);
-    };
-    BookmarkPanel.prototype.loadSavedState = function (index) {
-        for (var i = 0; i < this.savedStates.length; i++) {
-            if (this.savedStates[i].isSelected) {
-                this.setSelectionState(i, false);
-            }
-            else if (index === i) {
-                this.setSelectionState(i, true);
-                this.ignoreNextProjectionEvent = true;
-                this.projector.loadState(this.savedStates[i]);
-            }
-        }
-    };
-    BookmarkPanel.prototype.setSelectionState = function (stateIndex, selected) {
-        this.savedStates[stateIndex].isSelected = selected;
-        var path = 'savedStates.' + stateIndex + '.isSelected';
-        this.notifyPath(path, selected, false);
-    };
-    /**
-     * Crawls up the DOM to find an ancestor with a data-index attribute. This is
-     * used to match events to their bookmark index.
-     */
-    BookmarkPanel.prototype.getParentDataIndex = function (evt) {
-        for (var i = 0; i < evt.path.length; i++) {
-            var dataIndex = evt.path[i].getAttribute('data-index');
-            if (dataIndex != null) {
-                return +dataIndex;
-            }
-        }
-        return -1;
-    };
-    /** Handles a clear button click on a bookmark. */
-    BookmarkPanel.prototype._clearButtonHandler = function (evt) {
-        var index = this.getParentDataIndex(evt);
-        this.splice('savedStates', index, 1);
-        this.updateHasStates();
-    };
-    /** Handles a label change event on a bookmark. */
-    BookmarkPanel.prototype._labelChange = function (evt) {
-        var index = this.getParentDataIndex(evt);
-        this.savedStates[index].label = evt.target.value;
-    };
-    /**
-     * Used to determine whether to select the radio button for a given bookmark.
-     */
-    BookmarkPanel.prototype._isSelectedState = function (index) {
-        return index === this.selectedState;
-    };
-    BookmarkPanel.prototype._isNotSelectedState = function (index) {
-        return index !== this.selectedState;
-    };
-    /**
-     * Gets all of the saved states as a serialized string.
-     */
-    BookmarkPanel.prototype.serializeAllSavedStates = function () {
-        return JSON.stringify(this.savedStates);
-    };
-    /**
-     * Loads all of the serialized states and shows them in the list of
-     * viewable states.
-     */
-    BookmarkPanel.prototype.loadSavedStates = function (serializedStates) {
-        this.savedStates = JSON.parse(serializedStates);
-        this.updateHasStates();
-    };
-    /**
-     * Updates the hasState polymer property.
-     */
-    BookmarkPanel.prototype.updateHasStates = function () {
-        this.hasStates = (this.savedStates.length !== 0);
-    };
-    /** Sanity checks a State array to ensure it matches the current dataset. */
-    BookmarkPanel.prototype.savedStatesValid = function (states) {
-        for (var i = 0; i < states.length; i++) {
-            if (states[i].dataSetDimensions[0] !== this.projector.dataSet.dim[0] ||
-                states[i].dataSetDimensions[1] !== this.projector.dataSet.dim[1]) {
-                return false;
-            }
-        }
-        return true;
-    };
-    return BookmarkPanel;
-}(exports.BookmarkPanelPolymer));
-exports.BookmarkPanel = BookmarkPanel;
-document.registerElement(BookmarkPanel.prototype.is, BookmarkPanel);
-
-},{"./logging":12,"./vz-projector-util":33}],27:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-var data_provider_1 = require('./data-provider');
-var util = require('./util');
-// tslint:disable-next-line:no-unused-variable
-var vz_projector_util_1 = require('./vz-projector-util');
-exports.DataPanelPolymer = vz_projector_util_1.PolymerElement({
-    is: 'vz-projector-data-panel',
-    properties: {
-        selectedTensor: { type: String, observer: '_selectedTensorChanged' },
-        selectedRun: { type: String, observer: '_selectedRunChanged' },
-        selectedColorOptionName: {
-            type: String,
-            notify: true,
-            observer: '_selectedColorOptionNameChanged'
-        },
-        selectedLabelOption: { type: String, notify: true, observer: '_selectedLabelOptionChanged' },
-        normalizeData: Boolean,
-        showForceCategoricalColorsCheckbox: Boolean
-    }
-});
-var DataPanel = (function (_super) {
-    __extends(DataPanel, _super);
-    function DataPanel() {
-        _super.apply(this, arguments);
-        this.forceCategoricalColoring = false;
-    }
-    DataPanel.prototype.ready = function () {
-        this.dom = d3.select(this);
-        this.normalizeData = true;
-    };
-    DataPanel.prototype.initialize = function (projector, dp) {
-        var _this = this;
-        this.projector = projector;
-        this.dataProvider = dp;
-        this.setupUploadButtons();
-        // Tell the projector whenever the data normalization changes.
-        // Unknown why, but the polymer checkbox button stops working as soon as
-        // you do d3.select() on it.
-        this.querySelector('#normalize-data-checkbox')
-            .addEventListener('change', function () {
-            _this.projector.setNormalizeData(_this.normalizeData);
-        });
-        var forceCategoricalColoringCheckbox = this.querySelector('#force-categorical-checkbox');
-        forceCategoricalColoringCheckbox.addEventListener('change', function () {
-            _this.setForceCategoricalColoring(forceCategoricalColoringCheckbox.checked);
-        });
-        // Get all the runs.
-        this.dataProvider.retrieveRuns(function (runs) {
-            _this.runNames = runs;
-            // Choose the first run by default.
-            if (_this.runNames.length > 0) {
-                _this.selectedRun = runs[0];
-            }
-        });
-    };
-    DataPanel.prototype.setForceCategoricalColoring = function (forceCategoricalColoring) {
-        this.forceCategoricalColoring = forceCategoricalColoring;
-        this.querySelector('#force-categorical-checkbox')
-            .checked = this.forceCategoricalColoring;
-        this.updateMetadataUI(this.spriteAndMetadata.stats, this.metadataFile);
-        // The selected color option name doesn't change when we switch to using
-        // categorical coloring for stats with too many unique values, so we
-        // manually call this polymer observer so that we update the UI.
-        this._selectedColorOptionNameChanged();
-    };
-    DataPanel.prototype.getSeparatorClass = function (isSeparator) {
-        return isSeparator ? 'separator' : null;
-    };
-    DataPanel.prototype.metadataChanged = function (spriteAndMetadata, metadataFile) {
-        this.spriteAndMetadata = spriteAndMetadata;
-        this.metadataFile = metadataFile;
-        this.updateMetadataUI(this.spriteAndMetadata.stats, this.metadataFile);
-        this.selectedColorOptionName = this.colorOptions[0].name;
-    };
-    DataPanel.prototype.addWordBreaks = function (longString) {
-        if (longString == null) {
-            return '';
-        }
-        return longString.replace(/([\/=-_,])/g, '$1<wbr>');
-    };
-    DataPanel.prototype.updateMetadataUI = function (columnStats, metadataFile) {
-        var _this = this;
-        this.dom.select('#metadata-file')
-            .html(this.addWordBreaks(metadataFile))
-            .attr('title', metadataFile);
-        // Label by options.
-        var labelIndex = -1;
-        this.labelOptions = columnStats.map(function (stats, i) {
-            // Make the default label by the first non-numeric column.
-            if (!stats.isNumeric && labelIndex === -1) {
-                labelIndex = i;
-            }
-            return stats.name;
-        });
-        this.selectedLabelOption = this.labelOptions[Math.max(0, labelIndex)];
-        // Color by options.
-        var standardColorOption = [
-            { name: 'No color map' },
-        ];
-        var metadataColorOption = columnStats
-            .filter(function (stats) {
-            return !stats.tooManyUniqueValues || stats.isNumeric;
-        })
-            .map(function (stats) {
-            var map;
-            var items;
-            var thresholds;
-            var isCategorical = _this.forceCategoricalColoring || !stats.tooManyUniqueValues;
-            if (isCategorical) {
-                var scale = d3.scale.category20();
-                var range_1 = scale.range();
-                // Re-order the range.
-                var newRange = range_1.map(function (color, i) {
-                    var index = (i * 3) % range_1.length;
-                    return range_1[index];
-                });
-                items = stats.uniqueEntries;
-                scale.range(newRange).domain(items.map(function (x) { return x.label; }));
-                map = scale;
-            }
-            else {
-                thresholds = [
-                    { color: '#ffffdd', value: stats.min },
-                    { color: '#1f2d86', value: stats.max }
-                ];
-                map = d3.scale.linear()
-                    .domain(thresholds.map(function (t) { return t.value; }))
-                    .range(thresholds.map(function (t) { return t.color; }));
-            }
-            var desc = !isCategorical ? 'gradient' :
-                stats.uniqueEntries.length +
-                    ((stats.uniqueEntries.length > 20) ? ' non-unique' : '') +
-                    ' colors';
-            return {
-                name: stats.name,
-                desc: desc,
-                map: map,
-                items: items,
-                thresholds: thresholds,
-                tooManyUniqueValues: stats.tooManyUniqueValues
-            };
-        });
-        if (metadataColorOption.length > 0) {
-            // Add a separator line between built-in color maps
-            // and those based on metadata columns.
-            standardColorOption.push({ name: 'Metadata', isSeparator: true });
-        }
-        this.colorOptions = standardColorOption.concat(metadataColorOption);
-    };
-    DataPanel.prototype.setNormalizeData = function (normalizeData) {
-        this.normalizeData = normalizeData;
-    };
-    DataPanel.prototype._selectedTensorChanged = function () {
-        var _this = this;
-        this.projector.updateDataSet(null, null, null);
-        if (this.selectedTensor == null) {
-            return;
-        }
-        this.dataProvider.retrieveTensor(this.selectedRun, this.selectedTensor, function (ds) {
-            var metadataFile = _this.getEmbeddingInfoByName(_this.selectedTensor).metadataPath;
-            _this.dataProvider.retrieveSpriteAndMetadata(_this.selectedRun, _this.selectedTensor, function (metadata) {
-                _this.projector.updateDataSet(ds, metadata, metadataFile);
-            });
-        });
-        this.projector.setSelectedTensor(this.selectedRun, this.getEmbeddingInfoByName(this.selectedTensor));
-    };
-    DataPanel.prototype._selectedRunChanged = function () {
-        var _this = this;
-        this.dataProvider.retrieveProjectorConfig(this.selectedRun, function (info) {
-            _this.projectorConfig = info;
-            var names = _this.projectorConfig.embeddings.map(function (e) { return e.tensorName; })
-                .filter(function (name) {
-                var shape = _this.getEmbeddingInfoByName(name).tensorShape;
-                return shape.length === 2 && shape[0] > 1 && shape[1] > 1;
-            })
-                .sort(function (a, b) {
-                var embA = _this.getEmbeddingInfoByName(a);
-                var embB = _this.getEmbeddingInfoByName(b);
-                // Prefer tensors with metadata.
-                if (util.xor(!!embA.metadataPath, !!embB.metadataPath)) {
-                    return embA.metadataPath ? -1 : 1;
-                }
-                // Prefer non-generated tensors.
-                var isGenA = util.tensorIsGenerated(a);
-                var isGenB = util.tensorIsGenerated(b);
-                if (util.xor(isGenA, isGenB)) {
-                    return isGenB ? -1 : 1;
-                }
-                // Prefer bigger tensors.
-                var sizeA = embA.tensorShape[0];
-                var sizeB = embB.tensorShape[0];
-                if (sizeA !== sizeB) {
-                    return sizeB - sizeA;
-                }
-                // Sort alphabetically by tensor name.
-                return a <= b ? -1 : 1;
-            });
-            _this.tensorNames = names.map(function (name) {
-                return { name: name, shape: _this.getEmbeddingInfoByName(name).tensorShape };
-            });
-            var wordBreakablePath = _this.addWordBreaks(_this.projectorConfig.modelCheckpointPath);
-            _this.dom.select('#checkpoint-file')
-                .html(wordBreakablePath)
-                .attr('title', _this.projectorConfig.modelCheckpointPath);
-            // If in demo mode, let the order decide which tensor to load by default.
-            var defaultTensor = _this.projector.servingMode === 'demo' ?
-                _this.projectorConfig.embeddings[0].tensorName :
-                names[0];
-            if (_this.selectedTensor === defaultTensor) {
-                // Explicitly call the observer. Polymer won't call it if the previous
-                // string matches the current string.
-                _this._selectedTensorChanged();
-            }
-            else {
-                _this.selectedTensor = defaultTensor;
-            }
-        });
-    };
-    DataPanel.prototype._selectedLabelOptionChanged = function () {
-        this.projector.setSelectedLabelOption(this.selectedLabelOption);
-    };
-    DataPanel.prototype._selectedColorOptionNameChanged = function () {
-        var colorOption;
-        for (var i = 0; i < this.colorOptions.length; i++) {
-            if (this.colorOptions[i].name === this.selectedColorOptionName) {
-                colorOption = this.colorOptions[i];
-                break;
-            }
-        }
-        if (!colorOption) {
-            return;
-        }
-        this.showForceCategoricalColorsCheckbox = !!colorOption.tooManyUniqueValues;
-        if (colorOption.map == null) {
-            this.colorLegendRenderInfo = null;
-        }
-        else if (colorOption.items) {
-            var items = colorOption.items.map(function (item) {
-                return {
-                    color: colorOption.map(item.label),
-                    label: item.label,
-                    count: item.count
-                };
-            });
-            this.colorLegendRenderInfo = { items: items, thresholds: null };
-        }
-        else {
-            this.colorLegendRenderInfo = {
-                items: null,
-                thresholds: colorOption.thresholds
-            };
-        }
-        this.projector.setSelectedColorOption(colorOption);
-    };
-    DataPanel.prototype.tensorWasReadFromFile = function (rawContents, fileName) {
-        var _this = this;
-        data_provider_1.parseRawTensors(rawContents, function (ds) {
-            _this.dom.select('#checkpoint-file')
-                .text(fileName)
-                .attr('title', fileName);
-            _this.projector.updateDataSet(ds);
-        });
-    };
-    DataPanel.prototype.metadataWasReadFromFile = function (rawContents, fileName) {
-        var _this = this;
-        data_provider_1.parseRawMetadata(rawContents, function (metadata) {
-            _this.projector.updateDataSet(_this.projector.dataSet, metadata, fileName);
-        });
-    };
-    DataPanel.prototype.getEmbeddingInfoByName = function (tensorName) {
-        for (var i = 0; i < this.projectorConfig.embeddings.length; i++) {
-            var e = this.projectorConfig.embeddings[i];
-            if (e.tensorName === tensorName) {
-                return e;
-            }
-        }
-    };
-    DataPanel.prototype.setupUploadButtons = function () {
-        var _this = this;
-        // Show and setup the upload button.
-        var fileInput = this.dom.select('#file');
-        fileInput.on('change', function () {
-            var file = d3.event.target.files[0];
-            // Clear out the value of the file chooser. This ensures that if the user
-            // selects the same file, we'll re-read it.
-            d3.event.target.value = '';
-            var fileReader = new FileReader();
-            fileReader.onload = function (evt) {
-                var content = evt.target.result;
-                _this.tensorWasReadFromFile(content, file.name);
-            };
-            fileReader.readAsText(file);
-        });
-        var uploadButton = this.dom.select('#upload-tensors');
-        uploadButton.on('click', function () {
-            fileInput.node().click();
-        });
-        // Show and setup the upload metadata button.
-        var fileMetadataInput = this.dom.select('#file-metadata');
-        fileMetadataInput.on('change', function () {
-            var file = d3.event.target.files[0];
-            // Clear out the value of the file chooser. This ensures that if the user
-            // selects the same file, we'll re-read it.
-            d3.event.target.value = '';
-            var fileReader = new FileReader();
-            fileReader.onload = function (evt) {
-                var contents = evt.target.result;
-                _this.metadataWasReadFromFile(contents, file.name);
-            };
-            fileReader.readAsText(file);
-        });
-        var uploadMetadataButton = this.dom.select('#upload-metadata');
-        uploadMetadataButton.on('click', function () {
-            fileMetadataInput.node().click();
-        });
-        if (this.projector.servingMode === 'demo') {
-            this.$$('#demo-data-buttons-container').style.display =
-                'block';
-            // Fill out the projector config.
-            var projectorConfigTemplate_1 = this.$$('#projector-config-template');
-            var projectorConfigTemplateJson_1 = {
-                embeddings: [{
-                        tensorName: 'My tensor',
-                        tensorShape: [1000, 50],
-                        tensorPath: 'https://gist.github.com/.../tensors.tsv',
-                        metadataPath: 'https://gist.github.com/.../optional.metadata.tsv',
-                    }],
-            };
-            this.setProjectorConfigTemplateJson(projectorConfigTemplate_1, projectorConfigTemplateJson_1);
-            // Set up optional field checkboxes.
-            var spriteFieldCheckbox_1 = this.$$('#config-sprite-checkbox');
-            spriteFieldCheckbox_1.addEventListener('change', function () {
-                if (spriteFieldCheckbox_1.checked) {
-                    projectorConfigTemplateJson_1.embeddings[0].sprite = {
-                        imagePath: 'https://github.com/.../optional.sprite.png',
-                        singleImageDim: [32, 32]
-                    };
-                }
-                else {
-                    delete projectorConfigTemplateJson_1.embeddings[0].sprite;
-                }
-                _this.setProjectorConfigTemplateJson(projectorConfigTemplate_1, projectorConfigTemplateJson_1);
-            });
-            var bookmarksFieldCheckbox_1 = this.$$('#config-bookmarks-checkbox');
-            bookmarksFieldCheckbox_1.addEventListener('change', function () {
-                if (bookmarksFieldCheckbox_1.checked) {
-                    projectorConfigTemplateJson_1.embeddings[0].bookmarksPath =
-                        'https://gist.github.com/.../bookmarks.txt';
-                }
-                else {
-                    delete projectorConfigTemplateJson_1.embeddings[0].bookmarksPath;
-                }
-                _this.setProjectorConfigTemplateJson(projectorConfigTemplate_1, projectorConfigTemplateJson_1);
-            });
-            var metadataFieldCheckbox_1 = this.$$('#config-metadata-checkbox');
-            metadataFieldCheckbox_1.addEventListener('change', function () {
-                if (metadataFieldCheckbox_1.checked) {
-                    projectorConfigTemplateJson_1.embeddings[0].metadataPath =
-                        'https://gist.github.com/.../optional.metadata.tsv';
-                }
-                else {
-                    delete projectorConfigTemplateJson_1.embeddings[0].metadataPath;
-                }
-                _this.setProjectorConfigTemplateJson(projectorConfigTemplate_1, projectorConfigTemplateJson_1);
-            });
-            // Update the link and the readonly shareable URL.
-            var projectorConfigUrlInput_1 = this.$$('#projector-config-url');
-            var projectorConfigDemoUrlInput_1 = this.$$('#projector-share-url');
-            var projectorConfigDemoUrlLink_1 = this.$$('#projector-share-url-link');
-            projectorConfigUrlInput_1.addEventListener('input', function () {
-                var projectorDemoUrl = location.protocol + '//' + location.host +
-                    location.pathname + '?config=' +
-                    projectorConfigUrlInput_1.value;
-                projectorConfigDemoUrlInput_1.value = projectorDemoUrl;
-                projectorConfigDemoUrlLink_1.href = projectorDemoUrl;
-            });
-        }
-    };
-    DataPanel.prototype.setProjectorConfigTemplateJson = function (projectorConfigTemplate, config) {
-        projectorConfigTemplate.value =
-            JSON.stringify(config, null, /** replacer */ 2 /** white space */);
-    };
-    DataPanel.prototype._getNumTensorsLabel = function () {
-        return this.tensorNames.length === 1 ? '1 tensor' :
-            this.tensorNames.length + ' tensors';
-    };
-    DataPanel.prototype._getNumRunsLabel = function () {
-        return this.runNames.length === 1 ? '1 run' :
-            this.runNames.length + ' runs';
-    };
-    DataPanel.prototype._hasChoices = function (choices) {
-        return choices.length > 1;
-    };
-    return DataPanel;
-}(exports.DataPanelPolymer));
-exports.DataPanel = DataPanel;
-document.registerElement(DataPanel.prototype.is, DataPanel);
-
-},{"./data-provider":6,"./util":24,"./vz-projector-util":33}],28:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-// tslint:disable-next-line:no-unused-variable
-var vz_projector_util_1 = require('./vz-projector-util');
-// tslint:disable-next-line
-exports.PolymerClass = vz_projector_util_1.PolymerElement({ is: 'vz-projector-input', properties: { label: String, message: String } });
-/** Input control with custom capabilities (e.g. regex). */
-var ProjectorInput = (function (_super) {
-    __extends(ProjectorInput, _super);
-    function ProjectorInput() {
-        _super.apply(this, arguments);
-    }
-    /** Subscribe to be called everytime the input changes. */
-    ProjectorInput.prototype.registerInputChangedListener = function (listener) {
-        this.textChangedListeners.push(listener);
-    };
-    ProjectorInput.prototype.ready = function () {
-        var _this = this;
-        this.inRegexMode = false;
-        this.textChangedListeners = [];
-        this.dom = d3.select(this);
-        this.paperInput = this.querySelector('paper-input');
-        this.inRegexModeButton =
-            this.querySelector('paper-button');
-        this.paperInput.setAttribute('error-message', 'Invalid regex');
-        this.paperInput.addEventListener('input', function () {
-            _this.onTextChanged();
-        });
-        this.paperInput.addEventListener('keydown', function (event) {
-            event.stopPropagation();
-        });
-        this.inRegexModeButton.addEventListener('click', function () { return _this.onClickRegexModeButton(); });
-        this.updateRegexModeDisplaySlashes();
-        this.onTextChanged();
-    };
-    ProjectorInput.prototype.onClickRegexModeButton = function () {
-        this.inRegexMode = this.inRegexModeButton.active;
-        this.updateRegexModeDisplaySlashes();
-        this.onTextChanged();
-    };
-    ProjectorInput.prototype.notifyInputChanged = function (value, inRegexMode) {
-        this.textChangedListeners.forEach(function (l) { return l(value, inRegexMode); });
-    };
-    ProjectorInput.prototype.onTextChanged = function () {
-        try {
-            if (this.inRegexMode) {
-                new RegExp(this.paperInput.value);
-            }
-        }
-        catch (invalidRegexException) {
-            this.paperInput.setAttribute('invalid', 'true');
-            this.message = '';
-            this.notifyInputChanged(null, true);
-            return;
-        }
-        this.paperInput.removeAttribute('invalid');
-        this.notifyInputChanged(this.paperInput.value, this.inRegexMode);
-    };
-    ProjectorInput.prototype.updateRegexModeDisplaySlashes = function () {
-        d3.select(this.paperInput)
-            .selectAll('.slash')
-            .style('display', this.inRegexMode ? null : 'none');
-    };
-    ProjectorInput.prototype.getValue = function () {
-        return this.paperInput.value;
-    };
-    ProjectorInput.prototype.getInRegexMode = function () {
-        return this.inRegexMode;
-    };
-    ProjectorInput.prototype.set = function (value, inRegexMode) {
-        this.inRegexModeButton.active = inRegexMode;
-        this.paperInput.value = value;
-        this.onClickRegexModeButton();
-    };
-    return ProjectorInput;
-}(exports.PolymerClass));
-exports.ProjectorInput = ProjectorInput;
-document.registerElement(ProjectorInput.prototype.is, ProjectorInput);
-
-},{"./vz-projector-util":33}],29:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-var adapter = require('./projectorScatterPlotAdapter');
-var vector = require('./vector');
-// tslint:disable-next-line:no-unused-variable
-var vz_projector_util_1 = require('./vz-projector-util');
-/** Limit the number of search results we show to the user. */
-var LIMIT_RESULTS = 100;
-// tslint:disable-next-line
-exports.PolymerClass = vz_projector_util_1.PolymerElement({
-    is: 'vz-projector-inspector-panel',
-    properties: { selectedMetadataField: String, metadataFields: Array }
-});
-var InspectorPanel = (function (_super) {
-    __extends(InspectorPanel, _super);
-    function InspectorPanel() {
-        _super.apply(this, arguments);
-    }
-    InspectorPanel.prototype.ready = function () {
-        this.dom = d3.select(this);
-        this.resetFilterButton = this.dom.select('.reset-filter');
-        this.setFilterButton = this.dom.select('.set-filter');
-        this.clearSelectionButton = this.dom.select('.clear-selection');
-        this.limitMessage = this.dom.select('.limit-msg');
-        this.searchBox = this.querySelector('#search-box');
-        // https://www.polymer-project.org/1.0/docs/devguide/styling#scope-subtree
-        this.scopeSubtree(this, true);
-    };
-    InspectorPanel.prototype.initialize = function (projector, projectorEventContext) {
-        var _this = this;
-        this.projector = projector;
-        this.projectorEventContext = projectorEventContext;
-        this.setupUI(projector);
-        projectorEventContext.registerSelectionChangedListener(function (selection, neighbors) {
-            return _this.updateInspectorPane(selection, neighbors);
-        });
-    };
-    /** Updates the nearest neighbors list in the inspector. */
-    InspectorPanel.prototype.updateInspectorPane = function (indices, neighbors) {
-        this.neighborsOfFirstPoint = neighbors;
-        this.selectedPointIndices = indices;
-        this.updateFilterButtons(indices.length + neighbors.length);
-        this.updateNeighborsList(neighbors);
-        if (neighbors.length === 0) {
-            this.updateSearchResults(indices);
-        }
-        else {
-            this.updateSearchResults([]);
-        }
-    };
-    InspectorPanel.prototype.enableResetFilterButton = function (enabled) {
-        this.resetFilterButton.attr('disabled', enabled ? null : true);
-    };
-    InspectorPanel.prototype.restoreUIFromBookmark = function (bookmark) {
-        this.enableResetFilterButton(bookmark.filteredPoints != null);
-    };
-    InspectorPanel.prototype.metadataChanged = function (spriteAndMetadata) {
-        var labelIndex = -1;
-        this.metadataFields = spriteAndMetadata.stats.map(function (stats, i) {
-            if (!stats.isNumeric && labelIndex === -1) {
-                labelIndex = i;
-            }
-            return stats.name;
-        });
-        labelIndex = Math.max(0, labelIndex);
-        // Make the default label the first non-numeric column.
-        this.selectedMetadataField = spriteAndMetadata.stats[labelIndex].name;
-    };
-    InspectorPanel.prototype.datasetChanged = function () {
-        this.enableResetFilterButton(false);
-    };
-    InspectorPanel.prototype.updateSearchResults = function (indices) {
-        var _this = this;
-        var container = this.dom.select('.matches-list');
-        container.style('display', indices.length ? null : 'none');
-        var list = container.select('.list');
-        list.html('');
-        if (indices.length === 0) {
-            return;
-        }
-        this.limitMessage.style('display', indices.length <= LIMIT_RESULTS ? 'none' : null);
-        indices = indices.slice(0, LIMIT_RESULTS);
-        var rows = list.selectAll('.row').data(indices).enter().append('div').attr('class', 'row');
-        rows.append('a')
-            .attr('class', 'label')
-            .attr('title', function (index) { return _this.getLabelFromIndex(index); })
-            .text(function (index) { return _this.getLabelFromIndex(index); });
-        rows.on('mouseenter', function (index) {
-            _this.projectorEventContext.notifyHoverOverPoint(index);
-        });
-        rows.on('mouseleave', function () {
-            _this.projectorEventContext.notifyHoverOverPoint(null);
-        });
-        rows.on('click', function (index) {
-            _this.projectorEventContext.notifySelectionChanged([index]);
-        });
-    };
-    InspectorPanel.prototype.getLabelFromIndex = function (pointIndex) {
-        var point = this.projector.dataSet.points[pointIndex];
-        return point.metadata[this.selectedMetadataField].toString();
-    };
-    InspectorPanel.prototype.updateNeighborsList = function (neighbors) {
-        var _this = this;
-        var nnlist = this.dom.select('.nn-list');
-        nnlist.html('');
-        this.dom.select('.nn').style('display', neighbors.length ? null : 'none');
-        if (neighbors.length === 0) {
-            return;
-        }
-        this.searchBox.message = '';
-        var minDist = neighbors.length > 0 ? neighbors[0].dist : 0;
-        var n = nnlist.selectAll('.neighbor')
-            .data(neighbors)
-            .enter()
-            .append('div')
-            .attr('class', 'neighbor')
-            .append('a')
-            .attr('class', 'neighbor-link')
-            .attr('title', function (d) { return _this.getLabelFromIndex(d.index); });
-        var labelValue = n.append('div').attr('class', 'label-and-value');
-        labelValue.append('div')
-            .attr('class', 'label')
-            .style('color', function (d) { return adapter.dist2color(_this.distFunc, d.dist, minDist); })
-            .text(function (d) { return _this.getLabelFromIndex(d.index); });
-        labelValue.append('div')
-            .attr('class', 'value')
-            .text(function (d) { return d.dist.toFixed(3); });
-        var bar = n.append('div').attr('class', 'bar');
-        bar.append('div')
-            .attr('class', 'fill')
-            .style('border-top-color', function (d) {
-            return adapter.dist2color(_this.distFunc, d.dist, minDist);
-        })
-            .style('width', function (d) { return adapter.normalizeDist(_this.distFunc, d.dist, minDist) * 100 +
-            '%'; });
-        bar.selectAll('.tick')
-            .data(d3.range(1, 4))
-            .enter()
-            .append('div')
-            .attr('class', 'tick')
-            .style('left', function (d) { return d * 100 / 4 + '%'; });
-        n.on('mouseenter', function (d) {
-            _this.projectorEventContext.notifyHoverOverPoint(d.index);
-        });
-        n.on('mouseleave', function () {
-            _this.projectorEventContext.notifyHoverOverPoint(null);
-        });
-        n.on('click', function (d) {
-            _this.projectorEventContext.notifySelectionChanged([d.index]);
-        });
-    };
-    InspectorPanel.prototype.updateFilterButtons = function (numPoints) {
-        if (numPoints > 1) {
-            this.setFilterButton.text("Isolate " + numPoints + " points")
-                .attr('disabled', null);
-            this.clearSelectionButton.attr('disabled', null);
-        }
-        else {
-            this.setFilterButton.attr('disabled', true);
-            this.clearSelectionButton.attr('disabled', true);
-        }
-    };
-    InspectorPanel.prototype.setupUI = function (projector) {
-        var _this = this;
-        this.distFunc = vector.cosDist;
-        var eucDist = this.dom.select('.distance a.euclidean');
-        eucDist.on('click', function () {
-            _this.dom.selectAll('.distance a').classed('selected', false);
-            eucDist.classed('selected', true);
-            _this.distFunc = vector.dist;
-            _this.projectorEventContext.notifyDistanceMetricChanged(_this.distFunc);
-            var neighbors = projector.dataSet.findNeighbors(_this.selectedPointIndices[0], _this.distFunc, _this.numNN);
-            _this.updateNeighborsList(neighbors);
-        });
-        var cosDist = this.dom.select('.distance a.cosine');
-        cosDist.on('click', function () {
-            _this.dom.selectAll('.distance a').classed('selected', false);
-            cosDist.classed('selected', true);
-            _this.distFunc = vector.cosDist;
-            _this.projectorEventContext.notifyDistanceMetricChanged(_this.distFunc);
-            var neighbors = projector.dataSet.findNeighbors(_this.selectedPointIndices[0], _this.distFunc, _this.numNN);
-            _this.updateNeighborsList(neighbors);
-        });
-        // Called whenever the search text input changes.
-        var updateInput = function (value, inRegexMode) {
-            if (value == null || value.trim() === '') {
-                _this.searchBox.message = '';
-                _this.projectorEventContext.notifySelectionChanged([]);
-                return;
-            }
-            var indices = projector.dataSet.query(value, inRegexMode, _this.selectedMetadataField);
-            if (indices.length === 0) {
-                _this.searchBox.message = '0 matches.';
-            }
-            else {
-                _this.searchBox.message = indices.length + " matches.";
-            }
-            _this.projectorEventContext.notifySelectionChanged(indices);
-        };
-        this.searchBox.registerInputChangedListener(function (value, inRegexMode) {
-            updateInput(value, inRegexMode);
-        });
-        // Nearest neighbors controls.
-        var numNNInput = this.$$('#nn-slider');
-        var updateNumNN = function () {
-            _this.numNN = +numNNInput.value;
-            _this.dom.select('.num-nn .nn-count').text(_this.numNN);
-            if (_this.selectedPointIndices != null) {
-                _this.projectorEventContext.notifySelectionChanged([_this.selectedPointIndices[0]]);
-            }
-        };
-        numNNInput.addEventListener('change', updateNumNN);
-        updateNumNN();
-        // Filtering dataset.
-        this.setFilterButton.on('click', function () {
-            var indices = _this.selectedPointIndices.concat(_this.neighborsOfFirstPoint.map(function (n) { return n.index; }));
-            projector.filterDataset(indices);
-            _this.enableResetFilterButton(true);
-            _this.updateFilterButtons(0);
-        });
-        this.resetFilterButton.on('click', function () {
-            projector.resetFilterDataset();
-            _this.enableResetFilterButton(false);
-        });
-        this.clearSelectionButton.on('click', function () {
-            projector.adjustSelectionAndHover([]);
-        });
-        this.enableResetFilterButton(false);
-    };
-    return InspectorPanel;
-}(exports.PolymerClass));
-exports.InspectorPanel = InspectorPanel;
-document.registerElement(InspectorPanel.prototype.is, InspectorPanel);
-
-},{"./projectorScatterPlotAdapter":14,"./vector":25,"./vz-projector-util":33}],30:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-// tslint:disable-next-line:no-unused-variable
-var vz_projector_util_1 = require('./vz-projector-util');
-// tslint:disable-next-line
-exports.LegendPolymer = vz_projector_util_1.PolymerElement({
-    is: 'vz-projector-legend',
-    properties: { renderInfo: { type: Object, observer: '_renderInfoChanged' } }
-});
-var Legend = (function (_super) {
-    __extends(Legend, _super);
-    function Legend() {
-        _super.apply(this, arguments);
-    }
-    Legend.prototype.ready = function () {
-        this.dom = d3.select(this);
-    };
-    Legend.prototype._renderInfoChanged = function () {
-        var _this = this;
-        if (this.renderInfo == null) {
-            return;
-        }
-        if (this.renderInfo.thresholds) {
-            // <linearGradient> is under dom-if so we should wait for it to be
-            // inserted in the dom tree using async().
-            this.async(function () { return _this.setupLinearGradient(); });
-        }
-    };
-    Legend.prototype._getLastThreshold = function () {
-        if (this.renderInfo == null || this.renderInfo.thresholds == null) {
-            return;
-        }
-        return this.renderInfo.thresholds[this.renderInfo.thresholds.length - 1]
-            .value;
-    };
-    Legend.prototype.getOffset = function (value) {
-        var min = this.renderInfo.thresholds[0].value;
-        var max = this.renderInfo.thresholds[this.renderInfo.thresholds.length - 1].value;
-        return (100 * (value - min) / (max - min)).toFixed(2) + '%';
-    };
-    Legend.prototype.setupLinearGradient = function () {
-        var _this = this;
-        var linearGradient = this.dom.select('#gradient');
-        var width = this.dom.select('svg.gradient').node().clientWidth;
-        // Set the svg <rect> to be the width of its <svg> parent.
-        this.dom.select('svg.gradient rect').attr('width', width);
-        // Remove all <stop> children from before.
-        linearGradient.selectAll('*').remove();
-        // Add a <stop> child in <linearGradient> for each gradient threshold.
-        this.renderInfo.thresholds.forEach(function (t) {
-            linearGradient.append('stop')
-                .attr('offset', _this.getOffset(t.value))
-                .attr('stop-color', t.color);
-        });
-    };
-    return Legend;
-}(exports.LegendPolymer));
-exports.Legend = Legend;
-document.registerElement(Legend.prototype.is, Legend);
-
-},{"./vz-projector-util":33}],31:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-// tslint:disable-next-line:no-unused-variable
-var vz_projector_util_1 = require('./vz-projector-util');
-// tslint:disable-next-line
-exports.MetadataCardPolymer = vz_projector_util_1.PolymerElement({
-    is: 'vz-projector-metadata-card',
-    properties: {
-        hasMetadata: { type: Boolean, value: false },
-        metadata: { type: Array },
-        label: String
-    }
-});
-var MetadataCard = (function (_super) {
-    __extends(MetadataCard, _super);
-    function MetadataCard() {
-        _super.apply(this, arguments);
-    }
-    MetadataCard.prototype.ready = function () {
-        this.dom = d3.select(this);
-    };
-    /** Handles a click on the expand more icon. */
-    MetadataCard.prototype._expandMore = function () {
-        this.$$('#metadata-container').toggle();
-        this.dom.select('#expand-more').style('display', 'none');
-        this.dom.select('#expand-less').style('display', '');
-    };
-    /** Handles a click on the expand less icon. */
-    MetadataCard.prototype._expandLess = function () {
-        this.$$('#metadata-container').toggle();
-        this.dom.select('#expand-more').style('display', '');
-        this.dom.select('#expand-less').style('display', 'none');
-    };
-    MetadataCard.prototype.updateMetadata = function (pointMetadata) {
-        this.pointMetadata = pointMetadata;
-        this.hasMetadata = (pointMetadata != null);
-        if (pointMetadata) {
-            var metadata = [];
-            for (var metadataKey in pointMetadata) {
-                if (!pointMetadata.hasOwnProperty(metadataKey)) {
-                    continue;
-                }
-                metadata.push({ key: metadataKey, value: pointMetadata[metadataKey] });
-            }
-            this.metadata = metadata;
-            this.label = '' + this.pointMetadata[this.labelOption];
-        }
-    };
-    MetadataCard.prototype.setLabelOption = function (labelOption) {
-        this.labelOption = labelOption;
-        if (this.pointMetadata) {
-            this.label = '' + this.pointMetadata[this.labelOption];
-        }
-    };
-    return MetadataCard;
-}(exports.MetadataCardPolymer));
-exports.MetadataCard = MetadataCard;
-document.registerElement(MetadataCard.prototype.is, MetadataCard);
-
-},{"./vz-projector-util":33}],32:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-var data = require('./data');
-var data_1 = require('./data');
-var vector = require('./vector');
-// tslint:disable-next-line:no-unused-variable
-var vz_projector_util_1 = require('./vz-projector-util');
-var NUM_PCA_COMPONENTS = 10;
-// tslint:disable-next-line
-exports.ProjectionsPanelPolymer = vz_projector_util_1.PolymerElement({
-    is: 'vz-projector-projections-panel',
-    properties: {
-        pcaIs3d: { type: Boolean, value: true, observer: '_pcaDimensionToggleObserver' },
-        tSNEis3d: { type: Boolean, value: true, observer: '_tsneDimensionToggleObserver' },
-        // PCA projection.
-        pcaComponents: Array,
-        pcaX: { type: Number, value: 0, observer: 'showPCAIfEnabled' },
-        pcaY: { type: Number, value: 1, observer: 'showPCAIfEnabled' },
-        pcaZ: { type: Number, value: 2, observer: 'showPCAIfEnabled' },
-        // Custom projection.
-        customSelectedSearchByMetadataOption: {
-            type: String,
-            observer: '_customSelectedSearchByMetadataOptionChanged'
-        },
-    }
-});
-/**
- * A polymer component which handles the projection tabs in the projector.
- */
-var ProjectionsPanel = (function (_super) {
-    __extends(ProjectionsPanel, _super);
-    function ProjectionsPanel() {
-        _super.apply(this, arguments);
-    }
-    ProjectionsPanel.prototype.initialize = function (projector) {
-        this.polymerChangesTriggerReprojection = true;
-        this.projector = projector;
-        // Set up TSNE projections.
-        this.perplexity = 30;
-        this.learningRate = 10;
-        // Setup Custom projections.
-        this.centroidValues = { xLeft: null, xRight: null, yUp: null, yDown: null };
-        this.clearCentroids();
-        this.setupUIControls();
-    };
-    ProjectionsPanel.prototype.ready = function () {
-        this.dom = d3.select(this);
-        this.zDropdown = this.dom.select('#z-dropdown');
-        this.runTsneButton = this.dom.select('.run-tsne');
-        this.stopTsneButton = this.dom.select('.stop-tsne');
-        this.perplexitySlider = this.$$('#perplexity-slider');
-        this.learningRateInput =
-            this.$$('#learning-rate-slider');
-        this.iterationLabel = this.dom.select('.run-tsne-iter');
-    };
-    ProjectionsPanel.prototype.disablePolymerChangesTriggerReprojection = function () {
-        this.polymerChangesTriggerReprojection = false;
-    };
-    ProjectionsPanel.prototype.enablePolymerChangesTriggerReprojection = function () {
-        this.polymerChangesTriggerReprojection = true;
-    };
-    ProjectionsPanel.prototype.updateTSNEPerplexityFromSliderChange = function () {
-        if (this.perplexitySlider) {
-            this.perplexity = +this.perplexitySlider.value;
-        }
-        this.dom.select('.tsne-perplexity span').text(this.perplexity);
-    };
-    ProjectionsPanel.prototype.updateTSNELearningRateFromUIChange = function () {
-        if (this.learningRateInput) {
-            this.learningRate = Math.pow(10, +this.learningRateInput.value);
-        }
-        this.dom.select('.tsne-learning-rate span').text(this.learningRate);
-    };
-    ProjectionsPanel.prototype.setupUIControls = function () {
-        var _this = this;
-        {
-            var self_1 = this;
-            this.dom.selectAll('.ink-tab').on('click', function () {
-                var id = this.getAttribute('data-tab');
-                self_1.showTab(id);
-            });
-        }
-        this.runTsneButton.on('click', function () { return _this.runTSNE(); });
-        this.stopTsneButton.on('click', function () { return _this.dataSet.stopTSNE(); });
-        this.perplexitySlider.value = this.perplexity.toString();
-        this.perplexitySlider.addEventListener('change', function () { return _this.updateTSNEPerplexityFromSliderChange(); });
-        this.updateTSNEPerplexityFromSliderChange();
-        this.learningRateInput.addEventListener('change', function () { return _this.updateTSNELearningRateFromUIChange(); });
-        this.updateTSNELearningRateFromUIChange();
-        this.setupCustomProjectionInputFields();
-        // TODO: figure out why `--paper-input-container-input` css mixin didn't
-        // work.
-        this.dom.selectAll('paper-dropdown-menu paper-input input')
-            .style('font-size', '14px');
-    };
-    ProjectionsPanel.prototype.restoreUIFromBookmark = function (bookmark) {
-        this.disablePolymerChangesTriggerReprojection();
-        // PCA
-        this.pcaX = bookmark.pcaComponentDimensions[0];
-        this.pcaY = bookmark.pcaComponentDimensions[1];
-        if (bookmark.pcaComponentDimensions.length === 3) {
-            this.pcaZ = bookmark.pcaComponentDimensions[2];
-        }
-        this.pcaIs3d = (bookmark.pcaComponentDimensions.length === 3);
-        // t-SNE
-        if (this.perplexitySlider) {
-            this.perplexitySlider.value = bookmark.tSNEPerplexity.toString();
-        }
-        if (this.learningRateInput) {
-            this.learningRateInput.value = bookmark.tSNELearningRate.toString();
-        }
-        this.tSNEis3d = bookmark.tSNEis3d;
-        // custom
-        this.customSelectedSearchByMetadataOption =
-            bookmark.customSelectedSearchByMetadataOption;
-        if (this.customProjectionXLeftInput) {
-            this.customProjectionXLeftInput.set(bookmark.customXLeftText, bookmark.customXLeftRegex);
-        }
-        if (this.customProjectionXRightInput) {
-            this.customProjectionXRightInput.set(bookmark.customXRightText, bookmark.customXRightRegex);
-        }
-        if (this.customProjectionYUpInput) {
-            this.customProjectionYUpInput.set(bookmark.customYUpText, bookmark.customYUpRegex);
-        }
-        if (this.customProjectionYDownInput) {
-            this.customProjectionYDownInput.set(bookmark.customYDownText, bookmark.customYDownRegex);
-        }
-        this.computeAllCentroids();
-        this.setZDropdownEnabled(this.pcaIs3d);
-        this.updateTSNEPerplexityFromSliderChange();
-        this.updateTSNELearningRateFromUIChange();
-        if (this.iterationLabel) {
-            this.iterationLabel.text(bookmark.tSNEIteration.toString());
-        }
-        this.showTab(bookmark.selectedProjection);
-        this.enablePolymerChangesTriggerReprojection();
-    };
-    ProjectionsPanel.prototype.populateBookmarkFromUI = function (bookmark) {
-        this.disablePolymerChangesTriggerReprojection();
-        // PCA
-        bookmark.pcaComponentDimensions = [this.pcaX, this.pcaY];
-        if (this.pcaIs3d) {
-            bookmark.pcaComponentDimensions.push(this.pcaZ);
-        }
-        // t-SNE
-        if (this.perplexitySlider != null) {
-            bookmark.tSNEPerplexity = +this.perplexitySlider.value;
-        }
-        if (this.learningRateInput != null) {
-            bookmark.tSNELearningRate = +this.learningRateInput.value;
-        }
-        bookmark.tSNEis3d = this.tSNEis3d;
-        // custom
-        bookmark.customSelectedSearchByMetadataOption =
-            this.customSelectedSearchByMetadataOption;
-        if (this.customProjectionXLeftInput != null) {
-            bookmark.customXLeftText = this.customProjectionXLeftInput.getValue();
-            bookmark.customXLeftRegex =
-                this.customProjectionXLeftInput.getInRegexMode();
-        }
-        if (this.customProjectionXRightInput != null) {
-            bookmark.customXRightText = this.customProjectionXRightInput.getValue();
-            bookmark.customXRightRegex =
-                this.customProjectionXRightInput.getInRegexMode();
-        }
-        if (this.customProjectionYUpInput != null) {
-            bookmark.customYUpText = this.customProjectionYUpInput.getValue();
-            bookmark.customYUpRegex = this.customProjectionYUpInput.getInRegexMode();
-        }
-        if (this.customProjectionYDownInput != null) {
-            bookmark.customYDownText = this.customProjectionYDownInput.getValue();
-            bookmark.customYDownRegex =
-                this.customProjectionYDownInput.getInRegexMode();
-        }
-        this.enablePolymerChangesTriggerReprojection();
-    };
-    // This method is marked as public as it is used as the view method that
-    // abstracts DOM manipulation so we can stub it in a test.
-    // TODO(nsthorat): Move this to its own class as the glue between this class
-    // and the DOM.
-    ProjectionsPanel.prototype.setZDropdownEnabled = function (enabled) {
-        if (this.zDropdown) {
-            this.zDropdown.attr('disabled', enabled ? null : true);
-        }
-    };
-    ProjectionsPanel.prototype.dataSetUpdated = function (dataSet, originalDataSet, dim) {
-        this.dataSet = dataSet;
-        this.originalDataSet = originalDataSet;
-        this.dim = dim;
-        var pointCount = (dataSet == null) ? 0 : dataSet.points.length;
-        var perplexity = Math.max(5, Math.ceil(Math.sqrt(pointCount) / 4));
-        this.perplexitySlider.value = perplexity.toString();
-        this.updateTSNEPerplexityFromSliderChange();
-        this.clearCentroids();
-        this.dom.select('#tsne-sampling')
-            .style('display', pointCount > data.TSNE_SAMPLE_SIZE ? null : 'none');
-        var wasSampled = (dataSet == null) ? false : (dataSet.dim[0] > data.PCA_SAMPLE_DIM ||
-            dataSet.dim[1] > data.PCA_SAMPLE_DIM);
-        this.dom.select('#pca-sampling')
-            .style('display', wasSampled ? null : 'none');
-        this.showTab('pca');
-    };
-    ProjectionsPanel.prototype._pcaDimensionToggleObserver = function () {
-        this.setZDropdownEnabled(this.pcaIs3d);
-        this.beginProjection(this.currentProjection);
-    };
-    ProjectionsPanel.prototype._tsneDimensionToggleObserver = function () {
-        this.beginProjection(this.currentProjection);
-    };
-    ProjectionsPanel.prototype.metadataChanged = function (spriteAndMetadata) {
-        // Project by options for custom projections.
-        var searchByMetadataIndex = -1;
-        this.searchByMetadataOptions = spriteAndMetadata.stats.map(function (stats, i) {
-            // Make the default label by the first non-numeric column.
-            if (!stats.isNumeric && searchByMetadataIndex === -1) {
-                searchByMetadataIndex = i;
-            }
-            return stats.name;
-        });
-        this.customSelectedSearchByMetadataOption =
-            this.searchByMetadataOptions[Math.max(0, searchByMetadataIndex)];
-    };
-    ProjectionsPanel.prototype.showTab = function (id) {
-        var _this = this;
-        this.currentProjection = id;
-        var tab = this.dom.select('.ink-tab[data-tab="' + id + '"]');
-        this.dom.selectAll('.ink-tab').classed('active', false);
-        tab.classed('active', true);
-        this.dom.selectAll('.ink-panel-content').classed('active', false);
-        this.dom.select('.ink-panel-content[data-panel="' + id + '"]')
-            .classed('active', true);
-        // guard for unit tests, where polymer isn't attached and $ doesn't exist.
-        if (this.$ != null) {
-            var main_1 = this.$['main'];
-            // In order for the projections panel to animate its height, we need to
-            // set it explicitly.
-            requestAnimationFrame(function () {
-                _this.style.height = main_1.clientHeight + 'px';
-            });
-        }
-        this.beginProjection(id);
-    };
-    ProjectionsPanel.prototype.beginProjection = function (projection) {
-        if (this.polymerChangesTriggerReprojection === false) {
-            return;
-        }
-        if (projection === 'pca') {
-            if (this.dataSet != null) {
-                this.dataSet.stopTSNE();
-            }
-            this.showPCA();
-        }
-        else if (projection === 'tsne') {
-            this.showTSNE();
-        }
-        else if (projection === 'custom') {
-            if (this.dataSet != null) {
-                this.dataSet.stopTSNE();
-            }
-            this.computeAllCentroids();
-            this.reprojectCustom();
-        }
-    };
-    ProjectionsPanel.prototype.showTSNE = function () {
-        var dataSet = this.dataSet;
-        if (dataSet == null) {
-            return;
-        }
-        var accessors = data.getProjectionComponents('tsne', [0, 1, this.tSNEis3d ? 2 : null]);
-        var dimensionality = this.tSNEis3d ? 3 : 2;
-        var projection = new data_1.Projection('tsne', accessors, dimensionality, dataSet);
-        this.projector.setProjection(projection);
-        if (!this.dataSet.hasTSNERun) {
-            this.runTSNE();
-        }
-        else {
-            this.projector.notifyProjectionPositionsUpdated();
-        }
-    };
-    ProjectionsPanel.prototype.runTSNE = function () {
-        var _this = this;
-        this.runTsneButton.attr('disabled', true);
-        this.stopTsneButton.attr('disabled', null);
-        this.dataSet.projectTSNE(this.perplexity, this.learningRate, this.tSNEis3d ? 3 : 2, function (iteration) {
-            if (iteration != null) {
-                _this.iterationLabel.text(iteration);
-                _this.projector.notifyProjectionPositionsUpdated();
-            }
-            else {
-                _this.runTsneButton.attr('disabled', null);
-                _this.stopTsneButton.attr('disabled', true);
-            }
-        });
-    };
-    // tslint:disable-next-line:no-unused-variable
-    ProjectionsPanel.prototype.showPCAIfEnabled = function () {
-        if (this.polymerChangesTriggerReprojection) {
-            this.showPCA();
-        }
-    };
-    ProjectionsPanel.prototype.updateTotalVarianceMessage = function () {
-        var variances = this.dataSet.fracVariancesExplained;
-        var totalVariance = variances[this.pcaX] + variances[this.pcaY];
-        var msg = 'Total variance described: ';
-        if (this.pcaIs3d) {
-            totalVariance += variances[this.pcaZ];
-        }
-        msg += (totalVariance * 100).toFixed(1) + '%.';
-        this.dom.select('#total-variance').html(msg);
-    };
-    ProjectionsPanel.prototype.showPCA = function () {
-        var _this = this;
-        if (this.dataSet == null) {
-            return;
-        }
-        this.dataSet.projectPCA().then(function () {
-            // Polymer properties are 1-based.
-            var accessors = data.getProjectionComponents('pca', [_this.pcaX, _this.pcaY, _this.pcaZ]);
-            var dimensionality = _this.pcaIs3d ? 3 : 2;
-            var projection = new data_1.Projection('pca', accessors, dimensionality, _this.dataSet);
-            _this.projector.setProjection(projection);
-            var numComponents = Math.min(NUM_PCA_COMPONENTS, _this.dataSet.dim[1]);
-            _this.updateTotalVarianceMessage();
-            _this.pcaComponents = d3.range(0, numComponents).map(function (i) {
-                var fracVariance = _this.dataSet.fracVariancesExplained[i];
-                return {
-                    id: i,
-                    componentNumber: i + 1,
-                    percVariance: (fracVariance * 100).toFixed(1)
-                };
-            });
-        });
-    };
-    ProjectionsPanel.prototype.reprojectCustom = function () {
-        if (this.centroids == null || this.centroids.xLeft == null ||
-            this.centroids.xRight == null || this.centroids.yUp == null ||
-            this.centroids.yDown == null) {
-            return;
-        }
-        var xDir = vector.sub(this.centroids.xRight, this.centroids.xLeft);
-        this.dataSet.projectLinear(xDir, 'linear-x');
-        var yDir = vector.sub(this.centroids.yUp, this.centroids.yDown);
-        this.dataSet.projectLinear(yDir, 'linear-y');
-        var accessors = data.getProjectionComponents('custom', ['x', 'y']);
-        var projection = new data_1.Projection('custom', accessors, 2, this.dataSet);
-        this.projector.setProjection(projection);
-    };
-    ProjectionsPanel.prototype.clearCentroids = function () {
-        this.centroids = { xLeft: null, xRight: null, yUp: null, yDown: null };
-        this.allCentroid = null;
-    };
-    ProjectionsPanel.prototype._customSelectedSearchByMetadataOptionChanged = function (newVal, oldVal) {
-        if (this.polymerChangesTriggerReprojection === false) {
-            return;
-        }
-        if (this.currentProjection === 'custom') {
-            this.computeAllCentroids();
-            this.reprojectCustom();
-        }
-    };
-    ProjectionsPanel.prototype.setupCustomProjectionInputFields = function () {
-        this.customProjectionXLeftInput =
-            this.setupCustomProjectionInputField('xLeft');
-        this.customProjectionXRightInput =
-            this.setupCustomProjectionInputField('xRight');
-        this.customProjectionYUpInput = this.setupCustomProjectionInputField('yUp');
-        this.customProjectionYDownInput =
-            this.setupCustomProjectionInputField('yDown');
-    };
-    ProjectionsPanel.prototype.computeAllCentroids = function () {
-        this.computeCentroid('xLeft');
-        this.computeCentroid('xRight');
-        this.computeCentroid('yUp');
-        this.computeCentroid('yDown');
-    };
-    ProjectionsPanel.prototype.computeCentroid = function (name) {
-        var input = this.querySelector('#' + name);
-        if (input == null) {
-            return;
-        }
-        var value = input.getValue();
-        if (value == null) {
-            return;
-        }
-        var inRegexMode = input.getInRegexMode();
-        var result = this.getCentroid(value, inRegexMode);
-        if (result.numMatches === 0) {
-            input.message = '0 matches. Using a random vector.';
-            result.centroid = vector.rn(this.dim);
-        }
-        else {
-            input.message = result.numMatches + " matches.";
-        }
-        this.centroids[name] = result.centroid;
-        this.centroidValues[name] = value;
-    };
-    ProjectionsPanel.prototype.setupCustomProjectionInputField = function (name) {
-        var _this = this;
-        var input = this.querySelector('#' + name);
-        input.registerInputChangedListener(function (input, inRegexMode) {
-            if (_this.polymerChangesTriggerReprojection) {
-                _this.computeCentroid(name);
-                _this.reprojectCustom();
-            }
-        });
-        return input;
-    };
-    ProjectionsPanel.prototype.getCentroid = function (pattern, inRegexMode) {
-        var _this = this;
-        if (pattern == null || pattern === '') {
-            return { numMatches: 0 };
-        }
-        // Search by the original dataset since we often want to filter and project
-        // only the nearest neighbors of A onto B-C where B and C are not nearest
-        // neighbors of A.
-        var accessor = function (i) { return _this.originalDataSet.points[i].vector; };
-        var r = this.originalDataSet.query(pattern, inRegexMode, this.customSelectedSearchByMetadataOption);
-        return { centroid: vector.centroid(r, accessor), numMatches: r.length };
-    };
-    ProjectionsPanel.prototype.getPcaSampledDimText = function () {
-        return data.PCA_SAMPLE_DIM.toLocaleString();
-    };
-    ProjectionsPanel.prototype.getPcaSampleSizeText = function () {
-        return data.PCA_SAMPLE_SIZE.toLocaleString();
-    };
-    ProjectionsPanel.prototype.getTsneSampleSizeText = function () {
-        return data.TSNE_SAMPLE_SIZE.toLocaleString();
-    };
-    return ProjectionsPanel;
-}(exports.ProjectionsPanelPolymer));
-exports.ProjectionsPanel = ProjectionsPanel;
-document.registerElement(ProjectionsPanel.prototype.is, ProjectionsPanel);
-
-},{"./data":7,"./vector":25,"./vz-projector-util":33}],33:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-function PolymerElement(spec) {
-    return Polymer.Class(spec);
-}
-exports.PolymerElement = PolymerElement;
-
-},{}],34:[function(require,module,exports){
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-"use strict";
-var __extends = (this && this.__extends) || function (d, b) {
-    for (var p in b) if (b.hasOwnProperty(p)) d[p] = b[p];
-    function __() { this.constructor = d; }
-    d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
-};
-var analyticsLogger_1 = require('./analyticsLogger');
-var data = require('./data');
-var data_1 = require('./data');
-var data_provider_demo_1 = require('./data-provider-demo');
-var data_provider_proto_1 = require('./data-provider-proto');
-var data_provider_server_1 = require('./data-provider-server');
-var logging = require('./logging');
-var projectorScatterPlotAdapter_1 = require('./projectorScatterPlotAdapter');
-var scatterPlot_1 = require('./scatterPlot');
-var util = require('./util');
-// tslint:disable-next-line:no-unused-variable
-var vz_projector_util_1 = require('./vz-projector-util');
-/**
- * The minimum number of dimensions the data should have to automatically
- * decide to normalize the data.
- */
-var THRESHOLD_DIM_NORMALIZE = 50;
-var POINT_COLOR_MISSING = 'black';
-exports.ProjectorPolymer = vz_projector_util_1.PolymerElement({
-    is: 'vz-projector',
-    properties: {
-        routePrefix: String,
-        dataProto: { type: String, observer: '_dataProtoChanged' },
-        servingMode: String,
-        projectorConfigJsonPath: String,
-        pageViewLogging: Boolean,
-        eventLogging: Boolean
-    }
-});
-var INDEX_METADATA_FIELD = '__index__';
-var Projector = (function (_super) {
-    __extends(Projector, _super);
-    function Projector() {
-        _super.apply(this, arguments);
-    }
-    Projector.prototype.ready = function () {
-        this.dom = d3.select(this);
-        logging.setDomContainer(this);
-        this.analyticsLogger =
-            new analyticsLogger_1.AnalyticsLogger(this.pageViewLogging, this.eventLogging);
-        this.analyticsLogger.logPageView('embeddings');
-        if (!util.hasWebGLSupport()) {
-            this.analyticsLogger.logWebGLDisabled();
-            logging.setErrorMessage('Your browser or device does not have WebGL enabled. Please enable ' +
-                'hardware acceleration, or use a browser that supports WebGL.');
-            return;
-        }
-        this.selectionChangedListeners = [];
-        this.hoverListeners = [];
-        this.projectionChangedListeners = [];
-        this.distanceMetricChangedListeners = [];
-        this.selectedPointIndices = [];
-        this.neighborsOfFirstPoint = [];
-        this.dataPanel = this.$['data-panel'];
-        this.inspectorPanel = this.$['inspector-panel'];
-        this.inspectorPanel.initialize(this, this);
-        this.projectionsPanel = this.$['projections-panel'];
-        this.projectionsPanel.initialize(this);
-        this.bookmarkPanel = this.$['bookmark-panel'];
-        this.bookmarkPanel.initialize(this, this);
-        this.metadataCard = this.$['metadata-card'];
-        this.statusBar = this.dom.select('#status-bar');
-        this.scopeSubtree(this.$$('#notification-dialog'), true);
-        this.setupUIControls();
-        this.initializeDataProvider();
-    };
-    Projector.prototype.setSelectedLabelOption = function (labelOption) {
-        this.selectedLabelOption = labelOption;
-        this.metadataCard.setLabelOption(this.selectedLabelOption);
-        this.projectorScatterPlotAdapter.setLabelPointAccessor(labelOption);
-        this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-        this.projectorScatterPlotAdapter.render();
-    };
-    Projector.prototype.setSelectedColorOption = function (colorOption) {
-        this.selectedColorOption = colorOption;
-        this.projectorScatterPlotAdapter.setLegendPointColorer(this.getLegendPointColorer(colorOption));
-        this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-        this.projectorScatterPlotAdapter.render();
-    };
-    Projector.prototype.setNormalizeData = function (normalizeData) {
-        this.normalizeData = normalizeData;
-        this.setCurrentDataSet(this.originalDataSet.getSubset());
-    };
-    Projector.prototype.updateDataSet = function (ds, spriteAndMetadata, metadataFile) {
-        this.dataSetFilterIndices = null;
-        this.originalDataSet = ds;
-        if (ds != null) {
-            this.normalizeData =
-                this.originalDataSet.dim[1] >= THRESHOLD_DIM_NORMALIZE;
-            spriteAndMetadata = spriteAndMetadata || {};
-            if (spriteAndMetadata.pointsInfo == null) {
-                var _a = this.makeDefaultPointsInfoAndStats(ds.points), pointsInfo = _a[0], stats = _a[1];
-                spriteAndMetadata.pointsInfo = pointsInfo;
-                spriteAndMetadata.stats = stats;
-            }
-            var metadataMergeSucceeded = ds.mergeMetadata(spriteAndMetadata);
-            if (!metadataMergeSucceeded) {
-                return;
-            }
-        }
-        if (this.projectorScatterPlotAdapter != null) {
-            if (ds == null) {
-                this.projectorScatterPlotAdapter.setLabelPointAccessor(null);
-                this.setProjection(null);
-            }
-            else {
-                this.projectorScatterPlotAdapter.updateScatterPlotPositions();
-                this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-                this.projectorScatterPlotAdapter.resize();
-                this.projectorScatterPlotAdapter.render();
-            }
-        }
-        if (ds != null) {
-            this.dataPanel.setNormalizeData(this.normalizeData);
-            this.setCurrentDataSet(ds.getSubset());
-            this.projectorScatterPlotAdapter.setLabelPointAccessor(this.selectedLabelOption);
-            this.inspectorPanel.datasetChanged();
-            this.inspectorPanel.metadataChanged(spriteAndMetadata);
-            this.projectionsPanel.metadataChanged(spriteAndMetadata);
-            this.dataPanel.metadataChanged(spriteAndMetadata, metadataFile);
-            // Set the container to a fixed height, otherwise in Colab the
-            // height can grow indefinitely.
-            var container = this.dom.select('#container');
-            container.style('height', container.property('clientHeight') + 'px');
-        }
-        else {
-            this.setCurrentDataSet(null);
-        }
-    };
-    Projector.prototype.setSelectedTensor = function (run, tensorInfo) {
-        this.bookmarkPanel.setSelectedTensor(run, tensorInfo, this.dataProvider);
-    };
-    /**
-     * Registers a listener to be called any time the selected point set changes.
-     */
-    Projector.prototype.registerSelectionChangedListener = function (listener) {
-        this.selectionChangedListeners.push(listener);
-    };
-    Projector.prototype.filterDataset = function (pointIndices) {
-        var selectionSize = this.selectedPointIndices.length;
-        if (this.dataSetBeforeFilter == null) {
-            this.dataSetBeforeFilter = this.dataSet;
-        }
-        this.setCurrentDataSet(this.dataSet.getSubset(pointIndices));
-        this.dataSetFilterIndices = pointIndices;
-        this.projectorScatterPlotAdapter.updateScatterPlotPositions();
-        this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-        this.adjustSelectionAndHover(d3.range(selectionSize));
-    };
-    Projector.prototype.resetFilterDataset = function () {
-        var _this = this;
-        var originalPointIndices = this.selectedPointIndices.map(function (filteredIndex) { return _this.dataSet.points[filteredIndex].index; });
-        this.setCurrentDataSet(this.dataSetBeforeFilter);
-        if (this.projection != null) {
-            this.projection.dataSet = this.dataSetBeforeFilter;
-        }
-        this.dataSetBeforeFilter = null;
-        this.projectorScatterPlotAdapter.updateScatterPlotPositions();
-        this.projectorScatterPlotAdapter.updateScatterPlotAttributes();
-        this.dataSetFilterIndices = [];
-        this.adjustSelectionAndHover(originalPointIndices);
-    };
-    /**
-     * Used by clients to indicate that a selection has occurred.
-     */
-    Projector.prototype.notifySelectionChanged = function (newSelectedPointIndices) {
-        var _this = this;
-        this.selectedPointIndices = newSelectedPointIndices;
-        var neighbors = [];
-        if (newSelectedPointIndices.length === 1) {
-            neighbors = this.dataSet.findNeighbors(newSelectedPointIndices[0], this.inspectorPanel.distFunc, this.inspectorPanel.numNN);
-            this.metadataCard.updateMetadata(this.dataSet.points[newSelectedPointIndices[0]].metadata);
-        }
-        else {
-            this.metadataCard.updateMetadata(null);
-        }
-        this.selectionChangedListeners.forEach(function (l) { return l(_this.selectedPointIndices, neighbors); });
-    };
-    /**
-     * Registers a listener to be called any time the mouse hovers over a point.
-     */
-    Projector.prototype.registerHoverListener = function (listener) {
-        this.hoverListeners.push(listener);
-    };
-    /**
-     * Used by clients to indicate that a hover is occurring.
-     */
-    Projector.prototype.notifyHoverOverPoint = function (pointIndex) {
-        this.hoverListeners.forEach(function (l) { return l(pointIndex); });
-    };
-    Projector.prototype.registerProjectionChangedListener = function (listener) {
-        this.projectionChangedListeners.push(listener);
-    };
-    Projector.prototype.notifyProjectionChanged = function (projection) {
-        this.projectionChangedListeners.forEach(function (l) { return l(projection); });
-    };
-    Projector.prototype.registerDistanceMetricChangedListener = function (l) {
-        this.distanceMetricChangedListeners.push(l);
-    };
-    Projector.prototype.notifyDistanceMetricChanged = function (distMetric) {
-        this.distanceMetricChangedListeners.forEach(function (l) { return l(distMetric); });
-    };
-    Projector.prototype._dataProtoChanged = function (dataProtoString) {
-        var dataProto = dataProtoString ? JSON.parse(dataProtoString) : null;
-        this.initializeDataProvider(dataProto);
-    };
-    Projector.prototype.makeDefaultPointsInfoAndStats = function (points) {
-        var pointsInfo = [];
-        points.forEach(function (p) {
-            var pointInfo = {};
-            pointInfo[INDEX_METADATA_FIELD] = p.index;
-            pointsInfo.push(pointInfo);
-        });
-        var stats = [{
-                name: INDEX_METADATA_FIELD,
-                isNumeric: false,
-                tooManyUniqueValues: true,
-                min: 0,
-                max: pointsInfo.length - 1
-            }];
-        return [pointsInfo, stats];
-    };
-    Projector.prototype.initializeDataProvider = function (dataProto) {
-        if (this.servingMode === 'demo') {
-            var projectorConfigUrl = void 0;
-            // Only in demo mode do we allow the config being passed via URL.
-            var urlParams = util.getURLParams(window.location.search);
-            if ('config' in urlParams) {
-                projectorConfigUrl = urlParams['config'];
-            }
-            else {
-                projectorConfigUrl = this.projectorConfigJsonPath;
-            }
-            this.dataProvider = new data_provider_demo_1.DemoDataProvider(projectorConfigUrl);
-        }
-        else if (this.servingMode === 'server') {
-            if (!this.routePrefix) {
-                throw 'route-prefix is a required parameter';
-            }
-            this.dataProvider = new data_provider_server_1.ServerDataProvider(this.routePrefix);
-        }
-        else if (this.servingMode === 'proto' && dataProto != null) {
-            this.dataProvider = new data_provider_proto_1.ProtoDataProvider(dataProto);
-        }
-        this.dataPanel.initialize(this, this.dataProvider);
-    };
-    Projector.prototype.getLegendPointColorer = function (colorOption) {
-        var _this = this;
-        if ((colorOption == null) || (colorOption.map == null)) {
-            return null;
-        }
-        var colorer = function (ds, i) {
-            var value = ds.points[i].metadata[_this.selectedColorOption.name];
-            if (value == null) {
-                return POINT_COLOR_MISSING;
-            }
-            return colorOption.map(value);
-        };
-        return colorer;
-    };
-    Projector.prototype.get3DLabelModeButton = function () {
-        return this.querySelector('#labels3DMode');
-    };
-    Projector.prototype.get3DLabelMode = function () {
-        var label3DModeButton = this.get3DLabelModeButton();
-        return label3DModeButton.active;
-    };
-    Projector.prototype.adjustSelectionAndHover = function (selectedPointIndices, hoverIndex) {
-        this.notifySelectionChanged(selectedPointIndices);
-        this.notifyHoverOverPoint(hoverIndex);
-        this.setMouseMode(scatterPlot_1.MouseMode.CAMERA_AND_CLICK_SELECT);
-    };
-    Projector.prototype.setMouseMode = function (mouseMode) {
-        var selectModeButton = this.querySelector('#selectMode');
-        selectModeButton.active = (mouseMode === scatterPlot_1.MouseMode.AREA_SELECT);
-        this.projectorScatterPlotAdapter.scatterPlot.setMouseMode(mouseMode);
-    };
-    Projector.prototype.setCurrentDataSet = function (ds) {
-        this.adjustSelectionAndHover([]);
-        if (this.dataSet != null) {
-            this.dataSet.stopTSNE();
-        }
-        if ((ds != null) && this.normalizeData) {
-            ds.normalize();
-        }
-        this.dim = (ds == null) ? 0 : ds.dim[1];
-        this.dom.select('span.numDataPoints').text((ds == null) ? '0' : ds.dim[0]);
-        this.dom.select('span.dim').text((ds == null) ? '0' : ds.dim[1]);
-        this.dataSet = ds;
-        this.projectionsPanel.dataSetUpdated(this.dataSet, this.originalDataSet, this.dim);
-        this.projectorScatterPlotAdapter.setDataSet(this.dataSet);
-        this.projectorScatterPlotAdapter.scatterPlot
-            .setCameraParametersForNextCameraCreation(null, true);
-    };
-    Projector.prototype.setupUIControls = function () {
-        var _this = this;
-        // View controls
-        this.querySelector('#reset-zoom').addEventListener('click', function () {
-            _this.projectorScatterPlotAdapter.scatterPlot.resetZoom();
-            _this.projectorScatterPlotAdapter.scatterPlot.startOrbitAnimation();
-        });
-        var selectModeButton = this.querySelector('#selectMode');
-        selectModeButton.addEventListener('click', function (event) {
-            _this.setMouseMode(selectModeButton.active ? scatterPlot_1.MouseMode.AREA_SELECT :
-                scatterPlot_1.MouseMode.CAMERA_AND_CLICK_SELECT);
-        });
-        var nightModeButton = this.querySelector('#nightDayMode');
-        nightModeButton.addEventListener('click', function () {
-            _this.projectorScatterPlotAdapter.scatterPlot.setDayNightMode(nightModeButton.active);
-        });
-        var labels3DModeButton = this.get3DLabelModeButton();
-        labels3DModeButton.addEventListener('click', function () {
-            _this.projectorScatterPlotAdapter.set3DLabelMode(_this.get3DLabelMode());
-        });
-        window.addEventListener('resize', function () {
-            var container = _this.dom.select('#container');
-            var parentHeight = container.node().parentNode.clientHeight;
-            container.style('height', parentHeight + 'px');
-            _this.projectorScatterPlotAdapter.resize();
-        });
-        {
-            this.projectorScatterPlotAdapter = new projectorScatterPlotAdapter_1.ProjectorScatterPlotAdapter(this.getScatterContainer(), this);
-            this.projectorScatterPlotAdapter.setLabelPointAccessor(this.selectedLabelOption);
-        }
-        this.projectorScatterPlotAdapter.scatterPlot.onCameraMove(function (cameraPosition, cameraTarget) {
-            return _this.bookmarkPanel.clearStateSelection();
-        });
-        this.registerHoverListener(function (hoverIndex) { return _this.onHover(hoverIndex); });
-        this.registerSelectionChangedListener(function (selectedPointIndices, neighborsOfFirstPoint) {
-            return _this.onSelectionChanged(selectedPointIndices, neighborsOfFirstPoint);
-        });
-    };
-    Projector.prototype.onHover = function (hoverIndex) {
-        this.hoverPointIndex = hoverIndex;
-        var hoverText = null;
-        if (hoverIndex != null) {
-            var point = this.dataSet.points[hoverIndex];
-            if (point.metadata[this.selectedLabelOption]) {
-                hoverText = point.metadata[this.selectedLabelOption].toString();
-            }
-        }
-        if (this.selectedPointIndices.length === 0) {
-            this.statusBar.style('display', hoverText ? null : 'none');
-            this.statusBar.text(hoverText);
-        }
-    };
-    Projector.prototype.getScatterContainer = function () {
-        return this.dom.select('#scatter');
-    };
-    Projector.prototype.onSelectionChanged = function (selectedPointIndices, neighborsOfFirstPoint) {
-        this.selectedPointIndices = selectedPointIndices;
-        this.neighborsOfFirstPoint = neighborsOfFirstPoint;
-        var totalNumPoints = this.selectedPointIndices.length + neighborsOfFirstPoint.length;
-        this.statusBar.text("Selected " + totalNumPoints + " points")
-            .style('display', totalNumPoints > 0 ? null : 'none');
-    };
-    Projector.prototype.setProjection = function (projection) {
-        this.projection = projection;
-        if (projection != null) {
-            this.analyticsLogger.logProjectionChanged(projection.projectionType);
-        }
-        this.notifyProjectionChanged(projection);
-    };
-    Projector.prototype.notifyProjectionPositionsUpdated = function () {
-        this.projectorScatterPlotAdapter.notifyProjectionPositionsUpdated();
-    };
-    /**
-     * Gets the current view of the embedding and saves it as a State object.
-     */
-    Projector.prototype.getCurrentState = function () {
-        var state = new data_1.State();
-        // Save the individual datapoint projections.
-        state.projections = [];
-        for (var i = 0; i < this.dataSet.points.length; i++) {
-            var point = this.dataSet.points[i];
-            var projections = {};
-            var keys = Object.keys(point.projections);
-            for (var j = 0; j < keys.length; ++j) {
-                projections[keys[j]] = point.projections[keys[j]];
-            }
-            state.projections.push(projections);
-        }
-        state.selectedProjection = this.projection.projectionType;
-        state.dataSetDimensions = this.dataSet.dim;
-        state.tSNEIteration = this.dataSet.tSNEIteration;
-        state.selectedPoints = this.selectedPointIndices;
-        state.filteredPoints = this.dataSetFilterIndices;
-        this.projectorScatterPlotAdapter.populateBookmarkFromUI(state);
-        state.selectedColorOptionName = this.dataPanel.selectedColorOptionName;
-        state.forceCategoricalColoring = this.dataPanel.forceCategoricalColoring;
-        state.selectedLabelOption = this.selectedLabelOption;
-        this.projectionsPanel.populateBookmarkFromUI(state);
-        return state;
-    };
-    /** Loads a State object into the world. */
-    Projector.prototype.loadState = function (state) {
-        this.setProjection(null);
-        {
-            this.projectionsPanel.disablePolymerChangesTriggerReprojection();
-            if (this.dataSetBeforeFilter != null) {
-                this.resetFilterDataset();
-            }
-            if (state.filteredPoints != null) {
-                this.filterDataset(state.filteredPoints);
-            }
-            this.projectionsPanel.enablePolymerChangesTriggerReprojection();
-        }
-        for (var i = 0; i < state.projections.length; i++) {
-            var point = this.dataSet.points[i];
-            var projection = state.projections[i];
-            var keys = Object.keys(projection);
-            for (var j = 0; j < keys.length; ++j) {
-                point.projections[keys[j]] = projection[keys[j]];
-            }
-        }
-        this.dataSet.hasTSNERun = (state.selectedProjection === 'tsne');
-        this.dataSet.tSNEIteration = state.tSNEIteration;
-        this.projectionsPanel.restoreUIFromBookmark(state);
-        this.inspectorPanel.restoreUIFromBookmark(state);
-        this.dataPanel.selectedColorOptionName = state.selectedColorOptionName;
-        this.dataPanel.setForceCategoricalColoring(!!state.forceCategoricalColoring);
-        this.selectedLabelOption = state.selectedLabelOption;
-        this.projectorScatterPlotAdapter.restoreUIFromBookmark(state);
-        {
-            var dimensions = data_1.stateGetAccessorDimensions(state);
-            var components = data.getProjectionComponents(state.selectedProjection, dimensions);
-            var projection = new data_1.Projection(state.selectedProjection, components, dimensions.length, this.dataSet);
-            this.setProjection(projection);
-        }
-        this.notifySelectionChanged(state.selectedPoints);
-    };
-    return Projector;
-}(exports.ProjectorPolymer));
-exports.Projector = Projector;
-document.registerElement(Projector.prototype.is, Projector);
-
-},{"./analyticsLogger":1,"./data":7,"./data-provider-demo":3,"./data-provider-proto":4,"./data-provider-server":5,"./logging":12,"./projectorScatterPlotAdapter":14,"./scatterPlot":16,"./util":24,"./vz-projector-util":33}],35:[function(require,module,exports){
-arguments[4][8][0].apply(exports,arguments)
-},{"dup":8}]},{},[35,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34]);
-</script>
-</dom-module>
-</body></html>
\ No newline at end of file
diff --git a/tensorflow/tensorboard/gulp_tasks/compile.js b/tensorflow/tensorboard/gulp_tasks/compile.js
deleted file mode 100644
index 3d0d725cfb2b7177cfcdfc3b2418a19994ea89cc..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/gulp_tasks/compile.js
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-var gulp = require('gulp');
-var ts = require('gulp-typescript');
-var typescript = require('typescript');
-var gutil = require('gulp-util');
-var filter = require('gulp-filter');
-var merge = require('merge2');
-var browserify = require('browserify');
-var tsify = require('tsify');
-var source = require('vinyl-source-stream');
-var glob = require('glob').sync;
-var concat = require('gulp-concat');
-
-var tsProject = ts.createProject('./tsconfig.json', {
-  typescript: typescript,
-  noExternalResolve: true, // opt-in for faster compilation!
-});
-
-/** List of components (and their external deps) that are using es6 modules. */
-var ES6_COMPONENTS = [{
-  name: 'vz_projector',
-  deps: [
-    'd3/d3.min.js', 'weblas/dist/weblas.js', 'three.js/build/three.min.js',
-    'three.js/examples/js/controls/OrbitControls.js',
-    'numericjs/lib/numeric-1.2.6.js'
-  ]
-}];
-
-module.exports = function(includeDeps) {
-  return function() {
-    // Compile all components that are using ES6 modules into a bundle.js
-    // using browserify.
-    var entries = ['typings/index.d.ts'];
-    var deps = {};
-    ES6_COMPONENTS.forEach(function(component) {
-      // Collect all the typescript files across the components.
-      entries = entries.concat(glob(
-          'components/' + component.name + '/**/*.ts',
-          // Do not include tests or IDE-purposed files.
-          {ignore: ['**/*_test.ts', '**/deps.d.ts']}));
-      // Collect the unique external deps across all components using es6
-      // modules.
-      component.deps.forEach(function(dep) {
-        deps['components/' + dep] = true;
-      });
-    });
-    deps = Object.keys(deps);
-
-    // Compile, bundle all the typescript files and prepend their deps.
-    browserify(entries)
-        .plugin(tsify)
-        .bundle()
-        .on('error', function(error) { console.error(error.toString()); })
-        .pipe(source('bundle.js'))
-        .pipe(gulp.dest('components'))
-        .on('end', function() {
-          // Typescript was compiled and bundled. Now we need to prepend
-          // the external dependencies.
-          if (includeDeps) {
-            gulp.src(deps.concat(['components/bundle.js']))
-                .pipe(concat('bundle.js'))
-                .pipe(gulp.dest('components'));
-          }
-        });
-
-    // Compile components that are using global namespaces producing 1 js file
-    // for each ts file.
-    var isComponent = filter([
-      'components/tf_*/**/*.ts', 'components/vz_*/**/*.ts', 'typings/**/*.ts',
-      'components/plottable/plottable.d.ts'
-      // Ignore components that use es6 modules.
-    ].concat(ES6_COMPONENTS.map(function(component) {
-      return '!components/' + component.name + '/**/*.ts';
-    })));
-
-    return tsProject.src()
-        .pipe(isComponent)
-        .pipe(ts(tsProject))
-        .js.pipe(gulp.dest('.'));
-  };
-};
diff --git a/tensorflow/tensorboard/gulp_tasks/util.js b/tensorflow/tensorboard/gulp_tasks/util.js
deleted file mode 100644
index 7a1d2a58ab67b7ba1787e4c653f34a52318a8d0c..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/gulp_tasks/util.js
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-var fs = require('fs');
-var path = require('path');
-
-/**
- * Returns a list of web components inside the components directory for which
- * the name predicate is true.
- */
-exports.getComponents = function(namePredicate) {
-  return fs.readdirSync('components')
-      .filter(function(file) {
-        return fs.statSync(path.join('components', file)).isDirectory() &&
-            namePredicate(file);
-      })
-      .map(function(dir) { return '/' + dir + '/'; });
-};
-
-/**
- * Returns a list of tensorboard web components that are inside the components
- * directory.
- */
-exports.tbComponents = exports.getComponents(function(name) {
-  var prefix = name.slice(0, 3);
-  return prefix == 'tf_' || prefix == 'vz_';
-});
diff --git a/tensorflow/tensorboard/gulp_tasks/vulcanize.js b/tensorflow/tensorboard/gulp_tasks/vulcanize.js
deleted file mode 100644
index 89700e1d4cc8f06c034dd10dc3794e152d703369..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/gulp_tasks/vulcanize.js
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-var gulp = require('gulp');
-var path = require('path');
-var util = require('./util');
-var vulcanize = require('gulp-vulcanize');
-var replace = require('gulp-replace');
-var rename = require('gulp-rename');
-var header = require('gulp-header');
-
-var HEADER_STR = '<!-- Copyright 2015 The TensorFlow Authors. All Rights Reserved.\n\
-\n\
-Licensed under the Apache License, Version 2.0 (the "License");\n\
-you may not use this file except in compliance with the License.\n\
-You may obtain a copy of the License at\n\
-\n\
-   http://www.apache.org/licenses/LICENSE-2.0\n\
-\n\
-Unless required by applicable law or agreed to in writing, software\n\
-distributed under the License is distributed on an "AS IS" BASIS,\n\
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\
-See the License for the specific language governing permissions and\n\
-limitations under the License.\n\
-============================================================================\n\
-\n\
-This file is generated by `gulp` & `vulcanize`. Do not directly change it.\n\
-Instead, use `gulp regenerate` to create a new version with your changes.\n\
--->\n\n'
-
-var base = path.join(__dirname, '../components');
-// List of redirects of the form path1|path2 for every tensorboard component
-// in order to replace dashes with underscores.
-// E.g. .../tf-tensorboard|.../tf_tensorboard
-var redirects = util.tbComponents.map(function(dir) {
-  return path.join(base, dir.replace(/_/g, '-')) + '|' + path.join(base, dir);
-});
-
-var nonTBComponents = util.getComponents(function(name) {
-  var prefix = name.slice(0, 3);
-  return prefix !== 'tf_'  && prefix !== 'vz_';
-});
-
-// These manual additions are necessary. The task should not inline these
-// third-party javascript files. However, vulcanization still needs the HTML
-// files found within those directories. Upon adding new third-party javascript,
-// consider updating this list.
-nonTBComponents.push('/tf-imports/d3.js');
-nonTBComponents.push('/tf-imports/dagre.js');
-nonTBComponents.push('/tf-imports/graphlib.js');
-nonTBComponents.push('/tf-imports/lodash.js');
-nonTBComponents.push('/tf-imports/plottable.js');
-
-module.exports = function(overwrite) {
-  return function() {
-    var suffix = overwrite ? '' : '.OPENSOURCE';
-    // Vulcanize TensorBoard without external libraries.
-    gulp.src('components/tf_tensorboard/tf-tensorboard.html')
-        .pipe(vulcanize({
-          inlineScripts: true,
-          inlineCss: true,
-          stripComments: true,
-          excludes: nonTBComponents,
-          redirects: redirects
-        }))
-        .pipe(header(HEADER_STR))
-        .pipe(rename('tf-tensorboard.html' + suffix))
-        .pipe(gulp.dest('./dist'));
-  }
-}
diff --git a/tensorflow/tensorboard/gulpfile.js b/tensorflow/tensorboard/gulpfile.js
deleted file mode 100644
index 257ee0ab83d7abe5dbc696b91f2b6d5d536f4c5a..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/gulpfile.js
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-var gulp = require('gulp');
-var server = require('gulp-server-livereload');
-var minimist = require('minimist');
-var util = require('./gulp_tasks/util');
-
-var options = minimist(process.argv.slice(2), {
-  default: {
-    p: 8000,  // port for gulp server
-    h: '0.0.0.0', // host to serve on
-  }
-});
-
-function getTask(task) {
-    return require('./gulp_tasks/' + task);
-}
-
-
-gulp.task('compile', getTask('compile')(true));
-gulp.task('first-compile', getTask('compile')(true));
-gulp.task('compile-without-deps', getTask('compile')(false));
-gulp.task('test.onlytest', getTask('test'));
-gulp.task('test', ['compile'], getTask('test'));
-
-gulp.task('watch', [], function() {
-  // Avoid watching generated .d.ts in the build (aka output) directory.
-  return gulp.watch(
-      ['components/tf_*/**/*.ts', 'components/vz_*/**/*.ts'],
-      {ignoreInitial: true}, ['compile']);
-});
-
-var httpPrefix = 'http://' + options.h + ':' + options.p + '/components';
-var proxies = util.tbComponents.map(function(component) {
-  return {
-    source: '/components' + component.replace(/_/g, '-'),
-    target: httpPrefix + component
-  };
-});
-
-// Do first-compile before turning on server, to avoid spamming
-// livereload info
-// TODO(danmane): Disconnect this once we can get livereload to
-// no longer spam.
-gulp.task('server', ['first-compile'], function() {
-  gulp.src('.').pipe(server({
-    host: options.h,
-    port: options.p,
-    livereload: {
-      enable: true,
-      // Don't livereload on .ts changes, since they aren't loaded by browser.
-      filter: function(filePath, cb) { cb(!(/\.ts$/.test(filePath))); },
-      port: 27729 + options.p
-    },
-    proxies: proxies,
-    directoryListing: true,
-  }));
-});
-
-// TODO(danmane): When testing is nicer, integrate into vulcanize task
-// gulp vulcanize: Regenerate the tf-tensorboard.html.OPENSOURCE file for pre-release
-gulp.task(
-    'vulcanize', ['compile-without-deps'],
-    getTask('vulcanize')(false));
-// gulp regenerate: Regenerate the tf-tensorboard.html for interactive bazel development
-gulp.task(
-    'regenerate', ['compile-without-deps'],
-    getTask('vulcanize')(true));
-
-// TODO(danmane): consider making bower install part of default task
-gulp.task('default', ['watch', 'server']);
-
-// Clean all compiled JS files.
-var cleanCompiledTypeScript = require('gulp-clean-compiled-typescript');
-gulp.task('clean', function () {
-  return gulp.src(['./components/**/*.ts', '!./components/**/deps.d.ts'])
-      .pipe(cleanCompiledTypeScript());
-});
diff --git a/tensorflow/tensorboard/hacks.bzl b/tensorflow/tensorboard/hacks.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..f1d4be790612ac912dc1b1a2298f8bc8dd99dee6
--- /dev/null
+++ b/tensorflow/tensorboard/hacks.bzl
@@ -0,0 +1,80 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO(jart): Merge this file into defs.bzl once that file is sync unified.
+
+def tensorboard_typescript_bundle(
+    name,
+    out,
+    namespace_srcs,
+    namespace_symbol_aliases={},
+    namespace_symbol_aliases_public={},
+    **kwargs):
+  """Rolls TypeScript ES6 modules into one vanilla source file without imports.
+
+  This is a genrule wrapper that concatenates TypeScripts sources inside
+  namespace blocks while removing ^import lines. Because the sources themselves
+  are not parsed, the structure of the modules must be passed to this macro as
+  a Skylark data structure.
+
+  Args:
+    name: Name of this build rule target.
+    out: Path of outputted TypeScript source file.
+    namespace_srcs: Multimap of namespace strings to build file targets. The
+        ordering of the dictionary and nested lists does not matter when
+        generating a typings file, but *does* matter when generating a source
+        file.
+    namespace_symbol_aliases: Map of namespace strings where each value is a
+        map of symbol names to fully qualified symbol names.
+    namespace_symbol_aliases_public: Same as namespace_symbol_aliases but the
+        symbol will be visible to other namespaces.
+  """
+  cmd = ["(", "echo // GENERATED BY TENSORBOARD_TYPESCRIPT_BUNDLE"]
+  inputs = set()
+  for namespace, srcs in namespace_srcs.items():
+    cmd.append("echo")
+    if out[-5:] == ".d.ts":
+      cmd.append("echo 'declare namespace %s {'" % namespace)
+    elif out[-3:] == ".ts":
+      cmd.append("echo 'module %s {'" % namespace)
+    else:
+      fail("'out' must end with .ts or .d.ts: " + out)
+    for symbol, canon in namespace_symbol_aliases.get(namespace, {}).items():
+      cmd.append("echo 'import %s = %s;'" % (symbol, canon))
+    for symbol, canon in namespace_symbol_aliases_public.get(namespace,
+                                                             {}).items():
+      cmd.append("echo 'export import %s = %s;'" % (symbol, canon))
+    inputs += srcs
+    for src in srcs:
+      cmd.append("for f in $(locations %s); do" % src)
+      cmd.append("  echo")
+      cmd.append("  echo /////////////////////////////////////////////////////")
+      cmd.append("  echo // " + namespace)
+      cmd.append("  echo // $$f")
+      cmd.append("  echo /////////////////////////////////////////////////////")
+      cmd.append("  echo")
+      cmd.append("  sed 's!^import !// import !' $$f \\")
+      cmd.append("    | sed 's!^export declare !export !' \\")
+      cmd.append("    | sed '/^export .* from /d' \\")
+      cmd.append("    | sed '/^export {.*};$$/d'")
+      cmd.append("done")
+    cmd.append("echo '}'")
+  cmd.append(") >$@")
+  native.genrule(
+      name = name,
+      srcs = list(inputs),
+      outs = [out],
+      cmd = "\n".join(cmd),
+      **kwargs
+  )
diff --git a/tensorflow/tensorboard/http_api.md b/tensorflow/tensorboard/http_api.md
index 16c2f95ae1caa58c6cc3ca2f0a5f1a5b90c692c7..00aeb6353e2e99eecd1cba73bd967d1db8ffae44 100644
--- a/tensorflow/tensorboard/http_api.md
+++ b/tensorflow/tensorboard/http_api.md
@@ -36,6 +36,13 @@ Returns a JSON object with a key "logdir" that maps to the `logdir` argument
 
 The `logdir` argument is the path of the directory that contains events files.
 
+## `data/plugins_listing`
+
+Returns a dict mapping from plugin name to a boolean indicating whether the
+plugin is active. A plugin might be inactive, for instance, if it lacks relevant
+data. Every plugin has a key. This route helps the frontend avoid issuing
+requests to an inactive plugin - the routes of an inactive plugin do not work.
+
 ## `data/runs`
 
 Returns a dictionary mapping from `run name` (quoted string) to dictionaries
diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de57d7ce2e4900caa0a8c9db7d43ab105cac2250
--- /dev/null
+++ b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+java_binary(
+    name = "Vulcanize",
+    srcs = ["Vulcanize.java"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@com_google_guava",
+        "@com_google_protobuf_java",
+        "@io_bazel_rules_closure//closure/compiler",
+        "@io_bazel_rules_closure//java/io/bazel/rules/closure:webpath",
+        "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles:build_info_java_proto",
+        "@io_bazel_rules_closure//java/org/jsoup/nodes",
+        "@org_jsoup",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(["**"]),
+    tags = ["notsan"],
+)
diff --git a/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java
new file mode 100644
index 0000000000000000000000000000000000000000..e572415856cd7151d04aa2cbd1b8c49678782acd
--- /dev/null
+++ b/tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize/Vulcanize.java
@@ -0,0 +1,317 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.tensorflow.tensorboard.vulcanize;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static com.google.common.base.Verify.verifyNotNull;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import com.google.javascript.jscomp.BasicErrorManager;
+import com.google.javascript.jscomp.CheckLevel;
+import com.google.javascript.jscomp.Compiler;
+import com.google.javascript.jscomp.CompilerOptions;
+import com.google.javascript.jscomp.CompilerOptions.LanguageMode;
+import com.google.javascript.jscomp.CompilerOptions.Reach;
+import com.google.javascript.jscomp.JSError;
+import com.google.javascript.jscomp.PropertyRenamingPolicy;
+import com.google.javascript.jscomp.SourceFile;
+import com.google.javascript.jscomp.VariableRenamingPolicy;
+import com.google.protobuf.TextFormat;
+import io.bazel.rules.closure.Webpath;
+import io.bazel.rules.closure.webfiles.BuildInfo.Webfiles;
+import io.bazel.rules.closure.webfiles.BuildInfo.WebfilesSource;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Comment;
+import org.jsoup.nodes.DataNode;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Html5Printer;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.Tag;
+
+/** Simple one-off solution for TensorBoard vulcanization. */
+public final class Vulcanize {
+
+  private static final Parser parser = Parser.htmlParser();
+  private static final Map<Webpath, Path> webfiles = new HashMap<>();
+  private static final Set<Webpath> alreadyInlined = new HashSet<>();
+  private static final Set<String> legalese = new HashSet<>();
+  private static final List<String> licenses = new ArrayList<>();
+  private static final List<Webpath> stack = new ArrayList<>();
+  private static Webpath outputPath;
+  private static Node licenseComment;
+  private static boolean nominify;
+
+  public static void main(String[] args) throws IOException {
+    Webpath inputPath = Webpath.get(args[0]);
+    outputPath = Webpath.get(args[1]);
+    Path output = Paths.get(args[2]);
+    for (int i = 3; i < args.length; i++) {
+      Webfiles manifest = loadWebfilesPbtxt(Paths.get(args[i]));
+      for (WebfilesSource src : manifest.getSrcList()) {
+        webfiles.put(Webpath.get(src.getWebpath()), Paths.get(src.getPath()));
+      }
+    }
+    stack.add(inputPath);
+    Document document = parse(Files.readAllBytes(webfiles.get(inputPath)));
+    transform(document);
+    if (licenseComment != null) {
+      licenseComment.attr("comment", String.format("\n%s\n", Joiner.on("\n\n").join(licenses)));
+    }
+    Files.write(
+        output,
+        Html5Printer.stringify(document).getBytes(UTF_8),
+        StandardOpenOption.WRITE,
+        StandardOpenOption.CREATE,
+        StandardOpenOption.TRUNCATE_EXISTING);
+  }
+
+  private static void transform(Node root) throws IOException {
+    Node node = checkNotNull(root);
+    Node newNode;
+    while (true) {
+      newNode = enterNode(node);
+      if (node.equals(root)) {
+        root = newNode;
+      }
+      node = newNode;
+      if (node.childNodeSize() > 0) {
+        node = node.childNode(0);
+      } else {
+        while (true) {
+          newNode = leaveNode(node);
+          if (node.equals(root)) {
+            root = newNode;
+          }
+          node = newNode;
+          if (node.equals(root)) {
+            return;
+          }
+          Node next = node.nextSibling();
+          if (next == null) {
+            if (node.parentNode() == null) {
+              return;
+            }
+            node = verifyNotNull(node.parentNode(), "unexpected root: %s", node);
+          } else {
+            node = next;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  private static Node enterNode(Node node) throws IOException {
+    Node newNode = node;
+    if (node instanceof Element) {
+      if (node.nodeName().equals("link") && node.attr("rel").equals("import")) {
+        // Inline HTML.
+        Webpath href = me().lookup(Webpath.get(node.attr("href")));
+        if (alreadyInlined.add(href)) {
+          newNode =
+              parse(Files.readAllBytes(checkNotNull(webfiles.get(href), "%s in %s", href, me())));
+          stack.add(href);
+          node.replaceWith(newNode);
+        } else {
+          newNode = new TextNode("", node.baseUri());
+          node.replaceWith(newNode);
+        }
+      } else if (node.nodeName().equals("script")) {
+        nominify = node.hasAttr("nominify");
+        node.removeAttr("nominify");
+        Webpath src;
+        String script;
+        if (node.attr("src").isEmpty()) {
+          // Minify JavaScript.
+          StringBuilder sb = new StringBuilder();
+          for (Node child : node.childNodes()) {
+            if (child instanceof DataNode) {
+              sb.append(((DataNode) child).getWholeData());
+            }
+          }
+          src = me();
+          script = sb.toString();
+        } else {
+          // Inline JavaScript.
+          src = me().lookup(Webpath.get(node.attr("src")));
+          Path other = webfiles.get(src);
+          if (other != null) {
+            script = new String(Files.readAllBytes(other), UTF_8);
+            node.removeAttr("src");
+          } else {
+            src = me();
+            script = "";
+          }
+        }
+        script = minify(src, script);
+        newNode =
+            new Element(Tag.valueOf("script"), node.baseUri(), node.attributes())
+                .appendChild(new DataNode(script, node.baseUri()));
+        node.replaceWith(newNode);
+      } else if (node.nodeName().equals("link")
+          && node.attr("rel").equals("stylesheet")
+          && !node.attr("href").isEmpty()) {
+        // Inline CSS.
+        Webpath href = me().lookup(Webpath.get(node.attr("href")));
+        Path other = webfiles.get(href);
+        if (other != null) {
+          newNode =
+              new Element(Tag.valueOf("style"), node.baseUri(), node.attributes())
+                  .appendChild(
+                      new DataNode(new String(Files.readAllBytes(other), UTF_8), node.baseUri()));
+          newNode.removeAttr("rel");
+          newNode.removeAttr("href");
+          node.replaceWith(newNode);
+        }
+      }
+      rootifyAttribute(newNode, "href");
+      rootifyAttribute(newNode, "src");
+      rootifyAttribute(newNode, "action");
+      rootifyAttribute(newNode, "assetpath");
+    } else if (node instanceof Comment) {
+      String text = ((Comment) node).getData();
+      if (text.contains("@license")) {
+        handleLicense(text);
+        if (licenseComment == null) {
+          licenseComment = node;
+        } else {
+          newNode = new TextNode("", node.baseUri());
+          node.replaceWith(newNode);
+        }
+      } else {
+        newNode = new TextNode("", node.baseUri());
+        node.replaceWith(newNode);
+      }
+    }
+    return newNode;
+  }
+
+  private static String minify(Webpath src, String script) {
+    if (nominify) {
+      return script;
+    }
+    Compiler compiler = new Compiler(new JsPrintlessErrorManager());
+    CompilerOptions options = new CompilerOptions();
+    options.skipAllCompilerPasses(); // too lazy to get externs
+    options.setLanguageIn(LanguageMode.ECMASCRIPT_2016);
+    options.setLanguageOut(LanguageMode.ECMASCRIPT5);
+    options.setContinueAfterErrors(true);
+    options.setManageClosureDependencies(false);
+    options.setRenamingPolicy(VariableRenamingPolicy.LOCAL, PropertyRenamingPolicy.OFF);
+    options.setShadowVariables(true);
+    options.setInlineVariables(Reach.LOCAL_ONLY);
+    options.setFlowSensitiveInlineVariables(true);
+    options.setInlineFunctions(Reach.LOCAL_ONLY);
+    options.setAssumeClosuresOnlyCaptureReferences(false);
+    options.setCheckGlobalThisLevel(CheckLevel.OFF);
+    options.setFoldConstants(true);
+    options.setCoalesceVariableNames(true);
+    options.setDeadAssignmentElimination(true);
+    options.setCollapseVariableDeclarations(true);
+    options.setConvertToDottedProperties(true);
+    options.setLabelRenaming(true);
+    options.setRemoveDeadCode(true);
+    options.setOptimizeArgumentsArray(true);
+    options.setRemoveUnusedVariables(Reach.LOCAL_ONLY);
+    options.setCollapseObjectLiterals(true);
+    options.setProtectHiddenSideEffects(true);
+    //options.setPrettyPrint(true);
+    compiler.disableThreads();
+    compiler.compile(
+        ImmutableList.<SourceFile>of(),
+        ImmutableList.of(SourceFile.fromCode(src.toString(), script)),
+        options);
+    return compiler.toSource();
+  }
+
+  private static void handleLicense(String text) {
+    if (legalese.add(CharMatcher.whitespace().removeFrom(text))) {
+      licenses.add(CharMatcher.anyOf("\r\n").trimFrom(text));
+    }
+  }
+
+  private static Node leaveNode(Node node) {
+    if (node instanceof Document) {
+      stack.remove(stack.size() - 1);
+    }
+    return node;
+  }
+
+  private static Webpath me() {
+    return Iterables.getLast(stack);
+  }
+
+  private static void rootifyAttribute(Node node, String attribute) {
+    String value = node.attr(attribute);
+    if (value.isEmpty()) {
+      return;
+    }
+    Webpath uri = Webpath.get(value);
+    if (webfiles.containsKey(uri)) {
+      node.attr(attribute, outputPath.getParent().relativize(uri).toString());
+    }
+  }
+
+  private static Document parse(byte[] bytes) {
+    return parse(new ByteArrayInputStream(bytes));
+  }
+
+  private static Document parse(InputStream input) {
+    Document document;
+    try {
+      document = Jsoup.parse(input, null, "", parser);
+    } catch (IOException e) {
+      throw new AssertionError("I/O error when parsing byte array D:", e);
+    }
+    document.outputSettings().indentAmount(0);
+    document.outputSettings().prettyPrint(false);
+    return document;
+  }
+
+  private static Webfiles loadWebfilesPbtxt(Path path) throws IOException {
+    Webfiles.Builder build = Webfiles.newBuilder();
+    TextFormat.getParser().merge(new String(Files.readAllBytes(path), UTF_8), build);
+    return build.build();
+  }
+
+  private static final class JsPrintlessErrorManager extends BasicErrorManager {
+
+    @Override
+    public void println(CheckLevel level, JSError error) {}
+
+    @Override
+    public void printSummary() {}
+  }
+}
diff --git a/tensorflow/tensorboard/lib/BUILD b/tensorflow/tensorboard/lib/BUILD
deleted file mode 100644
index 9c497396c68bb68c97b71bba60ff56ec44639741..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/lib/BUILD
+++ /dev/null
@@ -1,22 +0,0 @@
-# Description:
-# BUILD rules for the static resources in TensorBoard.
-
-package(default_visibility = [
-    "//tensorflow:internal",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files([
-    "LICENSE",
-])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        [
-            "**/*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tensorboard/package.json b/tensorflow/tensorboard/package.json
deleted file mode 100644
index ca6a9e89ce556e3b74d1adca6b4e5810381c0082..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/package.json
+++ /dev/null
@@ -1,39 +0,0 @@
-{
-  "name": "tensorflow-vis",
-  "version": "0.0.0",
-  "description": "Visualizers for TensorFlow",
-  "scripts": {
-    "test": "gulp test",
-    "prepare": "npm install && bower install && typings install",
-    "compile": "gulp compile"
-  },
-  "keywords": [
-    "tensorflow"
-  ],
-  "author": "Google",
-  "license": "Apache-2.0",
-  "devDependencies": {
-    "browserify": "^13.1.0",
-    "gulp": "~3.9.0",
-    "gulp-bower": "0.0.13",
-    "gulp-clean-compiled-typescript": "~1.0.1",
-    "gulp-cli": "^1.1.0",
-    "gulp-concat": "^2.6.0",
-    "gulp-filter": "~3.0.1",
-    "gulp-header": "~1.7.1",
-    "gulp-rename": "~1.2.2",
-    "gulp-replace": "~0.5.4",
-    "gulp-server-livereload": "1.9.2",
-    "gulp-typescript": "~2.10.0",
-    "gulp-util": "~3.0.7",
-    "gulp-vulcanize": "~6.1.0",
-    "merge2": "~0.3.6",
-    "minimist": "~1.2.0",
-    "tsify": "^0.14.8",
-    "typescript": "2.1.5",
-    "typings": "1.4.0",
-    "vinyl-source-stream": "^1.1.0",
-    "vulcanize": "^1.14.0",
-    "web-component-tester": "4.2.2"
-  }
-}
diff --git a/tensorflow/tensorboard/plugins/base_plugin.py b/tensorflow/tensorboard/plugins/base_plugin.py
index 86cfeb6cc24b3ce02d8743c091fd97dfb17871d9..259046dfb4f681fd4ba07179e580f994d2231d98 100644
--- a/tensorflow/tensorboard/plugins/base_plugin.py
+++ b/tensorflow/tensorboard/plugins/base_plugin.py
@@ -30,6 +30,12 @@ class TBPlugin(object):
   """TensorBoard plugin interface. Every plugin must extend from this class."""
   __metaclass__ = ABCMeta
 
+  # The plugin_name will also be a prefix in the http handlers generated by
+  # the plugin, e.g. `data/plugins/$PLUGIN_NAME/$HANDLER`
+  # The plugin name must be unique for each registered plugin, or
+  # a ValueError will be thrown when the application is constructed
+  plugin_name = None
+
   @abstractmethod
   def get_plugin_apps(self, multiplexer, logdir):
     """Returns a set of WSGI applications that the plugin implements.
@@ -45,3 +51,15 @@ class TBPlugin(object):
       A dict mapping route paths to WSGI applications.
     """
     raise NotImplementedError()
+
+  @abstractmethod
+  def is_active(self):
+    """Determines whether this plugin is active.
+
+    A plugin may not be active for instance if it lacks relevant data. If a
+    plugin is inactive, the frontend may avoid issuing requests to its routes.
+
+    Returns:
+      A boolean value. Whether this plugin is active.
+    """
+    raise NotImplementedError()
diff --git a/tensorflow/tensorboard/plugins/debugger/BUILD b/tensorflow/tensorboard/plugins/debugger/BUILD
deleted file mode 100644
index 38aa719b9b96b1e326ec6333d22d00beea4feaa4..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/debugger/BUILD
+++ /dev/null
@@ -1,55 +0,0 @@
-# Description:
-# TensorBoard plugin for interacting with tfdbg, the TensorFlow debugger
-
-package(default_visibility = ["//tensorflow:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-## TensorFlow Debugger Plugiin ##
-py_library(
-    name = "debugger_plugin",
-    srcs = ["debugger_plugin.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform",
-        "//tensorflow/tensorboard/backend:http_util",
-        "//tensorflow/tensorboard/backend/event_processing:event_accumulator",
-        "//tensorflow/tensorboard/backend/event_processing:event_file_loader",
-        "//tensorflow/tensorboard/plugins:base_plugin",
-    ],
-)
-
-py_test(
-    name = "debugger_plugin_test",
-    size = "small",
-    srcs = ["debugger_plugin_test.py"],
-    main = "debugger_plugin_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        ":debugger_plugin",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:util",
-        "//tensorflow/tensorboard/backend:application",
-        "//tensorflow/tensorboard/backend/event_processing:event_multiplexer",
-        "//third_party/py/numpy",
-        "@org_pocoo_werkzeug//:werkzeug",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        [
-            "*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tensorboard/plugins/debugger/debugger_plugin.py b/tensorflow/tensorboard/plugins/debugger/debugger_plugin.py
deleted file mode 100644
index 43902efe24ed318cce55319eb87b13088c228bb0..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/debugger/debugger_plugin.py
+++ /dev/null
@@ -1,338 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""The plugin for serving data from a TensorFlow debugger."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import glob
-import json
-import os
-import re
-
-from werkzeug import wrappers
-
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.tensorboard.backend import http_util
-from tensorflow.tensorboard.backend.event_processing import event_accumulator
-from tensorflow.tensorboard.backend.event_processing import event_file_loader
-from tensorflow.tensorboard.plugins import base_plugin
-
-# The prefix of routes provided by this plugin.
-PLUGIN_PREFIX_ROUTE = 'debugger'
-
-# HTTP routes.
-_HEALTH_PILLS_ROUTE = '/health_pills'
-
-# The POST key of HEALTH_PILLS_ROUTE for a JSON list of node names.
-_NODE_NAMES_POST_KEY = 'node_names'
-
-# The POST key of HEALTH_PILLS_ROUTE for the run to retrieve health pills for.
-_RUN_POST_KEY = 'run'
-
-# The default run to retrieve health pills for.
-_DEFAULT_RUN = '.'
-
-# The POST key of HEALTH_PILLS_ROUTE for the specific step to retrieve health
-# pills for.
-_STEP_POST_KEY = 'step'
-
-# A glob pattern for files containing debugger-related events.
-_DEBUGGER_EVENTS_GLOB_PATTERN = 'events.debugger*'
-
-
-class DebuggerPlugin(base_plugin.TBPlugin):
-  """TensorFlow Debugger plugin. Receives requests for debugger-related data.
-
-  That data could include health pills, which unveil the status of tensor
-  values.
-  """
-
-  def get_plugin_apps(self, multiplexer, logdir):
-    """Obtains a mapping between routes and handlers. Stores the logdir.
-
-    Args:
-      multiplexer: The EventMultiplexer that provides TB data.
-      logdir: The logdir string - the directory of events files.
-
-    Returns:
-      A mapping between routes and handlers (functions that respond to
-      requests).
-    """
-    self._event_multiplexer = multiplexer
-    self._logdir = logdir
-    return {
-        _HEALTH_PILLS_ROUTE: self._serve_health_pills_handler,
-    }
-
-  @wrappers.Request.application
-  def _serve_health_pills_handler(self, request):
-    """A (wrapped) werkzeug handler for serving health pills.
-
-    Accepts POST requests and responds with health pills. The request accepts
-    several POST parameters:
-
-      node_names: (required string) A JSON-ified list of node names for which
-          the client would like to request health pills.
-      run: (optional string) The run to retrieve health pills for. Defaults to
-          '.'. This data is sent via POST (not GET) since URL length is limited.
-      step: (optional integer): The session run step for which to
-          retrieve health pills. If provided, the handler reads the health pills
-          of that step from disk (which is slow) and produces a response with
-          only health pills at that step. If not provided, the handler returns a
-          response with health pills at all steps sampled by the event
-          multiplexer (the fast path). The motivation here is that, sometimes,
-          one desires to examine health pills at a specific step (to say find
-          the first step that causes a model to blow up with NaNs).
-          get_plugin_apps must be called before this slower feature is used
-          because that method passes the logdir (directory path) to this plugin.
-
-    This handler responds with a JSON-ified object mapping from node names to a
-    list (of size 1) of health pill event objects, each of which has these
-    properties.
-
-    {
-        'wall_time': float,
-        'step': int,
-        'node_name': string,
-        'output_slot': int,
-        # A list of 12 floats that summarizes the elements of the tensor.
-        'value': float[],
-    }
-
-    Node names for which there are no health pills to be found are excluded from
-    the mapping.
-
-    Args:
-      request: The request issued by the client for health pills.
-
-    Returns:
-      A werkzeug BaseResponse object.
-    """
-    if request.method != 'POST':
-      logging.error(
-          '%s requests are forbidden by the debugger plugin.', request.method)
-      return wrappers.Response(status=405)
-
-    if _NODE_NAMES_POST_KEY not in request.form:
-      logging.error(
-          'The %r POST key was not found in the request for health pills.',
-          _NODE_NAMES_POST_KEY)
-      return wrappers.Response(status=400)
-
-    jsonified_node_names = request.form[_NODE_NAMES_POST_KEY]
-    try:
-      node_names = json.loads(jsonified_node_names)
-    except Exception as e:  # pylint: disable=broad-except
-      # Different JSON libs raise different exceptions, so we just do a
-      # catch-all here. This problem is complicated by how Tensorboard might be
-      # run in many different environments, as it is open-source.
-      logging.error('Could not decode node name JSON string %r: %s',
-                    jsonified_node_names, e)
-      return wrappers.Response(status=400)
-
-    if not isinstance(node_names, list):
-      logging.error('%r is not a JSON list of node names:',
-                    jsonified_node_names)
-      return wrappers.Response(status=400)
-
-    run = request.form.get(_RUN_POST_KEY, _DEFAULT_RUN)
-    step_string = request.form.get(_STEP_POST_KEY, None)
-    if step_string is None:
-      # Use all steps sampled by the event multiplexer (Relatively fast).
-      mapping = self._obtain_sampled_health_pills(run, node_names)
-    else:
-      # Read disk to obtain the health pills for that step (Relatively slow).
-      # Make sure that the directory for the run exists.
-      # Determine the directory of events file to read.
-      events_directory = self._logdir
-      if run != _DEFAULT_RUN:
-        # Use the directory for the specific run.
-        events_directory = os.path.join(events_directory, run)
-
-      step = int(step_string)
-      try:
-        mapping = self._obtain_health_pills_at_step(
-            events_directory, node_names, step)
-      except IOError as error:
-        logging.error(
-            'Error retrieving health pills for step %d: %s', step, error)
-        return wrappers.Response(status=404)
-
-    # Convert event_accumulator.HealthPillEvents to JSON-able dicts.
-    jsonable_mapping = {}
-    for node_name, events in mapping.items():
-      jsonable_mapping[node_name] = [e._asdict() for e in events]
-    return http_util.Respond(request, jsonable_mapping, 'application/json')
-
-  def _obtain_sampled_health_pills(self, run, node_names):
-    """Obtains the health pills for a run sampled by the event multiplexer.
-
-    This is much faster than the alternative path of reading health pills from
-    disk.
-
-    Args:
-      run: The run to fetch health pills for.
-      node_names: A list of node names for which to retrieve health pills.
-
-    Returns:
-      A dictionary mapping from node name to a list of
-      event_accumulator.HealthPillEvents.
-    """
-    mapping = {}
-    for node_name in node_names:
-      try:
-        mapping[node_name] = self._event_multiplexer.HealthPills(run, node_name)
-      except KeyError:
-        logging.info('No health pills found for node %r.', node_name)
-        continue
-
-    return mapping
-
-  def _obtain_health_pills_at_step(self, events_directory, node_names, step):
-    """Reads disk to obtain the health pills for a run at a specific step.
-
-    This could be much slower than the alternative path of just returning all
-    health pills sampled by the event multiplexer. It could take tens of minutes
-    to complete this call for large graphs for big step values (in the
-    thousands).
-
-    Args:
-      events_directory: The directory containing events for the desired run.
-      node_names: A list of node names for which to retrieve health pills.
-      step: The step to obtain health pills for.
-
-    Returns:
-      A dictionary mapping from node name to a list of health pill objects (see
-      docs for _serve_health_pills_handler for properties of those objects).
-
-    Raises:
-      IOError: If no files with health pill events could be found.
-    """
-    # Obtain all files with debugger-related events.
-    pattern = os.path.join(events_directory, _DEBUGGER_EVENTS_GLOB_PATTERN)
-    file_paths = glob.glob(pattern)
-
-    if not file_paths:
-      raise IOError(
-          'No events files found that matches the pattern %r.', pattern)
-
-    # Sort by name (and thus by timestamp).
-    file_paths.sort()
-
-    mapping = collections.defaultdict(list)
-    node_name_set = frozenset(node_names)
-
-    for file_path in file_paths:
-      should_stop = self._process_health_pill_event(
-          node_name_set, mapping, step, file_path)
-      if should_stop:
-        break
-
-    return mapping
-
-  def _process_health_pill_event(self, node_name_set, mapping, target_step,
-                                 file_path):
-    """Creates health pills out of data in an event.
-
-    Creates health pills out of the event and adds them to the mapping.
-
-    Args:
-      node_name_set: A set of node names that are relevant.
-      mapping: The mapping from node name to event_accumulator.HealthPillEvents.
-          This object may be destructively modified.
-      target_step: The target step at which to obtain health pills.
-      file_path: The path to the file with health pill events.
-
-    Returns:
-      Whether we should stop reading events because future events are no longer
-      relevant.
-    """
-    events_loader = event_file_loader.EventFileLoader(file_path)
-    for event in events_loader.Load():
-      if not event.HasField('summary'):
-        logging.warning('An event in a debugger events file lacks a summary.')
-        continue
-
-      if event.step < target_step:
-        # This event is not of the relevant step. We perform this check
-        # first because the majority of events will be eliminated from
-        # consideration by this check.
-        continue
-
-      if event.step > target_step:
-        # We have passed the relevant step. No need to read more events.
-        return True
-
-      for value in event.summary.value:
-        # Since we seek health pills for a specific step, this function
-        # returns 1 health pill per node per step. The wall time is the
-        # seconds since the epoch.
-        health_pill = self._process_health_pill_value(
-            node_name_set, event.wall_time, event.step, value)
-        if not health_pill:
-          continue
-        mapping[health_pill.node_name].append(health_pill)
-
-    # Keep reading events.
-    return False
-
-  def _process_health_pill_value(self, node_name_set, wall_time, step, value):
-    """Creates a dict containing various properties of a health pill.
-
-    Args:
-      node_name_set: A set of node names that are relevant.
-      wall_time: The wall time in seconds.
-      step: The session run step of the event.
-      value: The health pill value.
-
-    Returns:
-      An event_accumulator.HealthPillEvent. Or None if one could not be created.
-    """
-    if not value.HasField('tensor'):
-      logging.warning(
-          'An event in a debugger events file lacks a tensor value.')
-      return None
-
-    if value.tag != event_accumulator.HEALTH_PILL_EVENT_TAG:
-      logging.warning(
-          ('A debugger-related event lacks the %r tag. It instead has '
-           'the %r tag.'), event_accumulator.HEALTH_PILL_EVENT_TAG, value.tag)
-      return None
-
-    match = re.match(r'^(.*):(\d+):DebugNumericSummary$', value.node_name)
-    if not match:
-      logging.warning(
-          ('A event with a health pill has an invalid watch, (i.e., an '
-           'unexpected debug op): %r'), value.node_name)
-      return None
-
-    node_name = match.group(1)
-    if node_name not in node_name_set:
-      # This event is not relevant.
-      return None
-
-    # Since we seek health pills for a specific step, this function
-    # returns 1 health pill per node per step. The wall time is the
-    # seconds since the epoch.
-    return event_accumulator.HealthPillEvent(
-        wall_time=wall_time,
-        step=step,
-        node_name=node_name,
-        output_slot=int(match.group(2)),
-        value=list(tensor_util.MakeNdarray(value.tensor)))
diff --git a/tensorflow/tensorboard/plugins/debugger/debugger_plugin_test.py b/tensorflow/tensorboard/plugins/debugger/debugger_plugin_test.py
deleted file mode 100644
index 9e71e2713d18c9e84eb675c7f2bbbb202f2d9068..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/plugins/debugger/debugger_plugin_test.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests the Tensorboard debugger data plugin."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import json
-import os
-import shutil
-
-import numpy as np
-from werkzeug import test as werkzeug_test
-from werkzeug import wrappers
-
-from tensorflow.core.framework import types_pb2
-from tensorflow.core.util import event_pb2
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-from tensorflow.tensorboard.backend import application
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.plugins.debugger import debugger_plugin
-
-
-class DebuggerPluginTest(test.TestCase):
-
-  def setUp(self):
-    # Populate the log directory with debugger event for run '.'.
-    self.log_dir = self.get_temp_dir()
-    file_prefix = compat.as_bytes(os.path.join(self.log_dir, 'events.debugger'))
-    writer = pywrap_tensorflow.EventsWriter(file_prefix)
-    writer.WriteEvent(
-        self._CreateEventWithDebugNumericSummary(
-            op_name='layers/Matmul',
-            output_slot=0,
-            wall_time=42,
-            step=2,
-            list_of_values=[1, 2, 3]))
-    writer.WriteEvent(
-        self._CreateEventWithDebugNumericSummary(
-            op_name='layers/Matmul',
-            output_slot=1,
-            wall_time=43,
-            step=7,
-            list_of_values=[4, 5, 6]))
-    writer.WriteEvent(
-        self._CreateEventWithDebugNumericSummary(
-            op_name='logits/Add',
-            output_slot=0,
-            wall_time=1337,
-            step=7,
-            list_of_values=[7, 8, 9]))
-    writer.WriteEvent(
-        self._CreateEventWithDebugNumericSummary(
-            op_name='logits/Add',
-            output_slot=0,
-            wall_time=1338,
-            step=8,
-            list_of_values=[10, 11, 12]))
-    writer.Close()
-
-    # Populate the log directory with debugger event for run 'run_foo'.
-    run_foo_directory = os.path.join(self.log_dir, 'run_foo')
-    os.mkdir(run_foo_directory)
-    file_prefix = compat.as_bytes(
-        os.path.join(run_foo_directory, 'events.debugger'))
-    writer = pywrap_tensorflow.EventsWriter(file_prefix)
-    writer.WriteEvent(
-        self._CreateEventWithDebugNumericSummary(
-            op_name='layers/Variable',
-            output_slot=0,
-            wall_time=4242,
-            step=42,
-            list_of_values=[13, 14, 15]))
-    writer.Close()
-
-    # Start a server that will receive requests and respond with health pills.
-    self.multiplexer = event_multiplexer.EventMultiplexer({
-        '.': self.log_dir,
-        'run_foo': run_foo_directory,
-    })
-    self.plugin = debugger_plugin.DebuggerPlugin()
-    wsgi_app = application.TensorBoardWSGIApp(
-        self.log_dir, {'debugger': self.plugin},
-        self.multiplexer,
-        reload_interval=0)
-    self.server = werkzeug_test.Client(wsgi_app, wrappers.BaseResponse)
-
-  def tearDown(self):
-    # Remove the directory with debugger-related events files.
-    shutil.rmtree(self.log_dir, ignore_errors=True)
-
-  def _CreateEventWithDebugNumericSummary(
-      self, op_name, output_slot, wall_time, step, list_of_values):
-    """Creates event with a health pill summary.
-
-    Args:
-      op_name: The name of the op to which a DebugNumericSummary was attached.
-      output_slot: The numeric output slot for the tensor.
-      wall_time: The numeric wall time of the event.
-      step: The step of the event.
-      list_of_values: A python list of values within the tensor.
-
-    Returns:
-      A event_pb2.Event with a health pill summary.
-    """
-    event = event_pb2.Event(step=step, wall_time=wall_time)
-    value = event.summary.value.add(
-        tag='__health_pill__',
-        node_name='%s:%d:DebugNumericSummary' % (op_name, output_slot))
-    value.tensor.tensor_shape.dim.add(size=len(list_of_values))
-    value.tensor.dtype = types_pb2.DT_DOUBLE
-    value.tensor.tensor_content = np.array(
-        list_of_values, dtype=np.float64).tobytes()
-    return event
-
-  def _DeserializeResponse(self, byte_content):
-    """Deserializes byte content that is a JSON encoding.
-
-    Args:
-      byte_content: The byte content of a JSON response.
-
-    Returns:
-      The deserialized python object decoded from JSON.
-    """
-    return json.loads(byte_content.decode('utf-8'))
-
-  def testHealthPillsRouteProvided(self):
-    """Tests that the plugin offers the route for requesting health pills."""
-    apps = self.plugin.get_plugin_apps(self.multiplexer, self.log_dir)
-    self.assertIn('/health_pills', apps)
-    self.assertIsInstance(apps['/health_pills'], collections.Callable)
-
-  def testRequestHealthPillsForRunFoo(self):
-    """Tests that the plugin produces health pills for a specified run."""
-    response = self.server.post(
-        '/data/plugin/debugger/health_pills',
-        data={
-            'node_names': json.dumps(['layers/Variable', 'unavailable_node']),
-            'run': 'run_foo',
-        })
-    self.assertEqual(200, response.status_code)
-    self.assertDictEqual({
-        'layers/Variable': [{
-            'wall_time': 4242,
-            'step': 42,
-            'node_name': 'layers/Variable',
-            'output_slot': 0,
-            'value': [13, 14, 15],
-        }],
-    }, self._DeserializeResponse(response.get_data()))
-
-  def testRequestHealthPillsForDefaultRun(self):
-    """Tests that the plugin produces health pills for the default '.' run."""
-    # Do not provide a 'run' parameter in POST data.
-    response = self.server.post(
-        '/data/plugin/debugger/health_pills',
-        data={
-            'node_names': json.dumps(['logits/Add', 'unavailable_node']),
-        })
-    self.assertEqual(200, response.status_code)
-    # The health pills for 'layers/Matmul' should not be included since the
-    # request excluded that node name.
-    self.assertDictEqual({
-        'logits/Add': [
-            {
-                'wall_time': 1337,
-                'step': 7,
-                'node_name': 'logits/Add',
-                'output_slot': 0,
-                'value': [7, 8, 9],
-            },
-            {
-                'wall_time': 1338,
-                'step': 8,
-                'node_name': 'logits/Add',
-                'output_slot': 0,
-                'value': [10, 11, 12],
-            },
-        ],
-    }, self._DeserializeResponse(response.get_data()))
-
-  def testGetRequestsUnsupported(self):
-    """Tests that GET requests are unsupported."""
-    response = self.server.get('/data/plugin/debugger/health_pills')
-    self.assertEqual(405, response.status_code)
-
-  def testRequestsWithoutProperPostKeyUnsupported(self):
-    """Tests that requests lacking the node_names POST key are unsupported."""
-    response = self.server.post('/data/plugin/debugger/health_pills')
-    self.assertEqual(400, response.status_code)
-
-  def testRequestsWithBadJsonUnsupported(self):
-    """Tests that requests with undecodable JSON are unsupported."""
-    response = self.server.post(
-        '/data/plugin/debugger/health_pills',
-        data={
-            'node_names': 'some obviously non JSON text',
-        })
-    self.assertEqual(400, response.status_code)
-
-  def testRequestsWithNonListPostDataUnsupported(self):
-    """Tests that requests with loads lacking lists of ops are unsupported."""
-    response = self.server.post(
-        '/data/plugin/debugger/health_pills',
-        data={
-            'node_names': json.dumps({
-                'this is a dict': 'and not a list.'
-            }),
-        })
-    self.assertEqual(400, response.status_code)
-
-  def testFetchHealthPillsForSpecificStep(self):
-    """Tests that requesting health pills at a specific steps works.
-
-    This path may be slow in real life because it reads from disk.
-    """
-    # Request health pills for these nodes at step 7 specifically.
-    response = self.server.post(
-        '/data/plugin/debugger/health_pills',
-        data={
-            'node_names': json.dumps(['logits/Add', 'layers/Matmul']),
-            'step': 7
-        })
-    self.assertEqual(200, response.status_code)
-    # The response should only include health pills at step 7.
-    self.assertDictEqual({
-        'logits/Add': [
-            {
-                'wall_time': 1337,
-                'step': 7,
-                'node_name': 'logits/Add',
-                'output_slot': 0,
-                'value': [7, 8, 9],
-            },
-        ],
-        'layers/Matmul': [
-            {
-                'wall_time': 43,
-                'step': 7,
-                'node_name': 'layers/Matmul',
-                'output_slot': 1,
-                'value': [4, 5, 6],
-            },
-        ],
-    }, self._DeserializeResponse(response.get_data()))
-
-  def testNoHealthPillsForSpecificStep(self):
-    """Tests that an empty mapping is returned for no health pills at a step."""
-    response = self.server.post(
-        '/data/plugin/debugger/health_pills',
-        data={
-            'node_names': json.dumps(['some/clearly/non-existent/op']),
-            'step': 7
-        })
-    self.assertEqual(200, response.status_code)
-    self.assertDictEqual({}, self._DeserializeResponse(response.get_data()))
-
-  def testNoHealthPillsForOutOfRangeStep(self):
-    """Tests that an empty mapping is returned for an out of range step."""
-    response = self.server.post(
-        '/data/plugin/debugger/health_pills',
-        data={
-            'node_names': json.dumps(['logits/Add', 'layers/Matmul']),
-            # This step higher than that of any event written to disk.
-            'step': 42424242
-        })
-    self.assertEqual(200, response.status_code)
-    self.assertDictEqual({}, self._DeserializeResponse(response.get_data()))
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/tensorboard/plugins/projector/BUILD b/tensorflow/tensorboard/plugins/projector/BUILD
index 7c0ab64fb8df91426233176aa54847df18982767..b4ed26c2065e73eccaf39b28afaaef118a4209d6 100644
--- a/tensorflow/tensorboard/plugins/projector/BUILD
+++ b/tensorflow/tensorboard/plugins/projector/BUILD
@@ -7,17 +7,15 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
 py_library(
     name = "projector_plugin",
     srcs = ["projector_plugin.py"],
     srcs_version = "PY2AND3",
-    visibility = [
-        "//learning/vis/projector:__subpackages__",
-        "//tensorflow:internal",
-    ],
+    visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/tensorboard:protos_all_py",
+        ":protos_all_py",
         "//tensorflow/python:errors",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:lib",
@@ -54,6 +52,12 @@ py_test(
     ],
 )
 
+tf_proto_library(
+    name = "protos_all",
+    srcs = glob(["*.proto"]),
+    visibility = ["//visibility:public"],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(["**"]),
diff --git a/tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto b/tensorflow/tensorboard/plugins/projector/projector_config.proto
similarity index 100%
rename from tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
rename to tensorflow/tensorboard/plugins/projector/projector_config.proto
diff --git a/tensorflow/tensorboard/plugins/projector/projector_plugin.py b/tensorflow/tensorboard/plugins/projector/projector_plugin.py
index 70450bf2a963f68c3f6e07193d8f60da1f314b30..f631a3d19b498bc618ccb13d3a57dde7e6f7c520 100644
--- a/tensorflow/tensorboard/plugins/projector/projector_plugin.py
+++ b/tensorflow/tensorboard/plugins/projector/projector_plugin.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import imghdr
 import math
 import os
@@ -27,7 +28,6 @@ from six import BytesIO
 from werkzeug import wrappers
 from google.protobuf import json_format
 from google.protobuf import text_format
-from tensorflow.contrib.tensorboard.plugins.projector import projector_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -40,11 +40,17 @@ from tensorflow.python.training.saver import checkpoint_exists
 from tensorflow.python.training.saver import latest_checkpoint
 from tensorflow.tensorboard.backend.http_util import Respond
 from tensorflow.tensorboard.plugins.base_plugin import TBPlugin
+from tensorflow.tensorboard.plugins.projector import projector_config_pb2
 
 # The prefix of routes provided by this plugin.
-PLUGIN_PREFIX_ROUTE = 'projector'
+_PLUGIN_PREFIX_ROUTE = 'projector'
 
 PROJECTOR_FILENAME = 'projector_config.pbtxt'
+_PLUGIN_NAME = 'org_tensorflow_tensorboard_projector'
+_PLUGINS_DIR = 'plugins'
+
+# Number of tensors in the LRU cache.
+_TENSOR_CACHE_CAPACITY = 1
 
 # HTTP routes.
 CONFIG_ROUTE = '/info'
@@ -63,6 +69,34 @@ _IMGHDR_TO_MIMETYPE = {
 _DEFAULT_IMAGE_MIMETYPE = 'application/octet-stream'
 
 
+class LRUCache(object):
+  """LRU cache. Used for storing the last used tensor."""
+
+  def __init__(self, size):
+    if size < 1:
+      raise ValueError('The cache size must be >=1')
+    self._size = size
+    self._dict = collections.OrderedDict()
+
+  def get(self, key):
+    try:
+      value = self._dict.pop(key)
+      self._dict[key] = value
+      return value
+    except KeyError:
+      return None
+
+  def set(self, key, value):
+    if value is None:
+      raise ValueError('value must be != None')
+    try:
+      self._dict.pop(key)
+    except KeyError:
+      if len(self._dict) >= self._size:
+        self._dict.popitem(last=False)
+    self._dict[key] = value
+
+
 class EmbeddingMetadata(object):
   """Metadata container for an embedding.
 
@@ -112,7 +146,7 @@ class EmbeddingMetadata(object):
 
 class ProjectorPluginAsset(plugin_asset.PluginAsset):
   """Provides a registry for assets needed by the Projector plugin."""
-  plugin_name = 'org_tensorflow_tensorboard_projector'
+  plugin_name = _PLUGIN_NAME
 
   def __init__(self):
     self._config = projector_config_pb2.ProjectorConfig()
@@ -250,7 +284,7 @@ class ProjectorPluginAsset(plugin_asset.PluginAsset):
     return self._assets
 
 
-def _read_tensor_file(fpath):
+def _read_tensor_tsv_file(fpath):
   with file_io.FileIO(fpath, 'r') as f:
     tensor = []
     for line in f:
@@ -259,12 +293,20 @@ def _read_tensor_file(fpath):
   return np.array(tensor, dtype='float32')
 
 
+def _assets_dir_to_logdir(assets_dir):
+  sub_path = os.path.sep + _PLUGINS_DIR + os.path.sep
+  if sub_path in assets_dir:
+    two_parents_up = os.pardir + os.path.sep + os.pardir
+    return os.path.abspath(os.path.join(assets_dir, two_parents_up))
+  return assets_dir
+
+
 def _latest_checkpoints_changed(configs, run_path_pairs):
   """Returns true if the latest checkpoint has changed in any of the runs."""
-  for run_name, logdir in run_path_pairs:
+  for run_name, assets_dir in run_path_pairs:
     if run_name not in configs:
       config = projector_config_pb2.ProjectorConfig()
-      config_fpath = os.path.join(logdir, PROJECTOR_FILENAME)
+      config_fpath = os.path.join(assets_dir, PROJECTOR_FILENAME)
       if file_io.file_exists(config_fpath):
         file_content = file_io.read_file_to_string(config_fpath)
         text_format.Merge(file_content, config)
@@ -272,6 +314,7 @@ def _latest_checkpoints_changed(configs, run_path_pairs):
       config = configs[run_name]
 
     # See if you can find a checkpoint file in the logdir.
+    logdir = _assets_dir_to_logdir(assets_dir)
     ckpt_path = _find_latest_checkpoint(logdir)
     if not ckpt_path:
       continue
@@ -302,9 +345,18 @@ def _parse_positive_int_param(request, param_name):
     return -1
 
 
+def _rel_to_abs_asset_path(fpath, config_fpath):
+  fpath = os.path.expanduser(fpath)
+  if not os.path.isabs(fpath):
+    return os.path.join(os.path.dirname(config_fpath), fpath)
+  return fpath
+
+
 class ProjectorPlugin(TBPlugin):
   """Embedding projector."""
 
+  plugin_name = _PLUGIN_PREFIX_ROUTE
+
   def __init__(self):
     self._handlers = None
     self.readers = {}
@@ -312,8 +364,11 @@ class ProjectorPlugin(TBPlugin):
     self.logdir = None
     self._configs = None
     self.old_num_run_paths = None
+    self.multiplexer = None
+    self.tensor_cache = LRUCache(_TENSOR_CACHE_CAPACITY)
 
   def get_plugin_apps(self, multiplexer, logdir):
+    self.multiplexer = multiplexer
     self.run_paths = multiplexer.RunPaths()
     self.logdir = logdir
     self._handlers = {
@@ -326,10 +381,21 @@ class ProjectorPlugin(TBPlugin):
     }
     return self._handlers
 
+  def is_active(self):
+    """Determines whether this plugin is active.
+
+    This plugin is only active if any run has an embedding.
+
+    Returns:
+      A boolean. Whether this plugin is active.
+    """
+    return bool(self.configs)
+
   @property
   def configs(self):
     """Returns a map of run paths to `ProjectorConfig` protos."""
     run_path_pairs = list(self.run_paths.items())
+    self._append_plugin_asset_directories(run_path_pairs)
     # If there are no summary event files, the projector should still work,
     # treating the `logdir` as the model checkpoint directory.
     if not run_path_pairs:
@@ -357,7 +423,12 @@ class ProjectorPlugin(TBPlugin):
           embedding.tensor_name = embedding.tensor_name[:-2]
         # Find the size of embeddings associated with a tensors file.
         if embedding.tensor_path and not embedding.tensor_shape:
-          tensor = _read_tensor_file(embedding.tensor_path)
+          fpath = _rel_to_abs_asset_path(embedding.tensor_path,
+                                         self.config_fpaths[run])
+          tensor = self.tensor_cache.get(embedding.tensor_name)
+          if tensor is None:
+            tensor = _read_tensor_tsv_file(fpath)
+            self.tensor_cache.set(embedding.tensor_name, tensor)
           embedding.tensor_shape.extend([len(tensor), len(tensor[0])])
 
       reader = self._get_reader_for_run(run)
@@ -395,21 +466,23 @@ class ProjectorPlugin(TBPlugin):
     """Reads and returns the projector config files in every run directory."""
     configs = {}
     config_fpaths = {}
-    for run_name, logdir in run_path_pairs:
+    for run_name, assets_dir in run_path_pairs:
       config = projector_config_pb2.ProjectorConfig()
-      config_fpath = os.path.join(logdir, PROJECTOR_FILENAME)
+      config_fpath = os.path.join(assets_dir, PROJECTOR_FILENAME)
       if file_io.file_exists(config_fpath):
         file_content = file_io.read_file_to_string(config_fpath)
         text_format.Merge(file_content, config)
-
       has_tensor_files = False
       for embedding in config.embeddings:
         if embedding.tensor_path:
+          if not embedding.tensor_name:
+            embedding.tensor_name = os.path.basename(embedding.tensor_path)
           has_tensor_files = True
           break
 
       if not config.model_checkpoint_path:
         # See if you can find a checkpoint file in the logdir.
+        logdir = _assets_dir_to_logdir(assets_dir)
         ckpt_path = _find_latest_checkpoint(logdir)
         if not ckpt_path and not has_tensor_files:
           continue
@@ -419,7 +492,7 @@ class ProjectorPlugin(TBPlugin):
       # Sanity check for the checkpoint file.
       if (config.model_checkpoint_path and
           not checkpoint_exists(config.model_checkpoint_path)):
-        logging.warning('Checkpoint file %s not found',
+        logging.warning('Checkpoint file "%s" not found',
                         config.model_checkpoint_path)
         continue
       configs[run_name] = config
@@ -436,7 +509,7 @@ class ProjectorPlugin(TBPlugin):
       try:
         reader = NewCheckpointReader(config.model_checkpoint_path)
       except Exception:  # pylint: disable=broad-except
-        logging.warning('Failed reading %s', config.model_checkpoint_path)
+        logging.warning('Failed reading "%s"', config.model_checkpoint_path)
     self.readers[run] = reader
     return reader
 
@@ -467,6 +540,14 @@ class ProjectorPlugin(TBPlugin):
         return info
     return None
 
+  def _append_plugin_asset_directories(self, run_path_pairs):
+    for run, assets in self.multiplexer.PluginAssets(_PLUGIN_NAME).items():
+      if PROJECTOR_FILENAME not in assets:
+        continue
+      assets_dir = os.path.join(self.run_paths[run], _PLUGINS_DIR, _PLUGIN_NAME)
+      assets_path_pair = (run, os.path.abspath(assets_dir))
+      run_path_pairs.append(assets_path_pair)
+
   @wrappers.Request.application
   def _serve_runs(self, request):
     """Returns a list of runs that have embeddings."""
@@ -479,7 +560,7 @@ class ProjectorPlugin(TBPlugin):
       return Respond(request, 'query parameter "run" is required', 'text/plain',
                      400)
     if run not in self.configs:
-      return Respond(request, 'Unknown run: %s' % run, 'text/plain', 400)
+      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
 
     config = self.configs[run]
     return Respond(request,
@@ -503,17 +584,19 @@ class ProjectorPlugin(TBPlugin):
                      'text/plain', 400)
 
     if run not in self.configs:
-      return Respond(request, 'Unknown run: %s' % run, 'text/plain', 400)
+      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
 
     config = self.configs[run]
     fpath = self._get_metadata_file_for_tensor(name, config)
     if not fpath:
       return Respond(
           request,
-          'No metadata file found for tensor %s in the config file %s' %
+          'No metadata file found for tensor "%s" in the config file "%s"' %
           (name, self.config_fpaths[run]), 'text/plain', 400)
+    fpath = _rel_to_abs_asset_path(fpath, self.config_fpaths[run])
     if not file_io.file_exists(fpath) or file_io.is_directory(fpath):
-      return Respond(request, '%s is not a file' % fpath, 'text/plain', 400)
+      return Respond(request, '"%s" not found, or is not a file' % fpath,
+                     'text/plain', 400)
 
     num_header_rows = 0
     with file_io.FileIO(fpath, 'r') as f:
@@ -546,35 +629,39 @@ class ProjectorPlugin(TBPlugin):
                      'text/plain', 400)
 
     if run not in self.configs:
-      return Respond(request, 'Unknown run: %s' % run, 'text/plain', 400)
+      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
 
-    reader = self._get_reader_for_run(run)
     config = self.configs[run]
 
-    if reader is None:
+    tensor = self.tensor_cache.get(name)
+    if tensor is None:
       # See if there is a tensor file in the config.
       embedding = self._get_embedding(name, config)
-      if not embedding or not embedding.tensor_path:
-        return Respond(request,
-                       'Tensor %s has no tensor_path in the config' % name,
-                       'text/plain', 400)
-      if not file_io.file_exists(embedding.tensor_path):
-        return Respond(request,
-                       'Tensor file %s does not exist' % embedding.tensor_path,
-                       'text/plain', 400)
-      tensor = _read_tensor_file(embedding.tensor_path)
-    else:
-      if not reader.has_tensor(name):
-        return Respond(request, 'Tensor %s not found in checkpoint dir %s' %
-                       (name, config.model_checkpoint_path), 'text/plain', 400)
-      try:
-        tensor = reader.get_tensor(name)
-      except errors.InvalidArgumentError as e:
-        return Respond(request, str(e), 'text/plain', 400)
+
+      if embedding and embedding.tensor_path:
+        fpath = _rel_to_abs_asset_path(embedding.tensor_path,
+                                       self.config_fpaths[run])
+        if not file_io.file_exists(fpath):
+          return Respond(request,
+                         'Tensor file "%s" does not exist' % fpath,
+                         'text/plain', 400)
+        tensor = _read_tensor_tsv_file(fpath)
+      else:
+        reader = self._get_reader_for_run(run)
+        if not reader or not reader.has_tensor(name):
+          return Respond(request,
+                         'Tensor "%s" not found in checkpoint dir "%s"' %
+                         (name, config.model_checkpoint_path), 'text/plain',
+                         400)
+        try:
+          tensor = reader.get_tensor(name)
+        except errors.InvalidArgumentError as e:
+          return Respond(request, str(e), 'text/plain', 400)
+
+      self.tensor_cache.set(name, tensor)
 
     if num_rows:
       tensor = tensor[:num_rows]
-
     if tensor.dtype != 'float32':
       tensor = tensor.astype(dtype='float32', copy=False)
     data_bytes = tensor.tobytes()
@@ -593,17 +680,19 @@ class ProjectorPlugin(TBPlugin):
                      'text/plain', 400)
 
     if run not in self.configs:
-      return Respond(request, 'Unknown run: %s' % run, 'text/plain', 400)
+      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
 
     config = self.configs[run]
     fpath = self._get_bookmarks_file_for_tensor(name, config)
     if not fpath:
       return Respond(
           request,
-          'No bookmarks file found for tensor %s in the config file %s' %
+          'No bookmarks file found for tensor "%s" in the config file "%s"' %
           (name, self.config_fpaths[run]), 'text/plain', 400)
+    fpath = _rel_to_abs_asset_path(fpath, self.config_fpaths[run])
     if not file_io.file_exists(fpath) or file_io.is_directory(fpath):
-      return Respond(request, '%s is not a file' % fpath, 'text/plain', 400)
+      return Respond(request, '"%s" not found, or is not a file' % fpath,
+                     'text/plain', 400)
 
     bookmarks_json = None
     with file_io.FileIO(fpath, 'rb') as f:
@@ -623,7 +712,7 @@ class ProjectorPlugin(TBPlugin):
                      'text/plain', 400)
 
     if run not in self.configs:
-      return Respond(request, 'Unknown run: %s' % run, 'text/plain', 400)
+      return Respond(request, 'Unknown run: "%s"' % run, 'text/plain', 400)
 
     config = self.configs[run]
     embedding_info = self._get_embedding(name, config)
@@ -631,12 +720,13 @@ class ProjectorPlugin(TBPlugin):
     if not embedding_info or not embedding_info.sprite.image_path:
       return Respond(
           request,
-          'No sprite image file found for tensor %s in the config file %s' %
+          'No sprite image file found for tensor "%s" in the config file "%s"' %
           (name, self.config_fpaths[run]), 'text/plain', 400)
 
     fpath = os.path.expanduser(embedding_info.sprite.image_path)
+    fpath = _rel_to_abs_asset_path(fpath, self.config_fpaths[run])
     if not file_io.file_exists(fpath) or file_io.is_directory(fpath):
-      return Respond(request, '%s does not exist or is directory' % fpath,
+      return Respond(request, '"%s" does not exist or is directory' % fpath,
                      'text/plain', 400)
     f = file_io.FileIO(fpath, 'rb')
     encoded_image_string = f.read()
diff --git a/tensorflow/tensorboard/plugins/projector/projector_plugin_test.py b/tensorflow/tensorboard/plugins/projector/projector_plugin_test.py
index 069e8be84ecd5ddcc0f35f8939afd277e779af3f..81330b3a762c8ab8085273a656bd3c5262e417e9 100644
--- a/tensorflow/tensorboard/plugins/projector/projector_plugin_test.py
+++ b/tensorflow/tensorboard/plugins/projector/projector_plugin_test.py
@@ -28,8 +28,9 @@ import numpy as np
 from werkzeug import test as werkzeug_test
 from werkzeug import wrappers
 from google.protobuf import text_format
-from tensorflow.contrib.tensorboard.plugins.projector import projector_config_pb2
+from tensorflow.core.framework import summary_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.core.util import event_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import image_ops
@@ -43,6 +44,7 @@ from tensorflow.python.summary.writer import writer
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.tensorboard.backend import application
 from tensorflow.tensorboard.backend.event_processing import event_multiplexer
+from tensorflow.tensorboard.plugins.projector import projector_config_pb2
 from tensorflow.tensorboard.plugins.projector import projector_plugin
 
 
@@ -55,7 +57,7 @@ class ProjectorAppTest(test.TestCase):
     self._GenerateProjectorTestData()
     self._SetupWSGIApp()
     run_json = self._GetJson('/data/plugin/projector/runs')
-    self.assertEqual(run_json, ['.'])
+    self.assertTrue(run_json)
 
   def testRunsWithNoCheckpoint(self):
     self._SetupWSGIApp()
@@ -73,14 +75,49 @@ class ProjectorAppTest(test.TestCase):
     run_json = self._GetJson('/data/plugin/projector/runs')
     self.assertEqual(run_json, [])
 
-  def testInfoWithValidCheckpoint(self):
+  def testRunsWithInvalidModelCheckpointPathInConfig(self):
+    config_path = os.path.join(self.log_dir, 'projector_config.pbtxt')
+    config = projector_config_pb2.ProjectorConfig()
+    config.model_checkpoint_path = 'does_not_exist'
+    embedding = config.embeddings.add()
+    embedding.tensor_name = 'var1'
+    with gfile.GFile(config_path, 'w') as f:
+      f.write(text_format.MessageToString(config))
+    self._SetupWSGIApp()
+
+    run_json = self._GetJson('/data/plugin/projector/runs')
+    self.assertEqual(run_json, [])
+
+  def testInfoWithValidCheckpointNoEventsData(self):
     self._GenerateProjectorTestData()
     self._SetupWSGIApp()
 
     info_json = self._GetJson('/data/plugin/projector/info?run=.')
     self.assertItemsEqual(info_json['embeddings'], [{
         'tensorShape': [1, 2],
-        'tensorName': 'var1'
+        'tensorName': 'var1',
+        'bookmarksPath': 'bookmarks.json'
+    }, {
+        'tensorShape': [10, 10],
+        'tensorName': 'var2'
+    }, {
+        'tensorShape': [100, 100],
+        'tensorName': 'var3'
+    }])
+
+  def testInfoWithValidCheckpointAndEventsData(self):
+    self._GenerateProjectorTestData()
+    self._GenerateEventsData()
+    self._SetupWSGIApp()
+
+    run_json = self._GetJson('/data/plugin/projector/runs')
+    self.assertTrue(run_json)
+    run = run_json[0]
+    info_json = self._GetJson('/data/plugin/projector/info?run=%s' % run)
+    self.assertItemsEqual(info_json['embeddings'], [{
+        'tensorShape': [1, 2],
+        'tensorName': 'var1',
+        'bookmarksPath': 'bookmarks.json'
     }, {
         'tensorShape': [10, 10],
         'tensorName': 'var2'
@@ -95,19 +132,286 @@ class ProjectorAppTest(test.TestCase):
 
     url = '/data/plugin/projector/tensor?run=.&name=var1'
     tensor_bytes = self._Get(url).data
-    tensor = np.reshape(np.fromstring(tensor_bytes, dtype='float32'), [1, 2])
-    expected_tensor = np.array([[6, 6]], dtype='float32')
+    expected_tensor = np.array([[6, 6]], dtype=np.float32)
+    self._AssertTensorResponse(tensor_bytes, expected_tensor)
+
+  def testBookmarksRequestMissingRunAndName(self):
+    self._GenerateProjectorTestData()
+    self._SetupWSGIApp()
+
+    url = '/data/plugin/projector/bookmarks'
+    self.assertEqual(self._Get(url).status_code, 400)
+
+  def testBookmarksRequestMissingName(self):
+    self._GenerateProjectorTestData()
+    self._SetupWSGIApp()
+
+    url = '/data/plugin/projector/bookmarks?run=.'
+    self.assertEqual(self._Get(url).status_code, 400)
+
+  def testBookmarksRequestMissingRun(self):
+    self._GenerateProjectorTestData()
+    self._SetupWSGIApp()
+
+    url = '/data/plugin/projector/bookmarks?name=var1'
+    self.assertEqual(self._Get(url).status_code, 400)
+
+  def testBookmarksUnknownRun(self):
+    self._GenerateProjectorTestData()
+    self._SetupWSGIApp()
+
+    url = '/data/plugin/projector/bookmarks?run=unknown&name=var1'
+    self.assertEqual(self._Get(url).status_code, 400)
+
+  def testBookmarksUnknownName(self):
+    self._GenerateProjectorTestData()
+    self._SetupWSGIApp()
+
+    url = '/data/plugin/projector/bookmarks?run=.&name=unknown'
+    self.assertEqual(self._Get(url).status_code, 400)
+
+  def testBookmarks(self):
+    self._GenerateProjectorTestData()
+    self._SetupWSGIApp()
+
+    url = '/data/plugin/projector/bookmarks?run=.&name=var1'
+    bookmark = self._GetJson(url)
+    self.assertEqual(bookmark, {'a': 'b'})
+
+  def testEndpointsNoAssets(self):
+    g = ops.Graph()
+    with g.as_default():
+      plugin_asset.get_plugin_asset(projector_plugin.ProjectorPluginAsset)
+
+    fw = writer.FileWriter(self.log_dir, graph=g)
+    fw.close()
+
+    self._SetupWSGIApp()
+    run_json = self._GetJson('/data/plugin/projector/runs')
+    self.assertEqual(run_json, [])
+
+  def testEndpointsMetadataForVariableAssets(self):
+    self._GenerateProjectorTestData()
+    g = ops.Graph()
+    with g.as_default():
+      manager = plugin_asset.get_plugin_asset(
+          projector_plugin.ProjectorPluginAsset)
+
+    metadata = projector_plugin.EmbeddingMetadata(3)
+    metadata.add_column('labels', ['a', 'b', 'c'])
+    manager.add_metadata_for_embedding_variable('test', metadata)
+
+    fw = writer.FileWriter(self.log_dir, graph=g)
+    fw.close()
+
+    self._SetupWSGIApp()
+    run_json = self._GetJson('/data/plugin/projector/runs')
+    self.assertTrue(run_json)
+
+    run = run_json[0]
+    metedata_query = '/data/plugin/projector/metadata?run=%s&name=test' % run
+    metadata_tsv = self._Get(metedata_query).data
+    self.assertEqual(metadata_tsv, b'a\nb\nc\n')
+
+    unk_tensor_query = '/data/plugin/projector/tensor?run=%s&name=test' % run
+    response = self._Get(unk_tensor_query)
+    self.assertEqual(response.status_code, 400)
+
+    expected_tensor = np.array([[6, 6]], dtype=np.float32)
+    tensor_query = '/data/plugin/projector/tensor?run=%s&name=var1' % run
+    tensor_bytes = self._Get(tensor_query).data
+    self._AssertTensorResponse(tensor_bytes, expected_tensor)
+
+  def testEndpointsMetadataForVariableAssetsButNoCheckpoint(self):
+    g = ops.Graph()
+    with g.as_default():
+      manager = plugin_asset.get_plugin_asset(
+          projector_plugin.ProjectorPluginAsset)
+
+    metadata = projector_plugin.EmbeddingMetadata(3)
+    metadata.add_column('labels', ['a', 'b', 'c'])
+    manager.add_metadata_for_embedding_variable('test', metadata)
+
+    fw = writer.FileWriter(self.log_dir, graph=g)
+    fw.close()
+
+    self._SetupWSGIApp()
+    run_json = self._GetJson('/data/plugin/projector/runs')
+    self.assertEqual(run_json, [])
+
+  def testEndpointsTensorAndMetadataAssets(self):
+    g = ops.Graph()
+    with g.as_default():
+      manager = plugin_asset.get_plugin_asset(
+          projector_plugin.ProjectorPluginAsset)
+
+    metadata = projector_plugin.EmbeddingMetadata(3)
+    metadata.add_column('labels', ['a', 'b', 'c'])
+    manager.add_metadata_for_embedding_variable('test', metadata)
+    expected_tensor = np.array([[1, 2], [3, 4], [5, 6]])
+    image1 = np.array([[[1, 2, 3], [4, 5, 6]],
+                       [[7, 8, 9], [10, 11, 12]]])
+    image2 = np.array([[[10, 20, 30], [40, 50, 60]],
+                       [[70, 80, 90], [100, 110, 120]]])
+    manager.add_embedding('emb', expected_tensor, metadata, [image1, image2],
+                          [2, 2])
+
+    fw = writer.FileWriter(self.log_dir, graph=g)
+    fw.close()
+
+    self._SetupWSGIApp()
+    run_json = self._GetJson('/data/plugin/projector/runs')
+    self.assertTrue(run_json)
+
+    run = run_json[0]
+    metadata_query = '/data/plugin/projector/metadata?run=%s&name=emb' % run
+    metadata_tsv = self._Get(metadata_query).data
+    self.assertEqual(metadata_tsv, b'a\nb\nc\n')
+
+    unk_metadata_query = '/data/plugin/projector/metadata?run=%s&name=q' % run
+    response = self._Get(unk_metadata_query)
+    self.assertEqual(response.status_code, 400)
+
+    tensor_query = '/data/plugin/projector/tensor?run=%s&name=emb' % run
+    tensor_bytes = self._Get(tensor_query).data
+    self._AssertTensorResponse(tensor_bytes, expected_tensor)
+
+    unk_tensor_query = '/data/plugin/projector/tensor?run=%s&name=var1' % run
+    response = self._Get(unk_tensor_query)
+    self.assertEqual(response.status_code, 400)
+
+    image_query = '/data/plugin/projector/sprite_image?run=%s&name=emb' % run
+    image_bytes = self._Get(image_query).data
+    with ops.Graph().as_default():
+      s = session.Session()
+      image_array = image_ops.decode_png(image_bytes).eval(session=s).tolist()
+    expected_sprite_image = [
+        [[1, 2, 3], [4, 5, 6], [10, 20, 30], [40, 50, 60]],
+        [[7, 8, 9], [10, 11, 12], [70, 80, 90], [100, 110, 120]],
+        [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
+        [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
+    ]
+    self.assertEqual(image_array, expected_sprite_image)
+
+  def testSpriteImageRequestMissingRunAndName(self):
+    self._SetupWSGIApp()
+    q = '/data/plugin/projector/sprite_image'
+    response = self._Get(q)
+    self.assertEqual(response.status_code, 400)
+
+  def testSpriteImageRequestMissingName(self):
+    self._SetupWSGIApp()
+    q = '/data/plugin/projector/sprite_image?run=.'
+    response = self._Get(q)
+    self.assertEqual(response.status_code, 400)
+
+  def testSpriteImageRequestMissingRun(self):
+    self._SetupWSGIApp()
+    q = '/data/plugin/projector/sprite_image?name=emb'
+    response = self._Get(q)
+    self.assertEqual(response.status_code, 400)
+
+  def testSpriteImageUnknownRun(self):
+    self._GenerateProjectorTestData()
+    g = ops.Graph()
+    with g.as_default():
+      manager = plugin_asset.get_plugin_asset(
+          projector_plugin.ProjectorPluginAsset)
+    image1 = np.array([[[1, 2, 3], [4, 5, 6]],
+                       [[7, 8, 9], [10, 11, 12]]])
+    image2 = np.array([[[10, 20, 30], [40, 50, 60]],
+                       [[70, 80, 90], [100, 110, 120]]])
+    manager.add_metadata_for_embedding_variable('var1',
+                                                thumbnails=[image1, image2],
+                                                thumbnail_dim=[2, 2])
+    fw = writer.FileWriter(self.log_dir, graph=g)
+    fw.close()
+    self._SetupWSGIApp()
+
+    q = '/data/plugin/projector/sprite_image?run=unknown&name=var1'
+    response = self._Get(q)
+    self.assertEqual(response.status_code, 400)
+
+  def testSpriteImageUnknownName(self):
+    self._GenerateProjectorTestData()
+    g = ops.Graph()
+    with g.as_default():
+      manager = plugin_asset.get_plugin_asset(
+          projector_plugin.ProjectorPluginAsset)
+    image1 = np.array([[[1, 2, 3], [4, 5, 6]],
+                       [[7, 8, 9], [10, 11, 12]]])
+    image2 = np.array([[[10, 20, 30], [40, 50, 60]],
+                       [[70, 80, 90], [100, 110, 120]]])
+    manager.add_metadata_for_embedding_variable('var1',
+                                                thumbnails=[image1, image2],
+                                                thumbnail_dim=[2, 2])
+    fw = writer.FileWriter(self.log_dir, graph=g)
+    fw.close()
+    self._SetupWSGIApp()
+    q = '/data/plugin/projector/sprite_image?run=.&name=unknown'
+    response = self._Get(q)
+    self.assertEqual(response.status_code, 400)
+
+  def testEndpointsComboTensorAssetsAndCheckpoint(self):
+    self._GenerateProjectorTestData()
+    g = ops.Graph()
+    with g.as_default():
+      manager = plugin_asset.get_plugin_asset(
+          projector_plugin.ProjectorPluginAsset)
+
+    metadata = projector_plugin.EmbeddingMetadata(3)
+    metadata.add_column('labels', ['a', 'b', 'c'])
+    manager.add_metadata_for_embedding_variable('var1', metadata)
+
+    new_tensor_values = np.array([[1, 2], [3, 4], [5, 6]])
+    manager.add_embedding('new_tensor', new_tensor_values)
+
+    fw = writer.FileWriter(self.log_dir, graph=g)
+    fw.close()
+
+    self._SetupWSGIApp()
+    run_json = self._GetJson('/data/plugin/projector/runs')
+    self.assertTrue(run_json)
+
+    run = run_json[0]
+    var1_values = np.array([[6, 6]], dtype=np.float32)
+    var1_tensor_query = '/data/plugin/projector/tensor?run=%s&name=var1' % run
+    tensor_bytes = self._Get(var1_tensor_query).data
+    self._AssertTensorResponse(tensor_bytes, var1_values)
+
+    metadata_query = '/data/plugin/projector/metadata?run=%s&name=var1' % run
+    metadata_tsv = self._Get(metadata_query).data
+    self.assertEqual(metadata_tsv, b'a\nb\nc\n')
+
+    tensor_query = '/data/plugin/projector/tensor?run=%s&name=new_tensor' % run
+    tensor_bytes = self._Get(tensor_query).data
+    self._AssertTensorResponse(tensor_bytes, new_tensor_values)
+
+  def _AssertTensorResponse(self, tensor_bytes, expected_tensor):
+    tensor = np.reshape(np.fromstring(tensor_bytes, dtype=np.float32),
+                        expected_tensor.shape)
     self.assertTrue(np.array_equal(tensor, expected_tensor))
 
+  def testPluginIsActive(self):
+    self._GenerateProjectorTestData()
+    self._SetupWSGIApp()
+
+    # Embedding data is available.
+    self.assertTrue(self.plugin.is_active())
+
+  def testPluginIsNotActive(self):
+    self._SetupWSGIApp()
+
+    # Embedding data is not available.
+    self.assertFalse(self.plugin.is_active())
+
   def _SetupWSGIApp(self):
     multiplexer = event_multiplexer.EventMultiplexer(
         size_guidance=application.DEFAULT_SIZE_GUIDANCE,
         purge_orphaned_data=True)
-    plugin = projector_plugin.ProjectorPlugin()
-    plugin.get_plugin_apps(multiplexer, self.log_dir)
-    plugins = {'projector': plugin}
+    self.plugin = projector_plugin.ProjectorPlugin()
     wsgi_app = application.TensorBoardWSGIApp(
-        self.log_dir, plugins, multiplexer, reload_interval=0)
+        self.log_dir, [self.plugin], multiplexer, reload_interval=0)
     self.server = werkzeug_test.Client(wsgi_app, wrappers.BaseResponse)
 
   def _Get(self, path):
@@ -120,12 +424,28 @@ class ProjectorAppTest(test.TestCase):
       data = gzip.GzipFile('', 'rb', 9, io.BytesIO(data)).read()
     return json.loads(data.decode('utf-8'))
 
+  def _GenerateEventsData(self):
+    fw = writer.FileWriter(self.log_dir)
+    event = event_pb2.Event(
+        wall_time=1,
+        step=1,
+        summary=summary_pb2.Summary(
+            value=[summary_pb2.Summary.Value(
+                tag='s1', simple_value=0)]))
+    fw.add_event(event)
+    fw.close()
+
   def _GenerateProjectorTestData(self):
     config_path = os.path.join(self.log_dir, 'projector_config.pbtxt')
     config = projector_config_pb2.ProjectorConfig()
     embedding = config.embeddings.add()
     # Add an embedding by its canonical tensor name.
     embedding.tensor_name = 'var1:0'
+
+    with gfile.GFile(os.path.join(self.log_dir, 'bookmarks.json'), 'w') as f:
+      f.write('{"a": "b"}')
+    embedding.bookmarks_path = 'bookmarks.json'
+
     config_pbtxt = text_format.MessageToString(config)
     with gfile.GFile(config_path, 'w') as f:
       f.write(config_pbtxt)
@@ -344,6 +664,30 @@ class ProjectorPluginAssetTest(test.TestCase):
           'test', np.array([[1], [2], [3]]), thumbnails=thumbnails,
           thumbnail_dim=[4])
 
+  def testAddEmbeddingThumbnailListHasNoEntries(self):
+    manager = plugin_asset.get_plugin_asset(
+        projector_plugin.ProjectorPluginAsset)
+
+    with self.assertRaises(ValueError):
+      manager.add_embedding('test', np.array([[1]]), thumbnails=[],
+                            thumbnail_dim=[1, 1])
+
+  def testAddEmbeddingThumbnailListNotOfRank4(self):
+    manager = plugin_asset.get_plugin_asset(
+        projector_plugin.ProjectorPluginAsset)
+
+    with self.assertRaises(ValueError):
+      manager.add_embedding('test2', np.array([[1]]),
+                            thumbnails=np.array([[1]]), thumbnail_dim=[1, 1])
+
+  def testAddEmbeddingThumbnailListEntriesNot3DTensors(self):
+    manager = plugin_asset.get_plugin_asset(
+        projector_plugin.ProjectorPluginAsset)
+
+    with self.assertRaises(ValueError):
+      manager.add_embedding('test3', np.array([[1]]), thumbnails=[[1, 2, 3]],
+                            thumbnail_dim=[1, 1])
+
   def testAddEmbeddingWithMetadataOfIncorrectLength(self):
     manager = plugin_asset.get_plugin_asset(
         projector_plugin.ProjectorPluginAsset)
@@ -394,8 +738,8 @@ class ProjectorPluginAssetTest(test.TestCase):
 
     with ops.Graph().as_default() as g:
       plugin_asset.get_plugin_asset(projector_plugin.ProjectorPluginAsset)
-      fw = writer.FileWriter(logdir)
-      fw.add_graph(g)
+      fw = writer.FileWriter(logdir, graph=g)
+      fw.close()
 
     with gfile.Open(os.path.join(plugin_dir, 'projector_config.pbtxt')) as f:
       content = f.read()
@@ -407,12 +751,50 @@ class ProjectorPluginAssetTest(test.TestCase):
                               projector_plugin.ProjectorPluginAsset.plugin_name)
 
     with ops.Graph().as_default() as g:
-      fw = writer.FileWriter(logdir)
-      fw.add_graph(g)
+      fw = writer.FileWriter(logdir, graph=g)
+      fw.close()
 
     self.assertFalse(
         gfile.Exists(plugin_dir),
         'The projector plugin directory should not exist.')
 
+
+class LRUCacheTest(test.TestCase):
+
+  def testInvalidSize(self):
+    with self.assertRaises(ValueError):
+      projector_plugin.LRUCache(0)
+
+  def testSimpleGetAndSet(self):
+    cache = projector_plugin.LRUCache(1)
+    value = cache.get('a')
+    self.assertIsNone(value)
+    cache.set('a', 10)
+    self.assertEqual(cache.get('a'), 10)
+
+  def testErrorsWhenSettingNoneAsValue(self):
+    cache = projector_plugin.LRUCache(1)
+    with self.assertRaises(ValueError):
+      cache.set('a', None)
+
+  def testLRUReplacementPolicy(self):
+    cache = projector_plugin.LRUCache(2)
+    cache.set('a', 1)
+    cache.set('b', 2)
+    cache.set('c', 3)
+    self.assertIsNone(cache.get('a'))
+    self.assertEqual(cache.get('b'), 2)
+    self.assertEqual(cache.get('c'), 3)
+
+    # Make 'b' the most recently used.
+    cache.get('b')
+    cache.set('d', 4)
+
+    # Make sure 'c' got replaced with 'd'.
+    self.assertIsNone(cache.get('c'))
+    self.assertEqual(cache.get('b'), 2)
+    self.assertEqual(cache.get('d'), 4)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/tensorboard/plugins/text/BUILD b/tensorflow/tensorboard/plugins/text/BUILD
index 3e1455ea5a4848e932a4f8f3e6d25695955c7d14..f6ed41375f78c925c52667546e67d56c2ee0e28a 100644
--- a/tensorflow/tensorboard/plugins/text/BUILD
+++ b/tensorflow/tensorboard/plugins/text/BUILD
@@ -24,6 +24,7 @@ py_library(
         "@org_mozilla_bleach",
         "@org_pocoo_werkzeug//:werkzeug",
         "@org_pythonhosted_markdown",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/tensorboard/plugins/text/text_plugin.py b/tensorflow/tensorboard/plugins/text/text_plugin.py
index a87949877bf35da6d142074d56229644bea8b6e7..280f77a2ae566fc7f5f9b0804af5c610b8440305 100644
--- a/tensorflow/tensorboard/plugins/text/text_plugin.py
+++ b/tensorflow/tensorboard/plugins/text/text_plugin.py
@@ -19,20 +19,28 @@ from __future__ import division
 from __future__ import print_function
 
 import json
+import textwrap
+
+# pylint: disable=g-bad-import-order
+# Necessary for an internal test with special behavior for numpy.
+import numpy as np
+# pylint: enable=g-bad-import-order
 
 import bleach
 # pylint: disable=g-bad-import-order
 # Google-only: import markdown_freewisdom
 import markdown
+import six
 # pylint: enable=g-bad-import-order
 from werkzeug import wrappers
 
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.summary import text_summary
 from tensorflow.tensorboard.backend import http_util
 from tensorflow.tensorboard.plugins import base_plugin
 
 # The prefix of routes provided by this plugin.
-PLUGIN_PREFIX_ROUTE = 'text'
+_PLUGIN_PREFIX_ROUTE = 'text'
 
 # HTTP routes
 RUNS_ROUTE = '/runs'
@@ -68,6 +76,10 @@ ALLOWED_TAGS = [
 
 ALLOWED_ATTRIBUTES = {'a': ['href', 'title'], 'img': ['src', 'title', 'alt']}
 
+WARNING_TEMPLATE = textwrap.dedent("""\
+  **Warning:** This text summary contained data of dimensionality %d, but only \
+  2d tables are supported. Showing a 2d slice of the data instead.""")
+
 
 def markdown_and_sanitize(markdown_string):
   """Takes a markdown string and converts it into sanitized html.
@@ -84,9 +96,10 @@ def markdown_and_sanitize(markdown_string):
   Returns:
     a string containing sanitized html for input markdown
   """
-  # Convert to utf-8 because we get a bytearray in python3
-  if not isinstance(markdown_string, str):
+  # Convert to utf-8 whenever we have a binary input.
+  if isinstance(markdown_string, six.binary_type):
     markdown_string = markdown_string.decode('utf-8')
+
   string_html = markdown.markdown(
       markdown_string, extensions=['markdown.extensions.tables'])
   string_sanitized = bleach.clean(
@@ -94,18 +107,153 @@ def markdown_and_sanitize(markdown_string):
   return string_sanitized
 
 
+def make_table_row(contents, tag='td'):
+  """Given an iterable of string contents, make a table row.
+
+  Args:
+    contents: An iterable yielding strings.
+    tag: The tag to place contents in. Defaults to 'td', you might want 'th'.
+
+  Returns:
+    A string containing the content strings, organized into a table row.
+
+  Example: make_table_row(['one', 'two', 'three']) == '''
+  <tr>
+  <td>one</td>
+  <td>two</td>
+  <td>three</td>
+  </tr>'''
+  """
+  columns = ('<%s>%s</%s>\n' % (tag, s, tag) for s in contents)
+  return '<tr>\n' + ''.join(columns) + '</tr>\n'
+
+
+def make_table(contents, headers=None):
+  """Given a numpy ndarray of strings, concatenate them into a html table.
+
+  Args:
+    contents: A np.ndarray of strings. May be 1d or 2d. In the 1d case, the
+      table is laid out vertically (i.e. row-major).
+    headers: A np.ndarray or list of string header names for the table.
+
+  Returns:
+    A string containing all of the content strings, organized into a table.
+
+  Raises:
+    ValueError: If contents is not a np.ndarray.
+    ValueError: If contents is not 1d or 2d.
+    ValueError: If contents is empty.
+    ValueError: If headers is present and not a list, tuple, or ndarray.
+    ValueError: If headers is not 1d.
+    ValueError: If number of elements in headers does not correspond to number
+      of columns in contents.
+  """
+  if not isinstance(contents, np.ndarray):
+    raise ValueError('make_table contents must be a numpy ndarray')
+
+  if contents.ndim not in [1, 2]:
+    raise ValueError('make_table requires a 1d or 2d numpy array, was %dd' %
+                     contents.ndim)
+
+  if headers:
+    if isinstance(headers, list) or isinstance(headers, tuple):
+      headers = np.array(headers)
+    if not isinstance(headers, np.ndarray):
+      raise ValueError('Could not convert headers %s into np.ndarray' % headers)
+    if headers.ndim != 1:
+      raise ValueError('Headers must be 1d, is %dd' % headers.ndim)
+    expected_n_columns = contents.shape[1] if contents.ndim == 2 else 1
+    if headers.shape[0] != expected_n_columns:
+      raise ValueError('Number of headers %d must match number of columns %d' %
+                       (headers.shape[0], expected_n_columns))
+    header = '<thead>\n%s</thead>\n' % make_table_row(headers, tag='th')
+  else:
+    header = ''
+
+  n_rows = contents.shape[0]
+  if contents.ndim == 1:
+    # If it's a vector, we need to wrap each element in a new list, otherwise
+    # we would turn the string itself into a row (see test code)
+    rows = (make_table_row([contents[i]]) for i in range(n_rows))
+  else:
+    rows = (make_table_row(contents[i, :]) for i in range(n_rows))
+
+  return '<table>\n%s<tbody>\n%s</tbody>\n</table>' % (header, ''.join(rows))
+
+
+def reduce_to_2d(arr):
+  """Given a np.npdarray with nDims > 2, reduce it to 2d.
+
+  It does this by selecting the zeroth coordinate for every dimension greater
+  than two.
+
+  Args:
+    arr: a numpy ndarray of dimension at least 2.
+
+  Returns:
+    A two-dimensional subarray from the input array.
+
+  Raises:
+    ValueError: If the argument is not a numpy ndarray, or the dimensionality
+      is too low.
+  """
+  if not isinstance(arr, np.ndarray):
+    raise ValueError('reduce_to_2d requires a numpy.ndarray')
+
+  ndims = len(arr.shape)
+  if ndims < 2:
+    raise ValueError('reduce_to_2d requires an array of dimensionality >=2')
+  # slice(None) is equivalent to `:`, so we take arr[0,0,...0,:,:]
+  slices = ([0] * (ndims - 2)) + [slice(None), slice(None)]
+  return arr[slices]
+
+
+def text_array_to_html(text_arr):
+  """Take a numpy.ndarray containing strings, and convert it into html.
+
+  If the ndarray contains a single scalar string, that string is converted to
+  html via our sanitized markdown parser. If it contains an array of strings,
+  the strings are individually converted to html and then composed into a table
+  using make_table. If the array contains dimensionality greater than 2,
+  all but two of the dimensions are removed, and a warning message is prefixed
+  to the table.
+
+  Args:
+    text_arr: A numpy.ndarray containing strings.
+
+  Returns:
+    The array converted to html.
+  """
+  if not text_arr.shape:
+    # It is a scalar. No need to put it in a table, just apply markdown
+    return markdown_and_sanitize(text_arr.astype(np.dtype(str)).tostring())
+  warning = ''
+  if len(text_arr.shape) > 2:
+    warning = markdown_and_sanitize(WARNING_TEMPLATE % len(text_arr.shape))
+    text_arr = reduce_to_2d(text_arr)
+
+  html_arr = [markdown_and_sanitize(x) for x in text_arr.reshape(-1)]
+  html_arr = np.array(html_arr).reshape(text_arr.shape)
+
+  return warning + make_table(html_arr)
+
+
 def process_string_tensor_event(event):
   """Convert a TensorEvent into a JSON-compatible response."""
+  string_arr = tensor_util.MakeNdarray(event.tensor_proto)
+  html = text_array_to_html(string_arr)
   return {
       'wall_time': event.wall_time,
       'step': event.step,
-      'text': markdown_and_sanitize(event.tensor_proto.string_val[0]),
+      'text': html,
   }
 
 
 class TextPlugin(base_plugin.TBPlugin):
   """Text Plugin for TensorBoard."""
 
+  plugin_name = _PLUGIN_PREFIX_ROUTE
+
   def index_impl(self):
     run_to_series = {}
     name = text_summary.TextSummaryPluginAsset.plugin_name
@@ -145,3 +293,13 @@ class TextPlugin(base_plugin.TBPlugin):
         RUNS_ROUTE: self.runs_route,
         TEXT_ROUTE: self.text_route,
     }
+
+  def is_active(self):
+    """Determines whether this plugin is active.
+
+    This plugin is only active if TensorBoard sampled any text summaries.
+
+    Returns:
+      Whether this plugin is active.
+    """
+    return bool(self.index_impl())
diff --git a/tensorflow/tensorboard/plugins/text/text_plugin_test.py b/tensorflow/tensorboard/plugins/text/text_plugin_test.py
index f1d06aa4f176aca020e5ffe91484ab174486a545..a7f0235889953e595a7ff44ff99fb2e62fe16c93 100644
--- a/tensorflow/tensorboard/plugins/text/text_plugin_test.py
+++ b/tensorflow/tensorboard/plugins/text/text_plugin_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +21,7 @@ from __future__ import print_function
 
 import os
 import textwrap
+import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
@@ -31,7 +33,7 @@ from tensorflow.python.summary import text_summary
 from tensorflow.tensorboard.backend.event_processing import event_multiplexer
 from tensorflow.tensorboard.plugins.text import text_plugin
 
-GEMS = ["garnet", "amethyst", "pearl", "steven"]
+GEMS = ['garnet', 'amethyst', 'pearl', 'steven']
 
 
 class TextPluginTest(test.TestCase):
@@ -52,10 +54,12 @@ class TextPluginTest(test.TestCase):
   def generate_testdata(self):
     ops.reset_default_graph()
     sess = session.Session()
-    placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    summary_tensor = text_summary.text_summary("message", placeholder)
+    placeholder = array_ops.placeholder(dtypes.string)
+    summary_tensor = text_summary.text_summary('message', placeholder)
 
-    run_names = ["fry", "leela"]
+    vector_summary = text_summary.text_summary('vector', placeholder)
+
+    run_names = ['fry', 'leela']
     for run_name in run_names:
       subdir = os.path.join(self.logdir, run_name)
       writer = summary.FileWriter(subdir)
@@ -63,38 +67,62 @@ class TextPluginTest(test.TestCase):
 
       step = 0
       for gem in GEMS:
-        message = run_name + " *loves* " + gem
+        message = run_name + ' *loves* ' + gem
         feed_dict = {placeholder: message}
         summ = sess.run(summary_tensor, feed_dict=feed_dict)
         writer.add_summary(summ, global_step=step)
         step += 1
+
+      vector_message = ['one', 'two', 'three', 'four']
+      summ = sess.run(vector_summary, feed_dict={placeholder: vector_message})
+      writer.add_summary(summ)
       writer.close()
 
   def testIndex(self):
     index = self.plugin.index_impl()
     self.assertEqual(index, {
-        "fry": ["message"],
-        "leela": ["message"],
+        'fry': ['message', 'vector'],
+        'leela': ['message', 'vector'],
     })
 
   def testText(self):
-    fry = self.plugin.text_impl("fry", "message")
-    leela = self.plugin.text_impl("leela", "message")
+    fry = self.plugin.text_impl('fry', 'message')
+    leela = self.plugin.text_impl('leela', 'message')
     self.assertEqual(len(fry), 4)
     self.assertEqual(len(leela), 4)
     for i in range(4):
-      self.assertEqual(fry[i]["step"], i)
-      self.assertConverted(fry[i]["text"], "fry *loves* " + GEMS[i])
-      self.assertEqual(leela[i]["step"], i)
-      self.assertConverted(leela[i]["text"], "leela *loves* " + GEMS[i])
+      self.assertEqual(fry[i]['step'], i)
+      self.assertConverted(fry[i]['text'], 'fry *loves* ' + GEMS[i])
+      self.assertEqual(leela[i]['step'], i)
+      self.assertConverted(leela[i]['text'], 'leela *loves* ' + GEMS[i])
+
+    table = self.plugin.text_impl('fry', 'vector')[0]['text']
+    self.assertEqual(table,
+                     textwrap.dedent("""\
+      <table>
+      <tbody>
+      <tr>
+      <td><p>one</p></td>
+      </tr>
+      <tr>
+      <td><p>two</p></td>
+      </tr>
+      <tr>
+      <td><p>three</p></td>
+      </tr>
+      <tr>
+      <td><p>four</p></td>
+      </tr>
+      </tbody>
+      </table>"""))
 
   def assertTextConverted(self, actual, expected):
     self.assertEqual(text_plugin.markdown_and_sanitize(actual), expected)
 
   def testMarkdownConversion(self):
-    emphasis = "*Italics1* _Italics2_ **bold1** __bold2__"
-    emphasis_converted = ("<p><em>Italics1</em> <em>Italics2</em> "
-                          "<strong>bold1</strong> <strong>bold2</strong></p>")
+    emphasis = '*Italics1* _Italics2_ **bold1** __bold2__'
+    emphasis_converted = ('<p><em>Italics1</em> <em>Italics2</em> '
+                          '<strong>bold1</strong> <strong>bold2</strong></p>')
 
     self.assertEqual(
         text_plugin.markdown_and_sanitize(emphasis), emphasis_converted)
@@ -117,7 +145,7 @@ class TextPluginTest(test.TestCase):
     self.assertEqual(
         text_plugin.markdown_and_sanitize(md_list), md_list_converted)
 
-    link = "[TensorFlow](http://tensorflow.org)"
+    link = '[TensorFlow](http://tensorflow.org)'
     link_converted = '<p><a href="http://tensorflow.org">TensorFlow</a></p>'
     self.assertEqual(text_plugin.markdown_and_sanitize(link), link_converted)
 
@@ -159,11 +187,228 @@ class TextPluginTest(test.TestCase):
     self.assertEqual(text_plugin.markdown_and_sanitize(dangerous), sanitized)
 
     dangerous = textwrap.dedent("""\
-    hello <a name="n"
-    href="javascript:alert('xss')">*you*</a>""")
-    sanitized = "<p>hello <a><em>you</em></a></p>"
+    hello <a name='n'
+    href='javascript:alert('xss')'>*you*</a>""")
+    sanitized = '<p>hello <a><em>you</em></a></p>'
     self.assertEqual(text_plugin.markdown_and_sanitize(dangerous), sanitized)
 
+  def testTableGeneration(self):
+    array2d = np.array([['one', 'two'], ['three', 'four']])
+    expected_table = textwrap.dedent("""\
+    <table>
+    <tbody>
+    <tr>
+    <td>one</td>
+    <td>two</td>
+    </tr>
+    <tr>
+    <td>three</td>
+    <td>four</td>
+    </tr>
+    </tbody>
+    </table>""")
+    self.assertEqual(text_plugin.make_table(array2d), expected_table)
+
+    expected_table_with_headers = textwrap.dedent("""\
+    <table>
+    <thead>
+    <tr>
+    <th>c1</th>
+    <th>c2</th>
+    </tr>
+    </thead>
+    <tbody>
+    <tr>
+    <td>one</td>
+    <td>two</td>
+    </tr>
+    <tr>
+    <td>three</td>
+    <td>four</td>
+    </tr>
+    </tbody>
+    </table>""")
+
+    actual_with_headers = text_plugin.make_table(array2d, headers=['c1', 'c2'])
+    self.assertEqual(actual_with_headers, expected_table_with_headers)
+
+    array_1d = np.array(['one', 'two', 'three', 'four', 'five'])
+    expected_1d = textwrap.dedent("""\
+    <table>
+    <tbody>
+    <tr>
+    <td>one</td>
+    </tr>
+    <tr>
+    <td>two</td>
+    </tr>
+    <tr>
+    <td>three</td>
+    </tr>
+    <tr>
+    <td>four</td>
+    </tr>
+    <tr>
+    <td>five</td>
+    </tr>
+    </tbody>
+    </table>""")
+    self.assertEqual(text_plugin.make_table(array_1d), expected_1d)
+
+    expected_1d_with_headers = textwrap.dedent("""\
+    <table>
+    <thead>
+    <tr>
+    <th>X</th>
+    </tr>
+    </thead>
+    <tbody>
+    <tr>
+    <td>one</td>
+    </tr>
+    <tr>
+    <td>two</td>
+    </tr>
+    <tr>
+    <td>three</td>
+    </tr>
+    <tr>
+    <td>four</td>
+    </tr>
+    <tr>
+    <td>five</td>
+    </tr>
+    </tbody>
+    </table>""")
+    actual_1d_with_headers = text_plugin.make_table(array_1d, headers=['X'])
+    self.assertEqual(actual_1d_with_headers, expected_1d_with_headers)
+
+  def testMakeTableExceptions(self):
+    # Verify that contents is being type-checked and shape-checked.
+    with self.assertRaises(ValueError):
+      text_plugin.make_table([])
+
+    with self.assertRaises(ValueError):
+      text_plugin.make_table('foo')
+
+    with self.assertRaises(ValueError):
+      invalid_shape = np.full((3, 3, 3), 'nope', dtype=np.dtype('S3'))
+      text_plugin.make_table(invalid_shape)
+
+    # Test headers exceptions in 2d array case.
+    test_array = np.full((3, 3), 'foo', dtype=np.dtype('S3'))
+    with self.assertRaises(ValueError):
+      # Headers is wrong type.
+      text_plugin.make_table(test_array, headers='foo')
+    with self.assertRaises(ValueError):
+      # Too many headers.
+      text_plugin.make_table(test_array, headers=['foo', 'bar', 'zod', 'zoink'])
+    with self.assertRaises(ValueError):
+      # headers is 2d
+      text_plugin.make_table(test_array, headers=test_array)
+
+    # Also make sure the column counting logic works in the 1d array case.
+    test_array = np.array(['foo', 'bar', 'zod'])
+    with self.assertRaises(ValueError):
+      # Too many headers.
+      text_plugin.make_table(test_array, headers=test_array)
+
+  def test_reduce_to_2d(self):
+
+    def make_range_array(dim):
+      """Produce an incrementally increasing multidimensional array.
+
+      Args:
+        dim: the number of dimensions for the array
+
+      Returns:
+        An array of increasing integer elements, with dim dimensions and size
+        two in each dimension.
+
+      Example: rangeArray(2) results in [[0,1],[2,3]].
+      """
+      return np.array(range(2**dim)).reshape([2] * dim)
+
+    for i in range(2, 5):
+      actual = text_plugin.reduce_to_2d(make_range_array(i))
+      expected = make_range_array(2)
+      np.testing.assert_array_equal(actual, expected)
+
+  def test_text_array_to_html(self):
+
+    convert = text_plugin.text_array_to_html
+    scalar = np.array('foo')
+    scalar_expected = '<p>foo</p>'
+    self.assertEqual(convert(scalar), scalar_expected)
+
+    vector = np.array(['foo', 'bar'])
+    vector_expected = textwrap.dedent("""\
+      <table>
+      <tbody>
+      <tr>
+      <td><p>foo</p></td>
+      </tr>
+      <tr>
+      <td><p>bar</p></td>
+      </tr>
+      </tbody>
+      </table>""")
+    self.assertEqual(convert(vector), vector_expected)
+
+    d2 = np.array([['foo', 'bar'], ['zoink', 'zod']])
+    d2_expected = textwrap.dedent("""\
+      <table>
+      <tbody>
+      <tr>
+      <td><p>foo</p></td>
+      <td><p>bar</p></td>
+      </tr>
+      <tr>
+      <td><p>zoink</p></td>
+      <td><p>zod</p></td>
+      </tr>
+      </tbody>
+      </table>""")
+    self.assertEqual(convert(d2), d2_expected)
+
+    d3 = np.array([[['foo', 'bar'], ['zoink', 'zod']], [['FOO', 'BAR'],
+                                                        ['ZOINK', 'ZOD']]])
+
+    warning = text_plugin.markdown_and_sanitize(text_plugin.WARNING_TEMPLATE %
+                                                3)
+    d3_expected = warning + textwrap.dedent("""\
+      <table>
+      <tbody>
+      <tr>
+      <td><p>foo</p></td>
+      <td><p>bar</p></td>
+      </tr>
+      <tr>
+      <td><p>zoink</p></td>
+      <td><p>zod</p></td>
+      </tr>
+      </tbody>
+      </table>""")
+    self.assertEqual(convert(d3), d3_expected)
+
+  def testPluginIsActive(self):
+    plugin = text_plugin.TextPlugin()
+    multiplexer = event_multiplexer.EventMultiplexer()
+    plugin.get_plugin_apps(event_multiplexer.EventMultiplexer(), None)
+
+    # The plugin is inactive because text summaries are not available.
+    self.assertFalse(plugin.is_active())
+
+    multiplexer.AddRunsFromDirectory(self.logdir)
+    multiplexer.Reload()
+
+    # The plugin is active because text summaries are available.
+    self.assertTrue(self.plugin.is_active())
+
+  def testUnicode(self):
+    self.assertConverted(u'<p>Iñtërnâtiônàlizætiøn⚡💩</p>',
+                         'Iñtërnâtiônàlizætiøn⚡💩')
+
 
-if __name__ == "__main__":
+if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/tensorboard/scripts/generate_testdata.py b/tensorflow/tensorboard/scripts/generate_testdata.py
index f89ab690ba3360827a32d48ac328f3254da7f38c..e8fb9cd6d4bb26c46c44506a7b2a61a3453121f3 100644
--- a/tensorflow/tensorboard/scripts/generate_testdata.py
+++ b/tensorflow/tensorboard/scripts/generate_testdata.py
@@ -38,7 +38,7 @@ from tensorflow.python.platform import app
 from tensorflow.python.platform import flags
 from tensorflow.python.summary.writer import writer as writer_lib
 
-tf.flags.DEFINE_string("target", None, """The directoy where serialized data
+tf.flags.DEFINE_string("target", None, """The directory where serialized data
 will be written""")
 
 flags.DEFINE_boolean("overwrite", False, """Whether to remove and overwrite
@@ -138,7 +138,7 @@ def WriteAudioSeries(writer, tag, n_audio=1):
   min_frequency_hz = 440
   max_frequency_hz = 880
   sample_rate = 4000
-  duration_frames = sample_rate * 0.5  # 0.5 seconds.
+  duration_frames = sample_rate // 2  # 0.5 seconds.
   frequencies_per_run = 1
   num_channels = 2
 
diff --git a/tensorflow/tensorboard/tensorboard.py b/tensorflow/tensorboard/tensorboard.py
index f3900d1e5dfe582e7a5d0e7655d92a8eae476f01..f371a01f35ba569c1c14f75cc3812fb6790f2668 100644
--- a/tensorflow/tensorboard/tensorboard.py
+++ b/tensorflow/tensorboard/tensorboard.py
@@ -32,7 +32,8 @@ from tensorflow.python.platform import flags
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.tensorboard.backend import application
 from tensorflow.tensorboard.backend.event_processing import event_file_inspector as efi
-
+from tensorflow.tensorboard.plugins.projector import projector_plugin
+from tensorflow.tensorboard.plugins.text import text_plugin
 
 # TensorBoard flags
 
@@ -88,8 +89,18 @@ flags.DEFINE_string(
 FLAGS = flags.FLAGS
 
 
-def create_tb_app():
-  """Read the flags, and create a TensorBoard WSGI application."""
+def create_tb_app(plugins):
+  """Read the flags, and create a TensorBoard WSGI application.
+
+  Args:
+    plugins: A list of plugins for TensorBoard to initialize.
+
+  Raises:
+    ValueError: if a logdir is not specified.
+
+  Returns:
+    A new TensorBoard WSGI application.
+  """
   if not FLAGS.logdir:
     raise ValueError('A logdir must be specified. Run `tensorboard --help` for '
                      'details and examples.')
@@ -98,7 +109,8 @@ def create_tb_app():
   return application.standard_tensorboard_wsgi(
       logdir=logdir,
       purge_orphaned_data=FLAGS.purge_orphaned_data,
-      reload_interval=FLAGS.reload_interval)
+      reload_interval=FLAGS.reload_interval,
+      plugins=plugins)
 
 
 def make_simple_server(tb_app, host, port):
@@ -184,7 +196,11 @@ def main(unused_argv=None):
     efi.inspect(FLAGS.logdir, event_file, FLAGS.tag)
     return 0
   else:
-    tb = create_tb_app()
+    plugins = [
+        projector_plugin.ProjectorPlugin(),
+        text_plugin.TextPlugin(),
+    ]
+    tb = create_tb_app(plugins)
     run_simple_server(tb)
 
 if __name__ == '__main__':
diff --git a/tensorflow/tensorboard/tsconfig.json b/tensorflow/tensorboard/tsconfig.json
deleted file mode 100644
index ac69c30533f8dca44b14782f2837449fdcf82f23..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/tsconfig.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "compilerOptions": {
-    "noImplicitAny": false,
-    "noEmitOnError": true,
-    "target": "ES5",
-    "module": "commonjs"
-  },
-  "compileOnSave": false,
-  "exclude": [
-    "node_modules",
-    "typings/main.d.ts",
-    "typings/main",
-    "lib",
-    "components/**/deps.d.ts"
-  ]
-}
diff --git a/tensorflow/tensorboard/vulcanize.bzl b/tensorflow/tensorboard/vulcanize.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..f7d88047afca8556642fa24bed710c79a1285fd3
--- /dev/null
+++ b/tensorflow/tensorboard/vulcanize.bzl
@@ -0,0 +1,100 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("@io_bazel_rules_closure//closure/private:defs.bzl", "unfurl", "long_path")
+
+def _tensorboard_html_binary(ctx):
+  deps = unfurl(ctx.attr.deps, provider="webfiles")
+  manifests = set(order="link")
+  files = set()
+  for dep in deps:
+    manifests += dep.webfiles.manifests
+    files += dep.data_runfiles.files
+
+  # vulcanize
+  ctx.action(
+      inputs=list(manifests + files),
+      outputs=[ctx.outputs.html],
+      executable=ctx.executable._Vulcanize,
+      arguments=([ctx.attr.input_path,
+                  ctx.attr.output_path,
+                  ctx.outputs.html.path] +
+                 [m.path for m in manifests]),
+      progress_message="Vulcanizing %s" % ctx.attr.input_path)
+
+  # webfiles manifest
+  manifest_srcs = [struct(path=ctx.outputs.html.path,
+                          longpath=long_path(ctx, ctx.outputs.html),
+                          webpath=ctx.attr.output_path)]
+  manifest = ctx.new_file(ctx.configuration.bin_dir,
+                          "%s.pbtxt" % ctx.label.name)
+  ctx.file_action(
+      output=manifest,
+      content=struct(
+          label=str(ctx.label),
+          src=manifest_srcs).to_proto())
+  manifests += [manifest]
+
+  # webfiles server
+  params = struct(
+      label=str(ctx.label),
+      bind="[::]:6006",
+      manifest=[long_path(ctx, man) for man in manifests],
+      external_asset=[struct(webpath=k, path=v)
+                      for k, v in ctx.attr.external_assets.items()])
+  params_file = ctx.new_file(ctx.configuration.bin_dir,
+                             "%s_server_params.pbtxt" % ctx.label.name)
+  ctx.file_action(output=params_file, content=params.to_proto())
+  ctx.file_action(
+      executable=True,
+      output=ctx.outputs.executable,
+      content="#!/bin/sh\nexec %s %s" % (
+          ctx.executable._WebfilesServer.short_path,
+          long_path(ctx, params_file)))
+
+  transitive_runfiles = set()
+  transitive_runfiles += ctx.attr._WebfilesServer.data_runfiles.files
+  for dep in deps:
+    transitive_runfiles += dep.data_runfiles.files
+  return struct(
+      files=set([ctx.outputs.html]),
+      runfiles=ctx.runfiles(
+          files=ctx.files.data + [manifest,
+                                  params_file,
+                                  ctx.outputs.html,
+                                  ctx.outputs.executable],
+          transitive_files=transitive_runfiles))
+
+tensorboard_html_binary = rule(
+    implementation=_tensorboard_html_binary,
+    executable=True,
+    attrs={
+        "input_path": attr.string(mandatory=True),
+        "output_path": attr.string(mandatory=True),
+        "data": attr.label_list(cfg="data", allow_files=True),
+        "deps": attr.label_list(providers=["webfiles"], mandatory=True),
+        "external_assets": attr.string_dict(default={"/_/runfiles": "."}),
+        "_Vulcanize": attr.label(
+            default=Label("//tensorflow/tensorboard/java/org/tensorflow/tensorboard/vulcanize:Vulcanize"),
+            executable=True,
+            cfg="host"),
+        "_WebfilesServer": attr.label(
+            default=Label(
+                "@io_bazel_rules_closure//java/io/bazel/rules/closure/webfiles/server:WebfilesServer"),
+            executable=True,
+            cfg="host"),
+    },
+    outputs={
+        "html": "%{name}.html",
+    })
diff --git a/tensorflow/tensorboard/wct.conf.json b/tensorflow/tensorboard/wct.conf.json
deleted file mode 100644
index 519218ce41804992385eca93fe55e26cb2c34d4e..0000000000000000000000000000000000000000
--- a/tensorflow/tensorboard/wct.conf.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "suites": [
-    "components/tf_*/test",
-    "components/vz_*/test"
-  ],
-  "plugins": ["local"]
-}
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index c8254f0062b4276720b4f897431e5d4716274f7a..348745f8d2bb0c40f9c1e9c3d7630b463257d66c 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1,135 +1,153 @@
 # -*- Python -*-
 
+
 # Given a source file, generate a test name.
 # i.e. "common_runtime/direct_session_test.cc" becomes
 #      "common_runtime_direct_session_test"
 def src_to_test_name(src):
   return src.replace("/", "_").split(".")[0]
 
+
 # Return the options to use for a C++ library or binary build.
 # Uses the ":optmode" config_setting to pick the options.
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "tf_cuda_tests_tags",
     "tf_sycl_tests_tags",
-    "tf_additional_xla_deps_py",
-)
-load(
-    "@local_config_cuda//cuda:build_defs.bzl",
-    "if_cuda",
-    "cuda_default_copts"
-)
+    "tf_additional_xla_deps_py",)
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "cuda_default_copts")
 
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
-)
+    "if_mkl",)
+
 
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
-  return [str(Label("//tensorflow/core:" + p))
-          for p in core_proto_sources_relative]
+  return [
+      "//tensorflow/core:" + p for p in core_proto_sources_relative
+  ]
+
 
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
 def tf_android_core_proto_headers(core_proto_sources_relative):
-  return ([str(Label("//tensorflow/core/" + p.replace(".proto", ".pb.h")))
-          for p in core_proto_sources_relative] +
-         [str(Label("//tensorflow/core/" + p.replace(".proto", ".proto.h")))
-          for p in core_proto_sources_relative])
+  return ([
+      "//tensorflow/core/" + p.replace(".proto", ".pb.h")
+      for p in core_proto_sources_relative
+  ] + [
+      "//tensorflow/core/" + p.replace(".proto", ".proto.h")
+      for p in core_proto_sources_relative
+  ])
+
+
+# Sanitize a dependency so that it works correctly from code that includes
+# TensorFlow as a submodule.
+def clean_dep(dep):
+  return str(Label(dep))
+
 
 def if_android_x86(a):
   return select({
-      str(Label("//tensorflow:android_x86")): a,
-      str(Label("//tensorflow:android_x86_64")): a,
+      clean_dep("//tensorflow:android_x86"): a,
+      clean_dep("//tensorflow:android_x86_64"): a,
       "//conditions:default": [],
   })
 
 
 def if_android_arm(a):
   return select({
-      str(Label("//tensorflow:android_arm")): a,
+      clean_dep("//tensorflow:android_arm"): a,
       "//conditions:default": [],
   })
 
+
 def if_android_arm64(a):
   return select({
-      str(Label("//tensorflow:android_arm64")): a,
+      clean_dep("//tensorflow:android_arm64"): a,
       "//conditions:default": [],
   })
 
+
 def if_not_android(a):
   return select({
-      str(Label("//tensorflow:android")): [],
+      clean_dep("//tensorflow:android"): [],
       "//conditions:default": a,
   })
 
+
 def if_android(a):
   return select({
-      str(Label("//tensorflow:android")): a,
+      clean_dep("//tensorflow:android"): a,
       "//conditions:default": [],
   })
 
+
 def if_ios(a):
   return select({
-      str(Label("//tensorflow:ios")): a,
+      clean_dep("//tensorflow:ios"): a,
       "//conditions:default": [],
   })
 
+
 def if_mobile(a):
   return select({
-      str(Label("//tensorflow:android")): a,
-      str(Label("//tensorflow:ios")): a,
+      clean_dep("//tensorflow:android"): a,
+      clean_dep("//tensorflow:ios"): a,
       "//conditions:default": [],
   })
 
+
 def if_not_mobile(a):
   return select({
-      str(Label("//tensorflow:android")): [],
-      str(Label("//tensorflow:ios")): [],
+      clean_dep("//tensorflow:android"): [],
+      clean_dep("//tensorflow:ios"): [],
       "//conditions:default": a,
   })
 
+
 def if_not_windows(a):
   return select({
-      str(Label("//tensorflow:windows")): [],
+      clean_dep("//tensorflow:windows"): [],
       "//conditions:default": a,
   })
 
+
 def if_x86(a):
   return select({
-      str(Label("//tensorflow:linux_x86_64")): a,
-      str(Label("//tensorflow:windows")): a,
+      clean_dep("//tensorflow:linux_x86_64"): a,
+      clean_dep("//tensorflow:windows"): a,
       "//conditions:default": [],
   })
 
+
 # LINT.IfChange
 def tf_copts():
-  return (["-DEIGEN_AVOID_STL_ARRAY",
-           "-Iexternal/gemmlowp",
-           "-Wno-sign-compare",
-           "-fno-exceptions",] +
-          if_cuda(["-DGOOGLE_CUDA=1"]) +
-          if_mkl(["-DINTEL_MKL=1"]) +
-          if_android_arm(["-mfpu=neon"]) +
-          if_x86(["-msse3"]) +
-          select({
-              str(Label("//tensorflow:android")): [
-                  "-std=c++11",
-                  "-DTF_LEAN_BINARY",
-                  "-O2",
-              ],
-              str(Label("//tensorflow:darwin")): [],
-              str(Label("//tensorflow:windows")): [
-                "/DLANG_CXX11",
-                "/D__VERSION__=\\\"MSVC\\\"",
-                "/DPLATFORM_WINDOWS",
-                "/DTF_COMPILE_LIBRARY",
-                "/DEIGEN_HAS_C99_MATH",
-                "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
-              ],
-              str(Label("//tensorflow:ios")): ["-std=c++11"],
-              "//conditions:default": ["-pthread"]}))
+  return ([
+      "-DEIGEN_AVOID_STL_ARRAY",
+      "-Iexternal/gemmlowp",
+      "-Wno-sign-compare",
+      "-fno-exceptions",
+  ] + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_android_arm(
+      ["-mfpu=neon"]) + if_x86(["-msse3"]) + select({
+          clean_dep("//tensorflow:android"): [
+              "-std=c++11",
+              "-DTF_LEAN_BINARY",
+              "-O2",
+          ],
+          clean_dep("//tensorflow:darwin"): [],
+          clean_dep("//tensorflow:windows"): [
+              "/DLANG_CXX11",
+              "/D__VERSION__=\\\"MSVC\\\"",
+              "/DPLATFORM_WINDOWS",
+              "/DTF_COMPILE_LIBRARY",
+              "/DEIGEN_HAS_C99_MATH",
+              "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
+          ],
+          clean_dep("//tensorflow:ios"): ["-std=c++11"],
+          "//conditions:default": ["-pthread"]
+      }))
+
 
 def tf_opts_nortti_if_android():
   return if_android([
@@ -137,8 +155,11 @@ def tf_opts_nortti_if_android():
       "-DGOOGLE_PROTOBUF_NO_RTTI",
       "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
   ]) + if_android_x86(["-msse4.1"])
+
+
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
+
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
 def tf_gen_op_libs(op_lib_names, deps=None):
@@ -147,16 +168,20 @@ def tf_gen_op_libs(op_lib_names, deps=None):
   if not deps:
     deps = []
   for n in op_lib_names:
-    native.cc_library(name=n + "_op_lib",
-                      copts=tf_copts(),
-                      srcs=["ops/" + n + ".cc"],
-                      deps=deps + [str(Label("//tensorflow/core:framework"))],
-                      visibility=["//visibility:public"],
-                      alwayslink=1,
-                      linkstatic=1,)
-
-def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
-                         op_gen=str(Label("//tensorflow/cc:cc_op_gen_main")),
+    native.cc_library(
+        name=n + "_op_lib",
+        copts=tf_copts(),
+        srcs=["ops/" + n + ".cc"],
+        deps=deps + [clean_dep("//tensorflow/core:framework")],
+        visibility=["//visibility:public"],
+        alwayslink=1,
+        linkstatic=1,)
+
+
+def tf_gen_op_wrapper_cc(name,
+                         out_ops_file,
+                         pkg="",
+                         op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
                          deps=None,
                          override_file=None,
                          include_internal_ops=0):
@@ -165,12 +190,11 @@ def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
   if deps == None:
     deps = [pkg + ":" + name + "_op_lib"]
   native.cc_binary(
-      name = tool,
-      copts = tf_copts(),
-      linkopts = ["-lm"],
-      linkstatic = 1,   # Faster to link this one-time-use binary dynamically
-      deps = [op_gen] + deps
-  )
+      name=tool,
+      copts=tf_copts(),
+      linkopts=["-lm"],
+      linkstatic=1,  # Faster to link this one-time-use binary dynamically
+      deps=[op_gen] + deps)
 
   if override_file == None:
     srcs = []
@@ -180,14 +204,17 @@ def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
     override_arg = "$(location " + override_file + ")"
   native.genrule(
       name=name + "_genrule",
-      outs=[out_ops_file + ".h", out_ops_file + ".cc",
-            out_ops_file + "_internal.h", out_ops_file + "_internal.cc"],
+      outs=[
+          out_ops_file + ".h", out_ops_file + ".cc",
+          out_ops_file + "_internal.h", out_ops_file + "_internal.cc"
+      ],
       srcs=srcs,
       tools=[":" + tool],
       cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
            "$(location :" + out_ops_file + ".cc) " + override_arg + " " +
            str(include_internal_ops)))
 
+
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate individual C++ .cc and .h
 # files for each of the ops files mentioned, and then generate a
@@ -214,18 +241,18 @@ def tf_gen_op_wrapper_cc(name, out_ops_file, pkg="",
 #            hdrs = [ "ops/array_ops_internal.h",
 #                     "ops/math_ops_internal.h" ],
 #            deps = [ ... ])
-# TODO(josh11b): Cleaner approach for hidden ops.
+# TODO(joshl): Cleaner approach for hidden ops.
 def tf_gen_op_wrappers_cc(name,
                           op_lib_names=[],
                           other_srcs=[],
                           other_hdrs=[],
                           pkg="",
                           deps=[
-                              str(Label("//tensorflow/cc:ops")),
-                              str(Label("//tensorflow/cc:scope")),
-                              str(Label("//tensorflow/cc:const_op")),
+                              clean_dep("//tensorflow/cc:ops"),
+                              clean_dep("//tensorflow/cc:scope"),
+                              clean_dep("//tensorflow/cc:const_op"),
                           ],
-                          op_gen=str(Label("//tensorflow/cc:cc_op_gen_main")),
+                          op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
                           override_file=None,
                           include_internal_ops=0,
                           visibility=None):
@@ -235,59 +262,72 @@ def tf_gen_op_wrappers_cc(name,
   internalhdrs = []
   for n in op_lib_names:
     tf_gen_op_wrapper_cc(
-        n, "ops/" + n, pkg=pkg, op_gen=op_gen, override_file=override_file,
+        n,
+        "ops/" + n,
+        pkg=pkg,
+        op_gen=op_gen,
+        override_file=override_file,
         include_internal_ops=include_internal_ops)
     subsrcs += ["ops/" + n + ".cc"]
     subhdrs += ["ops/" + n + ".h"]
     internalsrcs += ["ops/" + n + "_internal.cc"]
     internalhdrs += ["ops/" + n + "_internal.h"]
 
-  native.cc_library(name=name,
-                    srcs=subsrcs,
-                    hdrs=subhdrs,
-                    deps=deps + if_not_android([
-                        str(Label("//tensorflow/core:core_cpu")),
-                        str(Label("//tensorflow/core:framework")),
-                        str(Label("//tensorflow/core:lib")),
-                        str(Label("//tensorflow/core:protos_all_cc")),
-                    ]) + if_android([
-                        str(Label("//tensorflow/core:android_tensorflow_lib")),
-                    ]),
-                    copts=tf_copts(),
-                    alwayslink=1,
-                    visibility=visibility)
-  native.cc_library(name=name + "_internal",
-                    srcs=internalsrcs,
-                    hdrs=internalhdrs,
-                    deps=deps + if_not_android([
-                        str(Label("//tensorflow/core:core_cpu")),
-                        str(Label("//tensorflow/core:framework")),
-                        str(Label("//tensorflow/core:lib")),
-                        str(Label("//tensorflow/core:protos_all_cc")),
-                    ]) + if_android([
-                        str(Label("//tensorflow/core:android_tensorflow_lib")),
-                    ]),
-                    copts=tf_copts(),
-                    alwayslink=1,
-                    visibility=[str(Label("//tensorflow:internal"))])
+  native.cc_library(
+      name=name,
+      srcs=subsrcs,
+      hdrs=subhdrs,
+      deps=deps + if_not_android([
+          clean_dep("//tensorflow/core:core_cpu"),
+          clean_dep("//tensorflow/core:framework"),
+          clean_dep("//tensorflow/core:lib"),
+          clean_dep("//tensorflow/core:protos_all_cc"),
+      ]) + if_android([
+          clean_dep("//tensorflow/core:android_tensorflow_lib"),
+      ]),
+      copts=tf_copts(),
+      alwayslink=1,
+      visibility=visibility)
+  native.cc_library(
+      name=name + "_internal",
+      srcs=internalsrcs,
+      hdrs=internalhdrs,
+      deps=deps + if_not_android([
+          clean_dep("//tensorflow/core:core_cpu"),
+          clean_dep("//tensorflow/core:framework"),
+          clean_dep("//tensorflow/core:lib"),
+          clean_dep("//tensorflow/core:protos_all_cc"),
+      ]) + if_android([
+          clean_dep("//tensorflow/core:android_tensorflow_lib"),
+      ]),
+      copts=tf_copts(),
+      alwayslink=1,
+      visibility=[clean_dep("//tensorflow:internal")])
+
 
 # Invoke this rule in .../tensorflow/python to build the wrapper library.
-def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[],
-                         require_shape_functions=False, hidden_file=None,
+def tf_gen_op_wrapper_py(name,
+                         out=None,
+                         hidden=None,
+                         visibility=None,
+                         deps=[],
+                         require_shape_functions=False,
+                         hidden_file=None,
                          generated_target_name=None):
   # Construct a cc_binary containing the specified ops.
   tool_name = "gen_" + name + "_py_wrappers_cc"
   if not deps:
     deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
   native.cc_binary(
-      name = tool_name,
-      linkopts = ["-lm"],
-      copts = tf_copts(),
-      linkstatic = 1,   # Faster to link this one-time-use binary dynamically
-      deps = ([str(Label("//tensorflow/core:framework")),
-               str(Label("//tensorflow/python:python_op_gen_main"))] + deps),
-      visibility = [str(Label("//tensorflow:internal"))],
-  )
+      name=tool_name,
+      linkopts=["-lm"],
+      copts=tf_copts(),
+      linkstatic=1,  # Faster to link this one-time-use binary dynamically
+      deps=([
+          clean_dep("//tensorflow/core:framework"),
+          clean_dep("//tensorflow/python:python_op_gen_main")
+      ] + deps),
+      visibility=[clean_dep("//tensorflow:internal")],)
 
   # Invoke the previous cc_binary to generate a python file.
   if not out:
@@ -299,8 +339,8 @@ def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[],
         name=name + "_pygenrule",
         outs=[out],
         tools=[tool_name],
-        cmd=("$(location " + tool_name + ") " + ",".join(hidden)
-             + " " + ("1" if require_shape_functions else "0") + " > $@"))
+        cmd=("$(location " + tool_name + ") " + ",".join(hidden) + " " +
+             ("1" if require_shape_functions else "0") + " > $@"))
   elif hidden_file:
     # `hidden_file` is file containing a list of op names to be hidden in the
     # generated module.
@@ -309,77 +349,120 @@ def tf_gen_op_wrapper_py(name, out=None, hidden=None, visibility=None, deps=[],
         outs=[out],
         srcs=[hidden_file],
         tools=[tool_name],
-        cmd=("$(location " + tool_name + ") @$(location "
-             + hidden_file + ") " + ("1" if require_shape_functions else "0")
-             + " > $@"))
+        cmd=("$(location " + tool_name + ") @$(location " + hidden_file + ") " +
+             ("1" if require_shape_functions else "0") + " > $@"))
   else:
     # No ops should be hidden in the generated module.
     native.genrule(
         name=name + "_pygenrule",
         outs=[out],
         tools=[tool_name],
-        cmd=("$(location " + tool_name + ") "
-             + ("1" if require_shape_functions else "0") + " > $@"))
+        cmd=("$(location " + tool_name + ") " +
+             ("1" if require_shape_functions else "0") + " > $@"))
 
   # Make a py_library out of the generated python file.
   if not generated_target_name:
     generated_target_name = name
-  native.py_library(name=generated_target_name,
-                    srcs=[out],
-                    srcs_version="PY2AND3",
-                    visibility=visibility,
-                    deps=[
-                        str(Label("//tensorflow/python:framework_for_generated_wrappers_v2")),
-                    ],)
+  native.py_library(
+      name=generated_target_name,
+      srcs=[out],
+      srcs_version="PY2AND3",
+      visibility=visibility,
+      deps=[
+          clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
+      ],)
+
 
 # Define a bazel macro that creates cc_test for tensorflow.
 # TODO(opensource): we need to enable this to work around the hidden symbol
 # __cudaRegisterFatBinary error. Need more investigations.
-def tf_cc_test(name, srcs, deps, linkstatic=0, tags=[], data=[], size="medium",
-               suffix="", args=None, linkopts=[]):
-  native.cc_test(name="%s%s" % (name, suffix),
-                 srcs=srcs,
-                 size=size,
-                 args=args,
-                 copts=tf_copts(),
-                 data=data,
-                 deps=deps,
-                 linkopts=["-lpthread", "-lm"] + linkopts,
-                 linkstatic=linkstatic,
-                 tags=tags)
+def tf_cc_test(name,
+               srcs,
+               deps,
+               linkstatic=0,
+               tags=[],
+               data=[],
+               size="medium",
+               suffix="",
+               args=None,
+               linkopts=[]):
+  native.cc_test(
+      name="%s%s" % (name, suffix),
+      srcs=srcs,
+      size=size,
+      args=args,
+      copts=tf_copts(),
+      data=data,
+      deps=deps,
+      linkopts=["-lpthread", "-lm"] + linkopts,
+      linkstatic=linkstatic,
+      tags=tags)
+
 
 # Part of the testing workflow requires a distinguishable name for the build
 # rules that involve a GPU, even if otherwise identical to the base rule.
-def tf_cc_test_gpu(name, srcs, deps, linkstatic=0, tags=[], data=[],
-                   size="medium", suffix="", args=None):
-  tf_cc_test(name, srcs, deps, linkstatic=linkstatic, tags=tags, data=data,
-             size=size, suffix=suffix, args=args)
-
-def tf_cuda_cc_test(name, srcs=[], deps=[], tags=[], data=[], size="medium",
-                    linkstatic=0, args=[], linkopts=[]):
-  tf_cc_test(name=name,
-             srcs=srcs,
-             deps=deps,
-             tags=tags + ["manual"],
-             data=data,
-             size=size,
-             linkstatic=linkstatic,
-             linkopts=linkopts,
-             args=args)
-  tf_cc_test(name=name,
-             srcs=srcs,
-             suffix="_gpu",
-             deps=deps + if_cuda([str(Label("//tensorflow/core:gpu_runtime"))]),
-             linkstatic=if_cuda(1, 0),
-             tags=tags + tf_cuda_tests_tags(),
-             data=data,
-             size=size,
-             linkopts=linkopts,
-             args=args)
+def tf_cc_test_gpu(name,
+                   srcs,
+                   deps,
+                   linkstatic=0,
+                   tags=[],
+                   data=[],
+                   size="medium",
+                   suffix="",
+                   args=None):
+  tf_cc_test(
+      name,
+      srcs,
+      deps,
+      linkstatic=linkstatic,
+      tags=tags,
+      data=data,
+      size=size,
+      suffix=suffix,
+      args=args)
+
+
+def tf_cuda_cc_test(name,
+                    srcs=[],
+                    deps=[],
+                    tags=[],
+                    data=[],
+                    size="medium",
+                    linkstatic=0,
+                    args=[],
+                    linkopts=[]):
+  tf_cc_test(
+      name=name,
+      srcs=srcs,
+      deps=deps,
+      tags=tags + ["manual"],
+      data=data,
+      size=size,
+      linkstatic=linkstatic,
+      linkopts=linkopts,
+      args=args)
+  tf_cc_test(
+      name=name,
+      srcs=srcs,
+      suffix="_gpu",
+      deps=deps + if_cuda([clean_dep("//tensorflow/core:gpu_runtime")]),
+      linkstatic=if_cuda(1, 0),
+      tags=tags + tf_cuda_tests_tags(),
+      data=data,
+      size=size,
+      linkopts=linkopts,
+      args=args)
+
 
 # Create a cc_test for each of the tensorflow tests listed in "tests"
-def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
-                args=None, linkopts=[]):
+def tf_cc_tests(srcs,
+                deps,
+                name="",
+                linkstatic=0,
+                tags=[],
+                size="medium",
+                args=None,
+                linkopts=[]):
   for src in srcs:
     tf_cc_test(
         name=src_to_test_name(src),
@@ -391,17 +474,35 @@ def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
         args=args,
         linkopts=linkopts)
 
-def tf_cc_test_mkl(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
-                    args=None):
+
+def tf_cc_test_mkl(srcs,
+                   deps,
+                   name="",
+                   linkstatic=0,
+                   tags=[],
+                   size="medium",
+                   args=None):
   if_mkl(tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args))
 
-def tf_cc_tests_gpu(srcs, deps, name='', linkstatic=0, tags=[], size="medium",
+
+def tf_cc_tests_gpu(srcs,
+                    deps,
+                    name="",
+                    linkstatic=0,
+                    tags=[],
+                    size="medium",
                     args=None):
   tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)
 
 
-def tf_cuda_cc_tests(srcs, deps, name='', tags=[], size="medium", linkstatic=0,
-                     args=None, linkopts=[]):
+def tf_cuda_cc_tests(srcs,
+                     deps,
+                     name="",
+                     tags=[],
+                     size="medium",
+                     linkstatic=0,
+                     args=None,
+                     linkopts=[]):
   for src in srcs:
     tf_cuda_cc_test(
         name=src_to_test_name(src),
@@ -413,48 +514,52 @@ def tf_cuda_cc_tests(srcs, deps, name='', tags=[], size="medium", linkstatic=0,
         args=args,
         linkopts=linkopts)
 
+
 def _cuda_copts():
-    """Gets the appropriate set of copts for (maybe) CUDA compilation.
+  """Gets the appropriate set of copts for (maybe) CUDA compilation.
 
     If we're doing CUDA compilation, returns copts for our particular CUDA
     compiler.  If we're not doing CUDA compilation, returns an empty list.
 
     """
-    return cuda_default_copts() + select({
-        "//conditions:default": [],
-        "@local_config_cuda//cuda:using_nvcc": (
-            [
-                "-nvcc_options=relaxed-constexpr",
-                "-nvcc_options=ftz=true",
-            ]
-        ),
-        "@local_config_cuda//cuda:using_clang": (
-            [
-                "-fcuda-flush-denormals-to-zero",
-            ]
-        ),
-    })
+  return cuda_default_copts() + select({
+      "//conditions:default": [],
+      "@local_config_cuda//cuda:using_nvcc": ([
+          "-nvcc_options=relaxed-constexpr",
+          "-nvcc_options=ftz=true",
+      ]),
+      "@local_config_cuda//cuda:using_clang": ([
+          "-fcuda-flush-denormals-to-zero",
+      ]),
+  })
+
 
 # Build defs for TensorFlow kernels
 
+
 # When this target is built using --config=cuda, a cc_library is built
 # that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
 # libraries needed by GPU kernels.
-def tf_gpu_kernel_library(srcs, copts=[], cuda_copts=[], deps=[], hdrs=[],
+def tf_gpu_kernel_library(srcs,
+                          copts=[],
+                          cuda_copts=[],
+                          deps=[],
+                          hdrs=[],
                           **kwargs):
   copts = copts + _cuda_copts() + if_cuda(cuda_copts) + tf_copts()
 
   native.cc_library(
-      srcs = srcs,
-      hdrs = hdrs,
-      copts = copts,
-      deps = deps + if_cuda([
-          str(Label("//tensorflow/core:cuda")),
-          str(Label("//tensorflow/core:gpu_lib")),
+      srcs=srcs,
+      hdrs=hdrs,
+      copts=copts,
+      deps=deps + if_cuda([
+          clean_dep("//tensorflow/core:cuda"),
+          clean_dep("//tensorflow/core:gpu_lib"),
       ]),
       alwayslink=1,
       **kwargs)
 
+
 def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
   """Generate a cc_library with a conditional set of CUDA dependencies.
 
@@ -479,15 +584,23 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
     copts = []
 
   native.cc_library(
-      deps = deps + if_cuda(cuda_deps + [
-          str(Label("//tensorflow/core:cuda")),
+      deps=deps + if_cuda(cuda_deps + [
+          clean_dep("//tensorflow/core:cuda"),
           "@local_config_cuda//cuda:cuda_headers"
       ]),
-      copts = copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]),
+      copts=copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1"]),
       **kwargs)
 
-def tf_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None,
-                      deps=None, alwayslink=1, copts=tf_copts(), **kwargs):
+
+def tf_kernel_library(name,
+                      prefix=None,
+                      srcs=None,
+                      gpu_srcs=None,
+                      hdrs=None,
+                      deps=None,
+                      alwayslink=1,
+                      copts=tf_copts(),
+                      **kwargs):
   """A rule to build a TensorFlow OpKernel.
 
   May either specify srcs/hdrs or prefix.  Similar to tf_cuda_library,
@@ -517,43 +630,58 @@ def tf_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None,
     deps = []
 
   if prefix:
-    if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]):
+    if native.glob([prefix + "*.cu.cc"], exclude=["*test*"]):
       if not gpu_srcs:
         gpu_srcs = []
-      gpu_srcs = gpu_srcs + native.glob([prefix + "*.cu.cc", prefix + "*.h"],
-                                        exclude = [prefix + "*test*"])
-    srcs = srcs + native.glob([prefix + "*.cc"],
-                              exclude = [prefix + "*test*", prefix + "*.cu.cc"])
-    hdrs = hdrs + native.glob([prefix + "*.h"], exclude = [prefix + "*test*",
-                                                           prefix + "*.cu.h"])
-
-  cuda_deps = [str(Label("//tensorflow/core:gpu_lib"))]
+      gpu_srcs = gpu_srcs + native.glob(
+          [prefix + "*.cu.cc", prefix + "*.h"], exclude=[prefix + "*test*"])
+    srcs = srcs + native.glob(
+        [prefix + "*.cc"], exclude=[prefix + "*test*", prefix + "*.cu.cc"])
+    hdrs = hdrs + native.glob(
+        [prefix + "*.h"], exclude=[prefix + "*test*", prefix + "*.cu.h"])
+
+  cuda_deps = [clean_dep("//tensorflow/core:gpu_lib")]
   if gpu_srcs:
     for gpu_src in gpu_srcs:
       if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"):
-        fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc".format(gpu_src))
+        fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc".
+             format(gpu_src))
     tf_gpu_kernel_library(
-        name = name + "_gpu",
-        srcs = gpu_srcs,
-        deps = deps,
-        **kwargs)
+        name=name + "_gpu", srcs=gpu_srcs, deps=deps, **kwargs)
     cuda_deps.extend([":" + name + "_gpu"])
   tf_cuda_library(
-      name = name,
-      srcs = srcs,
-      hdrs = hdrs,
-      copts = copts,
-      cuda_deps = cuda_deps,
-      linkstatic = 1,   # Needed since alwayslink is broken in bazel b/27630669
-      alwayslink = alwayslink,
-      deps = deps,
+      name=name,
+      srcs=srcs,
+      hdrs=hdrs,
+      copts=copts,
+      cuda_deps=cuda_deps,
+      linkstatic=1,  # Needed since alwayslink is broken in bazel b/27630669
+      alwayslink=alwayslink,
+      deps=deps,
       **kwargs)
 
-def tf_mkl_kernel_library(name, prefix=None, srcs=None, gpu_srcs=None, hdrs=None,
-                      deps=None, alwayslink=1, copts=tf_copts(), **kwargs):
-  if_mkl(tf_kernel_library(name, prefix=prefix, srcs=srcs, gpu_srcs=gpu_srcs, 
-                                 hdrs=hdrs, deps=deps, alwayslink=alwayslink, 
-                                 copts=copts, **kwargs))
+
+def tf_mkl_kernel_library(name,
+                          prefix=None,
+                          srcs=None,
+                          gpu_srcs=None,
+                          hdrs=None,
+                          deps=None,
+                          alwayslink=1,
+                          copts=tf_copts(),
+                          **kwargs):
+  if_mkl(
+      tf_kernel_library(
+          name,
+          prefix=prefix,
+          srcs=srcs,
+          gpu_srcs=gpu_srcs,
+          hdrs=hdrs,
+          deps=deps,
+          alwayslink=alwayslink,
+          copts=copts,
+          **kwargs))
+
 
 # Bazel rules for building swig files.
 def _py_wrap_cc_impl(ctx):
@@ -570,59 +698,61 @@ def _py_wrap_cc_impl(ctx):
   inputs += ctx.files.toolchain_deps
   swig_include_dirs = set(_get_repository_roots(ctx, inputs))
   swig_include_dirs += sorted([f.dirname for f in ctx.files._swiglib])
-  args = ["-c++",
-          "-python",
-          "-module", module_name,
-          "-o", ctx.outputs.cc_out.path,
-          "-outdir", ctx.outputs.py_out.dirname]
+  args = [
+      "-c++", "-python", "-module", module_name, "-o", ctx.outputs.cc_out.path,
+      "-outdir", ctx.outputs.py_out.dirname
+  ]
   args += ["-l" + f.path for f in ctx.files.swig_includes]
   args += ["-I" + i for i in swig_include_dirs]
   args += [src.path]
-  outputs = [ctx.outputs.cc_out,
-             ctx.outputs.py_out]
-  ctx.action(executable=ctx.executable._swig,
-             arguments=args,
-             inputs=list(inputs),
-             outputs=outputs,
-             mnemonic="PythonSwig",
-             progress_message="SWIGing " + src.path)
+  outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
+  ctx.action(
+      executable=ctx.executable._swig,
+      arguments=args,
+      inputs=list(inputs),
+      outputs=outputs,
+      mnemonic="PythonSwig",
+      progress_message="SWIGing " + src.path)
   return struct(files=set(outputs))
 
+
 _py_wrap_cc = rule(
-    attrs = {
-        "srcs": attr.label_list(
-            mandatory = True,
-            allow_files = True,
-        ),
-        "swig_includes": attr.label_list(
-            cfg = "data",
-            allow_files = True,
-        ),
-        "deps": attr.label_list(
-            allow_files = True,
-            providers = ["cc"],
-        ),
-        "toolchain_deps": attr.label_list(
-            allow_files = True,
-        ),
-        "module_name": attr.string(mandatory = True),
-        "py_module_name": attr.string(mandatory = True),
-        "_swig": attr.label(
-            default = Label("@swig//:swig"),
-            executable = True,
-            cfg = "host",
-        ),
-        "_swiglib": attr.label(
-            default = Label("@swig//:templates"),
-            allow_files = True,
-        ),
+    attrs={
+        "srcs":
+            attr.label_list(
+                mandatory=True,
+                allow_files=True,),
+        "swig_includes":
+            attr.label_list(
+                cfg="data",
+                allow_files=True,),
+        "deps":
+            attr.label_list(
+                allow_files=True,
+                providers=["cc"],),
+        "toolchain_deps":
+            attr.label_list(
+                allow_files=True,),
+        "module_name":
+            attr.string(mandatory=True),
+        "py_module_name":
+            attr.string(mandatory=True),
+        "_swig":
+            attr.label(
+                default=Label("@swig//:swig"),
+                executable=True,
+                cfg="host",),
+        "_swiglib":
+            attr.label(
+                default=Label("@swig//:templates"),
+                allow_files=True,),
     },
-    outputs = {
+    outputs={
         "cc_out": "%{module_name}.cc",
         "py_out": "%{py_module_name}.py",
     },
-    implementation = _py_wrap_cc_impl,
-)
+    implementation=_py_wrap_cc_impl,)
+
 
 def _get_repository_roots(ctx, files):
   """Returns abnormal root directories under which files reside.
@@ -653,6 +783,7 @@ def _get_repository_roots(ctx, files):
       result[root] -= 1
   return [k for v, k in sorted([(v, k) for k, v in result.items()])]
 
+
 # Bazel rule for collecting the header files that a target depends on.
 def _transitive_hdrs_impl(ctx):
   outputs = set()
@@ -660,38 +791,36 @@ def _transitive_hdrs_impl(ctx):
     outputs += dep.cc.transitive_headers
   return struct(files=outputs)
 
+
 _transitive_hdrs = rule(
-    attrs = {
+    attrs={
         "deps": attr.label_list(
-            allow_files = True,
-            providers = ["cc"],
-        ),
+            allow_files=True,
+            providers=["cc"],),
     },
-    implementation = _transitive_hdrs_impl,
-)
+    implementation=_transitive_hdrs_impl,)
+
 
 def transitive_hdrs(name, deps=[], **kwargs):
-  _transitive_hdrs(name=name + "_gather",
-                   deps=deps)
-  native.filegroup(name=name,
-                   srcs=[":" + name + "_gather"])
+  _transitive_hdrs(name=name + "_gather", deps=deps)
+  native.filegroup(name=name, srcs=[":" + name + "_gather"])
+
 
 # Create a header only library that includes all the headers exported by
 # the libraries in deps.
 def cc_header_only_library(name, deps=[], **kwargs):
-  _transitive_hdrs(name=name + "_gather",
-                   deps=deps)
-  native.cc_library(name=name,
-                    hdrs=[":" + name + "_gather"],
-                    **kwargs)
+  _transitive_hdrs(name=name + "_gather", deps=deps)
+  native.cc_library(name=name, hdrs=[":" + name + "_gather"], **kwargs)
+
 
 def tf_custom_op_library_additional_deps():
   return [
       "@protobuf//:protobuf_headers",
-      str(Label("//third_party/eigen3")),
-      str(Label("//tensorflow/core:framework_headers_lib")),
+      clean_dep("//third_party/eigen3"),
+      clean_dep("//tensorflow/core:framework_headers_lib"),
   ]
 
+
 # Traverse the dependency graph along the "deps" attribute of the
 # target and return a struct with one field called 'tf_collected_deps'.
 # tf_collected_deps will be the union of the deps of the current target
@@ -705,14 +834,16 @@ def _collect_deps_aspect_impl(target, ctx):
         alldeps = alldeps | dep.tf_collected_deps
   return struct(tf_collected_deps=alldeps)
 
+
 collect_deps_aspect = aspect(
-    implementation=_collect_deps_aspect_impl,
-    attr_aspects=["deps"])
+    implementation=_collect_deps_aspect_impl, attr_aspects=["deps"])
+
 
 def _dep_label(dep):
   label = dep.label
   return label.package + ":" + label.name
 
+
 # This rule checks that the transitive dependencies of targets listed
 # in the 'deps' attribute don't depend on the targets listed in
 # the 'disallowed_deps' attribute.
@@ -724,62 +855,71 @@ def _check_deps_impl(ctx):
     for dep in input_dep.tf_collected_deps:
       for disallowed_dep in disallowed_deps:
         if dep == disallowed_dep.label:
-          fail(_dep_label(input_dep) + " cannot depend on " +
-               _dep_label(disallowed_dep))
+          fail(
+              _dep_label(input_dep) + " cannot depend on " + _dep_label(
+                  disallowed_dep))
   return struct()
 
+
 check_deps = rule(
     _check_deps_impl,
-    attrs = {
-        "deps": attr.label_list(
-            aspects=[collect_deps_aspect],
-            mandatory = True,
-            allow_files = True
-        ),
-        "disallowed_deps": attr.label_list(
-            mandatory = True,
-            allow_files = True
-        )},
-)
+    attrs={
+        "deps":
+            attr.label_list(
+                aspects=[collect_deps_aspect], mandatory=True,
+                allow_files=True),
+        "disallowed_deps":
+            attr.label_list(mandatory=True, allow_files=True)
+    },)
+
 
 # Helper to build a dynamic library (.so) from the sources containing
 # implementations of custom ops and kernels.
 def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
   cuda_deps = [
-      str(Label("//tensorflow/core:stream_executor_headers_lib")),
+      clean_dep("//tensorflow/core:stream_executor_headers_lib"),
       "@local_config_cuda//cuda:cudart_static",
   ]
   deps = deps + tf_custom_op_library_additional_deps()
   if gpu_srcs:
     basename = name.split(".")[0]
     native.cc_library(
-        name = basename + "_gpu",
-        srcs = gpu_srcs,
-        copts = _cuda_copts(),
-        deps = deps + if_cuda(cuda_deps))
+        name=basename + "_gpu",
+        srcs=gpu_srcs,
+        copts=_cuda_copts(),
+        deps=deps + if_cuda(cuda_deps))
     cuda_deps.extend([":" + basename + "_gpu"])
 
-  check_deps(name=name+"_check_deps",
-             deps=deps + if_cuda(cuda_deps),
-             disallowed_deps=[str(Label("//tensorflow/core:framework")),
-                              str(Label("//tensorflow/core:lib"))])
-
-  native.cc_binary(name=name,
-                   srcs=srcs,
-                   deps=deps + if_cuda(cuda_deps),
-                   data=[name + "_check_deps"],
-                   copts=tf_copts(),
-                   linkshared=1,
-                   linkopts = select({
-                       "//conditions:default": [
-                           "-lm",
-                       ],
-                       str(Label("//tensorflow:darwin")): [],
-                   }),
-  )
-
-def tf_custom_op_py_library(name, srcs=[], dso=[], kernels=[],
-                            srcs_version="PY2AND3", visibility=None, deps=[]):
+  check_deps(
+      name=name + "_check_deps",
+      deps=deps + if_cuda(cuda_deps),
+      disallowed_deps=[
+          clean_dep("//tensorflow/core:framework"),
+          clean_dep("//tensorflow/core:lib")
+      ])
+
+  native.cc_binary(
+      name=name,
+      srcs=srcs,
+      deps=deps + if_cuda(cuda_deps),
+      data=[name + "_check_deps"],
+      copts=tf_copts(),
+      linkshared=1,
+      linkopts=select({
+          "//conditions:default": [
+              "-lm",
+          ],
+          clean_dep("//tensorflow:darwin"): [],
+      }),)
+
+
+def tf_custom_op_py_library(name,
+                            srcs=[],
+                            dso=[],
+                            kernels=[],
+                            srcs_version="PY2AND3",
+                            visibility=None,
+                            deps=[]):
   kernels = kernels  # unused argument
   native.py_library(
       name=name,
@@ -787,86 +927,103 @@ def tf_custom_op_py_library(name, srcs=[], dso=[], kernels=[],
       srcs=srcs,
       srcs_version=srcs_version,
       visibility=visibility,
-      deps=deps,
-  )
+      deps=deps,)
+
 
 def tf_extension_linkopts():
   return []  # No extension link opts
 
+
 def tf_extension_copts():
   return []  # No extension c opts
 
-def tf_py_wrap_cc(name, srcs, swig_includes=[], deps=[], copts=[], **kwargs):
+
+def tf_py_wrap_cc(name,
+                             srcs,
+                             swig_includes=[],
+                             deps=[],
+                             copts=[],
+                             **kwargs):
   module_name = name.split("/")[-1]
   # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
   # and use that as the name for the rule producing the .so file.
   cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"])
-  cc_library_pyd_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".pyd"])
+  cc_library_pyd_name = "/".join(
+      name.split("/")[:-1] + ["_" + module_name + ".pyd"])
   extra_deps = []
-  _py_wrap_cc(name=name + "_py_wrap",
-              srcs=srcs,
-              swig_includes=swig_includes,
-              deps=deps + extra_deps,
-              toolchain_deps=["//tools/defaults:crosstool"],
-              module_name=module_name,
-              py_module_name=name)
+  _py_wrap_cc(
+      name=name + "_py_wrap",
+      srcs=srcs,
+      swig_includes=swig_includes,
+      deps=deps + extra_deps,
+      toolchain_deps=["//tools/defaults:crosstool"],
+      module_name=module_name,
+      py_module_name=name)
   extra_linkopts = select({
       "@local_config_cuda//cuda:darwin": [
           "-Wl,-exported_symbols_list",
-          str(Label("//tensorflow:tf_exported_symbols.lds"))
-      ],
-      str(Label("//tensorflow:windows")): [
+          clean_dep("//tensorflow:tf_exported_symbols.lds")
       ],
+      clean_dep("//tensorflow:windows"): [],
       "//conditions:default": [
           "-Wl,--version-script",
-          str(Label("//tensorflow:tf_version_script.lds"))
-      ]})
+          clean_dep("//tensorflow:tf_version_script.lds")
+      ]
+  })
   extra_deps += select({
       "@local_config_cuda//cuda:darwin": [
-        str(Label("//tensorflow:tf_exported_symbols.lds"))
-      ],
-      str(Label("//tensorflow:windows")): [
+          clean_dep("//tensorflow:tf_exported_symbols.lds")
       ],
+      clean_dep("//tensorflow:windows"): [],
       "//conditions:default": [
-        str(Label("//tensorflow:tf_version_script.lds"))
+          clean_dep("//tensorflow:tf_version_script.lds")
       ]
   })
 
   native.cc_binary(
       name=cc_library_name,
       srcs=[module_name + ".cc"],
-      copts=(copts + ["-Wno-self-assign",
-                      "-Wno-sign-compare",
-                      "-Wno-write-strings"]
-             + tf_extension_copts()),
+      copts=(copts + [
+          "-Wno-self-assign", "-Wno-sign-compare", "-Wno-write-strings"
+      ] + tf_extension_copts()),
       linkopts=tf_extension_linkopts() + extra_linkopts,
       linkstatic=1,
       linkshared=1,
       deps=deps + extra_deps)
   native.genrule(
-      name = "gen_" + cc_library_pyd_name,
-      srcs = [":" + cc_library_name],
-      outs = [cc_library_pyd_name],
-      cmd = "cp $< $@",
-  )
-  native.py_library(name=name,
-                    srcs=[":" + name + ".py"],
-                    srcs_version="PY2AND3",
-                    data=select({
-                      str(Label("//tensorflow:windows")): [":" + cc_library_pyd_name],
-                      "//conditions:default": [":" + cc_library_name],
-                    }))
+      name="gen_" + cc_library_pyd_name,
+      srcs=[":" + cc_library_name],
+      outs=[cc_library_pyd_name],
+      cmd="cp $< $@",)
+  native.py_library(
+      name=name,
+      srcs=[":" + name + ".py"],
+      srcs_version="PY2AND3",
+      data=select({
+          clean_dep("//tensorflow:windows"): [":" + cc_library_pyd_name],
+          "//conditions:default": [":" + cc_library_name],
+      }))
+
 
 def py_test(deps=[], **kwargs):
   native.py_test(
       deps=select({
-          "//conditions:default" : deps,
-          str(Label("//tensorflow:no_tensorflow_py_deps")) : []
+          "//conditions:default": deps,
+          clean_dep("//tensorflow:no_tensorflow_py_deps"): []
       }),
       **kwargs)
 
-def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[],
-               tags=[], shard_count=1, additional_deps=[], flaky=0,
+
+def tf_py_test(name,
+               srcs,
+               size="medium",
+               data=[],
+               main=None,
+               args=[],
+               tags=[],
+               shard_count=1,
+               additional_deps=[],
+               flaky=0,
                xla_enabled=False):
   if xla_enabled:
     additional_deps += tf_additional_xla_deps_py()
@@ -877,50 +1034,71 @@ def tf_py_test(name, srcs, size="medium", data=[], main=None, args=[],
       main=main,
       args=args,
       tags=tags,
-      visibility=[str(Label("//tensorflow:internal"))],
+      visibility=[clean_dep("//tensorflow:internal")],
       shard_count=shard_count,
       data=data,
       deps=select({
-          "//conditions:default" : [
-            str(Label("//tensorflow/python:extra_py_tests_deps")),
-            str(Label("//tensorflow/python:gradient_checker")),
+          "//conditions:default": [
+              clean_dep("//tensorflow/python:extra_py_tests_deps"),
+              clean_dep("//tensorflow/python:gradient_checker"),
           ] + additional_deps,
-          str(Label("//tensorflow:no_tensorflow_py_deps")) : []
+          clean_dep("//tensorflow:no_tensorflow_py_deps"): []
       }),
       flaky=flaky,
       srcs_version="PY2AND3")
 
-def cuda_py_test(name, srcs, size="medium", data=[], main=None, args=[],
-                 shard_count=1, additional_deps=[], tags=[], flaky=0,
+
+def cuda_py_test(name,
+                 srcs,
+                 size="medium",
+                 data=[],
+                 main=None,
+                 args=[],
+                 shard_count=1,
+                 additional_deps=[],
+                 tags=[],
+                 flaky=0,
                  xla_enabled=False):
   test_tags = tags + tf_cuda_tests_tags()
-  tf_py_test(name=name,
-             size=size,
-             srcs=srcs,
-             data=data,
-             main=main,
-             args=args,
-             tags=test_tags,
-             shard_count=shard_count,
-             additional_deps=additional_deps,
-             flaky=flaky,
-             xla_enabled=xla_enabled)
-
-def sycl_py_test(name, srcs, size="medium", data=[], main=None, args=[],
-                 shard_count=1, additional_deps=[], tags=[], flaky=0,
+  tf_py_test(
+      name=name,
+      size=size,
+      srcs=srcs,
+      data=data,
+      main=main,
+      args=args,
+      tags=test_tags,
+      shard_count=shard_count,
+      additional_deps=additional_deps,
+      flaky=flaky,
+      xla_enabled=xla_enabled)
+
+
+def sycl_py_test(name,
+                 srcs,
+                 size="medium",
+                 data=[],
+                 main=None,
+                 args=[],
+                 shard_count=1,
+                 additional_deps=[],
+                 tags=[],
+                 flaky=0,
                  xla_enabled=False):
- test_tags = tags + tf_sycl_tests_tags()
- tf_py_test(name=name,
-            size=size,
-            srcs=srcs,
-            data=data,
-            main=main,
-            args=args,
-            tags=test_tags,
-            shard_count=shard_count,
-            additional_deps=additional_deps,
-            flaky=flaky,
-            xla_enabled=xla_enabled)
+  test_tags = tags + tf_sycl_tests_tags()
+  tf_py_test(
+      name=name,
+      size=size,
+      srcs=srcs,
+      data=data,
+      main=main,
+      args=args,
+      tags=test_tags,
+      shard_count=shard_count,
+      additional_deps=additional_deps,
+      flaky=flaky,
+      xla_enabled=xla_enabled)
+
 
 def py_tests(name,
              srcs,
@@ -935,22 +1113,39 @@ def py_tests(name,
     test_name = src.split("/")[-1].split(".")[0]
     if prefix:
       test_name = "%s_%s" % (prefix, test_name)
-    tf_py_test(name=test_name,
-               size=size,
-               srcs=[src],
-               main=src,
-               tags=tags,
-               shard_count=shard_count,
-               data=data,
-               additional_deps=additional_deps,
-               xla_enabled=xla_enabled)
-
-def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[],
-                  shard_count=1, tags=[], prefix="", xla_enabled=False):
+    tf_py_test(
+        name=test_name,
+        size=size,
+        srcs=[src],
+        main=src,
+        tags=tags,
+        shard_count=shard_count,
+        data=data,
+        additional_deps=additional_deps,
+        xla_enabled=xla_enabled)
+
+
+def cuda_py_tests(name,
+                  srcs,
+                  size="medium",
+                  additional_deps=[],
+                  data=[],
+                  shard_count=1,
+                  tags=[],
+                  prefix="",
+                  xla_enabled=False):
   test_tags = tags + tf_cuda_tests_tags()
-  py_tests(name=name, size=size, srcs=srcs, additional_deps=additional_deps,
-           data=data, tags=test_tags, shard_count=shard_count,prefix=prefix,
-           xla_enabled=xla_enabled)
+  py_tests(
+      name=name,
+      size=size,
+      srcs=srcs,
+      additional_deps=additional_deps,
+      data=data,
+      tags=test_tags,
+      shard_count=shard_count,
+      prefix=prefix,
+      xla_enabled=xla_enabled)
+
 
 # Creates a genrule named <name> for running tools/proto_text's generator to
 # make the proto_text functions, for the protos passed in <srcs>.
@@ -958,40 +1153,46 @@ def cuda_py_tests(name, srcs, size="medium", additional_deps=[], data=[],
 # Return a struct with fields (hdrs, srcs) containing the names of the
 # generated files.
 def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs):
-  out_hdrs = ([p.replace(".proto", ".pb_text.h") for p in srcs] +
-              [p.replace(".proto", ".pb_text-impl.h") for p in srcs])
+  out_hdrs = (
+      [p.replace(".proto", ".pb_text.h")
+       for p in srcs] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs])
   out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs]
   native.genrule(
-        name = name,
-        srcs = srcs + [str(Label("//tensorflow/tools/proto_text:placeholder.txt"))],
-        outs = out_hdrs + out_srcs,
-        cmd = "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " +
-              "$(@D) " + srcs_relative_dir + " $(SRCS)",
-        tools = [str(Label("//tensorflow/tools/proto_text:gen_proto_text_functions"))],
-    )
+      name=name,
+      srcs=srcs + [clean_dep("//tensorflow/tools/proto_text:placeholder.txt")],
+      outs=out_hdrs + out_srcs,
+      cmd=
+      "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) "
+      + "$(@D) " + srcs_relative_dir + " $(SRCS)",
+      tools=[
+          clean_dep("//tensorflow/tools/proto_text:gen_proto_text_functions")
+      ],)
   return struct(hdrs=out_hdrs, srcs=out_srcs)
 
+
 def tf_genrule_cmd_append_to_srcs(to_append):
-    return ("cat $(SRCS) > $(@) && " +
-            "echo >> $(@) && " +
-            "echo " + to_append + " >> $(@)")
+  return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
+          " >> $(@)")
 
 
 def tf_version_info_genrule():
   native.genrule(
-      name = "version_info_gen",
-      srcs = [
-          str(Label("//tensorflow/tools/git:gen/spec.json")),
-          str(Label("//tensorflow/tools/git:gen/head")),
-          str(Label("//tensorflow/tools/git:gen/branch_ref")),
+      name="version_info_gen",
+      srcs=[
+          clean_dep("//tensorflow/tools/git:gen/spec.json"),
+          clean_dep("//tensorflow/tools/git:gen/head"),
+          clean_dep("//tensorflow/tools/git:gen/branch_ref"),
       ],
-      outs = ["util/version_info.cc"],
-      cmd = "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
-      local = 1,
-      tools = [str(Label("//tensorflow/tools/git:gen_git_source.py"))],
-  )
-
-def cc_library_with_android_deps(deps, android_deps=[],
-                                common_deps=[], **kwargs):
+      outs=["util/version_info.cc"],
+      cmd=
+      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\"",
+      local=1,
+      tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
+
+
+def cc_library_with_android_deps(deps,
+                                 android_deps=[],
+                                 common_deps=[],
+                                 **kwargs):
   deps = if_not_android(deps) + if_android(android_deps) + common_deps
   native.cc_library(deps=deps, **kwargs)
diff --git a/tensorflow/tools/api/golden/BUILD b/tensorflow/tools/api/golden/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..08436396a6c04a59461b6800b908c29aabb91a1b
--- /dev/null
+++ b/tensorflow/tools/api/golden/BUILD
@@ -0,0 +1,24 @@
+# TensorFlow API backwards compatibility test goldens.
+
+package(
+    default_visibility = ["//tensorflow/tools/api:__subpackages__"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "api_golden",
+    srcs = glob(["*.pbtxt"]),
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt b/tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f79029d3fe0b88a454b11456b3785c3ae28a253c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-aggregation-method.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.AggregationMethod"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.gradients_impl.AggregationMethod\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "ADD_N"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DEFAULT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "EXPERIMENTAL_ACCUMULATE_N"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "EXPERIMENTAL_TREE"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0fb1aaba2831e63cea9b9a38954b361e5cabd072
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-attr-value.-list-value.pbtxt
@@ -0,0 +1,108 @@
+path: "tensorflow.AttrValue.ListValue"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.attr_value_pb2.ListValue\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "B_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FUNC_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "F_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "I_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SHAPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "S_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TENSOR_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TYPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7a3a1f02faf104a03eecc4a45f5a54ab1a26f9a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-attr-value.pbtxt
@@ -0,0 +1,120 @@
+path: "tensorflow.AttrValue"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.attr_value_pb2.AttrValue\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "B_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FUNC_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "F_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "I_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "LIST_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ListValue"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "PLACEHOLDER_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SHAPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "S_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TENSOR_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TYPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-auto-parallel-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-auto-parallel-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8f3e8fb154c5a1a2bb61759d9241d7e79fe884e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-auto-parallel-options.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.AutoParallelOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.rewriter_config_pb2.AutoParallelOptions\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "ENABLE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "NUM_REPLICAS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt b/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c9a32c16b34a78bd5a182b7c0635a559bddc611d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator-base.pbtxt
@@ -0,0 +1,29 @@
+path: "tensorflow.ConditionalAccumulatorBase"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "accumulator_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\', \'accumulator_ref\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "num_accumulated"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_global_step"
+    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d23b3bd0cae1f9ab1c2896244a17d4d93e2427e9
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-conditional-accumulator.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.ConditionalAccumulator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulator\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "accumulator_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'conditional_accumulator\'], "
+  }
+  member_method {
+    name: "apply_grad"
+    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "num_accumulated"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_global_step"
+    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "take_grad"
+    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..29bb3be35cba5f261f44811c731ba4c1fc007612
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-config-proto.-device-count-entry.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.ConfigProto.DeviceCountEntry"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.config_pb2.DeviceCountEntry\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "KEY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da6af3919e96bd6145c33a84aca89c44473ce66c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
@@ -0,0 +1,136 @@
+path: "tensorflow.ConfigProto"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.config_pb2.ConfigProto\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "ALLOW_SOFT_PLACEMENT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CLUSTER_DEF_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "DEVICE_COUNT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DEVICE_FILTERS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DeviceCountEntry"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "GPU_OPTIONS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_OPTIONS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INTER_OP_PARALLELISM_THREADS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INTRA_OP_PARALLELISM_THREADS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "LOG_DEVICE_PLACEMENT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OPERATION_TIMEOUT_IN_MS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "PLACEMENT_PERIOD_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "RPC_OPTIONS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SESSION_INTER_OP_THREAD_POOL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "USE_PER_SESSION_THREADS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt b/tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b5b88bba80e6bf7b9d4917c73e3876e00ef956b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-d-type.pbtxt
@@ -0,0 +1,77 @@
+path: "tensorflow.DType"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "as_datatype_enum"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "as_numpy_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "base_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_bool"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_complex"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_floating"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_integer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_numpy_compatible"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_quantized"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_unsigned"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "limits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "max"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "min"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "real_dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'type_enum\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..92e535c341447628a50d8941998a4065e78d12a5
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-device-spec.pbtxt
@@ -0,0 +1,37 @@
+path: "tensorflow.DeviceSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.device.DeviceSpec\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "job"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "replica"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'job\', \'replica\', \'task\', \'device_type\', \'device_index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_string"
+    argspec: "args=[\'spec\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_from"
+    argspec: "args=[\'self\', \'dev\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "parse_from_string"
+    argspec: "args=[\'self\', \'spec\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_string"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt b/tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9ab27719b4d71f3d7ed10963ad896ccafa82f15
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-dimension.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.Dimension"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.Dimension\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "value"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "assert_is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-event.pbtxt b/tensorflow/tools/api/golden/tensorflow.-event.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9bf8c124288854abc847a59db2c68b29759bfc7a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-event.pbtxt
@@ -0,0 +1,112 @@
+path: "tensorflow.Event"
+tf_class {
+  is_instance: "<class \'tensorflow.core.util.event_pb2.Event\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FILE_VERSION_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_DEF_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "LOG_MESSAGE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "META_GRAPH_DEF_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SESSION_LOG_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STEP_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SUMMARY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TAGGED_RUN_METADATA_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "WALL_TIME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72cc53244768ad515c0ce33b937a2eae3a9fd98a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-f-i-f-o-queue.pbtxt
@@ -0,0 +1,62 @@
+path: "tensorflow.FIFOQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.FIFOQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'fifo_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6933814a7b68f775e694fe940a7c65a8e31b9398
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-fixed-len-feature.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.FixedLenFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "default_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c53878795190924e205a1e7efe1672f216869c41
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-fixed-len-sequence-feature.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.FixedLenSequenceFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "allow_missing"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "default_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c77b3dd5cca6c7741764e6b4bcea82ef30a47fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-fixed-length-record-reader.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.FixedLengthRecordReader"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.io_ops.FixedLengthRecordReader\'>"
+  is_instance: "<class \'tensorflow.python.ops.io_ops.ReaderBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "reader_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "supports_serialize"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'hop_bytes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_records_produced"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_work_units_completed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_up_to"
+    argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "restore_state"
+    argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize_state"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30f7e4e11655797fbd8f0ea65c2eb84768ca486b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-g-p-u-options.pbtxt
@@ -0,0 +1,108 @@
+path: "tensorflow.GPUOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.config_pb2.GPUOptions\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "ALLOCATOR_TYPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ALLOW_GROWTH_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DEFERRED_DELETION_BYTES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FORCE_GPU_COMPATIBLE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "PER_PROCESS_GPU_MEMORY_FRACTION_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "POLLING_ACTIVE_DELAY_USECS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "POLLING_INACTIVE_DELAY_MSECS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VISIBLE_DEVICE_LIST_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1495e847cb08ed39ee5e365744ab1d798c3eed41
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-graph-def.pbtxt
@@ -0,0 +1,92 @@
+path: "tensorflow.GraphDef"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.graph_pb2.GraphDef\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "LIBRARY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NODE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VERSIONS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VERSION_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef2cfe3787e02da813ac0173a0fafce844bdbf38
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt
@@ -0,0 +1,136 @@
+path: "tensorflow.GraphKeys"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.ops.GraphKeys\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "ACTIVATIONS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "ASSET_FILEPATHS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "BIASES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CONCATENATED_VARIABLES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "COND_CONTEXT"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "EVAL_STEP"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GLOBAL_STEP"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GLOBAL_VARIABLES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "INIT_OP"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "LOCAL_INIT_OP"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "LOCAL_RESOURCES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "LOCAL_VARIABLES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "LOSSES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "MODEL_VARIABLES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "MOVING_AVERAGE_VARIABLES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "QUEUE_RUNNERS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "READY_FOR_LOCAL_INIT_OP"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "READY_OP"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGULARIZATION_LOSSES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "RESOURCES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVEABLE_OBJECTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVERS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUMMARIES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUMMARY_OP"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TABLE_INITIALIZERS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TRAINABLE_RESOURCE_VARIABLES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TRAINABLE_VARIABLES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TRAIN_OP"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "UPDATE_OPS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "VARIABLES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "WEIGHTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "WHILE_CONTEXT"
+    mtype: "<type \'str\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0844f891cad3d4ea798dec82d318e2bc53c53683
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-graph-options.pbtxt
@@ -0,0 +1,112 @@
+path: "tensorflow.GraphOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.config_pb2.GraphOptions\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "BUILD_COST_MODEL_AFTER_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "BUILD_COST_MODEL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "ENABLE_BFLOAT16_SENDRECV_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ENABLE_RECV_SCHEDULING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "INFER_SHAPES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OPTIMIZER_OPTIONS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "PLACE_PRUNED_GRAPH_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "REWRITE_OPTIONS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TIMELINE_STEP_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4460de57aa3314983545ebbeadf0780872f8315b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-graph.pbtxt
@@ -0,0 +1,133 @@
+path: "tensorflow.Graph"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.ops.Graph\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "building_function"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "finalized"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_def_versions"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "seed"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "version"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_to_collection"
+    argspec: "args=[\'self\', \'name\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_to_collections"
+    argspec: "args=[\'self\', \'names\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_default"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_graph_def"
+    argspec: "args=[\'self\', \'from_version\', \'add_shapes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "as_graph_element"
+    argspec: "args=[\'self\', \'obj\', \'allow_tensor\', \'allow_operation\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "clear_collection"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_with"
+    argspec: "args=[\'self\', \'op\', \'ignore_existing\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "container"
+    argspec: "args=[\'self\', \'container_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "control_dependencies"
+    argspec: "args=[\'self\', \'control_inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "create_op"
+    argspec: "args=[\'self\', \'op_type\', \'inputs\', \'dtypes\', \'input_types\', \'name\', \'attrs\', \'op_def\', \'compute_shapes\', \'compute_device\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'True\'], "
+  }
+  member_method {
+    name: "device"
+    argspec: "args=[\'self\', \'device_name_or_function\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_all_collection_keys"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_collection"
+    argspec: "args=[\'self\', \'name\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_collection_ref"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_name_scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_operation_by_name"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_operations"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_tensor_by_name"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "gradient_override_map"
+    argspec: "args=[\'self\', \'op_type_map\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_feedable"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_fetchable"
+    argspec: "args=[\'self\', \'tensor_or_op\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "name_scope"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prevent_feeding"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "prevent_fetching"
+    argspec: "args=[\'self\', \'op\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unique_name"
+    argspec: "args=[\'self\', \'name\', \'mark_as_used\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2567d2fe60293833b340d790ac1110f91d018107
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-histogram-proto.pbtxt
@@ -0,0 +1,104 @@
+path: "tensorflow.HistogramProto"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.HistogramProto\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "BUCKET_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "BUCKET_LIMIT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "MAX_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "MIN_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NUM_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SUM_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SUM_SQUARES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2eda320d6368324f4caea64767fe55aae28494f4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-identity-reader.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.IdentityReader"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.io_ops.IdentityReader\'>"
+  is_instance: "<class \'tensorflow.python.ops.io_ops.ReaderBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "reader_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "supports_serialize"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_records_produced"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_work_units_completed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_up_to"
+    argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "restore_state"
+    argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize_state"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fee84d85307dffb675b507a31c4f1fda60de869d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-indexed-slices.pbtxt
@@ -0,0 +1,42 @@
+path: "tensorflow.IndexedSlices"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.ops.IndexedSlices\'>"
+  is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dense_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "device"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "indices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'values\', \'indices\', \'dense_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9503ec440fcbf900279b7f57f7ea6a39f3220a39
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-interactive-session.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.InteractiveSession"
+tf_class {
+  is_instance: "<class \'tensorflow.python.client.session.InteractiveSession\'>"
+  is_instance: "<class \'tensorflow.python.client.session.BaseSession\'>"
+  is_instance: "<class \'tensorflow.python.client.session.SessionInterface\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "sess_str"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'target\', \'graph\', \'config\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "as_default"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_callable"
+    argspec: "args=[\'self\', \'fetches\', \'feed_list\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "partial_run"
+    argspec: "args=[\'self\', \'handle\', \'fetches\', \'feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "partial_run_setup"
+    argspec: "args=[\'self\', \'fetches\', \'feeds\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "run"
+    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt b/tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a43c5eb7e30c3c2b025e750de5786ef4338e4ffc
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-log-message.pbtxt
@@ -0,0 +1,112 @@
+path: "tensorflow.LogMessage"
+tf_class {
+  is_instance: "<class \'tensorflow.core.util.event_pb2.LogMessage\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DEBUGGING"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "ERROR"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FATAL"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INFO"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "LEVEL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Level"
+    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
+  }
+  member {
+    name: "MESSAGE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "UNKNOWN"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "WARN"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2750bd780caa418f933ada2073c5e8d0475c2a33
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-name-attr-list.-attr-entry.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.NameAttrList.AttrEntry"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.attr_value_pb2.AttrEntry\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "KEY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d10faf67d027a4dc8c7a32ec31ea22773104508a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-name-attr-list.pbtxt
@@ -0,0 +1,88 @@
+path: "tensorflow.NameAttrList"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.attr_value_pb2.NameAttrList\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "ATTR_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "AttrEntry"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1b62d60f1e8c95a5e8cc13bc8162cf1de087195
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-node-def.-attr-entry.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.NodeDef.AttrEntry"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.node_def_pb2.AttrEntry\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "KEY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b812b4df2b3c15af3c2c81944a82d9878865b8fb
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-node-def.pbtxt
@@ -0,0 +1,100 @@
+path: "tensorflow.NodeDef"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.node_def_pb2.NodeDef\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "ATTR_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "AttrEntry"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "DEVICE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "INPUT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OP_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e59615534fc2b3ed4fb128caf8ea092ebfd25f4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-op-error.pbtxt
@@ -0,0 +1,29 @@
+path: "tensorflow.OpError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\', \'error_code\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt b/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64240f706983bb2ced63e49937800d2db4e627f2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-operation.pbtxt
@@ -0,0 +1,69 @@
+path: "tensorflow.Operation"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.ops.Operation\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "control_inputs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "device"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inputs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outputs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "traceback"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "traceback_with_start_lines"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'g\', \'inputs\', \'output_types\', \'control_inputs\', \'input_types\', \'original_op\', \'op_def\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "colocation_groups"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_attr"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run"
+    argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "values"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5dd1ee47c969e1c31a0b44eb579ba255d49ebb46
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-optimizer-options.pbtxt
@@ -0,0 +1,128 @@
+path: "tensorflow.OptimizerOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.config_pb2.OptimizerOptions\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DEFAULT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "DO_COMMON_SUBEXPRESSION_ELIMINATION_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DO_CONSTANT_FOLDING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DO_FUNCTION_INLINING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "GLOBAL_JIT_LEVEL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GlobalJitLevel"
+    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
+  }
+  member {
+    name: "L0"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "L1"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Level"
+    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
+  }
+  member {
+    name: "OFF"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ON_1"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ON_2"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OPT_LEVEL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bfe723ce754830efeebd7644871ff29f9809423
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-padding-f-i-f-o-queue.pbtxt
@@ -0,0 +1,62 @@
+path: "tensorflow.PaddingFIFOQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PaddingFIFOQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'padding_fifo_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dbe25f3a5b9ecc1596c77862396c684b6ddb9c5f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-priority-queue.pbtxt
@@ -0,0 +1,62 @@
+path: "tensorflow.PriorityQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PriorityQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'types\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'priority_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt b/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9263d73a51161e9df083992528400b57302832d2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-queue-base.pbtxt
@@ -0,0 +1,61 @@
+path: "tensorflow.QueueBase"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtypes\', \'shapes\', \'names\', \'queue_ref\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec783ffe5a01d66965d6370ec1bc6c83178b5a8c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-random-shuffle-queue.pbtxt
@@ -0,0 +1,62 @@
+path: "tensorflow.RandomShuffleQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.RandomShuffleQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'min_after_dequeue\', \'dtypes\', \'shapes\', \'names\', \'seed\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'random_shuffle_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt b/tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6a3ce76a157686becd92e2c7f873bfbc7572116
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-reader-base.pbtxt
@@ -0,0 +1,45 @@
+path: "tensorflow.ReaderBase"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.io_ops.ReaderBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "reader_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "supports_serialize"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reader_ref\', \'supports_serialize\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "num_records_produced"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_work_units_completed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_up_to"
+    argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "restore_state"
+    argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize_state"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt b/tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d6e4137d12d4a1ff283a114d4f0cc5602b0b734
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-register-gradient.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.RegisterGradient"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.ops.RegisterGradient\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'op_type\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-rewriter-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.-rewriter-config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34d2e1761280de8079f82bef02b7dc2cc5ace442
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-rewriter-config.pbtxt
@@ -0,0 +1,112 @@
+path: "tensorflow.RewriterConfig"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.rewriter_config_pb2.RewriterConfig\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "AUTO_PARALLEL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CONSTANT_FOLDING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "DISABLE_MODEL_PRUNING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "MANUAL"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "MEMORY_OPTIMIZATION_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "MemOptType"
+    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
+  }
+  member {
+    name: "NO_MEM_OPT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OPTIMIZERS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OPTIMIZE_TENSOR_LAYOUT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..808fa0fa217a407b2c86459b32fcef46b96afa5c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-run-metadata.pbtxt
@@ -0,0 +1,88 @@
+path: "tensorflow.RunMetadata"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.config_pb2.RunMetadata\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "COST_GRAPH_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "PARTITION_GRAPHS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STEP_STATS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ad6804a78cbcf4820df5990aba099a607289bc6
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt
@@ -0,0 +1,116 @@
+path: "tensorflow.RunOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.config_pb2.RunOptions\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DEBUG_OPTIONS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FULL_TRACE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "HARDWARE_TRACE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INTER_OP_THREAD_POOL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NO_TRACE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OUTPUT_PARTITION_GRAPHS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SOFTWARE_TRACE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TIMEOUT_IN_MS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TRACE_LEVEL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TraceLevel"
+    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt b/tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec66d7f3354083f953066e33dff73ba9c185fc16
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-session-log.pbtxt
@@ -0,0 +1,108 @@
+path: "tensorflow.SessionLog"
+tf_class {
+  is_instance: "<class \'tensorflow.core.util.event_pb2.SessionLog\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "CHECKPOINT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CHECKPOINT_PATH_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "MSG_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "START"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STATUS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STATUS_UNSPECIFIED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STOP"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SessionStatus"
+    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.-session.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5eec14f365e8b9d0bb38cdc80d7c5581f446a4a5
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-session.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.Session"
+tf_class {
+  is_instance: "<class \'tensorflow.python.client.session.Session\'>"
+  is_instance: "<class \'tensorflow.python.client.session.BaseSession\'>"
+  is_instance: "<class \'tensorflow.python.client.session.SessionInterface\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "sess_str"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'target\', \'graph\', \'config\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "as_default"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_callable"
+    argspec: "args=[\'self\', \'fetches\', \'feed_list\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "partial_run"
+    argspec: "args=[\'self\', \'handle\', \'fetches\', \'feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "partial_run_setup"
+    argspec: "args=[\'self\', \'fetches\', \'feeds\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'target\', \'containers\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "run"
+    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2260279ad2bcfc246f42b225adc05f7c19f1aac1
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-sparse-conditional-accumulator.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.SparseConditionalAccumulator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.SparseConditionalAccumulator\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "accumulator_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'sparse_conditional_accumulator\'], "
+  }
+  member_method {
+    name: "apply_grad"
+    argspec: "args=[\'self\', \'grad_indices\', \'grad_values\', \'grad_shape\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "apply_indexed_slices_grad"
+    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
+  member_method {
+    name: "num_accumulated"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_global_step"
+    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "take_grad"
+    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "take_indexed_slices_grad"
+    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d875394fb5de73f67629b77c902a2ed2a03dd982
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-sparse-feature.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.SparseFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "already_sorted"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "index_key"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_key"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d33fd4d5d7b6b3e2eb7454b5326d993c139f0490
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-sparse-tensor-value.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.SparseTensorValue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensorValue\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "dense_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "indices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eac236d4982b809a0478665096c2b18d69c54184
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-sparse-tensor.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.SparseTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dense_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "indices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'indices\', \'values\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "eval"
+    argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'cls\', \'sparse_tensor_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_shape"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..781010d75e23c16624b193e9f1041b6d58eef34e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-summary.-audio.pbtxt
@@ -0,0 +1,96 @@
+path: "tensorflow.Summary.Audio"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.Audio\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "CONTENT_TYPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "ENCODED_AUDIO_STRING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "LENGTH_FRAMES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NUM_CHANNELS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SAMPLE_RATE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..feb9c7ee9270a7d64cf228dffeb1187fbd225704
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-summary.-image.pbtxt
@@ -0,0 +1,92 @@
+path: "tensorflow.Summary.Image"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.Image\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "COLORSPACE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "ENCODED_IMAGE_STRING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "HEIGHT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "WIDTH_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d02fb9ecd48a153aab9d0acddab6996601524e07
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-summary.-value.pbtxt
@@ -0,0 +1,108 @@
+path: "tensorflow.Summary.Value"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.Value\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "AUDIO_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "HISTO_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "IMAGE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NODE_NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OBSOLETE_OLD_STYLE_HISTOGRAM_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SIMPLE_VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TAG_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TENSOR_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.-summary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38de17fa9e52b87d19413a64271b70755e604610
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-summary.pbtxt
@@ -0,0 +1,92 @@
+path: "tensorflow.Summary"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.Summary\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "Audio"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "Image"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Value"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdf79373919b6c5f26c68996d8f1cf30e8992203
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-t-f-record-reader.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.TFRecordReader"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.io_ops.TFRecordReader\'>"
+  is_instance: "<class \'tensorflow.python.ops.io_ops.ReaderBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "reader_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "supports_serialize"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_records_produced"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_work_units_completed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_up_to"
+    argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "restore_state"
+    argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize_state"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a0fad4df524b7fa23793c88cf5cf55861c40a944
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-tensor-array.pbtxt
@@ -0,0 +1,69 @@
+path: "tensorflow.TensorArray"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.tensor_array_ops.TensorArray\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "flow"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "handle"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'size\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'handle\', \'flow\', \'infer_shape\', \'element_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "concat"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "gather"
+    argspec: "args=[\'self\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "grad"
+    argspec: "args=[\'self\', \'source\', \'flow\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "identity"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'index\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter"
+    argspec: "args=[\'self\', \'indices\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "split"
+    argspec: "args=[\'self\', \'value\', \'lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "stack"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "unstack"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'index\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87632fb7b9e413fd0bb2006ed7ed53721260241e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-tensor-info.pbtxt
@@ -0,0 +1,88 @@
+path: "tensorflow.TensorInfo"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.meta_graph_pb2.TensorInfo\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "DTYPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TENSOR_SHAPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d5b9cb8f5ed3cf088f5bd27809ff98f00801217d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-tensor-shape.pbtxt
@@ -0,0 +1,73 @@
+path: "tensorflow.TensorShape"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dims"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ndims"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_list"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_proto"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "assert_has_rank"
+    argspec: "args=[\'self\', \'rank\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "assert_is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "assert_is_fully_defined"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "assert_same_rank"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "concatenate"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_fully_defined"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "num_elements"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_rank"
+    argspec: "args=[\'self\', \'rank\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_rank_at_least"
+    argspec: "args=[\'self\', \'rank\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_rank_at_most"
+    argspec: "args=[\'self\', \'rank\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt b/tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38d19bb5374037981c01b29053ab8d05b551eb84
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-tensor.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.Tensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.ops.Tensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "OVERLOADABLE_OPERATORS"
+    mtype: "<type \'set\'>"
+  }
+  member {
+    name: "device"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "value_index"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'op\', \'value_index\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "consumers"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "eval"
+    argspec: "args=[\'self\', \'feed_dict\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_shape"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_shape"
+    argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9779f07620d2cc1ef3b0ff1b2d32796fc10834a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-text-line-reader.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.TextLineReader"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.io_ops.TextLineReader\'>"
+  is_instance: "<class \'tensorflow.python.ops.io_ops.ReaderBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "reader_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "supports_serialize"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'skip_header_lines\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_records_produced"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_work_units_completed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_up_to"
+    argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "restore_state"
+    argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize_state"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54b66f43f8e7d714e82ae9d68b37ac348c476c97
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-var-len-feature.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.VarLenFeature"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
+  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt b/tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c9b2dfd67723d4fbe04f6c7c505beb5bcf8e962a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-variable-scope.pbtxt
@@ -0,0 +1,97 @@
+path: "tensorflow.VariableScope"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.variable_scope.VariableScope\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "caching_device"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "custom_getter"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "original_name_scope"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "partitioner"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "reuse"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "use_resource"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reuse\', \'name\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'name_scope\', \'dtype\', \'use_resource\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\', \'None\', \'None\', \'None\', \'\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "get_collection"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable"
+    argspec: "args=[\'self\', \'var_store\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'reuse\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "global_variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reuse_variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_caching_device"
+    argspec: "args=[\'self\', \'caching_device\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_custom_getter"
+    argspec: "args=[\'self\', \'custom_getter\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_dtype"
+    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_initializer"
+    argspec: "args=[\'self\', \'initializer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_partitioner"
+    argspec: "args=[\'self\', \'partitioner\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_regularizer"
+    argspec: "args=[\'self\', \'regularizer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_use_resource"
+    argspec: "args=[\'self\', \'use_resource\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "trainable_variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt b/tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac3ccd468b216ab817c9ed05dcb292eaf1f44398
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-variable.-save-slice-info.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.Variable.SaveSliceInfo"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.variables.SaveSliceInfo\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "spec"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'full_name\', \'full_shape\', \'var_offset\', \'var_shape\', \'save_slice_info_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_proto"
+    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d67a2713f7a7d792bbb7679661ae4b822287eb26
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
@@ -0,0 +1,101 @@
+path: "tensorflow.Variable"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "SaveSliceInfo"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "device"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "initial_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'collections\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'expected_shape\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assign"
+    argspec: "args=[\'self\', \'value\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "assign_add"
+    argspec: "args=[\'self\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "assign_sub"
+    argspec: "args=[\'self\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "count_up_to"
+    argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "eval"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_proto"
+    argspec: "args=[\'variable_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_shape"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "initialized_value"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load"
+    argspec: "args=[\'self\', \'value\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_value"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scatter_sub"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "set_shape"
+    argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_proto"
+    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "value"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt b/tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ac759891c62ae44bf8f8c365da75664f2e65ce2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.-whole-file-reader.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.WholeFileReader"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.io_ops.WholeFileReader\'>"
+  is_instance: "<class \'tensorflow.python.ops.io_ops.ReaderBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "reader_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "supports_serialize"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_records_produced"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "num_work_units_completed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'queue\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_up_to"
+    argspec: "args=[\'self\', \'queue\', \'num_records\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "restore_state"
+    argspec: "args=[\'self\', \'state\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize_state"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.app.pbtxt b/tensorflow/tools/api/golden/tensorflow.app.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85044a8987963126ae12aaa0e5eb5d1ecc134539
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.app.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.app"
+tf_module {
+  member {
+    name: "flags"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "run"
+    argspec: "args=[\'main\', \'argv\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ccc60314001f261a2b4a5560bea83ffa017fd914
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.compat.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.compat"
+tf_module {
+  member {
+    name: "bytes_or_text_types"
+    mtype: "<type \'tuple\'>"
+  }
+  member {
+    name: "complex_types"
+    mtype: "<type \'tuple\'>"
+  }
+  member {
+    name: "integral_types"
+    mtype: "<type \'tuple\'>"
+  }
+  member {
+    name: "real_types"
+    mtype: "<type \'tuple\'>"
+  }
+  member_method {
+    name: "as_bytes"
+    argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
+  }
+  member_method {
+    name: "as_str"
+    argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
+  }
+  member_method {
+    name: "as_str_any"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_text"
+    argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00ec669b1685f3cbdacd676bac61755bebb9f6da
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.constant_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.constant_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea9186b0b9d5fecff35b43d2ef5dc0f2c99f3412
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-aborted-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.AbortedError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.AbortedError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e155081dd28a8a859e940338f70e9db24dff0d2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-already-exists-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.AlreadyExistsError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.AlreadyExistsError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b02a0e023aaecb5930c45aa35dbb1f0d97432cea
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-cancelled-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.CancelledError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.CancelledError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c1fa66342a7022031faec68f65de9cb0ae28bcba
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-data-loss-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.DataLossError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.DataLossError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e037936191b5d52c2422f2587e7196614104d6b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-deadline-exceeded-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.DeadlineExceededError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.DeadlineExceededError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..384d4b534c6ea05f9ce0fdbad32dcaf02db0ac58
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-failed-precondition-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.FailedPreconditionError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.FailedPreconditionError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac5c4d7879bbe5b040209abee088b78b15ae6f5f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-internal-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.InternalError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.InternalError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..161edd4a7c5763fe6fd96d80024065a3e3138de3
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-invalid-argument-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.InvalidArgumentError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.InvalidArgumentError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e64730ac6d7c0d3517a8a072b9622691a7e77d7
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-not-found-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.NotFoundError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.NotFoundError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1f14c0457d95fd09fe485ae241ba9a9852879db
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-op-error.pbtxt
@@ -0,0 +1,29 @@
+path: "tensorflow.errors.OpError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\', \'error_code\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6365e472868607d1ca4056859d56d16d022b3128
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-out-of-range-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.OutOfRangeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OutOfRangeError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc8a66f9eadf3985b6805afa3adf729e7c24f3d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-permission-denied-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.PermissionDeniedError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.PermissionDeniedError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85bb384b46992c4565b14b3c13c8115fb1998abd
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-resource-exhausted-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.ResourceExhaustedError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.ResourceExhaustedError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d57d7ac2f20b98f464c5a67abdd926cd20de5e32
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-unauthenticated-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.UnauthenticatedError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.UnauthenticatedError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc33e6ed8d1a9b7160b321c18735690b7b52a7d4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-unavailable-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.UnavailableError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.UnavailableError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8c2e22dbd7e66909f4ba613ba7f19b6abbaa4b9
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-unimplemented-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.UnimplementedError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.UnimplementedError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ffcfae95b8c7ccea29dd5b7b75e8c74fa245f7e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.-unknown-error.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.errors.UnknownError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.UnknownError\'>"
+  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
+  is_instance: "<type \'exceptions.Exception\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "error_code"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "node_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\', \'error_code\'], varargs=None, keywords=None, defaults=[\'2\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.errors.pbtxt b/tensorflow/tools/api/golden/tensorflow.errors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ad1c19603b496fd53eddd01ccd2e102a476589c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.errors.pbtxt
@@ -0,0 +1,151 @@
+path: "tensorflow.errors"
+tf_module {
+  member {
+    name: "ABORTED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ALREADY_EXISTS"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "AbortedError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "AlreadyExistsError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CANCELLED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CancelledError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "DATA_LOSS"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DEADLINE_EXCEEDED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DataLossError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "DeadlineExceededError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FAILED_PRECONDITION"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "FailedPreconditionError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "INTERNAL"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INVALID_ARGUMENT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "InternalError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InvalidArgumentError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NOT_FOUND"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NotFoundError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OK"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OUT_OF_RANGE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OpError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OutOfRangeError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PERMISSION_DENIED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "PermissionDeniedError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RESOURCE_EXHAUSTED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ResourceExhaustedError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UNAUTHENTICATED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "UNAVAILABLE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "UNIMPLEMENTED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "UNKNOWN"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "UnauthenticatedError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UnavailableError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UnimplementedError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UnknownError"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "error_code_from_exception_type"
+    argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "exception_type_from_error_code"
+    argspec: "args=[\'error_code\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "raise_exception_on_not_ok_status"
+    argspec: "args=[], varargs=args, keywords=kwds, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5dbfe2172640916803204a4c8f2c5e250bc982d7
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator-spec.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.estimator.EstimatorSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.model_fn.EstimatorSpec\'>"
+  is_instance: "<class \'tensorflow.python.estimator.model_fn.EstimatorSpec\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "eval_metric_ops"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "export_outputs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "loss"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "predictions"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scaffold"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "train_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "training_chief_hooks"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "training_hooks"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a769fd546ccd0d943a11909d0a23313d0c15c24
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-estimator.pbtxt
@@ -0,0 +1,37 @@
+path: "tensorflow.estimator.Estimator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_fn\', \'model_dir\', \'config\', \'params\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a1c24fa63fc074c2b4ae9b3225a6abb47958b68
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-mode-keys.pbtxt
@@ -0,0 +1,20 @@
+path: "tensorflow.estimator.ModeKeys"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.model_fn.ModeKeys\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "EVAL"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TRAIN"
+    mtype: "<type \'str\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d69c475a313075a5b165dba9a80e30cf8212657d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -0,0 +1,77 @@
+path: "tensorflow.estimator.RunConfig"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.run_config.RunConfig\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "cluster_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "evaluation_master"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_chief"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "keep_checkpoint_every_n_hours"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "keep_checkpoint_max"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "master"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_ps_replicas"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_worker_replicas"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_checkpoints_secs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_checkpoints_steps"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_summary_steps"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session_config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tf_random_seed"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "replace"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3cf7af8da95479cf49469b2f328db0919fd5ce95
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.estimator.export.ClassificationOutput.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2df1840c4a4f03fc08ba535b4f6557d49608fa5f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-classification-output.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.estimator.export.ClassificationOutput"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "classes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scores"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'scores\', \'classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "as_signature_def"
+    argspec: "args=[\'self\', \'receiver_tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d165ccbf91865e48f40f88ff817bff03881a03b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.estimator.export.ExportOutput.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa62e8ced801d66951ef5a62ec4fdd9795226ebd
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-export-output.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.estimator.export.ExportOutput"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "as_signature_def"
+    argspec: "args=[\'self\', \'receiver_tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..743495ba98cf4db0abeba86e26b812d9e3c8695b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.estimator.export.PredictOutput.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e0160b10ce13a0b3499143d151ee7e58ad858fb2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-predict-output.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.estimator.export.PredictOutput"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.export.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "outputs"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_signature_def"
+    argspec: "args=[\'self\', \'receiver_tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dbf4e3dec85d7d00045bfe4e7086ba23edf61a84
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.estimator.export.RegressionOutput.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..905f0e055350fe9a7d5790e531fb2b089332f279
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-regression-output.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.estimator.export.RegressionOutput"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.export.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "value"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_signature_def"
+    argspec: "args=[\'self\', \'receiver_tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d9e0443088dfaf5b87fcd6152a575a317d306f4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.-serving-input-receiver.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.estimator.export.ServingInputReceiver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.export.export.ServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow.python.estimator.export.export.ServingInputReceiver\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "features"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "receiver_tensors"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d0dddb3bc0305a28fab0c95c31e4869f5db0aa8
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.export.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.estimator.export"
+tf_module {
+  member {
+    name: "ClassificationOutput"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "ExportOutput"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "PredictOutput"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "RegressionOutput"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
+  member {
+    name: "ServingInputReceiver"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "build_parsing_serving_input_receiver_fn"
+    argspec: "args=[\'feature_spec\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "build_raw_serving_input_receiver_fn"
+    argspec: "args=[\'features\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b318fea1f82077c3924a843dd6b3857a3fdc0e8e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.inputs.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.estimator.inputs"
+tf_module {
+  member_method {
+    name: "numpy_input_fn"
+    argspec: "args=[\'x\', \'y\', \'batch_size\', \'num_epochs\', \'shuffle\', \'queue_capacity\', \'num_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'128\', \'1\', \'None\', \'1000\', \'1\'], "
+  }
+  member_method {
+    name: "pandas_input_fn"
+    argspec: "args=[\'x\', \'y\', \'batch_size\', \'num_epochs\', \'shuffle\', \'queue_capacity\', \'num_threads\', \'target_column\'], varargs=None, keywords=None, defaults=[\'None\', \'128\', \'1\', \'None\', \'1000\', \'1\', \'target\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d5dc73271dbc972c9177a6274f1632862f93ef0
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator"
+tf_module {
+  member {
+    name: "Estimator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "EstimatorSpec"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ModeKeys"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RunConfig"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "export"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "inputs"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c633a850f8e069135f122292bac019e2646aa61
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
@@ -0,0 +1,55 @@
+path: "tensorflow.feature_column"
+tf_module {
+  member_method {
+    name: "bucketized_column"
+    argspec: "args=[\'source_column\', \'boundaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "categorical_column_with_hash_bucket"
+    argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "categorical_column_with_identity"
+    argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "categorical_column_with_vocabulary_file"
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "categorical_column_with_vocabulary_list"
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
+  }
+  member_method {
+    name: "crossed_column"
+    argspec: "args=[\'keys\', \'hash_bucket_size\', \'hash_key\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "embedding_column"
+    argspec: "args=[\'categorical_column\', \'dimension\', \'combiner\', \'initializer\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "indicator_column"
+    argspec: "args=[\'categorical_column\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "input_layer"
+    argspec: "args=[\'features\', \'feature_columns\', \'weight_collections\', \'trainable\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
+  }
+  member_method {
+    name: "linear_model"
+    argspec: "args=[\'features\', \'feature_columns\', \'units\', \'sparse_combiner\', \'weight_collections\', \'trainable\'], varargs=None, keywords=None, defaults=[\'1\', \'sum\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "make_parse_example_spec"
+    argspec: "args=[\'feature_columns\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "numeric_column"
+    argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "weighted_categorical_column"
+    argspec: "args=[\'categorical_column\', \'weight_feature_key\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt b/tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eecfaffd0a6f6e611eba8bf3f5bb709bc9e0157f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.gfile.-fast-g-file.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.gfile.FastGFile"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.FastGFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..305251059d90b52aa2e76e99a4ec65e68b73fb79
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.gfile.-g-file.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.gfile.GFile"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt b/tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e8894180a4a685d5a35ba02df53c6e054db01b9
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.gfile.-open.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.gfile.Open"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.gfile.pbtxt b/tensorflow/tools/api/golden/tensorflow.gfile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65b55a8b7c4e30e349c1ea256664002b19191c82
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.gfile.pbtxt
@@ -0,0 +1,63 @@
+path: "tensorflow.gfile"
+tf_module {
+  member {
+    name: "FastGFile"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GFile"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Open"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "Copy"
+    argspec: "args=[\'oldpath\', \'newpath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "DeleteRecursively"
+    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Exists"
+    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Glob"
+    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsDirectory"
+    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ListDirectory"
+    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MakeDirs"
+    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MkDir"
+    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Remove"
+    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Rename"
+    argspec: "args=[\'oldname\', \'newname\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "Stat"
+    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Walk"
+    argspec: "args=[\'top\', \'in_order\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt b/tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76a2df757e7151c464607dd290db3806bf8328e5
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.graph_util.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.graph_util"
+tf_module {
+  member_method {
+    name: "convert_variables_to_constants"
+    argspec: "args=[\'sess\', \'input_graph_def\', \'output_node_names\', \'variable_names_whitelist\', \'variable_names_blacklist\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "extract_sub_graph"
+    argspec: "args=[\'graph_def\', \'dest_nodes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "must_run_on_cpu"
+    argspec: "args=[\'node\', \'pin_variables_on_cpu\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "remove_training_nodes"
+    argspec: "args=[\'input_graph\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tensor_shape_from_node_def_name"
+    argspec: "args=[\'graph\', \'input_name\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dbc360b13ee7dc8228f5fb4fe0cd6fc21504d0d0
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.image.-resize-method.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.image.ResizeMethod"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.image_ops_impl.ResizeMethod\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "AREA"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "BICUBIC"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "BILINEAR"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NEAREST_NEIGHBOR"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f7790f2996d795ab7681c93d32909e01250725c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -0,0 +1,179 @@
+path: "tensorflow.image"
+tf_module {
+  member {
+    name: "ResizeMethod"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "adjust_brightness"
+    argspec: "args=[\'image\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "adjust_contrast"
+    argspec: "args=[\'images\', \'contrast_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "adjust_gamma"
+    argspec: "args=[\'image\', \'gamma\', \'gain\'], varargs=None, keywords=None, defaults=[\'1\', \'1\'], "
+  }
+  member_method {
+    name: "adjust_hue"
+    argspec: "args=[\'image\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "adjust_saturation"
+    argspec: "args=[\'image\', \'saturation_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "central_crop"
+    argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convert_image_dtype"
+    argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "crop_and_resize"
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "crop_to_bounding_box"
+    argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "decode_bmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "decode_gif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_image"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "decode_jpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "decode_png"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "draw_bounding_boxes"
+    argspec: "args=[\'images\', \'boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "encode_jpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "encode_png"
+    argspec: "args=[\'image\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "extract_glimpse"
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "flip_left_right"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flip_up_down"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "grayscale_to_rgb"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "hsv_to_rgb"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "pad_to_bounding_box"
+    argspec: "args=[\'image\', \'offset_height\', \'offset_width\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "per_image_standardization"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "random_brightness"
+    argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_contrast"
+    argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_flip_left_right"
+    argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_flip_up_down"
+    argspec: "args=[\'image\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_hue"
+    argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_saturation"
+    argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "resize_area"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "resize_bicubic"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "resize_bilinear"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "resize_image_with_crop_or_pad"
+    argspec: "args=[\'image\', \'target_height\', \'target_width\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "resize_images"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\'], varargs=None, keywords=None, defaults=[\'0\', \'False\'], "
+  }
+  member_method {
+    name: "resize_nearest_neighbor"
+    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rgb_to_grayscale"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rgb_to_hsv"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rot90"
+    argspec: "args=[\'image\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "sample_distorted_bounding_box"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "total_variation"
+    argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "transpose_image"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78b10c44a23c8a1093b1727eb7fb7efae87a33cd
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.layers.pbtxt
@@ -0,0 +1,63 @@
+path: "tensorflow.layers"
+tf_module {
+  member_method {
+    name: "average_pooling1d"
+    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
+  }
+  member_method {
+    name: "average_pooling2d"
+    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
+  }
+  member_method {
+    name: "average_pooling3d"
+    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
+  }
+  member_method {
+    name: "batch_normalization"
+    argspec: "args=[\'inputs\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'training\', \'trainable\', \'name\', \'reuse\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\'], varargs=None, keywords=None, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'<tensorflow.python.ops.init_ops.Ones object instance>\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'<tensorflow.python.ops.init_ops.Ones object instance>\', \'None\', \'None\', \'False\', \'True\', \'None\', \'None\', \'False\', \'None\', \'0.99\'], "
+  }
+  member_method {
+    name: "conv1d"
+    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv2d"
+    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv2d_transpose"
+    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv3d"
+    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'(1, 1, 1)\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv3d_transpose"
+    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "dense"
+    argspec: "args=[\'inputs\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "dropout"
+    argspec: "args=[\'inputs\', \'rate\', \'noise_shape\', \'seed\', \'training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'None\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "max_pooling1d"
+    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
+  }
+  member_method {
+    name: "max_pooling2d"
+    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
+  }
+  member_method {
+    name: "max_pooling3d"
+    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
+  }
+  member_method {
+    name: "separable_conv2d"
+    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'1\', \'None\', \'True\', \'None\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.logging.pbtxt b/tensorflow/tools/api/golden/tensorflow.logging.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85bb15455da624962744a0cc856e79e0a6d57d7c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.logging.pbtxt
@@ -0,0 +1,83 @@
+path: "tensorflow.logging"
+tf_module {
+  member {
+    name: "DEBUG"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ERROR"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "FATAL"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INFO"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "WARN"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "TaskLevelStatusMessage"
+    argspec: "args=[\'msg\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "debug"
+    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "error"
+    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "fatal"
+    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_verbosity"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "info"
+    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "log"
+    argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "log_every_n"
+    argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "log_first_n"
+    argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "log_if"
+    argspec: "args=[\'level\', \'msg\', \'condition\'], varargs=args, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_verbosity"
+    argspec: "args=[\'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "vlog"
+    argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "warn"
+    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "warning"
+    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4bdc73370bffb3c44945fc5c9e4fbafcdd72255e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.losses.-reduction.pbtxt
@@ -0,0 +1,32 @@
+path: "tensorflow.losses.Reduction"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.Reduction\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "MEAN"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "NONE"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUM_BY_NONZERO_WEIGHTS"
+    mtype: "<type \'str\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "all"
+    argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "validate"
+    argspec: "args=[\'cls\', \'key\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/tensorflow.losses.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..79443839b9a38370453d2aa9a6e868c9a0720196
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.losses.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.losses"
+tf_module {
+  member {
+    name: "Reduction"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "absolute_difference"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'loss\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'losses\'], "
+  }
+  member_method {
+    name: "compute_weighted_loss"
+    argspec: "args=[\'losses\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "cosine_distance"
+    argspec: "args=[\'labels\', \'predictions\', \'dim\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "get_losses"
+    argspec: "args=[\'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'None\', \'losses\'], "
+  }
+  member_method {
+    name: "get_regularization_loss"
+    argspec: "args=[\'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'total_regularization_loss\'], "
+  }
+  member_method {
+    name: "get_regularization_losses"
+    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_total_loss"
+    argspec: "args=[\'add_regularization_losses\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'total_loss\'], "
+  }
+  member_method {
+    name: "hinge_loss"
+    argspec: "args=[\'labels\', \'logits\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "huber_loss"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'delta\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "log_loss"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'epsilon\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1e-07\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "mean_pairwise_squared_error"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\'], "
+  }
+  member_method {
+    name: "mean_squared_error"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "sigmoid_cross_entropy"
+    argspec: "args=[\'multi_class_labels\', \'logits\', \'weights\', \'label_smoothing\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "softmax_cross_entropy"
+    argspec: "args=[\'onehot_labels\', \'logits\', \'weights\', \'label_smoothing\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+  member_method {
+    name: "sparse_softmax_cross_entropy"
+    argspec: "args=[\'labels\', \'logits\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..262d11c38e1fcd0e35b1872a081236d29cd80510
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt
@@ -0,0 +1,99 @@
+path: "tensorflow.metrics"
+tf_module {
+  member_method {
+    name: "accuracy"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "auc"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'curve\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'ROC\', \'None\'], "
+  }
+  member_method {
+    name: "false_negatives"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "false_positives"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "mean"
+    argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "mean_absolute_error"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "mean_cosine_distance"
+    argspec: "args=[\'labels\', \'predictions\', \'dim\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "mean_iou"
+    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "mean_per_class_accuracy"
+    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "mean_relative_error"
+    argspec: "args=[\'labels\', \'predictions\', \'normalizer\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "mean_squared_error"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "mean_tensor"
+    argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "percentage_below"
+    argspec: "args=[\'values\', \'threshold\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "precision"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "precision_at_thresholds"
+    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "recall"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "recall_at_k"
+    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "recall_at_thresholds"
+    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "root_mean_squared_error"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sensitivity_at_specificity"
+    argspec: "args=[\'labels\', \'predictions\', \'specificity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_average_precision_at_k"
+    argspec: "args=[\'labels\', \'predictions\', \'k\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_precision_at_k"
+    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "specificity_at_sensitivity"
+    argspec: "args=[\'labels\', \'predictions\', \'sensitivity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "true_positives"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1b60fbdcbb50bca8c0481d86ba620b982532ca6
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -0,0 +1,323 @@
+path: "tensorflow.nn"
+tf_module {
+  member_method {
+    name: "all_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "atrous_conv2d"
+    argspec: "args=[\'value\', \'filters\', \'rate\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "atrous_conv2d_transpose"
+    argspec: "args=[\'value\', \'filters\', \'output_shape\', \'rate\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "avg_pool"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool3d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_norm_with_global_normalization"
+    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "batch_normalization"
+    argspec: "args=[\'x\', \'mean\', \'variance\', \'offset\', \'scale\', \'variance_epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "bias_add"
+    argspec: "args=[\'value\', \'bias\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "bidirectional_dynamic_rnn"
+    argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'sequence_length\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "compute_accidental_hits"
+    argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv1d"
+    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv2d"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv2d_backprop_filter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv2d_backprop_input"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv2d_transpose"
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "conv3d"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv3d_backprop_filter_v2"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv3d_transpose"
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+  }
+  member_method {
+    name: "convolution"
+    argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "crelu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ctc_beam_search_decoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\'], varargs=None, keywords=None, defaults=[\'100\', \'1\', \'True\'], "
+  }
+  member_method {
+    name: "ctc_greedy_decoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'merge_repeated\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "ctc_loss"
+    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
+  }
+  member_method {
+    name: "depthwise_conv2d"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "depthwise_conv2d_native"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "depthwise_conv2d_native_backprop_filter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "depthwise_conv2d_native_backprop_input"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "dilation2d"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dropout"
+    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "dynamic_rnn"
+    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "elu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "embedding_lookup"
+    argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'name\', \'validate_indices\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "embedding_lookup_sparse"
+    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'name\', \'combiner\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "erosion2d"
+    argspec: "args=[\'value\', \'kernel\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fixed_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1.0\', \'0\', \'1\', \'0\', \'()\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fractional_avg_pool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fractional_max_pool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fused_batch_norm"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.001\', \'NHWC\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "in_top_k"
+    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "l2_loss"
+    argspec: "args=[\'t\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "l2_normalize"
+    argspec: "args=[\'x\', \'dim\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'1e-12\', \'None\'], "
+  }
+  member_method {
+    name: "learned_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "local_response_normalization"
+    argspec: "args=[\'input\', \'depth_radius\', \'bias\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "log_poisson_loss"
+    argspec: "args=[\'targets\', \'log_input\', \'compute_full_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "log_softmax"
+    argspec: "args=[\'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "log_uniform_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "lrn"
+    argspec: "args=[\'input\', \'depth_radius\', \'bias\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool3d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool_with_argmax"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'Targmax\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "moments"
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "nce_loss"
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'mod\', \'nce_loss\'], "
+  }
+  member_method {
+    name: "normalize_moments"
+    argspec: "args=[\'counts\', \'mean_ss\', \'variance_ss\', \'shift\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "pool"
+    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "quantized_avg_pool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "quantized_conv2d"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "quantized_max_pool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "quantized_relu_x"
+    argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "raw_rnn"
+    argspec: "args=[\'cell\', \'loop_fn\', \'parallel_iterations\', \'swap_memory\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "relu"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "relu6"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "relu_layer"
+    argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sampled_softmax_loss"
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\'], "
+  }
+  member_method {
+    name: "separable_conv2d"
+    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sigmoid_cross_entropy_with_logits"
+    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "softmax"
+    argspec: "args=[\'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+  }
+  member_method {
+    name: "softmax_cross_entropy_with_logits"
+    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "softplus"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "softsign"
+    argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_softmax_cross_entropy_with_logits"
+    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sufficient_statistics"
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "tanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "top_k"
+    argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "uniform_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "weighted_cross_entropy_with_logits"
+    argspec: "args=[\'targets\', \'logits\', \'pos_weight\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "weighted_moments"
+    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "with_space_to_batch"
+    argspec: "args=[\'input\', \'dilation_rate\', \'padding\', \'op\', \'filter_shape\', \'spatial_dims\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "xw_plus_b"
+    argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "zero_fraction"
+    argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..210b56242b27fe4a832cfe50a53626d716d8877e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.ones_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.ones_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13ec7454f41eac2b23e07ba62068bb48dddac90b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.orthogonal_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.orthogonal_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f0fdd693903d89fb381300af58db21b400cb3c12
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -0,0 +1,1955 @@
+path: "tensorflow"
+tf_module {
+  member {
+    name: "AggregationMethod"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "AttrValue"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "AutoParallelOptions"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "COMPILER_VERSION"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "ConditionalAccumulator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ConditionalAccumulatorBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ConfigProto"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "DType"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "DeviceSpec"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Dimension"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Event"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "FIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FixedLenFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FixedLenSequenceFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FixedLengthRecordReader"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GIT_VERSION"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GPUOptions"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION_MIN_CONSUMER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION_MIN_PRODUCER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Graph"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GraphDef"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "GraphKeys"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GraphOptions"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "HistogramProto"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "IdentityReader"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "IndexedSlices"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InteractiveSession"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogMessage"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "NameAttrList"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "NodeDef"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "OpError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Operation"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OptimizerOptions"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "PaddingFIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PriorityQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "QUANTIZED_DTYPES"
+    mtype: "<type \'frozenset\'>"
+  }
+  member {
+    name: "QueueBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomShuffleQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ReaderBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RegisterGradient"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RewriterConfig"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "RunMetadata"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "RunOptions"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "Session"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionLog"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "SparseConditionalAccumulator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTensorValue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Summary"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "TFRecordReader"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Tensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TensorArray"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TensorInfo"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "TensorShape"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TextLineReader"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "VERSION"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "VarLenFeature"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Variable"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "VariableScope"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "WholeFileReader"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "app"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "bfloat16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "bool"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "compat"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "complex128"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "complex64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "constant_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "contrib"
+    mtype: "<class \'tensorflow.python.util.lazy_loader.LazyLoader\'>"
+  }
+  member {
+    name: "double"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "errors"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "estimator"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "feature_column"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "flags"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "float16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "float32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "float64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "gfile"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "graph_util"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "half"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "image"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "int16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "logging"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "metrics"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "newaxis"
+    mtype: "<type \'NoneType\'>"
+  }
+  member {
+    name: "nn"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "ones_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "orthogonal_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "python_io"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "pywrap_tensorflow"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "qint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "qint32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "qint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "quint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "quint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "random_normal_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "random_uniform_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "resource"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "resource_loader"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "saved_model"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "sets"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "spectral"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "string"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "summary"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "sysconfig"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "test"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "train"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "truncated_normal_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "uint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uniform_unit_scaling_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "user_ops"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "zeros_initializer"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "Assert"
+    argspec: "args=[\'condition\', \'data\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "NoGradient"
+    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NotDifferentiable"
+    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Print"
+    argspec: "args=[\'input_\', \'data\', \'message\', \'first_n\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "abs"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "accumulate_n"
+    argspec: "args=[\'inputs\', \'shape\', \'tensor_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "acos"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_check_numerics_ops"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_n"
+    argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_to_collection"
+    argspec: "args=[\'name\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "all_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "arg_max"
+    argspec: "args=[\'input\', \'dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "arg_min"
+    argspec: "args=[\'input\', \'dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "argmax"
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "argmin"
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "as_dtype"
+    argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_string"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "asin"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "assert_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_greater"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_greater_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_integer"
+    argspec: "args=[\'x\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_less"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_less_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_negative"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_non_negative"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_non_positive"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_none_equal"
+    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_positive"
+    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_proper_iterable"
+    argspec: "args=[\'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "assert_rank"
+    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_rank_at_least"
+    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_same_float_dtype"
+    argspec: "args=[\'tensors\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_scalar"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "assert_type"
+    argspec: "args=[\'tensor\', \'tf_type\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assert_variables_initialized"
+    argspec: "args=[\'var_list\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "assign"
+    argspec: "args=[\'ref\', \'value\', \'validate_shape\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "assign_add"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "assign_sub"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "atan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "atan2"
+    argspec: "args=[\'y\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "batch_to_space"
+    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "batch_to_space_nd"
+    argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "betainc"
+    argspec: "args=[\'a\', \'b\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "bincount"
+    argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int32\'>\"], "
+  }
+  member_method {
+    name: "bitcast"
+    argspec: "args=[\'input\', \'type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "boolean_mask"
+    argspec: "args=[\'tensor\', \'mask\', \'name\'], varargs=None, keywords=None, defaults=[\'boolean_mask\'], "
+  }
+  member_method {
+    name: "broadcast_dynamic_shape"
+    argspec: "args=[\'shape_x\', \'shape_y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_static_shape"
+    argspec: "args=[\'shape_x\', \'shape_y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "case"
+    argspec: "args=[\'pred_fn_pairs\', \'default\', \'exclusive\', \'strict\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'case\'], "
+  }
+  member_method {
+    name: "cast"
+    argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ceil"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "check_numerics"
+    argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "cholesky_solve"
+    argspec: "args=[\'chol\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "clip_by_average_norm"
+    argspec: "args=[\'t\', \'clip_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "clip_by_global_norm"
+    argspec: "args=[\'t_list\', \'clip_norm\', \'use_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "clip_by_norm"
+    argspec: "args=[\'t\', \'clip_norm\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "clip_by_value"
+    argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "complex"
+    argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "concat"
+    argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'concat\'], "
+  }
+  member_method {
+    name: "cond"
+    argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'strict\', \'name\', \'fn1\', \'fn2\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "confusion_matrix"
+    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conj"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "constant"
+    argspec: "args=[\'value\', \'dtype\', \'shape\', \'name\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Const\', \'False\'], "
+  }
+  member_method {
+    name: "container"
+    argspec: "args=[\'container_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "control_dependencies"
+    argspec: "args=[\'control_inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "convert_to_tensor"
+    argspec: "args=[\'value\', \'dtype\', \'name\', \'preferred_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "convert_to_tensor_or_indexed_slices"
+    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "convert_to_tensor_or_sparse_tensor"
+    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "cos"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "count_nonzero"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'dtype\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \"<dtype: \'int64\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "count_up_to"
+    argspec: "args=[\'ref\', \'limit\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "create_partitioned_variables"
+    argspec: "args=[\'shape\', \'slicing\', \'initializer\', \'dtype\', \'trainable\', \'collections\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'True\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cross"
+    argspec: "args=[\'a\', \'b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "decode_base64"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_csv"
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "decode_json_example"
+    argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_raw"
+    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "delete_session_tensor"
+    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "depth_to_space"
+    argspec: "args=[\'input\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequantize"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "deserialize_many_sparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\', \'rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "device"
+    argspec: "args=[\'device_name_or_function\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "diag"
+    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "digamma"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "div"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "divide"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dynamic_partition"
+    argspec: "args=[\'data\', \'partitions\', \'num_partitions\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dynamic_stitch"
+    argspec: "args=[\'indices\', \'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "edit_distance"
+    argspec: "args=[\'hypothesis\', \'truth\', \'normalize\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'edit_distance\'], "
+  }
+  member_method {
+    name: "einsum"
+    argspec: "args=[\'equation\'], varargs=inputs, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "encode_base64"
+    argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "equal"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "erf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "erfc"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "exp"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "expand_dims"
+    argspec: "args=[\'input\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "expm1"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "extract_image_patches"
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "eye"
+    argspec: "args=[\'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "fake_quant_with_min_max_args"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fake_quant_with_min_max_args_gradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fake_quant_with_min_max_vars"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "fake_quant_with_min_max_vars_gradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "fake_quant_with_min_max_vars_per_channel"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "fake_quant_with_min_max_vars_per_channel_gradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "fft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fill"
+    argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fixed_size_partitioner"
+    argspec: "args=[\'num_shards\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "floor"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "floor_div"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "floordiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "floormod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "foldl"
+    argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "foldr"
+    argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "gather"
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "gather_nd"
+    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_collection"
+    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_collection_ref"
+    argspec: "args=[\'key\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_default_graph"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_default_session"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_local_variable"
+    argspec: "args=[], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "get_seed"
+    argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_session_handle"
+    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_session_tensor"
+    argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_variable"
+    argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_variable_scope"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "global_norm"
+    argspec: "args=[\'t_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "global_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "global_variables_initializer"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "gradients"
+    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "greater"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "greater_equal"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[], varargs=inputs, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "hessians"
+    argspec: "args=[\'ys\', \'xs\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'hessians\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "histogram_fixed_width"
+    argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'100\', \"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "identity"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "igamma"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "igammac"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "imag"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "import_graph_def"
+    argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "initialize_all_tables"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
+  }
+  member_method {
+    name: "initialize_all_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "initialize_local_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "initialize_variables"
+    argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], "
+  }
+  member_method {
+    name: "invert_permutation"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_finite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_inf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_nan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_numeric_tensor"
+    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_variable_initialized"
+    argspec: "args=[\'variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "lbeta"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'lbeta\'], "
+  }
+  member_method {
+    name: "less"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "less_equal"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "lgamma"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "lin_space"
+    argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "linspace"
+    argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "load_file_system_library"
+    argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_op_library"
+    argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "local_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "local_variables_initializer"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "log"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "log1p"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "log_sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "logical_and"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "logical_not"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "logical_or"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "logical_xor"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'LogicalXor\'], "
+  }
+  member_method {
+    name: "make_template"
+    argspec: "args=[\'name_\', \'func_\', \'create_scope_now_\', \'unique_name_\', \'custom_getter_\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "map_fn"
+    argspec: "args=[\'fn\', \'elems\', \'dtype\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "matching_files"
+    argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "matrix_band_part"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "matrix_determinant"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "matrix_diag"
+    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "matrix_diag_part"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "matrix_inverse"
+    argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "matrix_set_diag"
+    argspec: "args=[\'input\', \'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "matrix_solve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "matrix_solve_ls"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "matrix_transpose"
+    argspec: "args=[\'a\', \'name\'], varargs=None, keywords=None, defaults=[\'matrix_transpose\'], "
+  }
+  member_method {
+    name: "matrix_triangular_solve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "maximum"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "meshgrid"
+    argspec: "args=[], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "min_max_variable_partitioner"
+    argspec: "args=[\'max_partitions\', \'axis\', \'min_slice_size\', \'bytes_per_string_element\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'262144\', \'16\'], "
+  }
+  member_method {
+    name: "minimum"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "mod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "model_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "moving_average_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "multinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "multiply"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "name_scope"
+    argspec: "args=[\'name\', \'default_name\', \'values\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "negative"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "no_op"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "no_regularizer"
+    argspec: "args=[\'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "norm"
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "not_equal"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "one_hot"
+    argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "ones"
+    argspec: "args=[\'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "ones_like"
+    argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "op_scope"
+    argspec: "args=[\'values\', \'name\', \'default_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "pad"
+    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'None\'], "
+  }
+  member_method {
+    name: "parallel_stack"
+    argspec: "args=[\'values\', \'name\'], varargs=None, keywords=None, defaults=[\'parallel_stack\'], "
+  }
+  member_method {
+    name: "parse_example"
+    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "parse_single_example"
+    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "parse_single_sequence_example"
+    argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "parse_tensor"
+    argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "placeholder"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "placeholder_with_default"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "polygamma"
+    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "pow"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "py_func"
+    argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "qr"
+    argspec: "args=[\'input\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "quantize_v2"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "quantized_concat"
+    argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "random_crop"
+    argspec: "args=[\'value\', \'size\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "random_gamma"
+    argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "random_normal"
+    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "random_poisson"
+    argspec: "args=[\'lam\', \'shape\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "random_shuffle"
+    argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "random_uniform"
+    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[\'start\', \'limit\', \'delta\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'range\'], "
+  }
+  member_method {
+    name: "rank"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_file"
+    argspec: "args=[\'filename\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "real"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "realdiv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reciprocal"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reduce_all"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_any"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_join"
+    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_logsumexp"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_max"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_mean"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_min"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_prod"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_sum"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "register_tensor_conversion_function"
+    argspec: "args=[\'base_type\', \'conversion_func\', \'priority\'], varargs=None, keywords=None, defaults=[\'100\'], "
+  }
+  member_method {
+    name: "report_uninitialized_variables"
+    argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'report_uninitialized_variables\'], "
+  }
+  member_method {
+    name: "required_space_to_batch_paddings"
+    argspec: "args=[\'input_shape\', \'block_shape\', \'base_paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "reset_default_graph"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reshape"
+    argspec: "args=[\'tensor\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reverse"
+    argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "reverse_sequence"
+    argspec: "args=[\'input\', \'seq_lengths\', \'seq_axis\', \'batch_axis\', \'name\', \'seq_dim\', \'batch_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "reverse_v2"
+    argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rint"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "round"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rsqrt"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "saturate_cast"
+    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scalar_mul"
+    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scan"
+    argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_add"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_div"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_mul"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_nd"
+    argspec: "args=[\'indices\', \'updates\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_add"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_sub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_update"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_sub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_update"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "segment_max"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "segment_mean"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "segment_min"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "segment_prod"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "segment_sum"
+    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "self_adjoint_eig"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "self_adjoint_eigvals"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sequence_mask"
+    argspec: "args=[\'lengths\', \'maxlen\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'bool\'>\", \'None\'], "
+  }
+  member_method {
+    name: "serialize_many_sparse"
+    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize_sparse"
+    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_random_seed"
+    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "setdiff1d"
+    argspec: "args=[\'x\', \'y\', \'index_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "shape"
+    argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
+  }
+  member_method {
+    name: "shape_n"
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "sigmoid"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sign"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sin"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
+  }
+  member_method {
+    name: "slice"
+    argspec: "args=[\'input_\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "space_to_batch"
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "space_to_batch_nd"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "space_to_depth"
+    argspec: "args=[\'input\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_add"
+    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "sparse_concat"
+    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_fill_empty_rows"
+    argspec: "args=[\'sp_input\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_mask"
+    argspec: "args=[\'a\', \'mask_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_matmul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_maximum"
+    argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_merge"
+    argspec: "args=[\'sp_ids\', \'sp_values\', \'vocab_size\', \'name\', \'already_sorted\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+  }
+  member_method {
+    name: "sparse_minimum"
+    argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_placeholder"
+    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_reduce_sum"
+    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_reduce_sum_sparse"
+    argspec: "args=[\'sp_input\', \'axis\', \'keep_dims\', \'reduction_axes\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_reorder"
+    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_reset_shape"
+    argspec: "args=[\'sp_input\', \'new_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_reshape"
+    argspec: "args=[\'sp_input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_retain"
+    argspec: "args=[\'sp_input\', \'to_retain\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_segment_mean"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_segment_sqrt_n"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_segment_sum"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_softmax"
+    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_split"
+    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_tensor_dense_matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_tensor_to_dense"
+    argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_to_dense"
+    argspec: "args=[\'sparse_indices\', \'output_shape\', \'sparse_values\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "sparse_to_indicator"
+    argspec: "args=[\'sp_input\', \'vocab_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sparse_transpose"
+    argspec: "args=[\'sp_input\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "split"
+    argspec: "args=[\'value\', \'num_or_size_splits\', \'axis\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'split\'], "
+  }
+  member_method {
+    name: "sqrt"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "square"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "squared_difference"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "squeeze"
+    argspec: "args=[\'input\', \'axis\', \'name\', \'squeeze_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "stack"
+    argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'stack\'], "
+  }
+  member_method {
+    name: "stop_gradient"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "strided_slice"
+    argspec: "args=[\'input_\', \'begin\', \'end\', \'strides\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'var\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'0\', \'0\', \'0\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "string_join"
+    argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "string_split"
+    argspec: "args=[\'source\', \'delimiter\'], varargs=None, keywords=None, defaults=[\' \'], "
+  }
+  member_method {
+    name: "string_to_hash_bucket"
+    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "string_to_hash_bucket_fast"
+    argspec: "args=[\'input\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "string_to_hash_bucket_strong"
+    argspec: "args=[\'input\', \'num_buckets\', \'key\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "string_to_number"
+    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "substr"
+    argspec: "args=[\'input\', \'pos\', \'len\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "subtract"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "svd"
+    argspec: "args=[\'tensor\', \'full_matrices\', \'compute_uv\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "tables_initializer"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
+  }
+  member_method {
+    name: "tan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tanh"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensordot"
+    argspec: "args=[\'a\', \'b\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tile"
+    argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_bfloat16"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToBFloat16\'], "
+  }
+  member_method {
+    name: "to_double"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToDouble\'], "
+  }
+  member_method {
+    name: "to_float"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToFloat\'], "
+  }
+  member_method {
+    name: "to_int32"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToInt32\'], "
+  }
+  member_method {
+    name: "to_int64"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToInt64\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "trainable_variables"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "transpose"
+    argspec: "args=[\'a\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'transpose\'], "
+  }
+  member_method {
+    name: "truediv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "truncated_normal"
+    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "truncatediv"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "truncatemod"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tuple"
+    argspec: "args=[\'tensors\', \'name\', \'control_inputs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "unique"
+    argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "unique_with_counts"
+    argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "unsorted_segment_max"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "unsorted_segment_sum"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "unstack"
+    argspec: "args=[\'value\', \'num\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'unstack\'], "
+  }
+  member_method {
+    name: "variable_axis_size_partitioner"
+    argspec: "args=[\'max_shard_bytes\', \'axis\', \'bytes_per_string_element\', \'max_shards\'], varargs=None, keywords=None, defaults=[\'0\', \'16\', \'None\'], "
+  }
+  member_method {
+    name: "variable_op_scope"
+    argspec: "args=[\'values\', \'name_or_scope\', \'default_name\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "variable_scope"
+    argspec: "args=[\'name_or_scope\', \'default_name\', \'values\', \'initializer\', \'regularizer\', \'caching_device\', \'partitioner\', \'custom_getter\', \'reuse\', \'dtype\', \'use_resource\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "variables_initializer"
+    argspec: "args=[\'var_list\', \'name\'], varargs=None, keywords=None, defaults=[\'init\'], "
+  }
+  member_method {
+    name: "verify_tensor_all_finite"
+    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "where"
+    argspec: "args=[\'condition\', \'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "while_loop"
+    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "write_file"
+    argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "zeros"
+    argspec: "args=[\'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "zeros_like"
+    argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "zeta"
+    argspec: "args=[\'x\', \'q\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4941dda50e4964f8400a4cb5033c8e918aeaea5d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-compression-type.pbtxt
@@ -0,0 +1,20 @@
+path: "tensorflow.python_io.TFRecordCompressionType"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordCompressionType\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GZIP"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "ZLIB"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0853716023ae5271fba6e8024e719eebb22ec56d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-options.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.python_io.TFRecordOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordOptions\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "compression_type_map"
+    mtype: "<type \'dict\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_compression_type_string"
+    argspec: "args=[\'cls\', \'options\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af0c11ca14d4f38547a49ac511ee13e15847eb33
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.python_io.-t-f-record-writer.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.python_io.TFRecordWriter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordWriter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'record\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.python_io.pbtxt b/tensorflow/tools/api/golden/tensorflow.python_io.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c9953e5fe3c883fd5e6e19ae011cc464f4107af
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.python_io.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.python_io"
+tf_module {
+  member {
+    name: "TFRecordCompressionType"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFRecordWriter"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "tf_record_iterator"
+    argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5993fdeb9c232ebc4090d9fffd8857da8ca6ada4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.random_normal_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.random_normal_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a434ed1599ef8b99b6e0496be388aa0e44755249
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.random_uniform_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.random_uniform_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt b/tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..288b78b4cd0ad3f5d5bc1f9c773977d50a6db086
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.resource_loader.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.resource_loader"
+tf_module {
+  member_method {
+    name: "get_data_files_path"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_path_to_datafile"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_root_dir_with_all_resources"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_resource"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readahead_file_path"
+    argspec: "args=[\'path\', \'readahead\'], varargs=None, keywords=None, defaults=[\'128M\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56d76902fd0fe72ced6c0267295d9a9dc822a745
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.saved_model.builder.SavedModelBuilder"
+tf_class {
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_meta_graph"
+    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "add_meta_graph_and_variables"
+    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'as_text\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..adc697ad1c0bdd0c9b52be736fca3a19a2a82ef3
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.saved_model.builder"
+tf_module {
+  member {
+    name: "SavedModelBuilder"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20e10aa094f704f2168de37abb73f6edf6765f93
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.constants.pbtxt
@@ -0,0 +1,39 @@
+path: "tensorflow.saved_model.constants"
+tf_module {
+  member {
+    name: "ASSETS_DIRECTORY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "ASSETS_KEY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "LEGACY_INIT_OP_KEY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "MAIN_OP_KEY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_FILENAME_PB"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_FILENAME_PBTXT"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_SCHEMA_VERSION"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VARIABLES_DIRECTORY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "VARIABLES_FILENAME"
+    mtype: "<type \'str\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..896e2160c693039ab5582be13286f387c08d8f37
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.saved_model.loader"
+tf_module {
+  member_method {
+    name: "load"
+    argspec: "args=[\'sess\', \'tags\', \'export_dir\'], varargs=None, keywords=saver_kwargs, defaults=None"
+  }
+  member_method {
+    name: "maybe_saved_model_directory"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..176cb788c249e68f1221713e96c7e808c39c8f6d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.main_op.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.saved_model.main_op"
+tf_module {
+  member_method {
+    name: "main_op"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "main_op_with_restore"
+    argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5683766b28975a3a17da3cdbfbaa4e8baab5f3ba
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.saved_model"
+tf_module {
+  member {
+    name: "builder"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "constants"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "loader"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "main_op"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "signature_constants"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "signature_def_utils"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "tag_constants"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "utils"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..478d410e066b1ce3a17bb3ef9cc6e4503991ad0b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.signature_constants.pbtxt
@@ -0,0 +1,47 @@
+path: "tensorflow.saved_model.signature_constants"
+tf_module {
+  member {
+    name: "CLASSIFY_INPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CLASSIFY_METHOD_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CLASSIFY_OUTPUT_CLASSES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CLASSIFY_OUTPUT_SCORES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "DEFAULT_SERVING_SIGNATURE_DEF_KEY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT_INPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT_METHOD_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT_OUTPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_INPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_METHOD_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_OUTPUTS"
+    mtype: "<type \'str\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9867d84c3e509fab084a1e0fb955df4c4293fc7
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.signature_def_utils.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.saved_model.signature_def_utils"
+tf_module {
+  member_method {
+    name: "build_signature_def"
+    argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "classification_signature_def"
+    argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict_signature_def"
+    argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "regression_signature_def"
+    argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c24b7ad3cf38cfd949959d078e5d70838d0b2d9
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.saved_model.tag_constants"
+tf_module {
+  member {
+    name: "SERVING"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TRAINING"
+    mtype: "<type \'str\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc150e56a36ca22479cdd6a0563466ef6275e143
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.utils.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.saved_model.utils"
+tf_module {
+  member_method {
+    name: "build_tensor_info"
+    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/tensorflow.sets.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a196b1a556e283671cc75af28df3eaa62532975
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.sets.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.sets"
+tf_module {
+  member_method {
+    name: "set_difference"
+    argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "set_intersection"
+    argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "set_size"
+    argspec: "args=[\'a\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "set_union"
+    argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.spectral.pbtxt b/tensorflow/tools/api/golden/tensorflow.spectral.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84883c1a39564c71707710f7d267e39bcec6d5c2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.spectral.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.spectral"
+tf_module {
+  member_method {
+    name: "fft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "irfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ab3449d80f6108d83b721563427bd07d07a7104b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-event.pbtxt
@@ -0,0 +1,112 @@
+path: "tensorflow.summary.Event"
+tf_class {
+  is_instance: "<class \'tensorflow.core.util.event_pb2.Event\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FILE_VERSION_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_DEF_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "LOG_MESSAGE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "META_GRAPH_DEF_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SESSION_LOG_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STEP_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SUMMARY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TAGGED_RUN_METADATA_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "WALL_TIME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a5b63dceae3c0ac27b34c2e896ee3b90bbd7f75
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer-cache.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.summary.FileWriterCache"
+tf_class {
+  is_instance: "<class \'tensorflow.python.summary.writer.writer_cache.FileWriterCache\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "clear"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'logdir\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcf747971b7b8bf243502b2388da635705b8ee3e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
@@ -0,0 +1,50 @@
+path: "tensorflow.summary.FileWriter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.summary.writer.writer.FileWriter\'>"
+  is_instance: "<class \'tensorflow.python.summary.writer.writer.SummaryToEventTransformer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_event"
+    argspec: "args=[\'self\', \'event\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_graph"
+    argspec: "args=[\'self\', \'graph\', \'global_step\', \'graph_def\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_meta_graph"
+    argspec: "args=[\'self\', \'meta_graph_def\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_run_metadata"
+    argspec: "args=[\'self\', \'run_metadata\', \'tag\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_session_log"
+    argspec: "args=[\'self\', \'session_log\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_summary"
+    argspec: "args=[\'self\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_logdir"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reopen"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..92ca4872caf1c1ce7e19201b0a612c1a74ef59b0
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-session-log.pbtxt
@@ -0,0 +1,108 @@
+path: "tensorflow.summary.SessionLog"
+tf_class {
+  is_instance: "<class \'tensorflow.core.util.event_pb2.SessionLog\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "CHECKPOINT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "CHECKPOINT_PATH_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "MSG_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "START"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STATUS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STATUS_UNSPECIFIED"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "STOP"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SessionStatus"
+    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f93da2196adbc28524f93746a8e047b5c0f610d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary-description.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.summary.SummaryDescription"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.SummaryDescription\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "TYPE_HINT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..605e305e82cc3f4dd6a0bce68f846a43347a00e2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-audio.pbtxt
@@ -0,0 +1,96 @@
+path: "tensorflow.summary.Summary.Audio"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.Audio\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "CONTENT_TYPE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "ENCODED_AUDIO_STRING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "LENGTH_FRAMES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NUM_CHANNELS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SAMPLE_RATE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0646972196dc728b3f39aad07540aa7b6893ab88
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-image.pbtxt
@@ -0,0 +1,92 @@
+path: "tensorflow.summary.Summary.Image"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.Image\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "COLORSPACE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "ENCODED_IMAGE_STRING_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "HEIGHT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "WIDTH_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5294b37f5776a6e9bbe106bae152e97e0e2ca8d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary.-value.pbtxt
@@ -0,0 +1,108 @@
+path: "tensorflow.summary.Summary.Value"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.Value\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "AUDIO_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "HISTO_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "IMAGE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NODE_NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "OBSOLETE_OLD_STYLE_HISTOGRAM_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SIMPLE_VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TAG_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TENSOR_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..132ef1b7d2e933c3fe953ca2eb19b32133db8186
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-summary.pbtxt
@@ -0,0 +1,92 @@
+path: "tensorflow.summary.Summary"
+tf_class {
+  is_instance: "<class \'tensorflow.core.framework.summary_pb2.Summary\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "Audio"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "Image"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "Value"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4dce20819de06fb3a31d6b044a8c751c22da5c74
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-tagged-run-metadata.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.summary.TaggedRunMetadata"
+tf_class {
+  is_instance: "<class \'tensorflow.core.util.event_pb2.TaggedRunMetadata\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "RUN_METADATA_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TAG_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3d0bea10cb6c1d9c1ca37c010decab309f9a605
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.summary.pbtxt
@@ -0,0 +1,67 @@
+path: "tensorflow.summary"
+tf_module {
+  member {
+    name: "Event"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "FileWriter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FileWriterCache"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionLog"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "Summary"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "SummaryDescription"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "TaggedRunMetadata"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member_method {
+    name: "audio"
+    argspec: "args=[\'name\', \'tensor\', \'sample_rate\', \'max_outputs\', \'collections\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "get_summary_description"
+    argspec: "args=[\'node_def\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "histogram"
+    argspec: "args=[\'name\', \'values\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "image"
+    argspec: "args=[\'name\', \'tensor\', \'max_outputs\', \'collections\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
+  member_method {
+    name: "merge"
+    argspec: "args=[\'inputs\', \'collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "merge_all"
+    argspec: "args=[\'key\'], varargs=None, keywords=None, defaults=[\'summaries\'], "
+  }
+  member_method {
+    name: "scalar"
+    argspec: "args=[\'name\', \'tensor\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_summary"
+    argspec: "args=[\'name\', \'tensor\', \'summary_description\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "text"
+    argspec: "args=[\'name\', \'tensor\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02dec04b9ccdb4ddf38ffee6e3a81617245b123d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.sysconfig.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.sysconfig"
+tf_module {
+  member_method {
+    name: "get_include"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_lib"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..df528e26b60f8d8ddcc1eaf0ed292cc7ff0ebd94
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.test.-benchmark.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.test.Benchmark"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.benchmark.TensorFlowBenchmark\'>"
+  is_instance: "<class \'tensorflow.python.platform.benchmark.Benchmark\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "is_abstract"
+    argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "report_benchmark"
+    argspec: "args=[\'self\', \'iters\', \'cpu_time\', \'wall_time\', \'throughput\', \'extras\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "run_op_benchmark"
+    argspec: "args=[\'self\', \'sess\', \'op_or_tensor\', \'feed_dict\', \'burn_iters\', \'min_iters\', \'store_trace\', \'store_memory_usage\', \'name\', \'extras\', \'mbs\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'10\', \'False\', \'True\', \'None\', \'None\', \'0\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/tensorflow.test.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e717ad23713ef57e2f505ac7e1dc53b279a7b87
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.test.pbtxt
@@ -0,0 +1,55 @@
+path: "tensorflow.test"
+tf_module {
+  member {
+    name: "Benchmark"
+    mtype: "<class \'tensorflow.python.platform.benchmark._BenchmarkRegistrar\'>"
+  }
+  member {
+    name: "TestCase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "mock"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "assert_equal_graph_def"
+    argspec: "args=[\'actual\', \'expected\', \'checkpoint_v2\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "compute_gradient"
+    argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradient_error"
+    argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "create_local_cluster"
+    argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\'], varargs=None, keywords=None, defaults=[\'grpc\'], "
+  }
+  member_method {
+    name: "get_temp_dir"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "gpu_device_name"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_built_with_cuda"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_gpu_available"
+    argspec: "args=[\'cuda_only\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "main"
+    argspec: "args=[\'argv\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "test_src_dir_path"
+    argspec: "args=[\'relative_path\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8c91c5b4d9ef2436dd10a64c2adec261cd4dd282
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.AdadeltaOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.95\', \'1e-08\', \'False\', \'Adadelta\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05d38d62ccda3a336f3f31e682d619ec8515ad3d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.AdagradDAOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'global_step\', \'initial_gradient_squared_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'0.0\', \'0.0\', \'False\', \'AdagradDA\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19ca9f5763715ab6228db76033c80cbb9fbce499
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.AdagradOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'False\', \'Adagrad\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8144e2db78bf96b7969f71f4776b796f4fb454c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.AdamOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta1\', \'beta2\', \'epsilon\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-08\', \'False\', \'Adam\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cf52b817f342a3ccd8bcf5f4f532b886a318f23
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-bytes-list.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.train.BytesList"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.BytesList\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3037baa8c951ecd9b60267ee7cc8674ead88dbe
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.CheckpointSaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d3688e565761758e765d00086de8b59dcc3801b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-checkpoint-saver-listener.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.train.CheckpointSaverListener"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverListener\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..abbe273be32c6fd20b1a6464f3e99966bd3c8953
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-chief-session-creator.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.train.ChiefSessionCreator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.monitored_session.ChiefSessionCreator\'>"
+  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'checkpoint_filename_with_path\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "create_session"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..93ff856b09de15f12954bb11802a935b82c1d278
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-cluster-def.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.train.ClusterDef"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.cluster_pb2.ClusterDef\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "JOB_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1658b15a5f82167f9167338145b479c9e9197ea5
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-cluster-spec.pbtxt
@@ -0,0 +1,37 @@
+path: "tensorflow.train.ClusterSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.server_lib.ClusterSpec\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "jobs"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'cluster\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_cluster_def"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_dict"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "job_tasks"
+    argspec: "args=[\'self\', \'job_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "num_tasks"
+    argspec: "args=[\'self\', \'job_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "task_address"
+    argspec: "args=[\'self\', \'job_name\', \'task_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "task_indices"
+    argspec: "args=[\'self\', \'job_name\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11277f077eef830aec3be61ddd981bfd3a55d149
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-coordinator.pbtxt
@@ -0,0 +1,45 @@
+path: "tensorflow.train.Coordinator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.coordinator.Coordinator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "joined"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'clean_stop_exception_types\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "clear_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "join"
+    argspec: "args=[\'self\', \'threads\', \'stop_grace_period_secs\', \'ignore_live_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'120\', \'False\'], "
+  }
+  member_method {
+    name: "raise_requested_exception"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "register_thread"
+    argspec: "args=[\'self\', \'thread\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "request_stop"
+    argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "should_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "stop_on_exception"
+    argspec: "args=[], varargs=args, keywords=kwds, defaults=None"
+  }
+  member_method {
+    name: "wait_for_stop"
+    argspec: "args=[\'self\', \'timeout\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f7215a20372e981a2fb20f20d9e4cfa43973c7cc
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-example.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.train.Example"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.example_pb2.Example\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FEATURES_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..737acbe07c93da30b4a206cbdae2efcbc2cb2159
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.train.ExponentialMovingAverage"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.moving_averages.ExponentialMovingAverage\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'decay\', \'num_updates\', \'zero_debias\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'ExponentialMovingAverage\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "average"
+    argspec: "args=[\'self\', \'var\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "average_name"
+    argspec: "args=[\'self\', \'var\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables_to_restore"
+    argspec: "args=[\'self\', \'moving_avg_variables\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ad98354d69453d6f66a858991d4a19e2525d1e0
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-feature-list.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.train.FeatureList"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.FeatureList\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FEATURE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd171f4ca3ef1e48848be1bd71f8a56685534b8c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.-feature-list-entry.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.train.FeatureLists.FeatureListEntry"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.FeatureListEntry\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "KEY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3d95017d584ad95f96a54ef52a966aa6f2a69a58
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-feature-lists.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.train.FeatureLists"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.FeatureLists\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FEATURE_LIST_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "FeatureListEntry"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cca132bba91c46398c2fecb4ff7b45bd5ed2af2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-feature.pbtxt
@@ -0,0 +1,88 @@
+path: "tensorflow.train.Feature"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.Feature\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "BYTES_LIST_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FLOAT_LIST_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INT64_LIST_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..858aee03415dead500cdb450f5885a904f620221
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-features.-feature-entry.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.train.Features.FeatureEntry"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.FeatureEntry\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "KEY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49cd12153bf3078eb1e68cfd6efad6e2673439f4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-features.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.train.Features"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.Features\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FEATURE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "FeatureEntry"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bec4d032cedc0711ca07049d5d04490e8bc3f30
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-feed-fn-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.FeedFnHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FeedFnHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feed_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31cf9aaeb2c640f8db205c0753f20acc75338fe0
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-final-ops-hook.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.train.FinalOpsHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FinalOpsHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "final_ops_values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'final_ops\', \'final_ops_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e3f01334b547feef87d07166eb3784659c41d542
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-float-list.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.train.FloatList"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.FloatList\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2dc11df57b60b15a797b1866743b27ea1068624e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.FtrlOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\', \'accum_name\', \'linear_name\'], varargs=None, keywords=None, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'False\', \'Ftrl\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..147448618e2df9f71ac794e369b108629e10ce0a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-global-step-waiter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.GlobalStepWaiterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.GlobalStepWaiterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'wait_until_step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdd4c525685f86f2a57aa7fcbb78b659ee88ba74
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.GradientDescentOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'GradientDescent\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8917dc122cfd0b0a7de0a3a74da3c45104d9eaff
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-int64-list.pbtxt
@@ -0,0 +1,80 @@
+path: "tensorflow.train.Int64List"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.feature_pb2.Int64List\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac6d81541a43e934ebd131afe07be0bd6e427a7b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-job-def.-tasks-entry.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.train.JobDef.TasksEntry"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.cluster_pb2.TasksEntry\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "KEY_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VALUE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce34537fa13b92f7900128d769ac3161d2b4d287
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-job-def.pbtxt
@@ -0,0 +1,88 @@
+path: "tensorflow.train.JobDef"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.cluster_pb2.JobDef\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TASKS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TasksEntry"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e55c47b3567d42ecc9a0adf7254feb6487834e1b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-logging-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.LoggingTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.LoggingTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tensors\', \'every_n_iter\', \'every_n_secs\', \'formatter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c61859004e897a14b580dc0b55957edfa6ae6860
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-looper-thread.pbtxt
@@ -0,0 +1,73 @@
+path: "tensorflow.train.LooperThread"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.coordinator.LooperThread\'>"
+  is_instance: "<class \'threading.Thread\'>"
+  member {
+    name: "daemon"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ident"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "getName"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "isAlive"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "isDaemon"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_alive"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "join"
+    argspec: "args=[\'self\', \'timeout\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "loop"
+    argspec: "args=[\'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "run"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_loop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "setDaemon"
+    argspec: "args=[\'self\', \'daemonic\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "setName"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "start"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "start_loop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "stop_loop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cf5488a15e4832bfda4324739e97f9f5466fe2a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.MomentumOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'use_locking\', \'name\', \'use_nesterov\'], varargs=None, keywords=None, defaults=[\'False\', \'Momentum\', \'False\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3a5cc015b4d5a0ca3487764787bc877716d9fedc
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-monitored-session.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.train.MonitoredSession"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.monitored_session.MonitoredSession\'>"
+  is_instance: "<class \'tensorflow.python.training.monitored_session._MonitoredSession\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'session_creator\', \'hooks\', \'stop_grace_period_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'120\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run"
+    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "should_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25fd5e75a79f6e4fe2cf77ebc7aa0d1fef759e7f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-nan-loss-during-training-error.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.train.NanLossDuringTrainingError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError\'>"
+  is_instance: "<type \'exceptions.RuntimeError\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "message"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d1c89f9b37b5e63ecf2cf766986cb8faa5872c4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-nan-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.NanTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'loss_tensor\', \'fail_on_nan_loss\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20b0c4d1b565aaba30cd440a7a5480291631a89b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
@@ -0,0 +1,45 @@
+path: "tensorflow.train.Optimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..571d846b6c5abf53a7570f996c8e59581680adbf
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.ProximalAdagradOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'0.0\', \'0.0\', \'False\', \'ProximalAdagrad\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1feb136e7f70f0d41c79eeee03fff3663bb4c643
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.ProximalGradientDescentOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.0\', \'False\', \'ProximalGradientDescent\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d84d0058eea34d2d4413c8b1a09bd7d5720c07f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-queue-runner.pbtxt
@@ -0,0 +1,49 @@
+path: "tensorflow.train.QueueRunner"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.queue_runner_impl.QueueRunner\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "cancel_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "close_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "enqueue_ops"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "exceptions_raised"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_closed_exception_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'queue\', \'enqueue_ops\', \'close_op\', \'cancel_op\', \'queue_closed_exception_types\', \'queue_runner_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "create_threads"
+    argspec: "args=[\'self\', \'sess\', \'coord\', \'daemon\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "from_proto"
+    argspec: "args=[\'queue_runner_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_proto"
+    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2aa4ae6d2d20af16eee5ad7dcce84d81b97d8300
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
@@ -0,0 +1,46 @@
+path: "tensorflow.train.RMSPropOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'decay\', \'momentum\', \'epsilon\', \'use_locking\', \'centered\', \'name\'], varargs=None, keywords=None, defaults=[\'0.9\', \'0.0\', \'1e-10\', \'False\', \'False\', \'RMSProp\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84498a64f5b04526e989ec03f1894dcea19d850e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-saver-def.pbtxt
@@ -0,0 +1,120 @@
+path: "tensorflow.train.SaverDef"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.saver_pb2.SaverDef\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "CheckpointFormatVersion"
+    mtype: "<class \'google.protobuf.internal.enum_type_wrapper.EnumTypeWrapper\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FILENAME_TENSOR_NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "KEEP_CHECKPOINT_EVERY_N_HOURS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "LEGACY"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "MAX_TO_KEEP_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "RESTORE_OP_NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SAVE_TENSOR_NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SHARDED_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "V1"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "V2"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VERSION_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7494fe1cc8446d55a460ccb98d99eb9c035bf608
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-saver.pbtxt
@@ -0,0 +1,53 @@
+path: "tensorflow.train.Saver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.saver.Saver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "last_checkpoints"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'var_list\', \'reshape\', \'sharded\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\', \'name\', \'restore_sequentially\', \'saver_def\', \'builder\', \'defer_build\', \'allow_empty\', \'write_version\', \'pad_step_number\', \'save_relative_paths\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\', \'5\', \'10000.0\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'2\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "as_saver_def"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "export_meta_graph"
+    argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "from_proto"
+    argspec: "args=[\'saver_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "recover_last_checkpoints"
+    argspec: "args=[\'self\', \'checkpoint_paths\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "restore"
+    argspec: "args=[\'self\', \'sess\', \'save_path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\'], "
+  }
+  member_method {
+    name: "set_last_checkpoints"
+    argspec: "args=[\'self\', \'last_checkpoints\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_last_checkpoints_with_time"
+    argspec: "args=[\'self\', \'last_checkpoints_with_time\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_proto"
+    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..62b956c5ef7dc54e92431f25ec948e341c0e1f24
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-scaffold.pbtxt
@@ -0,0 +1,49 @@
+path: "tensorflow.train.Scaffold"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.monitored_session.Scaffold\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "init_feed_dict"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "init_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "init_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "local_init_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ready_for_local_init_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ready_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "saver"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "summary_op"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\', \'copy_from_scaffold\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_or_default"
+    argspec: "args=[\'arg_name\', \'collection_key\', \'default_constructor\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45528d4e87ebaa0a9c5fcf5d8a135774481b503f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-second-or-step-timer.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.train.SecondOrStepTimer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SecondOrStepTimer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_secs\', \'every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "last_triggered_step"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "should_trigger_for_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_last_triggered_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ab95537021167f368d3a8f6b1e1ec1a3996aa88
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-sequence-example.pbtxt
@@ -0,0 +1,84 @@
+path: "tensorflow.train.SequenceExample"
+tf_class {
+  is_instance: "<class \'tensorflow.core.example.example_pb2.SequenceExample\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "CONTEXT_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "FEATURE_LISTS_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af0a3b73cc2ff3510e9a0426c28696fe51097f9d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-server-def.pbtxt
@@ -0,0 +1,96 @@
+path: "tensorflow.train.ServerDef"
+tf_class {
+  is_instance: "<class \'tensorflow.core.protobuf.tensorflow_server_pb2.ServerDef\'>"
+  is_instance: "<type \'google.protobuf.pyext._message.CMessage\'>"
+  member {
+    name: "CLUSTER_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DEFAULT_SESSION_CONFIG_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "DESCRIPTOR"
+    mtype: "<type \'google.protobuf.pyext._message.MessageDescriptor\'>"
+  }
+  member {
+    name: "Extensions"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member {
+    name: "JOB_NAME_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "PROTOCOL_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TASK_INDEX_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "ByteSize"
+  }
+  member_method {
+    name: "Clear"
+  }
+  member_method {
+    name: "ClearExtension"
+  }
+  member_method {
+    name: "ClearField"
+  }
+  member_method {
+    name: "CopyFrom"
+  }
+  member_method {
+    name: "DiscardUnknownFields"
+  }
+  member_method {
+    name: "FindInitializationErrors"
+  }
+  member_method {
+    name: "FromString"
+  }
+  member_method {
+    name: "HasExtension"
+  }
+  member_method {
+    name: "HasField"
+  }
+  member_method {
+    name: "IsInitialized"
+  }
+  member_method {
+    name: "ListFields"
+  }
+  member_method {
+    name: "MergeFrom"
+  }
+  member_method {
+    name: "MergeFromString"
+  }
+  member_method {
+    name: "ParseFromString"
+  }
+  member_method {
+    name: "RegisterExtension"
+  }
+  member_method {
+    name: "SerializePartialToString"
+  }
+  member_method {
+    name: "SerializeToString"
+  }
+  member_method {
+    name: "SetInParent"
+  }
+  member_method {
+    name: "WhichOneof"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b8f185f5b699e860c6fbb50b8d2912984908982
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-server.pbtxt
@@ -0,0 +1,29 @@
+path: "tensorflow.train.Server"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.server_lib.Server\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "server_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "target"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'server_or_cluster_def\', \'job_name\', \'task_index\', \'protocol\', \'config\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "create_local_server"
+    argspec: "args=[\'config\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
+  }
+  member_method {
+    name: "join"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "start"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..beb232715f725047dd8c03054b899a90fa81eec2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-session-creator.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.train.SessionCreator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "create_session"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc31bb4e4b396917a00d1162125b6d2e47343322
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-session-manager.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.train.SessionManager"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_manager.SessionManager\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'local_init_op\', \'ready_op\', \'ready_for_local_init_op\', \'graph\', \'recovery_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'30\'], "
+  }
+  member_method {
+    name: "prepare_session"
+    argspec: "args=[\'self\', \'master\', \'init_op\', \'saver\', \'checkpoint_dir\', \'checkpoint_filename_with_path\', \'wait_for_checkpoint\', \'max_wait_secs\', \'config\', \'init_feed_dict\', \'init_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'7200\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "recover_session"
+    argspec: "args=[\'self\', \'master\', \'saver\', \'checkpoint_dir\', \'checkpoint_filename_with_path\', \'wait_for_checkpoint\', \'max_wait_secs\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'7200\', \'None\'], "
+  }
+  member_method {
+    name: "wait_for_session"
+    argspec: "args=[\'self\', \'master\', \'config\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'inf\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..442990893e33c92bd05a72b198a6584bc979b2fe
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-session-run-args.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.train.SessionRunArgs"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "feed_dict"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "fetches"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d5adb15c95f8a6ebde4ca0e0c535dfebc5edfbf2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-session-run-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.train.SessionRunContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "original_args"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stop_requested"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'original_args\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "request_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db1aa24acf0e295b4b787eef68250401dd6a6e27
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-session-run-hook.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.train.SessionRunHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b401d59c400f1d08f47daa2d264a9a5bfc91538
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-session-run-values.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.train.SessionRunValues"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "results"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "run_metadata"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..62bfdab40bb83c634e101388ecb69da1233c60f9
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-singular-monitored-session.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.SingularMonitoredSession"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.monitored_session.SingularMonitoredSession\'>"
+  is_instance: "<class \'tensorflow.python.training.monitored_session._MonitoredSession\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'hooks\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'stop_grace_period_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'None\', \'None\', \'120\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "raw_session"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run"
+    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "should_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13261f6dde1cf8e6fd228950600303370947b7ea
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-step-counter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.StepCounterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StepCounterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_n_steps\', \'every_n_secs\', \'output_dir\', \'summary_writer\'], varargs=None, keywords=None, defaults=[\'100\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e388599b0bf63379fa95a3276e3f4859eab86d6d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-stop-at-step-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.StopAtStepHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StopAtStepHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_steps\', \'last_step\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..697c3667b09f42f208dec38938f5a1ce0cc09029
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-summary-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.train.SummarySaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SummarySaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'summary_writer\', \'scaffold\', \'summary_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc9bd5c136bcedd6345a64db165ff6e847b20d3a
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-supervisor.pbtxt
@@ -0,0 +1,153 @@
+path: "tensorflow.train.Supervisor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.supervisor.Supervisor\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "USE_DEFAULT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "coord"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "global_step"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "init_feed_dict"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "init_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_chief"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ready_for_local_init_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ready_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_model_secs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_path"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "save_summaries_secs"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "saver"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session_manager"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "summary_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "summary_writer"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "Loop"
+    argspec: "args=[\'self\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "PrepareSession"
+    argspec: "args=[\'self\', \'master\', \'config\', \'wait_for_checkpoint\', \'max_wait_secs\', \'start_standard_services\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'False\', \'7200\', \'True\'], "
+  }
+  member_method {
+    name: "RequestStop"
+    argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ShouldStop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StartQueueRunners"
+    argspec: "args=[\'self\', \'sess\', \'queue_runners\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "StartStandardServices"
+    argspec: "args=[\'self\', \'sess\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Stop"
+    argspec: "args=[\'self\', \'threads\', \'close_summary_writer\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
+  }
+  member_method {
+    name: "StopOnException"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SummaryComputed"
+    argspec: "args=[\'self\', \'sess\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "WaitForStop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'graph\', \'ready_op\', \'ready_for_local_init_op\', \'is_chief\', \'init_op\', \'init_feed_dict\', \'local_init_op\', \'logdir\', \'summary_op\', \'saver\', \'global_step\', \'save_summaries_secs\', \'save_model_secs\', \'recovery_wait_secs\', \'stop_grace_secs\', \'checkpoint_basename\', \'session_manager\', \'summary_writer\', \'init_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'0\', \'True\', \'0\', \'None\', \'0\', \'None\', \'0\', \'0\', \'0\', \'120\', \'600\', \'30\', \'120\', \'model.ckpt\', \'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "loop"
+    argspec: "args=[\'self\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "managed_session"
+    argspec: "args=[], varargs=args, keywords=kwds, defaults=None"
+  }
+  member_method {
+    name: "prepare_or_wait_for_session"
+    argspec: "args=[\'self\', \'master\', \'config\', \'wait_for_checkpoint\', \'max_wait_secs\', \'start_standard_services\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'False\', \'7200\', \'True\'], "
+  }
+  member_method {
+    name: "request_stop"
+    argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "should_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "start_queue_runners"
+    argspec: "args=[\'self\', \'sess\', \'queue_runners\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "start_standard_services"
+    argspec: "args=[\'self\', \'sess\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "stop"
+    argspec: "args=[\'self\', \'threads\', \'close_summary_writer\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
+  }
+  member_method {
+    name: "stop_on_exception"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary_computed"
+    argspec: "args=[\'self\', \'sess\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "wait_for_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..915d8501af0ac238b0eb6afd200d9f7c0c432a85
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.train.SyncReplicasOptimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
+  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "GATE_GRAPH"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_NONE"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GATE_OP"
+    mtype: "<type \'int\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'opt\', \'replicas_to_aggregate\', \'total_num_replicas\', \'variable_averages\', \'variables_to_average\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'sync_replicas\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_gradients"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "get_chief_queue_runner"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_init_tokens_op"
+    argspec: "args=[\'self\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "get_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "make_session_run_hook"
+    argspec: "args=[\'self\', \'is_chief\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..140407651a9827c7250c9008e5eb46122bb4e5f0
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.train.WorkerSessionCreator"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.monitored_session.WorkerSessionCreator\'>"
+  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "create_session"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00c4f7709a28ef844b58f5525f316589d5f7c651
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
@@ -0,0 +1,407 @@
+path: "tensorflow.train"
+tf_module {
+  member {
+    name: "AdadeltaOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "AdagradDAOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "AdagradOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "AdamOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BytesList"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "CheckpointSaverHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CheckpointSaverListener"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ChiefSessionCreator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ClusterDef"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "ClusterSpec"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Coordinator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Example"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "ExponentialMovingAverage"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Feature"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "FeatureList"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "FeatureLists"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "Features"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "FeedFnHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FinalOpsHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FloatList"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "FtrlOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlobalStepWaiterHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GradientDescentOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Int64List"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "JobDef"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "LoggingTensorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LooperThread"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MomentumOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MonitoredSession"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NanLossDuringTrainingError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NanTensorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Optimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ProximalAdagradOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ProximalGradientDescentOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "QueueRunner"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RMSPropOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Saver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SaverDef"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "Scaffold"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SecondOrStepTimer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SequenceExample"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "Server"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ServerDef"
+    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  }
+  member {
+    name: "SessionCreator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionManager"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunArgs"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunValues"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SingularMonitoredSession"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StepCounterHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StopAtStepHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SummarySaverHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Supervisor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SyncReplicasOptimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "WorkerSessionCreator"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "queue_runner"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "MonitoredTrainingSession"
+    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'600\', \'100\', \'None\', \'None\', \'120\', \'100\'], "
+  }
+  member_method {
+    name: "NewCheckpointReader"
+    argspec: "args=[\'filepattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_queue_runner"
+    argspec: "args=[\'qr\', \'collection\'], varargs=None, keywords=None, defaults=[\'queue_runners\'], "
+  }
+  member_method {
+    name: "assert_global_step"
+    argspec: "args=[\'global_step_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "basic_train_loop"
+    argspec: "args=[\'supervisor\', \'train_step_fn\', \'args\', \'kwargs\', \'master\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\'], "
+  }
+  member_method {
+    name: "batch"
+    argspec: "args=[\'tensors\', \'batch_size\', \'num_threads\', \'capacity\', \'enqueue_many\', \'shapes\', \'dynamic_pad\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'32\', \'False\', \'None\', \'False\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_join"
+    argspec: "args=[\'tensors_list\', \'batch_size\', \'capacity\', \'enqueue_many\', \'shapes\', \'dynamic_pad\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'False\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "checkpoint_exists"
+    argspec: "args=[\'checkpoint_prefix\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "create_global_step"
+    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "do_quantize_training_on_graphdef"
+    argspec: "args=[\'input_graph\', \'num_bits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "exponential_decay"
+    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_meta_graph"
+    argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "generate_checkpoint_state_proto"
+    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_checkpoint_mtimes"
+    argspec: "args=[\'checkpoint_prefixes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_checkpoint_state"
+    argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_global_step"
+    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_or_create_global_step"
+    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "global_step"
+    argspec: "args=[\'sess\', \'global_step_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "import_meta_graph"
+    argspec: "args=[\'meta_graph_or_file\', \'clear_devices\', \'import_scope\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "input_producer"
+    argspec: "args=[\'input_tensor\', \'element_shape\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'summary_name\', \'name\', \'cancel_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\', \'32\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "inverse_time_decay"
+    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "limit_epochs"
+    argspec: "args=[\'tensor\', \'num_epochs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "match_filenames_once"
+    argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "maybe_batch"
+    argspec: "args=[\'tensors\', \'keep_input\', \'batch_size\', \'num_threads\', \'capacity\', \'enqueue_many\', \'shapes\', \'dynamic_pad\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'32\', \'False\', \'None\', \'False\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "maybe_batch_join"
+    argspec: "args=[\'tensors_list\', \'keep_input\', \'batch_size\', \'capacity\', \'enqueue_many\', \'shapes\', \'dynamic_pad\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'32\', \'False\', \'None\', \'False\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "maybe_shuffle_batch"
+    argspec: "args=[\'tensors\', \'batch_size\', \'capacity\', \'min_after_dequeue\', \'keep_input\', \'num_threads\', \'seed\', \'enqueue_many\', \'shapes\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "maybe_shuffle_batch_join"
+    argspec: "args=[\'tensors_list\', \'batch_size\', \'capacity\', \'min_after_dequeue\', \'keep_input\', \'seed\', \'enqueue_many\', \'shapes\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "natural_exp_decay"
+    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "piecewise_constant"
+    argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "polynomial_decay"
+    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "range_input_producer"
+    argspec: "args=[\'limit\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'32\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "replica_device_setter"
+    argspec: "args=[\'ps_tasks\', \'ps_device\', \'worker_device\', \'merge_devices\', \'cluster\', \'ps_ops\', \'ps_strategy\'], varargs=None, keywords=None, defaults=[\'0\', \'/job:ps\', \'/job:worker\', \'True\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "sdca_fprint"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sdca_optimizer"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\', \'adaptative\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "sdca_shrink_l1"
+    argspec: "args=[\'weights\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "shuffle_batch"
+    argspec: "args=[\'tensors\', \'batch_size\', \'capacity\', \'min_after_dequeue\', \'num_threads\', \'seed\', \'enqueue_many\', \'shapes\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "shuffle_batch_join"
+    argspec: "args=[\'tensors_list\', \'batch_size\', \'capacity\', \'min_after_dequeue\', \'seed\', \'enqueue_many\', \'shapes\', \'allow_smaller_final_batch\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "slice_input_producer"
+    argspec: "args=[\'tensor_list\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'32\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "start_queue_runners"
+    argspec: "args=[\'sess\', \'coord\', \'daemon\', \'start\', \'collection\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'True\', \'queue_runners\'], "
+  }
+  member_method {
+    name: "string_input_producer"
+    argspec: "args=[\'string_tensor\', \'num_epochs\', \'shuffle\', \'seed\', \'capacity\', \'shared_name\', \'name\', \'cancel_op\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'32\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "summary_iterator"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_checkpoint_state"
+    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "write_graph"
+    argspec: "args=[\'graph_or_graph_def\', \'logdir\', \'name\', \'as_text\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23d402de30888c1c503a3971cefa1167af3bc8c6
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.queue_runner.-queue-runner.pbtxt
@@ -0,0 +1,49 @@
+path: "tensorflow.train.queue_runner.QueueRunner"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.queue_runner_impl.QueueRunner\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "cancel_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "close_op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "enqueue_ops"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "exceptions_raised"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_closed_exception_types"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'queue\', \'enqueue_ops\', \'close_op\', \'cancel_op\', \'queue_closed_exception_types\', \'queue_runner_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "create_threads"
+    argspec: "args=[\'self\', \'sess\', \'coord\', \'daemon\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\'], "
+  }
+  member_method {
+    name: "from_proto"
+    argspec: "args=[\'queue_runner_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_proto"
+    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e2d04304967dd08d2c389c209dd43c731c5f956
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.train.queue_runner.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.train.queue_runner"
+tf_module {
+  member {
+    name: "QueueRunner"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "add_queue_runner"
+    argspec: "args=[\'qr\', \'collection\'], varargs=None, keywords=None, defaults=[\'queue_runners\'], "
+  }
+  member_method {
+    name: "start_queue_runners"
+    argspec: "args=[\'sess\', \'coord\', \'daemon\', \'start\', \'collection\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'True\', \'queue_runners\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c1e1c230a9f79e87294eb6038f870726a0ba85a4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.truncated_normal_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.truncated_normal_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e1b18dc92fbee9565dba81e8c09534bea6734f23
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.uniform_unit_scaling_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.uniform_unit_scaling_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.UniformUnitScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'factor\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt b/tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e229b02ceec6739974d3b4ae2bb02ef273398c45
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.zeros_initializer.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.zeros_initializer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/lib/BUILD b/tensorflow/tools/api/lib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cdfa0e7be524e3bb4ec039ac19bea72747afb58c
--- /dev/null
+++ b/tensorflow/tools/api/lib/BUILD
@@ -0,0 +1,39 @@
+# Helper libraries for TensorFlow API compatibility test.
+
+package(
+    default_visibility = ["//tensorflow/tools/api:__subpackages__"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library",
+)
+
+tf_proto_library(
+    name = "api_objects_proto",
+    srcs = ["api_objects.proto"],
+)
+
+py_library(
+    name = "python_object_to_proto_visitor",
+    srcs = ["python_object_to_proto_visitor.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":api_objects_proto_py",
+        "//tensorflow/tools/common:traverse",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/tools/api/lib/api_objects.proto b/tensorflow/tools/api/lib/api_objects.proto
new file mode 100644
index 0000000000000000000000000000000000000000..0966a5f1d530ecd70c9e904c12816f0aa33b3ada
--- /dev/null
+++ b/tensorflow/tools/api/lib/api_objects.proto
@@ -0,0 +1,31 @@
+syntax = "proto2";
+
+package third_party.tensorflow.tools.api;
+
+message TFAPIMember {
+  optional string name = 1;
+  optional string mtype = 2;
+};
+
+message TFAPIMethod {
+  optional string name = 1;
+  optional string path = 2;
+  optional string argspec = 3;
+};
+
+message TFAPIModule {
+  repeated TFAPIMember member = 1;
+  repeated TFAPIMethod member_method = 2;
+};
+
+message TFAPIClass {
+  repeated string is_instance = 1;
+  repeated TFAPIMember member = 2;
+  repeated TFAPIMethod member_method = 3;
+};
+
+message TFAPIObject {
+  optional string path = 1;
+  optional TFAPIModule tf_module = 2;
+  optional TFAPIClass tf_class = 3;
+};
diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
new file mode 100644
index 0000000000000000000000000000000000000000..34edbf61f5e73bfdfd33bdafc8ef8d3e0c0e15e6
--- /dev/null
+++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
@@ -0,0 +1,166 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+"""A visitor class that generates protobufs for each pyton object."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+from tensorflow.tools.api.lib import api_objects_pb2
+
+# Following object need to be handled individually.
+_CORNER_CASES = {
+    '': {'tools': {}},
+    'test.TestCase': {},
+    'test.TestCase.failureException': {},
+}
+
+
+def _SanitizedArgSpec(obj):
+  """Get an ArgSpec string that is free of addresses.
+
+  We have callables as function arg defaults. This results in addresses in
+  getargspec output. This function returns a sanitized string list of base
+  classes.
+
+  Args:
+    obj: A python routine for us the create the sanitized arspec of.
+
+  Returns:
+    string, a string representation of the argspec.
+  """
+  output_string = ''
+  unsanitized_arg_spec = tf_inspect.getargspec(obj)
+
+  for clean_attr in ('args', 'varargs', 'keywords'):
+    output_string += '%s=%s, ' % (clean_attr,
+                                  getattr(unsanitized_arg_spec, clean_attr))
+
+  if unsanitized_arg_spec.defaults:
+    sanitized_defaults = []
+    for val in unsanitized_arg_spec.defaults:
+      str_val = str(val)
+      if ' object at 0x' in str_val:
+        sanitized_defaults.append('%s instance>' % str_val.split(' at ')[0])
+      else:
+        sanitized_defaults.append(str_val)
+
+    output_string += 'defaults=%s, ' % sanitized_defaults
+
+  else:
+    output_string += 'defaults=None'
+
+  return output_string
+
+
+def _SanitizedMRO(obj):
+  """Get a list of superclasses with minimal amount of non-TF classes.
+
+  Based on many parameters like python version, OS, protobuf implementation
+  or changes in google core libraries the list of superclasses of a class
+  can change. We only return the first non-TF class to be robust to non API
+  affecting changes. The Method Resolution Order returned by `tf_inspect.getmro`
+  is still maintained in the return value.
+
+  Args:
+    obj: A python routine for us the create the sanitized arspec of.
+
+  Returns:
+    list of strings, string representation of the class names.
+  """
+  return_list = []
+  for cls in tf_inspect.getmro(obj):
+    str_repr = str(cls)
+    return_list.append(str_repr)
+    if 'tensorflow' not in str_repr:
+      break
+
+  return return_list
+
+
+class PythonObjectToProtoVisitor(object):
+  """A visitor that summarizes given python objects as protobufs."""
+
+  def __init__(self):
+    # A dict to store all protocol buffers.
+    # Keyed by "path" to the object.
+    self._protos = {}
+
+  def GetProtos(self):
+    """Return the list of protos stored."""
+    return self._protos
+
+  def __call__(self, path, parent, children):
+    # The path to the object.
+    lib_path = 'tensorflow.%s' % path if path else 'tensorflow'
+
+    # A small helper method to construct members(children) protos.
+    def _AddMember(member_name, member_obj, proto):
+      """Add the child object to the object being constructed."""
+      _, member_obj = tf_decorator.unwrap(member_obj)
+      if member_name == '__init__' or not member_name.startswith('_'):
+        if tf_inspect.isroutine(member_obj):
+          new_method = proto.member_method.add()
+          new_method.name = member_name
+          # If member_obj is a python builtin, there is no way to get its
+          # argspec, because it is implemented on the C side. It also has no
+          # func_code.
+          if getattr(member_obj, 'func_code', None):
+            new_method.argspec = _SanitizedArgSpec(member_obj)
+        else:
+          new_member = proto.member.add()
+          new_member.name = member_name
+          new_member.mtype = str(type(member_obj))
+
+    parent_corner_cases = _CORNER_CASES.get(path, {})
+
+    if path not in _CORNER_CASES or parent_corner_cases:
+      # Decide if we have a module or a class.
+      if tf_inspect.ismodule(parent):
+        # Create a module object.
+        module_obj = api_objects_pb2.TFAPIModule()
+        for name, child in children:
+          if name in parent_corner_cases:
+            # If we have an empty entry, skip this object.
+            if parent_corner_cases[name]:
+              module_obj.member.add(**(parent_corner_cases[name]))
+          else:
+            _AddMember(name, child, module_obj)
+
+        # Store the constructed module object.
+        self._protos[lib_path] = api_objects_pb2.TFAPIObject(
+            path=lib_path, tf_module=module_obj)
+      elif tf_inspect.isclass(parent):
+        # Construct a class.
+        class_obj = api_objects_pb2.TFAPIClass()
+        class_obj.is_instance.extend(_SanitizedMRO(parent))
+        for name, child in children:
+          if name in parent_corner_cases:
+            # If we have an empty entry, skip this object.
+            if parent_corner_cases[name]:
+              module_obj.member.add(**(parent_corner_cases[name]))
+          else:
+            _AddMember(name, child, class_obj)
+
+        # Store the constructed class object.
+        self._protos[lib_path] = api_objects_pb2.TFAPIObject(
+            path=lib_path, tf_class=class_obj)
+      else:
+        logging.error('Illegal call to ApiProtoDump::_py_obj_to_proto.'
+                      'Object is neither a module nor a class: %s', path)
diff --git a/tensorflow/tools/api/tests/API_UPDATE_WARNING.txt b/tensorflow/tools/api/tests/API_UPDATE_WARNING.txt
new file mode 100644
index 0000000000000000000000000000000000000000..54b0cfcb3c1a45d422dc5537f17bb15dd2ed081b
--- /dev/null
+++ b/tensorflow/tools/api/tests/API_UPDATE_WARNING.txt
@@ -0,0 +1,7 @@
+Golden file update requested!
+All test failures have been skipped, see the logs for detected diffs.
+This test is now going to write new golden files.
+Make sure to package the updates together with your change.
+
+You will need an explicit API approval. This may take longer than a normal
+review.
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bfee211dca4891f2e112972a56753fc3f1418f5a
--- /dev/null
+++ b/tensorflow/tools/api/tests/BUILD
@@ -0,0 +1,43 @@
+# TensorFlow API backwards compatibility tests.
+
+package(
+    default_visibility = ["//tensorflow/tools/api:__subpackages__"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files([
+    "README.txt",
+    "API_UPDATE_WARNING.txt",
+])
+
+py_test(
+    name = "api_compatibility_test",
+    srcs = ["api_compatibility_test.py"],
+    data = [
+        "//tensorflow/tools/api/golden:api_golden",
+        "//tensorflow/tools/api/tests:API_UPDATE_WARNING.txt",
+        "//tensorflow/tools/api/tests:README.txt",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/tools/api/lib:python_object_to_proto_visitor",
+        "//tensorflow/tools/common:public_api",
+        "//tensorflow/tools/common:traverse",
+        "@protobuf//:protobuf_python",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/tools/api/tests/README.txt b/tensorflow/tools/api/tests/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3463eeec6fe30aeb47a14b61c605ecc305eecff2
--- /dev/null
+++ b/tensorflow/tools/api/tests/README.txt
@@ -0,0 +1,13 @@
+TensorFlow API backwards compatibility test
+This test ensures all changes to the public API of TensorFlow are intended.
+
+If this test fails, it means a change has been made to the public API. Backwards
+incompatible changes are not allowed. You can run the test as follows to update
+test goldens and package them with your change.
+
+    $ bazel build tensorflow/tools/api/tests:api_compatibility_test
+    $ bazel-bin/tensorflow/tools/api/tests/api_compatibility_test \
+          --update_goldens True
+
+You will need an API approval to make changes to the public TensorFlow API. This
+includes additions to the API.
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..acd119c7305e8f0e93f57685241050560564b033
--- /dev/null
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -0,0 +1,242 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+"""TensorFlow API compatibility tests.
+
+This test ensures all changes to the public API of TensorFlow are intended.
+
+If this test fails, it means a change has been made to the public API. Backwards
+incompatible changes are not allowed. You can run the test with
+"--update_goldens" flag set to "True" to update goldens when making changes to
+the public TF python API.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import re
+import sys
+import unittest
+
+import tensorflow as tf
+
+from google.protobuf import text_format
+
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.tools.api.lib import api_objects_pb2
+from tensorflow.tools.api.lib import python_object_to_proto_visitor
+from tensorflow.tools.common import public_api
+from tensorflow.tools.common import traverse
+
+# FLAGS defined at the bottom:
+FLAGS = None
+# DEFINE_boolean, update_goldens, default False:
+_UPDATE_GOLDENS_HELP = """
+     Update stored golden files if API is updated. WARNING: All API changes
+     have to be authorized by TensorFlow leads.
+"""
+
+# DEFINE_boolean, verbose_diffs, default False:
+_VERBOSE_DIFFS_HELP = """
+     If set to true, print line by line diffs on all libraries. If set to
+     false, only print which libraries have differences.
+"""
+
+_API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden'
+_TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt'
+_UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
+
+
+def _KeyToFilePath(key):
+  """From a given key, construct a filepath."""
+  def _ReplaceCapsWithDash(matchobj):
+    match = matchobj.group(0)
+    return '-%s' % (match.lower())
+
+  case_insensitive_key = re.sub('([A-Z]{1})', _ReplaceCapsWithDash, key)
+  return os.path.join(_API_GOLDEN_FOLDER, '%s.pbtxt' % case_insensitive_key)
+
+
+def _FileNameToKey(filename):
+  """From a given filename, construct a key we use for api objects."""
+  def _ReplaceDashWithCaps(matchobj):
+    match = matchobj.group(0)
+    return match[1].upper()
+
+  base_filename = os.path.basename(filename)
+  base_filename_without_ext = os.path.splitext(base_filename)[0]
+  api_object_key = re.sub(
+      '((-[a-z]){1})', _ReplaceDashWithCaps, base_filename_without_ext)
+  return api_object_key
+
+
+class ApiCompatibilityTest(test.TestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(ApiCompatibilityTest, self).__init__(*args, **kwargs)
+
+    golden_update_warning_filename = os.path.join(
+        resource_loader.get_root_dir_with_all_resources(),
+        _UPDATE_WARNING_FILE)
+    self._update_golden_warning = file_io.read_file_to_string(
+        golden_update_warning_filename)
+
+    test_readme_filename = os.path.join(
+        resource_loader.get_root_dir_with_all_resources(),
+        _TEST_README_FILE)
+    self._test_readme_message = file_io.read_file_to_string(
+        test_readme_filename)
+
+  def _AssertProtoDictEquals(self,
+                             expected_dict,
+                             actual_dict,
+                             verbose=False,
+                             update_goldens=False):
+    """Diff given dicts of protobufs and report differences a readable way.
+
+    Args:
+      expected_dict: a dict of TFAPIObject protos constructed from golden
+          files.
+      actual_dict: a ict of TFAPIObject protos constructed by reading from the
+          TF package linked to the test.
+      verbose: Whether to log the full diffs, or simply report which files were
+          different.
+      update_goldens: Whether to update goldens when there are diffs found.
+    """
+    diffs = []
+    verbose_diffs = []
+
+    expected_keys = set(expected_dict.keys())
+    actual_keys = set(actual_dict.keys())
+    only_in_expected = expected_keys - actual_keys
+    only_in_actual = actual_keys - expected_keys
+    all_keys = expected_keys | actual_keys
+
+    # This will be populated below.
+    updated_keys = []
+
+    for key in all_keys:
+      diff_message = ''
+      verbose_diff_message = ''
+      # First check if the key is not found in one or the other.
+      if key in only_in_expected:
+        diff_message = 'Object %s expected but not found (removed).' % key
+        verbose_diff_message = diff_message
+      elif key in only_in_actual:
+        diff_message = 'New object %s found (added).' % key
+        verbose_diff_message = diff_message
+      else:
+        # Now we can run an actual proto diff.
+        try:
+          self.assertProtoEquals(expected_dict[key], actual_dict[key])
+        except AssertionError as e:
+          updated_keys.append(key)
+          diff_message = 'Change detected in python object: %s.' % key
+          verbose_diff_message = str(e)
+
+      # All difference cases covered above. If any difference found, add to the
+      # list.
+      if diff_message:
+        diffs.append(diff_message)
+        verbose_diffs.append(verbose_diff_message)
+
+    # If diffs are found, handle them based on flags.
+    if diffs:
+      diff_count = len(diffs)
+      logging.error(self._test_readme_message)
+      logging.error('%d differences found between API and golden.', diff_count)
+      messages = verbose_diffs if verbose else diffs
+      for i in range(diff_count):
+        logging.error('Issue %d\t: %s', i + 1, messages[i])
+
+      if update_goldens:
+        # Write files if requested.
+        logging.warning(self._update_golden_warning)
+
+        # If the keys are only in expected, some objects are deleted.
+        # Remove files.
+        for key in only_in_expected:
+          filepath = _KeyToFilePath(key)
+          file_io.delete_file(filepath)
+
+        # If the files are only in actual (current library), these are new
+        # modules. Write them to files. Also record all updates in files.
+        for key in only_in_actual | set(updated_keys):
+          filepath = _KeyToFilePath(key)
+          file_io.write_string_to_file(
+              filepath, text_format.MessageToString(actual_dict[key]))
+      else:
+        # Fail if we cannot fix the test by updating goldens.
+        self.fail('%d differences found between API and golden.' % diff_count)
+
+    else:
+      logging.info('No differences found between API and golden.')
+
+  @unittest.skipUnless(
+      sys.version_info.major == 2 and os.uname()[0] == 'Linux',
+      'API compabitility test goldens are generated using python2 on Linux.')
+  def testAPIBackwardsCompatibility(self):
+    # Extract all API stuff.
+    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
+
+    public_api_visitor = public_api.PublicAPIVisitor(visitor)
+    public_api_visitor.do_not_descend_map[''].append('contrib')
+    traverse.traverse(tf, public_api_visitor)
+
+    proto_dict = visitor.GetProtos()
+
+    # Read all golden files.
+    expression = os.path.join(
+        resource_loader.get_root_dir_with_all_resources(),
+        _KeyToFilePath('*'))
+    golden_file_list = file_io.get_matching_files(expression)
+
+    def _ReadFileToProto(filename):
+      """Read a filename, create a protobuf from its contents."""
+      ret_val = api_objects_pb2.TFAPIObject()
+      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
+      return ret_val
+
+    golden_proto_dict = {
+        _FileNameToKey(filename): _ReadFileToProto(filename)
+        for filename in golden_file_list
+    }
+
+    # Diff them. Do not fail if called with update.
+    # If the test is run to update goldens, only report diffs but do not fail.
+    self._AssertProtoDictEquals(
+        golden_proto_dict,
+        proto_dict,
+        verbose=FLAGS.verbose_diffs,
+        update_goldens=FLAGS.update_goldens)
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--update_goldens', type=bool, default=False, help=_UPDATE_GOLDENS_HELP)
+  parser.add_argument(
+      '--verbose_diffs', type=bool, default=False, help=_VERBOSE_DIFFS_HELP)
+  FLAGS, unparsed = parser.parse_known_args()
+
+  # Now update argv, so that unittest library does not get confused.
+  sys.argv = [sys.argv[0]] + unparsed
+  test.main()
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index c2e41e49187fbbb118dc05f85d3aae7e0d76efdb..8c480f8d9dbecb04a24bdf4fa763c5df5a39ef15 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -334,8 +334,8 @@ int Main(int argc, char** argv) {
       Flag("show_memory", &show_memory, "whether to list stats by memory used"),
       Flag("memory_limit", &memory_limit,
            "how many items to show by memory used"),
-      Flag("show_type", &show_time, "whether to list stats by op type"),
-      Flag("show_summary", &show_time,
+      Flag("show_type", &show_type, "whether to list stats by op type"),
+      Flag("show_summary", &show_summary,
            "whether to show a summary of the stats"),
       Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
       Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake
index 22eaf11b91869ba1e0f6f98c6c51ee269b18e8b9..9013dc012d90ae17ba57815e3cbab829239c6a4c 100644
--- a/tensorflow/tools/ci_build/Dockerfile.cmake
+++ b/tensorflow/tools/ci_build/Dockerfile.cmake
@@ -1,3 +1,17 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 FROM ubuntu:16.04
 
 MAINTAINER Shanqing Cai <cais@google.com>
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
new file mode 100644
index 0000000000000000000000000000000000000000..00aaa9f760b56447054a0f9d7eaa979848458b4d
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
@@ -0,0 +1,36 @@
+FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
+
+MAINTAINER Ilya Biryukov <ibiryukov@google.com>
+
+# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# /usr/local/cuda
+RUN cp /usr/include/cudnn.h /usr/local/cuda/include
+RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa
+
+# LLVM requires cmake version 3.4.3, but ppa:george-edison55/cmake-3.x only
+# provides version 3.2.2.
+# So we skip it in `install_deb_packages.sh`, and later install it from
+# https://cmake.org in `install_cmake_for_clang.sh`.
+RUN /install/install_deb_packages.sh --without_cmake
+RUN /install/install_pip_packages.sh
+RUN /install/install_bazel.sh
+RUN /install/install_golang.sh
+
+# Install cmake and build clang
+RUN /install/install_cmake_for_clang.sh
+RUN /install/build_and_install_clang.sh
+
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+# Configure the build for our CUDA configuration.
+ENV TF_NEED_CUDA 1
+ENV TF_CUDA_CLANG 1
+ENV CLANG_CUDA_COMPILER_PATH /usr/local/bin/clang
+ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0
diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md
index 1fa618e698fa2205dfdbe00168b4a5f3c0fe0a08..ad83669950f7b284860f84ce87855fe3e3b3e0a9 100644
--- a/tensorflow/tools/ci_build/README.md
+++ b/tensorflow/tools/ci_build/README.md
@@ -34,7 +34,11 @@ run continuous integration [ci.tensorflow.org](https://ci.tensorflow.org).
    ```bash
    tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
    ```
-
+   If you are using the Docker image on Windows or OS X, the Docker VM's default
+   memory limit may be too low to build TensorFlow. This can result in
+   strange-looking errors, e.g. the compilation may fail with `gcc: internal
+   compiler error: Killed (program cc1plus)`. Try increasing the memory limit in
+   the Docker preferences.
 
 
 ## Jobs
diff --git a/tensorflow/tools/ci_build/builds/android_full.sh b/tensorflow/tools/ci_build/builds/android_full.sh
index 407562d4babcf4fdded54a8e310aabf3869d49d4..63250e0a4da6491a19f0881151cf04a367ad5131 100755
--- a/tensorflow/tools/ci_build/builds/android_full.sh
+++ b/tensorflow/tools/ci_build/builds/android_full.sh
@@ -31,8 +31,10 @@ configure_android_workspace
 CPUS=armeabi-v7a,arm64-v8a,x86,x86_64
 
 OUT_DIR="$(pwd)/out/"
+AAR_LIB_TMP="$(pwd)/aar_libs"
 
 rm -rf ${OUT_DIR}
+rm -rf ${AAR_LIB_TMP}
 
 # Build all relevant native libraries for each architecture.
 for CPU in ${CPUS//,/ }
@@ -50,6 +52,9 @@ do
     copy_lib bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so
     copy_lib bazel-bin/tensorflow/examples/android/libtensorflow_demo.so
     copy_lib bazel-bin/tensorflow/tools/benchmark/benchmark_model
+
+    mkdir -p ${AAR_LIB_TMP}/jni/${CPU}
+    cp bazel-bin/tensorflow/contrib/android/libtensorflow_inference.so ${AAR_LIB_TMP}/jni/${CPU}
 done
 
 # Build Jar and also demo containing native libs for all architectures.
@@ -60,12 +65,25 @@ echo "========== Building TensorFlow Android Jar and Demo =========="
 bazel --bazelrc=/dev/null build -c opt --fat_apk_cpu=${CPUS} \
     --spawn_strategy=sandboxed --genrule_strategy=sandboxed \
     //tensorflow/contrib/android:android_tensorflow_inference_java \
+    //tensorflow/contrib/android:android_tensorflow_inference_java.aar \
     //tensorflow/examples/android:tensorflow_demo
 
-echo "Copying demo and Jar to ${OUT_DIR}"
+echo "Copying demo, AAR and Jar to ${OUT_DIR}"
 cp bazel-bin/tensorflow/examples/android/tensorflow_demo.apk \
     bazel-bin/tensorflow/contrib/android/libandroid_tensorflow_inference_java.jar ${OUT_DIR}
 
+cp bazel-bin/tensorflow/contrib/android/android_tensorflow_inference_java.aar \
+   ${OUT_DIR}/tensorflow.aar
+
+# TODO(andrewharp): build native libs into AAR directly once
+# https://github.com/bazelbuild/bazel/issues/348 is resolved.
+echo "Adding native libs to AAR"
+chmod +w ${OUT_DIR}/tensorflow.aar
+pushd ${AAR_LIB_TMP}
+zip -ur ${OUT_DIR}/tensorflow.aar $(find jni -name *.so)
+popd
+rm -rf ${AAR_LIB_TMP}
+
 # Test Makefile build just to make sure it still works.
 if [ -z "$NDK_ROOT" ]; then
    export NDK_ROOT=${ANDROID_NDK_HOME}
diff --git a/tensorflow/tools/ci_build/builds/configured b/tensorflow/tools/ci_build/builds/configured
index f813d6c13f5e0a40ada2a66a461340c7693b6293..25cb51ea7ccfb300d064f9a1a313bed57212832b 100755
--- a/tensorflow/tools/ci_build/builds/configured
+++ b/tensorflow/tools/ci_build/builds/configured
@@ -47,6 +47,10 @@ export CI_BUILD_PYTHON="${CI_BUILD_PYTHON:-python}"
 export PYTHON_BIN_PATH="${PYTHON_BIN_PATH:-$(which ${CI_BUILD_PYTHON})}"
 if [ "${CONTAINER_TYPE}" == "gpu" ]; then
   export TF_NEED_CUDA=1
+elif [ "${CONTAINER_TYPE}" == "gpu_clang" ]; then
+  export TF_NEED_CUDA=1
+  export TF_CUDA_CLANG=1
+  export CLANG_CUDA_COMPILER_PATH="/usr/local/bin/clang"
 else
   export TF_NEED_CUDA=0
 fi
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index 0ae31dfb5152dd152daef20b53c334049cb26334..5052d3626c9f762a31f72071446aa4da094bc396 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -34,6 +34,7 @@
 # - lib_package/libtensorflow_jni${SUFFIX}.tar.gz
 # - lib_package/libtensorflow.jar
 # - lib_package/libtensorflow-src.jar
+# - lib_package/libtensorflow_proto.zip
 #
 # ASSUMPTIONS:
 # - build_libtensorflow_tarball is invoked from the root of the git tree.
@@ -45,6 +46,10 @@ function build_libtensorflow_tarball() {
     echo "Must run this from the root of the bazel workspace"
     exit 1
   fi
+  # Delete any leftovers from previous builds in this workspace.
+  DIR=lib_package
+  rm -rf ${DIR}
+
   TARBALL_SUFFIX="${1}"
   BAZEL="bazel --bazelrc ./tensorflow/tools/ci_build/install/.bazelrc"
   BAZEL_OPTS="-c opt"
@@ -69,12 +74,13 @@ function build_libtensorflow_tarball() {
     //tensorflow/tools/lib_package:libtensorflow.tar.gz \
     //tensorflow/tools/lib_package:libtensorflow_jni.tar.gz \
     //tensorflow/java:libtensorflow.jar \
-    //tensorflow/java:libtensorflow-src.jar
+    //tensorflow/java:libtensorflow-src.jar \
+    //tensorflow/tools/lib_package:libtensorflow_proto.zip
 
-  DIR=lib_package
-  rm -rf ${DIR}
   mkdir -p ${DIR}
   cp bazel-bin/tensorflow/tools/lib_package/libtensorflow.tar.gz ${DIR}/libtensorflow${TARBALL_SUFFIX}.tar.gz
   cp bazel-bin/tensorflow/tools/lib_package/libtensorflow_jni.tar.gz ${DIR}/libtensorflow_jni${TARBALL_SUFFIX}.tar.gz
   cp bazel-bin/tensorflow/java/libtensorflow.jar bazel-bin/tensorflow/java/libtensorflow-src.jar ${DIR}
+  cp bazel-genfiles/tensorflow/tools/lib_package/libtensorflow_proto.zip ${DIR}
+  chmod -x ${DIR}/*
 }
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 751f7de9a1e3b5e929ce9d0c8e62ae2442f1e048..e0a1391d6eda174e5eebf33be60fc065a0b18281 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -269,7 +269,7 @@ pip install --upgrade pip==8.1.2
 
 # Force tensorflow reinstallation. Otherwise it may not get installed from
 # last build if it had the same version number as previous build.
-PIP_FLAGS="--upgrade --force-reinstall --no-deps"
+PIP_FLAGS="--upgrade --force-reinstall"
 pip install -v ${PIP_FLAGS} ${WHL_PATH} || \
     die "pip install (forcing to reinstall tensorflow) FAILED"
 echo "Successfully installed pip package ${WHL_PATH}"
diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
index 10bed0b786b12255a64bfa233b4dc2b5ed8dd17a..553e9652a2f9b09554a380ff2dd044669530acc9 100755
--- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh
+++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
@@ -86,6 +86,9 @@ BAZEL_TEST_TARGETS="//${PIP_TEST_PREFIX}/tensorflow/contrib/... \
   //${PIP_TEST_PREFIX}/tensorflow/python/... \
   //${PIP_TEST_PREFIX}/tensorflow/tensorboard/..."
 
+# Clean the bazel cache
+bazel clean
+
 # Run configure again, we might be using a different python path, due to
 # virtualenv.
 export TF_NEED_GCP=0
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index f0fa8a9381060a16741bae360ba5ecdbfddfef2d..3b640dd5e89b82a36be9ad7d1aead1214cdd193e 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -18,7 +18,7 @@
 #                    <COMMAND>
 #
 # CONTAINER_TYPE: Type of the docker container used the run the build:
-#                 e.g., (cpu | gpu | android | tensorboard)
+#                 e.g., (cpu | gpu | gpu_clang | android | tensorboard)
 #
 # DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build.
 #                  If this optional value is not supplied (via the
@@ -84,7 +84,7 @@ if [[ "${CONTAINER_TYPE}" == "cmake" ]]; then
 fi
 
 # Use nvidia-docker if the container is GPU.
-if [[ "${CONTAINER_TYPE}" == "gpu" ]]; then
+if [[ "${CONTAINER_TYPE}" == "gpu" ]] || [[ "${CONTAINER_TYPE}" == "gpu_clang" ]]; then
   DOCKER_BINARY="nvidia-docker"
 else
   DOCKER_BINARY="docker"
@@ -104,7 +104,7 @@ BUILD_TAG="${BUILD_TAG:-tf_ci}"
 
 # Add extra params for cuda devices and libraries for GPU container.
 # And clear them if we are not building for GPU.
-if [ "${CONTAINER_TYPE}" != "gpu" ]; then
+if [[ "${CONTAINER_TYPE}" != "gpu" ]] && [[ "${CONTAINER_TYPE}" != "gpu_clang" ]]; then
   GPU_EXTRA_PARAMS=""
 fi
 
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index cb204bc25f3d6f67c516aa53b45091578bff3cdc..dfaf50eb4f9d1136c0939525925ac33e401d5b8c 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -18,7 +18,7 @@
 #   ci_parameterized_build.sh
 #
 # The script obeys the following required environment variables:
-#   TF_BUILD_CONTAINER_TYPE:   (CPU | GPU | ANDROID | ANDROID_FULL)
+#   TF_BUILD_CONTAINER_TYPE:   (CPU | GPU | GPU_CLANG | ANDROID | ANDROID_FULL)
 #   TF_BUILD_PYTHON_VERSION:   (PYTHON2 | PYTHON3 | PYTHON3.5)
 #   TF_BUILD_IS_PIP:           (NO_PIP | PIP | BOTH)
 #
@@ -224,8 +224,13 @@ fi
 # Process container type
 if [[ ${CTYPE} == "cpu" ]] || [[ ${CTYPE} == "debian.jessie.cpu" ]]; then
   :
-elif [[ ${CTYPE} == "gpu" ]]; then
-  OPT_FLAG="${OPT_FLAG} --config=cuda"
+elif [[ ${CTYPE} == "gpu" ]] || [[ ${CTYPE} == "gpu_clang" ]]; then
+  if [[ ${CTYPE} == "gpu" ]]; then
+    OPT_FLAG="${OPT_FLAG} --config=cuda"
+  else # ${CTYPE} == "gpu_clang"
+    OPT_FLAG="${OPT_FLAG} --config=cuda_clang"
+  fi
+
 
   # Attempt to determine CUDA capability version automatically and use it if
   # CUDA capability version is not specified by the environment variables.
@@ -328,21 +333,35 @@ fi
 OPT_FLAG=$(str_strip "${OPT_FLAG}")
 
 
-# Filter out benchmark tests if this is not a benchmarks job
+# 1) Filter out benchmark tests if this is not a benchmarks job;
+# 2) Filter out tests with the "nomac" tag if the build is on Mac OS X.
 EXTRA_ARGS=""
+IS_MAC=0
+if [[ "$(uname)" == "Darwin" ]]; then
+  IS_MAC=1
+fi
 if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
   ITEMS=(${TF_BUILD_APPEND_ARGUMENTS})
 
   for ITEM in "${ITEMS[@]}"; do
-    if [[ ${ITEM} == *"--test_tag_filters="* ]] &&
-      [[ ${ITEM} != *"benchmark-test"* ]]; then
-      EXTRA_ARGS="${EXTRA_ARGS} ${ITEM},-benchmark-test"
+    if [[ ${ITEM} == *"--test_tag_filters="* ]]; then
+      NEW_ITEM="${ITEM}"
+      if [[ ${NEW_ITEM} != *"benchmark-test"* ]]; then
+        NEW_ITEM="${NEW_ITEM},-benchmark-test"
+      fi
+      if [[ ${IS_MAC} == "1" ]] && [[ ${NEW_ITEM} != *"nomac"* ]]; then
+        NEW_ITEM="${NEW_ITEM},-nomac"
+      fi
+      EXTRA_ARGS="${EXTRA_ARGS} ${NEW_ITEM}"
     else
       EXTRA_ARGS="${EXTRA_ARGS} ${ITEM}"
     fi
   done
 else
   EXTRA_ARGS="${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-benchmark-test"
+  if [[ ${IS_MAC} == "1" ]]; then
+    EXTRA_ARGS="${EXTRA_ARGS},-nomac"
+  fi
 fi
 
 # For any "tool" dependencies in genrules, Bazel will build them for host
@@ -363,7 +382,7 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] ||
     # CPU only command, fully parallel.
     NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} ${EXTRA_ARGS} -- "\
 "${BAZEL_TARGET}"
-  elif [[ ${CTYPE} == "gpu" ]]; then
+  elif [[ ${CTYPE} == "gpu" ]] || [[ ${CTYPE} == "gpu_clang" ]]; then
     # GPU only command, run as many jobs as the GPU count only.
     NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\
 "--local_test_jobs=${TF_GPU_COUNT} "\
@@ -429,7 +448,8 @@ if [[ ${TF_BUILD_PYTHON_VERSION} == "python2" ]]; then
   :
 elif [[ ${TF_BUILD_PYTHON_VERSION} == "python3" || \
         ${TF_BUILD_PYTHON_VERSION} == "python3.4" || \
-        ${TF_BUILD_PYTHON_VERSION} == "python3.5" ]]; then
+        ${TF_BUILD_PYTHON_VERSION} == "python3.5" || \
+        ${TF_BUILD_PYTHON_VERSION} == "python3.6" ]]; then
   # Supply proper environment variable to select Python 3
   if [[ "${DO_DOCKER}" == "1" ]]; then
     EXTRA_PARAMS="${EXTRA_PARAMS} -e CI_BUILD_PYTHON=${TF_BUILD_PYTHON_VERSION}"
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 9ecf16c46f12f0193e4b4c4577e1743d724551e1..fd2874df91e03853648ccca6a0d7b3520da74d55 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -92,6 +92,8 @@ do_pylint() {
   ERROR_WHITELIST="^tensorflow/python/framework/function_test\.py.*\[E1123.*noinline "\
 "^tensorflow/python/platform/default/_gfile\.py.*\[E0301.*non-iterator "\
 "^tensorflow/python/platform/default/_googletest\.py.*\[E0102.*function\salready\sdefined "\
+"^tensorflow/python/feature_column/feature_column_test\.py.*\[E0110.*abstract-class-instantiated "\
+"^tensorflow/contrib/layers/python/layers/feature_column\.py.*\[E0110.*abstract-class-instantiated "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator"
 
   echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\""
diff --git a/tensorflow/tools/ci_build/install/build_and_install_clang.sh b/tensorflow/tools/ci_build/install/build_and_install_clang.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3fb99649485ef1719c5c3b561f21a21b49844c91
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/build_and_install_clang.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -ex
+
+LLVM_SVN_REVISION="299268"
+CLANG_TMP_DIR=/tmp/clang-build
+
+mkdir "$CLANG_TMP_DIR"
+
+pushd "$CLANG_TMP_DIR"
+
+# Checkout llvm+clang
+svn co -q -r$LLVM_SVN_REVISION http://llvm.org/svn/llvm-project/llvm/trunk "$CLANG_TMP_DIR/llvm"
+svn co -q -r$LLVM_SVN_REVISION http://llvm.org/svn/llvm-project/cfe/trunk "$CLANG_TMP_DIR/llvm/tools/clang"
+
+# Build 1st stage. Compile clang with system compiler
+mkdir "$CLANG_TMP_DIR/build-1"
+cd "$CLANG_TMP_DIR/build-1"
+cmake -G"Unix Makefiles" -DCMAKE_BUILD_TYPE=Release "$CLANG_TMP_DIR/llvm"
+make -j `nproc` clang clang-headers
+
+# Build 2nd stage. Compile clang with clang built in stage 1
+mkdir "$CLANG_TMP_DIR/build-2"
+cd "$CLANG_TMP_DIR/build-2"
+
+CC="$CLANG_TMP_DIR/build-1/bin/clang" \
+CXX="$CLANG_TMP_DIR/build-1/bin/clang++" \
+cmake -G"Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local "$CLANG_TMP_DIR/llvm"
+
+make -j `nproc` install-clang install-clang-headers
+
+popd
+
+# Cleanup
+rm -rf "$CLANG_TMP_DIR"
diff --git a/tensorflow/tools/ci_build/install/install_auditwheel.sh b/tensorflow/tools/ci_build/install/install_auditwheel.sh
index 2538a393d3f6ef27e96e9ac863d2e1dba1c8f930..e6f6124d56774a43e521d6529695f5abc161dabb 100755
--- a/tensorflow/tools/ci_build/install/install_auditwheel.sh
+++ b/tensorflow/tools/ci_build/install/install_auditwheel.sh
@@ -16,7 +16,7 @@
 
 set -e
 
-sudo pip3 install auditwheel
+sudo pip3 install auditwheel==1.5.0
 
 set +e
 patchelf_location=$(which patchelf)
diff --git a/tensorflow/tools/ci_build/install/install_buildifier.sh b/tensorflow/tools/ci_build/install/install_buildifier.sh
index 5420934c6b57b80bbc7837f7de953841855ea891..b2dfcf8db7605a08ed9554784b8de5cecac86af7 100755
--- a/tensorflow/tools/ci_build/install/install_buildifier.sh
+++ b/tensorflow/tools/ci_build/install/install_buildifier.sh
@@ -17,7 +17,7 @@
 set -e
 BUILDIFIER_DIR="buildifier"
 mkdir ${BUILDIFIER_DIR}
-curl -Ls https://github.com/bazelbuild/buildifier/archive/0.4.3.tar.gz | \
+curl -Ls https://github.com/bazelbuild/buildifier/archive/0.4.5.tar.gz | \
     tar -C "${BUILDIFIER_DIR}" --strip-components=1 -xz
 pushd ${BUILDIFIER_DIR}
 
diff --git a/tensorflow/tools/ci_build/install/install_cmake_for_clang.sh b/tensorflow/tools/ci_build/install/install_cmake_for_clang.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3e626a69ab5e6b7f8d1b4997b459301606501a8e
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_cmake_for_clang.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+CMAKE_URL="https://cmake.org/files/v3.7/cmake-3.7.2-Linux-x86_64.tar.gz"
+
+wget -O - "${CMAKE_URL}" | tar xzf - -C /usr/local --strip-components=1
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index a62a6f8a3c1dee3b09b06d4f957bca578684186e..da1f2199d0daf5cfe3e9d94165e3af6704c58050 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -13,11 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+#
+# Usage:
+#     ./install_deb_packages [--without_cmake]
+# Pass --without_cmake to prevent cmake from being installed with apt-get
 
 set -e
 ubuntu_version=$(cat /etc/issue | grep -i ubuntu | awk '{print $2}' | \
   awk -F'.' '{print $1}')
 
+if [[ "$1" != "" ]] && [[ "$1" != "--without_cmake" ]]; then
+  echo "Unknown argument '$1'"
+  exit 1
+fi
+
 # Install dependencies from ubuntu deb repository.
 apt-get update
 
@@ -32,12 +41,12 @@ apt-get install -y --no-install-recommends \
     autoconf \
     automake \
     build-essential \
-    cmake \
     curl \
     ffmpeg \
     git \
     libcurl4-openssl-dev \
     libtool \
+    mlocate \
     openjdk-8-jdk \
     openjdk-8-jre-headless \
     pkg-config \
@@ -48,12 +57,22 @@ apt-get install -y --no-install-recommends \
     python3-setuptools \
     rsync \
     sudo \
+    subversion \
     swig \
     unzip \
     wget \
     zip \
     zlib1g-dev
 
+# populate the database
+updatedb
+
+if [[ "$1" != "--without_cmake" ]]; then
+  apt-get install -y --no-install-recommends \
+    cmake
+fi
+
+
 # Install ca-certificates, and update the certificate store.
 apt-get install -y ca-certificates-java
 update-ca-certificates -f
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 5ebd69bd3dac37b57de038d93f040d1e444b861d..b8f9fc845394c34d47fa088da592f0f4bf9f155d 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -73,8 +73,8 @@ pip2 install py-cpuinfo
 pip3 install py-cpuinfo
 
 # pylint tests require the following:
-pip2 install pylint
-pip3 install pylint
+pip2 install pylint==1.6.4
+pip3 install pylint==1.6.4
 
 # pep8 tests require the following:
 pip2 install pep8
diff --git a/tensorflow/tools/ci_build/linux/cmake/run.sh b/tensorflow/tools/ci_build/linux/cmake/run.sh
old mode 100644
new mode 100755
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index acde10f1240aabe541986f3c7592657d5a73ad5b..a03cab0cca5c375e668a2adeae64c48ac2b217a0 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -32,7 +32,7 @@ export PYTHON_BIN_PATH=`which python3`
 yes "" | ./configure
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --test_tag_filters=-gpu,-benchmark-test --test_lang_filters=py -k \
-    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
+bazel test --test_tag_filters=-gpu,-benchmark-test -k \
+    --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
     --test_output=errors -- \
     //tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e5f4a22f7ade7eb5c260a7a486cd5d3fa75d5859
--- /dev/null
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_JOBS=$(sysctl -n hw.ncpu)
+N_JOBS=$((N_JOBS+1))
+
+echo ""
+echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo ""
+
+# Run configure.
+export TF_NEED_GCP=0
+export TF_NEED_HDFS=0
+export TF_NEED_CUDA=0
+export PYTHON_BIN_PATH=$(which python2)
+yes "" | ./configure
+which bazel
+bazel test --test_tag_filters=-gpu,-benchmark-test,-nomac \
+    --test_timeout 300,450,1200,3600 \
+    --test_size_filters=small,medium \
+    --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
+    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... \
+    -//tensorflow/tensorboard/...
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
index 762c53172588246acf0350125e06799bd960269c..d90a1b905d91415dda576c5dc71df2f41502fa9d 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
@@ -28,6 +28,7 @@ export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
 export TF_NEED_OPENCL=0
+export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
 
 export PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
index 1da5e8c2bf34cb8ae713b4bcde22f629ba47f877..79973647c11fffb1907b7f39fe5f43a3fb450b5b 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
@@ -29,6 +29,7 @@ export PYTHON_BIN_PATH="/usr/bin/python"
 export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_OPENCL=0
+export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
 
 export PATH="/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
diff --git a/tensorflow/tools/ci_build/pylintrc b/tensorflow/tools/ci_build/pylintrc
index 0779ed91bc3b5d969e3d9b2c6d19cc72df9e6c75..e71017e621ccc8b42cdf8d4e4bd27a81791bbe4c 100644
--- a/tensorflow/tools/ci_build/pylintrc
+++ b/tensorflow/tools/ci_build/pylintrc
@@ -38,7 +38,7 @@ enable=indexing-exception,old-raise-syntax
 # --enable=similarities". If you want to run only the classes checker, but have
 # no Warning level messages displayed, use"--disable=all --enable=classes
 # --disable=W"
-disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable
+disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager
 
 
 # Set the cache size for astng objects.
@@ -322,4 +322,4 @@ indent-after-paren=4
 [GOOGLE LINES]
 
 # Regexp for a proper copyright notice.
-copyright=Copyright \d{4} The TensorFlow Authors\. +All [Rr]ights [Rr]eserved\.
\ No newline at end of file
+copyright=Copyright \d{4} The TensorFlow Authors\. +All [Rr]ights [Rr]eserved\.
diff --git a/tensorflow/tools/ci_build/update_version.sh b/tensorflow/tools/ci_build/update_version.sh
index cde0ab79093f75bc23843c97305d8fb3e10a28f0..682f5329f58fffa5f2030c7e33db14bd3e343165 100755
--- a/tensorflow/tools/ci_build/update_version.sh
+++ b/tensorflow/tools/ci_build/update_version.sh
@@ -61,7 +61,9 @@ fi
 MAJOR=$(echo "${NEW_VER}" | cut -d \. -f 1)
 MINOR=$(echo "${NEW_VER}" | cut -d \. -f 2)
 PATCH=$(echo "${NEW_VER}" | cut -d \. -f 3)
+PATCH_NUM=$(echo "$PATCH" | cut -d \- -f 1)
 PIP_PATCH="${PATCH//-}"
+SUFFIX=$(echo $NEW_VER | sed "s/${MAJOR}.${MINOR}.${PATCH%-*}//g")
 
 # Update tensorflow/core/public/version.h
 VERSION_H="${TF_SRC_DIR}/core/public/version.h"
@@ -71,13 +73,17 @@ OLD_MAJOR=$(cat ${VERSION_H} | grep -E "^#define TF_MAJOR_VERSION [0-9]+" | \
 cut -d ' ' -f 3)
 OLD_MINOR=$(cat ${VERSION_H} | grep -E "^#define TF_MINOR_VERSION [0-9]+" | \
 cut -d ' ' -f 3)
-OLD_PATCH=$(cat ${VERSION_H} | grep -E "^#define TF_PATCH_VERSION [[:alnum:]-]+" | \
+OLD_PATCH_NUM=$(cat ${VERSION_H} | grep -E "^#define TF_PATCH_VERSION [[:alnum:]-]+" | \
 cut -d ' ' -f 3)
+OLD_EXTENSION=$(cat ${VERSION_H} | grep -E "^#define TF_VERSION_SUFFIX \"[[:alnum:]-]+\"" | \
+cut -d ' ' -f 3)
+OLD_PATCH="$OLD_PATCH_NUM${OLD_EXTENSION//\"}"
+OLD_PIP_PATCH="${OLD_PATCH//-}"
 
 sed -i -e "s/^#define TF_MAJOR_VERSION ${OLD_MAJOR}/#define TF_MAJOR_VERSION ${MAJOR}/g" ${VERSION_H}
 sed -i -e "s/^#define TF_MINOR_VERSION ${OLD_MINOR}/#define TF_MINOR_VERSION ${MINOR}/g" ${VERSION_H}
-sed -i -e "s/^#define TF_PATCH_VERSION ${OLD_PATCH}/#define TF_PATCH_VERSION ${PATCH}/g" "${VERSION_H}"
-
+sed -i -e "s/^#define TF_PATCH_VERSION ${OLD_PATCH}/#define TF_PATCH_VERSION ${PATCH_NUM}/g" "${VERSION_H}"
+sed -i -e "s/^#define TF_VERSION_SUFFIX \".*\"/#define TF_VERSION_SUFFIX \"${SUFFIX}\"/g" "${VERSION_H}"
 
 # Update setup.py
 SETUP_PY="${TF_SRC_DIR}/tools/pip_package/setup.py"
@@ -92,6 +98,26 @@ check_existence file "${README_MD}"
 
 sed -i -r -e "s/${OLD_MAJOR}\.${OLD_MINOR}\.([[:alnum:]]+)-/${MAJOR}.${MINOR}.${PIP_PATCH}-/g" "${README_MD}"
 
+# Update the install md files
+NEW_PIP_TAG=$MAJOR.$MINOR.$PIP_PATCH
+OLD_PIP_TAG=$OLD_MAJOR.$OLD_MINOR.$OLD_PIP_PATCH
+
+for file in ${TF_SRC_DIR}/docs_src/install/install_{linux,mac,windows,sources}.md
+do
+  sed -i "s/tensorflow-${OLD_PIP_TAG}/tensorflow-${NEW_PIP_TAG}/g" $file
+  sed -i "s/tensorflow_gpu-${OLD_PIP_TAG}/tensorflow_gpu-${NEW_PIP_TAG}/g" $file
+  sed -i "s/TensorFlow ${OLD_PIP_TAG}/TensorFlow ${NEW_PIP_TAG}/g" $file
+done
+
+NEW_TAG=$MAJOR.$MINOR.$PATCH
+OLD_TAG=$OLD_MAJOR.$OLD_MINOR.$OLD_PATCH
+
+for file in ${TF_SRC_DIR}/docs_src/install/install_{java,go,c}.md
+do
+  sed -i "s/x86_64-${OLD_TAG}/x86_64-${NEW_TAG}/g" $file
+  sed -i "s/libtensorflow-${OLD_TAG}.jar/libtensorflow-${NEW_TAG}.jar/g" $file
+  sed -i "s/<version>${OLD_TAG}<\/version>/<version>${NEW_TAG}<\/version>/g" $file
+done
 
 # Updates to be made if there are major / minor version changes
 MAJOR_MINOR_CHANGE=0
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 1488e8d78c85053e49accac4cde3f72691f361f6..21b6fa2bb996954354fdce89f99c899f9b5177be 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -97,7 +97,7 @@ exclude_cpu_cc_tests="${failing_cpu_cc_tests} + ${broken_cpu_cc_tests}"
 exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}"
 
 # Python tests
-# The first argument is the name of the python test direcotry
+# The first argument is the name of the python test directory
 function get_failing_cpu_py_tests() {
     echo "
     //$1/tensorflow/python:basic_session_run_hooks_test + \
@@ -175,6 +175,13 @@ function run_configure_for_cpu_build {
   if [ -z "$CC_OPT_FLAGS" ]; then
     export CC_OPT_FLAGS="-march=native"
   fi
+  if [ -z "$TF_NEED_MKL" ]; then
+    export TF_NEED_MKL=0
+  fi
+  export TF_NEED_VERBS=0
+  export TF_NEED_GCP=0
+  export TF_NEED_HDFS=0
+  export TF_NEED_OPENCL=0
   echo "" | ./configure
 }
 
@@ -194,6 +201,11 @@ function run_configure_for_gpu_build {
   if [ -z "$CC_OPT_FLAGS" ]; then
     export CC_OPT_FLAGS="-march=native"
   fi
+  export TF_NEED_VERBS=0
+  export TF_NEED_MKL=0
+  export TF_NEED_GCP=0
+  export TF_NEED_HDFS=0
+  export TF_NEED_OPENCL=0
   echo "" | ./configure
 }
 
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index b9937475219f4dd334121fde13b3be21b50bac3e..47274d8c723efcd55ee6a9382b998f7c5ffd33db 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -30,10 +30,11 @@ export TMPDIR="C:/tmp"
 mkdir -p "$TMPDIR"
 
 # Set bash path
-export BAZEL_SH="C:/tools/msys64/usr/bin/bash"
+export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"}
 
 # Set Python path for ./configure
 export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python"
+export PYTHON_LIB_PATH="C:/Program Files/Anaconda3/lib/site-packages"
 
 # Set Python path for cc_configure.bzl
 export BAZEL_PYTHON="C:/Program Files/Anaconda3/python"
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
index 3ebc591f4802f9090bb13fd12b5b53b94d7b6a6b..07ad70dd344bd1b8a48f815d0400dd5e4f6b73c2 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
@@ -37,5 +37,4 @@ SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 %CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS%
 
 :: Run msbuild in the resulting VS project files to build a pip package.
-%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj  > msbuild.log 2>&1
-if %errorlevel% neq 0 cat msbuild.log else echo "Successfully build pip package."
+%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index b428bebc6f60b5ecceb3c347ba96d0389a89b59f..9ac3613f27e1bc96501490b7610f047785b9ada2 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -31,15 +31,6 @@ if [ ! -e "WORKSPACE" ]; then
   exit 1
 fi
 
-#### BEGIN HACKS TO BE RESOLVED WITH NEWER BAZEL VERSIONS ####
-# Disable nccl.
-# This can be removed once we switch to a bazel release that includes
-# https://github.com/bazelbuild/bazel/commit/8e0991cb19eadfcb651cd6987255d5f7c0a58e0a
-# (the fix for https://github.com/bazelbuild/bazel/issues/2494).
-# Most likley bazel 0.4.5 will contain that.
-sed -i -e "s/\"@nccl_archive/#\"@nccl_archive/"  ./tensorflow/contrib/nccl/BUILD
-sed -i -e "s/\"@nccl_archive/#\"@nccl_archive/"  ./tensorflow/tools/pip_package/BUILD
-
 # Enable JNI support for Windows in Bazel.
 # This can be removed once
 # https://github.com/bazelbuild/bazel/pull/2599
@@ -66,7 +57,7 @@ bazel build -c opt ${BUILD_OPTS} \
   tensorflow/tools/lib_package:jnilicenses_generate
 
 # Revert the hacks above
-git checkout ./tensorflow/contrib/nccl/BUILD ./tensorflow/tools/pip_package/BUILD
+git checkout ./tensorflow/tools/pip_package/BUILD
 git checkout ./tensorflow/java/src/main/native/BUILD
 rm -f ./tensorflow/java/src/main/native/windows_jni_md.h
 
@@ -90,6 +81,6 @@ cp bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE ${DI
 cd ${DIR}
 zip -j libtensorflow-cpu-windows-$(uname -m).zip \
   lib/tensorflow.dll \
-  include/c/c_api.h \
-  include/c/LICENSE
+  include/tensorflow/c/c_api.h \
+  include/tensorflow/c/LICENSE
 rm -rf lib include
diff --git a/tensorflow/tools/common/public_api.py b/tensorflow/tools/common/public_api.py
index 3364ff6bc9aa68ba1922e0edbbb499a2e855822e..cab3b2ff6a0d39938d915c53aff657e43c065c99 100644
--- a/tensorflow/tools/common/public_api.py
+++ b/tensorflow/tools/common/public_api.py
@@ -18,9 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import re
 
+from tensorflow.python.util import tf_inspect
+
 
 class PublicAPIVisitor(object):
   """Visitor to use with `traverse` to visit exactly the public TF API."""
@@ -36,28 +37,35 @@ class PublicAPIVisitor(object):
     """
     self._visitor = visitor
 
-  # Modules/classes we do not want to descend into if we hit them. Usually,
-  # sytem modules exposed through platforms for compatibility reasons.
-  # Each entry maps a module path to a name to ignore in traversal.
-  _do_not_descend_map = {
-      '': [
-          'core',
-          'examples',
-          'flags',  # Don't add flags
-          'platform',  # TODO(drpng): This can be removed once sealed off.
-          'pywrap_tensorflow',  # TODO(drpng): This can be removed once sealed.
-          'user_ops',  # TODO(drpng): This can be removed once sealed.
-          'python',
-          'tools'
-      ],
-
-      # Some implementations have this internal module that we shouldn't expose.
-      'flags': ['cpp_flags'],
-
-      # Everything below here is legitimate.
-      'app': ['flags'],  # It'll stay, but it's not officially part of the API.
-      'test': ['mock'],  # Imported for compatibility between py2/3.
-  }
+    # Modules/classes we do not want to descend into if we hit them. Usually,
+    # system modules exposed through platforms for compatibility reasons.
+    # Each entry maps a module path to a name to ignore in traversal.
+    self._do_not_descend_map = {
+        '': [
+            'core',
+            'examples',
+            'flags',  # Don't add flags
+            # TODO(drpng): This can be removed once sealed off.
+            'platform',
+            # TODO(drpng): This can be removed once sealed.
+            'pywrap_tensorflow',
+            # TODO(drpng): This can be removed once sealed.
+            'user_ops',
+            'python',
+            'tools',
+            'tensorboard',
+        ],
+
+        # Some implementations have this internal module that we shouldn't
+        # expose.
+        'flags': ['cpp_flags'],
+
+        ## Everything below here is legitimate.
+        # It'll stay, but it's not officially part of the API.
+        'app': ['flags'],
+        # Imported for compatibility between py2/3.
+        'test': ['mock'],
+    }
 
   @property
   def do_not_descend_map(self):
@@ -86,7 +94,7 @@ class PublicAPIVisitor(object):
     """Visitor interface, see `traverse` for details."""
 
     # Avoid long waits in cases of pretty unambiguous failure.
-    if inspect.ismodule(parent) and len(path.split('.')) > 10:
+    if tf_inspect.ismodule(parent) and len(path.split('.')) > 10:
       raise RuntimeError('Modules nested too deep:\n%s\n\nThis is likely a '
                          'problem with an accidental public import.' % path)
 
diff --git a/tensorflow/tools/common/traverse.py b/tensorflow/tools/common/traverse.py
index 443838d96821d8133311d243aa89bc67bb9b39a8..9607f80686df5c9d9d23f32ffd8ae9b550356736 100644
--- a/tensorflow/tools/common/traverse.py
+++ b/tensorflow/tools/common/traverse.py
@@ -18,9 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import sys
 
+from tensorflow.python.util import tf_inspect
 
 __all__ = ['traverse']
 
@@ -29,11 +29,11 @@ def _traverse_internal(root, visit, stack, path):
   """Internal helper for traverse."""
 
   # Only traverse modules and classes
-  if not inspect.isclass(root) and not inspect.ismodule(root):
+  if not tf_inspect.isclass(root) and not tf_inspect.ismodule(root):
     return
 
   try:
-    children = inspect.getmembers(root)
+    children = tf_inspect.getmembers(root)
   except ImportError:
     # On some Python installations, some modules do not support enumerating
     # members (six in particular), leading to import errors.
@@ -43,7 +43,8 @@ def _traverse_internal(root, visit, stack, path):
   visit(path, root, children)
   for name, child in children:
     # Do not descend into built-in modules
-    if inspect.ismodule(child) and child.__name__ in sys.builtin_module_names:
+    if tf_inspect.ismodule(
+        child) and child.__name__ in sys.builtin_module_names:
       continue
 
     # Break cycles
@@ -72,8 +73,8 @@ def traverse(root, visit):
   never descends into built-in modules.
 
   `children`, a list of `(name, object)` pairs are determined by
-  `inspect.getmembers`. To avoid visiting parts of the tree, `children` can be
-  modified in place, using `del` or slice assignment.
+  `tf_inspect.getmembers`. To avoid visiting parts of the tree, `children` can
+  be modified in place, using `del` or slice assignment.
 
   Cycles (determined by reference equality, `is`) stop the traversal. A stack of
   objects is kept to find cycles. Objects forming cycles may appear in
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 80439f835a6783630532ff9b587d5341923604b2..9a4a8ff71d9c6e4cf8a3325c5d49cbb0066cfa8d 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -34,6 +34,10 @@ class APIChangeSpec(object):
     # Maps from a function name to a dictionary that describes how to
     # map from an old argument keyword to the new argument keyword.
     self.function_keyword_renames = {
+        "tf.batch_matmul": {
+            "adj_x": "adjoint_a",
+            "adj_y": "adjoint_b",
+        },
         "tf.count_nonzero": {
             "reduction_indices": "axis"
         },
diff --git a/tensorflow/tools/dist_test/python/census_widendeep.py b/tensorflow/tools/dist_test/python/census_widendeep.py
index db56a687f6bce8bfcd979b89ebe48b425744d48d..3a557814960498cb397781232154958872234e49 100644
--- a/tensorflow/tools/dist_test/python/census_widendeep.py
+++ b/tensorflow/tools/dist_test/python/census_widendeep.py
@@ -133,7 +133,7 @@ class CensusDataSource(object):
       columns: Columns to retrieve from the data files (A list of strings)
       label_column: Name of the label column
       categorical_columns: Names of the categorical columns (A list of strings)
-      continuous_columns: Names of the continuous columsn (A list of strings)
+      continuous_columns: Names of the continuous columns (A list of strings)
     """
 
     # Retrieve data from disk (if available) or download from the web.
diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD
index 9d008ec9ce5969b2cb0f61c958a606c04dda576d..865af8dd7b2af686dad852f35187f2d226533596 100644
--- a/tensorflow/tools/dist_test/server/BUILD
+++ b/tensorflow/tools/dist_test/server/BUILD
@@ -9,7 +9,7 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-py_library(
+py_binary(
     name = "grpc_tensorflow_server",
     srcs = [
         "grpc_tensorflow_server.py",
diff --git a/tensorflow/tools/dist_test/server/Dockerfile.test b/tensorflow/tools/dist_test/server/Dockerfile.test
index 3cd3d5206db4edf28e17a5e9896ab7eb942288e4..908af8af9bb0cc67ae21aec72b42faab970093cc 100644
--- a/tensorflow/tools/dist_test/server/Dockerfile.test
+++ b/tensorflow/tools/dist_test/server/Dockerfile.test
@@ -52,13 +52,13 @@ ADD . /var/tf-k8s
 # Download MNIST data for tests
 RUN mkdir -p /tmp/mnist-data
 RUN curl -o /tmp/mnist-data/train-labels-idx1-ubyte.gz \
-    http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
+    https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz
 RUN curl -o /tmp/mnist-data/train-images-idx3-ubyte.gz \
-    http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
+    https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz
 RUN curl -o /tmp/mnist-data/t10k-labels-idx1-ubyte.gz \
-    http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
+    https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz
 RUN curl -o /tmp/mnist-data/t10k-images-idx3-ubyte.gz \
-    http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
+    https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz
 
 # Download Census data for Wide & Deep test
 RUN mkdir -p /tmp/census-data
diff --git a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py
old mode 100755
new mode 100644
index 2d774577b6d93ef7712d3595ab6592a5a701b14d..bd6700a0b1f43208b317e14953c1110cbe39248b
--- a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py
+++ b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py
@@ -36,6 +36,7 @@ from __future__ import print_function
 import argparse
 import sys
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python.platform import app
 from tensorflow.python.training import server_lib
@@ -103,8 +104,11 @@ def main(unused_args):
     raise ValueError("Invalid task_id: %d" % FLAGS.task_id)
   server_def.task_index = FLAGS.task_id
 
+  config = config_pb2.ConfigProto(gpu_options=config_pb2.GPUOptions(
+      per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction))
+
   # Create GRPC Server instance
-  server = server_lib.Server(server_def)
+  server = server_lib.Server(server_def, config=config)
 
   # join() is blocking, unlike start()
   server.join()
@@ -137,6 +141,11 @@ if __name__ == "__main__":
       default=0,
       help="Task index, e.g., 0"
   )
+  parser.add_argument(
+      "--gpu_memory_fraction",
+      type=float,
+      default=1.0,
+      help="Fraction of GPU memory allocated",)
   parser.add_argument(
       "--verbose",
       type="bool",
@@ -145,5 +154,6 @@ if __name__ == "__main__":
       default=False,
       help="Verbose mode"
   )
+
   FLAGS, unparsed = parser.parse_known_args()
   app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index b61eb5db37001b5f515c52d7cd2508a02e35cfb7..5b3f1f936a48bb448b712152c57c095226efea8e 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -66,4 +66,4 @@ EXPOSE 8888
 
 WORKDIR "/notebooks"
 
-CMD ["/run_jupyter.sh"]
+CMD ["/run_jupyter.sh", "--allow-root"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 7bf7fd5719b1f5806bec2f4c1bb4a3a4e50cfc60..c801ceff9387b1a896a979dd71292816392d0534 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         unzip \
         zip \
         zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -46,18 +48,6 @@ COPY run_jupyter.sh /
 
 # Set up Bazel.
 
-# We need to add a custom PPA to pick up JDK8, since trusty doesn't
-# have an openjdk8 backport.  openjdk-r is maintained by a reliable contributor:
-# Matthias Klose (https://launchpad.net/~doko).  It will do until
-# we either update the base image beyond 14.04 or openjdk-8 is
-# finally backported to trusty; see e.g.
-#   https://bugs.launchpad.net/trusty-backports/+bug/1368094
-RUN add-apt-repository -y ppa:openjdk-r/ppa && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre-headless && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
 # Running bazel inside a `docker build` command causes trouble, cf:
 #   https://github.com/bazelbuild/bazel/issues/134
 # The easiest solution is to set up a bazelrc file forcing --batch.
@@ -71,8 +61,8 @@ ENV BAZEL_VERSION 0.4.5
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
-    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    curl -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE.txt && \
+    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
+    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
     chmod +x bazel-*.sh && \
     ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
     cd / && \
@@ -92,7 +82,8 @@ WORKDIR /tensorflow
 ENV CI_BUILD_PYTHON python
 
 RUN tensorflow/tools/ci_build/builds/configured CPU \
-    bazel build -c opt tensorflow/tools/pip_package:build_pip_package && \
+    bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+        tensorflow/tools/pip_package:build_pip_package && \
     bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
     pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
     rm -rf /tmp/pip && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 769731974a2f6155dd0780463eed4bc219dd52ab..24350c507e7f9fb00954293989e70557ea02a192 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         unzip \
         zip \
         zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -46,18 +48,6 @@ COPY run_jupyter.sh /
 
 # Set up Bazel.
 
-# We need to add a custom PPA to pick up JDK8, since trusty doesn't
-# have an openjdk8 backport.  openjdk-r is maintained by a reliable contributor:
-# Matthias Klose (https://launchpad.net/~doko).  It will do until
-# we either update the base image beyond 14.04 or openjdk-8 is
-# finally backported to trusty; see e.g.
-#   https://bugs.launchpad.net/trusty-backports/+bug/1368094
-RUN add-apt-repository -y ppa:openjdk-r/ppa && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre-headless && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
 # Running bazel inside a `docker build` command causes trouble, cf:
 #   https://github.com/bazelbuild/bazel/issues/134
 # The easiest solution is to set up a bazelrc file forcing --batch.
@@ -71,8 +61,8 @@ ENV BAZEL_VERSION 0.4.5
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
-    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    curl -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE.txt && \
+    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
+    curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
     chmod +x bazel-*.sh && \
     ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
     cd / && \
@@ -92,7 +82,8 @@ ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1
 
 RUN tensorflow/tools/ci_build/builds/configured GPU \
-    bazel build -c opt --config=cuda tensorflow/tools/pip_package:build_pip_package && \
+    bazel build -c opt --config=cuda --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+        tensorflow/tools/pip_package:build_pip_package && \
     bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
     pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
     rm -rf /tmp/pip && \
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index b2b39501cdd0bacfe90eddf663ba763880a9ee91..88876421f5405db3af7008add2f8e9f0ef893125 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -69,4 +69,4 @@ EXPOSE 8888
 
 WORKDIR "/notebooks"
 
-CMD ["/run_jupyter.sh"]
+CMD ["/run_jupyter.sh", "--allow-root"]
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index b35b14df1fd03272e0a068d68a29b8af0afe9f0e..c9f2b1ab9ef832912b053a42c36726570864f320 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -134,7 +134,7 @@
     "import os\n",
     "from six.moves.urllib.request import urlretrieve\n",
     "\n",
-    "SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'\n",
+    "SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'\n",
     "WORK_DIRECTORY = \"/tmp/mnist-data\"\n",
     "\n",
     "def maybe_download(filename):\n",
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 886266caaf8c0b6851c6054583f840b4c362ae31..f88af68cdebf3921cc1e326034c4ad6279bba4d2 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -64,7 +64,7 @@
 #
 #   TF_DOCKER_BUILD_OPTIONS
 #     (Optional)
-#     Specifices the desired build options. Defaults to OPT.
+#     Specifies the desired build options. Defaults to OPT.
 
 # Script directory
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index ed2626efabb63939d4f57fafe28b296ee0f8aee6..8e27b133c2fa33a8f6366b0f94a596cf1ca7c1a2 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -11,13 +11,6 @@ package(
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-py_binary(
-    name = "gen_cc_md",
-    srcs = ["gen_cc_md.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
 py_library(
     name = "doc_generator_visitor",
     srcs = [
@@ -106,6 +99,20 @@ py_binary(
     ],
 )
 
+py_test(
+    name = "build_docs_test",
+    size = "small",
+    srcs = ["build_docs_test.py"],
+    data = ["//tensorflow:docs_src"],
+    srcs_version = "PY2AND3",
+    tags = ["manual"],
+    deps = [
+        ":generate_lib",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python/debug:debug_py",
+    ],
+)
+
 py_binary(
     name = "generate_1_0",
     srcs = ["generate_1_0.py"],
@@ -134,38 +141,6 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "doxy_config",
-    srcs = ["tf-doxy_for_md-config"],
-)
-
-sh_binary(
-    name = "gen_docs",
-    srcs = ["gen_docs.sh"],
-    data = [
-        ":doxy_config",
-        ":gen_cc_md",
-        "//tensorflow/python:gen_docs_combined",
-    ],
-)
-
-sh_test(
-    name = "gen_docs_test",
-    size = "small",
-    srcs = [
-        "gen_docs_test.sh",
-    ],
-    data = [
-        ":gen_docs",
-        "//tensorflow/core:all_files",
-        "//tensorflow/python:all_files",
-    ],
-    tags = [
-        "manual",
-        "notap",
-    ],
-)
-
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d28dd93b9a8d5eb19af414622c1d1b22516f9c1c
--- /dev/null
+++ b/tensorflow/tools/docs/build_docs_test.py
@@ -0,0 +1,51 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the python doc generator and fail if there are any broken links."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+from tensorflow.python import debug as tf_debug
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import resource_loader
+from tensorflow.tools.docs import generate_lib
+
+
+class Flags(object):
+  resource_root = resource_loader.get_root_dir_with_all_resources()
+  src_dir = os.path.join(resource_root, 'third_party/tensorflow/docs_src')
+  base_dir = os.path.join(resource_root, 'third_party/tensorflow/')
+  output_dir = googletest.GetTempDir()
+
+
+class BuildDocsTest(googletest.TestCase):
+
+  def testBuildDocs(self):
+    doc_generator = generate_lib.DocGenerator()
+
+    doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
+
+    status = doc_generator.build(Flags())
+
+    if status:
+      self.fail('Found %s Errors!' % status)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/tools/docs/doc_generator_visitor.py b/tensorflow/tools/docs/doc_generator_visitor.py
index 178ac0940e735b3397fa4bca870077096da218e1..8f7b91fa752f9e594176a6fcb02da1fc8f9bc103 100644
--- a/tensorflow/tools/docs/doc_generator_visitor.py
+++ b/tensorflow/tools/docs/doc_generator_visitor.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
-
 import six
 
+from tensorflow.python.util import tf_inspect
+
 
 class DocGeneratorVisitor(object):
   """A visitor that generates docs for a python object when __call__ed."""
@@ -133,8 +133,8 @@ class DocGeneratorVisitor(object):
       parent_name: The fully qualified name of a symbol found during traversal.
       parent: The Python object referenced by `parent_name`.
       children: A list of `(name, py_object)` pairs enumerating, in alphabetical
-        order, the children (as determined by `inspect.getmembers`) of `parent`.
-        `name` is the local name of `py_object` in `parent`.
+        order, the children (as determined by `tf_inspect.getmembers`) of
+          `parent`. `name` is the local name of `py_object` in `parent`.
 
     Raises:
       RuntimeError: If this visitor is called with a `parent` that is not a
@@ -144,9 +144,9 @@ class DocGeneratorVisitor(object):
     self._index[parent_name] = parent
     self._tree[parent_name] = []
 
-    if not (inspect.ismodule(parent) or inspect.isclass(parent)):
-      raise RuntimeError('Unexpected type in visitor -- %s: %r' %
-                         (parent_name, parent))
+    if not (tf_inspect.ismodule(parent) or tf_inspect.isclass(parent)):
+      raise RuntimeError('Unexpected type in visitor -- %s: %r' % (parent_name,
+                                                                   parent))
 
     for i, (name, child) in enumerate(list(children)):
       # Don't document __metaclass__
@@ -170,7 +170,7 @@ class DocGeneratorVisitor(object):
     master names to a lexicographically sorted list of all aliases for that name
     (incl. the master name).
 
-    All these are computed and set as fields if they haven't aready.
+    All these are computed and set as fields if they haven't already.
     """
     if self._reverse_index is not None:
       return
@@ -190,9 +190,8 @@ class DocGeneratorVisitor(object):
       # have no usable docstring and won't be documented automatically.
       if (py_object is not None and
           not isinstance(py_object, six.integer_types + six.string_types +
-                         (six.binary_type, six.text_type, float, complex, bool)
-                        ) and
-          py_object is not ()):
+                         (six.binary_type, six.text_type, float, complex, bool))
+          and py_object is not ()):
         object_id = id(py_object)
         if object_id in reverse_index:
           master_name = reverse_index[object_id]
diff --git a/tensorflow/tools/docs/gen_cc_md.py b/tensorflow/tools/docs/gen_cc_md.py
deleted file mode 100644
index 931df3230b4e8b58174f1a65f2517cb8e16b7d49..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docs/gen_cc_md.py
+++ /dev/null
@@ -1,314 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Convert Doxygen .xml files to MarkDown (.md files)."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import os
-import re
-
-from BeautifulSoup import BeautifulStoneSoup
-import tensorflow as tf
-
-ANCHOR_RE = re.compile(r'\W+')
-
-PAGE_TEMPLATE = '''# `{0} {1}`
-
-{2}
-
-###Member Details
-
-{3}'''
-
-INDEX_TEMPLATE = '''# TensorFlow C++ Session API reference documentation
-
-TensorFlow's public C++ API includes only the API for executing graphs, as of
-version 0.5. To control the execution of a graph from C++:
-
-1. Build the computation graph using the [Python API](../python/).
-1. Use [`tf.train.write_graph()`](../python/train.md#write_graph) to
-write the graph to a file.
-1. Load the graph using the C++ Session API. For example:
-
-  ```c++
-  // Reads a model graph definition from disk, and creates a session object you
-  // can use to run it.
-  Status LoadGraph(string graph_file_name, Session** session) {
-    GraphDef graph_def;
-    TF_RETURN_IF_ERROR(
-        ReadBinaryProto(Env::Default(), graph_file_name, &graph_def));
-    TF_RETURN_IF_ERROR(NewSession(SessionOptions(), session));
-    TF_RETURN_IF_ERROR((*session)->Create(graph_def));
-    return Status::OK();
-  }
-```
-
-1. Run the graph with a call to `session->Run()`
-
-## Env
-
-@@Env
-@@RandomAccessFile
-@@WritableFile
-@@EnvWrapper
-
-## Session
-
-@@Session
-@@SessionOptions
-
-## Status
-
-@@Status
-@@Status::State
-
-## Tensor
-
-@@Tensor
-@@TensorShape
-@@TensorShapeDim
-@@TensorShapeUtils
-@@PartialTensorShape
-@@PartialTensorShapeUtils
-
-## Thread
-
-@@Thread
-@@ThreadOptions
-'''
-
-FLAGS = None
-
-
-def member_definition(member_elt):
-  def_text = ''
-
-  def_elt = member_elt.find('definition')
-  if def_elt:
-    def_text = def_elt.text
-
-  return def_text
-
-
-def member_sig(member_elt):
-  def_text = member_definition(member_elt)
-
-  argstring_text = ''
-  argstring = member_elt.find('argsstring')
-  if argstring:
-    argstring_text = argstring.text
-
-  sig = def_text + argstring_text
-  return sig
-
-
-def anchorize(name):
-  return ANCHOR_RE.sub('_', name)
-
-
-def element_text(member_elt, elt_name):
-  """Extract all `para` text from (`elt_name` in) `member_elt`."""
-  text = []
-  if elt_name:
-    elt = member_elt.find(elt_name)
-  else:
-    elt = member_elt
-
-  if elt:
-    paras = elt.findAll('para')
-    for p in paras:
-      text.append(p.getText(separator=u' ').strip())
-  return '\n\n'.join(text)
-
-
-def full_member_entry(member_elt):
-  """Generate the description of `member_elt` for "Member Details"."""
-  anchor = '{#' + anchorize(member_definition(member_elt)) + '}'
-  full_entry = '#### `%s` %s' % (member_sig(member_elt), anchor)
-
-  complete_descr = element_text(member_elt, 'briefdescription') + '\n\n'
-  complete_descr += element_text(member_elt, 'detaileddescription')
-
-  if complete_descr:
-    full_entry += '\n\n' + complete_descr
-
-  return full_entry
-
-
-def brief_member_entry(member_elt):
-  """Generate the description of `member_elt` for the "Member Summary"."""
-  brief_item = ''
-  brief_descr = element_text(member_elt, 'briefdescription')
-  if brief_descr:
-    brief_item = '\n  * ' + brief_descr
-  sig = member_sig(member_elt)
-  memdef = member_definition(member_elt)
-  linkified_sig = '[`{0}`](#{1})'.format(sig, anchorize(memdef))
-
-  return '* ' + linkified_sig + brief_item
-
-
-def all_briefs(members):
-  briefs = [brief_member_entry(member_elt) for member_elt in members]
-  return '\n'.join(briefs)
-
-
-def all_fulls(members):
-  fulls = [full_member_entry(member_elt) for member_elt in members]
-  return '\n\n'.join(fulls)
-
-
-def page_overview(class_elt):
-  """Returns the contents of the .md file for `class_elt`."""
-  overview_brief = ''
-  overview_details = ''
-
-  briefs = class_elt.findAll('briefdescription', recursive=False)
-  if briefs:
-    overview_brief = element_text(briefs[0], None)
-
-  details = class_elt.findAll('detaileddescription', recursive=False)
-  if details:
-    overview_details = element_text(details[0], None)
-
-  return overview_brief + '\n\n' + overview_details
-
-
-def page_with_name(pages, name):
-  def match(n):
-    for i in xrange(len(pages)):
-      if pages[i].get_name() == n:
-        return i
-    return None
-  return match(name) or match('tensorflow::' + name)
-
-
-def get_all_indexed_pages():
-  all_pages = set()
-  lines = INDEX_TEMPLATE.split('\n')
-  for i in range(len(lines)):
-    if lines[i].startswith('@@'):
-      name = lines[i][2:]
-      all_pages.add(name)
-  return all_pages
-
-
-def index_page(pages):
-  """Create the index page linking to `pages` using INDEX_TEMPLATE."""
-  pages = pages[:]
-  lines = INDEX_TEMPLATE.split('\n')
-  all_md_files = []
-  for i in range(len(lines)):
-    if lines[i].startswith('@@'):
-      name = lines[i][2:]
-      page_index = page_with_name(pages, name)
-      if page_index is None:
-        raise ValueError('Missing page with name: ' + name)
-      lines[i] = '* [{0}]({1})'.format(
-          pages[page_index].get_name(), pages[page_index].get_md_filename())
-      all_md_files.append(pages[page_index].get_md_filename())
-      pages.pop(page_index)
-
-  return '\n'.join(lines)
-
-
-def page_in_name_list(page, names):
-  for name in names:
-    if page.get_name() == name or page.get_name() == 'tensorflow::' + name:
-      return True
-  return False
-
-
-class Page(object):
-  """Holds the MarkDown converted contents of a .xml page."""
-
-  def __init__(self, xml_path, deftype):
-    self.type = deftype
-    xml_file = open(xml_path)
-    xml = xml_file.read()
-    xml = xml.replace('<computeroutput>', '`').replace('</computeroutput>', '`')
-    # TODO(josh11b): Should not use HTML entities inside ```...```.
-    soup = BeautifulStoneSoup(
-        xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
-    self.name = soup.find('compoundname').text
-    print('Making page with name ' + self.name + ' (from ' + xml_path + ')')
-    members = soup('memberdef', prot='public')
-    fulls = all_fulls(members)
-    self.overview = page_overview(soup.find('compounddef'))
-    self.page_text = PAGE_TEMPLATE.format(
-        self.type, self.name, self.overview, fulls)
-
-  def get_text(self):
-    return self.page_text
-
-  def get_name(self):
-    return self.name
-
-  def get_short_name(self):
-    parse = self.get_name().split('::')
-    return parse[len(parse)-1]
-
-  def get_type(self):
-    return self.type
-
-  def get_md_filename(self):
-    capitalized_type = self.get_type()[0].upper() + self.get_type()[1:]
-    return capitalized_type + anchorize(self.get_short_name()) + '.md'
-
-
-def main(unused_argv):
-  print('Converting in ' + FLAGS.src_dir)
-  pages = []
-  all_pages = get_all_indexed_pages()
-  xml_files = os.listdir(FLAGS.src_dir)
-  for fname in xml_files:
-    if len(fname) < 6: continue
-    newpage = None
-    if fname[0:5] == 'class':
-      newpage = Page(os.path.join(FLAGS.src_dir, fname), 'class')
-    elif fname[0:6] == 'struct':
-      newpage = Page(os.path.join(FLAGS.src_dir, fname), 'struct')
-    if newpage is not None and page_in_name_list(newpage, all_pages):
-      pages.append(newpage)
-      md_filename = newpage.get_md_filename()
-      print('Writing ' + md_filename)
-      md_file = open(os.path.join(FLAGS.out_dir, md_filename), 'w')
-      print(newpage.get_text(), file=md_file)
-
-  index_text = index_page(pages)
-  index_md_file = open(os.path.join(FLAGS.out_dir, 'index.md'), 'w')
-  print(index_text, file=index_md_file)
-  return 0
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--src_dir',
-      type=str,
-      default=None,
-      help='Directory containing the doxygen output.'
-  )
-  parser.add_argument(
-      '--out_dir',
-      type=str,
-      default=None,
-      help='Directory to which docs should be written.'
-  )
-  FLAGS = parser.parse_args()
-
-  tf.app.run()
diff --git a/tensorflow/tools/docs/gen_docs.sh b/tensorflow/tools/docs/gen_docs.sh
deleted file mode 100755
index 4f529270ab4b250f857aae16c2ac5869525bb5ef..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docs/gen_docs.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-# This script needs to be run from the tensorflow/tools/docs directory
-# Pass -a to also rebuild C++ docs. This requires doxygen.
-
-set -e
-
-DOC_DIR="g3doc/api_docs"
-DOXYGEN_BIN=${DOXYGEN:-doxygen}
-DOXYGEN_CONFIG="tools/docs/tf-doxy_for_md-config"
-# The TMP_DIR is set inside DOXYGEN_CONFIG and cannot be changed independently
-TMP_DIR=/tmp/tensorflow-docs/xml
-
-if [ ! -f gen_docs.sh ]; then
-  echo "This script must be run from inside the tensorflow/tools/docs directory."
-  exit 1
-fi
-
-# go to the tensorflow/ directory
-pushd ../..
-BASE=$(pwd)
-
-# Make Python docs
-bazel run -- //tensorflow/python:gen_docs_combined \
-    --out_dir=$BASE/$DOC_DIR/python
-
-# Check if we should build c++ docs (if -a is given)
-if [ x$1 == x-a ]; then
-  mkdir -p $TMP_DIR
-  $DOXYGEN_BIN "$BASE/$DOXYGEN_CONFIG"
-  bazel run -- //tensorflow/tools/docs:gen_cc_md \
-      --out_dir=$BASE/$DOC_DIR/cc \
-      --src_dir=$TMP_DIR
-fi
-
-popd
diff --git a/tensorflow/tools/docs/gen_docs_test.sh b/tensorflow/tools/docs/gen_docs_test.sh
deleted file mode 100755
index c8c1955aa06f98381942180eea4ea9706d15cb3d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docs/gen_docs_test.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-set -eux
-
-if [ -d $TEST_SRCDIR/org_tensorflow ]; then
-  TFDIR=$TEST_SRCDIR/org_tensorflow/tensorflow
-else
-  # Support 0.2.1- runfiles.
-  TFDIR=$TEST_SRCDIR/tensorflow
-fi
-DOXYGEN=doxygen
-DOXYGEN_CONFIG="tf-doxy_for_md-config"
-TMP_DIR=/tmp/tensorflow-docs
-mkdir -p $TMP_DIR/python
-mkdir -p $TMP_DIR/xml
-mkdir -p $TMP_DIR/cc
-
-pushd $TFDIR
-python/gen_docs_combined --out_dir=$TMP_DIR/python
-
-# TODO(wicke): this does not work well inside the build/test jail
-#$DOXYGEN "tools/docs/$DOXYGEN_CONFIG"
-#tools/docs/gen_cc_md \
-#    --out_dir=$TMP_DIR/cc \
-#    --src_dir=$TMP_DIR/xml
-popd
-echo "PASS"
diff --git a/tensorflow/tools/docs/generate.py b/tensorflow/tools/docs/generate.py
index 1a3bbcbf7a29d30345ff0acd5660e7e8e4cf21ac..fc93085e3e0316cf274f4d9b325d6af0ea3a2f83 100644
--- a/tensorflow/tools/docs/generate.py
+++ b/tensorflow/tools/docs/generate.py
@@ -18,16 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import os
 import sys
 
 import tensorflow as tf
 
 from tensorflow.python import debug as tf_debug
+from tensorflow.python.util import tf_inspect
 from tensorflow.tools.docs import generate_lib
 
-
 if __name__ == '__main__':
   doc_generator = generate_lib.DocGenerator()
   doc_generator.add_output_dir_argument()
@@ -38,7 +37,7 @@ if __name__ == '__main__':
   # tensorflow/, we can compute the base directory (two levels up), which is
   # valid unless we're trying to apply this to a different code base, or are
   # moving the script around.
-  script_dir = os.path.dirname(inspect.getfile(inspect.currentframe()))
+  script_dir = os.path.dirname(tf_inspect.getfile(tf_inspect.currentframe()))
   default_base_dir = os.path.join(script_dir, '..', '..')
   doc_generator.add_base_dir_argument(default_base_dir)
 
@@ -46,6 +45,5 @@ if __name__ == '__main__':
 
   # tf_debug is not imported with tf, it's a separate module altogether
   doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
-  doc_generator.load_contrib()
 
   sys.exit(doc_generator.build(flags))
diff --git a/tensorflow/tools/docs/generate_1_0.py b/tensorflow/tools/docs/generate_1_0.py
index 97a37252a96a492f309c8d692651e6e68ee722b2..ddafcebd11803599785b57ecf268fcab364f6deb 100644
--- a/tensorflow/tools/docs/generate_1_0.py
+++ b/tensorflow/tools/docs/generate_1_0.py
@@ -18,16 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import inspect
 import os
 import sys
 
 import tensorflow as tf
 
 from tensorflow.python import debug as tf_debug
+from tensorflow.python.util import tf_inspect
 from tensorflow.tools.docs import generate_lib
 
-
 if __name__ == '__main__':
   doc_generator = generate_lib.DocGenerator()
   doc_generator.add_output_dir_argument()
@@ -38,7 +37,7 @@ if __name__ == '__main__':
   # tensorflow/, we can compute the base directory (two levels up), which is
   # valid unless we're trying to apply this to a different code base, or are
   # moving the script around.
-  script_dir = os.path.dirname(inspect.getfile(inspect.currentframe()))
+  script_dir = os.path.dirname(tf_inspect.getfile(tf_inspect.currentframe()))
   default_base_dir = os.path.join(script_dir, '..', '..')
   doc_generator.add_base_dir_argument(default_base_dir)
 
@@ -47,7 +46,6 @@ if __name__ == '__main__':
   # tf_debug is not imported with tf, it's a separate module altogether
   doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
 
-  doc_generator.load_contrib()
   doc_generator.set_do_not_descend_map({
       '': ['cli', 'lib', 'wrappers'],
       'contrib': [
@@ -68,21 +66,14 @@ if __name__ == '__main__':
           'tfprof',
       ],
       'contrib.bayesflow': [
-          'entropy', 'monte_carlo',
-          'special_math', 'stochastic_gradient_estimators',
-          'stochastic_graph', 'stochastic_tensor',
-          'stochastic_variables', 'variational_inference'
+          'entropy', 'monte_carlo', 'special_math',
+          'stochastic_gradient_estimators', 'stochastic_graph',
+          'stochastic_tensor', 'stochastic_variables', 'variational_inference'
       ],
       'contrib.distributions': ['bijector'],
       'contrib.ffmpeg': ['ffmpeg_ops'],
       'contrib.graph_editor': [
-          'edit',
-          'match',
-          'reroute',
-          'subgraph',
-          'transform',
-          'select',
-          'util'
+          'edit', 'match', 'reroute', 'subgraph', 'transform', 'select', 'util'
       ],
       'contrib.layers': ['feature_column', 'summaries'],
       'contrib.learn': [
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 640ddb6df832a0645d682d84e6ccb2a7dc3cd619..3494c7f8c5a9d71967fea0137a2f81a38cfdbd38 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -19,11 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import inspect
 import os
 
 import six
 
+from tensorflow.python.util import tf_inspect
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
 from tensorflow.tools.docs import doc_generator_visitor
@@ -32,18 +32,18 @@ from tensorflow.tools.docs import pretty_docs
 from tensorflow.tools.docs import py_guide_parser
 
 
-def  _is_free_function(py_object, full_name, index):
+def _is_free_function(py_object, full_name, index):
   """Check if input is a free function (and not a class- or static method)."""
-  if not inspect.isfunction(py_object):
+  if not tf_inspect.isfunction(py_object):
     return False
 
-  # Static methods are functions to inspect (in 2.7), so check if the parent
+  # Static methods are functions to tf_inspect (in 2.7), so check if the parent
   # is a class. If there is no parent, it's not a function.
   if '.' not in full_name:
     return False
 
   parent_name = full_name.rsplit('.', 1)[0]
-  if inspect.isclass(index[parent_name]):
+  if tf_inspect.isclass(index[parent_name]):
     return False
 
   return True
@@ -64,8 +64,16 @@ def write_docs(output_dir, parser_config, yaml_toc):
     parser_config: A `parser.ParserConfig` object, containing all the necessary
       indices.
     yaml_toc: Set to `True` to generate a "_toc.yaml" file.
+
+  Raises:
+    ValueError: if `output_dir` is not an absolute path
   """
   # Make output_dir.
+  if not os.path.isabs(output_dir):
+    raise ValueError(
+        "'output_dir' must be an absolute path.\n"
+        "    output_dir='%s'" % output_dir)
+
   try:
     if not os.path.exists(output_dir):
       os.makedirs(output_dir)
@@ -87,7 +95,7 @@ def write_docs(output_dir, parser_config, yaml_toc):
       continue
 
     # Methods and some routines are documented only as part of their class.
-    if not (inspect.ismodule(py_object) or inspect.isclass(py_object) or
+    if not (tf_inspect.ismodule(py_object) or tf_inspect.isclass(py_object) or
             _is_free_function(py_object, full_name, parser_config.index)):
       continue
 
@@ -99,7 +107,7 @@ def write_docs(output_dir, parser_config, yaml_toc):
     symbol_to_file[full_name] = sitepath
 
     # For a module, remember the module for the table-of-contents
-    if inspect.ismodule(py_object):
+    if tf_inspect.ismodule(py_object):
       if full_name in parser_config.tree:
         module_children.setdefault(full_name, [])
 
@@ -109,7 +117,7 @@ def write_docs(output_dir, parser_config, yaml_toc):
       subname = str(full_name)
       while True:
         subname = subname[:subname.rindex('.')]
-        if inspect.ismodule(parser_config.index[subname]):
+        if tf_inspect.ismodule(parser_config.index[subname]):
           module_children.setdefault(subname, []).append(full_name)
           break
 
@@ -143,23 +151,24 @@ def write_docs(output_dir, parser_config, yaml_toc):
       f.write('# Automatically generated file; please do not edit\ntoc:\n')
       for module in modules:
         f.write('  - title: ' + module + '\n'
-                '    section:\n' +
-                '    - title: Overview\n' +
-                '      path: /TARGET_DOC_ROOT/VERSION/' +
-                symbol_to_file[module] + '\n')
+                '    section:\n' + '    - title: Overview\n' +
+                '      path: /TARGET_DOC_ROOT/VERSION/' + symbol_to_file[module]
+                + '\n')
 
         symbols_in_module = module_children.get(module, [])
-        symbols_in_module.sort(key=lambda a: a.upper())
+        # Sort case-insensitive, if equal sort case sensitive (upper first)
+        symbols_in_module.sort(key=lambda a: (a.upper(), a))
 
         for full_name in symbols_in_module:
-          f.write('    - title: ' + full_name[len(module)+1:] + '\n'
+          f.write('    - title: ' + full_name[len(module) + 1:] + '\n'
                   '      path: /TARGET_DOC_ROOT/VERSION/' +
                   symbol_to_file[full_name] + '\n')
 
   # Write a global index containing all full names with links.
   with open(os.path.join(output_dir, 'index.md'), 'w') as f:
-    f.write(parser.generate_global_index('TensorFlow', parser_config.index,
-                                         parser_config.reference_resolver))
+    f.write(
+        parser.generate_global_index('TensorFlow', parser_config.index,
+                                     parser_config.reference_resolver))
 
 
 def add_dict_to_dict(add_from, add_to):
@@ -189,7 +198,6 @@ def _get_default_do_not_descend_map():
           'tensor_forest',
           'tensorboard',
           'testing',
-          'training',
           'tfprof',
       ],
       'contrib.bayesflow': [
@@ -198,13 +206,7 @@ def _get_default_do_not_descend_map():
       ],
       'contrib.ffmpeg': ['ffmpeg_ops'],
       'contrib.graph_editor': [
-          'edit',
-          'match',
-          'reroute',
-          'subgraph',
-          'transform',
-          'select',
-          'util'
+          'edit', 'match', 'reroute', 'subgraph', 'transform', 'select', 'util'
       ],
       'contrib.keras': ['api', 'python'],
       'contrib.layers': ['feature_column', 'summaries'],
@@ -263,10 +265,19 @@ class _DocInfo(object):
 def build_doc_index(src_dir):
   """Build an index from a keyword designating a doc to _DocInfo objects."""
   doc_index = {}
+  if not os.path.isabs(src_dir):
+    raise ValueError("'src_dir' must be an absolute path.\n"
+                     "    src_dir='%s'" % src_dir)
+
+  if not os.path.exists(src_dir):
+    raise ValueError("'src_dir' path must exist.\n"
+                     "    src_dir='%s'" % src_dir)
+
   for dirpath, _, filenames in os.walk(src_dir):
     suffix = os.path.relpath(path=dirpath, start=src_dir)
     for base_name in filenames:
-      if not base_name.endswith('.md'): continue
+      if not base_name.endswith('.md'):
+        continue
       title_parser = _GetMarkdownTitle()
       title_parser.process(os.path.join(dirpath, base_name))
       key_parts = os.path.join(suffix, base_name[:-3]).split('/')
@@ -283,8 +294,8 @@ def build_doc_index(src_dir):
 class _GuideRef(object):
 
   def __init__(self, base_name, title, section_title, section_tag):
-    self.url = 'api_guides/python/' + (
-        ('%s#%s' % (base_name, section_tag)) if section_tag else base_name)
+    self.url = 'api_guides/python/' + (('%s#%s' % (base_name, section_tag))
+                                       if section_tag else base_name)
     self.link_text = (('%s > %s' % (title, section_title))
                       if section_title else title)
 
@@ -320,8 +331,9 @@ class _GenerateGuideIndex(py_guide_parser.PyGuideParser):
     """Index @{symbol} references as in the current file & section."""
     for match in parser.SYMBOL_REFERENCE_RE.finditer(line):
       val = self.index.get(match.group(1), [])
-      val.append(_GuideRef(
-          self.base_name, self.title, self.section_title, self.section_tag))
+      val.append(
+          _GuideRef(self.base_name, self.title, self.section_title,
+                    self.section_tag))
       self.index[match.group(1)] = val
 
 
@@ -383,8 +395,8 @@ def _other_docs(src_dir, output_dir, reference_resolver):
         print('Processing doc %s...' % suffix)
         md_string = open(full_in_path).read()
 
-      output = reference_resolver.replace_references(
-          md_string, relative_path_to_root)
+      output = reference_resolver.replace_references(md_string,
+                                                     relative_path_to_root)
       with open(full_out_path, 'w') as f:
         f.write(header + output)
 
@@ -406,8 +418,7 @@ class DocGenerator(object):
         type=str,
         default=None,
         required=True,
-        help='Directory to write docs to.'
-    )
+        help='Directory to write docs to.')
 
   def add_src_dir_argument(self):
     self.argument_parser.add_argument(
@@ -415,16 +426,14 @@ class DocGenerator(object):
         type=str,
         default=None,
         required=True,
-        help='Directory with the source docs.'
-    )
+        help='Directory with the source docs.')
 
   def add_base_dir_argument(self, default_base_dir):
     self.argument_parser.add_argument(
         '--base_dir',
         type=str,
         default=default_base_dir,
-        help='Base directory to to strip from file names referenced in docs.'
-    )
+        help='Base directory to to strip from file names referenced in docs.')
 
   def parse_known_args(self):
     flags, _ = self.argument_parser.parse_known_args()
@@ -439,19 +448,6 @@ class DocGenerator(object):
   def set_py_modules(self, py_modules):
     self._py_modules = py_modules
 
-  def load_contrib(self):
-    """Access something in contrib so tf.contrib is properly loaded."""
-    # Without this, it ends up hidden behind lazy loading.  Requires
-    # that the caller has already called set_py_modules().
-    if self._py_modules is None:
-      raise RuntimeError(
-          'Must call set_py_modules() before running load_contrib().')
-    for name, module in self._py_modules:
-      if name == 'tf':
-        _ = module.contrib.__name__
-        return True
-    return False
-
   def py_module_names(self):
     if self._py_modules is None:
       raise RuntimeError(
diff --git a/tensorflow/tools/docs/generate_lib_test.py b/tensorflow/tools/docs/generate_lib_test.py
index c8d4c3fe7e215316a6aab96d8927cb56ae288a40..49f7ff597ed75ea33bac348e9daa996fa824e60f 100644
--- a/tensorflow/tools/docs/generate_lib_test.py
+++ b/tensorflow/tools/docs/generate_lib_test.py
@@ -56,7 +56,7 @@ class GenerateTest(googletest.TestCase):
 
   def test_extraction(self):
     py_modules = [('tf', tf), ('tfdbg', tf_debug)]
-    _ = tf.contrib.__name__  # Trigger loading of tf.contrib
+
     try:
       generate_lib.extract(
           py_modules, generate_lib._get_default_do_not_descend_map())
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 3da58d2b3c7b1e645cb363a4b0c76b3a9aaa24e7..526ffe93cd48781af1e8ac0cc7f6c8d05a1f1a65 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import ast
 import collections
 import functools
-import inspect
 import json
 import os
 import re
@@ -30,6 +29,8 @@ import codegen
 import six
 
 from google.protobuf.message import Message as ProtoMessage
+from tensorflow.python.util import tf_inspect
+
 
 # A regular expression capturing a python indentifier.
 IDENTIFIER_RE = '[a-zA-Z_][a-zA-Z0-9_]*'
@@ -71,12 +72,12 @@ def _get_raw_docstring(py_object):
   Returns:
     The docstring, or the empty string if no docstring was found.
   """
-  # For object instances, inspect.getdoc does give us the docstring of their
+  # For object instances, tf_inspect.getdoc does give us the docstring of their
   # type, which is not what we want. Only return the docstring if it is useful.
-  if (inspect.isclass(py_object) or inspect.ismethod(py_object) or
-      inspect.isfunction(py_object) or inspect.ismodule(py_object) or
+  if (tf_inspect.isclass(py_object) or tf_inspect.ismethod(py_object) or
+      tf_inspect.isfunction(py_object) or tf_inspect.ismodule(py_object) or
       isinstance(py_object, property)):
-    return inspect.getdoc(py_object) or ''
+    return tf_inspect.getdoc(py_object) or ''
   else:
     return ''
 
@@ -119,12 +120,12 @@ class ReferenceResolver(object):
       an instance of `ReferenceResolver` ()
     """
     is_class = {
-        name: inspect.isclass(visitor.index[name])
+        name: tf_inspect.isclass(visitor.index[name])
         for name, obj in visitor.index.items()
     }
 
     is_module = {
-        name: inspect.ismodule(visitor.index[name])
+        name: tf_inspect.ismodule(visitor.index[name])
         for name, obj in visitor.index.items()
     }
 
@@ -530,7 +531,7 @@ def _parse_md_docstring(py_object, relative_path_to_root, reference_resolver):
 def _get_arg_spec(func):
   """Extracts signature information from a function or functools.partial object.
 
-  For functions, uses `inspect.getargspec`. For `functools.partial` objects,
+  For functions, uses `tf_inspect.getargspec`. For `functools.partial` objects,
   corrects the signature of the underlying function to take into account the
   removed arguments.
 
@@ -539,11 +540,11 @@ def _get_arg_spec(func):
 
   Returns:
     An `ArgSpec` namedtuple `(args, varargs, keywords, defaults)`, as returned
-    by `inspect.getargspec`.
+    by `tf_inspect.getargspec`.
   """
   # getargspec does not work for functools.partial objects directly.
   if isinstance(func, functools.partial):
-    argspec = inspect.getargspec(func.func)
+    argspec = tf_inspect.getargspec(func.func)
     # Remove the args from the original function that have been used up.
     first_default_arg = (
         len(argspec.args or []) - len(argspec.defaults or []))
@@ -566,12 +567,12 @@ def _get_arg_spec(func):
           argspec_defaults.pop(i-first_default_arg)
         else:
           first_default_arg -= 1
-    return inspect.ArgSpec(args=argspec_args,
-                           varargs=argspec.varargs,
-                           keywords=argspec.keywords,
-                           defaults=tuple(argspec_defaults))
+    return tf_inspect.ArgSpec(args=argspec_args,
+                              varargs=argspec.varargs,
+                              keywords=argspec.keywords,
+                              defaults=tuple(argspec_defaults))
   else:  # Regular function or method, getargspec will work fine.
-    return inspect.getargspec(func)
+    return tf_inspect.getargspec(func)
 
 
 def _remove_first_line_indent(string):
@@ -583,7 +584,7 @@ def _generate_signature(func, reverse_index):
   """Given a function, returns a list of strings representing its args.
 
   This function produces a list of strings representing the arguments to a
-  python function. It uses inspect.getargspec, which
+  python function. It uses tf_inspect.getargspec, which
   does not generalize well to Python 3.x, which is more flexible in how *args
   and **kwargs are handled. This is not a problem in TF, since we have to remain
   compatible to Python 2.7 anyway.
@@ -603,9 +604,6 @@ def _generate_signature(func, reverse_index):
     code.
   """
 
-  # This produces poor signatures for decorated functions.
-  # TODO(wicke): We need to use something like the decorator module to fix it.
-
   args_list = []
 
   argspec = _get_arg_spec(func)
@@ -624,7 +622,7 @@ def _generate_signature(func, reverse_index):
   # Add all args with defaults.
   if argspec.defaults:
     try:
-      source = _remove_first_line_indent(inspect.getsource(func))
+      source = _remove_first_line_indent(tf_inspect.getsource(func))
       func_ast = ast.parse(source)
       ast_defaults = func_ast.body[0].args.defaults
     except IOError:  # If this is a builtin, getsource fails with IOError
@@ -689,7 +687,7 @@ def _get_guides_markdown(duplicate_names, guide_index, relative_path):
 
 
 def _get_defining_class(py_class, name):
-  for cls in inspect.getmro(py_class):
+  for cls in tf_inspect.getmro(py_class):
     if name in cls.__dict__:
       return cls
   return None
@@ -936,15 +934,15 @@ class _ClassPageInfo(object):
       if isinstance(child, property):
         self._add_property(short_name, child_name, child, child_doc)
 
-      elif inspect.isclass(child):
+      elif tf_inspect.isclass(child):
         if defining_class is None:
           continue
         url = parser_config.reference_resolver.reference_to_url(
             child_name, relative_path)
         self._add_class(short_name, child_name, child, child_doc, url)
 
-      elif (inspect.ismethod(child) or inspect.isfunction(child) or
-            inspect.isroutine(child)):
+      elif (tf_inspect.ismethod(child) or tf_inspect.isfunction(child) or
+            tf_inspect.isroutine(child)):
         if defining_class is None:
           continue
 
@@ -967,7 +965,7 @@ class _ClassPageInfo(object):
           child_signature = _generate_signature(child,
                                                 parser_config.reverse_index)
         except TypeError:
-          # If this is a (dynamically created) slot wrapper, inspect will
+          # If this is a (dynamically created) slot wrapper, tf_inspect will
           # raise typeerror when trying to get to the code. Ignore such
           # functions.
           continue
@@ -1106,13 +1104,13 @@ class _ModulePageInfo(object):
       url = parser_config.reference_resolver.reference_to_url(
           member_full_name, relative_path)
 
-      if inspect.ismodule(member):
+      if tf_inspect.ismodule(member):
         self._add_module(name, member_full_name, member, member_doc, url)
 
-      elif inspect.isclass(member):
+      elif tf_inspect.isclass(member):
         self._add_class(name, member_full_name, member, member_doc, url)
 
-      elif inspect.isfunction(member):
+      elif tf_inspect.isfunction(member):
         self._add_function(name, member_full_name, member, member_doc, url)
 
       else:
@@ -1196,17 +1194,17 @@ def docs_for_object(full_name, py_object, parser_config):
   duplicate_names = parser_config.duplicates.get(master_name, [full_name])
 
   # TODO(wicke): Once other pieces are ready, enable this also for partials.
-  if (inspect.ismethod(py_object) or inspect.isfunction(py_object) or
+  if (tf_inspect.ismethod(py_object) or tf_inspect.isfunction(py_object) or
       # Some methods in classes from extensions come in as routines.
-      inspect.isroutine(py_object)):
+      tf_inspect.isroutine(py_object)):
     page_info = _FunctionPageInfo(master_name)
     page_info.set_signature(py_object, parser_config.reverse_index)
 
-  elif inspect.isclass(py_object):
+  elif tf_inspect.isclass(py_object):
     page_info = _ClassPageInfo(master_name)
     page_info.collect_docs_for_class(py_object, parser_config)
 
-  elif inspect.ismodule(py_object):
+  elif tf_inspect.ismodule(py_object):
     page_info = _ModulePageInfo(master_name)
     page_info.collect_docs_for_module(parser_config)
 
@@ -1341,7 +1339,7 @@ def _get_defined_in(py_object, parser_config):
   # TODO(wicke): Only use decorators that support this in TF.
 
   try:
-    path = os.path.relpath(path=inspect.getfile(py_object),
+    path = os.path.relpath(path=tf_inspect.getfile(py_object),
                            start=parser_config.base_dir)
   except TypeError:  # getfile throws TypeError if py_object is a builtin.
     return _PythonBuiltin()
@@ -1384,15 +1382,15 @@ def generate_global_index(library_name, index, reference_resolver):
   """
   symbol_links = []
   for full_name, py_object in six.iteritems(index):
-    if (inspect.ismodule(py_object) or inspect.isfunction(py_object) or
-        inspect.isclass(py_object)):
+    if (tf_inspect.ismodule(py_object) or tf_inspect.isfunction(py_object) or
+        tf_inspect.isclass(py_object)):
       # In Python 3, unbound methods are functions, so eliminate those.
-      if inspect.isfunction(py_object):
+      if tf_inspect.isfunction(py_object):
         if full_name.count('.') == 0:
           parent_name = ''
         else:
           parent_name = full_name[:full_name.rfind('.')]
-        if parent_name in index and inspect.isclass(index[parent_name]):
+        if parent_name in index and tf_inspect.isclass(index[parent_name]):
           # Skip methods (=functions with class parents).
           continue
       symbol_links.append((
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index 2bab6b3de4bc84268eae6a76f7b8c688efa1444a..3e02160130f1959484472ecc77e8b2e883294a1e 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -19,11 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
-import inspect
 import os
 import sys
 
 from tensorflow.python.platform import googletest
+from tensorflow.python.util import tf_inspect
 from tensorflow.tools.docs import parser
 
 
@@ -152,7 +152,7 @@ class ParserTest(googletest.TestCase):
 
     # Make sure the brief docstring is present
     self.assertEqual(
-        inspect.getdoc(TestClass).split('\n')[0], page_info.doc.brief)
+        tf_inspect.getdoc(TestClass).split('\n')[0], page_info.doc.brief)
 
     # Make sure the method is present
     self.assertEqual(TestClass.a_method, page_info.methods[0].obj)
@@ -204,7 +204,8 @@ class ParserTest(googletest.TestCase):
         full_name='TestModule', py_object=module, parser_config=parser_config)
 
     # Make sure the brief docstring is present
-    self.assertEqual(inspect.getdoc(module).split('\n')[0], page_info.doc.brief)
+    self.assertEqual(tf_inspect.getdoc(module).split('\n')[0],
+                     page_info.doc.brief)
 
     # Make sure that the members are there
     funcs = {f_info.obj for f_info in page_info.functions}
@@ -246,7 +247,7 @@ class ParserTest(googletest.TestCase):
 
     # Make sure the brief docstring is present
     self.assertEqual(
-        inspect.getdoc(test_function).split('\n')[0], page_info.doc.brief)
+        tf_inspect.getdoc(test_function).split('\n')[0], page_info.doc.brief)
 
     # Make sure the extracted signature is good.
     self.assertEqual(['unused_arg', "unused_kwarg='default'"],
@@ -285,7 +286,7 @@ class ParserTest(googletest.TestCase):
 
     # Make sure the brief docstring is present
     self.assertEqual(
-        inspect.getdoc(test_function_with_args_kwargs).split('\n')[0],
+        tf_inspect.getdoc(test_function_with_args_kwargs).split('\n')[0],
         page_info.doc.brief)
 
     # Make sure the extracted signature is good.
@@ -402,41 +403,42 @@ class ParserTest(googletest.TestCase):
 
     # pylint: disable=protected-access
     # Make sure everything works for regular functions.
-    expected = inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None, None,
-                               (1, 2))
+    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
+                                  None, (1, 2))
     self.assertEqual(expected, parser._get_arg_spec(test_function_for_partial1))
 
     # Make sure doing nothing works.
-    expected = inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None, None,
-                               (1, 2))
+    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
+                                  None, (1, 2))
     partial = functools.partial(test_function_for_partial1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting args from the front works.
-    expected = inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None, (1, 2))
+    expected = tf_inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None,
+                                  (1, 2))
     partial = functools.partial(test_function_for_partial1, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = inspect.ArgSpec(['kwarg2',], None, None, (2,))
+    expected = tf_inspect.ArgSpec(['kwarg2',], None, None, (2,))
     partial = functools.partial(test_function_for_partial1, 1, 2, 3)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting kwargs works.
-    expected = inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,))
+    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,))
     partial = functools.partial(test_function_for_partial1, kwarg1=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,))
+    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,))
     partial = functools.partial(test_function_for_partial1, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = inspect.ArgSpec(['arg1'], None, None, ())
+    expected = tf_inspect.ArgSpec(['arg1'], None, None, ())
     partial = functools.partial(test_function_for_partial1,
                                 arg2=0, kwarg1=0, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure *args, *kwargs is accounted for.
-    expected = inspect.ArgSpec([], 'my_args', 'my_kwargs', ())
+    expected = tf_inspect.ArgSpec([], 'my_args', 'my_kwargs', ())
     partial = functools.partial(test_function_for_partial2, 0, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index 824f46170ed0e41103dfa5e689bb0236a69bdb6e..918d475e0d7cc186bf03585455e5f0115de61b82 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -213,6 +213,14 @@ def _build_module_page(page_info):
 
 def _build_signature(obj_info):
   """Returns a md code block showing the function signature."""
+  # Special case tf.range, since it has an optional first argument
+  if obj_info.full_name == 'tf.range':
+    return (
+        '``` python\n'
+        "range(limit, delta=1, dtype=None, name='range')\n"
+        "range(start, limit, delta=1, dtype=None, name='range')\n"
+        '```\n\n')
+
   signature_template = '\n'.join([
       '``` python',
       '{name}({sig})',
@@ -230,7 +238,7 @@ def _build_signature(obj_info):
 
 
 def _build_compatibility(compatibility):
-  """Return the compatability section as an md string."""
+  """Return the compatibility section as an md string."""
   parts = []
   sorted_keys = sorted(compatibility.keys())
   for key in sorted_keys:
diff --git a/tensorflow/tools/docs/py_guide_parser.py b/tensorflow/tools/docs/py_guide_parser.py
index 3ca6d11b8478c8fcfc00e5f4dc3a93ee0f9d6a15..216353ecee377260efd5a19c8536ac41c17592a9 100644
--- a/tensorflow/tools/docs/py_guide_parser.py
+++ b/tensorflow/tools/docs/py_guide_parser.py
@@ -34,8 +34,8 @@ def md_files_in_dir(py_guide_src_dir):
 class PyGuideParser(object):
   """Simple parsing of a guide .md file.
 
-  Decendents can override the process_*() functions (called by process())
-  to either record infromation from the guide, or call replace_line()
+  Descendants can override the process_*() functions (called by process())
+  to either record information from the guide, or call replace_line()
   to affect the return value of process().
   """
 
diff --git a/tensorflow/tools/docs/tf-doxy_for_md-config b/tensorflow/tools/docs/tf-doxy_for_md-config
deleted file mode 100644
index b7fd6e95076cad2fd9b77d78bbeb826c9dfe64d2..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docs/tf-doxy_for_md-config
+++ /dev/null
@@ -1,2280 +0,0 @@
-# Doxyfile 1.8.5
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME           = "TensorFlow"
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER         = 0.0.0
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF          =
-
-# With the PROJECT_LOGO tag one can specify an logo or icon that is included in
-# the documentation. The maximum height of the logo should not exceed 55 pixels
-# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
-# to the output directory.
-
-PROJECT_LOGO           =
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = /tmp/tensorflow-docs/
-
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS         = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-
-# Traditional, Croatian, Czech, Danish, Dutch, English, Esperanto, Farsi,
-# Finnish, French, German, Greek, Hungarian, Italian, Japanese, Japanese-en,
-# Korean, Korean-en, Latvian, Norwegian, Macedonian, Persian, Polish,
-# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish,
-# Turkish, Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity):The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF       =
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
-# before files name in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used
-# The default value is: YES.
-
-FULL_PATH_NAMES        = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH        =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH    =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful is your file systems doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that rational rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
-# new page for each member. If set to NO, the documentation of a member will be
-# part of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE               = 4
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES                =
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
-# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
-# (default is Fortran), use: inc=Fortran f=C.
-#
-# Note For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING      =
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibilities issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT       = YES
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word
-# or globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT       = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also make the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT    = NO
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen to replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING            = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS  = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT   = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL            = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE        = NO
-
-# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE        = NO
-
-# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC         = YES
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. When set to YES local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS  = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespace
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES   = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO these classes will be included in the various overviews. This option has
-# no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES       = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS       = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO the members will appear in declaration order.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS        = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME     = NO
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING  = NO
-
-# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
-# todo list. This list is created by putting \todo commands in the
-# documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
-# test list. This list is created by putting \test commands in the
-# documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if <section_label> ... \endif and \cond <section_label>
-# ... \endcond blocks.
-
-ENABLED_SECTIONS       =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES the list
-# will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES        = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command input-file, where command is the value of the
-# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
-# by doxygen. Whatever the program writes to standard output is used as the file
-# version. For an example see the documentation.
-
-FILE_VERSION_FILTER    =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE            =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. Do not use file names with spaces, bibtex cannot handle them. See
-# also \cite for info how to create references.
-
-CITE_BIB_FILES         =
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO doxygen will only warn about wrong or incomplete parameter
-# documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  = core/framework core/lib/core core/platform core/public
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank the
-# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
-# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
-# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
-# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
-# *.qsf, *.as and *.js.
-
-FILE_PATTERNS          =
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = NO
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                =
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS        =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-
-INPUT_FILTER           =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS        =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER ) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS =
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE =
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML          = NO
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML the header file that includes any scripts and style sheets
-# that doxygen needs, which is dependent on the configuration options used (e.g.
-# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            =
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
-# defined cascading style sheet that is included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefor more robust against future updates.
-# Doxygen will copy the style sheet file to the output directory. For an example
-# see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  =
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the stylesheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries 1 will produce a full collapsed tree by default. 0 is a special value
-# representing an infinite number of entries and will result in a full expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler ( hhc.exe). If non-empty
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated (
-# YES) or that it should be included in the master .chm file ( NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated (
-# YES) or a normal table of contents ( NO) in the .chm file.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using prerendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want to formulas look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
-# key> to jump into the search results window, the results can be navigated
-# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
-# the search. The filter options can be selected when the cursor is inside the
-# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
-# to select a filter and <Enter> or <escape> to activate or cancel the filter
-# option.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-SEARCHENGINE           = NO
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
-# are two flavours of web server based searching depending on the
-# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for
-# searching and an index file used by the script. When EXTERNAL_SEARCH is
-# enabled the indexing and searching needs to be provided by external tools. See
-# the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SERVER_BASED_SEARCH    = NO
-
-# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
-# script for searching. Instead the search results are written to an XML file
-# which needs to be processed by an external indexer. Doxygen will invoke an
-# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
-# search results.
-#
-# Doxygen ships with an example indexer ( doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
-#
-# See the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH        = NO
-
-# The SEARCHENGINE_URL should point to a search engine hosted by a web server
-# which will return the search results when EXTERNAL_SEARCH is enabled.
-#
-# Doxygen ships with an example indexer ( doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHENGINE_URL       =
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
-# search data is written to a file for indexing by an external tool. With the
-# SEARCHDATA_FILE tag the name of this file can be specified.
-# The default file is: searchdata.xml.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHDATA_FILE        = searchdata.xml
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
-# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
-# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
-# projects and redirect the results back to the right project.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH_ID     =
-
-# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
-# projects other than the one defined by this configuration file, but that are
-# all added to the same external search index. Each project needs to have a
-# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
-# to a relative location where the documentation can be found. The format is:
-# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTRA_SEARCH_MAPPINGS  =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
-# The default value is: YES.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked.
-#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
-# index for LaTeX.
-# The default file is: makeindex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used by the
-# printer.
-# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
-# 14 inches) and executive (7.25 x 10.5 inches).
-# The default value is: a4.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PAPER_TYPE             = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. To get the times font for
-# instance you can specify
-# EXTRA_PACKAGES=times
-# If left blank no extra packages will be included.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-EXTRA_PACKAGES         =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
-#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber. Doxygen will
-# replace them by respectively the title of the page, the current date and time,
-# only the current date, the version number of doxygen, the project name (see
-# PROJECT_NAME), or the project number (see PROJECT_NUMBER).
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HEADER           =
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_FOOTER           =
-
-# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the LATEX_OUTPUT output
-# directory. Note that the files will be copied as-is; there are no commands or
-# markers available.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_FILES      =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
-# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
-# contain links (just like the HTML output) instead of page references. This
-# makes the output suitable for online browsing using a PDF viewer.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PDF_HYPERLINKS         = YES
-
-# If the LATEX_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES to get a
-# higher quality PDF documentation.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BATCHMODE        = NO
-
-# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
-# index chapters (such as File Index, Compound Index, etc.) in the output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HIDE_INDICES     = NO
-
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
-# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
-# bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
-# The default value is: plain.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BIB_STYLE        = plain
-
-#---------------------------------------------------------------------------
-# Configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
-# RTF output is optimized for Word 97 and may not look too pretty with other RTF
-# readers/editors.
-# The default value is: NO.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: rtf.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
-# contain hyperlink fields. The RTF file will contain links (just like the HTML
-# output) instead of page references. This makes the output suitable for online
-# browsing using Word or some other Word compatible readers that support those
-# fields.
-#
-# Note: WordPad (write) and others do not support links.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
-#
-# See also section "Doxygen usage" for information on how to generate the
-# default style sheet that doxygen normally uses.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_STYLESHEET_FILE    =
-
-# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_EXTENSIONS_FILE    =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
-# classes and files.
-# The default value is: NO.
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it. A directory man3 will be created inside the directory specified by
-# MAN_OUTPUT.
-# The default directory is: man.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
-# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
-# optional.
-# The default value is: .3.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_EXTENSION          = .3
-
-# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
-# them the man command would be unable to find the correct page.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
-# captures the structure of the code including all documentation.
-# The default value is: NO.
-
-GENERATE_XML           = YES
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: xml.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_OUTPUT             = xml
-
-# The XML_SCHEMA tag can be used to specify a XML schema, which can be used by a
-# validating XML parser to check the syntax of the XML files.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_SCHEMA             =
-
-# The XML_DTD tag can be used to specify a XML DTD, which can be used by a
-# validating XML parser to check the syntax of the XML files.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_DTD                =
-
-# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
-# listings (including syntax highlighting and cross-referencing information) to
-# the XML output. Note that enabling this will significantly increase the size
-# of the XML output.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_PROGRAMLISTING     = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the DOCBOOK output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
-# that can be used to generate PDF.
-# The default value is: NO.
-
-GENERATE_DOCBOOK       = NO
-
-# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
-# front of it.
-# The default directory is: docbook.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_OUTPUT         = docbook
-
-#---------------------------------------------------------------------------
-# Configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
-# Definitions (see http://autogen.sf.net) file that captures the structure of
-# the code including all documentation. Note that this feature is still
-# experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
-# file that captures the structure of the code including all documentation.
-#
-# Note that this feature is still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
-# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
-# output from the Perl module output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
-# formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO the
-# size of the Perl module output will be much smaller and Perl will parse it
-# just the same.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file are
-# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
-# so different doxyrules.make files included by the same Makefile don't
-# overwrite each other's variables.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_MAKEVAR_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
-# C-preprocessor directives found in the sources and include files.
-# The default value is: YES.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
-# in the source code. If set to NO only conditional compilation will be
-# performed. Macro expansion can be done in a controlled way by setting
-# EXPAND_ONLY_PREDEF to YES.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-MACRO_EXPANSION        = NO
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
-# the macro expansion is limited to the macros specified with the PREDEFINED and
-# EXPAND_AS_DEFINED tags.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES the includes files in the
-# INCLUDE_PATH will be searched if a #include is found.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by the
-# preprocessor.
-# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-
-INCLUDE_PATH           =
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will be
-# used.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-INCLUDE_FILE_PATTERNS  =
-
-# The PREDEFINED tag can be used to specify one or more macro names that are
-# defined before the preprocessor is started (similar to the -D option of e.g.
-# gcc). The argument of the tag is a list of macros of the form: name or
-# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
-# is assumed. To prevent a macro definition from being undefined via #undef or
-# recursively expanded use the := operator instead of the = operator.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-PREDEFINED             =
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
-# tag can be used to specify a list of macro names that should be expanded. The
-# macro definition that is found in the sources will be used. Use the PREDEFINED
-# tag if you want to use a different macro definition that overrules the
-# definition found in the source code.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_AS_DEFINED      =
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all refrences to function-like macros that are alone on a line, have an
-# all uppercase name, and do not end with a semicolon. Such function macros are
-# typically used for boiler-plate code, and will confuse the parser if not
-# removed.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tag files. For each tag
-# file the location of the external documentation should be added. The format of
-# a tag file without this location is as follows:
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where loc1 and loc2 can be relative or absolute paths or URLs. See the
-# section "Linking to external documentation" for more information about the use
-# of tag files.
-# Note: Each tag file must have an unique name (where the name does NOT include
-# the path). If a tag file is not located in the directory in which doxygen is
-# run, you must also specify the path to the tagfile here.
-
-TAGFILES               =
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
-# tag file that is based on the input files it reads. See section "Linking to
-# external documentation" for more information about the usage of tag files.
-
-GENERATE_TAGFILE       =
-
-# If the ALLEXTERNALS tag is set to YES all external class will be listed in the
-# class index. If set to NO only the inherited external classes will be listed.
-# The default value is: NO.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
-# the modules index. If set to NO, only the current project's groups will be
-# listed.
-# The default value is: YES.
-
-EXTERNAL_GROUPS        = YES
-
-# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
-# the related pages index. If set to NO, only the current project's pages will
-# be listed.
-# The default value is: YES.
-
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
-# If set to YES, the inheritance and collaboration graphs will hide inheritance
-# and usage relations if the target is undocumented or is not a class.
-# The default value is: YES.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
-# Bell Labs. The other options in this section have no effect if this option is
-# set to NO
-# The default value is: NO.
-
-HAVE_DOT               = NO
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
-# to run in parallel. When set to 0 doxygen will base this on the number of
-# processors available in the system. You can set it explicitly to a value
-# larger than 0 to get control over the balance between CPU load and processing
-# speed.
-# Minimum value: 0, maximum value: 32, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_NUM_THREADS        = 0
-
-# When you want a differently looking font n the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTPATH           =
-
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
-# graph for each documented class showing the direct and indirect implementation
-# dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LOOK               = NO
-
-# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
-# class node. If there are many fields or methods and many nodes the graph may
-# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
-# number of items for each type to make the size more manageable. Set this to 0
-# for no limit. Note that the threshold may be exceeded by 50% before the limit
-# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
-# but if the number exceeds 15, the total amount of fields shown is limited to
-# 10.
-# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LIMIT_NUM_FIELDS   = 10
-
-# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
-# collaboration graphs will show the relations between templates and their
-# instances.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
-# YES then doxygen will generate a graph for each documented file showing the
-# direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDE_GRAPH          = YES
-
-# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
-# set to YES then doxygen will generate a graph for each documented file showing
-# the direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALL_GRAPH             = NO
-
-# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALLER_GRAPH           = NO
-
-# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
-# hierarchy of all classes instead of a textual one.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
-# dependencies a directory has on other directories in a graphical way. The
-# dependency relations are determined by the #include relations between the
-# files in the directories.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot.
-# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
-# to make the SVG files visible in IE 9+ (other browsers do not have this
-# requirement).
-# Possible values are: png, jpg, gif and svg.
-# The default value is: png.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_IMAGE_FORMAT       = png
-
-# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
-# enable generation of interactive SVG images that allow zooming and panning.
-#
-# Note that this requires a modern browser other than Internet Explorer. Tested
-# and working are Firefox, Chrome, Safari, and Opera.
-# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
-# the SVG files visible. Older versions of IE do not have SVG support.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INTERACTIVE_SVG        = NO
-
-# The DOT_PATH tag can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_PATH               =
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the \dotfile
-# command).
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOTFILE_DIRS           =
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
-
-MSCFILE_DIRS           =
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
-# that will be shown in the graph. If the number of nodes in a graph becomes
-# larger than this value, doxygen will truncate the graph, which is visualized
-# by representing a node as a red box. Note that doxygen if the number of direct
-# children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
-# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-# Minimum value: 0, maximum value: 10000, default value: 50.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
-# generated by dot. A depth value of 3 means that only nodes reachable from the
-# root by following a path via at most 3 edges will be shown. Nodes that lay
-# further from the root node will be omitted. Note that setting this option to 1
-# or 2 may greatly reduce the computation time needed for large code bases. Also
-# note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-# Minimum value: 0, maximum value: 1000, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10) support
-# this, this feature is disabled by default.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
-# explaining the meaning of the various boxes and arrows in the dot generated
-# graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
-# files that are used to generate the various graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_CLEANUP            = YES
diff --git a/tensorflow/tools/gcs_test/Dockerfile b/tensorflow/tools/gcs_test/Dockerfile
index 1df00692725817f62b60d880a94d6316cc268c23..581bded65da1011df6c7c6b0e2c98d5e5dce72bc 100644
--- a/tensorflow/tools/gcs_test/Dockerfile
+++ b/tensorflow/tools/gcs_test/Dockerfile
@@ -3,7 +3,7 @@ FROM ubuntu:16.04
 MAINTAINER Shanqing Cai <cais@google.com>
 
 RUN apt-get update
-RUN apt-get install -y --no-install-recommends \
+RUN apt-get install -y \
     curl \
     libcurl4-openssl-dev \
     python \
diff --git a/tensorflow/tools/gcs_test/python/gcs_smoke.py b/tensorflow/tools/gcs_test/python/gcs_smoke.py
index 615e142c4718238d43f0a552b3484dd644879cc9..51933a52a66642283c250c664651b30aa4410fda 100644
--- a/tensorflow/tools/gcs_test/python/gcs_smoke.py
+++ b/tensorflow/tools/gcs_test/python/gcs_smoke.py
@@ -36,7 +36,7 @@ flags.DEFINE_integer("num_examples", 10, "Number of examples to generate")
 FLAGS = flags.FLAGS
 
 def create_examples(num_examples, input_mean):
-  """Create ExampleProto's containg data."""
+  """Create ExampleProto's containing data."""
   ids = np.arange(num_examples).reshape([num_examples, 1])
   inputs = np.random.randn(num_examples, 1) + input_mean
   target = inputs - input_mean
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index 89068c6d0169cd5401bc18e25d8355fd28890a60..20f958f640856b0af36a8c2be5438deeec331675 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -97,6 +97,7 @@ cc_library(
         "//tensorflow/core:tensorflow",
     ] + if_not_windows([
         "//tensorflow/core/kernels:quantized_ops",
+        "//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
         "//tensorflow/core/kernels/hexagon:hexagon_rewriter_transform",
     ]),
     alwayslink = 1,
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index 06ae78ef5db774f2beae4ae7501baef0bbe5a24b..53f7f1685f3861dae9d0b556c856d99d6707c602 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -81,10 +81,10 @@ bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --out_graph=optimized_inception_graph.pb \
 --inputs='Mul:0' \
 --outputs='softmax:0' \
---transforms='\
-strip_unused_nodes(type=float, shape="1,299,299,3") \
-remove_nodes(op=Identity, op=CheckNumerics) \
-fold_old_batch_norms \
+--transforms='
+strip_unused_nodes(type=float, shape="1,299,299,3")
+remove_nodes(op=Identity, op=CheckNumerics)
+fold_old_batch_norms
 '
 ```
 
@@ -94,7 +94,10 @@ transforms to modify the graph with. The transforms are given as a list of
 names, and can each have arguments themselves. These transforms define the
 pipeline of modifications that are applied in order to produce the output.
 Sometimes you need some transforms to happen before others, and the ordering
-within the list lets you specify which happen first.
+within the list lets you specify which happen first. 
+Note that the optimization 
+`remove_nodes(op=Identity, op=CheckNumerics)` will break the model with control 
+flow operations, such as `tf.cond`, `tf.map_fn`, and `tf.while`.
 
 ## Inspecting Graphs
 
@@ -103,7 +106,7 @@ output layers of the model are. The best source for these is the model training
 process, where for a classifier the inputs will be the nodes that receive the
 data from the training set, and the output will be the predictions. If you're
 unsure, the
-[summarize_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/summarize_graph_main.cc)
+[`summarize_graph`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/graph_transforms/summarize_graph_main.cc)
 tool can inspect the model and provide guesses about likely input and output nodes,
 as well as other information that's useful for debugging. Here's an example of
 how to use it on the [Inception V3
@@ -136,15 +139,14 @@ bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --in_graph=tensorflow_inception_graph.pb \
 --out_graph=optimized_inception_graph.pb \
---inputs='Mul:0' \
---outputs='softmax:0' \
---transforms='\
-strip_unused_nodes(type=float, shape="1,299,299,3") \
-remove_nodes(op=Identity, op=CheckNumerics) \
-fold_constants(ignore_errors=true) \
-fold_batch_norms \
-fold_old_batch_norms\
-'
+--inputs='Mul' \
+--outputs='softmax' \
+--transforms='
+  strip_unused_nodes(type=float, shape="1,299,299,3")
+  remove_nodes(op=Identity, op=CheckNumerics)
+  fold_constants(ignore_errors=true)
+  fold_batch_norms
+  fold_old_batch_norms'
 ```
 
 The batch norm folding is included twice because there are two different flavors
@@ -170,21 +172,20 @@ then you'll need to make local modifications to the build files to include the
 right .cc file that defines it. In a lot of cases the op is just a vestigial
 remnant from the training process though, and if that's true then you can run
 the [strip_unused_nodes](#strip_unused_nodes), specifying the inputs and outputs
-of your inference usage, to remove those unneccessary nodes:
+of your inference usage, to remove those unnecessary nodes:
 
 ```bash
 bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --in_graph=tensorflow_inception_graph.pb \
 --out_graph=optimized_inception_graph.pb \
---inputs='Mul:0' \
---outputs='softmax:0' \
---transforms='\
-strip_unused_nodes(type=float, shape="1,299,299,3") \
-fold_constants(ignore_errors=true) \
-fold_batch_norms \
-fold_old_batch_norms\
-'
+--inputs='Mul' \
+--outputs='softmax' \
+--transforms='
+  strip_unused_nodes(type=float, shape="1,299,299,3")
+  fold_constants(ignore_errors=true)
+  fold_batch_norms
+  fold_old_batch_norms'
 ```
 
 ### Shrinking File Size
@@ -212,11 +213,14 @@ bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --in_graph=tensorflow_inception_graph.pb \
 --out_graph=optimized_inception_graph.pb \
---inputs='Mul:0' \
---outputs='softmax:0' \
---transforms='\
-round_weights(num_steps=256) \
-'
+--inputs='Mul' \
+--outputs='softmax' \
+--transforms='
+  strip_unused_nodes(type=float, shape="1,299,299,3")
+  fold_constants(ignore_errors=true)
+  fold_batch_norms
+  fold_old_batch_norms
+  round_weights(num_steps=256)'
 ```
 
 You should see that the `optimized_inception_graph.pb` output file is the same
@@ -236,11 +240,14 @@ bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --in_graph=tensorflow_inception_graph.pb \
 --out_graph=optimized_inception_graph.pb \
---inputs='Mul:0' \
---outputs='softmax:0' \
---transforms='\
-quantize_weights \
-'
+--inputs='Mul' \
+--outputs='softmax' \
+--transforms='
+  strip_unused_nodes(type=float, shape="1,299,299,3")
+  fold_constants(ignore_errors=true)
+  fold_batch_norms
+  fold_old_batch_norms
+  quantize_weights'
 ```
 
 You should see that the size of the output graph is about a quarter of the
@@ -263,9 +270,8 @@ bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --out_graph=optimized_inception_graph.pb \
 --inputs='Mul:0' \
 --outputs='softmax:0' \
---transforms='\
-obfuscate_names \
-'
+--transforms='
+  obfuscate_names'
 ```
 
 ### Eight-bit Calculations
@@ -280,17 +286,19 @@ bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
 --in_graph=tensorflow_inception_graph.pb \
 --out_graph=optimized_inception_graph.pb \
---inputs='Mul:0' \
---outputs='softmax:0' \
+--inputs='Mul' \
+--outputs='softmax' \
 --transforms='
- add_default_attributes
- strip_unused_nodes(type=float, shape="1,299,299,3")
- remove_nodes(op=Identity, op=CheckNumerics)
- fold_old_batch_norms
- quantize_weights
- quantize_nodes
- strip_unused_nodes
- sort_by_execution_order'
+  add_default_attributes
+  strip_unused_nodes(type=float, shape="1,299,299,3")
+  remove_nodes(op=Identity, op=CheckNumerics)
+  fold_constants(ignore_errors=true)
+  fold_batch_norms
+  fold_old_batch_norms
+  quantize_weights
+  quantize_nodes
+  strip_unused_nodes
+  sort_by_execution_order'
 ```
 
 This process converts all the operations in the graph that have eight-bit
@@ -315,7 +323,7 @@ themselves contain commas (for example shape definitions).
 The --inputs and --outputs are shared across all transforms, since it's common
 to need to know what the ingoing and outgoing nodes in the graph are. You should
 make sure you set these correctly before calling the graph transform tool, and
-if you're in doubt check with the model's author, or use the `check_graph` tool
+if you're in doubt check with the model's author, or use the [`summarize_graph`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/graph_transforms#inspecting-graphs) tool
 to examine likely inputs and outputs.
 
 All transforms can be passed the `ignore_errors` flag, with the value set to
@@ -423,12 +431,11 @@ graph:
 ```bash
 bazel build tensorflow/tools/graph_transforms:transform_graph
 bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
---logtostderr \
 --in_graph=/tmp/quantized_inception.pb \
 --out_graph=/tmp/logged_quantized_inception.pb \
 --inputs=Mul \
 --outputs=softmax \
---transforms='\
+--transforms='
 insert_logging(op=RequantizationRange, show_name=true, message="__requant_min_max:")\
 '
 ```
@@ -442,12 +449,10 @@ log:
 bazel build tensorflow/examples/label_image:label_image
 bazel-bin/tensorflow/examples/label_image/label_image \
 --image=${HOME}/Downloads/grace_hopper.jpg \
---logtostderr \
 --input_layer=Mul \
 --output_layer=softmax \
 --graph=/tmp/logged_quantized_inception.pb \
---labels=learning/brain/models/image/inception_v3/imagenet_comp_graph_label_strings.txt \
---logtostderr \
+--labels=${HOME}/Downloads/imagenet_comp_graph_label_strings.txt \
 2>/tmp/min_max_log_small.txt
 ```
 
@@ -580,7 +585,10 @@ Converts any large (more than 15 element) float Const op into an eight-bit
 equivalent, followed by a float conversion op so that the result is usable by
 subsequent nodes. This is mostly useful for [shrinking file
 sizes](#shrinking-file-size), but also helps with the more advanced
-[quantize_nodes](#quantize_nodes) transform.
+[quantize_nodes](#quantize_nodes) transform. Even though there are no
+prerequesites, it is advisable to run [fold_batch_norms](#fold_batch_norms) or
+[fold_old_batch_norms](#fold_old_batch_norms), because rounding variances down
+to zero may cause significant loss of precision.
 
 ### remove_attribute
 
@@ -665,7 +673,11 @@ Rounds all float values in large Const ops (more than 15 elements) to the given
 number of steps. The unique values are chosen per buffer by linearly allocating
 between the largest and smallest values present. This is useful when you'll be
 deploying on mobile, and you want a model that will compress effectively. See
-[shrinking file size](#shrinking-file-size) for more details.
+[shrinking file size](#shrinking-file-size) for more details. Even though there
+are no prerequesites, it is advisable to run
+[fold_batch_norms](#fold_batch_norms) or
+[fold_old_batch_norms](#fold_old_batch_norms), because rounding variances down
+to zero may cause significant loss of precision.
 
 ### sparsify_gather
 
@@ -986,7 +998,7 @@ There are a few things to know about the `ReplaceMatchingOpTypes` function:
     important nodes are listed in the `output_nodes` argument that's passed into
     each replacement function call. You can disable this checking by setting
     `allow_inconsistencies` to true in the options, but otherwise any
-    replacements that break the graph constraints will be cancelled. If you do
+    replacements that break the graph constraints will be canceled. If you do
     allow inconsistencies, it's your transform's responsibility to fix them up
     before you return your final result. Functions like `RenameNodeInputs` can
     be useful if you are doing wholesale node renaming for example.
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 8d1f19bf30b762f8b3ad0a4c4012c0b245e92ebc..6f44da7ee0fa25ec0ff1676def507db94e092b88 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -147,13 +147,14 @@ Status FoldConstants(const GraphDef& input_graph_def,
   TF_RETURN_IF_ERROR(
       ImportGraphDef(import_opts, cleaned_graph_def, &input_graph, nullptr));
   DeviceAttributes device_attributes;
+  subgraph::RewriteGraphMetadata metadata;
   TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
       &input_graph, context.input_names, context.output_names, {},
-      device_attributes));
+      device_attributes, false /* use_function_convention */, &metadata));
   bool was_mutated;
-  TF_RETURN_IF_ERROR(DoConstantFoldingWithStatus(
-      ConstantFoldingOptions(), nullptr, Env::Default(), nullptr, &input_graph,
-      &was_mutated));
+  TF_RETURN_IF_ERROR(ConstantFold(ConstantFoldingOptions(), nullptr,
+                                  Env::Default(), nullptr, &input_graph,
+                                  &was_mutated));
   GraphDef folded_graph_def;
   input_graph.ToGraphDef(&folded_graph_def);
   GraphDef send_recvs_replaced;
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index dac13f5c3214044d8829e19f9735ff7b600d2f22..902f92952a6405ad6eed3f61364f6e127bfda8cb 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -72,9 +72,9 @@ class ConstantFoldingTest : public ::testing::Test {
                         {"output_expect_remains"});
   }
 
-  void TestConstantFolding(const GraphDef graph_def,
+  void TestConstantFolding(const GraphDef& graph_def,
                            std::vector<std::pair<string, Tensor> > inputs,
-                           std::vector<string> outputs) {
+                           const std::vector<string>& outputs) {
     std::unique_ptr<tensorflow::Session> unfolded_session(
         tensorflow::NewSession(tensorflow::SessionOptions()));
     TF_ASSERT_OK(unfolded_session->Create(graph_def));
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index 5d1c76834ff880b2c1b6193c54d6900873fb3c23..78078ab6abd932f45bdc6434c23c8f01543a3a51 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -941,7 +941,7 @@ Status QuantizeNodes(const GraphDef& input_graph_def,
   // keep interoperability with float ops.
   TF_RETURN_IF_ERROR(RemoveRedundantQuantizations(deduped_graph_def, context,
                                                   output_graph_def));
-  TF_RETURN_IF_ERROR(IsGraphValid(merged_graph_def));
+  TF_RETURN_IF_ERROR(IsGraphValid(*output_graph_def));
 
   return Status::OK();
 }
diff --git a/tensorflow/tools/graph_transforms/quantize_weights.cc b/tensorflow/tools/graph_transforms/quantize_weights.cc
index e6f1498224f401330478af7d6039f2066057d25c..66d800f0da1f49a2026a71927d6910e18e87f2f5 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights.cc
@@ -70,6 +70,10 @@ Status QuantizeWeights(const GraphDef& input_graph_def,
           min = std::min(min, value);
           max = std::max(max, value);
         }
+        // Make sure the quantization range includes 0.0f. Not all quantized
+        // Ops behave properly if 0.0f is not in the range.
+        min = std::min(min, 0.0f);
+        max = std::max(0.0f, max);
         // min_value == max_value is a tricky case. It can occur for general
         // tensors, and of course for scalars. The quantized ops cannot deal
         // with this case, so we set max_value to something else.
diff --git a/tensorflow/tools/graph_transforms/quantize_weights_test.cc b/tensorflow/tools/graph_transforms/quantize_weights_test.cc
index cd5feed3580fa533e7e34f65b1a069c6a5ef5828..e1a105bdd31b002d6ef1ed73e68738202c5dd04c 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights_test.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights_test.cc
@@ -35,51 +35,46 @@ Status QuantizeWeights(const GraphDef& input_graph_def,
 
 class QuantizeWeightsTest : public ::testing::Test {
  protected:
-  void TestQuantizeWeights() {
+  void BuildGraphDef(const TensorShape& input_shape,
+                     std::initializer_list<float> input_values,
+                     const TensorShape& weight_shape,
+                     std::initializer_list<float> weight_values,
+                     GraphDef* original_graph_def) {
     auto root = tensorflow::Scope::NewRootScope();
-    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
-    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
-    test::FillValues<float>(
-        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
-                      -5.0f, -3.0f, -6.0f});
+    Tensor input_data(DT_FLOAT, input_shape);
+    test::FillValues<float>(&input_data, input_values);
     Output input_op =
-        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+        ops::Const(root.WithOpName("input_op"), Input::Initializer(input_data));
 
-    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 10}));
-    test::FillValues<float>(
-        &weights_data,
-        {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f,
-         3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f,
-         0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f,
-         0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
-    Output weights_op =
-        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+    Tensor weights_data(DT_FLOAT, weight_shape);
+    test::FillValues<float>(&weights_data, weight_values);
+    Output weights_op = ops::Const(root.WithOpName("weights_op"),
+                                   Input::Initializer(weights_data));
 
-    Output conv_op = Conv2D(root.WithOpName("output"), input_op, weights_op,
-                            {1, 1, 1, 1}, "VALID");
+    Output conv_op = ops::Conv2D(root.WithOpName("output"), input_op,
+                                 weights_op, {1, 1, 1, 1}, "VALID");
 
-    GraphDef original_graph_def;
-    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+    TF_ASSERT_OK(root.ToGraphDef(original_graph_def));
+  }
 
-    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
-    TF_ASSERT_OK(original_session->Create(original_graph_def));
-    std::vector<Tensor> original_outputs;
-    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+  void TestQuantizeWeights() {
+    GraphDef original_graph_def;
+    BuildGraphDef({1, 1, 6, 2},
+                  {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                   -5.0f, -3.0f, -6.0f},
+                  {1, 2, 2, 10},
+                  {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f,
+                   3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f,
+                   0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f,
+                   0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f},
+                  &original_graph_def);
 
     GraphDef quantized_graph_def;
     TF_ASSERT_OK(QuantizeWeights(original_graph_def, {{}, {"output"}},
                                  &quantized_graph_def));
 
-    std::unique_ptr<Session> quantized_session(NewSession(SessionOptions()));
-    TF_ASSERT_OK(quantized_session->Create(quantized_graph_def));
-    std::vector<Tensor> quantized_outputs;
-    TF_ASSERT_OK(
-        quantized_session->Run({}, {"output"}, {}, &quantized_outputs));
-
-    test::ExpectTensorNear<float>(original_outputs[0], quantized_outputs[0],
-                                  0.5);
-
+    // Verify the structure of the quantized graph.
     std::map<string, const NodeDef*> node_lookup;
     MapNamesToNodes(quantized_graph_def, &node_lookup);
     EXPECT_EQ(1, node_lookup.count("input_op"));
@@ -94,10 +89,69 @@ class QuantizeWeightsTest : public ::testing::Test {
     const NodeDef* q_weights_const = node_lookup.at(weights_const_name);
     EXPECT_EQ("Const", q_weights_const->op());
     EXPECT_EQ(DT_QUINT8, q_weights_const->attr().at("dtype").type());
+
+    // Run the the original graph.
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    // Run the the quantized graph.
+    std::unique_ptr<Session> quantized_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(quantized_session->Create(quantized_graph_def));
+    std::vector<Tensor> quantized_outputs;
+    TF_ASSERT_OK(
+        quantized_session->Run({}, {"output"}, {}, &quantized_outputs));
+
+    // Compare the results
+    test::ExpectTensorNear<float>(original_outputs[0], quantized_outputs[0],
+                                  0.5);
   }
 };
 
 TEST_F(QuantizeWeightsTest, TestQuantizeWeights) { TestQuantizeWeights(); }
 
+TEST_F(QuantizeWeightsTest, RangesAlwaysIncludeZero) {
+  GraphDef original_graph_def;
+  BuildGraphDef({1, 1, 4, 4},
+                {-1.0f, -4.0f, -2.0f, -5.0f, -1.0f, -4.0f, -2.0f, -5.0f, -1.0f,
+                 -4.0f, -2.0f, -5.0f, -1.0f, -4.0f, -2.0f, -5.0f},
+                {1, 2, 2, 10},
+                {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f,
+                 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f,
+                 0.1f, 0.2f, 0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f,
+                 0.3f, 0.4f, 1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f},
+                &original_graph_def);
+  GraphDef quantized_graph_def;
+  TF_ASSERT_OK(QuantizeWeights(original_graph_def, {{}, {"output"}},
+                               &quantized_graph_def));
+
+  std::map<string, const NodeDef*> node_lookup;
+  MapNamesToNodes(quantized_graph_def, &node_lookup);
+
+  auto expected_tensor = [](float value) {
+    Tensor tensor(DT_FLOAT, TensorShape({}));
+    test::FillValues<float>(&tensor, {value});
+    return tensor;
+  };
+  auto existing_tensor = [&node_lookup](string op) {
+    const NodeDef* node_def = node_lookup.at(op);
+    CHECK(node_def);
+    return GetNodeTensorAttr(*node_def, "value");
+  };
+
+  // The max of input_op is moved from -1.0 to 0.0.
+  test::ExpectTensorNear<float>(
+      expected_tensor(-5.0), existing_tensor("input_op_quantized_min"), 1e-5);
+  test::ExpectTensorNear<float>(
+      expected_tensor(0.0), existing_tensor("input_op_quantized_max"), 1e-5);
+
+  // The min of weights_op is moved from 0.1 to 0.0.
+  test::ExpectTensorNear<float>(
+      expected_tensor(0.0), existing_tensor("weights_op_quantized_min"), 1e-5);
+  test::ExpectTensorNear<float>(
+      expected_tensor(4.0), existing_tensor("weights_op_quantized_max"), 1e-5);
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/summarize_graph_main.cc b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
index f45dfbba0ced3546c0eae498f08b1dd25d90c80e..e49257804575be11a6e9a7ddb223cece2ced9a18 100644
--- a/tensorflow/tools/graph_transforms/summarize_graph_main.cc
+++ b/tensorflow/tools/graph_transforms/summarize_graph_main.cc
@@ -50,8 +50,8 @@ void PrintNodeInfo(const NodeDef* node) {
   std::cout << ", shape=" << shape.DebugString() << ") ";
 }
 
-void PrintBenchmarkUsage(const std::vector<const NodeDef*> placeholders,
-                         const std::vector<const NodeDef*> variables,
+void PrintBenchmarkUsage(const std::vector<const NodeDef*>& placeholders,
+                         const std::vector<const NodeDef*>& variables,
                          const std::vector<const NodeDef*> outputs,
                          const string& graph_path) {
   std::vector<const NodeDef*> all_inputs(placeholders);
@@ -94,7 +94,6 @@ void PrintBenchmarkUsage(const std::vector<const NodeDef*> placeholders,
   std::cout << "bazel run tensorflow/tools/benchmark:benchmark_model --";
   std::cout << " --graph=" << graph_path;
   std::cout << " --show_flops";
-  std::cout << " --logtostderr";
   std::cout << " --input_layer=" << input_layer_value;
   std::cout << " --input_layer_type=" << input_layer_type_value;
   std::cout << " --input_layer_shape=" << input_layer_shape_value;
@@ -102,7 +101,18 @@ void PrintBenchmarkUsage(const std::vector<const NodeDef*> placeholders,
   std::cout << std::endl;
 }
 
-Status SummarizeGraph(const GraphDef& graph, const string& graph_path) {
+Status PrintStructure(const GraphDef& graph) {
+  GraphDef sorted_graph;
+  TF_RETURN_IF_ERROR(SortByExecutionOrder(graph, &sorted_graph));
+  for (const NodeDef& node : sorted_graph.node()) {
+    std::cout << node.name() << " (" << node.op() << "): ["
+              << str_util::Join(node.input(), ", ") << "]" << std::endl;
+  }
+  return Status::OK();
+}
+
+Status SummarizeGraph(const GraphDef& graph, const string& graph_path,
+                      bool print_structure) {
   std::vector<const NodeDef*> placeholders;
   std::vector<const NodeDef*> variables;
   for (const NodeDef& node : graph.node()) {
@@ -233,13 +243,20 @@ Status SummarizeGraph(const GraphDef& graph, const string& graph_path) {
 
   PrintBenchmarkUsage(placeholders, variables, outputs, graph_path);
 
+  if (print_structure) {
+    TF_RETURN_IF_ERROR(PrintStructure(graph));
+  }
+
   return Status::OK();
 }
 
 int ParseFlagsAndSummarizeGraph(int argc, char* argv[]) {
   string in_graph = "";
+  bool print_structure = false;
   std::vector<Flag> flag_list = {
       Flag("in_graph", &in_graph, "input graph file name"),
+      Flag("print_structure", &print_structure,
+           "whether to print the network connections of the graph"),
   };
   string usage = Flags::Usage(argv[0], flag_list);
 
@@ -269,7 +286,8 @@ int ParseFlagsAndSummarizeGraph(int argc, char* argv[]) {
     return -1;
   }
 
-  Status summarize_result = SummarizeGraph(graph_def, in_graph);
+  Status summarize_result =
+      SummarizeGraph(graph_def, in_graph, print_structure);
   if (!summarize_result.ok()) {
     LOG(ERROR) << summarize_result.error_message() << "\n" << usage;
     return -1;
diff --git a/tensorflow/tools/graph_transforms/transform_graph.cc b/tensorflow/tools/graph_transforms/transform_graph.cc
index a1c4073fb99e264b4c026a55db8c6510ef072996..b8bf2dc0901425bbae3a3e9245e77e0e8dc6d2b4 100644
--- a/tensorflow/tools/graph_transforms/transform_graph.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph.cc
@@ -48,6 +48,11 @@ Status ParseTransformParameters(const string& transforms_string,
       func_parameters.clear();
       // Eat up any leading spaces.
       Scanner(remaining).AnySpace().GetResult(&remaining, &match);
+      if (remaining.empty()) {
+        // Nothing remains after consuming trailing spaces.
+        // Consumed all transform parameter string without errors.
+        return Status::OK();
+      }
       // See if we have a valid transform name.
       const bool found_transform_name =
           Scanner(remaining)
diff --git a/tensorflow/tools/graph_transforms/transform_graph_test.cc b/tensorflow/tools/graph_transforms/transform_graph_test.cc
index dd60b998543f983df411ca7235ea058cd48a370a..bc2412fcbdba90731318eea1a2239aa914b35ffc 100644
--- a/tensorflow/tools/graph_transforms/transform_graph_test.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph_test.cc
@@ -205,6 +205,19 @@ class TransformGraphTest : public ::testing::Test {
     EXPECT_EQ(0, params_list.size());
   }
 
+  void TestParseExtraSpaces() {
+    TransformParameters params_list;
+    ParseTransformParameters(" ", &params_list).IgnoreError();
+    EXPECT_EQ(0, params_list.size());
+
+    TF_EXPECT_OK(ParseTransformParameters("  foo bar \\\n", &params_list));
+    EXPECT_EQ(2, params_list.size());
+    EXPECT_EQ("foo", params_list[0].first);
+    EXPECT_TRUE(params_list[0].second.empty());
+    EXPECT_EQ("bar", params_list[1].first);
+    EXPECT_TRUE(params_list[1].second.empty());
+  }
+
   void TestShouldIgnoreErrors() {
     bool ignore_errors;
     TF_EXPECT_OK(
diff --git a/tensorflow/tools/graph_transforms/transform_utils.h b/tensorflow/tools/graph_transforms/transform_utils.h
index 54808efa9fbc8af356e5502b5686ab0d3409cd18..95cb21d64c6bcc5500d9611a833b621e7f800803 100644
--- a/tensorflow/tools/graph_transforms/transform_utils.h
+++ b/tensorflow/tools/graph_transforms/transform_utils.h
@@ -106,7 +106,7 @@ void FilterGraphDef(const GraphDef& input_graph_def,
                     std::function<bool(const NodeDef&)> selector,
                     GraphDef* output_graph_def);
 
-// Creates a copy of the input graph, with all occurences of the attributes with
+// Creates a copy of the input graph, with all occurrences of the attributes with
 // the names in the argument removed from the node defs.
 void RemoveAttributes(const GraphDef& input_graph_def,
                       const std::vector<string>& attributes,
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 3e049724f670dcaf4384a01cb7208f4498578b7b..1e36af93ea762b3f8c1bd09411e6a29a362e93e3 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -1,12 +1,17 @@
-# Packaging the TensorFlow C API and Java libraries into standalone archives
-# for use with language bindings and installations without Python.
-#
-# TODO(ashankar): Something similar for the C++ API (caveat: ABI compatibility)
+# Packaging for TensorFlow artifacts other than the Python API (pip whl).
+# This includes the C API, Java API, and protocol buffer files.
 
 package(default_visibility = ["//visibility:private"])
 
 load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
 
+genrule(
+    name = "libtensorflow_proto",
+    srcs = ["//tensorflow/core:protos_all_proto_srcs"],
+    outs = ["libtensorflow_proto.zip"],
+    cmd = "zip $@ $(SRCS)",
+)
+
 pkg_tar(
     name = "libtensorflow",
     extension = "tar.gz",
@@ -79,11 +84,13 @@ genrule(
     srcs = [
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
+        "//third_party/fft2d:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
+        "@fft2d//:fft/readme.txt",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
@@ -106,11 +113,13 @@ genrule(
     srcs = [
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
+        "//third_party/fft2d:LICENSE",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
+        "@fft2d//:fft/readme.txt",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@highwayhash//:LICENSE",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 2a96e80ccb2a91ca6b6c5ff5fea85f348fc55b97..c44b966451023e1f31f00d5000c232bb595cbba8 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -68,6 +68,7 @@ py_binary(
         ":included_headers",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/session_bundle:session_bundle_pip",
+        "//tensorflow/contrib/signal:signal_py",
         "//tensorflow/contrib/slim/python/slim/data:data_pip",
         "//tensorflow/python:util_example_parser_configuration",
         "//tensorflow/python/debug:debug_pip",
@@ -91,12 +92,15 @@ filegroup(
     name = "licenses",
     data = [
         "//third_party/eigen3:LICENSE",
+        "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
         "@boringssl//:LICENSE",
         "@com_googlesource_code_re2//:LICENSE",
+        "@com_microsoft_typescript//:LICENSE.txt",
         "@curl//:COPYING",
         "@eigen_archive//:COPYING.MPL2",
         "@farmhash_archive//:COPYING",
+        "@fft2d//:fft/readme.txt",
         "@gemmlowp//:LICENSE",
         "@gif_archive//:COPYING",
         "@grpc//:LICENSE",
@@ -106,13 +110,14 @@ filegroup(
         "@libxsmm_archive//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@nanopb_git//:LICENSE.txt",
-        "@png_archive//:LICENSE",
-        "@protobuf//:LICENSE",
-        "@six_archive//:LICENSE",
         "@org_html5lib//:LICENSE",
         "@org_mozilla_bleach//:LICENSE",
+        "@org_nodejs//:LICENSE",
         "@org_pocoo_werkzeug//:LICENSE",
         "@org_pythonhosted_markdown//:LICENSE.md",
+        "@png_archive//:LICENSE",
+        "@protobuf//:LICENSE",
+        "@six_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
     ] + if_not_windows([
@@ -139,6 +144,7 @@ sh_binary(
             "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
             "//tensorflow/contrib/session_bundle:session_bundle_pip",
+            "//tensorflow/contrib/signal:signal_py",
             "//tensorflow/contrib/slim:slim",
             "//tensorflow/contrib/slim/python/slim/data:data_pip",
             "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
@@ -146,6 +152,7 @@ sh_binary(
             "//tensorflow/contrib/tensor_forest:init_py",
             "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
             "//tensorflow/examples/tutorials/mnist:package",
+            "//tensorflow/python:distributed_framework_test_lib",
             "//tensorflow/python:util_example_parser_configuration",
             "//tensorflow/python/debug:debug_pip",
             "//tensorflow/python/saved_model:saved_model",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 10ab871b021aea179b04b3ec2e914d196691a86b..459d6ee3284f6f984b83601ceda44a68cbaaba8b 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -28,10 +28,13 @@ import subprocess
 PIP_PACKAGE_QUERY = """bazel query \
   'deps(//tensorflow/tools/pip_package:build_pip_package)'"""
 
-PY_TEST_QUERY = """bazel query 'filter("^((?!(benchmark|manual|no_pip)).)*$", \
-  deps(kind(py_test,\
-  //tensorflow/python/... + \
-  //tensorflow/tensorboard/...), 1))'"""
+PY_TEST_QUERY = """bazel query 'deps(\
+  filter("^((?!benchmark).)*$",\
+  kind(py_test,\
+  //tensorflow/python/... \
+  + //tensorflow/tensorboard/... \
+  + //tensorflow/contrib/... \
+  - attr(tags, "manual|no_pip", //tensorflow/...))), 1)'"""
 
 # Hard-coded blacklist of files if not included in pip package
 # TODO(amitpatankar): Clean up blacklist.
@@ -40,10 +43,21 @@ BLACKLIST = [
     "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     "//tensorflow:no_tensorflow_py_deps",
     "//tensorflow/python:test_ops_2",
+    "//tensorflow/python:tf_optimizer",
     "//tensorflow/python:compare_test_proto_py",
     "//tensorflow/core:image_testdata",
     "//tensorflow/core/kernels/cloud:bigquery_reader_ops",
-    "//tensorflow/python:framework/test_file_system.so"
+    "//tensorflow/python/feature_column:vocabulary_testdata",
+    "//tensorflow/python:framework/test_file_system.so",
+    # contrib
+    "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
+    "//tensorflow/contrib/keras:testing_utils",
+    "//tensorflow/contrib/ffmpeg:test_data",
+    "//tensorflow/contrib/factorization/examples:mnist",
+    "//tensorflow/contrib/factorization/examples:mnist.py",
+    "//tensorflow/contrib/factorization:factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",  # pylint:disable=line-too-long
+    "//tensorflow/contrib/bayesflow:reinforce_simple_example",
+    "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py",  # pylint:disable=line-too-long
 ]
 
 
@@ -110,7 +124,10 @@ def main():
       affected_tests_list = affected_tests.split("\n")[:-2]
       print("\n".join(affected_tests_list))
 
-    raise RuntimeError("One or more dependencies are not in the pip package.")
+    raise RuntimeError("""One or more dependencies are not in the pip package.
+Please either blacklist the dependencies in
+tensorflow/tensorflow/tensorflow/tools/pip_package/pip_smoke_test.py
+or add them to tensorflow/tensorflow/tensorflow/tools/pip_package/BUILD.""")
 
   else:
     print("TEST PASSED")
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 6f1c539608bcfa50ab6a9ccbae464e39cbd73627..ae6516db891191f4a92334d2163b4737c12c3a68 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,14 +29,14 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.1.0-rc0'
+_VERSION = '1.1.0'
 
 REQUIRED_PACKAGES = [
     'numpy >= 1.11.0',
     'six >= 1.10.0',
     'protobuf >= 3.2.0',
     'werkzeug >= 0.11.10',
-    'html5lib == 1.0b8',
+    'html5lib == 0.9999999',  # identical to 1.0b8
     'markdown == 2.2.0',
     'bleach == 1.5.0',
 ]
@@ -59,6 +59,7 @@ else:
 # pylint: disable=line-too-long
 CONSOLE_SCRIPTS = [
     'tensorboard = tensorflow.tensorboard.tensorboard:main',
+    'saved_model_cli = tensorflow.python.tools.saved_model_cli:main',
 ]
 # pylint: enable=line-too-long
 
@@ -187,13 +188,11 @@ setup(
     # Add in any packaged data.
     include_package_data=True,
     package_data={
-        'tensorflow': [EXTENSION_NAME,
-                       'tensorboard/dist/bazel-html-imports.html',
-                       'tensorboard/dist/index.html',
-                       'tensorboard/dist/tf-tensorboard.html',
-                       'tensorboard/lib/css/global.css',
-                       'tensorboard/TAG',
-                     ] + matches,
+        'tensorflow': [
+            EXTENSION_NAME,
+            'tensorboard/components/index.html',
+            'tensorboard/TAG',
+        ] + matches,
     },
     zip_safe=False,
     distclass=BinaryDistribution,
@@ -212,7 +211,6 @@ setup(
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Software Development :: Libraries :: Python Modules',
         'Topic :: Software Development :: Libraries',
-        ],
+    ],
     license='Apache 2.0',
-    keywords='tensorflow tensor machine learning',
-    )
+    keywords='tensorflow tensor machine learning',)
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index 17ab542a59864a934ed3fc18bebf8b0c52f5566c..ecb29a65a08b098cd167e5cbb2bdb5821e01a543 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -130,6 +130,7 @@ int MainImpl(int argc, char** argv) {
 
       const string path = output_root + "/" + proto_path_no_suffix + suffix;
       FILE* f = fopen(path.c_str(), "w");
+      if (f == nullptr) return -1;
       if (fwrite(data.c_str(), 1, data.size(), f) != data.size()) {
         return -1;
       }
diff --git a/tensorflow/tools/quantization/quantize_graph.py b/tensorflow/tools/quantization/quantize_graph.py
index d09349a79b985750bdf513f7823a1f4b531e8b5e..a0cfc352d4f65a32dde13893dc937a72d7434e28 100644
--- a/tensorflow/tools/quantization/quantize_graph.py
+++ b/tensorflow/tools/quantization/quantize_graph.py
@@ -453,7 +453,8 @@ class GraphRewriter(object):
 
   def round_nodes_recursively(self, current_node):
     """The entry point for simple rounding quantization."""
-    if self.already_visited[current_node.name]:
+    if (current_node.name in self.already_visited
+       ) and self.already_visited[current_node.name]:
       return
     self.already_visited[current_node.name] = True
     for input_node_name in current_node.input:
diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py
index 3b458dc6aa31da95f22e13fa8273ed601f6e2f36..df71840b64db3a1a451ec74b12d039a412976666 100644
--- a/tensorflow/tools/quantization/quantize_graph_test.py
+++ b/tensorflow/tools/quantization/quantize_graph_test.py
@@ -688,7 +688,7 @@ class QuantizeGraphTest(test.TestCase):
 
   def test_quantized_input_range_bias_add(self):
     input_shape = [1, 1, 2, 6]
-    input_n = quantize_graph.create_node("PlaceholderV2", "input", [])
+    input_n = quantize_graph.create_node("Placeholder", "input", [])
     quantize_graph.set_attr_dtype(input_n, "dtype", dtypes.float32)
     quantize_graph.set_attr_shape(input_n, "shape", input_shape)
     offset_n = quantize_graph.create_constant_node(
@@ -713,7 +713,7 @@ class QuantizeGraphTest(test.TestCase):
     shapes = [[3, 2], [2, 4]]
     inputs = []
     for i, shape in enumerate(shapes):
-      node = quantize_graph.create_node("PlaceholderV2", "input_%s" % i, [])
+      node = quantize_graph.create_node("Placeholder", "input_%s" % i, [])
       quantize_graph.set_attr_dtype(node, "dtype", dtypes.float32)
       quantize_graph.set_attr_shape(node, "shape", shape)
       inputs.append(node)
diff --git a/tensorflow/tools/test/run_and_gather_logs_lib.py b/tensorflow/tools/test/run_and_gather_logs_lib.py
index ebb527bc09526b12b813332fc45bf28a0207fac6..e803d5cdacb5efcaf35577ec4b09e0340b315906 100644
--- a/tensorflow/tools/test/run_and_gather_logs_lib.py
+++ b/tensorflow/tools/test/run_and_gather_logs_lib.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import re
 import shlex
 import subprocess
 import tempfile
@@ -26,6 +27,7 @@ import time
 
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.platform import gfile
+from tensorflow.tools.test import gpu_info_lib
 from tensorflow.tools.test import system_info_lib
 
 
@@ -93,7 +95,8 @@ def process_benchmarks(log_files):
   return benchmarks
 
 
-def run_and_gather_logs(name, test_name, test_args, benchmark_type):
+def run_and_gather_logs(name, test_name, test_args,
+                        benchmark_type):
   """Run the bazel test given by test_name.  Gather and return the logs.
 
   Args:
@@ -148,8 +151,17 @@ def run_and_gather_logs(name, test_name, test_args, benchmark_type):
     if not log_files:
       raise MissingLogsError("No log files found at %s." % test_file_prefix)
 
+    test_adjusted_name = name
+    gpu_config = gpu_info_lib.gather_gpu_devices()
+    if gpu_config:
+      gpu_name = gpu_config[0].model
+      gpu_short_name_match = re.search(r"Tesla [KP][4,8]0", gpu_name)
+      if gpu_short_name_match:
+        gpu_short_name = gpu_short_name_match.group(0)
+        test_adjusted_name = name + "|" + gpu_short_name.replace(" ", "_")
+
     return (process_test_logs(
-        name,
+        test_adjusted_name,
         test_name=test_name,
         test_args=test_args,
         benchmark_type=benchmark_type,
diff --git a/tensorflow/tools/tfprof/README.md b/tensorflow/tools/tfprof/README.md
index c83cdd45b6c248d7e9f4d0d9552d0b2c4f4a1195..69f09411a9c419f167d1e76cd0491396ec8aedb5 100644
--- a/tensorflow/tools/tfprof/README.md
+++ b/tensorflow/tools/tfprof/README.md
@@ -1,6 +1,6 @@
 # tfprof: A Profiling Tool for TensorFlow Models
 
-Author: Xin Pan (xpan@google.com, github: panyx0718)
+Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang
 
 Consultants: Jon Shlens, Pete Warden
 
@@ -8,14 +8,26 @@ Consultants: Jon Shlens, Pete Warden
 ###Major Features
 
 1.  Measure model parameters, float operations, tensor shapes.
-2.  Measure op execution times, requested memory size and device placement.
+2.  Profile op execution times, requested memory size and device placement.
 3.  Inspect checkpoint tensors' shapes and their values.
-4.  Explore model based on name scope or graph structure.
-5.  Selectively grouping/filtering/accounting/ordering ops.
+4.  Selectively group, filter, account and order ops.
+
+####tfprof supports 3 views to organize TensorFlow model profiles
+
+    *  code view: Stats are associated your Python codes and organized as call stacks.
+    *  scope view: Stats are organized as name scope hierarchies.
+    *  graph view: Stats are organized as Tensorflow Op graph.
+
+####For each view, there are 3 ways to display outputs:
+
+    *  stdout: Results are written to stdout.
+    *  timeline: Visualized in chrome browser as time series.
+    *  file: Results are dumped to file.
+
 
 [Python API Tutorials](#python-api-tutorials): It can be called directly from
 Python codes. Results are either printed
-to stdout or dumped to file. tensorflow.tfprof.TFProfNode proto is returned from
+to stdout or dumped to file. tensorflow.tfprof.TFGraphNodeProto proto is returned from
 the API to allow users to perform further analysis.
 
 [CLI Tutorials](#cli-tutorials):
@@ -30,16 +42,26 @@ statistics.
 
 tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
 
-### Examine the shapes and sizes of all trainiable Variables.
+### Examine the shapes and sizes of all trainable Variables.
 ```python
 # Print trainable variable parameter statistics to stdout.
+# By default, statistics are associated with each graph node.
 param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
     tf.get_default_graph(),
     tfprof_options=tf.contrib.tfprof.model_analyzer.
         TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
 
-# param_stats is tensorflow.tfprof.TFProfNode proto. It organize the statistics
-# of each graph node in tree scructure. Let's print the root below.
+
+# Set tfprof_cmd='code' to associate statistics with Python codes.
+opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS
+opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
+param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
+    tf.get_default_graph(),
+    tfprof_cmd='code'
+    tfprof_options=opts)
+
+# param_stats is tensorflow.tfprof.TFGraphNodeProto proto.
+# Let's print the root below.
 sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 ```
 
@@ -68,13 +90,11 @@ compute the memory and timing statistics.
 #
 # Note: When run on GPU, a kernel is first scheduled (enqueued) and then
 #       executed asynchronously. tfprof only tracks the execution time.
-#       Which is from proto CostGraphDef::Node::compute_cost.
 #       In addition, a substantial of time might be spent between Python and
 #       TensorFlow runtime, which is also not tracked by tfprof.
 #
-config = tf.ConfigProto(graph_options=tf.GraphOptions(build_cost_model=1))
 run_metadata = tf.RunMetadata()
-with tf.Session(config=config) as sess:
+with tf.Session() as sess:
   _ = sess.run(train_op,
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)
@@ -84,8 +104,20 @@ Finally, you may run `print_model_analysis` to explore the timing and memory
 demands of the model.
 
 ``` python
+# See model_analyzer_test.py for more examples.
+#
 # Print to stdout an analysis of the memory usage and the timing information
-# from running the graph broken down by operations.
+# broken down by python codes.
+opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
+opts['show_name_regexes'] = ['.*my_code.py.*']
+tf.contrib.tfprof.model_analyzer.print_model_analysis(
+    tf.get_default_graph(),
+    run_meta=run_metadata,
+    tfprof_cmd='code',
+    tfprof_options=opts)
+
+# Print to stdout an analysis of the memory usage and the timing information
+# broken down by operations.
 tf.contrib.tfprof.model_analyzer.print_model_analysis(
     tf.get_default_graph(),
     run_meta=run_metadata,
@@ -94,6 +126,18 @@ tf.contrib.tfprof.model_analyzer.print_model_analysis(
 
 Users can change ```tfprof_options``` to fully leverage tfprof's power.
 
+```
+For example set opts['output'] = 'timeline:outfile=<filename>' to
+generate a timeline json file. Open a Chrome Browser, open URL
+chrome://tracing, and load the json file. Below are 2 examples of graph
+view and scope view. See code view example in later examples.
+```
+
+<left>
+![CodeTimeline](g3doc/graph_timeline.png)
+![CodeTimeline](g3doc/scope_timeline.png)
+</left>
+
 
 ## CLI Tutorials
 
@@ -138,9 +182,9 @@ bazel-bin/tensorflow/tools/tfprof/tfprof \
     --run_meta_path=run_meta \
     --checkpoint_path=model.ckpt
 #
-# tfprof_log is used to define customized op types and float ops.
+# tfprof_log is used to define customized op types, float ops and code traces.
 # Use tfprof_logger.write_op_log() to create tfprof_log.
-# See 11) in Examples section on generating tfprof_log file.
+# See 12) in Examples section on generating tfprof_log file.
 bazel-bin/tensorflow/tools/tfprof/tfprof \
     --graph_path=graph.pbtxt \
     --run_meta_path=run_meta \
@@ -170,11 +214,44 @@ tfprof>
 # supported select fileds. Availability depends on --[run_meta|checkpoint|op_log]_path.
 # [bytes|micros|params|float_ops|num_hidden_ops|tensor_value|device|op_types]
 -select                     params
--viz                        false
--dump_to_file
+# format: output_type:key=value,key=value...
+# output_types: stdout (default), timeline, file.
+# key=value pairs:
+#   1. timeline: outfile=<filename>
+#   2. file: outfile=<filename>
+#   3. stdout: None.
+# E.g. timeline:outfile=/tmp/timeline.json
+-output
 ```
 
-3) I want to see the `BatchNorm`'s gamma value in checkpoint.
+3) I want to see which line of my python codes costs most time!
+
+```shell
+# Requires --graph_path --op_log_path
+tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
+_TFProfRoot (0us/22.44ms)
+  model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
+    model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
+      model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
+        model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
+          model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
+          model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
+          model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
+            model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
+            model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
+            ...
+          model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
+          model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
+        model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
+```
+
+Set ```-output timeline:outfile=<filename>``` to generate timeline instead of stdout.
+<left>
+![CodeTimeline](g3doc/code_timeline.png)
+</left>
+
+
+4) I want to see the `BatchNorm`'s gamma value in checkpoint.
 
 ```shell
 # Requires --graph_path, --checkpoint_path.
@@ -186,7 +263,7 @@ _TFProfRoot ()
 [1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ],
 ```
 
-4) I want to see my checkpoint tensors shape and number of parameters.
+5) I want to see my checkpoint tensors shape and number of parameters.
 
 ```shell
 # Requires --graph_path, --checkpoint_path.
@@ -205,7 +282,7 @@ _TFProfRoot (--/930.58k params)
   unit_last/final_bn/moving_variance (64, 64/64 params)
 ```
 
-5) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
+6) I defined an op named ‘cost’ to calculate the loss. I want to know what ops
 it depends on take a long time to run. Hint: Use the ‘graph’ command to explore
 graph dependencies.
 
@@ -221,7 +298,7 @@ _TFProfRoot (0us/3.61sec)
   unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec)
 ```
 
-6) I want to know the expensive operations during the back propagation.
+7) I want to know the expensive operations during the back propagation.
 Hint: tensorflow prepend ‘gradient’ to your defined name scopes. Use the ‘scope’
 command to explore based on name scope hierarchies.
 
@@ -238,7 +315,7 @@ _TFProfRoot (0us/2.29sec)
   ...
 ```
 
-7) Show the number of float operations in the model.
+8) Show the number of float operations in the model.
 Note: float operations calculation depends on
 1) op.RegisterStatistics. If an op doesn’t
 have RegisterStatistics defined, its float operations cannot be counted.
@@ -263,7 +340,7 @@ _TFProfRoot (0/17.63b flops)
   ...
 ```
 
-8) Show the number of parameters of all `tf.trainable_variables()` in the model.
+9) Show the number of parameters of all `tf.trainable_variables()` in the model.
 
 ```shell
 # Requires --graph_path --op_log_path.
@@ -283,7 +360,7 @@ generated by write_op_log() Python API. write_op_log() help users create some
 common op types implicitly. Users can define their own op types and log it
 through the write_op_log() API.
 
-9) What if I’m lazy and don’t want to define op type? I have given my ops
+109) What if I’m lazy and don’t want to define op type? I have given my ops
 well-defined names in my model’s code. And want to use names to select a group
 of ops. Let’s try it!
 
@@ -301,7 +378,7 @@ in terminal. Otherwise, tfprof accounts all ops matched by
 `-account_type_regexes` recursively even if they are hidden due to some
 options such as -max_depth.
 
-10) TensorFlow has built-in op types. For example, built-in op type `Variable`
+11) TensorFlow has built-in op types. For example, built-in op type `Variable`
 seems to include `Variable's` created by your model. However, be careful when
 depending on it because TensorFlow creates extra `Variable` ops implicitly and
 the implicitly created ops can have the same prefix as the `Variable's` you
@@ -327,7 +404,7 @@ _TFProfRoot (--/930.58k params)
 ```
 
 
-11) A example of defining extra op type for ops using `OpLog`
+12) A example of defining extra op type for ops using `OpLog`
 
 First, in Python code, create an `OpLog` proto and add op type
 information to it:
@@ -375,10 +452,10 @@ the tool adds all `Variables` inside `tf.trainable_variables()` to
 12) Run tfprof in one-shot mode and dump result to file.
 
 ```shell
-# Printed to stdout if --dump_to_file is not set.
+# By default output to stdout. Use -output option to change output types.
 tfprof scope --graph_path=graph.pbtxt  \
              --max_depth=3 \
-             --dump_to_file="/tmp/dump"
+             --output="file:outfile=/tmp/dump"
 Reading Files...
 Parsing GraphDef...
 Preparing Views...
@@ -439,7 +516,7 @@ with gfile.Open(os.path.join(output_dir, "run_meta"), "w") as f:
 <b>--op_log_path:</b>
 tensorflow::tfprof::OpLog. A proto used to provide extra op information
 for ops. By giving a group of ops a type name, users can easily aggregate the
-statistics for those ops without accidently missing or including extra ops.
+statistics for those ops without accidentally missing or including extra ops.
 tfprof exposes the following Python API to add op information and logging.
 
 ```python
@@ -490,4 +567,9 @@ as long as they match the `-account_xxx` options.
 
 `-select`: Comma-separated list of metrics to show: [bytes|micros|params|float_ops|num_hidden_ops|tensor_value|device|op_types].
 
-`-dump_to_file`: Dump the output to a file, instead of terminal.
+`-output`: Output results as stdout, file or timeline.
+The format is ```output_type:key=value,key=value```.
+For example: ```timeline:outfile=<filename>```.
+timeline: key=outfile, value=<filename>.
+stdout: none.
+file: key=outfile, value=<filename>.
diff --git a/tensorflow/tools/tfprof/g3doc/code_timeline.png b/tensorflow/tools/tfprof/g3doc/code_timeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5ab246f7da14c0384a5704aa8053a97540a9dab
Binary files /dev/null and b/tensorflow/tools/tfprof/g3doc/code_timeline.png differ
diff --git a/tensorflow/tools/tfprof/g3doc/graph_timeline.png b/tensorflow/tools/tfprof/g3doc/graph_timeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..255a91fd5ff6005086f7d4a1dfdd43cc8d115ee1
Binary files /dev/null and b/tensorflow/tools/tfprof/g3doc/graph_timeline.png differ
diff --git a/tensorflow/tools/tfprof/g3doc/scope_timeline.png b/tensorflow/tools/tfprof/g3doc/scope_timeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..c6d95af84aac473e68b30fc6fbefa99d4431948f
Binary files /dev/null and b/tensorflow/tools/tfprof/g3doc/scope_timeline.png differ
diff --git a/tensorflow/tools/tfprof/internal/BUILD b/tensorflow/tools/tfprof/internal/BUILD
index c5482a977694d4bc1116590ab3c73c4d2562c085..e90f0ec40a07548b4879205972cc679c60c372a9 100644
--- a/tensorflow/tools/tfprof/internal/BUILD
+++ b/tensorflow/tools/tfprof/internal/BUILD
@@ -15,11 +15,13 @@ cc_library(
     srcs = ["tfprof_stats.cc"],
     hdrs = ["tfprof_stats.h"],
     deps = [
+        ":tfprof_code",
         ":tfprof_graph",
         ":tfprof_node",
         ":tfprof_options",
         ":tfprof_scope",
         ":tfprof_show",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -29,6 +31,20 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tfprof_timeline",
+    srcs = ["tfprof_timeline.cc"],
+    hdrs = ["tfprof_timeline.h"],
+    deps = [
+        ":tfprof_node_show",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
+
 cc_library(
     name = "tfprof_node",
     srcs = ["tfprof_node.cc"],
@@ -61,6 +77,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tfprof_code",
+    srcs = ["tfprof_code.cc"],
+    hdrs = ["tfprof_code.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_show_code",
+        ":tfprof_timeline",
+        ":tfprof_utils",
+        "//tensorflow/c:c_api",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tfprof_graph",
     srcs = ["tfprof_graph.cc"],
@@ -71,6 +108,7 @@ cc_library(
         ":tfprof_options",
         ":tfprof_show",
         ":tfprof_tensor",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -80,6 +118,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tfprof_node_show",
+    srcs = ["tfprof_node_show.cc"],
+    hdrs = ["tfprof_node_show.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_options",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tfprof_show",
     srcs = ["tfprof_show.cc"],
@@ -87,8 +140,32 @@ cc_library(
     deps = [
         ":tfprof_constants",
         ":tfprof_node",
+        ":tfprof_node_show",
         ":tfprof_options",
         ":tfprof_tensor",
+        ":tfprof_timeline",
+        ":tfprof_utils",
+        "//tensorflow/c:checkpoint_reader",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:regexp_internal",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "tfprof_show_code",
+    srcs = ["tfprof_show_code.cc"],
+    hdrs = ["tfprof_show_code.h"],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_node",
+        ":tfprof_node_show",
+        ":tfprof_options",
+        ":tfprof_scope",
+        ":tfprof_show",
+        ":tfprof_tensor",
+        ":tfprof_timeline",
         ":tfprof_utils",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/core:lib",
@@ -124,6 +201,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "tfprof_timeline_test",
+    srcs = ["tfprof_timeline_test.cc"],
+    data = [
+        "testdata/graph.pbtxt",
+        "testdata/run_meta",
+    ],
+    deps = [
+        ":tfprof_constants",
+        ":tfprof_options",
+        ":tfprof_stats",
+        ":tfprof_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/tools/tfprof:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tfprof_utils",
     srcs = ["tfprof_utils.cc"],
diff --git a/tensorflow/tools/tfprof/internal/print_model_analysis.cc b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
index dfe4019fbb403aaf8c66e56475cffe4ff3ab2b98..f73675e8a7322a3ef3971a1785bfea980d15a54a 100644
--- a/tensorflow/tools/tfprof/internal/print_model_analysis.cc
+++ b/tensorflow/tools/tfprof/internal/print_model_analysis.cc
@@ -40,13 +40,13 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
   graph_ptr->ParseFromString(*graph);
 
   std::unique_ptr<RunMetadata> run_meta_ptr;
-  if (run_meta) {
+  if (run_meta && !run_meta->empty()) {
     run_meta_ptr.reset(new RunMetadata());
     run_meta_ptr->ParseFromString(*run_meta);
   }
 
   std::unique_ptr<OpLog> op_log_ptr;
-  if (op_log) {
+  if (op_log && !op_log->empty()) {
     op_log_ptr.reset(new OpLog());
     op_log_ptr->ParseFromString(*op_log);
   }
@@ -56,18 +56,32 @@ string PrintModelAnalysis(const string* graph, const string* run_meta,
   TFStats tf_stats(std::move(graph_ptr), std::move(run_meta_ptr),
                    std::move(op_log_ptr), std::move(ckpt_reader));
 
-  Options opts = Options::FromProtoStr(*options);
+  Options opts;
+  tensorflow::Status s = Options::FromProtoStr(*options, &opts);
+  if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    return "";
+  }
 
-  if (opts.dump_to_file.empty()) {
+  if (opts.output_type == kOutput[1]) {
     printf("\n=========================Options=============================\n");
     printf("%s", opts.ToString().c_str());
     printf("\n==================Model Analysis Report======================\n");
-    TFProfNode root(tf_stats.PrintGraph(*command, opts));
+    string ret = "";
+    if (*command == kCmds[2]) {
+      ret = tf_stats.PrintCode(opts).SerializeAsString();
+    } else {
+      ret = tf_stats.PrintGraph(*command, opts).SerializeAsString();
+    }
     printf("\n======================End of Report==========================\n");
     fflush(stdout);
-    return root.SerializeAsString();
+    return ret;
+  }
+  if (*command == kCmds[2]) {
+    return tf_stats.PrintCode(opts).SerializeAsString();
+  } else {
+    return tf_stats.PrintGraph(*command, opts).SerializeAsString();
   }
-  return tf_stats.PrintGraph(*command, opts).SerializeAsString();
 }
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_code.cc b/tensorflow/tools/tfprof/internal/tfprof_code.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9739db1e0b5ef4c7f13cc8fb15b635c9f81b3a70
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_code.cc
@@ -0,0 +1,224 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_code.h"
+
+#include <stdio.h>
+#include <utility>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/regexp.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+
+namespace tensorflow {
+namespace tfprof {
+namespace {
+// Convert to Trace proto into a short readable string.
+string GetTraceString(const CodeDef::Trace& trace) {
+  string ntrace = "";
+  if (trace.file().find_last_of('/') != trace.file().npos) {
+    ntrace += trace.file().substr(trace.file().find_last_of('/') + 1);
+  } else {
+    ntrace += trace.file();
+  }
+  ntrace += strings::StrCat(":", trace.lineno());
+  if (trace.function().length() < 20) {
+    ntrace += ":" + trace.function();
+  } else {
+    ntrace += ":" + trace.function().substr(0, 17) + "...";
+  }
+  if (trace.line().length() < 20) {
+    ntrace += ":" + trace.line();
+  } else {
+    ntrace += ":" + trace.line().substr(0, 17) + "...";
+  }
+  return ntrace;
+}
+}  // namespace
+
+void TFCode::AddNode(TFGraphNode* node) {
+  if (!node->code()) {
+    return;
+  }
+  TFCodeNode* pre_trace_node = nullptr;
+  for (int i = 0; i < node->code()->traces_size(); ++i) {
+    // Unlike op name, which is globally unique, trace name is only unique
+    // w.r.t. it's parent.
+    const string& trace = GetTraceString(node->code()->traces(i));
+    if (i == 0) {
+      if (!trace_root_) {
+        trace_root_.reset(new TFCodeNode(trace));
+      }
+      CHECK(trace_root_->name() == trace) << "Different trace root";
+      pre_trace_node = trace_root_.get();
+      continue;
+    }
+    pre_trace_node->AddChildren(trace);
+    TFCodeNode* trace_node = pre_trace_node->children()[trace].get();
+
+    if (i == node->code()->traces_size() - 1) {
+      trace_node->AddGraphNode(node);
+    }
+    pre_trace_node = trace_node;
+  }
+}
+
+void TFCode::Build() {
+  if (!trace_root_) {
+    return;
+  }
+  code_root_ = BuildCodeNodes(trace_root_.get());
+}
+
+CodeNode* TFCode::BuildCodeNodes(TFCodeNode* root) {
+  auto code_root = std::unique_ptr<CodeNode>(new CodeNode(root));
+  CodeNode* code_root_ptr = code_root.get();
+  code_nodes_.insert(std::move(code_root));
+
+  for (auto it = root->children().cbegin(); it != root->children().cend();
+       ++it) {
+    code_root_ptr->children.push_back(BuildCodeNodes(it->second.get()));
+  }
+  return code_root_ptr;
+}
+
+const ShowCodeNode* TFCode::ShowInternal(const Options& opts,
+                                         Timeline* timeline) {
+  // Search from roots recursively to find start node, if start_name_regexes
+  // is specified.
+  tfprof_trace_root_.reset(new TFCodeNode(kTFProfRoot));
+  tfprof_code_root_.reset(new CodeNode(tfprof_trace_root_.get()));
+  if (!code_root_) {
+    return tfprof_code_root_.get();
+  }
+
+  std::vector<CodeNode*> roots = {code_root_};
+  if (opts.start_name_regexes.size() != 1 ||
+      opts.start_name_regexes[0] != ".*") {
+    roots = SearchRoot(roots, opts.start_name_regexes);
+  }
+
+  tfprof_code_root_->children.assign(roots.begin(), roots.end());
+  Account({tfprof_code_root_.get()}, opts);
+
+  CodeNode* root = PrintScope({tfprof_code_root_.get()}, opts, 1, 0)[0];
+  if (timeline) {
+    timeline->GenerateCodeTimeline(root);
+  }
+  return root;
+}
+
+std::vector<CodeNode*> TFCode::SearchRoot(std::vector<CodeNode*> roots,
+                                          const std::vector<string>& regexes) {
+  std::vector<CodeNode*> res;
+  if (roots.empty()) {
+    return res;
+  }
+  for (CodeNode* root : roots) {
+    bool match_start_node = false;
+    for (const string& regex : regexes) {
+      if (RE2::FullMatch(root->name(), regex)) {
+        res.push_back(root);
+        match_start_node = true;
+        break;
+      }
+    }
+    if (match_start_node) {
+      // Found a start node at this branch, no need to continue.
+      continue;
+    }
+    std::vector<CodeNode*> nroots = SearchRoot(root->children, regexes);
+    res.insert(res.end(), nroots.begin(), nroots.end());
+  }
+  return res;
+}
+
+std::vector<CodeNode*> TFCode::PrintScope(const std::vector<CodeNode*> roots,
+                                          const Options& opts, int depth,
+                                          int last_ident) {
+  std::vector<CodeNode*> show_nodes;
+
+  for (CodeNode* node : roots) {
+    int nlast_ident = last_ident;
+    bool show = ShouldShow(node, opts, depth);
+    if (show) {
+      node->formatted_str.clear();
+      if (opts.account_displayed_op_only) {
+        node->ResetTotalStats();
+        node->AddSelfToTotalStats();
+      }
+      nlast_ident += 2;
+    }
+
+    std::vector<CodeNode*> show_cnodes;
+    if (!ShouldTrim(node, opts.trim_name_regexes)) {
+      show_cnodes = PrintScope(node->children, opts, depth + 1, nlast_ident);
+    }
+    if (show) {
+      show_cnodes = SortNodes(show_cnodes, opts);
+      string children_str;
+      for (CodeNode* sc : show_cnodes) {
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
+        node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        sc->mutable_proto()->mutable_children()->Clear();
+        node->show_children.push_back(sc);
+        if (opts.account_displayed_op_only) {
+          node->AggregateTotalStats(sc);
+        }
+      }
+
+      node->formatted_str =
+          strings::Printf("%s%s\n", string(last_ident, ' ').c_str(),
+                          node->Format(opts).c_str());
+
+      if (opts.select.find(kShown[5]) != opts.select.end()) {
+        fprintf(stderr, "code view has no tensor value to show\n");
+      }
+
+      node->formatted_str += children_str;
+      show_nodes.push_back(node);
+    } else {
+      show_nodes.insert(show_nodes.end(), show_cnodes.begin(),
+                        show_cnodes.end());
+    }
+  }
+  return show_nodes;
+}
+
+void TFCode::Account(const std::vector<CodeNode*>& roots, const Options& opts) {
+  if (roots.empty()) return;
+
+  for (CodeNode* node : roots) {
+    node->ResetTotalStats();
+    Account(node->children, opts);
+
+    node->account = ShouldAccount(node, opts);
+    if (node->account) {
+      node->AddSelfToTotalStats();
+    }
+    for (CodeNode* c : node->children) {
+      node->AggregateTotalStats(c);
+    }
+  }
+}
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_code.h b/tensorflow/tools/tfprof/internal/tfprof_code.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7a28624f1e1f7139d68f5c153f946c405b5ed9a
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_code.h
@@ -0,0 +1,74 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Build a tree structure based on the TensorFlow model's python code stacks.
+// Stats are aggregated from descendants from ancestors.
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_show_code.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class TFCode : public TFShowCode {
+ public:
+  explicit TFCode() : code_root_(nullptr), trace_root_(nullptr) {}
+  ~TFCode() override {}
+
+  void AddNode(TFGraphNode* node) override;
+
+  void Build() override;
+
+ private:
+  CodeNode* BuildCodeNodes(TFCodeNode* root);
+
+  const ShowCodeNode* ShowInternal(const Options& opts,
+                                   Timeline* timeline) override;
+
+  std::vector<CodeNode*> SearchRoot(std::vector<CodeNode*> roots,
+                                    const std::vector<string>& regexes);
+
+  std::vector<CodeNode*> PrintScope(const std::vector<CodeNode*> roots,
+                                    const Options& opts, int depth,
+                                    int last_ident);
+
+  void Account(const std::vector<CodeNode*>& roots, const Options& opts);
+
+  CodeNode* code_root_;
+  std::unique_ptr<TFCodeNode> trace_root_;
+  std::unique_ptr<TFCodeNode> tfprof_trace_root_;
+  std::unique_ptr<CodeNode> tfprof_code_root_;
+  std::set<std::unique_ptr<CodeNode>> code_nodes_;
+};
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_CODE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.cc b/tensorflow/tools/tfprof/internal/tfprof_graph.cc
index 469b258f98b50cfdddba17232b69c2e09c27c443..23084146c2c465dfb6fcd4d8b7ac51e68d472fb5 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_graph.cc
@@ -31,14 +31,14 @@ GraphNode* TFGraph::CreateParentNode(const string& name) {
   node_defs_.back()->set_name(name);
   node_defs_.back()->set_op(kTFGraphParent);
   parent_nodes_[name] =
-      std::unique_ptr<TFNode>(new TFNode(node_defs_.back().get()));
+      std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get()));
   nodes_map_[name] =
       std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get()));
   return nodes_map_[name].get();
 }
 
-void TFGraph::AddNode(TFNode* node) {
-  string name = node->node_def()->name();
+void TFGraph::AddNode(TFGraphNode* node) {
+  string name = node->name();
   nodes_map_[name] = std::unique_ptr<GraphNode>(new GraphNode(node));
 }
 
@@ -49,7 +49,7 @@ void TFGraph::Build() {
   // Filter out the root nodes (node not input of any other node).
   for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
     GraphNode* node = it->second.get();
-    const std::map<string, TFNode*>& inputs = node->node->inputs();
+    const std::map<string, TFGraphNode*>& inputs = node->node->inputs();
     for (auto inputs_it = inputs.cbegin(); inputs_it != inputs.cend();
          inputs_it++) {
       nonroots.insert(inputs_it->first);
@@ -66,7 +66,7 @@ void TFGraph::Build() {
   }
 }
 
-const ShowNode* TFGraph::ShowInternal(const Options& opts) {
+const ShowNode* TFGraph::ShowInternal(const Options& opts, Timeline* timeline) {
   // Search the nodes to start from.
   std::vector<GraphNode*> roots = roots_;
   if (opts.start_name_regexes.size() != 1 ||
@@ -81,11 +81,13 @@ const ShowNode* TFGraph::ShowInternal(const Options& opts) {
   std::map<string, int64> account_visits;
   Account({root}, opts, &account_visits);
 
-  if (opts.viz) {
-    printf("Visualizing feature disabled...\n");
-  }
   std::set<string> visits;
-  return PrintGraph({root}, opts, 1, 0, 0, &visits)[0];
+  root = PrintGraph({root}, opts, 1, 0, 0, &visits)[0];
+
+  if (timeline) {
+    timeline->GenerateGraphTimeline(root);
+  }
+  return root;
 }
 
 std::vector<GraphNode*> TFGraph::SearchRoot(
@@ -155,8 +157,14 @@ std::vector<GraphNode*> TFGraph::PrintGraph(const std::vector<GraphNode*> roots,
       show_cnodes = SortNodes(show_cnodes, opts);
       string children_str;
       for (GraphNode* sc : show_cnodes) {
-        children_str += sc->formatted_str;
-        node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
+        // This swap and reinit pattern is critical for performance.
+        node->mutable_proto()->add_children()->Swap(sc->mutable_proto());
+        sc->ReInit();
+        node->show_children.push_back(sc);
         if (opts.account_displayed_op_only) {
           node->AggregateTotalStats(sc);
         }
@@ -181,7 +189,6 @@ std::vector<GraphNode*> TFGraph::PrintGraph(const std::vector<GraphNode*> roots,
           node->formatted_str += value_str;
         }
       }
-
       node->formatted_str += children_str;
       show_nodes.push_back(node);
     } else {
diff --git a/tensorflow/tools/tfprof/internal/tfprof_graph.h b/tensorflow/tools/tfprof/internal/tfprof_graph.h
index b16f80b33db44d124591898d1983ed3fb5a48e56..4d4aa8b2b1d9445b9502b1103c5c257052a48e7f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_graph.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_graph.h
@@ -37,32 +37,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tfprof {
-class GraphNode : public ShowNode {
- public:
-  explicit GraphNode(TFNode* node) : ShowNode(node) {
-    mutable_proto()->set_inputs(node->inputs().size());
-    mutable_proto()->set_total_inputs(0);
-  }
-
-  void AggregateTotalStats(GraphNode* node) {
-    ShowNode::AggregateTotalStats(node);
-    mutable_proto()->set_total_inputs(proto().total_inputs() +
-                                      node->proto().total_inputs() + 1);
-  }
-
-  void AddSelfToTotalStats() {
-    ShowNode::AddSelfToTotalStats();
-    mutable_proto()->set_total_inputs(proto().total_inputs() +
-                                      proto().inputs());
-  }
-
-  void ResetTotalStats() {
-    ShowNode::ResetTotalStats();
-    mutable_proto()->set_total_inputs(0);
-  }
-
-  std::vector<GraphNode*> children;
-};
 
 // Organize tensorflow ops in a graph structure, pointing from output ops
 // to input ops.
@@ -72,12 +46,13 @@ class TFGraph : public TFShow {
       : TFShow(ckpt_reader) {}
   ~TFGraph() override {}
 
-  void AddNode(TFNode* node) override;
+  void AddNode(TFGraphNode* node) override;
 
   void Build() override;
 
  private:
-  const ShowNode* ShowInternal(const Options& opts) override;
+  const ShowNode* ShowInternal(const Options& opts,
+                               Timeline* timeline) override;
 
   bool ShouldShowIfExtra(ShowNode* node, const Options& opts,
                          int depth) override {
@@ -99,14 +74,14 @@ class TFGraph : public TFShow {
   std::vector<GraphNode*> GenerateGraphDot(
       GraphNode* root, GraphNode* last_shown, const Options& opts, int depth,
       int hidden, std::set<string>* declared_nodes,
-      std::set<string>* declared_edges, TFProfNode* parent);
+      std::set<string>* declared_edges, TFGraphNodeProto* parent);
 
   void Account(const std::vector<GraphNode*>& roots, const Options& opts,
                std::map<string, int64>* visits);
 
   std::vector<GraphNode*> roots_;
   std::vector<std::unique_ptr<NodeDef>> node_defs_;
-  std::map<string, std::unique_ptr<TFNode>> parent_nodes_;
+  std::map<string, std::unique_ptr<TFGraphNode>> parent_nodes_;
   std::map<string, std::unique_ptr<GraphNode>> nodes_map_;
 };
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node.cc b/tensorflow/tools/tfprof/internal/tfprof_node.cc
index 08bd91d99c66ce2c5e17024edf225fafc3f9204d..74c8fcbe4816561805cd085fc190a0d709a6a7fc 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_node.cc
@@ -20,19 +20,22 @@ limitations under the License.
 
 namespace tensorflow {
 namespace tfprof {
-void TFNode::AddStepStat(const string& device, const NodeExecStats* step_stat) {
-  if (!device.empty()) {
-    // This might override device from GraphDef.
-    device_ = device;
-  }
+// Notes about start and end time from the NodeExecStats proto.
+// For GPU, there is no difference between op_end_rel_micros and
+// all_end_rel_micros. All are kernel times.
+// For CPU, op_end_rel is the kernel time, while all_end_rel_micros includes
+// some post-processing.
+// Here, we only consider kernel time for simplicity.
+void TFGraphNode::AddStepStat(const string& device,
+                              const NodeExecStats* step_stat) {
   step_stat_ = step_stat;
+  CHECK(step_stat_);
 
-  op_start_micros_ = step_stat_->all_start_micros();
-  if (step_stat_->op_end_rel_micros() && step_stat_->op_start_rel_micros()) {
-    op_schedule_micros_ =
-        step_stat_->op_end_rel_micros() - step_stat_->op_start_rel_micros();
-  }
-  all_spent_micros_ = step_stat_->all_end_rel_micros();
+  string dev = str_util::Lowercase(device);
+
+  devices_.insert(dev);
+  op_kernel_execs_[dev].push_back(std::make_pair(
+      step_stat_->all_start_micros(), step_stat_->op_end_rel_micros()));
 
   for (const auto& output : step_stat_->output()) {
     if (output.has_tensor_description() &&
@@ -43,9 +46,5 @@ void TFNode::AddStepStat(const string& device, const NodeExecStats* step_stat) {
     }
   }
 }
-
-void TFNode::AddNodeStat(const CostGraphDef::Node* cost_node) {
-  kernel_compute_micros_ = cost_node->compute_cost();
-}
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node.h b/tensorflow/tools/tfprof/internal/tfprof_node.h
index 677c8d3c870005a95b56701ec8366d1e53cbdbb5..8e57db7ba2cc103ba0c769d122499cbbf98a4c18 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_node.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_node.h
@@ -23,26 +23,24 @@ limitations under the License.
 
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
 
-class TFNode {
+class TFGraphNode {
  public:
-  TFNode(const NodeDef* node)
+  TFGraphNode(const NodeDef* node)
       : node_(node),
+        code_(nullptr),
         step_stat_(nullptr),
-        op_start_micros_(0),
-        op_schedule_micros_(0),
-        kernel_compute_micros_(0),
-        all_spent_micros_(0),
         requested_bytes_(0),
         float_ops_(0) {
     if (!node) return;
@@ -67,56 +65,140 @@ class TFNode {
       update_shape(shape_vec);
     }
     op_types_.insert(node->op());
-    device_ = node->device();
   }
 
-  TFNode() : TFNode(nullptr) {}
+  TFGraphNode() : TFGraphNode(nullptr) {}
 
-  void AddInput(TFNode* input) { inputs_[input->node_def()->name()] = input; }
+  void AddInput(TFGraphNode* input, int64 output_idx) {
+    inputs_[input->name()] = input;
+    output_idx_[input->name()] = output_idx;
+  }
 
   void AddOpType(const string& op_type) { op_types_.insert(op_type); }
 
   void AddStepStat(const string& device, const NodeExecStats* step_stat);
 
-  // Add CostGraphDef::Node.
-  void AddNodeStat(const CostGraphDef::Node* cost_node);
-
   void AddFloatOps(int64 float_ops) { float_ops_ = float_ops; }
 
+  void AddCode(const CodeDef* code) { code_ = code; }
+
+  const string& name() const { return node_->name(); }
   const NodeDef* node_def() { return node_; }
-  const std::map<string, TFNode*>& inputs() { return inputs_; }
-  int64 op_start_micros() { return op_start_micros_; }
-  // This is time spent in Op::Compute(), which is GPU kernel schedule time.
-  // Currently not used.
-  int64 op_schedule_micros() { return op_schedule_micros_; }
+
+  const NodeExecStats* step_stats() const { return step_stat_; }
+
+  const std::map<string, TFGraphNode*>& inputs() const { return inputs_; }
+  const std::map<string, int64>& output_idx() { return output_idx_; }
+
   // This is time spent in kernel execution.
-  int64 kernel_compute_micros() { return kernel_compute_micros_; }
-  int64 all_spent_micros() { return all_spent_micros_; }
-  int64 requested_byptes() { return requested_bytes_; }
-  int64 float_ops() { return float_ops_; }
-  string device() { return device_; }
-  const std::set<string>& op_types() { return op_types_; }
+  int64 kernel_exec_micros() const {
+    if (!step_stat_) return 0;
+    int64 total = 0;
+    for (const auto& execs : op_kernel_execs_) {
+      for (const auto& exec : execs.second) {
+        total += exec.second;
+      }
+    }
+    return total;
+  }
+  const std::map<string, std::vector<std::pair<int64, int64>>>&
+  op_kernel_execs() const {
+    return op_kernel_execs_;
+  }
 
-  const std::vector<int64>& shape() { return shape_; }
+  int64 requested_bytes() const { return requested_bytes_; }
+  int64 float_ops() const { return float_ops_; }
+  const CodeDef* code() { return code_; }
+  std::set<string> devices() const { return devices_; }
+  const std::set<string>& op_types() const { return op_types_; }
+
+  const std::vector<int64>& shape() const { return shape_; }
 
  private:
   void update_shape(const std::vector<int64>& shape) { shape_ = shape; }
 
-  std::map<string, TFNode*> inputs_;
+  std::map<string, TFGraphNode*> inputs_;
+  std::map<string, int64> output_idx_;
+
   const NodeDef* node_;
+  const CodeDef* code_;
   const NodeExecStats* step_stat_;
 
   std::vector<int64> shape_;
   std::set<string> op_types_;
-  string device_;
-  int64 op_start_micros_;
-  int64 op_schedule_micros_;
-  int64 kernel_compute_micros_;
-  int64 all_spent_micros_;
+
+  // device -> vector of {op_start_micros, op_kernel_exec_micros} pairs.
+  std::map<string, std::vector<std::pair<int64, int64>>> op_kernel_execs_;
+
+  std::set<string> devices_;
   int64 requested_bytes_;
   int64 float_ops_;
 };
 
+class TFCodeNode {
+ public:
+  TFCodeNode(const string& trace)
+      : trace_(trace),
+        kernel_exec_micros_(0),
+        requested_bytes_(0),
+        float_ops_(0) {}
+
+  void AddGraphNode(const TFGraphNode* node) {
+    if (nodes_.find(node->name()) != nodes_.end()) {
+      return;
+    }
+    nodes_[node->name()] = node;
+
+    kernel_exec_micros_ += node->kernel_exec_micros();
+    requested_bytes_ += node->requested_bytes();
+    float_ops_ += node->float_ops();
+    op_types_.insert(node->op_types().begin(), node->op_types().end());
+    if (node->shape().size() > 0) {
+      shapes_.push_back(node->shape());
+    }
+    std::set<string> devices = node->devices();
+    devices_.insert(devices.begin(), devices.end());
+  }
+  const std::map<string, const TFGraphNode*>& graph_nodes() const {
+    return nodes_;
+  }
+
+  void AddChildren(const string& trace) {
+    if (children_.find(trace) != children_.end()) {
+      return;
+    }
+    children_[trace].reset(new TFCodeNode(trace));
+  }
+  std::map<string, std::unique_ptr<TFCodeNode>>& children() {
+    return children_;
+  }
+
+  const string& name() const { return trace_; }
+
+  int64 kernel_exec_micros() const { return kernel_exec_micros_; }
+
+  int64 requested_bytes() const { return requested_bytes_; }
+
+  int64 float_ops() const { return float_ops_; }
+
+  const std::set<string>& devices() const { return devices_; }
+
+  const std::set<string>& op_types() const { return op_types_; }
+
+  const std::vector<std::vector<int64>>& shapes() const { return shapes_; }
+
+ private:
+  const string trace_;
+  std::set<string> op_types_;
+  int64 kernel_exec_micros_;
+  int64 requested_bytes_;
+  int64 float_ops_;
+
+  std::set<string> devices_;
+  std::vector<std::vector<int64>> shapes_;
+  std::map<string, const TFGraphNode*> nodes_;
+  std::map<string, std::unique_ptr<TFCodeNode>> children_;
+};
 }  // namespace tfprof
 }  // namespace tensorflow
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node_show.cc b/tensorflow/tools/tfprof/internal/tfprof_node_show.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b5390676dd0d3ee06c0ff7e1dfa01079a118235
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_node_show.cc
@@ -0,0 +1,296 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace tfprof {
+ShowNode::ShowNode(const TFGraphNode* node) : node(node), account(true) {
+  ReInit();
+}
+
+void ShowNode::ReInit() {
+  mutable_proto()->set_name(name());
+  for (const string& device : node->devices()) {
+    *mutable_proto()->mutable_devices()->Add() = device;
+  }
+  mutable_proto()->set_exec_micros(node->kernel_exec_micros());
+  mutable_proto()->set_requested_bytes(node->requested_bytes());
+  mutable_proto()->set_float_ops(node->float_ops());
+
+  if (!node->shape().empty()) {
+    int64 params = 1;
+    bool complete_shape = true;
+    for (int64 d : node->shape()) {
+      // Sometimes parameters could be <0 when a dim is unknown.
+      if (d < 0) {
+        complete_shape = false;
+        break;
+      }
+      params *= d;
+    }
+    if (complete_shape) {
+      mutable_proto()->set_parameters(proto_.parameters() + params);
+    } else {
+      fprintf(stderr, "Incomplete shape.");
+    }
+  }
+}
+
+string ShowNode::Format(const Options& opts) {
+  if (opts.select.empty()) {
+    return name();
+  }
+  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
+}
+
+string ShowNode::FormatMeta(const Options& opts) {
+  std::vector<string> info;
+  if (opts.select.find(kShown[2]) != opts.select.end()) {
+    const string shape = FormatShapes(node->shape());
+    if (!shape.empty()) {
+      info.push_back(shape);
+    }
+    string params = FormatNumber(proto().total_parameters()) + " params";
+    if (account) {
+      params = FormatNumber(proto().parameters()) + "/" + params;
+    } else {
+      params = "--/" + params;
+    }
+    info.push_back(params);
+  }
+  if (opts.select.find(kShown[3]) != opts.select.end()) {
+    string fops = FormatNumber(proto().total_float_ops()) + " flops";
+    if (account) {
+      fops = FormatNumber(proto().float_ops()) + "/" + fops;
+    } else {
+      fops = "--/" + fops;
+    }
+    info.push_back(fops);
+  }
+  if (opts.select.find(kShown[0]) != opts.select.end()) {
+    string memory = FormatMemory(proto().total_requested_bytes());
+    if (account) {
+      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
+
+    } else {
+      memory = "--/" + memory;
+    }
+    info.push_back(memory);
+  }
+  if (opts.select.find(kShown[1]) != opts.select.end()) {
+    string time = FormatTime(proto().total_exec_micros());
+    if (account) {
+      time = FormatTime(proto().exec_micros()) + "/" + time;
+    } else {
+      time = "--/" + time;
+    }
+    info.push_back(time);
+  }
+  if (opts.select.find(kShown[6]) != opts.select.end()) {
+    if (proto().devices_size() > 0) {
+      info.push_back(str_util::Join(proto().devices(), "|"));
+    }
+  }
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    std::set<string> op_types = node->op_types();
+    // Device is considered a type.
+    if (proto().devices_size() > 0) {
+      op_types.insert(str_util::Join(proto().devices(), "|"));
+    }
+    info.push_back(str_util::Join(op_types, "|"));
+  }
+  return str_util::Join(info, ", ");
+}
+
+TFGraphNodeProto* ShowNode::mutable_proto() { return &proto_; }
+
+const TFGraphNodeProto& ShowNode::proto() const { return proto_; }
+
+void ShowNode::AggregateTotalStats(ShowNode* node) {
+  TFGraphNodeProto* node_pb = node->mutable_proto();
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         node_pb->total_exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             node_pb->total_requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        node_pb->total_parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       node_pb->total_float_ops());
+}
+
+void ShowNode::AddSelfToTotalStats() {
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         proto().exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             proto().requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        proto().parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       proto().float_ops());
+}
+
+void ShowNode::ResetTotalStats() {
+  mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_requested_bytes(0);
+  mutable_proto()->set_total_parameters(0);
+  mutable_proto()->set_total_float_ops(0);
+  mutable_proto()->mutable_children()->Clear();
+}
+
+ShowCodeNode::ShowCodeNode(const TFCodeNode* node) : node(node), account(true) {
+  std::vector<ScopeNode> snodes;
+  for (auto it : node->graph_nodes()) {
+    ScopeNode snode(it.second);
+    snodes.push_back(snode);
+    snodes[snodes.size() - 1].AddSelfToTotalStats();
+    *mutable_proto()->mutable_graph_nodes()->Add() =
+        snodes[snodes.size() - 1].proto();
+  }
+
+  mutable_proto()->set_name(name());
+  mutable_proto()->set_exec_micros(node->kernel_exec_micros());
+  mutable_proto()->set_requested_bytes(node->requested_bytes());
+  mutable_proto()->set_float_ops(node->float_ops());
+
+  if (!node->shapes().empty()) {
+    for (const std::vector<int64>& shape : node->shapes()) {
+      int64 params = 1;
+      bool complete_shape = true;
+      for (int64 d : shape) {
+        // Sometimes parameters could be <0 when a dim is unknown.
+        if (d < 0) {
+          complete_shape = false;
+          break;
+        }
+        params *= d;
+      }
+      if (complete_shape) {
+        mutable_proto()->set_parameters(proto().parameters() + params);
+      } else {
+        fprintf(stderr, "Incomplete shape.");
+      }
+    }
+  }
+}
+
+string ShowCodeNode::Format(const Options& opts) {
+  if (opts.select.empty()) {
+    return name();
+  }
+  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
+}
+
+string ShowCodeNode::FormatMeta(const Options& opts) {
+  std::vector<string> info;
+  std::vector<string> shapes;
+  if (opts.select.find(kShown[2]) != opts.select.end()) {
+    for (const std::vector<int64>& shape : node->shapes()) {
+      if (!shape.empty()) {
+        shapes.push_back(FormatShapes(shape));
+      }
+    }
+    if (!shapes.empty()) {
+      info.push_back(str_util::Join(shapes, "|"));
+    }
+    string params = FormatNumber(proto().total_parameters()) + " params";
+    if (account) {
+      params = FormatNumber(proto().parameters()) + "/" + params;
+    } else {
+      params = "--/" + params;
+    }
+    info.push_back(params);
+  }
+  if (opts.select.find(kShown[3]) != opts.select.end()) {
+    string fops = FormatNumber(proto().total_float_ops()) + " flops";
+    if (account) {
+      fops = FormatNumber(proto().float_ops()) + "/" + fops;
+    } else {
+      fops = "--/" + fops;
+    }
+    info.push_back(fops);
+  }
+  if (opts.select.find(kShown[0]) != opts.select.end()) {
+    string memory = FormatMemory(proto().total_requested_bytes());
+    if (account) {
+      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
+
+    } else {
+      memory = "--/" + memory;
+    }
+    info.push_back(memory);
+  }
+  if (opts.select.find(kShown[1]) != opts.select.end()) {
+    string time = FormatTime(proto().total_exec_micros());
+    if (account) {
+      time = FormatTime(proto().exec_micros()) + "/" + time;
+    } else {
+      time = "--/" + time;
+    }
+    info.push_back(time);
+  }
+  if (opts.select.find(kShown[6]) != opts.select.end()) {
+    if (!node->devices().empty()) {
+      info.push_back(str_util::Join(node->devices(), "|"));
+    }
+  }
+  if (opts.select.find(kShown[7]) != opts.select.end()) {
+    std::set<string> op_types = node->op_types();
+    // Device is considered a type.
+    op_types.insert(node->devices().cbegin(), node->devices().cend());
+    info.push_back(str_util::Join(op_types, "|"));
+  }
+  return str_util::Join(info, ", ");
+}
+
+TFCodeNodeProto* ShowCodeNode::mutable_proto() { return &proto_; }
+
+const TFCodeNodeProto& ShowCodeNode::proto() const { return proto_; }
+
+void ShowCodeNode::AggregateTotalStats(ShowCodeNode* node) {
+  TFCodeNodeProto* node_pb = node->mutable_proto();
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         node_pb->total_exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             node_pb->total_requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        node_pb->total_parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       node_pb->total_float_ops());
+}
+
+void ShowCodeNode::AddSelfToTotalStats() {
+  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
+                                         proto().exec_micros());
+  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
+                                             proto().requested_bytes());
+  mutable_proto()->set_total_parameters(proto().total_parameters() +
+                                        proto().parameters());
+  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
+                                       proto().float_ops());
+}
+
+void ShowCodeNode::ResetTotalStats() {
+  mutable_proto()->set_total_exec_micros(0);
+  mutable_proto()->set_total_requested_bytes(0);
+  mutable_proto()->set_total_parameters(0);
+  mutable_proto()->set_total_float_ops(0);
+  mutable_proto()->mutable_children()->Clear();
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_node_show.h b/tensorflow/tools/tfprof/internal/tfprof_node_show.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ce0f63f9b572d3f35dbf022a688bbf3189487cb
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_node_show.h
@@ -0,0 +1,173 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Nodes used for different views.
+// ScopeNode is for scope view. GraphNode is for graph view and CodeNode
+// is for code view.
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class ShowNode {
+ public:
+  explicit ShowNode(const TFGraphNode* node);
+  virtual ~ShowNode() {}
+
+  const string& name() const { return node->name(); }
+  TFGraphNodeProto* mutable_proto();
+  const TFGraphNodeProto& proto() const;
+
+  void ReInit();
+
+  string Format(const Options& opts);
+
+  string FormatMeta(const Options& opts);
+
+  const TFGraphNode* node;
+  bool account;
+  string formatted_str;
+
+ protected:
+  void AggregateTotalStats(ShowNode* node);
+
+  void AddSelfToTotalStats();
+
+  void ResetTotalStats();
+
+  TFGraphNodeProto proto_;
+};
+
+class GraphNode : public ShowNode {
+ public:
+  explicit GraphNode(TFGraphNode* node) : ShowNode(node) {
+    mutable_proto()->set_inputs(node->inputs().size());
+    mutable_proto()->set_total_inputs(0);
+  }
+
+  void ReInit() {
+    ShowNode::ReInit();
+    mutable_proto()->set_inputs(node->inputs().size());
+    mutable_proto()->set_total_inputs(0);
+  }
+
+  void AggregateTotalStats(GraphNode* node) {
+    ShowNode::AggregateTotalStats(node);
+    mutable_proto()->set_total_inputs(proto().total_inputs() +
+                                      node->proto().total_inputs() + 1);
+  }
+
+  void AddSelfToTotalStats() {
+    ShowNode::AddSelfToTotalStats();
+    mutable_proto()->set_total_inputs(proto().total_inputs() +
+                                      proto().inputs());
+  }
+
+  void ResetTotalStats() {
+    ShowNode::ResetTotalStats();
+    mutable_proto()->set_total_inputs(0);
+    show_children.clear();
+  }
+
+  std::vector<GraphNode*> children;
+  std::vector<GraphNode*> show_children;
+};
+
+class ScopeNode : public ShowNode {
+ public:
+  explicit ScopeNode(const TFGraphNode* node) : ShowNode(node) {}
+  ~ScopeNode() override {}
+
+  void ReInit() { ShowNode::ReInit(); }
+
+  void AggregateTotalStats(ScopeNode* node) {
+    ShowNode::AggregateTotalStats(node);
+  }
+
+  void AddSelfToTotalStats() { ShowNode::AddSelfToTotalStats(); }
+
+  void ResetTotalStats() {
+    ShowNode::ResetTotalStats();
+    show_children.clear();
+  }
+
+  std::vector<ScopeNode*> children;
+  std::vector<ScopeNode*> show_children;
+};
+
+class ShowCodeNode {
+ public:
+  explicit ShowCodeNode(const TFCodeNode* node);
+  virtual ~ShowCodeNode() {}
+
+  const string& name() const { return node->name(); }
+  TFCodeNodeProto* mutable_proto();
+  const TFCodeNodeProto& proto() const;
+
+  string Format(const Options& opts);
+
+  string FormatMeta(const Options& opts);
+
+  const TFCodeNode* node;
+  bool account;
+  string formatted_str;
+
+ protected:
+  void AggregateTotalStats(ShowCodeNode* node);
+
+  void AddSelfToTotalStats();
+
+  void ResetTotalStats();
+
+  TFCodeNodeProto proto_;
+};
+
+class CodeNode : public ShowCodeNode {
+ public:
+  explicit CodeNode(const TFCodeNode* node) : ShowCodeNode(node) {}
+  ~CodeNode() override {}
+
+  void AggregateTotalStats(CodeNode* node) {
+    ShowCodeNode::AggregateTotalStats(node);
+  }
+
+  void AddSelfToTotalStats() { ShowCodeNode::AddSelfToTotalStats(); }
+
+  void ResetTotalStats() {
+    ShowCodeNode::ResetTotalStats();
+    show_children.clear();
+  }
+
+  std::vector<CodeNode*> children;
+  std::vector<CodeNode*> show_children;
+};
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_NODE_SHOW_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.cc b/tensorflow/tools/tfprof/internal/tfprof_options.cc
index 03282533ffd4518e4c44bddfd31bbbdb18e0f9ab..f592a4cf8cf7435b63b27f46e4961741d47aa1c9 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_options.cc
@@ -17,16 +17,133 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/tools/tfprof/tfprof_options.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
+namespace {
+string KeyValueToStr(const std::map<string, string>& kv_map) {
+  std::vector<string> kv_vec;
+  kv_vec.reserve(kv_map.size());
+  for (const auto& pair : kv_map) {
+    kv_vec.push_back(strings::StrCat(pair.first, "=", pair.second));
+  }
+  return str_util::Join(kv_vec, ",");
+}
+}  // namespace
+
+tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
+                               std::map<string, string>* output_options) {
+  // The default is to use stdout.
+  if (output_opt.empty()) {
+    *output_type = kOutput[1];
+    return tensorflow::Status::OK();
+  }
+
+  std::set<string> output_types(kOutput,
+                                kOutput + sizeof(kOutput) / sizeof(*kOutput));
+  auto opt_split = output_opt.find(":");
+  std::vector<string> kv_split;
+  if (opt_split == output_opt.npos) {
+    if (output_types.find(output_opt) == output_types.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("E.g. Unknown output type: %s, Valid types: %s\n",
+                          output_opt.c_str(),
+                          str_util::Join(output_types, ",").c_str()));
+    }
+    *output_type = output_opt;
+  } else {
+    *output_type = output_opt.substr(0, opt_split);
+    if (output_types.find(*output_type) == output_types.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("E.g. Unknown output type: %s, Valid types: %s\n",
+                          output_type->c_str(),
+                          str_util::Join(output_types, ",").c_str()));
+    }
+    kv_split = str_util::Split(output_opt.substr(opt_split + 1), ",",
+                               str_util::SkipEmpty());
+  }
 
-Options Options::FromProtoStr(const string& opts_proto_str) {
+  std::set<string> valid_options;
+  std::set<string> required_options;
+  if (*output_type == kOutput[0]) {
+    valid_options.insert(
+        kTimelineOpts,
+        kTimelineOpts + sizeof(kTimelineOpts) / sizeof(*kTimelineOpts));
+    required_options.insert(
+        kTimelineRequiredOpts,
+        kTimelineRequiredOpts +
+            sizeof(kTimelineRequiredOpts) / sizeof(*kTimelineRequiredOpts));
+  } else if (*output_type == kOutput[2]) {
+    valid_options.insert(kFileOpts,
+                         kFileOpts + sizeof(kFileOpts) / sizeof(*kFileOpts));
+    required_options.insert(kFileRequiredOpts,
+                            kFileRequiredOpts + sizeof(kFileRequiredOpts) /
+                                                    sizeof(*kFileRequiredOpts));
+  }
+
+  for (const string& kv_str : kv_split) {
+    const std::vector<string> kv =
+        str_util::Split(kv_str, "=", str_util::SkipEmpty());
+    if (kv.size() != 2) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          "Visualize format: -output timeline:key=value,key=value,...");
+    }
+    if (valid_options.find(kv[0]) == valid_options.end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("Unrecognized options %s for output_type: %s\n",
+                          kv[0].c_str(), output_type->c_str()));
+    }
+    (*output_options)[kv[0]] = kv[1];
+  }
+
+  for (const string& opt : required_options) {
+    if (output_options->find(opt) == output_options->end()) {
+      return tensorflow::Status(
+          tensorflow::error::INVALID_ARGUMENT,
+          strings::Printf("Missing required output_options for %s\n"
+                          "E.g. -output %s:%s=...\n",
+                          output_type->c_str(), output_type->c_str(),
+                          opt.c_str()));
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status Options::FromProtoStr(const string& opts_proto_str,
+                                         Options* opts) {
   OptionsProto opts_pb;
-  CHECK(opts_pb.ParseFromString(opts_proto_str));
-  Options opts(
+  if (!opts_pb.ParseFromString(opts_proto_str)) {
+    return tensorflow::Status(
+        tensorflow::error::INTERNAL,
+        strings::StrCat("Failed to parse option string from Python API: ",
+                        opts_proto_str));
+  }
+
+  string output_type;
+  std::map<string, string> output_options;
+  tensorflow::Status s =
+      ParseOutput(opts_pb.output(), &output_type, &output_options);
+  if (!s.ok()) return s;
+
+  if (!opts_pb.dump_to_file().empty()) {
+    fprintf(stderr,
+            "-dump_to_file option is deprecated. "
+            "Please use -output file:outfile=<filename>\n");
+    fprintf(stderr, "-output %s is overwritten with -output file:outfile=%s\n",
+            opts_pb.output().c_str(), opts_pb.dump_to_file().c_str());
+    output_type = kOutput[2];
+    output_options.clear();
+    output_options[kFileOpts[0]] = opts_pb.dump_to_file();
+  }
+
+  *opts = Options(
       opts_pb.max_depth(), opts_pb.min_bytes(), opts_pb.min_micros(),
       opts_pb.min_params(), opts_pb.min_float_ops(),
       std::vector<string>(opts_pb.device_regexes().begin(),
@@ -44,8 +161,8 @@ Options Options::FromProtoStr(const string& opts_proto_str) {
                           opts_pb.hide_name_regexes().end()),
       opts_pb.account_displayed_op_only(),
       std::vector<string>(opts_pb.select().begin(), opts_pb.select().end()),
-      opts_pb.viz(), opts_pb.dump_to_file());
-  return opts;
+      output_type, output_options);
+  return tensorflow::Status::OK();
 }
 
 string Options::ToString() const {
@@ -64,8 +181,7 @@ string Options::ToString() const {
       "%-28s%s\n"
       "%-28s%s\n"
       "%-28s%s\n"
-      "%-28s%s\n"
-      "%-28s%s\n",
+      "%-28s%s:%s\n",
       kOptions[0], max_depth, kOptions[1], min_bytes, kOptions[2], min_micros,
       kOptions[3], min_params, kOptions[4], min_float_ops, kOptions[5],
       str_util::Join(device_regexes, ",").c_str(), kOptions[6],
@@ -76,8 +192,8 @@ string Options::ToString() const {
       str_util::Join(show_name_regexes, ",").c_str(), kOptions[11],
       str_util::Join(hide_name_regexes, ",").c_str(), kOptions[12],
       (account_displayed_op_only ? "true" : "false"), kOptions[13],
-      str_util::Join(select, ",").c_str(), kOptions[14],
-      (viz ? "true" : "false"), kOptions[15], dump_to_file.c_str());
+      str_util::Join(select, ",").c_str(), kOptions[14], output_type.c_str(),
+      KeyValueToStr(output_options).c_str());
   return s;
 }
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_options.h b/tensorflow/tools/tfprof/internal/tfprof_options.h
index a5b55e77fac0818bae927ce0e42110c0eca1c206..cf48b4de8162732c9b3e77f89d1029d4aa62ae0e 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_options.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_options.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -40,8 +41,7 @@ static const char* const kOptions[] = {
     "-hide_name_regexes",
     "-account_displayed_op_only",
     "-select",
-    "-viz",
-    "-dump_to_file",
+    "-output",
 };
 
 static const char* const kOrderBy[] = {
@@ -55,14 +55,33 @@ static const char* const kShown[] = {
 };
 
 static const char* const kCmds[] = {
-    "scope", "graph", "set", "help",
+    "scope", "graph", "code", "set", "help",
+};
+
+static const char* const kOutput[] = {"timeline", "stdout", "file"};
+
+static const char* const kTimelineOpts[] = {
+    "outfile",
+};
+
+static const char* const kTimelineRequiredOpts[] = {"outfile"};
+
+static const char* const kFileOpts[] = {
+    "outfile",
+};
+
+static const char* const kFileRequiredOpts[] = {
+    "outfile",
 };
 
 struct Options {
  public:
-  static Options FromProtoStr(const string& opts_proto_str);
+  static tensorflow::Status FromProtoStr(const string& opts_proto_str,
+                                         Options* opts);
 
   virtual ~Options() {}
+  Options()
+      : Options(0, 0, 0, 0, 0, {}, "", {}, {}, {}, {}, {}, false, {}, "", {}) {}
   Options(int max_depth, tensorflow::int64 min_bytes,
           tensorflow::int64 min_micros, tensorflow::int64 min_params,
           tensorflow::int64 min_float_ops,
@@ -73,7 +92,8 @@ struct Options {
           const std::vector<string>& show_name_regexes,
           const std::vector<string>& hide_name_regexes,
           bool account_displayed_op_only, const std::vector<string>& select,
-          bool viz, const string& dump_to_file = "")
+          const string& output_type,
+          const std::map<string, string>& output_options)
       : max_depth(max_depth),
         min_bytes(min_bytes),
         min_micros(min_micros),
@@ -88,8 +108,8 @@ struct Options {
         hide_name_regexes(hide_name_regexes),
         account_displayed_op_only(account_displayed_op_only),
         select(select.begin(), select.end()),
-        viz(viz),
-        dump_to_file(dump_to_file) {}
+        output_type(output_type),
+        output_options(output_options) {}
 
   string ToString() const;
 
@@ -109,10 +129,17 @@ struct Options {
   bool account_displayed_op_only;
 
   std::set<string> select;
-  bool viz;
-  string dump_to_file;
+
+  string output_type;
+  std::map<string, string> output_options;
 };
 
+// Parse the -output option.
+// 'output_opt': User input string with format: output_type:key=value,key=value.
+// 'output_type' and 'output_options' are extracted from 'output_opt'.
+tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
+                               std::map<string, string>* output_options);
+
 }  // namespace tfprof
 }  // namespace tensorflow
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_scope.cc b/tensorflow/tools/tfprof/internal/tfprof_scope.cc
index 949d2d54e42b8683e8bb365e0b23c49feeb686e3..fe525c4bd840d287538096fd6d8fe2347f813991 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_scope.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_scope.cc
@@ -35,15 +35,15 @@ ScopeNode* TFScope::CreateParentNode(const string& name) {
   node_defs_.back()->set_name(name);
   node_defs_.back()->set_op(kTFScopeParent);
   parent_nodes_[name] =
-      std::unique_ptr<TFNode>(new TFNode(node_defs_.back().get()));
+      std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get()));
   nodes_map_[name] =
       std::unique_ptr<ScopeNode>(new ScopeNode(parent_nodes_[name].get()));
   return nodes_map_[name].get();
 }
 
-void TFScope::AddNode(TFNode* node) {
-  string name = node->node_def()->name();
-  if (nodes_map_.find(node->node_def()->name()) == nodes_map_.end()) {
+void TFScope::AddNode(TFGraphNode* node) {
+  string name = node->name();
+  if (nodes_map_.find(node->name()) == nodes_map_.end()) {
     nodes_map_[name] = std::unique_ptr<ScopeNode>(new ScopeNode(node));
   }
 
@@ -72,7 +72,7 @@ void TFScope::Build() {
   }
 }
 
-const ShowNode* TFScope::ShowInternal(const Options& opts) {
+const ShowNode* TFScope::ShowInternal(const Options& opts, Timeline* timeline) {
   // Search from roots recursively to find start node, if start_name_regexes
   // is specified.
   std::vector<ScopeNode*> roots = roots_;
@@ -86,6 +86,9 @@ const ShowNode* TFScope::ShowInternal(const Options& opts) {
   Account({root}, opts);
 
   root = PrintScope({root}, opts, 1, 0)[0];
+  if (timeline) {
+    timeline->GenerateScopeTimeline(root);
+  }
   return root;
 }
 
@@ -139,8 +142,13 @@ std::vector<ScopeNode*> TFScope::PrintScope(const std::vector<ScopeNode*> roots,
       show_cnodes = SortNodes(show_cnodes, opts);
       string children_str;
       for (ScopeNode* sc : show_cnodes) {
-        children_str += sc->formatted_str;
+        if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
+          children_str += sc->formatted_str;
+          sc->formatted_str.clear();
+        }
         node->mutable_proto()->add_children()->MergeFrom(sc->proto());
+        sc->mutable_proto()->mutable_children()->Clear();
+        node->show_children.push_back(sc);
         if (opts.account_displayed_op_only) {
           node->AggregateTotalStats(sc);
         }
diff --git a/tensorflow/tools/tfprof/internal/tfprof_scope.h b/tensorflow/tools/tfprof/internal/tfprof_scope.h
index a7c58920a2497377d65d70104d0d5e6c71d1b793..7bdcc794cd054c88859f4cde182b286f5f26db7e 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_scope.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_scope.h
@@ -37,34 +37,19 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 
-class ScopeNode : public ShowNode {
- public:
-  explicit ScopeNode(TFNode* node) : ShowNode(node) {}
-  ~ScopeNode() override {}
-
-  void AggregateTotalStats(ScopeNode* node) {
-    ShowNode::AggregateTotalStats(node);
-  }
-
-  void AddSelfToTotalStats() { ShowNode::AddSelfToTotalStats(); }
-
-  void ResetTotalStats() { ShowNode::ResetTotalStats(); }
-
-  std::vector<ScopeNode*> children;
-};
-
 class TFScope : public TFShow {
  public:
   explicit TFScope(checkpoint::CheckpointReader* ckpt_reader)
       : TFShow(ckpt_reader) {}
   ~TFScope() override {}
 
-  void AddNode(TFNode* node) override;
+  void AddNode(TFGraphNode* node) override;
 
   void Build() override;
 
  private:
-  const ShowNode* ShowInternal(const Options& opts) override;
+  const ShowNode* ShowInternal(const Options& opts,
+                               Timeline* timeline) override;
 
   ScopeNode* CreateParentNode(const string& name);
 
@@ -79,7 +64,7 @@ class TFScope : public TFShow {
 
   std::vector<ScopeNode*> roots_;
   std::vector<std::unique_ptr<NodeDef>> node_defs_;
-  std::map<string, std::unique_ptr<TFNode>> parent_nodes_;
+  std::map<string, std::unique_ptr<TFGraphNode>> parent_nodes_;
   std::map<string, std::unique_ptr<ScopeNode>> nodes_map_;
 };
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.cc b/tensorflow/tools/tfprof/internal/tfprof_show.cc
index 08ae82fea43cfb0e94e089e8e2945c969501f17d..b96db5468e71ad6c3ea9169ee2f77b0d00db1c3f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show.cc
@@ -18,154 +18,32 @@ limitations under the License.
 #include <memory>
 #include <set>
 
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace tensorflow {
 namespace tfprof {
-ShowNode::ShowNode(TFNode* node) : node(node), account(true) {
-  mutable_proto()->set_name(name());
-  if (!node->device().empty()) {
-    mutable_proto()->set_device(node->device());
-  }
-  mutable_proto()->set_exec_micros(node->kernel_compute_micros());
-  mutable_proto()->set_requested_bytes(node->requested_byptes());
-  mutable_proto()->set_float_ops(node->float_ops());
-
-  if (!node->shape().empty()) {
-    int64 params = 1;
-    bool complete_shape = true;
-    for (int64 d : node->shape()) {
-      // Sometimes parameters could be <0 when a dim is unknown.
-      if (d < 0) {
-        complete_shape = false;
-        break;
-      }
-      params *= d;
-    }
-    if (complete_shape) {
-      mutable_proto()->set_parameters(proto_.parameters() + params);
-    } else {
-      fprintf(stderr, "Incomplete shape.");
-    }
-  }
-}
-
-string ShowNode::Format(const Options& opts) {
-  if (opts.select.empty()) {
-    return name();
-  }
-  return strings::Printf("%s (%s)", name().c_str(), FormatMeta(opts).c_str());
-}
-
-string ShowNode::FormatMeta(const Options& opts) {
-  std::vector<string> info;
-  if (opts.select.find(kShown[2]) != opts.select.end()) {
-    const string shape = FormatShapes(node->shape());
-    if (!shape.empty()) {
-      info.push_back(shape);
-    }
-    string params = FormatNumber(proto().total_parameters()) + " params";
-    if (account) {
-      params = FormatNumber(proto().parameters()) + "/" + params;
-    } else {
-      params = "--/" + params;
-    }
-    info.push_back(params);
-  }
-  if (opts.select.find(kShown[3]) != opts.select.end()) {
-    string fops = FormatNumber(proto().total_float_ops()) + " flops";
-    if (account) {
-      fops = FormatNumber(proto().float_ops()) + "/" + fops;
-    } else {
-      fops = "--/" + fops;
-    }
-    info.push_back(fops);
-  }
-  if (opts.select.find(kShown[0]) != opts.select.end()) {
-    string memory = FormatMemory(proto().total_requested_bytes());
-    if (account) {
-      memory = FormatMemory(proto().requested_bytes()) + "/" + memory;
-
-    } else {
-      memory = "--/" + memory;
-    }
-    info.push_back(memory);
-  }
-  if (opts.select.find(kShown[1]) != opts.select.end()) {
-    string time = FormatTime(proto().total_exec_micros());
-    if (account) {
-      time = FormatTime(proto().exec_micros()) + "/" + time;
-    } else {
-      time = "--/" + time;
-    }
-    info.push_back(time);
-  }
-  if (opts.select.find(kShown[6]) != opts.select.end()) {
-    if (!proto().device().empty()) {
-      info.push_back(proto().device());
-    }
-  }
-  if (opts.select.find(kShown[7]) != opts.select.end()) {
-    std::set<string> op_types = node->op_types();
-    // Device is considered a type.
-    if (!proto().device().empty()) {
-      op_types.insert(proto().device());
-    }
-    info.push_back(str_util::Join(op_types, "|"));
-  }
-  return str_util::Join(info, ", ");
-}
-
-TFProfNode* ShowNode::mutable_proto() { return &proto_; }
-
-const TFProfNode& ShowNode::proto() const { return proto_; }
-
-void ShowNode::AggregateTotalStats(ShowNode* node) {
-  TFProfNode* node_pb = node->mutable_proto();
-  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
-                                         node_pb->total_exec_micros());
-  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
-                                             node_pb->total_requested_bytes());
-  mutable_proto()->set_total_parameters(proto().total_parameters() +
-                                        node_pb->total_parameters());
-  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
-                                       node_pb->total_float_ops());
-}
-
-void ShowNode::AddSelfToTotalStats() {
-  mutable_proto()->set_total_exec_micros(proto().total_exec_micros() +
-                                         proto().exec_micros());
-  mutable_proto()->set_total_requested_bytes(proto().total_requested_bytes() +
-                                             proto().requested_bytes());
-  mutable_proto()->set_total_parameters(proto().total_parameters() +
-                                        proto().parameters());
-  mutable_proto()->set_total_float_ops(proto().total_float_ops() +
-                                       proto().float_ops());
-}
 
-void ShowNode::ResetTotalStats() {
-  mutable_proto()->set_total_exec_micros(0);
-  mutable_proto()->set_total_requested_bytes(0);
-  mutable_proto()->set_total_parameters(0);
-  mutable_proto()->set_total_float_ops(0);
-}
-
-const TFProfNode& TFShow::Show(const Options& opts) {
-  const ShowNode* root = ShowInternal(opts);
-  if (opts.dump_to_file.empty()) {
-    printf("%s", root->formatted_str.c_str());
-    fflush(stdout);
-  } else {
-    Status s = WriteStringToFile(Env::Default(), opts.dump_to_file,
-                                 root->formatted_str);
+const TFGraphNodeProto& TFShow::Show(const Options& opts) {
+  if (opts.output_type == kOutput[0]) {
+    Timeline timeline(opts.output_options.at(kTimelineOpts[0]));
+    return ShowInternal(opts, &timeline)->proto();
+  } else if (opts.output_type == kOutput[2]) {
+    const ShowNode* root = ShowInternal(opts, nullptr);
+    Status s =
+        WriteStringToFile(Env::Default(), opts.output_options.at(kFileOpts[0]),
+                          root->formatted_str);
     if (!s.ok()) {
       fprintf(stderr, "%s\n", s.ToString().c_str());
     }
+    return root->proto();
+  } else {
+    const ShowNode* root = ShowInternal(opts, nullptr);
+    printf("%s", root->formatted_str.c_str());
+    fflush(stdout);
+    return root->proto();
   }
-  return root->proto();
 }
 
 bool TFShow::LookUpCheckPoint(const string& name,
@@ -205,10 +83,13 @@ bool TFShow::ShouldShow(ShowNode* node, const Options& opts, int depth) {
     show = true;
   } else {
     for (const string& regex : opts.device_regexes) {
-      if (RE2::FullMatch(node->proto().device(), regex)) {
-        show = true;
-        break;
+      for (const string& device : node->proto().devices()) {
+        if (RE2::FullMatch(device, regex)) {
+          show = true;
+          break;
+        }
       }
+      if (show) break;
     }
   }
   // Don't show if device_regexes don't cover it.
@@ -254,11 +135,11 @@ bool TFShow::ShouldAccount(ShowNode* node, const Options& opts) {
         return true;
       }
     }
-    if (RE2::FullMatch(node->proto().device(), regex)) {
-      return true;
-    }
+    for (const string& device : node->proto().devices())
+      if (RE2::FullMatch(device, regex)) {
+        return true;
+      }
   }
-
   return false;
 }
 
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show.h b/tensorflow/tools/tfprof/internal/tfprof_show.h
index a17358bb6b4b95ef1f28678529a37c9517c28c4a..803b3010442f0c3f607cd3fd30b000a7c0838f8f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_show.h
@@ -28,51 +28,27 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
 #include "tensorflow/tools/tfprof/tfprof_output.pb.h"
 
 namespace tensorflow {
 namespace tfprof {
-class ShowNode {
- public:
-  explicit ShowNode(TFNode* node);
-  virtual ~ShowNode() {}
-
-  const string& name() const { return node->node_def()->name(); }
-  TFProfNode* mutable_proto();
-  const TFProfNode& proto() const;
-
-  string Format(const Options& opts);
-
-  string FormatMeta(const Options& opts);
-
-  TFNode* node;
-  bool account;
-  string formatted_str;
-
- protected:
-  void AggregateTotalStats(ShowNode* node);
-
-  void AddSelfToTotalStats();
-
-  void ResetTotalStats();
-
-  TFProfNode proto_;
-};
-
 class TFShow {
  public:
   explicit TFShow(checkpoint::CheckpointReader* ckpt_reader)
       : ckpt_reader_(ckpt_reader) {}
   virtual ~TFShow() {}
-  virtual void AddNode(TFNode* node) = 0;
+  virtual void AddNode(TFGraphNode* node) = 0;
   virtual void Build() = 0;
-  const TFProfNode& Show(const Options& opts);
+  const TFGraphNodeProto& Show(const Options& opts);
 
  protected:
-  virtual const ShowNode* ShowInternal(const Options& opts) = 0;
+  virtual const ShowNode* ShowInternal(const Options& opts,
+                                       Timeline* timeline) = 0;
 
   bool LookUpCheckPoint(const string& name,
                         std::unique_ptr<TFProfTensor>* tensor);
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_code.cc b/tensorflow/tools/tfprof/internal/tfprof_show_code.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfec09ad193a46c35cf452d1dc50f5a38235ae72
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_code.cc
@@ -0,0 +1,140 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_show_code.h"
+
+#include <memory>
+#include <set>
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/regexp.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_scope.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+const TFCodeNodeProto& TFShowCode::Show(const Options& opts) {
+  if (opts.output_type == kOutput[0]) {
+    Timeline timeline(opts.output_options.at(kTimelineOpts[0]));
+    return ShowInternal(opts, &timeline)->proto();
+  } else if (opts.output_type == kOutput[2]) {
+    const ShowCodeNode* root = ShowInternal(opts, nullptr);
+    Status s =
+        WriteStringToFile(Env::Default(), opts.output_options.at(kFileOpts[0]),
+                          root->formatted_str);
+    if (!s.ok()) {
+      fprintf(stderr, "%s\n", s.ToString().c_str());
+    }
+    return root->proto();
+  } else {
+    const ShowCodeNode* root = ShowInternal(opts, nullptr);
+    printf("%s", root->formatted_str.c_str());
+    fflush(stdout);
+    return root->proto();
+  }
+}
+
+bool TFShowCode::ShouldShow(ShowCodeNode* node, const Options& opts,
+                            int depth) {
+  // Always show kTFProfRoot.
+  if (node->name() == kTFProfRoot) return true;
+
+  if (!node->account) return false;
+  // TODO(xpan): Think more carefully about node filtering in code view.
+  // Unlike graph/scope view, which users want to see the exact leaf op.
+  // In code view, users want to see the middle code traces they wrote.
+  //
+  // This is a subtle difference from scope/graph view. Usually mostly
+  // want to see the middle code traces (i.e. their own codes.), instead
+  // of the TensorFlow internal codes traces.
+  if (node->proto().total_requested_bytes() < opts.min_bytes ||
+      node->proto().total_exec_micros() < opts.min_micros ||
+      node->proto().total_parameters() < opts.min_params ||
+      node->proto().total_float_ops() < opts.min_float_ops ||
+      depth > opts.max_depth || !ShouldShowIfExtra(node, opts, depth)) {
+    return false;
+  }
+
+  bool show = false;
+  if (opts.device_regexes.size() == 1 && opts.device_regexes[0] == ".*") {
+    show = true;
+  } else {
+    for (const string& regex : opts.device_regexes) {
+      for (const string& device : node->node->devices()) {
+        if (RE2::FullMatch(device, regex)) {
+          show = true;
+          break;
+        }
+      }
+      if (show) break;
+    }
+  }
+  // Don't show if device_regexes don't cover it.
+  if (!show) return false;
+
+  show = false;
+  if (opts.show_name_regexes.size() == 1 && opts.show_name_regexes[0] == ".*") {
+    show = true;
+  } else {
+    for (const string& regex : opts.show_name_regexes) {
+      if (RE2::FullMatch(node->name(), regex)) {
+        show = true;
+        break;
+      }
+    }
+  }
+  // Don't show if show_name_regexes don't cover it.
+  if (!show) return false;
+  // Don't show if hide_name_regexes cover it.
+  for (const string& regex : opts.hide_name_regexes) {
+    if (RE2::FullMatch(node->name(), regex)) return false;
+  }
+  return true;
+}
+
+bool TFShowCode::ShouldTrim(ShowCodeNode* node,
+                            const std::vector<string>& regexes) {
+  for (const string& regex : regexes) {
+    if (RE2::FullMatch(node->name(), regex)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool TFShowCode::ShouldAccount(ShowCodeNode* node, const Options& opts) {
+  if (opts.account_type_regexes.size() == 1 &&
+      opts.account_type_regexes[0] == ".*") {
+    return true;
+  }
+  for (const string& regex : opts.account_type_regexes) {
+    for (const string& type : node->node->op_types()) {
+      if (RE2::FullMatch(type, regex)) {
+        return true;
+      }
+    }
+    for (const string& device : node->node->devices()) {
+      if (RE2::FullMatch(device, regex)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_code.h b/tensorflow/tools/tfprof/internal/tfprof_show_code.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbfd38945fc669c3ab3dafbec584cb723f7f24cc
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_code.h
@@ -0,0 +1,103 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Parent class and utilities for tfprof_code.
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_CODE_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_CODE_H_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_tensor.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+class TFShowCode {
+ public:
+  explicit TFShowCode() {}
+  virtual ~TFShowCode() {}
+  virtual void AddNode(TFGraphNode* node) = 0;
+  virtual void Build() = 0;
+  const TFCodeNodeProto& Show(const Options& opts);
+
+ protected:
+  virtual const ShowCodeNode* ShowInternal(const Options& opts,
+                                           Timeline* timeline) = 0;
+
+  bool LookUpCheckPoint(const string& name,
+                        std::unique_ptr<TFProfTensor>* tensor);
+
+  // Overridden by subclass if extra requirements need to be met.
+  virtual bool ShouldShowIfExtra(ShowCodeNode* node, const Options& opts,
+                                 int depth) {
+    return true;
+  }
+
+  bool ShouldShow(ShowCodeNode* node, const Options& opts, int depth);
+
+  bool ShouldTrim(ShowCodeNode* node, const std::vector<string>& regexes);
+
+  bool ShouldAccount(ShowCodeNode* node, const Options& opts);
+
+  template <typename T>
+  std::vector<T*> SortNodes(const std::vector<T*>& nodes, const Options& opts) {
+    if (opts.order_by.empty() || nodes.empty()) {
+      return nodes;
+    }
+    std::vector<T*> sorted_nodes = nodes;
+    std::sort(sorted_nodes.begin(), sorted_nodes.end(),
+              [&opts](const T* n1, const T* n2) {
+                if (n1->name() == kTFProfRoot) return true;
+                if (n2->name() == kTFProfRoot) return false;
+                bool name_cmp = n1->name() < n2->name();
+                if (opts.order_by == kOrderBy[0]) {
+                  return name_cmp;
+                } else if (opts.order_by == kOrderBy[1]) {
+                  return n1->proto().total_requested_bytes() >
+                         n2->proto().total_requested_bytes();
+                } else if (opts.order_by == kOrderBy[2]) {
+                  return n1->proto().total_exec_micros() >
+                         n2->proto().total_exec_micros();
+                } else if (opts.order_by == kOrderBy[3]) {
+                  return n1->proto().total_parameters() >
+                         n2->proto().total_parameters();
+                } else if (opts.order_by == kOrderBy[4]) {
+                  return n1->proto().total_float_ops() >
+                         n2->proto().total_float_ops();
+                }
+                return name_cmp;
+              });
+    return sorted_nodes;
+  }
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_SHOW_CODE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
index ffaa576639e28435bb0d97537bb262ce5865b6db..f0621c9af0f24c960ff47ab754812a5a10919f7b 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_show_test.cc
@@ -75,7 +75,7 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
                {"VariableV2"},  // accout_type_regexes
                {".*"}, {""}, {".*"}, {""}, false,
                {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false, dump_file);
+               "file", {{"outfile", dump_file}});
   tf_stats_->PrintGraph("scope", opts);
 
   string dump_str;
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.cc b/tensorflow/tools/tfprof/internal/tfprof_stats.cc
index edc0689d69968f7ebf36c88c36d76ed329b88eeb..566b4cee440e6ced508e19eff2e79ca9df1b8131 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
 
 namespace tensorflow {
 namespace tfprof {
@@ -56,37 +58,52 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
   printf("Preparing Views...\n");
   scope_view_ = std::unique_ptr<TFScope>(new TFScope(ckpt_reader_.get()));
   graph_view_ = std::unique_ptr<TFGraph>(new TFGraph(ckpt_reader_.get()));
+  code_view_ = std::unique_ptr<TFCode>(new TFCode());
+
   for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
     scope_view_->AddNode(&it->second);
     graph_view_->AddNode(&it->second);
+    code_view_->AddNode(&it->second);
   }
   scope_view_->Build();
   graph_view_->Build();
+  code_view_->Build();
 }
 
-const TFProfNode& TFStats::PrintGraph(const string& cmd, const Options& opts) {
+const TFGraphNodeProto& TFStats::PrintGraph(const string& cmd,
+                                            const Options& opts) {
   if (cmd == kCmds[0]) {
     return scope_view_->Show(opts);
   } else if (cmd == kCmds[1]) {
     return graph_view_->Show(opts);
   } else {
     fprintf(stderr, "Unknown command: %s\n", cmd.c_str());
-    return empty_node_;
+    return empty_graph_node_;
   }
 }
 
+const TFCodeNodeProto& TFStats::PrintCode(const Options& opts) {
+  return code_view_->Show(opts);
+}
+
 void TFStats::ParseGraph() {
   for (const NodeDef& node : graph_->node()) {
     CHECK(nodes_map_.find(node.name()) == nodes_map_.end());
-    nodes_map_[node.name()] = TFNode(&node);
+    nodes_map_[node.name()] = TFGraphNode(&node);
   }
   for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
     const NodeDef* node_def = it->second.node_def();
     for (string node_input : node_def->input()) {
+      int output_idx = 0;
       // input name format can be: "^node:src_output"
       auto prefix_pos = node_input.find(":");
       if (prefix_pos != node_input.npos) {
-        node_input.substr(0, prefix_pos);
+        std::vector<string> input_parts = str_util::Split(node_input, ":");
+        CHECK(input_parts.size() == 2)
+            << "Unknown NodeDef.input format: " << node_input;
+        node_input = input_parts[0];
+        CHECK(strings::safe_strto32(input_parts[1], &output_idx))
+            << "Failed to parse integer: " << output_idx;
       }
       if (node_input.substr(0, 1) == "^") {
         node_input = node_input.substr(1);
@@ -95,7 +112,7 @@ void TFStats::ParseGraph() {
       if (input_node == nodes_map_.end()) {
         continue;
       }
-      it->second.AddInput(&input_node->second);
+      it->second.AddInput(&input_node->second, output_idx);
     }
   }
 }
@@ -110,6 +127,9 @@ void TFStats::ParseOpLog() {
     if (entry.float_ops()) {
       node->second.AddFloatOps(entry.float_ops());
     }
+    if (entry.has_code_def()) {
+      node->second.AddCode(&entry.code_def());
+    }
   }
 }
 
@@ -125,20 +145,6 @@ void TFStats::ParseRunMeta() {
       node->second.AddStepStat(dev_stat.device(), &node_stat);
     }
   }
-
-  if (!run_meta_->has_cost_graph()) {
-    fprintf(stderr,
-            "Missing CostGraphDef in RunMetadata.\nMaybe you forget to"
-            "set tf.ConfigProto(graph_options=tf.GraphOptions("
-            "build_cost_model=1)) to Session()\n");
-  }
-  for (const auto& node_pb : run_meta_->cost_graph().node()) {
-    auto node = nodes_map_.find(node_pb.name());
-    if (node == nodes_map_.end()) {
-      continue;
-    }
-    node->second.AddNodeStat(&node_pb);
-  }
 }
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats.h b/tensorflow/tools/tfprof/internal/tfprof_stats.h
index 3a8b46ae315a4f2b1211a20a712ce5f20ee33632..585dca6771a1a506464ecd8f7bb5be09b2d56a91 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats.h
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_code.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_graph.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_node.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
@@ -56,7 +57,8 @@ class TFStats {
 
   // Prints the results to stdout. Also returns the printed output in
   // a proto.
-  const TFProfNode& PrintGraph(const string& cmd, const Options& opts);
+  const TFGraphNodeProto& PrintGraph(const string& cmd, const Options& opts);
+  const TFCodeNodeProto& PrintCode(const Options& opts);
 
  private:
   void ParseGraph();
@@ -67,13 +69,16 @@ class TFStats {
 
   std::unique_ptr<TFScope> scope_view_;
   std::unique_ptr<TFGraph> graph_view_;
+  std::unique_ptr<TFCode> code_view_;
   std::unique_ptr<GraphDef> graph_;
   std::unique_ptr<RunMetadata> run_meta_;
   std::unique_ptr<OpLog> op_log_;
   std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader_;
-  // Store TFNode instead of TFNode* to avoid large number of dynamic alloc.
-  std::map<string, TFNode> nodes_map_;
-  TFProfNode empty_node_;
+  // Store TFGraphNode instead of TFGraphNode* to avoid large number of
+  // dynamic alloc.
+  std::map<string, TFGraphNode> nodes_map_;
+  TFGraphNodeProto empty_graph_node_;
+  TFCodeNodeProto empty_code_node_;
 };
 
 }  // namespace tfprof
diff --git a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
index 3c97f0eb65adc171abcbf7ad39d22f7739bdd9f9..eb01425e044d46a5dd62d0a1f2fc3b87f45c0e6f 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_stats_test.cc
@@ -74,30 +74,30 @@ TEST_F(TFProfStatsTest, CustomOpType) {
   Options opts(3, 0, 0, 0, 0, {".*"}, "name",
                {kTrainableVarType},  // accout_type_regexes
                {".*"}, {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "",
+               {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
       "370\nchildren {\n  name: \"conv2d/bias\"\n  exec_micros: 1\n  "
       "requested_bytes: 20\n  parameters: 5\n  total_exec_micros: 1\n  "
-      "total_requested_bytes: 20\n  total_parameters: 5\n  device: "
+      "total_requested_bytes: 20\n  total_parameters: 5\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
       "total_float_ops: 0\n}\nchildren {\n  name: \"conv2d/kernel\"\n  "
       "exec_micros: 1\n  requested_bytes: 540\n  parameters: 135\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 540\n  total_parameters: "
-      "135\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "135\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/bias\"\n  "
       "exec_micros: 1\n  requested_bytes: 20\n  parameters: 5\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 20\n  total_parameters: "
-      "5\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "5\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/kernel\"\n  "
       "exec_micros: 2\n  requested_bytes: 900\n  parameters: 225\n  "
       "total_exec_micros: 2\n  total_requested_bytes: 900\n  total_parameters: "
-      "225\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "225\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -107,29 +107,29 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
   Options opts(
       3, 0, 0, 0, 0, {".*"}, "name", {kCkptVarType},  // accout_type_regexes
       {".*"}, {""}, {".*"}, {""}, false,
-      {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+      {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 5\ntotal_requested_bytes: 1480\ntotal_parameters: "
       "370\nchildren {\n  name: \"conv2d/bias\"\n  exec_micros: 1\n  "
       "requested_bytes: 20\n  parameters: 5\n  total_exec_micros: 1\n  "
-      "total_requested_bytes: 20\n  total_parameters: 5\n  device: "
+      "total_requested_bytes: 20\n  total_parameters: 5\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 0\n  "
       "total_float_ops: 0\n}\nchildren {\n  name: \"conv2d/kernel\"\n  "
       "exec_micros: 1\n  requested_bytes: 540\n  parameters: 135\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 540\n  total_parameters: "
-      "135\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "135\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/bias\"\n  "
       "exec_micros: 1\n  requested_bytes: 20\n  parameters: 5\n  "
       "total_exec_micros: 1\n  total_requested_bytes: 20\n  total_parameters: "
-      "5\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "5\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nchildren {\n  name: \"conv2d_1/kernel\"\n  "
       "exec_micros: 2\n  requested_bytes: 900\n  parameters: 225\n  "
       "total_exec_micros: 2\n  total_requested_bytes: 900\n  total_parameters: "
-      "225\n  device: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
+      "225\n  devices: \"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: "
       "0\n  total_float_ops: 0\n}\nfloat_ops: 0\ntotal_float_ops: 0\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -139,11 +139,11 @@ TEST_F(TFProfStatsTest, TestGraph) {
   Options opts(100, 0, 10000, 0, 0, {".*"}, "name", {".*"},
                {"cost.*"},  // start_name_regexes
                {""}, {".*"}, {""}, false,
-               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
-               false);
-  const TFProfNode& root = tf_stats_->PrintGraph("graph", opts);
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"}, "",
+               {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("graph", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: 0\ninputs: "
       "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
@@ -154,28 +154,28 @@ TEST_F(TFProfStatsTest, TestGraph) {
 
 TEST_F(TFProfStatsTest, TestFloatOps) {
   Options opts(10, 0, 0, 0, 1, {".*"}, "name", {".*"}, {".*"}, {""}, {".*"},
-               {""}, false, {"float_ops"}, false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+               {""}, false, {"float_ops"}, "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 96\ntotal_requested_bytes: "
       "8656\ntotal_parameters: 370\nchildren {\n  name: \"conv2d/BiasAdd\"\n  "
       "exec_micros: 12\n  requested_bytes: 1440\n  total_exec_micros: 12\n  "
-      "total_requested_bytes: 1440\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 1440\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 360\n  "
       "total_float_ops: 360\n}\nchildren {\n  name: \"conv2d/convolution\"\n  "
       "exec_micros: 60\n  requested_bytes: 1440\n  total_exec_micros: 60\n  "
-      "total_requested_bytes: 1440\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 1440\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 19440\n  "
       "total_float_ops: 19440\n}\nchildren {\n  name: \"conv2d_2/BiasAdd\"\n  "
       "exec_micros: 2\n  requested_bytes: 640\n  total_exec_micros: 2\n  "
-      "total_requested_bytes: 640\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 640\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 160\n  "
       "total_float_ops: 160\n}\nchildren {\n  name: \"conv2d_2/convolution\"\n "
       " exec_micros: 13\n  requested_bytes: 640\n  total_exec_micros: 13\n  "
-      "total_requested_bytes: 640\n  total_parameters: 0\n  device: "
+      "total_requested_bytes: 640\n  total_parameters: 0\n  devices: "
       "\"/job:localhost/replica:0/task:0/cpu:0\"\n  float_ops: 14400\n  "
       "total_float_ops: 14400\n}\nfloat_ops: 0\ntotal_float_ops: 34360\n",
       &expected));
@@ -186,10 +186,10 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
   Options opts(100, 0, 0, 0, 0, {".*"}, "name", {".*"}, {".*"}, {""},
                {"unit_2_1.*DW"},  // show_name_regexes.
                {""}, true,        // account_displayed_op_only.
-               {"params"}, false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+               {"params"}, "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
@@ -202,9 +202,9 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
   Options opts(10, 0, 0, 0, 0, {".*"}, "name", {".*"}, {".*"}, {""},
                {"unit_1_0.*gamma"}, {""}, false,
                {"tensor_value"},  // Show tensor value from checkpoint.
-               false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
-  TFProfNode expected;
+               "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 96\ntotal_requested_bytes: "
diff --git a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc b/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
index 8c19910355baa077bd38412ae56af7617edc2cdd..79a781210dbba00e3522710a516d64933af207d6 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_tensor_test.cc
@@ -57,10 +57,10 @@ class TFProfTensorTest : public ::testing::Test {
 TEST_F(TFProfTensorTest, Basics) {
   Options opts(3, 0, 0, 0, 0, {".*"}, "name", {"VariableV2"}, {".*"}, {""},
                {".*"}, {""}, false, {"tensor_value"},  // show the tensor value.
-               false);
-  const TFProfNode& root = tf_stats_->PrintGraph("scope", opts);
+               "", {});
+  const TFGraphNodeProto& root = tf_stats_->PrintGraph("scope", opts);
 
-  TFProfNode expected;
+  TFGraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\nexec_micros: 0\nrequested_bytes: "
       "0\ntotal_exec_micros: 0\ntotal_requested_bytes: 0\ntotal_parameters: "
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline.cc b/tensorflow/tools/tfprof/internal/tfprof_timeline.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5640c0e569d3ea809ff04673146ac377ca844ef
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline.cc
@@ -0,0 +1,245 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_timeline.h"
+
+#include <utility>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
+                                              const string& category,
+                                              const string& name, int64 pid,
+                                              int64 tid, int64 ts) {
+  Json::Value event(Json::objectValue);
+  event["ph"] = Json::Value(ph);
+  event["cat"] = Json::Value(category);
+  event["name"] = Json::Value(name);
+  event["pid"] = Json::Value(pid);
+  event["tid"] = Json::Value(tid);
+  event["ts"] = Json::Value(ts);
+  return event;
+}
+
+void ChromeTraceFormatter::EmitPID(const string& name, int64 pid) {
+  Json::Value event(Json::objectValue);
+  event["name"] = Json::Value("process_name");
+  event["ph"] = Json::Value("M");
+  event["pid"] = Json::Value(pid);
+  Json::Value args(Json::objectValue);
+  args["name"] = Json::Value(name);
+  event["args"] = args;
+  metadata_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitRegion(int64 ts, int64 duration, int64 pid,
+                                      int64 tid, const string& category,
+                                      const string& name, Json::Value args) {
+  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
+  event["dur"] = Json::Value(duration);
+  event["args"] = std::move(args);
+  metadata_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitFlowStart(const string& name, int64 ts,
+                                         int64 pid, int64 tid, int64 flow_id) {
+  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
+  event["id"] = flow_id;
+  events_.push_back(event);
+}
+
+void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
+                                       int64 tid, int64 flow_id) {
+  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
+  event["id"] = flow_id;
+  events_.push_back(event);
+}
+
+string ChromeTraceFormatter::Format() {
+  Json::Value trace;
+  trace["traceEvents"] = Json::Value(Json::arrayValue);
+  for (const Json::Value& v : metadata_) {
+    trace["traceEvents"].append(v);
+  }
+  for (const Json::Value& v : events_) {
+    trace["traceEvents"].append(v);
+  }
+  return trace.toStyledString();
+}
+
+void Timeline::GenerateGraphTimeline(const GraphNode* gnode) {
+  fprintf(stdout, "adding graph nodes.\n");
+  AddGraphNode(gnode);
+  AllocateLanes();
+  fprintf(stdout, "generating trace file.\n");
+  int64 flow_id = 1;
+  for (const auto& process : alloc_nodes_) {
+    for (const auto& lane : process.second) {
+      for (const auto& node : lane.second) {
+        TimeNode* tnode = node.second;
+
+        Json::Value args(Json::objectValue);
+        args["name"] = Json::Value(tnode->name);
+        args["op"] = Json::Value(tnode->name);
+        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
+                                     process.first, lane.first, "Op",
+                                     tnode->name, args);
+
+        for (TimeNode* next_tnode : node.second->next_tnodes) {
+          chrome_formatter_.EmitFlowStart(
+              tnode->name + "_flow", tnode->start_micros + tnode->exec_micros,
+              process.first, lane.first, flow_id);
+          chrome_formatter_.EmitFlowEnd(
+              tnode->name + "_flow", next_tnode->start_micros,
+              next_tnode->process->pid, next_tnode->tid, flow_id);
+          flow_id += 1;
+        }
+      }
+    }
+  }
+  OutputTimeline();
+}
+
+void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
+  std::set<int64> visited_depth;
+  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
+  OutputTimeline();
+}
+
+void Timeline::GenerateCodeTimeline(const CodeNode* node) {
+  std::set<int64> visited_depth;
+  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
+  OutputTimeline();
+}
+
+void Timeline::OutputTimeline() {
+  Status s =
+      WriteStringToFile(Env::Default(), outfile_, chrome_formatter_.Format());
+  if (!s.ok()) {
+    fprintf(stderr, "Failed to write timeline file: %s\nError: %s\n",
+            outfile_.c_str(), s.ToString().c_str());
+    return;
+  }
+  fprintf(stdout, "\n******************************************************\n");
+  fprintf(stdout,
+          "Timeline file is written to %s.\n"
+          "Open a Chrome browser, enter URL chrome://tracing and "
+          "load the timeline file.",
+          outfile_.c_str());
+  fprintf(stdout, "\n******************************************************\n");
+  fflush(stdout);
+}
+
+std::vector<TimeNode*> Timeline::AddGraphNode(const GraphNode* gnode) {
+  std::vector<TimeNode*> tnodes;
+  if (!gnode) return tnodes;
+
+  std::vector<TimeNode*> shown_cinputs;
+  for (GraphNode* schild : gnode->show_children) {
+    std::vector<TimeNode*> inputs = AddGraphNode(schild);
+    shown_cinputs.insert(shown_cinputs.end(), inputs.begin(), inputs.end());
+  }
+  if (!gnode->node->step_stats()) {
+    return shown_cinputs;
+  }
+
+  const TFGraphNode* node = gnode->node;
+  for (const auto& kernel_execs : node->op_kernel_execs()) {
+    const string& device = kernel_execs.first;
+    const std::vector<std::pair<int64, int64>>& execs = kernel_execs.second;
+
+    if (process_.find(device) == process_.end()) {
+      int64 pid = AllocatePID();
+      process_[device].reset(new Process(pid));
+      chrome_formatter_.EmitPID(device, pid);
+    }
+    Process* p = process_[device].get();
+
+    for (const auto& exec : execs) {
+      int64 start_micros = exec.first;
+      int64 exec_micros = exec.second;
+      // TODO(xpan): There might be start time duplication here.
+      if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
+        // TODO(xpan): Give each kernel call a unique_name.
+        tnodes_[device][start_micros].reset(
+            new TimeNode(p, node->name(), start_micros, exec_micros));
+      }
+      TimeNode* tnode_ptr = tnodes_[device][start_micros].get();
+
+      for (int i = 0; i < shown_cinputs.size(); i++) {
+        shown_cinputs[i]->next_tnodes.push_back(tnode_ptr);
+      }
+      tnodes.push_back(tnode_ptr);
+    }
+  }
+  return tnodes;
+}
+
+void Timeline::AllocateLanes() {
+  for (auto& process : tnodes_) {
+    Process* p = process_[process.first].get();
+    for (auto& tnode : process.second) {
+      int64 start_time = tnode.second->start_micros;
+      int64 end_time = tnode.second->exec_micros - 1;
+
+      int64 l = -1;
+      for (int i = 0; i < p->lanes.size(); ++i) {
+        const auto& lane = p->lanes[i];
+        auto cur_it = lane.lower_bound(start_time);
+        if (cur_it == lane.end()) {
+          --cur_it;
+        }
+        l = i;
+        for (; cur_it != lane.begin(); --cur_it) {
+          if (cur_it->second < start_time) {
+            break;
+          }
+          if (cur_it->first <= end_time) {
+            l = -1;
+            break;
+          }
+        }
+        if (l >= 0) {
+          break;
+        }
+      }
+      if (l < 0) {
+        l = p->lanes.size();
+        std::map<int64, int64> nlane;
+        nlane[start_time] = end_time;
+        p->lanes.push_back(nlane);
+      } else {
+        p->lanes[l][start_time] = end_time;
+      }
+      tnode.second->tid = l;
+      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
+    }
+  }
+}
+
+int64 Timeline::AllocatePID() {
+  int64 cur_pid = next_pid_;
+  next_pid_ += 1;
+  return cur_pid;
+}
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline.h b/tensorflow/tools/tfprof/internal/tfprof_timeline.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d26874abd2836b8725d9630e7f4b9ca61df1aa9
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline.h
@@ -0,0 +1,147 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
+#define THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
+
+#include "include/json/json.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_node_show.h"
+
+namespace tensorflow {
+namespace tfprof {
+
+typedef std::map<string, string> Event;
+
+class ChromeTraceFormatter {
+ public:
+  ChromeTraceFormatter() {}
+
+  Json::Value CreateEvent(const string& ph, const string& category,
+                          const string& name, int64 pid, int64 tid, int64 ts);
+
+  void EmitPID(const string& name, int64 pid);
+
+  void EmitRegion(int64 ts, int64 duration, int64 pid, int64 tid,
+                  const string& category, const string& name, Json::Value args);
+
+  void EmitFlowStart(const string& name, int64 ts, int64 pid, int64 tid,
+                     int64 flow_id);
+
+  void EmitFlowEnd(const string& name, int64 ts, int64 pid, int64 tid,
+                   int64 flow_id);
+
+  string Format();
+
+ private:
+  std::vector<Json::Value> events_;
+  std::vector<Json::Value> metadata_;
+};
+
+class Process {
+ public:
+  Process(int64 pid) : pid(pid) {}
+
+  // Each lane is a map from start_time to end_time.
+  std::vector<std::map<int64, int64>> lanes;
+  int64 pid;
+};
+
+class TimeNode {
+ public:
+  TimeNode(Process* process, const string& name, int64 start_micros,
+           int64 exec_micros)
+      : process(process),
+        name(name),
+        start_micros(start_micros),
+        exec_micros(exec_micros),
+        tid(-1) {}
+  virtual ~TimeNode() {}
+
+  Process* process;
+  string name;
+  int64 start_micros;
+  int64 exec_micros;
+  int64 tid;
+  std::vector<TimeNode*> next_tnodes;
+};
+
+class Timeline {
+ public:
+  Timeline(const string& outfile) : outfile_(outfile) {}
+  ~Timeline() {}
+
+  void GenerateGraphTimeline(const GraphNode* gnode);
+
+  void GenerateScopeTimeline(const ScopeNode* node);
+
+  void GenerateCodeTimeline(const CodeNode* node);
+
+ private:
+  void OutputTimeline();
+
+  template <typename Node>
+  void EmitTreeNode(const Node* node, int64 start_time, int64 duration,
+                    int64 depth, std::set<int64>* visited_depth) {
+    if (visited_depth->find(depth) == visited_depth->end()) {
+      chrome_formatter_.EmitPID(strings::StrCat("Scope:", depth), depth);
+      visited_depth->insert(depth);
+    }
+
+    Json::Value args(Json::objectValue);
+    args["name"] = Json::Value(node->name());
+    args["op"] = Json::Value(node->name());
+    chrome_formatter_.EmitRegion(start_time, duration, depth, 0, "Op",
+                                 node->name(), args);
+
+    int64 total_micros = 0;
+    int64 c_start_time = start_time;
+    for (const Node* child : node->show_children) {
+      int64 total_exec_micros = child->proto().total_exec_micros();
+      if (total_exec_micros <= 0) {
+        continue;
+      }
+      EmitTreeNode(child, c_start_time, total_exec_micros, depth + 1,
+                   visited_depth);
+      c_start_time += total_exec_micros;
+      total_micros += total_exec_micros;
+    }
+    CHECK(total_micros <= duration) << node->name() << " parent:" << duration
+                                    << " children:" << total_micros;
+  }
+
+  std::vector<TimeNode*> AddGraphNode(const GraphNode* gnode);
+
+  void AllocateLanes();
+
+  int64 AllocatePID();
+
+  const string outfile_;
+  int64 next_pid_ = 0;
+  int64 allocator_pid_ = -1;
+  ChromeTraceFormatter chrome_formatter_;
+  std::map<string, int64> device_pids_;
+
+  std::map<string, std::unique_ptr<Process>> process_;
+  std::map<int64, std::map<int64, std::map<int64, TimeNode*>>> alloc_nodes_;
+  std::map<string, std::map<int64, std::unique_ptr<TimeNode>>> tnodes_;
+};
+
+}  // namespace tfprof
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_TOOLS_TFPROF_INTERNAL_TFPROF_TIMELINE_H_
diff --git a/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc b/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2dfe6ab335e11eab5464dec4390868e3f6518fa8
--- /dev/null
+++ b/tensorflow/tools/tfprof/internal/tfprof_timeline_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/tools/tfprof/internal/tfprof_stats.h"
+
+#include <utility>
+
+#include "tensorflow/c/checkpoint_reader.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_constants.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_options.h"
+#include "tensorflow/tools/tfprof/internal/tfprof_utils.h"
+#include "tensorflow/tools/tfprof/tfprof_log.pb.h"
+#include "tensorflow/tools/tfprof/tfprof_output.pb.h"
+
+namespace tensorflow {
+namespace tfprof {
+class TFProfTimelineTest : public ::testing::Test {
+ protected:
+  TFProfTimelineTest() {
+    string graph_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "tools/tfprof/internal/testdata/graph.pbtxt");
+    std::unique_ptr<tensorflow::GraphDef> graph_pb(new tensorflow::GraphDef());
+    TF_CHECK_OK(ReadGraphDef(Env::Default(), graph_path, graph_pb.get()));
+
+    std::unique_ptr<tensorflow::RunMetadata> run_meta_pb(
+        new tensorflow::RunMetadata());
+    string run_meta_path =
+        io::JoinPath(testing::TensorFlowSrcRoot(),
+                     "tools/tfprof/internal/testdata/run_meta");
+    TF_CHECK_OK(
+        ReadBinaryProto(Env::Default(), run_meta_path, run_meta_pb.get()));
+
+    tf_stats_.reset(new TFStats(std::move(graph_pb), std::move(run_meta_pb),
+                                nullptr, nullptr));
+  }
+
+  std::unique_ptr<TFStats> tf_stats_;
+};
+
+// Before adding test, first dump the json file and
+// manually check it's correct
+TEST_F(TFProfTimelineTest, GraphView) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(10000, 0, 0, 0, 0, {".*"}, "name",
+               {".*"},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               "timeline", {{"outfile", dump_file}});
+  tf_stats_->PrintGraph("graph", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(14171250174278825648ull, Hash64(dump_str));
+}
+
+TEST_F(TFProfTimelineTest, ScopeView) {
+  string dump_file = io::JoinPath(testing::TmpDir(), "dump");
+  Options opts(5, 0, 0, 0, 0, {".*"}, "name", {".*"},  // accout_type_regexes
+               {".*"}, {""}, {".*"}, {""}, false,
+               {"params", "bytes", "micros", "float_ops", "num_hidden_ops"},
+               "timeline", {{"outfile", dump_file}});
+  tf_stats_->PrintGraph("scope", opts);
+
+  string dump_str;
+  TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file, &dump_str));
+  EXPECT_EQ(2355241164346147404ull, Hash64(dump_str));
+}
+
+// TODO(xpan): tfprof_log is too large to include in testdata when adding
+// code traces.
+
+}  // namespace tfprof
+}  // namespace tensorflow
diff --git a/tensorflow/tools/tfprof/internal/tfprof_utils.cc b/tensorflow/tools/tfprof/internal/tfprof_utils.cc
index 6d557e91933648f5dd82dc4c1daee6717ed296d5..8e55e009d3346cf932c094d304d456528c00cdce 100644
--- a/tensorflow/tools/tfprof/internal/tfprof_utils.cc
+++ b/tensorflow/tools/tfprof/internal/tfprof_utils.cc
@@ -94,7 +94,7 @@ string StripQuote(const string& s) {
   return s.substr(start, end - start + 1);
 }
 
-tensorflow::Status ReturnError(const std::vector<string> pieces, int idx) {
+tensorflow::Status ReturnError(const std::vector<string>& pieces, int idx) {
   string val;
   if (pieces.size() > idx + 1) {
     val = pieces[idx + 1];
@@ -251,19 +251,13 @@ tensorflow::Status ParseCmdLine(const string& line, string* cmd,
       opts->select = requested_set;
       ++i;
     } else if (pieces[i] == tensorflow::tfprof::kOptions[14]) {
-      if ((pieces.size() > i + 1 && pieces[i + 1].find("-") == 0) ||
-          pieces.size() == i + 1) {
-        opts->viz = true;
-      } else if (!StringToBool(pieces[i + 1], &opts->viz)) {
-        return ReturnError(pieces, i);
-      } else {
-        ++i;
-      }
-    } else if (pieces[i] == tensorflow::tfprof::kOptions[15]) {
       if (pieces.size() <= i + 1) {
         return ReturnError(pieces, i);
       }
-      opts->dump_to_file = StripQuote(pieces[i + 1]);
+
+      tensorflow::Status s =
+          ParseOutput(pieces[i + 1], &opts->output_type, &opts->output_options);
+      if (!s.ok()) return s;
       ++i;
     } else {
       return ReturnError(pieces, i);
diff --git a/tensorflow/tools/tfprof/tfprof_log.proto b/tensorflow/tools/tfprof/tfprof_log.proto
index cae6e1e3a8c08f64e28460c2850d5f6beeb69e61..5c47142e0ab6e3f647d869016a8ab4f9f9eb9e99 100644
--- a/tensorflow/tools/tfprof/tfprof_log.proto
+++ b/tensorflow/tools/tfprof/tfprof_log.proto
@@ -2,6 +2,17 @@ syntax = "proto2";
 
 package tensorflow.tfprof;
 
+// It specifies the Python callstack that creates an op.
+message CodeDef {
+  repeated Trace traces = 1;
+  message Trace {
+    optional string file = 1;
+    optional int32 lineno = 2;
+    optional string function = 3;
+    optional string line = 4;
+  }
+}
+
 message OpLogEntry {
   // op name.
   optional string name = 1;
@@ -12,6 +23,8 @@ message OpLogEntry {
   // User can define extra op type information for an op. This allows the user
   // to select a group of ops precisely using op_type as a key.
   repeated string types = 3;
+  // Used to support tfprof "code" view.
+  optional CodeDef code_def = 4;
 }
 
 message OpLog {
diff --git a/tensorflow/tools/tfprof/tfprof_main.cc b/tensorflow/tools/tfprof/tfprof_main.cc
index a8ed6e38132df19391b4a8bdfa69a9a3254439f8..cfe239da229246c2d22706aa6d4c73cfac6a2e73 100644
--- a/tensorflow/tools/tfprof/tfprof_main.cc
+++ b/tensorflow/tools/tfprof/tfprof_main.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/tfprof/internal/tfprof_options.h"
@@ -82,8 +83,7 @@ int main(int argc, char** argv) {
   tensorflow::string FLAGS_hide_name_regexes;
   bool FLAGS_account_displayed_op_only = false;
   tensorflow::string FLAGS_select = "params";
-  bool FLAGS_viz = false;
-  tensorflow::string FLAGS_dump_to_file = "";
+  tensorflow::string FLAGS_output = "";
   for (int i = 0; i < argc; i++) {
     fprintf(stderr, "%s\n", argv[i]);
   }
@@ -117,7 +117,7 @@ int main(int argc, char** argv) {
                        &FLAGS_account_displayed_op_only,
                        "account displayed op only"),
       tensorflow::Flag("select", &FLAGS_select, "select"),
-      tensorflow::Flag("dump_to_file", &FLAGS_dump_to_file, "dump to file"),
+      tensorflow::Flag("output", &FLAGS_output, "output"),
   };
   tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
@@ -144,6 +144,12 @@ int main(int argc, char** argv) {
   std::vector<tensorflow::string> select =
       Split(FLAGS_select, ',', tensorflow::str_util::SkipEmpty());
 
+  tensorflow::string output_type;
+  std::map<tensorflow::string, tensorflow::string> output_options;
+  tensorflow::Status s = tensorflow::tfprof::ParseOutput(
+      FLAGS_output, &output_type, &output_options);
+  CHECK(s.ok()) << s.ToString();
+
   tensorflow::string cmd = "";
   if (argc == 1 && FLAGS_graph_path.empty()) {
     printf("1) go/tfprof: Tutorial.\n");
@@ -160,12 +166,13 @@ int main(int argc, char** argv) {
         "Profiling everything!\n");
     return 0;
   } else if (argc > 1) {
-    if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[3]) {
+    if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[4]) {
       tensorflow::tfprof::PrintHelp();
       return 0;
     }
     if (tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[0] ||
-        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[1]) {
+        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[1] ||
+        tensorflow::string(argv[1]) == tensorflow::tfprof::kCmds[2]) {
       cmd = argv[1];
     }
   }
@@ -185,10 +192,18 @@ int main(int argc, char** argv) {
 
   std::unique_ptr<tensorflow::tfprof::OpLog> op_log(
       new tensorflow::tfprof::OpLog());
-  if (!ReadBinaryProto(tensorflow::Env::Default(), FLAGS_op_log_path,
-                       op_log.get())
-           .ok()) {
-    op_log.release();
+  if (!FLAGS_op_log_path.empty()) {
+    tensorflow::string op_log_str;
+    s = tensorflow::ReadFileToString(tensorflow::Env::Default(),
+                                     FLAGS_op_log_path, &op_log_str);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to read op_log_path: %s\n", s.ToString().c_str());
+      return 1;
+    }
+    if (!tensorflow::ParseProtoUnlimited(op_log.get(), op_log_str)) {
+      fprintf(stderr, "Failed to parse op_log_path\n");
+      return 1;
+    }
   }
 
   std::unique_ptr<tensorflow::checkpoint::CheckpointReader> ckpt_reader;
@@ -211,10 +226,13 @@ int main(int argc, char** argv) {
       FLAGS_max_depth, FLAGS_min_bytes, FLAGS_min_micros, FLAGS_min_params,
       FLAGS_min_float_ops, device_regexes, FLAGS_order_by, account_type_regexes,
       start_name_regexes, trim_name_regexes, show_name_regexes,
-      hide_name_regexes, FLAGS_account_displayed_op_only, select, FLAGS_viz,
-      FLAGS_dump_to_file);
+      hide_name_regexes, FLAGS_account_displayed_op_only, select, output_type,
+      output_options);
 
-  if (!cmd.empty()) {
+  if (cmd == tensorflow::tfprof::kCmds[2]) {
+    tf_stat.PrintCode(opts);
+    return 0;
+  } else if (!cmd.empty()) {
     tf_stat.PrintGraph(cmd, opts);
     return 0;
   }
@@ -240,10 +258,12 @@ int main(int argc, char** argv) {
       fprintf(stderr, "E: %s\n", s.ToString().c_str());
       continue;
     }
-    if (cmd == tensorflow::tfprof::kCmds[2]) {
+    if (cmd == tensorflow::tfprof::kCmds[3]) {
       opts = new_opts;
-    } else if (cmd == tensorflow::tfprof::kCmds[3]) {
+    } else if (cmd == tensorflow::tfprof::kCmds[4]) {
       tensorflow::tfprof::PrintHelp();
+    } else if (cmd == tensorflow::tfprof::kCmds[2]) {
+      tf_stat.PrintCode(new_opts);
     } else {
       tf_stat.PrintGraph(cmd, new_opts);
     }
diff --git a/tensorflow/tools/tfprof/tfprof_options.proto b/tensorflow/tools/tfprof/tfprof_options.proto
index 0d8e6880390328586068fe57daff1f4a66fb0bc8..84a2e14005374a7fdefbb411e05ea5b7f07808f1 100644
--- a/tensorflow/tools/tfprof/tfprof_options.proto
+++ b/tensorflow/tools/tfprof/tfprof_options.proto
@@ -19,6 +19,6 @@ message OptionsProto {
   repeated string hide_name_regexes = 12;
   optional bool account_displayed_op_only = 13;
   repeated string select = 14;
-  optional bool viz = 15;
+  optional string output = 15;
   optional string dump_to_file = 16;
-}
\ No newline at end of file
+}
diff --git a/tensorflow/tools/tfprof/tfprof_output.proto b/tensorflow/tools/tfprof/tfprof_output.proto
index 9afd41046e4eb00150e0bb514d59e363d80c9d59..93e6c1233c30d008fed73f0e863c627e54d05c37 100644
--- a/tensorflow/tools/tfprof/tfprof_output.proto
+++ b/tensorflow/tools/tfprof/tfprof_output.proto
@@ -14,7 +14,8 @@ message TFProfTensorProto {
   repeated string value_str = 4;
 }
 
-message TFProfNode {
+// A node in TensorFlow graph. Used by scope/graph view.
+message TFGraphNodeProto {
   // op name.
   optional string name = 1;
   // tensor value restored from checkpoint.
@@ -30,7 +31,8 @@ message TFProfNode {
   // Number of inputs to the op.
   optional int64 inputs = 5;
   // Device the op is assigned to.
-  optional string device = 10;
+  // Since an op can fire multiple kernel calls, there can be multiple devices.
+  repeated string devices = 10;
 
   // The following are the aggregated stats from all accounted descendants and
   // the op itself. The actual descendants depend on the data structure used
@@ -45,5 +47,34 @@ message TFProfNode {
   repeated TensorShapeProto shapes = 11;
   // Descendants of the graph. The actual descendants depend on the data
   // structure used (scope, graph).
-  repeated TFProfNode children = 12;
+  repeated TFGraphNodeProto children = 12;
+}
+
+// A node in TensorFlow Python call trace stack. Used by code view.
+message TFCodeNodeProto {
+  // A trace in the trace stack.
+  optional string name = 1;
+
+  // code execution time.
+  optional int64 exec_micros = 2;
+  // Total requested bytes by the code.
+  optional int64 requested_bytes = 3;
+  // Number of parameters if available.
+  optional int64 parameters = 4;
+  // Number of float operations.
+  optional int64 float_ops = 5;
+
+  // The following are the aggregated stats from called descendents and the
+  // trace itself. The actual descendants depend on the data structure used.
+  optional int64 total_exec_micros = 6;
+  optional int64 total_requested_bytes = 7;
+  optional int64 total_parameters = 8;
+  optional int64 total_float_ops = 9;
+
+  // A set of graph nodes created by the leaf of the call stack.
+  // 'children' field should be empty if graph_nodes is non-empty.
+  repeated TFGraphNodeProto graph_nodes = 10;
+  // Descendants of the graph. The actual descendants depend on the data
+  // structure used (scope, graph).
+  repeated TFCodeNodeProto children = 11;
 }
\ No newline at end of file
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 4a39723cdc92ada6bcac4207d744fd8b7a81190b..3f8619af085b90bfa590ce636533787bce2deaa0 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,10 +1,24 @@
 # TensorFlow external dependencies that can be loaded in WORKSPACE files.
 
-load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
-load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
-load("@io_bazel_rules_closure//closure:defs.bzl", "webfiles_external")
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
+load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
+load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
+load("@io_bazel_rules_closure//closure:defs.bzl", "web_library_external")
+load("//third_party/py:python_configure.bzl", "python_configure")
+
+
+def _is_windows(repository_ctx):
+  """Returns true if the host operating system is windows."""
+  return repository_ctx.os.name.lower().find("windows") != -1
+
+
+def _get_env_var(repository_ctx, name):
+  """Find an environment variable."""
+  if name in repository_ctx.os.environ:
+    return repository_ctx.os.environ[name]
+  else:
+    return None
 
 
 # Parse the bazel version string from `native.bazel_version`.
@@ -14,20 +28,23 @@ def _parse_bazel_version(bazel_version):
 
   # Split into (release, date) parts and only return the release
   # as a tuple of integers.
-  parts = version.split('-', 1)
+  parts = version.split("-", 1)
 
   # Turn "release" into a tuple of strings
   version_tuple = ()
-  for number in parts[0].split('.'):
+  for number in parts[0].split("."):
     version_tuple += (str(number),)
   return version_tuple
 
+
 # Check that a specific bazel version is being used.
 def check_version(bazel_version):
   if "bazel_version" not in dir(native):
-    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % bazel_version)
+    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" %
+         bazel_version)
   elif not native.bazel_version:
-    print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
+    print("\nCurrent Bazel is not a release version, cannot check for " +
+          "compatibility.")
     print("Make sure that you are running at least Bazel %s.\n" % bazel_version)
   else:
     current_bazel_version = _parse_bazel_version(native.bazel_version)
@@ -35,102 +52,120 @@ def check_version(bazel_version):
     if minimum_bazel_version > current_bazel_version:
       fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
           native.bazel_version, bazel_version))
-  pass
+
 
 def _repos_are_siblings():
   return Label("@foo//bar").workspace_root.startswith("../")
 
+
 # Temporary workaround to support including TensorFlow as a submodule until this
 # use-case is supported in the next Bazel release.
 def _temp_workaround_http_archive_impl(repo_ctx):
-   repo_ctx.template("BUILD", repo_ctx.attr.build_file,
-                     {
-                         "%prefix%" : ".." if _repos_are_siblings() else "external",
-                         "%ws%": repo_ctx.attr.repository
-                     }, False)
-   repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256,
-                                 "", repo_ctx.attr.strip_prefix)
-   if repo_ctx.attr.patch_file != None:
-     _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
+  repo_ctx.template("BUILD", repo_ctx.attr.build_file, {
+      "%prefix%": ".." if _repos_are_siblings() else "external",
+      "%ws%": repo_ctx.attr.repository
+  }, False)
+  repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256,
+                                "", repo_ctx.attr.strip_prefix)
+  if repo_ctx.attr.patch_file != None:
+    _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
+
 
 temp_workaround_http_archive = repository_rule(
-   implementation=_temp_workaround_http_archive_impl,
-   attrs = {
-      "build_file": attr.label(),
-      "repository": attr.string(),
-      "patch_file": attr.label(default = None),
-      "urls": attr.string_list(default = []),
-      "sha256": attr.string(default = ""),
-      "strip_prefix": attr.string(default = ""),
-   })
-
-# Executes specified command with arguments and calls 'fail' if it exited with non-zero code
+    implementation = _temp_workaround_http_archive_impl,
+    attrs = {
+        "build_file": attr.label(),
+        "repository": attr.string(),
+        "patch_file": attr.label(default = None),
+        "urls": attr.string_list(default = []),
+        "sha256": attr.string(default = ""),
+        "strip_prefix": attr.string(default = ""),
+    },
+)
+
+
+# Executes specified command with arguments and calls 'fail' if it exited with
+# non-zero code
 def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
-  result = repo_ctx.execute(cmd_and_args)
+  result = repo_ctx.execute(cmd_and_args, timeout=10)
   if result.return_code != 0:
-    fail(("Non-zero return code({1}) when executing '{0}':\n" +
-          "Stdout: {2}\n" +
-          "Stderr: {3}").format(" ".join(cmd_and_args),
-                                result.return_code, result.stdout, result.stderr))
+    fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
+          + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
+                                  result.stdout, result.stderr))
+
 
 # Apply a patch_file to the repository root directory
 # Runs 'patch -p1'
 def _apply_patch(repo_ctx, patch_file):
-  _execute_and_check_ret_code(repo_ctx, ["patch", "-p1",
-                                         "-d", repo_ctx.path("."),
-                                         "-i", repo_ctx.path(patch_file)])
+  cmd = [
+      "patch", "-p1", "-d", repo_ctx.path("."), "-i", repo_ctx.path(patch_file)
+  ]
+  if _is_windows(repo_ctx):
+    bazel_sh = _get_env_var(repo_ctx, "BAZEL_SH")
+    if not bazel_sh:
+      fail("BAZEL_SH environment variable is not set")
+    cmd = [bazel_sh, "-c", " ".join(cmd)]
+  _execute_and_check_ret_code(repo_ctx, cmd)
+
 
 # Download the repository and apply a patch to its root
 def _patched_http_archive_impl(repo_ctx):
-  repo_ctx.download_and_extract(repo_ctx.attr.urls,
-                                sha256 = repo_ctx.attr.sha256,
-                                stripPrefix = repo_ctx.attr.strip_prefix)
+  repo_ctx.download_and_extract(
+      repo_ctx.attr.urls,
+      sha256=repo_ctx.attr.sha256,
+      stripPrefix=repo_ctx.attr.strip_prefix)
   _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
 
+
 patched_http_archive = repository_rule(
     implementation = _patched_http_archive_impl,
     attrs = {
-      "patch_file": attr.label(),
-      "build_file": attr.label(),
-      "repository": attr.string(),
-      "urls": attr.string_list(default = []),
-      "sha256": attr.string(default = ""),
-      "strip_prefix": attr.string(default = ""),
-    })
+        "patch_file": attr.label(),
+        "build_file": attr.label(),
+        "repository": attr.string(),
+        "urls": attr.string_list(default = []),
+        "sha256": attr.string(default = ""),
+        "strip_prefix": attr.string(default = ""),
+    },
+)
+
 
 # If TensorFlow is linked as a submodule.
 # path_prefix and tf_repo_name are no longer used.
-def tf_workspace(path_prefix = "", tf_repo_name = ""):
+def tf_workspace(path_prefix="", tf_repo_name=""):
   # We must check the bazel version before trying to parse any other BUILD
   # files, in case the parsing of those build files depends on the bazel
   # version we require here.
   check_version("0.4.5")
-  cuda_configure(name = "local_config_cuda")
-  sycl_configure(name = "local_config_sycl")
+  cuda_configure(name="local_config_cuda")
+  sycl_configure(name="local_config_sycl")
+  python_configure(name="local_config_python")
   if path_prefix:
-    print("path_prefix was specified to tf_workspace but is no longer used and will be removed in the future.")
+    print("path_prefix was specified to tf_workspace but is no longer used " +
+          "and will be removed in the future.")
   if tf_repo_name:
-    print("tf_repo_name was specified to tf_workspace but is no longer used and will be removed in the future.")
+    print("tf_repo_name was specified to tf_workspace but is no longer used " +
+          "and will be removed in the future.")
 
   native.new_http_archive(
       name = "eigen_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/bitbucket.org/eigen/eigen/get/deff8b280204.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/deff8b280204.tar.gz",
+          "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz",
       ],
-      sha256 = "a39834683eb5bdb9a7434f0ab3621d2cbc3b07e8002db6de101e45ec536723eb",
-      strip_prefix = "eigen-eigen-deff8b280204",
+      sha256 = "ca7beac153d4059c02c8fc59816c82d54ea47fe58365e8aded4082ded0b820c4",
+      strip_prefix = "eigen-eigen-f3a22f35b044",
       build_file = str(Label("//third_party:eigen.BUILD")),
   )
 
   native.new_http_archive(
       name = "libxsmm_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/hfp/libxsmm/archive/1.8.tar.gz",
-          "https://github.com/hfp/libxsmm/archive/1.8.tar.gz",
+          "http://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
+          "https://github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
       ],
-      sha256 = "0330201afb5525d0950ec861fec9dd75eb40a03845ebe03d2c635cf8bfc14fea",
-      strip_prefix = "libxsmm-1.8",
+      sha256 = "2ade869c3f42f23b5263c7d594aa3c7e5e61ac6a3afcaf5d6e42899d2a7986ce",
+      strip_prefix = "libxsmm-1.8.1",
       build_file = str(Label("//third_party:libxsmm.BUILD")),
   )
 
@@ -142,7 +177,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "ortools_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+          "http://mirror.bazel.build/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
           "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
       ],
       sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
@@ -153,7 +188,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.http_archive(
       name = "com_googlesource_code_re2",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
+          "http://mirror.bazel.build/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
           "https://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
       ],
       sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
@@ -163,7 +198,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.http_archive(
       name = "gemmlowp",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
+          "http://mirror.bazel.build/github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
           "https://github.com/google/gemmlowp/archive/a6f29d8ac48d63293f845f2253eccbf86bc28321.tar.gz",
       ],
       sha256 = "75d40ea8e68b0d1644f052fffe8f14a410b2a73d40ccb859a95c0578d194ec26",
@@ -173,7 +208,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "farmhash_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
+          "http://mirror.bazel.build/github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
           "https://github.com/google/farmhash/archive/92e897b282426729f4724d91a637596c7e2fe28f.zip",
       ],
       sha256 = "4c626d1f306bda2c6804ab955892f803f5245f4dcaecb4979dc08b091256da54",
@@ -189,7 +224,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "highwayhash",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
+          "http://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
           "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
       ],
       sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
@@ -200,7 +235,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "nasm",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
+          "http://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
           "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.12.02.tar.bz2/d15843c3fb7db39af80571ee27ec6fad/nasm-2.12.02.tar.bz2",
       ],
       sha256 = "00b0891c678c065446ca59bcee64719d0096d54d6886e6e472aeee2e170ae324",
@@ -211,7 +246,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   temp_workaround_http_archive(
       name = "jpeg",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
+          "http://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
           "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
       ],
       sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
@@ -223,7 +258,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "png_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/glennrp/libpng/archive/v1.2.53.zip",
+          "http://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.zip",
           "https://github.com/glennrp/libpng/archive/v1.2.53.zip",
       ],
       sha256 = "c35bcc6387495ee6e757507a68ba036d38ad05b415c2553b3debe2a57647a692",
@@ -234,7 +269,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "gif_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
+          "http://mirror.bazel.build/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
           "http://ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
           "http://pilotfiber.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
       ],
@@ -246,7 +281,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "six_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
+          "http://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
           "http://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
       ],
       sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a",
@@ -257,7 +292,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "org_pythonhosted_markdown",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
+          "http://mirror.bazel.build/pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
           "https://pypi.python.org/packages/1d/25/3f6d2cb31ec42ca5bd3bfbea99b63892b735d76e26f20dd2dcc34ffe4f0d/Markdown-2.6.8.tar.gz",
       ],
       strip_prefix = "Markdown-2.6.8",
@@ -268,18 +303,18 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "org_html5lib",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/html5lib/html5lib-python/archive/1.0b8.tar.gz",
-          "https://github.com/html5lib/html5lib-python/archive/1.0b8.tar.gz",
+          "http://mirror.bazel.build/github.com/html5lib/html5lib-python/archive/0.9999999.tar.gz",
+          "https://github.com/html5lib/html5lib-python/archive/0.9999999.tar.gz",  # identical to 1.0b8
       ],
-      sha256 = "adb36c879264e8880b92589c4c4fe0814cd9d157b73328b14d728f48a6bab0a4",
-      strip_prefix = "html5lib-python-1.0b8",
+      sha256 = "184257f98539159a433e2a2197309657ae1283b4c44dbd9c87b2f02ff36adce8",
+      strip_prefix = "html5lib-python-0.9999999",
       build_file = str(Label("//third_party:html5lib.BUILD")),
   )
 
   native.new_http_archive(
       name = "org_mozilla_bleach",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/mozilla/bleach/archive/v1.5.tar.gz",
+          "http://mirror.bazel.build/github.com/mozilla/bleach/archive/v1.5.tar.gz",
           "https://github.com/mozilla/bleach/archive/v1.5.tar.gz",
       ],
       strip_prefix = "bleach-1.5",
@@ -290,7 +325,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "org_pocoo_werkzeug",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
+          "http://mirror.bazel.build/pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
           "https://pypi.python.org/packages/b7/7f/44d3cfe5a12ba002b253f6985a4477edfa66da53787a2a838a40f6415263/Werkzeug-0.11.10.tar.gz",
       ],
       strip_prefix = "Werkzeug-0.11.10",
@@ -306,7 +341,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   patched_http_archive(
       name = "protobuf",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
           "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
       sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
@@ -324,7 +359,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.http_archive(
       name = "com_google_protobuf",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
           "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
       sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
@@ -334,7 +369,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
+          "http://mirror.bazel.build/github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
           "https://github.com/google/protobuf/archive/2b7430d96aeff2bb624c8d52182ff5e4b9f7f18a.tar.gz",
       ],
       sha256 = "e5d3d4e227a0f7afb8745df049bbd4d55474b158ca5aaa2a0e31099af24be1d0",
@@ -344,7 +379,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "gmock_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/googletest/archive/release-1.8.0.zip",
+          "http://mirror.bazel.build/github.com/google/googletest/archive/release-1.8.0.zip",
           "https://github.com/google/googletest/archive/release-1.8.0.zip",
       ],
       sha256 = "f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf",
@@ -362,10 +397,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       actual = "@gmock_archive//:gtest_main",
   )
 
-  native.git_repository(
-    name   = "com_github_gflags_gflags",
-    commit = "f8a0efe03aa69b3336d8e228b37d4ccb17324b88",
-    remote = "https://github.com/gflags/gflags.git",
+  native.http_archive(
+      name = "com_github_gflags_gflags",
+      urls = [
+          "http://mirror.bazel.build/github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
+          "https://github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
+      ],
+      sha256 = "4d222fab8f1ede4709cdff417d15a1336f862d7334a81abf76d09c15ecf9acd1",
+      strip_prefix = "gflags-f8a0efe03aa69b3336d8e228b37d4ccb17324b88",
   )
 
   native.bind(
@@ -377,7 +416,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       name = "pcre",
       sha256 = "ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
+          "http://mirror.bazel.build/ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
           "http://ftp.exim.org/pub/pcre/pcre-8.39.tar.gz",
       ],
       strip_prefix = "pcre-8.39",
@@ -388,7 +427,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       name = "swig",
       sha256 = "58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
+          "http://mirror.bazel.build/ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
           "http://ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
           "http://pilotfiber.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz",
       ],
@@ -400,7 +439,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       name = "curl",
       sha256 = "ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/curl.haxx.se/download/curl-7.49.1.tar.gz",
+          "http://mirror.bazel.build/curl.haxx.se/download/curl-7.49.1.tar.gz",
           "https://curl.haxx.se/download/curl-7.49.1.tar.gz",
       ],
       strip_prefix = "curl-7.49.1",
@@ -423,7 +462,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "grpc",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
+          "http://mirror.bazel.build/github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
           "https://github.com/grpc/grpc/archive/d7ff4ff40071d2b486a052183e3e9f9382afb745.tar.gz",
       ],
       sha256 = "a15f352436ab92c521b1ac11e729e155ace38d0856380cf25048c5d1d9ba8e31",
@@ -447,7 +486,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       name = "linenoise",
       sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
+          "http://mirror.bazel.build/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
           "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
       ],
       strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
@@ -459,11 +498,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   temp_workaround_http_archive(
       name = "llvm",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/llvm-mirror/llvm/archive/5d2b26453d4bca5a13b69b0130e4369d1fcd393d.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/5d2b26453d4bca5a13b69b0130e4369d1fcd393d.tar.gz",
+          "http://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/c978c0ff91f7c4ea58cfbd8f378e51c6af2c2b4b.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/c978c0ff91f7c4ea58cfbd8f378e51c6af2c2b4b.tar.gz",
       ],
-      sha256 = "3cecf39bf4b3854629d610bb321bb57e0e46bda9110bd51c3bae5a4171c82bab",
-      strip_prefix = "llvm-5d2b26453d4bca5a13b69b0130e4369d1fcd393d",
+      sha256 = "42c57d798a037d9dea692ce1da8ff4d24966ab5a40494015b374341e43411a37",
+      strip_prefix = "llvm-c978c0ff91f7c4ea58cfbd8f378e51c6af2c2b4b",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
       repository = tf_repo_name,
   )
@@ -471,7 +510,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "jsoncpp_git",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
+          "http://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
           "https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
       ],
       sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
@@ -487,7 +526,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.http_archive(
       name = "boringssl",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",
+          "http://mirror.bazel.build/github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",
           "https://github.com/google/boringssl/archive/bbcaa15b0647816b9a1a9b9e0d209cd6712f0105.tar.gz",  # 2016-07-11
       ],
       sha256 = "025264d6e9a7ad371f2f66d17a28b6627de0c9592dc2eb54afd062f68f1f9aa3",
@@ -497,7 +536,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "nanopb_git",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
+          "http://mirror.bazel.build/github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
           "https://github.com/nanopb/nanopb/archive/1251fa1065afc0d62f635e0f63fec8276e14e13c.tar.gz",
       ],
       sha256 = "ab1455c8edff855f4f55b68480991559e51c11e7dab060bbab7cffb12dd3af33",
@@ -513,7 +552,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   native.new_http_archive(
       name = "zlib_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/zlib.net/zlib-1.2.8.tar.gz",
+          "http://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
           "http://zlib.net/fossils/zlib-1.2.8.tar.gz",
       ],
       sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
@@ -526,10 +565,20 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       actual = "@zlib_archive//:zlib",
   )
 
+  native.new_http_archive(
+      name = "fft2d",
+      urls = [
+          "http://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
+          "http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
+      ],
+      sha256 = "52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296",
+      build_file = str(Label("//third_party/fft2d:fft2d.BUILD")),
+  )
+
   temp_workaround_http_archive(
       name = "snappy",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/google/snappy/archive/1.1.4.zip",
+          "http://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.zip",
           "https://github.com/google/snappy/archive/1.1.4.zip",
       ],
       sha256 = "6c74d2b663170d68184da353cdd71b5b7d57bc8888ef1e99b4929b5d680dba54",
@@ -541,14 +590,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   temp_workaround_http_archive(
       name = "nccl_archive",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
-          "https://github.com/nvidia/nccl/archive/024d1e267845f2ed06f3e2e42476d50f04a00ee6.tar.gz",
+          "http://mirror.bazel.build/github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz",
+          "https://github.com/nvidia/nccl/archive/ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b.tar.gz",
       ],
-      sha256 = "6787f0eed88d52ee8e32956fa4947d92c139da469f1d8e311c307f27d641118e",
-      strip_prefix = "nccl-024d1e267845f2ed06f3e2e42476d50f04a00ee6",
-      build_file = str(Label("//third_party/nccl:nccl.BUILD")),
-      # TODO: Remove patching after the fix is merged into nccl(see https://github.com/NVIDIA/nccl/pull/78)
-      patch_file = str(Label("//third_party/nccl:fix_clang_compilation.patch")),
+      sha256 = "6c34a0862d9f8ed4ad5984c6a8206b351957bb14cf6ad7822720f285f4aada04",
+      strip_prefix = "nccl-ccfc4567dc3e2a37fb42cfbc64d10eb526e7da7b",
+      build_file = str(Label("//third_party:nccl.BUILD")),
       repository = tf_repo_name,
   )
 
@@ -556,7 +603,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       name = "junit",
       jar_sha256 = "59721f0805e223d84b90677887d9ff567dc534d7c502ca903c0c2b17f05c116a",
       jar_urls = [
-          "http://bazel-mirror.storage.googleapis.com/repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar",
+          "http://mirror.bazel.build/repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar",
           "http://repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar",
           "http://maven.ibiblio.org/maven2/junit/junit/4.12/junit-4.12.jar",
       ],
@@ -569,7 +616,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       name = "org_hamcrest_core",
       jar_sha256 = "66fdef91e9739348df7a096aa384a5685f4e875584cce89386a7a47251c4d8e9",
       jar_urls = [
-          "http://bazel-mirror.storage.googleapis.com/repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
+          "http://mirror.bazel.build/repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
           "http://repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
           "http://maven.ibiblio.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar",
       ],
@@ -580,7 +627,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   temp_workaround_http_archive(
       name = "jemalloc",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
+          "http://mirror.bazel.build/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
           "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
       ],
       sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
@@ -589,6 +636,17 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       repository = tf_repo_name,
   )
 
+  native.new_http_archive(
+      name = "com_google_pprof",
+      urls = [
+          "http://mirror.bazel.build/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
+          "https://github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
+      ],
+      sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4",
+      strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650",
+      build_file = str(Label("//third_party:pprof.BUILD")),
+  )
+
   ##############################################################################
   # TensorBoard Build Tools
 
@@ -606,23 +664,27 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       licenses = ["notice"],
       sha256_urls_extract_macos = {
           "47109a00cac344d80296c195451bb5eee7c21727fcef1594384ddfe1f852957a": [
-              "http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
+              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
               "http://nodejs.org/dist/v4.3.2/node-v4.3.2-darwin-x64.tar.xz",
           ],
       },
       sha256_urls_windows = {
+          "3d4cfca9dcec556a077a2324bf5bd165ea3e6e64a2bfd7fc6e7a1f0dc4eb552b": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/nodejs/node/v4.3.2/LICENSE",
+              "https://raw.githubusercontent.com/nodejs/node/v4.3.2/LICENSE",
+          ],
           "606c44c42d17866c017c50c0afadad411d9492ac4281d2431b937f881911614e": [
-              "http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/win-x64/node.exe",
+              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/win-x64/node.exe",
               "http://nodejs.org/dist/v4.3.2/win-x64/node.exe",
           ],
           "451a40570099a95488d6438f175813629e0430f87f23c8659bc18dc42494820a": [
-              "http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/win-x64/node.lib",
+              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/win-x64/node.lib",
               "http://nodejs.org/dist/v4.3.2/win-x64/node.lib",
           ],
       },
       sha256_urls_extract = {
           "4350d0431b49697517c6cca5d66adf5f74eb9101c52f52ae959fa94225822d44": [
-              "http://bazel-mirror.storage.googleapis.com/nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
+              "http://mirror.bazel.build/nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
               "http://nodejs.org/dist/v4.3.2/node-v4.3.2-linux-x64.tar.xz",
           ],
       },
@@ -640,13 +702,17 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       name = "com_microsoft_typescript",
       licenses = ["notice"],  # Apache 2.0
       sha256_urls = {
-          "e3d9e320a2cae99be4aaa37953961a48323cdf16ba9aa2557a44d69571cd9b8d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/Microsoft/TypeScript/v2.1.6/lib/tsc.js",
-              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.1.6/lib/tsc.js",
+          "a7d00bfd54525bc694b6e32f64c7ebcf5e6b7ae3657be5cc12767bce74654a47": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/LICENSE.txt",
+              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/LICENSE.txt",
+          ],
+          "8465342c318f9c4cf0a29b109fa63ee3742dd4dc7080d05d9fd8f604814d04cf": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/tsc.js",
+              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/tsc.js",
           ],
-          "f189cebe96eb76b238c6e364e72d4b0324e699f83eeae5deac23506cb3764fc6": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/Microsoft/TypeScript/v2.1.6/lib/lib.es6.d.ts",
-              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.1.6/lib/lib.es6.d.ts",
+          "a67e36da3029d232e4e938e61a0a3302f516d71e7100d54dbf5362ad8618e994": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/lib.es6.d.ts",
+              "https://raw.githubusercontent.com/Microsoft/TypeScript/v2.3.1/lib/lib.es6.d.ts",
           ],
       },
       extra_build_file_content = "\n".join([
@@ -678,15 +744,17 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   ##############################################################################
   # TensorBoard JavaScript Production Dependencies
 
-  filegroup_external(
+  web_library_external(
       name = "com_lodash",
       licenses = ["notice"],  # MIT
-      sha256_urls = {
-          "7c7b391810bc08cf815683431857c51b5ee190062ae4f557e1e4689d6dd910ea": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/lodash/lodash/3.8.0/lodash.js",
-              "https://raw.githubusercontent.com/lodash/lodash/3.8.0/lodash.js",
-          ],
-      },
+      sha256 = "0e88207e5f90af4ce8790d6e1e7d09d2702d81bce0bafdc253d18c0a5bf7661e",
+      urls = [
+          "http://mirror.bazel.build/github.com/lodash/lodash/archive/3.10.1.tar.gz",
+          "https://github.com/lodash/lodash/archive/3.10.1.tar.gz",
+      ],
+      strip_prefix = "lodash-3.10.1",
+      path = "/lodash",
+      srcs = ["lodash.js"],
   )
 
   filegroup_external(
@@ -694,8 +762,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       # no @license header
       licenses = ["notice"],  # MIT
       sha256_urls = {
+          "0e94aada97f12dee6118064add9170484c55022f5d53206ee4407143cd36ddcd": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/sloisel/numeric/v1.2.6/license.txt",
+              "https://raw.githubusercontent.com/sloisel/numeric/v1.2.6/license.txt",
+          ],
           "dfaca3b8485bee735788cc6eebca82ea25719adc1fb8911c7799c6bd5a95df3b": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
               "https://raw.githubusercontent.com/sloisel/numeric/v1.2.6/src/numeric.js",
           ],
       },
@@ -707,27 +779,47 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       licenses = ["notice"],  # MIT
       sha256_urls = {
           "77510d7538dbd3b59f1c8a06f68131b38562e3be546364747618d5112723e818": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.css",
+              "http://mirror.bazel.build/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.css",
               "https://raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.css",
           ],
           "cd46dc709b01cd361e8399f797760871a6a207bc832e08fcff385ced02ef2b43": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.d.ts",
               "https://raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.d.ts",
           ],
           "32647b0fb4175fa875a71e6d56c761b88d975186ed6a8820e2c7854165a8988d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.js",
               "https://raw.githubusercontent.com/palantir/plottable/v1.16.1/plottable.js",
           ],
       },
   )
 
+  # TODO: Delete previous rule and rename this one org_palantir_plottable
+  filegroup_external(
+      name = "com_palantir_plottable_v3",
+      # no @license header
+      licenses = ["notice"],  # MIT
+      sha256_urls_extract = {
+          # Plottable doesn't have a release tarball on GitHub. Using the
+          # sources directly from git also requires running Node tooling
+          # beforehand to generate files. NPM is the only place to get it.
+          "e3159beb279391c47433789f22b32bac88488cfcad6c0b6ec8605ce6b0081b0d": [
+              "http://mirror.bazel.build/registry.npmjs.org/plottable/-/plottable-3.1.0.tgz",
+              "https://registry.npmjs.org/plottable/-/plottable-3.1.0.tgz",
+          ],
+      },
+  )
+
   filegroup_external(
       name = "io_github_cpettitt_dagre",
       # no @license header
       licenses = ["notice"],  # MIT
       sha256_urls = {
+          "6a349742a6cb219d5a2fc8d0844f6d89a6efc62e20c664450d884fc7ff2d6015": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/dagre/v0.7.4/LICENSE",
+              "https://raw.githubusercontent.com/cpettitt/dagre/v0.7.4/LICENSE",
+          ],
           "7323829ddd77924a69e2b1235ded3eac30acd990da0f037e0fbd3c8e9035b50d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
               "https://raw.githubusercontent.com/cpettitt/dagre/v0.7.4/dist/dagre.core.js",
           ],
       },
@@ -735,11 +827,14 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
   filegroup_external(
       name = "io_github_cpettitt_graphlib",
-      # no @license header
       licenses = ["notice"],  # MIT
       sha256_urls = {
+          "6a349742a6cb219d5a2fc8d0844f6d89a6efc62e20c664450d884fc7ff2d6015": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/LICENSE",
+              "https://raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/LICENSE",
+          ],
           "772045d412b1513b549be991c2e1846c38019429d43974efcae943fbe83489bf": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
               "https://raw.githubusercontent.com/cpettitt/graphlib/v1.0.7/dist/graphlib.core.js",
           ],
       },
@@ -750,8 +845,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       # no @license header
       licenses = ["notice"],  # MIT
       sha256_urls = {
+          "633f2861a9a862b9cd7967e841e14dd3527912f209d6563595774fa31e3d84cb": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/LICENSES",
+              "https://raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/LICENSE",
+          ],
           "f138fce57f673ca8a633f4aee5ae5b6fcb6ad0de59069a42a74e996fd04d8fcc": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
               "https://raw.githubusercontent.com/waylonflinn/weblas/v0.9.0/dist/weblas.js",
           ],
       },
@@ -763,74 +862,392 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       licenses = ["notice"],  # BSD-3-Clause
       sha256_urls = {
           "bc1e38838f5c5c8e040132d41efee6bfddbef728210bd566479dc1694af1d3f5": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/d3/d3/v3.5.15/d3.js",
+              "http://mirror.bazel.build/raw.githubusercontent.com/d3/d3/v3.5.15/d3.js",
               "https://raw.githubusercontent.com/d3/d3/v3.5.15/d3.js",
           ],
       },
   )
 
+  # TODO: Delete previous rule and rename this one org_d3js
+  filegroup_external(
+      name = "org_d3js_v4",
+      # no @license header
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256_urls_extract = {
+          "b5fac5b296bc196e6aa7b59f9e33986fc44d23d59a0e211705187be9e35b943d": [
+              "http://mirror.bazel.build/github.com/d3/d3/releases/download/v4.8.0/d3.zip",
+              "https://github.com/d3/d3/releases/download/v4.8.0/d3.zip",
+          ],
+      },
+      # TODO(jart): Use srcs=["d3.js"] instead of this once supported.
+      generated_rule_name = "all_files",
+      extra_build_file_content = "\n".join([
+          "filegroup(",
+          "    name = \"org_d3js_v4\",",
+          "    srcs = [\"d3.js\"],",
+          ")",
+      ]),
+  )
+
   filegroup_external(
       name = "org_definitelytyped",
       licenses = ["notice"],  # MIT
       sha256_urls = {
           "b7da645f6e5555feb7aeede73775da0023ce2257df9c8e76c9159266035a9c0d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
               "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/chai/chai.d.ts",
           ],
           "177293828c7a206bf2a7f725753d51396d38668311aa37c96445f91bbf8128a7": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",
-              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",  # v3
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/6e2f2280ef16ef277049d0ce8583af167d586c59/d3/d3.d.ts",  # v3
           ],
           "e4cd3d5de0eb3bc7b1063b50d336764a0ac82a658b39b5cf90511f489ffdee60": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/efd40e67ff323f7147651bdbef03c03ead7b1675/lodash/lodash.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/efd40e67ff323f7147651bdbef03c03ead7b1675/lodash/lodash.d.ts",
               "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/efd40e67ff323f7147651bdbef03c03ead7b1675/lodash/lodash.d.ts",
           ],
           "695a03dd2ccb238161d97160b239ab841562710e5c4e42886aefd4ace2ce152e": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
               "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/ebc69904eb78f94030d5d517b42db20867f679c0/mocha/mocha.d.ts",
           ],
+          "513ccd9ee1c708881120eeacd56788fc3b3da8e5c6172b20324cebbe858803fe": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/708609e0764daeb5eb64104af7aca50c520c4e6e/sinon/sinon.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/708609e0764daeb5eb64104af7aca50c520c4e6e/sinon/sinon.d.ts",
+          ],
+          "44eba36339bd1c0792072b7b204ee926fe5ffe1e9e2da916e67ac55548e3668a": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/a872802c0c84ba98ff207d5e673a1fa867c67fd6/polymer/polymer.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/a872802c0c84ba98ff207d5e673a1fa867c67fd6/polymer/polymer.d.ts",
+          ],
+          "9453c3e6bae824e90758c3b38975c1ed77e6abd79bf513bcb08368fcdb14898e": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/f5407eba29c04fb8387c86df27512bd055b195d2/threejs/three.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/f5407eba29c04fb8387c86df27512bd055b195d2/threejs/three.d.ts",
+          ],
+          "691756a6eb455f340c9e834de0d49fff269e7b8c1799c2454465dcd6a4435b80": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/46719185c564694c5583c4b7ad94dbb786ecad46/webcomponents.js/webcomponents.js.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/46719185c564694c5583c4b7ad94dbb786ecad46/webcomponents.js/webcomponents.js.d.ts",
+          ],
       },
   )
 
   filegroup_external(
-      name = "org_threejs",
-      # no @license header
+      name = "org_definitelytyped_types_d3_array",
       licenses = ["notice"],  # MIT
       sha256_urls = {
-          "7aff264bd84c90bed3c72a4dc31db8c19151853c6df6980f52b01d3e9872c82d": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
-              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
+          "61e7abb7b1f01fbcb0cab8cf39003392f422566209edd681fbd070eaa84ca000": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-array/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-array/index.d.ts",
           ],
-          "0e98ded15bb7fe398a655667e76b39909d36c0973a8950d01c62f65f93161c27": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
-              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_axis",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "95f75c8dcc89850b2e72581d96a7b5f46ea4ac852f828893f141f14a597421f9": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-axis/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-axis/index.d.ts",
           ],
       },
   )
 
-  ##############################################################################
-  # TensorBoard JavaScript Testing Dependencies
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_brush",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "a2738e693ce8a8640c2d29001e77582c9c361fd23bda44db471629866b60ada7": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-brush/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-brush/index.d.ts",
+          ],
+      },
+  )
 
   filegroup_external(
-      name = "com_chaijs",
-      # no @license header
+      name = "org_definitelytyped_types_d3_chord",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "c54d24756eb6d744b31e538ad9bab3a75f6d54e2288b29cc72338d4a057d3e83": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-chord/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-chord/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_collection",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "f987667167b1d2970911247e325eb1c37ca0823646f81ccec837ae59039822f7": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-collection/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-collection/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_color",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "9580c81f38ddcce7be0ac9bd3d0d083adebc34e17441709f90b9e4dcd1c19a56": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-color/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-color/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_dispatch",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "169f80b4cceca8e2e9ed384d81a5db0624cc01a26451dfb5a7e0cec6ea9cfb06": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dispatch/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dispatch/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_drag",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "08d35d139dde58c2722be98d718d01204fd6167d310f09b379e832f3c741489d": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-drag/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-drag/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_dsv",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "62594d00cf9e4bb895339c8e56f64330e202a5eb2a0fa580a1f6e6336f2c93ce": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dsv/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-dsv/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_ease",
       licenses = ["notice"],  # MIT
       sha256_urls = {
-          "b926b325ad9843bf0b7a6d580ef78bb560e47c484b98680098d4fd9b31b77cd9": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/chaijs/chai/2.3.0/chai.js",
-              "https://raw.githubusercontent.com/chaijs/chai/2.3.0/chai.js",
+          "d1cf8f99b7bf758c2ba3c0a4ce553e151d4d9b4cf45a6e8bd0edec7ce90f725b": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-ease/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-ease/index.d.ts",
           ],
       },
   )
 
   filegroup_external(
-      name = "org_mochajs",
+      name = "org_definitelytyped_types_d3_force",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "288421e2008668d2076a4684657dd3d29b992832ef02c552981eb94a91042553": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-force/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-force/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_format",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "b42cb17e580c1fd0b64d478f7bd80ca806efaefda24426a833cf1f30a7275bca": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-format/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-format/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_hierarchy",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "a5683f5835d8716c6b89c075235078438cfab5897023ed720bfa492e244e969e": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-hierarchy/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-hierarchy/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_interpolate",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "590a71b741323ac3139b333ec8b743e24717fdd5b32bcff48ee521162a9dfe1c": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-interpolate/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-interpolate/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_path",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "96f35ba041bcaa265e2b373ee675177410d44d31c980e4f7fbeefd4bcba15b00": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-path/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-path/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_polygon",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "ce453451e8105cac6a4f4a4263ca2142ebb4bf442e342f470a81da691f220fcb": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-polygon/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-polygon/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_quadtree",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "238e278f1be5d6985a19800800cffee80f81199f71d848e3bbc288d1791a6f90": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-quadtree/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-quadtree/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_queue",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "e6ae19aad83495475653578de64fb9d6bf9764eda6c84d70f7935ec84bcc482e": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-queue/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-queue/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_random",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "d31b92ed86c23ec0a4776f99fa81ff033c95b96c8304d8aa9baf3b94af779aa8": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-random/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-random/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_request",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "44bb7b07d977028e6567540a3303b06fc9b33fb0960bc75c520e0733c840d89f": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-request/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-request/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_scale",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "02ce7c644ba34bd1abb84da2e832f248b048b6a23812be4365bd837f186c9f1f": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-scale/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-scale/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_selection",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "699043ddb28dfa5e46d87bc6a24cfc6d604237f298259d3fb3c7066e05e8c86e": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-selection/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-selection/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_shape",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "62668a7aaaf6232762b544f9f89c0f557ca7cfb0cd343a358dda7ecbe26f5739": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-shape/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-shape/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_time",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "0502490ce682fd9265fb1d5d693ce6cd82e3b05e5f5ee3433731266ecb03d5fc": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-time/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-time/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_timer",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "6f191f9aea704aa64b1defa40dfdff1447a6e6bb815feff1660f894500a9c94d": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-timer/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-timer/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_transition",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "a0a7c0c9bfb5c7d6d9d22a8d16b4484b66d13f2ed226954037546cb3da4098ba": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-transition/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-transition/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_voronoi",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "c6bd5f229f915151d0ef678fe50b1aa6a62334ea0a8c6fc0effbac9f7032efc7": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-voronoi/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-voronoi/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_definitelytyped_types_d3_zoom",
+      licenses = ["notice"],  # MIT
+      sha256_urls = {
+          "a25dc17fbd304cf7a0e5e7bbb8339c930d464eb40c4d6e5f839ce9c0191f4110": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-zoom/index.d.ts",
+              "https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/1550dfd1b8e38d9bf104b3fd16ea9bf98a2b358e/types/d3-zoom/index.d.ts",
+          ],
+      },
+  )
+
+  filegroup_external(
+      name = "org_threejs",
       # no @license header
       licenses = ["notice"],  # MIT
       sha256_urls = {
-          "e36d865a17ffdf5868e55e736526ae30f3d4bc667c85a2a28cd5c850a82361e2": [
-              "http://bazel-mirror.storage.googleapis.com/raw.githubusercontent.com/mochajs/mocha/2.3.4/mocha.js",
-              "https://raw.githubusercontent.com/mochajs/mocha/2.3.4/mocha.js",
+          "7aff264bd84c90bed3c72a4dc31db8c19151853c6df6980f52b01d3e9872c82d": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
+              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/build/three.js",
+          ],
+          "0e98ded15bb7fe398a655667e76b39909d36c0973a8950d01c62f65f93161c27": [
+              "http://mirror.bazel.build/raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
+              "https://raw.githubusercontent.com/mrdoob/three.js/ad419d40bdaab80abbb34b8f359b4ee840033a02/examples/js/controls/OrbitControls.js",
           ],
       },
   )
@@ -838,12 +1255,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
   ##############################################################################
   # TensorBoard Polymer Dependencies
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_font_roboto",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "fae51429b56a4a4c15f1f0c23b733c7095940cc9c04c275fa7adb3bf055b23b3",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
           "https://github.com/PolymerElements/font-roboto/archive/v1.0.1.tar.gz",
       ],
       strip_prefix = "font-roboto-1.0.1",
@@ -851,12 +1268,30 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       srcs = ["roboto.html"],
   )
 
-  webfiles_external(
+  web_library_external(
+      name = "org_polymer_hydrolysis",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "703b50f6b00f9e0546b5a3451da57bb20f77a166e27e4967923b9e835bab9b80",
+      urls = [
+          "http://mirror.bazel.build/github.com/Polymer/polymer-analyzer/archive/v1.19.3.tar.gz",
+          "https://github.com/Polymer/polymer-analyzer/archive/v1.19.3.tar.gz",
+      ],
+      strip_prefix = "polymer-analyzer-1.19.3",
+      path = "/hydrolysis",
+      srcs = [
+          "hydrolysis-analyzer.html",
+          "hydrolysis.html",
+          "hydrolysis.js",
+      ],
+      deps = ["@org_polymer"],
+  )
+
+  web_library_external(
       name = "org_polymer_iron_a11y_announcer",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6bce143db7a374a68535ec8b861a5f30e81f2f1e4ee36a55bda2a891f6fd2818",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
           "https://github.com/PolymerElements/iron-a11y-announcer/archive/v1.0.5.tar.gz",
       ],
       strip_prefix = "iron-a11y-announcer-1.0.5",
@@ -865,12 +1300,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_a11y_keys_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6823efc47a83208fd51d39c5a1d3eb0c0bebc705df1ce01310509da22a13ebd2",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
           "https://github.com/PolymerElements/iron-a11y-keys-behavior/archive/v1.1.8.tar.gz",
       ],
       strip_prefix = "iron-a11y-keys-behavior-1.1.8",
@@ -879,12 +1314,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_ajax",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "9162d8af4611e911ac3ebbfc08bb7038ac04f6e79a9287b1476fe36ad6770bc5",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
           "https://github.com/PolymerElements/iron-ajax/archive/v1.2.0.tar.gz",
       ],
       strip_prefix = "iron-ajax-1.2.0",
@@ -899,12 +1334,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_autogrow_textarea",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "50bbb901d2c8f87462e3552e3d671a552faa12c37c485e548d7a234ebffbc427",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
           "https://github.com/PolymerElements/iron-autogrow-textarea/archive/v1.0.12.tar.gz",
       ],
       strip_prefix = "iron-autogrow-textarea-1.0.12",
@@ -919,12 +1354,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_behaviors",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a1e8d4b7a13f3d36beba9c2a6b186ed33a53e6af2e79f98c1fcc7e85e7b53f89",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
           "https://github.com/PolymerElements/iron-behaviors/archive/v1.0.17.tar.gz",
       ],
       strip_prefix = "iron-behaviors-1.0.17",
@@ -939,12 +1374,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_checked_element_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "539a0e1c4df0bc702d3bd342388e4e56c77ec4c2066cce69e41426a69f92e8bd",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
           "https://github.com/PolymerElements/iron-checked-element-behavior/archive/v1.0.4.tar.gz",
       ],
       strip_prefix = "iron-checked-element-behavior-1.0.4",
@@ -957,12 +1392,37 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
+      name = "org_polymer_iron_component_page",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "3636e8b9a1f229fc33b5aad3933bd02a9825f66e679a0be31855d7c8245c4b4b",
+      urls = [
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-component-page/archive/v1.1.4.tar.gz",
+          "https://github.com/PolymerElements/iron-component-page/archive/v1.1.4.tar.gz",
+      ],
+      strip_prefix = "iron-component-page-1.1.4",
+      path = "/iron-component-page",
+      srcs = ["iron-component-page.html"],
+      deps = [
+          "@org_polymer",
+          "@org_polymer_hydrolysis",
+          "@org_polymer_iron_ajax",
+          "@org_polymer_iron_doc_viewer",
+          "@org_polymer_iron_flex_layout",
+          "@org_polymer_iron_icons",
+          "@org_polymer_iron_selector",
+          "@org_polymer_paper_header_panel",
+          "@org_polymer_paper_styles",
+          "@org_polymer_paper_toolbar",
+      ],
+  )
+
+  web_library_external(
       name = "org_polymer_iron_collapse",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "275808994a609a2f9923e2dd2db1957945ab141ba840eadc33f19e1f406d600e",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
           "https://github.com/PolymerElements/iron-collapse/archive/v1.0.8.tar.gz",
       ],
       strip_prefix = "iron-collapse-1.0.8",
@@ -974,12 +1434,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_demo_helpers",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "aa7458492a6ac3d1f6344640a4c2ab07bce64e7ad0422b83b5d665707598cce6",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
           "https://github.com/PolymerElements/iron-demo-helpers/archive/v1.1.0.tar.gz",
       ],
       strip_prefix = "iron-demo-helpers-1.1.0",
@@ -999,12 +1459,37 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
+      name = "org_polymer_iron_doc_viewer",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "f0e9dfbbcd94d7e88ce82cb61e615406ace63c185fee9396f7f182206ca5cc9a",
+      urls = [
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-doc-viewer/archive/v1.0.12.tar.gz",
+          "https://github.com/PolymerElements/iron-doc-viewer/archive/v1.0.12.tar.gz",
+      ],
+      strip_prefix = "iron-doc-viewer-1.0.12",
+      path = "/iron-doc-viewer",
+      srcs = [
+          "iron-doc-property-styles.html",
+          "iron-doc-property.html",
+          "iron-doc-viewer-styles.html",
+          "iron-doc-viewer.html",
+      ],
+      deps = [
+          "@org_polymer",
+          "@org_polymer_marked_element",
+          "@org_polymer_paper_button",
+          "@org_polymer_paper_styles",
+          "@org_polymer_prism_element",
+      ],
+  )
+
+  web_library_external(
       name = "org_polymer_iron_dropdown",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "f7e4a31d096d10d8af1920397695cb17f3eb1cbe5e5ff91a861dabfcc085f376",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
           "https://github.com/PolymerElements/iron-dropdown/archive/v1.4.0.tar.gz",
       ],
       strip_prefix = "iron-dropdown-1.4.0",
@@ -1023,12 +1508,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_fit_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "10132a2ea309a37c4c07b8fead71f64abc588ee6107931e34680f5f36dd8291e",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
           "https://github.com/PolymerElements/iron-fit-behavior/archive/v1.2.5.tar.gz",
       ],
       strip_prefix = "iron-fit-behavior-1.2.5",
@@ -1037,12 +1522,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_flex_layout",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "79287f6ca1c2d4e003f68b88fe19d03a1b6a0011e2b4cae579fe4d1474163a2e",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
           "https://github.com/PolymerElements/iron-flex-layout/archive/v1.3.0.tar.gz",
       ],
       strip_prefix = "iron-flex-layout-1.3.0",
@@ -1056,12 +1541,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_form_element_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "1dd9371c638e5bc2ecba8a64074aa680dfb8712198e9612f9ed24d387efc8f26",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
           "https://github.com/PolymerElements/iron-form-element-behavior/archive/v1.0.6.tar.gz",
       ],
       strip_prefix = "iron-form-element-behavior-1.0.6",
@@ -1070,12 +1555,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_icon",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "9ed58a69159a02c07a6050d242e6d4e585a29f3245b8c8c390cfd52ddb786dc4",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
           "https://github.com/PolymerElements/iron-icon/archive/v1.0.11.tar.gz",
       ],
       strip_prefix = "iron-icon-1.0.11",
@@ -1088,12 +1573,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_icons",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "3b18542c147c7923dc3a36b1a51984a73255d610f297d43c9aaccc52859bd0d0",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
           "https://github.com/PolymerElements/iron-icons/archive/v1.1.3.tar.gz",
       ],
       strip_prefix = "iron-icons-1.1.3",
@@ -1117,12 +1602,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_iconset_svg",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "7e3925b7e63a7d22524c4b43ce16ab80d06a576649644783643c11a003284368",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
           "https://github.com/PolymerElements/iron-iconset-svg/archive/v1.1.0.tar.gz",
       ],
       strip_prefix = "iron-iconset-svg-1.1.0",
@@ -1134,12 +1619,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_input",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "c505101ead08ab25526b1f49baecc8c28b4221b92a65e7334c783bdc81553c36",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
           "https://github.com/PolymerElements/iron-input/archive/1.0.10.tar.gz",
       ],
       strip_prefix = "iron-input-1.0.10",
@@ -1152,12 +1637,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_list",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "72a6530b9f0ad5557f5d287845792a0ada74d8b159198e27f940e226313dc116",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
           "https://github.com/PolymerElements/iron-list/archive/v1.3.9.tar.gz",
       ],
       strip_prefix = "iron-list-1.3.9",
@@ -1171,12 +1656,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_menu_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "ad27889343bc9a709258b073f69abc028bb1ffd3fdb975cd2d3939f7f5d7bb6c",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
           "https://github.com/PolymerElements/iron-menu-behavior/archive/v1.1.10.tar.gz",
       ],
       strip_prefix = "iron-menu-behavior-1.1.10",
@@ -1192,12 +1677,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_meta",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "fb05e6031bae6b4effe5f15d44b3f548d5807f9e3b3aa2442ba17cf4b8b84361",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
           "https://github.com/PolymerElements/iron-meta/archive/v1.1.1.tar.gz",
       ],
       strip_prefix = "iron-meta-1.1.1",
@@ -1206,12 +1691,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_overlay_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "3df5b54ff2e0510c87a2aff8c9d730d3fe83d3d11277cc1a49fa29b549acb46c",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
           "https://github.com/PolymerElements/iron-overlay-behavior/archive/v1.10.1.tar.gz",
       ],
       strip_prefix = "iron-overlay-behavior-1.10.1",
@@ -1230,12 +1715,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_range_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "b2f2b6d52284542330bd30b586e217926eb0adec5e13934a3cef557717c22dc2",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
           "https://github.com/PolymerElements/iron-range-behavior/archive/v1.0.4.tar.gz",
       ],
       strip_prefix = "iron-range-behavior-1.0.4",
@@ -1244,12 +1729,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_resizable_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a87a78ee9223c2f6afae7fc94a3ff91cbce6f7e2a7ed3f2979af7945c9281616",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
           "https://github.com/PolymerElements/iron-resizable-behavior/archive/v1.0.3.tar.gz",
       ],
       strip_prefix = "iron-resizable-behavior-1.0.3",
@@ -1258,12 +1743,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_scroll_target_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "d0de0c804b1ec91d814754144afd9da1cdb082690de88bd5e47fd5f41990746f",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
           "https://github.com/PolymerElements/iron-scroll-target-behavior/archive/v1.0.3.tar.gz",
       ],
       strip_prefix = "iron-scroll-target-behavior-1.0.3",
@@ -1272,12 +1757,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_selector",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "ba28a47443bad3b744611c9d7a79fb21dbdf2e35edc5ef8f812e2dcd72b16747",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
           "https://github.com/PolymerElements/iron-selector/archive/v1.5.2.tar.gz",
       ],
       strip_prefix = "iron-selector-1.5.2",
@@ -1291,12 +1776,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_iron_validatable_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "aef4901e68043824f36104799269573dd345ffaac494186e466fdc79c06fdb63",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
           "https://github.com/PolymerElements/iron-validatable-behavior/archive/v1.1.1.tar.gz",
       ],
       strip_prefix = "iron-validatable-behavior-1.1.1",
@@ -1308,12 +1793,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_marked",
       licenses = ["notice"],  # MIT
       sha256 = "93d30bd593736ca440938d77808b7ef5972da0f3fcfe4ae63ae7b4ce117da2cb",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/chjj/marked/archive/v0.3.2.zip",
+          "http://mirror.bazel.build/github.com/chjj/marked/archive/v0.3.2.zip",
           "https://github.com/chjj/marked/archive/v0.3.2.zip",
       ],
       strip_prefix = "marked-0.3.2",
@@ -1321,12 +1806,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       srcs = ["lib/marked.js"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_marked_element",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "7547616df95f8b903757e6afbabfcdba5322c2bcec3f17c726b8bba5adf4bc5f",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
           "https://github.com/PolymerElements/marked-element/archive/v1.1.3.tar.gz",
       ],
       strip_prefix = "marked-element-1.1.3",
@@ -1341,12 +1826,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_neon_animation",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "8800c314a76b2da190a2b203259c1091f6d38e0057ed37c2a3d0b734980fa9a5",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
           "https://github.com/PolymerElements/neon-animation/archive/v1.2.2.tar.gz",
       ],
       strip_prefix = "neon-animation-1.2.2",
@@ -1390,12 +1875,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_behaviors",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "7cfcb9082ef9909da262df6b5c120bc62dbeaff278cb563e8fc60465ddd387e5",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
           "https://github.com/PolymerElements/paper-behaviors/archive/v1.0.12.tar.gz",
       ],
       strip_prefix = "paper-behaviors-1.0.12",
@@ -1414,12 +1899,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "896c0a7e34bfcce63fc23c63e105ed9c4d62fa3a6385b7161e1e5cd4058820a6",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
           "https://github.com/PolymerElements/paper-button/archive/v1.0.11.tar.gz",
       ],
       strip_prefix = "paper-button-1.0.11",
@@ -1434,12 +1919,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_checkbox",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6828a6954a048b1230fbd2606faffbae950ba1d042175b96ec50ae355786a166",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
           "https://github.com/PolymerElements/paper-checkbox/archive/v1.4.0.tar.gz",
       ],
       strip_prefix = "paper-checkbox-1.4.0",
@@ -1452,12 +1937,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_dialog",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "c6a9709e7f528d03dcd574503c18b72d4751ca30017346d16e6a791d37ed9259",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
           "https://github.com/PolymerElements/paper-dialog/archive/v1.0.4.tar.gz",
       ],
       strip_prefix = "paper-dialog-1.0.4",
@@ -1470,12 +1955,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_dialog_behavior",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a7e0e27ce63554bc14f384cf94bcfa24da8dc5f5120dfd565f45e166261aee40",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
           "https://github.com/PolymerElements/paper-dialog-behavior/archive/v1.2.5.tar.gz",
       ],
       strip_prefix = "paper-dialog-behavior-1.2.5",
@@ -1485,7 +1970,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
           "paper-dialog-common.css",
           "paper-dialog-shared-styles.html",
       ],
-      suppress = ["cssSyntax"],
       deps = [
           "@org_polymer",
           "@org_polymer_iron_flex_layout",
@@ -1494,12 +1978,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_dialog_scrollable",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a2e69283e7674f782c44d811387a0f8da2d01fac0172743d1add65e253e6b5ff",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
           "https://github.com/PolymerElements/paper-dialog-scrollable/archive/1.1.5.tar.gz",
       ],
       strip_prefix = "paper-dialog-scrollable-1.1.5",
@@ -1513,12 +1997,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_dropdown_menu",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "9d88f654ec03ee9be211df9e69bede9e8a22b51bf1dbcc63b79762e4256d81ad",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
           "https://github.com/PolymerElements/paper-dropdown-menu/archive/v1.4.0.tar.gz",
       ],
       strip_prefix = "paper-dropdown-menu-1.4.0",
@@ -1545,12 +2029,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_header_panel",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "0db4bd8a4bf6f20dcd0dffb4f907b31c93a8647c9c021344239cf30b40b87075",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
           "https://github.com/PolymerElements/paper-header-panel/archive/v1.1.4.tar.gz",
       ],
       strip_prefix = "paper-header-panel-1.1.4",
@@ -1562,12 +2046,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_icon_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "9cba5bcfd6aeb4c41581c1392c678cf2278d360e9d122f4d9db54a9ebb404496",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
           "https://github.com/PolymerElements/paper-icon-button/archive/v1.1.3.tar.gz",
       ],
       strip_prefix = "paper-icon-button-1.1.3",
@@ -1584,12 +2068,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_input",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "17c3dea9bb1c2026cc61324696c6c774214a0dc37686b91ca214a6af550994db",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
           "https://github.com/PolymerElements/paper-input/archive/v1.1.18.tar.gz",
       ],
       strip_prefix = "paper-input-1.1.18",
@@ -1615,12 +2099,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_item",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "12ee0dcb61b0d5721c5988571f6974d7b2211e97724f4195893fbcc9058cdac8",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
           "https://github.com/PolymerElements/paper-item/archive/v1.1.4.tar.gz",
       ],
       strip_prefix = "paper-item-1.1.4",
@@ -1640,12 +2124,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_listbox",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "3cb35f4fe9a3f15185a9e91711dba8f27e9291c8cd371ebf1be21b8f1d5f65fb",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
           "https://github.com/PolymerElements/paper-listbox/archive/v1.1.2.tar.gz",
       ],
       strip_prefix = "paper-listbox-1.1.2",
@@ -1658,12 +2142,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_material",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "09f6c8bd6ddbea2be541dc86306efe41cdfb31bec0b69d35a5dc29772bbc8506",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
           "https://github.com/PolymerElements/paper-material/archive/v1.0.6.tar.gz",
       ],
       strip_prefix = "paper-material-1.0.6",
@@ -1678,12 +2162,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_menu",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "a3cee220926e315f7412236b3628288774694447c0da4428345f36d0f127ba3b",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
           "https://github.com/PolymerElements/paper-menu/archive/v1.2.2.tar.gz",
       ],
       strip_prefix = "paper-menu-1.2.2",
@@ -1703,12 +2187,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_menu_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "be3290c288a2bd4f9887213db22c75add99cc29ff4d088100c0bc4eb0e57997b",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
           "https://github.com/PolymerElements/paper-menu-button/archive/v1.5.1.tar.gz",
       ],
       strip_prefix = "paper-menu-button-1.5.1",
@@ -1727,12 +2211,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_progress",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "2b6776b2f023c1f344feea17ba29b58d879e46f8ed43b7256495054b5183fff6",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
           "https://github.com/PolymerElements/paper-progress/archive/v1.0.9.tar.gz",
       ],
       strip_prefix = "paper-progress-1.0.9",
@@ -1746,12 +2230,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_radio_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6e911d0c308aa388136b3af79d1bdcbe5a1f4159cbc79d71efb4ff3b6c0b4e91",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
           "https://github.com/PolymerElements/paper-radio-button/archive/v1.1.2.tar.gz",
       ],
       strip_prefix = "paper-radio-button-1.1.2",
@@ -1764,12 +2248,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_radio_group",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "7885ad1f81e9dcc03dcea4139b54a201ff55c18543770cd44f94530046c9e163",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
           "https://github.com/PolymerElements/paper-radio-group/archive/v1.0.9.tar.gz",
       ],
       strip_prefix = "paper-radio-group-1.0.9",
@@ -1783,12 +2267,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_ripple",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "ba76bfb1c737260a8a103d3ca97faa1f7c3288c7db9b2519f401b7a782147c09",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
           "https://github.com/PolymerElements/paper-ripple/archive/v1.0.5.tar.gz",
       ],
       strip_prefix = "paper-ripple-1.0.5",
@@ -1800,12 +2284,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_slider",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "08e7c541dbf5d2e959208810bfc03188e82ced87e4d30d325172967f67962c3c",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
           "https://github.com/PolymerElements/paper-slider/archive/v1.0.10.tar.gz",
       ],
       strip_prefix = "paper-slider-1.0.10",
@@ -1824,21 +2308,19 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_spinner",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6a752907fab7899cbeed15b478e7b9299047c15fbf9d1561d6eb4d204bdbd178",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
           "https://github.com/PolymerElements/paper-spinner/archive/v1.1.1.tar.gz",
       ],
       strip_prefix = "paper-spinner-1.1.1",
       path = "/paper-spinner",
       srcs = [
-          "paper-spinner.html",
-          "paper-spinner-behavior.html",
-          "paper-spinner-lite.html",
-          "paper-spinner-styles.html"
+          "paper-spinner.html", "paper-spinner-behavior.html",
+          "paper-spinner-lite.html", "paper-spinner-styles.html"
       ],
       deps = [
           "@org_polymer",
@@ -1847,12 +2329,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_styles",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "6d26b0a4c286402098853dc7388f6b22f30dfb7a74e47b34992ac03380144bb2",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
           "https://github.com/PolymerElements/paper-styles/archive/v1.1.4.tar.gz",
       ],
       strip_prefix = "paper-styles-1.1.4",
@@ -1878,12 +2360,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_tabs",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "c23b6a5221db35e5b1ed3eb8e8696b952572563e285adaec96aba1e3134db825",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
           "https://github.com/PolymerElements/paper-tabs/archive/v1.7.0.tar.gz",
       ],
       strip_prefix = "paper-tabs-1.7.0",
@@ -1907,12 +2389,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_toast",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "55f623712ed1f2bae6d6fadc522a2458e083ccd44cc0a907672547e7b10758a9",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
           "https://github.com/PolymerElements/paper-toast/archive/v1.3.0.tar.gz",
       ],
       strip_prefix = "paper-toast-1.3.0",
@@ -1925,12 +2407,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_toggle_button",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "4aa7cf0396fa2994a8bc2ac6e8428f48b07b945bb7c41bd52041ef5827b45de3",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
           "https://github.com/PolymerElements/paper-toggle-button/archive/v1.2.0.tar.gz",
       ],
       strip_prefix = "paper-toggle-button-1.2.0",
@@ -1944,12 +2426,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_toolbar",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "dbddffc0654d9fb5fb48843087eebe16bf7a134902495a664c96c11bf8a2c63d",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
           "https://github.com/PolymerElements/paper-toolbar/archive/v1.1.4.tar.gz",
       ],
       strip_prefix = "paper-toolbar-1.1.4",
@@ -1962,12 +2444,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_paper_tooltip",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "4c6667acf01f73da14c3cbc0aa574bf14280304567987ee0314534328377d2ad",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
           "https://github.com/PolymerElements/paper-tooltip/archive/v1.1.2.tar.gz",
       ],
       strip_prefix = "paper-tooltip-1.1.2",
@@ -1979,13 +2461,13 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "07a9e62ffb52193da3af09adda2fbac5cc690439978520e2d03e783863f65f91",
       strip_prefix = "polymer-1.7.0",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/polymer/polymer/archive/v1.7.0.tar.gz",
+          "http://mirror.bazel.build/github.com/polymer/polymer/archive/v1.7.0.tar.gz",
           "https://github.com/polymer/polymer/archive/v1.7.0.tar.gz",
       ],
       path = "/polymer",
@@ -1996,12 +2478,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_prism",
       licenses = ["notice"],  # MIT
       sha256 = "e06eb54f2a80e6b3cd0bd4d59f900423bcaee53fc03998a056df63740c684683",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
+          "http://mirror.bazel.build/github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
           "https://github.com/PrismJS/prism/archive/abee2b7587f1925e57777044270e2a1860810994.tar.gz",
       ],
       strip_prefix = "prism-abee2b7587f1925e57777044270e2a1860810994",
@@ -2012,12 +2494,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_prism_element",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "ad70bf9cd5bbdf525d465e1b0658867ab4022193eb9c74087a839044b46312b4",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
           "https://github.com/PolymerElements/prism-element/archive/1.0.4.tar.gz",
       ],
       strip_prefix = "prism-element-1.0.4",
@@ -2032,13 +2514,13 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       ],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_promise_polyfill",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "4495450e5d884c3e16b537b43afead7f84d17c7dc061bcfcbf440eac083e4ef5",
       strip_prefix = "promise-polyfill-1.0.0",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
+          "http://mirror.bazel.build/github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
           "https://github.com/PolymerLabs/promise-polyfill/archive/v1.0.0.tar.gz",
       ],
       path = "/promise-polyfill",
@@ -2051,12 +2533,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       deps = ["@org_polymer"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_web_animations_js",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "f8bd760cbdeba131f6790bd5abe170bcbf7b1755ff58ed16d0b82fa8a7f34a7f",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
+          "http://mirror.bazel.build/github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
           "https://github.com/web-animations/web-animations-js/archive/2.2.1.tar.gz",
       ],
       strip_prefix = "web-animations-js-2.2.1",
@@ -2064,12 +2546,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
       srcs = ["web-animations-next-lite.min.js"],
   )
 
-  webfiles_external(
+  web_library_external(
       name = "org_polymer_webcomponentsjs",
       licenses = ["notice"],  # BSD-3-Clause
       sha256 = "138c43306ee0a6d699ddca9b3c6b0f4982974ea8b7bdad291ea7276c72301df9",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
+          "http://mirror.bazel.build/github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
           "https://github.com/webcomponents/webcomponentsjs/archive/v0.7.22.tar.gz",
       ],
       strip_prefix = "webcomponentsjs-0.7.22",
@@ -2089,3 +2571,132 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
           "webcomponents-lite.min.js",
       ],
   )
+
+  ##############################################################################
+  # TensorBoard Testing Dependencies
+
+  web_library_external(
+      name = "org_npmjs_registry_accessibility_developer_tools",
+      licenses = ["notice"],  # Apache License 2.0
+      sha256 = "1d6a72f401c9d53f68238c617dd43a05cd85ca5aa2e676a5b3c352711448e093",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/accessibility-developer-tools/-/accessibility-developer-tools-2.10.0.tgz",
+          "https://registry.npmjs.org/accessibility-developer-tools/-/accessibility-developer-tools-2.10.0.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/accessibility-developer-tools",
+      suppress = ["strictDependencies"],
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_async",
+      licenses = ["notice"],  # MIT
+      sha256 = "08655255ae810bf4d1cb1642df57658fcce823776d3ba8f4b46f4bbff6c87ece",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/async/-/async-1.5.0.tgz",
+          "https://registry.npmjs.org/async/-/async-1.5.0.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/async",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_chai",
+      licenses = ["notice"],  # MIT
+      sha256 = "aca8137bed5bb295bd7173325b7ad604cd2aeb341d739232b4f9f0b26745be90",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/chai/-/chai-3.5.0.tgz",
+          "https://registry.npmjs.org/chai/-/chai-3.5.0.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/chai",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_mocha",
+      licenses = ["notice"],  # MIT
+      sha256 = "13ef37a071196a2fba680799b906555d3f0ab61e80a7e8f73f93e77914590dd4",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/mocha/-/mocha-2.5.3.tgz",
+          "https://registry.npmjs.org/mocha/-/mocha-2.5.3.tgz",
+      ],
+      suppress = ["strictDependencies"],
+      strip_prefix = "package",
+      path = "/mocha",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_sinon",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "49edb057695fc9019aae992bf7e677a07de7c6ce2bf9f9facde4a245045d1532",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/sinon/-/sinon-1.17.4.tgz",
+          "https://registry.npmjs.org/sinon/-/sinon-1.17.4.tgz",
+      ],
+      strip_prefix = "package/lib",
+      path = "/sinonjs",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_sinon_chai",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "b85fc56f713832960b56fe9269ee4bb2cd41edd2ceb130b0936e5bdbed5dea63",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/sinon-chai/-/sinon-chai-2.8.0.tgz",
+          "https://registry.npmjs.org/sinon-chai/-/sinon-chai-2.8.0.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/sinon-chai",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_stacky",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "c659e60f7957d9d80c23a7aacc4d71b19c6421a08f91174c0062de369595acae",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/stacky/-/stacky-1.3.1.tgz",
+          "https://registry.npmjs.org/stacky/-/stacky-1.3.1.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/stacky",
+  )
+
+  web_library_external(
+      name = "org_npmjs_registry_web_component_tester",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "9d4ebd4945df8a936916d4d32b7f280f2a3afa35f79e7ca8ad3ed0a42770c537",
+      urls = [
+          "http://mirror.bazel.build/registry.npmjs.org/web-component-tester/-/web-component-tester-4.3.6.tgz",
+          "https://registry.npmjs.org/web-component-tester/-/web-component-tester-4.3.6.tgz",
+      ],
+      strip_prefix = "package",
+      path = "/web-component-tester",
+      suppress = [
+          "absolutePaths",
+          "strictDependencies",
+      ],
+      deps = [
+          "@com_lodash",
+          "@org_npmjs_registry_accessibility_developer_tools",
+          "@org_npmjs_registry_async",
+          "@org_npmjs_registry_chai",
+          "@org_npmjs_registry_mocha",
+          "@org_npmjs_registry_sinon",
+          "@org_npmjs_registry_sinon_chai",
+          "@org_npmjs_registry_stacky",
+          "@org_polymer_test_fixture",
+      ],
+  )
+
+  web_library_external(
+      name = "org_polymer_test_fixture",
+      licenses = ["notice"],  # BSD-3-Clause
+      sha256 = "59d6cfb1187733b71275becfea181fe0aa1f734df5ff77f5850c806bbbf9a0d9",
+      strip_prefix = "test-fixture-2.0.1",
+      urls = [
+          "http://mirror.bazel.build/github.com/PolymerElements/test-fixture/archive/v2.0.1.tar.gz",
+          "https://github.com/PolymerElements/test-fixture/archive/v2.0.1.tar.gz",
+      ],
+      path = "/test-fixture",
+      exclude = ["test/**"],
+  )
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
index 00d2e7c0c7813d084ca616bb201645161e271b8c..861a87b68bfd058fdc184335fd19957f624f9fcc 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
@@ -1,9 +1,11 @@
-#ifdef _WIN32
-#define sleep(seconds) Sleep(1000*seconds)
-#endif  // _WIN32
 #include "unsupported/Eigen/CXX11/Tensor"
 
 #ifdef _WIN32
+#ifndef SLEEP_FUNC_HEADER_GUARD
+#define SLEEP_FUNC_HEADER_GUARD
+inline void sleep(unsigned int seconds) { Sleep(1000*seconds); }
+#endif
+
 // On Windows, Eigen will include Windows.h, which defines various
 // macros that conflict with TensorFlow symbols. Undefine them here to
 // prevent clashes.
diff --git a/third_party/fft2d/BUILD b/third_party/fft2d/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..93ea06e81b85d3ffca90133225604e9ac3a44333
--- /dev/null
+++ b/third_party/fft2d/BUILD
@@ -0,0 +1,30 @@
+# Headers for 2D Fast Fourier Transform package
+# from http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html
+# This is a separate package because the original downloaded archive doesn't
+# contain any header files.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+# Unrestricted use; can only distribute original package.
+# See fft/readme.txt
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "fft2d_headers",
+    srcs = ["fft.h"],
+)
+
+objc_library(
+    name = "fft2d_headersd_ios",
+    srcs = ["fft.h"],
+)
+
+# Export the source code so that it could be compiled for Andoid native apps.
+filegroup(
+    name = "fft2d_headers_srcs",
+    srcs = ["fft.h"],
+)
diff --git a/third_party/fft2d/LICENSE b/third_party/fft2d/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..2bd85506a8cef226c5260c3bcddc5d8b6148ece9
--- /dev/null
+++ b/third_party/fft2d/LICENSE
@@ -0,0 +1,3 @@
+Copyright(C) 1997,2001 Takuya OOURA (email: ooura@kurims.kyoto-u.ac.jp).
+You may use, copy, modify this code for any purpose and 
+without fee. You may distribute this ORIGINAL package.
diff --git a/third_party/fft2d/fft.h b/third_party/fft2d/fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..252cc01fec30bcfa0b6b396b92fb6a1805023baf
--- /dev/null
+++ b/third_party/fft2d/fft.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Declarations for 1D FFT routines in third_party/fft2d/fft.
+
+#ifndef THIRD_PARTY_FFT2D_FFT_H__
+#define THIRD_PARTY_FFT2D_FFT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void cdft(int, int, double *, int *, double *);
+extern void rdft(int, int, double *, int *, double *);
+extern void ddct(int, int, double *, int *, double *);
+extern void ddst(int, int, double *, int *, double *);
+extern void dfct(int, double *, double *, int *, double *);
+extern void dfst(int, double *, double *, int *, double *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // THIRD_PARTY_FFT2D_FFT_H__
diff --git a/third_party/fft2d/fft2d.BUILD b/third_party/fft2d/fft2d.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3dbd36aec046a201253ac40bd250b20815a6a22a
--- /dev/null
+++ b/third_party/fft2d/fft2d.BUILD
@@ -0,0 +1,36 @@
+# 2D Fast Fourier Transform package
+# from http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+# Unrestricted use; can only distribute original package.
+licenses(["notice"])
+
+exports_files(["fft/readme.txt"])
+
+FFT2D_SRCS = [
+    "fft/fftsg.c",
+]
+
+# This is the main 2D FFT library.  The 2D FFTs in this library call
+# 1D FFTs.  In addition, fast DCTs are provided for the special case
+# of 8x8 and 16x16.  This code in this library is referred to as
+# "Version II" on http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html.
+cc_library(
+    name = "fft2d",
+    srcs = FFT2D_SRCS,
+    linkopts = ["-lm"],
+)
+
+objc_library(
+    name = "fft2d_ios",
+    srcs = FFT2D_SRCS,
+)
+
+# Export the source code so that it could be compiled for Andoid native apps.
+filegroup(
+    name = "fft2d_srcs",
+    srcs = FFT2D_SRCS,
+)
diff --git a/third_party/gpus/crosstool/CROSSTOOL.tpl b/third_party/gpus/crosstool/CROSSTOOL.tpl
deleted file mode 100644
index b77a45c3257c4f9e3865dd9ff58db7cb0285eed7..0000000000000000000000000000000000000000
--- a/third_party/gpus/crosstool/CROSSTOOL.tpl
+++ /dev/null
@@ -1,249 +0,0 @@
-major_version: "local"
-minor_version: ""
-default_target_cpu: "same_as_host"
-
-default_toolchain {
-  cpu: "k8"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "piii"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "arm"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "darwin"
-  toolchain_identifier: "local_darwin"
-}
-default_toolchain {
-  cpu: "ppc"
-  toolchain_identifier: "local_linux"
-}
-
-toolchain {
-  abi_version: "local"
-  abi_libc_version: "local"
-  builtin_sysroot: ""
-  compiler: "compiler"
-  host_system_name: "local"
-  needsPic: true
-  supports_gold_linker: false
-  supports_incremental_linker: false
-  supports_fission: false
-  supports_interface_shared_objects: false
-  supports_normalizing_ar: false
-  supports_start_end_lib: false
-  supports_thin_archives: false
-  target_libc: "local"
-  target_cpu: "local"
-  target_system_name: "local"
-  toolchain_identifier: "local_linux"
-
-  tool_path { name: "ar" path: "/usr/bin/ar" }
-  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
-  tool_path { name: "cpp" path: "/usr/bin/cpp" }
-  tool_path { name: "dwp" path: "/usr/bin/dwp" }
-  # As part of the TensorFlow release, we place some cuda-related compilation
-  # files in @local_config_cuda//crosstool/clang/bin, and this relative
-  # path, combined with the rest of our Bazel configuration causes our
-  # compilation to use those files.
-  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
-  # Use "-std=c++11" for nvcc. For consistency, force both the host compiler
-  # and the device compiler to use "-std=c++11".
-  cxx_flag: "-std=c++11"
-  linker_flag: "-Wl,-no-as-needed"
-  linker_flag: "-lstdc++"
-  linker_flag: "-B/usr/bin/"
-
-%{gcc_host_compiler_includes}
-  tool_path { name: "gcov" path: "/usr/bin/gcov" }
-
-  # C(++) compiles invoke the compiler (as that is the one knowing where
-  # to find libraries), but we provide LD so other rules can invoke the linker.
-  tool_path { name: "ld" path: "/usr/bin/ld" }
-
-  tool_path { name: "nm" path: "/usr/bin/nm" }
-  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
-  objcopy_embed_flag: "-I"
-  objcopy_embed_flag: "binary"
-  tool_path { name: "objdump" path: "/usr/bin/objdump" }
-  tool_path { name: "strip" path: "/usr/bin/strip" }
-
-  # Anticipated future default.
-  unfiltered_cxx_flag: "-no-canonical-prefixes"
-
-  # Make C++ compilation deterministic. Use linkstamping instead of these
-  # compiler symbols.
-  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
-  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
-  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
-  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
-
-  # Security hardening on by default.
-  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
-  # We need to undef it before redefining it as some distributions now have
-  # it enabled by default.
-  compiler_flag: "-U_FORTIFY_SOURCE"
-  compiler_flag: "-D_FORTIFY_SOURCE=1"
-  compiler_flag: "-fstack-protector"
-  compiler_flag: "-fPIE"
-  linker_flag: "-pie"
-  linker_flag: "-Wl,-z,relro,-z,now"
-
-  # Enable coloring even if there's no attached terminal. Bazel removes the
-  # escape sequences if --nocolor is specified. This isn't supported by gcc
-  # on Ubuntu 14.04.
-  # compiler_flag: "-fcolor-diagnostics"
-
-  # All warnings are enabled. Maybe enable -Werror as well?
-  compiler_flag: "-Wall"
-  # Enable a few more warnings that aren't part of -Wall.
-  compiler_flag: "-Wunused-but-set-parameter"
-  # But disable some that are problematic.
-  compiler_flag: "-Wno-free-nonheap-object" # has false positives
-
-  # Keep stack frames for debugging, even in opt mode.
-  compiler_flag: "-fno-omit-frame-pointer"
-
-  # Anticipated future default.
-  linker_flag: "-no-canonical-prefixes"
-  unfiltered_cxx_flag: "-fno-canonical-system-headers"
-  # Have gcc return the exit code from ld.
-  linker_flag: "-pass-exit-codes"
-  # Stamp the binary with a unique identifier.
-  linker_flag: "-Wl,--build-id=md5"
-  linker_flag: "-Wl,--hash-style=gnu"
-  # Gold linker only? Can we enable this by default?
-  # linker_flag: "-Wl,--warn-execstack"
-  # linker_flag: "-Wl,--detect-odr-violations"
-
-  # Include directory for cuda headers.
-  cxx_builtin_include_directory: "%{cuda_include_path}"
-
-  compilation_mode_flags {
-    mode: DBG
-    # Enable debug symbols.
-    compiler_flag: "-g"
-  }
-  compilation_mode_flags {
-    mode: OPT
-
-    # No debug symbols.
-    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or
-    # even generally? However, that can't happen here, as it requires special
-    # handling in Bazel.
-    compiler_flag: "-g0"
-
-    # Conservative choice for -O
-    # -O3 can increase binary size and even slow down the resulting binaries.
-    # Profile first and / or use FDO if you need better performance than this.
-    compiler_flag: "-O2"
-
-    # Disable assertions
-    compiler_flag: "-DNDEBUG"
-
-    # Removal of unused code and data at link time (can this increase binary size in some cases?).
-    compiler_flag: "-ffunction-sections"
-    compiler_flag: "-fdata-sections"
-    linker_flag: "-Wl,--gc-sections"
-  }
-  linking_mode_flags { mode: DYNAMIC }
-}
-
-toolchain {
-  abi_version: "local"
-  abi_libc_version: "local"
-  builtin_sysroot: ""
-  compiler: "compiler"
-  host_system_name: "local"
-  needsPic: true
-  target_libc: "macosx"
-  target_cpu: "darwin"
-  target_system_name: "local"
-  toolchain_identifier: "local_darwin"
-
-  tool_path { name: "ar" path: "/usr/bin/libtool" }
-  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
-  tool_path { name: "cpp" path: "/usr/bin/cpp" }
-  tool_path { name: "dwp" path: "/usr/bin/dwp" }
-  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
-  cxx_flag: "-std=c++11"
-  ar_flag: "-static"
-  ar_flag: "-s"
-  ar_flag: "-o"
-  linker_flag: "-lc++"
-  linker_flag: "-undefined"
-  linker_flag: "dynamic_lookup"
-  # TODO(ulfjack): This is wrong on so many levels. Figure out a way to auto-detect the proper
-  # setting from the local compiler, and also how to make incremental builds correct.
-  cxx_builtin_include_directory: "/"
-  tool_path { name: "gcov" path: "/usr/bin/gcov" }
-  tool_path { name: "ld" path: "/usr/bin/ld" }
-  tool_path { name: "nm" path: "/usr/bin/nm" }
-  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
-  objcopy_embed_flag: "-I"
-  objcopy_embed_flag: "binary"
-  tool_path { name: "objdump" path: "/usr/bin/objdump" }
-  tool_path { name: "strip" path: "/usr/bin/strip" }
-
-  # Anticipated future default.
-  unfiltered_cxx_flag: "-no-canonical-prefixes"
-  # Make C++ compilation deterministic. Use linkstamping instead of these
-  # compiler symbols.
-  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
-  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
-  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
-  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
-
-  # Security hardening on by default.
-  # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
-  compiler_flag: "-D_FORTIFY_SOURCE=1"
-  compiler_flag: "-fstack-protector"
-
-  # Enable coloring even if there's no attached terminal. Bazel removes the
-  # escape sequences if --nocolor is specified.
-  compiler_flag: "-fcolor-diagnostics"
-
-  # All warnings are enabled. Maybe enable -Werror as well?
-  compiler_flag: "-Wall"
-  # Enable a few more warnings that aren't part of -Wall.
-  compiler_flag: "-Wthread-safety"
-  compiler_flag: "-Wself-assign"
-
-  # Keep stack frames for debugging, even in opt mode.
-  compiler_flag: "-fno-omit-frame-pointer"
-
-  # Anticipated future default.
-  linker_flag: "-no-canonical-prefixes"
-
-  # Include directory for cuda headers.
-  cxx_builtin_include_directory: "%{cuda_include_path}"
-
-  compilation_mode_flags {
-    mode: DBG
-    # Enable debug symbols.
-    compiler_flag: "-g"
-  }
-  compilation_mode_flags {
-    mode: OPT
-    # No debug symbols.
-    # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt or even generally?
-    # However, that can't happen here, as it requires special handling in Bazel.
-    compiler_flag: "-g0"
-
-    # Conservative choice for -O
-    # -O3 can increase binary size and even slow down the resulting binaries.
-    # Profile first and / or use FDO if you need better performance than this.
-    compiler_flag: "-O2"
-
-    # Disable assertions
-    compiler_flag: "-DNDEBUG"
-
-    # Removal of unused code and data at link time (can this increase binary size in some cases?).
-    compiler_flag: "-ffunction-sections"
-    compiler_flag: "-fdata-sections"
-  }
-}
diff --git a/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl b/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl
index 116f67cbae4a0a4f9b12fba14e81b8d743b1e7fd..05290d647ea1b25f073f6e0c2a8de07c0fe65d58 100644
--- a/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL_nvcc.tpl
@@ -121,7 +121,7 @@ toolchain {
   # linker_flag: "-Wl,--detect-odr-violations"
 
   # Include directory for cuda headers.
-  cxx_builtin_include_directory: "%{cuda_include_path}"
+%{cuda_include_path}
 
   compilation_mode_flags {
     mode: DBG
@@ -220,7 +220,7 @@ toolchain {
   linker_flag: "-no-canonical-prefixes"
 
   # Include directory for cuda headers.
-  cxx_builtin_include_directory: "%{cuda_include_path}"
+%{cuda_include_path}
 
   compilation_mode_flags {
     mode: DBG
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
index b7d6cc61dd7c44d02300c4270e84fea9b63c796a..242439daf456d6fd31a140e5d2c56d3e89900652 100755
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
@@ -49,9 +49,7 @@ import pipes
 CPU_COMPILER = ('%{cpu_compiler}')
 GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}')
 
-CURRENT_DIR = os.path.dirname(sys.argv[0])
-NVCC_PATH = CURRENT_DIR + '/../../../cuda/bin/nvcc'
-LLVM_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+NVCC_PATH = '%{nvcc_path}'
 PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
 NVCC_VERSION = '%{cuda_version}'
 
@@ -229,7 +227,7 @@ def InvokeNvcc(argv, log=False):
 
   # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
   # Need to investigate and fix.
-  cmd = 'PATH=' + PREFIX_DIR + ' ' + cmd
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
   if log: Log(cmd)
   return os.system(cmd)
 
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index e101f9fbd84880e371c63e187c8512a5df4d63df..f7610dd7a99e3c65ac494d23f0a408d4391680c0 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -1,7 +1,5 @@
 licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
 
-load("@local_config_cuda//cuda:platform.bzl", "readlink_command")
-
 package(default_visibility = ["//visibility:public"])
 
 config_setting(
@@ -41,9 +39,10 @@ config_setting(
 
 cc_library(
     name = "cuda_headers",
-    hdrs = glob([
-        "**/*.h",
-    ]),
+    hdrs = [
+        "cuda_config.h",
+        %{cuda_headers}
+    ],
     includes = [
         ".",
         "include",
@@ -54,7 +53,7 @@ cc_library(
 cc_library(
     name = "cudart_static",
     srcs = ["lib/%{cudart_static_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkopts = select({
         ":freebsd": [],
         "//conditions:default": ["-ldl"],
@@ -68,7 +67,7 @@ cc_library(
 cc_library(
     name = "cuda_driver",
     srcs = ["lib/%{cuda_driver_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     visibility = ["//visibility:public"],
 )
 
@@ -76,7 +75,7 @@ cc_library(
     name = "cudart",
     srcs = ["lib/%{cudart_lib}"],
     data = ["lib/%{cudart_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -85,16 +84,26 @@ cc_library(
     name = "cublas",
     srcs = ["lib/%{cublas_lib}"],
     data = ["lib/%{cublas_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
 
+cc_library(
+    name = "cusolver",
+    srcs = ["lib/%{cusolver_lib}"],
+    data = ["lib/%{cusolver_lib}"],
+    includes = ["include"],
+    linkstatic = 1,
+    linkopts = ["-lgomp"],
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "cudnn",
     srcs = ["lib/%{cudnn_lib}"],
     data = ["lib/%{cudnn_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -103,7 +112,7 @@ cc_library(
     name = "cufft",
     srcs = ["lib/%{cufft_lib}"],
     data = ["lib/%{cufft_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -112,7 +121,7 @@ cc_library(
     name = "curand",
     srcs = ["lib/%{curand_lib}"],
     data = ["lib/%{curand_lib}"],
-    includes = ["include/"],
+    includes = ["include"],
     linkstatic = 1,
     visibility = ["//visibility:public"],
 )
@@ -132,9 +141,10 @@ cc_library(
 
 cc_library(
     name = "cupti_headers",
-    hdrs = glob([
-        "**/*.h",
-    ]),
+    hdrs = [
+        "cuda_config.h",
+        ":cuda-extras",
+    ],
     includes = [
         ".",
         "extras/CUPTI/include/",
@@ -150,6 +160,8 @@ cc_library(
 
 cc_library(
     name = "libdevice_root",
-    data = glob(["nvvm/libdevice/*.bc"]),
+    data = [":cuda-nvvm"],
     visibility = ["//visibility:public"],
 )
+
+%{cuda_include_genrules}
diff --git a/third_party/gpus/cuda/platform.bzl.tpl b/third_party/gpus/cuda/platform.bzl.tpl
deleted file mode 100644
index 01ef24b94edf840126822f55da93fd9a84b4fc73..0000000000000000000000000000000000000000
--- a/third_party/gpus/cuda/platform.bzl.tpl
+++ /dev/null
@@ -1,15 +0,0 @@
-CUDA_VERSION = "%{cuda_version}"
-CUDNN_VERSION = "%{cudnn_version}"
-PLATFORM = "%{platform}"
-
-def cuda_sdk_version():
-  return CUDA_VERSION
-
-def cudnn_sdk_version():
-  return CUDNN_VERSION
-
-def readlink_command():
-  if PLATFORM == "Darwin":
-    return "greadlink"
-  else:
-    return "readlink"
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 05ff584be02358dff695c4e4dadce1a5c2e318de..12d324e9a1439e0dfe2cb4c0b352472da1d63253 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -5,7 +5,7 @@
 
   * `TF_NEED_CUDA`: Whether to enable building with CUDA.
   * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path
-  * `TF_CUDA_CLANG`: Wheter to use clang as a cuda compiler.
+  * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler.
   * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for
     both host and device code compilation if TF_CUDA_CLANG is 1.
   * `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is
@@ -147,6 +147,36 @@ def _host_compiler_includes(repository_ctx, cc):
     inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
   return "\n".join(inc_entries)
 
+def _cuda_include_path(repository_ctx, cuda_config):
+  """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
+
+  Args:
+    repository_ctx: The repository context.
+    cc: The path to the gcc host compiler.
+
+  Returns:
+    A string containing the cxx_builtin_include_directory for each of the gcc
+    host compiler include directories, which can be added to the CROSSTOOL
+    file.
+  """
+  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" %
+                                  (cuda_config.cuda_toolkit_path,
+                                   ".exe" if cuda_config.cpu_value == "Windows" else ""))
+  result = repository_ctx.execute([nvcc_path, '-v',
+                                  '/dev/null', '-o', '/dev/null'])
+  target_dir = ""
+  for one_line in result.stderr.splitlines():
+    if one_line.startswith('#$ _TARGET_DIR_='):
+      target_dir = (cuda_config.cuda_toolkit_path + '/' +
+                    one_line.replace('#$ _TARGET_DIR_=', '') + "/include")
+  inc_entries = []
+  if target_dir != "":
+    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
+  default_include = cuda_config.cuda_toolkit_path + '/include'
+  inc_entries.append("  cxx_builtin_include_directory: \"%s\"" %
+                     default_include)
+  return "\n".join(inc_entries)
+
 
 def _enable_cuda(repository_ctx):
   if "TF_NEED_CUDA" in repository_ctx.os.environ:
@@ -541,6 +571,9 @@ def _find_libs(repository_ctx, cuda_config):
       "cublas": _find_cuda_lib(
           "cublas", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path,
           cuda_config.cuda_version),
+      "cusolver": _find_cuda_lib(
+          "cusolver", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path,
+          cuda_config.cuda_version),
       "curand": _find_cuda_lib(
           "curand", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path,
           cuda_config.cuda_version),
@@ -683,10 +716,6 @@ def _create_dummy_repository(repository_ctx):
            "%{cuda_is_configured}": "False",
            "%{cuda_extra_copts}": "[]"
        })
-  _tpl(repository_ctx, "cuda:BUILD",
-       {
-           "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
-       })
   _tpl(repository_ctx, "cuda:BUILD",
        {
            "%{cuda_driver_lib}": _lib_name("cuda", cpu_value),
@@ -695,29 +724,13 @@ def _create_dummy_repository(repository_ctx):
            "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
            "%{cudart_lib}": _lib_name("cudart", cpu_value),
            "%{cublas_lib}": _lib_name("cublas", cpu_value),
+           "%{cusolver_lib}": _lib_name("cusolver", cpu_value),
            "%{cudnn_lib}": _lib_name("cudnn", cpu_value),
            "%{cufft_lib}": _lib_name("cufft", cpu_value),
            "%{curand_lib}": _lib_name("curand", cpu_value),
            "%{cupti_lib}": _lib_name("cupti", cpu_value),
-       })
-  _tpl(repository_ctx, "cuda:BUILD",
-       {
-           "%{cuda_driver_lib}": _lib_name("cuda", cpu_value),
-           "%{cudart_static_lib}": _lib_name("cudart_static", cpu_value,
-                                             static=True),
-           "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
-           "%{cudart_lib}": _lib_name("cudart", cpu_value),
-           "%{cublas_lib}": _lib_name("cublas", cpu_value),
-           "%{cudnn_lib}": _lib_name("cudnn", cpu_value),
-           "%{cufft_lib}": _lib_name("cufft", cpu_value),
-           "%{curand_lib}": _lib_name("curand", cpu_value),
-           "%{cupti_lib}": _lib_name("cupti", cpu_value),
-       })
-  _tpl(repository_ctx, "cuda:platform.bzl",
-       {
-           "%{cuda_version}": _DEFAULT_CUDA_VERSION,
-           "%{cudnn_version}": _DEFAULT_CUDNN_VERSION,
-           "%{platform}": cpu_value,
+           "%{cuda_include_genrules}": '',
+           "%{cuda_headers}": '',
        })
 
   # Create dummy files for the CUDA toolkit since they are still required by
@@ -730,6 +743,7 @@ def _create_dummy_repository(repository_ctx):
   repository_ctx.file("cuda/lib/%s" % _lib_name("cudart", cpu_value))
   repository_ctx.file("cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
   repository_ctx.file("cuda/lib/%s" % _lib_name("cublas", cpu_value))
+  repository_ctx.file("cuda/lib/%s" % _lib_name("cusolver", cpu_value))
   repository_ctx.file("cuda/lib/%s" % _lib_name("cudnn", cpu_value))
   repository_ctx.file("cuda/lib/%s" % _lib_name("curand", cpu_value))
   repository_ctx.file("cuda/lib/%s" % _lib_name("cufft", cpu_value))
@@ -754,17 +768,64 @@ def _create_dummy_repository(repository_ctx):
                       _DUMMY_CROSSTOOL_BZL_FILE)
   repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
-def _symlink_dir(repository_ctx, src_dir, dest_dir):
-  """Symlinks all the files in a directory.
 
-  Args:
-    repository_ctx: The repository context.
-    src_dir: The source directory.
-    dest_dir: The destination directory to create the symlinks in.
+def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
+    src_files = [], dest_files = []):
+  """Returns a genrule to symlink a set of files.
+
+  If src_dir is passed, files will be read from the given directory; otherwise
+  we assume files are in src_files and dest_files
+  """
+  if src_dir != None:
+    files = _read_dir(repository_ctx, src_dir)
+    # Create a list with the src_dir stripped to use for outputs.
+    dest_files = files.replace(src_dir, '').splitlines()
+    src_files = files.splitlines()
+  command = []
+  outs = []
+  for i in range(len(dest_files)):
+    if dest_files[i] != "":
+      # If we have only one file to link we do not want to use the dest_dir, as
+      # $(@D) will include the full path to the file.
+      dest = ' $(@D)/' + dest_dir + dest_files[i] if len(dest_files) != 1 else ' $(@D)/' + dest_files[i]
+      command.append('ln -s ' + src_files[i] + dest)
+      outs.append('      "' + dest_dir + dest_files[i] + '",')
+  genrule = _genrule(src_dir, genrule_name, " && ".join(command),
+                     "\n".join(outs))
+  return genrule
+
+
+def _genrule(src_dir, genrule_name, command, outs):
+  """Returns a string with a genrule.
+
+  Genrule executes the given command and produces the given outputs.
   """
-  files = repository_ctx.path(src_dir).readdir()
-  for src_file in files:
-    repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
+  return (
+      'genrule(\n' +
+      '    name = "' +
+      genrule_name + '",\n' +
+      '    outs = [\n' +
+      outs +
+      '    ],\n' +
+      '    cmd = """\n' +
+      command +
+      '    """,\n' +
+      ')\n\n'
+  )
+
+
+def _read_dir(repository_ctx, src_dir):
+  """Returns a string with all files in a directory.
+
+  Finds all files inside a directory, traversing subfolders and following
+  symlinks. The returned string contains the full path of all files
+  separated by line breaks.
+  """
+  find_result = repository_ctx.execute([
+      "find", src_dir, "-follow", "-type", "f"
+  ])
+  return find_result.stdout
+
 
 def _use_cuda_clang(repository_ctx):
   if "TF_CUDA_CLANG" in repository_ctx.os.environ:
@@ -787,25 +848,42 @@ def _create_cuda_repository(repository_ctx):
   cudnn_header_dir = _find_cudnn_header_dir(repository_ctx,
                                             cuda_config.cudnn_install_basedir)
 
-  # Set up symbolic links for the cuda toolkit. We link at the individual file
-  # level not at the directory level. This is because the external library may
-  # have a different file layout from our desired structure.
+  # Set up symbolic links for the cuda toolkit by creating genrules to do
+  # symlinking. We create one genrule for each directory we want to track under
+  # cuda_toolkit_path
   cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  _symlink_dir(repository_ctx, cuda_toolkit_path + "/include", "cuda/include")
-  _symlink_dir(repository_ctx, cuda_toolkit_path + "/bin", "cuda/bin")
-  _symlink_dir(repository_ctx, cuda_toolkit_path + "/nvvm", "cuda/nvvm")
-  _symlink_dir(repository_ctx, cuda_toolkit_path + "/extras/CUPTI/include",
-               "cuda/extras/CUPTI/include")
+  cuda_include_path = cuda_toolkit_path + "/include"
+  genrules = [_symlink_genrule_for_dir(repository_ctx,
+      cuda_include_path, "include", "cuda-include")]
+  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+      cuda_toolkit_path + "/nvvm", "nvvm", "cuda-nvvm"))
+  genrules.append(_symlink_genrule_for_dir(repository_ctx,
+      cuda_toolkit_path + "/extras/CUPTI/include",
+      "extras/CUPTI/include", "cuda-extras"))
 
   cuda_libs = _find_libs(repository_ctx, cuda_config)
+  cuda_lib_src = []
+  cuda_lib_dest = []
   for lib in cuda_libs.values():
-    repository_ctx.symlink(lib.path, "cuda/lib/" + lib.file_name)
+    cuda_lib_src.append(lib.path)
+    cuda_lib_dest.append("lib/" + lib.file_name)
+  genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
+                                       cuda_lib_src, cuda_lib_dest))
 
   # Set up the symbolic links for cudnn if cudnn was was not installed to
   # CUDA_TOOLKIT_PATH.
-  if not repository_ctx.path("cuda/include/cudnn.h").exists:
-    repository_ctx.symlink(cudnn_header_dir + "/cudnn.h",
-                           "cuda/include/cudnn.h")
+  included_files = _read_dir(repository_ctx, cuda_include_path).replace(
+      cuda_include_path, '').splitlines()
+  if '/cudnn.h' not in included_files:
+    genrules.append(_symlink_genrule_for_dir(repository_ctx, None, "include/",
+        "cudnn-include", [cudnn_header_dir + "/cudnn.h"], ["cudnn.h"]))
+  else:
+    genrules.append(
+            'filegroup(\n' +
+            '    name = "cudnn-include",\n' +
+            '    srcs = [],\n' +
+            ')\n'
+        )
 
   # Set up BUILD file for cuda/
   _tpl(repository_ctx, "cuda:build_defs.bzl",
@@ -822,37 +900,38 @@ def _create_cuda_repository(repository_ctx):
                cuda_config.cpu_value),
            "%{cudart_lib}": cuda_libs["cudart"].file_name,
            "%{cublas_lib}": cuda_libs["cublas"].file_name,
+           "%{cusolver_lib}": cuda_libs["cusolver"].file_name,
            "%{cudnn_lib}": cuda_libs["cudnn"].file_name,
            "%{cufft_lib}": cuda_libs["cufft"].file_name,
            "%{curand_lib}": cuda_libs["curand"].file_name,
            "%{cupti_lib}": cuda_libs["cupti"].file_name,
+           "%{cuda_include_genrules}": "\n".join(genrules),
+           "%{cuda_headers}": ('":cuda-include",\n' +
+                               '        ":cudnn-include",')
        })
-
-  _tpl(repository_ctx, "cuda:platform.bzl",
-       {
-           "%{cuda_version}": cuda_config.cuda_version,
-           "%{cudnn_version}": cuda_config.cudnn_version,
-           "%{platform}": cuda_config.cpu_value,
-       })
-
   # Set up crosstool/
   _file(repository_ctx, "crosstool:BUILD")
   cc = find_cc(repository_ctx)
   host_compiler_includes = _host_compiler_includes(repository_ctx, cc)
   cuda_defines = {
-           "%{cuda_include_path}": cuda_config.cuda_toolkit_path + '/include',
+           "%{cuda_include_path}": _cuda_include_path(repository_ctx,
+                                                      cuda_config),
            "%{host_compiler_includes}": host_compiler_includes,
        }
   if _use_cuda_clang(repository_ctx):
     cuda_defines["%{clang_path}"] = cc
     _tpl(repository_ctx, "crosstool:CROSSTOOL_clang", cuda_defines, out="crosstool/CROSSTOOL")
   else:
+    nvcc_path = str(repository_ctx.path("%s/bin/nvcc%s" %
+        (cuda_config.cuda_toolkit_path,
+        ".exe" if cuda_config.cpu_value == "Windows" else "")))
     _tpl(repository_ctx, "crosstool:CROSSTOOL_nvcc", cuda_defines, out="crosstool/CROSSTOOL")
     _tpl(repository_ctx,
          "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
          {
              "%{cpu_compiler}": str(cc),
              "%{cuda_version}": cuda_config.cuda_version,
+             "%{nvcc_path}": nvcc_path,
              "%{gcc_host_compiler_path}": str(cc),
              "%{cuda_compute_capabilities}": ", ".join(
                  ["\"%s\"" % c for c in cuda_config.compute_capabilities]),
diff --git a/third_party/grpc.BUILD b/third_party/grpc.BUILD
index 1d1e2222dea8044b046a7cddb22a009542553e8c..b79259618f2f06c941b5a8e3427dd0d5a0fe1e40 100644
--- a/third_party/grpc.BUILD
+++ b/third_party/grpc.BUILD
@@ -176,8 +176,7 @@ cc_library(
         ".",
         "include",
     ],
-    deps = [
-    ],
+    linkopts = ["-lpthread"],
 )
 
 cc_library(
@@ -1782,6 +1781,7 @@ cc_library(
         ".",
         "include",
     ],
+    linkopts = ["-lpthread"],
     deps = [
         ":gpr",
         ":grpc_unsecure",
diff --git a/third_party/hadoop/hdfs.h b/third_party/hadoop/hdfs.h
index 560d8bba0e0e7944d22d378d54f26b243badc2f1..a664f3b50cf94230151952a143b6eb00b4b97a02 100644
--- a/third_party/hadoop/hdfs.h
+++ b/third_party/hadoop/hdfs.h
@@ -171,7 +171,7 @@ void hdfsFileFreeReadStatistics(struct hdfsReadStatistics *stats);
  * Connect to the hdfs.
  * @param nn   The NameNode.  See hdfsBuilderSetNameNode for details.
  * @param port The port on which the server is listening.
- * @param user the user name (this is hadoop domain user). Or NULL is equivelant
+ * @param user the user name (this is hadoop domain user). Or NULL is equivalent
  * to hhdfsConnect(host, port)
  * @return Returns a handle to the filesystem or NULL on error.
  * @deprecated Use hdfsBuilderConnect instead.
@@ -397,7 +397,7 @@ hdfsFile hdfsOpenFile(hdfsFS fs, const char *path, int flags, int bufferSize,
                       short replication, tSize blocksize);
 
 /**
- * hdfsTruncateFile - Truncate a hdfs file to given lenght.
+ * hdfsTruncateFile - Truncate a hdfs file to given length.
  * @param fs The configured filesystem handle.
  * @param path The full path to the file.
  * @param newlength The size the file is to be truncated to
diff --git a/third_party/jemalloc.BUILD b/third_party/jemalloc.BUILD
index aabff39d7b25f6d610fd8c0c28ee4f0e756c29b8..3a9a9a80f2e5aa433c27ecb65fa279f146f43da6 100644
--- a/third_party/jemalloc.BUILD
+++ b/third_party/jemalloc.BUILD
@@ -89,6 +89,17 @@ cc_library(
         "-D_REENTRANT",
     ],
     includes = ["include"],
+    # pthread_atfork() is called for PPC.
+    linkopts = select({
+        "@%ws%//tensorflow:linux_ppc64le": [
+            "-lpthread",
+        ],
+        "@%ws%//tensorflow:linux_x86_64": [
+            "-lpthread",
+        ],
+        "//conditions:default": [
+        ],
+    }),
     visibility = ["//visibility:public"],
 )
 
@@ -183,12 +194,17 @@ sh_binary(
     srcs = ["include/jemalloc/internal/size_classes.sh"],
 )
 
-# Size classes for Linux x86_64. Update if adding builds for other
+# Size classes for Linux x86_64 and ppc64le. Update if adding builds for other
 # architectures. See size_classes.sh for details on the arguments.
+# For default case, kept the arguments same as that of  x86_64 for now.
 genrule(
     name = "size_classes_h",
     outs = ["include/jemalloc/internal/size_classes.h"],
-    cmd = "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
+    cmd = select({
+        "@%ws%//tensorflow:linux_ppc64le": "$(location :size_classes_sh) \"3 4\" 3 16 2 >$@",
+        "@%ws%//tensorflow:linux_x86_64": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
+        "//conditions:default": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
+    }),
     tools = [":size_classes_sh"],
 )
 
@@ -210,7 +226,13 @@ template_rule(
         "#undef JEMALLOC_PREFIX": "#define JEMALLOC_PREFIX \"jemalloc_\"",
         "#undef JEMALLOC_CPREFIX": "#define JEMALLOC_CPREFIX \"JEMALLOC_\"",
         "#undef JEMALLOC_PRIVATE_NAMESPACE": "#define JEMALLOC_PRIVATE_NAMESPACE je_",
-        "#undef CPU_SPINWAIT": "#define CPU_SPINWAIT __asm__ volatile(\"pause\")",
+        "#undef CPU_SPINWAIT": "\n".join([
+            "#if defined(__powerpc64__) || defined(__powerpc__)",
+            "#define CPU_SPINWAIT __asm__ volatile(\"or 27,27,27\")",
+            "#else",
+            "#define CPU_SPINWAIT __asm__ volatile(\"pause\")",
+            "#endif",
+        ]),
         "#undef JEMALLOC_HAVE_BUILTIN_CLZ": "#define JEMALLOC_HAVE_BUILTIN_CLZ",
         "#undef JEMALLOC_USE_SYSCALL": "#define JEMALLOC_USE_SYSCALL",
         "#undef JEMALLOC_HAVE_SECURE_GETENV": "#define JEMALLOC_HAVE_SECURE_GETENV",
@@ -226,7 +248,13 @@ template_rule(
         "#undef JEMALLOC_DSS": "#define JEMALLOC_DSS",
         "#undef JEMALLOC_FILL": "#define JEMALLOC_FILL",
         "#undef LG_TINY_MIN": "#define LG_TINY_MIN 3",
-        "#undef LG_PAGE": "#define LG_PAGE 12",
+        "#undef LG_PAGE": "\n".join([
+            "#if defined(__powerpc64__) || defined(__powerpc__)",
+            "#define LG_PAGE 16",
+            "#else",
+            "#define LG_PAGE 12",
+            "#endif",
+        ]),
         "#undef JEMALLOC_MAPS_COALESCE": "#define JEMALLOC_MAPS_COALESCE",
         "#undef JEMALLOC_TLS": "#define JEMALLOC_TLS",
         "#undef JEMALLOC_INTERNAL_UNREACHABLE": "#define JEMALLOC_INTERNAL_UNREACHABLE __builtin_unreachable",
diff --git a/third_party/libxsmm.BUILD b/third_party/libxsmm.BUILD
index 32f48d9fd214ae07ec1f39628245b85e4a9b896a..f9f1ea1085712d33dd510284d0d1bc228b3d1341 100644
--- a/third_party/libxsmm.BUILD
+++ b/third_party/libxsmm.BUILD
@@ -11,18 +11,19 @@ exports_files(["LICENSE"])
 libxsmm_interface_arguments = "0 1"
 
 # Arguments to ./scripts/libxsmm_config.py, see that file for detailed description.
-#  ilp64: 0 (no)
-#  big: 0 (no)
-#  offload: 0 (no)
+#  ilp64: no
+#  big: no
+#  offload: no
 #  alignment [b]
-#  prefetch: -1 (auto)
-#  threshold: 0 (auto)
+#  prefetch: 1 (auto)
+#  threshold: fallback to BLAS if n*m*k above this
 #  synchronize: yes
-#  jit: 1 (yes)
-#  flags: 0 (none)
+#  jit: yes
+#  flags
 #  alpha = 1
 #  beta = 1
-libxsmm_config_arguments = "0 0 0 64 -1 0 1 1 0 1 1"
+#  gemm = 2
+libxsmm_config_arguments = "0 0 0 64 1 0 1 1 0 1 1 2"
 
 # Arguments to ./scripts/libxsmm_dispatch.py, see that file for detailed description.
 #  (dummy argument)
@@ -66,6 +67,8 @@ cc_library(
         "src/libxsmm_dnn_convolution_winograd_weight_update.c",
         "src/libxsmm_dnn_handle.c",
         "src/libxsmm_dump.c",
+        "src/libxsmm_ext_gemm.c",
+        "src/libxsmm_ext_trans.c",
         "src/libxsmm_fsspmdm.c",
         "src/libxsmm_gemm.c",
         "src/libxsmm_main.c",
@@ -92,11 +95,15 @@ cc_library(
         "include/libxsmm_sync.h",
         "include/libxsmm_timer.h",
         "include/libxsmm_typedefs.h",
+        # Source files #included internally:
+        "src/libxsmm_gemm_diff.c",
+        "src/libxsmm_hash.c",
         # Generated:
         "include/libxsmm.h",
         "include/libxsmm_config.h",
         "include/libxsmm_dispatch.h",
-    ] + glob([ # trigger rebuild if template changed
+    ] + glob([
+        # trigger rebuild if template changed
         "src/template/*.c",
     ]),
     copts = [
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 5657ef554fdcb17f83281ae8c3e5e9312fd77698..15aa53962d1ac0c90da57fd246af9b57216b2ef6 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -70,6 +70,7 @@ cmake_vars = {
 
     # Features
     "HAVE_BACKTRACE": 1,
+    "BACKTRACE_HEADER": "execinfo.h",
     "HAVE_DLOPEN": 1,
     "HAVE_FUTIMES": 1,
     "HAVE_GETCWD": 1,
@@ -151,6 +152,11 @@ all_cmake_vars = select({
         cmake_vars + llvm_target_cmake_vars("X86", "x86_64-apple-darwin") +
         darwin_cmake_vars,
     ),
+    "@%ws%//tensorflow:linux_ppc64le": cmake_var_string(
+        cmake_vars +
+        llvm_target_cmake_vars("PowerPC", "powerpc64le-unknown-linux_gnu") +
+        linux_cmake_vars,
+    ),
     "//conditions:default": cmake_var_string(
         cmake_vars +
         llvm_target_cmake_vars("X86", "x86_64-unknown-linux_gnu") +
@@ -360,6 +366,7 @@ llvm_target_list = [
             ("-gen-asm-matcher", "lib/Target/ARM/ARMGenAsmMatcher.inc"),
             ("-gen-dag-isel", "lib/Target/ARM/ARMGenDAGISel.inc"),
             ("-gen-fast-isel", "lib/Target/ARM/ARMGenFastISel.inc"),
+            ("-gen-global-isel", "lib/Target/ARM/ARMGenGlobalISel.inc"),
             ("-gen-callingconv", "lib/Target/ARM/ARMGenCallingConv.inc"),
             ("-gen-subtarget", "lib/Target/ARM/ARMGenSubtargetInfo.inc"),
             ("-gen-disassembler", "lib/Target/ARM/ARMGenDisassemblerTables.inc"),
@@ -434,6 +441,16 @@ llvm_target_list = [
     for target in llvm_target_list
 ]
 
+# This target is used to provide *.def files to x86_code_gen.
+# Files with '.def' extension are not allowed in 'srcs' of 'cc_library' rule.
+cc_library(
+    name = "x86_defs",
+    hdrs = glob([
+        "lib/Target/X86/*.def",
+    ]),
+    visibility = ["//visibility:private"],
+)
+
 cc_library(
     name = "aarch64_asm_parser",
     srcs = glob([
@@ -621,6 +638,7 @@ cc_library(
         "lib/Analysis/*.cpp",
         "lib/Analysis/*.inc",
         "include/llvm/Transforms/Utils/Local.h",
+        "include/llvm/Transforms/Scalar.h",
         "lib/Analysis/*.h",
     ]),
     hdrs = glob([
@@ -723,6 +741,7 @@ cc_library(
         "lib/Target/ARM/MCTargetDesc/*.cpp",
         "lib/Target/ARM/MCTargetDesc/*.inc",
         "lib/Target/ARM/*.h",
+        "include/llvm/CodeGen/GlobalISel/GISelAccessor.h",
     ]),
     hdrs = glob([
         "include/llvm/Target/ARM/MCTargetDesc/*.h",
@@ -878,6 +897,7 @@ cc_library(
         ":analysis",
         ":config",
         ":core",
+        ":mc",
         ":support",
     ],
 )
@@ -902,7 +922,9 @@ cc_library(
         ":bit_writer",
         ":config",
         ":core",
+        ":instrumentation",
         ":mc",
+        ":profile_data",
         ":scalar",
         ":support",
         ":target",
@@ -1099,6 +1121,9 @@ cc_library(
         "lib/Transforms/IPO/*.c",
         "lib/Transforms/IPO/*.cpp",
         "lib/Transforms/IPO/*.inc",
+        "include/llvm/Transforms/SampleProfile.h",
+        "include/llvm-c/Transforms/IPO.h",
+        "include/llvm-c/Transforms/PassManagerBuilder.h",
         "lib/Transforms/IPO/*.h",
     ]),
     hdrs = glob([
@@ -1108,6 +1133,7 @@ cc_library(
     ]),
     deps = [
         ":analysis",
+        ":bit_reader",
         ":bit_writer",
         ":config",
         ":core",
@@ -1364,6 +1390,7 @@ cc_library(
         "lib/Transforms/ObjCARC/*.c",
         "lib/Transforms/ObjCARC/*.cpp",
         "lib/Transforms/ObjCARC/*.inc",
+        "include/llvm/Transforms/ObjCARC.h",
         "lib/Transforms/ObjCARC/*.h",
     ]),
     hdrs = glob([
@@ -1673,6 +1700,7 @@ cc_library(
         "lib/Support/Unix/*.inc",
         "lib/Support/Unix/*.h",
         "include/llvm-c/*.h",
+        "include/llvm/CodeGen/MachineValueType.h",
         "lib/Support/*.h",
     ]),
     hdrs = glob([
@@ -1681,7 +1709,11 @@ cc_library(
         "include/llvm/Support/*.inc",
         "include/llvm/ADT/*.h",
         "include/llvm/Support/ELFRelocs/*.def",
-    ]) + ["include/llvm/Support/DataTypes.h"],
+        "include/llvm/Support/WasmRelocs/*.def",
+    ]) + [
+        "include/llvm/Support/DataTypes.h",
+        "include/llvm/ExecutionEngine/ObjectMemoryBuffer.h",
+    ],
     deps = [
         ":config",
         ":demangle",
@@ -1778,6 +1810,7 @@ cc_library(
         ":analysis",
         ":config",
         ":core",
+        ":scalar",
         ":support",
         ":transform_utils",
     ],
@@ -1857,6 +1890,7 @@ cc_library(
         ":support",
         ":target",
         ":x86_asm_printer",
+        ":x86_defs",
         ":x86_desc",
         ":x86_info",
         ":x86_utils",
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index 7e95ebd35514649df3d9ac172aca9763b88c4896..8c86766effa97a08f6089194a5d9202da0e003b3 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -16,6 +16,7 @@ load(
 cc_library(
     name = "intel_binary_blob",
     srcs = if_mkl([
+        "libdl.so.2",
         "libmklml_intel.so",
         "libiomp5.so",
     ]),
diff --git a/third_party/nccl/fix_clang_compilation.patch b/third_party/nccl/fix_clang_compilation.patch
deleted file mode 100644
index e8d2a7dc9f30d9a0ee1149864f37a209fa955660..0000000000000000000000000000000000000000
--- a/third_party/nccl/fix_clang_compilation.patch
+++ /dev/null
@@ -1,85 +0,0 @@
-From 8241cd7b6ed1425eeb88fd380090575978e358f4 Mon Sep 17 00:00:00 2001
-From: Ilya Biryukov <ibiryukov@google.com>
-Date: Thu, 16 Mar 2017 12:01:11 +0100
-Subject: [PATCH 1/1] Fix compilation error when compiling with 'clang -x
- cuda'.
-
-Functions vFetch and vStore are not found by ADL with clang,
-so they need to be declared before usage in ReduceCopy.
----
- src/common_kernel.h | 52 ++++++++++++++++++++++++++--------------------------
- 1 file changed, 26 insertions(+), 26 deletions(-)
-
-diff --git a/src/common_kernel.h b/src/common_kernel.h
-index 28fbc85..cc71f8a 100644
---- a/src/common_kernel.h
-+++ b/src/common_kernel.h
-@@ -30,6 +30,32 @@
- #define BAR(type, barid, nthreads) \
-     BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
- 
-+template<typename T> inline __device__
-+T vFetch(const volatile T* ptr) {
-+  return *ptr;
-+}
-+
-+#ifdef CUDA_HAS_HALF
-+template<> inline __device__
-+half vFetch<half>(const volatile half* ptr) {
-+  half r;
-+  r.x = ptr->x;
-+  return r;
-+}
-+#endif
-+
-+template<typename T> inline __device__
-+void vStore(volatile T* ptr, const T val) {
-+  *ptr = val;
-+}
-+
-+#ifdef CUDA_HAS_HALF
-+template<> inline __device__
-+void vStore<half>(volatile half* ptr, const half val) {
-+  ptr->x = val.x;
-+}
-+#endif
-+
- __device__ unsigned int spinct;
- 
- // Spin wait until func evaluates to true
-@@ -225,32 +251,6 @@ __device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
-   return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
- }
- 
--template<typename T> inline __device__
--T vFetch(const volatile T* ptr) {
--  return *ptr;
--}
--
--#ifdef CUDA_HAS_HALF
--template<> inline __device__
--half vFetch<half>(const volatile half* ptr) {
--  half r;
--  r.x = ptr->x;
--  return r;
--}
--#endif
--
--template<typename T> inline __device__
--void vStore(volatile T* ptr, const T val) {
--  *ptr = val;
--}
--
--#ifdef CUDA_HAS_HALF
--template<> inline __device__
--void vStore<half>(volatile half* ptr, const half val) {
--  ptr->x = val.x;
--}
--#endif
--
- // Assumptions:
- // - there is exactly 1 block
- // - THREADS is the number of producer threads
--- 
-2.12.0.367.g23dc2f6d3c-goog
-
diff --git a/third_party/nccl/nccl.BUILD b/third_party/nccl/nccl.BUILD
deleted file mode 100644
index 06b9b8ff68a5e8aa877d605daf02bec1ea4d6bfa..0000000000000000000000000000000000000000
--- a/third_party/nccl/nccl.BUILD
+++ /dev/null
@@ -1,66 +0,0 @@
-# NVIDIA nccl
-# A package of optimized primitives for collective multi-GPU communication.
-
-licenses(["notice"])  # BSD
-
-exports_files(["LICENSE.txt"])
-
-load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")
-
-SRCS = [
-    "src/all_gather.cu",
-    "src/all_reduce.cu",
-    "src/broadcast.cu",
-    "src/core.cu",
-    "src/libwrap.cu",
-    "src/reduce.cu",
-    "src/reduce_scatter.cu",
-]
-
-# Copy .cu to .cu.cc so they can be in srcs of cc_library.
-[
-    genrule(
-        name = "gen_" + src,
-        srcs = [src],
-        outs = [src + ".cc"],
-        cmd = "cp $(location " + src + ") $(location " + src + ".cc)",
-    )
-    for src in SRCS
-]
-
-SRCS_CU_CC = [src + ".cc" for src in SRCS]
-
-cc_library(
-    name = "nccl",
-    srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])),
-    hdrs = if_cuda(["src/nccl.h"]),
-    copts = [
-        "-DCUDA_MAJOR=0",
-        "-DCUDA_MINOR=0",
-        "-DNCCL_MAJOR=0",
-        "-DNCCL_MINOR=0",
-        "-DNCCL_PATCH=0",
-        "-Iexternal/nccl_archive/src",
-        "-O3",
-    ] + cuda_default_copts(),
-    linkopts = select({
-        "@%ws%//tensorflow:android": [
-            "-pie",
-        ],
-        "@%ws%//tensorflow:darwin": [
-            "-Wl,-framework",
-            "-Wl,CoreFoundation",
-            "-Wl,-framework",
-            "-Wl,Security",
-        ],
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [
-            "ws2_32.lib",
-        ],
-        "//conditions:default": [
-            "-lrt",
-        ],
-    }),
-    visibility = ["//visibility:public"],
-    deps = ["@local_config_cuda//cuda:cuda_headers"],
-)
diff --git a/third_party/ortools.BUILD b/third_party/ortools.BUILD
index 1ebc8aa0be894590268632a84b08452afafb050f..61191e3d2711c955725078715c1b8238edfe069e 100644
--- a/third_party/ortools.BUILD
+++ b/third_party/ortools.BUILD
@@ -7,10 +7,7 @@ exports_files(["LICENSE-2.0.txt"])
 native.cc_library(
     name = "linear_solver_glop",
     deps = [
-    "@ortools_archive//linear_solver:linear_solver_glop",
-	 ],
+        "@ortools_archive//linear_solver:linear_solver_glop",
+    ],
     visibility = ["//visibility:public"],
 )
-
-
-
diff --git a/third_party/pprof.BUILD b/third_party/pprof.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..edd52095949cfdeff5cde3a1c696fe419b01a016
--- /dev/null
+++ b/third_party/pprof.BUILD
@@ -0,0 +1,18 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # MIT
+
+load("@protobuf//:protobuf.bzl", "py_proto_library")
+
+exports_files(["pprof/LICENSE"])
+
+py_proto_library(
+    name = "pprof_proto_py",
+    srcs = ["proto/profile.proto"],
+    default_runtime = "@protobuf//:protobuf_python",
+    protoc = "@protobuf//:protoc",
+    srcs_version = "PY2AND3",
+    deps = ["@protobuf//:protobuf_python"],
+)
diff --git a/third_party/py/BUILD b/third_party/py/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/py/BUILD.tpl b/third_party/py/BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..1ee9c071adb2d9f4aec84b92277c5067f153b666
--- /dev/null
+++ b/third_party/py/BUILD.tpl
@@ -0,0 +1,25 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "python_headers",
+    hdrs = [":python_include"],
+    includes = ["python_include"],
+)
+
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":numpy_include"],
+    includes = ["numpy_include"],
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+%{PYTHON_INCLUDE_GENRULE}
+
+%{NUMPY_INCLUDE_GENRULE}
diff --git a/third_party/py/numpy/BUILD b/third_party/py/numpy/BUILD
index 1d461505a692f5ffd89af94f311acfaf5a2670fd..be8332572b17e286fe06853c2eaaa30f042b6fbb 100644
--- a/third_party/py/numpy/BUILD
+++ b/third_party/py/numpy/BUILD
@@ -8,11 +8,9 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
-cc_library(
+alias(
     name = "headers",
-    hdrs = glob(["numpy_include/**/*.h"]),
-    data = ["//util/python:python_checked"],
-    includes = ["numpy_include"],
+    actual = "@local_config_python//:numpy_headers",
 )
 
 genrule(
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..64ca6ee02f3c6aa65d2c5b58ee74f9c4cbef8123
--- /dev/null
+++ b/third_party/py/python_configure.bzl
@@ -0,0 +1,313 @@
+# -*- Python -*-
+"""Repository rule for Python autoconfiguration.
+
+`python_configure` depends on the following environment variables:
+
+  * `NUMPY_INCLUDE_PATH`: Location of Numpy libraries.
+  * `PYTHON_BIN_PATH`: location of python binary.
+  * `PYTHON_INCLUDE_PATH`: Location of python binaries.
+  * `PYTHON_LIB_PATH`: Location of python libraries.
+"""
+
+_NUMPY_INCLUDE_PATH = "NUMPY_INCLUDE_PATH"
+_PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
+_PYTHON_INCLUDE_PATH = "PYTHON_INCLUDE_PATH"
+_PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
+
+
+def _tpl(repository_ctx, tpl, substitutions={}, out=None):
+  if not out:
+    out = tpl
+  repository_ctx.template(
+      out,
+      Label("//third_party/py:%s.tpl" % tpl),
+      substitutions)
+
+
+def _python_configure_warning(msg):
+  """Output warning message during auto configuration."""
+  yellow = "\033[1;33m"
+  no_color = "\033[0m"
+  print("\n%sPython Configuration Warning:%s %s\n" % (yellow, no_color, msg))
+
+
+def _python_configure_fail(msg):
+  """Output failure message when auto configuration fails."""
+  red = "\033[0;31m"
+  no_color = "\033[0m"
+  fail("\n%sPython Configuration Error:%s %s\n" % (red, no_color, msg))
+
+
+def _get_env_var(repository_ctx, name, default = None, enable_warning = True):
+  """Find an environment variable in system path."""
+  if name in repository_ctx.os.environ:
+    return repository_ctx.os.environ[name]
+  if default != None:
+    if enable_warning:
+      _python_configure_warning(
+          "'%s' environment variable is not set, using '%s' as default" % (name, default))
+    return default
+  _python_configure_fail("'%s' environment variable is not set" % name)
+
+
+def _is_windows(repository_ctx):
+  """Returns true if the host operating system is windows."""
+  os_name = repository_ctx.os.name.lower()
+  if os_name.find("windows") != -1:
+    return True
+  return False
+
+
+def _execute(repository_ctx, cmdline, error_msg=None, error_details=None,
+             empty_stdout_fine=False):
+  """Executes an arbitrary shell command.
+
+  Args:
+    repository_ctx: the repository_ctx object
+    cmdline: list of strings, the command to execute
+    error_msg: string, a summary of the error if the command fails
+    error_details: string, details about the error or steps to fix it
+    empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
+      it's an error
+  Return:
+    the result of repository_ctx.execute(cmdline)
+  """
+  result = repository_ctx.execute(cmdline)
+  if result.stderr or not (empty_stdout_fine or result.stdout):
+    _python_configure_fail(
+        "\n".join([
+            error_msg.strip() if error_msg else "Repository command failed",
+            result.stderr.strip(),
+            error_details if error_details else ""]))
+  return result
+
+
+def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name):
+  """returns a genrule to symlink all files in a directory."""
+  # Get the list of files under this directory
+  find_result = None
+  if src_dir[-1] == "/":
+    src_dir = src_dir[:-1]
+  if dest_dir[-1] == "/":
+    dest_dir = dest_dir[:-1]
+  if _is_windows(repository_ctx):
+    src_dir = src_dir.replace("/", "\\")
+    find_result = _execute(
+        repository_ctx, ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
+        empty_stdout_fine=True)
+    # src_files will be used in genrule.outs where the paths must
+    # use forward slashes.
+    src_files = find_result.stdout.replace("\\", "/").splitlines()
+  else:
+    find_result = _execute(
+        repository_ctx, ["find", src_dir, "-follow", "-type", "f"],
+        empty_stdout_fine=True)
+    src_files = find_result.stdout.splitlines()
+
+  dest_files = [e[len(src_dir) + 1:] for e in src_files]
+  outs = ['      "%s/%s",' % (dest_dir, e) for e in dest_files]
+  cmd_script = None
+
+  if _is_windows(repository_ctx):
+    dest_dir = dest_dir.replace("/", "\\")
+    # Copy the source directory to the output directory.
+    # Creating a junction pointing to the source directory and declaring the
+    # outputs like we do on Linux, then expecting that Bazel would pick up these
+    # outputs via the junction is ill-conceived, because Bazel deletes the
+    # output files from previous build prior to executing the action, meaning it
+    # would delete the actual files via the old junction.
+    cmd_script = "%s-cmd.cmd" % genrule_name
+    repository_ctx.file(
+        cmd_script,
+        executable = True,
+        content = "\n".join([
+            '@rem Auto-generated by //third_party/py/python_configure.bzl',
+            '@rem -------------------------------------------------------',
+            '@set DEST_DIR=%%1\\%s\\' % dest_dir,
+            '@rmdir /s /q %DEST_DIR%',
+            '@xcopy /s /e /q /y "%s" "%%DEST_DIR%%"' % src_dir]))
+    cmd = "cmd.exe /c \\\"$$(echo $(location %s) $(@D) | sed 's,/,\\\\\\\\,g')\\\"" % cmd_script
+  else:
+    cmd_script = "%s-cmd.sh" % genrule_name
+    repository_ctx.file(
+        cmd_script,
+        executable = True,
+        content = "\n".join(
+            ['#!/bin/bash',
+             '# Auto-generated by //third_party/py/python_configure.bzl',
+             'DEST_DIR="$1/%s"' % dest_dir] +
+            ['ln -s "%s" "${DEST_DIR}/%s"' % (src_files[i], dest_files[i])
+             for i in range(len(src_files))]))
+    cmd = "$(location %s) $(@D)" % cmd_script
+
+  return "\n".join([
+      'genrule(',
+      '    name = "%s",' % genrule_name,
+      '    outs = [',
+      ] + outs + [
+      '    ],',
+      '    cmd = "%s",' % cmd,
+      '    visibility = ["//visibility:private"],',
+      '    tools = ["%s"],' % cmd_script,
+      ')'])
+
+
+def _get_python_lib(repository_ctx, python_bin):
+  """Gets the python lib path."""
+  print_lib = ("<<END\n" +
+      "from __future__ import print_function\n" +
+      "import site\n" +
+      "import os\n" +
+      "\n" +
+      "try:\n" +
+      "  input = raw_input\n" +
+      "except NameError:\n" +
+      "  pass\n" +
+      "\n" +
+      "python_paths = []\n" +
+      "if os.getenv('PYTHONPATH') is not None:\n" +
+      "  python_paths = os.getenv('PYTHONPATH').split(':')\n" +
+      "try:\n" +
+      "  library_paths = site.getsitepackages()\n" +
+      "except AttributeError:\n" +
+      " from distutils.sysconfig import get_python_lib\n" +
+      " library_paths = [get_python_lib()]\n" +
+      "all_paths = set(python_paths + library_paths)\n" +
+      "paths = []\n" +
+      "for path in all_paths:\n" +
+      "  if os.path.isdir(path):\n" +
+      "    paths.append(path)\n" +
+      "if len(paths) >=1:\n" +
+      "  print(paths[0])\n" +
+      "END")
+  cmd = '%s - %s' % (python_bin, print_lib)
+  result = repository_ctx.execute(["bash", "-c", cmd])
+  return result.stdout.strip('\n')
+
+
+def _check_python_lib(repository_ctx, python_lib):
+  """Checks the python lib path."""
+  cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
+  result = repository_ctx.execute(["bash", "-c", cmd])
+  if result.return_code == 1:
+    _python_configure_fail("Invalid python library path:  %s" % python_lib)
+
+
+def _check_python_bin(repository_ctx, python_bin):
+  """Checks the python bin path."""
+  cmd =  '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
+  result = repository_ctx.execute(["bash", "-c", cmd])
+  if result.return_code == 1:
+    _python_configure_fail(
+        "PYTHON_BIN_PATH is not executable.  Is it the python binary?")
+
+
+def _get_python_include(repository_ctx, python_bin):
+  """Gets the python include path."""
+  result = _execute(repository_ctx,
+                    [python_bin, "-c",
+                     'from __future__ import print_function;' +
+                     'from distutils import sysconfig;' +
+                     'print(sysconfig.get_python_inc())'],
+                    error_msg="Problem getting python include path.",
+                    error_details=("Is the Python binary path set up right? " +
+                                   "(See ./configure or BAZEL_BIN_PATH.) " +
+                                   "Is distutils installed?"))
+  return result.stdout.splitlines()[0]
+
+
+def _get_numpy_include(repository_ctx, python_bin):
+  """Gets the numpy include path."""
+  return _execute(repository_ctx,
+                  [python_bin, "-c",
+                   'from __future__ import print_function;' +
+                   'import numpy;' +
+                   ' print(numpy.get_include());'],
+                  error_msg="Problem getting numpy include path.",
+                  error_details="Is numpy installed?").stdout.splitlines()[0]
+
+
+def _create_local_python_repository(repository_ctx):
+  """Creates the repository containing files set up to build with Python."""
+  python_include = None
+  numpy_include = None
+  empty_config = False
+  # If local checks were requested, the python and numpy include will be auto
+  # detected on the host config (using _PYTHON_BIN_PATH).
+  if repository_ctx.attr.local_checks:
+    python_bin = _get_env_var(repository_ctx, _PYTHON_BIN_PATH)
+    _check_python_bin(repository_ctx, python_bin)
+    python_lib = _get_env_var(repository_ctx, _PYTHON_LIB_PATH, _get_python_lib(repository_ctx, python_bin))
+    _check_python_lib(repository_ctx, python_lib)
+    python_include = _get_python_include(repository_ctx, python_bin)
+    numpy_include = _get_numpy_include(repository_ctx, python_bin) + '/numpy'
+  else:
+    # Otherwise, we assume user provides all paths (via ENV or attrs)
+    python_include = _get_env_var(repository_ctx, _PYTHON_INCLUDE_PATH,
+                                  repository_ctx.attr.python_include)
+    numpy_include = _get_env_var(repository_ctx, _NUMPY_INCLUDE_PATH,
+                                 repository_ctx.attr.numpy_include) + '/numpy'
+  if empty_config:
+    _tpl(repository_ctx, "BUILD", {
+        "%{PYTHON_INCLUDE_GENRULE}": ('filegroup(\n' +
+                                      '    name = "python_include",\n' +
+                                      '    srcs = [],\n' +
+                                      ')\n'),
+        "%{NUMPY_INCLUDE_GENRULE}": ('filegroup(\n' +
+                                      '    name = "numpy_include",\n' +
+                                      '    srcs = [],\n' +
+                                      ')\n'),
+    })
+  else:
+    python_include_rule = _symlink_genrule_for_dir(
+        repository_ctx, python_include, 'python_include', 'python_include')
+    numpy_include_rule = _symlink_genrule_for_dir(
+        repository_ctx, numpy_include, 'numpy_include/numpy', 'numpy_include')
+    _tpl(repository_ctx, "BUILD", {
+        "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
+        "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
+    })
+
+
+def _create_remote_python_repository(repository_ctx):
+  """Creates pointers to a remotely configured repo set up to build with Python.
+  """
+  _tpl(repository_ctx, "remote.BUILD", {
+      "%{REMOTE_PYTHON_REPO}": repository_ctx.attr.remote_config_repo,
+  }, "BUILD")
+
+
+def _python_autoconf_impl(repository_ctx):
+  """Implementation of the python_autoconf repository rule."""
+  if repository_ctx.attr.remote_config_repo != "":
+    _create_remote_python_repository(repository_ctx)
+  else:
+    _create_local_python_repository(repository_ctx)
+
+
+python_configure = repository_rule(
+    implementation = _python_autoconf_impl,
+    attrs = {
+        "local_checks": attr.bool(mandatory = False, default = True),
+        "python_include": attr.string(mandatory = False),
+        "numpy_include": attr.string(mandatory = False),
+        "remote_config_repo": attr.string(mandatory = False, default =""),
+    },
+    environ = [
+        _PYTHON_BIN_PATH,
+        _PYTHON_INCLUDE_PATH,
+        _PYTHON_LIB_PATH,
+        _NUMPY_INCLUDE_PATH,
+    ],
+)
+"""Detects and configures the local Python.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+python_configure(name = "local_config_python")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
diff --git a/third_party/py/remote.BUILD.tpl b/third_party/py/remote.BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..332508ec84c1d1f79f28382deb98a2344e4d95d4
--- /dev/null
+++ b/third_party/py/remote.BUILD.tpl
@@ -0,0 +1,13 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+alias(
+    name = "python_headers",
+    actual = "@%{REMOTE_PYTHON_REPO}//:python_headers",
+)
+
+alias(
+    name = "numpy_headers",
+    actual = "@%{REMOTE_PYTHON_REPO}//:numpy_headers",
+)
diff --git a/third_party/sycl/crosstool/BUILD b/third_party/sycl/crosstool/BUILD
index 8b137891791fe96927ad78e64b0aad7bded08bdc..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/third_party/sycl/crosstool/BUILD
+++ b/third_party/sycl/crosstool/BUILD
@@ -1 +0,0 @@
-
diff --git a/third_party/sycl/sycl/LICENSE.text b/third_party/sycl/sycl/LICENSE.text
index 0c2955c4d76d90e9580b43e28fd4e81fe7064de8..8d3f050b392a301d61af779674bc9fe00a043422 100644
--- a/third_party/sycl/sycl/LICENSE.text
+++ b/third_party/sycl/sycl/LICENSE.text
@@ -67,7 +67,7 @@ you; so please press the "CANCEL" button to cancel your download.
         ComputeCpp within its marketing materials, without the
         express prior written permission of Codeplay.
  4. Support. Codeplay does not provide any guarantees of support for
-    the Software to the user. Codeplay will use reasonable endeavours
+    the Software to the user. Codeplay will use reasonable endeavors
     to respond to users' support requests, for the most recent
     release only, via the community support website at https://
     computecpp.codeplay.com.
@@ -78,7 +78,7 @@ you; so please press the "CANCEL" button to cancel your download.
     copyrights, trade secrets and other proprietary rights in the
     Software, including the rights to make and license the use of all
     copies. To the extent that any patents owned by Codeplay or its
-    licensors relate to any component of the Software, the licence
+    licensors relate to any component of the Software, the license
     granted to the user in accordance with this Agreement allows for
     the lawful use of such patents but only for the purposes of this
     Agreement and not further or otherwise. Therefore, the user may
diff --git a/tools/bazel.rc.template b/tools/bazel.rc
similarity index 73%
rename from tools/bazel.rc.template
rename to tools/bazel.rc
index 097ff7b9d07aba67d5d35979e09eb08ad9afdbda..e67a290cf40ca7f688dfdb03210786c8c85abe48 100644
--- a/tools/bazel.rc.template
+++ b/tools/bazel.rc
@@ -14,18 +14,9 @@ build:sycl --define=using_sycl=true
 build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
 build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
 
-build --force_python=py$PYTHON_MAJOR_VERSION
-build --host_force_python=py$PYTHON_MAJOR_VERSION
-build --python$PYTHON_MAJOR_VERSION_path=$PYTHON_BINARY
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true
 
-build --define PYTHON_BIN_PATH=$PYTHON_BINARY
-test --define PYTHON_BIN_PATH=$PYTHON_BINARY
-test --force_python=py$PYTHON_MAJOR_VERSION
-test --host_force_python=py$PYTHON_MAJOR_VERSION
-run --define PYTHON_BIN_PATH=$PYTHON_BINARY
-
 build --spawn_strategy=standalone
 test --spawn_strategy=standalone
 run --spawn_strategy=standalone
diff --git a/tools/tf_env_collect.sh b/tools/tf_env_collect.sh
new file mode 100755
index 0000000000000000000000000000000000000000..abeebeadea49bbbf808813c2d3609558e75785d5
--- /dev/null
+++ b/tools/tf_env_collect.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -u  # Check for undefined variables
+
+echo "Collecting system information..."
+
+OUTPUT_FILE=tf_env.txt
+
+echo >> $OUTPUT_FILE
+echo "== cat /etc/issue ===============================================" >> $OUTPUT_FILE
+uname -a >> $OUTPUT_FILE
+uname=`uname -s`
+if [ "$(uname)" == "Darwin" ]; then
+  echo Mac OS X `sw_vers -productVersion` >> $OUTPUT_FILE
+elif [ "$(uname)" == "Linux" ]; then
+  cat /etc/*release | grep VERSION >> $OUTPUT_FILE
+fi
+
+
+echo >> $OUTPUT_FILE
+echo '== are we in docker =============================================' >> $OUTPUT_FILE
+num=`cat /proc/1/cgroup | grep docker | wc -l`;
+if [ $num -ge 1 ]; then
+  echo "Yes" >> $OUTPUT_FILE
+else
+  echo "No" >> $OUTPUT_FILE
+fi
+
+echo >> $OUTPUT_FILE
+echo '== compiler =====================================================' >> $OUTPUT_FILE
+c++ --version 2>&1 >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== uname -a =====================================================' >> $OUTPUT_FILE
+uname -a >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== check pips ===================================================' >> $OUTPUT_FILE
+pip list 2>&1 | grep "proto\|numpy\|tensorflow" >> $OUTPUT_FILE
+
+
+echo >> $OUTPUT_FILE
+echo '== check for virtualenv =========================================' >> $OUTPUT_FILE
+python -c "import sys;print(hasattr(sys, \"real_prefix\"))" >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== tensorflow import ============================================' >> $OUTPUT_FILE
+cat <<EOF > /tmp/check_tf.py
+import tensorflow as tf;
+print("tf.VERSION = %s" % tf.VERSION)
+print("tf.GIT_VERSION = %s" % tf.GIT_VERSION)
+print("tf.COMPILER_VERSION = %s" % tf.GIT_VERSION)
+with tf.Session() as sess:
+  print("Sanity check: %r" % sess.run(tf.constant([1,2,3])[:1]))
+EOF
+python /tmp/check_tf.py 2>&1  >> ${OUTPUT_FILE}
+
+DEBUG_LD=libs python -c "import tensorflow"  2>>${OUTPUT_FILE} > /tmp/loadedlibs
+grep libcudnn.so /tmp/loadedlibs >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== env ==========================================================' >> $OUTPUT_FILE
+if [ -z ${LD_LIBRARY_PATH+x} ]; then
+  echo "LD_LIBRARY_PATH is unset" >> $OUTPUT_FILE;
+else
+  echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH}  >> $OUTPUT_FILE;
+fi
+if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
+  echo "DYLD_LIBRARY_PATH is unset" >> $OUTPUT_FILE;
+else
+  echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH}  >> $OUTPUT_FILE;
+fi
+
+
+echo >> $OUTPUT_FILE >> $OUTPUT_FILE
+echo '== nvidia-smi ===================================================' >> $OUTPUT_FILE
+nvidia-smi 2>&1 >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+
+echo '== cuda libs  ===================================================' >> $OUTPUT_FILE
+find /usr/local -type f -name 'libcudart*'  2>/dev/null | grep cuda |  grep -v "\\.cache" >> ${OUTPUT_FILE}
+find /usr/local -type f -name 'libudnn*'  2>/dev/null | grep cuda |  grep -v "\\.cache" >> ${OUTPUT_FILE}
+
+# Remove any words with google.
+mv $OUTPUT_FILE old-$OUTPUT_FILE
+grep -v -i google old-${OUTPUT_FILE} > $OUTPUT_FILE
+
+echo "Wrote environment to ${OUTPUT_FILE}. You can review the contents of that file."
+echo "and use it to populate the fields in the github issue template."
+echo
+echo "cat ${OUTPUT_FILE}"
+echo
+
diff --git a/util/python/BUILD b/util/python/BUILD
index 29688b875df5933f612a3c5931e22d00d7cab2f7..96daf9947ad43e7d9f3a771166d714af0b1a8036 100644
--- a/util/python/BUILD
+++ b/util/python/BUILD
@@ -2,31 +2,7 @@ licenses(["restricted"])
 
 package(default_visibility = ["//visibility:public"])
 
-cc_library(
+alias(
     name = "python_headers",
-    hdrs = glob([
-        "python_include/**/*.h",
-    ]),
-    data = [":python_checked"],
-    includes = ["python_include"],
-)
-
-genrule(
-    name = "python_check",
-    srcs = [
-        "python_config.sh",
-        "configure_files",
-    ],
-    outs = [
-        "python_checked",
-    ],
-    cmd = "OUTPUTDIR=\"$(@D)/\"; $(location :python_config.sh) --check && touch $$OUTPUTDIR/python_checked",
-    local = 1,
-)
-
-filegroup(
-    name = "configure_files",
-    data = glob([
-        "*",
-    ]),
+    actual = "@local_config_python//:python_headers",
 )
diff --git a/util/python/python_config.sh b/util/python/python_config.sh
deleted file mode 100755
index 4b18bf3578d77e86826244243b9e09c61cbbf8da..0000000000000000000000000000000000000000
--- a/util/python/python_config.sh
+++ /dev/null
@@ -1,225 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-set -e -o errexit
-
-if [ -d "../org_tensorflow" ]; then
-  script_path="../org_tensorflow"
-else
-  # Prefix expected paths with ./ locally and external/reponame/ for remote repos.
-  # TODO(kchodorow): remove once runfiles paths are fixed, see
-  # https://github.com/bazelbuild/bazel/issues/848.
-  script_path=$(dirname $(dirname $(dirname "$0")))
-  script_path=${script_path:-.}
-fi
-
-EXPECTED_PATHS="$script_path/util/python/python_include"\
-" $script_path/util/python/python_lib"\
-" $script_path/third_party/py/numpy/numpy_include"
-
-function main {
-  argument="$1"
-  shift
-  case $argument in
-    --check)
-      check_python
-      exit 0
-      ;;
-    --setup)
-      setup_python "$1"
-      exit 0
-      ;;
-  esac
-}
-
-function python_path {
-  "$PYTHON_BIN_PATH" - <<END
-from __future__ import print_function
-import site
-import os
-
-try:
-  input = raw_input
-except NameError:
-  pass
-
-python_paths = []
-if os.getenv('PYTHONPATH') is not None:
-  python_paths = os.getenv('PYTHONPATH').split(':')
-try:
-  library_paths = site.getsitepackages()
-except AttributeError:
- from distutils.sysconfig import get_python_lib
- library_paths = [get_python_lib()]
-all_paths = set(python_paths + library_paths)
-
-paths = []
-for path in all_paths:
-  if os.path.isdir(path):
-    paths.append(path)
-
-if len(paths) == 1:
-  print(paths[0])
-else:
-  ret_paths = ",".join(paths)
-  print(ret_paths)
-END
-}
-
-function default_python_path {
-  PYTHON_ARG="$1" "$PYTHON_BIN_PATH" - <<END
-from __future__ import print_function
-import os
-
-default = os.getenv('PYTHON_ARG')
-default = str(default)
-print(default)
-END
-}
-
-function setup_python {
-  PYTHON_BIN_PATH="$1";
-
-  if [ -z "$PYTHON_BIN_PATH" ]; then
-    echo "PYTHON_BIN_PATH was not provided.  Did you run configure?"
-    exit 1
-  fi
-  if [ ! -x "$PYTHON_BIN_PATH" ]  || [ -d "$PYTHON_BIN_PATH" ]; then
-    echo "PYTHON_BIN_PATH is not executable.  Is it the python binary?"
-    exit 1
-  fi
-
-  local python_major_version=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import sys; print(sys.version_info[0]);')
-  if [ "$python_major_version" == "" ]; then
-    echo -e "\n\nERROR: Problem getting python version.  Is $PYTHON_BIN_PATH the correct python binary?"
-    exit 1
-  fi
-
-  local python_include="$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; from distutils import sysconfig; print(sysconfig.get_python_inc());')"
-  if [ "$python_include" == "" ]; then
-    echo -e "\n\nERROR: Problem getting python include path.  Is distutils installed?"
-    exit 1
-  fi
-
-  if [ -z "$PYTHON_LIB_PATH" ]; then
-    local python_lib_path
-    # Split python_path into an array of paths, this allows path containing spaces
-    IFS=','
-    python_lib_path=($(python_path))
-    unset IFS
-
-    if [ 1 = "$USE_DEFAULT_PYTHON_LIB_PATH" ]; then
-      PYTHON_LIB_PATH="$(default_python_path "${python_lib_path[0]}")"
-      echo "Using python library path: $PYTHON_LIB_PATH"
-
-    else
-      echo "Found possible Python library paths:"
-      for x in "${python_lib_path[@]}"; do
-        echo "  $x"
-      done
-      set -- "${python_lib_path[@]}"
-      echo "Please input the desired Python library path to use.  Default is ["$1"]"
-      read b || true
-      if [ "$b" == "" ]; then
-        PYTHON_LIB_PATH="$(default_python_path "${python_lib_path[0]}")"
-        echo "Using python library path: $PYTHON_LIB_PATH"
-      else
-        PYTHON_LIB_PATH="$b"
-      fi
-    fi
-  fi
-
-  if test -d "$PYTHON_LIB_PATH" -a -x "$PYTHON_LIB_PATH"; then
-    python_lib="$PYTHON_LIB_PATH"
-  else
-    echo -e "\n\nERROR: Invalid python library path: ${PYTHON_LIB_PATH}."
-    exit 1
-  fi
-
-  local numpy_include=$("${PYTHON_BIN_PATH}" -c 'from __future__ import print_function; import numpy; print(numpy.get_include());')
-  if [ "$numpy_include" == "" ]; then
-    echo -e "\n\nERROR: Problem getting numpy include path.  Is numpy installed?"
-    exit 1
-  fi
-
-  for x in $EXPECTED_PATHS; do
-    if [ -e "$x" ]; then
-      rm -rf "$x"
-    fi
-  done
-
-# ln -sf is actually implemented as copying in msys since creating symbolic
-# links is privileged on Windows. But copying is too slow, so invoke mklink
-# to create junctions on Windows.
-  if is_windows; then
-    cmd /c "mklink /J util\\python\\python_include \"${python_include}\""
-    cmd /c "mklink /J util\\python\\python_lib \"${python_lib}\""
-    cmd /c "mklink /J third_party\\py\\numpy\\numpy_include \"${numpy_include}\""
-  else
-    ln -sf "${python_include}" util/python/python_include
-    ln -sf "${python_lib}" util/python/python_lib
-    ln -sf "${numpy_include}" third_party/py/numpy/numpy_include
-  fi
-  # Convert python path to Windows style before writing into bazel.rc
-  if is_windows; then
-    PYTHON_BIN_PATH="$(cygpath -m "$PYTHON_BIN_PATH")"
-  fi
-
-  # Write tools/bazel.rc
-  echo "# Autogenerated by configure: DO NOT EDIT" > tools/bazel.rc
-  sed -e "s/\$PYTHON_MAJOR_VERSION/$python_major_version/g" \
-      -e "s|\$PYTHON_BINARY|\"$PYTHON_BIN_PATH\"|g" \
-      tools/bazel.rc.template >> tools/bazel.rc
-  # Write tools/python_bin_path.sh
-  echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh
-}
-
-PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
-function is_windows() {
-  # On windows, the shell script is actually running in msys
-  if [[ "${PLATFORM}" =~ msys_nt* ]]; then
-    true
-  else
-    false
-  fi
-}
-
-function check_python {
-  for x in $EXPECTED_PATHS; do
-    if [ ! -e "$x" ]; then
-      echo -e "\n\nERROR: Cannot find '${x}'.  Did you run configure?\n\n" 1>&2
-      exit 1
-    fi
-    # Don't check symbolic link on Windows
-    if ! is_windows && [ ! -L "${x}" ]; then
-      echo -e "\n\nERROR: '${x}' is not a symbolic link.  Internal error.\n\n" 1>&2
-      exit 1
-    fi
-    if is_windows; then
-      # In msys, readlink <path> doesn't work, because no symbolic link on
-      # Windows. readlink -f <path> returns the real path of a junction.
-      true_path=$(readlink -f "${x}")
-    else
-      true_path=$(readlink "${x}")
-    fi
-    if [ ! -d "${true_path}" ]; then
-      echo -e "\n\nERROR: '${x}' does not refer to an existing directory: ${true_path}.  Do you need to rerun configure?\n\n" 1>&2
-      exit 1
-    fi
-  done
-}
-
-main "$@"